1/*
2 * pandoc.ts
3 *
4 * Copyright (C) 2021 by RStudio, PBC
5 *
6 * Unless you have received this program directly from RStudio pursuant
7 * to the terms of a commercial license agreement with RStudio, then
8 * this program is licensed to you under the terms of version 3 of the
9 * GNU Affero General Public License. This program is distributed WITHOUT
10 * ANY EXPRESS OR IMPLIED WARRANTY, INCLUDING THOSE OF NON-INFRINGEMENT,
11 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. Please refer to the
12 * AGPL (http://www.gnu.org/licenses/agpl-3.0.txt) for more details.
13 *
14 */
15
16import { Fragment, Mark, Node as ProsemirrorNode, Schema, NodeType } from 'prosemirror-model';
17
18import { PandocAttr, pandocAttrReadAST, kSpanChildren, kSpanAttr } from './pandoc_attr';
19import { PandocCapabilitiesResult } from './pandoc_capabilities';
20import { kQuoteType, kQuoteChildren, QuoteType } from './quote';
21import { BibliographyResult } from './bibliography/bibliography-provider_local';
22
23import { stringifyMath } from './math';
24import { kCodeText } from './code';
25import { kLinkChildren } from './link';
26
/**
 * Asynchronous pandoc-related services consumed by the editor. Every method
 * returns a promise for its result.
 */
export interface PandocServer {
  /** Retrieve the pandoc capabilities result. */
  getCapabilities(): Promise<PandocCapabilitiesResult>;

  /** Convert `markdown` (interpreted as `format`, with the given `options`) to a pandoc AST. */
  markdownToAst(markdown: string, format: string, options: string[]): Promise<PandocAst>;

  /** Convert a pandoc AST to markdown in the given `format`, with the given `options`. */
  astToMarkdown(ast: PandocAst, format: string, options: string[]): Promise<string>;

  /**
   * List the extensions for `format` as a single string.
   * NOTE(review): presumably line-delimited (cf. parsePandocListOutput) -- confirm.
   */
  listExtensions(format: string): Promise<string>;

  /**
   * Retrieve a bibliography result. `file`, `refBlock` and `etag` may each
   * be null.
   */
  getBibliography(
    file: string | null,
    bibliography: string[],
    refBlock: string | null,
    etag: string | null,
  ): Promise<BibliographyResult>;

  /**
   * Add the source identified by `id` (supplied both as JSON and as BibTeX)
   * to `bibliography`; resolves to a boolean.
   */
  addToBibliography(
    bibliography: string,
    project: boolean,
    id: string,
    sourceAsJson: string,
    sourceAsBibTeX: string,
  ): Promise<boolean>;

  /** Produce HTML for a citation source; `file` and `csl` may be null. */
  citationHTML(file: string | null, sourceAsJson: string, csl: string | null): Promise<string>;
}
47
/**
 * Reference-related options for the pandoc writer (all optional).
 */
export interface PandocWriterReferencesOptions {
  /** One of: block | section | document */
  location?: string;
  prefix?: string;
}
52
/**
 * Options for the pandoc writer. Every field is optional.
 */
export interface PandocWriterOptions {
  atxHeaders?: boolean;
  /** Nested reference options (see PandocWriterReferencesOptions). */
  references?: PandocWriterReferencesOptions;
  wrap?: string;
  dpi?: number;
}
59
/**
 * Boolean flags keyed by pandoc extension name. Each named member must be
 * present; the trailing index signature additionally allows a flag to be
 * looked up by an arbitrary extension name held in a string.
 */
export interface PandocExtensions {
  abbreviations: boolean;
  all_symbols_escapable: boolean;
  amuse: boolean;
  angle_brackets_escapable: boolean;
  ascii_identifiers: boolean;
  auto_identifiers: boolean;
  autolink_bare_uris: boolean;
  backtick_code_blocks: boolean;
  blank_before_blockquote: boolean;
  blank_before_header: boolean;
  bracketed_spans: boolean;
  citations: boolean;
  compact_definition_lists: boolean;
  definition_lists: boolean;
  east_asian_line_breaks: boolean;
  emoji: boolean;
  empty_paragraphs: boolean;
  epub_html_exts: boolean;
  escaped_line_breaks: boolean;
  example_lists: boolean;
  fancy_lists: boolean;
  fenced_code_attributes: boolean;
  fenced_code_blocks: boolean;
  fenced_divs: boolean;
  footnotes: boolean;
  four_space_rule: boolean;
  gfm_auto_identifiers: boolean;
  grid_tables: boolean;
  hard_line_breaks: boolean;
  header_attributes: boolean;
  ignore_line_breaks: boolean;
  implicit_figures: boolean;
  implicit_header_references: boolean;
  inline_code_attributes: boolean;
  inline_notes: boolean;
  intraword_underscores: boolean;
  latex_macros: boolean;
  line_blocks: boolean;
  link_attributes: boolean;
  lists_without_preceding_blankline: boolean;
  literate_haskell: boolean;
  markdown_attribute: boolean;
  markdown_in_html_blocks: boolean;
  mmd_header_identifiers: boolean;
  mmd_link_attributes: boolean;
  mmd_title_block: boolean;
  multiline_tables: boolean;
  native_divs: boolean;
  native_spans: boolean;
  native_numbering: boolean;
  ntb: boolean;
  old_dashes: boolean;
  pandoc_title_block: boolean;
  pipe_tables: boolean;
  raw_attribute: boolean;
  raw_html: boolean;
  raw_tex: boolean;
  shortcut_reference_links: boolean;
  simple_tables: boolean;
  smart: boolean;
  space_in_atx_header: boolean;
  spaced_reference_links: boolean;
  startnum: boolean;
  strikeout: boolean;
  subscript: boolean;
  superscript: boolean;
  styles: boolean;
  task_lists: boolean;
  table_captions: boolean;
  tex_math_dollars: boolean;
  tex_math_double_backslash: boolean;
  tex_math_single_backslash: boolean;
  yaml_metadata_block: boolean;
  gutenberg: boolean;
  // attributes: boolean; (not yet)

  // lookup by extension name
  [key: string]: boolean;
}
138
/**
 * Whether image attributes can be used with the given extensions: true when
 * either the `link_attributes` or the `raw_html` extension is enabled.
 */
export function imageAttributesAvailable(pandocExtensions: PandocExtensions) {
  const { link_attributes: linkAttributes, raw_html: rawHtml } = pandocExtensions;
  return linkAttributes || rawHtml;
}
142
/**
 * Split line-oriented output into its entries.
 *
 * Lines may be terminated by either LF or CRLF; empty lines (including the
 * one produced by a trailing newline) are dropped.
 *
 * @param output raw text, one entry per line
 * @returns the non-empty lines, in their original order
 */
export function parsePandocListOutput(output: string) {
  const lines = output.split(/\r?\n/);
  return lines.filter(line => line.length > 0);
}
146
/**
 * A pandoc document: its block tokens, the pandoc API version and the
 * document metadata.
 */
export interface PandocAst {
  blocks: PandocToken[];
  'pandoc-api-version': PandocApiVersion;
  meta: any;
  /** Used only for reading, not for writing. */
  heading_ids?: string[];
}
153
/** The pandoc API version, expressed as a list of numeric components. */
export type PandocApiVersion = Array<number>;
155
/**
 * A single element of the pandoc AST.
 */
export interface PandocToken {
  /** Token type name (e.g. "Str", "Emph"). */
  t: string;
  /** Token content, if any; its shape varies by token type. */
  c?: any;
}
160
/**
 * Names of the pandoc AST tokens used by this module. This is a string enum:
 * the value of every member is identical to its name.
 *
 * See the pandoc type definitions:
 * https://github.com/jgm/pandoc-types/blob/master/Text/Pandoc/Definition.hs
 */
export enum PandocTokenType {
  Str = 'Str',
  Space = 'Space',
  Strong = 'Strong',
  Emph = 'Emph',
  Code = 'Code',
  Superscript = 'Superscript',
  Subscript = 'Subscript',
  Strikeout = 'Strikeout',
  SmallCaps = 'SmallCaps',
  Underline = 'Underline',
  Quoted = 'Quoted',
  RawInline = 'RawInline',
  RawBlock = 'RawBlock',
  LineBlock = 'LineBlock',
  Para = 'Para',
  Plain = 'Plain',
  Header = 'Header',
  CodeBlock = 'CodeBlock',
  BlockQuote = 'BlockQuote',
  BulletList = 'BulletList',
  OrderedList = 'OrderedList',
  DefinitionList = 'DefinitionList',
  Image = 'Image',
  Link = 'Link',
  Note = 'Note',
  Cite = 'Cite',
  Table = 'Table',
  AlignRight = 'AlignRight',
  AlignLeft = 'AlignLeft',
  AlignDefault = 'AlignDefault',
  AlignCenter = 'AlignCenter',
  ColWidth = 'ColWidth',
  ColWidthDefault = 'ColWidthDefault',
  HorizontalRule = 'HorizontalRule',
  LineBreak = 'LineBreak',
  SoftBreak = 'SoftBreak',
  Math = 'Math',
  InlineMath = 'InlineMath',
  DisplayMath = 'DisplayMath',
  Div = 'Div',
  Span = 'Span',
  Null = 'Null',
}
206
/**
 * Describes how one kind of pandoc token is read into the editor document.
 */
export interface PandocTokenReader {
  /** Pandoc token name (e.g. "Str", "Emph", etc.). */
  readonly token: PandocTokenType;

  /**
   * Optional predicate that lets the reader decide whether it actually wants
   * to handle a token, based on factors other than its PandocTokenType.
   */
  readonly match?: (tok: PandocToken) => boolean;

  // Exactly one of the following five values must also be set.
  readonly text?: boolean;
  readonly node?: string;
  readonly block?: string;
  readonly mark?: string;
  readonly code_block?: boolean;

  /** Get the attributes for a token. */
  getAttrs?: (tok: PandocToken) => any;
  /** Get the children of a token. */
  getChildren?: (tok: PandocToken) => any[];
  /** Get the text of a token. */
  getText?: (tok: PandocToken) => string;

  /**
   * Lower-level handler function. When specified it overrides the handler
   * attributes above (they are ignored).
   */
  handler?: (schema: Schema) => (writer: ProsemirrorWriter, tok: PandocToken) => void;

  /**
   * Post-processor for fixups that rely on seeing the entire document
   * (e.g. recognizing implicit header references).
   */
  postprocessor?: PandocPostprocessorFn;
}
235
// Constants used to read the contents of raw blocks: one for the format and
// one for the content.
// NOTE(review): presumably positions within the raw block token's `c` array -- confirm.
export const kRawBlockFormat = 0;
export const kRawBlockContent = 1;
239
/**
 * Filter applied to a sequence of tokens (e.g. to reduce some adjacent
 * tokens to a single token). Returns the resulting token sequence.
 */
export type PandocTokensFilterFn = (tokens: PandocToken[], writer: ProsemirrorWriter) => PandocToken[];

/**
 * Special reader that gets a first shot at blocks (e.g. to convert a
 * paragraph containing a single image into a figure). Returns a boolean.
 */
export type PandocBlockReaderFn = (schema: Schema, tok: PandocToken, writer: ProsemirrorWriter) => boolean;

/**
 * Reader that gets a first shot at inline html (e.g. an image node parsing
 * an <img> tag). The writer is optional. Returns a boolean.
 */
export type PandocInlineHTMLReaderFn = (schema: Schema, html: string, writer?: ProsemirrorWriter) => boolean;
248
/**
 * Writer that token readers use to build up the ProseMirror document.
 */
export interface ProsemirrorWriter {
  /** Open a node container (to be closed later with closeNode). */
  openNode(type: NodeType, attrs: {}): void;
  /** Close the currently open node container, returning a node. */
  closeNode(): ProsemirrorNode;

  /** Special open call for note node containers. */
  openNoteNode(ref: string): void;

  /** Add a node to the current container; may return null. */
  addNode(type: NodeType, attrs: {}, content: ProsemirrorNode[]): ProsemirrorNode | null;

  /** Open a mark. */
  openMark(mark: Mark): void;
  /** Close a mark. */
  closeMark(mark: Mark): void;

  /** Add text to the current node using the current mark set. */
  writeText(text: string): void;

  /** Write tokens into the current node. */
  writeTokens(tokens: PandocToken[]): void;

  /** See whether any inline HTML readers want to handle this html. */
  hasInlineHTMLWriter(html: string): boolean;
  writeInlineHTML(html: string): void;

  /** Log an unrecognized token type. */
  logUnrecognized(token: string): void;

  /** Log the presence of example lists. */
  logExampleList(): void;

  /**
   * Query whether a given node type is open (useful for e.g. conditional
   * behavior when in a list or table).
   */
  isNodeOpen(type: NodeType): boolean;
}
284
/**
 * Pairs a name with the function that writes nodes to pandoc output.
 */
export interface PandocNodeWriter {
  readonly name: string;
  /** Writer function. */
  readonly write: PandocNodeWriterFn;
}
289
/** Writes a ProseMirror node to the given pandoc output. */
export type PandocNodeWriterFn = (output: PandocOutput, node: ProsemirrorNode) => void;

/** Transforms a markdown string into another markdown string. */
export type PandocPreprocessorFn = (markdown: string) => string;

/** Transforms a ProseMirror document node into another node. */
export type PandocPostprocessorFn = (doc: ProsemirrorNode) => ProsemirrorNode;
295
/**
 * Describes how a mark is written to pandoc output.
 */
export interface PandocMarkWriter {
  /** Pandoc mark name. */
  readonly name: string;

  /**
   * Dictates the order of nesting for marks. This is required because
   * ProseMirror uses a flat structure whereby multiple marks are attached to
   * text nodes; it allows us to e.g. ensure that strong and em always occur
   * outside code.
   */
  readonly priority: number;

  /** Writer function. */
  readonly write: PandocMarkWriterFn;
}
309
/** Writes a mark to the given pandoc output; also receives a `parent` fragment. */
export type PandocMarkWriterFn = (output: PandocOutput, mark: Mark, parent: Fragment) => void;

/** Names of the options accepted by PandocOutput.withOption. */
export type PandocOutputOption = 'writeSpaces';
313
/**
 * Output sink handed to node and mark writers for producing pandoc output.
 */
export interface PandocOutput {
  /** The pandoc extensions in effect for this output. */
  extensions: PandocExtensions;

  write(value: any): void;

  /** Write a token of the given type; `content` may be a callback or a value. */
  writeToken(type: PandocTokenType, content?: (() => void) | any): void;

  writeMark(type: PandocTokenType, parent: Fragment, expelEnclosingWhitespace?: boolean): void;

  writeArray(content: () => void): void;

  /**
   * Write an attribute triple: id, classes and key/value pairs.
   *
   * `keyvalue` is typed as an array of pairs. It was previously the tuple
   * type `[[string, string]]`, which admits exactly one pair; that type is
   * still assignable to the array type, so existing callers are unaffected.
   */
  writeAttr(id?: string, classes?: string[], keyvalue?: Array<[string, string]>): void;

  writeText(text: string | null): void;

  /** Write a link; `attr` may be null and `f` is a callback taking no arguments. */
  writeLink(href: string, title: string, attr: PandocAttr | null, f: () => void): void;

  writeNode(node: ProsemirrorNode): void;
  writeNodes(parent: ProsemirrorNode): void;
  writeNote(note: ProsemirrorNode): void;
  writeInlines(fragment: Fragment): void;

  /** Write raw markdown, given either a fragment or a string. */
  writeRawMarkdown(markdown: Fragment | string, escapeSymbols?: boolean): void;

  /** Invoke `f` with `option` set to `value`. */
  withOption(option: PandocOutputOption, value: boolean, f: () => void): void;
}
330
/**
 * Collect the text from a collection of pandoc AST elements. Marks are
 * ignored, which is useful for AST elements that support marks but whose
 * ProseMirror equivalent does not (e.g. image alt text).
 *
 * https://github.com/jgm/pandoc/blob/83880b0dbc318703babfbb6905b1046fa48f1216/src/Text/Pandoc/Shared.hs#L439
 *
 * @param c       the elements to stringify
 * @param unemoji when true, a span whose first class is 'emoji' contributes
 *                the value of its first key/value attribute instead of the
 *                text of its children. The flag applies at every nesting
 *                level (it is passed along on each recursive call).
 * @returns the concatenated text of all elements
 */
export function stringifyTokens(c: PandocToken[], unemoji = false): string {
  return c
    .map(elem => {
      switch (elem.t) {
        case PandocTokenType.Str:
          return elem.c;

        // every kind of whitespace or break becomes a single space
        case PandocTokenType.Space:
        case PandocTokenType.SoftBreak:
        case PandocTokenType.LineBreak:
          return ' ';

        case PandocTokenType.Link:
          return stringifyTokens(elem.c[kLinkChildren], unemoji);

        case PandocTokenType.Span: {
          const attr = pandocAttrReadAST(elem, kSpanAttr);
          const isEmoji = attr.classes && attr.classes[0] === 'emoji';
          // only substitute when there actually is an attribute to read;
          // otherwise fall back to the span's own text
          if (unemoji && isEmoji && attr.keyvalue && attr.keyvalue.length > 0) {
            return attr.keyvalue[0][1];
          }
          return stringifyTokens(elem.c[kSpanChildren], unemoji);
        }

        case PandocTokenType.Quoted: {
          const type = elem.c[kQuoteType].t;
          const quote = type === QuoteType.SingleQuote ? "'" : '"';
          return quote + stringifyTokens(elem.c[kQuoteChildren], unemoji) + quote;
        }

        case PandocTokenType.Math:
          return stringifyMath(elem);

        case PandocTokenType.Code:
          return elem.c[kCodeText];

        default:
          // any other element: descend into array content, and contribute
          // nothing when there is no content or it is not an array
          return Array.isArray(elem.c) ? stringifyTokens(elem.c, unemoji) : '';
      }
    })
    .join('');
}
372
/**
 * Invoke `f` for every token in `tokens`, including tokens nested inside
 * token content. The traversal is delegated to mapTokens with a callback
 * that hands each token back unchanged; the mapped result is discarded.
 *
 * @param tokens tokens to traverse
 * @param f      callback invoked once per token
 */
export function forEachToken(tokens: PandocToken[], f: (tok: PandocToken) => void) {
  const visit = (tok: PandocToken): PandocToken => {
    f(tok);
    return tok;
  };
  mapTokens(tokens, visit);
}
379
/**
 * Apply `f` to every token in `tokens`, including tokens nested (at any
 * depth of arrays) inside token content.
 *
 * `f` is called on a token before that token's children are visited, and the
 * children that are visited are those of the token returned by `f`. Content
 * values that are neither tokens nor arrays are passed through untouched.
 *
 * Note that when a returned token has array content, its `c` property is
 * reassigned to the mapped array. If `f` returns the token it was given,
 * that input token is therefore modified in place.
 *
 * @param tokens tokens to map
 * @param f      mapping function, called once per token
 * @returns a new array holding the mapped top-level tokens
 */
export function mapTokens(tokens: PandocToken[], f: (tok: PandocToken) => PandocToken): PandocToken[] {
  // a token is any non-null object with its own 't' property; the check goes
  // through Object.prototype so that objects without a prototype (or with a
  // shadowed hasOwnProperty) are handled as well
  function isToken(val: any): boolean {
    return val !== null && typeof val === 'object' && Object.prototype.hasOwnProperty.call(val, 't');
  }

  // only array content is descended into
  function tokenHasChildren(tok: PandocToken): boolean {
    return tok !== null && typeof tok === 'object' && Array.isArray(tok.c);
  }

  function mapValue(val: any): any {
    if (isToken(val)) {
      return mapToken(val);
    }
    if (Array.isArray(val)) {
      return val.map(mapValue);
    }
    return val;
  }

  function mapToken(tok: PandocToken): PandocToken {
    const mappedTok = f(tok);
    if (tokenHasChildren(mappedTok)) {
      mappedTok.c = mappedTok.c.map(mapValue);
    }
    return mappedTok;
  }

  return tokens.map(mapToken);
}
413
/**
 * Return the content of token `t` with every backslash doubled.
 *
 * @param t token whose content `c` is a string
 */
export function tokenTextEscaped(t: PandocToken) {
  const backslash = '\\';
  const segments = t.c.split(backslash);
  return segments.join(backslash + backslash);
}
417
/**
 * Sort marks by the priority of their mark writers, in descending order
 * (highest priority first).
 *
 * The sort is performed on a copy: the `marks` array passed in is left in
 * its original order, so mark arrays that are shared with document nodes
 * are never reordered as a side effect.
 *
 * @param marks       marks to sort
 * @param markWriters mark writers keyed by mark type name; there must be an
 *                    entry for the type of every mark in `marks`
 * @returns a new array containing the same marks, ordered by priority
 */
export function marksByPriority(marks: Mark[], markWriters: { [key: string]: PandocMarkWriter }) {
  const priorityOf = (mark: Mark) => markWriters[mark.type.name].priority;
  // slice() first because Array.prototype.sort sorts in place
  return marks.slice().sort((a: Mark, b: Mark) => priorityOf(b) - priorityOf(a));
}
432