/*
 * pandoc.ts
 *
 * Copyright (C) 2021 by RStudio, PBC
 *
 * Unless you have received this program directly from RStudio pursuant
 * to the terms of a commercial license agreement with RStudio, then
 * this program is licensed to you under the terms of version 3 of the
 * GNU Affero General Public License. This program is distributed WITHOUT
 * ANY EXPRESS OR IMPLIED WARRANTY, INCLUDING THOSE OF NON-INFRINGEMENT,
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. Please refer to the
 * AGPL (http://www.gnu.org/licenses/agpl-3.0.txt) for more details.
 *
 */

import { Fragment, Mark, Node as ProsemirrorNode, Schema, NodeType } from 'prosemirror-model';

import { PandocAttr, pandocAttrReadAST, kSpanChildren, kSpanAttr } from './pandoc_attr';
import { PandocCapabilitiesResult } from './pandoc_capabilities';
import { kQuoteType, kQuoteChildren, QuoteType } from './quote';
import { BibliographyResult } from './bibliography/bibliography-provider_local';

import { stringifyMath } from './math';
import { kCodeText } from './code';
import { kLinkChildren } from './link';

// Contract for the asynchronous pandoc-related operations used by this module's
// clients. Every method returns a promise.
// NOTE(review): presumably backed by a pandoc process/server -- confirm against
// the implementing class.
export interface PandocServer {
  // capabilities
  getCapabilities(): Promise<PandocCapabilitiesResult>;

  // conversion between markdown text and the pandoc ast
  markdownToAst(markdown: string, format: string, options: string[]): Promise<PandocAst>;
  astToMarkdown(ast: PandocAst, format: string, options: string[]): Promise<string>;

  // extensions for a given format
  listExtensions(format: string): Promise<string>;

  // bibliography and citation operations
  getBibliography(
    file: string | null,
    bibliography: string[],
    refBlock: string | null,
    etag: string | null,
  ): Promise<BibliographyResult>;
  addToBibliography(
    bibliography: string,
    project: boolean,
    id: string,
    sourceAsJson: string,
    sourceAsBibTeX: string,
  ): Promise<boolean>;
  citationHTML(file: string | null, sourceAsJson: string, csl: string | null): Promise<string>;
}

// Optional settings for references (both fields may be omitted)
export interface PandocWriterReferencesOptions {
  location?: string; // block | section | document
  prefix?: string;
}

// Optional writer settings (every field may be omitted)
export interface PandocWriterOptions {
  atxHeaders?: boolean;
  references?: PandocWriterReferencesOptions;
  wrap?: string;
  dpi?: number;
}

// One boolean flag per named extension; the index signature additionally
// allows lookup of a flag by an arbitrary extension name.
export interface PandocExtensions {
  abbreviations: boolean;
  all_symbols_escapable: boolean;
  amuse: boolean;
  angle_brackets_escapable: boolean;
  ascii_identifiers: boolean;
  auto_identifiers: boolean;
  autolink_bare_uris: boolean;
  backtick_code_blocks: boolean;
  blank_before_blockquote: boolean;
  blank_before_header: boolean;
  bracketed_spans: boolean;
  citations: boolean;
  compact_definition_lists: boolean;
  definition_lists: boolean;
  east_asian_line_breaks: boolean;
  emoji: boolean;
  empty_paragraphs: boolean;
  epub_html_exts: boolean;
  escaped_line_breaks: boolean;
  example_lists: boolean;
  fancy_lists: boolean;
  fenced_code_attributes: boolean;
  fenced_code_blocks: boolean;
  fenced_divs: boolean;
  footnotes: boolean;
  four_space_rule: boolean;
  gfm_auto_identifiers: boolean;
  grid_tables: boolean;
  hard_line_breaks: boolean;
  header_attributes: boolean;
  ignore_line_breaks: boolean;
  implicit_figures: boolean;
  implicit_header_references: boolean;
  inline_code_attributes: boolean;
  inline_notes: boolean;
  intraword_underscores: boolean;
  latex_macros: boolean;
  line_blocks: boolean;
  link_attributes: boolean;
  lists_without_preceding_blankline: boolean;
  literate_haskell: boolean;
  markdown_attribute: boolean;
  markdown_in_html_blocks: boolean;
  mmd_header_identifiers: boolean;
  mmd_link_attributes: boolean;
  mmd_title_block: boolean;
  multiline_tables: boolean;
  native_divs: boolean;
  native_spans: boolean;
  native_numbering: boolean;
  ntb: boolean;
  old_dashes: boolean;
  pandoc_title_block: boolean;
  pipe_tables: boolean;
  raw_attribute: boolean;
  raw_html: boolean;
  raw_tex: boolean;
  shortcut_reference_links: boolean;
  simple_tables: boolean;
  smart: boolean;
  space_in_atx_header: boolean;
  spaced_reference_links: boolean;
  startnum: boolean;
  strikeout: boolean;
  subscript: boolean;
  superscript: boolean;
  styles: boolean;
  task_lists: boolean;
  table_captions: boolean;
  tex_math_dollars: boolean;
  tex_math_double_backslash: boolean;
  tex_math_single_backslash: boolean;
  yaml_metadata_block: boolean;
  gutenberg: boolean;
  // attributes: boolean; (not yet)
  [key: string]: boolean;
}

// image attributes are available when either the link_attributes or the
// raw_html extension is enabled
export function imageAttributesAvailable(pandocExtensions: PandocExtensions) {
  const { link_attributes: linkAttributes, raw_html: rawHtml } = pandocExtensions;
  return linkAttributes || rawHtml;
}

// split output on LF or CRLF line endings and discard the empty entries
export function parsePandocListOutput(output: string) {
  const lines = output.split(/\r?\n/);
  return lines.filter(line => line.length > 0);
}

// top-level shape of a pandoc ast document
export interface PandocAst {
  blocks: PandocToken[];
  'pandoc-api-version': PandocApiVersion;
  meta: any;
  heading_ids?: string[]; // used only for reading not writing
}

// api version expressed as a list of numeric components
export type PandocApiVersion = number[];

// a single ast element: 't' is the type tag and 'c' the (optional) content
export interface PandocToken {
  t: string;
  c?: any;
}

// https://github.com/jgm/pandoc-types/blob/master/Text/Pandoc/Definition.hs
export enum PandocTokenType {
  Str = 'Str',
  Space = 'Space',
  Strong = 'Strong',
  Emph = 'Emph',
  Code = 'Code',
  Superscript = 'Superscript',
  Subscript = 'Subscript',
  Strikeout = 'Strikeout',
  SmallCaps = 'SmallCaps',
  Underline = 'Underline',
  Quoted = 'Quoted',
  RawInline = 'RawInline',
  RawBlock = 'RawBlock',
  LineBlock = 'LineBlock',
  Para = 'Para',
  Plain = 'Plain',
  Header = 'Header',
  CodeBlock = 'CodeBlock',
  BlockQuote = 'BlockQuote',
  BulletList = 'BulletList',
  OrderedList = 'OrderedList',
  DefinitionList = 'DefinitionList',
  Image = 'Image',
  Link = 'Link',
  Note = 'Note',
  Cite = 'Cite',
  Table = 'Table',
  AlignRight = 'AlignRight',
  AlignLeft = 'AlignLeft',
  AlignDefault = 'AlignDefault',
  AlignCenter = 'AlignCenter',
  ColWidth = 'ColWidth',
  ColWidthDefault = 'ColWidthDefault',
  HorizontalRule = 'HorizontalRule',
  LineBreak = 'LineBreak',
  SoftBreak = 'SoftBreak',
  Math = 'Math',
  InlineMath = 'InlineMath',
  DisplayMath = 'DisplayMath',
  Div = 'Div',
  Span = 'Span',
  Null = 'Null',
}

export interface PandocTokenReader {
  // pandoc token name (e.g. "Str", "Emph", etc.)
  readonly token: PandocTokenType;

  // If present, lets the reader decide whether it actually wants to handle
  // the token, based on factors other than the PandocTokenType
  readonly match?: (tok: PandocToken) => boolean;

  // one and only one of these values must also be set
  readonly text?: boolean;
  readonly node?: string;
  readonly block?: string;
  readonly mark?: string;
  readonly code_block?: boolean;

  // functions for getting attributes and children
  getAttrs?: (tok: PandocToken) => any;
  getChildren?: (tok: PandocToken) => any[];
  getText?: (tok: PandocToken) => string;

  // lower-level handler function that overrides the above handler attributes
  // (they are ignored when handler is specified)
  handler?: (schema: Schema) => (writer: ProsemirrorWriter, tok: PandocToken) => void;

  // post-processor for performing fixups that rely on seeing the entire
  // document (e.g. recognizing implicit header references)
  postprocessor?: PandocPostprocessorFn;
}

// constants used to read the contents of raw blocks
export const kRawBlockFormat = 0;
export const kRawBlockContent = 1;

// filter sequences of tokens (e.g. for reducing some adjacent tokens to a single token)
export type PandocTokensFilterFn = (tokens: PandocToken[], writer: ProsemirrorWriter) => PandocToken[];

// special reader that gets a first shot at blocks (i.e.
// to convert a para w/ a single image into a figure)
export type PandocBlockReaderFn = (schema: Schema, tok: PandocToken, writer: ProsemirrorWriter) => boolean;

// reader that gets a first shot at inline html (e.g. image node parsing an <img> tag)
export type PandocInlineHTMLReaderFn = (schema: Schema, html: string, writer?: ProsemirrorWriter) => boolean;

export interface ProsemirrorWriter {
  // open a node container / close it again (closing yields the node)
  openNode(type: NodeType, attrs: {}): void;
  closeNode(): ProsemirrorNode;

  // special open call for note node containers
  openNoteNode(ref: string): void;

  // add a node to the current container
  addNode(type: NodeType, attrs: {}, content: ProsemirrorNode[]): ProsemirrorNode | null;

  // open and close marks
  openMark(mark: Mark): void;
  closeMark(mark: Mark): void;

  // add text to the current node using the current mark set
  writeText(text: string): void;

  // write tokens into the current node
  writeTokens(tokens: PandocToken[]): void;

  // see if any inline HTML readers want to handle this html
  hasInlineHTMLWriter(html: string): boolean;
  writeInlineHTML(html: string): void;

  // log an unrecognized token type
  logUnrecognized(token: string): void;

  // log the presence of example lists
  logExampleList(): void;

  // query whether a given node type is open
  // (useful for e.g. conditional behavior when in a list or table)
  isNodeOpen(type: NodeType): boolean;
}

// pairs a name with the function that writes the corresponding node
export interface PandocNodeWriter {
  readonly name: string;
  readonly write: PandocNodeWriterFn;
}

// writes a prosemirror node to the given output
export type PandocNodeWriterFn = (output: PandocOutput, node: ProsemirrorNode) => void;

// string-to-string transform applied to markdown
export type PandocPreprocessorFn = (markdown: string) => string;

// document-to-document transform
export type PandocPostprocessorFn = (doc: ProsemirrorNode) => ProsemirrorNode;

export interface PandocMarkWriter {
  // pandoc mark name
  readonly name: string;

  // The 'priority' property allows us to dictate the order of nesting
  // for marks (this is required b/c Prosemirror uses a flat structure
  // whereby multiple marks are attached to text nodes). This allows us
  // to e.g. ensure that strong and em always occur outside code.
  readonly priority: number;

  // writer function
  readonly write: PandocMarkWriterFn;
}

// writes a mark (and its parent fragment) to the given output
export type PandocMarkWriterFn = (output: PandocOutput, mark: Mark, parent: Fragment) => void;

// names of the options accepted by PandocOutput.withOption
export type PandocOutputOption = 'writeSpaces';

// sink that node and mark writers use to emit pandoc output
export interface PandocOutput {
  extensions: PandocExtensions;
  write(value: any): void;
  writeToken(type: PandocTokenType, content?: (() => void) | any): void;
  writeMark(type: PandocTokenType, parent: Fragment, expelEnclosingWhitespace?: boolean): void;
  writeArray(content: () => void): void;
  writeAttr(id?: string, classes?: string[], keyvalue?: [[string, string]]): void;
  writeText(text: string | null): void;
  writeLink(href: string, title: string, attr: PandocAttr | null, f: () => void): void;
  writeNode(node: ProsemirrorNode): void;
  writeNodes(parent: ProsemirrorNode): void;
  writeNote(note: ProsemirrorNode): void;
  writeInlines(fragment: Fragment): void;
  writeRawMarkdown(markdown: Fragment | string, escapeSymbols?: boolean): void;
  withOption(option: PandocOutputOption, value: boolean, f: () => void): void;
}

//
// collect the text from a collection of pandoc ast
// elements (ignores marks, useful for ast elements
// that support marks but whose prosemirror equivalent
// does not, e.g. image alt text)
// https://github.com/jgm/pandoc/blob/83880b0dbc318703babfbb6905b1046fa48f1216/src/Text/Pandoc/Shared.hs#L439
//
// When 'unemoji' is true, a span whose first class is 'emoji' contributes the
// value of its first key/value attribute rather than the text of its children.
// The flag is forwarded to every nested call so that emoji spans inside links,
// quotes, spans and other containers are treated the same as top-level ones.
export function stringifyTokens(c: PandocToken[], unemoji = false): string {
  return c
    .map(elem => {
      switch (elem.t) {
        case PandocTokenType.Str:
          return elem.c;

        case PandocTokenType.Space:
        case PandocTokenType.SoftBreak:
        case PandocTokenType.LineBreak:
          return ' ';

        case PandocTokenType.Link:
          return stringifyTokens(elem.c[kLinkChildren], unemoji);

        case PandocTokenType.Span: {
          const attr = pandocAttrReadAST(elem, kSpanAttr);
          const isEmoji = attr.classes && attr.classes[0] === 'emoji';
          // only substitute when there is actually a key/value pair to read,
          // otherwise fall back to the span's own text
          if (unemoji && isEmoji && attr.keyvalue && attr.keyvalue.length > 0) {
            return attr.keyvalue[0][1];
          }
          return stringifyTokens(elem.c[kSpanChildren], unemoji);
        }

        case PandocTokenType.Quoted: {
          const quote = elem.c[kQuoteType].t === QuoteType.SingleQuote ? "'" : '"';
          return quote + stringifyTokens(elem.c[kQuoteChildren], unemoji) + quote;
        }

        case PandocTokenType.Math:
          return stringifyMath(elem);

        case PandocTokenType.Code:
          return elem.c[kCodeText];

        default:
          // generic container: recurse only when the content is a list
          // (non-array content has no child tokens to collect)
          return Array.isArray(elem.c) ? stringifyTokens(elem.c, unemoji) : '';
      }
    })
    .join('');
}

// call f for every token in the tree (parents before their children);
// return values of f are ignored
export function forEachToken(tokens: PandocToken[], f: (tok: PandocToken) => void) {
  mapTokens(tokens, (tok: PandocToken) => {
    f(tok);
    return tok;
  });
}

// map f over every token in the tree and return the mapped top-level tokens.
// f is applied to a token first; the children of the token *returned by f* are
// then mapped and assigned back onto that returned token's 'c' (so if f returns
// its argument, the argument's 'c' array is replaced in place).
export function mapTokens(tokens: PandocToken[], f: (tok: PandocToken) => PandocToken): PandocToken[] {
  // a token is any non-null object carrying its own 't' property
  function isToken(val: any): boolean {
    return val !== null && typeof val === 'object' && Object.prototype.hasOwnProperty.call(val, 't');
  }

  function tokenHasChildren(tok: PandocToken): boolean {
    return tok !== null && typeof tok === 'object' && Array.isArray(tok.c);
  }

  // content arrays can hold tokens, nested arrays, or plain values
  function mapValue(val: any): any {
    if (isToken(val)) {
      return mapToken(val);
    } else if (Array.isArray(val)) {
      return val.map(mapValue);
    } else {
      return val;
    }
  }

  function mapToken(tok: PandocToken): PandocToken {
    const mappedTok = f(tok);
    if (tokenHasChildren(mappedTok)) {
      mappedTok.c = mappedTok.c.map(mapValue);
    }
    return mappedTok;
  }

  return tokens.map(mapToken);
}

// return the token's (string) content with every backslash doubled
export function tokenTextEscaped(t: PandocToken) {
  return t.c.replace(/\\/g, `\\\\`);
}

// sort marks by priority (in descending order)
//
// Returns a new array; the array passed in is left untouched (Array.prototype.sort
// works in place, and mark arrays handed to this function may be shared with
// the document they came from). Every mark type must have an entry in markWriters.
export function marksByPriority(marks: Mark[], markWriters: { [key: string]: PandocMarkWriter }) {
  return marks.slice().sort((a: Mark, b: Mark) => {
    const aPriority = markWriters[a.type.name].priority;
    const bPriority = markWriters[b.type.name].priority;
    if (aPriority < bPriority) {
      return 1;
    } else if (bPriority < aPriority) {
      return -1;
    } else {
      return 0;
    }
  });
}