/*
 * pandoc_converter.ts
 *
 * Copyright (C) 2021 by RStudio, PBC
 *
 * Unless you have received this program directly from RStudio pursuant
 * to the terms of a commercial license agreement with RStudio, then
 * this program is licensed to you under the terms of version 3 of the
 * GNU Affero General Public License. This program is distributed WITHOUT
 * ANY EXPRESS OR IMPLIED WARRANTY, INCLUDING THOSE OF NON-INFRINGEMENT,
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. Please refer to the
 * AGPL (http://www.gnu.org/licenses/agpl-3.0.txt) for more details.
 *
 */

import { Schema, Node as ProsemirrorNode } from 'prosemirror-model';

import {
  PandocServer,
  PandocTokenReader,
  PandocNodeWriter,
  PandocMarkWriter,
  PandocPreprocessorFn,
  PandocBlockReaderFn,
  PandocPostprocessorFn,
  PandocInlineHTMLReaderFn,
  PandocWriterOptions,
  PandocTokensFilterFn,
} from '../api/pandoc';

import { haveTableCellsWithInlineRcode } from '../api/rmd';

import { pandocFormatWith, PandocFormat, kGfmFormat, kCommonmarkFormat } from '../api/pandoc_format';
import { PandocCapabilities } from '../api/pandoc_capabilities';
import { PandocBlockCapsuleFilter, pandocMarkdownWithBlockCapsules } from '../api/pandoc_capsule';

import { ExtensionManager } from '../editor/editor-extensions';

import { pandocToProsemirror } from './pandoc_to_prosemirror';
import { pandocFromProsemirror } from './pandoc_from_prosemirror';

/** Line wrapping modes reported for a markdown document. */
export type PandocLineWrapping = 'none' | 'column' | 'sentence';

/** Value produced by `PandocConverter.toProsemirror`. */
export interface PandocToProsemirrorResult {
  doc: ProsemirrorNode;
  line_wrapping: PandocLineWrapping;
  unrecognized: string[];
  example_lists: boolean;
  unparsed_meta: { [key: string]: any };
}

/**
 * Converts between markdown source and ProseMirror documents by round-tripping
 * through the pandoc AST.
 *
 * All readers, writers, filters and pre/post-processors are collected once from
 * the `ExtensionManager` at construction time; the actual pandoc invocations are
 * delegated to the supplied `PandocServer`.
 */
export class PandocConverter {
  private readonly schema: Schema;
  private readonly preprocessors: readonly PandocPreprocessorFn[];
  private readonly postprocessors: readonly PandocPostprocessorFn[];
  private readonly readers: readonly PandocTokenReader[];
  private readonly tokensFilters: readonly PandocTokensFilterFn[];
  private readonly blockReaders: readonly PandocBlockReaderFn[];
  private readonly inlineHTMLReaders: readonly PandocInlineHTMLReaderFn[];
  private readonly blockCapsuleFilters: readonly PandocBlockCapsuleFilter[];
  private readonly nodeWriters: readonly PandocNodeWriter[];
  private readonly markWriters: readonly PandocMarkWriter[];
  private readonly pandoc: PandocServer;
  private readonly pandocCapabilities: PandocCapabilities;

  /**
   * @param schema ProseMirror schema handed to the pandoc -> ProseMirror reader.
   * @param extensions Source of every pandoc reader/writer/filter hook used by this converter.
   * @param pandoc Server used to run the markdown <-> AST conversions.
   * @param pandocCapabilities Capabilities of the pandoc in use (its `api_version` is used when writing).
   */
  constructor(
    schema: Schema,
    extensions: ExtensionManager,
    pandoc: PandocServer,
    pandocCapabilities: PandocCapabilities,
  ) {
    this.schema = schema;

    this.preprocessors = extensions.pandocPreprocessors();
    this.postprocessors = extensions.pandocPostprocessors();
    this.readers = extensions.pandocReaders();
    this.tokensFilters = extensions.pandocTokensFilters();
    this.blockReaders = extensions.pandocBlockReaders();
    this.inlineHTMLReaders = extensions.pandocInlineHTMLReaders();
    this.blockCapsuleFilters = extensions.pandocBlockCapsuleFilters();
    this.nodeWriters = extensions.pandocNodeWriters();
    this.markWriters = extensions.pandocMarkWriters();

    this.pandoc = pandoc;
    this.pandocCapabilities = pandocCapabilities;
  }

  /**
   * Parse markdown into a ProseMirror document.
   *
   * Steps: run the preprocessors, wrap blocks in source capsules, ask pandoc for
   * the AST, convert the AST with the registered readers, then run the
   * post-processors over the resulting document.
   *
   * @param markdown Markdown source to convert.
   * @param format Pandoc format describing the source (its `fullName` and `extensions` are used).
   * @returns The converted document plus the metadata gathered by the reader.
   */
  public async toProsemirror(markdown: string, format: PandocFormat): Promise<PandocToProsemirrorResult> {
    // adjust format. we always need to *read* raw_html, raw_attribute, and backtick_code_blocks b/c
    // that's how preprocessors hoist content through pandoc into our prosemirror token parser.
    // we always need to read with auto_identifiers so we can catch any auto-generated ids
    // required to fulfill links inside the document (we will strip out heading ids that
    // aren't explicit or a link target using the heading_ids returned with the ast). we also
    // disable 'smart' b/c that causes pandoc to insert non-breaking spaces before selected
    // abbreviations like "e.g."; rather, we do our own implementation of 'smart' when we read
    // PandocTokenType.Str from the ast

    // determine type of auto_ids
    const autoIds = format.extensions.gfm_auto_identifiers ? 'gfm_auto_identifiers' : 'auto_identifiers';
    const targetFormat = adjustedFormat(
      format.fullName,
      ['raw_html', 'raw_attribute', 'backtick_code_blocks', autoIds],
      ['smart'],
    );

    // run preprocessors (each one receives the output of the previous one)
    let source = markdown;
    for (const preprocessor of this.preprocessors) {
      source = preprocessor(source);
    }

    // create source capsules (the untouched input is passed along for aligning capsule positions)
    for (const filter of this.blockCapsuleFilters) {
      source = pandocMarkdownWithBlockCapsules(markdown, source, filter);
    }

    const ast = await this.pandoc.markdownToAst(source, targetFormat, []);
    const result = pandocToProsemirror(
      ast,
      this.schema,
      format.extensions,
      this.readers,
      this.tokensFilters,
      this.blockReaders,
      this.inlineHTMLReaders,
      this.blockCapsuleFilters,
    );

    // run post-processors
    for (const postprocessor of this.postprocessors) {
      result.doc = postprocessor(result.doc);
    }

    // return the doc
    return result;
  }

  // NOTE: For a plain markdown file, this is the closest we can come to canonicalizing w/ just pandoc:
  //
  //   pandoc MANUAL.md --to markdown-auto_identifiers-smart -o MANUAL.md --self-contained --atx-headers --wrap=none
  //
  // For R Markdown files, we would need to pull out the Rmd chunks before sending to pandoc.
  //

  /**
   * Serialize a ProseMirror document back to markdown.
   *
   * @param doc Document to write.
   * @param pandocFormat Target pandoc format (its `fullName` is the basis for the writer format).
   * @param options Writer options (heading style, dpi, reference location/prefix, wrapping).
   * @returns Markdown text with all line endings normalized to `\n`.
   */
  public async fromProsemirror(
    doc: ProsemirrorNode,
    pandocFormat: PandocFormat,
    options: PandocWriterOptions,
  ): Promise<string> {
    // generate pandoc ast
    const output = pandocFromProsemirror(
      doc,
      this.pandocCapabilities.api_version,
      pandocFormat,
      this.nodeWriters,
      this.markWriters,
    );

    // adjust format. we always need to be able to write raw_attribute b/c that's how preprocessors
    // hoist content through pandoc into our prosemirror token parser. since we open this door when
    // reading, users could end up writing raw inlines, and in that case we want them to echo back
    // to the source document just the way they came in. for writing markdown from pm we don't
    // ever want to generate auto identifiers so we disable them here. we also disable smart b/c
    // we do this manually above in pandocFromProsemirror (so we can avoid pandoc's insertion of
    // nbsp's after abbreviations, which is more appropriate for final output than editing)
    let format = adjustedFormat(
      pandocFormat.fullName,
      ['raw_html', 'raw_attribute'], // always enable
      ['auto_identifiers', 'gfm_auto_identifiers', 'smart'], // always disable
    );

    // disable selected format options
    format = pandocFormatWith(format, disabledFormatOptions(format, doc), '');

    // prepare pandoc options
    const pandocOptions: string[] = [];
    if (!options.atxHeaders) {
      pandocOptions.push('--markdown-headings=setext');
    }
    if (options.dpi) {
      // pandoc's --dpi takes a required value; emitting the bare flag would make pandoc
      // consume the next argument as the dpi.
      // NOTE(review): assumes options.dpi is numeric -- confirm against PandocWriterOptions
      pandocOptions.push(`--dpi=${options.dpi}`);
    }

    // default to block level references (validate known types)
    const requestedLocation = options.references?.location;
    const referenceLocation =
      requestedLocation && ['block', 'section', 'document'].includes(requestedLocation)
        ? requestedLocation
        : 'block';
    pandocOptions.push(`--reference-location=${referenceLocation}`);

    // references prefix (if any)
    if (options.references?.prefix) {
      pandocOptions.push('--id-prefix', options.references.prefix);
    }

    // provide wrap options
    pandocOptions.push(...wrapOptions(options));

    // render to markdown
    const markdown = await this.pandoc.astToMarkdown(output.ast, format, pandocOptions);

    // normalize newlines (don't know if pandoc uses \r\n on windows)
    return markdown.replace(/\r\n|\n\r|\r/g, '\n');
  }
}

/**
 * Build a pandoc format string with some extensions forced on and others forced off.
 *
 * `+ext` is appended for every entry in `extensions` and `-ext` for every entry in
 * `disabled`. Afterwards every `-ext` for a forced-on extension is removed from the
 * result so the format never both enables and disables the same extension.
 *
 * @param format Base format string.
 * @param extensions Extensions that must be enabled.
 * @param disabled Extensions that must be disabled.
 * @returns The adjusted format string.
 */
function adjustedFormat(format: string, extensions: string[], disabled: string[]): string {
  const enableFlags = extensions.map(ext => `+${ext}`).join('');
  const disableFlags = disabled.map(ext => `-${ext}`).join('');
  let newFormat = pandocFormatWith(format, '', enableFlags + disableFlags);

  // any extension specified needs to not have a - anywhere in the format
  // (split/join removes every occurrence; replace() with a string pattern only removes the first)
  for (const ext of extensions) {
    newFormat = newFormat.split(`-${ext}`).join('');
  }

  return newFormat;
}

/**
 * Compute the extension-disabling suffix to apply to the writer format.
 *
 * @param format Writer format string (only inspected for a gfm/commonmark prefix).
 * @param doc Document being written (checked for table cells containing inline R code).
 * @returns A string of `-extension` flags, or `''` when nothing should be disabled.
 */
function disabledFormatOptions(format: string, doc: ProsemirrorNode): string {
  // gfm and commonmark variants don't allow simple/multiline/grid tables (just pipe tables)
  // and it's an error to even include these in the markdown format specifier -- so for
  // these modes we don't disable anything
  if (format.startsWith(kGfmFormat) || format.startsWith(kCommonmarkFormat)) {
    return '';
  }

  // (prefer pipe and grid tables). users can still force the availability of these by
  // adding those format flags but all known markdown variants that support tables also
  // support pipe tables so this seems unlikely to ever be required.
  const disabledTableTypes = '-simple_tables-multiline_tables';

  // if there are tables with inline R code then disable grid tables (as the inline
  // R code will mess up the column boundaries)
  return haveTableCellsWithInlineRcode(doc) ? `${disabledTableTypes}-grid_tables` : disabledTableTypes;
}

/**
 * Translate the `wrap` writer option into pandoc command line arguments.
 *
 * A `wrap` value that parses to a positive integer yields `--wrap=auto` together with
 * `--columns=<n>`. Everything else (unset, `'none'`, `'sentence'`, non-numeric,
 * zero or negative) yields `--wrap=none`.
 *
 * @param options Writer options; only `wrap` is read.
 * @returns The pandoc arguments controlling wrapping.
 */
function wrapOptions(options: PandocWriterOptions): string[] {
  const wrap = options.wrap;
  if (wrap && wrap !== 'none' && wrap !== 'sentence') {
    const column = parseInt(wrap, 10);
    // NaN, 0 and negative values are not usable column widths
    if (column > 0) {
      return ['--wrap=auto', `--columns=${column}`];
    }
  }
  return ['--wrap=none'];
}