1 /*
2  * MD4C: Markdown parser for C
3  * (http://github.com/mity/md4c)
4  *
5  * Copyright (c) 2016-2020 Martin Mitas
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a
8  * copy of this software and associated documentation files (the "Software"),
9  * to deal in the Software without restriction, including without limitation
10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11  * and/or sell copies of the Software, and to permit persons to whom the
12  * Software is furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included in
15  * all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
23  * IN THE SOFTWARE.
24  */
25 
26 #ifndef MD4C_H
27 #define MD4C_H
28 
29 #ifdef __cplusplus
30     extern "C" {
31 #endif
32 
/* MD_CHAR is the character unit of every string exchanged with the parser.
 *
 * By default it is plain char. Defining the macro MD4C_USE_UTF16 switches it
 * to WCHAR (UTF-16 support); this is available only on Windows, and the macro
 * has to be defined both when building MD4C and when including this header
 * in your code. */
#ifndef MD4C_USE_UTF16
    typedef char            MD_CHAR;
#elif defined _WIN32
    #include <windows.h>
    typedef WCHAR           MD_CHAR;
#else
    #error MD4C_USE_UTF16 is only supported on Windows.
#endif
46 
/* Unsigned integer types used by this API for string sizes (MD_SIZE) and for
 * offsets (MD_OFFSET). */
typedef unsigned int MD_SIZE;
typedef unsigned int MD_OFFSET;
49 
50 
/* Block types.
 *
 * A block is a part of the document hierarchy structure, such as a paragraph
 * or a list item. Where a detail structure is named below, that structure
 * carries the type-specific information handed to the block callbacks
 * through their 'detail' argument.
 */
typedef enum MD_BLOCKTYPE {
    MD_BLOCK_DOC = 0,   /* <body>...</body> */
    MD_BLOCK_QUOTE,     /* <blockquote>...</blockquote> */
    MD_BLOCK_UL,        /* <ul>...</ul>; detail: MD_BLOCK_UL_DETAIL */
    MD_BLOCK_OL,        /* <ol>...</ol>; detail: MD_BLOCK_OL_DETAIL */
    MD_BLOCK_LI,        /* <li>...</li>; detail: MD_BLOCK_LI_DETAIL */
    MD_BLOCK_HR,        /* <hr> */
    MD_BLOCK_H,         /* <h1>...</h1> (for levels up to 6); detail: MD_BLOCK_H_DETAIL */

    /* <pre><code>...</code></pre>; detail: MD_BLOCK_CODE_DETAIL.
     * The text lines within code blocks are terminated with '\n' instead of
     * an explicit MD_TEXT_BR. */
    MD_BLOCK_CODE,

    /* Raw HTML block. It does not correspond to any particular HTML tag;
     * its contents _are_ raw HTML source intended to be put verbatim into
     * the HTML output. */
    MD_BLOCK_HTML,

    MD_BLOCK_P,         /* <p>...</p> */

    /* <table>...</table> and its contents.
     * MD_BLOCK_TH and MD_BLOCK_TD come with detail MD_BLOCK_TD_DETAIL.
     * All of these are used only if extension MD_FLAG_TABLES is enabled. */
    MD_BLOCK_TABLE,
    MD_BLOCK_THEAD,
    MD_BLOCK_TBODY,
    MD_BLOCK_TR,
    MD_BLOCK_TH,
    MD_BLOCK_TD
} MD_BLOCKTYPE;
103 
/* Span types.
 *
 * A span is an in-line piece of a document which should be rendered with the
 * same font, color and other attributes. A sequence of spans forms a block
 * like a paragraph or a list item. Where a detail structure is named below,
 * it carries the type-specific information handed to the span callbacks
 * through their 'detail' argument.
 */
typedef enum MD_SPANTYPE {
    /* <em>...</em>
     * The value is pinned to zero explicitly, as in the other enumerations
     * of this header. */
    MD_SPAN_EM = 0,

    /* <strong>...</strong> */
    MD_SPAN_STRONG,

    /* <a href="xxx">...</a>
     * Detail: Structure MD_SPAN_A_DETAIL. */
    MD_SPAN_A,

    /* <img src="xxx">
     * Detail: Structure MD_SPAN_IMG_DETAIL.
     * Note: Image text can contain nested spans and even nested images.
     * If rendered into ALT attribute of HTML <IMG> tag, it's responsibility
     * of the parser to deal with it.
     * NOTE(review): "parser" here presumably means the application's
     * callbacks installed in MD_PARSER -- confirm.
     */
    MD_SPAN_IMG,

    /* <code>...</code> */
    MD_SPAN_CODE,

    /* <del>...</del>
     * Note: Recognized only when MD_FLAG_STRIKETHROUGH is enabled.
     */
    MD_SPAN_DEL,

    /* For recognizing inline ($) and display ($$) equations
     * Note: Recognized only when MD_FLAG_LATEXMATHSPANS is enabled.
     */
    MD_SPAN_LATEXMATH,
    MD_SPAN_LATEXMATH_DISPLAY,

    /* Wiki links
     * Detail: Structure MD_SPAN_WIKILINK_DETAIL.
     * Note: Recognized only when MD_FLAG_WIKILINKS is enabled.
     */
    MD_SPAN_WIKILINK,

    /* <u>...</u>
     * Note: Recognized only when MD_FLAG_UNDERLINE is enabled. */
    MD_SPAN_U
} MD_SPANTYPE;
149 
/* Text types.
 *
 * Text is the actual textual contents of a span.
 */
typedef enum MD_TEXTTYPE {
    MD_TEXT_NORMAL = 0,     /* Normal text. */

    /* NULL character. CommonMark requires the NULL character to be replaced
     * with the replacement char U+FFFD; reporting it as a separate text type
     * lets the caller do that easily. */
    MD_TEXT_NULLCHAR,

    /* Line breaks.
     * These are not sent from blocks with verbatim output (MD_BLOCK_CODE or
     * MD_BLOCK_HTML); there, '\n' is part of the text itself. */
    MD_TEXT_BR,             /* <br> (hard break) */
    MD_TEXT_SOFTBR,         /* '\n' in source text where it is not semantically meaningful (soft break) */

    /* Entity, in one of these forms:
     * (a) Named entity, e.g. &nbsp;
     *     (MD4C does not have a list of known entities. Anything matching
     *     the regexp /&[A-Za-z][A-Za-z0-9]{1,47};/ is treated as a named
     *     entity.)
     * (b) Numerical entity, e.g. &#1234;
     * (c) Hexadecimal entity, e.g. &#x12AB;
     *
     * As MD4C is mostly encoding agnostic, the application gets the verbatim
     * entity text in the MD_PARSER::text() callback. */
    MD_TEXT_ENTITY,

    /* Text in a code block (inside MD_BLOCK_CODE) or inlined code (`code`).
     * Inside MD_BLOCK_CODE it includes spaces for indentation and '\n' for
     * new lines. MD_TEXT_BR and MD_TEXT_SOFTBR are not sent for this kind of
     * text. */
    MD_TEXT_CODE,

    /* Raw HTML text. If it is the contents of a raw HTML block (i.e. not an
     * inline raw HTML), MD_TEXT_BR and MD_TEXT_SOFTBR are not used and the
     * text contains verbatim '\n' for the new lines. */
    MD_TEXT_HTML,

    /* Text inside an equation. It is processed the same way as inlined code
     * spans (`code`). */
    MD_TEXT_LATEXMATH
} MD_TEXTTYPE;
192 
193 
/* Alignment values (see MD_BLOCK_TD_DETAIL::align). */
typedef enum MD_ALIGN {
    /* Used when the alignment is unspecified. */
    MD_ALIGN_DEFAULT = 0,

    MD_ALIGN_LEFT,
    MD_ALIGN_CENTER,
    MD_ALIGN_RIGHT
} MD_ALIGN;
201 
202 
203 /* String attribute.
204  *
205  * This wraps strings which are outside of a normal text flow and which are
206  * propagated within various detailed structures, but which still may contain
207  * string portions of different types like e.g. entities.
208  *
 * So, for example, let's consider this image:
210  *
211  *     ![image alt text](http://example.org/image.png 'foo &quot; bar')
212  *
213  * The image alt text is propagated as a normal text via the MD_PARSER::text()
214  * callback. However, the image title ('foo &quot; bar') is propagated as
215  * MD_ATTRIBUTE in MD_SPAN_IMG_DETAIL::title.
216  *
217  * Then the attribute MD_SPAN_IMG_DETAIL::title shall provide the following:
218  *  -- [0]: "foo "   (substr_types[0] == MD_TEXT_NORMAL; substr_offsets[0] == 0)
219  *  -- [1]: "&quot;" (substr_types[1] == MD_TEXT_ENTITY; substr_offsets[1] == 4)
220  *  -- [2]: " bar"   (substr_types[2] == MD_TEXT_NORMAL; substr_offsets[2] == 10)
221  *  -- [3]: (n/a)    (n/a                              ; substr_offsets[3] == 14)
222  *
223  * Note that these invariants are always guaranteed:
224  *  -- substr_offsets[0] == 0
225  *  -- substr_offsets[LAST+1] == size
 *  -- Currently, only MD_TEXT_NORMAL, MD_TEXT_ENTITY, MD_TEXT_NULLCHAR
 *     substrings can appear. This could change only if the specification
 *     changes.
229  */
230 typedef struct MD_ATTRIBUTE {
231     const MD_CHAR* text;
232     MD_SIZE size;
233     const MD_TEXTTYPE* substr_types;
234     const MD_OFFSET* substr_offsets;
235 } MD_ATTRIBUTE;
236 
237 
238 /* Detailed info for MD_BLOCK_UL. */
239 typedef struct MD_BLOCK_UL_DETAIL {
240     int is_tight;           /* Non-zero if tight list, zero if loose. */
241     MD_CHAR mark;           /* Item bullet character in MarkDown source of the list, e.g. '-', '+', '*'. */
242 } MD_BLOCK_UL_DETAIL;
243 
244 /* Detailed info for MD_BLOCK_OL. */
245 typedef struct MD_BLOCK_OL_DETAIL {
246     unsigned start;         /* Start index of the ordered list. */
247     int is_tight;           /* Non-zero if tight list, zero if loose. */
248     MD_CHAR mark_delimiter; /* Character delimiting the item marks in MarkDown source, e.g. '.' or ')' */
249 } MD_BLOCK_OL_DETAIL;
250 
251 /* Detailed info for MD_BLOCK_LI. */
252 typedef struct MD_BLOCK_LI_DETAIL {
253     int is_task;            /* Can be non-zero only with MD_FLAG_TASKLISTS */
254     MD_CHAR task_mark;      /* If is_task, then one of 'x', 'X' or ' '. Undefined otherwise. */
255     MD_OFFSET task_mark_offset;  /* If is_task, then offset in the input of the char between '[' and ']'. */
256 } MD_BLOCK_LI_DETAIL;
257 
/* Detail structure accompanying MD_BLOCK_H. */
typedef struct MD_BLOCK_H_DETAIL {
    /* Header level (1 - 6). */
    unsigned int level;
} MD_BLOCK_H_DETAIL;
262 
263 /* Detailed info for MD_BLOCK_CODE. */
264 typedef struct MD_BLOCK_CODE_DETAIL {
265     MD_ATTRIBUTE info;
266     MD_ATTRIBUTE lang;
267     MD_CHAR fence_char;     /* The character used for fenced code block; or zero for indented code block. */
268 } MD_BLOCK_CODE_DETAIL;
269 
270 /* Detailed info for MD_BLOCK_TH and MD_BLOCK_TD. */
271 typedef struct MD_BLOCK_TD_DETAIL {
272     MD_ALIGN align;
273 } MD_BLOCK_TD_DETAIL;
274 
275 /* Detailed info for MD_SPAN_A. */
276 typedef struct MD_SPAN_A_DETAIL {
277     MD_ATTRIBUTE href;
278     MD_ATTRIBUTE title;
279 } MD_SPAN_A_DETAIL;
280 
281 /* Detailed info for MD_SPAN_IMG. */
282 typedef struct MD_SPAN_IMG_DETAIL {
283     MD_ATTRIBUTE src;
284     MD_ATTRIBUTE title;
285 } MD_SPAN_IMG_DETAIL;
286 
287 /* Detailed info for MD_SPAN_WIKILINK. */
288 typedef struct MD_SPAN_WIKILINK {
289     MD_ATTRIBUTE target;
290 } MD_SPAN_WIKILINK_DETAIL;
291 
/* Parser flags: extensions to, and deviations from, the CommonMark
 * specification.
 *
 * With MD_PARSER::flags == 0 the parser follows the CommonMark specification.
 * Each flag below turns on one extension or deviation; combine them with
 * bitwise OR.
 */
#define MD_FLAG_COLLAPSEWHITESPACE          0x0001  /* Collapse non-trivial whitespace in MD_TEXT_NORMAL into a single ' '. */
#define MD_FLAG_PERMISSIVEATXHEADERS        0x0002  /* ATX headers do not require the space ( ###header ). */
#define MD_FLAG_PERMISSIVEURLAUTOLINKS      0x0004  /* URLs are recognized as autolinks even without '<', '>'. */
#define MD_FLAG_PERMISSIVEEMAILAUTOLINKS    0x0008  /* E-mails are recognized as autolinks even without '<', '>' and 'mailto:'. */
#define MD_FLAG_NOINDENTEDCODEBLOCKS        0x0010  /* No indented code blocks. (Only fenced code works.) */
#define MD_FLAG_NOHTMLBLOCKS                0x0020  /* No raw HTML blocks. */
#define MD_FLAG_NOHTMLSPANS                 0x0040  /* No raw HTML (inline). */
#define MD_FLAG_TABLES                      0x0100  /* Tables extension. */
#define MD_FLAG_STRIKETHROUGH               0x0200  /* Strikethrough extension. */
#define MD_FLAG_PERMISSIVEWWWAUTOLINKS      0x0400  /* WWW autolinks (even without any scheme prefix, if they begin with 'www.'). */
#define MD_FLAG_TASKLISTS                   0x0800  /* Task list extension. */
#define MD_FLAG_LATEXMATHSPANS              0x1000  /* $ and $$ containing LaTeX equations. */
#define MD_FLAG_WIKILINKS                   0x2000  /* Wiki links extension. */
#define MD_FLAG_UNDERLINE                   0x4000  /* Underline extension (also disables '_' for normal emphasis). */

/* Shorthands combining several of the flags above. */
#define MD_FLAG_PERMISSIVEAUTOLINKS         (MD_FLAG_PERMISSIVEURLAUTOLINKS | MD_FLAG_PERMISSIVEEMAILAUTOLINKS | MD_FLAG_PERMISSIVEWWWAUTOLINKS)
#define MD_FLAG_NOHTML                      (MD_FLAG_NOHTMLSPANS | MD_FLAG_NOHTMLBLOCKS)
314 
/* Ready-made flag sets for well-known Markdown dialects.
 *
 * Only a subset of the features of the named dialect may be supported; each
 * constant simply enables those extensions which bring the parser as close
 * to the dialect as the implemented features allow.
 *
 * ABI compatibility note: The meaning of these constants can change over
 * time, as new extensions bringing the dialect closer to the original get
 * implemented.
 */
#define MD_DIALECT_COMMONMARK               0
#define MD_DIALECT_GITHUB                   (MD_FLAG_PERMISSIVEAUTOLINKS | MD_FLAG_TABLES | MD_FLAG_STRIKETHROUGH | MD_FLAG_TASKLISTS)
326 
327 /* Parser structure.
328  */
329 typedef struct MD_PARSER {
330     /* Reserved. Set to zero.
331      */
332     unsigned abi_version;
333 
334     /* Dialect options. Bitmask of MD_FLAG_xxxx values.
335      */
336     unsigned flags;
337 
338     /* Caller-provided rendering callbacks.
339      *
340      * For some block/span types, more detailed information is provided in a
341      * type-specific structure pointed by the argument 'detail'.
342      *
343      * The last argument of all callbacks, 'userdata', is just propagated from
344      * md_parse() and is available for any use by the application.
345      *
346      * Note any strings provided to the callbacks as their arguments or as
347      * members of any detail structure are generally not zero-terminated.
348      * Application has to take the respective size information into account.
349      *
350      * Any rendering callback may abort further parsing of the document by
351      * returning non-zero.
352      */
353     int (*enter_block)(MD_BLOCKTYPE /*type*/, void* /*detail*/, void* /*userdata*/);
354     int (*leave_block)(MD_BLOCKTYPE /*type*/, void* /*detail*/, void* /*userdata*/);
355 
356     int (*enter_span)(MD_SPANTYPE /*type*/, void* /*detail*/, void* /*userdata*/);
357     int (*leave_span)(MD_SPANTYPE /*type*/, void* /*detail*/, void* /*userdata*/);
358 
359     int (*text)(MD_TEXTTYPE /*type*/, const MD_CHAR* /*text*/, MD_SIZE /*size*/, void* /*userdata*/);
360 
361     /* Debug callback. Optional (may be NULL).
362      *
363      * If provided and something goes wrong, this function gets called.
364      * This is intended for debugging and problem diagnosis for developers;
365      * it is not intended to provide any errors suitable for displaying to an
366      * end user.
367      */
368     void (*debug_log)(const char* /*msg*/, void* /*userdata*/);
369 
370     /* Reserved. Set to NULL.
371      */
372     void (*syntax)(void);
373 } MD_PARSER;
374 
375 
/* Older name of MD_PARSER, kept only for backward compatibility.
 * Do not use it in new code. */
typedef struct MD_PARSER MD_RENDERER;
379 
380 
381 /* Parse the Markdown document stored in the string 'text' of size 'size'.
382  * The parser provides callbacks to be called during the parsing so the
383  * caller can render the document on the screen or convert the Markdown
384  * to another format.
385  *
386  * Zero is returned on success. If a runtime error occurs (e.g. a memory
387  * fails), -1 is returned. If the processing is aborted due any callback
388  * returning non-zero, the return value of the callback is returned.
389  */
390 int md_parse(const MD_CHAR* text, MD_SIZE size, const MD_PARSER* parser, void* userdata);
391 
392 
393 #ifdef __cplusplus
394     }  /* extern "C" { */
395 #endif
396 
397 #endif  /* MD4C_H */
398