1 /*
2  * MD4C: Markdown parser for C
3  * (http://github.com/mity/md4c)
4  *
5  * Copyright (c) 2016-2020 Martin Mitas
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a
8  * copy of this software and associated documentation files (the "Software"),
9  * to deal in the Software without restriction, including without limitation
10  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11  * and/or sell copies of the Software, and to permit persons to whom the
12  * Software is furnished to do so, subject to the following conditions:
13  *
14  * The above copyright notice and this permission notice shall be included in
15  * all copies or substantial portions of the Software.
16  *
17  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
23  * IN THE SOFTWARE.
24  */
25 
26 #include "md4c.h"
27 
28 #include <limits.h>
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
32 
33 
34 /*****************************
35  ***  Miscellaneous Stuff  ***
36  *****************************/
37 
#if !defined(__STDC_VERSION__) || __STDC_VERSION__ < 199901L
    /* The "inline" keyword became part of the C language only with C99
     * (__STDC_VERSION__ == 199901L). C89/C90 and also C95 (199409L)
     * compilers may not understand it, so map it to a compiler-specific
     * spelling, or to nothing when no such spelling is known. */
    #if defined __GNUC__
        #define inline __inline__
    #elif defined _MSC_VER
        #define inline __inline
    #else
        #define inline
    #endif
#endif
48 
/* Unless the build explicitly selected an encoding, fall back to UTF-8. */
#if !defined(MD4C_USE_ASCII) && !defined(MD4C_USE_UTF8) && !defined(MD4C_USE_UTF16)
    #define MD4C_USE_UTF8
#endif

/* _T(x) produces a character/string literal of the configured width:
 * a wide literal (L-prefixed) in MD4C_USE_UTF16 builds, the literal as
 * written otherwise. Any definition of _T made earlier is dropped first. */
#ifdef _T
    #undef _T
#endif
#ifndef MD4C_USE_UTF16
    #define _T(x)           x
#else
    #define _T(x)           L##x
#endif
63 
/* Small general-purpose helpers. */

/* Number of elements in a real array (must not be used with a pointer). */
#define SIZEOF_ARRAY(a)     (sizeof(a) / sizeof((a)[0]))

/* STRINGIZE(x) turns the macro-expanded x into a string literal;
 * STRINGIZE_(x) stringizes x as written, without expanding it. */
#define STRINGIZE_(x)       #x
#define STRINGIZE(x)        STRINGIZE_(x)

/* Boolean constants, unless somebody else provided TRUE already. */
#ifndef TRUE
    #define TRUE            1
    #define FALSE           0
#endif
74 
75 
76 /************************
77  ***  Internal Types  ***
78  ************************/
79 
/* These types are used everywhere, so let's give them short aliases. */
#define OFF     MD_OFFSET
#define SZ      MD_SIZE
#define CHAR    MD_CHAR

/* Forward declarations of internal structure types. */
typedef struct MD_REF_DEF_tag MD_REF_DEF;
typedef struct MD_CONTAINER_tag MD_CONTAINER;
typedef struct MD_BLOCK_tag MD_BLOCK;
typedef struct MD_MARK_tag MD_MARK;
89 
90 
/* While analyzing inline marks we have to manage several "mark chains" of
 * (not yet resolved) openers. This structure records only where a chain
 * starts and where it ends; the links between the marks themselves are
 * realized through MD_MARK::prev and MD_MARK::next.
 */
typedef struct MD_MARKCHAIN_tag {
    int head;   /* Index of the chain's first mark; -1 when the chain is empty. */
    int tail;   /* Index of the chain's last mark; -1 when the chain is empty. */
} MD_MARKCHAIN;
100 
101 /* Context propagated through all the parsing. */
102 typedef struct MD_CTX_tag MD_CTX;
103 struct MD_CTX_tag {
104     /* Immutable stuff (parameters of md_parse()). */
105     const CHAR* text;
106     SZ size;
107     MD_PARSER parser;
108     void* userdata;
109 
110     /* When this is true, it allows some optimizations. */
111     int doc_ends_with_newline;
112 
113     /* Helper temporary growing buffer. */
114     CHAR* buffer;
115     unsigned alloc_buffer;
116 
117     /* Reference definitions. */
118     MD_REF_DEF* ref_defs;
119     int n_ref_defs;
120     int alloc_ref_defs;
121     void** ref_def_hashtable;
122     int ref_def_hashtable_size;
123 
124     /* Stack of inline/span markers.
125      * This is only used for parsing a single block contents but by storing it
126      * here we may reuse the stack for subsequent blocks; i.e. we have fewer
127      * (re)allocations. */
128     MD_MARK* marks;
129     int n_marks;
130     int alloc_marks;
131 
132 #if defined MD4C_USE_UTF16
133     char mark_char_map[128];
134 #else
135     char mark_char_map[256];
136 #endif
137 
138     /* For resolving of inline spans. */
139     MD_MARKCHAIN mark_chains[13];
140 #define PTR_CHAIN                               (ctx->mark_chains[0])
141 #define TABLECELLBOUNDARIES                     (ctx->mark_chains[1])
142 #define ASTERISK_OPENERS_extraword_mod3_0       (ctx->mark_chains[2])
143 #define ASTERISK_OPENERS_extraword_mod3_1       (ctx->mark_chains[3])
144 #define ASTERISK_OPENERS_extraword_mod3_2       (ctx->mark_chains[4])
145 #define ASTERISK_OPENERS_intraword_mod3_0       (ctx->mark_chains[5])
146 #define ASTERISK_OPENERS_intraword_mod3_1       (ctx->mark_chains[6])
147 #define ASTERISK_OPENERS_intraword_mod3_2       (ctx->mark_chains[7])
148 #define UNDERSCORE_OPENERS                      (ctx->mark_chains[8])
149 #define TILDE_OPENERS_1                         (ctx->mark_chains[9])
150 #define TILDE_OPENERS_2                         (ctx->mark_chains[10])
151 #define BRACKET_OPENERS                         (ctx->mark_chains[11])
152 #define DOLLAR_OPENERS                          (ctx->mark_chains[12])
153 #define OPENERS_CHAIN_FIRST                     2
154 #define OPENERS_CHAIN_LAST                      12
155 
156     int n_table_cell_boundaries;
157 
158     /* For resolving links. */
159     int unresolved_link_head;
160     int unresolved_link_tail;
161 
162     /* For resolving raw HTML. */
163     OFF html_comment_horizon;
164     OFF html_proc_instr_horizon;
165     OFF html_decl_horizon;
166     OFF html_cdata_horizon;
167 
168     /* For block analysis.
169      * Notes:
170      *   -- It holds MD_BLOCK as well as MD_LINE structures. After each
171      *      MD_BLOCK, its (multiple) MD_LINE(s) follow.
172      *   -- For MD_BLOCK_HTML and MD_BLOCK_CODE, MD_VERBATIMLINE(s) are used
173      *      instead of MD_LINE(s).
174      */
175     void* block_bytes;
176     MD_BLOCK* current_block;
177     int n_block_bytes;
178     int alloc_block_bytes;
179 
180     /* For container block analysis. */
181     MD_CONTAINER* containers;
182     int n_containers;
183     int alloc_containers;
184 
185     /* Minimal indentation to call the block "indented code block". */
186     unsigned code_indent_offset;
187 
188     /* Contextual info for line analysis. */
189     SZ code_fence_length;   /* For checking closing fence length. */
190     int html_block_type;    /* For checking closing raw HTML condition. */
191     int last_line_has_list_loosening_effect;
192     int last_list_item_starts_with_two_blank_lines;
193 };
194 
/* Classification of a line as determined by the line analysis. */
typedef enum MD_LINETYPE_tag {
    MD_LINE_BLANK,
    MD_LINE_HR,
    MD_LINE_ATXHEADER,
    MD_LINE_SETEXTHEADER,
    MD_LINE_SETEXTUNDERLINE,
    MD_LINE_INDENTEDCODE,
    MD_LINE_FENCEDCODE,
    MD_LINE_HTML,
    MD_LINE_TEXT,
    MD_LINE_TABLE,
    MD_LINE_TABLEUNDERLINE
} MD_LINETYPE;
209 
210 typedef struct MD_LINE_ANALYSIS_tag MD_LINE_ANALYSIS;
211 struct MD_LINE_ANALYSIS_tag {
212     MD_LINETYPE type    : 16;
213     unsigned data       : 16;
214     OFF beg;
215     OFF end;
216     unsigned indent;        /* Indentation level. */
217 };
218 
219 typedef struct MD_LINE_tag MD_LINE;
220 struct MD_LINE_tag {
221     OFF beg;
222     OFF end;
223 };
224 
225 typedef struct MD_VERBATIMLINE_tag MD_VERBATIMLINE;
226 struct MD_VERBATIMLINE_tag {
227     OFF beg;
228     OFF end;
229     OFF indent;
230 };
231 
232 
233 /*******************
234  ***  Debugging  ***
235  *******************/
236 
/* Pass a message to the debug_log callback of the parser, if there is one.
 * Expects a variable `ctx` to be in scope at the place of use. */
#define MD_LOG(msg)                                                     \
    do {                                                                \
        if(ctx->parser.debug_log)                                       \
            (ctx->parser.debug_log)((msg), ctx->userdata);              \
    } while(0)
242 
/* MD_ASSERT(cond) and MD_UNREACHABLE():
 *   -- DEBUG builds log the failed condition through MD_LOG() and exit(1).
 *   -- Other builds turn them into optimizer hints where the compiler
 *      offers a suitable intrinsic, or into no-ops otherwise. Mind that
 *      with the hints a false condition is not diagnosed at all.
 */
#if defined DEBUG
    #define MD_ASSERT(cond)                                             \
            do {                                                        \
                if(!(cond)) {                                           \
                    MD_LOG(__FILE__ ":" STRINGIZE(__LINE__) ": "        \
                           "Assertion '" STRINGIZE(cond) "' failed.");  \
                    exit(1);                                            \
                }                                                       \
            } while(0)

    #define MD_UNREACHABLE()        MD_ASSERT(1 == 0)
#elif defined __GNUC__
    #define MD_ASSERT(cond)     do { if(!(cond)) __builtin_unreachable(); } while(0)
    #define MD_UNREACHABLE()    do { __builtin_unreachable(); } while(0)
#elif defined _MSC_VER  &&  _MSC_VER > 120
    /* NOTE(review): _MSC_VER values usually have four digits, so the
     * "> 120" test looks like it is always true -- confirm the intent. */
    #define MD_ASSERT(cond)     do { __assume(cond); } while(0)
    #define MD_UNREACHABLE()    do { __assume(0); } while(0)
#else
    #define MD_ASSERT(cond)     do {} while(0)
    #define MD_UNREACHABLE()    do {} while(0)
#endif
266 
267 
268 /*****************
269  ***  Helpers  ***
270  *****************/
271 
/* Access to the document text by offset; `ctx` has to be in scope.
 * CH(off) is the character at the offset, STR(off) a pointer to it. */
#define CH(off)                 (*(ctx->text + (off)))
#define STR(off)                (&ctx->text[(off)])
275 
/* Character classification.
 *
 * The macros with the trailing underscore take a character; their
 * counterparts without it take an offset and classify CH(off).
 *
 * Note: code points below 128 are assumed to be ASCII compatible. */
#define ISIN_(ch, ch_min, ch_max)       ((unsigned)(ch) >= (ch_min) && (unsigned)(ch) <= (ch_max))
#define ISANYOF_(ch, palette)           ((ch) != _T('\0')  &&  md_strchr((palette), (ch)) != NULL)
#define ISANYOF2_(ch, ch1, ch2)         ((ch) == (ch1) || (ch) == (ch2))
#define ISANYOF3_(ch, ch1, ch2, ch3)    (ISANYOF2_((ch), (ch1), (ch2)) || (ch) == (ch3))
#define ISASCII_(ch)                    ((unsigned)(ch) < 128)
#define ISBLANK_(ch)                    (ISANYOF2_((ch), _T(' '), _T('\t')))
#define ISNEWLINE_(ch)                  (ISANYOF2_((ch), _T('\r'), _T('\n')))
#define ISWHITESPACE_(ch)               (ISBLANK_(ch) || (ch) == _T('\v') || (ch) == _T('\f'))
#define ISCNTRL_(ch)                    ((unsigned)(ch) < 32 || (unsigned)(ch) == 127)
#define ISPUNCT_(ch)                    (ISIN_(ch, 33, 47) || ISIN_(ch, 58, 64) || ISIN_(ch, 91, 96) || ISIN_(ch, 123, 126))
#define ISUPPER_(ch)                    (ISIN_(ch, _T('A'), _T('Z')))
#define ISLOWER_(ch)                    (ISIN_(ch, _T('a'), _T('z')))
#define ISALPHA_(ch)                    (ISUPPER_(ch) || ISLOWER_(ch))
#define ISDIGIT_(ch)                    (ISIN_(ch, _T('0'), _T('9')))
#define ISXDIGIT_(ch)                   (ISDIGIT_(ch) || ISIN_(ch, _T('A'), _T('F')) || ISIN_(ch, _T('a'), _T('f')))
#define ISALNUM_(ch)                    (ISALPHA_(ch) || ISDIGIT_(ch))

/* The same tests applied to the character at the given offset. */
#define ISANYOF(off, palette)           ISANYOF_(CH(off), (palette))
#define ISANYOF2(off, ch1, ch2)         ISANYOF2_(CH(off), (ch1), (ch2))
#define ISANYOF3(off, ch1, ch2, ch3)    ISANYOF3_(CH(off), (ch1), (ch2), (ch3))
#define ISASCII(off)                    ISASCII_(CH(off))
#define ISBLANK(off)                    ISBLANK_(CH(off))
#define ISNEWLINE(off)                  ISNEWLINE_(CH(off))
#define ISWHITESPACE(off)               ISWHITESPACE_(CH(off))
#define ISCNTRL(off)                    ISCNTRL_(CH(off))
#define ISPUNCT(off)                    ISPUNCT_(CH(off))
#define ISUPPER(off)                    ISUPPER_(CH(off))
#define ISLOWER(off)                    ISLOWER_(CH(off))
#define ISALPHA(off)                    ISALPHA_(CH(off))
#define ISDIGIT(off)                    ISDIGIT_(CH(off))
#define ISXDIGIT(off)                   ISXDIGIT_(CH(off))
#define ISALNUM(off)                    ISALNUM_(CH(off))
310 
311 
/* strchr() flavor matching the configured character type. */
#ifndef MD4C_USE_UTF16
    #define md_strchr strchr
#else
    #define md_strchr wcschr
#endif
317 
318 
319 /* Case insensitive check of string equality. */
320 static inline int
md_ascii_case_eq(const CHAR * s1,const CHAR * s2,SZ n)321 md_ascii_case_eq(const CHAR* s1, const CHAR* s2, SZ n)
322 {
323     OFF i;
324     for(i = 0; i < n; i++) {
325         CHAR ch1 = s1[i];
326         CHAR ch2 = s2[i];
327 
328         if(ISLOWER_(ch1))
329             ch1 += ('A'-'a');
330         if(ISLOWER_(ch2))
331             ch2 += ('A'-'a');
332         if(ch1 != ch2)
333             return FALSE;
334     }
335     return TRUE;
336 }
337 
338 static inline int
md_ascii_eq(const CHAR * s1,const CHAR * s2,SZ n)339 md_ascii_eq(const CHAR* s1, const CHAR* s2, SZ n)
340 {
341     return memcmp(s1, s2, n * sizeof(CHAR)) == 0;
342 }
343 
344 static int
md_text_with_null_replacement(MD_CTX * ctx,MD_TEXTTYPE type,const CHAR * str,SZ size)345 md_text_with_null_replacement(MD_CTX* ctx, MD_TEXTTYPE type, const CHAR* str, SZ size)
346 {
347     OFF off = 0;
348     int ret = 0;
349 
350     while(1) {
351         while(off < size  &&  str[off] != _T('\0'))
352             off++;
353 
354         if(off > 0) {
355             ret = ctx->parser.text(type, str, off, ctx->userdata);
356             if(ret != 0)
357                 return ret;
358 
359             str += off;
360             size -= off;
361             off = 0;
362         }
363 
364         if(off >= size)
365             return 0;
366 
367         ret = ctx->parser.text(MD_TEXT_NULLCHAR, _T(""), 1, ctx->userdata);
368         if(ret != 0)
369             return ret;
370         off++;
371     }
372 }
373 
374 
/* Evaluate func, store the result into `ret` and jump to the label `abort`
 * if it is negative. Both `ret` and the label have to be provided by the
 * function using the macro. */
#define MD_CHECK(func)                                                      \
    do {                                                                    \
        if((ret = (func)) < 0)                                              \
            goto abort;                                                     \
    } while(0)
381 
382 
/* Make sure the temporary buffer ctx->buffer has room for at least sz
 * units; ctx->alloc_buffer tracks the allocated size.
 *
 * When the buffer has to grow, it is reallocated to roughly 1.5 times sz,
 * rounded to a multiple of 128. If realloc() fails, the failure is logged,
 * `ret` is set to -1 and the control jumps to the label `abort`; the old
 * buffer stays valid in that case.
 *
 * `ctx`, `ret` and the label have to be provided by the function using the
 * macro. Note sz is evaluated more than once.
 */
#define MD_TEMP_BUFFER(sz)                                                  \
    do {                                                                    \
        if((sz) > ctx->alloc_buffer) {                                      \
            CHAR* new_buffer;                                               \
            SZ new_size = ((sz) + (sz) / 2 + 128) & ~127;                   \
                                                                            \
            new_buffer = realloc(ctx->buffer, new_size);                    \
            if(new_buffer == NULL) {                                        \
                MD_LOG("realloc() failed.");                                \
                ret = -1;                                                   \
                goto abort;                                                 \
            }                                                               \
                                                                            \
            ctx->buffer = new_buffer;                                       \
            ctx->alloc_buffer = new_size;                                   \
        }                                                                   \
    } while(0)
400 
401 
/* Call the enter_block() callback. If it returns non-zero, the value is
 * left in `ret`, the abort is logged and the control jumps to the label
 * `abort` (`ctx`, `ret` and the label come from the using function). */
#define MD_ENTER_BLOCK(type, arg)                                           \
    do {                                                                    \
        ret = ctx->parser.enter_block((type), (arg), ctx->userdata);        \
        if(ret == 0)                                                        \
            break;                                                          \
        MD_LOG("Aborted from enter_block() callback.");                     \
        goto abort;                                                         \
    } while(0)
410 
/* Call the leave_block() callback. If it returns non-zero, the value is
 * left in `ret`, the abort is logged and the control jumps to the label
 * `abort` (`ctx`, `ret` and the label come from the using function). */
#define MD_LEAVE_BLOCK(type, arg)                                           \
    do {                                                                    \
        ret = ctx->parser.leave_block((type), (arg), ctx->userdata);        \
        if(ret == 0)                                                        \
            break;                                                          \
        MD_LOG("Aborted from leave_block() callback.");                     \
        goto abort;                                                         \
    } while(0)
419 
/* Call the enter_span() callback. If it returns non-zero, the value is
 * left in `ret`, the abort is logged and the control jumps to the label
 * `abort` (`ctx`, `ret` and the label come from the using function). */
#define MD_ENTER_SPAN(type, arg)                                            \
    do {                                                                    \
        ret = ctx->parser.enter_span((type), (arg), ctx->userdata);         \
        if(ret == 0)                                                        \
            break;                                                          \
        MD_LOG("Aborted from enter_span() callback.");                      \
        goto abort;                                                         \
    } while(0)
428 
/* Call the leave_span() callback. If it returns non-zero, the value is
 * left in `ret`, the abort is logged and the control jumps to the label
 * `abort` (`ctx`, `ret` and the label come from the using function). */
#define MD_LEAVE_SPAN(type, arg)                                            \
    do {                                                                    \
        ret = ctx->parser.leave_span((type), (arg), ctx->userdata);         \
        if(ret == 0)                                                        \
            break;                                                          \
        MD_LOG("Aborted from leave_span() callback.");                      \
        goto abort;                                                         \
    } while(0)
437 
/* Pass a text of the given type to the text() callback; nothing is called
 * when size is zero. If the callback returns non-zero, the value is left
 * in `ret`, the abort is logged and the control jumps to the label `abort`
 * (`ctx`, `ret` and the label come from the using function).
 *
 * The size argument is parenthesized in the test so that any expression
 * (e.g. "a & b" or "c ? x : y") is compared as a whole. */
#define MD_TEXT(type, str, size)                                            \
    do {                                                                    \
        if((size) > 0) {                                                    \
            ret = ctx->parser.text((type), (str), (size), ctx->userdata);   \
            if(ret != 0) {                                                  \
                MD_LOG("Aborted from text() callback.");                    \
                goto abort;                                                 \
            }                                                               \
        }                                                                   \
    } while(0)
448 
/* Like MD_TEXT(), but the text goes through
 * md_text_with_null_replacement() instead of being passed to the text()
 * callback directly; nothing is called when size is zero. On a non-zero
 * result the value is left in `ret`, the abort is logged and the control
 * jumps to the label `abort`.
 *
 * All the arguments are parenthesized so that any expression can be
 * passed safely. */
#define MD_TEXT_INSECURE(type, str, size)                                   \
    do {                                                                    \
        if((size) > 0) {                                                    \
            ret = md_text_with_null_replacement(ctx, (type), (str), (size));\
            if(ret != 0) {                                                  \
                MD_LOG("Aborted from text() callback.");                    \
                goto abort;                                                 \
            }                                                               \
        }                                                                   \
    } while(0)
459 
460 
461 
462 /*************************
463  ***  Unicode Support  ***
464  *************************/
465 
/* Holds up to three code points together with their count. */
typedef struct MD_UNICODE_FOLD_INFO_tag {
    unsigned codepoints[3];
    int n_codepoints;
} MD_UNICODE_FOLD_INFO;
471 
472 
473 #if defined MD4C_USE_UTF16 || defined MD4C_USE_UTF8
474     /* Binary search over sorted "map" of codepoints. Consecutive sequences
475      * of codepoints may be encoded in the map by just using the
476      * (MIN_CODEPOINT | 0x40000000) and (MAX_CODEPOINT | 0x80000000).
477      *
478      * Returns index of the found record in the map (in the case of ranges,
479      * the minimal value is used); or -1 on failure. */
480     static int
md_unicode_bsearch__(unsigned codepoint,const unsigned * map,size_t map_size)481     md_unicode_bsearch__(unsigned codepoint, const unsigned* map, size_t map_size)
482     {
483         int beg, end;
484         int pivot_beg, pivot_end;
485 
486         beg = 0;
487         end = (int) map_size-1;
488         while(beg <= end) {
489             /* Pivot may be a range, not just a single value. */
490             pivot_beg = pivot_end = (beg + end) / 2;
491             if(map[pivot_end] & 0x40000000)
492                 pivot_end++;
493             if(map[pivot_beg] & 0x80000000)
494                 pivot_beg--;
495 
496             if(codepoint < (map[pivot_beg] & 0x00ffffff))
497                 end = pivot_beg - 1;
498             else if(codepoint > (map[pivot_end] & 0x00ffffff))
499                 beg = pivot_end + 1;
500             else
501                 return pivot_beg;
502         }
503 
504         return -1;
505     }
506 
507     static int
md_is_unicode_whitespace__(unsigned codepoint)508     md_is_unicode_whitespace__(unsigned codepoint)
509     {
510 #define R(cp_min, cp_max)   ((cp_min) | 0x40000000), ((cp_max) | 0x80000000)
511 #define S(cp)               (cp)
512         /* Unicode "Zs" category.
513          * (generated by scripts/build_whitespace_map.py) */
514         static const unsigned WHITESPACE_MAP[] = {
515             S(0x0020), S(0x00a0), S(0x1680), R(0x2000,0x200a), S(0x202f), S(0x205f), S(0x3000)
516         };
517 #undef R
518 #undef S
519 
520         /* The ASCII ones are the most frequently used ones, also CommonMark
521          * specification requests few more in this range. */
522         if(codepoint <= 0x7f)
523             return ISWHITESPACE_(codepoint);
524 
525         return (md_unicode_bsearch__(codepoint, WHITESPACE_MAP, SIZEOF_ARRAY(WHITESPACE_MAP)) >= 0);
526     }
527 
528     static int
md_is_unicode_punct__(unsigned codepoint)529     md_is_unicode_punct__(unsigned codepoint)
530     {
531 #define R(cp_min, cp_max)   ((cp_min) | 0x40000000), ((cp_max) | 0x80000000)
532 #define S(cp)               (cp)
533         /* Unicode "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps" categories.
534          * (generated by scripts/build_punct_map.py) */
535         static const unsigned PUNCT_MAP[] = {
536             R(0x0021,0x0023), R(0x0025,0x002a), R(0x002c,0x002f), R(0x003a,0x003b), R(0x003f,0x0040),
537             R(0x005b,0x005d), S(0x005f), S(0x007b), S(0x007d), S(0x00a1), S(0x00a7), S(0x00ab), R(0x00b6,0x00b7),
538             S(0x00bb), S(0x00bf), S(0x037e), S(0x0387), R(0x055a,0x055f), R(0x0589,0x058a), S(0x05be), S(0x05c0),
539             S(0x05c3), S(0x05c6), R(0x05f3,0x05f4), R(0x0609,0x060a), R(0x060c,0x060d), S(0x061b), R(0x061e,0x061f),
540             R(0x066a,0x066d), S(0x06d4), R(0x0700,0x070d), R(0x07f7,0x07f9), R(0x0830,0x083e), S(0x085e),
541             R(0x0964,0x0965), S(0x0970), S(0x09fd), S(0x0a76), S(0x0af0), S(0x0c77), S(0x0c84), S(0x0df4), S(0x0e4f),
542             R(0x0e5a,0x0e5b), R(0x0f04,0x0f12), S(0x0f14), R(0x0f3a,0x0f3d), S(0x0f85), R(0x0fd0,0x0fd4),
543             R(0x0fd9,0x0fda), R(0x104a,0x104f), S(0x10fb), R(0x1360,0x1368), S(0x1400), S(0x166e), R(0x169b,0x169c),
544             R(0x16eb,0x16ed), R(0x1735,0x1736), R(0x17d4,0x17d6), R(0x17d8,0x17da), R(0x1800,0x180a),
545             R(0x1944,0x1945), R(0x1a1e,0x1a1f), R(0x1aa0,0x1aa6), R(0x1aa8,0x1aad), R(0x1b5a,0x1b60),
546             R(0x1bfc,0x1bff), R(0x1c3b,0x1c3f), R(0x1c7e,0x1c7f), R(0x1cc0,0x1cc7), S(0x1cd3), R(0x2010,0x2027),
547             R(0x2030,0x2043), R(0x2045,0x2051), R(0x2053,0x205e), R(0x207d,0x207e), R(0x208d,0x208e),
548             R(0x2308,0x230b), R(0x2329,0x232a), R(0x2768,0x2775), R(0x27c5,0x27c6), R(0x27e6,0x27ef),
549             R(0x2983,0x2998), R(0x29d8,0x29db), R(0x29fc,0x29fd), R(0x2cf9,0x2cfc), R(0x2cfe,0x2cff), S(0x2d70),
550             R(0x2e00,0x2e2e), R(0x2e30,0x2e4f), S(0x2e52), R(0x3001,0x3003), R(0x3008,0x3011), R(0x3014,0x301f),
551             S(0x3030), S(0x303d), S(0x30a0), S(0x30fb), R(0xa4fe,0xa4ff), R(0xa60d,0xa60f), S(0xa673), S(0xa67e),
552             R(0xa6f2,0xa6f7), R(0xa874,0xa877), R(0xa8ce,0xa8cf), R(0xa8f8,0xa8fa), S(0xa8fc), R(0xa92e,0xa92f),
553             S(0xa95f), R(0xa9c1,0xa9cd), R(0xa9de,0xa9df), R(0xaa5c,0xaa5f), R(0xaade,0xaadf), R(0xaaf0,0xaaf1),
554             S(0xabeb), R(0xfd3e,0xfd3f), R(0xfe10,0xfe19), R(0xfe30,0xfe52), R(0xfe54,0xfe61), S(0xfe63), S(0xfe68),
555             R(0xfe6a,0xfe6b), R(0xff01,0xff03), R(0xff05,0xff0a), R(0xff0c,0xff0f), R(0xff1a,0xff1b),
556             R(0xff1f,0xff20), R(0xff3b,0xff3d), S(0xff3f), S(0xff5b), S(0xff5d), R(0xff5f,0xff65), R(0x10100,0x10102),
557             S(0x1039f), S(0x103d0), S(0x1056f), S(0x10857), S(0x1091f), S(0x1093f), R(0x10a50,0x10a58), S(0x10a7f),
558             R(0x10af0,0x10af6), R(0x10b39,0x10b3f), R(0x10b99,0x10b9c), S(0x10ead), R(0x10f55,0x10f59),
559             R(0x11047,0x1104d), R(0x110bb,0x110bc), R(0x110be,0x110c1), R(0x11140,0x11143), R(0x11174,0x11175),
560             R(0x111c5,0x111c8), S(0x111cd), S(0x111db), R(0x111dd,0x111df), R(0x11238,0x1123d), S(0x112a9),
561             R(0x1144b,0x1144f), R(0x1145a,0x1145b), S(0x1145d), S(0x114c6), R(0x115c1,0x115d7), R(0x11641,0x11643),
562             R(0x11660,0x1166c), R(0x1173c,0x1173e), S(0x1183b), R(0x11944,0x11946), S(0x119e2), R(0x11a3f,0x11a46),
563             R(0x11a9a,0x11a9c), R(0x11a9e,0x11aa2), R(0x11c41,0x11c45), R(0x11c70,0x11c71), R(0x11ef7,0x11ef8),
564             S(0x11fff), R(0x12470,0x12474), R(0x16a6e,0x16a6f), S(0x16af5), R(0x16b37,0x16b3b), S(0x16b44),
565             R(0x16e97,0x16e9a), S(0x16fe2), S(0x1bc9f), R(0x1da87,0x1da8b), R(0x1e95e,0x1e95f)
566         };
567 #undef R
568 #undef S
569 
570         /* The ASCII ones are the most frequently used ones, also CommonMark
571          * specification requests few more in this range. */
572         if(codepoint <= 0x7f)
573             return ISPUNCT_(codepoint);
574 
575         return (md_unicode_bsearch__(codepoint, PUNCT_MAP, SIZEOF_ARRAY(PUNCT_MAP)) >= 0);
576     }
577 
578     static void
md_get_unicode_fold_info(unsigned codepoint,MD_UNICODE_FOLD_INFO * info)579     md_get_unicode_fold_info(unsigned codepoint, MD_UNICODE_FOLD_INFO* info)
580     {
581 #define R(cp_min, cp_max)   ((cp_min) | 0x40000000), ((cp_max) | 0x80000000)
582 #define S(cp)               (cp)
583         /* Unicode "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps" categories.
584          * (generated by scripts/build_punct_map.py) */
585         static const unsigned FOLD_MAP_1[] = {
586             R(0x0041,0x005a), S(0x00b5), R(0x00c0,0x00d6), R(0x00d8,0x00de), R(0x0100,0x012e), R(0x0132,0x0136),
587             R(0x0139,0x0147), R(0x014a,0x0176), S(0x0178), R(0x0179,0x017d), S(0x017f), S(0x0181), S(0x0182),
588             S(0x0184), S(0x0186), S(0x0187), S(0x0189), S(0x018a), S(0x018b), S(0x018e), S(0x018f), S(0x0190),
589             S(0x0191), S(0x0193), S(0x0194), S(0x0196), S(0x0197), S(0x0198), S(0x019c), S(0x019d), S(0x019f),
590             R(0x01a0,0x01a4), S(0x01a6), S(0x01a7), S(0x01a9), S(0x01ac), S(0x01ae), S(0x01af), S(0x01b1), S(0x01b2),
591             S(0x01b3), S(0x01b5), S(0x01b7), S(0x01b8), S(0x01bc), S(0x01c4), S(0x01c5), S(0x01c7), S(0x01c8),
592             S(0x01ca), R(0x01cb,0x01db), R(0x01de,0x01ee), S(0x01f1), S(0x01f2), S(0x01f4), S(0x01f6), S(0x01f7),
593             R(0x01f8,0x021e), S(0x0220), R(0x0222,0x0232), S(0x023a), S(0x023b), S(0x023d), S(0x023e), S(0x0241),
594             S(0x0243), S(0x0244), S(0x0245), R(0x0246,0x024e), S(0x0345), S(0x0370), S(0x0372), S(0x0376), S(0x037f),
595             S(0x0386), R(0x0388,0x038a), S(0x038c), S(0x038e), S(0x038f), R(0x0391,0x03a1), R(0x03a3,0x03ab),
596             S(0x03c2), S(0x03cf), S(0x03d0), S(0x03d1), S(0x03d5), S(0x03d6), R(0x03d8,0x03ee), S(0x03f0), S(0x03f1),
597             S(0x03f4), S(0x03f5), S(0x03f7), S(0x03f9), S(0x03fa), R(0x03fd,0x03ff), R(0x0400,0x040f),
598             R(0x0410,0x042f), R(0x0460,0x0480), R(0x048a,0x04be), S(0x04c0), R(0x04c1,0x04cd), R(0x04d0,0x052e),
599             R(0x0531,0x0556), R(0x10a0,0x10c5), S(0x10c7), S(0x10cd), R(0x13f8,0x13fd), S(0x1c80), S(0x1c81),
600             S(0x1c82), S(0x1c83), S(0x1c84), S(0x1c85), S(0x1c86), S(0x1c87), S(0x1c88), R(0x1c90,0x1cba),
601             R(0x1cbd,0x1cbf), R(0x1e00,0x1e94), S(0x1e9b), R(0x1ea0,0x1efe), R(0x1f08,0x1f0f), R(0x1f18,0x1f1d),
602             R(0x1f28,0x1f2f), R(0x1f38,0x1f3f), R(0x1f48,0x1f4d), S(0x1f59), S(0x1f5b), S(0x1f5d), S(0x1f5f),
603             R(0x1f68,0x1f6f), S(0x1fb8), S(0x1fb9), S(0x1fba), S(0x1fbb), S(0x1fbe), R(0x1fc8,0x1fcb), S(0x1fd8),
604             S(0x1fd9), S(0x1fda), S(0x1fdb), S(0x1fe8), S(0x1fe9), S(0x1fea), S(0x1feb), S(0x1fec), S(0x1ff8),
605             S(0x1ff9), S(0x1ffa), S(0x1ffb), S(0x2126), S(0x212a), S(0x212b), S(0x2132), R(0x2160,0x216f), S(0x2183),
606             R(0x24b6,0x24cf), R(0x2c00,0x2c2e), S(0x2c60), S(0x2c62), S(0x2c63), S(0x2c64), R(0x2c67,0x2c6b),
607             S(0x2c6d), S(0x2c6e), S(0x2c6f), S(0x2c70), S(0x2c72), S(0x2c75), S(0x2c7e), S(0x2c7f), R(0x2c80,0x2ce2),
608             S(0x2ceb), S(0x2ced), S(0x2cf2), R(0xa640,0xa66c), R(0xa680,0xa69a), R(0xa722,0xa72e), R(0xa732,0xa76e),
609             S(0xa779), S(0xa77b), S(0xa77d), R(0xa77e,0xa786), S(0xa78b), S(0xa78d), S(0xa790), S(0xa792),
610             R(0xa796,0xa7a8), S(0xa7aa), S(0xa7ab), S(0xa7ac), S(0xa7ad), S(0xa7ae), S(0xa7b0), S(0xa7b1), S(0xa7b2),
611             S(0xa7b3), R(0xa7b4,0xa7be), S(0xa7c2), S(0xa7c4), S(0xa7c5), S(0xa7c6), S(0xa7c7), S(0xa7c9), S(0xa7f5),
612             R(0xab70,0xabbf), R(0xff21,0xff3a), R(0x10400,0x10427), R(0x104b0,0x104d3), R(0x10c80,0x10cb2),
613             R(0x118a0,0x118bf), R(0x16e40,0x16e5f), R(0x1e900,0x1e921)
614         };
615         static const unsigned FOLD_MAP_1_DATA[] = {
616             0x0061, 0x007a, 0x03bc, 0x00e0, 0x00f6, 0x00f8, 0x00fe, 0x0101, 0x012f, 0x0133, 0x0137, 0x013a, 0x0148,
617             0x014b, 0x0177, 0x00ff, 0x017a, 0x017e, 0x0073, 0x0253, 0x0183, 0x0185, 0x0254, 0x0188, 0x0256, 0x0257,
618             0x018c, 0x01dd, 0x0259, 0x025b, 0x0192, 0x0260, 0x0263, 0x0269, 0x0268, 0x0199, 0x026f, 0x0272, 0x0275,
619             0x01a1, 0x01a5, 0x0280, 0x01a8, 0x0283, 0x01ad, 0x0288, 0x01b0, 0x028a, 0x028b, 0x01b4, 0x01b6, 0x0292,
620             0x01b9, 0x01bd, 0x01c6, 0x01c6, 0x01c9, 0x01c9, 0x01cc, 0x01cc, 0x01dc, 0x01df, 0x01ef, 0x01f3, 0x01f3,
621             0x01f5, 0x0195, 0x01bf, 0x01f9, 0x021f, 0x019e, 0x0223, 0x0233, 0x2c65, 0x023c, 0x019a, 0x2c66, 0x0242,
622             0x0180, 0x0289, 0x028c, 0x0247, 0x024f, 0x03b9, 0x0371, 0x0373, 0x0377, 0x03f3, 0x03ac, 0x03ad, 0x03af,
623             0x03cc, 0x03cd, 0x03ce, 0x03b1, 0x03c1, 0x03c3, 0x03cb, 0x03c3, 0x03d7, 0x03b2, 0x03b8, 0x03c6, 0x03c0,
624             0x03d9, 0x03ef, 0x03ba, 0x03c1, 0x03b8, 0x03b5, 0x03f8, 0x03f2, 0x03fb, 0x037b, 0x037d, 0x0450, 0x045f,
625             0x0430, 0x044f, 0x0461, 0x0481, 0x048b, 0x04bf, 0x04cf, 0x04c2, 0x04ce, 0x04d1, 0x052f, 0x0561, 0x0586,
626             0x2d00, 0x2d25, 0x2d27, 0x2d2d, 0x13f0, 0x13f5, 0x0432, 0x0434, 0x043e, 0x0441, 0x0442, 0x0442, 0x044a,
627             0x0463, 0xa64b, 0x10d0, 0x10fa, 0x10fd, 0x10ff, 0x1e01, 0x1e95, 0x1e61, 0x1ea1, 0x1eff, 0x1f00, 0x1f07,
628             0x1f10, 0x1f15, 0x1f20, 0x1f27, 0x1f30, 0x1f37, 0x1f40, 0x1f45, 0x1f51, 0x1f53, 0x1f55, 0x1f57, 0x1f60,
629             0x1f67, 0x1fb0, 0x1fb1, 0x1f70, 0x1f71, 0x03b9, 0x1f72, 0x1f75, 0x1fd0, 0x1fd1, 0x1f76, 0x1f77, 0x1fe0,
630             0x1fe1, 0x1f7a, 0x1f7b, 0x1fe5, 0x1f78, 0x1f79, 0x1f7c, 0x1f7d, 0x03c9, 0x006b, 0x00e5, 0x214e, 0x2170,
631             0x217f, 0x2184, 0x24d0, 0x24e9, 0x2c30, 0x2c5e, 0x2c61, 0x026b, 0x1d7d, 0x027d, 0x2c68, 0x2c6c, 0x0251,
632             0x0271, 0x0250, 0x0252, 0x2c73, 0x2c76, 0x023f, 0x0240, 0x2c81, 0x2ce3, 0x2cec, 0x2cee, 0x2cf3, 0xa641,
633             0xa66d, 0xa681, 0xa69b, 0xa723, 0xa72f, 0xa733, 0xa76f, 0xa77a, 0xa77c, 0x1d79, 0xa77f, 0xa787, 0xa78c,
634             0x0265, 0xa791, 0xa793, 0xa797, 0xa7a9, 0x0266, 0x025c, 0x0261, 0x026c, 0x026a, 0x029e, 0x0287, 0x029d,
635             0xab53, 0xa7b5, 0xa7bf, 0xa7c3, 0xa794, 0x0282, 0x1d8e, 0xa7c8, 0xa7ca, 0xa7f6, 0x13a0, 0x13ef, 0xff41,
636             0xff5a, 0x10428, 0x1044f, 0x104d8, 0x104fb, 0x10cc0, 0x10cf2, 0x118c0, 0x118df, 0x16e60, 0x16e7f, 0x1e922,
637             0x1e943
638         };
639         static const unsigned FOLD_MAP_2[] = {
640             S(0x00df), S(0x0130), S(0x0149), S(0x01f0), S(0x0587), S(0x1e96), S(0x1e97), S(0x1e98), S(0x1e99),
641             S(0x1e9a), S(0x1e9e), S(0x1f50), R(0x1f80,0x1f87), R(0x1f88,0x1f8f), R(0x1f90,0x1f97), R(0x1f98,0x1f9f),
642             R(0x1fa0,0x1fa7), R(0x1fa8,0x1faf), S(0x1fb2), S(0x1fb3), S(0x1fb4), S(0x1fb6), S(0x1fbc), S(0x1fc2),
643             S(0x1fc3), S(0x1fc4), S(0x1fc6), S(0x1fcc), S(0x1fd6), S(0x1fe4), S(0x1fe6), S(0x1ff2), S(0x1ff3),
644             S(0x1ff4), S(0x1ff6), S(0x1ffc), S(0xfb00), S(0xfb01), S(0xfb02), S(0xfb05), S(0xfb06), S(0xfb13),
645             S(0xfb14), S(0xfb15), S(0xfb16), S(0xfb17)
646         };
647         static const unsigned FOLD_MAP_2_DATA[] = {
648             0x0073,0x0073, 0x0069,0x0307, 0x02bc,0x006e, 0x006a,0x030c, 0x0565,0x0582, 0x0068,0x0331, 0x0074,0x0308,
649             0x0077,0x030a, 0x0079,0x030a, 0x0061,0x02be, 0x0073,0x0073, 0x03c5,0x0313, 0x1f00,0x03b9, 0x1f07,0x03b9,
650             0x1f00,0x03b9, 0x1f07,0x03b9, 0x1f20,0x03b9, 0x1f27,0x03b9, 0x1f20,0x03b9, 0x1f27,0x03b9, 0x1f60,0x03b9,
651             0x1f67,0x03b9, 0x1f60,0x03b9, 0x1f67,0x03b9, 0x1f70,0x03b9, 0x03b1,0x03b9, 0x03ac,0x03b9, 0x03b1,0x0342,
652             0x03b1,0x03b9, 0x1f74,0x03b9, 0x03b7,0x03b9, 0x03ae,0x03b9, 0x03b7,0x0342, 0x03b7,0x03b9, 0x03b9,0x0342,
653             0x03c1,0x0313, 0x03c5,0x0342, 0x1f7c,0x03b9, 0x03c9,0x03b9, 0x03ce,0x03b9, 0x03c9,0x0342, 0x03c9,0x03b9,
654             0x0066,0x0066, 0x0066,0x0069, 0x0066,0x006c, 0x0073,0x0074, 0x0073,0x0074, 0x0574,0x0576, 0x0574,0x0565,
655             0x0574,0x056b, 0x057e,0x0576, 0x0574,0x056d
656         };
657         static const unsigned FOLD_MAP_3[] = {
658             S(0x0390), S(0x03b0), S(0x1f52), S(0x1f54), S(0x1f56), S(0x1fb7), S(0x1fc7), S(0x1fd2), S(0x1fd3),
659             S(0x1fd7), S(0x1fe2), S(0x1fe3), S(0x1fe7), S(0x1ff7), S(0xfb03), S(0xfb04)
660         };
661         static const unsigned FOLD_MAP_3_DATA[] = {
662             0x03b9,0x0308,0x0301, 0x03c5,0x0308,0x0301, 0x03c5,0x0313,0x0300, 0x03c5,0x0313,0x0301,
663             0x03c5,0x0313,0x0342, 0x03b1,0x0342,0x03b9, 0x03b7,0x0342,0x03b9, 0x03b9,0x0308,0x0300,
664             0x03b9,0x0308,0x0301, 0x03b9,0x0308,0x0342, 0x03c5,0x0308,0x0300, 0x03c5,0x0308,0x0301,
665             0x03c5,0x0308,0x0342, 0x03c9,0x0342,0x03b9, 0x0066,0x0066,0x0069, 0x0066,0x0066,0x006c
666         };
667 #undef R
668 #undef S
669         static const struct {
670             const unsigned* map;
671             const unsigned* data;
672             size_t map_size;
673             int n_codepoints;
674         } FOLD_MAP_LIST[] = {
675             { FOLD_MAP_1, FOLD_MAP_1_DATA, SIZEOF_ARRAY(FOLD_MAP_1), 1 },
676             { FOLD_MAP_2, FOLD_MAP_2_DATA, SIZEOF_ARRAY(FOLD_MAP_2), 2 },
677             { FOLD_MAP_3, FOLD_MAP_3_DATA, SIZEOF_ARRAY(FOLD_MAP_3), 3 }
678         };
679 
680         int i;
681 
682         /* Fast path for ASCII characters. */
683         if(codepoint <= 0x7f) {
684             info->codepoints[0] = codepoint;
685             if(ISUPPER_(codepoint))
686                 info->codepoints[0] += 'a' - 'A';
687             info->n_codepoints = 1;
688             return;
689         }
690 
691         /* Try to locate the codepoint in any of the maps. */
692         for(i = 0; i < (int) SIZEOF_ARRAY(FOLD_MAP_LIST); i++) {
693             int index;
694 
695             index = md_unicode_bsearch__(codepoint, FOLD_MAP_LIST[i].map, FOLD_MAP_LIST[i].map_size);
696             if(index >= 0) {
697                 /* Found the mapping. */
698                 int n_codepoints = FOLD_MAP_LIST[i].n_codepoints;
699                 const unsigned* map = FOLD_MAP_LIST[i].map;
700                 const unsigned* codepoints = FOLD_MAP_LIST[i].data + (index * n_codepoints);
701 
702                 memcpy(info->codepoints, codepoints, sizeof(unsigned) * n_codepoints);
703                 info->n_codepoints = n_codepoints;
704 
705                 if(FOLD_MAP_LIST[i].map[index] != codepoint) {
706                     /* The found mapping maps whole range of codepoints,
707                      * i.e. we have to offset info->codepoints[0] accordingly. */
708                     if((map[index] & 0x00ffffff)+1 == codepoints[0]) {
709                         /* Alternating type of the range. */
710                         info->codepoints[0] = codepoint + ((codepoint & 0x1) == (map[index] & 0x1) ? 1 : 0);
711                     } else {
712                         /* Range to range kind of mapping. */
713                         info->codepoints[0] += (codepoint - (map[index] & 0x00ffffff));
714                     }
715                 }
716 
717                 return;
718             }
719         }
720 
721         /* No mapping found. Map the codepoint to itself. */
722         info->codepoints[0] = codepoint;
723         info->n_codepoints = 1;
724     }
725 #endif
726 
727 
728 #if defined MD4C_USE_UTF16
    /* UTF-16 surrogate helpers. A code unit is a high surrogate when its top
     * six bits are 110110 (0xd800..0xdbff) and a low surrogate when they are
     * 110111 (0xdc00..0xdfff). */
    #define IS_UTF16_SURROGATE_HI(word)     (((WORD)(word) & 0xfc00) == 0xd800)
    #define IS_UTF16_SURROGATE_LO(word)     (((WORD)(word) & 0xfc00) == 0xdc00)
    /* Combine the low 10 bits of both surrogates (high one goes to the upper
     * half) and add 0x10000 to form the codepoint. */
    #define UTF16_DECODE_SURROGATE(hi, lo)  (0x10000 + ((((unsigned)(hi) & 0x3ff) << 10) | (((unsigned)(lo) & 0x3ff) << 0)))
732 
733     static unsigned
md_decode_utf16le__(const CHAR * str,SZ str_size,SZ * p_size)734     md_decode_utf16le__(const CHAR* str, SZ str_size, SZ* p_size)
735     {
736         if(IS_UTF16_SURROGATE_HI(str[0])) {
737             if(1 < str_size && IS_UTF16_SURROGATE_LO(str[1])) {
738                 if(p_size != NULL)
739                     *p_size = 2;
740                 return UTF16_DECODE_SURROGATE(str[0], str[1]);
741             }
742         }
743 
744         if(p_size != NULL)
745             *p_size = 1;
746         return str[0];
747     }
748 
749     static unsigned
md_decode_utf16le_before__(MD_CTX * ctx,OFF off)750     md_decode_utf16le_before__(MD_CTX* ctx, OFF off)
751     {
752         if(off > 2 && IS_UTF16_SURROGATE_HI(CH(off-2)) && IS_UTF16_SURROGATE_LO(CH(off-1)))
753             return UTF16_DECODE_SURROGATE(CH(off-2), CH(off-1));
754 
755         return CH(off);
756     }
757 
    /* No whitespace uses surrogates, so no decoding needed here: the single
     * code unit is handed to md_is_unicode_whitespace__() directly. */
    #define ISUNICODEWHITESPACE_(codepoint) md_is_unicode_whitespace__(codepoint)
    #define ISUNICODEWHITESPACE(off)        md_is_unicode_whitespace__(CH(off))
    #define ISUNICODEWHITESPACEBEFORE(off)  md_is_unicode_whitespace__(CH((off)-1))

    /* Punctuation tests decode the (possibly surrogate-paired) codepoint at,
     * or right before, the given offset. Both expand to code using 'ctx', so
     * a variable of that name has to exist in the calling scope. */
    #define ISUNICODEPUNCT(off)             md_is_unicode_punct__(md_decode_utf16le__(STR(off), ctx->size - (off), NULL))
    #define ISUNICODEPUNCTBEFORE(off)       md_is_unicode_punct__(md_decode_utf16le_before__(ctx, off))
765 
766     static inline int
md_decode_unicode(const CHAR * str,OFF off,SZ str_size,SZ * p_char_size)767     md_decode_unicode(const CHAR* str, OFF off, SZ str_size, SZ* p_char_size)
768     {
769         return md_decode_utf16le__(str+off, str_size-off, p_char_size);
770     }
771 #elif defined MD4C_USE_UTF8
    /* Classification of a single UTF-8 byte by its leading bits:
     *   LEAD1 ... 0xxxxxxx (byte value <= 0x7f)
     *   LEAD2 ... 110xxxxx
     *   LEAD3 ... 1110xxxx
     *   LEAD4 ... 11110xxx
     *   TAIL  ... 10xxxxxx
     * The argument is cast to unsigned char before testing. */
    #define IS_UTF8_LEAD1(byte)     ((unsigned char)(byte) <= 0x7f)
    #define IS_UTF8_LEAD2(byte)     (((unsigned char)(byte) & 0xe0) == 0xc0)
    #define IS_UTF8_LEAD3(byte)     (((unsigned char)(byte) & 0xf0) == 0xe0)
    #define IS_UTF8_LEAD4(byte)     (((unsigned char)(byte) & 0xf8) == 0xf0)
    #define IS_UTF8_TAIL(byte)      (((unsigned char)(byte) & 0xc0) == 0x80)
777 
778     static unsigned
md_decode_utf8__(const CHAR * str,SZ str_size,SZ * p_size)779     md_decode_utf8__(const CHAR* str, SZ str_size, SZ* p_size)
780     {
781         if(!IS_UTF8_LEAD1(str[0])) {
782             if(IS_UTF8_LEAD2(str[0])) {
783                 if(1 < str_size && IS_UTF8_TAIL(str[1])) {
784                     if(p_size != NULL)
785                         *p_size = 2;
786 
787                     return (((unsigned int)str[0] & 0x1f) << 6) |
788                            (((unsigned int)str[1] & 0x3f) << 0);
789                 }
790             } else if(IS_UTF8_LEAD3(str[0])) {
791                 if(2 < str_size && IS_UTF8_TAIL(str[1]) && IS_UTF8_TAIL(str[2])) {
792                     if(p_size != NULL)
793                         *p_size = 3;
794 
795                     return (((unsigned int)str[0] & 0x0f) << 12) |
796                            (((unsigned int)str[1] & 0x3f) << 6) |
797                            (((unsigned int)str[2] & 0x3f) << 0);
798                 }
799             } else if(IS_UTF8_LEAD4(str[0])) {
800                 if(3 < str_size && IS_UTF8_TAIL(str[1]) && IS_UTF8_TAIL(str[2]) && IS_UTF8_TAIL(str[3])) {
801                     if(p_size != NULL)
802                         *p_size = 4;
803 
804                     return (((unsigned int)str[0] & 0x07) << 18) |
805                            (((unsigned int)str[1] & 0x3f) << 12) |
806                            (((unsigned int)str[2] & 0x3f) << 6) |
807                            (((unsigned int)str[3] & 0x3f) << 0);
808                 }
809             }
810         }
811 
812         if(p_size != NULL)
813             *p_size = 1;
814         return (unsigned) str[0];
815     }
816 
817     static unsigned
md_decode_utf8_before__(MD_CTX * ctx,OFF off)818     md_decode_utf8_before__(MD_CTX* ctx, OFF off)
819     {
820         if(!IS_UTF8_LEAD1(CH(off-1))) {
821             if(off > 1 && IS_UTF8_LEAD2(CH(off-2)) && IS_UTF8_TAIL(CH(off-1)))
822                 return (((unsigned int)CH(off-2) & 0x1f) << 6) |
823                        (((unsigned int)CH(off-1) & 0x3f) << 0);
824 
825             if(off > 2 && IS_UTF8_LEAD3(CH(off-3)) && IS_UTF8_TAIL(CH(off-2)) && IS_UTF8_TAIL(CH(off-1)))
826                 return (((unsigned int)CH(off-3) & 0x0f) << 12) |
827                        (((unsigned int)CH(off-2) & 0x3f) << 6) |
828                        (((unsigned int)CH(off-1) & 0x3f) << 0);
829 
830             if(off > 3 && IS_UTF8_LEAD4(CH(off-4)) && IS_UTF8_TAIL(CH(off-3)) && IS_UTF8_TAIL(CH(off-2)) && IS_UTF8_TAIL(CH(off-1)))
831                 return (((unsigned int)CH(off-4) & 0x07) << 18) |
832                        (((unsigned int)CH(off-3) & 0x3f) << 12) |
833                        (((unsigned int)CH(off-2) & 0x3f) << 6) |
834                        (((unsigned int)CH(off-1) & 0x3f) << 0);
835         }
836 
837         return (unsigned) CH(off-1);
838     }
839 
    /* Unicode whitespace/punctuation tests for the UTF-8 build. The offset
     * based variants decode the codepoint at, or right before, the given
     * offset first. They expand to code using 'ctx', so a variable of that
     * name has to exist in the calling scope. */
    #define ISUNICODEWHITESPACE_(codepoint) md_is_unicode_whitespace__(codepoint)
    #define ISUNICODEWHITESPACE(off)        md_is_unicode_whitespace__(md_decode_utf8__(STR(off), ctx->size - (off), NULL))
    #define ISUNICODEWHITESPACEBEFORE(off)  md_is_unicode_whitespace__(md_decode_utf8_before__(ctx, off))

    #define ISUNICODEPUNCT(off)             md_is_unicode_punct__(md_decode_utf8__(STR(off), ctx->size - (off), NULL))
    #define ISUNICODEPUNCTBEFORE(off)       md_is_unicode_punct__(md_decode_utf8_before__(ctx, off))
846 
847     static inline unsigned
md_decode_unicode(const CHAR * str,OFF off,SZ str_size,SZ * p_char_size)848     md_decode_unicode(const CHAR* str, OFF off, SZ str_size, SZ* p_char_size)
849     {
850         return md_decode_utf8__(str+off, str_size-off, p_char_size);
851     }
852 #else
    /* Build with neither UTF-16 nor UTF-8 support: the "Unicode" tests are
     * plain aliases of the ASCII-only ones. The ...BEFORE variants test the
     * character at (off)-1. */
    #define ISUNICODEWHITESPACE_(codepoint) ISWHITESPACE_(codepoint)
    #define ISUNICODEWHITESPACE(off)        ISWHITESPACE(off)
    #define ISUNICODEWHITESPACEBEFORE(off)  ISWHITESPACE((off)-1)

    #define ISUNICODEPUNCT(off)             ISPUNCT(off)
    #define ISUNICODEPUNCTBEFORE(off)       ISPUNCT((off)-1)
859 
860     static inline void
md_get_unicode_fold_info(unsigned codepoint,MD_UNICODE_FOLD_INFO * info)861     md_get_unicode_fold_info(unsigned codepoint, MD_UNICODE_FOLD_INFO* info)
862     {
863         info->codepoints[0] = codepoint;
864         if(ISUPPER_(codepoint))
865             info->codepoints[0] += 'a' - 'A';
866         info->n_codepoints = 1;
867     }
868 
869     static inline unsigned
md_decode_unicode(const CHAR * str,OFF off,SZ str_size,SZ * p_size)870     md_decode_unicode(const CHAR* str, OFF off, SZ str_size, SZ* p_size)
871     {
872         *p_size = 1;
873         return (unsigned) str[off];
874     }
875 #endif
876 
877 
878 /*************************************
879  ***  Helper string manipulations  ***
880  *************************************/
881 
882 /* Fill buffer with copy of the string between 'beg' and 'end' but replace any
883  * line breaks with given replacement character.
884  *
885  * NOTE: Caller is responsible to make sure the buffer is large enough.
886  * (Given the output is always shorter then input, (end - beg) is good idea
887  * what the caller should allocate.)
888  */
889 static void
md_merge_lines(MD_CTX * ctx,OFF beg,OFF end,const MD_LINE * lines,int n_lines,CHAR line_break_replacement_char,CHAR * buffer,SZ * p_size)890 md_merge_lines(MD_CTX* ctx, OFF beg, OFF end, const MD_LINE* lines, int n_lines,
891                CHAR line_break_replacement_char, CHAR* buffer, SZ* p_size)
892 {
893     CHAR* ptr = buffer;
894     int line_index = 0;
895     OFF off = beg;
896 
897     while(1) {
898         const MD_LINE* line = &lines[line_index];
899         OFF line_end = line->end;
900         if(end < line_end)
901             line_end = end;
902 
903         while(off < line_end) {
904             *ptr = CH(off);
905             ptr++;
906             off++;
907         }
908 
909         if(off >= end) {
910             *p_size = ptr - buffer;
911             return;
912         }
913 
914         *ptr = line_break_replacement_char;
915         ptr++;
916 
917         line_index++;
918         off = lines[line_index].beg;
919     }
920 }
921 
922 /* Wrapper of md_merge_lines() which allocates new buffer for the output string.
923  */
924 static int
md_merge_lines_alloc(MD_CTX * ctx,OFF beg,OFF end,const MD_LINE * lines,int n_lines,CHAR line_break_replacement_char,CHAR ** p_str,SZ * p_size)925 md_merge_lines_alloc(MD_CTX* ctx, OFF beg, OFF end, const MD_LINE* lines, int n_lines,
926                     CHAR line_break_replacement_char, CHAR** p_str, SZ* p_size)
927 {
928     CHAR* buffer;
929 
930     buffer = (CHAR*) malloc(sizeof(CHAR) * (end - beg));
931     if(buffer == NULL) {
932         MD_LOG("malloc() failed.");
933         return -1;
934     }
935 
936     md_merge_lines(ctx, beg, end, lines, n_lines,
937                 line_break_replacement_char, buffer, p_size);
938 
939     *p_str = buffer;
940     return 0;
941 }
942 
943 static OFF
md_skip_unicode_whitespace(const CHAR * label,OFF off,SZ size)944 md_skip_unicode_whitespace(const CHAR* label, OFF off, SZ size)
945 {
946     SZ char_size;
947     unsigned codepoint;
948 
949     while(off < size) {
950         codepoint = md_decode_unicode(label, off, size, &char_size);
951         if(!ISUNICODEWHITESPACE_(codepoint)  &&  !ISNEWLINE_(label[off]))
952             break;
953         off += char_size;
954     }
955 
956     return off;
957 }
958 
959 
960 /******************************
961  ***  Recognizing raw HTML  ***
962  ******************************/
963 
/* md_is_html_tag() may be called when processing inlines (inline raw HTML)
 * or when breaking document to blocks (checking for start of HTML block type 7).
 *
 * When breaking document to blocks, we do not yet know line boundaries, but
 * in that case the whole tag has to live on a single line. We distinguish this
 * by n_lines == 0.
 *
 * The tag has to start at 'beg' (where '<' is expected) and its closing '>'
 * has to lie before 'max_end'. On success, TRUE is returned and *p_end is set
 * to the offset right after the closing '>'. Otherwise FALSE is returned and
 * *p_end is left untouched.
 */
static int
md_is_html_tag(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
{
    int attr_state;
    OFF off = beg;
    /* Without line info (n_lines == 0), scan up to the end of the document;
     * the inner loop stops on a new line character anyway. */
    OFF line_end = (n_lines > 0) ? lines[0].end : ctx->size;
    int i = 0;      /* Index of the line currently scanned. */

    MD_ASSERT(CH(beg) == _T('<'));

    /* At least one more character has to follow the '<' on this line. */
    if(off + 1 >= line_end)
        return FALSE;
    off++;

    /* For parsing attributes, we need a little state automaton below.
     * State -1: no attributes are allowed.
     * State 0: attribute could follow after some whitespace.
     * State 1: after a whitespace (attribute name may follow).
     * State 2: after attribute name ('=' MAY follow).
     * State 3: after '=' (value specification MUST follow).
     * State 41: in middle of unquoted attribute value.
     * State 42: in middle of single-quoted attribute value.
     * State 43: in middle of double-quoted attribute value.
     */
    attr_state = 0;

    if(CH(off) == _T('/')) {
        /* Closer tag "</ ... >". No attributes may be present. */
        attr_state = -1;
        off++;
    }

    /* Tag name */
    if(off >= line_end  ||  !ISALPHA(off))
        return FALSE;
    off++;
    while(off < line_end  &&  (ISALNUM(off)  ||  CH(off) == _T('-')))
        off++;

    /* (Optional) attributes (if not closer), (optional) '/' (if not closer)
     * and final '>'. */
    while(1) {
        /* Process the rest of the current line. */
        while(off < line_end  &&  !ISNEWLINE(off)) {
            if(attr_state > 40) {
                /* Inside an attribute value (states 41, 42, 43). Anything
                 * but the respective terminator is just skipped. */
                if(attr_state == 41 && (ISBLANK(off) || ISANYOF(off, _T("\"'=<>`")))) {
                    attr_state = 0;
                    off--;  /* Put the char back for re-inspection in the new state. */
                } else if(attr_state == 42 && CH(off) == _T('\'')) {
                    attr_state = 0;
                } else if(attr_state == 43 && CH(off) == _T('"')) {
                    attr_state = 0;
                }
                off++;
            } else if(ISWHITESPACE(off)) {
                /* Only the state 0 is changed by a whitespace; in any other
                 * state the whitespace is silently skipped. */
                if(attr_state == 0)
                    attr_state = 1;
                off++;
            } else if(attr_state <= 2 && CH(off) == _T('>')) {
                /* End. */
                goto done;
            } else if(attr_state <= 2 && CH(off) == _T('/') && off+1 < line_end && CH(off+1) == _T('>')) {
                /* End with digraph '/>' */
                off++;
                goto done;
            } else if((attr_state == 1 || attr_state == 2) && (ISALPHA(off) || CH(off) == _T('_') || CH(off) == _T(':'))) {
                off++;
                /* Attribute name */
                while(off < line_end && (ISALNUM(off) || ISANYOF(off, _T("_.:-"))))
                    off++;
                attr_state = 2;
            } else if(attr_state == 2 && CH(off) == _T('=')) {
                /* Attribute assignment sign */
                off++;
                attr_state = 3;
            } else if(attr_state == 3) {
                /* Expecting start of attribute value. */
                if(CH(off) == _T('"'))
                    attr_state = 43;
                else if(CH(off) == _T('\''))
                    attr_state = 42;
                else if(!ISANYOF(off, _T("\"'=<>`"))  &&  !ISNEWLINE(off))
                    attr_state = 41;
                else
                    return FALSE;
                off++;
            } else {
                /* Anything unexpected. */
                return FALSE;
            }
        }

        /* We have to be on a single line. See definition of start condition
         * of HTML block, type 7. */
        if(n_lines == 0)
            return FALSE;

        /* The tag is not complete yet: continue on the next line, if any. */
        i++;
        if(i >= n_lines)
            return FALSE;

        off = lines[i].beg;
        line_end = lines[i].end;

        /* The line break is handled as a whitespace: it moves the state 0 to
         * the state 1 and it also terminates an unquoted attribute value. */
        if(attr_state == 0  ||  attr_state == 41)
            attr_state = 1;

        if(off >= max_end)
            return FALSE;
    }

done:
    /* Here 'off' points to the closing '>'; it has to lie before max_end. */
    if(off >= max_end)
        return FALSE;

    *p_end = off+1;
    return TRUE;
}
1088 
1089 static int
md_scan_for_html_closer(MD_CTX * ctx,const MD_CHAR * str,MD_SIZE len,const MD_LINE * lines,int n_lines,OFF beg,OFF max_end,OFF * p_end,OFF * p_scan_horizon)1090 md_scan_for_html_closer(MD_CTX* ctx, const MD_CHAR* str, MD_SIZE len,
1091                         const MD_LINE* lines, int n_lines,
1092                         OFF beg, OFF max_end, OFF* p_end,
1093                         OFF* p_scan_horizon)
1094 {
1095     OFF off = beg;
1096     int i = 0;
1097 
1098     if(off < *p_scan_horizon  &&  *p_scan_horizon >= max_end - len) {
1099         /* We have already scanned the range up to the max_end so we know
1100          * there is nothing to see. */
1101         return FALSE;
1102     }
1103 
1104     while(TRUE) {
1105         while(off + len <= lines[i].end  &&  off + len <= max_end) {
1106             if(md_ascii_eq(STR(off), str, len)) {
1107                 /* Success. */
1108                 *p_end = off + len;
1109                 return TRUE;
1110             }
1111             off++;
1112         }
1113 
1114         i++;
1115         if(off >= max_end  ||  i >= n_lines) {
1116             /* Failure. */
1117             *p_scan_horizon = off;
1118             return FALSE;
1119         }
1120 
1121         off = lines[i].beg;
1122     }
1123 }
1124 
1125 static int
md_is_html_comment(MD_CTX * ctx,const MD_LINE * lines,int n_lines,OFF beg,OFF max_end,OFF * p_end)1126 md_is_html_comment(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
1127 {
1128     OFF off = beg;
1129 
1130     MD_ASSERT(CH(beg) == _T('<'));
1131 
1132     if(off + 4 >= lines[0].end)
1133         return FALSE;
1134     if(CH(off+1) != _T('!')  ||  CH(off+2) != _T('-')  ||  CH(off+3) != _T('-'))
1135         return FALSE;
1136     off += 4;
1137 
1138     /* ">" and "->" must not follow the opening. */
1139     if(off < lines[0].end  &&  CH(off) == _T('>'))
1140         return FALSE;
1141     if(off+1 < lines[0].end  &&  CH(off) == _T('-')  &&  CH(off+1) == _T('>'))
1142         return FALSE;
1143 
1144     /* HTML comment must not contain "--", so we scan just for "--" instead
1145      * of "-->" and verify manually that '>' follows. */
1146     if(md_scan_for_html_closer(ctx, _T("--"), 2,
1147                 lines, n_lines, off, max_end, p_end, &ctx->html_comment_horizon))
1148     {
1149         if(*p_end < max_end  &&  CH(*p_end) == _T('>')) {
1150             *p_end = *p_end + 1;
1151             return TRUE;
1152         }
1153     }
1154 
1155     return FALSE;
1156 }
1157 
1158 static int
md_is_html_processing_instruction(MD_CTX * ctx,const MD_LINE * lines,int n_lines,OFF beg,OFF max_end,OFF * p_end)1159 md_is_html_processing_instruction(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
1160 {
1161     OFF off = beg;
1162 
1163     if(off + 2 >= lines[0].end)
1164         return FALSE;
1165     if(CH(off+1) != _T('?'))
1166         return FALSE;
1167     off += 2;
1168 
1169     return md_scan_for_html_closer(ctx, _T("?>"), 2,
1170                 lines, n_lines, off, max_end, p_end, &ctx->html_proc_instr_horizon);
1171 }
1172 
1173 static int
md_is_html_declaration(MD_CTX * ctx,const MD_LINE * lines,int n_lines,OFF beg,OFF max_end,OFF * p_end)1174 md_is_html_declaration(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
1175 {
1176     OFF off = beg;
1177 
1178     if(off + 2 >= lines[0].end)
1179         return FALSE;
1180     if(CH(off+1) != _T('!'))
1181         return FALSE;
1182     off += 2;
1183 
1184     /* Declaration name. */
1185     if(off >= lines[0].end  ||  !ISALPHA(off))
1186         return FALSE;
1187     off++;
1188     while(off < lines[0].end  &&  ISALPHA(off))
1189         off++;
1190     if(off < lines[0].end  &&  !ISWHITESPACE(off))
1191         return FALSE;
1192 
1193     return md_scan_for_html_closer(ctx, _T(">"), 1,
1194                 lines, n_lines, off, max_end, p_end, &ctx->html_decl_horizon);
1195 }
1196 
1197 static int
md_is_html_cdata(MD_CTX * ctx,const MD_LINE * lines,int n_lines,OFF beg,OFF max_end,OFF * p_end)1198 md_is_html_cdata(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
1199 {
1200     static const CHAR open_str[] = _T("<![CDATA[");
1201     static const SZ open_size = SIZEOF_ARRAY(open_str) - 1;
1202 
1203     OFF off = beg;
1204 
1205     if(off + open_size >= lines[0].end)
1206         return FALSE;
1207     if(memcmp(STR(off), open_str, open_size) != 0)
1208         return FALSE;
1209     off += open_size;
1210 
1211     if(lines[n_lines-1].end < max_end)
1212         max_end = lines[n_lines-1].end - 2;
1213 
1214     return md_scan_for_html_closer(ctx, _T("]]>"), 3,
1215                 lines, n_lines, off, max_end, p_end, &ctx->html_cdata_horizon);
1216 }
1217 
1218 static int
md_is_html_any(MD_CTX * ctx,const MD_LINE * lines,int n_lines,OFF beg,OFF max_end,OFF * p_end)1219 md_is_html_any(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
1220 {
1221     MD_ASSERT(CH(beg) == _T('<'));
1222     return (md_is_html_tag(ctx, lines, n_lines, beg, max_end, p_end)  ||
1223             md_is_html_comment(ctx, lines, n_lines, beg, max_end, p_end)  ||
1224             md_is_html_processing_instruction(ctx, lines, n_lines, beg, max_end, p_end)  ||
1225             md_is_html_declaration(ctx, lines, n_lines, beg, max_end, p_end)  ||
1226             md_is_html_cdata(ctx, lines, n_lines, beg, max_end, p_end));
1227 }
1228 
1229 
1230 /****************************
1231  ***  Recognizing Entity  ***
1232  ****************************/
1233 
1234 static int
md_is_hex_entity_contents(MD_CTX * ctx,const CHAR * text,OFF beg,OFF max_end,OFF * p_end)1235 md_is_hex_entity_contents(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end)
1236 {
1237     OFF off = beg;
1238 
1239     while(off < max_end  &&  ISXDIGIT_(text[off])  &&  off - beg <= 8)
1240         off++;
1241 
1242     if(1 <= off - beg  &&  off - beg <= 6) {
1243         *p_end = off;
1244         return TRUE;
1245     } else {
1246         return FALSE;
1247     }
1248 }
1249 
1250 static int
md_is_dec_entity_contents(MD_CTX * ctx,const CHAR * text,OFF beg,OFF max_end,OFF * p_end)1251 md_is_dec_entity_contents(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end)
1252 {
1253     OFF off = beg;
1254 
1255     while(off < max_end  &&  ISDIGIT_(text[off])  &&  off - beg <= 8)
1256         off++;
1257 
1258     if(1 <= off - beg  &&  off - beg <= 7) {
1259         *p_end = off;
1260         return TRUE;
1261     } else {
1262         return FALSE;
1263     }
1264 }
1265 
1266 static int
md_is_named_entity_contents(MD_CTX * ctx,const CHAR * text,OFF beg,OFF max_end,OFF * p_end)1267 md_is_named_entity_contents(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end)
1268 {
1269     OFF off = beg;
1270 
1271     if(off < max_end  &&  ISALPHA_(text[off]))
1272         off++;
1273     else
1274         return FALSE;
1275 
1276     while(off < max_end  &&  ISALNUM_(text[off])  &&  off - beg <= 48)
1277         off++;
1278 
1279     if(2 <= off - beg  &&  off - beg <= 48) {
1280         *p_end = off;
1281         return TRUE;
1282     } else {
1283         return FALSE;
1284     }
1285 }
1286 
1287 static int
md_is_entity_str(MD_CTX * ctx,const CHAR * text,OFF beg,OFF max_end,OFF * p_end)1288 md_is_entity_str(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end)
1289 {
1290     int is_contents;
1291     OFF off = beg;
1292 
1293     MD_ASSERT(text[off] == _T('&'));
1294     off++;
1295 
1296     if(off+2 < max_end  &&  text[off] == _T('#')  &&  (text[off+1] == _T('x') || text[off+1] == _T('X')))
1297         is_contents = md_is_hex_entity_contents(ctx, text, off+2, max_end, &off);
1298     else if(off+1 < max_end  &&  text[off] == _T('#'))
1299         is_contents = md_is_dec_entity_contents(ctx, text, off+1, max_end, &off);
1300     else
1301         is_contents = md_is_named_entity_contents(ctx, text, off, max_end, &off);
1302 
1303     if(is_contents  &&  off < max_end  &&  text[off] == _T(';')) {
1304         *p_end = off+1;
1305         return TRUE;
1306     } else {
1307         return FALSE;
1308     }
1309 }
1310 
1311 static inline int
md_is_entity(MD_CTX * ctx,OFF beg,OFF max_end,OFF * p_end)1312 md_is_entity(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end)
1313 {
1314     return md_is_entity_str(ctx, ctx->text, beg, max_end, p_end);
1315 }
1316 
1317 
1318 /******************************
1319  ***  Attribute Management  ***
1320  ******************************/
1321 
/* Working data used by md_build_attribute() when an attribute is built from
 * a raw text. It is released with md_free_attribute(). */
typedef struct MD_ATTRIBUTE_BUILD_tag MD_ATTRIBUTE_BUILD;
struct MD_ATTRIBUTE_BUILD_tag {
    /* Text of the attribute. In the trivial case it points directly to the
     * raw text (or it is NULL if the raw text is empty) and it is not owned;
     * otherwise it is a buffer allocated with malloc(). */
    CHAR* text;
    /* Type of each substring. Points either to trivial_types below or to
     * an array on the heap grown by md_build_attr_append_substr(). */
    MD_TEXTTYPE* substr_types;
    /* Starting offset of each substring. Points either to trivial_offsets
     * below or to an array on the heap, which always has room for one more
     * (final) offset. */
    OFF* substr_offsets;
    /* Count of the substrings recorded so far. */
    int substr_count;
    /* Capacity of the arrays on the heap. Zero in the trivial case;
     * md_free_attribute() frees the buffers only if this is positive. */
    int substr_alloc;
    /* In-place storage for the trivial case (one substring of the type
     * MD_TEXT_NORMAL covering the whole raw text), so that no malloc() is
     * needed. */
    MD_TEXTTYPE trivial_types[1];
    OFF trivial_offsets[2];
};


/* Flag for the 'flags' parameter of md_build_attribute().
 * NOTE(review): presumably it turns off the handling of backslash escapes;
 * to be confirmed against the body of md_build_attribute(). */
#define MD_BUILD_ATTR_NO_ESCAPES    0x0001
1335 
1336 static int
md_build_attr_append_substr(MD_CTX * ctx,MD_ATTRIBUTE_BUILD * build,MD_TEXTTYPE type,OFF off)1337 md_build_attr_append_substr(MD_CTX* ctx, MD_ATTRIBUTE_BUILD* build,
1338                             MD_TEXTTYPE type, OFF off)
1339 {
1340     if(build->substr_count >= build->substr_alloc) {
1341         MD_TEXTTYPE* new_substr_types;
1342         OFF* new_substr_offsets;
1343 
1344         build->substr_alloc = (build->substr_alloc > 0
1345                 ? build->substr_alloc + build->substr_alloc / 2
1346                 : 8);
1347         new_substr_types = (MD_TEXTTYPE*) realloc(build->substr_types,
1348                                     build->substr_alloc * sizeof(MD_TEXTTYPE));
1349         if(new_substr_types == NULL) {
1350             MD_LOG("realloc() failed.");
1351             return -1;
1352         }
1353         /* Note +1 to reserve space for final offset (== raw_size). */
1354         new_substr_offsets = (OFF*) realloc(build->substr_offsets,
1355                                     (build->substr_alloc+1) * sizeof(OFF));
1356         if(new_substr_offsets == NULL) {
1357             MD_LOG("realloc() failed.");
1358             free(new_substr_types);
1359             return -1;
1360         }
1361 
1362         build->substr_types = new_substr_types;
1363         build->substr_offsets = new_substr_offsets;
1364     }
1365 
1366     build->substr_types[build->substr_count] = type;
1367     build->substr_offsets[build->substr_count] = off;
1368     build->substr_count++;
1369     return 0;
1370 }
1371 
1372 static void
md_free_attribute(MD_CTX * ctx,MD_ATTRIBUTE_BUILD * build)1373 md_free_attribute(MD_CTX* ctx, MD_ATTRIBUTE_BUILD* build)
1374 {
1375     if(build->substr_alloc > 0) {
1376         free(build->text);
1377         free(build->substr_types);
1378         free(build->substr_offsets);
1379     }
1380 }
1381 
1382 static int
md_build_attribute(MD_CTX * ctx,const CHAR * raw_text,SZ raw_size,unsigned flags,MD_ATTRIBUTE * attr,MD_ATTRIBUTE_BUILD * build)1383 md_build_attribute(MD_CTX* ctx, const CHAR* raw_text, SZ raw_size,
1384                    unsigned flags, MD_ATTRIBUTE* attr, MD_ATTRIBUTE_BUILD* build)
1385 {
1386     OFF raw_off, off;
1387     int is_trivial;
1388     int ret = 0;
1389 
1390     memset(build, 0, sizeof(MD_ATTRIBUTE_BUILD));
1391 
1392     /* If there is no backslash and no ampersand, build trivial attribute
1393      * without any malloc(). */
1394     is_trivial = TRUE;
1395     for(raw_off = 0; raw_off < raw_size; raw_off++) {
1396         if(ISANYOF3_(raw_text[raw_off], _T('\\'), _T('&'), _T('\0'))) {
1397             is_trivial = FALSE;
1398             break;
1399         }
1400     }
1401 
1402     if(is_trivial) {
1403         build->text = (CHAR*) (raw_size ? raw_text : NULL);
1404         build->substr_types = build->trivial_types;
1405         build->substr_offsets = build->trivial_offsets;
1406         build->substr_count = 1;
1407         build->substr_alloc = 0;
1408         build->trivial_types[0] = MD_TEXT_NORMAL;
1409         build->trivial_offsets[0] = 0;
1410         build->trivial_offsets[1] = raw_size;
1411         off = raw_size;
1412     } else {
1413         build->text = (CHAR*) malloc(raw_size * sizeof(CHAR));
1414         if(build->text == NULL) {
1415             MD_LOG("malloc() failed.");
1416             goto abort;
1417         }
1418 
1419         raw_off = 0;
1420         off = 0;
1421 
1422         while(raw_off < raw_size) {
1423             if(raw_text[raw_off] == _T('\0')) {
1424                 MD_CHECK(md_build_attr_append_substr(ctx, build, MD_TEXT_NULLCHAR, off));
1425                 memcpy(build->text + off, raw_text + raw_off, 1);
1426                 off++;
1427                 raw_off++;
1428                 continue;
1429             }
1430 
1431             if(raw_text[raw_off] == _T('&')) {
1432                 OFF ent_end;
1433 
1434                 if(md_is_entity_str(ctx, raw_text, raw_off, raw_size, &ent_end)) {
1435                     MD_CHECK(md_build_attr_append_substr(ctx, build, MD_TEXT_ENTITY, off));
1436                     memcpy(build->text + off, raw_text + raw_off, ent_end - raw_off);
1437                     off += ent_end - raw_off;
1438                     raw_off = ent_end;
1439                     continue;
1440                 }
1441             }
1442 
1443             if(build->substr_count == 0  ||  build->substr_types[build->substr_count-1] != MD_TEXT_NORMAL)
1444                 MD_CHECK(md_build_attr_append_substr(ctx, build, MD_TEXT_NORMAL, off));
1445 
1446             if(!(flags & MD_BUILD_ATTR_NO_ESCAPES)  &&
1447                raw_text[raw_off] == _T('\\')  &&  raw_off+1 < raw_size  &&
1448                (ISPUNCT_(raw_text[raw_off+1]) || ISNEWLINE_(raw_text[raw_off+1])))
1449                 raw_off++;
1450 
1451             build->text[off++] = raw_text[raw_off++];
1452         }
1453         build->substr_offsets[build->substr_count] = off;
1454     }
1455 
1456     attr->text = build->text;
1457     attr->size = off;
1458     attr->substr_offsets = build->substr_offsets;
1459     attr->substr_types = build->substr_types;
1460     return 0;
1461 
1462 abort:
1463     md_free_attribute(ctx, build);
1464     return -1;
1465 }
1466 
1467 
1468 /*********************************************
1469  ***  Dictionary of Reference Definitions  ***
1470  *********************************************/
1471 
#define MD_FNV1A_BASE       2166136261U
#define MD_FNV1A_PRIME      16777619U

/* FNV-1a hashing step: every input byte is XOR-ed into the running hash,
 * which is then multiplied by MD_FNV1A_PRIME.
 *
 * Pass MD_FNV1A_BASE as the base to start a new hash, or the result of
 * a previous call to continue hashing more data.
 */
static inline unsigned
md_fnv1a(unsigned base, const void* data, size_t n)
{
    const unsigned char* p = (const unsigned char*) data;
    unsigned h = base;

    while(n-- > 0) {
        h ^= *p++;
        h *= MD_FNV1A_PRIME;
    }

    return h;
}
1489 
1490 
/* One link reference definition, as recorded by
 * md_is_link_reference_definition() into ctx->ref_defs[]. */
struct MD_REF_DEF_tag {
    /* Label and title text. Each points either directly into the document
     * text or, when it spans multiple lines, into a buffer made by
     * md_merge_lines_alloc() (see the *_needs_free flags below). */
    CHAR* label;
    CHAR* title;
    /* md_link_label_hash() of the label; filled in by
     * md_build_ref_def_hashtable(). */
    unsigned hash;
    SZ label_size;
    SZ title_size;
    /* Offsets delimiting the contents of the link destination. */
    OFF dest_beg;
    OFF dest_end;
    /* Set when label (or title, respectively) is a heap buffer. */
    unsigned char label_needs_free : 1;
    unsigned char title_needs_free : 1;
};
1502 
1503 /* Label equivalence is quite complicated with regards to whitespace and case
1504  * folding. This complicates computing a hash of it as well as direct comparison
1505  * of two labels. */
1506 
1507 static unsigned
md_link_label_hash(const CHAR * label,SZ size)1508 md_link_label_hash(const CHAR* label, SZ size)
1509 {
1510     unsigned hash = MD_FNV1A_BASE;
1511     OFF off;
1512     unsigned codepoint;
1513     int is_whitespace = FALSE;
1514 
1515     off = md_skip_unicode_whitespace(label, 0, size);
1516     while(off < size) {
1517         SZ char_size;
1518 
1519         codepoint = md_decode_unicode(label, off, size, &char_size);
1520         is_whitespace = ISUNICODEWHITESPACE_(codepoint) || ISNEWLINE_(label[off]);
1521 
1522         if(is_whitespace) {
1523             codepoint = ' ';
1524             hash = md_fnv1a(hash, &codepoint, sizeof(unsigned));
1525             off = md_skip_unicode_whitespace(label, off, size);
1526         } else {
1527             MD_UNICODE_FOLD_INFO fold_info;
1528 
1529             md_get_unicode_fold_info(codepoint, &fold_info);
1530             hash = md_fnv1a(hash, fold_info.codepoints, fold_info.n_codepoints * sizeof(unsigned));
1531             off += char_size;
1532         }
1533     }
1534 
1535     return hash;
1536 }
1537 
1538 static OFF
md_link_label_cmp_load_fold_info(const CHAR * label,OFF off,SZ size,MD_UNICODE_FOLD_INFO * fold_info)1539 md_link_label_cmp_load_fold_info(const CHAR* label, OFF off, SZ size,
1540                                  MD_UNICODE_FOLD_INFO* fold_info)
1541 {
1542     unsigned codepoint;
1543     SZ char_size;
1544 
1545     if(off >= size) {
1546         /* Treat end of a link label as a whitespace. */
1547         goto whitespace;
1548     }
1549 
1550     if(ISNEWLINE_(label[off])) {
1551         /* Treat new lines as a whitespace. */
1552         off++;
1553         goto whitespace;
1554     }
1555 
1556     codepoint = md_decode_unicode(label, off, size, &char_size);
1557     off += char_size;
1558     if(ISUNICODEWHITESPACE_(codepoint)) {
1559         /* Treat all whitespace as equivalent */
1560         goto whitespace;
1561     }
1562 
1563     /* Get real folding info. */
1564     md_get_unicode_fold_info(codepoint, fold_info);
1565     return off;
1566 
1567 whitespace:
1568     fold_info->codepoints[0] = _T(' ');
1569     fold_info->n_codepoints = 1;
1570     return md_skip_unicode_whitespace(label, off, size);
1571 }
1572 
1573 static int
md_link_label_cmp(const CHAR * a_label,SZ a_size,const CHAR * b_label,SZ b_size)1574 md_link_label_cmp(const CHAR* a_label, SZ a_size, const CHAR* b_label, SZ b_size)
1575 {
1576     OFF a_off;
1577     OFF b_off;
1578     int a_reached_end = FALSE;
1579     int b_reached_end = FALSE;
1580     MD_UNICODE_FOLD_INFO a_fi = { { 0 }, 0 };
1581     MD_UNICODE_FOLD_INFO b_fi = { { 0 }, 0 };
1582     OFF a_fi_off = 0;
1583     OFF b_fi_off = 0;
1584     int cmp;
1585 
1586     a_off = md_skip_unicode_whitespace(a_label, 0, a_size);
1587     b_off = md_skip_unicode_whitespace(b_label, 0, b_size);
1588     while(!a_reached_end  ||  !b_reached_end) {
1589         /* If needed, load fold info for next char. */
1590         if(a_fi_off >= a_fi.n_codepoints) {
1591             a_fi_off = 0;
1592             a_off = md_link_label_cmp_load_fold_info(a_label, a_off, a_size, &a_fi);
1593             a_reached_end = (a_off >= a_size);
1594         }
1595         if(b_fi_off >= b_fi.n_codepoints) {
1596             b_fi_off = 0;
1597             b_off = md_link_label_cmp_load_fold_info(b_label, b_off, b_size, &b_fi);
1598             b_reached_end = (b_off >= b_size);
1599         }
1600 
1601         cmp = b_fi.codepoints[b_fi_off] - a_fi.codepoints[a_fi_off];
1602         if(cmp != 0)
1603             return cmp;
1604 
1605         a_fi_off++;
1606         b_fi_off++;
1607     }
1608 
1609     return 0;
1610 }
1611 
/* "Complex" bucket of the reference definition hash table: It holds pointers
 * to all the reference definitions which fall into the same bucket. */
typedef struct MD_REF_DEF_LIST_tag MD_REF_DEF_LIST;
struct MD_REF_DEF_LIST_tag {
    int n_ref_defs;         /* Count of used items in ref_defs[]. */
    int alloc_ref_defs;     /* Count of allocated items in ref_defs[]. */
    MD_REF_DEF* ref_defs[];  /* Valid items always point into ctx->ref_defs[] */
};
1618 
1619 static int
md_ref_def_cmp(const void * a,const void * b)1620 md_ref_def_cmp(const void* a, const void* b)
1621 {
1622     const MD_REF_DEF* a_ref = *(const MD_REF_DEF**)a;
1623     const MD_REF_DEF* b_ref = *(const MD_REF_DEF**)b;
1624 
1625     if(a_ref->hash < b_ref->hash)
1626         return -1;
1627     else if(a_ref->hash > b_ref->hash)
1628         return +1;
1629     else
1630         return md_link_label_cmp(a_ref->label, a_ref->label_size, b_ref->label, b_ref->label_size);
1631 }
1632 
1633 static int
md_ref_def_cmp_for_sort(const void * a,const void * b)1634 md_ref_def_cmp_for_sort(const void* a, const void* b)
1635 {
1636     int cmp;
1637 
1638     cmp = md_ref_def_cmp(a, b);
1639 
1640     /* Ensure stability of the sorting. */
1641     if(cmp == 0) {
1642         const MD_REF_DEF* a_ref = *(const MD_REF_DEF**)a;
1643         const MD_REF_DEF* b_ref = *(const MD_REF_DEF**)b;
1644 
1645         if(a_ref < b_ref)
1646             cmp = -1;
1647         else if(a_ref > b_ref)
1648             cmp = +1;
1649         else
1650             cmp = 0;
1651     }
1652 
1653     return cmp;
1654 }
1655 
1656 static int
md_build_ref_def_hashtable(MD_CTX * ctx)1657 md_build_ref_def_hashtable(MD_CTX* ctx)
1658 {
1659     int i, j;
1660 
1661     if(ctx->n_ref_defs == 0)
1662         return 0;
1663 
1664     ctx->ref_def_hashtable_size = (ctx->n_ref_defs * 5) / 4;
1665     ctx->ref_def_hashtable = malloc(ctx->ref_def_hashtable_size * sizeof(void*));
1666     if(ctx->ref_def_hashtable == NULL) {
1667         MD_LOG("malloc() failed.");
1668         goto abort;
1669     }
1670     memset(ctx->ref_def_hashtable, 0, ctx->ref_def_hashtable_size * sizeof(void*));
1671 
1672     /* Each member of ctx->ref_def_hashtable[] can be:
1673      *  -- NULL,
1674      *  -- pointer to the MD_REF_DEF in ctx->ref_defs[], or
1675      *  -- pointer to a MD_REF_DEF_LIST, which holds multiple pointers to
1676      *     such MD_REF_DEFs.
1677      */
1678     for(i = 0; i < ctx->n_ref_defs; i++) {
1679         MD_REF_DEF* def = &ctx->ref_defs[i];
1680         void* bucket;
1681         MD_REF_DEF_LIST* list;
1682 
1683         def->hash = md_link_label_hash(def->label, def->label_size);
1684         bucket = ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size];
1685 
1686         if(bucket == NULL) {
1687             /* The bucket is empty. Make it just point to the def. */
1688             ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size] = def;
1689             continue;
1690         }
1691 
1692         if(ctx->ref_defs <= (MD_REF_DEF*) bucket  &&  (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs) {
1693             /* The bucket already contains one ref. def. Lets see whether it
1694              * is the same label (ref. def. duplicate) or different one
1695              * (hash conflict). */
1696             MD_REF_DEF* old_def = (MD_REF_DEF*) bucket;
1697 
1698             if(md_link_label_cmp(def->label, def->label_size, old_def->label, old_def->label_size) == 0) {
1699                 /* Duplicate label: Ignore this ref. def. */
1700                 continue;
1701             }
1702 
1703             /* Make the bucket complex, i.e. able to hold more ref. defs. */
1704             list = (MD_REF_DEF_LIST*) malloc(sizeof(MD_REF_DEF_LIST) + 2 * sizeof(MD_REF_DEF*));
1705             if(list == NULL) {
1706                 MD_LOG("malloc() failed.");
1707                 goto abort;
1708             }
1709             list->ref_defs[0] = old_def;
1710             list->ref_defs[1] = def;
1711             list->n_ref_defs = 2;
1712             list->alloc_ref_defs = 2;
1713             ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size] = list;
1714             continue;
1715         }
1716 
1717         /* Append the def to the complex bucket list.
1718          *
1719          * Note in this case we ignore potential duplicates to avoid expensive
1720          * iterating over the complex bucket. Below, we revisit all the complex
1721          * buckets and handle it more cheaply after the complex bucket contents
1722          * is sorted. */
1723         list = (MD_REF_DEF_LIST*) bucket;
1724         if(list->n_ref_defs >= list->alloc_ref_defs) {
1725             int alloc_ref_defs = list->alloc_ref_defs + list->alloc_ref_defs / 2;
1726             MD_REF_DEF_LIST* list_tmp = (MD_REF_DEF_LIST*) realloc(list,
1727                         sizeof(MD_REF_DEF_LIST) + alloc_ref_defs * sizeof(MD_REF_DEF*));
1728             if(list_tmp == NULL) {
1729                 MD_LOG("realloc() failed.");
1730                 goto abort;
1731             }
1732             list = list_tmp;
1733             list->alloc_ref_defs = alloc_ref_defs;
1734             ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size] = list;
1735         }
1736 
1737         list->ref_defs[list->n_ref_defs] = def;
1738         list->n_ref_defs++;
1739     }
1740 
1741     /* Sort the complex buckets so we can use bsearch() with them. */
1742     for(i = 0; i < ctx->ref_def_hashtable_size; i++) {
1743         void* bucket = ctx->ref_def_hashtable[i];
1744         MD_REF_DEF_LIST* list;
1745 
1746         if(bucket == NULL)
1747             continue;
1748         if(ctx->ref_defs <= (MD_REF_DEF*) bucket  &&  (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs)
1749             continue;
1750 
1751         list = (MD_REF_DEF_LIST*) bucket;
1752         qsort(list->ref_defs, list->n_ref_defs, sizeof(MD_REF_DEF*), md_ref_def_cmp_for_sort);
1753 
1754         /* Disable all duplicates in the complex bucket by forcing all such
1755          * records to point to the 1st such ref. def. I.e. no matter which
1756          * record is found during the lookup, it will always point to the right
1757          * ref. def. in ctx->ref_defs[]. */
1758         for(j = 1; j < list->n_ref_defs; j++) {
1759             if(md_ref_def_cmp(&list->ref_defs[j-1], &list->ref_defs[j]) == 0)
1760                 list->ref_defs[j] = list->ref_defs[j-1];
1761         }
1762     }
1763 
1764     return 0;
1765 
1766 abort:
1767     return -1;
1768 }
1769 
1770 static void
md_free_ref_def_hashtable(MD_CTX * ctx)1771 md_free_ref_def_hashtable(MD_CTX* ctx)
1772 {
1773     if(ctx->ref_def_hashtable != NULL) {
1774         int i;
1775 
1776         for(i = 0; i < ctx->ref_def_hashtable_size; i++) {
1777             void* bucket = ctx->ref_def_hashtable[i];
1778             if(bucket == NULL)
1779                 continue;
1780             if(ctx->ref_defs <= (MD_REF_DEF*) bucket  &&  (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs)
1781                 continue;
1782             free(bucket);
1783         }
1784 
1785         free(ctx->ref_def_hashtable);
1786     }
1787 }
1788 
1789 static const MD_REF_DEF*
md_lookup_ref_def(MD_CTX * ctx,const CHAR * label,SZ label_size)1790 md_lookup_ref_def(MD_CTX* ctx, const CHAR* label, SZ label_size)
1791 {
1792     unsigned hash;
1793     void* bucket;
1794 
1795     if(ctx->ref_def_hashtable_size == 0)
1796         return NULL;
1797 
1798     hash = md_link_label_hash(label, label_size);
1799     bucket = ctx->ref_def_hashtable[hash % ctx->ref_def_hashtable_size];
1800 
1801     if(bucket == NULL) {
1802         return NULL;
1803     } else if(ctx->ref_defs <= (MD_REF_DEF*) bucket  &&  (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs) {
1804         const MD_REF_DEF* def = (MD_REF_DEF*) bucket;
1805 
1806         if(md_link_label_cmp(def->label, def->label_size, label, label_size) == 0)
1807             return def;
1808         else
1809             return NULL;
1810     } else {
1811         MD_REF_DEF_LIST* list = (MD_REF_DEF_LIST*) bucket;
1812         MD_REF_DEF key_buf;
1813         const MD_REF_DEF* key = &key_buf;
1814         const MD_REF_DEF** ret;
1815 
1816         key_buf.label = (CHAR*) label;
1817         key_buf.label_size = label_size;
1818         key_buf.hash = md_link_label_hash(key_buf.label, key_buf.label_size);
1819 
1820         ret = (const MD_REF_DEF**) bsearch(&key, list->ref_defs,
1821                     list->n_ref_defs, sizeof(MD_REF_DEF*), md_ref_def_cmp);
1822         if(ret != NULL)
1823             return *ret;
1824         else
1825             return NULL;
1826     }
1827 }
1828 
1829 
1830 /***************************
1831  ***  Recognizing Links  ***
1832  ***************************/
1833 
1834 /* Note this code is partially shared between processing inlines and blocks
1835  * as reference definitions and links share some helper parser functions.
1836  */
1837 
/* Destination and title of a link, as determined by md_is_link_reference()
 * or md_is_inline_link_spec(). */
typedef struct MD_LINK_ATTR_tag MD_LINK_ATTR;
struct MD_LINK_ATTR_tag {
    /* Offsets delimiting the contents of the link destination. */
    OFF dest_beg;
    OFF dest_end;

    /* Title text (may be NULL when title_size is zero). */
    CHAR* title;
    SZ title_size;
    /* FALSE when the title buffer is owned by someone else, e.g. by the
     * reference definition the attributes are taken from. */
    int title_needs_free;
};
1847 
1848 
1849 static int
md_is_link_label(MD_CTX * ctx,const MD_LINE * lines,int n_lines,OFF beg,OFF * p_end,int * p_beg_line_index,int * p_end_line_index,OFF * p_contents_beg,OFF * p_contents_end)1850 md_is_link_label(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg,
1851                  OFF* p_end, int* p_beg_line_index, int* p_end_line_index,
1852                  OFF* p_contents_beg, OFF* p_contents_end)
1853 {
1854     OFF off = beg;
1855     OFF contents_beg = 0;
1856     OFF contents_end = 0;
1857     int line_index = 0;
1858     int len = 0;
1859 
1860     if(CH(off) != _T('['))
1861         return FALSE;
1862     off++;
1863 
1864     while(1) {
1865         OFF line_end = lines[line_index].end;
1866 
1867         while(off < line_end) {
1868             if(CH(off) == _T('\\')  &&  off+1 < ctx->size  &&  (ISPUNCT(off+1) || ISNEWLINE(off+1))) {
1869                 if(contents_end == 0) {
1870                     contents_beg = off;
1871                     *p_beg_line_index = line_index;
1872                 }
1873                 contents_end = off + 2;
1874                 off += 2;
1875             } else if(CH(off) == _T('[')) {
1876                 return FALSE;
1877             } else if(CH(off) == _T(']')) {
1878                 if(contents_beg < contents_end) {
1879                     /* Success. */
1880                     *p_contents_beg = contents_beg;
1881                     *p_contents_end = contents_end;
1882                     *p_end = off+1;
1883                     *p_end_line_index = line_index;
1884                     return TRUE;
1885                 } else {
1886                     /* Link label must have some non-whitespace contents. */
1887                     return FALSE;
1888                 }
1889             } else {
1890                 unsigned codepoint;
1891                 SZ char_size;
1892 
1893                 codepoint = md_decode_unicode(ctx->text, off, ctx->size, &char_size);
1894                 if(!ISUNICODEWHITESPACE_(codepoint)) {
1895                     if(contents_end == 0) {
1896                         contents_beg = off;
1897                         *p_beg_line_index = line_index;
1898                     }
1899                     contents_end = off + char_size;
1900                 }
1901 
1902                 off += char_size;
1903             }
1904 
1905             len++;
1906             if(len > 999)
1907                 return FALSE;
1908         }
1909 
1910         line_index++;
1911         len++;
1912         if(line_index < n_lines)
1913             off = lines[line_index].beg;
1914         else
1915             break;
1916     }
1917 
1918     return FALSE;
1919 }
1920 
1921 static int
md_is_link_destination_A(MD_CTX * ctx,OFF beg,OFF max_end,OFF * p_end,OFF * p_contents_beg,OFF * p_contents_end)1922 md_is_link_destination_A(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end,
1923                          OFF* p_contents_beg, OFF* p_contents_end)
1924 {
1925     OFF off = beg;
1926 
1927     if(off >= max_end  ||  CH(off) != _T('<'))
1928         return FALSE;
1929     off++;
1930 
1931     while(off < max_end) {
1932         if(CH(off) == _T('\\')  &&  off+1 < max_end  &&  ISPUNCT(off+1)) {
1933             off += 2;
1934             continue;
1935         }
1936 
1937         if(ISNEWLINE(off)  ||  CH(off) == _T('<'))
1938             return FALSE;
1939 
1940         if(CH(off) == _T('>')) {
1941             /* Success. */
1942             *p_contents_beg = beg+1;
1943             *p_contents_end = off;
1944             *p_end = off+1;
1945             return TRUE;
1946         }
1947 
1948         off++;
1949     }
1950 
1951     return FALSE;
1952 }
1953 
1954 static int
md_is_link_destination_B(MD_CTX * ctx,OFF beg,OFF max_end,OFF * p_end,OFF * p_contents_beg,OFF * p_contents_end)1955 md_is_link_destination_B(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end,
1956                          OFF* p_contents_beg, OFF* p_contents_end)
1957 {
1958     OFF off = beg;
1959     int parenthesis_level = 0;
1960 
1961     while(off < max_end) {
1962         if(CH(off) == _T('\\')  &&  off+1 < max_end  &&  ISPUNCT(off+1)) {
1963             off += 2;
1964             continue;
1965         }
1966 
1967         if(ISWHITESPACE(off) || ISCNTRL(off))
1968             break;
1969 
1970         /* Link destination may include balanced pairs of unescaped '(' ')'.
1971          * Note we limit the maximal nesting level by 32 to protect us from
1972          * https://github.com/jgm/cmark/issues/214 */
1973         if(CH(off) == _T('(')) {
1974             parenthesis_level++;
1975             if(parenthesis_level > 32)
1976                 return FALSE;
1977         } else if(CH(off) == _T(')')) {
1978             if(parenthesis_level == 0)
1979                 break;
1980             parenthesis_level--;
1981         }
1982 
1983         off++;
1984     }
1985 
1986     if(parenthesis_level != 0  ||  off == beg)
1987         return FALSE;
1988 
1989     /* Success. */
1990     *p_contents_beg = beg;
1991     *p_contents_end = off;
1992     *p_end = off;
1993     return TRUE;
1994 }
1995 
1996 static inline int
md_is_link_destination(MD_CTX * ctx,OFF beg,OFF max_end,OFF * p_end,OFF * p_contents_beg,OFF * p_contents_end)1997 md_is_link_destination(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end,
1998                        OFF* p_contents_beg, OFF* p_contents_end)
1999 {
2000     if(CH(beg) == _T('<'))
2001         return md_is_link_destination_A(ctx, beg, max_end, p_end, p_contents_beg, p_contents_end);
2002     else
2003         return md_is_link_destination_B(ctx, beg, max_end, p_end, p_contents_beg, p_contents_end);
2004 }
2005 
2006 static int
md_is_link_title(MD_CTX * ctx,const MD_LINE * lines,int n_lines,OFF beg,OFF * p_end,int * p_beg_line_index,int * p_end_line_index,OFF * p_contents_beg,OFF * p_contents_end)2007 md_is_link_title(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg,
2008                  OFF* p_end, int* p_beg_line_index, int* p_end_line_index,
2009                  OFF* p_contents_beg, OFF* p_contents_end)
2010 {
2011     OFF off = beg;
2012     CHAR closer_char;
2013     int line_index = 0;
2014 
2015     /* White space with up to one line break. */
2016     while(off < lines[line_index].end  &&  ISWHITESPACE(off))
2017         off++;
2018     if(off >= lines[line_index].end) {
2019         line_index++;
2020         if(line_index >= n_lines)
2021             return FALSE;
2022         off = lines[line_index].beg;
2023     }
2024     if(off == beg)
2025         return FALSE;
2026 
2027     *p_beg_line_index = line_index;
2028 
2029     /* First char determines how to detect end of it. */
2030     switch(CH(off)) {
2031         case _T('"'):   closer_char = _T('"'); break;
2032         case _T('\''):  closer_char = _T('\''); break;
2033         case _T('('):   closer_char = _T(')'); break;
2034         default:        return FALSE;
2035     }
2036     off++;
2037 
2038     *p_contents_beg = off;
2039 
2040     while(line_index < n_lines) {
2041         OFF line_end = lines[line_index].end;
2042 
2043         while(off < line_end) {
2044             if(CH(off) == _T('\\')  &&  off+1 < ctx->size  &&  (ISPUNCT(off+1) || ISNEWLINE(off+1))) {
2045                 off++;
2046             } else if(CH(off) == closer_char) {
2047                 /* Success. */
2048                 *p_contents_end = off;
2049                 *p_end = off+1;
2050                 *p_end_line_index = line_index;
2051                 return TRUE;
2052             } else if(closer_char == _T(')')  &&  CH(off) == _T('(')) {
2053                 /* ()-style title cannot contain (unescaped '(')) */
2054                 return FALSE;
2055             }
2056 
2057             off++;
2058         }
2059 
2060         line_index++;
2061     }
2062 
2063     return FALSE;
2064 }
2065 
2066 /* Returns 0 if it is not a reference definition.
2067  *
2068  * Returns N > 0 if it is a reference definition. N then corresponds to the
2069  * number of lines forming it). In this case the definition is stored for
2070  * resolving any links referring to it.
2071  *
2072  * Returns -1 in case of an error (out of memory).
2073  */
2074 static int
md_is_link_reference_definition(MD_CTX * ctx,const MD_LINE * lines,int n_lines)2075 md_is_link_reference_definition(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
2076 {
2077     OFF label_contents_beg;
2078     OFF label_contents_end;
2079     int label_contents_line_index = -1;
2080     int label_is_multiline = FALSE;
2081     OFF dest_contents_beg;
2082     OFF dest_contents_end;
2083     OFF title_contents_beg;
2084     OFF title_contents_end;
2085     int title_contents_line_index;
2086     int title_is_multiline = FALSE;
2087     OFF off;
2088     int line_index = 0;
2089     int tmp_line_index;
2090     MD_REF_DEF* def = NULL;
2091     int ret = 0;
2092 
2093     /* Link label. */
2094     if(!md_is_link_label(ctx, lines, n_lines, lines[0].beg,
2095                 &off, &label_contents_line_index, &line_index,
2096                 &label_contents_beg, &label_contents_end))
2097         return FALSE;
2098     label_is_multiline = (label_contents_line_index != line_index);
2099 
2100     /* Colon. */
2101     if(off >= lines[line_index].end  ||  CH(off) != _T(':'))
2102         return FALSE;
2103     off++;
2104 
2105     /* Optional white space with up to one line break. */
2106     while(off < lines[line_index].end  &&  ISWHITESPACE(off))
2107         off++;
2108     if(off >= lines[line_index].end) {
2109         line_index++;
2110         if(line_index >= n_lines)
2111             return FALSE;
2112         off = lines[line_index].beg;
2113     }
2114 
2115     /* Link destination. */
2116     if(!md_is_link_destination(ctx, off, lines[line_index].end,
2117                 &off, &dest_contents_beg, &dest_contents_end))
2118         return FALSE;
2119 
2120     /* (Optional) title. Note we interpret it as an title only if nothing
2121      * more follows on its last line. */
2122     if(md_is_link_title(ctx, lines + line_index, n_lines - line_index, off,
2123                 &off, &title_contents_line_index, &tmp_line_index,
2124                 &title_contents_beg, &title_contents_end)
2125         &&  off >= lines[line_index + tmp_line_index].end)
2126     {
2127         title_is_multiline = (tmp_line_index != title_contents_line_index);
2128         title_contents_line_index += line_index;
2129         line_index += tmp_line_index;
2130     } else {
2131         /* Not a title. */
2132         title_is_multiline = FALSE;
2133         title_contents_beg = off;
2134         title_contents_end = off;
2135         title_contents_line_index = 0;
2136     }
2137 
2138     /* Nothing more can follow on the last line. */
2139     if(off < lines[line_index].end)
2140         return FALSE;
2141 
2142     /* So, it _is_ a reference definition. Remember it. */
2143     if(ctx->n_ref_defs >= ctx->alloc_ref_defs) {
2144         MD_REF_DEF* new_defs;
2145 
2146         ctx->alloc_ref_defs = (ctx->alloc_ref_defs > 0
2147                 ? ctx->alloc_ref_defs + ctx->alloc_ref_defs / 2
2148                 : 16);
2149         new_defs = (MD_REF_DEF*) realloc(ctx->ref_defs, ctx->alloc_ref_defs * sizeof(MD_REF_DEF));
2150         if(new_defs == NULL) {
2151             MD_LOG("realloc() failed.");
2152             goto abort;
2153         }
2154 
2155         ctx->ref_defs = new_defs;
2156     }
2157     def = &ctx->ref_defs[ctx->n_ref_defs];
2158     memset(def, 0, sizeof(MD_REF_DEF));
2159 
2160     if(label_is_multiline) {
2161         MD_CHECK(md_merge_lines_alloc(ctx, label_contents_beg, label_contents_end,
2162                     lines + label_contents_line_index, n_lines - label_contents_line_index,
2163                     _T(' '), &def->label, &def->label_size));
2164         def->label_needs_free = TRUE;
2165     } else {
2166         def->label = (CHAR*) STR(label_contents_beg);
2167         def->label_size = label_contents_end - label_contents_beg;
2168     }
2169 
2170     if(title_is_multiline) {
2171         MD_CHECK(md_merge_lines_alloc(ctx, title_contents_beg, title_contents_end,
2172                     lines + title_contents_line_index, n_lines - title_contents_line_index,
2173                     _T('\n'), &def->title, &def->title_size));
2174         def->title_needs_free = TRUE;
2175     } else {
2176         def->title = (CHAR*) STR(title_contents_beg);
2177         def->title_size = title_contents_end - title_contents_beg;
2178     }
2179 
2180     def->dest_beg = dest_contents_beg;
2181     def->dest_end = dest_contents_end;
2182 
2183     /* Success. */
2184     ctx->n_ref_defs++;
2185     return line_index + 1;
2186 
2187 abort:
2188     /* Failure. */
2189     if(def != NULL  &&  def->label_needs_free)
2190         free(def->label);
2191     if(def != NULL  &&  def->title_needs_free)
2192         free(def->title);
2193     return ret;
2194 }
2195 
2196 static int
md_is_link_reference(MD_CTX * ctx,const MD_LINE * lines,int n_lines,OFF beg,OFF end,MD_LINK_ATTR * attr)2197 md_is_link_reference(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
2198                      OFF beg, OFF end, MD_LINK_ATTR* attr)
2199 {
2200     const MD_REF_DEF* def;
2201     const MD_LINE* beg_line;
2202     const MD_LINE* end_line;
2203     CHAR* label;
2204     SZ label_size;
2205     int ret;
2206 
2207     MD_ASSERT(CH(beg) == _T('[') || CH(beg) == _T('!'));
2208     MD_ASSERT(CH(end-1) == _T(']'));
2209 
2210     beg += (CH(beg) == _T('!') ? 2 : 1);
2211     end--;
2212 
2213     /* Find lines corresponding to the beg and end positions. */
2214     MD_ASSERT(lines[0].beg <= beg);
2215     beg_line = lines;
2216     while(beg >= beg_line->end)
2217         beg_line++;
2218 
2219     MD_ASSERT(end <= lines[n_lines-1].end);
2220     end_line = beg_line;
2221     while(end >= end_line->end)
2222         end_line++;
2223 
2224     if(beg_line != end_line) {
2225         MD_CHECK(md_merge_lines_alloc(ctx, beg, end, beg_line,
2226                  n_lines - (beg_line - lines), _T(' '), &label, &label_size));
2227     } else {
2228         label = (CHAR*) STR(beg);
2229         label_size = end - beg;
2230     }
2231 
2232     def = md_lookup_ref_def(ctx, label, label_size);
2233     if(def != NULL) {
2234         attr->dest_beg = def->dest_beg;
2235         attr->dest_end = def->dest_end;
2236         attr->title = def->title;
2237         attr->title_size = def->title_size;
2238         attr->title_needs_free = FALSE;
2239     }
2240 
2241     if(beg_line != end_line)
2242         free(label);
2243 
2244     ret = (def != NULL);
2245 
2246 abort:
2247     return ret;
2248 }
2249 
2250 static int
md_is_inline_link_spec(MD_CTX * ctx,const MD_LINE * lines,int n_lines,OFF beg,OFF * p_end,MD_LINK_ATTR * attr)2251 md_is_inline_link_spec(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
2252                        OFF beg, OFF* p_end, MD_LINK_ATTR* attr)
2253 {
2254     int line_index = 0;
2255     int tmp_line_index;
2256     OFF title_contents_beg;
2257     OFF title_contents_end;
2258     int title_contents_line_index;
2259     int title_is_multiline;
2260     OFF off = beg;
2261     int ret = FALSE;
2262 
2263     while(off >= lines[line_index].end)
2264         line_index++;
2265 
2266     MD_ASSERT(CH(off) == _T('('));
2267     off++;
2268 
2269     /* Optional white space with up to one line break. */
2270     while(off < lines[line_index].end  &&  ISWHITESPACE(off))
2271         off++;
2272     if(off >= lines[line_index].end  &&  ISNEWLINE(off)) {
2273         line_index++;
2274         if(line_index >= n_lines)
2275             return FALSE;
2276         off = lines[line_index].beg;
2277     }
2278 
2279     /* Link destination may be omitted, but only when not also having a title. */
2280     if(off < ctx->size  &&  CH(off) == _T(')')) {
2281         attr->dest_beg = off;
2282         attr->dest_end = off;
2283         attr->title = NULL;
2284         attr->title_size = 0;
2285         attr->title_needs_free = FALSE;
2286         off++;
2287         *p_end = off;
2288         return TRUE;
2289     }
2290 
2291     /* Link destination. */
2292     if(!md_is_link_destination(ctx, off, lines[line_index].end,
2293                         &off, &attr->dest_beg, &attr->dest_end))
2294         return FALSE;
2295 
2296     /* (Optional) title. */
2297     if(md_is_link_title(ctx, lines + line_index, n_lines - line_index, off,
2298                 &off, &title_contents_line_index, &tmp_line_index,
2299                 &title_contents_beg, &title_contents_end))
2300     {
2301         title_is_multiline = (tmp_line_index != title_contents_line_index);
2302         title_contents_line_index += line_index;
2303         line_index += tmp_line_index;
2304     } else {
2305         /* Not a title. */
2306         title_is_multiline = FALSE;
2307         title_contents_beg = off;
2308         title_contents_end = off;
2309         title_contents_line_index = 0;
2310     }
2311 
2312     /* Optional whitespace followed with final ')'. */
2313     while(off < lines[line_index].end  &&  ISWHITESPACE(off))
2314         off++;
2315     if(off >= lines[line_index].end  &&  ISNEWLINE(off)) {
2316         line_index++;
2317         if(line_index >= n_lines)
2318             return FALSE;
2319         off = lines[line_index].beg;
2320     }
2321     if(CH(off) != _T(')'))
2322         goto abort;
2323     off++;
2324 
2325     if(title_contents_beg >= title_contents_end) {
2326         attr->title = NULL;
2327         attr->title_size = 0;
2328         attr->title_needs_free = FALSE;
2329     } else if(!title_is_multiline) {
2330         attr->title = (CHAR*) STR(title_contents_beg);
2331         attr->title_size = title_contents_end - title_contents_beg;
2332         attr->title_needs_free = FALSE;
2333     } else {
2334         MD_CHECK(md_merge_lines_alloc(ctx, title_contents_beg, title_contents_end,
2335                     lines + title_contents_line_index, n_lines - title_contents_line_index,
2336                     _T('\n'), &attr->title, &attr->title_size));
2337         attr->title_needs_free = TRUE;
2338     }
2339 
2340     *p_end = off;
2341     ret = TRUE;
2342 
2343 abort:
2344     return ret;
2345 }
2346 
2347 static void
md_free_ref_defs(MD_CTX * ctx)2348 md_free_ref_defs(MD_CTX* ctx)
2349 {
2350     int i;
2351 
2352     for(i = 0; i < ctx->n_ref_defs; i++) {
2353         MD_REF_DEF* def = &ctx->ref_defs[i];
2354 
2355         if(def->label_needs_free)
2356             free(def->label);
2357         if(def->title_needs_free)
2358             free(def->title);
2359     }
2360 
2361     free(ctx->ref_defs);
2362 }
2363 
2364 
2365 /******************************************
2366  ***  Processing Inlines (a.k.a Spans)  ***
2367  ******************************************/
2368 
2369 /* We process inlines in few phases:
2370  *
2371  * (1) We go through the block text and collect all significant characters
2372  *     which may start/end a span or some other significant position into
2373  *     ctx->marks[]. Core of this is what md_collect_marks() does.
2374  *
2375  *     We also do some very brief preliminary context-less analysis, whether
2376  *     it might be opener or closer (e.g. of an emphasis span).
2377  *
2378  *     This speeds the other steps as we do not need to re-iterate over all
2379  *     characters anymore.
2380  *
 * (2) We analyze each potential mark type, in order of precedence.
2382  *
2383  *     In each md_analyze_XXX() function, we re-iterate list of the marks,
2384  *     skipping already resolved regions (in preceding precedences) and try to
2385  *     resolve them.
2386  *
2387  * (2.1) For trivial marks, which are single (e.g. HTML entity), we just mark
2388  *       them as resolved.
2389  *
2390  * (2.2) For range-type marks, we analyze whether the mark could be closer
2391  *       and, if yes, whether there is some preceding opener it could satisfy.
2392  *
2393  *       If not we check whether it could be really an opener and if yes, we
2394  *       remember it so subsequent closers may resolve it.
2395  *
2396  * (3) Finally, when all marks were analyzed, we render the block contents
2397  *     by calling MD_RENDERER::text() callback, interrupting by ::enter_span()
2398  *     or ::close_span() whenever we reach a resolved mark.
2399  */
2400 
2401 
2402 /* The mark structure.
2403  *
2404  * '\\': Maybe escape sequence.
2405  * '\0': NULL char.
2406  *  '*': Maybe (strong) emphasis start/end.
2407  *  '_': Maybe (strong) emphasis start/end.
2408  *  '~': Maybe strikethrough start/end (needs MD_FLAG_STRIKETHROUGH).
2409  *  '`': Maybe code span start/end.
2410  *  '&': Maybe start of entity.
2411  *  ';': Maybe end of entity.
2412  *  '<': Maybe start of raw HTML or autolink.
2413  *  '>': Maybe end of raw HTML or autolink.
2414  *  '[': Maybe start of link label or link text.
2415  *  '!': Equivalent of '[' for image.
2416  *  ']': Maybe end of link label or link text.
2417  *  '@': Maybe permissive e-mail auto-link (needs MD_FLAG_PERMISSIVEEMAILAUTOLINKS).
2418  *  ':': Maybe permissive URL auto-link (needs MD_FLAG_PERMISSIVEURLAUTOLINKS).
2419  *  '.': Maybe permissive WWW auto-link (needs MD_FLAG_PERMISSIVEWWWAUTOLINKS).
2420  *  'D': Dummy mark, it reserves a space for splitting a previous mark
2421  *       (e.g. emphasis) or to make more space for storing some special data
2422  *       related to the preceding mark (e.g. link).
2423  *
2424  * Note that not all instances of these chars in the text imply creation of the
2425  * structure. Only those which have (or may have, after we see more context)
2426  * the special meaning.
2427  *
2428  * (Keep this struct as small as possible to fit as much of them into CPU
2429  * cache line.)
2430  */
struct MD_MARK_tag {
    /* Offsets of the beginning and the end of the mark.
     *
     * Keep these two members first: for dummy ('D') marks,
     * md_mark_store_ptr() overwrites the leading bytes of the structure
     * with a raw pointer. */
    OFF beg;
    OFF end;

    /* For unresolved openers, 'prev' and 'next' form the chain of open openers
     * of given type 'ch'.
     *
     * During resolving, we disconnect from the chain and point to the
     * corresponding counterpart so opener points to its closer and vice versa.
     */
    int prev;
    int next;
    /* The mark character (see the list above); 'D' for dummy marks. */
    CHAR ch;
    /* Combination of the MD_MARK_xxx flags defined below. */
    unsigned char flags;
};
2446 
/* Mark flags (these apply to ALL mark types). */
#define MD_MARK_POTENTIAL_OPENER            0x01  /* Maybe opener. */
#define MD_MARK_POTENTIAL_CLOSER            0x02  /* Maybe closer. */
#define MD_MARK_OPENER                      0x04  /* Definitely opener. */
#define MD_MARK_CLOSER                      0x08  /* Definitely closer. */
#define MD_MARK_RESOLVED                    0x10  /* Resolved in any definite way. */

/* Mark flags specific for various mark types (so they can share bits).
 *
 * Note the bit 0x20 has three different meanings, depending on the mark
 * character. The MD_MARK_EMPH_MOD3_x values are not independent bits: they
 * encode the original length of an emphasis mark modulo 3 in the two bits
 * covered by MD_MARK_EMPH_MOD3_MASK, so compare them after masking. */
#define MD_MARK_EMPH_INTRAWORD              0x20  /* Helper for the "rule of 3". */
#define MD_MARK_EMPH_MOD3_0                 0x40
#define MD_MARK_EMPH_MOD3_1                 0x80
#define MD_MARK_EMPH_MOD3_2                 (0x40 | 0x80)
#define MD_MARK_EMPH_MOD3_MASK              (0x40 | 0x80)
#define MD_MARK_AUTOLINK                    0x20  /* Distinguisher for '<', '>'. */
#define MD_MARK_VALIDPERMISSIVEAUTOLINK     0x20  /* For permissive autolinks. */
2462 
2463 static MD_MARKCHAIN*
md_asterisk_chain(MD_CTX * ctx,unsigned flags)2464 md_asterisk_chain(MD_CTX* ctx, unsigned flags)
2465 {
2466     switch(flags & (MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_MASK)) {
2467         case MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_0:  return &ASTERISK_OPENERS_intraword_mod3_0;
2468         case MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_1:  return &ASTERISK_OPENERS_intraword_mod3_1;
2469         case MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_2:  return &ASTERISK_OPENERS_intraword_mod3_2;
2470         case MD_MARK_EMPH_MOD3_0:                           return &ASTERISK_OPENERS_extraword_mod3_0;
2471         case MD_MARK_EMPH_MOD3_1:                           return &ASTERISK_OPENERS_extraword_mod3_1;
2472         case MD_MARK_EMPH_MOD3_2:                           return &ASTERISK_OPENERS_extraword_mod3_2;
2473         default:                                            MD_UNREACHABLE();
2474     }
2475     return NULL;
2476 }
2477 
2478 static MD_MARKCHAIN*
md_mark_chain(MD_CTX * ctx,int mark_index)2479 md_mark_chain(MD_CTX* ctx, int mark_index)
2480 {
2481     MD_MARK* mark = &ctx->marks[mark_index];
2482 
2483     switch(mark->ch) {
2484         case _T('*'):   return md_asterisk_chain(ctx, mark->flags);
2485         case _T('_'):   return &UNDERSCORE_OPENERS;
2486         case _T('~'):   return (mark->end - mark->beg == 1) ? &TILDE_OPENERS_1 : &TILDE_OPENERS_2;
2487         case _T('['):   return &BRACKET_OPENERS;
2488         case _T('|'):   return &TABLECELLBOUNDARIES;
2489         default:        return NULL;
2490     }
2491 }
2492 
2493 static MD_MARK*
md_push_mark(MD_CTX * ctx)2494 md_push_mark(MD_CTX* ctx)
2495 {
2496     if(ctx->n_marks >= ctx->alloc_marks) {
2497         MD_MARK* new_marks;
2498 
2499         ctx->alloc_marks = (ctx->alloc_marks > 0
2500                 ? ctx->alloc_marks + ctx->alloc_marks / 2
2501                 : 64);
2502         new_marks = realloc(ctx->marks, ctx->alloc_marks * sizeof(MD_MARK));
2503         if(new_marks == NULL) {
2504             MD_LOG("realloc() failed.");
2505             return NULL;
2506         }
2507 
2508         ctx->marks = new_marks;
2509     }
2510 
2511     return &ctx->marks[ctx->n_marks++];
2512 }
2513 
/* Helper macros for pushing a new mark.
 *
 * They are usable only in a function which provides the following in the
 * scope of the macro invocation:
 *   -- MD_CTX* ctx
 *   -- MD_MARK* mark   (receives the pointer to the new mark)
 *   -- int ret         (set to -1 on failure)
 *   -- label abort     (jumped to on failure)
 *
 * PUSH_MARK_() only reserves the slot and leaves the mark uninitialized.
 */
#define PUSH_MARK_()                                                    \
        do {                                                            \
            mark = md_push_mark(ctx);                                   \
            if(mark == NULL) {                                          \
                ret = -1;                                               \
                goto abort;                                             \
            }                                                           \
        } while(0)

/* Reserve the slot and initialize all members of the new mark; the mark is
 * not linked into any chain ('prev' and 'next' are set to -1). */
#define PUSH_MARK(ch_, beg_, end_, flags_)                              \
        do {                                                            \
            PUSH_MARK_();                                               \
            mark->beg = (beg_);                                         \
            mark->end = (end_);                                         \
            mark->prev = -1;                                            \
            mark->next = -1;                                            \
            mark->ch = (char)(ch_);                                     \
            mark->flags = (flags_);                                     \
        } while(0)
2533 
2534 
2535 static void
md_mark_chain_append(MD_CTX * ctx,MD_MARKCHAIN * chain,int mark_index)2536 md_mark_chain_append(MD_CTX* ctx, MD_MARKCHAIN* chain, int mark_index)
2537 {
2538     if(chain->tail >= 0)
2539         ctx->marks[chain->tail].next = mark_index;
2540     else
2541         chain->head = mark_index;
2542 
2543     ctx->marks[mark_index].prev = chain->tail;
2544     ctx->marks[mark_index].next = -1;
2545     chain->tail = mark_index;
2546 }
2547 
2548 /* Sometimes, we need to store a pointer into the mark. It is quite rare
2549  * so we do not bother to make MD_MARK use union, and it can only happen
2550  * for dummy marks. */
2551 static inline void
md_mark_store_ptr(MD_CTX * ctx,int mark_index,void * ptr)2552 md_mark_store_ptr(MD_CTX* ctx, int mark_index, void* ptr)
2553 {
2554     MD_MARK* mark = &ctx->marks[mark_index];
2555     MD_ASSERT(mark->ch == 'D');
2556 
2557     /* Check only members beg and end are misused for this. */
2558     MD_ASSERT(sizeof(void*) <= 2 * sizeof(OFF));
2559     memcpy(mark, &ptr, sizeof(void*));
2560 }
2561 
2562 static inline void*
md_mark_get_ptr(MD_CTX * ctx,int mark_index)2563 md_mark_get_ptr(MD_CTX* ctx, int mark_index)
2564 {
2565     void* ptr;
2566     MD_MARK* mark = &ctx->marks[mark_index];
2567     MD_ASSERT(mark->ch == 'D');
2568     memcpy(&ptr, mark, sizeof(void*));
2569     return ptr;
2570 }
2571 
2572 static void
md_resolve_range(MD_CTX * ctx,MD_MARKCHAIN * chain,int opener_index,int closer_index)2573 md_resolve_range(MD_CTX* ctx, MD_MARKCHAIN* chain, int opener_index, int closer_index)
2574 {
2575     MD_MARK* opener = &ctx->marks[opener_index];
2576     MD_MARK* closer = &ctx->marks[closer_index];
2577 
2578     /* Remove opener from the list of openers. */
2579     if(chain != NULL) {
2580         if(opener->prev >= 0)
2581             ctx->marks[opener->prev].next = opener->next;
2582         else
2583             chain->head = opener->next;
2584 
2585         if(opener->next >= 0)
2586             ctx->marks[opener->next].prev = opener->prev;
2587         else
2588             chain->tail = opener->prev;
2589     }
2590 
2591     /* Interconnect opener and closer and mark both as resolved. */
2592     opener->next = closer_index;
2593     opener->flags |= MD_MARK_OPENER | MD_MARK_RESOLVED;
2594     closer->prev = opener_index;
2595     closer->flags |= MD_MARK_CLOSER | MD_MARK_RESOLVED;
2596 }
2597 
2598 
2599 #define MD_ROLLBACK_ALL         0
2600 #define MD_ROLLBACK_CROSSING    1
2601 
2602 /* In the range ctx->marks[opener_index] ... [closer_index], undo some or all
2603  * resolvings accordingly to these rules:
2604  *
2605  * (1) All openers BEFORE the range corresponding to any closer inside the
2606  *     range are un-resolved and they are re-added to their respective chains
2607  *     of unresolved openers. This ensures we can reuse the opener for closers
2608  *     AFTER the range.
2609  *
2610  * (2) If 'how' is MD_ROLLBACK_ALL, then ALL resolved marks inside the range
2611  *     are discarded.
2612  *
2613  * (3) If 'how' is MD_ROLLBACK_CROSSING, only closers with openers handled
2614  *     in (1) are discarded. I.e. pairs of openers and closers which are both
2615  *     inside the range are retained as well as any unpaired marks.
2616  */
2617 static void
md_rollback(MD_CTX * ctx,int opener_index,int closer_index,int how)2618 md_rollback(MD_CTX* ctx, int opener_index, int closer_index, int how)
2619 {
2620     int i;
2621     int mark_index;
2622 
2623     /* Cut all unresolved openers at the mark index. */
2624     for(i = OPENERS_CHAIN_FIRST; i < OPENERS_CHAIN_LAST+1; i++) {
2625         MD_MARKCHAIN* chain = &ctx->mark_chains[i];
2626 
2627         while(chain->tail >= opener_index)
2628             chain->tail = ctx->marks[chain->tail].prev;
2629 
2630         if(chain->tail >= 0)
2631             ctx->marks[chain->tail].next = -1;
2632         else
2633             chain->head = -1;
2634     }
2635 
2636     /* Go backwards so that unresolved openers are re-added into their
2637      * respective chains, in the right order. */
2638     mark_index = closer_index - 1;
2639     while(mark_index > opener_index) {
2640         MD_MARK* mark = &ctx->marks[mark_index];
2641         int mark_flags = mark->flags;
2642         int discard_flag = (how == MD_ROLLBACK_ALL);
2643 
2644         if(mark->flags & MD_MARK_CLOSER) {
2645             int mark_opener_index = mark->prev;
2646 
2647             /* Undo opener BEFORE the range. */
2648             if(mark_opener_index < opener_index) {
2649                 MD_MARK* mark_opener = &ctx->marks[mark_opener_index];
2650                 MD_MARKCHAIN* chain;
2651 
2652                 mark_opener->flags &= ~(MD_MARK_OPENER | MD_MARK_CLOSER | MD_MARK_RESOLVED);
2653                 chain = md_mark_chain(ctx, opener_index);
2654                 if(chain != NULL) {
2655                     md_mark_chain_append(ctx, chain, mark_opener_index);
2656                     discard_flag = 1;
2657                 }
2658             }
2659         }
2660 
2661         /* And reset our flags. */
2662         if(discard_flag)
2663             mark->flags &= ~(MD_MARK_OPENER | MD_MARK_CLOSER | MD_MARK_RESOLVED);
2664 
2665         /* Jump as far as we can over unresolved or non-interesting marks. */
2666         switch(how) {
2667             case MD_ROLLBACK_CROSSING:
2668                 if((mark_flags & MD_MARK_CLOSER)  &&  mark->prev > opener_index) {
2669                     /* If we are closer with opener INSIDE the range, there may
2670                      * not be any other crosser inside the subrange. */
2671                     mark_index = mark->prev;
2672                     break;
2673                 }
2674                 /* Pass through. */
2675             default:
2676                 mark_index--;
2677                 break;
2678         }
2679     }
2680 }
2681 
2682 static void
md_build_mark_char_map(MD_CTX * ctx)2683 md_build_mark_char_map(MD_CTX* ctx)
2684 {
2685     memset(ctx->mark_char_map, 0, sizeof(ctx->mark_char_map));
2686 
2687     ctx->mark_char_map['\\'] = 1;
2688     ctx->mark_char_map['*'] = 1;
2689     ctx->mark_char_map['_'] = 1;
2690     ctx->mark_char_map['`'] = 1;
2691     ctx->mark_char_map['&'] = 1;
2692     ctx->mark_char_map[';'] = 1;
2693     ctx->mark_char_map['<'] = 1;
2694     ctx->mark_char_map['>'] = 1;
2695     ctx->mark_char_map['['] = 1;
2696     ctx->mark_char_map['!'] = 1;
2697     ctx->mark_char_map[']'] = 1;
2698     ctx->mark_char_map['\0'] = 1;
2699 
2700     if(ctx->parser.flags & MD_FLAG_STRIKETHROUGH)
2701         ctx->mark_char_map['~'] = 1;
2702 
2703     if(ctx->parser.flags & MD_FLAG_LATEXMATHSPANS)
2704         ctx->mark_char_map['$'] = 1;
2705 
2706     if(ctx->parser.flags & MD_FLAG_PERMISSIVEEMAILAUTOLINKS)
2707         ctx->mark_char_map['@'] = 1;
2708 
2709     if(ctx->parser.flags & MD_FLAG_PERMISSIVEURLAUTOLINKS)
2710         ctx->mark_char_map[':'] = 1;
2711 
2712     if(ctx->parser.flags & MD_FLAG_PERMISSIVEWWWAUTOLINKS)
2713         ctx->mark_char_map['.'] = 1;
2714 
2715     if((ctx->parser.flags & MD_FLAG_TABLES) || (ctx->parser.flags & MD_FLAG_WIKILINKS))
2716         ctx->mark_char_map['|'] = 1;
2717 
2718     if(ctx->parser.flags & MD_FLAG_COLLAPSEWHITESPACE) {
2719         int i;
2720 
2721         for(i = 0; i < (int) sizeof(ctx->mark_char_map); i++) {
2722             if(ISWHITESPACE_(i))
2723                 ctx->mark_char_map[i] = 1;
2724         }
2725     }
2726 }
2727 
2728 /* We limit code span marks to lower than 32 backticks. This solves the
2729  * pathologic case of too many openers, each of different length: Their
2730  * resolving would be then O(n^2). */
2731 #define CODESPAN_MARK_MAXLEN    32
2732 
2733 static int
md_is_code_span(MD_CTX * ctx,const MD_LINE * lines,int n_lines,OFF beg,OFF * p_opener_beg,OFF * p_opener_end,OFF * p_closer_beg,OFF * p_closer_end,OFF last_potential_closers[CODESPAN_MARK_MAXLEN],int * p_reached_paragraph_end)2734 md_is_code_span(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg,
2735                 OFF* p_opener_beg, OFF* p_opener_end,
2736                 OFF* p_closer_beg, OFF* p_closer_end,
2737                 OFF last_potential_closers[CODESPAN_MARK_MAXLEN],
2738                 int* p_reached_paragraph_end)
2739 {
2740     OFF opener_beg = beg;
2741     OFF opener_end;
2742     OFF closer_beg;
2743     OFF closer_end;
2744     SZ mark_len;
2745     OFF line_end;
2746     int has_space_after_opener = FALSE;
2747     int has_eol_after_opener = FALSE;
2748     int has_space_before_closer = FALSE;
2749     int has_eol_before_closer = FALSE;
2750     int has_only_space = TRUE;
2751     int line_index = 0;
2752 
2753     line_end = lines[0].end;
2754     opener_end = opener_beg;
2755     while(opener_end < line_end  &&  CH(opener_end) == _T('`'))
2756         opener_end++;
2757     has_space_after_opener = (opener_end < line_end && CH(opener_end) == _T(' '));
2758     has_eol_after_opener = (opener_end == line_end);
2759 
2760     /* The caller needs to know end of the opening mark even if we fail. */
2761     *p_opener_end = opener_end;
2762 
2763     mark_len = opener_end - opener_beg;
2764     if(mark_len > CODESPAN_MARK_MAXLEN)
2765         return FALSE;
2766 
2767     /* Check whether we already know there is no closer of this length.
2768      * If so, re-scan does no sense. This fixes issue #59. */
2769     if(last_potential_closers[mark_len-1] >= lines[n_lines-1].end  ||
2770        (*p_reached_paragraph_end  &&  last_potential_closers[mark_len-1] < opener_end))
2771         return FALSE;
2772 
2773     closer_beg = opener_end;
2774     closer_end = opener_end;
2775 
2776     /* Find closer mark. */
2777     while(TRUE) {
2778         while(closer_beg < line_end  &&  CH(closer_beg) != _T('`')) {
2779             if(CH(closer_beg) != _T(' '))
2780                 has_only_space = FALSE;
2781             closer_beg++;
2782         }
2783         closer_end = closer_beg;
2784         while(closer_end < line_end  &&  CH(closer_end) == _T('`'))
2785             closer_end++;
2786 
2787         if(closer_end - closer_beg == mark_len) {
2788             /* Success. */
2789             has_space_before_closer = (closer_beg > lines[line_index].beg && CH(closer_beg-1) == _T(' '));
2790             has_eol_before_closer = (closer_beg == lines[line_index].beg);
2791             break;
2792         }
2793 
2794         if(closer_end - closer_beg > 0) {
2795             /* We have found a back-tick which is not part of the closer. */
2796             has_only_space = FALSE;
2797 
2798             /* But if we eventually fail, remember it as a potential closer
2799              * of its own length for future attempts. This mitigates needs for
2800              * rescans. */
2801             if(closer_end - closer_beg < CODESPAN_MARK_MAXLEN) {
2802                 if(closer_beg > last_potential_closers[closer_end - closer_beg - 1])
2803                     last_potential_closers[closer_end - closer_beg - 1] = closer_beg;
2804             }
2805         }
2806 
2807         if(closer_end >= line_end) {
2808             line_index++;
2809             if(line_index >= n_lines) {
2810                 /* Reached end of the paragraph and still nothing. */
2811                 *p_reached_paragraph_end = TRUE;
2812                 return FALSE;
2813             }
2814             /* Try on the next line. */
2815             line_end = lines[line_index].end;
2816             closer_beg = lines[line_index].beg;
2817         } else {
2818             closer_beg = closer_end;
2819         }
2820     }
2821 
2822     /* If there is a space or a new line both after and before the opener
2823      * (and if the code span is not made of spaces only), consume one initial
2824      * and one trailing space as part of the marks. */
2825     if(!has_only_space  &&
2826        (has_space_after_opener || has_eol_after_opener)  &&
2827        (has_space_before_closer || has_eol_before_closer))
2828     {
2829         if(has_space_after_opener)
2830             opener_end++;
2831         else
2832             opener_end = lines[1].beg;
2833 
2834         if(has_space_before_closer)
2835             closer_beg--;
2836         else {
2837             closer_beg = lines[line_index-1].end;
2838             /* We need to eat the preceding "\r\n" but not any line trailing
2839              * spaces. */
2840             while(closer_beg < ctx->size  &&  ISBLANK(closer_beg))
2841                 closer_beg++;
2842         }
2843     }
2844 
2845     *p_opener_beg = opener_beg;
2846     *p_opener_end = opener_end;
2847     *p_closer_beg = closer_beg;
2848     *p_closer_end = closer_end;
2849     return TRUE;
2850 }
2851 
2852 static int
md_is_autolink_uri(MD_CTX * ctx,OFF beg,OFF max_end,OFF * p_end)2853 md_is_autolink_uri(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end)
2854 {
2855     OFF off = beg+1;
2856 
2857     MD_ASSERT(CH(beg) == _T('<'));
2858 
2859     /* Check for scheme. */
2860     if(off >= max_end  ||  !ISASCII(off))
2861         return FALSE;
2862     off++;
2863     while(1) {
2864         if(off >= max_end)
2865             return FALSE;
2866         if(off - beg > 32)
2867             return FALSE;
2868         if(CH(off) == _T(':')  &&  off - beg >= 3)
2869             break;
2870         if(!ISALNUM(off) && CH(off) != _T('+') && CH(off) != _T('-') && CH(off) != _T('.'))
2871             return FALSE;
2872         off++;
2873     }
2874 
2875     /* Check the path after the scheme. */
2876     while(off < max_end  &&  CH(off) != _T('>')) {
2877         if(ISWHITESPACE(off) || ISCNTRL(off) || CH(off) == _T('<'))
2878             return FALSE;
2879         off++;
2880     }
2881 
2882     if(off >= max_end)
2883         return FALSE;
2884 
2885     MD_ASSERT(CH(off) == _T('>'));
2886     *p_end = off+1;
2887     return TRUE;
2888 }
2889 
2890 static int
md_is_autolink_email(MD_CTX * ctx,OFF beg,OFF max_end,OFF * p_end)2891 md_is_autolink_email(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end)
2892 {
2893     OFF off = beg + 1;
2894     int label_len;
2895 
2896     MD_ASSERT(CH(beg) == _T('<'));
2897 
2898     /* The code should correspond to this regexp:
2899             /^[a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+
2900             @[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?
2901             (?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$/
2902      */
2903 
2904     /* Username (before '@'). */
2905     while(off < max_end  &&  (ISALNUM(off) || ISANYOF(off, _T(".!#$%&'*+/=?^_`{|}~-"))))
2906         off++;
2907     if(off <= beg+1)
2908         return FALSE;
2909 
2910     /* '@' */
2911     if(off >= max_end  ||  CH(off) != _T('@'))
2912         return FALSE;
2913     off++;
2914 
2915     /* Labels delimited with '.'; each label is sequence of 1 - 63 alnum
2916      * characters or '-', but '-' is not allowed as first or last char. */
2917     label_len = 0;
2918     while(off < max_end) {
2919         if(ISALNUM(off))
2920             label_len++;
2921         else if(CH(off) == _T('-')  &&  label_len > 0)
2922             label_len++;
2923         else if(CH(off) == _T('.')  &&  label_len > 0  &&  CH(off-1) != _T('-'))
2924             label_len = 0;
2925         else
2926             break;
2927 
2928         if(label_len > 63)
2929             return FALSE;
2930 
2931         off++;
2932     }
2933 
2934     if(label_len <= 0  || off >= max_end  ||  CH(off) != _T('>') ||  CH(off-1) == _T('-'))
2935         return FALSE;
2936 
2937     *p_end = off+1;
2938     return TRUE;
2939 }
2940 
2941 static int
md_is_autolink(MD_CTX * ctx,OFF beg,OFF max_end,OFF * p_end,int * p_missing_mailto)2942 md_is_autolink(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end, int* p_missing_mailto)
2943 {
2944     if(md_is_autolink_uri(ctx, beg, max_end, p_end)) {
2945         *p_missing_mailto = FALSE;
2946         return TRUE;
2947     }
2948 
2949     if(md_is_autolink_email(ctx, beg, max_end, p_end)) {
2950         *p_missing_mailto = TRUE;
2951         return TRUE;
2952     }
2953 
2954     return FALSE;
2955 }
2956 
2957 static int
md_collect_marks(MD_CTX * ctx,const MD_LINE * lines,int n_lines,int table_mode)2958 md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mode)
2959 {
2960     int i;
2961     int ret = 0;
2962     MD_MARK* mark;
2963     OFF codespan_last_potential_closers[CODESPAN_MARK_MAXLEN] = { 0 };
2964     int codespan_scanned_till_paragraph_end = FALSE;
2965 
2966     for(i = 0; i < n_lines; i++) {
2967         const MD_LINE* line = &lines[i];
2968         OFF off = line->beg;
2969         OFF line_end = line->end;
2970 
2971         while(TRUE) {
2972             CHAR ch;
2973 
2974 #ifdef MD4C_USE_UTF16
2975     /* For UTF-16, mark_char_map[] covers only ASCII. */
2976     #define IS_MARK_CHAR(off)   ((CH(off) < SIZEOF_ARRAY(ctx->mark_char_map))  &&  \
2977                                 (ctx->mark_char_map[(unsigned char) CH(off)]))
2978 #else
2979     /* For 8-bit encodings, mark_char_map[] covers all 256 elements. */
2980     #define IS_MARK_CHAR(off)   (ctx->mark_char_map[(unsigned char) CH(off)])
2981 #endif
2982 
2983             /* Optimization: Use some loop unrolling. */
2984             while(off + 3 < line_end  &&  !IS_MARK_CHAR(off+0)  &&  !IS_MARK_CHAR(off+1)
2985                                       &&  !IS_MARK_CHAR(off+2)  &&  !IS_MARK_CHAR(off+3))
2986                 off += 4;
2987             while(off < line_end  &&  !IS_MARK_CHAR(off+0))
2988                 off++;
2989 
2990             if(off >= line_end)
2991                 break;
2992 
2993             ch = CH(off);
2994 
2995             /* A backslash escape.
2996              * It can go beyond line->end as it may involve escaped new
2997              * line to form a hard break. */
2998             if(ch == _T('\\')  &&  off+1 < ctx->size  &&  (ISPUNCT(off+1) || ISNEWLINE(off+1))) {
2999                 /* Hard-break cannot be on the last line of the block. */
3000                 if(!ISNEWLINE(off+1)  ||  i+1 < n_lines)
3001                     PUSH_MARK(ch, off, off+2, MD_MARK_RESOLVED);
3002                 off += 2;
3003                 continue;
3004             }
3005 
3006             /* A potential (string) emphasis start/end. */
3007             if(ch == _T('*')  ||  ch == _T('_')) {
3008                 OFF tmp = off+1;
3009                 int left_level;     /* What precedes: 0 = whitespace; 1 = punctuation; 2 = other char. */
3010                 int right_level;    /* What follows: 0 = whitespace; 1 = punctuation; 2 = other char. */
3011 
3012                 while(tmp < line_end  &&  CH(tmp) == ch)
3013                     tmp++;
3014 
3015                 if(off == line->beg  ||  ISUNICODEWHITESPACEBEFORE(off))
3016                     left_level = 0;
3017                 else if(ISUNICODEPUNCTBEFORE(off))
3018                     left_level = 1;
3019                 else
3020                     left_level = 2;
3021 
3022                 if(tmp == line_end  ||  ISUNICODEWHITESPACE(tmp))
3023                     right_level = 0;
3024                 else if(ISUNICODEPUNCT(tmp))
3025                     right_level = 1;
3026                 else
3027                     right_level = 2;
3028 
3029                 /* Intra-word underscore doesn't have special meaning. */
3030                 if(ch == _T('_')  &&  left_level == 2  &&  right_level == 2) {
3031                     left_level = 0;
3032                     right_level = 0;
3033                 }
3034 
3035                 if(left_level != 0  ||  right_level != 0) {
3036                     unsigned flags = 0;
3037 
3038                     if(left_level > 0  &&  left_level >= right_level)
3039                         flags |= MD_MARK_POTENTIAL_CLOSER;
3040                     if(right_level > 0  &&  right_level >= left_level)
3041                         flags |= MD_MARK_POTENTIAL_OPENER;
3042                     if(left_level == 2  &&  right_level == 2)
3043                         flags |= MD_MARK_EMPH_INTRAWORD;
3044 
3045                     /* For "the rule of three" we need to remember the original
3046                      * size of the mark (modulo three), before we potentially
3047                      * split the mark when being later resolved partially by some
3048                      * shorter closer. */
3049                     switch((tmp - off) % 3) {
3050                         case 0: flags |= MD_MARK_EMPH_MOD3_0; break;
3051                         case 1: flags |= MD_MARK_EMPH_MOD3_1; break;
3052                         case 2: flags |= MD_MARK_EMPH_MOD3_2; break;
3053                     }
3054 
3055                     PUSH_MARK(ch, off, tmp, flags);
3056 
3057                     /* During resolving, multiple asterisks may have to be
3058                      * split into independent span start/ends. Consider e.g.
3059                      * "**foo* bar*". Therefore we push also some empty dummy
3060                      * marks to have enough space for that. */
3061                     off++;
3062                     while(off < tmp) {
3063                         PUSH_MARK('D', off, off, 0);
3064                         off++;
3065                     }
3066                     continue;
3067                 }
3068 
3069                 off = tmp;
3070                 continue;
3071             }
3072 
3073             /* A potential code span start/end. */
3074             if(ch == _T('`')) {
3075                 OFF opener_beg, opener_end;
3076                 OFF closer_beg, closer_end;
3077                 int is_code_span;
3078 
3079                 is_code_span = md_is_code_span(ctx, lines + i, n_lines - i, off,
3080                                     &opener_beg, &opener_end, &closer_beg, &closer_end,
3081                                     codespan_last_potential_closers,
3082                                     &codespan_scanned_till_paragraph_end);
3083                 if(is_code_span) {
3084                     PUSH_MARK(_T('`'), opener_beg, opener_end, MD_MARK_OPENER | MD_MARK_RESOLVED);
3085                     PUSH_MARK(_T('`'), closer_beg, closer_end, MD_MARK_CLOSER | MD_MARK_RESOLVED);
3086                     ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1;
3087                     ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2;
3088 
3089                     off = closer_end;
3090 
3091                     /* Advance the current line accordingly. */
3092                     while(off > line_end) {
3093                         i++;
3094                         line++;
3095                         line_end = line->end;
3096                     }
3097                     continue;
3098                 }
3099 
3100                 off = opener_end;
3101                 continue;
3102             }
3103 
3104             /* A potential entity start. */
3105             if(ch == _T('&')) {
3106                 PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_OPENER);
3107                 off++;
3108                 continue;
3109             }
3110 
3111             /* A potential entity end. */
3112             if(ch == _T(';')) {
3113                 /* We surely cannot be entity unless the previous mark is '&'. */
3114                 if(ctx->n_marks > 0  &&  ctx->marks[ctx->n_marks-1].ch == _T('&'))
3115                     PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_CLOSER);
3116 
3117                 off++;
3118                 continue;
3119             }
3120 
3121             /* A potential autolink or raw HTML start/end. */
3122             if(ch == _T('<')) {
3123                 int is_autolink;
3124                 OFF autolink_end;
3125                 int missing_mailto;
3126 
3127                 if(!(ctx->parser.flags & MD_FLAG_NOHTMLSPANS)) {
3128                     int is_html;
3129                     OFF html_end;
3130 
3131                     /* Given the nature of the raw HTML, we have to recognize
3132                      * it here. Doing so later in md_analyze_lt_gt() could
3133                      * open can of worms of quadratic complexity. */
3134                     is_html = md_is_html_any(ctx, lines + i, n_lines - i, off,
3135                                     lines[n_lines-1].end, &html_end);
3136                     if(is_html) {
3137                         PUSH_MARK(_T('<'), off, off, MD_MARK_OPENER | MD_MARK_RESOLVED);
3138                         PUSH_MARK(_T('>'), html_end, html_end, MD_MARK_CLOSER | MD_MARK_RESOLVED);
3139                         ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1;
3140                         ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2;
3141                         off = html_end;
3142 
3143                         /* Advance the current line accordingly. */
3144                         while(off > line_end) {
3145                             i++;
3146                             line++;
3147                             line_end = line->end;
3148                         }
3149                         continue;
3150                     }
3151                 }
3152 
3153                 is_autolink = md_is_autolink(ctx, off, lines[n_lines-1].end,
3154                                     &autolink_end, &missing_mailto);
3155                 if(is_autolink) {
3156                     PUSH_MARK((missing_mailto ? _T('@') : _T('<')), off, off+1,
3157                                 MD_MARK_OPENER | MD_MARK_RESOLVED | MD_MARK_AUTOLINK);
3158                     PUSH_MARK(_T('>'), autolink_end-1, autolink_end,
3159                                 MD_MARK_CLOSER | MD_MARK_RESOLVED | MD_MARK_AUTOLINK);
3160                     ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1;
3161                     ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2;
3162                     off = autolink_end;
3163                     continue;
3164                 }
3165 
3166                 off++;
3167                 continue;
3168             }
3169 
3170             /* A potential link or its part. */
3171             if(ch == _T('[')  ||  (ch == _T('!') && off+1 < line_end && CH(off+1) == _T('['))) {
3172                 OFF tmp = (ch == _T('[') ? off+1 : off+2);
3173                 PUSH_MARK(ch, off, tmp, MD_MARK_POTENTIAL_OPENER);
3174                 off = tmp;
3175                 /* Two dummies to make enough place for data we need if it is
3176                  * a link. */
3177                 PUSH_MARK('D', off, off, 0);
3178                 PUSH_MARK('D', off, off, 0);
3179                 continue;
3180             }
3181             if(ch == _T(']')) {
3182                 PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_CLOSER);
3183                 off++;
3184                 continue;
3185             }
3186 
3187             /* A potential permissive e-mail autolink. */
3188             if(ch == _T('@')) {
3189                 if(line->beg + 1 <= off  &&  ISALNUM(off-1)  &&
3190                     off + 3 < line->end  &&  ISALNUM(off+1))
3191                 {
3192                     PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_OPENER);
3193                     /* Push a dummy as a reserve for a closer. */
3194                     PUSH_MARK('D', off, off, 0);
3195                 }
3196 
3197                 off++;
3198                 continue;
3199             }
3200 
3201             /* A potential permissive URL autolink. */
3202             if(ch == _T(':')) {
3203                 static struct {
3204                     const CHAR* scheme;
3205                     SZ scheme_size;
3206                     const CHAR* suffix;
3207                     SZ suffix_size;
3208                 } scheme_map[] = {
3209                     /* In the order from the most frequently used, arguably. */
3210                     { _T("http"), 4,    _T("//"), 2 },
3211                     { _T("https"), 5,   _T("//"), 2 },
3212                     { _T("ftp"), 3,     _T("//"), 2 }
3213                 };
3214                 int scheme_index;
3215 
3216                 for(scheme_index = 0; scheme_index < (int) SIZEOF_ARRAY(scheme_map); scheme_index++) {
3217                     const CHAR* scheme = scheme_map[scheme_index].scheme;
3218                     const SZ scheme_size = scheme_map[scheme_index].scheme_size;
3219                     const CHAR* suffix = scheme_map[scheme_index].suffix;
3220                     const SZ suffix_size = scheme_map[scheme_index].suffix_size;
3221 
3222                     if(line->beg + scheme_size <= off  &&  md_ascii_eq(STR(off-scheme_size), scheme, scheme_size)  &&
3223                         (line->beg + scheme_size == off || ISWHITESPACE(off-scheme_size-1) || ISANYOF(off-scheme_size-1, _T("*_~([")))  &&
3224                         off + 1 + suffix_size < line->end  &&  md_ascii_eq(STR(off+1), suffix, suffix_size))
3225                     {
3226                         PUSH_MARK(ch, off-scheme_size, off+1+suffix_size, MD_MARK_POTENTIAL_OPENER);
3227                         /* Push a dummy as a reserve for a closer. */
3228                         PUSH_MARK('D', off, off, 0);
3229                         off += 1 + suffix_size;
3230                         continue;
3231                     }
3232                 }
3233 
3234                 off++;
3235                 continue;
3236             }
3237 
3238             /* A potential permissive WWW autolink. */
3239             if(ch == _T('.')) {
3240                 if(line->beg + 3 <= off  &&  md_ascii_eq(STR(off-3), _T("www"), 3)  &&
3241                     (line->beg + 3 == off || ISWHITESPACE(off-4) || ISANYOF(off-4, _T("*_~([")))  &&
3242                     off + 1 < line_end)
3243                 {
3244                     PUSH_MARK(ch, off-3, off+1, MD_MARK_POTENTIAL_OPENER);
3245                     /* Push a dummy as a reserve for a closer. */
3246                     PUSH_MARK('D', off, off, 0);
3247                     off++;
3248                     continue;
3249                 }
3250 
3251                 off++;
3252                 continue;
3253             }
3254 
3255             /* A potential table cell boundary or wiki link label delimiter. */
3256             if((table_mode || ctx->parser.flags & MD_FLAG_WIKILINKS) && ch == _T('|')) {
3257                 PUSH_MARK(ch, off, off+1, 0);
3258                 off++;
3259                 continue;
3260             }
3261 
3262             /* A potential strikethrough start/end. */
3263             if(ch == _T('~')) {
3264                 OFF tmp = off+1;
3265 
3266                 while(tmp < line_end  &&  CH(tmp) == _T('~'))
3267                     tmp++;
3268 
3269                 if(tmp - off < 3) {
3270                     unsigned flags = 0;
3271 
3272                     if(tmp < line_end  &&  !ISUNICODEWHITESPACE(tmp))
3273                         flags |= MD_MARK_POTENTIAL_OPENER;
3274                     if(off > line->beg  &&  !ISUNICODEWHITESPACEBEFORE(off))
3275                         flags |= MD_MARK_POTENTIAL_CLOSER;
3276                     if(flags != 0)
3277                         PUSH_MARK(ch, off, tmp, flags);
3278                 }
3279 
3280                 off = tmp;
3281                 continue;
3282             }
3283 
3284             /* A potential equation start/end */
3285             if(ch == _T('$')) {
3286                 /* We can have at most two consecutive $ signs,
3287                  * where two dollar signs signify a display equation. */
3288                 OFF tmp = off+1;
3289 
3290                 while(tmp < line_end && CH(tmp) == _T('$'))
3291                     tmp++;
3292 
3293                 if (tmp - off <= 2)
3294                     PUSH_MARK(ch, off, tmp, MD_MARK_POTENTIAL_OPENER | MD_MARK_POTENTIAL_CLOSER);
3295                 off = tmp;
3296                 continue;
3297             }
3298 
3299             /* Turn non-trivial whitespace into single space. */
3300             if(ISWHITESPACE_(ch)) {
3301                 OFF tmp = off+1;
3302 
3303                 while(tmp < line_end  &&  ISWHITESPACE(tmp))
3304                     tmp++;
3305 
3306                 if(tmp - off > 1  ||  ch != _T(' '))
3307                     PUSH_MARK(ch, off, tmp, MD_MARK_RESOLVED);
3308 
3309                 off = tmp;
3310                 continue;
3311             }
3312 
3313             /* NULL character. */
3314             if(ch == _T('\0')) {
3315                 PUSH_MARK(ch, off, off+1, MD_MARK_RESOLVED);
3316                 off++;
3317                 continue;
3318             }
3319 
3320             off++;
3321         }
3322     }
3323 
3324     /* Add a dummy mark at the end of the mark vector to simplify
3325      * process_inlines(). */
3326     PUSH_MARK(127, ctx->size, ctx->size, MD_MARK_RESOLVED);
3327 
3328 abort:
3329     return ret;
3330 }
3331 
3332 static void
md_analyze_bracket(MD_CTX * ctx,int mark_index)3333 md_analyze_bracket(MD_CTX* ctx, int mark_index)
3334 {
3335     /* We cannot really resolve links here as for that we would need
3336      * more context. E.g. a following pair of brackets (reference link),
3337      * or enclosing pair of brackets (if the inner is the link, the outer
3338      * one cannot be.)
3339      *
3340      * Therefore we here only construct a list of resolved '[' ']' pairs
3341      * ordered by position of the closer. This allows ur to analyze what is
3342      * or is not link in the right order, from inside to outside in case
3343      * of nested brackets.
3344      *
3345      * The resolving itself is deferred into md_resolve_links().
3346      */
3347 
3348     MD_MARK* mark = &ctx->marks[mark_index];
3349 
3350     if(mark->flags & MD_MARK_POTENTIAL_OPENER) {
3351         md_mark_chain_append(ctx, &BRACKET_OPENERS, mark_index);
3352         return;
3353     }
3354 
3355     if(BRACKET_OPENERS.tail >= 0) {
3356         /* Pop the opener from the chain. */
3357         int opener_index = BRACKET_OPENERS.tail;
3358         MD_MARK* opener = &ctx->marks[opener_index];
3359         if(opener->prev >= 0)
3360             ctx->marks[opener->prev].next = -1;
3361         else
3362             BRACKET_OPENERS.head = -1;
3363         BRACKET_OPENERS.tail = opener->prev;
3364 
3365         /* Interconnect the opener and closer. */
3366         opener->next = mark_index;
3367         mark->prev = opener_index;
3368 
3369         /* Add the pair into chain of potential links for md_resolve_links().
3370          * Note we misuse opener->prev for this as opener->next points to its
3371          * closer. */
3372         if(ctx->unresolved_link_tail >= 0)
3373             ctx->marks[ctx->unresolved_link_tail].prev = opener_index;
3374         else
3375             ctx->unresolved_link_head = opener_index;
3376         ctx->unresolved_link_tail = opener_index;
3377         opener->prev = -1;
3378     }
3379 }
3380 
3381 /* Forward declaration. */
3382 static void md_analyze_link_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
3383                                      int mark_beg, int mark_end);
3384 
3385 static int
md_resolve_links(MD_CTX * ctx,const MD_LINE * lines,int n_lines)3386 md_resolve_links(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
3387 {
3388     int opener_index = ctx->unresolved_link_head;
3389     OFF last_link_beg = 0;
3390     OFF last_link_end = 0;
3391     OFF last_img_beg = 0;
3392     OFF last_img_end = 0;
3393 
3394     while(opener_index >= 0) {
3395         MD_MARK* opener = &ctx->marks[opener_index];
3396         int closer_index = opener->next;
3397         MD_MARK* closer = &ctx->marks[closer_index];
3398         int next_index = opener->prev;
3399         MD_MARK* next_opener;
3400         MD_MARK* next_closer;
3401         MD_LINK_ATTR attr;
3402         int is_link = FALSE;
3403 
3404         if(next_index >= 0) {
3405             next_opener = &ctx->marks[next_index];
3406             next_closer = &ctx->marks[next_opener->next];
3407         } else {
3408             next_opener = NULL;
3409             next_closer = NULL;
3410         }
3411 
3412         /* If nested ("[ [ ] ]"), we need to make sure that:
3413          *   - The outer does not end inside of (...) belonging to the inner.
3414          *   - The outer cannot be link if the inner is link (i.e. not image).
3415          *
3416          * (Note we here analyze from inner to outer as the marks are ordered
3417          * by closer->beg.)
3418          */
3419         if((opener->beg < last_link_beg  &&  closer->end < last_link_end)  ||
3420            (opener->beg < last_img_beg  &&  closer->end < last_img_end)  ||
3421            (opener->beg < last_link_end  &&  opener->ch == '['))
3422         {
3423             opener_index = next_index;
3424             continue;
3425         }
3426 
3427         /* Recognize and resolve wiki links.
3428          * Wiki-links maybe '[[destination]]' or '[[destination|label]]'.
3429          */
3430         if ((ctx->parser.flags & MD_FLAG_WIKILINKS) &&
3431             (opener->end - opener->beg == 1) &&         /* not image */
3432             next_opener != NULL &&                      /* double '[' opener */
3433             next_opener->ch == '[' &&
3434             (next_opener->beg == opener->beg - 1) &&
3435             (next_opener->end - next_opener->beg == 1) &&
3436             next_closer != NULL &&                      /* double ']' closer */
3437             next_closer->ch == ']' &&
3438             (next_closer->beg == closer->beg + 1) &&
3439             (next_closer->end - next_closer->beg == 1))
3440         {
3441             MD_MARK* delim = NULL;
3442             int delim_index;
3443             OFF dest_beg, dest_end;
3444 
3445             is_link = TRUE;
3446 
3447             /* We don't allow destination to be longer than 100 characters.
3448              * Lets scan to see whether there is '|'. (If not then the whole
3449              * wiki-link has to be below the 100 characters.) */
3450             delim_index = opener_index + 1;
3451             while(delim_index < closer_index) {
3452                 MD_MARK* m = &ctx->marks[delim_index];
3453                 if(m->ch == '|') {
3454                     delim = m;
3455                     break;
3456                 }
3457                 if(m->ch != 'D'  &&  m->beg - opener->end > 100)
3458                     break;
3459                 delim_index++;
3460             }
3461             dest_beg = opener->end;
3462             dest_end = (delim != NULL) ? delim->beg : closer->beg;
3463             if(dest_end - dest_beg == 0 || dest_end - dest_beg > 100)
3464                 is_link = FALSE;
3465 
3466             /* There may not be any new line in the destination. */
3467             if(is_link) {
3468                 OFF off;
3469                 for(off = dest_beg; off < dest_end; off++) {
3470                     if(ISNEWLINE(off)) {
3471                         is_link = FALSE;
3472                         break;
3473                     }
3474                 }
3475             }
3476 
3477             if(is_link) {
3478                 if(delim != NULL) {
3479                     if(delim->end < closer->beg) {
3480                         opener->end = delim->beg;
3481                     } else {
3482                         /* The pipe is just before the closer: [[foo|]] */
3483                         closer->beg = delim->beg;
3484                         delim = NULL;
3485                     }
3486                 }
3487 
3488                 opener->beg = next_opener->beg;
3489                 opener->next = closer_index;
3490                 opener->flags |= MD_MARK_OPENER | MD_MARK_RESOLVED;
3491 
3492                 closer->end = next_closer->end;
3493                 closer->prev = opener_index;
3494                 closer->flags |= MD_MARK_CLOSER | MD_MARK_RESOLVED;
3495 
3496                 last_link_beg = opener->beg;
3497                 last_link_end = closer->end;
3498 
3499                 if(delim != NULL) {
3500                     delim->flags |= MD_MARK_RESOLVED;
3501                     md_rollback(ctx, opener_index, delim_index, MD_ROLLBACK_ALL);
3502                     md_analyze_link_contents(ctx, lines, n_lines, opener_index+1, closer_index);
3503                 } else {
3504                     md_rollback(ctx, opener_index, closer_index, MD_ROLLBACK_ALL);
3505                 }
3506 
3507                 opener_index = next_opener->prev;
3508                 continue;
3509             }
3510         }
3511 
3512         if(next_opener != NULL  &&  next_opener->beg == closer->end) {
3513             if(next_closer->beg > closer->end + 1) {
3514                 /* Might be full reference link. */
3515                 is_link = md_is_link_reference(ctx, lines, n_lines, next_opener->beg, next_closer->end, &attr);
3516             } else {
3517                 /* Might be shortcut reference link. */
3518                 is_link = md_is_link_reference(ctx, lines, n_lines, opener->beg, closer->end, &attr);
3519             }
3520 
3521             if(is_link < 0)
3522                 return -1;
3523 
3524             if(is_link) {
3525                 /* Eat the 2nd "[...]". */
3526                 closer->end = next_closer->end;
3527             }
3528         } else {
3529             if(closer->end < ctx->size  &&  CH(closer->end) == _T('(')) {
3530                 /* Might be inline link. */
3531                 OFF inline_link_end = UINT_MAX;
3532 
3533                 is_link = md_is_inline_link_spec(ctx, lines, n_lines, closer->end, &inline_link_end, &attr);
3534                 if(is_link < 0)
3535                     return -1;
3536 
3537                 /* Check the closing ')' is not inside an already resolved range
3538                  * (i.e. a range with a higher priority), e.g. a code span. */
3539                 if(is_link) {
3540                     int i = closer_index + 1;
3541 
3542                     while(i < ctx->n_marks) {
3543                         MD_MARK* mark = &ctx->marks[i];
3544 
3545                         if(mark->beg >= inline_link_end)
3546                             break;
3547                         if((mark->flags & (MD_MARK_OPENER | MD_MARK_RESOLVED)) == (MD_MARK_OPENER | MD_MARK_RESOLVED)) {
3548                             if(ctx->marks[mark->next].beg >= inline_link_end) {
3549                                 /* Cancel the link status. */
3550                                 if(attr.title_needs_free)
3551                                     free(attr.title);
3552                                 is_link = FALSE;
3553                                 break;
3554                             }
3555 
3556                             i = mark->next + 1;
3557                         } else {
3558                             i++;
3559                         }
3560                     }
3561                 }
3562 
3563                 if(is_link) {
3564                     /* Eat the "(...)" */
3565                     closer->end = inline_link_end;
3566                 }
3567             }
3568 
3569             if(!is_link) {
3570                 /* Might be collapsed reference link. */
3571                 is_link = md_is_link_reference(ctx, lines, n_lines, opener->beg, closer->end, &attr);
3572                 if(is_link < 0)
3573                     return -1;
3574             }
3575         }
3576 
3577         if(is_link) {
3578             /* Resolve the brackets as a link. */
3579             opener->flags |= MD_MARK_OPENER | MD_MARK_RESOLVED;
3580             closer->flags |= MD_MARK_CLOSER | MD_MARK_RESOLVED;
3581 
3582             /* If it is a link, we store the destination and title in the two
3583              * dummy marks after the opener. */
3584             MD_ASSERT(ctx->marks[opener_index+1].ch == 'D');
3585             ctx->marks[opener_index+1].beg = attr.dest_beg;
3586             ctx->marks[opener_index+1].end = attr.dest_end;
3587 
3588             MD_ASSERT(ctx->marks[opener_index+2].ch == 'D');
3589             md_mark_store_ptr(ctx, opener_index+2, attr.title);
3590             /* The title might or might not have been allocated for us. */
3591             if(attr.title_needs_free)
3592                 md_mark_chain_append(ctx, &PTR_CHAIN, opener_index+2);
3593             ctx->marks[opener_index+2].prev = attr.title_size;
3594 
3595             if(opener->ch == '[') {
3596                 last_link_beg = opener->beg;
3597                 last_link_end = closer->end;
3598             } else {
3599                 last_img_beg = opener->beg;
3600                 last_img_end = closer->end;
3601             }
3602 
3603             md_analyze_link_contents(ctx, lines, n_lines, opener_index+1, closer_index);
3604         }
3605 
3606         opener_index = next_index;
3607     }
3608 
3609     return 0;
3610 }
3611 
3612 /* Analyze whether the mark '&' starts a HTML entity.
3613  * If so, update its flags as well as flags of corresponding closer ';'. */
3614 static void
md_analyze_entity(MD_CTX * ctx,int mark_index)3615 md_analyze_entity(MD_CTX* ctx, int mark_index)
3616 {
3617     MD_MARK* opener = &ctx->marks[mark_index];
3618     MD_MARK* closer;
3619     OFF off;
3620 
3621     /* Cannot be entity if there is no closer as the next mark.
3622      * (Any other mark between would mean strange character which cannot be
3623      * part of the entity.
3624      *
3625      * So we can do all the work on '&' and do not call this later for the
3626      * closing mark ';'.
3627      */
3628     if(mark_index + 1 >= ctx->n_marks)
3629         return;
3630     closer = &ctx->marks[mark_index+1];
3631     if(closer->ch != ';')
3632         return;
3633 
3634     if(md_is_entity(ctx, opener->beg, closer->end, &off)) {
3635         MD_ASSERT(off == closer->end);
3636 
3637         md_resolve_range(ctx, NULL, mark_index, mark_index+1);
3638         opener->end = closer->end;
3639     }
3640 }
3641 
3642 static void
md_analyze_table_cell_boundary(MD_CTX * ctx,int mark_index)3643 md_analyze_table_cell_boundary(MD_CTX* ctx, int mark_index)
3644 {
3645     MD_MARK* mark = &ctx->marks[mark_index];
3646     mark->flags |= MD_MARK_RESOLVED;
3647 
3648     md_mark_chain_append(ctx, &TABLECELLBOUNDARIES, mark_index);
3649     ctx->n_table_cell_boundaries++;
3650 }
3651 
3652 /* Split a longer mark into two. The new mark takes the given count of
3653  * characters. May only be called if an adequate number of dummy 'D' marks
3654  * follows.
3655  */
3656 static int
md_split_emph_mark(MD_CTX * ctx,int mark_index,SZ n)3657 md_split_emph_mark(MD_CTX* ctx, int mark_index, SZ n)
3658 {
3659     MD_MARK* mark = &ctx->marks[mark_index];
3660     int new_mark_index = mark_index + (mark->end - mark->beg - n);
3661     MD_MARK* dummy = &ctx->marks[new_mark_index];
3662 
3663     MD_ASSERT(mark->end - mark->beg > n);
3664     MD_ASSERT(dummy->ch == 'D');
3665 
3666     memcpy(dummy, mark, sizeof(MD_MARK));
3667     mark->end -= n;
3668     dummy->beg = mark->end;
3669 
3670     return new_mark_index;
3671 }
3672 
3673 static void
md_analyze_emph(MD_CTX * ctx,int mark_index)3674 md_analyze_emph(MD_CTX* ctx, int mark_index)
3675 {
3676     MD_MARK* mark = &ctx->marks[mark_index];
3677     MD_MARKCHAIN* chain = md_mark_chain(ctx, mark_index);
3678 
3679     /* If we can be a closer, try to resolve with the preceding opener. */
3680     if(mark->flags & MD_MARK_POTENTIAL_CLOSER) {
3681         MD_MARK* opener = NULL;
3682         int opener_index;
3683 
3684         if(mark->ch == _T('*')) {
3685             MD_MARKCHAIN* opener_chains[6];
3686             int i, n_opener_chains;
3687             unsigned flags = mark->flags;
3688 
3689             /* Apply the "rule of three". */
3690             n_opener_chains = 0;
3691             opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_intraword_mod3_0;
3692             if((flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_2)
3693                 opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_intraword_mod3_1;
3694             if((flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_1)
3695                 opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_intraword_mod3_2;
3696             opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_extraword_mod3_0;
3697             if(!(flags & MD_MARK_EMPH_INTRAWORD)  ||  (flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_2)
3698                 opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_extraword_mod3_1;
3699             if(!(flags & MD_MARK_EMPH_INTRAWORD)  ||  (flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_1)
3700                 opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_extraword_mod3_2;
3701 
3702             /* Opener is the most recent mark from the allowed chains. */
3703             for(i = 0; i < n_opener_chains; i++) {
3704                 if(opener_chains[i]->tail >= 0) {
3705                     int tmp_index = opener_chains[i]->tail;
3706                     MD_MARK* tmp_mark = &ctx->marks[tmp_index];
3707                     if(opener == NULL  ||  tmp_mark->end > opener->end) {
3708                         opener_index = tmp_index;
3709                         opener = tmp_mark;
3710                     }
3711                 }
3712             }
3713         } else {
3714             /* Simple emph. mark */
3715             if(chain->tail >= 0) {
3716                 opener_index = chain->tail;
3717                 opener = &ctx->marks[opener_index];
3718             }
3719         }
3720 
3721         /* Resolve, if we have found matching opener. */
3722         if(opener != NULL) {
3723             SZ opener_size = opener->end - opener->beg;
3724             SZ closer_size = mark->end - mark->beg;
3725             MD_MARKCHAIN* opener_chain = md_mark_chain(ctx, opener_index);
3726 
3727             if(opener_size > closer_size) {
3728                 opener_index = md_split_emph_mark(ctx, opener_index, closer_size);
3729                 md_mark_chain_append(ctx, opener_chain, opener_index);
3730             } else if(opener_size < closer_size) {
3731                 md_split_emph_mark(ctx, mark_index, closer_size - opener_size);
3732             }
3733 
3734             md_rollback(ctx, opener_index, mark_index, MD_ROLLBACK_CROSSING);
3735             md_resolve_range(ctx, opener_chain, opener_index, mark_index);
3736             return;
3737         }
3738     }
3739 
3740     /* If we could not resolve as closer, we may be yet be an opener. */
3741     if(mark->flags & MD_MARK_POTENTIAL_OPENER)
3742         md_mark_chain_append(ctx, chain, mark_index);
3743 }
3744 
3745 static void
md_analyze_tilde(MD_CTX * ctx,int mark_index)3746 md_analyze_tilde(MD_CTX* ctx, int mark_index)
3747 {
3748     MD_MARK* mark = &ctx->marks[mark_index];
3749     MD_MARKCHAIN* chain = md_mark_chain(ctx, mark_index);
3750 
3751     /* We attempt to be Github Flavored Markdown compatible here. GFM accepts
3752      * only tildes sequences of length 1 and 2, and the length of the opener
3753      * and closer has to match. */
3754 
3755     if((mark->flags & MD_MARK_POTENTIAL_CLOSER)  &&  chain->head >= 0) {
3756         int opener_index = chain->head;
3757 
3758         md_rollback(ctx, opener_index, mark_index, MD_ROLLBACK_CROSSING);
3759         md_resolve_range(ctx, chain, opener_index, mark_index);
3760         return;
3761     }
3762 
3763     if(mark->flags & MD_MARK_POTENTIAL_OPENER)
3764         md_mark_chain_append(ctx, chain, mark_index);
3765 }
3766 
3767 static void
md_analyze_dollar(MD_CTX * ctx,int mark_index)3768 md_analyze_dollar(MD_CTX* ctx, int mark_index)
3769 {
3770     /* This should mimic the way inline equations work in LaTeX, so there
3771      * can only ever be one item in the chain (i.e. the dollars can't be
3772      * nested). This is basically the same as the md_analyze_tilde function,
3773      * except that we require matching openers and closers to be of the same
3774      * length.
3775      *
3776      * E.g.: $abc$$def$$ => abc (display equation) def (end equation) */
3777     if(DOLLAR_OPENERS.head >= 0) {
3778         /* If the potential closer has a non-matching number of $, discard */
3779         MD_MARK* open = &ctx->marks[DOLLAR_OPENERS.head];
3780         MD_MARK* close = &ctx->marks[mark_index];
3781 
3782         int opener_index = DOLLAR_OPENERS.head;
3783         md_rollback(ctx, opener_index, mark_index, MD_ROLLBACK_ALL);
3784         if (open->end - open->beg == close->end - close->beg) {
3785             /* We are the matching closer */
3786             md_resolve_range(ctx, &DOLLAR_OPENERS, opener_index, mark_index);
3787         } else {
3788             /* We don't match the opener, so discard old opener and insert as opener */
3789             md_mark_chain_append(ctx, &DOLLAR_OPENERS, mark_index);
3790         }
3791     } else {
3792         /* No unmatched openers, so we are opener */
3793         md_mark_chain_append(ctx, &DOLLAR_OPENERS, mark_index);
3794     }
3795 }
3796 
3797 static void
md_analyze_permissive_url_autolink(MD_CTX * ctx,int mark_index)3798 md_analyze_permissive_url_autolink(MD_CTX* ctx, int mark_index)
3799 {
3800     MD_MARK* opener = &ctx->marks[mark_index];
3801     int closer_index = mark_index + 1;
3802     MD_MARK* closer = &ctx->marks[closer_index];
3803     MD_MARK* next_resolved_mark;
3804     OFF off = opener->end;
3805     int n_dots = FALSE;
3806     int has_underscore_in_last_seg = FALSE;
3807     int has_underscore_in_next_to_last_seg = FALSE;
3808     int n_opened_parenthesis = 0;
3809 
3810     /* Check for domain. */
3811     while(off < ctx->size) {
3812         if(ISALNUM(off) || CH(off) == _T('-')) {
3813             off++;
3814         } else if(CH(off) == _T('.')) {
3815             /* We must see at least one period. */
3816             n_dots++;
3817             has_underscore_in_next_to_last_seg = has_underscore_in_last_seg;
3818             has_underscore_in_last_seg = FALSE;
3819             off++;
3820         } else if(CH(off) == _T('_')) {
3821             /* No underscore may be present in the last two domain segments. */
3822             has_underscore_in_last_seg = TRUE;
3823             off++;
3824         } else {
3825             break;
3826         }
3827     }
3828     if(off > opener->end  &&  CH(off-1) == _T('.')) {
3829         off--;
3830         n_dots--;
3831     }
3832     if(off <= opener->end || n_dots == 0 || has_underscore_in_next_to_last_seg || has_underscore_in_last_seg)
3833         return;
3834 
3835     /* Check for path. */
3836     next_resolved_mark = closer + 1;
3837     while(next_resolved_mark->ch == 'D' || !(next_resolved_mark->flags & MD_MARK_RESOLVED))
3838         next_resolved_mark++;
3839     while(off < next_resolved_mark->beg  &&  CH(off) != _T('<')  &&  !ISWHITESPACE(off)  &&  !ISNEWLINE(off)) {
3840         /* Parenthesis must be balanced. */
3841         if(CH(off) == _T('(')) {
3842             n_opened_parenthesis++;
3843         } else if(CH(off) == _T(')')) {
3844             if(n_opened_parenthesis > 0)
3845                 n_opened_parenthesis--;
3846             else
3847                 break;
3848         }
3849 
3850         off++;
3851     }
3852     /* These cannot be last char In such case they are more likely normal
3853      * punctuation. */
3854     if(ISANYOF(off-1, _T("?!.,:*_~")))
3855         off--;
3856 
3857     /* Ok. Lets call it auto-link. Adapt opener and create closer to zero
3858      * length so all the contents becomes the link text. */
3859     MD_ASSERT(closer->ch == 'D');
3860     opener->end = opener->beg;
3861     closer->ch = opener->ch;
3862     closer->beg = off;
3863     closer->end = off;
3864     md_resolve_range(ctx, NULL, mark_index, closer_index);
3865 }
3866 
3867 /* The permissive autolinks do not have to be enclosed in '<' '>' but we
3868  * instead impose stricter rules what is understood as an e-mail address
3869  * here. Actually any non-alphanumeric characters with exception of '.'
3870  * are prohibited both in username and after '@'. */
3871 static void
md_analyze_permissive_email_autolink(MD_CTX * ctx,int mark_index)3872 md_analyze_permissive_email_autolink(MD_CTX* ctx, int mark_index)
3873 {
3874     MD_MARK* opener = &ctx->marks[mark_index];
3875     int closer_index;
3876     MD_MARK* closer;
3877     OFF beg = opener->beg;
3878     OFF end = opener->end;
3879     int dot_count = 0;
3880 
3881     MD_ASSERT(CH(beg) == _T('@'));
3882 
3883     /* Scan for name before '@'. */
3884     while(beg > 0  &&  (ISALNUM(beg-1) || ISANYOF(beg-1, _T(".-_+"))))
3885         beg--;
3886 
3887     /* Scan for domain after '@'. */
3888     while(end < ctx->size  &&  (ISALNUM(end) || ISANYOF(end, _T(".-_")))) {
3889         if(CH(end) == _T('.'))
3890             dot_count++;
3891         end++;
3892     }
3893     if(CH(end-1) == _T('.')) {  /* Final '.' not part of it. */
3894         dot_count--;
3895         end--;
3896     }
3897     else if(ISANYOF2(end-1, _T('-'), _T('_'))) /* These are forbidden at the end. */
3898         return;
3899     if(CH(end-1) == _T('@')  ||  dot_count == 0)
3900         return;
3901 
3902     /* Ok. Lets call it auto-link. Adapt opener and create closer to zero
3903      * length so all the contents becomes the link text. */
3904     closer_index = mark_index + 1;
3905     closer = &ctx->marks[closer_index];
3906     MD_ASSERT(closer->ch == 'D');
3907 
3908     opener->beg = beg;
3909     opener->end = beg;
3910     closer->ch = opener->ch;
3911     closer->beg = end;
3912     closer->end = end;
3913     md_resolve_range(ctx, NULL, mark_index, closer_index);
3914 }
3915 
3916 static inline void
md_analyze_marks(MD_CTX * ctx,const MD_LINE * lines,int n_lines,int mark_beg,int mark_end,const CHAR * mark_chars)3917 md_analyze_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
3918                  int mark_beg, int mark_end, const CHAR* mark_chars)
3919 {
3920     int i = mark_beg;
3921 
3922     while(i < mark_end) {
3923         MD_MARK* mark = &ctx->marks[i];
3924 
3925         /* Skip resolved spans. */
3926         if(mark->flags & MD_MARK_RESOLVED) {
3927             if(mark->flags & MD_MARK_OPENER) {
3928                 MD_ASSERT(i < mark->next);
3929                 i = mark->next + 1;
3930             } else {
3931                 i++;
3932             }
3933             continue;
3934         }
3935 
3936         /* Skip marks we do not want to deal with. */
3937         if(!ISANYOF_(mark->ch, mark_chars)) {
3938             i++;
3939             continue;
3940         }
3941 
3942         /* Analyze the mark. */
3943         switch(mark->ch) {
3944             case '[':   /* Pass through. */
3945             case '!':   /* Pass through. */
3946             case ']':   md_analyze_bracket(ctx, i); break;
3947             case '&':   md_analyze_entity(ctx, i); break;
3948             case '|':   md_analyze_table_cell_boundary(ctx, i); break;
3949             case '_':   /* Pass through. */
3950             case '*':   md_analyze_emph(ctx, i); break;
3951             case '~':   md_analyze_tilde(ctx, i); break;
3952             case '$':   md_analyze_dollar(ctx, i); break;
3953             case '.':   /* Pass through. */
3954             case ':':   md_analyze_permissive_url_autolink(ctx, i); break;
3955             case '@':   md_analyze_permissive_email_autolink(ctx, i); break;
3956         }
3957 
3958         i++;
3959     }
3960 }
3961 
3962 /* Analyze marks (build ctx->marks). */
3963 static int
md_analyze_inlines(MD_CTX * ctx,const MD_LINE * lines,int n_lines,int table_mode)3964 md_analyze_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mode)
3965 {
3966     int ret;
3967 
3968     /* Reset the previously collected stack of marks. */
3969     ctx->n_marks = 0;
3970 
3971     /* Collect all marks. */
3972     MD_CHECK(md_collect_marks(ctx, lines, n_lines, table_mode));
3973 
3974     /* We analyze marks in few groups to handle their precedence. */
3975     /* (1) Entities; code spans; autolinks; raw HTML. */
3976     md_analyze_marks(ctx, lines, n_lines, 0, ctx->n_marks, _T("&"));
3977 
3978     /* (2) Links. */
3979     md_analyze_marks(ctx, lines, n_lines, 0, ctx->n_marks, _T("[]!"));
3980     MD_CHECK(md_resolve_links(ctx, lines, n_lines));
3981     BRACKET_OPENERS.head = -1;
3982     BRACKET_OPENERS.tail = -1;
3983     ctx->unresolved_link_head = -1;
3984     ctx->unresolved_link_tail = -1;
3985 
3986     if(table_mode) {
3987         /* (3) Analyze table cell boundaries.
3988          * Note we reset TABLECELLBOUNDARIES chain prior to the call md_analyze_marks(),
3989          * not after, because caller may need it. */
3990         MD_ASSERT(n_lines == 1);
3991         TABLECELLBOUNDARIES.head = -1;
3992         TABLECELLBOUNDARIES.tail = -1;
3993         ctx->n_table_cell_boundaries = 0;
3994         md_analyze_marks(ctx, lines, n_lines, 0, ctx->n_marks, _T("|"));
3995         return ret;
3996     }
3997 
3998     /* (4) Emphasis and strong emphasis; permissive autolinks. */
3999     md_analyze_link_contents(ctx, lines, n_lines, 0, ctx->n_marks);
4000 
4001 abort:
4002     return ret;
4003 }
4004 
4005 static void
md_analyze_link_contents(MD_CTX * ctx,const MD_LINE * lines,int n_lines,int mark_beg,int mark_end)4006 md_analyze_link_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
4007                          int mark_beg, int mark_end)
4008 {
4009     int i;
4010 
4011     md_analyze_marks(ctx, lines, n_lines, mark_beg, mark_end, _T("*_~$@:."));
4012 
4013     for(i = OPENERS_CHAIN_FIRST; i <= OPENERS_CHAIN_LAST; i++) {
4014         ctx->mark_chains[i].head = -1;
4015         ctx->mark_chains[i].tail = -1;
4016     }
4017 }
4018 
4019 static int
md_enter_leave_span_a(MD_CTX * ctx,int enter,MD_SPANTYPE type,const CHAR * dest,SZ dest_size,int prohibit_escapes_in_dest,const CHAR * title,SZ title_size)4020 md_enter_leave_span_a(MD_CTX* ctx, int enter, MD_SPANTYPE type,
4021                       const CHAR* dest, SZ dest_size, int prohibit_escapes_in_dest,
4022                       const CHAR* title, SZ title_size)
4023 {
4024     MD_ATTRIBUTE_BUILD href_build = { 0 };
4025     MD_ATTRIBUTE_BUILD title_build = { 0 };
4026     MD_SPAN_A_DETAIL det;
4027     int ret = 0;
4028 
4029     /* Note we here rely on fact that MD_SPAN_A_DETAIL and
4030      * MD_SPAN_IMG_DETAIL are binary-compatible. */
4031     memset(&det, 0, sizeof(MD_SPAN_A_DETAIL));
4032     MD_CHECK(md_build_attribute(ctx, dest, dest_size,
4033                     (prohibit_escapes_in_dest ? MD_BUILD_ATTR_NO_ESCAPES : 0),
4034                     &det.href, &href_build));
4035     MD_CHECK(md_build_attribute(ctx, title, title_size, 0, &det.title, &title_build));
4036 
4037     if(enter)
4038         MD_ENTER_SPAN(type, &det);
4039     else
4040         MD_LEAVE_SPAN(type, &det);
4041 
4042 abort:
4043     md_free_attribute(ctx, &href_build);
4044     md_free_attribute(ctx, &title_build);
4045     return ret;
4046 }
4047 
4048 static int
md_enter_leave_span_wikilink(MD_CTX * ctx,int enter,const CHAR * target,SZ target_size)4049 md_enter_leave_span_wikilink(MD_CTX* ctx, int enter, const CHAR* target, SZ target_size)
4050 {
4051     MD_ATTRIBUTE_BUILD target_build = { 0 };
4052     MD_SPAN_WIKILINK_DETAIL det;
4053     int ret = 0;
4054 
4055     memset(&det, 0, sizeof(MD_SPAN_WIKILINK_DETAIL));
4056     MD_CHECK(md_build_attribute(ctx, target, target_size, 0, &det.target, &target_build));
4057 
4058     if (enter)
4059         MD_ENTER_SPAN(MD_SPAN_WIKILINK, &det);
4060     else
4061         MD_LEAVE_SPAN(MD_SPAN_WIKILINK, &det);
4062 
4063 abort:
4064     md_free_attribute(ctx, &target_build);
4065     return ret;
4066 }
4067 
4068 
/* Render the output, accordingly to the analyzed ctx->marks.
 *
 * The function walks the text of the given lines from lines[0].beg to
 * lines[n_lines-1].end. Text between resolved marks is emitted through
 * MD_TEXT() with the current text type; each resolved mark is translated
 * into span enter/leave events or into a special piece of text; and each
 * line boundary is translated into a soft/hard break, a space (inside code
 * and math spans) or a verbatim new line (inside raw HTML).
 *
 * Returns ret, which starts as zero.
 * NOTE(review): MD_TEXT(), MD_ENTER_SPAN(), MD_LEAVE_SPAN(), MD_CHECK() and
 * MD_TEMP_BUFFER() are defined elsewhere; presumably they store a failure
 * into ret and jump to the abort label -- confirm. */
static int
md_process_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
{
    MD_TEXTTYPE text_type;
    const MD_LINE* line = lines;
    MD_MARK* prev_mark = NULL;
    MD_MARK* mark;
    OFF off = lines[0].beg;
    OFF end = lines[n_lines-1].end;
    int enforce_hardbreak = 0;
    int ret = 0;

    /* Find first resolved mark. Note there is always at least one resolved
     * mark,  the dummy last one after the end of the latest line we actually
     * never really reach. This saves us of a lot of special checks and cases
     * in this function. */
    mark = ctx->marks;
    while(!(mark->flags & MD_MARK_RESOLVED))
        mark++;

    text_type = MD_TEXT_NORMAL;

    while(1) {
        /* Process the text up to the next mark or end-of-line. */
        OFF tmp = (line->end < mark->beg ? line->end : mark->beg);
        if(tmp > off) {
            MD_TEXT(text_type, STR(off), tmp - off);
            off = tmp;
        }

        /* If reached the mark, process it and move to next one. */
        if(off >= mark->beg) {
            switch(mark->ch) {
                case '\\':      /* Backslash escape. */
                    /* A backslash before a new line requests a hard break;
                     * otherwise the escaped character is output as text. */
                    if(ISNEWLINE(mark->beg+1))
                        enforce_hardbreak = 1;
                    else
                        MD_TEXT(text_type, STR(mark->beg+1), 1);
                    break;

                case ' ':       /* Non-trivial space. */
                    MD_TEXT(text_type, _T(" "), 1);
                    break;

                case '`':       /* Code span. */
                    if(mark->flags & MD_MARK_OPENER) {
                        MD_ENTER_SPAN(MD_SPAN_CODE, NULL);
                        text_type = MD_TEXT_CODE;
                    } else {
                        MD_LEAVE_SPAN(MD_SPAN_CODE, NULL);
                        text_type = MD_TEXT_NORMAL;
                    }
                    break;

                case '_':       /* Underline (or emphasis if we fall through). */
                    if(ctx->parser.flags & MD_FLAG_UNDERLINE) {
                        /* One MD_SPAN_U event per underscore of the mark. */
                        if(mark->flags & MD_MARK_OPENER) {
                            while(off < mark->end) {
                                MD_ENTER_SPAN(MD_SPAN_U, NULL);
                                off++;
                            }
                        } else {
                            while(off < mark->end) {
                                MD_LEAVE_SPAN(MD_SPAN_U, NULL);
                                off++;
                            }
                        }
                        break;
                    }
                    /* Fall through. */

                case '*':       /* Emphasis, strong emphasis. */
                    /* An odd count of remaining characters yields one
                     * MD_SPAN_EM; every pair yields one MD_SPAN_STRONG. The
                     * opener emits the EM first and the closer emits it
                     * last, so the two sequences mirror each other. */
                    if(mark->flags & MD_MARK_OPENER) {
                        if((mark->end - off) % 2) {
                            MD_ENTER_SPAN(MD_SPAN_EM, NULL);
                            off++;
                        }
                        while(off + 1 < mark->end) {
                            MD_ENTER_SPAN(MD_SPAN_STRONG, NULL);
                            off += 2;
                        }
                    } else {
                        while(off + 1 < mark->end) {
                            MD_LEAVE_SPAN(MD_SPAN_STRONG, NULL);
                            off += 2;
                        }
                        if((mark->end - off) % 2) {
                            MD_LEAVE_SPAN(MD_SPAN_EM, NULL);
                            off++;
                        }
                    }
                    break;

                case '~':       /* Strikethrough. */
                    if(mark->flags & MD_MARK_OPENER)
                        MD_ENTER_SPAN(MD_SPAN_DEL, NULL);
                    else
                        MD_LEAVE_SPAN(MD_SPAN_DEL, NULL);
                    break;

                case '$':       /* LaTeX math. */
                    /* Odd count of characters up to the mark end means an
                     * inline equation, even count a display equation. */
                    if(mark->flags & MD_MARK_OPENER) {
                        MD_ENTER_SPAN((mark->end - off) % 2 ? MD_SPAN_LATEXMATH : MD_SPAN_LATEXMATH_DISPLAY, NULL);
                        text_type = MD_TEXT_LATEXMATH;
                    } else {
                        MD_LEAVE_SPAN((mark->end - off) % 2 ? MD_SPAN_LATEXMATH : MD_SPAN_LATEXMATH_DISPLAY, NULL);
                        text_type = MD_TEXT_NORMAL;
                    }
                    break;

                case '[':       /* Link, wiki link, image. */
                case '!':
                case ']':
                {
                    const MD_MARK* opener = (mark->ch != ']' ? mark : &ctx->marks[mark->prev]);
                    const MD_MARK* closer = &ctx->marks[opener->next];
                    const MD_MARK* dest_mark;
                    const MD_MARK* title_mark;

                    /* Wiki link: both the '[' opener and the ']' closer are
                     * at least two characters wide. */
                    if ((opener->ch == '[' && closer->ch == ']') &&
                        opener->end - opener->beg >= 2 &&
                        closer->end - closer->beg >= 2)
                    {
                        /* If the opener mark is wider than two characters,
                         * the target is the part of the opener mark behind
                         * its first two characters; otherwise the target is
                         * the text between the opener and the closer. */
                        int has_label = (opener->end - opener->beg > 2);
                        SZ target_sz;

                        if(has_label)
                            target_sz = opener->end - (opener->beg+2);
                        else
                            target_sz = closer->beg - opener->end;

                        MD_CHECK(md_enter_leave_span_wikilink(ctx, (mark->ch != ']'),
                                 has_label ? STR(opener->beg+2) : STR(opener->end),
                                 target_sz));

                        break;
                    }

                    /* Ordinary link or image: the two dummy marks right
                     * after the opener carry the destination (as an offset
                     * range) and the title (as a pointer stored in the
                     * mark, with its size kept in the prev member). */
                    dest_mark = opener+1;
                    MD_ASSERT(dest_mark->ch == 'D');
                    title_mark = opener+2;
                    MD_ASSERT(title_mark->ch == 'D');

                    MD_CHECK(md_enter_leave_span_a(ctx, (mark->ch != ']'),
                                (opener->ch == '!' ? MD_SPAN_IMG : MD_SPAN_A),
                                STR(dest_mark->beg), dest_mark->end - dest_mark->beg, FALSE,
                                md_mark_get_ptr(ctx, title_mark - ctx->marks), title_mark->prev));

                    /* link/image closer may span multiple lines. */
                    if(mark->ch == ']') {
                        while(mark->end > line->end)
                            line++;
                    }

                    break;
                }

                case '<':
                case '>':       /* Autolink or raw HTML. */
                    if(!(mark->flags & MD_MARK_AUTOLINK)) {
                        /* Raw HTML. */
                        if(mark->flags & MD_MARK_OPENER)
                            text_type = MD_TEXT_HTML;
                        else
                            text_type = MD_TEXT_NORMAL;
                        break;
                    }
                    /* Pass through, if auto-link. */

                case '@':       /* Permissive e-mail autolink. */
                case ':':       /* Permissive URL autolink. */
                case '.':       /* Permissive WWW autolink. */
                {
                    MD_MARK* opener = ((mark->flags & MD_MARK_OPENER) ? mark : &ctx->marks[mark->prev]);
                    MD_MARK* closer = &ctx->marks[opener->next];
                    const CHAR* dest = STR(opener->end);
                    SZ dest_size = closer->beg - opener->end;

                    /* For permissive auto-links we do not know closer mark
                     * position at the time of md_collect_marks(), therefore
                     * it can be out-of-order in ctx->marks[].
                     *
                     * With this flag, we make sure that we output the closer
                     * only if we processed the opener. */
                    if(mark->flags & MD_MARK_OPENER)
                        closer->flags |= MD_MARK_VALIDPERMISSIVEAUTOLINK;

                    /* E-mail and WWW autolinks carry no scheme in the text,
                     * so a 7-character one ("mailto:" or "http://") is
                     * prepended in the temporary buffer. */
                    if(opener->ch == '@' || opener->ch == '.') {
                        dest_size += 7;
                        MD_TEMP_BUFFER(dest_size * sizeof(CHAR));
                        memcpy(ctx->buffer,
                                (opener->ch == '@' ? _T("mailto:") : _T("http://")),
                                7 * sizeof(CHAR));
                        memcpy(ctx->buffer + 7, dest, (dest_size-7) * sizeof(CHAR));
                        dest = ctx->buffer;
                    }

                    if(closer->flags & MD_MARK_VALIDPERMISSIVEAUTOLINK)
                        MD_CHECK(md_enter_leave_span_a(ctx, (mark->flags & MD_MARK_OPENER),
                                    MD_SPAN_A, dest, dest_size, TRUE, NULL, 0));
                    break;
                }

                case '&':       /* Entity. */
                    MD_TEXT(MD_TEXT_ENTITY, STR(mark->beg), mark->end - mark->beg);
                    break;

                case '\0':      /* Null character in the input. */
                    MD_TEXT(MD_TEXT_NULLCHAR, _T(""), 1);
                    break;

                case 127:
                    /* NOTE(review): presumably the character of the dummy
                     * terminating mark -- confirm where it is created. */
                    goto abort;
            }

            off = mark->end;

            /* Move to next resolved mark. (Marks starting before the
             * current offset are skipped as well.) */
            prev_mark = mark;
            mark++;
            while(!(mark->flags & MD_MARK_RESOLVED)  ||  mark->beg < off)
                mark++;
        }

        /* If reached end of line, move to next one. */
        if(off >= line->end) {
            /* If it is the last line, we are done. */
            if(off >= end)
                break;

            if(text_type == MD_TEXT_CODE || text_type == MD_TEXT_LATEXMATH) {
                OFF tmp;

                MD_ASSERT(prev_mark != NULL);
                MD_ASSERT(ISANYOF2_(prev_mark->ch, '`', '$')  &&  (prev_mark->flags & MD_MARK_OPENER));
                MD_ASSERT(ISANYOF2_(mark->ch, '`', '$')  &&  (mark->flags & MD_MARK_CLOSER));

                /* Inside a code span, trailing line whitespace has to be
                 * outputted. */
                tmp = off;
                while(off < ctx->size  &&  ISBLANK(off))
                    off++;
                if(off > tmp)
                    MD_TEXT(text_type, STR(tmp), off-tmp);

                /* and new lines are transformed into single spaces. */
                if(prev_mark->end < off  &&  off < mark->beg)
                    MD_TEXT(text_type, _T(" "), 1);
            } else if(text_type == MD_TEXT_HTML) {
                /* Inside raw HTML, we output the new line verbatim, including
                 * any trailing spaces. */
                OFF tmp = off;

                while(tmp < end  &&  ISBLANK(tmp))
                    tmp++;
                if(tmp > off)
                    MD_TEXT(MD_TEXT_HTML, STR(off), tmp - off);
                MD_TEXT(MD_TEXT_HTML, _T("\n"), 1);
            } else {
                /* Output soft or hard line break. */
                MD_TEXTTYPE break_type = MD_TEXT_SOFTBR;

                /* A hard break is requested either by a backslash before
                 * the new line or by two spaces right after the line end. */
                if(text_type == MD_TEXT_NORMAL) {
                    if(enforce_hardbreak)
                        break_type = MD_TEXT_BR;
                    else if((CH(line->end) == _T(' ') && CH(line->end+1) == _T(' ')))
                        break_type = MD_TEXT_BR;
                }

                MD_TEXT(break_type, _T("\n"), 1);
            }

            /* Move to the next line. */
            line++;
            off = line->beg;

            enforce_hardbreak = 0;
        }
    }

abort:
    return ret;
}
4353 
4354 
4355 /***************************
4356  ***  Processing Tables  ***
4357  ***************************/
4358 
4359 static void
md_analyze_table_alignment(MD_CTX * ctx,OFF beg,OFF end,MD_ALIGN * align,int n_align)4360 md_analyze_table_alignment(MD_CTX* ctx, OFF beg, OFF end, MD_ALIGN* align, int n_align)
4361 {
4362     static const MD_ALIGN align_map[] = { MD_ALIGN_DEFAULT, MD_ALIGN_LEFT, MD_ALIGN_RIGHT, MD_ALIGN_CENTER };
4363     OFF off = beg;
4364 
4365     while(n_align > 0) {
4366         int index = 0;  /* index into align_map[] */
4367 
4368         while(CH(off) != _T('-'))
4369             off++;
4370         if(off > beg  &&  CH(off-1) == _T(':'))
4371             index |= 1;
4372         while(off < end  &&  CH(off) == _T('-'))
4373             off++;
4374         if(off < end  &&  CH(off) == _T(':'))
4375             index |= 2;
4376 
4377         *align = align_map[index];
4378         align++;
4379         n_align--;
4380     }
4381 
4382 }
4383 
4384 /* Forward declaration. */
4385 static int md_process_normal_block_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines);
4386 
4387 static int
md_process_table_cell(MD_CTX * ctx,MD_BLOCKTYPE cell_type,MD_ALIGN align,OFF beg,OFF end)4388 md_process_table_cell(MD_CTX* ctx, MD_BLOCKTYPE cell_type, MD_ALIGN align, OFF beg, OFF end)
4389 {
4390     MD_LINE line;
4391     MD_BLOCK_TD_DETAIL det;
4392     int ret = 0;
4393 
4394     while(beg < end  &&  ISWHITESPACE(beg))
4395         beg++;
4396     while(end > beg  &&  ISWHITESPACE(end-1))
4397         end--;
4398 
4399     det.align = align;
4400     line.beg = beg;
4401     line.end = end;
4402 
4403     MD_ENTER_BLOCK(cell_type, &det);
4404     MD_CHECK(md_process_normal_block_contents(ctx, &line, 1));
4405     MD_LEAVE_BLOCK(cell_type, &det);
4406 
4407 abort:
4408     return ret;
4409 }
4410 
4411 static int
md_process_table_row(MD_CTX * ctx,MD_BLOCKTYPE cell_type,OFF beg,OFF end,const MD_ALIGN * align,int col_count)4412 md_process_table_row(MD_CTX* ctx, MD_BLOCKTYPE cell_type, OFF beg, OFF end,
4413                      const MD_ALIGN* align, int col_count)
4414 {
4415     MD_LINE line;
4416     OFF* pipe_offs = NULL;
4417     int i, j, k, n;
4418     int ret = 0;
4419 
4420     line.beg = beg;
4421     line.end = end;
4422 
4423     /* Break the line into table cells by identifying pipe characters who
4424      * form the cell boundary. */
4425     MD_CHECK(md_analyze_inlines(ctx, &line, 1, TRUE));
4426 
4427     /* We have to remember the cell boundaries in local buffer because
4428      * ctx->marks[] shall be reused during cell contents processing. */
4429     n = ctx->n_table_cell_boundaries + 2;
4430     pipe_offs = (OFF*) malloc(n * sizeof(OFF));
4431     if(pipe_offs == NULL) {
4432         MD_LOG("malloc() failed.");
4433         ret = -1;
4434         goto abort;
4435     }
4436     j = 0;
4437     pipe_offs[j++] = beg;
4438     for(i = TABLECELLBOUNDARIES.head; i >= 0; i = ctx->marks[i].next) {
4439         MD_MARK* mark = &ctx->marks[i];
4440         pipe_offs[j++] = mark->end;
4441     }
4442     pipe_offs[j++] = end+1;
4443 
4444     /* Process cells. */
4445     MD_ENTER_BLOCK(MD_BLOCK_TR, NULL);
4446     k = 0;
4447     for(i = 0; i < j-1  &&  k < col_count; i++) {
4448         if(pipe_offs[i] < pipe_offs[i+1]-1)
4449             MD_CHECK(md_process_table_cell(ctx, cell_type, align[k++], pipe_offs[i], pipe_offs[i+1]-1));
4450     }
4451     /* Make sure we call enough table cells even if the current table contains
4452      * too few of them. */
4453     while(k < col_count)
4454         MD_CHECK(md_process_table_cell(ctx, cell_type, align[k++], 0, 0));
4455     MD_LEAVE_BLOCK(MD_BLOCK_TR, NULL);
4456 
4457 abort:
4458     free(pipe_offs);
4459 
4460     /* Free any temporary memory blocks stored within some dummy marks. */
4461     for(i = PTR_CHAIN.head; i >= 0; i = ctx->marks[i].next)
4462         free(md_mark_get_ptr(ctx, i));
4463     PTR_CHAIN.head = -1;
4464     PTR_CHAIN.tail = -1;
4465 
4466     return ret;
4467 }
4468 
4469 static int
md_process_table_block_contents(MD_CTX * ctx,int col_count,const MD_LINE * lines,int n_lines)4470 md_process_table_block_contents(MD_CTX* ctx, int col_count, const MD_LINE* lines, int n_lines)
4471 {
4472     MD_ALIGN* align;
4473     int i;
4474     int ret = 0;
4475 
4476     /* At least two lines have to be present: The column headers and the line
4477      * with the underlines. */
4478     MD_ASSERT(n_lines >= 2);
4479 
4480     align = malloc(col_count * sizeof(MD_ALIGN));
4481     if(align == NULL) {
4482         MD_LOG("malloc() failed.");
4483         ret = -1;
4484         goto abort;
4485     }
4486 
4487     md_analyze_table_alignment(ctx, lines[1].beg, lines[1].end, align, col_count);
4488 
4489     MD_ENTER_BLOCK(MD_BLOCK_THEAD, NULL);
4490     MD_CHECK(md_process_table_row(ctx, MD_BLOCK_TH,
4491                         lines[0].beg, lines[0].end, align, col_count));
4492     MD_LEAVE_BLOCK(MD_BLOCK_THEAD, NULL);
4493 
4494     MD_ENTER_BLOCK(MD_BLOCK_TBODY, NULL);
4495     for(i = 2; i < n_lines; i++) {
4496         MD_CHECK(md_process_table_row(ctx, MD_BLOCK_TD,
4497                         lines[i].beg, lines[i].end, align, col_count));
4498     }
4499     MD_LEAVE_BLOCK(MD_BLOCK_TBODY, NULL);
4500 
4501 abort:
4502     free(align);
4503     return ret;
4504 }
4505 
4506 
4507 /**************************
4508  ***  Processing Block  ***
4509  **************************/
4510 
/* Block flag bits.
 * NOTE(review): presumably these are stored in the 8-bit flags member of
 * struct MD_BLOCK_tag below -- confirm against the code using them. */
#define MD_BLOCK_CONTAINER_OPENER   0x01
#define MD_BLOCK_CONTAINER_CLOSER   0x02
/* Mask covering both container flags above. */
#define MD_BLOCK_CONTAINER          (MD_BLOCK_CONTAINER_OPENER | MD_BLOCK_CONTAINER_CLOSER)
#define MD_BLOCK_LOOSE_LIST         0x04
#define MD_BLOCK_SETEXT_HEADER      0x08
4516 
/* Compact descriptor of a block. The meaning of the members data and
 * n_lines depends on the block type, as listed at each of them. */
struct MD_BLOCK_tag {
    MD_BLOCKTYPE type  :  8;
    /* NOTE(review): presumably a combination of the MD_BLOCK_xxx flag
     * macros defined above -- confirm. */
    unsigned flags     :  8;

    /* MD_BLOCK_H:      Header level (1 - 6)
     * MD_BLOCK_CODE:   Non-zero if fenced, zero if indented.
     * MD_BLOCK_LI:     Task mark character (0 if not task list item, 'x', 'X' or ' ').
     * MD_BLOCK_TABLE:  Column count (as determined by the table underline).
     */
    unsigned data      : 16;

    /* Leaf blocks:     Count of lines (MD_LINE or MD_VERBATIMLINE) on the block.
     * MD_BLOCK_LI:     Task mark offset in the input doc.
     * MD_BLOCK_OL:     Start item number.
     */
    unsigned n_lines;
};
4534 
/* Descriptor of a container.
 * NOTE(review): the code using this structure is not visible here; the
 * member notes below are inferred from the names only and have to be
 * confirmed against that code. */
struct MD_CONTAINER_tag {
    CHAR ch;                    /* Presumably the character of the container mark. */
    unsigned is_loose    : 8;   /* Presumably non-zero for a loose list. */
    unsigned is_task     : 8;   /* Presumably non-zero for a task list item. */
    unsigned start;             /* Presumably the start number of an ordered list. */
    unsigned mark_indent;       /* Presumably the indentation of the container mark. */
    unsigned contents_indent;   /* Presumably the indentation of the container contents. */
    OFF block_byte_off;
    OFF task_mark_off;          /* Presumably the offset of the task mark in the input. */
};
4545 
4546 
4547 static int
md_process_normal_block_contents(MD_CTX * ctx,const MD_LINE * lines,int n_lines)4548 md_process_normal_block_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
4549 {
4550     int i;
4551     int ret;
4552 
4553     MD_CHECK(md_analyze_inlines(ctx, lines, n_lines, FALSE));
4554     MD_CHECK(md_process_inlines(ctx, lines, n_lines));
4555 
4556 abort:
4557     /* Free any temporary memory blocks stored within some dummy marks. */
4558     for(i = PTR_CHAIN.head; i >= 0; i = ctx->marks[i].next)
4559         free(md_mark_get_ptr(ctx, i));
4560     PTR_CHAIN.head = -1;
4561     PTR_CHAIN.tail = -1;
4562 
4563     return ret;
4564 }
4565 
4566 static int
md_process_verbatim_block_contents(MD_CTX * ctx,MD_TEXTTYPE text_type,const MD_VERBATIMLINE * lines,int n_lines)4567 md_process_verbatim_block_contents(MD_CTX* ctx, MD_TEXTTYPE text_type, const MD_VERBATIMLINE* lines, int n_lines)
4568 {
4569     static const CHAR indent_chunk_str[] = _T("                ");
4570     static const SZ indent_chunk_size = SIZEOF_ARRAY(indent_chunk_str) - 1;
4571 
4572     int i;
4573     int ret = 0;
4574 
4575     for(i = 0; i < n_lines; i++) {
4576         const MD_VERBATIMLINE* line = &lines[i];
4577         int indent = line->indent;
4578 
4579         MD_ASSERT(indent >= 0);
4580 
4581         /* Output code indentation. */
4582         while(indent > (int) indent_chunk_size) {
4583             MD_TEXT(text_type, indent_chunk_str, indent_chunk_size);
4584             indent -= indent_chunk_size;
4585         }
4586         if(indent > 0)
4587             MD_TEXT(text_type, indent_chunk_str, indent);
4588 
4589         /* Output the code line itself. */
4590         MD_TEXT_INSECURE(text_type, STR(line->beg), line->end - line->beg);
4591 
4592         /* Enforce end-of-line. */
4593         MD_TEXT(text_type, _T("\n"), 1);
4594     }
4595 
4596 abort:
4597     return ret;
4598 }
4599 
4600 static int
md_process_code_block_contents(MD_CTX * ctx,int is_fenced,const MD_VERBATIMLINE * lines,int n_lines)4601 md_process_code_block_contents(MD_CTX* ctx, int is_fenced, const MD_VERBATIMLINE* lines, int n_lines)
4602 {
4603     if(is_fenced) {
4604         /* Skip the first line in case of fenced code: It is the fence.
4605          * (Only the starting fence is present due to logic in md_analyze_line().) */
4606         lines++;
4607         n_lines--;
4608     } else {
4609         /* Ignore blank lines at start/end of indented code block. */
4610         while(n_lines > 0  &&  lines[0].beg == lines[0].end) {
4611             lines++;
4612             n_lines--;
4613         }
4614         while(n_lines > 0  &&  lines[n_lines-1].beg == lines[n_lines-1].end) {
4615             n_lines--;
4616         }
4617     }
4618 
4619     if(n_lines == 0)
4620         return 0;
4621 
4622     return md_process_verbatim_block_contents(ctx, MD_TEXT_CODE, lines, n_lines);
4623 }
4624 
4625 static int
md_setup_fenced_code_detail(MD_CTX * ctx,const MD_BLOCK * block,MD_BLOCK_CODE_DETAIL * det,MD_ATTRIBUTE_BUILD * info_build,MD_ATTRIBUTE_BUILD * lang_build)4626 md_setup_fenced_code_detail(MD_CTX* ctx, const MD_BLOCK* block, MD_BLOCK_CODE_DETAIL* det,
4627                             MD_ATTRIBUTE_BUILD* info_build, MD_ATTRIBUTE_BUILD* lang_build)
4628 {
4629     const MD_VERBATIMLINE* fence_line = (const MD_VERBATIMLINE*)(block + 1);
4630     OFF beg = fence_line->beg;
4631     OFF end = fence_line->end;
4632     OFF lang_end;
4633     CHAR fence_ch = CH(fence_line->beg);
4634     int ret = 0;
4635 
4636     /* Skip the fence itself. */
4637     while(beg < ctx->size  &&  CH(beg) == fence_ch)
4638         beg++;
4639     /* Trim initial spaces. */
4640     while(beg < ctx->size  &&  CH(beg) == _T(' '))
4641         beg++;
4642 
4643     /* Trim trailing spaces. */
4644     while(end > beg  &&  CH(end-1) == _T(' '))
4645         end--;
4646 
4647     /* Build info string attribute. */
4648     MD_CHECK(md_build_attribute(ctx, STR(beg), end - beg, 0, &det->info, info_build));
4649 
4650     /* Build info string attribute. */
4651     lang_end = beg;
4652     while(lang_end < end  &&  !ISWHITESPACE(lang_end))
4653         lang_end++;
4654     MD_CHECK(md_build_attribute(ctx, STR(beg), lang_end - beg, 0, &det->lang, lang_build));
4655 
4656     det->fence_char = fence_ch;
4657 
4658 abort:
4659     return ret;
4660 }
4661 
4662 static int
md_process_leaf_block(MD_CTX * ctx,const MD_BLOCK * block)4663 md_process_leaf_block(MD_CTX* ctx, const MD_BLOCK* block)
4664 {
4665     union {
4666         MD_BLOCK_H_DETAIL header;
4667         MD_BLOCK_CODE_DETAIL code;
4668     } det;
4669     MD_ATTRIBUTE_BUILD info_build;
4670     MD_ATTRIBUTE_BUILD lang_build;
4671     int is_in_tight_list;
4672     int clean_fence_code_detail = FALSE;
4673     int ret = 0;
4674 
4675     memset(&det, 0, sizeof(det));
4676 
4677     if(ctx->n_containers == 0)
4678         is_in_tight_list = FALSE;
4679     else
4680         is_in_tight_list = !ctx->containers[ctx->n_containers-1].is_loose;
4681 
4682     switch(block->type) {
4683         case MD_BLOCK_H:
4684             det.header.level = block->data;
4685             break;
4686 
4687         case MD_BLOCK_CODE:
4688             /* For fenced code block, we may need to set the info string. */
4689             if(block->data != 0) {
4690                 memset(&det.code, 0, sizeof(MD_BLOCK_CODE_DETAIL));
4691                 clean_fence_code_detail = TRUE;
4692                 MD_CHECK(md_setup_fenced_code_detail(ctx, block, &det.code, &info_build, &lang_build));
4693             }
4694             break;
4695 
4696         default:
4697             /* Noop. */
4698             break;
4699     }
4700 
4701     if(!is_in_tight_list  ||  block->type != MD_BLOCK_P)
4702         MD_ENTER_BLOCK(block->type, (void*) &det);
4703 
4704     /* Process the block contents accordingly to is type. */
4705     switch(block->type) {
4706         case MD_BLOCK_HR:
4707             /* noop */
4708             break;
4709 
4710         case MD_BLOCK_CODE:
4711             MD_CHECK(md_process_code_block_contents(ctx, (block->data != 0),
4712                             (const MD_VERBATIMLINE*)(block + 1), block->n_lines));
4713             break;
4714 
4715         case MD_BLOCK_HTML:
4716             MD_CHECK(md_process_verbatim_block_contents(ctx, MD_TEXT_HTML,
4717                             (const MD_VERBATIMLINE*)(block + 1), block->n_lines));
4718             break;
4719 
4720         case MD_BLOCK_TABLE:
4721             MD_CHECK(md_process_table_block_contents(ctx, block->data,
4722                             (const MD_LINE*)(block + 1), block->n_lines));
4723             break;
4724 
4725         default:
4726             MD_CHECK(md_process_normal_block_contents(ctx,
4727                             (const MD_LINE*)(block + 1), block->n_lines));
4728             break;
4729     }
4730 
4731     if(!is_in_tight_list  ||  block->type != MD_BLOCK_P)
4732         MD_LEAVE_BLOCK(block->type, (void*) &det);
4733 
4734 abort:
4735     if(clean_fence_code_detail) {
4736         md_free_attribute(ctx, &info_build);
4737         md_free_attribute(ctx, &lang_build);
4738     }
4739     return ret;
4740 }
4741 
4742 static int
md_process_all_blocks(MD_CTX * ctx)4743 md_process_all_blocks(MD_CTX* ctx)
4744 {
4745     int byte_off = 0;
4746     int ret = 0;
4747 
4748     /* ctx->containers now is not needed for detection of lists and list items
4749      * so we reuse it for tracking what lists are loose or tight. We rely
4750      * on the fact the vector is large enough to hold the deepest nesting
4751      * level of lists. */
4752     ctx->n_containers = 0;
4753 
4754     while(byte_off < ctx->n_block_bytes) {
4755         MD_BLOCK* block = (MD_BLOCK*)((char*)ctx->block_bytes + byte_off);
4756         union {
4757             MD_BLOCK_UL_DETAIL ul;
4758             MD_BLOCK_OL_DETAIL ol;
4759             MD_BLOCK_LI_DETAIL li;
4760         } det;
4761 
4762         switch(block->type) {
4763             case MD_BLOCK_UL:
4764                 det.ul.is_tight = (block->flags & MD_BLOCK_LOOSE_LIST) ? FALSE : TRUE;
4765                 det.ul.mark = (CHAR) block->data;
4766                 break;
4767 
4768             case MD_BLOCK_OL:
4769                 det.ol.start = block->n_lines;
4770                 det.ol.is_tight =  (block->flags & MD_BLOCK_LOOSE_LIST) ? FALSE : TRUE;
4771                 det.ol.mark_delimiter = (CHAR) block->data;
4772                 break;
4773 
4774             case MD_BLOCK_LI:
4775                 det.li.is_task = (block->data != 0);
4776                 det.li.task_mark = (CHAR) block->data;
4777                 det.li.task_mark_offset = (OFF) block->n_lines;
4778                 break;
4779 
4780             default:
4781                 /* noop */
4782                 break;
4783         }
4784 
4785         if(block->flags & MD_BLOCK_CONTAINER) {
4786             if(block->flags & MD_BLOCK_CONTAINER_CLOSER) {
4787                 MD_LEAVE_BLOCK(block->type, &det);
4788 
4789                 if(block->type == MD_BLOCK_UL || block->type == MD_BLOCK_OL || block->type == MD_BLOCK_QUOTE)
4790                     ctx->n_containers--;
4791             }
4792 
4793             if(block->flags & MD_BLOCK_CONTAINER_OPENER) {
4794                 MD_ENTER_BLOCK(block->type, &det);
4795 
4796                 if(block->type == MD_BLOCK_UL || block->type == MD_BLOCK_OL) {
4797                     ctx->containers[ctx->n_containers].is_loose = (block->flags & MD_BLOCK_LOOSE_LIST);
4798                     ctx->n_containers++;
4799                 } else if(block->type == MD_BLOCK_QUOTE) {
4800                     /* This causes that any text in a block quote, even if
4801                      * nested inside a tight list item, is wrapped with
4802                      * <p>...</p>. */
4803                     ctx->containers[ctx->n_containers].is_loose = TRUE;
4804                     ctx->n_containers++;
4805                 }
4806             }
4807         } else {
4808             MD_CHECK(md_process_leaf_block(ctx, block));
4809 
4810             if(block->type == MD_BLOCK_CODE || block->type == MD_BLOCK_HTML)
4811                 byte_off += block->n_lines * sizeof(MD_VERBATIMLINE);
4812             else
4813                 byte_off += block->n_lines * sizeof(MD_LINE);
4814         }
4815 
4816         byte_off += sizeof(MD_BLOCK);
4817     }
4818 
4819     ctx->n_block_bytes = 0;
4820 
4821 abort:
4822     return ret;
4823 }
4824 
4825 
4826 /************************************
4827  ***  Grouping Lines into Blocks  ***
4828  ************************************/
4829 
4830 static void*
md_push_block_bytes(MD_CTX * ctx,int n_bytes)4831 md_push_block_bytes(MD_CTX* ctx, int n_bytes)
4832 {
4833     void* ptr;
4834 
4835     if(ctx->n_block_bytes + n_bytes > ctx->alloc_block_bytes) {
4836         void* new_block_bytes;
4837 
4838         ctx->alloc_block_bytes = (ctx->alloc_block_bytes > 0
4839                 ? ctx->alloc_block_bytes + ctx->alloc_block_bytes / 2
4840                 : 512);
4841         new_block_bytes = realloc(ctx->block_bytes, ctx->alloc_block_bytes);
4842         if(new_block_bytes == NULL) {
4843             MD_LOG("realloc() failed.");
4844             return NULL;
4845         }
4846 
4847         /* Fix the ->current_block after the reallocation. */
4848         if(ctx->current_block != NULL) {
4849             OFF off_current_block = (char*) ctx->current_block - (char*) ctx->block_bytes;
4850             ctx->current_block = (MD_BLOCK*) ((char*) new_block_bytes + off_current_block);
4851         }
4852 
4853         ctx->block_bytes = new_block_bytes;
4854     }
4855 
4856     ptr = (char*)ctx->block_bytes + ctx->n_block_bytes;
4857     ctx->n_block_bytes += n_bytes;
4858     return ptr;
4859 }
4860 
4861 static int
md_start_new_block(MD_CTX * ctx,const MD_LINE_ANALYSIS * line)4862 md_start_new_block(MD_CTX* ctx, const MD_LINE_ANALYSIS* line)
4863 {
4864     MD_BLOCK* block;
4865 
4866     MD_ASSERT(ctx->current_block == NULL);
4867 
4868     block = (MD_BLOCK*) md_push_block_bytes(ctx, sizeof(MD_BLOCK));
4869     if(block == NULL)
4870         return -1;
4871 
4872     switch(line->type) {
4873         case MD_LINE_HR:
4874             block->type = MD_BLOCK_HR;
4875             break;
4876 
4877         case MD_LINE_ATXHEADER:
4878         case MD_LINE_SETEXTHEADER:
4879             block->type = MD_BLOCK_H;
4880             break;
4881 
4882         case MD_LINE_FENCEDCODE:
4883         case MD_LINE_INDENTEDCODE:
4884             block->type = MD_BLOCK_CODE;
4885             break;
4886 
4887         case MD_LINE_TEXT:
4888             block->type = MD_BLOCK_P;
4889             break;
4890 
4891         case MD_LINE_HTML:
4892             block->type = MD_BLOCK_HTML;
4893             break;
4894 
4895         case MD_LINE_BLANK:
4896         case MD_LINE_SETEXTUNDERLINE:
4897         case MD_LINE_TABLEUNDERLINE:
4898         default:
4899             MD_UNREACHABLE();
4900             break;
4901     }
4902 
4903     block->flags = 0;
4904     block->data = line->data;
4905     block->n_lines = 0;
4906 
4907     ctx->current_block = block;
4908     return 0;
4909 }
4910 
4911 /* Eat from start of current (textual) block any reference definitions and
4912  * remember them so we can resolve any links referring to them.
4913  *
4914  * (Reference definitions can only be at start of it as they cannot break
4915  * a paragraph.)
4916  */
4917 static int
md_consume_link_reference_definitions(MD_CTX * ctx)4918 md_consume_link_reference_definitions(MD_CTX* ctx)
4919 {
4920     MD_LINE* lines = (MD_LINE*) (ctx->current_block + 1);
4921     int n_lines = ctx->current_block->n_lines;
4922     int n = 0;
4923 
4924     /* Compute how many lines at the start of the block form one or more
4925      * reference definitions. */
4926     while(n < n_lines) {
4927         int n_link_ref_lines;
4928 
4929         n_link_ref_lines = md_is_link_reference_definition(ctx,
4930                                     lines + n, n_lines - n);
4931         /* Not a reference definition? */
4932         if(n_link_ref_lines == 0)
4933             break;
4934 
4935         /* We fail if it is the ref. def. but it could not be stored due
4936          * a memory allocation error. */
4937         if(n_link_ref_lines < 0)
4938             return -1;
4939 
4940         n += n_link_ref_lines;
4941     }
4942 
4943     /* If there was at least one reference definition, we need to remove
4944      * its lines from the block, or perhaps even the whole block. */
4945     if(n > 0) {
4946         if(n == n_lines) {
4947             /* Remove complete block. */
4948             ctx->n_block_bytes -= n * sizeof(MD_LINE);
4949             ctx->n_block_bytes -= sizeof(MD_BLOCK);
4950             ctx->current_block = NULL;
4951         } else {
4952             /* Remove just some initial lines from the block. */
4953             memmove(lines, lines + n, (n_lines - n) * sizeof(MD_LINE));
4954             ctx->current_block->n_lines -= n;
4955             ctx->n_block_bytes -= n * sizeof(MD_LINE);
4956         }
4957     }
4958 
4959     return 0;
4960 }
4961 
4962 static int
md_end_current_block(MD_CTX * ctx)4963 md_end_current_block(MD_CTX* ctx)
4964 {
4965     int ret = 0;
4966 
4967     if(ctx->current_block == NULL)
4968         return ret;
4969 
4970     /* Check whether there is a reference definition. (We do this here instead
4971      * of in md_analyze_line() because reference definition can take multiple
4972      * lines.) */
4973     if(ctx->current_block->type == MD_BLOCK_P  ||
4974        (ctx->current_block->type == MD_BLOCK_H  &&  (ctx->current_block->flags & MD_BLOCK_SETEXT_HEADER)))
4975     {
4976         MD_LINE* lines = (MD_LINE*) (ctx->current_block + 1);
4977         if(CH(lines[0].beg) == _T('[')) {
4978             MD_CHECK(md_consume_link_reference_definitions(ctx));
4979             if(ctx->current_block == NULL)
4980                 return ret;
4981         }
4982     }
4983 
4984     if(ctx->current_block->type == MD_BLOCK_H  &&  (ctx->current_block->flags & MD_BLOCK_SETEXT_HEADER)) {
4985         int n_lines = ctx->current_block->n_lines;
4986 
4987         if(n_lines > 1) {
4988             /* Get rid of the underline. */
4989             ctx->current_block->n_lines--;
4990             ctx->n_block_bytes -= sizeof(MD_LINE);
4991         } else {
4992             /* Only the underline has left after eating the ref. defs.
4993              * Keep the line as beginning of a new ordinary paragraph. */
4994             ctx->current_block->type = MD_BLOCK_P;
4995             return 0;
4996         }
4997     }
4998 
4999     /* Mark we are not building any block anymore. */
5000     ctx->current_block = NULL;
5001 
5002 abort:
5003     return ret;
5004 }
5005 
5006 static int
md_add_line_into_current_block(MD_CTX * ctx,const MD_LINE_ANALYSIS * analysis)5007 md_add_line_into_current_block(MD_CTX* ctx, const MD_LINE_ANALYSIS* analysis)
5008 {
5009     MD_ASSERT(ctx->current_block != NULL);
5010 
5011     if(ctx->current_block->type == MD_BLOCK_CODE || ctx->current_block->type == MD_BLOCK_HTML) {
5012         MD_VERBATIMLINE* line;
5013 
5014         line = (MD_VERBATIMLINE*) md_push_block_bytes(ctx, sizeof(MD_VERBATIMLINE));
5015         if(line == NULL)
5016             return -1;
5017 
5018         line->indent = analysis->indent;
5019         line->beg = analysis->beg;
5020         line->end = analysis->end;
5021     } else {
5022         MD_LINE* line;
5023 
5024         line = (MD_LINE*) md_push_block_bytes(ctx, sizeof(MD_LINE));
5025         if(line == NULL)
5026             return -1;
5027 
5028         line->beg = analysis->beg;
5029         line->end = analysis->end;
5030     }
5031     ctx->current_block->n_lines++;
5032 
5033     return 0;
5034 }
5035 
5036 static int
md_push_container_bytes(MD_CTX * ctx,MD_BLOCKTYPE type,unsigned start,unsigned data,unsigned flags)5037 md_push_container_bytes(MD_CTX* ctx, MD_BLOCKTYPE type, unsigned start,
5038                         unsigned data, unsigned flags)
5039 {
5040     MD_BLOCK* block;
5041     int ret = 0;
5042 
5043     MD_CHECK(md_end_current_block(ctx));
5044 
5045     block = (MD_BLOCK*) md_push_block_bytes(ctx, sizeof(MD_BLOCK));
5046     if(block == NULL)
5047         return -1;
5048 
5049     block->type = type;
5050     block->flags = flags;
5051     block->data = data;
5052     block->n_lines = start;
5053 
5054 abort:
5055     return ret;
5056 }
5057 
5058 
5059 
5060 /***********************
5061  ***  Line Analysis  ***
5062  ***********************/
5063 
5064 static int
md_is_hr_line(MD_CTX * ctx,OFF beg,OFF * p_end,OFF * p_killer)5065 md_is_hr_line(MD_CTX* ctx, OFF beg, OFF* p_end, OFF* p_killer)
5066 {
5067     OFF off = beg + 1;
5068     int n = 1;
5069 
5070     while(off < ctx->size  &&  (CH(off) == CH(beg) || CH(off) == _T(' ') || CH(off) == _T('\t'))) {
5071         if(CH(off) == CH(beg))
5072             n++;
5073         off++;
5074     }
5075 
5076     if(n < 3) {
5077         *p_killer = off;
5078         return FALSE;
5079     }
5080 
5081     /* Nothing else can be present on the line. */
5082     if(off < ctx->size  &&  !ISNEWLINE(off)) {
5083         *p_killer = off;
5084         return FALSE;
5085     }
5086 
5087     *p_end = off;
5088     return TRUE;
5089 }
5090 
5091 static int
md_is_atxheader_line(MD_CTX * ctx,OFF beg,OFF * p_beg,OFF * p_end,unsigned * p_level)5092 md_is_atxheader_line(MD_CTX* ctx, OFF beg, OFF* p_beg, OFF* p_end, unsigned* p_level)
5093 {
5094     int n;
5095     OFF off = beg + 1;
5096 
5097     while(off < ctx->size  &&  CH(off) == _T('#')  &&  off - beg < 7)
5098         off++;
5099     n = off - beg;
5100 
5101     if(n > 6)
5102         return FALSE;
5103     *p_level = n;
5104 
5105     if(!(ctx->parser.flags & MD_FLAG_PERMISSIVEATXHEADERS)  &&  off < ctx->size  &&
5106        CH(off) != _T(' ')  &&  CH(off) != _T('\t')  &&  !ISNEWLINE(off))
5107         return FALSE;
5108 
5109     while(off < ctx->size  &&  CH(off) == _T(' '))
5110         off++;
5111     *p_beg = off;
5112     *p_end = off;
5113     return TRUE;
5114 }
5115 
5116 static int
md_is_setext_underline(MD_CTX * ctx,OFF beg,OFF * p_end,unsigned * p_level)5117 md_is_setext_underline(MD_CTX* ctx, OFF beg, OFF* p_end, unsigned* p_level)
5118 {
5119     OFF off = beg + 1;
5120 
5121     while(off < ctx->size  &&  CH(off) == CH(beg))
5122         off++;
5123 
5124     /* Optionally, space(s) can follow. */
5125     while(off < ctx->size  &&  CH(off) == _T(' '))
5126         off++;
5127 
5128     /* But nothing more is allowed on the line. */
5129     if(off < ctx->size  &&  !ISNEWLINE(off))
5130         return FALSE;
5131 
5132     *p_level = (CH(beg) == _T('=') ? 1 : 2);
5133     *p_end = off;
5134     return TRUE;
5135 }
5136 
5137 static int
md_is_table_underline(MD_CTX * ctx,OFF beg,OFF * p_end,unsigned * p_col_count)5138 md_is_table_underline(MD_CTX* ctx, OFF beg, OFF* p_end, unsigned* p_col_count)
5139 {
5140     OFF off = beg;
5141     int found_pipe = FALSE;
5142     unsigned col_count = 0;
5143 
5144     if(off < ctx->size  &&  CH(off) == _T('|')) {
5145         found_pipe = TRUE;
5146         off++;
5147         while(off < ctx->size  &&  ISWHITESPACE(off))
5148             off++;
5149     }
5150 
5151     while(1) {
5152         OFF cell_beg;
5153         int delimited = FALSE;
5154 
5155         /* Cell underline ("-----", ":----", "----:" or ":----:") */
5156         cell_beg = off;
5157         if(off < ctx->size  &&  CH(off) == _T(':'))
5158             off++;
5159         while(off < ctx->size  &&  CH(off) == _T('-'))
5160             off++;
5161         if(off < ctx->size  &&  CH(off) == _T(':'))
5162             off++;
5163         if(off - cell_beg < 3)
5164             return FALSE;
5165 
5166         col_count++;
5167 
5168         /* Pipe delimiter (optional at the end of line). */
5169         while(off < ctx->size  &&  ISWHITESPACE(off))
5170             off++;
5171         if(off < ctx->size  &&  CH(off) == _T('|')) {
5172             delimited = TRUE;
5173             found_pipe =  TRUE;
5174             off++;
5175             while(off < ctx->size  &&  ISWHITESPACE(off))
5176                 off++;
5177         }
5178 
5179         /* Success, if we reach end of line. */
5180         if(off >= ctx->size  ||  ISNEWLINE(off))
5181             break;
5182 
5183         if(!delimited)
5184             return FALSE;
5185     }
5186 
5187     if(!found_pipe)
5188         return FALSE;
5189 
5190     *p_end = off;
5191     *p_col_count = col_count;
5192     return TRUE;
5193 }
5194 
5195 static int
md_is_opening_code_fence(MD_CTX * ctx,OFF beg,OFF * p_end)5196 md_is_opening_code_fence(MD_CTX* ctx, OFF beg, OFF* p_end)
5197 {
5198     OFF off = beg;
5199 
5200     while(off < ctx->size && CH(off) == CH(beg))
5201         off++;
5202 
5203     /* Fence must have at least three characters. */
5204     if(off - beg < 3)
5205         return FALSE;
5206 
5207     ctx->code_fence_length = off - beg;
5208 
5209     /* Optionally, space(s) can follow. */
5210     while(off < ctx->size  &&  CH(off) == _T(' '))
5211         off++;
5212 
5213     /* Optionally, an info string can follow. */
5214     while(off < ctx->size  &&  !ISNEWLINE(off)) {
5215         /* Backtick-based fence must not contain '`' in the info string. */
5216         if(CH(beg) == _T('`')  &&  CH(off) == _T('`'))
5217             return FALSE;
5218         off++;
5219     }
5220 
5221     *p_end = off;
5222     return TRUE;
5223 }
5224 
5225 static int
md_is_closing_code_fence(MD_CTX * ctx,CHAR ch,OFF beg,OFF * p_end)5226 md_is_closing_code_fence(MD_CTX* ctx, CHAR ch, OFF beg, OFF* p_end)
5227 {
5228     OFF off = beg;
5229     int ret = FALSE;
5230 
5231     /* Closing fence must have at least the same length and use same char as
5232      * opening one. */
5233     while(off < ctx->size  &&  CH(off) == ch)
5234         off++;
5235     if(off - beg < ctx->code_fence_length)
5236         goto out;
5237 
5238     /* Optionally, space(s) can follow */
5239     while(off < ctx->size  &&  CH(off) == _T(' '))
5240         off++;
5241 
5242     /* But nothing more is allowed on the line. */
5243     if(off < ctx->size  &&  !ISNEWLINE(off))
5244         goto out;
5245 
5246     ret = TRUE;
5247 
5248 out:
5249     /* Note we set *p_end even on failure: If we are not closing fence, caller
5250      * would eat the line anyway without any parsing. */
5251     *p_end = off;
5252     return ret;
5253 }
5254 
5255 /* Returns type of the raw HTML block, or FALSE if it is not HTML block.
5256  * (Refer to CommonMark specification for details about the types.)
5257  */
5258 static int
md_is_html_block_start_condition(MD_CTX * ctx,OFF beg)5259 md_is_html_block_start_condition(MD_CTX* ctx, OFF beg)
5260 {
5261     typedef struct TAG_tag TAG;
5262     struct TAG_tag {
5263         const CHAR* name;
5264         unsigned len    : 8;
5265     };
5266 
5267     /* Type 6 is started by a long list of allowed tags. We use two-level
5268      * tree to speed-up the search. */
5269 #ifdef X
5270     #undef X
5271 #endif
5272 #define X(name)     { _T(name), (sizeof(name)-1) / sizeof(CHAR) }
5273 #define Xend        { NULL, 0 }
5274     static const TAG t1[] = { X("script"), X("pre"), X("style"), Xend };
5275 
5276     static const TAG a6[] = { X("address"), X("article"), X("aside"), Xend };
5277     static const TAG b6[] = { X("base"), X("basefont"), X("blockquote"), X("body"), Xend };
5278     static const TAG c6[] = { X("caption"), X("center"), X("col"), X("colgroup"), Xend };
5279     static const TAG d6[] = { X("dd"), X("details"), X("dialog"), X("dir"),
5280                               X("div"), X("dl"), X("dt"), Xend };
5281     static const TAG f6[] = { X("fieldset"), X("figcaption"), X("figure"), X("footer"),
5282                               X("form"), X("frame"), X("frameset"), Xend };
5283     static const TAG h6[] = { X("h1"), X("head"), X("header"), X("hr"), X("html"), Xend };
5284     static const TAG i6[] = { X("iframe"), Xend };
5285     static const TAG l6[] = { X("legend"), X("li"), X("link"), Xend };
5286     static const TAG m6[] = { X("main"), X("menu"), X("menuitem"), Xend };
5287     static const TAG n6[] = { X("nav"), X("noframes"), Xend };
5288     static const TAG o6[] = { X("ol"), X("optgroup"), X("option"), Xend };
5289     static const TAG p6[] = { X("p"), X("param"), Xend };
5290     static const TAG s6[] = { X("section"), X("source"), X("summary"), Xend };
5291     static const TAG t6[] = { X("table"), X("tbody"), X("td"), X("tfoot"), X("th"),
5292                               X("thead"), X("title"), X("tr"), X("track"), Xend };
5293     static const TAG u6[] = { X("ul"), Xend };
5294     static const TAG xx[] = { Xend };
5295 #undef X
5296 
5297     static const TAG* map6[26] = {
5298         a6, b6, c6, d6, xx, f6, xx, h6, i6, xx, xx, l6, m6,
5299         n6, o6, p6, xx, xx, s6, t6, u6, xx, xx, xx, xx, xx
5300     };
5301     OFF off = beg + 1;
5302     int i;
5303 
5304     /* Check for type 1: <script, <pre, or <style */
5305     for(i = 0; t1[i].name != NULL; i++) {
5306         if(off + t1[i].len <= ctx->size) {
5307             if(md_ascii_case_eq(STR(off), t1[i].name, t1[i].len))
5308                 return 1;
5309         }
5310     }
5311 
5312     /* Check for type 2: <!-- */
5313     if(off + 3 < ctx->size  &&  CH(off) == _T('!')  &&  CH(off+1) == _T('-')  &&  CH(off+2) == _T('-'))
5314         return 2;
5315 
5316     /* Check for type 3: <? */
5317     if(off < ctx->size  &&  CH(off) == _T('?'))
5318         return 3;
5319 
5320     /* Check for type 4 or 5: <! */
5321     if(off < ctx->size  &&  CH(off) == _T('!')) {
5322         /* Check for type 4: <! followed by uppercase letter. */
5323         if(off + 1 < ctx->size  &&  ISUPPER(off+1))
5324             return 4;
5325 
5326         /* Check for type 5: <![CDATA[ */
5327         if(off + 8 < ctx->size) {
5328             if(md_ascii_eq(STR(off), _T("![CDATA["), 8))
5329                 return 5;
5330         }
5331     }
5332 
5333     /* Check for type 6: Many possible starting tags listed above. */
5334     if(off + 1 < ctx->size  &&  (ISALPHA(off) || (CH(off) == _T('/') && ISALPHA(off+1)))) {
5335         int slot;
5336         const TAG* tags;
5337 
5338         if(CH(off) == _T('/'))
5339             off++;
5340 
5341         slot = (ISUPPER(off) ? CH(off) - 'A' : CH(off) - 'a');
5342         tags = map6[slot];
5343 
5344         for(i = 0; tags[i].name != NULL; i++) {
5345             if(off + tags[i].len <= ctx->size) {
5346                 if(md_ascii_case_eq(STR(off), tags[i].name, tags[i].len)) {
5347                     OFF tmp = off + tags[i].len;
5348                     if(tmp >= ctx->size)
5349                         return 6;
5350                     if(ISBLANK(tmp) || ISNEWLINE(tmp) || CH(tmp) == _T('>'))
5351                         return 6;
5352                     if(tmp+1 < ctx->size && CH(tmp) == _T('/') && CH(tmp+1) == _T('>'))
5353                         return 6;
5354                     break;
5355                 }
5356             }
5357         }
5358     }
5359 
5360     /* Check for type 7: any COMPLETE other opening or closing tag. */
5361     if(off + 1 < ctx->size) {
5362         OFF end;
5363 
5364         if(md_is_html_tag(ctx, NULL, 0, beg, ctx->size, &end)) {
5365             /* Only optional whitespace and new line may follow. */
5366             while(end < ctx->size  &&  ISWHITESPACE(end))
5367                 end++;
5368             if(end >= ctx->size  ||  ISNEWLINE(end))
5369                 return 7;
5370         }
5371     }
5372 
5373     return FALSE;
5374 }
5375 
5376 /* Case sensitive check whether there is a substring 'what' between 'beg'
5377  * and end of line. */
5378 static int
md_line_contains(MD_CTX * ctx,OFF beg,const CHAR * what,SZ what_len,OFF * p_end)5379 md_line_contains(MD_CTX* ctx, OFF beg, const CHAR* what, SZ what_len, OFF* p_end)
5380 {
5381     OFF i;
5382     for(i = beg; i + what_len < ctx->size; i++) {
5383         if(ISNEWLINE(i))
5384             break;
5385         if(memcmp(STR(i), what, what_len * sizeof(CHAR)) == 0) {
5386             *p_end = i + what_len;
5387             return TRUE;
5388         }
5389     }
5390 
5391     *p_end = i;
5392     return FALSE;
5393 }
5394 
5395 /* Returns type of HTML block end condition or FALSE if not an end condition.
5396  *
5397  * Note it fills p_end even when it is not end condition as the caller
5398  * does not need to analyze contents of a raw HTML block.
5399  */
5400 static int
md_is_html_block_end_condition(MD_CTX * ctx,OFF beg,OFF * p_end)5401 md_is_html_block_end_condition(MD_CTX* ctx, OFF beg, OFF* p_end)
5402 {
5403     switch(ctx->html_block_type) {
5404         case 1:
5405         {
5406             OFF off = beg;
5407 
5408             while(off < ctx->size  &&  !ISNEWLINE(off)) {
5409                 if(CH(off) == _T('<')) {
5410                     if(md_ascii_case_eq(STR(off), _T("</script>"), 9)) {
5411                         *p_end = off + 9;
5412                         return TRUE;
5413                     }
5414 
5415                     if(md_ascii_case_eq(STR(off), _T("</style>"), 8)) {
5416                         *p_end = off + 8;
5417                         return TRUE;
5418                     }
5419 
5420                     if(md_ascii_case_eq(STR(off), _T("</pre>"), 6)) {
5421                         *p_end = off + 6;
5422                         return TRUE;
5423                     }
5424                 }
5425 
5426                 off++;
5427             }
5428             *p_end = off;
5429             return FALSE;
5430         }
5431 
5432         case 2:
5433             return (md_line_contains(ctx, beg, _T("-->"), 3, p_end) ? 2 : FALSE);
5434 
5435         case 3:
5436             return (md_line_contains(ctx, beg, _T("?>"), 2, p_end) ? 3 : FALSE);
5437 
5438         case 4:
5439             return (md_line_contains(ctx, beg, _T(">"), 1, p_end) ? 4 : FALSE);
5440 
5441         case 5:
5442             return (md_line_contains(ctx, beg, _T("]]>"), 3, p_end) ? 5 : FALSE);
5443 
5444         case 6:     /* Pass through */
5445         case 7:
5446             *p_end = beg;
5447             return (ISNEWLINE(beg) ? ctx->html_block_type : FALSE);
5448 
5449         default:
5450             MD_UNREACHABLE();
5451     }
5452     return FALSE;
5453 }
5454 
5455 
5456 static int
md_is_container_compatible(const MD_CONTAINER * pivot,const MD_CONTAINER * container)5457 md_is_container_compatible(const MD_CONTAINER* pivot, const MD_CONTAINER* container)
5458 {
5459     /* Block quote has no "items" like lists. */
5460     if(container->ch == _T('>'))
5461         return FALSE;
5462 
5463     if(container->ch != pivot->ch)
5464         return FALSE;
5465     if(container->mark_indent > pivot->contents_indent)
5466         return FALSE;
5467 
5468     return TRUE;
5469 }
5470 
5471 static int
md_push_container(MD_CTX * ctx,const MD_CONTAINER * container)5472 md_push_container(MD_CTX* ctx, const MD_CONTAINER* container)
5473 {
5474     if(ctx->n_containers >= ctx->alloc_containers) {
5475         MD_CONTAINER* new_containers;
5476 
5477         ctx->alloc_containers = (ctx->alloc_containers > 0
5478                 ? ctx->alloc_containers + ctx->alloc_containers / 2
5479                 : 16);
5480         new_containers = realloc(ctx->containers, ctx->alloc_containers * sizeof(MD_CONTAINER));
5481         if(new_containers == NULL) {
5482             MD_LOG("realloc() failed.");
5483             return -1;
5484         }
5485 
5486         ctx->containers = new_containers;
5487     }
5488 
5489     memcpy(&ctx->containers[ctx->n_containers++], container, sizeof(MD_CONTAINER));
5490     return 0;
5491 }
5492 
5493 static int
md_enter_child_containers(MD_CTX * ctx,int n_children,unsigned data)5494 md_enter_child_containers(MD_CTX* ctx, int n_children, unsigned data)
5495 {
5496     int i;
5497     int ret = 0;
5498 
5499     for(i = ctx->n_containers - n_children; i < ctx->n_containers; i++) {
5500         MD_CONTAINER* c = &ctx->containers[i];
5501         int is_ordered_list = FALSE;
5502 
5503         switch(c->ch) {
5504             case _T(')'):
5505             case _T('.'):
5506                 is_ordered_list = TRUE;
5507                 /* Pass through */
5508 
5509             case _T('-'):
5510             case _T('+'):
5511             case _T('*'):
5512                 /* Remember offset in ctx->block_bytes so we can revisit the
5513                  * block if we detect it is a loose list. */
5514                 md_end_current_block(ctx);
5515                 c->block_byte_off = ctx->n_block_bytes;
5516 
5517                 MD_CHECK(md_push_container_bytes(ctx,
5518                                 (is_ordered_list ? MD_BLOCK_OL : MD_BLOCK_UL),
5519                                 c->start, data, MD_BLOCK_CONTAINER_OPENER));
5520                 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI,
5521                                 c->task_mark_off,
5522                                 (c->is_task ? CH(c->task_mark_off) : 0),
5523                                 MD_BLOCK_CONTAINER_OPENER));
5524                 break;
5525 
5526             case _T('>'):
5527                 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_QUOTE, 0, 0, MD_BLOCK_CONTAINER_OPENER));
5528                 break;
5529 
5530             default:
5531                 MD_UNREACHABLE();
5532                 break;
5533         }
5534     }
5535 
5536 abort:
5537     return ret;
5538 }
5539 
5540 static int
md_leave_child_containers(MD_CTX * ctx,int n_keep)5541 md_leave_child_containers(MD_CTX* ctx, int n_keep)
5542 {
5543     int ret = 0;
5544 
5545     while(ctx->n_containers > n_keep) {
5546         MD_CONTAINER* c = &ctx->containers[ctx->n_containers-1];
5547         int is_ordered_list = FALSE;
5548 
5549         switch(c->ch) {
5550             case _T(')'):
5551             case _T('.'):
5552                 is_ordered_list = TRUE;
5553                 /* Pass through */
5554 
5555             case _T('-'):
5556             case _T('+'):
5557             case _T('*'):
5558                 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI,
5559                                 c->task_mark_off, (c->is_task ? CH(c->task_mark_off) : 0),
5560                                 MD_BLOCK_CONTAINER_CLOSER));
5561                 MD_CHECK(md_push_container_bytes(ctx,
5562                                 (is_ordered_list ? MD_BLOCK_OL : MD_BLOCK_UL), 0,
5563                                 c->ch, MD_BLOCK_CONTAINER_CLOSER));
5564                 break;
5565 
5566             case _T('>'):
5567                 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_QUOTE, 0,
5568                                 0, MD_BLOCK_CONTAINER_CLOSER));
5569                 break;
5570 
5571             default:
5572                 MD_UNREACHABLE();
5573                 break;
5574         }
5575 
5576         ctx->n_containers--;
5577     }
5578 
5579 abort:
5580     return ret;
5581 }
5582 
5583 static int
md_is_container_mark(MD_CTX * ctx,unsigned indent,OFF beg,OFF * p_end,MD_CONTAINER * p_container)5584 md_is_container_mark(MD_CTX* ctx, unsigned indent, OFF beg, OFF* p_end, MD_CONTAINER* p_container)
5585 {
5586     OFF off = beg;
5587     OFF max_end;
5588 
5589     if(indent >= ctx->code_indent_offset)
5590         return FALSE;
5591 
5592     /* Check for block quote mark. */
5593     if(off < ctx->size  &&  CH(off) == _T('>')) {
5594         off++;
5595         p_container->ch = _T('>');
5596         p_container->is_loose = FALSE;
5597         p_container->is_task = FALSE;
5598         p_container->mark_indent = indent;
5599         p_container->contents_indent = indent + 1;
5600         *p_end = off;
5601         return TRUE;
5602     }
5603 
5604     /* Check for list item bullet mark. */
5605     if(off+1 < ctx->size  &&  ISANYOF(off, _T("-+*"))  &&  (ISBLANK(off+1) || ISNEWLINE(off+1))) {
5606         p_container->ch = CH(off);
5607         p_container->is_loose = FALSE;
5608         p_container->is_task = FALSE;
5609         p_container->mark_indent = indent;
5610         p_container->contents_indent = indent + 1;
5611         *p_end = off + 1;
5612         return TRUE;
5613     }
5614 
5615     /* Check for ordered list item marks. */
5616     max_end = off + 9;
5617     if(max_end > ctx->size)
5618         max_end = ctx->size;
5619     p_container->start = 0;
5620     while(off < max_end  &&  ISDIGIT(off)) {
5621         p_container->start = p_container->start * 10 + CH(off) - _T('0');
5622         off++;
5623     }
5624     if(off > beg  &&  off+1 < ctx->size  &&
5625        (CH(off) == _T('.') || CH(off) == _T(')'))  &&
5626        (ISBLANK(off+1) || ISNEWLINE(off+1)))
5627     {
5628         p_container->ch = CH(off);
5629         p_container->is_loose = FALSE;
5630         p_container->is_task = FALSE;
5631         p_container->mark_indent = indent;
5632         p_container->contents_indent = indent + off - beg + 1;
5633         *p_end = off + 1;
5634         return TRUE;
5635     }
5636 
5637     return FALSE;
5638 }
5639 
5640 static unsigned
md_line_indentation(MD_CTX * ctx,unsigned total_indent,OFF beg,OFF * p_end)5641 md_line_indentation(MD_CTX* ctx, unsigned total_indent, OFF beg, OFF* p_end)
5642 {
5643     OFF off = beg;
5644     unsigned indent = total_indent;
5645 
5646     while(off < ctx->size  &&  ISBLANK(off)) {
5647         if(CH(off) == _T('\t'))
5648             indent = (indent + 4) & ~3;
5649         else
5650             indent++;
5651         off++;
5652     }
5653 
5654     *p_end = off;
5655     return indent - total_indent;
5656 }
5657 
5658 static const MD_LINE_ANALYSIS md_dummy_blank_line = { MD_LINE_BLANK, 0 };
5659 
5660 /* Analyze type of the line and find some its properties. This serves as a
5661  * main input for determining type and boundaries of a block. */
5662 static int
md_analyze_line(MD_CTX * ctx,OFF beg,OFF * p_end,const MD_LINE_ANALYSIS * pivot_line,MD_LINE_ANALYSIS * line)5663 md_analyze_line(MD_CTX* ctx, OFF beg, OFF* p_end,
5664                 const MD_LINE_ANALYSIS* pivot_line, MD_LINE_ANALYSIS* line)
5665 {
5666     unsigned total_indent = 0;
5667     int n_parents = 0;
5668     int n_brothers = 0;
5669     int n_children = 0;
5670     MD_CONTAINER container = { 0 };
5671     int prev_line_has_list_loosening_effect = ctx->last_line_has_list_loosening_effect;
5672     OFF off = beg;
5673     OFF hr_killer = 0;
5674     int ret = 0;
5675 
5676     line->indent = md_line_indentation(ctx, total_indent, off, &off);
5677     total_indent += line->indent;
5678     line->beg = off;
5679 
5680     /* Given the indentation and block quote marks '>', determine how many of
5681      * the current containers are our parents. */
5682     while(n_parents < ctx->n_containers) {
5683         MD_CONTAINER* c = &ctx->containers[n_parents];
5684 
5685         if(c->ch == _T('>')  &&  line->indent < ctx->code_indent_offset  &&
5686             off < ctx->size  &&  CH(off) == _T('>'))
5687         {
5688             /* Block quote mark. */
5689             off++;
5690             total_indent++;
5691             line->indent = md_line_indentation(ctx, total_indent, off, &off);
5692             total_indent += line->indent;
5693 
5694             /* The optional 1st space after '>' is part of the block quote mark. */
5695             if(line->indent > 0)
5696                 line->indent--;
5697 
5698             line->beg = off;
5699 
5700         } else if(c->ch != _T('>')  &&  line->indent >= c->contents_indent) {
5701             /* List. */
5702             line->indent -= c->contents_indent;
5703         } else {
5704             break;
5705         }
5706 
5707         n_parents++;
5708     }
5709 
5710     if(off >= ctx->size  ||  ISNEWLINE(off)) {
5711         /* Blank line does not need any real indentation to be nested inside
5712          * a list. */
5713         if(n_brothers + n_children == 0) {
5714             while(n_parents < ctx->n_containers  &&  ctx->containers[n_parents].ch != _T('>'))
5715                 n_parents++;
5716         }
5717     }
5718 
5719     while(TRUE) {
5720         /* Check whether we are fenced code continuation. */
5721         if(pivot_line->type == MD_LINE_FENCEDCODE) {
5722             line->beg = off;
5723 
5724             /* We are another MD_LINE_FENCEDCODE unless we are closing fence
5725              * which we transform into MD_LINE_BLANK. */
5726             if(line->indent < ctx->code_indent_offset) {
5727                 if(md_is_closing_code_fence(ctx, CH(pivot_line->beg), off, &off)) {
5728                     line->type = MD_LINE_BLANK;
5729                     ctx->last_line_has_list_loosening_effect = FALSE;
5730                     break;
5731                 }
5732             }
5733 
5734             /* Change indentation accordingly to the initial code fence. */
5735             if(n_parents == ctx->n_containers) {
5736                 if(line->indent > pivot_line->indent)
5737                     line->indent -= pivot_line->indent;
5738                 else
5739                     line->indent = 0;
5740 
5741                 line->type = MD_LINE_FENCEDCODE;
5742                 break;
5743             }
5744         }
5745 
5746         /* Check whether we are HTML block continuation. */
5747         if(pivot_line->type == MD_LINE_HTML  &&  ctx->html_block_type > 0) {
5748             int html_block_type;
5749 
5750             html_block_type = md_is_html_block_end_condition(ctx, off, &off);
5751             if(html_block_type > 0) {
5752                 MD_ASSERT(html_block_type == ctx->html_block_type);
5753 
5754                 /* Make sure this is the last line of the block. */
5755                 ctx->html_block_type = 0;
5756 
5757                 /* Some end conditions serve as blank lines at the same time. */
5758                 if(html_block_type == 6 || html_block_type == 7) {
5759                     line->type = MD_LINE_BLANK;
5760                     line->indent = 0;
5761                     break;
5762                 }
5763             }
5764 
5765             if(n_parents == ctx->n_containers) {
5766                 line->type = MD_LINE_HTML;
5767                 break;
5768             }
5769         }
5770 
5771         /* Check for blank line. */
5772         if(off >= ctx->size  ||  ISNEWLINE(off)) {
5773             if(pivot_line->type == MD_LINE_INDENTEDCODE  &&  n_parents == ctx->n_containers) {
5774                 line->type = MD_LINE_INDENTEDCODE;
5775                 if(line->indent > ctx->code_indent_offset)
5776                     line->indent -= ctx->code_indent_offset;
5777                 else
5778                     line->indent = 0;
5779                 ctx->last_line_has_list_loosening_effect = FALSE;
5780             } else {
5781                 line->type = MD_LINE_BLANK;
5782                 ctx->last_line_has_list_loosening_effect = (n_parents > 0  &&
5783                         n_brothers + n_children == 0  &&
5784                         ctx->containers[n_parents-1].ch != _T('>'));
5785 
5786     #if 1
5787                 /* See https://github.com/mity/md4c/issues/6
5788                  *
5789                  * This ugly checking tests we are in (yet empty) list item but not
5790                  * its very first line (with the list item mark).
5791                  *
5792                  * If we are such blank line, then any following non-blank line
5793                  * which would be part of this list item actually ends the list
5794                  * because "a list item can begin with at most one blank line."
5795                  */
5796                 if(n_parents > 0  &&  ctx->containers[n_parents-1].ch != _T('>')  &&
5797                    n_brothers + n_children == 0  &&  ctx->current_block == NULL  &&
5798                    ctx->n_block_bytes > (int) sizeof(MD_BLOCK))
5799                 {
5800                     MD_BLOCK* top_block = (MD_BLOCK*) ((char*)ctx->block_bytes + ctx->n_block_bytes - sizeof(MD_BLOCK));
5801                     if(top_block->type == MD_BLOCK_LI)
5802                         ctx->last_list_item_starts_with_two_blank_lines = TRUE;
5803                 }
5804     #endif
5805             }
5806             break;
5807         } else {
5808     #if 1
5809             /* This is 2nd half of the hack. If the flag is set (that is there
5810              * were 2nd blank line at the start of the list item) and we would also
5811              * belonging to such list item, than interrupt the list. */
5812             ctx->last_line_has_list_loosening_effect = FALSE;
5813             if(ctx->last_list_item_starts_with_two_blank_lines) {
5814                 if(n_parents > 0  &&  ctx->containers[n_parents-1].ch != _T('>')  &&
5815                    n_brothers + n_children == 0  &&  ctx->current_block == NULL  &&
5816                    ctx->n_block_bytes > (int) sizeof(MD_BLOCK))
5817                 {
5818                     MD_BLOCK* top_block = (MD_BLOCK*) ((char*)ctx->block_bytes + ctx->n_block_bytes - sizeof(MD_BLOCK));
5819                     if(top_block->type == MD_BLOCK_LI)
5820                         n_parents--;
5821                 }
5822 
5823                 ctx->last_list_item_starts_with_two_blank_lines = FALSE;
5824             }
5825     #endif
5826         }
5827 
5828         /* Check whether we are Setext underline. */
5829         if(line->indent < ctx->code_indent_offset  &&  pivot_line->type == MD_LINE_TEXT
5830             &&  (CH(off) == _T('=') || CH(off) == _T('-'))
5831             &&  (n_parents == ctx->n_containers))
5832         {
5833             unsigned level;
5834 
5835             if(md_is_setext_underline(ctx, off, &off, &level)) {
5836                 line->type = MD_LINE_SETEXTUNDERLINE;
5837                 line->data = level;
5838                 break;
5839             }
5840         }
5841 
5842         /* Check for thematic break line. */
5843         if(line->indent < ctx->code_indent_offset  &&  ISANYOF(off, _T("-_*"))  &&  off >= hr_killer) {
5844             if(md_is_hr_line(ctx, off, &off, &hr_killer)) {
5845                 line->type = MD_LINE_HR;
5846                 break;
5847             }
5848         }
5849 
5850         /* Check for "brother" container. I.e. whether we are another list item
5851          * in already started list. */
5852         if(n_parents < ctx->n_containers  &&  n_brothers + n_children == 0) {
5853             OFF tmp;
5854 
5855             if(md_is_container_mark(ctx, line->indent, off, &tmp, &container)  &&
5856                md_is_container_compatible(&ctx->containers[n_parents], &container))
5857             {
5858                 pivot_line = &md_dummy_blank_line;
5859 
5860                 off = tmp;
5861 
5862                 total_indent += container.contents_indent - container.mark_indent;
5863                 line->indent = md_line_indentation(ctx, total_indent, off, &off);
5864                 total_indent += line->indent;
5865                 line->beg = off;
5866 
5867                 /* Some of the following whitespace actually still belongs to the mark. */
5868                 if(off >= ctx->size || ISNEWLINE(off)) {
5869                     container.contents_indent++;
5870                 } else if(line->indent <= ctx->code_indent_offset) {
5871                     container.contents_indent += line->indent;
5872                     line->indent = 0;
5873                 } else {
5874                     container.contents_indent += 1;
5875                     line->indent--;
5876                 }
5877 
5878                 ctx->containers[n_parents].mark_indent = container.mark_indent;
5879                 ctx->containers[n_parents].contents_indent = container.contents_indent;
5880 
5881                 n_brothers++;
5882                 continue;
5883             }
5884         }
5885 
5886         /* Check for indented code.
5887          * Note indented code block cannot interrupt a paragraph. */
5888         if(line->indent >= ctx->code_indent_offset  &&
5889             (pivot_line->type == MD_LINE_BLANK || pivot_line->type == MD_LINE_INDENTEDCODE))
5890         {
5891             line->type = MD_LINE_INDENTEDCODE;
5892             MD_ASSERT(line->indent >= ctx->code_indent_offset);
5893             line->indent -= ctx->code_indent_offset;
5894             line->data = 0;
5895             break;
5896         }
5897 
5898         /* Check for start of a new container block. */
5899         if(line->indent < ctx->code_indent_offset  &&
5900            md_is_container_mark(ctx, line->indent, off, &off, &container))
5901         {
5902             if(pivot_line->type == MD_LINE_TEXT  &&  n_parents == ctx->n_containers  &&
5903                         (off >= ctx->size || ISNEWLINE(off))  &&  container.ch != _T('>'))
5904             {
5905                 /* Noop. List mark followed by a blank line cannot interrupt a paragraph. */
5906             } else if(pivot_line->type == MD_LINE_TEXT  &&  n_parents == ctx->n_containers  &&
5907                         (container.ch == _T('.') || container.ch == _T(')'))  &&  container.start != 1)
5908             {
5909                 /* Noop. Ordered list cannot interrupt a paragraph unless the start index is 1. */
5910             } else {
5911                 total_indent += container.contents_indent - container.mark_indent;
5912                 line->indent = md_line_indentation(ctx, total_indent, off, &off);
5913                 total_indent += line->indent;
5914 
5915                 line->beg = off;
5916                 line->data = container.ch;
5917 
5918                 /* Some of the following whitespace actually still belongs to the mark. */
5919                 if(off >= ctx->size || ISNEWLINE(off)) {
5920                     container.contents_indent++;
5921                 } else if(line->indent <= ctx->code_indent_offset) {
5922                     container.contents_indent += line->indent;
5923                     line->indent = 0;
5924                 } else {
5925                     container.contents_indent += 1;
5926                     line->indent--;
5927                 }
5928 
5929                 if(n_brothers + n_children == 0)
5930                     pivot_line = &md_dummy_blank_line;
5931 
5932                 if(n_children == 0)
5933                     MD_CHECK(md_leave_child_containers(ctx, n_parents + n_brothers));
5934 
5935                 n_children++;
5936                 MD_CHECK(md_push_container(ctx, &container));
5937                 continue;
5938             }
5939         }
5940 
5941         /* Check whether we are table continuation. */
5942         if(pivot_line->type == MD_LINE_TABLE  &&  n_parents == ctx->n_containers) {
5943             line->type = MD_LINE_TABLE;
5944             break;
5945         }
5946 
5947         /* Check for ATX header. */
5948         if(line->indent < ctx->code_indent_offset  &&  CH(off) == _T('#')) {
5949             unsigned level;
5950 
5951             if(md_is_atxheader_line(ctx, off, &line->beg, &off, &level)) {
5952                 line->type = MD_LINE_ATXHEADER;
5953                 line->data = level;
5954                 break;
5955             }
5956         }
5957 
5958         /* Check whether we are starting code fence. */
5959         if(CH(off) == _T('`') || CH(off) == _T('~')) {
5960             if(md_is_opening_code_fence(ctx, off, &off)) {
5961                 line->type = MD_LINE_FENCEDCODE;
5962                 line->data = 1;
5963                 break;
5964             }
5965         }
5966 
5967         /* Check for start of raw HTML block. */
5968         if(CH(off) == _T('<')  &&  !(ctx->parser.flags & MD_FLAG_NOHTMLBLOCKS))
5969         {
5970             ctx->html_block_type = md_is_html_block_start_condition(ctx, off);
5971 
5972             /* HTML block type 7 cannot interrupt paragraph. */
5973             if(ctx->html_block_type == 7  &&  pivot_line->type == MD_LINE_TEXT)
5974                 ctx->html_block_type = 0;
5975 
5976             if(ctx->html_block_type > 0) {
5977                 /* The line itself also may immediately close the block. */
5978                 if(md_is_html_block_end_condition(ctx, off, &off) == ctx->html_block_type) {
5979                     /* Make sure this is the last line of the block. */
5980                     ctx->html_block_type = 0;
5981                 }
5982 
5983                 line->type = MD_LINE_HTML;
5984                 break;
5985             }
5986         }
5987 
5988         /* Check for table underline. */
5989         if((ctx->parser.flags & MD_FLAG_TABLES)  &&  pivot_line->type == MD_LINE_TEXT  &&
5990            (CH(off) == _T('|') || CH(off) == _T('-') || CH(off) == _T(':'))  &&
5991            n_parents == ctx->n_containers)
5992         {
5993             unsigned col_count;
5994 
5995             if(ctx->current_block != NULL  &&  ctx->current_block->n_lines == 1  &&
5996                 md_is_table_underline(ctx, off, &off, &col_count))
5997             {
5998                 line->data = col_count;
5999                 line->type = MD_LINE_TABLEUNDERLINE;
6000                 break;
6001             }
6002         }
6003 
6004         /* By default, we are normal text line. */
6005         line->type = MD_LINE_TEXT;
6006         if(pivot_line->type == MD_LINE_TEXT  &&  n_brothers + n_children == 0) {
6007             /* Lazy continuation. */
6008             n_parents = ctx->n_containers;
6009         }
6010 
6011         /* Check for task mark. */
6012         if((ctx->parser.flags & MD_FLAG_TASKLISTS)  &&  n_brothers + n_children > 0  &&
6013            ISANYOF_(ctx->containers[ctx->n_containers-1].ch, _T("-+*.)")))
6014         {
6015             OFF tmp = off;
6016 
6017             while(tmp < ctx->size  &&  tmp < off + 3  &&  ISBLANK(tmp))
6018                 tmp++;
6019             if(tmp + 2 < ctx->size  &&  CH(tmp) == _T('[')  &&
6020                ISANYOF(tmp+1, _T("xX "))  &&  CH(tmp+2) == _T(']')  &&
6021                (tmp + 3 == ctx->size  ||  ISBLANK(tmp+3)  ||  ISNEWLINE(tmp+3)))
6022             {
6023                 MD_CONTAINER* task_container = (n_children > 0 ? &ctx->containers[ctx->n_containers-1] : &container);
6024                 task_container->is_task = TRUE;
6025                 task_container->task_mark_off = tmp + 1;
6026                 off = tmp + 3;
6027                 while(ISWHITESPACE(off))
6028                     off++;
6029                 line->beg = off;
6030             }
6031         }
6032 
6033         break;
6034     }
6035 
6036     /* Scan for end of the line.
6037      *
6038      * Note this is quite a bottleneck of the parsing as we here iterate almost
6039      * over compete document.
6040      */
6041 #if defined __linux__ && !defined MD4C_USE_UTF16
6042     /* Recent glibc versions have superbly optimized strcspn(), even using
6043      * vectorization if available. */
6044     if(ctx->doc_ends_with_newline  &&  off < ctx->size) {
6045         while(TRUE) {
6046             off += (OFF) strcspn(STR(off), "\r\n");
6047 
6048             /* strcspn() can stop on zero terminator; but that can appear
6049              * anywhere in the Markfown input... */
6050             if(CH(off) == _T('\0'))
6051                 off++;
6052             else
6053                 break;
6054         }
6055     } else
6056 #endif
6057     {
6058         /* Optimization: Use some loop unrolling. */
6059         while(off + 3 < ctx->size  &&  !ISNEWLINE(off+0)  &&  !ISNEWLINE(off+1)
6060                                    &&  !ISNEWLINE(off+2)  &&  !ISNEWLINE(off+3))
6061             off += 4;
6062         while(off < ctx->size  &&  !ISNEWLINE(off))
6063             off++;
6064     }
6065 
6066     /* Set end of the line. */
6067     line->end = off;
6068 
6069     /* But for ATX header, we should exclude the optional trailing mark. */
6070     if(line->type == MD_LINE_ATXHEADER) {
6071         OFF tmp = line->end;
6072         while(tmp > line->beg && CH(tmp-1) == _T(' '))
6073             tmp--;
6074         while(tmp > line->beg && CH(tmp-1) == _T('#'))
6075             tmp--;
6076         if(tmp == line->beg || CH(tmp-1) == _T(' ') || (ctx->parser.flags & MD_FLAG_PERMISSIVEATXHEADERS))
6077             line->end = tmp;
6078     }
6079 
6080     /* Trim trailing spaces. */
6081     if(line->type != MD_LINE_INDENTEDCODE  &&  line->type != MD_LINE_FENCEDCODE) {
6082         while(line->end > line->beg && CH(line->end-1) == _T(' '))
6083             line->end--;
6084     }
6085 
6086     /* Eat also the new line. */
6087     if(off < ctx->size && CH(off) == _T('\r'))
6088         off++;
6089     if(off < ctx->size && CH(off) == _T('\n'))
6090         off++;
6091 
6092     *p_end = off;
6093 
6094     /* If we belong to a list after seeing a blank line, the list is loose. */
6095     if(prev_line_has_list_loosening_effect  &&  line->type != MD_LINE_BLANK  &&  n_parents + n_brothers > 0) {
6096         MD_CONTAINER* c = &ctx->containers[n_parents + n_brothers - 1];
6097         if(c->ch != _T('>')) {
6098             MD_BLOCK* block = (MD_BLOCK*) (((char*)ctx->block_bytes) + c->block_byte_off);
6099             block->flags |= MD_BLOCK_LOOSE_LIST;
6100         }
6101     }
6102 
6103     /* Leave any containers we are not part of anymore. */
6104     if(n_children == 0  &&  n_parents + n_brothers < ctx->n_containers)
6105         MD_CHECK(md_leave_child_containers(ctx, n_parents + n_brothers));
6106 
6107     /* Enter any container we found a mark for. */
6108     if(n_brothers > 0) {
6109         MD_ASSERT(n_brothers == 1);
6110         MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI,
6111                     ctx->containers[n_parents].task_mark_off,
6112                     (ctx->containers[n_parents].is_task ? CH(ctx->containers[n_parents].task_mark_off) : 0),
6113                     MD_BLOCK_CONTAINER_CLOSER));
6114         MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI,
6115                     container.task_mark_off,
6116                     (container.is_task ? CH(container.task_mark_off) : 0),
6117                     MD_BLOCK_CONTAINER_OPENER));
6118         ctx->containers[n_parents].is_task = container.is_task;
6119         ctx->containers[n_parents].task_mark_off = container.task_mark_off;
6120     }
6121 
6122     if(n_children > 0)
6123         MD_CHECK(md_enter_child_containers(ctx, n_children, line->data));
6124 
6125 abort:
6126     return ret;
6127 }
6128 
6129 static int
md_process_line(MD_CTX * ctx,const MD_LINE_ANALYSIS ** p_pivot_line,MD_LINE_ANALYSIS * line)6130 md_process_line(MD_CTX* ctx, const MD_LINE_ANALYSIS** p_pivot_line, MD_LINE_ANALYSIS* line)
6131 {
6132     const MD_LINE_ANALYSIS* pivot_line = *p_pivot_line;
6133     int ret = 0;
6134 
6135     /* Blank line ends current leaf block. */
6136     if(line->type == MD_LINE_BLANK) {
6137         MD_CHECK(md_end_current_block(ctx));
6138         *p_pivot_line = &md_dummy_blank_line;
6139         return 0;
6140     }
6141 
6142     /* Some line types form block on their own. */
6143     if(line->type == MD_LINE_HR || line->type == MD_LINE_ATXHEADER) {
6144         MD_CHECK(md_end_current_block(ctx));
6145 
6146         /* Add our single-line block. */
6147         MD_CHECK(md_start_new_block(ctx, line));
6148         MD_CHECK(md_add_line_into_current_block(ctx, line));
6149         MD_CHECK(md_end_current_block(ctx));
6150         *p_pivot_line = &md_dummy_blank_line;
6151         return 0;
6152     }
6153 
6154     /* MD_LINE_SETEXTUNDERLINE changes meaning of the current block and ends it. */
6155     if(line->type == MD_LINE_SETEXTUNDERLINE) {
6156         MD_ASSERT(ctx->current_block != NULL);
6157         ctx->current_block->type = MD_BLOCK_H;
6158         ctx->current_block->data = line->data;
6159         ctx->current_block->flags |= MD_BLOCK_SETEXT_HEADER;
6160         MD_CHECK(md_add_line_into_current_block(ctx, line));
6161         MD_CHECK(md_end_current_block(ctx));
6162         if(ctx->current_block == NULL) {
6163             *p_pivot_line = &md_dummy_blank_line;
6164         } else {
6165             /* This happens if we have consumed all the body as link ref. defs.
6166              * and downgraded the underline into start of a new paragraph block. */
6167             line->type = MD_LINE_TEXT;
6168             *p_pivot_line = line;
6169         }
6170         return 0;
6171     }
6172 
6173     /* MD_LINE_TABLEUNDERLINE changes meaning of the current block. */
6174     if(line->type == MD_LINE_TABLEUNDERLINE) {
6175         MD_ASSERT(ctx->current_block != NULL);
6176         MD_ASSERT(ctx->current_block->n_lines == 1);
6177         ctx->current_block->type = MD_BLOCK_TABLE;
6178         ctx->current_block->data = line->data;
6179         MD_ASSERT(pivot_line != &md_dummy_blank_line);
6180         ((MD_LINE_ANALYSIS*)pivot_line)->type = MD_LINE_TABLE;
6181         MD_CHECK(md_add_line_into_current_block(ctx, line));
6182         return 0;
6183     }
6184 
6185     /* The current block also ends if the line has different type. */
6186     if(line->type != pivot_line->type)
6187         MD_CHECK(md_end_current_block(ctx));
6188 
6189     /* The current line may start a new block. */
6190     if(ctx->current_block == NULL) {
6191         MD_CHECK(md_start_new_block(ctx, line));
6192         *p_pivot_line = line;
6193     }
6194 
6195     /* In all other cases the line is just a continuation of the current block. */
6196     MD_CHECK(md_add_line_into_current_block(ctx, line));
6197 
6198 abort:
6199     return ret;
6200 }
6201 
6202 static int
md_process_doc(MD_CTX * ctx)6203 md_process_doc(MD_CTX *ctx)
6204 {
6205     const MD_LINE_ANALYSIS* pivot_line = &md_dummy_blank_line;
6206     MD_LINE_ANALYSIS line_buf[2];
6207     MD_LINE_ANALYSIS* line = &line_buf[0];
6208     OFF off = 0;
6209     int ret = 0;
6210 
6211     MD_ENTER_BLOCK(MD_BLOCK_DOC, NULL);
6212 
6213     while(off < ctx->size) {
6214         if(line == pivot_line)
6215             line = (line == &line_buf[0] ? &line_buf[1] : &line_buf[0]);
6216 
6217         MD_CHECK(md_analyze_line(ctx, off, &off, pivot_line, line));
6218         MD_CHECK(md_process_line(ctx, &pivot_line, line));
6219     }
6220 
6221     md_end_current_block(ctx);
6222 
6223     MD_CHECK(md_build_ref_def_hashtable(ctx));
6224 
6225     /* Process all blocks. */
6226     MD_CHECK(md_leave_child_containers(ctx, 0));
6227     MD_CHECK(md_process_all_blocks(ctx));
6228 
6229     MD_LEAVE_BLOCK(MD_BLOCK_DOC, NULL);
6230 
6231 abort:
6232 
6233 #if 0
6234     /* Output some memory consumption statistics. */
6235     {
6236         char buffer[256];
6237         sprintf(buffer, "Alloced %u bytes for block buffer.",
6238                     (unsigned)(ctx->alloc_block_bytes));
6239         MD_LOG(buffer);
6240 
6241         sprintf(buffer, "Alloced %u bytes for containers buffer.",
6242                     (unsigned)(ctx->alloc_containers * sizeof(MD_CONTAINER)));
6243         MD_LOG(buffer);
6244 
6245         sprintf(buffer, "Alloced %u bytes for marks buffer.",
6246                     (unsigned)(ctx->alloc_marks * sizeof(MD_MARK)));
6247         MD_LOG(buffer);
6248 
6249         sprintf(buffer, "Alloced %u bytes for aux. buffer.",
6250                     (unsigned)(ctx->alloc_buffer * sizeof(MD_CHAR)));
6251         MD_LOG(buffer);
6252     }
6253 #endif
6254 
6255     return ret;
6256 }
6257 
6258 
6259 /********************
6260  ***  Public API  ***
6261  ********************/
6262 
6263 int
md_parse(const MD_CHAR * text,MD_SIZE size,const MD_PARSER * parser,void * userdata)6264 md_parse(const MD_CHAR* text, MD_SIZE size, const MD_PARSER* parser, void* userdata)
6265 {
6266     MD_CTX ctx;
6267     int i;
6268     int ret;
6269 
6270     if(parser->abi_version != 0) {
6271         if(parser->debug_log != NULL)
6272             parser->debug_log("Unsupported abi_version.", userdata);
6273         return -1;
6274     }
6275 
6276     /* Setup context structure. */
6277     memset(&ctx, 0, sizeof(MD_CTX));
6278     ctx.text = text;
6279     ctx.size = size;
6280     memcpy(&ctx.parser, parser, sizeof(MD_PARSER));
6281     ctx.userdata = userdata;
6282     ctx.code_indent_offset = (ctx.parser.flags & MD_FLAG_NOINDENTEDCODEBLOCKS) ? (OFF)(-1) : 4;
6283     md_build_mark_char_map(&ctx);
6284     ctx.doc_ends_with_newline = (size > 0  &&  ISNEWLINE_(text[size-1]));
6285 
6286     /* Reset all unresolved opener mark chains. */
6287     for(i = 0; i < (int) SIZEOF_ARRAY(ctx.mark_chains); i++) {
6288         ctx.mark_chains[i].head = -1;
6289         ctx.mark_chains[i].tail = -1;
6290     }
6291     ctx.unresolved_link_head = -1;
6292     ctx.unresolved_link_tail = -1;
6293 
6294     /* All the work. */
6295     ret = md_process_doc(&ctx);
6296 
6297     /* Clean-up. */
6298     md_free_ref_defs(&ctx);
6299     md_free_ref_def_hashtable(&ctx);
6300     free(ctx.buffer);
6301     free(ctx.marks);
6302     free(ctx.block_bytes);
6303     free(ctx.containers);
6304 
6305     return ret;
6306 }
6307