1 /*
2 * MD4C: Markdown parser for C
3 * (http://github.com/mity/md4c)
4 *
5 * Copyright (c) 2016-2020 Martin Mitas
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a
8 * copy of this software and associated documentation files (the "Software"),
9 * to deal in the Software without restriction, including without limitation
10 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
11 * and/or sell copies of the Software, and to permit persons to whom the
12 * Software is furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be included in
15 * all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
23 * IN THE SOFTWARE.
24 */
25
26 #include "md4c.h"
27
28 #include <limits.h>
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
32
33
34 /*****************************
35 *** Miscellaneous Stuff ***
36 *****************************/
37
#if !defined(__STDC_VERSION__) || __STDC_VERSION__ < 199901L
    /* "inline" is a keyword only since C99 (__STDC_VERSION__ == 199901L).
     * C89/C90 and C95 (199409L) compilers may not understand it, so map it
     * to a compiler-specific spelling, or drop it altogether. */
    #if defined __GNUC__
        #define inline __inline__
    #elif defined _MSC_VER
        #define inline __inline
    #else
        #define inline
    #endif
#endif
48
/* Unless the build explicitly selects an encoding, assume UTF-8. */
#if !(defined MD4C_USE_ASCII || defined MD4C_USE_UTF8 || defined MD4C_USE_UTF16)
    #define MD4C_USE_UTF8
#endif

/* _T() wraps character and string literals so that they become wide
 * literals in the MD4C_USE_UTF16 build and stay untouched otherwise.
 * Any definition of _T already in effect is discarded first. */
#undef _T
#ifdef MD4C_USE_UTF16
    #define _T(x)           L##x
#else
    #define _T(x)           x
#endif
63
/* Misc. macros. */

/* Count of elements of a true array. (It gives a meaningless result when
 * used with a pointer.) The argument is parenthesized so that any
 * expression of an array type can be passed in safely. */
#define SIZEOF_ARRAY(a)     (sizeof(a) / sizeof((a)[0]))

/* Two-level stringification: STRINGIZE() expands its argument first and
 * only then turns it into a string literal; STRINGIZE_() does not expand. */
#define STRINGIZE_(x)       #x
#define STRINGIZE(x)        STRINGIZE_(x)

/* Boolean constants. Each one is guarded on its own so that a build which
 * provides only one of them still gets the other. */
#ifndef TRUE
    #define TRUE            1
#endif
#ifndef FALSE
    #define FALSE           0
#endif
74
75
76 /************************
77 *** Internal Types ***
78 ************************/
79
/* Shorthands for the public types: these are omnipresent, so let's save
 * some typing. */
#define CHAR    MD_CHAR
#define SZ      MD_SIZE
#define OFF     MD_OFFSET

/* Forward declarations. Only the type names are introduced here; the
 * structures themselves are not defined at this point. */
typedef struct MD_REF_DEF_tag MD_REF_DEF;
typedef struct MD_CONTAINER_tag MD_CONTAINER;
typedef struct MD_BLOCK_tag MD_BLOCK;
typedef struct MD_MARK_tag MD_MARK;
89
90
/* During the analysis of inline marks, we need to manage some "mark chains"
 * of (yet unresolved) openers. This structure holds only both ends of such
 * a chain; the links between its members are realized through
 * MD_MARK::prev and MD_MARK::next.
 */
typedef struct MD_MARKCHAIN_tag {
    int head;   /* Index of the first mark in the chain, or -1 if empty. */
    int tail;   /* Index of the last mark in the chain, or -1 if empty. */
} MD_MARKCHAIN;
100
101 /* Context propagated through all the parsing. */
102 typedef struct MD_CTX_tag MD_CTX;
103 struct MD_CTX_tag {
104 /* Immutable stuff (parameters of md_parse()). */
105 const CHAR* text;
106 SZ size;
107 MD_PARSER parser;
108 void* userdata;
109
110 /* When this is true, it allows some optimizations. */
111 int doc_ends_with_newline;
112
113 /* Helper temporary growing buffer. */
114 CHAR* buffer;
115 unsigned alloc_buffer;
116
117 /* Reference definitions. */
118 MD_REF_DEF* ref_defs;
119 int n_ref_defs;
120 int alloc_ref_defs;
121 void** ref_def_hashtable;
122 int ref_def_hashtable_size;
123
124 /* Stack of inline/span markers.
125 * This is only used for parsing a single block contents but by storing it
126 * here we may reuse the stack for subsequent blocks; i.e. we have fewer
127 * (re)allocations. */
128 MD_MARK* marks;
129 int n_marks;
130 int alloc_marks;
131
132 #if defined MD4C_USE_UTF16
133 char mark_char_map[128];
134 #else
135 char mark_char_map[256];
136 #endif
137
138 /* For resolving of inline spans. */
139 MD_MARKCHAIN mark_chains[13];
140 #define PTR_CHAIN (ctx->mark_chains[0])
141 #define TABLECELLBOUNDARIES (ctx->mark_chains[1])
142 #define ASTERISK_OPENERS_extraword_mod3_0 (ctx->mark_chains[2])
143 #define ASTERISK_OPENERS_extraword_mod3_1 (ctx->mark_chains[3])
144 #define ASTERISK_OPENERS_extraword_mod3_2 (ctx->mark_chains[4])
145 #define ASTERISK_OPENERS_intraword_mod3_0 (ctx->mark_chains[5])
146 #define ASTERISK_OPENERS_intraword_mod3_1 (ctx->mark_chains[6])
147 #define ASTERISK_OPENERS_intraword_mod3_2 (ctx->mark_chains[7])
148 #define UNDERSCORE_OPENERS (ctx->mark_chains[8])
149 #define TILDE_OPENERS_1 (ctx->mark_chains[9])
150 #define TILDE_OPENERS_2 (ctx->mark_chains[10])
151 #define BRACKET_OPENERS (ctx->mark_chains[11])
152 #define DOLLAR_OPENERS (ctx->mark_chains[12])
153 #define OPENERS_CHAIN_FIRST 2
154 #define OPENERS_CHAIN_LAST 12
155
156 int n_table_cell_boundaries;
157
158 /* For resolving links. */
159 int unresolved_link_head;
160 int unresolved_link_tail;
161
162 /* For resolving raw HTML. */
163 OFF html_comment_horizon;
164 OFF html_proc_instr_horizon;
165 OFF html_decl_horizon;
166 OFF html_cdata_horizon;
167
168 /* For block analysis.
169 * Notes:
170 * -- It holds MD_BLOCK as well as MD_LINE structures. After each
171 * MD_BLOCK, its (multiple) MD_LINE(s) follow.
172 * -- For MD_BLOCK_HTML and MD_BLOCK_CODE, MD_VERBATIMLINE(s) are used
173 * instead of MD_LINE(s).
174 */
175 void* block_bytes;
176 MD_BLOCK* current_block;
177 int n_block_bytes;
178 int alloc_block_bytes;
179
180 /* For container block analysis. */
181 MD_CONTAINER* containers;
182 int n_containers;
183 int alloc_containers;
184
185 /* Minimal indentation to call the block "indented code block". */
186 unsigned code_indent_offset;
187
188 /* Contextual info for line analysis. */
189 SZ code_fence_length; /* For checking closing fence length. */
190 int html_block_type; /* For checking closing raw HTML condition. */
191 int last_line_has_list_loosening_effect;
192 int last_list_item_starts_with_two_blank_lines;
193 };
194
/* Kinds of lines as distinguished by the line analysis. */
typedef enum MD_LINETYPE_tag {
    MD_LINE_BLANK = 0,
    MD_LINE_HR,
    MD_LINE_ATXHEADER,
    MD_LINE_SETEXTHEADER,
    MD_LINE_SETEXTUNDERLINE,
    MD_LINE_INDENTEDCODE,
    MD_LINE_FENCEDCODE,
    MD_LINE_HTML,
    MD_LINE_TEXT,
    MD_LINE_TABLE,
    MD_LINE_TABLEUNDERLINE
} MD_LINETYPE;
209
210 typedef struct MD_LINE_ANALYSIS_tag MD_LINE_ANALYSIS;
211 struct MD_LINE_ANALYSIS_tag {
212 MD_LINETYPE type : 16;
213 unsigned data : 16;
214 OFF beg;
215 OFF end;
216 unsigned indent; /* Indentation level. */
217 };
218
219 typedef struct MD_LINE_tag MD_LINE;
220 struct MD_LINE_tag {
221 OFF beg;
222 OFF end;
223 };
224
225 typedef struct MD_VERBATIMLINE_tag MD_VERBATIMLINE;
226 struct MD_VERBATIMLINE_tag {
227 OFF beg;
228 OFF end;
229 OFF indent;
230 };
231
232
233 /*******************
234 *** Debugging ***
235 *******************/
236
/* Pass a message to the debug_log() callback of the parser, if there is
 * any. A variable "ctx" has to be available in the scope of the caller. */
#define MD_LOG(msg)                                                     \
    do {                                                                \
        if(ctx->parser.debug_log != NULL)                               \
            ctx->parser.debug_log((msg), ctx->userdata);                \
    } while(0)

#if defined DEBUG
    /* Debug build: a failed assertion is logged through MD_LOG() and the
     * process is terminated. */
    #define MD_ASSERT(cond)                                             \
            do {                                                        \
                if(!(cond)) {                                           \
                    MD_LOG(__FILE__ ":" STRINGIZE(__LINE__) ": "        \
                           "Assertion '" STRINGIZE(cond) "' failed.");  \
                    exit(1);                                            \
                }                                                       \
            } while(0)

    #define MD_UNREACHABLE()        MD_ASSERT(1 == 0)
#elif defined __GNUC__
    /* Release build: the assertions only serve as hints for the optimizer
     * (where the compiler offers a way to express them). */
    #define MD_ASSERT(cond)         do { if(!(cond)) __builtin_unreachable(); } while(0)
    #define MD_UNREACHABLE()        do { __builtin_unreachable(); } while(0)
#elif defined _MSC_VER  &&  _MSC_VER > 120
    #define MD_ASSERT(cond)         do { __assume(cond); } while(0)
    #define MD_UNREACHABLE()        do { __assume(0); } while(0)
#else
    /* Unknown compiler: the assertions expand to nothing. */
    #define MD_ASSERT(cond)         do {} while(0)
    #define MD_UNREACHABLE()        do {} while(0)
#endif
266
267
268 /*****************
269 *** Helpers ***
270 *****************/
271
/* Character accessors. They index into the parsed document and expect
 * a variable "ctx" in the scope of the caller. */
#define CH(off)                 (ctx->text[(off)])
#define STR(off)                (ctx->text + (off))

/* Search for a character in a zero-terminated string of the literal type
 * matching the build (wide strings with MD4C_USE_UTF16). */
#if defined MD4C_USE_UTF16
    #define md_strchr wcschr
#else
    #define md_strchr strchr
#endif

/* Character classification.
 * Note we assume ASCII compatibility of code points < 128 here.
 *
 * The macros with the trailing underscore take a character; note they may
 * evaluate the argument more than once. */
#define ISIN_(ch, ch_min, ch_max)       ((ch_min) <= (unsigned)(ch) && (unsigned)(ch) <= (ch_max))
#define ISANYOF_(ch, palette)           ((ch) != _T('\0')  &&  md_strchr((palette), (ch)) != NULL)
#define ISANYOF2_(ch, ch1, ch2)         ((ch) == (ch1) || (ch) == (ch2))
#define ISANYOF3_(ch, ch1, ch2, ch3)    ((ch) == (ch1) || (ch) == (ch2) || (ch) == (ch3))
#define ISASCII_(ch)                    ((unsigned)(ch) <= 127)
#define ISBLANK_(ch)                    (ISANYOF2_((ch), _T(' '), _T('\t')))
#define ISNEWLINE_(ch)                  (ISANYOF2_((ch), _T('\r'), _T('\n')))
#define ISWHITESPACE_(ch)               (ISBLANK_(ch) || ISANYOF2_((ch), _T('\v'), _T('\f')))
#define ISCNTRL_(ch)                    ((unsigned)(ch) <= 31 || (unsigned)(ch) == 127)
#define ISPUNCT_(ch)                    (ISIN_(ch, 33, 47) || ISIN_(ch, 58, 64) || ISIN_(ch, 91, 96) || ISIN_(ch, 123, 126))
#define ISUPPER_(ch)                    (ISIN_(ch, _T('A'), _T('Z')))
#define ISLOWER_(ch)                    (ISIN_(ch, _T('a'), _T('z')))
#define ISALPHA_(ch)                    (ISUPPER_(ch) || ISLOWER_(ch))
#define ISDIGIT_(ch)                    (ISIN_(ch, _T('0'), _T('9')))
#define ISXDIGIT_(ch)                   (ISDIGIT_(ch) || ISIN_(ch, _T('A'), _T('F')) || ISIN_(ch, _T('a'), _T('f')))
#define ISALNUM_(ch)                    (ISALPHA_(ch) || ISDIGIT_(ch))

/* The same classification, but for the character living in the document
 * on the given offset. */
#define ISANYOF(off, palette)           ISANYOF_(CH(off), (palette))
#define ISANYOF2(off, ch1, ch2)         ISANYOF2_(CH(off), (ch1), (ch2))
#define ISANYOF3(off, ch1, ch2, ch3)    ISANYOF3_(CH(off), (ch1), (ch2), (ch3))
#define ISASCII(off)                    ISASCII_(CH(off))
#define ISBLANK(off)                    ISBLANK_(CH(off))
#define ISNEWLINE(off)                  ISNEWLINE_(CH(off))
#define ISWHITESPACE(off)               ISWHITESPACE_(CH(off))
#define ISCNTRL(off)                    ISCNTRL_(CH(off))
#define ISPUNCT(off)                    ISPUNCT_(CH(off))
#define ISUPPER(off)                    ISUPPER_(CH(off))
#define ISLOWER(off)                    ISLOWER_(CH(off))
#define ISALPHA(off)                    ISALPHA_(CH(off))
#define ISDIGIT(off)                    ISDIGIT_(CH(off))
#define ISXDIGIT(off)                   ISXDIGIT_(CH(off))
#define ISALNUM(off)                    ISALNUM_(CH(off))
317
318
319 /* Case insensitive check of string equality. */
320 static inline int
md_ascii_case_eq(const CHAR * s1,const CHAR * s2,SZ n)321 md_ascii_case_eq(const CHAR* s1, const CHAR* s2, SZ n)
322 {
323 OFF i;
324 for(i = 0; i < n; i++) {
325 CHAR ch1 = s1[i];
326 CHAR ch2 = s2[i];
327
328 if(ISLOWER_(ch1))
329 ch1 += ('A'-'a');
330 if(ISLOWER_(ch2))
331 ch2 += ('A'-'a');
332 if(ch1 != ch2)
333 return FALSE;
334 }
335 return TRUE;
336 }
337
338 static inline int
md_ascii_eq(const CHAR * s1,const CHAR * s2,SZ n)339 md_ascii_eq(const CHAR* s1, const CHAR* s2, SZ n)
340 {
341 return memcmp(s1, s2, n * sizeof(CHAR)) == 0;
342 }
343
344 static int
md_text_with_null_replacement(MD_CTX * ctx,MD_TEXTTYPE type,const CHAR * str,SZ size)345 md_text_with_null_replacement(MD_CTX* ctx, MD_TEXTTYPE type, const CHAR* str, SZ size)
346 {
347 OFF off = 0;
348 int ret = 0;
349
350 while(1) {
351 while(off < size && str[off] != _T('\0'))
352 off++;
353
354 if(off > 0) {
355 ret = ctx->parser.text(type, str, off, ctx->userdata);
356 if(ret != 0)
357 return ret;
358
359 str += off;
360 size -= off;
361 off = 0;
362 }
363
364 if(off >= size)
365 return 0;
366
367 ret = ctx->parser.text(MD_TEXT_NULLCHAR, _T(""), 1, ctx->userdata);
368 if(ret != 0)
369 return ret;
370 off++;
371 }
372 }
373
374
/* Evaluate the expression, store its value into the variable "ret" of the
 * caller and, if the value is negative, jump to the caller's label "abort". */
#define MD_CHECK(func)                                                      \
    do {                                                                    \
        if((ret = (func)) < 0)                                              \
            goto abort;                                                     \
    } while(0)
381
382
/* Make sure ctx->buffer provides at least sz bytes (the size is handed to
 * realloc() as it is, so it is counted in bytes, not in characters).
 *
 * When growing, the requested size is enlarged by one half and rounded to
 * a multiple of 128. On allocation failure, the old buffer is kept in ctx,
 * the failure is logged, the caller's variable "ret" is set to -1 and the
 * control jumps to the caller's label "abort".
 *
 * Note the argument is evaluated more than once. */
#define MD_TEMP_BUFFER(sz)                                                  \
    do {                                                                    \
        if((sz) > ctx->alloc_buffer) {                                      \
            CHAR* new_buffer;                                               \
            SZ new_size = ((sz) + (sz) / 2 + 128) & ~127;                   \
                                                                            \
            new_buffer = realloc(ctx->buffer, new_size);                    \
            if(new_buffer == NULL) {                                        \
                MD_LOG("realloc() failed.");                                \
                ret = -1;                                                   \
                goto abort;                                                 \
            }                                                               \
                                                                            \
            ctx->buffer = new_buffer;                                       \
            ctx->alloc_buffer = new_size;                                   \
        }                                                                   \
    } while(0)
400
401
/* Wrappers of the block and span callbacks of the parser.
 *
 * Each of them stores the value returned by the callback into the caller's
 * variable "ret". If it is non-zero, the abort is logged and the control
 * jumps to the caller's label "abort". */
#define MD_ENTER_BLOCK(type, arg)                                                   \
    do {                                                                            \
        if((ret = ctx->parser.enter_block((type), (arg), ctx->userdata)) != 0) {    \
            MD_LOG("Aborted from enter_block() callback.");                         \
            goto abort;                                                             \
        }                                                                           \
    } while(0)

#define MD_LEAVE_BLOCK(type, arg)                                                   \
    do {                                                                            \
        if((ret = ctx->parser.leave_block((type), (arg), ctx->userdata)) != 0) {    \
            MD_LOG("Aborted from leave_block() callback.");                         \
            goto abort;                                                             \
        }                                                                           \
    } while(0)

#define MD_ENTER_SPAN(type, arg)                                                    \
    do {                                                                            \
        if((ret = ctx->parser.enter_span((type), (arg), ctx->userdata)) != 0) {     \
            MD_LOG("Aborted from enter_span() callback.");                          \
            goto abort;                                                             \
        }                                                                           \
    } while(0)

#define MD_LEAVE_SPAN(type, arg)                                                    \
    do {                                                                            \
        if((ret = ctx->parser.leave_span((type), (arg), ctx->userdata)) != 0) {     \
            MD_LOG("Aborted from leave_span() callback.");                          \
            goto abort;                                                             \
        }                                                                           \
    } while(0)
437
/* Report a text of the given type through the text callback of the parser.
 * Nothing is reported when the size is zero.
 *
 * The value returned by the callback is stored into the caller's variable
 * "ret". If it is non-zero, the abort is logged and the control jumps to
 * the caller's label "abort".
 *
 * Note the size argument is evaluated more than once. */
#define MD_TEXT(type, str, size)                                            \
    do {                                                                    \
        if((size) > 0) {                                                    \
            ret = ctx->parser.text((type), (str), (size), ctx->userdata);   \
            if(ret != 0) {                                                  \
                MD_LOG("Aborted from text() callback.");                    \
                goto abort;                                                 \
            }                                                               \
        }                                                                   \
    } while(0)

/* The same as MD_TEXT(), but the text goes through
 * md_text_with_null_replacement() instead of directly to the callback. */
#define MD_TEXT_INSECURE(type, str, size)                                   \
    do {                                                                    \
        if((size) > 0) {                                                    \
            ret = md_text_with_null_replacement(ctx, (type), (str), (size)); \
            if(ret != 0) {                                                  \
                MD_LOG("Aborted from text() callback.");                    \
                goto abort;                                                 \
            }                                                               \
        }                                                                   \
    } while(0)
459
460
461
462 /*************************
463 *** Unicode Support ***
464 *************************/
465
/* Result of case folding of a single codepoint: up to three codepoints. */
typedef struct MD_UNICODE_FOLD_INFO_tag {
    unsigned codepoints[3];     /* The folded codepoint(s). */
    int n_codepoints;           /* Count of the valid items in codepoints[]. */
} MD_UNICODE_FOLD_INFO;
471
472
473 #if defined MD4C_USE_UTF16 || defined MD4C_USE_UTF8
/* Binary search in a sorted "map" of codepoints.
 *
 * A record of the map is either a single codepoint, or a range of
 * consecutive codepoints encoded as two adjacent items:
 * (MIN_CODEPOINT | 0x40000000) followed by (MAX_CODEPOINT | 0x80000000).
 *
 * Returns the index of the matching record in the map (for a range it is
 * the index of its first item, i.e. the one with the minimal codepoint),
 * or -1 if the codepoint is not covered by the map. */
static int
md_unicode_bsearch__(unsigned codepoint, const unsigned* map, size_t map_size)
{
    int lo = 0;
    int hi = (int) map_size - 1;

    while(lo <= hi) {
        int middle = (lo + hi) / 2;
        int first = middle;
        int last = middle;

        /* The middle item may be either end of a range. Widen the pivot so
         * that it covers the range as a whole. */
        if(map[middle] & 0x80000000)
            first--;
        if(map[middle] & 0x40000000)
            last++;

        if(codepoint < (map[first] & 0x00ffffff)) {
            hi = first - 1;
            continue;
        }
        if(codepoint > (map[last] & 0x00ffffff)) {
            lo = last + 1;
            continue;
        }

        return first;
    }

    return -1;
}
506
/* Check whether the codepoint is a whitespace.
 *
 * For codepoints up to 0x7f, the answer is given by ISWHITESPACE_(); for
 * the others by a lookup in the map of the Unicode "Zs" category.
 *
 * Returns non-zero for a whitespace, zero otherwise. */
static int
md_is_unicode_whitespace__(unsigned codepoint)
{
#define R(cp_min, cp_max)   ((cp_min) | 0x40000000), ((cp_max) | 0x80000000)
#define S(cp)               (cp)
    /* Unicode "Zs" category.
     * (generated by scripts/build_whitespace_map.py) */
    static const unsigned WHITESPACE_MAP[] = {
        S(0x0020), S(0x00a0), S(0x1680), R(0x2000,0x200a), S(0x202f), S(0x205f), S(0x3000)
    };
#undef R
#undef S

    if(codepoint > 0x7f)
        return (md_unicode_bsearch__(codepoint, WHITESPACE_MAP, SIZEOF_ARRAY(WHITESPACE_MAP)) >= 0);

    /* The ASCII ones are the most frequently used ones; also the CommonMark
     * specification requests a few more in this range. */
    return ISWHITESPACE_(codepoint);
}
527
/* Check whether the codepoint is a punctuation character.
 *
 * For codepoints up to 0x7f, the answer is given by ISPUNCT_(); for the
 * others by a lookup in the map of the Unicode punctuation categories.
 *
 * Returns non-zero for a punctuation character, zero otherwise. */
static int
md_is_unicode_punct__(unsigned codepoint)
{
#define R(cp_min, cp_max)   ((cp_min) | 0x40000000), ((cp_max) | 0x80000000)
#define S(cp)               (cp)
    /* Unicode "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps" categories.
     * (generated by scripts/build_punct_map.py) */
    static const unsigned PUNCT_MAP[] = {
        R(0x0021,0x0023), R(0x0025,0x002a), R(0x002c,0x002f), R(0x003a,0x003b), R(0x003f,0x0040),
        R(0x005b,0x005d), S(0x005f), S(0x007b), S(0x007d), S(0x00a1), S(0x00a7), S(0x00ab), R(0x00b6,0x00b7),
        S(0x00bb), S(0x00bf), S(0x037e), S(0x0387), R(0x055a,0x055f), R(0x0589,0x058a), S(0x05be), S(0x05c0),
        S(0x05c3), S(0x05c6), R(0x05f3,0x05f4), R(0x0609,0x060a), R(0x060c,0x060d), S(0x061b), R(0x061e,0x061f),
        R(0x066a,0x066d), S(0x06d4), R(0x0700,0x070d), R(0x07f7,0x07f9), R(0x0830,0x083e), S(0x085e),
        R(0x0964,0x0965), S(0x0970), S(0x09fd), S(0x0a76), S(0x0af0), S(0x0c77), S(0x0c84), S(0x0df4), S(0x0e4f),
        R(0x0e5a,0x0e5b), R(0x0f04,0x0f12), S(0x0f14), R(0x0f3a,0x0f3d), S(0x0f85), R(0x0fd0,0x0fd4),
        R(0x0fd9,0x0fda), R(0x104a,0x104f), S(0x10fb), R(0x1360,0x1368), S(0x1400), S(0x166e), R(0x169b,0x169c),
        R(0x16eb,0x16ed), R(0x1735,0x1736), R(0x17d4,0x17d6), R(0x17d8,0x17da), R(0x1800,0x180a),
        R(0x1944,0x1945), R(0x1a1e,0x1a1f), R(0x1aa0,0x1aa6), R(0x1aa8,0x1aad), R(0x1b5a,0x1b60),
        R(0x1bfc,0x1bff), R(0x1c3b,0x1c3f), R(0x1c7e,0x1c7f), R(0x1cc0,0x1cc7), S(0x1cd3), R(0x2010,0x2027),
        R(0x2030,0x2043), R(0x2045,0x2051), R(0x2053,0x205e), R(0x207d,0x207e), R(0x208d,0x208e),
        R(0x2308,0x230b), R(0x2329,0x232a), R(0x2768,0x2775), R(0x27c5,0x27c6), R(0x27e6,0x27ef),
        R(0x2983,0x2998), R(0x29d8,0x29db), R(0x29fc,0x29fd), R(0x2cf9,0x2cfc), R(0x2cfe,0x2cff), S(0x2d70),
        R(0x2e00,0x2e2e), R(0x2e30,0x2e4f), S(0x2e52), R(0x3001,0x3003), R(0x3008,0x3011), R(0x3014,0x301f),
        S(0x3030), S(0x303d), S(0x30a0), S(0x30fb), R(0xa4fe,0xa4ff), R(0xa60d,0xa60f), S(0xa673), S(0xa67e),
        R(0xa6f2,0xa6f7), R(0xa874,0xa877), R(0xa8ce,0xa8cf), R(0xa8f8,0xa8fa), S(0xa8fc), R(0xa92e,0xa92f),
        S(0xa95f), R(0xa9c1,0xa9cd), R(0xa9de,0xa9df), R(0xaa5c,0xaa5f), R(0xaade,0xaadf), R(0xaaf0,0xaaf1),
        S(0xabeb), R(0xfd3e,0xfd3f), R(0xfe10,0xfe19), R(0xfe30,0xfe52), R(0xfe54,0xfe61), S(0xfe63), S(0xfe68),
        R(0xfe6a,0xfe6b), R(0xff01,0xff03), R(0xff05,0xff0a), R(0xff0c,0xff0f), R(0xff1a,0xff1b),
        R(0xff1f,0xff20), R(0xff3b,0xff3d), S(0xff3f), S(0xff5b), S(0xff5d), R(0xff5f,0xff65), R(0x10100,0x10102),
        S(0x1039f), S(0x103d0), S(0x1056f), S(0x10857), S(0x1091f), S(0x1093f), R(0x10a50,0x10a58), S(0x10a7f),
        R(0x10af0,0x10af6), R(0x10b39,0x10b3f), R(0x10b99,0x10b9c), S(0x10ead), R(0x10f55,0x10f59),
        R(0x11047,0x1104d), R(0x110bb,0x110bc), R(0x110be,0x110c1), R(0x11140,0x11143), R(0x11174,0x11175),
        R(0x111c5,0x111c8), S(0x111cd), S(0x111db), R(0x111dd,0x111df), R(0x11238,0x1123d), S(0x112a9),
        R(0x1144b,0x1144f), R(0x1145a,0x1145b), S(0x1145d), S(0x114c6), R(0x115c1,0x115d7), R(0x11641,0x11643),
        R(0x11660,0x1166c), R(0x1173c,0x1173e), S(0x1183b), R(0x11944,0x11946), S(0x119e2), R(0x11a3f,0x11a46),
        R(0x11a9a,0x11a9c), R(0x11a9e,0x11aa2), R(0x11c41,0x11c45), R(0x11c70,0x11c71), R(0x11ef7,0x11ef8),
        S(0x11fff), R(0x12470,0x12474), R(0x16a6e,0x16a6f), S(0x16af5), R(0x16b37,0x16b3b), S(0x16b44),
        R(0x16e97,0x16e9a), S(0x16fe2), S(0x1bc9f), R(0x1da87,0x1da8b), R(0x1e95e,0x1e95f)
    };
#undef R
#undef S

    if(codepoint > 0x7f)
        return (md_unicode_bsearch__(codepoint, PUNCT_MAP, SIZEOF_ARRAY(PUNCT_MAP)) >= 0);

    /* The ASCII ones are the most frequently used ones; also the CommonMark
     * specification requests a few more in this range. */
    return ISPUNCT_(codepoint);
}
577
578 static void
md_get_unicode_fold_info(unsigned codepoint,MD_UNICODE_FOLD_INFO * info)579 md_get_unicode_fold_info(unsigned codepoint, MD_UNICODE_FOLD_INFO* info)
580 {
581 #define R(cp_min, cp_max) ((cp_min) | 0x40000000), ((cp_max) | 0x80000000)
582 #define S(cp) (cp)
583 /* Unicode "Pc", "Pd", "Pe", "Pf", "Pi", "Po", "Ps" categories.
584 * (generated by scripts/build_punct_map.py) */
585 static const unsigned FOLD_MAP_1[] = {
586 R(0x0041,0x005a), S(0x00b5), R(0x00c0,0x00d6), R(0x00d8,0x00de), R(0x0100,0x012e), R(0x0132,0x0136),
587 R(0x0139,0x0147), R(0x014a,0x0176), S(0x0178), R(0x0179,0x017d), S(0x017f), S(0x0181), S(0x0182),
588 S(0x0184), S(0x0186), S(0x0187), S(0x0189), S(0x018a), S(0x018b), S(0x018e), S(0x018f), S(0x0190),
589 S(0x0191), S(0x0193), S(0x0194), S(0x0196), S(0x0197), S(0x0198), S(0x019c), S(0x019d), S(0x019f),
590 R(0x01a0,0x01a4), S(0x01a6), S(0x01a7), S(0x01a9), S(0x01ac), S(0x01ae), S(0x01af), S(0x01b1), S(0x01b2),
591 S(0x01b3), S(0x01b5), S(0x01b7), S(0x01b8), S(0x01bc), S(0x01c4), S(0x01c5), S(0x01c7), S(0x01c8),
592 S(0x01ca), R(0x01cb,0x01db), R(0x01de,0x01ee), S(0x01f1), S(0x01f2), S(0x01f4), S(0x01f6), S(0x01f7),
593 R(0x01f8,0x021e), S(0x0220), R(0x0222,0x0232), S(0x023a), S(0x023b), S(0x023d), S(0x023e), S(0x0241),
594 S(0x0243), S(0x0244), S(0x0245), R(0x0246,0x024e), S(0x0345), S(0x0370), S(0x0372), S(0x0376), S(0x037f),
595 S(0x0386), R(0x0388,0x038a), S(0x038c), S(0x038e), S(0x038f), R(0x0391,0x03a1), R(0x03a3,0x03ab),
596 S(0x03c2), S(0x03cf), S(0x03d0), S(0x03d1), S(0x03d5), S(0x03d6), R(0x03d8,0x03ee), S(0x03f0), S(0x03f1),
597 S(0x03f4), S(0x03f5), S(0x03f7), S(0x03f9), S(0x03fa), R(0x03fd,0x03ff), R(0x0400,0x040f),
598 R(0x0410,0x042f), R(0x0460,0x0480), R(0x048a,0x04be), S(0x04c0), R(0x04c1,0x04cd), R(0x04d0,0x052e),
599 R(0x0531,0x0556), R(0x10a0,0x10c5), S(0x10c7), S(0x10cd), R(0x13f8,0x13fd), S(0x1c80), S(0x1c81),
600 S(0x1c82), S(0x1c83), S(0x1c84), S(0x1c85), S(0x1c86), S(0x1c87), S(0x1c88), R(0x1c90,0x1cba),
601 R(0x1cbd,0x1cbf), R(0x1e00,0x1e94), S(0x1e9b), R(0x1ea0,0x1efe), R(0x1f08,0x1f0f), R(0x1f18,0x1f1d),
602 R(0x1f28,0x1f2f), R(0x1f38,0x1f3f), R(0x1f48,0x1f4d), S(0x1f59), S(0x1f5b), S(0x1f5d), S(0x1f5f),
603 R(0x1f68,0x1f6f), S(0x1fb8), S(0x1fb9), S(0x1fba), S(0x1fbb), S(0x1fbe), R(0x1fc8,0x1fcb), S(0x1fd8),
604 S(0x1fd9), S(0x1fda), S(0x1fdb), S(0x1fe8), S(0x1fe9), S(0x1fea), S(0x1feb), S(0x1fec), S(0x1ff8),
605 S(0x1ff9), S(0x1ffa), S(0x1ffb), S(0x2126), S(0x212a), S(0x212b), S(0x2132), R(0x2160,0x216f), S(0x2183),
606 R(0x24b6,0x24cf), R(0x2c00,0x2c2e), S(0x2c60), S(0x2c62), S(0x2c63), S(0x2c64), R(0x2c67,0x2c6b),
607 S(0x2c6d), S(0x2c6e), S(0x2c6f), S(0x2c70), S(0x2c72), S(0x2c75), S(0x2c7e), S(0x2c7f), R(0x2c80,0x2ce2),
608 S(0x2ceb), S(0x2ced), S(0x2cf2), R(0xa640,0xa66c), R(0xa680,0xa69a), R(0xa722,0xa72e), R(0xa732,0xa76e),
609 S(0xa779), S(0xa77b), S(0xa77d), R(0xa77e,0xa786), S(0xa78b), S(0xa78d), S(0xa790), S(0xa792),
610 R(0xa796,0xa7a8), S(0xa7aa), S(0xa7ab), S(0xa7ac), S(0xa7ad), S(0xa7ae), S(0xa7b0), S(0xa7b1), S(0xa7b2),
611 S(0xa7b3), R(0xa7b4,0xa7be), S(0xa7c2), S(0xa7c4), S(0xa7c5), S(0xa7c6), S(0xa7c7), S(0xa7c9), S(0xa7f5),
612 R(0xab70,0xabbf), R(0xff21,0xff3a), R(0x10400,0x10427), R(0x104b0,0x104d3), R(0x10c80,0x10cb2),
613 R(0x118a0,0x118bf), R(0x16e40,0x16e5f), R(0x1e900,0x1e921)
614 };
615 static const unsigned FOLD_MAP_1_DATA[] = {
616 0x0061, 0x007a, 0x03bc, 0x00e0, 0x00f6, 0x00f8, 0x00fe, 0x0101, 0x012f, 0x0133, 0x0137, 0x013a, 0x0148,
617 0x014b, 0x0177, 0x00ff, 0x017a, 0x017e, 0x0073, 0x0253, 0x0183, 0x0185, 0x0254, 0x0188, 0x0256, 0x0257,
618 0x018c, 0x01dd, 0x0259, 0x025b, 0x0192, 0x0260, 0x0263, 0x0269, 0x0268, 0x0199, 0x026f, 0x0272, 0x0275,
619 0x01a1, 0x01a5, 0x0280, 0x01a8, 0x0283, 0x01ad, 0x0288, 0x01b0, 0x028a, 0x028b, 0x01b4, 0x01b6, 0x0292,
620 0x01b9, 0x01bd, 0x01c6, 0x01c6, 0x01c9, 0x01c9, 0x01cc, 0x01cc, 0x01dc, 0x01df, 0x01ef, 0x01f3, 0x01f3,
621 0x01f5, 0x0195, 0x01bf, 0x01f9, 0x021f, 0x019e, 0x0223, 0x0233, 0x2c65, 0x023c, 0x019a, 0x2c66, 0x0242,
622 0x0180, 0x0289, 0x028c, 0x0247, 0x024f, 0x03b9, 0x0371, 0x0373, 0x0377, 0x03f3, 0x03ac, 0x03ad, 0x03af,
623 0x03cc, 0x03cd, 0x03ce, 0x03b1, 0x03c1, 0x03c3, 0x03cb, 0x03c3, 0x03d7, 0x03b2, 0x03b8, 0x03c6, 0x03c0,
624 0x03d9, 0x03ef, 0x03ba, 0x03c1, 0x03b8, 0x03b5, 0x03f8, 0x03f2, 0x03fb, 0x037b, 0x037d, 0x0450, 0x045f,
625 0x0430, 0x044f, 0x0461, 0x0481, 0x048b, 0x04bf, 0x04cf, 0x04c2, 0x04ce, 0x04d1, 0x052f, 0x0561, 0x0586,
626 0x2d00, 0x2d25, 0x2d27, 0x2d2d, 0x13f0, 0x13f5, 0x0432, 0x0434, 0x043e, 0x0441, 0x0442, 0x0442, 0x044a,
627 0x0463, 0xa64b, 0x10d0, 0x10fa, 0x10fd, 0x10ff, 0x1e01, 0x1e95, 0x1e61, 0x1ea1, 0x1eff, 0x1f00, 0x1f07,
628 0x1f10, 0x1f15, 0x1f20, 0x1f27, 0x1f30, 0x1f37, 0x1f40, 0x1f45, 0x1f51, 0x1f53, 0x1f55, 0x1f57, 0x1f60,
629 0x1f67, 0x1fb0, 0x1fb1, 0x1f70, 0x1f71, 0x03b9, 0x1f72, 0x1f75, 0x1fd0, 0x1fd1, 0x1f76, 0x1f77, 0x1fe0,
630 0x1fe1, 0x1f7a, 0x1f7b, 0x1fe5, 0x1f78, 0x1f79, 0x1f7c, 0x1f7d, 0x03c9, 0x006b, 0x00e5, 0x214e, 0x2170,
631 0x217f, 0x2184, 0x24d0, 0x24e9, 0x2c30, 0x2c5e, 0x2c61, 0x026b, 0x1d7d, 0x027d, 0x2c68, 0x2c6c, 0x0251,
632 0x0271, 0x0250, 0x0252, 0x2c73, 0x2c76, 0x023f, 0x0240, 0x2c81, 0x2ce3, 0x2cec, 0x2cee, 0x2cf3, 0xa641,
633 0xa66d, 0xa681, 0xa69b, 0xa723, 0xa72f, 0xa733, 0xa76f, 0xa77a, 0xa77c, 0x1d79, 0xa77f, 0xa787, 0xa78c,
634 0x0265, 0xa791, 0xa793, 0xa797, 0xa7a9, 0x0266, 0x025c, 0x0261, 0x026c, 0x026a, 0x029e, 0x0287, 0x029d,
635 0xab53, 0xa7b5, 0xa7bf, 0xa7c3, 0xa794, 0x0282, 0x1d8e, 0xa7c8, 0xa7ca, 0xa7f6, 0x13a0, 0x13ef, 0xff41,
636 0xff5a, 0x10428, 0x1044f, 0x104d8, 0x104fb, 0x10cc0, 0x10cf2, 0x118c0, 0x118df, 0x16e60, 0x16e7f, 0x1e922,
637 0x1e943
638 };
639 static const unsigned FOLD_MAP_2[] = {
640 S(0x00df), S(0x0130), S(0x0149), S(0x01f0), S(0x0587), S(0x1e96), S(0x1e97), S(0x1e98), S(0x1e99),
641 S(0x1e9a), S(0x1e9e), S(0x1f50), R(0x1f80,0x1f87), R(0x1f88,0x1f8f), R(0x1f90,0x1f97), R(0x1f98,0x1f9f),
642 R(0x1fa0,0x1fa7), R(0x1fa8,0x1faf), S(0x1fb2), S(0x1fb3), S(0x1fb4), S(0x1fb6), S(0x1fbc), S(0x1fc2),
643 S(0x1fc3), S(0x1fc4), S(0x1fc6), S(0x1fcc), S(0x1fd6), S(0x1fe4), S(0x1fe6), S(0x1ff2), S(0x1ff3),
644 S(0x1ff4), S(0x1ff6), S(0x1ffc), S(0xfb00), S(0xfb01), S(0xfb02), S(0xfb05), S(0xfb06), S(0xfb13),
645 S(0xfb14), S(0xfb15), S(0xfb16), S(0xfb17)
646 };
647 static const unsigned FOLD_MAP_2_DATA[] = {
648 0x0073,0x0073, 0x0069,0x0307, 0x02bc,0x006e, 0x006a,0x030c, 0x0565,0x0582, 0x0068,0x0331, 0x0074,0x0308,
649 0x0077,0x030a, 0x0079,0x030a, 0x0061,0x02be, 0x0073,0x0073, 0x03c5,0x0313, 0x1f00,0x03b9, 0x1f07,0x03b9,
650 0x1f00,0x03b9, 0x1f07,0x03b9, 0x1f20,0x03b9, 0x1f27,0x03b9, 0x1f20,0x03b9, 0x1f27,0x03b9, 0x1f60,0x03b9,
651 0x1f67,0x03b9, 0x1f60,0x03b9, 0x1f67,0x03b9, 0x1f70,0x03b9, 0x03b1,0x03b9, 0x03ac,0x03b9, 0x03b1,0x0342,
652 0x03b1,0x03b9, 0x1f74,0x03b9, 0x03b7,0x03b9, 0x03ae,0x03b9, 0x03b7,0x0342, 0x03b7,0x03b9, 0x03b9,0x0342,
653 0x03c1,0x0313, 0x03c5,0x0342, 0x1f7c,0x03b9, 0x03c9,0x03b9, 0x03ce,0x03b9, 0x03c9,0x0342, 0x03c9,0x03b9,
654 0x0066,0x0066, 0x0066,0x0069, 0x0066,0x006c, 0x0073,0x0074, 0x0073,0x0074, 0x0574,0x0576, 0x0574,0x0565,
655 0x0574,0x056b, 0x057e,0x0576, 0x0574,0x056d
656 };
657 static const unsigned FOLD_MAP_3[] = {
658 S(0x0390), S(0x03b0), S(0x1f52), S(0x1f54), S(0x1f56), S(0x1fb7), S(0x1fc7), S(0x1fd2), S(0x1fd3),
659 S(0x1fd7), S(0x1fe2), S(0x1fe3), S(0x1fe7), S(0x1ff7), S(0xfb03), S(0xfb04)
660 };
661 static const unsigned FOLD_MAP_3_DATA[] = {
662 0x03b9,0x0308,0x0301, 0x03c5,0x0308,0x0301, 0x03c5,0x0313,0x0300, 0x03c5,0x0313,0x0301,
663 0x03c5,0x0313,0x0342, 0x03b1,0x0342,0x03b9, 0x03b7,0x0342,0x03b9, 0x03b9,0x0308,0x0300,
664 0x03b9,0x0308,0x0301, 0x03b9,0x0308,0x0342, 0x03c5,0x0308,0x0300, 0x03c5,0x0308,0x0301,
665 0x03c5,0x0308,0x0342, 0x03c9,0x0342,0x03b9, 0x0066,0x0066,0x0069, 0x0066,0x0066,0x006c
666 };
667 #undef R
668 #undef S
669 static const struct {
670 const unsigned* map;
671 const unsigned* data;
672 size_t map_size;
673 int n_codepoints;
674 } FOLD_MAP_LIST[] = {
675 { FOLD_MAP_1, FOLD_MAP_1_DATA, SIZEOF_ARRAY(FOLD_MAP_1), 1 },
676 { FOLD_MAP_2, FOLD_MAP_2_DATA, SIZEOF_ARRAY(FOLD_MAP_2), 2 },
677 { FOLD_MAP_3, FOLD_MAP_3_DATA, SIZEOF_ARRAY(FOLD_MAP_3), 3 }
678 };
679
680 int i;
681
682 /* Fast path for ASCII characters. */
683 if(codepoint <= 0x7f) {
684 info->codepoints[0] = codepoint;
685 if(ISUPPER_(codepoint))
686 info->codepoints[0] += 'a' - 'A';
687 info->n_codepoints = 1;
688 return;
689 }
690
691 /* Try to locate the codepoint in any of the maps. */
692 for(i = 0; i < (int) SIZEOF_ARRAY(FOLD_MAP_LIST); i++) {
693 int index;
694
695 index = md_unicode_bsearch__(codepoint, FOLD_MAP_LIST[i].map, FOLD_MAP_LIST[i].map_size);
696 if(index >= 0) {
697 /* Found the mapping. */
698 int n_codepoints = FOLD_MAP_LIST[i].n_codepoints;
699 const unsigned* map = FOLD_MAP_LIST[i].map;
700 const unsigned* codepoints = FOLD_MAP_LIST[i].data + (index * n_codepoints);
701
702 memcpy(info->codepoints, codepoints, sizeof(unsigned) * n_codepoints);
703 info->n_codepoints = n_codepoints;
704
705 if(FOLD_MAP_LIST[i].map[index] != codepoint) {
706 /* The found mapping maps whole range of codepoints,
707 * i.e. we have to offset info->codepoints[0] accordingly. */
708 if((map[index] & 0x00ffffff)+1 == codepoints[0]) {
709 /* Alternating type of the range. */
710 info->codepoints[0] = codepoint + ((codepoint & 0x1) == (map[index] & 0x1) ? 1 : 0);
711 } else {
712 /* Range to range kind of mapping. */
713 info->codepoints[0] += (codepoint - (map[index] & 0x00ffffff));
714 }
715 }
716
717 return;
718 }
719 }
720
721 /* No mapping found. Map the codepoint to itself. */
722 info->codepoints[0] = codepoint;
723 info->n_codepoints = 1;
724 }
725 #endif
726
727
728 #if defined MD4C_USE_UTF16
/* Recognition of the code units forming a surrogate pair (0xd800-0xdbff
 * for the high one, 0xdc00-0xdfff for the low one), and the composition
 * of the codepoint out of such a pair. */
#define IS_UTF16_SURROGATE_HI(word)     (((WORD)(word) & 0xfc00) == 0xd800)
#define IS_UTF16_SURROGATE_LO(word)     (((WORD)(word) & 0xfc00) == 0xdc00)
#define UTF16_DECODE_SURROGATE(hi, lo)  (0x10000 + ((((unsigned)(hi) & 0x3ff) << 10) | ((unsigned)(lo) & 0x3ff)))
732
733 static unsigned
md_decode_utf16le__(const CHAR * str,SZ str_size,SZ * p_size)734 md_decode_utf16le__(const CHAR* str, SZ str_size, SZ* p_size)
735 {
736 if(IS_UTF16_SURROGATE_HI(str[0])) {
737 if(1 < str_size && IS_UTF16_SURROGATE_LO(str[1])) {
738 if(p_size != NULL)
739 *p_size = 2;
740 return UTF16_DECODE_SURROGATE(str[0], str[1]);
741 }
742 }
743
744 if(p_size != NULL)
745 *p_size = 1;
746 return str[0];
747 }
748
749 static unsigned
md_decode_utf16le_before__(MD_CTX * ctx,OFF off)750 md_decode_utf16le_before__(MD_CTX* ctx, OFF off)
751 {
752 if(off > 2 && IS_UTF16_SURROGATE_HI(CH(off-2)) && IS_UTF16_SURROGATE_LO(CH(off-1)))
753 return UTF16_DECODE_SURROGATE(CH(off-2), CH(off-1));
754
755 return CH(off);
756 }
757
/* Unicode classification helpers for the UTF-16 build. Those taking an
 * offset expect a variable "ctx" in the scope of the caller.
 *
 * No whitespace uses surrogates, so no decoding is needed for the
 * whitespace checks. */
#define ISUNICODEWHITESPACE_(codepoint) md_is_unicode_whitespace__(codepoint)
#define ISUNICODEWHITESPACE(off)        md_is_unicode_whitespace__(CH(off))
#define ISUNICODEWHITESPACEBEFORE(off)  md_is_unicode_whitespace__(CH((off)-1))

/* Punctuation may live outside of the basic plane, hence the decoding. */
#define ISUNICODEPUNCT(off)             md_is_unicode_punct__(md_decode_utf16le__(STR(off), ctx->size - (off), NULL))
#define ISUNICODEPUNCTBEFORE(off)       md_is_unicode_punct__(md_decode_utf16le_before__(ctx, (off)))
765
766 static inline int
md_decode_unicode(const CHAR * str,OFF off,SZ str_size,SZ * p_char_size)767 md_decode_unicode(const CHAR* str, OFF off, SZ str_size, SZ* p_char_size)
768 {
769 return md_decode_utf16le__(str+off, str_size-off, p_char_size);
770 }
771 #elif defined MD4C_USE_UTF8
/* Classification of a single byte of a UTF-8 sequence by its top bits:
 *   0xxxxxxx ... the only byte of a 1-byte sequence
 *   110xxxxx ... the leading byte of a 2-byte sequence
 *   1110xxxx ... the leading byte of a 3-byte sequence
 *   11110xxx ... the leading byte of a 4-byte sequence
 *   10xxxxxx ... a trailing byte
 */
#define IS_UTF8_LEAD1(byte)     (((unsigned char)(byte) & 0x80) == 0x00)
#define IS_UTF8_LEAD2(byte)     (((unsigned char)(byte) >> 5) == 0x06)
#define IS_UTF8_LEAD3(byte)     (((unsigned char)(byte) >> 4) == 0x0e)
#define IS_UTF8_LEAD4(byte)     (((unsigned char)(byte) >> 3) == 0x1e)
#define IS_UTF8_TAIL(byte)      (((unsigned char)(byte) >> 6) == 0x02)
777
778 static unsigned
md_decode_utf8__(const CHAR * str,SZ str_size,SZ * p_size)779 md_decode_utf8__(const CHAR* str, SZ str_size, SZ* p_size)
780 {
781 if(!IS_UTF8_LEAD1(str[0])) {
782 if(IS_UTF8_LEAD2(str[0])) {
783 if(1 < str_size && IS_UTF8_TAIL(str[1])) {
784 if(p_size != NULL)
785 *p_size = 2;
786
787 return (((unsigned int)str[0] & 0x1f) << 6) |
788 (((unsigned int)str[1] & 0x3f) << 0);
789 }
790 } else if(IS_UTF8_LEAD3(str[0])) {
791 if(2 < str_size && IS_UTF8_TAIL(str[1]) && IS_UTF8_TAIL(str[2])) {
792 if(p_size != NULL)
793 *p_size = 3;
794
795 return (((unsigned int)str[0] & 0x0f) << 12) |
796 (((unsigned int)str[1] & 0x3f) << 6) |
797 (((unsigned int)str[2] & 0x3f) << 0);
798 }
799 } else if(IS_UTF8_LEAD4(str[0])) {
800 if(3 < str_size && IS_UTF8_TAIL(str[1]) && IS_UTF8_TAIL(str[2]) && IS_UTF8_TAIL(str[3])) {
801 if(p_size != NULL)
802 *p_size = 4;
803
804 return (((unsigned int)str[0] & 0x07) << 18) |
805 (((unsigned int)str[1] & 0x3f) << 12) |
806 (((unsigned int)str[2] & 0x3f) << 6) |
807 (((unsigned int)str[3] & 0x3f) << 0);
808 }
809 }
810 }
811
812 if(p_size != NULL)
813 *p_size = 1;
814 return (unsigned) str[0];
815 }
816
817 static unsigned
md_decode_utf8_before__(MD_CTX * ctx,OFF off)818 md_decode_utf8_before__(MD_CTX* ctx, OFF off)
819 {
820 if(!IS_UTF8_LEAD1(CH(off-1))) {
821 if(off > 1 && IS_UTF8_LEAD2(CH(off-2)) && IS_UTF8_TAIL(CH(off-1)))
822 return (((unsigned int)CH(off-2) & 0x1f) << 6) |
823 (((unsigned int)CH(off-1) & 0x3f) << 0);
824
825 if(off > 2 && IS_UTF8_LEAD3(CH(off-3)) && IS_UTF8_TAIL(CH(off-2)) && IS_UTF8_TAIL(CH(off-1)))
826 return (((unsigned int)CH(off-3) & 0x0f) << 12) |
827 (((unsigned int)CH(off-2) & 0x3f) << 6) |
828 (((unsigned int)CH(off-1) & 0x3f) << 0);
829
830 if(off > 3 && IS_UTF8_LEAD4(CH(off-4)) && IS_UTF8_TAIL(CH(off-3)) && IS_UTF8_TAIL(CH(off-2)) && IS_UTF8_TAIL(CH(off-1)))
831 return (((unsigned int)CH(off-4) & 0x07) << 18) |
832 (((unsigned int)CH(off-3) & 0x3f) << 12) |
833 (((unsigned int)CH(off-2) & 0x3f) << 6) |
834 (((unsigned int)CH(off-1) & 0x3f) << 0);
835 }
836
837 return (unsigned) CH(off-1);
838 }
839
/* Unicode whitespace tests (UTF-8 build). The '_' suffixed variant takes an
 * already decoded codepoint; the others take an offset and decode the
 * character at it, respectively the one before it (...BEFORE). The offset
 * taking variants refer to a variable named 'ctx' at the place of use. */
#define ISUNICODEWHITESPACE_(codepoint) md_is_unicode_whitespace__(codepoint)
#define ISUNICODEWHITESPACE(off) md_is_unicode_whitespace__(md_decode_utf8__(STR(off), ctx->size - (off), NULL))
#define ISUNICODEWHITESPACEBEFORE(off) md_is_unicode_whitespace__(md_decode_utf8_before__(ctx, off))

/* Unicode punctuation tests, decoding the character the same way. */
#define ISUNICODEPUNCT(off) md_is_unicode_punct__(md_decode_utf8__(STR(off), ctx->size - (off), NULL))
#define ISUNICODEPUNCTBEFORE(off) md_is_unicode_punct__(md_decode_utf8_before__(ctx, off))
846
847 static inline unsigned
md_decode_unicode(const CHAR * str,OFF off,SZ str_size,SZ * p_char_size)848 md_decode_unicode(const CHAR* str, OFF off, SZ str_size, SZ* p_char_size)
849 {
850 return md_decode_utf8__(str+off, str_size-off, p_char_size);
851 }
852 #else
/* Without any Unicode support compiled in, the "Unicode" classification
 * macros simply map to their plain counterparts; the ...BEFORE variants
 * inspect the single character at (off)-1. */
#define ISUNICODEWHITESPACE_(codepoint) ISWHITESPACE_(codepoint)
#define ISUNICODEWHITESPACE(off) ISWHITESPACE(off)
#define ISUNICODEWHITESPACEBEFORE(off) ISWHITESPACE((off)-1)

#define ISUNICODEPUNCT(off) ISPUNCT(off)
#define ISUNICODEPUNCTBEFORE(off) ISPUNCT((off)-1)
859
860 static inline void
md_get_unicode_fold_info(unsigned codepoint,MD_UNICODE_FOLD_INFO * info)861 md_get_unicode_fold_info(unsigned codepoint, MD_UNICODE_FOLD_INFO* info)
862 {
863 info->codepoints[0] = codepoint;
864 if(ISUPPER_(codepoint))
865 info->codepoints[0] += 'a' - 'A';
866 info->n_codepoints = 1;
867 }
868
869 static inline unsigned
md_decode_unicode(const CHAR * str,OFF off,SZ str_size,SZ * p_size)870 md_decode_unicode(const CHAR* str, OFF off, SZ str_size, SZ* p_size)
871 {
872 *p_size = 1;
873 return (unsigned) str[off];
874 }
875 #endif
876
877
878 /*************************************
879 *** Helper string manipulations ***
880 *************************************/
881
882 /* Fill buffer with copy of the string between 'beg' and 'end' but replace any
883 * line breaks with given replacement character.
884 *
885 * NOTE: Caller is responsible to make sure the buffer is large enough.
886 * (Given the output is always shorter then input, (end - beg) is good idea
887 * what the caller should allocate.)
888 */
889 static void
md_merge_lines(MD_CTX * ctx,OFF beg,OFF end,const MD_LINE * lines,int n_lines,CHAR line_break_replacement_char,CHAR * buffer,SZ * p_size)890 md_merge_lines(MD_CTX* ctx, OFF beg, OFF end, const MD_LINE* lines, int n_lines,
891 CHAR line_break_replacement_char, CHAR* buffer, SZ* p_size)
892 {
893 CHAR* ptr = buffer;
894 int line_index = 0;
895 OFF off = beg;
896
897 while(1) {
898 const MD_LINE* line = &lines[line_index];
899 OFF line_end = line->end;
900 if(end < line_end)
901 line_end = end;
902
903 while(off < line_end) {
904 *ptr = CH(off);
905 ptr++;
906 off++;
907 }
908
909 if(off >= end) {
910 *p_size = ptr - buffer;
911 return;
912 }
913
914 *ptr = line_break_replacement_char;
915 ptr++;
916
917 line_index++;
918 off = lines[line_index].beg;
919 }
920 }
921
922 /* Wrapper of md_merge_lines() which allocates new buffer for the output string.
923 */
924 static int
md_merge_lines_alloc(MD_CTX * ctx,OFF beg,OFF end,const MD_LINE * lines,int n_lines,CHAR line_break_replacement_char,CHAR ** p_str,SZ * p_size)925 md_merge_lines_alloc(MD_CTX* ctx, OFF beg, OFF end, const MD_LINE* lines, int n_lines,
926 CHAR line_break_replacement_char, CHAR** p_str, SZ* p_size)
927 {
928 CHAR* buffer;
929
930 buffer = (CHAR*) malloc(sizeof(CHAR) * (end - beg));
931 if(buffer == NULL) {
932 MD_LOG("malloc() failed.");
933 return -1;
934 }
935
936 md_merge_lines(ctx, beg, end, lines, n_lines,
937 line_break_replacement_char, buffer, p_size);
938
939 *p_str = buffer;
940 return 0;
941 }
942
943 static OFF
md_skip_unicode_whitespace(const CHAR * label,OFF off,SZ size)944 md_skip_unicode_whitespace(const CHAR* label, OFF off, SZ size)
945 {
946 SZ char_size;
947 unsigned codepoint;
948
949 while(off < size) {
950 codepoint = md_decode_unicode(label, off, size, &char_size);
951 if(!ISUNICODEWHITESPACE_(codepoint) && !ISNEWLINE_(label[off]))
952 break;
953 off += char_size;
954 }
955
956 return off;
957 }
958
959
960 /******************************
961 *** Recognizing raw HTML ***
962 ******************************/
963
/* md_is_html_tag() may be called when processing inlines (inline raw HTML)
 * or when breaking document to blocks (checking for start of HTML block type 7).
 *
 * When breaking document to blocks, we do not yet know line boundaries, but
 * in that case the whole tag has to live on a single line. We distinguish this
 * by n_lines == 0.
 *
 * Parameters:
 *  -- lines, n_lines: The lines the tag may span. With n_lines == 0, 'lines'
 *     is never accessed and the scan of the (only) line is bounded by
 *     ctx->size instead.
 *  -- beg: Offset of the opening '<' (asserted).
 *  -- max_end: The closing '>' has to be located before this offset.
 *  -- p_end: On success, receives the offset just after the closing '>'.
 *
 * Returns TRUE if an opening or closing HTML tag is recognized at 'beg',
 * FALSE otherwise (then *p_end is not written).
 */
static int
md_is_html_tag(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
{
    int attr_state;
    OFF off = beg;
    OFF line_end = (n_lines > 0) ? lines[0].end : ctx->size;
    int i = 0;      /* Index of the line being scanned. */

    MD_ASSERT(CH(beg) == _T('<'));

    /* At least one more character has to follow the '<' on the line. */
    if(off + 1 >= line_end)
        return FALSE;
    off++;

    /* For parsing attributes, we need a little state automaton below.
     * State -1: no attributes are allowed.
     * State 0: attribute could follow after some whitespace.
     * State 1: after a whitespace (attribute name may follow).
     * State 2: after attribute name ('=' MAY follow).
     * State 3: after '=' (value specification MUST follow).
     * State 41: in middle of unquoted attribute value.
     * State 42: in middle of single-quoted attribute value.
     * State 43: in middle of double-quoted attribute value.
     */
    attr_state = 0;

    if(CH(off) == _T('/')) {
        /* Closer tag "</ ... >". No attributes may be present. */
        attr_state = -1;
        off++;
    }

    /* Tag name */
    if(off >= line_end || !ISALPHA(off))
        return FALSE;
    off++;
    while(off < line_end && (ISALNUM(off) || CH(off) == _T('-')))
        off++;

    /* (Optional) attributes (if not closer), (optional) '/' (if not closer)
     * and final '>'. */
    while(1) {
        while(off < line_end && !ISNEWLINE(off)) {
            if(attr_state > 40) {
                /* Inside of an attribute value: Only look for its end. */
                if(attr_state == 41 && (ISBLANK(off) || ISANYOF(off, _T("\"'=<>`")))) {
                    attr_state = 0;
                    off--;  /* Put the char back for re-inspection in the new state. */
                } else if(attr_state == 42 && CH(off) == _T('\'')) {
                    attr_state = 0;
                } else if(attr_state == 43 && CH(off) == _T('"')) {
                    attr_state = 0;
                }
                off++;
            } else if(ISWHITESPACE(off)) {
                /* Whitespace only changes the state 0 (to 1); in the other
                 * states it is just skipped. */
                if(attr_state == 0)
                    attr_state = 1;
                off++;
            } else if(attr_state <= 2 && CH(off) == _T('>')) {
                /* End. */
                goto done;
            } else if(attr_state <= 2 && CH(off) == _T('/') && off+1 < line_end && CH(off+1) == _T('>')) {
                /* End with digraph '/>' */
                off++;
                goto done;
            } else if((attr_state == 1 || attr_state == 2) && (ISALPHA(off) || CH(off) == _T('_') || CH(off) == _T(':'))) {
                off++;
                /* Attribute name */
                while(off < line_end && (ISALNUM(off) || ISANYOF(off, _T("_.:-"))))
                    off++;
                attr_state = 2;
            } else if(attr_state == 2 && CH(off) == _T('=')) {
                /* Attribute assignment sign */
                off++;
                attr_state = 3;
            } else if(attr_state == 3) {
                /* Expecting start of attribute value. */
                if(CH(off) == _T('"'))
                    attr_state = 43;
                else if(CH(off) == _T('\''))
                    attr_state = 42;
                else if(!ISANYOF(off, _T("\"'=<>`")) && !ISNEWLINE(off))
                    attr_state = 41;
                else
                    return FALSE;
                off++;
            } else {
                /* Anything unexpected. */
                return FALSE;
            }
        }

        /* We have to be on a single line. See definition of start condition
         * of HTML block, type 7. */
        if(n_lines == 0)
            return FALSE;

        /* The line is exhausted without reaching '>': Continue on the next
         * line, if there is any. */
        i++;
        if(i >= n_lines)
            return FALSE;

        off = lines[i].beg;
        line_end = lines[i].end;

        /* The line break acts as a whitespace: It allows an attribute name
         * to follow (state 0) and it terminates an unquoted value (state 41).
         * The other states, including the quoted values, are carried over
         * to the next line as they are. */
        if(attr_state == 0 || attr_state == 41)
            attr_state = 1;

        if(off >= max_end)
            return FALSE;
    }

done:
    /* Here 'off' points to the closing '>'. */
    if(off >= max_end)
        return FALSE;

    *p_end = off+1;
    return TRUE;
}
1088
1089 static int
md_scan_for_html_closer(MD_CTX * ctx,const MD_CHAR * str,MD_SIZE len,const MD_LINE * lines,int n_lines,OFF beg,OFF max_end,OFF * p_end,OFF * p_scan_horizon)1090 md_scan_for_html_closer(MD_CTX* ctx, const MD_CHAR* str, MD_SIZE len,
1091 const MD_LINE* lines, int n_lines,
1092 OFF beg, OFF max_end, OFF* p_end,
1093 OFF* p_scan_horizon)
1094 {
1095 OFF off = beg;
1096 int i = 0;
1097
1098 if(off < *p_scan_horizon && *p_scan_horizon >= max_end - len) {
1099 /* We have already scanned the range up to the max_end so we know
1100 * there is nothing to see. */
1101 return FALSE;
1102 }
1103
1104 while(TRUE) {
1105 while(off + len <= lines[i].end && off + len <= max_end) {
1106 if(md_ascii_eq(STR(off), str, len)) {
1107 /* Success. */
1108 *p_end = off + len;
1109 return TRUE;
1110 }
1111 off++;
1112 }
1113
1114 i++;
1115 if(off >= max_end || i >= n_lines) {
1116 /* Failure. */
1117 *p_scan_horizon = off;
1118 return FALSE;
1119 }
1120
1121 off = lines[i].beg;
1122 }
1123 }
1124
1125 static int
md_is_html_comment(MD_CTX * ctx,const MD_LINE * lines,int n_lines,OFF beg,OFF max_end,OFF * p_end)1126 md_is_html_comment(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
1127 {
1128 OFF off = beg;
1129
1130 MD_ASSERT(CH(beg) == _T('<'));
1131
1132 if(off + 4 >= lines[0].end)
1133 return FALSE;
1134 if(CH(off+1) != _T('!') || CH(off+2) != _T('-') || CH(off+3) != _T('-'))
1135 return FALSE;
1136 off += 4;
1137
1138 /* ">" and "->" must not follow the opening. */
1139 if(off < lines[0].end && CH(off) == _T('>'))
1140 return FALSE;
1141 if(off+1 < lines[0].end && CH(off) == _T('-') && CH(off+1) == _T('>'))
1142 return FALSE;
1143
1144 /* HTML comment must not contain "--", so we scan just for "--" instead
1145 * of "-->" and verify manually that '>' follows. */
1146 if(md_scan_for_html_closer(ctx, _T("--"), 2,
1147 lines, n_lines, off, max_end, p_end, &ctx->html_comment_horizon))
1148 {
1149 if(*p_end < max_end && CH(*p_end) == _T('>')) {
1150 *p_end = *p_end + 1;
1151 return TRUE;
1152 }
1153 }
1154
1155 return FALSE;
1156 }
1157
1158 static int
md_is_html_processing_instruction(MD_CTX * ctx,const MD_LINE * lines,int n_lines,OFF beg,OFF max_end,OFF * p_end)1159 md_is_html_processing_instruction(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
1160 {
1161 OFF off = beg;
1162
1163 if(off + 2 >= lines[0].end)
1164 return FALSE;
1165 if(CH(off+1) != _T('?'))
1166 return FALSE;
1167 off += 2;
1168
1169 return md_scan_for_html_closer(ctx, _T("?>"), 2,
1170 lines, n_lines, off, max_end, p_end, &ctx->html_proc_instr_horizon);
1171 }
1172
1173 static int
md_is_html_declaration(MD_CTX * ctx,const MD_LINE * lines,int n_lines,OFF beg,OFF max_end,OFF * p_end)1174 md_is_html_declaration(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
1175 {
1176 OFF off = beg;
1177
1178 if(off + 2 >= lines[0].end)
1179 return FALSE;
1180 if(CH(off+1) != _T('!'))
1181 return FALSE;
1182 off += 2;
1183
1184 /* Declaration name. */
1185 if(off >= lines[0].end || !ISALPHA(off))
1186 return FALSE;
1187 off++;
1188 while(off < lines[0].end && ISALPHA(off))
1189 off++;
1190 if(off < lines[0].end && !ISWHITESPACE(off))
1191 return FALSE;
1192
1193 return md_scan_for_html_closer(ctx, _T(">"), 1,
1194 lines, n_lines, off, max_end, p_end, &ctx->html_decl_horizon);
1195 }
1196
1197 static int
md_is_html_cdata(MD_CTX * ctx,const MD_LINE * lines,int n_lines,OFF beg,OFF max_end,OFF * p_end)1198 md_is_html_cdata(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
1199 {
1200 static const CHAR open_str[] = _T("<![CDATA[");
1201 static const SZ open_size = SIZEOF_ARRAY(open_str) - 1;
1202
1203 OFF off = beg;
1204
1205 if(off + open_size >= lines[0].end)
1206 return FALSE;
1207 if(memcmp(STR(off), open_str, open_size) != 0)
1208 return FALSE;
1209 off += open_size;
1210
1211 if(lines[n_lines-1].end < max_end)
1212 max_end = lines[n_lines-1].end - 2;
1213
1214 return md_scan_for_html_closer(ctx, _T("]]>"), 3,
1215 lines, n_lines, off, max_end, p_end, &ctx->html_cdata_horizon);
1216 }
1217
1218 static int
md_is_html_any(MD_CTX * ctx,const MD_LINE * lines,int n_lines,OFF beg,OFF max_end,OFF * p_end)1219 md_is_html_any(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg, OFF max_end, OFF* p_end)
1220 {
1221 MD_ASSERT(CH(beg) == _T('<'));
1222 return (md_is_html_tag(ctx, lines, n_lines, beg, max_end, p_end) ||
1223 md_is_html_comment(ctx, lines, n_lines, beg, max_end, p_end) ||
1224 md_is_html_processing_instruction(ctx, lines, n_lines, beg, max_end, p_end) ||
1225 md_is_html_declaration(ctx, lines, n_lines, beg, max_end, p_end) ||
1226 md_is_html_cdata(ctx, lines, n_lines, beg, max_end, p_end));
1227 }
1228
1229
1230 /****************************
1231 *** Recognizing Entity ***
1232 ****************************/
1233
1234 static int
md_is_hex_entity_contents(MD_CTX * ctx,const CHAR * text,OFF beg,OFF max_end,OFF * p_end)1235 md_is_hex_entity_contents(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end)
1236 {
1237 OFF off = beg;
1238
1239 while(off < max_end && ISXDIGIT_(text[off]) && off - beg <= 8)
1240 off++;
1241
1242 if(1 <= off - beg && off - beg <= 6) {
1243 *p_end = off;
1244 return TRUE;
1245 } else {
1246 return FALSE;
1247 }
1248 }
1249
1250 static int
md_is_dec_entity_contents(MD_CTX * ctx,const CHAR * text,OFF beg,OFF max_end,OFF * p_end)1251 md_is_dec_entity_contents(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end)
1252 {
1253 OFF off = beg;
1254
1255 while(off < max_end && ISDIGIT_(text[off]) && off - beg <= 8)
1256 off++;
1257
1258 if(1 <= off - beg && off - beg <= 7) {
1259 *p_end = off;
1260 return TRUE;
1261 } else {
1262 return FALSE;
1263 }
1264 }
1265
1266 static int
md_is_named_entity_contents(MD_CTX * ctx,const CHAR * text,OFF beg,OFF max_end,OFF * p_end)1267 md_is_named_entity_contents(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end)
1268 {
1269 OFF off = beg;
1270
1271 if(off < max_end && ISALPHA_(text[off]))
1272 off++;
1273 else
1274 return FALSE;
1275
1276 while(off < max_end && ISALNUM_(text[off]) && off - beg <= 48)
1277 off++;
1278
1279 if(2 <= off - beg && off - beg <= 48) {
1280 *p_end = off;
1281 return TRUE;
1282 } else {
1283 return FALSE;
1284 }
1285 }
1286
1287 static int
md_is_entity_str(MD_CTX * ctx,const CHAR * text,OFF beg,OFF max_end,OFF * p_end)1288 md_is_entity_str(MD_CTX* ctx, const CHAR* text, OFF beg, OFF max_end, OFF* p_end)
1289 {
1290 int is_contents;
1291 OFF off = beg;
1292
1293 MD_ASSERT(text[off] == _T('&'));
1294 off++;
1295
1296 if(off+2 < max_end && text[off] == _T('#') && (text[off+1] == _T('x') || text[off+1] == _T('X')))
1297 is_contents = md_is_hex_entity_contents(ctx, text, off+2, max_end, &off);
1298 else if(off+1 < max_end && text[off] == _T('#'))
1299 is_contents = md_is_dec_entity_contents(ctx, text, off+1, max_end, &off);
1300 else
1301 is_contents = md_is_named_entity_contents(ctx, text, off, max_end, &off);
1302
1303 if(is_contents && off < max_end && text[off] == _T(';')) {
1304 *p_end = off+1;
1305 return TRUE;
1306 } else {
1307 return FALSE;
1308 }
1309 }
1310
1311 static inline int
md_is_entity(MD_CTX * ctx,OFF beg,OFF max_end,OFF * p_end)1312 md_is_entity(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end)
1313 {
1314 return md_is_entity_str(ctx, ctx->text, beg, max_end, p_end);
1315 }
1316
1317
1318 /******************************
1319 *** Attribute Management ***
1320 ******************************/
1321
/* Helper structure holding the storage behind an MD_ATTRIBUTE while it is
 * being built by md_build_attribute(). It is released with
 * md_free_attribute(). */
typedef struct MD_ATTRIBUTE_BUILD_tag MD_ATTRIBUTE_BUILD;
struct MD_ATTRIBUTE_BUILD_tag {
    /* The attribute text. In the trivial case it points to the raw input
     * (or it is NULL for an empty input); otherwise it is malloc()-ed. */
    CHAR* text;
    /* Types of the substrings. Points to trivial_types[] in the trivial
     * case, to a heap array otherwise. */
    MD_TEXTTYPE* substr_types;
    /* Offsets where the substrings start, plus one extra item holding the
     * final offset. Points to trivial_offsets[] in the trivial case. */
    OFF* substr_offsets;
    /* Count of the substrings recorded so far. */
    int substr_count;
    /* Capacity of the heap arrays; zero in the trivial case, when there is
     * nothing to free. */
    int substr_alloc;
    /* In-structure storage used in the trivial case (single substring). */
    MD_TEXTTYPE trivial_types[1];
    OFF trivial_offsets[2];
};


/* Flag for md_build_attribute(): Do not process backslash escapes. */
#define MD_BUILD_ATTR_NO_ESCAPES 0x0001
1335
1336 static int
md_build_attr_append_substr(MD_CTX * ctx,MD_ATTRIBUTE_BUILD * build,MD_TEXTTYPE type,OFF off)1337 md_build_attr_append_substr(MD_CTX* ctx, MD_ATTRIBUTE_BUILD* build,
1338 MD_TEXTTYPE type, OFF off)
1339 {
1340 if(build->substr_count >= build->substr_alloc) {
1341 MD_TEXTTYPE* new_substr_types;
1342 OFF* new_substr_offsets;
1343
1344 build->substr_alloc = (build->substr_alloc > 0
1345 ? build->substr_alloc + build->substr_alloc / 2
1346 : 8);
1347 new_substr_types = (MD_TEXTTYPE*) realloc(build->substr_types,
1348 build->substr_alloc * sizeof(MD_TEXTTYPE));
1349 if(new_substr_types == NULL) {
1350 MD_LOG("realloc() failed.");
1351 return -1;
1352 }
1353 /* Note +1 to reserve space for final offset (== raw_size). */
1354 new_substr_offsets = (OFF*) realloc(build->substr_offsets,
1355 (build->substr_alloc+1) * sizeof(OFF));
1356 if(new_substr_offsets == NULL) {
1357 MD_LOG("realloc() failed.");
1358 free(new_substr_types);
1359 return -1;
1360 }
1361
1362 build->substr_types = new_substr_types;
1363 build->substr_offsets = new_substr_offsets;
1364 }
1365
1366 build->substr_types[build->substr_count] = type;
1367 build->substr_offsets[build->substr_count] = off;
1368 build->substr_count++;
1369 return 0;
1370 }
1371
1372 static void
md_free_attribute(MD_CTX * ctx,MD_ATTRIBUTE_BUILD * build)1373 md_free_attribute(MD_CTX* ctx, MD_ATTRIBUTE_BUILD* build)
1374 {
1375 if(build->substr_alloc > 0) {
1376 free(build->text);
1377 free(build->substr_types);
1378 free(build->substr_offsets);
1379 }
1380 }
1381
1382 static int
md_build_attribute(MD_CTX * ctx,const CHAR * raw_text,SZ raw_size,unsigned flags,MD_ATTRIBUTE * attr,MD_ATTRIBUTE_BUILD * build)1383 md_build_attribute(MD_CTX* ctx, const CHAR* raw_text, SZ raw_size,
1384 unsigned flags, MD_ATTRIBUTE* attr, MD_ATTRIBUTE_BUILD* build)
1385 {
1386 OFF raw_off, off;
1387 int is_trivial;
1388 int ret = 0;
1389
1390 memset(build, 0, sizeof(MD_ATTRIBUTE_BUILD));
1391
1392 /* If there is no backslash and no ampersand, build trivial attribute
1393 * without any malloc(). */
1394 is_trivial = TRUE;
1395 for(raw_off = 0; raw_off < raw_size; raw_off++) {
1396 if(ISANYOF3_(raw_text[raw_off], _T('\\'), _T('&'), _T('\0'))) {
1397 is_trivial = FALSE;
1398 break;
1399 }
1400 }
1401
1402 if(is_trivial) {
1403 build->text = (CHAR*) (raw_size ? raw_text : NULL);
1404 build->substr_types = build->trivial_types;
1405 build->substr_offsets = build->trivial_offsets;
1406 build->substr_count = 1;
1407 build->substr_alloc = 0;
1408 build->trivial_types[0] = MD_TEXT_NORMAL;
1409 build->trivial_offsets[0] = 0;
1410 build->trivial_offsets[1] = raw_size;
1411 off = raw_size;
1412 } else {
1413 build->text = (CHAR*) malloc(raw_size * sizeof(CHAR));
1414 if(build->text == NULL) {
1415 MD_LOG("malloc() failed.");
1416 goto abort;
1417 }
1418
1419 raw_off = 0;
1420 off = 0;
1421
1422 while(raw_off < raw_size) {
1423 if(raw_text[raw_off] == _T('\0')) {
1424 MD_CHECK(md_build_attr_append_substr(ctx, build, MD_TEXT_NULLCHAR, off));
1425 memcpy(build->text + off, raw_text + raw_off, 1);
1426 off++;
1427 raw_off++;
1428 continue;
1429 }
1430
1431 if(raw_text[raw_off] == _T('&')) {
1432 OFF ent_end;
1433
1434 if(md_is_entity_str(ctx, raw_text, raw_off, raw_size, &ent_end)) {
1435 MD_CHECK(md_build_attr_append_substr(ctx, build, MD_TEXT_ENTITY, off));
1436 memcpy(build->text + off, raw_text + raw_off, ent_end - raw_off);
1437 off += ent_end - raw_off;
1438 raw_off = ent_end;
1439 continue;
1440 }
1441 }
1442
1443 if(build->substr_count == 0 || build->substr_types[build->substr_count-1] != MD_TEXT_NORMAL)
1444 MD_CHECK(md_build_attr_append_substr(ctx, build, MD_TEXT_NORMAL, off));
1445
1446 if(!(flags & MD_BUILD_ATTR_NO_ESCAPES) &&
1447 raw_text[raw_off] == _T('\\') && raw_off+1 < raw_size &&
1448 (ISPUNCT_(raw_text[raw_off+1]) || ISNEWLINE_(raw_text[raw_off+1])))
1449 raw_off++;
1450
1451 build->text[off++] = raw_text[raw_off++];
1452 }
1453 build->substr_offsets[build->substr_count] = off;
1454 }
1455
1456 attr->text = build->text;
1457 attr->size = off;
1458 attr->substr_offsets = build->substr_offsets;
1459 attr->substr_types = build->substr_types;
1460 return 0;
1461
1462 abort:
1463 md_free_attribute(ctx, build);
1464 return -1;
1465 }
1466
1467
1468 /*********************************************
1469 *** Dictionary of Reference Definitions ***
1470 *********************************************/
1471
/* Parameters of the FNV-1a hash (the offset basis and the prime of its
 * 32-bit variant). */
#define MD_FNV1A_BASE 2166136261U
#define MD_FNV1A_PRIME 16777619U

/* Compute the FNV-1a hash of 'n' bytes of 'data'.
 *
 * 'base' is the initial hash value: MD_FNV1A_BASE when starting a new hash,
 * or the result of a previous call in order to hash more data into it.
 */
static inline unsigned
md_fnv1a(unsigned base, const void* data, size_t n)
{
    const unsigned char* bytes = (const unsigned char*) data;
    unsigned hash = base;

    /* FNV-1a: xor the byte in first, multiply by the prime afterwards. */
    while(n > 0) {
        hash = (hash ^ *bytes) * MD_FNV1A_PRIME;
        bytes++;
        n--;
    }

    return hash;
}
1489
1490
/* Record of one reference definition. */
struct MD_REF_DEF_tag {
    CHAR* label;        /* The label; label_size characters. */
    CHAR* title;        /* The title; title_size characters. */
    unsigned hash;      /* Hash of the label (md_link_label_hash()); filled in
                         * by md_build_ref_def_hashtable(). */
    SZ label_size;
    SZ title_size;
    /* NOTE(review): Presumably the range of the link destination in the
     * document text; confirm against the code filling the record. */
    OFF dest_beg;
    OFF dest_end;
    /* NOTE(review): Presumably tell whether label/title are heap buffers
     * owned by this record; confirm against the code freeing the records. */
    unsigned char label_needs_free : 1;
    unsigned char title_needs_free : 1;
};
1502
1503 /* Label equivalence is quite complicated with regards to whitespace and case
1504 * folding. This complicates computing a hash of it as well as direct comparison
1505 * of two labels. */
1506
1507 static unsigned
md_link_label_hash(const CHAR * label,SZ size)1508 md_link_label_hash(const CHAR* label, SZ size)
1509 {
1510 unsigned hash = MD_FNV1A_BASE;
1511 OFF off;
1512 unsigned codepoint;
1513 int is_whitespace = FALSE;
1514
1515 off = md_skip_unicode_whitespace(label, 0, size);
1516 while(off < size) {
1517 SZ char_size;
1518
1519 codepoint = md_decode_unicode(label, off, size, &char_size);
1520 is_whitespace = ISUNICODEWHITESPACE_(codepoint) || ISNEWLINE_(label[off]);
1521
1522 if(is_whitespace) {
1523 codepoint = ' ';
1524 hash = md_fnv1a(hash, &codepoint, sizeof(unsigned));
1525 off = md_skip_unicode_whitespace(label, off, size);
1526 } else {
1527 MD_UNICODE_FOLD_INFO fold_info;
1528
1529 md_get_unicode_fold_info(codepoint, &fold_info);
1530 hash = md_fnv1a(hash, fold_info.codepoints, fold_info.n_codepoints * sizeof(unsigned));
1531 off += char_size;
1532 }
1533 }
1534
1535 return hash;
1536 }
1537
1538 static OFF
md_link_label_cmp_load_fold_info(const CHAR * label,OFF off,SZ size,MD_UNICODE_FOLD_INFO * fold_info)1539 md_link_label_cmp_load_fold_info(const CHAR* label, OFF off, SZ size,
1540 MD_UNICODE_FOLD_INFO* fold_info)
1541 {
1542 unsigned codepoint;
1543 SZ char_size;
1544
1545 if(off >= size) {
1546 /* Treat end of a link label as a whitespace. */
1547 goto whitespace;
1548 }
1549
1550 if(ISNEWLINE_(label[off])) {
1551 /* Treat new lines as a whitespace. */
1552 off++;
1553 goto whitespace;
1554 }
1555
1556 codepoint = md_decode_unicode(label, off, size, &char_size);
1557 off += char_size;
1558 if(ISUNICODEWHITESPACE_(codepoint)) {
1559 /* Treat all whitespace as equivalent */
1560 goto whitespace;
1561 }
1562
1563 /* Get real folding info. */
1564 md_get_unicode_fold_info(codepoint, fold_info);
1565 return off;
1566
1567 whitespace:
1568 fold_info->codepoints[0] = _T(' ');
1569 fold_info->n_codepoints = 1;
1570 return md_skip_unicode_whitespace(label, off, size);
1571 }
1572
1573 static int
md_link_label_cmp(const CHAR * a_label,SZ a_size,const CHAR * b_label,SZ b_size)1574 md_link_label_cmp(const CHAR* a_label, SZ a_size, const CHAR* b_label, SZ b_size)
1575 {
1576 OFF a_off;
1577 OFF b_off;
1578 int a_reached_end = FALSE;
1579 int b_reached_end = FALSE;
1580 MD_UNICODE_FOLD_INFO a_fi = { { 0 }, 0 };
1581 MD_UNICODE_FOLD_INFO b_fi = { { 0 }, 0 };
1582 OFF a_fi_off = 0;
1583 OFF b_fi_off = 0;
1584 int cmp;
1585
1586 a_off = md_skip_unicode_whitespace(a_label, 0, a_size);
1587 b_off = md_skip_unicode_whitespace(b_label, 0, b_size);
1588 while(!a_reached_end || !b_reached_end) {
1589 /* If needed, load fold info for next char. */
1590 if(a_fi_off >= a_fi.n_codepoints) {
1591 a_fi_off = 0;
1592 a_off = md_link_label_cmp_load_fold_info(a_label, a_off, a_size, &a_fi);
1593 a_reached_end = (a_off >= a_size);
1594 }
1595 if(b_fi_off >= b_fi.n_codepoints) {
1596 b_fi_off = 0;
1597 b_off = md_link_label_cmp_load_fold_info(b_label, b_off, b_size, &b_fi);
1598 b_reached_end = (b_off >= b_size);
1599 }
1600
1601 cmp = b_fi.codepoints[b_fi_off] - a_fi.codepoints[a_fi_off];
1602 if(cmp != 0)
1603 return cmp;
1604
1605 a_fi_off++;
1606 b_fi_off++;
1607 }
1608
1609 return 0;
1610 }
1611
/* Contents of a hash table bucket shared by more than one reference
 * definition. Allocated as a single block together with the trailing array
 * (see md_build_ref_def_hashtable()). */
typedef struct MD_REF_DEF_LIST_tag MD_REF_DEF_LIST;
struct MD_REF_DEF_LIST_tag {
    int n_ref_defs;         /* Count of the used items in ref_defs[]. */
    int alloc_ref_defs;     /* Count of the allocated items in ref_defs[]. */
    MD_REF_DEF* ref_defs[]; /* Valid items always point into ctx->ref_defs[] */
};
1618
1619 static int
md_ref_def_cmp(const void * a,const void * b)1620 md_ref_def_cmp(const void* a, const void* b)
1621 {
1622 const MD_REF_DEF* a_ref = *(const MD_REF_DEF**)a;
1623 const MD_REF_DEF* b_ref = *(const MD_REF_DEF**)b;
1624
1625 if(a_ref->hash < b_ref->hash)
1626 return -1;
1627 else if(a_ref->hash > b_ref->hash)
1628 return +1;
1629 else
1630 return md_link_label_cmp(a_ref->label, a_ref->label_size, b_ref->label, b_ref->label_size);
1631 }
1632
1633 static int
md_ref_def_cmp_for_sort(const void * a,const void * b)1634 md_ref_def_cmp_for_sort(const void* a, const void* b)
1635 {
1636 int cmp;
1637
1638 cmp = md_ref_def_cmp(a, b);
1639
1640 /* Ensure stability of the sorting. */
1641 if(cmp == 0) {
1642 const MD_REF_DEF* a_ref = *(const MD_REF_DEF**)a;
1643 const MD_REF_DEF* b_ref = *(const MD_REF_DEF**)b;
1644
1645 if(a_ref < b_ref)
1646 cmp = -1;
1647 else if(a_ref > b_ref)
1648 cmp = +1;
1649 else
1650 cmp = 0;
1651 }
1652
1653 return cmp;
1654 }
1655
/* Build the hash table ctx->ref_def_hashtable[] over all the reference
 * definitions in ctx->ref_defs[], keyed by the hash of their labels. As a
 * side effect, the label hash is computed and stored in every ref. def.
 *
 * Returns 0 on success (also when there are no ref. defs at all; then no
 * table is created), or -1 if an allocation fails. In the latter case the
 * partially built table is left in ctx.
 * NOTE(review): Presumably the caller releases it through
 * md_free_ref_def_hashtable(); confirm.
 */
static int
md_build_ref_def_hashtable(MD_CTX* ctx)
{
    int i, j;

    if(ctx->n_ref_defs == 0)
        return 0;

    /* Make the table a bit larger (by 25 %) than the count of ref. defs. */
    ctx->ref_def_hashtable_size = (ctx->n_ref_defs * 5) / 4;
    ctx->ref_def_hashtable = malloc(ctx->ref_def_hashtable_size * sizeof(void*));
    if(ctx->ref_def_hashtable == NULL) {
        MD_LOG("malloc() failed.");
        goto abort;
    }
    memset(ctx->ref_def_hashtable, 0, ctx->ref_def_hashtable_size * sizeof(void*));

    /* Each member of ctx->ref_def_hashtable[] can be:
     * -- NULL,
     * -- pointer to the MD_REF_DEF in ctx->ref_defs[], or
     * -- pointer to a MD_REF_DEF_LIST, which holds multiple pointers to
     *    such MD_REF_DEFs.
     */
    for(i = 0; i < ctx->n_ref_defs; i++) {
        MD_REF_DEF* def = &ctx->ref_defs[i];
        void* bucket;
        MD_REF_DEF_LIST* list;

        def->hash = md_link_label_hash(def->label, def->label_size);
        bucket = ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size];

        if(bucket == NULL) {
            /* The bucket is empty. Make it just point to the def. */
            ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size] = def;
            continue;
        }

        /* The two kinds of non-NULL buckets are told apart by the address:
         * A pointer into ctx->ref_defs[] is a single ref. def. */
        if(ctx->ref_defs <= (MD_REF_DEF*) bucket && (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs) {
            /* The bucket already contains one ref. def. Lets see whether it
             * is the same label (ref. def. duplicate) or different one
             * (hash conflict). */
            MD_REF_DEF* old_def = (MD_REF_DEF*) bucket;

            if(md_link_label_cmp(def->label, def->label_size, old_def->label, old_def->label_size) == 0) {
                /* Duplicate label: Ignore this ref. def. */
                continue;
            }

            /* Make the bucket complex, i.e. able to hold more ref. defs. */
            list = (MD_REF_DEF_LIST*) malloc(sizeof(MD_REF_DEF_LIST) + 2 * sizeof(MD_REF_DEF*));
            if(list == NULL) {
                MD_LOG("malloc() failed.");
                goto abort;
            }
            list->ref_defs[0] = old_def;
            list->ref_defs[1] = def;
            list->n_ref_defs = 2;
            list->alloc_ref_defs = 2;
            ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size] = list;
            continue;
        }

        /* Append the def to the complex bucket list.
         *
         * Note in this case we ignore potential duplicates to avoid expensive
         * iterating over the complex bucket. Below, we revisit all the complex
         * buckets and handle it more cheaply after the complex bucket contents
         * is sorted. */
        list = (MD_REF_DEF_LIST*) bucket;
        if(list->n_ref_defs >= list->alloc_ref_defs) {
            /* Grow the list by 50 %. The table slot is updated only after
             * realloc() succeeds, so on a failure it still refers to the
             * old (valid) list. */
            int alloc_ref_defs = list->alloc_ref_defs + list->alloc_ref_defs / 2;
            MD_REF_DEF_LIST* list_tmp = (MD_REF_DEF_LIST*) realloc(list,
                        sizeof(MD_REF_DEF_LIST) + alloc_ref_defs * sizeof(MD_REF_DEF*));
            if(list_tmp == NULL) {
                MD_LOG("realloc() failed.");
                goto abort;
            }
            list = list_tmp;
            list->alloc_ref_defs = alloc_ref_defs;
            ctx->ref_def_hashtable[def->hash % ctx->ref_def_hashtable_size] = list;
        }

        list->ref_defs[list->n_ref_defs] = def;
        list->n_ref_defs++;
    }

    /* Sort the complex buckets so we can use bsearch() with them. */
    for(i = 0; i < ctx->ref_def_hashtable_size; i++) {
        void* bucket = ctx->ref_def_hashtable[i];
        MD_REF_DEF_LIST* list;

        /* Skip the empty buckets and the buckets with a single ref. def. */
        if(bucket == NULL)
            continue;
        if(ctx->ref_defs <= (MD_REF_DEF*) bucket && (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs)
            continue;

        list = (MD_REF_DEF_LIST*) bucket;
        qsort(list->ref_defs, list->n_ref_defs, sizeof(MD_REF_DEF*), md_ref_def_cmp_for_sort);

        /* Disable all duplicates in the complex bucket by forcing all such
         * records to point to the 1st such ref. def. I.e. no matter which
         * record is found during the lookup, it will always point to the right
         * ref. def. in ctx->ref_defs[]. */
        for(j = 1; j < list->n_ref_defs; j++) {
            if(md_ref_def_cmp(&list->ref_defs[j-1], &list->ref_defs[j]) == 0)
                list->ref_defs[j] = list->ref_defs[j-1];
        }
    }

    return 0;

abort:
    return -1;
}
1769
1770 static void
md_free_ref_def_hashtable(MD_CTX * ctx)1771 md_free_ref_def_hashtable(MD_CTX* ctx)
1772 {
1773 if(ctx->ref_def_hashtable != NULL) {
1774 int i;
1775
1776 for(i = 0; i < ctx->ref_def_hashtable_size; i++) {
1777 void* bucket = ctx->ref_def_hashtable[i];
1778 if(bucket == NULL)
1779 continue;
1780 if(ctx->ref_defs <= (MD_REF_DEF*) bucket && (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs)
1781 continue;
1782 free(bucket);
1783 }
1784
1785 free(ctx->ref_def_hashtable);
1786 }
1787 }
1788
1789 static const MD_REF_DEF*
md_lookup_ref_def(MD_CTX * ctx,const CHAR * label,SZ label_size)1790 md_lookup_ref_def(MD_CTX* ctx, const CHAR* label, SZ label_size)
1791 {
1792 unsigned hash;
1793 void* bucket;
1794
1795 if(ctx->ref_def_hashtable_size == 0)
1796 return NULL;
1797
1798 hash = md_link_label_hash(label, label_size);
1799 bucket = ctx->ref_def_hashtable[hash % ctx->ref_def_hashtable_size];
1800
1801 if(bucket == NULL) {
1802 return NULL;
1803 } else if(ctx->ref_defs <= (MD_REF_DEF*) bucket && (MD_REF_DEF*) bucket < ctx->ref_defs + ctx->n_ref_defs) {
1804 const MD_REF_DEF* def = (MD_REF_DEF*) bucket;
1805
1806 if(md_link_label_cmp(def->label, def->label_size, label, label_size) == 0)
1807 return def;
1808 else
1809 return NULL;
1810 } else {
1811 MD_REF_DEF_LIST* list = (MD_REF_DEF_LIST*) bucket;
1812 MD_REF_DEF key_buf;
1813 const MD_REF_DEF* key = &key_buf;
1814 const MD_REF_DEF** ret;
1815
1816 key_buf.label = (CHAR*) label;
1817 key_buf.label_size = label_size;
1818 key_buf.hash = md_link_label_hash(key_buf.label, key_buf.label_size);
1819
1820 ret = (const MD_REF_DEF**) bsearch(&key, list->ref_defs,
1821 list->n_ref_defs, sizeof(MD_REF_DEF*), md_ref_def_cmp);
1822 if(ret != NULL)
1823 return *ret;
1824 else
1825 return NULL;
1826 }
1827 }
1828
1829
1830 /***************************
1831 *** Recognizing Links ***
1832 ***************************/
1833
1834 /* Note this code is partially shared between processing inlines and blocks
1835 * as reference definitions and links share some helper parser functions.
1836 */
1837
/* Attributes of a recognized link: its destination and optional title. */
typedef struct MD_LINK_ATTR_tag MD_LINK_ATTR;
struct MD_LINK_ATTR_tag {
    /* Offsets delimiting the link destination. */
    OFF dest_beg;
    OFF dest_end;

    /* Link title; NULL with zero size when there is no title. */
    CHAR* title;
    SZ title_size;
    /* Non-zero when 'title' was allocated (a title merged from multiple
     * lines) rather than pointing to text owned elsewhere. */
    int title_needs_free;
};
1847
1848
1849 static int
md_is_link_label(MD_CTX * ctx,const MD_LINE * lines,int n_lines,OFF beg,OFF * p_end,int * p_beg_line_index,int * p_end_line_index,OFF * p_contents_beg,OFF * p_contents_end)1850 md_is_link_label(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg,
1851 OFF* p_end, int* p_beg_line_index, int* p_end_line_index,
1852 OFF* p_contents_beg, OFF* p_contents_end)
1853 {
1854 OFF off = beg;
1855 OFF contents_beg = 0;
1856 OFF contents_end = 0;
1857 int line_index = 0;
1858 int len = 0;
1859
1860 if(CH(off) != _T('['))
1861 return FALSE;
1862 off++;
1863
1864 while(1) {
1865 OFF line_end = lines[line_index].end;
1866
1867 while(off < line_end) {
1868 if(CH(off) == _T('\\') && off+1 < ctx->size && (ISPUNCT(off+1) || ISNEWLINE(off+1))) {
1869 if(contents_end == 0) {
1870 contents_beg = off;
1871 *p_beg_line_index = line_index;
1872 }
1873 contents_end = off + 2;
1874 off += 2;
1875 } else if(CH(off) == _T('[')) {
1876 return FALSE;
1877 } else if(CH(off) == _T(']')) {
1878 if(contents_beg < contents_end) {
1879 /* Success. */
1880 *p_contents_beg = contents_beg;
1881 *p_contents_end = contents_end;
1882 *p_end = off+1;
1883 *p_end_line_index = line_index;
1884 return TRUE;
1885 } else {
1886 /* Link label must have some non-whitespace contents. */
1887 return FALSE;
1888 }
1889 } else {
1890 unsigned codepoint;
1891 SZ char_size;
1892
1893 codepoint = md_decode_unicode(ctx->text, off, ctx->size, &char_size);
1894 if(!ISUNICODEWHITESPACE_(codepoint)) {
1895 if(contents_end == 0) {
1896 contents_beg = off;
1897 *p_beg_line_index = line_index;
1898 }
1899 contents_end = off + char_size;
1900 }
1901
1902 off += char_size;
1903 }
1904
1905 len++;
1906 if(len > 999)
1907 return FALSE;
1908 }
1909
1910 line_index++;
1911 len++;
1912 if(line_index < n_lines)
1913 off = lines[line_index].beg;
1914 else
1915 break;
1916 }
1917
1918 return FALSE;
1919 }
1920
1921 static int
md_is_link_destination_A(MD_CTX * ctx,OFF beg,OFF max_end,OFF * p_end,OFF * p_contents_beg,OFF * p_contents_end)1922 md_is_link_destination_A(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end,
1923 OFF* p_contents_beg, OFF* p_contents_end)
1924 {
1925 OFF off = beg;
1926
1927 if(off >= max_end || CH(off) != _T('<'))
1928 return FALSE;
1929 off++;
1930
1931 while(off < max_end) {
1932 if(CH(off) == _T('\\') && off+1 < max_end && ISPUNCT(off+1)) {
1933 off += 2;
1934 continue;
1935 }
1936
1937 if(ISNEWLINE(off) || CH(off) == _T('<'))
1938 return FALSE;
1939
1940 if(CH(off) == _T('>')) {
1941 /* Success. */
1942 *p_contents_beg = beg+1;
1943 *p_contents_end = off;
1944 *p_end = off+1;
1945 return TRUE;
1946 }
1947
1948 off++;
1949 }
1950
1951 return FALSE;
1952 }
1953
1954 static int
md_is_link_destination_B(MD_CTX * ctx,OFF beg,OFF max_end,OFF * p_end,OFF * p_contents_beg,OFF * p_contents_end)1955 md_is_link_destination_B(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end,
1956 OFF* p_contents_beg, OFF* p_contents_end)
1957 {
1958 OFF off = beg;
1959 int parenthesis_level = 0;
1960
1961 while(off < max_end) {
1962 if(CH(off) == _T('\\') && off+1 < max_end && ISPUNCT(off+1)) {
1963 off += 2;
1964 continue;
1965 }
1966
1967 if(ISWHITESPACE(off) || ISCNTRL(off))
1968 break;
1969
1970 /* Link destination may include balanced pairs of unescaped '(' ')'.
1971 * Note we limit the maximal nesting level by 32 to protect us from
1972 * https://github.com/jgm/cmark/issues/214 */
1973 if(CH(off) == _T('(')) {
1974 parenthesis_level++;
1975 if(parenthesis_level > 32)
1976 return FALSE;
1977 } else if(CH(off) == _T(')')) {
1978 if(parenthesis_level == 0)
1979 break;
1980 parenthesis_level--;
1981 }
1982
1983 off++;
1984 }
1985
1986 if(parenthesis_level != 0 || off == beg)
1987 return FALSE;
1988
1989 /* Success. */
1990 *p_contents_beg = beg;
1991 *p_contents_end = off;
1992 *p_end = off;
1993 return TRUE;
1994 }
1995
1996 static inline int
md_is_link_destination(MD_CTX * ctx,OFF beg,OFF max_end,OFF * p_end,OFF * p_contents_beg,OFF * p_contents_end)1997 md_is_link_destination(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end,
1998 OFF* p_contents_beg, OFF* p_contents_end)
1999 {
2000 if(CH(beg) == _T('<'))
2001 return md_is_link_destination_A(ctx, beg, max_end, p_end, p_contents_beg, p_contents_end);
2002 else
2003 return md_is_link_destination_B(ctx, beg, max_end, p_end, p_contents_beg, p_contents_end);
2004 }
2005
2006 static int
md_is_link_title(MD_CTX * ctx,const MD_LINE * lines,int n_lines,OFF beg,OFF * p_end,int * p_beg_line_index,int * p_end_line_index,OFF * p_contents_beg,OFF * p_contents_end)2007 md_is_link_title(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg,
2008 OFF* p_end, int* p_beg_line_index, int* p_end_line_index,
2009 OFF* p_contents_beg, OFF* p_contents_end)
2010 {
2011 OFF off = beg;
2012 CHAR closer_char;
2013 int line_index = 0;
2014
2015 /* White space with up to one line break. */
2016 while(off < lines[line_index].end && ISWHITESPACE(off))
2017 off++;
2018 if(off >= lines[line_index].end) {
2019 line_index++;
2020 if(line_index >= n_lines)
2021 return FALSE;
2022 off = lines[line_index].beg;
2023 }
2024 if(off == beg)
2025 return FALSE;
2026
2027 *p_beg_line_index = line_index;
2028
2029 /* First char determines how to detect end of it. */
2030 switch(CH(off)) {
2031 case _T('"'): closer_char = _T('"'); break;
2032 case _T('\''): closer_char = _T('\''); break;
2033 case _T('('): closer_char = _T(')'); break;
2034 default: return FALSE;
2035 }
2036 off++;
2037
2038 *p_contents_beg = off;
2039
2040 while(line_index < n_lines) {
2041 OFF line_end = lines[line_index].end;
2042
2043 while(off < line_end) {
2044 if(CH(off) == _T('\\') && off+1 < ctx->size && (ISPUNCT(off+1) || ISNEWLINE(off+1))) {
2045 off++;
2046 } else if(CH(off) == closer_char) {
2047 /* Success. */
2048 *p_contents_end = off;
2049 *p_end = off+1;
2050 *p_end_line_index = line_index;
2051 return TRUE;
2052 } else if(closer_char == _T(')') && CH(off) == _T('(')) {
2053 /* ()-style title cannot contain (unescaped '(')) */
2054 return FALSE;
2055 }
2056
2057 off++;
2058 }
2059
2060 line_index++;
2061 }
2062
2063 return FALSE;
2064 }
2065
2066 /* Returns 0 if it is not a reference definition.
2067 *
2068 * Returns N > 0 if it is a reference definition. N then corresponds to the
2069 * number of lines forming it). In this case the definition is stored for
2070 * resolving any links referring to it.
2071 *
2072 * Returns -1 in case of an error (out of memory).
2073 */
2074 static int
md_is_link_reference_definition(MD_CTX * ctx,const MD_LINE * lines,int n_lines)2075 md_is_link_reference_definition(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
2076 {
2077 OFF label_contents_beg;
2078 OFF label_contents_end;
2079 int label_contents_line_index = -1;
2080 int label_is_multiline = FALSE;
2081 OFF dest_contents_beg;
2082 OFF dest_contents_end;
2083 OFF title_contents_beg;
2084 OFF title_contents_end;
2085 int title_contents_line_index;
2086 int title_is_multiline = FALSE;
2087 OFF off;
2088 int line_index = 0;
2089 int tmp_line_index;
2090 MD_REF_DEF* def = NULL;
2091 int ret = 0;
2092
2093 /* Link label. */
2094 if(!md_is_link_label(ctx, lines, n_lines, lines[0].beg,
2095 &off, &label_contents_line_index, &line_index,
2096 &label_contents_beg, &label_contents_end))
2097 return FALSE;
2098 label_is_multiline = (label_contents_line_index != line_index);
2099
2100 /* Colon. */
2101 if(off >= lines[line_index].end || CH(off) != _T(':'))
2102 return FALSE;
2103 off++;
2104
2105 /* Optional white space with up to one line break. */
2106 while(off < lines[line_index].end && ISWHITESPACE(off))
2107 off++;
2108 if(off >= lines[line_index].end) {
2109 line_index++;
2110 if(line_index >= n_lines)
2111 return FALSE;
2112 off = lines[line_index].beg;
2113 }
2114
2115 /* Link destination. */
2116 if(!md_is_link_destination(ctx, off, lines[line_index].end,
2117 &off, &dest_contents_beg, &dest_contents_end))
2118 return FALSE;
2119
2120 /* (Optional) title. Note we interpret it as an title only if nothing
2121 * more follows on its last line. */
2122 if(md_is_link_title(ctx, lines + line_index, n_lines - line_index, off,
2123 &off, &title_contents_line_index, &tmp_line_index,
2124 &title_contents_beg, &title_contents_end)
2125 && off >= lines[line_index + tmp_line_index].end)
2126 {
2127 title_is_multiline = (tmp_line_index != title_contents_line_index);
2128 title_contents_line_index += line_index;
2129 line_index += tmp_line_index;
2130 } else {
2131 /* Not a title. */
2132 title_is_multiline = FALSE;
2133 title_contents_beg = off;
2134 title_contents_end = off;
2135 title_contents_line_index = 0;
2136 }
2137
2138 /* Nothing more can follow on the last line. */
2139 if(off < lines[line_index].end)
2140 return FALSE;
2141
2142 /* So, it _is_ a reference definition. Remember it. */
2143 if(ctx->n_ref_defs >= ctx->alloc_ref_defs) {
2144 MD_REF_DEF* new_defs;
2145
2146 ctx->alloc_ref_defs = (ctx->alloc_ref_defs > 0
2147 ? ctx->alloc_ref_defs + ctx->alloc_ref_defs / 2
2148 : 16);
2149 new_defs = (MD_REF_DEF*) realloc(ctx->ref_defs, ctx->alloc_ref_defs * sizeof(MD_REF_DEF));
2150 if(new_defs == NULL) {
2151 MD_LOG("realloc() failed.");
2152 goto abort;
2153 }
2154
2155 ctx->ref_defs = new_defs;
2156 }
2157 def = &ctx->ref_defs[ctx->n_ref_defs];
2158 memset(def, 0, sizeof(MD_REF_DEF));
2159
2160 if(label_is_multiline) {
2161 MD_CHECK(md_merge_lines_alloc(ctx, label_contents_beg, label_contents_end,
2162 lines + label_contents_line_index, n_lines - label_contents_line_index,
2163 _T(' '), &def->label, &def->label_size));
2164 def->label_needs_free = TRUE;
2165 } else {
2166 def->label = (CHAR*) STR(label_contents_beg);
2167 def->label_size = label_contents_end - label_contents_beg;
2168 }
2169
2170 if(title_is_multiline) {
2171 MD_CHECK(md_merge_lines_alloc(ctx, title_contents_beg, title_contents_end,
2172 lines + title_contents_line_index, n_lines - title_contents_line_index,
2173 _T('\n'), &def->title, &def->title_size));
2174 def->title_needs_free = TRUE;
2175 } else {
2176 def->title = (CHAR*) STR(title_contents_beg);
2177 def->title_size = title_contents_end - title_contents_beg;
2178 }
2179
2180 def->dest_beg = dest_contents_beg;
2181 def->dest_end = dest_contents_end;
2182
2183 /* Success. */
2184 ctx->n_ref_defs++;
2185 return line_index + 1;
2186
2187 abort:
2188 /* Failure. */
2189 if(def != NULL && def->label_needs_free)
2190 free(def->label);
2191 if(def != NULL && def->title_needs_free)
2192 free(def->title);
2193 return ret;
2194 }
2195
2196 static int
md_is_link_reference(MD_CTX * ctx,const MD_LINE * lines,int n_lines,OFF beg,OFF end,MD_LINK_ATTR * attr)2197 md_is_link_reference(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
2198 OFF beg, OFF end, MD_LINK_ATTR* attr)
2199 {
2200 const MD_REF_DEF* def;
2201 const MD_LINE* beg_line;
2202 const MD_LINE* end_line;
2203 CHAR* label;
2204 SZ label_size;
2205 int ret;
2206
2207 MD_ASSERT(CH(beg) == _T('[') || CH(beg) == _T('!'));
2208 MD_ASSERT(CH(end-1) == _T(']'));
2209
2210 beg += (CH(beg) == _T('!') ? 2 : 1);
2211 end--;
2212
2213 /* Find lines corresponding to the beg and end positions. */
2214 MD_ASSERT(lines[0].beg <= beg);
2215 beg_line = lines;
2216 while(beg >= beg_line->end)
2217 beg_line++;
2218
2219 MD_ASSERT(end <= lines[n_lines-1].end);
2220 end_line = beg_line;
2221 while(end >= end_line->end)
2222 end_line++;
2223
2224 if(beg_line != end_line) {
2225 MD_CHECK(md_merge_lines_alloc(ctx, beg, end, beg_line,
2226 n_lines - (beg_line - lines), _T(' '), &label, &label_size));
2227 } else {
2228 label = (CHAR*) STR(beg);
2229 label_size = end - beg;
2230 }
2231
2232 def = md_lookup_ref_def(ctx, label, label_size);
2233 if(def != NULL) {
2234 attr->dest_beg = def->dest_beg;
2235 attr->dest_end = def->dest_end;
2236 attr->title = def->title;
2237 attr->title_size = def->title_size;
2238 attr->title_needs_free = FALSE;
2239 }
2240
2241 if(beg_line != end_line)
2242 free(label);
2243
2244 ret = (def != NULL);
2245
2246 abort:
2247 return ret;
2248 }
2249
2250 static int
md_is_inline_link_spec(MD_CTX * ctx,const MD_LINE * lines,int n_lines,OFF beg,OFF * p_end,MD_LINK_ATTR * attr)2251 md_is_inline_link_spec(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
2252 OFF beg, OFF* p_end, MD_LINK_ATTR* attr)
2253 {
2254 int line_index = 0;
2255 int tmp_line_index;
2256 OFF title_contents_beg;
2257 OFF title_contents_end;
2258 int title_contents_line_index;
2259 int title_is_multiline;
2260 OFF off = beg;
2261 int ret = FALSE;
2262
2263 while(off >= lines[line_index].end)
2264 line_index++;
2265
2266 MD_ASSERT(CH(off) == _T('('));
2267 off++;
2268
2269 /* Optional white space with up to one line break. */
2270 while(off < lines[line_index].end && ISWHITESPACE(off))
2271 off++;
2272 if(off >= lines[line_index].end && ISNEWLINE(off)) {
2273 line_index++;
2274 if(line_index >= n_lines)
2275 return FALSE;
2276 off = lines[line_index].beg;
2277 }
2278
2279 /* Link destination may be omitted, but only when not also having a title. */
2280 if(off < ctx->size && CH(off) == _T(')')) {
2281 attr->dest_beg = off;
2282 attr->dest_end = off;
2283 attr->title = NULL;
2284 attr->title_size = 0;
2285 attr->title_needs_free = FALSE;
2286 off++;
2287 *p_end = off;
2288 return TRUE;
2289 }
2290
2291 /* Link destination. */
2292 if(!md_is_link_destination(ctx, off, lines[line_index].end,
2293 &off, &attr->dest_beg, &attr->dest_end))
2294 return FALSE;
2295
2296 /* (Optional) title. */
2297 if(md_is_link_title(ctx, lines + line_index, n_lines - line_index, off,
2298 &off, &title_contents_line_index, &tmp_line_index,
2299 &title_contents_beg, &title_contents_end))
2300 {
2301 title_is_multiline = (tmp_line_index != title_contents_line_index);
2302 title_contents_line_index += line_index;
2303 line_index += tmp_line_index;
2304 } else {
2305 /* Not a title. */
2306 title_is_multiline = FALSE;
2307 title_contents_beg = off;
2308 title_contents_end = off;
2309 title_contents_line_index = 0;
2310 }
2311
2312 /* Optional whitespace followed with final ')'. */
2313 while(off < lines[line_index].end && ISWHITESPACE(off))
2314 off++;
2315 if(off >= lines[line_index].end && ISNEWLINE(off)) {
2316 line_index++;
2317 if(line_index >= n_lines)
2318 return FALSE;
2319 off = lines[line_index].beg;
2320 }
2321 if(CH(off) != _T(')'))
2322 goto abort;
2323 off++;
2324
2325 if(title_contents_beg >= title_contents_end) {
2326 attr->title = NULL;
2327 attr->title_size = 0;
2328 attr->title_needs_free = FALSE;
2329 } else if(!title_is_multiline) {
2330 attr->title = (CHAR*) STR(title_contents_beg);
2331 attr->title_size = title_contents_end - title_contents_beg;
2332 attr->title_needs_free = FALSE;
2333 } else {
2334 MD_CHECK(md_merge_lines_alloc(ctx, title_contents_beg, title_contents_end,
2335 lines + title_contents_line_index, n_lines - title_contents_line_index,
2336 _T('\n'), &attr->title, &attr->title_size));
2337 attr->title_needs_free = TRUE;
2338 }
2339
2340 *p_end = off;
2341 ret = TRUE;
2342
2343 abort:
2344 return ret;
2345 }
2346
2347 static void
md_free_ref_defs(MD_CTX * ctx)2348 md_free_ref_defs(MD_CTX* ctx)
2349 {
2350 int i;
2351
2352 for(i = 0; i < ctx->n_ref_defs; i++) {
2353 MD_REF_DEF* def = &ctx->ref_defs[i];
2354
2355 if(def->label_needs_free)
2356 free(def->label);
2357 if(def->title_needs_free)
2358 free(def->title);
2359 }
2360
2361 free(ctx->ref_defs);
2362 }
2363
2364
2365 /******************************************
2366 *** Processing Inlines (a.k.a Spans) ***
2367 ******************************************/
2368
/* We process inlines in a few phases:
2370 *
2371 * (1) We go through the block text and collect all significant characters
2372 * which may start/end a span or some other significant position into
2373 * ctx->marks[]. Core of this is what md_collect_marks() does.
2374 *
2375 * We also do some very brief preliminary context-less analysis, whether
2376 * it might be opener or closer (e.g. of an emphasis span).
2377 *
2378 * This speeds the other steps as we do not need to re-iterate over all
2379 * characters anymore.
2380 *
 * (2) We analyze each potential mark type, in order of precedence.
2382 *
2383 * In each md_analyze_XXX() function, we re-iterate list of the marks,
2384 * skipping already resolved regions (in preceding precedences) and try to
2385 * resolve them.
2386 *
2387 * (2.1) For trivial marks, which are single (e.g. HTML entity), we just mark
2388 * them as resolved.
2389 *
2390 * (2.2) For range-type marks, we analyze whether the mark could be closer
2391 * and, if yes, whether there is some preceding opener it could satisfy.
2392 *
2393 * If not we check whether it could be really an opener and if yes, we
2394 * remember it so subsequent closers may resolve it.
2395 *
2396 * (3) Finally, when all marks were analyzed, we render the block contents
2397 * by calling MD_RENDERER::text() callback, interrupting by ::enter_span()
2398 * or ::close_span() whenever we reach a resolved mark.
2399 */
2400
2401
/* The mark structure.
 *
 * The member 'ch' holds one of the following characters:
 *
 * '\\': Maybe escape sequence.
 * '\0': NULL char.
 *  '*': Maybe (strong) emphasis start/end.
 *  '_': Maybe (strong) emphasis start/end.
 *  '~': Maybe strikethrough start/end (needs MD_FLAG_STRIKETHROUGH).
 *  '`': Maybe code span start/end.
 *  '&': Maybe start of entity.
 *  ';': Maybe end of entity.
 *  '<': Maybe start of raw HTML or autolink.
 *  '>': Maybe end of raw HTML or autolink.
 *  '[': Maybe start of link label or link text.
 *  '!': Equivalent of '[' for image.
 *  ']': Maybe end of link label or link text.
 *  '$': Maybe LaTeX math span start/end (needs MD_FLAG_LATEXMATHSPANS).
 *  '|': Maybe table cell boundary or wiki link delimiter (needs
 *       MD_FLAG_TABLES or MD_FLAG_WIKILINKS).
 *  '@': Maybe permissive e-mail auto-link (needs MD_FLAG_PERMISSIVEEMAILAUTOLINKS).
 *  ':': Maybe permissive URL auto-link (needs MD_FLAG_PERMISSIVEURLAUTOLINKS).
 *  '.': Maybe permissive WWW auto-link (needs MD_FLAG_PERMISSIVEWWWAUTOLINKS).
 *  'D': Dummy mark, it reserves a space for splitting a previous mark
 *       (e.g. emphasis) or to make more space for storing some special data
 *       related to the preceding mark (e.g. link).
 *
 * Note that not all instances of these chars in the text imply creation of the
 * structure. Only those which have (or may have, after we see more context)
 * the special meaning.
 *
 * (Keep this struct as small as possible so that as many of them as possible
 * fit into a CPU cache line.)
 */
struct MD_MARK_tag {
    /* Offsets of the mark's beginning and end.
     *
     * For dummy marks ('D'), the leading bytes of the structure may be
     * overwritten with a pointer; see md_mark_store_ptr(). */
    OFF beg;
    OFF end;

    /* For unresolved openers, 'prev' and 'next' form the chain of open openers
     * of given type 'ch'.
     *
     * During resolving, we disconnect from the chain and point to the
     * corresponding counterpart so opener points to its closer and vice versa.
     */
    int prev;
    int next;
    /* The mark character, see the list above. */
    CHAR ch;
    /* Combination of the MD_MARK_xxxx flags. */
    unsigned char flags;
};
2446
/* Mark flags (these apply to ALL mark types). */
#define MD_MARK_POTENTIAL_OPENER            0x01  /* Maybe opener. */
#define MD_MARK_POTENTIAL_CLOSER            0x02  /* Maybe closer. */
#define MD_MARK_OPENER                      0x04  /* Definitely opener. */
#define MD_MARK_CLOSER                      0x08  /* Definitely closer. */
#define MD_MARK_RESOLVED                    0x10  /* Resolved in any definite way. */

/* Mark flags specific for various mark types (so they can share bits).
 *
 * Note that MD_MARK_EMPH_INTRAWORD, MD_MARK_AUTOLINK and
 * MD_MARK_VALIDPERMISSIVEAUTOLINK all use the same bit (0x20); which meaning
 * applies depends on the mark type. The two MD_MARK_EMPH_MOD3_xxxx bits form
 * a small bit field selected by MD_MARK_EMPH_MOD3_MASK. */
#define MD_MARK_EMPH_INTRAWORD              0x20  /* Helper for the "rule of 3". */
#define MD_MARK_EMPH_MOD3_0                 0x40
#define MD_MARK_EMPH_MOD3_1                 0x80
#define MD_MARK_EMPH_MOD3_2                 (0x40 | 0x80)
#define MD_MARK_EMPH_MOD3_MASK              (0x40 | 0x80)
#define MD_MARK_AUTOLINK                    0x20  /* Distinguisher for '<', '>'. */
#define MD_MARK_VALIDPERMISSIVEAUTOLINK     0x20  /* For permissive autolinks. */
2462
2463 static MD_MARKCHAIN*
md_asterisk_chain(MD_CTX * ctx,unsigned flags)2464 md_asterisk_chain(MD_CTX* ctx, unsigned flags)
2465 {
2466 switch(flags & (MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_MASK)) {
2467 case MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_0: return &ASTERISK_OPENERS_intraword_mod3_0;
2468 case MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_1: return &ASTERISK_OPENERS_intraword_mod3_1;
2469 case MD_MARK_EMPH_INTRAWORD | MD_MARK_EMPH_MOD3_2: return &ASTERISK_OPENERS_intraword_mod3_2;
2470 case MD_MARK_EMPH_MOD3_0: return &ASTERISK_OPENERS_extraword_mod3_0;
2471 case MD_MARK_EMPH_MOD3_1: return &ASTERISK_OPENERS_extraword_mod3_1;
2472 case MD_MARK_EMPH_MOD3_2: return &ASTERISK_OPENERS_extraword_mod3_2;
2473 default: MD_UNREACHABLE();
2474 }
2475 return NULL;
2476 }
2477
2478 static MD_MARKCHAIN*
md_mark_chain(MD_CTX * ctx,int mark_index)2479 md_mark_chain(MD_CTX* ctx, int mark_index)
2480 {
2481 MD_MARK* mark = &ctx->marks[mark_index];
2482
2483 switch(mark->ch) {
2484 case _T('*'): return md_asterisk_chain(ctx, mark->flags);
2485 case _T('_'): return &UNDERSCORE_OPENERS;
2486 case _T('~'): return (mark->end - mark->beg == 1) ? &TILDE_OPENERS_1 : &TILDE_OPENERS_2;
2487 case _T('['): return &BRACKET_OPENERS;
2488 case _T('|'): return &TABLECELLBOUNDARIES;
2489 default: return NULL;
2490 }
2491 }
2492
2493 static MD_MARK*
md_push_mark(MD_CTX * ctx)2494 md_push_mark(MD_CTX* ctx)
2495 {
2496 if(ctx->n_marks >= ctx->alloc_marks) {
2497 MD_MARK* new_marks;
2498
2499 ctx->alloc_marks = (ctx->alloc_marks > 0
2500 ? ctx->alloc_marks + ctx->alloc_marks / 2
2501 : 64);
2502 new_marks = realloc(ctx->marks, ctx->alloc_marks * sizeof(MD_MARK));
2503 if(new_marks == NULL) {
2504 MD_LOG("realloc() failed.");
2505 return NULL;
2506 }
2507
2508 ctx->marks = new_marks;
2509 }
2510
2511 return &ctx->marks[ctx->n_marks++];
2512 }
2513
/* Push a new, uninitialized mark and store its address into 'mark'.
 *
 * The macro expects the following to exist in the scope where it is used:
 * variables 'ctx', 'mark' and 'ret', and the label 'abort'. If the mark
 * cannot be allocated, 'ret' is set to -1 and the control jumps to 'abort'. */
#define PUSH_MARK_()                                                    \
        do {                                                            \
            mark = md_push_mark(ctx);                                   \
            if(mark == NULL) {                                          \
                ret = -1;                                               \
                goto abort;                                             \
            }                                                           \
        } while(0)

/* Push a new mark (see PUSH_MARK_() for the requirements and the failure
 * handling) and initialize it: the mark gets the given character, range and
 * flags, and it is not linked to any other mark ('prev' and 'next' are -1). */
#define PUSH_MARK(ch_, beg_, end_, flags_)                              \
        do {                                                            \
            PUSH_MARK_();                                               \
            mark->beg = (beg_);                                         \
            mark->end = (end_);                                         \
            mark->prev = -1;                                            \
            mark->next = -1;                                            \
            mark->ch = (char)(ch_);                                     \
            mark->flags = (flags_);                                     \
        } while(0)
2533
2534
2535 static void
md_mark_chain_append(MD_CTX * ctx,MD_MARKCHAIN * chain,int mark_index)2536 md_mark_chain_append(MD_CTX* ctx, MD_MARKCHAIN* chain, int mark_index)
2537 {
2538 if(chain->tail >= 0)
2539 ctx->marks[chain->tail].next = mark_index;
2540 else
2541 chain->head = mark_index;
2542
2543 ctx->marks[mark_index].prev = chain->tail;
2544 ctx->marks[mark_index].next = -1;
2545 chain->tail = mark_index;
2546 }
2547
2548 /* Sometimes, we need to store a pointer into the mark. It is quite rare
2549 * so we do not bother to make MD_MARK use union, and it can only happen
2550 * for dummy marks. */
2551 static inline void
md_mark_store_ptr(MD_CTX * ctx,int mark_index,void * ptr)2552 md_mark_store_ptr(MD_CTX* ctx, int mark_index, void* ptr)
2553 {
2554 MD_MARK* mark = &ctx->marks[mark_index];
2555 MD_ASSERT(mark->ch == 'D');
2556
2557 /* Check only members beg and end are misused for this. */
2558 MD_ASSERT(sizeof(void*) <= 2 * sizeof(OFF));
2559 memcpy(mark, &ptr, sizeof(void*));
2560 }
2561
2562 static inline void*
md_mark_get_ptr(MD_CTX * ctx,int mark_index)2563 md_mark_get_ptr(MD_CTX* ctx, int mark_index)
2564 {
2565 void* ptr;
2566 MD_MARK* mark = &ctx->marks[mark_index];
2567 MD_ASSERT(mark->ch == 'D');
2568 memcpy(&ptr, mark, sizeof(void*));
2569 return ptr;
2570 }
2571
2572 static void
md_resolve_range(MD_CTX * ctx,MD_MARKCHAIN * chain,int opener_index,int closer_index)2573 md_resolve_range(MD_CTX* ctx, MD_MARKCHAIN* chain, int opener_index, int closer_index)
2574 {
2575 MD_MARK* opener = &ctx->marks[opener_index];
2576 MD_MARK* closer = &ctx->marks[closer_index];
2577
2578 /* Remove opener from the list of openers. */
2579 if(chain != NULL) {
2580 if(opener->prev >= 0)
2581 ctx->marks[opener->prev].next = opener->next;
2582 else
2583 chain->head = opener->next;
2584
2585 if(opener->next >= 0)
2586 ctx->marks[opener->next].prev = opener->prev;
2587 else
2588 chain->tail = opener->prev;
2589 }
2590
2591 /* Interconnect opener and closer and mark both as resolved. */
2592 opener->next = closer_index;
2593 opener->flags |= MD_MARK_OPENER | MD_MARK_RESOLVED;
2594 closer->prev = opener_index;
2595 closer->flags |= MD_MARK_CLOSER | MD_MARK_RESOLVED;
2596 }
2597
2598
/* Values for the parameter 'how' of md_rollback(). */
#define MD_ROLLBACK_ALL         0
#define MD_ROLLBACK_CROSSING    1

/* In the range ctx->marks[opener_index] ... [closer_index], undo some or all
 * resolvings according to these rules:
 *
 * (1) All openers BEFORE the range corresponding to any closer inside the
 *     range are un-resolved and they are re-added to their respective chains
 *     of unresolved openers. This ensures we can reuse the opener for closers
 *     AFTER the range.
 *
 * (2) If 'how' is MD_ROLLBACK_ALL, then ALL resolved marks inside the range
 *     are discarded.
 *
 * (3) If 'how' is MD_ROLLBACK_CROSSING, only closers with openers handled
 *     in (1) are discarded. I.e. pairs of openers and closers which are both
 *     inside the range are retained as well as any unpaired marks.
 *
 * Only the marks strictly between opener_index and closer_index are visited;
 * the two boundary marks themselves are not modified by the second loop.
 */
static void
md_rollback(MD_CTX* ctx, int opener_index, int closer_index, int how)
{
    int i;
    int mark_index;

    /* Cut all unresolved openers at the mark index, i.e. drop every opener
     * with index >= opener_index from the tail of each opener chain. */
    for(i = OPENERS_CHAIN_FIRST; i < OPENERS_CHAIN_LAST+1; i++) {
        MD_MARKCHAIN* chain = &ctx->mark_chains[i];

        while(chain->tail >= opener_index)
            chain->tail = ctx->marks[chain->tail].prev;

        if(chain->tail >= 0)
            ctx->marks[chain->tail].next = -1;
        else
            chain->head = -1;
    }

    /* Go backwards so that unresolved openers are re-added into their
     * respective chains, in the right order. */
    mark_index = closer_index - 1;
    while(mark_index > opener_index) {
        MD_MARK* mark = &ctx->marks[mark_index];
        /* Snapshot of the flags, taken before they may get reset below. */
        int mark_flags = mark->flags;
        int discard_flag = (how == MD_ROLLBACK_ALL);

        if(mark->flags & MD_MARK_CLOSER) {
            int mark_opener_index = mark->prev;

            /* Undo opener BEFORE the range. */
            if(mark_opener_index < opener_index) {
                MD_MARK* mark_opener = &ctx->marks[mark_opener_index];
                MD_MARKCHAIN* chain;

                mark_opener->flags &= ~(MD_MARK_OPENER | MD_MARK_CLOSER | MD_MARK_RESOLVED);
                /* NOTE(review): the chain is looked up for opener_index (the
                 * opener of the rolled-back range), not for mark_opener_index
                 * (the opener being un-resolved here). Rule (1) above speaks
                 * about "their respective chains" -- confirm whether this is
                 * intended. */
                chain = md_mark_chain(ctx, opener_index);
                if(chain != NULL) {
                    md_mark_chain_append(ctx, chain, mark_opener_index);
                    discard_flag = 1;
                }
            }
        }

        /* And reset our flags. */
        if(discard_flag)
            mark->flags &= ~(MD_MARK_OPENER | MD_MARK_CLOSER | MD_MARK_RESOLVED);

        /* Jump as far as we can over unresolved or non-interesting marks. */
        switch(how) {
            case MD_ROLLBACK_CROSSING:
                if((mark_flags & MD_MARK_CLOSER)  &&  mark->prev > opener_index) {
                    /* If we are closer with opener INSIDE the range, there may
                     * not be any other crosser inside the subrange. */
                    mark_index = mark->prev;
                    break;
                }
                /* Fall through. */
            default:
                mark_index--;
                break;
        }
    }
}
2681
2682 static void
md_build_mark_char_map(MD_CTX * ctx)2683 md_build_mark_char_map(MD_CTX* ctx)
2684 {
2685 memset(ctx->mark_char_map, 0, sizeof(ctx->mark_char_map));
2686
2687 ctx->mark_char_map['\\'] = 1;
2688 ctx->mark_char_map['*'] = 1;
2689 ctx->mark_char_map['_'] = 1;
2690 ctx->mark_char_map['`'] = 1;
2691 ctx->mark_char_map['&'] = 1;
2692 ctx->mark_char_map[';'] = 1;
2693 ctx->mark_char_map['<'] = 1;
2694 ctx->mark_char_map['>'] = 1;
2695 ctx->mark_char_map['['] = 1;
2696 ctx->mark_char_map['!'] = 1;
2697 ctx->mark_char_map[']'] = 1;
2698 ctx->mark_char_map['\0'] = 1;
2699
2700 if(ctx->parser.flags & MD_FLAG_STRIKETHROUGH)
2701 ctx->mark_char_map['~'] = 1;
2702
2703 if(ctx->parser.flags & MD_FLAG_LATEXMATHSPANS)
2704 ctx->mark_char_map['$'] = 1;
2705
2706 if(ctx->parser.flags & MD_FLAG_PERMISSIVEEMAILAUTOLINKS)
2707 ctx->mark_char_map['@'] = 1;
2708
2709 if(ctx->parser.flags & MD_FLAG_PERMISSIVEURLAUTOLINKS)
2710 ctx->mark_char_map[':'] = 1;
2711
2712 if(ctx->parser.flags & MD_FLAG_PERMISSIVEWWWAUTOLINKS)
2713 ctx->mark_char_map['.'] = 1;
2714
2715 if((ctx->parser.flags & MD_FLAG_TABLES) || (ctx->parser.flags & MD_FLAG_WIKILINKS))
2716 ctx->mark_char_map['|'] = 1;
2717
2718 if(ctx->parser.flags & MD_FLAG_COLLAPSEWHITESPACE) {
2719 int i;
2720
2721 for(i = 0; i < (int) sizeof(ctx->mark_char_map); i++) {
2722 if(ISWHITESPACE_(i))
2723 ctx->mark_char_map[i] = 1;
2724 }
2725 }
2726 }
2727
/* We limit the length of code span marks to at most CODESPAN_MARK_MAXLEN
 * backticks. This solves the pathological case of too many openers, each of
 * a different length: resolving them would otherwise be O(n^2). */
#define CODESPAN_MARK_MAXLEN    32
2732
2733 static int
md_is_code_span(MD_CTX * ctx,const MD_LINE * lines,int n_lines,OFF beg,OFF * p_opener_beg,OFF * p_opener_end,OFF * p_closer_beg,OFF * p_closer_end,OFF last_potential_closers[CODESPAN_MARK_MAXLEN],int * p_reached_paragraph_end)2734 md_is_code_span(MD_CTX* ctx, const MD_LINE* lines, int n_lines, OFF beg,
2735 OFF* p_opener_beg, OFF* p_opener_end,
2736 OFF* p_closer_beg, OFF* p_closer_end,
2737 OFF last_potential_closers[CODESPAN_MARK_MAXLEN],
2738 int* p_reached_paragraph_end)
2739 {
2740 OFF opener_beg = beg;
2741 OFF opener_end;
2742 OFF closer_beg;
2743 OFF closer_end;
2744 SZ mark_len;
2745 OFF line_end;
2746 int has_space_after_opener = FALSE;
2747 int has_eol_after_opener = FALSE;
2748 int has_space_before_closer = FALSE;
2749 int has_eol_before_closer = FALSE;
2750 int has_only_space = TRUE;
2751 int line_index = 0;
2752
2753 line_end = lines[0].end;
2754 opener_end = opener_beg;
2755 while(opener_end < line_end && CH(opener_end) == _T('`'))
2756 opener_end++;
2757 has_space_after_opener = (opener_end < line_end && CH(opener_end) == _T(' '));
2758 has_eol_after_opener = (opener_end == line_end);
2759
2760 /* The caller needs to know end of the opening mark even if we fail. */
2761 *p_opener_end = opener_end;
2762
2763 mark_len = opener_end - opener_beg;
2764 if(mark_len > CODESPAN_MARK_MAXLEN)
2765 return FALSE;
2766
2767 /* Check whether we already know there is no closer of this length.
2768 * If so, re-scan does no sense. This fixes issue #59. */
2769 if(last_potential_closers[mark_len-1] >= lines[n_lines-1].end ||
2770 (*p_reached_paragraph_end && last_potential_closers[mark_len-1] < opener_end))
2771 return FALSE;
2772
2773 closer_beg = opener_end;
2774 closer_end = opener_end;
2775
2776 /* Find closer mark. */
2777 while(TRUE) {
2778 while(closer_beg < line_end && CH(closer_beg) != _T('`')) {
2779 if(CH(closer_beg) != _T(' '))
2780 has_only_space = FALSE;
2781 closer_beg++;
2782 }
2783 closer_end = closer_beg;
2784 while(closer_end < line_end && CH(closer_end) == _T('`'))
2785 closer_end++;
2786
2787 if(closer_end - closer_beg == mark_len) {
2788 /* Success. */
2789 has_space_before_closer = (closer_beg > lines[line_index].beg && CH(closer_beg-1) == _T(' '));
2790 has_eol_before_closer = (closer_beg == lines[line_index].beg);
2791 break;
2792 }
2793
2794 if(closer_end - closer_beg > 0) {
2795 /* We have found a back-tick which is not part of the closer. */
2796 has_only_space = FALSE;
2797
2798 /* But if we eventually fail, remember it as a potential closer
2799 * of its own length for future attempts. This mitigates needs for
2800 * rescans. */
2801 if(closer_end - closer_beg < CODESPAN_MARK_MAXLEN) {
2802 if(closer_beg > last_potential_closers[closer_end - closer_beg - 1])
2803 last_potential_closers[closer_end - closer_beg - 1] = closer_beg;
2804 }
2805 }
2806
2807 if(closer_end >= line_end) {
2808 line_index++;
2809 if(line_index >= n_lines) {
2810 /* Reached end of the paragraph and still nothing. */
2811 *p_reached_paragraph_end = TRUE;
2812 return FALSE;
2813 }
2814 /* Try on the next line. */
2815 line_end = lines[line_index].end;
2816 closer_beg = lines[line_index].beg;
2817 } else {
2818 closer_beg = closer_end;
2819 }
2820 }
2821
2822 /* If there is a space or a new line both after and before the opener
2823 * (and if the code span is not made of spaces only), consume one initial
2824 * and one trailing space as part of the marks. */
2825 if(!has_only_space &&
2826 (has_space_after_opener || has_eol_after_opener) &&
2827 (has_space_before_closer || has_eol_before_closer))
2828 {
2829 if(has_space_after_opener)
2830 opener_end++;
2831 else
2832 opener_end = lines[1].beg;
2833
2834 if(has_space_before_closer)
2835 closer_beg--;
2836 else {
2837 closer_beg = lines[line_index-1].end;
2838 /* We need to eat the preceding "\r\n" but not any line trailing
2839 * spaces. */
2840 while(closer_beg < ctx->size && ISBLANK(closer_beg))
2841 closer_beg++;
2842 }
2843 }
2844
2845 *p_opener_beg = opener_beg;
2846 *p_opener_end = opener_end;
2847 *p_closer_beg = closer_beg;
2848 *p_closer_end = closer_end;
2849 return TRUE;
2850 }
2851
2852 static int
md_is_autolink_uri(MD_CTX * ctx,OFF beg,OFF max_end,OFF * p_end)2853 md_is_autolink_uri(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end)
2854 {
2855 OFF off = beg+1;
2856
2857 MD_ASSERT(CH(beg) == _T('<'));
2858
2859 /* Check for scheme. */
2860 if(off >= max_end || !ISASCII(off))
2861 return FALSE;
2862 off++;
2863 while(1) {
2864 if(off >= max_end)
2865 return FALSE;
2866 if(off - beg > 32)
2867 return FALSE;
2868 if(CH(off) == _T(':') && off - beg >= 3)
2869 break;
2870 if(!ISALNUM(off) && CH(off) != _T('+') && CH(off) != _T('-') && CH(off) != _T('.'))
2871 return FALSE;
2872 off++;
2873 }
2874
2875 /* Check the path after the scheme. */
2876 while(off < max_end && CH(off) != _T('>')) {
2877 if(ISWHITESPACE(off) || ISCNTRL(off) || CH(off) == _T('<'))
2878 return FALSE;
2879 off++;
2880 }
2881
2882 if(off >= max_end)
2883 return FALSE;
2884
2885 MD_ASSERT(CH(off) == _T('>'));
2886 *p_end = off+1;
2887 return TRUE;
2888 }
2889
2890 static int
md_is_autolink_email(MD_CTX * ctx,OFF beg,OFF max_end,OFF * p_end)2891 md_is_autolink_email(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end)
2892 {
2893 OFF off = beg + 1;
2894 int label_len;
2895
2896 MD_ASSERT(CH(beg) == _T('<'));
2897
2898 /* The code should correspond to this regexp:
2899 /^[a-zA-Z0-9.!#$%&'*+\/=?^_`{|}~-]+
2900 @[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?
2901 (?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$/
2902 */
2903
2904 /* Username (before '@'). */
2905 while(off < max_end && (ISALNUM(off) || ISANYOF(off, _T(".!#$%&'*+/=?^_`{|}~-"))))
2906 off++;
2907 if(off <= beg+1)
2908 return FALSE;
2909
2910 /* '@' */
2911 if(off >= max_end || CH(off) != _T('@'))
2912 return FALSE;
2913 off++;
2914
2915 /* Labels delimited with '.'; each label is sequence of 1 - 63 alnum
2916 * characters or '-', but '-' is not allowed as first or last char. */
2917 label_len = 0;
2918 while(off < max_end) {
2919 if(ISALNUM(off))
2920 label_len++;
2921 else if(CH(off) == _T('-') && label_len > 0)
2922 label_len++;
2923 else if(CH(off) == _T('.') && label_len > 0 && CH(off-1) != _T('-'))
2924 label_len = 0;
2925 else
2926 break;
2927
2928 if(label_len > 63)
2929 return FALSE;
2930
2931 off++;
2932 }
2933
2934 if(label_len <= 0 || off >= max_end || CH(off) != _T('>') || CH(off-1) == _T('-'))
2935 return FALSE;
2936
2937 *p_end = off+1;
2938 return TRUE;
2939 }
2940
2941 static int
md_is_autolink(MD_CTX * ctx,OFF beg,OFF max_end,OFF * p_end,int * p_missing_mailto)2942 md_is_autolink(MD_CTX* ctx, OFF beg, OFF max_end, OFF* p_end, int* p_missing_mailto)
2943 {
2944 if(md_is_autolink_uri(ctx, beg, max_end, p_end)) {
2945 *p_missing_mailto = FALSE;
2946 return TRUE;
2947 }
2948
2949 if(md_is_autolink_email(ctx, beg, max_end, p_end)) {
2950 *p_missing_mailto = TRUE;
2951 return TRUE;
2952 }
2953
2954 return FALSE;
2955 }
2956
/* Scan the given lines and push a mark for every character (sequence) which
 * may have a special meaning for inline processing.
 *
 * Characters not flagged in ctx->mark_char_map[] are skipped quickly; for
 * each flagged one, the branches below decide what (if anything) to push.
 * Back-slash escapes, code spans, raw HTML, "<...>" autolinks, non-trivial
 * whitespace and NULL characters are pushed as already resolved marks;
 * the others are pushed only as potential openers/closers (or with no flags).
 *
 * If table_mode is non-zero, '|' characters are collected too (the same
 * happens when the flag MD_FLAG_WIKILINKS is set).
 *
 * Returns the value of ret, which is initialized to zero.
 * NOTE(review): PUSH_MARK is defined outside of this function; presumably it
 * assigns the local variable mark and, on a failure, sets ret and jumps to
 * the label abort. Confirm against the macro definition.
 */
static int
md_collect_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mode)
{
    int i;
    int ret = 0;
    MD_MARK* mark;
    /* State shared by all md_is_code_span() calls made for these lines. */
    OFF codespan_last_potential_closers[CODESPAN_MARK_MAXLEN] = { 0 };
    int codespan_scanned_till_paragraph_end = FALSE;

    for(i = 0; i < n_lines; i++) {
        const MD_LINE* line = &lines[i];
        OFF off = line->beg;
        OFF line_end = line->end;

        while(TRUE) {
            CHAR ch;

#ifdef MD4C_USE_UTF16
    /* For UTF-16, mark_char_map[] covers only ASCII. */
    #define IS_MARK_CHAR(off)   ((CH(off) < SIZEOF_ARRAY(ctx->mark_char_map))  &&  \
                                (ctx->mark_char_map[(unsigned char) CH(off)]))
#else
    /* For 8-bit encodings, mark_char_map[] covers all 256 elements. */
    #define IS_MARK_CHAR(off)   (ctx->mark_char_map[(unsigned char) CH(off)])
#endif

            /* Optimization: Use some loop unrolling. */
            while(off + 3 < line_end  &&  !IS_MARK_CHAR(off+0)  &&  !IS_MARK_CHAR(off+1)
                                      &&  !IS_MARK_CHAR(off+2)  &&  !IS_MARK_CHAR(off+3))
                off += 4;
            while(off < line_end  &&  !IS_MARK_CHAR(off+0))
                off++;

            /* Nothing more of interest on this line. */
            if(off >= line_end)
                break;

            ch = CH(off);

            /* A backslash escape.
             * It can go beyond line->end as it may involve escaped new
             * line to form a hard break. */
            if(ch == _T('\\')  &&  off+1 < ctx->size  &&  (ISPUNCT(off+1) || ISNEWLINE(off+1))) {
                /* Hard-break cannot be on the last line of the block. */
                if(!ISNEWLINE(off+1)  ||  i+1 < n_lines)
                    PUSH_MARK(ch, off, off+2, MD_MARK_RESOLVED);
                off += 2;
                continue;
            }

            /* A potential (string) emphasis start/end. */
            if(ch == _T('*')  ||  ch == _T('_')) {
                OFF tmp = off+1;
                int left_level;     /* What precedes: 0 = whitespace; 1 = punctuation; 2 = other char. */
                int right_level;    /* What follows: 0 = whitespace; 1 = punctuation; 2 = other char. */

                /* tmp ends up just after the run of the same character. */
                while(tmp < line_end  &&  CH(tmp) == ch)
                    tmp++;

                /* The beginning of the line counts as whitespace. */
                if(off == line->beg  ||  ISUNICODEWHITESPACEBEFORE(off))
                    left_level = 0;
                else if(ISUNICODEPUNCTBEFORE(off))
                    left_level = 1;
                else
                    left_level = 2;

                /* The end of the line counts as whitespace. */
                if(tmp == line_end  ||  ISUNICODEWHITESPACE(tmp))
                    right_level = 0;
                else if(ISUNICODEPUNCT(tmp))
                    right_level = 1;
                else
                    right_level = 2;

                /* Intra-word underscore doesn't have special meaning. */
                if(ch == _T('_')  &&  left_level == 2  &&  right_level == 2) {
                    left_level = 0;
                    right_level = 0;
                }

                if(left_level != 0  ||  right_level != 0) {
                    unsigned flags = 0;

                    if(left_level > 0  &&  left_level >= right_level)
                        flags |= MD_MARK_POTENTIAL_CLOSER;
                    if(right_level > 0  &&  right_level >= left_level)
                        flags |= MD_MARK_POTENTIAL_OPENER;
                    if(left_level == 2  &&  right_level == 2)
                        flags |= MD_MARK_EMPH_INTRAWORD;

                    /* For "the rule of three" we need to remember the original
                     * size of the mark (modulo three), before we potentially
                     * split the mark when being later resolved partially by some
                     * shorter closer. */
                    switch((tmp - off) % 3) {
                        case 0: flags |= MD_MARK_EMPH_MOD3_0; break;
                        case 1: flags |= MD_MARK_EMPH_MOD3_1; break;
                        case 2: flags |= MD_MARK_EMPH_MOD3_2; break;
                    }

                    PUSH_MARK(ch, off, tmp, flags);

                    /* During resolving, multiple asterisks may have to be
                     * split into independent span start/ends. Consider e.g.
                     * "**foo* bar*". Therefore we push also some empty dummy
                     * marks to have enough space for that. */
                    off++;
                    while(off < tmp) {
                        PUSH_MARK('D', off, off, 0);
                        off++;
                    }
                    continue;
                }

                /* Surrounded by whitespace on both sides: no mark at all. */
                off = tmp;
                continue;
            }

            /* A potential code span start/end. */
            if(ch == _T('`')) {
                OFF opener_beg, opener_end;
                OFF closer_beg, closer_end;
                int is_code_span;

                is_code_span = md_is_code_span(ctx, lines + i, n_lines - i, off,
                                    &opener_beg, &opener_end, &closer_beg, &closer_end,
                                    codespan_last_potential_closers,
                                    &codespan_scanned_till_paragraph_end);
                if(is_code_span) {
                    PUSH_MARK(_T('`'), opener_beg, opener_end, MD_MARK_OPENER | MD_MARK_RESOLVED);
                    PUSH_MARK(_T('`'), closer_beg, closer_end, MD_MARK_CLOSER | MD_MARK_RESOLVED);
                    /* Link the two marks just pushed with each other. */
                    ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1;
                    ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2;

                    off = closer_end;

                    /* Advance the current line accordingly. */
                    while(off > line_end) {
                        i++;
                        line++;
                        line_end = line->end;
                    }
                    continue;
                }

                /* Not a code span: skip the whole back-tick run (opener_end
                 * is set by md_is_code_span() even on failure). */
                off = opener_end;
                continue;
            }

            /* A potential entity start. */
            if(ch == _T('&')) {
                PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_OPENER);
                off++;
                continue;
            }

            /* A potential entity end. */
            if(ch == _T(';')) {
                /* We surely cannot be entity unless the previous mark is '&'. */
                if(ctx->n_marks > 0  &&  ctx->marks[ctx->n_marks-1].ch == _T('&'))
                    PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_CLOSER);

                off++;
                continue;
            }

            /* A potential autolink or raw HTML start/end. */
            if(ch == _T('<')) {
                int is_autolink;
                OFF autolink_end;
                int missing_mailto;

                if(!(ctx->parser.flags & MD_FLAG_NOHTMLSPANS)) {
                    int is_html;
                    OFF html_end;

                    /* Given the nature of the raw HTML, we have to recognize
                     * it here. Doing so later in md_analyze_lt_gt() could
                     * open can of worms of quadratic complexity. */
                    is_html = md_is_html_any(ctx, lines + i, n_lines - i, off,
                                    lines[n_lines-1].end, &html_end);
                    if(is_html) {
                        /* Both marks are empty; they only delimit the range. */
                        PUSH_MARK(_T('<'), off, off, MD_MARK_OPENER | MD_MARK_RESOLVED);
                        PUSH_MARK(_T('>'), html_end, html_end, MD_MARK_CLOSER | MD_MARK_RESOLVED);
                        ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1;
                        ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2;
                        off = html_end;

                        /* Advance the current line accordingly. */
                        while(off > line_end) {
                            i++;
                            line++;
                            line_end = line->end;
                        }
                        continue;
                    }
                }

                is_autolink = md_is_autolink(ctx, off, lines[n_lines-1].end,
                                    &autolink_end, &missing_mailto);
                if(is_autolink) {
                    /* The opener character records which autolink form has
                     * matched: '@' for e-mail (missing_mailto), '<' for URI. */
                    PUSH_MARK((missing_mailto ? _T('@') : _T('<')), off, off+1,
                                MD_MARK_OPENER | MD_MARK_RESOLVED | MD_MARK_AUTOLINK);
                    PUSH_MARK(_T('>'), autolink_end-1, autolink_end,
                                MD_MARK_CLOSER | MD_MARK_RESOLVED | MD_MARK_AUTOLINK);
                    ctx->marks[ctx->n_marks-2].next = ctx->n_marks-1;
                    ctx->marks[ctx->n_marks-1].prev = ctx->n_marks-2;
                    off = autolink_end;
                    continue;
                }

                off++;
                continue;
            }

            /* A potential link or its part. */
            if(ch == _T('[')  ||  (ch == _T('!') && off+1 < line_end && CH(off+1) == _T('['))) {
                /* The mark covers "[" or "![". */
                OFF tmp = (ch == _T('[') ? off+1 : off+2);
                PUSH_MARK(ch, off, tmp, MD_MARK_POTENTIAL_OPENER);
                off = tmp;
                /* Two dummies to make enough place for data we need if it is
                 * a link. */
                PUSH_MARK('D', off, off, 0);
                PUSH_MARK('D', off, off, 0);
                continue;
            }
            if(ch == _T(']')) {
                PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_CLOSER);
                off++;
                continue;
            }

            /* A potential permissive e-mail autolink. */
            if(ch == _T('@')) {
                /* Require an alphanumeric character on both sides of the '@'
                 * and some more characters following on the line. */
                if(line->beg + 1 <= off  &&  ISALNUM(off-1)  &&
                    off + 3 < line->end  &&  ISALNUM(off+1))
                {
                    PUSH_MARK(ch, off, off+1, MD_MARK_POTENTIAL_OPENER);
                    /* Push a dummy as a reserve for a closer. */
                    PUSH_MARK('D', off, off, 0);
                }

                off++;
                continue;
            }

            /* A potential permissive URL autolink. */
            if(ch == _T(':')) {
                static struct {
                    const CHAR* scheme;
                    SZ scheme_size;
                    const CHAR* suffix;
                    SZ suffix_size;
                } scheme_map[] = {
                    /* In the order from the most frequently used, arguably. */
                    { _T("http"), 4,    _T("//"), 2 },
                    { _T("https"), 5,   _T("//"), 2 },
                    { _T("ftp"), 3,     _T("//"), 2 }
                };
                int scheme_index;

                for(scheme_index = 0; scheme_index < (int) SIZEOF_ARRAY(scheme_map); scheme_index++) {
                    const CHAR* scheme = scheme_map[scheme_index].scheme;
                    const SZ scheme_size = scheme_map[scheme_index].scheme_size;
                    const CHAR* suffix = scheme_map[scheme_index].suffix;
                    const SZ suffix_size = scheme_map[scheme_index].suffix_size;

                    /* The scheme has to precede the ':' directly and itself be
                     * preceded by the line start, whitespace or one of "*_~([".
                     * The suffix has to follow the ':' and must not be the very
                     * last thing on the line. */
                    if(line->beg + scheme_size <= off  &&  md_ascii_eq(STR(off-scheme_size), scheme, scheme_size)  &&
                        (line->beg + scheme_size == off || ISWHITESPACE(off-scheme_size-1) || ISANYOF(off-scheme_size-1, _T("*_~(["))) &&
                        off + 1 + suffix_size < line->end  &&  md_ascii_eq(STR(off+1), suffix, suffix_size))
                    {
                        /* The mark covers the whole "scheme:suffix". */
                        PUSH_MARK(ch, off-scheme_size, off+1+suffix_size, MD_MARK_POTENTIAL_OPENER);
                        /* Push a dummy as a reserve for a closer. */
                        PUSH_MARK('D', off, off, 0);
                        off += 1 + suffix_size;
                        continue;
                    }
                }

                off++;
                continue;
            }

            /* A potential permissive WWW autolink. */
            if(ch == _T('.')) {
                /* "www" has to precede the '.' directly and itself be preceded
                 * by the line start, whitespace or one of "*_~([". */
                if(line->beg + 3 <= off  &&  md_ascii_eq(STR(off-3), _T("www"), 3)  &&
                    (line->beg + 3 == off || ISWHITESPACE(off-4) || ISANYOF(off-4, _T("*_~(["))) &&
                    off + 1 < line_end)
                {
                    /* The mark covers the whole "www.". */
                    PUSH_MARK(ch, off-3, off+1, MD_MARK_POTENTIAL_OPENER);
                    /* Push a dummy as a reserve for a closer. */
                    PUSH_MARK('D', off, off, 0);
                    off++;
                    continue;
                }

                off++;
                continue;
            }

            /* A potential table cell boundary or wiki link label delimiter. */
            if((table_mode || ctx->parser.flags & MD_FLAG_WIKILINKS) && ch == _T('|')) {
                PUSH_MARK(ch, off, off+1, 0);
                off++;
                continue;
            }

            /* A potential strikethrough start/end. */
            if(ch == _T('~')) {
                OFF tmp = off+1;

                while(tmp < line_end  &&  CH(tmp) == _T('~'))
                    tmp++;

                /* Only runs of one or two tildes are considered. */
                if(tmp - off < 3) {
                    unsigned flags = 0;

                    if(tmp < line_end  &&  !ISUNICODEWHITESPACE(tmp))
                        flags |= MD_MARK_POTENTIAL_OPENER;
                    if(off > line->beg  &&  !ISUNICODEWHITESPACEBEFORE(off))
                        flags |= MD_MARK_POTENTIAL_CLOSER;
                    if(flags != 0)
                        PUSH_MARK(ch, off, tmp, flags);
                }

                off = tmp;
                continue;
            }

            /* A potential equation start/end */
            if(ch == _T('$')) {
                /* We can have at most two consecutive $ signs,
                 * where two dollar signs signify a display equation. */
                OFF tmp = off+1;

                while(tmp < line_end && CH(tmp) == _T('$'))
                    tmp++;

                if (tmp - off <= 2)
                    PUSH_MARK(ch, off, tmp, MD_MARK_POTENTIAL_OPENER | MD_MARK_POTENTIAL_CLOSER);
                off = tmp;
                continue;
            }

            /* Turn non-trivial whitespace into single space. */
            if(ISWHITESPACE_(ch)) {
                OFF tmp = off+1;

                while(tmp < line_end  &&  ISWHITESPACE(tmp))
                    tmp++;

                /* A lone ' ' is trivial and needs no mark. */
                if(tmp - off > 1  ||  ch != _T(' '))
                    PUSH_MARK(ch, off, tmp, MD_MARK_RESOLVED);

                off = tmp;
                continue;
            }

            /* NULL character. */
            if(ch == _T('\0')) {
                PUSH_MARK(ch, off, off+1, MD_MARK_RESOLVED);
                off++;
                continue;
            }

            /* A mark character with no applicable branch above (e.g. '!' not
             * followed by '['): just skip it. */
            off++;
        }
    }

    /* Add a dummy mark at the end of the mark vector to simplify
     * process_inlines(). */
    PUSH_MARK(127, ctx->size, ctx->size, MD_MARK_RESOLVED);

abort:
    return ret;
}
3331
3332 static void
md_analyze_bracket(MD_CTX * ctx,int mark_index)3333 md_analyze_bracket(MD_CTX* ctx, int mark_index)
3334 {
3335 /* We cannot really resolve links here as for that we would need
3336 * more context. E.g. a following pair of brackets (reference link),
3337 * or enclosing pair of brackets (if the inner is the link, the outer
3338 * one cannot be.)
3339 *
3340 * Therefore we here only construct a list of resolved '[' ']' pairs
3341 * ordered by position of the closer. This allows ur to analyze what is
3342 * or is not link in the right order, from inside to outside in case
3343 * of nested brackets.
3344 *
3345 * The resolving itself is deferred into md_resolve_links().
3346 */
3347
3348 MD_MARK* mark = &ctx->marks[mark_index];
3349
3350 if(mark->flags & MD_MARK_POTENTIAL_OPENER) {
3351 md_mark_chain_append(ctx, &BRACKET_OPENERS, mark_index);
3352 return;
3353 }
3354
3355 if(BRACKET_OPENERS.tail >= 0) {
3356 /* Pop the opener from the chain. */
3357 int opener_index = BRACKET_OPENERS.tail;
3358 MD_MARK* opener = &ctx->marks[opener_index];
3359 if(opener->prev >= 0)
3360 ctx->marks[opener->prev].next = -1;
3361 else
3362 BRACKET_OPENERS.head = -1;
3363 BRACKET_OPENERS.tail = opener->prev;
3364
3365 /* Interconnect the opener and closer. */
3366 opener->next = mark_index;
3367 mark->prev = opener_index;
3368
3369 /* Add the pair into chain of potential links for md_resolve_links().
3370 * Note we misuse opener->prev for this as opener->next points to its
3371 * closer. */
3372 if(ctx->unresolved_link_tail >= 0)
3373 ctx->marks[ctx->unresolved_link_tail].prev = opener_index;
3374 else
3375 ctx->unresolved_link_head = opener_index;
3376 ctx->unresolved_link_tail = opener_index;
3377 opener->prev = -1;
3378 }
3379 }
3380
/* Forward declaration; md_resolve_links() below calls this function for the
 * marks lying between the opener and the closer of a resolved link. */
static void md_analyze_link_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
                                     int mark_beg, int mark_end);
3384
/* Walk the list of bracket pairs starting at ctx->unresolved_link_head
 * (the pairs are chained through opener->prev) and decide for each of them
 * whether it forms a link or an image: a wiki link (if enabled by
 * MD_FLAG_WIKILINKS), a reference link, or an inline link.
 *
 * For each recognized link, the opener and the closer marks get resolved,
 * the closer is expanded to cover the trailing "[...]" or "(...)" part, the
 * link attributes are stored into the two dummy marks following the opener,
 * and the marks inside of the link are analyzed.
 *
 * Returns 0 on success, or -1 when md_is_link_reference() or
 * md_is_inline_link_spec() reports a negative value.
 */
static int
md_resolve_links(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
{
    int opener_index = ctx->unresolved_link_head;
    /* Extents of the most recently resolved link and image, respectively. */
    OFF last_link_beg = 0;
    OFF last_link_end = 0;
    OFF last_img_beg = 0;
    OFF last_img_end = 0;

    while(opener_index >= 0) {
        MD_MARK* opener = &ctx->marks[opener_index];
        int closer_index = opener->next;
        MD_MARK* closer = &ctx->marks[closer_index];
        /* opener->prev holds the index of the next pair's opener in the list. */
        int next_index = opener->prev;
        MD_MARK* next_opener;
        MD_MARK* next_closer;
        MD_LINK_ATTR attr;
        int is_link = FALSE;

        if(next_index >= 0) {
            next_opener = &ctx->marks[next_index];
            next_closer = &ctx->marks[next_opener->next];
        } else {
            next_opener = NULL;
            next_closer = NULL;
        }

        /* If nested ("[ [ ] ]"), we need to make sure that:
         *   - The outer does not end inside of (...) belonging to the inner.
         *   - The outer cannot be link if the inner is link (i.e. not image).
         *
         * (Note we here analyze from inner to outer as the marks are ordered
         * by closer->beg.)
         */
        if((opener->beg < last_link_beg  &&  closer->end < last_link_end)  ||
           (opener->beg < last_img_beg  &&  closer->end < last_img_end)  ||
           (opener->beg < last_link_end  &&  opener->ch == '['))
        {
            opener_index = next_index;
            continue;
        }

        /* Recognize and resolve wiki links.
         * Wiki-links maybe '[[destination]]' or '[[destination|label]]'.
         */
        if ((ctx->parser.flags & MD_FLAG_WIKILINKS) &&
            (opener->end - opener->beg == 1) &&         /* not image */
            next_opener != NULL &&                      /* double '[' opener */
            next_opener->ch == '[' &&
            (next_opener->beg == opener->beg - 1) &&
            (next_opener->end - next_opener->beg == 1) &&
            next_closer != NULL &&                      /* double ']' closer */
            next_closer->ch == ']' &&
            (next_closer->beg == closer->beg + 1) &&
            (next_closer->end - next_closer->beg == 1))
        {
            MD_MARK* delim = NULL;
            int delim_index;
            OFF dest_beg, dest_end;

            is_link = TRUE;

            /* We don't allow destination to be longer than 100 characters.
             * Lets scan to see whether there is '|'. (If not then the whole
             * wiki-link has to be below the 100 characters.) */
            delim_index = opener_index + 1;
            while(delim_index < closer_index) {
                MD_MARK* m = &ctx->marks[delim_index];
                if(m->ch == '|') {
                    delim = m;
                    break;
                }
                /* No need to look for the '|' any further than that. */
                if(m->ch != 'D'  &&  m->beg - opener->end > 100)
                    break;
                delim_index++;
            }
            dest_beg = opener->end;
            dest_end = (delim != NULL) ? delim->beg : closer->beg;
            /* The destination must be neither empty nor too long. */
            if(dest_end - dest_beg == 0 || dest_end - dest_beg > 100)
                is_link = FALSE;

            /* There may not be any new line in the destination. */
            if(is_link) {
                OFF off;
                for(off = dest_beg; off < dest_end; off++) {
                    if(ISNEWLINE(off)) {
                        is_link = FALSE;
                        break;
                    }
                }
            }

            if(is_link) {
                if(delim != NULL) {
                    if(delim->end < closer->beg) {
                        /* There is a label after the pipe: the opener mark
                         * is shrunk to end where the pipe begins. */
                        opener->end = delim->beg;
                    } else {
                        /* The pipe is just before the closer: [[foo|]] */
                        closer->beg = delim->beg;
                        delim = NULL;
                    }
                }

                /* Expand the inner pair to cover the outer brackets too. */
                opener->beg = next_opener->beg;
                opener->next = closer_index;
                opener->flags |= MD_MARK_OPENER | MD_MARK_RESOLVED;

                closer->end = next_closer->end;
                closer->prev = opener_index;
                closer->flags |= MD_MARK_CLOSER | MD_MARK_RESOLVED;

                last_link_beg = opener->beg;
                last_link_end = closer->end;

                if(delim != NULL) {
                    delim->flags |= MD_MARK_RESOLVED;
                    /* Discard the marks in the destination; analyze the
                     * marks in between as the link contents. */
                    md_rollback(ctx, opener_index, delim_index, MD_ROLLBACK_ALL);
                    md_analyze_link_contents(ctx, lines, n_lines, opener_index+1, closer_index);
                } else {
                    md_rollback(ctx, opener_index, closer_index, MD_ROLLBACK_ALL);
                }

                /* Skip also the outer bracket pair, consumed by the wiki link. */
                opener_index = next_opener->prev;
                continue;
            }
        }

        /* Is the closer immediately followed by another bracket pair? */
        if(next_opener != NULL  &&  next_opener->beg == closer->end) {
            if(next_closer->beg > closer->end + 1) {
                /* Might be full reference link. */
                is_link = md_is_link_reference(ctx, lines, n_lines, next_opener->beg, next_closer->end, &attr);
            } else {
                /* Might be shortcut reference link. */
                is_link = md_is_link_reference(ctx, lines, n_lines, opener->beg, closer->end, &attr);
            }

            if(is_link < 0)
                return -1;

            if(is_link) {
                /* Eat the 2nd "[...]". */
                closer->end = next_closer->end;
            }
        } else {
            if(closer->end < ctx->size  &&  CH(closer->end) == _T('(')) {
                /* Might be inline link. */
                OFF inline_link_end = UINT_MAX;

                is_link = md_is_inline_link_spec(ctx, lines, n_lines, closer->end, &inline_link_end, &attr);
                if(is_link < 0)
                    return -1;

                /* Check the closing ')' is not inside an already resolved range
                 * (i.e. a range with a higher priority), e.g. a code span. */
                if(is_link) {
                    int i = closer_index + 1;

                    while(i < ctx->n_marks) {
                        MD_MARK* mark = &ctx->marks[i];

                        if(mark->beg >= inline_link_end)
                            break;
                        if((mark->flags & (MD_MARK_OPENER | MD_MARK_RESOLVED)) == (MD_MARK_OPENER | MD_MARK_RESOLVED)) {
                            if(ctx->marks[mark->next].beg >= inline_link_end) {
                                /* Cancel the link status. */
                                if(attr.title_needs_free)
                                    free(attr.title);
                                is_link = FALSE;
                                break;
                            }

                            /* The resolved range ends before the ')': skip
                             * to the mark following its closer. */
                            i = mark->next + 1;
                        } else {
                            i++;
                        }
                    }
                }

                if(is_link) {
                    /* Eat the "(...)" */
                    closer->end = inline_link_end;
                }
            }

            if(!is_link) {
                /* Might be collapsed reference link. */
                is_link = md_is_link_reference(ctx, lines, n_lines, opener->beg, closer->end, &attr);
                if(is_link < 0)
                    return -1;
            }
        }

        if(is_link) {
            /* Resolve the brackets as a link. */
            opener->flags |= MD_MARK_OPENER | MD_MARK_RESOLVED;
            closer->flags |= MD_MARK_CLOSER | MD_MARK_RESOLVED;

            /* If it is a link, we store the destination and title in the two
             * dummy marks after the opener. */
            MD_ASSERT(ctx->marks[opener_index+1].ch == 'D');
            ctx->marks[opener_index+1].beg = attr.dest_beg;
            ctx->marks[opener_index+1].end = attr.dest_end;

            MD_ASSERT(ctx->marks[opener_index+2].ch == 'D');
            md_mark_store_ptr(ctx, opener_index+2, attr.title);
            /* The title might or might not have been allocated for us. */
            if(attr.title_needs_free)
                md_mark_chain_append(ctx, &PTR_CHAIN, opener_index+2);
            /* The title size is kept in the (otherwise unused) prev member. */
            ctx->marks[opener_index+2].prev = attr.title_size;

            /* Remember the extents for the nesting checks of outer pairs. */
            if(opener->ch == '[') {
                last_link_beg = opener->beg;
                last_link_end = closer->end;
            } else {
                last_img_beg = opener->beg;
                last_img_end = closer->end;
            }

            md_analyze_link_contents(ctx, lines, n_lines, opener_index+1, closer_index);
        }

        opener_index = next_index;
    }

    return 0;
}
3611
3612 /* Analyze whether the mark '&' starts a HTML entity.
3613 * If so, update its flags as well as flags of corresponding closer ';'. */
3614 static void
md_analyze_entity(MD_CTX * ctx,int mark_index)3615 md_analyze_entity(MD_CTX* ctx, int mark_index)
3616 {
3617 MD_MARK* opener = &ctx->marks[mark_index];
3618 MD_MARK* closer;
3619 OFF off;
3620
3621 /* Cannot be entity if there is no closer as the next mark.
3622 * (Any other mark between would mean strange character which cannot be
3623 * part of the entity.
3624 *
3625 * So we can do all the work on '&' and do not call this later for the
3626 * closing mark ';'.
3627 */
3628 if(mark_index + 1 >= ctx->n_marks)
3629 return;
3630 closer = &ctx->marks[mark_index+1];
3631 if(closer->ch != ';')
3632 return;
3633
3634 if(md_is_entity(ctx, opener->beg, closer->end, &off)) {
3635 MD_ASSERT(off == closer->end);
3636
3637 md_resolve_range(ctx, NULL, mark_index, mark_index+1);
3638 opener->end = closer->end;
3639 }
3640 }
3641
3642 static void
md_analyze_table_cell_boundary(MD_CTX * ctx,int mark_index)3643 md_analyze_table_cell_boundary(MD_CTX* ctx, int mark_index)
3644 {
3645 MD_MARK* mark = &ctx->marks[mark_index];
3646 mark->flags |= MD_MARK_RESOLVED;
3647
3648 md_mark_chain_append(ctx, &TABLECELLBOUNDARIES, mark_index);
3649 ctx->n_table_cell_boundaries++;
3650 }
3651
3652 /* Split a longer mark into two. The new mark takes the given count of
3653 * characters. May only be called if an adequate number of dummy 'D' marks
3654 * follows.
3655 */
3656 static int
md_split_emph_mark(MD_CTX * ctx,int mark_index,SZ n)3657 md_split_emph_mark(MD_CTX* ctx, int mark_index, SZ n)
3658 {
3659 MD_MARK* mark = &ctx->marks[mark_index];
3660 int new_mark_index = mark_index + (mark->end - mark->beg - n);
3661 MD_MARK* dummy = &ctx->marks[new_mark_index];
3662
3663 MD_ASSERT(mark->end - mark->beg > n);
3664 MD_ASSERT(dummy->ch == 'D');
3665
3666 memcpy(dummy, mark, sizeof(MD_MARK));
3667 mark->end -= n;
3668 dummy->beg = mark->end;
3669
3670 return new_mark_index;
3671 }
3672
3673 static void
md_analyze_emph(MD_CTX * ctx,int mark_index)3674 md_analyze_emph(MD_CTX* ctx, int mark_index)
3675 {
3676 MD_MARK* mark = &ctx->marks[mark_index];
3677 MD_MARKCHAIN* chain = md_mark_chain(ctx, mark_index);
3678
3679 /* If we can be a closer, try to resolve with the preceding opener. */
3680 if(mark->flags & MD_MARK_POTENTIAL_CLOSER) {
3681 MD_MARK* opener = NULL;
3682 int opener_index;
3683
3684 if(mark->ch == _T('*')) {
3685 MD_MARKCHAIN* opener_chains[6];
3686 int i, n_opener_chains;
3687 unsigned flags = mark->flags;
3688
3689 /* Apply the "rule of three". */
3690 n_opener_chains = 0;
3691 opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_intraword_mod3_0;
3692 if((flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_2)
3693 opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_intraword_mod3_1;
3694 if((flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_1)
3695 opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_intraword_mod3_2;
3696 opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_extraword_mod3_0;
3697 if(!(flags & MD_MARK_EMPH_INTRAWORD) || (flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_2)
3698 opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_extraword_mod3_1;
3699 if(!(flags & MD_MARK_EMPH_INTRAWORD) || (flags & MD_MARK_EMPH_MOD3_MASK) != MD_MARK_EMPH_MOD3_1)
3700 opener_chains[n_opener_chains++] = &ASTERISK_OPENERS_extraword_mod3_2;
3701
3702 /* Opener is the most recent mark from the allowed chains. */
3703 for(i = 0; i < n_opener_chains; i++) {
3704 if(opener_chains[i]->tail >= 0) {
3705 int tmp_index = opener_chains[i]->tail;
3706 MD_MARK* tmp_mark = &ctx->marks[tmp_index];
3707 if(opener == NULL || tmp_mark->end > opener->end) {
3708 opener_index = tmp_index;
3709 opener = tmp_mark;
3710 }
3711 }
3712 }
3713 } else {
3714 /* Simple emph. mark */
3715 if(chain->tail >= 0) {
3716 opener_index = chain->tail;
3717 opener = &ctx->marks[opener_index];
3718 }
3719 }
3720
3721 /* Resolve, if we have found matching opener. */
3722 if(opener != NULL) {
3723 SZ opener_size = opener->end - opener->beg;
3724 SZ closer_size = mark->end - mark->beg;
3725 MD_MARKCHAIN* opener_chain = md_mark_chain(ctx, opener_index);
3726
3727 if(opener_size > closer_size) {
3728 opener_index = md_split_emph_mark(ctx, opener_index, closer_size);
3729 md_mark_chain_append(ctx, opener_chain, opener_index);
3730 } else if(opener_size < closer_size) {
3731 md_split_emph_mark(ctx, mark_index, closer_size - opener_size);
3732 }
3733
3734 md_rollback(ctx, opener_index, mark_index, MD_ROLLBACK_CROSSING);
3735 md_resolve_range(ctx, opener_chain, opener_index, mark_index);
3736 return;
3737 }
3738 }
3739
3740 /* If we could not resolve as closer, we may be yet be an opener. */
3741 if(mark->flags & MD_MARK_POTENTIAL_OPENER)
3742 md_mark_chain_append(ctx, chain, mark_index);
3743 }
3744
3745 static void
md_analyze_tilde(MD_CTX * ctx,int mark_index)3746 md_analyze_tilde(MD_CTX* ctx, int mark_index)
3747 {
3748 MD_MARK* mark = &ctx->marks[mark_index];
3749 MD_MARKCHAIN* chain = md_mark_chain(ctx, mark_index);
3750
3751 /* We attempt to be Github Flavored Markdown compatible here. GFM accepts
3752 * only tildes sequences of length 1 and 2, and the length of the opener
3753 * and closer has to match. */
3754
3755 if((mark->flags & MD_MARK_POTENTIAL_CLOSER) && chain->head >= 0) {
3756 int opener_index = chain->head;
3757
3758 md_rollback(ctx, opener_index, mark_index, MD_ROLLBACK_CROSSING);
3759 md_resolve_range(ctx, chain, opener_index, mark_index);
3760 return;
3761 }
3762
3763 if(mark->flags & MD_MARK_POTENTIAL_OPENER)
3764 md_mark_chain_append(ctx, chain, mark_index);
3765 }
3766
3767 static void
md_analyze_dollar(MD_CTX * ctx,int mark_index)3768 md_analyze_dollar(MD_CTX* ctx, int mark_index)
3769 {
3770 /* This should mimic the way inline equations work in LaTeX, so there
3771 * can only ever be one item in the chain (i.e. the dollars can't be
3772 * nested). This is basically the same as the md_analyze_tilde function,
3773 * except that we require matching openers and closers to be of the same
3774 * length.
3775 *
3776 * E.g.: $abc$$def$$ => abc (display equation) def (end equation) */
3777 if(DOLLAR_OPENERS.head >= 0) {
3778 /* If the potential closer has a non-matching number of $, discard */
3779 MD_MARK* open = &ctx->marks[DOLLAR_OPENERS.head];
3780 MD_MARK* close = &ctx->marks[mark_index];
3781
3782 int opener_index = DOLLAR_OPENERS.head;
3783 md_rollback(ctx, opener_index, mark_index, MD_ROLLBACK_ALL);
3784 if (open->end - open->beg == close->end - close->beg) {
3785 /* We are the matching closer */
3786 md_resolve_range(ctx, &DOLLAR_OPENERS, opener_index, mark_index);
3787 } else {
3788 /* We don't match the opener, so discard old opener and insert as opener */
3789 md_mark_chain_append(ctx, &DOLLAR_OPENERS, mark_index);
3790 }
3791 } else {
3792 /* No unmatched openers, so we are opener */
3793 md_mark_chain_append(ctx, &DOLLAR_OPENERS, mark_index);
3794 }
3795 }
3796
3797 static void
md_analyze_permissive_url_autolink(MD_CTX * ctx,int mark_index)3798 md_analyze_permissive_url_autolink(MD_CTX* ctx, int mark_index)
3799 {
3800 MD_MARK* opener = &ctx->marks[mark_index];
3801 int closer_index = mark_index + 1;
3802 MD_MARK* closer = &ctx->marks[closer_index];
3803 MD_MARK* next_resolved_mark;
3804 OFF off = opener->end;
3805 int n_dots = FALSE;
3806 int has_underscore_in_last_seg = FALSE;
3807 int has_underscore_in_next_to_last_seg = FALSE;
3808 int n_opened_parenthesis = 0;
3809
3810 /* Check for domain. */
3811 while(off < ctx->size) {
3812 if(ISALNUM(off) || CH(off) == _T('-')) {
3813 off++;
3814 } else if(CH(off) == _T('.')) {
3815 /* We must see at least one period. */
3816 n_dots++;
3817 has_underscore_in_next_to_last_seg = has_underscore_in_last_seg;
3818 has_underscore_in_last_seg = FALSE;
3819 off++;
3820 } else if(CH(off) == _T('_')) {
3821 /* No underscore may be present in the last two domain segments. */
3822 has_underscore_in_last_seg = TRUE;
3823 off++;
3824 } else {
3825 break;
3826 }
3827 }
3828 if(off > opener->end && CH(off-1) == _T('.')) {
3829 off--;
3830 n_dots--;
3831 }
3832 if(off <= opener->end || n_dots == 0 || has_underscore_in_next_to_last_seg || has_underscore_in_last_seg)
3833 return;
3834
3835 /* Check for path. */
3836 next_resolved_mark = closer + 1;
3837 while(next_resolved_mark->ch == 'D' || !(next_resolved_mark->flags & MD_MARK_RESOLVED))
3838 next_resolved_mark++;
3839 while(off < next_resolved_mark->beg && CH(off) != _T('<') && !ISWHITESPACE(off) && !ISNEWLINE(off)) {
3840 /* Parenthesis must be balanced. */
3841 if(CH(off) == _T('(')) {
3842 n_opened_parenthesis++;
3843 } else if(CH(off) == _T(')')) {
3844 if(n_opened_parenthesis > 0)
3845 n_opened_parenthesis--;
3846 else
3847 break;
3848 }
3849
3850 off++;
3851 }
3852 /* These cannot be last char In such case they are more likely normal
3853 * punctuation. */
3854 if(ISANYOF(off-1, _T("?!.,:*_~")))
3855 off--;
3856
3857 /* Ok. Lets call it auto-link. Adapt opener and create closer to zero
3858 * length so all the contents becomes the link text. */
3859 MD_ASSERT(closer->ch == 'D');
3860 opener->end = opener->beg;
3861 closer->ch = opener->ch;
3862 closer->beg = off;
3863 closer->end = off;
3864 md_resolve_range(ctx, NULL, mark_index, closer_index);
3865 }
3866
3867 /* The permissive autolinks do not have to be enclosed in '<' '>' but we
3868 * instead impose stricter rules what is understood as an e-mail address
3869 * here. Actually any non-alphanumeric characters with exception of '.'
3870 * are prohibited both in username and after '@'. */
3871 static void
md_analyze_permissive_email_autolink(MD_CTX * ctx,int mark_index)3872 md_analyze_permissive_email_autolink(MD_CTX* ctx, int mark_index)
3873 {
3874 MD_MARK* opener = &ctx->marks[mark_index];
3875 int closer_index;
3876 MD_MARK* closer;
3877 OFF beg = opener->beg;
3878 OFF end = opener->end;
3879 int dot_count = 0;
3880
3881 MD_ASSERT(CH(beg) == _T('@'));
3882
3883 /* Scan for name before '@'. */
3884 while(beg > 0 && (ISALNUM(beg-1) || ISANYOF(beg-1, _T(".-_+"))))
3885 beg--;
3886
3887 /* Scan for domain after '@'. */
3888 while(end < ctx->size && (ISALNUM(end) || ISANYOF(end, _T(".-_")))) {
3889 if(CH(end) == _T('.'))
3890 dot_count++;
3891 end++;
3892 }
3893 if(CH(end-1) == _T('.')) { /* Final '.' not part of it. */
3894 dot_count--;
3895 end--;
3896 }
3897 else if(ISANYOF2(end-1, _T('-'), _T('_'))) /* These are forbidden at the end. */
3898 return;
3899 if(CH(end-1) == _T('@') || dot_count == 0)
3900 return;
3901
3902 /* Ok. Lets call it auto-link. Adapt opener and create closer to zero
3903 * length so all the contents becomes the link text. */
3904 closer_index = mark_index + 1;
3905 closer = &ctx->marks[closer_index];
3906 MD_ASSERT(closer->ch == 'D');
3907
3908 opener->beg = beg;
3909 opener->end = beg;
3910 closer->ch = opener->ch;
3911 closer->beg = end;
3912 closer->end = end;
3913 md_resolve_range(ctx, NULL, mark_index, closer_index);
3914 }
3915
3916 static inline void
md_analyze_marks(MD_CTX * ctx,const MD_LINE * lines,int n_lines,int mark_beg,int mark_end,const CHAR * mark_chars)3917 md_analyze_marks(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
3918 int mark_beg, int mark_end, const CHAR* mark_chars)
3919 {
3920 int i = mark_beg;
3921
3922 while(i < mark_end) {
3923 MD_MARK* mark = &ctx->marks[i];
3924
3925 /* Skip resolved spans. */
3926 if(mark->flags & MD_MARK_RESOLVED) {
3927 if(mark->flags & MD_MARK_OPENER) {
3928 MD_ASSERT(i < mark->next);
3929 i = mark->next + 1;
3930 } else {
3931 i++;
3932 }
3933 continue;
3934 }
3935
3936 /* Skip marks we do not want to deal with. */
3937 if(!ISANYOF_(mark->ch, mark_chars)) {
3938 i++;
3939 continue;
3940 }
3941
3942 /* Analyze the mark. */
3943 switch(mark->ch) {
3944 case '[': /* Pass through. */
3945 case '!': /* Pass through. */
3946 case ']': md_analyze_bracket(ctx, i); break;
3947 case '&': md_analyze_entity(ctx, i); break;
3948 case '|': md_analyze_table_cell_boundary(ctx, i); break;
3949 case '_': /* Pass through. */
3950 case '*': md_analyze_emph(ctx, i); break;
3951 case '~': md_analyze_tilde(ctx, i); break;
3952 case '$': md_analyze_dollar(ctx, i); break;
3953 case '.': /* Pass through. */
3954 case ':': md_analyze_permissive_url_autolink(ctx, i); break;
3955 case '@': md_analyze_permissive_email_autolink(ctx, i); break;
3956 }
3957
3958 i++;
3959 }
3960 }
3961
3962 /* Analyze marks (build ctx->marks). */
3963 static int
md_analyze_inlines(MD_CTX * ctx,const MD_LINE * lines,int n_lines,int table_mode)3964 md_analyze_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines, int table_mode)
3965 {
3966 int ret;
3967
3968 /* Reset the previously collected stack of marks. */
3969 ctx->n_marks = 0;
3970
3971 /* Collect all marks. */
3972 MD_CHECK(md_collect_marks(ctx, lines, n_lines, table_mode));
3973
3974 /* We analyze marks in few groups to handle their precedence. */
3975 /* (1) Entities; code spans; autolinks; raw HTML. */
3976 md_analyze_marks(ctx, lines, n_lines, 0, ctx->n_marks, _T("&"));
3977
3978 /* (2) Links. */
3979 md_analyze_marks(ctx, lines, n_lines, 0, ctx->n_marks, _T("[]!"));
3980 MD_CHECK(md_resolve_links(ctx, lines, n_lines));
3981 BRACKET_OPENERS.head = -1;
3982 BRACKET_OPENERS.tail = -1;
3983 ctx->unresolved_link_head = -1;
3984 ctx->unresolved_link_tail = -1;
3985
3986 if(table_mode) {
3987 /* (3) Analyze table cell boundaries.
3988 * Note we reset TABLECELLBOUNDARIES chain prior to the call md_analyze_marks(),
3989 * not after, because caller may need it. */
3990 MD_ASSERT(n_lines == 1);
3991 TABLECELLBOUNDARIES.head = -1;
3992 TABLECELLBOUNDARIES.tail = -1;
3993 ctx->n_table_cell_boundaries = 0;
3994 md_analyze_marks(ctx, lines, n_lines, 0, ctx->n_marks, _T("|"));
3995 return ret;
3996 }
3997
3998 /* (4) Emphasis and strong emphasis; permissive autolinks. */
3999 md_analyze_link_contents(ctx, lines, n_lines, 0, ctx->n_marks);
4000
4001 abort:
4002 return ret;
4003 }
4004
4005 static void
md_analyze_link_contents(MD_CTX * ctx,const MD_LINE * lines,int n_lines,int mark_beg,int mark_end)4006 md_analyze_link_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines,
4007 int mark_beg, int mark_end)
4008 {
4009 int i;
4010
4011 md_analyze_marks(ctx, lines, n_lines, mark_beg, mark_end, _T("*_~$@:."));
4012
4013 for(i = OPENERS_CHAIN_FIRST; i <= OPENERS_CHAIN_LAST; i++) {
4014 ctx->mark_chains[i].head = -1;
4015 ctx->mark_chains[i].tail = -1;
4016 }
4017 }
4018
4019 static int
md_enter_leave_span_a(MD_CTX * ctx,int enter,MD_SPANTYPE type,const CHAR * dest,SZ dest_size,int prohibit_escapes_in_dest,const CHAR * title,SZ title_size)4020 md_enter_leave_span_a(MD_CTX* ctx, int enter, MD_SPANTYPE type,
4021 const CHAR* dest, SZ dest_size, int prohibit_escapes_in_dest,
4022 const CHAR* title, SZ title_size)
4023 {
4024 MD_ATTRIBUTE_BUILD href_build = { 0 };
4025 MD_ATTRIBUTE_BUILD title_build = { 0 };
4026 MD_SPAN_A_DETAIL det;
4027 int ret = 0;
4028
4029 /* Note we here rely on fact that MD_SPAN_A_DETAIL and
4030 * MD_SPAN_IMG_DETAIL are binary-compatible. */
4031 memset(&det, 0, sizeof(MD_SPAN_A_DETAIL));
4032 MD_CHECK(md_build_attribute(ctx, dest, dest_size,
4033 (prohibit_escapes_in_dest ? MD_BUILD_ATTR_NO_ESCAPES : 0),
4034 &det.href, &href_build));
4035 MD_CHECK(md_build_attribute(ctx, title, title_size, 0, &det.title, &title_build));
4036
4037 if(enter)
4038 MD_ENTER_SPAN(type, &det);
4039 else
4040 MD_LEAVE_SPAN(type, &det);
4041
4042 abort:
4043 md_free_attribute(ctx, &href_build);
4044 md_free_attribute(ctx, &title_build);
4045 return ret;
4046 }
4047
4048 static int
md_enter_leave_span_wikilink(MD_CTX * ctx,int enter,const CHAR * target,SZ target_size)4049 md_enter_leave_span_wikilink(MD_CTX* ctx, int enter, const CHAR* target, SZ target_size)
4050 {
4051 MD_ATTRIBUTE_BUILD target_build = { 0 };
4052 MD_SPAN_WIKILINK_DETAIL det;
4053 int ret = 0;
4054
4055 memset(&det, 0, sizeof(MD_SPAN_WIKILINK_DETAIL));
4056 MD_CHECK(md_build_attribute(ctx, target, target_size, 0, &det.target, &target_build));
4057
4058 if (enter)
4059 MD_ENTER_SPAN(MD_SPAN_WIKILINK, &det);
4060 else
4061 MD_LEAVE_SPAN(MD_SPAN_WIKILINK, &det);
4062
4063 abort:
4064 md_free_attribute(ctx, &target_build);
4065 return ret;
4066 }
4067
4068
/* Render the output, accordingly to the analyzed ctx->marks.
 *
 * The function walks the text of the given lines and the resolved marks of
 * ctx->marks[] in parallel: the text between marks is emitted via MD_TEXT()
 * with the current text type, and every resolved mark is translated into
 * the corresponding span enter/leave notification or special text.
 *
 * lines/n_lines describe the lines of the block being rendered; the marks
 * are expected to have been prepared for the very same lines beforehand.
 *
 * Returns the value of ret when the walk ends. (The MD_TEXT(),
 * MD_ENTER_SPAN(), MD_LEAVE_SPAN(), MD_CHECK() and MD_TEMP_BUFFER() macros
 * presumably set ret and jump to the abort label on failure.) */
static int
md_process_inlines(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
{
    MD_TEXTTYPE text_type;
    const MD_LINE* line = lines;
    MD_MARK* prev_mark = NULL;
    MD_MARK* mark;
    OFF off = lines[0].beg;
    OFF end = lines[n_lines-1].end;
    int enforce_hardbreak = 0;
    int ret = 0;

    /* Find first resolved mark. Note there is always at least one resolved
     * mark, the dummy last one after the end of the latest line we actually
     * never really reach. This saves us of a lot of special checks and cases
     * in this function. */
    mark = ctx->marks;
    while(!(mark->flags & MD_MARK_RESOLVED))
        mark++;

    text_type = MD_TEXT_NORMAL;

    while(1) {
        /* Process the text up to the next mark or end-of-line. */
        OFF tmp = (line->end < mark->beg ? line->end : mark->beg);
        if(tmp > off) {
            MD_TEXT(text_type, STR(off), tmp - off);
            off = tmp;
        }

        /* If reached the mark, process it and move to next one. */
        if(off >= mark->beg) {
            switch(mark->ch) {
                case '\\':      /* Backslash escape. */
                    /* A backslash right before a new line requests a hard
                     * break; otherwise the escaped character is emitted. */
                    if(ISNEWLINE(mark->beg+1))
                        enforce_hardbreak = 1;
                    else
                        MD_TEXT(text_type, STR(mark->beg+1), 1);
                    break;

                case ' ':       /* Non-trivial space. */
                    MD_TEXT(text_type, _T(" "), 1);
                    break;

                case '`':       /* Code span. */
                    if(mark->flags & MD_MARK_OPENER) {
                        MD_ENTER_SPAN(MD_SPAN_CODE, NULL);
                        text_type = MD_TEXT_CODE;
                    } else {
                        MD_LEAVE_SPAN(MD_SPAN_CODE, NULL);
                        text_type = MD_TEXT_NORMAL;
                    }
                    break;

                case '_':       /* Underline (or emphasis if we fall through). */
                    if(ctx->parser.flags & MD_FLAG_UNDERLINE) {
                        /* One MD_SPAN_U is entered/left per character of
                         * the mark. */
                        if(mark->flags & MD_MARK_OPENER) {
                            while(off < mark->end) {
                                MD_ENTER_SPAN(MD_SPAN_U, NULL);
                                off++;
                            }
                        } else {
                            while(off < mark->end) {
                                MD_LEAVE_SPAN(MD_SPAN_U, NULL);
                                off++;
                            }
                        }
                        break;
                    }
                    /* Fall through. */

                case '*':       /* Emphasis, strong emphasis. */
                    /* An odd mark length yields one MD_SPAN_EM; every
                     * remaining pair of characters yields MD_SPAN_STRONG.
                     * The closer leaves the spans in the reverse order of
                     * how the opener entered them. */
                    if(mark->flags & MD_MARK_OPENER) {
                        if((mark->end - off) % 2) {
                            MD_ENTER_SPAN(MD_SPAN_EM, NULL);
                            off++;
                        }
                        while(off + 1 < mark->end) {
                            MD_ENTER_SPAN(MD_SPAN_STRONG, NULL);
                            off += 2;
                        }
                    } else {
                        while(off + 1 < mark->end) {
                            MD_LEAVE_SPAN(MD_SPAN_STRONG, NULL);
                            off += 2;
                        }
                        if((mark->end - off) % 2) {
                            MD_LEAVE_SPAN(MD_SPAN_EM, NULL);
                            off++;
                        }
                    }
                    break;

                case '~':       /* Strikethrough. */
                    if(mark->flags & MD_MARK_OPENER)
                        MD_ENTER_SPAN(MD_SPAN_DEL, NULL);
                    else
                        MD_LEAVE_SPAN(MD_SPAN_DEL, NULL);
                    break;

                case '$':       /* LaTeX math. */
                    /* An odd mark length means an inline equation, an even
                     * one a display equation. */
                    if(mark->flags & MD_MARK_OPENER) {
                        MD_ENTER_SPAN((mark->end - off) % 2 ? MD_SPAN_LATEXMATH : MD_SPAN_LATEXMATH_DISPLAY, NULL);
                        text_type = MD_TEXT_LATEXMATH;
                    } else {
                        MD_LEAVE_SPAN((mark->end - off) % 2 ? MD_SPAN_LATEXMATH : MD_SPAN_LATEXMATH_DISPLAY, NULL);
                        text_type = MD_TEXT_NORMAL;
                    }
                    break;

                case '[':       /* Link, wiki link, image. */
                case '!':
                case ']':
                {
                    const MD_MARK* opener = (mark->ch != ']' ? mark : &ctx->marks[mark->prev]);
                    const MD_MARK* closer = &ctx->marks[opener->next];
                    const MD_MARK* dest_mark;
                    const MD_MARK* title_mark;

                    /* A '[' ... ']' pair whose both marks are at least two
                     * characters long is treated as a wiki link. */
                    if ((opener->ch == '[' && closer->ch == ']') &&
                        opener->end - opener->beg >= 2 &&
                        closer->end - closer->beg >= 2)
                    {
                        /* An opener longer than two characters carries the
                         * target itself (behind its first two characters);
                         * otherwise the target is the text between the
                         * opener and the closer. */
                        int has_label = (opener->end - opener->beg > 2);
                        SZ target_sz;

                        if(has_label)
                            target_sz = opener->end - (opener->beg+2);
                        else
                            target_sz = closer->beg - opener->end;

                        MD_CHECK(md_enter_leave_span_wikilink(ctx, (mark->ch != ']'),
                                 has_label ? STR(opener->beg+2) : STR(opener->end),
                                 target_sz));

                        break;
                    }

                    /* Ordinary link or image: the two dummy marks right
                     * behind the opener describe the destination and the
                     * title. */
                    dest_mark = opener+1;
                    MD_ASSERT(dest_mark->ch == 'D');
                    title_mark = opener+2;
                    MD_ASSERT(title_mark->ch == 'D');

                    MD_CHECK(md_enter_leave_span_a(ctx, (mark->ch != ']'),
                                (opener->ch == '!' ? MD_SPAN_IMG : MD_SPAN_A),
                                STR(dest_mark->beg), dest_mark->end - dest_mark->beg, FALSE,
                                md_mark_get_ptr(ctx, title_mark - ctx->marks), title_mark->prev));

                    /* link/image closer may span multiple lines. */
                    if(mark->ch == ']') {
                        while(mark->end > line->end)
                            line++;
                    }

                    break;
                }

                case '<':
                case '>':       /* Autolink or raw HTML. */
                    if(!(mark->flags & MD_MARK_AUTOLINK)) {
                        /* Raw HTML. */
                        if(mark->flags & MD_MARK_OPENER)
                            text_type = MD_TEXT_HTML;
                        else
                            text_type = MD_TEXT_NORMAL;
                        break;
                    }
                    /* Pass through, if auto-link. */

                case '@':       /* Permissive e-mail autolink. */
                case ':':       /* Permissive URL autolink. */
                case '.':       /* Permissive WWW autolink. */
                {
                    MD_MARK* opener = ((mark->flags & MD_MARK_OPENER) ? mark : &ctx->marks[mark->prev]);
                    MD_MARK* closer = &ctx->marks[opener->next];
                    const CHAR* dest = STR(opener->end);
                    SZ dest_size = closer->beg - opener->end;

                    /* For permissive auto-links we do not know closer mark
                     * position at the time of md_collect_marks(), therefore
                     * it can be out-of-order in ctx->marks[].
                     *
                     * With this flag, we make sure that we output the closer
                     * only if we processed the opener. */
                    if(mark->flags & MD_MARK_OPENER)
                        closer->flags |= MD_MARK_VALIDPERMISSIVEAUTOLINK;

                    /* E-mail and WWW autolinks get a 7 character long
                     * scheme prefix prepended to the destination, which is
                     * assembled in ctx->buffer. */
                    if(opener->ch == '@' || opener->ch == '.') {
                        dest_size += 7;
                        MD_TEMP_BUFFER(dest_size * sizeof(CHAR));
                        memcpy(ctx->buffer,
                                (opener->ch == '@' ? _T("mailto:") : _T("http://")),
                                7 * sizeof(CHAR));
                        memcpy(ctx->buffer + 7, dest, (dest_size-7) * sizeof(CHAR));
                        dest = ctx->buffer;
                    }

                    if(closer->flags & MD_MARK_VALIDPERMISSIVEAUTOLINK)
                        MD_CHECK(md_enter_leave_span_a(ctx, (mark->flags & MD_MARK_OPENER),
                                    MD_SPAN_A, dest, dest_size, TRUE, NULL, 0));
                    break;
                }

                case '&':       /* Entity. */
                    MD_TEXT(MD_TEXT_ENTITY, STR(mark->beg), mark->end - mark->beg);
                    break;

                case '\0':      /* Null character in the input. */
                    MD_TEXT(MD_TEXT_NULLCHAR, _T(""), 1);
                    break;

                case 127:       /* Stops the walk (presumably the dummy last mark). */
                    goto abort;
            }

            off = mark->end;

            /* Move to next resolved mark. Resolved marks which begin before
             * the current offset are skipped as well. */
            prev_mark = mark;
            mark++;
            while(!(mark->flags & MD_MARK_RESOLVED) || mark->beg < off)
                mark++;
        }

        /* If reached end of line, move to next one. */
        if(off >= line->end) {
            /* If it is the last line, we are done. */
            if(off >= end)
                break;

            if(text_type == MD_TEXT_CODE || text_type == MD_TEXT_LATEXMATH) {
                OFF tmp;

                MD_ASSERT(prev_mark != NULL);
                MD_ASSERT(ISANYOF2_(prev_mark->ch, '`', '$') && (prev_mark->flags & MD_MARK_OPENER));
                MD_ASSERT(ISANYOF2_(mark->ch, '`', '$') && (mark->flags & MD_MARK_CLOSER));

                /* Inside a code span, trailing line whitespace has to be
                 * outputted. */
                tmp = off;
                while(off < ctx->size && ISBLANK(off))
                    off++;
                if(off > tmp)
                    MD_TEXT(text_type, STR(tmp), off-tmp);

                /* and new lines are transformed into single spaces. */
                if(prev_mark->end < off && off < mark->beg)
                    MD_TEXT(text_type, _T(" "), 1);
            } else if(text_type == MD_TEXT_HTML) {
                /* Inside raw HTML, we output the new line verbatim, including
                 * any trailing spaces. */
                OFF tmp = off;

                while(tmp < end && ISBLANK(tmp))
                    tmp++;
                if(tmp > off)
                    MD_TEXT(MD_TEXT_HTML, STR(off), tmp - off);
                MD_TEXT(MD_TEXT_HTML, _T("\n"), 1);
            } else {
                /* Output soft or hard line break. A hard break is emitted
                 * when enforced by a backslash or when the line end is
                 * followed by two spaces. */
                MD_TEXTTYPE break_type = MD_TEXT_SOFTBR;

                if(text_type == MD_TEXT_NORMAL) {
                    if(enforce_hardbreak)
                        break_type = MD_TEXT_BR;
                    else if((CH(line->end) == _T(' ') && CH(line->end+1) == _T(' ')))
                        break_type = MD_TEXT_BR;
                }

                MD_TEXT(break_type, _T("\n"), 1);
            }

            /* Move to the next line. */
            line++;
            off = line->beg;

            enforce_hardbreak = 0;
        }
    }

abort:
    return ret;
}
4353
4354
4355 /***************************
4356 *** Processing Tables ***
4357 ***************************/
4358
4359 static void
md_analyze_table_alignment(MD_CTX * ctx,OFF beg,OFF end,MD_ALIGN * align,int n_align)4360 md_analyze_table_alignment(MD_CTX* ctx, OFF beg, OFF end, MD_ALIGN* align, int n_align)
4361 {
4362 static const MD_ALIGN align_map[] = { MD_ALIGN_DEFAULT, MD_ALIGN_LEFT, MD_ALIGN_RIGHT, MD_ALIGN_CENTER };
4363 OFF off = beg;
4364
4365 while(n_align > 0) {
4366 int index = 0; /* index into align_map[] */
4367
4368 while(CH(off) != _T('-'))
4369 off++;
4370 if(off > beg && CH(off-1) == _T(':'))
4371 index |= 1;
4372 while(off < end && CH(off) == _T('-'))
4373 off++;
4374 if(off < end && CH(off) == _T(':'))
4375 index |= 2;
4376
4377 *align = align_map[index];
4378 align++;
4379 n_align--;
4380 }
4381
4382 }
4383
4384 /* Forward declaration. */
4385 static int md_process_normal_block_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines);
4386
4387 static int
md_process_table_cell(MD_CTX * ctx,MD_BLOCKTYPE cell_type,MD_ALIGN align,OFF beg,OFF end)4388 md_process_table_cell(MD_CTX* ctx, MD_BLOCKTYPE cell_type, MD_ALIGN align, OFF beg, OFF end)
4389 {
4390 MD_LINE line;
4391 MD_BLOCK_TD_DETAIL det;
4392 int ret = 0;
4393
4394 while(beg < end && ISWHITESPACE(beg))
4395 beg++;
4396 while(end > beg && ISWHITESPACE(end-1))
4397 end--;
4398
4399 det.align = align;
4400 line.beg = beg;
4401 line.end = end;
4402
4403 MD_ENTER_BLOCK(cell_type, &det);
4404 MD_CHECK(md_process_normal_block_contents(ctx, &line, 1));
4405 MD_LEAVE_BLOCK(cell_type, &det);
4406
4407 abort:
4408 return ret;
4409 }
4410
4411 static int
md_process_table_row(MD_CTX * ctx,MD_BLOCKTYPE cell_type,OFF beg,OFF end,const MD_ALIGN * align,int col_count)4412 md_process_table_row(MD_CTX* ctx, MD_BLOCKTYPE cell_type, OFF beg, OFF end,
4413 const MD_ALIGN* align, int col_count)
4414 {
4415 MD_LINE line;
4416 OFF* pipe_offs = NULL;
4417 int i, j, k, n;
4418 int ret = 0;
4419
4420 line.beg = beg;
4421 line.end = end;
4422
4423 /* Break the line into table cells by identifying pipe characters who
4424 * form the cell boundary. */
4425 MD_CHECK(md_analyze_inlines(ctx, &line, 1, TRUE));
4426
4427 /* We have to remember the cell boundaries in local buffer because
4428 * ctx->marks[] shall be reused during cell contents processing. */
4429 n = ctx->n_table_cell_boundaries + 2;
4430 pipe_offs = (OFF*) malloc(n * sizeof(OFF));
4431 if(pipe_offs == NULL) {
4432 MD_LOG("malloc() failed.");
4433 ret = -1;
4434 goto abort;
4435 }
4436 j = 0;
4437 pipe_offs[j++] = beg;
4438 for(i = TABLECELLBOUNDARIES.head; i >= 0; i = ctx->marks[i].next) {
4439 MD_MARK* mark = &ctx->marks[i];
4440 pipe_offs[j++] = mark->end;
4441 }
4442 pipe_offs[j++] = end+1;
4443
4444 /* Process cells. */
4445 MD_ENTER_BLOCK(MD_BLOCK_TR, NULL);
4446 k = 0;
4447 for(i = 0; i < j-1 && k < col_count; i++) {
4448 if(pipe_offs[i] < pipe_offs[i+1]-1)
4449 MD_CHECK(md_process_table_cell(ctx, cell_type, align[k++], pipe_offs[i], pipe_offs[i+1]-1));
4450 }
4451 /* Make sure we call enough table cells even if the current table contains
4452 * too few of them. */
4453 while(k < col_count)
4454 MD_CHECK(md_process_table_cell(ctx, cell_type, align[k++], 0, 0));
4455 MD_LEAVE_BLOCK(MD_BLOCK_TR, NULL);
4456
4457 abort:
4458 free(pipe_offs);
4459
4460 /* Free any temporary memory blocks stored within some dummy marks. */
4461 for(i = PTR_CHAIN.head; i >= 0; i = ctx->marks[i].next)
4462 free(md_mark_get_ptr(ctx, i));
4463 PTR_CHAIN.head = -1;
4464 PTR_CHAIN.tail = -1;
4465
4466 return ret;
4467 }
4468
4469 static int
md_process_table_block_contents(MD_CTX * ctx,int col_count,const MD_LINE * lines,int n_lines)4470 md_process_table_block_contents(MD_CTX* ctx, int col_count, const MD_LINE* lines, int n_lines)
4471 {
4472 MD_ALIGN* align;
4473 int i;
4474 int ret = 0;
4475
4476 /* At least two lines have to be present: The column headers and the line
4477 * with the underlines. */
4478 MD_ASSERT(n_lines >= 2);
4479
4480 align = malloc(col_count * sizeof(MD_ALIGN));
4481 if(align == NULL) {
4482 MD_LOG("malloc() failed.");
4483 ret = -1;
4484 goto abort;
4485 }
4486
4487 md_analyze_table_alignment(ctx, lines[1].beg, lines[1].end, align, col_count);
4488
4489 MD_ENTER_BLOCK(MD_BLOCK_THEAD, NULL);
4490 MD_CHECK(md_process_table_row(ctx, MD_BLOCK_TH,
4491 lines[0].beg, lines[0].end, align, col_count));
4492 MD_LEAVE_BLOCK(MD_BLOCK_THEAD, NULL);
4493
4494 MD_ENTER_BLOCK(MD_BLOCK_TBODY, NULL);
4495 for(i = 2; i < n_lines; i++) {
4496 MD_CHECK(md_process_table_row(ctx, MD_BLOCK_TD,
4497 lines[i].beg, lines[i].end, align, col_count));
4498 }
4499 MD_LEAVE_BLOCK(MD_BLOCK_TBODY, NULL);
4500
4501 abort:
4502 free(align);
4503 return ret;
4504 }
4505
4506
4507 /**************************
4508 *** Processing Block ***
4509 **************************/
4510
/* Bits stored in MD_BLOCK_tag::flags. */

/* The record opens a container block. */
#define MD_BLOCK_CONTAINER_OPENER   0x01
/* The record closes a container block. */
#define MD_BLOCK_CONTAINER_CLOSER   0x02
/* Mask telling a container record from a leaf block. */
#define MD_BLOCK_CONTAINER          (MD_BLOCK_CONTAINER_OPENER | MD_BLOCK_CONTAINER_CLOSER)
/* The list is loose; when not set, the list is reported as tight. */
#define MD_BLOCK_LOOSE_LIST         0x04
/* NOTE(review): presumably marks a header created from a Setext underline;
 * its use is not visible in this part of the file. */
#define MD_BLOCK_SETEXT_HEADER      0x08

/* Header of one record in ctx->block_bytes.
 *
 * The records are stored back to back. A leaf block record is immediately
 * followed by its n_lines line records (MD_VERBATIMLINE for MD_BLOCK_CODE
 * and MD_BLOCK_HTML, MD_LINE otherwise); a container record (see
 * MD_BLOCK_CONTAINER) is followed directly by the next record. */
struct MD_BLOCK_tag {
    MD_BLOCKTYPE type  :  8;
    unsigned flags     :  8;

    /* MD_BLOCK_H:      Header level (1 - 6)
     * MD_BLOCK_CODE:   Non-zero if fenced, zero if indented.
     * MD_BLOCK_LI:     Task mark character (0 if not task list item, 'x', 'X' or ' ').
     * MD_BLOCK_TABLE:  Column count (as determined by the table underline).
     * MD_BLOCK_UL:     List mark character.
     * MD_BLOCK_OL:     List mark delimiter character.
     */
    unsigned data      : 16;

    /* Leaf blocks:     Count of lines (MD_LINE or MD_VERBATIMLINE) on the block.
     * MD_BLOCK_LI:     Task mark offset in the input doc.
     * MD_BLOCK_OL:     Start item number.
     */
    unsigned n_lines;
};
4534
/* One entry of ctx->containers[].
 *
 * While the blocks are being rendered, the vector is reused as a stack of
 * the open lists and block quotes, and only is_loose is consulted there.
 * NOTE(review): the remaining members are presumably used while lines are
 * grouped into blocks; that code is not visible in this part of the file. */
struct MD_CONTAINER_tag {
    CHAR ch;
    /* Non-zero if paragraphs directly inside the container are to be
     * reported as MD_BLOCK_P (loose list or block quote). */
    unsigned is_loose    : 8;
    unsigned is_task     : 8;
    unsigned start;
    unsigned mark_indent;
    unsigned contents_indent;
    OFF block_byte_off;
    OFF task_mark_off;
};
4545
4546
4547 static int
md_process_normal_block_contents(MD_CTX * ctx,const MD_LINE * lines,int n_lines)4548 md_process_normal_block_contents(MD_CTX* ctx, const MD_LINE* lines, int n_lines)
4549 {
4550 int i;
4551 int ret;
4552
4553 MD_CHECK(md_analyze_inlines(ctx, lines, n_lines, FALSE));
4554 MD_CHECK(md_process_inlines(ctx, lines, n_lines));
4555
4556 abort:
4557 /* Free any temporary memory blocks stored within some dummy marks. */
4558 for(i = PTR_CHAIN.head; i >= 0; i = ctx->marks[i].next)
4559 free(md_mark_get_ptr(ctx, i));
4560 PTR_CHAIN.head = -1;
4561 PTR_CHAIN.tail = -1;
4562
4563 return ret;
4564 }
4565
4566 static int
md_process_verbatim_block_contents(MD_CTX * ctx,MD_TEXTTYPE text_type,const MD_VERBATIMLINE * lines,int n_lines)4567 md_process_verbatim_block_contents(MD_CTX* ctx, MD_TEXTTYPE text_type, const MD_VERBATIMLINE* lines, int n_lines)
4568 {
4569 static const CHAR indent_chunk_str[] = _T(" ");
4570 static const SZ indent_chunk_size = SIZEOF_ARRAY(indent_chunk_str) - 1;
4571
4572 int i;
4573 int ret = 0;
4574
4575 for(i = 0; i < n_lines; i++) {
4576 const MD_VERBATIMLINE* line = &lines[i];
4577 int indent = line->indent;
4578
4579 MD_ASSERT(indent >= 0);
4580
4581 /* Output code indentation. */
4582 while(indent > (int) indent_chunk_size) {
4583 MD_TEXT(text_type, indent_chunk_str, indent_chunk_size);
4584 indent -= indent_chunk_size;
4585 }
4586 if(indent > 0)
4587 MD_TEXT(text_type, indent_chunk_str, indent);
4588
4589 /* Output the code line itself. */
4590 MD_TEXT_INSECURE(text_type, STR(line->beg), line->end - line->beg);
4591
4592 /* Enforce end-of-line. */
4593 MD_TEXT(text_type, _T("\n"), 1);
4594 }
4595
4596 abort:
4597 return ret;
4598 }
4599
4600 static int
md_process_code_block_contents(MD_CTX * ctx,int is_fenced,const MD_VERBATIMLINE * lines,int n_lines)4601 md_process_code_block_contents(MD_CTX* ctx, int is_fenced, const MD_VERBATIMLINE* lines, int n_lines)
4602 {
4603 if(is_fenced) {
4604 /* Skip the first line in case of fenced code: It is the fence.
4605 * (Only the starting fence is present due to logic in md_analyze_line().) */
4606 lines++;
4607 n_lines--;
4608 } else {
4609 /* Ignore blank lines at start/end of indented code block. */
4610 while(n_lines > 0 && lines[0].beg == lines[0].end) {
4611 lines++;
4612 n_lines--;
4613 }
4614 while(n_lines > 0 && lines[n_lines-1].beg == lines[n_lines-1].end) {
4615 n_lines--;
4616 }
4617 }
4618
4619 if(n_lines == 0)
4620 return 0;
4621
4622 return md_process_verbatim_block_contents(ctx, MD_TEXT_CODE, lines, n_lines);
4623 }
4624
4625 static int
md_setup_fenced_code_detail(MD_CTX * ctx,const MD_BLOCK * block,MD_BLOCK_CODE_DETAIL * det,MD_ATTRIBUTE_BUILD * info_build,MD_ATTRIBUTE_BUILD * lang_build)4626 md_setup_fenced_code_detail(MD_CTX* ctx, const MD_BLOCK* block, MD_BLOCK_CODE_DETAIL* det,
4627 MD_ATTRIBUTE_BUILD* info_build, MD_ATTRIBUTE_BUILD* lang_build)
4628 {
4629 const MD_VERBATIMLINE* fence_line = (const MD_VERBATIMLINE*)(block + 1);
4630 OFF beg = fence_line->beg;
4631 OFF end = fence_line->end;
4632 OFF lang_end;
4633 CHAR fence_ch = CH(fence_line->beg);
4634 int ret = 0;
4635
4636 /* Skip the fence itself. */
4637 while(beg < ctx->size && CH(beg) == fence_ch)
4638 beg++;
4639 /* Trim initial spaces. */
4640 while(beg < ctx->size && CH(beg) == _T(' '))
4641 beg++;
4642
4643 /* Trim trailing spaces. */
4644 while(end > beg && CH(end-1) == _T(' '))
4645 end--;
4646
4647 /* Build info string attribute. */
4648 MD_CHECK(md_build_attribute(ctx, STR(beg), end - beg, 0, &det->info, info_build));
4649
4650 /* Build info string attribute. */
4651 lang_end = beg;
4652 while(lang_end < end && !ISWHITESPACE(lang_end))
4653 lang_end++;
4654 MD_CHECK(md_build_attribute(ctx, STR(beg), lang_end - beg, 0, &det->lang, lang_build));
4655
4656 det->fence_char = fence_ch;
4657
4658 abort:
4659 return ret;
4660 }
4661
4662 static int
md_process_leaf_block(MD_CTX * ctx,const MD_BLOCK * block)4663 md_process_leaf_block(MD_CTX* ctx, const MD_BLOCK* block)
4664 {
4665 union {
4666 MD_BLOCK_H_DETAIL header;
4667 MD_BLOCK_CODE_DETAIL code;
4668 } det;
4669 MD_ATTRIBUTE_BUILD info_build;
4670 MD_ATTRIBUTE_BUILD lang_build;
4671 int is_in_tight_list;
4672 int clean_fence_code_detail = FALSE;
4673 int ret = 0;
4674
4675 memset(&det, 0, sizeof(det));
4676
4677 if(ctx->n_containers == 0)
4678 is_in_tight_list = FALSE;
4679 else
4680 is_in_tight_list = !ctx->containers[ctx->n_containers-1].is_loose;
4681
4682 switch(block->type) {
4683 case MD_BLOCK_H:
4684 det.header.level = block->data;
4685 break;
4686
4687 case MD_BLOCK_CODE:
4688 /* For fenced code block, we may need to set the info string. */
4689 if(block->data != 0) {
4690 memset(&det.code, 0, sizeof(MD_BLOCK_CODE_DETAIL));
4691 clean_fence_code_detail = TRUE;
4692 MD_CHECK(md_setup_fenced_code_detail(ctx, block, &det.code, &info_build, &lang_build));
4693 }
4694 break;
4695
4696 default:
4697 /* Noop. */
4698 break;
4699 }
4700
4701 if(!is_in_tight_list || block->type != MD_BLOCK_P)
4702 MD_ENTER_BLOCK(block->type, (void*) &det);
4703
4704 /* Process the block contents accordingly to is type. */
4705 switch(block->type) {
4706 case MD_BLOCK_HR:
4707 /* noop */
4708 break;
4709
4710 case MD_BLOCK_CODE:
4711 MD_CHECK(md_process_code_block_contents(ctx, (block->data != 0),
4712 (const MD_VERBATIMLINE*)(block + 1), block->n_lines));
4713 break;
4714
4715 case MD_BLOCK_HTML:
4716 MD_CHECK(md_process_verbatim_block_contents(ctx, MD_TEXT_HTML,
4717 (const MD_VERBATIMLINE*)(block + 1), block->n_lines));
4718 break;
4719
4720 case MD_BLOCK_TABLE:
4721 MD_CHECK(md_process_table_block_contents(ctx, block->data,
4722 (const MD_LINE*)(block + 1), block->n_lines));
4723 break;
4724
4725 default:
4726 MD_CHECK(md_process_normal_block_contents(ctx,
4727 (const MD_LINE*)(block + 1), block->n_lines));
4728 break;
4729 }
4730
4731 if(!is_in_tight_list || block->type != MD_BLOCK_P)
4732 MD_LEAVE_BLOCK(block->type, (void*) &det);
4733
4734 abort:
4735 if(clean_fence_code_detail) {
4736 md_free_attribute(ctx, &info_build);
4737 md_free_attribute(ctx, &lang_build);
4738 }
4739 return ret;
4740 }
4741
4742 static int
md_process_all_blocks(MD_CTX * ctx)4743 md_process_all_blocks(MD_CTX* ctx)
4744 {
4745 int byte_off = 0;
4746 int ret = 0;
4747
4748 /* ctx->containers now is not needed for detection of lists and list items
4749 * so we reuse it for tracking what lists are loose or tight. We rely
4750 * on the fact the vector is large enough to hold the deepest nesting
4751 * level of lists. */
4752 ctx->n_containers = 0;
4753
4754 while(byte_off < ctx->n_block_bytes) {
4755 MD_BLOCK* block = (MD_BLOCK*)((char*)ctx->block_bytes + byte_off);
4756 union {
4757 MD_BLOCK_UL_DETAIL ul;
4758 MD_BLOCK_OL_DETAIL ol;
4759 MD_BLOCK_LI_DETAIL li;
4760 } det;
4761
4762 switch(block->type) {
4763 case MD_BLOCK_UL:
4764 det.ul.is_tight = (block->flags & MD_BLOCK_LOOSE_LIST) ? FALSE : TRUE;
4765 det.ul.mark = (CHAR) block->data;
4766 break;
4767
4768 case MD_BLOCK_OL:
4769 det.ol.start = block->n_lines;
4770 det.ol.is_tight = (block->flags & MD_BLOCK_LOOSE_LIST) ? FALSE : TRUE;
4771 det.ol.mark_delimiter = (CHAR) block->data;
4772 break;
4773
4774 case MD_BLOCK_LI:
4775 det.li.is_task = (block->data != 0);
4776 det.li.task_mark = (CHAR) block->data;
4777 det.li.task_mark_offset = (OFF) block->n_lines;
4778 break;
4779
4780 default:
4781 /* noop */
4782 break;
4783 }
4784
4785 if(block->flags & MD_BLOCK_CONTAINER) {
4786 if(block->flags & MD_BLOCK_CONTAINER_CLOSER) {
4787 MD_LEAVE_BLOCK(block->type, &det);
4788
4789 if(block->type == MD_BLOCK_UL || block->type == MD_BLOCK_OL || block->type == MD_BLOCK_QUOTE)
4790 ctx->n_containers--;
4791 }
4792
4793 if(block->flags & MD_BLOCK_CONTAINER_OPENER) {
4794 MD_ENTER_BLOCK(block->type, &det);
4795
4796 if(block->type == MD_BLOCK_UL || block->type == MD_BLOCK_OL) {
4797 ctx->containers[ctx->n_containers].is_loose = (block->flags & MD_BLOCK_LOOSE_LIST);
4798 ctx->n_containers++;
4799 } else if(block->type == MD_BLOCK_QUOTE) {
4800 /* This causes that any text in a block quote, even if
4801 * nested inside a tight list item, is wrapped with
4802 * <p>...</p>. */
4803 ctx->containers[ctx->n_containers].is_loose = TRUE;
4804 ctx->n_containers++;
4805 }
4806 }
4807 } else {
4808 MD_CHECK(md_process_leaf_block(ctx, block));
4809
4810 if(block->type == MD_BLOCK_CODE || block->type == MD_BLOCK_HTML)
4811 byte_off += block->n_lines * sizeof(MD_VERBATIMLINE);
4812 else
4813 byte_off += block->n_lines * sizeof(MD_LINE);
4814 }
4815
4816 byte_off += sizeof(MD_BLOCK);
4817 }
4818
4819 ctx->n_block_bytes = 0;
4820
4821 abort:
4822 return ret;
4823 }
4824
4825
4826 /************************************
4827 *** Grouping Lines into Blocks ***
4828 ************************************/
4829
4830 static void*
md_push_block_bytes(MD_CTX * ctx,int n_bytes)4831 md_push_block_bytes(MD_CTX* ctx, int n_bytes)
4832 {
4833 void* ptr;
4834
4835 if(ctx->n_block_bytes + n_bytes > ctx->alloc_block_bytes) {
4836 void* new_block_bytes;
4837
4838 ctx->alloc_block_bytes = (ctx->alloc_block_bytes > 0
4839 ? ctx->alloc_block_bytes + ctx->alloc_block_bytes / 2
4840 : 512);
4841 new_block_bytes = realloc(ctx->block_bytes, ctx->alloc_block_bytes);
4842 if(new_block_bytes == NULL) {
4843 MD_LOG("realloc() failed.");
4844 return NULL;
4845 }
4846
4847 /* Fix the ->current_block after the reallocation. */
4848 if(ctx->current_block != NULL) {
4849 OFF off_current_block = (char*) ctx->current_block - (char*) ctx->block_bytes;
4850 ctx->current_block = (MD_BLOCK*) ((char*) new_block_bytes + off_current_block);
4851 }
4852
4853 ctx->block_bytes = new_block_bytes;
4854 }
4855
4856 ptr = (char*)ctx->block_bytes + ctx->n_block_bytes;
4857 ctx->n_block_bytes += n_bytes;
4858 return ptr;
4859 }
4860
4861 static int
md_start_new_block(MD_CTX * ctx,const MD_LINE_ANALYSIS * line)4862 md_start_new_block(MD_CTX* ctx, const MD_LINE_ANALYSIS* line)
4863 {
4864 MD_BLOCK* block;
4865
4866 MD_ASSERT(ctx->current_block == NULL);
4867
4868 block = (MD_BLOCK*) md_push_block_bytes(ctx, sizeof(MD_BLOCK));
4869 if(block == NULL)
4870 return -1;
4871
4872 switch(line->type) {
4873 case MD_LINE_HR:
4874 block->type = MD_BLOCK_HR;
4875 break;
4876
4877 case MD_LINE_ATXHEADER:
4878 case MD_LINE_SETEXTHEADER:
4879 block->type = MD_BLOCK_H;
4880 break;
4881
4882 case MD_LINE_FENCEDCODE:
4883 case MD_LINE_INDENTEDCODE:
4884 block->type = MD_BLOCK_CODE;
4885 break;
4886
4887 case MD_LINE_TEXT:
4888 block->type = MD_BLOCK_P;
4889 break;
4890
4891 case MD_LINE_HTML:
4892 block->type = MD_BLOCK_HTML;
4893 break;
4894
4895 case MD_LINE_BLANK:
4896 case MD_LINE_SETEXTUNDERLINE:
4897 case MD_LINE_TABLEUNDERLINE:
4898 default:
4899 MD_UNREACHABLE();
4900 break;
4901 }
4902
4903 block->flags = 0;
4904 block->data = line->data;
4905 block->n_lines = 0;
4906
4907 ctx->current_block = block;
4908 return 0;
4909 }
4910
4911 /* Eat from start of current (textual) block any reference definitions and
4912 * remember them so we can resolve any links referring to them.
4913 *
4914 * (Reference definitions can only be at start of it as they cannot break
4915 * a paragraph.)
4916 */
4917 static int
md_consume_link_reference_definitions(MD_CTX * ctx)4918 md_consume_link_reference_definitions(MD_CTX* ctx)
4919 {
4920 MD_LINE* lines = (MD_LINE*) (ctx->current_block + 1);
4921 int n_lines = ctx->current_block->n_lines;
4922 int n = 0;
4923
4924 /* Compute how many lines at the start of the block form one or more
4925 * reference definitions. */
4926 while(n < n_lines) {
4927 int n_link_ref_lines;
4928
4929 n_link_ref_lines = md_is_link_reference_definition(ctx,
4930 lines + n, n_lines - n);
4931 /* Not a reference definition? */
4932 if(n_link_ref_lines == 0)
4933 break;
4934
4935 /* We fail if it is the ref. def. but it could not be stored due
4936 * a memory allocation error. */
4937 if(n_link_ref_lines < 0)
4938 return -1;
4939
4940 n += n_link_ref_lines;
4941 }
4942
4943 /* If there was at least one reference definition, we need to remove
4944 * its lines from the block, or perhaps even the whole block. */
4945 if(n > 0) {
4946 if(n == n_lines) {
4947 /* Remove complete block. */
4948 ctx->n_block_bytes -= n * sizeof(MD_LINE);
4949 ctx->n_block_bytes -= sizeof(MD_BLOCK);
4950 ctx->current_block = NULL;
4951 } else {
4952 /* Remove just some initial lines from the block. */
4953 memmove(lines, lines + n, (n_lines - n) * sizeof(MD_LINE));
4954 ctx->current_block->n_lines -= n;
4955 ctx->n_block_bytes -= n * sizeof(MD_LINE);
4956 }
4957 }
4958
4959 return 0;
4960 }
4961
4962 static int
md_end_current_block(MD_CTX * ctx)4963 md_end_current_block(MD_CTX* ctx)
4964 {
4965 int ret = 0;
4966
4967 if(ctx->current_block == NULL)
4968 return ret;
4969
4970 /* Check whether there is a reference definition. (We do this here instead
4971 * of in md_analyze_line() because reference definition can take multiple
4972 * lines.) */
4973 if(ctx->current_block->type == MD_BLOCK_P ||
4974 (ctx->current_block->type == MD_BLOCK_H && (ctx->current_block->flags & MD_BLOCK_SETEXT_HEADER)))
4975 {
4976 MD_LINE* lines = (MD_LINE*) (ctx->current_block + 1);
4977 if(CH(lines[0].beg) == _T('[')) {
4978 MD_CHECK(md_consume_link_reference_definitions(ctx));
4979 if(ctx->current_block == NULL)
4980 return ret;
4981 }
4982 }
4983
4984 if(ctx->current_block->type == MD_BLOCK_H && (ctx->current_block->flags & MD_BLOCK_SETEXT_HEADER)) {
4985 int n_lines = ctx->current_block->n_lines;
4986
4987 if(n_lines > 1) {
4988 /* Get rid of the underline. */
4989 ctx->current_block->n_lines--;
4990 ctx->n_block_bytes -= sizeof(MD_LINE);
4991 } else {
4992 /* Only the underline has left after eating the ref. defs.
4993 * Keep the line as beginning of a new ordinary paragraph. */
4994 ctx->current_block->type = MD_BLOCK_P;
4995 return 0;
4996 }
4997 }
4998
4999 /* Mark we are not building any block anymore. */
5000 ctx->current_block = NULL;
5001
5002 abort:
5003 return ret;
5004 }
5005
5006 static int
md_add_line_into_current_block(MD_CTX * ctx,const MD_LINE_ANALYSIS * analysis)5007 md_add_line_into_current_block(MD_CTX* ctx, const MD_LINE_ANALYSIS* analysis)
5008 {
5009 MD_ASSERT(ctx->current_block != NULL);
5010
5011 if(ctx->current_block->type == MD_BLOCK_CODE || ctx->current_block->type == MD_BLOCK_HTML) {
5012 MD_VERBATIMLINE* line;
5013
5014 line = (MD_VERBATIMLINE*) md_push_block_bytes(ctx, sizeof(MD_VERBATIMLINE));
5015 if(line == NULL)
5016 return -1;
5017
5018 line->indent = analysis->indent;
5019 line->beg = analysis->beg;
5020 line->end = analysis->end;
5021 } else {
5022 MD_LINE* line;
5023
5024 line = (MD_LINE*) md_push_block_bytes(ctx, sizeof(MD_LINE));
5025 if(line == NULL)
5026 return -1;
5027
5028 line->beg = analysis->beg;
5029 line->end = analysis->end;
5030 }
5031 ctx->current_block->n_lines++;
5032
5033 return 0;
5034 }
5035
5036 static int
md_push_container_bytes(MD_CTX * ctx,MD_BLOCKTYPE type,unsigned start,unsigned data,unsigned flags)5037 md_push_container_bytes(MD_CTX* ctx, MD_BLOCKTYPE type, unsigned start,
5038 unsigned data, unsigned flags)
5039 {
5040 MD_BLOCK* block;
5041 int ret = 0;
5042
5043 MD_CHECK(md_end_current_block(ctx));
5044
5045 block = (MD_BLOCK*) md_push_block_bytes(ctx, sizeof(MD_BLOCK));
5046 if(block == NULL)
5047 return -1;
5048
5049 block->type = type;
5050 block->flags = flags;
5051 block->data = data;
5052 block->n_lines = start;
5053
5054 abort:
5055 return ret;
5056 }
5057
5058
5059
5060 /***********************
5061 *** Line Analysis ***
5062 ***********************/
5063
5064 static int
md_is_hr_line(MD_CTX * ctx,OFF beg,OFF * p_end,OFF * p_killer)5065 md_is_hr_line(MD_CTX* ctx, OFF beg, OFF* p_end, OFF* p_killer)
5066 {
5067 OFF off = beg + 1;
5068 int n = 1;
5069
5070 while(off < ctx->size && (CH(off) == CH(beg) || CH(off) == _T(' ') || CH(off) == _T('\t'))) {
5071 if(CH(off) == CH(beg))
5072 n++;
5073 off++;
5074 }
5075
5076 if(n < 3) {
5077 *p_killer = off;
5078 return FALSE;
5079 }
5080
5081 /* Nothing else can be present on the line. */
5082 if(off < ctx->size && !ISNEWLINE(off)) {
5083 *p_killer = off;
5084 return FALSE;
5085 }
5086
5087 *p_end = off;
5088 return TRUE;
5089 }
5090
5091 static int
md_is_atxheader_line(MD_CTX * ctx,OFF beg,OFF * p_beg,OFF * p_end,unsigned * p_level)5092 md_is_atxheader_line(MD_CTX* ctx, OFF beg, OFF* p_beg, OFF* p_end, unsigned* p_level)
5093 {
5094 int n;
5095 OFF off = beg + 1;
5096
5097 while(off < ctx->size && CH(off) == _T('#') && off - beg < 7)
5098 off++;
5099 n = off - beg;
5100
5101 if(n > 6)
5102 return FALSE;
5103 *p_level = n;
5104
5105 if(!(ctx->parser.flags & MD_FLAG_PERMISSIVEATXHEADERS) && off < ctx->size &&
5106 CH(off) != _T(' ') && CH(off) != _T('\t') && !ISNEWLINE(off))
5107 return FALSE;
5108
5109 while(off < ctx->size && CH(off) == _T(' '))
5110 off++;
5111 *p_beg = off;
5112 *p_end = off;
5113 return TRUE;
5114 }
5115
5116 static int
md_is_setext_underline(MD_CTX * ctx,OFF beg,OFF * p_end,unsigned * p_level)5117 md_is_setext_underline(MD_CTX* ctx, OFF beg, OFF* p_end, unsigned* p_level)
5118 {
5119 OFF off = beg + 1;
5120
5121 while(off < ctx->size && CH(off) == CH(beg))
5122 off++;
5123
5124 /* Optionally, space(s) can follow. */
5125 while(off < ctx->size && CH(off) == _T(' '))
5126 off++;
5127
5128 /* But nothing more is allowed on the line. */
5129 if(off < ctx->size && !ISNEWLINE(off))
5130 return FALSE;
5131
5132 *p_level = (CH(beg) == _T('=') ? 1 : 2);
5133 *p_end = off;
5134 return TRUE;
5135 }
5136
5137 static int
md_is_table_underline(MD_CTX * ctx,OFF beg,OFF * p_end,unsigned * p_col_count)5138 md_is_table_underline(MD_CTX* ctx, OFF beg, OFF* p_end, unsigned* p_col_count)
5139 {
5140 OFF off = beg;
5141 int found_pipe = FALSE;
5142 unsigned col_count = 0;
5143
5144 if(off < ctx->size && CH(off) == _T('|')) {
5145 found_pipe = TRUE;
5146 off++;
5147 while(off < ctx->size && ISWHITESPACE(off))
5148 off++;
5149 }
5150
5151 while(1) {
5152 OFF cell_beg;
5153 int delimited = FALSE;
5154
5155 /* Cell underline ("-----", ":----", "----:" or ":----:") */
5156 cell_beg = off;
5157 if(off < ctx->size && CH(off) == _T(':'))
5158 off++;
5159 while(off < ctx->size && CH(off) == _T('-'))
5160 off++;
5161 if(off < ctx->size && CH(off) == _T(':'))
5162 off++;
5163 if(off - cell_beg < 3)
5164 return FALSE;
5165
5166 col_count++;
5167
5168 /* Pipe delimiter (optional at the end of line). */
5169 while(off < ctx->size && ISWHITESPACE(off))
5170 off++;
5171 if(off < ctx->size && CH(off) == _T('|')) {
5172 delimited = TRUE;
5173 found_pipe = TRUE;
5174 off++;
5175 while(off < ctx->size && ISWHITESPACE(off))
5176 off++;
5177 }
5178
5179 /* Success, if we reach end of line. */
5180 if(off >= ctx->size || ISNEWLINE(off))
5181 break;
5182
5183 if(!delimited)
5184 return FALSE;
5185 }
5186
5187 if(!found_pipe)
5188 return FALSE;
5189
5190 *p_end = off;
5191 *p_col_count = col_count;
5192 return TRUE;
5193 }
5194
5195 static int
md_is_opening_code_fence(MD_CTX * ctx,OFF beg,OFF * p_end)5196 md_is_opening_code_fence(MD_CTX* ctx, OFF beg, OFF* p_end)
5197 {
5198 OFF off = beg;
5199
5200 while(off < ctx->size && CH(off) == CH(beg))
5201 off++;
5202
5203 /* Fence must have at least three characters. */
5204 if(off - beg < 3)
5205 return FALSE;
5206
5207 ctx->code_fence_length = off - beg;
5208
5209 /* Optionally, space(s) can follow. */
5210 while(off < ctx->size && CH(off) == _T(' '))
5211 off++;
5212
5213 /* Optionally, an info string can follow. */
5214 while(off < ctx->size && !ISNEWLINE(off)) {
5215 /* Backtick-based fence must not contain '`' in the info string. */
5216 if(CH(beg) == _T('`') && CH(off) == _T('`'))
5217 return FALSE;
5218 off++;
5219 }
5220
5221 *p_end = off;
5222 return TRUE;
5223 }
5224
5225 static int
md_is_closing_code_fence(MD_CTX * ctx,CHAR ch,OFF beg,OFF * p_end)5226 md_is_closing_code_fence(MD_CTX* ctx, CHAR ch, OFF beg, OFF* p_end)
5227 {
5228 OFF off = beg;
5229 int ret = FALSE;
5230
5231 /* Closing fence must have at least the same length and use same char as
5232 * opening one. */
5233 while(off < ctx->size && CH(off) == ch)
5234 off++;
5235 if(off - beg < ctx->code_fence_length)
5236 goto out;
5237
5238 /* Optionally, space(s) can follow */
5239 while(off < ctx->size && CH(off) == _T(' '))
5240 off++;
5241
5242 /* But nothing more is allowed on the line. */
5243 if(off < ctx->size && !ISNEWLINE(off))
5244 goto out;
5245
5246 ret = TRUE;
5247
5248 out:
5249 /* Note we set *p_end even on failure: If we are not closing fence, caller
5250 * would eat the line anyway without any parsing. */
5251 *p_end = off;
5252 return ret;
5253 }
5254
5255 /* Returns type of the raw HTML block, or FALSE if it is not HTML block.
5256 * (Refer to CommonMark specification for details about the types.)
5257 */
5258 static int
md_is_html_block_start_condition(MD_CTX * ctx,OFF beg)5259 md_is_html_block_start_condition(MD_CTX* ctx, OFF beg)
5260 {
5261 typedef struct TAG_tag TAG;
5262 struct TAG_tag {
5263 const CHAR* name;
5264 unsigned len : 8;
5265 };
5266
5267 /* Type 6 is started by a long list of allowed tags. We use two-level
5268 * tree to speed-up the search. */
5269 #ifdef X
5270 #undef X
5271 #endif
5272 #define X(name) { _T(name), (sizeof(name)-1) / sizeof(CHAR) }
5273 #define Xend { NULL, 0 }
5274 static const TAG t1[] = { X("script"), X("pre"), X("style"), Xend };
5275
5276 static const TAG a6[] = { X("address"), X("article"), X("aside"), Xend };
5277 static const TAG b6[] = { X("base"), X("basefont"), X("blockquote"), X("body"), Xend };
5278 static const TAG c6[] = { X("caption"), X("center"), X("col"), X("colgroup"), Xend };
5279 static const TAG d6[] = { X("dd"), X("details"), X("dialog"), X("dir"),
5280 X("div"), X("dl"), X("dt"), Xend };
5281 static const TAG f6[] = { X("fieldset"), X("figcaption"), X("figure"), X("footer"),
5282 X("form"), X("frame"), X("frameset"), Xend };
5283 static const TAG h6[] = { X("h1"), X("head"), X("header"), X("hr"), X("html"), Xend };
5284 static const TAG i6[] = { X("iframe"), Xend };
5285 static const TAG l6[] = { X("legend"), X("li"), X("link"), Xend };
5286 static const TAG m6[] = { X("main"), X("menu"), X("menuitem"), Xend };
5287 static const TAG n6[] = { X("nav"), X("noframes"), Xend };
5288 static const TAG o6[] = { X("ol"), X("optgroup"), X("option"), Xend };
5289 static const TAG p6[] = { X("p"), X("param"), Xend };
5290 static const TAG s6[] = { X("section"), X("source"), X("summary"), Xend };
5291 static const TAG t6[] = { X("table"), X("tbody"), X("td"), X("tfoot"), X("th"),
5292 X("thead"), X("title"), X("tr"), X("track"), Xend };
5293 static const TAG u6[] = { X("ul"), Xend };
5294 static const TAG xx[] = { Xend };
5295 #undef X
5296
5297 static const TAG* map6[26] = {
5298 a6, b6, c6, d6, xx, f6, xx, h6, i6, xx, xx, l6, m6,
5299 n6, o6, p6, xx, xx, s6, t6, u6, xx, xx, xx, xx, xx
5300 };
5301 OFF off = beg + 1;
5302 int i;
5303
5304 /* Check for type 1: <script, <pre, or <style */
5305 for(i = 0; t1[i].name != NULL; i++) {
5306 if(off + t1[i].len <= ctx->size) {
5307 if(md_ascii_case_eq(STR(off), t1[i].name, t1[i].len))
5308 return 1;
5309 }
5310 }
5311
5312 /* Check for type 2: <!-- */
5313 if(off + 3 < ctx->size && CH(off) == _T('!') && CH(off+1) == _T('-') && CH(off+2) == _T('-'))
5314 return 2;
5315
5316 /* Check for type 3: <? */
5317 if(off < ctx->size && CH(off) == _T('?'))
5318 return 3;
5319
5320 /* Check for type 4 or 5: <! */
5321 if(off < ctx->size && CH(off) == _T('!')) {
5322 /* Check for type 4: <! followed by uppercase letter. */
5323 if(off + 1 < ctx->size && ISUPPER(off+1))
5324 return 4;
5325
5326 /* Check for type 5: <![CDATA[ */
5327 if(off + 8 < ctx->size) {
5328 if(md_ascii_eq(STR(off), _T("![CDATA["), 8))
5329 return 5;
5330 }
5331 }
5332
5333 /* Check for type 6: Many possible starting tags listed above. */
5334 if(off + 1 < ctx->size && (ISALPHA(off) || (CH(off) == _T('/') && ISALPHA(off+1)))) {
5335 int slot;
5336 const TAG* tags;
5337
5338 if(CH(off) == _T('/'))
5339 off++;
5340
5341 slot = (ISUPPER(off) ? CH(off) - 'A' : CH(off) - 'a');
5342 tags = map6[slot];
5343
5344 for(i = 0; tags[i].name != NULL; i++) {
5345 if(off + tags[i].len <= ctx->size) {
5346 if(md_ascii_case_eq(STR(off), tags[i].name, tags[i].len)) {
5347 OFF tmp = off + tags[i].len;
5348 if(tmp >= ctx->size)
5349 return 6;
5350 if(ISBLANK(tmp) || ISNEWLINE(tmp) || CH(tmp) == _T('>'))
5351 return 6;
5352 if(tmp+1 < ctx->size && CH(tmp) == _T('/') && CH(tmp+1) == _T('>'))
5353 return 6;
5354 break;
5355 }
5356 }
5357 }
5358 }
5359
5360 /* Check for type 7: any COMPLETE other opening or closing tag. */
5361 if(off + 1 < ctx->size) {
5362 OFF end;
5363
5364 if(md_is_html_tag(ctx, NULL, 0, beg, ctx->size, &end)) {
5365 /* Only optional whitespace and new line may follow. */
5366 while(end < ctx->size && ISWHITESPACE(end))
5367 end++;
5368 if(end >= ctx->size || ISNEWLINE(end))
5369 return 7;
5370 }
5371 }
5372
5373 return FALSE;
5374 }
5375
5376 /* Case sensitive check whether there is a substring 'what' between 'beg'
5377 * and end of line. */
5378 static int
md_line_contains(MD_CTX * ctx,OFF beg,const CHAR * what,SZ what_len,OFF * p_end)5379 md_line_contains(MD_CTX* ctx, OFF beg, const CHAR* what, SZ what_len, OFF* p_end)
5380 {
5381 OFF i;
5382 for(i = beg; i + what_len < ctx->size; i++) {
5383 if(ISNEWLINE(i))
5384 break;
5385 if(memcmp(STR(i), what, what_len * sizeof(CHAR)) == 0) {
5386 *p_end = i + what_len;
5387 return TRUE;
5388 }
5389 }
5390
5391 *p_end = i;
5392 return FALSE;
5393 }
5394
5395 /* Returns type of HTML block end condition or FALSE if not an end condition.
5396 *
5397 * Note it fills p_end even when it is not end condition as the caller
5398 * does not need to analyze contents of a raw HTML block.
5399 */
5400 static int
md_is_html_block_end_condition(MD_CTX * ctx,OFF beg,OFF * p_end)5401 md_is_html_block_end_condition(MD_CTX* ctx, OFF beg, OFF* p_end)
5402 {
5403 switch(ctx->html_block_type) {
5404 case 1:
5405 {
5406 OFF off = beg;
5407
5408 while(off < ctx->size && !ISNEWLINE(off)) {
5409 if(CH(off) == _T('<')) {
5410 if(md_ascii_case_eq(STR(off), _T("</script>"), 9)) {
5411 *p_end = off + 9;
5412 return TRUE;
5413 }
5414
5415 if(md_ascii_case_eq(STR(off), _T("</style>"), 8)) {
5416 *p_end = off + 8;
5417 return TRUE;
5418 }
5419
5420 if(md_ascii_case_eq(STR(off), _T("</pre>"), 6)) {
5421 *p_end = off + 6;
5422 return TRUE;
5423 }
5424 }
5425
5426 off++;
5427 }
5428 *p_end = off;
5429 return FALSE;
5430 }
5431
5432 case 2:
5433 return (md_line_contains(ctx, beg, _T("-->"), 3, p_end) ? 2 : FALSE);
5434
5435 case 3:
5436 return (md_line_contains(ctx, beg, _T("?>"), 2, p_end) ? 3 : FALSE);
5437
5438 case 4:
5439 return (md_line_contains(ctx, beg, _T(">"), 1, p_end) ? 4 : FALSE);
5440
5441 case 5:
5442 return (md_line_contains(ctx, beg, _T("]]>"), 3, p_end) ? 5 : FALSE);
5443
5444 case 6: /* Pass through */
5445 case 7:
5446 *p_end = beg;
5447 return (ISNEWLINE(beg) ? ctx->html_block_type : FALSE);
5448
5449 default:
5450 MD_UNREACHABLE();
5451 }
5452 return FALSE;
5453 }
5454
5455
5456 static int
md_is_container_compatible(const MD_CONTAINER * pivot,const MD_CONTAINER * container)5457 md_is_container_compatible(const MD_CONTAINER* pivot, const MD_CONTAINER* container)
5458 {
5459 /* Block quote has no "items" like lists. */
5460 if(container->ch == _T('>'))
5461 return FALSE;
5462
5463 if(container->ch != pivot->ch)
5464 return FALSE;
5465 if(container->mark_indent > pivot->contents_indent)
5466 return FALSE;
5467
5468 return TRUE;
5469 }
5470
5471 static int
md_push_container(MD_CTX * ctx,const MD_CONTAINER * container)5472 md_push_container(MD_CTX* ctx, const MD_CONTAINER* container)
5473 {
5474 if(ctx->n_containers >= ctx->alloc_containers) {
5475 MD_CONTAINER* new_containers;
5476
5477 ctx->alloc_containers = (ctx->alloc_containers > 0
5478 ? ctx->alloc_containers + ctx->alloc_containers / 2
5479 : 16);
5480 new_containers = realloc(ctx->containers, ctx->alloc_containers * sizeof(MD_CONTAINER));
5481 if(new_containers == NULL) {
5482 MD_LOG("realloc() failed.");
5483 return -1;
5484 }
5485
5486 ctx->containers = new_containers;
5487 }
5488
5489 memcpy(&ctx->containers[ctx->n_containers++], container, sizeof(MD_CONTAINER));
5490 return 0;
5491 }
5492
5493 static int
md_enter_child_containers(MD_CTX * ctx,int n_children,unsigned data)5494 md_enter_child_containers(MD_CTX* ctx, int n_children, unsigned data)
5495 {
5496 int i;
5497 int ret = 0;
5498
5499 for(i = ctx->n_containers - n_children; i < ctx->n_containers; i++) {
5500 MD_CONTAINER* c = &ctx->containers[i];
5501 int is_ordered_list = FALSE;
5502
5503 switch(c->ch) {
5504 case _T(')'):
5505 case _T('.'):
5506 is_ordered_list = TRUE;
5507 /* Pass through */
5508
5509 case _T('-'):
5510 case _T('+'):
5511 case _T('*'):
5512 /* Remember offset in ctx->block_bytes so we can revisit the
5513 * block if we detect it is a loose list. */
5514 md_end_current_block(ctx);
5515 c->block_byte_off = ctx->n_block_bytes;
5516
5517 MD_CHECK(md_push_container_bytes(ctx,
5518 (is_ordered_list ? MD_BLOCK_OL : MD_BLOCK_UL),
5519 c->start, data, MD_BLOCK_CONTAINER_OPENER));
5520 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI,
5521 c->task_mark_off,
5522 (c->is_task ? CH(c->task_mark_off) : 0),
5523 MD_BLOCK_CONTAINER_OPENER));
5524 break;
5525
5526 case _T('>'):
5527 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_QUOTE, 0, 0, MD_BLOCK_CONTAINER_OPENER));
5528 break;
5529
5530 default:
5531 MD_UNREACHABLE();
5532 break;
5533 }
5534 }
5535
5536 abort:
5537 return ret;
5538 }
5539
5540 static int
md_leave_child_containers(MD_CTX * ctx,int n_keep)5541 md_leave_child_containers(MD_CTX* ctx, int n_keep)
5542 {
5543 int ret = 0;
5544
5545 while(ctx->n_containers > n_keep) {
5546 MD_CONTAINER* c = &ctx->containers[ctx->n_containers-1];
5547 int is_ordered_list = FALSE;
5548
5549 switch(c->ch) {
5550 case _T(')'):
5551 case _T('.'):
5552 is_ordered_list = TRUE;
5553 /* Pass through */
5554
5555 case _T('-'):
5556 case _T('+'):
5557 case _T('*'):
5558 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI,
5559 c->task_mark_off, (c->is_task ? CH(c->task_mark_off) : 0),
5560 MD_BLOCK_CONTAINER_CLOSER));
5561 MD_CHECK(md_push_container_bytes(ctx,
5562 (is_ordered_list ? MD_BLOCK_OL : MD_BLOCK_UL), 0,
5563 c->ch, MD_BLOCK_CONTAINER_CLOSER));
5564 break;
5565
5566 case _T('>'):
5567 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_QUOTE, 0,
5568 0, MD_BLOCK_CONTAINER_CLOSER));
5569 break;
5570
5571 default:
5572 MD_UNREACHABLE();
5573 break;
5574 }
5575
5576 ctx->n_containers--;
5577 }
5578
5579 abort:
5580 return ret;
5581 }
5582
5583 static int
md_is_container_mark(MD_CTX * ctx,unsigned indent,OFF beg,OFF * p_end,MD_CONTAINER * p_container)5584 md_is_container_mark(MD_CTX* ctx, unsigned indent, OFF beg, OFF* p_end, MD_CONTAINER* p_container)
5585 {
5586 OFF off = beg;
5587 OFF max_end;
5588
5589 if(indent >= ctx->code_indent_offset)
5590 return FALSE;
5591
5592 /* Check for block quote mark. */
5593 if(off < ctx->size && CH(off) == _T('>')) {
5594 off++;
5595 p_container->ch = _T('>');
5596 p_container->is_loose = FALSE;
5597 p_container->is_task = FALSE;
5598 p_container->mark_indent = indent;
5599 p_container->contents_indent = indent + 1;
5600 *p_end = off;
5601 return TRUE;
5602 }
5603
5604 /* Check for list item bullet mark. */
5605 if(off+1 < ctx->size && ISANYOF(off, _T("-+*")) && (ISBLANK(off+1) || ISNEWLINE(off+1))) {
5606 p_container->ch = CH(off);
5607 p_container->is_loose = FALSE;
5608 p_container->is_task = FALSE;
5609 p_container->mark_indent = indent;
5610 p_container->contents_indent = indent + 1;
5611 *p_end = off + 1;
5612 return TRUE;
5613 }
5614
5615 /* Check for ordered list item marks. */
5616 max_end = off + 9;
5617 if(max_end > ctx->size)
5618 max_end = ctx->size;
5619 p_container->start = 0;
5620 while(off < max_end && ISDIGIT(off)) {
5621 p_container->start = p_container->start * 10 + CH(off) - _T('0');
5622 off++;
5623 }
5624 if(off > beg && off+1 < ctx->size &&
5625 (CH(off) == _T('.') || CH(off) == _T(')')) &&
5626 (ISBLANK(off+1) || ISNEWLINE(off+1)))
5627 {
5628 p_container->ch = CH(off);
5629 p_container->is_loose = FALSE;
5630 p_container->is_task = FALSE;
5631 p_container->mark_indent = indent;
5632 p_container->contents_indent = indent + off - beg + 1;
5633 *p_end = off + 1;
5634 return TRUE;
5635 }
5636
5637 return FALSE;
5638 }
5639
5640 static unsigned
md_line_indentation(MD_CTX * ctx,unsigned total_indent,OFF beg,OFF * p_end)5641 md_line_indentation(MD_CTX* ctx, unsigned total_indent, OFF beg, OFF* p_end)
5642 {
5643 OFF off = beg;
5644 unsigned indent = total_indent;
5645
5646 while(off < ctx->size && ISBLANK(off)) {
5647 if(CH(off) == _T('\t'))
5648 indent = (indent + 4) & ~3;
5649 else
5650 indent++;
5651 off++;
5652 }
5653
5654 *p_end = off;
5655 return indent - total_indent;
5656 }
5657
/* Shared line analysis of the type MD_LINE_BLANK; the members which are
 * not listed in the initializer are zero-initialized. md_analyze_line()
 * points its pivot_line to it when a container mark is recognized. */
static const MD_LINE_ANALYSIS md_dummy_blank_line = { MD_LINE_BLANK, 0 };
5659
5660 /* Analyze type of the line and find some its properties. This serves as a
5661 * main input for determining type and boundaries of a block. */
5662 static int
md_analyze_line(MD_CTX * ctx,OFF beg,OFF * p_end,const MD_LINE_ANALYSIS * pivot_line,MD_LINE_ANALYSIS * line)5663 md_analyze_line(MD_CTX* ctx, OFF beg, OFF* p_end,
5664 const MD_LINE_ANALYSIS* pivot_line, MD_LINE_ANALYSIS* line)
5665 {
5666 unsigned total_indent = 0;
5667 int n_parents = 0;
5668 int n_brothers = 0;
5669 int n_children = 0;
5670 MD_CONTAINER container = { 0 };
5671 int prev_line_has_list_loosening_effect = ctx->last_line_has_list_loosening_effect;
5672 OFF off = beg;
5673 OFF hr_killer = 0;
5674 int ret = 0;
5675
5676 line->indent = md_line_indentation(ctx, total_indent, off, &off);
5677 total_indent += line->indent;
5678 line->beg = off;
5679
5680 /* Given the indentation and block quote marks '>', determine how many of
5681 * the current containers are our parents. */
5682 while(n_parents < ctx->n_containers) {
5683 MD_CONTAINER* c = &ctx->containers[n_parents];
5684
5685 if(c->ch == _T('>') && line->indent < ctx->code_indent_offset &&
5686 off < ctx->size && CH(off) == _T('>'))
5687 {
5688 /* Block quote mark. */
5689 off++;
5690 total_indent++;
5691 line->indent = md_line_indentation(ctx, total_indent, off, &off);
5692 total_indent += line->indent;
5693
5694 /* The optional 1st space after '>' is part of the block quote mark. */
5695 if(line->indent > 0)
5696 line->indent--;
5697
5698 line->beg = off;
5699
5700 } else if(c->ch != _T('>') && line->indent >= c->contents_indent) {
5701 /* List. */
5702 line->indent -= c->contents_indent;
5703 } else {
5704 break;
5705 }
5706
5707 n_parents++;
5708 }
5709
5710 if(off >= ctx->size || ISNEWLINE(off)) {
5711 /* Blank line does not need any real indentation to be nested inside
5712 * a list. */
5713 if(n_brothers + n_children == 0) {
5714 while(n_parents < ctx->n_containers && ctx->containers[n_parents].ch != _T('>'))
5715 n_parents++;
5716 }
5717 }
5718
5719 while(TRUE) {
5720 /* Check whether we are fenced code continuation. */
5721 if(pivot_line->type == MD_LINE_FENCEDCODE) {
5722 line->beg = off;
5723
5724 /* We are another MD_LINE_FENCEDCODE unless we are closing fence
5725 * which we transform into MD_LINE_BLANK. */
5726 if(line->indent < ctx->code_indent_offset) {
5727 if(md_is_closing_code_fence(ctx, CH(pivot_line->beg), off, &off)) {
5728 line->type = MD_LINE_BLANK;
5729 ctx->last_line_has_list_loosening_effect = FALSE;
5730 break;
5731 }
5732 }
5733
5734 /* Change indentation accordingly to the initial code fence. */
5735 if(n_parents == ctx->n_containers) {
5736 if(line->indent > pivot_line->indent)
5737 line->indent -= pivot_line->indent;
5738 else
5739 line->indent = 0;
5740
5741 line->type = MD_LINE_FENCEDCODE;
5742 break;
5743 }
5744 }
5745
5746 /* Check whether we are HTML block continuation. */
5747 if(pivot_line->type == MD_LINE_HTML && ctx->html_block_type > 0) {
5748 int html_block_type;
5749
5750 html_block_type = md_is_html_block_end_condition(ctx, off, &off);
5751 if(html_block_type > 0) {
5752 MD_ASSERT(html_block_type == ctx->html_block_type);
5753
5754 /* Make sure this is the last line of the block. */
5755 ctx->html_block_type = 0;
5756
5757 /* Some end conditions serve as blank lines at the same time. */
5758 if(html_block_type == 6 || html_block_type == 7) {
5759 line->type = MD_LINE_BLANK;
5760 line->indent = 0;
5761 break;
5762 }
5763 }
5764
5765 if(n_parents == ctx->n_containers) {
5766 line->type = MD_LINE_HTML;
5767 break;
5768 }
5769 }
5770
5771 /* Check for blank line. */
5772 if(off >= ctx->size || ISNEWLINE(off)) {
5773 if(pivot_line->type == MD_LINE_INDENTEDCODE && n_parents == ctx->n_containers) {
5774 line->type = MD_LINE_INDENTEDCODE;
5775 if(line->indent > ctx->code_indent_offset)
5776 line->indent -= ctx->code_indent_offset;
5777 else
5778 line->indent = 0;
5779 ctx->last_line_has_list_loosening_effect = FALSE;
5780 } else {
5781 line->type = MD_LINE_BLANK;
5782 ctx->last_line_has_list_loosening_effect = (n_parents > 0 &&
5783 n_brothers + n_children == 0 &&
5784 ctx->containers[n_parents-1].ch != _T('>'));
5785
5786 #if 1
5787 /* See https://github.com/mity/md4c/issues/6
5788 *
5789 * This ugly checking tests we are in (yet empty) list item but not
5790 * its very first line (with the list item mark).
5791 *
5792 * If we are such blank line, then any following non-blank line
5793 * which would be part of this list item actually ends the list
5794 * because "a list item can begin with at most one blank line."
5795 */
5796 if(n_parents > 0 && ctx->containers[n_parents-1].ch != _T('>') &&
5797 n_brothers + n_children == 0 && ctx->current_block == NULL &&
5798 ctx->n_block_bytes > (int) sizeof(MD_BLOCK))
5799 {
5800 MD_BLOCK* top_block = (MD_BLOCK*) ((char*)ctx->block_bytes + ctx->n_block_bytes - sizeof(MD_BLOCK));
5801 if(top_block->type == MD_BLOCK_LI)
5802 ctx->last_list_item_starts_with_two_blank_lines = TRUE;
5803 }
5804 #endif
5805 }
5806 break;
5807 } else {
5808 #if 1
5809 /* This is 2nd half of the hack. If the flag is set (that is there
5810 * were 2nd blank line at the start of the list item) and we would also
5811 * belonging to such list item, than interrupt the list. */
5812 ctx->last_line_has_list_loosening_effect = FALSE;
5813 if(ctx->last_list_item_starts_with_two_blank_lines) {
5814 if(n_parents > 0 && ctx->containers[n_parents-1].ch != _T('>') &&
5815 n_brothers + n_children == 0 && ctx->current_block == NULL &&
5816 ctx->n_block_bytes > (int) sizeof(MD_BLOCK))
5817 {
5818 MD_BLOCK* top_block = (MD_BLOCK*) ((char*)ctx->block_bytes + ctx->n_block_bytes - sizeof(MD_BLOCK));
5819 if(top_block->type == MD_BLOCK_LI)
5820 n_parents--;
5821 }
5822
5823 ctx->last_list_item_starts_with_two_blank_lines = FALSE;
5824 }
5825 #endif
5826 }
5827
5828 /* Check whether we are Setext underline. */
5829 if(line->indent < ctx->code_indent_offset && pivot_line->type == MD_LINE_TEXT
5830 && (CH(off) == _T('=') || CH(off) == _T('-'))
5831 && (n_parents == ctx->n_containers))
5832 {
5833 unsigned level;
5834
5835 if(md_is_setext_underline(ctx, off, &off, &level)) {
5836 line->type = MD_LINE_SETEXTUNDERLINE;
5837 line->data = level;
5838 break;
5839 }
5840 }
5841
5842 /* Check for thematic break line. */
5843 if(line->indent < ctx->code_indent_offset && ISANYOF(off, _T("-_*")) && off >= hr_killer) {
5844 if(md_is_hr_line(ctx, off, &off, &hr_killer)) {
5845 line->type = MD_LINE_HR;
5846 break;
5847 }
5848 }
5849
5850 /* Check for "brother" container. I.e. whether we are another list item
5851 * in already started list. */
5852 if(n_parents < ctx->n_containers && n_brothers + n_children == 0) {
5853 OFF tmp;
5854
5855 if(md_is_container_mark(ctx, line->indent, off, &tmp, &container) &&
5856 md_is_container_compatible(&ctx->containers[n_parents], &container))
5857 {
5858 pivot_line = &md_dummy_blank_line;
5859
5860 off = tmp;
5861
5862 total_indent += container.contents_indent - container.mark_indent;
5863 line->indent = md_line_indentation(ctx, total_indent, off, &off);
5864 total_indent += line->indent;
5865 line->beg = off;
5866
5867 /* Some of the following whitespace actually still belongs to the mark. */
5868 if(off >= ctx->size || ISNEWLINE(off)) {
5869 container.contents_indent++;
5870 } else if(line->indent <= ctx->code_indent_offset) {
5871 container.contents_indent += line->indent;
5872 line->indent = 0;
5873 } else {
5874 container.contents_indent += 1;
5875 line->indent--;
5876 }
5877
5878 ctx->containers[n_parents].mark_indent = container.mark_indent;
5879 ctx->containers[n_parents].contents_indent = container.contents_indent;
5880
5881 n_brothers++;
5882 continue;
5883 }
5884 }
5885
5886 /* Check for indented code.
5887 * Note indented code block cannot interrupt a paragraph. */
5888 if(line->indent >= ctx->code_indent_offset &&
5889 (pivot_line->type == MD_LINE_BLANK || pivot_line->type == MD_LINE_INDENTEDCODE))
5890 {
5891 line->type = MD_LINE_INDENTEDCODE;
5892 MD_ASSERT(line->indent >= ctx->code_indent_offset);
5893 line->indent -= ctx->code_indent_offset;
5894 line->data = 0;
5895 break;
5896 }
5897
5898 /* Check for start of a new container block. */
5899 if(line->indent < ctx->code_indent_offset &&
5900 md_is_container_mark(ctx, line->indent, off, &off, &container))
5901 {
5902 if(pivot_line->type == MD_LINE_TEXT && n_parents == ctx->n_containers &&
5903 (off >= ctx->size || ISNEWLINE(off)) && container.ch != _T('>'))
5904 {
5905 /* Noop. List mark followed by a blank line cannot interrupt a paragraph. */
5906 } else if(pivot_line->type == MD_LINE_TEXT && n_parents == ctx->n_containers &&
5907 (container.ch == _T('.') || container.ch == _T(')')) && container.start != 1)
5908 {
5909 /* Noop. Ordered list cannot interrupt a paragraph unless the start index is 1. */
5910 } else {
5911 total_indent += container.contents_indent - container.mark_indent;
5912 line->indent = md_line_indentation(ctx, total_indent, off, &off);
5913 total_indent += line->indent;
5914
5915 line->beg = off;
5916 line->data = container.ch;
5917
5918 /* Some of the following whitespace actually still belongs to the mark. */
5919 if(off >= ctx->size || ISNEWLINE(off)) {
5920 container.contents_indent++;
5921 } else if(line->indent <= ctx->code_indent_offset) {
5922 container.contents_indent += line->indent;
5923 line->indent = 0;
5924 } else {
5925 container.contents_indent += 1;
5926 line->indent--;
5927 }
5928
5929 if(n_brothers + n_children == 0)
5930 pivot_line = &md_dummy_blank_line;
5931
5932 if(n_children == 0)
5933 MD_CHECK(md_leave_child_containers(ctx, n_parents + n_brothers));
5934
5935 n_children++;
5936 MD_CHECK(md_push_container(ctx, &container));
5937 continue;
5938 }
5939 }
5940
5941 /* Check whether we are table continuation. */
5942 if(pivot_line->type == MD_LINE_TABLE && n_parents == ctx->n_containers) {
5943 line->type = MD_LINE_TABLE;
5944 break;
5945 }
5946
5947 /* Check for ATX header. */
5948 if(line->indent < ctx->code_indent_offset && CH(off) == _T('#')) {
5949 unsigned level;
5950
5951 if(md_is_atxheader_line(ctx, off, &line->beg, &off, &level)) {
5952 line->type = MD_LINE_ATXHEADER;
5953 line->data = level;
5954 break;
5955 }
5956 }
5957
5958 /* Check whether we are starting code fence. */
5959 if(CH(off) == _T('`') || CH(off) == _T('~')) {
5960 if(md_is_opening_code_fence(ctx, off, &off)) {
5961 line->type = MD_LINE_FENCEDCODE;
5962 line->data = 1;
5963 break;
5964 }
5965 }
5966
5967 /* Check for start of raw HTML block. */
5968 if(CH(off) == _T('<') && !(ctx->parser.flags & MD_FLAG_NOHTMLBLOCKS))
5969 {
5970 ctx->html_block_type = md_is_html_block_start_condition(ctx, off);
5971
5972 /* HTML block type 7 cannot interrupt paragraph. */
5973 if(ctx->html_block_type == 7 && pivot_line->type == MD_LINE_TEXT)
5974 ctx->html_block_type = 0;
5975
5976 if(ctx->html_block_type > 0) {
5977 /* The line itself also may immediately close the block. */
5978 if(md_is_html_block_end_condition(ctx, off, &off) == ctx->html_block_type) {
5979 /* Make sure this is the last line of the block. */
5980 ctx->html_block_type = 0;
5981 }
5982
5983 line->type = MD_LINE_HTML;
5984 break;
5985 }
5986 }
5987
5988 /* Check for table underline. */
5989 if((ctx->parser.flags & MD_FLAG_TABLES) && pivot_line->type == MD_LINE_TEXT &&
5990 (CH(off) == _T('|') || CH(off) == _T('-') || CH(off) == _T(':')) &&
5991 n_parents == ctx->n_containers)
5992 {
5993 unsigned col_count;
5994
5995 if(ctx->current_block != NULL && ctx->current_block->n_lines == 1 &&
5996 md_is_table_underline(ctx, off, &off, &col_count))
5997 {
5998 line->data = col_count;
5999 line->type = MD_LINE_TABLEUNDERLINE;
6000 break;
6001 }
6002 }
6003
6004 /* By default, we are normal text line. */
6005 line->type = MD_LINE_TEXT;
6006 if(pivot_line->type == MD_LINE_TEXT && n_brothers + n_children == 0) {
6007 /* Lazy continuation. */
6008 n_parents = ctx->n_containers;
6009 }
6010
6011 /* Check for task mark. */
6012 if((ctx->parser.flags & MD_FLAG_TASKLISTS) && n_brothers + n_children > 0 &&
6013 ISANYOF_(ctx->containers[ctx->n_containers-1].ch, _T("-+*.)")))
6014 {
6015 OFF tmp = off;
6016
6017 while(tmp < ctx->size && tmp < off + 3 && ISBLANK(tmp))
6018 tmp++;
6019 if(tmp + 2 < ctx->size && CH(tmp) == _T('[') &&
6020 ISANYOF(tmp+1, _T("xX ")) && CH(tmp+2) == _T(']') &&
6021 (tmp + 3 == ctx->size || ISBLANK(tmp+3) || ISNEWLINE(tmp+3)))
6022 {
6023 MD_CONTAINER* task_container = (n_children > 0 ? &ctx->containers[ctx->n_containers-1] : &container);
6024 task_container->is_task = TRUE;
6025 task_container->task_mark_off = tmp + 1;
6026 off = tmp + 3;
6027 while(ISWHITESPACE(off))
6028 off++;
6029 line->beg = off;
6030 }
6031 }
6032
6033 break;
6034 }
6035
6036 /* Scan for end of the line.
6037 *
6038 * Note this is quite a bottleneck of the parsing as we here iterate almost
6039 * over the complete document.
6040 */
6041 #if defined __linux__ && !defined MD4C_USE_UTF16
6042 /* Recent glibc versions have superbly optimized strcspn(), even using
6043 * vectorization if available. */
6044 if(ctx->doc_ends_with_newline && off < ctx->size) {
6045 while(TRUE) {
6046 off += (OFF) strcspn(STR(off), "\r\n");
6047
6048 /* strcspn() can stop on zero terminator; but that can appear
6049 * anywhere in the Markdown input... */
6050 if(CH(off) == _T('\0'))
6051 off++;
6052 else
6053 break;
6054 }
6055 } else
6056 #endif
6057 {
6058 /* Optimization: Use some loop unrolling. */
6059 while(off + 3 < ctx->size && !ISNEWLINE(off+0) && !ISNEWLINE(off+1)
6060 && !ISNEWLINE(off+2) && !ISNEWLINE(off+3))
6061 off += 4;
6062 while(off < ctx->size && !ISNEWLINE(off))
6063 off++;
6064 }
6065
6066 /* Set end of the line. */
6067 line->end = off;
6068
6069 /* But for ATX header, we should exclude the optional trailing mark. */
6070 if(line->type == MD_LINE_ATXHEADER) {
6071 OFF tmp = line->end;
6072 while(tmp > line->beg && CH(tmp-1) == _T(' '))
6073 tmp--;
6074 while(tmp > line->beg && CH(tmp-1) == _T('#'))
6075 tmp--;
6076 if(tmp == line->beg || CH(tmp-1) == _T(' ') || (ctx->parser.flags & MD_FLAG_PERMISSIVEATXHEADERS))
6077 line->end = tmp;
6078 }
6079
6080 /* Trim trailing spaces. */
6081 if(line->type != MD_LINE_INDENTEDCODE && line->type != MD_LINE_FENCEDCODE) {
6082 while(line->end > line->beg && CH(line->end-1) == _T(' '))
6083 line->end--;
6084 }
6085
6086 /* Eat also the new line. */
6087 if(off < ctx->size && CH(off) == _T('\r'))
6088 off++;
6089 if(off < ctx->size && CH(off) == _T('\n'))
6090 off++;
6091
6092 *p_end = off;
6093
6094 /* If we belong to a list after seeing a blank line, the list is loose. */
6095 if(prev_line_has_list_loosening_effect && line->type != MD_LINE_BLANK && n_parents + n_brothers > 0) {
6096 MD_CONTAINER* c = &ctx->containers[n_parents + n_brothers - 1];
6097 if(c->ch != _T('>')) {
6098 MD_BLOCK* block = (MD_BLOCK*) (((char*)ctx->block_bytes) + c->block_byte_off);
6099 block->flags |= MD_BLOCK_LOOSE_LIST;
6100 }
6101 }
6102
6103 /* Leave any containers we are not part of anymore. */
6104 if(n_children == 0 && n_parents + n_brothers < ctx->n_containers)
6105 MD_CHECK(md_leave_child_containers(ctx, n_parents + n_brothers));
6106
6107 /* Enter any container we found a mark for. */
6108 if(n_brothers > 0) {
6109 MD_ASSERT(n_brothers == 1);
6110 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI,
6111 ctx->containers[n_parents].task_mark_off,
6112 (ctx->containers[n_parents].is_task ? CH(ctx->containers[n_parents].task_mark_off) : 0),
6113 MD_BLOCK_CONTAINER_CLOSER));
6114 MD_CHECK(md_push_container_bytes(ctx, MD_BLOCK_LI,
6115 container.task_mark_off,
6116 (container.is_task ? CH(container.task_mark_off) : 0),
6117 MD_BLOCK_CONTAINER_OPENER));
6118 ctx->containers[n_parents].is_task = container.is_task;
6119 ctx->containers[n_parents].task_mark_off = container.task_mark_off;
6120 }
6121
6122 if(n_children > 0)
6123 MD_CHECK(md_enter_child_containers(ctx, n_children, line->data));
6124
6125 abort:
6126 return ret;
6127 }
6128
6129 static int
md_process_line(MD_CTX * ctx,const MD_LINE_ANALYSIS ** p_pivot_line,MD_LINE_ANALYSIS * line)6130 md_process_line(MD_CTX* ctx, const MD_LINE_ANALYSIS** p_pivot_line, MD_LINE_ANALYSIS* line)
6131 {
6132 const MD_LINE_ANALYSIS* pivot_line = *p_pivot_line;
6133 int ret = 0;
6134
6135 /* Blank line ends current leaf block. */
6136 if(line->type == MD_LINE_BLANK) {
6137 MD_CHECK(md_end_current_block(ctx));
6138 *p_pivot_line = &md_dummy_blank_line;
6139 return 0;
6140 }
6141
6142 /* Some line types form block on their own. */
6143 if(line->type == MD_LINE_HR || line->type == MD_LINE_ATXHEADER) {
6144 MD_CHECK(md_end_current_block(ctx));
6145
6146 /* Add our single-line block. */
6147 MD_CHECK(md_start_new_block(ctx, line));
6148 MD_CHECK(md_add_line_into_current_block(ctx, line));
6149 MD_CHECK(md_end_current_block(ctx));
6150 *p_pivot_line = &md_dummy_blank_line;
6151 return 0;
6152 }
6153
6154 /* MD_LINE_SETEXTUNDERLINE changes meaning of the current block and ends it. */
6155 if(line->type == MD_LINE_SETEXTUNDERLINE) {
6156 MD_ASSERT(ctx->current_block != NULL);
6157 ctx->current_block->type = MD_BLOCK_H;
6158 ctx->current_block->data = line->data;
6159 ctx->current_block->flags |= MD_BLOCK_SETEXT_HEADER;
6160 MD_CHECK(md_add_line_into_current_block(ctx, line));
6161 MD_CHECK(md_end_current_block(ctx));
6162 if(ctx->current_block == NULL) {
6163 *p_pivot_line = &md_dummy_blank_line;
6164 } else {
6165 /* This happens if we have consumed all the body as link ref. defs.
6166 * and downgraded the underline into start of a new paragraph block. */
6167 line->type = MD_LINE_TEXT;
6168 *p_pivot_line = line;
6169 }
6170 return 0;
6171 }
6172
6173 /* MD_LINE_TABLEUNDERLINE changes meaning of the current block. */
6174 if(line->type == MD_LINE_TABLEUNDERLINE) {
6175 MD_ASSERT(ctx->current_block != NULL);
6176 MD_ASSERT(ctx->current_block->n_lines == 1);
6177 ctx->current_block->type = MD_BLOCK_TABLE;
6178 ctx->current_block->data = line->data;
6179 MD_ASSERT(pivot_line != &md_dummy_blank_line);
6180 ((MD_LINE_ANALYSIS*)pivot_line)->type = MD_LINE_TABLE;
6181 MD_CHECK(md_add_line_into_current_block(ctx, line));
6182 return 0;
6183 }
6184
6185 /* The current block also ends if the line has different type. */
6186 if(line->type != pivot_line->type)
6187 MD_CHECK(md_end_current_block(ctx));
6188
6189 /* The current line may start a new block. */
6190 if(ctx->current_block == NULL) {
6191 MD_CHECK(md_start_new_block(ctx, line));
6192 *p_pivot_line = line;
6193 }
6194
6195 /* In all other cases the line is just a continuation of the current block. */
6196 MD_CHECK(md_add_line_into_current_block(ctx, line));
6197
6198 abort:
6199 return ret;
6200 }
6201
6202 static int
md_process_doc(MD_CTX * ctx)6203 md_process_doc(MD_CTX *ctx)
6204 {
6205 const MD_LINE_ANALYSIS* pivot_line = &md_dummy_blank_line;
6206 MD_LINE_ANALYSIS line_buf[2];
6207 MD_LINE_ANALYSIS* line = &line_buf[0];
6208 OFF off = 0;
6209 int ret = 0;
6210
6211 MD_ENTER_BLOCK(MD_BLOCK_DOC, NULL);
6212
6213 while(off < ctx->size) {
6214 if(line == pivot_line)
6215 line = (line == &line_buf[0] ? &line_buf[1] : &line_buf[0]);
6216
6217 MD_CHECK(md_analyze_line(ctx, off, &off, pivot_line, line));
6218 MD_CHECK(md_process_line(ctx, &pivot_line, line));
6219 }
6220
6221 md_end_current_block(ctx);
6222
6223 MD_CHECK(md_build_ref_def_hashtable(ctx));
6224
6225 /* Process all blocks. */
6226 MD_CHECK(md_leave_child_containers(ctx, 0));
6227 MD_CHECK(md_process_all_blocks(ctx));
6228
6229 MD_LEAVE_BLOCK(MD_BLOCK_DOC, NULL);
6230
6231 abort:
6232
6233 #if 0
6234 /* Output some memory consumption statistics. */
6235 {
6236 char buffer[256];
6237 sprintf(buffer, "Alloced %u bytes for block buffer.",
6238 (unsigned)(ctx->alloc_block_bytes));
6239 MD_LOG(buffer);
6240
6241 sprintf(buffer, "Alloced %u bytes for containers buffer.",
6242 (unsigned)(ctx->alloc_containers * sizeof(MD_CONTAINER)));
6243 MD_LOG(buffer);
6244
6245 sprintf(buffer, "Alloced %u bytes for marks buffer.",
6246 (unsigned)(ctx->alloc_marks * sizeof(MD_MARK)));
6247 MD_LOG(buffer);
6248
6249 sprintf(buffer, "Alloced %u bytes for aux. buffer.",
6250 (unsigned)(ctx->alloc_buffer * sizeof(MD_CHAR)));
6251 MD_LOG(buffer);
6252 }
6253 #endif
6254
6255 return ret;
6256 }
6257
6258
6259 /********************
6260 *** Public API ***
6261 ********************/
6262
6263 int
md_parse(const MD_CHAR * text,MD_SIZE size,const MD_PARSER * parser,void * userdata)6264 md_parse(const MD_CHAR* text, MD_SIZE size, const MD_PARSER* parser, void* userdata)
6265 {
6266 MD_CTX ctx;
6267 int i;
6268 int ret;
6269
6270 if(parser->abi_version != 0) {
6271 if(parser->debug_log != NULL)
6272 parser->debug_log("Unsupported abi_version.", userdata);
6273 return -1;
6274 }
6275
6276 /* Setup context structure. */
6277 memset(&ctx, 0, sizeof(MD_CTX));
6278 ctx.text = text;
6279 ctx.size = size;
6280 memcpy(&ctx.parser, parser, sizeof(MD_PARSER));
6281 ctx.userdata = userdata;
6282 ctx.code_indent_offset = (ctx.parser.flags & MD_FLAG_NOINDENTEDCODEBLOCKS) ? (OFF)(-1) : 4;
6283 md_build_mark_char_map(&ctx);
6284 ctx.doc_ends_with_newline = (size > 0 && ISNEWLINE_(text[size-1]));
6285
6286 /* Reset all unresolved opener mark chains. */
6287 for(i = 0; i < (int) SIZEOF_ARRAY(ctx.mark_chains); i++) {
6288 ctx.mark_chains[i].head = -1;
6289 ctx.mark_chains[i].tail = -1;
6290 }
6291 ctx.unresolved_link_head = -1;
6292 ctx.unresolved_link_tail = -1;
6293
6294 /* All the work. */
6295 ret = md_process_doc(&ctx);
6296
6297 /* Clean-up. */
6298 md_free_ref_defs(&ctx);
6299 md_free_ref_def_hashtable(&ctx);
6300 free(ctx.buffer);
6301 free(ctx.marks);
6302 free(ctx.block_bytes);
6303 free(ctx.containers);
6304
6305 return ret;
6306 }
6307