1 #include <stdlib.h>
2 #include <string.h>
3 #include <stdio.h>
4 
5 #include "cmark_ctype.h"
6 #include "config.h"
7 #include "node.h"
8 #include "parser.h"
9 #include "references.h"
10 #include "cmark-gfm.h"
11 #include "houdini.h"
12 #include "utf8.h"
13 #include "scanners.h"
14 #include "inlines.h"
15 #include "syntax_extension.h"
16 
// UTF-8 encoded replacement text used by the "smart" punctuation handlers
// in this file (handle_hyphen, handle_period, handle_delim, process_emphasis).
static const char *EMDASH = "\xE2\x80\x94"; // U+2014
static const char *ENDASH = "\xE2\x80\x93"; // U+2013
static const char *ELLIPSES = "\xE2\x80\xA6"; // U+2026
static const char *LEFTDOUBLEQUOTE = "\xE2\x80\x9C"; // U+201C
static const char *RIGHTDOUBLEQUOTE = "\xE2\x80\x9D"; // U+201D
static const char *LEFTSINGLEQUOTE = "\xE2\x80\x98"; // U+2018
static const char *RIGHTSINGLEQUOTE = "\xE2\x80\x99"; // U+2019
24 
// Macros for creating various kinds of simple inline nodes.
// make_str, make_code and make_raw_html forward to make_literal() and take
// the subject, a start column, an end column and the literal chunk; the
// remaining macros forward to make_simple() and take only the allocator.
#define make_str(subj, sc, ec, s) make_literal(subj, CMARK_NODE_TEXT, sc, ec, s)
#define make_code(subj, sc, ec, s) make_literal(subj, CMARK_NODE_CODE, sc, ec, s)
#define make_raw_html(subj, sc, ec, s) make_literal(subj, CMARK_NODE_HTML_INLINE, sc, ec, s)
#define make_linebreak(mem) make_simple(mem, CMARK_NODE_LINEBREAK)
#define make_softbreak(mem) make_simple(mem, CMARK_NODE_SOFTBREAK)
#define make_emph(mem) make_simple(mem, CMARK_NODE_EMPH)
#define make_strong(mem) make_simple(mem, CMARK_NODE_STRONG)

// Longest backtick run tracked by scan_to_closing_backticks(); also sizes
// the `backticks` array in `subject`.
#define MAXBACKTICKS 80
35 
// One entry of the bracket stack whose top is subject.last_bracket.
// Entries are created by push_bracket() and released by pop_bracket().
typedef struct bracket {
  struct bracket *previous; // entry that was on top before this one, or NULL
  struct delimiter *previous_delimiter; // subject.last_delim when this entry was pushed
  cmark_node *inl_text; // inline node handed to push_bracket()
  bufsize_t position; // subject.pos when this entry was pushed
  bool image; // `image` flag handed to push_bracket()
  bool active; // set to true by push_bracket()
  bool bracket_after; // set to true once another bracket is pushed on top of this one
} bracket;
45 
// Parsing state for one run of inline content; initialized by
// subject_from_buf().
typedef struct subject{
  cmark_mem *mem; // allocator used for nodes, delimiters and brackets
  cmark_chunk input; // the text being parsed
  int line; // line number stamped on newly created nodes
  bufsize_t pos; // current read offset into `input`
  int block_offset; // added to node columns by make_literal()
  int column_offset; // added to node columns; rewritten by adjust_subj_node_newlines()
  cmark_map *refmap; // reference map handed to subject_from_buf()
  delimiter *last_delim; // most recently pushed delimiter, or NULL
  bracket *last_bracket; // most recently pushed bracket, or NULL
  // Indexed by run length: start offset of the last backtick run of that
  // length seen by scan_to_closing_backticks().
  bufsize_t backticks[MAXBACKTICKS + 1];
  bool scanned_for_backticks; // set once a backtick scan ran to the end of input
} subject;
59 
// Extensions may populate this.  Indexed by byte value: scan_delims() steps
// over bytes whose entry is nonzero when it looks for the characters on
// either side of a delimiter run.
static int8_t SKIP_CHARS[256];
62 
S_is_line_end_char(char c)63 static CMARK_INLINE bool S_is_line_end_char(char c) {
64   return (c == '\n' || c == '\r');
65 }
66 
// Forward declarations for functions that are used before they are defined.

// Wraps the inlines between `opener` and `closer` in an emph/strong node.
static delimiter *S_insert_emph(subject *subj, delimiter *opener,
                                delimiter *closer);

static int parse_inline(cmark_parser *parser, subject *subj, cmark_node *parent, int options);

// Initializes the subject `e` over `buffer`.
static void subject_from_buf(cmark_mem *mem, int line_number, int block_offset, subject *e,
                             cmark_chunk *buffer, cmark_map *refmap);
static bufsize_t subject_find_special_char(subject *subj, int options);
75 
76 // Create an inline with a literal string value.
make_literal(subject * subj,cmark_node_type t,int start_column,int end_column,cmark_chunk s)77 static CMARK_INLINE cmark_node *make_literal(subject *subj, cmark_node_type t,
78                                              int start_column, int end_column,
79                                              cmark_chunk s) {
80   cmark_node *e = (cmark_node *)subj->mem->calloc(1, sizeof(*e));
81   cmark_strbuf_init(subj->mem, &e->content, 0);
82   e->type = (uint16_t)t;
83   e->as.literal = s;
84   e->start_line = e->end_line = subj->line;
85   // columns are 1 based.
86   e->start_column = start_column + 1 + subj->column_offset + subj->block_offset;
87   e->end_column = end_column + 1 + subj->column_offset + subj->block_offset;
88   return e;
89 }
90 
91 // Create an inline with no value.
make_simple(cmark_mem * mem,cmark_node_type t)92 static CMARK_INLINE cmark_node *make_simple(cmark_mem *mem, cmark_node_type t) {
93   cmark_node *e = (cmark_node *)mem->calloc(1, sizeof(*e));
94   cmark_strbuf_init(mem, &e->content, 0);
95   e->type = (uint16_t)t;
96   return e;
97 }
98 
99 // Like make_str, but parses entities.
make_str_with_entities(subject * subj,int start_column,int end_column,cmark_chunk * content)100 static cmark_node *make_str_with_entities(subject *subj,
101                                           int start_column, int end_column,
102                                           cmark_chunk *content) {
103   cmark_strbuf unescaped = CMARK_BUF_INIT(subj->mem);
104 
105   if (houdini_unescape_html(&unescaped, content->data, content->len)) {
106     return make_str(subj, start_column, end_column, cmark_chunk_buf_detach(&unescaped));
107   } else {
108     return make_str(subj, start_column, end_column, *content);
109   }
110 }
111 
112 // Duplicate a chunk by creating a copy of the buffer not by reusing the
113 // buffer like cmark_chunk_dup does.
chunk_clone(cmark_mem * mem,cmark_chunk * src)114 static cmark_chunk chunk_clone(cmark_mem *mem, cmark_chunk *src) {
115   cmark_chunk c;
116   bufsize_t len = src->len;
117 
118   c.len = len;
119   c.data = (unsigned char *)mem->calloc(len + 1, 1);
120   c.alloc = 1;
121   if (len)
122     memcpy(c.data, src->data, len);
123   c.data[len] = '\0';
124 
125   return c;
126 }
127 
// Build the destination for an autolink.  `url` is trimmed in place; an
// empty result yields CMARK_CHUNK_EMPTY.  Otherwise the text is passed
// through houdini_unescape_html_f into a new buffer, prefixed with
// "mailto:" when `is_email` is nonzero, and that buffer is returned detached.
static cmark_chunk cmark_clean_autolink(cmark_mem *mem, cmark_chunk *url,
                                        int is_email) {
  cmark_chunk_trim(url);

  if (url->len == 0) {
    cmark_chunk empty = CMARK_CHUNK_EMPTY;
    return empty;
  }

  cmark_strbuf out = CMARK_BUF_INIT(mem);

  if (is_email) {
    cmark_strbuf_puts(&out, "mailto:");
  }
  houdini_unescape_html_f(&out, url->data, url->len);

  return cmark_chunk_buf_detach(&out);
}
145 
make_autolink(subject * subj,int start_column,int end_column,cmark_chunk url,int is_email)146 static CMARK_INLINE cmark_node *make_autolink(subject *subj,
147                                               int start_column, int end_column,
148                                               cmark_chunk url, int is_email) {
149   cmark_node *link = make_simple(subj->mem, CMARK_NODE_LINK);
150   link->as.link.url = cmark_clean_autolink(subj->mem, &url, is_email);
151   link->as.link.title = cmark_chunk_literal("");
152   link->start_line = link->end_line = subj->line;
153   link->start_column = start_column + 1;
154   link->end_column = end_column + 1;
155   cmark_node_append_child(link, make_str_with_entities(subj, start_column + 1, end_column - 1, &url));
156   return link;
157 }
158 
// Initialize the subject `e` to parse `chunk` from its first byte: empty
// delimiter and bracket stacks, zero column offset, and a cleared backtick
// cache.
static void subject_from_buf(cmark_mem *mem, int line_number, int block_offset, subject *e,
                             cmark_chunk *chunk, cmark_map *refmap) {
  // what is being parsed, and with which allocator / reference map
  e->mem = mem;
  e->input = *chunk;
  e->refmap = refmap;

  // position bookkeeping
  e->pos = 0;
  e->line = line_number;
  e->block_offset = block_offset;
  e->column_offset = 0;

  // delimiter and bracket stacks start out empty
  e->last_delim = NULL;
  e->last_bracket = NULL;

  // nothing is known about backtick runs yet
  memset(e->backticks, 0, sizeof(e->backticks));
  e->scanned_for_backticks = false;
}
176 
isbacktick(int c)177 static CMARK_INLINE int isbacktick(int c) { return (c == '`'); }
178 
peek_char_n(subject * subj,bufsize_t n)179 static CMARK_INLINE unsigned char peek_char_n(subject *subj, bufsize_t n) {
180   // NULL bytes should have been stripped out by now.  If they're
181   // present, it's a programming error:
182   assert(!(subj->pos + n < subj->input.len && subj->input.data[subj->pos + n] == 0));
183   return (subj->pos + n < subj->input.len) ? subj->input.data[subj->pos + n] : 0;
184 }
185 
peek_char(subject * subj)186 static CMARK_INLINE unsigned char peek_char(subject *subj) {
187   return peek_char_n(subj, 0);
188 }
189 
peek_at(subject * subj,bufsize_t pos)190 static CMARK_INLINE unsigned char peek_at(subject *subj, bufsize_t pos) {
191   return subj->input.data[pos];
192 }
193 
194 // Return true if there are more characters in the subject.
is_eof(subject * subj)195 static CMARK_INLINE int is_eof(subject *subj) {
196   return (subj->pos >= subj->input.len);
197 }
198 
// Advance the subject by one byte.  Doesn't check for eof.
// The expansion is fully parenthesized so the macro acts as one expression
// wherever it appears, not only when used as a statement of its own.
#define advance(subj) ((subj)->pos += 1)
201 
skip_spaces(subject * subj)202 static CMARK_INLINE bool skip_spaces(subject *subj) {
203   bool skipped = false;
204   while (peek_char(subj) == ' ' || peek_char(subj) == '\t') {
205     advance(subj);
206     skipped = true;
207   }
208   return skipped;
209 }
210 
skip_line_end(subject * subj)211 static CMARK_INLINE bool skip_line_end(subject *subj) {
212   bool seen_line_end_char = false;
213   if (peek_char(subj) == '\r') {
214     advance(subj);
215     seen_line_end_char = true;
216   }
217   if (peek_char(subj) == '\n') {
218     advance(subj);
219     seen_line_end_char = true;
220   }
221   return seen_line_end_char || is_eof(subj);
222 }
223 
224 // Take characters while a predicate holds, and return a string.
take_while(subject * subj,int (* f)(int))225 static CMARK_INLINE cmark_chunk take_while(subject *subj, int (*f)(int)) {
226   unsigned char c;
227   bufsize_t startpos = subj->pos;
228   bufsize_t len = 0;
229 
230   while ((c = peek_char(subj)) && (*f)(c)) {
231     advance(subj);
232     len++;
233   }
234 
235   return cmark_chunk_dup(&subj->input, startpos, len);
236 }
237 
238 // Return the number of newlines in a given span of text in a subject.  If
239 // the number is greater than zero, also return the number of characters
240 // between the last newline and the end of the span in `since_newline`.
count_newlines(subject * subj,bufsize_t from,bufsize_t len,int * since_newline)241 static int count_newlines(subject *subj, bufsize_t from, bufsize_t len, int *since_newline) {
242   int nls = 0;
243   int since_nl = 0;
244 
245   while (len--) {
246     if (subj->input.data[from++] == '\n') {
247       ++nls;
248       since_nl = 0;
249     } else {
250       ++since_nl;
251     }
252   }
253 
254   if (!nls)
255     return 0;
256 
257   *since_newline = since_nl;
258   return nls;
259 }
260 
261 // Adjust `node`'s `end_line`, `end_column`, and `subj`'s `line` and
262 // `column_offset` according to the number of newlines in a just-matched span
263 // of text in `subj`.
adjust_subj_node_newlines(subject * subj,cmark_node * node,int matchlen,int extra,int options)264 static void adjust_subj_node_newlines(subject *subj, cmark_node *node, int matchlen, int extra, int options) {
265   if (!(options & CMARK_OPT_SOURCEPOS)) {
266     return;
267   }
268 
269   int since_newline;
270   int newlines = count_newlines(subj, subj->pos - matchlen - extra, matchlen, &since_newline);
271   if (newlines) {
272     subj->line += newlines;
273     node->end_line += newlines;
274     node->end_column = since_newline;
275     subj->column_offset = -subj->pos + since_newline + extra;
276   }
277 }
278 
279 // Try to process a backtick code span that began with a
280 // span of ticks of length openticklength length (already
281 // parsed).  Return 0 if you don't find matching closing
282 // backticks, otherwise return the position in the subject
283 // after the closing backticks.
scan_to_closing_backticks(subject * subj,bufsize_t openticklength)284 static bufsize_t scan_to_closing_backticks(subject *subj,
285                                            bufsize_t openticklength) {
286 
287   bool found = false;
288   if (openticklength > MAXBACKTICKS) {
289     // we limit backtick string length because of the array subj->backticks:
290     return 0;
291   }
292   if (subj->scanned_for_backticks &&
293       subj->backticks[openticklength] <= subj->pos) {
294     // return if we already know there's no closer
295     return 0;
296   }
297   while (!found) {
298     // read non backticks
299     unsigned char c;
300     while ((c = peek_char(subj)) && c != '`') {
301       advance(subj);
302     }
303     if (is_eof(subj)) {
304       break;
305     }
306     bufsize_t numticks = 0;
307     while (peek_char(subj) == '`') {
308       advance(subj);
309       numticks++;
310     }
311     // store position of ender
312     if (numticks <= MAXBACKTICKS) {
313       subj->backticks[numticks] = subj->pos - numticks;
314     }
315     if (numticks == openticklength) {
316       return (subj->pos);
317     }
318   }
319   // got through whole input without finding closer
320   subj->scanned_for_backticks = true;
321   return 0;
322 }
323 
324 // Destructively modify string, converting newlines to
325 // spaces, then removing a single leading + trailing space.
S_normalize_code(cmark_strbuf * s)326 static void S_normalize_code(cmark_strbuf *s) {
327   bufsize_t r, w;
328 
329   for (r = 0, w = 0; r < s->size; ++r) {
330     switch (s->ptr[r]) {
331     case '\r':
332       if (s->ptr[r + 1] != '\n') {
333 	s->ptr[w++] = ' ';
334       }
335       break;
336     case '\n':
337       s->ptr[w++] = ' ';
338       break;
339     default:
340       s->ptr[w++] = s->ptr[r];
341     }
342   }
343 
344   // begins and ends with space?
345   if (s->ptr[0] == ' ' && s->ptr[w - 1] == ' ') {
346     cmark_strbuf_drop(s, 1);
347     cmark_strbuf_truncate(s, w - 2);
348   } else {
349     cmark_strbuf_truncate(s, w);
350   }
351 
352 }
353 
354 
355 // Parse backtick code section or raw backticks, return an inline.
356 // Assumes that the subject has a backtick at the current position.
handle_backticks(subject * subj,int options)357 static cmark_node *handle_backticks(subject *subj, int options) {
358   cmark_chunk openticks = take_while(subj, isbacktick);
359   bufsize_t startpos = subj->pos;
360   bufsize_t endpos = scan_to_closing_backticks(subj, openticks.len);
361 
362   if (endpos == 0) {      // not found
363     subj->pos = startpos; // rewind
364     return make_str(subj, subj->pos, subj->pos, openticks);
365   } else {
366     cmark_strbuf buf = CMARK_BUF_INIT(subj->mem);
367 
368     cmark_strbuf_set(&buf, subj->input.data + startpos,
369                      endpos - startpos - openticks.len);
370     S_normalize_code(&buf);
371 
372     cmark_node *node = make_code(subj, startpos, endpos - openticks.len - 1, cmark_chunk_buf_detach(&buf));
373     adjust_subj_node_newlines(subj, node, endpos - startpos, openticks.len, options);
374     return node;
375   }
376 }
377 
378 
379 // Scan ***, **, or * and return number scanned, or 0.
380 // Advances position.
scan_delims(subject * subj,unsigned char c,bool * can_open,bool * can_close)381 static int scan_delims(subject *subj, unsigned char c, bool *can_open,
382                        bool *can_close) {
383   int numdelims = 0;
384   bufsize_t before_char_pos, after_char_pos;
385   int32_t after_char = 0;
386   int32_t before_char = 0;
387   int len;
388   bool left_flanking, right_flanking;
389 
390   if (subj->pos == 0) {
391     before_char = 10;
392   } else {
393     before_char_pos = subj->pos - 1;
394     // walk back to the beginning of the UTF_8 sequence:
395     while ((peek_at(subj, before_char_pos) >> 6 == 2 || SKIP_CHARS[peek_at(subj, before_char_pos)]) && before_char_pos > 0) {
396       before_char_pos -= 1;
397     }
398     len = cmark_utf8proc_iterate(subj->input.data + before_char_pos,
399                                  subj->pos - before_char_pos, &before_char);
400     if (len == -1 || (before_char < 256 && SKIP_CHARS[(unsigned char) before_char])) {
401       before_char = 10;
402     }
403   }
404 
405   if (c == '\'' || c == '"') {
406     numdelims++;
407     advance(subj); // limit to 1 delim for quotes
408   } else {
409     while (peek_char(subj) == c) {
410       numdelims++;
411       advance(subj);
412     }
413   }
414 
415   if (subj->pos == subj->input.len) {
416     after_char = 10;
417   } else {
418     after_char_pos = subj->pos;
419     while (SKIP_CHARS[peek_at(subj, after_char_pos)] && after_char_pos < subj->input.len) {
420       after_char_pos += 1;
421     }
422     len = cmark_utf8proc_iterate(subj->input.data + after_char_pos,
423                                  subj->input.len - after_char_pos, &after_char);
424     if (len == -1 || (after_char < 256 && SKIP_CHARS[(unsigned char) after_char])) {
425     after_char = 10;
426   }
427   }
428 
429   left_flanking = numdelims > 0 && !cmark_utf8proc_is_space(after_char) &&
430                   (!cmark_utf8proc_is_punctuation(after_char) ||
431                    cmark_utf8proc_is_space(before_char) ||
432                    cmark_utf8proc_is_punctuation(before_char));
433   right_flanking = numdelims > 0 && !cmark_utf8proc_is_space(before_char) &&
434                    (!cmark_utf8proc_is_punctuation(before_char) ||
435                     cmark_utf8proc_is_space(after_char) ||
436                     cmark_utf8proc_is_punctuation(after_char));
437   if (c == '_') {
438     *can_open = left_flanking &&
439                 (!right_flanking || cmark_utf8proc_is_punctuation(before_char));
440     *can_close = right_flanking &&
441                  (!left_flanking || cmark_utf8proc_is_punctuation(after_char));
442   } else if (c == '\'' || c == '"') {
443     *can_open = left_flanking && !right_flanking &&
444 	         before_char != ']' && before_char != ')';
445     *can_close = right_flanking;
446   } else {
447     *can_open = left_flanking;
448     *can_close = right_flanking;
449   }
450   return numdelims;
451 }
452 
453 /*
454 static void print_delimiters(subject *subj)
455 {
456         delimiter *delim;
457         delim = subj->last_delim;
458         while (delim != NULL) {
459                 printf("Item at stack pos %p: %d %d %d next(%p) prev(%p)\n",
460                        (void*)delim, delim->delim_char,
461                        delim->can_open, delim->can_close,
462                        (void*)delim->next, (void*)delim->previous);
463                 delim = delim->previous;
464         }
465 }
466 */
467 
// Unlink `delim` from the subject's doubly linked delimiter list and free
// it.  A NULL `delim` is ignored.
static void remove_delimiter(subject *subj, delimiter *delim) {
  delimiter *before;
  delimiter *after;

  if (delim == NULL) {
    return;
  }

  before = delim->previous;
  after = delim->next;

  if (after != NULL) {
    after->previous = before;
  } else {
    // end of list: the subject's tail pointer has to move back
    assert(delim == subj->last_delim);
    subj->last_delim = before;
  }

  if (before != NULL) {
    before->next = after;
  }

  subj->mem->free(delim);
}
483 
// Remove and free the top entry of the bracket stack, if there is one.
static void pop_bracket(subject *subj) {
  bracket *top = subj->last_bracket;

  if (top == NULL) {
    return;
  }

  subj->last_bracket = top->previous;
  subj->mem->free(top);
}
492 
// Append a new delimiter record for character `c` to the end of the
// subject's delimiter list.  Its length is taken from the literal length of
// `inl_text`, the text node that holds the delimiter run.
static void push_delimiter(subject *subj, unsigned char c, bool can_open,
                           bool can_close, cmark_node *inl_text) {
  delimiter *tail = subj->last_delim;
  delimiter *entry = (delimiter *)subj->mem->calloc(1, sizeof(delimiter));

  entry->delim_char = c;
  entry->can_open = can_open;
  entry->can_close = can_close;
  entry->inl_text = inl_text;
  entry->length = inl_text->as.literal.len;

  // link in after the current tail
  entry->previous = tail;
  entry->next = NULL;
  if (tail != NULL) {
    tail->next = entry;
  }
  subj->last_delim = entry;
}
508 
// Push a new active entry onto the bracket stack, recording the current
// position and the current top of the delimiter list.  The entry that was
// on top before (if any) is flagged as having a bracket after it.
static void push_bracket(subject *subj, bool image, cmark_node *inl_text) {
  bracket *entry = (bracket *)subj->mem->calloc(1, sizeof(bracket));
  bracket *below = subj->last_bracket;

  if (below != NULL) {
    below->bracket_after = true;
  }

  entry->previous = below;
  entry->previous_delimiter = subj->last_delim;
  entry->inl_text = inl_text;
  entry->position = subj->pos;
  entry->image = image;
  entry->active = true;
  entry->bracket_after = false;

  subj->last_bracket = entry;
}
523 
524 // Assumes the subject has a c at the current position.
handle_delim(subject * subj,unsigned char c,bool smart)525 static cmark_node *handle_delim(subject *subj, unsigned char c, bool smart) {
526   bufsize_t numdelims;
527   cmark_node *inl_text;
528   bool can_open, can_close;
529   cmark_chunk contents;
530 
531   numdelims = scan_delims(subj, c, &can_open, &can_close);
532 
533   if (c == '\'' && smart) {
534     contents = cmark_chunk_literal(RIGHTSINGLEQUOTE);
535   } else if (c == '"' && smart) {
536     contents =
537         cmark_chunk_literal(can_close ? RIGHTDOUBLEQUOTE : LEFTDOUBLEQUOTE);
538   } else {
539     contents = cmark_chunk_dup(&subj->input, subj->pos - numdelims, numdelims);
540   }
541 
542   inl_text = make_str(subj, subj->pos - numdelims, subj->pos - 1, contents);
543 
544   if ((can_open || can_close) && (!(c == '\'' || c == '"') || smart)) {
545     push_delimiter(subj, c, can_open, can_close, inl_text);
546   }
547 
548   return inl_text;
549 }
550 
551 // Assumes we have a hyphen at the current position.
handle_hyphen(subject * subj,bool smart)552 static cmark_node *handle_hyphen(subject *subj, bool smart) {
553   int startpos = subj->pos;
554 
555   advance(subj);
556 
557   if (!smart || peek_char(subj) != '-') {
558     return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("-"));
559   }
560 
561   while (smart && peek_char(subj) == '-') {
562     advance(subj);
563   }
564 
565   int numhyphens = subj->pos - startpos;
566   int en_count = 0;
567   int em_count = 0;
568   int i;
569   cmark_strbuf buf = CMARK_BUF_INIT(subj->mem);
570 
571   if (numhyphens % 3 == 0) { // if divisible by 3, use all em dashes
572     em_count = numhyphens / 3;
573   } else if (numhyphens % 2 == 0) { // if divisible by 2, use all en dashes
574     en_count = numhyphens / 2;
575   } else if (numhyphens % 3 == 2) { // use one en dash at end
576     en_count = 1;
577     em_count = (numhyphens - 2) / 3;
578   } else { // use two en dashes at the end
579     en_count = 2;
580     em_count = (numhyphens - 4) / 3;
581   }
582 
583   for (i = em_count; i > 0; i--) {
584     cmark_strbuf_puts(&buf, EMDASH);
585   }
586 
587   for (i = en_count; i > 0; i--) {
588     cmark_strbuf_puts(&buf, ENDASH);
589   }
590 
591   return make_str(subj, startpos, subj->pos - 1, cmark_chunk_buf_detach(&buf));
592 }
593 
594 // Assumes we have a period at the current position.
handle_period(subject * subj,bool smart)595 static cmark_node *handle_period(subject *subj, bool smart) {
596   advance(subj);
597   if (smart && peek_char(subj) == '.') {
598     advance(subj);
599     if (peek_char(subj) == '.') {
600       advance(subj);
601       return make_str(subj, subj->pos - 3, subj->pos - 1, cmark_chunk_literal(ELLIPSES));
602     } else {
603       return make_str(subj, subj->pos - 2, subj->pos - 1, cmark_chunk_literal(".."));
604     }
605   } else {
606     return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("."));
607   }
608 }
609 
get_extension_for_special_char(cmark_parser * parser,unsigned char c)610 static cmark_syntax_extension *get_extension_for_special_char(cmark_parser *parser, unsigned char c) {
611   cmark_llist *tmp_ext;
612 
613   for (tmp_ext = parser->inline_syntax_extensions; tmp_ext; tmp_ext=tmp_ext->next) {
614     cmark_syntax_extension *ext = (cmark_syntax_extension *) tmp_ext->data;
615     cmark_llist *tmp_char;
616     for (tmp_char = ext->special_inline_chars; tmp_char; tmp_char=tmp_char->next) {
617       unsigned char tmp_c = (unsigned char)(size_t)tmp_char->data;
618 
619       if (tmp_c == c) {
620         return ext;
621       }
622     }
623   }
624 
625   return NULL;
626 }
627 
// Resolve the delimiters above `stack_bottom` on the subject's delimiter
// list.  Walking forward, each delimiter that can close is paired with the
// nearest earlier opener of the same character:
//   - characters owned by an extension are handed to that extension's
//     insert_inline_from_delim callback;
//   - '*' and '_' pairs are wrapped by S_insert_emph();
//   - quote characters have their text replaced by typographic quotes.
// When no opener is found, the closer's `previous` is recorded as a lower
// bound for later searches with the same character and length % 3.
// On return every delimiter above `stack_bottom` has been freed.
static void process_emphasis(cmark_parser *parser, subject *subj, delimiter *stack_bottom) {
  delimiter *closer = subj->last_delim;
  delimiter *opener;
  delimiter *old_closer;
  bool opener_found;
  bool odd_match;
  // One column per possible byte value, so that a delimiter character at or
  // above 128 (for instance one registered by an extension) still indexes
  // inside the table.
  delimiter *openers_bottom[3][256];
  int i;

  // initialize openers_bottom:
  memset(&openers_bottom, 0, sizeof(openers_bottom));
  for (i=0; i < 3; i++) {
    openers_bottom[i]['*'] = stack_bottom;
    openers_bottom[i]['_'] = stack_bottom;
    openers_bottom[i]['\''] = stack_bottom;
    openers_bottom[i]['"'] = stack_bottom;
  }

  // move back to first relevant delim.
  while (closer != NULL && closer->previous != stack_bottom) {
    closer = closer->previous;
  }

  // now move forward, looking for closers, and handling each
  while (closer != NULL) {
    cmark_syntax_extension *extension = get_extension_for_special_char(parser, closer->delim_char);
    if (closer->can_close) {
      // Now look backwards for first matching opener:
      opener = closer->previous;
      opener_found = false;
      odd_match = false;
      while (opener != NULL && opener != stack_bottom &&
             opener != openers_bottom[closer->length % 3][closer->delim_char]) {
        if (opener->can_open && opener->delim_char == closer->delim_char) {
          // interior closer of size 2 can't match opener of size 1
          // or of size 1 can't match 2
          odd_match = (closer->can_open || opener->can_close) &&
                      ((opener->length + closer->length) % 3 == 0);
          if (!odd_match) {
            opener_found = true;
            break;
          }
        }
        opener = opener->previous;
      }
      // keep the closer: the branches below move `closer` on
      old_closer = closer;

      if (extension) {
        if (opener_found)
          closer = extension->insert_inline_from_delim(extension, parser, subj, opener, closer);
        else
          closer = closer->next;
      } else if (closer->delim_char == '*' || closer->delim_char == '_') {
        if (opener_found) {
          closer = S_insert_emph(subj, opener, closer);
        } else {
          closer = closer->next;
        }
      } else if (closer->delim_char == '\'') {
        cmark_chunk_free(subj->mem, &closer->inl_text->as.literal);
        closer->inl_text->as.literal = cmark_chunk_literal(RIGHTSINGLEQUOTE);
        if (opener_found) {
          cmark_chunk_free(subj->mem, &opener->inl_text->as.literal);
          opener->inl_text->as.literal = cmark_chunk_literal(LEFTSINGLEQUOTE);
        }
        closer = closer->next;
      } else if (closer->delim_char == '"') {
        cmark_chunk_free(subj->mem, &closer->inl_text->as.literal);
        closer->inl_text->as.literal = cmark_chunk_literal(RIGHTDOUBLEQUOTE);
        if (opener_found) {
          cmark_chunk_free(subj->mem, &opener->inl_text->as.literal);
          opener->inl_text->as.literal = cmark_chunk_literal(LEFTDOUBLEQUOTE);
        }
        closer = closer->next;
      }
      if (!opener_found) {
        // set lower bound for future searches for openers
        openers_bottom[old_closer->length % 3][old_closer->delim_char] =
            old_closer->previous;
        if (!old_closer->can_open) {
          // we can remove a closer that can't be an
          // opener, once we've seen there's no
          // matching opener:
          remove_delimiter(subj, old_closer);
        }
      }
    } else {
      closer = closer->next;
    }
  }
  // free all delimiters in list until stack_bottom:
  while (subj->last_delim != NULL && subj->last_delim != stack_bottom) {
    remove_delimiter(subj, subj->last_delim);
  }
}
723 
// Pair `opener` with `closer`: use up two characters from each when both
// text nodes have at least two, otherwise one; wrap the inlines between
// them in a new strong (two) or emph (one) node; and free every delimiter
// in between.  An opener or closer left with no characters is removed
// together with its text node.  Returns the closer to continue from: the
// same closer if it still has characters, otherwise the delimiter that
// followed it.
static delimiter *S_insert_emph(subject *subj, delimiter *opener,
                                delimiter *closer) {
  cmark_node *open_text = opener->inl_text;
  cmark_node *close_text = closer->inl_text;
  bufsize_t open_left = open_text->as.literal.len;
  bufsize_t close_left = close_text->as.literal.len;
  bufsize_t used;
  delimiter *cur;
  cmark_node *wrapper;
  cmark_node *child;

  // how many delimiter characters this match uses on each side
  used = (open_left >= 2 && close_left >= 2) ? 2 : 1;

  // shorten both text nodes by the characters just used
  open_left -= used;
  close_left -= used;
  open_text->as.literal.len = open_left;
  close_text->as.literal.len = close_left;

  // drop the delimiters strictly between opener and closer
  cur = closer->previous;
  while (cur != NULL && cur != opener) {
    delimiter *before = cur->previous;
    remove_delimiter(subj, cur);
    cur = before;
  }

  // build the wrapper and move the inlines between the two text nodes
  // into it, then splice it in right after the opener's text node
  if (used == 1) {
    wrapper = make_emph(subj->mem);
  } else {
    wrapper = make_strong(subj->mem);
  }

  child = open_text->next;
  while (child && child != close_text) {
    cmark_node *following = child->next;
    cmark_node_append_child(wrapper, child);
    child = following;
  }
  cmark_node_insert_after(open_text, wrapper);

  wrapper->start_line = wrapper->end_line = subj->line;
  wrapper->start_column = open_text->start_column + subj->column_offset;
  wrapper->end_column = close_text->end_column + subj->column_offset;

  // an opener with no characters left goes away, text node included
  if (open_left == 0) {
    cmark_node_free(open_text);
    remove_delimiter(subj, opener);
  }

  // likewise for the closer; continue from the delimiter after it
  if (close_left == 0) {
    delimiter *after;

    cmark_node_free(close_text);
    after = closer->next;
    remove_delimiter(subj, closer);
    closer = after;
  }

  return closer;
}
785 
786 // Parse backslash-escape or just a backslash, returning an inline.
handle_backslash(cmark_parser * parser,subject * subj)787 static cmark_node *handle_backslash(cmark_parser *parser, subject *subj) {
788   advance(subj);
789   unsigned char nextchar = peek_char(subj);
790   if ((parser->backslash_ispunct ? parser->backslash_ispunct : cmark_ispunct)(nextchar)) {
791     // only ascii symbols and newline can be escaped
792     advance(subj);
793     return make_str(subj, subj->pos - 2, subj->pos - 1, cmark_chunk_dup(&subj->input, subj->pos - 1, 1));
794   } else if (!is_eof(subj) && skip_line_end(subj)) {
795     return make_linebreak(subj->mem);
796   } else {
797     return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("\\"));
798   }
799 }
800 
801 // Parse an entity or a regular "&" string.
802 // Assumes the subject has an '&' character at the current position.
handle_entity(subject * subj)803 static cmark_node *handle_entity(subject *subj) {
804   cmark_strbuf ent = CMARK_BUF_INIT(subj->mem);
805   bufsize_t len;
806 
807   advance(subj);
808 
809   len = houdini_unescape_ent(&ent, subj->input.data + subj->pos,
810                              subj->input.len - subj->pos);
811 
812   if (len == 0)
813     return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("&"));
814 
815   subj->pos += len;
816   return make_str(subj, subj->pos - 1 - len, subj->pos - 1, cmark_chunk_buf_detach(&ent));
817 }
818 
819 // Clean a URL: remove surrounding whitespace, and remove \ that escape
820 // punctuation.
cmark_clean_url(cmark_mem * mem,cmark_chunk * url)821 cmark_chunk cmark_clean_url(cmark_mem *mem, cmark_chunk *url) {
822   cmark_strbuf buf = CMARK_BUF_INIT(mem);
823 
824   cmark_chunk_trim(url);
825 
826   if (url->len == 0) {
827     cmark_chunk result = CMARK_CHUNK_EMPTY;
828     return result;
829   }
830 
831   houdini_unescape_html_f(&buf, url->data, url->len);
832 
833   cmark_strbuf_unescape(&buf);
834   return cmark_chunk_buf_detach(&buf);
835 }
836 
// Clean a link title: strip one pair of surrounding delimiters ('...',
// "..." or (...)) when present, then return a new chunk holding the text
// after houdini_unescape_html_f and cmark_strbuf_unescape.  An empty title
// yields CMARK_CHUNK_EMPTY.
cmark_chunk cmark_clean_title(cmark_mem *mem, cmark_chunk *title) {
  cmark_strbuf buf = CMARK_BUF_INIT(mem);
  unsigned char first, last;
  bool delimited;

  if (title->len == 0) {
    cmark_chunk result = CMARK_CHUNK_EMPTY;
    return result;
  }

  first = title->data[0];
  last = title->data[title->len - 1];

  // A delimiter pair needs two bytes.  Without the length test a one-byte
  // title such as "'" would count as both opener and closer and pass a
  // length of -1 to the unescape call.
  delimited = title->len >= 2 &&
              ((first == '\'' && last == '\'') || (first == '(' && last == ')') ||
               (first == '"' && last == '"'));

  // remove surrounding quotes if any:
  if (delimited) {
    houdini_unescape_html_f(&buf, title->data + 1, title->len - 2);
  } else {
    houdini_unescape_html_f(&buf, title->data, title->len);
  }

  cmark_strbuf_unescape(&buf);
  return cmark_chunk_buf_detach(&buf);
}
860 
861 // Parse an autolink or HTML tag.
862 // Assumes the subject has a '<' character at the current position.
handle_pointy_brace(subject * subj,int options)863 static cmark_node *handle_pointy_brace(subject *subj, int options) {
864   bufsize_t matchlen = 0;
865   cmark_chunk contents;
866 
867   advance(subj); // advance past first <
868 
869   // first try to match a URL autolink
870   matchlen = scan_autolink_uri(&subj->input, subj->pos);
871   if (matchlen > 0) {
872     contents = cmark_chunk_dup(&subj->input, subj->pos, matchlen - 1);
873     subj->pos += matchlen;
874 
875     return make_autolink(subj, subj->pos - 1 - matchlen, subj->pos - 1, contents, 0);
876   }
877 
878   // next try to match an email autolink
879   matchlen = scan_autolink_email(&subj->input, subj->pos);
880   if (matchlen > 0) {
881     contents = cmark_chunk_dup(&subj->input, subj->pos, matchlen - 1);
882     subj->pos += matchlen;
883 
884     return make_autolink(subj, subj->pos - 1 - matchlen, subj->pos - 1, contents, 1);
885   }
886 
887   // finally, try to match an html tag
888   matchlen = scan_html_tag(&subj->input, subj->pos);
889   if (matchlen > 0) {
890     contents = cmark_chunk_dup(&subj->input, subj->pos - 1, matchlen + 1);
891     subj->pos += matchlen;
892     cmark_node *node = make_raw_html(subj, subj->pos - matchlen - 1, subj->pos - 1, contents);
893     adjust_subj_node_newlines(subj, node, matchlen, 1, options);
894     return node;
895   }
896 
897   if (options & CMARK_OPT_LIBERAL_HTML_TAG) {
898     matchlen = scan_liberal_html_tag(&subj->input, subj->pos);
899     if (matchlen > 0) {
900       contents = cmark_chunk_dup(&subj->input, subj->pos - 1, matchlen + 1);
901       subj->pos += matchlen;
902       cmark_node *node = make_raw_html(subj, subj->pos - matchlen - 1, subj->pos - 1, contents);
903       adjust_subj_node_newlines(subj, node, matchlen, 1, options);
904       return node;
905     }
906   }
907 
908   // if nothing matches, just return the opening <:
909   return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("<"));
910 }
911 
912 // Parse a link label.  Returns 1 if successful.
913 // Note:  unescaped brackets are not allowed in labels.
914 // The label begins with `[` and ends with the first `]` character
915 // encountered.  Backticks in labels do not start code spans.
link_label(subject * subj,cmark_chunk * raw_label)916 static int link_label(subject *subj, cmark_chunk *raw_label) {
917   bufsize_t startpos = subj->pos;
918   int length = 0;
919   unsigned char c;
920 
921   // advance past [
922   if (peek_char(subj) == '[') {
923     advance(subj);
924   } else {
925     return 0;
926   }
927 
928   while ((c = peek_char(subj)) && c != '[' && c != ']') {
929     if (c == '\\') {
930       advance(subj);
931       length++;
932       if (cmark_ispunct(peek_char(subj))) {
933         advance(subj);
934         length++;
935       }
936     } else {
937       advance(subj);
938       length++;
939     }
940     if (length > MAX_LINK_LABEL_LENGTH) {
941       goto noMatch;
942     }
943   }
944 
945   if (c == ']') { // match found
946     *raw_label =
947         cmark_chunk_dup(&subj->input, startpos + 1, subj->pos - (startpos + 1));
948     cmark_chunk_trim(raw_label);
949     advance(subj); // advance past ]
950     return 1;
951   }
952 
953 noMatch:
954   subj->pos = startpos; // rewind
955   return 0;
956 }
957 
// Scan a bare (not <...>-delimited) link destination starting at `offset`.
//
// The destination runs until the first whitespace character or the first
// ')' that is not matched by an earlier '('.  A backslash followed by ASCII
// punctuation is skipped as a pair, so escaped parentheses do not count.
//
// Returns the number of bytes scanned and stores a non-owning view of them
// in *output.  Returns -1 (leaving *output untouched) when:
//   - parentheses nest deeper than MAX_PAREN_DEPTH,
//   - the input ends before a terminator is seen, or
//   - the destination stops while a '(' is still unclosed (the CommonMark
//     spec only allows unescaped parentheses in balanced pairs).
static bufsize_t manual_scan_link_url_2(cmark_chunk *input, bufsize_t offset,
                                        cmark_chunk *output) {
  enum { MAX_PAREN_DEPTH = 32 };
  bufsize_t i = offset;
  size_t nb_p = 0; // currently open, unescaped '('

  while (i < input->len) {
    if (input->data[i] == '\\' &&
        i + 1 < input->len &&
        cmark_ispunct(input->data[i + 1])) {
      i += 2;
    } else if (input->data[i] == '(') {
      ++nb_p;
      ++i;
      if (nb_p > MAX_PAREN_DEPTH)
        return -1;
    } else if (input->data[i] == ')') {
      if (nb_p == 0)
        break; // this ')' closes the enclosing inline link
      --nb_p;
      ++i;
    } else if (cmark_isspace(input->data[i])) {
      break;
    } else {
      ++i;
    }
  }

  // Reject truncated input and destinations with unbalanced parentheses.
  if (i >= input->len || nb_p != 0)
    return -1;

  {
    cmark_chunk result = {input->data + offset, i - offset, 0};
    *output = result;
  }
  return i - offset;
}
993 
// Scan a link destination starting at `offset`.
//
// A destination opening with '<' is scanned up to its closing '>'; a
// backslash skips the following byte.  If a newline or a second '<' turns
// up first, or the destination does not open with '<' at all, the scan is
// delegated to manual_scan_link_url_2() from the original offset.
//
// Returns the number of bytes consumed (including the angle brackets) and
// stores a non-owning view of the text between them in *output.  Returns -1
// when the scan reaches the end of input, which includes a closing '>' that
// is the very last byte.
static bufsize_t manual_scan_link_url(cmark_chunk *input, bufsize_t offset,
                                      cmark_chunk *output) {
  bufsize_t cur;

  if (offset >= input->len || input->data[offset] != '<')
    return manual_scan_link_url_2(input, offset, output);

  cur = offset + 1;
  while (cur < input->len) {
    const unsigned char ch = input->data[cur];

    if (ch == '>') {
      cur++;
      break;
    }
    if (ch == '\n' || ch == '<')
      return manual_scan_link_url_2(input, offset, output);

    cur += (ch == '\\') ? 2 : 1;
  }

  if (cur >= input->len)
    return -1;

  {
    // Exclude the surrounding '<' and '>' from the reported destination.
    cmark_chunk inner = {input->data + offset + 1, cur - 2 - offset, 0};
    *output = inner;
  }
  return cur - offset;
}
1024 
1025 // Return a link, an image, or a literal close bracket.
handle_close_bracket(cmark_parser * parser,subject * subj)1026 static cmark_node *handle_close_bracket(cmark_parser *parser, subject *subj) {
1027   bufsize_t initial_pos, after_link_text_pos;
1028   bufsize_t endurl, starttitle, endtitle, endall;
1029   bufsize_t sps, n;
1030   cmark_reference *ref = NULL;
1031   cmark_chunk url_chunk, title_chunk;
1032   cmark_chunk url, title;
1033   bracket *opener;
1034   cmark_node *inl;
1035   cmark_chunk raw_label;
1036   int found_label;
1037   cmark_node *tmp, *tmpnext;
1038   bool is_image;
1039 
1040   advance(subj); // advance past ]
1041   initial_pos = subj->pos;
1042 
1043   // get last [ or ![
1044   opener = subj->last_bracket;
1045 
1046   if (opener == NULL) {
1047     return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("]"));
1048   }
1049 
1050   if (!opener->active) {
1051     // take delimiter off stack
1052     pop_bracket(subj);
1053     return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("]"));
1054   }
1055 
1056   // If we got here, we matched a potential link/image text.
1057   // Now we check to see if it's a link/image.
1058   is_image = opener->image;
1059 
1060   after_link_text_pos = subj->pos;
1061 
1062   // First, look for an inline link.
1063   if (peek_char(subj) == '(' &&
1064       ((sps = scan_spacechars(&subj->input, subj->pos + 1)) > -1) &&
1065       ((n = manual_scan_link_url(&subj->input, subj->pos + 1 + sps,
1066                                  &url_chunk)) > -1)) {
1067 
1068     // try to parse an explicit link:
1069     endurl = subj->pos + 1 + sps + n;
1070     starttitle = endurl + scan_spacechars(&subj->input, endurl);
1071 
1072     // ensure there are spaces btw url and title
1073     endtitle = (starttitle == endurl)
1074                    ? starttitle
1075                    : starttitle + scan_link_title(&subj->input, starttitle);
1076 
1077     endall = endtitle + scan_spacechars(&subj->input, endtitle);
1078 
1079     if (peek_at(subj, endall) == ')') {
1080       subj->pos = endall + 1;
1081 
1082       title_chunk =
1083           cmark_chunk_dup(&subj->input, starttitle, endtitle - starttitle);
1084       url = cmark_clean_url(subj->mem, &url_chunk);
1085       title = cmark_clean_title(subj->mem, &title_chunk);
1086       cmark_chunk_free(subj->mem, &url_chunk);
1087       cmark_chunk_free(subj->mem, &title_chunk);
1088       goto match;
1089 
1090     } else {
1091       // it could still be a shortcut reference link
1092       subj->pos = after_link_text_pos;
1093     }
1094   }
1095 
1096   // Next, look for a following [link label] that matches in refmap.
1097   // skip spaces
1098   raw_label = cmark_chunk_literal("");
1099   found_label = link_label(subj, &raw_label);
1100   if (!found_label) {
1101     // If we have a shortcut reference link, back up
1102     // to before the spacse we skipped.
1103     subj->pos = initial_pos;
1104   }
1105 
1106   if ((!found_label || raw_label.len == 0) && !opener->bracket_after) {
1107     cmark_chunk_free(subj->mem, &raw_label);
1108     raw_label = cmark_chunk_dup(&subj->input, opener->position,
1109                                 initial_pos - opener->position - 1);
1110     found_label = true;
1111   }
1112 
1113   if (found_label) {
1114     ref = (cmark_reference *)cmark_map_lookup(subj->refmap, &raw_label);
1115     cmark_chunk_free(subj->mem, &raw_label);
1116   }
1117 
1118   if (ref != NULL) { // found
1119     url = chunk_clone(subj->mem, &ref->url);
1120     title = chunk_clone(subj->mem, &ref->title);
1121     goto match;
1122   } else {
1123     goto noMatch;
1124   }
1125 
1126 noMatch:
1127   // If we fall through to here, it means we didn't match a link.
1128   // What if we're a footnote link?
1129   if (parser->options & CMARK_OPT_FOOTNOTES &&
1130       opener->inl_text->next &&
1131       opener->inl_text->next->type == CMARK_NODE_TEXT &&
1132       !opener->inl_text->next->next) {
1133     cmark_chunk *literal = &opener->inl_text->next->as.literal;
1134     if (literal->len > 1 && literal->data[0] == '^') {
1135       inl = make_simple(subj->mem, CMARK_NODE_FOOTNOTE_REFERENCE);
1136       inl->as.literal = cmark_chunk_dup(literal, 1, literal->len - 1);
1137       inl->start_line = inl->end_line = subj->line;
1138       inl->start_column = opener->inl_text->start_column;
1139       inl->end_column = subj->pos + subj->column_offset + subj->block_offset;
1140       cmark_node_insert_before(opener->inl_text, inl);
1141       cmark_node_free(opener->inl_text->next);
1142       cmark_node_free(opener->inl_text);
1143       process_emphasis(parser, subj, opener->previous_delimiter);
1144       pop_bracket(subj);
1145       return NULL;
1146     }
1147   }
1148 
1149   pop_bracket(subj); // remove this opener from delimiter list
1150   subj->pos = initial_pos;
1151   return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("]"));
1152 
1153 match:
1154   inl = make_simple(subj->mem, is_image ? CMARK_NODE_IMAGE : CMARK_NODE_LINK);
1155   inl->as.link.url = url;
1156   inl->as.link.title = title;
1157   inl->start_line = inl->end_line = subj->line;
1158   inl->start_column = opener->inl_text->start_column;
1159   inl->end_column = subj->pos + subj->column_offset + subj->block_offset;
1160   cmark_node_insert_before(opener->inl_text, inl);
1161   // Add link text:
1162   tmp = opener->inl_text->next;
1163   while (tmp) {
1164     tmpnext = tmp->next;
1165     cmark_node_append_child(inl, tmp);
1166     tmp = tmpnext;
1167   }
1168 
1169   // Free the bracket [:
1170   cmark_node_free(opener->inl_text);
1171 
1172   process_emphasis(parser, subj, opener->previous_delimiter);
1173   pop_bracket(subj);
1174 
1175   // Now, if we have a link, we also want to deactivate earlier link
1176   // delimiters. (This code can be removed if we decide to allow links
1177   // inside links.)
1178   if (!is_image) {
1179     opener = subj->last_bracket;
1180     while (opener != NULL) {
1181       if (!opener->image) {
1182         if (!opener->active) {
1183           break;
1184         } else {
1185           opener->active = false;
1186         }
1187       }
1188       opener = opener->previous;
1189     }
1190   }
1191 
1192   return NULL;
1193 }
1194 
1195 // Parse a hard or soft linebreak, returning an inline.
1196 // Assumes the subject has a cr or newline at the current position.
handle_newline(subject * subj)1197 static cmark_node *handle_newline(subject *subj) {
1198   bufsize_t nlpos = subj->pos;
1199   // skip over cr, crlf, or lf:
1200   if (peek_at(subj, subj->pos) == '\r') {
1201     advance(subj);
1202   }
1203   if (peek_at(subj, subj->pos) == '\n') {
1204     advance(subj);
1205   }
1206   ++subj->line;
1207   subj->column_offset = -subj->pos;
1208   // skip spaces at beginning of line
1209   skip_spaces(subj);
1210   if (nlpos > 1 && peek_at(subj, nlpos - 1) == ' ' &&
1211       peek_at(subj, nlpos - 2) == ' ') {
1212     return make_linebreak(subj->mem);
1213   } else {
1214     return make_softbreak(subj->mem);
1215   }
1216 }
1217 
// Byte-indexed lookup table: non-zero for the bytes that can start an
// inline construct, i.e. "\r\n\\`&_*[]<!".  Every other entry is zero.
// The table is mutable; entries are switched on and off at run time by
// cmark_inlines_add_special_character() and
// cmark_inlines_remove_special_character().
static int8_t SPECIAL_CHARS[256] = {
    ['\n'] = 1, ['\r'] = 1, ['!'] = 1, ['&'] = 1,  ['*'] = 1, ['<'] = 1,
    ['['] = 1,  ['\\'] = 1, [']'] = 1, ['_'] = 1,  ['`'] = 1,
};
1231 
// Byte-indexed lookup table: non-zero for the four bytes that matter to
// smart punctuation, i.e. " ' . -  Every other entry is zero.
// Consulted by subject_find_special_char() only when CMARK_OPT_SMART is set.
static char SMART_PUNCT_CHARS[256] = {
    ['"'] = 1, ['\''] = 1, ['-'] = 1, ['.'] = 1,
};
1246 
subject_find_special_char(subject * subj,int options)1247 static bufsize_t subject_find_special_char(subject *subj, int options) {
1248   bufsize_t n = subj->pos + 1;
1249 
1250   while (n < subj->input.len) {
1251     if (SPECIAL_CHARS[subj->input.data[n]])
1252       return n;
1253     if (options & CMARK_OPT_SMART && SMART_PUNCT_CHARS[subj->input.data[n]])
1254       return n;
1255     n++;
1256   }
1257 
1258   return subj->input.len;
1259 }
1260 
// Flag byte `c` in SPECIAL_CHARS so that plain-text runs stop at it.
// When `emphasis` is true the byte is flagged in SKIP_CHARS as well.
void cmark_inlines_add_special_character(unsigned char c, bool emphasis) {
  if (emphasis) {
    SKIP_CHARS[c] = 1;
  }
  SPECIAL_CHARS[c] = 1;
}
1266 
// Clear the SPECIAL_CHARS flag for byte `c`.
// When `emphasis` is true the SKIP_CHARS flag for `c` is cleared as well.
// Note that this clears the entry unconditionally, including for bytes
// that are flagged in the table's initial contents.
void cmark_inlines_remove_special_character(unsigned char c, bool emphasis) {
  if (emphasis) {
    SKIP_CHARS[c] = 0;
  }
  SPECIAL_CHARS[c] = 0;
}
1272 
// Offer the current character to each registered inline syntax extension,
// in list order.  Returns the node produced by the first extension whose
// match_inline callback returns non-NULL, or NULL if none of them does.
static cmark_node *try_extensions(cmark_parser *parser,
                                  cmark_node *parent,
                                  unsigned char c,
                                  subject *subj) {
  cmark_llist *it = parser->inline_syntax_extensions;

  while (it != NULL) {
    cmark_syntax_extension *ext = (cmark_syntax_extension *) it->data;
    cmark_node *made = ext->match_inline(ext, parser, parent, c, subj);

    if (made != NULL)
      return made;

    it = it->next;
  }

  return NULL;
}
1290 
1291 // Parse an inline, advancing subject, and add it as a child of parent.
1292 // Return 0 if no inline can be parsed, 1 otherwise.
parse_inline(cmark_parser * parser,subject * subj,cmark_node * parent,int options)1293 static int parse_inline(cmark_parser *parser, subject *subj, cmark_node *parent, int options) {
1294   cmark_node *new_inl = NULL;
1295   cmark_chunk contents;
1296   unsigned char c;
1297   bufsize_t startpos, endpos;
1298   c = peek_char(subj);
1299   if (c == 0) {
1300     return 0;
1301   }
1302   switch (c) {
1303   case '\r':
1304   case '\n':
1305     new_inl = handle_newline(subj);
1306     break;
1307   case '`':
1308     new_inl = handle_backticks(subj, options);
1309     break;
1310   case '\\':
1311     new_inl = handle_backslash(parser, subj);
1312     break;
1313   case '&':
1314     new_inl = handle_entity(subj);
1315     break;
1316   case '<':
1317     new_inl = handle_pointy_brace(subj, options);
1318     break;
1319   case '*':
1320   case '_':
1321   case '\'':
1322   case '"':
1323     new_inl = handle_delim(subj, c, (options & CMARK_OPT_SMART) != 0);
1324     break;
1325   case '-':
1326     new_inl = handle_hyphen(subj, (options & CMARK_OPT_SMART) != 0);
1327     break;
1328   case '.':
1329     new_inl = handle_period(subj, (options & CMARK_OPT_SMART) != 0);
1330     break;
1331   case '[':
1332     advance(subj);
1333     new_inl = make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("["));
1334     push_bracket(subj, false, new_inl);
1335     break;
1336   case ']':
1337     new_inl = handle_close_bracket(parser, subj);
1338     break;
1339   case '!':
1340     advance(subj);
1341     if (peek_char(subj) == '[' && peek_char_n(subj, 1) != '^') {
1342       advance(subj);
1343       new_inl = make_str(subj, subj->pos - 2, subj->pos - 1, cmark_chunk_literal("!["));
1344       push_bracket(subj, true, new_inl);
1345     } else {
1346       new_inl = make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("!"));
1347     }
1348     break;
1349   default:
1350     new_inl = try_extensions(parser, parent, c, subj);
1351     if (new_inl != NULL)
1352       break;
1353 
1354     endpos = subject_find_special_char(subj, options);
1355     contents = cmark_chunk_dup(&subj->input, subj->pos, endpos - subj->pos);
1356     startpos = subj->pos;
1357     subj->pos = endpos;
1358 
1359     // if we're at a newline, strip trailing spaces.
1360     if (S_is_line_end_char(peek_char(subj))) {
1361       cmark_chunk_rtrim(&contents);
1362     }
1363 
1364     new_inl = make_str(subj, startpos, endpos - 1, contents);
1365   }
1366   if (new_inl != NULL) {
1367     cmark_node_append_child(parent, new_inl);
1368   }
1369 
1370   return 1;
1371 }
1372 
1373 // Parse inlines from parent's string_content, adding as children of parent.
cmark_parse_inlines(cmark_parser * parser,cmark_node * parent,cmark_map * refmap,int options)1374 void cmark_parse_inlines(cmark_parser *parser,
1375                          cmark_node *parent,
1376                          cmark_map *refmap,
1377                          int options) {
1378   subject subj;
1379   cmark_chunk content = {parent->content.ptr, parent->content.size, 0};
1380   subject_from_buf(parser->mem, parent->start_line, parent->start_column - 1 + parent->internal_offset, &subj, &content, refmap);
1381   cmark_chunk_rtrim(&subj.input);
1382 
1383   while (!is_eof(&subj) && parse_inline(parser, &subj, parent, options))
1384     ;
1385 
1386   process_emphasis(parser, &subj, NULL);
1387   // free bracket and delim stack
1388   while (subj.last_delim) {
1389     remove_delimiter(&subj, subj.last_delim);
1390   }
1391   while (subj.last_bracket) {
1392     pop_bracket(&subj);
1393   }
1394 }
1395 
1396 // Parse zero or more space characters, including at most one newline.
spnl(subject * subj)1397 static void spnl(subject *subj) {
1398   skip_spaces(subj);
1399   if (skip_line_end(subj)) {
1400     skip_spaces(subj);
1401   }
1402 }
1403 
1404 // Parse reference.  Assumes string begins with '[' character.
1405 // Modify refmap if a reference is encountered.
1406 // Return 0 if no reference found, otherwise position of subject
1407 // after reference is parsed.
cmark_parse_reference_inline(cmark_mem * mem,cmark_chunk * input,cmark_map * refmap)1408 bufsize_t cmark_parse_reference_inline(cmark_mem *mem, cmark_chunk *input,
1409                                        cmark_map *refmap) {
1410   subject subj;
1411 
1412   cmark_chunk lab;
1413   cmark_chunk url;
1414   cmark_chunk title;
1415 
1416   bufsize_t matchlen = 0;
1417   bufsize_t beforetitle;
1418 
1419   subject_from_buf(mem, -1, 0, &subj, input, NULL);
1420 
1421   // parse label:
1422   if (!link_label(&subj, &lab) || lab.len == 0)
1423     return 0;
1424 
1425   // colon:
1426   if (peek_char(&subj) == ':') {
1427     advance(&subj);
1428   } else {
1429     return 0;
1430   }
1431 
1432   // parse link url:
1433   spnl(&subj);
1434   if ((matchlen = manual_scan_link_url(&subj.input, subj.pos, &url)) > -1 &&
1435       url.len > 0) {
1436     subj.pos += matchlen;
1437   } else {
1438     return 0;
1439   }
1440 
1441   // parse optional link_title
1442   beforetitle = subj.pos;
1443   spnl(&subj);
1444   matchlen = subj.pos == beforetitle ? 0 : scan_link_title(&subj.input, subj.pos);
1445   if (matchlen) {
1446     title = cmark_chunk_dup(&subj.input, subj.pos, matchlen);
1447     subj.pos += matchlen;
1448   } else {
1449     subj.pos = beforetitle;
1450     title = cmark_chunk_literal("");
1451   }
1452 
1453   // parse final spaces and newline:
1454   skip_spaces(&subj);
1455   if (!skip_line_end(&subj)) {
1456     if (matchlen) { // try rewinding before title
1457       subj.pos = beforetitle;
1458       skip_spaces(&subj);
1459       if (!skip_line_end(&subj)) {
1460         return 0;
1461       }
1462     } else {
1463       return 0;
1464     }
1465   }
1466   // insert reference into refmap
1467   cmark_reference_create(refmap, &lab, &url, &title);
1468   return subj.pos;
1469 }
1470 
// Public wrapper around the file-local peek_char() helper: returns
// whatever peek_char() reports for this inline parser.  The parser's
// position is not changed by this wrapper itself.
unsigned char cmark_inline_parser_peek_char(cmark_inline_parser *parser) {
  return peek_char(parser);
}
1474 
// Public wrapper around the file-local peek_at() helper: returns the byte
// peek_at() reports for absolute input position `pos`.
// NOTE(review): no range check on `pos` is done here; presumably the caller
// must keep it inside the input — confirm against peek_at().
unsigned char cmark_inline_parser_peek_at(cmark_inline_parser *parser, bufsize_t pos) {
  return peek_at(parser, pos);
}
1478 
// Public wrapper around the file-local is_eof() helper: returns its result
// for this inline parser (non-zero presumably meaning the input is
// exhausted, as the name suggests).
int cmark_inline_parser_is_eof(cmark_inline_parser *parser) {
  return is_eof(parser);
}
1482 
/* Duplicate at most N bytes of S into a freshly malloc'ed, NUL-terminated
   string.  Copying stops early at the first NUL byte found within the
   first N bytes.  Only those N bytes are ever examined, so S does not have
   to be NUL-terminated and the cost does not depend on how much data
   follows.  Returns NULL if the allocation fails; the caller frees the
   result with free().  */
static char *
my_strndup (const char *s, size_t n)
{
  char *result;
  const char *nul = (const char *) memchr (s, '\0', n);
  size_t len = nul ? (size_t) (nul - s) : n;

  result = (char *) malloc (len + 1);
  if (!result)
    return NULL;

  memcpy (result, s, len);
  result[len] = '\0';
  return result;
}
1499 
// Consume input for as long as `pred` accepts the current character,
// stopping at the first rejected character or when peek_char() yields 0.
// Returns the consumed run as a newly allocated, NUL-terminated string
// (obtained from my_strndup(), i.e. malloc); NULL if that allocation fails.
// The parser is left positioned on the first character that was not taken.
char *cmark_inline_parser_take_while(cmark_inline_parser *parser, cmark_inline_predicate pred) {
  const bufsize_t begin = parser->pos;
  bufsize_t taken = 0;

  for (;;) {
    const unsigned char ch = peek_char(parser);

    if (ch == 0 || !(*pred)(ch))
      break;

    advance(parser);
    taken++;
  }

  return my_strndup((const char *) parser->input.data + begin, taken);
}
1512 
// Public wrapper around the file-local push_delimiter() helper.
// `can_open` and `can_close` are plain ints on this interface; any non-zero
// value is normalised to true before being forwarded.  `c` and `inl_text`
// are passed through unchanged.
void cmark_inline_parser_push_delimiter(cmark_inline_parser *parser,
                                  unsigned char c,
                                  int can_open,
                                  int can_close,
                                  cmark_node *inl_text) {
  push_delimiter(parser, c, can_open != 0, can_close != 0, inl_text);
}
1520 
// Public wrapper around the file-local remove_delimiter() helper: forwards
// `delim` for removal from this parser's delimiter stack.
void cmark_inline_parser_remove_delimiter(cmark_inline_parser *parser, delimiter *delim) {
  remove_delimiter(parser, delim);
}
1524 
// Consume a run of up to `max_delims` consecutive `c` characters at the
// current position and classify the run by its surroundings.
//
// Outputs:
//   *punct_before / *punct_after    - cmark_utf8proc_is_punctuation() of the
//                                     code point before / after the run
//   *left_flanking / *right_flanking - 1 or 0, derived from the whitespace
//                                     and punctuation on either side
// The start of input, the end of input and undecodable UTF-8 are all
// treated as a line feed (code point 10).
//
// Returns the number of delimiter characters consumed; the parser is left
// positioned just past the run.
int cmark_inline_parser_scan_delimiters(cmark_inline_parser *parser,
                                  int max_delims,
                                  unsigned char c,
                                  int *left_flanking,
                                  int *right_flanking,
                                  int *punct_before,
                                  int *punct_after) {
  int32_t prev_cp = 0;
  int32_t next_cp = 0;
  int run = 0;
  bool ws_before, ws_after;

  if (parser->pos == 0) {
    prev_cp = 10;
  } else {
    bufsize_t back = parser->pos - 1;

    // walk back over continuation bytes (10xxxxxx) to the start of the
    // UTF-8 sequence:
    while (peek_at(parser, back) >> 6 == 2 && back > 0) {
      back--;
    }
    if (cmark_utf8proc_iterate(parser->input.data + back,
                               parser->pos - back, &prev_cp) == -1) {
      prev_cp = 10;
    }
  }

  for (; peek_char(parser) == c && run < max_delims; run++) {
    advance(parser);
  }

  if (cmark_utf8proc_iterate(parser->input.data + parser->pos,
                             parser->input.len - parser->pos, &next_cp) == -1) {
    next_cp = 10;
  }

  *punct_before = cmark_utf8proc_is_punctuation(prev_cp);
  *punct_after = cmark_utf8proc_is_punctuation(next_cp);
  ws_before = cmark_utf8proc_is_space(prev_cp) != 0;
  ws_after = cmark_utf8proc_is_space(next_cp) != 0;

  // Left-flanking: not followed by whitespace, and if followed by
  // punctuation then preceded by whitespace or punctuation.
  *left_flanking = run > 0 && !ws_after &&
                   (!*punct_after || ws_before || *punct_before);
  // Right-flanking: the mirror image of the rule above.
  *right_flanking = run > 0 && !ws_before &&
                    (!*punct_before || ws_after || *punct_after);

  return run;
}
1577 
// Public wrapper around the file-local advance() helper: moves this inline
// parser forward by whatever a single advance() call does.
void cmark_inline_parser_advance_offset(cmark_inline_parser *parser) {
  advance(parser);
}
1581 
// Return the parser's current position (its `pos` field) within the input.
int cmark_inline_parser_get_offset(cmark_inline_parser *parser) {
  return parser->pos;
}
1585 
// Set the parser's current position (its `pos` field) to `offset`.
// The value is stored as given; no range check against the input length is
// performed here.
void cmark_inline_parser_set_offset(cmark_inline_parser *parser, int offset) {
  parser->pos = offset;
}
1589 
// Return the column of the current position, computed as
// pos + 1 + column_offset + block_offset.
// NOTE(review): the +1 suggests a 1-based source column — confirm against
// how callers use the value.
int cmark_inline_parser_get_column(cmark_inline_parser *parser) {
  return parser->pos + 1 + parser->column_offset + parser->block_offset;
}
1593 
// Return a pointer to the parser's own input chunk (not a copy), so the
// pointer refers to storage inside *parser.
cmark_chunk *cmark_inline_parser_get_chunk(cmark_inline_parser *parser) {
  return &parser->input;
}
1597 
// Report whether the bracket stack holds an active opener of the requested
// kind: image openers when `image` is non-zero, link openers when it is 0.
// Returns 1 if such an opener exists, 0 otherwise.
int cmark_inline_parser_in_bracket(cmark_inline_parser *parser, int image) {
  const bool want_image = image != 0;
  bracket *cur = parser->last_bracket;

  while (cur != NULL) {
    if (cur->active && cur->image == want_image)
      return 1;
    cur = cur->previous;
  }

  return 0;
}
1604 
// Remove up to `n` bytes from the end of node's trailing text children.
// Starting at the last child and walking backwards, each text node's
// literal length is shortened (only the length changes) until `n` bytes
// have been removed.  The walk stops at the first child that is not a text
// node or when the children run out.
void cmark_node_unput(cmark_node *node, int n) {
  cmark_node *cur;

  for (cur = node->last_child;
       n > 0 && cur != NULL && cur->type == CMARK_NODE_TEXT;
       cur = cur->prev) {
    const bufsize_t avail = cur->as.literal.len;

    if (avail < n) {
      // This node is consumed entirely; carry the remainder backwards.
      cur->as.literal.len = 0;
      n -= avail;
    } else {
      cur->as.literal.len = avail - n;
      n = 0;
    }
  }
}
1618 
// Return the parser's `last_delim` field, i.e. the most recently pushed
// entry of its delimiter stack (NULL when that field is NULL).
delimiter *cmark_inline_parser_get_last_delimiter(cmark_inline_parser *parser) {
  return parser->last_delim;
}
1622 
// Return the parser's `line` field: the line number it is currently
// tracking (handle_newline() increments it for each line ending consumed).
int cmark_inline_parser_get_line(cmark_inline_parser *parser) {
  return parser->line;
}
1626