1 /**
2  * Block parsing implementation.
3  *
4  * For a high-level overview of the block parsing process,
5  * see http://spec.commonmark.org/0.24/#phase-1-block-structure
6  */
7 
8 #include <stdlib.h>
9 #include <assert.h>
10 #include <stdio.h>
11 
12 #include "cmark_ctype.h"
13 #include "syntax_extension.h"
14 #include "config.h"
15 #include "parser.h"
16 #include "cmark-gfm.h"
17 #include "node.h"
18 #include "references.h"
19 #include "utf8.h"
20 #include "scanners.h"
21 #include "inlines.h"
22 #include "houdini.h"
23 #include "buffer.h"
24 #include "footnotes.h"
25 
// Indentation (in columns) at which a line is treated as indented code.
#define CODE_INDENT 4
// Distance between tab stops, in columns.
#define TAB_STOP 4

// Smaller of two values.  Every argument use and the whole expansion are
// parenthesised so that operands containing lower-precedence operators
// (e.g. `?:`) are compared as a unit.  Arguments are still evaluated more
// than once, so they must not have side effects.
#ifndef MIN
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
#endif

// Element `n` of the `data` array of the chunk-like object `i` points to.
#define peek_at(i, n) ((i)->data[(n)])
34 
S_last_line_blank(const cmark_node * node)35 static bool S_last_line_blank(const cmark_node *node) {
36   return (node->flags & CMARK_NODE__LAST_LINE_BLANK) != 0;
37 }
38 
S_type(const cmark_node * node)39 static CMARK_INLINE cmark_node_type S_type(const cmark_node *node) {
40   return (cmark_node_type)node->type;
41 }
42 
// Sets (is_blank == true) or clears (is_blank == false) the
// CMARK_NODE__LAST_LINE_BLANK bit in the node's flags.
static void S_set_last_line_blank(cmark_node *node, bool is_blank) {
  if (!is_blank) {
    node->flags &= ~CMARK_NODE__LAST_LINE_BLANK;
    return;
  }
  node->flags |= CMARK_NODE__LAST_LINE_BLANK;
}
49 
S_is_line_end_char(char c)50 static CMARK_INLINE bool S_is_line_end_char(char c) {
51   return (c == '\n' || c == '\r');
52 }
53 
S_is_space_or_tab(char c)54 static CMARK_INLINE bool S_is_space_or_tab(char c) {
55   return (c == ' ' || c == '\t');
56 }
57 
58 static void S_parser_feed(cmark_parser *parser, const unsigned char *buffer,
59                           size_t len, bool eof);
60 
61 static void S_process_line(cmark_parser *parser, const unsigned char *buffer,
62                            bufsize_t bytes);
63 
make_block(cmark_mem * mem,cmark_node_type tag,int start_line,int start_column)64 static cmark_node *make_block(cmark_mem *mem, cmark_node_type tag,
65                               int start_line, int start_column) {
66   cmark_node *e;
67 
68   e = (cmark_node *)mem->calloc(1, sizeof(*e));
69   cmark_strbuf_init(mem, &e->content, 32);
70   e->type = (uint16_t)tag;
71   e->flags = CMARK_NODE__OPEN;
72   e->start_line = start_line;
73   e->start_column = start_column;
74   e->end_line = start_line;
75 
76   return e;
77 }
78 
79 // Create a root document node.
make_document(cmark_mem * mem)80 static cmark_node *make_document(cmark_mem *mem) {
81   cmark_node *e = make_block(mem, CMARK_NODE_DOCUMENT, 1, 1);
82   return e;
83 }
84 
// Appends `extension` to the parser's syntax_extensions list.  When the
// extension provides a match_inline or insert_inline_from_delim callback it
// is appended to inline_syntax_extensions as well.  Always returns 1.
int cmark_parser_attach_syntax_extension(cmark_parser *parser,
                                         cmark_syntax_extension *extension) {
  parser->syntax_extensions =
      cmark_llist_append(parser->mem, parser->syntax_extensions, extension);

  if (!extension->match_inline && !extension->insert_inline_from_delim) {
    // No inline callbacks: nothing to register on the inline list.
    return 1;
  }

  parser->inline_syntax_extensions =
      cmark_llist_append(parser->mem, parser->inline_syntax_extensions,
                         extension);
  return 1;
}
95 
// Passes the parser's root node to cmark_node_free and its refmap to
// cmark_map_free, skipping whichever of the two is NULL.  The parser
// fields themselves are not modified.
static void cmark_parser_dispose(cmark_parser *parser) {
  if (parser->root != NULL) {
    cmark_node_free(parser->root);
  }

  if (parser->refmap != NULL) {
    cmark_map_free(parser->refmap);
  }
}
103 
// Returns the parser to its initial state.  The allocator, the options and
// the two extension lists are carried across the reset; everything else is
// zeroed and then rebuilt: fresh line buffers, a new reference map and a new
// document node that becomes both `root` and `current`.
static void cmark_parser_reset(cmark_parser *parser) {
  // Values that must survive the memset below.
  cmark_mem *keep_mem = parser->mem;
  int keep_options = parser->options;
  cmark_llist *keep_exts = parser->syntax_extensions;
  cmark_llist *keep_inline_exts = parser->inline_syntax_extensions;

  cmark_parser_dispose(parser);

  memset(parser, 0, sizeof(cmark_parser));
  parser->mem = keep_mem;
  parser->options = keep_options;
  parser->syntax_extensions = keep_exts;
  parser->inline_syntax_extensions = keep_inline_exts;

  cmark_strbuf_init(parser->mem, &parser->curline, 256);
  cmark_strbuf_init(parser->mem, &parser->linebuf, 0);

  cmark_node *doc = make_document(parser->mem);
  parser->refmap = cmark_reference_map_new(parser->mem);
  parser->root = doc;
  parser->current = doc;

  parser->last_buffer_ended_with_cr = false;
}
130 
cmark_parser_new_with_mem(int options,cmark_mem * mem)131 cmark_parser *cmark_parser_new_with_mem(int options, cmark_mem *mem) {
132   cmark_parser *parser = (cmark_parser *)mem->calloc(1, sizeof(cmark_parser));
133   parser->mem = mem;
134   parser->options = options;
135   cmark_parser_reset(parser);
136   return parser;
137 }
138 
cmark_parser_new(int options)139 cmark_parser *cmark_parser_new(int options) {
140   extern cmark_mem CMARK_DEFAULT_MEM_ALLOCATOR;
141   return cmark_parser_new_with_mem(options, &CMARK_DEFAULT_MEM_ALLOCATOR);
142 }
143 
// Tears the parser down: disposes of root/refmap, frees both line buffers
// and both extension lists, then releases the parser struct itself through
// its own allocator.
void cmark_parser_free(cmark_parser *parser) {
  // Read the allocator first; it is needed after the struct is released.
  cmark_mem *mem = parser->mem;

  cmark_parser_dispose(parser);

  cmark_strbuf_free(&parser->curline);
  cmark_strbuf_free(&parser->linebuf);

  cmark_llist_free(mem, parser->syntax_extensions);
  cmark_llist_free(mem, parser->inline_syntax_extensions);

  mem->free(parser);
}
153 
154 static cmark_node *finalize(cmark_parser *parser, cmark_node *b);
155 
156 // Returns true if line has only space characters, else false.
is_blank(cmark_strbuf * s,bufsize_t offset)157 static bool is_blank(cmark_strbuf *s, bufsize_t offset) {
158   while (offset < s->size) {
159     switch (s->ptr[offset]) {
160     case '\r':
161     case '\n':
162       return true;
163     case ' ':
164       offset++;
165       break;
166     case '\t':
167       offset++;
168       break;
169     default:
170       return false;
171     }
172   }
173 
174   return true;
175 }
176 
accepts_lines(cmark_node_type block_type)177 static CMARK_INLINE bool accepts_lines(cmark_node_type block_type) {
178   return (block_type == CMARK_NODE_PARAGRAPH ||
179           block_type == CMARK_NODE_HEADING ||
180           block_type == CMARK_NODE_CODE_BLOCK);
181 }
182 
contains_inlines(cmark_node * node)183 static CMARK_INLINE bool contains_inlines(cmark_node *node) {
184   if (node->extension && node->extension->contains_inlines_func) {
185     return node->extension->contains_inlines_func(node->extension, node) != 0;
186   }
187 
188   return (node->type == CMARK_NODE_PARAGRAPH ||
189           node->type == CMARK_NODE_HEADING);
190 }
191 
// Appends the part of `ch` that starts at parser->offset to the content of
// the open block `node`.  If a tab was only partially consumed, the tab byte
// is skipped and spaces up to the next tab stop are appended in its place.
static void add_line(cmark_node *node, cmark_chunk *ch, cmark_parser *parser) {
  assert(node->flags & CMARK_NODE__OPEN);

  if (parser->partially_consumed_tab) {
    // Step over the tab byte itself ...
    parser->offset += 1;
    // ... and emit the columns of it that are still unconsumed as spaces.
    int pad = TAB_STOP - (parser->column % TAB_STOP);
    while (pad-- > 0) {
      cmark_strbuf_putc(&node->content, ' ');
    }
  }

  cmark_strbuf_put(&node->content, ch->data + parser->offset,
                   ch->len - parser->offset);
}
207 
// Cuts trailing blank lines off `ln`.  Scans backwards for the last byte
// that is not a space, tab, CR or LF; if there is none the buffer is
// cleared.  Otherwise the buffer is truncated at the first line-ending byte
// found at or after that position (left untouched if none follows).
static void remove_trailing_blank_lines(cmark_strbuf *ln) {
  bufsize_t last = ln->size - 1;

  // Walk back over trailing whitespace and line endings.
  while (last >= 0) {
    unsigned char c = ln->ptr[last];
    if (!(c == ' ' || c == '\t' || S_is_line_end_char(c))) {
      break;
    }
    last--;
  }

  if (last < 0) {
    // Nothing but whitespace in the buffer.
    cmark_strbuf_clear(ln);
    return;
  }

  // Truncate at the line ending that terminates the last non-blank line.
  for (bufsize_t k = last; k < ln->size; k++) {
    if (S_is_line_end_char(ln->ptr[k])) {
      cmark_strbuf_truncate(ln, k);
      return;
    }
  }
}
234 
235 // Check to see if a node ends with a blank line, descending
236 // if needed into lists and sublists.
ends_with_blank_line(cmark_node * node)237 static bool ends_with_blank_line(cmark_node *node) {
238   cmark_node *cur = node;
239   while (cur != NULL) {
240     if (S_last_line_blank(cur)) {
241       return true;
242     }
243     if (S_type(cur) == CMARK_NODE_LIST || S_type(cur) == CMARK_NODE_ITEM) {
244       cur = cur->last_child;
245     } else {
246       cur = NULL;
247     }
248   }
249   return false;
250 }
251 
// Closes the open block `b` and returns its parent.
//
// Clears CMARK_NODE__OPEN, records the block's end line/column, then does
// the per-type clean-up:
//   - paragraph: strips every leading span accepted by
//     cmark_parse_reference_inline and frees the node if nothing but blanks
//     is left;
//   - code block: fenced blocks move their first content line into
//     as.code.info, indented blocks lose trailing blank lines; the remaining
//     content becomes as.code.literal;
//   - HTML block: content becomes as.literal;
//   - list: computes as.list.tight.
static cmark_node *finalize(cmark_parser *parser, cmark_node *b) {
  // Read the parent up front: a paragraph that ends up blank is freed below.
  cmark_node *parent = b->parent;
  cmark_strbuf *node_content = &b->content;
  cmark_node *item;
  cmark_node *subitem;
  bufsize_t pos;

  // shouldn't call finalize on closed blocks
  assert(b->flags & CMARK_NODE__OPEN);
  b->flags &= ~CMARK_NODE__OPEN;

  if (parser->curline.size == 0) {
    // end of input - line number has not been incremented
    b->end_line = parser->line_number;
    b->end_column = parser->last_line_length;
  } else {
    // Documents, fenced code blocks and setext headings end on the line
    // currently being processed; everything else ended on the previous one.
    bool ends_on_current_line =
        S_type(b) == CMARK_NODE_DOCUMENT ||
        (S_type(b) == CMARK_NODE_CODE_BLOCK && b->as.code.fenced) ||
        (S_type(b) == CMARK_NODE_HEADING && b->as.heading.setext);

    if (!ends_on_current_line) {
      b->end_line = parser->line_number - 1;
      b->end_column = parser->last_line_length;
    } else {
      // Column of the last byte, not counting a trailing "\n" / "\r\n".
      bufsize_t col = parser->curline.size;
      if (col && parser->curline.ptr[col - 1] == '\n')
        col -= 1;
      if (col && parser->curline.ptr[col - 1] == '\r')
        col -= 1;
      b->end_line = parser->line_number;
      b->end_column = col;
    }
  }

  switch (S_type(b)) {
  case CMARK_NODE_PARAGRAPH:
  {
    cmark_chunk rest = {node_content->ptr, node_content->size, 0};

    // Each successful call reports how many leading bytes to skip.
    for (;;) {
      if (rest.len == 0 || rest.data[0] != '[')
        break;
      pos = cmark_parse_reference_inline(parser->mem, &rest, parser->refmap);
      if (!pos)
        break;
      rest.data += pos;
      rest.len -= pos;
    }

    cmark_strbuf_drop(node_content, (node_content->size - rest.len));
    if (is_blank(node_content, 0)) {
      // remove blank node (former reference def)
      cmark_node_free(b);
    }
    break;
  }

  case CMARK_NODE_CODE_BLOCK:
    if (b->as.code.fenced) {
      // first line of contents becomes info
      pos = 0;
      while (pos < node_content->size &&
             !S_is_line_end_char(node_content->ptr[pos])) {
        ++pos;
      }
      assert(pos < node_content->size);

      cmark_strbuf info = CMARK_BUF_INIT(parser->mem);
      houdini_unescape_html_f(&info, node_content->ptr, pos);
      cmark_strbuf_trim(&info);
      cmark_strbuf_unescape(&info);
      b->as.code.info = cmark_chunk_buf_detach(&info);

      // Drop the info line together with its line ending.
      if (node_content->ptr[pos] == '\r')
        pos += 1;
      if (node_content->ptr[pos] == '\n')
        pos += 1;
      cmark_strbuf_drop(node_content, pos);
    } else {
      // indented code
      remove_trailing_blank_lines(node_content);
      cmark_strbuf_putc(node_content, '\n');
    }
    b->as.code.literal = cmark_chunk_buf_detach(node_content);
    break;

  case CMARK_NODE_HTML_BLOCK:
    b->as.literal = cmark_chunk_buf_detach(node_content);
    break;

  case CMARK_NODE_LIST:
    // determine tight/loose status; tight by default
    b->as.list.tight = true;

    for (item = b->first_child; item && b->as.list.tight; item = item->next) {
      // check for non-final non-empty list item ending with blank line:
      if (S_last_line_blank(item) && item->next) {
        b->as.list.tight = false;
        break;
      }
      // look at the children of the list item, to see if there are
      // spaces between them:
      for (subitem = item->first_child; subitem; subitem = subitem->next) {
        if (ends_with_blank_line(subitem) && (item->next || subitem->next)) {
          b->as.list.tight = false;
          break;
        }
      }
    }
    break;

  default:
    break;
  }

  return parent;
}
366 
367 // Add a node as child of another.  Return pointer to child.
add_child(cmark_parser * parser,cmark_node * parent,cmark_node_type block_type,int start_column)368 static cmark_node *add_child(cmark_parser *parser, cmark_node *parent,
369                              cmark_node_type block_type, int start_column) {
370   assert(parent);
371 
372   // if 'parent' isn't the kind of node that can accept this child,
373   // then back up til we hit a node that can.
374   while (!cmark_node_can_contain_type(parent, block_type)) {
375     parent = finalize(parser, parent);
376   }
377 
378   cmark_node *child =
379       make_block(parser->mem, block_type, parser->line_number, start_column);
380   child->parent = parent;
381 
382   if (parent->last_child) {
383     parent->last_child->next = child;
384     child->prev = parent->last_child;
385   } else {
386     parent->first_child = child;
387     child->prev = NULL;
388   }
389   parent->last_child = child;
390   return child;
391 }
392 
// For every attached inline extension, walks its special_inline_chars list
// and passes each character, together with the extension's `emphasis`
// field, to cmark_inlines_add_special_character (add != 0) or
// cmark_inlines_remove_special_character (add == 0).
void cmark_manage_extensions_special_characters(cmark_parser *parser, int add) {
  cmark_llist *ext_it = parser->inline_syntax_extensions;

  while (ext_it) {
    cmark_syntax_extension *ext = (cmark_syntax_extension *)ext_it->data;
    cmark_llist *char_it = ext->special_inline_chars;

    while (char_it) {
      // The character is stored directly in the list's data pointer.
      unsigned char c = (unsigned char)(size_t)char_it->data;

      if (!add) {
        cmark_inlines_remove_special_character(c, ext->emphasis);
      } else {
        cmark_inlines_add_special_character(c, ext->emphasis);
      }
      char_it = char_it->next;
    }

    ext_it = ext_it->next;
  }
}
408 
409 // Walk through node and all children, recursively, parsing
410 // string content into inline content where appropriate.
process_inlines(cmark_parser * parser,cmark_map * refmap,int options)411 static void process_inlines(cmark_parser *parser,
412                             cmark_map *refmap, int options) {
413   cmark_iter *iter = cmark_iter_new(parser->root);
414   cmark_node *cur;
415   cmark_event_type ev_type;
416 
417   cmark_manage_extensions_special_characters(parser, true);
418 
419   while ((ev_type = cmark_iter_next(iter)) != CMARK_EVENT_DONE) {
420     cur = cmark_iter_get_node(iter);
421     if (ev_type == CMARK_EVENT_ENTER) {
422       if (contains_inlines(cur)) {
423         cmark_parse_inlines(parser, cur, refmap, options);
424       }
425     }
426   }
427 
428   cmark_manage_extensions_special_characters(parser, false);
429 
430   cmark_iter_free(iter);
431 }
432 
sort_footnote_by_ix(const void * _a,const void * _b)433 static int sort_footnote_by_ix(const void *_a, const void *_b) {
434   cmark_footnote *a = *(cmark_footnote **)_a;
435   cmark_footnote *b = *(cmark_footnote **)_b;
436   return (int)a->ix - (int)b->ix;
437 }
438 
// Resolves footnotes for the finished document:
// * Collect definitions in a map (unlinking them from the tree).
// * Iterate the references in the document in order, assigning indices to
//   definitions in the order they're seen.
// * Write out the footnotes at the bottom of the document in index order.
// References without a matching definition are replaced by a text node
// holding the literal "[^label]".
static void process_footnotes(cmark_parser *parser) {
  cmark_map *map = cmark_footnote_map_new(parser->mem);

  cmark_iter *iter = cmark_iter_new(parser->root);
  cmark_node *cur;
  cmark_event_type ev_type;

  // Pass 1: pull every footnote definition out of the tree and into the map.
  while ((ev_type = cmark_iter_next(iter)) != CMARK_EVENT_DONE) {
    cur = cmark_iter_get_node(iter);
    if (ev_type == CMARK_EVENT_EXIT && cur->type == CMARK_NODE_FOOTNOTE_DEFINITION) {
      cmark_node_unlink(cur);
      cmark_footnote_create(map, cur);
    }
  }

  cmark_iter_free(iter);
  iter = cmark_iter_new(parser->root);
  unsigned int ix = 0;

  // Pass 2: number the definitions in order of first reference and rewrite
  // each reference's literal.
  while ((ev_type = cmark_iter_next(iter)) != CMARK_EVENT_DONE) {
    cur = cmark_iter_get_node(iter);
    if (ev_type == CMARK_EVENT_EXIT && cur->type == CMARK_NODE_FOOTNOTE_REFERENCE) {
      cmark_footnote *footnote = (cmark_footnote *)cmark_map_lookup(map, &cur->as.literal);
      if (footnote) {
        // An index of 0 means "not referenced yet".
        if (!footnote->ix)
          footnote->ix = ++ix;

        // The index comes from an unsigned counter, so format it as
        // unsigned; the cast keeps the specifier and argument type in
        // agreement whatever integer type the field has.
        char n[32];
        snprintf(n, sizeof(n), "%u", (unsigned int)footnote->ix);
        cmark_chunk_free(parser->mem, &cur->as.literal);
        cmark_strbuf buf = CMARK_BUF_INIT(parser->mem);
        cmark_strbuf_puts(&buf, n);

        cur->as.literal = cmark_chunk_buf_detach(&buf);
      } else {
        // No definition: put the original "[^label]" text back in place of
        // the reference node.
        cmark_node *text = (cmark_node *)parser->mem->calloc(1, sizeof(*text));
        cmark_strbuf_init(parser->mem, &text->content, 0);
        text->type = (uint16_t) CMARK_NODE_TEXT;

        cmark_strbuf buf = CMARK_BUF_INIT(parser->mem);
        cmark_strbuf_puts(&buf, "[^");
        cmark_strbuf_put(&buf, cur->as.literal.data, cur->as.literal.len);
        cmark_strbuf_putc(&buf, ']');

        text->as.literal = cmark_chunk_buf_detach(&buf);
        cmark_node_insert_after(cur, text);
        cmark_node_free(cur);
      }
    }
  }

  cmark_iter_free(iter);

  // Pass 3: append the referenced definitions to the document, ordered by
  // index.  Definitions that were never referenced (ix == 0) stay in the map.
  if (map->sorted) {
    qsort(map->sorted, map->size, sizeof(cmark_map_entry *), sort_footnote_by_ix);
    for (unsigned int i = 0; i < map->size; ++i) {
      cmark_footnote *footnote = (cmark_footnote *)map->sorted[i];
      if (!footnote->ix)
        continue;
      cmark_node_append_child(parser->root, footnote->node);
      // Ownership moved to the tree; clear the map's pointer to the node.
      footnote->node = NULL;
    }
  }

  cmark_map_free(map);
}
510 
511 // Attempts to parse a list item marker (bullet or enumerated).
512 // On success, returns length of the marker, and populates
513 // data with the details.  On failure, returns 0.
parse_list_marker(cmark_mem * mem,cmark_chunk * input,bufsize_t pos,bool interrupts_paragraph,cmark_list ** dataptr)514 static bufsize_t parse_list_marker(cmark_mem *mem, cmark_chunk *input,
515                                    bufsize_t pos, bool interrupts_paragraph,
516                                    cmark_list **dataptr) {
517   unsigned char c;
518   bufsize_t startpos;
519   cmark_list *data;
520   bufsize_t i;
521 
522   startpos = pos;
523   c = peek_at(input, pos);
524 
525   if (c == '*' || c == '-' || c == '+') {
526     pos++;
527     if (!cmark_isspace(peek_at(input, pos))) {
528       return 0;
529     }
530 
531     if (interrupts_paragraph) {
532       i = pos;
533       // require non-blank content after list marker:
534       while (S_is_space_or_tab(peek_at(input, i))) {
535         i++;
536       }
537       if (peek_at(input, i) == '\n') {
538         return 0;
539       }
540     }
541 
542     data = (cmark_list *)mem->calloc(1, sizeof(*data));
543     data->marker_offset = 0; // will be adjusted later
544     data->list_type = CMARK_BULLET_LIST;
545     data->bullet_char = c;
546     data->start = 0;
547     data->delimiter = CMARK_NO_DELIM;
548     data->tight = false;
549   } else if (cmark_isdigit(c)) {
550     int start = 0;
551     int digits = 0;
552 
553     do {
554       start = (10 * start) + (peek_at(input, pos) - '0');
555       pos++;
556       digits++;
557       // We limit to 9 digits to avoid overflow,
558       // assuming max int is 2^31 - 1
559       // This also seems to be the limit for 'start' in some browsers.
560     } while (digits < 9 && cmark_isdigit(peek_at(input, pos)));
561 
562     if (interrupts_paragraph && start != 1) {
563       return 0;
564     }
565     c = peek_at(input, pos);
566     if (c == '.' || c == ')') {
567       pos++;
568       if (!cmark_isspace(peek_at(input, pos))) {
569         return 0;
570       }
571       if (interrupts_paragraph) {
572         // require non-blank content after list marker:
573         i = pos;
574         while (S_is_space_or_tab(peek_at(input, i))) {
575           i++;
576         }
577         if (S_is_line_end_char(peek_at(input, i))) {
578           return 0;
579         }
580       }
581 
582       data = (cmark_list *)mem->calloc(1, sizeof(*data));
583       data->marker_offset = 0; // will be adjusted later
584       data->list_type = CMARK_ORDERED_LIST;
585       data->bullet_char = 0;
586       data->start = start;
587       data->delimiter = (c == '.' ? CMARK_PERIOD_DELIM : CMARK_PAREN_DELIM);
588       data->tight = false;
589     } else {
590       return 0;
591     }
592   } else {
593     return 0;
594   }
595 
596   *dataptr = data;
597   return (pos - startpos);
598 }
599 
600 // Return 1 if list item belongs in list, else 0.
lists_match(cmark_list * list_data,cmark_list * item_data)601 static int lists_match(cmark_list *list_data, cmark_list *item_data) {
602   return (list_data->list_type == item_data->list_type &&
603           list_data->delimiter == item_data->delimiter &&
604           // list_data->marker_offset == item_data.marker_offset &&
605           list_data->bullet_char == item_data->bullet_char);
606 }
607 
finalize_document(cmark_parser * parser)608 static cmark_node *finalize_document(cmark_parser *parser) {
609   while (parser->current != parser->root) {
610     parser->current = finalize(parser, parser->current);
611   }
612 
613   finalize(parser, parser->root);
614   process_inlines(parser, parser->refmap, parser->options);
615   if (parser->options & CMARK_OPT_FOOTNOTES)
616     process_footnotes(parser);
617 
618   return parser->root;
619 }
620 
cmark_parse_file(FILE * f,int options)621 cmark_node *cmark_parse_file(FILE *f, int options) {
622   unsigned char buffer[4096];
623   cmark_parser *parser = cmark_parser_new(options);
624   size_t bytes;
625   cmark_node *document;
626 
627   while ((bytes = fread(buffer, 1, sizeof(buffer), f)) > 0) {
628     bool eof = bytes < sizeof(buffer);
629     S_parser_feed(parser, buffer, bytes, eof);
630     if (eof) {
631       break;
632     }
633   }
634 
635   document = cmark_parser_finish(parser);
636   cmark_parser_free(parser);
637   return document;
638 }
639 
cmark_parse_document(const char * buffer,size_t len,int options)640 cmark_node *cmark_parse_document(const char *buffer, size_t len, int options) {
641   cmark_parser *parser = cmark_parser_new(options);
642   cmark_node *document;
643 
644   S_parser_feed(parser, (const unsigned char *)buffer, len, true);
645 
646   document = cmark_parser_finish(parser);
647   cmark_parser_free(parser);
648   return document;
649 }
650 
// Feeds `len` bytes of input to the parser as a non-final chunk
// (eof = false).
void cmark_parser_feed(cmark_parser *parser, const char *buffer, size_t len) {
  const unsigned char *bytes = (const unsigned char *)buffer;
  S_parser_feed(parser, bytes, len, false);
}
654 
// Feeds `buffer` as a complete input (eof = true) while preserving whatever
// partial line the parser currently holds in linebuf: the pending text is
// copied aside, linebuf is cleared for the nested feed, and the saved text
// is put back afterwards.
void cmark_parser_feed_reentrant(cmark_parser *parser, const char *buffer, size_t len) {
  cmark_strbuf stash;

  // Set the pending partial line aside.
  cmark_strbuf_init(parser->mem, &stash, 0);
  const char *pending = cmark_strbuf_cstr(&parser->linebuf);
  cmark_strbuf_puts(&stash, pending);
  cmark_strbuf_clear(&parser->linebuf);

  S_parser_feed(parser, (const unsigned char *)buffer, len, true);

  // Restore it and release the temporary copy.
  const char *restored = cmark_strbuf_cstr(&stash);
  cmark_strbuf_sets(&parser->linebuf, restored);
  cmark_strbuf_free(&stash);
}
667 
// Splits `buffer` (`len` bytes) into lines and hands each complete line,
// without its terminator, to S_process_line.
//
// - A line ends at CR, LF or CRLF.  A CRLF pair split across two calls is
//   recognised through parser->last_buffer_ended_with_cr.
// - A NUL byte is dropped and the bytes 239,191,189 (UTF-8 for U+FFFD) are
//   stored in its place.
// - Text not yet terminated by a line ending is accumulated in
//   parser->linebuf; when `eof` is true the unterminated tail of the buffer
//   is processed as a line as well.
// - An empty buffer is a no-op: nothing is read through `buffer`, and the
//   CR-carry flag keeps the value the previous non-empty chunk gave it.
static void S_parser_feed(cmark_parser *parser, const unsigned char *buffer,
                          size_t len, bool eof) {
  // UTF-8 encoding of the replacement character U+FFFD
  static const uint8_t repl[] = {239, 191, 189};

  if (len == 0) {
    // Nothing to scan.  Returning here avoids dereferencing an empty
    // (possibly NULL) buffer and keeps a pending CR/LF split intact.
    return;
  }

  const unsigned char *end = buffer + len;

  if (parser->last_buffer_ended_with_cr && *buffer == '\n') {
    // skip NL if last buffer ended with CR ; see #117
    buffer++;
  }
  parser->last_buffer_ended_with_cr = false;
  while (buffer < end) {
    const unsigned char *eol;
    bufsize_t chunk_len;
    bool process = false;

    // Find the end of the current segment: a line ending (complete line),
    // a NUL byte (segment to be stored), or the end of the buffer.
    for (eol = buffer; eol < end; ++eol) {
      if (S_is_line_end_char(*eol)) {
        process = true;
        break;
      }
      if (*eol == '\0') {
        break;
      }
    }
    if (eol >= end && eof) {
      // Last chunk: the unterminated tail counts as a line.
      process = true;
    }

    chunk_len = (bufsize_t)(eol - buffer);
    if (process) {
      if (parser->linebuf.size > 0) {
        // Complete the partial line collected so far.
        cmark_strbuf_put(&parser->linebuf, buffer, chunk_len);
        S_process_line(parser, parser->linebuf.ptr, parser->linebuf.size);
        cmark_strbuf_clear(&parser->linebuf);
      } else {
        S_process_line(parser, buffer, chunk_len);
      }
    } else {
      if (eol < end && *eol == '\0') {
        // omit NULL byte
        cmark_strbuf_put(&parser->linebuf, buffer, chunk_len);
        // add replacement character
        cmark_strbuf_put(&parser->linebuf, repl, 3);
      } else {
        cmark_strbuf_put(&parser->linebuf, buffer, chunk_len);
      }
    }

    buffer += chunk_len;
    if (buffer < end) {
      if (*buffer == '\0') {
        // skip over NULL
        buffer++;
      } else {
        // skip over line ending characters
        if (*buffer == '\r') {
          buffer++;
          if (buffer == end)
            parser->last_buffer_ended_with_cr = true;
        }
        if (buffer < end && *buffer == '\n')
          buffer++;
      }
    }
  }
}
733 
// Right-trims `ch`, then removes a trailing run of '#' characters together
// with the whitespace before it.  The run is removed only if it is non-empty
// and directly preceded by a space or tab; otherwise the chunk is left as
// trimmed.
static void chop_trailing_hashtags(cmark_chunk *ch) {
  cmark_chunk_rtrim(ch);

  const bufsize_t last = ch->len - 1;
  bufsize_t n = last;

  // Step back over the trailing #s.
  while (n >= 0 && peek_at(ch, n) == '#') {
    n--;
  }

  if (n == last || n < 0) {
    // No trailing #s at all, or the chunk consists only of #s.
    return;
  }
  if (!S_is_space_or_tab(peek_at(ch, n))) {
    // The #s are not separated from the text by a space or tab.
    return;
  }

  ch->len = n;
  cmark_chunk_rtrim(ch);
}
750 
751 // Find first nonspace character from current offset, setting
752 // parser->first_nonspace, parser->first_nonspace_column,
753 // parser->indent, and parser->blank. Does not advance parser->offset.
S_find_first_nonspace(cmark_parser * parser,cmark_chunk * input)754 static void S_find_first_nonspace(cmark_parser *parser, cmark_chunk *input) {
755   char c;
756   int chars_to_tab = TAB_STOP - (parser->column % TAB_STOP);
757 
758   if (parser->first_nonspace <= parser->offset) {
759     parser->first_nonspace = parser->offset;
760     parser->first_nonspace_column = parser->column;
761     while ((c = peek_at(input, parser->first_nonspace))) {
762       if (c == ' ') {
763         parser->first_nonspace += 1;
764         parser->first_nonspace_column += 1;
765         chars_to_tab = chars_to_tab - 1;
766         if (chars_to_tab == 0) {
767           chars_to_tab = TAB_STOP;
768         }
769       } else if (c == '\t') {
770         parser->first_nonspace += 1;
771         parser->first_nonspace_column += chars_to_tab;
772         chars_to_tab = TAB_STOP;
773       } else {
774         break;
775       }
776     }
777   }
778 
779   parser->indent = parser->first_nonspace_column - parser->column;
780   parser->blank = S_is_line_end_char(peek_at(input, parser->first_nonspace));
781 }
782 
783 // Advance parser->offset and parser->column.  parser->offset is the
784 // byte position in input; parser->column is a virtual column number
785 // that takes into account tabs. (Multibyte characters are not taken
786 // into account, because the Markdown line prefixes we are interested in
787 // analyzing are entirely ASCII.)  The count parameter indicates
788 // how far to advance the offset.  If columns is true, then count
789 // indicates a number of columns; otherwise, a number of bytes.
790 // If advancing a certain number of columns partially consumes
791 // a tab character, parser->partially_consumed_tab is set to true.
S_advance_offset(cmark_parser * parser,cmark_chunk * input,bufsize_t count,bool columns)792 static void S_advance_offset(cmark_parser *parser, cmark_chunk *input,
793                              bufsize_t count, bool columns) {
794   char c;
795   int chars_to_tab;
796   int chars_to_advance;
797   while (count > 0 && (c = peek_at(input, parser->offset))) {
798     if (c == '\t') {
799       chars_to_tab = TAB_STOP - (parser->column % TAB_STOP);
800       if (columns) {
801         parser->partially_consumed_tab = chars_to_tab > count;
802         chars_to_advance = MIN(count, chars_to_tab);
803         parser->column += chars_to_advance;
804         parser->offset += (parser->partially_consumed_tab ? 0 : 1);
805         count -= chars_to_advance;
806       } else {
807         parser->partially_consumed_tab = false;
808         parser->column += chars_to_tab;
809         parser->offset += 1;
810         count -= 1;
811       }
812     } else {
813       parser->partially_consumed_tab = false;
814       parser->offset += 1;
815       parser->column += 1; // assume ascii; block starts are ascii
816       count -= 1;
817     }
818   }
819 }
820 
// True when `container` has a last child and that child still has
// CMARK_NODE__OPEN set.
static bool S_last_child_is_open(cmark_node *container) {
  cmark_node *last = container->last_child;

  if (last == NULL) {
    return false;
  }
  return (last->flags & CMARK_NODE__OPEN) != 0;
}
825 
// Tries to consume a block quote marker.  It matches when the line is
// indented by at most 3 columns and the first non-space byte is '>'.  On a
// match the indentation, the '>' and one optional following space or tab
// are consumed and true is returned; otherwise nothing is consumed and the
// result is false.
static bool parse_block_quote_prefix(cmark_parser *parser, cmark_chunk *input) {
  if (parser->indent > 3 || peek_at(input, parser->first_nonspace) != '>') {
    return false;
  }

  // Indentation plus the '>' itself.
  S_advance_offset(parser, input, parser->indent + 1, true);

  // One optional space or tab after the marker.
  if (S_is_space_or_tab(peek_at(input, parser->offset))) {
    S_advance_offset(parser, input, 1, true);
  }

  return true;
}
844 
// Decides whether the current line continues a footnote definition.  A line
// indented by at least 4 columns continues it (4 columns are consumed); so
// does a line whose input starts with "\n" or "\r\n" (nothing is consumed).
// `container` is not used.
static bool parse_footnote_definition_block_prefix(cmark_parser *parser, cmark_chunk *input,
                                                   cmark_node *container) {
  if (parser->indent >= 4) {
    S_advance_offset(parser, input, 4, true);
    return true;
  }

  if (input->len <= 0) {
    return false;
  }
  if (input->data[0] == '\n') {
    return true;
  }
  return input->data[0] == '\r' && input->data[1] == '\n';
}
856 
// Decides whether the current line continues the list item `container`.
// It does when the line is indented by at least marker_offset + padding
// columns (that many columns are consumed), or when the line is blank and
// the item already has a child (the leading whitespace is consumed).
static bool parse_node_item_prefix(cmark_parser *parser, cmark_chunk *input,
                                   cmark_node *container) {
  if (parser->indent >=
      container->as.list.marker_offset + container->as.list.padding) {
    S_advance_offset(parser, input,
                     container->as.list.marker_offset +
                         container->as.list.padding,
                     true);
    return true;
  }

  // if container->first_child is NULL, then the opening line
  // of the list item was blank after the list marker; in this
  // case, we are done with the list item.
  if (!parser->blank || container->first_child == NULL) {
    return false;
  }

  S_advance_offset(parser, input, parser->first_nonspace - parser->offset,
                   false);
  return true;
}
877 
// Decides whether the current line continues the code block `container`.
//
// Indented block: continues on a line indented by at least CODE_INDENT
// columns (those columns are consumed) or on a blank line (its leading
// whitespace is consumed).
//
// Fenced block: if the line is a closing fence at least as long as the
// opening one, the fence is consumed, the block is finalized,
// *should_continue is set to false and false is returned.  Otherwise up to
// fence_offset leading spaces/tabs are consumed and true is returned.
static bool parse_code_block_prefix(cmark_parser *parser, cmark_chunk *input,
                                    cmark_node *container,
                                    bool *should_continue) {
  if (!container->as.code.fenced) {
    // indented
    if (parser->indent >= CODE_INDENT) {
      S_advance_offset(parser, input, CODE_INDENT, true);
      return true;
    }
    if (parser->blank) {
      S_advance_offset(parser, input, parser->first_nonspace - parser->offset,
                       false);
      return true;
    }
    return false;
  }

  // fenced
  bufsize_t fence_run = 0;

  if (parser->indent <= 3 && (peek_at(input, parser->first_nonspace) ==
                              container->as.code.fence_char)) {
    fence_run = scan_close_code_fence(input, parser->first_nonspace);
  }

  if (fence_run >= container->as.code.fence_length) {
    // closing fence - and since we're at
    // the end of a line, we can stop processing it:
    *should_continue = false;
    S_advance_offset(parser, input, fence_run, false);
    parser->current = finalize(parser, container);
    return false;
  }

  // skip the optional spaces that mirror the opening fence's indentation
  for (int remaining = container->as.code.fence_offset;
       remaining > 0 && S_is_space_or_tab(peek_at(input, parser->offset));
       remaining--) {
    S_advance_offset(parser, input, 1, true);
  }

  return true;
}
920 
/**
 * Decide whether the current line continues an open HTML block.
 *
 * Block types 1 through 5 accept any line, including blank ones.  Types 6
 * and 7 match only when the line is not blank.
 *
 * Returns: true if the HTML block matches this line, false otherwise.
 */
static bool parse_html_block_prefix(cmark_parser *parser,
                                    cmark_node *container) {
  const int kind = container->as.html_block_type;

  assert(kind >= 1 && kind <= 7);

  if (kind >= 1 && kind <= 5) {
    // these types of blocks can accept blanks
    return true;
  }

  if (kind == 6 || kind == 7) {
    return !parser->blank;
  }

  return false;
}
944 
/**
 * Ask the extension that owns `container` whether the current line still
 * belongs to it.
 *
 * Returns: false when the extension has no last_block_matches callback or
 * the callback returns zero; true otherwise.
 */
static bool parse_extension_block(cmark_parser *parser,
                                  cmark_node *container,
                                  cmark_chunk *input)
{
  if (!container->extension->last_block_matches) {
    return false;
  }

  return container->extension->last_block_matches(
             container->extension, parser, input->data, input->len, container)
             ? true
             : false;
}
959 
960 /**
961  * For each containing node, try to parse the associated line start.
962  *
963  * Will not close unmatched blocks, as we may have a lazy continuation
964  * line -> http://spec.commonmark.org/0.24/#lazy-continuation-line
965  *
966  * Returns: The last matching node, or NULL
967  */
check_open_blocks(cmark_parser * parser,cmark_chunk * input,bool * all_matched)968 static cmark_node *check_open_blocks(cmark_parser *parser, cmark_chunk *input,
969                                      bool *all_matched) {
970   bool should_continue = true;
971   *all_matched = false;
972   cmark_node *container = parser->root;
973   cmark_node_type cont_type;
974 
975   while (S_last_child_is_open(container)) {
976     container = container->last_child;
977     cont_type = S_type(container);
978 
979     S_find_first_nonspace(parser, input);
980 
981     if (container->extension) {
982       if (!parse_extension_block(parser, container, input))
983         goto done;
984       continue;
985     }
986 
987     switch (cont_type) {
988     case CMARK_NODE_BLOCK_QUOTE:
989       if (!parse_block_quote_prefix(parser, input))
990         goto done;
991       break;
992     case CMARK_NODE_ITEM:
993       if (!parse_node_item_prefix(parser, input, container))
994         goto done;
995       break;
996     case CMARK_NODE_CODE_BLOCK:
997       if (!parse_code_block_prefix(parser, input, container, &should_continue))
998         goto done;
999       break;
1000     case CMARK_NODE_HEADING:
1001       // a heading can never contain more than one line
1002       goto done;
1003     case CMARK_NODE_HTML_BLOCK:
1004       if (!parse_html_block_prefix(parser, container))
1005         goto done;
1006       break;
1007     case CMARK_NODE_PARAGRAPH:
1008       if (parser->blank)
1009         goto done;
1010       break;
1011 		case CMARK_NODE_FOOTNOTE_DEFINITION:
1012 			if (!parse_footnote_definition_block_prefix(parser, input, container))
1013 				goto done;
1014 			break;
1015     default:
1016       break;
1017     }
1018   }
1019 
1020   *all_matched = true;
1021 
1022 done:
1023   if (!*all_matched) {
1024     container = container->parent; // back up to last matching node
1025   }
1026 
1027   if (!should_continue) {
1028     container = NULL;
1029   }
1030 
1031   return container;
1032 }
1033 
/**
 * Starting from *container (the deepest block matched for this line), keep
 * opening new blocks for as long as the remaining line text starts one.
 *
 * Block starts are tried in a fixed order: block quote, ATX heading, fenced
 * code, HTML block, setext heading underline, thematic break, footnote
 * definition (only with CMARK_OPT_FOOTNOTES), list item, indented code, and
 * finally each syntax extension's try_opening_block callback.
 *
 * The loop ends when nothing opens, when the newly opened block accepts lines
 * (and so cannot hold other containers), or when the container is a code or
 * HTML block.  *container is updated to the deepest block opened.
 */
static void open_new_blocks(cmark_parser *parser, cmark_node **container,
                            cmark_chunk *input, bool all_matched) {
  cmark_list *list_data = NULL;
  bool maybe_lazy = S_type(parser->current) == CMARK_NODE_PARAGRAPH;
  cmark_node_type cont_type = S_type(*container);
  bufsize_t match_len = 0;
  int setext_level = 0;

  while (!(cont_type == CMARK_NODE_CODE_BLOCK ||
           cont_type == CMARK_NODE_HTML_BLOCK)) {

    S_find_first_nonspace(parser, input);
    const bool indented = parser->indent >= CODE_INDENT;

    if (!indented && peek_at(input, parser->first_nonspace) == '>') {
      // --- block quote ---
      const bufsize_t quote_pos = parser->first_nonspace;

      S_advance_offset(parser, input,
                       parser->first_nonspace + 1 - parser->offset, false);
      // one optional space or tab after the '>'
      if (S_is_space_or_tab(peek_at(input, parser->offset))) {
        S_advance_offset(parser, input, 1, true);
      }
      *container = add_child(parser, *container, CMARK_NODE_BLOCK_QUOTE,
                             quote_pos + 1);

    } else if (!indented && (match_len = scan_atx_heading_start(
                                 input, parser->first_nonspace))) {
      // --- ATX heading ---
      const bufsize_t heading_pos = parser->first_nonspace;
      int level = 0;

      S_advance_offset(parser, input,
                       parser->first_nonspace + match_len - parser->offset,
                       false);
      *container = add_child(parser, *container, CMARK_NODE_HEADING,
                             heading_pos + 1);

      // the heading level is the length of the run of '#' characters
      for (bufsize_t pos =
               cmark_chunk_strchr(input, '#', parser->first_nonspace);
           peek_at(input, pos) == '#'; pos++) {
        level++;
      }

      (*container)->as.heading.level = level;
      (*container)->as.heading.setext = false;
      (*container)->internal_offset = match_len;

    } else if (!indented && (match_len = scan_open_code_fence(
                                 input, parser->first_nonspace))) {
      // --- fenced code block ---
      *container = add_child(parser, *container, CMARK_NODE_CODE_BLOCK,
                             parser->first_nonspace + 1);
      (*container)->as.code.fenced = true;
      (*container)->as.code.fence_char = peek_at(input, parser->first_nonspace);
      (*container)->as.code.fence_length =
          (match_len > 255) ? 255 : (uint8_t)match_len;
      (*container)->as.code.fence_offset =
          (int8_t)(parser->first_nonspace - parser->offset);
      (*container)->as.code.info = cmark_chunk_literal("");
      S_advance_offset(parser, input,
                       parser->first_nonspace + match_len - parser->offset,
                       false);

    } else if (!indented && ((match_len = scan_html_block_start(
                                  input, parser->first_nonspace)) ||
                             (cont_type != CMARK_NODE_PARAGRAPH &&
                              (match_len = scan_html_block_start_7(
                                   input, parser->first_nonspace))))) {
      // --- HTML block ---
      *container = add_child(parser, *container, CMARK_NODE_HTML_BLOCK,
                             parser->first_nonspace + 1);
      (*container)->as.html_block_type = match_len;
      // parser->offset is left alone: the tag itself is part of the text

    } else if (!indented && cont_type == CMARK_NODE_PARAGRAPH &&
               (setext_level =
                    scan_setext_heading_line(input, parser->first_nonspace))) {
      // --- setext underline: the paragraph itself becomes the heading ---
      (*container)->type = (uint16_t)CMARK_NODE_HEADING;
      (*container)->as.heading.level = setext_level;
      (*container)->as.heading.setext = true;
      S_advance_offset(parser, input, input->len - 1 - parser->offset, false);

    } else if (!indented &&
               (cont_type != CMARK_NODE_PARAGRAPH || all_matched) &&
               (match_len =
                    scan_thematic_break(input, parser->first_nonspace))) {
      // --- thematic break ---
      // only at this point is it known that the line is not a setext underline
      *container = add_child(parser, *container, CMARK_NODE_THEMATIC_BREAK,
                             parser->first_nonspace + 1);
      S_advance_offset(parser, input, input->len - 1 - parser->offset, false);

    } else if (!indented &&
               (parser->options & CMARK_OPT_FOOTNOTES) &&
               (match_len =
                    scan_footnote_definition(input, parser->first_nonspace))) {
      // --- footnote definition ---
      cmark_chunk label =
          cmark_chunk_dup(input, parser->first_nonspace + 2, match_len - 2);
      cmark_chunk_to_cstr(parser->mem, &label);

      // trim back to, and then past, the closing ']' so only the label remains
      while (label.data[label.len - 1] != ']')
        --label.len;
      --label.len;

      S_advance_offset(parser, input,
                       parser->first_nonspace + match_len - parser->offset,
                       false);
      *container = add_child(parser, *container, CMARK_NODE_FOOTNOTE_DEFINITION,
                             parser->first_nonspace + match_len + 1);
      (*container)->as.literal = label;

      (*container)->internal_offset = match_len;

    } else if ((!indented || cont_type == CMARK_NODE_LIST) &&
               parser->indent < 4 &&
               (match_len = parse_list_marker(
                    parser->mem, input, parser->first_nonspace,
                    (*container)->type == CMARK_NODE_PARAGRAPH, &list_data))) {
      // --- list item ---

      // step over the marker, then measure the whitespace that follows it
      S_advance_offset(parser, input,
                       parser->first_nonspace + match_len - parser->offset,
                       false);

      const bool saved_tab = parser->partially_consumed_tab;
      const int saved_offset = parser->offset;
      const int saved_column = parser->column;

      while (parser->column - saved_column <= 5 &&
             S_is_space_or_tab(peek_at(input, parser->offset))) {
        S_advance_offset(parser, input, 1, true);
      }

      const int gap = parser->column - saved_column;
      if (gap >= 5 || gap < 1 ||
          // only whitespace follows the list marker:
          S_is_line_end_char(peek_at(input, parser->offset))) {
        // rewind and treat the padding as a single column after the marker
        list_data->padding = match_len + 1;
        parser->offset = saved_offset;
        parser->column = saved_column;
        parser->partially_consumed_tab = saved_tab;
        if (gap > 0) {
          S_advance_offset(parser, input, 1, true);
        }
      } else {
        list_data->padding = match_len + gap;
      }

      list_data->marker_offset = parser->indent;

      // Reuse the enclosing list when this item is compatible with it;
      // otherwise open a new list container first.
      if (cont_type != CMARK_NODE_LIST ||
          !lists_match(&((*container)->as.list), list_data)) {
        *container = add_child(parser, *container, CMARK_NODE_LIST,
                               parser->first_nonspace + 1);

        memcpy(&((*container)->as.list), list_data, sizeof(*list_data));
      }

      // now add the item itself
      *container = add_child(parser, *container, CMARK_NODE_ITEM,
                             parser->first_nonspace + 1);
      /* TODO: static */
      memcpy(&((*container)->as.list), list_data, sizeof(*list_data));
      parser->mem->free(list_data);

    } else if (indented && !maybe_lazy && !parser->blank) {
      // --- indented code block ---
      S_advance_offset(parser, input, CODE_INDENT, true);
      *container = add_child(parser, *container, CMARK_NODE_CODE_BLOCK,
                             parser->offset + 1);
      (*container)->as.code.fenced = false;
      (*container)->as.code.fence_char = 0;
      (*container)->as.code.fence_length = 0;
      (*container)->as.code.fence_offset = 0;
      (*container)->as.code.info = cmark_chunk_literal("");

    } else {
      // --- syntax extensions: the first one that opens a block wins ---
      cmark_node *opened = NULL;

      for (cmark_llist *it = parser->syntax_extensions; it; it = it->next) {
        cmark_syntax_extension *ext = (cmark_syntax_extension *) it->data;

        if (!ext->try_opening_block) {
          continue;
        }

        opened = ext->try_opening_block(
            ext, indented, parser, *container, input->data, input->len);

        if (opened) {
          *container = opened;
          break;
        }
      }

      if (!opened) {
        // nothing starts here; the rest of the line is text
        break;
      }
    }

    if (accepts_lines(S_type(*container))) {
      // a line container cannot contain other containers
      break;
    }

    cont_type = S_type(*container);
    maybe_lazy = false;
  }
}
1240 
/**
 * Attach what is left of the current line (from parser->offset on) to the
 * appropriate block.
 *
 * Updates the last-line-blank flags on `container`, its last child and its
 * ancestors, then either appends the line to the open paragraph as a lazy
 * continuation or finalizes unmatched blocks and adds the line to
 * `container` (creating a paragraph when the container does not accept
 * lines).  parser->current is set to the resulting container unless the line
 * was a lazy continuation.
 */
static void add_text_to_container(cmark_parser *parser, cmark_node *container,
                                  cmark_node *last_matched_container,
                                  cmark_chunk *input) {
  S_find_first_nonspace(parser, input);

  if (parser->blank && container->last_child)
    S_set_last_line_blank(container->last_child, true);

  // A blank line sets last_line_blank on the container, except that:
  //  - block quote lines are never blank, since they start with '>';
  //  - headings and thematic breaks are excluded;
  //  - blanks inside fenced code do not count for tight/loose lists or for
  //    breaking out of lists;
  //  - an empty list item on its own opening line is not marked.
  const cmark_node_type ctype = S_type(container);
  bool last_line_blank = parser->blank;

  if (last_line_blank) {
    switch (ctype) {
    case CMARK_NODE_BLOCK_QUOTE:
    case CMARK_NODE_HEADING:
    case CMARK_NODE_THEMATIC_BREAK:
      last_line_blank = false;
      break;
    case CMARK_NODE_CODE_BLOCK:
      last_line_blank = !container->as.code.fenced;
      break;
    case CMARK_NODE_ITEM:
      last_line_blank = !(container->first_child == NULL &&
                          container->start_line == parser->line_number);
      break;
    default:
      break;
    }
  }

  S_set_last_line_blank(container, last_line_blank);

  // no ancestor of the container ends in a blank line
  for (cmark_node *ancestor = container->parent; ancestor;
       ancestor = ancestor->parent) {
    S_set_last_line_blank(ancestor, false);
  }

  // Lazy continuation line: the previous line belonged to a paragraph, not
  // every open block prefix matched, no new block was opened, and the line is
  // not blank.  The text goes straight into the still-open paragraph.
  if (parser->current != last_matched_container &&
      container == last_matched_container && !parser->blank &&
      S_type(parser->current) == CMARK_NODE_PARAGRAPH) {
    add_line(parser->current, input, parser);
    return;
  }

  // Not a lazy continuation: close every block that did not match.
  while (parser->current != last_matched_container) {
    parser->current = finalize(parser, parser->current);
    assert(parser->current != NULL);
  }

  const cmark_node_type kind = S_type(container);

  if (kind == CMARK_NODE_CODE_BLOCK) {
    add_line(container, input, parser);
  } else if (kind == CMARK_NODE_HTML_BLOCK) {
    int block_ended = 0;

    add_line(container, input, parser);

    // Types 1-5 end on a line containing their end marker; other types are
    // never closed here.
    switch (container->as.html_block_type) {
    case 1:
      // </script>, </style>, </pre>
      block_ended = scan_html_block_end_1(input, parser->first_nonspace);
      break;
    case 2:
      // -->
      block_ended = scan_html_block_end_2(input, parser->first_nonspace);
      break;
    case 3:
      // ?>
      block_ended = scan_html_block_end_3(input, parser->first_nonspace);
      break;
    case 4:
      // >
      block_ended = scan_html_block_end_4(input, parser->first_nonspace);
      break;
    case 5:
      // ]]>
      block_ended = scan_html_block_end_5(input, parser->first_nonspace);
      break;
    default:
      break;
    }

    if (block_ended) {
      container = finalize(parser, container);
      assert(parser->current != NULL);
    }
  } else if (!parser->blank) {
    if (accepts_lines(kind)) {
      if (kind == CMARK_NODE_HEADING &&
          container->as.heading.setext == false) {
        chop_trailing_hashtags(input);
      }
    } else {
      // the container cannot take text directly: wrap the line in a paragraph
      container = add_child(parser, container, CMARK_NODE_PARAGRAPH,
                            parser->first_nonspace + 1);
    }

    S_advance_offset(parser, input, parser->first_nonspace - parser->offset,
                     false);
    add_line(container, input, parser);
  }
  // a blank line in any other kind of container adds no text

  parser->current = container;
}
1353 
1354 /* See http://spec.commonmark.org/0.24/#phase-1-block-structure */
S_process_line(cmark_parser * parser,const unsigned char * buffer,bufsize_t bytes)1355 static void S_process_line(cmark_parser *parser, const unsigned char *buffer,
1356                            bufsize_t bytes) {
1357   cmark_node *last_matched_container;
1358   bool all_matched = true;
1359   cmark_node *container;
1360   cmark_chunk input;
1361   cmark_node *current;
1362 
1363   cmark_strbuf_clear(&parser->curline);
1364 
1365   if (parser->options & CMARK_OPT_VALIDATE_UTF8)
1366     cmark_utf8proc_check(&parser->curline, buffer, bytes);
1367   else
1368     cmark_strbuf_put(&parser->curline, buffer, bytes);
1369 
1370   bytes = parser->curline.size;
1371 
1372   // ensure line ends with a newline:
1373   if (bytes == 0 || !S_is_line_end_char(parser->curline.ptr[bytes - 1]))
1374     cmark_strbuf_putc(&parser->curline, '\n');
1375 
1376   parser->offset = 0;
1377   parser->column = 0;
1378   parser->first_nonspace = 0;
1379   parser->first_nonspace_column = 0;
1380   parser->indent = 0;
1381   parser->blank = false;
1382   parser->partially_consumed_tab = false;
1383 
1384   input.data = parser->curline.ptr;
1385   input.len = parser->curline.size;
1386   input.alloc = 0;
1387 
1388   // Skip UTF-8 BOM.
1389   if (parser->line_number == 0 &&
1390       input.len >= 3 &&
1391       memcmp(input.data, "\xef\xbb\xbf", 3) == 0)
1392     parser->offset += 3;
1393 
1394   parser->line_number++;
1395 
1396   last_matched_container = check_open_blocks(parser, &input, &all_matched);
1397 
1398   if (!last_matched_container)
1399     goto finished;
1400 
1401   container = last_matched_container;
1402 
1403   current = parser->current;
1404 
1405   open_new_blocks(parser, &container, &input, all_matched);
1406 
1407   /* parser->current might have changed if feed_reentrant was called */
1408   if (current == parser->current)
1409   add_text_to_container(parser, container, last_matched_container, &input);
1410 
1411 finished:
1412   parser->last_line_length = input.len;
1413   if (parser->last_line_length &&
1414       input.data[parser->last_line_length - 1] == '\n')
1415     parser->last_line_length -= 1;
1416   if (parser->last_line_length &&
1417       input.data[parser->last_line_length - 1] == '\r')
1418     parser->last_line_length -= 1;
1419 
1420   cmark_strbuf_clear(&parser->curline);
1421 }
1422 
cmark_parser_finish(cmark_parser * parser)1423 cmark_node *cmark_parser_finish(cmark_parser *parser) {
1424   cmark_node *res;
1425   cmark_llist *extensions;
1426 
1427   /* Parser was already finished once */
1428   if (parser->root == NULL)
1429     return NULL;
1430 
1431   if (parser->linebuf.size) {
1432     S_process_line(parser, parser->linebuf.ptr, parser->linebuf.size);
1433     cmark_strbuf_clear(&parser->linebuf);
1434   }
1435 
1436   finalize_document(parser);
1437 
1438   cmark_consolidate_text_nodes(parser->root);
1439 
1440   cmark_strbuf_free(&parser->curline);
1441   cmark_strbuf_free(&parser->linebuf);
1442 
1443 #if CMARK_DEBUG_NODES
1444   if (cmark_node_check(parser->root, stderr)) {
1445     abort();
1446   }
1447 #endif
1448 
1449   for (extensions = parser->syntax_extensions; extensions; extensions = extensions->next) {
1450     cmark_syntax_extension *ext = (cmark_syntax_extension *) extensions->data;
1451     if (ext->postprocess_func) {
1452       cmark_node *processed = ext->postprocess_func(ext, parser, parser->root);
1453       if (processed)
1454         parser->root = processed;
1455     }
1456   }
1457 
1458   res = parser->root;
1459   parser->root = NULL;
1460 
1461   cmark_parser_reset(parser);
1462 
1463   return res;
1464 }
1465 
/** Returns the parser's line_number field. */
int cmark_parser_get_line_number(cmark_parser *parser) {
  const int line_number = parser->line_number;

  return line_number;
}
1469 
cmark_parser_get_offset(cmark_parser * parser)1470 bufsize_t cmark_parser_get_offset(cmark_parser *parser) {
1471   return parser->offset;
1472 }
1473 
cmark_parser_get_column(cmark_parser * parser)1474 bufsize_t cmark_parser_get_column(cmark_parser *parser) {
1475   return parser->column;
1476 }
1477 
/** Returns the parser's first_nonspace field for the current line. */
int cmark_parser_get_first_nonspace(cmark_parser *parser) {
  const int first_nonspace = parser->first_nonspace;

  return first_nonspace;
}
1481 
/** Returns the parser's first_nonspace_column field for the current line. */
int cmark_parser_get_first_nonspace_column(cmark_parser *parser) {
  const int first_nonspace_column = parser->first_nonspace_column;

  return first_nonspace_column;
}
1485 
/** Returns the parser's indent field for the current line. */
int cmark_parser_get_indent(cmark_parser *parser) {
  const int indent = parser->indent;

  return indent;
}
1489 
/** Returns the parser's blank flag for the current line, as an int. */
int cmark_parser_is_blank(cmark_parser *parser) {
  const int is_blank = parser->blank;

  return is_blank;
}
1493 
/** Returns the parser's partially_consumed_tab flag, as an int. */
int cmark_parser_has_partially_consumed_tab(cmark_parser *parser) {
  const int has_partial_tab = parser->partially_consumed_tab;

  return has_partial_tab;
}
1497 
/**
 * Returns the parser's last_line_length field, which S_process_line sets to
 * the length of the processed line without its trailing "\n" and/or "\r".
 */
int cmark_parser_get_last_line_length(cmark_parser *parser) {
  const int last_line_length = parser->last_line_length;

  return last_line_length;
}
1501 
/**
 * Public wrapper around the file-local add_child(): forwards all arguments
 * unchanged.
 *
 * Returns: whatever add_child() returns for the given parent, block type and
 * start column.
 */
cmark_node *cmark_parser_add_child(cmark_parser *parser,
                                   cmark_node   *parent,
                                   cmark_node_type block_type,
                                   int start_column) {
  cmark_node *child = add_child(parser, parent, block_type, start_column);

  return child;
}
1508 
/**
 * Public wrapper around S_advance_offset().
 *
 * `input` is wrapped in a chunk with cmark_chunk_literal() (presumably it
 * must be a NUL-terminated string; confirm against cmark_chunk_literal).
 * `count` is forwarded as is; any non-zero `columns` is passed on as true.
 */
void cmark_parser_advance_offset(cmark_parser *parser,
                                 const char *input,
                                 int count,
                                 int columns) {
  cmark_chunk line = cmark_chunk_literal(input);
  const bool count_columns = (columns != 0);

  S_advance_offset(parser, &line, count, count_columns);
}
1517 
/** Stores `func` in the parser's backslash_ispunct field. */
void cmark_parser_set_backslash_ispunct_func(cmark_parser *parser,
                                             cmark_ispunct_func func) {
  cmark_parser *const target = parser;

  target->backslash_ispunct = func;
}
1522 
cmark_parser_get_syntax_extensions(cmark_parser * parser)1523 cmark_llist *cmark_parser_get_syntax_extensions(cmark_parser *parser) {
1524   return parser->syntax_extensions;
1525 }
1526