1 #include <stdlib.h>
2 #include <string.h>
3 #include <stdio.h>
4
5 #include "cmark_ctype.h"
6 #include "config.h"
7 #include "node.h"
8 #include "parser.h"
9 #include "references.h"
10 #include "cmark-gfm.h"
11 #include "houdini.h"
12 #include "utf8.h"
13 #include "scanners.h"
14 #include "inlines.h"
15 #include "syntax_extension.h"
16
// UTF-8 byte sequences for the typographic characters substituted by the
// "smart" punctuation handlers in this file.
static const char *EMDASH = "\xE2\x80\x94";           // U+2014
static const char *ENDASH = "\xE2\x80\x93";           // U+2013
static const char *ELLIPSES = "\xE2\x80\xA6";         // U+2026
static const char *LEFTDOUBLEQUOTE = "\xE2\x80\x9C";  // U+201C
static const char *RIGHTDOUBLEQUOTE = "\xE2\x80\x9D"; // U+201D
static const char *LEFTSINGLEQUOTE = "\xE2\x80\x98";  // U+2018
static const char *RIGHTSINGLEQUOTE = "\xE2\x80\x99"; // U+2019
24
// Macros for creating various kinds of simple inline nodes.
// The first three forward to make_literal() with a fixed node type
// (sc/ec are the start and end columns, s is the literal chunk);
// the remaining four forward to make_simple().
#define make_str(subj, sc, ec, s) make_literal(subj, CMARK_NODE_TEXT, sc, ec, s)
#define make_code(subj, sc, ec, s) make_literal(subj, CMARK_NODE_CODE, sc, ec, s)
#define make_raw_html(subj, sc, ec, s) make_literal(subj, CMARK_NODE_HTML_INLINE, sc, ec, s)
#define make_linebreak(mem) make_simple(mem, CMARK_NODE_LINEBREAK)
#define make_softbreak(mem) make_simple(mem, CMARK_NODE_SOFTBREAK)
#define make_emph(mem) make_simple(mem, CMARK_NODE_EMPH)
#define make_strong(mem) make_simple(mem, CMARK_NODE_STRONG)

// Longest backtick run whose position is recorded in subject.backticks.
// scan_to_closing_backticks() refuses opening runs longer than this.
#define MAXBACKTICKS 80
35
// One entry of the bracket stack kept in subject.last_bracket.
// Entries are created by push_bracket() and released by pop_bracket().
typedef struct bracket {
  struct bracket *previous;             // entry that was on top before this one
  struct delimiter *previous_delimiter; // subject.last_delim at push time
  cmark_node *inl_text;                 // text node handed to push_bracket()
  bufsize_t position;                   // subject.pos at push time
  bool image;                           // flag supplied by the caller of push_bracket()
  bool active;                          // set to true on push
  bool bracket_after;                   // set once a later bracket is pushed above this one
} bracket;
45
// Parsing state for one run of inline content.
typedef struct subject{
  cmark_mem *mem;        // allocator used for nodes, delimiters and brackets
  cmark_chunk input;     // the text being scanned
  int line;              // line number stamped on newly created nodes
  bufsize_t pos;         // current byte offset into input
  int block_offset;      // added to every column computed by make_literal()
  int column_offset;     // added to every column computed by make_literal();
                         // rewritten by adjust_subj_node_newlines()
  cmark_map *refmap;     // reference map supplied to subject_from_buf()
  delimiter *last_delim; // top (most recent entry) of the delimiter stack
  bracket *last_bracket; // top (most recent entry) of the bracket stack
  // backticks[n] holds the start offset of the most recently scanned run of
  // exactly n backticks (see scan_to_closing_backticks()).
  bufsize_t backticks[MAXBACKTICKS + 1];
  // Set once a closing-backtick scan has run to the end of the input
  // without finding a match.
  bool scanned_for_backticks;
} subject;
59
// Byte-indexed table. A nonzero entry marks a byte that scan_delims() steps
// over when it looks for the characters on either side of a delimiter run.
// Extensions may populate this.
static int8_t SKIP_CHARS[256];
62
S_is_line_end_char(char c)63 static CMARK_INLINE bool S_is_line_end_char(char c) {
64 return (c == '\n' || c == '\r');
65 }
66
// Forward declarations for functions that are used before their definition.

// Wraps the inlines between `opener` and `closer` in an emphasis or strong
// node; returns the delimiter from which processing should continue.
static delimiter *S_insert_emph(subject *subj, delimiter *opener,
                                delimiter *closer);

static int parse_inline(cmark_parser *parser, subject *subj, cmark_node *parent, int options);

// Initializes `e` so that scanning starts at offset 0 of `buffer`.
static void subject_from_buf(cmark_mem *mem, int line_number, int block_offset, subject *e,
                             cmark_chunk *buffer, cmark_map *refmap);
static bufsize_t subject_find_special_char(subject *subj, int options);
75
76 // Create an inline with a literal string value.
make_literal(subject * subj,cmark_node_type t,int start_column,int end_column,cmark_chunk s)77 static CMARK_INLINE cmark_node *make_literal(subject *subj, cmark_node_type t,
78 int start_column, int end_column,
79 cmark_chunk s) {
80 cmark_node *e = (cmark_node *)subj->mem->calloc(1, sizeof(*e));
81 cmark_strbuf_init(subj->mem, &e->content, 0);
82 e->type = (uint16_t)t;
83 e->as.literal = s;
84 e->start_line = e->end_line = subj->line;
85 // columns are 1 based.
86 e->start_column = start_column + 1 + subj->column_offset + subj->block_offset;
87 e->end_column = end_column + 1 + subj->column_offset + subj->block_offset;
88 return e;
89 }
90
91 // Create an inline with no value.
make_simple(cmark_mem * mem,cmark_node_type t)92 static CMARK_INLINE cmark_node *make_simple(cmark_mem *mem, cmark_node_type t) {
93 cmark_node *e = (cmark_node *)mem->calloc(1, sizeof(*e));
94 cmark_strbuf_init(mem, &e->content, 0);
95 e->type = (uint16_t)t;
96 return e;
97 }
98
99 // Like make_str, but parses entities.
make_str_with_entities(subject * subj,int start_column,int end_column,cmark_chunk * content)100 static cmark_node *make_str_with_entities(subject *subj,
101 int start_column, int end_column,
102 cmark_chunk *content) {
103 cmark_strbuf unescaped = CMARK_BUF_INIT(subj->mem);
104
105 if (houdini_unescape_html(&unescaped, content->data, content->len)) {
106 return make_str(subj, start_column, end_column, cmark_chunk_buf_detach(&unescaped));
107 } else {
108 return make_str(subj, start_column, end_column, *content);
109 }
110 }
111
112 // Duplicate a chunk by creating a copy of the buffer not by reusing the
113 // buffer like cmark_chunk_dup does.
chunk_clone(cmark_mem * mem,cmark_chunk * src)114 static cmark_chunk chunk_clone(cmark_mem *mem, cmark_chunk *src) {
115 cmark_chunk c;
116 bufsize_t len = src->len;
117
118 c.len = len;
119 c.data = (unsigned char *)mem->calloc(len + 1, 1);
120 c.alloc = 1;
121 if (len)
122 memcpy(c.data, src->data, len);
123 c.data[len] = '\0';
124
125 return c;
126 }
127
// Build the destination for an autolink.
//
// `url` is trimmed in place. An empty result yields an empty chunk;
// otherwise the entity-unescaped text is returned in a newly detached
// buffer, prefixed with "mailto:" when `is_email` is nonzero.
static cmark_chunk cmark_clean_autolink(cmark_mem *mem, cmark_chunk *url,
                                        int is_email) {
  cmark_chunk_trim(url);

  if (url->len == 0) {
    cmark_chunk empty = CMARK_CHUNK_EMPTY;
    return empty;
  }

  cmark_strbuf out = CMARK_BUF_INIT(mem);

  if (is_email) {
    cmark_strbuf_puts(&out, "mailto:");
  }
  houdini_unescape_html_f(&out, url->data, url->len);

  return cmark_chunk_buf_detach(&out);
}
145
make_autolink(subject * subj,int start_column,int end_column,cmark_chunk url,int is_email)146 static CMARK_INLINE cmark_node *make_autolink(subject *subj,
147 int start_column, int end_column,
148 cmark_chunk url, int is_email) {
149 cmark_node *link = make_simple(subj->mem, CMARK_NODE_LINK);
150 link->as.link.url = cmark_clean_autolink(subj->mem, &url, is_email);
151 link->as.link.title = cmark_chunk_literal("");
152 link->start_line = link->end_line = subj->line;
153 link->start_column = start_column + 1;
154 link->end_column = end_column + 1;
155 cmark_node_append_child(link, make_str_with_entities(subj, start_column + 1, end_column - 1, &url));
156 return link;
157 }
158
// Initialize `e` for scanning `chunk` from its first byte.
// Both stacks start empty, the column offset is zero, and the backtick
// position cache is cleared.
static void subject_from_buf(cmark_mem *mem, int line_number, int block_offset, subject *e,
                             cmark_chunk *chunk, cmark_map *refmap) {
  e->mem = mem;
  e->refmap = refmap;
  e->input = *chunk;
  e->pos = 0;

  e->line = line_number;
  e->block_offset = block_offset;
  e->column_offset = 0;

  e->last_delim = NULL;
  e->last_bracket = NULL;

  memset(e->backticks, 0, sizeof(e->backticks));
  e->scanned_for_backticks = false;
}
176
isbacktick(int c)177 static CMARK_INLINE int isbacktick(int c) { return (c == '`'); }
178
peek_char_n(subject * subj,bufsize_t n)179 static CMARK_INLINE unsigned char peek_char_n(subject *subj, bufsize_t n) {
180 // NULL bytes should have been stripped out by now. If they're
181 // present, it's a programming error:
182 assert(!(subj->pos + n < subj->input.len && subj->input.data[subj->pos + n] == 0));
183 return (subj->pos + n < subj->input.len) ? subj->input.data[subj->pos + n] : 0;
184 }
185
// Return the byte at the current position without consuming it, or 0 when
// the position is at or past the end of the input (see peek_char_n).
static CMARK_INLINE unsigned char peek_char(subject *subj) {
  return peek_char_n(subj, 0);
}
189
// Return the byte at absolute offset `pos` in the subject's input.
// No bounds check is performed; the caller must pass a valid offset.
static CMARK_INLINE unsigned char peek_at(subject *subj, bufsize_t pos) {
  return subj->input.data[pos];
}
193
194 // Return true if there are more characters in the subject.
is_eof(subject * subj)195 static CMARK_INLINE int is_eof(subject *subj) {
196 return (subj->pos >= subj->input.len);
197 }
198
// Advance the subject by one byte. Doesn't check for eof.
// The expansion is fully parenthesized so the macro behaves as a single
// expression wherever it is used.
#define advance(subj) ((subj)->pos += 1)
201
skip_spaces(subject * subj)202 static CMARK_INLINE bool skip_spaces(subject *subj) {
203 bool skipped = false;
204 while (peek_char(subj) == ' ' || peek_char(subj) == '\t') {
205 advance(subj);
206 skipped = true;
207 }
208 return skipped;
209 }
210
skip_line_end(subject * subj)211 static CMARK_INLINE bool skip_line_end(subject *subj) {
212 bool seen_line_end_char = false;
213 if (peek_char(subj) == '\r') {
214 advance(subj);
215 seen_line_end_char = true;
216 }
217 if (peek_char(subj) == '\n') {
218 advance(subj);
219 seen_line_end_char = true;
220 }
221 return seen_line_end_char || is_eof(subj);
222 }
223
224 // Take characters while a predicate holds, and return a string.
take_while(subject * subj,int (* f)(int))225 static CMARK_INLINE cmark_chunk take_while(subject *subj, int (*f)(int)) {
226 unsigned char c;
227 bufsize_t startpos = subj->pos;
228 bufsize_t len = 0;
229
230 while ((c = peek_char(subj)) && (*f)(c)) {
231 advance(subj);
232 len++;
233 }
234
235 return cmark_chunk_dup(&subj->input, startpos, len);
236 }
237
238 // Return the number of newlines in a given span of text in a subject. If
239 // the number is greater than zero, also return the number of characters
240 // between the last newline and the end of the span in `since_newline`.
count_newlines(subject * subj,bufsize_t from,bufsize_t len,int * since_newline)241 static int count_newlines(subject *subj, bufsize_t from, bufsize_t len, int *since_newline) {
242 int nls = 0;
243 int since_nl = 0;
244
245 while (len--) {
246 if (subj->input.data[from++] == '\n') {
247 ++nls;
248 since_nl = 0;
249 } else {
250 ++since_nl;
251 }
252 }
253
254 if (!nls)
255 return 0;
256
257 *since_newline = since_nl;
258 return nls;
259 }
260
261 // Adjust `node`'s `end_line`, `end_column`, and `subj`'s `line` and
262 // `column_offset` according to the number of newlines in a just-matched span
263 // of text in `subj`.
adjust_subj_node_newlines(subject * subj,cmark_node * node,int matchlen,int extra,int options)264 static void adjust_subj_node_newlines(subject *subj, cmark_node *node, int matchlen, int extra, int options) {
265 if (!(options & CMARK_OPT_SOURCEPOS)) {
266 return;
267 }
268
269 int since_newline;
270 int newlines = count_newlines(subj, subj->pos - matchlen - extra, matchlen, &since_newline);
271 if (newlines) {
272 subj->line += newlines;
273 node->end_line += newlines;
274 node->end_column = since_newline;
275 subj->column_offset = -subj->pos + since_newline + extra;
276 }
277 }
278
279 // Try to process a backtick code span that began with a
280 // span of ticks of length openticklength length (already
281 // parsed). Return 0 if you don't find matching closing
282 // backticks, otherwise return the position in the subject
283 // after the closing backticks.
scan_to_closing_backticks(subject * subj,bufsize_t openticklength)284 static bufsize_t scan_to_closing_backticks(subject *subj,
285 bufsize_t openticklength) {
286
287 bool found = false;
288 if (openticklength > MAXBACKTICKS) {
289 // we limit backtick string length because of the array subj->backticks:
290 return 0;
291 }
292 if (subj->scanned_for_backticks &&
293 subj->backticks[openticklength] <= subj->pos) {
294 // return if we already know there's no closer
295 return 0;
296 }
297 while (!found) {
298 // read non backticks
299 unsigned char c;
300 while ((c = peek_char(subj)) && c != '`') {
301 advance(subj);
302 }
303 if (is_eof(subj)) {
304 break;
305 }
306 bufsize_t numticks = 0;
307 while (peek_char(subj) == '`') {
308 advance(subj);
309 numticks++;
310 }
311 // store position of ender
312 if (numticks <= MAXBACKTICKS) {
313 subj->backticks[numticks] = subj->pos - numticks;
314 }
315 if (numticks == openticklength) {
316 return (subj->pos);
317 }
318 }
319 // got through whole input without finding closer
320 subj->scanned_for_backticks = true;
321 return 0;
322 }
323
324 // Destructively modify string, converting newlines to
325 // spaces, then removing a single leading + trailing space.
S_normalize_code(cmark_strbuf * s)326 static void S_normalize_code(cmark_strbuf *s) {
327 bufsize_t r, w;
328
329 for (r = 0, w = 0; r < s->size; ++r) {
330 switch (s->ptr[r]) {
331 case '\r':
332 if (s->ptr[r + 1] != '\n') {
333 s->ptr[w++] = ' ';
334 }
335 break;
336 case '\n':
337 s->ptr[w++] = ' ';
338 break;
339 default:
340 s->ptr[w++] = s->ptr[r];
341 }
342 }
343
344 // begins and ends with space?
345 if (s->ptr[0] == ' ' && s->ptr[w - 1] == ' ') {
346 cmark_strbuf_drop(s, 1);
347 cmark_strbuf_truncate(s, w - 2);
348 } else {
349 cmark_strbuf_truncate(s, w);
350 }
351
352 }
353
354
355 // Parse backtick code section or raw backticks, return an inline.
356 // Assumes that the subject has a backtick at the current position.
handle_backticks(subject * subj,int options)357 static cmark_node *handle_backticks(subject *subj, int options) {
358 cmark_chunk openticks = take_while(subj, isbacktick);
359 bufsize_t startpos = subj->pos;
360 bufsize_t endpos = scan_to_closing_backticks(subj, openticks.len);
361
362 if (endpos == 0) { // not found
363 subj->pos = startpos; // rewind
364 return make_str(subj, subj->pos, subj->pos, openticks);
365 } else {
366 cmark_strbuf buf = CMARK_BUF_INIT(subj->mem);
367
368 cmark_strbuf_set(&buf, subj->input.data + startpos,
369 endpos - startpos - openticks.len);
370 S_normalize_code(&buf);
371
372 cmark_node *node = make_code(subj, startpos, endpos - openticks.len - 1, cmark_chunk_buf_detach(&buf));
373 adjust_subj_node_newlines(subj, node, endpos - startpos, openticks.len, options);
374 return node;
375 }
376 }
377
378
379 // Scan ***, **, or * and return number scanned, or 0.
380 // Advances position.
scan_delims(subject * subj,unsigned char c,bool * can_open,bool * can_close)381 static int scan_delims(subject *subj, unsigned char c, bool *can_open,
382 bool *can_close) {
383 int numdelims = 0;
384 bufsize_t before_char_pos, after_char_pos;
385 int32_t after_char = 0;
386 int32_t before_char = 0;
387 int len;
388 bool left_flanking, right_flanking;
389
390 if (subj->pos == 0) {
391 before_char = 10;
392 } else {
393 before_char_pos = subj->pos - 1;
394 // walk back to the beginning of the UTF_8 sequence:
395 while ((peek_at(subj, before_char_pos) >> 6 == 2 || SKIP_CHARS[peek_at(subj, before_char_pos)]) && before_char_pos > 0) {
396 before_char_pos -= 1;
397 }
398 len = cmark_utf8proc_iterate(subj->input.data + before_char_pos,
399 subj->pos - before_char_pos, &before_char);
400 if (len == -1 || (before_char < 256 && SKIP_CHARS[(unsigned char) before_char])) {
401 before_char = 10;
402 }
403 }
404
405 if (c == '\'' || c == '"') {
406 numdelims++;
407 advance(subj); // limit to 1 delim for quotes
408 } else {
409 while (peek_char(subj) == c) {
410 numdelims++;
411 advance(subj);
412 }
413 }
414
415 if (subj->pos == subj->input.len) {
416 after_char = 10;
417 } else {
418 after_char_pos = subj->pos;
419 while (SKIP_CHARS[peek_at(subj, after_char_pos)] && after_char_pos < subj->input.len) {
420 after_char_pos += 1;
421 }
422 len = cmark_utf8proc_iterate(subj->input.data + after_char_pos,
423 subj->input.len - after_char_pos, &after_char);
424 if (len == -1 || (after_char < 256 && SKIP_CHARS[(unsigned char) after_char])) {
425 after_char = 10;
426 }
427 }
428
429 left_flanking = numdelims > 0 && !cmark_utf8proc_is_space(after_char) &&
430 (!cmark_utf8proc_is_punctuation(after_char) ||
431 cmark_utf8proc_is_space(before_char) ||
432 cmark_utf8proc_is_punctuation(before_char));
433 right_flanking = numdelims > 0 && !cmark_utf8proc_is_space(before_char) &&
434 (!cmark_utf8proc_is_punctuation(before_char) ||
435 cmark_utf8proc_is_space(after_char) ||
436 cmark_utf8proc_is_punctuation(after_char));
437 if (c == '_') {
438 *can_open = left_flanking &&
439 (!right_flanking || cmark_utf8proc_is_punctuation(before_char));
440 *can_close = right_flanking &&
441 (!left_flanking || cmark_utf8proc_is_punctuation(after_char));
442 } else if (c == '\'' || c == '"') {
443 *can_open = left_flanking && !right_flanking &&
444 before_char != ']' && before_char != ')';
445 *can_close = right_flanking;
446 } else {
447 *can_open = left_flanking;
448 *can_close = right_flanking;
449 }
450 return numdelims;
451 }
452
453 /*
454 static void print_delimiters(subject *subj)
455 {
456 delimiter *delim;
457 delim = subj->last_delim;
458 while (delim != NULL) {
459 printf("Item at stack pos %p: %d %d %d next(%p) prev(%p)\n",
460 (void*)delim, delim->delim_char,
461 delim->can_open, delim->can_close,
462 (void*)delim->next, (void*)delim->previous);
463 delim = delim->previous;
464 }
465 }
466 */
467
// Unlink `delim` from the subject's doubly linked delimiter stack and free
// it. A NULL `delim` is ignored. If `delim` is the top of the stack, the
// stack top moves to its predecessor.
static void remove_delimiter(subject *subj, delimiter *delim) {
  if (delim == NULL) {
    return;
  }

  delimiter *older = delim->previous;
  delimiter *newer = delim->next;

  if (newer != NULL) {
    newer->previous = older;
  } else {
    // end of list:
    assert(delim == subj->last_delim);
    subj->last_delim = older;
  }

  if (older != NULL) {
    older->next = newer;
  }

  subj->mem->free(delim);
}
483
// Discard the top entry of the bracket stack, if there is one, and free it.
static void pop_bracket(subject *subj) {
  bracket *top = subj->last_bracket;

  if (top != NULL) {
    subj->last_bracket = top->previous;
    subj->mem->free(top);
  }
}
492
// Push a new entry for delimiter character `c` onto the subject's delimiter
// stack. The entry records whether the run can open/close, the text node
// that holds the run, and the run's length (taken from that node's literal).
static void push_delimiter(subject *subj, unsigned char c, bool can_open,
                           bool can_close, cmark_node *inl_text) {
  delimiter *top = subj->last_delim;
  delimiter *entry = (delimiter *)subj->mem->calloc(1, sizeof(delimiter));

  entry->delim_char = c;
  entry->can_open = can_open;
  entry->can_close = can_close;
  entry->inl_text = inl_text;
  entry->length = inl_text->as.literal.len;

  // Link the entry in as the new top of the stack.
  entry->next = NULL;
  entry->previous = top;
  if (top != NULL) {
    top->next = entry;
  }
  subj->last_delim = entry;
}
508
// Push a new, active entry onto the subject's bracket stack. The entry
// remembers the current position and the current top of the delimiter
// stack. The previous top entry, if any, is marked as having a bracket
// after it.
static void push_bracket(subject *subj, bool image, cmark_node *inl_text) {
  bracket *below = subj->last_bracket;
  bracket *entry = (bracket *)subj->mem->calloc(1, sizeof(bracket));

  if (below != NULL) {
    below->bracket_after = true;
  }

  entry->previous = below;
  entry->previous_delimiter = subj->last_delim;
  entry->inl_text = inl_text;
  entry->position = subj->pos;
  entry->image = image;
  entry->active = true;
  entry->bracket_after = false;

  subj->last_bracket = entry;
}
523
524 // Assumes the subject has a c at the current position.
handle_delim(subject * subj,unsigned char c,bool smart)525 static cmark_node *handle_delim(subject *subj, unsigned char c, bool smart) {
526 bufsize_t numdelims;
527 cmark_node *inl_text;
528 bool can_open, can_close;
529 cmark_chunk contents;
530
531 numdelims = scan_delims(subj, c, &can_open, &can_close);
532
533 if (c == '\'' && smart) {
534 contents = cmark_chunk_literal(RIGHTSINGLEQUOTE);
535 } else if (c == '"' && smart) {
536 contents =
537 cmark_chunk_literal(can_close ? RIGHTDOUBLEQUOTE : LEFTDOUBLEQUOTE);
538 } else {
539 contents = cmark_chunk_dup(&subj->input, subj->pos - numdelims, numdelims);
540 }
541
542 inl_text = make_str(subj, subj->pos - numdelims, subj->pos - 1, contents);
543
544 if ((can_open || can_close) && (!(c == '\'' || c == '"') || smart)) {
545 push_delimiter(subj, c, can_open, can_close, inl_text);
546 }
547
548 return inl_text;
549 }
550
551 // Assumes we have a hyphen at the current position.
handle_hyphen(subject * subj,bool smart)552 static cmark_node *handle_hyphen(subject *subj, bool smart) {
553 int startpos = subj->pos;
554
555 advance(subj);
556
557 if (!smart || peek_char(subj) != '-') {
558 return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("-"));
559 }
560
561 while (smart && peek_char(subj) == '-') {
562 advance(subj);
563 }
564
565 int numhyphens = subj->pos - startpos;
566 int en_count = 0;
567 int em_count = 0;
568 int i;
569 cmark_strbuf buf = CMARK_BUF_INIT(subj->mem);
570
571 if (numhyphens % 3 == 0) { // if divisible by 3, use all em dashes
572 em_count = numhyphens / 3;
573 } else if (numhyphens % 2 == 0) { // if divisible by 2, use all en dashes
574 en_count = numhyphens / 2;
575 } else if (numhyphens % 3 == 2) { // use one en dash at end
576 en_count = 1;
577 em_count = (numhyphens - 2) / 3;
578 } else { // use two en dashes at the end
579 en_count = 2;
580 em_count = (numhyphens - 4) / 3;
581 }
582
583 for (i = em_count; i > 0; i--) {
584 cmark_strbuf_puts(&buf, EMDASH);
585 }
586
587 for (i = en_count; i > 0; i--) {
588 cmark_strbuf_puts(&buf, ENDASH);
589 }
590
591 return make_str(subj, startpos, subj->pos - 1, cmark_chunk_buf_detach(&buf));
592 }
593
594 // Assumes we have a period at the current position.
handle_period(subject * subj,bool smart)595 static cmark_node *handle_period(subject *subj, bool smart) {
596 advance(subj);
597 if (smart && peek_char(subj) == '.') {
598 advance(subj);
599 if (peek_char(subj) == '.') {
600 advance(subj);
601 return make_str(subj, subj->pos - 3, subj->pos - 1, cmark_chunk_literal(ELLIPSES));
602 } else {
603 return make_str(subj, subj->pos - 2, subj->pos - 1, cmark_chunk_literal(".."));
604 }
605 } else {
606 return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("."));
607 }
608 }
609
get_extension_for_special_char(cmark_parser * parser,unsigned char c)610 static cmark_syntax_extension *get_extension_for_special_char(cmark_parser *parser, unsigned char c) {
611 cmark_llist *tmp_ext;
612
613 for (tmp_ext = parser->inline_syntax_extensions; tmp_ext; tmp_ext=tmp_ext->next) {
614 cmark_syntax_extension *ext = (cmark_syntax_extension *) tmp_ext->data;
615 cmark_llist *tmp_char;
616 for (tmp_char = ext->special_inline_chars; tmp_char; tmp_char=tmp_char->next) {
617 unsigned char tmp_c = (unsigned char)(size_t)tmp_char->data;
618
619 if (tmp_c == c) {
620 return ext;
621 }
622 }
623 }
624
625 return NULL;
626 }
627
// Resolve the delimiters on the subject's delimiter stack that lie above
// `stack_bottom` (NULL means the whole stack).
//
// The stack is walked from the oldest relevant entry to the newest. For each
// entry that can close, the entries below it are searched, newest first, for
// an opener with the same delimiter character. A matched pair is handed to
// the owning extension's insert_inline_from_delim callback, to S_insert_emph
// (for '*' and '_'), or has its text nodes rewritten to curly quotes (for
// ' and "). When the walk is done, every delimiter above `stack_bottom` is
// removed and freed.
static void process_emphasis(cmark_parser *parser, subject *subj, delimiter *stack_bottom) {
  delimiter *closer = subj->last_delim;
  delimiter *opener;
  delimiter *old_closer;
  bool opener_found;
  bool odd_match;
  // Lower bound for opener searches, indexed by (closer length % 3) and by
  // delimiter character.
  // NOTE(review): the second dimension is 128 but delim_char is an
  // unsigned char; a delimiter byte >= 128 would index out of bounds.
  // Confirm that extensions only register ASCII delimiter characters.
  delimiter *openers_bottom[3][128];
  int i;

  // initialize openers_bottom:
  memset(&openers_bottom, 0, sizeof(openers_bottom));
  for (i=0; i < 3; i++) {
    openers_bottom[i]['*'] = stack_bottom;
    openers_bottom[i]['_'] = stack_bottom;
    openers_bottom[i]['\''] = stack_bottom;
    openers_bottom[i]['"'] = stack_bottom;
  }

  // move back to first relevant delim.
  while (closer != NULL && closer->previous != stack_bottom) {
    closer = closer->previous;
  }

  // now move forward, looking for closers, and handling each
  while (closer != NULL) {
    cmark_syntax_extension *extension = get_extension_for_special_char(parser, closer->delim_char);
    if (closer->can_close) {
      // Now look backwards for first matching opener:
      opener = closer->previous;
      opener_found = false;
      odd_match = false;
      while (opener != NULL && opener != stack_bottom &&
             opener != openers_bottom[closer->length % 3][closer->delim_char]) {
        if (opener->can_open && opener->delim_char == closer->delim_char) {
          // interior closer of size 2 can't match opener of size 1
          // or of size 1 can't match 2
          odd_match = (closer->can_open || opener->can_close) &&
                      ((opener->length + closer->length) % 3 == 0);
          if (!odd_match) {
            opener_found = true;
            break;
          }
        }
        opener = opener->previous;
      }
      // Remember the closer under examination: the branches below move
      // `closer` on to the next delimiter to process.
      old_closer = closer;

      if (extension) {
        if (opener_found)
          closer = extension->insert_inline_from_delim(extension, parser, subj, opener, closer);
        else
          closer = closer->next;
      } else if (closer->delim_char == '*' || closer->delim_char == '_') {
        if (opener_found) {
          closer = S_insert_emph(subj, opener, closer);
        } else {
          closer = closer->next;
        }
      } else if (closer->delim_char == '\'') {
        // A closing single quote always becomes a right quote; a matched
        // opener becomes a left quote.
        cmark_chunk_free(subj->mem, &closer->inl_text->as.literal);
        closer->inl_text->as.literal = cmark_chunk_literal(RIGHTSINGLEQUOTE);
        if (opener_found) {
          cmark_chunk_free(subj->mem, &opener->inl_text->as.literal);
          opener->inl_text->as.literal = cmark_chunk_literal(LEFTSINGLEQUOTE);
        }
        closer = closer->next;
      } else if (closer->delim_char == '"') {
        // Same treatment for double quotes.
        cmark_chunk_free(subj->mem, &closer->inl_text->as.literal);
        closer->inl_text->as.literal = cmark_chunk_literal(RIGHTDOUBLEQUOTE);
        if (opener_found) {
          cmark_chunk_free(subj->mem, &opener->inl_text->as.literal);
          opener->inl_text->as.literal = cmark_chunk_literal(LEFTDOUBLEQUOTE);
        }
        closer = closer->next;
      }
      if (!opener_found) {
        // set lower bound for future searches for openers
        openers_bottom[old_closer->length % 3][old_closer->delim_char] =
            old_closer->previous;
        if (!old_closer->can_open) {
          // we can remove a closer that can't be an
          // opener, once we've seen there's no
          // matching opener:
          remove_delimiter(subj, old_closer);
        }
      }
    } else {
      closer = closer->next;
    }
  }
  // free all delimiters in list until stack_bottom:
  while (subj->last_delim != NULL && subj->last_delim != stack_bottom) {
    remove_delimiter(subj, subj->last_delim);
  }
}
723
// Turn a matched opener/closer pair into an emphasis or strong node.
//
// Two delimiter characters are used from each side (giving a strong node)
// when both runs have at least two characters left; otherwise one is used
// (giving an emph node). The used characters are trimmed from the runs'
// text nodes, every delimiter between the pair is dropped, and the inlines
// between the two text nodes become children of the new node, which is
// inserted right after the opener's text node. Runs that are used up are
// removed together with their text nodes.
//
// Returns the delimiter to continue processing from: `closer` itself if it
// still has characters left, otherwise the delimiter that followed it.
static delimiter *S_insert_emph(subject *subj, delimiter *opener,
                                delimiter *closer) {
  delimiter *delim, *tmp_delim;
  bufsize_t use_delims;
  cmark_node *opener_inl = opener->inl_text;
  cmark_node *closer_inl = closer->inl_text;
  bufsize_t opener_num_chars = opener_inl->as.literal.len;
  bufsize_t closer_num_chars = closer_inl->as.literal.len;
  cmark_node *tmp, *tmpnext, *emph;

  // calculate the actual number of characters used from this closer
  use_delims = (closer_num_chars >= 2 && opener_num_chars >= 2) ? 2 : 1;

  // remove used characters from associated inlines.
  opener_num_chars -= use_delims;
  closer_num_chars -= use_delims;
  opener_inl->as.literal.len = opener_num_chars;
  closer_inl->as.literal.len = closer_num_chars;

  // free delimiters between opener and closer
  delim = closer->previous;
  while (delim != NULL && delim != opener) {
    tmp_delim = delim->previous;
    remove_delimiter(subj, delim);
    delim = tmp_delim;
  }

  // create new emph or strong, and splice it in to our inlines
  // between the opener and closer
  emph = use_delims == 1 ? make_emph(subj->mem) : make_strong(subj->mem);

  // Move every inline between the two text nodes under the new node.
  // The next sibling is saved first because appending relinks `tmp`.
  tmp = opener_inl->next;
  while (tmp && tmp != closer_inl) {
    tmpnext = tmp->next;
    cmark_node_append_child(emph, tmp);
    tmp = tmpnext;
  }
  cmark_node_insert_after(opener_inl, emph);

  emph->start_line = emph->end_line = subj->line;
  emph->start_column = opener_inl->start_column + subj->column_offset;
  emph->end_column = closer_inl->end_column + subj->column_offset;

  // if opener has 0 characters, remove it and its associated inline
  if (opener_num_chars == 0) {
    cmark_node_free(opener_inl);
    remove_delimiter(subj, opener);
  }

  // if closer has 0 characters, remove it and its associated inline
  if (closer_num_chars == 0) {
    // remove empty closer inline
    cmark_node_free(closer_inl);
    // remove closer from list
    tmp_delim = closer->next;
    remove_delimiter(subj, closer);
    closer = tmp_delim;
  }

  return closer;
}
785
786 // Parse backslash-escape or just a backslash, returning an inline.
handle_backslash(cmark_parser * parser,subject * subj)787 static cmark_node *handle_backslash(cmark_parser *parser, subject *subj) {
788 advance(subj);
789 unsigned char nextchar = peek_char(subj);
790 if ((parser->backslash_ispunct ? parser->backslash_ispunct : cmark_ispunct)(nextchar)) {
791 // only ascii symbols and newline can be escaped
792 advance(subj);
793 return make_str(subj, subj->pos - 2, subj->pos - 1, cmark_chunk_dup(&subj->input, subj->pos - 1, 1));
794 } else if (!is_eof(subj) && skip_line_end(subj)) {
795 return make_linebreak(subj->mem);
796 } else {
797 return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("\\"));
798 }
799 }
800
801 // Parse an entity or a regular "&" string.
802 // Assumes the subject has an '&' character at the current position.
handle_entity(subject * subj)803 static cmark_node *handle_entity(subject *subj) {
804 cmark_strbuf ent = CMARK_BUF_INIT(subj->mem);
805 bufsize_t len;
806
807 advance(subj);
808
809 len = houdini_unescape_ent(&ent, subj->input.data + subj->pos,
810 subj->input.len - subj->pos);
811
812 if (len == 0)
813 return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("&"));
814
815 subj->pos += len;
816 return make_str(subj, subj->pos - 1 - len, subj->pos - 1, cmark_chunk_buf_detach(&ent));
817 }
818
819 // Clean a URL: remove surrounding whitespace, and remove \ that escape
820 // punctuation.
cmark_clean_url(cmark_mem * mem,cmark_chunk * url)821 cmark_chunk cmark_clean_url(cmark_mem *mem, cmark_chunk *url) {
822 cmark_strbuf buf = CMARK_BUF_INIT(mem);
823
824 cmark_chunk_trim(url);
825
826 if (url->len == 0) {
827 cmark_chunk result = CMARK_CHUNK_EMPTY;
828 return result;
829 }
830
831 houdini_unescape_html_f(&buf, url->data, url->len);
832
833 cmark_strbuf_unescape(&buf);
834 return cmark_chunk_buf_detach(&buf);
835 }
836
// Clean a link title. An empty title yields an empty chunk. Otherwise a
// matching pair of surrounding delimiters ('...', "..." or (...)) is
// dropped, HTML entities are decoded and backslash escapes removed; the
// result is returned in a new buffer.
cmark_chunk cmark_clean_title(cmark_mem *mem, cmark_chunk *title) {
  if (title->len == 0) {
    cmark_chunk empty = CMARK_CHUNK_EMPTY;
    return empty;
  }

  const unsigned char first = title->data[0];
  const unsigned char last = title->data[title->len - 1];

  // remove surrounding quotes if any:
  const bool quoted = (first == '\'' && last == '\'') ||
                      (first == '(' && last == ')') ||
                      (first == '"' && last == '"');
  const bufsize_t trim = quoted ? 1 : 0;

  cmark_strbuf cleaned = CMARK_BUF_INIT(mem);
  houdini_unescape_html_f(&cleaned, title->data + trim, title->len - 2 * trim);
  cmark_strbuf_unescape(&cleaned);

  return cmark_chunk_buf_detach(&cleaned);
}
860
861 // Parse an autolink or HTML tag.
862 // Assumes the subject has a '<' character at the current position.
handle_pointy_brace(subject * subj,int options)863 static cmark_node *handle_pointy_brace(subject *subj, int options) {
864 bufsize_t matchlen = 0;
865 cmark_chunk contents;
866
867 advance(subj); // advance past first <
868
869 // first try to match a URL autolink
870 matchlen = scan_autolink_uri(&subj->input, subj->pos);
871 if (matchlen > 0) {
872 contents = cmark_chunk_dup(&subj->input, subj->pos, matchlen - 1);
873 subj->pos += matchlen;
874
875 return make_autolink(subj, subj->pos - 1 - matchlen, subj->pos - 1, contents, 0);
876 }
877
878 // next try to match an email autolink
879 matchlen = scan_autolink_email(&subj->input, subj->pos);
880 if (matchlen > 0) {
881 contents = cmark_chunk_dup(&subj->input, subj->pos, matchlen - 1);
882 subj->pos += matchlen;
883
884 return make_autolink(subj, subj->pos - 1 - matchlen, subj->pos - 1, contents, 1);
885 }
886
887 // finally, try to match an html tag
888 matchlen = scan_html_tag(&subj->input, subj->pos);
889 if (matchlen > 0) {
890 contents = cmark_chunk_dup(&subj->input, subj->pos - 1, matchlen + 1);
891 subj->pos += matchlen;
892 cmark_node *node = make_raw_html(subj, subj->pos - matchlen - 1, subj->pos - 1, contents);
893 adjust_subj_node_newlines(subj, node, matchlen, 1, options);
894 return node;
895 }
896
897 if (options & CMARK_OPT_LIBERAL_HTML_TAG) {
898 matchlen = scan_liberal_html_tag(&subj->input, subj->pos);
899 if (matchlen > 0) {
900 contents = cmark_chunk_dup(&subj->input, subj->pos - 1, matchlen + 1);
901 subj->pos += matchlen;
902 cmark_node *node = make_raw_html(subj, subj->pos - matchlen - 1, subj->pos - 1, contents);
903 adjust_subj_node_newlines(subj, node, matchlen, 1, options);
904 return node;
905 }
906 }
907
908 // if nothing matches, just return the opening <:
909 return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("<"));
910 }
911
912 // Parse a link label. Returns 1 if successful.
913 // Note: unescaped brackets are not allowed in labels.
914 // The label begins with `[` and ends with the first `]` character
915 // encountered. Backticks in labels do not start code spans.
link_label(subject * subj,cmark_chunk * raw_label)916 static int link_label(subject *subj, cmark_chunk *raw_label) {
917 bufsize_t startpos = subj->pos;
918 int length = 0;
919 unsigned char c;
920
921 // advance past [
922 if (peek_char(subj) == '[') {
923 advance(subj);
924 } else {
925 return 0;
926 }
927
928 while ((c = peek_char(subj)) && c != '[' && c != ']') {
929 if (c == '\\') {
930 advance(subj);
931 length++;
932 if (cmark_ispunct(peek_char(subj))) {
933 advance(subj);
934 length++;
935 }
936 } else {
937 advance(subj);
938 length++;
939 }
940 if (length > MAX_LINK_LABEL_LENGTH) {
941 goto noMatch;
942 }
943 }
944
945 if (c == ']') { // match found
946 *raw_label =
947 cmark_chunk_dup(&subj->input, startpos + 1, subj->pos - (startpos + 1));
948 cmark_chunk_trim(raw_label);
949 advance(subj); // advance past ]
950 return 1;
951 }
952
953 noMatch:
954 subj->pos = startpos; // rewind
955 return 0;
956 }
957
manual_scan_link_url_2(cmark_chunk * input,bufsize_t offset,cmark_chunk * output)958 static bufsize_t manual_scan_link_url_2(cmark_chunk *input, bufsize_t offset,
959 cmark_chunk *output) {
960 bufsize_t i = offset;
961 size_t nb_p = 0;
962
963 while (i < input->len) {
964 if (input->data[i] == '\\' &&
965 i + 1 < input-> len &&
966 cmark_ispunct(input->data[i+1]))
967 i += 2;
968 else if (input->data[i] == '(') {
969 ++nb_p;
970 ++i;
971 if (nb_p > 32)
972 return -1;
973 } else if (input->data[i] == ')') {
974 if (nb_p == 0)
975 break;
976 --nb_p;
977 ++i;
978 } else if (cmark_isspace(input->data[i]))
979 break;
980 else
981 ++i;
982 }
983
984 if (i >= input->len)
985 return -1;
986
987 {
988 cmark_chunk result = {input->data + offset, i - offset, 0};
989 *output = result;
990 }
991 return i - offset;
992 }
993
// Scan a link destination starting at `offset`.
//
// If the destination does not begin with '<', or if a newline or a second
// '<' is met before the closing '>', the work is delegated to
// manual_scan_link_url_2 (which rescans from `offset`).  Otherwise the
// destination is the text between '<' and '>', with a backslash skipping the
// byte that follows it.
//
// Returns the number of bytes consumed (brackets included) and stores the
// text between the brackets in *output.  Returns -1 when the scan position
// ends up at or beyond the end of the input, which covers both a missing '>'
// and a '>' that is the very last byte.
static bufsize_t manual_scan_link_url(cmark_chunk *input, bufsize_t offset,
                                      cmark_chunk *output) {
  bufsize_t cur;
  bool closed = false;

  if (!(offset < input->len && input->data[offset] == '<')) {
    return manual_scan_link_url_2(input, offset, output);
  }

  cur = offset + 1;
  while (!closed && cur < input->len) {
    switch (input->data[cur]) {
    case '>':
      ++cur;
      closed = true;
      break;
    case '\\':
      cur += 2;
      break;
    case '\n':
    case '<':
      return manual_scan_link_url_2(input, offset, output);
    default:
      ++cur;
      break;
    }
  }

  if (cur >= input->len) {
    return -1;
  }

  // Drop the surrounding '<' and '>' from the reported chunk.
  cmark_chunk found = {input->data + offset + 1, cur - 2 - offset, 0};
  *output = found;
  return cur - offset;
}
1024
// Handle a `]` at the current position: produce a link, an image, a
// footnote reference, or a literal close bracket.
//
// Returns a text node containing "]" when nothing could be formed from the
// most recent bracket opener (or when there is no opener).  Returns NULL when
// a link, image or footnote-reference node was created; in that case the new
// node has already been inserted into the tree in place of the opener's text
// node, so there is nothing for the caller to append.
static cmark_node *handle_close_bracket(cmark_parser *parser, subject *subj) {
  bufsize_t initial_pos, after_link_text_pos;
  bufsize_t endurl, starttitle, endtitle, endall;
  bufsize_t sps, n;
  cmark_reference *ref = NULL;
  cmark_chunk url_chunk, title_chunk;
  cmark_chunk url, title;
  bracket *opener;
  cmark_node *inl;
  cmark_chunk raw_label;
  int found_label;
  cmark_node *tmp, *tmpnext;
  bool is_image;

  advance(subj); // advance past ]
  initial_pos = subj->pos;

  // get last [ or ![
  opener = subj->last_bracket;

  // No opener at all: the bracket is plain text.
  if (opener == NULL) {
    return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("]"));
  }

  if (!opener->active) {
    // take delimiter off stack
    pop_bracket(subj);
    return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("]"));
  }

  // If we got here, we matched a potential link/image text.
  // Now we check to see if it's a link/image.
  is_image = opener->image;

  after_link_text_pos = subj->pos;

  // First, look for an inline link: "(" spaces url [spaces title] spaces ")".
  if (peek_char(subj) == '(' &&
      ((sps = scan_spacechars(&subj->input, subj->pos + 1)) > -1) &&
      ((n = manual_scan_link_url(&subj->input, subj->pos + 1 + sps,
                                 &url_chunk)) > -1)) {

    // try to parse an explicit link:
    endurl = subj->pos + 1 + sps + n;
    starttitle = endurl + scan_spacechars(&subj->input, endurl);

    // ensure there are spaces btw url and title
    endtitle = (starttitle == endurl)
                   ? starttitle
                   : starttitle + scan_link_title(&subj->input, starttitle);

    endall = endtitle + scan_spacechars(&subj->input, endtitle);

    if (peek_at(subj, endall) == ')') {
      subj->pos = endall + 1;

      title_chunk =
          cmark_chunk_dup(&subj->input, starttitle, endtitle - starttitle);
      url = cmark_clean_url(subj->mem, &url_chunk);
      title = cmark_clean_title(subj->mem, &title_chunk);
      cmark_chunk_free(subj->mem, &url_chunk);
      cmark_chunk_free(subj->mem, &title_chunk);
      goto match;

    } else {
      // it could still be a shortcut reference link
      subj->pos = after_link_text_pos;
    }
  }

  // Next, look for a following [link label] that matches in refmap.
  // The label has to start right here; no whitespace is skipped first.
  raw_label = cmark_chunk_literal("");
  found_label = link_label(subj, &raw_label);
  if (!found_label) {
    // No explicit label: this may be a shortcut reference link, so make
    // sure we are positioned right after the closing bracket.
    subj->pos = initial_pos;
  }

  // With no label (or an empty one, as in "[text][]"), fall back to using
  // the bracketed text itself as the label, unless the opener was flagged
  // with bracket_after.
  if ((!found_label || raw_label.len == 0) && !opener->bracket_after) {
    cmark_chunk_free(subj->mem, &raw_label);
    raw_label = cmark_chunk_dup(&subj->input, opener->position,
                                initial_pos - opener->position - 1);
    found_label = true;
  }

  if (found_label) {
    ref = (cmark_reference *)cmark_map_lookup(subj->refmap, &raw_label);
    cmark_chunk_free(subj->mem, &raw_label);
  }

  if (ref != NULL) { // found
    url = chunk_clone(subj->mem, &ref->url);
    title = chunk_clone(subj->mem, &ref->title);
    goto match;
  } else {
    goto noMatch;
  }

noMatch:
  // If we fall through to here, it means we didn't match a link.
  // What if we're a footnote link?  That requires the footnotes option and
  // exactly one text node after the opener, longer than one byte and
  // starting with '^'.
  if (parser->options & CMARK_OPT_FOOTNOTES &&
      opener->inl_text->next &&
      opener->inl_text->next->type == CMARK_NODE_TEXT &&
      !opener->inl_text->next->next) {
    cmark_chunk *literal = &opener->inl_text->next->as.literal;
    if (literal->len > 1 && literal->data[0] == '^') {
      inl = make_simple(subj->mem, CMARK_NODE_FOOTNOTE_REFERENCE);
      // The footnote name is the text after the leading '^'.
      inl->as.literal = cmark_chunk_dup(literal, 1, literal->len - 1);
      inl->start_line = inl->end_line = subj->line;
      inl->start_column = opener->inl_text->start_column;
      inl->end_column = subj->pos + subj->column_offset + subj->block_offset;
      // Replace both the "[" text node and the "^name" text node with the
      // footnote reference.
      cmark_node_insert_before(opener->inl_text, inl);
      cmark_node_free(opener->inl_text->next);
      cmark_node_free(opener->inl_text);
      process_emphasis(parser, subj, opener->previous_delimiter);
      pop_bracket(subj);
      return NULL;
    }
  }

  pop_bracket(subj); // remove this opener from delimiter list
  subj->pos = initial_pos;
  return make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("]"));

match:
  inl = make_simple(subj->mem, is_image ? CMARK_NODE_IMAGE : CMARK_NODE_LINK);
  inl->as.link.url = url;
  inl->as.link.title = title;
  inl->start_line = inl->end_line = subj->line;
  inl->start_column = opener->inl_text->start_column;
  inl->end_column = subj->pos + subj->column_offset + subj->block_offset;
  cmark_node_insert_before(opener->inl_text, inl);
  // Add link text: move every sibling after the opener under the new node.
  tmp = opener->inl_text->next;
  while (tmp) {
    tmpnext = tmp->next;
    cmark_node_append_child(inl, tmp);
    tmp = tmpnext;
  }

  // Free the bracket [:
  cmark_node_free(opener->inl_text);

  process_emphasis(parser, subj, opener->previous_delimiter);
  pop_bracket(subj);

  // Now, if we have a link, we also want to deactivate earlier link
  // delimiters. (This code can be removed if we decide to allow links
  // inside links.)  Image openers are left alone, and the walk stops at the
  // first link opener that is already inactive.
  if (!is_image) {
    opener = subj->last_bracket;
    while (opener != NULL) {
      if (!opener->image) {
        if (!opener->active) {
          break;
        } else {
          opener->active = false;
        }
      }
      opener = opener->previous;
    }
  }

  return NULL;
}
1194
1195 // Parse a hard or soft linebreak, returning an inline.
1196 // Assumes the subject has a cr or newline at the current position.
handle_newline(subject * subj)1197 static cmark_node *handle_newline(subject *subj) {
1198 bufsize_t nlpos = subj->pos;
1199 // skip over cr, crlf, or lf:
1200 if (peek_at(subj, subj->pos) == '\r') {
1201 advance(subj);
1202 }
1203 if (peek_at(subj, subj->pos) == '\n') {
1204 advance(subj);
1205 }
1206 ++subj->line;
1207 subj->column_offset = -subj->pos;
1208 // skip spaces at beginning of line
1209 skip_spaces(subj);
1210 if (nlpos > 1 && peek_at(subj, nlpos - 1) == ' ' &&
1211 peek_at(subj, nlpos - 2) == ' ') {
1212 return make_linebreak(subj->mem);
1213 } else {
1214 return make_softbreak(subj->mem);
1215 }
1216 }
1217
// Lookup table, indexed by byte value, of the characters that end a run of
// plain text: "\r\n\\`&_*[]<!"
// Entries not listed are zero.  The table is deliberately mutable:
// cmark_inlines_add_special_character() and
// cmark_inlines_remove_special_character() update it at run time.
static int8_t SPECIAL_CHARS[256] = {
    ['\n'] = 1, ['\r'] = 1, ['!'] = 1,  ['&'] = 1, ['*'] = 1, ['<'] = 1,
    ['['] = 1,  ['\\'] = 1, [']'] = 1,  ['_'] = 1, ['`'] = 1,
};
1231
// Lookup table, indexed by byte value, of the extra characters that end a
// run of plain text when CMARK_OPT_SMART is set: " ' . -
// Entries not listed are zero.  The explicit size keeps all 256 byte values
// in range.
static char SMART_PUNCT_CHARS[256] = {
    ['"'] = 1, ['\''] = 1, ['-'] = 1, ['.'] = 1,
};
1246
subject_find_special_char(subject * subj,int options)1247 static bufsize_t subject_find_special_char(subject *subj, int options) {
1248 bufsize_t n = subj->pos + 1;
1249
1250 while (n < subj->input.len) {
1251 if (SPECIAL_CHARS[subj->input.data[n]])
1252 return n;
1253 if (options & CMARK_OPT_SMART && SMART_PUNCT_CHARS[subj->input.data[n]])
1254 return n;
1255 n++;
1256 }
1257
1258 return subj->input.len;
1259 }
1260
cmark_inlines_add_special_character(unsigned char c,bool emphasis)1261 void cmark_inlines_add_special_character(unsigned char c, bool emphasis) {
1262 SPECIAL_CHARS[c] = 1;
1263 if (emphasis)
1264 SKIP_CHARS[c] = 1;
1265 }
1266
cmark_inlines_remove_special_character(unsigned char c,bool emphasis)1267 void cmark_inlines_remove_special_character(unsigned char c, bool emphasis) {
1268 SPECIAL_CHARS[c] = 0;
1269 if (emphasis)
1270 SKIP_CHARS[c] = 0;
1271 }
1272
// Offer character `c` to each inline syntax extension registered on the
// parser, in list order, by calling its match_inline hook.  Returns the
// first non-NULL node an extension produces, or NULL if none of them does.
static cmark_node *try_extensions(cmark_parser *parser,
                                  cmark_node *parent,
                                  unsigned char c,
                                  subject *subj) {
  cmark_llist *cell = parser->inline_syntax_extensions;

  while (cell != NULL) {
    cmark_syntax_extension *ext = (cmark_syntax_extension *)cell->data;
    cmark_node *made = ext->match_inline(ext, parser, parent, c, subj);

    if (made != NULL) {
      return made;
    }
    cell = cell->next;
  }

  return NULL;
}
1290
1291 // Parse an inline, advancing subject, and add it as a child of parent.
1292 // Return 0 if no inline can be parsed, 1 otherwise.
parse_inline(cmark_parser * parser,subject * subj,cmark_node * parent,int options)1293 static int parse_inline(cmark_parser *parser, subject *subj, cmark_node *parent, int options) {
1294 cmark_node *new_inl = NULL;
1295 cmark_chunk contents;
1296 unsigned char c;
1297 bufsize_t startpos, endpos;
1298 c = peek_char(subj);
1299 if (c == 0) {
1300 return 0;
1301 }
1302 switch (c) {
1303 case '\r':
1304 case '\n':
1305 new_inl = handle_newline(subj);
1306 break;
1307 case '`':
1308 new_inl = handle_backticks(subj, options);
1309 break;
1310 case '\\':
1311 new_inl = handle_backslash(parser, subj);
1312 break;
1313 case '&':
1314 new_inl = handle_entity(subj);
1315 break;
1316 case '<':
1317 new_inl = handle_pointy_brace(subj, options);
1318 break;
1319 case '*':
1320 case '_':
1321 case '\'':
1322 case '"':
1323 new_inl = handle_delim(subj, c, (options & CMARK_OPT_SMART) != 0);
1324 break;
1325 case '-':
1326 new_inl = handle_hyphen(subj, (options & CMARK_OPT_SMART) != 0);
1327 break;
1328 case '.':
1329 new_inl = handle_period(subj, (options & CMARK_OPT_SMART) != 0);
1330 break;
1331 case '[':
1332 advance(subj);
1333 new_inl = make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("["));
1334 push_bracket(subj, false, new_inl);
1335 break;
1336 case ']':
1337 new_inl = handle_close_bracket(parser, subj);
1338 break;
1339 case '!':
1340 advance(subj);
1341 if (peek_char(subj) == '[' && peek_char_n(subj, 1) != '^') {
1342 advance(subj);
1343 new_inl = make_str(subj, subj->pos - 2, subj->pos - 1, cmark_chunk_literal("!["));
1344 push_bracket(subj, true, new_inl);
1345 } else {
1346 new_inl = make_str(subj, subj->pos - 1, subj->pos - 1, cmark_chunk_literal("!"));
1347 }
1348 break;
1349 default:
1350 new_inl = try_extensions(parser, parent, c, subj);
1351 if (new_inl != NULL)
1352 break;
1353
1354 endpos = subject_find_special_char(subj, options);
1355 contents = cmark_chunk_dup(&subj->input, subj->pos, endpos - subj->pos);
1356 startpos = subj->pos;
1357 subj->pos = endpos;
1358
1359 // if we're at a newline, strip trailing spaces.
1360 if (S_is_line_end_char(peek_char(subj))) {
1361 cmark_chunk_rtrim(&contents);
1362 }
1363
1364 new_inl = make_str(subj, startpos, endpos - 1, contents);
1365 }
1366 if (new_inl != NULL) {
1367 cmark_node_append_child(parent, new_inl);
1368 }
1369
1370 return 1;
1371 }
1372
1373 // Parse inlines from parent's string_content, adding as children of parent.
cmark_parse_inlines(cmark_parser * parser,cmark_node * parent,cmark_map * refmap,int options)1374 void cmark_parse_inlines(cmark_parser *parser,
1375 cmark_node *parent,
1376 cmark_map *refmap,
1377 int options) {
1378 subject subj;
1379 cmark_chunk content = {parent->content.ptr, parent->content.size, 0};
1380 subject_from_buf(parser->mem, parent->start_line, parent->start_column - 1 + parent->internal_offset, &subj, &content, refmap);
1381 cmark_chunk_rtrim(&subj.input);
1382
1383 while (!is_eof(&subj) && parse_inline(parser, &subj, parent, options))
1384 ;
1385
1386 process_emphasis(parser, &subj, NULL);
1387 // free bracket and delim stack
1388 while (subj.last_delim) {
1389 remove_delimiter(&subj, subj.last_delim);
1390 }
1391 while (subj.last_bracket) {
1392 pop_bracket(&subj);
1393 }
1394 }
1395
1396 // Parse zero or more space characters, including at most one newline.
spnl(subject * subj)1397 static void spnl(subject *subj) {
1398 skip_spaces(subj);
1399 if (skip_line_end(subj)) {
1400 skip_spaces(subj);
1401 }
1402 }
1403
1404 // Parse reference. Assumes string begins with '[' character.
1405 // Modify refmap if a reference is encountered.
1406 // Return 0 if no reference found, otherwise position of subject
1407 // after reference is parsed.
cmark_parse_reference_inline(cmark_mem * mem,cmark_chunk * input,cmark_map * refmap)1408 bufsize_t cmark_parse_reference_inline(cmark_mem *mem, cmark_chunk *input,
1409 cmark_map *refmap) {
1410 subject subj;
1411
1412 cmark_chunk lab;
1413 cmark_chunk url;
1414 cmark_chunk title;
1415
1416 bufsize_t matchlen = 0;
1417 bufsize_t beforetitle;
1418
1419 subject_from_buf(mem, -1, 0, &subj, input, NULL);
1420
1421 // parse label:
1422 if (!link_label(&subj, &lab) || lab.len == 0)
1423 return 0;
1424
1425 // colon:
1426 if (peek_char(&subj) == ':') {
1427 advance(&subj);
1428 } else {
1429 return 0;
1430 }
1431
1432 // parse link url:
1433 spnl(&subj);
1434 if ((matchlen = manual_scan_link_url(&subj.input, subj.pos, &url)) > -1 &&
1435 url.len > 0) {
1436 subj.pos += matchlen;
1437 } else {
1438 return 0;
1439 }
1440
1441 // parse optional link_title
1442 beforetitle = subj.pos;
1443 spnl(&subj);
1444 matchlen = subj.pos == beforetitle ? 0 : scan_link_title(&subj.input, subj.pos);
1445 if (matchlen) {
1446 title = cmark_chunk_dup(&subj.input, subj.pos, matchlen);
1447 subj.pos += matchlen;
1448 } else {
1449 subj.pos = beforetitle;
1450 title = cmark_chunk_literal("");
1451 }
1452
1453 // parse final spaces and newline:
1454 skip_spaces(&subj);
1455 if (!skip_line_end(&subj)) {
1456 if (matchlen) { // try rewinding before title
1457 subj.pos = beforetitle;
1458 skip_spaces(&subj);
1459 if (!skip_line_end(&subj)) {
1460 return 0;
1461 }
1462 } else {
1463 return 0;
1464 }
1465 }
1466 // insert reference into refmap
1467 cmark_reference_create(refmap, &lab, &url, &title);
1468 return subj.pos;
1469 }
1470
// Exported wrapper around the file-local peek_char(): returns whatever
// peek_char() reports for `parser`.  Elsewhere in this file a result of 0
// is treated as "nothing left to parse".
unsigned char cmark_inline_parser_peek_char(cmark_inline_parser *parser) {
  return peek_char(parser);
}
1474
// Exported wrapper around the file-local peek_at(): returns its result for
// position `pos` of the parser's input.
// NOTE(review): bounds handling is whatever peek_at() does; it is not
// visible here -- confirm before passing arbitrary positions.
unsigned char cmark_inline_parser_peek_at(cmark_inline_parser *parser, bufsize_t pos) {
  return peek_at(parser, pos);
}
1478
// Exported wrapper around the file-local is_eof(): returns its result for
// `parser` as an int (non-zero meaning the end of input has been reached,
// as used by the parse loop in cmark_parse_inlines).
int cmark_inline_parser_is_eof(cmark_inline_parser *parser) {
  return is_eof(parser);
}
1482
// Duplicate at most `n` bytes of `s` into a newly malloc'd, NUL-terminated
// string.  Copying stops early at the first NUL byte found within the first
// `n` bytes.
//
// Only the first `n` bytes of `s` are ever examined, so `s` does not have to
// be NUL-terminated as long as at least `n` bytes are readable.
//
// Returns NULL if the allocation fails.  The caller owns the result and
// releases it with free().
static char *
my_strndup (const char *s, size_t n)
{
  // Bounded search for a terminator: never look past s[n - 1].
  const char *nul = memchr (s, '\0', n);
  size_t len = nul ? (size_t) (nul - s) : n;
  char *result = malloc (len + 1);

  if (!result)
    return NULL;

  memcpy (result, s, len);
  result[len] = '\0';
  return result;
}
1499
// Consume input for as long as the current byte is non-zero and `pred`
// accepts it, and return the consumed bytes as a string.
//
// The string is allocated by my_strndup() (i.e. with malloc); the caller
// owns it.  NULL is returned if that allocation fails -- note that the
// parser has already been advanced in that case.  If nothing matches, the
// result is an empty string and the parser does not move.
char *cmark_inline_parser_take_while(cmark_inline_parser *parser, cmark_inline_predicate pred) {
  const bufsize_t first = parser->pos;
  bufsize_t taken = 0;

  for (;;) {
    const unsigned char ch = peek_char(parser);

    if (ch == 0 || !(*pred)(ch)) {
      break;
    }
    advance(parser);
    taken++;
  }

  return my_strndup((const char *) parser->input.data + first, taken);
}
1512
// Exported wrapper around the file-local push_delimiter().  `can_open` and
// `can_close` are taken as ints and normalised to booleans (non-zero means
// true) before being forwarded together with the delimiter character `c`
// and its text node `inl_text`.
void cmark_inline_parser_push_delimiter(cmark_inline_parser *parser,
                                  unsigned char c,
                                  int can_open,
                                  int can_close,
                                  cmark_node *inl_text) {
  push_delimiter(parser, c, can_open != 0, can_close != 0, inl_text);
}
1520
// Exported wrapper around the file-local remove_delimiter(): forwards
// `delim` to it for the given parser.
void cmark_inline_parser_remove_delimiter(cmark_inline_parser *parser, delimiter *delim) {
  remove_delimiter(parser, delim);
}
1524
// Consume a run of up to `max_delims` copies of delimiter byte `c` at the
// current position and classify the run by the characters around it.
//
// Outputs:
//   *punct_before / *punct_after  - results of cmark_utf8proc_is_punctuation
//                                   for the code point before / after the run
//   *left_flanking                - 1 if the run is non-empty, is not
//                                   followed by a space, and is not the case
//                                   "punctuation after, with neither space
//                                   nor punctuation before"; else 0
//   *right_flanking               - the mirror image of the above
//
// The start of the input, and any position where UTF-8 decoding fails, is
// treated as if a newline (code point 10) were there.
//
// Returns the number of delimiter bytes consumed.
int cmark_inline_parser_scan_delimiters(cmark_inline_parser *parser,
                                  int max_delims,
                                  unsigned char c,
                                  int *left_flanking,
                                  int *right_flanking,
                                  int *punct_before,
                                  int *punct_after) {
  int32_t before_char = 0;
  int32_t after_char = 0;
  int count = 0;
  int decoded;
  bool space_before, space_after;

  if (parser->pos == 0) {
    before_char = 10;
  } else {
    bufsize_t back = parser->pos - 1;

    // walk back over continuation bytes (10xxxxxx) to the lead byte:
    while ((peek_at(parser, back) >> 6) == 2 && back > 0) {
      back--;
    }
    decoded = cmark_utf8proc_iterate(parser->input.data + back,
                                     parser->pos - back, &before_char);
    if (decoded == -1) {
      before_char = 10;
    }
  }

  // take the delimiter run itself
  for (; peek_char(parser) == c && count < max_delims; count++) {
    advance(parser);
  }

  decoded = cmark_utf8proc_iterate(parser->input.data + parser->pos,
                                   parser->input.len - parser->pos, &after_char);
  if (decoded == -1) {
    after_char = 10;
  }

  *punct_before = cmark_utf8proc_is_punctuation(before_char);
  *punct_after = cmark_utf8proc_is_punctuation(after_char);
  space_before = cmark_utf8proc_is_space(before_char) != 0;
  space_after = cmark_utf8proc_is_space(after_char) != 0;

  *left_flanking = count > 0 && !space_after &&
                   (!*punct_after || space_before || *punct_before);
  *right_flanking = count > 0 && !space_before &&
                    (!*punct_before || space_after || *punct_after);

  return count;
}
1577
// Exported wrapper around the file-local advance(): moves the parser
// forward by whatever a single advance() call does.
void cmark_inline_parser_advance_offset(cmark_inline_parser *parser) {
  advance(parser);
}
1581
// Returns the parser's current position (parser->pos) within its input.
int cmark_inline_parser_get_offset(cmark_inline_parser *parser) {
  return parser->pos;
}
1585
// Sets the parser's current position (parser->pos) to `offset`.  The value
// is stored as given; no range check is performed here.
void cmark_inline_parser_set_offset(cmark_inline_parser *parser, int offset) {
  parser->pos = offset;
}
1589
// Returns the column for the parser's current position, computed as
// pos + 1 + column_offset + block_offset.
int cmark_inline_parser_get_column(cmark_inline_parser *parser) {
  return parser->pos + 1 + parser->column_offset + parser->block_offset;
}
1593
// Returns a pointer to the parser's own input chunk (not a copy), so the
// pointer is only meaningful while the parser itself is.
cmark_chunk *cmark_inline_parser_get_chunk(cmark_inline_parser *parser) {
  return &parser->input;
}
1597
// Report whether the bracket stack holds an active opener of the requested
// kind: an image opener ("![") when `image` is non-zero, a plain link opener
// ("[") when it is zero.  Returns 1 if one is found, 0 otherwise.
int cmark_inline_parser_in_bracket(cmark_inline_parser *parser, int image) {
  const bool want_image = image != 0;
  bracket *cur = parser->last_bracket;

  while (cur != NULL) {
    if (cur->active && cur->image == want_image) {
      return 1;
    }
    cur = cur->previous;
  }

  return 0;
}
1604
// Remove the last `n` bytes of text from the end of `node`'s children.
//
// Starting at the last child and moving backwards, each TEXT node's literal
// is shortened (only its length is reduced) until `n` bytes have been
// removed.  The walk stops early at the first child that is not a TEXT node
// or when the children run out; emptied text nodes are left in the tree
// with length 0.
void cmark_node_unput(cmark_node *node, int n) {
  cmark_node *cur;

  for (cur = node->last_child;
       n > 0 && cur != NULL && cur->type == CMARK_NODE_TEXT;
       cur = cur->prev) {
    cmark_chunk *text = &cur->as.literal;
    // take as much as this node can give, but no more than still needed
    const int take = text->len < n ? text->len : n;

    text->len -= take;
    n -= take;
  }
}
1618
// Returns the parser's last_delim pointer, i.e. the most recent entry of
// its delimiter stack.  cmark_parse_inlines loops until this field is
// NULL, so NULL means the stack is empty.
delimiter *cmark_inline_parser_get_last_delimiter(cmark_inline_parser *parser) {
  return parser->last_delim;
}
1622
// Returns the parser's current line number (parser->line), which
// handle_newline() increments for every line ending it consumes.
int cmark_inline_parser_get_line(cmark_inline_parser *parser) {
  return parser->line;
}
1626