1 /* smi - simple markup interpreter
2  * Copyright (C) <2014> Chris Hutchinson <portmaster bsdforge com>
3  * based on work (C) <2007, 2008> Enno boland <g s01 de>
4  *
5  * See LICENSE for terms, and usage information
6  */
7 
8 #include <stdlib.h>
9 #include <stdio.h>
10 #include <stdarg.h>
11 #include <string.h>
12 
/* Growth step, in bytes, used for all dynamically sized buffers. */
#define BUFFERSIZE 512

/* Number of elements of the array x.  x must be a real array, not a pointer.
 * The expansion is fully parenthesized so it can be embedded in any
 * expression (e.g. `total / LENGTH(a)`). */
#define LENGTH(x) (sizeof(x) / sizeof((x)[0]))

/* ADDC(b, i) = c;  stores c at b[i], growing b first when needed.
 *
 * The macro expands to an `if` statement followed by the lvalue b[i]:
 * whenever i is a multiple of BUFFERSIZE, b is resized with realloc() and
 * eprint() terminates the program if that fails.  Consequences:
 *  - use it only as a complete statement inside a braced block, never as the
 *    unbraced body of an if/else/loop;
 *  - b and i are evaluated several times, so they must not have side effects;
 *  - i must pass through every multiple of BUFFERSIZE (i.e. grow by one
 *    between uses) for the buffer to be large enough.
 * NOTE(review): sizeof(b) is the size of the pointer, not of one element, so
 * the buffer is over-allocated by that factor.  It is left unchanged here;
 * shrinking the allocation should go together with an audit of code that
 * reads past the terminator of such buffers. */
#define ADDC(b,i) if((i) % BUFFERSIZE == 0) \
	{ (b) = realloc((b), ((i) + BUFFERSIZE) * sizeof(b)); if(!(b)) eprint("Malloc failed."); } (b)[(i)]
17 
18 
/* Signature shared by all parsers.  A parser looks at the text starting at
 * its first argument; the second argument marks the (exclusive) end of the
 * range and the third is non-zero when the position starts a new block.
 * Return value, as interpreted by process():
 *   0   the parser does not apply here;
 *   >0  number of characters consumed;
 *   <0  the magnitude is the number of characters consumed and the text that
 *       follows is treated as the start of a new block. */
typedef int (*Parser)(const char *, const char *, int);

/* One markup rule: a marker to look for and the HTML to wrap the content in.
 * The strings are string literals, hence const. */
struct Tag {
	const char *search;	/* marker text that triggers the rule */
	int process;		/* 0: content is printed escaped via hprint();
				 * non-zero: content is parsed via process();
				 * for line prefixes a value >= 2 makes the
				 * content start as a new block */
	const char *before, *after;	/* HTML emitted before / after the content */
};
25 
26 
/* Prints a formatted error message to stderr and exits */
void eprint(const char *format, ...);

/* Parsers; each gets the current position, the end of the range and a flag
 * telling whether the position starts a new block */
int doamp(const char *begin, const char *end, int newblock);		/* & */
int dogtlt(const char *begin, const char *end, int newblock);		/* < and > */
int dohtml(const char *begin, const char *end, int newblock);		/* raw html blocks */
int dolineprefix(const char *begin, const char *end, int newblock);	/* line prefix tags */
int dolink(const char *begin, const char *end, int newblock);		/* [text](url) links and ![alt](url) images */
int dolist(const char *begin, const char *end, int newblock);		/* lists */
int doparagraph(const char *begin, const char *end, int newblock);	/* paragraphs */
int doreplace(const char *begin, const char *end, int newblock);	/* simple replaces */
int doshortlink(const char *begin, const char *end, int newblock);	/* <url> and <mail@address> links */
int dosurround(const char *begin, const char *end, int newblock);	/* surrounding tags */
int dounderline(const char *begin, const char *end, int newblock);	/* underline tags */

/* Escapes HTML special characters and prints the range to stdout */
void hprint(const char *begin, const char *end);
/* Processes the range between begin and end */
void process(const char *begin, const char *end, int newblock);
53 
54 Parser parsers[] = { dounderline, dohtml, dolineprefix, dolist, doparagraph,
55 	dogtlt, dosurround, dolink, doshortlink, doamp, doreplace };
56 							/* list of parsers */
57 FILE *source;
58 unsigned int nohtml = 0;
59 struct Tag lineprefix[] = {
60 	{ "   ",	0,	"<pre><code>", "</code></pre>" },
61 	{ "\t",		0,	"<pre><code>", "</code></pre>" },
62 	{ "> ",		2,	"<blockquote>",	"</blockquote>" },
63 	{ "###### ",	1,	"<h6>",		"</h6>" },
64 	{ "##### ",	1,	"<h5>",		"</h5>" },
65 	{ "#### ",	1,	"<h4>",		"</h4>" },
66 	{ "### ",	1,	"<h3>",		"</h3>" },
67 	{ "## ",	1,	"<h2>",		"</h2>" },
68 	{ "# ",		1,	"<h1>",		"</h1>" },
69 	{ "- - -\n",	1,	"<hr />",	""},
70 };
71 struct Tag underline[] = {
72 	{ "=",		1,	"<h1>",		"</h1>\n" },
73 	{ "-",		1,	"<h2>",		"</h2>\n" },
74 };
75 struct Tag surround[] = {
76 	{ "``",		0,	"<code>",	"</code>" },
77 	{ "`",		0,	"<code>",	"</code>" },
78 	{ "___",	1,	"<strong><em>",	"</em></strong>" },
79 	{ "***",	1,	"<strong><em>",	"</em></strong>" },
80 	{ "__",		1,	"<strong>",	"</strong>" },
81 	{ "**",		1,	"<strong>",	"</strong>" },
82 	{ "_",		1,	"<em>",		"</em>" },
83 	{ "*",		1,	"<em>",		"</em>" },
84 };
/* Backslash escapes: { text in the input, text written instead }.
 * Used by doreplace(), which consumes the matched input text. */
char * replace[][2] = {
	{ "\\\\",	"\\" },
	{ "\\`",	"`" },
	{ "\\*",	"*" },
	{ "\\_",	"_" },
	{ "\\{",	"{" },
	{ "\\}",	"}" },
	{ "\\[",	"[" },
	{ "\\]",	"]" },
	{ "\\(",	"(" },
	{ "\\)",	")" },
	{ "\\#",	"#" },
	{ "\\+",	"+" },
	{ "\\-",	"-" },
	{ "\\.",	"." },
	{ "\\!",	"!" },
};

/* Insertions: { text in the input, text written in front of it }.
 * Unlike replace[], doreplace() does not consume the matched input text. */
char * insert[][2] = {
	{ "  \n",	"<br />" },
};
105 
/* Writes a printf-style formatted message to stderr and terminates the
 * program with EXIT_FAILURE.  Does not return. */
void
eprint(const char *format, ...) {
	va_list args;

	va_start(args, format);
	vfprintf(stderr, format, args);
	va_end(args);
	exit(EXIT_FAILURE);
}
115 
116 int
doamp(const char * begin,const char * end,int newblock)117 doamp(const char *begin, const char *end, int newblock) {
118 	const char *p;
119 
120 	if(*begin != '&')
121 		return 0;
122 	if(!nohtml) {
123 		for(p = begin + 1; p != end && !strchr("; \\\n\t", *p); p++);
124 		if(p == end || *p == ';')
125 			return 0;
126 	}
127 	fputs("&amp;", stdout);
128 	return 1;
129 }
130 
131 int
dogtlt(const char * begin,const char * end,int newblock)132 dogtlt(const char *begin, const char *end, int newblock) {
133 	int brpos;
134 	char c;
135 
136 	if(nohtml || begin + 1 >= end)
137 		return 0;
138 	brpos = begin[1] == '>';
139 	if(!brpos && *begin != '<')
140 		return 0;
141 	c = begin[brpos ? 0 : 1];
142 	if(!brpos && (c < 'a' || c > 'z') && (c < 'A' || c > 'Z')) {
143 		fputs("&lt;",stdout);
144 		return 1;
145 	}
146 	else if(brpos && (c < 'a' || c > 'z') && (c < 'A' || c > 'Z') && !strchr("/\"'",c)) {
147 		printf("%c&gt;",c);
148 		return 2;
149 	}
150 	return 0;
151 }
152 
153 int
dohtml(const char * begin,const char * end,int newblock)154 dohtml(const char *begin, const char *end, int newblock) {
155 	const char *p, *tag, *tagend;
156 
157 	if(nohtml || !newblock || *begin == '\n' || begin + 2 >= end)
158 		return 0;
159 	p = begin;
160 	if(p[1] == '\n')
161 		p++;
162 	if(p[1] != '<' || strchr(" /\n\t\\", p[2]))
163 		return 0;
164 	tag = p + 2;
165 	p += 2;
166 	for(; !strchr(" >", *p); p++);
167 	tagend = p;
168 	while((p = strstr(p, "\n</")) && p < end) {
169 		p += 3;
170 		if(strncmp(p, tag, tagend - tag) == 0 && p[tagend - tag] == '>') {
171 			p++;
172 			fwrite(begin, sizeof(char), p - begin + tagend - tag, stdout);
173 			puts("\n");
174 			return -(p - begin + tagend - tag);
175 		}
176 	}
177 	return 0;
178 }
179 
180 int
dolineprefix(const char * begin,const char * end,int newblock)181 dolineprefix(const char *begin, const char *end, int newblock) {
182 	unsigned int i, j, l;
183 	char *buffer;
184 	const char *p;
185 
186 	if(newblock)
187 		p = begin;
188 	else if(*begin == '\n')
189 		p = begin + 1;
190 	else
191 		return 0;
192 	for(i = 0; i < LENGTH(lineprefix); i++) {
193 		l = strlen(lineprefix[i].search);
194 		if(end - p < l)
195 			continue;
196 		if(strncmp(lineprefix[i].search, p, l))
197 			continue;
198 		if(*begin == '\n')
199 			fputc('\n', stdout);
200 		fputs(lineprefix[i].before, stdout);
201 		if(lineprefix[i].search[l-1] == '\n') {
202 			fputc('\n', stdout);
203 			return l;
204 		}
205 		if(!(buffer = malloc(BUFFERSIZE)))
206 			eprint("Malloc failed.");
207 		buffer[0] = '\0';
208 		for(j = 0, p += l; p < end; p++, j++) {
209 			ADDC(buffer, j) = *p;
210 			if(*p == '\n' && p + l < end) {
211 				if(strncmp(lineprefix[i].search, p + 1, l) != 0)
212 					break;
213 				p += l;
214 			}
215 		}
216 		ADDC(buffer, j) = '\0';
217 		if(lineprefix[i].process)
218 			process(buffer, buffer + strlen(buffer), lineprefix[i].process >= 2);
219 		else
220 			hprint(buffer, buffer + strlen(buffer));
221 		puts(lineprefix[i].after);
222 		free(buffer);
223 		return -(p - begin);
224 	}
225 	return 0;
226 }
227 
/* Parser for links "[text](url)" and images "![alt](url)".
 * For an image the url and the alt text are printed escaped inside an <img>
 * element; for a link the url is printed escaped and the text is parsed
 * again, so it may contain further markup.  For every "![" found in front of
 * the current "](" the search moves on to the next "](", which lets a link
 * text contain images.
 * Returns the number of characters consumed, or 0 if the text at begin is
 * not a complete link inside the range. */
int
dolink(const char *begin, const char *end, int newblock) {
	int img;
	const char *desc, *descend, *link, *linkend, *p, *nested;

	if(*begin == '[')
		img = 0;
	else if(strncmp(begin, "![", 2) == 0)
		img = 1;
	else
		return 0;
	desc = begin + 1 + img;
	p = strstr(desc, "](");
	if(!p || p > end)
		return 0;
	nested = strstr(desc, "![");
	while(nested && nested < end && nested < p) {
		p = strstr(p + 1, "](");
		if(!p || p > end)
			return 0;
		nested = strstr(nested + 1, "![");
	}
	descend = p;
	link = p + 2;
	p = strstr(link, ")");
	if(!p || p > end)
		return 0;
	linkend = p;
	if(img) {
		fputs("<img src=\"", stdout);
		hprint(link, linkend);
		fputs("\" alt=\"", stdout);
		hprint(desc, descend);
		fputs("\" />", stdout);
	}
	else {
		fputs("<a href=\"", stdout);
		hprint(link, linkend);
		fputs("\">", stdout);
		process(desc, descend, 0);
		fputs("</a>", stdout);
	}
	return p + 1 - begin;
}
266 
267 int
dolist(const char * begin,const char * end,int newblock)268 dolist(const char *begin, const char *end, int newblock) {
269 	unsigned int i, j, indent, run, ul, isblock;
270 	const char *p, *q;
271 	char *buffer;
272 
273 	isblock = 0;
274 	if(newblock)
275 		p = begin;
276 	else if(*begin == '\n')
277 		p = begin + 1;
278 	else
279 		return 0;
280 	q = p;
281 	if(*p == '-' || *p == '*' || *p == '+')
282 		ul = 1;
283 	else {
284 		ul = 0;
285 		for(; p < end && *p >= '0' && *p <= '9'; p++);
286 		if(p >= end || *p != '.')
287 			return 0;
288 	}
289 	p++;
290 	if(p >= end || !(*p == ' ' || *p == '\t'))
291 		return 0;
292 	for(p++; p != end && (*p == ' ' || *p == '\t'); p++);
293 	indent = p - q;
294 	if(!(buffer = malloc(BUFFERSIZE)))
295 		eprint("Malloc failed.");
296 	if(!newblock)
297 		putchar('\n');
298 	fputs(ul ? "<ul>\n" : "<ol>\n", stdout);
299 	run = 1;
300 	for(; p < end && run; p++) {
301 		for(i = 0; p < end && run; p++, i++) {
302 			if(*p == '\n') {
303 				if(p + 1 == end)
304 					break;
305 				else if(p[1] == '\n') {
306 					p++;
307 					ADDC(buffer, i) = '\n';
308 					i++;
309 					run = 0;
310 					isblock++;
311 				}
312 				q = p + 1;
313 				j = 0;
314 				if(ul && (*q == '-' || *q == '*' || *q == '+'))
315 					j = 1;
316 				else if(!ul) {
317 					for(; q + j != end && q[j] >= '0' && q[j] <= '9' && j < indent; j++);
318 					if(q + j == end)
319 						break;
320 					if(j > 0 && q[j] == '.')
321 						j++;
322 					else
323 						j = 0;
324 				}
325 				if(q + indent < end)
326 					for(; (q[j] == ' ' || q[j] == '\t') && j < indent; j++);
327 				if(j == indent) {
328 					ADDC(buffer, i) = '\n';
329 					i++;
330 					p += indent;
331 					run = 1;
332 					if(*q == ' ' || *q == '\t')
333 						p++;
334 					else
335 						break;
336 				}
337 			}
338 			ADDC(buffer, i) = *p;
339 		}
340 		ADDC(buffer, i) = '\0';
341 		fputs("<li>", stdout);
342 		process(buffer, buffer + i, isblock > 1 || (isblock == 1 && run));
343 		fputs("</li>\n", stdout);
344 	}
345 	fputs(ul ? "</ul>\n" : "</ol>\n", stdout);
346 	free(buffer);
347 	p--;
348 	while(*(--p) == '\n');
349 	return -(p - begin + 1);
350 }
351 
/* Parser for paragraphs.
 * At the start of a block, wraps the text up to the next blank line ("\n\n")
 * - or up to end if there is none inside the range - in <p>...</p> and
 * parses it as inline text.  Text of at most one character is not wrapped.
 * Returns the negative length of the paragraph, or 0 if nothing was done. */
int
doparagraph(const char *begin, const char *end, int newblock) {
	const char *blank, *stop;

	if(!newblock)
		return 0;
	blank = strstr(begin, "\n\n");
	stop = (blank && blank <= end) ? blank : end;
	if(stop - begin <= 1)
		return 0;
	fputs("<p>\n", stdout);
	process(begin, stop, 0);
	fputs("</p>\n", stdout);
	return -(stop - begin);
}
368 
369 int
doreplace(const char * begin,const char * end,int newblock)370 doreplace(const char *begin, const char *end, int newblock) {
371 	unsigned int i, l;
372 
373 	for(i = 0; i < LENGTH(insert); i++)
374 		if(strncmp(insert[i][0], begin, strlen(insert[i][0])) == 0)
375 			fputs(insert[i][1], stdout);
376 	for(i = 0; i < LENGTH(replace); i++) {
377 		l = strlen(replace[i][0]);
378 		if(end - begin < l)
379 			continue;
380 		if(strncmp(replace[i][0], begin, l) == 0) {
381 			fputs(replace[i][1], stdout);
382 			return l;
383 		}
384 	}
385 	return 0;
386 }
387 
/* Writes the bytes of [begin, end) as decimal character references
 * ("&#NN;") to make mail addresses harder to harvest.  Bytes above 0x7F are
 * written unchanged: encoding them one by one would break multi-byte
 * sequences and, with a signed char, would yield invalid references. */
static void
printobfuscated(const char *begin, const char *end) {
	const char *c;
	unsigned char byte;

	for(c = begin; c != end; c++) {
		byte = (unsigned char)*c;
		if(byte < 0x80)
			printf("&#%u;", (unsigned int)byte);
		else
			putchar(byte);
	}
}

/* Parser for automatic links "<url>" and "<mail@address>".
 * The text between '<' and the first '>' must not contain a space, tab or
 * newline.  It is a URL if it contains '#' or ':', otherwise a mail address
 * if it contains '@'; anything else is not a link.  URLs are printed
 * escaped, mail addresses as obfuscated "mailto:" links.
 * Returns the number of characters consumed including both brackets, or 0
 * if the text at begin is not such a link. */
int
doshortlink(const char *begin, const char *end, int newblock) {
	const char *p;
	int ismail = 0;	/* 0: undecided, 1: mail address, -1: url */

	if(*begin != '<')
		return 0;
	for(p = begin + 1; p != end; p++) {
		switch(*p) {
		case ' ':
		case '\t':
		case '\n':
			return 0;
		case '#':
		case ':':
			ismail = -1;
			break;
		case '@':
			if(ismail == 0)
				ismail = 1;
			break;
		case '>':
			if(ismail == 0)
				return 0;
			fputs("<a href=\"", stdout);
			if(ismail == 1) {
				/* mailto: */
				fputs("&#x6D;&#x61;i&#x6C;&#x74;&#x6F;:", stdout);
				printobfuscated(begin + 1, p);
				fputs("\">", stdout);
				printobfuscated(begin + 1, p);
			}
			else {
				hprint(begin + 1, p);
				fputs("\">", stdout);
				hprint(begin + 1, p);
			}
			fputs("</a>", stdout);
			return p - begin + 1;
		default:
			break;
		}
	}
	return 0;
}
433 
434 int
dosurround(const char * begin,const char * end,int newblock)435 dosurround(const char *begin, const char *end, int newblock) {
436 	unsigned int i, l;
437 	const char *p, *start, *stop;
438 
439 	for(i = 0; i < LENGTH(surround); i++) {
440 		l = strlen(surround[i].search);
441 		if(end - begin < 2*l || strncmp(begin, surround[i].search, l) != 0)
442 			continue;
443 		start = begin + l;
444 		p = start - 1;
445 		do {
446 			p = strstr(p + 1, surround[i].search);
447 		} while(p && p[-1] == '\\');
448 		if(!p || p >= end ||
449 				!(stop = strstr(start, surround[i].search)) || stop >= end)
450 			continue;
451 		fputs(surround[i].before, stdout);
452 		if(surround[i].process)
453 			process(start, stop, 0);
454 		else
455 			hprint(start, stop);
456 		fputs(surround[i].after, stdout);
457 		return stop - begin + l;
458 	}
459 	return 0;
460 }
461 
462 int
dounderline(const char * begin,const char * end,int newblock)463 dounderline(const char *begin, const char *end, int newblock) {
464 	unsigned int i, j, l;
465 	const char *p;
466 
467 	if(!newblock)
468 		return 0;
469 	p = begin;
470 	for(l = 0; p + l != end && p[l] != '\n'; l++);
471 	p += l + 1;
472 	if(l == 0)
473 		return 0;
474 	for(i = 0; i < LENGTH(underline); i++) {
475 		for(j = 0; p + j != end && p[j] != '\n' && p[j] == underline[i].search[0]; j++);
476 		if(j >= l) {
477 			fputs(underline[i].before, stdout);
478 			if(underline[i].process)
479 				process(begin, begin + l, 0);
480 			else
481 				hprint(begin, begin + l);
482 			fputs(underline[i].after, stdout);
483 			return -(j + p - begin);
484 		}
485 	}
486 	return 0;
487 }
488 
/* Prints the characters of [begin, end) to stdout, replacing the HTML
 * special characters & " > < by their entities. */
void
hprint(const char *begin, const char *end) {
	const char *cur;

	for(cur = begin; cur != end; cur++) {
		switch(*cur) {
		case '&':
			fputs("&amp;", stdout);
			break;
		case '"':
			fputs("&quot;", stdout);
			break;
		case '>':
			fputs("&gt;", stdout);
			break;
		case '<':
			fputs("&lt;", stdout);
			break;
		default:
			putchar(*cur);
			break;
		}
	}
}
506 
507 void
process(const char * begin,const char * end,int newblock)508 process(const char *begin, const char *end, int newblock) {
509 	const char *p, *q;
510 	int affected;
511 	unsigned int i;
512 
513 	for(p = begin; p != end;) {
514 		if(newblock)
515 			while(*p == '\n')
516 				if (++p == end)
517 					return;
518 		affected = 0;
519 		for(i = 0; i < LENGTH(parsers) && affected == 0; i++)
520 			affected = parsers[i](p, end, newblock);
521 		p += abs(affected);
522 		if(!affected) {
523 			if(nohtml)
524 				hprint(p, p + 1);
525 			else
526 				putchar(*p);
527 			p++;
528 		}
529 		for(q = p; q != end && *q == '\n'; q++);
530 		if(q == end)
531 			return;
532 		else if(p[0] == '\n' && p + 1 != end && p[1] == '\n')
533 			newblock = 1;
534 		else
535 			newblock = affected < 0;
536 	}
537 }
538 
539 int
main(int argc,char * argv[])540 main(int argc, char *argv[]) {
541 	char *buffer;
542 	int s;
543 	unsigned long len, bsize;
544 
545 	source = stdin;
546 	if(argc > 1 && strcmp("-v", argv[1]) == 0)
547 		eprint("Simple Markup Interpreter %s (C) Chris Hutchinson\n",VERSION);
548 	else if(argc > 1 && strcmp("-h", argv[1]) == 0)
549 		eprint("Usage %s [-n] [file]\n -n escape html strictly\n",argv[0]);
550 	if(argc > 1 && strcmp("-n", argv[1]) == 0)
551 		nohtml = 1;
552 	if(argc > 1 + nohtml && strcmp("-", argv[1 + nohtml]) != 0
553 			&& !(source = fopen(argv[1 + nohtml],"r")))
554 		eprint("Cannot open file `%s`\n",argv[1 + nohtml]);
555 	bsize = 2 * BUFFERSIZE;
556 	if(!(buffer = malloc(bsize)))
557 		eprint("Malloc failed.");
558 	len = 0;
559 	while((s = fread(buffer + len, 1, BUFFERSIZE, source))) {
560 		len += s;
561 		if(BUFFERSIZE + len + 1 > bsize) {
562 			bsize += BUFFERSIZE;
563 			if(!(buffer = realloc(buffer, bsize)))
564 				eprint("Malloc failed.");
565 		}
566 	}
567 	buffer[len] = '\0';
568 	process(buffer, buffer + len, 1);
569 	fclose(source);
570 	free(buffer);
571 	return EXIT_SUCCESS;
572 }
573