1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2021 Tobias Kortkamp <tobik@FreeBSD.org>
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 #include "config.h"
29 
30 #include <sys/param.h>
31 #include <inttypes.h>
32 #include <stdbool.h>
33 #include <stdlib.h>
34 #include <stdio.h>
35 #include <string.h>
36 
37 #include "array.h"
38 #include "flow.h"
39 #include "mem.h"
40 #include "mempool.h"
41 #include "peg.h"
42 #include "queue.h"
43 #include "set.h"
44 #include "stack.h"
45 #include "str.h"
46 #include "utf8.h"
47 
48 struct PEGError {
49 	size_t index;
50 	size_t pos;
51 	const char *rule;
52 	const char *msg;
53 };
54 
55 struct PEG {
56 	const char *const buf;
57 	const size_t len;
58 	size_t pos;
59 	size_t depth;
60 
61 	struct {
62 		struct Mempool *pool;
63 		struct Set *set;
64 		struct Queue *queue;
65 		struct Stack *pos;
66 		bool enabled;
67 	} captures;
68 
69 	bool debug;
70 	struct Array *errors;
71 	size_t error_index;
72 	struct Queue *rule_trace;
73 
74 	struct Mempool *pool;
75 };
76 
77 // Prototypes
78 DECLARE_COMPARE(compare_capture);
79 DECLARE_COMPARE(compare_error);
80 static bool peg_match_thruto(struct PEG *, const char *, const char *, bool);
81 static void peg_line_col_at_pos(struct PEG *, size_t, size_t *, size_t *);
82 static void peg_reset(struct PEG *);
83 
84 // Constants
85 static const size_t PEG_MAX_DEPTH = 10000;
86 static const size_t PEG_MAX_ERRORS = 4;
87 
88 #define MATCHER_INIT() \
89 	size_t MATCHER_INIT_captures_queue_len = queue_len(peg->captures.queue); \
90 	size_t MATCHER_INIT_rule_trace_len = queue_len(peg->rule_trace); \
91 	do { \
92 		peg->depth++; \
93 		if (peg->depth > PEG_MAX_DEPTH || peg->pos > peg->len) { \
94 			MATCHER_RETURN(0); \
95 		} \
96 		if (peg->debug) { \
97 			queue_push(peg->rule_trace, (char *)rule); \
98 		} \
99 	} while (0)
100 #define MATCHER_POP(x) \
101 do { \
102 	for (size_t i = MATCHER_INIT_captures_queue_len; i < queue_len(peg->captures.queue); i++) { \
103 		struct PEGCapture *capture = queue_dequeue(peg->captures.queue); \
104 		set_remove(peg->captures.set, capture); \
105 		mempool_release(peg->captures.pool, capture); \
106 	} \
107 	if (peg->debug) { \
108 		for (size_t i = MATCHER_INIT_rule_trace_len; i < queue_len(peg->rule_trace); i++) { \
109 			queue_dequeue(peg->rule_trace); \
110 		} \
111 	} \
112 } while(0)
113 #define MATCHER_RETURN(x) \
114 do { \
115 	peg->depth--; \
116 	if (!(x)) { \
117 		MATCHER_POP(); \
118 	} \
119 	return (x); \
120 } while (0)
121 
DEFINE_COMPARE(compare_capture,struct PEGCapture,void)122 DEFINE_COMPARE(compare_capture, struct PEGCapture, void)
123 {
124 	if (a->pos < b->pos) {
125 		return -1;
126 	} else if (a->pos > b->pos) {
127 		return 1;
128 	} else if (a->len < b->len) {
129 		return -1;
130 	} else if (a->len > b->len) {
131 		return 1;
132 	} else if (a->state < b->state) {
133 		return -1;
134 	} else if (a->state > b->state) {
135 		return 1;
136 	} else if (a->tag < b->tag) {
137 		return -1;
138 	} else if (a->tag > b->tag) {
139 		return 1;
140 	} else {
141 		return 0;
142 	}
143 }
144 
DEFINE_COMPARE(compare_error,struct PEGError,void)145 DEFINE_COMPARE(compare_error, struct PEGError, void)
146 {
147 	if (a->pos < b->pos) {
148 		return 1;
149 	} else if (a->pos > b->pos) {
150 		return -1;
151 	} else if (this != NULL && a->index < b->index) {
152 		return -1;
153 	} else if (this != NULL && a->index > b->index) {
154 		return 1;
155 	} else if (a->rule == NULL && b->rule == NULL) {
156 		return 0;
157 	} else if (a->rule == NULL) {
158 		return 1;
159 	} else if (b->rule == NULL) {
160 		return -1;
161 	} else {
162 		return strcmp(a->rule, b->rule);
163 	}
164 }
165 
166 bool
peg_match(struct PEG * peg,RuleFn rulefn,CaptureFn capture_machine,void * userdata)167 peg_match(struct PEG *peg, RuleFn rulefn, CaptureFn capture_machine, void *userdata)
168 {
169 	peg_reset(peg);
170 
171 	peg->captures.enabled = capture_machine != NULL;
172 	bool result = peg_match_rule(peg, ":main", rulefn);
173 
174 	if (capture_machine) {
175 		bool stop = false;
176 		struct PEGCapture *capture;
177 		while (!stop && (capture = queue_pop(peg->captures.queue))) {
178 			switch (capture_machine(capture, userdata)) {
179 			case PEG_CAPTURE_CONTINUE:
180 				break;
181 			case PEG_CAPTURE_STOP:
182 				stop = true;
183 				break;
184 			}
185 		}
186 		if (stop) {
187 			result = false;
188 		} else {
189 			struct PEGCapture capture = {
190 				.peg = peg,
191 				.buf = peg->buf,
192 				.pos = 0,
193 				.len = peg->pos,
194 				.tag = -1,
195 				.state = 0, // Accept state
196 			};
197 			capture_machine(&capture, userdata);
198 		}
199 	}
200 
201 	peg_reset(peg);
202 
203 	if (peg->debug && queue_len(peg->rule_trace) > 0) {
204 		char *rule = queue_pop(peg->rule_trace);
205 		printf("%s", rule);
206 		while ((rule = queue_pop(peg->rule_trace))) {
207 			printf(" -> %s", rule);
208 		}
209 		printf(" -> end@%zu\n", peg->pos);
210 	}
211 
212 	return result;
213 }
214 
215 bool
peg_match_atleast(struct PEG * peg,const char * rule,RuleFn rulefn,uint32_t n)216 peg_match_atleast(struct PEG *peg, const char *rule, RuleFn rulefn, uint32_t n)
217 {
218 	MATCHER_INIT();
219 	size_t pos = peg->pos;
220 	uint32_t i;
221 	for (i = 0; ; i++) {
222 		if (!rulefn(peg)) {
223 			break;
224 		}
225 	}
226 	if (i < n) {
227 		peg->pos = pos;
228 		MATCHER_RETURN(false);
229 	}
230 	MATCHER_RETURN(true);
231 }
232 
233 bool
peg_match_between(struct PEG * peg,const char * rule,RuleFn rulefn,uint32_t a,uint32_t b)234 peg_match_between(struct PEG *peg, const char *rule, RuleFn rulefn, uint32_t a, uint32_t b)
235 {
236 	MATCHER_INIT();
237 	size_t pos = peg->pos;
238 	uint32_t i;
239 	for (i = 0; i <= b; i++) {
240 		if (!rulefn(peg)) {
241 			break;
242 		}
243 	}
244 	if (i >= a && i <= b) {
245 		MATCHER_RETURN(true);
246 	} else {
247 		peg->pos = pos;
248 		MATCHER_RETURN(false);
249 	}
250 }
251 
252 bool
peg_match_capture_start(struct PEG * peg)253 peg_match_capture_start(struct PEG *peg)
254 {
255 	if (peg->captures.enabled) {
256 		stack_push(peg->captures.pos, (void *)(uintptr_t)peg->pos);
257 	}
258 	return true;
259 }
260 
261 bool
peg_match_capture_end(struct PEG * peg,uint32_t tag,uint32_t state,bool retval)262 peg_match_capture_end(struct PEG *peg, uint32_t tag, uint32_t state, bool retval)
263 {
264 	if (peg->captures.enabled && stack_len(peg->captures.pos) > 0) {
265 		size_t start = (size_t)stack_pop(peg->captures.pos);
266 		if (retval) {
267 			size_t len = peg->pos - start;
268 			struct PEGCapture c = { .tag = tag, .state = state, .pos = start, .len = len };
269 			if (!set_get(peg->captures.set, &c)) {
270 				struct PEGCapture *capture = mempool_alloc(peg->captures.pool, sizeof(struct PEGCapture));
271 				capture->tag = tag;
272 				capture->state = state;
273 				capture->buf = peg->buf + start;
274 				capture->pos = start;
275 				capture->len = len;
276 				capture->peg = peg;
277 				queue_push(peg->captures.queue, capture);
278 				set_add(peg->captures.set, capture);
279 			}
280 		}
281 	}
282 	return retval;
283 }
284 
285 bool
peg_match_char_f(struct PEG * peg,const char * rule,int (* f)(int))286 peg_match_char_f(struct PEG *peg, const char *rule, int (*f)(int))
287 {
288 	MATCHER_INIT();
289 	if (peg->pos >= peg->len) {
290 		MATCHER_RETURN(false);
291 	}
292 	if (f((unsigned char)peg->buf[peg->pos])) {
293 		peg->pos++;
294 		MATCHER_RETURN(true);
295 	} else {
296 		MATCHER_RETURN(false);
297 	}
298 }
299 
300 bool
peg_match_chars(struct PEG * peg,const char * rule,uint32_t chars[],size_t len)301 peg_match_chars(struct PEG *peg, const char *rule, uint32_t chars[], size_t len)
302 {
303 	MATCHER_INIT();
304 	for (size_t i = 0; i < len; i++) {
305 		char needle[UTF_SIZE + 1];
306 		if (!utf8_encode(chars[i], needle)) {
307 			MATCHER_RETURN(false);
308 		}
309 		size_t len = strlen(needle);
310 		if ((peg->len - peg->pos) >= len &&
311 		    strncmp(peg->buf + peg->pos, needle, len) == 0) {
312 			peg->pos += len;
313 			MATCHER_RETURN(true);
314 		}
315 	}
316 	MATCHER_RETURN(false);
317 }
318 
319 bool
peg_match_eos(struct PEG * peg,const char * rule)320 peg_match_eos(struct PEG *peg, const char *rule)
321 {
322 	MATCHER_INIT();
323 	MATCHER_RETURN(peg->pos == peg->len);
324 }
325 
326 bool
peg_match_error(struct PEG * peg,const char * rule,const char * msg)327 peg_match_error(struct PEG *peg, const char *rule, const char *msg)
328 {
329 	struct PEGError key = { .rule = rule, .msg = msg, .pos = peg->pos };
330 	if (array_find(peg->errors, &key, compare_error, &key) == -1) {
331 		struct PEGError *err = array_get(peg->errors, PEG_MAX_ERRORS - 1);
332 		err->index = peg->error_index++;
333 		err->rule = rule;
334 		err->msg = msg;
335 		err->pos = peg->pos;
336 		array_sort(peg->errors, compare_error, NULL);
337 	}
338 	return false;
339 }
340 
341 bool
peg_match_lookahead(struct PEG * peg,const char * rule,RuleFn rulefn)342 peg_match_lookahead(struct PEG *peg, const char *rule, RuleFn rulefn)
343 {
344 	MATCHER_INIT();
345 	size_t pos = peg->pos;
346 	if (rulefn(peg)) {
347 		peg->pos = pos;
348 		MATCHER_POP();
349 		return true;
350 	} else {
351 		peg->pos = pos;
352 		MATCHER_RETURN(false);
353 	}
354 }
355 
356 bool
peg_match_range(struct PEG * peg,const char * rule,uint32_t a,uint32_t b)357 peg_match_range(struct PEG *peg, const char *rule, uint32_t a, uint32_t b)
358 {
359 	MATCHER_INIT();
360 	if (peg->pos >= peg->len) {
361 		MATCHER_RETURN(false);
362 	}
363 
364 	uint32_t c;
365 	size_t clen = utf8_decode(peg->buf + peg->pos, peg->len - peg->pos, &c);
366 	if (UTF_INVALID == clen || a > c || c > b) {
367 		MATCHER_RETURN(false);
368 	}
369 	peg->pos += clen;
370 	MATCHER_RETURN(true);
371 }
372 
373 bool
peg_match_repeat(struct PEG * peg,const char * rule,RuleFn rulefn,uint32_t n)374 peg_match_repeat(struct PEG *peg, const char *rule, RuleFn rulefn, uint32_t n)
375 {
376 	MATCHER_INIT();
377 	size_t pos = peg->pos;
378 	for (uint32_t i = 0; i < n; i++) {
379 		if (!rulefn(peg)) {
380 			peg->pos = pos;
381 			MATCHER_RETURN(false);
382 		}
383 	}
384 	MATCHER_RETURN(true);
385 }
386 
387 bool
peg_match_rule(struct PEG * peg,const char * rule,RuleFn rulefn)388 peg_match_rule(struct PEG *peg, const char *rule, RuleFn rulefn)
389 {
390 	MATCHER_INIT();
391 	size_t pos = peg->pos;
392 	if (rulefn(peg)) {
393 		MATCHER_RETURN(true);
394 	} else {
395 		peg->pos = pos;
396 		MATCHER_RETURN(false);
397 	}
398 }
399 
400 bool
peg_match_string(struct PEG * peg,const char * rule,const char * needle[],size_t needlelen)401 peg_match_string(struct PEG *peg, const char *rule, const char *needle[], size_t needlelen)
402 {
403 	MATCHER_INIT();
404 	for (size_t i = 0; i < needlelen; i++) {
405 		size_t len = strlen(needle[i]);
406 		if ((peg->len - peg->pos) >= len &&
407 		    strncmp(peg->buf + peg->pos, needle[i], len) == 0) {
408 			peg->pos += len;
409 			MATCHER_RETURN(true);
410 		}
411 	}
412 	MATCHER_RETURN(false);
413 }
414 
415 bool
peg_match_thruto(struct PEG * peg,const char * rule,const char * needle,bool thru)416 peg_match_thruto(struct PEG *peg, const char *rule, const char *needle, bool thru)
417 {
418 	MATCHER_INIT();
419 	if (strcmp(needle, "") == 0) {
420 		MATCHER_RETURN(false);
421 	}
422 
423 	char *ptr = strnstr(peg->buf + peg->pos, needle, peg->len);
424 	if (ptr == NULL) {
425 		MATCHER_RETURN(true);
426 	}
427 
428 	size_t offset = 0;
429 	if (thru) {
430 		offset = strlen(needle);
431 	}
432 	size_t len = ptr - (peg->buf + peg->pos) + offset;
433 	peg->pos += len;
434 
435 	MATCHER_RETURN(true);
436 }
437 
438 bool
peg_match_thru(struct PEG * peg,const char * rule,const char * needle)439 peg_match_thru(struct PEG *peg, const char *rule, const char *needle)
440 {
441 	return peg_match_thruto(peg, rule, needle, 1);
442 }
443 
444 bool
peg_match_to(struct PEG * peg,const char * rule,const char * needle)445 peg_match_to(struct PEG *peg, const char *rule, const char *needle)
446 {
447 	return peg_match_thruto(peg, rule, needle, 0);
448 }
449 
450 void
peg_line_col_at_pos(struct PEG * peg,size_t pos,size_t * line,size_t * col)451 peg_line_col_at_pos(struct PEG *peg, size_t pos, size_t *line, size_t *col)
452 {
453 	*line = 1;
454 	*col = 1;
455 	size_t len = MIN(pos, peg->len);
456 	for (size_t i = 0; i < len;) {
457 		uint32_t c;
458 		size_t clen = utf8_decode(peg->buf + i, peg->len - i, &c);
459 		if (UTF_INVALID == clen) {
460 			*line = 1;
461 			*col = i;
462 			return;
463 		}
464 		if (c == '\n') {
465 			(*line)++;
466 			*col = 1;
467 		} else {
468 			(*col)++;
469 		}
470 		i += clen;
471 	}
472 }
473 
474 struct Array *
peg_backtrace(struct PEG * peg,struct Mempool * pool)475 peg_backtrace(struct PEG *peg, struct Mempool *pool)
476 {
477 	struct Array *errors = mempool_array(pool);
478 	ARRAY_FOREACH(peg->errors, struct PEGError *, err) {
479 		if (err->rule == NULL) {
480 			break;
481 		}
482 		size_t line;
483 		size_t col;
484 		peg_line_col_at_pos(peg, err->pos, &line, &col);
485 		char *buf;
486 		if (!err->msg || strcmp(err->msg, "") == 0) {
487 			buf = str_printf(pool, "%zu:%zu: in %s", line, col, err->rule);
488 		} else {
489 			buf = str_printf(pool, "%zu:%zu: in %s: %s", line, col, err->rule, err->msg);
490 		}
491 		array_append(errors, buf);
492 	}
493 	return errors;
494 }
495 
496 struct PEG *
peg_new(const char * const buf,size_t len)497 peg_new(const char *const buf, size_t len)
498 {
499 	struct PEG proto = {
500 		.buf = buf,
501 		.len = len,
502 	};
503 	struct PEG *peg = xmalloc(sizeof(struct PEG));
504 	memcpy(peg, &proto, sizeof(*peg));
505 
506 	peg->pool = mempool_new();
507 	mempool_take(peg->pool, peg);
508 
509 	peg->errors = mempool_array(peg->pool);
510 	for (size_t i = 0; i < PEG_MAX_ERRORS; i++) {
511 		array_append(peg->errors, mempool_alloc(peg->pool, sizeof(struct PEGError)));
512 	}
513 
514 	peg->debug = getenv("LIBIAS_PEG_DEBUG") != NULL;
515 	peg->rule_trace = mempool_queue(peg->pool);
516 
517 	peg->captures.pool = mempool_pool(peg->pool);
518 	peg->captures.set = mempool_set(peg->pool, compare_capture, peg);
519 	peg->captures.queue = mempool_queue(peg->pool);
520 	peg->captures.pos = mempool_stack(peg->pool);
521 
522 	return peg;
523 }
524 
525 void
peg_reset(struct PEG * peg)526 peg_reset(struct PEG *peg)
527 {
528 	peg->pos = 0;
529 	peg->depth = 0;
530 	peg->error_index = 0;
531 
532 	mempool_release_all(peg->captures.pool);
533 	stack_truncate(peg->captures.pos);
534 	queue_truncate(peg->captures.queue);
535 	set_truncate(peg->captures.set);
536 	queue_truncate(peg->rule_trace);
537 }
538 
539 void
peg_free(struct PEG * peg)540 peg_free(struct PEG *peg)
541 {
542 	if (peg == NULL) {
543 		return;
544 	}
545 	mempool_free(peg->pool);
546 }
547