xref: /openbsd/usr.bin/dig/lib/isc/lex.c (revision ac19a2a7)
1 /*
2  * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
3  *
4  * Permission to use, copy, modify, and/or distribute this software for any
5  * purpose with or without fee is hereby granted, provided that the above
6  * copyright notice and this permission notice appear in all copies.
7  *
8  * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
9  * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
10  * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
11  * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
12  * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
13  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
14  * PERFORMANCE OF THIS SOFTWARE.
15  */
16 
17 /* $Id: lex.c,v 1.15 2022/06/25 12:14:18 jsg Exp $ */
18 
19 /*! \file */
20 
21 #include <stdlib.h>
22 
23 #include <isc/buffer.h>
24 
25 #include <isc/lex.h>
26 
27 #include <errno.h>
28 #include <string.h>
29 #include <isc/util.h>
30 
31 #include "unix/errno2result.h"
32 
33 typedef struct inputsource {
34 	isc_result_t			result;
35 	int			is_file;
36 	int			need_close;
37 	int			at_eof;
38 	int			last_was_eol;
39 	isc_buffer_t *			pushback;
40 	unsigned int			ignored;
41 	void *				input;
42 	char *				name;
43 	unsigned long			line;
44 	unsigned long			saved_line;
45 	ISC_LINK(struct inputsource)	link;
46 } inputsource;
47 
48 struct isc_lex {
49 	/* Unlocked. */
50 	size_t				max_token;
51 	char *				data;
52 	unsigned int			comments;
53 	int			comment_ok;
54 	int			last_was_eol;
55 	unsigned int			paren_count;
56 	unsigned int			saved_paren_count;
57 	isc_lexspecials_t		specials;
58 	LIST(struct inputsource)	sources;
59 };
60 
61 static inline isc_result_t
grow_data(isc_lex_t * lex,size_t * remainingp,char ** currp,char ** prevp)62 grow_data(isc_lex_t *lex, size_t *remainingp, char **currp, char **prevp) {
63 	char *tmp;
64 
65 	tmp = malloc(lex->max_token * 2 + 1);
66 	if (tmp == NULL)
67 		return (ISC_R_NOMEMORY);
68 	memmove(tmp, lex->data, lex->max_token + 1);
69 	*currp = tmp + (*currp - lex->data);
70 	if (*prevp != NULL)
71 		*prevp = tmp + (*prevp - lex->data);
72 	free(lex->data);
73 	lex->data = tmp;
74 	*remainingp += lex->max_token;
75 	lex->max_token *= 2;
76 	return (ISC_R_SUCCESS);
77 }
78 
79 isc_result_t
isc_lex_create(size_t max_token,isc_lex_t ** lexp)80 isc_lex_create(size_t max_token, isc_lex_t **lexp) {
81 	isc_lex_t *lex;
82 
83 	/*
84 	 * Create a lexer.
85 	 */
86 	REQUIRE(lexp != NULL && *lexp == NULL);
87 
88 	if (max_token == 0U)
89 		max_token = 1;
90 
91 	lex = malloc(sizeof(*lex));
92 	if (lex == NULL)
93 		return (ISC_R_NOMEMORY);
94 	lex->data = malloc(max_token + 1);
95 	if (lex->data == NULL) {
96 		free(lex);
97 		return (ISC_R_NOMEMORY);
98 	}
99 	lex->max_token = max_token;
100 	lex->comments = 0;
101 	lex->comment_ok = 1;
102 	lex->last_was_eol = 1;
103 	lex->paren_count = 0;
104 	lex->saved_paren_count = 0;
105 	memset(lex->specials, 0, 256);
106 	INIT_LIST(lex->sources);
107 
108 	*lexp = lex;
109 
110 	return (ISC_R_SUCCESS);
111 }
112 
113 void
isc_lex_destroy(isc_lex_t ** lexp)114 isc_lex_destroy(isc_lex_t **lexp) {
115 	isc_lex_t *lex;
116 
117 	/*
118 	 * Destroy the lexer.
119 	 */
120 
121 	REQUIRE(lexp != NULL);
122 	lex = *lexp;
123 
124 	while (!EMPTY(lex->sources))
125 		RUNTIME_CHECK(isc_lex_close(lex) == ISC_R_SUCCESS);
126 	if (lex->data != NULL)
127 		free(lex->data);
128 	free(lex);
129 
130 	*lexp = NULL;
131 }
132 
133 void
isc_lex_setcomments(isc_lex_t * lex,unsigned int comments)134 isc_lex_setcomments(isc_lex_t *lex, unsigned int comments) {
135 	/*
136 	 * Set allowed lexer commenting styles.
137 	 */
138 
139 	lex->comments = comments;
140 }
141 
142 void
isc_lex_setspecials(isc_lex_t * lex,isc_lexspecials_t specials)143 isc_lex_setspecials(isc_lex_t *lex, isc_lexspecials_t specials) {
144 	/*
145 	 * The characters in 'specials' are returned as tokens.  Along with
146 	 * whitespace, they delimit strings and numbers.
147 	 */
148 
149 	memmove(lex->specials, specials, 256);
150 }
151 
152 static inline isc_result_t
new_source(isc_lex_t * lex,int is_file,int need_close,void * input,const char * name)153 new_source(isc_lex_t *lex, int is_file, int need_close,
154 	   void *input, const char *name)
155 {
156 	inputsource *source;
157 	isc_result_t result;
158 
159 	source = malloc(sizeof(*source));
160 	if (source == NULL)
161 		return (ISC_R_NOMEMORY);
162 	source->result = ISC_R_SUCCESS;
163 	source->is_file = is_file;
164 	source->need_close = need_close;
165 	source->at_eof = 0;
166 	source->last_was_eol = lex->last_was_eol;
167 	source->input = input;
168 	source->name = strdup(name);
169 	if (source->name == NULL) {
170 		free(source);
171 		return (ISC_R_NOMEMORY);
172 	}
173 	source->pushback = NULL;
174 	result = isc_buffer_allocate(&source->pushback,
175 				     (unsigned int)lex->max_token);
176 	if (result != ISC_R_SUCCESS) {
177 		free(source->name);
178 		free(source);
179 		return (result);
180 	}
181 	source->ignored = 0;
182 	source->line = 1;
183 	ISC_LIST_INITANDPREPEND(lex->sources, source, link);
184 
185 	return (ISC_R_SUCCESS);
186 }
187 
188 isc_result_t
isc_lex_openfile(isc_lex_t * lex,const char * filename)189 isc_lex_openfile(isc_lex_t *lex, const char *filename) {
190 	isc_result_t result = ISC_R_SUCCESS;
191 	FILE *stream = NULL;
192 
193 	/*
194 	 * Open 'filename' and make it the current input source for 'lex'.
195 	 */
196 
197 	if ((stream = fopen(filename, "r")) == NULL)
198 		return (isc__errno2result(errno));
199 
200 	result = new_source(lex, 1, 1, stream, filename);
201 	if (result != ISC_R_SUCCESS)
202 		(void)fclose(stream);
203 	return (result);
204 }
205 
206 isc_result_t
isc_lex_close(isc_lex_t * lex)207 isc_lex_close(isc_lex_t *lex) {
208 	inputsource *source;
209 
210 	/*
211 	 * Close the most recently opened object (i.e. file or buffer).
212 	 */
213 
214 	source = HEAD(lex->sources);
215 	if (source == NULL)
216 		return (ISC_R_NOMORE);
217 
218 	ISC_LIST_UNLINK(lex->sources, source, link);
219 	lex->last_was_eol = source->last_was_eol;
220 	if (source->is_file) {
221 		if (source->need_close)
222 			(void)fclose((FILE *)(source->input));
223 	}
224 	free(source->name);
225 	isc_buffer_free(&source->pushback);
226 	free(source);
227 
228 	return (ISC_R_SUCCESS);
229 }
230 
/*
 * States of the tokenizer in isc_lex_gettoken().
 */
typedef enum {
	lexstate_start,		/* no token started yet */
	lexstate_string,	/* inside an unquoted string */
	lexstate_maybecomment,	/* saw '/', waiting for '*' or '/' */
	lexstate_ccomment,	/* inside a C-style comment */
	lexstate_ccommentend,	/* saw '*' inside a C-style comment */
	lexstate_eatline,	/* discarding the rest of the line */
	lexstate_qstring	/* inside a double-quoted string */
} lexstate;
240 
241 static void
pushback(inputsource * source,int c)242 pushback(inputsource *source, int c) {
243 	REQUIRE(source->pushback->current > 0);
244 	if (c == EOF) {
245 		source->at_eof = 0;
246 		return;
247 	}
248 	source->pushback->current--;
249 	if (c == '\n')
250 		source->line--;
251 }
252 
253 static isc_result_t
pushandgrow(inputsource * source,int c)254 pushandgrow(inputsource *source, int c) {
255 	if (isc_buffer_availablelength(source->pushback) == 0) {
256 		isc_buffer_t *tbuf = NULL;
257 		unsigned int oldlen;
258 		isc_region_t used;
259 		isc_result_t result;
260 
261 		oldlen = isc_buffer_length(source->pushback);
262 		result = isc_buffer_allocate(&tbuf, oldlen * 2);
263 		if (result != ISC_R_SUCCESS)
264 			return (result);
265 		isc_buffer_usedregion(source->pushback, &used);
266 		result = isc_buffer_copyregion(tbuf, &used);
267 		INSIST(result == ISC_R_SUCCESS);
268 		tbuf->current = source->pushback->current;
269 		isc_buffer_free(&source->pushback);
270 		source->pushback = tbuf;
271 	}
272 	isc_buffer_putuint8(source->pushback, (uint8_t)c);
273 	return (ISC_R_SUCCESS);
274 }
275 
276 isc_result_t
isc_lex_gettoken(isc_lex_t * lex,unsigned int options,isc_token_t * tokenp)277 isc_lex_gettoken(isc_lex_t *lex, unsigned int options, isc_token_t *tokenp) {
278 	inputsource *source;
279 	int c;
280 	int done = 0;
281 	int no_comments = 0;
282 	int escaped = 0;
283 	lexstate state = lexstate_start;
284 	lexstate saved_state = lexstate_start;
285 	isc_buffer_t *buffer;
286 	FILE *stream;
287 	char *curr, *prev;
288 	size_t remaining;
289 	isc_result_t result;
290 
291 	/*
292 	 * Get the next token.
293 	 */
294 
295 	source = HEAD(lex->sources);
296 	REQUIRE(tokenp != NULL);
297 
298 	if (source == NULL) {
299 		if ((options & ISC_LEXOPT_NOMORE) != 0) {
300 			tokenp->type = isc_tokentype_nomore;
301 			return (ISC_R_SUCCESS);
302 		}
303 		return (ISC_R_NOMORE);
304 	}
305 
306 	if (source->result != ISC_R_SUCCESS)
307 		return (source->result);
308 
309 	lex->saved_paren_count = lex->paren_count;
310 	source->saved_line = source->line;
311 
312 	if (isc_buffer_remaininglength(source->pushback) == 0 &&
313 	    source->at_eof)
314 	{
315 		if ((options & ISC_LEXOPT_EOF) != 0) {
316 			tokenp->type = isc_tokentype_eof;
317 			return (ISC_R_SUCCESS);
318 		}
319 		return (ISC_R_EOF);
320 	}
321 
322 	isc_buffer_compact(source->pushback);
323 
324 	curr = lex->data;
325 	*curr = '\0';
326 
327 	prev = NULL;
328 	remaining = lex->max_token;
329 
330 	if (source->is_file)
331 		flockfile(source->input);
332 
333 	do {
334 		if (isc_buffer_remaininglength(source->pushback) == 0) {
335 			if (source->is_file) {
336 				stream = source->input;
337 
338 				c = getc_unlocked(stream);
339 				if (c == EOF) {
340 					if (ferror(stream)) {
341 						source->result = ISC_R_IOERROR;
342 						result = source->result;
343 						goto done;
344 					}
345 					source->at_eof = 1;
346 				}
347 			} else {
348 				buffer = source->input;
349 
350 				if (buffer->current == buffer->used) {
351 					c = EOF;
352 					source->at_eof = 1;
353 				} else {
354 					c = *((unsigned char *)buffer->base +
355 					      buffer->current);
356 					buffer->current++;
357 				}
358 			}
359 			if (c != EOF) {
360 				source->result = pushandgrow(source, c);
361 				if (source->result != ISC_R_SUCCESS) {
362 					result = source->result;
363 					goto done;
364 				}
365 			}
366 		}
367 
368 		if (!source->at_eof) {
369 			if (state == lexstate_start)
370 				/* Token has not started yet. */
371 				source->ignored =
372 				   isc_buffer_consumedlength(source->pushback);
373 			c = isc_buffer_getuint8(source->pushback);
374 		} else {
375 			c = EOF;
376 		}
377 
378 		if (c == '\n')
379 			source->line++;
380 
381 		if (lex->comment_ok && !no_comments) {
382 			if (c == '/' &&
383 				   (lex->comments &
384 				    (ISC_LEXCOMMENT_C|
385 				     ISC_LEXCOMMENT_CPLUSPLUS)) != 0) {
386 				saved_state = state;
387 				state = lexstate_maybecomment;
388 				no_comments = 1;
389 				continue;
390 			} else if (c == '#' &&
391 				   ((lex->comments & ISC_LEXCOMMENT_SHELL)
392 				    != 0)) {
393 				saved_state = state;
394 				state = lexstate_eatline;
395 				no_comments = 1;
396 				continue;
397 			}
398 		}
399 
400 	no_read:
401 		/* INSIST(c == EOF || (c >= 0 && c <= 255)); */
402 		switch (state) {
403 		case lexstate_start:
404 			if (c == EOF) {
405 				lex->last_was_eol = 0;
406 				if ((options & ISC_LEXOPT_EOF) == 0) {
407 					result = ISC_R_EOF;
408 					goto done;
409 				}
410 				tokenp->type = isc_tokentype_eof;
411 				done = 1;
412 			} else if (c == ' ' || c == '\t') {
413 				lex->last_was_eol = 0;
414 			} else if (c == '\n') {
415 				lex->last_was_eol = 1;
416 			} else if (c == '\r') {
417 				lex->last_was_eol = 0;
418 			} else if (c == '"' &&
419 				   (options & ISC_LEXOPT_QSTRING) != 0) {
420 				lex->last_was_eol = 0;
421 				no_comments = 1;
422 				state = lexstate_qstring;
423 			} else if (lex->specials[c]) {
424 				lex->last_was_eol = 0;
425 				tokenp->type = isc_tokentype_special;
426 				tokenp->value.as_char = c;
427 				done = 1;
428 			} else {
429 				lex->last_was_eol = 0;
430 				state = lexstate_string;
431 				goto no_read;
432 			}
433 			break;
434 		case lexstate_string:
435 			/*
436 			 * EOF needs to be checked before lex->specials[c]
437 			 * as lex->specials[EOF] is not a good idea.
438 			 */
439 			if (c == '\r' || c == '\n' || c == EOF ||
440 			    (!escaped &&
441 			     (c == ' ' || c == '\t' || lex->specials[c]))) {
442 				pushback(source, c);
443 				if (source->result != ISC_R_SUCCESS) {
444 					result = source->result;
445 					goto done;
446 				}
447 				tokenp->type = isc_tokentype_string;
448 				tokenp->value.as_textregion.base = lex->data;
449 				tokenp->value.as_textregion.length =
450 					(unsigned int)
451 					(lex->max_token - remaining);
452 				done = 1;
453 				continue;
454 			}
455 			if (remaining == 0U) {
456 				result = grow_data(lex, &remaining,
457 						   &curr, &prev);
458 				if (result != ISC_R_SUCCESS)
459 					goto done;
460 			}
461 			INSIST(remaining > 0U);
462 			*curr++ = c;
463 			*curr = '\0';
464 			remaining--;
465 			break;
466 		case lexstate_maybecomment:
467 			if (c == '*' &&
468 			    (lex->comments & ISC_LEXCOMMENT_C) != 0) {
469 				state = lexstate_ccomment;
470 				continue;
471 			} else if (c == '/' &&
472 			    (lex->comments & ISC_LEXCOMMENT_CPLUSPLUS) != 0) {
473 				state = lexstate_eatline;
474 				continue;
475 			}
476 			pushback(source, c);
477 			c = '/';
478 			no_comments = 0;
479 			state = saved_state;
480 			goto no_read;
481 		case lexstate_ccomment:
482 			if (c == EOF) {
483 				result = ISC_R_UNEXPECTEDEND;
484 				goto done;
485 			}
486 			if (c == '*')
487 				state = lexstate_ccommentend;
488 			break;
489 		case lexstate_ccommentend:
490 			if (c == EOF) {
491 				result = ISC_R_UNEXPECTEDEND;
492 				goto done;
493 			}
494 			if (c == '/') {
495 				/*
496 				 * C-style comments become a single space.
497 				 * We do this to ensure that a comment will
498 				 * act as a delimiter for strings and
499 				 * numbers.
500 				 */
501 				c = ' ';
502 				no_comments = 0;
503 				state = saved_state;
504 				goto no_read;
505 			} else if (c != '*')
506 				state = lexstate_ccomment;
507 			break;
508 		case lexstate_eatline:
509 			if ((c == '\n') || (c == EOF)) {
510 				no_comments = 0;
511 				state = saved_state;
512 				goto no_read;
513 			}
514 			break;
515 		case lexstate_qstring:
516 			if (c == EOF) {
517 				result = ISC_R_UNEXPECTEDEND;
518 				goto done;
519 			}
520 			if (c == '"') {
521 				if (escaped) {
522 					escaped = 0;
523 					/*
524 					 * Overwrite the preceding backslash.
525 					 */
526 					INSIST(prev != NULL);
527 					*prev = '"';
528 				} else {
529 					tokenp->type = isc_tokentype_qstring;
530 					tokenp->value.as_textregion.base =
531 						lex->data;
532 					tokenp->value.as_textregion.length =
533 						(unsigned int)
534 						(lex->max_token - remaining);
535 					no_comments = 0;
536 					done = 1;
537 				}
538 			} else {
539 				if (c == '\n' && !escaped &&
540 			    (options & ISC_LEXOPT_QSTRINGMULTILINE) == 0) {
541 					pushback(source, c);
542 					result = ISC_R_UNBALANCEDQUOTES;
543 					goto done;
544 				}
545 				if (c == '\\' && !escaped)
546 					escaped = 1;
547 				else
548 					escaped = 0;
549 				if (remaining == 0U) {
550 					result = grow_data(lex, &remaining,
551 							   &curr, &prev);
552 					if (result != ISC_R_SUCCESS)
553 						goto done;
554 				}
555 				INSIST(remaining > 0U);
556 				prev = curr;
557 				*curr++ = c;
558 				*curr = '\0';
559 				remaining--;
560 			}
561 			break;
562 		default:
563 			FATAL_ERROR(__FILE__, __LINE__, "Unexpected state %d",
564 				    state);
565 			/* Does not return. */
566 		}
567 
568 	} while (!done);
569 
570 	result = ISC_R_SUCCESS;
571  done:
572 	if (source->is_file)
573 		funlockfile(source->input);
574 	return (result);
575 }
576 
577 void
isc_lex_ungettoken(isc_lex_t * lex,isc_token_t * tokenp)578 isc_lex_ungettoken(isc_lex_t *lex, isc_token_t *tokenp) {
579 	inputsource *source;
580 	/*
581 	 * Unget the current token.
582 	 */
583 
584 	source = HEAD(lex->sources);
585 	REQUIRE(source != NULL);
586 	REQUIRE(tokenp != NULL);
587 	REQUIRE(isc_buffer_consumedlength(source->pushback) != 0 ||
588 		tokenp->type == isc_tokentype_eof);
589 
590 	UNUSED(tokenp);
591 
592 	isc_buffer_first(source->pushback);
593 	lex->paren_count = lex->saved_paren_count;
594 	source->line = source->saved_line;
595 	source->at_eof = 0;
596 }
597 
598 void
isc_lex_getlasttokentext(isc_lex_t * lex,isc_token_t * tokenp,isc_region_t * r)599 isc_lex_getlasttokentext(isc_lex_t *lex, isc_token_t *tokenp, isc_region_t *r)
600 {
601 	inputsource *source;
602 
603 	source = HEAD(lex->sources);
604 	REQUIRE(source != NULL);
605 	REQUIRE(tokenp != NULL);
606 	REQUIRE(isc_buffer_consumedlength(source->pushback) != 0 ||
607 		tokenp->type == isc_tokentype_eof);
608 
609 	UNUSED(tokenp);
610 
611 	INSIST(source->ignored <= isc_buffer_consumedlength(source->pushback));
612 	r->base = (unsigned char *)isc_buffer_base(source->pushback) +
613 		  source->ignored;
614 	r->length = isc_buffer_consumedlength(source->pushback) -
615 		    source->ignored;
616 }
617 
618 char *
isc_lex_getsourcename(isc_lex_t * lex)619 isc_lex_getsourcename(isc_lex_t *lex) {
620 	inputsource *source;
621 
622 	source = HEAD(lex->sources);
623 
624 	if (source == NULL)
625 		return (NULL);
626 
627 	return (source->name);
628 }
629 
630 unsigned long
isc_lex_getsourceline(isc_lex_t * lex)631 isc_lex_getsourceline(isc_lex_t *lex) {
632 	inputsource *source;
633 
634 	source = HEAD(lex->sources);
635 
636 	if (source == NULL)
637 		return (0);
638 
639 	return (source->line);
640 }
641