xref: /freebsd/contrib/bc/include/lex.h (revision 9768746b)
1 /*
2  * *****************************************************************************
3  *
4  * SPDX-License-Identifier: BSD-2-Clause
5  *
6  * Copyright (c) 2018-2023 Gavin D. Howard and contributors.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions are met:
10  *
11  * * Redistributions of source code must retain the above copyright notice, this
12  *   list of conditions and the following disclaimer.
13  *
14  * * Redistributions in binary form must reproduce the above copyright notice,
15  *   this list of conditions and the following disclaimer in the documentation
16  *   and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
22  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28  * POSSIBILITY OF SUCH DAMAGE.
29  *
30  * *****************************************************************************
31  *
32  * Definitions for bc's lexer.
33  *
34  */
35 
36 #ifndef BC_LEX_H
37 #define BC_LEX_H
38 
39 #include <stdbool.h>
40 #include <stddef.h>
41 
42 #include <status.h>
43 #include <vector.h>
44 #include <lang.h>
45 
/**
 * A convenience macro for throwing errors in lex code. It supplies the
 * plumbing for the caller: the line the lexer is currently on is always passed
 * along, and in debug builds (NDEBUG not defined) the source file and line of
 * the call site are passed as well.
 * @param l  The lexer.
 * @param e  The error.
 */
#ifndef NDEBUG
#define bc_lex_err(l, e) (bc_vm_handleError((e), __FILE__, __LINE__, (l)->line))
#else // NDEBUG
#define bc_lex_err(l, e) (bc_vm_handleError((e), (l)->line))
#endif // NDEBUG
57 
/**
 * A convenience macro for throwing errors in lex code when the error takes
 * extra arguments. Like bc_lex_err(), it passes the line the lexer is on (and,
 * in debug builds, the source file and line of the call site); the trailing
 * arguments are then forwarded unchanged to bc_vm_handleError(). At least one
 * extra argument must be given; use bc_lex_err() when there are none.
 * @param l    The lexer.
 * @param e    The error.
 * @param ...  The extra arguments to forward after the lexer's line.
 */
#ifndef NDEBUG
#define bc_lex_verr(l, e, ...) \
	(bc_vm_handleError((e), __FILE__, __LINE__, (l)->line, __VA_ARGS__))
#else // NDEBUG
#define bc_lex_verr(l, e, ...) (bc_vm_handleError((e), (l)->line, __VA_ARGS__))
#endif // NDEBUG
70 
// BC_LEX_NEG_CHAR evaluates to the char that means "negative" for the current
// calculator: '-' for bc and '_' for dc.
//
// BC_LEX_LAST_NUM_CHAR evaluates to the last valid char for numbers. In bc and
// dc, capital letters are part of numbers, to a point. (dc only goes up to hex,
// so its last valid char is 'F'; bc goes up to 'Z'.)
//
// When both calculators are built in, the choice is made at runtime with
// BC_IS_BC; otherwise it is fixed at compile time.
#if BC_ENABLED

#if DC_ENABLED
#define BC_LEX_NEG_CHAR (BC_IS_BC ? '-' : '_')
#define BC_LEX_LAST_NUM_CHAR (BC_IS_BC ? 'Z' : 'F')
#else // DC_ENABLED
#define BC_LEX_NEG_CHAR ('-')
#define BC_LEX_LAST_NUM_CHAR ('Z')
#endif // DC_ENABLED

#else // BC_ENABLED

#define BC_LEX_NEG_CHAR ('_')
#define BC_LEX_LAST_NUM_CHAR ('F')

#endif // BC_ENABLED
93 
/**
 * Returns true if c is a valid number character: a decimal digit, a capital
 * letter from 'A' through BC_LEX_LAST_NUM_CHAR, or the first decimal point of
 * a number that is allowed to be a non-integer.
 *
 * The argument @a c is evaluated more than once, so it must not have side
 * effects. It is converted to unsigned char before being handed to isdigit()
 * because passing a negative value (other than EOF) to the <ctype.h> functions
 * is undefined behavior, and plain char may be signed.
 * @param c         The char to check.
 * @param pt        If a decimal point has already been seen.
 * @param int_only  True if the number is expected to be an int only, false if
 *                  non-integers are allowed.
 * @return          True if @a c is a valid number character.
 */
#define BC_LEX_NUM_CHAR(c, pt, int_only)                       \
	(isdigit((unsigned char) (c)) != 0 ||                      \
	 ((c) >= 'A' && (c) <= BC_LEX_LAST_NUM_CHAR) ||            \
	 ((c) == '.' && !(pt) && !(int_only)))
105 
/// An enum of lex token types. Which enumerators exist depends on the
/// BC_ENABLED, DC_ENABLED, and BC_ENABLE_EXTRA_MATH build options. The order of
/// the enumerators is significant (see the note above the keywords); do not
/// rearrange them.
typedef enum BcLexType
{
	/// End of file.
	BC_LEX_EOF,

	/// Marker for invalid tokens, used by bc and dc for const data.
	BC_LEX_INVALID,

#if BC_ENABLED

	/// Increment operator.
	BC_LEX_OP_INC,

	/// Decrement operator.
	BC_LEX_OP_DEC,

#endif // BC_ENABLED

	/// BC_LEX_NEG is not used in lexing; it is only for parsing. The lexer
	/// marks all '-' characters as BC_LEX_OP_MINUS, but the parser needs to be
	/// able to distinguish them.
	BC_LEX_NEG,

	/// Boolean not.
	BC_LEX_OP_BOOL_NOT,

#if BC_ENABLE_EXTRA_MATH

	/// Truncation operator.
	BC_LEX_OP_TRUNC,

#endif // BC_ENABLE_EXTRA_MATH

	/// Power operator.
	BC_LEX_OP_POWER,

	/// Multiplication operator.
	BC_LEX_OP_MULTIPLY,

	/// Division operator.
	BC_LEX_OP_DIVIDE,

	/// Modulus operator.
	BC_LEX_OP_MODULUS,

	/// Addition operator.
	BC_LEX_OP_PLUS,

	/// Subtraction operator.
	BC_LEX_OP_MINUS,

#if BC_ENABLE_EXTRA_MATH

	/// Places (truncate or extend) operator.
	BC_LEX_OP_PLACES,

	/// Left (decimal) shift operator.
	BC_LEX_OP_LSHIFT,

	/// Right (decimal) shift operator.
	BC_LEX_OP_RSHIFT,

#endif // BC_ENABLE_EXTRA_MATH

	/// Equal operator.
	BC_LEX_OP_REL_EQ,

	/// Less than or equal operator.
	BC_LEX_OP_REL_LE,

	/// Greater than or equal operator.
	BC_LEX_OP_REL_GE,

	/// Not equal operator.
	BC_LEX_OP_REL_NE,

	/// Less than operator.
	BC_LEX_OP_REL_LT,

	/// Greater than operator.
	BC_LEX_OP_REL_GT,

	/// Boolean or operator.
	BC_LEX_OP_BOOL_OR,

	/// Boolean and operator.
	BC_LEX_OP_BOOL_AND,

#if BC_ENABLED

	/// Power assignment operator.
	BC_LEX_OP_ASSIGN_POWER,

	/// Multiplication assignment operator.
	BC_LEX_OP_ASSIGN_MULTIPLY,

	/// Division assignment operator.
	BC_LEX_OP_ASSIGN_DIVIDE,

	/// Modulus assignment operator.
	BC_LEX_OP_ASSIGN_MODULUS,

	/// Addition assignment operator.
	BC_LEX_OP_ASSIGN_PLUS,

	/// Subtraction assignment operator.
	BC_LEX_OP_ASSIGN_MINUS,

#if BC_ENABLE_EXTRA_MATH

	/// Places (truncate or extend) assignment operator.
	BC_LEX_OP_ASSIGN_PLACES,

	/// Left (decimal) shift assignment operator.
	BC_LEX_OP_ASSIGN_LSHIFT,

	/// Right (decimal) shift assignment operator.
	BC_LEX_OP_ASSIGN_RSHIFT,

#endif // BC_ENABLE_EXTRA_MATH
#endif // BC_ENABLED

	/// Assignment operator.
	BC_LEX_OP_ASSIGN,

	/// Newline.
	BC_LEX_NLINE,

	/// Whitespace.
	BC_LEX_WHITESPACE,

	/// Left parenthesis.
	BC_LEX_LPAREN,

	/// Right parenthesis.
	BC_LEX_RPAREN,

	/// Left bracket.
	BC_LEX_LBRACKET,

	/// Comma.
	BC_LEX_COMMA,

	/// Right bracket.
	BC_LEX_RBRACKET,

	/// Left brace.
	BC_LEX_LBRACE,

	/// Semicolon.
	BC_LEX_SCOLON,

	/// Right brace.
	BC_LEX_RBRACE,

	/// String.
	BC_LEX_STR,

	/// Identifier/name.
	BC_LEX_NAME,

	/// Constant number.
	BC_LEX_NUMBER,

	// These keywords are in the order they are in for a reason. Don't change
	// the order unless you want a bunch of weird failures in the test suite.
	// In fact, almost all of these tokens are in a specific order for a reason.

#if BC_ENABLED

	/// bc auto keyword.
	BC_LEX_KW_AUTO,

	/// bc break keyword.
	BC_LEX_KW_BREAK,

	/// bc continue keyword.
	BC_LEX_KW_CONTINUE,

	/// bc define keyword.
	BC_LEX_KW_DEFINE,

	/// bc for keyword.
	BC_LEX_KW_FOR,

	/// bc if keyword.
	BC_LEX_KW_IF,

	/// bc limits keyword.
	BC_LEX_KW_LIMITS,

	/// bc return keyword.
	BC_LEX_KW_RETURN,

	/// bc while keyword.
	BC_LEX_KW_WHILE,

	/// bc halt keyword.
	BC_LEX_KW_HALT,

	/// bc last keyword.
	BC_LEX_KW_LAST,

#endif // BC_ENABLED

	/// bc ibase keyword.
	BC_LEX_KW_IBASE,

	/// bc obase keyword.
	BC_LEX_KW_OBASE,

	/// bc scale keyword.
	BC_LEX_KW_SCALE,

#if BC_ENABLE_EXTRA_MATH

	/// bc seed keyword.
	BC_LEX_KW_SEED,

#endif // BC_ENABLE_EXTRA_MATH

	/// bc length keyword.
	BC_LEX_KW_LENGTH,

	/// bc print keyword.
	BC_LEX_KW_PRINT,

	/// bc sqrt keyword.
	BC_LEX_KW_SQRT,

	/// bc abs keyword.
	BC_LEX_KW_ABS,

	/// bc is_number keyword.
	BC_LEX_KW_IS_NUMBER,

	/// bc is_string keyword.
	BC_LEX_KW_IS_STRING,

#if BC_ENABLE_EXTRA_MATH

	/// bc irand keyword.
	BC_LEX_KW_IRAND,

#endif // BC_ENABLE_EXTRA_MATH

	/// bc asciify keyword.
	BC_LEX_KW_ASCIIFY,

	/// bc modexp keyword.
	BC_LEX_KW_MODEXP,

	/// bc divmod keyword.
	BC_LEX_KW_DIVMOD,

	/// bc quit keyword.
	BC_LEX_KW_QUIT,

	/// bc read keyword.
	BC_LEX_KW_READ,

#if BC_ENABLE_EXTRA_MATH

	/// bc rand keyword.
	BC_LEX_KW_RAND,

#endif // BC_ENABLE_EXTRA_MATH

	/// bc maxibase keyword.
	BC_LEX_KW_MAXIBASE,

	/// bc maxobase keyword.
	BC_LEX_KW_MAXOBASE,

	/// bc maxscale keyword.
	BC_LEX_KW_MAXSCALE,

#if BC_ENABLE_EXTRA_MATH

	/// bc maxrand keyword.
	BC_LEX_KW_MAXRAND,

#endif // BC_ENABLE_EXTRA_MATH

	/// bc line_length keyword.
	BC_LEX_KW_LINE_LENGTH,

#if BC_ENABLED

	/// bc global_stacks keyword.
	BC_LEX_KW_GLOBAL_STACKS,

#endif // BC_ENABLED

	/// bc leading_zero keyword.
	BC_LEX_KW_LEADING_ZERO,

	/// bc stream keyword.
	BC_LEX_KW_STREAM,

	/// bc else keyword.
	BC_LEX_KW_ELSE,

#if DC_ENABLED

	/// A special token for dc to calculate equal without a register.
	BC_LEX_EQ_NO_REG,

	/// Colon (array) operator.
	BC_LEX_COLON,

	/// Execute command.
	BC_LEX_EXECUTE,

	/// Print stack command.
	BC_LEX_PRINT_STACK,

	/// Clear stack command.
	BC_LEX_CLEAR_STACK,

	/// Register stack level command.
	BC_LEX_REG_STACK_LEVEL,

	/// Main stack level command.
	BC_LEX_STACK_LEVEL,

	/// Duplicate command.
	BC_LEX_DUPLICATE,

	/// Swap (reverse) command.
	BC_LEX_SWAP,

	/// Pop (remove) command.
	BC_LEX_POP,

	/// Store ibase command.
	BC_LEX_STORE_IBASE,

	/// Store obase command.
	BC_LEX_STORE_OBASE,

	/// Store scale command.
	BC_LEX_STORE_SCALE,

#if BC_ENABLE_EXTRA_MATH

	/// Store seed command.
	BC_LEX_STORE_SEED,

#endif // BC_ENABLE_EXTRA_MATH

	/// Load variable onto stack command.
	BC_LEX_LOAD,

	/// Pop off of variable stack onto results stack command.
	BC_LEX_LOAD_POP,

	/// Push onto variable stack command.
	BC_LEX_STORE_PUSH,

	/// Print with pop command.
	BC_LEX_PRINT_POP,

	/// Parameterized quit command.
	BC_LEX_NQUIT,

	/// Execution stack depth command.
	BC_LEX_EXEC_STACK_LENGTH,

	/// Scale of number command. This is needed specifically for dc because bc
	/// parses the scale function in parts.
	BC_LEX_SCALE_FACTOR,

	/// Array length command. This is needed specifically for dc because bc
	/// just reuses its length keyword.
	BC_LEX_ARRAY_LENGTH,

#endif // DC_ENABLED

} BcLexType;
487 
// Forward declaration so that BcLexNext can refer to the lexer before the
// struct itself is defined.
struct BcLex;

/**
 * The type of the function that is called when another token is needed. It is
 * mostly called by the parser.
 * @param l  The lexer.
 */
typedef void (*BcLexNext)(struct BcLex* l);
496 
497 /// The lexer.
498 typedef struct BcLex
499 {
500 	/// A pointer to the text to lex.
501 	const char* buf;
502 
503 	/// The current index into buf.
504 	size_t i;
505 
506 	/// The current line.
507 	size_t line;
508 
509 	/// The length of buf.
510 	size_t len;
511 
512 	/// The current token.
513 	BcLexType t;
514 
515 	/// The previous token.
516 	BcLexType last;
517 
518 	/// A string to store extra data for tokens. For example, the @a BC_LEX_STR
519 	/// token really needs to store the actual string, and numbers also need the
520 	/// string.
521 	BcVec str;
522 
523 	/// The mode the lexer is in.
524 	BcMode mode;
525 
526 } BcLex;
527 
528 /**
529  * Initializes a lexer.
530  * @param l  The lexer to initialize.
531  */
532 void
533 bc_lex_init(BcLex* l);
534 
535 /**
536  * Frees a lexer. This is not guarded by #ifndef NDEBUG because a separate
537  * parser is created at runtime to parse read() expressions and dc strings, and
538  * that parser needs a lexer.
539  * @param l  The lexer to free.
540  */
541 void
542 bc_lex_free(BcLex* l);
543 
544 /**
545  * Sets the filename that the lexer will be lexing.
546  * @param l     The lexer.
547  * @param file  The filename that the lexer will lex.
548  */
549 void
550 bc_lex_file(BcLex* l, const char* file);
551 
552 /**
553  * Sets the text the lexer will lex.
554  * @param l     The lexer.
555  * @param text  The text to lex.
556  * @param mode  The mode to lex in.
557  */
558 void
559 bc_lex_text(BcLex* l, const char* text, BcMode mode);
560 
561 /**
562  * Generic next function for the parser to call. It takes care of calling the
563  * correct @a BcLexNext function and consuming whitespace.
564  * @param l  The lexer.
565  */
566 void
567 bc_lex_next(BcLex* l);
568 
569 /**
570  * Lexes a line comment (one beginning with '#' and going to a newline).
571  * @param l  The lexer.
572  */
573 void
574 bc_lex_lineComment(BcLex* l);
575 
576 /**
577  * Lexes a general comment (C-style comment).
578  * @param l  The lexer.
579  */
580 void
581 bc_lex_comment(BcLex* l);
582 
583 /**
584  * Lexes whitespace, finding as much as possible.
585  * @param l  The lexer.
586  */
587 void
588 bc_lex_whitespace(BcLex* l);
589 
590 /**
591  * Lexes a number that begins with char @a start. This takes care of parsing
592  * numbers in scientific and engineering notations.
593  * @param l      The lexer.
594  * @param start  The starting char of the number. To detect a number and call
595  *               this function, the lexer had to eat the first char. It fixes
596  *               that by passing it in.
597  */
598 void
599 bc_lex_number(BcLex* l, char start);
600 
601 /**
602  * Lexes a name/identifier.
603  * @param l  The lexer.
604  */
605 void
606 bc_lex_name(BcLex* l);
607 
608 /**
609  * Lexes common whitespace characters.
610  * @param l  The lexer.
611  * @param c  The character to lex.
612  */
613 void
614 bc_lex_commonTokens(BcLex* l, char c);
615 
616 /**
617  * Throws a parse error because char @a c was invalid.
618  * @param l  The lexer.
619  * @param c  The problem character.
620  */
621 void
622 bc_lex_invalidChar(BcLex* l, char c);
623 
624 /**
625  * Reads a line from stdin and puts it into the lexer's buffer.
626  * @param l  The lexer.
627  */
628 bool
629 bc_lex_readLine(BcLex* l);
630 
631 #endif // BC_LEX_H
632