1 /****************************************************************************
2 **
3 ** This file is part of GAP, a system for computational discrete algebra.
4 **
5 ** Copyright of GAP belongs to its developers, whose names are too numerous
6 ** to list here. Please refer to the COPYRIGHT file for details.
7 **
8 ** SPDX-License-Identifier: GPL-2.0-or-later
9 **
** This file declares the functions of the scanner, which provides an
** abstraction, namely the concept that an input file is a stream of
** symbols. Such nasty things as <space>, <tab>, <newline> characters or
** comments (they are worst :-), characters making up identifiers or digits
** that make up integers are hidden from the rest of GAP.
15 */
16
17 #ifndef GAP_SCANNER_H
18 #define GAP_SCANNER_H
19
20 #include "system.h"
21
22 /****************************************************************************
23 **
24 *V Symbol . . . . . . . . . . . . . . . . . current symbol read from input
25 **
** The variable 'Symbol' contains the current symbol read from the input.
** It is stored as a value of the enumeration type 'enum SCANNER_SYMBOLS'.
28 **
29 ** The possible values for 'Symbol' are defined in the definition file of
30 ** this package as follows:
31 */
// Each symbol class owns exactly one bit at position 3 or above; the three
// least significant bits number the members inside a class. Members are
// therefore written here as "first symbol of the class + index".
enum SCANNER_SYMBOLS {
    S_ILLEGAL       = 0UL,

    // class bit 3
    S_IDENT         = 1UL << 3,
    S_UNBIND        = S_IDENT + 1,
    S_ISBOUND       = S_IDENT + 2,
    S_TRYNEXT       = S_IDENT + 3,
    S_INFO          = S_IDENT + 4,
    S_ASSERT        = S_IDENT + 5,

    // class bit 4
    S_LBRACK        = 1UL << 4,
    S_LBRACE        = S_LBRACK + 1,
    S_BLBRACK       = S_LBRACK + 2,

    // class bit 5
    S_RBRACK        = 1UL << 5,
    S_RBRACE        = S_RBRACK + 1,

    // class bit 6
    S_DOT           = 1UL << 6,
    S_BDOT          = S_DOT + 1,

    // class bits 7 and 8
    S_LPAREN        = 1UL << 7,
    S_RPAREN        = 1UL << 8,

    // class bit 9
    S_COMMA         = 1UL << 9,
    S_DOTDOT        = S_COMMA + 1,
    S_COLON         = S_COMMA + 2,
    S_READWRITE     = S_COMMA + 3,
    S_READONLY      = S_COMMA + 4,
    S_DOTDOTDOT     = S_COMMA + 5,

    // class bit 10
    S_INT           = 1UL << 10,
    S_FLOAT         = S_INT + 1,

    // class bit 11
    S_TRUE          = 1UL << 11,
    S_FALSE         = S_TRUE + 1,
    S_CHAR          = S_TRUE + 2,
    S_STRING        = S_TRUE + 3,
    S_TILDE         = S_TRUE + 4,
    S_HELP          = S_TRUE + 5,
    S_PRAGMA        = S_TRUE + 6,

    // class bits 12 to 16
    S_REC           = 1UL << 12,
    S_FUNCTION      = 1UL << 13,
    S_LOCAL         = 1UL << 14,
    S_END           = 1UL << 15,
    S_MAPTO         = 1UL << 16,

    // class bit 17
    S_MULT          = 1UL << 17,
    S_DIV           = S_MULT + 1,
    S_MOD           = S_MULT + 2,
    S_POW           = S_MULT + 3,

    // class bit 18
    S_PLUS          = 1UL << 18,
    S_MINUS         = S_PLUS + 1,

    // class bit 19
    S_EQ            = 1UL << 19,
    S_LT            = S_EQ + 1,
    S_GT            = S_EQ + 2,
    S_NE            = S_EQ + 3,
    S_LE            = S_EQ + 4,
    S_GE            = S_EQ + 5,
    S_IN            = S_EQ + 6,

    // class bit 20
    S_NOT           = 1UL << 20,
    S_AND           = S_NOT + 1,
    S_OR            = S_NOT + 2,

    // class bit 21
    S_ASSIGN        = 1UL << 21,

    // class bit 22
    S_IF            = 1UL << 22,
    S_FOR           = S_IF + 1,
    S_WHILE         = S_IF + 2,
    S_REPEAT        = S_IF + 3,
    S_ATOMIC        = S_IF + 4,

    // class bit 23
    S_THEN          = 1UL << 23,

    // class bit 24
    S_ELIF          = 1UL << 24,
    S_ELSE          = S_ELIF + 1,

    // class bits 25 to 28
    S_FI            = 1UL << 25,
    S_DO            = 1UL << 26,
    S_OD            = 1UL << 27,
    S_UNTIL         = 1UL << 28,

    // class bit 29
    S_BREAK         = 1UL << 29,
    S_RETURN        = S_BREAK + 1,
    S_QUIT          = S_BREAK + 2,
    S_QQUIT         = S_BREAK + 3,
    S_CONTINUE      = S_BREAK + 4,

    // class bit 30
    S_SEMICOLON     = 1UL << 30,
    S_DUALSEMICOLON = S_SEMICOLON + 1,

    // class bit 31
    S_EOF           = 1UL << 31,
};
123
124
125 /****************************************************************************
126 **
127 *T TypSymbolSet . . . . . . . . . . . . . . . . . . type of sets of symbols
128 **
129 ** 'TypSymbolSet' is the type of sets of symbols. Sets of symbols are used
130 ** in the error recovery of the parser to specify that 'Match' should skip
131 ** all symbols until finding one in a specified set.
132 **
** If there were fewer than 32 different symbols things would be very easy.
134 ** We could simply assign the symbolic constants that are the possible
135 ** values for 'Symbol' values 1, 2, 4, 8, 16, ... and so on. Then making a
136 ** set would simply mean or-ing the values, as in 'S_INT|S_STRING', and
137 ** checking whether a symbol is in a set would be '(<symbol> & <set>) != 0'.
138 **
139 ** There are however more than 32 different symbols, so we must be more
140 ** clever. We group some symbols that are syntactically equivalent like
141 ** '*', '/' in a class. We use the least significant 3 bits to differentiate
142 ** between members in one class. And now every symbol class, many of which
143 ** contain just one symbol, has exactly one of the remaining most
144 ** significant 29 bits set. Thus sets of symbols are represented as
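** unsigned integers ('UInt'), which is typedef-ed to 'TypSymbolSet'.
**
** The classes with more than one member are as follows; every other symbol
** is in a class by itself:
**   S_IDENT, S_UNBIND, S_ISBOUND, S_TRYNEXT, S_INFO, S_ASSERT
**   S_LBRACK, S_LBRACE, S_BLBRACK
**   S_RBRACK, S_RBRACE
**   S_DOT, S_BDOT
**   S_COMMA, S_DOTDOT, S_COLON, S_READWRITE, S_READONLY, S_DOTDOTDOT
**   S_INT, S_FLOAT
**   S_TRUE, S_FALSE, S_CHAR, S_STRING, S_TILDE, S_HELP, S_PRAGMA
**   S_MULT, S_DIV, S_MOD, S_POW
**   S_PLUS, S_MINUS
**   S_EQ, S_LT, S_GT, S_NE, S_LE, S_GE, S_IN
**   S_NOT, S_AND, S_OR
**   S_IF, S_FOR, S_WHILE, S_REPEAT, S_ATOMIC
**   S_ELIF, S_ELSE
**   S_BREAK, S_RETURN, S_QUIT, S_QQUIT, S_CONTINUE
**   S_SEMICOLON, S_DUALSEMICOLON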
155 **
156 ** 'TypSymbolSet' is defined in the definition file of this package as
157 ** follows:
158 */
// A set of scanner symbols, stored as a bit mask in the project integer
// type 'UInt'; sets are built by or-ing 'S_...' symbol values together.
typedef UInt TypSymbolSet;
160
161
162 /****************************************************************************
163 **
164 *F IS_IN( <symbol>, <set> ) . . . . . . . . is a symbol in a set of symbols
165 **
166 ** 'IS_IN' returns non-zero if the symbol <symbol> is in the symbol set
167 ** <set> and 0 otherwise. Due to the grouping into classes some symbol sets
168 ** may contain more than mentioned.
169 ** For example 'IS_IN(S_POW,S_MULT|S_DIV|S_MOD)' is 1.
170 **
171 ** 'IS_IN' is defined in the definition file of this package as follows:
172 */
// Non-zero iff <symbol> shares a class bit with <set>. The three least
// significant bits of <set> only number members inside a class, so they are
// masked out before the comparison. Each argument is evaluated exactly once.
#define IS_IN(symbol,set)       (((set) & ~7) & (symbol))
174
175
176 /****************************************************************************
177 **
178 *V EXPRBEGIN . . . . . . . . . . . . set of symbols that start an expression
179 *V STATBEGIN . . . . . . . . . . . . . set of symbols that start a statement
180 **
181 ** 'EXPRBEGIN' is the set of symbols that might start an expression.
182 **
183 ** 'STATBEGIN' is the set of symbols that might start a statement.
184 */
// Set of symbols that might start an expression. The operands are grouped
// for readability only; '|' is commutative, so the order does not matter.
#define EXPRBEGIN \
    (S_IDENT | S_ISBOUND | S_LPAREN | S_LBRACK | S_REC | S_FUNCTION \
     | S_INT | S_CHAR | S_STRING | S_TRUE | S_FALSE | S_TILDE \
     | S_PLUS | S_MINUS | S_NOT)
188
// Set of symbols that might start a statement. The operands are grouped
// for readability only; '|' is commutative, so the order does not matter.
#define STATBEGIN \
    (S_IDENT | S_UNBIND | S_HELP \
     | S_IF | S_FOR | S_WHILE | S_REPEAT \
     | S_BREAK | S_RETURN | S_QUIT)
191
192
193 /****************************************************************************
194 **
195 *T ScannerState
196 **
197 ** The struct 'ScannerState' encapsulates the state of the scanner.
198 **
** In the future, it is planned to allow use of multiple instances of the
** scanner simultaneously within a single thread. However, this is not yet
** ready, and currently only one instance of 'ScannerState' is used, which
** is stored inside the global instance of struct 'GAPState'.
203 */
// Scanner state: the current symbol, the value belonging to it, and the
// start positions of the most recently read symbols.
typedef struct {

/****************************************************************************
**
*V Value . . . . . . . . . . . . . value of the identifier, float or integer
*V ValueObj . . . . . . . . . . . . . . . . . . . . . . value of the string
**
** If 'Symbol' is 'S_IDENT', 'S_INT' or 'S_FLOAT' then normally the
** variable 'Value' holds the name of the identifier, the digits of
** the integer or float literal as a C string. For large integer or float
** literals that do not fit into 'Value', instead 'ValueObj'
** holds the literal as a GAP string object. If the symbol is 'S_STRING'
** or 'S_HELP', the string literal or help text is always stored in
** 'ValueObj' as a GAP string object.
**
** Note that the size of identifiers in GAP is limited to 1023 characters,
** hence identifiers are always stored in 'Value'. For this reason,
** 'GetIdent' truncates an identifier after that many characters.
*/
    Obj ValueObj;
    Char Value[1024];

    // The current symbol read from the input (one of the 'S_...' values).
    enum SCANNER_SYMBOLS Symbol;

    // Track the last three symbols, for 'Unbound global' warnings.
    // NOTE(review): presumably 'Pos' is the position within the line and
    // 'Line' the line number at which each symbol starts -- confirm against
    // the scanner implementation.
    UInt SymbolStartPos[3];
    UInt SymbolStartLine[3];
} ScannerState;
232
233 /****************************************************************************
234 **
235 *V NrError . . . . . . . . . . . . . . . . number of errors in current expr
236 *V NrErrLine . . . . . . . . . . . . . . . number of errors on current line
237 **
238 ** 'NrError' is an integer whose value is the number of errors already found
239 ** in the current expression. It is set to 0 at the beginning of 'Read' and
240 ** incremented with each 'SyntaxError' call, including those from 'Match'.
241 **
** If 'NrError' is greater than zero the parser functions will not create
** new bags. This prevents the parser from creating new bags after an error
** occurred.
245 **
246 ** 'NrErrLine' is an integer whose value is the number of errors found on
247 ** the current line. It is set to 0 in 'GetLine' and incremented with each
248 ** 'SyntaxError' call, including those from 'Match'.
249 **
** If 'NrErrLine' is greater than zero 'SyntaxError' will not print an
** error message. This prevents the printing of multiple error messages for
** one line, since they probably just reflect the fact that the parser
** has not resynchronized yet.
254 */
255 /* TL: extern UInt NrError; */
256 /* TL: extern UInt NrErrLine; */
257
258
IsIdent(char c)259 EXPORT_INLINE int IsIdent(char c)
260 {
261 return IsAlpha(c) || c == '_' || c == '@';
262 }
263
// Takes a C string <str> and returns an 'int'.
// NOTE(review): presumably the result is non-zero iff <str> is a reserved
// keyword of the GAP language -- confirm against the definition.
int IsKeyword(const char * str);
265
266
267 /****************************************************************************
268 **
*F SyntaxError( <s>, <msg> ) . . . . . . . . . . . . . . raise a syntax error
*F SyntaxWarning( <s>, <msg> ) . . . . . . . . . . . . raise a syntax warning
271 **
272 ** 'SyntaxError' prints the current line, followed by the error message:
273 **
274 ** ^ syntax error, <msg> in <current file name>
275 **
276 ** with the '^' pointing to the current symbol on the current line. If the
277 ** <current file name> is '*stdin*' it is not printed.
278 **
279 ** 'SyntaxError' is called from the parser to print error messages for those
280 ** errors that are not caught by 'Match', for example if the left hand side
281 ** of an assignment is not a variable, a list element or a record component,
282 ** or if two formal arguments of a function have the same identifier.
283 **
** 'SyntaxError' first increments 'NrError' by 1. If 'NrError' is greater
** than zero the parser functions will not create new bags. This prevents
** the parser from creating new bags after an error occurred.
**
** 'SyntaxError' also increments 'NrErrLine' by 1. If 'NrErrLine' is
** greater than zero 'SyntaxError' will not print an error message. This
** prevents the printing of multiple error messages for one line, since they
** probably just reflect the fact that the parser has not resynchronized
** yet. 'NrErrLine' is reset to 0 if a new line is read in 'GetLine'.
293 **
294 ** 'SyntaxWarning' displays in the same way but does not increase 'NrError'
295 ** or 'NrErrLine'.
296 **
297 ** Note that unlike 'ErrorQuit', neither function raises an actual error,
298 ** so execution continues as normal. Thus you must make sure that subsequent
299 ** code can safely recover from the indicated error.
300 **
301 ** Both functions should only be called from the scanner or reader, but not
302 ** from e.g. the interpreter or coder, let alone any other parts of GAP.
303 **
304 ** The 'WithOffset' variants allow marking a previously parsed token as
305 ** the syntax error. This is used by 'Unbound global variable', as GAP
306 ** does not know if a variable is unbound until another 2 tokens are read.
307 **
308 */
// Variant of 'SyntaxError' that can mark a previously parsed token as the
// location of the error.
//   s           - the scanner state
//   msg         - the error message
//   tokenoffset - selects the token to mark; 'SyntaxError' passes 0, which
//                 marks the current symbol. NOTE(review): presumably larger
//                 values refer to earlier tokens -- confirm in scanner.c.
void SyntaxErrorWithOffset(ScannerState * s,
                           const Char * msg,
                           Int tokenoffset);
312
// Variant of 'SyntaxWarning' that can mark a previously parsed token as the
// location of the warning.
//   s           - the scanner state
//   msg         - the warning message
//   tokenoffset - selects the token to mark; 'SyntaxWarning' passes 0, which
//                 marks the current symbol. NOTE(review): presumably larger
//                 values refer to earlier tokens -- confirm in scanner.c.
void SyntaxWarningWithOffset(ScannerState * s,
                             const Char * msg,
                             Int tokenoffset);
316
SyntaxError(ScannerState * s,const Char * msg)317 EXPORT_INLINE void SyntaxError(ScannerState * s, const Char * msg)
318 {
319 SyntaxErrorWithOffset(s, msg, 0);
320 }
321
SyntaxWarning(ScannerState * s,const Char * msg)322 EXPORT_INLINE void SyntaxWarning(ScannerState * s, const Char * msg)
323 {
324 SyntaxWarningWithOffset(s, msg, 0);
325 }
326
327
328 /****************************************************************************
329 **
*F Match( <s>, <symbol>, <msg>, <skipto> ) match current symbol, fetch next
331 **
332 ** 'Match' is the main interface between the scanner and the parser. It
333 ** performs the four most common actions in the scanner with just one call.
334 ** First it checks that the current symbol stored in the variable 'Symbol'
335 ** is the expected symbol as passed in the argument <symbol>. If it is,
336 ** 'Match' reads the next symbol from input and returns. Otherwise 'Match'
337 ** first prints the current input line followed by the syntax error message:
338 ** '^ syntax error, <msg> expected' with '^' pointing to the current symbol.
339 ** It then skips symbols up to one in the resynchronisation set <skipto>.
340 ** Actually 'Match' calls 'SyntaxError' so its comments apply here too.
341 **
342 ** One kind of typical 'Match' call has the form
343 **
344 ** 'Match( Symbol, "", 0L );'.
345 **
346 ** This is used if the parser knows that the current symbol is correct, for
347 ** example in 'ReadReturn' the first symbol must be 'S_RETURN', otherwise
348 ** 'ReadReturn' would not have been called. Called this way 'Match' will of
349 ** course never raise a syntax error, therefore <msg> and <skipto> are of no
350 ** concern. The effect of this call is merely to read the next symbol from
351 ** input.
352 **
353 ** Another typical 'Match' call is in 'ReadIf' after we read the if symbol
354 ** and the condition following, and now expect to see the 'then' symbol:
355 **
356 ** Match( S_THEN, "then", STATBEGIN|S_ELIF|S_ELSE|S_FI|follow );
357 **
** If the current symbol is 'S_THEN' it is matched and the next symbol is
** read. Otherwise 'Match' prints the current line followed by the error
** message: '^ syntax error, then expected'. Then 'Match' skips all symbols
** until finding either a symbol that can begin a statement, an 'elif' or
** 'else' or 'fi' symbol, or a symbol that is contained in the set <follow>
** which is passed to 'ReadIf' and contains all symbols allowing one of the
** calling functions to resynchronize, for example 'S_OD' if 'ReadIf' has
** been called from 'ReadFor'. <follow> always contains 'S_EOF', which
** 'Read' uses to resynchronise.
367 **
368 ** If 'Match' needs to read a new line from '*stdin*' or '*errin*' to get
369 ** the next symbol it prints the string pointed to by 'Prompt'.
370 */
// Checks the current symbol against the expected one and fetches the next.
//   s      - the scanner state
//   symbol - the symbol the caller expects to be the current one
//   msg    - text used in the '<msg> expected' syntax error on a mismatch
//   skipto - resynchronisation set: on a mismatch, symbols are skipped up
//            to one contained in this set
void Match(ScannerState * s,
           UInt symbol,
           const Char * msg,
           TypSymbolSet skipto);
375
376
377 /****************************************************************************
378 **
379 *F ScanForFloatAfterDotHACK()
380 **
** This function is called by 'ReadLiteral' if it encounters a single dot in
** the form of the symbol 'S_DOT'. The only legal way this could happen is
** if the dot is the start of a float literal like '.123'. As the scanner
** cannot detect this without being context aware, we must provide this
** function to allow the reader to signal to the scanner about this.
386 */
// <s> is the scanner state whose current symbol is the single dot 'S_DOT'
// that the reader wants treated as the start of a float literal.
void ScanForFloatAfterDotHACK(ScannerState * s);
388
389
390 /****************************************************************************
391 **
392 *F * * * * * * * * * * * * * initialize module * * * * * * * * * * * * * * *
393 */
394
395 /****************************************************************************
396 **
397 *F InitInfoScanner() . . . . . . . . . . . . . . . . table of init functions
398 */
// Takes no arguments and returns a pointer to the 'StructInitInfo' of the
// scanner module (its table of init functions).
StructInitInfo * InitInfoScanner ( void );
400
401 #endif // GAP_SCANNER_H
402