1 /****************************************************************************
2 **
3 **  This file is part of GAP, a system for computational discrete algebra.
4 **
5 **  Copyright of GAP belongs to its developers, whose names are too numerous
6 **  to list here. Please refer to the COPYRIGHT file for details.
7 **
8 **  SPDX-License-Identifier: GPL-2.0-or-later
9 **
**  This file declares the functions of the scanner, which provides a very
**  useful abstraction, namely the concept that an input file is a stream of
**  symbols, so that such nasty things as <space>, <tab>, <newline>
**  characters or comments (they are the worst :-), characters making up
**  identifiers or digits that make up integers are hidden from the rest of
**  GAP.
15 */
16 
17 #ifndef GAP_SCANNER_H
18 #define GAP_SCANNER_H
19 
20 #include "system.h"
21 
22 /****************************************************************************
23 **
24 *V  Symbol  . . . . . . . . . . . . . . . . .  current symbol read from input
25 **
**  The field 'Symbol' of 'ScannerState' contains the current symbol read
**  from the input. It is stored as a value of type 'enum SCANNER_SYMBOLS'.
28 **
29 **  The possible values for 'Symbol' are defined in the  definition  file  of
30 **  this package as follows:
31 */
/*
 *  Symbol codes produced by the scanner.
 *
 *  Apart from 'S_ILLEGAL', every code consists of exactly one "class" bit
 *  (bit 3 or higher) combined with a small member index kept in the three
 *  least significant bits. Symbols sharing a class bit are indistinguishable
 *  for 'IS_IN', which ignores the low three bits of the set.
 *
 *  NOTE: 'S_EOF' is 1UL<<31, which is not representable as a 32-bit 'int'.
 *  ISO C before C23 requires enumerator values to fit into 'int', so this
 *  definition relies on the compiler accepting larger values as an
 *  extension.
 */
enum SCANNER_SYMBOLS {
    S_ILLEGAL       = 0UL,

    // class bit 3: identifiers and identifier-like symbols
    S_IDENT         = (1UL <<  3) | 0,
    S_UNBIND        = (1UL <<  3) | 1,
    S_ISBOUND       = (1UL <<  3) | 2,
    S_TRYNEXT       = (1UL <<  3) | 3,
    S_INFO          = (1UL <<  3) | 4,
    S_ASSERT        = (1UL <<  3) | 5,

    // class bit 4: opening brackets and braces
    S_LBRACK        = (1UL <<  4) | 0,
    S_LBRACE        = (1UL <<  4) | 1,
    S_BLBRACK       = (1UL <<  4) | 2,

    // class bit 5: closing brackets and braces
    S_RBRACK        = (1UL <<  5) | 0,
    S_RBRACE        = (1UL <<  5) | 1,

    // class bit 6: dots
    S_DOT           = (1UL <<  6) | 0,
    S_BDOT          = (1UL <<  6) | 1,

    // class bits 7 and 8: parentheses, one class each
    S_LPAREN        = (1UL <<  7),
    S_RPAREN        = (1UL <<  8),

    // class bit 9: separators and argument qualifiers
    S_COMMA         = (1UL <<  9) | 0,
    S_DOTDOT        = (1UL <<  9) | 1,
    S_COLON         = (1UL <<  9) | 2,
    S_READWRITE     = (1UL <<  9) | 3,
    S_READONLY      = (1UL <<  9) | 4,
    S_DOTDOTDOT     = (1UL <<  9) | 5,

    // class bit 10: numeric literals
    S_INT           = (1UL << 10) | 0,
    S_FLOAT         = (1UL << 10) | 1,

    // class bit 11: other literals and literal-like symbols
    S_TRUE          = (1UL << 11) | 0,
    S_FALSE         = (1UL << 11) | 1,
    S_CHAR          = (1UL << 11) | 2,
    S_STRING        = (1UL << 11) | 3,
    S_TILDE         = (1UL << 11) | 4,
    S_HELP          = (1UL << 11) | 5,
    S_PRAGMA        = (1UL << 11) | 6,

    // class bits 12 to 16: one symbol per class
    S_REC           = (1UL << 12),
    S_FUNCTION      = (1UL << 13),
    S_LOCAL         = (1UL << 14),
    S_END           = (1UL << 15),
    S_MAPTO         = (1UL << 16),

    // class bit 17: multiplicative operators
    S_MULT          = (1UL << 17) | 0,
    S_DIV           = (1UL << 17) | 1,
    S_MOD           = (1UL << 17) | 2,
    S_POW           = (1UL << 17) | 3,

    // class bit 18: additive operators
    S_PLUS          = (1UL << 18) | 0,
    S_MINUS         = (1UL << 18) | 1,

    // class bit 19: comparison operators
    S_EQ            = (1UL << 19) | 0,
    S_LT            = (1UL << 19) | 1,
    S_GT            = (1UL << 19) | 2,
    S_NE            = (1UL << 19) | 3,
    S_LE            = (1UL << 19) | 4,
    S_GE            = (1UL << 19) | 5,
    S_IN            = (1UL << 19) | 6,

    // class bit 20: logical operators
    S_NOT           = (1UL << 20) | 0,
    S_AND           = (1UL << 20) | 1,
    S_OR            = (1UL << 20) | 2,

    // class bit 21: assignment
    S_ASSIGN        = (1UL << 21),

    // class bit 22: symbols opening a compound statement
    S_IF            = (1UL << 22) | 0,
    S_FOR           = (1UL << 22) | 1,
    S_WHILE         = (1UL << 22) | 2,
    S_REPEAT        = (1UL << 22) | 3,
    S_ATOMIC        = (1UL << 22) | 4,

    // class bits 23 to 28: symbols continuing or closing a compound statement
    S_THEN          = (1UL << 23),
    S_ELIF          = (1UL << 24) | 0,
    S_ELSE          = (1UL << 24) | 1,
    S_FI            = (1UL << 25),
    S_DO            = (1UL << 26),
    S_OD            = (1UL << 27),
    S_UNTIL         = (1UL << 28),

    // class bit 29: break, return, quit and continue symbols
    S_BREAK         = (1UL << 29) | 0,
    S_RETURN        = (1UL << 29) | 1,
    S_QUIT          = (1UL << 29) | 2,
    S_QQUIT         = (1UL << 29) | 3,
    S_CONTINUE      = (1UL << 29) | 4,

    // class bit 30: semicolons
    S_SEMICOLON     = (1UL << 30) | 0,
    S_DUALSEMICOLON = (1UL << 30) | 1,

    // class bit 31: end of input
    S_EOF           = (1UL << 31),
};
123 
124 
125 /****************************************************************************
126 **
127 *T  TypSymbolSet  . . . . . . . . . . . . . . . . . . type of sets of symbols
128 **
129 **  'TypSymbolSet' is the type of sets of symbols.  Sets  of symbols are used
130 **  in the error recovery of the  parser  to specify that 'Match' should skip
131 **  all symbols until finding one in a specified set.
132 **
133 **  If there were less than 32 different symbols  things would be  very easy.
134 **  We could  simply assign   the  symbolic constants   that are the possible
135 **  values for 'Symbol' values 1, 2, 4, 8, 16, ...  and so on.  Then making a
136 **  set  would  simply mean  or-ing the  values, as in  'S_INT|S_STRING', and
137 **  checking whether a symbol is in a set would be '(<symbol> & <set>) != 0'.
138 **
139 **  There  are however more  than 32 different  symbols, so  we must  be more
140 **  clever.  We  group some  symbols that  are syntactically  equivalent like
141 **  '*', '/' in a class. We use the least significant 3 bits to differentiate
142 **  between members in one class.  And now  every symbol class, many of which
143 **  contain   just  one  symbol,  has exactly  one   of  the  remaining most
144 **  significant 29  bits  set.   Thus   sets  of symbols  are  represented as
145 **  unsigned long integers, which is typedef-ed to 'TypSymbolSet'.
146 **
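**  The classes with more than one member are as follows; all other symbols
**  are in a class by themselves:
**      S_IDENT, S_UNBIND, S_ISBOUND, S_TRYNEXT, S_INFO, S_ASSERT
**      S_LBRACK, S_LBRACE, S_BLBRACK
**      S_RBRACK, S_RBRACE
**      S_DOT, S_BDOT
**      S_COMMA, S_DOTDOT, S_COLON, S_READWRITE, S_READONLY, S_DOTDOTDOT
**      S_INT, S_FLOAT
**      S_TRUE, S_FALSE, S_CHAR, S_STRING, S_TILDE, S_HELP, S_PRAGMA
**      S_MULT, S_DIV, S_MOD, S_POW
**      S_PLUS, S_MINUS
**      S_EQ, S_LT, S_GT, S_NE, S_LE, S_GE, S_IN
**      S_NOT, S_AND, S_OR
**      S_IF, S_FOR, S_WHILE, S_REPEAT, S_ATOMIC
**      S_ELIF, S_ELSE
**      S_BREAK, S_RETURN, S_QUIT, S_QQUIT, S_CONTINUE
**      S_SEMICOLON, S_DUALSEMICOLON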
155 **
156 **  'TypSymbolSet'  is defined in the   definition  file of  this  package as
157 **  follows:
158 */
159 typedef UInt            TypSymbolSet;
160 
161 
162 /****************************************************************************
163 **
164 *F  IS_IN( <symbol>, <set> )  . . . . . . . . is a symbol in a set of symbols
165 **
**  'IS_IN' returns non-zero if the symbol <symbol> is in the symbol set
**  <set> and 0 otherwise. Due to the grouping into classes some symbol sets
**  may contain more symbols than those mentioned explicitly.
**  For example 'IS_IN(S_POW,S_MULT|S_DIV|S_MOD)' is non-zero, because
**  'S_POW' belongs to the same class as 'S_MULT', 'S_DIV' and 'S_MOD'.
170 **
171 **  'IS_IN' is defined in the definition file of this package as follows:
172 */
// Clears the three low (member index) bits of <set> and intersects the rest
// with <symbol>; the result is non-zero iff both share a class bit.
#define IS_IN(symbol, set)      (((set) & ~7) & (symbol))
174 
175 
176 /****************************************************************************
177 **
178 *V  EXPRBEGIN . . . . . . . . . . . . set of symbols that start an expression
179 *V  STATBEGIN . . . . . . . . . . . . . set of symbols that start a statement
180 **
181 **  'EXPRBEGIN'  is the  set  of  symbols   that might  start  an expression.
182 **
183 **  'STATBEGIN' is the set of symbols that might start a statement.
184 */
// Symbols that may start an expression.
#define EXPRBEGIN                                                            \
    (S_IDENT | S_ISBOUND | S_LBRACK | S_LPAREN | S_INT | S_TRUE | S_FALSE |  \
     S_CHAR | S_STRING | S_TILDE | S_REC | S_FUNCTION | S_PLUS | S_MINUS |   \
     S_NOT)

// Symbols that may start a statement.
#define STATBEGIN                                                            \
    (S_IDENT | S_UNBIND | S_IF | S_FOR | S_WHILE | S_REPEAT | S_BREAK |      \
     S_RETURN | S_QUIT | S_HELP)
191 
192 
193 /****************************************************************************
194 **
195 *T  ScannerState
196 **
197 **  The struct 'ScannerState' encapsulates the state of the scanner.
198 **
**  In the future, it is planned to allow use of multiple instances of the
**  scanner simultaneously within a single thread. However, this is not yet
**  ready, and currently only one instance of 'ScannerState' is used, which
**  is stored inside the global instance of struct 'GAPState'.
203 */
typedef struct {

/****************************************************************************
**
*V  Value . . . . . . . . . . . . . value of the identifier, float or integer
*V  ValueObj . . . . . . . . . . . . . . . . . . . . . .  value of the string
**
**  If 'Symbol' is 'S_IDENT', 'S_INT' or 'S_FLOAT' then normally the
**  field 'Value' holds the name of the identifier, or the digits of the
**  integer or float literal, as a C string. For large integer or float
**  literals that do not fit into 'Value', 'ValueObj' instead holds the
**  literal as a GAP string object. If the symbol is 'S_STRING' or 'S_HELP',
**  the string literal or help text is always stored in 'ValueObj' as a GAP
**  string object.
**
**  Note that the size of identifiers in GAP is limited to 1023 characters,
**  hence identifiers are always stored in 'Value'. For this reason,
**  'GetIdent' truncates an identifier after that many characters.
*/
    Obj    ValueObj;
    Char   Value[1024];

    // The current symbol read from the input, one of the 'S_...' codes.
    enum SCANNER_SYMBOLS Symbol;

    // Track the last three symbols, for 'Unbound global' warnings
    // NOTE(review): presumably the position within the line and the line
    // number at which each of these symbols started -- confirm against the
    // scanner implementation.
    UInt   SymbolStartPos[3];
    UInt   SymbolStartLine[3];
} ScannerState;
232 
233 /****************************************************************************
234 **
235 *V  NrError . . . . . . . . . . . . . . . .  number of errors in current expr
236 *V  NrErrLine . . . . . . . . . . . . . . .  number of errors on current line
237 **
238 **  'NrError' is an integer whose value is the number of errors already found
239 **  in the current expression.  It is set to 0 at the beginning of 'Read' and
240 **  incremented with each 'SyntaxError' call, including those  from  'Match'.
241 **
**  If 'NrError' is greater than zero the parser functions  will  not  create
**  new bags.  This prevents the parser from creating new bags after an error
**  occurred.
245 **
246 **  'NrErrLine' is an integer whose value is the number of  errors  found  on
247 **  the current line.  It is set to 0 in 'GetLine' and incremented with  each
248 **  'SyntaxError' call, including those from 'Match'.
249 **
**  If 'NrErrLine' is greater  than  zero  'SyntaxError' will  not  print  an
**  error message.  This prevents the printing of multiple error messages for
**  one line, since they  probably  just  reflect  the  fact that the  parser
**  has not resynchronized yet.
254 */
255 /* TL: extern  UInt            NrError; */
256 /* TL: extern  UInt            NrErrLine; */
257 
258 
IsIdent(char c)259 EXPORT_INLINE int IsIdent(char c)
260 {
261     return IsAlpha(c) || c == '_' || c == '@';
262 }
263 
// NOTE(review): presumably returns non-zero if the NUL-terminated string
// <str> is a reserved keyword of the GAP language and 0 otherwise; the
// definition is not part of this header -- confirm against the
// implementation.
int IsKeyword(const char * str);
265 
266 
267 /****************************************************************************
268 **
269 *F  SyntaxError( <msg> ) . . . . . . . . . . . . . . . . raise a syntax error
270 *F  SyntaxWarning( <msg> ) . . . . . . . . . . . . . . raise a syntax warning
271 **
272 **  'SyntaxError' prints the current line, followed by the error message:
273 **
274 **      ^ syntax error, <msg> in <current file name>
275 **
276 **  with the '^' pointing to the current symbol on the current line.  If  the
277 **  <current file name> is '*stdin*' it is not printed.
278 **
279 **  'SyntaxError' is called from the parser to print error messages for those
280 **  errors that are not caught by 'Match',  for example if the left hand side
281 **  of an assignment is not a variable, a list element or a record component,
282 **  or if two formal arguments of a function have the same identifier.
283 **
**  'SyntaxError' first increments 'NrError' by   1.  If 'NrError' is greater
**  than zero the parser functions  will not create  new bags.  This prevents
**  the parser from creating new bags after an error occurred.
287 **
**  'SyntaxError'  also  increments  'NrErrLine'  by  1.  If  'NrErrLine'  is
**  greater than zero  'SyntaxError' will not print an  error  message.  This
**  prevents the printing of multiple error messages for one line, since they
**  probably  just  reflect the  fact  that the parser has not resynchronized
**  yet.  'NrErrLine' is reset to 0 if a new line is read in 'GetLine'.
293 **
294 **  'SyntaxWarning' displays in the same way but does not increase 'NrError'
295 **  or 'NrErrLine'.
296 **
297 **  Note that unlike 'ErrorQuit', neither function raises an actual error,
298 **  so execution continues as normal. Thus you must make sure that subsequent
299 **  code can safely recover from the indicated error.
300 **
301 **  Both functions should only be called from the scanner or reader, but not
302 **  from e.g. the interpreter or coder, let alone any other parts of GAP.
303 **
304 **  The 'WithOffset' variants allow marking a previously parsed token as
305 **  the syntax error. This is used by 'Unbound global variable', as GAP
306 **  does not know if a variable is unbound until another 2 tokens are read.
307 **
308 */
// Variant of 'SyntaxError' that takes a token offset. <s> is the scanner
// state, <msg> the message text and <tokenoffset> selects the token to be
// marked; 'SyntaxError' passes 0.
// NOTE(review): presumably larger offsets refer to tokens parsed earlier --
// confirm against the implementation.
void SyntaxErrorWithOffset(ScannerState * s,
                           const Char *   msg,
                           Int            tokenoffset);
312 
// Variant of 'SyntaxWarning' that takes a token offset. <s> is the scanner
// state, <msg> the message text and <tokenoffset> selects the token to be
// marked; 'SyntaxWarning' passes 0.
// NOTE(review): presumably larger offsets refer to tokens parsed earlier --
// confirm against the implementation.
void SyntaxWarningWithOffset(ScannerState * s,
                             const Char *   msg,
                             Int            tokenoffset);
316 
SyntaxError(ScannerState * s,const Char * msg)317 EXPORT_INLINE void SyntaxError(ScannerState * s, const Char * msg)
318 {
319     SyntaxErrorWithOffset(s, msg, 0);
320 }
321 
SyntaxWarning(ScannerState * s,const Char * msg)322 EXPORT_INLINE void SyntaxWarning(ScannerState * s, const Char * msg)
323 {
324     SyntaxWarningWithOffset(s, msg, 0);
325 }
326 
327 
328 /****************************************************************************
329 **
330 *F  Match( <symbol>, <msg>, <skipto> )  . match current symbol and fetch next
331 **
332 **  'Match' is the main  interface between the  scanner and the  parser.   It
333 **  performs the four most common actions in the scanner with  just one call.
334 **  First it checks that  the current symbol stored  in the variable 'Symbol'
335 **  is the expected symbol  as passed in the  argument <symbol>.  If  it  is,
336 **  'Match' reads the next symbol from input  and returns.  Otherwise 'Match'
337 **  first prints the current input line followed by the syntax error message:
338 **  '^ syntax error, <msg> expected' with '^' pointing to the current symbol.
339 **  It then  skips symbols up to one  in the resynchronisation  set <skipto>.
340 **  Actually 'Match' calls 'SyntaxError' so its comments apply here too.
341 **
342 **  One kind of typical 'Match' call has the form
343 **
**      'Match( s, s->Symbol, "", 0 );'.
345 **
346 **  This is used if the parser knows that the current  symbol is correct, for
347 **  example in 'ReadReturn'  the  first symbol must be 'S_RETURN',  otherwise
348 **  'ReadReturn' would not have been called. Called this  way 'Match' will of
349 **  course never raise a syntax error, therefore <msg> and <skipto> are of no
350 **  concern.  The effect of this call is  merely to read the next symbol from
351 **  input.
352 **
353 **  Another typical 'Match' call is in 'ReadIf' after we read the if symbol
354 **  and the condition following, and now expect to see the 'then' symbol:
355 **
**      Match( s, S_THEN, "then", STATBEGIN|S_ELIF|S_ELSE|S_FI|follow );
357 **
**  If the current symbol  is 'S_THEN' it is  matched  and the next symbol is
**  read.  Otherwise 'Match'  prints the  current line followed by the  error
**  message: '^ syntax error, then expected'.  Then 'Match' skips all symbols
**  until finding either  a symbol  that can begin  a statement, an 'elif' or
**  'else' or 'fi' symbol, or a symbol that is  contained in the set <follow>
**  which is passed to 'ReadIf' and contains all symbols allowing  one of the
**  calling functions  to resynchronize,  for example 'S_OD' if 'ReadIf'  has
**  been called from 'ReadFor'. <follow> always contains 'S_EOF', which 'Read'
**  uses to resynchronise.
367 **
368 **  If 'Match' needs to  read a  new line from  '*stdin*' or '*errin*' to get
369 **  the next symbol it prints the string pointed to by 'Prompt'.
370 */
// <s> is the scanner state to operate on, <symbol> the expected symbol,
// <msg> the text used in the '<msg> expected' syntax error, and <skipto>
// the resynchronisation set of symbols at which skipping stops.
void Match(ScannerState * s,
           UInt           symbol,
           const Char *   msg,
           TypSymbolSet   skipto);
375 
376 
377 /****************************************************************************
378 **
379 *F  ScanForFloatAfterDotHACK()
380 **
**  This function is called by 'ReadLiteral' if it encounters a single dot in
**  the form of the symbol 'S_DOT'. The only legal way this could happen is
383 **  if the dot is the start of a float literal like '.123'. As the scanner
384 **  cannot detect this without being context aware, we must provide this
385 **  function to allow the reader to signal to the scanner about this.
386 */
// <s> is the scanner state to operate on.
void ScanForFloatAfterDotHACK(ScannerState * s);
388 
389 
390 /****************************************************************************
391 **
392 *F * * * * * * * * * * * * * initialize module * * * * * * * * * * * * * * *
393 */
394 
395 /****************************************************************************
396 **
397 *F  InitInfoScanner() . . . . . . . . . . . . . . . . table of init functions
398 */
// NOTE(review): presumably returns the 'StructInitInfo' record holding this
// module's table of init functions; the definition is not part of this
// header -- confirm against the implementation.
StructInitInfo * InitInfoScanner ( void );
400 
401 #endif // GAP_SCANNER_H
402