1------------------------------------------------------------------------------
2--                                                                          --
3--                         GNAT COMPILER COMPONENTS                         --
4--                                                                          --
5--                                S C A N S                                 --
6--                                                                          --
7--                                 S p e c                                  --
8--                                                                          --
9--          Copyright (C) 1992-2014, Free Software Foundation, Inc.         --
10--                                                                          --
11-- GNAT is free software;  you can  redistribute it  and/or modify it under --
12-- terms of the  GNU General Public License as published  by the Free Soft- --
13-- ware  Foundation;  either version 3,  or (at your option) any later ver- --
14-- sion.  GNAT is distributed in the hope that it will be useful, but WITH- --
15-- OUT ANY WARRANTY;  without even the  implied warranty of MERCHANTABILITY --
16-- or FITNESS FOR A PARTICULAR PURPOSE.                                     --
17--                                                                          --
18-- As a special exception under Section 7 of GPL version 3, you are granted --
19-- additional permissions described in the GCC Runtime Library Exception,   --
20-- version 3.1, as published by the Free Software Foundation.               --
21--                                                                          --
22-- You should have received a copy of the GNU General Public License and    --
23-- a copy of the GCC Runtime Library Exception along with this program;     --
24-- see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see    --
25-- <http://www.gnu.org/licenses/>.                                          --
26--                                                                          --
27-- GNAT was originally developed  by the GNAT team at  New York University. --
28-- Extensive contributions were provided by Ada Core Technologies Inc.      --
29--                                                                          --
30------------------------------------------------------------------------------
31
32with Namet;  use Namet;
33with Types;  use Types;
34with Uintp;  use Uintp;
35with Urealp; use Urealp;
36
37package Scans is
38
39--  The scanner maintains a current state in the global variables defined
40--  in this package. The call to the Scan routine advances this state to
41--  the next token. The state is initialized by the call to one of the
42--  initialization routines in Sinput.
43
44   --  The following type is used to identify token types returned by Scan.
45   --  The class column in this table indicates the token classes which
46   --  apply to the token, as defined by subsequent subtype declarations.
47
48   --  Note: Namet.Is_Keyword_Name depends on the fact that the first entry in
49   --  this type declaration is *not* for a reserved word. For details on why
50   --  there is this requirement, see Initialize_Ada_Keywords below.
51
52   type Token_Type is (
53
54      --  Token name          Token type   Class(es)
55
56      Tok_Integer_Literal, -- numeric lit  Literal, Lit_Or_Name
57
58      Tok_Real_Literal,    -- numeric lit  Literal, Lit_Or_Name
59
60      Tok_String_Literal,  -- string lit   Literal, Lit_Or_Name
61
62      Tok_Char_Literal,    -- char lit     Name, Literal, Lit_Or_Name
63
64      Tok_Operator_Symbol, -- op symbol    Name, Literal, Lit_Or_Name, Desig
65
66      Tok_Identifier,      -- identifier   Name, Lit_Or_Name, Desig
67
68      Tok_Double_Asterisk, -- **
69
70      Tok_Ampersand,       -- &            Binary_Addop
71      Tok_Minus,           -- -            Binary_Addop, Unary_Addop
72      Tok_Plus,            -- +            Binary_Addop, Unary_Addop
73
74      Tok_Asterisk,        -- *            Mulop
75      Tok_Mod,             -- MOD          Mulop
76      Tok_Rem,             -- REM          Mulop
77      Tok_Slash,           -- /            Mulop
78
79      Tok_New,             -- NEW
80
81      Tok_Abs,             -- ABS
82      Tok_Others,          -- OTHERS
83      Tok_Null,            -- NULL
84
85      --  Note: Tok_Raise is in no categories now, it used to be Cterm, Eterm,
86      --  After_SM, but now that Ada 2012 has added raise expressions, the
87      --  raise token can appear anywhere. Note in particular that Tok_Raise
88      --  being in Eterm stopped the parser from recognizing "return raise
89      --  exception-name". This degrades error recovery slightly, and perhaps
90      --  we could do better, but not worth the effort.
91
92      Tok_Raise,           -- RAISE
93
94      Tok_Dot,             -- .            Namext
95      Tok_Apostrophe,      -- '            Namext
96
97      Tok_Left_Paren,      -- (            Namext, Consk
98
99      Tok_Delta,           -- DELTA        Atkwd, Sterm, Consk
100      Tok_Digits,          -- DIGITS       Atkwd, Sterm, Consk
101      Tok_Range,           -- RANGE        Atkwd, Sterm, Consk
102
103      Tok_Right_Paren,     -- )            Sterm
104      Tok_Comma,           -- ,            Sterm
105
106      Tok_And,             -- AND          Logop, Sterm
107      Tok_Or,              -- OR           Logop, Sterm
108      Tok_Xor,             -- XOR          Logop, Sterm
109
110      Tok_Less,            -- <            Relop, Sterm
111      Tok_Equal,           -- =            Relop, Sterm
112      Tok_Greater,         -- >            Relop, Sterm
113      Tok_Not_Equal,       -- /=           Relop, Sterm
114      Tok_Greater_Equal,   -- >=           Relop, Sterm
115      Tok_Less_Equal,      -- <=           Relop, Sterm
116
117      Tok_In,              -- IN           Relop, Sterm
118      Tok_Not,             -- NOT          Relop, Sterm
119
      --  Note: Tok_Box is the last Relop token and is not in Eterm, since
      --  Token_Class_Eterm starts at Tok_Colon_Equal.
      --  NOTE(review): confirm that excluding <> from Eterm is intended.

120      Tok_Box,             -- <>           Relop, Sterm
121      Tok_Colon_Equal,     -- :=           Eterm, Sterm
122      Tok_Colon,           -- :            Eterm, Sterm
123      Tok_Greater_Greater, -- >>           Eterm, Sterm
124
125      Tok_Abstract,        -- ABSTRACT     Eterm, Sterm
126      Tok_Access,          -- ACCESS       Eterm, Sterm
127      Tok_Aliased,         -- ALIASED      Eterm, Sterm
128      Tok_All,             -- ALL          Eterm, Sterm
129      Tok_Array,           -- ARRAY        Eterm, Sterm
130      Tok_At,              -- AT           Eterm, Sterm
131      Tok_Body,            -- BODY         Eterm, Sterm
132      Tok_Constant,        -- CONSTANT     Eterm, Sterm
133      Tok_Do,              -- DO           Eterm, Sterm
134      Tok_Is,              -- IS           Eterm, Sterm
135      Tok_Interface,       -- INTERFACE    Eterm, Sterm
136      Tok_Limited,         -- LIMITED      Eterm, Sterm
137      Tok_Of,              -- OF           Eterm, Sterm
138      Tok_Out,             -- OUT          Eterm, Sterm
139      Tok_Record,          -- RECORD       Eterm, Sterm
140      Tok_Renames,         -- RENAMES      Eterm, Sterm
141      Tok_Reverse,         -- REVERSE      Eterm, Sterm
142      Tok_Some,            -- SOME         Eterm, Sterm
143      Tok_Tagged,          -- TAGGED       Eterm, Sterm
144      Tok_Then,            -- THEN         Eterm, Sterm
145
146      Tok_Less_Less,       -- <<           Eterm, Sterm, After_SM
147
148      Tok_Abort,           -- ABORT        Eterm, Sterm, After_SM
149      Tok_Accept,          -- ACCEPT       Eterm, Sterm, After_SM
150      Tok_Case,            -- CASE         Eterm, Sterm, After_SM
151      Tok_Delay,           -- DELAY        Eterm, Sterm, After_SM
152      Tok_Else,            -- ELSE         Eterm, Sterm, After_SM
153      Tok_Elsif,           -- ELSIF        Eterm, Sterm, After_SM
154      Tok_End,             -- END          Eterm, Sterm, After_SM
155      Tok_Exception,       -- EXCEPTION    Eterm, Sterm, After_SM
156      Tok_Exit,            -- EXIT         Eterm, Sterm, After_SM
157      Tok_Goto,            -- GOTO         Eterm, Sterm, After_SM
158      Tok_If,              -- IF           Eterm, Sterm, After_SM
159      Tok_Pragma,          -- PRAGMA       Eterm, Sterm, After_SM
160      Tok_Requeue,         -- REQUEUE      Eterm, Sterm, After_SM
161      Tok_Return,          -- RETURN       Eterm, Sterm, After_SM
162      Tok_Select,          -- SELECT       Eterm, Sterm, After_SM
163      Tok_Terminate,       -- TERMINATE    Eterm, Sterm, After_SM
164      Tok_Until,           -- UNTIL        Eterm, Sterm, After_SM
165      Tok_When,            -- WHEN         Eterm, Sterm, After_SM
166
167      Tok_Begin,           -- BEGIN        Eterm, Sterm, After_SM, Labeled_Stmt
168      Tok_Declare,         -- DECLARE      Eterm, Sterm, After_SM, Labeled_Stmt
169      Tok_For,             -- FOR          Eterm, Sterm, After_SM, Labeled_Stmt
170      Tok_Loop,            -- LOOP         Eterm, Sterm, After_SM, Labeled_Stmt
171      Tok_While,           -- WHILE        Eterm, Sterm, After_SM, Labeled_Stmt
172
173      Tok_Entry,           -- ENTRY        Eterm, Sterm, Declk, Deckn, After_SM
174      Tok_Protected,       -- PROTECTED    Eterm, Sterm, Declk, Deckn, After_SM
175      Tok_Task,            -- TASK         Eterm, Sterm, Declk, Deckn, After_SM
176      Tok_Type,            -- TYPE         Eterm, Sterm, Declk, Deckn, After_SM
177      Tok_Subtype,         -- SUBTYPE      Eterm, Sterm, Declk, Deckn, After_SM
178      Tok_Overriding,      -- OVERRIDING   Eterm, Sterm, Declk, Deckn, After_SM
179      Tok_Synchronized,    -- SYNCHRONIZED Eterm, Sterm, Declk, Deckn, After_SM
180      Tok_Use,             -- USE          Eterm, Sterm, Declk, Deckn, After_SM
181
182      Tok_Function,        -- FUNCTION     Eterm, Sterm, Cunit, Declk, After_SM
183      Tok_Generic,         -- GENERIC      Eterm, Sterm, Cunit, Declk, After_SM
184      Tok_Package,         -- PACKAGE      Eterm, Sterm, Cunit, Declk, After_SM
185      Tok_Procedure,       -- PROCEDURE    Eterm, Sterm, Cunit, Declk, After_SM
186
187      Tok_Private,         -- PRIVATE      Eterm, Sterm, Cunit, After_SM
188      Tok_With,            -- WITH         Eterm, Sterm, Cunit, After_SM
189      Tok_Separate,        -- SEPARATE     Eterm, Sterm, Cunit, After_SM
190
191      Tok_EOF,             -- End of file  Eterm, Sterm, Cterm, After_SM
192
193      Tok_Semicolon,       -- ;            Eterm, Sterm, Cterm
194
195      Tok_Arrow,           -- =>           Sterm, Cterm, Chtok
196
197      Tok_Vertical_Bar,    -- |            Cterm, Sterm, Chtok
198
199      Tok_Dot_Dot,         -- ..           Sterm, Chtok
200
201      Tok_Project,
202      Tok_Extends,
203      Tok_External,
204      Tok_External_As_List,
205      --  These four entries represent keywords for the project file language
206      --  and can be returned only in the case of scanning project files.
207
208      Tok_Comment,
209      --  This entry is used when scanning project files (where it represents
210      --  an entire comment), and in preprocessing with the -C switch set
211      --  (where it represents just the "--" of a comment). For the project
212      --  file case, the text of the comment is stored in Comment_Id.
213
214      Tok_End_Of_Line,
215      --  Represents an end of line. Not used during normal compilation scans
216      --  where end of line is ignored. Active for preprocessor scanning and
217      --  also when scanning project files (where it is needed because of ???)
218
219      Tok_Special,
220      --  Used only in preprocessor scanning (to represent one of the
221      --  characters '#', '$', '?', '@', '`', '\', '^', '~', or '_'). The
222      --  character value itself is stored in Scans.Special_Character.
223
224      Tok_SPARK_Hide,
225      --  HIDE directive in SPARK
226
227      No_Token);
228      --  No_Token is used for initializing Token values to indicate that
229      --  no value has been set yet.
230
231   --  Note: in the RM, operator symbol is a special case of string literal.
232   --  We distinguish at the lexical level in this compiler, since there are
233   --  many syntactic situations in which only an operator symbol is allowed.
234
235   --  The following subtype declarations group the token types into classes.
236   --  These are used for class tests in the parser.
237
238   subtype Token_Class_Numeric_Literal is
239     Token_Type range Tok_Integer_Literal .. Tok_Real_Literal;
240   --  Numeric literal
241
242   subtype Token_Class_Literal is
243     Token_Type range Tok_Integer_Literal .. Tok_Operator_Symbol;
244   --  Literal
245
246   subtype Token_Class_Lit_Or_Name is
247     Token_Type range Tok_Integer_Literal .. Tok_Identifier;
   --  Literal or name (any token in Token_Class_Literal, or an identifier)
248
249   subtype Token_Class_Binary_Addop is
250     Token_Type range Tok_Ampersand .. Tok_Plus;
251   --  Binary adding operator (& + -)
252
253   subtype Token_Class_Unary_Addop is
254     Token_Type range Tok_Minus .. Tok_Plus;
255   --  Unary adding operator (+ -)
256
257   subtype Token_Class_Mulop is
258     Token_Type range Tok_Asterisk .. Tok_Slash;
259   --  Multiplying operator (* MOD REM /)
260
261   subtype Token_Class_Logop is
262     Token_Type range Tok_And .. Tok_Xor;
263   --  Logical operator (and, or, xor)
264
265   subtype Token_Class_Relop is
266     Token_Type range Tok_Less .. Tok_Box;
267   --  Relational operator (= /= < <= > >=, plus NOT and IN, and also <> to
268   --  catch misuse of the Pascal style not equal operator).
269
270   subtype Token_Class_Name is
271     Token_Type range Tok_Char_Literal .. Tok_Identifier;
272   --  First token of name (4.1)
273   --    (identifier, char literal, operator symbol)
274
275   subtype Token_Class_Desig is
276     Token_Type range Tok_Operator_Symbol .. Tok_Identifier;
277   --  Token which can be a Designator (identifier, operator symbol)
278
279   subtype Token_Class_Namext is
280     Token_Type range Tok_Dot .. Tok_Left_Paren;
281   --  Name extension tokens. These are tokens which can appear immediately
282   --  after a name to extend it recursively (period, quote, left paren)
283
284   subtype Token_Class_Consk is
285     Token_Type range Tok_Left_Paren .. Tok_Range;
286   --  Keywords which can start constraint
287   --    (left paren, delta, digits, range)
288
289   subtype Token_Class_Eterm is
290     Token_Type range Tok_Colon_Equal .. Tok_Semicolon;
291   --  Expression terminators. These tokens can never appear within a simple
292   --  expression. This is used for error recovery purposes (if we encounter
293   --  an error in an expression, we simply scan to the next Eterm token).
294
295   subtype Token_Class_Sterm is
296     Token_Type range Tok_Delta .. Tok_Dot_Dot;
297   --  Simple_Expression terminators. A Simple_Expression must be followed
298   --  by a token in this class, or an error message is issued complaining
299   --  about a missing binary operator.
300
301   subtype Token_Class_Atkwd is
302     Token_Type range Tok_Delta .. Tok_Range;
303   --  Attribute keywords. This class includes keywords which can be used
304   --  as an Attribute_Designator, namely DELTA, DIGITS and RANGE
305
306   subtype Token_Class_Cterm is
307     Token_Type range Tok_EOF .. Tok_Vertical_Bar;
308   --  Choice terminators. These tokens terminate a choice. This is used for
309   --  error recovery purposes (if we encounter an error in a Choice, we
310   --  simply scan to the next Cterm token).
311
312   subtype Token_Class_Chtok is
313     Token_Type range Tok_Arrow .. Tok_Dot_Dot;
314   --  Choice tokens. These tokens signal a choice when used in an Aggregate
315
316   subtype Token_Class_Cunit is
317     Token_Type range Tok_Function .. Tok_Separate;
318   --  Tokens which can begin a compilation unit
319
320   subtype Token_Class_Declk is
321     Token_Type range Tok_Entry .. Tok_Procedure;
322   --  Keywords which start a declaration
323
324   subtype Token_Class_Deckn is
325     Token_Type range Tok_Entry .. Tok_Use;
326   --  Keywords which start a declaration but can't start a compilation unit
327
328   subtype Token_Class_After_SM is
329     Token_Type range Tok_Less_Less .. Tok_EOF;
330   --  Tokens which always, or almost always, appear after a semicolon. Used
331   --  in the Resync_Past_Semicolon routine to avoid gobbling up stuff when
332   --  a semicolon is missing. Of significance only for error recovery.
333
334   subtype Token_Class_Labeled_Stmt is
335     Token_Type range Tok_Begin .. Tok_While;
336   --  Tokens which start labeled statements
337
338   type Token_Flag_Array is array (Token_Type) of Boolean;
339   Is_Reserved_Keyword : constant Token_Flag_Array :=
340                           Token_Flag_Array'
341                             (Tok_Mod      .. Tok_Rem      => True,
342                              Tok_New      .. Tok_Raise    => True,
343                              Tok_Delta    .. Tok_Range    => True,
344                              Tok_And      .. Tok_Xor      => True,
345                              Tok_In       .. Tok_Not      => True,
346                              Tok_Abstract .. Tok_Then     => True,
347                              Tok_Abort    .. Tok_Separate => True,
348                              others                       => False);
349   --  Flag array used to test for reserved word
   --
   --  Token_Flag_Array holds one Boolean per Token_Type value. The ranges
   --  above rely on the ordering of Token_Type: each range must cover only
   --  tokens whose entry in the Token_Type table is an Ada reserved word.
   --
   --  Tok_Raise is declared on its own, between Tok_Null and Tok_Dot, and
   --  not inside the Tok_Abort .. Tok_Separate group. The second range
   --  therefore runs up to Tok_Raise (NEW, ABS, OTHERS, NULL, RAISE) so that
   --  RAISE is flagged as a reserved word like every other Ada keyword.
   --
   --  Operators, delimiters, literals, identifiers, the project file
   --  keywords (Tok_Project .. Tok_External_As_List) and the remaining
   --  special tokens are covered by "others" and are flagged False.
350
351   procedure Initialize_Ada_Keywords;
352   --  Set up Token_Type values in Names table entries for Ada reserved
353   --  words. This ignores Ada_Version; Ada_Version is taken into account in
354   --  Snames.Is_Keyword_Name.
   --
   --  NOTE(review): the comment preceding Token_Type refers to this
   --  procedure for the reason why the first Token_Type entry must not be a
   --  reserved word, but that reason is not stated in this spec; presumably
   --  it follows from how the body records Token_Type values in the Names
   --  table -- confirm against the body. That comment also names
   --  Namet.Is_Keyword_Name whereas this one names Snames.Is_Keyword_Name;
   --  confirm which unit currently declares the function.
355
356   --------------------------
357   -- Scan State Variables --
358   --------------------------
359
360   --  Note: these variables can only be referenced during the parsing of a
361   --  file. Reference to any of them from Sem or the expander is wrong.
362
363   --  These variables are initialized as required by Scn.Initialize_Scanner,
364   --  and should not be referenced before such a call. However, there are
365   --  situations in which these variables are saved and restored, and this
366   --  may happen before the first Initialize_Scanner call, resulting in the
367   --  assignment of invalid values. To avoid this, and allow building with
368   --  the -gnatVa switch, we initialize some variables to known valid values.
369
370   Scan_Ptr : Source_Ptr := No_Location; -- init for -gnatVa
371   --  Current scan pointer location. After a call to Scan, this points
372   --  just past the end of the token just scanned.
373
374   Token : Token_Type := No_Token; -- init for -gnatVa
375   --  Type of current token
376
377   Token_Ptr : Source_Ptr := No_Location; -- init for -gnatVa
378   --  Pointer to first character of current token
379
380   Current_Line_Start : Source_Ptr := No_Location; -- init for -gnatVa
381   --  Pointer to first character of line containing current token
382
383   Start_Column : Column_Number := No_Column_Number; -- init for -gnatVa
384   --  Starting column number (zero origin) of the first non-blank character
385   --  on the line containing the current token. This is used for error
386   --  recovery circuits which depend on looking at the column line up.
387
388   Type_Token_Location : Source_Ptr := No_Location; -- init for -gnatVa
389   --  Within a type declaration, gives the location of the TYPE keyword that
390   --  opened the type declaration. Used in checking the end column of a record
391   --  declaration, which can line up either with the TYPE keyword, or with the
392   --  start of the line containing the RECORD keyword.
   --
   --  Note: the Saved_Scan_State record has no field for this variable, so
   --  presumably it is not affected by Save_Scan_State/Restore_Scan_State
   --  (confirm against the body).
393
394   Checksum : Word := 0; -- init for -gnatVa
395   --  Used to accumulate a CRC representing the tokens in the source
396   --  file being compiled. This CRC includes only program tokens, and
397   --  excludes comments.
398
399   First_Non_Blank_Location : Source_Ptr := No_Location; -- init for -gnatVa
400   --  Location of first non-blank character on the line containing the
401   --  current token (i.e. the location of the character whose column number
402   --  is stored in Start_Column).
403
404   Token_Node : Node_Id := Empty;
405   --  Node table Id for the current token. This is set only if the current
406   --  token is one for which the scanner constructs a node (i.e. it is an
407   --  identifier, operator symbol, or literal). For other token types,
408   --  Token_Node is undefined.
409
410   Token_Name : Name_Id := No_Name;
411   --  For identifiers, this is set to the Name_Id of the identifier scanned.
412   --  For all other tokens, Token_Name is set to Error_Name. Note that it
413   --  would be possible for the caller to extract this information from
414   --  Token_Node. We set Token_Name separately for two reasons. First it
415   --  allows a quicker test for a specific identifier. Second, it allows
416   --  a version of the parser to be built that does not build tree nodes,
417   --  usable as a syntax checker.
418
419   Prev_Token : Token_Type := No_Token;
420   --  Type of previous token
421
422   Prev_Token_Ptr : Source_Ptr := No_Location; -- init for -gnatVa
423   --  Pointer to first character of previous token
   --
   --  Given a known valid initial value like the other Source_Ptr scan state
   --  variables: Saved_Scan_State has a Save_Prev_Token_Ptr field, so this
   --  variable is presumably copied when the scan state is saved, which may
   --  happen before the scanner has been initialized.
424
425   Version_To_Be_Found : Boolean;
426   --  This flag is True if the scanner is still looking for an RCS version
427   --  number in a comment. Normally it is initialized to False so that this
428   --  circuit is not activated. If the -dv switch is set, then this flag is
429   --  initialized to True, and then reset when the version number is found.
430   --  We do things this way to minimize the impact on comment scanning.
431
432   Character_Code : Char_Code;
433   --  Valid only when Token is Tok_Char_Literal. Contains the value of the
434   --  scanned literal.
435
436   Real_Literal_Value : Ureal;
437   --  Valid only when Token is Tok_Real_Literal, contains the value of the
438   --  scanned literal.
439
440   Int_Literal_Value : Uint;
441   --  Valid only when Token = Tok_Integer_Literal, contains the value of the
442   --  scanned literal.
443
444   Based_Literal_Uses_Colon : Boolean;
445   --  Valid only when Token = Tok_Integer_Literal or Tok_Real_Literal. Set
446   --  True only for the case of a based literal using ':' instead of '#'.
447
448   String_Literal_Id : String_Id;
449   --  Valid only when Token = Tok_String_Literal or Tok_Operator_Symbol.
450   --  Contains the Id for currently scanned string value.
451
452   Wide_Character_Found : Boolean := False;
453   --  Valid only when Token = Tok_String_Literal. Set True if wide character
454   --  found (i.e. a character that does not fit in Character, but fits in
455   --  Wide_Wide_Character).
   --
   --  NOTE(review): given the description of Wide_Wide_Character_Found
   --  below, "fits in Wide_Character" is presumably what is meant here --
   --  confirm against the scanner body.
456
457   Wide_Wide_Character_Found : Boolean := False;
458   --  Valid only when Token = Tok_String_Literal. Set True if wide wide
459   --  character found (i.e. a character that does not fit in Character or
460   --  Wide_Character).
461
462   Special_Character : Character;
463   --  Valid only when Token = Tok_Special. Contains one of the characters
464   --  '#', '$', '?', '@', '`', '\', '^', '~', or '_'.
465   --
466   --  Why only this set? What about wide characters???
467
468   Comment_Id : Name_Id := No_Name;
469   --  Valid only when Token = Tok_Comment. Store the string that follows
470   --  the "--" of a comment when scanning project files.
471   --
472   --  Is it really right for this to be a Name rather than a String, what
473   --  about the case of Wide_Wide_Characters???
474
475   Inside_Depends : Boolean := False;
476   --  Flag set True for parsing the argument of a Depends pragma or aspect
477   --  (used to allow/require non-standard style rules for =>+ with -gnatyt).
478
479   Inside_If_Expression : Nat := 0;
480   --  This is a counter that is set non-zero while scanning out an if
481   --  expression (incremented on entry, decremented on exit). It is used to
482   --  disconnect format checks that normally apply to keywords THEN, ELSE etc.
483
484   --------------------------------------------------------
485   -- Procedures for Saving and Restoring the Scan State --
486   --------------------------------------------------------
487
488   --  The following procedures can be used to save and restore the entire
489   --  scan state. They are used in cases where it is necessary to back up
490   --  the scan during the parse.
491
492   type Saved_Scan_State is private;
493   --  Used for saving and restoring the scan state
   --
   --  The full type is a record declared in the private part of this
   --  package. It has fields named after Scan_Ptr, Token, Token_Ptr,
   --  Current_Line_Start, Start_Column, Checksum, First_Non_Blank_Location,
   --  Token_Node, Token_Name, Prev_Token and Prev_Token_Ptr only. Presumably
   --  the remaining scan state variables (for example Type_Token_Location
   --  and the literal value variables) are not saved -- confirm against the
   --  body.
494
495   procedure Save_Scan_State (Saved_State : out Saved_Scan_State);
496   pragma Inline (Save_Scan_State);
497   --  Saves the current scan state for possible later restoration. Note that
498   --  there is no harm in saving the state and then never restoring it.
499
500   procedure Restore_Scan_State (Saved_State : Saved_Scan_State);
501   pragma Inline (Restore_Scan_State);
502   --  Restores a scan state saved by a call to Save_Scan_State.
503   --  The saved scan state must refer to the current source file.
504
505private
506   type Saved_Scan_State is record
507      Save_Scan_Ptr                 : Source_Ptr;
508      Save_Token                    : Token_Type;
509      Save_Token_Ptr                : Source_Ptr;
510      Save_Current_Line_Start       : Source_Ptr;
511      Save_Start_Column             : Column_Number;
512      Save_Checksum                 : Word;
513      Save_First_Non_Blank_Location : Source_Ptr;
514      Save_Token_Node               : Node_Id;
515      Save_Token_Name               : Name_Id;
516      Save_Prev_Token               : Token_Type;
517      Save_Prev_Token_Ptr           : Source_Ptr;
518   end record;
   --  Full view of the private type used by Save_Scan_State and
   --  Restore_Scan_State. Each field has the type of the scan state variable
   --  whose name it carries after the Save_ prefix, and presumably holds a
   --  copy of that variable (confirm against the body). No field has a
   --  default value, and there are no fields for Type_Token_Location or for
   --  the variables that are documented as valid only for particular token
   --  kinds (Character_Code, Int_Literal_Value and so on).
519
520end Scans;
521