------------------------------------------------------------------------------
--                                                                          --
--                         GNAT COMPILER COMPONENTS                         --
--                                                                          --
--                                S C A N S                                 --
--                                                                          --
--                                 S p e c                                  --
--                                                                          --
--          Copyright (C) 1992-2019, Free Software Foundation, Inc.         --
--                                                                          --
-- GNAT is free software;  you can  redistribute it  and/or modify it under --
-- terms of the  GNU General Public License as published  by the Free Soft- --
-- ware  Foundation;  either version 3,  or (at your option) any later ver- --
-- sion.  GNAT is distributed in the hope that it will be useful, but WITH- --
-- OUT ANY WARRANTY;  without even the  implied warranty of MERCHANTABILITY --
-- or FITNESS FOR A PARTICULAR PURPOSE.                                     --
--                                                                          --
-- As a special exception under Section 7 of GPL version 3, you are granted --
-- additional permissions described in the GCC Runtime Library Exception,   --
-- version 3.1, as published by the Free Software Foundation.               --
--                                                                          --
-- You should have received a copy of the GNU General Public License and    --
-- a copy of the GCC Runtime Library Exception along with this program;     --
-- see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see    --
-- <http://www.gnu.org/licenses/>.                                          --
--                                                                          --
-- GNAT was originally developed  by the GNAT team at  New York University. --
-- Extensive contributions were provided by Ada Core Technologies Inc.      --
--                                                                          --
------------------------------------------------------------------------------

with Namet;  use Namet;
with Types;  use Types;
with Uintp;  use Uintp;
with Urealp; use Urealp;

package Scans is

--  The scanner maintains a current state in the global variables defined
--  in this package. The call to the Scan routine advances this state to
--  the next token. The state is initialized by the call to one of the
--  initialization routines in Sinput.

   --  The following type is used to identify token types returned by Scan.
   --  The class column in this table indicates the token classes which
   --  apply to the token, as defined by subsequent subtype declarations.

   --  Note: the position of each literal in this enumeration is significant.
   --  Every token class below is declared as a contiguous range of
   --  Token_Type, so reordering, inserting or removing a literal changes
   --  class membership and must be checked against every subtype range and
   --  against the Is_Reserved_Keyword aggregate.

   type Token_Type is (

      --  Token name          Token type   Class(es)

      Tok_Integer_Literal, -- numeric lit  Literal, Lit_Or_Name

      Tok_Real_Literal,    -- numeric lit  Literal, Lit_Or_Name

      Tok_String_Literal,  -- string lit   Literal, Lit_Or_Name

      Tok_Char_Literal,    -- char lit     Name, Literal, Lit_Or_Name

      Tok_Operator_Symbol, -- op symbol    Name, Literal, Lit_Or_Name, Desig

      Tok_Identifier,      -- identifier   Name, Lit_Or_Name, Desig

      Tok_At_Sign,         -- @            Name (AI12-0125-3 : target name)

      Tok_Double_Asterisk, -- **

      Tok_Ampersand,       -- &            Binary_Addop
      Tok_Minus,           -- -            Binary_Addop, Unary_Addop
      Tok_Plus,            -- +            Binary_Addop, Unary_Addop

      Tok_Asterisk,        -- *            Mulop
      Tok_Mod,             -- MOD          Mulop
      Tok_Rem,             -- REM          Mulop
      Tok_Slash,           -- /            Mulop

      Tok_New,             -- NEW

      Tok_Abs,             -- ABS
      Tok_Others,          -- OTHERS
      Tok_Null,            -- NULL

      --  Note: Tok_Raise is in no categories now, it used to be Cterm, Eterm,
      --  After_SM, but now that Ada 2012 has added raise expressions, the
      --  raise token can appear anywhere. Note in particular that Tok_Raise
      --  being in Eterm stopped the parser from recognizing "return raise
      --  exception-name". This degrades error recovery slightly, and perhaps
      --  we could do better, but not worth the effort.

      Tok_Raise,           -- RAISE

      Tok_Dot,             -- .            Namext
      Tok_Apostrophe,      -- '            Namext

      Tok_Left_Paren,      -- (            Namext, Consk

      Tok_Delta,           -- DELTA        Atkwd, Sterm, Consk
      Tok_Digits,          -- DIGITS       Atkwd, Sterm, Consk
      Tok_Range,           -- RANGE        Atkwd, Sterm, Consk

      Tok_Right_Paren,     -- )            Sterm
      Tok_Comma,           -- ,            Sterm

      Tok_And,             -- AND          Logop, Sterm
      Tok_Or,              -- OR           Logop, Sterm
      Tok_Xor,             -- XOR          Logop, Sterm

      Tok_Less,            -- <            Relop, Sterm
      Tok_Equal,           -- =            Relop, Sterm
      Tok_Greater,         -- >            Relop, Sterm
      Tok_Not_Equal,       -- /=           Relop, Sterm
      Tok_Greater_Equal,   -- >=           Relop, Sterm
      Tok_Less_Equal,      -- <=           Relop, Sterm

      Tok_In,              -- IN           Relop, Sterm
      Tok_Not,             -- NOT          Relop, Sterm

      --  Note: Tok_Box is not in Eterm, since Token_Class_Eterm starts at
      --  Tok_Colon_Equal.

      Tok_Box,             -- <>           Relop, Sterm
      Tok_Colon_Equal,     -- :=           Eterm, Sterm
      Tok_Colon,           -- :            Eterm, Sterm
      Tok_Greater_Greater, -- >>           Eterm, Sterm

      Tok_Abstract,        -- ABSTRACT     Eterm, Sterm
      Tok_Access,          -- ACCESS       Eterm, Sterm
      Tok_Aliased,         -- ALIASED      Eterm, Sterm
      Tok_All,             -- ALL          Eterm, Sterm
      Tok_Array,           -- ARRAY        Eterm, Sterm
      Tok_At,              -- AT           Eterm, Sterm
      Tok_Body,            -- BODY         Eterm, Sterm
      Tok_Constant,        -- CONSTANT     Eterm, Sterm
      Tok_Do,              -- DO           Eterm, Sterm
      Tok_Is,              -- IS           Eterm, Sterm
      Tok_Interface,       -- INTERFACE    Eterm, Sterm
      Tok_Limited,         -- LIMITED      Eterm, Sterm
      Tok_Of,              -- OF           Eterm, Sterm
      Tok_Out,             -- OUT          Eterm, Sterm
      Tok_Record,          -- RECORD       Eterm, Sterm
      Tok_Renames,         -- RENAMES      Eterm, Sterm
      Tok_Reverse,         -- REVERSE      Eterm, Sterm
      Tok_Some,            -- SOME         Eterm, Sterm
      Tok_Tagged,          -- TAGGED       Eterm, Sterm
      Tok_Then,            -- THEN         Eterm, Sterm

      Tok_Less_Less,       -- <<           Eterm, Sterm, After_SM

      Tok_Abort,           -- ABORT        Eterm, Sterm, After_SM
      Tok_Accept,          -- ACCEPT       Eterm, Sterm, After_SM
      Tok_Case,            -- CASE         Eterm, Sterm, After_SM
      Tok_Delay,           -- DELAY        Eterm, Sterm, After_SM
      Tok_Else,            -- ELSE         Eterm, Sterm, After_SM
      Tok_Elsif,           -- ELSIF        Eterm, Sterm, After_SM
      Tok_End,             -- END          Eterm, Sterm, After_SM
      Tok_Exception,       -- EXCEPTION    Eterm, Sterm, After_SM
      Tok_Exit,            -- EXIT         Eterm, Sterm, After_SM
      Tok_Goto,            -- GOTO         Eterm, Sterm, After_SM
      Tok_If,              -- IF           Eterm, Sterm, After_SM
      Tok_Pragma,          -- PRAGMA       Eterm, Sterm, After_SM
      Tok_Requeue,         -- REQUEUE      Eterm, Sterm, After_SM
      Tok_Return,          -- RETURN       Eterm, Sterm, After_SM
      Tok_Select,          -- SELECT       Eterm, Sterm, After_SM
      Tok_Terminate,       -- TERMINATE    Eterm, Sterm, After_SM
      Tok_Until,           -- UNTIL        Eterm, Sterm, After_SM
      Tok_When,            -- WHEN         Eterm, Sterm, After_SM

      Tok_Begin,           -- BEGIN        Eterm, Sterm, After_SM, Labeled_Stmt
      Tok_Declare,         -- DECLARE      Eterm, Sterm, After_SM, Labeled_Stmt
      Tok_For,             -- FOR          Eterm, Sterm, After_SM, Labeled_Stmt
      Tok_Loop,            -- LOOP         Eterm, Sterm, After_SM, Labeled_Stmt
      Tok_While,           -- WHILE        Eterm, Sterm, After_SM, Labeled_Stmt

      Tok_Entry,           -- ENTRY        Eterm, Sterm, Declk, Deckn, After_SM
      Tok_Protected,       -- PROTECTED    Eterm, Sterm, Declk, Deckn, After_SM
      Tok_Task,            -- TASK         Eterm, Sterm, Declk, Deckn, After_SM
      Tok_Type,            -- TYPE         Eterm, Sterm, Declk, Deckn, After_SM
      Tok_Subtype,         -- SUBTYPE      Eterm, Sterm, Declk, Deckn, After_SM
      Tok_Overriding,      -- OVERRIDING   Eterm, Sterm, Declk, Deckn, After_SM
      Tok_Synchronized,    -- SYNCHRONIZED Eterm, Sterm, Declk, Deckn, After_SM
      Tok_Use,             -- USE          Eterm, Sterm, Declk, Deckn, After_SM

      Tok_Function,        -- FUNCTION     Eterm, Sterm, Cunit, Declk, After_SM
      Tok_Generic,         -- GENERIC      Eterm, Sterm, Cunit, Declk, After_SM
      Tok_Package,         -- PACKAGE      Eterm, Sterm, Cunit, Declk, After_SM
      Tok_Procedure,       -- PROCEDURE    Eterm, Sterm, Cunit, Declk, After_SM

      Tok_Private,         -- PRIVATE      Eterm, Sterm, Cunit, After_SM
      Tok_With,            -- WITH         Eterm, Sterm, Cunit, After_SM
      Tok_Separate,        -- SEPARATE     Eterm, Sterm, Cunit, After_SM

      Tok_EOF,             -- End of file  Eterm, Sterm, Cterm, After_SM

      Tok_Semicolon,       -- ;            Eterm, Sterm, Cterm

      Tok_Arrow,           -- =>           Sterm, Cterm, Chtok

      Tok_Vertical_Bar,    -- |            Cterm, Sterm, Chtok

      Tok_Dot_Dot,         -- ..           Sterm, Chtok

      Tok_Project,
      Tok_Extends,
      Tok_External,
      Tok_External_As_List,
      --  These four entries represent keywords for the project file language
      --  and can be returned only in the case of scanning project files.

      Tok_Comment,
      --  This entry is used when scanning project files (where it represents
      --  an entire comment), and in preprocessing with the -C switch set
      --  (where it represents just the "--" of a comment). For the project
      --  file case, the text of the comment is stored in Comment_Id.

      Tok_End_Of_Line,
      --  Represents an end of line. Not used during normal compilation scans
      --  where end of line is ignored. Active for preprocessor scanning and
      --  also when scanning project files (where it is needed because of ???)

      Tok_Special,
      --  AI12-0125-03 : target name as abbreviation for LHS

      --  Otherwise used only in preprocessor scanning (to represent one of
      --  the characters '#', '$', '?', '@', '`', '\', '^', '~', or '_'). The
      --  character value itself is stored in Scans.Special_Character.

      Tok_SPARK_Hide,
      --  HIDE directive in SPARK

      No_Token);
      --  No_Token is used for initializing Token values to indicate that
      --  no value has been set yet.

   function Keyword_Name (Token : Token_Type) return Name_Id;
   --  Given a token that is a reserved word, return the corresponding Name_Id
   --  in lower case. E.g. Keyword_Name (Tok_Begin) = Name_Find ("begin").
   --  It is an error to pass any other kind of token.

   --  Note: in the RM, operator symbol is a special case of string literal.
   --  We distinguish at the lexical level in this compiler, since there are
   --  many syntactic situations in which only an operator symbol is allowed.

   --  The following subtype declarations group the token types into classes.
   --  These are used for class tests in the parser.

   subtype Token_Class_Numeric_Literal is
     Token_Type range Tok_Integer_Literal .. Tok_Real_Literal;
   --  Numeric literal

   subtype Token_Class_Literal is
     Token_Type range Tok_Integer_Literal .. Tok_Operator_Symbol;
   --  Literal

   subtype Token_Class_Lit_Or_Name is
     Token_Type range Tok_Integer_Literal .. Tok_Identifier;
   --  Literal or name (any literal, operator symbol, or identifier)

   subtype Token_Class_Binary_Addop is
     Token_Type range Tok_Ampersand .. Tok_Plus;
   --  Binary adding operator (& + -)

   subtype Token_Class_Unary_Addop is
     Token_Type range Tok_Minus .. Tok_Plus;
   --  Unary adding operator (+ -)

   subtype Token_Class_Mulop is
     Token_Type range Tok_Asterisk .. Tok_Slash;
   --  Multiplying operator

   subtype Token_Class_Logop is
     Token_Type range Tok_And .. Tok_Xor;
   --  Logical operator (and, or, xor)

   subtype Token_Class_Relop is
     Token_Type range Tok_Less .. Tok_Box;
   --  Relational operator (= /= < <= > >= not, in plus <> to catch misuse
   --  of Pascal style not equal operator).

   subtype Token_Class_Name is
     Token_Type range Tok_Char_Literal .. Tok_At_Sign;
   --  First token of name (4.1),
   --    (identifier, char literal, operator symbol)
   --  Includes '@' after Ada2012 corrigendum.

   subtype Token_Class_Desig is
     Token_Type range Tok_Operator_Symbol .. Tok_Identifier;
   --  Token which can be a Designator (identifier, operator symbol). The
   --  range deliberately stops at Tok_Identifier: the target name symbol
   --  Tok_At_Sign is a name but can never be a designator.

   subtype Token_Class_Namext is
     Token_Type range Tok_Dot .. Tok_Left_Paren;
   --  Name extension tokens. These are tokens which can appear immediately
   --  after a name to extend it recursively (period, quote, left paren)

   subtype Token_Class_Consk is
     Token_Type range Tok_Left_Paren .. Tok_Range;
   --  Keywords which can start constraint
   --    (left paren, delta, digits, range)

   subtype Token_Class_Eterm is
     Token_Type range Tok_Colon_Equal .. Tok_Semicolon;
   --  Expression terminators. These tokens can never appear within a simple
   --  expression. This is used for error recovery purposes (if we encounter
   --  an error in an expression, we simply scan to the next Eterm token).

   subtype Token_Class_Sterm is
     Token_Type range Tok_Delta .. Tok_Dot_Dot;
   --  Simple_Expression terminators. A Simple_Expression must be followed
   --  by a token in this class, or an error message is issued complaining
   --  about a missing binary operator.

   subtype Token_Class_Atkwd is
     Token_Type range Tok_Delta .. Tok_Range;
   --  Attribute keywords. This class includes keywords which can be used
   --  as an Attribute_Designator, namely DELTA, DIGITS and RANGE

   subtype Token_Class_Cterm is
     Token_Type range Tok_EOF .. Tok_Vertical_Bar;
   --  Choice terminators. These tokens terminate a choice. This is used for
   --  error recovery purposes (if we encounter an error in a Choice, we
   --  simply scan to the next Cterm token).

   subtype Token_Class_Chtok is
     Token_Type range Tok_Arrow .. Tok_Dot_Dot;
   --  Choice tokens. These tokens signal a choice when used in an Aggregate

   subtype Token_Class_Cunit is
     Token_Type range Tok_Function .. Tok_Separate;
   --  Tokens which can begin a compilation unit

   subtype Token_Class_Declk is
     Token_Type range Tok_Entry .. Tok_Procedure;
   --  Keywords which start a declaration

   subtype Token_Class_Deckn is
     Token_Type range Tok_Entry .. Tok_Use;
   --  Keywords which start a declaration but can't start a compilation unit

   subtype Token_Class_After_SM is
     Token_Type range Tok_Less_Less .. Tok_EOF;
   --  Tokens which always, or almost always, appear after a semicolon. Used
   --  in the Resync_Past_Semicolon routine to avoid gobbling up stuff when
   --  a semicolon is missing. Of significance only for error recovery.

   subtype Token_Class_Labeled_Stmt is
     Token_Type range Tok_Begin .. Tok_While;
   --  Tokens which start labeled statements

   type Token_Flag_Array is array (Token_Type) of Boolean;
   Is_Reserved_Keyword : constant Token_Flag_Array :=
                           Token_Flag_Array'
                             (Tok_Mod      .. Tok_Rem      => True,
                              Tok_New      .. Tok_Null     => True,
                              Tok_Delta    .. Tok_Range    => True,
                              Tok_And      .. Tok_Xor      => True,
                              Tok_In       .. Tok_Not      => True,
                              Tok_Abstract .. Tok_Then     => True,
                              Tok_Abort    .. Tok_Separate => True,
                              others                       => False);
   --  Flag array used to test for reserved word
   --
   --  NOTE(review): Tok_Raise lies between Tok_Null and Tok_Dot and is
   --  therefore covered by "others", i.e. Is_Reserved_Keyword (Tok_Raise) is
   --  False even though RAISE is a reserved word. This may be intentional
   --  (see the note on Tok_Raise in Token_Type); confirm against the users
   --  of this table before changing it, since it affects error recovery.

   procedure Initialize_Ada_Keywords;
   --  Set up Token_Type values in Names table entries for Ada reserved
   --  words. This ignores Ada_Version; Ada_Version is taken into account in
   --  Snames.Is_Keyword_Name.

   --------------------------
   -- Scan State Variables --
   --------------------------

   --  Note: these variables can only be referenced during the parsing of a
   --  file. Reference to any of them from Sem or the expander is wrong.

   --  These variables are initialized as required by Scn.Initialize_Scanner,
   --  and should not be referenced before such a call. However, there are
   --  situations in which these variables are saved and restored, and this
   --  may happen before the first Initialize_Scanner call, resulting in the
   --  assignment of invalid values. To avoid this, and allow building with
   --  the -gnatVa switch, we initialize some variables to known valid values.

   Scan_Ptr : Source_Ptr := No_Location; -- init for -gnatVa
   --  Current scan pointer location. After a call to Scan, this points
   --  just past the end of the token just scanned.

   Token : Token_Type := No_Token; -- init for -gnatVa
   --  Type of current token

   Token_Ptr : Source_Ptr := No_Location; -- init for -gnatVa
   --  Pointer to first character of current token

   Current_Line_Start : Source_Ptr := No_Location; -- init for -gnatVa
   --  Pointer to first character of line containing current token

   Start_Column : Column_Number := No_Column_Number; -- init for -gnatVa
   --  Starting column number (zero origin) of the first non-blank character
   --  on the line containing the current token. This is used for error
   --  recovery circuits which depend on looking at the column line up.

   Type_Token_Location : Source_Ptr := No_Location; -- init for -gnatVa
   --  Within a type declaration, gives the location of the TYPE keyword that
   --  opened the type declaration. Used in checking the end column of a record
   --  declaration, which can line up either with the TYPE keyword, or with the
   --  start of the line containing the RECORD keyword.

   Checksum : Word := 0; -- init for -gnatVa
   --  Used to accumulate a CRC representing the tokens in the source
   --  file being compiled. This CRC includes only program tokens, and
   --  excludes comments.

   Limited_Checksum : Word := 0;
   --  Used to accumulate a CRC representing significant tokens in the
   --  limited view of a package, i.e. visible type names and related
   --  tagged indicators.

   First_Non_Blank_Location : Source_Ptr := No_Location; -- init for -gnatVa
   --  Location of first non-blank character on the line containing the
   --  current token (i.e. the location of the character whose column number
   --  is stored in Start_Column).

   Token_Node : Node_Id := Empty;
   --  Node table Id for the current token. This is set only if the current
   --  token is one for which the scanner constructs a node (i.e. it is an
   --  identifier, operator symbol, or literal). For other token types,
   --  Token_Node is undefined.

   Token_Name : Name_Id := No_Name;
   --  For identifiers, this is set to the Name_Id of the identifier scanned.
   --  For all other tokens, Token_Name is set to Error_Name. Note that it
   --  would be possible for the caller to extract this information from
   --  Token_Node. We set Token_Name separately for two reasons. First it
   --  allows a quicker test for a specific identifier. Second, it allows
   --  a version of the parser to be built that does not build tree nodes,
   --  usable as a syntax checker.

   Prev_Token : Token_Type := No_Token;
   --  Type of previous token

   Prev_Token_Ptr : Source_Ptr := No_Location; -- init for -gnatVa
   --  Pointer to first character of previous token. Initialized like
   --  Token_Ptr, since this value is also part of the saved scan state.

   Version_To_Be_Found : Boolean;
   --  This flag is True if the scanner is still looking for an RCS version
   --  number in a comment. Normally it is initialized to False so that this
   --  circuit is not activated. If the -dv switch is set, then this flag is
   --  initialized to True, and then reset when the version number is found.
   --  We do things this way to minimize the impact on comment scanning.

   Character_Code : Char_Code;
   --  Valid only when Token is Tok_Char_Literal. Contains the value of the
   --  scanned literal.

   Real_Literal_Value : Ureal;
   --  Valid only when Token is Tok_Real_Literal, contains the value of the
   --  scanned literal.

   Int_Literal_Value : Uint;
   --  Valid only when Token = Tok_Integer_Literal, contains the value of the
   --  scanned literal.

   Based_Literal_Uses_Colon : Boolean;
   --  Valid only when Token = Tok_Integer_Literal or Tok_Real_Literal. Set
   --  True only for the case of a based literal using ':' instead of '#'.

   String_Literal_Id : String_Id;
   --  Valid only when Token = Tok_String_Literal or Tok_Operator_Symbol.
   --  Contains the Id for currently scanned string value.

   Wide_Character_Found : Boolean := False;
   --  Valid only when Token = Tok_String_Literal. Set True if wide character
   --  found (i.e. a character that does not fit in Character, but fits in
   --  Wide_Wide_Character).

   Wide_Wide_Character_Found : Boolean := False;
   --  Valid only when Token = Tok_String_Literal. Set True if wide wide
   --  character found (i.e. a character that does not fit in Character or
   --  Wide_Character).

   Special_Character : Character;
   --  AI12-0125-03 : '@' as target name is handled elsewhere.
   --  Valid only when Token = Tok_Special. Returns one of the characters
   --  '#', '$', '?', '`', '\', '^', '~', or '_'.
   --
   --  Why only this set? What about wide characters???

   Comment_Id : Name_Id := No_Name;
   --  Valid only when Token = Tok_Comment. Store the string that follows
   --  the "--" of a comment when scanning project files.
   --
   --  Is it really right for this to be a Name rather than a String, what
   --  about the case of Wide_Wide_Characters???

   Inside_Depends : Boolean := False;
   --  True while parsing the argument of a Depends or Refined_Depends pragma
   --  or aspect. Used to allow/require nonstandard style rules for =>+ with
   --  -gnatyt.

   Inside_If_Expression : Nat := 0;
   --  This is a counter that is set non-zero while scanning out an if
   --  expression (incremented on entry, decremented on exit). It is used to
   --  disconnect format checks that normally apply to keywords THEN, ELSE etc.

   Inside_Pragma : Boolean := False;
   --  True within a pragma. Used to avoid complaining about reserved words
   --  within pragmas (see Scan_Reserved_Identifier).

   --------------------------------------------------------
   -- Procedures for Saving and Restoring the Scan State --
   --------------------------------------------------------

   --  The following procedures can be used to save and restore the entire
   --  scan state. They are used in cases where it is necessary to backup
   --  the scan during the parse.

   type Saved_Scan_State is private;
   --  Used for saving and restoring the scan state

   procedure Save_Scan_State (Saved_State : out Saved_Scan_State);
   pragma Inline (Save_Scan_State);
   --  Saves the current scan state for possible later restoration. Note that
   --  there is no harm in saving the state and then never restoring it.

   procedure Restore_Scan_State (Saved_State : Saved_Scan_State);
   pragma Inline (Restore_Scan_State);
   --  Restores a scan state saved by a call to Save_Scan_State.
   --  The saved scan state must refer to the current source file.

private
   --  Each component holds the saved value of the scan state variable with
   --  the same name minus the Save_ prefix.

   type Saved_Scan_State is record
      Save_Scan_Ptr                 : Source_Ptr;
      Save_Token                    : Token_Type;
      Save_Token_Ptr                : Source_Ptr;
      Save_Current_Line_Start       : Source_Ptr;
      Save_Start_Column             : Column_Number;
      Save_Checksum                 : Word;
      Save_First_Non_Blank_Location : Source_Ptr;
      Save_Token_Node               : Node_Id;
      Save_Token_Name               : Name_Id;
      Save_Prev_Token               : Token_Type;
      Save_Prev_Token_Ptr           : Source_Ptr;
   end record;

end Scans;