1------------------------------------------------------------------------------ 2-- -- 3-- GNAT2XML COMPONENTS -- 4-- -- 5-- G N A T 2 X M L . S C A N N E R -- 6-- -- 7-- S p e c -- 8-- -- 9-- Copyright (C) 2012-2014, AdaCore, Inc. -- 10-- -- 11-- Gnat2xml is free software; you can redistribute it and/or modify it -- 12-- under terms of the GNU General Public License as published by the Free -- 13-- Software Foundation; either version 2, or (at your option) any later -- 14-- version. Gnat2xml is distributed in the hope that it will be useful, -- 15-- but WITHOUT ANY WARRANTY; without even the implied warranty of MER- -- 16-- CHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General -- 17-- Public License for more details. You should have received a copy of the -- 18-- GNU General Public License distributed with GNAT; see file COPYING. If -- 19-- not, write to the Free Software Foundation, 59 Temple Place Suite 330, -- 20-- Boston, MA 02111-1307, USA. -- 21-- The gnat2xml tool was derived from the Avatox sources. -- 22------------------------------------------------------------------------------ 23 24pragma Ada_2012; 25 26with ASIS_UL.Vectors; 27 28with Ada_Trees.Buffers; use Ada_Trees.Buffers; 29use Ada_Trees.Buffers.Marker_Vectors; 30-- use all type Ada_Trees.Buffers.Marker_Vector; 31 32package Ada_Trees.Scanner is 33 34 -- This package provides a simple lexical scanner for Ada tokens. There are 35 -- some unusual things about this scanner: 36 -- 37 -- We don't distinguish most of the different kinds of tokens; most are 38 -- lumped together under the Lexeme kind, and reserved words are lumped 39 -- together under Reserved_Word. We only distinguish where we need to. 40 -- 41 -- We do not ignore comments; a comment is considered to be a token. 42 -- 43 -- We do not ignore blank lines. We do ignore a single line break, 44 -- if Ignore_Single_Line_Breaks is True. Other whitespace (blanks and 45 -- tabs) between tokens is always ignored. 46 -- 47 -- We don't check for errors, because we're in ASIS, where Ada code is 48 -- known to be legal. 49 50 type Token_Kind is 51 (Nil, 52 Start_Of_Input, 53 End_Of_Input, 54 Identifier, 55 Reserved_Word, 56 String_Literal, 57 Numeric_Literal, 58 Lexeme, -- misc lexemes as defined in the RM 59 Pp_Off_Comment, 60 -- A whole-line comment that matches the --pp-off string 61 Pp_On_Comment, 62 -- A whole-line comment that matches the --pp-on string 63 Other_Whole_Line_Comment, 64 -- A comment that appears by itself on a line. Multiple comments that may 65 -- be filled as a "paragraph" are combined into a single Whole_Line_Comment 66 -- token. This comment is a Whole_Line_Comment. 67 End_Of_Line_Comment, 68 -- A comment that appears at the end of a line, after some other 69 -- program text. The above comment starting "misc lexemes" is an 70 -- End_Of_Line_Comment. 71 End_Of_Line, -- First in a series of one or more NLs. 72 Blank_Line); -- Second, third, ... in a series of one or more NLs. 73 74 subtype Whole_Line_Comment is Token_Kind with 75 Static_Predicate => Whole_Line_Comment in 76 Pp_Off_Comment | Pp_On_Comment | Other_Whole_Line_Comment; 77 78 subtype Comment_Kind is Token_Kind with 79 Predicate => Comment_Kind in Whole_Line_Comment | End_Of_Line_Comment; 80 81 subtype Pp_Off_On_Comment is Token_Kind with 82 Predicate => Pp_Off_On_Comment in Pp_Off_Comment | Pp_On_Comment; 83 84 type Source_Location is record 85 Line, Col : Positive; -- 1-based line and column numbers 86 First : Positive; 87 Last : Natural; 88 89 Firstx, Lastx : Marker; 90 -- ???Same information as First&Last. These should replace First&Last 91 -- eventually. Note that Lastx points one past the last character. 92 end record; 93 94 function First_Pos (Input : Buffer; Sloc : Source_Location) return Positive; 95 function Last_Pos (Input : Buffer; Sloc : Source_Location) return Natural; 96 -- Absolute position in Input (parameter of Get_Tokens) of the start and 97 -- end of the token. So the text of the token is exactly equal to the slice 98 -- Input (First..Last). Note that Input'First might not be 1. 99 100 function Image 101 (Sloc : Source_Location) 102 return String is 103 (Image (Sloc.Line) & 104 ":" & 105 Image (Sloc.Col) & 106 "(" & 107 Image (Sloc.First) & 108 ".." & 109 Image (Sloc.Last) & 110 ")"); 111 112 function Message_Image 113 (Sloc : Source_Location) return String is 114 (Image (Sloc.Line) & 115 ":" & 116 Image (Sloc.Col)); 117 118 function Message_Image 119 (Tree : Ada_Tree; Sloc : Source_Location) return String is 120 -- Tree is the A_Compilation_Unit node 121 (Get_Name_String (Tree.Source_File) & 122 ":" & 123 Image (Sloc.Line) & 124 ":" & 125 Image (Sloc.Col)); 126 127 type Token is record 128 Kind : Token_Kind := Nil; 129 130 Text : Name_Id; 131 -- The text of the token as it appears in the source, with these 132 -- exceptions and clarifications: 133 -- 134 -- Start_Of_Input and End_Of_Input have Text = "". 135 -- 136 -- For Blank_Line: does not include the text of the preceding 137 -- End_Of_Line or Blank_Line (i.e. it is usually just LF, but could 138 -- be CR/LF -- not LF,LF nor CR,LF,CR,LF). 139 -- 140 -- For comments, the text of the comment excluding the initial "--" 141 -- and leading and trailing blanks, and followed by an extra NL. For 142 -- multi-line comment "paragraphs", used for filling, NL terminates each 143 -- line. The NL at the end isn't really part of the comment; the next 144 -- token in the stream will be End_Of_Line. The reason for the extra NL 145 -- is that GNATCOLL.Paragraph_Filling expects it, so it's simpler and 146 -- more efficient this way. 147 148 Normalized : Name_Id; 149 -- Same as Text, or converted to lower case, depending on the Kind. 150 -- Comments have Normalized = No_Name, so we can detect specific 151 -- reserved words. For example, the "BEGIN" reserved word will have Text 152 -- = "BEGIN" and Normalized = "begin". The comment "-- begin" will have 153 -- Text = "begin" and Normalized = No_Name. 154 155 Leading_Blanks : Natural; 156 -- For comments, the number of leading blanks, which are blanks after 157 -- the initial "--" and before any nonblank characters. For other 158 -- tokens, zero. 159 160 Width : Natural; 161 -- For most tokens, this is the width of the token, i.e. the same as 162 -- Sloc.Last-Sloc.First+1, and the same as the length of Text. For 163 -- multi-line comments, this is the width of the widest line. For all 164 -- comments, the initial "--" and any leading blanks are included, but 165 -- the NL's are not. 166 167 Is_Special_Comment : Boolean; 168 -- True if this is a "special" comment; that is, one that should not be 169 -- formatted in any way. False for other comments and for non-comments. 170 171 Is_Fillable_Comment : Boolean; 172 -- True if this is a fillable comment; that is, one that should be 173 -- filled if filling is turned on. False for other comments and for 174 -- non-comments. Special comments are not fillable; Is_Special_Comment 175 -- implies not Is_Fillable_Comment. 176 177 Sloc : Source_Location; 178 end record; 179 180 type Token_Index is new Positive; 181 type Token_Array is array (Token_Index range <>) of Token; 182 package Token_Vectors is new ASIS_UL.Vectors 183 (Token_Index, 184 Token, 185 Token_Array); 186 subtype Token_Vector is Token_Vectors.Vector; 187 type Token_Vector_Ptr is access all Token_Vector; 188 use Token_Vectors; 189 -- use all type Token_Vector; 190 191 function Line_Length 192 (Input : in out Buffer; 193 Ends : Marker_Vector; 194 Line_Num : Positive) 195 return Natural; 196 -- Doesn't count the NL character. This doesn't work for CR/LF line 197 -- endings, which is OK, because we only use it for internally-generated 198 -- text that always uses a single NL. 199 200 Default_Pp_Off_String : aliased constant W_Str := "--!pp off"; 201 Default_Pp_On_String : aliased constant W_Str := "--!pp on"; 202 203 type Pp_Off_On_Delimiters_Rec is record 204 Off : access constant W_Str := Default_Pp_Off_String'Access; 205 On : access constant W_Str := Default_Pp_On_String'Access; 206 -- Text of comments for turning pretting printing off and on, including 207 -- the leading '--'. For example, if the user specified --pp-off='pp-', 208 -- then Off will be "--pp-". A whole-line comment of the form "--pp-" 209 -- will disable pretty printing. 210 -- We do not want these comments to be fillable. 211 end record; 212 213 Gen_Plus : constant W_Str := "--gen+"; -- (style) two spaces required 214 Gen_Minus : constant W_Str := "--gen-"; 215 -- Strings to mark start and end of automatically generated code. 216 217 procedure Get_Tokens 218 (Input : in out Buffer; 219 Result : out Token_Vectors.Vector; 220 Pp_Off_On_Delimiters : Pp_Off_On_Delimiters_Rec; 221 Ignore_Single_Line_Breaks : Boolean := True; 222 Max_Tokens : Token_Index := Token_Index'Last; 223 Line_Ends : Marker_Vector_Ptr := null; 224 Gen_Regions : Token_Vector_Ptr := null); 225 -- Return in Result the sequence of tokens in the Input string. The 226 -- first one is always Start_Of_Input, and the last one End_Of_Input. 227 -- Ignore_Single_Line_Breaks means we should skip any End_Of_Line tokens 228 -- (but not Blank_Lines). Max_Tokens places a limit on the number of tokens 229 -- (not counting Start_Of_Input); we quit before reaching end of input if 230 -- we've gotten that many. 231 -- 232 -- If Line_Ends is non-null, we compute all the line endings in 233 -- Line_Ends.all, which is a mapping from line numbers to Markers in the 234 -- Input string. Each element points to a NL character in the corresponding 235 -- buffer. 236 -- 237 -- Comments starting with Gen_Plus and Gen_Minus, and tokens in between, do 238 -- not appear in Result. If Gen_Regions is non-null, we use it to return 239 -- the sequence of Gen_Plus and Gen_Minus tokens. The generated code is in 240 -- the slices Gen_Regions(1).Sloc..Gen_Regions(2).Sloc, 241 -- Gen_Regions(3).Sloc..Gen_Regions(4).Sloc, and so on. 242 243 function Next_Lexeme 244 (Tokens : Token_Vectors.Vector; 245 Index : Token_Index) 246 return Token; 247 -- Returns the next token after Index that is not a blank line or comment 248 249 function Prev_Lexeme 250 (Tokens : Token_Vectors.Vector; 251 Index : Token_Index) 252 return Token; 253 -- Returns the previous token before Index that is not a blank line or 254 -- comment 255 256 function Get_Token (Input : W_Str) return Token; 257 -- Get just one token, ignoring single line breaks 258 259 procedure Check_Same_Tokens (X, Y : Token_Vectors.Vector); 260 -- Checks that X and Y are the same except for Slocs and line breaks; raise 261 -- an exception if not. 262 263 function In_Gen_Regions 264 (Line : Positive; Gen_Regions : Token_Vector) return Boolean; 265 -- True if the line number is within one of the regions of Gen_Regions. 266 -- The comments are always on a line by themselves, so we don't have to 267 -- worry about column numbers. 268 269 procedure Put_Token (Tok : Token; Index : Token_Index := 1); 270 procedure Put_Tokens 271 (Tokens : Token_Vectors.Vector; 272 First : Token_Index'Base := 1; 273 Last : Token_Index'Base := Token_Index'Last; 274 Highlight : Token_Index'Base := 0); 275 -- Put token(s) to standard output (even if Text_IO.Current_Output has been 276 -- redirected). The tokens come out in compilable form, one per line, with 277 -- the text of the token first, and the other information commented out. 278 -- This one-token-per line code can be used for testing the scanner -- it 279 -- should have identical semantics to the original Ada code. First and Last 280 -- indicate a slice of Tokens, and we tolerate out-of-bounds indices. 281 -- We draw a comment line before Highlight. 282 283end Ada_Trees.Scanner; 284