1------------------------------------------------------------------------------
2--                                                                          --
3--                            GNAT2XML COMPONENTS                           --
4--                                                                          --
5--                      G N A T 2 X M L . S C A N N E R                     --
6--                                                                          --
7--                                 S p e c                                  --
8--                                                                          --
9--                  Copyright (C) 2012-2014, AdaCore, Inc.                  --
10--                                                                          --
11-- Gnat2xml is free software; you can redistribute it and/or modify it      --
12-- under terms of the  GNU General Public License  as published by the Free --
13-- Software Foundation;  either version 2,  or  (at your option)  any later --
14-- version. Gnat2xml is distributed  in the hope  that it will be useful,   --
15-- but WITHOUT ANY WARRANTY; without even the implied warranty of MER-      --
16-- CHANTABILITY or  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General  --
17-- Public License for more details. You should have received a copy of the  --
18-- GNU General Public License distributed with GNAT; see file COPYING. If   --
19-- not, write to the Free Software Foundation, 59 Temple Place Suite 330,   --
20-- Boston, MA 02111-1307, USA.                                              --
21-- The gnat2xml tool was derived from the Avatox sources.                   --
22------------------------------------------------------------------------------
23
24pragma Ada_2012;
25
26with ASIS_UL.Vectors;
27
28with Ada_Trees.Buffers; use Ada_Trees.Buffers;
29use Ada_Trees.Buffers.Marker_Vectors;
30--  use all type Ada_Trees.Buffers.Marker_Vector;
31
32package Ada_Trees.Scanner is
33
34   --  This package provides a simple lexical scanner for Ada tokens. There are
35   --  some unusual things about this scanner:
36   --
37   --     We don't distinguish most of the different kinds of tokens; most are
38   --     lumped together under the Lexeme kind, and reserved words are lumped
39   --     together under Reserved_Word. We only distinguish where we need to.
40   --
41   --     We do not ignore comments; a comment is considered to be a token.
42   --
43   --     We do not ignore blank lines. We do ignore a single line break,
44   --     if Ignore_Single_Line_Breaks is True. Other whitespace (blanks and
45   --     tabs) between tokens is always ignored.
46   --
47   --     We don't check for errors, because we're in ASIS, where Ada code is
48   --     known to be legal.
49
50   type Token_Kind is
51     (Nil,
52      Start_Of_Input,
53      End_Of_Input,
54      Identifier,
55      Reserved_Word,
56      String_Literal,
57      Numeric_Literal,
58      Lexeme, -- misc lexemes as defined in the RM
59      Pp_Off_Comment,
60      --  A whole-line comment that matches the --pp-off string
61      Pp_On_Comment,
62      --  A whole-line comment that matches the --pp-on string
63      Other_Whole_Line_Comment,
64   --  A comment that appears by itself on a line. Multiple comments that may
65   --  be filled as a "paragraph" are combined into a single Whole_Line_Comment
66   --  token. This comment is a Whole_Line_Comment.
67      End_Of_Line_Comment,
68   --  A comment that appears at the end of a line, after some other
69   --  program text. The above comment starting "misc lexemes" is an
70   --  End_Of_Line_Comment.
71      End_Of_Line, -- First in a series of one or more NLs.
72      Blank_Line); -- Second, third, ... in a series of one or more NLs.
73
74   subtype Whole_Line_Comment is Token_Kind with
75     Static_Predicate => Whole_Line_Comment in
76       Pp_Off_Comment | Pp_On_Comment | Other_Whole_Line_Comment;
77
78   subtype Comment_Kind is Token_Kind with
79        Predicate => Comment_Kind in Whole_Line_Comment | End_Of_Line_Comment;
80
81   subtype Pp_Off_On_Comment is Token_Kind with
82        Predicate => Pp_Off_On_Comment in Pp_Off_Comment | Pp_On_Comment;
83
84   type Source_Location is record
85      Line, Col : Positive; -- 1-based line and column numbers
86      First     : Positive;
87      Last      : Natural;
88
89      Firstx, Lastx : Marker;
90      --  ???Same information as First&Last. These should replace First&Last
91      --  eventually. Note that Lastx points one past the last character.
92   end record;
93
94   function First_Pos (Input : Buffer; Sloc : Source_Location) return Positive;
95   function Last_Pos (Input : Buffer; Sloc : Source_Location) return Natural;
96   --  Absolute position in Input (parameter of Get_Tokens) of the start and
97   --  end of the token. So the text of the token is exactly equal to the slice
98   --  Input (First..Last). Note that Input'First might not be 1.
99
100   function Image
101     (Sloc : Source_Location)
102      return String is
103     (Image (Sloc.Line) &
104      ":" &
105      Image (Sloc.Col) &
106      "(" &
107      Image (Sloc.First) &
108      ".." &
109      Image (Sloc.Last) &
110      ")");
111
112   function Message_Image
113     (Sloc : Source_Location) return String is
114     (Image (Sloc.Line) &
115      ":" &
116      Image (Sloc.Col));
117
118   function Message_Image
119     (Tree : Ada_Tree; Sloc : Source_Location) return String is
120      --  Tree is the A_Compilation_Unit node
121     (Get_Name_String (Tree.Source_File) &
122      ":" &
123      Image (Sloc.Line) &
124      ":" &
125      Image (Sloc.Col));
126
127   type Token is record
128      Kind : Token_Kind := Nil;
129
130      Text : Name_Id;
131      --  The text of the token as it appears in the source, with these
132      --  exceptions and clarifications:
133      --
134      --  Start_Of_Input and End_Of_Input have Text = "".
135      --
136      --  For Blank_Line: does not include the text of the preceding
137      --  End_Of_Line or Blank_Line (i.e. it is usually just LF, but could
138      --  be CR/LF -- not LF,LF nor CR,LF,CR,LF).
139      --
140      --  For comments, the text of the comment excluding the initial "--"
141      --  and leading and trailing blanks, and followed by an extra NL. For
142      --  multi-line comment "paragraphs", used for filling, NL terminates each
143      --  line. The NL at the end isn't really part of the comment; the next
144      --  token in the stream will be End_Of_Line. The reason for the extra NL
145      --  is that GNATCOLL.Paragraph_Filling expects it, so it's simpler and
146      --  more efficient this way.
147
148      Normalized : Name_Id;
149      --  Same as Text, or converted to lower case, depending on the Kind.
150      --  Comments have Normalized = No_Name, so we can detect specific
151      --  reserved words. For example, the "BEGIN" reserved word will have Text
152      --  = "BEGIN" and Normalized = "begin". The comment "-- begin" will have
153      --  Text = "begin" and Normalized = No_Name.
154
155      Leading_Blanks : Natural;
156      --  For comments, the number of leading blanks, which are blanks after
157      --  the initial "--" and before any nonblank characters. For other
158      --  tokens, zero.
159
160      Width : Natural;
161      --  For most tokens, this is the width of the token, i.e. the same as
162      --  Sloc.Last-Sloc.First+1, and the same as the length of Text. For
163      --  multi-line comments, this is the width of the widest line. For all
164      --  comments, the initial "--" and any leading blanks are included, but
165      --  the NL's are not.
166
167      Is_Special_Comment : Boolean;
168      --  True if this is a "special" comment; that is, one that should not be
169      --  formatted in any way. False for other comments and for non-comments.
170
171      Is_Fillable_Comment : Boolean;
172      --  True if this is a fillable comment; that is, one that should be
173      --  filled if filling is turned on. False for other comments and for
174      --  non-comments. Special comments are not fillable; Is_Special_Comment
175      --  implies not Is_Fillable_Comment.
176
177      Sloc : Source_Location;
178   end record;
179
180   type Token_Index is new Positive;
181   type Token_Array is array (Token_Index range <>) of Token;
182   package Token_Vectors is new ASIS_UL.Vectors
183     (Token_Index,
184      Token,
185      Token_Array);
186   subtype Token_Vector is Token_Vectors.Vector;
187   type Token_Vector_Ptr is access all Token_Vector;
188   use Token_Vectors;
189   --  use all type Token_Vector;
190
191   function Line_Length
192     (Input    : in out Buffer;
193      Ends     : Marker_Vector;
194      Line_Num : Positive)
195      return     Natural;
196   --  Doesn't count the NL character. This doesn't work for CR/LF line
197   --  endings, which is OK, because we only use it for internally-generated
198   --  text that always uses a single NL.
199
200   Default_Pp_Off_String : aliased constant W_Str := "--!pp off";
201   Default_Pp_On_String : aliased constant W_Str := "--!pp on";
202
203   type Pp_Off_On_Delimiters_Rec is record
204      Off : access constant W_Str := Default_Pp_Off_String'Access;
205      On : access constant W_Str := Default_Pp_On_String'Access;
206      --  Text of comments for turning pretting printing off and on, including
207      --  the leading '--'. For example, if the user specified --pp-off='pp-',
208      --  then Off will be "--pp-". A whole-line comment of the form "--pp-"
209      --  will disable pretty printing.
210      --  We do not want these comments to be fillable.
211   end record;
212
213   Gen_Plus : constant W_Str := "--gen+"; -- (style) two spaces required
214   Gen_Minus : constant W_Str := "--gen-";
215   --  Strings to mark start and end of automatically generated code.
216
217   procedure Get_Tokens
218     (Input                     : in out Buffer;
219      Result                    : out Token_Vectors.Vector;
220      Pp_Off_On_Delimiters      : Pp_Off_On_Delimiters_Rec;
221      Ignore_Single_Line_Breaks : Boolean           := True;
222      Max_Tokens                : Token_Index       := Token_Index'Last;
223      Line_Ends                 : Marker_Vector_Ptr := null;
224      Gen_Regions               : Token_Vector_Ptr  := null);
225   --  Return in Result the sequence of tokens in the Input string. The
226   --  first one is always Start_Of_Input, and the last one End_Of_Input.
227   --  Ignore_Single_Line_Breaks means we should skip any End_Of_Line tokens
228   --  (but not Blank_Lines). Max_Tokens places a limit on the number of tokens
229   --  (not counting Start_Of_Input); we quit before reaching end of input if
230   --  we've gotten that many.
231   --
232   --  If Line_Ends is non-null, we compute all the line endings in
233   --  Line_Ends.all, which is a mapping from line numbers to Markers in the
234   --  Input string. Each element points to a NL character in the corresponding
235   --  buffer.
236   --
237   --  Comments starting with Gen_Plus and Gen_Minus, and tokens in between, do
238   --  not appear in Result. If Gen_Regions is non-null, we use it to return
239   --  the sequence of Gen_Plus and Gen_Minus tokens.  The generated code is in
240   --  the slices Gen_Regions(1).Sloc..Gen_Regions(2).Sloc,
241   --  Gen_Regions(3).Sloc..Gen_Regions(4).Sloc, and so on.
242
243   function Next_Lexeme
244     (Tokens : Token_Vectors.Vector;
245      Index  : Token_Index)
246      return   Token;
247   --  Returns the next token after Index that is not a blank line or comment
248
249   function Prev_Lexeme
250     (Tokens : Token_Vectors.Vector;
251      Index  : Token_Index)
252      return   Token;
253   --  Returns the previous token before Index that is not a blank line or
254   --  comment
255
256   function Get_Token (Input : W_Str) return Token;
257   --  Get just one token, ignoring single line breaks
258
259   procedure Check_Same_Tokens (X, Y : Token_Vectors.Vector);
260   --  Checks that X and Y are the same except for Slocs and line breaks; raise
261   --  an exception if not.
262
263   function In_Gen_Regions
264     (Line : Positive; Gen_Regions : Token_Vector) return Boolean;
265   --  True if the line number is within one of the regions of Gen_Regions.
266   --  The comments are always on a line by themselves, so we don't have to
267   --  worry about column numbers.
268
269   procedure Put_Token (Tok : Token; Index : Token_Index := 1);
270   procedure Put_Tokens
271     (Tokens    : Token_Vectors.Vector;
272      First     : Token_Index'Base := 1;
273      Last      : Token_Index'Base := Token_Index'Last;
274      Highlight : Token_Index'Base := 0);
275   --  Put token(s) to standard output (even if Text_IO.Current_Output has been
276   --  redirected). The tokens come out in compilable form, one per line, with
277   --  the text of the token first, and the other information commented out.
278   --  This one-token-per line code can be used for testing the scanner -- it
279   --  should have identical semantics to the original Ada code. First and Last
280   --  indicate a slice of Tokens, and we tolerate out-of-bounds indices.
281   --  We draw a comment line before Highlight.
282
283end Ada_Trees.Scanner;
284