1 // Copyright 2009 The Archiveopteryx Developers <info@aox.org>
2 
3 #include "parser.h"
4 
5 
6 /*! \class Parser parser.h
7 
8   The Parser class does basic C++ parsing.
9 
10   It doesn't actually parse C++: all it does is lend some support to
11   the header and source handling, which needs to find certain
12   constructs and look at them.
13 */
14 
15 
16 /*! Constructs a Parser for string \a s. The parser's cursor is left
17     at the beginning of \a s. */
18 
Parser(const EString & s)19 Parser::Parser( const EString & s )
20     : t( s ), i( 0 ), ln( 1 ), li( 0 )
21 {
22     // nothing necessary
23 }
24 
25 
26 /*! Returns true if the parser has reached the end of its input, and
27     false if not.
28 */
29 
atEnd() const30 bool Parser::atEnd() const
31 {
32     return i >= t.length();
33 }
34 
35 
36 /*! Returns the parser's current line number.
37 
38 The line number is that of the first unparsed nonwhitespace
39 character. This implies that if the parser's cursor is at the end of a
40 line, then the line number returned is that of the next nonempty line.
41 */
42 
line()43 uint Parser::line()
44 {
45     if ( li > i ) {
46         ln = 1;
47         li = 0;
48     }
49     while ( li < i ||
50             ( t[li] == 32 || t[li] == 9 || t[li] == 13 || t[li] == 10 ) ) {
51         if ( t[li] == 10 )
52             ln++;
53         li++;
54     }
55     return ln;
56 }
57 
58 
59 /*! Scans forward until an instance of \a text is found, and positions
60     the cursor at the first character after that string. */
61 
scan(const EString & text)62 void Parser::scan( const EString & text )
63 {
64     uint j = 0;
65     while ( i < t.length() && j < text.length() ) {
66         j = 0;
67         while ( j < text.length() && t[i+j] == text[j] )
68             j++;
69         if ( j < text.length() )
70             i++;
71     }
72     if ( j == text.length() )
73         i += j;
74 }
75 
76 
77 /*! Scans for \a text and returns all the text, without the trailing
78     instance of \a text. The cursor is left after \a text. */
79 
textUntil(const EString & text)80 EString Parser::textUntil( const EString & text )
81 {
82     uint j = i;
83     scan( text );
84     if ( atEnd() )
85         return t.mid( j, i-j );
86     return t.mid( j, i-j-text.length() );
87 }
88 
89 
90 /*! Scans past whitespace, leaving the cursor at the end or at a
91     nonwhitespace character.
92 */
93 
whitespace()94 void Parser::whitespace()
95 {
96     i = whitespace( i );
97 }
98 
99 
// Returns a copy of t from which every space, tab, CR and LF has
// been removed.

static EString spaceless( const EString & t )
{
    EString r;
    for ( uint i = 0; i < t.length(); i++ ) {
        if ( t[i] == ' ' || t[i] == '\t' || t[i] == '\r' || t[i] == '\n' )
            continue;
        r.append( t[i] );
    }
    return r;
}
111 
112 
113 /*! Returns the C++ identifier at the cursor, or an empty string if
114     there isn't any. Steps past the identifier and any trailing whitespace.
115 */
116 
identifier()117 EString Parser::identifier()
118 {
119     int j = complexIdentifier( i );
120     EString r = spaceless( t.mid( i, j - i ) );
121     i = j;
122     return r;
123 }
124 
125 
126 /*! Scans past the simpler identifier starting at \a j, returning the
127     first position afte the identifier. If something goes wrong,
128     simpleIdentifier() returns \a j.
129 
130     A simple identifier is a text label not containing ::, <, >,
131     whitespace or the like.
132 */
133 
simpleIdentifier(uint j)134 uint Parser::simpleIdentifier( uint j )
135 {
136     uint k = whitespace( j );
137     if ( t.mid( k, 8 ) == "operator" )
138         return operatorHack( k );
139     if ( ( t[k] >= 'A' && t[k] <= 'z' ) ||
140          ( t[k] >= 'a' && t[k] <= 'z' ) ) {
141         j = k + 1;
142         while ( ( t[j] >= 'A' && t[j] <= 'z' ) ||
143                 ( t[j] >= 'a' && t[j] <= 'z' ) ||
144                 ( t[j] >= '0' && t[j] <= '9' ) ||
145                 ( t[j] == '_' ) )
146             j++;
147     }
148     return j;
149 }
150 
151 
152 /*! Scans past the complex identifier starting at \a j, returning the
153     first position after the identifier. If something goes wrong,
154     complexIdentifier() returns \a j.
155 
156     A complex identifier is anything that may be used as an identifier
157     in C++, even "operator const char *".
158 */
159 
complexIdentifier(uint j)160 uint Parser::complexIdentifier( uint j )
161 {
162     uint k = whitespace( j );
163     if ( t[k] == ':' && t[k+1] == ':' )
164         k = whitespace( k + 2 );
165     uint l = simpleIdentifier( k );
166     if ( l == k )
167         return j;
168     j = whitespace( l );
169 
170     while ( t[j] == ':' && t[j+1] == ':' ) {
171         if ( t.mid( j+2, 8 ) == "operator" )
172             j = operatorHack( j+2 );
173         else if ( t[j+2] == '~' )
174             j = simpleIdentifier( j + 3 );
175         else
176             j = simpleIdentifier( j + 2 );
177     }
178 
179     j = whitespace( j );
180     if ( t[j] == '<' ) {
181         k = complexIdentifier( j + 1 );
182         if ( k > j + 1 && t[k] == '>' )
183             j = k+1;
184     }
185     return j;
186 }
187 
188 
189 /*! Parses a type name starting at \a j and returns the first
190     character after the type name (and after trailing whitespace). If
191     a type name can't be parsed, \a j is returned.
192 */
193 
type(uint j)194 uint Parser::type( uint j )
195 {
196     // first, we have zero or more of const, static etc.
197     uint l = j;
198     uint k;
199     do {
200         k = l;
201         l = whitespace( k );
202         while ( t[l] >= 'a' && t[l] <= 'z' )
203             l++;
204         EString modifier = t.mid( k, l-k ).simplified();
205         if ( !( modifier == "const" ||
206                 modifier == "inline" ||
207                 modifier == "unsigned" ||
208                 modifier == "signed" ||
209                 modifier == "class" ||
210                 modifier == "struct" ||
211                 modifier == "virtual" ||
212                 modifier == "static" ) )
213             l = k;
214     } while ( l > k );
215 
216     l = complexIdentifier( k );
217     if ( l == k )
218         return j;
219 
220     k = whitespace( l );
221     if ( t[k] == ':' && t[k+1] == ':' ) {
222         l = whitespace( simpleIdentifier( k+2 ) );
223         if ( l == k )
224             return j;
225         k = l;
226     }
227 
228     if ( t[k] == '&' || t[k] == '*' )
229         k = whitespace( k + 1 );
230     return k;
231 }
232 
233 
234 /*! Parses a type specifier and returns it as a string. If the cursor
235     doesn't point to one, type() returns an empty string.
236 
237 */
238 
type()239 EString Parser::type()
240 {
241     uint j = type( i );
242     EString r = t.mid( i, j-i ).simplified(); // simplified() is not quite right
243     i = j;
244     while ( r.startsWith( "class " ) )
245         r = r.mid( 6 );
246     r.replace( " class ", " " );
247     return r;
248 }
249 
250 
251 /*! Parses an argument list (for a particularly misleading meaning of
252     parse) and returns it. The cursor must be on the leading '(', it
253     will be left immediately after the trailing ')'.
254 
255     The argument list is returned including parentheses. In case of an
256     error, an empty string is returned and the cursor is left near the
257     error.
258 */
259 
argumentList()260 EString Parser::argumentList()
261 {
262     EString r;
263     uint j = whitespace( i );
264     if ( t[j] != '(' )
265         return r;
266     r = "( ";
267     i = whitespace( j + 1 );
268     if ( t[i] == ')' ) {
269         i++;
270         return "()";
271     }
272     EString s = "";
273     bool more = true;
274     while ( more ) {
275         EString tp = type();
276         if ( tp.isEmpty() )
277             return ""; // error message here?
278         whitespace();
279         j = simpleIdentifier( i );
280         if ( j > i ) { // there is a variable name
281             tp = tp + " " + t.mid( i, j-i ).simplified();
282             i = j;
283         }
284         r = r + s + tp;
285         whitespace();
286         if ( t[i] == '=' ) { // there is a default value...
287             while ( i < t.length() && t[i] != ',' && t[i] != ')' )
288                 i++;
289             whitespace();
290         }
291         else if ( t[i] == '[' && t[i+1] == ']' ) { // this argument is an array
292             i = i + 2;
293             r.append( "[]" );
294             whitespace();
295         }
296         s = ", ";
297         if ( t[i] == ',' ) {
298             more = true;
299             i++;
300         }
301         else {
302             more = false;
303         }
304     }
305     if ( t[i] != ')' )
306         return "";
307     r.append( " )" );
308     i++;
309     return r;
310 }
311 
312 
313 /*! Steps the Parser past one character. */
314 
step()315 void Parser::step()
316 {
317     i++;
318 }
319 
320 
321 /*! Returns true if the first unparsed characters of the string are
322     the same as \a pattern, and false if not. */
323 
lookingAt(const EString & pattern)324 bool Parser::lookingAt( const EString & pattern )
325 {
326     return t.mid( i, pattern.length() ) == pattern;
327 }
328 
329 
330 /*! Parses and steps past a single word. If the next nonwhitespace
331     character is not a word character, this function returns an empty
332     string.
333 */
334 
word()335 EString Parser::word()
336 {
337     uint j = simpleIdentifier( i );
338     while ( t[j] == '-' ) {
339         uint k = simpleIdentifier( j+1 );
340         if ( k > j + 1 )
341             j = k;
342     }
343     EString r = t.mid( i, j-i ).simplified();
344     if ( !r.isEmpty() )
345         i = j;
346     return r;
347 }
348 
349 
350 /*! Parses and steps past a single value, which is either a number or
351     an identifier.
352 */
353 
value()354 EString Parser::value()
355 {
356     uint j = whitespace( i );
357     if ( t[j] == '-' ||
358          ( t[j] >= '0' && t[j] <= '9' ) ) {
359         uint k = j;
360         if ( t[k] == '-' )
361             k++;
362         while ( t[k] >= '0' && t[k] <= '9' )
363             k++;
364         EString r( t.mid( j, k-j ) );
365         i = k;
366         return r;
367     }
368     return identifier();
369 }
370 
371 
372 /*! Steps past the whitespace starting at \a j and return the index of
373     the first following nonwhitespace character.
374 */
375 
whitespace(uint j)376 uint Parser::whitespace( uint j )
377 {
378     uint k;
379     do {
380         k = j;
381 
382         while ( t[j] == 32 || t[j] == 9 || t[j] == 13 || t[j] == 10 )
383             j++;
384 
385         if ( t[j] == '/' && t[j+1] == '/' ) {
386             while ( j < t.length() && t[j] != '\n' )
387                 j++;
388         }
389     } while ( j > k );
390 
391     return j;
392 }
393 
394 
395 /*! Reads past an operator name starting at \a j and returns the index
396     of the following characters. If \a j does not point to an operator
397     name, operatorHack() returns \a j.
398 */
399 
operatorHack(uint j)400 uint Parser::operatorHack( uint j )
401 {
402     uint i, k = j+8;
403     k = whitespace( k );
404 
405     // Four possible cases: We're looking at a single character, two
406     // characters, '()', or "EString".
407 
408     uint chars = 0;
409 
410     if ( t[k] == '(' && t[k+1] == ')' ) {
411         chars = 2;
412     }
413     else if ( ( ( t[k] > ' ' && t[k] < '@' ) ||
414                   ( t[k] > 'Z' && t[k] < 'a' ) ) &&
415                 !( t[k] >= '0' && t[k] <= '9' ) ) {
416         chars = 1;
417         if ( t[k+1] != '(' &&
418              ( ( t[k+1] > ' ' && t[k+1] < '@' ) ||
419                ( t[k] > 'Z' && t[k] < 'a' ) ) &&
420              !( t[k+1] >= '0' && t[k+1] <= '9' ) )
421             chars = 2;
422     }
423     else if ( ( i = type( k ) ) > k ) {
424         chars = i-k;
425     }
426 
427     if ( chars > 0 ) {
428         k = whitespace( k+chars );
429         if ( t[k] == '(' )
430             return k;
431     }
432     return j;
433 }
434