1 /*
2  * Copyright 1993-2002 Christopher Seiwald and Perforce Software, Inc.
3  *
4  * This file is part of Jam - see jam.c for Copyright information.
5  */
6 
7 /*
8  * scan.c - the jam yacc scanner
9  *
10  */
11 
12 #include "jam.h"
13 #include "scan.h"
14 #include "output.h"
15 
16 #include "constants.h"
17 #include "jambase.h"
18 #include "jamgram.hpp"
19 
20 
/*
 * Keyword table consulted by yylex(): each entry pairs a keyword's spelling
 * with the token type reported for it. The entries themselves come from
 * jamgramtab.h; the table ends with an entry whose word pointer is null.
 */
struct keyword
{
    const char * word;  /* keyword spelling; null in the terminating entry */
    int    type;        /* value stored in yylval.type when the word matches */
} keywords[] =
{
#include "jamgramtab.h"
    { 0, 0 }
};
30 
/*
 * One input source being scanned. Sources form a singly linked stack headed
 * by incp: yyfparse() pushes a new entry and yyfdone() pops and frees it.
 */
typedef struct include include;
struct include
{
    include   * next;        /* next serial include file */
    char      * string;      /* pointer into current line */
    char    * * strings;     /* null-terminated array of lines to scan instead
                              * of a file; set by yyfparse() for "+" and by
                              * yysparse() */
    LISTITER    pos;         /* for yysparse() -- text to parse
                              * (not referenced by the code visible here) */
    LIST      * list;        /* for yysparse() -- text to parse
                              * (not referenced by the code visible here) */
    FILE      * file;        /* for yyfparse() -- file being read; opened
                              * lazily by yyline() */
    OBJECT    * fname;       /* for yyfparse() -- file name */
    int         line;        /* line counter for error messages */
    char        buf[ 512 ];  /* for yyfparse() -- line buffer filled by fgets() */
};
44 
static include * incp = 0;  /* current file; head of chain */

/* Current scanner mode; read by yylex() and replaced through yymode(). */
static int scanmode = SCAN_NORMAL;
/* Number of times yyerror() has been called. */
static int anyerrors = 0;


/* Formats a token description into a static buffer (defined below). */
static char * symdump( YYSTYPE * );

#define BIGGEST_TOKEN 10240  /* no single token can be larger */
54 
55 
56 /*
57  * Set parser mode: normal, string, or keyword.
58  */
59 
yymode(int n)60 int yymode( int n )
61 {
62     int result = scanmode;
63     scanmode = n;
64     return result;
65 }
66 
67 
yyerror(char const * s)68 void yyerror( char const * s )
69 {
70     /* We use yylval instead of incp to access the error location information as
71      * the incp pointer will already be reset to 0 in case the error occurred at
72      * EOF.
73      *
74      * The two may differ only if ran into an unexpected EOF or we get an error
75      * while reading a lexical token spanning multiple lines, e.g. a multi-line
76      * string literal or action body, in which case yylval location information
77      * will hold the information about where the token started while incp will
78      * hold the information about where reading it broke.
79      */
80     out_printf( "%s:%d: %s at %s\n", object_str( yylval.file ), yylval.line, s,
81             symdump( &yylval ) );
82     ++anyerrors;
83 }
84 
85 
yyanyerrors()86 int yyanyerrors()
87 {
88     return anyerrors != 0;
89 }
90 
91 
/*
 * yyfparse() - push a new input source named s onto the incp chain.
 *
 * The file itself is not opened here; yyline() opens it on first use. The
 * special name "+" selects the built-in jambase text instead of a file.
 */
void yyfparse( OBJECT * s )
{
    include * const entry = (include *)BJAM_MALLOC( sizeof( include ) );

    /* Start on an empty line so the first read fetches a fresh one. */
    entry->string = (char*)"";
    entry->file = 0;
    entry->fname = object_copy( s );
    entry->line = 0;

    /* "+" means: read from the internal jambase rather than from a file. */
    if ( strcmp( object_str( s ), "+" ) == 0 )
        entry->strings = (char**)jambase;
    else
        entry->strings = 0;

    /* Link in as the new head of the chain. */
    entry->next = incp;
    incp = entry;
}
109 
110 
/*
 * yysparse() - push an in-memory source: a null-terminated array of lines,
 * reported under the given name.
 */
void yysparse( OBJECT * name, const char * * lines )
{
    include * top;

    /* Push a regular entry first, then point it at the supplied lines. */
    yyfparse( name );
    top = incp;
    top->strings = (char * *)lines;
}
116 
117 
118 /*
119  * yyfdone() - cleanup after we're done parsing a file.
120  */
yyfdone(void)121 void yyfdone( void )
122 {
123     include * const i = incp;
124     incp = i->next;
125 
126     /* Close file, free name. */
127     if(i->file && (i->file != stdin))
128         fclose(i->file);
129     object_free(i->fname);
130     BJAM_FREE((char *)i);
131 }
132 
133 
134 /*
135  * yyline() - read new line and return first character.
136  *
137  * Fabricates a continuous stream of characters across include files, returning
138  * EOF at the bitter end.
139  */
140 
yyline()141 int yyline()
142 {
143     include * const i = incp;
144 
145     if ( !incp )
146         return EOF;
147 
148     /* Once we start reading from the input stream, we reset the include
149      * insertion point so that the next include file becomes the head of the
150      * list.
151      */
152 
153     /* If there is more data in this line, return it. */
154     if ( *i->string )
155         return *i->string++;
156 
157     /* If we are reading from an internal string list, go to the next string. */
158     if ( i->strings )
159     {
160         if ( *i->strings )
161         {
162             ++i->line;
163             i->string = *(i->strings++);
164             return *i->string++;
165         }
166     }
167     else
168     {
169         /* If necessary, open the file. */
170         if ( !i->file )
171         {
172             FILE * f = stdin;
173             if ( strcmp( object_str( i->fname ), "-" ) && !( f = fopen( object_str( i->fname ), "r" ) ) )
174                 perror( object_str( i->fname ) );
175             i->file = f;
176         }
177 
178         /* If there is another line in this file, start it. */
179         if ( i->file && fgets( i->buf, sizeof( i->buf ), i->file ) )
180         {
181             ++i->line;
182             i->string = i->buf;
183             return *i->string++;
184         }
185     }
186 
187     /* This include is done. Return EOF so yyparse() returns to
188      * parse_file().
189      */
190 
191     return EOF;
192 }
193 
194 /* This allows us to get an extra character of lookahead.
195  * There are a few places where we need to look ahead two
196  * characters and yyprev only guarantees a single character
197  * of putback.
198  */
yypeek()199 int yypeek()
200 {
201     if ( *incp->string )
202     {
203         return *incp->string;
204     }
205     else if ( incp->strings )
206     {
207         if ( *incp->strings )
208             return **incp->strings;
209     }
210     else if ( incp->file )
211     {
212         /* Don't bother opening the file.  yypeek is
213          * only used in special cases and never at the
214          * beginning of a file.
215          */
216         int ch = fgetc( incp->file );
217         if ( ch != EOF )
218             ungetc( ch, incp->file );
219         return ch;
220     }
221     return EOF;
222 }
223 
224 /*
225  * yylex() - set yylval to current token; return its type.
226  *
227  * Macros to move things along:
228  *
229  *  yychar() - return and advance character; invalid after EOF.
230  *  yyprev() - back up one character; invalid before yychar().
231  *
232  * yychar() returns a continuous stream of characters, until it hits the EOF of
233  * the current include file.
234  */
235 
/* Next character: taken from the current line when one is left, otherwise
 * whatever yyline() returns (the first character of the next line, or EOF).
 */
#define yychar() ( *incp->string ? *incp->string++ : yyline() )
/* Step the current-line pointer back by one character. */
#define yyprev() ( incp->string-- )

/* Initialised to 0 and not assigned anywhere in the code visible here. When
 * non-zero, the two keyword macros below break out of the enclosing loop.
 */
static int use_new_scanner = 0;
/* NOTE(review): yylex() declares a local with this same name, and that local
 * is the one yyendkeyword() sets when expanded there.
 */
static int expect_whitespace;

/* Both macros must be used inside a loop, in a scope where has_token_warning
 * and expect_whitespace are visible.
 *
 * yystartkeyword(): with use_new_scanner set, break; otherwise flag a token
 * warning and carry on.
 * yyendkeyword(): with use_new_scanner set, break; otherwise set
 * expect_whitespace and continue with the next loop iteration.
 */
#define yystartkeyword() if(use_new_scanner) break; else token_warning()
#define yyendkeyword() if(use_new_scanner) break; else if ( 1 ) { expect_whitespace = 1; continue; } else (void)0
244 
do_token_warning()245 void do_token_warning()
246 {
247     out_printf( "%s:%d: %s %s\n", object_str( yylval.file ), yylval.line, "Unescaped special character in",
248             symdump( &yylval ) );
249 }
250 
251 #define token_warning() has_token_warning = 1
252 
yylex()253 int yylex()
254 {
255     int c;
256     char buf[ BIGGEST_TOKEN ];
257     char * b = buf;
258 
259     if ( !incp )
260         goto eof;
261 
262     /* Get first character (whitespace or of token). */
263     c = yychar();
264 
265     if ( scanmode == SCAN_STRING )
266     {
267         /* If scanning for a string (action's {}'s), look for the closing brace.
268          * We handle matching braces, if they match.
269          */
270 
271         int nest = 1;
272 
273         while ( ( c != EOF ) && ( b < buf + sizeof( buf ) ) )
274         {
275             if ( c == '{' )
276                 ++nest;
277 
278             if ( ( c == '}' ) && !--nest )
279                 break;
280 
281             *b++ = c;
282 
283             c = yychar();
284 
285             /* Turn trailing "\r\n" sequences into plain "\n" for Cygwin. */
286             if ( ( c == '\n' ) && ( b[ -1 ] == '\r' ) )
287                 --b;
288         }
289 
290         /* We ate the ending brace -- regurgitate it. */
291         if ( c != EOF )
292             yyprev();
293 
294         /* Check for obvious errors. */
295         if ( b == buf + sizeof( buf ) )
296         {
297             yyerror( "action block too big" );
298             goto eof;
299         }
300 
301         if ( nest )
302         {
303             yyerror( "unmatched {} in action block" );
304             goto eof;
305         }
306 
307         *b = 0;
308         yylval.type = STRING;
309         yylval.string = object_new( buf );
310         yylval.file = incp->fname;
311         yylval.line = incp->line;
312     }
313     else
314     {
315         char * b = buf;
316         struct keyword * k;
317         int inquote = 0;
318         int notkeyword;
319         int hastoken = 0;
320         int hasquote = 0;
321         int ingrist = 0;
322         int invarexpand = 0;
323         int expect_whitespace = 0;
324         int has_token_warning = 0;
325 
326         /* Eat white space. */
327         for ( ; ; )
328         {
329             /* Skip past white space. */
330             while ( ( c != EOF ) && isspace( c ) )
331                 c = yychar();
332 
333             /* Not a comment? */
334             if ( c != '#' )
335                 break;
336 
337             c = yychar();
338             if ( ( c != EOF ) && c == '|' )
339             {
340                 /* Swallow up block comment. */
341                 int c0 = yychar();
342                 int c1 = yychar();
343                 while ( ! ( c0 == '|' && c1 == '#' ) && ( c0 != EOF && c1 != EOF ) )
344                 {
345                     c0 = c1;
346                     c1 = yychar();
347                 }
348                 c = yychar();
349             }
350             else
351             {
352                 /* Swallow up comment line. */
353                 while ( ( c != EOF ) && ( c != '\n' ) ) c = yychar();
354             }
355         }
356 
357         /* c now points to the first character of a token. */
358         if ( c == EOF )
359             goto eof;
360 
361         yylval.file = incp->fname;
362         yylval.line = incp->line;
363 
364         /* While scanning the word, disqualify it for (expensive) keyword lookup
365          * when we can: $anything, "anything", \anything
366          */
367         notkeyword = c == '$';
368 
369         /* Look for white space to delimit word. "'s get stripped but preserve
370          * white space. \ protects next character.
371          */
372         while
373         (
374             ( c != EOF ) &&
375             ( b < buf + sizeof( buf ) ) &&
376             ( inquote || invarexpand || !isspace( c ) )
377         )
378         {
379             if ( expect_whitespace || ( isspace( c ) && ! inquote ) )
380             {
381                 token_warning();
382                 expect_whitespace = 0;
383             }
384             if ( !inquote && !invarexpand )
385             {
386                 if ( scanmode == SCAN_COND || scanmode == SCAN_CONDB )
387                 {
388                     if ( hastoken && ( c == '=' || c == '<' || c == '>' || c == '!' || c == '(' || c == ')' || c == '&' || c == '|' ) )
389                     {
390                         /* Don't treat > as special if we started with a grist. */
391                         if ( ! ( scanmode == SCAN_CONDB && ingrist == 1 && c == '>' ) )
392                         {
393                             yystartkeyword();
394                         }
395                     }
396                     else if ( c == '=' || c == '(' || c == ')' )
397                     {
398                         *b++ = c;
399                         c = yychar();
400                         yyendkeyword();
401                     }
402                     else if ( c == '!' || ( scanmode == SCAN_COND && ( c == '<' || c == '>' ) ) )
403                     {
404                         *b++ = c;
405                         if ( ( c = yychar() ) == '=' )
406                         {
407                             *b++ = c;
408                             c = yychar();
409                         }
410                         yyendkeyword();
411                     }
412                     else if ( c == '&' || c == '|' )
413                     {
414                         *b++ = c;
415                         if ( yychar() == c )
416                         {
417                             *b++ = c;
418                             c = yychar();
419                         }
420                         yyendkeyword();
421                     }
422                 }
423                 else if ( scanmode == SCAN_PARAMS )
424                 {
425                     if ( c == '*' || c == '+' || c == '?' || c == '(' || c == ')' )
426                     {
427                         if ( !hastoken )
428                         {
429                             *b++ = c;
430                             c = yychar();
431                             yyendkeyword();
432                         }
433                         else
434                         {
435                             yystartkeyword();
436                         }
437                     }
438                 }
439                 else if ( scanmode == SCAN_XASSIGN && ! hastoken )
440                 {
441                     if ( c == '=' )
442                     {
443                         *b++ = c;
444                         c = yychar();
445                         yyendkeyword();
446                     }
447                     else if ( c == '+' || c == '?' )
448                     {
449                         if ( yypeek() == '=' )
450                         {
451                             *b++ = c;
452                             *b++ = yychar();
453                             c = yychar();
454                             yyendkeyword();
455                         }
456                     }
457                 }
458                 else if ( scanmode == SCAN_NORMAL || scanmode == SCAN_ASSIGN )
459                 {
460                     if ( c == '=' )
461                     {
462                         if ( !hastoken )
463                         {
464                             *b++ = c;
465                             c = yychar();
466                             yyendkeyword();
467                         }
468                         else
469                         {
470                             yystartkeyword();
471                         }
472                     }
473                     else if ( c == '+' || c == '?' )
474                     {
475                         if ( yypeek() == '=' )
476                         {
477                             if ( hastoken )
478                             {
479                                 yystartkeyword();
480                             }
481                             else
482                             {
483                                 *b++ = c;
484                                 *b++ = yychar();
485                                 c = yychar();
486                                 yyendkeyword();
487                             }
488                         }
489                     }
490                 }
491                 if ( scanmode != SCAN_CASE && ( c == ';' || c == '{' || c == '}' ||
492                     ( scanmode != SCAN_PARAMS && ( c == '[' || c == ']' ) ) ) )
493                 {
494                     if ( ! hastoken )
495                     {
496                         *b++ = c;
497                         c = yychar();
498                         yyendkeyword();
499                     }
500                     else
501                     {
502                         yystartkeyword();
503                     }
504                 }
505                 else if ( c == ':' )
506                 {
507                     if ( ! hastoken )
508                     {
509                         *b++ = c;
510                         c = yychar();
511                         yyendkeyword();
512                         break;
513                     }
514                     else if ( hasquote )
515                     {
516                         /* Special rules for ':' do not apply after we quote anything. */
517                         yystartkeyword();
518                     }
519                     else if ( ingrist == 0 )
520                     {
521                         int next = yychar();
522                         int is_win_path = 0;
523                         int is_conditional = 0;
524                         if ( next == '\\' )
525                         {
526                             if( yypeek() == '\\' )
527                             {
528                                 is_win_path = 1;
529                             }
530                         }
531                         else if ( next == '/' )
532                         {
533                             is_win_path = 1;
534                         }
535                         yyprev();
536                         if ( is_win_path )
537                         {
538                             /* Accept windows paths iff they are at the start or immediately follow a grist. */
539                             if ( b > buf && isalpha( b[ -1 ] ) && ( b == buf + 1 || b[ -2 ] == '>' ) )
540                             {
541                                 is_win_path = 1;
542                             }
543                             else
544                             {
545                                 is_win_path = 0;
546                             }
547                         }
548                         if ( next == '<' )
549                         {
550                             /* Accept conditionals only for tokens that start with "<" or "!<" */
551                             if ( ( (b > buf) && (buf[ 0 ] == '<') ) ||
552                                 ( (b > (buf + 1)) && (buf[ 0 ] == '!') && (buf[ 1 ] == '<') ))
553                             {
554                                 is_conditional = 1;
555                             }
556                         }
557                         if ( !is_conditional && !is_win_path )
558                         {
559                             yystartkeyword();
560                         }
561                     }
562                 }
563             }
564             hastoken = 1;
565             if ( c == '"' )
566             {
567                 /* begin or end " */
568                 inquote = !inquote;
569                 hasquote = 1;
570                 notkeyword = 1;
571             }
572             else if ( c != '\\' )
573             {
574                 if ( !invarexpand && c == '<' )
575                 {
576                     if ( ingrist == 0 ) ingrist = 1;
577                     else ingrist = -1;
578                 }
579                 else if ( !invarexpand && c == '>' )
580                 {
581                     if ( ingrist == 1 ) ingrist = 0;
582                     else ingrist = -1;
583                 }
584                 else if ( c == '$' )
585                 {
586                     if ( ( c = yychar() ) == EOF )
587                     {
588                         *b++ = '$';
589                         break;
590                     }
591                     else if ( c == '(' )
592                     {
593                         /* inside $(), we only care about quotes */
594                         *b++ = '$';
595                         c = '(';
596                         ++invarexpand;
597                     }
598                     else
599                     {
600                         c = '$';
601                         yyprev();
602                     }
603                 }
604                 else if ( c == '@' )
605                 {
606                     if ( ( c = yychar() ) == EOF )
607                     {
608                         *b++ = '@';
609                         break;
610                     }
611                     else if ( c == '(' )
612                     {
613                         /* inside @(), we only care about quotes */
614                         *b++ = '@';
615                         c = '(';
616                         ++invarexpand;
617                     }
618                     else
619                     {
620                         c = '@';
621                         yyprev();
622                     }
623                 }
624                 else if ( invarexpand && c == '(' )
625                 {
626                     ++invarexpand;
627                 }
628                 else if ( invarexpand && c == ')' )
629                 {
630                     --invarexpand;
631                 }
632                 /* normal char */
633                 *b++ = c;
634             }
635             else if ( ( c = yychar() ) != EOF )
636             {
637                 /* \c */
638                 if (c == 'n')
639                     c = '\n';
640                 else if (c == 'r')
641                     c = '\r';
642                 else if (c == 't')
643                     c = '\t';
644                 *b++ = c;
645                 notkeyword = 1;
646             }
647             else
648             {
649                 /* \EOF */
650                 break;
651             }
652 
653             c = yychar();
654         }
655 
656         /* Automatically switch modes after reading the token. */
657         if ( scanmode == SCAN_CONDB )
658             scanmode = SCAN_COND;
659 
660         /* Check obvious errors. */
661         if ( b == buf + sizeof( buf ) )
662         {
663             yyerror( "string too big" );
664             goto eof;
665         }
666 
667         if ( inquote )
668         {
669             yyerror( "unmatched \" in string" );
670             goto eof;
671         }
672 
673         /* We looked ahead a character - back up. */
674         if ( c != EOF )
675             yyprev();
676 
677         /* Scan token table. Do not scan if it is obviously not a keyword or if
678          * it is an alphabetic when were looking for punctuation.
679          */
680 
681         *b = 0;
682         yylval.type = ARG;
683 
684         if ( !notkeyword && !( isalpha( *buf ) && ( scanmode == SCAN_PUNCT || scanmode == SCAN_PARAMS || scanmode == SCAN_ASSIGN ) ) )
685             for ( k = keywords; k->word; ++k )
686                 if ( ( *buf == *k->word ) && !strcmp( k->word, buf ) )
687                 {
688                     yylval.type = k->type;
689                     yylval.keyword = k->word;  /* used by symdump */
690                     break;
691                 }
692 
693         if ( yylval.type == ARG )
694             yylval.string = object_new( buf );
695 
696         if ( scanmode == SCAN_NORMAL && yylval.type == ARG )
697             scanmode = SCAN_XASSIGN;
698 
699         if ( has_token_warning )
700             do_token_warning();
701     }
702 
703     if ( DEBUG_SCAN )
704         out_printf( "scan %s\n", symdump( &yylval ) );
705 
706     return yylval.type;
707 
708 eof:
709     /* We do not reset yylval.file & yylval.line here so unexpected EOF error
710      * messages would include correct error location information.
711      */
712     yylval.type = EOF;
713     return yylval.type;
714 }
715 
716 
/*
 * symdump() - describe a token for diagnostics.
 *
 * Formats the description into a static buffer and returns a pointer to it,
 * so each call overwrites the result of the previous one.
 */
static char * symdump( YYSTYPE * s )
{
    static char buf[ BIGGEST_TOKEN + 20 ];
    int const kind = s->type;

    if ( kind == EOF )
        sprintf( buf, "EOF" );
    else if ( kind == 0 )
        sprintf( buf, "unknown symbol %s", object_str( s->string ) );
    else if ( kind == ARG )
        sprintf( buf, "argument %s", object_str( s->string ) );
    else if ( kind == STRING )
        sprintf( buf, "string \"%s\"", object_str( s->string ) );
    else
        sprintf( buf, "keyword %s", s->keyword );  /* any other type is a keyword */

    return buf;
}
730 
731 
732 /*
733  * Get information about the current file and line, for those epsilon
734  * transitions that produce a parse.
735  */
736 
yyinput_last_read_token(OBJECT ** name,int * line)737 void yyinput_last_read_token( OBJECT * * name, int * line )
738 {
739     /* TODO: Consider whether and when we might want to report where the last
740      * read token ended, e.g. EOF errors inside string literals.
741      */
742     *name = yylval.file;
743     *line = yylval.line;
744 }
745