1 // Copyright (c) 2018-2019, NVIDIA CORPORATION.  All rights reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "prescan.h"
16 #include "characters.h"
17 #include "message.h"
18 #include "preprocessor.h"
19 #include "source.h"
20 #include "token-sequence.h"
21 #include "../common/idioms.h"
22 #include <cstddef>
23 #include <cstring>
24 #include <sstream>
25 #include <utility>
26 #include <vector>
27 
28 namespace Fortran::parser {
29 
// Upper bound on a prescanner's nesting depth.  Prescan() emits an error
// ("too many nested INCLUDE/#include files, possibly circular") and gives up
// when prescannerNesting_ exceeds this value.
static constexpr int maxPrescannerNesting{100};
31 
// Constructs a prescanner that reports to "messages", emits to "cooked",
// and hands directives and macro replacement to "preprocessor".  The
// initial character encoding is taken from cooked.allSources().
Prescanner::Prescanner(Messages &messages, CookedSource &cooked,
    Preprocessor &preprocessor, LanguageFeatureControl lfc)
  : messages_{messages}, cooked_{cooked}, preprocessor_{preprocessor},
    features_{lfc}, encoding_{cooked.allSources().encoding()} {}
36 
// Constructs a nested prescanner from an existing one; FortranInclude()
// uses this to prescan an INCLUDE file.  The message sink, cooked source,
// preprocessor, feature controls, source form, fixed form column limit,
// encoding, leading-ampersand flag, and compiler directive sentinel tables
// are all taken from "that".  The nesting depth is one greater than that of
// "that", which is what Prescan() compares against maxPrescannerNesting.
Prescanner::Prescanner(const Prescanner &that)
  : messages_{that.messages_}, cooked_{that.cooked_},
    preprocessor_{that.preprocessor_}, features_{that.features_},
    inFixedForm_{that.inFixedForm_},
    fixedFormColumnLimit_{that.fixedFormColumnLimit_},
    encoding_{that.encoding_}, prescannerNesting_{that.prescannerNesting_ + 1},
    skipLeadingAmpersand_{that.skipLeadingAmpersand_},
    compilerDirectiveBloomFilter_{that.compilerDirectiveBloomFilter_},
    compilerDirectiveSentinels_{that.compilerDirectiveSentinels_} {}
46 
// True when "ch" is one of the characters ('!', '*', 'C', or 'c') that this
// file accepts as a fixed form comment marker.
static inline constexpr bool IsFixedFormCommentChar(char ch) {
  switch (ch) {
  case '!':
  case '*':
  case 'C':
  case 'c': return true;
  default: return false;
  }
}
50 
NormalizeCompilerDirectiveCommentMarker(TokenSequence & dir)51 static void NormalizeCompilerDirectiveCommentMarker(TokenSequence &dir) {
52   char *p{dir.GetMutableCharData()};
53   char *limit{p + dir.SizeInChars()};
54   for (; p < limit; ++p) {
55     if (*p != ' ') {
56       CHECK(IsFixedFormCommentChar(*p));
57       *p = '!';
58       return;
59     }
60   }
61   DIE("compiler directive all blank");
62 }
63 
Prescan(ProvenanceRange range)64 void Prescanner::Prescan(ProvenanceRange range) {
65   AllSources &allSources{cooked_.allSources()};
66   startProvenance_ = range.start();
67   std::size_t offset{0};
68   const SourceFile *source{allSources.GetSourceFile(startProvenance_, &offset)};
69   CHECK(source != nullptr);
70   start_ = source->content() + offset;
71   limit_ = start_ + range.size();
72   nextLine_ = start_;
73   const bool beganInFixedForm{inFixedForm_};
74   if (prescannerNesting_ > maxPrescannerNesting) {
75     Say(GetProvenance(start_),
76         "too many nested INCLUDE/#include files, possibly circular"_err_en_US);
77     return;
78   }
79   while (nextLine_ < limit_) {
80     Statement();
81   }
82   if (inFixedForm_ != beganInFixedForm) {
83     std::string dir{"!dir$ "};
84     if (beganInFixedForm) {
85       dir += "fixed";
86     } else {
87       dir += "free";
88     }
89     dir += '\n';
90     TokenSequence tokens{dir, allSources.AddCompilerInsertion(dir).start()};
91     tokens.Emit(cooked_);
92   }
93 }
94 
// Prescans the statement that begins at nextLine_.  Comment lines, INCLUDE
// lines, and preprocessor directives are handled and return immediately.
// Compiler directives and ordinary source lines are tokenized (following
// continuation lines), passed through macro replacement, normalized, and
// emitted to the cooked source, followed by a newline unless omitNewline_
// was set while scanning.
void Prescanner::Statement() {
  TokenSequence tokens;
  LineClassification line{ClassifyLine(nextLine_)};
  switch (line.kind) {
  case LineClassification::Kind::Comment:
    nextLine_ += line.payloadOffset;  // advance to '!' or newline
    NextLine();
    return;
  case LineClassification::Kind::IncludeLine:
    // payloadOffset locates the path name's opening quote mark.
    FortranInclude(nextLine_ + line.payloadOffset);
    NextLine();
    return;
  case LineClassification::Kind::ConditionalCompilationDirective:
  case LineClassification::Kind::IncludeDirective:
  case LineClassification::Kind::DefinitionDirective:
  case LineClassification::Kind::PreprocessorDirective:
    // All '#' directives are tokenized and handed to the preprocessor.
    preprocessor_.Directive(TokenizePreprocessorDirective(), this);
    return;
  case LineClassification::Kind::CompilerDirective:
    directiveSentinel_ = line.sentinel;
    CHECK(InCompilerDirective());
    BeginSourceLineAndAdvance();
    if (inFixedForm_) {
      CHECK(IsFixedFormCommentChar(*at_));
    } else {
      // In free form the '!' may be preceded by white space.
      while (*at_ == ' ' || *at_ == '\t') {
        ++at_, ++column_;
      }
      CHECK(*at_ == '!');
    }
    if (directiveSentinel_[0] == '$' && directiveSentinel_[1] == '\0') {
      // OpenMP conditional compilation line.  Remove the sentinel and then
      // treat the line as if it were normal source.
      at_ += 2, column_ += 2;
      if (inFixedForm_) {
        LabelField(tokens);
      } else {
        SkipSpaces();
      }
    } else {
      // Compiler directive.  Emit normalized sentinel.
      EmitChar(tokens, '!');
      ++at_, ++column_;
      // Emit the sentinel as recorded by the line classification rather
      // than as spelled in the source, advancing past the source spelling.
      for (const char *sp{directiveSentinel_}; *sp != '\0';
           ++sp, ++at_, ++column_) {
        EmitChar(tokens, *sp);
      }
      if (*at_ == ' ') {
        EmitChar(tokens, ' ');
        ++at_, ++column_;
      }
      tokens.CloseToken();
    }
    break;
  case LineClassification::Kind::Source:
    BeginSourceLineAndAdvance();
    if (inFixedForm_) {
      LabelField(tokens);
    } else if (skipLeadingAmpersand_) {
      // SkipCommentLine() sets this flag when an include line follows a
      // trailing '&'; consume one leading '&' on this line, if present.
      skipLeadingAmpersand_ = false;
      const char *p{SkipWhiteSpace(at_)};
      if (p < limit_ && *p == '&') {
        column_ += ++p - at_;
        at_ = p;
      }
    } else {
      SkipSpaces();
    }
    break;
  }

  // Tokenize the rest of the statement.
  while (NextToken(tokens)) {
  }

  Provenance newlineProvenance{GetCurrentProvenance()};
  if (std::optional<TokenSequence> preprocessed{
          preprocessor_.MacroReplacement(tokens, *this)}) {
    // Reprocess the preprocessed line.  Append a newline temporarily.
    preprocessed->PutNextTokenChar('\n', newlineProvenance);
    preprocessed->CloseToken();
    const char *ppd{preprocessed->ToCharBlock().begin()};
    LineClassification ppl{ClassifyLine(ppd)};
    preprocessed->RemoveLastToken();  // remove the newline
    switch (ppl.kind) {
    case LineClassification::Kind::Comment: break;
    case LineClassification::Kind::IncludeLine:
      FortranInclude(ppd + ppl.payloadOffset);
      break;
    case LineClassification::Kind::ConditionalCompilationDirective:
    case LineClassification::Kind::IncludeDirective:
    case LineClassification::Kind::DefinitionDirective:
    case LineClassification::Kind::PreprocessorDirective:
      // The result of macro replacement is not treated as a directive; it
      // is reported and then emitted as text.
      Say(preprocessed->GetProvenanceRange(),
          "Preprocessed line resembles a preprocessor directive"_en_US);
      preprocessed->ToLowerCase().Emit(cooked_);
      break;
    case LineClassification::Kind::CompilerDirective:
      if (preprocessed->HasRedundantBlanks()) {
        preprocessed->RemoveRedundantBlanks();
      }
      NormalizeCompilerDirectiveCommentMarker(*preprocessed);
      preprocessed->ToLowerCase();
      SourceFormChange(preprocessed->ToString());
      preprocessed->ClipComment(true /* skip first ! */).Emit(cooked_);
      break;
    case LineClassification::Kind::Source:
      if (inFixedForm_) {
        if (preprocessed->HasBlanks(/*after column*/ 6)) {
          preprocessed->RemoveBlanks(/*after column*/ 6);
        }
      } else {
        if (preprocessed->HasRedundantBlanks()) {
          preprocessed->RemoveRedundantBlanks();
        }
      }
      preprocessed->ToLowerCase().ClipComment().Emit(cooked_);
      break;
    }
  } else {
    // No macro replacement took place; emit the tokens as scanned.
    tokens.ToLowerCase();
    if (line.kind == LineClassification::Kind::CompilerDirective) {
      SourceFormChange(tokens.ToString());
    }
    tokens.Emit(cooked_);
  }
  if (omitNewline_) {
    omitNewline_ = false;
  } else {
    cooked_.Put('\n', newlineProvenance);
  }
  directiveSentinel_ = nullptr;
}
227 
TokenizePreprocessorDirective()228 TokenSequence Prescanner::TokenizePreprocessorDirective() {
229   CHECK(nextLine_ < limit_ && !inPreprocessorDirective_);
230   inPreprocessorDirective_ = true;
231   BeginSourceLineAndAdvance();
232   TokenSequence tokens;
233   while (NextToken(tokens)) {
234   }
235   inPreprocessorDirective_ = false;
236   return tokens;
237 }
238 
NextLine()239 void Prescanner::NextLine() {
240   void *vstart{static_cast<void *>(const_cast<char *>(nextLine_))};
241   void *v{std::memchr(vstart, '\n', limit_ - nextLine_)};
242   if (v == nullptr) {
243     nextLine_ = limit_;
244   } else {
245     const char *nl{const_cast<const char *>(static_cast<char *>(v))};
246     nextLine_ = nl + 1;
247   }
248 }
249 
LabelField(TokenSequence & token,int outCol)250 void Prescanner::LabelField(TokenSequence &token, int outCol) {
251   for (; *at_ != '\n' && column_ <= 6; ++at_) {
252     if (*at_ == '\t') {
253       ++at_;
254       column_ = 7;
255       break;
256     }
257     if (*at_ != ' ' &&
258         !(*at_ == '0' && column_ == 6)) {  // '0' in column 6 becomes space
259       EmitChar(token, *at_);
260       ++outCol;
261     }
262     ++column_;
263   }
264   if (outCol > 1) {
265     token.CloseToken();
266   }
267   if (outCol < 7) {
268     if (outCol == 1) {
269       token.Put("      ", 6, sixSpaceProvenance_.start());
270     } else {
271       for (; outCol < 7; ++outCol) {
272         token.PutNextTokenChar(' ', spaceProvenance_);
273       }
274       token.CloseToken();
275     }
276   }
277 }
278 
SkipToEndOfLine()279 void Prescanner::SkipToEndOfLine() {
280   while (*at_ != '\n') {
281     ++at_, ++column_;
282   }
283 }
284 
MustSkipToEndOfLine() const285 bool Prescanner::MustSkipToEndOfLine() const {
286   if (inFixedForm_ && column_ > fixedFormColumnLimit_ && !tabInCurrentLine_) {
287     return true;  // skip over ignored columns in right margin (73:80)
288   } else if (*at_ == '!' && !inCharLiteral_) {
289     return true;  // inline comment goes to end of source line
290   } else {
291     return false;
292   }
293 }
294 
// Advances at_ by one character (the current character must not be a
// newline), then skips over anything that should not be seen at the new
// position: UTF-8 byte order marks, C-style comments when in a preprocessor
// directive, and otherwise ignored trailing text together with any line
// continuations.
void Prescanner::NextChar() {
  CHECK(*at_ != '\n');
  ++at_, ++column_;
  while (at_[0] == '\xef' && at_[1] == '\xbb' && at_[2] == '\xbf') {
    // UTF-8 byte order mark - treat this file as UTF-8
    at_ += 3;
    encoding_ = Encoding::UTF_8;
  }
  if (inPreprocessorDirective_) {
    SkipCComments();
  } else {
    bool mightNeedSpace{false};
    if (MustSkipToEndOfLine()) {
      SkipToEndOfLine();
    } else {
      // Only a line that ends exactly here (with no skipped text) is
      // flagged to Continuation() as possibly needing an inserted space.
      mightNeedSpace = *at_ == '\n';
    }
    // Follow as many continuation lines as apply, skipping ignored text at
    // the start of each.
    for (; Continuation(mightNeedSpace); mightNeedSpace = false) {
      if (MustSkipToEndOfLine()) {
        SkipToEndOfLine();
      }
    }
    if (*at_ == '\t') {
      tabInCurrentLine_ = true;
    }
  }
}
322 
// Skips any sequence of complete C-style /* comments */ at the current
// position and, within a preprocessor directive, any backslash-newline line
// splices.  Stops at an unterminated comment, leaving at_ on it.
void Prescanner::SkipCComments() {
  while (true) {
    if (IsCComment(at_)) {
      if (const char *after{SkipCComment(at_)}) {
        column_ += after - at_;
        // May have skipped over one or more newlines; relocate the start of
        // the next line.
        nextLine_ = at_ = after;
        NextLine();
      } else {
        // Don't emit any messages about unclosed C-style comments, because
        // the sequence /* can appear legally in a FORMAT statement.  There's
        // no ambiguity, since the sequence */ cannot appear legally.
        break;
      }
    } else if (inPreprocessorDirective_ && at_[0] == '\\' && at_ + 2 < limit_ &&
        at_[1] == '\n' && nextLine_ < limit_) {
      // Backslash at end of line: continue the directive on the next line.
      BeginSourceLineAndAdvance();
    } else {
      break;
    }
  }
}
346 
SkipSpaces()347 void Prescanner::SkipSpaces() {
348   while (*at_ == ' ' || *at_ == '\t') {
349     NextChar();
350   }
351   insertASpace_ = false;
352 }
353 
SkipWhiteSpace(const char * p)354 const char *Prescanner::SkipWhiteSpace(const char *p) {
355   while (*p == ' ' || *p == '\t') {
356     ++p;
357   }
358   return p;
359 }
360 
SkipWhiteSpaceAndCComments(const char * p) const361 const char *Prescanner::SkipWhiteSpaceAndCComments(const char *p) const {
362   while (true) {
363     if (*p == ' ' || *p == '\t') {
364       ++p;
365     } else if (IsCComment(p)) {
366       if (const char *after{SkipCComment(p)}) {
367         p = after;
368       } else {
369         break;
370       }
371     } else {
372       break;
373     }
374   }
375   return p;
376 }
377 
SkipCComment(const char * p) const378 const char *Prescanner::SkipCComment(const char *p) const {
379   char star{' '}, slash{' '};
380   p += 2;
381   while (star != '*' || slash != '/') {
382     if (p >= limit_) {
383       return nullptr;  // signifies an unterminated comment
384     }
385     star = slash;
386     slash = *p++;
387   }
388   return p;
389 }
390 
// Scans one token at the current position and appends it to "tokens".
// Returns false when the end of the (possibly continued) line has been
// reached, and true after a token has been appended and closed.
bool Prescanner::NextToken(TokenSequence &tokens) {
  CHECK(at_ >= start_ && at_ < limit_);
  if (InFixedFormSource()) {
    SkipSpaces();
  } else {
    if (*at_ == '/' && IsCComment(at_)) {
      // Recognize and skip over classic C style /*comments*/ when
      // outside a character literal.
      if (features_.ShouldWarn(LanguageFeature::ClassicCComments)) {
        Say(GetProvenance(at_), "nonstandard usage: C-style comment"_en_US);
      }
      SkipCComments();
    }
    if (*at_ == ' ' || *at_ == '\t') {
      // Compress free-form white space into a single space character.
      const auto theSpace{at_};
      char previous{at_ <= start_ ? ' ' : at_[-1]};
      NextChar();
      SkipSpaces();
      if (*at_ == '\n') {
        // Discard white space at the end of a line.
      } else if (!inPreprocessorDirective_ &&
          (previous == '(' || *at_ == '(' || *at_ == ')')) {
        // Discard white space before/after '(' and before ')', unless in a
        // preprocessor directive.  This helps yield space-free contiguous
        // names for generic interfaces like OPERATOR( + ) and
        // READ ( UNFORMATTED ), without misinterpreting #define f (notAnArg).
        // This has the effect of silently ignoring the illegal spaces in
        // the array constructor ( /1,2/ ) but that seems benign; it's
        // hard to avoid that while still removing spaces from OPERATOR( / )
        // and OPERATOR( // ).
      } else {
        // Preserve the squashed white space as a single space character.
        tokens.PutNextTokenChar(' ', GetProvenance(theSpace));
        tokens.CloseToken();
        return true;
      }
    }
  }
  if (insertASpace_) {
    // A continuation line requested a space at the join; it becomes the
    // first character of whatever token is appended next.
    tokens.PutNextTokenChar(' ', spaceProvenance_);
    insertASpace_ = false;
  }
  if (*at_ == '\n') {
    return false;
  }
  const char *start{at_};
  if (*at_ == '\'' || *at_ == '"') {
    QuotedCharacterLiteral(tokens, start);
    preventHollerith_ = false;
  } else if (IsDecimalDigit(*at_)) {
    // A digit string: possibly a Hollerith count, a real literal, a hex
    // constant in a directive, or a kind prefix on a character literal.
    int n{0}, digits{0};
    static constexpr int maxHollerith{256 /*lines*/ * (132 - 6 /*columns*/)};
    do {
      if (n < maxHollerith) {  // stop accumulating once the value is too big
        n = 10 * n + DecimalDigitValue(*at_);
      }
      EmitCharAndAdvance(tokens, *at_);
      ++digits;
      if (InFixedFormSource()) {
        SkipSpaces();
      }
    } while (IsDecimalDigit(*at_));
    if ((*at_ == 'h' || *at_ == 'H') && n > 0 && n < maxHollerith &&
        !preventHollerith_) {
      Hollerith(tokens, n, start);
    } else if (*at_ == '.') {
      while (IsDecimalDigit(EmitCharAndAdvance(tokens, *at_))) {
      }
      ExponentAndKind(tokens);
    } else if (ExponentAndKind(tokens)) {
    } else if (digits == 1 && n == 0 && (*at_ == 'x' || *at_ == 'X') &&
        inPreprocessorDirective_) {
      // 0x... hexadecimal constant, recognized only in directives.
      do {
        EmitCharAndAdvance(tokens, *at_);
      } while (IsHexadecimalDigit(*at_));
    } else if (IsLetter(*at_)) {
      // Handles FORMAT(3I9HHOLLERITH) by skipping over the first I so that
      // we don't misrecognize I9HOLLERITH as an identifier in the next case.
      EmitCharAndAdvance(tokens, *at_);
    } else if (at_[0] == '_' && (at_[1] == '\'' || at_[1] == '"')) {
      // Digits, underscore, and a quote mark: the digits prefix a character
      // literal, which becomes part of the same token.
      EmitCharAndAdvance(tokens, *at_);
      QuotedCharacterLiteral(tokens, start);
    }
    preventHollerith_ = false;
  } else if (*at_ == '.') {
    char nch{EmitCharAndAdvance(tokens, '.')};
    if (!inPreprocessorDirective_ && IsDecimalDigit(nch)) {
      while (IsDecimalDigit(EmitCharAndAdvance(tokens, *at_))) {
      }
      ExponentAndKind(tokens);
    } else if (nch == '.' && EmitCharAndAdvance(tokens, '.') == '.') {
      EmitCharAndAdvance(tokens, '.');  // variadic macro definition ellipsis
    }
    preventHollerith_ = false;
  } else if (IsLegalInIdentifier(*at_)) {
    do {
    } while (IsLegalInIdentifier(EmitCharAndAdvance(tokens, *at_)));
    if (*at_ == '\'' || *at_ == '"') {
      // An identifier immediately followed by a quote mark prefixes a
      // character literal; keep them in one token.
      QuotedCharacterLiteral(tokens, start);
      preventHollerith_ = false;
    } else {
      // Subtle: Don't misrecognize labeled DO statement label as Hollerith
      // when the loop control variable starts with 'H'.
      preventHollerith_ = true;
    }
  } else if (*at_ == '*') {
    if (EmitCharAndAdvance(tokens, '*') == '*') {
      EmitCharAndAdvance(tokens, '*');
    } else {
      // Subtle ambiguity:
      //  CHARACTER*2H     declares H because *2 is a kind specifier
      //  DATAC/N*2H  /    is repeated Hollerith
      preventHollerith_ = !slashInCurrentLine_;
    }
  } else {
    char ch{*at_};
    // Track bracket nesting; the continuation line recognizers consult
    // delimiterNesting_.
    if (ch == '(' || ch == '[') {
      ++delimiterNesting_;
    } else if ((ch == ')' || ch == ']') && delimiterNesting_ > 0) {
      --delimiterNesting_;
    }
    char nch{EmitCharAndAdvance(tokens, ch)};
    preventHollerith_ = false;
    if ((nch == '=' &&
            (ch == '<' || ch == '>' || ch == '/' || ch == '=' || ch == '!')) ||
        (ch == nch &&
            (ch == '/' || ch == ':' || ch == '*' || ch == '#' || ch == '&' ||
                ch == '|' || ch == '<' || ch == '>')) ||
        (ch == '=' && nch == '>')) {
      // token comprises two characters
      EmitCharAndAdvance(tokens, nch);
    } else if (ch == '/') {
      slashInCurrentLine_ = true;
    }
  }
  tokens.CloseToken();
  return true;
}
530 
ExponentAndKind(TokenSequence & tokens)531 bool Prescanner::ExponentAndKind(TokenSequence &tokens) {
532   char ed{ToLowerCaseLetter(*at_)};
533   if (ed != 'e' && ed != 'd') {
534     return false;
535   }
536   EmitCharAndAdvance(tokens, ed);
537   if (*at_ == '+' || *at_ == '-') {
538     EmitCharAndAdvance(tokens, *at_);
539   }
540   while (IsDecimalDigit(*at_)) {
541     EmitCharAndAdvance(tokens, *at_);
542   }
543   if (*at_ == '_') {
544     while (IsLegalInIdentifier(EmitCharAndAdvance(tokens, *at_))) {
545     }
546   }
547   return true;
548 }
549 
// Scans a character literal whose opening quote mark is at at_ and appends
// its characters, quote marks included, to "tokens".  "start" marks the
// beginning of the enclosing token and is used only for the provenance
// range of the "Incomplete character literal" message.
void Prescanner::QuotedCharacterLiteral(
    TokenSequence &tokens, const char *start) {
  char quote{*at_};
  const char *end{at_ + 1};
  inCharLiteral_ = true;
  const auto emit{[&](char ch) { EmitChar(tokens, ch); }};
  const auto insert{[&](char ch) { EmitInsertedChar(tokens, ch); }};
  bool isEscaped{false};
  bool escapesEnabled{features_.IsEnabled(LanguageFeature::BackslashEscapes)};
  while (true) {
    if (*at_ == '\\') {
      if (escapesEnabled) {
        // Toggle, so that a doubled backslash does not escape what follows.
        isEscaped = !isEscaped;
      } else {
        // The parser always processes escape sequences, so don't confuse it
        // when escapes are disabled.
        insert('\\');
      }
    } else {
      isEscaped = false;
    }
    EmitQuotedChar(static_cast<unsigned char>(*at_), emit, insert, false,
        Encoding::LATIN_1);
    // Pad a short fixed form line with blanks, as needed.
    while (PadOutCharacterLiteral(tokens)) {
    }
    if (*at_ == '\n') {
      // The line ended before the closing quote mark was seen.
      if (!inPreprocessorDirective_) {
        Say(GetProvenanceRange(start, end),
            "Incomplete character literal"_err_en_US);
      }
      break;
    }
    end = at_ + 1;
    NextChar();
    if (*at_ == quote && !isEscaped) {
      // A doubled unescaped quote mark becomes a single instance of that
      // quote character in the literal (later).  There can be spaces between
      // the quotes in fixed form source.
      EmitChar(tokens, quote);
      inCharLiteral_ = false;  // for cases like print *, '...'!comment
      NextChar();
      if (InFixedFormSource()) {
        SkipSpaces();
      }
      if (*at_ != quote) {
        break;  // that was the closing quote mark
      }
      inCharLiteral_ = true;
    }
  }
  inCharLiteral_ = false;
}
602 
// Scans a Hollerith literal.  On entry at_ points at the 'h' or 'H' that
// follows the count, whose digits the caller has already emitted.  Emits
// 'H' and then up to "count" characters, each re-encoded as UTF-8.
// "start" marks the beginning of the literal for message provenance.
void Prescanner::Hollerith(
    TokenSequence &tokens, int count, const char *start) {
  inCharLiteral_ = true;
  CHECK(*at_ == 'h' || *at_ == 'H');
  EmitChar(tokens, 'H');
  while (count-- > 0) {
    if (PadOutCharacterLiteral(tokens)) {
      // A padding blank was emitted and counts as one character.
    } else if (*at_ == '\n') {
      Say(GetProvenanceRange(start, at_),
          "Possible truncated Hollerith literal"_en_US);
      break;
    } else {
      NextChar();
      // Each multi-byte character encoding counts as a single character.
      // No escape sequences are recognized.
      // Hollerith is always emitted to the cooked character
      // stream in UTF-8.
      DecodedCharacter decoded{DecodeCharacter(
          encoding_, at_, static_cast<std::size_t>(limit_ - at_), false)};
      if (decoded.bytes > 0) {
        EncodedCharacter utf8{
            EncodeCharacter<Encoding::UTF_8>(decoded.codepoint)};
        for (int j{0}; j < utf8.bytes; ++j) {
          EmitChar(tokens, utf8.buffer[j]);
        }
        // Leave at_ on the last byte of the character just consumed.
        at_ += decoded.bytes - 1;
      } else {
        Say(GetProvenanceRange(start, at_),
            "Bad character in Hollerith literal"_err_en_US);
        break;
      }
    }
  }
  if (*at_ != '\n') {
    NextChar();
  }
  inCharLiteral_ = false;
}
641 
// In fixed form, source card images must be processed as if they were at
// least 72 columns wide, at least in character literal contexts.
// When the character after at_ is the newline of a fixed form line without
// tabs, this either emits one padding blank (returning true) or, once
// the column limit has been reached, moves to a continuation line if there
// is one.  Returns false when no blank was emitted.
bool Prescanner::PadOutCharacterLiteral(TokenSequence &tokens) {
  while (inFixedForm_ && !tabInCurrentLine_ && at_[1] == '\n') {
    if (column_ < fixedFormColumnLimit_) {
      tokens.PutNextTokenChar(' ', spaceProvenance_);
      ++column_;
      return true;
    }
    if (!FixedFormContinuation(false /*no need to insert space*/) ||
        tabInCurrentLine_) {
      return false;
    }
    CHECK(column_ == 7);
    --at_;  // point to column 6 of continuation line
    column_ = 6;
  }
  return false;
}
661 
IsFixedFormCommentLine(const char * start) const662 bool Prescanner::IsFixedFormCommentLine(const char *start) const {
663   const char *p{start};
664   if (IsFixedFormCommentChar(*p) || *p == '%' ||  // VAX %list, %eject, &c.
665       ((*p == 'D' || *p == 'd') &&
666           !features_.IsEnabled(LanguageFeature::OldDebugLines))) {
667     return true;
668   }
669   bool anyTabs{false};
670   while (true) {
671     if (*p == ' ') {
672       ++p;
673     } else if (*p == '\t') {
674       anyTabs = true;
675       ++p;
676     } else if (*p == '0' && !anyTabs && p == start + 5) {
677       ++p;  // 0 in column 6 must treated as a space
678     } else {
679       break;
680     }
681   }
682   if (!anyTabs && p >= start + fixedFormColumnLimit_) {
683     return true;
684   }
685   if (*p == '!' && !inCharLiteral_ && (anyTabs || p != start + 5)) {
686     return true;
687   }
688   return *p == '\n';
689 }
690 
IsFreeFormComment(const char * p) const691 const char *Prescanner::IsFreeFormComment(const char *p) const {
692   p = SkipWhiteSpaceAndCComments(p);
693   if (*p == '!' || *p == '\n') {
694     return p;
695   } else {
696     return nullptr;
697   }
698 }
699 
IsIncludeLine(const char * start) const700 std::optional<std::size_t> Prescanner::IsIncludeLine(const char *start) const {
701   const char *p{SkipWhiteSpace(start)};
702   for (char ch : "include"s) {
703     if (ToLowerCaseLetter(*p++) != ch) {
704       return std::nullopt;
705     }
706   }
707   p = SkipWhiteSpace(p);
708   if (*p == '"' || *p == '\'') {
709     return {p - start};
710   }
711   return std::nullopt;
712 }
713 
// Processes a Fortran INCLUDE line.  "firstQuote" points at (or before) the
// quote mark that opens the path name.  The path is extracted, the file is
// opened with the directory of the current source file temporarily pushed
// onto the search path, and a nonempty file is prescanned by a nested
// Prescanner.  Problems are reported through Say().
void Prescanner::FortranInclude(const char *firstQuote) {
  const char *p{firstQuote};
  while (*p != '"' && *p != '\'') {
    ++p;
  }
  char quote{*p};
  std::string path;
  for (++p; *p != '\n'; ++p) {
    if (*p == quote) {
      if (p[1] != quote) {
        break;  // closing quote mark
      }
      ++p;  // a doubled quote mark stands for one in the path
    }
    path += *p;
  }
  if (*p != quote) {
    // Reached the end of the line without finding the closing quote mark.
    Say(GetProvenanceRange(firstQuote, p),
        "malformed path name string"_err_en_US);
    return;
  }
  p = SkipWhiteSpace(p + 1);
  if (*p != '\n' && *p != '!') {
    // Anything other than a comment after the path draws a message, but
    // the file is still included.
    const char *garbage{p};
    for (; *p != '\n' && *p != '!'; ++p) {
    }
    Say(GetProvenanceRange(garbage, p),
        "excess characters after path name"_en_US);
  }
  std::stringstream error;
  Provenance provenance{GetProvenance(nextLine_)};
  AllSources &allSources{cooked_.allSources()};
  const SourceFile *currentFile{allSources.GetSourceFile(provenance)};
  if (currentFile != nullptr) {
    // Search the including file's own directory while opening.
    allSources.PushSearchPathDirectory(DirectoryName(currentFile->path()));
  }
  const SourceFile *included{allSources.Open(path, &error)};
  if (currentFile != nullptr) {
    allSources.PopSearchPathDirectory();
  }
  if (included == nullptr) {
    Say(provenance, "INCLUDE: %s"_err_en_US, error.str());
  } else if (included->bytes() > 0) {
    ProvenanceRange includeLineRange{
        provenance, static_cast<std::size_t>(p - nextLine_)};
    ProvenanceRange fileRange{
        allSources.AddIncludedFile(*included, includeLineRange)};
    // Prescan the file with a nested copy of this prescanner, using the
    // included file's own encoding.
    Prescanner{*this}.set_encoding(included->encoding()).Prescan(fileRange);
  }
}
764 
IsPreprocessorDirectiveLine(const char * start) const765 const char *Prescanner::IsPreprocessorDirectiveLine(const char *start) const {
766   const char *p{start};
767   for (; *p == ' '; ++p) {
768   }
769   if (*p == '#') {
770     if (inFixedForm_ && p == start + 5) {
771       return nullptr;
772     }
773   } else {
774     p = SkipWhiteSpace(p);
775     if (*p != '#') {
776       return nullptr;
777     }
778   }
779   return SkipWhiteSpace(p + 1);
780 }
781 
IsNextLinePreprocessorDirective() const782 bool Prescanner::IsNextLinePreprocessorDirective() const {
783   return IsPreprocessorDirectiveLine(nextLine_) != nullptr;
784 }
785 
SkipCommentLine(bool afterAmpersand)786 bool Prescanner::SkipCommentLine(bool afterAmpersand) {
787   if (nextLine_ >= limit_) {
788     if (afterAmpersand && prescannerNesting_ > 0) {
789       // A continuation marker at the end of the last line in an
790       // include file inhibits the newline for that line.
791       SkipToEndOfLine();
792       omitNewline_ = true;
793     }
794     return false;
795   }
796   auto lineClass{ClassifyLine(nextLine_)};
797   if (lineClass.kind == LineClassification::Kind::Comment) {
798     NextLine();
799     return true;
800   } else if (inPreprocessorDirective_) {
801     return false;
802   } else if (lineClass.kind ==
803           LineClassification::Kind::ConditionalCompilationDirective ||
804       lineClass.kind == LineClassification::Kind::PreprocessorDirective) {
805     // Allow conditional compilation directives (e.g., #ifdef) to affect
806     // continuation lines.
807     // Allow other preprocessor directives, too, except #include
808     // (when it does not follow '&'), #define, and #undef (because
809     // they cannot be allowed to affect preceding text on a
810     // continued line).
811     preprocessor_.Directive(TokenizePreprocessorDirective(), this);
812     return true;
813   } else if (afterAmpersand &&
814       (lineClass.kind == LineClassification::Kind::IncludeDirective ||
815           lineClass.kind == LineClassification::Kind::IncludeLine)) {
816     SkipToEndOfLine();
817     omitNewline_ = true;
818     skipLeadingAmpersand_ = true;
819     return false;
820   } else {
821     return false;
822   }
823 }
824 
// Determines whether the line at nextLine_ continues the current fixed
// form statement or compiler directive.  Returns a pointer to the position
// in that line at which scanning should resume, or nullptr when the line is
// not a continuation line.  Clears tabInCurrentLine_ (setting it again for
// the tab form) and may set insertASpace_ when "mightNeedSpace" is true.
const char *Prescanner::FixedFormContinuationLine(bool mightNeedSpace) {
  if (nextLine_ >= limit_) {
    return nullptr;
  }
  tabInCurrentLine_ = false;
  char col1{*nextLine_};
  if (InCompilerDirective()) {
    // Must be a continued compiler directive.
    if (!IsFixedFormCommentChar(col1)) {
      return nullptr;
    }
    // Columns 2 through 5 must match the sentinel, ignoring case ...
    int j{1};
    for (; j < 5; ++j) {
      char ch{directiveSentinel_[j - 1]};
      if (ch == '\0') {
        break;
      }
      if (ch != ToLowerCaseLetter(nextLine_[j])) {
        return nullptr;
      }
    }
    // ... and any columns left over after a short sentinel must be blank.
    for (; j < 5; ++j) {
      if (nextLine_[j] != ' ') {
        return nullptr;
      }
    }
    char col6{nextLine_[5]};
    if (col6 != '\n' && col6 != '\t' && col6 != ' ' && col6 != '0') {
      if (nextLine_[6] != ' ' && mightNeedSpace) {
        insertASpace_ = true;
      }
      return nextLine_ + 6;
    }
    return nullptr;
  } else {
    // Normal case: not in a compiler directive.
    if (col1 == '&' &&
        features_.IsEnabled(
            LanguageFeature::FixedFormContinuationWithColumn1Ampersand)) {
      // Extension: '&' as continuation marker
      if (features_.ShouldWarn(
              LanguageFeature::FixedFormContinuationWithColumn1Ampersand)) {
        Say(GetProvenance(nextLine_), "nonstandard usage"_en_US);
      }
      return nextLine_ + 1;
    }
    if (col1 == '\t' && nextLine_[1] >= '1' && nextLine_[1] <= '9') {
      tabInCurrentLine_ = true;
      return nextLine_ + 2;  // VAX extension
    }
    // Standard form: columns 1 through 5 blank and a continuation character
    // (anything but blank, tab, '0', or newline) in column 6.
    if (col1 == ' ' && nextLine_[1] == ' ' && nextLine_[2] == ' ' &&
        nextLine_[3] == ' ' && nextLine_[4] == ' ') {
      char col6{nextLine_[5]};
      if (col6 != '\n' && col6 != '\t' && col6 != ' ' && col6 != '0') {
        return nextLine_ + 6;
      }
    }
    // Inside unclosed parentheses or brackets, any line that does not
    // begin with a comment character is taken to continue the statement,
    // starting from its first column.
    if (delimiterNesting_ > 0) {
      if (!IsFixedFormCommentChar(col1)) {
        return nextLine_;
      }
    }
  }
  return nullptr;  // not a continuation line
}
890 
// Determines whether the line at nextLine_ continues the current free form
// statement or compiler directive.  "ampersand" is true when the current
// line ended with '&'.  Returns a pointer to the position in the next line
// at which scanning should resume, or nullptr when that line is not a
// continuation line.  May set insertASpace_.
const char *Prescanner::FreeFormContinuationLine(bool ampersand) {
  const char *p{nextLine_};
  if (p >= limit_) {
    return nullptr;
  }
  p = SkipWhiteSpace(p);
  if (InCompilerDirective()) {
    // A continued directive must repeat '!' and the sentinel (in any case).
    if (*p++ != '!') {
      return nullptr;
    }
    for (const char *s{directiveSentinel_}; *s != '\0'; ++p, ++s) {
      if (*s != ToLowerCaseLetter(*p)) {
        return nullptr;
      }
    }
    p = SkipWhiteSpace(p);
    if (*p == '&') {
      if (!ampersand) {
        insertASpace_ = true;
      }
      return p + 1;
    } else if (ampersand) {
      return p;
    } else {
      return nullptr;
    }
  } else {
    if (*p == '&') {
      return p + 1;  // resume just after the leading '&'
    } else if (*p == '!' || *p == '\n' || *p == '#') {
      return nullptr;
    } else if (ampersand || delimiterNesting_ > 0) {
      // No leading '&': resume on the white space character just before
      // the text if there is one; otherwise request an inserted space.
      if (p > nextLine_) {
        --p;
      } else {
        insertASpace_ = true;
      }
      return p;
    } else {
      return nullptr;
    }
  }
}
934 
FixedFormContinuation(bool mightNeedSpace)935 bool Prescanner::FixedFormContinuation(bool mightNeedSpace) {
936   // N.B. We accept '&' as a continuation indicator in fixed form, too,
937   // but not in a character literal.
938   if (*at_ == '&' && inCharLiteral_) {
939     return false;
940   }
941   do {
942     if (const char *cont{FixedFormContinuationLine(mightNeedSpace)}) {
943       BeginSourceLine(cont);
944       column_ = 7;
945       NextLine();
946       return true;
947     }
948   } while (SkipCommentLine(false /* not after ampersand */));
949   return false;
950 }
951 
FreeFormContinuation()952 bool Prescanner::FreeFormContinuation() {
953   const char *p{at_};
954   bool ampersand{*p == '&'};
955   if (ampersand) {
956     p = SkipWhiteSpace(p + 1);
957   }
958   if (*p != '\n') {
959     if (inCharLiteral_) {
960       return false;
961     } else if (*p != '!' &&
962         features_.ShouldWarn(LanguageFeature::CruftAfterAmpersand)) {
963       Say(GetProvenance(p), "missing ! before comment after &"_en_US);
964     }
965   }
966   do {
967     if (const char *cont{FreeFormContinuationLine(ampersand)}) {
968       BeginSourceLine(cont);
969       NextLine();
970       return true;
971     }
972   } while (SkipCommentLine(ampersand));
973   return false;
974 }
975 
Continuation(bool mightNeedFixedFormSpace)976 bool Prescanner::Continuation(bool mightNeedFixedFormSpace) {
977   if (*at_ == '\n' || *at_ == '&') {
978     if (inFixedForm_) {
979       return FixedFormContinuation(mightNeedFixedFormSpace);
980     } else {
981       return FreeFormContinuation();
982     }
983   } else {
984     return false;
985   }
986 }
987 
988 std::optional<Prescanner::LineClassification>
IsFixedFormCompilerDirectiveLine(const char * start) const989 Prescanner::IsFixedFormCompilerDirectiveLine(const char *start) const {
990   const char *p{start};
991   char col1{*p++};
992   if (!IsFixedFormCommentChar(col1)) {
993     return std::nullopt;
994   }
995   char sentinel[5], *sp{sentinel};
996   int column{2};
997   for (; column < 6; ++column, ++p) {
998     if (*p != ' ') {
999       if (*p == '\n' || *p == '\t') {
1000         break;
1001       }
1002       if (sp == sentinel + 1 && sentinel[0] == '$' && IsDecimalDigit(*p)) {
1003         // OpenMP conditional compilation line: leave the label alone
1004         break;
1005       }
1006       *sp++ = ToLowerCaseLetter(*p);
1007     }
1008   }
1009   if (column == 6) {
1010     if (*p == ' ' || *p == '\t' || *p == '0') {
1011       ++p;
1012     } else {
1013       // This is a Continuation line, not an initial directive line.
1014       return std::nullopt;
1015     }
1016   }
1017   if (sp == sentinel) {
1018     return std::nullopt;
1019   }
1020   *sp = '\0';
1021   if (const char *ss{IsCompilerDirectiveSentinel(sentinel)}) {
1022     std::size_t payloadOffset = p - start;
1023     return {LineClassification{
1024         LineClassification::Kind::CompilerDirective, payloadOffset, ss}};
1025   }
1026   return std::nullopt;
1027 }
1028 
1029 std::optional<Prescanner::LineClassification>
IsFreeFormCompilerDirectiveLine(const char * start) const1030 Prescanner::IsFreeFormCompilerDirectiveLine(const char *start) const {
1031   char sentinel[8];
1032   const char *p{SkipWhiteSpace(start)};
1033   if (*p++ != '!') {
1034     return std::nullopt;
1035   }
1036   for (std::size_t j{0}; j + 1 < sizeof sentinel; ++p, ++j) {
1037     if (*p == '\n') {
1038       break;
1039     }
1040     if (*p == ' ' || *p == '\t' || *p == '&') {
1041       if (j == 0) {
1042         break;
1043       }
1044       sentinel[j] = '\0';
1045       p = SkipWhiteSpace(p + 1);
1046       if (*p == '!') {
1047         break;
1048       }
1049       if (const char *sp{IsCompilerDirectiveSentinel(sentinel)}) {
1050         std::size_t offset = p - start;
1051         return {LineClassification{
1052             LineClassification::Kind::CompilerDirective, offset, sp}};
1053       }
1054       break;
1055     }
1056     sentinel[j] = ToLowerCaseLetter(*p);
1057   }
1058   return std::nullopt;
1059 }
1060 
AddCompilerDirectiveSentinel(const std::string & dir)1061 Prescanner &Prescanner::AddCompilerDirectiveSentinel(const std::string &dir) {
1062   std::uint64_t packed{0};
1063   for (char ch : dir) {
1064     packed = (packed << 8) | (ToLowerCaseLetter(ch) & 0xff);
1065   }
1066   compilerDirectiveBloomFilter_.set(packed % prime1);
1067   compilerDirectiveBloomFilter_.set(packed % prime2);
1068   compilerDirectiveSentinels_.insert(dir);
1069   return *this;
1070 }
1071 
IsCompilerDirectiveSentinel(const char * sentinel) const1072 const char *Prescanner::IsCompilerDirectiveSentinel(
1073     const char *sentinel) const {
1074   std::uint64_t packed{0};
1075   std::size_t n{0};
1076   for (; sentinel[n] != '\0'; ++n) {
1077     packed = (packed << 8) | (sentinel[n] & 0xff);
1078   }
1079   if (n == 0 || !compilerDirectiveBloomFilter_.test(packed % prime1) ||
1080       !compilerDirectiveBloomFilter_.test(packed % prime2)) {
1081     return nullptr;
1082   }
1083   const auto iter{compilerDirectiveSentinels_.find(std::string(sentinel, n))};
1084   return iter == compilerDirectiveSentinels_.end() ? nullptr : iter->data();
1085 }
1086 
ClassifyLine(const char * start) const1087 Prescanner::LineClassification Prescanner::ClassifyLine(
1088     const char *start) const {
1089   if (inFixedForm_) {
1090     if (std::optional<LineClassification> lc{
1091             IsFixedFormCompilerDirectiveLine(start)}) {
1092       return std::move(*lc);
1093     }
1094     if (IsFixedFormCommentLine(start)) {
1095       return {LineClassification::Kind::Comment};
1096     }
1097   } else {
1098     if (std::optional<LineClassification> lc{
1099             IsFreeFormCompilerDirectiveLine(start)}) {
1100       return std::move(*lc);
1101     }
1102     if (const char *bang{IsFreeFormComment(start)}) {
1103       return {LineClassification::Kind::Comment,
1104           static_cast<std::size_t>(bang - start)};
1105     }
1106   }
1107   if (std::optional<std::size_t> quoteOffset{IsIncludeLine(start)}) {
1108     return {LineClassification::Kind::IncludeLine, *quoteOffset};
1109   }
1110   if (const char *dir{IsPreprocessorDirectiveLine(start)}) {
1111     if (std::memcmp(dir, "if", 2) == 0 || std::memcmp(dir, "elif", 4) == 0 ||
1112         std::memcmp(dir, "else", 4) == 0 || std::memcmp(dir, "endif", 5) == 0) {
1113       return {LineClassification::Kind::ConditionalCompilationDirective};
1114     } else if (std::memcmp(dir, "include", 7) == 0) {
1115       return {LineClassification::Kind::IncludeDirective};
1116     } else if (std::memcmp(dir, "define", 6) == 0 ||
1117         std::memcmp(dir, "undef", 5) == 0) {
1118       return {LineClassification::Kind::DefinitionDirective};
1119     } else {
1120       return {LineClassification::Kind::PreprocessorDirective};
1121     }
1122   }
1123   return {LineClassification::Kind::Source};
1124 }
1125 
SourceFormChange(std::string && dir)1126 void Prescanner::SourceFormChange(std::string &&dir) {
1127   if (dir == "!dir$ free") {
1128     inFixedForm_ = false;
1129   } else if (dir == "!dir$ fixed") {
1130     inFixedForm_ = true;
1131   }
1132 }
1133 }
1134