1 //===-- lib/Parser/prescan.cpp --------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "prescan.h"
10 #include "preprocessor.h"
11 #include "token-sequence.h"
12 #include "flang/Common/idioms.h"
13 #include "flang/Parser/characters.h"
14 #include "flang/Parser/message.h"
15 #include "flang/Parser/source.h"
16 #include "llvm/Support/raw_ostream.h"
17 #include <cstddef>
18 #include <cstring>
19 #include <utility>
20 #include <vector>
21 
22 namespace Fortran::parser {
23 
24 using common::LanguageFeature;
25 
// Upper bound on prescannerNesting_; Prescan() reports an error about nested
// INCLUDE/#include files and stops once the nesting depth exceeds this value.
static constexpr int maxPrescannerNesting{100};
27 
Prescanner(Messages & messages,CookedSource & cooked,Preprocessor & preprocessor,common::LanguageFeatureControl lfc)28 Prescanner::Prescanner(Messages &messages, CookedSource &cooked,
29     Preprocessor &preprocessor, common::LanguageFeatureControl lfc)
30     : messages_{messages}, cooked_{cooked}, preprocessor_{preprocessor},
31       features_{lfc}, encoding_{cooked.allSources().encoding()} {}
32 
// Copy constructor.  The copy shares the original's messages, cooked source,
// and preprocessor, inherits its source form settings, encoding, and
// compiler directive sentinels, and records a nesting depth one greater
// than that of the original.
Prescanner::Prescanner(const Prescanner &that)
    : messages_{that.messages_},
      cooked_{that.cooked_},
      preprocessor_{that.preprocessor_},
      features_{that.features_},
      inFixedForm_{that.inFixedForm_},
      fixedFormColumnLimit_{that.fixedFormColumnLimit_},
      encoding_{that.encoding_},
      prescannerNesting_{that.prescannerNesting_ + 1},
      skipLeadingAmpersand_{that.skipLeadingAmpersand_},
      compilerDirectiveBloomFilter_{that.compilerDirectiveBloomFilter_},
      compilerDirectiveSentinels_{that.compilerDirectiveSentinels_} {}
43 
// True when ch is one of the characters recognized here as a fixed form
// comment marker: '!', '*', 'C', or 'c'.
static inline constexpr bool IsFixedFormCommentChar(char ch) {
  switch (ch) {
  case '!':
  case '*':
  case 'C':
  case 'c':
    return true;
  default:
    return false;
  }
}
47 
NormalizeCompilerDirectiveCommentMarker(TokenSequence & dir)48 static void NormalizeCompilerDirectiveCommentMarker(TokenSequence &dir) {
49   char *p{dir.GetMutableCharData()};
50   char *limit{p + dir.SizeInChars()};
51   for (; p < limit; ++p) {
52     if (*p != ' ') {
53       CHECK(IsFixedFormCommentChar(*p));
54       *p = '!';
55       return;
56     }
57   }
58   DIE("compiler directive all blank");
59 }
60 
Prescan(ProvenanceRange range)61 void Prescanner::Prescan(ProvenanceRange range) {
62   AllSources &allSources{cooked_.allSources()};
63   startProvenance_ = range.start();
64   std::size_t offset{0};
65   const SourceFile *source{allSources.GetSourceFile(startProvenance_, &offset)};
66   CHECK(source);
67   start_ = source->content().data() + offset;
68   limit_ = start_ + range.size();
69   nextLine_ = start_;
70   const bool beganInFixedForm{inFixedForm_};
71   if (prescannerNesting_ > maxPrescannerNesting) {
72     Say(GetProvenance(start_),
73         "too many nested INCLUDE/#include files, possibly circular"_err_en_US);
74     return;
75   }
76   while (nextLine_ < limit_) {
77     Statement();
78   }
79   if (inFixedForm_ != beganInFixedForm) {
80     std::string dir{"!dir$ "};
81     if (beganInFixedForm) {
82       dir += "fixed";
83     } else {
84       dir += "free";
85     }
86     dir += '\n';
87     TokenSequence tokens{dir, allSources.AddCompilerInsertion(dir).start()};
88     tokens.Emit(cooked_);
89   }
90 }
91 
Statement()92 void Prescanner::Statement() {
93   TokenSequence tokens;
94   LineClassification line{ClassifyLine(nextLine_)};
95   switch (line.kind) {
96   case LineClassification::Kind::Comment:
97     nextLine_ += line.payloadOffset; // advance to '!' or newline
98     NextLine();
99     return;
100   case LineClassification::Kind::IncludeLine:
101     FortranInclude(nextLine_ + line.payloadOffset);
102     NextLine();
103     return;
104   case LineClassification::Kind::ConditionalCompilationDirective:
105   case LineClassification::Kind::IncludeDirective:
106   case LineClassification::Kind::DefinitionDirective:
107   case LineClassification::Kind::PreprocessorDirective:
108     preprocessor_.Directive(TokenizePreprocessorDirective(), this);
109     return;
110   case LineClassification::Kind::CompilerDirective:
111     directiveSentinel_ = line.sentinel;
112     CHECK(InCompilerDirective());
113     BeginSourceLineAndAdvance();
114     if (inFixedForm_) {
115       CHECK(IsFixedFormCommentChar(*at_));
116     } else {
117       while (*at_ == ' ' || *at_ == '\t') {
118         ++at_, ++column_;
119       }
120       CHECK(*at_ == '!');
121     }
122     if (directiveSentinel_[0] == '$' && directiveSentinel_[1] == '\0') {
123       // OpenMP conditional compilation line.  Remove the sentinel and then
124       // treat the line as if it were normal source.
125       at_ += 2, column_ += 2;
126       if (inFixedForm_) {
127         LabelField(tokens);
128       } else {
129         SkipSpaces();
130       }
131     } else {
132       // Compiler directive.  Emit normalized sentinel.
133       EmitChar(tokens, '!');
134       ++at_, ++column_;
135       for (const char *sp{directiveSentinel_}; *sp != '\0';
136            ++sp, ++at_, ++column_) {
137         EmitChar(tokens, *sp);
138       }
139       if (*at_ == ' ') {
140         EmitChar(tokens, ' ');
141         ++at_, ++column_;
142       }
143       tokens.CloseToken();
144     }
145     break;
146   case LineClassification::Kind::Source:
147     BeginSourceLineAndAdvance();
148     if (inFixedForm_) {
149       LabelField(tokens);
150     } else if (skipLeadingAmpersand_) {
151       skipLeadingAmpersand_ = false;
152       const char *p{SkipWhiteSpace(at_)};
153       if (p < limit_ && *p == '&') {
154         column_ += ++p - at_;
155         at_ = p;
156       }
157     } else {
158       SkipSpaces();
159     }
160     break;
161   }
162 
163   while (NextToken(tokens)) {
164   }
165 
166   Provenance newlineProvenance{GetCurrentProvenance()};
167   if (std::optional<TokenSequence> preprocessed{
168           preprocessor_.MacroReplacement(tokens, *this)}) {
169     // Reprocess the preprocessed line.  Append a newline temporarily.
170     preprocessed->PutNextTokenChar('\n', newlineProvenance);
171     preprocessed->CloseToken();
172     const char *ppd{preprocessed->ToCharBlock().begin()};
173     LineClassification ppl{ClassifyLine(ppd)};
174     preprocessed->RemoveLastToken(); // remove the newline
175     switch (ppl.kind) {
176     case LineClassification::Kind::Comment:
177       break;
178     case LineClassification::Kind::IncludeLine:
179       FortranInclude(ppd + ppl.payloadOffset);
180       break;
181     case LineClassification::Kind::ConditionalCompilationDirective:
182     case LineClassification::Kind::IncludeDirective:
183     case LineClassification::Kind::DefinitionDirective:
184     case LineClassification::Kind::PreprocessorDirective:
185       Say(preprocessed->GetProvenanceRange(),
186           "Preprocessed line resembles a preprocessor directive"_en_US);
187       preprocessed->ToLowerCase().Emit(cooked_);
188       break;
189     case LineClassification::Kind::CompilerDirective:
190       if (preprocessed->HasRedundantBlanks()) {
191         preprocessed->RemoveRedundantBlanks();
192       }
193       NormalizeCompilerDirectiveCommentMarker(*preprocessed);
194       preprocessed->ToLowerCase();
195       SourceFormChange(preprocessed->ToString());
196       preprocessed->ClipComment(true /* skip first ! */).Emit(cooked_);
197       break;
198     case LineClassification::Kind::Source:
199       if (inFixedForm_) {
200         if (preprocessed->HasBlanks(/*after column*/ 6)) {
201           preprocessed->RemoveBlanks(/*after column*/ 6);
202         }
203       } else {
204         if (preprocessed->HasRedundantBlanks()) {
205           preprocessed->RemoveRedundantBlanks();
206         }
207       }
208       preprocessed->ToLowerCase().ClipComment().Emit(cooked_);
209       break;
210     }
211   } else {
212     tokens.ToLowerCase();
213     if (line.kind == LineClassification::Kind::CompilerDirective) {
214       SourceFormChange(tokens.ToString());
215     }
216     tokens.Emit(cooked_);
217   }
218   if (omitNewline_) {
219     omitNewline_ = false;
220   } else {
221     cooked_.Put('\n', newlineProvenance);
222   }
223   directiveSentinel_ = nullptr;
224 }
225 
TokenizePreprocessorDirective()226 TokenSequence Prescanner::TokenizePreprocessorDirective() {
227   CHECK(nextLine_ < limit_ && !inPreprocessorDirective_);
228   inPreprocessorDirective_ = true;
229   BeginSourceLineAndAdvance();
230   TokenSequence tokens;
231   while (NextToken(tokens)) {
232   }
233   inPreprocessorDirective_ = false;
234   return tokens;
235 }
236 
NextLine()237 void Prescanner::NextLine() {
238   void *vstart{static_cast<void *>(const_cast<char *>(nextLine_))};
239   void *v{std::memchr(vstart, '\n', limit_ - nextLine_)};
240   if (!v) {
241     nextLine_ = limit_;
242   } else {
243     const char *nl{const_cast<const char *>(static_cast<char *>(v))};
244     nextLine_ = nl + 1;
245   }
246 }
247 
LabelField(TokenSequence & token,int outCol)248 void Prescanner::LabelField(TokenSequence &token, int outCol) {
249   for (; *at_ != '\n' && column_ <= 6; ++at_) {
250     if (*at_ == '\t') {
251       ++at_;
252       column_ = 7;
253       break;
254     }
255     if (*at_ != ' ' &&
256         !(*at_ == '0' && column_ == 6)) { // '0' in column 6 becomes space
257       EmitChar(token, *at_);
258       ++outCol;
259     }
260     ++column_;
261   }
262   if (outCol > 1) {
263     token.CloseToken();
264   }
265   if (outCol < 7) {
266     if (outCol == 1) {
267       token.Put("      ", 6, sixSpaceProvenance_.start());
268     } else {
269       for (; outCol < 7; ++outCol) {
270         token.PutNextTokenChar(' ', spaceProvenance_);
271       }
272       token.CloseToken();
273     }
274   }
275   SkipToNextSignificantCharacter();
276 }
277 
SkipToEndOfLine()278 void Prescanner::SkipToEndOfLine() {
279   while (*at_ != '\n') {
280     ++at_, ++column_;
281   }
282 }
283 
MustSkipToEndOfLine() const284 bool Prescanner::MustSkipToEndOfLine() const {
285   if (inFixedForm_ && column_ > fixedFormColumnLimit_ && !tabInCurrentLine_) {
286     return true; // skip over ignored columns in right margin (73:80)
287   } else if (*at_ == '!' && !inCharLiteral_) {
288     return true; // inline comment goes to end of source line
289   } else {
290     return false;
291   }
292 }
293 
NextChar()294 void Prescanner::NextChar() {
295   CHECK(*at_ != '\n');
296   ++at_, ++column_;
297   while (at_[0] == '\xef' && at_[1] == '\xbb' && at_[2] == '\xbf') {
298     // UTF-8 byte order mark - treat this file as UTF-8
299     at_ += 3;
300     encoding_ = Encoding::UTF_8;
301   }
302   SkipToNextSignificantCharacter();
303 }
304 
305 // Skip everything that should be ignored until the next significant
306 // character is reached; handles C-style comments in preprocessing
307 // directives, Fortran ! comments, stuff after the right margin in
308 // fixed form, and all forms of line continuation.
SkipToNextSignificantCharacter()309 void Prescanner::SkipToNextSignificantCharacter() {
310   if (inPreprocessorDirective_) {
311     SkipCComments();
312   } else {
313     bool mightNeedSpace{false};
314     if (MustSkipToEndOfLine()) {
315       SkipToEndOfLine();
316     } else {
317       mightNeedSpace = *at_ == '\n';
318     }
319     for (; Continuation(mightNeedSpace); mightNeedSpace = false) {
320       if (MustSkipToEndOfLine()) {
321         SkipToEndOfLine();
322       }
323     }
324     if (*at_ == '\t') {
325       tabInCurrentLine_ = true;
326     }
327   }
328 }
329 
SkipCComments()330 void Prescanner::SkipCComments() {
331   while (true) {
332     if (IsCComment(at_)) {
333       if (const char *after{SkipCComment(at_)}) {
334         column_ += after - at_;
335         // May have skipped over one or more newlines; relocate the start of
336         // the next line.
337         nextLine_ = at_ = after;
338         NextLine();
339       } else {
340         // Don't emit any messages about unclosed C-style comments, because
341         // the sequence /* can appear legally in a FORMAT statement.  There's
342         // no ambiguity, since the sequence */ cannot appear legally.
343         break;
344       }
345     } else if (inPreprocessorDirective_ && at_[0] == '\\' && at_ + 2 < limit_ &&
346         at_[1] == '\n' && nextLine_ < limit_) {
347       BeginSourceLineAndAdvance();
348     } else {
349       break;
350     }
351   }
352 }
353 
SkipSpaces()354 void Prescanner::SkipSpaces() {
355   while (*at_ == ' ' || *at_ == '\t') {
356     NextChar();
357   }
358   insertASpace_ = false;
359 }
360 
SkipWhiteSpace(const char * p)361 const char *Prescanner::SkipWhiteSpace(const char *p) {
362   while (*p == ' ' || *p == '\t') {
363     ++p;
364   }
365   return p;
366 }
367 
SkipWhiteSpaceAndCComments(const char * p) const368 const char *Prescanner::SkipWhiteSpaceAndCComments(const char *p) const {
369   while (true) {
370     if (*p == ' ' || *p == '\t') {
371       ++p;
372     } else if (IsCComment(p)) {
373       if (const char *after{SkipCComment(p)}) {
374         p = after;
375       } else {
376         break;
377       }
378     } else {
379       break;
380     }
381   }
382   return p;
383 }
384 
SkipCComment(const char * p) const385 const char *Prescanner::SkipCComment(const char *p) const {
386   char star{' '}, slash{' '};
387   p += 2;
388   while (star != '*' || slash != '/') {
389     if (p >= limit_) {
390       return nullptr; // signifies an unterminated comment
391     }
392     star = slash;
393     slash = *p++;
394   }
395   return p;
396 }
397 
NextToken(TokenSequence & tokens)398 bool Prescanner::NextToken(TokenSequence &tokens) {
399   CHECK(at_ >= start_ && at_ < limit_);
400   if (InFixedFormSource()) {
401     SkipSpaces();
402   } else {
403     if (*at_ == '/' && IsCComment(at_)) {
404       // Recognize and skip over classic C style /*comments*/ when
405       // outside a character literal.
406       if (features_.ShouldWarn(LanguageFeature::ClassicCComments)) {
407         Say(GetProvenance(at_), "nonstandard usage: C-style comment"_en_US);
408       }
409       SkipCComments();
410     }
411     if (*at_ == ' ' || *at_ == '\t') {
412       // Compress free-form white space into a single space character.
413       const auto theSpace{at_};
414       char previous{at_ <= start_ ? ' ' : at_[-1]};
415       NextChar();
416       SkipSpaces();
417       if (*at_ == '\n') {
418         // Discard white space at the end of a line.
419       } else if (!inPreprocessorDirective_ &&
420           (previous == '(' || *at_ == '(' || *at_ == ')')) {
421         // Discard white space before/after '(' and before ')', unless in a
422         // preprocessor directive.  This helps yield space-free contiguous
423         // names for generic interfaces like OPERATOR( + ) and
424         // READ ( UNFORMATTED ), without misinterpreting #define f (notAnArg).
425         // This has the effect of silently ignoring the illegal spaces in
426         // the array constructor ( /1,2/ ) but that seems benign; it's
427         // hard to avoid that while still removing spaces from OPERATOR( / )
428         // and OPERATOR( // ).
429       } else {
430         // Preserve the squashed white space as a single space character.
431         tokens.PutNextTokenChar(' ', GetProvenance(theSpace));
432         tokens.CloseToken();
433         return true;
434       }
435     }
436   }
437   if (insertASpace_) {
438     tokens.PutNextTokenChar(' ', spaceProvenance_);
439     insertASpace_ = false;
440   }
441   if (*at_ == '\n') {
442     return false;
443   }
444   const char *start{at_};
445   if (*at_ == '\'' || *at_ == '"') {
446     QuotedCharacterLiteral(tokens, start);
447     preventHollerith_ = false;
448   } else if (IsDecimalDigit(*at_)) {
449     int n{0}, digits{0};
450     static constexpr int maxHollerith{256 /*lines*/ * (132 - 6 /*columns*/)};
451     do {
452       if (n < maxHollerith) {
453         n = 10 * n + DecimalDigitValue(*at_);
454       }
455       EmitCharAndAdvance(tokens, *at_);
456       ++digits;
457       if (InFixedFormSource()) {
458         SkipSpaces();
459       }
460     } while (IsDecimalDigit(*at_));
461     if ((*at_ == 'h' || *at_ == 'H') && n > 0 && n < maxHollerith &&
462         !preventHollerith_) {
463       Hollerith(tokens, n, start);
464     } else if (*at_ == '.') {
465       while (IsDecimalDigit(EmitCharAndAdvance(tokens, *at_))) {
466       }
467       ExponentAndKind(tokens);
468     } else if (ExponentAndKind(tokens)) {
469     } else if (digits == 1 && n == 0 && (*at_ == 'x' || *at_ == 'X') &&
470         inPreprocessorDirective_) {
471       do {
472         EmitCharAndAdvance(tokens, *at_);
473       } while (IsHexadecimalDigit(*at_));
474     } else if (IsLetter(*at_)) {
475       // Handles FORMAT(3I9HHOLLERITH) by skipping over the first I so that
476       // we don't misrecognize I9HOLLERITH as an identifier in the next case.
477       EmitCharAndAdvance(tokens, *at_);
478     } else if (at_[0] == '_' && (at_[1] == '\'' || at_[1] == '"')) {
479       EmitCharAndAdvance(tokens, *at_);
480       QuotedCharacterLiteral(tokens, start);
481     }
482     preventHollerith_ = false;
483   } else if (*at_ == '.') {
484     char nch{EmitCharAndAdvance(tokens, '.')};
485     if (!inPreprocessorDirective_ && IsDecimalDigit(nch)) {
486       while (IsDecimalDigit(EmitCharAndAdvance(tokens, *at_))) {
487       }
488       ExponentAndKind(tokens);
489     } else if (nch == '.' && EmitCharAndAdvance(tokens, '.') == '.') {
490       EmitCharAndAdvance(tokens, '.'); // variadic macro definition ellipsis
491     }
492     preventHollerith_ = false;
493   } else if (IsLegalInIdentifier(*at_)) {
494     do {
495     } while (IsLegalInIdentifier(EmitCharAndAdvance(tokens, *at_)));
496     if (*at_ == '\'' || *at_ == '"') {
497       QuotedCharacterLiteral(tokens, start);
498       preventHollerith_ = false;
499     } else {
500       // Subtle: Don't misrecognize labeled DO statement label as Hollerith
501       // when the loop control variable starts with 'H'.
502       preventHollerith_ = true;
503     }
504   } else if (*at_ == '*') {
505     if (EmitCharAndAdvance(tokens, '*') == '*') {
506       EmitCharAndAdvance(tokens, '*');
507     } else {
508       // Subtle ambiguity:
509       //  CHARACTER*2H     declares H because *2 is a kind specifier
510       //  DATAC/N*2H  /    is repeated Hollerith
511       preventHollerith_ = !slashInCurrentLine_;
512     }
513   } else {
514     char ch{*at_};
515     if (ch == '(' || ch == '[') {
516       ++delimiterNesting_;
517     } else if ((ch == ')' || ch == ']') && delimiterNesting_ > 0) {
518       --delimiterNesting_;
519     }
520     char nch{EmitCharAndAdvance(tokens, ch)};
521     preventHollerith_ = false;
522     if ((nch == '=' &&
523             (ch == '<' || ch == '>' || ch == '/' || ch == '=' || ch == '!')) ||
524         (ch == nch &&
525             (ch == '/' || ch == ':' || ch == '*' || ch == '#' || ch == '&' ||
526                 ch == '|' || ch == '<' || ch == '>')) ||
527         (ch == '=' && nch == '>')) {
528       // token comprises two characters
529       EmitCharAndAdvance(tokens, nch);
530     } else if (ch == '/') {
531       slashInCurrentLine_ = true;
532     }
533   }
534   tokens.CloseToken();
535   return true;
536 }
537 
ExponentAndKind(TokenSequence & tokens)538 bool Prescanner::ExponentAndKind(TokenSequence &tokens) {
539   char ed{ToLowerCaseLetter(*at_)};
540   if (ed != 'e' && ed != 'd') {
541     return false;
542   }
543   EmitCharAndAdvance(tokens, ed);
544   if (*at_ == '+' || *at_ == '-') {
545     EmitCharAndAdvance(tokens, *at_);
546   }
547   while (IsDecimalDigit(*at_)) {
548     EmitCharAndAdvance(tokens, *at_);
549   }
550   if (*at_ == '_') {
551     while (IsLegalInIdentifier(EmitCharAndAdvance(tokens, *at_))) {
552     }
553   }
554   return true;
555 }
556 
QuotedCharacterLiteral(TokenSequence & tokens,const char * start)557 void Prescanner::QuotedCharacterLiteral(
558     TokenSequence &tokens, const char *start) {
559   char quote{*at_};
560   const char *end{at_ + 1};
561   inCharLiteral_ = true;
562   const auto emit{[&](char ch) { EmitChar(tokens, ch); }};
563   const auto insert{[&](char ch) { EmitInsertedChar(tokens, ch); }};
564   bool isEscaped{false};
565   bool escapesEnabled{features_.IsEnabled(LanguageFeature::BackslashEscapes)};
566   while (true) {
567     if (*at_ == '\\') {
568       if (escapesEnabled) {
569         isEscaped = !isEscaped;
570       } else {
571         // The parser always processes escape sequences, so don't confuse it
572         // when escapes are disabled.
573         insert('\\');
574       }
575     } else {
576       isEscaped = false;
577     }
578     EmitQuotedChar(static_cast<unsigned char>(*at_), emit, insert, false,
579         Encoding::LATIN_1);
580     while (PadOutCharacterLiteral(tokens)) {
581     }
582     if (*at_ == '\n') {
583       if (!inPreprocessorDirective_) {
584         Say(GetProvenanceRange(start, end),
585             "Incomplete character literal"_err_en_US);
586       }
587       break;
588     }
589     end = at_ + 1;
590     NextChar();
591     if (*at_ == quote && !isEscaped) {
592       // A doubled unescaped quote mark becomes a single instance of that
593       // quote character in the literal (later).  There can be spaces between
594       // the quotes in fixed form source.
595       EmitChar(tokens, quote);
596       inCharLiteral_ = false; // for cases like print *, '...'!comment
597       NextChar();
598       if (InFixedFormSource()) {
599         SkipSpaces();
600       }
601       if (*at_ != quote) {
602         break;
603       }
604       inCharLiteral_ = true;
605     }
606   }
607   inCharLiteral_ = false;
608 }
609 
Hollerith(TokenSequence & tokens,int count,const char * start)610 void Prescanner::Hollerith(
611     TokenSequence &tokens, int count, const char *start) {
612   inCharLiteral_ = true;
613   CHECK(*at_ == 'h' || *at_ == 'H');
614   EmitChar(tokens, 'H');
615   while (count-- > 0) {
616     if (PadOutCharacterLiteral(tokens)) {
617     } else if (*at_ == '\n') {
618       Say(GetProvenanceRange(start, at_),
619           "Possible truncated Hollerith literal"_en_US);
620       break;
621     } else {
622       NextChar();
623       // Each multi-byte character encoding counts as a single character.
624       // No escape sequences are recognized.
625       // Hollerith is always emitted to the cooked character
626       // stream in UTF-8.
627       DecodedCharacter decoded{DecodeCharacter(
628           encoding_, at_, static_cast<std::size_t>(limit_ - at_), false)};
629       if (decoded.bytes > 0) {
630         EncodedCharacter utf8{
631             EncodeCharacter<Encoding::UTF_8>(decoded.codepoint)};
632         for (int j{0}; j < utf8.bytes; ++j) {
633           EmitChar(tokens, utf8.buffer[j]);
634         }
635         at_ += decoded.bytes - 1;
636       } else {
637         Say(GetProvenanceRange(start, at_),
638             "Bad character in Hollerith literal"_err_en_US);
639         break;
640       }
641     }
642   }
643   if (*at_ != '\n') {
644     NextChar();
645   }
646   inCharLiteral_ = false;
647 }
648 
649 // In fixed form, source card images must be processed as if they were at
650 // least 72 columns wide, at least in character literal contexts.
PadOutCharacterLiteral(TokenSequence & tokens)651 bool Prescanner::PadOutCharacterLiteral(TokenSequence &tokens) {
652   while (inFixedForm_ && !tabInCurrentLine_ && at_[1] == '\n') {
653     if (column_ < fixedFormColumnLimit_) {
654       tokens.PutNextTokenChar(' ', spaceProvenance_);
655       ++column_;
656       return true;
657     }
658     if (!FixedFormContinuation(false /*no need to insert space*/) ||
659         tabInCurrentLine_) {
660       return false;
661     }
662     CHECK(column_ == 7);
663     --at_; // point to column 6 of continuation line
664     column_ = 6;
665   }
666   return false;
667 }
668 
IsFixedFormCommentLine(const char * start) const669 bool Prescanner::IsFixedFormCommentLine(const char *start) const {
670   const char *p{start};
671   if (IsFixedFormCommentChar(*p) || *p == '%' || // VAX %list, %eject, &c.
672       ((*p == 'D' || *p == 'd') &&
673           !features_.IsEnabled(LanguageFeature::OldDebugLines))) {
674     return true;
675   }
676   bool anyTabs{false};
677   while (true) {
678     if (*p == ' ') {
679       ++p;
680     } else if (*p == '\t') {
681       anyTabs = true;
682       ++p;
683     } else if (*p == '0' && !anyTabs && p == start + 5) {
684       ++p; // 0 in column 6 must treated as a space
685     } else {
686       break;
687     }
688   }
689   if (!anyTabs && p >= start + fixedFormColumnLimit_) {
690     return true;
691   }
692   if (*p == '!' && !inCharLiteral_ && (anyTabs || p != start + 5)) {
693     return true;
694   }
695   return *p == '\n';
696 }
697 
IsFreeFormComment(const char * p) const698 const char *Prescanner::IsFreeFormComment(const char *p) const {
699   p = SkipWhiteSpaceAndCComments(p);
700   if (*p == '!' || *p == '\n') {
701     return p;
702   } else {
703     return nullptr;
704   }
705 }
706 
IsIncludeLine(const char * start) const707 std::optional<std::size_t> Prescanner::IsIncludeLine(const char *start) const {
708   const char *p{SkipWhiteSpace(start)};
709   for (char ch : "include"s) {
710     if (ToLowerCaseLetter(*p++) != ch) {
711       return std::nullopt;
712     }
713   }
714   p = SkipWhiteSpace(p);
715   if (*p == '"' || *p == '\'') {
716     return {p - start};
717   }
718   return std::nullopt;
719 }
720 
FortranInclude(const char * firstQuote)721 void Prescanner::FortranInclude(const char *firstQuote) {
722   const char *p{firstQuote};
723   while (*p != '"' && *p != '\'') {
724     ++p;
725   }
726   char quote{*p};
727   std::string path;
728   for (++p; *p != '\n'; ++p) {
729     if (*p == quote) {
730       if (p[1] != quote) {
731         break;
732       }
733       ++p;
734     }
735     path += *p;
736   }
737   if (*p != quote) {
738     Say(GetProvenanceRange(firstQuote, p),
739         "malformed path name string"_err_en_US);
740     return;
741   }
742   p = SkipWhiteSpace(p + 1);
743   if (*p != '\n' && *p != '!') {
744     const char *garbage{p};
745     for (; *p != '\n' && *p != '!'; ++p) {
746     }
747     Say(GetProvenanceRange(garbage, p),
748         "excess characters after path name"_en_US);
749   }
750   std::string buf;
751   llvm::raw_string_ostream error{buf};
752   Provenance provenance{GetProvenance(nextLine_)};
753   AllSources &allSources{cooked_.allSources()};
754   const SourceFile *currentFile{allSources.GetSourceFile(provenance)};
755   if (currentFile) {
756     allSources.PushSearchPathDirectory(DirectoryName(currentFile->path()));
757   }
758   const SourceFile *included{allSources.Open(path, error)};
759   if (currentFile) {
760     allSources.PopSearchPathDirectory();
761   }
762   if (!included) {
763     Say(provenance, "INCLUDE: %s"_err_en_US, error.str());
764   } else if (included->bytes() > 0) {
765     ProvenanceRange includeLineRange{
766         provenance, static_cast<std::size_t>(p - nextLine_)};
767     ProvenanceRange fileRange{
768         allSources.AddIncludedFile(*included, includeLineRange)};
769     Prescanner{*this}.set_encoding(included->encoding()).Prescan(fileRange);
770   }
771 }
772 
IsPreprocessorDirectiveLine(const char * start) const773 const char *Prescanner::IsPreprocessorDirectiveLine(const char *start) const {
774   const char *p{start};
775   for (; *p == ' '; ++p) {
776   }
777   if (*p == '#') {
778     if (inFixedForm_ && p == start + 5) {
779       return nullptr;
780     }
781   } else {
782     p = SkipWhiteSpace(p);
783     if (*p != '#') {
784       return nullptr;
785     }
786   }
787   return SkipWhiteSpace(p + 1);
788 }
789 
IsNextLinePreprocessorDirective() const790 bool Prescanner::IsNextLinePreprocessorDirective() const {
791   return IsPreprocessorDirectiveLine(nextLine_) != nullptr;
792 }
793 
SkipCommentLine(bool afterAmpersand)794 bool Prescanner::SkipCommentLine(bool afterAmpersand) {
795   if (nextLine_ >= limit_) {
796     if (afterAmpersand && prescannerNesting_ > 0) {
797       // A continuation marker at the end of the last line in an
798       // include file inhibits the newline for that line.
799       SkipToEndOfLine();
800       omitNewline_ = true;
801     }
802     return false;
803   }
804   auto lineClass{ClassifyLine(nextLine_)};
805   if (lineClass.kind == LineClassification::Kind::Comment) {
806     NextLine();
807     return true;
808   } else if (inPreprocessorDirective_) {
809     return false;
810   } else if (lineClass.kind ==
811           LineClassification::Kind::ConditionalCompilationDirective ||
812       lineClass.kind == LineClassification::Kind::PreprocessorDirective) {
813     // Allow conditional compilation directives (e.g., #ifdef) to affect
814     // continuation lines.
815     // Allow other preprocessor directives, too, except #include
816     // (when it does not follow '&'), #define, and #undef (because
817     // they cannot be allowed to affect preceding text on a
818     // continued line).
819     preprocessor_.Directive(TokenizePreprocessorDirective(), this);
820     return true;
821   } else if (afterAmpersand &&
822       (lineClass.kind == LineClassification::Kind::IncludeDirective ||
823           lineClass.kind == LineClassification::Kind::IncludeLine)) {
824     SkipToEndOfLine();
825     omitNewline_ = true;
826     skipLeadingAmpersand_ = true;
827     return false;
828   } else {
829     return false;
830   }
831 }
832 
FixedFormContinuationLine(bool mightNeedSpace)833 const char *Prescanner::FixedFormContinuationLine(bool mightNeedSpace) {
834   if (nextLine_ >= limit_) {
835     return nullptr;
836   }
837   tabInCurrentLine_ = false;
838   char col1{*nextLine_};
839   if (InCompilerDirective()) {
840     // Must be a continued compiler directive.
841     if (!IsFixedFormCommentChar(col1)) {
842       return nullptr;
843     }
844     int j{1};
845     for (; j < 5; ++j) {
846       char ch{directiveSentinel_[j - 1]};
847       if (ch == '\0') {
848         break;
849       }
850       if (ch != ToLowerCaseLetter(nextLine_[j])) {
851         return nullptr;
852       }
853     }
854     for (; j < 5; ++j) {
855       if (nextLine_[j] != ' ') {
856         return nullptr;
857       }
858     }
859     char col6{nextLine_[5]};
860     if (col6 != '\n' && col6 != '\t' && col6 != ' ' && col6 != '0') {
861       if (nextLine_[6] != ' ' && mightNeedSpace) {
862         insertASpace_ = true;
863       }
864       return nextLine_ + 6;
865     }
866     return nullptr;
867   } else {
868     // Normal case: not in a compiler directive.
869     if (col1 == '&' &&
870         features_.IsEnabled(
871             LanguageFeature::FixedFormContinuationWithColumn1Ampersand)) {
872       // Extension: '&' as continuation marker
873       if (features_.ShouldWarn(
874               LanguageFeature::FixedFormContinuationWithColumn1Ampersand)) {
875         Say(GetProvenance(nextLine_), "nonstandard usage"_en_US);
876       }
877       return nextLine_ + 1;
878     }
879     if (col1 == '\t' && nextLine_[1] >= '1' && nextLine_[1] <= '9') {
880       tabInCurrentLine_ = true;
881       return nextLine_ + 2; // VAX extension
882     }
883     if (col1 == ' ' && nextLine_[1] == ' ' && nextLine_[2] == ' ' &&
884         nextLine_[3] == ' ' && nextLine_[4] == ' ') {
885       char col6{nextLine_[5]};
886       if (col6 != '\n' && col6 != '\t' && col6 != ' ' && col6 != '0') {
887         return nextLine_ + 6;
888       }
889     }
890     if (delimiterNesting_ > 0) {
891       if (!IsFixedFormCommentChar(col1)) {
892         return nextLine_;
893       }
894     }
895   }
896   return nullptr; // not a continuation line
897 }
898 
FreeFormContinuationLine(bool ampersand)899 const char *Prescanner::FreeFormContinuationLine(bool ampersand) {
900   const char *p{nextLine_};
901   if (p >= limit_) {
902     return nullptr;
903   }
904   p = SkipWhiteSpace(p);
905   if (InCompilerDirective()) {
906     if (*p++ != '!') {
907       return nullptr;
908     }
909     for (const char *s{directiveSentinel_}; *s != '\0'; ++p, ++s) {
910       if (*s != ToLowerCaseLetter(*p)) {
911         return nullptr;
912       }
913     }
914     p = SkipWhiteSpace(p);
915     if (*p == '&') {
916       if (!ampersand) {
917         insertASpace_ = true;
918       }
919       return p + 1;
920     } else if (ampersand) {
921       return p;
922     } else {
923       return nullptr;
924     }
925   } else {
926     if (*p == '&') {
927       return p + 1;
928     } else if (*p == '!' || *p == '\n' || *p == '#') {
929       return nullptr;
930     } else if (ampersand || delimiterNesting_ > 0) {
931       if (p > nextLine_) {
932         --p;
933       } else {
934         insertASpace_ = true;
935       }
936       return p;
937     } else {
938       return nullptr;
939     }
940   }
941 }
942 
FixedFormContinuation(bool mightNeedSpace)943 bool Prescanner::FixedFormContinuation(bool mightNeedSpace) {
944   // N.B. We accept '&' as a continuation indicator in fixed form, too,
945   // but not in a character literal.
946   if (*at_ == '&' && inCharLiteral_) {
947     return false;
948   }
949   do {
950     if (const char *cont{FixedFormContinuationLine(mightNeedSpace)}) {
951       BeginSourceLine(cont);
952       column_ = 7;
953       NextLine();
954       return true;
955     }
956   } while (SkipCommentLine(false /* not after ampersand */));
957   return false;
958 }
959 
FreeFormContinuation()960 bool Prescanner::FreeFormContinuation() {
961   const char *p{at_};
962   bool ampersand{*p == '&'};
963   if (ampersand) {
964     p = SkipWhiteSpace(p + 1);
965   }
966   if (*p != '\n') {
967     if (inCharLiteral_) {
968       return false;
969     } else if (*p != '!' &&
970         features_.ShouldWarn(LanguageFeature::CruftAfterAmpersand)) {
971       Say(GetProvenance(p), "missing ! before comment after &"_en_US);
972     }
973   }
974   do {
975     if (const char *cont{FreeFormContinuationLine(ampersand)}) {
976       BeginSourceLine(cont);
977       NextLine();
978       return true;
979     }
980   } while (SkipCommentLine(ampersand));
981   return false;
982 }
983 
Continuation(bool mightNeedFixedFormSpace)984 bool Prescanner::Continuation(bool mightNeedFixedFormSpace) {
985   if (*at_ == '\n' || *at_ == '&') {
986     if (inFixedForm_) {
987       return FixedFormContinuation(mightNeedFixedFormSpace);
988     } else {
989       return FreeFormContinuation();
990     }
991   } else {
992     return false;
993   }
994 }
995 
996 std::optional<Prescanner::LineClassification>
IsFixedFormCompilerDirectiveLine(const char * start) const997 Prescanner::IsFixedFormCompilerDirectiveLine(const char *start) const {
998   const char *p{start};
999   char col1{*p++};
1000   if (!IsFixedFormCommentChar(col1)) {
1001     return std::nullopt;
1002   }
1003   char sentinel[5], *sp{sentinel};
1004   int column{2};
1005   for (; column < 6; ++column, ++p) {
1006     if (*p != ' ') {
1007       if (*p == '\n' || *p == '\t') {
1008         break;
1009       }
1010       if (sp == sentinel + 1 && sentinel[0] == '$' && IsDecimalDigit(*p)) {
1011         // OpenMP conditional compilation line: leave the label alone
1012         break;
1013       }
1014       *sp++ = ToLowerCaseLetter(*p);
1015     }
1016   }
1017   if (column == 6) {
1018     if (*p == ' ' || *p == '\t' || *p == '0') {
1019       ++p;
1020     } else {
1021       // This is a Continuation line, not an initial directive line.
1022       return std::nullopt;
1023     }
1024   }
1025   if (sp == sentinel) {
1026     return std::nullopt;
1027   }
1028   *sp = '\0';
1029   if (const char *ss{IsCompilerDirectiveSentinel(sentinel)}) {
1030     std::size_t payloadOffset = p - start;
1031     return {LineClassification{
1032         LineClassification::Kind::CompilerDirective, payloadOffset, ss}};
1033   }
1034   return std::nullopt;
1035 }
1036 
1037 std::optional<Prescanner::LineClassification>
IsFreeFormCompilerDirectiveLine(const char * start) const1038 Prescanner::IsFreeFormCompilerDirectiveLine(const char *start) const {
1039   char sentinel[8];
1040   const char *p{SkipWhiteSpace(start)};
1041   if (*p++ != '!') {
1042     return std::nullopt;
1043   }
1044   for (std::size_t j{0}; j + 1 < sizeof sentinel; ++p, ++j) {
1045     if (*p == '\n') {
1046       break;
1047     }
1048     if (*p == ' ' || *p == '\t' || *p == '&') {
1049       if (j == 0) {
1050         break;
1051       }
1052       sentinel[j] = '\0';
1053       p = SkipWhiteSpace(p + 1);
1054       if (*p == '!') {
1055         break;
1056       }
1057       if (const char *sp{IsCompilerDirectiveSentinel(sentinel)}) {
1058         std::size_t offset = p - start;
1059         return {LineClassification{
1060             LineClassification::Kind::CompilerDirective, offset, sp}};
1061       }
1062       break;
1063     }
1064     sentinel[j] = ToLowerCaseLetter(*p);
1065   }
1066   return std::nullopt;
1067 }
1068 
AddCompilerDirectiveSentinel(const std::string & dir)1069 Prescanner &Prescanner::AddCompilerDirectiveSentinel(const std::string &dir) {
1070   std::uint64_t packed{0};
1071   for (char ch : dir) {
1072     packed = (packed << 8) | (ToLowerCaseLetter(ch) & 0xff);
1073   }
1074   compilerDirectiveBloomFilter_.set(packed % prime1);
1075   compilerDirectiveBloomFilter_.set(packed % prime2);
1076   compilerDirectiveSentinels_.insert(dir);
1077   return *this;
1078 }
1079 
IsCompilerDirectiveSentinel(const char * sentinel) const1080 const char *Prescanner::IsCompilerDirectiveSentinel(
1081     const char *sentinel) const {
1082   std::uint64_t packed{0};
1083   std::size_t n{0};
1084   for (; sentinel[n] != '\0'; ++n) {
1085     packed = (packed << 8) | (sentinel[n] & 0xff);
1086   }
1087   if (n == 0 || !compilerDirectiveBloomFilter_.test(packed % prime1) ||
1088       !compilerDirectiveBloomFilter_.test(packed % prime2)) {
1089     return nullptr;
1090   }
1091   const auto iter{compilerDirectiveSentinels_.find(std::string(sentinel, n))};
1092   return iter == compilerDirectiveSentinels_.end() ? nullptr : iter->c_str();
1093 }
1094 
ClassifyLine(const char * start) const1095 Prescanner::LineClassification Prescanner::ClassifyLine(
1096     const char *start) const {
1097   if (inFixedForm_) {
1098     if (std::optional<LineClassification> lc{
1099             IsFixedFormCompilerDirectiveLine(start)}) {
1100       return std::move(*lc);
1101     }
1102     if (IsFixedFormCommentLine(start)) {
1103       return {LineClassification::Kind::Comment};
1104     }
1105   } else {
1106     if (std::optional<LineClassification> lc{
1107             IsFreeFormCompilerDirectiveLine(start)}) {
1108       return std::move(*lc);
1109     }
1110     if (const char *bang{IsFreeFormComment(start)}) {
1111       return {LineClassification::Kind::Comment,
1112           static_cast<std::size_t>(bang - start)};
1113     }
1114   }
1115   if (std::optional<std::size_t> quoteOffset{IsIncludeLine(start)}) {
1116     return {LineClassification::Kind::IncludeLine, *quoteOffset};
1117   }
1118   if (const char *dir{IsPreprocessorDirectiveLine(start)}) {
1119     if (std::memcmp(dir, "if", 2) == 0 || std::memcmp(dir, "elif", 4) == 0 ||
1120         std::memcmp(dir, "else", 4) == 0 || std::memcmp(dir, "endif", 5) == 0) {
1121       return {LineClassification::Kind::ConditionalCompilationDirective};
1122     } else if (std::memcmp(dir, "include", 7) == 0) {
1123       return {LineClassification::Kind::IncludeDirective};
1124     } else if (std::memcmp(dir, "define", 6) == 0 ||
1125         std::memcmp(dir, "undef", 5) == 0) {
1126       return {LineClassification::Kind::DefinitionDirective};
1127     } else {
1128       return {LineClassification::Kind::PreprocessorDirective};
1129     }
1130   }
1131   return {LineClassification::Kind::Source};
1132 }
1133 
SourceFormChange(std::string && dir)1134 void Prescanner::SourceFormChange(std::string &&dir) {
1135   if (dir == "!dir$ free") {
1136     inFixedForm_ = false;
1137   } else if (dir == "!dir$ fixed") {
1138     inFixedForm_ = true;
1139   }
1140 }
1141 } // namespace Fortran::parser
1142