1 //===-- lib/Parser/prescan.cpp --------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "prescan.h"
10 #include "preprocessor.h"
11 #include "token-sequence.h"
12 #include "flang/Common/idioms.h"
13 #include "flang/Parser/characters.h"
14 #include "flang/Parser/message.h"
15 #include "flang/Parser/source.h"
16 #include "llvm/Support/raw_ostream.h"
17 #include <cstddef>
18 #include <cstring>
19 #include <utility>
20 #include <vector>
21 
22 namespace Fortran::parser {
23 
24 using common::LanguageFeature;
25 
// Deepest permitted chain of nested prescanners; Prescan() reports an error
// about nested INCLUDE/#include files once prescannerNesting_ exceeds it.
static constexpr int maxPrescannerNesting{100};
27 
Prescanner(Messages & messages,CookedSource & cooked,Preprocessor & preprocessor,common::LanguageFeatureControl lfc)28 Prescanner::Prescanner(Messages &messages, CookedSource &cooked,
29     Preprocessor &preprocessor, common::LanguageFeatureControl lfc)
30     : messages_{messages}, cooked_{cooked}, preprocessor_{preprocessor},
31       allSources_{preprocessor_.allSources()}, features_{lfc},
32       encoding_{allSources_.encoding()} {}
33 
// Copy constructor.  The copy shares the original's messages, cooked
// source, preprocessor, and sources, inherits its source form, column
// limit, encoding, and directive sentinel tables, and is one nesting
// level deeper than the original.
Prescanner::Prescanner(const Prescanner &that)
    : messages_{that.messages_},
      cooked_{that.cooked_},
      preprocessor_{that.preprocessor_},
      allSources_{that.allSources_},
      features_{that.features_},
      inFixedForm_{that.inFixedForm_},
      fixedFormColumnLimit_{that.fixedFormColumnLimit_},
      encoding_{that.encoding_},
      prescannerNesting_{that.prescannerNesting_ + 1},
      skipLeadingAmpersand_{that.skipLeadingAmpersand_},
      compilerDirectiveBloomFilter_{that.compilerDirectiveBloomFilter_},
      compilerDirectiveSentinels_{that.compilerDirectiveSentinels_} {}
44 
// True when ch is one of the characters '!', '*', 'C', or 'c' that this
// file accepts as a fixed-form comment marker.
static inline constexpr bool IsFixedFormCommentChar(char ch) {
  switch (ch) {
  case '!':
  case '*':
  case 'C':
  case 'c':
    return true;
  default:
    return false;
  }
}
48 
NormalizeCompilerDirectiveCommentMarker(TokenSequence & dir)49 static void NormalizeCompilerDirectiveCommentMarker(TokenSequence &dir) {
50   char *p{dir.GetMutableCharData()};
51   char *limit{p + dir.SizeInChars()};
52   for (; p < limit; ++p) {
53     if (*p != ' ') {
54       CHECK(IsFixedFormCommentChar(*p));
55       *p = '!';
56       return;
57     }
58   }
59   DIE("compiler directive all blank");
60 }
61 
Prescan(ProvenanceRange range)62 void Prescanner::Prescan(ProvenanceRange range) {
63   startProvenance_ = range.start();
64   start_ = allSources_.GetSource(range);
65   CHECK(start_);
66   limit_ = start_ + range.size();
67   nextLine_ = start_;
68   const bool beganInFixedForm{inFixedForm_};
69   if (prescannerNesting_ > maxPrescannerNesting) {
70     Say(GetProvenance(start_),
71         "too many nested INCLUDE/#include files, possibly circular"_err_en_US);
72     return;
73   }
74   while (!IsAtEnd()) {
75     Statement();
76   }
77   if (inFixedForm_ != beganInFixedForm) {
78     std::string dir{"!dir$ "};
79     if (beganInFixedForm) {
80       dir += "fixed";
81     } else {
82       dir += "free";
83     }
84     dir += '\n';
85     TokenSequence tokens{dir, allSources_.AddCompilerInsertion(dir).start()};
86     tokens.Emit(cooked_);
87   }
88 }
89 
Statement()90 void Prescanner::Statement() {
91   TokenSequence tokens;
92   LineClassification line{ClassifyLine(nextLine_)};
93   switch (line.kind) {
94   case LineClassification::Kind::Comment:
95     nextLine_ += line.payloadOffset; // advance to '!' or newline
96     NextLine();
97     return;
98   case LineClassification::Kind::IncludeLine:
99     FortranInclude(nextLine_ + line.payloadOffset);
100     NextLine();
101     return;
102   case LineClassification::Kind::ConditionalCompilationDirective:
103   case LineClassification::Kind::IncludeDirective:
104   case LineClassification::Kind::DefinitionDirective:
105   case LineClassification::Kind::PreprocessorDirective:
106     preprocessor_.Directive(TokenizePreprocessorDirective(), this);
107     return;
108   case LineClassification::Kind::CompilerDirective:
109     directiveSentinel_ = line.sentinel;
110     CHECK(InCompilerDirective());
111     BeginStatementAndAdvance();
112     if (inFixedForm_) {
113       CHECK(IsFixedFormCommentChar(*at_));
114     } else {
115       while (*at_ == ' ' || *at_ == '\t') {
116         ++at_, ++column_;
117       }
118       CHECK(*at_ == '!');
119     }
120     if (directiveSentinel_[0] == '$' && directiveSentinel_[1] == '\0') {
121       // OpenMP conditional compilation line.  Remove the sentinel and then
122       // treat the line as if it were normal source.
123       at_ += 2, column_ += 2;
124       if (inFixedForm_) {
125         LabelField(tokens);
126       } else {
127         SkipSpaces();
128       }
129     } else {
130       // Compiler directive.  Emit normalized sentinel.
131       EmitChar(tokens, '!');
132       ++at_, ++column_;
133       for (const char *sp{directiveSentinel_}; *sp != '\0';
134            ++sp, ++at_, ++column_) {
135         EmitChar(tokens, *sp);
136       }
137       if (*at_ == ' ') {
138         EmitChar(tokens, ' ');
139         ++at_, ++column_;
140       }
141       tokens.CloseToken();
142     }
143     break;
144   case LineClassification::Kind::Source:
145     BeginStatementAndAdvance();
146     if (inFixedForm_) {
147       LabelField(tokens);
148     } else if (skipLeadingAmpersand_) {
149       skipLeadingAmpersand_ = false;
150       const char *p{SkipWhiteSpace(at_)};
151       if (p < limit_ && *p == '&') {
152         column_ += ++p - at_;
153         at_ = p;
154       }
155     } else {
156       SkipSpaces();
157     }
158     break;
159   }
160 
161   while (NextToken(tokens)) {
162   }
163 
164   Provenance newlineProvenance{GetCurrentProvenance()};
165   if (std::optional<TokenSequence> preprocessed{
166           preprocessor_.MacroReplacement(tokens, *this)}) {
167     // Reprocess the preprocessed line.  Append a newline temporarily.
168     preprocessed->PutNextTokenChar('\n', newlineProvenance);
169     preprocessed->CloseToken();
170     const char *ppd{preprocessed->ToCharBlock().begin()};
171     LineClassification ppl{ClassifyLine(ppd)};
172     preprocessed->RemoveLastToken(); // remove the newline
173     switch (ppl.kind) {
174     case LineClassification::Kind::Comment:
175       break;
176     case LineClassification::Kind::IncludeLine:
177       FortranInclude(ppd + ppl.payloadOffset);
178       break;
179     case LineClassification::Kind::ConditionalCompilationDirective:
180     case LineClassification::Kind::IncludeDirective:
181     case LineClassification::Kind::DefinitionDirective:
182     case LineClassification::Kind::PreprocessorDirective:
183       Say(preprocessed->GetProvenanceRange(),
184           "Preprocessed line resembles a preprocessor directive"_en_US);
185       preprocessed->ToLowerCase().CheckBadFortranCharacters(messages_).Emit(
186           cooked_);
187       break;
188     case LineClassification::Kind::CompilerDirective:
189       if (preprocessed->HasRedundantBlanks()) {
190         preprocessed->RemoveRedundantBlanks();
191       }
192       NormalizeCompilerDirectiveCommentMarker(*preprocessed);
193       preprocessed->ToLowerCase();
194       SourceFormChange(preprocessed->ToString());
195       preprocessed->ClipComment(true /* skip first ! */)
196           .CheckBadFortranCharacters(messages_)
197           .Emit(cooked_);
198       break;
199     case LineClassification::Kind::Source:
200       if (inFixedForm_) {
201         if (preprocessed->HasBlanks(/*after column*/ 6)) {
202           preprocessed->RemoveBlanks(/*after column*/ 6);
203         }
204       } else {
205         if (preprocessed->HasRedundantBlanks()) {
206           preprocessed->RemoveRedundantBlanks();
207         }
208       }
209       preprocessed->ToLowerCase()
210           .ClipComment()
211           .CheckBadFortranCharacters(messages_)
212           .Emit(cooked_);
213       break;
214     }
215   } else {
216     tokens.ToLowerCase();
217     if (line.kind == LineClassification::Kind::CompilerDirective) {
218       SourceFormChange(tokens.ToString());
219     }
220     tokens.CheckBadFortranCharacters(messages_).Emit(cooked_);
221   }
222   if (omitNewline_) {
223     omitNewline_ = false;
224   } else {
225     cooked_.Put('\n', newlineProvenance);
226   }
227   directiveSentinel_ = nullptr;
228 }
229 
TokenizePreprocessorDirective()230 TokenSequence Prescanner::TokenizePreprocessorDirective() {
231   CHECK(!IsAtEnd() && !inPreprocessorDirective_);
232   inPreprocessorDirective_ = true;
233   BeginStatementAndAdvance();
234   TokenSequence tokens;
235   while (NextToken(tokens)) {
236   }
237   inPreprocessorDirective_ = false;
238   return tokens;
239 }
240 
NextLine()241 void Prescanner::NextLine() {
242   void *vstart{static_cast<void *>(const_cast<char *>(nextLine_))};
243   void *v{std::memchr(vstart, '\n', limit_ - nextLine_)};
244   if (!v) {
245     nextLine_ = limit_;
246   } else {
247     const char *nl{const_cast<const char *>(static_cast<char *>(v))};
248     nextLine_ = nl + 1;
249   }
250 }
251 
LabelField(TokenSequence & token)252 void Prescanner::LabelField(TokenSequence &token) {
253   const char *bad{nullptr};
254   int outCol{1};
255   for (; *at_ != '\n' && column_ <= 6; ++at_) {
256     if (*at_ == '\t') {
257       ++at_;
258       column_ = 7;
259       break;
260     }
261     if (*at_ != ' ' &&
262         !(*at_ == '0' && column_ == 6)) { // '0' in column 6 becomes space
263       EmitChar(token, *at_);
264       ++outCol;
265       if (!bad && !IsDecimalDigit(*at_)) {
266         bad = at_;
267       }
268     }
269     ++column_;
270   }
271   if (outCol == 1) { // empty label field
272     // Emit a space so that, if the line is rescanned after preprocessing,
273     // a leading 'C' or 'D' won't be left-justified and then accidentally
274     // misinterpreted as a comment card.
275     EmitChar(token, ' ');
276     ++outCol;
277   } else {
278     if (bad && !preprocessor_.IsNameDefined(token.CurrentOpenToken())) {
279       Say(GetProvenance(bad),
280           "Character in fixed-form label field must be a digit"_en_US);
281     }
282   }
283   token.CloseToken();
284   SkipToNextSignificantCharacter();
285   if (IsDecimalDigit(*at_)) {
286     Say(GetProvenance(at_),
287         "Label digit is not in fixed-form label field"_en_US);
288   }
289 }
290 
SkipToEndOfLine()291 void Prescanner::SkipToEndOfLine() {
292   while (*at_ != '\n') {
293     ++at_, ++column_;
294   }
295 }
296 
MustSkipToEndOfLine() const297 bool Prescanner::MustSkipToEndOfLine() const {
298   if (inFixedForm_ && column_ > fixedFormColumnLimit_ && !tabInCurrentLine_) {
299     return true; // skip over ignored columns in right margin (73:80)
300   } else if (*at_ == '!' && !inCharLiteral_) {
301     return true; // inline comment goes to end of source line
302   } else {
303     return false;
304   }
305 }
306 
NextChar()307 void Prescanner::NextChar() {
308   CHECK(*at_ != '\n');
309   ++at_, ++column_;
310   while (at_[0] == '\xef' && at_[1] == '\xbb' && at_[2] == '\xbf') {
311     // UTF-8 byte order mark - treat this file as UTF-8
312     at_ += 3;
313     encoding_ = Encoding::UTF_8;
314   }
315   SkipToNextSignificantCharacter();
316 }
317 
318 // Skip everything that should be ignored until the next significant
319 // character is reached; handles C-style comments in preprocessing
320 // directives, Fortran ! comments, stuff after the right margin in
321 // fixed form, and all forms of line continuation.
SkipToNextSignificantCharacter()322 void Prescanner::SkipToNextSignificantCharacter() {
323   if (inPreprocessorDirective_) {
324     SkipCComments();
325   } else {
326     bool mightNeedSpace{false};
327     if (MustSkipToEndOfLine()) {
328       SkipToEndOfLine();
329     } else {
330       mightNeedSpace = *at_ == '\n';
331     }
332     for (; Continuation(mightNeedSpace); mightNeedSpace = false) {
333       if (MustSkipToEndOfLine()) {
334         SkipToEndOfLine();
335       }
336     }
337     if (*at_ == '\t') {
338       tabInCurrentLine_ = true;
339     }
340   }
341 }
342 
SkipCComments()343 void Prescanner::SkipCComments() {
344   while (true) {
345     if (IsCComment(at_)) {
346       if (const char *after{SkipCComment(at_)}) {
347         column_ += after - at_;
348         // May have skipped over one or more newlines; relocate the start of
349         // the next line.
350         nextLine_ = at_ = after;
351         NextLine();
352       } else {
353         // Don't emit any messages about unclosed C-style comments, because
354         // the sequence /* can appear legally in a FORMAT statement.  There's
355         // no ambiguity, since the sequence */ cannot appear legally.
356         break;
357       }
358     } else if (inPreprocessorDirective_ && at_[0] == '\\' && at_ + 2 < limit_ &&
359         at_[1] == '\n' && !IsAtEnd()) {
360       BeginSourceLineAndAdvance();
361     } else {
362       break;
363     }
364   }
365 }
366 
SkipSpaces()367 void Prescanner::SkipSpaces() {
368   while (*at_ == ' ' || *at_ == '\t') {
369     NextChar();
370   }
371   insertASpace_ = false;
372 }
373 
SkipWhiteSpace(const char * p)374 const char *Prescanner::SkipWhiteSpace(const char *p) {
375   while (*p == ' ' || *p == '\t') {
376     ++p;
377   }
378   return p;
379 }
380 
SkipWhiteSpaceAndCComments(const char * p) const381 const char *Prescanner::SkipWhiteSpaceAndCComments(const char *p) const {
382   while (true) {
383     if (*p == ' ' || *p == '\t') {
384       ++p;
385     } else if (IsCComment(p)) {
386       if (const char *after{SkipCComment(p)}) {
387         p = after;
388       } else {
389         break;
390       }
391     } else {
392       break;
393     }
394   }
395   return p;
396 }
397 
SkipCComment(const char * p) const398 const char *Prescanner::SkipCComment(const char *p) const {
399   char star{' '}, slash{' '};
400   p += 2;
401   while (star != '*' || slash != '/') {
402     if (p >= limit_) {
403       return nullptr; // signifies an unterminated comment
404     }
405     star = slash;
406     slash = *p++;
407   }
408   return p;
409 }
410 
NextToken(TokenSequence & tokens)411 bool Prescanner::NextToken(TokenSequence &tokens) {
412   CHECK(at_ >= start_ && at_ < limit_);
413   if (InFixedFormSource()) {
414     SkipSpaces();
415   } else {
416     if (*at_ == '/' && IsCComment(at_)) {
417       // Recognize and skip over classic C style /*comments*/ when
418       // outside a character literal.
419       if (features_.ShouldWarn(LanguageFeature::ClassicCComments)) {
420         Say(GetProvenance(at_), "nonstandard usage: C-style comment"_en_US);
421       }
422       SkipCComments();
423     }
424     if (*at_ == ' ' || *at_ == '\t') {
425       // Compress free-form white space into a single space character.
426       const auto theSpace{at_};
427       char previous{at_ <= start_ ? ' ' : at_[-1]};
428       NextChar();
429       SkipSpaces();
430       if (*at_ == '\n') {
431         // Discard white space at the end of a line.
432       } else if (!inPreprocessorDirective_ &&
433           (previous == '(' || *at_ == '(' || *at_ == ')')) {
434         // Discard white space before/after '(' and before ')', unless in a
435         // preprocessor directive.  This helps yield space-free contiguous
436         // names for generic interfaces like OPERATOR( + ) and
437         // READ ( UNFORMATTED ), without misinterpreting #define f (notAnArg).
438         // This has the effect of silently ignoring the illegal spaces in
439         // the array constructor ( /1,2/ ) but that seems benign; it's
440         // hard to avoid that while still removing spaces from OPERATOR( / )
441         // and OPERATOR( // ).
442       } else {
443         // Preserve the squashed white space as a single space character.
444         tokens.PutNextTokenChar(' ', GetProvenance(theSpace));
445         tokens.CloseToken();
446         return true;
447       }
448     }
449   }
450   if (insertASpace_) {
451     tokens.PutNextTokenChar(' ', spaceProvenance_);
452     insertASpace_ = false;
453   }
454   if (*at_ == '\n') {
455     return false;
456   }
457   const char *start{at_};
458   if (*at_ == '\'' || *at_ == '"') {
459     QuotedCharacterLiteral(tokens, start);
460     preventHollerith_ = false;
461   } else if (IsDecimalDigit(*at_)) {
462     int n{0}, digits{0};
463     static constexpr int maxHollerith{256 /*lines*/ * (132 - 6 /*columns*/)};
464     do {
465       if (n < maxHollerith) {
466         n = 10 * n + DecimalDigitValue(*at_);
467       }
468       EmitCharAndAdvance(tokens, *at_);
469       ++digits;
470       if (InFixedFormSource()) {
471         SkipSpaces();
472       }
473     } while (IsDecimalDigit(*at_));
474     if ((*at_ == 'h' || *at_ == 'H') && n > 0 && n < maxHollerith &&
475         !preventHollerith_) {
476       Hollerith(tokens, n, start);
477     } else if (*at_ == '.') {
478       while (IsDecimalDigit(EmitCharAndAdvance(tokens, *at_))) {
479       }
480       ExponentAndKind(tokens);
481     } else if (ExponentAndKind(tokens)) {
482     } else if (digits == 1 && n == 0 && (*at_ == 'x' || *at_ == 'X') &&
483         inPreprocessorDirective_) {
484       do {
485         EmitCharAndAdvance(tokens, *at_);
486       } while (IsHexadecimalDigit(*at_));
487     } else if (IsLetter(*at_)) {
488       // Handles FORMAT(3I9HHOLLERITH) by skipping over the first I so that
489       // we don't misrecognize I9HOLLERITH as an identifier in the next case.
490       EmitCharAndAdvance(tokens, *at_);
491     } else if (at_[0] == '_' && (at_[1] == '\'' || at_[1] == '"')) { // 4_"..."
492       EmitCharAndAdvance(tokens, *at_);
493       QuotedCharacterLiteral(tokens, start);
494     }
495     preventHollerith_ = false;
496   } else if (*at_ == '.') {
497     char nch{EmitCharAndAdvance(tokens, '.')};
498     if (!inPreprocessorDirective_ && IsDecimalDigit(nch)) {
499       while (IsDecimalDigit(EmitCharAndAdvance(tokens, *at_))) {
500       }
501       ExponentAndKind(tokens);
502     } else if (nch == '.' && EmitCharAndAdvance(tokens, '.') == '.') {
503       EmitCharAndAdvance(tokens, '.'); // variadic macro definition ellipsis
504     }
505     preventHollerith_ = false;
506   } else if (IsLegalInIdentifier(*at_)) {
507     do {
508     } while (IsLegalInIdentifier(EmitCharAndAdvance(tokens, *at_)));
509     if ((*at_ == '\'' || *at_ == '"') &&
510         tokens.CharAt(tokens.SizeInChars() - 1) == '_') { // kind_"..."
511       QuotedCharacterLiteral(tokens, start);
512     }
513     preventHollerith_ = false;
514   } else if (*at_ == '*') {
515     if (EmitCharAndAdvance(tokens, '*') == '*') {
516       EmitCharAndAdvance(tokens, '*');
517     } else {
518       // Subtle ambiguity:
519       //  CHARACTER*2H     declares H because *2 is a kind specifier
520       //  DATAC/N*2H  /    is repeated Hollerith
521       preventHollerith_ = !slashInCurrentStatement_;
522     }
523   } else {
524     char ch{*at_};
525     if (ch == '(' || ch == '[') {
526       ++delimiterNesting_;
527     } else if ((ch == ')' || ch == ']') && delimiterNesting_ > 0) {
528       --delimiterNesting_;
529     }
530     char nch{EmitCharAndAdvance(tokens, ch)};
531     preventHollerith_ = false;
532     if ((nch == '=' &&
533             (ch == '<' || ch == '>' || ch == '/' || ch == '=' || ch == '!')) ||
534         (ch == nch &&
535             (ch == '/' || ch == ':' || ch == '*' || ch == '#' || ch == '&' ||
536                 ch == '|' || ch == '<' || ch == '>')) ||
537         (ch == '=' && nch == '>')) {
538       // token comprises two characters
539       EmitCharAndAdvance(tokens, nch);
540     } else if (ch == '/') {
541       slashInCurrentStatement_ = true;
542     }
543   }
544   tokens.CloseToken();
545   return true;
546 }
547 
ExponentAndKind(TokenSequence & tokens)548 bool Prescanner::ExponentAndKind(TokenSequence &tokens) {
549   char ed{ToLowerCaseLetter(*at_)};
550   if (ed != 'e' && ed != 'd') {
551     return false;
552   }
553   EmitCharAndAdvance(tokens, ed);
554   if (*at_ == '+' || *at_ == '-') {
555     EmitCharAndAdvance(tokens, *at_);
556   }
557   while (IsDecimalDigit(*at_)) {
558     EmitCharAndAdvance(tokens, *at_);
559   }
560   if (*at_ == '_') {
561     while (IsLegalInIdentifier(EmitCharAndAdvance(tokens, *at_))) {
562     }
563   }
564   return true;
565 }
566 
QuotedCharacterLiteral(TokenSequence & tokens,const char * start)567 void Prescanner::QuotedCharacterLiteral(
568     TokenSequence &tokens, const char *start) {
569   char quote{*at_};
570   const char *end{at_ + 1};
571   inCharLiteral_ = true;
572   const auto emit{[&](char ch) { EmitChar(tokens, ch); }};
573   const auto insert{[&](char ch) { EmitInsertedChar(tokens, ch); }};
574   bool isEscaped{false};
575   bool escapesEnabled{features_.IsEnabled(LanguageFeature::BackslashEscapes)};
576   while (true) {
577     if (*at_ == '\\') {
578       if (escapesEnabled) {
579         isEscaped = !isEscaped;
580       } else {
581         // The parser always processes escape sequences, so don't confuse it
582         // when escapes are disabled.
583         insert('\\');
584       }
585     } else {
586       isEscaped = false;
587     }
588     EmitQuotedChar(static_cast<unsigned char>(*at_), emit, insert, false,
589         Encoding::LATIN_1);
590     while (PadOutCharacterLiteral(tokens)) {
591     }
592     if (*at_ == '\n') {
593       if (!inPreprocessorDirective_) {
594         Say(GetProvenanceRange(start, end),
595             "Incomplete character literal"_err_en_US);
596       }
597       break;
598     }
599     end = at_ + 1;
600     NextChar();
601     if (*at_ == quote && !isEscaped) {
602       // A doubled unescaped quote mark becomes a single instance of that
603       // quote character in the literal (later).  There can be spaces between
604       // the quotes in fixed form source.
605       EmitChar(tokens, quote);
606       inCharLiteral_ = false; // for cases like print *, '...'!comment
607       NextChar();
608       if (InFixedFormSource()) {
609         SkipSpaces();
610       }
611       if (*at_ != quote) {
612         break;
613       }
614       inCharLiteral_ = true;
615     }
616   }
617   inCharLiteral_ = false;
618 }
619 
Hollerith(TokenSequence & tokens,int count,const char * start)620 void Prescanner::Hollerith(
621     TokenSequence &tokens, int count, const char *start) {
622   inCharLiteral_ = true;
623   CHECK(*at_ == 'h' || *at_ == 'H');
624   EmitChar(tokens, 'H');
625   while (count-- > 0) {
626     if (PadOutCharacterLiteral(tokens)) {
627     } else if (*at_ == '\n') {
628       Say(GetProvenanceRange(start, at_),
629           "Possible truncated Hollerith literal"_en_US);
630       break;
631     } else {
632       NextChar();
633       // Each multi-byte character encoding counts as a single character.
634       // No escape sequences are recognized.
635       // Hollerith is always emitted to the cooked character
636       // stream in UTF-8.
637       DecodedCharacter decoded{DecodeCharacter(
638           encoding_, at_, static_cast<std::size_t>(limit_ - at_), false)};
639       if (decoded.bytes > 0) {
640         EncodedCharacter utf8{
641             EncodeCharacter<Encoding::UTF_8>(decoded.codepoint)};
642         for (int j{0}; j < utf8.bytes; ++j) {
643           EmitChar(tokens, utf8.buffer[j]);
644         }
645         at_ += decoded.bytes - 1;
646       } else {
647         Say(GetProvenanceRange(start, at_),
648             "Bad character in Hollerith literal"_err_en_US);
649         break;
650       }
651     }
652   }
653   if (*at_ != '\n') {
654     NextChar();
655   }
656   inCharLiteral_ = false;
657 }
658 
659 // In fixed form, source card images must be processed as if they were at
660 // least 72 columns wide, at least in character literal contexts.
PadOutCharacterLiteral(TokenSequence & tokens)661 bool Prescanner::PadOutCharacterLiteral(TokenSequence &tokens) {
662   while (inFixedForm_ && !tabInCurrentLine_ && at_[1] == '\n') {
663     if (column_ < fixedFormColumnLimit_) {
664       tokens.PutNextTokenChar(' ', spaceProvenance_);
665       ++column_;
666       return true;
667     }
668     if (!FixedFormContinuation(false /*no need to insert space*/) ||
669         tabInCurrentLine_) {
670       return false;
671     }
672     CHECK(column_ == 7);
673     --at_; // point to column 6 of continuation line
674     column_ = 6;
675   }
676   return false;
677 }
678 
IsFixedFormCommentLine(const char * start) const679 bool Prescanner::IsFixedFormCommentLine(const char *start) const {
680   const char *p{start};
681   if (IsFixedFormCommentChar(*p) || *p == '%' || // VAX %list, %eject, &c.
682       ((*p == 'D' || *p == 'd') &&
683           !features_.IsEnabled(LanguageFeature::OldDebugLines))) {
684     return true;
685   }
686   bool anyTabs{false};
687   while (true) {
688     if (*p == ' ') {
689       ++p;
690     } else if (*p == '\t') {
691       anyTabs = true;
692       ++p;
693     } else if (*p == '0' && !anyTabs && p == start + 5) {
694       ++p; // 0 in column 6 must treated as a space
695     } else {
696       break;
697     }
698   }
699   if (!anyTabs && p >= start + fixedFormColumnLimit_) {
700     return true;
701   }
702   if (*p == '!' && !inCharLiteral_ && (anyTabs || p != start + 5)) {
703     return true;
704   }
705   return *p == '\n';
706 }
707 
IsFreeFormComment(const char * p) const708 const char *Prescanner::IsFreeFormComment(const char *p) const {
709   p = SkipWhiteSpaceAndCComments(p);
710   if (*p == '!' || *p == '\n') {
711     return p;
712   } else {
713     return nullptr;
714   }
715 }
716 
IsIncludeLine(const char * start) const717 std::optional<std::size_t> Prescanner::IsIncludeLine(const char *start) const {
718   const char *p{SkipWhiteSpace(start)};
719   for (char ch : "include"s) {
720     if (ToLowerCaseLetter(*p++) != ch) {
721       return std::nullopt;
722     }
723   }
724   p = SkipWhiteSpace(p);
725   if (*p == '"' || *p == '\'') {
726     return {p - start};
727   }
728   return std::nullopt;
729 }
730 
FortranInclude(const char * firstQuote)731 void Prescanner::FortranInclude(const char *firstQuote) {
732   const char *p{firstQuote};
733   while (*p != '"' && *p != '\'') {
734     ++p;
735   }
736   char quote{*p};
737   std::string path;
738   for (++p; *p != '\n'; ++p) {
739     if (*p == quote) {
740       if (p[1] != quote) {
741         break;
742       }
743       ++p;
744     }
745     path += *p;
746   }
747   if (*p != quote) {
748     Say(GetProvenanceRange(firstQuote, p),
749         "malformed path name string"_err_en_US);
750     return;
751   }
752   p = SkipWhiteSpace(p + 1);
753   if (*p != '\n' && *p != '!') {
754     const char *garbage{p};
755     for (; *p != '\n' && *p != '!'; ++p) {
756     }
757     Say(GetProvenanceRange(garbage, p),
758         "excess characters after path name"_en_US);
759   }
760   std::string buf;
761   llvm::raw_string_ostream error{buf};
762   Provenance provenance{GetProvenance(nextLine_)};
763   std::optional<std::string> prependPath;
764   if (const SourceFile * currentFile{allSources_.GetSourceFile(provenance)}) {
765     prependPath = DirectoryName(currentFile->path());
766   }
767   const SourceFile *included{
768       allSources_.Open(path, error, std::move(prependPath))};
769   if (!included) {
770     Say(provenance, "INCLUDE: %s"_err_en_US, error.str());
771   } else if (included->bytes() > 0) {
772     ProvenanceRange includeLineRange{
773         provenance, static_cast<std::size_t>(p - nextLine_)};
774     ProvenanceRange fileRange{
775         allSources_.AddIncludedFile(*included, includeLineRange)};
776     Prescanner{*this}.set_encoding(included->encoding()).Prescan(fileRange);
777   }
778 }
779 
IsPreprocessorDirectiveLine(const char * start) const780 const char *Prescanner::IsPreprocessorDirectiveLine(const char *start) const {
781   const char *p{start};
782   for (; *p == ' '; ++p) {
783   }
784   if (*p == '#') {
785     if (inFixedForm_ && p == start + 5) {
786       return nullptr;
787     }
788   } else {
789     p = SkipWhiteSpace(p);
790     if (*p != '#') {
791       return nullptr;
792     }
793   }
794   return SkipWhiteSpace(p + 1);
795 }
796 
IsNextLinePreprocessorDirective() const797 bool Prescanner::IsNextLinePreprocessorDirective() const {
798   return IsPreprocessorDirectiveLine(nextLine_) != nullptr;
799 }
800 
SkipCommentLine(bool afterAmpersand)801 bool Prescanner::SkipCommentLine(bool afterAmpersand) {
802   if (IsAtEnd()) {
803     if (afterAmpersand && prescannerNesting_ > 0) {
804       // A continuation marker at the end of the last line in an
805       // include file inhibits the newline for that line.
806       SkipToEndOfLine();
807       omitNewline_ = true;
808     }
809     return false;
810   }
811   auto lineClass{ClassifyLine(nextLine_)};
812   if (lineClass.kind == LineClassification::Kind::Comment) {
813     NextLine();
814     return true;
815   } else if (inPreprocessorDirective_) {
816     return false;
817   } else if (lineClass.kind ==
818           LineClassification::Kind::ConditionalCompilationDirective ||
819       lineClass.kind == LineClassification::Kind::PreprocessorDirective) {
820     // Allow conditional compilation directives (e.g., #ifdef) to affect
821     // continuation lines.
822     // Allow other preprocessor directives, too, except #include
823     // (when it does not follow '&'), #define, and #undef (because
824     // they cannot be allowed to affect preceding text on a
825     // continued line).
826     preprocessor_.Directive(TokenizePreprocessorDirective(), this);
827     return true;
828   } else if (afterAmpersand &&
829       (lineClass.kind == LineClassification::Kind::IncludeDirective ||
830           lineClass.kind == LineClassification::Kind::IncludeLine)) {
831     SkipToEndOfLine();
832     omitNewline_ = true;
833     skipLeadingAmpersand_ = true;
834     return false;
835   } else {
836     return false;
837   }
838 }
839 
FixedFormContinuationLine(bool mightNeedSpace)840 const char *Prescanner::FixedFormContinuationLine(bool mightNeedSpace) {
841   if (IsAtEnd()) {
842     return nullptr;
843   }
844   tabInCurrentLine_ = false;
845   char col1{*nextLine_};
846   if (InCompilerDirective()) {
847     // Must be a continued compiler directive.
848     if (!IsFixedFormCommentChar(col1)) {
849       return nullptr;
850     }
851     int j{1};
852     for (; j < 5; ++j) {
853       char ch{directiveSentinel_[j - 1]};
854       if (ch == '\0') {
855         break;
856       }
857       if (ch != ToLowerCaseLetter(nextLine_[j])) {
858         return nullptr;
859       }
860     }
861     for (; j < 5; ++j) {
862       if (nextLine_[j] != ' ') {
863         return nullptr;
864       }
865     }
866     char col6{nextLine_[5]};
867     if (col6 != '\n' && col6 != '\t' && col6 != ' ' && col6 != '0') {
868       if (nextLine_[6] != ' ' && mightNeedSpace) {
869         insertASpace_ = true;
870       }
871       return nextLine_ + 6;
872     }
873     return nullptr;
874   } else {
875     // Normal case: not in a compiler directive.
876     if (col1 == '&' &&
877         features_.IsEnabled(
878             LanguageFeature::FixedFormContinuationWithColumn1Ampersand)) {
879       // Extension: '&' as continuation marker
880       if (features_.ShouldWarn(
881               LanguageFeature::FixedFormContinuationWithColumn1Ampersand)) {
882         Say(GetProvenance(nextLine_), "nonstandard usage"_en_US);
883       }
884       return nextLine_ + 1;
885     }
886     if (col1 == '\t' && nextLine_[1] >= '1' && nextLine_[1] <= '9') {
887       tabInCurrentLine_ = true;
888       return nextLine_ + 2; // VAX extension
889     }
890     if (col1 == ' ' && nextLine_[1] == ' ' && nextLine_[2] == ' ' &&
891         nextLine_[3] == ' ' && nextLine_[4] == ' ') {
892       char col6{nextLine_[5]};
893       if (col6 != '\n' && col6 != '\t' && col6 != ' ' && col6 != '0') {
894         return nextLine_ + 6;
895       }
896     }
897     if (IsImplicitContinuation()) {
898       return nextLine_;
899     }
900   }
901   return nullptr; // not a continuation line
902 }
903 
FreeFormContinuationLine(bool ampersand)904 const char *Prescanner::FreeFormContinuationLine(bool ampersand) {
905   const char *p{nextLine_};
906   if (p >= limit_) {
907     return nullptr;
908   }
909   p = SkipWhiteSpace(p);
910   if (InCompilerDirective()) {
911     if (*p++ != '!') {
912       return nullptr;
913     }
914     for (const char *s{directiveSentinel_}; *s != '\0'; ++p, ++s) {
915       if (*s != ToLowerCaseLetter(*p)) {
916         return nullptr;
917       }
918     }
919     p = SkipWhiteSpace(p);
920     if (*p == '&') {
921       if (!ampersand) {
922         insertASpace_ = true;
923       }
924       return p + 1;
925     } else if (ampersand) {
926       return p;
927     } else {
928       return nullptr;
929     }
930   } else {
931     if (*p == '&') {
932       return p + 1;
933     } else if (*p == '!' || *p == '\n' || *p == '#') {
934       return nullptr;
935     } else if (ampersand || IsImplicitContinuation()) {
936       if (p > nextLine_) {
937         --p;
938       } else {
939         insertASpace_ = true;
940       }
941       return p;
942     } else {
943       return nullptr;
944     }
945   }
946 }
947 
FixedFormContinuation(bool mightNeedSpace)948 bool Prescanner::FixedFormContinuation(bool mightNeedSpace) {
949   // N.B. We accept '&' as a continuation indicator in fixed form, too,
950   // but not in a character literal.
951   if (*at_ == '&' && inCharLiteral_) {
952     return false;
953   }
954   do {
955     if (const char *cont{FixedFormContinuationLine(mightNeedSpace)}) {
956       BeginSourceLine(cont);
957       column_ = 7;
958       NextLine();
959       return true;
960     }
961   } while (SkipCommentLine(false /* not after ampersand */));
962   return false;
963 }
964 
FreeFormContinuation()965 bool Prescanner::FreeFormContinuation() {
966   const char *p{at_};
967   bool ampersand{*p == '&'};
968   if (ampersand) {
969     p = SkipWhiteSpace(p + 1);
970   }
971   if (*p != '\n') {
972     if (inCharLiteral_) {
973       return false;
974     } else if (*p != '!' &&
975         features_.ShouldWarn(LanguageFeature::CruftAfterAmpersand)) {
976       Say(GetProvenance(p), "missing ! before comment after &"_en_US);
977     }
978   }
979   do {
980     if (const char *cont{FreeFormContinuationLine(ampersand)}) {
981       BeginSourceLine(cont);
982       NextLine();
983       return true;
984     }
985   } while (SkipCommentLine(ampersand));
986   return false;
987 }
988 
989 // Implicit line continuation allows a preprocessor macro call with
990 // arguments to span multiple lines.
IsImplicitContinuation() const991 bool Prescanner::IsImplicitContinuation() const {
992   return !inPreprocessorDirective_ && !inCharLiteral_ &&
993       delimiterNesting_ > 0 && !IsAtEnd() &&
994       ClassifyLine(nextLine_).kind == LineClassification::Kind::Source;
995 }
996 
Continuation(bool mightNeedFixedFormSpace)997 bool Prescanner::Continuation(bool mightNeedFixedFormSpace) {
998   if (*at_ == '\n' || *at_ == '&') {
999     if (inFixedForm_) {
1000       return FixedFormContinuation(mightNeedFixedFormSpace);
1001     } else {
1002       return FreeFormContinuation();
1003     }
1004   } else {
1005     return false;
1006   }
1007 }
1008 
1009 std::optional<Prescanner::LineClassification>
IsFixedFormCompilerDirectiveLine(const char * start) const1010 Prescanner::IsFixedFormCompilerDirectiveLine(const char *start) const {
1011   const char *p{start};
1012   char col1{*p++};
1013   if (!IsFixedFormCommentChar(col1)) {
1014     return std::nullopt;
1015   }
1016   char sentinel[5], *sp{sentinel};
1017   int column{2};
1018   for (; column < 6; ++column, ++p) {
1019     if (*p != ' ') {
1020       if (*p == '\n' || *p == '\t') {
1021         break;
1022       }
1023       if (sp == sentinel + 1 && sentinel[0] == '$' && IsDecimalDigit(*p)) {
1024         // OpenMP conditional compilation line: leave the label alone
1025         break;
1026       }
1027       *sp++ = ToLowerCaseLetter(*p);
1028     }
1029   }
1030   if (column == 6) {
1031     if (*p == ' ' || *p == '\t' || *p == '0') {
1032       ++p;
1033     } else {
1034       // This is a Continuation line, not an initial directive line.
1035       return std::nullopt;
1036     }
1037   }
1038   if (sp == sentinel) {
1039     return std::nullopt;
1040   }
1041   *sp = '\0';
1042   if (const char *ss{IsCompilerDirectiveSentinel(sentinel)}) {
1043     std::size_t payloadOffset = p - start;
1044     return {LineClassification{
1045         LineClassification::Kind::CompilerDirective, payloadOffset, ss}};
1046   }
1047   return std::nullopt;
1048 }
1049 
1050 std::optional<Prescanner::LineClassification>
IsFreeFormCompilerDirectiveLine(const char * start) const1051 Prescanner::IsFreeFormCompilerDirectiveLine(const char *start) const {
1052   char sentinel[8];
1053   const char *p{SkipWhiteSpace(start)};
1054   if (*p++ != '!') {
1055     return std::nullopt;
1056   }
1057   for (std::size_t j{0}; j + 1 < sizeof sentinel; ++p, ++j) {
1058     if (*p == '\n') {
1059       break;
1060     }
1061     if (*p == ' ' || *p == '\t' || *p == '&') {
1062       if (j == 0) {
1063         break;
1064       }
1065       sentinel[j] = '\0';
1066       p = SkipWhiteSpace(p + 1);
1067       if (*p == '!') {
1068         break;
1069       }
1070       if (const char *sp{IsCompilerDirectiveSentinel(sentinel)}) {
1071         std::size_t offset = p - start;
1072         return {LineClassification{
1073             LineClassification::Kind::CompilerDirective, offset, sp}};
1074       }
1075       break;
1076     }
1077     sentinel[j] = ToLowerCaseLetter(*p);
1078   }
1079   return std::nullopt;
1080 }
1081 
AddCompilerDirectiveSentinel(const std::string & dir)1082 Prescanner &Prescanner::AddCompilerDirectiveSentinel(const std::string &dir) {
1083   std::uint64_t packed{0};
1084   for (char ch : dir) {
1085     packed = (packed << 8) | (ToLowerCaseLetter(ch) & 0xff);
1086   }
1087   compilerDirectiveBloomFilter_.set(packed % prime1);
1088   compilerDirectiveBloomFilter_.set(packed % prime2);
1089   compilerDirectiveSentinels_.insert(dir);
1090   return *this;
1091 }
1092 
IsCompilerDirectiveSentinel(const char * sentinel) const1093 const char *Prescanner::IsCompilerDirectiveSentinel(
1094     const char *sentinel) const {
1095   std::uint64_t packed{0};
1096   std::size_t n{0};
1097   for (; sentinel[n] != '\0'; ++n) {
1098     packed = (packed << 8) | (sentinel[n] & 0xff);
1099   }
1100   if (n == 0 || !compilerDirectiveBloomFilter_.test(packed % prime1) ||
1101       !compilerDirectiveBloomFilter_.test(packed % prime2)) {
1102     return nullptr;
1103   }
1104   const auto iter{compilerDirectiveSentinels_.find(std::string(sentinel, n))};
1105   return iter == compilerDirectiveSentinels_.end() ? nullptr : iter->c_str();
1106 }
1107 
IsDirective(const char * match,const char * dir)1108 constexpr bool IsDirective(const char *match, const char *dir) {
1109   for (; *match; ++match) {
1110     if (*match != ToLowerCaseLetter(*dir++)) {
1111       return false;
1112     }
1113   }
1114   return true;
1115 }
1116 
ClassifyLine(const char * start) const1117 Prescanner::LineClassification Prescanner::ClassifyLine(
1118     const char *start) const {
1119   if (inFixedForm_) {
1120     if (std::optional<LineClassification> lc{
1121             IsFixedFormCompilerDirectiveLine(start)}) {
1122       return std::move(*lc);
1123     }
1124     if (IsFixedFormCommentLine(start)) {
1125       return {LineClassification::Kind::Comment};
1126     }
1127   } else {
1128     if (std::optional<LineClassification> lc{
1129             IsFreeFormCompilerDirectiveLine(start)}) {
1130       return std::move(*lc);
1131     }
1132     if (const char *bang{IsFreeFormComment(start)}) {
1133       return {LineClassification::Kind::Comment,
1134           static_cast<std::size_t>(bang - start)};
1135     }
1136   }
1137   if (std::optional<std::size_t> quoteOffset{IsIncludeLine(start)}) {
1138     return {LineClassification::Kind::IncludeLine, *quoteOffset};
1139   }
1140   if (const char *dir{IsPreprocessorDirectiveLine(start)}) {
1141     if (IsDirective("if", dir) || IsDirective("elif", dir) ||
1142         IsDirective("else", dir) || IsDirective("endif", dir)) {
1143       return {LineClassification::Kind::ConditionalCompilationDirective};
1144     } else if (IsDirective("include", dir)) {
1145       return {LineClassification::Kind::IncludeDirective};
1146     } else if (IsDirective("define", dir) || IsDirective("undef", dir)) {
1147       return {LineClassification::Kind::DefinitionDirective};
1148     } else {
1149       return {LineClassification::Kind::PreprocessorDirective};
1150     }
1151   }
1152   return {LineClassification::Kind::Source};
1153 }
1154 
SourceFormChange(std::string && dir)1155 void Prescanner::SourceFormChange(std::string &&dir) {
1156   if (dir == "!dir$ free") {
1157     inFixedForm_ = false;
1158   } else if (dir == "!dir$ fixed") {
1159     inFixedForm_ = true;
1160   }
1161 }
1162 } // namespace Fortran::parser
1163