1 //===-- lib/Parser/prescan.cpp --------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8
9 #include "prescan.h"
10 #include "preprocessor.h"
11 #include "token-sequence.h"
12 #include "flang/Common/idioms.h"
13 #include "flang/Parser/characters.h"
14 #include "flang/Parser/message.h"
15 #include "flang/Parser/source.h"
16 #include "llvm/Support/raw_ostream.h"
17 #include <cstddef>
18 #include <cstring>
19 #include <utility>
20 #include <vector>
21
22 namespace Fortran::parser {
23
24 using common::LanguageFeature;
25
// Maximum depth of nested prescanners (a nested one is created for each
// included file); Prescan() reports an error instead of scanning when the
// nesting depth exceeds this value.
26 static constexpr int maxPrescannerNesting{100};
27
Prescanner(Messages & messages,CookedSource & cooked,Preprocessor & preprocessor,common::LanguageFeatureControl lfc)28 Prescanner::Prescanner(Messages &messages, CookedSource &cooked,
29 Preprocessor &preprocessor, common::LanguageFeatureControl lfc)
30 : messages_{messages}, cooked_{cooked}, preprocessor_{preprocessor},
31 features_{lfc}, encoding_{cooked.allSources().encoding()} {}
32
// Copy constructor, used to create the prescanner for an included file.
// The copy shares the message/cooked/preprocessor references and inherits
// the source form settings and directive sentinels of `that`, but records
// a nesting depth one greater than its parent's.
Prescanner::Prescanner(const Prescanner &that)
    : messages_(that.messages_),
      cooked_(that.cooked_),
      preprocessor_(that.preprocessor_),
      features_(that.features_),
      inFixedForm_(that.inFixedForm_),
      fixedFormColumnLimit_(that.fixedFormColumnLimit_),
      encoding_(that.encoding_),
      prescannerNesting_(that.prescannerNesting_ + 1),
      skipLeadingAmpersand_(that.skipLeadingAmpersand_),
      compilerDirectiveBloomFilter_(that.compilerDirectiveBloomFilter_),
      compilerDirectiveSentinels_(that.compilerDirectiveSentinels_) {}
43
// True when `ch` is one of the characters accepted here as a fixed-form
// comment marker: '!', '*', 'C', or 'c'.
static inline constexpr bool IsFixedFormCommentChar(char ch) {
  switch (ch) {
  case '!':
  case '*':
  case 'C':
  case 'c':
    return true;
  default:
    return false;
  }
}
47
NormalizeCompilerDirectiveCommentMarker(TokenSequence & dir)48 static void NormalizeCompilerDirectiveCommentMarker(TokenSequence &dir) {
49 char *p{dir.GetMutableCharData()};
50 char *limit{p + dir.SizeInChars()};
51 for (; p < limit; ++p) {
52 if (*p != ' ') {
53 CHECK(IsFixedFormCommentChar(*p));
54 *p = '!';
55 return;
56 }
57 }
58 DIE("compiler directive all blank");
59 }
60
Prescan(ProvenanceRange range)61 void Prescanner::Prescan(ProvenanceRange range) {
62 AllSources &allSources{cooked_.allSources()};
63 startProvenance_ = range.start();
64 std::size_t offset{0};
65 const SourceFile *source{allSources.GetSourceFile(startProvenance_, &offset)};
66 CHECK(source);
67 start_ = source->content().data() + offset;
68 limit_ = start_ + range.size();
69 nextLine_ = start_;
70 const bool beganInFixedForm{inFixedForm_};
71 if (prescannerNesting_ > maxPrescannerNesting) {
72 Say(GetProvenance(start_),
73 "too many nested INCLUDE/#include files, possibly circular"_err_en_US);
74 return;
75 }
76 while (nextLine_ < limit_) {
77 Statement();
78 }
79 if (inFixedForm_ != beganInFixedForm) {
80 std::string dir{"!dir$ "};
81 if (beganInFixedForm) {
82 dir += "fixed";
83 } else {
84 dir += "free";
85 }
86 dir += '\n';
87 TokenSequence tokens{dir, allSources.AddCompilerInsertion(dir).start()};
88 tokens.Emit(cooked_);
89 }
90 }
91
// Prescans one statement that begins at nextLine_.
// The line is classified first.  Comment lines are skipped, and INCLUDE
// lines and preprocessor directives are handed off; all of those return
// without emitting anything here.  Compiler directive lines and ordinary
// source lines are tokenized with NextToken(), passed through
// preprocessor_.MacroReplacement(), and emitted to cooked_, followed by a
// newline unless omitNewline_ was set while scanning.
Statement()92 void Prescanner::Statement() {
93 TokenSequence tokens;
94 LineClassification line{ClassifyLine(nextLine_)};
95 switch (line.kind) {
96 case LineClassification::Kind::Comment:
97 nextLine_ += line.payloadOffset; // advance to '!' or newline
98 NextLine();
99 return;
100 case LineClassification::Kind::IncludeLine:
101 FortranInclude(nextLine_ + line.payloadOffset);
102 NextLine();
103 return;
104 case LineClassification::Kind::ConditionalCompilationDirective:
105 case LineClassification::Kind::IncludeDirective:
106 case LineClassification::Kind::DefinitionDirective:
107 case LineClassification::Kind::PreprocessorDirective:
108 preprocessor_.Directive(TokenizePreprocessorDirective(), this);
109 return;
110 case LineClassification::Kind::CompilerDirective:
// Remember the sentinel for the duration of this statement; it is reset to
// nullptr at the end of this function.
111 directiveSentinel_ = line.sentinel;
112 CHECK(InCompilerDirective());
113 BeginSourceLineAndAdvance();
114 if (inFixedForm_) {
115 CHECK(IsFixedFormCommentChar(*at_));
116 } else {
117 while (*at_ == ' ' || *at_ == '\t') {
118 ++at_, ++column_;
119 }
120 CHECK(*at_ == '!');
121 }
122 if (directiveSentinel_[0] == '$' && directiveSentinel_[1] == '\0') {
123 // OpenMP conditional compilation line. Remove the sentinel and then
124 // treat the line as if it were normal source.
125 at_ += 2, column_ += 2;
126 if (inFixedForm_) {
127 LabelField(tokens);
128 } else {
129 SkipSpaces();
130 }
131 } else {
132 // Compiler directive. Emit normalized sentinel.
133 EmitChar(tokens, '!');
134 ++at_, ++column_;
135 for (const char *sp{directiveSentinel_}; *sp != '\0';
136 ++sp, ++at_, ++column_) {
137 EmitChar(tokens, *sp);
138 }
139 if (*at_ == ' ') {
140 EmitChar(tokens, ' ');
141 ++at_, ++column_;
142 }
143 tokens.CloseToken();
144 }
145 break;
146 case LineClassification::Kind::Source:
147 BeginSourceLineAndAdvance();
148 if (inFixedForm_) {
149 LabelField(tokens);
150 } else if (skipLeadingAmpersand_) {
// skipLeadingAmpersand_ is set by SkipCommentLine(); it applies to one
// line only, so clear it before looking for the leading '&'.
151 skipLeadingAmpersand_ = false;
152 const char *p{SkipWhiteSpace(at_)};
153 if (p < limit_ && *p == '&') {
154 column_ += ++p - at_;
155 at_ = p;
156 }
157 } else {
158 SkipSpaces();
159 }
160 break;
161 }
162
// Tokenize the remainder of the statement; NextToken() returns false once
// the terminating newline is reached.
163 while (NextToken(tokens)) {
164 }
165
// Provenance attached to the newline that terminates the emitted statement.
166 Provenance newlineProvenance{GetCurrentProvenance()};
167 if (std::optional<TokenSequence> preprocessed{
168 preprocessor_.MacroReplacement(tokens, *this)}) {
169 // Reprocess the preprocessed line. Append a newline temporarily.
170 preprocessed->PutNextTokenChar('\n', newlineProvenance);
171 preprocessed->CloseToken();
172 const char *ppd{preprocessed->ToCharBlock().begin()};
// Classify the replacement text again; it is handled below according to
// the kind of line it has become.
173 LineClassification ppl{ClassifyLine(ppd)};
174 preprocessed->RemoveLastToken(); // remove the newline
175 switch (ppl.kind) {
176 case LineClassification::Kind::Comment:
177 break;
178 case LineClassification::Kind::IncludeLine:
179 FortranInclude(ppd + ppl.payloadOffset);
180 break;
181 case LineClassification::Kind::ConditionalCompilationDirective:
182 case LineClassification::Kind::IncludeDirective:
183 case LineClassification::Kind::DefinitionDirective:
184 case LineClassification::Kind::PreprocessorDirective:
185 Say(preprocessed->GetProvenanceRange(),
186 "Preprocessed line resembles a preprocessor directive"_en_US);
187 preprocessed->ToLowerCase().Emit(cooked_);
188 break;
189 case LineClassification::Kind::CompilerDirective:
190 if (preprocessed->HasRedundantBlanks()) {
191 preprocessed->RemoveRedundantBlanks();
192 }
193 NormalizeCompilerDirectiveCommentMarker(*preprocessed);
194 preprocessed->ToLowerCase();
195 SourceFormChange(preprocessed->ToString());
196 preprocessed->ClipComment(true /* skip first ! */).Emit(cooked_);
197 break;
198 case LineClassification::Kind::Source:
199 if (inFixedForm_) {
200 if (preprocessed->HasBlanks(/*after column*/ 6)) {
201 preprocessed->RemoveBlanks(/*after column*/ 6);
202 }
203 } else {
204 if (preprocessed->HasRedundantBlanks()) {
205 preprocessed->RemoveRedundantBlanks();
206 }
207 }
208 preprocessed->ToLowerCase().ClipComment().Emit(cooked_);
209 break;
210 }
211 } else {
// No macro replacement took place: emit the tokens as they were scanned,
// converted to lower case.
212 tokens.ToLowerCase();
213 if (line.kind == LineClassification::Kind::CompilerDirective) {
214 SourceFormChange(tokens.ToString());
215 }
216 tokens.Emit(cooked_);
217 }
// omitNewline_ is set in SkipCommentLine(); it suppresses this one newline
// and is then cleared.
218 if (omitNewline_) {
219 omitNewline_ = false;
220 } else {
221 cooked_.Put('\n', newlineProvenance);
222 }
223 directiveSentinel_ = nullptr;
224 }
225
TokenizePreprocessorDirective()226 TokenSequence Prescanner::TokenizePreprocessorDirective() {
227 CHECK(nextLine_ < limit_ && !inPreprocessorDirective_);
228 inPreprocessorDirective_ = true;
229 BeginSourceLineAndAdvance();
230 TokenSequence tokens;
231 while (NextToken(tokens)) {
232 }
233 inPreprocessorDirective_ = false;
234 return tokens;
235 }
236
NextLine()237 void Prescanner::NextLine() {
238 void *vstart{static_cast<void *>(const_cast<char *>(nextLine_))};
239 void *v{std::memchr(vstart, '\n', limit_ - nextLine_)};
240 if (!v) {
241 nextLine_ = limit_;
242 } else {
243 const char *nl{const_cast<const char *>(static_cast<char *>(v))};
244 nextLine_ = nl + 1;
245 }
246 }
247
LabelField(TokenSequence & token,int outCol)248 void Prescanner::LabelField(TokenSequence &token, int outCol) {
249 for (; *at_ != '\n' && column_ <= 6; ++at_) {
250 if (*at_ == '\t') {
251 ++at_;
252 column_ = 7;
253 break;
254 }
255 if (*at_ != ' ' &&
256 !(*at_ == '0' && column_ == 6)) { // '0' in column 6 becomes space
257 EmitChar(token, *at_);
258 ++outCol;
259 }
260 ++column_;
261 }
262 if (outCol > 1) {
263 token.CloseToken();
264 }
265 if (outCol < 7) {
266 if (outCol == 1) {
267 token.Put(" ", 6, sixSpaceProvenance_.start());
268 } else {
269 for (; outCol < 7; ++outCol) {
270 token.PutNextTokenChar(' ', spaceProvenance_);
271 }
272 token.CloseToken();
273 }
274 }
275 SkipToNextSignificantCharacter();
276 }
277
SkipToEndOfLine()278 void Prescanner::SkipToEndOfLine() {
279 while (*at_ != '\n') {
280 ++at_, ++column_;
281 }
282 }
283
MustSkipToEndOfLine() const284 bool Prescanner::MustSkipToEndOfLine() const {
285 if (inFixedForm_ && column_ > fixedFormColumnLimit_ && !tabInCurrentLine_) {
286 return true; // skip over ignored columns in right margin (73:80)
287 } else if (*at_ == '!' && !inCharLiteral_) {
288 return true; // inline comment goes to end of source line
289 } else {
290 return false;
291 }
292 }
293
NextChar()294 void Prescanner::NextChar() {
295 CHECK(*at_ != '\n');
296 ++at_, ++column_;
297 while (at_[0] == '\xef' && at_[1] == '\xbb' && at_[2] == '\xbf') {
298 // UTF-8 byte order mark - treat this file as UTF-8
299 at_ += 3;
300 encoding_ = Encoding::UTF_8;
301 }
302 SkipToNextSignificantCharacter();
303 }
304
305 // Skip everything that should be ignored until the next significant
306 // character is reached; handles C-style comments in preprocessing
307 // directives, Fortran ! comments, stuff after the right margin in
308 // fixed form, and all forms of line continuation.
SkipToNextSignificantCharacter()309 void Prescanner::SkipToNextSignificantCharacter() {
310 if (inPreprocessorDirective_) {
311 SkipCComments();
312 } else {
313 bool mightNeedSpace{false};
314 if (MustSkipToEndOfLine()) {
315 SkipToEndOfLine();
316 } else {
317 mightNeedSpace = *at_ == '\n';
318 }
319 for (; Continuation(mightNeedSpace); mightNeedSpace = false) {
320 if (MustSkipToEndOfLine()) {
321 SkipToEndOfLine();
322 }
323 }
324 if (*at_ == '\t') {
325 tabInCurrentLine_ = true;
326 }
327 }
328 }
329
SkipCComments()330 void Prescanner::SkipCComments() {
331 while (true) {
332 if (IsCComment(at_)) {
333 if (const char *after{SkipCComment(at_)}) {
334 column_ += after - at_;
335 // May have skipped over one or more newlines; relocate the start of
336 // the next line.
337 nextLine_ = at_ = after;
338 NextLine();
339 } else {
340 // Don't emit any messages about unclosed C-style comments, because
341 // the sequence /* can appear legally in a FORMAT statement. There's
342 // no ambiguity, since the sequence */ cannot appear legally.
343 break;
344 }
345 } else if (inPreprocessorDirective_ && at_[0] == '\\' && at_ + 2 < limit_ &&
346 at_[1] == '\n' && nextLine_ < limit_) {
347 BeginSourceLineAndAdvance();
348 } else {
349 break;
350 }
351 }
352 }
353
SkipSpaces()354 void Prescanner::SkipSpaces() {
355 while (*at_ == ' ' || *at_ == '\t') {
356 NextChar();
357 }
358 insertASpace_ = false;
359 }
360
SkipWhiteSpace(const char * p)361 const char *Prescanner::SkipWhiteSpace(const char *p) {
362 while (*p == ' ' || *p == '\t') {
363 ++p;
364 }
365 return p;
366 }
367
SkipWhiteSpaceAndCComments(const char * p) const368 const char *Prescanner::SkipWhiteSpaceAndCComments(const char *p) const {
369 while (true) {
370 if (*p == ' ' || *p == '\t') {
371 ++p;
372 } else if (IsCComment(p)) {
373 if (const char *after{SkipCComment(p)}) {
374 p = after;
375 } else {
376 break;
377 }
378 } else {
379 break;
380 }
381 }
382 return p;
383 }
384
SkipCComment(const char * p) const385 const char *Prescanner::SkipCComment(const char *p) const {
386 char star{' '}, slash{' '};
387 p += 2;
388 while (star != '*' || slash != '/') {
389 if (p >= limit_) {
390 return nullptr; // signifies an unterminated comment
391 }
392 star = slash;
393 slash = *p++;
394 }
395 return p;
396 }
397
// Scans the next token at at_ and appends it to `tokens`.
// Returns false, without emitting a token, when the current position is
// the newline that ends the statement; returns true after a token (which
// may be a single compressed space in free form) has been closed.
NextToken(TokenSequence & tokens)398 bool Prescanner::NextToken(TokenSequence &tokens) {
399 CHECK(at_ >= start_ && at_ < limit_);
400 if (InFixedFormSource()) {
401 SkipSpaces();
402 } else {
403 if (*at_ == '/' && IsCComment(at_)) {
404 // Recognize and skip over classic C style /*comments*/ when
405 // outside a character literal.
406 if (features_.ShouldWarn(LanguageFeature::ClassicCComments)) {
407 Say(GetProvenance(at_), "nonstandard usage: C-style comment"_en_US);
408 }
409 SkipCComments();
410 }
411 if (*at_ == ' ' || *at_ == '\t') {
412 // Compress free-form white space into a single space character.
413 const auto theSpace{at_};
414 char previous{at_ <= start_ ? ' ' : at_[-1]};
415 NextChar();
416 SkipSpaces();
417 if (*at_ == '\n') {
418 // Discard white space at the end of a line.
419 } else if (!inPreprocessorDirective_ &&
420 (previous == '(' || *at_ == '(' || *at_ == ')')) {
421 // Discard white space before/after '(' and before ')', unless in a
422 // preprocessor directive. This helps yield space-free contiguous
423 // names for generic interfaces like OPERATOR( + ) and
424 // READ ( UNFORMATTED ), without misinterpreting #define f (notAnArg).
425 // This has the effect of silently ignoring the illegal spaces in
426 // the array constructor ( /1,2/ ) but that seems benign; it's
427 // hard to avoid that while still removing spaces from OPERATOR( / )
428 // and OPERATOR( // ).
429 } else {
430 // Preserve the squashed white space as a single space character.
431 tokens.PutNextTokenChar(' ', GetProvenance(theSpace));
432 tokens.CloseToken();
433 return true;
434 }
435 }
436 }
// A space requested by continuation handling (insertASpace_) becomes the
// first character of the token that follows.
437 if (insertASpace_) {
438 tokens.PutNextTokenChar(' ', spaceProvenance_);
439 insertASpace_ = false;
440 }
441 if (*at_ == '\n') {
442 return false;
443 }
444 const char *start{at_};
445 if (*at_ == '\'' || *at_ == '"') {
446 QuotedCharacterLiteral(tokens, start);
447 preventHollerith_ = false;
448 } else if (IsDecimalDigit(*at_)) {
// A digit string: its value n is accumulated (while below maxHollerith) in
// case it turns out to be the length prefix of a Hollerith literal.
449 int n{0}, digits{0};
450 static constexpr int maxHollerith{256 /*lines*/ * (132 - 6 /*columns*/)};
451 do {
452 if (n < maxHollerith) {
453 n = 10 * n + DecimalDigitValue(*at_);
454 }
455 EmitCharAndAdvance(tokens, *at_);
456 ++digits;
457 if (InFixedFormSource()) {
458 SkipSpaces();
459 }
460 } while (IsDecimalDigit(*at_));
461 if ((*at_ == 'h' || *at_ == 'H') && n > 0 && n < maxHollerith &&
462 !preventHollerith_) {
463 Hollerith(tokens, n, start);
464 } else if (*at_ == '.') {
465 while (IsDecimalDigit(EmitCharAndAdvance(tokens, *at_))) {
466 }
467 ExponentAndKind(tokens);
468 } else if (ExponentAndKind(tokens)) {
469 } else if (digits == 1 && n == 0 && (*at_ == 'x' || *at_ == 'X') &&
470 inPreprocessorDirective_) {
// "0x..." hexadecimal constant, recognized only in preprocessor directives.
471 do {
472 EmitCharAndAdvance(tokens, *at_);
473 } while (IsHexadecimalDigit(*at_));
474 } else if (IsLetter(*at_)) {
475 // Handles FORMAT(3I9HHOLLERITH) by skipping over the first I so that
476 // we don't misrecognize I9HOLLERITH as an identifier in the next case.
477 EmitCharAndAdvance(tokens, *at_);
478 } else if (at_[0] == '_' && (at_[1] == '\'' || at_[1] == '"')) {
// Digits, an underscore, and a quote: the quoted literal joins this token.
479 EmitCharAndAdvance(tokens, *at_);
480 QuotedCharacterLiteral(tokens, start);
481 }
482 preventHollerith_ = false;
483 } else if (*at_ == '.') {
484 char nch{EmitCharAndAdvance(tokens, '.')};
485 if (!inPreprocessorDirective_ && IsDecimalDigit(nch)) {
486 while (IsDecimalDigit(EmitCharAndAdvance(tokens, *at_))) {
487 }
488 ExponentAndKind(tokens);
489 } else if (nch == '.' && EmitCharAndAdvance(tokens, '.') == '.') {
490 EmitCharAndAdvance(tokens, '.'); // variadic macro definition ellipsis
491 }
492 preventHollerith_ = false;
493 } else if (IsLegalInIdentifier(*at_)) {
494 do {
495 } while (IsLegalInIdentifier(EmitCharAndAdvance(tokens, *at_)));
496 if (*at_ == '\'' || *at_ == '"') {
// An identifier immediately followed by a quote: the quoted literal is
// scanned as part of the same token.
497 QuotedCharacterLiteral(tokens, start);
498 preventHollerith_ = false;
499 } else {
500 // Subtle: Don't misrecognize labeled DO statement label as Hollerith
501 // when the loop control variable starts with 'H'.
502 preventHollerith_ = true;
503 }
504 } else if (*at_ == '*') {
505 if (EmitCharAndAdvance(tokens, '*') == '*') {
506 EmitCharAndAdvance(tokens, '*');
507 } else {
508 // Subtle ambiguity:
509 // CHARACTER*2H declares H because *2 is a kind specifier
510 // DATAC/N*2H / is repeated Hollerith
511 preventHollerith_ = !slashInCurrentLine_;
512 }
513 } else {
// Any other character: track bracket nesting (consulted by the
// continuation-line routines) and combine two-character operators.
514 char ch{*at_};
515 if (ch == '(' || ch == '[') {
516 ++delimiterNesting_;
517 } else if ((ch == ')' || ch == ']') && delimiterNesting_ > 0) {
518 --delimiterNesting_;
519 }
520 char nch{EmitCharAndAdvance(tokens, ch)};
521 preventHollerith_ = false;
522 if ((nch == '=' &&
523 (ch == '<' || ch == '>' || ch == '/' || ch == '=' || ch == '!')) ||
524 (ch == nch &&
525 (ch == '/' || ch == ':' || ch == '*' || ch == '#' || ch == '&' ||
526 ch == '|' || ch == '<' || ch == '>')) ||
527 (ch == '=' && nch == '>')) {
528 // token comprises two characters
529 EmitCharAndAdvance(tokens, nch);
530 } else if (ch == '/') {
531 slashInCurrentLine_ = true;
532 }
533 }
534 tokens.CloseToken();
535 return true;
536 }
537
ExponentAndKind(TokenSequence & tokens)538 bool Prescanner::ExponentAndKind(TokenSequence &tokens) {
539 char ed{ToLowerCaseLetter(*at_)};
540 if (ed != 'e' && ed != 'd') {
541 return false;
542 }
543 EmitCharAndAdvance(tokens, ed);
544 if (*at_ == '+' || *at_ == '-') {
545 EmitCharAndAdvance(tokens, *at_);
546 }
547 while (IsDecimalDigit(*at_)) {
548 EmitCharAndAdvance(tokens, *at_);
549 }
550 if (*at_ == '_') {
551 while (IsLegalInIdentifier(EmitCharAndAdvance(tokens, *at_))) {
552 }
553 }
554 return true;
555 }
556
// Scans a quoted character literal whose opening quote is at at_ and
// appends it to `tokens`.  `start` is where the enclosing token began; it
// is used only for the range of the "Incomplete character literal" error.
// inCharLiteral_ is true while inside the literal and false on return.
QuotedCharacterLiteral(TokenSequence & tokens,const char * start)557 void Prescanner::QuotedCharacterLiteral(
558 TokenSequence &tokens, const char *start) {
559 char quote{*at_};
// `end` tracks one past the last character consumed so far, for the error
// message range.
560 const char *end{at_ + 1};
561 inCharLiteral_ = true;
562 const auto emit{[&](char ch) { EmitChar(tokens, ch); }};
563 const auto insert{[&](char ch) { EmitInsertedChar(tokens, ch); }};
564 bool isEscaped{false};
565 bool escapesEnabled{features_.IsEnabled(LanguageFeature::BackslashEscapes)};
566 while (true) {
567 if (*at_ == '\\') {
568 if (escapesEnabled) {
569 isEscaped = !isEscaped;
570 } else {
571 // The parser always processes escape sequences, so don't confuse it
572 // when escapes are disabled.
573 insert('\\');
574 }
575 } else {
576 isEscaped = false;
577 }
578 EmitQuotedChar(static_cast<unsigned char>(*at_), emit, insert, false,
579 Encoding::LATIN_1);
580 while (PadOutCharacterLiteral(tokens)) {
581 }
582 if (*at_ == '\n') {
// The line ended before a closing quote; no message is issued inside a
// preprocessor directive.
583 if (!inPreprocessorDirective_) {
584 Say(GetProvenanceRange(start, end),
585 "Incomplete character literal"_err_en_US);
586 }
587 break;
588 }
589 end = at_ + 1;
590 NextChar();
591 if (*at_ == quote && !isEscaped) {
592 // A doubled unescaped quote mark becomes a single instance of that
593 // quote character in the literal (later). There can be spaces between
594 // the quotes in fixed form source.
595 EmitChar(tokens, quote);
596 inCharLiteral_ = false; // for cases like print *, '...'!comment
597 NextChar();
598 if (InFixedFormSource()) {
599 SkipSpaces();
600 }
601 if (*at_ != quote) {
602 break;
603 }
604 inCharLiteral_ = true;
605 }
606 }
607 inCharLiteral_ = false;
608 }
609
// Scans a Hollerith literal of `count` characters into `tokens`.  On entry
// at_ is at the 'h'/'H' that follows the length digits (checked); an
// upper-case 'H' is emitted in its place.  `start` is where the token
// began and is used for message ranges.  inCharLiteral_ is true while
// scanning and false on return.
Hollerith(TokenSequence & tokens,int count,const char * start)610 void Prescanner::Hollerith(
611 TokenSequence &tokens, int count, const char *start) {
612 inCharLiteral_ = true;
613 CHECK(*at_ == 'h' || *at_ == 'H');
614 EmitChar(tokens, 'H');
615 while (count-- > 0) {
// A padding blank supplied by PadOutCharacterLiteral() counts as one of
// the `count` characters.
616 if (PadOutCharacterLiteral(tokens)) {
617 } else if (*at_ == '\n') {
618 Say(GetProvenanceRange(start, at_),
619 "Possible truncated Hollerith literal"_en_US);
620 break;
621 } else {
622 NextChar();
623 // Each multi-byte character encoding counts as a single character.
624 // No escape sequences are recognized.
625 // Hollerith is always emitted to the cooked character
626 // stream in UTF-8.
627 DecodedCharacter decoded{DecodeCharacter(
628 encoding_, at_, static_cast<std::size_t>(limit_ - at_), false)};
629 if (decoded.bytes > 0) {
630 EncodedCharacter utf8{
631 EncodeCharacter<Encoding::UTF_8>(decoded.codepoint)};
632 for (int j{0}; j < utf8.bytes; ++j) {
633 EmitChar(tokens, utf8.buffer[j]);
634 }
// Leave at_ on the last byte of this character; the next NextChar() call
// steps past it.
635 at_ += decoded.bytes - 1;
636 } else {
637 Say(GetProvenanceRange(start, at_),
638 "Bad character in Hollerith literal"_err_en_US);
639 break;
640 }
641 }
642 }
// Step past the final character of the literal unless the line has ended.
643 if (*at_ != '\n') {
644 NextChar();
645 }
646 inCharLiteral_ = false;
647 }
648
649 // In fixed form, source card images must be processed as if they were at
650 // least 72 columns wide, at least in character literal contexts.
// Called while scanning a character or Hollerith literal.  Acts only in
// fixed form, on a line without tabs, when the character after at_ is the
// newline.  If the line is shorter than fixedFormColumnLimit_, one blank
// is appended to `tokens`, column_ is advanced, and true is returned;
// callers invoke this repeatedly.  Once the limit is reached, a
// continuation line is sought.  Returns false when no padding was
// produced.
PadOutCharacterLiteral(TokenSequence & tokens)651 bool Prescanner::PadOutCharacterLiteral(TokenSequence &tokens) {
652 while (inFixedForm_ && !tabInCurrentLine_ && at_[1] == '\n') {
653 if (column_ < fixedFormColumnLimit_) {
654 tokens.PutNextTokenChar(' ', spaceProvenance_);
655 ++column_;
656 return true;
657 }
658 if (!FixedFormContinuation(false /*no need to insert space*/) ||
659 tabInCurrentLine_) {
660 return false;
661 }
// FixedFormContinuation() left at_ at column 7 of the continuation line;
// back up one position so that the caller's next advance lands on it.
662 CHECK(column_ == 7);
663 --at_; // point to column 6 of continuation line
664 column_ = 6;
665 }
666 return false;
667 }
668
IsFixedFormCommentLine(const char * start) const669 bool Prescanner::IsFixedFormCommentLine(const char *start) const {
670 const char *p{start};
671 if (IsFixedFormCommentChar(*p) || *p == '%' || // VAX %list, %eject, &c.
672 ((*p == 'D' || *p == 'd') &&
673 !features_.IsEnabled(LanguageFeature::OldDebugLines))) {
674 return true;
675 }
676 bool anyTabs{false};
677 while (true) {
678 if (*p == ' ') {
679 ++p;
680 } else if (*p == '\t') {
681 anyTabs = true;
682 ++p;
683 } else if (*p == '0' && !anyTabs && p == start + 5) {
684 ++p; // 0 in column 6 must treated as a space
685 } else {
686 break;
687 }
688 }
689 if (!anyTabs && p >= start + fixedFormColumnLimit_) {
690 return true;
691 }
692 if (*p == '!' && !inCharLiteral_ && (anyTabs || p != start + 5)) {
693 return true;
694 }
695 return *p == '\n';
696 }
697
IsFreeFormComment(const char * p) const698 const char *Prescanner::IsFreeFormComment(const char *p) const {
699 p = SkipWhiteSpaceAndCComments(p);
700 if (*p == '!' || *p == '\n') {
701 return p;
702 } else {
703 return nullptr;
704 }
705 }
706
IsIncludeLine(const char * start) const707 std::optional<std::size_t> Prescanner::IsIncludeLine(const char *start) const {
708 const char *p{SkipWhiteSpace(start)};
709 for (char ch : "include"s) {
710 if (ToLowerCaseLetter(*p++) != ch) {
711 return std::nullopt;
712 }
713 }
714 p = SkipWhiteSpace(p);
715 if (*p == '"' || *p == '\'') {
716 return {p - start};
717 }
718 return std::nullopt;
719 }
720
FortranInclude(const char * firstQuote)721 void Prescanner::FortranInclude(const char *firstQuote) {
722 const char *p{firstQuote};
723 while (*p != '"' && *p != '\'') {
724 ++p;
725 }
726 char quote{*p};
727 std::string path;
728 for (++p; *p != '\n'; ++p) {
729 if (*p == quote) {
730 if (p[1] != quote) {
731 break;
732 }
733 ++p;
734 }
735 path += *p;
736 }
737 if (*p != quote) {
738 Say(GetProvenanceRange(firstQuote, p),
739 "malformed path name string"_err_en_US);
740 return;
741 }
742 p = SkipWhiteSpace(p + 1);
743 if (*p != '\n' && *p != '!') {
744 const char *garbage{p};
745 for (; *p != '\n' && *p != '!'; ++p) {
746 }
747 Say(GetProvenanceRange(garbage, p),
748 "excess characters after path name"_en_US);
749 }
750 std::string buf;
751 llvm::raw_string_ostream error{buf};
752 Provenance provenance{GetProvenance(nextLine_)};
753 AllSources &allSources{cooked_.allSources()};
754 const SourceFile *currentFile{allSources.GetSourceFile(provenance)};
755 if (currentFile) {
756 allSources.PushSearchPathDirectory(DirectoryName(currentFile->path()));
757 }
758 const SourceFile *included{allSources.Open(path, error)};
759 if (currentFile) {
760 allSources.PopSearchPathDirectory();
761 }
762 if (!included) {
763 Say(provenance, "INCLUDE: %s"_err_en_US, error.str());
764 } else if (included->bytes() > 0) {
765 ProvenanceRange includeLineRange{
766 provenance, static_cast<std::size_t>(p - nextLine_)};
767 ProvenanceRange fileRange{
768 allSources.AddIncludedFile(*included, includeLineRange)};
769 Prescanner{*this}.set_encoding(included->encoding()).Prescan(fileRange);
770 }
771 }
772
IsPreprocessorDirectiveLine(const char * start) const773 const char *Prescanner::IsPreprocessorDirectiveLine(const char *start) const {
774 const char *p{start};
775 for (; *p == ' '; ++p) {
776 }
777 if (*p == '#') {
778 if (inFixedForm_ && p == start + 5) {
779 return nullptr;
780 }
781 } else {
782 p = SkipWhiteSpace(p);
783 if (*p != '#') {
784 return nullptr;
785 }
786 }
787 return SkipWhiteSpace(p + 1);
788 }
789
IsNextLinePreprocessorDirective() const790 bool Prescanner::IsNextLinePreprocessorDirective() const {
791 return IsPreprocessorDirectiveLine(nextLine_) != nullptr;
792 }
793
SkipCommentLine(bool afterAmpersand)794 bool Prescanner::SkipCommentLine(bool afterAmpersand) {
795 if (nextLine_ >= limit_) {
796 if (afterAmpersand && prescannerNesting_ > 0) {
797 // A continuation marker at the end of the last line in an
798 // include file inhibits the newline for that line.
799 SkipToEndOfLine();
800 omitNewline_ = true;
801 }
802 return false;
803 }
804 auto lineClass{ClassifyLine(nextLine_)};
805 if (lineClass.kind == LineClassification::Kind::Comment) {
806 NextLine();
807 return true;
808 } else if (inPreprocessorDirective_) {
809 return false;
810 } else if (lineClass.kind ==
811 LineClassification::Kind::ConditionalCompilationDirective ||
812 lineClass.kind == LineClassification::Kind::PreprocessorDirective) {
813 // Allow conditional compilation directives (e.g., #ifdef) to affect
814 // continuation lines.
815 // Allow other preprocessor directives, too, except #include
816 // (when it does not follow '&'), #define, and #undef (because
817 // they cannot be allowed to affect preceding text on a
818 // continued line).
819 preprocessor_.Directive(TokenizePreprocessorDirective(), this);
820 return true;
821 } else if (afterAmpersand &&
822 (lineClass.kind == LineClassification::Kind::IncludeDirective ||
823 lineClass.kind == LineClassification::Kind::IncludeLine)) {
824 SkipToEndOfLine();
825 omitNewline_ = true;
826 skipLeadingAmpersand_ = true;
827 return false;
828 } else {
829 return false;
830 }
831 }
832
// Tests whether the line at nextLine_ continues the current fixed-form
// statement.  Returns the address at which scanning should resume on that
// line, or nullptr when it is not a continuation line (or no line
// remains).  Side effects: clears tabInCurrentLine_ (it is set again for
// the tab-digit form) and may set insertASpace_ when `mightNeedSpace` is
// true inside a compiler directive.
FixedFormContinuationLine(bool mightNeedSpace)833 const char *Prescanner::FixedFormContinuationLine(bool mightNeedSpace) {
834 if (nextLine_ >= limit_) {
835 return nullptr;
836 }
837 tabInCurrentLine_ = false;
838 char col1{*nextLine_};
839 if (InCompilerDirective()) {
840 // Must be a continued compiler directive.
841 if (!IsFixedFormCommentChar(col1)) {
842 return nullptr;
843 }
// Columns 2-5 must repeat the current directive sentinel (source text is
// lower-cased for the comparison) ...
844 int j{1};
845 for (; j < 5; ++j) {
846 char ch{directiveSentinel_[j - 1]};
847 if (ch == '\0') {
848 break;
849 }
850 if (ch != ToLowerCaseLetter(nextLine_[j])) {
851 return nullptr;
852 }
853 }
// ... and any remaining columns up to column 5 must be blank.
854 for (; j < 5; ++j) {
855 if (nextLine_[j] != ' ') {
856 return nullptr;
857 }
858 }
// Column 6 marks a continuation when it is not a newline, tab, blank, or
// '0'.
859 char col6{nextLine_[5]};
860 if (col6 != '\n' && col6 != '\t' && col6 != ' ' && col6 != '0') {
861 if (nextLine_[6] != ' ' && mightNeedSpace) {
862 insertASpace_ = true;
863 }
864 return nextLine_ + 6;
865 }
866 return nullptr;
867 } else {
868 // Normal case: not in a compiler directive.
869 if (col1 == '&' &&
870 features_.IsEnabled(
871 LanguageFeature::FixedFormContinuationWithColumn1Ampersand)) {
872 // Extension: '&' as continuation marker
873 if (features_.ShouldWarn(
874 LanguageFeature::FixedFormContinuationWithColumn1Ampersand)) {
875 Say(GetProvenance(nextLine_), "nonstandard usage"_en_US);
876 }
877 return nextLine_ + 1;
878 }
879 if (col1 == '\t' && nextLine_[1] >= '1' && nextLine_[1] <= '9') {
880 tabInCurrentLine_ = true;
881 return nextLine_ + 2; // VAX extension
882 }
// Standard form: five blanks followed by a continuation character in
// column 6.
883 if (col1 == ' ' && nextLine_[1] == ' ' && nextLine_[2] == ' ' &&
884 nextLine_[3] == ' ' && nextLine_[4] == ' ') {
885 char col6{nextLine_[5]};
886 if (col6 != '\n' && col6 != '\t' && col6 != ' ' && col6 != '0') {
887 return nextLine_ + 6;
888 }
889 }
// While inside unclosed parentheses or brackets, a line whose column 1 is
// not a comment character is accepted as a continuation starting at
// column 1.
890 if (delimiterNesting_ > 0) {
891 if (!IsFixedFormCommentChar(col1)) {
892 return nextLine_;
893 }
894 }
895 }
896 return nullptr; // not a continuation line
897 }
898
// Tests whether the line at nextLine_ continues the current free-form
// statement.  `ampersand` says whether the line being continued ended with
// '&'.  Returns the address at which scanning should resume on that line,
// or nullptr when it is not a continuation line (or no line remains).  May
// set insertASpace_.
FreeFormContinuationLine(bool ampersand)899 const char *Prescanner::FreeFormContinuationLine(bool ampersand) {
900 const char *p{nextLine_};
901 if (p >= limit_) {
902 return nullptr;
903 }
904 p = SkipWhiteSpace(p);
905 if (InCompilerDirective()) {
// A continued directive line must begin with '!' followed by the same
// sentinel (source text is lower-cased for the comparison).
906 if (*p++ != '!') {
907 return nullptr;
908 }
909 for (const char *s{directiveSentinel_}; *s != '\0'; ++p, ++s) {
910 if (*s != ToLowerCaseLetter(*p)) {
911 return nullptr;
912 }
913 }
914 p = SkipWhiteSpace(p);
915 if (*p == '&') {
916 if (!ampersand) {
917 insertASpace_ = true;
918 }
919 return p + 1;
920 } else if (ampersand) {
921 return p;
922 } else {
923 return nullptr;
924 }
925 } else {
926 if (*p == '&') {
927 return p + 1;
928 } else if (*p == '!' || *p == '\n' || *p == '#') {
929 return nullptr;
930 } else if (ampersand || delimiterNesting_ > 0) {
// No leading '&' on this line: resume one position before the first
// non-blank character when the line is indented, or request an inserted
// space when it is not.
931 if (p > nextLine_) {
932 --p;
933 } else {
934 insertASpace_ = true;
935 }
936 return p;
937 } else {
938 return nullptr;
939 }
940 }
941 }
942
FixedFormContinuation(bool mightNeedSpace)943 bool Prescanner::FixedFormContinuation(bool mightNeedSpace) {
944 // N.B. We accept '&' as a continuation indicator in fixed form, too,
945 // but not in a character literal.
946 if (*at_ == '&' && inCharLiteral_) {
947 return false;
948 }
949 do {
950 if (const char *cont{FixedFormContinuationLine(mightNeedSpace)}) {
951 BeginSourceLine(cont);
952 column_ = 7;
953 NextLine();
954 return true;
955 }
956 } while (SkipCommentLine(false /* not after ampersand */));
957 return false;
958 }
959
FreeFormContinuation()960 bool Prescanner::FreeFormContinuation() {
961 const char *p{at_};
962 bool ampersand{*p == '&'};
963 if (ampersand) {
964 p = SkipWhiteSpace(p + 1);
965 }
966 if (*p != '\n') {
967 if (inCharLiteral_) {
968 return false;
969 } else if (*p != '!' &&
970 features_.ShouldWarn(LanguageFeature::CruftAfterAmpersand)) {
971 Say(GetProvenance(p), "missing ! before comment after &"_en_US);
972 }
973 }
974 do {
975 if (const char *cont{FreeFormContinuationLine(ampersand)}) {
976 BeginSourceLine(cont);
977 NextLine();
978 return true;
979 }
980 } while (SkipCommentLine(ampersand));
981 return false;
982 }
983
Continuation(bool mightNeedFixedFormSpace)984 bool Prescanner::Continuation(bool mightNeedFixedFormSpace) {
985 if (*at_ == '\n' || *at_ == '&') {
986 if (inFixedForm_) {
987 return FixedFormContinuation(mightNeedFixedFormSpace);
988 } else {
989 return FreeFormContinuation();
990 }
991 } else {
992 return false;
993 }
994 }
995
// Tests whether the fixed-form line at `start` is the initial line of a
// compiler directive.  Column 1 must be a comment character; the non-blank
// characters of columns 2-5 are gathered, lower-cased, as the candidate
// sentinel; and when all of columns 2-5 were examined, column 6 must be a
// blank, tab, or '0'.  If the candidate is a registered sentinel, returns
// a CompilerDirective classification whose payload offset is the distance
// from `start` to where scanning stopped; otherwise std::nullopt.
996 std::optional<Prescanner::LineClassification>
IsFixedFormCompilerDirectiveLine(const char * start) const997 Prescanner::IsFixedFormCompilerDirectiveLine(const char *start) const {
998 const char *p{start};
999 char col1{*p++};
1000 if (!IsFixedFormCommentChar(col1)) {
1001 return std::nullopt;
1002 }
// At most four sentinel characters (columns 2-5) plus the terminating NUL.
1003 char sentinel[5], *sp{sentinel};
1004 int column{2};
1005 for (; column < 6; ++column, ++p) {
1006 if (*p != ' ') {
1007 if (*p == '\n' || *p == '\t') {
1008 break;
1009 }
1010 if (sp == sentinel + 1 && sentinel[0] == '$' && IsDecimalDigit(*p)) {
1011 // OpenMP conditional compilation line: leave the label alone
1012 break;
1013 }
1014 *sp++ = ToLowerCaseLetter(*p);
1015 }
1016 }
1017 if (column == 6) {
1018 if (*p == ' ' || *p == '\t' || *p == '0') {
1019 ++p;
1020 } else {
1021 // This is a Continuation line, not an initial directive line.
1022 return std::nullopt;
1023 }
1024 }
// No sentinel characters were found at all.
1025 if (sp == sentinel) {
1026 return std::nullopt;
1027 }
1028 *sp = '\0';
1029 if (const char *ss{IsCompilerDirectiveSentinel(sentinel)}) {
1030 std::size_t payloadOffset = p - start;
1031 return {LineClassification{
1032 LineClassification::Kind::CompilerDirective, payloadOffset, ss}};
1033 }
1034 return std::nullopt;
1035 }
1036
1037 std::optional<Prescanner::LineClassification>
IsFreeFormCompilerDirectiveLine(const char * start) const1038 Prescanner::IsFreeFormCompilerDirectiveLine(const char *start) const {
1039 char sentinel[8];
1040 const char *p{SkipWhiteSpace(start)};
1041 if (*p++ != '!') {
1042 return std::nullopt;
1043 }
1044 for (std::size_t j{0}; j + 1 < sizeof sentinel; ++p, ++j) {
1045 if (*p == '\n') {
1046 break;
1047 }
1048 if (*p == ' ' || *p == '\t' || *p == '&') {
1049 if (j == 0) {
1050 break;
1051 }
1052 sentinel[j] = '\0';
1053 p = SkipWhiteSpace(p + 1);
1054 if (*p == '!') {
1055 break;
1056 }
1057 if (const char *sp{IsCompilerDirectiveSentinel(sentinel)}) {
1058 std::size_t offset = p - start;
1059 return {LineClassification{
1060 LineClassification::Kind::CompilerDirective, offset, sp}};
1061 }
1062 break;
1063 }
1064 sentinel[j] = ToLowerCaseLetter(*p);
1065 }
1066 return std::nullopt;
1067 }
1068
AddCompilerDirectiveSentinel(const std::string & dir)1069 Prescanner &Prescanner::AddCompilerDirectiveSentinel(const std::string &dir) {
1070 std::uint64_t packed{0};
1071 for (char ch : dir) {
1072 packed = (packed << 8) | (ToLowerCaseLetter(ch) & 0xff);
1073 }
1074 compilerDirectiveBloomFilter_.set(packed % prime1);
1075 compilerDirectiveBloomFilter_.set(packed % prime2);
1076 compilerDirectiveSentinels_.insert(dir);
1077 return *this;
1078 }
1079
IsCompilerDirectiveSentinel(const char * sentinel) const1080 const char *Prescanner::IsCompilerDirectiveSentinel(
1081 const char *sentinel) const {
1082 std::uint64_t packed{0};
1083 std::size_t n{0};
1084 for (; sentinel[n] != '\0'; ++n) {
1085 packed = (packed << 8) | (sentinel[n] & 0xff);
1086 }
1087 if (n == 0 || !compilerDirectiveBloomFilter_.test(packed % prime1) ||
1088 !compilerDirectiveBloomFilter_.test(packed % prime2)) {
1089 return nullptr;
1090 }
1091 const auto iter{compilerDirectiveSentinels_.find(std::string(sentinel, n))};
1092 return iter == compilerDirectiveSentinels_.end() ? nullptr : iter->c_str();
1093 }
1094
ClassifyLine(const char * start) const1095 Prescanner::LineClassification Prescanner::ClassifyLine(
1096 const char *start) const {
1097 if (inFixedForm_) {
1098 if (std::optional<LineClassification> lc{
1099 IsFixedFormCompilerDirectiveLine(start)}) {
1100 return std::move(*lc);
1101 }
1102 if (IsFixedFormCommentLine(start)) {
1103 return {LineClassification::Kind::Comment};
1104 }
1105 } else {
1106 if (std::optional<LineClassification> lc{
1107 IsFreeFormCompilerDirectiveLine(start)}) {
1108 return std::move(*lc);
1109 }
1110 if (const char *bang{IsFreeFormComment(start)}) {
1111 return {LineClassification::Kind::Comment,
1112 static_cast<std::size_t>(bang - start)};
1113 }
1114 }
1115 if (std::optional<std::size_t> quoteOffset{IsIncludeLine(start)}) {
1116 return {LineClassification::Kind::IncludeLine, *quoteOffset};
1117 }
1118 if (const char *dir{IsPreprocessorDirectiveLine(start)}) {
1119 if (std::memcmp(dir, "if", 2) == 0 || std::memcmp(dir, "elif", 4) == 0 ||
1120 std::memcmp(dir, "else", 4) == 0 || std::memcmp(dir, "endif", 5) == 0) {
1121 return {LineClassification::Kind::ConditionalCompilationDirective};
1122 } else if (std::memcmp(dir, "include", 7) == 0) {
1123 return {LineClassification::Kind::IncludeDirective};
1124 } else if (std::memcmp(dir, "define", 6) == 0 ||
1125 std::memcmp(dir, "undef", 5) == 0) {
1126 return {LineClassification::Kind::DefinitionDirective};
1127 } else {
1128 return {LineClassification::Kind::PreprocessorDirective};
1129 }
1130 }
1131 return {LineClassification::Kind::Source};
1132 }
1133
SourceFormChange(std::string && dir)1134 void Prescanner::SourceFormChange(std::string &&dir) {
1135 if (dir == "!dir$ free") {
1136 inFixedForm_ = false;
1137 } else if (dir == "!dir$ fixed") {
1138 inFixedForm_ = true;
1139 }
1140 }
1141 } // namespace Fortran::parser
1142