1 //===-- lib/Parser/prescan.cpp --------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8
9 #include "prescan.h"
10 #include "preprocessor.h"
11 #include "token-sequence.h"
12 #include "flang/Common/idioms.h"
13 #include "flang/Parser/characters.h"
14 #include "flang/Parser/message.h"
15 #include "flang/Parser/source.h"
16 #include "llvm/Support/raw_ostream.h"
17 #include <cstddef>
18 #include <cstring>
19 #include <utility>
20 #include <vector>
21
22 namespace Fortran::parser {
23
24 using common::LanguageFeature;
25
// Limit on prescannerNesting_.  Prescan() emits an error and stops when the
// nesting depth of a prescanner exceeds this value, which guards against
// circular INCLUDE/#include chains.
static constexpr int maxPrescannerNesting{100};
27
Prescanner(Messages & messages,CookedSource & cooked,Preprocessor & preprocessor,common::LanguageFeatureControl lfc)28 Prescanner::Prescanner(Messages &messages, CookedSource &cooked,
29 Preprocessor &preprocessor, common::LanguageFeatureControl lfc)
30 : messages_{messages}, cooked_{cooked}, preprocessor_{preprocessor},
31 allSources_{preprocessor_.allSources()}, features_{lfc},
32 encoding_{allSources_.encoding()} {}
33
// Copy constructor.  Only the members named below are taken from `that`;
// the nesting depth of the copy is one greater than the original's, so that
// Prescan() can detect runaway (possibly circular) inclusion.
Prescanner::Prescanner(const Prescanner &that)
    : messages_{that.messages_},
      cooked_{that.cooked_},
      preprocessor_{that.preprocessor_},
      allSources_{that.allSources_},
      features_{that.features_},
      inFixedForm_{that.inFixedForm_},
      fixedFormColumnLimit_{that.fixedFormColumnLimit_},
      encoding_{that.encoding_},
      prescannerNesting_{that.prescannerNesting_ + 1},
      skipLeadingAmpersand_{that.skipLeadingAmpersand_},
      compilerDirectiveBloomFilter_{that.compilerDirectiveBloomFilter_},
      compilerDirectiveSentinels_{that.compilerDirectiveSentinels_} {}
44
// True for the characters that this file accepts as a fixed-form comment
// marker: '!', '*', 'C', and 'c'.
static inline constexpr bool IsFixedFormCommentChar(char ch) {
  switch (ch) {
  case '!':
  case '*':
  case 'C':
  case 'c':
    return true;
  default:
    return false;
  }
}
48
NormalizeCompilerDirectiveCommentMarker(TokenSequence & dir)49 static void NormalizeCompilerDirectiveCommentMarker(TokenSequence &dir) {
50 char *p{dir.GetMutableCharData()};
51 char *limit{p + dir.SizeInChars()};
52 for (; p < limit; ++p) {
53 if (*p != ' ') {
54 CHECK(IsFixedFormCommentChar(*p));
55 *p = '!';
56 return;
57 }
58 }
59 DIE("compiler directive all blank");
60 }
61
Prescan(ProvenanceRange range)62 void Prescanner::Prescan(ProvenanceRange range) {
63 startProvenance_ = range.start();
64 start_ = allSources_.GetSource(range);
65 CHECK(start_);
66 limit_ = start_ + range.size();
67 nextLine_ = start_;
68 const bool beganInFixedForm{inFixedForm_};
69 if (prescannerNesting_ > maxPrescannerNesting) {
70 Say(GetProvenance(start_),
71 "too many nested INCLUDE/#include files, possibly circular"_err_en_US);
72 return;
73 }
74 while (!IsAtEnd()) {
75 Statement();
76 }
77 if (inFixedForm_ != beganInFixedForm) {
78 std::string dir{"!dir$ "};
79 if (beganInFixedForm) {
80 dir += "fixed";
81 } else {
82 dir += "free";
83 }
84 dir += '\n';
85 TokenSequence tokens{dir, allSources_.AddCompilerInsertion(dir).start()};
86 tokens.Emit(cooked_);
87 }
88 }
89
// Prescans the statement that begins at nextLine_.
//
// The first line is classified and handled by kind:
//  - comment lines are skipped;
//  - INCLUDE lines are passed to FortranInclude();
//  - preprocessor directives are tokenized and given to the preprocessor;
//  - compiler directives and ordinary source lines are tokenized here.
// For the last two kinds the tokens are then offered to the preprocessor for
// macro replacement.  When replacement produces a result, that result is
// classified again and emitted according to its new classification; otherwise
// the original tokens are emitted in lower case.  A newline is emitted to the
// cooked source afterwards unless omitNewline_ was set during scanning.
void Prescanner::Statement() {
  TokenSequence tokens;
  LineClassification line{ClassifyLine(nextLine_)};
  switch (line.kind) {
  case LineClassification::Kind::Comment:
    nextLine_ += line.payloadOffset; // advance to '!' or newline
    NextLine();
    return;
  case LineClassification::Kind::IncludeLine:
    // payloadOffset locates the quoted path (see IsIncludeLine()).
    FortranInclude(nextLine_ + line.payloadOffset);
    NextLine();
    return;
  case LineClassification::Kind::ConditionalCompilationDirective:
  case LineClassification::Kind::IncludeDirective:
  case LineClassification::Kind::DefinitionDirective:
  case LineClassification::Kind::PreprocessorDirective:
    // All '#' directives are handled by the preprocessor; nothing is
    // emitted from here.
    preprocessor_.Directive(TokenizePreprocessorDirective(), this);
    return;
  case LineClassification::Kind::CompilerDirective:
    // Remember the sentinel so that continuation lines can be matched
    // against it; it is cleared again at the end of this function.
    directiveSentinel_ = line.sentinel;
    CHECK(InCompilerDirective());
    BeginStatementAndAdvance();
    if (inFixedForm_) {
      CHECK(IsFixedFormCommentChar(*at_));
    } else {
      // Free form: the '!' may be preceded by white space.
      while (*at_ == ' ' || *at_ == '\t') {
        ++at_, ++column_;
      }
      CHECK(*at_ == '!');
    }
    if (directiveSentinel_[0] == '$' && directiveSentinel_[1] == '\0') {
      // OpenMP conditional compilation line.  Remove the sentinel and then
      // treat the line as if it were normal source.
      at_ += 2, column_ += 2;
      if (inFixedForm_) {
        LabelField(tokens);
      } else {
        SkipSpaces();
      }
    } else {
      // Compiler directive.  Emit normalized sentinel.
      EmitChar(tokens, '!');
      ++at_, ++column_;
      // Emit the canonical sentinel text while stepping over the same
      // number of characters in the source.
      for (const char *sp{directiveSentinel_}; *sp != '\0';
           ++sp, ++at_, ++column_) {
        EmitChar(tokens, *sp);
      }
      if (*at_ == ' ') {
        EmitChar(tokens, ' ');
        ++at_, ++column_;
      }
      tokens.CloseToken();
    }
    break;
  case LineClassification::Kind::Source:
    BeginStatementAndAdvance();
    if (inFixedForm_) {
      LabelField(tokens);
    } else if (skipLeadingAmpersand_) {
      // Set by SkipCommentLine() when an INCLUDE followed a trailing '&';
      // a leading '&' on this first line is consumed here.
      skipLeadingAmpersand_ = false;
      const char *p{SkipWhiteSpace(at_)};
      if (p < limit_ && *p == '&') {
        column_ += ++p - at_;
        at_ = p;
      }
    } else {
      SkipSpaces();
    }
    break;
  }

  // Tokenize the rest of the statement, including any continuation lines.
  while (NextToken(tokens)) {
  }

  // Capture the provenance of the statement's end before macro replacement.
  Provenance newlineProvenance{GetCurrentProvenance()};
  if (std::optional<TokenSequence> preprocessed{
          preprocessor_.MacroReplacement(tokens, *this)}) {
    // Reprocess the preprocessed line.  Append a newline temporarily.
    preprocessed->PutNextTokenChar('\n', newlineProvenance);
    preprocessed->CloseToken();
    const char *ppd{preprocessed->ToCharBlock().begin()};
    LineClassification ppl{ClassifyLine(ppd)};
    preprocessed->RemoveLastToken(); // remove the newline
    switch (ppl.kind) {
    case LineClassification::Kind::Comment:
      // The replacement text is a comment: emit nothing for it.
      break;
    case LineClassification::Kind::IncludeLine:
      FortranInclude(ppd + ppl.payloadOffset);
      break;
    case LineClassification::Kind::ConditionalCompilationDirective:
    case LineClassification::Kind::IncludeDirective:
    case LineClassification::Kind::DefinitionDirective:
    case LineClassification::Kind::PreprocessorDirective:
      // The result is not treated as a directive; it is reported and then
      // emitted as ordinary text.
      Say(preprocessed->GetProvenanceRange(),
          "Preprocessed line resembles a preprocessor directive"_en_US);
      preprocessed->ToLowerCase().CheckBadFortranCharacters(messages_).Emit(
          cooked_);
      break;
    case LineClassification::Kind::CompilerDirective:
      if (preprocessed->HasRedundantBlanks()) {
        preprocessed->RemoveRedundantBlanks();
      }
      NormalizeCompilerDirectiveCommentMarker(*preprocessed);
      preprocessed->ToLowerCase();
      SourceFormChange(preprocessed->ToString());
      preprocessed->ClipComment(true /* skip first ! */)
          .CheckBadFortranCharacters(messages_)
          .Emit(cooked_);
      break;
    case LineClassification::Kind::Source:
      if (inFixedForm_) {
        if (preprocessed->HasBlanks(/*after column*/ 6)) {
          preprocessed->RemoveBlanks(/*after column*/ 6);
        }
      } else {
        if (preprocessed->HasRedundantBlanks()) {
          preprocessed->RemoveRedundantBlanks();
        }
      }
      preprocessed->ToLowerCase()
          .ClipComment()
          .CheckBadFortranCharacters(messages_)
          .Emit(cooked_);
      break;
    }
  } else {
    // No macro replacement took place: emit the original tokens.
    tokens.ToLowerCase();
    if (line.kind == LineClassification::Kind::CompilerDirective) {
      SourceFormChange(tokens.ToString());
    }
    tokens.CheckBadFortranCharacters(messages_).Emit(cooked_);
  }
  // omitNewline_ is set by SkipCommentLine() and suppresses the newline of
  // this one statement only.
  if (omitNewline_) {
    omitNewline_ = false;
  } else {
    cooked_.Put('\n', newlineProvenance);
  }
  directiveSentinel_ = nullptr;
}
229
TokenizePreprocessorDirective()230 TokenSequence Prescanner::TokenizePreprocessorDirective() {
231 CHECK(!IsAtEnd() && !inPreprocessorDirective_);
232 inPreprocessorDirective_ = true;
233 BeginStatementAndAdvance();
234 TokenSequence tokens;
235 while (NextToken(tokens)) {
236 }
237 inPreprocessorDirective_ = false;
238 return tokens;
239 }
240
NextLine()241 void Prescanner::NextLine() {
242 void *vstart{static_cast<void *>(const_cast<char *>(nextLine_))};
243 void *v{std::memchr(vstart, '\n', limit_ - nextLine_)};
244 if (!v) {
245 nextLine_ = limit_;
246 } else {
247 const char *nl{const_cast<const char *>(static_cast<char *>(v))};
248 nextLine_ = nl + 1;
249 }
250 }
251
LabelField(TokenSequence & token)252 void Prescanner::LabelField(TokenSequence &token) {
253 const char *bad{nullptr};
254 int outCol{1};
255 for (; *at_ != '\n' && column_ <= 6; ++at_) {
256 if (*at_ == '\t') {
257 ++at_;
258 column_ = 7;
259 break;
260 }
261 if (*at_ != ' ' &&
262 !(*at_ == '0' && column_ == 6)) { // '0' in column 6 becomes space
263 EmitChar(token, *at_);
264 ++outCol;
265 if (!bad && !IsDecimalDigit(*at_)) {
266 bad = at_;
267 }
268 }
269 ++column_;
270 }
271 if (outCol == 1) { // empty label field
272 // Emit a space so that, if the line is rescanned after preprocessing,
273 // a leading 'C' or 'D' won't be left-justified and then accidentally
274 // misinterpreted as a comment card.
275 EmitChar(token, ' ');
276 ++outCol;
277 } else {
278 if (bad && !preprocessor_.IsNameDefined(token.CurrentOpenToken())) {
279 Say(GetProvenance(bad),
280 "Character in fixed-form label field must be a digit"_en_US);
281 }
282 }
283 token.CloseToken();
284 SkipToNextSignificantCharacter();
285 if (IsDecimalDigit(*at_)) {
286 Say(GetProvenance(at_),
287 "Label digit is not in fixed-form label field"_en_US);
288 }
289 }
290
SkipToEndOfLine()291 void Prescanner::SkipToEndOfLine() {
292 while (*at_ != '\n') {
293 ++at_, ++column_;
294 }
295 }
296
MustSkipToEndOfLine() const297 bool Prescanner::MustSkipToEndOfLine() const {
298 if (inFixedForm_ && column_ > fixedFormColumnLimit_ && !tabInCurrentLine_) {
299 return true; // skip over ignored columns in right margin (73:80)
300 } else if (*at_ == '!' && !inCharLiteral_) {
301 return true; // inline comment goes to end of source line
302 } else {
303 return false;
304 }
305 }
306
NextChar()307 void Prescanner::NextChar() {
308 CHECK(*at_ != '\n');
309 ++at_, ++column_;
310 while (at_[0] == '\xef' && at_[1] == '\xbb' && at_[2] == '\xbf') {
311 // UTF-8 byte order mark - treat this file as UTF-8
312 at_ += 3;
313 encoding_ = Encoding::UTF_8;
314 }
315 SkipToNextSignificantCharacter();
316 }
317
318 // Skip everything that should be ignored until the next significant
319 // character is reached; handles C-style comments in preprocessing
320 // directives, Fortran ! comments, stuff after the right margin in
321 // fixed form, and all forms of line continuation.
SkipToNextSignificantCharacter()322 void Prescanner::SkipToNextSignificantCharacter() {
323 if (inPreprocessorDirective_) {
324 SkipCComments();
325 } else {
326 bool mightNeedSpace{false};
327 if (MustSkipToEndOfLine()) {
328 SkipToEndOfLine();
329 } else {
330 mightNeedSpace = *at_ == '\n';
331 }
332 for (; Continuation(mightNeedSpace); mightNeedSpace = false) {
333 if (MustSkipToEndOfLine()) {
334 SkipToEndOfLine();
335 }
336 }
337 if (*at_ == '\t') {
338 tabInCurrentLine_ = true;
339 }
340 }
341 }
342
SkipCComments()343 void Prescanner::SkipCComments() {
344 while (true) {
345 if (IsCComment(at_)) {
346 if (const char *after{SkipCComment(at_)}) {
347 column_ += after - at_;
348 // May have skipped over one or more newlines; relocate the start of
349 // the next line.
350 nextLine_ = at_ = after;
351 NextLine();
352 } else {
353 // Don't emit any messages about unclosed C-style comments, because
354 // the sequence /* can appear legally in a FORMAT statement. There's
355 // no ambiguity, since the sequence */ cannot appear legally.
356 break;
357 }
358 } else if (inPreprocessorDirective_ && at_[0] == '\\' && at_ + 2 < limit_ &&
359 at_[1] == '\n' && !IsAtEnd()) {
360 BeginSourceLineAndAdvance();
361 } else {
362 break;
363 }
364 }
365 }
366
SkipSpaces()367 void Prescanner::SkipSpaces() {
368 while (*at_ == ' ' || *at_ == '\t') {
369 NextChar();
370 }
371 insertASpace_ = false;
372 }
373
SkipWhiteSpace(const char * p)374 const char *Prescanner::SkipWhiteSpace(const char *p) {
375 while (*p == ' ' || *p == '\t') {
376 ++p;
377 }
378 return p;
379 }
380
SkipWhiteSpaceAndCComments(const char * p) const381 const char *Prescanner::SkipWhiteSpaceAndCComments(const char *p) const {
382 while (true) {
383 if (*p == ' ' || *p == '\t') {
384 ++p;
385 } else if (IsCComment(p)) {
386 if (const char *after{SkipCComment(p)}) {
387 p = after;
388 } else {
389 break;
390 }
391 } else {
392 break;
393 }
394 }
395 return p;
396 }
397
SkipCComment(const char * p) const398 const char *Prescanner::SkipCComment(const char *p) const {
399 char star{' '}, slash{' '};
400 p += 2;
401 while (star != '*' || slash != '/') {
402 if (p >= limit_) {
403 return nullptr; // signifies an unterminated comment
404 }
405 star = slash;
406 slash = *p++;
407 }
408 return p;
409 }
410
// Scans the next token of the current statement at at_ and appends it to
// `tokens`.  Returns false when at_ is at the newline that ends the
// statement; otherwise one token is closed in `tokens` and true is returned.
//
// In fixed-form source, blanks are skipped between and within numeric
// tokens; elsewhere a run of white space is either discarded or reduced to a
// single-space token.  The token kinds distinguished below are quoted
// character literals, numeric literals (with Hollerith, real, hexadecimal,
// and kind-prefixed character literal variants), tokens starting with '.',
// identifiers, '*' and "**", and other one- or two-character tokens.
bool Prescanner::NextToken(TokenSequence &tokens) {
  CHECK(at_ >= start_ && at_ < limit_);
  if (InFixedFormSource()) {
    SkipSpaces();
  } else {
    if (*at_ == '/' && IsCComment(at_)) {
      // Recognize and skip over classic C style /*comments*/ when
      // outside a character literal.
      if (features_.ShouldWarn(LanguageFeature::ClassicCComments)) {
        Say(GetProvenance(at_), "nonstandard usage: C-style comment"_en_US);
      }
      SkipCComments();
    }
    if (*at_ == ' ' || *at_ == '\t') {
      // Compress free-form white space into a single space character.
      const auto theSpace{at_};
      // The character before the white space; a blank stands in for it at
      // the very start of the source.
      char previous{at_ <= start_ ? ' ' : at_[-1]};
      NextChar();
      SkipSpaces();
      if (*at_ == '\n') {
        // Discard white space at the end of a line.
      } else if (!inPreprocessorDirective_ &&
          (previous == '(' || *at_ == '(' || *at_ == ')')) {
        // Discard white space before/after '(' and before ')', unless in a
        // preprocessor directive.  This helps yield space-free contiguous
        // names for generic interfaces like OPERATOR( + ) and
        // READ ( UNFORMATTED ), without misinterpreting #define f (notAnArg).
        // This has the effect of silently ignoring the illegal spaces in
        // the array constructor ( /1,2/ ) but that seems benign; it's
        // hard to avoid that while still removing spaces from OPERATOR( / )
        // and OPERATOR( // ).
      } else {
        // Preserve the squashed white space as a single space character.
        tokens.PutNextTokenChar(' ', GetProvenance(theSpace));
        tokens.CloseToken();
        return true;
      }
    }
  }
  // A space requested by line continuation processing is put at the front
  // of the token that follows.
  if (insertASpace_) {
    tokens.PutNextTokenChar(' ', spaceProvenance_);
    insertASpace_ = false;
  }
  if (*at_ == '\n') {
    return false;
  }
  const char *start{at_};
  if (*at_ == '\'' || *at_ == '"') {
    QuotedCharacterLiteral(tokens, start);
    preventHollerith_ = false;
  } else if (IsDecimalDigit(*at_)) {
    // n accumulates the value of the digit string (it stops growing once it
    // reaches maxHollerith); digits counts the digits scanned.
    int n{0}, digits{0};
    static constexpr int maxHollerith{256 /*lines*/ * (132 - 6 /*columns*/)};
    do {
      if (n < maxHollerith) {
        n = 10 * n + DecimalDigitValue(*at_);
      }
      EmitCharAndAdvance(tokens, *at_);
      ++digits;
      if (InFixedFormSource()) {
        SkipSpaces();
      }
    } while (IsDecimalDigit(*at_));
    if ((*at_ == 'h' || *at_ == 'H') && n > 0 && n < maxHollerith &&
        !preventHollerith_) {
      Hollerith(tokens, n, start);
    } else if (*at_ == '.') {
      // Real literal: fraction digits, then an optional exponent and kind.
      while (IsDecimalDigit(EmitCharAndAdvance(tokens, *at_))) {
      }
      ExponentAndKind(tokens);
    } else if (ExponentAndKind(tokens)) {
    } else if (digits == 1 && n == 0 && (*at_ == 'x' || *at_ == 'X') &&
        inPreprocessorDirective_) {
      // 0x... hexadecimal literal, recognized only in a preprocessor
      // directive.
      do {
        EmitCharAndAdvance(tokens, *at_);
      } while (IsHexadecimalDigit(*at_));
    } else if (IsLetter(*at_)) {
      // Handles FORMAT(3I9HHOLLERITH) by skipping over the first I so that
      // we don't misrecognize I9HOLLERITH as an identifier in the next case.
      EmitCharAndAdvance(tokens, *at_);
    } else if (at_[0] == '_' && (at_[1] == '\'' || at_[1] == '"')) { // 4_"..."
      EmitCharAndAdvance(tokens, *at_);
      QuotedCharacterLiteral(tokens, start);
    }
    preventHollerith_ = false;
  } else if (*at_ == '.') {
    char nch{EmitCharAndAdvance(tokens, '.')};
    if (!inPreprocessorDirective_ && IsDecimalDigit(nch)) {
      // Real literal that begins with its decimal point.
      while (IsDecimalDigit(EmitCharAndAdvance(tokens, *at_))) {
      }
      ExponentAndKind(tokens);
    } else if (nch == '.' && EmitCharAndAdvance(tokens, '.') == '.') {
      EmitCharAndAdvance(tokens, '.'); // variadic macro definition ellipsis
    }
    preventHollerith_ = false;
  } else if (IsLegalInIdentifier(*at_)) {
    do {
    } while (IsLegalInIdentifier(EmitCharAndAdvance(tokens, *at_)));
    if ((*at_ == '\'' || *at_ == '"') &&
        tokens.CharAt(tokens.SizeInChars() - 1) == '_') { // kind_"..."
      QuotedCharacterLiteral(tokens, start);
    }
    preventHollerith_ = false;
  } else if (*at_ == '*') {
    if (EmitCharAndAdvance(tokens, '*') == '*') {
      EmitCharAndAdvance(tokens, '*');
    } else {
      // Subtle ambiguity:
      //  CHARACTER*2H     declares H because *2 is a kind specifier
      //  DATAC/N*2H  /    is repeated Hollerith
      preventHollerith_ = !slashInCurrentStatement_;
    }
  } else {
    char ch{*at_};
    // Track nesting of parentheses and brackets; IsImplicitContinuation()
    // reads delimiterNesting_.
    if (ch == '(' || ch == '[') {
      ++delimiterNesting_;
    } else if ((ch == ')' || ch == ']') && delimiterNesting_ > 0) {
      --delimiterNesting_;
    }
    char nch{EmitCharAndAdvance(tokens, ch)};
    preventHollerith_ = false;
    if ((nch == '=' &&
            (ch == '<' || ch == '>' || ch == '/' || ch == '=' || ch == '!')) ||
        (ch == nch &&
            (ch == '/' || ch == ':' || ch == '*' || ch == '#' || ch == '&' ||
                ch == '|' || ch == '<' || ch == '>')) ||
        (ch == '=' && nch == '>')) {
      // token comprises two characters
      EmitCharAndAdvance(tokens, nch);
    } else if (ch == '/') {
      slashInCurrentStatement_ = true;
    }
  }
  tokens.CloseToken();
  return true;
}
547
ExponentAndKind(TokenSequence & tokens)548 bool Prescanner::ExponentAndKind(TokenSequence &tokens) {
549 char ed{ToLowerCaseLetter(*at_)};
550 if (ed != 'e' && ed != 'd') {
551 return false;
552 }
553 EmitCharAndAdvance(tokens, ed);
554 if (*at_ == '+' || *at_ == '-') {
555 EmitCharAndAdvance(tokens, *at_);
556 }
557 while (IsDecimalDigit(*at_)) {
558 EmitCharAndAdvance(tokens, *at_);
559 }
560 if (*at_ == '_') {
561 while (IsLegalInIdentifier(EmitCharAndAdvance(tokens, *at_))) {
562 }
563 }
564 return true;
565 }
566
// Scans a quoted character literal whose opening quote mark is at at_ and
// appends its characters to `tokens`.
//  tokens: receives the literal's characters
//  start:  the first character of the enclosing token (which may begin with
//          a kind prefix); used only for the provenance range of the
//          "Incomplete character literal" message
// inCharLiteral_ is set while inside the literal and cleared on return.
void Prescanner::QuotedCharacterLiteral(
    TokenSequence &tokens, const char *start) {
  char quote{*at_};
  // `end` tracks one past the last character scanned, for error reporting.
  const char *end{at_ + 1};
  inCharLiteral_ = true;
  const auto emit{[&](char ch) { EmitChar(tokens, ch); }};
  const auto insert{[&](char ch) { EmitInsertedChar(tokens, ch); }};
  // isEscaped is true when the current character is a backslash that begins
  // an escape; it is only ever set when backslash escapes are enabled.
  bool isEscaped{false};
  bool escapesEnabled{features_.IsEnabled(LanguageFeature::BackslashEscapes)};
  while (true) {
    if (*at_ == '\\') {
      if (escapesEnabled) {
        isEscaped = !isEscaped;
      } else {
        // The parser always processes escape sequences, so don't confuse it
        // when escapes are disabled.
        insert('\\');
      }
    } else {
      isEscaped = false;
    }
    EmitQuotedChar(static_cast<unsigned char>(*at_), emit, insert, false,
        Encoding::LATIN_1);
    // Pad a short fixed-form line with blanks before looking at what
    // follows (see PadOutCharacterLiteral()).
    while (PadOutCharacterLiteral(tokens)) {
    }
    if (*at_ == '\n') {
      // The line ended before the closing quote; no message is issued
      // within a preprocessor directive.
      if (!inPreprocessorDirective_) {
        Say(GetProvenanceRange(start, end),
            "Incomplete character literal"_err_en_US);
      }
      break;
    }
    end = at_ + 1;
    NextChar();
    if (*at_ == quote && !isEscaped) {
      // A doubled unescaped quote mark becomes a single instance of that
      // quote character in the literal (later).  There can be spaces between
      // the quotes in fixed form source.
      EmitChar(tokens, quote);
      inCharLiteral_ = false; // for cases like print *, '...'!comment
      NextChar();
      if (InFixedFormSource()) {
        SkipSpaces();
      }
      if (*at_ != quote) {
        break;
      }
      // Another quote follows: the literal continues.
      inCharLiteral_ = true;
    }
  }
  inCharLiteral_ = false;
}
619
Hollerith(TokenSequence & tokens,int count,const char * start)620 void Prescanner::Hollerith(
621 TokenSequence &tokens, int count, const char *start) {
622 inCharLiteral_ = true;
623 CHECK(*at_ == 'h' || *at_ == 'H');
624 EmitChar(tokens, 'H');
625 while (count-- > 0) {
626 if (PadOutCharacterLiteral(tokens)) {
627 } else if (*at_ == '\n') {
628 Say(GetProvenanceRange(start, at_),
629 "Possible truncated Hollerith literal"_en_US);
630 break;
631 } else {
632 NextChar();
633 // Each multi-byte character encoding counts as a single character.
634 // No escape sequences are recognized.
635 // Hollerith is always emitted to the cooked character
636 // stream in UTF-8.
637 DecodedCharacter decoded{DecodeCharacter(
638 encoding_, at_, static_cast<std::size_t>(limit_ - at_), false)};
639 if (decoded.bytes > 0) {
640 EncodedCharacter utf8{
641 EncodeCharacter<Encoding::UTF_8>(decoded.codepoint)};
642 for (int j{0}; j < utf8.bytes; ++j) {
643 EmitChar(tokens, utf8.buffer[j]);
644 }
645 at_ += decoded.bytes - 1;
646 } else {
647 Say(GetProvenanceRange(start, at_),
648 "Bad character in Hollerith literal"_err_en_US);
649 break;
650 }
651 }
652 }
653 if (*at_ != '\n') {
654 NextChar();
655 }
656 inCharLiteral_ = false;
657 }
658
659 // In fixed form, source card images must be processed as if they were at
660 // least 72 columns wide, at least in character literal contexts.
PadOutCharacterLiteral(TokenSequence & tokens)661 bool Prescanner::PadOutCharacterLiteral(TokenSequence &tokens) {
662 while (inFixedForm_ && !tabInCurrentLine_ && at_[1] == '\n') {
663 if (column_ < fixedFormColumnLimit_) {
664 tokens.PutNextTokenChar(' ', spaceProvenance_);
665 ++column_;
666 return true;
667 }
668 if (!FixedFormContinuation(false /*no need to insert space*/) ||
669 tabInCurrentLine_) {
670 return false;
671 }
672 CHECK(column_ == 7);
673 --at_; // point to column 6 of continuation line
674 column_ = 6;
675 }
676 return false;
677 }
678
IsFixedFormCommentLine(const char * start) const679 bool Prescanner::IsFixedFormCommentLine(const char *start) const {
680 const char *p{start};
681 if (IsFixedFormCommentChar(*p) || *p == '%' || // VAX %list, %eject, &c.
682 ((*p == 'D' || *p == 'd') &&
683 !features_.IsEnabled(LanguageFeature::OldDebugLines))) {
684 return true;
685 }
686 bool anyTabs{false};
687 while (true) {
688 if (*p == ' ') {
689 ++p;
690 } else if (*p == '\t') {
691 anyTabs = true;
692 ++p;
693 } else if (*p == '0' && !anyTabs && p == start + 5) {
694 ++p; // 0 in column 6 must treated as a space
695 } else {
696 break;
697 }
698 }
699 if (!anyTabs && p >= start + fixedFormColumnLimit_) {
700 return true;
701 }
702 if (*p == '!' && !inCharLiteral_ && (anyTabs || p != start + 5)) {
703 return true;
704 }
705 return *p == '\n';
706 }
707
IsFreeFormComment(const char * p) const708 const char *Prescanner::IsFreeFormComment(const char *p) const {
709 p = SkipWhiteSpaceAndCComments(p);
710 if (*p == '!' || *p == '\n') {
711 return p;
712 } else {
713 return nullptr;
714 }
715 }
716
IsIncludeLine(const char * start) const717 std::optional<std::size_t> Prescanner::IsIncludeLine(const char *start) const {
718 const char *p{SkipWhiteSpace(start)};
719 for (char ch : "include"s) {
720 if (ToLowerCaseLetter(*p++) != ch) {
721 return std::nullopt;
722 }
723 }
724 p = SkipWhiteSpace(p);
725 if (*p == '"' || *p == '\'') {
726 return {p - start};
727 }
728 return std::nullopt;
729 }
730
FortranInclude(const char * firstQuote)731 void Prescanner::FortranInclude(const char *firstQuote) {
732 const char *p{firstQuote};
733 while (*p != '"' && *p != '\'') {
734 ++p;
735 }
736 char quote{*p};
737 std::string path;
738 for (++p; *p != '\n'; ++p) {
739 if (*p == quote) {
740 if (p[1] != quote) {
741 break;
742 }
743 ++p;
744 }
745 path += *p;
746 }
747 if (*p != quote) {
748 Say(GetProvenanceRange(firstQuote, p),
749 "malformed path name string"_err_en_US);
750 return;
751 }
752 p = SkipWhiteSpace(p + 1);
753 if (*p != '\n' && *p != '!') {
754 const char *garbage{p};
755 for (; *p != '\n' && *p != '!'; ++p) {
756 }
757 Say(GetProvenanceRange(garbage, p),
758 "excess characters after path name"_en_US);
759 }
760 std::string buf;
761 llvm::raw_string_ostream error{buf};
762 Provenance provenance{GetProvenance(nextLine_)};
763 std::optional<std::string> prependPath;
764 if (const SourceFile * currentFile{allSources_.GetSourceFile(provenance)}) {
765 prependPath = DirectoryName(currentFile->path());
766 }
767 const SourceFile *included{
768 allSources_.Open(path, error, std::move(prependPath))};
769 if (!included) {
770 Say(provenance, "INCLUDE: %s"_err_en_US, error.str());
771 } else if (included->bytes() > 0) {
772 ProvenanceRange includeLineRange{
773 provenance, static_cast<std::size_t>(p - nextLine_)};
774 ProvenanceRange fileRange{
775 allSources_.AddIncludedFile(*included, includeLineRange)};
776 Prescanner{*this}.set_encoding(included->encoding()).Prescan(fileRange);
777 }
778 }
779
IsPreprocessorDirectiveLine(const char * start) const780 const char *Prescanner::IsPreprocessorDirectiveLine(const char *start) const {
781 const char *p{start};
782 for (; *p == ' '; ++p) {
783 }
784 if (*p == '#') {
785 if (inFixedForm_ && p == start + 5) {
786 return nullptr;
787 }
788 } else {
789 p = SkipWhiteSpace(p);
790 if (*p != '#') {
791 return nullptr;
792 }
793 }
794 return SkipWhiteSpace(p + 1);
795 }
796
IsNextLinePreprocessorDirective() const797 bool Prescanner::IsNextLinePreprocessorDirective() const {
798 return IsPreprocessorDirectiveLine(nextLine_) != nullptr;
799 }
800
SkipCommentLine(bool afterAmpersand)801 bool Prescanner::SkipCommentLine(bool afterAmpersand) {
802 if (IsAtEnd()) {
803 if (afterAmpersand && prescannerNesting_ > 0) {
804 // A continuation marker at the end of the last line in an
805 // include file inhibits the newline for that line.
806 SkipToEndOfLine();
807 omitNewline_ = true;
808 }
809 return false;
810 }
811 auto lineClass{ClassifyLine(nextLine_)};
812 if (lineClass.kind == LineClassification::Kind::Comment) {
813 NextLine();
814 return true;
815 } else if (inPreprocessorDirective_) {
816 return false;
817 } else if (lineClass.kind ==
818 LineClassification::Kind::ConditionalCompilationDirective ||
819 lineClass.kind == LineClassification::Kind::PreprocessorDirective) {
820 // Allow conditional compilation directives (e.g., #ifdef) to affect
821 // continuation lines.
822 // Allow other preprocessor directives, too, except #include
823 // (when it does not follow '&'), #define, and #undef (because
824 // they cannot be allowed to affect preceding text on a
825 // continued line).
826 preprocessor_.Directive(TokenizePreprocessorDirective(), this);
827 return true;
828 } else if (afterAmpersand &&
829 (lineClass.kind == LineClassification::Kind::IncludeDirective ||
830 lineClass.kind == LineClassification::Kind::IncludeLine)) {
831 SkipToEndOfLine();
832 omitNewline_ = true;
833 skipLeadingAmpersand_ = true;
834 return false;
835 } else {
836 return false;
837 }
838 }
839
FixedFormContinuationLine(bool mightNeedSpace)840 const char *Prescanner::FixedFormContinuationLine(bool mightNeedSpace) {
841 if (IsAtEnd()) {
842 return nullptr;
843 }
844 tabInCurrentLine_ = false;
845 char col1{*nextLine_};
846 if (InCompilerDirective()) {
847 // Must be a continued compiler directive.
848 if (!IsFixedFormCommentChar(col1)) {
849 return nullptr;
850 }
851 int j{1};
852 for (; j < 5; ++j) {
853 char ch{directiveSentinel_[j - 1]};
854 if (ch == '\0') {
855 break;
856 }
857 if (ch != ToLowerCaseLetter(nextLine_[j])) {
858 return nullptr;
859 }
860 }
861 for (; j < 5; ++j) {
862 if (nextLine_[j] != ' ') {
863 return nullptr;
864 }
865 }
866 char col6{nextLine_[5]};
867 if (col6 != '\n' && col6 != '\t' && col6 != ' ' && col6 != '0') {
868 if (nextLine_[6] != ' ' && mightNeedSpace) {
869 insertASpace_ = true;
870 }
871 return nextLine_ + 6;
872 }
873 return nullptr;
874 } else {
875 // Normal case: not in a compiler directive.
876 if (col1 == '&' &&
877 features_.IsEnabled(
878 LanguageFeature::FixedFormContinuationWithColumn1Ampersand)) {
879 // Extension: '&' as continuation marker
880 if (features_.ShouldWarn(
881 LanguageFeature::FixedFormContinuationWithColumn1Ampersand)) {
882 Say(GetProvenance(nextLine_), "nonstandard usage"_en_US);
883 }
884 return nextLine_ + 1;
885 }
886 if (col1 == '\t' && nextLine_[1] >= '1' && nextLine_[1] <= '9') {
887 tabInCurrentLine_ = true;
888 return nextLine_ + 2; // VAX extension
889 }
890 if (col1 == ' ' && nextLine_[1] == ' ' && nextLine_[2] == ' ' &&
891 nextLine_[3] == ' ' && nextLine_[4] == ' ') {
892 char col6{nextLine_[5]};
893 if (col6 != '\n' && col6 != '\t' && col6 != ' ' && col6 != '0') {
894 return nextLine_ + 6;
895 }
896 }
897 if (IsImplicitContinuation()) {
898 return nextLine_;
899 }
900 }
901 return nullptr; // not a continuation line
902 }
903
FreeFormContinuationLine(bool ampersand)904 const char *Prescanner::FreeFormContinuationLine(bool ampersand) {
905 const char *p{nextLine_};
906 if (p >= limit_) {
907 return nullptr;
908 }
909 p = SkipWhiteSpace(p);
910 if (InCompilerDirective()) {
911 if (*p++ != '!') {
912 return nullptr;
913 }
914 for (const char *s{directiveSentinel_}; *s != '\0'; ++p, ++s) {
915 if (*s != ToLowerCaseLetter(*p)) {
916 return nullptr;
917 }
918 }
919 p = SkipWhiteSpace(p);
920 if (*p == '&') {
921 if (!ampersand) {
922 insertASpace_ = true;
923 }
924 return p + 1;
925 } else if (ampersand) {
926 return p;
927 } else {
928 return nullptr;
929 }
930 } else {
931 if (*p == '&') {
932 return p + 1;
933 } else if (*p == '!' || *p == '\n' || *p == '#') {
934 return nullptr;
935 } else if (ampersand || IsImplicitContinuation()) {
936 if (p > nextLine_) {
937 --p;
938 } else {
939 insertASpace_ = true;
940 }
941 return p;
942 } else {
943 return nullptr;
944 }
945 }
946 }
947
FixedFormContinuation(bool mightNeedSpace)948 bool Prescanner::FixedFormContinuation(bool mightNeedSpace) {
949 // N.B. We accept '&' as a continuation indicator in fixed form, too,
950 // but not in a character literal.
951 if (*at_ == '&' && inCharLiteral_) {
952 return false;
953 }
954 do {
955 if (const char *cont{FixedFormContinuationLine(mightNeedSpace)}) {
956 BeginSourceLine(cont);
957 column_ = 7;
958 NextLine();
959 return true;
960 }
961 } while (SkipCommentLine(false /* not after ampersand */));
962 return false;
963 }
964
FreeFormContinuation()965 bool Prescanner::FreeFormContinuation() {
966 const char *p{at_};
967 bool ampersand{*p == '&'};
968 if (ampersand) {
969 p = SkipWhiteSpace(p + 1);
970 }
971 if (*p != '\n') {
972 if (inCharLiteral_) {
973 return false;
974 } else if (*p != '!' &&
975 features_.ShouldWarn(LanguageFeature::CruftAfterAmpersand)) {
976 Say(GetProvenance(p), "missing ! before comment after &"_en_US);
977 }
978 }
979 do {
980 if (const char *cont{FreeFormContinuationLine(ampersand)}) {
981 BeginSourceLine(cont);
982 NextLine();
983 return true;
984 }
985 } while (SkipCommentLine(ampersand));
986 return false;
987 }
988
989 // Implicit line continuation allows a preprocessor macro call with
990 // arguments to span multiple lines.
IsImplicitContinuation() const991 bool Prescanner::IsImplicitContinuation() const {
992 return !inPreprocessorDirective_ && !inCharLiteral_ &&
993 delimiterNesting_ > 0 && !IsAtEnd() &&
994 ClassifyLine(nextLine_).kind == LineClassification::Kind::Source;
995 }
996
Continuation(bool mightNeedFixedFormSpace)997 bool Prescanner::Continuation(bool mightNeedFixedFormSpace) {
998 if (*at_ == '\n' || *at_ == '&') {
999 if (inFixedForm_) {
1000 return FixedFormContinuation(mightNeedFixedFormSpace);
1001 } else {
1002 return FreeFormContinuation();
1003 }
1004 } else {
1005 return false;
1006 }
1007 }
1008
1009 std::optional<Prescanner::LineClassification>
IsFixedFormCompilerDirectiveLine(const char * start) const1010 Prescanner::IsFixedFormCompilerDirectiveLine(const char *start) const {
1011 const char *p{start};
1012 char col1{*p++};
1013 if (!IsFixedFormCommentChar(col1)) {
1014 return std::nullopt;
1015 }
1016 char sentinel[5], *sp{sentinel};
1017 int column{2};
1018 for (; column < 6; ++column, ++p) {
1019 if (*p != ' ') {
1020 if (*p == '\n' || *p == '\t') {
1021 break;
1022 }
1023 if (sp == sentinel + 1 && sentinel[0] == '$' && IsDecimalDigit(*p)) {
1024 // OpenMP conditional compilation line: leave the label alone
1025 break;
1026 }
1027 *sp++ = ToLowerCaseLetter(*p);
1028 }
1029 }
1030 if (column == 6) {
1031 if (*p == ' ' || *p == '\t' || *p == '0') {
1032 ++p;
1033 } else {
1034 // This is a Continuation line, not an initial directive line.
1035 return std::nullopt;
1036 }
1037 }
1038 if (sp == sentinel) {
1039 return std::nullopt;
1040 }
1041 *sp = '\0';
1042 if (const char *ss{IsCompilerDirectiveSentinel(sentinel)}) {
1043 std::size_t payloadOffset = p - start;
1044 return {LineClassification{
1045 LineClassification::Kind::CompilerDirective, payloadOffset, ss}};
1046 }
1047 return std::nullopt;
1048 }
1049
1050 std::optional<Prescanner::LineClassification>
IsFreeFormCompilerDirectiveLine(const char * start) const1051 Prescanner::IsFreeFormCompilerDirectiveLine(const char *start) const {
1052 char sentinel[8];
1053 const char *p{SkipWhiteSpace(start)};
1054 if (*p++ != '!') {
1055 return std::nullopt;
1056 }
1057 for (std::size_t j{0}; j + 1 < sizeof sentinel; ++p, ++j) {
1058 if (*p == '\n') {
1059 break;
1060 }
1061 if (*p == ' ' || *p == '\t' || *p == '&') {
1062 if (j == 0) {
1063 break;
1064 }
1065 sentinel[j] = '\0';
1066 p = SkipWhiteSpace(p + 1);
1067 if (*p == '!') {
1068 break;
1069 }
1070 if (const char *sp{IsCompilerDirectiveSentinel(sentinel)}) {
1071 std::size_t offset = p - start;
1072 return {LineClassification{
1073 LineClassification::Kind::CompilerDirective, offset, sp}};
1074 }
1075 break;
1076 }
1077 sentinel[j] = ToLowerCaseLetter(*p);
1078 }
1079 return std::nullopt;
1080 }
1081
AddCompilerDirectiveSentinel(const std::string & dir)1082 Prescanner &Prescanner::AddCompilerDirectiveSentinel(const std::string &dir) {
1083 std::uint64_t packed{0};
1084 for (char ch : dir) {
1085 packed = (packed << 8) | (ToLowerCaseLetter(ch) & 0xff);
1086 }
1087 compilerDirectiveBloomFilter_.set(packed % prime1);
1088 compilerDirectiveBloomFilter_.set(packed % prime2);
1089 compilerDirectiveSentinels_.insert(dir);
1090 return *this;
1091 }
1092
IsCompilerDirectiveSentinel(const char * sentinel) const1093 const char *Prescanner::IsCompilerDirectiveSentinel(
1094 const char *sentinel) const {
1095 std::uint64_t packed{0};
1096 std::size_t n{0};
1097 for (; sentinel[n] != '\0'; ++n) {
1098 packed = (packed << 8) | (sentinel[n] & 0xff);
1099 }
1100 if (n == 0 || !compilerDirectiveBloomFilter_.test(packed % prime1) ||
1101 !compilerDirectiveBloomFilter_.test(packed % prime2)) {
1102 return nullptr;
1103 }
1104 const auto iter{compilerDirectiveSentinels_.find(std::string(sentinel, n))};
1105 return iter == compilerDirectiveSentinels_.end() ? nullptr : iter->c_str();
1106 }
1107
IsDirective(const char * match,const char * dir)1108 constexpr bool IsDirective(const char *match, const char *dir) {
1109 for (; *match; ++match) {
1110 if (*match != ToLowerCaseLetter(*dir++)) {
1111 return false;
1112 }
1113 }
1114 return true;
1115 }
1116
ClassifyLine(const char * start) const1117 Prescanner::LineClassification Prescanner::ClassifyLine(
1118 const char *start) const {
1119 if (inFixedForm_) {
1120 if (std::optional<LineClassification> lc{
1121 IsFixedFormCompilerDirectiveLine(start)}) {
1122 return std::move(*lc);
1123 }
1124 if (IsFixedFormCommentLine(start)) {
1125 return {LineClassification::Kind::Comment};
1126 }
1127 } else {
1128 if (std::optional<LineClassification> lc{
1129 IsFreeFormCompilerDirectiveLine(start)}) {
1130 return std::move(*lc);
1131 }
1132 if (const char *bang{IsFreeFormComment(start)}) {
1133 return {LineClassification::Kind::Comment,
1134 static_cast<std::size_t>(bang - start)};
1135 }
1136 }
1137 if (std::optional<std::size_t> quoteOffset{IsIncludeLine(start)}) {
1138 return {LineClassification::Kind::IncludeLine, *quoteOffset};
1139 }
1140 if (const char *dir{IsPreprocessorDirectiveLine(start)}) {
1141 if (IsDirective("if", dir) || IsDirective("elif", dir) ||
1142 IsDirective("else", dir) || IsDirective("endif", dir)) {
1143 return {LineClassification::Kind::ConditionalCompilationDirective};
1144 } else if (IsDirective("include", dir)) {
1145 return {LineClassification::Kind::IncludeDirective};
1146 } else if (IsDirective("define", dir) || IsDirective("undef", dir)) {
1147 return {LineClassification::Kind::DefinitionDirective};
1148 } else {
1149 return {LineClassification::Kind::PreprocessorDirective};
1150 }
1151 }
1152 return {LineClassification::Kind::Source};
1153 }
1154
SourceFormChange(std::string && dir)1155 void Prescanner::SourceFormChange(std::string &&dir) {
1156 if (dir == "!dir$ free") {
1157 inFixedForm_ = false;
1158 } else if (dir == "!dir$ fixed") {
1159 inFixedForm_ = true;
1160 }
1161 }
1162 } // namespace Fortran::parser
1163