1 // Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "prescan.h"
16 #include "characters.h"
17 #include "message.h"
18 #include "preprocessor.h"
19 #include "source.h"
20 #include "token-sequence.h"
21 #include "../common/idioms.h"
22 #include <cstddef>
23 #include <cstring>
24 #include <sstream>
25 #include <utility>
26 #include <vector>
27
28 namespace Fortran::parser {
29
// Upper bound on the nesting depth of Prescanner instances.  Prescan()
// reports an error and stops when prescannerNesting_ exceeds this value.
static constexpr int maxPrescannerNesting{100};
31
// Constructs a prescanner that reports through `messages`, emits into
// `cooked`, and hands directives and macro replacement to `preprocessor`.
// `lfc` selects the enabled language features.  The initial character
// encoding is taken from cooked.allSources().
Prescanner::Prescanner(Messages &messages, CookedSource &cooked,
    Preprocessor &preprocessor, LanguageFeatureControl lfc)
  : messages_{messages}, cooked_{cooked}, preprocessor_{preprocessor},
    features_{lfc}, encoding_{cooked.allSources().encoding()} {}
36
// Constructs a nested prescanner from `that`; FortranInclude() uses this to
// scan an included file.  The message/cooked/preprocessor references, the
// feature set, source form, column limit, encoding, leading-ampersand flag,
// and compiler directive sentinel tables are carried over, and the nesting
// depth becomes one greater than that of `that`.  Members that do not appear
// in the initializer list below are not copied.
Prescanner::Prescanner(const Prescanner &that)
  : messages_{that.messages_}, cooked_{that.cooked_},
    preprocessor_{that.preprocessor_}, features_{that.features_},
    inFixedForm_{that.inFixedForm_},
    fixedFormColumnLimit_{that.fixedFormColumnLimit_},
    encoding_{that.encoding_}, prescannerNesting_{that.prescannerNesting_ + 1},
    skipLeadingAmpersand_{that.skipLeadingAmpersand_},
    compilerDirectiveBloomFilter_{that.compilerDirectiveBloomFilter_},
    compilerDirectiveSentinels_{that.compilerDirectiveSentinels_} {}
46
// True when `ch` is one of the characters that introduce a fixed form
// comment line when found in column 1: '!', '*', 'C', or 'c'.
static inline constexpr bool IsFixedFormCommentChar(char ch) {
  switch (ch) {
  case '!':
  case '*':
  case 'C':
  case 'c': return true;
  default: return false;
  }
}
50
NormalizeCompilerDirectiveCommentMarker(TokenSequence & dir)51 static void NormalizeCompilerDirectiveCommentMarker(TokenSequence &dir) {
52 char *p{dir.GetMutableCharData()};
53 char *limit{p + dir.SizeInChars()};
54 for (; p < limit; ++p) {
55 if (*p != ' ') {
56 CHECK(IsFixedFormCommentChar(*p));
57 *p = '!';
58 return;
59 }
60 }
61 DIE("compiler directive all blank");
62 }
63
Prescan(ProvenanceRange range)64 void Prescanner::Prescan(ProvenanceRange range) {
65 AllSources &allSources{cooked_.allSources()};
66 startProvenance_ = range.start();
67 std::size_t offset{0};
68 const SourceFile *source{allSources.GetSourceFile(startProvenance_, &offset)};
69 CHECK(source != nullptr);
70 start_ = source->content() + offset;
71 limit_ = start_ + range.size();
72 nextLine_ = start_;
73 const bool beganInFixedForm{inFixedForm_};
74 if (prescannerNesting_ > maxPrescannerNesting) {
75 Say(GetProvenance(start_),
76 "too many nested INCLUDE/#include files, possibly circular"_err_en_US);
77 return;
78 }
79 while (nextLine_ < limit_) {
80 Statement();
81 }
82 if (inFixedForm_ != beganInFixedForm) {
83 std::string dir{"!dir$ "};
84 if (beganInFixedForm) {
85 dir += "fixed";
86 } else {
87 dir += "free";
88 }
89 dir += '\n';
90 TokenSequence tokens{dir, allSources.AddCompilerInsertion(dir).start()};
91 tokens.Emit(cooked_);
92 }
93 }
94
// Prescans the statement that begins at nextLine_.
//  - Comment lines are skipped.
//  - Fortran INCLUDE lines are passed to FortranInclude().
//  - Preprocessor directive lines are tokenized and passed to the
//    preprocessor.
//  - Compiler directive and ordinary source lines are tokenized with
//    NextToken(), run through macro replacement, and emitted to the cooked
//    stream in lower case, followed by a newline unless omitNewline_ is set.
void Prescanner::Statement() {
  TokenSequence tokens;
  LineClassification line{ClassifyLine(nextLine_)};
  switch (line.kind) {
  case LineClassification::Kind::Comment:
    nextLine_ += line.payloadOffset;  // advance to '!' or newline
    NextLine();
    return;
  case LineClassification::Kind::IncludeLine:
    FortranInclude(nextLine_ + line.payloadOffset);
    NextLine();
    return;
  case LineClassification::Kind::ConditionalCompilationDirective:
  case LineClassification::Kind::IncludeDirective:
  case LineClassification::Kind::DefinitionDirective:
  case LineClassification::Kind::PreprocessorDirective:
    preprocessor_.Directive(TokenizePreprocessorDirective(), this);
    return;
  case LineClassification::Kind::CompilerDirective:
    // Remember the sentinel so that continuation lines can be matched
    // against it; it is cleared again at the end of this function.
    directiveSentinel_ = line.sentinel;
    CHECK(InCompilerDirective());
    BeginSourceLineAndAdvance();
    if (inFixedForm_) {
      CHECK(IsFixedFormCommentChar(*at_));
    } else {
      while (*at_ == ' ' || *at_ == '\t') {
        ++at_, ++column_;
      }
      CHECK(*at_ == '!');
    }
    if (directiveSentinel_[0] == '$' && directiveSentinel_[1] == '\0') {
      // OpenMP conditional compilation line. Remove the sentinel and then
      // treat the line as if it were normal source.
      at_ += 2, column_ += 2;
      if (inFixedForm_) {
        LabelField(tokens);
      } else {
        SkipSpaces();
      }
    } else {
      // Compiler directive. Emit normalized sentinel.
      EmitChar(tokens, '!');
      ++at_, ++column_;
      for (const char *sp{directiveSentinel_}; *sp != '\0';
           ++sp, ++at_, ++column_) {
        EmitChar(tokens, *sp);
      }
      if (*at_ == ' ') {
        EmitChar(tokens, ' ');
        ++at_, ++column_;
      }
      tokens.CloseToken();
    }
    break;
  case LineClassification::Kind::Source:
    BeginSourceLineAndAdvance();
    if (inFixedForm_) {
      LabelField(tokens);
    } else if (skipLeadingAmpersand_) {
      // Set by SkipCommentLine() when an include followed a trailing '&':
      // skip the optional leading '&' of this line, once.
      skipLeadingAmpersand_ = false;
      const char *p{SkipWhiteSpace(at_)};
      if (p < limit_ && *p == '&') {
        column_ += ++p - at_;
        at_ = p;
      }
    } else {
      SkipSpaces();
    }
    break;
  }

  // Tokenize the rest of the statement, including any continuation lines.
  while (NextToken(tokens)) {
  }

  Provenance newlineProvenance{GetCurrentProvenance()};
  if (std::optional<TokenSequence> preprocessed{
          preprocessor_.MacroReplacement(tokens, *this)}) {
    // Reprocess the preprocessed line. Append a newline temporarily.
    preprocessed->PutNextTokenChar('\n', newlineProvenance);
    preprocessed->CloseToken();
    const char *ppd{preprocessed->ToCharBlock().begin()};
    LineClassification ppl{ClassifyLine(ppd)};
    preprocessed->RemoveLastToken();  // remove the newline
    switch (ppl.kind) {
    case LineClassification::Kind::Comment: break;
    case LineClassification::Kind::IncludeLine:
      FortranInclude(ppd + ppl.payloadOffset);
      break;
    case LineClassification::Kind::ConditionalCompilationDirective:
    case LineClassification::Kind::IncludeDirective:
    case LineClassification::Kind::DefinitionDirective:
    case LineClassification::Kind::PreprocessorDirective:
      // Directives produced by macro expansion are not executed; the line is
      // emitted as ordinary text with a message.
      Say(preprocessed->GetProvenanceRange(),
          "Preprocessed line resembles a preprocessor directive"_en_US);
      preprocessed->ToLowerCase().Emit(cooked_);
      break;
    case LineClassification::Kind::CompilerDirective:
      if (preprocessed->HasRedundantBlanks()) {
        preprocessed->RemoveRedundantBlanks();
      }
      NormalizeCompilerDirectiveCommentMarker(*preprocessed);
      preprocessed->ToLowerCase();
      SourceFormChange(preprocessed->ToString());
      preprocessed->ClipComment(true /* skip first ! */).Emit(cooked_);
      break;
    case LineClassification::Kind::Source:
      if (inFixedForm_) {
        if (preprocessed->HasBlanks(/*after column*/ 6)) {
          preprocessed->RemoveBlanks(/*after column*/ 6);
        }
      } else {
        if (preprocessed->HasRedundantBlanks()) {
          preprocessed->RemoveRedundantBlanks();
        }
      }
      preprocessed->ToLowerCase().ClipComment().Emit(cooked_);
      break;
    }
  } else {
    // No macro replacement took place; emit the tokens as scanned.
    tokens.ToLowerCase();
    if (line.kind == LineClassification::Kind::CompilerDirective) {
      SourceFormChange(tokens.ToString());
    }
    tokens.Emit(cooked_);
  }
  if (omitNewline_) {
    omitNewline_ = false;
  } else {
    cooked_.Put('\n', newlineProvenance);
  }
  directiveSentinel_ = nullptr;
}
227
TokenizePreprocessorDirective()228 TokenSequence Prescanner::TokenizePreprocessorDirective() {
229 CHECK(nextLine_ < limit_ && !inPreprocessorDirective_);
230 inPreprocessorDirective_ = true;
231 BeginSourceLineAndAdvance();
232 TokenSequence tokens;
233 while (NextToken(tokens)) {
234 }
235 inPreprocessorDirective_ = false;
236 return tokens;
237 }
238
NextLine()239 void Prescanner::NextLine() {
240 void *vstart{static_cast<void *>(const_cast<char *>(nextLine_))};
241 void *v{std::memchr(vstart, '\n', limit_ - nextLine_)};
242 if (v == nullptr) {
243 nextLine_ = limit_;
244 } else {
245 const char *nl{const_cast<const char *>(static_cast<char *>(v))};
246 nextLine_ = nl + 1;
247 }
248 }
249
LabelField(TokenSequence & token,int outCol)250 void Prescanner::LabelField(TokenSequence &token, int outCol) {
251 for (; *at_ != '\n' && column_ <= 6; ++at_) {
252 if (*at_ == '\t') {
253 ++at_;
254 column_ = 7;
255 break;
256 }
257 if (*at_ != ' ' &&
258 !(*at_ == '0' && column_ == 6)) { // '0' in column 6 becomes space
259 EmitChar(token, *at_);
260 ++outCol;
261 }
262 ++column_;
263 }
264 if (outCol > 1) {
265 token.CloseToken();
266 }
267 if (outCol < 7) {
268 if (outCol == 1) {
269 token.Put(" ", 6, sixSpaceProvenance_.start());
270 } else {
271 for (; outCol < 7; ++outCol) {
272 token.PutNextTokenChar(' ', spaceProvenance_);
273 }
274 token.CloseToken();
275 }
276 }
277 }
278
SkipToEndOfLine()279 void Prescanner::SkipToEndOfLine() {
280 while (*at_ != '\n') {
281 ++at_, ++column_;
282 }
283 }
284
MustSkipToEndOfLine() const285 bool Prescanner::MustSkipToEndOfLine() const {
286 if (inFixedForm_ && column_ > fixedFormColumnLimit_ && !tabInCurrentLine_) {
287 return true; // skip over ignored columns in right margin (73:80)
288 } else if (*at_ == '!' && !inCharLiteral_) {
289 return true; // inline comment goes to end of source line
290 } else {
291 return false;
292 }
293 }
294
NextChar()295 void Prescanner::NextChar() {
296 CHECK(*at_ != '\n');
297 ++at_, ++column_;
298 while (at_[0] == '\xef' && at_[1] == '\xbb' && at_[2] == '\xbf') {
299 // UTF-8 byte order mark - treat this file as UTF-8
300 at_ += 3;
301 encoding_ = Encoding::UTF_8;
302 }
303 if (inPreprocessorDirective_) {
304 SkipCComments();
305 } else {
306 bool mightNeedSpace{false};
307 if (MustSkipToEndOfLine()) {
308 SkipToEndOfLine();
309 } else {
310 mightNeedSpace = *at_ == '\n';
311 }
312 for (; Continuation(mightNeedSpace); mightNeedSpace = false) {
313 if (MustSkipToEndOfLine()) {
314 SkipToEndOfLine();
315 }
316 }
317 if (*at_ == '\t') {
318 tabInCurrentLine_ = true;
319 }
320 }
321 }
322
SkipCComments()323 void Prescanner::SkipCComments() {
324 while (true) {
325 if (IsCComment(at_)) {
326 if (const char *after{SkipCComment(at_)}) {
327 column_ += after - at_;
328 // May have skipped over one or more newlines; relocate the start of
329 // the next line.
330 nextLine_ = at_ = after;
331 NextLine();
332 } else {
333 // Don't emit any messages about unclosed C-style comments, because
334 // the sequence /* can appear legally in a FORMAT statement. There's
335 // no ambiguity, since the sequence */ cannot appear legally.
336 break;
337 }
338 } else if (inPreprocessorDirective_ && at_[0] == '\\' && at_ + 2 < limit_ &&
339 at_[1] == '\n' && nextLine_ < limit_) {
340 BeginSourceLineAndAdvance();
341 } else {
342 break;
343 }
344 }
345 }
346
SkipSpaces()347 void Prescanner::SkipSpaces() {
348 while (*at_ == ' ' || *at_ == '\t') {
349 NextChar();
350 }
351 insertASpace_ = false;
352 }
353
SkipWhiteSpace(const char * p)354 const char *Prescanner::SkipWhiteSpace(const char *p) {
355 while (*p == ' ' || *p == '\t') {
356 ++p;
357 }
358 return p;
359 }
360
SkipWhiteSpaceAndCComments(const char * p) const361 const char *Prescanner::SkipWhiteSpaceAndCComments(const char *p) const {
362 while (true) {
363 if (*p == ' ' || *p == '\t') {
364 ++p;
365 } else if (IsCComment(p)) {
366 if (const char *after{SkipCComment(p)}) {
367 p = after;
368 } else {
369 break;
370 }
371 } else {
372 break;
373 }
374 }
375 return p;
376 }
377
SkipCComment(const char * p) const378 const char *Prescanner::SkipCComment(const char *p) const {
379 char star{' '}, slash{' '};
380 p += 2;
381 while (star != '*' || slash != '/') {
382 if (p >= limit_) {
383 return nullptr; // signifies an unterminated comment
384 }
385 star = slash;
386 slash = *p++;
387 }
388 return p;
389 }
390
// Scans one token of the current statement, starting at at_, and appends it
// to `tokens`.  Returns false when at_ has reached the newline that ends the
// statement and true otherwise, including the case where the only thing
// emitted was a single space standing for free form white space.
bool Prescanner::NextToken(TokenSequence &tokens) {
  CHECK(at_ >= start_ && at_ < limit_);
  if (InFixedFormSource()) {
    SkipSpaces();
  } else {
    if (*at_ == '/' && IsCComment(at_)) {
      // Recognize and skip over classic C style /*comments*/ when
      // outside a character literal.
      if (features_.ShouldWarn(LanguageFeature::ClassicCComments)) {
        Say(GetProvenance(at_), "nonstandard usage: C-style comment"_en_US);
      }
      SkipCComments();
    }
    if (*at_ == ' ' || *at_ == '\t') {
      // Compress free-form white space into a single space character.
      const auto theSpace{at_};
      char previous{at_ <= start_ ? ' ' : at_[-1]};
      NextChar();
      SkipSpaces();
      if (*at_ == '\n') {
        // Discard white space at the end of a line.
      } else if (!inPreprocessorDirective_ &&
          (previous == '(' || *at_ == '(' || *at_ == ')')) {
        // Discard white space before/after '(' and before ')', unless in a
        // preprocessor directive.  This helps yield space-free contiguous
        // names for generic interfaces like OPERATOR( + ) and
        // READ ( UNFORMATTED ), without misinterpreting #define f (notAnArg).
        // This has the effect of silently ignoring the illegal spaces in
        // the array constructor ( /1,2/ ) but that seems benign; it's
        // hard to avoid that while still removing spaces from OPERATOR( / )
        // and OPERATOR( // ).
      } else {
        // Preserve the squashed white space as a single space character.
        tokens.PutNextTokenChar(' ', GetProvenance(theSpace));
        tokens.CloseToken();
        return true;
      }
    }
  }
  if (insertASpace_) {
    // A continuation line requested a separating space; it becomes the
    // first character of the token scanned below.
    tokens.PutNextTokenChar(' ', spaceProvenance_);
    insertASpace_ = false;
  }
  if (*at_ == '\n') {
    return false;
  }
  const char *start{at_};
  if (*at_ == '\'' || *at_ == '"') {
    QuotedCharacterLiteral(tokens, start);
    preventHollerith_ = false;
  } else if (IsDecimalDigit(*at_)) {
    // `n` accumulates the value of the digit string (no longer updated once
    // it reaches maxHollerith); `digits` counts the digits seen.
    int n{0}, digits{0};
    static constexpr int maxHollerith{256 /*lines*/ * (132 - 6 /*columns*/)};
    do {
      if (n < maxHollerith) {
        n = 10 * n + DecimalDigitValue(*at_);
      }
      EmitCharAndAdvance(tokens, *at_);
      ++digits;
      if (InFixedFormSource()) {
        SkipSpaces();
      }
    } while (IsDecimalDigit(*at_));
    if ((*at_ == 'h' || *at_ == 'H') && n > 0 && n < maxHollerith &&
        !preventHollerith_) {
      Hollerith(tokens, n, start);
    } else if (*at_ == '.') {
      while (IsDecimalDigit(EmitCharAndAdvance(tokens, *at_))) {
      }
      ExponentAndKind(tokens);
    } else if (ExponentAndKind(tokens)) {
    } else if (digits == 1 && n == 0 && (*at_ == 'x' || *at_ == 'X') &&
        inPreprocessorDirective_) {
      // "0x..." hexadecimal constant; recognized only in preprocessor
      // directives.
      do {
        EmitCharAndAdvance(tokens, *at_);
      } while (IsHexadecimalDigit(*at_));
    } else if (IsLetter(*at_)) {
      // Handles FORMAT(3I9HHOLLERITH) by skipping over the first I so that
      // we don't misrecognize I9HOLLERITH as an identifier in the next case.
      EmitCharAndAdvance(tokens, *at_);
    } else if (at_[0] == '_' && (at_[1] == '\'' || at_[1] == '"')) {
      // Digits, an underscore, and then a quoted character literal.
      EmitCharAndAdvance(tokens, *at_);
      QuotedCharacterLiteral(tokens, start);
    }
    preventHollerith_ = false;
  } else if (*at_ == '.') {
    char nch{EmitCharAndAdvance(tokens, '.')};
    if (!inPreprocessorDirective_ && IsDecimalDigit(nch)) {
      while (IsDecimalDigit(EmitCharAndAdvance(tokens, *at_))) {
      }
      ExponentAndKind(tokens);
    } else if (nch == '.' && EmitCharAndAdvance(tokens, '.') == '.') {
      EmitCharAndAdvance(tokens, '.');  // variadic macro definition ellipsis
    }
    preventHollerith_ = false;
  } else if (IsLegalInIdentifier(*at_)) {
    do {
    } while (IsLegalInIdentifier(EmitCharAndAdvance(tokens, *at_)));
    if (*at_ == '\'' || *at_ == '"') {
      // An identifier immediately followed by a quote becomes part of the
      // same token as the character literal.
      QuotedCharacterLiteral(tokens, start);
      preventHollerith_ = false;
    } else {
      // Subtle: Don't misrecognize labeled DO statement label as Hollerith
      // when the loop control variable starts with 'H'.
      preventHollerith_ = true;
    }
  } else if (*at_ == '*') {
    if (EmitCharAndAdvance(tokens, '*') == '*') {
      EmitCharAndAdvance(tokens, '*');
    } else {
      // Subtle ambiguity:
      //  CHARACTER*2H     declares H because *2 is a kind specifier
      //  DATAC/N*2H  /    is repeated Hollerith
      preventHollerith_ = !slashInCurrentLine_;
    }
  } else {
    char ch{*at_};
    // Track bracket nesting; the continuation line recognizers consult
    // delimiterNesting_.
    if (ch == '(' || ch == '[') {
      ++delimiterNesting_;
    } else if ((ch == ')' || ch == ']') && delimiterNesting_ > 0) {
      --delimiterNesting_;
    }
    char nch{EmitCharAndAdvance(tokens, ch)};
    preventHollerith_ = false;
    if ((nch == '=' &&
            (ch == '<' || ch == '>' || ch == '/' || ch == '=' || ch == '!')) ||
        (ch == nch &&
            (ch == '/' || ch == ':' || ch == '*' || ch == '#' || ch == '&' ||
                ch == '|' || ch == '<' || ch == '>')) ||
        (ch == '=' && nch == '>')) {
      // token comprises two characters
      EmitCharAndAdvance(tokens, nch);
    } else if (ch == '/') {
      slashInCurrentLine_ = true;
    }
  }
  tokens.CloseToken();
  return true;
}
530
ExponentAndKind(TokenSequence & tokens)531 bool Prescanner::ExponentAndKind(TokenSequence &tokens) {
532 char ed{ToLowerCaseLetter(*at_)};
533 if (ed != 'e' && ed != 'd') {
534 return false;
535 }
536 EmitCharAndAdvance(tokens, ed);
537 if (*at_ == '+' || *at_ == '-') {
538 EmitCharAndAdvance(tokens, *at_);
539 }
540 while (IsDecimalDigit(*at_)) {
541 EmitCharAndAdvance(tokens, *at_);
542 }
543 if (*at_ == '_') {
544 while (IsLegalInIdentifier(EmitCharAndAdvance(tokens, *at_))) {
545 }
546 }
547 return true;
548 }
549
// Scans a quoted character literal whose opening quote mark is at at_ and
// appends it to `tokens`.  `start` is where the enclosing token began (it may
// precede the quote when the literal has a prefix); it is used only for the
// provenance of the "Incomplete character literal" message.
void Prescanner::QuotedCharacterLiteral(
    TokenSequence &tokens, const char *start) {
  char quote{*at_};
  const char *end{at_ + 1};
  inCharLiteral_ = true;
  const auto emit{[&](char ch) { EmitChar(tokens, ch); }};
  const auto insert{[&](char ch) { EmitInsertedChar(tokens, ch); }};
  // True when the character just scanned is an escaping backslash.
  bool isEscaped{false};
  bool escapesEnabled{features_.IsEnabled(LanguageFeature::BackslashEscapes)};
  while (true) {
    if (*at_ == '\\') {
      if (escapesEnabled) {
        isEscaped = !isEscaped;
      } else {
        // The parser always processes escape sequences, so don't confuse it
        // when escapes are disabled.
        insert('\\');
      }
    } else {
      isEscaped = false;
    }
    EmitQuotedChar(static_cast<unsigned char>(*at_), emit, insert, false,
        Encoding::LATIN_1);
    while (PadOutCharacterLiteral(tokens)) {
    }
    if (*at_ == '\n') {
      // The line ended before the closing quote mark.
      if (!inPreprocessorDirective_) {
        Say(GetProvenanceRange(start, end),
            "Incomplete character literal"_err_en_US);
      }
      break;
    }
    end = at_ + 1;
    NextChar();
    if (*at_ == quote && !isEscaped) {
      // A doubled unescaped quote mark becomes a single instance of that
      // quote character in the literal (later).  There can be spaces between
      // the quotes in fixed form source.
      EmitChar(tokens, quote);
      inCharLiteral_ = false;  // for cases like print *, '...'!comment
      NextChar();
      if (InFixedFormSource()) {
        SkipSpaces();
      }
      if (*at_ != quote) {
        break;
      }
      inCharLiteral_ = true;
    }
  }
  inCharLiteral_ = false;
}
602
// Scans the body of a Hollerith literal.  On entry at_ is at the 'h' or 'H'
// that follows the count, `count` is the number of characters that the
// literal comprises, and `start` is where the token began (used only for
// message provenance).  Emits 'H' followed by the characters.
void Prescanner::Hollerith(
    TokenSequence &tokens, int count, const char *start) {
  inCharLiteral_ = true;
  CHECK(*at_ == 'h' || *at_ == 'H');
  EmitChar(tokens, 'H');
  while (count-- > 0) {
    if (PadOutCharacterLiteral(tokens)) {
      // A padding blank was emitted and counts as one character.
    } else if (*at_ == '\n') {
      Say(GetProvenanceRange(start, at_),
          "Possible truncated Hollerith literal"_en_US);
      break;
    } else {
      NextChar();
      // Each multi-byte character encoding counts as a single character.
      // No escape sequences are recognized.
      // Hollerith is always emitted to the cooked character
      // stream in UTF-8.
      DecodedCharacter decoded{DecodeCharacter(
          encoding_, at_, static_cast<std::size_t>(limit_ - at_), false)};
      if (decoded.bytes > 0) {
        EncodedCharacter utf8{
            EncodeCharacter<Encoding::UTF_8>(decoded.codepoint)};
        for (int j{0}; j < utf8.bytes; ++j) {
          EmitChar(tokens, utf8.buffer[j]);
        }
        // Leave at_ on the last byte of the character just consumed.
        at_ += decoded.bytes - 1;
      } else {
        Say(GetProvenanceRange(start, at_),
            "Bad character in Hollerith literal"_err_en_US);
        break;
      }
    }
  }
  if (*at_ != '\n') {
    NextChar();
  }
  inCharLiteral_ = false;
}
641
// In fixed form, source card images must be processed as if they were at
// least 72 columns wide, at least in character literal contexts.
//
// Called while scanning a character or Hollerith literal.  When the current
// fixed form line (one without tabs) ends right after at_:
//  - if the column limit has not been reached, one padding blank is appended
//    to `tokens` and true is returned;
//  - otherwise the scan moves to a continuation line, if there is one, and
//    is positioned at its column 6 so that the check can repeat there.
// Returns false in every case in which no padding blank was emitted.
bool Prescanner::PadOutCharacterLiteral(TokenSequence &tokens) {
  while (inFixedForm_ && !tabInCurrentLine_ && at_[1] == '\n') {
    if (column_ < fixedFormColumnLimit_) {
      tokens.PutNextTokenChar(' ', spaceProvenance_);
      ++column_;
      return true;
    }
    if (!FixedFormContinuation(false /*no need to insert space*/) ||
        tabInCurrentLine_) {
      return false;
    }
    CHECK(column_ == 7);
    --at_;  // point to column 6 of continuation line
    column_ = 6;
  }
  return false;
}
661
IsFixedFormCommentLine(const char * start) const662 bool Prescanner::IsFixedFormCommentLine(const char *start) const {
663 const char *p{start};
664 if (IsFixedFormCommentChar(*p) || *p == '%' || // VAX %list, %eject, &c.
665 ((*p == 'D' || *p == 'd') &&
666 !features_.IsEnabled(LanguageFeature::OldDebugLines))) {
667 return true;
668 }
669 bool anyTabs{false};
670 while (true) {
671 if (*p == ' ') {
672 ++p;
673 } else if (*p == '\t') {
674 anyTabs = true;
675 ++p;
676 } else if (*p == '0' && !anyTabs && p == start + 5) {
677 ++p; // 0 in column 6 must treated as a space
678 } else {
679 break;
680 }
681 }
682 if (!anyTabs && p >= start + fixedFormColumnLimit_) {
683 return true;
684 }
685 if (*p == '!' && !inCharLiteral_ && (anyTabs || p != start + 5)) {
686 return true;
687 }
688 return *p == '\n';
689 }
690
IsFreeFormComment(const char * p) const691 const char *Prescanner::IsFreeFormComment(const char *p) const {
692 p = SkipWhiteSpaceAndCComments(p);
693 if (*p == '!' || *p == '\n') {
694 return p;
695 } else {
696 return nullptr;
697 }
698 }
699
IsIncludeLine(const char * start) const700 std::optional<std::size_t> Prescanner::IsIncludeLine(const char *start) const {
701 const char *p{SkipWhiteSpace(start)};
702 for (char ch : "include"s) {
703 if (ToLowerCaseLetter(*p++) != ch) {
704 return std::nullopt;
705 }
706 }
707 p = SkipWhiteSpace(p);
708 if (*p == '"' || *p == '\'') {
709 return {p - start};
710 }
711 return std::nullopt;
712 }
713
// Processes a Fortran INCLUDE line.  `firstQuote` points at or before the
// quote mark that opens the path name.  The path is extracted (a doubled
// quote mark stands for one quote character), the file is opened with the
// directory of the current source file pushed onto the search path, and a
// nested Prescanner scans its contents.  Errors are reported via Say().
void Prescanner::FortranInclude(const char *firstQuote) {
  const char *p{firstQuote};
  while (*p != '"' && *p != '\'') {
    ++p;
  }
  char quote{*p};
  std::string path;
  for (++p; *p != '\n'; ++p) {
    if (*p == quote) {
      if (p[1] != quote) {
        break;  // closing quote mark
      }
      ++p;  // doubled quote mark: keep one
    }
    path += *p;
  }
  if (*p != quote) {
    // Reached the end of the line without a closing quote mark.
    Say(GetProvenanceRange(firstQuote, p),
        "malformed path name string"_err_en_US);
    return;
  }
  p = SkipWhiteSpace(p + 1);
  if (*p != '\n' && *p != '!') {
    // Anything other than a comment after the path draws a message, but the
    // include is still processed.
    const char *garbage{p};
    for (; *p != '\n' && *p != '!'; ++p) {
    }
    Say(GetProvenanceRange(garbage, p),
        "excess characters after path name"_en_US);
  }
  std::stringstream error;
  Provenance provenance{GetProvenance(nextLine_)};
  AllSources &allSources{cooked_.allSources()};
  const SourceFile *currentFile{allSources.GetSourceFile(provenance)};
  if (currentFile != nullptr) {
    allSources.PushSearchPathDirectory(DirectoryName(currentFile->path()));
  }
  const SourceFile *included{allSources.Open(path, &error)};
  if (currentFile != nullptr) {
    allSources.PopSearchPathDirectory();
  }
  if (included == nullptr) {
    Say(provenance, "INCLUDE: %s"_err_en_US, error.str());
  } else if (included->bytes() > 0) {
    // Empty files are skipped.  Otherwise scan the file with a nested
    // prescanner that uses the included file's own encoding.
    ProvenanceRange includeLineRange{
        provenance, static_cast<std::size_t>(p - nextLine_)};
    ProvenanceRange fileRange{
        allSources.AddIncludedFile(*included, includeLineRange)};
    Prescanner{*this}.set_encoding(included->encoding()).Prescan(fileRange);
  }
}
764
IsPreprocessorDirectiveLine(const char * start) const765 const char *Prescanner::IsPreprocessorDirectiveLine(const char *start) const {
766 const char *p{start};
767 for (; *p == ' '; ++p) {
768 }
769 if (*p == '#') {
770 if (inFixedForm_ && p == start + 5) {
771 return nullptr;
772 }
773 } else {
774 p = SkipWhiteSpace(p);
775 if (*p != '#') {
776 return nullptr;
777 }
778 }
779 return SkipWhiteSpace(p + 1);
780 }
781
IsNextLinePreprocessorDirective() const782 bool Prescanner::IsNextLinePreprocessorDirective() const {
783 return IsPreprocessorDirectiveLine(nextLine_) != nullptr;
784 }
785
SkipCommentLine(bool afterAmpersand)786 bool Prescanner::SkipCommentLine(bool afterAmpersand) {
787 if (nextLine_ >= limit_) {
788 if (afterAmpersand && prescannerNesting_ > 0) {
789 // A continuation marker at the end of the last line in an
790 // include file inhibits the newline for that line.
791 SkipToEndOfLine();
792 omitNewline_ = true;
793 }
794 return false;
795 }
796 auto lineClass{ClassifyLine(nextLine_)};
797 if (lineClass.kind == LineClassification::Kind::Comment) {
798 NextLine();
799 return true;
800 } else if (inPreprocessorDirective_) {
801 return false;
802 } else if (lineClass.kind ==
803 LineClassification::Kind::ConditionalCompilationDirective ||
804 lineClass.kind == LineClassification::Kind::PreprocessorDirective) {
805 // Allow conditional compilation directives (e.g., #ifdef) to affect
806 // continuation lines.
807 // Allow other preprocessor directives, too, except #include
808 // (when it does not follow '&'), #define, and #undef (because
809 // they cannot be allowed to affect preceding text on a
810 // continued line).
811 preprocessor_.Directive(TokenizePreprocessorDirective(), this);
812 return true;
813 } else if (afterAmpersand &&
814 (lineClass.kind == LineClassification::Kind::IncludeDirective ||
815 lineClass.kind == LineClassification::Kind::IncludeLine)) {
816 SkipToEndOfLine();
817 omitNewline_ = true;
818 skipLeadingAmpersand_ = true;
819 return false;
820 } else {
821 return false;
822 }
823 }
824
// Determines whether the line at nextLine_ continues the current fixed form
// statement or compiler directive.  Returns a pointer to the position on
// that line at which scanning should resume, or nullptr when the line is not
// a continuation line.  `mightNeedSpace` allows a space to be requested (via
// insertASpace_) between the two lines of a continued compiler directive.
// Resets tabInCurrentLine_ as a side effect.
const char *Prescanner::FixedFormContinuationLine(bool mightNeedSpace) {
  if (nextLine_ >= limit_) {
    return nullptr;
  }
  tabInCurrentLine_ = false;
  char col1{*nextLine_};
  if (InCompilerDirective()) {
    // Must be a continued compiler directive.
    if (!IsFixedFormCommentChar(col1)) {
      return nullptr;
    }
    // Columns 2-5 must hold the same sentinel (compared in lower case),
    // padded with blanks.
    int j{1};
    for (; j < 5; ++j) {
      char ch{directiveSentinel_[j - 1]};
      if (ch == '\0') {
        break;
      }
      if (ch != ToLowerCaseLetter(nextLine_[j])) {
        return nullptr;
      }
    }
    for (; j < 5; ++j) {
      if (nextLine_[j] != ' ') {
        return nullptr;
      }
    }
    // Column 6 must hold a continuation marker: anything other than a
    // newline, tab, blank, or '0'.
    char col6{nextLine_[5]};
    if (col6 != '\n' && col6 != '\t' && col6 != ' ' && col6 != '0') {
      if (nextLine_[6] != ' ' && mightNeedSpace) {
        insertASpace_ = true;
      }
      return nextLine_ + 6;
    }
    return nullptr;
  } else {
    // Normal case: not in a compiler directive.
    if (col1 == '&' &&
        features_.IsEnabled(
            LanguageFeature::FixedFormContinuationWithColumn1Ampersand)) {
      // Extension: '&' as continuation marker
      if (features_.ShouldWarn(
              LanguageFeature::FixedFormContinuationWithColumn1Ampersand)) {
        Say(GetProvenance(nextLine_), "nonstandard usage"_en_US);
      }
      return nextLine_ + 1;
    }
    if (col1 == '\t' && nextLine_[1] >= '1' && nextLine_[1] <= '9') {
      tabInCurrentLine_ = true;
      return nextLine_ + 2;  // VAX extension
    }
    // Standard form: blanks in columns 1-5 and a continuation marker in
    // column 6.
    if (col1 == ' ' && nextLine_[1] == ' ' && nextLine_[2] == ' ' &&
        nextLine_[3] == ' ' && nextLine_[4] == ' ') {
      char col6{nextLine_[5]};
      if (col6 != '\n' && col6 != '\t' && col6 != ' ' && col6 != '0') {
        return nextLine_ + 6;
      }
    }
    // Inside unclosed parentheses or brackets, any line that does not begin
    // with a comment character is taken as a continuation from column 1.
    if (delimiterNesting_ > 0) {
      if (!IsFixedFormCommentChar(col1)) {
        return nextLine_;
      }
    }
  }
  return nullptr;  // not a continuation line
}
890
// Determines whether the line at nextLine_ continues the current free form
// statement or compiler directive.  `ampersand` is true when the line being
// continued ended with '&'.  Returns a pointer to the position on the next
// line at which scanning should resume, or nullptr when the line is not a
// continuation line.  May set insertASpace_ to request a separating space.
const char *Prescanner::FreeFormContinuationLine(bool ampersand) {
  const char *p{nextLine_};
  if (p >= limit_) {
    return nullptr;
  }
  p = SkipWhiteSpace(p);
  if (InCompilerDirective()) {
    // The continuation of a directive must begin with '!' and the same
    // sentinel (compared in lower case).
    if (*p++ != '!') {
      return nullptr;
    }
    for (const char *s{directiveSentinel_}; *s != '\0'; ++p, ++s) {
      if (*s != ToLowerCaseLetter(*p)) {
        return nullptr;
      }
    }
    p = SkipWhiteSpace(p);
    if (*p == '&') {
      if (!ampersand) {
        insertASpace_ = true;
      }
      return p + 1;
    } else if (ampersand) {
      return p;
    } else {
      return nullptr;
    }
  } else {
    if (*p == '&') {
      return p + 1;  // resume after the leading '&'
    } else if (*p == '!' || *p == '\n' || *p == '#') {
      // Comment, blank, and '#' lines are not continuation lines.
      return nullptr;
    } else if (ampersand || delimiterNesting_ > 0) {
      // No leading '&': keep one separating character, either the last
      // white space character before the text or an inserted space.
      if (p > nextLine_) {
        --p;
      } else {
        insertASpace_ = true;
      }
      return p;
    } else {
      return nullptr;
    }
  }
}
934
FixedFormContinuation(bool mightNeedSpace)935 bool Prescanner::FixedFormContinuation(bool mightNeedSpace) {
936 // N.B. We accept '&' as a continuation indicator in fixed form, too,
937 // but not in a character literal.
938 if (*at_ == '&' && inCharLiteral_) {
939 return false;
940 }
941 do {
942 if (const char *cont{FixedFormContinuationLine(mightNeedSpace)}) {
943 BeginSourceLine(cont);
944 column_ = 7;
945 NextLine();
946 return true;
947 }
948 } while (SkipCommentLine(false /* not after ampersand */));
949 return false;
950 }
951
FreeFormContinuation()952 bool Prescanner::FreeFormContinuation() {
953 const char *p{at_};
954 bool ampersand{*p == '&'};
955 if (ampersand) {
956 p = SkipWhiteSpace(p + 1);
957 }
958 if (*p != '\n') {
959 if (inCharLiteral_) {
960 return false;
961 } else if (*p != '!' &&
962 features_.ShouldWarn(LanguageFeature::CruftAfterAmpersand)) {
963 Say(GetProvenance(p), "missing ! before comment after &"_en_US);
964 }
965 }
966 do {
967 if (const char *cont{FreeFormContinuationLine(ampersand)}) {
968 BeginSourceLine(cont);
969 NextLine();
970 return true;
971 }
972 } while (SkipCommentLine(ampersand));
973 return false;
974 }
975
Continuation(bool mightNeedFixedFormSpace)976 bool Prescanner::Continuation(bool mightNeedFixedFormSpace) {
977 if (*at_ == '\n' || *at_ == '&') {
978 if (inFixedForm_) {
979 return FixedFormContinuation(mightNeedFixedFormSpace);
980 } else {
981 return FreeFormContinuation();
982 }
983 } else {
984 return false;
985 }
986 }
987
// Recognizes the initial line of a fixed form compiler directive: a comment
// character in column 1 followed in columns 2-5 by a registered sentinel
// (blanks are skipped; letters are compared in lower case).  On success
// returns a CompilerDirective classification carrying the payload offset and
// a pointer to the registered sentinel; otherwise returns nothing.
std::optional<Prescanner::LineClassification>
Prescanner::IsFixedFormCompilerDirectiveLine(const char *start) const {
  const char *p{start};
  char col1{*p++};
  if (!IsFixedFormCommentChar(col1)) {
    return std::nullopt;
  }
  // At most four sentinel characters (columns 2-5) plus a terminating NUL.
  char sentinel[5], *sp{sentinel};
  int column{2};
  for (; column < 6; ++column, ++p) {
    if (*p != ' ') {
      if (*p == '\n' || *p == '\t') {
        break;
      }
      if (sp == sentinel + 1 && sentinel[0] == '$' && IsDecimalDigit(*p)) {
        // OpenMP conditional compilation line: leave the label alone
        break;
      }
      *sp++ = ToLowerCaseLetter(*p);
    }
  }
  if (column == 6) {
    if (*p == ' ' || *p == '\t' || *p == '0') {
      ++p;
    } else {
      // This is a Continuation line, not an initial directive line.
      return std::nullopt;
    }
  }
  if (sp == sentinel) {
    return std::nullopt;  // no sentinel characters at all
  }
  *sp = '\0';
  if (const char *ss{IsCompilerDirectiveSentinel(sentinel)}) {
    std::size_t payloadOffset = p - start;
    return {LineClassification{
        LineClassification::Kind::CompilerDirective, payloadOffset, ss}};
  }
  return std::nullopt;
}
1028
// Recognizes a free form compiler directive line: optional white space, '!',
// and then a registered sentinel (compared in lower case) that is ended by a
// blank, tab, or '&'.  On success returns a CompilerDirective classification
// carrying the payload offset and a pointer to the registered sentinel;
// otherwise returns nothing.
std::optional<Prescanner::LineClassification>
Prescanner::IsFreeFormCompilerDirectiveLine(const char *start) const {
  // Room for at most seven sentinel characters plus a terminating NUL.
  char sentinel[8];
  const char *p{SkipWhiteSpace(start)};
  if (*p++ != '!') {
    return std::nullopt;
  }
  for (std::size_t j{0}; j + 1 < sizeof sentinel; ++p, ++j) {
    if (*p == '\n') {
      break;
    }
    if (*p == ' ' || *p == '\t' || *p == '&') {
      if (j == 0) {
        break;  // nothing between the '!' and the delimiter
      }
      sentinel[j] = '\0';
      p = SkipWhiteSpace(p + 1);
      if (*p == '!') {
        break;  // only a comment follows the candidate sentinel
      }
      if (const char *sp{IsCompilerDirectiveSentinel(sentinel)}) {
        std::size_t offset = p - start;
        return {LineClassification{
            LineClassification::Kind::CompilerDirective, offset, sp}};
      }
      break;
    }
    sentinel[j] = ToLowerCaseLetter(*p);
  }
  return std::nullopt;
}
1060
AddCompilerDirectiveSentinel(const std::string & dir)1061 Prescanner &Prescanner::AddCompilerDirectiveSentinel(const std::string &dir) {
1062 std::uint64_t packed{0};
1063 for (char ch : dir) {
1064 packed = (packed << 8) | (ToLowerCaseLetter(ch) & 0xff);
1065 }
1066 compilerDirectiveBloomFilter_.set(packed % prime1);
1067 compilerDirectiveBloomFilter_.set(packed % prime2);
1068 compilerDirectiveSentinels_.insert(dir);
1069 return *this;
1070 }
1071
IsCompilerDirectiveSentinel(const char * sentinel) const1072 const char *Prescanner::IsCompilerDirectiveSentinel(
1073 const char *sentinel) const {
1074 std::uint64_t packed{0};
1075 std::size_t n{0};
1076 for (; sentinel[n] != '\0'; ++n) {
1077 packed = (packed << 8) | (sentinel[n] & 0xff);
1078 }
1079 if (n == 0 || !compilerDirectiveBloomFilter_.test(packed % prime1) ||
1080 !compilerDirectiveBloomFilter_.test(packed % prime2)) {
1081 return nullptr;
1082 }
1083 const auto iter{compilerDirectiveSentinels_.find(std::string(sentinel, n))};
1084 return iter == compilerDirectiveSentinels_.end() ? nullptr : iter->data();
1085 }
1086
ClassifyLine(const char * start) const1087 Prescanner::LineClassification Prescanner::ClassifyLine(
1088 const char *start) const {
1089 if (inFixedForm_) {
1090 if (std::optional<LineClassification> lc{
1091 IsFixedFormCompilerDirectiveLine(start)}) {
1092 return std::move(*lc);
1093 }
1094 if (IsFixedFormCommentLine(start)) {
1095 return {LineClassification::Kind::Comment};
1096 }
1097 } else {
1098 if (std::optional<LineClassification> lc{
1099 IsFreeFormCompilerDirectiveLine(start)}) {
1100 return std::move(*lc);
1101 }
1102 if (const char *bang{IsFreeFormComment(start)}) {
1103 return {LineClassification::Kind::Comment,
1104 static_cast<std::size_t>(bang - start)};
1105 }
1106 }
1107 if (std::optional<std::size_t> quoteOffset{IsIncludeLine(start)}) {
1108 return {LineClassification::Kind::IncludeLine, *quoteOffset};
1109 }
1110 if (const char *dir{IsPreprocessorDirectiveLine(start)}) {
1111 if (std::memcmp(dir, "if", 2) == 0 || std::memcmp(dir, "elif", 4) == 0 ||
1112 std::memcmp(dir, "else", 4) == 0 || std::memcmp(dir, "endif", 5) == 0) {
1113 return {LineClassification::Kind::ConditionalCompilationDirective};
1114 } else if (std::memcmp(dir, "include", 7) == 0) {
1115 return {LineClassification::Kind::IncludeDirective};
1116 } else if (std::memcmp(dir, "define", 6) == 0 ||
1117 std::memcmp(dir, "undef", 5) == 0) {
1118 return {LineClassification::Kind::DefinitionDirective};
1119 } else {
1120 return {LineClassification::Kind::PreprocessorDirective};
1121 }
1122 }
1123 return {LineClassification::Kind::Source};
1124 }
1125
SourceFormChange(std::string && dir)1126 void Prescanner::SourceFormChange(std::string &&dir) {
1127 if (dir == "!dir$ free") {
1128 inFixedForm_ = false;
1129 } else if (dir == "!dir$ fixed") {
1130 inFixedForm_ = true;
1131 }
1132 }
1133 }
1134