1 //===-- include/flang/Parser/characters.h -----------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8
9 #ifndef FORTRAN_PARSER_CHARACTERS_H_
10 #define FORTRAN_PARSER_CHARACTERS_H_
11
// Define some character classification predicates and
// conversions here to avoid dependences upon <cctype> and
// also to accommodate Fortran tokenization.
15
#include <cstddef>
#include <cstdint>
#include <optional>
#include <string>
19
20 namespace Fortran::parser {
21
22 extern bool useHexadecimalEscapeSequences;
23
// Fortran program source is readily supported in any character set whose
// first 128 code points correspond to ASCII codes 0-127 (ISO/IEC 646).
// The specific encodings that can be handled are enumerated here.
enum class Encoding {
  LATIN_1, // ISO 8859-1 Latin-1
  UTF_8, // Multi-byte encoding of Unicode (ISO/IEC 10646)
};
30
// True when ch lies in the range 'A' through 'Z'.
inline constexpr bool IsUpperCaseLetter(char ch) {
  return !(ch < 'A' || ch > 'Z');
}
34
// True when ch lies in the range 'a' through 'z'.
inline constexpr bool IsLowerCaseLetter(char ch) {
  return !(ch < 'a' || ch > 'z');
}
38
IsLetter(char ch)39 inline constexpr bool IsLetter(char ch) {
40 return IsUpperCaseLetter(ch) || IsLowerCaseLetter(ch);
41 }
42
// True when ch lies in the range '0' through '9'.
inline constexpr bool IsDecimalDigit(char ch) {
  return !(ch < '0' || ch > '9');
}
44
// True when ch is a decimal digit or a letter in 'A'-'F' or 'a'-'f'.
inline constexpr bool IsHexadecimalDigit(char ch) {
  if (ch >= '0' && ch <= '9') {
    return true;
  }
  if (ch >= 'a' && ch <= 'f') {
    return true;
  }
  return ch >= 'A' && ch <= 'F';
}
49
// True when ch lies in the range '0' through '7'.
inline constexpr bool IsOctalDigit(char ch) {
  return !(ch < '0' || ch > '7');
}
51
IsLegalIdentifierStart(char ch)52 inline constexpr bool IsLegalIdentifierStart(char ch) {
53 return IsLetter(ch) || ch == '_' || ch == '@' || ch == '$';
54 }
55
IsLegalInIdentifier(char ch)56 inline constexpr bool IsLegalInIdentifier(char ch) {
57 return IsLegalIdentifierStart(ch) || IsDecimalDigit(ch);
58 }
59
ToLowerCaseLetter(char ch)60 inline constexpr char ToLowerCaseLetter(char ch) {
61 return IsUpperCaseLetter(ch) ? ch - 'A' + 'a' : ch;
62 }
63
ToLowerCaseLetter(char && ch)64 inline constexpr char ToLowerCaseLetter(char &&ch) {
65 return IsUpperCaseLetter(ch) ? ch - 'A' + 'a' : ch;
66 }
67
ToLowerCaseLetters(const std::string & str)68 inline std::string ToLowerCaseLetters(const std::string &str) {
69 std::string lowered{str};
70 for (char &ch : lowered) {
71 ch = ToLowerCaseLetter(ch);
72 }
73 return lowered;
74 }
75
ToUpperCaseLetter(char ch)76 inline constexpr char ToUpperCaseLetter(char ch) {
77 return IsLowerCaseLetter(ch) ? ch - 'a' + 'A' : ch;
78 }
79
ToUpperCaseLetter(char && ch)80 inline constexpr char ToUpperCaseLetter(char &&ch) {
81 return IsLowerCaseLetter(ch) ? ch - 'a' + 'A' : ch;
82 }
83
ToUpperCaseLetters(const std::string & str)84 inline std::string ToUpperCaseLetters(const std::string &str) {
85 std::string raised{str};
86 for (char &ch : raised) {
87 ch = ToUpperCaseLetter(ch);
88 }
89 return raised;
90 }
91
IsSameApartFromCase(char x,char y)92 inline constexpr bool IsSameApartFromCase(char x, char y) {
93 return ToLowerCaseLetter(x) == ToLowerCaseLetter(y);
94 }
95
// Returns the offset of ch from '0'.  No validation is performed; the
// result is the digit's numeric value only when ch is a decimal digit.
inline constexpr char DecimalDigitValue(char ch) {
  return static_cast<char>(ch - '0');
}
97
HexadecimalDigitValue(char ch)98 inline constexpr char HexadecimalDigitValue(char ch) {
99 return IsUpperCaseLetter(ch) ? ch - 'A' + 10
100 : IsLowerCaseLetter(ch) ? ch - 'a' + 10
101 : DecimalDigitValue(ch);
102 }
103
// Given the character that follows a backslash, returns the character that
// the escape sequence denotes, or std::nullopt when the sequence is not a
// recognized single-character escape.
inline constexpr std::optional<char> BackslashEscapeValue(char ch) {
  // Quotation marks and the backslash itself stand for themselves.
  if (ch == '"' || ch == '\'' || ch == '\\') {
    return ch;
  }
  switch (ch) {
  case 'b':
    return '\b';
  case 'f':
    return '\f';
  case 'n':
    return '\n';
  case 'r':
    return '\r';
  case 't':
    return '\t';
  case 'v':
    return '\v';
  default:
    // This includes 'a': '\a' is not supported; PGF90 doesn't know \a
    return std::nullopt;
  }
}
128
// Given a character value, returns the character that should follow a
// backslash in order to denote it as an escape sequence, or std::nullopt
// when the value has no single-character escape.
inline constexpr std::optional<char> BackslashEscapeChar(char ch) {
  // Quotation marks and the backslash itself stand for themselves.
  if (ch == '"' || ch == '\'' || ch == '\\') {
    return ch;
  }
  switch (ch) {
  case '\b':
    return 'b';
  case '\f':
    return 'f';
  case '\n':
    return 'n';
  case '\r':
    return 'r';
  case '\t':
    return 't';
  case '\v':
    return 'v';
  default:
    // This includes '\a': it has no escape here; PGF90 doesn't know \a
    return std::nullopt;
  }
}
153
154 // Does not include spaces or line ending characters.
IsValidFortranTokenCharacter(char ch)155 inline constexpr bool IsValidFortranTokenCharacter(char ch) {
156 switch (ch) {
157 case '"':
158 case '%':
159 case '\'':
160 case '(':
161 case ')':
162 case '*':
163 case '+':
164 case ',':
165 case '-':
166 case '.':
167 case '/':
168 case ':':
169 case ';':
170 case '<':
171 case '=':
172 case '>':
173 case '[':
174 case ']':
175 return true;
176 default:
177 return IsLegalIdentifierStart(ch) || IsDecimalDigit(ch);
178 }
179 }
180
// Holds the bytes produced by encoding a single character.
struct EncodedCharacter {
  // Capacity of the byte buffer.
  static constexpr int maxEncodingBytes = 6;
  // Storage for the encoded bytes; not initialized by default.
  char buffer[maxEncodingBytes];
  // Number of bytes of the buffer that are in use; zero by default.
  int bytes = 0;
};
186
187 template <Encoding ENCODING> EncodedCharacter EncodeCharacter(char32_t ucs);
188 template <> EncodedCharacter EncodeCharacter<Encoding::LATIN_1>(char32_t);
189 template <> EncodedCharacter EncodeCharacter<Encoding::UTF_8>(char32_t);
190
191 EncodedCharacter EncodeCharacter(Encoding, char32_t ucs);
192
193 template <Encoding ENCODING, typename STRING>
194 std::string EncodeString(const STRING &);
195 extern template std::string EncodeString<Encoding::LATIN_1, std::string>(
196 const std::string &);
197 extern template std::string EncodeString<Encoding::UTF_8, std::u32string>(
198 const std::u32string &);
199
200 // EmitQuotedChar drives callbacks "emit" and "insert" to output the
201 // bytes of an encoding for a codepoint.
202 template <typename NORMAL, typename INSERTED>
203 void EmitQuotedChar(char32_t ch, const NORMAL &emit, const INSERTED &insert,
204 bool backslashEscapes = true, Encoding encoding = Encoding::UTF_8) {
205 auto emitOneByte{[&](std::uint8_t ch) {
206 if (backslashEscapes && (ch < ' ' || ch >= 0x7f || ch == '\\')) {
207 if (std::optional<char> escape{BackslashEscapeChar(ch)}) {
208 insert('\\');
209 emit(*escape);
210 } else if (useHexadecimalEscapeSequences) {
211 insert('\\');
212 insert('x');
213 int top{ch >> 4}, bottom{ch & 0xf};
214 insert(top > 9 ? 'a' + top - 10 : '0' + top);
215 insert(bottom > 9 ? 'a' + bottom - 10 : '0' + bottom);
216 } else {
217 // octal escape sequence; always emit 3 digits to avoid ambiguity
218 insert('\\');
219 insert('0' + (ch >> 6));
220 insert('0' + ((ch >> 3) & 7));
221 insert('0' + (ch & 7));
222 }
223 } else if (ch == '\n') { // always escape newlines
224 insert('\\');
225 insert('n');
226 } else {
227 emit(ch);
228 }
229 }};
230 if (ch <= 0x7f) {
231 emitOneByte(ch);
232 } else {
233 EncodedCharacter encoded{EncodeCharacter(encoding, ch)};
234 for (int j{0}; j < encoded.bytes; ++j) {
235 emitOneByte(encoded.buffer[j]);
236 }
237 }
238 }
239
240 std::string QuoteCharacterLiteral(const std::string &,
241 bool backslashEscapes = true, Encoding = Encoding::LATIN_1);
242 std::string QuoteCharacterLiteral(const std::u16string &,
243 bool backslashEscapes = true, Encoding = Encoding::UTF_8);
244 std::string QuoteCharacterLiteral(const std::u32string &,
245 bool backslashEscapes = true, Encoding = Encoding::UTF_8);
246
247 int UTF_8CharacterBytes(const char *);
248
// Result of decoding one character from a sequence of bytes.
struct DecodedCharacter {
  // The decoded code point; zero by default.
  char32_t codepoint = 0;
  // Byte count; the default of zero signifies failure.
  int bytes = 0;
};
253
254 template <Encoding ENCODING>
255 DecodedCharacter DecodeRawCharacter(const char *, std::size_t);
256 template <>
257 DecodedCharacter DecodeRawCharacter<Encoding::LATIN_1>(
258 const char *, std::size_t);
259
260 template <>
261 DecodedCharacter DecodeRawCharacter<Encoding::UTF_8>(const char *, std::size_t);
262
263 // DecodeCharacter optionally handles backslash escape sequences, too.
264 template <Encoding ENCODING>
265 DecodedCharacter DecodeCharacter(
266 const char *, std::size_t, bool backslashEscapes);
267 extern template DecodedCharacter DecodeCharacter<Encoding::LATIN_1>(
268 const char *, std::size_t, bool);
269 extern template DecodedCharacter DecodeCharacter<Encoding::UTF_8>(
270 const char *, std::size_t, bool);
271
272 DecodedCharacter DecodeCharacter(
273 Encoding, const char *, std::size_t, bool backslashEscapes);
274
275 template <typename RESULT, Encoding ENCODING>
276 RESULT DecodeString(const std::string &, bool backslashEscapes);
277 extern template std::string DecodeString<std::string, Encoding::LATIN_1>(
278 const std::string &, bool);
279 extern template std::u16string DecodeString<std::u16string, Encoding::UTF_8>(
280 const std::string &, bool);
281 extern template std::u32string DecodeString<std::u32string, Encoding::UTF_8>(
282 const std::string &, bool);
283 } // namespace Fortran::parser
284 #endif // FORTRAN_PARSER_CHARACTERS_H_
285