1 // Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #ifndef FORTRAN_PARSER_CHARACTERS_H_
16 #define FORTRAN_PARSER_CHARACTERS_H_
17
// Define some character classification predicates and
// conversions here to avoid dependencies upon <cctype> and
// also to accommodate Fortran tokenization.
21
#include <cstddef>
#include <cstdint>
#include <optional>
#include <string>
25
26 namespace Fortran::parser {
27
// We can easily support Fortran program source in any character
// set whose first 128 code points correspond to ASCII codes 0-127
// (ISO/IEC 646).  The specific encodings that we can handle include:
//   LATIN_1: ISO 8859-1 Latin-1
//   UTF_8: Multi-byte encoding of Unicode (ISO/IEC 10646)
enum class Encoding { LATIN_1, UTF_8 };
34
// True when ch lies in the range 'A' through 'Z', inclusive.
inline constexpr bool IsUpperCaseLetter(char ch) {
  return !(ch < 'A' || ch > 'Z');
}
38
// True when ch lies in the range 'a' through 'z', inclusive.
inline constexpr bool IsLowerCaseLetter(char ch) {
  return !(ch < 'a' || ch > 'z');
}
42
IsLetter(char ch)43 inline constexpr bool IsLetter(char ch) {
44 return IsUpperCaseLetter(ch) || IsLowerCaseLetter(ch);
45 }
46
// True when ch lies in the range '0' through '9', inclusive.
inline constexpr bool IsDecimalDigit(char ch) {
  return !(ch < '0' || ch > '9');
}
48
// True when ch is a decimal digit or one of the letters A-F in either case.
inline constexpr bool IsHexadecimalDigit(char ch) {
  if ('0' <= ch && ch <= '9') {
    return true;
  }
  if ('A' <= ch && ch <= 'F') {
    return true;
  }
  return 'a' <= ch && ch <= 'f';
}
53
// True when ch lies in the range '0' through '7', inclusive.
inline constexpr bool IsOctalDigit(char ch) {
  return !(ch < '0' || ch > '7');
}
55
IsLegalIdentifierStart(char ch)56 inline constexpr bool IsLegalIdentifierStart(char ch) {
57 return IsLetter(ch) || ch == '_' || ch == '@' || ch == '$';
58 }
59
IsLegalInIdentifier(char ch)60 inline constexpr bool IsLegalInIdentifier(char ch) {
61 return IsLegalIdentifierStart(ch) || IsDecimalDigit(ch);
62 }
63
ToLowerCaseLetter(char ch)64 inline constexpr char ToLowerCaseLetter(char ch) {
65 return IsUpperCaseLetter(ch) ? ch - 'A' + 'a' : ch;
66 }
67
ToLowerCaseLetter(char && ch)68 inline constexpr char ToLowerCaseLetter(char &&ch) {
69 return IsUpperCaseLetter(ch) ? ch - 'A' + 'a' : ch;
70 }
71
ToLowerCaseLetters(const std::string & str)72 inline std::string ToLowerCaseLetters(const std::string &str) {
73 std::string lowered{str};
74 for (char &ch : lowered) {
75 ch = ToLowerCaseLetter(ch);
76 }
77 return lowered;
78 }
79
ToUpperCaseLetter(char ch)80 inline constexpr char ToUpperCaseLetter(char ch) {
81 return IsLowerCaseLetter(ch) ? ch - 'a' + 'A' : ch;
82 }
83
ToUpperCaseLetter(char && ch)84 inline constexpr char ToUpperCaseLetter(char &&ch) {
85 return IsLowerCaseLetter(ch) ? ch - 'a' + 'A' : ch;
86 }
87
ToUpperCaseLetters(const std::string & str)88 inline std::string ToUpperCaseLetters(const std::string &str) {
89 std::string raised{str};
90 for (char &ch : raised) {
91 ch = ToUpperCaseLetter(ch);
92 }
93 return raised;
94 }
95
IsSameApartFromCase(char x,char y)96 inline constexpr bool IsSameApartFromCase(char x, char y) {
97 return ToLowerCaseLetter(x) == ToLowerCaseLetter(y);
98 }
99
// Returns the offset of ch from the character '0'.  The argument is not
// checked to be a decimal digit.
inline constexpr char DecimalDigitValue(char ch) {
  return static_cast<char>(ch - '0');
}
101
HexadecimalDigitValue(char ch)102 inline constexpr char HexadecimalDigitValue(char ch) {
103 return IsUpperCaseLetter(ch)
104 ? ch - 'A' + 10
105 : IsLowerCaseLetter(ch) ? ch - 'a' + 10 : DecimalDigitValue(ch);
106 }
107
// Given the character that follows a backslash, returns the character
// that the escape sequence denotes, or std::nullopt when the sequence
// is not recognized.
inline constexpr std::optional<char> BackslashEscapeValue(char ch) {
  // Quotation marks and the backslash stand for themselves.
  if (ch == '"' || ch == '\'' || ch == '\\') {
    return ch;
  }
  if (ch == 'b') {
    return '\b';
  }
  if (ch == 'f') {
    return '\f';
  }
  if (ch == 'n') {
    return '\n';
  }
  if (ch == 'r') {
    return '\r';
  }
  if (ch == 't') {
    return '\t';
  }
  if (ch == 'v') {
    return '\v';
  }
  // 'a' is deliberately left unrecognized (it would be '\a'):
  // PGF90 doesn't know \a
  return std::nullopt;
}
123
// Given a character, returns the character that follows the backslash
// in its single-character escape sequence, or std::nullopt when it has
// none here.
inline constexpr std::optional<char> BackslashEscapeChar(char ch) {
  // Quotation marks and the backslash stand for themselves.
  if (ch == '"' || ch == '\'' || ch == '\\') {
    return ch;
  }
  if (ch == '\b') {
    return 'b';
  }
  if (ch == '\f') {
    return 'f';
  }
  if (ch == '\n') {
    return 'n';
  }
  if (ch == '\r') {
    return 'r';
  }
  if (ch == '\t') {
    return 't';
  }
  if (ch == '\v') {
    return 'v';
  }
  // '\a' is deliberately left without an escape (it would be 'a'):
  // PGF90 doesn't know \a
  return std::nullopt;
}
139
// The result type of the EncodeCharacter functions: a small byte buffer
// together with a byte count.
struct EncodedCharacter {
  // Capacity of "buffer", in bytes.
  static constexpr int maxEncodingBytes{6};
  // Storage for the encoded bytes; it has no default member initializer.
  char buffer[maxEncodingBytes];
  // Number of bytes of "buffer" that are in use; EmitQuotedChar reads
  // buffer[0] through buffer[bytes-1].
  int bytes{0};
};
145
// EncodeCharacter takes a char32_t code point "ucs" and returns an
// EncodedCharacter, with the encoding selected by a template argument.
// Only declarations appear in this header: the primary template and its
// explicit specializations for the two encodings.
template<Encoding ENCODING> EncodedCharacter EncodeCharacter(char32_t ucs);
template<> EncodedCharacter EncodeCharacter<Encoding::LATIN_1>(char32_t);
template<> EncodedCharacter EncodeCharacter<Encoding::UTF_8>(char32_t);

// Overload that takes the encoding as a run-time argument.
EncodedCharacter EncodeCharacter(Encoding, char32_t ucs);
151
// EncodeString takes a string of type STRING and returns a std::string,
// with the encoding selected by a template argument -- presumably the
// encoded bytes of each character in turn; confirm against the definition.
// The "extern template" lines are explicit instantiation declarations,
// so the two listed instantiations are not implicitly instantiated by
// code that includes this header.
template<Encoding ENCODING, typename STRING>
std::string EncodeString(const STRING &);
extern template std::string EncodeString<Encoding::LATIN_1, std::string>(
    const std::string &);
extern template std::string EncodeString<Encoding::UTF_8, std::u32string>(
    const std::u32string &);
158
159 // EmitQuotedChar drives callbacks "emit" and "insert" to output the
160 // bytes of an encoding for a codepoint.
161 template<typename NORMAL, typename INSERTED>
162 void EmitQuotedChar(char32_t ch, const NORMAL &emit, const INSERTED &insert,
163 bool backslashEscapes = true, Encoding encoding = Encoding::UTF_8) {
164 auto emitOneChar{[&](std::uint8_t ch) {
165 if (ch < ' ' || (backslashEscapes && (ch == '\\' || ch >= 0x7f))) {
166 insert('\\');
167 if (std::optional<char> escape{BackslashEscapeChar(ch)}) {
168 emit(*escape);
169 } else {
170 // octal escape sequence; always emit 3 digits to avoid ambiguity
171 insert('0' + (ch >> 6));
172 insert('0' + ((ch >> 3) & 7));
173 insert('0' + (ch & 7));
174 }
175 } else {
176 emit(ch);
177 }
178 }};
179 if (ch <= 0x7f) {
180 emitOneChar(ch);
181 } else {
182 EncodedCharacter encoded{EncodeCharacter(encoding, ch)};
183 for (int j{0}; j < encoded.bytes; ++j) {
184 emitOneChar(encoded.buffer[j]);
185 }
186 }
187 }
188
// QuoteCharacterLiteral is overloaded on the character width of its
// argument and returns a std::string -- presumably the text of a quoted
// character literal; confirm against the definition.
// The encoding defaults to LATIN_1 for a std::string argument and to
// UTF_8 for the wider string types; backslashEscapes defaults to true.
std::string QuoteCharacterLiteral(const std::string &,
    bool backslashEscapes = true, Encoding = Encoding::LATIN_1);
std::string QuoteCharacterLiteral(const std::u16string &,
    bool backslashEscapes = true, Encoding = Encoding::UTF_8);
std::string QuoteCharacterLiteral(const std::u32string &,
    bool backslashEscapes = true, Encoding = Encoding::UTF_8);
195
// Declared here only.  Presumably returns the length in bytes of the
// UTF-8 sequence that begins at the given pointer -- TODO confirm
// against the definition, including what is returned for invalid input.
int UTF_8CharacterBytes(const char *);
197
// The result type of the DecodeRawCharacter and DecodeCharacter functions.
// A default-constructed value has zero bytes, which signifies failure.
struct DecodedCharacter {
  // The decoded code point.
  char32_t codepoint{0};
  // Presumably the count of input bytes consumed -- confirm against the
  // definitions.
  int bytes{0}; // signifying failure
};
202
// DecodeRawCharacter takes a pointer to bytes and a std::size_t
// (presumably the count of bytes available -- confirm) and returns a
// DecodedCharacter, with the encoding selected by a template argument.
// Only declarations appear in this header: the primary template and its
// explicit specializations for the two encodings.
template<Encoding ENCODING>
DecodedCharacter DecodeRawCharacter(const char *, std::size_t);
template<>
DecodedCharacter DecodeRawCharacter<Encoding::LATIN_1>(
    const char *, std::size_t);

template<>
DecodedCharacter DecodeRawCharacter<Encoding::UTF_8>(const char *, std::size_t);
211
// DecodeCharacter optionally handles backslash escape sequences, too.
// The encoding is selected by a template argument.  The "extern template"
// lines are explicit instantiation declarations, so the two listed
// instantiations are not implicitly instantiated by code that includes
// this header.
template<Encoding ENCODING>
DecodedCharacter DecodeCharacter(
    const char *, std::size_t, bool backslashEscapes);
extern template DecodedCharacter DecodeCharacter<Encoding::LATIN_1>(
    const char *, std::size_t, bool);
extern template DecodedCharacter DecodeCharacter<Encoding::UTF_8>(
    const char *, std::size_t, bool);
220
// Overload of DecodeCharacter that takes the encoding as a run-time
// argument rather than as a template argument.
DecodedCharacter DecodeCharacter(
    Encoding, const char *, std::size_t, bool backslashEscapes);
223
// DecodeString takes a std::string and a backslashEscapes flag and
// returns a string of type RESULT, with the encoding selected by a
// template argument -- presumably by decoding successive characters of
// the argument; confirm against the definition.
// The "extern template" lines are explicit instantiation declarations,
// so the three listed instantiations are not implicitly instantiated by
// code that includes this header.
template<typename RESULT, Encoding ENCODING>
RESULT DecodeString(const std::string &, bool backslashEscapes);
extern template std::string DecodeString<std::string, Encoding::LATIN_1>(
    const std::string &, bool);
extern template std::u16string DecodeString<std::u16string, Encoding::UTF_8>(
    const std::string &, bool);
extern template std::u32string DecodeString<std::u32string, Encoding::UTF_8>(
    const std::string &, bool);
232 }
233 #endif // FORTRAN_PARSER_CHARACTERS_H_
234