1 // Copyright (c) 2018-2019, NVIDIA CORPORATION.  All rights reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef FORTRAN_PARSER_CHARACTERS_H_
16 #define FORTRAN_PARSER_CHARACTERS_H_
17 
// Define some character classification predicates and
// conversions here to avoid dependences upon <cctype> and
// also to accommodate Fortran tokenization.
21 
#include <cstddef>
#include <cstdint>
#include <optional>
#include <string>
25 
26 namespace Fortran::parser {
27 
// We can easily support Fortran program source in any character
// set whose first 128 code points correspond to ASCII codes 0-127
// (ISO/IEC 646).
// The specific encodings that we can handle include:
//   LATIN_1: ISO 8859-1 Latin-1
//   UTF_8: Multi-byte encoding of Unicode (ISO/IEC 10646)
// The enumerators are used below both as template arguments and as
// run-time arguments that select an encoding.
enum class Encoding { LATIN_1, UTF_8 };
34 
// True when ch lies in the range 'A' through 'Z', inclusive.
inline constexpr bool IsUpperCaseLetter(char ch) {
  return 'A' <= ch && ch <= 'Z';
}
38 
// True when ch lies in the range 'a' through 'z', inclusive.
inline constexpr bool IsLowerCaseLetter(char ch) {
  return 'a' <= ch && ch <= 'z';
}
42 
IsLetter(char ch)43 inline constexpr bool IsLetter(char ch) {
44   return IsUpperCaseLetter(ch) || IsLowerCaseLetter(ch);
45 }
46 
// True when ch lies in the range '0' through '9', inclusive.
inline constexpr bool IsDecimalDigit(char ch) {
  return '0' <= ch && ch <= '9';
}
48 
// True when ch is a decimal digit or a letter in 'A'..'F' or 'a'..'f'.
inline constexpr bool IsHexadecimalDigit(char ch) {
  const bool isDecimal{'0' <= ch && ch <= '9'};
  const bool isUpperHex{'A' <= ch && ch <= 'F'};
  const bool isLowerHex{'a' <= ch && ch <= 'f'};
  return isDecimal || isUpperHex || isLowerHex;
}
53 
// True when ch lies in the range '0' through '7', inclusive.
inline constexpr bool IsOctalDigit(char ch) {
  return '0' <= ch && ch <= '7';
}
55 
IsLegalIdentifierStart(char ch)56 inline constexpr bool IsLegalIdentifierStart(char ch) {
57   return IsLetter(ch) || ch == '_' || ch == '@' || ch == '$';
58 }
59 
IsLegalInIdentifier(char ch)60 inline constexpr bool IsLegalInIdentifier(char ch) {
61   return IsLegalIdentifierStart(ch) || IsDecimalDigit(ch);
62 }
63 
ToLowerCaseLetter(char ch)64 inline constexpr char ToLowerCaseLetter(char ch) {
65   return IsUpperCaseLetter(ch) ? ch - 'A' + 'a' : ch;
66 }
67 
ToLowerCaseLetter(char && ch)68 inline constexpr char ToLowerCaseLetter(char &&ch) {
69   return IsUpperCaseLetter(ch) ? ch - 'A' + 'a' : ch;
70 }
71 
ToLowerCaseLetters(const std::string & str)72 inline std::string ToLowerCaseLetters(const std::string &str) {
73   std::string lowered{str};
74   for (char &ch : lowered) {
75     ch = ToLowerCaseLetter(ch);
76   }
77   return lowered;
78 }
79 
ToUpperCaseLetter(char ch)80 inline constexpr char ToUpperCaseLetter(char ch) {
81   return IsLowerCaseLetter(ch) ? ch - 'a' + 'A' : ch;
82 }
83 
ToUpperCaseLetter(char && ch)84 inline constexpr char ToUpperCaseLetter(char &&ch) {
85   return IsLowerCaseLetter(ch) ? ch - 'a' + 'A' : ch;
86 }
87 
ToUpperCaseLetters(const std::string & str)88 inline std::string ToUpperCaseLetters(const std::string &str) {
89   std::string raised{str};
90   for (char &ch : raised) {
91     ch = ToUpperCaseLetter(ch);
92   }
93   return raised;
94 }
95 
IsSameApartFromCase(char x,char y)96 inline constexpr bool IsSameApartFromCase(char x, char y) {
97   return ToLowerCaseLetter(x) == ToLowerCaseLetter(y);
98 }
99 
// Returns the offset of ch from '0'.  No range check is made, so the
// result is a digit value (0-9) only when ch is a decimal digit.
inline constexpr char DecimalDigitValue(char ch) {
  return static_cast<char>(ch - '0');
}
101 
HexadecimalDigitValue(char ch)102 inline constexpr char HexadecimalDigitValue(char ch) {
103   return IsUpperCaseLetter(ch)
104       ? ch - 'A' + 10
105       : IsLowerCaseLetter(ch) ? ch - 'a' + 10 : DecimalDigitValue(ch);
106 }
107 
// Given the character that follows a backslash in an escape sequence,
// returns the character that the sequence denotes, or std::nullopt when
// the character does not form a recognized escape.
inline constexpr std::optional<char> BackslashEscapeValue(char ch) {
  switch (ch) {
  case '\\':
  case '\'':
  case '"':
    return ch;  // quotes and backslash stand for themselves
  case 'b': return '\b';
  case 'f': return '\f';
  case 'n': return '\n';
  case 'r': return '\r';
  case 't': return '\t';
  case 'v': return '\v';
  case 'a':  // would be '\a', but PGF90 doesn't know \a
  default:
    return std::nullopt;
  }
}
123 
// The inverse of BackslashEscapeValue(): given a character, returns the
// character that should follow a backslash to denote it in an escape
// sequence, or std::nullopt when it has no such single-character escape.
inline constexpr std::optional<char> BackslashEscapeChar(char ch) {
  switch (ch) {
  case '\\':
  case '\'':
  case '"':
    return ch;  // quotes and backslash stand for themselves
  case '\b': return 'b';
  case '\f': return 'f';
  case '\n': return 'n';
  case '\r': return 'r';
  case '\t': return 't';
  case '\v': return 'v';
  case '\a':  // would be 'a', but PGF90 doesn't know \a
  default:
    return std::nullopt;
  }
}
139 
// The byte sequence produced by encoding a single code point.
struct EncodedCharacter {
  // Capacity of buffer, in bytes.
  static constexpr int maxEncodingBytes{6};
  // The encoded bytes; only the first "bytes" elements are meaningful.
  // Value-initialized so that a default-constructed object never holds
  // indeterminate bytes, which keeps copying and inspecting it well defined.
  char buffer[maxEncodingBytes]{};
  // Number of bytes of buffer that are in use.
  int bytes{0};
};
145 
// EncodeCharacter produces the EncodedCharacter (byte sequence) for one
// code point.  The encoding may be chosen at compilation time through the
// template argument; only the LATIN_1 and UTF_8 explicit specializations
// are declared here, and no definition is visible in this header.
template<Encoding ENCODING> EncodedCharacter EncodeCharacter(char32_t ucs);
template<> EncodedCharacter EncodeCharacter<Encoding::LATIN_1>(char32_t);
template<> EncodedCharacter EncodeCharacter<Encoding::UTF_8>(char32_t);

// Overload that takes the encoding as a run-time argument.
EncodedCharacter EncodeCharacter(Encoding, char32_t ucs);
151 
// EncodeString takes a string of type STRING and returns a std::string;
// presumably the result holds the ENCODING byte sequence for each element
// of the argument -- confirm against the definition, which is not in this
// header.  Note the template parameter order: ENCODING first, then STRING.
template<Encoding ENCODING, typename STRING>
std::string EncodeString(const STRING &);
// Explicit instantiation declarations: only these two combinations are
// announced here as being instantiated elsewhere.
extern template std::string EncodeString<Encoding::LATIN_1, std::string>(
    const std::string &);
extern template std::string EncodeString<Encoding::UTF_8, std::u32string>(
    const std::u32string &);
158 
159 // EmitQuotedChar drives callbacks "emit" and "insert" to output the
160 // bytes of an encoding for a codepoint.
161 template<typename NORMAL, typename INSERTED>
162 void EmitQuotedChar(char32_t ch, const NORMAL &emit, const INSERTED &insert,
163     bool backslashEscapes = true, Encoding encoding = Encoding::UTF_8) {
164   auto emitOneChar{[&](std::uint8_t ch) {
165     if (ch < ' ' || (backslashEscapes && (ch == '\\' || ch >= 0x7f))) {
166       insert('\\');
167       if (std::optional<char> escape{BackslashEscapeChar(ch)}) {
168         emit(*escape);
169       } else {
170         // octal escape sequence; always emit 3 digits to avoid ambiguity
171         insert('0' + (ch >> 6));
172         insert('0' + ((ch >> 3) & 7));
173         insert('0' + (ch & 7));
174       }
175     } else {
176       emit(ch);
177     }
178   }};
179   if (ch <= 0x7f) {
180     emitOneChar(ch);
181   } else {
182     EncodedCharacter encoded{EncodeCharacter(encoding, ch)};
183     for (int j{0}; j < encoded.bytes; ++j) {
184       emitOneChar(encoded.buffer[j]);
185     }
186   }
187 }
188 
// QuoteCharacterLiteral is overloaded for 8-, 16-, and 32-bit strings and
// returns a std::string; presumably the result is the quoted source form of
// the literal -- confirm against the definitions, which are not in this
// header.  backslashEscapes defaults to true for every overload.  The
// default encoding is LATIN_1 for std::string and UTF_8 for the wider
// string types.
std::string QuoteCharacterLiteral(const std::string &,
    bool backslashEscapes = true, Encoding = Encoding::LATIN_1);
std::string QuoteCharacterLiteral(const std::u16string &,
    bool backslashEscapes = true, Encoding = Encoding::UTF_8);
std::string QuoteCharacterLiteral(const std::u32string &,
    bool backslashEscapes = true, Encoding = Encoding::UTF_8);
195 
// NOTE(review): presumably returns the length in bytes of the UTF-8
// sequence that begins at the given pointer; the definition is not in this
// header, so confirm the contract (including the result for malformed
// input) there.
int UTF_8CharacterBytes(const char *);
197 
// The result of decoding one character.
struct DecodedCharacter {
  // The decoded code point.
  char32_t codepoint{U'\0'};
  // Zero signifies failure; a nonzero value is presumably the number of
  // encoded bytes that were consumed -- confirm against the decoders.
  int bytes{0};
};
202 
// DecodeRawCharacter decodes one character of the given ENCODING and
// returns it as a DecodedCharacter, whose "bytes" member is zero on
// failure.  The arguments are a pointer and a std::size_t -- presumably the
// start of the encoded bytes and the number of bytes available; confirm
// against the definitions.  Only the LATIN_1 and UTF_8 explicit
// specializations are declared here.
template<Encoding ENCODING>
DecodedCharacter DecodeRawCharacter(const char *, std::size_t);
template<>
DecodedCharacter DecodeRawCharacter<Encoding::LATIN_1>(
    const char *, std::size_t);

template<>
DecodedCharacter DecodeRawCharacter<Encoding::UTF_8>(const char *, std::size_t);
211 
// DecodeCharacter optionally handles backslash escape sequences, too;
// the final argument enables that handling.  The pointer and std::size_t
// arguments presumably have the same meaning as for DecodeRawCharacter --
// confirm against the definitions, which are not in this header.
template<Encoding ENCODING>
DecodedCharacter DecodeCharacter(
    const char *, std::size_t, bool backslashEscapes);
// Explicit instantiation declarations for the two supported encodings.
extern template DecodedCharacter DecodeCharacter<Encoding::LATIN_1>(
    const char *, std::size_t, bool);
extern template DecodedCharacter DecodeCharacter<Encoding::UTF_8>(
    const char *, std::size_t, bool);

// Overload that takes the encoding as a run-time argument.
DecodedCharacter DecodeCharacter(
    Encoding, const char *, std::size_t, bool backslashEscapes);
223 
// DecodeString takes encoded text held in a std::string and returns a
// string of type RESULT; presumably it decodes the whole argument under
// ENCODING, honoring backslash escapes when backslashEscapes is true --
// confirm against the definition, which is not in this header.  Note the
// template parameter order (RESULT first, then ENCODING), which is the
// reverse of EncodeString's.
template<typename RESULT, Encoding ENCODING>
RESULT DecodeString(const std::string &, bool backslashEscapes);
// Explicit instantiation declarations: only these three combinations are
// announced here as being instantiated elsewhere.
extern template std::string DecodeString<std::string, Encoding::LATIN_1>(
    const std::string &, bool);
extern template std::u16string DecodeString<std::u16string, Encoding::UTF_8>(
    const std::string &, bool);
extern template std::u32string DecodeString<std::u32string, Encoding::UTF_8>(
    const std::string &, bool);
232 }
233 #endif  // FORTRAN_PARSER_CHARACTERS_H_
234