1 //===-- lib/Parser/characters.cpp -----------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "flang/Parser/characters.h"
10 #include "flang/Common/idioms.h"
11 #include <algorithm>
12 #include <cstddef>
13 #include <optional>
14 #include <type_traits>
15 
16 namespace Fortran::parser {
17 
// Global, mutable flag; initially false.  Nothing in this file reads it.
// NOTE(review): presumably consulted by the escape-emitting code declared in
// flang/Parser/characters.h to choose hexadecimal escapes -- confirm there.
bool useHexadecimalEscapeSequences{false};
19 
// Returns the length, in bytes, of the UTF-8 sequence implied by the lead
// byte at *p.  Only *p is examined; trailing bytes are not validated.
// A lead byte with no high bit set yields 1; lead bytes with 2 to 5 leading
// one bits yield that count; everything else (a lone 10xxxxxx continuation
// byte, or 6 or more leading one bits) yields 6.
int UTF_8CharacterBytes(const char *p) {
  const unsigned lead{static_cast<unsigned char>(*p)};
  // Count the run of one bits at the top of the lead byte.
  int leadingOnes{0};
  for (unsigned bit{0x80}; bit != 0 && (lead & bit) != 0; bit >>= 1) {
    ++leadingOnes;
  }
  switch (leadingOnes) {
  case 0:
    return 1; // 0xxxxxxx
  case 2:
    return 2; // 110xxxxx
  case 3:
    return 3; // 1110xxxx
  case 4:
    return 4; // 11110xxx
  case 5:
    return 5; // 111110xx
  default:
    return 6; // 10xxxxxx, 1111110x, 0xfe, 0xff
  }
}
35 
36 template <typename STRING>
QuoteCharacterLiteralHelper(const STRING & str,bool backslashEscapes,Encoding encoding)37 std::string QuoteCharacterLiteralHelper(
38     const STRING &str, bool backslashEscapes, Encoding encoding) {
39   std::string result{'"'};
40   const auto emit{[&](char ch) { result += ch; }};
41   for (auto ch : str) {
42     using CharT = std::decay_t<decltype(ch)>;
43     char32_t ch32{static_cast<std::make_unsigned_t<CharT>>(ch)};
44     if (ch32 == static_cast<unsigned char>('"')) {
45       emit('"'); // double the " when it appears in the text
46     }
47     EmitQuotedChar(ch32, emit, emit, backslashEscapes, encoding);
48   }
49   result += '"';
50   return result;
51 }
52 
// Quotes a std::string; forwards unchanged to QuoteCharacterLiteralHelper.
std::string QuoteCharacterLiteral(
    const std::string &str, bool backslashEscapes, Encoding encoding) {
  return QuoteCharacterLiteralHelper(str, backslashEscapes, encoding);
}
57 
// Quotes a std::u16string; forwards unchanged to QuoteCharacterLiteralHelper.
std::string QuoteCharacterLiteral(
    const std::u16string &str, bool backslashEscapes, Encoding encoding) {
  return QuoteCharacterLiteralHelper(str, backslashEscapes, encoding);
}
62 
// Quotes a std::u32string; forwards unchanged to QuoteCharacterLiteralHelper.
std::string QuoteCharacterLiteral(
    const std::u32string &str, bool backslashEscapes, Encoding encoding) {
  return QuoteCharacterLiteralHelper(str, backslashEscapes, encoding);
}
67 
EncodeCharacter(char32_t ucs)68 template <> EncodedCharacter EncodeCharacter<Encoding::LATIN_1>(char32_t ucs) {
69   CHECK(ucs <= 0xff);
70   EncodedCharacter result;
71   result.buffer[0] = ucs;
72   result.bytes = 1;
73   return result;
74 }
75 
EncodeCharacter(char32_t ucs)76 template <> EncodedCharacter EncodeCharacter<Encoding::UTF_8>(char32_t ucs) {
77   // N.B. char32_t is unsigned
78   EncodedCharacter result;
79   if (ucs <= 0x7f) {
80     result.buffer[0] = ucs;
81     result.bytes = 1;
82   } else if (ucs <= 0x7ff) {
83     result.buffer[0] = 0xc0 | (ucs >> 6);
84     result.buffer[1] = 0x80 | (ucs & 0x3f);
85     result.bytes = 2;
86   } else if (ucs <= 0xffff) {
87     result.buffer[0] = 0xe0 | (ucs >> 12);
88     result.buffer[1] = 0x80 | ((ucs >> 6) & 0x3f);
89     result.buffer[2] = 0x80 | (ucs & 0x3f);
90     result.bytes = 3;
91   } else if (ucs <= 0x1fffff) {
92     // UCS actually only goes up to 0x10ffff, but the
93     // UTF-8 encoding can handle 32 bits.
94     result.buffer[0] = 0xf0 | (ucs >> 18);
95     result.buffer[1] = 0x80 | ((ucs >> 12) & 0x3f);
96     result.buffer[2] = 0x80 | ((ucs >> 6) & 0x3f);
97     result.buffer[3] = 0x80 | (ucs & 0x3f);
98     result.bytes = 4;
99   } else if (ucs <= 0x3ffffff) {
100     result.buffer[0] = 0xf8 | (ucs >> 24);
101     result.buffer[1] = 0x80 | ((ucs >> 18) & 0x3f);
102     result.buffer[2] = 0x80 | ((ucs >> 12) & 0x3f);
103     result.buffer[3] = 0x80 | ((ucs >> 6) & 0x3f);
104     result.buffer[4] = 0x80 | (ucs & 0x3f);
105     result.bytes = 5;
106   } else {
107     result.buffer[0] = 0xfc | (ucs >> 30);
108     result.buffer[1] = 0x80 | ((ucs >> 24) & 0x3f);
109     result.buffer[2] = 0x80 | ((ucs >> 18) & 0x3f);
110     result.buffer[3] = 0x80 | ((ucs >> 12) & 0x3f);
111     result.buffer[4] = 0x80 | ((ucs >> 6) & 0x3f);
112     result.buffer[5] = 0x80 | (ucs & 0x3f);
113     result.bytes = 6;
114   }
115   return result;
116 }
117 
// Run-time dispatch on the encoding to the matching compile-time
// specialization of EncodeCharacter<>.
EncodedCharacter EncodeCharacter(Encoding encoding, char32_t ucs) {
  switch (encoding) {
    SWITCH_COVERS_ALL_CASES
  case Encoding::LATIN_1:
    return EncodeCharacter<Encoding::LATIN_1>(ucs);
  case Encoding::UTF_8:
    return EncodeCharacter<Encoding::UTF_8>(ucs);
  }
}
127 
128 template <Encoding ENCODING, typename STRING>
EncodeString(const STRING & str)129 std::string EncodeString(const STRING &str) {
130   std::string result;
131   for (auto ch : str) {
132     char32_t uch{static_cast<std::make_unsigned_t<decltype(ch)>>(ch)};
133     EncodedCharacter encoded{EncodeCharacter<ENCODING>(uch)};
134     result.append(encoded.buffer, static_cast<std::size_t>(encoded.bytes));
135   }
136   return result;
137 }
138 
// Explicit instantiations of EncodeString: Latin-1 from std::string, and
// UTF-8 from std::u16string and std::u32string.
template std::string EncodeString<Encoding::LATIN_1, std::string>(
    const std::string &);
template std::string EncodeString<Encoding::UTF_8, std::u16string>(
    const std::u16string &);
template std::string EncodeString<Encoding::UTF_8, std::u32string>(
    const std::u32string &);
145 
146 template <>
DecodeRawCharacter(const char * cp,std::size_t bytes)147 DecodedCharacter DecodeRawCharacter<Encoding::LATIN_1>(
148     const char *cp, std::size_t bytes) {
149   if (bytes >= 1) {
150     return {*reinterpret_cast<const std::uint8_t *>(cp), 1};
151   } else {
152     return {};
153   }
154 }
155 
156 template <>
DecodeRawCharacter(const char * cp,std::size_t bytes)157 DecodedCharacter DecodeRawCharacter<Encoding::UTF_8>(
158     const char *cp, std::size_t bytes) {
159   auto p{reinterpret_cast<const std::uint8_t *>(cp)};
160   char32_t ch{*p};
161   if (ch <= 0x7f) {
162     return {ch, 1};
163   } else if ((ch & 0xf8) == 0xf0 && bytes >= 4 && ch > 0xf0 &&
164       ((p[1] | p[2] | p[3]) & 0xc0) == 0x80) {
165     ch = ((ch & 7) << 6) | (p[1] & 0x3f);
166     ch = (ch << 6) | (p[2] & 0x3f);
167     ch = (ch << 6) | (p[3] & 0x3f);
168     return {ch, 4};
169   } else if ((ch & 0xf0) == 0xe0 && bytes >= 3 && ch > 0xe0 &&
170       ((p[1] | p[2]) & 0xc0) == 0x80) {
171     ch = ((ch & 0xf) << 6) | (p[1] & 0x3f);
172     ch = (ch << 6) | (p[2] & 0x3f);
173     return {ch, 3};
174   } else if ((ch & 0xe0) == 0xc0 && bytes >= 2 && ch > 0xc0 &&
175       (p[1] & 0xc0) == 0x80) {
176     ch = ((ch & 0x1f) << 6) | (p[1] & 0x3f);
177     return {ch, 2};
178   } else {
179     return {}; // not valid UTF-8
180   }
181 }
182 
DecodeEscapedCharacter(const char * cp,std::size_t bytes)183 static DecodedCharacter DecodeEscapedCharacter(
184     const char *cp, std::size_t bytes) {
185   if (cp[0] == '\\' && bytes >= 2) {
186     if (std::optional<char> escChar{BackslashEscapeValue(cp[1])}) {
187       return {static_cast<unsigned char>(*escChar), 2};
188     } else if (IsOctalDigit(cp[1])) {
189       std::size_t maxLen{std::min(std::size_t{4}, bytes)};
190       char32_t code{static_cast<char32_t>(DecimalDigitValue(cp[1]))};
191       std::size_t len{2}; // so far
192       for (; code <= 037 && len < maxLen && IsOctalDigit(cp[len]); ++len) {
193         code = 8 * code + DecimalDigitValue(cp[len]);
194       }
195       return {code, static_cast<int>(len)};
196     } else if (bytes >= 4 && ToLowerCaseLetter(cp[1]) == 'x' &&
197         IsHexadecimalDigit(cp[2]) && IsHexadecimalDigit(cp[3])) {
198       return {static_cast<char32_t>(16 * HexadecimalDigitValue(cp[2]) +
199                   HexadecimalDigitValue(cp[3])),
200           4};
201     } else if (IsLetter(cp[1])) {
202       // Unknown escape - ignore the '\' (PGI compatibility)
203       return {static_cast<unsigned char>(cp[1]), 2};
204     } else {
205       // Not an escape character.
206       return {'\\', 1};
207     }
208   }
209   return {static_cast<unsigned char>(cp[0]), 1};
210 }
211 
212 template <Encoding ENCODING>
DecodeEscapedCharacters(const char * cp,std::size_t bytes)213 static DecodedCharacter DecodeEscapedCharacters(
214     const char *cp, std::size_t bytes) {
215   char buffer[EncodedCharacter::maxEncodingBytes];
216   int count[EncodedCharacter::maxEncodingBytes];
217   std::size_t at{0}, len{0};
218   for (; len < EncodedCharacter::maxEncodingBytes && at < bytes; ++len) {
219     DecodedCharacter code{DecodeEscapedCharacter(cp + at, bytes - at)};
220     buffer[len] = code.codepoint;
221     at += code.bytes;
222     count[len] = at;
223   }
224   DecodedCharacter code{DecodeCharacter<ENCODING>(buffer, len, false)};
225   if (code.bytes > 0) {
226     code.bytes = count[code.bytes - 1];
227   } else {
228     code.codepoint = buffer[0] & 0xff;
229     code.bytes = count[0];
230   }
231   return code;
232 }
233 
234 template <Encoding ENCODING>
DecodeCharacter(const char * cp,std::size_t bytes,bool backslashEscapes)235 DecodedCharacter DecodeCharacter(
236     const char *cp, std::size_t bytes, bool backslashEscapes) {
237   if (backslashEscapes && bytes >= 2 && *cp == '\\') {
238     return DecodeEscapedCharacters<ENCODING>(cp, bytes);
239   } else {
240     return DecodeRawCharacter<ENCODING>(cp, bytes);
241   }
242 }
243 
// Explicit instantiations of DecodeCharacter for both encodings.
template DecodedCharacter DecodeCharacter<Encoding::LATIN_1>(
    const char *, std::size_t, bool);
template DecodedCharacter DecodeCharacter<Encoding::UTF_8>(
    const char *, std::size_t, bool);
248 
// Run-time dispatch on the encoding to the matching compile-time
// instantiation of DecodeCharacter<>.
DecodedCharacter DecodeCharacter(Encoding encoding, const char *cp,
    std::size_t bytes, bool backslashEscapes) {
  switch (encoding) {
    SWITCH_COVERS_ALL_CASES
  case Encoding::LATIN_1:
    return DecodeCharacter<Encoding::LATIN_1>(cp, bytes, backslashEscapes);
  case Encoding::UTF_8:
    return DecodeCharacter<Encoding::UTF_8>(cp, bytes, backslashEscapes);
  }
}
259 
260 template <typename RESULT, Encoding ENCODING>
261 RESULT DecodeString(const std::string &s, bool backslashEscapes) {
262   RESULT result;
263   const char *p{s.c_str()};
264   for (auto bytes{s.size()}; bytes != 0;) {
265     DecodedCharacter decoded{
266         DecodeCharacter<ENCODING>(p, bytes, backslashEscapes)};
267     if (decoded.bytes > 0) {
268       if (static_cast<std::size_t>(decoded.bytes) <= bytes) {
269         result.append(1, decoded.codepoint);
270         bytes -= decoded.bytes;
271         p += decoded.bytes;
272         continue;
273       }
274     }
275     result.append(1, static_cast<uint8_t>(*p));
276     ++p;
277     --bytes;
278   }
279   return result;
280 }
281 
// Explicit instantiations of DecodeString: Latin-1 into std::string, and
// UTF-8 into std::u16string and std::u32string.
template std::string DecodeString<std::string, Encoding::LATIN_1>(
    const std::string &, bool);
template std::u16string DecodeString<std::u16string, Encoding::UTF_8>(
    const std::string &, bool);
template std::u32string DecodeString<std::u32string, Encoding::UTF_8>(
    const std::string &, bool);
288 } // namespace Fortran::parser
289