1 // Copyright (c) 2018-2019, NVIDIA CORPORATION.  All rights reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "characters.h"
16 #include "../common/idioms.h"
17 #include <algorithm>
18 #include <cstddef>
19 #include <optional>
20 #include <type_traits>
21 
22 namespace Fortran::parser {
23 
// Returns the length in bytes (1 through 6) of the UTF-8 sequence that
// begins at *p, judging solely by the high-order bits of that first byte.
// Only *p is examined; any following bytes are not validated.  A first
// byte that matches none of the 1- to 5-byte lead patterns (which includes
// a stray continuation byte such as 0x80) is reported as 6.
int UTF_8CharacterBytes(const char *p) {
  struct LeadPattern {
    int mask, value, length;
  };
  static constexpr LeadPattern patterns[]{
      {0x80, 0x00, 1},  // 0xxxxxxx
      {0xe0, 0xc0, 2},  // 110xxxxx
      {0xf0, 0xe0, 3},  // 1110xxxx
      {0xf8, 0xf0, 4},  // 11110xxx
      {0xfc, 0xf8, 5},  // 111110xx
  };
  // The masks only look at the low eight bits, so the sign of plain char
  // does not affect the comparisons.
  const int lead{*p};
  for (const LeadPattern &pattern : patterns) {
    if ((lead & pattern.mask) == pattern.value) {
      return pattern.length;
    }
  }
  return 6;
}
39 
40 template<typename STRING>
QuoteCharacterLiteralHelper(const STRING & str,bool backslashEscapes,Encoding encoding)41 std::string QuoteCharacterLiteralHelper(
42     const STRING &str, bool backslashEscapes, Encoding encoding) {
43   std::string result{'"'};
44   const auto emit{[&](char ch) { result += ch; }};
45   for (auto ch : str) {
46     using CharT = std::decay_t<decltype(ch)>;
47     char32_t ch32{static_cast<std::make_unsigned_t<CharT>>(ch)};
48     if (ch32 == static_cast<unsigned char>('"')) {
49       emit('"');  // double the " when it appears in the text
50     }
51     EmitQuotedChar(ch32, emit, emit, backslashEscapes, encoding);
52   }
53   result += '"';
54   return result;
55 }
56 
QuoteCharacterLiteral(const std::string & str,bool backslashEscapes,Encoding encoding)57 std::string QuoteCharacterLiteral(
58     const std::string &str, bool backslashEscapes, Encoding encoding) {
59   return QuoteCharacterLiteralHelper(str, backslashEscapes, encoding);
60 }
61 
QuoteCharacterLiteral(const std::u16string & str,bool backslashEscapes,Encoding encoding)62 std::string QuoteCharacterLiteral(
63     const std::u16string &str, bool backslashEscapes, Encoding encoding) {
64   return QuoteCharacterLiteralHelper(str, backslashEscapes, encoding);
65 }
66 
QuoteCharacterLiteral(const std::u32string & str,bool backslashEscapes,Encoding encoding)67 std::string QuoteCharacterLiteral(
68     const std::u32string &str, bool backslashEscapes, Encoding encoding) {
69   return QuoteCharacterLiteralHelper(str, backslashEscapes, encoding);
70 }
71 
EncodeCharacter(char32_t ucs)72 template<> EncodedCharacter EncodeCharacter<Encoding::LATIN_1>(char32_t ucs) {
73   CHECK(ucs <= 0xff);
74   EncodedCharacter result;
75   result.buffer[0] = ucs;
76   result.bytes = 1;
77   return result;
78 }
79 
EncodeCharacter(char32_t ucs)80 template<> EncodedCharacter EncodeCharacter<Encoding::UTF_8>(char32_t ucs) {
81   // N.B. char32_t is unsigned
82   EncodedCharacter result;
83   if (ucs <= 0x7f) {
84     result.buffer[0] = ucs;
85     result.bytes = 1;
86   } else if (ucs <= 0x7ff) {
87     result.buffer[0] = 0xc0 | (ucs >> 6);
88     result.buffer[1] = 0x80 | (ucs & 0x3f);
89     result.bytes = 2;
90   } else if (ucs <= 0xffff) {
91     result.buffer[0] = 0xe0 | (ucs >> 12);
92     result.buffer[1] = 0x80 | ((ucs >> 6) & 0x3f);
93     result.buffer[2] = 0x80 | (ucs & 0x3f);
94     result.bytes = 3;
95   } else if (ucs <= 0x1fffff) {
96     // UCS actually only goes up to 0x10ffff, but the
97     // UTF-8 encoding can handle 32 bits.
98     result.buffer[0] = 0xf0 | (ucs >> 18);
99     result.buffer[1] = 0x80 | ((ucs >> 12) & 0x3f);
100     result.buffer[2] = 0x80 | ((ucs >> 6) & 0x3f);
101     result.buffer[3] = 0x80 | (ucs & 0x3f);
102     result.bytes = 4;
103   } else if (ucs <= 0x3ffffff) {
104     result.buffer[0] = 0xf8 | (ucs >> 24);
105     result.buffer[1] = 0x80 | ((ucs >> 18) & 0x3f);
106     result.buffer[2] = 0x80 | ((ucs >> 12) & 0x3f);
107     result.buffer[3] = 0x80 | ((ucs >> 6) & 0x3f);
108     result.buffer[4] = 0x80 | (ucs & 0x3f);
109     result.bytes = 5;
110   } else {
111     result.buffer[0] = 0xfc | (ucs >> 30);
112     result.buffer[1] = 0x80 | ((ucs >> 24) & 0x3f);
113     result.buffer[2] = 0x80 | ((ucs >> 18) & 0x3f);
114     result.buffer[3] = 0x80 | ((ucs >> 12) & 0x3f);
115     result.buffer[4] = 0x80 | ((ucs >> 6) & 0x3f);
116     result.buffer[5] = 0x80 | (ucs & 0x3f);
117     result.bytes = 6;
118   }
119   return result;
120 }
121 
EncodeCharacter(Encoding encoding,char32_t ucs)122 EncodedCharacter EncodeCharacter(Encoding encoding, char32_t ucs) {
123   switch (encoding) {
124     SWITCH_COVERS_ALL_CASES
125   case Encoding::LATIN_1: return EncodeCharacter<Encoding::LATIN_1>(ucs);
126   case Encoding::UTF_8: return EncodeCharacter<Encoding::UTF_8>(ucs);
127   }
128 }
129 
130 template<Encoding ENCODING, typename STRING>
EncodeString(const STRING & str)131 std::string EncodeString(const STRING &str) {
132   std::string result;
133   for (auto ch : str) {
134     char32_t uch{static_cast<std::make_unsigned_t<decltype(ch)>>(ch)};
135     EncodedCharacter encoded{EncodeCharacter<ENCODING>(uch)};
136     result.append(encoded.buffer, static_cast<std::size_t>(encoded.bytes));
137   }
138   return result;
139 }
140 
141 template std::string EncodeString<Encoding::LATIN_1, std::string>(
142     const std::string &);
143 template std::string EncodeString<Encoding::UTF_8, std::u16string>(
144     const std::u16string &);
145 template std::string EncodeString<Encoding::UTF_8, std::u32string>(
146     const std::u32string &);
147 
148 template<>
DecodeRawCharacter(const char * cp,std::size_t bytes)149 DecodedCharacter DecodeRawCharacter<Encoding::LATIN_1>(
150     const char *cp, std::size_t bytes) {
151   if (bytes >= 1) {
152     return {*reinterpret_cast<const std::uint8_t *>(cp), 1};
153   } else {
154     return {};
155   }
156 }
157 
158 template<>
DecodeRawCharacter(const char * cp,std::size_t bytes)159 DecodedCharacter DecodeRawCharacter<Encoding::UTF_8>(
160     const char *cp, std::size_t bytes) {
161   auto p{reinterpret_cast<const std::uint8_t *>(cp)};
162   char32_t ch{*p};
163   if (ch <= 0x7f) {
164     return {ch, 1};
165   } else if ((ch & 0xf8) == 0xf0 && bytes >= 4 && ch > 0xf0 &&
166       ((p[1] | p[2] | p[3]) & 0xc0) == 0x80) {
167     ch = ((ch & 7) << 6) | (p[1] & 0x3f);
168     ch = (ch << 6) | (p[2] & 0x3f);
169     ch = (ch << 6) | (p[3] & 0x3f);
170     return {ch, 4};
171   } else if ((ch & 0xf0) == 0xe0 && bytes >= 3 && ch > 0xe0 &&
172       ((p[1] | p[2]) & 0xc0) == 0x80) {
173     ch = ((ch & 0xf) << 6) | (p[1] & 0x3f);
174     ch = (ch << 6) | (p[2] & 0x3f);
175     return {ch, 3};
176   } else if ((ch & 0xe0) == 0xc0 && bytes >= 2 && ch > 0xc0 &&
177       (p[1] & 0xc0) == 0x80) {
178     ch = ((ch & 0x1f) << 6) | (p[1] & 0x3f);
179     return {ch, 2};
180   } else {
181     return {};  // not valid UTF-8
182   }
183 }
184 
DecodeEscapedCharacter(const char * cp,std::size_t bytes)185 static DecodedCharacter DecodeEscapedCharacter(
186     const char *cp, std::size_t bytes) {
187   if (cp[0] == '\\' && bytes >= 2) {
188     if (std::optional<char> escChar{BackslashEscapeValue(cp[1])}) {
189       return {static_cast<unsigned char>(*escChar), 2};
190     } else if (IsOctalDigit(cp[1])) {
191       std::size_t maxLen{std::min(std::size_t{4}, bytes)};
192       char32_t code{static_cast<char32_t>(DecimalDigitValue(cp[1]))};
193       std::size_t len{2};  // so far
194       for (; code <= 037 && len < maxLen && IsOctalDigit(cp[len]); ++len) {
195         code = 8 * code + DecimalDigitValue(cp[len]);
196       }
197       return {code, static_cast<int>(len)};
198     } else if (bytes >= 4 && ToLowerCaseLetter(cp[1]) == 'x' &&
199         IsHexadecimalDigit(cp[2]) && IsHexadecimalDigit(cp[3])) {
200       return {static_cast<char32_t>(16 * HexadecimalDigitValue(cp[2]) +
201                   HexadecimalDigitValue(cp[3])),
202           4};
203     } else if (IsLetter(cp[1])) {
204       // Unknown escape - ignore the '\' (PGI compatibility)
205       return {static_cast<unsigned char>(cp[1]), 2};
206     } else {
207       // Not an escape character.
208       return {'\\', 1};
209     }
210   }
211   return {static_cast<unsigned char>(cp[0]), 1};
212 }
213 
214 template<Encoding ENCODING>
DecodeEscapedCharacters(const char * cp,std::size_t bytes)215 static DecodedCharacter DecodeEscapedCharacters(
216     const char *cp, std::size_t bytes) {
217   char buffer[EncodedCharacter::maxEncodingBytes];
218   int count[EncodedCharacter::maxEncodingBytes];
219   std::size_t at{0}, len{0};
220   for (; len < EncodedCharacter::maxEncodingBytes && at < bytes; ++len) {
221     DecodedCharacter code{DecodeEscapedCharacter(cp + at, bytes - at)};
222     buffer[len] = code.codepoint;
223     at += code.bytes;
224     count[len] = at;
225   }
226   DecodedCharacter code{DecodeCharacter<ENCODING>(buffer, len, false)};
227   if (code.bytes > 0) {
228     code.bytes = count[code.bytes - 1];
229   } else {
230     code.codepoint = buffer[0] & 0xff;
231     code.bytes = count[0];
232   }
233   return code;
234 }
235 
236 template<Encoding ENCODING>
DecodeCharacter(const char * cp,std::size_t bytes,bool backslashEscapes)237 DecodedCharacter DecodeCharacter(
238     const char *cp, std::size_t bytes, bool backslashEscapes) {
239   if (backslashEscapes && bytes >= 2 && *cp == '\\') {
240     return DecodeEscapedCharacters<ENCODING>(cp, bytes);
241   } else {
242     return DecodeRawCharacter<ENCODING>(cp, bytes);
243   }
244 }
245 
246 template DecodedCharacter DecodeCharacter<Encoding::LATIN_1>(
247     const char *, std::size_t, bool);
248 template DecodedCharacter DecodeCharacter<Encoding::UTF_8>(
249     const char *, std::size_t, bool);
250 
DecodeCharacter(Encoding encoding,const char * cp,std::size_t bytes,bool backslashEscapes)251 DecodedCharacter DecodeCharacter(Encoding encoding, const char *cp,
252     std::size_t bytes, bool backslashEscapes) {
253   switch (encoding) {
254     SWITCH_COVERS_ALL_CASES
255   case Encoding::LATIN_1:
256     return DecodeCharacter<Encoding::LATIN_1>(cp, bytes, backslashEscapes);
257   case Encoding::UTF_8:
258     return DecodeCharacter<Encoding::UTF_8>(cp, bytes, backslashEscapes);
259   }
260 }
261 
262 template<typename RESULT, Encoding ENCODING>
263 RESULT DecodeString(const std::string &s, bool backslashEscapes) {
264   RESULT result;
265   const char *p{s.c_str()};
266   for (auto bytes{s.size()}; bytes != 0;) {
267     DecodedCharacter decoded{
268         DecodeCharacter<ENCODING>(p, bytes, backslashEscapes)};
269     if (decoded.bytes > 0) {
270       if (static_cast<std::size_t>(decoded.bytes) <= bytes) {
271         result.append(1, decoded.codepoint);
272         bytes -= decoded.bytes;
273         p += decoded.bytes;
274         continue;
275       }
276     }
277     result.append(1, static_cast<uint8_t>(*p));
278     ++p;
279     --bytes;
280   }
281   return result;
282 }
283 
284 template std::string DecodeString<std::string, Encoding::LATIN_1>(
285     const std::string &, bool);
286 template std::u16string DecodeString<std::u16string, Encoding::UTF_8>(
287     const std::string &, bool);
288 template std::u32string DecodeString<std::u32string, Encoding::UTF_8>(
289     const std::string &, bool);
290 }
291