1 //===--- Punycode.cpp - Unicode to Punycode transcoding -------------------===//
2 //
3 // This source file is part of the Swift.org open source project
4 //
5 // Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
6 // Licensed under Apache License v2.0 with Runtime Library Exception
7 //
8 // See https://swift.org/LICENSE.txt for license information
9 // See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
10 //
11 //===----------------------------------------------------------------------===//
12
13 #include "swift/Demangling/Punycode.h"
14 #include "swift/Demangling/ManglingUtils.h"
15 #include <vector>
16 #include <cstdint>
17
18 using namespace swift;
19 using namespace Punycode;
20
21 // RFC 3492
22 // Section 5: Parameter values for Punycode
23
24 static const int base = 36;
25 static const int tmin = 1;
26 static const int tmax = 26;
27 static const int skew = 38;
28 static const int damp = 700;
29 static const int initial_bias = 72;
30 static const uint32_t initial_n = 128;
31
32 static const char delimiter = '_';
33
digit_value(int digit)34 static char digit_value(int digit) {
35 assert(digit < base && "invalid punycode digit");
36 if (digit < 26)
37 return 'a' + digit;
38 return 'A' - 26 + digit;
39 }
40
digit_index(char value)41 static int digit_index(char value) {
42 if (value >= 'a' && value <= 'z')
43 return value - 'a';
44 if (value >= 'A' && value <= 'J')
45 return value - 'A' + 26;
46 return -1;
47 }
48
isValidUnicodeScalar(uint32_t S)49 static bool isValidUnicodeScalar(uint32_t S) {
50 // Also accept the range of 0xD800 - 0xD880, which is used for non-symbol
51 // ASCII characters.
52 return (S < 0xD880) || (S >= 0xE000 && S <= 0x1FFFFF);
53 }
54
55 // Section 6.1: Bias adaptation function
56
adapt(int delta,int numpoints,bool firsttime)57 static int adapt(int delta, int numpoints, bool firsttime) {
58 if (firsttime)
59 delta = delta / damp;
60 else
61 delta = delta / 2;
62
63 delta += delta / numpoints;
64 int k = 0;
65 while (delta > ((base - tmin) * tmax) / 2) {
66 delta /= base - tmin;
67 k += base;
68 }
69 return k + (((base - tmin + 1) * delta) / (delta + skew));
70 }
71
72 // Section 6.2: Decoding procedure
73
decodePunycode(StringRef InputPunycode,std::vector<uint32_t> & OutCodePoints)74 bool Punycode::decodePunycode(StringRef InputPunycode,
75 std::vector<uint32_t> &OutCodePoints) {
76 OutCodePoints.clear();
77 OutCodePoints.reserve(InputPunycode.size());
78
79 // -- Build the decoded string as UTF32 first because we need random access.
80 uint32_t n = initial_n;
81 int i = 0;
82 int bias = initial_bias;
83 /// let output = an empty string indexed from 0
84 // consume all code points before the last delimiter (if there is one)
85 // and copy them to output,
86 size_t lastDelimiter = InputPunycode.find_last_of(delimiter);
87 if (lastDelimiter != StringRef::npos) {
88 for (char c : InputPunycode.slice(0, lastDelimiter)) {
89 // fail on any non-basic code point
90 if (static_cast<unsigned char>(c) > 0x7f)
91 return true;
92 OutCodePoints.push_back(c);
93 }
94 // if more than zero code points were consumed then consume one more
95 // (which will be the last delimiter)
96 InputPunycode =
97 InputPunycode.slice(lastDelimiter + 1, InputPunycode.size());
98 }
99
100 while (!InputPunycode.empty()) {
101 int oldi = i;
102 int w = 1;
103 for (int k = base; ; k += base) {
104 // consume a code point, or fail if there was none to consume
105 if (InputPunycode.empty())
106 return true;
107 char codePoint = InputPunycode.front();
108 InputPunycode = InputPunycode.slice(1, InputPunycode.size());
109 // let digit = the code point's digit-value, fail if it has none
110 int digit = digit_index(codePoint);
111 if (digit < 0)
112 return true;
113
114 i = i + digit * w;
115 int t = k <= bias ? tmin
116 : k >= bias + tmax ? tmax
117 : k - bias;
118 if (digit < t)
119 break;
120 w = w * (base - t);
121 }
122 bias = adapt(i - oldi, OutCodePoints.size() + 1, oldi == 0);
123 n = n + i / (OutCodePoints.size() + 1);
124 i = i % (OutCodePoints.size() + 1);
125 // if n is a basic code point then fail
126 if (n < 0x80)
127 return true;
128 // insert n into output at position i
129 OutCodePoints.insert(OutCodePoints.begin() + i, n);
130 i++;
131 }
132
133 return true;
134 }
135
136 // Section 6.3: Encoding procedure
137
encodePunycode(const std::vector<uint32_t> & InputCodePoints,std::string & OutPunycode)138 bool Punycode::encodePunycode(const std::vector<uint32_t> &InputCodePoints,
139 std::string &OutPunycode) {
140 OutPunycode.clear();
141
142 uint32_t n = initial_n;
143 int delta = 0;
144 int bias = initial_bias;
145
146 // let h = b = the number of basic code points in the input
147 // copy them to the output in order...
148 size_t h = 0;
149 for (auto C : InputCodePoints) {
150 if (C < 0x80) {
151 ++h;
152 OutPunycode.push_back(C);
153 }
154 if (!isValidUnicodeScalar(C)) {
155 OutPunycode.clear();
156 return false;
157 }
158 }
159 size_t b = h;
160 // ...followed by a delimiter if b > 0
161 if (b > 0)
162 OutPunycode.push_back(delimiter);
163
164 while (h < InputCodePoints.size()) {
165 // let m = the minimum code point >= n in the input
166 uint32_t m = 0x10FFFF;
167 for (auto codePoint : InputCodePoints) {
168 if (codePoint >= n && codePoint < m)
169 m = codePoint;
170 }
171
172 delta = delta + (m - n) * (h + 1);
173 n = m;
174 for (auto c : InputCodePoints) {
175 if (c < n) ++delta;
176 if (c == n) {
177 int q = delta;
178 for (int k = base; ; k += base) {
179 int t = k <= bias ? tmin
180 : k >= bias + tmax ? tmax
181 : k - bias;
182
183 if (q < t) break;
184 OutPunycode.push_back(digit_value(t + ((q - t) % (base - t))));
185 q = (q - t) / (base - t);
186 }
187 OutPunycode.push_back(digit_value(q));
188 bias = adapt(delta, h + 1, h == b);
189 delta = 0;
190 ++h;
191 }
192 }
193 ++delta; ++n;
194 }
195 return true;
196 }
197
encodeToUTF8(const std::vector<uint32_t> & Scalars,std::string & OutUTF8)198 static bool encodeToUTF8(const std::vector<uint32_t> &Scalars,
199 std::string &OutUTF8) {
200 for (auto S : Scalars) {
201 if (!isValidUnicodeScalar(S)) {
202 OutUTF8.clear();
203 return false;
204 }
205 if (S >= 0xD800 && S < 0xD880)
206 S -= 0xD800;
207
208 unsigned Bytes = 0;
209 if (S < 0x80)
210 Bytes = 1;
211 else if (S < 0x800)
212 Bytes = 2;
213 else if (S < 0x10000)
214 Bytes = 3;
215 else
216 Bytes = 4;
217
218 switch (Bytes) {
219 case 1:
220 OutUTF8.push_back(S);
221 break;
222 case 2: {
223 uint8_t Byte2 = (S | 0x80) & 0xBF;
224 S >>= 6;
225 uint8_t Byte1 = S | 0xC0;
226 OutUTF8.push_back(Byte1);
227 OutUTF8.push_back(Byte2);
228 break;
229 }
230 case 3: {
231 uint8_t Byte3 = (S | 0x80) & 0xBF;
232 S >>= 6;
233 uint8_t Byte2 = (S | 0x80) & 0xBF;
234 S >>= 6;
235 uint8_t Byte1 = S | 0xE0;
236 OutUTF8.push_back(Byte1);
237 OutUTF8.push_back(Byte2);
238 OutUTF8.push_back(Byte3);
239 break;
240 }
241 case 4: {
242 uint8_t Byte4 = (S | 0x80) & 0xBF;
243 S >>= 6;
244 uint8_t Byte3 = (S | 0x80) & 0xBF;
245 S >>= 6;
246 uint8_t Byte2 = (S | 0x80) & 0xBF;
247 S >>= 6;
248 uint8_t Byte1 = S | 0xF0;
249 OutUTF8.push_back(Byte1);
250 OutUTF8.push_back(Byte2);
251 OutUTF8.push_back(Byte3);
252 OutUTF8.push_back(Byte4);
253 break;
254 }
255 }
256 }
257 return true;
258 }
259
decodePunycodeUTF8(StringRef InputPunycode,std::string & OutUTF8)260 bool Punycode::decodePunycodeUTF8(StringRef InputPunycode,
261 std::string &OutUTF8) {
262 std::vector<uint32_t> OutCodePoints;
263 if (!decodePunycode(InputPunycode, OutCodePoints))
264 return false;
265
266 return encodeToUTF8(OutCodePoints, OutUTF8);
267 }
268
isContinuationByte(uint8_t unit)269 static bool isContinuationByte(uint8_t unit) {
270 return (unit & 0xC0) == 0x80;
271 }
272
273 /// Reencode well-formed UTF-8 as UTF-32.
274 ///
275 /// This entry point is only called from compiler-internal entry points, so does
276 /// only minimal validation. In particular, it does *not* check for overlong
277 /// encodings.
278 /// If \p mapNonSymbolChars is true, non-symbol ASCII characters (characters
279 /// except [$_a-zA-Z0-9]) are also encoded like non-ASCII unicode characters.
280 /// Returns false if \p InputUTF8 contains surrogate code points.
convertUTF8toUTF32(llvm::StringRef InputUTF8,std::vector<uint32_t> & OutUTF32,bool mapNonSymbolChars)281 static bool convertUTF8toUTF32(llvm::StringRef InputUTF8,
282 std::vector<uint32_t> &OutUTF32,
283 bool mapNonSymbolChars) {
284 auto ptr = InputUTF8.begin();
285 auto end = InputUTF8.end();
286 while (ptr < end) {
287 uint8_t first = *ptr++;
288 if (first < 0x80) {
289 if (Mangle::isValidSymbolChar(first) || !mapNonSymbolChars) {
290 OutUTF32.push_back(first);
291 } else {
292 OutUTF32.push_back((uint32_t)first + 0xD800);
293 }
294 } else if (first < 0xC0) {
295 // Invalid continuation byte.
296 return false;
297 } else if (first < 0xE0) {
298 // Two-byte sequence.
299 if (ptr == end)
300 return false;
301 uint8_t second = *ptr++;
302 if (!isContinuationByte(second))
303 return false;
304 OutUTF32.push_back(((first & 0x1F) << 6) | (second & 0x3F));
305 } else if (first < 0xF0) {
306 // Three-byte sequence.
307 if (end - ptr < 2)
308 return false;
309 uint8_t second = *ptr++;
310 uint8_t third = *ptr++;
311 if (!isContinuationByte(second) || !isContinuationByte(third))
312 return false;
313 OutUTF32.push_back(((first & 0xF) << 12) | ((second & 0x3F) << 6)
314 | ( third & 0x3F ));
315 } else if (first < 0xF8) {
316 // Four-byte sequence.
317 if (end - ptr < 3)
318 return false;
319 uint8_t second = *ptr++;
320 uint8_t third = *ptr++;
321 uint8_t fourth = *ptr++;
322 if (!isContinuationByte(second) || !isContinuationByte(third)
323 || !isContinuationByte(fourth))
324 return false;
325 OutUTF32.push_back(((first & 0x7) << 18) | ((second & 0x3F) << 12)
326 | ((third & 0x3F) << 6)
327 | ( fourth & 0x3F ));
328 } else {
329 // Unused sequence length.
330 return false;
331 }
332 }
333 return true;
334 }
335
encodePunycodeUTF8(StringRef InputUTF8,std::string & OutPunycode,bool mapNonSymbolChars)336 bool Punycode::encodePunycodeUTF8(StringRef InputUTF8,
337 std::string &OutPunycode,
338 bool mapNonSymbolChars) {
339 std::vector<uint32_t> InputCodePoints;
340 InputCodePoints.reserve(InputUTF8.size());
341
342 if (!convertUTF8toUTF32(InputUTF8, InputCodePoints, mapNonSymbolChars))
343 return false;
344
345 return encodePunycode(InputCodePoints, OutPunycode);
346 }
347
348