1 //===--- Punycode.cpp - Unicode to Punycode transcoding -------------------===//
2 //
3 // This source file is part of the Swift.org open source project
4 //
5 // Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
6 // Licensed under Apache License v2.0 with Runtime Library Exception
7 //
8 // See https://swift.org/LICENSE.txt for license information
9 // See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "swift/Demangling/Punycode.h"
14 #include "swift/Demangling/ManglingUtils.h"
15 #include <vector>
16 #include <cstdint>
17 
18 using namespace swift;
19 using namespace Punycode;
20 
21 // RFC 3492
22 // Section 5: Parameter values for Punycode
23 
24 static const int base         = 36;
25 static const int tmin         = 1;
26 static const int tmax         = 26;
27 static const int skew         = 38;
28 static const int damp         = 700;
29 static const int initial_bias = 72;
30 static const uint32_t initial_n = 128;
31 
32 static const char delimiter = '_';
33 
digit_value(int digit)34 static char digit_value(int digit) {
35   assert(digit < base && "invalid punycode digit");
36   if (digit < 26)
37     return 'a' + digit;
38   return 'A' - 26 + digit;
39 }
40 
digit_index(char value)41 static int digit_index(char value) {
42   if (value >= 'a' && value <= 'z')
43     return value - 'a';
44   if (value >= 'A' && value <= 'J')
45     return value - 'A' + 26;
46   return -1;
47 }
48 
isValidUnicodeScalar(uint32_t S)49 static bool isValidUnicodeScalar(uint32_t S) {
50   // Also accept the range of 0xD800 - 0xD880, which is used for non-symbol
51   // ASCII characters.
52   return (S < 0xD880) || (S >= 0xE000 && S <= 0x1FFFFF);
53 }
54 
55 // Section 6.1: Bias adaptation function
56 
adapt(int delta,int numpoints,bool firsttime)57 static int adapt(int delta, int numpoints, bool firsttime) {
58   if (firsttime)
59     delta = delta / damp;
60   else
61     delta = delta / 2;
62 
63   delta += delta / numpoints;
64   int k = 0;
65   while (delta > ((base - tmin) * tmax) / 2) {
66     delta /= base - tmin;
67     k += base;
68   }
69   return k + (((base - tmin + 1) * delta) / (delta + skew));
70 }
71 
72 // Section 6.2: Decoding procedure
73 
decodePunycode(StringRef InputPunycode,std::vector<uint32_t> & OutCodePoints)74 bool Punycode::decodePunycode(StringRef InputPunycode,
75                               std::vector<uint32_t> &OutCodePoints) {
76   OutCodePoints.clear();
77   OutCodePoints.reserve(InputPunycode.size());
78 
79   // -- Build the decoded string as UTF32 first because we need random access.
80   uint32_t n = initial_n;
81   int i = 0;
82   int bias = initial_bias;
83   /// let output = an empty string indexed from 0
84   // consume all code points before the last delimiter (if there is one)
85   //  and copy them to output,
86   size_t lastDelimiter = InputPunycode.find_last_of(delimiter);
87   if (lastDelimiter != StringRef::npos) {
88     for (char c : InputPunycode.slice(0, lastDelimiter)) {
89       // fail on any non-basic code point
90       if (static_cast<unsigned char>(c) > 0x7f)
91         return true;
92       OutCodePoints.push_back(c);
93     }
94     // if more than zero code points were consumed then consume one more
95     //  (which will be the last delimiter)
96     InputPunycode =
97         InputPunycode.slice(lastDelimiter + 1, InputPunycode.size());
98   }
99 
100   while (!InputPunycode.empty()) {
101     int oldi = i;
102     int w = 1;
103     for (int k = base; ; k += base) {
104       // consume a code point, or fail if there was none to consume
105       if (InputPunycode.empty())
106         return true;
107       char codePoint = InputPunycode.front();
108       InputPunycode = InputPunycode.slice(1, InputPunycode.size());
109       // let digit = the code point's digit-value, fail if it has none
110       int digit = digit_index(codePoint);
111       if (digit < 0)
112         return true;
113 
114       i = i + digit * w;
115       int t = k <= bias ? tmin
116             : k >= bias + tmax ? tmax
117             : k - bias;
118       if (digit < t)
119         break;
120       w = w * (base - t);
121     }
122     bias = adapt(i - oldi, OutCodePoints.size() + 1, oldi == 0);
123     n = n + i / (OutCodePoints.size() + 1);
124     i = i % (OutCodePoints.size() + 1);
125     // if n is a basic code point then fail
126     if (n < 0x80)
127       return true;
128     // insert n into output at position i
129     OutCodePoints.insert(OutCodePoints.begin() + i, n);
130     i++;
131   }
132 
133   return true;
134 }
135 
136 // Section 6.3: Encoding procedure
137 
encodePunycode(const std::vector<uint32_t> & InputCodePoints,std::string & OutPunycode)138 bool Punycode::encodePunycode(const std::vector<uint32_t> &InputCodePoints,
139                               std::string &OutPunycode) {
140   OutPunycode.clear();
141 
142   uint32_t n = initial_n;
143   int delta = 0;
144   int bias = initial_bias;
145 
146   // let h = b = the number of basic code points in the input
147   // copy them to the output in order...
148   size_t h = 0;
149   for (auto C : InputCodePoints) {
150     if (C < 0x80) {
151       ++h;
152       OutPunycode.push_back(C);
153     }
154     if (!isValidUnicodeScalar(C)) {
155       OutPunycode.clear();
156       return false;
157     }
158   }
159   size_t b = h;
160   // ...followed by a delimiter if b > 0
161   if (b > 0)
162     OutPunycode.push_back(delimiter);
163 
164   while (h < InputCodePoints.size()) {
165     // let m = the minimum code point >= n in the input
166     uint32_t m = 0x10FFFF;
167     for (auto codePoint : InputCodePoints) {
168       if (codePoint >= n && codePoint < m)
169         m = codePoint;
170     }
171 
172     delta = delta + (m - n) * (h + 1);
173     n = m;
174     for (auto c : InputCodePoints) {
175       if (c < n) ++delta;
176       if (c == n) {
177         int q = delta;
178         for (int k = base; ; k += base) {
179           int t = k <= bias ? tmin
180                 : k >= bias + tmax ? tmax
181                 : k - bias;
182 
183           if (q < t) break;
184           OutPunycode.push_back(digit_value(t + ((q - t) % (base - t))));
185           q = (q - t) / (base - t);
186         }
187         OutPunycode.push_back(digit_value(q));
188         bias = adapt(delta, h + 1, h == b);
189         delta = 0;
190         ++h;
191       }
192     }
193     ++delta; ++n;
194   }
195   return true;
196 }
197 
encodeToUTF8(const std::vector<uint32_t> & Scalars,std::string & OutUTF8)198 static bool encodeToUTF8(const std::vector<uint32_t> &Scalars,
199                          std::string &OutUTF8) {
200   for (auto S : Scalars) {
201     if (!isValidUnicodeScalar(S)) {
202       OutUTF8.clear();
203       return false;
204     }
205     if (S >= 0xD800 && S < 0xD880)
206       S -= 0xD800;
207 
208     unsigned Bytes = 0;
209     if (S < 0x80)
210       Bytes = 1;
211     else if (S < 0x800)
212       Bytes = 2;
213     else if (S < 0x10000)
214       Bytes = 3;
215     else
216       Bytes = 4;
217 
218     switch (Bytes) {
219     case 1:
220       OutUTF8.push_back(S);
221       break;
222     case 2: {
223       uint8_t Byte2 = (S | 0x80) & 0xBF;
224       S >>= 6;
225       uint8_t Byte1 = S | 0xC0;
226       OutUTF8.push_back(Byte1);
227       OutUTF8.push_back(Byte2);
228       break;
229     }
230     case 3: {
231       uint8_t Byte3 = (S | 0x80) & 0xBF;
232       S >>= 6;
233       uint8_t Byte2 = (S | 0x80) & 0xBF;
234       S >>= 6;
235       uint8_t Byte1 = S | 0xE0;
236       OutUTF8.push_back(Byte1);
237       OutUTF8.push_back(Byte2);
238       OutUTF8.push_back(Byte3);
239       break;
240     }
241     case 4: {
242       uint8_t Byte4 = (S | 0x80) & 0xBF;
243       S >>= 6;
244       uint8_t Byte3 = (S | 0x80) & 0xBF;
245       S >>= 6;
246       uint8_t Byte2 = (S | 0x80) & 0xBF;
247       S >>= 6;
248       uint8_t Byte1 = S | 0xF0;
249       OutUTF8.push_back(Byte1);
250       OutUTF8.push_back(Byte2);
251       OutUTF8.push_back(Byte3);
252       OutUTF8.push_back(Byte4);
253       break;
254     }
255     }
256   }
257   return true;
258 }
259 
decodePunycodeUTF8(StringRef InputPunycode,std::string & OutUTF8)260 bool Punycode::decodePunycodeUTF8(StringRef InputPunycode,
261                                   std::string &OutUTF8) {
262   std::vector<uint32_t> OutCodePoints;
263   if (!decodePunycode(InputPunycode, OutCodePoints))
264     return false;
265 
266   return encodeToUTF8(OutCodePoints, OutUTF8);
267 }
268 
isContinuationByte(uint8_t unit)269 static bool isContinuationByte(uint8_t unit) {
270   return (unit & 0xC0) == 0x80;
271 }
272 
273 /// Reencode well-formed UTF-8 as UTF-32.
274 ///
275 /// This entry point is only called from compiler-internal entry points, so does
276 /// only minimal validation. In particular, it does *not* check for overlong
277 /// encodings.
278 /// If \p mapNonSymbolChars is true, non-symbol ASCII characters (characters
279 /// except [$_a-zA-Z0-9]) are also encoded like non-ASCII unicode characters.
280 /// Returns false if \p InputUTF8 contains surrogate code points.
convertUTF8toUTF32(llvm::StringRef InputUTF8,std::vector<uint32_t> & OutUTF32,bool mapNonSymbolChars)281 static bool convertUTF8toUTF32(llvm::StringRef InputUTF8,
282                                std::vector<uint32_t> &OutUTF32,
283                                bool mapNonSymbolChars) {
284   auto ptr = InputUTF8.begin();
285   auto end = InputUTF8.end();
286   while (ptr < end) {
287     uint8_t first = *ptr++;
288     if (first < 0x80) {
289       if (Mangle::isValidSymbolChar(first) || !mapNonSymbolChars) {
290         OutUTF32.push_back(first);
291       } else {
292         OutUTF32.push_back((uint32_t)first + 0xD800);
293       }
294     } else if (first < 0xC0) {
295       // Invalid continuation byte.
296       return false;
297     } else if (first < 0xE0) {
298       // Two-byte sequence.
299       if (ptr == end)
300         return false;
301       uint8_t second = *ptr++;
302       if (!isContinuationByte(second))
303         return false;
304       OutUTF32.push_back(((first & 0x1F) << 6) | (second & 0x3F));
305     } else if (first < 0xF0) {
306       // Three-byte sequence.
307       if (end - ptr < 2)
308         return false;
309       uint8_t second = *ptr++;
310       uint8_t third = *ptr++;
311       if (!isContinuationByte(second) || !isContinuationByte(third))
312         return false;
313       OutUTF32.push_back(((first & 0xF) << 12) | ((second & 0x3F) << 6)
314                          | ( third  & 0x3F      ));
315     } else if (first < 0xF8) {
316       // Four-byte sequence.
317       if (end - ptr < 3)
318         return false;
319       uint8_t second = *ptr++;
320       uint8_t third = *ptr++;
321       uint8_t fourth = *ptr++;
322       if (!isContinuationByte(second) || !isContinuationByte(third)
323           || !isContinuationByte(fourth))
324         return false;
325       OutUTF32.push_back(((first & 0x7) << 18) | ((second & 0x3F) << 12)
326                          | ((third  & 0x3F) <<  6)
327                          | ( fourth & 0x3F       ));
328     } else {
329       // Unused sequence length.
330       return false;
331     }
332   }
333   return true;
334 }
335 
encodePunycodeUTF8(StringRef InputUTF8,std::string & OutPunycode,bool mapNonSymbolChars)336 bool Punycode::encodePunycodeUTF8(StringRef InputUTF8,
337                                   std::string &OutPunycode,
338                                   bool mapNonSymbolChars) {
339   std::vector<uint32_t> InputCodePoints;
340   InputCodePoints.reserve(InputUTF8.size());
341 
342   if (!convertUTF8toUTF32(InputUTF8, InputCodePoints, mapNonSymbolChars))
343     return false;
344 
345   return encodePunycode(InputCodePoints, OutPunycode);
346 }
347 
348