1 /* ScummVM - Graphic Adventure Engine
2 *
3 * ScummVM is the legal property of its developers, whose names
4 * are too numerous to list here. Please refer to the COPYRIGHT
5 * file distributed with this source distribution.
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2
10 * of the License, or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 */
22
23 /**
24 * Copyright (C) 2011 by Ben Noordhuis <info@bnoordhuis.nl>
25 *
26 * Permission is hereby granted, free of charge, to any person obtaining a copy
27 * of this software and associated documentation files (the "Software"), to deal
28 * in the Software without restriction, including without limitation the rights
29 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
30 * copies of the Software, and to permit persons to whom the Software is
31 * furnished to do so, subject to the following conditions:
32 *
33 * The above copyright notice and this permission notice shall be included in
34 * all copies or substantial portions of the Software.
35 *
36 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
37 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
38 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
39 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
40 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
41 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
42 * THE SOFTWARE.
43 */
44
45 #include "common/punycode.h"
46 #include "common/debug.h"
47 #include "common/tokenizer.h"
48 #include "common/util.h"
49
50 namespace Common {
51
52 /* punycode parameters, see http://tools.ietf.org/html/rfc3492#section-5 */
53 #define BASE 36
54 #define TMIN 1
55 #define TMAX 26
56 #define SKEW 38
57 #define DAMP 700
58 #define INITIAL_N 128
59 #define INITIAL_BIAS 72
60 #define SMAX 2147483647 // maximum Unicode code point
61
62 #define SPECIAL_SYMBOLS "/\":*|\\?%<>,;="
63
adapt_bias(uint32 delta,unsigned n_points,int is_first)64 static uint32 adapt_bias(uint32 delta, unsigned n_points, int is_first) {
65 uint32 k;
66
67 delta /= is_first ? DAMP : 2;
68 delta += delta / n_points;
69
70 /* while delta > 455: delta /= 35 */
71 for (k = 0; delta > ((BASE - TMIN) * TMAX) / 2; k += BASE) {
72 delta /= (BASE - TMIN);
73 }
74
75 return k + (((BASE - TMIN + 1) * delta) / (delta + SKEW));
76 }
77
encode_digit(int c)78 static char encode_digit(int c) {
79 assert(c >= 0 && c <= BASE - TMIN);
80 if (c > 25) {
81 return c + 22; /* '0'..'9' */
82 } else {
83 return c + 0x61; /* 'a'..'z' */
84 }
85 }
86
87 /* Encode as a generalized variable-length integer. Returns number of bytes written. */
encode_var_int(const size_t bias,const size_t delta)88 static String encode_var_int(const size_t bias, const size_t delta) {
89 size_t k, q, t;
90 String dst;
91
92 k = BASE;
93 q = delta;
94
95 while (true) {
96 if (k <= bias) {
97 t = TMIN;
98 } else if (k >= bias + TMAX) {
99 t = TMAX;
100 } else {
101 t = k - bias;
102 }
103
104 if (q < t) {
105 break;
106 }
107
108 dst += encode_digit(t + (q - t) % (BASE - t));
109
110 q = (q - t) / (BASE - t);
111 k += BASE;
112 }
113
114 dst += encode_digit(q);
115
116 return dst;
117 }
118
decode_digit(uint32 v)119 static size_t decode_digit(uint32 v) {
120 if (Common::isDigit(v)) {
121 return 26 + (v - '0');
122 }
123 if (Common::isLower(v)) {
124 return v - 'a';
125 }
126 if (Common::isUpper(v)) {
127 return v - 'A';
128 }
129 return SMAX;
130 }
131
punycode_encode(U32String src)132 String punycode_encode(U32String src) {
133 size_t srclen = src.size();
134 size_t h = 0, si;
135 String dst = "xn--";
136
137 for (si = 0; si < srclen; si++) {
138 if (src[si] < 128) {
139 dst += src[si];
140 h++;
141 }
142 }
143
144 size_t b = h;
145
146 // If every character is ASCII, return the original string.
147 if (h == srclen) {
148 // If string ends with space or dot, still punycode it
149 // because certain FSes do not support files with those
150 // endings
151 if (src[h - 1] == ' ' || src[h - 1] == '.')
152 return dst + '-';
153
154 return src;
155 }
156
157 // If we have any ASCII characters, add '-' to separate them from
158 // the non-ASCII character insertions.
159 if (h > 0)
160 dst += "-";
161
162 size_t n = INITIAL_N;
163 size_t bias = INITIAL_BIAS;
164 size_t delta = 0;
165 size_t m;
166
167 for (; h < srclen; n++, delta++) {
168 /* Find next smallest non-basic code point. */
169 for (m = SMAX, si = 0; si < srclen; si++) {
170 if (src[si] >= n && src[si] < m) {
171 m = src[si];
172 }
173 }
174
175 if ((m - n) > (SMAX - delta) / (h + 1)) {
176 /* OVERFLOW */
177 warning("punycode_encode: overflow1");
178 return src;
179 }
180
181 delta += (m - n) * (h + 1);
182 n = m;
183
184 for (si = 0; si < srclen; si++) {
185 if (src[si] < n) {
186 if (++delta == 0) {
187 /* OVERFLOW */
188 warning("punycode_encode: overflow2");
189 return src;
190 }
191 } else if (src[si] == n) {
192 dst += encode_var_int(bias, delta);
193 bias = adapt_bias(delta, h + 1, h == b);
194 delta = 0;
195 h++;
196 }
197 }
198 }
199
200 /* Return how many Unicode code points were converted. */
201 return dst;
202 }
203
punycode_hasprefix(const String src)204 bool punycode_hasprefix(const String src) {
205 return src.hasPrefix("xn--");
206 }
207
punycode_needEncode(const String src)208 bool punycode_needEncode(const String src) {
209 for (uint si = 0; si < src.size(); si++) {
210 if (src[si] & 0x80 || src[si] < 0x20 || strchr(SPECIAL_SYMBOLS, src[si])) {
211 return true;
212 }
213 }
214
215 // If name ends with space or dot, we have to encode it
216 if (src[src.size() - 1] == ' ' || src[src.size() - 1] == '.')
217 return true;
218
219 return false;
220 }
221
punycode_decode(const String src1)222 U32String punycode_decode(const String src1) {
223 if (!src1.hasPrefix("xn--"))
224 return src1;
225
226 String src(&src1.c_str()[4]); // Skip the prefix for simplification
227 int srclen = src.size();
228
229 /* Ensure that the input contains only ASCII characters. */
230 for (int si = 0; si < srclen; si++) {
231 if (src[si] & 0x80) {
232 return src1;
233 }
234 }
235
236 size_t di = src.findLastOf('-');
237
238 // If we have no '-', the entire string is non-ASCII character insertions.
239 if (di == String::npos)
240 di = 0;
241
242 U32String dst;
243
244 for (size_t i = 0; i < di; i++) {
245 dst += src[i];
246 }
247
248 size_t b = di;
249 size_t i = 0;
250 size_t n = INITIAL_N;
251 size_t bias = INITIAL_BIAS;
252
253 for (int si = b + (b > 0 ? 1 : 0); si < srclen; di++) {
254 size_t org_i = i;
255
256 for (size_t w = 1, k = BASE; true; k += BASE) {
257 size_t digit = decode_digit(src[si++]);
258
259 if (digit == SMAX) {
260 warning("punycode_decode: incorrect digit");
261 return src1;
262 }
263
264 if (digit > (SMAX - i) / w) {
265 /* OVERFLOW */
266 warning("punycode_decode: overflow1");
267 return src1;
268 }
269
270 i += digit * w;
271 size_t t;
272
273 if (k <= bias) {
274 t = TMIN;
275 } else if (k >= bias + TMAX) {
276 t = TMAX;
277 } else {
278 t = k - bias;
279 }
280
281 if (digit < t) {
282 break;
283 }
284
285 if (w > SMAX / (BASE - t)) {
286 /* OVERFLOW */
287 warning("punycode_decode: overflow2");
288 return src1;
289 }
290
291 w *= BASE - t;
292 }
293
294 bias = adapt_bias(i - org_i, di + 1, org_i == 0);
295
296 if (i / (di + 1) > SMAX - n) {
297 /* OVERFLOW */
298 warning("punycode_decode: overflow3");
299 return src1;
300 }
301
302 n += i / (di + 1);
303 i %= (di + 1);
304
305 U32String dst1(dst.c_str(), i);
306 dst1 += (u32char_type_t)n;
307 dst1 += U32String(&dst.c_str()[i]);
308 dst = dst1;
309 i++;
310 }
311
312 return dst;
313 }
314
punycode_encodefilename(const U32String src)315 String punycode_encodefilename(const U32String src) {
316 U32String dst;
317
318 for (uint i = 0; i < src.size(); i++) {
319 if (src[i] == 0x81) { // In case we have our escape character present
320 dst += 0x81;
321 dst += 0x79;
322 // Encode special symbols and non-printables
323 } else if ((src[i] < 0x80 && strchr(SPECIAL_SYMBOLS, (byte)src[i])) || src[i] < 0x20) {
324 dst += 0x81;
325 dst += src[i] + 0x80;
326 } else {
327 dst += src[i];
328 }
329 }
330
331 return punycode_encode(dst);
332 }
333
punycode_decodefilename(const String src1)334 U32String punycode_decodefilename(const String src1) {
335 U32String dst;
336 U32String src = punycode_decode(src1);
337
338 // Check if the string did not change which could be
339 // also on decoding failure
340 if (src == src1)
341 return src;
342
343 for (uint i = 0; i < src.size(); i++) {
344 if (src[i] == 0x81 && i + 1 < src.size()) {
345 i++;
346 if (src[i] == 0x79)
347 dst += 0x81;
348 else
349 dst += src[i] - 0x80;
350 } else {
351 dst += src[i];
352 }
353 }
354
355 return dst;
356 }
357
punycode_decodepath(const Path & src)358 Path punycode_decodepath(const Path &src) {
359 StringTokenizer tok(src.rawString(), Common::String(DIR_SEPARATOR));
360 String res;
361
362 while (!tok.empty()) {
363 res += punycode_decodefilename(tok.nextToken());
364 if (!tok.empty())
365 res += DIR_SEPARATOR;
366 }
367
368 return Path(res, DIR_SEPARATOR);
369 }
370
punycode_encodepath(const Path & src)371 Path punycode_encodepath(const Path &src) {
372 StringTokenizer tok(src.rawString(), Common::String(DIR_SEPARATOR));
373 String res;
374
375 while (!tok.empty()) {
376 String part = tok.nextToken();
377 if (punycode_needEncode(part))
378 res += punycode_encodefilename(part);
379 else
380 res += part;
381
382 if (!tok.empty())
383 res += DIR_SEPARATOR;
384 }
385
386 return Path(res, DIR_SEPARATOR);
387 }
388
389 } // end of namespace Common
390