1 /* ScummVM - Graphic Adventure Engine
2  *
3  * ScummVM is the legal property of its developers, whose names
4  * are too numerous to list here. Please refer to the COPYRIGHT
5  * file distributed with this source distribution.
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License
9  * as published by the Free Software Foundation; either version 2
10  * of the License, or (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20  *
21  */
22 
23 /**
24  * Copyright (C) 2011 by Ben Noordhuis <info@bnoordhuis.nl>
25  *
26  * Permission is hereby granted, free of charge, to any person obtaining a copy
27  * of this software and associated documentation files (the "Software"), to deal
28  * in the Software without restriction, including without limitation the rights
29  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
30  * copies of the Software, and to permit persons to whom the Software is
31  * furnished to do so, subject to the following conditions:
32  *
33  * The above copyright notice and this permission notice shall be included in
34  * all copies or substantial portions of the Software.
35  *
36  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
37  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
38  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
39  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
40  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
41  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
42  * THE SOFTWARE.
43  */
44 
45 #include "common/punycode.h"
46 #include "common/debug.h"
47 #include "common/tokenizer.h"
48 #include "common/util.h"
49 
50 namespace Common {
51 
52 /* punycode parameters, see http://tools.ietf.org/html/rfc3492#section-5 */
53 #define BASE 36
54 #define TMIN 1
55 #define TMAX 26
56 #define SKEW 38
57 #define DAMP 700
58 #define INITIAL_N 128
59 #define INITIAL_BIAS 72
60 #define SMAX 2147483647 // maximum Unicode code point
61 
62 #define SPECIAL_SYMBOLS "/\":*|\\?%<>,;="
63 
adapt_bias(uint32 delta,unsigned n_points,int is_first)64 static uint32 adapt_bias(uint32 delta, unsigned n_points, int is_first) {
65 	uint32 k;
66 
67 	delta /= is_first ? DAMP : 2;
68 	delta += delta / n_points;
69 
70 	/* while delta > 455: delta /= 35 */
71 	for (k = 0; delta > ((BASE - TMIN) * TMAX) / 2; k += BASE) {
72 		delta /= (BASE - TMIN);
73 	}
74 
75 	return k + (((BASE - TMIN + 1) * delta) / (delta + SKEW));
76 }
77 
encode_digit(int c)78 static char encode_digit(int c) {
79 	assert(c >= 0 && c <= BASE - TMIN);
80 	if (c > 25) {
81 		return c + 22; /* '0'..'9' */
82 	} else {
83 		return c + 0x61; /* 'a'..'z' */
84 	}
85 }
86 
87 /* Encode as a generalized variable-length integer. Returns number of bytes written. */
encode_var_int(const size_t bias,const size_t delta)88 static String encode_var_int(const size_t bias, const size_t delta) {
89 	size_t k, q, t;
90 	String dst;
91 
92 	k = BASE;
93 	q = delta;
94 
95 	while (true) {
96 		if (k <= bias) {
97 			t = TMIN;
98 		} else if (k >= bias + TMAX) {
99 			t = TMAX;
100 		} else {
101 			t = k - bias;
102 		}
103 
104 		if (q < t) {
105 			break;
106 		}
107 
108 		dst += encode_digit(t + (q - t) % (BASE - t));
109 
110 		q = (q - t) / (BASE - t);
111 		k += BASE;
112 	}
113 
114 	dst += encode_digit(q);
115 
116 	return dst;
117 }
118 
decode_digit(uint32 v)119 static size_t decode_digit(uint32 v) {
120 	if (Common::isDigit(v)) {
121 		return 26 + (v - '0');
122 	}
123 	if (Common::isLower(v)) {
124 		return v - 'a';
125 	}
126 	if (Common::isUpper(v)) {
127 		return v - 'A';
128 	}
129 	return SMAX;
130 }
131 
punycode_encode(U32String src)132 String punycode_encode(U32String src) {
133 	size_t srclen = src.size();
134 	size_t h = 0, si;
135 	String dst = "xn--";
136 
137 	for (si = 0; si < srclen; si++) {
138 		if (src[si] < 128) {
139 			dst += src[si];
140 			h++;
141 		}
142 	}
143 
144 	size_t b = h;
145 
146 	// If every character is ASCII, return the original string.
147 	if (h == srclen) {
148 		// If string ends with space or dot, still punycode it
149 		// because certain FSes do not support files with those
150 		// endings
151 		if (src[h - 1] == ' ' || src[h - 1] == '.')
152 			return dst + '-';
153 
154 		return src;
155 	}
156 
157 	// If we have any ASCII characters, add '-' to separate them from
158 	// the non-ASCII character insertions.
159 	if (h > 0)
160 		dst += "-";
161 
162 	size_t n = INITIAL_N;
163 	size_t bias = INITIAL_BIAS;
164 	size_t delta = 0;
165 	size_t m;
166 
167 	for (; h < srclen; n++, delta++) {
168 		/* Find next smallest non-basic code point. */
169 		for (m = SMAX, si = 0; si < srclen; si++) {
170 			if (src[si] >= n && src[si] < m) {
171 				m = src[si];
172 			}
173 		}
174 
175 		if ((m - n) > (SMAX - delta) / (h + 1)) {
176 			/* OVERFLOW */
177 			warning("punycode_encode: overflow1");
178 			return src;
179 		}
180 
181 		delta += (m - n) * (h + 1);
182 		n = m;
183 
184 		for (si = 0; si < srclen; si++) {
185 			if (src[si] < n) {
186 				if (++delta == 0) {
187 					/* OVERFLOW */
188 					warning("punycode_encode: overflow2");
189 					return src;
190 				}
191 			} else if (src[si] == n) {
192 				dst += encode_var_int(bias, delta);
193 				bias = adapt_bias(delta, h + 1, h == b);
194 				delta = 0;
195 				h++;
196 			}
197 		}
198 	}
199 
200 	/* Return how many Unicode code points were converted. */
201 	return dst;
202 }
203 
punycode_hasprefix(const String src)204 bool punycode_hasprefix(const String src) {
205 	return src.hasPrefix("xn--");
206 }
207 
punycode_needEncode(const String src)208 bool punycode_needEncode(const String src) {
209 	for (uint si = 0; si < src.size(); si++) {
210 		if (src[si] & 0x80 || src[si] < 0x20 || strchr(SPECIAL_SYMBOLS, src[si])) {
211 			return true;
212 		}
213 	}
214 
215 	// If name ends with space or dot, we have to encode it
216 	if (src[src.size() - 1] == ' ' || src[src.size() - 1] == '.')
217 		return true;
218 
219 	return false;
220 }
221 
punycode_decode(const String src1)222 U32String punycode_decode(const String src1) {
223 	if (!src1.hasPrefix("xn--"))
224 		return src1;
225 
226 	String src(&src1.c_str()[4]); // Skip the prefix for simplification
227 	int srclen = src.size();
228 
229 	/* Ensure that the input contains only ASCII characters. */
230 	for (int si = 0; si < srclen; si++) {
231 		if (src[si] & 0x80) {
232 			return src1;
233 		}
234 	}
235 
236 	size_t di = src.findLastOf('-');
237 
238 	// If we have no '-', the entire string is non-ASCII character insertions.
239 	if (di == String::npos)
240 		di = 0;
241 
242 	U32String dst;
243 
244 	for (size_t i = 0; i < di; i++) {
245 		dst += src[i];
246 	}
247 
248 	size_t b = di;
249 	size_t i = 0;
250 	size_t n = INITIAL_N;
251 	size_t bias = INITIAL_BIAS;
252 
253 	for (int si = b + (b > 0 ? 1 : 0); si < srclen; di++) {
254 		size_t org_i = i;
255 
256 		for (size_t w = 1, k = BASE; true; k += BASE) {
257 			size_t digit = decode_digit(src[si++]);
258 
259 			if (digit == SMAX) {
260 				warning("punycode_decode: incorrect digit");
261 				return src1;
262 			}
263 
264 			if (digit > (SMAX - i) / w) {
265 				/* OVERFLOW */
266 				warning("punycode_decode: overflow1");
267 				return src1;
268 			}
269 
270 			i += digit * w;
271 			size_t t;
272 
273 			if (k <= bias) {
274 				t = TMIN;
275 			} else if (k >= bias + TMAX) {
276 				t = TMAX;
277 			} else {
278 				t = k - bias;
279 			}
280 
281 			if (digit < t) {
282 				break;
283 			}
284 
285 			if (w > SMAX / (BASE - t)) {
286 				/* OVERFLOW */
287 				warning("punycode_decode: overflow2");
288 				return src1;
289 			}
290 
291 			w *= BASE - t;
292 		}
293 
294 		bias = adapt_bias(i - org_i, di + 1, org_i == 0);
295 
296 		if (i / (di + 1) > SMAX - n) {
297 			/* OVERFLOW */
298 				warning("punycode_decode: overflow3");
299 			return src1;
300 		}
301 
302 		n += i / (di + 1);
303 		i %= (di + 1);
304 
305 		U32String dst1(dst.c_str(), i);
306 		dst1 += (u32char_type_t)n;
307 		dst1 += U32String(&dst.c_str()[i]);
308 		dst = dst1;
309 		i++;
310 	}
311 
312 	return dst;
313 }
314 
punycode_encodefilename(const U32String src)315 String punycode_encodefilename(const U32String src) {
316 	U32String dst;
317 
318 	for (uint i = 0; i < src.size(); i++) {
319 		if (src[i] == 0x81) {	// In case we have our escape character present
320 			dst += 0x81;
321 			dst += 0x79;
322 		// Encode special symbols and non-printables
323 		} else if ((src[i] < 0x80 && strchr(SPECIAL_SYMBOLS, (byte)src[i])) || src[i] < 0x20) {
324 			dst += 0x81;
325 			dst += src[i] + 0x80;
326 		} else {
327 			dst += src[i];
328 		}
329 	}
330 
331 	return punycode_encode(dst);
332 }
333 
punycode_decodefilename(const String src1)334 U32String punycode_decodefilename(const String src1) {
335 	U32String dst;
336 	U32String src = punycode_decode(src1);
337 
338 	// Check if the string did not change which could be
339 	// also on decoding failure
340 	if (src == src1)
341 		return src;
342 
343 	for (uint i = 0; i < src.size(); i++) {
344 		if (src[i] == 0x81 && i + 1 < src.size()) {
345 			i++;
346 			if (src[i] == 0x79)
347 				dst += 0x81;
348 			else
349 				dst += src[i] - 0x80;
350 		} else {
351 			dst += src[i];
352 		}
353 	}
354 
355 	return dst;
356 }
357 
punycode_decodepath(const Path & src)358 Path punycode_decodepath(const Path &src) {
359 	StringTokenizer tok(src.rawString(), Common::String(DIR_SEPARATOR));
360 	String res;
361 
362 	while (!tok.empty()) {
363 		res += punycode_decodefilename(tok.nextToken());
364 		if (!tok.empty())
365 			res += DIR_SEPARATOR;
366 	}
367 
368 	return Path(res, DIR_SEPARATOR);
369 }
370 
punycode_encodepath(const Path & src)371 Path punycode_encodepath(const Path &src) {
372 	StringTokenizer tok(src.rawString(), Common::String(DIR_SEPARATOR));
373 	String res;
374 
375 	while (!tok.empty()) {
376 		String part = tok.nextToken();
377 		if (punycode_needEncode(part))
378 			res += punycode_encodefilename(part);
379 		else
380 			res += part;
381 
382 		if (!tok.empty())
383 			res += DIR_SEPARATOR;
384 	}
385 
386 	return Path(res, DIR_SEPARATOR);
387 }
388 
389 } // end of namespace Common
390