1 /*
2 punycode.c from RFC 3492
3 http://www.nicemice.net/idn/
4 Adam M. Costello
5 http://www.nicemice.net/amc/
6
7 This is ANSI C code (C89) implementing Punycode (RFC 3492).
8
9
10 C. Disclaimer and license
11
12 Regarding this entire document or any portion of it (including
13 the pseudocode and C code), the author makes no guarantees and
14 is not responsible for any damage resulting from its use. The
15 author grants irrevocable permission to anyone to use, modify,
16 and distribute it in any way that does not diminish the rights
17 of anyone else to use, modify, and distribute it, provided that
18 redistributed derivative works do not contain misleading author or
19 version information. Derivative works need not be licensed under
20 similar terms.
21 */
22
23 #include "punycode.h"
24
25 /**********************************************************/
26 /* Implementation (would normally go in its own .c file): */
27
28 #include <string.h>
29
30 /*** Bootstring parameters for Punycode ***/
31
32 enum {
33 base = 36,
34 tmin = 1,
35 tmax = 26,
36 skew = 38,
37 damp = 700,
38 initial_bias = 72,
39 initial_n = 0x80,
40 delimiter = 0x2D
41 };
42
43 /* basic(cp) tests whether cp is a basic code point: */
44 #define basic(cp) ((punycode_uint)(cp) < 0x80)
45
46 /* delim(cp) tests whether cp is a delimiter: */
47 #define delim(cp) ((cp) == delimiter)
48
49 /* decode_digit(cp) returns the numeric value of a basic code */
50 /* point (for use in representing integers) in the range 0 to */
51 /* base-1, or base if cp is does not represent a value. */
52
decode_digit(punycode_uint cp)53 static punycode_uint decode_digit(punycode_uint cp) {
54 return cp - 48 < 10 ? cp - 22
55 : cp - 65 < 26 ? cp - 65 : cp - 97 < 26 ? cp - 97 : base;
56 }
57
58 /* encode_digit(d,flag) returns the basic code point whose value */
59 /* (when used for representing integers) is d, which needs to be in */
60 /* the range 0 to base-1. The lowercase form is used unless flag is */
61 /* nonzero, in which case the uppercase form is used. The behavior */
62 /* is undefined if flag is nonzero and digit d has no uppercase form. */
63
encode_digit(punycode_uint d,int flag)64 static char encode_digit(punycode_uint d, int flag) {
65 return d + 22 + 75 * (d < 26) - ((flag != 0) << 5);
66 /* 0..25 map to ASCII a..z or A..Z */
67 /* 26..35 map to ASCII 0..9 */
68 }
69
70 /* flagged(bcp) tests whether a basic code point is flagged */
71 /* (uppercase). The behavior is undefined if bcp is not a */
72 /* basic code point. */
73
74 #define flagged(bcp) ((punycode_uint)(bcp)-65 < 26)
75
76 /* encode_basic(bcp,flag) forces a basic code point to lowercase */
77 /* if flag is zero, uppercase if flag is nonzero, and returns */
78 /* the resulting code point. The code point is unchanged if it */
79 /* is caseless. The behavior is undefined if bcp is not a basic */
80 /* code point. */
81
encode_basic(punycode_uint bcp,int flag)82 static char encode_basic(punycode_uint bcp, int flag) {
83 bcp -= (bcp - 97 < 26) << 5;
84 return bcp + ((!flag && (bcp - 65 < 26)) << 5);
85 }
86
87 /*** Platform-specific constants ***/
88
89 /* maxint is the maximum value of a punycode_uint variable: */
90 static const punycode_uint maxint = (punycode_uint)-1;
91 /* Because maxint is unsigned, -1 becomes the maximum value. */
92
93 /*** Bias adaptation function ***/
94
adapt(punycode_uint delta,punycode_uint numpoints,int firsttime)95 static punycode_uint adapt(punycode_uint delta, punycode_uint numpoints,
96 int firsttime) {
97 punycode_uint k;
98
99 delta = firsttime ? delta / damp : delta >> 1;
100 /* delta >> 1 is a faster way of doing delta / 2 */
101 delta += delta / numpoints;
102
103 for (k = 0; delta > ((base - tmin) * tmax) / 2; k += base) {
104 delta /= base - tmin;
105 }
106
107 return k + (base - tmin + 1) * delta / (delta + skew);
108 }
109
110 /*** Main encode function ***/
111
punycode_encode(punycode_uint input_length,const punycode_uint input[],const unsigned char case_flags[],punycode_uint * output_length,char output[])112 enum punycode_status punycode_encode(punycode_uint input_length,
113 const punycode_uint input[],
114 const unsigned char case_flags[],
115 punycode_uint *output_length,
116 char output[]) {
117 punycode_uint n, delta, h, b, out, max_out, bias, j, m, q, k, t;
118
119 /* Initialize the state: */
120
121 n = initial_n;
122 delta = out = 0;
123 max_out = *output_length;
124 bias = initial_bias;
125
126 /* Handle the basic code points: */
127
128 for (j = 0; j < input_length; ++j) {
129 if (basic(input[j])) {
130 if (max_out - out < 2) return punycode_big_output;
131 output[out++] =
132 case_flags ? encode_basic(input[j], case_flags[j]) : (char)input[j];
133 }
134 /* else if (input[j] < n) return punycode_bad_input; */
135 /* (not needed for Punycode with unsigned code points) */
136 }
137
138 h = b = out;
139
140 /* h is the number of code points that have been handled, b is the */
141 /* number of basic code points, and out is the number of characters */
142 /* that have been output. */
143
144 if (b > 0) output[out++] = delimiter;
145
146 /* Main encoding loop: */
147
148 while (h < input_length) {
149 /* All non-basic code points < n have been */
150 /* handled already. Find the next larger one: */
151
152 for (m = maxint, j = 0; j < input_length; ++j) {
153 /* if (basic(input[j])) continue; */
154 /* (not needed for Punycode) */
155 if (input[j] >= n && input[j] < m) m = input[j];
156 }
157
158 /* Increase delta enough to advance the decoder's */
159 /* <n,i> state to <m,0>, but guard against overflow: */
160
161 if (m - n > (maxint - delta) / (h + 1)) return punycode_overflow;
162 delta += (m - n) * (h + 1);
163 n = m;
164
165 for (j = 0; j < input_length; ++j) {
166 /* Punycode does not need to check whether input[j] is basic: */
167 if (input[j] < n /* || basic(input[j]) */) {
168 if (++delta == 0) return punycode_overflow;
169 }
170
171 if (input[j] == n) {
172 /* Represent delta as a generalized variable-length integer: */
173
174 for (q = delta, k = base;; k += base) {
175 if (out >= max_out) return punycode_big_output;
176 t = k <= bias /* + tmin */ ? tmin : /* +tmin not needed */
177 k >= bias + tmax ? tmax : k - bias;
178 if (q < t) break;
179 output[out++] = encode_digit(t + (q - t) % (base - t), 0);
180 q = (q - t) / (base - t);
181 }
182
183 output[out++] = encode_digit(q, case_flags && case_flags[j]);
184 bias = adapt(delta, h + 1, h == b);
185 delta = 0;
186 ++h;
187 }
188 }
189
190 ++delta, ++n;
191 }
192
193 *output_length = out;
194 return punycode_success;
195 }
196
197 /*** Main decode function ***/
198
punycode_decode(punycode_uint input_length,const char input[],punycode_uint * output_length,punycode_uint output[],unsigned char case_flags[])199 enum punycode_status punycode_decode(punycode_uint input_length,
200 const char input[],
201 punycode_uint *output_length,
202 punycode_uint output[],
203 unsigned char case_flags[]) {
204 punycode_uint n, out, i, max_out, bias, b, j, in, oldi, w, k, digit, t;
205
206 if (!input_length) {
207 return punycode_bad_input;
208 }
209
210 /* Initialize the state: */
211
212 n = initial_n;
213 out = i = 0;
214 max_out = *output_length;
215 bias = initial_bias;
216
217 /* Handle the basic code points: Let b be the number of input code */
218 /* points before the last delimiter, or 0 if there is none, then */
219 /* copy the first b code points to the output. */
220
221 for (b = 0, j = input_length - 1; j > 0; --j) {
222 if (delim(input[j])) {
223 b = j;
224 break;
225 }
226 }
227 if (b > max_out) return punycode_big_output;
228
229 for (j = 0; j < b; ++j) {
230 if (case_flags) case_flags[out] = flagged(input[j]);
231 if (!basic(input[j])) return punycode_bad_input;
232 output[out++] = input[j];
233 }
234
235 /* Main decoding loop: Start just after the last delimiter if any */
236 /* basic code points were copied; start at the beginning otherwise. */
237
238 for (in = b > 0 ? b + 1 : 0; in < input_length; ++out) {
239 /* in is the index of the next character to be consumed, and */
240 /* out is the number of code points in the output array. */
241
242 /* Decode a generalized variable-length integer into delta, */
243 /* which gets added to i. The overflow checking is easier */
244 /* if we increase i as we go, then subtract off its starting */
245 /* value at the end to obtain delta. */
246
247 for (oldi = i, w = 1, k = base;; k += base) {
248 if (in >= input_length) return punycode_bad_input;
249 digit = decode_digit(input[in++]);
250 if (digit >= base) return punycode_bad_input;
251 if (digit > (maxint - i) / w) return punycode_overflow;
252 i += digit * w;
253 t = k <= bias /* + tmin */ ? tmin : /* +tmin not needed */
254 k >= bias + tmax ? tmax : k - bias;
255 if (digit < t) break;
256 if (w > maxint / (base - t)) return punycode_overflow;
257 w *= (base - t);
258 }
259
260 bias = adapt(i - oldi, out + 1, oldi == 0);
261
262 /* i was supposed to wrap around from out+1 to 0, */
263 /* incrementing n each time, so we'll fix that now: */
264
265 if (i / (out + 1) > maxint - n) return punycode_overflow;
266 n += i / (out + 1);
267 i %= (out + 1);
268
269 /* Insert n at position i of the output: */
270
271 /* not needed for Punycode: */
272 /* if (decode_digit(n) <= base) return punycode_invalid_input; */
273 if (out >= max_out) return punycode_big_output;
274
275 if (case_flags) {
276 memmove(case_flags + i + 1, case_flags + i, out - i);
277 /* Case of last character determines uppercase flag: */
278 case_flags[i] = flagged(input[in - 1]);
279 }
280
281 memmove(output + i + 1, output + i, (out - i) * sizeof *output);
282 output[i++] = n;
283 }
284
285 *output_length = out;
286 return punycode_success;
287 }
288