1 /*
2 punycode.c from RFC 3492
3 http://www.nicemice.net/idn/
4 Adam M. Costello
5 http://www.nicemice.net/amc/
6 
7 This is ANSI C code (C89) implementing Punycode (RFC 3492).
8 
9 */
10 
11 #include <string.h>
12 
13 /*** Bootstring parameters for Punycode ***/
14 
15 enum { base = 36, tmin = 1, tmax = 26, skew = 38, damp = 700,
16        initial_bias = 72, initial_n = 0x80, delimiter = 0x2D };
17 
18 /* basic(cp) tests whether cp is a basic code point: */
19 #define basic(cp) ((punycode_uint)(cp) < 0x80)
20 
21 /* delim(cp) tests whether cp is a delimiter: */
22 #define delim(cp) ((cp) == delimiter)
23 
24 /* decode_digit(cp) returns the numeric value of a basic code */
25 /* point (for use in representing integers) in the range 0 to */
26 /* base-1, or base if cp is does not represent a value.       */
27 
decode_digit(punycode_uint cp)28 static punycode_uint decode_digit(punycode_uint cp)
29 {
30   return  cp - 48 < 10 ? cp - 22 :  cp - 65 < 26 ? cp - 65 :
31           cp - 97 < 26 ? cp - 97 :  (punycode_uint) base;
32 }
33 
34 /* encode_digit(d,flag) returns the basic code point whose value      */
35 /* (when used for representing integers) is d, which needs to be in   */
36 /* the range 0 to base-1.  The lowercase form is used unless flag is  */
37 /* nonzero, in which case the uppercase form is used.  The behavior   */
38 /* is undefined if flag is nonzero and digit d has no uppercase form. */
39 
encode_digit(punycode_uint d,int flag)40 static char encode_digit(punycode_uint d, int flag)
41 {
42   return char(d + 22 + 75 * (d < 26) - ((flag != 0) << 5));
43   /*  0..25 map to ASCII a..z or A..Z */
44   /* 26..35 map to ASCII 0..9         */
45 }
46 
47 /* flagged(bcp) tests whether a basic code point is flagged */
48 /* (uppercase).  The behavior is undefined if bcp is not a  */
49 /* basic code point.                                        */
50 
51 #define flagged(bcp) ((punycode_uint)(bcp) - 65 < 26)
52 
53 /* encode_basic(bcp,flag) forces a basic code point to lowercase */
54 /* if flag is zero, uppercase if flag is nonzero, and returns    */
55 /* the resulting code point.  The code point is unchanged if it  */
56 /* is caseless.  The behavior is undefined if bcp is not a basic */
57 /* code point.                                                   */
58 
encode_basic(punycode_uint bcp,int flag)59 static char encode_basic(punycode_uint bcp, int flag)
60 {
61   bcp -= (bcp - 97 < 26) << 5;
62   return char(bcp + ((!flag && (bcp - 65 < 26)) << 5));
63 }
64 
65 /*** Platform-specific constants ***/
66 
67 /* maxint is the maximum value of a punycode_uint variable: */
68 static const punycode_uint maxint = -1U;
69 /* Because maxint is unsigned, -1 becomes the maximum value. */
70 
71 /*** Bias adaptation function ***/
72 
adapt(punycode_uint delta,punycode_uint numpoints,int firsttime)73 static punycode_uint adapt(
74   punycode_uint delta, punycode_uint numpoints, int firsttime )
75 {
76   punycode_uint k;
77 
78   delta = firsttime ? delta / damp : delta >> 1;
79   /* delta >> 1 is a faster way of doing delta / 2 */
80   delta += delta / numpoints;
81 
82   for (k = 0;  delta > ((base - tmin) * tmax) / 2;  k += base) {
83     delta /= base - tmin;
84   }
85 
86   return k + (base - tmin + 1) * delta / (delta + skew);
87 }
88 
89 /*** Main encode function ***/
90 
punycode_encode(punycode_uint input_length,const punycode_uint input[],const unsigned char case_flags[],punycode_uint * output_length,char output[])91 enum punycode_status punycode_encode(
92   punycode_uint input_length,
93   const punycode_uint input[],
94   const unsigned char case_flags[],
95   punycode_uint *output_length,
96   char output[] )
97 {
98   punycode_uint n, delta, h, b, out, max_out, bias, j, m, q, k, t;
99 
100   /* Initialize the state: */
101 
102   n = initial_n;
103   delta = out = 0;
104   max_out = *output_length;
105   bias = initial_bias;
106 
107   /* Handle the basic code points: */
108 
109   for (j = 0;  j < input_length;  ++j) {
110     if (basic(input[j])) {
111       if (max_out - out < 2) return punycode_big_output;
112       output[out++] = char(
113         case_flags ?  encode_basic(input[j], case_flags[j]) : input[j]
114       );
115     }
116     /* else if (input[j] < n) return punycode_bad_input; */
117     /* (not needed for Punycode with unsigned code points) */
118   }
119 
120   h = b = out;
121 
122   /* h is the number of code points that have been handled, b is the  */
123   /* number of basic code points, and out is the number of characters */
124   /* that have been output.                                           */
125 
126   if (b > 0) output[out++] = delimiter;
127 
128   /* Main encoding loop: */
129 
130   while (h < input_length) {
131     /* All non-basic code points < n have been     */
132     /* handled already.  Find the next larger one: */
133 
134     for (m = maxint, j = 0;  j < input_length;  ++j) {
135       /* if (basic(input[j])) continue; */
136       /* (not needed for Punycode) */
137       if (input[j] >= n && input[j] < m) m = input[j];
138     }
139 
140     /* Increase delta enough to advance the decoder's    */
141     /* <n,i> state to <m,0>, but guard against overflow: */
142 
143     if (m - n > (maxint - delta) / (h + 1)) return punycode_overflow;
144     delta += (m - n) * (h + 1);
145     n = m;
146 
147     for (j = 0;  j < input_length;  ++j) {
148       /* Punycode does not need to check whether input[j] is basic: */
149       if (input[j] < n /* || basic(input[j]) */ ) {
150         if (++delta == 0) return punycode_overflow;
151       }
152 
153       if (input[j] == n) {
154         /* Represent delta as a generalized variable-length integer: */
155 
156         for (q = delta, k = base;  ;  k += base) {
157           if (out >= max_out) return punycode_big_output;
158           t = k <= bias /* + tmin */ ? (punycode_uint) tmin :     /* +tmin not needed */
159               k >= (punycode_uint) bias + (punycode_uint) tmax ? (punycode_uint) tmax : k - (punycode_uint) bias;
160           if (q < t) break;
161           output[out++] = encode_digit(t + (q - t) % (base - t), 0);
162           q = (q - t) / (base - t);
163         }
164 
165         output[out++] = encode_digit(q, case_flags && case_flags[j]);
166         bias = adapt(delta, h + 1, h == b);
167         delta = 0;
168         ++h;
169       }
170     }
171 
172     ++delta, ++n;
173   }
174 
175   *output_length = out;
176   return punycode_success;
177 }
178 
179 /*** Main decode function ***/
180 
punycode_decode(punycode_uint input_length,const char input[],punycode_uint * output_length,punycode_uint output[],unsigned char case_flags[])181 enum punycode_status punycode_decode(
182   punycode_uint input_length,
183   const char input[],
184   punycode_uint *output_length,
185   punycode_uint output[],
186   unsigned char case_flags[] )
187 {
188   punycode_uint n, out, i, max_out, bias,
189                  b, j, in, oldi, w, k, digit, t;
190 
191   /* Initialize the state: */
192 
193   n = initial_n;
194   out = i = 0;
195   max_out = *output_length;
196   bias = initial_bias;
197 
198   /* Handle the basic code points:  Let b be the number of input code */
199   /* points before the last delimiter, or 0 if there is none, then    */
200   /* copy the first b code points to the output.                      */
201 
202   for (b = j = 0;  j < input_length;  ++j) if (delim(input[j])) b = j;
203   if (b > max_out) return punycode_big_output;
204 
205   for (j = 0;  j < b;  ++j) {
206     if (case_flags) case_flags[out] = flagged(input[j]);
207     if (!basic(input[j])) return punycode_bad_input;
208     output[out++] = input[j];
209   }
210 
211   /* Main decoding loop:  Start just after the last delimiter if any  */
212   /* basic code points were copied; start at the beginning otherwise. */
213 
214   for (in = b > 0 ? b + 1 : 0;  in < input_length;  ++out) {
215 
216     /* in is the index of the next character to be consumed, and */
217     /* out is the number of code points in the output array.     */
218 
219     /* Decode a generalized variable-length integer into delta,  */
220     /* which gets added to i.  The overflow checking is easier   */
221     /* if we increase i as we go, then subtract off its starting */
222     /* value at the end to obtain delta.                         */
223 
224     for (oldi = i, w = 1, k = base;  ;  k += base) {
225       if (in >= input_length) return punycode_bad_input;
226       digit = decode_digit(input[in++]);
227       if (digit >= base) return punycode_bad_input;
228       if (digit > (maxint - i) / w) return punycode_overflow;
229       i += digit * w;
230       t = k <= (punycode_uint) bias /* + tmin */ ? (punycode_uint) tmin :     /* +tmin not needed */
231           k >= (punycode_uint) bias + (punycode_uint) tmax ? (punycode_uint) tmax : k - (punycode_uint) bias;
232       if (digit < t) break;
233       if (w > maxint / (base - t)) return punycode_overflow;
234       w *= (base - t);
235     }
236 
237     bias = adapt(i - oldi, out + 1, oldi == 0);
238 
239     /* i was supposed to wrap around from out+1 to 0,   */
240     /* incrementing n each time, so we'll fix that now: */
241 
242     if (i / (out + 1) > maxint - n) return punycode_overflow;
243     n += i / (out + 1);
244     i %= (out + 1);
245 
246     /* Insert n at position i of the output: */
247 
248     /* not needed for Punycode: */
249     /* if (decode_digit(n) <= base) return punycode_invalid_input; */
250     if (out >= max_out) return punycode_big_output;
251 
252     if (case_flags) {
253       memmove(case_flags + i + 1, case_flags + i, out - i);
254       /* Case of last character determines uppercase flag: */
255       case_flags[i] = flagged(input[in - 1]);
256     }
257 
258     memmove(output + i + 1, output + i, (out - i) * sizeof *output);
259     output[i++] = n;
260   }
261 
262   *output_length = out;
263   return punycode_success;
264 }
265