1 /* Copyright (C) 2000-2015 Lavtech.com corp. All rights reserved.
2
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License as published by
5 the Free Software Foundation; either version 2 of the License, or
6 (at your option) any later version.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with this program; if not, write to the Free Software
15 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
16 */
17
18
19 /*
20 punycode.c from RFC 3492
21 http://www.nicemice.net/idn/
22 Adam M. Costello
23 http://www.nicemice.net/amc/
24
25 This is ANSI C code (C89) implementing Punycode (RFC 3492).
26
27 Copyright (C) The Internet Society (2003). All Rights Reserved.
28
29 This document and translations of it may be copied and furnished to
30 others, and derivative works that comment on or otherwise explain it
31 or assist in its implementation may be prepared, copied, published
32 and distributed, in whole or in part, without restriction of any
33 kind, provided that the above copyright notice and this paragraph are
34 included on all such copies and derivative works. However, this
35 document itself may not be modified in any way, such as by removing
36 the copyright notice or references to the Internet Society or other
37 Internet organizations, except as needed for the purpose of
38 developing Internet standards in which case the procedures for
39 copyrights defined in the Internet Standards process must be
40 followed, or as required to translate it into languages other than
41 English.
42
43 The limited permissions granted above are perpetual and will not be
44 revoked by the Internet Society or its successors or assigns.
45
46 This document and the information contained herein is provided on an
47 "AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING
48 TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING
49 BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION
50 HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF
51 MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
52 */
53
54 /************************************************************/
55 /* Public interface (would normally go in its own .h file): */
56
57 #include <limits.h>
58
59 enum punycode_status {
60 punycode_success,
61 punycode_bad_input, /* Input is invalid. */
62 punycode_big_output, /* Output would exceed the space provided. */
63 punycode_overflow /* Input needs wider integers to process. */
64 };
65
66 #if UINT_MAX >= (1 << 26) - 1
67 typedef unsigned int punycode_uint;
68 #else
69 #error Bad UINT_MAX
70 typedef unsigned long punycode_uint;
71 #endif
72
73 enum punycode_status punycode_encode(
74 punycode_uint input_length,
75 const punycode_uint input[],
76 const unsigned char case_flags[],
77 punycode_uint *output_length,
78 char output[] );
79
80 /* punycode_encode() converts Unicode to Punycode. The input */
81 /* is represented as an array of Unicode code points (not code */
82 /* units; surrogate pairs are not allowed), and the output */
83 /* will be represented as an array of ASCII code points. The */
84 /* output string is *not* null-terminated; it will contain */
85 /* zeros if and only if the input contains zeros. (Of course */
86 /* the caller can leave room for a terminator and add one if */
87 /* needed.) The input_length is the number of code points in */
88 /* the input. The output_length is an in/out argument: the */
89 /* caller passes in the maximum number of code points that it */
90 /* can receive, and on successful return it will contain the */
91 /* number of code points actually output. The case_flags array */
92 /* holds input_length boolean values, where nonzero suggests that */
93 /* the corresponding Unicode character be forced to uppercase */
94 /* after being decoded (if possible), and zero suggests that */
95 /* it be forced to lowercase (if possible). ASCII code points */
96 /* are encoded literally, except that ASCII letters are forced */
97 /* to uppercase or lowercase according to the corresponding */
98 /* uppercase flags. If case_flags is a null pointer then ASCII */
99 /* letters are left as they are, and other code points are */
100 /* treated as if their uppercase flags were zero. The return */
101 /* value can be any of the punycode_status values defined above */
102 /* except punycode_bad_input; if not punycode_success, then */
103 /* output_size and output might contain garbage. */
104
105 enum punycode_status punycode_decode(
106 punycode_uint input_length,
107 const char input[],
108 punycode_uint *output_length,
109 punycode_uint output[],
110 unsigned char case_flags[] );
111
112 /* punycode_decode() converts Punycode to Unicode. The input is */
113 /* represented as an array of ASCII code points, and the output */
114 /* will be represented as an array of Unicode code points. The */
115 /* input_length is the number of code points in the input. The */
116 /* output_length is an in/out argument: the caller passes in */
117 /* the maximum number of code points that it can receive, and */
118 /* on successful return it will contain the actual number of */
119 /* code points output. The case_flags array needs room for at */
120 /* least output_length values, or it can be a null pointer if the */
121 /* case information is not needed. A nonzero flag suggests that */
122 /* the corresponding Unicode character be forced to uppercase */
123 /* by the caller (if possible), while zero suggests that it be */
124 /* forced to lowercase (if possible). ASCII code points are */
125 /* output already in the proper case, but their flags will be set */
126 /* appropriately so that applying the flags would be harmless. */
127 /* The return value can be any of the punycode_status values */
128 /* defined above; if not punycode_success, then output_length, */
129 /* output, and case_flags might contain garbage. On success, the */
130 /* decoder will never need to write an output_length greater than */
131 /* input_length, because of how the encoding is defined. */
132
133 /**********************************************************/
134 /* Implementation (would normally go in its own .c file): */
135
136 #include <string.h>
137
138
139
140 /*** Bootstring parameters for Punycode ***/
141
142 enum { base = 36, tmin = 1, tmax = 26, skew = 38, damp = 700,
143 initial_bias = 72, initial_n = 0x80, delimiter = 0x2D };
144
145 /* basic(cp) tests whether cp is a basic code point: */
146 #define basic(cp) ((punycode_uint)(cp) < 0x80)
147
148 /* delim(cp) tests whether cp is a delimiter: */
149 #define delim(cp) ((cp) == delimiter)
150
151 /* decode_digit(cp) returns the numeric value of a basic code */
152 /* point (for use in representing integers) in the range 0 to */
153 /* base-1, or base if cp is does not represent a value. */
154
decode_digit(punycode_uint cp)155 static punycode_uint decode_digit(punycode_uint cp)
156 {
157 return cp - 48 < 10 ? cp - 22 : cp - 65 < 26 ? cp - 65 :
158 cp - 97 < 26 ? cp - 97 : base;
159 }
160
161 /* encode_digit(d,flag) returns the basic code point whose value */
162 /* (when used for representing integers) is d, which needs to be in */
163 /* the range 0 to base-1. The lowercase form is used unless flag is */
164 /* nonzero, in which case the uppercase form is used. The behavior */
165 /* is undefined if flag is nonzero and digit d has no uppercase form. */
166
encode_digit(punycode_uint d,int flag)167 static char encode_digit(punycode_uint d, int flag)
168 {
169 return d + 22 + 75 * (d < 26) - ((flag != 0) << 5);
170 /* 0..25 map to ASCII a..z or A..Z */
171 /* 26..35 map to ASCII 0..9 */
172 }
173
174 /* flagged(bcp) tests whether a basic code point is flagged */
175 /* (uppercase). The behavior is undefined if bcp is not a */
176 /* basic code point. */
177
178 #define flagged(bcp) ((punycode_uint)(bcp) - 65 < 26)
179
180 /* encode_basic(bcp,flag) forces a basic code point to lowercase */
181 /* if flag is zero, uppercase if flag is nonzero, and returns */
182 /* the resulting code point. The code point is unchanged if it */
183 /* is caseless. The behavior is undefined if bcp is not a basic */
184 /* code point. */
185
encode_basic(punycode_uint bcp,int flag)186 static char encode_basic(punycode_uint bcp, int flag)
187 {
188 bcp -= (bcp - 97 < 26) << 5;
189 return bcp + ((!flag && (bcp - 65 < 26)) << 5);
190 }
191
192 /*** Platform-specific constants ***/
193
194 /* maxint is the maximum value of a punycode_uint variable: */
195 static const punycode_uint maxint = -1;
196 /* Because maxint is unsigned, -1 becomes the maximum value. */
197
198 /*** Bias adaptation function ***/
199
adapt(punycode_uint delta,punycode_uint numpoints,int firsttime)200 static punycode_uint adapt(
201 punycode_uint delta, punycode_uint numpoints, int firsttime )
202 {
203 punycode_uint k;
204
205 delta = firsttime ? delta / damp : delta >> 1;
206 /* delta >> 1 is a faster way of doing delta / 2 */
207 delta += delta / numpoints;
208
209 for (k = 0; delta > ((base - tmin) * tmax) / 2; k += base) {
210 delta /= base - tmin;
211 }
212
213 return k + (base - tmin + 1) * delta / (delta + skew);
214 }
215
216 /*** Main encode function ***/
217
punycode_encode(punycode_uint input_length,const punycode_uint input[],const unsigned char case_flags[],punycode_uint * output_length,char output[])218 enum punycode_status punycode_encode(
219 punycode_uint input_length,
220 const punycode_uint input[],
221 const unsigned char case_flags[],
222 punycode_uint *output_length,
223 char output[] )
224 {
225 punycode_uint n, delta, h, b, out, max_out, bias, j, m, q, k, t;
226
227 /* Initialize the state: */
228
229 n = initial_n;
230 delta = out = 0;
231 max_out = *output_length;
232 bias = initial_bias;
233
234 /* Handle the basic code points: */
235
236
237 for (j = 0; j < input_length; ++j) {
238 if (basic(input[j])) {
239 if (max_out - out < 2) return punycode_big_output;
240 output[out++] =
241 case_flags ? encode_basic(input[j], case_flags[j]) : input[j];
242 }
243 /* else if (input[j] < n) return punycode_bad_input; */
244 /* (not needed for Punycode with unsigned code points) */
245 }
246
247 h = b = out;
248
249 /* h is the number of code points that have been handled, b is the */
250 /* number of basic code points, and out is the number of characters */
251 /* that have been output. */
252
253 if (b > 0) output[out++] = delimiter;
254
255 /* Main encoding loop: */
256
257 while (h < input_length) {
258 /* All non-basic code points < n have been */
259 /* handled already. Find the next larger one: */
260
261 for (m = maxint, j = 0; j < input_length; ++j) {
262 /* if (basic(input[j])) continue; */
263 /* (not needed for Punycode) */
264 if (input[j] >= n && input[j] < m) m = input[j];
265 }
266
267 /* Increase delta enough to advance the decoder's */
268 /* <n,i> state to <m,0>, but guard against overflow: */
269
270 if (m - n > (maxint - delta) / (h + 1)) return punycode_overflow;
271 delta += (m - n) * (h + 1);
272 n = m;
273
274 for (j = 0; j < input_length; ++j) {
275 /* Punycode does not need to check whether input[j] is basic: */
276 if (input[j] < n /* || basic(input[j]) */ ) {
277 if (++delta == 0) return punycode_overflow;
278 }
279
280 if (input[j] == n) {
281 /* Represent delta as a generalized variable-length integer: */
282
283 for (q = delta, k = base; ; k += base) {
284 if (out >= max_out) return punycode_big_output;
285 t = k <= bias /* + tmin */ ? tmin : /* +tmin not needed */
286 k >= bias + tmax ? tmax : k - bias;
287 if (q < t) break;
288 output[out++] = encode_digit(t + (q - t) % (base - t), 0);
289 q = (q - t) / (base - t);
290 }
291
292 output[out++] = encode_digit(q, case_flags && case_flags[j]);
293 bias = adapt(delta, h + 1, h == b);
294 delta = 0;
295 ++h;
296 }
297 }
298
299 ++delta, ++n;
300 }
301
302 *output_length = out;
303 return punycode_success;
304 }
305
306 /*** Main decode function ***/
307
punycode_decode(punycode_uint input_length,const char input[],punycode_uint * output_length,punycode_uint output[],unsigned char case_flags[])308 enum punycode_status punycode_decode(
309 punycode_uint input_length,
310 const char input[],
311 punycode_uint *output_length,
312 punycode_uint output[],
313 unsigned char case_flags[] )
314 {
315 punycode_uint n, out, i, max_out, bias,
316 b, j, in, oldi, w, k, digit, t;
317
318 /* Initialize the state: */
319
320 n = initial_n;
321 out = i = 0;
322 max_out = *output_length;
323 bias = initial_bias;
324
325 /* Handle the basic code points: Let b be the number of input code */
326 /* points before the last delimiter, or 0 if there is none, then */
327 /* copy the first b code points to the output. */
328
329 for (b = j = 0; j < input_length; ++j) if (delim(input[j])) b = j;
330 if (b > max_out) return punycode_big_output;
331
332 for (j = 0; j < b; ++j) {
333 if (case_flags) case_flags[out] = flagged(input[j]);
334 if (!basic(input[j])) return punycode_bad_input;
335 output[out++] = input[j];
336 }
337
338 /* Main decoding loop: Start just after the last delimiter if any */
339 /* basic code points were copied; start at the beginning otherwise. */
340
341 for (in = b > 0 ? b + 1 : 0; in < input_length; ++out) {
342
343 /* in is the index of the next character to be consumed, and */
344 /* out is the number of code points in the output array. */
345
346 /* Decode a generalized variable-length integer into delta, */
347 /* which gets added to i. The overflow checking is easier */
348 /* if we increase i as we go, then subtract off its starting */
349 /* value at the end to obtain delta. */
350
351 for (oldi = i, w = 1, k = base; ; k += base) {
352 if (in >= input_length) return punycode_bad_input;
353 digit = decode_digit(input[in++]);
354 if (digit >= base) return punycode_bad_input;
355 if (digit > (maxint - i) / w) return punycode_overflow;
356 i += digit * w;
357 t = k <= bias /* + tmin */ ? tmin : /* +tmin not needed */
358 k >= bias + tmax ? tmax : k - bias;
359 if (digit < t) break;
360 if (w > maxint / (base - t)) return punycode_overflow;
361 w *= (base - t);
362 }
363
364 bias = adapt(i - oldi, out + 1, oldi == 0);
365
366 /* i was supposed to wrap around from out+1 to 0, */
367 /* incrementing n each time, so we'll fix that now: */
368
369 if (i / (out + 1) > maxint - n) return punycode_overflow;
370 n += i / (out + 1);
371 i %= (out + 1);
372
373 /* Insert n at position i of the output: */
374
375 /* not needed for Punycode: */
376 /* if (decode_digit(n) <= base) return punycode_invalid_input; */
377 if (out >= max_out) return punycode_big_output;
378
379 if (case_flags) {
380 memmove(case_flags + i + 1, case_flags + i, out - i); /* TODO-OVERFLOW */
381
382 /* Case of last character determines uppercase flag: */
383 case_flags[i] = flagged(input[in - 1]);
384 }
385
386 memmove(output + i + 1, output + i, (out - i) * sizeof *output); /* TODO-OVERFLOW */
387 output[i++] = n;
388 }
389
390 *output_length = out;
391 return punycode_success;
392 }
393
394 /**************** end of punycode.c *******************************/
395
396
397 /******************************************************************/
398 #include "udm_config.h"
399 #include "udm_common.h"
400 #include "udm_utils.h"
401 #include "udm_uniconv.h"
402 #include "udm_url.h"
403
404 #include <assert.h>
405 #include <stdio.h>
406 #include <stdlib.h>
407 #include <string.h>
408
409
410 #define LABEL_LEN 63*4
411
412
413 static size_t
UdmPunyEncode(UDM_CHARSET * cs,const char * src,size_t srclen,char * dst,size_t dstlen)414 UdmPunyEncode(UDM_CHARSET *cs,
415 const char *src, size_t srclen,
416 char *dst, size_t dstlen)
417 {
418 UDM_CONV cnv;
419 int len;
420 punycode_uint input[LABEL_LEN + 1];
421 enum punycode_status status;
422 punycode_uint dst_length;
423
424 UDM_ASSERT(dstlen > 0);
425 UDM_ASSERT(sizeof(punycode_uint) == sizeof(int));
426
427 UdmConvInit(&cnv, cs, &udm_charset_sys_int);
428
429 if ((len= UdmConv(&cnv, (char*) input, sizeof(input), src, srclen, 0)) < 0 ||
430 len >= (sizeof(input) - sizeof(input[0])))
431 {
432 *dst= 0;
433 return 0;
434 }
435
436 len/= sizeof(input[0]);
437 dst_length= dstlen - 1;
438 status= punycode_encode(len, input, NULL, &dst_length, dst);
439 if (status != punycode_success)
440 {
441 *dst= 0;
442 return 0;
443 }
444 dst[dst_length]= 0;
445 return dst_length;
446 }
447
448
449 static size_t
UdmPunyDecode(UDM_CHARSET * cs,const char * src,size_t srclen,char * dst,size_t dstlen)450 UdmPunyDecode(UDM_CHARSET *cs,
451 const char *src, size_t srclen,
452 char *dst, size_t dstlen)
453 {
454 punycode_uint output[LABEL_LEN + 1];
455 punycode_uint output_length= sizeof(output)/sizeof(output[0]);
456 enum punycode_status status;
457 UDM_CONV cnv;
458 int len;
459
460 UDM_ASSERT(dstlen > 0);
461 UDM_ASSERT(sizeof(punycode_uint) == sizeof(int));
462
463 status= punycode_decode(srclen, src, &output_length, output, NULL);
464 if (status != punycode_success)
465 {
466 *dst= 0;
467 return 0;
468 }
469 UdmConvInit(&cnv, &udm_charset_sys_int, cs);
470 len= UdmConv(&cnv, dst, dstlen - 1, (const char *) output, output_length * sizeof(output[0]), 0);
471 if (len < 0 || len > dstlen - 1)
472 {
473 *dst= 0;
474 return 0;
475 }
476 dst[len]= '\0';
477 return (size_t) len;
478 }
479
480
481 #define UDM_ACE_PREFIX "xn--"
482 #define UDM_ACE_PREFIX_LEN 4
483
484
485 size_t
UdmIDNEncode(UDM_CHARSET * cs,const char * src,char * dst,size_t dstlen)486 UdmIDNEncode(UDM_CHARSET *cs, const char *src, char *dst, size_t dstlen)
487 {
488 const char *src0;
489 char *dst0= dst;
490 int have_nonascii= 0;
491 size_t result_len= 0;
492
493 for (src0= src; ; src++)
494 {
495 if (*src == '.' || *src == '\0')
496 {
497 size_t punylen, len;
498 char puny[LABEL_LEN + 1];
499 if (!(punylen= UdmPunyEncode(cs, src0, src - src0, puny, sizeof(puny))))
500 {
501 *dst0= '\0';
502 return 0;
503 }
504 if (have_nonascii)
505 {
506 len= udm_snprintf(dst, dstlen, "%s%s%s",
507 result_len ? "." : "", UDM_ACE_PREFIX, puny);
508 }
509 else
510 len= udm_snprintf(dst, dstlen, "%s%.*s",
511 result_len ? "." : "", (int) (src - src0), src0);
512 if (len >= dstlen) /* Encoded lable does not fit to the result */
513 {
514 *dst0= '\0';
515 return 0;
516 }
517 result_len+= len;
518 dstlen-= len;
519 dst+= len;
520 src0= src + 1;
521 have_nonascii= 0;
522 if (!*src)
523 break;
524 }
525 else if (((unsigned const char*) src)[0] >= 0x80)
526 have_nonascii++;
527 }
528 return result_len;
529 }
530
531
532 size_t
UdmIDNDecode(UDM_CHARSET * cs,const char * src,char * dst,size_t dstlen)533 UdmIDNDecode(UDM_CHARSET *cs, const char *src, char *dst, size_t dstlen)
534 {
535 const char *src0;
536 char *dst0= dst;
537 size_t result_len= 0;
538 for (src0= src; ; src++)
539 {
540 if (*src == '.' || *src == '\0')
541 {
542 size_t len;
543
544 if (strncmp(src0, UDM_ACE_PREFIX, UDM_ACE_PREFIX_LEN))
545 {
546 /* ASCII */
547 len= udm_snprintf(dst, dstlen, "%s%.*s",
548 result_len ? "." : "", (int) (src - src0), src0);
549 }
550 else
551 {
552 size_t punylen;
553 char puny[LABEL_LEN + 1];
554 src0+= UDM_ACE_PREFIX_LEN;
555
556 if (!(punylen= UdmPunyDecode(cs, src0, src - src0, puny, sizeof(puny))))
557 {
558 *dst0= '\0';
559 return 0;
560 }
561 len= udm_snprintf(dst, dstlen, "%s%s", result_len ? "." : "", puny);
562 }
563 if (len >= dstlen)
564 {
565 *dst0= '\0';
566 return 0;
567 }
568 result_len+= len;
569 dstlen-= len;
570 dst+= len;
571 src0= src + 1;
572 if (!*src)
573 break;
574 }
575 }
576 return result_len;
577 }
578
579
580 #ifdef TEST_IDN
581
582 static int
UdmTest(void)583 UdmTest(void)
584 {
585 char str[1024], dst[1024], dec[1024], idn[1024], idu[1024];
586 UDM_CHARSET *cs= &udm_charset_utf8;
587 assert(cs != NULL);
588
589 while (fgets(str, sizeof(str), stdin))
590 {
591 size_t result_len;
592 UdmRTrim(str, "\r\n");
593 UdmPunyEncode(cs, str, strlen(str), dst, sizeof(dst));
594 UdmPunyDecode(cs, dst, strlen(dst), dec, sizeof(dec));
595 printf("dst=%s\n", dst);
596 printf("dec=%s\n", dec);
597 result_len= UdmIDNEncode(cs, str, idn, sizeof(idn));
598 printf("idn=%s (%d)\n", idn, result_len);
599 result_len= UdmIDNDecode(cs, idn, idu, sizeof(idu));
600 printf("idu=%s (%d)\n", idu, result_len);
601 }
602 return UDM_OK;
603 }
604
605 #endif /* TEST_IDN */
606