1 /* Copyright (C) 2000-2015 Lavtech.com corp. All rights reserved.
2 
3    This program is free software; you can redistribute it and/or modify
4    it under the terms of the GNU General Public License as published by
5    the Free Software Foundation; either version 2 of the License, or
6    (at your option) any later version.
7 
8    This program is distributed in the hope that it will be useful,
9    but WITHOUT ANY WARRANTY; without even the implied warranty of
10    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11    GNU General Public License for more details.
12 
13    You should have received a copy of the GNU General Public License
14    along with this program; if not, write to the Free Software
15    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16 */
17 
18 
19 /*
20    punycode.c from RFC 3492
21    http://www.nicemice.net/idn/
22    Adam M. Costello
23    http://www.nicemice.net/amc/
24 
25    This is ANSI C code (C89) implementing Punycode (RFC 3492).
26 
27    Copyright (C) The Internet Society (2003).  All Rights Reserved.
28 
29    This document and translations of it may be copied and furnished to
30    others, and derivative works that comment on or otherwise explain it
31    or assist in its implementation may be prepared, copied, published
32    and distributed, in whole or in part, without restriction of any
33    kind, provided that the above copyright notice and this paragraph are
34    included on all such copies and derivative works.  However, this
35    document itself may not be modified in any way, such as by removing
36    the copyright notice or references to the Internet Society or other
37    Internet organizations, except as needed for the purpose of
38    developing Internet standards in which case the procedures for
39    copyrights defined in the Internet Standards process must be
40    followed, or as required to translate it into languages other than
41    English.
42 
43    The limited permissions granted above are perpetual and will not be
44    revoked by the Internet Society or its successors or assigns.
45 
46    This document and the information contained herein is provided on an
47    "AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING
48    TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING
49    BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION
50    HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF
51    MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
52 */
53 
54 /************************************************************/
55 /* Public interface (would normally go in its own .h file): */
56 
57 #include <limits.h>
58 
59 enum punycode_status {
60   punycode_success,
61   punycode_bad_input,   /* Input is invalid.                       */
62   punycode_big_output,  /* Output would exceed the space provided. */
63   punycode_overflow     /* Input needs wider integers to process.  */
64 };
65 
66 #if UINT_MAX >= (1 << 26) - 1
67 typedef unsigned int punycode_uint;
68 #else
69 #error Bad UINT_MAX
70 typedef unsigned long punycode_uint;
71 #endif
72 
73 enum punycode_status punycode_encode(
74   punycode_uint input_length,
75   const punycode_uint input[],
76   const unsigned char case_flags[],
77   punycode_uint *output_length,
78   char output[] );
79 
80     /* punycode_encode() converts Unicode to Punycode.  The input     */
81     /* is represented as an array of Unicode code points (not code    */
82     /* units; surrogate pairs are not allowed), and the output        */
83     /* will be represented as an array of ASCII code points.  The     */
84     /* output string is *not* null-terminated; it will contain        */
85     /* zeros if and only if the input contains zeros.  (Of course     */
86     /* the caller can leave room for a terminator and add one if      */
87     /* needed.)  The input_length is the number of code points in     */
88     /* the input.  The output_length is an in/out argument: the       */
89     /* caller passes in the maximum number of code points that it     */
90     /* can receive, and on successful return it will contain the      */
91     /* number of code points actually output.  The case_flags array   */
92     /* holds input_length boolean values, where nonzero suggests that */
93     /* the corresponding Unicode character be forced to uppercase     */
94     /* after being decoded (if possible), and zero suggests that      */
95     /* it be forced to lowercase (if possible).  ASCII code points    */
96     /* are encoded literally, except that ASCII letters are forced    */
97     /* to uppercase or lowercase according to the corresponding       */
98     /* uppercase flags.  If case_flags is a null pointer then ASCII   */
99     /* letters are left as they are, and other code points are        */
100     /* treated as if their uppercase flags were zero.  The return     */
101     /* value can be any of the punycode_status values defined above   */
102     /* except punycode_bad_input; if not punycode_success, then       */
103     /* output_size and output might contain garbage.                  */
104 
105 enum punycode_status punycode_decode(
106   punycode_uint input_length,
107   const char input[],
108   punycode_uint *output_length,
109   punycode_uint output[],
110   unsigned char case_flags[] );
111 
112     /* punycode_decode() converts Punycode to Unicode.  The input is  */
113     /* represented as an array of ASCII code points, and the output   */
114     /* will be represented as an array of Unicode code points.  The   */
115     /* input_length is the number of code points in the input.  The   */
116     /* output_length is an in/out argument: the caller passes in      */
117     /* the maximum number of code points that it can receive, and     */
118     /* on successful return it will contain the actual number of      */
119     /* code points output.  The case_flags array needs room for at    */
120     /* least output_length values, or it can be a null pointer if the */
121     /* case information is not needed.  A nonzero flag suggests that  */
122     /* the corresponding Unicode character be forced to uppercase     */
123     /* by the caller (if possible), while zero suggests that it be    */
124     /* forced to lowercase (if possible).  ASCII code points are      */
125     /* output already in the proper case, but their flags will be set */
126     /* appropriately so that applying the flags would be harmless.    */
127     /* The return value can be any of the punycode_status values      */
128     /* defined above; if not punycode_success, then output_length,    */
129     /* output, and case_flags might contain garbage.  On success, the */
130     /* decoder will never need to write an output_length greater than */
131     /* input_length, because of how the encoding is defined.          */
132 
133 /**********************************************************/
134 /* Implementation (would normally go in its own .c file): */
135 
136 #include <string.h>
137 
138 
139 
140 /*** Bootstring parameters for Punycode ***/
141 
142 enum { base = 36, tmin = 1, tmax = 26, skew = 38, damp = 700,
143        initial_bias = 72, initial_n = 0x80, delimiter = 0x2D };
144 
145 /* basic(cp) tests whether cp is a basic code point: */
146 #define basic(cp) ((punycode_uint)(cp) < 0x80)
147 
148 /* delim(cp) tests whether cp is a delimiter: */
149 #define delim(cp) ((cp) == delimiter)
150 
151 /* decode_digit(cp) returns the numeric value of a basic code */
152 /* point (for use in representing integers) in the range 0 to */
153 /* base-1, or base if cp is does not represent a value.       */
154 
decode_digit(punycode_uint cp)155 static punycode_uint decode_digit(punycode_uint cp)
156 {
157   return  cp - 48 < 10 ? cp - 22 :  cp - 65 < 26 ? cp - 65 :
158           cp - 97 < 26 ? cp - 97 :  base;
159 }
160 
161 /* encode_digit(d,flag) returns the basic code point whose value      */
162 /* (when used for representing integers) is d, which needs to be in   */
163 /* the range 0 to base-1.  The lowercase form is used unless flag is  */
164 /* nonzero, in which case the uppercase form is used.  The behavior   */
165 /* is undefined if flag is nonzero and digit d has no uppercase form. */
166 
encode_digit(punycode_uint d,int flag)167 static char encode_digit(punycode_uint d, int flag)
168 {
169   return d + 22 + 75 * (d < 26) - ((flag != 0) << 5);
170   /*  0..25 map to ASCII a..z or A..Z */
171   /* 26..35 map to ASCII 0..9         */
172 }
173 
174 /* flagged(bcp) tests whether a basic code point is flagged */
175 /* (uppercase).  The behavior is undefined if bcp is not a  */
176 /* basic code point.                                        */
177 
178 #define flagged(bcp) ((punycode_uint)(bcp) - 65 < 26)
179 
180 /* encode_basic(bcp,flag) forces a basic code point to lowercase */
181 /* if flag is zero, uppercase if flag is nonzero, and returns    */
182 /* the resulting code point.  The code point is unchanged if it  */
183 /* is caseless.  The behavior is undefined if bcp is not a basic */
184 /* code point.                                                   */
185 
encode_basic(punycode_uint bcp,int flag)186 static char encode_basic(punycode_uint bcp, int flag)
187 {
188   bcp -= (bcp - 97 < 26) << 5;
189   return bcp + ((!flag && (bcp - 65 < 26)) << 5);
190 }
191 
192 /*** Platform-specific constants ***/
193 
194 /* maxint is the maximum value of a punycode_uint variable: */
195 static const punycode_uint maxint = -1;
196 /* Because maxint is unsigned, -1 becomes the maximum value. */
197 
198 /*** Bias adaptation function ***/
199 
adapt(punycode_uint delta,punycode_uint numpoints,int firsttime)200 static punycode_uint adapt(
201   punycode_uint delta, punycode_uint numpoints, int firsttime )
202 {
203   punycode_uint k;
204 
205   delta = firsttime ? delta / damp : delta >> 1;
206   /* delta >> 1 is a faster way of doing delta / 2 */
207   delta += delta / numpoints;
208 
209   for (k = 0;  delta > ((base - tmin) * tmax) / 2;  k += base) {
210     delta /= base - tmin;
211   }
212 
213   return k + (base - tmin + 1) * delta / (delta + skew);
214 }
215 
216 /*** Main encode function ***/
217 
punycode_encode(punycode_uint input_length,const punycode_uint input[],const unsigned char case_flags[],punycode_uint * output_length,char output[])218 enum punycode_status punycode_encode(
219   punycode_uint input_length,
220   const punycode_uint input[],
221   const unsigned char case_flags[],
222   punycode_uint *output_length,
223   char output[] )
224 {
225   punycode_uint n, delta, h, b, out, max_out, bias, j, m, q, k, t;
226 
227   /* Initialize the state: */
228 
229   n = initial_n;
230   delta = out = 0;
231   max_out = *output_length;
232   bias = initial_bias;
233 
234   /* Handle the basic code points: */
235 
236 
237   for (j = 0;  j < input_length;  ++j) {
238     if (basic(input[j])) {
239       if (max_out - out < 2) return punycode_big_output;
240       output[out++] =
241         case_flags ?  encode_basic(input[j], case_flags[j]) : input[j];
242     }
243     /* else if (input[j] < n) return punycode_bad_input; */
244     /* (not needed for Punycode with unsigned code points) */
245   }
246 
247   h = b = out;
248 
249   /* h is the number of code points that have been handled, b is the  */
250   /* number of basic code points, and out is the number of characters */
251   /* that have been output.                                           */
252 
253   if (b > 0) output[out++] = delimiter;
254 
255   /* Main encoding loop: */
256 
257   while (h < input_length) {
258     /* All non-basic code points < n have been     */
259     /* handled already.  Find the next larger one: */
260 
261     for (m = maxint, j = 0;  j < input_length;  ++j) {
262       /* if (basic(input[j])) continue; */
263       /* (not needed for Punycode) */
264       if (input[j] >= n && input[j] < m) m = input[j];
265     }
266 
267     /* Increase delta enough to advance the decoder's    */
268     /* <n,i> state to <m,0>, but guard against overflow: */
269 
270     if (m - n > (maxint - delta) / (h + 1)) return punycode_overflow;
271     delta += (m - n) * (h + 1);
272     n = m;
273 
274     for (j = 0;  j < input_length;  ++j) {
275       /* Punycode does not need to check whether input[j] is basic: */
276       if (input[j] < n /* || basic(input[j]) */ ) {
277         if (++delta == 0) return punycode_overflow;
278       }
279 
280       if (input[j] == n) {
281         /* Represent delta as a generalized variable-length integer: */
282 
283         for (q = delta, k = base;  ;  k += base) {
284           if (out >= max_out) return punycode_big_output;
285           t = k <= bias /* + tmin */ ? tmin :     /* +tmin not needed */
286               k >= bias + tmax ? tmax : k - bias;
287           if (q < t) break;
288           output[out++] = encode_digit(t + (q - t) % (base - t), 0);
289           q = (q - t) / (base - t);
290         }
291 
292         output[out++] = encode_digit(q, case_flags && case_flags[j]);
293         bias = adapt(delta, h + 1, h == b);
294         delta = 0;
295         ++h;
296       }
297     }
298 
299     ++delta, ++n;
300   }
301 
302   *output_length = out;
303   return punycode_success;
304 }
305 
306 /*** Main decode function ***/
307 
punycode_decode(punycode_uint input_length,const char input[],punycode_uint * output_length,punycode_uint output[],unsigned char case_flags[])308 enum punycode_status punycode_decode(
309   punycode_uint input_length,
310   const char input[],
311   punycode_uint *output_length,
312   punycode_uint output[],
313   unsigned char case_flags[] )
314 {
315   punycode_uint n, out, i, max_out, bias,
316                  b, j, in, oldi, w, k, digit, t;
317 
318   /* Initialize the state: */
319 
320   n = initial_n;
321   out = i = 0;
322   max_out = *output_length;
323   bias = initial_bias;
324 
325   /* Handle the basic code points:  Let b be the number of input code */
326   /* points before the last delimiter, or 0 if there is none, then    */
327   /* copy the first b code points to the output.                      */
328 
329   for (b = j = 0;  j < input_length;  ++j) if (delim(input[j])) b = j;
330   if (b > max_out) return punycode_big_output;
331 
332   for (j = 0;  j < b;  ++j) {
333     if (case_flags) case_flags[out] = flagged(input[j]);
334     if (!basic(input[j])) return punycode_bad_input;
335     output[out++] = input[j];
336   }
337 
338   /* Main decoding loop:  Start just after the last delimiter if any  */
339   /* basic code points were copied; start at the beginning otherwise. */
340 
341   for (in = b > 0 ? b + 1 : 0;  in < input_length;  ++out) {
342 
343     /* in is the index of the next character to be consumed, and */
344     /* out is the number of code points in the output array.     */
345 
346     /* Decode a generalized variable-length integer into delta,  */
347     /* which gets added to i.  The overflow checking is easier   */
348     /* if we increase i as we go, then subtract off its starting */
349     /* value at the end to obtain delta.                         */
350 
351     for (oldi = i, w = 1, k = base;  ;  k += base) {
352       if (in >= input_length) return punycode_bad_input;
353       digit = decode_digit(input[in++]);
354       if (digit >= base) return punycode_bad_input;
355       if (digit > (maxint - i) / w) return punycode_overflow;
356       i += digit * w;
357       t = k <= bias /* + tmin */ ? tmin :     /* +tmin not needed */
358           k >= bias + tmax ? tmax : k - bias;
359       if (digit < t) break;
360       if (w > maxint / (base - t)) return punycode_overflow;
361       w *= (base - t);
362     }
363 
364     bias = adapt(i - oldi, out + 1, oldi == 0);
365 
366     /* i was supposed to wrap around from out+1 to 0,   */
367     /* incrementing n each time, so we'll fix that now: */
368 
369     if (i / (out + 1) > maxint - n) return punycode_overflow;
370     n += i / (out + 1);
371     i %= (out + 1);
372 
373     /* Insert n at position i of the output: */
374 
375     /* not needed for Punycode: */
376     /* if (decode_digit(n) <= base) return punycode_invalid_input; */
377     if (out >= max_out) return punycode_big_output;
378 
379     if (case_flags) {
380       memmove(case_flags + i + 1, case_flags + i, out - i); /* TODO-OVERFLOW */
381 
382       /* Case of last character determines uppercase flag: */
383       case_flags[i] = flagged(input[in - 1]);
384     }
385 
386     memmove(output + i + 1, output + i, (out - i) * sizeof *output); /* TODO-OVERFLOW */
387     output[i++] = n;
388   }
389 
390   *output_length = out;
391   return punycode_success;
392 }
393 
394 /**************** end of punycode.c *******************************/
395 
396 
397 /******************************************************************/
398 #include "udm_config.h"
399 #include "udm_common.h"
400 #include "udm_utils.h"
401 #include "udm_uniconv.h"
402 #include "udm_url.h"
403 
404 #include <assert.h>
405 #include <stdio.h>
406 #include <stdlib.h>
407 #include <string.h>
408 
409 
410 #define LABEL_LEN 63*4
411 
412 
413 static size_t
UdmPunyEncode(UDM_CHARSET * cs,const char * src,size_t srclen,char * dst,size_t dstlen)414 UdmPunyEncode(UDM_CHARSET *cs,
415               const char *src, size_t srclen,
416               char *dst, size_t dstlen)
417 {
418   UDM_CONV cnv;
419   int len;
420   punycode_uint input[LABEL_LEN + 1];
421   enum punycode_status status;
422   punycode_uint dst_length;
423 
424   UDM_ASSERT(dstlen > 0);
425   UDM_ASSERT(sizeof(punycode_uint) == sizeof(int));
426 
427   UdmConvInit(&cnv, cs, &udm_charset_sys_int);
428 
429   if ((len= UdmConv(&cnv, (char*) input, sizeof(input), src, srclen, 0)) < 0 ||
430       len >= (sizeof(input) - sizeof(input[0])))
431   {
432     *dst= 0;
433     return 0;
434   }
435 
436   len/= sizeof(input[0]);
437   dst_length= dstlen - 1;
438   status= punycode_encode(len, input, NULL, &dst_length, dst);
439   if (status != punycode_success)
440   {
441     *dst= 0;
442     return 0;
443   }
444   dst[dst_length]= 0;
445   return dst_length;
446 }
447 
448 
449 static size_t
UdmPunyDecode(UDM_CHARSET * cs,const char * src,size_t srclen,char * dst,size_t dstlen)450 UdmPunyDecode(UDM_CHARSET *cs,
451               const char *src, size_t srclen,
452               char *dst, size_t dstlen)
453 {
454   punycode_uint output[LABEL_LEN + 1];
455   punycode_uint output_length= sizeof(output)/sizeof(output[0]);
456   enum punycode_status status;
457   UDM_CONV cnv;
458   int len;
459 
460   UDM_ASSERT(dstlen > 0);
461   UDM_ASSERT(sizeof(punycode_uint) == sizeof(int));
462 
463   status= punycode_decode(srclen, src, &output_length, output, NULL);
464   if (status != punycode_success)
465   {
466     *dst= 0;
467     return 0;
468   }
469   UdmConvInit(&cnv, &udm_charset_sys_int, cs);
470   len= UdmConv(&cnv, dst, dstlen - 1, (const char *) output, output_length * sizeof(output[0]), 0);
471   if (len < 0 || len > dstlen - 1)
472   {
473     *dst= 0;
474     return 0;
475   }
476   dst[len]= '\0';
477   return (size_t) len;
478 }
479 
480 
481 #define UDM_ACE_PREFIX "xn--"
482 #define UDM_ACE_PREFIX_LEN 4
483 
484 
485 size_t
UdmIDNEncode(UDM_CHARSET * cs,const char * src,char * dst,size_t dstlen)486 UdmIDNEncode(UDM_CHARSET *cs, const char *src, char *dst, size_t dstlen)
487 {
488   const char *src0;
489   char *dst0= dst;
490   int have_nonascii= 0;
491   size_t result_len= 0;
492 
493   for (src0= src; ; src++)
494   {
495     if (*src == '.' || *src == '\0')
496     {
497       size_t punylen, len;
498       char puny[LABEL_LEN + 1];
499       if (!(punylen= UdmPunyEncode(cs, src0, src - src0, puny, sizeof(puny))))
500       {
501         *dst0= '\0';
502         return 0;
503       }
504       if (have_nonascii)
505       {
506         len= udm_snprintf(dst, dstlen, "%s%s%s",
507                           result_len ? "." : "", UDM_ACE_PREFIX, puny);
508       }
509       else
510         len= udm_snprintf(dst, dstlen, "%s%.*s",
511                           result_len ? "." : "", (int) (src - src0), src0);
512       if (len >= dstlen) /* Encoded lable does not fit to the result */
513       {
514         *dst0= '\0';
515         return 0;
516       }
517       result_len+= len;
518       dstlen-= len;
519       dst+= len;
520       src0= src + 1;
521       have_nonascii= 0;
522       if (!*src)
523         break;
524     }
525     else if (((unsigned const char*) src)[0] >= 0x80)
526       have_nonascii++;
527   }
528   return result_len;
529 }
530 
531 
532 size_t
UdmIDNDecode(UDM_CHARSET * cs,const char * src,char * dst,size_t dstlen)533 UdmIDNDecode(UDM_CHARSET *cs, const char *src, char *dst, size_t dstlen)
534 {
535   const char *src0;
536   char *dst0= dst;
537   size_t result_len= 0;
538   for (src0= src; ; src++)
539   {
540     if (*src == '.' || *src == '\0')
541     {
542       size_t len;
543 
544       if (strncmp(src0, UDM_ACE_PREFIX, UDM_ACE_PREFIX_LEN))
545       {
546         /* ASCII */
547         len= udm_snprintf(dst, dstlen, "%s%.*s",
548                           result_len ? "." : "", (int) (src - src0), src0);
549       }
550       else
551       {
552         size_t punylen;
553         char puny[LABEL_LEN + 1];
554         src0+= UDM_ACE_PREFIX_LEN;
555 
556         if (!(punylen= UdmPunyDecode(cs, src0, src - src0, puny, sizeof(puny))))
557         {
558           *dst0= '\0';
559           return 0;
560         }
561         len= udm_snprintf(dst, dstlen, "%s%s", result_len ? "." : "", puny);
562       }
563       if (len >= dstlen)
564       {
565         *dst0= '\0';
566         return 0;
567       }
568       result_len+= len;
569       dstlen-= len;
570       dst+= len;
571       src0= src + 1;
572       if (!*src)
573         break;
574     }
575   }
576   return result_len;
577 }
578 
579 
580 #ifdef TEST_IDN
581 
582 static int
UdmTest(void)583 UdmTest(void)
584 {
585   char str[1024], dst[1024], dec[1024], idn[1024], idu[1024];
586   UDM_CHARSET *cs= &udm_charset_utf8;
587   assert(cs != NULL);
588 
589   while (fgets(str, sizeof(str), stdin))
590   {
591     size_t result_len;
592     UdmRTrim(str, "\r\n");
593     UdmPunyEncode(cs, str, strlen(str), dst, sizeof(dst));
594     UdmPunyDecode(cs, dst, strlen(dst), dec, sizeof(dec));
595     printf("dst=%s\n", dst);
596     printf("dec=%s\n", dec);
597     result_len= UdmIDNEncode(cs, str, idn, sizeof(idn));
598     printf("idn=%s (%d)\n", idn, result_len);
599     result_len= UdmIDNDecode(cs, idn, idu, sizeof(idu));
600     printf("idu=%s (%d)\n", idu, result_len);
601   }
602   return UDM_OK;
603 }
604 
605 #endif /* TEST_IDN */
606