1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2002-2011, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  punycode.cpp
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2002jan31
16 *   created by: Markus W. Scherer
17 */
18 
19 
20 /* This ICU code derived from: */
21 /*
22 punycode.c 0.4.0 (2001-Nov-17-Sat)
23 http://www.cs.berkeley.edu/~amc/idn/
24 Adam M. Costello
25 http://www.nicemice.net/amc/
26 
27 Disclaimer and license
28 
29     Regarding this entire document or any portion of it (including
30     the pseudocode and C code), the author makes no guarantees and
31     is not responsible for any damage resulting from its use.  The
32     author grants irrevocable permission to anyone to use, modify,
33     and distribute it in any way that does not diminish the rights
34     of anyone else to use, modify, and distribute it, provided that
35     redistributed derivative works do not contain misleading author or
36     version information.  Derivative works need not be licensed under
37     similar terms.
38 */
39 /*
40  * ICU modifications:
41  * - ICU data types and coding conventions
42  * - ICU string buffer handling with implicit source lengths
43  *   and destination preflighting
44  * - UTF-16 handling
45  */
46 
47 #include "unicode/utypes.h"
48 
49 #if !UCONFIG_NO_IDNA
50 
51 #include "unicode/ustring.h"
52 #include "unicode/utf.h"
53 #include "unicode/utf16.h"
54 #include "ustr_imp.h"
55 #include "cstring.h"
56 #include "cmemory.h"
57 #include "punycode.h"
58 #include "uassert.h"
59 
60 
61 /* Punycode ----------------------------------------------------------------- */
62 
63 /* Punycode parameters for Bootstring */
64 #define BASE            36
65 #define TMIN            1
66 #define TMAX            26
67 #define SKEW            38
68 #define DAMP            700
69 #define INITIAL_BIAS    72
70 #define INITIAL_N       0x80
71 
72 /* "Basic" Unicode/ASCII code points */
73 #define _HYPHEN         0X2d
74 #define DELIMITER       _HYPHEN
75 
76 #define _ZERO_          0X30
77 #define _NINE           0x39
78 
79 #define _SMALL_A        0X61
80 #define _SMALL_Z        0X7a
81 
82 #define _CAPITAL_A      0X41
83 #define _CAPITAL_Z      0X5a
84 
85 #define IS_BASIC(c) ((c)<0x80)
86 #define IS_BASIC_UPPERCASE(c) (_CAPITAL_A<=(c) && (c)<=_CAPITAL_Z)
87 
88 /**
89  * digitToBasic() returns the basic code point whose value
90  * (when used for representing integers) is d, which must be in the
91  * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is
92  * nonzero, in which case the uppercase form is used.
93  */
94 static inline char
digitToBasic(int32_t digit,UBool uppercase)95 digitToBasic(int32_t digit, UBool uppercase) {
96     /*  0..25 map to ASCII a..z or A..Z */
97     /* 26..35 map to ASCII 0..9         */
98     if(digit<26) {
99         if(uppercase) {
100             return (char)(_CAPITAL_A+digit);
101         } else {
102             return (char)(_SMALL_A+digit);
103         }
104     } else {
105         return (char)((_ZERO_-26)+digit);
106     }
107 }
108 
109 /**
110  * @return the numeric value of a basic code point (for use in representing integers)
111  *         in the range 0 to BASE-1, or a negative value if cp is invalid.
112  */
decodeDigit(int32_t cp)113 static int32_t decodeDigit(int32_t cp) {
114     if(cp<=u'Z') {
115         if(cp<=u'9') {
116             if(cp<u'0') {
117                 return -1;
118             } else {
119                 return cp-u'0'+26;  // 0..9 -> 26..35
120             }
121         } else {
122             return cp-u'A';  // A-Z -> 0..25
123         }
124     } else if(cp<=u'z') {
125         return cp-'a';  // a..z -> 0..25
126     } else {
127         return -1;
128     }
129 }
130 
131 static inline char
asciiCaseMap(char b,UBool uppercase)132 asciiCaseMap(char b, UBool uppercase) {
133     if(uppercase) {
134         if(_SMALL_A<=b && b<=_SMALL_Z) {
135             b-=(_SMALL_A-_CAPITAL_A);
136         }
137     } else {
138         if(_CAPITAL_A<=b && b<=_CAPITAL_Z) {
139             b+=(_SMALL_A-_CAPITAL_A);
140         }
141     }
142     return b;
143 }
144 
145 /* Punycode-specific Bootstring code ---------------------------------------- */
146 
147 /*
148  * The following code omits the {parts} of the pseudo-algorithm in the spec
149  * that are not used with the Punycode parameter set.
150  */
151 
152 /* Bias adaptation function. */
153 static int32_t
adaptBias(int32_t delta,int32_t length,UBool firstTime)154 adaptBias(int32_t delta, int32_t length, UBool firstTime) {
155     int32_t count;
156 
157     if(firstTime) {
158         delta/=DAMP;
159     } else {
160         delta/=2;
161     }
162 
163     delta+=delta/length;
164     for(count=0; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) {
165         delta/=(BASE-TMIN);
166     }
167 
168     return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
169 }
170 
171 namespace {
172 
173 // ICU-13727: Limit input length for n^2 algorithm
174 // where well-formed strings are at most 59 characters long.
175 constexpr int32_t ENCODE_MAX_CODE_UNITS=1000;
176 constexpr int32_t DECODE_MAX_CHARS=2000;
177 
178 }  // namespace
179 
180 // encode
181 U_CAPI int32_t
u_strToPunycode(const UChar * src,int32_t srcLength,UChar * dest,int32_t destCapacity,const UBool * caseFlags,UErrorCode * pErrorCode)182 u_strToPunycode(const UChar *src, int32_t srcLength,
183                 UChar *dest, int32_t destCapacity,
184                 const UBool *caseFlags,
185                 UErrorCode *pErrorCode) {
186 
187     int32_t cpBuffer[ENCODE_MAX_CODE_UNITS];
188     int32_t n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
189     UChar c, c2;
190 
191     /* argument checking */
192     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
193         return 0;
194     }
195 
196     if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
197         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
198         return 0;
199     }
200     if (srcLength>ENCODE_MAX_CODE_UNITS) {
201         *pErrorCode=U_INPUT_TOO_LONG_ERROR;
202         return 0;
203     }
204 
205     /*
206      * Handle the basic code points and
207      * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
208      */
209     srcCPCount=destLength=0;
210     if(srcLength==-1) {
211         /* NUL-terminated input */
212         for(j=0; /* no condition */; ++j) {
213             if((c=src[j])==0) {
214                 break;
215             }
216             if(j>=ENCODE_MAX_CODE_UNITS) {
217                 *pErrorCode=U_INPUT_TOO_LONG_ERROR;
218                 return 0;
219             }
220             if(IS_BASIC(c)) {
221                 cpBuffer[srcCPCount++]=0;
222                 if(destLength<destCapacity) {
223                     dest[destLength]=
224                         caseFlags!=NULL ?
225                             asciiCaseMap((char)c, caseFlags[j]) :
226                             (char)c;
227                 }
228                 ++destLength;
229             } else {
230                 n=(caseFlags!=NULL && caseFlags[j])<<31L;
231                 if(U16_IS_SINGLE(c)) {
232                     n|=c;
233                 } else if(U16_IS_LEAD(c) && U16_IS_TRAIL(c2=src[j+1])) {
234                     ++j;
235                     n|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
236                 } else {
237                     /* error: unmatched surrogate */
238                     *pErrorCode=U_INVALID_CHAR_FOUND;
239                     return 0;
240                 }
241                 cpBuffer[srcCPCount++]=n;
242             }
243         }
244     } else {
245         /* length-specified input */
246         for(j=0; j<srcLength; ++j) {
247             c=src[j];
248             if(IS_BASIC(c)) {
249                 cpBuffer[srcCPCount++]=0;
250                 if(destLength<destCapacity) {
251                     dest[destLength]=
252                         caseFlags!=NULL ?
253                             asciiCaseMap((char)c, caseFlags[j]) :
254                             (char)c;
255                 }
256                 ++destLength;
257             } else {
258                 n=(caseFlags!=NULL && caseFlags[j])<<31L;
259                 if(U16_IS_SINGLE(c)) {
260                     n|=c;
261                 } else if(U16_IS_LEAD(c) && (j+1)<srcLength && U16_IS_TRAIL(c2=src[j+1])) {
262                     ++j;
263                     n|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
264                 } else {
265                     /* error: unmatched surrogate */
266                     *pErrorCode=U_INVALID_CHAR_FOUND;
267                     return 0;
268                 }
269                 cpBuffer[srcCPCount++]=n;
270             }
271         }
272     }
273 
274     /* Finish the basic string - if it is not empty - with a delimiter. */
275     basicLength=destLength;
276     if(basicLength>0) {
277         if(destLength<destCapacity) {
278             dest[destLength]=DELIMITER;
279         }
280         ++destLength;
281     }
282 
283     /*
284      * handledCPCount is the number of code points that have been handled
285      * basicLength is the number of basic code points
286      * destLength is the number of chars that have been output
287      */
288 
289     /* Initialize the state: */
290     n=INITIAL_N;
291     delta=0;
292     bias=INITIAL_BIAS;
293 
294     /* Main encoding loop: */
295     for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {
296         /*
297          * All non-basic code points < n have been handled already.
298          * Find the next larger one:
299          */
300         for(m=0x7fffffff, j=0; j<srcCPCount; ++j) {
301             q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
302             if(n<=q && q<m) {
303                 m=q;
304             }
305         }
306 
307         /*
308          * Increase delta enough to advance the decoder's
309          * <n,i> state to <m,0>, but guard against overflow:
310          */
311         if(m-n>(0x7fffffff-handledCPCount-delta)/(handledCPCount+1)) {
312             *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
313             return 0;
314         }
315         delta+=(m-n)*(handledCPCount+1);
316         n=m;
317 
318         /* Encode a sequence of same code points n */
319         for(j=0; j<srcCPCount; ++j) {
320             q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
321             if(q<n) {
322                 ++delta;
323             } else if(q==n) {
324                 /* Represent delta as a generalized variable-length integer: */
325                 for(q=delta, k=BASE; /* no condition */; k+=BASE) {
326 
327                     /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
328 
329                     t=k-bias;
330                     if(t<TMIN) {
331                         t=TMIN;
332                     } else if(t>TMAX) {
333                         t=TMAX;
334                     }
335                     */
336 
337                     t=k-bias;
338                     if(t<TMIN) {
339                         t=TMIN;
340                     } else if(k>=(bias+TMAX)) {
341                         t=TMAX;
342                     }
343 
344                     if(q<t) {
345                         break;
346                     }
347 
348                     if(destLength<destCapacity) {
349                         dest[destLength]=digitToBasic(t+(q-t)%(BASE-t), 0);
350                     }
351                     ++destLength;
352                     q=(q-t)/(BASE-t);
353                 }
354 
355                 if(destLength<destCapacity) {
356                     dest[destLength]=digitToBasic(q, (UBool)(cpBuffer[j]<0));
357                 }
358                 ++destLength;
359                 bias=adaptBias(delta, handledCPCount+1, (UBool)(handledCPCount==basicLength));
360                 delta=0;
361                 ++handledCPCount;
362             }
363         }
364 
365         ++delta;
366         ++n;
367     }
368 
369     return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
370 }
371 
372 // decode
373 U_CAPI int32_t
u_strFromPunycode(const UChar * src,int32_t srcLength,UChar * dest,int32_t destCapacity,UBool * caseFlags,UErrorCode * pErrorCode)374 u_strFromPunycode(const UChar *src, int32_t srcLength,
375                   UChar *dest, int32_t destCapacity,
376                   UBool *caseFlags,
377                   UErrorCode *pErrorCode) {
378     int32_t n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
379             destCPCount, firstSupplementaryIndex, cpLength;
380     UChar b;
381 
382     /* argument checking */
383     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
384         return 0;
385     }
386 
387     if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
388         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
389         return 0;
390     }
391 
392     if(srcLength==-1) {
393         srcLength=u_strlen(src);
394     }
395     if (srcLength>DECODE_MAX_CHARS) {
396         *pErrorCode=U_INPUT_TOO_LONG_ERROR;
397         return 0;
398     }
399 
400     /*
401      * Handle the basic code points:
402      * Let basicLength be the number of input code points
403      * before the last delimiter, or 0 if there is none,
404      * then copy the first basicLength code points to the output.
405      *
406      * The two following loops iterate backward.
407      */
408     for(j=srcLength; j>0;) {
409         if(src[--j]==DELIMITER) {
410             break;
411         }
412     }
413     destLength=basicLength=destCPCount=j;
414     U_ASSERT(destLength>=0);
415 
416     while(j>0) {
417         b=src[--j];
418         if(!IS_BASIC(b)) {
419             *pErrorCode=U_INVALID_CHAR_FOUND;
420             return 0;
421         }
422 
423         if(j<destCapacity) {
424             dest[j]=(UChar)b;
425 
426             if(caseFlags!=NULL) {
427                 caseFlags[j]=IS_BASIC_UPPERCASE(b);
428             }
429         }
430     }
431 
432     /* Initialize the state: */
433     n=INITIAL_N;
434     i=0;
435     bias=INITIAL_BIAS;
436     firstSupplementaryIndex=1000000000;
437 
438     /*
439      * Main decoding loop:
440      * Start just after the last delimiter if any
441      * basic code points were copied; start at the beginning otherwise.
442      */
443     for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
444         /*
445          * in is the index of the next character to be consumed, and
446          * destCPCount is the number of code points in the output array.
447          *
448          * Decode a generalized variable-length integer into delta,
449          * which gets added to i.  The overflow checking is easier
450          * if we increase i as we go, then subtract off its starting
451          * value at the end to obtain delta.
452          */
453         for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {
454             if(in>=srcLength) {
455                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
456                 return 0;
457             }
458 
459             digit=decodeDigit(src[in++]);
460             if(digit<0) {
461                 *pErrorCode=U_INVALID_CHAR_FOUND;
462                 return 0;
463             }
464             if(digit>(0x7fffffff-i)/w) {
465                 /* integer overflow */
466                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
467                 return 0;
468             }
469 
470             i+=digit*w;
471             /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
472             t=k-bias;
473             if(t<TMIN) {
474                 t=TMIN;
475             } else if(t>TMAX) {
476                 t=TMAX;
477             }
478             */
479             t=k-bias;
480             if(t<TMIN) {
481                 t=TMIN;
482             } else if(k>=(bias+TMAX)) {
483                 t=TMAX;
484             }
485             if(digit<t) {
486                 break;
487             }
488 
489             if(w>0x7fffffff/(BASE-t)) {
490                 /* integer overflow */
491                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
492                 return 0;
493             }
494             w*=BASE-t;
495         }
496 
497         /*
498          * Modification from sample code:
499          * Increments destCPCount here,
500          * where needed instead of in for() loop tail.
501          */
502         ++destCPCount;
503         bias=adaptBias(i-oldi, destCPCount, (UBool)(oldi==0));
504 
505         /*
506          * i was supposed to wrap around from (incremented) destCPCount to 0,
507          * incrementing n each time, so we'll fix that now:
508          */
509         if(i/destCPCount>(0x7fffffff-n)) {
510             /* integer overflow */
511             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
512             return 0;
513         }
514 
515         n+=i/destCPCount;
516         i%=destCPCount;
517         /* not needed for Punycode: */
518         /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */
519 
520         if(n>0x10ffff || U_IS_SURROGATE(n)) {
521             /* Unicode code point overflow */
522             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
523             return 0;
524         }
525 
526         /* Insert n at position i of the output: */
527         cpLength=U16_LENGTH(n);
528         if(dest!=NULL && ((destLength+cpLength)<=destCapacity)) {
529             int32_t codeUnitIndex;
530 
531             /*
532              * Handle indexes when supplementary code points are present.
533              *
534              * In almost all cases, there will be only BMP code points before i
535              * and even in the entire string.
536              * This is handled with the same efficiency as with UTF-32.
537              *
538              * Only the rare cases with supplementary code points are handled
539              * more slowly - but not too bad since this is an insertion anyway.
540              */
541             if(i<=firstSupplementaryIndex) {
542                 codeUnitIndex=i;
543                 if(cpLength>1) {
544                     firstSupplementaryIndex=codeUnitIndex;
545                 } else {
546                     ++firstSupplementaryIndex;
547                 }
548             } else {
549                 codeUnitIndex=firstSupplementaryIndex;
550                 U16_FWD_N(dest, codeUnitIndex, destLength, i-codeUnitIndex);
551             }
552 
553             /* use the UChar index codeUnitIndex instead of the code point index i */
554             if(codeUnitIndex<destLength) {
555                 uprv_memmove(dest+codeUnitIndex+cpLength,
556                              dest+codeUnitIndex,
557                              (destLength-codeUnitIndex)*U_SIZEOF_UCHAR);
558                 if(caseFlags!=NULL) {
559                     uprv_memmove(caseFlags+codeUnitIndex+cpLength,
560                                  caseFlags+codeUnitIndex,
561                                  destLength-codeUnitIndex);
562                 }
563             }
564             if(cpLength==1) {
565                 /* BMP, insert one code unit */
566                 dest[codeUnitIndex]=(UChar)n;
567             } else {
568                 /* supplementary character, insert two code units */
569                 dest[codeUnitIndex]=U16_LEAD(n);
570                 dest[codeUnitIndex+1]=U16_TRAIL(n);
571             }
572             if(caseFlags!=NULL) {
573                 /* Case of last character determines uppercase flag: */
574                 caseFlags[codeUnitIndex]=IS_BASIC_UPPERCASE(src[in-1]);
575                 if(cpLength==2) {
576                     caseFlags[codeUnitIndex+1]=FALSE;
577                 }
578             }
579         }
580         destLength+=cpLength;
581         U_ASSERT(destLength>=0);
582         ++i;
583     }
584 
585     return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
586 }
587 
588 /* ### check notes on overflow handling - only necessary if not IDNA? are these Punycode functions to be public? */
589 
590 #endif /* #if !UCONFIG_NO_IDNA */
591