1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2002-2011, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  punycode.cpp
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2002jan31
16 *   created by: Markus W. Scherer
17 */
18 
19 
20 /* This ICU code derived from: */
21 /*
22 punycode.c 0.4.0 (2001-Nov-17-Sat)
23 http://www.cs.berkeley.edu/~amc/idn/
24 Adam M. Costello
25 http://www.nicemice.net/amc/
26 
27 Disclaimer and license
28 
29     Regarding this entire document or any portion of it (including
30     the pseudocode and C code), the author makes no guarantees and
31     is not responsible for any damage resulting from its use.  The
32     author grants irrevocable permission to anyone to use, modify,
33     and distribute it in any way that does not diminish the rights
34     of anyone else to use, modify, and distribute it, provided that
35     redistributed derivative works do not contain misleading author or
36     version information.  Derivative works need not be licensed under
37     similar terms.
38 */
39 /*
40  * ICU modifications:
41  * - ICU data types and coding conventions
42  * - ICU string buffer handling with implicit source lengths
43  *   and destination preflighting
44  * - UTF-16 handling
45  */
46 
47 #include "unicode/utypes.h"
48 
49 #if !UCONFIG_NO_IDNA
50 
51 #include "unicode/ustring.h"
52 #include "unicode/utf.h"
53 #include "unicode/utf16.h"
54 #include "ustr_imp.h"
55 #include "cstring.h"
56 #include "cmemory.h"
57 #include "punycode.h"
58 #include "uassert.h"
59 
60 
61 /* Punycode ----------------------------------------------------------------- */
62 
63 /* Punycode parameters for Bootstring */
64 #define BASE            36
65 #define TMIN            1
66 #define TMAX            26
67 #define SKEW            38
68 #define DAMP            700
69 #define INITIAL_BIAS    72
70 #define INITIAL_N       0x80
71 
72 /* "Basic" Unicode/ASCII code points */
73 #define _HYPHEN         0X2d
74 #define DELIMITER       _HYPHEN
75 
76 #define _ZERO_          0X30
77 #define _NINE           0x39
78 
79 #define _SMALL_A        0X61
80 #define _SMALL_Z        0X7a
81 
82 #define _CAPITAL_A      0X41
83 #define _CAPITAL_Z      0X5a
84 
85 #define IS_BASIC(c) ((c)<0x80)
86 #define IS_BASIC_UPPERCASE(c) (_CAPITAL_A<=(c) && (c)<=_CAPITAL_Z)
87 
88 /**
89  * digitToBasic() returns the basic code point whose value
90  * (when used for representing integers) is d, which must be in the
91  * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is
92  * nonzero, in which case the uppercase form is used.
93  */
94 static inline char
digitToBasic(int32_t digit,UBool uppercase)95 digitToBasic(int32_t digit, UBool uppercase) {
96     /*  0..25 map to ASCII a..z or A..Z */
97     /* 26..35 map to ASCII 0..9         */
98     if(digit<26) {
99         if(uppercase) {
100             return (char)(_CAPITAL_A+digit);
101         } else {
102             return (char)(_SMALL_A+digit);
103         }
104     } else {
105         return (char)((_ZERO_-26)+digit);
106     }
107 }
108 
109 /**
110  * basicToDigit[] contains the numeric value of a basic code
111  * point (for use in representing integers) in the range 0 to
112  * BASE-1, or -1 if b is does not represent a value.
113  */
114 static const int8_t
115 basicToDigit[256]={
116     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
117     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
118 
119     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
120     26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
121 
122     -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
123     15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
124 
125     -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
126     15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
127 
128     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
129     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
130 
131     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
132     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
133 
134     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
135     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
136 
137     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
138     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
139 };
140 
141 static inline char
asciiCaseMap(char b,UBool uppercase)142 asciiCaseMap(char b, UBool uppercase) {
143     if(uppercase) {
144         if(_SMALL_A<=b && b<=_SMALL_Z) {
145             b-=(_SMALL_A-_CAPITAL_A);
146         }
147     } else {
148         if(_CAPITAL_A<=b && b<=_CAPITAL_Z) {
149             b+=(_SMALL_A-_CAPITAL_A);
150         }
151     }
152     return b;
153 }
154 
155 /* Punycode-specific Bootstring code ---------------------------------------- */
156 
157 /*
158  * The following code omits the {parts} of the pseudo-algorithm in the spec
159  * that are not used with the Punycode parameter set.
160  */
161 
162 /* Bias adaptation function. */
163 static int32_t
adaptBias(int32_t delta,int32_t length,UBool firstTime)164 adaptBias(int32_t delta, int32_t length, UBool firstTime) {
165     int32_t count;
166 
167     if(firstTime) {
168         delta/=DAMP;
169     } else {
170         delta/=2;
171     }
172 
173     delta+=delta/length;
174     for(count=0; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) {
175         delta/=(BASE-TMIN);
176     }
177 
178     return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
179 }
180 
181 #define MAX_CP_COUNT    200
182 
183 U_CFUNC int32_t
u_strToPunycode(const UChar * src,int32_t srcLength,UChar * dest,int32_t destCapacity,const UBool * caseFlags,UErrorCode * pErrorCode)184 u_strToPunycode(const UChar *src, int32_t srcLength,
185                 UChar *dest, int32_t destCapacity,
186                 const UBool *caseFlags,
187                 UErrorCode *pErrorCode) {
188 
189     int32_t cpBuffer[MAX_CP_COUNT];
190     int32_t n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
191     UChar c, c2;
192 
193     /* argument checking */
194     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
195         return 0;
196     }
197 
198     if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
199         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
200         return 0;
201     }
202 
203     /*
204      * Handle the basic code points and
205      * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
206      */
207     srcCPCount=destLength=0;
208     if(srcLength==-1) {
209         /* NUL-terminated input */
210         for(j=0; /* no condition */; ++j) {
211             if((c=src[j])==0) {
212                 break;
213             }
214             if(srcCPCount==MAX_CP_COUNT) {
215                 /* too many input code points */
216                 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
217                 return 0;
218             }
219             if(IS_BASIC(c)) {
220                 cpBuffer[srcCPCount++]=0;
221                 if(destLength<destCapacity) {
222                     dest[destLength]=
223                         caseFlags!=NULL ?
224                             asciiCaseMap((char)c, caseFlags[j]) :
225                             (char)c;
226                 }
227                 ++destLength;
228             } else {
229                 n=(caseFlags!=NULL && caseFlags[j])<<31L;
230                 if(U16_IS_SINGLE(c)) {
231                     n|=c;
232                 } else if(U16_IS_LEAD(c) && U16_IS_TRAIL(c2=src[j+1])) {
233                     ++j;
234                     n|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
235                 } else {
236                     /* error: unmatched surrogate */
237                     *pErrorCode=U_INVALID_CHAR_FOUND;
238                     return 0;
239                 }
240                 cpBuffer[srcCPCount++]=n;
241             }
242         }
243     } else {
244         /* length-specified input */
245         for(j=0; j<srcLength; ++j) {
246             if(srcCPCount==MAX_CP_COUNT) {
247                 /* too many input code points */
248                 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
249                 return 0;
250             }
251             c=src[j];
252             if(IS_BASIC(c)) {
253                 cpBuffer[srcCPCount++]=0;
254                 if(destLength<destCapacity) {
255                     dest[destLength]=
256                         caseFlags!=NULL ?
257                             asciiCaseMap((char)c, caseFlags[j]) :
258                             (char)c;
259                 }
260                 ++destLength;
261             } else {
262                 n=(caseFlags!=NULL && caseFlags[j])<<31L;
263                 if(U16_IS_SINGLE(c)) {
264                     n|=c;
265                 } else if(U16_IS_LEAD(c) && (j+1)<srcLength && U16_IS_TRAIL(c2=src[j+1])) {
266                     ++j;
267                     n|=(int32_t)U16_GET_SUPPLEMENTARY(c, c2);
268                 } else {
269                     /* error: unmatched surrogate */
270                     *pErrorCode=U_INVALID_CHAR_FOUND;
271                     return 0;
272                 }
273                 cpBuffer[srcCPCount++]=n;
274             }
275         }
276     }
277 
278     /* Finish the basic string - if it is not empty - with a delimiter. */
279     basicLength=destLength;
280     if(basicLength>0) {
281         if(destLength<destCapacity) {
282             dest[destLength]=DELIMITER;
283         }
284         ++destLength;
285     }
286 
287     /*
288      * handledCPCount is the number of code points that have been handled
289      * basicLength is the number of basic code points
290      * destLength is the number of chars that have been output
291      */
292 
293     /* Initialize the state: */
294     n=INITIAL_N;
295     delta=0;
296     bias=INITIAL_BIAS;
297 
298     /* Main encoding loop: */
299     for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {
300         /*
301          * All non-basic code points < n have been handled already.
302          * Find the next larger one:
303          */
304         for(m=0x7fffffff, j=0; j<srcCPCount; ++j) {
305             q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
306             if(n<=q && q<m) {
307                 m=q;
308             }
309         }
310 
311         /*
312          * Increase delta enough to advance the decoder's
313          * <n,i> state to <m,0>, but guard against overflow:
314          */
315         if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) {
316             *pErrorCode=U_INTERNAL_PROGRAM_ERROR;
317             return 0;
318         }
319         delta+=(m-n)*(handledCPCount+1);
320         n=m;
321 
322         /* Encode a sequence of same code points n */
323         for(j=0; j<srcCPCount; ++j) {
324             q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
325             if(q<n) {
326                 ++delta;
327             } else if(q==n) {
328                 /* Represent delta as a generalized variable-length integer: */
329                 for(q=delta, k=BASE; /* no condition */; k+=BASE) {
330 
331                     /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
332 
333                     t=k-bias;
334                     if(t<TMIN) {
335                         t=TMIN;
336                     } else if(t>TMAX) {
337                         t=TMAX;
338                     }
339                     */
340 
341                     t=k-bias;
342                     if(t<TMIN) {
343                         t=TMIN;
344                     } else if(k>=(bias+TMAX)) {
345                         t=TMAX;
346                     }
347 
348                     if(q<t) {
349                         break;
350                     }
351 
352                     if(destLength<destCapacity) {
353                         dest[destLength]=digitToBasic(t+(q-t)%(BASE-t), 0);
354                     }
355                     ++destLength;
356                     q=(q-t)/(BASE-t);
357                 }
358 
359                 if(destLength<destCapacity) {
360                     dest[destLength]=digitToBasic(q, (UBool)(cpBuffer[j]<0));
361                 }
362                 ++destLength;
363                 bias=adaptBias(delta, handledCPCount+1, (UBool)(handledCPCount==basicLength));
364                 delta=0;
365                 ++handledCPCount;
366             }
367         }
368 
369         ++delta;
370         ++n;
371     }
372 
373     return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
374 }
375 
376 U_CFUNC int32_t
u_strFromPunycode(const UChar * src,int32_t srcLength,UChar * dest,int32_t destCapacity,UBool * caseFlags,UErrorCode * pErrorCode)377 u_strFromPunycode(const UChar *src, int32_t srcLength,
378                   UChar *dest, int32_t destCapacity,
379                   UBool *caseFlags,
380                   UErrorCode *pErrorCode) {
381     int32_t n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
382             destCPCount, firstSupplementaryIndex, cpLength;
383     UChar b;
384 
385     /* argument checking */
386     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
387         return 0;
388     }
389 
390     if(src==NULL || srcLength<-1 || (dest==NULL && destCapacity!=0)) {
391         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
392         return 0;
393     }
394 
395     if(srcLength==-1) {
396         srcLength=u_strlen(src);
397     }
398 
399     /*
400      * Handle the basic code points:
401      * Let basicLength be the number of input code points
402      * before the last delimiter, or 0 if there is none,
403      * then copy the first basicLength code points to the output.
404      *
405      * The two following loops iterate backward.
406      */
407     for(j=srcLength; j>0;) {
408         if(src[--j]==DELIMITER) {
409             break;
410         }
411     }
412     destLength=basicLength=destCPCount=j;
413     U_ASSERT(destLength>=0);
414 
415     while(j>0) {
416         b=src[--j];
417         if(!IS_BASIC(b)) {
418             *pErrorCode=U_INVALID_CHAR_FOUND;
419             return 0;
420         }
421 
422         if(j<destCapacity) {
423             dest[j]=(UChar)b;
424 
425             if(caseFlags!=NULL) {
426                 caseFlags[j]=IS_BASIC_UPPERCASE(b);
427             }
428         }
429     }
430 
431     /* Initialize the state: */
432     n=INITIAL_N;
433     i=0;
434     bias=INITIAL_BIAS;
435     firstSupplementaryIndex=1000000000;
436 
437     /*
438      * Main decoding loop:
439      * Start just after the last delimiter if any
440      * basic code points were copied; start at the beginning otherwise.
441      */
442     for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
443         /*
444          * in is the index of the next character to be consumed, and
445          * destCPCount is the number of code points in the output array.
446          *
447          * Decode a generalized variable-length integer into delta,
448          * which gets added to i.  The overflow checking is easier
449          * if we increase i as we go, then subtract off its starting
450          * value at the end to obtain delta.
451          */
452         for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {
453             if(in>=srcLength) {
454                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
455                 return 0;
456             }
457 
458             digit=basicToDigit[(uint8_t)src[in++]];
459             if(digit<0) {
460                 *pErrorCode=U_INVALID_CHAR_FOUND;
461                 return 0;
462             }
463             if(digit>(0x7fffffff-i)/w) {
464                 /* integer overflow */
465                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
466                 return 0;
467             }
468 
469             i+=digit*w;
470             /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
471             t=k-bias;
472             if(t<TMIN) {
473                 t=TMIN;
474             } else if(t>TMAX) {
475                 t=TMAX;
476             }
477             */
478             t=k-bias;
479             if(t<TMIN) {
480                 t=TMIN;
481             } else if(k>=(bias+TMAX)) {
482                 t=TMAX;
483             }
484             if(digit<t) {
485                 break;
486             }
487 
488             if(w>0x7fffffff/(BASE-t)) {
489                 /* integer overflow */
490                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
491                 return 0;
492             }
493             w*=BASE-t;
494         }
495 
496         /*
497          * Modification from sample code:
498          * Increments destCPCount here,
499          * where needed instead of in for() loop tail.
500          */
501         ++destCPCount;
502         bias=adaptBias(i-oldi, destCPCount, (UBool)(oldi==0));
503 
504         /*
505          * i was supposed to wrap around from (incremented) destCPCount to 0,
506          * incrementing n each time, so we'll fix that now:
507          */
508         if(i/destCPCount>(0x7fffffff-n)) {
509             /* integer overflow */
510             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
511             return 0;
512         }
513 
514         n+=i/destCPCount;
515         i%=destCPCount;
516         /* not needed for Punycode: */
517         /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */
518 
519         if(n>0x10ffff || U_IS_SURROGATE(n)) {
520             /* Unicode code point overflow */
521             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
522             return 0;
523         }
524 
525         /* Insert n at position i of the output: */
526         cpLength=U16_LENGTH(n);
527         if(dest!=NULL && ((destLength+cpLength)<=destCapacity)) {
528             int32_t codeUnitIndex;
529 
530             /*
531              * Handle indexes when supplementary code points are present.
532              *
533              * In almost all cases, there will be only BMP code points before i
534              * and even in the entire string.
535              * This is handled with the same efficiency as with UTF-32.
536              *
537              * Only the rare cases with supplementary code points are handled
538              * more slowly - but not too bad since this is an insertion anyway.
539              */
540             if(i<=firstSupplementaryIndex) {
541                 codeUnitIndex=i;
542                 if(cpLength>1) {
543                     firstSupplementaryIndex=codeUnitIndex;
544                 } else {
545                     ++firstSupplementaryIndex;
546                 }
547             } else {
548                 codeUnitIndex=firstSupplementaryIndex;
549                 U16_FWD_N(dest, codeUnitIndex, destLength, i-codeUnitIndex);
550             }
551 
552             /* use the UChar index codeUnitIndex instead of the code point index i */
553             if(codeUnitIndex<destLength) {
554                 uprv_memmove(dest+codeUnitIndex+cpLength,
555                              dest+codeUnitIndex,
556                              (destLength-codeUnitIndex)*U_SIZEOF_UCHAR);
557                 if(caseFlags!=NULL) {
558                     uprv_memmove(caseFlags+codeUnitIndex+cpLength,
559                                  caseFlags+codeUnitIndex,
560                                  destLength-codeUnitIndex);
561                 }
562             }
563             if(cpLength==1) {
564                 /* BMP, insert one code unit */
565                 dest[codeUnitIndex]=(UChar)n;
566             } else {
567                 /* supplementary character, insert two code units */
568                 dest[codeUnitIndex]=U16_LEAD(n);
569                 dest[codeUnitIndex+1]=U16_TRAIL(n);
570             }
571             if(caseFlags!=NULL) {
572                 /* Case of last character determines uppercase flag: */
573                 caseFlags[codeUnitIndex]=IS_BASIC_UPPERCASE(src[in-1]);
574                 if(cpLength==2) {
575                     caseFlags[codeUnitIndex+1]=FALSE;
576                 }
577             }
578         }
579         destLength+=cpLength;
580         U_ASSERT(destLength>=0);
581         ++i;
582     }
583 
584     return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
585 }
586 
587 /* ### check notes on overflow handling - only necessary if not IDNA? are these Punycode functions to be public? */
588 
589 #endif /* #if !UCONFIG_NO_IDNA */
590