1 /*
2  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
3  *
4  * This code is free software; you can redistribute it and/or modify it
5  * under the terms of the GNU General Public License version 2 only, as
6  * published by the Free Software Foundation.  Oracle designates this
7  * particular file as subject to the "Classpath" exception as provided
8  * by Oracle in the LICENSE file that accompanied this code.
9  *
10  * This code is distributed in the hope that it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
13  * version 2 for more details (a copy is included in the LICENSE file that
14  * accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License version
17  * 2 along with this work; if not, write to the Free Software Foundation,
18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19  *
20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21  * or visit www.oracle.com if you need additional information or have any
22  * questions.
23  */
24 /*
25  *******************************************************************************
26  * Copyright (C) 2003-2004, International Business Machines Corporation and    *
27  * others. All Rights Reserved.                                                *
28  *******************************************************************************
29  */
30 //
31 // CHANGELOG
32 //      2005-05-19 Edward Wang
33 //          - copy this file from icu4jsrc_3_2/src/com/ibm/icu/text/Punycode.java
34 //          - move from package com.ibm.icu.text to package sun.net.idn
35 //          - use ParseException instead of StringPrepParseException
36 //      2007-08-14 Martin Buchholz
37 //          - remove redundant casts
38 //
39 package sun.net.idn;
40 
41 import java.text.ParseException;
42 import sun.text.normalizer.UCharacter;
43 import sun.text.normalizer.UTF16;
44 
45 /**
46  * Ported code from ICU punycode.c
47  * @author ram
48  */
49 
50 /* Package Private class */
51 public final class Punycode {
52 
53     /* Punycode parameters for Bootstring */
54     private static final int BASE           = 36;
55     private static final int TMIN           = 1;
56     private static final int TMAX           = 26;
57     private static final int SKEW           = 38;
58     private static final int DAMP           = 700;
59     private static final int INITIAL_BIAS   = 72;
60     private static final int INITIAL_N      = 0x80;
61 
62     /* "Basic" Unicode/ASCII code points */
63     private static final int HYPHEN         = 0x2d;
64     private static final int DELIMITER      = HYPHEN;
65 
66     private static final int ZERO           = 0x30;
67     private static final int NINE           = 0x39;
68 
69     private static final int SMALL_A        = 0x61;
70     private static final int SMALL_Z        = 0x7a;
71 
72     private static final int CAPITAL_A      = 0x41;
73     private static final int CAPITAL_Z      = 0x5a;
74 
75     //  TODO: eliminate the 256 limitation
76     private static final int MAX_CP_COUNT   = 256;
77 
78     private static final int UINT_MAGIC     = 0x80000000;
79     private static final long ULONG_MAGIC   = 0x8000000000000000L;
80 
adaptBias(int delta, int length, boolean firstTime)81     private static int adaptBias(int delta, int length, boolean firstTime){
82         if(firstTime){
83             delta /=DAMP;
84         }else{
85             delta /=  2;
86         }
87         delta += delta/length;
88 
89         int count=0;
90         for(; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) {
91             delta/=(BASE-TMIN);
92         }
93 
94         return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
95     }
96 
97     /**
98      * basicToDigit[] contains the numeric value of a basic code
99      * point (for use in representing integers) in the range 0 to
100      * BASE-1, or -1 if b is does not represent a value.
101      */
102     static final int[]    basicToDigit= new int[]{
103         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
104         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
105 
106         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
107         26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
108 
109         -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
110         15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
111 
112         -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
113         15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
114 
115         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
116         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
117 
118         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
119         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
120 
121         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
122         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
123 
124         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
125         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
126     };
127 
asciiCaseMap(char b, boolean uppercase)128     private static char asciiCaseMap(char b, boolean uppercase) {
129         if(uppercase) {
130             if(SMALL_A<=b && b<=SMALL_Z) {
131                 b-=(SMALL_A-CAPITAL_A);
132             }
133         } else {
134             if(CAPITAL_A<=b && b<=CAPITAL_Z) {
135                 b+=(SMALL_A-CAPITAL_A);
136             }
137         }
138         return b;
139     }
140 
141     /**
142      * digitToBasic() returns the basic code point whose value
143      * (when used for representing integers) is d, which must be in the
144      * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is
145      * nonzero, in which case the uppercase form is used.
146      */
digitToBasic(int digit, boolean uppercase)147     private static char digitToBasic(int digit, boolean uppercase) {
148         /*  0..25 map to ASCII a..z or A..Z */
149         /* 26..35 map to ASCII 0..9         */
150         if(digit<26) {
151             if(uppercase) {
152                 return (char)(CAPITAL_A+digit);
153             } else {
154                 return (char)(SMALL_A+digit);
155             }
156         } else {
157             return (char)((ZERO-26)+digit);
158         }
159     }
160     /**
161      * Converts Unicode to Punycode.
162      * The input string must not contain single, unpaired surrogates.
163      * The output will be represented as an array of ASCII code points.
164      *
165      * @param src
166      * @param caseFlags
167      * @return
168      * @throws ParseException
169      */
encode(StringBuffer src, boolean[] caseFlags)170     public static StringBuffer encode(StringBuffer src, boolean[] caseFlags) throws ParseException{
171 
172         int[] cpBuffer = new int[MAX_CP_COUNT];
173         int n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
174         char c, c2;
175         int srcLength = src.length();
176         int destCapacity = MAX_CP_COUNT;
177         char[] dest = new char[destCapacity];
178         StringBuffer result = new StringBuffer();
179         /*
180          * Handle the basic code points and
181          * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
182          */
183         srcCPCount=destLength=0;
184 
185         for(j=0; j<srcLength; ++j) {
186             if(srcCPCount==MAX_CP_COUNT) {
187                 /* too many input code points */
188                 throw new ParseException("Too many input code points", -1);
189             }
190             c=src.charAt(j);
191             if(isBasic(c)) {
192                 if(destLength<destCapacity) {
193                     cpBuffer[srcCPCount++]=0;
194                     dest[destLength]=
195                         caseFlags!=null ?
196                             asciiCaseMap(c, caseFlags[j]) :
197                             c;
198                 }
199                 ++destLength;
200             } else {
201                 n=((caseFlags!=null && caseFlags[j])? 1 : 0)<<31L;
202                 if(!UTF16.isSurrogate(c)) {
203                     n|=c;
204                 } else if(UTF16.isLeadSurrogate(c) && (j+1)<srcLength && UTF16.isTrailSurrogate(c2=src.charAt(j+1))) {
205                     ++j;
206 
207                     n|=UCharacter.getCodePoint(c, c2);
208                 } else {
209                     /* error: unmatched surrogate */
210                     throw new ParseException("Illegal char found", -1);
211                 }
212                 cpBuffer[srcCPCount++]=n;
213             }
214         }
215 
216         /* Finish the basic string - if it is not empty - with a delimiter. */
217         basicLength=destLength;
218         if(basicLength>0) {
219             if(destLength<destCapacity) {
220                 dest[destLength]=DELIMITER;
221             }
222             ++destLength;
223         }
224 
225         /*
226          * handledCPCount is the number of code points that have been handled
227          * basicLength is the number of basic code points
228          * destLength is the number of chars that have been output
229          */
230 
231         /* Initialize the state: */
232         n=INITIAL_N;
233         delta=0;
234         bias=INITIAL_BIAS;
235 
236         /* Main encoding loop: */
237         for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {
238             /*
239              * All non-basic code points < n have been handled already.
240              * Find the next larger one:
241              */
242             for(m=0x7fffffff, j=0; j<srcCPCount; ++j) {
243                 q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
244                 if(n<=q && q<m) {
245                     m=q;
246                 }
247             }
248 
249             /*
250              * Increase delta enough to advance the decoder's
251              * <n,i> state to <m,0>, but guard against overflow:
252              */
253             if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) {
254                 throw new RuntimeException("Internal program error");
255             }
256             delta+=(m-n)*(handledCPCount+1);
257             n=m;
258 
259             /* Encode a sequence of same code points n */
260             for(j=0; j<srcCPCount; ++j) {
261                 q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
262                 if(q<n) {
263                     ++delta;
264                 } else if(q==n) {
265                     /* Represent delta as a generalized variable-length integer: */
266                     for(q=delta, k=BASE; /* no condition */; k+=BASE) {
267 
268                         /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
269 
270                         t=k-bias;
271                         if(t<TMIN) {
272                             t=TMIN;
273                         } else if(t>TMAX) {
274                             t=TMAX;
275                         }
276                         */
277 
278                         t=k-bias;
279                         if(t<TMIN) {
280                             t=TMIN;
281                         } else if(k>=(bias+TMAX)) {
282                             t=TMAX;
283                         }
284 
285                         if(q<t) {
286                             break;
287                         }
288 
289                         if(destLength<destCapacity) {
290                             dest[destLength++]=digitToBasic(t+(q-t)%(BASE-t), false);
291                         }
292                         q=(q-t)/(BASE-t);
293                     }
294 
295                     if(destLength<destCapacity) {
296                         dest[destLength++]=digitToBasic(q, (cpBuffer[j]<0));
297                     }
298                     bias=adaptBias(delta, handledCPCount+1,(handledCPCount==basicLength));
299                     delta=0;
300                     ++handledCPCount;
301                 }
302             }
303 
304             ++delta;
305             ++n;
306         }
307 
308         return result.append(dest, 0, destLength);
309     }
310 
isBasic(int ch)311     private static boolean isBasic(int ch){
312         return (ch < INITIAL_N);
313     }
314 
isBasicUpperCase(int ch)315     private static boolean isBasicUpperCase(int ch){
316         return( CAPITAL_A <= ch && ch <= CAPITAL_Z);
317     }
318 
isSurrogate(int ch)319     private static boolean isSurrogate(int ch){
320         return (((ch)&0xfffff800)==0xd800);
321     }
322     /**
323      * Converts Punycode to Unicode.
324      * The Unicode string will be at most as long as the Punycode string.
325      *
326      * @param src
327      * @param caseFlags
328      * @return
329      * @throws ParseException
330      */
decode(StringBuffer src, boolean[] caseFlags)331     public static StringBuffer decode(StringBuffer src, boolean[] caseFlags)
332                                throws ParseException{
333         int srcLength = src.length();
334         StringBuffer result = new StringBuffer();
335         int n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
336                 destCPCount, firstSupplementaryIndex, cpLength;
337         char b;
338         int destCapacity = MAX_CP_COUNT;
339         char[] dest = new char[destCapacity];
340 
341         /*
342          * Handle the basic code points:
343          * Let basicLength be the number of input code points
344          * before the last delimiter, or 0 if there is none,
345          * then copy the first basicLength code points to the output.
346          *
347          * The two following loops iterate backward.
348          */
349         for(j=srcLength; j>0;) {
350             if(src.charAt(--j)==DELIMITER) {
351                 break;
352             }
353         }
354         destLength=basicLength=destCPCount=j;
355 
356         while(j>0) {
357             b=src.charAt(--j);
358             if(!isBasic(b)) {
359                 throw new ParseException("Illegal char found", -1);
360             }
361 
362             if(j<destCapacity) {
363                 dest[j]= b;
364 
365                 if(caseFlags!=null) {
366                     caseFlags[j]=isBasicUpperCase(b);
367                 }
368             }
369         }
370 
371         /* Initialize the state: */
372         n=INITIAL_N;
373         i=0;
374         bias=INITIAL_BIAS;
375         firstSupplementaryIndex=1000000000;
376 
377         /*
378          * Main decoding loop:
379          * Start just after the last delimiter if any
380          * basic code points were copied; start at the beginning otherwise.
381          */
382         for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
383             /*
384              * in is the index of the next character to be consumed, and
385              * destCPCount is the number of code points in the output array.
386              *
387              * Decode a generalized variable-length integer into delta,
388              * which gets added to i.  The overflow checking is easier
389              * if we increase i as we go, then subtract off its starting
390              * value at the end to obtain delta.
391              */
392             for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {
393                 if(in>=srcLength) {
394                     throw new ParseException("Illegal char found", -1);
395                 }
396 
397                 digit=basicToDigit[(byte)src.charAt(in++)];
398                 if(digit<0) {
399                     throw new ParseException("Invalid char found", -1);
400                 }
401                 if(digit>(0x7fffffff-i)/w) {
402                     /* integer overflow */
403                     throw new ParseException("Illegal char found", -1);
404                 }
405 
406                 i+=digit*w;
407                 t=k-bias;
408                 if(t<TMIN) {
409                     t=TMIN;
410                 } else if(k>=(bias+TMAX)) {
411                     t=TMAX;
412                 }
413                 if(digit<t) {
414                     break;
415                 }
416 
417                 if(w>0x7fffffff/(BASE-t)) {
418                     /* integer overflow */
419                     throw new ParseException("Illegal char found", -1);
420                 }
421                 w*=BASE-t;
422             }
423 
424             /*
425              * Modification from sample code:
426              * Increments destCPCount here,
427              * where needed instead of in for() loop tail.
428              */
429             ++destCPCount;
430             bias=adaptBias(i-oldi, destCPCount, (oldi==0));
431 
432             /*
433              * i was supposed to wrap around from (incremented) destCPCount to 0,
434              * incrementing n each time, so we'll fix that now:
435              */
436             if(i/destCPCount>(0x7fffffff-n)) {
437                 /* integer overflow */
438                 throw new ParseException("Illegal char found", -1);
439             }
440 
441             n+=i/destCPCount;
442             i%=destCPCount;
443             /* not needed for Punycode: */
444             /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */
445 
446             if(n>0x10ffff || isSurrogate(n)) {
447                 /* Unicode code point overflow */
448                 throw new ParseException("Illegal char found", -1);
449             }
450 
451             /* Insert n at position i of the output: */
452             cpLength=UTF16.getCharCount(n);
453             if((destLength+cpLength)<destCapacity) {
454                 int codeUnitIndex;
455 
456                 /*
457                  * Handle indexes when supplementary code points are present.
458                  *
459                  * In almost all cases, there will be only BMP code points before i
460                  * and even in the entire string.
461                  * This is handled with the same efficiency as with UTF-32.
462                  *
463                  * Only the rare cases with supplementary code points are handled
464                  * more slowly - but not too bad since this is an insertion anyway.
465                  */
466                 if(i<=firstSupplementaryIndex) {
467                     codeUnitIndex=i;
468                     if(cpLength>1) {
469                         firstSupplementaryIndex=codeUnitIndex;
470                     } else {
471                         ++firstSupplementaryIndex;
472                     }
473                 } else {
474                     codeUnitIndex=firstSupplementaryIndex;
475                     codeUnitIndex=UTF16.moveCodePointOffset(dest, 0, destLength, codeUnitIndex, i-codeUnitIndex);
476                 }
477 
478                 /* use the UChar index codeUnitIndex instead of the code point index i */
479                 if(codeUnitIndex<destLength) {
480                     System.arraycopy(dest, codeUnitIndex,
481                                      dest, codeUnitIndex+cpLength,
482                                     (destLength-codeUnitIndex));
483                     if(caseFlags!=null) {
484                         System.arraycopy(caseFlags, codeUnitIndex,
485                                          caseFlags, codeUnitIndex+cpLength,
486                                          destLength-codeUnitIndex);
487                     }
488                 }
489                 if(cpLength==1) {
490                     /* BMP, insert one code unit */
491                     dest[codeUnitIndex]=(char)n;
492                 } else {
493                     /* supplementary character, insert two code units */
494                     dest[codeUnitIndex]=UTF16.getLeadSurrogate(n);
495                     dest[codeUnitIndex+1]=UTF16.getTrailSurrogate(n);
496                 }
497                 if(caseFlags!=null) {
498                     /* Case of last character determines uppercase flag: */
499                     caseFlags[codeUnitIndex]=isBasicUpperCase(src.charAt(in-1));
500                     if(cpLength==2) {
501                         caseFlags[codeUnitIndex+1]=false;
502                     }
503                 }
504             }
505             destLength+=cpLength;
506             ++i;
507         }
508         result.append(dest, 0, destLength);
509         return result;
510     }
511 }
512