1 /* 2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 3 * 4 * This code is free software; you can redistribute it and/or modify it 5 * under the terms of the GNU General Public License version 2 only, as 6 * published by the Free Software Foundation. Oracle designates this 7 * particular file as subject to the "Classpath" exception as provided 8 * by Oracle in the LICENSE file that accompanied this code. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 */ 24 /* 25 ******************************************************************************* 26 * Copyright (C) 2003-2004, International Business Machines Corporation and * 27 * others. All Rights Reserved. * 28 ******************************************************************************* 29 */ 30 // 31 // CHANGELOG 32 // 2005-05-19 Edward Wang 33 // - copy this file from icu4jsrc_3_2/src/com/ibm/icu/text/Punycode.java 34 // - move from package com.ibm.icu.text to package sun.net.idn 35 // - use ParseException instead of StringPrepParseException 36 // 2007-08-14 Martin Buchholz 37 // - remove redundant casts 38 // 39 package sun.net.idn; 40 41 import java.text.ParseException; 42 import sun.text.normalizer.UCharacter; 43 import sun.text.normalizer.UTF16; 44 45 /** 46 * Ported code from ICU punycode.c 47 * @author ram 48 */ 49 50 /* Package Private class */ 51 public final class Punycode { 52 53 /* Punycode parameters for Bootstring */ 54 private static final int BASE = 36; 55 private static final int TMIN = 1; 56 private static final int TMAX = 26; 57 private static final int SKEW = 38; 58 private static final int DAMP = 700; 59 private static final int INITIAL_BIAS = 72; 60 private static final int INITIAL_N = 0x80; 61 62 /* "Basic" Unicode/ASCII code points */ 63 private static final int HYPHEN = 0x2d; 64 private static final int DELIMITER = HYPHEN; 65 66 private static final int ZERO = 0x30; 67 private static final int NINE = 0x39; 68 69 private static final int SMALL_A = 0x61; 70 private static final int SMALL_Z = 0x7a; 71 72 private static final int CAPITAL_A = 0x41; 73 private static final int CAPITAL_Z = 0x5a; 74 75 // TODO: eliminate the 256 limitation 76 private static final int MAX_CP_COUNT = 256; 77 78 private static final int UINT_MAGIC = 0x80000000; 79 private static final long ULONG_MAGIC = 0x8000000000000000L; 80 adaptBias(int delta, int length, boolean firstTime)81 private static int adaptBias(int delta, int length, boolean firstTime){ 82 if(firstTime){ 83 delta /=DAMP; 84 }else{ 85 delta /= 2; 86 } 87 delta += delta/length; 88 89 int count=0; 90 for(; delta>((BASE-TMIN)*TMAX)/2; count+=BASE) { 91 delta/=(BASE-TMIN); 92 } 93 94 return count+(((BASE-TMIN+1)*delta)/(delta+SKEW)); 95 } 96 97 /** 98 * basicToDigit[] contains the numeric value of a basic code 99 * point (for use in representing integers) in the range 0 to 100 * BASE-1, or -1 if b is does not represent a value. 101 */ 102 static final int[] basicToDigit= new int[]{ 103 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 104 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 105 106 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 107 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1, 108 109 -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 110 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, 111 112 -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 113 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, 114 115 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 116 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 117 118 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 119 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 120 121 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 122 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 123 124 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 125 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 126 }; 127 asciiCaseMap(char b, boolean uppercase)128 private static char asciiCaseMap(char b, boolean uppercase) { 129 if(uppercase) { 130 if(SMALL_A<=b && b<=SMALL_Z) { 131 b-=(SMALL_A-CAPITAL_A); 132 } 133 } else { 134 if(CAPITAL_A<=b && b<=CAPITAL_Z) { 135 b+=(SMALL_A-CAPITAL_A); 136 } 137 } 138 return b; 139 } 140 141 /** 142 * digitToBasic() returns the basic code point whose value 143 * (when used for representing integers) is d, which must be in the 144 * range 0 to BASE-1. The lowercase form is used unless the uppercase flag is 145 * nonzero, in which case the uppercase form is used. 146 */ digitToBasic(int digit, boolean uppercase)147 private static char digitToBasic(int digit, boolean uppercase) { 148 /* 0..25 map to ASCII a..z or A..Z */ 149 /* 26..35 map to ASCII 0..9 */ 150 if(digit<26) { 151 if(uppercase) { 152 return (char)(CAPITAL_A+digit); 153 } else { 154 return (char)(SMALL_A+digit); 155 } 156 } else { 157 return (char)((ZERO-26)+digit); 158 } 159 } 160 /** 161 * Converts Unicode to Punycode. 162 * The input string must not contain single, unpaired surrogates. 163 * The output will be represented as an array of ASCII code points. 164 * 165 * @param src 166 * @param caseFlags 167 * @return 168 * @throws ParseException 169 */ encode(StringBuffer src, boolean[] caseFlags)170 public static StringBuffer encode(StringBuffer src, boolean[] caseFlags) throws ParseException{ 171 172 int[] cpBuffer = new int[MAX_CP_COUNT]; 173 int n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount; 174 char c, c2; 175 int srcLength = src.length(); 176 int destCapacity = MAX_CP_COUNT; 177 char[] dest = new char[destCapacity]; 178 StringBuffer result = new StringBuffer(); 179 /* 180 * Handle the basic code points and 181 * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit): 182 */ 183 srcCPCount=destLength=0; 184 185 for(j=0; j<srcLength; ++j) { 186 if(srcCPCount==MAX_CP_COUNT) { 187 /* too many input code points */ 188 throw new ParseException("Too many input code points", -1); 189 } 190 c=src.charAt(j); 191 if(isBasic(c)) { 192 if(destLength<destCapacity) { 193 cpBuffer[srcCPCount++]=0; 194 dest[destLength]= 195 caseFlags!=null ? 196 asciiCaseMap(c, caseFlags[j]) : 197 c; 198 } 199 ++destLength; 200 } else { 201 n=((caseFlags!=null && caseFlags[j])? 1 : 0)<<31L; 202 if(!UTF16.isSurrogate(c)) { 203 n|=c; 204 } else if(UTF16.isLeadSurrogate(c) && (j+1)<srcLength && UTF16.isTrailSurrogate(c2=src.charAt(j+1))) { 205 ++j; 206 207 n|=UCharacter.getCodePoint(c, c2); 208 } else { 209 /* error: unmatched surrogate */ 210 throw new ParseException("Illegal char found", -1); 211 } 212 cpBuffer[srcCPCount++]=n; 213 } 214 } 215 216 /* Finish the basic string - if it is not empty - with a delimiter. */ 217 basicLength=destLength; 218 if(basicLength>0) { 219 if(destLength<destCapacity) { 220 dest[destLength]=DELIMITER; 221 } 222 ++destLength; 223 } 224 225 /* 226 * handledCPCount is the number of code points that have been handled 227 * basicLength is the number of basic code points 228 * destLength is the number of chars that have been output 229 */ 230 231 /* Initialize the state: */ 232 n=INITIAL_N; 233 delta=0; 234 bias=INITIAL_BIAS; 235 236 /* Main encoding loop: */ 237 for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) { 238 /* 239 * All non-basic code points < n have been handled already. 240 * Find the next larger one: 241 */ 242 for(m=0x7fffffff, j=0; j<srcCPCount; ++j) { 243 q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */ 244 if(n<=q && q<m) { 245 m=q; 246 } 247 } 248 249 /* 250 * Increase delta enough to advance the decoder's 251 * <n,i> state to <m,0>, but guard against overflow: 252 */ 253 if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) { 254 throw new RuntimeException("Internal program error"); 255 } 256 delta+=(m-n)*(handledCPCount+1); 257 n=m; 258 259 /* Encode a sequence of same code points n */ 260 for(j=0; j<srcCPCount; ++j) { 261 q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */ 262 if(q<n) { 263 ++delta; 264 } else if(q==n) { 265 /* Represent delta as a generalized variable-length integer: */ 266 for(q=delta, k=BASE; /* no condition */; k+=BASE) { 267 268 /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt 269 270 t=k-bias; 271 if(t<TMIN) { 272 t=TMIN; 273 } else if(t>TMAX) { 274 t=TMAX; 275 } 276 */ 277 278 t=k-bias; 279 if(t<TMIN) { 280 t=TMIN; 281 } else if(k>=(bias+TMAX)) { 282 t=TMAX; 283 } 284 285 if(q<t) { 286 break; 287 } 288 289 if(destLength<destCapacity) { 290 dest[destLength++]=digitToBasic(t+(q-t)%(BASE-t), false); 291 } 292 q=(q-t)/(BASE-t); 293 } 294 295 if(destLength<destCapacity) { 296 dest[destLength++]=digitToBasic(q, (cpBuffer[j]<0)); 297 } 298 bias=adaptBias(delta, handledCPCount+1,(handledCPCount==basicLength)); 299 delta=0; 300 ++handledCPCount; 301 } 302 } 303 304 ++delta; 305 ++n; 306 } 307 308 return result.append(dest, 0, destLength); 309 } 310 isBasic(int ch)311 private static boolean isBasic(int ch){ 312 return (ch < INITIAL_N); 313 } 314 isBasicUpperCase(int ch)315 private static boolean isBasicUpperCase(int ch){ 316 return( CAPITAL_A <= ch && ch <= CAPITAL_Z); 317 } 318 isSurrogate(int ch)319 private static boolean isSurrogate(int ch){ 320 return (((ch)&0xfffff800)==0xd800); 321 } 322 /** 323 * Converts Punycode to Unicode. 324 * The Unicode string will be at most as long as the Punycode string. 325 * 326 * @param src 327 * @param caseFlags 328 * @return 329 * @throws ParseException 330 */ decode(StringBuffer src, boolean[] caseFlags)331 public static StringBuffer decode(StringBuffer src, boolean[] caseFlags) 332 throws ParseException{ 333 int srcLength = src.length(); 334 StringBuffer result = new StringBuffer(); 335 int n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t, 336 destCPCount, firstSupplementaryIndex, cpLength; 337 char b; 338 int destCapacity = MAX_CP_COUNT; 339 char[] dest = new char[destCapacity]; 340 341 /* 342 * Handle the basic code points: 343 * Let basicLength be the number of input code points 344 * before the last delimiter, or 0 if there is none, 345 * then copy the first basicLength code points to the output. 346 * 347 * The two following loops iterate backward. 348 */ 349 for(j=srcLength; j>0;) { 350 if(src.charAt(--j)==DELIMITER) { 351 break; 352 } 353 } 354 destLength=basicLength=destCPCount=j; 355 356 while(j>0) { 357 b=src.charAt(--j); 358 if(!isBasic(b)) { 359 throw new ParseException("Illegal char found", -1); 360 } 361 362 if(j<destCapacity) { 363 dest[j]= b; 364 365 if(caseFlags!=null) { 366 caseFlags[j]=isBasicUpperCase(b); 367 } 368 } 369 } 370 371 /* Initialize the state: */ 372 n=INITIAL_N; 373 i=0; 374 bias=INITIAL_BIAS; 375 firstSupplementaryIndex=1000000000; 376 377 /* 378 * Main decoding loop: 379 * Start just after the last delimiter if any 380 * basic code points were copied; start at the beginning otherwise. 381 */ 382 for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) { 383 /* 384 * in is the index of the next character to be consumed, and 385 * destCPCount is the number of code points in the output array. 386 * 387 * Decode a generalized variable-length integer into delta, 388 * which gets added to i. The overflow checking is easier 389 * if we increase i as we go, then subtract off its starting 390 * value at the end to obtain delta. 391 */ 392 for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) { 393 if(in>=srcLength) { 394 throw new ParseException("Illegal char found", -1); 395 } 396 397 digit=basicToDigit[(byte)src.charAt(in++)]; 398 if(digit<0) { 399 throw new ParseException("Invalid char found", -1); 400 } 401 if(digit>(0x7fffffff-i)/w) { 402 /* integer overflow */ 403 throw new ParseException("Illegal char found", -1); 404 } 405 406 i+=digit*w; 407 t=k-bias; 408 if(t<TMIN) { 409 t=TMIN; 410 } else if(k>=(bias+TMAX)) { 411 t=TMAX; 412 } 413 if(digit<t) { 414 break; 415 } 416 417 if(w>0x7fffffff/(BASE-t)) { 418 /* integer overflow */ 419 throw new ParseException("Illegal char found", -1); 420 } 421 w*=BASE-t; 422 } 423 424 /* 425 * Modification from sample code: 426 * Increments destCPCount here, 427 * where needed instead of in for() loop tail. 428 */ 429 ++destCPCount; 430 bias=adaptBias(i-oldi, destCPCount, (oldi==0)); 431 432 /* 433 * i was supposed to wrap around from (incremented) destCPCount to 0, 434 * incrementing n each time, so we'll fix that now: 435 */ 436 if(i/destCPCount>(0x7fffffff-n)) { 437 /* integer overflow */ 438 throw new ParseException("Illegal char found", -1); 439 } 440 441 n+=i/destCPCount; 442 i%=destCPCount; 443 /* not needed for Punycode: */ 444 /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */ 445 446 if(n>0x10ffff || isSurrogate(n)) { 447 /* Unicode code point overflow */ 448 throw new ParseException("Illegal char found", -1); 449 } 450 451 /* Insert n at position i of the output: */ 452 cpLength=UTF16.getCharCount(n); 453 if((destLength+cpLength)<destCapacity) { 454 int codeUnitIndex; 455 456 /* 457 * Handle indexes when supplementary code points are present. 458 * 459 * In almost all cases, there will be only BMP code points before i 460 * and even in the entire string. 461 * This is handled with the same efficiency as with UTF-32. 462 * 463 * Only the rare cases with supplementary code points are handled 464 * more slowly - but not too bad since this is an insertion anyway. 465 */ 466 if(i<=firstSupplementaryIndex) { 467 codeUnitIndex=i; 468 if(cpLength>1) { 469 firstSupplementaryIndex=codeUnitIndex; 470 } else { 471 ++firstSupplementaryIndex; 472 } 473 } else { 474 codeUnitIndex=firstSupplementaryIndex; 475 codeUnitIndex=UTF16.moveCodePointOffset(dest, 0, destLength, codeUnitIndex, i-codeUnitIndex); 476 } 477 478 /* use the UChar index codeUnitIndex instead of the code point index i */ 479 if(codeUnitIndex<destLength) { 480 System.arraycopy(dest, codeUnitIndex, 481 dest, codeUnitIndex+cpLength, 482 (destLength-codeUnitIndex)); 483 if(caseFlags!=null) { 484 System.arraycopy(caseFlags, codeUnitIndex, 485 caseFlags, codeUnitIndex+cpLength, 486 destLength-codeUnitIndex); 487 } 488 } 489 if(cpLength==1) { 490 /* BMP, insert one code unit */ 491 dest[codeUnitIndex]=(char)n; 492 } else { 493 /* supplementary character, insert two code units */ 494 dest[codeUnitIndex]=UTF16.getLeadSurrogate(n); 495 dest[codeUnitIndex+1]=UTF16.getTrailSurrogate(n); 496 } 497 if(caseFlags!=null) { 498 /* Case of last character determines uppercase flag: */ 499 caseFlags[codeUnitIndex]=isBasicUpperCase(src.charAt(in-1)); 500 if(cpLength==2) { 501 caseFlags[codeUnitIndex+1]=false; 502 } 503 } 504 } 505 destLength+=cpLength; 506 ++i; 507 } 508 result.append(dest, 0, destLength); 509 return result; 510 } 511 } 512