/*
 * Copyright (c) 2005, 2013, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
package java.net;

import java.io.InputStream;
import java.io.IOException;
import java.security.AccessController;
import java.security.PrivilegedAction;

import sun.net.idn.StringPrep;
import sun.net.idn.Punycode;
import sun.text.normalizer.UCharacterIterator;

/**
 * Provides methods to convert internationalized domain names (IDNs) between
 * a normal Unicode representation and an ASCII Compatible Encoding (ACE) representation.
 * Internationalized domain names can use characters from the entire range of
 * Unicode, while traditional domain names are restricted to ASCII characters.
 * ACE is an encoding of Unicode strings that uses only ASCII characters and
 * can be used with software (such as the Domain Name System) that only
 * understands traditional domain names.
 *
 * <p>Internationalized domain names are defined in <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
 * RFC 3490 defines two operations: ToASCII and ToUnicode. These 2 operations employ
 * <a href="http://www.ietf.org/rfc/rfc3491.txt">Nameprep</a> algorithm, which is a
 * profile of <a href="http://www.ietf.org/rfc/rfc3454.txt">Stringprep</a>, and
 * <a href="http://www.ietf.org/rfc/rfc3492.txt">Punycode</a> algorithm to convert
 * domain name string back and forth.
 *
 * <p>The behavior of aforementioned conversion process can be adjusted by various flags:
 *   <ul>
 *     <li>If the ALLOW_UNASSIGNED flag is used, the domain name string to be converted
 *         can contain code points that are unassigned in Unicode 3.2, which is the
 *         Unicode version on which IDN conversion is based. If the flag is not used,
 *         the presence of such unassigned code points is treated as an error.
 *     <li>If the USE_STD3_ASCII_RULES flag is used, ASCII strings are checked against <a href="http://www.ietf.org/rfc/rfc1122.txt">RFC 1122</a> and <a href="http://www.ietf.org/rfc/rfc1123.txt">RFC 1123</a>.
 *         It is an error if they don't meet the requirements.
 *   </ul>
 * These flags can be logically OR'ed together.
 *
 * <p>The security consideration is important with respect to internationalization
 * domain name support. For example, English domain names may be <i>homographed</i>
 * - maliciously misspelled by substitution of non-Latin letters.
 * <a href="http://www.unicode.org/reports/tr36/">Unicode Technical Report #36</a>
 * discusses security issues of IDN support as well as possible solutions.
 * Applications are responsible for taking adequate security measures when using
 * international domain names.
 *
 * @author Edward Wang
 * @since 1.6
 *
 */
public final class IDN {
    /**
     * Flag to allow processing of unassigned code points
     */
    public static final int ALLOW_UNASSIGNED = 0x01;

    /**
     * Flag to turn on the check against STD-3 ASCII rules
     */
    public static final int USE_STD3_ASCII_RULES = 0x02;


    /**
     * Translates a string from Unicode to ASCII Compatible Encoding (ACE),
     * as defined by the ToASCII operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
     *
     * <p>ToASCII operation can fail. ToASCII fails if any step of it fails.
     * If ToASCII operation fails, an IllegalArgumentException will be thrown.
     * In this case, the input string should not be used in an internationalized domain name.
     *
     * <p> A label is an individual part of a domain name. The original ToASCII operation,
     * as defined in RFC 3490, only operates on a single label. This method can handle
     * both label and entire domain name, by assuming that labels in a domain name are
     * always separated by dots. The following characters are recognized as dots:
     * U+002E (full stop), U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
     * and U+FF61 (halfwidth ideographic full stop). If dots are
     * used as label separators, this method also changes all of them to U+002E (full stop)
     * in output translated string.
     *
     * @param input     the string to be processed
     * @param flag      process flag; can be 0 or any logical OR of possible flags
     *
     * @return          the translated {@code String}
     *
     * @throws IllegalArgumentException   if the input string doesn't conform to RFC 3490 specification
     */
    public static String toASCII(String input, int flag) {
        if (isRootLabel(input)) {
            return ".";
        }

        final int length = input.length();
        final StringBuilder out = new StringBuilder();

        int start = 0;
        while (start < length) {
            // end is the index of the next separator, or length if there is none
            final int end = searchDots(input, start);
            out.append(toASCIIInternal(input.substring(start, end), flag));
            if (end != length) {
                // has more labels, or keep the trailing dot as at present
                out.append('.');
            }
            start = end + 1;
        }

        return out.toString();
    }


    /**
     * Translates a string from Unicode to ASCII Compatible Encoding (ACE),
     * as defined by the ToASCII operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
     *
     * <p> This convenience method works as if by invoking the
     * two-argument counterpart as follows:
     * <blockquote>
     * {@link #toASCII(String, int) toASCII}(input,&nbsp;0);
     * </blockquote>
     *
     * @param input     the string to be processed
     *
     * @return          the translated {@code String}
     *
     * @throws IllegalArgumentException   if the input string doesn't conform to RFC 3490 specification
     */
    public static String toASCII(String input) {
        return toASCII(input, 0);
    }


    /**
     * Translates a string from ASCII Compatible Encoding (ACE) to Unicode,
     * as defined by the ToUnicode operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
     *
     * <p>ToUnicode never fails. In case of any error, the input string is returned unmodified.
     *
     * <p> A label is an individual part of a domain name. The original ToUnicode operation,
     * as defined in RFC 3490, only operates on a single label. This method can handle
     * both label and entire domain name, by assuming that labels in a domain name are
     * always separated by dots. The following characters are recognized as dots:
     * U+002E (full stop), U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
     * and U+FF61 (halfwidth ideographic full stop).
     *
     * @param input     the string to be processed
     * @param flag      process flag; can be 0 or any logical OR of possible flags
     *
     * @return          the translated {@code String}
     */
    public static String toUnicode(String input, int flag) {
        if (isRootLabel(input)) {
            return ".";
        }

        final int length = input.length();
        final StringBuilder out = new StringBuilder();

        int start = 0;
        while (start < length) {
            // end is the index of the next separator, or length if there is none
            final int end = searchDots(input, start);
            out.append(toUnicodeInternal(input.substring(start, end), flag));
            if (end != length) {
                // has more labels, or keep the trailing dot as at present
                out.append('.');
            }
            start = end + 1;
        }

        return out.toString();
    }


    /**
     * Translates a string from ASCII Compatible Encoding (ACE) to Unicode,
     * as defined by the ToUnicode operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
     *
     * <p> This convenience method works as if by invoking the
     * two-argument counterpart as follows:
     * <blockquote>
     * {@link #toUnicode(String, int) toUnicode}(input,&nbsp;0);
     * </blockquote>
     *
     * @param input     the string to be processed
     *
     * @return          the translated {@code String}
     */
    public static String toUnicode(String input) {
        return toUnicode(input, 0);
    }


    /* ---------------- Private members -------------- */

    // ACE Prefix is "xn--"
    private static final String ACE_PREFIX = "xn--";
    private static final int ACE_PREFIX_LENGTH = ACE_PREFIX.length();

    private static final int MAX_LABEL_LENGTH = 63;

    // name of the nameprep profile resource, resolved relative to StringPrep
    private static final String IDN_PROFILE = "uidna.spp";

    // single instance of nameprep
    private static final StringPrep namePrep = loadNamePrep();


    /* ---------------- Private operations -------------- */


    //
    // to suppress the default zero-argument constructor
    //
    private IDN() {}

    //
    // Loads the nameprep profile once, at class initialization.
    // The resource lookup runs inside doPrivileged when a security manager
    // is installed. try-with-resources closes the stream even when the
    // StringPrep constructor fails. An IOException is not expected; it
    // trips the assertion when assertions are enabled, otherwise null is
    // returned and namePrep stays unset.
    //
    private static StringPrep loadNamePrep() {
        final InputStream profile;
        if (System.getSecurityManager() != null) {
            profile = AccessController.doPrivileged(new PrivilegedAction<InputStream>() {
                @Override
                public InputStream run() {
                    return StringPrep.class.getResourceAsStream(IDN_PROFILE);
                }
            });
        } else {
            profile = StringPrep.class.getResourceAsStream(IDN_PROFILE);
        }

        try (InputStream stream = profile) {
            return new StringPrep(stream);
        } catch (IOException e) {
            // should never reach here
            assert false;
            return null;
        }
    }

    //
    // toASCII operation; should only apply to a single label
    //
    private static String toASCIIInternal(String label, int flag) {
        // step 1
        // Check if the string contains code points outside the ASCII range 0..0x7F.
        final boolean isASCII = isAllASCII(label);
        StringBuffer dest;

        // step 2
        // perform the nameprep operation; flag ALLOW_UNASSIGNED is used here
        if (!isASCII) {
            UCharacterIterator iter = UCharacterIterator.getInstance(label);
            try {
                dest = namePrep.prepare(iter, flag);
            } catch (java.text.ParseException e) {
                throw new IllegalArgumentException(e);
            }
        } else {
            dest = new StringBuffer(label);
        }

        // step 8, move forward to check the smallest number of the code points
        // the length must be inside 1..63
        if (dest.length() == 0) {
            throw new IllegalArgumentException(
                        "Empty label is not a legal name");
        }

        // step 3
        // Verify the absence of non-LDH ASCII code points
        //   0..0x2c, 0x2e..0x2f, 0x3a..0x40, 0x5b..0x60, 0x7b..0x7f
        // Verify the absence of leading and trailing hyphen
        final boolean useSTD3ASCIIRules = ((flag & USE_STD3_ASCII_RULES) != 0);
        if (useSTD3ASCIIRules) {
            for (int i = 0; i < dest.length(); i++) {
                if (isNonLDHAsciiCodePoint(dest.charAt(i))) {
                    throw new IllegalArgumentException(
                        "Contains non-LDH ASCII characters");
                }
            }

            if (dest.charAt(0) == '-'
                    || dest.charAt(dest.length() - 1) == '-') {
                throw new IllegalArgumentException(
                        "Has leading or trailing hyphen");
            }
        }

        // step 4
        // If all code points are inside 0..0x7f, skip to step 8
        if (!isASCII && !isAllASCII(dest.toString())) {
            // step 5
            // verify the sequence does not begin with ACE prefix
            if (startsWithACEPrefix(dest)) {
                throw new IllegalArgumentException("The input starts with the ACE Prefix");
            }

            // step 6
            // encode the sequence with punycode
            try {
                dest = Punycode.encode(dest, null);
            } catch (java.text.ParseException e) {
                throw new IllegalArgumentException(e);
            }

            dest = toASCIILower(dest);

            // step 7
            // prepend the ACE prefix
            dest.insert(0, ACE_PREFIX);
        }

        // step 8
        // the length must be inside 1..63
        if (dest.length() > MAX_LABEL_LENGTH) {
            throw new IllegalArgumentException("The label in the input is too long");
        }

        return dest.toString();
    }

    //
    // toUnicode operation; should only apply to a single label
    //
    private static String toUnicodeInternal(String label, int flag) {
        StringBuffer dest;

        // step 1
        // find out if all the codepoints in input are ASCII
        final boolean isASCII = isAllASCII(label);

        if (!isASCII) {
            // step 2
            // perform the nameprep operation; flag ALLOW_UNASSIGNED is used here
            try {
                UCharacterIterator iter = UCharacterIterator.getInstance(label);
                dest = namePrep.prepare(iter, flag);
            } catch (Exception e) {
                // toUnicode never fails; if any step fails, return the input string
                return label;
            }
        } else {
            dest = new StringBuffer(label);
        }

        // step 3
        // verify ACE Prefix
        if (startsWithACEPrefix(dest)) {

            // step 4
            // Remove the ACE Prefix
            String temp = dest.substring(ACE_PREFIX_LENGTH, dest.length());

            try {
                // step 5
                // Decode using punycode
                StringBuffer decodeOut = Punycode.decode(new StringBuffer(temp), null);

                // step 6
                // Apply toASCII
                String toASCIIOut = toASCII(decodeOut.toString(), flag);

                // step 7
                // verify
                if (toASCIIOut.equalsIgnoreCase(dest.toString())) {
                    // step 8
                    // return output of step 5
                    return decodeOut.toString();
                }
            } catch (Exception ignored) {
                // deliberately ignored: toUnicode never fails, so any decoding
                // or re-encoding problem falls through to returning the input
            }
        }

        // just return the input
        return label;
    }


    //
    // LDH stands for "letter/digit/hyphen", with characters restricted to the
    // 26-letter Latin alphabet <A-Z a-z>, the digits <0-9>, and the hyphen
    // <->.
    // Non LDH refers to characters in the ASCII range, but which are not
    // letters, digits or the hyphen.
    //
    // non-LDH = 0..0x2C, 0x2E..0x2F, 0x3A..0x40, 0x5B..0x60, 0x7B..0x7F
    //
    private static boolean isNonLDHAsciiCodePoint(int ch) {
        return (0x0000 <= ch && ch <= 0x002C) ||
               (0x002E <= ch && ch <= 0x002F) ||
               (0x003A <= ch && ch <= 0x0040) ||
               (0x005B <= ch && ch <= 0x0060) ||
               (0x007B <= ch && ch <= 0x007F);
    }

    //
    // search dots in a string and return the index of that character;
    // or if there is no dots, return the length of input string
    // dots might be: U+002E (full stop), U+3002 (ideographic full stop), U+FF0E (fullwidth full stop),
    // and U+FF61 (halfwidth ideographic full stop).
    //
    private static int searchDots(String s, int start) {
        int i = start;
        while (i < s.length() && !isLabelSeparator(s.charAt(i))) {
            i++;
        }
        return i;
    }

    //
    // to check if a string is a root label, ".".
    //
    private static boolean isRootLabel(String s) {
        return (s.length() == 1 && isLabelSeparator(s.charAt(0)));
    }

    //
    // to check if a character is a label separator, i.e. a dot character.
    //
    private static boolean isLabelSeparator(char c) {
        return (c == '.' || c == '\u3002' || c == '\uFF0E' || c == '\uFF61');
    }

    //
    // to check if a string only contains US-ASCII code point
    //
    private static boolean isAllASCII(String input) {
        for (int i = 0; i < input.length(); i++) {
            if (input.charAt(i) > 0x7F) {
                return false;
            }
        }
        return true;
    }

    //
    // to check if a string starts with ACE-prefix (ASCII case-insensitive)
    //
    private static boolean startsWithACEPrefix(StringBuffer input) {
        if (input.length() < ACE_PREFIX_LENGTH) {
            return false;
        }
        for (int i = 0; i < ACE_PREFIX_LENGTH; i++) {
            if (toASCIILower(input.charAt(i)) != ACE_PREFIX.charAt(i)) {
                return false;
            }
        }
        return true;
    }

    //
    // to lower-case a single character; only 'A'..'Z' are mapped,
    // every other character is returned unchanged
    //
    private static char toASCIILower(char ch) {
        if ('A' <= ch && ch <= 'Z') {
            return (char) (ch + 'a' - 'A');
        }
        return ch;
    }

    //
    // to build a new buffer with every 'A'..'Z' of the input lower-cased;
    // the input buffer itself is not modified
    //
    private static StringBuffer toASCIILower(StringBuffer input) {
        StringBuffer dest = new StringBuffer(input.length());
        for (int i = 0; i < input.length(); i++) {
            dest.append(toASCIILower(input.charAt(i)));
        }
        return dest;
    }
}