1 /*
2  * Copyright (c) 2005, 2013, Oracle and/or its affiliates. All rights reserved.
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * This code is free software; you can redistribute it and/or modify it
6  * under the terms of the GNU General Public License version 2 only, as
7  * published by the Free Software Foundation.  Oracle designates this
8  * particular file as subject to the "Classpath" exception as provided
9  * by Oracle in the LICENSE file that accompanied this code.
10  *
11  * This code is distributed in the hope that it will be useful, but WITHOUT
12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14  * version 2 for more details (a copy is included in the LICENSE file that
15  * accompanied this code).
16  *
17  * You should have received a copy of the GNU General Public License version
18  * 2 along with this work; if not, write to the Free Software Foundation,
19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20  *
21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22  * or visit www.oracle.com if you need additional information or have any
23  * questions.
24  */
25 package java.net;
26 
27 import java.io.InputStream;
28 import java.io.IOException;
29 import java.security.AccessController;
30 import java.security.PrivilegedAction;
31 
32 import sun.net.idn.StringPrep;
33 import sun.net.idn.Punycode;
34 import sun.text.normalizer.UCharacterIterator;
35 
36 /**
37  * Provides methods to convert internationalized domain names (IDNs) between
38  * a normal Unicode representation and an ASCII Compatible Encoding (ACE) representation.
39  * Internationalized domain names can use characters from the entire range of
40  * Unicode, while traditional domain names are restricted to ASCII characters.
41  * ACE is an encoding of Unicode strings that uses only ASCII characters and
42  * can be used with software (such as the Domain Name System) that only
43  * understands traditional domain names.
44  *
45  * <p>Internationalized domain names are defined in <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
46  * RFC 3490 defines two operations: ToASCII and ToUnicode. These 2 operations employ
47  * <a href="http://www.ietf.org/rfc/rfc3491.txt">Nameprep</a> algorithm, which is a
48  * profile of <a href="http://www.ietf.org/rfc/rfc3454.txt">Stringprep</a>, and
49  * <a href="http://www.ietf.org/rfc/rfc3492.txt">Punycode</a> algorithm to convert
50  * domain name string back and forth.
51  *
52  * <p>The behavior of aforementioned conversion process can be adjusted by various flags:
53  *   <ul>
54  *     <li>If the ALLOW_UNASSIGNED flag is used, the domain name string to be converted
55  *         can contain code points that are unassigned in Unicode 3.2, which is the
56  *         Unicode version on which IDN conversion is based. If the flag is not used,
57  *         the presence of such unassigned code points is treated as an error.
58  *     <li>If the USE_STD3_ASCII_RULES flag is used, ASCII strings are checked against <a href="http://www.ietf.org/rfc/rfc1122.txt">RFC 1122</a> and <a href="http://www.ietf.org/rfc/rfc1123.txt">RFC 1123</a>.
59  *         It is an error if they don't meet the requirements.
60  *   </ul>
61  * These flags can be logically OR'ed together.
62  *
 * <p>Security considerations are important with respect to internationalized
 * domain name support. For example, English domain names may be <i>homographed</i>
65  * - maliciously misspelled by substitution of non-Latin letters.
66  * <a href="http://www.unicode.org/reports/tr36/">Unicode Technical Report #36</a>
67  * discusses security issues of IDN support as well as possible solutions.
68  * Applications are responsible for taking adequate security measures when using
69  * international domain names.
70  *
71  * @author Edward Wang
72  * @since 1.6
73  *
74  */
75 public final class IDN {
76     /**
77      * Flag to allow processing of unassigned code points
78      */
79     public static final int ALLOW_UNASSIGNED = 0x01;
80 
81     /**
82      * Flag to turn on the check against STD-3 ASCII rules
83      */
84     public static final int USE_STD3_ASCII_RULES = 0x02;
85 
86 
87     /**
88      * Translates a string from Unicode to ASCII Compatible Encoding (ACE),
89      * as defined by the ToASCII operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
90      *
91      * <p>ToASCII operation can fail. ToASCII fails if any step of it fails.
92      * If ToASCII operation fails, an IllegalArgumentException will be thrown.
93      * In this case, the input string should not be used in an internationalized domain name.
94      *
95      * <p> A label is an individual part of a domain name. The original ToASCII operation,
96      * as defined in RFC 3490, only operates on a single label. This method can handle
97      * both label and entire domain name, by assuming that labels in a domain name are
98      * always separated by dots. The following characters are recognized as dots:
99      * &#0092;u002E (full stop), &#0092;u3002 (ideographic full stop), &#0092;uFF0E (fullwidth full stop),
100      * and &#0092;uFF61 (halfwidth ideographic full stop). if dots are
101      * used as label separators, this method also changes all of them to &#0092;u002E (full stop)
102      * in output translated string.
103      *
104      * @param input     the string to be processed
105      * @param flag      process flag; can be 0 or any logical OR of possible flags
106      *
107      * @return          the translated {@code String}
108      *
109      * @throws IllegalArgumentException   if the input string doesn't conform to RFC 3490 specification
110      */
toASCII(String input, int flag)111     public static String toASCII(String input, int flag)
112     {
113         int p = 0, q = 0;
114         StringBuilder out = new StringBuilder();
115 
116         if (isRootLabel(input)) {
117             return ".";
118         }
119 
120         while (p < input.length()) {
121             q = searchDots(input, p);
122             out.append(toASCIIInternal(input.substring(p, q),  flag));
123             if (q != (input.length())) {
124                // has more labels, or keep the trailing dot as at present
125                out.append('.');
126             }
127             p = q + 1;
128         }
129 
130         return out.toString();
131     }
132 
133 
134     /**
135      * Translates a string from Unicode to ASCII Compatible Encoding (ACE),
136      * as defined by the ToASCII operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
137      *
138      * <p> This convenience method works as if by invoking the
139      * two-argument counterpart as follows:
140      * <blockquote>
141      * {@link #toASCII(String, int) toASCII}(input,&nbsp;0);
142      * </blockquote>
143      *
144      * @param input     the string to be processed
145      *
146      * @return          the translated {@code String}
147      *
148      * @throws IllegalArgumentException   if the input string doesn't conform to RFC 3490 specification
149      */
toASCII(String input)150     public static String toASCII(String input) {
151         return toASCII(input, 0);
152     }
153 
154 
155     /**
156      * Translates a string from ASCII Compatible Encoding (ACE) to Unicode,
157      * as defined by the ToUnicode operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
158      *
159      * <p>ToUnicode never fails. In case of any error, the input string is returned unmodified.
160      *
161      * <p> A label is an individual part of a domain name. The original ToUnicode operation,
162      * as defined in RFC 3490, only operates on a single label. This method can handle
163      * both label and entire domain name, by assuming that labels in a domain name are
164      * always separated by dots. The following characters are recognized as dots:
165      * &#0092;u002E (full stop), &#0092;u3002 (ideographic full stop), &#0092;uFF0E (fullwidth full stop),
166      * and &#0092;uFF61 (halfwidth ideographic full stop).
167      *
168      * @param input     the string to be processed
169      * @param flag      process flag; can be 0 or any logical OR of possible flags
170      *
171      * @return          the translated {@code String}
172      */
toUnicode(String input, int flag)173     public static String toUnicode(String input, int flag) {
174         int p = 0, q = 0;
175         StringBuilder out = new StringBuilder();
176 
177         if (isRootLabel(input)) {
178             return ".";
179         }
180 
181         while (p < input.length()) {
182             q = searchDots(input, p);
183             out.append(toUnicodeInternal(input.substring(p, q),  flag));
184             if (q != (input.length())) {
185                // has more labels, or keep the trailing dot as at present
186                out.append('.');
187             }
188             p = q + 1;
189         }
190 
191         return out.toString();
192     }
193 
194 
195     /**
196      * Translates a string from ASCII Compatible Encoding (ACE) to Unicode,
197      * as defined by the ToUnicode operation of <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>.
198      *
199      * <p> This convenience method works as if by invoking the
200      * two-argument counterpart as follows:
201      * <blockquote>
202      * {@link #toUnicode(String, int) toUnicode}(input,&nbsp;0);
203      * </blockquote>
204      *
205      * @param input     the string to be processed
206      *
207      * @return          the translated {@code String}
208      */
toUnicode(String input)209     public static String toUnicode(String input) {
210         return toUnicode(input, 0);
211     }
212 
213 
214     /* ---------------- Private members -------------- */
215 
216     // ACE Prefix is "xn--"
217     private static final String ACE_PREFIX = "xn--";
218     private static final int ACE_PREFIX_LENGTH = ACE_PREFIX.length();
219 
220     private static final int MAX_LABEL_LENGTH   = 63;
221 
222     // single instance of nameprep
223     private static StringPrep namePrep = null;
224 
225     static {
226         InputStream stream = null;
227 
228         try {
229             final String IDN_PROFILE = "uidna.spp";
230             if (System.getSecurityManager() != null) {
231                 stream = AccessController.doPrivileged(new PrivilegedAction<>() {
232                     public InputStream run() {
233                         return StringPrep.class.getResourceAsStream(IDN_PROFILE);
234                     }
235                 });
236             } else {
237                 stream = StringPrep.class.getResourceAsStream(IDN_PROFILE);
238             }
239 
240             namePrep = new StringPrep(stream);
stream.close()241             stream.close();
242         } catch (IOException e) {
243             // should never reach here
244             assert false;
245         }
246     }
247 
248 
249     /* ---------------- Private operations -------------- */
250 
251 
252     //
253     // to suppress the default zero-argument constructor
254     //
IDN()255     private IDN() {}
256 
257     //
258     // toASCII operation; should only apply to a single label
259     //
toASCIIInternal(String label, int flag)260     private static String toASCIIInternal(String label, int flag)
261     {
262         // step 1
263         // Check if the string contains code points outside the ASCII range 0..0x7c.
264         boolean isASCII  = isAllASCII(label);
265         StringBuffer dest;
266 
267         // step 2
268         // perform the nameprep operation; flag ALLOW_UNASSIGNED is used here
269         if (!isASCII) {
270             UCharacterIterator iter = UCharacterIterator.getInstance(label);
271             try {
272                 dest = namePrep.prepare(iter, flag);
273             } catch (java.text.ParseException e) {
274                 throw new IllegalArgumentException(e);
275             }
276         } else {
277             dest = new StringBuffer(label);
278         }
279 
280         // step 8, move forward to check the smallest number of the code points
281         // the length must be inside 1..63
282         if (dest.length() == 0) {
283             throw new IllegalArgumentException(
284                         "Empty label is not a legal name");
285         }
286 
287         // step 3
288         // Verify the absence of non-LDH ASCII code points
289         //   0..0x2c, 0x2e..0x2f, 0x3a..0x40, 0x5b..0x60, 0x7b..0x7f
290         // Verify the absence of leading and trailing hyphen
291         boolean useSTD3ASCIIRules = ((flag & USE_STD3_ASCII_RULES) != 0);
292         if (useSTD3ASCIIRules) {
293             for (int i = 0; i < dest.length(); i++) {
294                 int c = dest.charAt(i);
295                 if (isNonLDHAsciiCodePoint(c)) {
296                     throw new IllegalArgumentException(
297                         "Contains non-LDH ASCII characters");
298                 }
299             }
300 
301             if (dest.charAt(0) == '-' ||
302                 dest.charAt(dest.length() - 1) == '-') {
303 
304                 throw new IllegalArgumentException(
305                         "Has leading or trailing hyphen");
306             }
307         }
308 
309         if (!isASCII) {
310             // step 4
311             // If all code points are inside 0..0x7f, skip to step 8
312             if (!isAllASCII(dest.toString())) {
313                 // step 5
314                 // verify the sequence does not begin with ACE prefix
315                 if(!startsWithACEPrefix(dest)){
316 
317                     // step 6
318                     // encode the sequence with punycode
319                     try {
320                         dest = Punycode.encode(dest, null);
321                     } catch (java.text.ParseException e) {
322                         throw new IllegalArgumentException(e);
323                     }
324 
325                     dest = toASCIILower(dest);
326 
327                     // step 7
328                     // prepend the ACE prefix
329                     dest.insert(0, ACE_PREFIX);
330                 } else {
331                     throw new IllegalArgumentException("The input starts with the ACE Prefix");
332                 }
333 
334             }
335         }
336 
337         // step 8
338         // the length must be inside 1..63
339         if (dest.length() > MAX_LABEL_LENGTH) {
340             throw new IllegalArgumentException("The label in the input is too long");
341         }
342 
343         return dest.toString();
344     }
345 
346     //
347     // toUnicode operation; should only apply to a single label
348     //
toUnicodeInternal(String label, int flag)349     private static String toUnicodeInternal(String label, int flag) {
350         boolean[] caseFlags = null;
351         StringBuffer dest;
352 
353         // step 1
354         // find out if all the codepoints in input are ASCII
355         boolean isASCII = isAllASCII(label);
356 
357         if(!isASCII){
358             // step 2
359             // perform the nameprep operation; flag ALLOW_UNASSIGNED is used here
360             try {
361                 UCharacterIterator iter = UCharacterIterator.getInstance(label);
362                 dest = namePrep.prepare(iter, flag);
363             } catch (Exception e) {
364                 // toUnicode never fails; if any step fails, return the input string
365                 return label;
366             }
367         } else {
368             dest = new StringBuffer(label);
369         }
370 
371         // step 3
372         // verify ACE Prefix
373         if(startsWithACEPrefix(dest)) {
374 
375             // step 4
376             // Remove the ACE Prefix
377             String temp = dest.substring(ACE_PREFIX_LENGTH, dest.length());
378 
379             try {
380                 // step 5
381                 // Decode using punycode
382                 StringBuffer decodeOut = Punycode.decode(new StringBuffer(temp), null);
383 
384                 // step 6
385                 // Apply toASCII
386                 String toASCIIOut = toASCII(decodeOut.toString(), flag);
387 
388                 // step 7
389                 // verify
390                 if (toASCIIOut.equalsIgnoreCase(dest.toString())) {
391                     // step 8
392                     // return output of step 5
393                     return decodeOut.toString();
394                 }
395             } catch (Exception ignored) {
396                 // no-op
397             }
398         }
399 
400         // just return the input
401         return label;
402     }
403 
404 
405     //
406     // LDH stands for "letter/digit/hyphen", with characters restricted to the
407     // 26-letter Latin alphabet <A-Z a-z>, the digits <0-9>, and the hyphen
408     // <->.
409     // Non LDH refers to characters in the ASCII range, but which are not
410     // letters, digits or the hypen.
411     //
412     // non-LDH = 0..0x2C, 0x2E..0x2F, 0x3A..0x40, 0x5B..0x60, 0x7B..0x7F
413     //
isNonLDHAsciiCodePoint(int ch)414     private static boolean isNonLDHAsciiCodePoint(int ch){
415         return (0x0000 <= ch && ch <= 0x002C) ||
416                (0x002E <= ch && ch <= 0x002F) ||
417                (0x003A <= ch && ch <= 0x0040) ||
418                (0x005B <= ch && ch <= 0x0060) ||
419                (0x007B <= ch && ch <= 0x007F);
420     }
421 
422     //
423     // search dots in a string and return the index of that character;
424     // or if there is no dots, return the length of input string
425     // dots might be: \u002E (full stop), \u3002 (ideographic full stop), \uFF0E (fullwidth full stop),
426     // and \uFF61 (halfwidth ideographic full stop).
427     //
searchDots(String s, int start)428     private static int searchDots(String s, int start) {
429         int i;
430         for (i = start; i < s.length(); i++) {
431             if (isLabelSeparator(s.charAt(i))) {
432                 break;
433             }
434         }
435 
436         return i;
437     }
438 
439     //
440     // to check if a string is a root label, ".".
441     //
isRootLabel(String s)442     private static boolean isRootLabel(String s) {
443         return (s.length() == 1 && isLabelSeparator(s.charAt(0)));
444     }
445 
446     //
447     // to check if a character is a label separator, i.e. a dot character.
448     //
isLabelSeparator(char c)449     private static boolean isLabelSeparator(char c) {
450         return (c == '.' || c == '\u3002' || c == '\uFF0E' || c == '\uFF61');
451     }
452 
453     //
454     // to check if a string only contains US-ASCII code point
455     //
isAllASCII(String input)456     private static boolean isAllASCII(String input) {
457         boolean isASCII = true;
458         for (int i = 0; i < input.length(); i++) {
459             int c = input.charAt(i);
460             if (c > 0x7F) {
461                 isASCII = false;
462                 break;
463             }
464         }
465         return isASCII;
466     }
467 
468     //
469     // to check if a string starts with ACE-prefix
470     //
startsWithACEPrefix(StringBuffer input)471     private static boolean startsWithACEPrefix(StringBuffer input){
472         boolean startsWithPrefix = true;
473 
474         if(input.length() < ACE_PREFIX_LENGTH){
475             return false;
476         }
477         for(int i = 0; i < ACE_PREFIX_LENGTH; i++){
478             if(toASCIILower(input.charAt(i)) != ACE_PREFIX.charAt(i)){
479                 startsWithPrefix = false;
480             }
481         }
482         return startsWithPrefix;
483     }
484 
toASCIILower(char ch)485     private static char toASCIILower(char ch){
486         if('A' <= ch && ch <= 'Z'){
487             return (char)(ch + 'a' - 'A');
488         }
489         return ch;
490     }
491 
toASCIILower(StringBuffer input)492     private static StringBuffer toASCIILower(StringBuffer input){
493         StringBuffer dest = new StringBuffer();
494         for(int i = 0; i < input.length();i++){
495             dest.append(toASCIILower(input.charAt(i)));
496         }
497         return dest;
498     }
499 }
500