1 /*
2  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
3  *
4  * This code is free software; you can redistribute it and/or modify it
5  * under the terms of the GNU General Public License version 2 only, as
6  * published by the Free Software Foundation.  Oracle designates this
7  * particular file as subject to the "Classpath" exception as provided
8  * by Oracle in the LICENSE file that accompanied this code.
9  *
10  * This code is distributed in the hope that it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
13  * version 2 for more details (a copy is included in the LICENSE file that
14  * accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License version
17  * 2 along with this work; if not, write to the Free Software Foundation,
18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19  *
20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21  * or visit www.oracle.com if you need additional information or have any
22  * questions.
23  */
24 /*
25 /*
26  *******************************************************************************
27  * Copyright (C) 2003-2004, International Business Machines Corporation and         *
28  * others. All Rights Reserved.                                                *
29  *******************************************************************************
30  */
31 //
32 // CHANGELOG
33 //      2005-05-19 Edward Wang
34 //          - copy this file from icu4jsrc_3_2/src/com/ibm/icu/text/StringPrep.java
35 //          - move from package com.ibm.icu.text to package sun.net.idn
36 //          - use ParseException instead of StringPrepParseException
37 //          - change 'Normalizer.getUnicodeVersion()' to 'NormalizerImpl.getUnicodeVersion()'
38 //          - remove all @deprecated tag to make compiler happy
39 //      2007-08-14 Martin Buchholz
40 //          - remove redundant casts
41 //
42 package sun.net.idn;
43 
44 import java.io.BufferedInputStream;
45 import java.io.ByteArrayInputStream;
46 import java.io.IOException;
47 import java.io.InputStream;
48 import java.text.ParseException;
49 
50 import sun.text.Normalizer;
51 import sun.text.normalizer.CharTrie;
52 import sun.text.normalizer.Trie;
53 import sun.text.normalizer.VersionInfo;
54 import sun.text.normalizer.UCharacter;
55 import sun.text.normalizer.UCharacterIterator;
56 import sun.text.normalizer.UTF16;
57 import sun.net.idn.UCharacterDirection;
58 import sun.net.idn.StringPrepDataReader;
59 
/**
 * StringPrep API implements the StringPrep framework as described by
 * <a href="http://www.ietf.org/rfc/rfc3454.txt">RFC 3454</a>.
 * StringPrep prepares Unicode strings for use in network protocols.
 * Profiles of StringPrep are sets of rules and data according to which the
 * Unicode strings are prepared. Each profile contains tables which describe
 * how a code point should be treated. The tables are broadly classified into
 * <ul>
 *     <li> Unassigned Table: Contains code points that are unassigned
 *          in the Unicode Version supported by StringPrep. Currently
 *          RFC 3454 supports Unicode 3.2. </li>
 *     <li> Prohibited Table: Contains code points that are prohibited from
 *          the output of the StringPrep processing function. </li>
 *     <li> Mapping Table: Contains code points that are deleted from the output or case mapped. </li>
 * </ul>
 *
 * The procedure for preparing Unicode strings:
 * <ol>
 *      <li> Map: For each character in the input, check if it has a mapping
 *           and, if so, replace it with its mapping. </li>
 *      <li> Normalize: Possibly normalize the result of step 1 using Unicode
 *           normalization. </li>
 *      <li> Prohibit: Check for any characters that are not allowed in the
 *           output.  If any are found, return an error.</li>
 *      <li> Check bidi: Possibly check for right-to-left characters, and if
 *           any are found, make sure that the whole string satisfies the
 *           requirements for bidirectional strings.  If the string does not
 *           satisfy the requirements for bidirectional strings, return an
 *           error.  </li>
 * </ol>
 * @author Ram Viswanadha
 * @draft ICU 2.8
 */
93 public final class StringPrep {
94     /**
95      * Option to prohibit processing of unassigned code points in the input
96      *
97      * @see   #prepare
98      * @draft ICU 2.8
99      */
100     public static final int DEFAULT = 0x0000;
101 
102     /**
103      * Option to allow processing of unassigned code points in the input
104      *
105      * @see   #prepare
106      * @draft ICU 2.8
107      */
108     public static final int ALLOW_UNASSIGNED = 0x0001;
109 
110     private static final int UNASSIGNED        = 0x0000;
111     private static final int MAP               = 0x0001;
112     private static final int PROHIBITED        = 0x0002;
113     private static final int DELETE            = 0x0003;
114     private static final int TYPE_LIMIT        = 0x0004;
115 
116     private static final int NORMALIZATION_ON  = 0x0001;
117     private static final int CHECK_BIDI_ON     = 0x0002;
118 
119     private static final int TYPE_THRESHOLD       = 0xFFF0;
120     private static final int MAX_INDEX_VALUE      = 0x3FBF;   /*16139*/
121     private static final int MAX_INDEX_TOP_LENGTH = 0x0003;
122 
123     /* indexes[] value names */
124     private static final int INDEX_TRIE_SIZE                  =  0; /* number of bytes in normalization trie */
125     private static final int INDEX_MAPPING_DATA_SIZE          =  1; /* The array that contains the mapping   */
126     private static final int NORM_CORRECTNS_LAST_UNI_VERSION  =  2; /* The index of Unicode version of last entry in NormalizationCorrections.txt */
127     private static final int ONE_UCHAR_MAPPING_INDEX_START    =  3; /* The starting index of 1 UChar mapping index in the mapping data array */
128     private static final int TWO_UCHARS_MAPPING_INDEX_START   =  4; /* The starting index of 2 UChars mapping index in the mapping data array */
129     private static final int THREE_UCHARS_MAPPING_INDEX_START =  5;
130     private static final int FOUR_UCHARS_MAPPING_INDEX_START  =  6;
131     private static final int OPTIONS                          =  7; /* Bit set of options to turn on in the profile */
132     private static final int INDEX_TOP                        = 16;                          /* changing this requires a new formatVersion */
133 
134 
135     /**
136      * Default buffer size of datafile
137      */
138     private static final int DATA_BUFFER_SIZE = 25000;
139 
140     /* Wrappers for Trie implementations */
141     private static final class StringPrepTrieImpl implements Trie.DataManipulate{
142         private CharTrie sprepTrie = null;
143        /**
144         * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's
145         * data the index array offset of the indexes for that lead surrogate.
146         * @param property data value for a surrogate from the trie, including
147         *        the folding offset
148         * @return data offset or 0 if there is no data for the lead surrogate
149         */
getFoldingOffset(int value)150          public int getFoldingOffset(int value){
151             return value;
152         }
153     }
154 
155     // CharTrie implementation for reading the trie data
156     private StringPrepTrieImpl sprepTrieImpl;
157     // Indexes read from the data file
158     private int[] indexes;
159     // mapping data read from the data file
160     private char[] mappingData;
161     // format version of the data file
162     private byte[] formatVersion;
163     // the version of Unicode supported by the data file
164     private VersionInfo sprepUniVer;
165     // the Unicode version of last entry in the
166     // NormalizationCorrections.txt file if normalization
167     // is turned on
168     private VersionInfo normCorrVer;
169     // Option to turn on Normalization
170     private boolean doNFKC;
171     // Option to turn on checking for BiDi rules
172     private boolean checkBiDi;
173 
174 
getCodePointValue(int ch)175     private char getCodePointValue(int ch){
176         return sprepTrieImpl.sprepTrie.getCodePointValue(ch);
177     }
178 
getVersionInfo(int comp)179     private static VersionInfo getVersionInfo(int comp){
180         int micro = comp & 0xFF;
181         int milli =(comp >> 8)  & 0xFF;
182         int minor =(comp >> 16) & 0xFF;
183         int major =(comp >> 24) & 0xFF;
184         return VersionInfo.getInstance(major,minor,milli,micro);
185     }
getVersionInfo(byte[] version)186     private static VersionInfo getVersionInfo(byte[] version){
187         if(version.length != 4){
188             return null;
189         }
190         return VersionInfo.getInstance((int)version[0],(int) version[1],(int) version[2],(int) version[3]);
191     }
192     /**
193      * Creates an StringPrep object after reading the input stream.
194      * The object does not hold a reference to the input steam, so the stream can be
195      * closed after the method returns.
196      *
197      * @param inputStream The stream for reading the StringPrep profile binarySun
198      * @throws IOException
199      * @draft ICU 2.8
200      */
StringPrep(InputStream inputStream)201     public StringPrep(InputStream inputStream) throws IOException{
202 
203         BufferedInputStream b = new BufferedInputStream(inputStream,DATA_BUFFER_SIZE);
204 
205         StringPrepDataReader reader = new StringPrepDataReader(b);
206 
207         // read the indexes
208         indexes = reader.readIndexes(INDEX_TOP);
209 
210         byte[] sprepBytes = new byte[indexes[INDEX_TRIE_SIZE]];
211 
212 
213         //indexes[INDEX_MAPPING_DATA_SIZE] store the size of mappingData in bytes
214         mappingData = new char[indexes[INDEX_MAPPING_DATA_SIZE]/2];
215         // load the rest of the data and initialize the data members
216         reader.read(sprepBytes,mappingData);
217 
218         sprepTrieImpl           = new StringPrepTrieImpl();
219         sprepTrieImpl.sprepTrie = new CharTrie( new ByteArrayInputStream(sprepBytes),sprepTrieImpl  );
220 
221         // get the data format version
222         formatVersion = reader.getDataFormatVersion();
223 
224         // get the options
225         doNFKC            = ((indexes[OPTIONS] & NORMALIZATION_ON) > 0);
226         checkBiDi         = ((indexes[OPTIONS] & CHECK_BIDI_ON) > 0);
227         sprepUniVer   = getVersionInfo(reader.getUnicodeVersion());
228         normCorrVer   = getVersionInfo(indexes[NORM_CORRECTNS_LAST_UNI_VERSION]);
229         VersionInfo normUniVer = UCharacter.getUnicodeVersion();
230         if(normUniVer.compareTo(sprepUniVer) < 0 && /* the Unicode version of SPREP file must be less than the Unicode Vesion of the normalization data */
231            normUniVer.compareTo(normCorrVer) < 0 && /* the Unicode version of the NormalizationCorrections.txt file should be less than the Unicode Vesion of the normalization data */
232            ((indexes[OPTIONS] & NORMALIZATION_ON) > 0) /* normalization turned on*/
233            ){
234             throw new IOException("Normalization Correction version not supported");
235         }
236         b.close();
237     }
238 
239     private static final class Values{
240         boolean isIndex;
241         int value;
242         int type;
reset()243         public void reset(){
244             isIndex = false;
245             value = 0;
246             type = -1;
247         }
248     }
249 
getValues(char trieWord,Values values)250     private static final void getValues(char trieWord,Values values){
251         values.reset();
252         if(trieWord == 0){
253             /*
254              * Initial value stored in the mapping table
255              * just return TYPE_LIMIT .. so that
256              * the source codepoint is copied to the destination
257              */
258             values.type = TYPE_LIMIT;
259         }else if(trieWord >= TYPE_THRESHOLD){
260             values.type = (trieWord - TYPE_THRESHOLD);
261         }else{
262             /* get the type */
263             values.type = MAP;
264             /* ascertain if the value is index or delta */
265             if((trieWord & 0x02)>0){
266                 values.isIndex = true;
267                 values.value = trieWord  >> 2; //mask off the lower 2 bits and shift
268 
269             }else{
270                 values.isIndex = false;
271                 values.value = (trieWord<<16)>>16;
272                 values.value =  (values.value >> 2);
273 
274             }
275 
276             if((trieWord>>2) == MAX_INDEX_VALUE){
277                 values.type = DELETE;
278                 values.isIndex = false;
279                 values.value = 0;
280             }
281         }
282     }
283 
284 
285 
map( UCharacterIterator iter, int options)286     private StringBuffer map( UCharacterIterator iter, int options)
287                             throws ParseException {
288 
289         Values val = new Values();
290         char result = 0;
291         int ch  = UCharacterIterator.DONE;
292         StringBuffer dest = new StringBuffer();
293         boolean allowUnassigned = ((options & ALLOW_UNASSIGNED)>0);
294 
295         while((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){
296 
297             result = getCodePointValue(ch);
298             getValues(result,val);
299 
300             // check if the source codepoint is unassigned
301             if(val.type == UNASSIGNED && allowUnassigned == false){
302                  throw new ParseException("An unassigned code point was found in the input " +
303                                           iter.getText(), iter.getIndex());
304             }else if((val.type == MAP)){
305                 int index, length;
306 
307                 if(val.isIndex){
308                     index = val.value;
309                     if(index >= indexes[ONE_UCHAR_MAPPING_INDEX_START] &&
310                              index < indexes[TWO_UCHARS_MAPPING_INDEX_START]){
311                         length = 1;
312                     }else if(index >= indexes[TWO_UCHARS_MAPPING_INDEX_START] &&
313                              index < indexes[THREE_UCHARS_MAPPING_INDEX_START]){
314                         length = 2;
315                     }else if(index >= indexes[THREE_UCHARS_MAPPING_INDEX_START] &&
316                              index < indexes[FOUR_UCHARS_MAPPING_INDEX_START]){
317                         length = 3;
318                     }else{
319                         length = mappingData[index++];
320                     }
321                     /* copy mapping to destination */
322                     dest.append(mappingData,index,length);
323                     continue;
324 
325                 }else{
326                     ch -= val.value;
327                 }
328             }else if(val.type == DELETE){
329                 // just consume the codepoint and contine
330                 continue;
331             }
332             //copy the source into destination
333             UTF16.append(dest,ch);
334         }
335 
336         return dest;
337     }
338 
339 
normalize(StringBuffer src)340     private StringBuffer normalize(StringBuffer src){
341         /*
342          * Option UNORM_BEFORE_PRI_29:
343          *
344          * IDNA as interpreted by IETF members (see unicode mailing list 2004H1)
345          * requires strict adherence to Unicode 3.2 normalization,
346          * including buggy composition from before fixing Public Review Issue #29.
347          * Note that this results in some valid but nonsensical text to be
348          * either corrupted or rejected, depending on the text.
349          * See http://www.unicode.org/review/resolved-pri.html#pri29
350          * See unorm.cpp and cnormtst.c
351          */
352         return new StringBuffer(
353             Normalizer.normalize(
354                 src.toString(),
355                 java.text.Normalizer.Form.NFKC,
356                 Normalizer.UNICODE_3_2));
357     }
358     /*
359     boolean isLabelSeparator(int ch){
360         int result = getCodePointValue(ch);
361         if( (result & 0x07)  == LABEL_SEPARATOR){
362             return true;
363         }
364         return false;
365     }
366     */
367      /*
368        1) Map -- For each character in the input, check if it has a mapping
369           and, if so, replace it with its mapping.
370 
371        2) Normalize -- Possibly normalize the result of step 1 using Unicode
372           normalization.
373 
374        3) Prohibit -- Check for any characters that are not allowed in the
375           output.  If any are found, return an error.
376 
377        4) Check bidi -- Possibly check for right-to-left characters, and if
378           any are found, make sure that the whole string satisfies the
379           requirements for bidirectional strings.  If the string does not
380           satisfy the requirements for bidirectional strings, return an
381           error.
382           [Unicode3.2] defines several bidirectional categories; each character
383            has one bidirectional category assigned to it.  For the purposes of
384            the requirements below, an "RandALCat character" is a character that
385            has Unicode bidirectional categories "R" or "AL"; an "LCat character"
386            is a character that has Unicode bidirectional category "L".  Note
387 
388 
389            that there are many characters which fall in neither of the above
390            definitions; Latin digits (<U+0030> through <U+0039>) are examples of
391            this because they have bidirectional category "EN".
392 
393            In any profile that specifies bidirectional character handling, all
394            three of the following requirements MUST be met:
395 
396            1) The characters in section 5.8 MUST be prohibited.
397 
398            2) If a string contains any RandALCat character, the string MUST NOT
399               contain any LCat character.
400 
401            3) If a string contains any RandALCat character, a RandALCat
402               character MUST be the first character of the string, and a
403               RandALCat character MUST be the last character of the string.
404     */
405     /**
406      * Prepare the input buffer for use in applications with the given profile. This operation maps, normalizes(NFKC),
407      * checks for prohited and BiDi characters in the order defined by RFC 3454
408      * depending on the options specified in the profile.
409      *
410      * @param src           A UCharacterIterator object containing the source string
411      * @param options       A bit set of options:
412      *
413      *  - StringPrep.NONE               Prohibit processing of unassigned code points in the input
414      *
415      *  - StringPrep.ALLOW_UNASSIGNED   Treat the unassigned code points are in the input
416      *                                  as normal Unicode code points.
417      *
418      * @return StringBuffer A StringBuffer containing the output
419      * @throws ParseException
420      * @draft ICU 2.8
421      */
prepare(UCharacterIterator src, int options)422     public StringBuffer prepare(UCharacterIterator src, int options)
423                         throws ParseException{
424 
425         // map
426         StringBuffer mapOut = map(src,options);
427         StringBuffer normOut = mapOut;// initialize
428 
429         if(doNFKC){
430             // normalize
431             normOut = normalize(mapOut);
432         }
433 
434         int ch;
435         char result;
436         UCharacterIterator iter = UCharacterIterator.getInstance(normOut);
437         Values val = new Values();
438         int direction=UCharacterDirection.CHAR_DIRECTION_COUNT,
439             firstCharDir=UCharacterDirection.CHAR_DIRECTION_COUNT;
440         int rtlPos=-1, ltrPos=-1;
441         boolean rightToLeft=false, leftToRight=false;
442 
443         while((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){
444             result = getCodePointValue(ch);
445             getValues(result,val);
446 
447             if(val.type == PROHIBITED ){
448                 throw new ParseException("A prohibited code point was found in the input" +
449                                          iter.getText(), val.value);
450             }
451 
452             direction = UCharacter.getDirection(ch);
453             if(firstCharDir == UCharacterDirection.CHAR_DIRECTION_COUNT){
454                 firstCharDir = direction;
455             }
456             if(direction == UCharacterDirection.LEFT_TO_RIGHT){
457                 leftToRight = true;
458                 ltrPos = iter.getIndex()-1;
459             }
460             if(direction == UCharacterDirection.RIGHT_TO_LEFT || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC){
461                 rightToLeft = true;
462                 rtlPos = iter.getIndex()-1;
463             }
464         }
465         if(checkBiDi == true){
466             // satisfy 2
467             if( leftToRight == true && rightToLeft == true){
468                 throw new ParseException("The input does not conform to the rules for BiDi code points." +
469                                          iter.getText(),
470                                          (rtlPos>ltrPos) ? rtlPos : ltrPos);
471              }
472 
473             //satisfy 3
474             if( rightToLeft == true &&
475                 !((firstCharDir == UCharacterDirection.RIGHT_TO_LEFT || firstCharDir == UCharacterDirection.RIGHT_TO_LEFT_ARABIC) &&
476                 (direction == UCharacterDirection.RIGHT_TO_LEFT || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC))
477               ){
478                 throw new ParseException("The input does not conform to the rules for BiDi code points." +
479                                          iter.getText(),
480                                          (rtlPos>ltrPos) ? rtlPos : ltrPos);
481             }
482         }
483         return normOut;
484 
485       }
486 }
487