1 /* 2 * Copyright (c) 1996, 2019, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 /* 27 * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved 28 * (C) Copyright IBM Corp. 1996, 1997 - All Rights Reserved 29 * 30 * The original version of this source code and documentation is copyrighted 31 * and owned by Taligent, Inc., a wholly-owned subsidiary of IBM. These 32 * materials are provided under terms of a License Agreement between Taligent 33 * and Sun. This technology is protected by multiple US and International 34 * patents. This notice and attribution to Taligent may not be removed. 35 * Taligent is a registered trademark of Taligent, Inc. 36 * 37 */ 38 39 package java.text; 40 41 import java.util.ArrayList; 42 43 /** 44 * Utility class for normalizing and merging patterns for collation. 45 * Patterns are strings of the form <entry>*, where <entry> has the 46 * form: 47 * <pattern> := <entry>* 48 * <entry> := <separator><chars>{"/"<extension>} 49 * <separator> := "=", ",", ";", "<", "&" 50 * <chars>, and <extension> are both arbitrary strings. 51 * unquoted whitespaces are ignored. 52 * 'xxx' can be used to quote characters 53 * One difference from Collator is that & is used to reset to a current 54 * point. Or, in other words, it introduces a new sequence which is to 55 * be added to the old. 56 * That is: "a < b < c < d" is the same as "a < b & b < c & c < d" OR 57 * "a < b < d & b < c" 58 * XXX: make '' be a single quote. 59 * @see PatternEntry 60 * @author Mark Davis, Helena Shih 61 */ 62 63 final class MergeCollation { 64 65 /** 66 * Creates from a pattern 67 * @throws ParseException If the input pattern is incorrect. 68 */ MergeCollation(String pattern)69 public MergeCollation(String pattern) throws ParseException 70 { 71 for (int i = 0; i < statusArray.length; i++) 72 statusArray[i] = 0; 73 setPattern(pattern); 74 } 75 76 /** 77 * recovers current pattern 78 */ getPattern()79 public String getPattern() { 80 return getPattern(true); 81 } 82 83 /** 84 * recovers current pattern. 85 * @param withWhiteSpace puts spacing around the entries, and \n 86 * before & and < 87 */ getPattern(boolean withWhiteSpace)88 public String getPattern(boolean withWhiteSpace) { 89 StringBuffer result = new StringBuffer(); 90 PatternEntry tmp = null; 91 ArrayList<PatternEntry> extList = null; 92 int i; 93 for (i = 0; i < patterns.size(); ++i) { 94 PatternEntry entry = patterns.get(i); 95 if (!entry.extension.isEmpty()) { 96 if (extList == null) 97 extList = new ArrayList<>(); 98 extList.add(entry); 99 } else { 100 if (extList != null) { 101 PatternEntry last = findLastWithNoExtension(i-1); 102 for (int j = extList.size() - 1; j >= 0 ; j--) { 103 tmp = extList.get(j); 104 tmp.addToBuffer(result, false, withWhiteSpace, last); 105 } 106 extList = null; 107 } 108 entry.addToBuffer(result, false, withWhiteSpace, null); 109 } 110 } 111 if (extList != null) { 112 PatternEntry last = findLastWithNoExtension(i-1); 113 for (int j = extList.size() - 1; j >= 0 ; j--) { 114 tmp = extList.get(j); 115 tmp.addToBuffer(result, false, withWhiteSpace, last); 116 } 117 extList = null; 118 } 119 return result.toString(); 120 } 121 findLastWithNoExtension(int i)122 private final PatternEntry findLastWithNoExtension(int i) { 123 for (--i;i >= 0; --i) { 124 PatternEntry entry = patterns.get(i); 125 if (entry.extension.isEmpty()) { 126 return entry; 127 } 128 } 129 return null; 130 } 131 132 /** 133 * emits the pattern for collation builder. 134 * @return emits the string in the format understable to the collation 135 * builder. 136 */ emitPattern()137 public String emitPattern() { 138 return emitPattern(true); 139 } 140 141 /** 142 * emits the pattern for collation builder. 143 * @param withWhiteSpace puts spacing around the entries, and \n 144 * before & and < 145 * @return emits the string in the format understable to the collation 146 * builder. 147 */ emitPattern(boolean withWhiteSpace)148 public String emitPattern(boolean withWhiteSpace) { 149 StringBuffer result = new StringBuffer(); 150 for (int i = 0; i < patterns.size(); ++i) 151 { 152 PatternEntry entry = patterns.get(i); 153 if (entry != null) { 154 entry.addToBuffer(result, true, withWhiteSpace, null); 155 } 156 } 157 return result.toString(); 158 } 159 160 /** 161 * sets the pattern. 162 */ setPattern(String pattern)163 public void setPattern(String pattern) throws ParseException 164 { 165 patterns.clear(); 166 addPattern(pattern); 167 } 168 169 /** 170 * adds a pattern to the current one. 171 * @param pattern the new pattern to be added 172 */ addPattern(String pattern)173 public void addPattern(String pattern) throws ParseException 174 { 175 if (pattern == null) 176 return; 177 178 PatternEntry.Parser parser = new PatternEntry.Parser(pattern); 179 180 PatternEntry entry = parser.next(); 181 while (entry != null) { 182 fixEntry(entry); 183 entry = parser.next(); 184 } 185 } 186 187 /** 188 * gets count of separate entries 189 * @return the size of pattern entries 190 */ getCount()191 public int getCount() { 192 return patterns.size(); 193 } 194 195 /** 196 * gets count of separate entries 197 * @param index the offset of the desired pattern entry 198 * @return the requested pattern entry 199 */ getItemAt(int index)200 public PatternEntry getItemAt(int index) { 201 return patterns.get(index); 202 } 203 204 //============================================================ 205 // privates 206 //============================================================ 207 ArrayList<PatternEntry> patterns = new ArrayList<>(); // a list of PatternEntries 208 209 private transient PatternEntry saveEntry = null; 210 private transient PatternEntry lastEntry = null; 211 212 // This is really used as a local variable inside fixEntry, but we cache 213 // it here to avoid newing it up every time the method is called. 214 private transient StringBuffer excess = new StringBuffer(); 215 216 // 217 // When building a MergeCollation, we need to do lots of searches to see 218 // whether a given entry is already in the table. Since we're using an 219 // array, this would make the algorithm O(N*N). To speed things up, we 220 // use this bit array to remember whether the array contains any entries 221 // starting with each Unicode character. If not, we can avoid the search. 222 // Using BitSet would make this easier, but it's significantly slower. 223 // 224 private transient byte[] statusArray = new byte[8192]; 225 private final byte BITARRAYMASK = (byte)0x1; 226 private final int BYTEPOWER = 3; 227 private final int BYTEMASK = (1 << BYTEPOWER) - 1; 228 229 /* 230 If the strength is RESET, then just change the lastEntry to 231 be the current. (If the current is not in patterns, signal an error). 232 If not, then remove the current entry, and add it after lastEntry 233 (which is usually at the end). 234 */ fixEntry(PatternEntry newEntry)235 private final void fixEntry(PatternEntry newEntry) throws ParseException 236 { 237 // check to see whether the new entry has the same characters as the previous 238 // entry did (this can happen when a pattern declaring a difference between two 239 // strings that are canonically equivalent is normalized). If so, and the strength 240 // is anything other than IDENTICAL or RESET, throw an exception (you can't 241 // declare a string to be unequal to itself). --rtg 5/24/99 242 if (lastEntry != null && newEntry.chars.equals(lastEntry.chars) 243 && newEntry.extension.equals(lastEntry.extension)) { 244 if (newEntry.strength != Collator.IDENTICAL 245 && newEntry.strength != PatternEntry.RESET) { 246 throw new ParseException("The entries " + lastEntry + " and " 247 + newEntry + " are adjacent in the rules, but have conflicting " 248 + "strengths: A character can't be unequal to itself.", -1); 249 } else { 250 // otherwise, just skip this entry and behave as though you never saw it 251 return; 252 } 253 } 254 255 boolean changeLastEntry = true; 256 if (newEntry.strength != PatternEntry.RESET) { 257 int oldIndex = -1; 258 259 if ((newEntry.chars.length() == 1)) { 260 261 char c = newEntry.chars.charAt(0); 262 int statusIndex = c >> BYTEPOWER; 263 byte bitClump = statusArray[statusIndex]; 264 byte setBit = (byte)(BITARRAYMASK << (c & BYTEMASK)); 265 266 if (bitClump != 0 && (bitClump & setBit) != 0) { 267 oldIndex = patterns.lastIndexOf(newEntry); 268 } else { 269 // We're going to add an element that starts with this 270 // character, so go ahead and set its bit. 271 statusArray[statusIndex] = (byte)(bitClump | setBit); 272 } 273 } else { 274 oldIndex = patterns.lastIndexOf(newEntry); 275 } 276 if (oldIndex != -1) { 277 patterns.remove(oldIndex); 278 } 279 280 excess.setLength(0); 281 int lastIndex = findLastEntry(lastEntry, excess); 282 283 if (excess.length() != 0) { 284 newEntry.extension = excess + newEntry.extension; 285 if (lastIndex != patterns.size()) { 286 lastEntry = saveEntry; 287 changeLastEntry = false; 288 } 289 } 290 if (lastIndex == patterns.size()) { 291 patterns.add(newEntry); 292 saveEntry = newEntry; 293 } else { 294 patterns.add(lastIndex, newEntry); 295 } 296 } 297 if (changeLastEntry) { 298 lastEntry = newEntry; 299 } 300 } 301 findLastEntry(PatternEntry entry, StringBuffer excessChars)302 private final int findLastEntry(PatternEntry entry, 303 StringBuffer excessChars) throws ParseException 304 { 305 if (entry == null) 306 return 0; 307 308 if (entry.strength != PatternEntry.RESET) { 309 // Search backwards for string that contains this one; 310 // most likely entry is last one 311 312 int oldIndex = -1; 313 if ((entry.chars.length() == 1)) { 314 int index = entry.chars.charAt(0) >> BYTEPOWER; 315 if ((statusArray[index] & 316 (BITARRAYMASK << (entry.chars.charAt(0) & BYTEMASK))) != 0) { 317 oldIndex = patterns.lastIndexOf(entry); 318 } 319 } else { 320 oldIndex = patterns.lastIndexOf(entry); 321 } 322 if ((oldIndex == -1)) 323 throw new ParseException("couldn't find last entry: " 324 + entry, oldIndex); 325 return oldIndex + 1; 326 } else { 327 int i; 328 for (i = patterns.size() - 1; i >= 0; --i) { 329 PatternEntry e = patterns.get(i); 330 if (e.chars.regionMatches(0,entry.chars,0, 331 e.chars.length())) { 332 excessChars.append(entry.chars, e.chars.length(), 333 entry.chars.length()); 334 break; 335 } 336 } 337 if (i == -1) 338 throw new ParseException("couldn't find: " + entry, i); 339 return i + 1; 340 } 341 } 342 } 343