1 /* 2 * Copyright (c) 1999, 2012, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 /* 27 * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved 28 * (C) Copyright IBM Corp. 1996-1998 - All Rights Reserved 29 * 30 * The original version of this source code and documentation is copyrighted 31 * and owned by Taligent, Inc., a wholly-owned subsidiary of IBM. These 32 * materials are provided under terms of a License Agreement between Taligent 33 * and Sun. This technology is protected by multiple US and International 34 * patents. This notice and attribution to Taligent may not be removed. 35 * Taligent is a registered trademark of Taligent, Inc. 36 * 37 */ 38 39 package java.text; 40 41 import java.util.Vector; 42 import sun.text.UCompactIntArray; 43 import sun.text.IntHashtable; 44 import sun.text.ComposedCharIter; 45 import sun.text.CollatorUtilities; 46 import sun.text.normalizer.NormalizerImpl; 47 48 /** 49 * This class contains all the code to parse a RuleBasedCollator pattern 50 * and build a RBCollationTables object from it. A particular instance 51 * of tis class exists only during the actual build process-- once an 52 * RBCollationTables object has been built, the RBTableBuilder object 53 * goes away. This object carries all of the state which is only needed 54 * during the build process, plus a "shadow" copy of all of the state 55 * that will go into the tables object itself. This object communicates 56 * with RBCollationTables through a separate class, RBCollationTables.BuildAPI, 57 * this is an inner class of RBCollationTables and provides a separate 58 * private API for communication with RBTableBuilder. 59 * This class isn't just an inner class of RBCollationTables itself because 60 * of its large size. For source-code readability, it seemed better for the 61 * builder to have its own source file. 62 */ 63 final class RBTableBuilder { 64 RBTableBuilder(RBCollationTables.BuildAPI tables)65 public RBTableBuilder(RBCollationTables.BuildAPI tables) { 66 this.tables = tables; 67 } 68 69 /** 70 * Create a table-based collation object with the given rules. 71 * This is the main function that actually builds the tables and 72 * stores them back in the RBCollationTables object. It is called 73 * ONLY by the RBCollationTables constructor. 74 * @see RuleBasedCollator#RuleBasedCollator 75 * @exception ParseException If the rules format is incorrect. 76 */ 77 build(String pattern, int decmp)78 public void build(String pattern, int decmp) throws ParseException { 79 String expChars; 80 String groupChars; 81 if (pattern.isEmpty()) 82 throw new ParseException("Build rules empty.", 0); 83 84 // This array maps Unicode characters to their collation ordering 85 mapping = new UCompactIntArray(RBCollationTables.UNMAPPED); 86 // Normalize the build rules. Find occurances of all decomposed characters 87 // and normalize the rules before feeding into the builder. By "normalize", 88 // we mean that all precomposed Unicode characters must be converted into 89 // a base character and one or more combining characters (such as accents). 90 // When there are multiple combining characters attached to a base character, 91 // the combining characters must be in their canonical order 92 // 93 // sherman/Note: 94 //(1)decmp will be NO_DECOMPOSITION only in ko locale to prevent decompose 95 //hangual syllables to jamos, so we can actually just call decompose with 96 //normalizer's IGNORE_HANGUL option turned on 97 // 98 //(2)just call the "special version" in NormalizerImpl directly 99 //pattern = Normalizer.decompose(pattern, false, Normalizer.IGNORE_HANGUL, true); 100 // 101 //Normalizer.Mode mode = CollatorUtilities.toNormalizerMode(decmp); 102 //pattern = Normalizer.normalize(pattern, mode, 0, true); 103 104 pattern = NormalizerImpl.canonicalDecomposeWithSingleQuotation(pattern); 105 106 // Build the merged collation entries 107 // Since rules can be specified in any order in the string 108 // (e.g. "c , C < d , D < e , E .... C < CH") 109 // this splits all of the rules in the string out into separate 110 // objects and then sorts them. In the above example, it merges the 111 // "C < CH" rule in just before the "C < D" rule. 112 // 113 114 mPattern = new MergeCollation(pattern); 115 116 int order = 0; 117 118 // Now walk though each entry and add it to my own tables 119 for (int i = 0; i < mPattern.getCount(); ++i) { 120 PatternEntry entry = mPattern.getItemAt(i); 121 if (entry != null) { 122 groupChars = entry.getChars(); 123 if (groupChars.length() > 1) { 124 switch(groupChars.charAt(groupChars.length()-1)) { 125 case '@': 126 frenchSec = true; 127 groupChars = groupChars.substring(0, groupChars.length()-1); 128 break; 129 case '!': 130 seAsianSwapping = true; 131 groupChars = groupChars.substring(0, groupChars.length()-1); 132 break; 133 } 134 } 135 136 order = increment(entry.getStrength(), order); 137 expChars = entry.getExtension(); 138 139 if (!expChars.isEmpty()) { 140 addExpandOrder(groupChars, expChars, order); 141 } else if (groupChars.length() > 1) { 142 char ch = groupChars.charAt(0); 143 if (Character.isHighSurrogate(ch) && groupChars.length() == 2) { 144 addOrder(Character.toCodePoint(ch, groupChars.charAt(1)), order); 145 } else { 146 addContractOrder(groupChars, order); 147 } 148 } else { 149 char ch = groupChars.charAt(0); 150 addOrder(ch, order); 151 } 152 } 153 } 154 addComposedChars(); 155 156 commit(); 157 mapping.compact(); 158 /* 159 System.out.println("mappingSize=" + mapping.getKSize()); 160 for (int j = 0; j < 0xffff; j++) { 161 int value = mapping.elementAt(j); 162 if (value != RBCollationTables.UNMAPPED) 163 System.out.println("index=" + Integer.toString(j, 16) 164 + ", value=" + Integer.toString(value, 16)); 165 } 166 */ 167 tables.fillInTables(frenchSec, seAsianSwapping, mapping, contractTable, expandTable, 168 contractFlags, maxSecOrder, maxTerOrder); 169 } 170 171 /** Add expanding entries for pre-composed unicode characters so that this 172 * collator can be used reasonably well with decomposition turned off. 173 */ addComposedChars()174 private void addComposedChars() throws ParseException { 175 // Iterate through all of the pre-composed characters in Unicode 176 ComposedCharIter iter = new ComposedCharIter(); 177 int c; 178 while ((c = iter.next()) != ComposedCharIter.DONE) { 179 if (getCharOrder(c) == RBCollationTables.UNMAPPED) { 180 // 181 // We don't already have an ordering for this pre-composed character. 182 // 183 // First, see if the decomposed string is already in our 184 // tables as a single contracting-string ordering. 185 // If so, just map the precomposed character to that order. 186 // 187 // TODO: What we should really be doing here is trying to find the 188 // longest initial substring of the decomposition that is present 189 // in the tables as a contracting character sequence, and find its 190 // ordering. Then do this recursively with the remaining chars 191 // so that we build a list of orderings, and add that list to 192 // the expansion table. 193 // That would be more correct but also significantly slower, so 194 // I'm not totally sure it's worth doing. 195 // 196 String s = iter.decomposition(); 197 198 //sherman/Note: if this is 1 character decomposed string, the 199 //only thing need to do is to check if this decomposed character 200 //has an entry in our order table, this order is not necessary 201 //to be a contraction order, if it does have one, add an entry 202 //for the precomposed character by using the same order, the 203 //previous impl unnecessarily adds a single character expansion 204 //entry. 205 if (s.length() == 1) { 206 int order = getCharOrder(s.charAt(0)); 207 if (order != RBCollationTables.UNMAPPED) { 208 addOrder(c, order); 209 } 210 continue; 211 } else if (s.length() == 2) { 212 char ch0 = s.charAt(0); 213 if (Character.isHighSurrogate(ch0)) { 214 int order = getCharOrder(s.codePointAt(0)); 215 if (order != RBCollationTables.UNMAPPED) { 216 addOrder(c, order); 217 } 218 continue; 219 } 220 } 221 int contractOrder = getContractOrder(s); 222 if (contractOrder != RBCollationTables.UNMAPPED) { 223 addOrder(c, contractOrder); 224 } else { 225 // 226 // We don't have a contracting ordering for the entire string 227 // that results from the decomposition, but if we have orders 228 // for each individual character, we can add an expanding 229 // table entry for the pre-composed character 230 // 231 boolean allThere = true; 232 for (int i = 0; i < s.length(); i++) { 233 if (getCharOrder(s.charAt(i)) == RBCollationTables.UNMAPPED) { 234 allThere = false; 235 break; 236 } 237 } 238 if (allThere) { 239 addExpandOrder(c, s, RBCollationTables.UNMAPPED); 240 } 241 } 242 } 243 } 244 } 245 246 /** 247 * Look up for unmapped values in the expanded character table. 248 * 249 * When the expanding character tables are built by addExpandOrder, 250 * it doesn't know what the final ordering of each character 251 * in the expansion will be. Instead, it just puts the raw character 252 * code into the table, adding CHARINDEX as a flag. Now that we've 253 * finished building the mapping table, we can go back and look up 254 * that character to see what its real collation order is and 255 * stick that into the expansion table. That lets us avoid doing 256 * a two-stage lookup later. 257 */ commit()258 private final void commit() 259 { 260 if (expandTable != null) { 261 for (int i = 0; i < expandTable.size(); i++) { 262 int[] valueList = expandTable.elementAt(i); 263 for (int j = 0; j < valueList.length; j++) { 264 int order = valueList[j]; 265 if (order < RBCollationTables.EXPANDCHARINDEX && order > CHARINDEX) { 266 // found a expanding character that isn't filled in yet 267 int ch = order - CHARINDEX; 268 269 // Get the real values for the non-filled entry 270 int realValue = getCharOrder(ch); 271 272 if (realValue == RBCollationTables.UNMAPPED) { 273 // The real value is still unmapped, maybe it's ignorable 274 valueList[j] = IGNORABLEMASK & ch; 275 } else { 276 // just fill in the value 277 valueList[j] = realValue; 278 } 279 } 280 } 281 } 282 } 283 } 284 /** 285 * Increment of the last order based on the comparison level. 286 */ increment(int aStrength, int lastValue)287 private final int increment(int aStrength, int lastValue) 288 { 289 switch(aStrength) 290 { 291 case Collator.PRIMARY: 292 // increment priamry order and mask off secondary and tertiary difference 293 lastValue += PRIMARYORDERINCREMENT; 294 lastValue &= RBCollationTables.PRIMARYORDERMASK; 295 isOverIgnore = true; 296 break; 297 case Collator.SECONDARY: 298 // increment secondary order and mask off tertiary difference 299 lastValue += SECONDARYORDERINCREMENT; 300 lastValue &= RBCollationTables.SECONDARYDIFFERENCEONLY; 301 // record max # of ignorable chars with secondary difference 302 if (!isOverIgnore) 303 maxSecOrder++; 304 break; 305 case Collator.TERTIARY: 306 // increment tertiary order 307 lastValue += TERTIARYORDERINCREMENT; 308 // record max # of ignorable chars with tertiary difference 309 if (!isOverIgnore) 310 maxTerOrder++; 311 break; 312 } 313 return lastValue; 314 } 315 316 /** 317 * Adds a character and its designated order into the collation table. 318 */ addOrder(int ch, int anOrder)319 private final void addOrder(int ch, int anOrder) 320 { 321 // See if the char already has an order in the mapping table 322 int order = mapping.elementAt(ch); 323 324 if (order >= RBCollationTables.CONTRACTCHARINDEX) { 325 // There's already an entry for this character that points to a contracting 326 // character table. Instead of adding the character directly to the mapping 327 // table, we must add it to the contract table instead. 328 int length = 1; 329 if (Character.isSupplementaryCodePoint(ch)) { 330 length = Character.toChars(ch, keyBuf, 0); 331 } else { 332 keyBuf[0] = (char)ch; 333 } 334 addContractOrder(new String(keyBuf, 0, length), anOrder); 335 } else { 336 // add the entry to the mapping table, 337 // the same later entry replaces the previous one 338 mapping.setElementAt(ch, anOrder); 339 } 340 } 341 addContractOrder(String groupChars, int anOrder)342 private final void addContractOrder(String groupChars, int anOrder) { 343 addContractOrder(groupChars, anOrder, true); 344 } 345 346 /** 347 * Adds the contracting string into the collation table. 348 */ addContractOrder(String groupChars, int anOrder, boolean fwd)349 private final void addContractOrder(String groupChars, int anOrder, 350 boolean fwd) 351 { 352 if (contractTable == null) { 353 contractTable = new Vector<>(INITIALTABLESIZE); 354 } 355 356 //initial character 357 int ch = groupChars.codePointAt(0); 358 /* 359 char ch0 = groupChars.charAt(0); 360 int ch = Character.isHighSurrogate(ch0)? 361 Character.toCodePoint(ch0, groupChars.charAt(1)):ch0; 362 */ 363 // See if the initial character of the string already has a contract table. 364 int entry = mapping.elementAt(ch); 365 Vector<EntryPair> entryTable = getContractValuesImpl(entry - RBCollationTables.CONTRACTCHARINDEX); 366 367 if (entryTable == null) { 368 // We need to create a new table of contract entries for this base char 369 int tableIndex = RBCollationTables.CONTRACTCHARINDEX + contractTable.size(); 370 entryTable = new Vector<>(INITIALTABLESIZE); 371 contractTable.addElement(entryTable); 372 373 // Add the initial character's current ordering first. then 374 // update its mapping to point to this contract table 375 entryTable.addElement(new EntryPair(groupChars.substring(0,Character.charCount(ch)), entry)); 376 mapping.setElementAt(ch, tableIndex); 377 } 378 379 // Now add (or replace) this string in the table 380 int index = RBCollationTables.getEntry(entryTable, groupChars, fwd); 381 if (index != RBCollationTables.UNMAPPED) { 382 EntryPair pair = entryTable.elementAt(index); 383 pair.value = anOrder; 384 } else { 385 EntryPair pair = entryTable.lastElement(); 386 387 // NOTE: This little bit of logic is here to speed CollationElementIterator 388 // .nextContractChar(). This code ensures that the longest sequence in 389 // this list is always the _last_ one in the list. This keeps 390 // nextContractChar() from having to search the entire list for the longest 391 // sequence. 392 if (groupChars.length() > pair.entryName.length()) { 393 entryTable.addElement(new EntryPair(groupChars, anOrder, fwd)); 394 } else { 395 entryTable.insertElementAt(new EntryPair(groupChars, anOrder, 396 fwd), entryTable.size() - 1); 397 } 398 } 399 400 // If this was a forward mapping for a contracting string, also add a 401 // reverse mapping for it, so that CollationElementIterator.previous 402 // can work right 403 if (fwd && groupChars.length() > 1) { 404 addContractFlags(groupChars); 405 addContractOrder(new StringBuffer(groupChars).reverse().toString(), 406 anOrder, false); 407 } 408 } 409 410 /** 411 * If the given string has been specified as a contracting string 412 * in this collation table, return its ordering. 413 * Otherwise return UNMAPPED. 414 */ getContractOrder(String groupChars)415 private int getContractOrder(String groupChars) 416 { 417 int result = RBCollationTables.UNMAPPED; 418 if (contractTable != null) { 419 int ch = groupChars.codePointAt(0); 420 /* 421 char ch0 = groupChars.charAt(0); 422 int ch = Character.isHighSurrogate(ch0)? 423 Character.toCodePoint(ch0, groupChars.charAt(1)):ch0; 424 */ 425 Vector<EntryPair> entryTable = getContractValues(ch); 426 if (entryTable != null) { 427 int index = RBCollationTables.getEntry(entryTable, groupChars, true); 428 if (index != RBCollationTables.UNMAPPED) { 429 EntryPair pair = entryTable.elementAt(index); 430 result = pair.value; 431 } 432 } 433 } 434 return result; 435 } 436 getCharOrder(int ch)437 private final int getCharOrder(int ch) { 438 int order = mapping.elementAt(ch); 439 440 if (order >= RBCollationTables.CONTRACTCHARINDEX) { 441 Vector<EntryPair> groupList = getContractValuesImpl(order - RBCollationTables.CONTRACTCHARINDEX); 442 EntryPair pair = groupList.firstElement(); 443 order = pair.value; 444 } 445 return order; 446 } 447 448 /** 449 * Get the entry of hash table of the contracting string in the collation 450 * table. 451 * @param ch the starting character of the contracting string 452 */ getContractValues(int ch)453 private Vector<EntryPair> getContractValues(int ch) 454 { 455 int index = mapping.elementAt(ch); 456 return getContractValuesImpl(index - RBCollationTables.CONTRACTCHARINDEX); 457 } 458 getContractValuesImpl(int index)459 private Vector<EntryPair> getContractValuesImpl(int index) 460 { 461 if (index >= 0) 462 { 463 return contractTable.elementAt(index); 464 } 465 else // not found 466 { 467 return null; 468 } 469 } 470 471 /** 472 * Adds the expanding string into the collation table. 473 */ addExpandOrder(String contractChars, String expandChars, int anOrder)474 private final void addExpandOrder(String contractChars, 475 String expandChars, 476 int anOrder) throws ParseException 477 { 478 // Create an expansion table entry 479 int tableIndex = addExpansion(anOrder, expandChars); 480 481 // And add its index into the main mapping table 482 if (contractChars.length() > 1) { 483 char ch = contractChars.charAt(0); 484 if (Character.isHighSurrogate(ch) && contractChars.length() == 2) { 485 char ch2 = contractChars.charAt(1); 486 if (Character.isLowSurrogate(ch2)) { 487 //only add into table when it is a legal surrogate 488 addOrder(Character.toCodePoint(ch, ch2), tableIndex); 489 } 490 } else { 491 addContractOrder(contractChars, tableIndex); 492 } 493 } else { 494 addOrder(contractChars.charAt(0), tableIndex); 495 } 496 } 497 addExpandOrder(int ch, String expandChars, int anOrder)498 private final void addExpandOrder(int ch, String expandChars, int anOrder) 499 throws ParseException 500 { 501 int tableIndex = addExpansion(anOrder, expandChars); 502 addOrder(ch, tableIndex); 503 } 504 505 /** 506 * Create a new entry in the expansion table that contains the orderings 507 * for the given characers. If anOrder is valid, it is added to the 508 * beginning of the expanded list of orders. 509 */ addExpansion(int anOrder, String expandChars)510 private int addExpansion(int anOrder, String expandChars) { 511 if (expandTable == null) { 512 expandTable = new Vector<>(INITIALTABLESIZE); 513 } 514 515 // If anOrder is valid, we want to add it at the beginning of the list 516 int offset = (anOrder == RBCollationTables.UNMAPPED) ? 0 : 1; 517 518 int[] valueList = new int[expandChars.length() + offset]; 519 if (offset == 1) { 520 valueList[0] = anOrder; 521 } 522 523 int j = offset; 524 for (int i = 0; i < expandChars.length(); i++) { 525 char ch0 = expandChars.charAt(i); 526 char ch1; 527 int ch; 528 if (Character.isHighSurrogate(ch0)) { 529 if (++i == expandChars.length() || 530 !Character.isLowSurrogate(ch1=expandChars.charAt(i))) { 531 //ether we are missing the low surrogate or the next char 532 //is not a legal low surrogate, so stop loop 533 break; 534 } 535 ch = Character.toCodePoint(ch0, ch1); 536 537 } else { 538 ch = ch0; 539 } 540 541 int mapValue = getCharOrder(ch); 542 543 if (mapValue != RBCollationTables.UNMAPPED) { 544 valueList[j++] = mapValue; 545 } else { 546 // can't find it in the table, will be filled in by commit(). 547 valueList[j++] = CHARINDEX + ch; 548 } 549 } 550 if (j < valueList.length) { 551 //we had at least one supplementary character, the size of valueList 552 //is bigger than it really needs... 553 int[] tmpBuf = new int[j]; 554 while (--j >= 0) { 555 tmpBuf[j] = valueList[j]; 556 } 557 valueList = tmpBuf; 558 } 559 // Add the expanding char list into the expansion table. 560 int tableIndex = RBCollationTables.EXPANDCHARINDEX + expandTable.size(); 561 expandTable.addElement(valueList); 562 563 return tableIndex; 564 } 565 addContractFlags(String chars)566 private void addContractFlags(String chars) { 567 char c0; 568 int c; 569 int len = chars.length(); 570 for (int i = 0; i < len; i++) { 571 c0 = chars.charAt(i); 572 c = Character.isHighSurrogate(c0) 573 ?Character.toCodePoint(c0, chars.charAt(++i)) 574 :c0; 575 contractFlags.put(c, 1); 576 } 577 } 578 579 // ============================================================== 580 // constants 581 // ============================================================== 582 static final int CHARINDEX = 0x70000000; // need look up in .commit() 583 584 private static final int IGNORABLEMASK = 0x0000ffff; 585 private static final int PRIMARYORDERINCREMENT = 0x00010000; 586 private static final int SECONDARYORDERINCREMENT = 0x00000100; 587 private static final int TERTIARYORDERINCREMENT = 0x00000001; 588 private static final int INITIALTABLESIZE = 20; 589 private static final int MAXKEYSIZE = 5; 590 591 // ============================================================== 592 // instance variables 593 // ============================================================== 594 595 // variables used by the build process 596 private RBCollationTables.BuildAPI tables = null; 597 private MergeCollation mPattern = null; 598 private boolean isOverIgnore = false; 599 private char[] keyBuf = new char[MAXKEYSIZE]; 600 private IntHashtable contractFlags = new IntHashtable(100); 601 602 // "shadow" copies of the instance variables in RBCollationTables 603 // (the values in these variables are copied back into RBCollationTables 604 // at the end of the build process) 605 private boolean frenchSec = false; 606 private boolean seAsianSwapping = false; 607 608 private UCompactIntArray mapping = null; 609 private Vector<Vector<EntryPair>> contractTable = null; 610 private Vector<int[]> expandTable = null; 611 612 private short maxSecOrder = 0; 613 private short maxTerOrder = 0; 614 } 615