/*
 * Copyright (c) 2017, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

/* @test
 * @bug 8186801 8186751
 * @summary Test the charset mappings
 * @modules jdk.charsets
 */

import java.io.*;
import java.nio.*;
import java.nio.file.*;
import java.nio.charset.*;
import java.util.*;
import java.util.function.*;
import java.util.regex.*;
import java.util.stream.*;

/**
 * Checks each charset listed in the {@code charsets} file of the charset
 * mapping data directory against its {@code .map}/{@code .nr}/{@code .c2b}
 * reference tables: charset name and aliases, String de/encoding, and
 * CharsetDecoder/CharsetEncoder round trips on heap and direct buffers.
 */
public class TestCharsetMapping {

    private static final int BUFSIZ = 8192;     // Initial buffer size
    private static final int MAXERRS = 10;      // Errors reported per test

    private static final PrintStream log = System.out;

    // Set by -v on the command line
    private static boolean verbose = false;

    // Test modes
    private static final int ENCODE = 1;
    private static final int DECODE = 2;

    // Utilities

    /**
     * Returns a new heap buffer of twice the capacity of {@code bb} holding
     * the bytes written to {@code bb} so far. {@code bb} is flipped as a
     * side effect; the returned buffer is positioned after the copied data.
     */
    private static ByteBuffer expand(ByteBuffer bb) {
        ByteBuffer nbb = ByteBuffer.allocate(bb.capacity() * 2);
        bb.flip();
        nbb.put(bb);
        return nbb;
    }

    /**
     * Returns a new heap buffer of twice the capacity of {@code cb} holding
     * the chars written to {@code cb} so far. {@code cb} is flipped as a
     * side effect; the returned buffer is positioned after the copied data.
     */
    private static CharBuffer expand(CharBuffer cb) {
        CharBuffer ncb = CharBuffer.allocate(cb.capacity() * 2);
        cb.flip();
        ncb.put(cb);
        return ncb;
    }

    /**
     * Converts a string of hex digits (two per byte, no prefix) to bytes.
     *
     * @param s the hex digits
     * @return the decoded bytes, {@code s.length() / 2} of them
     * @throws RuntimeException if {@code s} has an odd number of digits
     * @throws NumberFormatException if {@code s} contains a non-hex digit
     */
    private static byte[] parseBytes(String s) {
        // Reject odd lengths up front; otherwise the trailing nibble
        // would be dropped silently.
        if ((s.length() & 1) != 0)
            throw new RuntimeException("Malformed byte string: " + s);
        int nb = s.length() / 2;
        byte[] bs = new byte[nb];
        for (int i = 0; i < nb; i++) {
            int j = i * 2;
            bs[i] = (byte)Integer.parseInt(s.substring(j, j + 2), 16);
        }
        return bs;
    }

    /**
     * Formats {@code bs} as lower-case hex, two digits per byte.
     */
    private static String printBytes(byte[] bs) {
        StringBuilder sb = new StringBuilder(bs.length * 2);
        for (byte b : bs) {
            sb.append(Character.forDigit((b >> 4) & 0xf, 16));
            sb.append(Character.forDigit(b & 0xf, 16));
        }
        return sb.toString();
    }

    /**
     * Formats {@code cp} as {@code U+} followed by its lower-case hex value,
     * zero-padded to at least four digits. All digits of supplementary
     * code points are kept (e.g. {@code U+10ffff}).
     */
    private static String printCodePoint(int cp) {
        String hex = Integer.toHexString(cp);
        StringBuilder sb = new StringBuilder("U+");
        for (int i = hex.length(); i < 4; i++)
            sb.append('0');
        return sb.append(hex).toString();
    }

    /**
     * Reads the next code point from {@code cb}: one char, or two when the
     * first is a high surrogate.
     */
    private static int getCodePoint(CharBuffer cb) {
        char c = cb.get();
        if (Character.isHighSurrogate(c))
            return Character.toCodePoint(c, cb.get());
        else
            return c;
    }

    /**
     * Returns the plural suffix for a count: empty for 1, otherwise "s".
     */
    private static String plural(int n) {
        return (n == 1 ? "" : "s");
    }

    // TestCharsetMapping
    private CharsetInfo csinfo;
    private CharsetDecoder decoder = null;
    private CharsetEncoder encoder = null;

    // Stateful dbcs encoding has leading shift byte '0x0e'
    // and trailing shift byte '0x0f'.
    // The flag variable shiftHackDBCS is 'true' for stateful
    // EBCDIC encodings, which indicates the need of adding/
    // removing the shift bytes.
    private boolean shiftHackDBCS = false;

    /**
     * Creates a tester for the charset in {@code csinfo.cs}. Both coders
     * are configured to REPLACE malformed input and unmappable characters,
     * so bad mappings show up as mismatches rather than exceptions.
     */
    private TestCharsetMapping(CharsetInfo csinfo) throws Exception {
        this.csinfo = csinfo;
        this.encoder = csinfo.cs.newEncoder()
            .onUnmappableCharacter(CodingErrorAction.REPLACE)
            .onMalformedInput(CodingErrorAction.REPLACE);
        this.decoder = csinfo.cs.newDecoder()
            .onUnmappableCharacter(CodingErrorAction.REPLACE)
            .onMalformedInput(CodingErrorAction.REPLACE);
    }

    // Inner (non-static) on purpose: uses the enclosing instance's
    // decoder, encoder and shiftHackDBCS.
    private class Test {
        // An instance of this class tests all mappings for
        // a particular bytesPerChar value
        private int bytesPerChar;

        // Reference data from .map/nr/c2b files
        private ByteBuffer refBytes = ByteBuffer.allocate(BUFSIZ);
        private CharBuffer refChars = CharBuffer.allocate(BUFSIZ);

        private ByteBuffer dRefBytes = ByteBuffer.allocateDirect(BUFSIZ);
        private CharBuffer dRefChars = ByteBuffer.allocateDirect(BUFSIZ*2).asCharBuffer();

        private Test(int bpc) {
            bytesPerChar = bpc;
        }

        // shiftHackDBCS can add the leading/trailing shift bytes
        private void put(byte[] bs) {
            if (refBytes.remaining() < bs.length)
                refBytes = expand(refBytes);
            refBytes.put(bs);
        }

        /**
         * Appends one reference mapping: {@code bs} to the reference bytes
         * and {@code cc} to the reference chars, growing either as needed.
         *
         * @throws IllegalArgumentException if {@code bs.length} differs
         *         from this test's bytes/char value
         */
        private void put(byte[] bs, char[] cc) {
            if (bs.length != bytesPerChar)
                throw new IllegalArgumentException(bs.length
                                                   + " != "
                                                   + bytesPerChar);
            if (refBytes.remaining() < bytesPerChar)
                refBytes = expand(refBytes);
            refBytes.put(bs);
            if (refChars.remaining() < cc.length)
                refChars = expand(refChars);
            refChars.put(cc);
        }

        /**
         * Decodes {@code refBytes} and compares the result, code point by
         * code point, with {@code refChars}. At most MAXERRS mismatches are
         * logged. Both buffers are rewound before returning.
         *
         * @return true if every code point matched
         * @throws IllegalStateException if there were no mismatches but
         *         reference or decoded chars were left unconsumed
         */
        private boolean decode(ByteBuffer refBytes, CharBuffer refChars)
            throws Exception {
            log.println(" decode" + (refBytes.isDirect()?" (direct)":""));
            CharBuffer out = decoder.decode(refBytes);

            refBytes.rewind();
            byte[] bs = new byte[bytesPerChar];
            int e = 0;

            // Consume the leading shift byte added by run()
            if (shiftHackDBCS && bytesPerChar == 2 && refBytes.get() != (byte)0x0e) {
                log.println("Missing leading byte");
            }

            while (refChars.hasRemaining()) {
                refBytes.get(bs);
                int rcp = getCodePoint(refChars);
                int ocp = getCodePoint(out);
                if (rcp != ocp) {
                    log.println(" Error: "
                                + printBytes(bs)
                                + " --> "
                                + printCodePoint(ocp)
                                + ", expected "
                                + printCodePoint(rcp));
                    if (++e >= MAXERRS) {
                        log.println(" Too many errors, giving up");
                        break;
                    }
                }
                if (verbose) {
                    log.println(" "
                                + printBytes(bs)
                                + " --> "
                                + printCodePoint(rcp));
                }
            }

            // Consume the trailing shift byte added by run()
            if (shiftHackDBCS && bytesPerChar == 2 && refBytes.get() != (byte)0x0f) {
                log.println("Missing trailing byte");
            }

            if (e == 0 && (refChars.hasRemaining() || out.hasRemaining())) {
                // Paranoia: Didn't consume everything
                throw new IllegalStateException();
            }
            refBytes.rewind();
            refChars.rewind();
            return (e == 0);
        }

        /**
         * Encodes {@code refChars} and compares the result, bytesPerChar
         * bytes at a time, with {@code refBytes}. At most MAXERRS
         * mismatches are logged. Both buffers are rewound before a normal
         * return; the early "missing shift byte" returns leave them as is.
         *
         * @return true if every byte sequence matched
         * @throws IllegalStateException if there were no mismatches but
         *         reference or encoded bytes were left unconsumed
         */
        private boolean encode(ByteBuffer refBytes, CharBuffer refChars)
            throws Exception {
            log.println(" encode" + (refBytes.isDirect()?" (direct)":""));
            ByteBuffer out = encoder.encode(refChars);
            refChars.rewind();

            if (shiftHackDBCS && bytesPerChar == 2 && out.get() != refBytes.get()) {
                log.println("Missing leading byte");
                return false;
            }

            byte[] rbs = new byte[bytesPerChar];
            byte[] obs = new byte[bytesPerChar];
            int e = 0;
            while (refChars.hasRemaining()) {
                int cp = getCodePoint(refChars);
                refBytes.get(rbs);
                out.get(obs);
                boolean eq = true;
                for (int i = 0; i < bytesPerChar; i++)
                    eq &= rbs[i] == obs[i];
                if (!eq) {
                    log.println(" Error: "
                                + printCodePoint(cp)
                                + " --> "
                                + printBytes(obs)
                                + ", expected "
                                + printBytes(rbs));
                    if (++e >= MAXERRS) {
                        log.println(" Too many errors, giving up");
                        break;
                    }
                }
                if (verbose) {
                    log.println(" "
                                + printCodePoint(cp)
                                + " --> "
                                + printBytes(rbs));
                }
            }

            if (shiftHackDBCS && bytesPerChar == 2 && out.get() != refBytes.get()) {
                log.println("Missing trailing byte");
                return false;
            }

            if (e == 0 && (refBytes.hasRemaining() || out.hasRemaining())) {
                // Paranoia: Didn't consume everything
                throw new IllegalStateException();
            }

            refBytes.rewind();
            refChars.rewind();
            return (e == 0);
        }

        /**
         * Copies the accumulated reference data into direct buffers and
         * runs the decode and/or encode checks on both the heap and the
         * direct copies.
         *
         * @param mode DECODE to run only decode checks, ENCODE to run only
         *             encode checks
         * @return true if all executed checks passed
         */
        private boolean run(int mode) throws Exception {
            log.println(" " + bytesPerChar
                        + " byte" + plural(bytesPerChar) + "/char");

            // Grow the direct buffers if the heap ones were expanded
            if (dRefBytes.capacity() < refBytes.capacity()) {
                dRefBytes = ByteBuffer.allocateDirect(refBytes.capacity());
            }
            if (dRefChars.capacity() < refChars.capacity()) {
                dRefChars = ByteBuffer.allocateDirect(refChars.capacity()*2)
                                      .asCharBuffer();
            }
            refBytes.flip();
            refChars.flip();
            dRefBytes.clear();
            dRefChars.clear();

            dRefBytes.put(refBytes).flip();
            dRefChars.put(refChars).flip();
            // The bulk put left position == limit; flip again to get
            // back to [0, limit) for reading.
            refBytes.flip();
            refChars.flip();

            boolean rv = true;
            if (mode != ENCODE) {
                rv &= decode(refBytes, refChars);
                rv &= decode(dRefBytes, dRefChars);
            }
            if (mode != DECODE) {
                rv &= encode(refBytes, refChars);
                rv &= encode(dRefBytes, dRefChars);
            }
            return rv;
        }
    }

    // Maximum bytes/char being tested
    private int maxBytesPerChar = 0;

    // Tests, indexed by bytesPerChar - 1
    private Test[] tests;

    private void clearTests() {
        maxBytesPerChar = 0;
        tests = new Test[0];
    }

    // Find the test for the given bytes/char value,
    // expanding the test array if needed
    //
    private Test testFor(int bpc) {
        if (bpc > maxBytesPerChar) {
            Test[] ts = new Test[bpc];
            System.arraycopy(tests, 0, ts, 0, maxBytesPerChar);
            for (int i = maxBytesPerChar; i < bpc; i++)
                ts[i] = new Test(i + 1);
            tests = ts;
            maxBytesPerChar = bpc;
        }
        return tests[bpc - 1];
    }

    /**
     * Checks {@code new String(byte[], csName)} and
     * {@code String.getBytes(csName)} against the reference mappings.
     * Skipped (returns true) when shiftHackDBCS is set.
     *
     * @return true if both conversions produced the expected result
     */
    private boolean testStringConv() throws Exception {
        if (shiftHackDBCS) {
            log.println(" string de/encoding skipped for ebcdic");
            return true;
        }
        boolean rv = true;
        log.println(" string de/encoding");
        // for new String()
        ByteArrayOutputStream baosDec = new ByteArrayOutputStream();
        StringBuilder sbDec = new StringBuilder();
        // for String.getBytes()
        ByteArrayOutputStream baosEnc = new ByteArrayOutputStream();
        StringBuilder sbEnc = new StringBuilder();

        for (Entry e : csinfo.mappings) {
            baosDec.write(e.bs);
            sbDec.append(Character.toChars(e.cp));
            if (e.cp2 != 0)
                sbDec.appendCodePoint(e.cp2);   // append the char, not its decimal value

            // non-roundtrip b2c, and c2b
            // NOTE(review): the c2b test here is negated relative to the
            // one in run()'s ENCODE phase -- confirm which is intended.
            if ((csinfo.nr != null && csinfo.nr.containsKey(e.bb)) ||
                (csinfo.c2b != null && !csinfo.c2b.containsKey(e.cp)))
                continue;
            baosEnc.write(e.bs);
            sbEnc.append(Character.toChars(e.cp));
            if (e.cp2 != 0)
                sbEnc.appendCodePoint(e.cp2);
        }
        log.println(" new String()");
        if (!new String(baosDec.toByteArray(), csinfo.csName).equals(sbDec.toString())) {
            log.println(" Error: new String() failed");
            rv = false;
        }
        log.println(" String.getBytes()");
        if (!Arrays.equals(baosEnc.toByteArray(), sbEnc.toString().getBytes(csinfo.csName))) {
            log.println(" Error: String().getBytes() failed");
            rv = false;
        }
        return rv;
    }

    /**
     * Runs the String conversion check, then the decoder check over all
     * mappings, then the encoder check over the round-trip mappings plus
     * the c2b entries.
     *
     * @return true if every check passed
     */
    private boolean run() throws Exception {
        boolean rv = true;
        // Constant-first equals: a charset without a "type" line has a null type
        shiftHackDBCS = "ebcdic".equals(csinfo.type);    // isStateful;

        // (1) new String()/String.getBytes()
        rv &= testStringConv();

        // (2) DECODE:
        clearTests();
        if (shiftHackDBCS) {
            testFor(2).put(new byte[] { 0x0e });
        }
        csinfo.mappings.forEach(e -> {
                if (e.cp2 != 0)
                    return;  // skip composite (base+cc) for now
                byte[] bs = e.bs;
                char[] cc = Character.toChars(e.cp);
                testFor(bs.length).put(bs, cc);
            });
        if (shiftHackDBCS) {
            testFor(2).put(new byte[] { 0x0f });
        }
        for (int i = 0; i < maxBytesPerChar; i++) {
            rv &= tests[i].run(DECODE);
        }

        // (3) ENCODE:
        clearTests();
        if (shiftHackDBCS) {
            testFor(2).put(new byte[] { 0x0e });
        }
        csinfo.mappings.forEach(e -> {
                if (e.cp2 != 0)
                    return;  // skip composite (base+cc) for now
                if (csinfo.nr != null && csinfo.nr.containsKey(e.bb))
                    return;  // non-roundtrip b2c
                if (csinfo.c2b != null && csinfo.c2b.containsKey(e.cp))
                    return;  // c2b only mapping
                byte[] bs = e.bs;
                char[] cc = Character.toChars(e.cp);
                testFor(bs.length).put(bs, cc);
            });
        if (csinfo.c2b != null)
            csinfo.c2b.values().forEach(e -> {
                    byte[] bs = e.bs;
                    char[] cc = Character.toChars(e.cp);
                    testFor(bs.length).put(bs, cc);
                });
        if (shiftHackDBCS) {
            testFor(2).put(new byte[] { 0x0f });
        }
        for (int i = 0; i < maxBytesPerChar; i++) {
            rv &= tests[i].run(ENCODE);
        }
        return rv;
    }

    private static class Entry {
        byte[] bs;   // byte sequence reps
        int cp;      // Unicode codepoint
        int cp2;     // CC of composite
        long bb;     // bs in "long" form for nr lookup;
    }

    private static final int UNMAPPABLE = 0xFFFD;
    private static final Pattern ptn = Pattern.compile("(?:0x)?(\\p{XDigit}++)\\s++(?:U\\+|0x)?(\\p{XDigit}++)(?:\\s++#.*)?");
    private static final int G_BS  = 1;
    private static final int G_CP  = 2;
    // The pattern above has only two groups, so parse() never finds this one
    private static final int G_CP2 = 3;

    private static class CharsetInfo {
        Charset  cs;
        String   pkgName;
        String   clzName;
        String   csName;
        String   hisName;
        String   type;
        boolean  isInternal;
        Set<String> aliases = new HashSet<>();

        // mapping entries
        List<Entry> mappings;
        Map<Long, Entry> nr;       // bytes -> entry
        Map<Integer, Entry> c2b;   // cp -> entry

        CharsetInfo(String csName, String clzName) {
            this.csName = csName;
            this.clzName = clzName;
        }

        /**
         * Builds an Entry from a matcher that has just matched {@code ptn}
         * against a mapping line.
         */
        private static Entry parse(Matcher m) {
            Entry e = new Entry();
            e.bb = Long.parseLong(m.group(G_BS), 16);
            if (e.bb < 0x100)
                e.bs = new byte[] { (byte)e.bb };
            else
                e.bs = parseBytes(m.group(G_BS));
            e.cp = Integer.parseInt(m.group(G_CP), 16);
            if (G_CP2 <= m.groupCount() && m.group(G_CP2) != null)
                e.cp2 = Integer.parseInt(m.group(G_CP2), 16);
            else
                e.cp2 = 0;
            return e;
        }

        /**
         * Parses every line of {@code path} that does not start with '#'
         * and begins with a match of {@code ptn}. The file is closed
         * before returning.
         */
        private static List<Entry> readEntries(Path path) throws IOException {
            Matcher m = ptn.matcher("");
            // Files.lines holds the file open until the stream is closed
            try (Stream<String> lines = Files.lines(path)) {
                // The matcher is reset in filter() and read in map(); this
                // relies on the sequential stream handling one line at a time.
                return lines
                    .filter(ln -> !ln.startsWith("#") && m.reset(ln).lookingAt())
                    .map(ln -> parse(m))
                    .collect(Collectors.toList());
            }
        }

        /**
         * Loads {@code <clzName>.map} and, when present,
         * {@code <clzName>.nr} and {@code <clzName>.c2b} from {@code dir}.
         *
         * @return false if the {@code .map} file does not exist
         */
        boolean loadMappings(Path dir) throws IOException {
            // xxx.map
            Path path = dir.resolve(clzName + ".map");
            if (!Files.exists(path)) {
                return false;
            }
            mappings = readEntries(path).stream()
                .filter(e -> e.cp != UNMAPPABLE)  // non-mapping
                .collect(Collectors.toList());
            // xxx.nr
            path = dir.resolve(clzName + ".nr");
            if (Files.exists(path)) {
                nr = readEntries(path).stream()
                    .collect(Collectors.toMap(e -> e.bb, Function.identity()));
            }
            // xxx.c2b
            path = dir.resolve(clzName + ".c2b");
            if (Files.exists(path)) {
                c2b = readEntries(path).stream()
                    .collect(Collectors.toMap(e -> e.cp, Function.identity()));
            }
            return true;
        }
    }

    /**
     * Validates a recognized property line before its value is used.
     *
     * @return {@code cs}, known to be non-null
     * @throws RuntimeException if the line has no value token or appears
     *         before any "charset" line
     */
    private static CharsetInfo checkProperty(CharsetInfo cs, String[] tokens, String line) {
        if (tokens.length < 3) {
            throw new RuntimeException("Error: incorrect " + tokens[1] + " line [" + line + "]");
        }
        if (cs == null) {
            throw new RuntimeException("Error: " + tokens[1]
                                       + " line outside of a charset definition [" + line + "]");
        }
        return cs;
    }

    /**
     * Parses the charset list file: each "charset NAME CLASS" line starts a
     * new definition, and the property lines that follow it (alias,
     * package, type, hisname, internal) fill it in. Lines starting with
     * '#', empty lines and unrecognized properties are ignored.
     *
     * @return the definitions, in file order
     * @throws RuntimeException on a malformed charset or property line
     */
    private static Set<CharsetInfo> charsets(Path cslist) throws IOException {
        Set<CharsetInfo> charsets = new LinkedHashSet<>();
        Iterator<String> itr = Files.readAllLines(cslist).iterator();
        CharsetInfo cs = null;

        while (itr.hasNext()) {
            String line = itr.next();
            if (line.startsWith("#") || line.length() == 0) {
                continue;
            }
            String[] tokens = line.split("\\s+");
            if (tokens.length < 2) {
                continue;
            }
            if ("charset".equals(tokens[0])) {
                if (cs != null) {
                    charsets.add(cs);
                    cs = null;
                }
                if (tokens.length < 3) {
                    throw new RuntimeException("Error: incorrect charset line [" + line + "]");
                }
                cs = new CharsetInfo(tokens[1], tokens[2]);
            } else {
                String key = tokens[1];      // leading empty str
                switch (key) {
                    case "alias":
                        checkProperty(cs, tokens, line).aliases.add(tokens[2]);  // ALIAS_NAME
                        break;
                    case "package":
                        checkProperty(cs, tokens, line).pkgName = tokens[2];
                        break;
                    case "type":
                        checkProperty(cs, tokens, line).type = tokens[2];
                        break;
                    case "hisname":
                        checkProperty(cs, tokens, line).hisName = tokens[2];
                        break;
                    case "internal":
                        checkProperty(cs, tokens, line).isInternal = Boolean.parseBoolean(tokens[2]);
                        break;
                    default:  // ignore
                }
            }
        }
        if (cs != null) {
            charsets.add(cs);
        }
        return charsets;
    }

    /**
     * Test entry point. Returns without testing when the mapping data
     * directory cannot be found relative to the {@code test.src} property.
     *
     * @param args {@code -v} as the first argument enables verbose output
     * @throws Exception if errors were detected in any charset
     */
    public static void main(String[] args) throws Exception {
        Path dir = Paths.get(System.getProperty("test.src", ".") +
                             "/../../../../make/data/charsetmapping");
        if (!Files.exists(dir)) {
            // not inside jdk repo, no mappings, exit silently
            log.println("Nothing done, not in a jdk repo: ");
            return;
        }
        if (args.length > 0 && "-v".equals(args[0])) {
            // For debugging: java TestCharsetMapping [-v]
            verbose = true;
        }

        int errors = 0;
        int tested = 0;
        int skipped = 0;
        int known = 0;

        for (CharsetInfo csinfo : charsets(dir.resolve("charsets"))) {
            String csname = csinfo.csName;

            if (csinfo.isInternal) {
                continue;
            }

            log.printf("%ntesting: %-16s", csname);

            if (!Charset.isSupported(csname)) {
                errors++;
                log.println(" [error: charset is not supported]");
                continue;
            }

            Charset cs = csinfo.cs = Charset.forName(csinfo.csName);
            // test name()
            if (!cs.name().equals(csinfo.csName)) {
                errors++;
                // Names are passed as arguments, never as part of the format string
                log.printf(" [error: wrong csname: %s vs %s]", csinfo.csName, cs.name());
            }
            // test aliases()
            if (!cs.aliases().equals(csinfo.aliases)) {
                errors++;
                log.printf(" [error wrong aliases]");
                if (verbose) {
                    log.println();
                    log.println(" expected: " + csinfo.aliases);
                    log.println(" got: " + cs.aliases());
                }
            }

            if ("source".equals(csinfo.type)) {
                log.println(" [skipped: source based]");
                skipped++;
                continue;
            }

            if (!csinfo.loadMappings(dir)) {
                log.println(" [error loading mappings failed]");
                errors++;
                continue;
            }

            tested++;
            log.println();
            if (!new TestCharsetMapping(csinfo).run()) {

                /////////////// known nr/c2b issues ////////////////
                if (csinfo.csName.equals("x-IBM948") ||
                    csinfo.csName.equals("x-IBM950") ||
                    csinfo.csName.equals("x-IBM937") ||
                    csinfo.csName.equals("x-IBM1383"))
                {
                    log.println(" [**** skipped, KNOWN nr/c2b mapping issue]");
                    known++;
                    continue;
                }

                errors++;
            }
        }

        log.println();
        log.println(tested + " charset" + plural(tested) + " tested, "
                    + skipped + " skipped, " + known + " known issue(s)");
        log.println();
        if (errors > 0)
            throw new Exception("Errors detected in "
                                + errors + " charset" + plural(errors));
    }
}