1#!/usr/bin/python 2 3# Copyright 2013-2016 Mozilla Foundation. See the COPYRIGHT 4# file at the top-level directory of this distribution. 5# 6# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or 7# https://www.apache.org/licenses/LICENSE-2.0> or the MIT license 8# <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your 9# option. This file may not be copied, modified, or distributed 10# except according to those terms. 11 12import json 13import subprocess 14import sys 15 16def cmp_from_end(one, other): 17 c = cmp(len(one), len(other)) 18 if c != 0: 19 return c 20 i = len(one) - 1 21 while i >= 0: 22 c = cmp(one[i], other[i]) 23 if c != 0: 24 return c 25 i -= 1 26 return 0 27 28 29class Label: 30 def __init__(self, label, preferred): 31 self.label = label 32 self.preferred = preferred 33 def __cmp__(self, other): 34 return cmp_from_end(self.label, other.label) 35 36def static_u16_table(name, data): 37 data_file.write('''pub static %s: [u16; %d] = [ 38 ''' % (name, len(data))) 39 40 for i in xrange(len(data)): 41 data_file.write('0x%04X,\n' % data[i]) 42 43 data_file.write(''']; 44 45 ''') 46 47def static_u16_table_from_indexable(name, data, item): 48 data_file.write('''#[cfg(not(feature = "no-static-ideograph-encoder-tables"))] 49static %s: [u16; %d] = [ 50 ''' % (name, len(data))) 51 52 for i in xrange(len(data)): 53 data_file.write('0x%04X,\n' % data[i][item]) 54 55 data_file.write(''']; 56 57 ''') 58 59def static_u8_pair_table_from_indexable(name, data, item): 60 data_file.write('''#[cfg(not(feature = "no-static-ideograph-encoder-tables"))] 61static %s: [[u8; 2]; %d] = [ 62 ''' % (name, len(data))) 63 64 for i in xrange(len(data)): 65 data_file.write('[0x%02X, 0x%02X],\n' % data[i][item]) 66 67 data_file.write(''']; 68 69 ''') 70 71preferred = [] 72 73dom = [] 74 75labels = [] 76 77data = json.load(open("../encoding/encodings.json", "r")) 78 79indexes = json.load(open("../encoding/indexes.json", "r")) 80 81single_byte = [] 82 83multi_byte = [] 84 85def to_camel_name(name): 86 if name == u"iso-8859-8-i": 87 return u"Iso8I" 88 if name.startswith(u"iso-8859-"): 89 return name.replace(u"iso-8859-", u"Iso") 90 return name.title().replace(u"X-", u"").replace(u"-", u"").replace(u"_", u"") 91 92def to_constant_name(name): 93 return name.replace(u"-", u"_").upper() 94 95def to_snake_name(name): 96 return name.replace(u"-", u"_").lower() 97 98def to_dom_name(name): 99 return name 100 101# 102 103for group in data: 104 if group["heading"] == "Legacy single-byte encodings": 105 single_byte = group["encodings"] 106 else: 107 multi_byte.extend(group["encodings"]) 108 for encoding in group["encodings"]: 109 preferred.append(encoding["name"]) 110 for label in encoding["labels"]: 111 labels.append(Label(label, encoding["name"])) 112 113for name in preferred: 114 dom.append(to_dom_name(name)) 115 116preferred.sort() 117labels.sort() 118dom.sort(cmp=cmp_from_end) 119 120longest_label_length = 0 121longest_name_length = 0 122longest_label = None 123longest_name = None 124 125for name in preferred: 126 if len(name) > longest_name_length: 127 longest_name_length = len(name) 128 longest_name = name 129 130for label in labels: 131 if len(label.label) > longest_label_length: 132 longest_label_length = len(label.label) 133 longest_label = label.label 134 135def is_single_byte(name): 136 for encoding in single_byte: 137 if name == encoding["name"]: 138 return True 139 return False 140 141def read_non_generated(path): 142 partially_generated_file = open(path, "r") 143 full = partially_generated_file.read() 144 partially_generated_file.close() 145 146 generated_begin = "// BEGIN GENERATED CODE. PLEASE DO NOT EDIT." 147 generated_end = "// END GENERATED CODE" 148 149 generated_begin_index = full.find(generated_begin) 150 if generated_begin_index < 0: 151 print "Can't find generated code start marker in %s. Exiting." % path 152 sys.exit(-1) 153 generated_end_index = full.find(generated_end) 154 if generated_end_index < 0: 155 print "Can't find generated code end marker in %s. Exiting." % path 156 sys.exit(-1) 157 158 return (full[0:generated_begin_index + len(generated_begin)], 159 full[generated_end_index:]) 160 161(lib_rs_begin, lib_rs_end) = read_non_generated("src/lib.rs") 162 163label_file = open("src/lib.rs", "w") 164 165label_file.write(lib_rs_begin) 166label_file.write(""" 167// Instead, please regenerate using generate-encoding-data.py 168 169const LONGEST_LABEL_LENGTH: usize = %d; // %s 170 171""" % (longest_label_length, longest_label)) 172 173for name in preferred: 174 variant = None 175 if is_single_byte(name): 176 variant = "SingleByte(data::%s_DATA)" % to_constant_name(u"iso-8859-8" if name == u"ISO-8859-8-I" else name) 177 else: 178 variant = to_camel_name(name) 179 180 label_file.write('''/// The initializer for the %s encoding. 181/// 182/// For use only for taking the address of this form when 183/// Rust prohibits the use of the non-`_INIT` form directly, 184/// such as in initializers of other `static`s. If in doubt, 185/// use the corresponding non-`_INIT` reference-typed `static`. 186/// 187/// This part of the public API will go away if Rust changes 188/// to make the referent of `pub const FOO: &'static Encoding` 189/// unique cross-crate or if Rust starts allowing static arrays 190/// to be initialized with `pub static FOO: &'static Encoding` 191/// items. 192pub static %s_INIT: Encoding = Encoding { 193 name: "%s", 194 variant: VariantEncoding::%s, 195}; 196 197/// The %s encoding. 198/// 199/// This will change from `static` to `const` if Rust changes 200/// to make the referent of `pub const FOO: &'static Encoding` 201/// unique cross-crate, so don't take the address of this 202/// `static`. 203pub static %s: &'static Encoding = &%s_INIT; 204 205''' % (to_dom_name(name), to_constant_name(name), to_dom_name(name), variant, to_dom_name(name), to_constant_name(name), to_constant_name(name))) 206 207label_file.write("""static LABELS_SORTED: [&'static str; %d] = [ 208""" % len(labels)) 209 210for label in labels: 211 label_file.write('''"%s",\n''' % label.label) 212 213label_file.write("""]; 214 215static ENCODINGS_IN_LABEL_SORT: [&'static Encoding; %d] = [ 216""" % len(labels)) 217 218for label in labels: 219 label_file.write('''&%s_INIT,\n''' % to_constant_name(label.preferred)) 220 221label_file.write(''']; 222 223''') 224label_file.write(lib_rs_end) 225label_file.close() 226 227label_test_file = open("src/test_labels_names.rs", "w") 228label_test_file.write('''// Any copyright to the test code below this comment is dedicated to the 229// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ 230 231// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT. 232// Instead, please regenerate using generate-encoding-data.py 233 234use super::*; 235 236#[test] 237fn test_all_labels() { 238''') 239 240for label in labels: 241 label_test_file.write('''assert_eq!(Encoding::for_label(b"%s"), Some(%s));\n''' % (label.label, to_constant_name(label.preferred))) 242 243label_test_file.write('''} 244''') 245label_test_file.close() 246 247def null_to_zero(code_point): 248 if not code_point: 249 code_point = 0 250 return code_point 251 252data_file = open("src/data.rs", "w") 253data_file.write('''// Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT 254// file at the top-level directory of this distribution. 255// 256// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or 257// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license 258// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your 259// option. This file may not be copied, modified, or distributed 260// except according to those terms. 261 262// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT. 263// Instead, please regenerate using generate-encoding-data.py 264 265''') 266 267# Single-byte 268 269for encoding in single_byte: 270 name = encoding["name"] 271 if name == u"ISO-8859-8-I": 272 continue 273 274 data_file.write('''pub const %s_DATA: &'static [u16; 128] = &[ 275''' % to_constant_name(name)) 276 277 for code_point in indexes[name.lower()]: 278 data_file.write('0x%04X,\n' % null_to_zero(code_point)) 279 280 data_file.write(''']; 281 282''') 283 284# Big5 285 286index = indexes["big5"] 287 288astralness = [] 289low_bits = [] 290 291for code_point in index[942:19782]: 292 if code_point: 293 astralness.append(1 if code_point > 0xFFFF else 0) 294 low_bits.append(code_point & 0xFFFF) 295 else: 296 astralness.append(0) 297 low_bits.append(0) 298 299# pad length to multiple of 32 300for j in xrange(32 - (len(astralness) % 32)): 301 astralness.append(0) 302 303data_file.write('''static BIG5_ASTRALNESS: [u32; %d] = [ 304''' % (len(astralness) / 32)) 305 306i = 0 307while i < len(astralness): 308 accu = 0 309 for j in xrange(32): 310 accu |= astralness[i + j] << j 311 data_file.write('0x%08X,\n' % accu) 312 i += 32 313 314data_file.write(''']; 315 316''') 317 318static_u16_table("BIG5_LOW_BITS", low_bits) 319 320# Encoder table for Level 1 Hanzi 321# Note: If we were OK with doubling this table, we 322# could use a directly-indexable table instead... 323level1_hanzi_index = index[5495:10896] 324level1_hanzi_pairs = [] 325for i in xrange(len(level1_hanzi_index)): 326 hanzi_lead = (i / 157) + 0xA4 327 hanzi_trail = (i % 157) 328 hanzi_trail += 0x40 if hanzi_trail < 0x3F else 0x62 329 level1_hanzi_pairs.append((level1_hanzi_index[i], (hanzi_lead, hanzi_trail))) 330level1_hanzi_pairs.append((0x4E5A, (0xC8, 0x7B))) 331level1_hanzi_pairs.append((0x5202, (0xC8, 0x7D))) 332level1_hanzi_pairs.append((0x9FB0, (0xC8, 0xA1))) 333level1_hanzi_pairs.append((0x5188, (0xC8, 0xA2))) 334level1_hanzi_pairs.append((0x9FB1, (0xC8, 0xA3))) 335level1_hanzi_pairs.sort(key=lambda x: x[0]) 336 337static_u16_table_from_indexable("BIG5_LEVEL1_HANZI_CODE_POINTS", level1_hanzi_pairs, 0) 338static_u8_pair_table_from_indexable("BIG5_LEVEL1_HANZI_BYTES", level1_hanzi_pairs, 1) 339 340# JIS0208 341 342index = indexes["jis0208"] 343 344# JIS 0208 Level 1 Kanji 345static_u16_table("JIS0208_LEVEL1_KANJI", index[1410:4375]) 346 347# JIS 0208 Level 2 Kanji and Additional Kanji 348static_u16_table("JIS0208_LEVEL2_AND_ADDITIONAL_KANJI", index[4418:7808]) 349 350# IBM Kanji 351static_u16_table("IBM_KANJI", index[8272:8632]) 352 353# Check that the other instance is the same 354if index[8272:8632] != index[10744:11104]: 355 raise Error() 356 357# JIS 0208 symbols (all non-Kanji, non-range items) 358symbol_index = [] 359symbol_triples = [] 360pointers_to_scan = [ 361 (0, 188), 362 (658, 691), 363 (1159, 1221), 364] 365in_run = False 366run_start_pointer = 0 367run_start_array_index = 0 368for (start, end) in pointers_to_scan: 369 for i in range(start, end): 370 code_point = index[i] 371 if in_run: 372 if code_point: 373 symbol_index.append(code_point) 374 else: 375 symbol_triples.append(run_start_pointer) 376 symbol_triples.append(i - run_start_pointer) 377 symbol_triples.append(run_start_array_index) 378 in_run = False 379 else: 380 if code_point: 381 in_run = True 382 run_start_pointer = i 383 run_start_array_index = len(symbol_index) 384 symbol_index.append(code_point) 385 if in_run: 386 symbol_triples.append(run_start_pointer) 387 symbol_triples.append(end - run_start_pointer) 388 symbol_triples.append(run_start_array_index) 389 in_run = False 390if in_run: 391 raise Error() 392 393# Now add manually the two overlapping slices of 394# index from the NEC/IBM extensions. 395run_start_array_index = len(symbol_index) 396symbol_index.extend(index[10736:10744]) 397# Later 398symbol_triples.append(10736) 399symbol_triples.append(8) 400symbol_triples.append(run_start_array_index) 401# Earlier 402symbol_triples.append(8644) 403symbol_triples.append(4) 404symbol_triples.append(run_start_array_index) 405 406static_u16_table("JIS0208_SYMBOLS", symbol_index) 407static_u16_table("JIS0208_SYMBOL_TRIPLES", symbol_triples) 408 409# Write down the magic numbers needed when preferring the earlier case 410data_file.write('''const IBM_SYMBOL_START: usize = %d;''' % (run_start_array_index + 1)) 411data_file.write('''const IBM_SYMBOL_END: usize = %d;''' % (run_start_array_index + 4)) 412data_file.write('''const IBM_SYMBOL_POINTER_START: usize = %d;''' % 8645) 413 414# JIS 0208 ranges (excluding kana) 415range_triples = [] 416pointers_to_scan = [ 417 (188, 281), 418 (470, 657), 419 (1128, 1159), 420 (8634, 8644), 421 (10716, 10736), 422] 423in_run = False 424run_start_pointer = 0 425run_start_code_point = 0 426previous_code_point = 0 427for (start, end) in pointers_to_scan: 428 for i in range(start, end): 429 code_point = index[i] 430 if in_run: 431 if code_point: 432 if previous_code_point + 1 != code_point: 433 range_triples.append(run_start_pointer) 434 range_triples.append(i - run_start_pointer) 435 range_triples.append(run_start_code_point) 436 run_start_pointer = i 437 run_start_code_point = code_point 438 previous_code_point = code_point 439 else: 440 range_triples.append(run_start_pointer) 441 range_triples.append(i - run_start_pointer) 442 range_triples.append(run_start_code_point) 443 run_start_pointer = 0 444 run_start_code_point = 0 445 previous_code_point = 0 446 in_run = False 447 else: 448 if code_point: 449 in_run = True 450 run_start_pointer = i 451 run_start_code_point = code_point 452 previous_code_point = code_point 453 if in_run: 454 range_triples.append(run_start_pointer) 455 range_triples.append(end - run_start_pointer) 456 range_triples.append(run_start_code_point) 457 run_start_pointer = 0 458 run_start_code_point = 0 459 previous_code_point = 0 460 in_run = False 461if in_run: 462 raise Error() 463 464static_u16_table("JIS0208_RANGE_TRIPLES", range_triples) 465 466# Encoder table for Level 1 Kanji 467# Note: If we were OK with 30 KB more footprint, we 468# could use a directly-indexable table instead... 469level1_kanji_index = index[1410:4375] 470level1_kanji_pairs = [] 471for i in xrange(len(level1_kanji_index)): 472 pointer = 1410 + i 473 (lead, trail) = divmod(pointer, 188) 474 lead += 0x81 if lead < 0x1F else 0xC1 475 trail += 0x40 if trail < 0x3F else 0x41 476 level1_kanji_pairs.append((level1_kanji_index[i], (lead, trail))) 477level1_kanji_pairs.sort(key=lambda x: x[0]) 478 479static_u16_table_from_indexable("JIS0208_LEVEL1_KANJI_CODE_POINTS", level1_kanji_pairs, 0) 480static_u8_pair_table_from_indexable("JIS0208_LEVEL1_KANJI_SHIFT_JIS_BYTES", level1_kanji_pairs, 1) 481 482# ISO-2022-JP half-width katakana 483 484# index is still jis0208 485half_width_index = indexes["iso-2022-jp-katakana"] 486 487data_file.write('''pub static ISO_2022_JP_HALF_WIDTH_TRAIL: [u8; %d] = [ 488''' % len(half_width_index)) 489 490for i in xrange(len(half_width_index)): 491 code_point = half_width_index[i] 492 pointer = index.index(code_point) 493 trail = pointer % 94 + 0x21 494 data_file.write('0x%02X,\n' % trail) 495 496data_file.write(''']; 497 498''') 499 500# EUC-KR 501 502index = indexes["euc-kr"] 503 504# Unicode 1.1 Hangul above the old KS X 1001 block 505# Compressed form takes 35% of uncompressed form 506pointers = [] 507offsets = [] 508previous_code_point = 0 509for row in xrange(0x20): 510 for column in xrange(190): 511 i = column + (row * 190) 512 # Skip the gaps 513 if (column >= 0x1A and column < 0x20) or (column >= 0x3A and column < 0x40): 514 continue 515 code_point = index[i] 516 if previous_code_point > code_point: 517 raise Error() 518 if code_point - previous_code_point != 1: 519 adjustment = 0 520 if column >= 0x40: 521 adjustment = 12 522 elif column >= 0x20: 523 adjustment = 6 524 pointers.append(column - adjustment + (row * (190 - 12))) 525 offsets.append(code_point) 526 previous_code_point = code_point 527 528static_u16_table("CP949_TOP_HANGUL_POINTERS", pointers) 529static_u16_table("CP949_TOP_HANGUL_OFFSETS", offsets) 530 531# Unicode 1.1 Hangul to the left of the old KS X 1001 block 532pointers = [] 533offsets = [] 534previous_code_point = 0 535for row in xrange(0x46 - 0x20): 536 for column in xrange(190 - 94): 537 i = 6080 + column + (row * 190) 538 # Skip the gaps 539 if (column >= 0x1A and column < 0x20) or (column >= 0x3A and column < 0x40): 540 continue 541 if i > 13127: 542 # Exclude unassigned on partial last row 543 break 544 code_point = index[i] 545 if previous_code_point > code_point: 546 raise Error() 547 if code_point - previous_code_point != 1: 548 adjustment = 0 549 if column >= 0x40: 550 adjustment = 12 551 elif column >= 0x20: 552 adjustment = 6 553 pointers.append(column - adjustment + (row * (190 - 94 - 12))) 554 offsets.append(code_point) 555 previous_code_point = code_point 556 557static_u16_table("CP949_LEFT_HANGUL_POINTERS", pointers) 558static_u16_table("CP949_LEFT_HANGUL_OFFSETS", offsets) 559 560# KS X 1001 Hangul 561hangul_index = [] 562previous_code_point = 0 563for row in xrange(0x48 - 0x2F): 564 for column in xrange(94): 565 code_point = index[9026 + column + (row * 190)] 566 if previous_code_point >= code_point: 567 raise Error() 568 hangul_index.append(code_point) 569 previous_code_point = code_point 570 571static_u16_table("KSX1001_HANGUL", hangul_index) 572 573# KS X 1001 Hanja 574hanja_index = [] 575for row in xrange(0x7D - 0x49): 576 for column in xrange(94): 577 hanja_index.append(index[13966 + column + (row * 190)]) 578 579static_u16_table("KSX1001_HANJA", hanja_index) 580 581# KS X 1001 symbols 582symbol_index = [] 583for i in range(6176, 6270): 584 symbol_index.append(index[i]) 585for i in range(6366, 6437): 586 symbol_index.append(index[i]) 587 588static_u16_table("KSX1001_SYMBOLS", symbol_index) 589 590# KS X 1001 Uppercase Latin 591subindex = [] 592for i in range(7506, 7521): 593 subindex.append(null_to_zero(index[i])) 594 595static_u16_table("KSX1001_UPPERCASE", subindex) 596 597# KS X 1001 Lowercase Latin 598subindex = [] 599for i in range(7696, 7712): 600 subindex.append(index[i]) 601 602static_u16_table("KSX1001_LOWERCASE", subindex) 603 604# KS X 1001 Box drawing 605subindex = [] 606for i in range(7126, 7194): 607 subindex.append(index[i]) 608 609static_u16_table("KSX1001_BOX", subindex) 610 611# KS X 1001 other 612pointers = [] 613offsets = [] 614previous_code_point = 0 615for row in xrange(10): 616 for column in xrange(94): 617 i = 6556 + column + (row * 190) 618 code_point = index[i] 619 # Exclude ranges that were processed as lookup tables 620 # or that contain unmapped cells by filling them with 621 # ASCII. Upon encode, ASCII code points will 622 # never appear as the search key. 623 if (i >= 6946 and i <= 6950): 624 code_point = i - 6946 625 elif (i >= 6961 and i <= 6967): 626 code_point = i - 6961 627 elif (i >= 6992 and i <= 6999): 628 code_point = i - 6992 629 elif (i >= 7024 and i <= 7029): 630 code_point = i - 7024 631 elif (i >= 7126 and i <= 7219): 632 code_point = i - 7126 633 elif (i >= 7395 and i <= 7409): 634 code_point = i - 7395 635 elif (i >= 7506 and i <= 7521): 636 code_point = i - 7506 637 elif (i >= 7696 and i <= 7711): 638 code_point = i - 7696 639 elif (i >= 7969 and i <= 7979): 640 code_point = i - 7969 641 elif (i >= 8162 and i <= 8169): 642 code_point = i - 8162 643 elif (i >= 8299 and i <= 8313): 644 code_point = i - 8299 645 elif (i >= 8347 and i <= 8359): 646 code_point = i - 8347 647 if code_point - previous_code_point != 1: 648 pointers.append(column + (row * 94)) 649 offsets.append(code_point) 650 previous_code_point = code_point 651 652static_u16_table("KSX1001_OTHER_POINTERS", pointers) 653# Omit the last offset, because the end of the last line 654# is unmapped, so we don't want to look at it. 655static_u16_table("KSX1001_OTHER_UNSORTED_OFFSETS", offsets[:-1]) 656 657# JIS 0212 658 659index = indexes["jis0212"] 660 661# JIS 0212 Kanji 662static_u16_table("JIS0212_KANJI", index[1410:7211]) 663 664# JIS 0212 accented (all non-Kanji, non-range items) 665symbol_index = [] 666symbol_triples = [] 667pointers_to_scan = [ 668 (0, 596), 669 (608, 644), 670 (656, 1409), 671] 672in_run = False 673run_start_pointer = 0 674run_start_array_index = 0 675for (start, end) in pointers_to_scan: 676 for i in range(start, end): 677 code_point = index[i] 678 if in_run: 679 if code_point: 680 symbol_index.append(code_point) 681 elif index[i + 1]: 682 symbol_index.append(0) 683 else: 684 symbol_triples.append(run_start_pointer) 685 symbol_triples.append(i - run_start_pointer) 686 symbol_triples.append(run_start_array_index) 687 in_run = False 688 else: 689 if code_point: 690 in_run = True 691 run_start_pointer = i 692 run_start_array_index = len(symbol_index) 693 symbol_index.append(code_point) 694 if in_run: 695 symbol_triples.append(run_start_pointer) 696 symbol_triples.append(end - run_start_pointer) 697 symbol_triples.append(run_start_array_index) 698 in_run = False 699if in_run: 700 raise Error() 701 702static_u16_table("JIS0212_ACCENTED", symbol_index) 703static_u16_table("JIS0212_ACCENTED_TRIPLES", symbol_triples) 704 705# gb18030 706 707index = indexes["gb18030"] 708 709# Unicode 1.1 ideographs above the old GB2312 block 710# Compressed form takes 63% of uncompressed form 711pointers = [] 712offsets = [] 713previous_code_point = 0 714for i in xrange(6080): 715 code_point = index[i] 716 if previous_code_point > code_point: 717 raise Error() 718 if code_point - previous_code_point != 1: 719 pointers.append(i) 720 offsets.append(code_point) 721 previous_code_point = code_point 722 723static_u16_table("GBK_TOP_IDEOGRAPH_POINTERS", pointers) 724static_u16_table("GBK_TOP_IDEOGRAPH_OFFSETS", offsets) 725 726# Unicode 1.1 ideographs to the left of the old GB2312 block 727# Compressed form takes 40% of uncompressed form 728pointers = [] 729offsets = [] 730previous_code_point = 0 731for row in xrange(0x7D - 0x29): 732 for column in xrange(190 - 94): 733 i = 7790 + column + (row * 190) 734 if i > 23650: 735 # Exclude compatibility ideographs at the end 736 break 737 code_point = index[i] 738 if previous_code_point > code_point: 739 raise Error() 740 if code_point - previous_code_point != 1: 741 pointers.append(column + (row * (190 - 94))) 742 offsets.append(code_point) 743 previous_code_point = code_point 744 745static_u16_table("GBK_LEFT_IDEOGRAPH_POINTERS", pointers) 746static_u16_table("GBK_LEFT_IDEOGRAPH_OFFSETS", offsets) 747 748# GBK other (excl. Ext A, Compat & PUA at the bottom) 749pointers = [] 750offsets = [] 751previous_code_point = 0 752for row in xrange(0x29 - 0x20): 753 for column in xrange(190 - 94): 754 i = 6080 + column + (row * 190) 755 code_point = index[i] 756 if code_point - previous_code_point != 1: 757 pointers.append(column + (row * (190 - 94))) 758 offsets.append(code_point) 759 previous_code_point = code_point 760 761pointers.append((190 - 94) * (0x29 - 0x20)) 762static_u16_table("GBK_OTHER_POINTERS", pointers) 763static_u16_table("GBK_OTHER_UNSORTED_OFFSETS", offsets) 764 765# GBK bottom: Compatibility ideagraphs, Ext A and PUA 766bottom_index = [] 767# 5 compat following Unified Ideographs 768for i in range(23651, 23656): 769 bottom_index.append(index[i]) 770# Last row 771for i in range(23750, 23846): 772 bottom_index.append(index[i]) 773 774static_u16_table("GBK_BOTTOM", bottom_index) 775 776# GB2312 Hanzi 777# (and the 5 PUA code points in between Level 1 and Level 2) 778hanzi_index = [] 779for row in xrange(0x77 - 0x2F): 780 for column in xrange(94): 781 hanzi_index.append(index[9026 + column + (row * 190)]) 782 783static_u16_table("GB2312_HANZI", hanzi_index) 784 785# GB2312 symbols 786symbol_index = [] 787for i in xrange(94): 788 symbol_index.append(index[6176 + i]) 789 790static_u16_table("GB2312_SYMBOLS", symbol_index) 791 792# GB2312 symbols on Greek row (incl. PUA) 793symbol_index = [] 794for i in xrange(22): 795 symbol_index.append(index[7189 + i]) 796 797static_u16_table("GB2312_SYMBOLS_AFTER_GREEK", symbol_index) 798 799# GB2312 Pinyin 800pinyin_index = [] 801for i in xrange(32): 802 pinyin_index.append(index[7506 + i]) 803 804static_u16_table("GB2312_PINYIN", pinyin_index) 805 806# GB2312 other (excl. bottom PUA) 807pointers = [] 808offsets = [] 809previous_code_point = 0 810for row in xrange(14): 811 for column in xrange(94): 812 i = 6366 + column + (row * 190) 813 code_point = index[i] 814 # Exclude the two ranges that were processed as 815 # lookup tables above by filling them with 816 # ASCII. Upon encode, ASCII code points will 817 # never appear as the search key. 818 if (i >= 7189 and i < 7189 + 22): 819 code_point = i - 7189 820 elif (i >= 7506 and i < 7506 + 32): 821 code_point = i - 7506 822 if code_point - previous_code_point != 1: 823 pointers.append(column + (row * 94)) 824 offsets.append(code_point) 825 previous_code_point = code_point 826 827pointers.append(14 * 94) 828static_u16_table("GB2312_OTHER_POINTERS", pointers) 829static_u16_table("GB2312_OTHER_UNSORTED_OFFSETS", offsets) 830 831# Non-gbk code points 832pointers = [] 833offsets = [] 834for pair in indexes["gb18030-ranges"]: 835 if pair[1] == 0x10000: 836 break # the last entry doesn't fit in u16 837 pointers.append(pair[0]) 838 offsets.append(pair[1]) 839 840static_u16_table("GB18030_RANGE_POINTERS", pointers) 841static_u16_table("GB18030_RANGE_OFFSETS", offsets) 842 843# Encoder table for Level 1 Hanzi 844# The units here really fit into 12 bits, but since we're 845# looking for speed here, let's use 16 bits per unit. 846# Once we use 16 bits per unit, we might as well precompute 847# the output bytes. 848level1_hanzi_index = hanzi_index[:(94 * (0xD8 - 0xB0) - 5)] 849level1_hanzi_pairs = [] 850for i in xrange(len(level1_hanzi_index)): 851 hanzi_lead = (i / 94) + 0xB0 852 hanzi_trail = (i % 94) + 0xA1 853 level1_hanzi_pairs.append((level1_hanzi_index[i], (hanzi_lead, hanzi_trail))) 854level1_hanzi_pairs.sort(key=lambda x: x[0]) 855 856static_u16_table_from_indexable("GB2312_LEVEL1_HANZI_CODE_POINTS", level1_hanzi_pairs, 0) 857static_u8_pair_table_from_indexable("GB2312_LEVEL1_HANZI_BYTES", level1_hanzi_pairs, 1) 858 859data_file.write('''#[inline(always)] 860fn map_with_ranges(haystack: &[u16], other: &[u16], needle: u16) -> u16 { 861 debug_assert_eq!(haystack.len(), other.len()); 862 match haystack.binary_search(&needle) { 863 Ok(i) => other[i], 864 Err(i) => other[i - 1] + (needle - haystack[i - 1]), 865 } 866} 867 868#[inline(always)] 869fn map_with_unsorted_ranges(haystack: &[u16], other: &[u16], needle: u16) -> Option<u16> { 870 debug_assert_eq!(haystack.len() + 1, other.len()); 871 for i in 0..haystack.len() { 872 let start = other[i]; 873 let end = other[i + 1]; 874 let length = end - start; 875 let offset = needle.wrapping_sub(haystack[i]); 876 if offset < length { 877 return Some(start + offset); 878 } 879 } 880 None 881} 882 883#[inline(always)] 884pub fn position(haystack: &[u16], needle: u16) -> Option<usize> { 885 haystack.iter().position(|&x| x == needle) 886} 887 888#[inline(always)] 889pub fn gb18030_range_decode(pointer: u16) -> u16 { 890 map_with_ranges(&GB18030_RANGE_POINTERS[..], 891 &GB18030_RANGE_OFFSETS[..], 892 pointer) 893} 894 895#[inline(always)] 896pub fn gb18030_range_encode(bmp: u16) -> usize { 897 if bmp == 0xE7C7 { 898 return 7457; 899 } 900 map_with_ranges(&GB18030_RANGE_OFFSETS[..], &GB18030_RANGE_POINTERS[..], bmp) as usize 901} 902 903#[inline(always)] 904pub fn gbk_top_ideograph_decode(pointer: u16) -> u16 { 905 map_with_ranges(&GBK_TOP_IDEOGRAPH_POINTERS[..], 906 &GBK_TOP_IDEOGRAPH_OFFSETS[..], 907 pointer) 908} 909 910#[inline(always)] 911pub fn gbk_top_ideograph_encode(bmp: u16) -> u16 { 912 map_with_ranges(&GBK_TOP_IDEOGRAPH_OFFSETS[..], 913 &GBK_TOP_IDEOGRAPH_POINTERS[..], 914 bmp) 915} 916 917#[inline(always)] 918pub fn gbk_left_ideograph_decode(pointer: u16) -> u16 { 919 map_with_ranges(&GBK_LEFT_IDEOGRAPH_POINTERS[..], 920 &GBK_LEFT_IDEOGRAPH_OFFSETS[..], 921 pointer) 922} 923 924#[inline(always)] 925pub fn gbk_left_ideograph_encode(bmp: u16) -> u16 { 926 map_with_ranges(&GBK_LEFT_IDEOGRAPH_OFFSETS[..], 927 &GBK_LEFT_IDEOGRAPH_POINTERS[..], 928 bmp) 929} 930 931#[inline(always)] 932pub fn cp949_top_hangul_decode(pointer: u16) -> u16 { 933 map_with_ranges(&CP949_TOP_HANGUL_POINTERS[..], 934 &CP949_TOP_HANGUL_OFFSETS[..], 935 pointer) 936} 937 938#[inline(always)] 939pub fn cp949_top_hangul_encode(bmp: u16) -> u16 { 940 map_with_ranges(&CP949_TOP_HANGUL_OFFSETS[..], 941 &CP949_TOP_HANGUL_POINTERS[..], 942 bmp) 943} 944 945#[inline(always)] 946pub fn cp949_left_hangul_decode(pointer: u16) -> u16 { 947 map_with_ranges(&CP949_LEFT_HANGUL_POINTERS[..], 948 &CP949_LEFT_HANGUL_OFFSETS[..], 949 pointer) 950} 951 952#[inline(always)] 953pub fn cp949_left_hangul_encode(bmp: u16) -> u16 { 954 map_with_ranges(&CP949_LEFT_HANGUL_OFFSETS[..], 955 &CP949_LEFT_HANGUL_POINTERS[..], 956 bmp) 957} 958 959#[inline(always)] 960pub fn gbk_other_decode(pointer: u16) -> u16 { 961 map_with_ranges(&GBK_OTHER_POINTERS[..GBK_OTHER_POINTERS.len() - 1], 962 &GBK_OTHER_UNSORTED_OFFSETS[..], 963 pointer) 964} 965 966#[inline(always)] 967pub fn gbk_other_encode(bmp: u16) -> Option<u16> { 968 map_with_unsorted_ranges(&GBK_OTHER_UNSORTED_OFFSETS[..], 969 &GBK_OTHER_POINTERS[..], 970 bmp) 971} 972 973#[inline(always)] 974pub fn gb2312_other_decode(pointer: u16) -> u16 { 975 map_with_ranges(&GB2312_OTHER_POINTERS[..GB2312_OTHER_POINTERS.len() - 1], 976 &GB2312_OTHER_UNSORTED_OFFSETS[..], 977 pointer) 978} 979 980#[inline(always)] 981pub fn gb2312_other_encode(bmp: u16) -> Option<u16> { 982 map_with_unsorted_ranges(&GB2312_OTHER_UNSORTED_OFFSETS[..], 983 &GB2312_OTHER_POINTERS[..], 984 bmp) 985} 986 987#[cfg(feature = "no-static-ideograph-encoder-tables")] 988#[inline(always)] 989pub fn gb2312_level1_hanzi_encode(bmp: u16) -> Option<(u8, u8)> { 990 position(&GB2312_HANZI[..(94 * (0xD8 - 0xB0) - 5)], bmp).map(|hanzi_pointer| { 991 let hanzi_lead = (hanzi_pointer / 94) + 0xB0; 992 let hanzi_trail = (hanzi_pointer % 94) + 0xA1; 993 (hanzi_lead as u8, hanzi_trail as u8) 994 }) 995} 996 997#[cfg(not(feature = "no-static-ideograph-encoder-tables"))] 998#[inline(always)] 999pub fn gb2312_level1_hanzi_encode(bmp: u16) -> Option<(u8, u8)> { 1000 match GB2312_LEVEL1_HANZI_CODE_POINTS.binary_search(&bmp) { 1001 Ok(i) => { 1002 let pair = &GB2312_LEVEL1_HANZI_BYTES[i]; 1003 Some((pair[0], pair[1])) 1004 } 1005 Err(_) => None, 1006 } 1007} 1008 1009#[inline(always)] 1010pub fn gb2312_level2_hanzi_encode(bmp: u16) -> Option<usize> { 1011 // TODO: optimize 1012 position(&GB2312_HANZI[(94 * (0xD8 - 0xB0))..], bmp) 1013} 1014 1015#[inline(always)] 1016pub fn ksx1001_other_decode(pointer: u16) -> u16 { 1017 map_with_ranges(&KSX1001_OTHER_POINTERS[..KSX1001_OTHER_POINTERS.len() - 1], 1018 &KSX1001_OTHER_UNSORTED_OFFSETS[..], 1019 pointer) 1020} 1021 1022#[inline(always)] 1023pub fn ksx1001_other_encode(bmp: u16) -> Option<u16> { 1024 map_with_unsorted_ranges(&KSX1001_OTHER_UNSORTED_OFFSETS[..], 1025 &KSX1001_OTHER_POINTERS[..], 1026 bmp) 1027} 1028 1029#[cfg(feature = "no-static-ideograph-encoder-tables")] 1030#[inline(always)] 1031pub fn jis0208_level1_kanji_shift_jis_encode(bmp: u16) -> Option<(u8, u8)> { 1032 position(&JIS0208_LEVEL1_KANJI[..], bmp).map(|kanji_pointer| { 1033 let pointer = 1410 + kanji_pointer; 1034 let lead = pointer / 188; 1035 let lead_offset = if lead < 0x1F { 1036 0x81 1037 } else { 1038 0xC1 1039 }; 1040 let trail = pointer % 188; 1041 let trail_offset = if trail < 0x3F { 1042 0x40 1043 } else { 1044 0x41 1045 }; 1046 ((lead + lead_offset) as u8, (trail + trail_offset) as u8) 1047 }) 1048} 1049 1050#[cfg(not(feature = "no-static-ideograph-encoder-tables"))] 1051#[inline(always)] 1052pub fn jis0208_level1_kanji_shift_jis_encode(bmp: u16) -> Option<(u8, u8)> { 1053 match JIS0208_LEVEL1_KANJI_CODE_POINTS.binary_search(&bmp) { 1054 Ok(i) => { 1055 let pair = &JIS0208_LEVEL1_KANJI_SHIFT_JIS_BYTES[i]; 1056 Some((pair[0], pair[1])) 1057 } 1058 Err(_) => None, 1059 } 1060} 1061 1062#[cfg(feature = "no-static-ideograph-encoder-tables")] 1063#[inline(always)] 1064pub fn jis0208_level1_kanji_euc_jp_encode(bmp: u16) -> Option<(u8, u8)> { 1065 position(&JIS0208_LEVEL1_KANJI[..], bmp).map(|kanji_pointer| { 1066 let lead = (kanji_pointer / 94) + 0xB0; 1067 let trail = (kanji_pointer % 94) + 0xA1; 1068 (lead as u8, trail as u8) 1069 }) 1070} 1071 1072#[cfg(not(feature = "no-static-ideograph-encoder-tables"))] 1073#[inline(always)] 1074pub fn jis0208_level1_kanji_euc_jp_encode(bmp: u16) -> Option<(u8, u8)> { 1075 jis0208_level1_kanji_shift_jis_encode(bmp).map(|(shift_jis_lead, shift_jis_trail)| { 1076 let mut lead = shift_jis_lead as usize; 1077 if shift_jis_lead >= 0xA0 { 1078 lead -= 0xC1 - 0x81; 1079 } 1080 // The next line would overflow u8. Letting it go over allows us to 1081 // subtract fewer times. 1082 lead <<= 1; 1083 // Bring it back to u8 range 1084 lead -= 0x61; 1085 let trail = if shift_jis_trail >= 0x9F { 1086 lead += 1; 1087 shift_jis_trail + (0xA1 - 0x9F) 1088 } else if shift_jis_trail < 0x7F { 1089 shift_jis_trail + (0xA1 - 0x40) 1090 } else { 1091 shift_jis_trail + (0xA1 - 0x41) 1092 }; 1093 (lead as u8, trail) 1094 }) 1095} 1096 1097#[cfg(feature = "no-static-ideograph-encoder-tables")] 1098#[inline(always)] 1099pub fn jis0208_level1_kanji_iso_2022_jp_encode(bmp: u16) -> Option<(u8, u8)> { 1100 position(&JIS0208_LEVEL1_KANJI[..], bmp).map(|kanji_pointer| { 1101 let lead = (kanji_pointer / 94) + (0xB0 - 0x80); 1102 let trail = (kanji_pointer % 94) + 0x21; 1103 (lead as u8, trail as u8) 1104 }) 1105} 1106 1107#[cfg(not(feature = "no-static-ideograph-encoder-tables"))] 1108#[inline(always)] 1109pub fn jis0208_level1_kanji_iso_2022_jp_encode(bmp: u16) -> Option<(u8, u8)> { 1110 jis0208_level1_kanji_shift_jis_encode(bmp).map(|(shift_jis_lead, shift_jis_trail)| { 1111 let mut lead = shift_jis_lead as usize; 1112 if shift_jis_lead >= 0xA0 { 1113 lead -= 0xC1 - 0x81; 1114 } 1115 // The next line would overflow u8. Letting it go over allows us to 1116 // subtract fewer times. 1117 lead <<= 1; 1118 // Bring it back to u8 range 1119 lead -= 0xE1; 1120 let trail = if shift_jis_trail >= 0x9F { 1121 lead += 1; 1122 shift_jis_trail - (0x9F - 0x21) 1123 } else if shift_jis_trail < 0x7F { 1124 shift_jis_trail - (0x40 - 0x21) 1125 } else { 1126 shift_jis_trail - (0x41 - 0x21) 1127 }; 1128 (lead as u8, trail) 1129 }) 1130} 1131 1132#[inline(always)] 1133pub fn jis0208_level2_and_additional_kanji_encode(bmp: u16) -> Option<usize> { 1134 // TODO: optimize 1135 position(&JIS0208_LEVEL2_AND_ADDITIONAL_KANJI[..], bmp) 1136} 1137 1138pub fn jis0208_symbol_decode(pointer: usize) -> Option<u16> { 1139 let mut i = 0; 1140 while i < JIS0208_SYMBOL_TRIPLES.len() { 1141 let start = JIS0208_SYMBOL_TRIPLES[i] as usize; 1142 let length = JIS0208_SYMBOL_TRIPLES[i + 1] as usize; 1143 let pointer_minus_start = pointer.wrapping_sub(start); 1144 if pointer_minus_start < length { 1145 let offset = JIS0208_SYMBOL_TRIPLES[i + 2] as usize; 1146 return Some(JIS0208_SYMBOLS[pointer_minus_start + offset]); 1147 } 1148 i += 3; 1149 } 1150 None 1151} 1152 1153/// Prefers Shift_JIS pointers for the three symbols that are in both ranges. 1154#[inline(always)] 1155pub fn jis0208_symbol_encode(bmp: u16) -> Option<usize> { 1156 let mut i = 0; 1157 while i < JIS0208_SYMBOL_TRIPLES.len() { 1158 let pointer_start = JIS0208_SYMBOL_TRIPLES[i] as usize; 1159 let length = JIS0208_SYMBOL_TRIPLES[i + 1] as usize; 1160 let symbol_start = JIS0208_SYMBOL_TRIPLES[i + 2] as usize; 1161 let symbol_end = symbol_start + length; 1162 let mut symbol_pos = symbol_start; 1163 while symbol_pos < symbol_end { 1164 if JIS0208_SYMBOLS[symbol_pos] == bmp { 1165 return Some(symbol_pos - symbol_start + pointer_start); 1166 } 1167 symbol_pos += 1; 1168 } 1169 i += 3; 1170 } 1171 None 1172} 1173 1174#[inline(always)] 1175pub fn ibm_symbol_encode(bmp: u16) -> Option<usize> { 1176 position(&JIS0208_SYMBOLS[IBM_SYMBOL_START..IBM_SYMBOL_END], bmp) 1177 .map(|x| x + IBM_SYMBOL_POINTER_START) 1178} 1179 1180#[inline(always)] 1181pub fn jis0208_range_decode(pointer: usize) -> Option<u16> { 1182 let mut i = 0; 1183 while i < JIS0208_RANGE_TRIPLES.len() { 1184 let start = JIS0208_RANGE_TRIPLES[i] as usize; 1185 let length = JIS0208_RANGE_TRIPLES[i + 1] as usize; 1186 let pointer_minus_start = pointer.wrapping_sub(start); 1187 if pointer_minus_start < length { 1188 let offset = JIS0208_RANGE_TRIPLES[i + 2] as usize; 1189 return Some((pointer_minus_start + offset) as u16); 1190 } 1191 i += 3; 1192 } 1193 None 1194} 1195 1196#[inline(always)] 1197pub fn jis0208_range_encode(bmp: u16) -> Option<usize> { 1198 let mut i = 0; 1199 while i < JIS0208_RANGE_TRIPLES.len() { 1200 let start = JIS0208_RANGE_TRIPLES[i + 2] as usize; 1201 let length = JIS0208_RANGE_TRIPLES[i + 1] as usize; 1202 let bmp_minus_start = (bmp as usize).wrapping_sub(start); 1203 if bmp_minus_start < length { 1204 let offset = JIS0208_RANGE_TRIPLES[i] as usize; 1205 return Some(bmp_minus_start + offset); 1206 } 1207 i += 3; 1208 } 1209 None 1210} 1211 1212pub fn jis0212_accented_decode(pointer: usize) -> Option<u16> { 1213 let mut i = 0; 1214 while i < JIS0212_ACCENTED_TRIPLES.len() { 1215 let start = JIS0212_ACCENTED_TRIPLES[i] as usize; 1216 let length = JIS0212_ACCENTED_TRIPLES[i + 1] as usize; 1217 let pointer_minus_start = pointer.wrapping_sub(start); 1218 if pointer_minus_start < length { 1219 let offset = JIS0212_ACCENTED_TRIPLES[i + 2] as usize; 1220 let candidate = JIS0212_ACCENTED[pointer_minus_start + offset]; 1221 if candidate == 0 { 1222 return None; 1223 } 1224 return Some(candidate); 1225 } 1226 i += 3; 1227 } 1228 None 1229} 1230 1231#[inline(always)] 1232pub fn big5_is_astral(rebased_pointer: usize) -> bool { 1233 (BIG5_ASTRALNESS[rebased_pointer >> 5] & (1 << (rebased_pointer & 0x1F))) != 0 1234} 1235 1236#[inline(always)] 1237pub fn big5_low_bits(rebased_pointer: usize) -> u16 { 1238 if rebased_pointer < BIG5_LOW_BITS.len() { 1239 BIG5_LOW_BITS[rebased_pointer] 1240 } else { 1241 0 1242 } 1243} 1244 1245#[inline(always)] 1246pub fn big5_astral_encode(low_bits: u16) -> Option<usize> { 1247 match low_bits { 1248 0x00CC => Some(11205 - 942), 1249 0x008A => Some(11207 - 942), 1250 0x7607 => Some(11213 - 942), 1251 _ => { 1252 let mut i = 18997 - 942; 1253 while i < BIG5_LOW_BITS.len() - 1 { 1254 if BIG5_LOW_BITS[i] == low_bits && big5_is_astral(i) { 1255 return Some(i); 1256 } 1257 i += 1; 1258 } 1259 None 1260 } 1261 } 1262} 1263 1264#[cfg(feature = "no-static-ideograph-encoder-tables")] 1265#[inline(always)] 1266pub fn big5_level1_hanzi_encode(bmp: u16) -> Option<(u8, u8)> { 1267 if super::in_inclusive_range16(bmp, 0x4E00, 0x9FB1) { 1268 if let Some(hanzi_pointer) = position(&BIG5_LOW_BITS[(5495 - 942)..(10951 - 942)], bmp) { 1269 let lead = hanzi_pointer / 157 + 0xA4; 1270 let remainder = hanzi_pointer % 157; 1271 let trail = if remainder < 0x3F { 1272 remainder + 0x40 1273 } else { 1274 remainder + 0x62 1275 }; 1276 return Some((lead as u8, trail as u8)); 1277 } 1278 match bmp { 1279 0x4E5A => { 1280 return Some((0xC8, 0x7B)); 1281 } 1282 0x5202 => { 1283 return Some((0xC8, 0x7D)); 1284 } 1285 0x9FB0 => { 1286 return Some((0xC8, 0xA1)); 1287 } 1288 0x5188 => { 1289 return Some((0xC8, 0xA2)); 1290 } 1291 0x9FB1 => { 1292 return Some((0xC8, 0xA3)); 1293 } 1294 _ => { 1295 return None; 1296 } 1297 } 1298 } 1299 None 1300} 1301 1302#[cfg(not(feature = "no-static-ideograph-encoder-tables"))] 1303#[inline(always)] 1304pub fn big5_level1_hanzi_encode(bmp: u16) -> Option<(u8, u8)> { 1305 if super::in_inclusive_range16(bmp, 0x4E00, 0x9FB1) { 1306 match BIG5_LEVEL1_HANZI_CODE_POINTS.binary_search(&bmp) { 1307 Ok(i) => { 1308 let pair = &BIG5_LEVEL1_HANZI_BYTES[i]; 1309 Some((pair[0], pair[1])) 1310 } 1311 Err(_) => None, 1312 } 1313 } else { 1314 None 1315 } 1316} 1317 1318#[inline(always)] 1319pub fn big5_box_encode(bmp: u16) -> Option<usize> { 1320 position(&BIG5_LOW_BITS[(18963 - 942)..(18992 - 942)], bmp).map(|x| x + 18963) 1321} 1322 1323#[inline(always)] 1324pub fn big5_other_encode(bmp: u16) -> Option<usize> { 1325 if 0x4491 == bmp { 1326 return Some(11209); 1327 } 1328 if let Some(pos) = position(&BIG5_LOW_BITS[(5024 - 942)..(5466 - 942)], bmp) { 1329 return Some(pos + 5024); 1330 } 1331 if let Some(pos) = position(&BIG5_LOW_BITS[(10896 - 942)..(11205 - 942)], bmp) { 1332 return Some(pos + 10896); 1333 } 1334 if let Some(pos) = position(&BIG5_LOW_BITS[(11254 - 942)..(18963 - 942)], bmp) { 1335 return Some(pos + 11254); 1336 } 1337 let mut i = 18996 - 942; 1338 while i < BIG5_LOW_BITS.len() { 1339 if BIG5_LOW_BITS[i] == bmp && !big5_is_astral(i) { 1340 return Some(i + 942); 1341 } 1342 i += 1; 1343 } 1344 None 1345} 1346 1347#[inline(always)] 1348pub fn mul_94(lead: u8) -> usize { 1349 lead as usize * 94 1350} 1351''') 1352 1353data_file.close() 1354 1355# Variant 1356 1357variant_file = open("src/variant.rs", "w") 1358variant_file.write('''// Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT 1359// file at the top-level directory of this distribution. 1360// 1361// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or 1362// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license 1363// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your 1364// option. This file may not be copied, modified, or distributed 1365// except according to those terms. 1366 1367// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT. 1368// Instead, please regenerate using generate-encoding-data.py 1369 1370//! This module provides enums that wrap the various decoders and encoders. 1371//! The purpose is to make `Decoder` and `Encoder` `Sized` by writing the 1372//! dispatch explicitly for a finite set of specialized decoders and encoders. 1373//! Unfortunately, this means the compiler doesn't generate the dispatch code 1374//! and it has to be written here instead. 1375//! 1376//! The purpose of making `Decoder` and `Encoder` `Sized` is to allow stack 1377//! allocation in Rust code, including the convenience methods on `Encoding`. 1378 1379''') 1380 1381encoding_variants = [u"single-byte",] 1382for encoding in multi_byte: 1383 if encoding["name"] in [u"UTF-16LE", u"UTF-16BE"]: 1384 continue 1385 else: 1386 encoding_variants.append(encoding["name"]) 1387encoding_variants.append(u"UTF-16") 1388 1389decoder_variants = [] 1390for variant in encoding_variants: 1391 if variant == u"GBK": 1392 continue 1393 decoder_variants.append(variant) 1394 1395encoder_variants = [] 1396for variant in encoding_variants: 1397 if variant in [u"replacement", u"GBK", u"UTF-16"]: 1398 continue 1399 encoder_variants.append(variant) 1400 1401for variant in decoder_variants: 1402 variant_file.write("use %s::*;\n" % to_snake_name(variant)) 1403 1404variant_file.write('''use super::*; 1405 1406pub enum VariantDecoder { 1407''') 1408 1409for variant in decoder_variants: 1410 variant_file.write(" %s(%sDecoder),\n" % (to_camel_name(variant), to_camel_name(variant))) 1411 1412variant_file.write('''} 1413 1414impl VariantDecoder { 1415''') 1416 1417def write_variant_method(name, mut, arg_list, ret, variants, excludes, kind): 1418 variant_file.write('''pub fn %s(&''' % name) 1419 if mut: 1420 variant_file.write('''mut ''') 1421 variant_file.write('''self''') 1422 for arg in arg_list: 1423 variant_file.write(''', %s: %s''' % (arg[0], arg[1])) 1424 variant_file.write(''')''') 1425 if ret: 1426 variant_file.write(''' -> %s''' % ret) 1427 variant_file.write(''' {\nmatch *self {\n''') 1428 for variant in variants: 1429 variant_file.write('''Variant%s::%s(ref ''' % (kind, to_camel_name(variant))) 1430 if mut: 1431 variant_file.write('''mut ''') 1432 if variant in excludes: 1433 variant_file.write('''v) => (),''') 1434 continue 1435 variant_file.write('''v) => v.%s(''' % name) 1436 first = True 1437 for arg in arg_list: 1438 if not first: 1439 variant_file.write(''', ''') 1440 first = False 1441 variant_file.write(arg[0]) 1442 variant_file.write('''),\n''') 1443 variant_file.write('''}\n}\n\n''') 1444 1445write_variant_method("max_utf16_buffer_length", False, [("byte_length", "usize")], "Option<usize>", decoder_variants, [], "Decoder") 1446 1447write_variant_method("max_utf8_buffer_length_without_replacement", False, [("byte_length", "usize")], "Option<usize>", decoder_variants, [], "Decoder") 1448 1449write_variant_method("max_utf8_buffer_length", False, [("byte_length", "usize")], "Option<usize>", decoder_variants, [], "Decoder") 1450 1451write_variant_method("decode_to_utf16_raw", True, [("src", "&[u8]"), 1452 ("dst", "&mut [u16]"), 1453 ("last", "bool")], "(DecoderResult, usize, usize)", decoder_variants, [], "Decoder") 1454 1455write_variant_method("decode_to_utf8_raw", True, [("src", "&[u8]"), 1456 ("dst", "&mut [u8]"), 1457 ("last", "bool")], "(DecoderResult, usize, usize)", decoder_variants, [], "Decoder") 1458 1459variant_file.write(''' 1460} 1461 1462pub enum VariantEncoder { 1463''') 1464 1465for variant in encoder_variants: 1466 variant_file.write(" %s(%sEncoder),\n" % (to_camel_name(variant), to_camel_name(variant))) 1467 1468variant_file.write('''} 1469 1470impl VariantEncoder { 1471 pub fn has_pending_state(&self) -> bool { 1472 match *self { 1473 VariantEncoder::Iso2022Jp(ref v) => { 1474 v.has_pending_state() 1475 } 1476 _ => false, 1477 } 1478 } 1479''') 1480 1481write_variant_method("max_buffer_length_from_utf16_without_replacement", False, [("u16_length", "usize")], "Option<usize>", encoder_variants, [], "Encoder") 1482 1483write_variant_method("max_buffer_length_from_utf8_without_replacement", False, [("byte_length", "usize")], "Option<usize>", encoder_variants, [], "Encoder") 1484 1485write_variant_method("encode_from_utf16_raw", True, [("src", "&[u16]"), 1486 ("dst", "&mut [u8]"), 1487 ("last", "bool")], "(EncoderResult, usize, usize)", encoder_variants, [], "Encoder") 1488 1489write_variant_method("encode_from_utf8_raw", True, [("src", "&str"), 1490 ("dst", "&mut [u8]"), 1491 ("last", "bool")], "(EncoderResult, usize, usize)", encoder_variants, [], "Encoder") 1492 1493 1494variant_file.write('''} 1495 1496pub enum VariantEncoding { 1497 SingleByte(&'static [u16; 128]),''') 1498 1499for encoding in multi_byte: 1500 variant_file.write("%s,\n" % to_camel_name(encoding["name"])) 1501 1502variant_file.write('''} 1503 1504impl VariantEncoding { 1505 pub fn new_variant_decoder(&self) -> VariantDecoder { 1506 match *self { 1507 VariantEncoding::SingleByte(table) => SingleByteDecoder::new(table), 1508 VariantEncoding::Utf8 => Utf8Decoder::new(), 1509 VariantEncoding::Gbk | VariantEncoding::Gb18030 => Gb18030Decoder::new(), 1510 VariantEncoding::Big5 => Big5Decoder::new(), 1511 VariantEncoding::EucJp => EucJpDecoder::new(), 1512 VariantEncoding::Iso2022Jp => Iso2022JpDecoder::new(), 1513 VariantEncoding::ShiftJis => ShiftJisDecoder::new(), 1514 VariantEncoding::EucKr => EucKrDecoder::new(), 1515 VariantEncoding::Replacement => ReplacementDecoder::new(), 1516 VariantEncoding::UserDefined => UserDefinedDecoder::new(), 1517 VariantEncoding::Utf16Be => Utf16Decoder::new(true), 1518 VariantEncoding::Utf16Le => Utf16Decoder::new(false), 1519 } 1520 } 1521 1522 pub fn new_encoder(&self, encoding: &'static Encoding) -> Encoder { 1523 match *self { 1524 VariantEncoding::SingleByte(table) => SingleByteEncoder::new(encoding, table), 1525 VariantEncoding::Utf8 => Utf8Encoder::new(encoding), 1526 VariantEncoding::Gbk => Gb18030Encoder::new(encoding, false), 1527 VariantEncoding::Gb18030 => Gb18030Encoder::new(encoding, true), 1528 VariantEncoding::Big5 => Big5Encoder::new(encoding), 1529 VariantEncoding::EucJp => EucJpEncoder::new(encoding), 1530 VariantEncoding::Iso2022Jp => Iso2022JpEncoder::new(encoding), 1531 VariantEncoding::ShiftJis => ShiftJisEncoder::new(encoding), 1532 VariantEncoding::EucKr => EucKrEncoder::new(encoding), 1533 VariantEncoding::UserDefined => UserDefinedEncoder::new(encoding), 1534 VariantEncoding::Utf16Be | VariantEncoding::Replacement | 1535 VariantEncoding::Utf16Le => unreachable!(), 1536 } 1537 } 1538} 1539''') 1540 1541variant_file.close() 1542 1543(ffi_rs_begin, ffi_rs_end) = read_non_generated("../encoding_c/src/lib.rs") 1544 1545ffi_file = open("../encoding_c/src/lib.rs", "w") 1546 1547ffi_file.write(ffi_rs_begin) 1548ffi_file.write(""" 1549// Instead, please regenerate using generate-encoding-data.py 1550 1551/// The minimum length of buffers that may be passed to `encoding_name()`. 1552pub const ENCODING_NAME_MAX_LENGTH: usize = %d; // %s 1553 1554""" % (longest_name_length, longest_name)) 1555 1556for name in preferred: 1557 ffi_file.write('''/// The %s encoding. 1558#[no_mangle] 1559pub static %s_ENCODING: ConstEncoding = ConstEncoding(&%s_INIT); 1560 1561''' % (to_dom_name(name), to_constant_name(name), to_constant_name(name))) 1562 1563ffi_file.write(ffi_rs_end) 1564ffi_file.close() 1565 1566(single_byte_rs_begin, single_byte_rs_end) = read_non_generated("src/single_byte.rs") 1567 1568single_byte_file = open("src/single_byte.rs", "w") 1569 1570single_byte_file.write(single_byte_rs_begin) 1571single_byte_file.write(""" 1572// Instead, please regenerate using generate-encoding-data.py 1573 1574 #[test] 1575 fn test_single_byte_decode() {""") 1576 1577for name in preferred: 1578 if name == u"ISO-8859-8-I": 1579 continue; 1580 if is_single_byte(name): 1581 single_byte_file.write(""" 1582 decode_single_byte(%s, %s_DATA);""" % (to_constant_name(name), to_constant_name(name))) 1583 1584single_byte_file.write(""" 1585 } 1586 1587 #[test] 1588 fn test_single_byte_encode() {""") 1589 1590for name in preferred: 1591 if name == u"ISO-8859-8-I": 1592 continue; 1593 if is_single_byte(name): 1594 single_byte_file.write(""" 1595 encode_single_byte(%s, %s_DATA);""" % (to_constant_name(name), to_constant_name(name))) 1596 1597 1598single_byte_file.write(""" 1599 } 1600""") 1601 1602single_byte_file.write(single_byte_rs_end) 1603single_byte_file.close() 1604 1605static_file = open("../encoding_c/include/encoding_rs_statics.h", "w") 1606 1607static_file.write("""// Copyright 2016 Mozilla Foundation. See the COPYRIGHT 1608// file at the top-level directory of this distribution. 1609// 1610// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or 1611// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license 1612// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your 1613// option. This file may not be copied, modified, or distributed 1614// except according to those terms. 1615 1616// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT. 1617// Instead, please regenerate using generate-encoding-data.py 1618 1619// This file is not meant to be included directly. Instead, encoding_rs.h 1620// includes this file. 1621 1622#ifndef encoding_rs_statics_h_ 1623#define encoding_rs_statics_h_ 1624 1625#ifndef ENCODING_RS_ENCODING 1626#define ENCODING_RS_ENCODING Encoding 1627#ifndef __cplusplus 1628typedef struct Encoding_ Encoding; 1629#endif 1630#endif 1631 1632#ifndef ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR 1633#define ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const ENCODING_RS_ENCODING* 1634#endif 1635 1636#ifndef ENCODING_RS_ENCODER 1637#define ENCODING_RS_ENCODER Encoder 1638#ifndef __cplusplus 1639typedef struct Encoder_ Encoder; 1640#endif 1641#endif 1642 1643#ifndef ENCODING_RS_DECODER 1644#define ENCODING_RS_DECODER Decoder 1645#ifndef __cplusplus 1646typedef struct Decoder_ Decoder; 1647#endif 1648#endif 1649 1650#define INPUT_EMPTY 0 1651 1652#define OUTPUT_FULL 0xFFFFFFFF 1653 1654// %s 1655#define ENCODING_NAME_MAX_LENGTH %d 1656 1657""" % (longest_name, longest_name_length)) 1658 1659for name in preferred: 1660 static_file.write('''/// The %s encoding. 1661extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const %s_ENCODING; 1662 1663''' % (to_dom_name(name), to_constant_name(name))) 1664 1665static_file.write("""#endif // encoding_rs_statics_h_ 1666""") 1667static_file.close() 1668 1669(utf_8_rs_begin, utf_8_rs_end) = read_non_generated("src/utf_8.rs") 1670 1671utf_8_file = open("src/utf_8.rs", "w") 1672 1673utf_8_file.write(utf_8_rs_begin) 1674utf_8_file.write(""" 1675// Instead, please regenerate using generate-encoding-data.py 1676 1677/// Bit is 1 if the trail is invalid. 1678static UTF8_TRAIL_INVALID: [u8; 256] = [""") 1679 1680for i in range(256): 1681 combined = 0 1682 if i < 0x80 or i > 0xBF: 1683 combined |= (1 << 3) 1684 if i < 0xA0 or i > 0xBF: 1685 combined |= (1 << 4) 1686 if i < 0x80 or i > 0x9F: 1687 combined |= (1 << 5) 1688 if i < 0x90 or i > 0xBF: 1689 combined |= (1 << 6) 1690 if i < 0x80 or i > 0x8F: 1691 combined |= (1 << 7) 1692 utf_8_file.write("%d," % combined) 1693 1694utf_8_file.write(""" 1695]; 1696""") 1697 1698utf_8_file.write(utf_8_rs_end) 1699utf_8_file.close() 1700 1701# Unit tests 1702 1703TEST_HEADER = '''Any copyright to the test code below this comment is dedicated to the 1704Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ 1705 1706This is a generated file. Please do not edit. 1707Instead, please regenerate using generate-encoding-data.py 1708''' 1709 1710index = indexes["jis0208"] 1711 1712jis0208_in_file = open("src/test_data/jis0208_in.txt", "w") 1713jis0208_in_file.write(TEST_HEADER) 1714for pointer in range(0, 94 * 94): 1715 (lead, trail) = divmod(pointer, 94) 1716 lead += 0xA1 1717 trail += 0xA1 1718 jis0208_in_file.write("%s%s\n" % (chr(lead), chr(trail))) 1719jis0208_in_file.close() 1720 1721jis0208_in_ref_file = open("src/test_data/jis0208_in_ref.txt", "w") 1722jis0208_in_ref_file.write(TEST_HEADER) 1723for pointer in range(0, 94 * 94): 1724 code_point = index[pointer] 1725 if code_point: 1726 jis0208_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8")) 1727 else: 1728 jis0208_in_ref_file.write(u"\uFFFD\n".encode("utf-8")) 1729jis0208_in_ref_file.close() 1730 1731jis0208_out_file = open("src/test_data/jis0208_out.txt", "w") 1732jis0208_out_ref_file = open("src/test_data/jis0208_out_ref.txt", "w") 1733jis0208_out_file.write(TEST_HEADER) 1734jis0208_out_ref_file.write(TEST_HEADER) 1735for pointer in range(0, 94 * 94): 1736 code_point = index[pointer] 1737 if code_point: 1738 revised_pointer = pointer 1739 if revised_pointer == 8644 or (revised_pointer >= 1207 and revised_pointer < 1220): 1740 revised_pointer = index.index(code_point) 1741 (lead, trail) = divmod(revised_pointer, 94) 1742 lead += 0xA1 1743 trail += 0xA1 1744 jis0208_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail))) 1745 jis0208_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8")) 1746jis0208_out_file.close() 1747jis0208_out_ref_file.close() 1748 1749shift_jis_in_file = open("src/test_data/shift_jis_in.txt", "w") 1750shift_jis_in_file.write(TEST_HEADER) 1751for pointer in range(0, len(index)): 1752 (lead, trail) = divmod(pointer, 188) 1753 lead += 0x81 if lead < 0x1F else 0xC1 1754 trail += 0x40 if trail < 0x3F else 0x41 1755 shift_jis_in_file.write("%s%s\n" % (chr(lead), chr(trail))) 1756shift_jis_in_file.close() 1757 1758shift_jis_in_ref_file = open("src/test_data/shift_jis_in_ref.txt", "w") 1759shift_jis_in_ref_file.write(TEST_HEADER) 1760for pointer in range(0, len(index)): 1761 code_point = 0xE000 - 8836 + pointer if pointer >= 8836 and pointer <= 10715 else index[pointer] 1762 if code_point: 1763 shift_jis_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8")) 1764 else: 1765 trail = pointer % 188 1766 trail += 0x40 if trail < 0x3F else 0x41 1767 if trail < 0x80: 1768 shift_jis_in_ref_file.write((u"\uFFFD%s\n" % unichr(trail)).encode("utf-8")) 1769 else: 1770 shift_jis_in_ref_file.write(u"\uFFFD\n".encode("utf-8")) 1771shift_jis_in_ref_file.close() 1772 1773shift_jis_out_file = open("src/test_data/shift_jis_out.txt", "w") 1774shift_jis_out_ref_file = open("src/test_data/shift_jis_out_ref.txt", "w") 1775shift_jis_out_file.write(TEST_HEADER) 1776shift_jis_out_ref_file.write(TEST_HEADER) 1777for pointer in range(0, 8272): 1778 code_point = index[pointer] 1779 if code_point: 1780 revised_pointer = pointer 1781 if revised_pointer >= 1207 and revised_pointer < 1220: 1782 revised_pointer = index.index(code_point) 1783 (lead, trail) = divmod(revised_pointer, 188) 1784 lead += 0x81 if lead < 0x1F else 0xC1 1785 trail += 0x40 if trail < 0x3F else 0x41 1786 shift_jis_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail))) 1787 shift_jis_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8")) 1788for pointer in range(8836, len(index)): 1789 code_point = index[pointer] 1790 if code_point: 1791 revised_pointer = index.index(code_point) 1792 if revised_pointer >= 8272 and revised_pointer < 8836: 1793 revised_pointer = pointer 1794 (lead, trail) = divmod(revised_pointer, 188) 1795 lead += 0x81 if lead < 0x1F else 0xC1 1796 trail += 0x40 if trail < 0x3F else 0x41 1797 shift_jis_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail))) 1798 shift_jis_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8")) 1799shift_jis_out_file.close() 1800shift_jis_out_ref_file.close() 1801 1802iso_2022_jp_in_file = open("src/test_data/iso_2022_jp_in.txt", "w") 1803iso_2022_jp_in_file.write(TEST_HEADER) 1804for pointer in range(0, 94 * 94): 1805 (lead, trail) = divmod(pointer, 94) 1806 lead += 0x21 1807 trail += 0x21 1808 iso_2022_jp_in_file.write("\x1B$B%s%s\x1B(B\n" % (chr(lead), chr(trail))) 1809iso_2022_jp_in_file.close() 1810 1811iso_2022_jp_in_ref_file = open("src/test_data/iso_2022_jp_in_ref.txt", "w") 1812iso_2022_jp_in_ref_file.write(TEST_HEADER) 1813for pointer in range(0, 94 * 94): 1814 code_point = index[pointer] 1815 if code_point: 1816 iso_2022_jp_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8")) 1817 else: 1818 iso_2022_jp_in_ref_file.write(u"\uFFFD\n".encode("utf-8")) 1819iso_2022_jp_in_ref_file.close() 1820 1821iso_2022_jp_out_file = open("src/test_data/iso_2022_jp_out.txt", "w") 1822iso_2022_jp_out_ref_file = open("src/test_data/iso_2022_jp_out_ref.txt", "w") 1823iso_2022_jp_out_file.write(TEST_HEADER) 1824iso_2022_jp_out_ref_file.write(TEST_HEADER) 1825for pointer in range(0, 94 * 94): 1826 code_point = index[pointer] 1827 if code_point: 1828 revised_pointer = pointer 1829 if revised_pointer == 8644 or (revised_pointer >= 1207 and revised_pointer < 1220): 1830 revised_pointer = index.index(code_point) 1831 (lead, trail) = divmod(revised_pointer, 94) 1832 lead += 0x21 1833 trail += 0x21 1834 iso_2022_jp_out_ref_file.write("\x1B$B%s%s\x1B(B\n" % (chr(lead), chr(trail))) 1835 iso_2022_jp_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8")) 1836for i in xrange(len(half_width_index)): 1837 code_point = i + 0xFF61 1838 normalized_code_point = half_width_index[i] 1839 pointer = index.index(normalized_code_point) 1840 (lead, trail) = divmod(pointer, 94) 1841 lead += 0x21 1842 trail += 0x21 1843 iso_2022_jp_out_ref_file.write("\x1B$B%s%s\x1B(B\n" % (chr(lead), chr(trail))) 1844 iso_2022_jp_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8")) 1845iso_2022_jp_out_file.close() 1846iso_2022_jp_out_ref_file.close() 1847 1848index = indexes["euc-kr"] 1849 1850euc_kr_in_file = open("src/test_data/euc_kr_in.txt", "w") 1851euc_kr_in_file.write(TEST_HEADER) 1852for pointer in range(0, len(index)): 1853 (lead, trail) = divmod(pointer, 190) 1854 lead += 0x81 1855 trail += 0x41 1856 euc_kr_in_file.write("%s%s\n" % (chr(lead), chr(trail))) 1857euc_kr_in_file.close() 1858 1859euc_kr_in_ref_file = open("src/test_data/euc_kr_in_ref.txt", "w") 1860euc_kr_in_ref_file.write(TEST_HEADER) 1861for pointer in range(0, len(index)): 1862 code_point = index[pointer] 1863 if code_point: 1864 euc_kr_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8")) 1865 else: 1866 trail = pointer % 190 1867 trail += 0x41 1868 if trail < 0x80: 1869 euc_kr_in_ref_file.write((u"\uFFFD%s\n" % unichr(trail)).encode("utf-8")) 1870 else: 1871 euc_kr_in_ref_file.write(u"\uFFFD\n".encode("utf-8")) 1872euc_kr_in_ref_file.close() 1873 1874euc_kr_out_file = open("src/test_data/euc_kr_out.txt", "w") 1875euc_kr_out_ref_file = open("src/test_data/euc_kr_out_ref.txt", "w") 1876euc_kr_out_file.write(TEST_HEADER) 1877euc_kr_out_ref_file.write(TEST_HEADER) 1878for pointer in range(0, len(index)): 1879 code_point = index[pointer] 1880 if code_point: 1881 (lead, trail) = divmod(pointer, 190) 1882 lead += 0x81 1883 trail += 0x41 1884 euc_kr_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail))) 1885 euc_kr_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8")) 1886euc_kr_out_file.close() 1887euc_kr_out_ref_file.close() 1888 1889index = indexes["gb18030"] 1890 1891gb18030_in_file = open("src/test_data/gb18030_in.txt", "w") 1892gb18030_in_file.write(TEST_HEADER) 1893for pointer in range(0, len(index)): 1894 (lead, trail) = divmod(pointer, 190) 1895 lead += 0x81 1896 trail += 0x40 if trail < 0x3F else 0x41 1897 gb18030_in_file.write("%s%s\n" % (chr(lead), chr(trail))) 1898gb18030_in_file.close() 1899 1900gb18030_in_ref_file = open("src/test_data/gb18030_in_ref.txt", "w") 1901gb18030_in_ref_file.write(TEST_HEADER) 1902for pointer in range(0, len(index)): 1903 code_point = index[pointer] 1904 if code_point: 1905 gb18030_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8")) 1906 else: 1907 trail = pointer % 190 1908 trail += 0x40 if trail < 0x3F else 0x41 1909 if trail < 0x80: 1910 gb18030_in_ref_file.write((u"\uFFFD%s\n" % unichr(trail)).encode("utf-8")) 1911 else: 1912 gb18030_in_ref_file.write(u"\uFFFD\n".encode("utf-8")) 1913gb18030_in_ref_file.close() 1914 1915gb18030_out_file = open("src/test_data/gb18030_out.txt", "w") 1916gb18030_out_ref_file = open("src/test_data/gb18030_out_ref.txt", "w") 1917gb18030_out_file.write(TEST_HEADER) 1918gb18030_out_ref_file.write(TEST_HEADER) 1919for pointer in range(0, len(index)): 1920 if pointer == 6555: 1921 continue 1922 code_point = index[pointer] 1923 if code_point: 1924 (lead, trail) = divmod(pointer, 190) 1925 lead += 0x81 1926 trail += 0x40 if trail < 0x3F else 0x41 1927 gb18030_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail))) 1928 gb18030_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8")) 1929gb18030_out_file.close() 1930gb18030_out_ref_file.close() 1931 1932index = indexes["big5"] 1933 1934big5_in_file = open("src/test_data/big5_in.txt", "w") 1935big5_in_file.write(TEST_HEADER) 1936for pointer in range(0, len(index)): 1937 (lead, trail) = divmod(pointer, 157) 1938 lead += 0x81 1939 trail += 0x40 if trail < 0x3F else 0x62 1940 big5_in_file.write("%s%s\n" % (chr(lead), chr(trail))) 1941big5_in_file.close() 1942 1943big5_two_characters = { 1944 1133: u"\u00CA\u0304", 1945 1135: u"\u00CA\u030C", 1946 1164: u"\u00EA\u0304", 1947 1166: u"\u00EA\u030C", 1948} 1949 1950big5_in_ref_file = open("src/test_data/big5_in_ref.txt", "w") 1951big5_in_ref_file.write(TEST_HEADER) 1952for pointer in range(0, len(index)): 1953 if pointer in big5_two_characters.keys(): 1954 big5_in_ref_file.write((u"%s\n" % big5_two_characters[pointer]).encode("utf-8")) 1955 continue 1956 code_point = index[pointer] 1957 if code_point: 1958 big5_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8")) 1959 else: 1960 trail = pointer % 157 1961 trail += 0x40 if trail < 0x3F else 0x62 1962 if trail < 0x80: 1963 big5_in_ref_file.write((u"\uFFFD%s\n" % unichr(trail)).encode("utf-8")) 1964 else: 1965 big5_in_ref_file.write(u"\uFFFD\n".encode("utf-8")) 1966big5_in_ref_file.close() 1967 1968prefer_last = [ 1969 0x2550, 1970 0x255E, 1971 0x2561, 1972 0x256A, 1973 0x5341, 1974 0x5345, 1975] 1976 1977pointer_for_prefer_last = [] 1978 1979for code_point in prefer_last: 1980 # Python lists don't have .rindex() :-( 1981 for i in xrange(len(index) - 1, -1, -1): 1982 candidate = index[i] 1983 if candidate == code_point: 1984 pointer_for_prefer_last.append(i) 1985 break 1986 1987big5_out_file = open("src/test_data/big5_out.txt", "w") 1988big5_out_ref_file = open("src/test_data/big5_out_ref.txt", "w") 1989big5_out_file.write(TEST_HEADER) 1990big5_out_ref_file.write(TEST_HEADER) 1991for pointer in range(((0xA1 - 0x81) * 157), len(index)): 1992 code_point = index[pointer] 1993 if code_point: 1994 if code_point in prefer_last: 1995 if pointer != pointer_for_prefer_last[prefer_last.index(code_point)]: 1996 continue 1997 else: 1998 if pointer != index.index(code_point): 1999 continue 2000 (lead, trail) = divmod(pointer, 157) 2001 lead += 0x81 2002 trail += 0x40 if trail < 0x3F else 0x62 2003 big5_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail))) 2004 big5_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8")) 2005big5_out_file.close() 2006big5_out_ref_file.close() 2007 2008index = indexes["jis0212"] 2009 2010jis0212_in_file = open("src/test_data/jis0212_in.txt", "w") 2011jis0212_in_file.write(TEST_HEADER) 2012for pointer in range(0, len(index)): 2013 (lead, trail) = divmod(pointer, 94) 2014 lead += 0xA1 2015 trail += 0xA1 2016 jis0212_in_file.write("\x8F%s%s\n" % (chr(lead), chr(trail))) 2017jis0212_in_file.close() 2018 2019jis0212_in_ref_file = open("src/test_data/jis0212_in_ref.txt", "w") 2020jis0212_in_ref_file.write(TEST_HEADER) 2021for pointer in range(0, len(index)): 2022 code_point = index[pointer] 2023 if code_point: 2024 jis0212_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8")) 2025 else: 2026 jis0212_in_ref_file.write(u"\uFFFD\n".encode("utf-8")) 2027jis0212_in_ref_file.close() 2028 2029subprocess.call(["cargo", "fmt"]) 2030