#!/usr/bin/python

# Copyright Mozilla Foundation. See the COPYRIGHT
# file at the top-level directory of this distribution.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

import json
import subprocess
import sys
import os.path

if (not os.path.isfile("../encoding/encodings.json")) or (not os.path.isfile("../encoding/indexes.json")):
    sys.stderr.write("This script needs a clone of https://github.com/whatwg/encoding/ (preferably at revision f381389) next to the encoding_rs directory.\n")
    sys.exit(-1)

if not os.path.isfile("../encoding_c/src/lib.rs"):
    sys.stderr.write("This script also writes the generated parts of the encoding_c crate and needs a clone of https://github.com/hsivonen/encoding_c next to the encoding_rs directory.\n")
    sys.exit(-1)

if not os.path.isfile("../codepage/src/lib.rs"):
    sys.stderr.write("This script also writes the generated parts of the codepage crate and needs a clone of https://github.com/hsivonen/codepage next to the encoding_rs directory.\n")
    sys.exit(-1)


def cmp_from_end(one, other):
    c = cmp(len(one), len(other))
    if c != 0:
        return c
    i = len(one) - 1
    while i >= 0:
        c = cmp(one[i], other[i])
        if c != 0:
            return c
        i -= 1
    return 0


class Label:
    def __init__(self, label, preferred):
        self.label = label
        self.preferred = preferred

    def __cmp__(self, other):
        return cmp_from_end(self.label, other.label)


class CodePage:
    def __init__(self, code_page, preferred):
        self.code_page = code_page
        self.preferred = preferred

    def __cmp__(self, other):
        return cmp(self.code_page, other.code_page)


def static_u16_table(name, data):
    data_file.write('''pub static %s: [u16; %d] = [
''' % (name, len(data)))

    for i in xrange(len(data)):
        data_file.write('0x%04X,\n' % data[i])

    data_file.write('''];

''')


def static_u16_table_from_indexable(name, data, item, feature):
    data_file.write('''#[cfg(all(
    feature = "less-slow-%s",
    not(feature = "fast-%s")
))]
static %s: [u16; %d] = [
''' % (feature, feature, name, len(data)))

    for i in xrange(len(data)):
        data_file.write('0x%04X,\n' % data[i][item])

    data_file.write('''];

''')


def static_u8_pair_table_from_indexable(name, data, item, feature):
    data_file.write('''#[cfg(all(
    feature = "less-slow-%s",
    not(feature = "fast-%s")
))]
static %s: [[u8; 2]; %d] = [
''' % (feature, feature, name, len(data)))

    for i in xrange(len(data)):
        data_file.write('[0x%02X, 0x%02X],\n' % data[i][item])

    data_file.write('''];

''')


def static_u8_pair_table(name, data, feature):
    data_file.write('''#[cfg(feature = "%s")]
static %s: [[u8; 2]; %d] = [
''' % (feature, name, len(data)))

    for i in xrange(len(data)):
        pair = data[i]
        if not pair:
            pair = (0, 0)
        data_file.write('[0x%02X, 0x%02X],\n' % pair)

    data_file.write('''];

''')


preferred = []

dom = []

labels = []

data = json.load(open("../encoding/encodings.json", "r"))

indexes = json.load(open("../encoding/indexes.json", "r"))

single_byte = []

multi_byte = []


def to_camel_name(name):
    if name == u"iso-8859-8-i":
        return u"Iso8I"
    if name.startswith(u"iso-8859-"):
        return name.replace(u"iso-8859-", u"Iso")
    return name.title().replace(u"X-", u"").replace(u"-", u"").replace(u"_", u"")


def to_constant_name(name):
    return name.replace(u"-", u"_").upper()


def to_snake_name(name):
    return name.replace(u"-", u"_").lower()


def to_dom_name(name):
    return name

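# Illustrative examples of what the name helpers above produce:
# to_constant_name("windows-1252") == "WINDOWS_1252",
# to_camel_name("windows-1252") == "Windows1252",
# to_snake_name("windows-1252") == "windows_1252",
# to_camel_name("iso-8859-2") == "Iso2".
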
# Guestimate based on
# https://w3techs.com/technologies/overview/character_encoding/all
# whose methodology is known to be bogus, but the results are credible for
# this purpose. UTF-16LE lifted up due to prevalence on Windows and
# "ANSI codepages" prioritized.
encodings_by_code_page_frequency = [
    "UTF-8",
    "UTF-16LE",
    "windows-1252",
    "windows-1251",
    "GBK",
    "Shift_JIS",
    "EUC-KR",
    "windows-1250",
    "windows-1256",
    "windows-1254",
    "Big5",
    "windows-874",
    "windows-1255",
    "windows-1253",
    "windows-1257",
    "windows-1258",
    "EUC-JP",
    "ISO-8859-2",
    "ISO-8859-15",
    "ISO-8859-7",
    "KOI8-R",
    "gb18030",
    "ISO-8859-5",
    "ISO-8859-8-I",
    "ISO-8859-4",
    "ISO-8859-6",
    "ISO-2022-JP",
    "KOI8-U",
    "ISO-8859-13",
    "ISO-8859-3",
    "UTF-16BE",
    "IBM866",
    "ISO-8859-10",
    "ISO-8859-8",
    "macintosh",
    "x-mac-cyrillic",
    "ISO-8859-14",
    "ISO-8859-16",
]

encodings_by_code_page = {
    932: "Shift_JIS",
    936: "GBK",
    949: "EUC-KR",
    950: "Big5",
    866: "IBM866",
    874: "windows-874",
    1200: "UTF-16LE",
    1201: "UTF-16BE",
    1250: "windows-1250",
    1251: "windows-1251",
    1252: "windows-1252",
    1253: "windows-1253",
    1254: "windows-1254",
    1255: "windows-1255",
    1256: "windows-1256",
    1257: "windows-1257",
    1258: "windows-1258",
    10000: "macintosh",
    10017: "x-mac-cyrillic",
    20866: "KOI8-R",
    20932: "EUC-JP",
    21866: "KOI8-U",
    28592: "ISO-8859-2",
    28593: "ISO-8859-3",
    28594: "ISO-8859-4",
    28595: "ISO-8859-5",
    28596: "ISO-8859-6",
    28597: "ISO-8859-7",
    28598: "ISO-8859-8",
    28600: "ISO-8859-10",
    28603: "ISO-8859-13",
    28604: "ISO-8859-14",
    28605: "ISO-8859-15",
    28606: "ISO-8859-16",
    38598: "ISO-8859-8-I",
    50221: "ISO-2022-JP",
    54936: "gb18030",
    65001: "UTF-8",
}

code_pages_by_encoding = {}

for code_page, encoding in encodings_by_code_page.iteritems():
    code_pages_by_encoding[encoding] = code_page

encoding_by_alias_code_page = {
    951: "Big5",
    10007: "x-mac-cyrillic",
    20936: "GBK",
    20949: "EUC-KR",
    21010: "UTF-16LE",  # Undocumented; needed by calamine for Excel compat
    28591: "windows-1252",
    28599: "windows-1254",
    28601: "windows-874",
    50220: "ISO-2022-JP",
    50222: "ISO-2022-JP",
    50225: "replacement",  # ISO-2022-KR
    50227: "replacement",  # ISO-2022-CN
    51932: "EUC-JP",
    51936: "GBK",
    51949: "EUC-KR",
    52936: "replacement",  # HZ
}

code_pages = []

for name in encodings_by_code_page_frequency:
    code_pages.append(code_pages_by_encoding[name])

encodings_by_code_page.update(encoding_by_alias_code_page)

temp_keys = encodings_by_code_page.keys()
temp_keys.sort()
for code_page in temp_keys:
    if not code_page in code_pages:
        code_pages.append(code_page)

# The position in the index (0 is the first index entry,
# i.e. byte value 0x80) that starts the longest run of
# consecutive code points. Must not be in the first
# quadrant. If the character to be encoded is not in this
# run, the part of the index after the run is searched
# forward. Then the part of the index from 32 to the start
# of the run. The first quadrant is searched last.
#
# If there is no obviously most useful longest run,
# the index here is just used to affect the search order.
start_of_longest_run_in_single_byte = {
    "IBM866": 96,  # 0 would be longest, but we don't want to start in the first quadrant
    "windows-874": 33,
    "windows-1250": 92,
    "windows-1251": 64,
    "windows-1252": 32,
    "windows-1253": 83,
    "windows-1254": 95,
    "windows-1255": 96,
    "windows-1256": 65,
    "windows-1257": 95,  # not actually longest
    "windows-1258": 95,  # not actually longest
    "macintosh": 106,  # useless
    "x-mac-cyrillic": 96,
    "KOI8-R": 64,  # not actually longest
    "KOI8-U": 64,  # not actually longest
    "ISO-8859-2": 95,  # not actually longest
    "ISO-8859-3": 95,  # not actually longest
    "ISO-8859-4": 95,  # not actually longest
    "ISO-8859-5": 46,
    "ISO-8859-6": 65,
    "ISO-8859-7": 83,
    "ISO-8859-8": 96,
    "ISO-8859-10": 90,  # not actually longest
    "ISO-8859-13": 95,  # not actually longest
    "ISO-8859-14": 95,
    "ISO-8859-15": 63,
    "ISO-8859-16": 95,  # not actually longest
}

#

for group in data:
    if group["heading"] == "Legacy single-byte encodings":
        single_byte = group["encodings"]
    else:
        multi_byte.extend(group["encodings"])
    for encoding in group["encodings"]:
        preferred.append(encoding["name"])
        for label in encoding["labels"]:
            labels.append(Label(label, encoding["name"]))

for name in preferred:
    dom.append(to_dom_name(name))

preferred.sort()
labels.sort()
dom.sort(cmp=cmp_from_end)

longest_label_length = 0
longest_name_length = 0
longest_label = None
longest_name = None

for name in preferred:
    if len(name) > longest_name_length:
        longest_name_length = len(name)
        longest_name = name

for label in labels:
    if len(label.label) > longest_label_length:
        longest_label_length = len(label.label)
        longest_label = label.label


def longest_run_for_single_byte(name):
    if name == u"ISO-8859-8-I":
        name = u"ISO-8859-8"
    index = indexes[name.lower()]
    run_byte_offset = start_of_longest_run_in_single_byte[name]
    run_bmp_offset = index[run_byte_offset]
    previous_code_point = run_bmp_offset
    run_length = 1
    while True:
        i = run_byte_offset + run_length
        if i == len(index):
            break
        code_point = index[i]
        if previous_code_point + 1 != code_point:
            break
        previous_code_point = code_point
        run_length += 1
    return (run_bmp_offset, run_byte_offset, run_length)

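# Worked example (derived from the WHATWG index data): for windows-1252 the
# run starts at index 32 (byte 0xA0) and covers the 96 consecutive code points
# U+00A0..U+00FF, so longest_run_for_single_byte("windows-1252") yields
# (0x00A0, 32, 96).
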
def is_single_byte(name):
    for encoding in single_byte:
        if name == encoding["name"]:
            return True
    return False


def read_non_generated(path):
    partially_generated_file = open(path, "r")
    full = partially_generated_file.read()
    partially_generated_file.close()

    generated_begin = "// BEGIN GENERATED CODE. PLEASE DO NOT EDIT."
    generated_end = "// END GENERATED CODE"

    generated_begin_index = full.find(generated_begin)
    if generated_begin_index < 0:
        sys.stderr.write("Can't find generated code start marker in %s. Exiting.\n" % path)
        sys.exit(-1)
    generated_end_index = full.find(generated_end)
    if generated_end_index < 0:
        sys.stderr.write("Can't find generated code end marker in %s. Exiting.\n" % path)
        sys.exit(-1)

    return (full[0:generated_begin_index + len(generated_begin)],
            full[generated_end_index:])


(lib_rs_begin, lib_rs_end) = read_non_generated("src/lib.rs")

label_file = open("src/lib.rs", "w")

label_file.write(lib_rs_begin)
label_file.write("""
// Instead, please regenerate using generate-encoding-data.py

const LONGEST_LABEL_LENGTH: usize = %d; // %s

""" % (longest_label_length, longest_label))

for name in preferred:
    variant = None
    if is_single_byte(name):
        (run_bmp_offset, run_byte_offset, run_length) = longest_run_for_single_byte(name)
        variant = "SingleByte(&data::SINGLE_BYTE_DATA.%s, 0x%04X, %d, %d)" % (to_snake_name(u"iso-8859-8" if name == u"ISO-8859-8-I" else name), run_bmp_offset, run_byte_offset, run_length)
    else:
        variant = to_camel_name(name)

    docfile = open("doc/%s.txt" % name, "r")
    doctext = docfile.read()
    docfile.close()

    label_file.write('''/// The initializer for the [%s](static.%s.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static %s_INIT: Encoding = Encoding {
    name: "%s",
    variant: VariantEncoding::%s,
};

/// The %s encoding.
///
%s///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static %s: &'static Encoding = &%s_INIT;

''' % (to_dom_name(name), to_constant_name(name), to_constant_name(name), to_dom_name(name), variant, to_dom_name(name), doctext, to_constant_name(name), to_constant_name(name)))

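# LABELS_SORTED and ENCODINGS_IN_LABEL_SORT below are parallel arrays: entry i
# of the second names the encoding that label i of the first resolves to. The
# labels are ordered by cmp_from_end (shorter labels first, then comparing
# characters from the end), which is presumably the order Encoding::for_label
# expects.
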
label_file.write("""static LABELS_SORTED: [&'static str; %d] = [
""" % len(labels))

for label in labels:
    label_file.write('''"%s",\n''' % label.label)

label_file.write("""];

static ENCODINGS_IN_LABEL_SORT: [&'static Encoding; %d] = [
""" % len(labels))

for label in labels:
    label_file.write('''&%s_INIT,\n''' % to_constant_name(label.preferred))

label_file.write('''];

''')
label_file.write(lib_rs_end)
label_file.close()

label_test_file = open("src/test_labels_names.rs", "w")
label_test_file.write('''// Any copyright to the test code below this comment is dedicated to the
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/

// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
// Instead, please regenerate using generate-encoding-data.py

use super::*;

#[test]
fn test_all_labels() {
''')

for label in labels:
    label_test_file.write('''assert_eq!(Encoding::for_label(b"%s"), Some(%s));\n''' % (label.label, to_constant_name(label.preferred)))

label_test_file.write('''}
''')
label_test_file.close()


def null_to_zero(code_point):
    if not code_point:
        code_point = 0
    return code_point


(data_rs_begin, data_rs_end) = read_non_generated("src/data.rs")

data_file = open("src/data.rs", "w")
data_file.write(data_rs_begin)
data_file.write('''
// Instead, please regenerate using generate-encoding-data.py

#[repr(align(64))] // Align to cache lines
pub struct SingleByteData {
''')

# Single-byte

for encoding in single_byte:
    name = encoding["name"]
    if name == u"ISO-8859-8-I":
        continue

    data_file.write('''    pub %s: [u16; 128],
''' % to_snake_name(name))

data_file.write('''}

pub static SINGLE_BYTE_DATA: SingleByteData = SingleByteData {
''')

for encoding in single_byte:
    name = encoding["name"]
    if name == u"ISO-8859-8-I":
        continue

    data_file.write('''    %s: [
''' % to_snake_name(name))

    for code_point in indexes[name.lower()]:
        data_file.write('0x%04X,\n' % null_to_zero(code_point))

    data_file.write('''],
''')

data_file.write('''};

''')

# Big5

index = indexes["big5"]

astralness = []
low_bits = []

for code_point in index[942:19782]:
    if code_point:
        astralness.append(1 if code_point > 0xFFFF else 0)
        low_bits.append(code_point & 0xFFFF)
    else:
        astralness.append(0)
        low_bits.append(0)

# pad length to multiple of 32
for j in xrange(32 - (len(astralness) % 32)):
    astralness.append(0)

data_file.write('''#[cfg_attr(feature = "cargo-clippy", allow(unreadable_literal))]
static BIG5_ASTRALNESS: [u32; %d] = [
''' % (len(astralness) / 32))

i = 0
while i < len(astralness):
    accu = 0
    for j in xrange(32):
        accu |= astralness[i + j] << j
    data_file.write('0x%08X,\n' % accu)
    i += 32

data_file.write('''];

''')

static_u16_table("BIG5_LOW_BITS", low_bits)

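# Each u32 in BIG5_ASTRALNESS packs the astral-plane flags for 32 consecutive
# pointers in the 942..19781 range (bit j corresponds to pointer 942 + i + j);
# a set bit means the mapped code point is above the BMP. BIG5_LOW_BITS holds
# the low 16 bits of every code point in the same pointer range, with unmapped
# pointers stored as zero.
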
# Encoder table for Level 1 Hanzi
# Note: If we were OK with doubling this table, we
# could use a directly-indexable table instead...
level1_hanzi_index = index[5495:10896]
level1_hanzi_pairs = []
for i in xrange(len(level1_hanzi_index)):
    hanzi_lead = (i / 157) + 0xA4
    hanzi_trail = (i % 157)
    hanzi_trail += 0x40 if hanzi_trail < 0x3F else 0x62
    level1_hanzi_pairs.append((level1_hanzi_index[i], (hanzi_lead, hanzi_trail)))
level1_hanzi_pairs.append((0x4E5A, (0xC8, 0x7B)))
level1_hanzi_pairs.append((0x5202, (0xC8, 0x7D)))
level1_hanzi_pairs.append((0x9FB0, (0xC8, 0xA1)))
level1_hanzi_pairs.append((0x5188, (0xC8, 0xA2)))
level1_hanzi_pairs.append((0x9FB1, (0xC8, 0xA3)))
level1_hanzi_pairs.sort(key=lambda x: x[0])

static_u16_table_from_indexable("BIG5_LEVEL1_HANZI_CODE_POINTS", level1_hanzi_pairs, 0, "big5-hanzi-encode")
static_u8_pair_table_from_indexable("BIG5_LEVEL1_HANZI_BYTES", level1_hanzi_pairs, 1, "big5-hanzi-encode")

# Fast Unified Ideograph encode
big5_unified_ideograph_bytes = [None] * (0x9FCC - 0x4E00)
for row in xrange(0x7E - 0x20):
    for column in xrange(157):
        pointer = 5024 + column + (row * 157)
        code_point = index[pointer]
        if code_point and code_point >= 0x4E00 and code_point <= 0x9FCB:
            unified_offset = code_point - 0x4E00
            unified_lead = 0xA1 + row
            unified_trail = (0x40 if column < 0x3F else 0x62) + column
            if code_point == 0x5341 or code_point == 0x5345 or not big5_unified_ideograph_bytes[unified_offset]:
                big5_unified_ideograph_bytes[unified_offset] = (unified_lead, unified_trail)

static_u8_pair_table("BIG5_UNIFIED_IDEOGRAPH_BYTES", big5_unified_ideograph_bytes, "fast-big5-hanzi-encode")

# JIS0208

index = indexes["jis0208"]

# JIS 0208 Level 1 Kanji
static_u16_table("JIS0208_LEVEL1_KANJI", index[1410:4375])

# JIS 0208 Level 2 Kanji and Additional Kanji
static_u16_table("JIS0208_LEVEL2_AND_ADDITIONAL_KANJI", index[4418:7808])

# IBM Kanji
static_u16_table("IBM_KANJI", index[8272:8632])

# Check that the other instance is the same
if index[8272:8632] != index[10744:11104]:
    raise Error()

# JIS 0208 symbols (all non-Kanji, non-range items)
symbol_index = []
symbol_triples = []
pointers_to_scan = [
    (0, 188),
    (658, 691),
    (1159, 1221),
]
in_run = False
run_start_pointer = 0
run_start_array_index = 0
for (start, end) in pointers_to_scan:
    for i in range(start, end):
        code_point = index[i]
        if in_run:
            if code_point:
                symbol_index.append(code_point)
            else:
                symbol_triples.append(run_start_pointer)
                symbol_triples.append(i - run_start_pointer)
                symbol_triples.append(run_start_array_index)
                in_run = False
        else:
            if code_point:
                in_run = True
                run_start_pointer = i
                run_start_array_index = len(symbol_index)
                symbol_index.append(code_point)
    if in_run:
        symbol_triples.append(run_start_pointer)
        symbol_triples.append(end - run_start_pointer)
        symbol_triples.append(run_start_array_index)
        in_run = False
if in_run:
    raise Error()

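# The symbol tables use a triple format: each run of consecutively mapped
# pointers contributes three u16s to JIS0208_SYMBOL_TRIPLES (written below):
# the first pointer of the run, the run length, and the offset of the run's
# first code point in JIS0208_SYMBOLS, so only the code points themselves
# need to be stored.
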
# Now add manually the two overlapping slices of
# index from the NEC/IBM extensions.
run_start_array_index = len(symbol_index)
symbol_index.extend(index[10736:10744])
# Later
symbol_triples.append(10736)
symbol_triples.append(8)
symbol_triples.append(run_start_array_index)
# Earlier
symbol_triples.append(8644)
symbol_triples.append(4)
symbol_triples.append(run_start_array_index)

static_u16_table("JIS0208_SYMBOLS", symbol_index)
static_u16_table("JIS0208_SYMBOL_TRIPLES", symbol_triples)

# Write down the magic numbers needed when preferring the earlier case
data_file.write('''const IBM_SYMBOL_START: usize = %d;''' % (run_start_array_index + 1))
data_file.write('''const IBM_SYMBOL_END: usize = %d;''' % (run_start_array_index + 4))
data_file.write('''const IBM_SYMBOL_POINTER_START: usize = %d;''' % 8645)

# JIS 0208 ranges (excluding kana)
range_triples = []
pointers_to_scan = [
    (188, 281),
    (470, 657),
    (1128, 1159),
    (8634, 8644),
    (10716, 10736),
]
in_run = False
run_start_pointer = 0
run_start_code_point = 0
previous_code_point = 0
for (start, end) in pointers_to_scan:
    for i in range(start, end):
        code_point = index[i]
        if in_run:
            if code_point:
                if previous_code_point + 1 != code_point:
                    range_triples.append(run_start_pointer)
                    range_triples.append(i - run_start_pointer)
                    range_triples.append(run_start_code_point)
                    run_start_pointer = i
                    run_start_code_point = code_point
                previous_code_point = code_point
            else:
                range_triples.append(run_start_pointer)
                range_triples.append(i - run_start_pointer)
                range_triples.append(run_start_code_point)
                run_start_pointer = 0
                run_start_code_point = 0
                previous_code_point = 0
                in_run = False
        else:
            if code_point:
                in_run = True
                run_start_pointer = i
                run_start_code_point = code_point
                previous_code_point = code_point
    if in_run:
        range_triples.append(run_start_pointer)
        range_triples.append(end - run_start_pointer)
        range_triples.append(run_start_code_point)
        run_start_pointer = 0
        run_start_code_point = 0
        previous_code_point = 0
        in_run = False
if in_run:
    raise Error()

static_u16_table("JIS0208_RANGE_TRIPLES", range_triples)

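# JIS0208_RANGE_TRIPLES uses the same three-u16 record shape, but for runs of
# consecutive code points: (first pointer of the run, run length, first code
# point of the run), so a code point inside a run can be computed from the
# pointer instead of being stored.
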
# Encoder table for Level 1 Kanji
# Note: If we were OK with 30 KB more footprint, we
# could use a directly-indexable table instead...
level1_kanji_index = index[1410:4375]
level1_kanji_pairs = []
for i in xrange(len(level1_kanji_index)):
    pointer = 1410 + i
    (lead, trail) = divmod(pointer, 188)
    lead += 0x81 if lead < 0x1F else 0xC1
    trail += 0x40 if trail < 0x3F else 0x41
    level1_kanji_pairs.append((level1_kanji_index[i], (lead, trail)))
level1_kanji_pairs.sort(key=lambda x: x[0])

static_u16_table_from_indexable("JIS0208_LEVEL1_KANJI_CODE_POINTS", level1_kanji_pairs, 0, "kanji-encode")
static_u8_pair_table_from_indexable("JIS0208_LEVEL1_KANJI_SHIFT_JIS_BYTES", level1_kanji_pairs, 1, "kanji-encode")

# Fast encoder table for Kanji
kanji_bytes = [None] * (0x9FA1 - 0x4E00)
for pointer in xrange(len(index)):
    code_point = index[pointer]
    if code_point and code_point >= 0x4E00 and code_point <= 0x9FA0:
        (lead, trail) = divmod(pointer, 188)
        lead += 0x81 if lead < 0x1F else 0xC1
        trail += 0x40 if trail < 0x3F else 0x41
        # unset the high bit of lead if IBM Kanji
        if pointer >= 8272:
            lead = lead & 0x7F
        kanji_bytes[code_point - 0x4E00] = (lead, trail)

static_u8_pair_table("JIS0208_KANJI_BYTES", kanji_bytes, "fast-kanji-encode")

# ISO-2022-JP half-width katakana

# index is still jis0208
half_width_index = indexes["iso-2022-jp-katakana"]

data_file.write('''pub static ISO_2022_JP_HALF_WIDTH_TRAIL: [u8; %d] = [
''' % len(half_width_index))

for i in xrange(len(half_width_index)):
    code_point = half_width_index[i]
    pointer = index.index(code_point)
    trail = pointer % 94 + 0x21
    data_file.write('0x%02X,\n' % trail)

data_file.write('''];

''')

# EUC-KR

index = indexes["euc-kr"]

# Unicode 1.1 Hangul above the old KS X 1001 block
# Compressed form takes 35% of uncompressed form
pointers = []
offsets = []
previous_code_point = 0
for row in xrange(0x20):
    for column in xrange(190):
        i = column + (row * 190)
        # Skip the gaps
        if (column >= 0x1A and column < 0x20) or (column >= 0x3A and column < 0x40):
            continue
        code_point = index[i]
        if previous_code_point > code_point:
            raise Error()
        if code_point - previous_code_point != 1:
            adjustment = 0
            if column >= 0x40:
                adjustment = 12
            elif column >= 0x20:
                adjustment = 6
            pointers.append(column - adjustment + (row * (190 - 12)))
            offsets.append(code_point)
        previous_code_point = code_point

static_u16_table("CP949_TOP_HANGUL_POINTERS", pointers)
static_u16_table("CP949_TOP_HANGUL_OFFSETS", offsets)

# Unicode 1.1 Hangul to the left of the old KS X 1001 block
pointers = []
offsets = []
previous_code_point = 0
for row in xrange(0x46 - 0x20):
    for column in xrange(190 - 94):
        i = 6080 + column + (row * 190)
        # Skip the gaps
        if (column >= 0x1A and column < 0x20) or (column >= 0x3A and column < 0x40):
            continue
        if i > 13127:
            # Exclude unassigned on partial last row
            break
        code_point = index[i]
        if previous_code_point > code_point:
            raise Error()
        if code_point - previous_code_point != 1:
            adjustment = 0
            if column >= 0x40:
                adjustment = 12
            elif column >= 0x20:
                adjustment = 6
            pointers.append(column - adjustment + (row * (190 - 94 - 12)))
            offsets.append(code_point)
        previous_code_point = code_point

static_u16_table("CP949_LEFT_HANGUL_POINTERS", pointers)
static_u16_table("CP949_LEFT_HANGUL_OFFSETS", offsets)

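# The POINTERS/OFFSETS table pairs above (and the analogous GBK/GB2312 tables
# below) record only the start of each run of consecutive code points: a
# compacted pointer and the code point it maps to. A code point inside a run
# can then presumably be recovered by finding the nearest recorded start at or
# before the pointer and adding the distance from that start.
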
# KS X 1001 Hangul
hangul_index = []
previous_code_point = 0
for row in xrange(0x48 - 0x2F):
    for column in xrange(94):
        code_point = index[9026 + column + (row * 190)]
        if previous_code_point >= code_point:
            raise Error()
        hangul_index.append(code_point)
        previous_code_point = code_point

static_u16_table("KSX1001_HANGUL", hangul_index)

# KS X 1001 Hanja
hanja_index = []
for row in xrange(0x7D - 0x49):
    for column in xrange(94):
        hanja_index.append(index[13966 + column + (row * 190)])

static_u16_table("KSX1001_HANJA", hanja_index)

# KS X 1001 symbols
symbol_index = []
for i in range(6176, 6270):
    symbol_index.append(index[i])
for i in range(6366, 6437):
    symbol_index.append(index[i])

static_u16_table("KSX1001_SYMBOLS", symbol_index)

# KS X 1001 Uppercase Latin
subindex = []
for i in range(7506, 7521):
    subindex.append(null_to_zero(index[i]))

static_u16_table("KSX1001_UPPERCASE", subindex)

# KS X 1001 Lowercase Latin
subindex = []
for i in range(7696, 7712):
    subindex.append(index[i])

static_u16_table("KSX1001_LOWERCASE", subindex)

# KS X 1001 Box drawing
subindex = []
for i in range(7126, 7194):
    subindex.append(index[i])

static_u16_table("KSX1001_BOX", subindex)

# KS X 1001 other
pointers = []
offsets = []
previous_code_point = 0
for row in xrange(10):
    for column in xrange(94):
        i = 6556 + column + (row * 190)
        code_point = index[i]
        # Exclude ranges that were processed as lookup tables
        # or that contain unmapped cells by filling them with
        # ASCII. Upon encode, ASCII code points will
        # never appear as the search key.
        if (i >= 6946 and i <= 6950):
            code_point = i - 6946
        elif (i >= 6961 and i <= 6967):
            code_point = i - 6961
        elif (i >= 6992 and i <= 6999):
            code_point = i - 6992
        elif (i >= 7024 and i <= 7029):
            code_point = i - 7024
        elif (i >= 7126 and i <= 7219):
            code_point = i - 7126
        elif (i >= 7395 and i <= 7409):
            code_point = i - 7395
        elif (i >= 7506 and i <= 7521):
            code_point = i - 7506
        elif (i >= 7696 and i <= 7711):
            code_point = i - 7696
        elif (i >= 7969 and i <= 7979):
            code_point = i - 7969
        elif (i >= 8162 and i <= 8169):
            code_point = i - 8162
        elif (i >= 8299 and i <= 8313):
            code_point = i - 8299
        elif (i >= 8347 and i <= 8359):
            code_point = i - 8347
        if code_point - previous_code_point != 1:
            pointers.append(column + (row * 94))
            offsets.append(code_point)
        previous_code_point = code_point

static_u16_table("KSX1001_OTHER_POINTERS", pointers)
# Omit the last offset, because the end of the last line
# is unmapped, so we don't want to look at it.
static_u16_table("KSX1001_OTHER_UNSORTED_OFFSETS", offsets[:-1])

# Fast Hangul and Hanja encode
hangul_bytes = [None] * (0xD7A4 - 0xAC00)
hanja_unified_bytes = [None] * (0x9F9D - 0x4E00)
hanja_compatibility_bytes = [None] * (0xFA0C - 0xF900)
for row in xrange(0x7D):
    for column in xrange(190):
        pointer = column + (row * 190)
        code_point = index[pointer]
        if code_point:
            lead = 0x81 + row
            trail = 0x41 + column
            if code_point >= 0xAC00 and code_point < 0xD7A4:
                hangul_bytes[code_point - 0xAC00] = (lead, trail)
            elif code_point >= 0x4E00 and code_point < 0x9F9D:
                hanja_unified_bytes[code_point - 0x4E00] = (lead, trail)
            elif code_point >= 0xF900 and code_point < 0xFA0C:
                hanja_compatibility_bytes[code_point - 0xF900] = (lead, trail)

static_u8_pair_table("CP949_HANGUL_BYTES", hangul_bytes, "fast-hangul-encode")
static_u8_pair_table("KSX1001_UNIFIED_HANJA_BYTES", hanja_unified_bytes, "fast-hanja-encode")
static_u8_pair_table("KSX1001_COMPATIBILITY_HANJA_BYTES", hanja_compatibility_bytes, "fast-hanja-encode")

# JIS 0212

index = indexes["jis0212"]

# JIS 0212 Kanji
static_u16_table("JIS0212_KANJI", index[1410:7211])

# JIS 0212 accented (all non-Kanji, non-range items)
symbol_index = []
symbol_triples = []
pointers_to_scan = [
    (0, 596),
    (608, 644),
    (656, 1409),
]
in_run = False
run_start_pointer = 0
run_start_array_index = 0
for (start, end) in pointers_to_scan:
    for i in range(start, end):
        code_point = index[i]
        if in_run:
            if code_point:
                symbol_index.append(code_point)
            elif index[i + 1]:
                symbol_index.append(0)
            else:
                symbol_triples.append(run_start_pointer)
                symbol_triples.append(i - run_start_pointer)
                symbol_triples.append(run_start_array_index)
                in_run = False
        else:
            if code_point:
                in_run = True
                run_start_pointer = i
                run_start_array_index = len(symbol_index)
                symbol_index.append(code_point)
    if in_run:
        symbol_triples.append(run_start_pointer)
        symbol_triples.append(end - run_start_pointer)
        symbol_triples.append(run_start_array_index)
        in_run = False
if in_run:
    raise Error()

static_u16_table("JIS0212_ACCENTED", symbol_index)
static_u16_table("JIS0212_ACCENTED_TRIPLES", symbol_triples)

# gb18030

index = indexes["gb18030"]

# Unicode 1.1 ideographs above the old GB2312 block
# Compressed form takes 63% of uncompressed form
pointers = []
offsets = []
previous_code_point = 0
for i in xrange(6080):
    code_point = index[i]
    if previous_code_point > code_point:
        raise Error()
    if code_point - previous_code_point != 1:
        pointers.append(i)
        offsets.append(code_point)
    previous_code_point = code_point

static_u16_table("GBK_TOP_IDEOGRAPH_POINTERS", pointers)
static_u16_table("GBK_TOP_IDEOGRAPH_OFFSETS", offsets)

# Unicode 1.1 ideographs to the left of the old GB2312 block
# Compressed form takes 40% of uncompressed form
pointers = []
offsets = []
previous_code_point = 0
for row in xrange(0x7D - 0x29):
    for column in xrange(190 - 94):
        i = 7790 + column + (row * 190)
        if i > 23650:
            # Exclude compatibility ideographs at the end
            break
        code_point = index[i]
        if previous_code_point > code_point:
            raise Error()
        if code_point - previous_code_point != 1:
            pointers.append(column + (row * (190 - 94)))
            offsets.append(code_point)
        previous_code_point = code_point

static_u16_table("GBK_LEFT_IDEOGRAPH_POINTERS", pointers)
static_u16_table("GBK_LEFT_IDEOGRAPH_OFFSETS", offsets)
# GBK other (excl. Ext A, Compat & PUA at the bottom)
pointers = []
offsets = []
previous_code_point = 0
for row in xrange(0x29 - 0x20):
    for column in xrange(190 - 94):
        i = 6080 + column + (row * 190)
        code_point = index[i]
        if code_point - previous_code_point != 1:
            pointers.append(column + (row * (190 - 94)))
            offsets.append(code_point)
        previous_code_point = code_point

pointers.append((190 - 94) * (0x29 - 0x20))
static_u16_table("GBK_OTHER_POINTERS", pointers)
static_u16_table("GBK_OTHER_UNSORTED_OFFSETS", offsets)

# GBK bottom: Compatibility ideographs, Ext A and PUA
bottom_index = []
# 5 compat following Unified Ideographs
for i in range(23651, 23656):
    bottom_index.append(index[i])
# Last row
for i in range(23750, 23846):
    bottom_index.append(index[i])

static_u16_table("GBK_BOTTOM", bottom_index)

# GB2312 Hanzi
# (and the 5 PUA code points in between Level 1 and Level 2)
hanzi_index = []
for row in xrange(0x77 - 0x2F):
    for column in xrange(94):
        hanzi_index.append(index[9026 + column + (row * 190)])

static_u16_table("GB2312_HANZI", hanzi_index)

# GB2312 symbols
symbol_index = []
for i in xrange(94):
    symbol_index.append(index[6176 + i])

static_u16_table("GB2312_SYMBOLS", symbol_index)

# GB2312 symbols on Greek row (incl. PUA)
symbol_index = []
for i in xrange(22):
    symbol_index.append(index[7189 + i])

static_u16_table("GB2312_SYMBOLS_AFTER_GREEK", symbol_index)

# GB2312 Pinyin
pinyin_index = []
for i in xrange(32):
    pinyin_index.append(index[7506 + i])

static_u16_table("GB2312_PINYIN", pinyin_index)

# GB2312 other (excl. bottom PUA)
pointers = []
offsets = []
previous_code_point = 0
for row in xrange(14):
    for column in xrange(94):
        i = 6366 + column + (row * 190)
        code_point = index[i]
        # Exclude the two ranges that were processed as
        # lookup tables above by filling them with
        # ASCII. Upon encode, ASCII code points will
        # never appear as the search key.
        if (i >= 7189 and i < 7189 + 22):
            code_point = i - 7189
        elif (i >= 7506 and i < 7506 + 32):
            code_point = i - 7506
        if code_point - previous_code_point != 1:
            pointers.append(column + (row * 94))
            offsets.append(code_point)
        previous_code_point = code_point

pointers.append(14 * 94)
static_u16_table("GB2312_OTHER_POINTERS", pointers)
static_u16_table("GB2312_OTHER_UNSORTED_OFFSETS", offsets)

# Non-gbk code points
pointers = []
offsets = []
for pair in indexes["gb18030-ranges"]:
    if pair[1] == 0x10000:
        break  # the last entry doesn't fit in u16
    pointers.append(pair[0])
    offsets.append(pair[1])

static_u16_table("GB18030_RANGE_POINTERS", pointers)
static_u16_table("GB18030_RANGE_OFFSETS", offsets)

# Encoder table for Level 1 Hanzi
# The units here really fit into 12 bits, but since we're
# looking for speed here, let's use 16 bits per unit.
# Once we use 16 bits per unit, we might as well precompute
# the output bytes.
level1_hanzi_index = hanzi_index[:(94 * (0xD8 - 0xB0) - 5)]
level1_hanzi_pairs = []
for i in xrange(len(level1_hanzi_index)):
    hanzi_lead = (i / 94) + 0xB0
    hanzi_trail = (i % 94) + 0xA1
    level1_hanzi_pairs.append((level1_hanzi_index[i], (hanzi_lead, hanzi_trail)))
level1_hanzi_pairs.sort(key=lambda x: x[0])

static_u16_table_from_indexable("GB2312_LEVEL1_HANZI_CODE_POINTS", level1_hanzi_pairs, 0, "gb-hanzi-encode")
static_u8_pair_table_from_indexable("GB2312_LEVEL1_HANZI_BYTES", level1_hanzi_pairs, 1, "gb-hanzi-encode")

# Fast Hanzi encoder table
hanzi_bytes = [None] * (0x9FA7 - 0x4E00)
for row in xrange(126):
    for column in xrange(190):
        pointer = column + (row * 190)
        code_point = index[pointer]
        if code_point and code_point >= 0x4E00 and code_point <= 0x9FA6:
            hanzi_lead = 0x81 + row
            hanzi_trail = column + (0x40 if column < 0x3F else 0x41)
            hanzi_bytes[code_point - 0x4E00] = (hanzi_lead, hanzi_trail)

static_u8_pair_table("GBK_HANZI_BYTES", hanzi_bytes, "fast-gb-hanzi-encode")

data_file.write(data_rs_end)

data_file.close()

# Variant

variant_file = open("src/variant.rs", "w")
variant_file.write('''// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
// Instead, please regenerate using generate-encoding-data.py

//! This module provides enums that wrap the various decoders and encoders.
//! The purpose is to make `Decoder` and `Encoder` `Sized` by writing the
//! dispatch explicitly for a finite set of specialized decoders and encoders.
//! Unfortunately, this means the compiler doesn't generate the dispatch code
//! and it has to be written here instead.
//!
//! The purpose of making `Decoder` and `Encoder` `Sized` is to allow stack
//! allocation in Rust code, including the convenience methods on `Encoding`.

''')

encoding_variants = [u"single-byte", ]
for encoding in multi_byte:
    if encoding["name"] in [u"UTF-16LE", u"UTF-16BE"]:
        continue
    else:
        encoding_variants.append(encoding["name"])
encoding_variants.append(u"UTF-16")

decoder_variants = []
for variant in encoding_variants:
    if variant == u"GBK":
        continue
    decoder_variants.append(variant)

encoder_variants = []
for variant in encoding_variants:
    if variant in [u"replacement", u"GBK", u"UTF-16"]:
        continue
    encoder_variants.append(variant)

for variant in decoder_variants:
    variant_file.write("use %s::*;\n" % to_snake_name(variant))

variant_file.write('''use super::*;

pub enum VariantDecoder {
''')

for variant in decoder_variants:
    variant_file.write("    %s(%sDecoder),\n" % (to_camel_name(variant), to_camel_name(variant)))

variant_file.write('''}

impl VariantDecoder {
''')


def write_variant_method(name, mut, arg_list, ret, variants, excludes, kind):
    variant_file.write('''pub fn %s(&''' % name)
    if mut:
        variant_file.write('''mut ''')
    variant_file.write('''self''')
    for arg in arg_list:
        variant_file.write(''', %s: %s''' % (arg[0], arg[1]))
    variant_file.write(''')''')
    if ret:
        variant_file.write(''' -> %s''' % ret)
    variant_file.write(''' {\nmatch *self {\n''')
    for variant in variants:
        variant_file.write('''Variant%s::%s(ref ''' % (kind, to_camel_name(variant)))
        if mut:
            variant_file.write('''mut ''')
        if variant in excludes:
            variant_file.write('''v) => (),''')
            continue
        variant_file.write('''v) => v.%s(''' % name)
        first = True
        for arg in arg_list:
            if not first:
                variant_file.write(''', ''')
            first = False
            variant_file.write(arg[0])
        variant_file.write('''),\n''')
    variant_file.write('''}\n}\n\n''')

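# For a decoder method, a call such as
#   write_variant_method("max_utf16_buffer_length", False,
#                        [("byte_length", "usize")], "Option<usize>",
#                        decoder_variants, [], "Decoder")
# emits (roughly, ignoring whitespace) a dispatch function of the form:
#   pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
#       match *self {
#           VariantDecoder::SingleByte(ref v) => v.max_utf16_buffer_length(byte_length),
#           ...
#       }
#   }
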
write_variant_method("max_utf16_buffer_length", False, [("byte_length", "usize")], "Option<usize>", decoder_variants, [], "Decoder")

write_variant_method("max_utf8_buffer_length_without_replacement", False, [("byte_length", "usize")], "Option<usize>", decoder_variants, [], "Decoder")

write_variant_method("max_utf8_buffer_length", False, [("byte_length", "usize")], "Option<usize>", decoder_variants, [], "Decoder")

write_variant_method("decode_to_utf16_raw", True, [("src", "&[u8]"),
                                                   ("dst", "&mut [u16]"),
                                                   ("last", "bool")], "(DecoderResult, usize, usize)", decoder_variants, [], "Decoder")

write_variant_method("decode_to_utf8_raw", True, [("src", "&[u8]"),
                                                  ("dst", "&mut [u8]"),
                                                  ("last", "bool")], "(DecoderResult, usize, usize)", decoder_variants, [], "Decoder")

variant_file.write('''

    pub fn latin1_byte_compatible_up_to(&self, buffer: &[u8]) -> Option<usize> {
        match *self {
            VariantDecoder::SingleByte(ref v) => {
                return Some(v.latin1_byte_compatible_up_to(buffer));
            }
            VariantDecoder::Utf8(ref v) => {
                if !v.in_neutral_state() {
                    return None;
                }
            }
            VariantDecoder::Gb18030(ref v) => {
                if !v.in_neutral_state() {
                    return None;
                }
            }
            VariantDecoder::Big5(ref v) => {
                if !v.in_neutral_state() {
                    return None;
                }
            }
            VariantDecoder::EucJp(ref v) => {
                if !v.in_neutral_state() {
                    return None;
                }
            }
            VariantDecoder::Iso2022Jp(ref v) => {
                if v.in_neutral_state() {
                    return Some(Encoding::iso_2022_jp_ascii_valid_up_to(buffer));
                }
                return None;
            }
            VariantDecoder::ShiftJis(ref v) => {
                if !v.in_neutral_state() {
                    return None;
                }
            }
            VariantDecoder::EucKr(ref v) => {
                if !v.in_neutral_state() {
                    return None;
                }
            }
            VariantDecoder::UserDefined(_) => {}
            VariantDecoder::Replacement(_) | VariantDecoder::Utf16(_) => {
                return None;
            }
        };
        Some(Encoding::ascii_valid_up_to(buffer))
    }
}

pub enum VariantEncoder {
''')

for variant in encoder_variants:
    variant_file.write("    %s(%sEncoder),\n" % (to_camel_name(variant), to_camel_name(variant)))

variant_file.write('''}

impl VariantEncoder {
    pub fn has_pending_state(&self) -> bool {
        match *self {
            VariantEncoder::Iso2022Jp(ref v) => {
                v.has_pending_state()
            }
            _ => false,
        }
    }
''')

write_variant_method("max_buffer_length_from_utf16_without_replacement", False, [("u16_length", "usize")], "Option<usize>", encoder_variants, [], "Encoder")

write_variant_method("max_buffer_length_from_utf8_without_replacement", False, [("byte_length", "usize")], "Option<usize>", encoder_variants, [], "Encoder")

write_variant_method("encode_from_utf16_raw", True, [("src", "&[u16]"),
                                                     ("dst", "&mut [u8]"),
                                                     ("last", "bool")], "(EncoderResult, usize, usize)", encoder_variants, [], "Encoder")

write_variant_method("encode_from_utf8_raw", True, [("src", "&str"),
                                                    ("dst", "&mut [u8]"),
                                                    ("last", "bool")], "(EncoderResult, usize, usize)", encoder_variants, [], "Encoder")

variant_file.write('''}

pub enum VariantEncoding {
    SingleByte(&'static [u16; 128], u16, u8, u8),''')

for encoding in multi_byte:
    variant_file.write("%s,\n" % to_camel_name(encoding["name"]))

variant_file.write('''}

impl VariantEncoding {
    pub fn new_variant_decoder(&self) -> VariantDecoder {
        match *self {
            VariantEncoding::SingleByte(table, _, _, _) => SingleByteDecoder::new(table),
            VariantEncoding::Utf8 => Utf8Decoder::new(),
            VariantEncoding::Gbk | VariantEncoding::Gb18030 => Gb18030Decoder::new(),
            VariantEncoding::Big5 => Big5Decoder::new(),
            VariantEncoding::EucJp => EucJpDecoder::new(),
            VariantEncoding::Iso2022Jp => Iso2022JpDecoder::new(),
            VariantEncoding::ShiftJis => ShiftJisDecoder::new(),
            VariantEncoding::EucKr => EucKrDecoder::new(),
            VariantEncoding::Replacement => ReplacementDecoder::new(),
            VariantEncoding::UserDefined => UserDefinedDecoder::new(),
            VariantEncoding::Utf16Be => Utf16Decoder::new(true),
            VariantEncoding::Utf16Le => Utf16Decoder::new(false),
        }
    }

    pub fn new_encoder(&self, encoding: &'static Encoding) -> Encoder {
        match *self {
            VariantEncoding::SingleByte(table, run_bmp_offset, run_byte_offset, run_length) => SingleByteEncoder::new(encoding, table, run_bmp_offset, run_byte_offset, run_length),
            VariantEncoding::Utf8 => Utf8Encoder::new(encoding),
            VariantEncoding::Gbk => Gb18030Encoder::new(encoding, false),
            VariantEncoding::Gb18030 => Gb18030Encoder::new(encoding, true),
            VariantEncoding::Big5 => Big5Encoder::new(encoding),
            VariantEncoding::EucJp => EucJpEncoder::new(encoding),
            VariantEncoding::Iso2022Jp => Iso2022JpEncoder::new(encoding),
            VariantEncoding::ShiftJis => ShiftJisEncoder::new(encoding),
            VariantEncoding::EucKr => EucKrEncoder::new(encoding),
            VariantEncoding::UserDefined => UserDefinedEncoder::new(encoding),
            VariantEncoding::Utf16Be | VariantEncoding::Replacement |
            VariantEncoding::Utf16Le => unreachable!(),
        }
    }

    pub fn is_single_byte(&self) -> bool {
        match *self {
            VariantEncoding::SingleByte(_, _, _, _) | VariantEncoding::UserDefined => true,
            _ => false,
        }
    }
}
''')

variant_file.close()

(ffi_rs_begin, ffi_rs_end) = read_non_generated("../encoding_c/src/lib.rs")

ffi_file = open("../encoding_c/src/lib.rs", "w")

ffi_file.write(ffi_rs_begin)
ffi_file.write("""
// Instead, please regenerate using generate-encoding-data.py

/// The minimum length of buffers that may be passed to `encoding_name()`.
pub const ENCODING_NAME_MAX_LENGTH: usize = %d; // %s

""" % (longest_name_length, longest_name))

for name in preferred:
    ffi_file.write('''/// The %s encoding.
#[no_mangle]
pub static %s_ENCODING: ConstEncoding = ConstEncoding(&%s_INIT);

''' % (to_dom_name(name), to_constant_name(name), to_constant_name(name)))

ffi_file.write(ffi_rs_end)
ffi_file.close()

(single_byte_rs_begin, single_byte_rs_end) = read_non_generated("src/single_byte.rs")

single_byte_file = open("src/single_byte.rs", "w")

single_byte_file.write(single_byte_rs_begin)
single_byte_file.write("""
// Instead, please regenerate using generate-encoding-data.py

    #[test]
    fn test_single_byte_decode() {""")

idx = 0  # for Miri, return after 2nd test
for name in preferred:
    if name == u"ISO-8859-8-I":
        continue
    if is_single_byte(name):
        single_byte_file.write("""
        decode_single_byte(%s, &data::SINGLE_BYTE_DATA.%s);""" % (to_constant_name(name), to_snake_name(name)))
        idx += 1
        if idx == 2:
            single_byte_file.write("""
        if cfg!(miri) {
            // Miri is too slow
            return;
        }""")

single_byte_file.write("""
    }

    #[test]
    fn test_single_byte_encode() {""")


idx = 0  # for Miri, return after 2nd test
for name in preferred:
    if name == u"ISO-8859-8-I":
        continue
    if is_single_byte(name):
        single_byte_file.write("""
        encode_single_byte(%s, &data::SINGLE_BYTE_DATA.%s);""" % (to_constant_name(name), to_snake_name(name)))
        idx += 1
        if idx == 2:
            single_byte_file.write("""
        if cfg!(miri) {
            // Miri is too slow
            return;
        }""")


single_byte_file.write("""
    }
""")

single_byte_file.write(single_byte_rs_end)
single_byte_file.close()

static_file = open("../encoding_c/include/encoding_rs_statics.h", "w")

static_file.write("""// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
// Instead, please regenerate using generate-encoding-data.py

// This file is not meant to be included directly. Instead, encoding_rs.h
// includes this file.

#ifndef encoding_rs_statics_h_
#define encoding_rs_statics_h_

#ifndef ENCODING_RS_ENCODING
#define ENCODING_RS_ENCODING Encoding
#ifndef __cplusplus
typedef struct Encoding_ Encoding;
#endif
#endif

#ifndef ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR
#define ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const ENCODING_RS_ENCODING*
#endif

#ifndef ENCODING_RS_ENCODER
#define ENCODING_RS_ENCODER Encoder
#ifndef __cplusplus
typedef struct Encoder_ Encoder;
#endif
#endif

#ifndef ENCODING_RS_DECODER
#define ENCODING_RS_DECODER Decoder
#ifndef __cplusplus
typedef struct Decoder_ Decoder;
#endif
#endif

#define INPUT_EMPTY 0

#define OUTPUT_FULL 0xFFFFFFFF

// %s
#define ENCODING_NAME_MAX_LENGTH %d

""" % (longest_name, longest_name_length))

for name in preferred:
    static_file.write('''/// The %s encoding.
extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const %s_ENCODING;

''' % (to_dom_name(name), to_constant_name(name)))

static_file.write("""#endif // encoding_rs_statics_h_
""")
static_file.close()

(utf_8_rs_begin, utf_8_rs_end) = read_non_generated("src/utf_8.rs")

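# The UTF8_DATA.table generated below has two blocks. The first 256 entries
# describe each possible trail byte: bit 3 is set when the byte is NOT a valid
# normal trail (0x80-0xBF), and bits 4-7 are set when it falls outside the
# special lower/upper trail ranges used after E0, ED, F0 and F4 leads. The
# second block (bytes 0x80-0xFF considered as leads) holds exactly one bit per
# byte: the trail class that lead requires, or bit 2 for an invalid lead. The
# decoder can presumably AND a lead's entry with a trail's entry and treat a
# non-zero result as an error.
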
utf_8_file = open("src/utf_8.rs", "w")

utf_8_file.write(utf_8_rs_begin)
utf_8_file.write("""
// Instead, please regenerate using generate-encoding-data.py

pub static UTF8_DATA: Utf8Data = Utf8Data {
    table: [
""")

for i in range(256):
    combined = (1 << 2)  # invalid lead
    if i < 0x80 or i > 0xBF:
        combined |= (1 << 3)  # normal trail
    if i < 0xA0 or i > 0xBF:
        combined |= (1 << 4)  # three-byte special lower bound
    if i < 0x80 or i > 0x9F:
        combined |= (1 << 5)  # three-byte special upper bound
    if i < 0x90 or i > 0xBF:
        combined |= (1 << 6)  # four-byte special lower bound
    if i < 0x80 or i > 0x8F:
        combined |= (1 << 7)  # four-byte special upper bound
    utf_8_file.write("%d," % combined)

for i in range(128, 256):
    lane = (1 << 2)  # invalid lead
    if i >= 0xC2 and i <= 0xDF:
        lane = (1 << 3)  # normal trail
    elif i == 0xE0:
        lane = (1 << 4)  # three-byte special lower bound
    elif i >= 0xE1 and i <= 0xEC:
        lane = (1 << 3)  # normal trail
    elif i == 0xED:
        lane = (1 << 5)  # three-byte special upper bound
    elif i >= 0xEE and i <= 0xEF:
        lane = (1 << 3)  # normal trail
    elif i == 0xF0:
        lane = (1 << 6)  # four-byte special lower bound
    elif i >= 0xF1 and i <= 0xF3:
        lane = (1 << 3)  # normal trail
    elif i == 0xF4:
        lane = (1 << 7)  # four-byte special upper bound
    utf_8_file.write("%d," % lane)

utf_8_file.write("""
    ],
};

""")

utf_8_file.write(utf_8_rs_end)
utf_8_file.close()

# Unit tests

TEST_HEADER = '''Any copyright to the test code below this comment is dedicated to the
Public Domain. http://creativecommons.org/publicdomain/zero/1.0/

This is a generated file. Please do not edit.
Instead, please regenerate using generate-encoding-data.py
'''

index = indexes["jis0208"]

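# The test data files below come in pairs: each *_in.txt holds encoded byte
# sequences (one test case per line) with *_in_ref.txt holding the expected
# UTF-8 decode of each line, and each *_out.txt holds UTF-8 text with
# *_out_ref.txt holding the bytes it is expected to encode to.
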
jis0208_in_file = open("src/test_data/jis0208_in.txt", "w")
jis0208_in_file.write(TEST_HEADER)
for pointer in range(0, 94 * 94):
    (lead, trail) = divmod(pointer, 94)
    lead += 0xA1
    trail += 0xA1
    jis0208_in_file.write("%s%s\n" % (chr(lead), chr(trail)))
jis0208_in_file.close()

jis0208_in_ref_file = open("src/test_data/jis0208_in_ref.txt", "w")
jis0208_in_ref_file.write(TEST_HEADER)
for pointer in range(0, 94 * 94):
    code_point = index[pointer]
    if code_point:
        jis0208_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
    else:
        jis0208_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
jis0208_in_ref_file.close()

jis0208_out_file = open("src/test_data/jis0208_out.txt", "w")
jis0208_out_ref_file = open("src/test_data/jis0208_out_ref.txt", "w")
jis0208_out_file.write(TEST_HEADER)
jis0208_out_ref_file.write(TEST_HEADER)
for pointer in range(0, 94 * 94):
    code_point = index[pointer]
    if code_point:
        revised_pointer = pointer
        if revised_pointer == 8644 or (revised_pointer >= 1207 and revised_pointer < 1220):
            revised_pointer = index.index(code_point)
        (lead, trail) = divmod(revised_pointer, 94)
        lead += 0xA1
        trail += 0xA1
        jis0208_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
        jis0208_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
jis0208_out_file.close()
jis0208_out_ref_file.close()

shift_jis_in_file = open("src/test_data/shift_jis_in.txt", "w")
shift_jis_in_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    (lead, trail) = divmod(pointer, 188)
    lead += 0x81 if lead < 0x1F else 0xC1
    trail += 0x40 if trail < 0x3F else 0x41
    shift_jis_in_file.write("%s%s\n" % (chr(lead), chr(trail)))
shift_jis_in_file.close()

shift_jis_in_ref_file = open("src/test_data/shift_jis_in_ref.txt", "w")
shift_jis_in_ref_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    code_point = 0xE000 - 8836 + pointer if pointer >= 8836 and pointer <= 10715 else index[pointer]
    if code_point:
        shift_jis_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
    else:
        trail = pointer % 188
        trail += 0x40 if trail < 0x3F else 0x41
        if trail < 0x80:
            shift_jis_in_ref_file.write((u"\uFFFD%s\n" % unichr(trail)).encode("utf-8"))
        else:
            shift_jis_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
shift_jis_in_ref_file.close()

shift_jis_out_file = open("src/test_data/shift_jis_out.txt", "w")
shift_jis_out_ref_file = open("src/test_data/shift_jis_out_ref.txt", "w")
shift_jis_out_file.write(TEST_HEADER)
shift_jis_out_ref_file.write(TEST_HEADER)
for pointer in range(0, 8272):
    code_point = index[pointer]
    if code_point:
        revised_pointer = pointer
        if revised_pointer >= 1207 and revised_pointer < 1220:
            revised_pointer = index.index(code_point)
        (lead, trail) = divmod(revised_pointer, 188)
        lead += 0x81 if lead < 0x1F else 0xC1
        trail += 0x40 if trail < 0x3F else 0x41
        shift_jis_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
        shift_jis_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
for pointer in range(8836, len(index)):
    code_point = index[pointer]
    if code_point:
        revised_pointer = index.index(code_point)
        if revised_pointer >= 8272 and revised_pointer < 8836:
            revised_pointer = pointer
        (lead, trail) = divmod(revised_pointer, 188)
        lead += 0x81 if lead < 0x1F else 0xC1
        trail += 0x40 if trail < 0x3F else 0x41
        shift_jis_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
        shift_jis_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
shift_jis_out_file.close()
shift_jis_out_ref_file.close()

iso_2022_jp_in_file = open("src/test_data/iso_2022_jp_in.txt", "w")
iso_2022_jp_in_file.write(TEST_HEADER)
for pointer in range(0, 94 * 94):
    (lead, trail) = divmod(pointer, 94)
    lead += 0x21
    trail += 0x21
    iso_2022_jp_in_file.write("\x1B$B%s%s\x1B(B\n" % (chr(lead), chr(trail)))
iso_2022_jp_in_file.close()

iso_2022_jp_in_ref_file = open("src/test_data/iso_2022_jp_in_ref.txt", "w")
iso_2022_jp_in_ref_file.write(TEST_HEADER)
for pointer in range(0, 94 * 94):
    code_point = index[pointer]
    if code_point:
        iso_2022_jp_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
    else:
        iso_2022_jp_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
iso_2022_jp_in_ref_file.close()

iso_2022_jp_out_file = open("src/test_data/iso_2022_jp_out.txt", "w")
iso_2022_jp_out_ref_file = open("src/test_data/iso_2022_jp_out_ref.txt", "w")
iso_2022_jp_out_file.write(TEST_HEADER)
iso_2022_jp_out_ref_file.write(TEST_HEADER)
for pointer in range(0, 94 * 94):
    code_point = index[pointer]
    if code_point:
        revised_pointer = pointer
        if revised_pointer == 8644 or (revised_pointer >= 1207 and revised_pointer < 1220):
            revised_pointer = index.index(code_point)
        (lead, trail) = divmod(revised_pointer, 94)
        lead += 0x21
        trail += 0x21
        iso_2022_jp_out_ref_file.write("\x1B$B%s%s\x1B(B\n" % (chr(lead), chr(trail)))
        iso_2022_jp_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
for i in xrange(len(half_width_index)):
    code_point = i + 0xFF61
    normalized_code_point = half_width_index[i]
    pointer = index.index(normalized_code_point)
    (lead, trail) = divmod(pointer, 94)
    lead += 0x21
    trail += 0x21
    iso_2022_jp_out_ref_file.write("\x1B$B%s%s\x1B(B\n" % (chr(lead), chr(trail)))
    iso_2022_jp_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
iso_2022_jp_out_file.close()
iso_2022_jp_out_ref_file.close()

index = indexes["euc-kr"]

euc_kr_in_file = open("src/test_data/euc_kr_in.txt", "w")
euc_kr_in_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    (lead, trail) = divmod(pointer, 190)
    lead += 0x81
    trail += 0x41
    euc_kr_in_file.write("%s%s\n" % (chr(lead), chr(trail)))
euc_kr_in_file.close()

euc_kr_in_ref_file = open("src/test_data/euc_kr_in_ref.txt", "w")
euc_kr_in_ref_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    code_point = index[pointer]
    if code_point:
        euc_kr_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
    else:
        trail = pointer % 190
        trail += 0x41
        if trail < 0x80:
            euc_kr_in_ref_file.write((u"\uFFFD%s\n" % unichr(trail)).encode("utf-8"))
        else:
            euc_kr_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
euc_kr_in_ref_file.close()

euc_kr_out_file = open("src/test_data/euc_kr_out.txt", "w")
euc_kr_out_ref_file = open("src/test_data/euc_kr_out_ref.txt", "w")
euc_kr_out_file.write(TEST_HEADER)
euc_kr_out_file = open("src/test_data/euc_kr_out.txt", "w")
euc_kr_out_ref_file = open("src/test_data/euc_kr_out_ref.txt", "w")
euc_kr_out_file.write(TEST_HEADER)
euc_kr_out_ref_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    code_point = index[pointer]
    if code_point:
        (lead, trail) = divmod(pointer, 190)
        lead += 0x81
        trail += 0x41
        euc_kr_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
        euc_kr_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
euc_kr_out_file.close()
euc_kr_out_ref_file.close()

index = indexes["gb18030"]

gb18030_in_file = open("src/test_data/gb18030_in.txt", "w")
gb18030_in_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    (lead, trail) = divmod(pointer, 190)
    lead += 0x81
    trail += 0x40 if trail < 0x3F else 0x41
    gb18030_in_file.write("%s%s\n" % (chr(lead), chr(trail)))
gb18030_in_file.close()

gb18030_in_ref_file = open("src/test_data/gb18030_in_ref.txt", "w")
gb18030_in_ref_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    code_point = index[pointer]
    if code_point:
        gb18030_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
    else:
        trail = pointer % 190
        trail += 0x40 if trail < 0x3F else 0x41
        if trail < 0x80:
            gb18030_in_ref_file.write((u"\uFFFD%s\n" % unichr(trail)).encode("utf-8"))
        else:
            gb18030_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
gb18030_in_ref_file.close()

gb18030_out_file = open("src/test_data/gb18030_out.txt", "w")
gb18030_out_ref_file = open("src/test_data/gb18030_out_ref.txt", "w")
gb18030_out_file.write(TEST_HEADER)
gb18030_out_ref_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    if pointer == 6555:
        continue
    code_point = index[pointer]
    if code_point:
        (lead, trail) = divmod(pointer, 190)
        lead += 0x81
        trail += 0x40 if trail < 0x3F else 0x41
        gb18030_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
        gb18030_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
gb18030_out_file.close()
gb18030_out_ref_file.close()

index = indexes["big5"]

big5_in_file = open("src/test_data/big5_in.txt", "w")
big5_in_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    (lead, trail) = divmod(pointer, 157)
    lead += 0x81
    trail += 0x40 if trail < 0x3F else 0x62
    big5_in_file.write("%s%s\n" % (chr(lead), chr(trail)))
big5_in_file.close()

big5_two_characters = {
    1133: u"\u00CA\u0304",
    1135: u"\u00CA\u030C",
    1164: u"\u00EA\u0304",
    1166: u"\u00EA\u030C",
}

big5_in_ref_file = open("src/test_data/big5_in_ref.txt", "w")
big5_in_ref_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    if pointer in big5_two_characters.keys():
        big5_in_ref_file.write((u"%s\n" % big5_two_characters[pointer]).encode("utf-8"))
        continue
    code_point = index[pointer]
    if code_point:
        big5_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
    else:
        trail = pointer % 157
        trail += 0x40 if trail < 0x3F else 0x62
        if trail < 0x80:
            big5_in_ref_file.write((u"\uFFFD%s\n" % unichr(trail)).encode("utf-8"))
        else:
            big5_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
big5_in_ref_file.close()

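# Encoder-side Big5 expectations: the encoder does not use pointers below
# (0xA1 - 0x81) * 157 (the Big5-HKSCS extension area), and for the handful
# of code points listed in prefer_last the Encoding Standard requires the
# *last* matching pointer in the index; every other duplicate resolves to
# the first matching pointer (index.index()).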
prefer_last = [
    0x2550,
    0x255E,
    0x2561,
    0x256A,
    0x5341,
    0x5345,
]

pointer_for_prefer_last = []

for code_point in prefer_last:
    # Python lists don't have .rindex() :-(
    for i in xrange(len(index) - 1, -1, -1):
        candidate = index[i]
        if candidate == code_point:
            pointer_for_prefer_last.append(i)
            break

big5_out_file = open("src/test_data/big5_out.txt", "w")
big5_out_ref_file = open("src/test_data/big5_out_ref.txt", "w")
big5_out_file.write(TEST_HEADER)
big5_out_ref_file.write(TEST_HEADER)
for pointer in range(((0xA1 - 0x81) * 157), len(index)):
    code_point = index[pointer]
    if code_point:
        if code_point in prefer_last:
            if pointer != pointer_for_prefer_last[prefer_last.index(code_point)]:
                continue
        else:
            if pointer != index.index(code_point):
                continue
        (lead, trail) = divmod(pointer, 157)
        lead += 0x81
        trail += 0x40 if trail < 0x3F else 0x62
        big5_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
        big5_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
big5_out_file.close()
big5_out_ref_file.close()

index = indexes["jis0212"]

jis0212_in_file = open("src/test_data/jis0212_in.txt", "w")
jis0212_in_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    (lead, trail) = divmod(pointer, 94)
    lead += 0xA1
    trail += 0xA1
    jis0212_in_file.write("\x8F%s%s\n" % (chr(lead), chr(trail)))
jis0212_in_file.close()

jis0212_in_ref_file = open("src/test_data/jis0212_in_ref.txt", "w")
jis0212_in_ref_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    code_point = index[pointer]
    if code_point:
        jis0212_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
    else:
        jis0212_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
jis0212_in_ref_file.close()

(codepage_begin, codepage_end) = read_non_generated("../codepage/src/lib.rs")

codepage_file = open("../codepage/src/lib.rs", "w")

codepage_file.write(codepage_begin)
codepage_file.write("""
// Instead, please regenerate using generate-encoding-data.py

/// Supported code page numbers in estimated order of usage frequency
static CODE_PAGES: [u16; %d] = [
""" % len(code_pages))

for code_page in code_pages:
    codepage_file.write("    %d,\n" % code_page)

codepage_file.write("""];

/// Encodings corresponding to the code page numbers in the same order
static ENCODINGS: [&'static Encoding; %d] = [
""" % len(code_pages))

for code_page in code_pages:
    name = encodings_by_code_page[code_page]
    codepage_file.write("    &%s_INIT,\n" % to_constant_name(name))

codepage_file.write("""];

""")

codepage_file.write(codepage_end)
codepage_file.close()

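# Generated tests for the codepage crate: to_encoding() is exercised for
# every supported Windows code page number (plus 0 as the unsupported
# case), and from_encoding() for every preferred encoding name, expecting
# None for encodings that have no corresponding code page.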
(codepage_test_begin, codepage_test_end) = read_non_generated("../codepage/src/tests.rs")

codepage_test_file = open("../codepage/src/tests.rs", "w")

codepage_test_file.write(codepage_test_begin)
codepage_test_file.write("""
// Instead, please regenerate using generate-encoding-data.py

#[test]
fn test_to_encoding() {
    assert_eq!(to_encoding(0), None);

""")

for code_page in code_pages:
    codepage_test_file.write("    assert_eq!(to_encoding(%d), Some(%s));\n" % (code_page, to_constant_name(encodings_by_code_page[code_page])))

codepage_test_file.write("""}

#[test]
fn test_from_encoding() {
""")

for name in preferred:
    if code_pages_by_encoding.has_key(name):
        codepage_test_file.write("    assert_eq!(from_encoding(%s), Some(%d));\n" % (to_constant_name(name), code_pages_by_encoding[name]))
    else:
        codepage_test_file.write("    assert_eq!(from_encoding(%s), None);\n" % to_constant_name(name))

codepage_test_file.write("""}
""")

codepage_test_file.write(codepage_test_end)
codepage_test_file.close()

subprocess.call(["cargo", "fmt"])