#!/usr/bin/python

# Copyright 2013-2016 Mozilla Foundation. See the COPYRIGHT
# file at the top-level directory of this distribution.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

"""Generates the lookup-table source files for encoding_rs (and the
generated parts of the encoding_c and codepage crates) from the WHATWG
encoding repository's encodings.json and indexes.json.

This is a Python 2 script (it relies on `cmp`, `xrange` and the
`__cmp__` protocol).
"""

import json
import subprocess
import sys
import os.path

# Fail early with actionable messages if the sibling checkouts this script
# reads from and writes to are not present next to the encoding_rs directory.
if (not os.path.isfile("../encoding/encodings.json")) or (not os.path.isfile("../encoding/indexes.json")):
    sys.stderr.write("This script needs a clone of https://github.com/whatwg/encoding/ (preferably at revision f381389) next to the encoding_rs directory.\n")
    sys.exit(-1)

if not os.path.isfile("../encoding_c/src/lib.rs"):
    sys.stderr.write("This script also writes the generated parts of the encoding_c crate and needs a clone of https://github.com/hsivonen/encoding_c next to the encoding_rs directory.\n")
    sys.exit(-1)

if not os.path.isfile("../codepage/src/lib.rs"):
    sys.stderr.write("This script also writes the generated parts of the codepage crate and needs a clone of https://github.com/hsivonen/codepage next to the encoding_rs directory.\n")
    sys.exit(-1)

def cmp_from_end(one, other):
    """cmp()-style comparison of two strings: shorter sorts first, and
    equal-length strings are compared character by character starting
    from the *last* character (used for label binary search by suffix)."""
    c = cmp(len(one), len(other))
    if c != 0:
        return c
    i = len(one) - 1
    while i >= 0:
        c = cmp(one[i], other[i])
        if c != 0:
            return c
        i -= 1
    return 0


class Label:
    """An encoding label paired with the name of its preferred encoding."""
    def __init__(self, label, preferred):
        self.label = label
        self.preferred = preferred
    def __cmp__(self, other):
        # Sort labels by the from-the-end order used for lookup.
        return cmp_from_end(self.label, other.label)

class CodePage:
    """A Windows code page number paired with the preferred encoding name."""
    def __init__(self, code_page, preferred):
        self.code_page = code_page
        self.preferred = preferred
    def __cmp__(self, other):
        # Bug fix: this previously returned the tuple
        # `self.code_page, other.code_page`, which is not a valid
        # negative/zero/positive cmp result, so sorting CodePage
        # instances produced nonsense. Compare numerically instead.
        return cmp(self.code_page, other.code_page)

def static_u16_table(name, data):
    """Write `data` to data_file as a Rust `pub static [u16; N]` named `name`."""
    data_file.write('''pub static %s: [u16; %d] = [
''' % (name, len(data)))

    for i in xrange(len(data)):
        data_file.write('0x%04X,\n' % data[i])

    data_file.write('''];

''')

def static_u16_table_from_indexable(name, data, item, feature):
    """Write element `item` of each entry of `data` as a private Rust
    `[u16; N]`, cfg-gated on `less-slow-<feature>` without `fast-<feature>`."""
    data_file.write('''#[cfg(all(
    feature = "less-slow-%s",
    not(feature = "fast-%s")
))]
static %s: [u16; %d] = [
''' % (feature, feature, name, len(data)))

    for i in xrange(len(data)):
        data_file.write('0x%04X,\n' % data[i][item])

    data_file.write('''];

''')

def static_u8_pair_table_from_indexable(name, data, item, feature):
    """Like static_u16_table_from_indexable, but each element `item` is a
    byte pair written as a Rust `[[u8; 2]; N]`."""
    data_file.write('''#[cfg(all(
    feature = "less-slow-%s",
    not(feature = "fast-%s")
))]
static %s: [[u8; 2]; %d] = [
''' % (feature, feature, name, len(data)))

    for i in xrange(len(data)):
        data_file.write('[0x%02X, 0x%02X],\n' % data[i][item])

    data_file.write('''];

''')

def static_u8_pair_table(name, data, feature):
    """Write `data` (byte pairs; a falsy entry becomes (0, 0)) as a private
    Rust `[[u8; 2]; N]`, cfg-gated on `feature`."""
    data_file.write('''#[cfg(feature = "%s")]
static %s: [[u8; 2]; %d] = [
''' % (feature, name, len(data)))

    for i in xrange(len(data)):
        pair = data[i]
        if not pair:
            pair = (0, 0)
        data_file.write('[0x%02X, 0x%02X],\n' % pair)

    data_file.write('''];

''')

# Accumulators filled while walking encodings.json below.
preferred = []

dom = []

labels = []

data = json.load(open("../encoding/encodings.json", "r"))

indexes = json.load(open("../encoding/indexes.json", "r"))

single_byte = []

multi_byte = []

def to_camel_name(name):
    """Map an encoding name to the CamelCase Rust identifier."""
    if name == u"iso-8859-8-i":
        return u"Iso8I"
    if name.startswith(u"iso-8859-"):
        return name.replace(u"iso-8859-", u"Iso")
    return name.title().replace(u"X-", u"").replace(u"-", u"").replace(u"_", u"")

def to_constant_name(name):
    """Map an encoding name to the SHOUTY_SNAKE_CASE Rust constant name."""
    return name.replace(u"-", u"_").upper()

def to_snake_name(name):
    """Map an encoding name to a lower_snake_case Rust identifier."""
    return name.replace(u"-", u"_").lower()

def to_dom_name(name):
    """The DOM-visible name is the name as-is."""
    return name

# Guestimate based on
# https://w3techs.com/technologies/overview/character_encoding/all
# whose methodology is known to be bogus, but the results are credible for
# this purpose. UTF-16LE lifted up due to prevalence on Windows and
# "ANSI codepages" prioritized.
encodings_by_code_page_frequency = [
    "UTF-8",
    "UTF-16LE",
    "windows-1252",
    "windows-1251",
    "GBK",
    "Shift_JIS",
    "EUC-KR",
    "windows-1250",
    "windows-1256",
    "windows-1254",
    "Big5",
    "windows-874",
    "windows-1255",
    "windows-1253",
    "windows-1257",
    "windows-1258",
    "EUC-JP",
    "ISO-8859-2",
    "ISO-8859-15",
    "ISO-8859-7",
    "KOI8-R",
    "gb18030",
    "ISO-8859-5",
    "ISO-8859-8-I",
    "ISO-8859-4",
    "ISO-8859-6",
    "ISO-2022-JP",
    "KOI8-U",
    "ISO-8859-13",
    "ISO-8859-3",
    "UTF-16BE",
    "IBM866",
    "ISO-8859-10",
    "ISO-8859-8",
    "macintosh",
    "x-mac-cyrillic",
    "ISO-8859-14",
    "ISO-8859-16",
]

# The canonical Windows code page number for each encoding.
encodings_by_code_page = {
    932: "Shift_JIS",
    936: "GBK",
    949: "EUC-KR",
    950: "Big5",
    866: "IBM866",
    874: "windows-874",
    1200: "UTF-16LE",
    1201: "UTF-16BE",
    1250: "windows-1250",
    1251: "windows-1251",
    1252: "windows-1252",
    1253: "windows-1253",
    1254: "windows-1254",
    1255: "windows-1255",
    1256: "windows-1256",
    1257: "windows-1257",
    1258: "windows-1258",
    10000: "macintosh",
    10017: "x-mac-cyrillic",
    20866: "KOI8-R",
    20932: "EUC-JP",
    21866: "KOI8-U",
    28592: "ISO-8859-2",
    28593: "ISO-8859-3",
    28594: "ISO-8859-4",
    28595: "ISO-8859-5",
    28596: "ISO-8859-6",
    28597: "ISO-8859-7",
    28598: "ISO-8859-8",
    28600: "ISO-8859-10",
    28603: "ISO-8859-13",
    28604: "ISO-8859-14",
    28605: "ISO-8859-15",
    28606: "ISO-8859-16",
    38598: "ISO-8859-8-I",
    50221: "ISO-2022-JP",
    54936: "gb18030",
    65001: "UTF-8",
}

# Inverse mapping: preferred encoding name -> canonical code page.
# (.items() instead of the Python-2-only .iteritems(); works on 2 and 3.)
code_pages_by_encoding = {
    encoding: code_page
    for (code_page, encoding) in encodings_by_code_page.items()
}

# Additional code page numbers that map to an encoding whose canonical
# code page is a different number (or to "replacement").
encoding_by_alias_code_page = {
    951: "Big5",
    10007: "x-mac-cyrillic",
    20936: "GBK",
    20949: "EUC-KR",
    21010: "UTF-16LE", # Undocumented; needed by calamine for Excel compat
    28591: "windows-1252",
    28599: "windows-1254",
    28601: "windows-874",
    50220: "ISO-2022-JP",
    50222: "ISO-2022-JP",
    50225: "replacement", # ISO-2022-KR
    50227: "replacement", # ISO-2022-CN
    # Bug fix: this key was 51949, duplicating the EUC-KR key below,
    # so the EUC-JP alias was silently dropped from the dict. The
    # Windows code page for EUC-JP is 51932.
    51932: "EUC-JP",
    51936: "GBK",
    51949: "EUC-KR",
    52936: "replacement", # HZ
}

# code_pages lists every known code page, most frequent first.
code_pages = [code_pages_by_encoding[name]
              for name in encodings_by_code_page_frequency]

encodings_by_code_page.update(encoding_by_alias_code_page)

# Append the remaining (alias) code pages in ascending numeric order.
# (sorted() instead of the Python-2-only keys()/sort() dance.)
for code_page in sorted(encodings_by_code_page.keys()):
    if code_page not in code_pages:
        code_pages.append(code_page)

# The position in the index (0 is the first index entry,
# i.e. byte value 0x80) that starts the longest run of
# consecutive code points. Must not be in the first
# quadrant. If the character to be encoded is not in this
# run, the part of the index after the run is searched
# forward. Then the part of the index from 32 to the start
# of the run. The first quadrant is searched last.
#
# If there is no obviously most useful longest run,
# the index here is just used to affect the search order.
276start_of_longest_run_in_single_byte = { 277 "IBM866": 96, # 0 would be longest, but we don't want to start in the first quadrant 278 "windows-874": 33, 279 "windows-1250": 92, 280 "windows-1251": 64, 281 "windows-1252": 32, 282 "windows-1253": 83, 283 "windows-1254": 95, 284 "windows-1255": 96, 285 "windows-1256": 65, 286 "windows-1257": 95, # not actually longest 287 "windows-1258": 95, # not actually longest 288 "macintosh": 106, # useless 289 "x-mac-cyrillic": 96, 290 "KOI8-R": 64, # not actually longest 291 "KOI8-U": 64, # not actually longest 292 "ISO-8859-2": 95, # not actually longest 293 "ISO-8859-3": 95, # not actually longest 294 "ISO-8859-4": 95, # not actually longest 295 "ISO-8859-5": 46, 296 "ISO-8859-6": 65, 297 "ISO-8859-7": 83, 298 "ISO-8859-8": 96, 299 "ISO-8859-10": 90, # not actually longest 300 "ISO-8859-13": 95, # not actually longest 301 "ISO-8859-14": 95, 302 "ISO-8859-15": 63, 303 "ISO-8859-16": 95, # not actually longest 304} 305 306# 307 308for group in data: 309 if group["heading"] == "Legacy single-byte encodings": 310 single_byte = group["encodings"] 311 else: 312 multi_byte.extend(group["encodings"]) 313 for encoding in group["encodings"]: 314 preferred.append(encoding["name"]) 315 for label in encoding["labels"]: 316 labels.append(Label(label, encoding["name"])) 317 318for name in preferred: 319 dom.append(to_dom_name(name)) 320 321preferred.sort() 322labels.sort() 323dom.sort(cmp=cmp_from_end) 324 325longest_label_length = 0 326longest_name_length = 0 327longest_label = None 328longest_name = None 329 330for name in preferred: 331 if len(name) > longest_name_length: 332 longest_name_length = len(name) 333 longest_name = name 334 335for label in labels: 336 if len(label.label) > longest_label_length: 337 longest_label_length = len(label.label) 338 longest_label = label.label 339 340def longest_run_for_single_byte(name): 341 if name == u"ISO-8859-8-I": 342 name = u"ISO-8859-8" 343 index = indexes[name.lower()] 344 run_byte_offset = 
start_of_longest_run_in_single_byte[name] 345 run_bmp_offset = index[run_byte_offset] 346 previous_code_point = run_bmp_offset 347 run_length = 1 348 while True: 349 i = run_byte_offset + run_length 350 if i == len(index): 351 break 352 code_point = index[i] 353 if previous_code_point + 1 != code_point: 354 break 355 previous_code_point = code_point 356 run_length += 1 357 return (run_bmp_offset, run_byte_offset, run_length) 358 359def is_single_byte(name): 360 for encoding in single_byte: 361 if name == encoding["name"]: 362 return True 363 return False 364 365def read_non_generated(path): 366 partially_generated_file = open(path, "r") 367 full = partially_generated_file.read() 368 partially_generated_file.close() 369 370 generated_begin = "// BEGIN GENERATED CODE. PLEASE DO NOT EDIT." 371 generated_end = "// END GENERATED CODE" 372 373 generated_begin_index = full.find(generated_begin) 374 if generated_begin_index < 0: 375 sys.stderr.write("Can't find generated code start marker in %s. Exiting.\n" % path) 376 sys.exit(-1) 377 generated_end_index = full.find(generated_end) 378 if generated_end_index < 0: 379 sys.stderr.write("Can't find generated code end marker in %s. 
Exiting.\n" % path) 380 sys.exit(-1) 381 382 return (full[0:generated_begin_index + len(generated_begin)], 383 full[generated_end_index:]) 384 385(lib_rs_begin, lib_rs_end) = read_non_generated("src/lib.rs") 386 387label_file = open("src/lib.rs", "w") 388 389label_file.write(lib_rs_begin) 390label_file.write(""" 391// Instead, please regenerate using generate-encoding-data.py 392 393const LONGEST_LABEL_LENGTH: usize = %d; // %s 394 395""" % (longest_label_length, longest_label)) 396 397for name in preferred: 398 variant = None 399 if is_single_byte(name): 400 (run_bmp_offset, run_byte_offset, run_length) = longest_run_for_single_byte(name) 401 variant = "SingleByte(&data::SINGLE_BYTE_DATA.%s, 0x%04X, %d, %d)" % (to_snake_name(u"iso-8859-8" if name == u"ISO-8859-8-I" else name), run_bmp_offset, run_byte_offset, run_length) 402 else: 403 variant = to_camel_name(name) 404 405 docfile = open("doc/%s.txt" % name, "r") 406 doctext = docfile.read() 407 docfile.close() 408 409 label_file.write('''/// The initializer for the [%s](static.%s.html) encoding. 410/// 411/// For use only for taking the address of this form when 412/// Rust prohibits the use of the non-`_INIT` form directly, 413/// such as in initializers of other `static`s. If in doubt, 414/// use the corresponding non-`_INIT` reference-typed `static`. 415/// 416/// This part of the public API will go away if Rust changes 417/// to make the referent of `pub const FOO: &'static Encoding` 418/// unique cross-crate or if Rust starts allowing static arrays 419/// to be initialized with `pub static FOO: &'static Encoding` 420/// items. 421pub static %s_INIT: Encoding = Encoding { 422 name: "%s", 423 variant: VariantEncoding::%s, 424}; 425 426/// The %s encoding. 427/// 428%s/// 429/// This will change from `static` to `const` if Rust changes 430/// to make the referent of `pub const FOO: &'static Encoding` 431/// unique cross-crate, so don't take the address of this 432/// `static`. 
433pub static %s: &'static Encoding = &%s_INIT; 434 435''' % (to_dom_name(name), to_constant_name(name), to_constant_name(name), to_dom_name(name), variant, to_dom_name(name), doctext, to_constant_name(name), to_constant_name(name))) 436 437label_file.write("""static LABELS_SORTED: [&'static str; %d] = [ 438""" % len(labels)) 439 440for label in labels: 441 label_file.write('''"%s",\n''' % label.label) 442 443label_file.write("""]; 444 445static ENCODINGS_IN_LABEL_SORT: [&'static Encoding; %d] = [ 446""" % len(labels)) 447 448for label in labels: 449 label_file.write('''&%s_INIT,\n''' % to_constant_name(label.preferred)) 450 451label_file.write(''']; 452 453''') 454label_file.write(lib_rs_end) 455label_file.close() 456 457label_test_file = open("src/test_labels_names.rs", "w") 458label_test_file.write('''// Any copyright to the test code below this comment is dedicated to the 459// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ 460 461// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT. 
462// Instead, please regenerate using generate-encoding-data.py 463 464use super::*; 465 466#[test] 467fn test_all_labels() { 468''') 469 470for label in labels: 471 label_test_file.write('''assert_eq!(Encoding::for_label(b"%s"), Some(%s));\n''' % (label.label, to_constant_name(label.preferred))) 472 473label_test_file.write('''} 474''') 475label_test_file.close() 476 477def null_to_zero(code_point): 478 if not code_point: 479 code_point = 0 480 return code_point 481 482(data_rs_begin, data_rs_end) = read_non_generated("src/data.rs") 483 484data_file = open("src/data.rs", "w") 485data_file.write(data_rs_begin) 486data_file.write(''' 487// Instead, please regenerate using generate-encoding-data.py 488 489#[repr(align(64))] // Align to cache lines 490pub struct SingleByteData { 491''') 492 493# Single-byte 494 495for encoding in single_byte: 496 name = encoding["name"] 497 if name == u"ISO-8859-8-I": 498 continue 499 500 data_file.write(''' pub %s: [u16; 128], 501''' % to_snake_name(name)) 502 503data_file.write('''} 504 505pub static SINGLE_BYTE_DATA: SingleByteData = SingleByteData { 506''') 507 508for encoding in single_byte: 509 name = encoding["name"] 510 if name == u"ISO-8859-8-I": 511 continue 512 513 data_file.write(''' %s: [ 514''' % to_snake_name(name)) 515 516 for code_point in indexes[name.lower()]: 517 data_file.write('0x%04X,\n' % null_to_zero(code_point)) 518 519 data_file.write('''], 520''') 521 522data_file.write('''}; 523 524''') 525 526# Big5 527 528index = indexes["big5"] 529 530astralness = [] 531low_bits = [] 532 533for code_point in index[942:19782]: 534 if code_point: 535 astralness.append(1 if code_point > 0xFFFF else 0) 536 low_bits.append(code_point & 0xFFFF) 537 else: 538 astralness.append(0) 539 low_bits.append(0) 540 541# pad length to multiple of 32 542for j in xrange(32 - (len(astralness) % 32)): 543 astralness.append(0) 544 545data_file.write('''#[cfg_attr(feature = "cargo-clippy", allow(unreadable_literal))] 546static 
BIG5_ASTRALNESS: [u32; %d] = [ 547''' % (len(astralness) / 32)) 548 549i = 0 550while i < len(astralness): 551 accu = 0 552 for j in xrange(32): 553 accu |= astralness[i + j] << j 554 data_file.write('0x%08X,\n' % accu) 555 i += 32 556 557data_file.write(''']; 558 559''') 560 561static_u16_table("BIG5_LOW_BITS", low_bits) 562 563# Encoder table for Level 1 Hanzi 564# Note: If we were OK with doubling this table, we 565# could use a directly-indexable table instead... 566level1_hanzi_index = index[5495:10896] 567level1_hanzi_pairs = [] 568for i in xrange(len(level1_hanzi_index)): 569 hanzi_lead = (i / 157) + 0xA4 570 hanzi_trail = (i % 157) 571 hanzi_trail += 0x40 if hanzi_trail < 0x3F else 0x62 572 level1_hanzi_pairs.append((level1_hanzi_index[i], (hanzi_lead, hanzi_trail))) 573level1_hanzi_pairs.append((0x4E5A, (0xC8, 0x7B))) 574level1_hanzi_pairs.append((0x5202, (0xC8, 0x7D))) 575level1_hanzi_pairs.append((0x9FB0, (0xC8, 0xA1))) 576level1_hanzi_pairs.append((0x5188, (0xC8, 0xA2))) 577level1_hanzi_pairs.append((0x9FB1, (0xC8, 0xA3))) 578level1_hanzi_pairs.sort(key=lambda x: x[0]) 579 580static_u16_table_from_indexable("BIG5_LEVEL1_HANZI_CODE_POINTS", level1_hanzi_pairs, 0, "big5-hanzi-encode") 581static_u8_pair_table_from_indexable("BIG5_LEVEL1_HANZI_BYTES", level1_hanzi_pairs, 1, "big5-hanzi-encode") 582 583# Fast Unified Ideograph encode 584big5_unified_ideograph_bytes = [None] * (0x9FCC - 0x4E00) 585for row in xrange(0x7E - 0x20): 586 for column in xrange(157): 587 pointer = 5024 + column + (row * 157) 588 code_point = index[pointer] 589 if code_point and code_point >= 0x4E00 and code_point <= 0x9FCB: 590 unified_offset = code_point - 0x4E00 591 unified_lead = 0xA1 + row 592 unified_trail = (0x40 if column < 0x3F else 0x62) + column 593 if code_point == 0x5341 or code_point == 0x5345 or not big5_unified_ideograph_bytes[unified_offset]: 594 big5_unified_ideograph_bytes[unified_offset] = (unified_lead, unified_trail) 595 
static_u8_pair_table("BIG5_UNIFIED_IDEOGRAPH_BYTES", big5_unified_ideograph_bytes, "fast-big5-hanzi-encode")

# JIS0208

index = indexes["jis0208"]

# JIS 0208 Level 1 Kanji
static_u16_table("JIS0208_LEVEL1_KANJI", index[1410:4375])

# JIS 0208 Level 2 Kanji and Additional Kanji
static_u16_table("JIS0208_LEVEL2_AND_ADDITIONAL_KANJI", index[4418:7808])

# IBM Kanji
static_u16_table("IBM_KANJI", index[8272:8632])

# Check that the other instance is the same
if index[8272:8632] != index[10744:11104]:
    # Bug fix: this previously did `raise Error()`, which referenced an
    # undefined name and aborted with a NameError instead of a message.
    raise Exception("The two IBM Kanji instances in the JIS0208 index differ")

# JIS 0208 symbols (all non-Kanji, non-range items)
# Scan the pointer ranges below and compress every run of consecutive
# non-null code points into a (start pointer, run length, start index
# in symbol_index) triple; the code points themselves go to symbol_index.
symbol_index = []
symbol_triples = []
pointers_to_scan = [
    (0, 188),
    (658, 691),
    (1159, 1221),
]
in_run = False
run_start_pointer = 0
run_start_array_index = 0
for (start, end) in pointers_to_scan:
    for i in range(start, end):
        code_point = index[i]
        if in_run:
            if code_point:
                symbol_index.append(code_point)
            else:
                # Null ends the current run: record its triple.
                symbol_triples.append(run_start_pointer)
                symbol_triples.append(i - run_start_pointer)
                symbol_triples.append(run_start_array_index)
                in_run = False
        else:
            if code_point:
                in_run = True
                run_start_pointer = i
                run_start_array_index = len(symbol_index)
                symbol_index.append(code_point)
    # Close a run that extends to the end of this scan range.
    if in_run:
        symbol_triples.append(run_start_pointer)
        symbol_triples.append(end - run_start_pointer)
        symbol_triples.append(run_start_array_index)
        in_run = False
if in_run:
    # Defensive check; runs are always closed above. Bug fix: was the
    # undefined `raise Error()` here, too.
    raise Exception("JIS0208 symbol scan ended inside a run")

# Now add manually the two overlapping slices of
# index from the NEC/IBM extensions.
653run_start_array_index = len(symbol_index) 654symbol_index.extend(index[10736:10744]) 655# Later 656symbol_triples.append(10736) 657symbol_triples.append(8) 658symbol_triples.append(run_start_array_index) 659# Earlier 660symbol_triples.append(8644) 661symbol_triples.append(4) 662symbol_triples.append(run_start_array_index) 663 664static_u16_table("JIS0208_SYMBOLS", symbol_index) 665static_u16_table("JIS0208_SYMBOL_TRIPLES", symbol_triples) 666 667# Write down the magic numbers needed when preferring the earlier case 668data_file.write('''const IBM_SYMBOL_START: usize = %d;''' % (run_start_array_index + 1)) 669data_file.write('''const IBM_SYMBOL_END: usize = %d;''' % (run_start_array_index + 4)) 670data_file.write('''const IBM_SYMBOL_POINTER_START: usize = %d;''' % 8645) 671 672# JIS 0208 ranges (excluding kana) 673range_triples = [] 674pointers_to_scan = [ 675 (188, 281), 676 (470, 657), 677 (1128, 1159), 678 (8634, 8644), 679 (10716, 10736), 680] 681in_run = False 682run_start_pointer = 0 683run_start_code_point = 0 684previous_code_point = 0 685for (start, end) in pointers_to_scan: 686 for i in range(start, end): 687 code_point = index[i] 688 if in_run: 689 if code_point: 690 if previous_code_point + 1 != code_point: 691 range_triples.append(run_start_pointer) 692 range_triples.append(i - run_start_pointer) 693 range_triples.append(run_start_code_point) 694 run_start_pointer = i 695 run_start_code_point = code_point 696 previous_code_point = code_point 697 else: 698 range_triples.append(run_start_pointer) 699 range_triples.append(i - run_start_pointer) 700 range_triples.append(run_start_code_point) 701 run_start_pointer = 0 702 run_start_code_point = 0 703 previous_code_point = 0 704 in_run = False 705 else: 706 if code_point: 707 in_run = True 708 run_start_pointer = i 709 run_start_code_point = code_point 710 previous_code_point = code_point 711 if in_run: 712 range_triples.append(run_start_pointer) 713 range_triples.append(end - run_start_pointer) 714 
range_triples.append(run_start_code_point) 715 run_start_pointer = 0 716 run_start_code_point = 0 717 previous_code_point = 0 718 in_run = False 719if in_run: 720 raise Error() 721 722static_u16_table("JIS0208_RANGE_TRIPLES", range_triples) 723 724# Encoder table for Level 1 Kanji 725# Note: If we were OK with 30 KB more footprint, we 726# could use a directly-indexable table instead... 727level1_kanji_index = index[1410:4375] 728level1_kanji_pairs = [] 729for i in xrange(len(level1_kanji_index)): 730 pointer = 1410 + i 731 (lead, trail) = divmod(pointer, 188) 732 lead += 0x81 if lead < 0x1F else 0xC1 733 trail += 0x40 if trail < 0x3F else 0x41 734 level1_kanji_pairs.append((level1_kanji_index[i], (lead, trail))) 735level1_kanji_pairs.sort(key=lambda x: x[0]) 736 737static_u16_table_from_indexable("JIS0208_LEVEL1_KANJI_CODE_POINTS", level1_kanji_pairs, 0, "kanji-encode") 738static_u8_pair_table_from_indexable("JIS0208_LEVEL1_KANJI_SHIFT_JIS_BYTES", level1_kanji_pairs, 1, "kanji-encode") 739 740# Fast encoder table for Kanji 741kanji_bytes = [None] * (0x9FA1 - 0x4E00) 742for pointer in xrange(len(index)): 743 code_point = index[pointer] 744 if code_point and code_point >= 0x4E00 and code_point <= 0x9FA0: 745 (lead, trail) = divmod(pointer, 188) 746 lead += 0x81 if lead < 0x1F else 0xC1 747 trail += 0x40 if trail < 0x3F else 0x41 748 # unset the high bit of lead if IBM Kanji 749 if pointer >= 8272: 750 lead = lead & 0x7F 751 kanji_bytes[code_point - 0x4E00] = (lead, trail) 752 753static_u8_pair_table("JIS0208_KANJI_BYTES", kanji_bytes, "fast-kanji-encode") 754 755# ISO-2022-JP half-width katakana 756 757# index is still jis0208 758half_width_index = indexes["iso-2022-jp-katakana"] 759 760data_file.write('''pub static ISO_2022_JP_HALF_WIDTH_TRAIL: [u8; %d] = [ 761''' % len(half_width_index)) 762 763for i in xrange(len(half_width_index)): 764 code_point = half_width_index[i] 765 pointer = index.index(code_point) 766 trail = pointer % 94 + 0x21 767 
data_file.write('0x%02X,\n' % trail) 768 769data_file.write(''']; 770 771''') 772 773# EUC-KR 774 775index = indexes["euc-kr"] 776 777# Unicode 1.1 Hangul above the old KS X 1001 block 778# Compressed form takes 35% of uncompressed form 779pointers = [] 780offsets = [] 781previous_code_point = 0 782for row in xrange(0x20): 783 for column in xrange(190): 784 i = column + (row * 190) 785 # Skip the gaps 786 if (column >= 0x1A and column < 0x20) or (column >= 0x3A and column < 0x40): 787 continue 788 code_point = index[i] 789 if previous_code_point > code_point: 790 raise Error() 791 if code_point - previous_code_point != 1: 792 adjustment = 0 793 if column >= 0x40: 794 adjustment = 12 795 elif column >= 0x20: 796 adjustment = 6 797 pointers.append(column - adjustment + (row * (190 - 12))) 798 offsets.append(code_point) 799 previous_code_point = code_point 800 801static_u16_table("CP949_TOP_HANGUL_POINTERS", pointers) 802static_u16_table("CP949_TOP_HANGUL_OFFSETS", offsets) 803 804# Unicode 1.1 Hangul to the left of the old KS X 1001 block 805pointers = [] 806offsets = [] 807previous_code_point = 0 808for row in xrange(0x46 - 0x20): 809 for column in xrange(190 - 94): 810 i = 6080 + column + (row * 190) 811 # Skip the gaps 812 if (column >= 0x1A and column < 0x20) or (column >= 0x3A and column < 0x40): 813 continue 814 if i > 13127: 815 # Exclude unassigned on partial last row 816 break 817 code_point = index[i] 818 if previous_code_point > code_point: 819 raise Error() 820 if code_point - previous_code_point != 1: 821 adjustment = 0 822 if column >= 0x40: 823 adjustment = 12 824 elif column >= 0x20: 825 adjustment = 6 826 pointers.append(column - adjustment + (row * (190 - 94 - 12))) 827 offsets.append(code_point) 828 previous_code_point = code_point 829 830static_u16_table("CP949_LEFT_HANGUL_POINTERS", pointers) 831static_u16_table("CP949_LEFT_HANGUL_OFFSETS", offsets) 832 833# KS X 1001 Hangul 834hangul_index = [] 835previous_code_point = 0 836for row in 
xrange(0x48 - 0x2F): 837 for column in xrange(94): 838 code_point = index[9026 + column + (row * 190)] 839 if previous_code_point >= code_point: 840 raise Error() 841 hangul_index.append(code_point) 842 previous_code_point = code_point 843 844static_u16_table("KSX1001_HANGUL", hangul_index) 845 846# KS X 1001 Hanja 847hanja_index = [] 848for row in xrange(0x7D - 0x49): 849 for column in xrange(94): 850 hanja_index.append(index[13966 + column + (row * 190)]) 851 852static_u16_table("KSX1001_HANJA", hanja_index) 853 854# KS X 1001 symbols 855symbol_index = [] 856for i in range(6176, 6270): 857 symbol_index.append(index[i]) 858for i in range(6366, 6437): 859 symbol_index.append(index[i]) 860 861static_u16_table("KSX1001_SYMBOLS", symbol_index) 862 863# KS X 1001 Uppercase Latin 864subindex = [] 865for i in range(7506, 7521): 866 subindex.append(null_to_zero(index[i])) 867 868static_u16_table("KSX1001_UPPERCASE", subindex) 869 870# KS X 1001 Lowercase Latin 871subindex = [] 872for i in range(7696, 7712): 873 subindex.append(index[i]) 874 875static_u16_table("KSX1001_LOWERCASE", subindex) 876 877# KS X 1001 Box drawing 878subindex = [] 879for i in range(7126, 7194): 880 subindex.append(index[i]) 881 882static_u16_table("KSX1001_BOX", subindex) 883 884# KS X 1001 other 885pointers = [] 886offsets = [] 887previous_code_point = 0 888for row in xrange(10): 889 for column in xrange(94): 890 i = 6556 + column + (row * 190) 891 code_point = index[i] 892 # Exclude ranges that were processed as lookup tables 893 # or that contain unmapped cells by filling them with 894 # ASCII. Upon encode, ASCII code points will 895 # never appear as the search key. 
896 if (i >= 6946 and i <= 6950): 897 code_point = i - 6946 898 elif (i >= 6961 and i <= 6967): 899 code_point = i - 6961 900 elif (i >= 6992 and i <= 6999): 901 code_point = i - 6992 902 elif (i >= 7024 and i <= 7029): 903 code_point = i - 7024 904 elif (i >= 7126 and i <= 7219): 905 code_point = i - 7126 906 elif (i >= 7395 and i <= 7409): 907 code_point = i - 7395 908 elif (i >= 7506 and i <= 7521): 909 code_point = i - 7506 910 elif (i >= 7696 and i <= 7711): 911 code_point = i - 7696 912 elif (i >= 7969 and i <= 7979): 913 code_point = i - 7969 914 elif (i >= 8162 and i <= 8169): 915 code_point = i - 8162 916 elif (i >= 8299 and i <= 8313): 917 code_point = i - 8299 918 elif (i >= 8347 and i <= 8359): 919 code_point = i - 8347 920 if code_point - previous_code_point != 1: 921 pointers.append(column + (row * 94)) 922 offsets.append(code_point) 923 previous_code_point = code_point 924 925static_u16_table("KSX1001_OTHER_POINTERS", pointers) 926# Omit the last offset, because the end of the last line 927# is unmapped, so we don't want to look at it. 
928static_u16_table("KSX1001_OTHER_UNSORTED_OFFSETS", offsets[:-1]) 929 930# Fast Hangul and Hanja encode 931hangul_bytes = [None] * (0xD7A4 - 0xAC00) 932hanja_unified_bytes = [None] * (0x9F9D - 0x4E00) 933hanja_compatibility_bytes = [None] * (0xFA0C - 0xF900) 934for row in xrange(0x7D): 935 for column in xrange(190): 936 pointer = column + (row * 190) 937 code_point = index[pointer] 938 if code_point: 939 lead = 0x81 + row 940 trail = 0x41 + column 941 if code_point >= 0xAC00 and code_point < 0xD7A4: 942 hangul_bytes[code_point - 0xAC00] = (lead, trail) 943 elif code_point >= 0x4E00 and code_point < 0x9F9D: 944 hanja_unified_bytes[code_point - 0x4E00] = (lead, trail) 945 elif code_point >= 0xF900 and code_point < 0xFA0C: 946 hanja_compatibility_bytes[code_point - 0xF900] = (lead, trail) 947 948static_u8_pair_table("CP949_HANGUL_BYTES", hangul_bytes, "fast-hangul-encode") 949static_u8_pair_table("KSX1001_UNIFIED_HANJA_BYTES", hanja_unified_bytes, "fast-hanja-encode") 950static_u8_pair_table("KSX1001_COMPATIBILITY_HANJA_BYTES", hanja_compatibility_bytes, "fast-hanja-encode") 951 952# JIS 0212 953 954index = indexes["jis0212"] 955 956# JIS 0212 Kanji 957static_u16_table("JIS0212_KANJI", index[1410:7211]) 958 959# JIS 0212 accented (all non-Kanji, non-range items) 960symbol_index = [] 961symbol_triples = [] 962pointers_to_scan = [ 963 (0, 596), 964 (608, 644), 965 (656, 1409), 966] 967in_run = False 968run_start_pointer = 0 969run_start_array_index = 0 970for (start, end) in pointers_to_scan: 971 for i in range(start, end): 972 code_point = index[i] 973 if in_run: 974 if code_point: 975 symbol_index.append(code_point) 976 elif index[i + 1]: 977 symbol_index.append(0) 978 else: 979 symbol_triples.append(run_start_pointer) 980 symbol_triples.append(i - run_start_pointer) 981 symbol_triples.append(run_start_array_index) 982 in_run = False 983 else: 984 if code_point: 985 in_run = True 986 run_start_pointer = i 987 run_start_array_index = len(symbol_index) 988 
symbol_index.append(code_point) 989 if in_run: 990 symbol_triples.append(run_start_pointer) 991 symbol_triples.append(end - run_start_pointer) 992 symbol_triples.append(run_start_array_index) 993 in_run = False 994if in_run: 995 raise Error() 996 997static_u16_table("JIS0212_ACCENTED", symbol_index) 998static_u16_table("JIS0212_ACCENTED_TRIPLES", symbol_triples) 999 1000# gb18030 1001 1002index = indexes["gb18030"] 1003 1004# Unicode 1.1 ideographs above the old GB2312 block 1005# Compressed form takes 63% of uncompressed form 1006pointers = [] 1007offsets = [] 1008previous_code_point = 0 1009for i in xrange(6080): 1010 code_point = index[i] 1011 if previous_code_point > code_point: 1012 raise Error() 1013 if code_point - previous_code_point != 1: 1014 pointers.append(i) 1015 offsets.append(code_point) 1016 previous_code_point = code_point 1017 1018static_u16_table("GBK_TOP_IDEOGRAPH_POINTERS", pointers) 1019static_u16_table("GBK_TOP_IDEOGRAPH_OFFSETS", offsets) 1020 1021# Unicode 1.1 ideographs to the left of the old GB2312 block 1022# Compressed form takes 40% of uncompressed form 1023pointers = [] 1024offsets = [] 1025previous_code_point = 0 1026for row in xrange(0x7D - 0x29): 1027 for column in xrange(190 - 94): 1028 i = 7790 + column + (row * 190) 1029 if i > 23650: 1030 # Exclude compatibility ideographs at the end 1031 break 1032 code_point = index[i] 1033 if previous_code_point > code_point: 1034 raise Error() 1035 if code_point - previous_code_point != 1: 1036 pointers.append(column + (row * (190 - 94))) 1037 offsets.append(code_point) 1038 previous_code_point = code_point 1039 1040static_u16_table("GBK_LEFT_IDEOGRAPH_POINTERS", pointers) 1041static_u16_table("GBK_LEFT_IDEOGRAPH_OFFSETS", offsets) 1042 1043# GBK other (excl. 
Ext A, Compat & PUA at the bottom) 1044pointers = [] 1045offsets = [] 1046previous_code_point = 0 1047for row in xrange(0x29 - 0x20): 1048 for column in xrange(190 - 94): 1049 i = 6080 + column + (row * 190) 1050 code_point = index[i] 1051 if code_point - previous_code_point != 1: 1052 pointers.append(column + (row * (190 - 94))) 1053 offsets.append(code_point) 1054 previous_code_point = code_point 1055 1056pointers.append((190 - 94) * (0x29 - 0x20)) 1057static_u16_table("GBK_OTHER_POINTERS", pointers) 1058static_u16_table("GBK_OTHER_UNSORTED_OFFSETS", offsets) 1059 1060# GBK bottom: Compatibility ideagraphs, Ext A and PUA 1061bottom_index = [] 1062# 5 compat following Unified Ideographs 1063for i in range(23651, 23656): 1064 bottom_index.append(index[i]) 1065# Last row 1066for i in range(23750, 23846): 1067 bottom_index.append(index[i]) 1068 1069static_u16_table("GBK_BOTTOM", bottom_index) 1070 1071# GB2312 Hanzi 1072# (and the 5 PUA code points in between Level 1 and Level 2) 1073hanzi_index = [] 1074for row in xrange(0x77 - 0x2F): 1075 for column in xrange(94): 1076 hanzi_index.append(index[9026 + column + (row * 190)]) 1077 1078static_u16_table("GB2312_HANZI", hanzi_index) 1079 1080# GB2312 symbols 1081symbol_index = [] 1082for i in xrange(94): 1083 symbol_index.append(index[6176 + i]) 1084 1085static_u16_table("GB2312_SYMBOLS", symbol_index) 1086 1087# GB2312 symbols on Greek row (incl. PUA) 1088symbol_index = [] 1089for i in xrange(22): 1090 symbol_index.append(index[7189 + i]) 1091 1092static_u16_table("GB2312_SYMBOLS_AFTER_GREEK", symbol_index) 1093 1094# GB2312 Pinyin 1095pinyin_index = [] 1096for i in xrange(32): 1097 pinyin_index.append(index[7506 + i]) 1098 1099static_u16_table("GB2312_PINYIN", pinyin_index) 1100 1101# GB2312 other (excl. 
bottom PUA) 1102pointers = [] 1103offsets = [] 1104previous_code_point = 0 1105for row in xrange(14): 1106 for column in xrange(94): 1107 i = 6366 + column + (row * 190) 1108 code_point = index[i] 1109 # Exclude the two ranges that were processed as 1110 # lookup tables above by filling them with 1111 # ASCII. Upon encode, ASCII code points will 1112 # never appear as the search key. 1113 if (i >= 7189 and i < 7189 + 22): 1114 code_point = i - 7189 1115 elif (i >= 7506 and i < 7506 + 32): 1116 code_point = i - 7506 1117 if code_point - previous_code_point != 1: 1118 pointers.append(column + (row * 94)) 1119 offsets.append(code_point) 1120 previous_code_point = code_point 1121 1122pointers.append(14 * 94) 1123static_u16_table("GB2312_OTHER_POINTERS", pointers) 1124static_u16_table("GB2312_OTHER_UNSORTED_OFFSETS", offsets) 1125 1126# Non-gbk code points 1127pointers = [] 1128offsets = [] 1129for pair in indexes["gb18030-ranges"]: 1130 if pair[1] == 0x10000: 1131 break # the last entry doesn't fit in u16 1132 pointers.append(pair[0]) 1133 offsets.append(pair[1]) 1134 1135static_u16_table("GB18030_RANGE_POINTERS", pointers) 1136static_u16_table("GB18030_RANGE_OFFSETS", offsets) 1137 1138# Encoder table for Level 1 Hanzi 1139# The units here really fit into 12 bits, but since we're 1140# looking for speed here, let's use 16 bits per unit. 1141# Once we use 16 bits per unit, we might as well precompute 1142# the output bytes. 
# Slice off Level 1 Hanzi (28 rows of 94 cells, minus the 5 PUA cells)
# and build (code point, (lead byte, trail byte)) pairs sorted by code
# point so the encoder can binary-search them.
level1_hanzi_index = hanzi_index[:(94 * (0xD8 - 0xB0) - 5)]
level1_hanzi_pairs = []
for idx in xrange(len(level1_hanzi_index)):
    hanzi_lead = (idx / 94) + 0xB0
    hanzi_trail = (idx % 94) + 0xA1
    level1_hanzi_pairs.append((level1_hanzi_index[idx], (hanzi_lead, hanzi_trail)))
level1_hanzi_pairs.sort(key=lambda x: x[0])

static_u16_table_from_indexable("GB2312_LEVEL1_HANZI_CODE_POINTS", level1_hanzi_pairs, 0, "gb-hanzi-encode")
static_u8_pair_table_from_indexable("GB2312_LEVEL1_HANZI_BYTES", level1_hanzi_pairs, 1, "gb-hanzi-encode")

# Fast Hanzi encoder table: direct code point -> (lead, trail) array over
# the URO range, trading memory for encode speed.
hanzi_bytes = [None] * (0x9FA7 - 0x4E00)
for row in xrange(126):
    for column in xrange(190):
        pointer = column + (row * 190)
        code_point = index[pointer]
        if code_point and code_point >= 0x4E00 and code_point <= 0x9FA6:
            hanzi_lead = 0x81 + row
            hanzi_trail = column + (0x40 if column < 0x3F else 0x41)
            hanzi_bytes[code_point - 0x4E00] = (hanzi_lead, hanzi_trail)

static_u8_pair_table("GBK_HANZI_BYTES", hanzi_bytes, "fast-gb-hanzi-encode")

data_file.write(data_rs_end)

data_file.close()

# Variant
#
# Generate src/variant.rs: the hand-rolled dispatch enums that make
# `Decoder`/`Encoder` `Sized`.

variant_file = open("src/variant.rs", "w")
variant_file.write('''// Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
// Instead, please regenerate using generate-encoding-data.py

//! This module provides enums that wrap the various decoders and encoders.
//! The purpose is to make `Decoder` and `Encoder` `Sized` by writing the
//! dispatch explicitly for a finite set of specialized decoders and encoders.
//! Unfortunately, this means the compiler doesn't generate the dispatch code
//! and it has to be written here instead.
//!
//! The purpose of making `Decoder` and `Encoder` `Sized` is to allow stack
//! allocation in Rust code, including the convenience methods on `Encoding`.

''')

# All variants: one shared single-byte variant, then the multi-byte
# encodings (UTF-16LE/BE collapse into one UTF-16 variant).
encoding_variants = [u"single-byte",]
for encoding in multi_byte:
    if encoding["name"] in [u"UTF-16LE", u"UTF-16BE"]:
        continue
    else:
        encoding_variants.append(encoding["name"])
encoding_variants.append(u"UTF-16")

# GBK shares the gb18030 decoder, so it has no decoder variant of its own.
decoder_variants = []
for variant in encoding_variants:
    if variant == u"GBK":
        continue
    decoder_variants.append(variant)

# replacement and UTF-16 have no encoder; GBK shares the gb18030 encoder.
encoder_variants = []
for variant in encoding_variants:
    if variant in [u"replacement", u"GBK", u"UTF-16"]:
        continue
    encoder_variants.append(variant)

for variant in decoder_variants:
    variant_file.write("use %s::*;\n" % to_snake_name(variant))

variant_file.write('''use super::*;

pub enum VariantDecoder {
''')

for variant in decoder_variants:
    variant_file.write(" %s(%sDecoder),\n" % (to_camel_name(variant), to_camel_name(variant)))

variant_file.write('''}

impl VariantDecoder {
''')

def write_variant_method(name, mut, arg_list, ret, variants, excludes, kind):
    # Emit one dispatching method on a Variant* enum: a match with one arm
    # per variant that forwards to the wrapped value's method of the same
    # name. Variants listed in `excludes` get a no-op arm instead.
    variant_file.write('''pub fn %s(&''' % name)
    if mut:
        variant_file.write('''mut ''')
    variant_file.write('''self''')
    for arg in arg_list:
        variant_file.write(''', %s: %s''' % (arg[0], arg[1]))
    variant_file.write(''')''')
    if ret:
        variant_file.write(''' -> %s''' % ret)
    variant_file.write(''' {\nmatch *self {\n''')
    for variant in variants:
        variant_file.write('''Variant%s::%s(ref ''' % (kind, to_camel_name(variant)))
        if mut:
            variant_file.write('''mut ''')
        if variant in excludes:
            variant_file.write('''v) => (),''')
            continue
        variant_file.write('''v) => v.%s(''' % name)
        first = True
        for arg in arg_list:
            if not first:
                variant_file.write(''', ''')
            first = False
            variant_file.write(arg[0])
        variant_file.write('''),\n''')
    variant_file.write('''}\n}\n\n''')

write_variant_method("max_utf16_buffer_length", False, [("byte_length", "usize")], "Option<usize>", decoder_variants, [], "Decoder")

write_variant_method("max_utf8_buffer_length_without_replacement", False, [("byte_length", "usize")], "Option<usize>", decoder_variants, [], "Decoder")

write_variant_method("max_utf8_buffer_length", False, [("byte_length", "usize")], "Option<usize>", decoder_variants, [], "Decoder")

write_variant_method("decode_to_utf16_raw", True, [("src", "&[u8]"),
                                                   ("dst", "&mut [u16]"),
                                                   ("last", "bool")], "(DecoderResult, usize, usize)", decoder_variants, [], "Decoder")

write_variant_method("decode_to_utf8_raw", True, [("src", "&[u8]"),
                                                  ("dst", "&mut [u8]"),
                                                  ("last", "bool")], "(DecoderResult, usize, usize)", decoder_variants, [], "Decoder")

variant_file.write('''
    pub fn latin1_byte_compatible_up_to(&self, buffer: &[u8]) -> Option<usize> {
        match *self {
            VariantDecoder::SingleByte(ref v) => {
                return Some(v.latin1_byte_compatible_up_to(buffer));
            }
            VariantDecoder::Utf8(ref v) => {
                if !v.in_neutral_state() {
                    return None;
                }
            }
            VariantDecoder::Gb18030(ref v) => {
                if !v.in_neutral_state() {
                    return None;
                }
            }
            VariantDecoder::Big5(ref v) => {
                if !v.in_neutral_state() {
                    return None;
                }
            }
            VariantDecoder::EucJp(ref v) => {
                if !v.in_neutral_state() {
                    return None;
                }
            }
            VariantDecoder::Iso2022Jp(ref v) => {
                if v.in_neutral_state() {
                    return Some(Encoding::iso_2022_jp_ascii_valid_up_to(buffer));
                }
                return None;
            }
            VariantDecoder::ShiftJis(ref v) => {
                if !v.in_neutral_state() {
                    return None;
                }
            }
            VariantDecoder::EucKr(ref v) => {
                if !v.in_neutral_state() {
                    return None;
                }
            }
            VariantDecoder::UserDefined(_) => {}
            VariantDecoder::Replacement(_) | VariantDecoder::Utf16(_) => {
                return None;
            }
        };
        Some(Encoding::ascii_valid_up_to(buffer))
    }
}

pub enum VariantEncoder {
''')

for variant in encoder_variants:
    variant_file.write(" %s(%sEncoder),\n" % (to_camel_name(variant), to_camel_name(variant)))

variant_file.write('''}

impl VariantEncoder {
    pub fn has_pending_state(&self) -> bool {
        match *self {
            VariantEncoder::Iso2022Jp(ref v) => {
                v.has_pending_state()
            }
            _ => false,
        }
    }
''')

write_variant_method("max_buffer_length_from_utf16_without_replacement", False, [("u16_length", "usize")], "Option<usize>", encoder_variants, [], "Encoder")

write_variant_method("max_buffer_length_from_utf8_without_replacement", False, [("byte_length", "usize")], "Option<usize>", encoder_variants, [], "Encoder")

write_variant_method("encode_from_utf16_raw", True, [("src", "&[u16]"),
                                                     ("dst", "&mut [u8]"),
                                                     ("last", "bool")], "(EncoderResult, usize, usize)", encoder_variants, [], "Encoder")

write_variant_method("encode_from_utf8_raw", True, [("src", "&str"),
                                                    ("dst", "&mut [u8]"),
                                                    ("last", "bool")], "(EncoderResult, usize, usize)", encoder_variants, [], "Encoder")


variant_file.write('''}

pub enum VariantEncoding {
    SingleByte(&'static [u16; 128], u16, u8, u8),''')

for encoding in multi_byte:
    variant_file.write("%s,\n" % to_camel_name(encoding["name"]))

variant_file.write('''}

impl VariantEncoding {
    pub fn new_variant_decoder(&self) -> VariantDecoder {
        match *self {
            VariantEncoding::SingleByte(table, _, _, _) => SingleByteDecoder::new(table),
            VariantEncoding::Utf8 => Utf8Decoder::new(),
            VariantEncoding::Gbk | VariantEncoding::Gb18030 => Gb18030Decoder::new(),
            VariantEncoding::Big5 => Big5Decoder::new(),
            VariantEncoding::EucJp => EucJpDecoder::new(),
            VariantEncoding::Iso2022Jp => Iso2022JpDecoder::new(),
            VariantEncoding::ShiftJis => ShiftJisDecoder::new(),
            VariantEncoding::EucKr => EucKrDecoder::new(),
            VariantEncoding::Replacement => ReplacementDecoder::new(),
            VariantEncoding::UserDefined => UserDefinedDecoder::new(),
            VariantEncoding::Utf16Be => Utf16Decoder::new(true),
            VariantEncoding::Utf16Le => Utf16Decoder::new(false),
        }
    }

    pub fn new_encoder(&self, encoding: &'static Encoding) -> Encoder {
        match *self {
            VariantEncoding::SingleByte(table, run_bmp_offset, run_byte_offset, run_length) => SingleByteEncoder::new(encoding, table, run_bmp_offset, run_byte_offset, run_length),
            VariantEncoding::Utf8 => Utf8Encoder::new(encoding),
            VariantEncoding::Gbk => Gb18030Encoder::new(encoding, false),
            VariantEncoding::Gb18030 => Gb18030Encoder::new(encoding, true),
            VariantEncoding::Big5 => Big5Encoder::new(encoding),
            VariantEncoding::EucJp => EucJpEncoder::new(encoding),
            VariantEncoding::Iso2022Jp => Iso2022JpEncoder::new(encoding),
            VariantEncoding::ShiftJis => ShiftJisEncoder::new(encoding),
            VariantEncoding::EucKr => EucKrEncoder::new(encoding),
            VariantEncoding::UserDefined => UserDefinedEncoder::new(encoding),
            VariantEncoding::Utf16Be | VariantEncoding::Replacement |
            VariantEncoding::Utf16Le => unreachable!(),
        }
    }

    pub fn is_single_byte(&self) -> bool {
        match *self {
            VariantEncoding::SingleByte(_, _, _, _) | VariantEncoding::UserDefined => true,
            _ => false,
        }
    }
}
''')

variant_file.close()

(ffi_rs_begin, ffi_rs_end) = read_non_generated("../encoding_c/src/lib.rs")
# Generate the FFI constants for the encoding_c crate.
ffi_file = open("../encoding_c/src/lib.rs", "w")

ffi_file.write(ffi_rs_begin)
ffi_file.write("""
// Instead, please regenerate using generate-encoding-data.py

/// The minimum length of buffers that may be passed to `encoding_name()`.
pub const ENCODING_NAME_MAX_LENGTH: usize = %d; // %s

""" % (longest_name_length, longest_name))

for name in preferred:
    ffi_file.write('''/// The %s encoding.
#[no_mangle]
pub static %s_ENCODING: ConstEncoding = ConstEncoding(&%s_INIT);

''' % (to_dom_name(name), to_constant_name(name), to_constant_name(name)))

ffi_file.write(ffi_rs_end)
ffi_file.close()

# Generate the single-byte round-trip tests. ISO-8859-8-I shares its table
# with ISO-8859-8, so it is skipped to avoid a duplicate test.
(single_byte_rs_begin, single_byte_rs_end) = read_non_generated("src/single_byte.rs")

single_byte_file = open("src/single_byte.rs", "w")

single_byte_file.write(single_byte_rs_begin)
single_byte_file.write("""
// Instead, please regenerate using generate-encoding-data.py

    #[test]
    fn test_single_byte_decode() {""")

for name in preferred:
    if name == u"ISO-8859-8-I":
        continue
    if is_single_byte(name):
        single_byte_file.write("""
    decode_single_byte(%s, &data::SINGLE_BYTE_DATA.%s);""" % (to_constant_name(name), to_snake_name(name)))

single_byte_file.write("""
    }

    #[test]
    fn test_single_byte_encode() {""")

for name in preferred:
    if name == u"ISO-8859-8-I":
        continue
    if is_single_byte(name):
        single_byte_file.write("""
    encode_single_byte(%s, &data::SINGLE_BYTE_DATA.%s);""" % (to_constant_name(name), to_snake_name(name)))

single_byte_file.write("""
    }
""")

single_byte_file.write(single_byte_rs_end)
single_byte_file.close()

# Generate the C header with one extern constant per preferred encoding.
static_file = open("../encoding_c/include/encoding_rs_statics.h", "w")

static_file.write("""// Copyright 2016 Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
// Instead, please regenerate using generate-encoding-data.py

// This file is not meant to be included directly. Instead, encoding_rs.h
// includes this file.

#ifndef encoding_rs_statics_h_
#define encoding_rs_statics_h_

#ifndef ENCODING_RS_ENCODING
#define ENCODING_RS_ENCODING Encoding
#ifndef __cplusplus
typedef struct Encoding_ Encoding;
#endif
#endif

#ifndef ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR
#define ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const ENCODING_RS_ENCODING*
#endif

#ifndef ENCODING_RS_ENCODER
#define ENCODING_RS_ENCODER Encoder
#ifndef __cplusplus
typedef struct Encoder_ Encoder;
#endif
#endif

#ifndef ENCODING_RS_DECODER
#define ENCODING_RS_DECODER Decoder
#ifndef __cplusplus
typedef struct Decoder_ Decoder;
#endif
#endif

#define INPUT_EMPTY 0

#define OUTPUT_FULL 0xFFFFFFFF

// %s
#define ENCODING_NAME_MAX_LENGTH %d

""" % (longest_name, longest_name_length))

for name in preferred:
    static_file.write('''/// The %s encoding.
extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const %s_ENCODING;

''' % (to_dom_name(name), to_constant_name(name)))

static_file.write("""#endif // encoding_rs_statics_h_
""")
static_file.close()

# Generate the UTF-8 validation lane table.
(utf_8_rs_begin, utf_8_rs_end) = read_non_generated("src/utf_8.rs")

utf_8_file = open("src/utf_8.rs", "w")

utf_8_file.write(utf_8_rs_begin)
utf_8_file.write("""
// Instead, please regenerate using generate-encoding-data.py

pub static UTF8_DATA: Utf8Data = Utf8Data {
    table: [
""")

# First 256 entries: for each possible second byte, the set of lanes in
# which that byte is a valid continuation.
for second_byte in range(256):
    combined = (1 << 2) # invalid lead
    if second_byte < 0x80 or second_byte > 0xBF:
        combined |= (1 << 3) # normal trail
    if second_byte < 0xA0 or second_byte > 0xBF:
        combined |= (1 << 4) # three-byte special lower bound
    if second_byte < 0x80 or second_byte > 0x9F:
        combined |= (1 << 5) # three-byte special upper bound
    if second_byte < 0x90 or second_byte > 0xBF:
        combined |= (1 << 6) # four-byte special lower bound
    if second_byte < 0x80 or second_byte > 0x8F:
        combined |= (1 << 7) # four-byte special upper bound
    utf_8_file.write("%d," % combined)

# Next 128 entries: for each possible lead byte >= 0x80, which single lane
# applies to it.
for lead_byte in range(128, 256):
    lane = (1 << 2) # invalid lead
    if lead_byte >= 0xC2 and lead_byte <= 0xDF:
        lane = (1 << 3) # normal trail
    elif lead_byte == 0xE0:
        lane = (1 << 4) # three-byte special lower bound
    elif lead_byte >= 0xE1 and lead_byte <= 0xEC:
        lane = (1 << 3) # normal trail
    elif lead_byte == 0xED:
        lane = (1 << 5) # three-byte special upper bound
    elif lead_byte >= 0xEE and lead_byte <= 0xEF:
        lane = (1 << 3) # normal trail
    elif lead_byte == 0xF0:
        lane = (1 << 6) # four-byte special lower bound
    elif lead_byte >= 0xF1 and lead_byte <= 0xF3:
        lane = (1 << 3) # normal trail
    elif lead_byte == 0xF4:
        lane = (1 << 7) # four-byte special upper bound
    utf_8_file.write("%d," % lane)

utf_8_file.write("""
    ],
};

""")

utf_8_file.write(utf_8_rs_end)
utf_8_file.close()

# Unit tests

TEST_HEADER = '''Any copyright to the test code below this comment is dedicated to the
Public Domain. http://creativecommons.org/publicdomain/zero/1.0/

This is a generated file. Please do not edit.
Instead, please regenerate using generate-encoding-data.py
'''

index = indexes["jis0208"]

# jis0208 (EUC-JP two-byte) decode test data: every lead/trail pair.
jis0208_in_file = open("src/test_data/jis0208_in.txt", "w")
jis0208_in_file.write(TEST_HEADER)
for pointer in range(0, 94 * 94):
    (lead, trail) = divmod(pointer, 94)
    lead += 0xA1
    trail += 0xA1
    jis0208_in_file.write("%s%s\n" % (chr(lead), chr(trail)))
jis0208_in_file.close()

jis0208_in_ref_file = open("src/test_data/jis0208_in_ref.txt", "w")
jis0208_in_ref_file.write(TEST_HEADER)
for pointer in range(0, 94 * 94):
    code_point = index[pointer]
    if code_point:
        jis0208_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
    else:
        jis0208_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
jis0208_in_ref_file.close()

# jis0208 encode test data. Pointers in the duplicate ranges normalize to
# the first occurrence of the code point in the index.
jis0208_out_file = open("src/test_data/jis0208_out.txt", "w")
jis0208_out_ref_file = open("src/test_data/jis0208_out_ref.txt", "w")
jis0208_out_file.write(TEST_HEADER)
jis0208_out_ref_file.write(TEST_HEADER)
for pointer in range(0, 94 * 94):
    code_point = index[pointer]
    if code_point:
        revised_pointer = pointer
        if revised_pointer == 8644 or (revised_pointer >= 1207 and revised_pointer < 1220):
            revised_pointer = index.index(code_point)
        (lead, trail) = divmod(revised_pointer, 94)
        lead += 0xA1
        trail += 0xA1
        jis0208_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
        jis0208_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
jis0208_out_file.close()
jis0208_out_ref_file.close()

# Shift_JIS decode test data.
shift_jis_in_file = open("src/test_data/shift_jis_in.txt", "w")
shift_jis_in_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    (lead, trail) = divmod(pointer, 188)
    lead += 0x81 if lead < 0x1F else 0xC1
    trail += 0x40 if trail < 0x3F else 0x41
    shift_jis_in_file.write("%s%s\n" % (chr(lead), chr(trail)))
shift_jis_in_file.close()

shift_jis_in_ref_file = open("src/test_data/shift_jis_in_ref.txt", "w")
shift_jis_in_ref_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    # Pointers 8836..10715 map to the EUDC PUA range rather than the index.
    code_point = 0xE000 - 8836 + pointer if pointer >= 8836 and pointer <= 10715 else index[pointer]
    if code_point:
        shift_jis_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
    else:
        trail = pointer % 188
        trail += 0x40 if trail < 0x3F else 0x41
        if trail < 0x80:
            # ASCII trail of an invalid pair is re-emitted after U+FFFD.
            shift_jis_in_ref_file.write((u"\uFFFD%s\n" % unichr(trail)).encode("utf-8"))
        else:
            shift_jis_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
shift_jis_in_ref_file.close()

# Shift_JIS encode test data.
shift_jis_out_file = open("src/test_data/shift_jis_out.txt", "w")
shift_jis_out_ref_file = open("src/test_data/shift_jis_out_ref.txt", "w")
shift_jis_out_file.write(TEST_HEADER)
shift_jis_out_ref_file.write(TEST_HEADER)
for pointer in range(0, 8272):
    code_point = index[pointer]
    if code_point:
        revised_pointer = pointer
        if revised_pointer >= 1207 and revised_pointer < 1220:
            revised_pointer = index.index(code_point)
        (lead, trail) = divmod(revised_pointer, 188)
        lead += 0x81 if lead < 0x1F else 0xC1
        trail += 0x40 if trail < 0x3F else 0x41
        shift_jis_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
        shift_jis_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
for pointer in range(8836, len(index)):
    code_point = index[pointer]
    if code_point:
        revised_pointer = index.index(code_point)
        if revised_pointer >= 8272 and revised_pointer < 8836:
            revised_pointer = pointer
        (lead, trail) = divmod(revised_pointer, 188)
        lead += 0x81 if lead < 0x1F else 0xC1
        trail += 0x40 if trail < 0x3F else 0x41
        shift_jis_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
        shift_jis_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
shift_jis_out_file.close()
shift_jis_out_ref_file.close()

# ISO-2022-JP decode test data.
iso_2022_jp_in_file = open("src/test_data/iso_2022_jp_in.txt", "w")
iso_2022_jp_in_file.write(TEST_HEADER)
for pointer in range(0, 94 * 94):
    (lead, trail) = divmod(pointer, 94)
    lead += 0x21
    trail += 0x21
    iso_2022_jp_in_file.write("\x1B$B%s%s\x1B(B\n" % (chr(lead), chr(trail)))
iso_2022_jp_in_file.close()

iso_2022_jp_in_ref_file = open("src/test_data/iso_2022_jp_in_ref.txt", "w")
iso_2022_jp_in_ref_file.write(TEST_HEADER)
for pointer in range(0, 94 * 94):
    code_point = index[pointer]
    if code_point:
        iso_2022_jp_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
    else:
        iso_2022_jp_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
iso_2022_jp_in_ref_file.close()

# ISO-2022-JP encode test data (incl. half-width katakana normalization).
iso_2022_jp_out_file = open("src/test_data/iso_2022_jp_out.txt", "w")
iso_2022_jp_out_ref_file = open("src/test_data/iso_2022_jp_out_ref.txt", "w")
iso_2022_jp_out_file.write(TEST_HEADER)
iso_2022_jp_out_ref_file.write(TEST_HEADER)
for pointer in range(0, 94 * 94):
    code_point = index[pointer]
    if code_point:
        revised_pointer = pointer
        if revised_pointer == 8644 or (revised_pointer >= 1207 and revised_pointer < 1220):
            revised_pointer = index.index(code_point)
        (lead, trail) = divmod(revised_pointer, 94)
        lead += 0x21
        trail += 0x21
        iso_2022_jp_out_ref_file.write("\x1B$B%s%s\x1B(B\n" % (chr(lead), chr(trail)))
        iso_2022_jp_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
for i in xrange(len(half_width_index)):
    code_point = i + 0xFF61
    normalized_code_point = half_width_index[i]
    pointer = index.index(normalized_code_point)
    (lead, trail) = divmod(pointer, 94)
    lead += 0x21
    trail += 0x21
    iso_2022_jp_out_ref_file.write("\x1B$B%s%s\x1B(B\n" % (chr(lead), chr(trail)))
    iso_2022_jp_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
iso_2022_jp_out_file.close()
iso_2022_jp_out_ref_file.close()

index = indexes["euc-kr"]

# EUC-KR decode test data.
euc_kr_in_file = open("src/test_data/euc_kr_in.txt", "w")
euc_kr_in_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    (lead, trail) = divmod(pointer, 190)
    lead += 0x81
    trail += 0x41
    euc_kr_in_file.write("%s%s\n" % (chr(lead), chr(trail)))
euc_kr_in_file.close()

euc_kr_in_ref_file = open("src/test_data/euc_kr_in_ref.txt", "w")
euc_kr_in_ref_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    code_point = index[pointer]
    if code_point:
        euc_kr_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
    else:
        trail = pointer % 190
        trail += 0x41
        if trail < 0x80:
            euc_kr_in_ref_file.write((u"\uFFFD%s\n" % unichr(trail)).encode("utf-8"))
        else:
            euc_kr_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
euc_kr_in_ref_file.close()

# EUC-KR encode test data.
euc_kr_out_file = open("src/test_data/euc_kr_out.txt", "w")
euc_kr_out_ref_file = open("src/test_data/euc_kr_out_ref.txt", "w")
euc_kr_out_file.write(TEST_HEADER)
euc_kr_out_ref_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    code_point = index[pointer]
    if code_point:
        (lead, trail) = divmod(pointer, 190)
        lead += 0x81
        trail += 0x41
        euc_kr_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
        euc_kr_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
euc_kr_out_file.close()
euc_kr_out_ref_file.close()

index = indexes["gb18030"]

# gb18030 decode test data.
gb18030_in_file = open("src/test_data/gb18030_in.txt", "w")
gb18030_in_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    (lead, trail) = divmod(pointer, 190)
    lead += 0x81
    trail += 0x40 if trail < 0x3F else 0x41
    gb18030_in_file.write("%s%s\n" % (chr(lead), chr(trail)))
gb18030_in_file.close()

gb18030_in_ref_file = open("src/test_data/gb18030_in_ref.txt", "w")
gb18030_in_ref_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    code_point = index[pointer]
    if code_point:
        gb18030_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
    else:
        trail = pointer % 190
        trail += 0x40 if trail < 0x3F else 0x41
        if trail < 0x80:
            # An ASCII trail of an invalid pair is re-emitted after U+FFFD.
            gb18030_in_ref_file.write((u"\uFFFD%s\n" % unichr(trail)).encode("utf-8"))
        else:
            gb18030_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
gb18030_in_ref_file.close()

# gb18030 encode test data. Pointer 6555 is skipped: its code point encodes
# via a different pointer.
gb18030_out_file = open("src/test_data/gb18030_out.txt", "w")
gb18030_out_ref_file = open("src/test_data/gb18030_out_ref.txt", "w")
gb18030_out_file.write(TEST_HEADER)
gb18030_out_ref_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    if pointer == 6555:
        continue
    code_point = index[pointer]
    if code_point:
        (lead, trail) = divmod(pointer, 190)
        lead += 0x81
        trail += 0x40 if trail < 0x3F else 0x41
        gb18030_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
        gb18030_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
gb18030_out_file.close()
gb18030_out_ref_file.close()

index = indexes["big5"]

# Big5 decode test data.
big5_in_file = open("src/test_data/big5_in.txt", "w")
big5_in_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    (lead, trail) = divmod(pointer, 157)
    lead += 0x81
    trail += 0x40 if trail < 0x3F else 0x62
    big5_in_file.write("%s%s\n" % (chr(lead), chr(trail)))
big5_in_file.close()

# Pointers that decode to a two-character sequence instead of one code point.
big5_two_characters = {
    1133: u"\u00CA\u0304",
    1135: u"\u00CA\u030C",
    1164: u"\u00EA\u0304",
    1166: u"\u00EA\u030C",
}

big5_in_ref_file = open("src/test_data/big5_in_ref.txt", "w")
big5_in_ref_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    # Was `pointer in big5_two_characters.keys()`: in Python 2, .keys()
    # builds a fresh list on every iteration, making each membership test
    # O(n). Direct dict membership is O(1) and semantically identical.
    if pointer in big5_two_characters:
        big5_in_ref_file.write((u"%s\n" % big5_two_characters[pointer]).encode("utf-8"))
        continue
    code_point = index[pointer]
    if code_point:
        big5_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
    else:
        trail = pointer % 157
        trail += 0x40 if trail < 0x3F else 0x62
        if trail < 0x80:
            big5_in_ref_file.write((u"\uFFFD%s\n" % unichr(trail)).encode("utf-8"))
        else:
            big5_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
big5_in_ref_file.close()

# Code points whose encode mapping prefers the LAST pointer in the index.
prefer_last = [
    0x2550,
    0x255E,
    0x2561,
    0x256A,
    0x5341,
    0x5345,
]

pointer_for_prefer_last = []

for code_point in prefer_last:
    # Python lists don't have .rindex() :-(
    for i in xrange(len(index) - 1, -1, -1):
        candidate = index[i]
        if candidate == code_point:
            pointer_for_prefer_last.append(i)
            break

# Big5 encode test data (Big5-level pointers only, i.e. from lead 0xA1 up).
big5_out_file = open("src/test_data/big5_out.txt", "w")
big5_out_ref_file = open("src/test_data/big5_out_ref.txt", "w")
big5_out_file.write(TEST_HEADER)
big5_out_ref_file.write(TEST_HEADER)
for pointer in range(((0xA1 - 0x81) * 157), len(index)):
    code_point = index[pointer]
    if code_point:
        if code_point in prefer_last:
            if pointer != pointer_for_prefer_last[prefer_last.index(code_point)]:
                continue
        else:
            if pointer != index.index(code_point):
                continue
        (lead, trail) = divmod(pointer, 157)
        lead += 0x81
        trail += 0x40 if trail < 0x3F else 0x62
        big5_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
        big5_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
big5_out_file.close()
big5_out_ref_file.close()

index = indexes["jis0212"]

# jis0212 (EUC-JP three-byte, decode-only) test data.
jis0212_in_file = open("src/test_data/jis0212_in.txt", "w")
jis0212_in_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    (lead, trail) = divmod(pointer, 94)
    lead += 0xA1
    trail += 0xA1
    jis0212_in_file.write("\x8F%s%s\n" % (chr(lead), chr(trail)))
jis0212_in_file.close()

jis0212_in_ref_file = open("src/test_data/jis0212_in_ref.txt", "w")
jis0212_in_ref_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
    code_point = index[pointer]
    if code_point:
        jis0212_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
    else:
        jis0212_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
jis0212_in_ref_file.close()

# Generate the codepage crate: code page number <-> Encoding mappings.
(codepage_begin, codepage_end) = read_non_generated("../codepage/src/lib.rs")

codepage_file = open("../codepage/src/lib.rs", "w")

codepage_file.write(codepage_begin)
codepage_file.write("""
// Instead, please regenerate using generate-encoding-data.py

/// Supported code page numbers in estimated order of usage frequency
static CODE_PAGES: [u16; %d] = [
""" % len(code_pages))

for code_page in code_pages:
    codepage_file.write("    %d,\n" % code_page)

codepage_file.write("""];

/// Encodings corresponding to the code page numbers in the same order
static ENCODINGS: [&'static Encoding; %d] = [
""" % len(code_pages))

for code_page in code_pages:
    name = encodings_by_code_page[code_page]
    codepage_file.write("    &%s_INIT,\n" % to_constant_name(name))

codepage_file.write("""];

""")

codepage_file.write(codepage_end)
codepage_file.close()

(codepage_test_begin, codepage_test_end) = read_non_generated("../codepage/src/tests.rs")

codepage_test_file = open("../codepage/src/tests.rs", "w")

codepage_test_file.write(codepage_test_begin)
codepage_test_file.write("""
// Instead, please regenerate using generate-encoding-data.py

#[test]
fn test_to_encoding() {
    assert_eq!(to_encoding(0), None);

""")

for code_page in code_pages:
    codepage_test_file.write("    assert_eq!(to_encoding(%d), Some(%s));\n" % (code_page, to_constant_name(encodings_by_code_page[code_page])))

codepage_test_file.write("""}

#[test]
fn test_from_encoding() {
""")

for name in preferred:
    # Was `code_pages_by_encoding.has_key(name)`: has_key() has been
    # deprecated since Python 2.2 in favor of the `in` operator (and is
    # gone in Python 3); `in` is semantically identical here.
    if name in code_pages_by_encoding:
        codepage_test_file.write("    assert_eq!(from_encoding(%s), Some(%d));\n" % (to_constant_name(name), code_pages_by_encoding[name]))
    else:
        codepage_test_file.write("    assert_eq!(from_encoding(%s), None);\n" % to_constant_name(name))

codepage_test_file.write("""}
""")

codepage_test_file.write(codepage_test_end)
codepage_test_file.close()

# Normalize the formatting of all the generated Rust.
subprocess.call(["cargo", "fmt"])