1#!/usr/bin/python
2
3# Copyright 2013-2016 Mozilla Foundation. See the COPYRIGHT
4# file at the top-level directory of this distribution.
5#
6# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
7# https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
8# <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
9# option. This file may not be copied, modified, or distributed
10# except according to those terms.
11
12import json
13import subprocess
14import sys
15import os.path
16
# Refuse to run unless the required sibling checkouts are present:
# encodings.json and indexes.json come from the WHATWG encoding repo.
if (not os.path.isfile("../encoding/encodings.json")) or (not os.path.isfile("../encoding/indexes.json")):
  sys.stderr.write("This script needs a clone of https://github.com/whatwg/encoding/ (preferably at revision f381389) next to the encoding_rs directory.\n");
  sys.exit(-1)

# The generated parts of the encoding_c crate are written into a sibling checkout.
if not os.path.isfile("../encoding_c/src/lib.rs"):
  sys.stderr.write("This script also writes the generated parts of the encoding_c crate and needs a clone of https://github.com/hsivonen/encoding_c next to the encoding_rs directory.\n");
  sys.exit(-1)

# Likewise for the codepage crate.
if not os.path.isfile("../codepage/src/lib.rs"):
  sys.stderr.write("This script also writes the generated parts of the codepage crate and needs a clone of https://github.com/hsivonen/codepage next to the encoding_rs directory.\n");
  sys.exit(-1)
28
def cmp_from_end(one, other):
  """Three-way compare two sequences: shorter sorts first, and equal-length
  sequences are compared element-by-element from the END towards the start.

  Returns a negative, zero or positive int like Python 2's cmp().
  (Implemented without the Python-2-only cmp() builtin so the helper also
  works under Python 3.)"""
  def _cmp(a, b):
    # Portable replacement for cmp(): True/False are 1/0 on both Pythons.
    return (a > b) - (a < b)
  c = _cmp(len(one), len(other))
  if c != 0:
    return c
  # Walk from the last element towards the first.
  for i in range(len(one) - 1, -1, -1):
    c = _cmp(one[i], other[i])
    if c != 0:
      return c
  return 0
40
41
class Label:
  """One encoding label paired with the name of its preferred encoding."""
  def __init__(self, label, preferred):
    # label: the label string as it appears in encodings.json
    # preferred: the canonical name of the encoding the label maps to
    self.label = label
    self.preferred = preferred
  def __cmp__(self, other):
    # Python 2 ordering: labels sort by length, then from the end.
    return cmp_from_end(self.label, other.label)
48
class CodePage:
  """One Windows code page number paired with its preferred encoding name."""
  def __init__(self, code_page, preferred):
    # code_page: numeric Windows code page identifier
    # preferred: the canonical name of the corresponding encoding
    self.code_page = code_page
    self.preferred = preferred
  def __cmp__(self, other):
    # BUG FIX: previously returned the tuple (self.code_page, other.code_page),
    # which is not a valid __cmp__ result (always truthy), so sorting by code
    # page was broken. Return a proper negative/zero/positive comparison.
    return (self.code_page > other.code_page) - (self.code_page < other.code_page)
55
def static_u16_table(name, data):
  """Write `data` into data_file as `pub static <name>: [u16; N] = [...];`."""
  data_file.write('''pub static %s: [u16; %d] = [
  ''' % (name, len(data)))

  # Iterate values directly; the old xrange()-index loop was Python-2-only.
  for value in data:
    data_file.write('0x%04X,\n' % value)

  data_file.write('''];

  ''')
66
def static_u16_table_from_indexable(name, data, item, feature):
  """Write column `item` of each entry of `data` as a non-pub u16 table
  gated on the "less-slow-<feature>" (but not "fast-<feature>") features."""
  data_file.write('''#[cfg(all(
    feature = "less-slow-%s",
    not(feature = "fast-%s")
))]
static %s: [u16; %d] = [
  ''' % (feature, feature, name, len(data)))

  # Iterate entries directly; the old xrange()-index loop was Python-2-only.
  for entry in data:
    data_file.write('0x%04X,\n' % entry[item])

  data_file.write('''];

  ''')
81
def static_u8_pair_table_from_indexable(name, data, item, feature):
  """Write column `item` (a 2-tuple of bytes) of each entry of `data` as a
  non-pub [[u8; 2]; N] table gated like static_u16_table_from_indexable."""
  data_file.write('''#[cfg(all(
    feature = "less-slow-%s",
    not(feature = "fast-%s")
))]
static %s: [[u8; 2]; %d] = [
  ''' % (feature, feature, name, len(data)))

  # Iterate entries directly; the old xrange()-index loop was Python-2-only.
  for entry in data:
    data_file.write('[0x%02X, 0x%02X],\n' % entry[item])

  data_file.write('''];

  ''')
96
def static_u8_pair_table(name, data, feature):
  """Write `data` (2-tuples of bytes, or None for unmapped slots) as a
  non-pub [[u8; 2]; N] table gated on `feature`."""
  data_file.write('''#[cfg(feature = "%s")]
static %s: [[u8; 2]; %d] = [
  ''' % (feature, name, len(data)))

  # Iterate pairs directly; the old xrange()-index loop was Python-2-only.
  for pair in data:
    if not pair:
      # Unmapped slots are emitted as [0x00, 0x00].
      pair = (0, 0)
    data_file.write('[0x%02X, 0x%02X],\n' % pair)

  data_file.write('''];

  ''')
111
# Canonical (preferred) encoding names, collected from encodings.json.
preferred = []

# Names as exposed to the DOM (currently identical to `preferred`).
dom = []

# All (label, preferred-name) pairs as Label objects.
labels = []

# Data files from the WHATWG encoding checkout (existence checked above).
data = json.load(open("../encoding/encodings.json", "r"))

indexes = json.load(open("../encoding/indexes.json", "r"))

# Encoding records partitioned by the "Legacy single-byte encodings" heading.
single_byte = []

multi_byte = []
125
def to_camel_name(name):
  """Turn a DOM encoding name into the CamelCase identifier used in Rust."""
  # ISO-8859-8-I is special-cased; the other ISO-8859 family members just
  # swap the prefix for "Iso".
  if name == u"iso-8859-8-i":
    return u"Iso8I"
  if name.startswith(u"iso-8859-"):
    return name.replace(u"iso-8859-", u"Iso")
  camel = name.title()
  for junk in (u"X-", u"-", u"_"):
    camel = camel.replace(junk, u"")
  return camel
132
def to_constant_name(name):
  """Turn a DOM encoding name into the UPPER_SNAKE constant name used in Rust."""
  return name.upper().replace(u"-", u"_")
135
def to_snake_name(name):
  """Turn a DOM encoding name into the lower_snake field name used in Rust."""
  return name.lower().replace(u"-", u"_")
138
def to_dom_name(name):
  """Return the name as exposed to the DOM (currently the identity mapping)."""
  return name
141
# Guestimate based on
# https://w3techs.com/technologies/overview/character_encoding/all
# whose methodology is known to be bogus, but the results are credible for
# this purpose. UTF-16LE lifted up due to prevalence on Windows and
# "ANSI codepages" prioritized.
encodings_by_code_page_frequency = [
  "UTF-8",
  "UTF-16LE",
  "windows-1252",
  "windows-1251",
  "GBK",
  "Shift_JIS",
  "EUC-KR",
  "windows-1250",
  "windows-1256",
  "windows-1254",
  "Big5",
  "windows-874",
  "windows-1255",
  "windows-1253",
  "windows-1257",
  "windows-1258",
  "EUC-JP",
  "ISO-8859-2",
  "ISO-8859-15",
  "ISO-8859-7",
  "KOI8-R",
  "gb18030",
  "ISO-8859-5",
  "ISO-8859-8-I",
  "ISO-8859-4",
  "ISO-8859-6",
  "ISO-2022-JP",
  "KOI8-U",
  "ISO-8859-13",
  "ISO-8859-3",
  "UTF-16BE",
  "IBM866",
  "ISO-8859-10",
  "ISO-8859-8",
  "macintosh",
  "x-mac-cyrillic",
  "ISO-8859-14",
  "ISO-8859-16",
]

# The canonical Windows code page number for each encoding.
encodings_by_code_page = {
  932: "Shift_JIS",
  936: "GBK",
  949: "EUC-KR",
  950: "Big5",
  866: "IBM866",
  874: "windows-874",
  1200: "UTF-16LE",
  1201: "UTF-16BE",
  1250: "windows-1250",
  1251: "windows-1251",
  1252: "windows-1252",
  1253: "windows-1253",
  1254: "windows-1254",
  1255: "windows-1255",
  1256: "windows-1256",
  1257: "windows-1257",
  1258: "windows-1258",
  10000: "macintosh",
  10017: "x-mac-cyrillic",
  20866: "KOI8-R",
  20932: "EUC-JP",
  21866: "KOI8-U",
  28592: "ISO-8859-2",
  28593: "ISO-8859-3",
  28594: "ISO-8859-4",
  28595: "ISO-8859-5",
  28596: "ISO-8859-6",
  28597: "ISO-8859-7",
  28598: "ISO-8859-8",
  28600: "ISO-8859-10",
  28603: "ISO-8859-13",
  28604: "ISO-8859-14",
  28605: "ISO-8859-15",
  28606: "ISO-8859-16",
  38598: "ISO-8859-8-I",
  50221: "ISO-2022-JP",
  54936: "gb18030",
  65001: "UTF-8",
}

code_pages_by_encoding = {}

# .items() instead of the Python-2-only .iteritems() so this also runs
# under Python 3 (equivalent under Python 2).
for code_page, encoding in encodings_by_code_page.items():
  code_pages_by_encoding[encoding] = code_page

# Additional (non-canonical) code page numbers that alias onto the
# encodings above.
encoding_by_alias_code_page = {
  951: "Big5",
  10007: "x-mac-cyrillic",
  20936: "GBK",
  20949: "EUC-KR",
  21010: "UTF-16LE", # Undocumented; needed by calamine for Excel compat
  28591: "windows-1252",
  28599: "windows-1254",
  28601: "windows-874",
  50220: "ISO-2022-JP",
  50222: "ISO-2022-JP",
  50225: "replacement", # ISO-2022-KR
  50227: "replacement", # ISO-2022-CN
  51932: "EUC-JP", # BUG FIX: was 51949, which duplicated the EUC-KR key below and dropped the EUC-JP alias (cp51932 is EUC-JP)
  51936: "GBK",
  51949: "EUC-KR",
  52936: "replacement", # HZ
}

code_pages = []

# Canonical code pages first, in estimated frequency order...
for name in encodings_by_code_page_frequency:
  code_pages.append(code_pages_by_encoding[name])

encodings_by_code_page.update(encoding_by_alias_code_page)

# ...then the remaining (alias) code pages in ascending numeric order.
# sorted() replaces the Python-2-only keys()+sort() sequence.
for code_page in sorted(encodings_by_code_page.keys()):
  if not code_page in code_pages:
    code_pages.append(code_page)
265
# The position in the index (0 is the first index entry,
# i.e. byte value 0x80) that starts the longest run of
# consecutive code points. Must not be in the first
# quadrant. If the character to be encoded is not in this
# run, the part of the index after the run is searched
# forward. Then the part of the index from 32 to the start
# of the run. The first quadrant is searched last.
#
# If there is no obviously most useful longest run,
# the index here is just used to affect the search order.
#
# Values are offsets into the 128-entry (0x80-0xFF) upper half
# of each single-byte index.
start_of_longest_run_in_single_byte = {
  "IBM866": 96, # 0 would be longest, but we don't want to start in the first quadrant
  "windows-874": 33,
  "windows-1250": 92,
  "windows-1251": 64,
  "windows-1252": 32,
  "windows-1253": 83,
  "windows-1254": 95,
  "windows-1255": 96,
  "windows-1256": 65,
  "windows-1257": 95, # not actually longest
  "windows-1258": 95, # not actually longest
  "macintosh": 106, # useless
  "x-mac-cyrillic": 96,
  "KOI8-R": 64, # not actually longest
  "KOI8-U": 64, # not actually longest
  "ISO-8859-2": 95, # not actually longest
  "ISO-8859-3": 95, # not actually longest
  "ISO-8859-4": 95, # not actually longest
  "ISO-8859-5": 46,
  "ISO-8859-6": 65,
  "ISO-8859-7": 83,
  "ISO-8859-8": 96,
  "ISO-8859-10": 90, # not actually longest
  "ISO-8859-13": 95, # not actually longest
  "ISO-8859-14": 95,
  "ISO-8859-15": 63,
  "ISO-8859-16": 95, # not actually longest
}
305
#

# Partition the encodings into single-byte and multi-byte groups and
# collect every (label, preferred name) pair.
for group in data:
  if group["heading"] == "Legacy single-byte encodings":
    single_byte = group["encodings"]
  else:
    multi_byte.extend(group["encodings"])
  for encoding in group["encodings"]:
    preferred.append(encoding["name"])
    for label in encoding["labels"]:
      labels.append(Label(label, encoding["name"]))

for name in preferred:
  dom.append(to_dom_name(name))

preferred.sort()
# Sorts via Label.__cmp__ (by length, then from the end) under Python 2.
labels.sort()
# NOTE(review): the cmp= keyword is Python-2-only; under Python 3 this
# would need functools.cmp_to_key(cmp_from_end).
dom.sort(cmp=cmp_from_end)

longest_label_length = 0
longest_name_length = 0
longest_label = None
longest_name = None

# Find the longest name and the longest label (first one wins ties).
for name in preferred:
  if len(name) > longest_name_length:
    longest_name_length = len(name)
    longest_name = name

for label in labels:
  if len(label.label) > longest_label_length:
    longest_label_length = len(label.label)
    longest_label = label.label
339
def longest_run_for_single_byte(name):
  """Return (first code point, index offset, length) for the run of
  consecutive code points starting at the precomputed offset for `name`.

  ISO-8859-8-I shares its index with ISO-8859-8."""
  if name == u"ISO-8859-8-I":
    name = u"ISO-8859-8"
  index = indexes[name.lower()]
  run_byte_offset = start_of_longest_run_in_single_byte[name]
  run_bmp_offset = index[run_byte_offset]
  run_length = 1
  expected = run_bmp_offset + 1
  # Extend the run while each following index entry is exactly one more
  # than the previous one; stop at the first gap or at the end of the index.
  for code_point in index[run_byte_offset + 1:]:
    if code_point != expected:
      break
    expected += 1
    run_length += 1
  return (run_bmp_offset, run_byte_offset, run_length)
358
def is_single_byte(name):
  """True iff `name` names one of the legacy single-byte encodings."""
  return any(encoding["name"] == name for encoding in single_byte)
364
def read_non_generated(path):
  """Read a partially-generated file and return the (head, tail) pair:
  everything up to and including the BEGIN marker, and everything from
  the END marker onward.

  Exits the process if either marker is missing.
  Uses a `with` block so the handle is closed even if read() raises."""
  with open(path, "r") as partially_generated_file:
    full = partially_generated_file.read()

  generated_begin = "// BEGIN GENERATED CODE. PLEASE DO NOT EDIT."
  generated_end = "// END GENERATED CODE"

  generated_begin_index = full.find(generated_begin)
  if generated_begin_index < 0:
    sys.stderr.write("Can't find generated code start marker in %s. Exiting.\n" % path)
    sys.exit(-1)
  generated_end_index = full.find(generated_end)
  if generated_end_index < 0:
    sys.stderr.write("Can't find generated code end marker in %s. Exiting.\n" % path)
    sys.exit(-1)

  return (full[0:generated_begin_index + len(generated_begin)],
          full[generated_end_index:])
384
# Regenerate the generated region of src/lib.rs: the per-encoding statics,
# the sorted label table, and the parallel encoding table.
(lib_rs_begin, lib_rs_end) = read_non_generated("src/lib.rs")

label_file = open("src/lib.rs", "w")

label_file.write(lib_rs_begin)
label_file.write("""
// Instead, please regenerate using generate-encoding-data.py

const LONGEST_LABEL_LENGTH: usize = %d; // %s

""" % (longest_label_length, longest_label))

for name in preferred:
  variant = None
  if is_single_byte(name):
    # Single-byte encodings embed a pointer to their data table plus the
    # precomputed longest-run parameters.
    (run_bmp_offset, run_byte_offset, run_length) = longest_run_for_single_byte(name)
    variant = "SingleByte(&data::SINGLE_BYTE_DATA.%s, 0x%04X, %d, %d)" % (to_snake_name(u"iso-8859-8" if name == u"ISO-8859-8-I" else name), run_bmp_offset, run_byte_offset, run_length)
  else:
    variant = to_camel_name(name)

  # Per-encoding rustdoc text lives in doc/<name>.txt.
  docfile = open("doc/%s.txt" % name, "r")
  doctext = docfile.read()
  docfile.close()

  label_file.write('''/// The initializer for the [%s](static.%s.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static %s_INIT: Encoding = Encoding {
    name: "%s",
    variant: VariantEncoding::%s,
};

/// The %s encoding.
///
%s///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static %s: &'static Encoding = &%s_INIT;

''' % (to_dom_name(name), to_constant_name(name), to_constant_name(name), to_dom_name(name), variant, to_dom_name(name), doctext, to_constant_name(name), to_constant_name(name)))

# LABELS_SORTED and ENCODINGS_IN_LABEL_SORT are parallel arrays in the
# `labels` sort order.
label_file.write("""static LABELS_SORTED: [&'static str; %d] = [
""" % len(labels))

for label in labels:
  label_file.write('''"%s",\n''' % label.label)

label_file.write("""];

static ENCODINGS_IN_LABEL_SORT: [&'static Encoding; %d] = [
""" % len(labels))

for label in labels:
  label_file.write('''&%s_INIT,\n''' % to_constant_name(label.preferred))

label_file.write('''];

''')
label_file.write(lib_rs_end)
label_file.close()

# Generate a Rust test asserting every label resolves to its encoding.
label_test_file = open("src/test_labels_names.rs", "w")
label_test_file.write('''// Any copyright to the test code below this comment is dedicated to the
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/

// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
// Instead, please regenerate using generate-encoding-data.py

use super::*;

#[test]
fn test_all_labels() {
''')

for label in labels:
  label_test_file.write('''assert_eq!(Encoding::for_label(b"%s"), Some(%s));\n''' % (label.label, to_constant_name(label.preferred)))

label_test_file.write('''}
''')
label_test_file.close()
476
def null_to_zero(code_point):
  """Map a null (None) index entry to 0; pass real code points through."""
  return code_point or 0
481
# Regenerate the generated region of src/data.rs.
(data_rs_begin, data_rs_end) = read_non_generated("src/data.rs")

data_file = open("src/data.rs", "w")
data_file.write(data_rs_begin)
data_file.write('''
// Instead, please regenerate using generate-encoding-data.py

#[repr(align(64))] // Align to cache lines
pub struct SingleByteData {
''')

# Single-byte

# ISO-8859-8-I shares its data with ISO-8859-8, so it gets no field of its own.
for encoding in single_byte:
  name = encoding["name"]
  if name == u"ISO-8859-8-I":
    continue

  data_file.write('''    pub %s: [u16; 128],
''' % to_snake_name(name))

data_file.write('''}

pub static SINGLE_BYTE_DATA: SingleByteData = SingleByteData {
''')

for encoding in single_byte:
  name = encoding["name"]
  if name == u"ISO-8859-8-I":
    continue

  data_file.write('''    %s: [
''' % to_snake_name(name))

  # Each index has 128 entries (bytes 0x80-0xFF); nulls become 0.
  for code_point in indexes[name.lower()]:
    data_file.write('0x%04X,\n' % null_to_zero(code_point))

  data_file.write('''],
''')

data_file.write('''};

''')
525
# Big5

index = indexes["big5"]

# Split each mapped code point into an "is astral" bit and its low 16 bits.
astralness = []
low_bits = []

for code_point in index[942:19782]:
  if code_point:
    astralness.append(1 if code_point > 0xFFFF else 0)
    low_bits.append(code_point & 0xFFFF)
  else:
    astralness.append(0)
    low_bits.append(0)

# pad length to multiple of 32
for j in xrange(32 - (len(astralness) % 32)):
  astralness.append(0)

# (Python 2: "/" on ints is floor division here.)
data_file.write('''#[cfg_attr(feature = "cargo-clippy", allow(unreadable_literal))]
static BIG5_ASTRALNESS: [u32; %d] = [
''' % (len(astralness) / 32))

# Pack the astralness bits into u32 words, LSB first.
i = 0
while i < len(astralness):
  accu = 0
  for j in xrange(32):
    accu |= astralness[i + j] << j
  data_file.write('0x%08X,\n' % accu)
  i += 32

data_file.write('''];

''')

static_u16_table("BIG5_LOW_BITS", low_bits)

# Encoder table for Level 1 Hanzi
# Note: If we were OK with doubling this table, we
# could use a directly-indexable table instead...
level1_hanzi_index = index[5495:10896]
level1_hanzi_pairs = []
for i in xrange(len(level1_hanzi_index)):
  hanzi_lead = (i / 157) + 0xA4
  hanzi_trail = (i % 157)
  hanzi_trail += 0x40 if hanzi_trail < 0x3F else 0x62
  level1_hanzi_pairs.append((level1_hanzi_index[i], (hanzi_lead, hanzi_trail)))
level1_hanzi_pairs.append((0x4E5A, (0xC8, 0x7B)))
level1_hanzi_pairs.append((0x5202, (0xC8, 0x7D)))
level1_hanzi_pairs.append((0x9FB0, (0xC8, 0xA1)))
level1_hanzi_pairs.append((0x5188, (0xC8, 0xA2)))
level1_hanzi_pairs.append((0x9FB1, (0xC8, 0xA3)))
level1_hanzi_pairs.sort(key=lambda x: x[0])

static_u16_table_from_indexable("BIG5_LEVEL1_HANZI_CODE_POINTS", level1_hanzi_pairs, 0, "big5-hanzi-encode")
static_u8_pair_table_from_indexable("BIG5_LEVEL1_HANZI_BYTES", level1_hanzi_pairs, 1, "big5-hanzi-encode")

# Fast Unified Ideograph encode
big5_unified_ideograph_bytes = [None] * (0x9FCC - 0x4E00)
for row in xrange(0x7E - 0x20):
  for column in xrange(157):
    pointer = 5024 + column + (row * 157)
    code_point = index[pointer]
    if code_point and code_point >= 0x4E00 and code_point <= 0x9FCB:
      unified_offset = code_point - 0x4E00
      unified_lead = 0xA1 + row
      unified_trail = (0x40 if column < 0x3F else 0x62) + column
      # First mapping wins, except 0x5341 and 0x5345 take the last one.
      if code_point == 0x5341 or code_point == 0x5345 or not big5_unified_ideograph_bytes[unified_offset]:
        big5_unified_ideograph_bytes[unified_offset] = (unified_lead, unified_trail)

static_u8_pair_table("BIG5_UNIFIED_IDEOGRAPH_BYTES", big5_unified_ideograph_bytes, "fast-big5-hanzi-encode")
597
# JIS0208

index = indexes["jis0208"]

# JIS 0208 Level 1 Kanji
static_u16_table("JIS0208_LEVEL1_KANJI", index[1410:4375])

# JIS 0208 Level 2 Kanji and Additional Kanji
static_u16_table("JIS0208_LEVEL2_AND_ADDITIONAL_KANJI", index[4418:7808])

# IBM Kanji
static_u16_table("IBM_KANJI", index[8272:8632])

# Check that the other instance is the same
# NOTE(review): Error is undefined here, so this raises NameError;
# that still aborts the script, but raise Exception(...) would be clearer.
if index[8272:8632] != index[10744:11104]:
  raise Error()

# JIS 0208 symbols (all non-Kanji, non-range items)
# Scan for runs of mapped pointers; each run is recorded as a
# (start pointer, length, start index into symbol_index) triple.
symbol_index = []
symbol_triples = []
pointers_to_scan = [
  (0, 188),
  (658, 691),
  (1159, 1221),
]
in_run = False
run_start_pointer = 0
run_start_array_index = 0
for (start, end) in pointers_to_scan:
  for i in range(start, end):
    code_point = index[i]
    if in_run:
      if code_point:
        symbol_index.append(code_point)
      else:
        symbol_triples.append(run_start_pointer)
        symbol_triples.append(i - run_start_pointer)
        symbol_triples.append(run_start_array_index)
        in_run = False
    else:
      if code_point:
        in_run = True
        run_start_pointer = i
        run_start_array_index = len(symbol_index)
        symbol_index.append(code_point)
  # Close a run left open at the end of a scanned span.
  if in_run:
    symbol_triples.append(run_start_pointer)
    symbol_triples.append(end - run_start_pointer)
    symbol_triples.append(run_start_array_index)
    in_run = False
if in_run:
  raise Error()

# Now add manually the two overlapping slices of
# index from the NEC/IBM extensions.
run_start_array_index = len(symbol_index)
symbol_index.extend(index[10736:10744])
# Later
symbol_triples.append(10736)
symbol_triples.append(8)
symbol_triples.append(run_start_array_index)
# Earlier
symbol_triples.append(8644)
symbol_triples.append(4)
symbol_triples.append(run_start_array_index)

static_u16_table("JIS0208_SYMBOLS", symbol_index)
static_u16_table("JIS0208_SYMBOL_TRIPLES", symbol_triples)

# Write down the magic numbers needed when preferring the earlier case
data_file.write('''const IBM_SYMBOL_START: usize = %d;''' % (run_start_array_index + 1))
data_file.write('''const IBM_SYMBOL_END: usize = %d;''' % (run_start_array_index + 4))
data_file.write('''const IBM_SYMBOL_POINTER_START: usize = %d;''' % 8645)
671
# JIS 0208 ranges (excluding kana)
# Runs of CONSECUTIVE code points are recorded as
# (start pointer, length, start code point) triples.
range_triples = []
pointers_to_scan = [
  (188, 281),
  (470, 657),
  (1128, 1159),
  (8634, 8644),
  (10716, 10736),
]
in_run = False
run_start_pointer = 0
run_start_code_point = 0
previous_code_point = 0
for (start, end) in pointers_to_scan:
  for i in range(start, end):
    code_point = index[i]
    if in_run:
      if code_point:
        # A gap in consecutiveness closes the run and starts a new one.
        if previous_code_point + 1 != code_point:
          range_triples.append(run_start_pointer)
          range_triples.append(i - run_start_pointer)
          range_triples.append(run_start_code_point)
          run_start_pointer = i
          run_start_code_point = code_point
        previous_code_point = code_point
      else:
          range_triples.append(run_start_pointer)
          range_triples.append(i - run_start_pointer)
          range_triples.append(run_start_code_point)
          run_start_pointer = 0
          run_start_code_point = 0
          previous_code_point = 0
          in_run = False
    else:
      if code_point:
        in_run = True
        run_start_pointer = i
        run_start_code_point = code_point
        previous_code_point = code_point
  if in_run:
    range_triples.append(run_start_pointer)
    range_triples.append(end - run_start_pointer)
    range_triples.append(run_start_code_point)
    run_start_pointer = 0
    run_start_code_point = 0
    previous_code_point = 0
    in_run = False
# NOTE(review): Error is undefined; this would raise NameError (still aborts).
if in_run:
  raise Error()

static_u16_table("JIS0208_RANGE_TRIPLES", range_triples)

# Encoder table for Level 1 Kanji
# Note: If we were OK with 30 KB more footprint, we
# could use a directly-indexable table instead...
level1_kanji_index = index[1410:4375]
level1_kanji_pairs = []
for i in xrange(len(level1_kanji_index)):
  pointer = 1410 + i
  (lead, trail) = divmod(pointer, 188)
  lead += 0x81 if lead < 0x1F else 0xC1
  trail += 0x40 if trail < 0x3F else 0x41
  level1_kanji_pairs.append((level1_kanji_index[i], (lead, trail)))
level1_kanji_pairs.sort(key=lambda x: x[0])

static_u16_table_from_indexable("JIS0208_LEVEL1_KANJI_CODE_POINTS", level1_kanji_pairs, 0, "kanji-encode")
static_u8_pair_table_from_indexable("JIS0208_LEVEL1_KANJI_SHIFT_JIS_BYTES", level1_kanji_pairs, 1, "kanji-encode")

# Fast encoder table for Kanji
kanji_bytes = [None] * (0x9FA1 - 0x4E00)
for pointer in xrange(len(index)):
  code_point = index[pointer]
  if code_point and code_point >= 0x4E00 and code_point <= 0x9FA0:
    (lead, trail) = divmod(pointer, 188)
    lead += 0x81 if lead < 0x1F else 0xC1
    trail += 0x40 if trail < 0x3F else 0x41
    # unset the high bit of lead if IBM Kanji
    if pointer >= 8272:
      lead = lead & 0x7F
    kanji_bytes[code_point - 0x4E00] = (lead, trail)

static_u8_pair_table("JIS0208_KANJI_BYTES", kanji_bytes, "fast-kanji-encode")
754
# ISO-2022-JP half-width katakana

# index is still jis0208
half_width_index = indexes["iso-2022-jp-katakana"]

data_file.write('''pub static ISO_2022_JP_HALF_WIDTH_TRAIL: [u8; %d] = [
''' % len(half_width_index))

for i in xrange(len(half_width_index)):
  code_point = half_width_index[i]
  # Find the code point's first pointer in jis0208 and derive the
  # 0x21-based trail byte from it.
  pointer = index.index(code_point)
  trail = pointer % 94 + 0x21
  data_file.write('0x%02X,\n' % trail)

data_file.write('''];

''')
772
# EUC-KR

index = indexes["euc-kr"]

# Unicode 1.1 Hangul above the old KS X 1001 block
# Compressed form takes 35% of uncompressed form
# Record (pointer, code point) pairs only where consecutiveness breaks.
pointers = []
offsets = []
previous_code_point = 0
for row in xrange(0x20):
  for column in xrange(190):
    i = column + (row * 190)
    # Skip the gaps
    if (column >= 0x1A and column < 0x20) or (column >= 0x3A and column < 0x40):
      continue
    code_point = index[i]
    # NOTE(review): Error is undefined; this raises NameError (still aborts).
    if previous_code_point > code_point:
      raise Error()
    if code_point - previous_code_point != 1:
      # Adjust the column for the skipped gaps.
      adjustment = 0
      if column >= 0x40:
        adjustment = 12
      elif column >= 0x20:
        adjustment = 6
      pointers.append(column - adjustment + (row * (190 - 12)))
      offsets.append(code_point)
    previous_code_point = code_point

static_u16_table("CP949_TOP_HANGUL_POINTERS", pointers)
static_u16_table("CP949_TOP_HANGUL_OFFSETS", offsets)

# Unicode 1.1 Hangul to the left of the old KS X 1001 block
pointers = []
offsets = []
previous_code_point = 0
for row in xrange(0x46 - 0x20):
  for column in xrange(190 - 94):
    i = 6080 + column + (row * 190)
    # Skip the gaps
    if (column >= 0x1A and column < 0x20) or (column >= 0x3A and column < 0x40):
      continue
    if i > 13127:
      # Exclude unassigned on partial last row
      break
    code_point = index[i]
    if previous_code_point > code_point:
      raise Error()
    if code_point - previous_code_point != 1:
      adjustment = 0
      if column >= 0x40:
        adjustment = 12
      elif column >= 0x20:
        adjustment = 6
      pointers.append(column - adjustment + (row * (190 - 94 - 12)))
      offsets.append(code_point)
    previous_code_point = code_point

static_u16_table("CP949_LEFT_HANGUL_POINTERS", pointers)
static_u16_table("CP949_LEFT_HANGUL_OFFSETS", offsets)

# KS X 1001 Hangul
# Verified to be strictly increasing, so stored as a flat table.
hangul_index = []
previous_code_point = 0
for row in xrange(0x48 - 0x2F):
  for column in xrange(94):
    code_point = index[9026 + column + (row * 190)]
    if previous_code_point >= code_point:
      raise Error()
    hangul_index.append(code_point)
    previous_code_point = code_point

static_u16_table("KSX1001_HANGUL", hangul_index)

# KS X 1001 Hanja
hanja_index = []
for row in xrange(0x7D - 0x49):
  for column in xrange(94):
    hanja_index.append(index[13966 + column + (row * 190)])

static_u16_table("KSX1001_HANJA", hanja_index)
853
# KS X 1001 symbols
symbol_index = []
for i in range(6176, 6270):
  symbol_index.append(index[i])
for i in range(6366, 6437):
  symbol_index.append(index[i])

static_u16_table("KSX1001_SYMBOLS", symbol_index)

# KS X 1001 Uppercase Latin
subindex = []
for i in range(7506, 7521):
  subindex.append(null_to_zero(index[i]))

static_u16_table("KSX1001_UPPERCASE", subindex)

# KS X 1001 Lowercase Latin
subindex = []
for i in range(7696, 7712):
  subindex.append(index[i])

static_u16_table("KSX1001_LOWERCASE", subindex)

# KS X 1001 Box drawing
subindex = []
for i in range(7126, 7194):
  subindex.append(index[i])

static_u16_table("KSX1001_BOX", subindex)

# KS X 1001 other
# Compressed like the Hangul tables above: record a (pointer, offset)
# pair wherever consecutiveness breaks.
pointers = []
offsets = []
previous_code_point = 0
for row in xrange(10):
  for column in xrange(94):
    i = 6556 + column + (row * 190)
    code_point = index[i]
    # Exclude ranges that were processed as lookup tables
    # or that contain unmapped cells by filling them with
    # ASCII. Upon encode, ASCII code points will
    # never appear as the search key.
    if (i >= 6946 and i <= 6950):
      code_point = i - 6946
    elif (i >= 6961 and i <= 6967):
      code_point = i - 6961
    elif (i >= 6992 and i <= 6999):
      code_point = i - 6992
    elif (i >= 7024 and i <= 7029):
      code_point = i - 7024
    elif (i >= 7126 and i <= 7219):
      code_point = i - 7126
    elif (i >= 7395 and i <= 7409):
      code_point = i - 7395
    elif (i >= 7506 and i <= 7521):
      code_point = i - 7506
    elif (i >= 7696 and i <= 7711):
      code_point = i - 7696
    elif (i >= 7969 and i <= 7979):
      code_point = i - 7969
    elif (i >= 8162 and i <= 8169):
      code_point = i - 8162
    elif (i >= 8299 and i <= 8313):
      code_point = i - 8299
    elif (i >= 8347 and i <= 8359):
      code_point = i - 8347
    if code_point - previous_code_point != 1:
      pointers.append(column + (row * 94))
      offsets.append(code_point)
    previous_code_point = code_point

static_u16_table("KSX1001_OTHER_POINTERS", pointers)
# Omit the last offset, because the end of the last line
# is unmapped, so we don't want to look at it.
static_u16_table("KSX1001_OTHER_UNSORTED_OFFSETS", offsets[:-1])

# Fast Hangul and Hanja encode
# Directly-indexable code-point-to-bytes tables (None = unmapped).
hangul_bytes = [None] * (0xD7A4 - 0xAC00)
hanja_unified_bytes = [None] * (0x9F9D - 0x4E00)
hanja_compatibility_bytes = [None] * (0xFA0C - 0xF900)
for row in xrange(0x7D):
  for column in xrange(190):
    pointer = column + (row * 190)
    code_point = index[pointer]
    if code_point:
      lead = 0x81 + row
      trail = 0x41 + column
      if code_point >= 0xAC00 and code_point < 0xD7A4:
        hangul_bytes[code_point - 0xAC00] = (lead, trail)
      elif code_point >= 0x4E00 and code_point < 0x9F9D:
        hanja_unified_bytes[code_point - 0x4E00] = (lead, trail)
      elif code_point >= 0xF900 and code_point < 0xFA0C:
        hanja_compatibility_bytes[code_point - 0xF900] = (lead, trail)

static_u8_pair_table("CP949_HANGUL_BYTES", hangul_bytes, "fast-hangul-encode")
static_u8_pair_table("KSX1001_UNIFIED_HANJA_BYTES", hanja_unified_bytes, "fast-hanja-encode")
static_u8_pair_table("KSX1001_COMPATIBILITY_HANJA_BYTES", hanja_compatibility_bytes, "fast-hanja-encode")
951
# JIS 0212

index = indexes["jis0212"]

# JIS 0212 Kanji
static_u16_table("JIS0212_KANJI", index[1410:7211])

# JIS 0212 accented (all non-Kanji, non-range items)
# Same run-scanning scheme as the JIS 0208 symbols above, except a
# single-cell hole inside a run is kept (as 0) rather than ending the run.
symbol_index = []
symbol_triples = []
pointers_to_scan = [
  (0, 596),
  (608, 644),
  (656, 1409),
]
in_run = False
run_start_pointer = 0
run_start_array_index = 0
for (start, end) in pointers_to_scan:
  for i in range(start, end):
    code_point = index[i]
    if in_run:
      if code_point:
        symbol_index.append(code_point)
      elif index[i + 1]:
        symbol_index.append(0)
      else:
        symbol_triples.append(run_start_pointer)
        symbol_triples.append(i - run_start_pointer)
        symbol_triples.append(run_start_array_index)
        in_run = False
    else:
      if code_point:
        in_run = True
        run_start_pointer = i
        run_start_array_index = len(symbol_index)
        symbol_index.append(code_point)
  if in_run:
    symbol_triples.append(run_start_pointer)
    symbol_triples.append(end - run_start_pointer)
    symbol_triples.append(run_start_array_index)
    in_run = False
# NOTE(review): Error is undefined; this raises NameError (still aborts).
if in_run:
  raise Error()

static_u16_table("JIS0212_ACCENTED", symbol_index)
static_u16_table("JIS0212_ACCENTED_TRIPLES", symbol_triples)
999
# gb18030

index = indexes["gb18030"]

# Unicode 1.1 ideographs above the old GB2312 block
# Compressed form takes 63% of uncompressed form
# Delta compression: record a (pointer, code point) pair only where the
# code point is not exactly one greater than its predecessor.
pointers = []
offsets = []
previous_code_point = 0
for (pointer, code_point) in enumerate(index[:6080]):
  if previous_code_point > code_point:
    raise Error()  # must be monotonically non-decreasing in this range
  if code_point - previous_code_point != 1:
    pointers.append(pointer)
    offsets.append(code_point)
  previous_code_point = code_point

static_u16_table("GBK_TOP_IDEOGRAPH_POINTERS", pointers)
static_u16_table("GBK_TOP_IDEOGRAPH_OFFSETS", offsets)
1020
# Unicode 1.1 ideographs to the left of the old GB2312 block
# Compressed form takes 40% of uncompressed form
# Same delta compression as above, but over the trail bytes left of the
# GB2312 94-cell area. The compressed pointer space is contiguous
# (column + row * 96) even though the underlying index pointers are not.
pointers = []
offsets = []
previous_code_point = 0
for row in xrange(0x7D - 0x29):
  for column in xrange(190 - 94):
    i = 7790 + column + (row * 190)
    if i > 23650:
      # Exclude compatibility ideographs at the end
      break
    code_point = index[i]
    if previous_code_point > code_point:
      raise Error()  # must be monotonically non-decreasing in this range
    if code_point - previous_code_point != 1:
      pointers.append(column + (row * (190 - 94)))
      offsets.append(code_point)
    previous_code_point = code_point

static_u16_table("GBK_LEFT_IDEOGRAPH_POINTERS", pointers)
static_u16_table("GBK_LEFT_IDEOGRAPH_OFFSETS", offsets)
1042
# GBK other (excl. Ext A, Compat & PUA at the bottom)
# Delta compression over the 9 rows of 96 non-GB2312 trail cells; the
# compressed pointer is column + row * 96.
pointers = []
offsets = []
previous_code_point = 0
for compressed_pointer in xrange((0x29 - 0x20) * (190 - 94)):
  (row, column) = divmod(compressed_pointer, 190 - 94)
  code_point = index[6080 + column + (row * 190)]
  if code_point - previous_code_point != 1:
    pointers.append(compressed_pointer)
    offsets.append(code_point)
  previous_code_point = code_point

# Sentinel at the end of the compressed pointer space.
pointers.append((190 - 94) * (0x29 - 0x20))
static_u16_table("GBK_OTHER_POINTERS", pointers)
static_u16_table("GBK_OTHER_UNSORTED_OFFSETS", offsets)
1059
# GBK bottom: Compatibility ideographs, Ext A and PUA
# Two contiguous pointer ranges, so plain slices reproduce the
# element-by-element loops: the 5 compatibility ideographs following
# the Unified Ideographs, then the last row.
bottom_index = index[23651:23656] + index[23750:23846]

static_u16_table("GBK_BOTTOM", bottom_index)
1070
# GB2312 Hanzi
# (and the 5 PUA code points in between Level 1 and Level 2)
# Flatten the GB2312 Hanzi area row by row, 94 cells per row.
hanzi_index = [index[9026 + column + (row * 190)]
               for row in xrange(0x77 - 0x2F)
               for column in xrange(94)]

static_u16_table("GB2312_HANZI", hanzi_index)
1079
# GB2312 symbols
# One contiguous pointer range: a slice reproduces the per-pointer loop.
symbol_index = index[6176:6176 + 94]

static_u16_table("GB2312_SYMBOLS", symbol_index)
1086
# GB2312 symbols on Greek row (incl. PUA)
# Contiguous pointer range starting at 7189.
symbol_index = index[7189:7189 + 22]

static_u16_table("GB2312_SYMBOLS_AFTER_GREEK", symbol_index)
1093
# GB2312 Pinyin
# Contiguous pointer range starting at 7506.
pinyin_index = index[7506:7506 + 32]

static_u16_table("GB2312_PINYIN", pinyin_index)
1100
# GB2312 other (excl. bottom PUA)
# Delta compression over 14 rows of 94 cells; the compressed pointer is
# column + row * 94.
pointers = []
offsets = []
previous_code_point = 0
for compressed_pointer in xrange(14 * 94):
  (row, column) = divmod(compressed_pointer, 94)
  i = 6366 + column + (row * 190)
  code_point = index[i]
  # Exclude the two ranges that were processed as lookup tables above by
  # filling them with ASCII. Upon encode, ASCII code points will never
  # appear as the search key.
  if (i >= 7189 and i < 7189 + 22):
    code_point = i - 7189
  elif (i >= 7506 and i < 7506 + 32):
    code_point = i - 7506
  if code_point - previous_code_point != 1:
    pointers.append(compressed_pointer)
    offsets.append(code_point)
  previous_code_point = code_point

# Sentinel at the end of the compressed pointer space.
pointers.append(14 * 94)
static_u16_table("GB2312_OTHER_POINTERS", pointers)
static_u16_table("GB2312_OTHER_UNSORTED_OFFSETS", offsets)
1125
# Non-gbk code points
pointers = []
offsets = []
for (range_pointer, range_offset) in indexes["gb18030-ranges"]:
  if range_offset == 0x10000:
    break # the last entry doesn't fit in u16
  pointers.append(range_pointer)
  offsets.append(range_offset)

static_u16_table("GB18030_RANGE_POINTERS", pointers)
static_u16_table("GB18030_RANGE_OFFSETS", offsets)
1137
# Encoder table for Level 1 Hanzi
# The units here really fit into 12 bits, but since we're
# looking for speed here, let's use 16 bits per unit.
# Once we use 16 bits per unit, we might as well precompute
# the output bytes.
level1_hanzi_index = hanzi_index[:(94 * (0xD8 - 0xB0) - 5)]
level1_hanzi_pairs = []
for (i, code_point) in enumerate(level1_hanzi_index):
  (row, column) = divmod(i, 94)
  level1_hanzi_pairs.append((code_point, (0xB0 + row, 0xA1 + column)))
# Sort by code point so the encoder can binary-search.
level1_hanzi_pairs.sort(key=lambda pair: pair[0])

static_u16_table_from_indexable("GB2312_LEVEL1_HANZI_CODE_POINTS", level1_hanzi_pairs, 0, "gb-hanzi-encode")
static_u8_pair_table_from_indexable("GB2312_LEVEL1_HANZI_BYTES", level1_hanzi_pairs, 1, "gb-hanzi-encode")
1153
# Fast Hanzi encoder table
# Map each code point in U+4E00..U+9FA6 directly to its GBK byte pair;
# slots whose code point never occurs in the index stay None.
hanzi_bytes = [None] * (0x9FA7 - 0x4E00)
for pointer in xrange(126 * 190):
  code_point = index[pointer]
  if code_point and 0x4E00 <= code_point <= 0x9FA6:
    (row, column) = divmod(pointer, 190)
    hanzi_lead = 0x81 + row
    hanzi_trail = column + (0x40 if column < 0x3F else 0x41)
    hanzi_bytes[code_point - 0x4E00] = (hanzi_lead, hanzi_trail)

static_u8_pair_table("GBK_HANZI_BYTES", hanzi_bytes, "fast-gb-hanzi-encode")
1166
# Close out src/data.rs with its non-generated tail.
data_file.write(data_rs_end)

data_file.close()

# Variant

# Generate src/variant.rs: the enums that statically dispatch over the
# concrete decoder and encoder implementations.
variant_file = open("src/variant.rs", "w")
variant_file.write('''// Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
// Instead, please regenerate using generate-encoding-data.py

//! This module provides enums that wrap the various decoders and encoders.
//! The purpose is to make `Decoder` and `Encoder` `Sized` by writing the
//! dispatch explicitly for a finite set of specialized decoders and encoders.
//! Unfortunately, this means the compiler doesn't generate the dispatch code
//! and it has to be written here instead.
//!
//! The purpose of making `Decoder` and `Encoder` `Sized` is to allow stack
//! allocation in Rust code, including the convenience methods on `Encoding`.

''')
1196
# Build the variant name lists:
# - all single-byte encodings share one "single-byte" variant;
# - UTF-16LE/BE collapse into a single "UTF-16" variant;
# - GBK has no variant of its own (it shares gb18030's machinery);
# - replacement and UTF-16 have decoders but no encoders.
encoding_variants = [u"single-byte",]
encoding_variants.extend(
    encoding["name"] for encoding in multi_byte
    if encoding["name"] not in [u"UTF-16LE", u"UTF-16BE"])
encoding_variants.append(u"UTF-16")

decoder_variants = [variant for variant in encoding_variants
                    if variant != u"GBK"]

encoder_variants = [variant for variant in encoding_variants
                    if variant not in [u"replacement", u"GBK", u"UTF-16"]]
1216
# Bring each decoder module's types into scope in variant.rs.
for variant in decoder_variants:
  variant_file.write("use %s::*;\n" % to_snake_name(variant))

variant_file.write('''use super::*;

pub enum VariantDecoder {
''')

# One enum case per decodable variant, wrapping the concrete decoder type.
for variant in decoder_variants:
  variant_file.write("   %s(%sDecoder),\n" % (to_camel_name(variant), to_camel_name(variant)))

variant_file.write('''}

impl VariantDecoder {
''')
1232
def write_variant_method(name, mut, arg_list, ret, variants, excludes, kind):
  """Emit a Rust method that forwards a call through `match *self` to the
  concrete implementation wrapped by each enum case.

  name      -- Rust method name to generate (and to invoke on the wrapped value)
  mut       -- True to take `&mut self` and bind `ref mut` in the match arms
  arg_list  -- (argument name, Rust type) pairs following self
  ret       -- Rust return type, or a falsy value for no return annotation
  variants  -- variant names; one match arm is emitted per entry
  excludes  -- variants whose arm becomes a no-op `()` instead of forwarding
  kind      -- "Decoder" or "Encoder"; selects which Variant* enum is matched
  """
  pieces = ["pub fn %s(&" % name]
  if mut:
    pieces.append("mut ")
  pieces.append("self")
  for (arg_name, arg_type) in arg_list:
    pieces.append(", %s: %s" % (arg_name, arg_type))
  pieces.append(")")
  if ret:
    pieces.append(" -> %s" % ret)
  pieces.append(" {\nmatch *self {\n")
  for variant in variants:
    pieces.append("Variant%s::%s(ref " % (kind, to_camel_name(variant)))
    if mut:
      pieces.append("mut ")
    if variant in excludes:
      pieces.append("v) => (),")
    else:
      pieces.append("v) => v.%s(" % name)
      pieces.append(", ".join(arg_name for (arg_name, _) in arg_list))
      pieces.append("),\n")
  pieces.append("}\n}\n\n")
  variant_file.write("".join(pieces))
1260
# Worst-case buffer-length estimators: immutable self, Option<usize>
# because the arithmetic can overflow.
write_variant_method("max_utf16_buffer_length", False, [("byte_length", "usize")], "Option<usize>", decoder_variants, [], "Decoder")

write_variant_method("max_utf8_buffer_length_without_replacement", False, [("byte_length", "usize")], "Option<usize>", decoder_variants, [], "Decoder")

write_variant_method("max_utf8_buffer_length", False, [("byte_length", "usize")], "Option<usize>", decoder_variants, [], "Decoder")

# The actual decode entry points mutate decoder state (&mut self).
write_variant_method("decode_to_utf16_raw", True, [("src", "&[u8]"),
                           ("dst", "&mut [u16]"),
                           ("last", "bool")], "(DecoderResult, usize, usize)", decoder_variants, [], "Decoder")

write_variant_method("decode_to_utf8_raw", True, [("src", "&[u8]"),
                           ("dst", "&mut [u8]"),
                           ("last", "bool")], "(DecoderResult, usize, usize)", decoder_variants, [], "Decoder")

# latin1_byte_compatible_up_to is written out literally rather than
# generated, because each decoder family needs a different
# neutral-state check.
variant_file.write('''

    pub fn latin1_byte_compatible_up_to(&self, buffer: &[u8]) -> Option<usize> {
        match *self {
            VariantDecoder::SingleByte(ref v) => {
                return Some(v.latin1_byte_compatible_up_to(buffer));
            }
            VariantDecoder::Utf8(ref v) => {
                if !v.in_neutral_state() {
                    return None;
                }
            }
            VariantDecoder::Gb18030(ref v) => {
                if !v.in_neutral_state() {
                    return None;
                }
            }
            VariantDecoder::Big5(ref v) => {
                if !v.in_neutral_state() {
                    return None;
                }
            }
            VariantDecoder::EucJp(ref v) => {
                if !v.in_neutral_state() {
                    return None;
                }
            }
            VariantDecoder::Iso2022Jp(ref v) => {
                if v.in_neutral_state() {
                    return Some(Encoding::iso_2022_jp_ascii_valid_up_to(buffer));
                }
                return None;
            }
            VariantDecoder::ShiftJis(ref v) => {
                if !v.in_neutral_state() {
                    return None;
                }
            }
            VariantDecoder::EucKr(ref v) => {
                if !v.in_neutral_state() {
                    return None;
                }
            }
            VariantDecoder::UserDefined(_) => {}
            VariantDecoder::Replacement(_) | VariantDecoder::Utf16(_) => {
                return None;
            }
        };
        Some(Encoding::ascii_valid_up_to(buffer))
    }
}

pub enum VariantEncoder {
''')
1329
# One enum case per encodable variant, wrapping the concrete encoder type.
for variant in encoder_variants:
  variant_file.write("   %s(%sEncoder),\n" % (to_camel_name(variant), to_camel_name(variant)))

variant_file.write('''}

impl VariantEncoder {
    pub fn has_pending_state(&self) -> bool {
        match *self {
            VariantEncoder::Iso2022Jp(ref v) => {
                v.has_pending_state()
            }
            _ => false,
        }
    }
''')

# Worst-case buffer-length estimators (immutable) and the mutating
# encode entry points, generated the same way as for the decoder.
write_variant_method("max_buffer_length_from_utf16_without_replacement", False, [("u16_length", "usize")], "Option<usize>", encoder_variants, [], "Encoder")

write_variant_method("max_buffer_length_from_utf8_without_replacement", False, [("byte_length", "usize")], "Option<usize>", encoder_variants, [], "Encoder")

write_variant_method("encode_from_utf16_raw", True, [("src", "&[u16]"),
                           ("dst", "&mut [u8]"),
                           ("last", "bool")], "(EncoderResult, usize, usize)", encoder_variants, [], "Encoder")

write_variant_method("encode_from_utf8_raw", True, [("src", "&str"),
                           ("dst", "&mut [u8]"),
                           ("last", "bool")], "(EncoderResult, usize, usize)", encoder_variants, [], "Encoder")
1357
1358
variant_file.write('''}

pub enum VariantEncoding {
    SingleByte(&'static [u16; 128], u16, u8, u8),''')

# The single-byte case carries its data table and encoder-run parameters;
# each multi-byte encoding gets a parameterless case.
for encoding in multi_byte:
  variant_file.write("%s,\n" % to_camel_name(encoding["name"]))

# The constructors below are written out literally: GBK shares gb18030's
# decoder, and Replacement/UTF-16 have no encoder at all.
variant_file.write('''}

impl VariantEncoding {
    pub fn new_variant_decoder(&self) -> VariantDecoder {
        match *self {
            VariantEncoding::SingleByte(table, _, _, _) => SingleByteDecoder::new(table),
            VariantEncoding::Utf8 => Utf8Decoder::new(),
            VariantEncoding::Gbk | VariantEncoding::Gb18030 => Gb18030Decoder::new(),
            VariantEncoding::Big5 => Big5Decoder::new(),
            VariantEncoding::EucJp => EucJpDecoder::new(),
            VariantEncoding::Iso2022Jp => Iso2022JpDecoder::new(),
            VariantEncoding::ShiftJis => ShiftJisDecoder::new(),
            VariantEncoding::EucKr => EucKrDecoder::new(),
            VariantEncoding::Replacement => ReplacementDecoder::new(),
            VariantEncoding::UserDefined => UserDefinedDecoder::new(),
            VariantEncoding::Utf16Be => Utf16Decoder::new(true),
            VariantEncoding::Utf16Le => Utf16Decoder::new(false),
        }
    }

    pub fn new_encoder(&self, encoding: &'static Encoding) -> Encoder {
        match *self {
            VariantEncoding::SingleByte(table, run_bmp_offset, run_byte_offset, run_length) => SingleByteEncoder::new(encoding, table, run_bmp_offset, run_byte_offset, run_length),
            VariantEncoding::Utf8 => Utf8Encoder::new(encoding),
            VariantEncoding::Gbk => Gb18030Encoder::new(encoding, false),
            VariantEncoding::Gb18030 => Gb18030Encoder::new(encoding, true),
            VariantEncoding::Big5 => Big5Encoder::new(encoding),
            VariantEncoding::EucJp => EucJpEncoder::new(encoding),
            VariantEncoding::Iso2022Jp => Iso2022JpEncoder::new(encoding),
            VariantEncoding::ShiftJis => ShiftJisEncoder::new(encoding),
            VariantEncoding::EucKr => EucKrEncoder::new(encoding),
            VariantEncoding::UserDefined => UserDefinedEncoder::new(encoding),
            VariantEncoding::Utf16Be | VariantEncoding::Replacement |
            VariantEncoding::Utf16Le => unreachable!(),
        }
    }

    pub fn is_single_byte(&self) -> bool {
        match *self {
            VariantEncoding::SingleByte(_, _, _, _) | VariantEncoding::UserDefined => true,
            _ => false,
        }
    }
}
''')

variant_file.close()
1414
# Regenerate the generated section of the encoding_c FFI crate,
# preserving the hand-written parts around it.
(ffi_rs_begin, ffi_rs_end) = read_non_generated("../encoding_c/src/lib.rs")

ffi_file = open("../encoding_c/src/lib.rs", "w")

ffi_file.write(ffi_rs_begin)
ffi_file.write("""
// Instead, please regenerate using generate-encoding-data.py

/// The minimum length of buffers that may be passed to `encoding_name()`.
pub const ENCODING_NAME_MAX_LENGTH: usize = %d; // %s

""" % (longest_name_length, longest_name))

# One #[no_mangle] ConstEncoding static per preferred encoding name.
for name in preferred:
  ffi_file.write('''/// The %s encoding.
#[no_mangle]
pub static %s_ENCODING: ConstEncoding = ConstEncoding(&%s_INIT);

''' % (to_dom_name(name), to_constant_name(name), to_constant_name(name)))

ffi_file.write(ffi_rs_end)
ffi_file.close()
1437
# Regenerate the generated test section of src/single_byte.rs,
# preserving the hand-written parts around it.
(single_byte_rs_begin, single_byte_rs_end) = read_non_generated("src/single_byte.rs")

single_byte_file = open("src/single_byte.rs", "w")

single_byte_file.write(single_byte_rs_begin)
single_byte_file.write("""
// Instead, please regenerate using generate-encoding-data.py
""")

def _write_single_byte_test(test_name, helper_name):
  # Emit one #[test] fn invoking helper_name over every single-byte
  # encoding. ISO-8859-8-I is skipped (no data table of its own).
  single_byte_file.write("""
    #[test]
    fn %s() {""" % test_name)
  for name in preferred:
    if name == u"ISO-8859-8-I":
      continue
    if is_single_byte(name):
      single_byte_file.write("""
        %s(%s, &data::SINGLE_BYTE_DATA.%s);""" % (helper_name, to_constant_name(name), to_snake_name(name)))
  single_byte_file.write("""
    }
""")

_write_single_byte_test("test_single_byte_decode", "decode_single_byte")
_write_single_byte_test("test_single_byte_encode", "encode_single_byte")

single_byte_file.write(single_byte_rs_end)
single_byte_file.close()
1476
# Generate the C header declaring the encoding statics for encoding_c.
static_file = open("../encoding_c/include/encoding_rs_statics.h", "w")

static_file.write("""// Copyright 2016 Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
// Instead, please regenerate using generate-encoding-data.py

// This file is not meant to be included directly. Instead, encoding_rs.h
// includes this file.

#ifndef encoding_rs_statics_h_
#define encoding_rs_statics_h_

#ifndef ENCODING_RS_ENCODING
#define ENCODING_RS_ENCODING Encoding
#ifndef __cplusplus
typedef struct Encoding_ Encoding;
#endif
#endif

#ifndef ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR
#define ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const ENCODING_RS_ENCODING*
#endif

#ifndef ENCODING_RS_ENCODER
#define ENCODING_RS_ENCODER Encoder
#ifndef __cplusplus
typedef struct Encoder_ Encoder;
#endif
#endif

#ifndef ENCODING_RS_DECODER
#define ENCODING_RS_DECODER Decoder
#ifndef __cplusplus
typedef struct Decoder_ Decoder;
#endif
#endif

#define INPUT_EMPTY 0

#define OUTPUT_FULL 0xFFFFFFFF

// %s
#define ENCODING_NAME_MAX_LENGTH %d

""" % (longest_name, longest_name_length))

# One extern declaration per preferred encoding, mirroring the Rust
# statics emitted into encoding_c above.
for name in preferred:
  static_file.write('''/// The %s encoding.
extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const %s_ENCODING;

''' % (to_dom_name(name), to_constant_name(name)))

static_file.write("""#endif // encoding_rs_statics_h_
""")
static_file.close()
1540
# Regenerate the UTF8_DATA validation table in src/utf_8.rs,
# preserving the hand-written parts around it.
(utf_8_rs_begin, utf_8_rs_end) = read_non_generated("src/utf_8.rs")

utf_8_file = open("src/utf_8.rs", "w")

utf_8_file.write(utf_8_rs_begin)
utf_8_file.write("""
// Instead, please regenerate using generate-encoding-data.py

pub static UTF8_DATA: Utf8Data = Utf8Data {
    table: [
""")

# First 256 entries: for every possible second byte, a bitmask with a bit
# SET for each trail class the byte is NOT valid in (bit 2 is always set;
# it marks invalid leads in the second half of the table).
trail_flags = []
for byte in range(256):
  flags = (1 << 2) # invalid lead
  if not (0x80 <= byte <= 0xBF):
    flags |= (1 << 3) # normal trail
  if not (0xA0 <= byte <= 0xBF):
    flags |= (1 << 4) # three-byte special lower bound
  if not (0x80 <= byte <= 0x9F):
    flags |= (1 << 5) # three-byte special upper bound
  if not (0x90 <= byte <= 0xBF):
    flags |= (1 << 6) # four-byte special lower bound
  if not (0x80 <= byte <= 0x8F):
    flags |= (1 << 7) # four-byte special upper bound
  trail_flags.append(flags)
utf_8_file.write("".join("%d," % flags for flags in trail_flags))

# Next 128 entries: for every lead byte 0x80..0xFF, the single lane bit
# selecting which trail class the lead requires.
lead_lanes = []
for byte in range(128, 256):
  if (0xC2 <= byte <= 0xDF) or (0xE1 <= byte <= 0xEC) \
      or (0xEE <= byte <= 0xEF) or (0xF1 <= byte <= 0xF3):
    lane = (1 << 3) # normal trail
  elif byte == 0xE0:
    lane = (1 << 4) # three-byte special lower bound
  elif byte == 0xED:
    lane = (1 << 5) # three-byte special upper bound
  elif byte == 0xF0:
    lane = (1 << 6) # four-byte special lower bound
  elif byte == 0xF4:
    lane = (1 << 7) # four-byte special upper bound
  else:
    lane = (1 << 2) # invalid lead
  lead_lanes.append(lane)
utf_8_file.write("".join("%d," % lane for lane in lead_lanes))

utf_8_file.write("""
    ],
};

""")

utf_8_file.write(utf_8_rs_end)
utf_8_file.close()
1595
# Unit tests

# Header prepended to every generated test-data file; the test harness
# skips past it when reading the files back.
TEST_HEADER = '''Any copyright to the test code below this comment is dedicated to the
Public Domain. http://creativecommons.org/publicdomain/zero/1.0/

This is a generated file. Please do not edit.
Instead, please regenerate using generate-encoding-data.py
'''
1604
index = indexes["jis0208"]

# Decoder test input: the EUC-JP two-byte sequence for each of the
# 94x94 JIS X 0208 pointers, one per line, in pointer order.
jis0208_in_file = open("src/test_data/jis0208_in.txt", "w")
jis0208_in_file.write(TEST_HEADER)
for lead_offset in range(94):
  for trail_offset in range(94):
    jis0208_in_file.write("%s%s\n" % (chr(0xA1 + lead_offset), chr(0xA1 + trail_offset)))
jis0208_in_file.close()
1615
# Expected decoder output for jis0208_in.txt: the code point for each
# pointer, or U+FFFD where the pointer is unmapped.
jis0208_in_ref_file = open("src/test_data/jis0208_in_ref.txt", "w")
jis0208_in_ref_file.write(TEST_HEADER)
for pointer in range(0, 94 * 94):
  code_point = index[pointer]
  if code_point:
    jis0208_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
  else:
    jis0208_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
jis0208_in_ref_file.close()

# Encoder test: one code point per input line, with the EUC-JP bytes the
# encoder is expected to produce on the matching reference line.
jis0208_out_file = open("src/test_data/jis0208_out.txt", "w")
jis0208_out_ref_file = open("src/test_data/jis0208_out_ref.txt", "w")
jis0208_out_file.write(TEST_HEADER)
jis0208_out_ref_file.write(TEST_HEADER)
for pointer in range(0, 94 * 94):
  code_point = index[pointer]
  if code_point:
    revised_pointer = pointer
    if revised_pointer == 8644 or (revised_pointer >= 1207 and revised_pointer < 1220):
      # Duplicate mappings: the encoder picks the first pointer for a
      # code point, which index.index() finds.
      revised_pointer = index.index(code_point)
    (lead, trail) = divmod(revised_pointer, 94)
    lead += 0xA1
    trail += 0xA1
    jis0208_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
    jis0208_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
jis0208_out_file.close()
jis0208_out_ref_file.close()
1643
# Decoder test input: the Shift_JIS two-byte sequence for every pointer
# in the index, one per line.
shift_jis_in_file = open("src/test_data/shift_jis_in.txt", "w")
shift_jis_in_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
  (lead, trail) = divmod(pointer, 188)
  lead_byte = chr(lead + (0x81 if lead < 0x1F else 0xC1))
  trail_byte = chr(trail + (0x40 if trail < 0x3F else 0x41))
  shift_jis_in_file.write(lead_byte + trail_byte + "\n")
shift_jis_in_file.close()
1652
# Expected decoder output for shift_jis_in.txt. Pointers 8836..10715
# decode to end-user-defined (PUA) code points computed arithmetically
# rather than looked up in the index.
shift_jis_in_ref_file = open("src/test_data/shift_jis_in_ref.txt", "w")
shift_jis_in_ref_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
  code_point = 0xE000 - 8836 + pointer if pointer >= 8836 and pointer <= 10715 else index[pointer]
  if code_point:
    shift_jis_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
  else:
    # Unmapped pointer: U+FFFD; a trail byte below 0x80 is left
    # unconsumed, so it appears after the replacement character.
    trail = pointer % 188
    trail += 0x40 if trail < 0x3F else 0x41
    if trail < 0x80:
      shift_jis_in_ref_file.write((u"\uFFFD%s\n" % unichr(trail)).encode("utf-8"))
    else:
      shift_jis_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
shift_jis_in_ref_file.close()

# Encoder test: pointers 8272..8835 are excluded on encode; duplicate
# mappings resolve to the first occurrence in the index unless that
# first occurrence is itself in the excluded range.
shift_jis_out_file = open("src/test_data/shift_jis_out.txt", "w")
shift_jis_out_ref_file = open("src/test_data/shift_jis_out_ref.txt", "w")
shift_jis_out_file.write(TEST_HEADER)
shift_jis_out_ref_file.write(TEST_HEADER)
for pointer in range(0, 8272):
  code_point = index[pointer]
  if code_point:
    revised_pointer = pointer
    if revised_pointer >= 1207 and revised_pointer < 1220:
      # Duplicate mappings resolve to the first pointer in the index.
      revised_pointer = index.index(code_point)
    (lead, trail) = divmod(revised_pointer, 188)
    lead += 0x81 if lead < 0x1F else 0xC1
    trail += 0x40 if trail < 0x3F else 0x41
    shift_jis_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
    shift_jis_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
for pointer in range(8836, len(index)):
  code_point = index[pointer]
  if code_point:
    revised_pointer = index.index(code_point)
    if revised_pointer >= 8272 and revised_pointer < 8836:
      # First occurrence falls in the excluded range; keep this pointer.
      revised_pointer = pointer
    (lead, trail) = divmod(revised_pointer, 188)
    lead += 0x81 if lead < 0x1F else 0xC1
    trail += 0x40 if trail < 0x3F else 0x41
    shift_jis_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
    shift_jis_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
shift_jis_out_file.close()
shift_jis_out_ref_file.close()
1696
# Decoder test input: each 94x94 pointer as an ISO-2022-JP sequence
# (escape to JIS X 0208, two bytes, escape back to ASCII).
iso_2022_jp_in_file = open("src/test_data/iso_2022_jp_in.txt", "w")
iso_2022_jp_in_file.write(TEST_HEADER)
for lead_offset in range(94):
  for trail_offset in range(94):
    iso_2022_jp_in_file.write("\x1B$B%s%s\x1B(B\n" % (chr(0x21 + lead_offset), chr(0x21 + trail_offset)))
iso_2022_jp_in_file.close()
1705
# Expected decoder output for iso_2022_jp_in.txt: code point per pointer,
# U+FFFD where unmapped.
iso_2022_jp_in_ref_file = open("src/test_data/iso_2022_jp_in_ref.txt", "w")
iso_2022_jp_in_ref_file.write(TEST_HEADER)
for pointer in range(0, 94 * 94):
  code_point = index[pointer]
  if code_point:
    iso_2022_jp_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
  else:
    iso_2022_jp_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
iso_2022_jp_in_ref_file.close()

# Encoder test: expected ISO-2022-JP escape sequences per code point;
# duplicate mappings resolve to the first pointer, as for jis0208 above.
iso_2022_jp_out_file = open("src/test_data/iso_2022_jp_out.txt", "w")
iso_2022_jp_out_ref_file = open("src/test_data/iso_2022_jp_out_ref.txt", "w")
iso_2022_jp_out_file.write(TEST_HEADER)
iso_2022_jp_out_ref_file.write(TEST_HEADER)
for pointer in range(0, 94 * 94):
  code_point = index[pointer]
  if code_point:
    revised_pointer = pointer
    if revised_pointer == 8644 or (revised_pointer >= 1207 and revised_pointer < 1220):
      revised_pointer = index.index(code_point)
    (lead, trail) = divmod(revised_pointer, 94)
    lead += 0x21
    trail += 0x21
    iso_2022_jp_out_ref_file.write("\x1B$B%s%s\x1B(B\n" % (chr(lead), chr(trail)))
    iso_2022_jp_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
# Half-width katakana input: the encoder maps each half-width form
# (U+FF61 + i) to its full-width normalization from half_width_index and
# encodes the normalized code point's pointer.
for i in xrange(len(half_width_index)):
  code_point = i + 0xFF61
  normalized_code_point = half_width_index[i]
  pointer = index.index(normalized_code_point)
  (lead, trail) = divmod(pointer, 94)
  lead += 0x21
  trail += 0x21
  iso_2022_jp_out_ref_file.write("\x1B$B%s%s\x1B(B\n" % (chr(lead), chr(trail)))
  iso_2022_jp_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
iso_2022_jp_out_file.close()
iso_2022_jp_out_ref_file.close()
1742
index = indexes["euc-kr"]

# Decoder test input: the EUC-KR two-byte sequence for every pointer in
# the index, one per line.
euc_kr_in_file = open("src/test_data/euc_kr_in.txt", "w")
euc_kr_in_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
  (lead, trail) = divmod(pointer, 190)
  euc_kr_in_file.write(chr(0x81 + lead) + chr(0x41 + trail) + "\n")
euc_kr_in_file.close()
1753
# Expected decoder output for euc_kr_in.txt: code point per pointer;
# unmapped pointers yield U+FFFD, with a trail byte below 0x80 left
# unconsumed and re-emitted after the replacement character.
euc_kr_in_ref_file = open("src/test_data/euc_kr_in_ref.txt", "w")
euc_kr_in_ref_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
  code_point = index[pointer]
  if code_point:
    euc_kr_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
  else:
    trail = pointer % 190
    trail += 0x41
    if trail < 0x80:
      euc_kr_in_ref_file.write((u"\uFFFD%s\n" % unichr(trail)).encode("utf-8"))
    else:
      euc_kr_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
euc_kr_in_ref_file.close()

# Encoder test: unlike jis0208/shift_jis, no duplicate-pointer
# redirection is applied here -- every mapped pointer round-trips as-is.
euc_kr_out_file = open("src/test_data/euc_kr_out.txt", "w")
euc_kr_out_ref_file = open("src/test_data/euc_kr_out_ref.txt", "w")
euc_kr_out_file.write(TEST_HEADER)
euc_kr_out_ref_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
  code_point = index[pointer]
  if code_point:
    (lead, trail) = divmod(pointer, 190)
    lead += 0x81
    trail += 0x41
    euc_kr_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
    euc_kr_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
euc_kr_out_file.close()
euc_kr_out_ref_file.close()
1783
index = indexes["gb18030"]

# Decoder test input: the two-byte GBK sequence for every pointer in the
# index, one per line.
gb18030_in_file = open("src/test_data/gb18030_in.txt", "w")
gb18030_in_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
  (lead, trail) = divmod(pointer, 190)
  trail_byte = chr(trail + (0x40 if trail < 0x3F else 0x41))
  gb18030_in_file.write(chr(0x81 + lead) + trail_byte + "\n")
gb18030_in_file.close()
1794
# Expected decoder output for gb18030_in.txt: code point per pointer;
# unmapped pointers yield U+FFFD, with a trail byte below 0x80 left
# unconsumed and re-emitted after the replacement character.
gb18030_in_ref_file = open("src/test_data/gb18030_in_ref.txt", "w")
gb18030_in_ref_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
  code_point = index[pointer]
  if code_point:
    gb18030_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
  else:
    trail = pointer % 190
    trail += 0x40 if trail < 0x3F else 0x41
    if trail < 0x80:
      gb18030_in_ref_file.write((u"\uFFFD%s\n" % unichr(trail)).encode("utf-8"))
    else:
      gb18030_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
gb18030_in_ref_file.close()

# Encoder test for the two-byte GBK sequences.
gb18030_out_file = open("src/test_data/gb18030_out.txt", "w")
gb18030_out_ref_file = open("src/test_data/gb18030_out_ref.txt", "w")
gb18030_out_file.write(TEST_HEADER)
gb18030_out_ref_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
  if pointer == 6555:
    # NOTE(review): pointer 6555 is excluded from the encode test;
    # presumably its code point must not encode to this pointer per the
    # Encoding Standard's gb18030 encoder -- confirm against the spec.
    continue
  code_point = index[pointer]
  if code_point:
    (lead, trail) = divmod(pointer, 190)
    lead += 0x81
    trail += 0x40 if trail < 0x3F else 0x41
    gb18030_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
    gb18030_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
gb18030_out_file.close()
gb18030_out_ref_file.close()
1826
index = indexes["big5"]

# Decoder test input: the Big5 two-byte sequence for every pointer in
# the index, one per line.
big5_in_file = open("src/test_data/big5_in.txt", "w")
big5_in_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
  (lead, trail) = divmod(pointer, 157)
  trail_byte = chr(trail + (0x40 if trail < 0x3F else 0x62))
  big5_in_file.write(chr(0x81 + lead) + trail_byte + "\n")
big5_in_file.close()
1837
# Big5 pointers that decode to a two-character sequence (a base letter
# followed by a combining mark) instead of a single code point.
big5_two_characters = {
  1133: u"\u00CA\u0304",
  1135: u"\u00CA\u030C",
  1164: u"\u00EA\u0304",
  1166: u"\u00EA\u030C",
}
1844
# Expected decoder output for big5_in.txt. A few pointers decode to a
# two-character sequence (big5_two_characters); unmapped pointers yield
# U+FFFD, with a trail byte below 0x80 left unconsumed.
big5_in_ref_file = open("src/test_data/big5_in_ref.txt", "w")
big5_in_ref_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
  if pointer in big5_two_characters.keys():
    big5_in_ref_file.write((u"%s\n" % big5_two_characters[pointer]).encode("utf-8"))
    continue
  code_point = index[pointer]
  if code_point:
    big5_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
  else:
    trail = pointer % 157
    trail += 0x40 if trail < 0x3F else 0x62
    if trail < 0x80:
      big5_in_ref_file.write((u"\uFFFD%s\n" % unichr(trail)).encode("utf-8"))
    else:
      big5_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
big5_in_ref_file.close()
1862
# Code points that occur more than once in the Big5 index and for which
# the test data should use the *last* matching pointer rather than the
# first. NOTE(review): presumably mirrors the Encoding Standard's Big5
# encoder special cases -- confirm against the spec.
prefer_last = [
  0x2550,
  0x255E,
  0x2561,
  0x256A,
  0x5341,
  0x5345,
]

pointer_for_prefer_last = []

for code_point in prefer_last:
  # Reverse linear scan, since Python lists have no .rindex().
  i = len(index) - 1
  while i >= 0:
    if index[i] == code_point:
      pointer_for_prefer_last.append(i)
      break
    i -= 1
1881
# Encoder test data for Big5: each encodable code point exactly once,
# paired with the byte sequence the encoder is expected to produce.
big5_out_file = open("src/test_data/big5_out.txt", "w")
big5_out_ref_file = open("src/test_data/big5_out_ref.txt", "w")
big5_out_file.write(TEST_HEADER)
big5_out_ref_file.write(TEST_HEADER)
# Pointers below (0xA1 - 0x81) * 157 are outside the encoder's range.
for pointer in range(((0xA1 - 0x81) * 157), len(index)):
  code_point = index[pointer]
  if not code_point:
    continue
  # Emit each code point only at its canonical pointer: the last
  # occurrence for the prefer_last set, the first occurrence otherwise.
  if code_point in prefer_last:
    canonical = pointer_for_prefer_last[prefer_last.index(code_point)]
  else:
    canonical = index.index(code_point)
  if pointer != canonical:
    continue
  lead = pointer // 157 + 0x81
  trail = pointer % 157
  trail += 0x40 if trail < 0x3F else 0x62
  big5_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
  big5_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
big5_out_file.close()
big5_out_ref_file.close()
1902
index = indexes["jis0212"]

# Decoder input: every jis0212 pointer as a three-byte sequence whose
# 0x8F lead byte selects the JIS X 0212 plane.
jis0212_in_file = open("src/test_data/jis0212_in.txt", "w")
jis0212_in_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
  row, cell = divmod(pointer, 94)
  jis0212_in_file.write("\x8F%s%s\n" % (chr(row + 0xA1), chr(cell + 0xA1)))
jis0212_in_file.close()

# Decoder expectations: the mapped code point, or U+FFFD for index holes.
jis0212_in_ref_file = open("src/test_data/jis0212_in_ref.txt", "w")
jis0212_in_ref_file.write(TEST_HEADER)
for code_point in index:
  if code_point:
    jis0212_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
  else:
    jis0212_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
jis0212_in_ref_file.close()
1923
# Regenerate the generated portion of ../codepage/src/lib.rs: two
# parallel Rust arrays mapping code page numbers to encodings, spliced
# between the hand-written begin/end sections of the existing file.
(codepage_begin, codepage_end) = read_non_generated("../codepage/src/lib.rs")

codepage_file = open("../codepage/src/lib.rs", "w")

codepage_file.write(codepage_begin)
codepage_file.write("""
// Instead, please regenerate using generate-encoding-data.py

/// Supported code page numbers in estimated order of usage frequency
static CODE_PAGES: [u16; %d] = [
""" % len(code_pages))

codepage_file.write("".join(["    %d,\n" % cp for cp in code_pages]))

codepage_file.write("""];

/// Encodings corresponding to the code page numbers in the same order
static ENCODINGS: [&'static Encoding; %d] = [
""" % len(code_pages))

codepage_file.write("".join(["    &%s_INIT,\n" % to_constant_name(encodings_by_code_page[cp]) for cp in code_pages]))

codepage_file.write("""];

""")

codepage_file.write(codepage_end)
codepage_file.close()
1955
# Regenerate the generated portion of ../codepage/src/tests.rs: Rust
# round-trip tests for the code-page <-> encoding mappings, spliced
# between the hand-written begin/end sections of the existing file.
(codepage_test_begin, codepage_test_end) = read_non_generated("../codepage/src/tests.rs")

codepage_test_file = open("../codepage/src/tests.rs", "w")

codepage_test_file.write(codepage_test_begin)
codepage_test_file.write("""
// Instead, please regenerate using generate-encoding-data.py

#[test]
fn test_to_encoding() {
    assert_eq!(to_encoding(0), None);

""")

# One assertion per known code page: the lookup must yield its encoding.
for code_page in code_pages:
  codepage_test_file.write("    assert_eq!(to_encoding(%d), Some(%s));\n" % (code_page, to_constant_name(encodings_by_code_page[code_page])))

codepage_test_file.write("""}

#[test]
fn test_from_encoding() {
""")

# Not every encoding has a code page number; those must map to None.
# (`in` replaces dict.has_key(), which is deprecated in Python 2 and
# removed in Python 3.)
for name in preferred:
  if name in code_pages_by_encoding:
    codepage_test_file.write("    assert_eq!(from_encoding(%s), Some(%d));\n" % (to_constant_name(name), code_pages_by_encoding[name]))
  else:
    codepage_test_file.write("    assert_eq!(from_encoding(%s), None);\n" % to_constant_name(name))

codepage_test_file.write("""}
""")

codepage_test_file.write(codepage_test_end)
codepage_test_file.close()
1990
# Reformat the generated Rust with rustfmt; requires cargo on PATH.
# The exit status is not checked.
subprocess.call(["cargo", "fmt"])