1#!/usr/bin/python
2
3# Copyright 2013-2016 Mozilla Foundation. See the COPYRIGHT
4# file at the top-level directory of this distribution.
5#
6# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
7# https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
8# <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
9# option. This file may not be copied, modified, or distributed
10# except according to those terms.
11
12import json
13import subprocess
14import sys
15
def cmp_from_end(one, other):
  """Three-way compare two strings: shorter sorts first, and equal
  lengths are compared character by character starting from the END
  of the strings.

  Returns a negative, zero, or positive integer like Python 2's
  built-in cmp(). Implemented without cmp() so the helper also works
  when the script is run under Python 3."""
  def _cmp(a, b):
    # Portable replacement for the Python-2-only cmp() builtin.
    return (a > b) - (a < b)
  c = _cmp(len(one), len(other))
  if c != 0:
    return c
  i = len(one) - 1
  while i >= 0:
    c = _cmp(one[i], other[i])
    if c != 0:
      return c
    i -= 1
  return 0
27
28
class Label:
  """A single encoding label together with the preferred (DOM) name
  of the encoding it resolves to."""
  def __init__(self, label, preferred):
    # The label string, e.g. "latin1".
    self.label = label
    # The preferred name of the encoding this label maps to.
    self.preferred = preferred
  def __cmp__(self, other):
    # Python 2 sort hook: order labels with cmp_from_end so that the
    # generated LABELS_SORTED table has the expected order.
    return cmp_from_end(self.label, other.label)
  def __lt__(self, other):
    # Rich comparison with the same ordering as __cmp__ so that
    # list.sort() also works when run under Python 3.
    return cmp_from_end(self.label, other.label) < 0
35
def static_u16_table(name, data, out=None):
  """Write a Rust `pub static NAME: [u16; N]` array literal holding
  `data` (a sequence of integers, one u16 per item).

  `out` is the file-like object to write to; it defaults to the
  module-level `data_file` so existing call sites are unchanged.
  Iterates the data directly instead of using the Python-2-only
  xrange + index pattern."""
  if out is None:
    out = data_file
  out.write('''pub static %s: [u16; %d] = [
  ''' % (name, len(data)))

  for value in data:
    out.write('0x%04X,\n' % value)

  out.write('''];

  ''')
46
def static_u16_table_from_indexable(name, data, item, out=None):
  """Write a Rust `static NAME: [u16; N]` array literal taking, for
  each element of `data`, the sub-item at index `item` (e.g. the
  code point out of a (code point, byte pair) tuple).

  The table is gated behind the optional static-ideograph-encoder
  feature. `out` defaults to the module-level `data_file` so
  existing call sites are unchanged."""
  if out is None:
    out = data_file
  out.write('''#[cfg(not(feature = "no-static-ideograph-encoder-tables"))]
static %s: [u16; %d] = [
  ''' % (name, len(data)))

  for entry in data:
    out.write('0x%04X,\n' % entry[item])

  out.write('''];

  ''')
58
def static_u8_pair_table_from_indexable(name, data, item, out=None):
  """Write a Rust `static NAME: [[u8; 2]; N]` array literal taking,
  for each element of `data`, the two-tuple at index `item` (e.g.
  the byte pair out of a (code point, byte pair) tuple).

  The table is gated behind the optional static-ideograph-encoder
  feature. `out` defaults to the module-level `data_file` so
  existing call sites are unchanged."""
  if out is None:
    out = data_file
  out.write('''#[cfg(not(feature = "no-static-ideograph-encoder-tables"))]
static %s: [[u8; 2]; %d] = [
  ''' % (name, len(data)))

  for entry in data:
    out.write('[0x%02X, 0x%02X],\n' % entry[item])

  out.write('''];

  ''')
70
# Preferred (DOM) names of all encodings, filled from the JSON data below.
preferred = []

# Preferred names passed through to_dom_name() (currently identity).
dom = []

# Label objects for every (label, preferred name) pair.
labels = []

# WHATWG Encoding Standard data, expected in a sibling checkout.
# NOTE(review): the file handles from open() are never closed explicitly;
# harmless for a short-lived script but worth tidying.
data = json.load(open("../encoding/encodings.json", "r"))

indexes = json.load(open("../encoding/indexes.json", "r"))

# Encoding records split by group heading (populated in the loop below).
single_byte = []

multi_byte = []
84
def to_camel_name(name):
  """Map a DOM encoding name to the CamelCase Rust identifier used
  for its variant, with special cases for the ISO-8859 family."""
  if name == u"iso-8859-8-i":
    return u"Iso8I"
  if name.startswith(u"iso-8859-"):
    return name.replace(u"iso-8859-", u"Iso")
  result = name.title()
  for unwanted in (u"X-", u"-", u"_"):
    result = result.replace(unwanted, u"")
  return result
91
def to_constant_name(name):
  """Map a DOM encoding name to the SCREAMING_SNAKE_CASE Rust
  constant name (dashes become underscores)."""
  return name.upper().replace(u"-", u"_")
94
def to_snake_name(name):
  """Map a DOM encoding name to lower snake_case (dashes become
  underscores)."""
  return name.lower().replace(u"-", u"_")
97
def to_dom_name(name):
  """Return the DOM-visible name for a preferred encoding name.

  Currently the identity function; kept as a hook so the mapping can
  be changed in one place if the two ever need to diverge."""
  return name
100
101#
102
103for group in data:
104  if group["heading"] == "Legacy single-byte encodings":
105    single_byte = group["encodings"]
106  else:
107    multi_byte.extend(group["encodings"])
108  for encoding in group["encodings"]:
109    preferred.append(encoding["name"])
110    for label in encoding["labels"]:
111      labels.append(Label(label, encoding["name"]))
112
113for name in preferred:
114  dom.append(to_dom_name(name))
115
116preferred.sort()
117labels.sort()
118dom.sort(cmp=cmp_from_end)
119
120longest_label_length = 0
121longest_name_length = 0
122longest_label = None
123longest_name = None
124
125for name in preferred:
126  if len(name) > longest_name_length:
127    longest_name_length = len(name)
128    longest_name = name
129
130for label in labels:
131  if len(label.label) > longest_label_length:
132    longest_label_length = len(label.label)
133    longest_label = label.label
134
def is_single_byte(name):
  """Return True iff `name` names one of the legacy single-byte
  encodings collected in the module-level `single_byte` list."""
  return any(encoding["name"] == name for encoding in single_byte)
140
def read_non_generated(path):
  """Read the file at `path` and return a (head, tail) tuple.

  `head` is everything up to and including the "BEGIN GENERATED CODE"
  marker line; `tail` is everything from the "END GENERATED CODE"
  marker onwards. The text between the markers is discarded so fresh
  generated code can be spliced in.

  Exits the process with an error message if either marker is
  missing. Uses a context manager to close the file and the print()
  function form, which works under both Python 2 and Python 3."""
  with open(path, "r") as partially_generated_file:
    full = partially_generated_file.read()

  generated_begin = "// BEGIN GENERATED CODE. PLEASE DO NOT EDIT."
  generated_end = "// END GENERATED CODE"

  generated_begin_index = full.find(generated_begin)
  if generated_begin_index < 0:
    print("Can't find generated code start marker in %s. Exiting." % path)
    sys.exit(-1)
  generated_end_index = full.find(generated_end)
  if generated_end_index < 0:
    print("Can't find generated code end marker in %s. Exiting." % path)
    sys.exit(-1)

  return (full[0:generated_begin_index + len(generated_begin)],
          full[generated_end_index:])
160
# Splice generated code into src/lib.rs between the BEGIN/END markers,
# preserving the hand-written head and tail of the file.
(lib_rs_begin, lib_rs_end) = read_non_generated("src/lib.rs")

label_file = open("src/lib.rs", "w")

label_file.write(lib_rs_begin)
label_file.write("""
// Instead, please regenerate using generate-encoding-data.py

const LONGEST_LABEL_LENGTH: usize = %d; // %s

""" % (longest_label_length, longest_label))

# Emit a FOO_INIT static plus a FOO reference-typed static for each
# preferred encoding. Single-byte encodings reference their data table;
# ISO-8859-8-I shares the ISO-8859-8 table.
for name in preferred:
  variant = None
  if is_single_byte(name):
    variant = "SingleByte(data::%s_DATA)" % to_constant_name(u"iso-8859-8" if name == u"ISO-8859-8-I" else name)
  else:
    variant = to_camel_name(name)

  label_file.write('''/// The initializer for the %s encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static %s_INIT: Encoding = Encoding {
    name: "%s",
    variant: VariantEncoding::%s,
};

/// The %s encoding.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static %s: &'static Encoding = &%s_INIT;

''' % (to_dom_name(name), to_constant_name(name), to_dom_name(name), variant, to_dom_name(name), to_constant_name(name), to_constant_name(name)))

# Parallel arrays: label strings in cmp_from_end order, and the
# encoding each label resolves to at the same position.
label_file.write("""static LABELS_SORTED: [&'static str; %d] = [
""" % len(labels))

for label in labels:
  label_file.write('''"%s",\n''' % label.label)

label_file.write("""];

static ENCODINGS_IN_LABEL_SORT: [&'static Encoding; %d] = [
""" % len(labels))

for label in labels:
  label_file.write('''&%s_INIT,\n''' % to_constant_name(label.preferred))

label_file.write('''];

''')
label_file.write(lib_rs_end)
label_file.close()

# Generate a Rust test asserting that every label resolves to its
# preferred encoding via Encoding::for_label().
label_test_file = open("src/test_labels_names.rs", "w")
label_test_file.write('''// Any copyright to the test code below this comment is dedicated to the
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/

// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
// Instead, please regenerate using generate-encoding-data.py

use super::*;

#[test]
fn test_all_labels() {
''')

for label in labels:
  label_test_file.write('''assert_eq!(Encoding::for_label(b"%s"), Some(%s));\n''' % (label.label, to_constant_name(label.preferred)))

label_test_file.write('''}
''')
label_test_file.close()
246
def null_to_zero(code_point):
  """Map a None (unmapped index entry) to 0; pass real values through."""
  return code_point if code_point else 0
251
# src/data.rs holds the generated lookup tables; unlike lib.rs it is
# rewritten wholesale (no hand-written parts to preserve).
data_file = open("src/data.rs", "w")
data_file.write('''// Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
// Instead, please regenerate using generate-encoding-data.py

''')

# Single-byte encodings: one 128-entry table of the upper-half mappings
# per encoding. ISO-8859-8-I shares ISO-8859-8's table, so it is skipped.

for encoding in single_byte:
  name = encoding["name"]
  if name == u"ISO-8859-8-I":
    continue

  data_file.write('''pub const %s_DATA: &'static [u16; 128] = &[
''' % to_constant_name(name))

  for code_point in indexes[name.lower()]:
    data_file.write('0x%04X,\n' % null_to_zero(code_point))

  data_file.write('''];

''')
283
# Big5

index = indexes["big5"]

# Split each mapped code point into an "is astral plane" bit and its
# low 16 bits; the astralness bits get packed 32 per u32 below.
astralness = []
low_bits = []

for code_point in index[942:19782]:
  if code_point:
    astralness.append(1 if code_point > 0xFFFF else 0)
    low_bits.append(code_point & 0xFFFF)
  else:
    astralness.append(0)
    low_bits.append(0)

# pad length to multiple of 32
for j in xrange(32 - (len(astralness) % 32)):
  astralness.append(0)

# NOTE(review): `/` here is Python 2 integer division; under Python 3
# this would produce a float in the %d slot.
data_file.write('''static BIG5_ASTRALNESS: [u32; %d] = [
''' % (len(astralness) / 32))

# Pack 32 astralness bits per u32, least significant bit first.
i = 0
while i < len(astralness):
  accu = 0
  for j in xrange(32):
    accu |= astralness[i + j] << j
  data_file.write('0x%08X,\n' % accu)
  i += 32

data_file.write('''];

''')

static_u16_table("BIG5_LOW_BITS", low_bits)

# Encoder table for Level 1 Hanzi
# Note: If we were OK with doubling this table, we
# could use a directly-indexable table instead...
level1_hanzi_index = index[5495:10896]
level1_hanzi_pairs = []
for i in xrange(len(level1_hanzi_index)):
  # Reconstruct the Big5 byte pair for this pointer (157 trail values
  # per lead; the trail range has a gap, hence the 0x40/0x62 offsets).
  hanzi_lead = (i / 157) + 0xA4
  hanzi_trail = (i % 157)
  hanzi_trail += 0x40 if hanzi_trail < 0x3F else 0x62
  level1_hanzi_pairs.append((level1_hanzi_index[i], (hanzi_lead, hanzi_trail)))
# Extra code point -> byte pair mappings appended by hand, outside the
# sliced range above.
level1_hanzi_pairs.append((0x4E5A, (0xC8, 0x7B)))
level1_hanzi_pairs.append((0x5202, (0xC8, 0x7D)))
level1_hanzi_pairs.append((0x9FB0, (0xC8, 0xA1)))
level1_hanzi_pairs.append((0x5188, (0xC8, 0xA2)))
level1_hanzi_pairs.append((0x9FB1, (0xC8, 0xA3)))
# Sort by code point so the generated table can be binary-searched.
level1_hanzi_pairs.sort(key=lambda x: x[0])

static_u16_table_from_indexable("BIG5_LEVEL1_HANZI_CODE_POINTS", level1_hanzi_pairs, 0)
static_u8_pair_table_from_indexable("BIG5_LEVEL1_HANZI_BYTES", level1_hanzi_pairs, 1)
339
# JIS0208

index = indexes["jis0208"]

# JIS 0208 Level 1 Kanji
static_u16_table("JIS0208_LEVEL1_KANJI", index[1410:4375])

# JIS 0208 Level 2 Kanji and Additional Kanji
static_u16_table("JIS0208_LEVEL2_AND_ADDITIONAL_KANJI", index[4418:7808])

# IBM Kanji
static_u16_table("IBM_KANJI", index[8272:8632])

# Check that the other instance is the same
# NOTE(review): `Error` is not defined anywhere in this script, so this
# aborts via NameError rather than the intended exception.
if index[8272:8632] != index[10744:11104]:
  raise Error()

# JIS 0208 symbols (all non-Kanji, non-range items)
# Each contiguous run of mapped pointers is recorded as a
# (start pointer, run length, start index into symbol_index) triple.
symbol_index = []
symbol_triples = []
pointers_to_scan = [
  (0, 188),
  (658, 691),
  (1159, 1221),
]
in_run = False
run_start_pointer = 0
run_start_array_index = 0
for (start, end) in pointers_to_scan:
  for i in range(start, end):
    code_point = index[i]
    if in_run:
      if code_point:
        symbol_index.append(code_point)
      else:
        symbol_triples.append(run_start_pointer)
        symbol_triples.append(i - run_start_pointer)
        symbol_triples.append(run_start_array_index)
        in_run = False
    else:
      if code_point:
        in_run = True
        run_start_pointer = i
        run_start_array_index = len(symbol_index)
        symbol_index.append(code_point)
  # Close a run that extends to the end of the scanned range.
  if in_run:
    symbol_triples.append(run_start_pointer)
    symbol_triples.append(end - run_start_pointer)
    symbol_triples.append(run_start_array_index)
    in_run = False
if in_run:
  raise Error()

# Now add manually the two overlapping slices of
# index from the NEC/IBM extensions.
run_start_array_index = len(symbol_index)
symbol_index.extend(index[10736:10744])
# Later
symbol_triples.append(10736)
symbol_triples.append(8)
symbol_triples.append(run_start_array_index)
# Earlier
symbol_triples.append(8644)
symbol_triples.append(4)
symbol_triples.append(run_start_array_index)

static_u16_table("JIS0208_SYMBOLS", symbol_index)
static_u16_table("JIS0208_SYMBOL_TRIPLES", symbol_triples)

# Write down the magic numbers needed when preferring the earlier case
data_file.write('''const IBM_SYMBOL_START: usize = %d;''' % (run_start_array_index + 1))
data_file.write('''const IBM_SYMBOL_END: usize = %d;''' % (run_start_array_index + 4))
data_file.write('''const IBM_SYMBOL_POINTER_START: usize = %d;''' % 8645)
413
# JIS 0208 ranges (excluding kana)
# Consecutive code point runs are compressed into
# (start pointer, run length, start code point) triples.
range_triples = []
pointers_to_scan = [
  (188, 281),
  (470, 657),
  (1128, 1159),
  (8634, 8644),
  (10716, 10736),
]
in_run = False
run_start_pointer = 0
run_start_code_point = 0
previous_code_point = 0
for (start, end) in pointers_to_scan:
  for i in range(start, end):
    code_point = index[i]
    if in_run:
      if code_point:
        # A jump in the code point sequence starts a new triple.
        if previous_code_point + 1 != code_point:
          range_triples.append(run_start_pointer)
          range_triples.append(i - run_start_pointer)
          range_triples.append(run_start_code_point)
          run_start_pointer = i
          run_start_code_point = code_point
        previous_code_point = code_point
      else:
          range_triples.append(run_start_pointer)
          range_triples.append(i - run_start_pointer)
          range_triples.append(run_start_code_point)
          run_start_pointer = 0
          run_start_code_point = 0
          previous_code_point = 0
          in_run = False
    else:
      if code_point:
        in_run = True
        run_start_pointer = i
        run_start_code_point = code_point
        previous_code_point = code_point
  # Close a run that extends to the end of the scanned range.
  if in_run:
    range_triples.append(run_start_pointer)
    range_triples.append(end - run_start_pointer)
    range_triples.append(run_start_code_point)
    run_start_pointer = 0
    run_start_code_point = 0
    previous_code_point = 0
    in_run = False
if in_run:
  raise Error()

static_u16_table("JIS0208_RANGE_TRIPLES", range_triples)

# Encoder table for Level 1 Kanji
# Note: If we were OK with 30 KB more footprint, we
# could use a directly-indexable table instead...
level1_kanji_index = index[1410:4375]
level1_kanji_pairs = []
for i in xrange(len(level1_kanji_index)):
  pointer = 1410 + i
  # Precompute the Shift_JIS lead/trail byte pair for this pointer.
  (lead, trail) = divmod(pointer, 188)
  lead += 0x81 if lead < 0x1F else 0xC1
  trail += 0x40 if trail < 0x3F else 0x41
  level1_kanji_pairs.append((level1_kanji_index[i], (lead, trail)))
# Sort by code point so the generated table can be binary-searched.
level1_kanji_pairs.sort(key=lambda x: x[0])

static_u16_table_from_indexable("JIS0208_LEVEL1_KANJI_CODE_POINTS", level1_kanji_pairs, 0)
static_u8_pair_table_from_indexable("JIS0208_LEVEL1_KANJI_SHIFT_JIS_BYTES", level1_kanji_pairs, 1)

# ISO-2022-JP half-width katakana

# index is still jis0208
half_width_index = indexes["iso-2022-jp-katakana"]

data_file.write('''pub static ISO_2022_JP_HALF_WIDTH_TRAIL: [u8; %d] = [
''' % len(half_width_index))

for i in xrange(len(half_width_index)):
  code_point = half_width_index[i]
  # Find the full-width counterpart's pointer and derive the trail byte.
  pointer = index.index(code_point)
  trail = pointer % 94 + 0x21
  data_file.write('0x%02X,\n' % trail)

data_file.write('''];

''')
499
# EUC-KR

index = indexes["euc-kr"]

# Unicode 1.1 Hangul above the old KS X 1001 block
# Compressed form takes 35% of uncompressed form
# Run-length compression: record a (pointer, code point) pair only
# where the code point sequence jumps; lookups interpolate in between.
pointers = []
offsets = []
previous_code_point = 0
for row in xrange(0x20):
  for column in xrange(190):
    i = column + (row * 190)
    # Skip the gaps
    if (column >= 0x1A and column < 0x20) or (column >= 0x3A and column < 0x40):
      continue
    code_point = index[i]
    # The compression scheme requires monotonically increasing code points.
    if previous_code_point > code_point:
      raise Error()
    if code_point - previous_code_point != 1:
      # Re-base the column to account for the skipped gaps above.
      adjustment = 0
      if column >= 0x40:
        adjustment = 12
      elif column >= 0x20:
        adjustment = 6
      pointers.append(column - adjustment + (row * (190 - 12)))
      offsets.append(code_point)
    previous_code_point = code_point

static_u16_table("CP949_TOP_HANGUL_POINTERS", pointers)
static_u16_table("CP949_TOP_HANGUL_OFFSETS", offsets)

# Unicode 1.1 Hangul to the left of the old KS X 1001 block
pointers = []
offsets = []
previous_code_point = 0
for row in xrange(0x46 - 0x20):
  for column in xrange(190 - 94):
    i = 6080 + column + (row * 190)
    # Skip the gaps
    if (column >= 0x1A and column < 0x20) or (column >= 0x3A and column < 0x40):
      continue
    if i > 13127:
      # Exclude unassigned on partial last row
      break
    code_point = index[i]
    if previous_code_point > code_point:
      raise Error()
    if code_point - previous_code_point != 1:
      adjustment = 0
      if column >= 0x40:
        adjustment = 12
      elif column >= 0x20:
        adjustment = 6
      pointers.append(column - adjustment + (row * (190 - 94 - 12)))
      offsets.append(code_point)
    previous_code_point = code_point

static_u16_table("CP949_LEFT_HANGUL_POINTERS", pointers)
static_u16_table("CP949_LEFT_HANGUL_OFFSETS", offsets)
559
# KS X 1001 Hangul
hangul_index = []
previous_code_point = 0
for row in xrange(0x48 - 0x2F):
  for column in xrange(94):
    code_point = index[9026 + column + (row * 190)]
    # Sanity check: this block must be strictly increasing.
    if previous_code_point >= code_point:
      raise Error()
    hangul_index.append(code_point)
    previous_code_point = code_point

static_u16_table("KSX1001_HANGUL", hangul_index)

# KS X 1001 Hanja
hanja_index = []
for row in xrange(0x7D - 0x49):
  for column in xrange(94):
    hanja_index.append(index[13966 + column + (row * 190)])

static_u16_table("KSX1001_HANJA", hanja_index)

# KS X 1001 symbols
symbol_index = []
for i in range(6176, 6270):
  symbol_index.append(index[i])
for i in range(6366, 6437):
  symbol_index.append(index[i])

static_u16_table("KSX1001_SYMBOLS", symbol_index)

# KS X 1001 Uppercase Latin
subindex = []
for i in range(7506, 7521):
  subindex.append(null_to_zero(index[i]))

static_u16_table("KSX1001_UPPERCASE", subindex)

# KS X 1001 Lowercase Latin
subindex = []
for i in range(7696, 7712):
  subindex.append(index[i])

static_u16_table("KSX1001_LOWERCASE", subindex)

# KS X 1001 Box drawing
subindex = []
for i in range(7126, 7194):
  subindex.append(index[i])

static_u16_table("KSX1001_BOX", subindex)

# KS X 1001 other
# Run-length compressed like the Hangul blocks above, but the offsets
# are left unsorted (searched linearly on encode).
pointers = []
offsets = []
previous_code_point = 0
for row in xrange(10):
  for column in xrange(94):
    i = 6556 + column + (row * 190)
    code_point = index[i]
    # Exclude ranges that were processed as lookup tables
    # or that contain unmapped cells by filling them with
    # ASCII. Upon encode, ASCII code points will
    # never appear as the search key.
    if (i >= 6946 and i <= 6950):
      code_point = i - 6946
    elif (i >= 6961 and i <= 6967):
      code_point = i - 6961
    elif (i >= 6992 and i <= 6999):
      code_point = i - 6992
    elif (i >= 7024 and i <= 7029):
      code_point = i - 7024
    elif (i >= 7126 and i <= 7219):
      code_point = i - 7126
    elif (i >= 7395 and i <= 7409):
      code_point = i - 7395
    elif (i >= 7506 and i <= 7521):
      code_point = i - 7506
    elif (i >= 7696 and i <= 7711):
      code_point = i - 7696
    elif (i >= 7969 and i <= 7979):
      code_point = i - 7969
    elif (i >= 8162 and i <= 8169):
      code_point = i - 8162
    elif (i >= 8299 and i <= 8313):
      code_point = i - 8299
    elif (i >= 8347 and i <= 8359):
      code_point = i - 8347
    if code_point - previous_code_point != 1:
      pointers.append(column + (row * 94))
      offsets.append(code_point)
    previous_code_point = code_point

static_u16_table("KSX1001_OTHER_POINTERS", pointers)
# Omit the last offset, because the end of the last line
# is unmapped, so we don't want to look at it.
static_u16_table("KSX1001_OTHER_UNSORTED_OFFSETS", offsets[:-1])
656
# JIS 0212

index = indexes["jis0212"]

# JIS 0212 Kanji
static_u16_table("JIS0212_KANJI", index[1410:7211])

# JIS 0212 accented (all non-Kanji, non-range items)
# Same run/triple scheme as the JIS 0208 symbols, except that a
# single-pointer gap inside a run is bridged with a 0 placeholder
# (see the `elif index[i + 1]:` branch) instead of ending the run.
symbol_index = []
symbol_triples = []
pointers_to_scan = [
  (0, 596),
  (608, 644),
  (656, 1409),
]
in_run = False
run_start_pointer = 0
run_start_array_index = 0
for (start, end) in pointers_to_scan:
  for i in range(start, end):
    code_point = index[i]
    if in_run:
      if code_point:
        symbol_index.append(code_point)
      elif index[i + 1]:
        symbol_index.append(0)
      else:
        symbol_triples.append(run_start_pointer)
        symbol_triples.append(i - run_start_pointer)
        symbol_triples.append(run_start_array_index)
        in_run = False
    else:
      if code_point:
        in_run = True
        run_start_pointer = i
        run_start_array_index = len(symbol_index)
        symbol_index.append(code_point)
  # Close a run that extends to the end of the scanned range.
  if in_run:
    symbol_triples.append(run_start_pointer)
    symbol_triples.append(end - run_start_pointer)
    symbol_triples.append(run_start_array_index)
    in_run = False
if in_run:
  raise Error()

static_u16_table("JIS0212_ACCENTED", symbol_index)
static_u16_table("JIS0212_ACCENTED_TRIPLES", symbol_triples)
704
# gb18030

index = indexes["gb18030"]

# Unicode 1.1 ideographs above the old GB2312 block
# Compressed form takes 63% of uncompressed form
# Run-length compression: record a (pointer, code point) pair only
# where the code point sequence jumps.
pointers = []
offsets = []
previous_code_point = 0
for i in xrange(6080):
  code_point = index[i]
  # The compression scheme requires monotonically increasing code points.
  if previous_code_point > code_point:
    raise Error()
  if code_point - previous_code_point != 1:
    pointers.append(i)
    offsets.append(code_point)
  previous_code_point = code_point

static_u16_table("GBK_TOP_IDEOGRAPH_POINTERS", pointers)
static_u16_table("GBK_TOP_IDEOGRAPH_OFFSETS", offsets)

# Unicode 1.1 ideographs to the left of the old GB2312 block
# Compressed form takes 40% of uncompressed form
pointers = []
offsets = []
previous_code_point = 0
for row in xrange(0x7D - 0x29):
  for column in xrange(190 - 94):
    i = 7790 + column + (row * 190)
    if i > 23650:
      # Exclude compatibility ideographs at the end
      break
    code_point = index[i]
    if previous_code_point > code_point:
      raise Error()
    if code_point - previous_code_point != 1:
      pointers.append(column + (row * (190 - 94)))
      offsets.append(code_point)
    previous_code_point = code_point

static_u16_table("GBK_LEFT_IDEOGRAPH_POINTERS", pointers)
static_u16_table("GBK_LEFT_IDEOGRAPH_OFFSETS", offsets)

# GBK other (excl. Ext A, Compat & PUA at the bottom)
pointers = []
offsets = []
previous_code_point = 0
for row in xrange(0x29 - 0x20):
  for column in xrange(190 - 94):
    i = 6080 + column + (row * 190)
    code_point = index[i]
    if code_point - previous_code_point != 1:
      pointers.append(column + (row * (190 - 94)))
      offsets.append(code_point)
    previous_code_point = code_point

# Sentinel end pointer so consumers can compute the last run's length.
pointers.append((190 - 94) * (0x29 - 0x20))
static_u16_table("GBK_OTHER_POINTERS", pointers)
static_u16_table("GBK_OTHER_UNSORTED_OFFSETS", offsets)

# GBK bottom: Compatibility ideographs, Ext A and PUA
bottom_index = []
# 5 compat following Unified Ideographs
for i in range(23651, 23656):
  bottom_index.append(index[i])
# Last row
for i in range(23750, 23846):
  bottom_index.append(index[i])

static_u16_table("GBK_BOTTOM", bottom_index)

# GB2312 Hanzi
# (and the 5 PUA code points in between Level 1 and Level 2)
hanzi_index = []
for row in xrange(0x77 - 0x2F):
  for column in xrange(94):
    hanzi_index.append(index[9026 + column + (row * 190)])

static_u16_table("GB2312_HANZI", hanzi_index)

# GB2312 symbols
symbol_index = []
for i in xrange(94):
  symbol_index.append(index[6176 + i])

static_u16_table("GB2312_SYMBOLS", symbol_index)

# GB2312 symbols on Greek row (incl. PUA)
symbol_index = []
for i in xrange(22):
  symbol_index.append(index[7189 + i])

static_u16_table("GB2312_SYMBOLS_AFTER_GREEK", symbol_index)

# GB2312 Pinyin
pinyin_index = []
for i in xrange(32):
  pinyin_index.append(index[7506 + i])

static_u16_table("GB2312_PINYIN", pinyin_index)

# GB2312 other (excl. bottom PUA)
pointers = []
offsets = []
previous_code_point = 0
for row in xrange(14):
  for column in xrange(94):
    i = 6366 + column + (row * 190)
    code_point = index[i]
    # Exclude the two ranges that were processed as
    # lookup tables above by filling them with
    # ASCII. Upon encode, ASCII code points will
    # never appear as the search key.
    if (i >= 7189 and i < 7189 + 22):
      code_point = i - 7189
    elif (i >= 7506 and i < 7506 + 32):
      code_point = i - 7506
    if code_point - previous_code_point != 1:
      pointers.append(column + (row * 94))
      offsets.append(code_point)
    previous_code_point = code_point

# Sentinel end pointer so consumers can compute the last run's length.
pointers.append(14 * 94)
static_u16_table("GB2312_OTHER_POINTERS", pointers)
static_u16_table("GB2312_OTHER_UNSORTED_OFFSETS", offsets)

# Non-gbk code points
pointers = []
offsets = []
for pair in indexes["gb18030-ranges"]:
  if pair[1] == 0x10000:
    break # the last entry doesn't fit in u16
  pointers.append(pair[0])
  offsets.append(pair[1])

static_u16_table("GB18030_RANGE_POINTERS", pointers)
static_u16_table("GB18030_RANGE_OFFSETS", offsets)

# Encoder table for Level 1 Hanzi
# The units here really fit into 12 bits, but since we're
# looking for speed here, let's use 16 bits per unit.
# Once we use 16 bits per unit, we might as well precompute
# the output bytes.
level1_hanzi_index = hanzi_index[:(94 * (0xD8 - 0xB0) - 5)]
level1_hanzi_pairs = []
for i in xrange(len(level1_hanzi_index)):
  # NOTE(review): `/` is Python 2 integer division here.
  hanzi_lead = (i / 94) + 0xB0
  hanzi_trail = (i % 94) + 0xA1
  level1_hanzi_pairs.append((level1_hanzi_index[i], (hanzi_lead, hanzi_trail)))
# Sort by code point so the generated table can be binary-searched.
level1_hanzi_pairs.sort(key=lambda x: x[0])

static_u16_table_from_indexable("GB2312_LEVEL1_HANZI_CODE_POINTS", level1_hanzi_pairs, 0)
static_u8_pair_table_from_indexable("GB2312_LEVEL1_HANZI_BYTES", level1_hanzi_pairs, 1)
858
859data_file.write('''#[inline(always)]
860fn map_with_ranges(haystack: &[u16], other: &[u16], needle: u16) -> u16 {
861    debug_assert_eq!(haystack.len(), other.len());
862    match haystack.binary_search(&needle) {
863        Ok(i) => other[i],
864        Err(i) => other[i - 1] + (needle - haystack[i - 1]),
865    }
866}
867
868#[inline(always)]
869fn map_with_unsorted_ranges(haystack: &[u16], other: &[u16], needle: u16) -> Option<u16> {
870    debug_assert_eq!(haystack.len() + 1, other.len());
871    for i in 0..haystack.len() {
872        let start = other[i];
873        let end = other[i + 1];
874        let length = end - start;
875        let offset = needle.wrapping_sub(haystack[i]);
876        if offset < length {
877            return Some(start + offset);
878        }
879    }
880    None
881}
882
883#[inline(always)]
884pub fn position(haystack: &[u16], needle: u16) -> Option<usize> {
885    haystack.iter().position(|&x| x == needle)
886}
887
888#[inline(always)]
889pub fn gb18030_range_decode(pointer: u16) -> u16 {
890    map_with_ranges(&GB18030_RANGE_POINTERS[..],
891                    &GB18030_RANGE_OFFSETS[..],
892                    pointer)
893}
894
895#[inline(always)]
896pub fn gb18030_range_encode(bmp: u16) -> usize {
897    if bmp == 0xE7C7 {
898        return 7457;
899    }
900    map_with_ranges(&GB18030_RANGE_OFFSETS[..], &GB18030_RANGE_POINTERS[..], bmp) as usize
901}
902
903#[inline(always)]
904pub fn gbk_top_ideograph_decode(pointer: u16) -> u16 {
905    map_with_ranges(&GBK_TOP_IDEOGRAPH_POINTERS[..],
906                    &GBK_TOP_IDEOGRAPH_OFFSETS[..],
907                    pointer)
908}
909
910#[inline(always)]
911pub fn gbk_top_ideograph_encode(bmp: u16) -> u16 {
912    map_with_ranges(&GBK_TOP_IDEOGRAPH_OFFSETS[..],
913                    &GBK_TOP_IDEOGRAPH_POINTERS[..],
914                    bmp)
915}
916
917#[inline(always)]
918pub fn gbk_left_ideograph_decode(pointer: u16) -> u16 {
919    map_with_ranges(&GBK_LEFT_IDEOGRAPH_POINTERS[..],
920                    &GBK_LEFT_IDEOGRAPH_OFFSETS[..],
921                    pointer)
922}
923
924#[inline(always)]
925pub fn gbk_left_ideograph_encode(bmp: u16) -> u16 {
926    map_with_ranges(&GBK_LEFT_IDEOGRAPH_OFFSETS[..],
927                    &GBK_LEFT_IDEOGRAPH_POINTERS[..],
928                    bmp)
929}
930
931#[inline(always)]
932pub fn cp949_top_hangul_decode(pointer: u16) -> u16 {
933    map_with_ranges(&CP949_TOP_HANGUL_POINTERS[..],
934                    &CP949_TOP_HANGUL_OFFSETS[..],
935                    pointer)
936}
937
938#[inline(always)]
939pub fn cp949_top_hangul_encode(bmp: u16) -> u16 {
940    map_with_ranges(&CP949_TOP_HANGUL_OFFSETS[..],
941                    &CP949_TOP_HANGUL_POINTERS[..],
942                    bmp)
943}
944
945#[inline(always)]
946pub fn cp949_left_hangul_decode(pointer: u16) -> u16 {
947    map_with_ranges(&CP949_LEFT_HANGUL_POINTERS[..],
948                    &CP949_LEFT_HANGUL_OFFSETS[..],
949                    pointer)
950}
951
952#[inline(always)]
953pub fn cp949_left_hangul_encode(bmp: u16) -> u16 {
954    map_with_ranges(&CP949_LEFT_HANGUL_OFFSETS[..],
955                    &CP949_LEFT_HANGUL_POINTERS[..],
956                    bmp)
957}
958
959#[inline(always)]
960pub fn gbk_other_decode(pointer: u16) -> u16 {
961    map_with_ranges(&GBK_OTHER_POINTERS[..GBK_OTHER_POINTERS.len() - 1],
962                    &GBK_OTHER_UNSORTED_OFFSETS[..],
963                    pointer)
964}
965
966#[inline(always)]
967pub fn gbk_other_encode(bmp: u16) -> Option<u16> {
968    map_with_unsorted_ranges(&GBK_OTHER_UNSORTED_OFFSETS[..],
969                             &GBK_OTHER_POINTERS[..],
970                             bmp)
971}
972
973#[inline(always)]
974pub fn gb2312_other_decode(pointer: u16) -> u16 {
975    map_with_ranges(&GB2312_OTHER_POINTERS[..GB2312_OTHER_POINTERS.len() - 1],
976                    &GB2312_OTHER_UNSORTED_OFFSETS[..],
977                    pointer)
978}
979
980#[inline(always)]
981pub fn gb2312_other_encode(bmp: u16) -> Option<u16> {
982    map_with_unsorted_ranges(&GB2312_OTHER_UNSORTED_OFFSETS[..],
983                             &GB2312_OTHER_POINTERS[..],
984                             bmp)
985}
986
987#[cfg(feature = "no-static-ideograph-encoder-tables")]
988#[inline(always)]
989pub fn gb2312_level1_hanzi_encode(bmp: u16) -> Option<(u8, u8)> {
990    position(&GB2312_HANZI[..(94 * (0xD8 - 0xB0) - 5)], bmp).map(|hanzi_pointer| {
991        let hanzi_lead = (hanzi_pointer / 94) + 0xB0;
992        let hanzi_trail = (hanzi_pointer % 94) + 0xA1;
993        (hanzi_lead as u8, hanzi_trail as u8)
994    })
995}
996
997#[cfg(not(feature = "no-static-ideograph-encoder-tables"))]
998#[inline(always)]
999pub fn gb2312_level1_hanzi_encode(bmp: u16) -> Option<(u8, u8)> {
1000    match GB2312_LEVEL1_HANZI_CODE_POINTS.binary_search(&bmp) {
1001        Ok(i) => {
1002            let pair = &GB2312_LEVEL1_HANZI_BYTES[i];
1003            Some((pair[0], pair[1]))
1004        }
1005        Err(_) => None,
1006    }
1007}
1008
1009#[inline(always)]
1010pub fn gb2312_level2_hanzi_encode(bmp: u16) -> Option<usize> {
1011    // TODO: optimize
1012    position(&GB2312_HANZI[(94 * (0xD8 - 0xB0))..], bmp)
1013}
1014
1015#[inline(always)]
1016pub fn ksx1001_other_decode(pointer: u16) -> u16 {
1017    map_with_ranges(&KSX1001_OTHER_POINTERS[..KSX1001_OTHER_POINTERS.len() - 1],
1018                    &KSX1001_OTHER_UNSORTED_OFFSETS[..],
1019                    pointer)
1020}
1021
1022#[inline(always)]
1023pub fn ksx1001_other_encode(bmp: u16) -> Option<u16> {
1024    map_with_unsorted_ranges(&KSX1001_OTHER_UNSORTED_OFFSETS[..],
1025                             &KSX1001_OTHER_POINTERS[..],
1026                             bmp)
1027}
1028
1029#[cfg(feature = "no-static-ideograph-encoder-tables")]
1030#[inline(always)]
1031pub fn jis0208_level1_kanji_shift_jis_encode(bmp: u16) -> Option<(u8, u8)> {
1032    position(&JIS0208_LEVEL1_KANJI[..], bmp).map(|kanji_pointer| {
1033        let pointer = 1410 + kanji_pointer;
1034        let lead = pointer / 188;
1035        let lead_offset = if lead < 0x1F {
1036            0x81
1037        } else {
1038            0xC1
1039        };
1040        let trail = pointer % 188;
1041        let trail_offset = if trail < 0x3F {
1042            0x40
1043        } else {
1044            0x41
1045        };
1046        ((lead + lead_offset) as u8, (trail + trail_offset) as u8)
1047    })
1048}
1049
1050#[cfg(not(feature = "no-static-ideograph-encoder-tables"))]
1051#[inline(always)]
1052pub fn jis0208_level1_kanji_shift_jis_encode(bmp: u16) -> Option<(u8, u8)> {
1053    match JIS0208_LEVEL1_KANJI_CODE_POINTS.binary_search(&bmp) {
1054        Ok(i) => {
1055            let pair = &JIS0208_LEVEL1_KANJI_SHIFT_JIS_BYTES[i];
1056            Some((pair[0], pair[1]))
1057        }
1058        Err(_) => None,
1059    }
1060}
1061
1062#[cfg(feature = "no-static-ideograph-encoder-tables")]
1063#[inline(always)]
1064pub fn jis0208_level1_kanji_euc_jp_encode(bmp: u16) -> Option<(u8, u8)> {
1065    position(&JIS0208_LEVEL1_KANJI[..], bmp).map(|kanji_pointer| {
1066        let lead = (kanji_pointer / 94) + 0xB0;
1067        let trail = (kanji_pointer % 94) + 0xA1;
1068        (lead as u8, trail as u8)
1069    })
1070}
1071
1072#[cfg(not(feature = "no-static-ideograph-encoder-tables"))]
1073#[inline(always)]
1074pub fn jis0208_level1_kanji_euc_jp_encode(bmp: u16) -> Option<(u8, u8)> {
1075    jis0208_level1_kanji_shift_jis_encode(bmp).map(|(shift_jis_lead, shift_jis_trail)| {
1076        let mut lead = shift_jis_lead as usize;
1077        if shift_jis_lead >= 0xA0 {
1078            lead -= 0xC1 - 0x81;
1079        }
1080        // The next line would overflow u8. Letting it go over allows us to
1081        // subtract fewer times.
1082        lead <<= 1;
1083        // Bring it back to u8 range
1084        lead -= 0x61;
1085        let trail = if shift_jis_trail >= 0x9F {
1086            lead += 1;
1087            shift_jis_trail + (0xA1 - 0x9F)
1088        } else if shift_jis_trail < 0x7F {
1089            shift_jis_trail + (0xA1 - 0x40)
1090        } else {
1091            shift_jis_trail + (0xA1 - 0x41)
1092        };
1093        (lead as u8, trail)
1094    })
1095}
1096
1097#[cfg(feature = "no-static-ideograph-encoder-tables")]
1098#[inline(always)]
1099pub fn jis0208_level1_kanji_iso_2022_jp_encode(bmp: u16) -> Option<(u8, u8)> {
1100    position(&JIS0208_LEVEL1_KANJI[..], bmp).map(|kanji_pointer| {
1101        let lead = (kanji_pointer / 94) + (0xB0 - 0x80);
1102        let trail = (kanji_pointer % 94) + 0x21;
1103        (lead as u8, trail as u8)
1104    })
1105}
1106
1107#[cfg(not(feature = "no-static-ideograph-encoder-tables"))]
1108#[inline(always)]
1109pub fn jis0208_level1_kanji_iso_2022_jp_encode(bmp: u16) -> Option<(u8, u8)> {
1110    jis0208_level1_kanji_shift_jis_encode(bmp).map(|(shift_jis_lead, shift_jis_trail)| {
1111        let mut lead = shift_jis_lead as usize;
1112        if shift_jis_lead >= 0xA0 {
1113            lead -= 0xC1 - 0x81;
1114        }
1115        // The next line would overflow u8. Letting it go over allows us to
1116        // subtract fewer times.
1117        lead <<= 1;
1118        // Bring it back to u8 range
1119        lead -= 0xE1;
1120        let trail = if shift_jis_trail >= 0x9F {
1121            lead += 1;
1122            shift_jis_trail - (0x9F - 0x21)
1123        } else if shift_jis_trail < 0x7F {
1124            shift_jis_trail - (0x40 - 0x21)
1125        } else {
1126            shift_jis_trail - (0x41 - 0x21)
1127        };
1128        (lead as u8, trail)
1129    })
1130}
1131
1132#[inline(always)]
1133pub fn jis0208_level2_and_additional_kanji_encode(bmp: u16) -> Option<usize> {
1134    // TODO: optimize
1135    position(&JIS0208_LEVEL2_AND_ADDITIONAL_KANJI[..], bmp)
1136}
1137
1138pub fn jis0208_symbol_decode(pointer: usize) -> Option<u16> {
1139    let mut i = 0;
1140    while i < JIS0208_SYMBOL_TRIPLES.len() {
1141        let start = JIS0208_SYMBOL_TRIPLES[i] as usize;
1142        let length = JIS0208_SYMBOL_TRIPLES[i + 1] as usize;
1143        let pointer_minus_start = pointer.wrapping_sub(start);
1144        if pointer_minus_start < length {
1145            let offset = JIS0208_SYMBOL_TRIPLES[i + 2] as usize;
1146            return Some(JIS0208_SYMBOLS[pointer_minus_start + offset]);
1147        }
1148        i += 3;
1149    }
1150    None
1151}
1152
1153/// Prefers Shift_JIS pointers for the three symbols that are in both ranges.
1154#[inline(always)]
1155pub fn jis0208_symbol_encode(bmp: u16) -> Option<usize> {
1156    let mut i = 0;
1157    while i < JIS0208_SYMBOL_TRIPLES.len() {
1158        let pointer_start = JIS0208_SYMBOL_TRIPLES[i] as usize;
1159        let length = JIS0208_SYMBOL_TRIPLES[i + 1] as usize;
1160        let symbol_start = JIS0208_SYMBOL_TRIPLES[i + 2] as usize;
1161        let symbol_end = symbol_start + length;
1162        let mut symbol_pos = symbol_start;
1163        while symbol_pos < symbol_end {
1164            if JIS0208_SYMBOLS[symbol_pos] == bmp {
1165                return Some(symbol_pos - symbol_start + pointer_start);
1166            }
1167            symbol_pos += 1;
1168        }
1169        i += 3;
1170    }
1171    None
1172}
1173
1174#[inline(always)]
1175pub fn ibm_symbol_encode(bmp: u16) -> Option<usize> {
1176    position(&JIS0208_SYMBOLS[IBM_SYMBOL_START..IBM_SYMBOL_END], bmp)
1177        .map(|x| x + IBM_SYMBOL_POINTER_START)
1178}
1179
1180#[inline(always)]
1181pub fn jis0208_range_decode(pointer: usize) -> Option<u16> {
1182    let mut i = 0;
1183    while i < JIS0208_RANGE_TRIPLES.len() {
1184        let start = JIS0208_RANGE_TRIPLES[i] as usize;
1185        let length = JIS0208_RANGE_TRIPLES[i + 1] as usize;
1186        let pointer_minus_start = pointer.wrapping_sub(start);
1187        if pointer_minus_start < length {
1188            let offset = JIS0208_RANGE_TRIPLES[i + 2] as usize;
1189            return Some((pointer_minus_start + offset) as u16);
1190        }
1191        i += 3;
1192    }
1193    None
1194}
1195
1196#[inline(always)]
1197pub fn jis0208_range_encode(bmp: u16) -> Option<usize> {
1198    let mut i = 0;
1199    while i < JIS0208_RANGE_TRIPLES.len() {
1200        let start = JIS0208_RANGE_TRIPLES[i + 2] as usize;
1201        let length = JIS0208_RANGE_TRIPLES[i + 1] as usize;
1202        let bmp_minus_start = (bmp as usize).wrapping_sub(start);
1203        if bmp_minus_start < length {
1204            let offset = JIS0208_RANGE_TRIPLES[i] as usize;
1205            return Some(bmp_minus_start + offset);
1206        }
1207        i += 3;
1208    }
1209    None
1210}
1211
1212pub fn jis0212_accented_decode(pointer: usize) -> Option<u16> {
1213    let mut i = 0;
1214    while i < JIS0212_ACCENTED_TRIPLES.len() {
1215        let start = JIS0212_ACCENTED_TRIPLES[i] as usize;
1216        let length = JIS0212_ACCENTED_TRIPLES[i + 1] as usize;
1217        let pointer_minus_start = pointer.wrapping_sub(start);
1218        if pointer_minus_start < length {
1219            let offset = JIS0212_ACCENTED_TRIPLES[i + 2] as usize;
1220            let candidate = JIS0212_ACCENTED[pointer_minus_start + offset];
1221            if candidate == 0 {
1222                return None;
1223            }
1224            return Some(candidate);
1225        }
1226        i += 3;
1227    }
1228    None
1229}
1230
1231#[inline(always)]
1232pub fn big5_is_astral(rebased_pointer: usize) -> bool {
1233    (BIG5_ASTRALNESS[rebased_pointer >> 5] & (1 << (rebased_pointer & 0x1F))) != 0
1234}
1235
1236#[inline(always)]
1237pub fn big5_low_bits(rebased_pointer: usize) -> u16 {
1238    if rebased_pointer < BIG5_LOW_BITS.len() {
1239        BIG5_LOW_BITS[rebased_pointer]
1240    } else {
1241        0
1242    }
1243}
1244
1245#[inline(always)]
1246pub fn big5_astral_encode(low_bits: u16) -> Option<usize> {
1247    match low_bits {
1248        0x00CC => Some(11205 - 942),
1249        0x008A => Some(11207 - 942),
1250        0x7607 => Some(11213 - 942),
1251        _ => {
1252            let mut i = 18997 - 942;
1253            while i < BIG5_LOW_BITS.len() - 1 {
1254                if BIG5_LOW_BITS[i] == low_bits && big5_is_astral(i) {
1255                    return Some(i);
1256                }
1257                i += 1;
1258            }
1259            None
1260        }
1261    }
1262}
1263
1264#[cfg(feature = "no-static-ideograph-encoder-tables")]
1265#[inline(always)]
1266pub fn big5_level1_hanzi_encode(bmp: u16) -> Option<(u8, u8)> {
1267    if super::in_inclusive_range16(bmp, 0x4E00, 0x9FB1) {
1268        if let Some(hanzi_pointer) = position(&BIG5_LOW_BITS[(5495 - 942)..(10951 - 942)], bmp) {
1269            let lead = hanzi_pointer / 157 + 0xA4;
1270            let remainder = hanzi_pointer % 157;
1271            let trail = if remainder < 0x3F {
1272                remainder + 0x40
1273            } else {
1274                remainder + 0x62
1275            };
1276            return Some((lead as u8, trail as u8));
1277        }
1278        match bmp {
1279            0x4E5A => {
1280                return Some((0xC8, 0x7B));
1281            }
1282            0x5202 => {
1283                return Some((0xC8, 0x7D));
1284            }
1285            0x9FB0 => {
1286                return Some((0xC8, 0xA1));
1287            }
1288            0x5188 => {
1289                return Some((0xC8, 0xA2));
1290            }
1291            0x9FB1 => {
1292                return Some((0xC8, 0xA3));
1293            }
1294            _ => {
1295                return None;
1296            }
1297        }
1298    }
1299    None
1300}
1301
1302#[cfg(not(feature = "no-static-ideograph-encoder-tables"))]
1303#[inline(always)]
1304pub fn big5_level1_hanzi_encode(bmp: u16) -> Option<(u8, u8)> {
1305    if super::in_inclusive_range16(bmp, 0x4E00, 0x9FB1) {
1306        match BIG5_LEVEL1_HANZI_CODE_POINTS.binary_search(&bmp) {
1307            Ok(i) => {
1308                let pair = &BIG5_LEVEL1_HANZI_BYTES[i];
1309                Some((pair[0], pair[1]))
1310            }
1311            Err(_) => None,
1312        }
1313    } else {
1314        None
1315    }
1316}
1317
1318#[inline(always)]
1319pub fn big5_box_encode(bmp: u16) -> Option<usize> {
1320    position(&BIG5_LOW_BITS[(18963 - 942)..(18992 - 942)], bmp).map(|x| x + 18963)
1321}
1322
1323#[inline(always)]
1324pub fn big5_other_encode(bmp: u16) -> Option<usize> {
1325    if 0x4491 == bmp {
1326        return Some(11209);
1327    }
1328    if let Some(pos) = position(&BIG5_LOW_BITS[(5024 - 942)..(5466 - 942)], bmp) {
1329        return Some(pos + 5024);
1330    }
1331    if let Some(pos) = position(&BIG5_LOW_BITS[(10896 - 942)..(11205 - 942)], bmp) {
1332        return Some(pos + 10896);
1333    }
1334    if let Some(pos) = position(&BIG5_LOW_BITS[(11254 - 942)..(18963 - 942)], bmp) {
1335        return Some(pos + 11254);
1336    }
1337    let mut i = 18996 - 942;
1338    while i < BIG5_LOW_BITS.len() {
1339        if BIG5_LOW_BITS[i] == bmp && !big5_is_astral(i) {
1340            return Some(i + 942);
1341        }
1342        i += 1;
1343    }
1344    None
1345}
1346
1347#[inline(always)]
1348pub fn mul_94(lead: u8) -> usize {
1349    lead as usize * 94
1350}
1351''')
1352
# Done emitting src/data.rs.
data_file.close()

# Variant
# Generate src/variant.rs: enums that wrap the concrete decoder/encoder
# implementations so `Decoder`/`Encoder` can be `Sized` (see the module
# doc comment in the emitted header below).

variant_file = open("src/variant.rs", "w")
variant_file.write('''// Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
// Instead, please regenerate using generate-encoding-data.py

//! This module provides enums that wrap the various decoders and encoders.
//! The purpose is to make `Decoder` and `Encoder` `Sized` by writing the
//! dispatch explicitly for a finite set of specialized decoders and encoders.
//! Unfortunately, this means the compiler doesn't generate the dispatch code
//! and it has to be written here instead.
//!
//! The purpose of making `Decoder` and `Encoder` `Sized` is to allow stack
//! allocation in Rust code, including the convenience methods on `Encoding`.

''')
1380
# Enumerate the encoding variants in emission order. UTF-16LE/BE are
# represented by the single "UTF-16" variant appended at the end.
encoding_variants = [u"single-byte"]
encoding_variants.extend(encoding["name"] for encoding in multi_byte
                         if encoding["name"] not in (u"UTF-16LE", u"UTF-16BE"))
encoding_variants.append(u"UTF-16")

# GBK has no decoder of its own (it reuses GB18030's).
decoder_variants = [variant for variant in encoding_variants
                    if variant != u"GBK"]

# replacement and UTF-16 have no encoder; GBK reuses GB18030's.
encoder_variants = [variant for variant in encoding_variants
                    if variant not in (u"replacement", u"GBK", u"UTF-16")]
1400
# Import each decoder's module contents, then open the VariantDecoder
# enum with one tuple variant per concrete decoder type.
for variant in decoder_variants:
  variant_file.write("use %s::*;\n" % to_snake_name(variant))

variant_file.write('''use super::*;

pub enum VariantDecoder {
''')

for variant in decoder_variants:
  variant_file.write("   %s(%sDecoder),\n" % (to_camel_name(variant), to_camel_name(variant)))

variant_file.write('''}

impl VariantDecoder {
''')
1416
def write_variant_method(name, mut, arg_list, ret, variants, excludes, kind):
  """Emit one dispatch method on VariantDecoder/VariantEncoder.

  name      -- Rust method name; also the method invoked on the wrapped value.
  mut       -- True to take `&mut self` and bind match payloads `ref mut`.
  arg_list  -- list of (argument name, Rust type) pairs.
  ret       -- Rust return type, or a falsy value for no return type.
  variants  -- encoding variant names to generate match arms for.
  excludes  -- variants whose arms become no-ops (`=> (),`).
  kind      -- "Decoder" or "Encoder"; names the enum being matched.
  """
  mut_prefix = "mut " if mut else ""
  pieces = ["pub fn %s(&%sself" % (name, mut_prefix)]
  for (arg_name, arg_type) in arg_list:
    pieces.append(", %s: %s" % (arg_name, arg_type))
  pieces.append(")")
  if ret:
    pieces.append(" -> %s" % ret)
  pieces.append(" {\nmatch *self {\n")
  for variant in variants:
    pieces.append("Variant%s::%s(ref %s" % (kind, to_camel_name(variant), mut_prefix))
    if variant in excludes:
      # Excluded variants swallow the call.
      pieces.append("v) => (),")
    else:
      forwarded = ", ".join(arg_name for (arg_name, _) in arg_list)
      pieces.append("v) => v.%s(%s),\n" % (name, forwarded))
  pieces.append("}\n}\n\n")
  variant_file.write("".join(pieces))
1444
# Decoder worst-case-size queries: non-mutating, single usize argument.
write_variant_method("max_utf16_buffer_length", False, [("byte_length", "usize")], "Option<usize>", decoder_variants, [], "Decoder")

write_variant_method("max_utf8_buffer_length_without_replacement", False, [("byte_length", "usize")], "Option<usize>", decoder_variants, [], "Decoder")

write_variant_method("max_utf8_buffer_length", False, [("byte_length", "usize")], "Option<usize>", decoder_variants, [], "Decoder")

# Decoding entry points: these mutate decoder state, hence mut=True.
write_variant_method("decode_to_utf16_raw", True, [("src", "&[u8]"),
                           ("dst", "&mut [u16]"),
                           ("last", "bool")], "(DecoderResult, usize, usize)", decoder_variants, [], "Decoder")

write_variant_method("decode_to_utf8_raw", True, [("src", "&[u8]"),
                           ("dst", "&mut [u8]"),
                           ("last", "bool")], "(DecoderResult, usize, usize)", decoder_variants, [], "Decoder")
1458
# Close the VariantDecoder impl and open the VariantEncoder enum.
variant_file.write('''
}

pub enum VariantEncoder {
''')

for variant in encoder_variants:
  variant_file.write("   %s(%sEncoder),\n" % (to_camel_name(variant), to_camel_name(variant)))

# has_pending_state is emitted as fixed text: only the ISO-2022-JP
# encoder reports pending state; every other variant answers false.
variant_file.write('''}

impl VariantEncoder {
    pub fn has_pending_state(&self) -> bool {
        match *self {
            VariantEncoder::Iso2022Jp(ref v) => {
                v.has_pending_state()
            }
            _ => false,
        }
    }
''')
1480
# Encoder worst-case-size queries and encoding entry points, mirroring
# the decoder methods emitted earlier.
write_variant_method("max_buffer_length_from_utf16_without_replacement", False, [("u16_length", "usize")], "Option<usize>", encoder_variants, [], "Encoder")

write_variant_method("max_buffer_length_from_utf8_without_replacement", False, [("byte_length", "usize")], "Option<usize>", encoder_variants, [], "Encoder")

write_variant_method("encode_from_utf16_raw", True, [("src", "&[u16]"),
                           ("dst", "&mut [u8]"),
                           ("last", "bool")], "(EncoderResult, usize, usize)", encoder_variants, [], "Encoder")

write_variant_method("encode_from_utf8_raw", True, [("src", "&str"),
                           ("dst", "&mut [u8]"),
                           ("last", "bool")], "(EncoderResult, usize, usize)", encoder_variants, [], "Encoder")
1492
1493
# Close the VariantEncoder impl, then emit VariantEncoding: one variant
# per multi-byte encoding, plus SingleByte carrying its 128-entry table.
variant_file.write('''}

pub enum VariantEncoding {
    SingleByte(&'static [u16; 128]),''')

for encoding in multi_byte:
  variant_file.write("%s,\n" % to_camel_name(encoding["name"]))

# The constructor bodies are fixed text: GBK shares Gb18030Decoder but
# gets its own encoder flag; UTF-16 and replacement have no encoder.
variant_file.write('''}

impl VariantEncoding {
    pub fn new_variant_decoder(&self) -> VariantDecoder {
        match *self {
            VariantEncoding::SingleByte(table) => SingleByteDecoder::new(table),
            VariantEncoding::Utf8 => Utf8Decoder::new(),
            VariantEncoding::Gbk | VariantEncoding::Gb18030 => Gb18030Decoder::new(),
            VariantEncoding::Big5 => Big5Decoder::new(),
            VariantEncoding::EucJp => EucJpDecoder::new(),
            VariantEncoding::Iso2022Jp => Iso2022JpDecoder::new(),
            VariantEncoding::ShiftJis => ShiftJisDecoder::new(),
            VariantEncoding::EucKr => EucKrDecoder::new(),
            VariantEncoding::Replacement => ReplacementDecoder::new(),
            VariantEncoding::UserDefined => UserDefinedDecoder::new(),
            VariantEncoding::Utf16Be => Utf16Decoder::new(true),
            VariantEncoding::Utf16Le => Utf16Decoder::new(false),
        }
    }

    pub fn new_encoder(&self, encoding: &'static Encoding) -> Encoder {
        match *self {
            VariantEncoding::SingleByte(table) => SingleByteEncoder::new(encoding, table),
            VariantEncoding::Utf8 => Utf8Encoder::new(encoding),
            VariantEncoding::Gbk => Gb18030Encoder::new(encoding, false),
            VariantEncoding::Gb18030 => Gb18030Encoder::new(encoding, true),
            VariantEncoding::Big5 => Big5Encoder::new(encoding),
            VariantEncoding::EucJp => EucJpEncoder::new(encoding),
            VariantEncoding::Iso2022Jp => Iso2022JpEncoder::new(encoding),
            VariantEncoding::ShiftJis => ShiftJisEncoder::new(encoding),
            VariantEncoding::EucKr => EucKrEncoder::new(encoding),
            VariantEncoding::UserDefined => UserDefinedEncoder::new(encoding),
            VariantEncoding::Utf16Be | VariantEncoding::Replacement |
            VariantEncoding::Utf16Le => unreachable!(),
        }
    }
}
''')

variant_file.close()
1542
# C FFI (../encoding_c/src/lib.rs): preserve the hand-written begin/end
# sections and regenerate only the part in between.
(ffi_rs_begin, ffi_rs_end) = read_non_generated("../encoding_c/src/lib.rs")

ffi_file = open("../encoding_c/src/lib.rs", "w")

ffi_file.write(ffi_rs_begin)
ffi_file.write("""
// Instead, please regenerate using generate-encoding-data.py

/// The minimum length of buffers that may be passed to `encoding_name()`.
pub const ENCODING_NAME_MAX_LENGTH: usize = %d; // %s

""" % (longest_name_length, longest_name))

# One exported static per preferred encoding name.
for name in preferred:
  ffi_file.write('''/// The %s encoding.
#[no_mangle]
pub static %s_ENCODING: ConstEncoding = ConstEncoding(&%s_INIT);

''' % (to_dom_name(name), to_constant_name(name), to_constant_name(name)))

ffi_file.write(ffi_rs_end)
ffi_file.close()
1565
# src/single_byte.rs: regenerate the exhaustive round-trip tests for
# every single-byte encoding between the non-generated markers.
(single_byte_rs_begin, single_byte_rs_end) = read_non_generated("src/single_byte.rs")

single_byte_file = open("src/single_byte.rs", "w")

single_byte_file.write(single_byte_rs_begin)
single_byte_file.write("""
// Instead, please regenerate using generate-encoding-data.py

    #[test]
    fn test_single_byte_decode() {""")

# ISO-8859-8-I is deliberately excluded from these tests.
for name in preferred:
  if name != u"ISO-8859-8-I" and is_single_byte(name):
    constant = to_constant_name(name)
    single_byte_file.write("""
        decode_single_byte(%s, %s_DATA);""" % (constant, constant))

single_byte_file.write("""
    }

    #[test]
    fn test_single_byte_encode() {""")

for name in preferred:
  if name != u"ISO-8859-8-I" and is_single_byte(name):
    constant = to_constant_name(name)
    single_byte_file.write("""
        encode_single_byte(%s, %s_DATA);""" % (constant, constant))


single_byte_file.write("""
    }
""")

single_byte_file.write(single_byte_rs_end)
single_byte_file.close()
1604
# C header (encoding_rs_statics.h): fully generated, so the whole file
# is written from scratch (no non-generated markers to preserve).
static_file = open("../encoding_c/include/encoding_rs_statics.h", "w")

static_file.write("""// Copyright 2016 Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// THIS IS A GENERATED FILE. PLEASE DO NOT EDIT.
// Instead, please regenerate using generate-encoding-data.py

// This file is not meant to be included directly. Instead, encoding_rs.h
// includes this file.

#ifndef encoding_rs_statics_h_
#define encoding_rs_statics_h_

#ifndef ENCODING_RS_ENCODING
#define ENCODING_RS_ENCODING Encoding
#ifndef __cplusplus
typedef struct Encoding_ Encoding;
#endif
#endif

#ifndef ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR
#define ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const ENCODING_RS_ENCODING*
#endif

#ifndef ENCODING_RS_ENCODER
#define ENCODING_RS_ENCODER Encoder
#ifndef __cplusplus
typedef struct Encoder_ Encoder;
#endif
#endif

#ifndef ENCODING_RS_DECODER
#define ENCODING_RS_DECODER Decoder
#ifndef __cplusplus
typedef struct Decoder_ Decoder;
#endif
#endif

#define INPUT_EMPTY 0

#define OUTPUT_FULL 0xFFFFFFFF

// %s
#define ENCODING_NAME_MAX_LENGTH %d

""" % (longest_name, longest_name_length))

# One extern declaration per preferred encoding, matching the statics
# exported from the FFI lib.rs.
for name in preferred:
  static_file.write('''/// The %s encoding.
extern ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR const %s_ENCODING;

''' % (to_dom_name(name), to_constant_name(name)))

static_file.write("""#endif // encoding_rs_statics_h_
""")
static_file.close()
1668
# src/utf_8.rs: regenerate the UTF8_TRAIL_INVALID lookup table between
# the non-generated markers.
(utf_8_rs_begin, utf_8_rs_end) = read_non_generated("src/utf_8.rs")

utf_8_file = open("src/utf_8.rs", "w")

utf_8_file.write(utf_8_rs_begin)
utf_8_file.write("""
// Instead, please regenerate using generate-encoding-data.py

/// Bit is 1 if the trail is invalid.
static UTF8_TRAIL_INVALID: [u8; 256] = [""")

# Each entry in this list is (bit position, inclusive valid range); the
# bit is set for every byte value outside its valid range.
trail_valid_ranges = [
    (3, 0x80, 0xBF),
    (4, 0xA0, 0xBF),
    (5, 0x80, 0x9F),
    (6, 0x90, 0xBF),
    (7, 0x80, 0x8F),
]
for byte in range(256):
  flags = 0
  for (bit, low, high) in trail_valid_ranges:
    if not (low <= byte <= high):
      flags |= (1 << bit)
  utf_8_file.write("%d," % flags)

utf_8_file.write("""
];
""")

utf_8_file.write(utf_8_rs_end)
utf_8_file.close()
1700
# Unit tests

# Header prepended to every generated test-data file below.
TEST_HEADER = '''Any copyright to the test code below this comment is dedicated to the
Public Domain. http://creativecommons.org/publicdomain/zero/1.0/

This is a generated file. Please do not edit.
Instead, please regenerate using generate-encoding-data.py
'''
1709
index = indexes["jis0208"]

# jis0208 decode input: one two-byte sequence per pointer, both bytes
# offset from 0xA1, in pointer order.
jis0208_in_file = open("src/test_data/jis0208_in.txt", "w")
jis0208_in_file.write(TEST_HEADER)
for ptr in xrange(94 * 94):
  row, cell = divmod(ptr, 94)
  jis0208_in_file.write("%s%s\n" % (chr(row + 0xA1), chr(cell + 0xA1)))
jis0208_in_file.close()

# Expected decode output: the mapped code point, or U+FFFD for pointers
# with no mapping in the index.
jis0208_in_ref_file = open("src/test_data/jis0208_in_ref.txt", "w")
jis0208_in_ref_file.write(TEST_HEADER)
for ptr in xrange(94 * 94):
  cp = index[ptr]
  if cp:
    jis0208_in_ref_file.write((u"%s\n" % unichr(cp)).encode("utf-8"))
  else:
    jis0208_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
jis0208_in_ref_file.close()
1730
# jis0208 encode test: every mapped code point paired with its expected
# two-byte encoding.
jis0208_out_file = open("src/test_data/jis0208_out.txt", "w")
jis0208_out_ref_file = open("src/test_data/jis0208_out_ref.txt", "w")
jis0208_out_file.write(TEST_HEADER)
jis0208_out_ref_file.write(TEST_HEADER)
for pointer in range(0, 94 * 94):
  code_point = index[pointer]
  if code_point:
    revised_pointer = pointer
    # For pointer 8644 and pointers 1207..1219 the reference expects the
    # first occurrence of the code point in the index (index.index()
    # returns the first match), not the pointer itself.
    if revised_pointer == 8644 or (revised_pointer >= 1207 and revised_pointer < 1220):
      revised_pointer = index.index(code_point)
    (lead, trail) = divmod(revised_pointer, 94)
    lead += 0xA1
    trail += 0xA1
    jis0208_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
    jis0208_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
jis0208_out_file.close()
jis0208_out_ref_file.close()
1748
# Shift_JIS decode input: every two-byte sequence in pointer order
# (lead rows of 188 pointers; split offsets skip the 0x7F trail byte).
shift_jis_in_file = open("src/test_data/shift_jis_in.txt", "w")
shift_jis_in_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
  (lead, trail) = divmod(pointer, 188)
  lead += 0x81 if lead < 0x1F else 0xC1
  trail += 0x40 if trail < 0x3F else 0x41
  shift_jis_in_file.write("%s%s\n" % (chr(lead), chr(trail)))
shift_jis_in_file.close()

shift_jis_in_ref_file = open("src/test_data/shift_jis_in_ref.txt", "w")
shift_jis_in_ref_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
  # Pointers 8836..10715 decode linearly into U+E000.. (Private Use
  # Area) instead of going through the index.
  code_point = 0xE000 - 8836 + pointer if pointer >= 8836 and pointer <= 10715 else index[pointer]
  if code_point:
    shift_jis_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
  else:
    # Unmapped pointer: U+FFFD. An ASCII-range trail byte is expected to
    # remain unconsumed by the error and so follows the replacement
    # character (WHATWG Shift_JIS decoder error handling).
    trail = pointer % 188
    trail += 0x40 if trail < 0x3F else 0x41
    if trail < 0x80:
      shift_jis_in_ref_file.write((u"\uFFFD%s\n" % unichr(trail)).encode("utf-8"))
    else:
      shift_jis_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
shift_jis_in_ref_file.close()
1772
# Shift_JIS encode test. Pointers 8272..8835 are skipped entirely.
shift_jis_out_file = open("src/test_data/shift_jis_out.txt", "w")
shift_jis_out_ref_file = open("src/test_data/shift_jis_out_ref.txt", "w")
shift_jis_out_file.write(TEST_HEADER)
shift_jis_out_ref_file.write(TEST_HEADER)
for pointer in range(0, 8272):
  code_point = index[pointer]
  if code_point:
    revised_pointer = pointer
    # For pointers 1207..1219 the reference expects the first occurrence
    # of the code point in the index.
    if revised_pointer >= 1207 and revised_pointer < 1220:
      revised_pointer = index.index(code_point)
    (lead, trail) = divmod(revised_pointer, 188)
    lead += 0x81 if lead < 0x1F else 0xC1
    trail += 0x40 if trail < 0x3F else 0x41
    shift_jis_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
    shift_jis_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
for pointer in range(8836, len(index)):
  code_point = index[pointer]
  if code_point:
    # From 8836 on, the reference expects the first occurrence of the
    # code point — unless that first occurrence falls in the skipped
    # 8272..8835 range, in which case the original pointer is kept.
    revised_pointer = index.index(code_point)
    if revised_pointer >= 8272 and revised_pointer < 8836:
      revised_pointer = pointer
    (lead, trail) = divmod(revised_pointer, 188)
    lead += 0x81 if lead < 0x1F else 0xC1
    trail += 0x40 if trail < 0x3F else 0x41
    shift_jis_out_ref_file.write("%s%s\n" % (chr(lead), chr(trail)))
    shift_jis_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
shift_jis_out_file.close()
shift_jis_out_ref_file.close()
1801
# ISO-2022-JP decode input: each pointer as a 7-bit pair (0x21 offset)
# wrapped in the escape sequences that switch into the two-byte mode
# (\x1B$B) and back to ASCII (\x1B(B).
iso_2022_jp_in_file = open("src/test_data/iso_2022_jp_in.txt", "w")
iso_2022_jp_in_file.write(TEST_HEADER)
for pointer in range(0, 94 * 94):
  (lead, trail) = divmod(pointer, 94)
  lead += 0x21
  trail += 0x21
  iso_2022_jp_in_file.write("\x1B$B%s%s\x1B(B\n" % (chr(lead), chr(trail)))
iso_2022_jp_in_file.close()

# Expected decode output: mapped code point or U+FFFD.
iso_2022_jp_in_ref_file = open("src/test_data/iso_2022_jp_in_ref.txt", "w")
iso_2022_jp_in_ref_file.write(TEST_HEADER)
for pointer in range(0, 94 * 94):
  code_point = index[pointer]
  if code_point:
    iso_2022_jp_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
  else:
    iso_2022_jp_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
iso_2022_jp_in_ref_file.close()

iso_2022_jp_out_file = open("src/test_data/iso_2022_jp_out.txt", "w")
iso_2022_jp_out_ref_file = open("src/test_data/iso_2022_jp_out_ref.txt", "w")
iso_2022_jp_out_file.write(TEST_HEADER)
iso_2022_jp_out_ref_file.write(TEST_HEADER)
for pointer in range(0, 94 * 94):
  code_point = index[pointer]
  if code_point:
    revised_pointer = pointer
    # Same duplicate-pointer handling as the jis0208 encode test: the
    # reference expects the first occurrence of the code point.
    if revised_pointer == 8644 or (revised_pointer >= 1207 and revised_pointer < 1220):
      revised_pointer = index.index(code_point)
    (lead, trail) = divmod(revised_pointer, 94)
    lead += 0x21
    trail += 0x21
    iso_2022_jp_out_ref_file.write("\x1B$B%s%s\x1B(B\n" % (chr(lead), chr(trail)))
    iso_2022_jp_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
# Half-width katakana (U+FF61 and up) are normalized through
# half_width_index to their full-width equivalents before encoding.
for i in xrange(len(half_width_index)):
  code_point = i + 0xFF61
  normalized_code_point = half_width_index[i]
  pointer = index.index(normalized_code_point)
  (lead, trail) = divmod(pointer, 94)
  lead += 0x21
  trail += 0x21
  iso_2022_jp_out_ref_file.write("\x1B$B%s%s\x1B(B\n" % (chr(lead), chr(trail)))
  iso_2022_jp_out_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
iso_2022_jp_out_file.close()
iso_2022_jp_out_ref_file.close()
1847
1848index = indexes["euc-kr"]
1849
1850euc_kr_in_file = open("src/test_data/euc_kr_in.txt", "w")
1851euc_kr_in_file.write(TEST_HEADER)
1852for pointer in range(0, len(index)):
1853  (lead, trail) = divmod(pointer, 190)
1854  lead += 0x81
1855  trail += 0x41
1856  euc_kr_in_file.write("%s%s\n" % (chr(lead), chr(trail)))
1857euc_kr_in_file.close()
1858
# EUC-KR decoder expectations: the mapped code point where the index has an
# entry; otherwise U+FFFD, followed by the unconsumed trail byte when that
# byte is ASCII (below 0x80).
euc_kr_in_ref_file = open("src/test_data/euc_kr_in_ref.txt", "w")
euc_kr_in_ref_file.write(TEST_HEADER)
for ptr in xrange(len(index)):
  cp = index[ptr]
  if cp:
    line = u"%s\n" % unichr(cp)
  else:
    trail_byte = ptr % 190 + 0x41
    if trail_byte < 0x80:
      line = u"\uFFFD%s\n" % unichr(trail_byte)
    else:
      line = u"\uFFFD\n"
  euc_kr_in_ref_file.write(line.encode("utf-8"))
euc_kr_in_ref_file.close()
1873
# EUC-KR encoder test data: UTF-8 input lines and the expected two-byte
# output lines, one pair per mapped pointer.
euc_kr_out_file = open("src/test_data/euc_kr_out.txt", "w")
euc_kr_out_ref_file = open("src/test_data/euc_kr_out_ref.txt", "w")
euc_kr_out_file.write(TEST_HEADER)
euc_kr_out_ref_file.write(TEST_HEADER)
for ptr, cp in enumerate(index):
  if not cp:
    continue
  lead_byte = 0x81 + ptr // 190
  trail_byte = 0x41 + ptr % 190
  euc_kr_out_ref_file.write("%s%s\n" % (chr(lead_byte), chr(trail_byte)))
  euc_kr_out_file.write((u"%s\n" % unichr(cp)).encode("utf-8"))
euc_kr_out_file.close()
euc_kr_out_ref_file.close()
1888
index = indexes["gb18030"]

# gb18030 decoder input: one two-byte sequence per index pointer. The trail
# byte space skips 0x7F, hence the split offset around position 0x3F.
gb18030_in_file = open("src/test_data/gb18030_in.txt", "w")
gb18030_in_file.write(TEST_HEADER)
for ptr in xrange(len(index)):
  lead_byte = 0x81 + ptr // 190
  pos = ptr % 190
  trail_byte = pos + (0x40 if pos < 0x3F else 0x41)
  gb18030_in_file.write("%s%s\n" % (chr(lead_byte), chr(trail_byte)))
gb18030_in_file.close()
1899
# gb18030 decoder expectations: the mapped code point where the index has an
# entry; otherwise U+FFFD, followed by the unconsumed trail byte when that
# byte is ASCII (below 0x80).
gb18030_in_ref_file = open("src/test_data/gb18030_in_ref.txt", "w")
gb18030_in_ref_file.write(TEST_HEADER)
for ptr in xrange(len(index)):
  cp = index[ptr]
  if cp:
    line = u"%s\n" % unichr(cp)
  else:
    pos = ptr % 190
    trail_byte = pos + (0x40 if pos < 0x3F else 0x41)
    if trail_byte < 0x80:
      line = u"\uFFFD%s\n" % unichr(trail_byte)
    else:
      line = u"\uFFFD\n"
  gb18030_in_ref_file.write(line.encode("utf-8"))
gb18030_in_ref_file.close()
1914
# gb18030 encoder test data: UTF-8 input lines and the expected two-byte
# output lines, one pair per mapped pointer.
gb18030_out_file = open("src/test_data/gb18030_out.txt", "w")
gb18030_out_ref_file = open("src/test_data/gb18030_out_ref.txt", "w")
gb18030_out_file.write(TEST_HEADER)
gb18030_out_ref_file.write(TEST_HEADER)
for ptr in xrange(len(index)):
  # Pointer 6555 is excluded from the encoder data. NOTE(review): presumably
  # its code point is encoded specially — confirm against the spec.
  if ptr == 6555:
    continue
  cp = index[ptr]
  if not cp:
    continue
  lead_byte = 0x81 + ptr // 190
  pos = ptr % 190
  trail_byte = pos + (0x40 if pos < 0x3F else 0x41)
  gb18030_out_ref_file.write("%s%s\n" % (chr(lead_byte), chr(trail_byte)))
  gb18030_out_file.write((u"%s\n" % unichr(cp)).encode("utf-8"))
gb18030_out_file.close()
gb18030_out_ref_file.close()
1931
index = indexes["big5"]

# Big5 decoder input: one two-byte sequence per index pointer. Rows are 157
# trail bytes wide; the trail space is split (0x40-0x7E, then 0xA1 onward).
big5_in_file = open("src/test_data/big5_in.txt", "w")
big5_in_file.write(TEST_HEADER)
for ptr in xrange(len(index)):
  lead_byte = 0x81 + ptr // 157
  pos = ptr % 157
  trail_byte = pos + (0x40 if pos < 0x3F else 0x62)
  big5_in_file.write("%s%s\n" % (chr(lead_byte), chr(trail_byte)))
big5_in_file.close()
1942
# Big5 pointers that decode to a *two*-code-point sequence (base letter plus
# combining mark) rather than a single code point.
big5_two_characters = {
  1133: u"\u00CA\u0304",
  1135: u"\u00CA\u030C",
  1164: u"\u00EA\u0304",
  1166: u"\u00EA\u030C",
}

# Big5 decoder expectations: the special two-character sequences above, the
# mapped code point where the index has an entry, and otherwise U+FFFD
# (followed by the unconsumed trail byte when that byte is ASCII).
big5_in_ref_file = open("src/test_data/big5_in_ref.txt", "w")
big5_in_ref_file.write(TEST_HEADER)
for pointer in range(0, len(index)):
  # Test membership on the dict directly: `in dict.keys()` builds a fresh
  # list and scans it linearly on every iteration under Python 2.
  if pointer in big5_two_characters:
    big5_in_ref_file.write((u"%s\n" % big5_two_characters[pointer]).encode("utf-8"))
    continue
  code_point = index[pointer]
  if code_point:
    big5_in_ref_file.write((u"%s\n" % unichr(code_point)).encode("utf-8"))
  else:
    trail = pointer % 157
    trail += 0x40 if trail < 0x3F else 0x62
    if trail < 0x80:
      big5_in_ref_file.write((u"\uFFFD%s\n" % unichr(trail)).encode("utf-8"))
    else:
      big5_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
big5_in_ref_file.close()
1967
# Code points that occur more than once in the Big5 index and whose *last*
# occurrence is the pointer the encoder must emit.
prefer_last = [
  0x2550,
  0x255E,
  0x2561,
  0x256A,
  0x5341,
  0x5345,
]

# Parallel list: the last-occurrence pointer for each prefer_last entry.
pointer_for_prefer_last = []

for code_point in prefer_last:
  # Scan backwards by hand; Python lists don't have .rindex() :-(
  i = len(index) - 1
  while i >= 0:
    if index[i] == code_point:
      pointer_for_prefer_last.append(i)
      break
    i -= 1
1986
# Big5 encoder test data: UTF-8 input lines and the expected two-byte output
# lines. Only pointers from lead 0xA1 upward participate in encode.
big5_out_file = open("src/test_data/big5_out.txt", "w")
big5_out_ref_file = open("src/test_data/big5_out_ref.txt", "w")
big5_out_file.write(TEST_HEADER)
big5_out_ref_file.write(TEST_HEADER)
for ptr in xrange((0xA1 - 0x81) * 157, len(index)):
  cp = index[ptr]
  if not cp:
    continue
  # For duplicated code points, emit only the canonical pointer: the last
  # occurrence for prefer_last entries, the first occurrence otherwise.
  if cp in prefer_last:
    canonical = pointer_for_prefer_last[prefer_last.index(cp)]
  else:
    canonical = index.index(cp)
  if ptr != canonical:
    continue
  lead_byte = 0x81 + ptr // 157
  pos = ptr % 157
  trail_byte = pos + (0x40 if pos < 0x3F else 0x62)
  big5_out_ref_file.write("%s%s\n" % (chr(lead_byte), chr(trail_byte)))
  big5_out_file.write((u"%s\n" % unichr(cp)).encode("utf-8"))
big5_out_file.close()
big5_out_ref_file.close()
2007
index = indexes["jis0212"]

# JIS X 0212 decoder input: the 0x8F single-shift byte followed by the
# two-byte sequence for every pointer (94-wide rows, bytes from 0xA1).
jis0212_in_file = open("src/test_data/jis0212_in.txt", "w")
jis0212_in_file.write(TEST_HEADER)
for ptr in xrange(len(index)):
  lead_byte = 0xA1 + ptr // 94
  trail_byte = 0xA1 + ptr % 94
  jis0212_in_file.write("\x8F%s%s\n" % (chr(lead_byte), chr(trail_byte)))
jis0212_in_file.close()
2018
# JIS X 0212 decoder expectations: the mapped code point where the index has
# an entry, U+FFFD otherwise.
jis0212_in_ref_file = open("src/test_data/jis0212_in_ref.txt", "w")
jis0212_in_ref_file.write(TEST_HEADER)
for cp in index:
  if cp:
    jis0212_in_ref_file.write((u"%s\n" % unichr(cp)).encode("utf-8"))
  else:
    jis0212_in_ref_file.write(u"\uFFFD\n".encode("utf-8"))
jis0212_in_ref_file.close()
2028
# Re-format the generated Rust sources so the emitted tables conform to the
# project's rustfmt style. Argument-list form avoids shell interpretation.
subprocess.call(["cargo", "fmt"])
2030