1# frozen_string_literal: true
2
3require 'optparse'
4require 'erb'
5require 'fileutils'
6require 'pp'
7
class Array
  # Backport of Array#product, installed only when the running Ruby does not
  # already provide it.
  unless [].respond_to? :product
    # Returns every combination made of one element of the receiver followed
    # by one element of each array in +args+, in receiver-major order.
    def product(*args)
      return map {|elem| [elem] } if args.empty?

      combos = []
      each do |head|
        args.first.product(*args[1..-1]).each {|tail| combos << [head, *tail] }
      end
      combos
    end
  end
end
23
class String
  # Backport of String#start_with?, installed only when the running Ruby does
  # not already provide it.
  unless "".respond_to? :start_with?
    # True when the receiver begins with at least one of +prefixes+.
    def start_with?(*prefixes)
      prefixes.any? {|prefix|
        prefix.length <= length && self[0, prefix.length] == prefix
      }
    end
  end
end
34
# Number of word-array elements occupied by one BYTE_LOOKUP node
# (its offsets reference and its infos reference).
NUM_ELEM_BYTELOOKUP = 2

# Maps a single character/byte to its escaped spelling inside a C string
# literal.  The readable escapes are listed first; every remaining control
# byte and every byte from 0x7f up is added as a three-digit octal escape.
C_ESC = {
  "\\" => "\\\\",
  '"' => '\"',
  "\n" => '\n',
}

(0x00..0x1f).each do |code|
  byte = [code].pack("C")
  # Do not override the readable escapes registered above.
  C_ESC[byte] = "\\%03o" % code unless C_ESC[byte]
end
(0x7f..0xff).each do |code|
  C_ESC[[code].pack("C")] = "\\%03o" % code
end

# Matches any single character that has an entry in C_ESC.
C_ESC_PAT = Regexp.union(*C_ESC.keys)

# Returns +str+ as a double-quoted C string literal.
def c_esc(str)
  escaped = str.gsub(C_ESC_PAT) {|ch| C_ESC[ch] }
  '"' + escaped + '"'
end

# Matches exactly two hexadecimal digits, i.e. one byte.
HEX2 = /(?:[0-9A-Fa-f]{2})/
52
# Accumulates the source text of one static C array together with the number
# of elements that text declares.
class ArrayCode
  # +type+ is the C element type and +name+ the C identifier of the array.
  def initialize(type, name)
    @type, @name = type, name
    @length = 0
    @body = ''.dup
  end

  # Number of array elements appended so far.
  attr_reader :length

  # Appends the source fragment +str+, which accounts for +num+ elements.
  # Returns the new element count.
  def insert_at_last(num, str)
    @body << str
    @length += num
  end

  # Renders the complete C definition of the array.
  def to_s
    "static const #{@type}\n" \
    "#{@name}[#{@length}] = {\n" \
    "#{@body}};\n"
  end
end
79
# Leaf of a lookup tree: wraps the action chosen for a complete byte sequence.
# Two Actions are equal (and hash alike) when their values are equal.
class Action
  attr_reader :value

  def initialize(value)
    @value = value
  end

  def hash
    value.hash
  end

  def eql?(other)
    return false unless self.class == other.class
    @value == other.value
  end
  alias == eql?
end
96
# Inner edge of a lookup tree: the inclusive byte range byte_min..byte_max
# leads to child_tree.  The hash is computed once at construction and is also
# used as a cheap first check when comparing two branches.
class Branch
  attr_reader :byte_min, :byte_max, :child_tree, :hash

  def initialize(byte_min, byte_max, child_tree)
    @byte_min, @byte_max, @child_tree = byte_min, byte_max, child_tree
    @hash = byte_min.hash ^ byte_max.hash ^ child_tree.hash
  end

  def eql?(other)
    return false unless self.class == other.class
    return false unless @hash == other.hash
    @byte_min == other.byte_min &&
      @byte_max == other.byte_max &&
      @child_tree == other.child_tree
  end
  alias == eql?
end
115
# Compiles a mapping from byte patterns to actions into a lookup tree and
# emits the C table source for that tree.
#
# A "rect" is a three-element array [min, max, action]; min and max are
# upper-case hex strings as produced by parse_to_rects.  A tree is either an
# Action (leaf) or an Array of Branch objects.
class ActionMap
  # Converts +mapping+ (an enumerable of [pattern, action] pairs) into rects.
  #
  # Accepted patterns:
  #   "(empset)"          matches nothing and contributes no rect
  #   "(empstr)"          the empty byte sequence
  #   "E0A1"              one literal byte sequence
  #   "e0{a0-bf,c1} 41"   whitespace separated alternatives; each position is
  #                       a literal byte or a {..} set of bytes / byte ranges
  #
  # Raises ArgumentError for any other pattern.
  def self.parse_to_rects(mapping)
    rects = []
    mapping.each {|pat, action|
      pat = pat.to_s
      case pat
      when /\A\s*\(empset\)\s*\z/
        # The empty set matches nothing, so there is nothing to record.
      when /\A\s*\(empstr\)\s*\z/
        rects << ['', '', action]
      when /\A\s*(#{HEX2}+)\s*\z/o
        hex = $1.upcase
        rects << [hex, hex, action]
      when /\A\s*((#{HEX2}|\{#{HEX2}(?:-#{HEX2})?(,#{HEX2}(?:-#{HEX2})?)*\})+(\s+|\z))*\z/o
        pat.upcase.scan(/\S+/) {|alternative|
          # One list of [min, max] byte ranges per byte position.
          ranges_list = []
          alternative.scan(/#{HEX2}|\{([^\}]*)\}/o) {
            literal, set_spec = $&, $1
            ranges_list << (set_spec ? hex_set_ranges(set_spec) : [[literal, literal]])
          }
          # Every combination of one range per position becomes one rect.
          first_ranges = ranges_list.shift
          first_ranges.product(*ranges_list).each {|range_list|
            min = range_list.map {|x, y| x }.join
            max = range_list.map {|x, y| y }.join
            rects << [min, max, action]
          }
        }
      else
        raise ArgumentError, "invalid pattern: #{pat.inspect}"
      end
    }
    rects
  end

  # Expands the inside of a "{..}" set (e.g. "A0-BF,C1") into the sorted list
  # of maximal contiguous [min, max] ranges, each bound a two-digit hex string.
  def self.hex_set_ranges(set_spec)
    members = {}
    set_spec.scan(/(#{HEX2})(?:-(#{HEX2}))?/o) {|first, last|
      first.to_i(16).upto((last || first).to_i(16)) {|byte| members[byte] = true }
    }
    ranges = []
    run_start = nil
    0.upto(0xFF) {|byte|
      next unless members[byte]
      run_start ||= byte
      next if members[byte + 1]
      ranges << ["%02X" % run_start, "%02X" % byte]
      run_start = nil
    }
    ranges
  end
  private_class_method :hex_set_ranges

  # Reduces the candidate +actions0+ to the single action they agree on.
  # :nomap0 is low priority and is dropped when it competes with exactly one
  # other action.  Raises ArgumentError when no single action remains.
  def self.unambiguous_action(actions0)
    actions = actions0.uniq
    if actions.length == 1
      actions[0]
    else
      actions.delete(:nomap0)
      if actions.length == 1
        actions[0]
      else
        raise ArgumentError, "ambiguous actions: #{actions0.inspect}"
      end
    end
  end

  # Builds a tree from +rects+, resolving overlaps with unambiguous_action.
  def self.build_tree(rects)
    expand(rects) {|prefix, actions|
      unambiguous_action(actions)
    }
  end

  # Parses +mapping+ and returns the ActionMap for it.
  def self.parse(mapping)
    rects = parse_to_rects(mapping)
    tree = build_tree(rects)
    self.new(tree)
  end

  # Merges two or more rect lists into one ActionMap.  For every complete
  # byte sequence the block receives the prefix followed by one array of
  # candidate actions per input list and returns the action to use.
  # Raises ArgumentError when fewer than two lists are given.
  def self.merge_rects(*rects_list)
    if rects_list.length < 2
      raise ArgumentError, "not enough arguments"
    end

    # Tag every action with the index of the list it came from.
    all_rects = []
    rects_list.each_with_index {|rects, i|
      all_rects.concat rects.map {|min, max, action| [min, max, [i, action]] }
    }

    tree = expand(all_rects) {|prefix, actions|
      args = Array.new(rects_list.length) { [] }
      actions.each {|i, action|
        args[i] << action
      }
      yield(prefix, *args)
    }

    self.new(tree)
  end

  # Like merge_rects, but takes unparsed mappings.
  def self.merge(*mappings, &block)
    merge_rects(*mappings.map {|m| parse_to_rects(m) }, &block)
  end

  # Two-mapping variant of merge.  The block receives
  # (prefix, actions_from_map1, actions_from_map2).
  def self.merge2(map1, map2, &block)
    rects1 = parse_to_rects(map1)
    rects2 = parse_to_rects(map2)

    actions = []
    all_rects = []

    # Replace each rect's action by its index in +actions+; indexes below
    # +boundary+ belong to map1, the rest to map2.
    rects1.each {|rect|
      _, _, action = rect
      rect[2] = actions.length
      actions << action
      all_rects << rect
    }

    boundary = actions.length

    rects2.each {|rect|
      _, _, action = rect
      rect[2] = actions.length
      actions << action
      all_rects << rect
    }

    tree = expand(all_rects) {|prefix, as0|
      as1 = []
      as2 = []
      as0.each {|i|
        if i < boundary
          as1 << actions[i]
        else
          as2 << actions[i]
        end
      }
      yield(prefix, as1, as2)
    }

    self.new(tree)
  end

  # Builds the tree for +rects+.  Rects whose min equals max ("singletons")
  # are kept in @singleton_rects, sorted so that the smallest is last, and are
  # consumed by expand_rec; all other rects are passed down as region rects.
  # The block is called as block.(prefix, actions) for every leaf.
  #
  # The shared scratch state is reset even when the block or the expansion
  # raises, so a rescued error cannot corrupt a later call.
  def self.expand(rects, &block)
    singleton_rects, region_rects = rects.partition {|min, max, _| min == max }
    @singleton_rects = singleton_rects.sort_by {|min, _, _| min }
    @singleton_rects.reverse!
    expand_rec("", region_rects, &block)
  ensure
    @singleton_rects = nil
    TMPHASH.clear
  end

  # Scratch hash shared by expand_rec and each_firstbyte_range; each of them
  # empties it again once it is done with it.
  TMPHASH = {}

  # Recursively builds the subtree below +prefix+.
  #
  # +region_rects+ are the region rects that apply below +prefix+ with the
  # already consumed leading bytes removed.  Returns an Action when +prefix+
  # is a complete pattern, an Array of Branch objects otherwise, and
  # +region_rects+ itself (empty) when neither a region rect nor a pending
  # singleton rect lies below +prefix+.
  #
  # Raises ArgumentError ("ambiguous pattern") when a complete pattern and a
  # longer pattern meet at the same prefix.
  def self.expand_rec(prefix, region_rects, &block)
    return region_rects if region_rects.empty? && !((s_rect = @singleton_rects.last) && s_rect[0].start_with?(prefix))
    # Leaf test: with no region rects the pending singleton must be exactly
    # +prefix+; otherwise the first region rect must have no bytes left.
    if region_rects.empty? ? s_rect[0].length == prefix.length : region_rects[0][0].empty?
      h = TMPHASH
      # Collect (and consume) every singleton that ends at this prefix.
      while (s_rect = @singleton_rects.last) && s_rect[0].start_with?(prefix)
        min, _, action = @singleton_rects.pop
        raise ArgumentError, "ambiguous pattern: #{prefix}" if min.length != prefix.length
        h[action] = true
      end
      for min, _, action in region_rects
        raise ArgumentError, "ambiguous pattern: #{prefix}" if !min.empty?
        h[action] = true
      end
      tree = Action.new(block.call(prefix, h.keys))
      h.clear
    else
      tree = []
      each_firstbyte_range(prefix, region_rects) {|byte_min, byte_max, r_rects2|
        if byte_min == byte_max
          prefix2 = prefix + "%02X" % byte_min
        else
          prefix2 = prefix + "{%02X-%02X}" % [byte_min, byte_max]
        end
        child_tree = expand_rec(prefix2, r_rects2, &block)
        tree << Branch.new(byte_min, byte_max, child_tree)
      }
    end
    return tree
  end

  # Splits the next byte position below +prefix+ into ranges and yields
  # (byte_min, byte_max, rects) for each of them, lowest byte first.  +rects+
  # are the region rects covering that range with their first byte removed.
  # A byte at which a pending singleton rect continues is always yielded as a
  # range of its own.
  #
  # This method only peeks at @singleton_rects; it relies on the yielded block
  # (expand_rec) to pop the singletons it consumes.
  def self.each_firstbyte_range(prefix, region_rects)
    index_from = TMPHASH

    # Record every byte at which the set of covering region rects can change:
    # the first byte of each rect and the byte just past its last first byte.
    region_ary = []
    region_rects.each {|min, max, action|
      raise ArgumentError, "ambiguous pattern: #{prefix}" if min.empty?
      min_firstbyte = min[0,2].to_i(16)
      min_rest = min[2..-1]
      max_firstbyte = max[0,2].to_i(16)
      max_rest = max[2..-1]
      region_ary << [min_firstbyte, max_firstbyte, [min_rest, max_rest, action]]
      index_from[min_firstbyte] = true
      index_from[max_firstbyte+1] = true
    }

    # Number the boundaries in descending byte order so that popping from the
    # end of the arrays below walks them in ascending order.
    byte_from = Array.new(index_from.size)
    bytes = index_from.keys
    bytes.sort!
    bytes.reverse!
    bytes.each_with_index {|byte, i|
      index_from[byte] = i
      byte_from[i] = byte
    }

    # region_rects_ary[i] collects the rects covering the interval that starts
    # at boundary byte_from[i].
    region_rects_ary = Array.new(index_from.size) { [] }
    region_ary.each {|min_firstbyte, max_firstbyte, rest_elt|
      index_from[min_firstbyte].downto(index_from[max_firstbyte+1]+1) {|i|
        region_rects_ary[i] << rest_elt
      }
    }

    index_from.clear

    r_rects = region_rects_ary.pop
    region_byte = byte_from.pop
    prev_r_start = region_byte
    prev_r_rects = []
    # Merge the region boundaries with the pending singletons below +prefix+.
    while r_rects && (s_rect = @singleton_rects.last) && (seq = s_rect[0]).start_with?(prefix)
      singleton_byte = seq[prefix.length, 2].to_i(16)
      min_byte = singleton_byte < region_byte ? singleton_byte : region_byte
      if prev_r_start < min_byte && !prev_r_rects.empty?
        yield prev_r_start, min_byte-1, prev_r_rects
      end
      if region_byte < singleton_byte
        prev_r_start = region_byte
        prev_r_rects = r_rects
        r_rects = region_rects_ary.pop
        region_byte = byte_from.pop
      elsif region_byte > singleton_byte
        yield singleton_byte, singleton_byte, prev_r_rects
        prev_r_start = singleton_byte+1
      else # region_byte == singleton_byte
        prev_r_start = region_byte+1
        prev_r_rects = r_rects
        r_rects = region_rects_ary.pop
        region_byte = byte_from.pop
        yield singleton_byte, singleton_byte, prev_r_rects
      end
    end

    # No singleton left below +prefix+: emit the remaining region intervals.
    while r_rects
      if prev_r_start < region_byte && !prev_r_rects.empty?
        yield prev_r_start, region_byte-1, prev_r_rects
      end
      prev_r_start = region_byte
      prev_r_rects = r_rects
      r_rects = region_rects_ary.pop
      region_byte = byte_from.pop
    end

    # No region boundary left: emit the remaining singletons on their own.
    while (s_rect = @singleton_rects.last) && (seq = s_rect[0]).start_with?(prefix)
      singleton_byte = seq[prefix.length, 2].to_i(16)
      yield singleton_byte, singleton_byte, []
    end
  end

  # +tree+ is an Action or an Array of Branch objects.
  def initialize(tree)
    @tree = tree
  end

  def inspect
    "\#<#{self.class}:" +
    @tree.inspect +
    ">"
  end

  # Depth of +tree+ counted in branch levels; an Action has depth 0.
  def max_input_length_rec(tree)
    case tree
    when Action
      0
    else
      tree.map {|branch|
        max_input_length_rec(branch.child_tree)
      }.max + 1
    end
  end

  # Length in bytes of the longest input sequence described by this map.
  def max_input_length
    max_input_length_rec(@tree)
  end

  # The action for the empty byte sequence, i.e. the value of the tree when
  # the tree is a bare Action; nil otherwise.
  def empty_action
    if @tree.kind_of? Action
      @tree.value
    else
      nil
    end
  end

  # Names of the offsets / infos tables already emitted, keyed by content, so
  # that identical tables are shared between lookup nodes.
  OffsetsMemo = {}
  InfosMemo = {}

  # Formats the offsets of bytes min..max as C source: a "min, max," header
  # line followed by rows of up to 16 values split into two groups of 8.
  def format_offsets(min, max, offsets)
    offsets = offsets[min..max]
    code = "%d, %d,\n" % [min, max]
    0.step(offsets.length-1,16) {|i|
      code << "    "
      code << offsets[i,8].map {|off| "%3d," % off.to_s }.join('')
      if i+8 < offsets.length
        code << "  "
        code << offsets[i+8,8].map {|off| "%3d," % off.to_s }.join('')
      end
      code << "\n"
    }
    code
  end

  # C macro names already handed out by str_name.
  UsedName = {}

  # Macro name chosen for each output byte string (hex) emitted so far.
  StrMemo = {}

  # Picks an unused macro name for the output string +bytes+ (hex digits),
  # records it in StrMemo and UsedName, and returns it.  Preference order:
  # the identifier characters of the raw bytes, the hex digits, and finally
  # the current size of the byte array.
  def str_name(bytes)
    size = @bytes_code.length
    rawbytes = [bytes].pack("H*")

    n = nil
    if !n && !(suf = rawbytes.gsub(/[^A-Za-z0-9_]/, '')).empty? && !UsedName[nn = "str1_" + suf] then n = nn end
    if !n && !UsedName[nn = "str1_" + bytes] then n = nn end
    n ||= "str1s_#{size}"

    StrMemo[bytes] = n
    UsedName[n] = true
    n
  end

  # Returns the macro name that refers to the output string +bytes+, emitting
  # the string into the byte array the first time it is requested.
  def gen_str(bytes)
    if n = StrMemo[bytes]
      n
    else
      len = bytes.length/2
      size = @bytes_code.length
      n = str_name(bytes)
      @bytes_code.insert_at_last(1 + len,
        "\#define #{n} makeSTR1(#{size})\n" +
        "    makeSTR1LEN(#{len})," + bytes.gsub(/../, ' 0x\&,') + "\n\n")
      n
    end
  end

  # Translates one action into the C expression stored in an infos table.
  # Raises RuntimeError for an action it does not recognize.
  def generate_info(info)
    case info
    when :nomap, :nomap0
      # :nomap0 is low priority.  it never collides.
      "NOMAP"
    when :undef
      "UNDEF"
    when :invalid
      "INVALID"
    when :func_ii
      "FUNii"
    when :func_si
      "FUNsi"
    when :func_io
      "FUNio"
    when :func_so
      "FUNso"
    when /\A(#{HEX2})\z/o
      "o1(0x#$1)"
    when /\A(#{HEX2})(#{HEX2})\z/o
      "o2(0x#$1,0x#$2)"
    when /\A(#{HEX2})(#{HEX2})(#{HEX2})\z/o
      "o3(0x#$1,0x#$2,0x#$3)"
    when /funsio\((\d+)\)/
      "funsio(#{$1})"
    when /\A(#{HEX2})(3[0-9])(#{HEX2})(3[0-9])\z/o
      "g4(0x#$1,0x#$2,0x#$3,0x#$4)"
    when /\A(f[0-7])(#{HEX2})(#{HEX2})(#{HEX2})\z/o
      "o4(0x#$1,0x#$2,0x#$3,0x#$4)"
    when /\A(#{HEX2}){4,259}\z/o
      gen_str(info.upcase)
    when /\A\/\*BYTE_LOOKUP\*\// # pointer to BYTE_LOOKUP structure
      $'.to_s
    else
      raise "unexpected action: #{info.inspect}"
    end
  end

  # Formats +infos+ as right-aligned C source, four entries per row when
  # every entry is at most 16 characters wide and two per row otherwise.
  def format_infos(infos)
    infos = infos.map {|info| generate_info(info) }
    maxlen = infos.map {|info| info.length }.max
    columns = maxlen <= 16 ? 4 : 2
    code = "".dup
    0.step(infos.length-1, columns) {|i|
      code << "    "
      is = infos[i,columns]
      is.each {|info|
        code << sprintf(" %#{maxlen}s,", info)
      }
      code << "\n"
    }
    code
  end

  # Emits the lookup node +name+ for +table+, an array indexed by byte value
  # whose entries are actions (nil counts as :invalid).  The distinct actions
  # form the infos table and the offsets table maps each byte in the first to
  # last non-:invalid range to its index in infos; both are reused through
  # OffsetsMemo / InfosMemo when an identical table was emitted before.
  def generate_lookup_node(name, table)
    bytes_code = @bytes_code
    words_code = @words_code
    offsets = []
    infos = []
    infomap = {}
    min = max = nil
    table.each_with_index {|action, byte|
      action ||= :invalid
      if action != :invalid
        min = byte if !min
        max = byte
      end
      unless o = infomap[action]
        infomap[action] = o = infos.length
        infos[o] = action
      end
      offsets[byte] = o
    }
    infomap.clear
    if !min
      min = max = 0
    end

    offsets_key = [min, max, offsets[min..max]]
    if n = OffsetsMemo[offsets_key]
      offsets_name = n
    else
      offsets_name = "#{name}_offsets"
      OffsetsMemo[offsets_key] = offsets_name
      size = bytes_code.length
      bytes_code.insert_at_last(2+max-min+1,
        "\#define #{offsets_name} #{size}\n" +
        format_offsets(min,max,offsets) + "\n")
    end

    if n = InfosMemo[infos]
      infos_name = n
    else
      infos_name = "#{name}_infos"
      InfosMemo[infos] = infos_name

      size = words_code.length
      words_code.insert_at_last(infos.length,
        "\#define #{infos_name} WORDINDEX2INFO(#{size})\n" +
        format_infos(infos) + "\n")
    end

    size = words_code.length
    words_code.insert_at_last(NUM_ELEM_BYTELOOKUP,
      "\#define #{name} WORDINDEX2INFO(#{size})\n" +
      <<"End" + "\n")
    #{offsets_name},
    #{infos_name},
End
  end

  # Names of the nodes already generated, keyed by tree, so that identical
  # subtrees are emitted only once.
  PreMemo = {}
  # Suffix for the next automatically named node.  It is advanced in place
  # with succ!, so it must be an unfrozen string even though this file
  # enables frozen string literals.
  NextName = "a".dup

  # Emits the lookup node for @tree (and, recursively, for its subtrees) and
  # returns its name.  +name_hint+ is used as the node name when given;
  # otherwise the name is "fun_" followed by the next automatic suffix.
  def generate_node(name_hint=nil)
    if n = PreMemo[@tree]
      return n
    end

    table = Array.new(0x100, :invalid)
    @tree.each {|branch|
      byte_min, byte_max, child_tree = branch.byte_min, branch.byte_max, branch.child_tree
      rest = ActionMap.new(child_tree)
      if a = rest.empty_action
        table.fill(a, byte_min..byte_max)
      else
        name_hint2 = nil
        if name_hint
          name_hint2 = "#{name_hint}_#{byte_min == byte_max ? '%02X' % byte_min : '%02Xto%02X' % [byte_min, byte_max]}"
        end
        v = "/*BYTE_LOOKUP*/" + rest.gennode(@bytes_code, @words_code, name_hint2)
        table.fill(v, byte_min..byte_max)
      end
    }

    if !name_hint
      name_hint = "fun_" + NextName
      NextName.succ!
    end

    PreMemo[@tree] = name_hint

    generate_lookup_node(name_hint, table)
    name_hint
  end

  # Generates the node for this map into +bytes_code+ / +words_code+ (objects
  # responding to length and insert_at_last) and returns the node name.
  def gennode(bytes_code, words_code, name_hint=nil)
    @bytes_code = bytes_code
    @words_code = words_code
    name = generate_node(name_hint)
    @bytes_code = nil
    @words_code = nil
    return name
  end
end
635
# Converts a Citrus (csid, index) pair to the corresponding "mskanji"
# (Shift_JIS style) code and returns it as a lower-case hex string.
#
# csid 0 passes +index+ through and csid 1 adds 0x80.  For csid 2 and 3
# +index+ holds a row in its high byte and a column in its low byte (both
# 0x21 based), which are folded into a lead/trail byte pair.
# Raises RuntimeError ("invalid byte sequence") for rows or columns outside
# the ranges accepted for the given csid.
def citrus_mskanji_cstomb(csid, index)
  code =
    case csid
    when 0
      index
    when 1
      index + 0x80
    when 2, 3
      row = index >> 8
      col = index & 0xFF
      raise "invalid byte sequence" if row < 0x21
      offset =
        if csid == 3
          if row <= 0x2F
            (row == 0x22 || row >= 0x26) ? 0xED : 0xF0
          elsif (0x4D..0x7E).include?(row)
            0xCE
          else
            raise "invalid byte sequence"
          end
        else
          raise "invalid byte sequence" if row > 0x97
          row < 0x5F ? 0x81 : 0xC1
        end
      raise "invalid byte sequence" unless (0x21..0x7E).include?(col)

      row -= 0x21
      col -= 0x21
      if row.even?
        col += 0x40
        col += 1 if col >= 0x7F # 0x7F is skipped in the trail byte
      else
        col += 0x9F
      end
      ((row / 2 + offset) << 8) | col
    end
  code.to_s(16)
end
672
# Converts a Citrus (csid, index) pair to the corresponding EUC code and
# returns it as a lower-case hex string.  The csid selects which bits
# (high bits and/or single-shift prefix) are OR-ed onto +index+.
def citrus_euc_cstomb(csid, index)
  encoded =
    case csid
    when 0x0000 then index
    when 0x8080 then index | 0x8080
    when 0x0080 then index | 0x8E80
    when 0x8000 then index | 0x8F8080
    end
  encoded.to_s(16)
end
685
# Converts a Citrus (csid, index) pair to a "stateless_iso" code: the high
# bit of the two low bytes is set and +csid+ is placed above them.  Returns
# the code as a lower-case hex string.
def citrus_stateless_iso_cstomb(csid, index)
  code = index | 0x8080 | (csid << 16)
  code.to_s(16)
end
689
# Dispatches to the converter for the character encoding scheme +ces+
# ('mskanji', 'euc' or 'stateless_iso').  Returns nil for any other scheme.
def citrus_cstomb(ces, csid, index)
  return citrus_mskanji_cstomb(csid, index) if ces == 'mskanji'
  return citrus_euc_cstomb(csid, index) if ces == 'euc'
  citrus_stateless_iso_cstomb(csid, index) if ces == 'stateless_iso'
end
700
# Sub-directories of $srcdir that hold the Citrus map sources; a map source
# lives in the directory whose name is a prefix of the charset name.
SUBDIR = %w/APPLE AST BIG5 CNS CP EBCDIC EMOJI GB GEORGIAN ISO646 ISO-8859 JIS KAZAKH KOI KS MISC TCVN/


# Loads one or more Citrus map source files and returns the combined table
# as an array of [from, to] pairs.
#
# +mapsrcs+ is a comma separated list of names such as "JISX0201-KANA/UCS"
# (charset to UCS) or "UCS/JISX0201-KANA" (UCS to charset); "UCS" may carry a
# plane suffix such as "UCS@SIP".  The charset side of each pair is converted
# with citrus_cstomb(ces, csid, code); the UCS side is an Integer code point
# with the plane number in bits 16 and up.
#
# Raises RuntimeError on a line that is neither a mapping nor (for UCS
# sources) an INVALID range, and whatever File.open raises when the file
# cannot be read.
def citrus_decode_mapsrc(ces, csid, mapsrcs)
  table = []
  mapsrcs.split(',').each do |mapsrc|
    path = [$srcdir]
    mode = nil
    # rindex(..., 0) only matches at the very start of the string.
    if mapsrc.rindex(/UCS(?:@[A-Z]+)?/, 0)
      mode = :from_ucs
      from = mapsrc[$&.size+1..-1]
      path << SUBDIR.find{|x| from.rindex(x, 0) }
    else
      mode = :to_ucs
      path << SUBDIR.find{|x| mapsrc.rindex(x, 0) }
    end
    if /\bUCS@(BMP|SMP|SIP|TIP|SSP)\b/ =~ mapsrc
      plane = {"BMP"=>0, "SMP"=>1, "SIP"=>2, "TIP"=>3, "SSP"=>14}[$1]
    else
      plane = 0
    end
    plane <<= 16
    # File name: ':' becomes '@' and the '/' between the two charset names
    # becomes '%', e.g. "JIS/JISX0201-KANA%UCS.src".
    path << mapsrc.gsub(':', '@')
    path = File.join(*path)
    path << ".src"
    path[path.rindex('/')] = '%'
    STDERR.puts 'load mapsrc %s' % path if VERBOSE_MODE
    # File.open rather than Kernel#open: the argument is always a file name
    # and must never be interpreted as a command.
    File.open(path, 'rb') do |f|
      # Skip the header up to and including BEGIN_MAP.
      f.each_line do |l|
        break if /^BEGIN_MAP/ =~ l
      end
      f.each_line do |l|
        next if /^\s*(?:#|$)/ =~ l
        break if /^END_MAP/ =~ l
        case mode
        when :from_ucs
          case l
          when /0x(\w+)\s*-\s*0x(\w+)\s*=\s*INVALID/
            # Citrus OOB_MODE
          when /(0x\w+)\s*=\s*(0x\w+)/
            table << [plane | $1.hex, citrus_cstomb(ces, csid, $2.hex)]
          else
            raise "unknown notation '%s'"% l.chomp
          end
        when :to_ucs
          case l
          when /(0x\w+)\s*=\s*(0x\w+)/
            table << [citrus_cstomb(ces, csid, $1.hex), plane | $2.hex]
          else
            raise "unknown notation '%s'"% l.chomp
          end
        end
      end
    end
  end
  return table
end
758
# Reads the UCM file +path+ (relative to $srcdir/ucm) and returns
# [to_ucs, from_ucs]:
#
#   to_ucs   -- [bytes, unicode] pairs for lines with fallback flag 0 or 3
#   from_ucs -- [unicode, bytes] pairs for lines with fallback flag 0 or 1
#
# +bytes+ is a string of hex digits.  +unicode+ is an Integer code point for
# a single-character line and the hex string of the UTF-8 encoding for a
# multi-character line.  Single-character lines below U+0080 that map to the
# identical byte value are skipped.  Lines matching neither form are ignored.
def import_ucm(path)
  to_ucs = []
  from_ucs = []
  File.foreach(File.join($srcdir, "ucm", path)) do |line|
    uc, bs, fb = nil
    if /^<U([0-9a-fA-F]+)>\s*([\+0-9a-fA-Fx\\]+)\s*\|(\d)/ =~ line
      uc = $1.hex
      bs = $2.delete('x\\')
      fb = $3.to_i
      next if uc < 128 && uc == bs.hex
    elsif /^([<U0-9a-fA-F>+]+)\s*([\+0-9a-fA-Fx\\]+)\s*\|(\d)/ =~ line
      # Save the captures first: String#scan below replaces $~, after which
      # $2 and $3 would no longer refer to this match.
      code_points, byte_seq, flag = $1, $2, $3
      uc = code_points.scan(/[0-9a-fA-F]+>/).map(&:hex).pack("U*").unpack("H*")[0]
      bs = byte_seq.delete('x\\')
      fb = flag.to_i
    end
    to_ucs << [bs, uc] if fb == 0 || fb == 3
    from_ucs << [uc, bs] if fb == 0 || fb == 1
  end
  [to_ucs, from_ucs]
end
779
# Returns a copy of +map+ (pairs of codes) in which every Integer is replaced
# by the upper-case hex string of that code point's UTF-8 encoding.  Codes
# that are not Integers are passed through unchanged.
def encode_utf8(map)
  to_hex = lambda {|code|
    # integer means UTF-8 encoded sequence.
    Integer === code ? [code].pack("U").unpack("H*")[0].upcase : code
  }
  map.map {|k, v| [to_hex.call(k), to_hex.call(v)] }
end
790
# Sentinel default meaning "look the valid byte pattern up by the name of the
# source encoding".
UnspecifiedValidEncoding = Object.new

# Compiles +map+ into lookup tables named +name+ and returns
# [generated_node_name, max_input_length].
#
# When a source code appears more than once in +map+, only its first mapping
# is used.  +valid_encoding+ is a valid byte pattern, nil/false for no
# validation, or UnspecifiedValidEncoding to fetch the pattern registered for
# +from+.  With a pattern, every sequence it allows but +map+ omits becomes
# :undef, and a mapped sequence the pattern does not allow raises
# RuntimeError ("invalid mapping").
def transcode_compile_tree(name, from, map, valid_encoding)
  first_mappings = {}
  encode_utf8(map).each {|k, v|
    first_mappings[k] = v unless first_mappings[k] # use first mapping
  }
  valid_encoding = ValidEncoding.fetch(from) if valid_encoding.equal? UnspecifiedValidEncoding

  am =
    if valid_encoding
      ActionMap.merge2(first_mappings, {valid_encoding => :undef}) {|prefix, as1, as2|
        mapped = as1.empty? ? nil : ActionMap.unambiguous_action(as1)
        allowed = as2.empty? ? nil : ActionMap.unambiguous_action(as2)
        raise "invalid mapping: #{prefix}" unless allowed
        mapped || allowed
      }
    else
      ActionMap.parse(first_mappings)
    end
  first_mappings.clear

  max_input = am.max_input_length
  defined_name = am.gennode(TRANSCODE_GENERATED_BYTES_CODE, TRANSCODE_GENERATED_WORDS_CODE, name)
  [defined_name, max_input]
end
820
# Names of the rb_transcoder structures generated so far.
TRANSCODERS = []
# Accumulated C source of the generated rb_transcoder structures.
TRANSCODE_GENERATED_TRANSCODER_CODE = ''.dup

# Generates only the lookup tables for a conversion from +from+ to +to+
# (no rb_transcoder structure).
#
# Returns [map, tree_name, real_tree_name, max_input] where +map+ is the
# given map with Integer codes replaced by the hex string of their UTF-8
# encoding (so that callers can measure output lengths in bytes),
# +tree_name+ is the name derived from the encoding names, +real_tree_name+
# the name of the node actually generated and +max_input+ the longest input
# sequence in bytes.
def transcode_tbl_only(from, to, map, valid_encoding=UnspecifiedValidEncoding)
  if VERBOSE_MODE
    if from.empty? || to.empty?
      STDERR.puts "converter for #{from.empty? ? to : from}"
    else
      STDERR.puts "converter from #{from} to #{to}"
    end
  end
  id_from = from.tr('^0-9A-Za-z', '_')
  id_to = to.tr('^0-9A-Za-z', '_')
  if from == "UTF-8"
    tree_name = "to_#{id_to}"
  elsif to == "UTF-8"
    tree_name = "from_#{id_from}"
  else
    tree_name = "from_#{id_from}_to_#{id_to}"
  end
  # Normalize before returning: an Integer code stands for a UTF-8 sequence
  # of up to four bytes, which callers cannot see from the Integer itself.
  map = encode_utf8(map)
  real_tree_name, max_input = transcode_compile_tree(tree_name, from, map, valid_encoding)
  return map, tree_name, real_tree_name, max_input
end
844
#
# call-seq:
#   transcode_tblgen(from_name, to_name, map [, valid_encoding_check [, ascii_compatibility]]) -> ''
#
# Returns an empty string just in case the result is used somewhere.
# Stores the actual product for later output with transcode_generated_code and
# transcode_register_code.
#
# The first argument is a string that will be used for the source (from) encoding.
# The second argument is a string that will be used for the target (to) encoding.
#
# The third argument is the actual data, a map represented as an array of two-element
# arrays. Each element of the array stands for one character being converted. The
# first element of each subarray is the code of the character in the source encoding,
# the second element of each subarray is the code of the character in the target encoding.
#
# Each code (i.e. byte sequence) is represented as a string of hexadecimal characters
# of even length. Codes can also be represented as integers (usually in the form 0x...),
# in which case they are interpreted as Unicode codepoints encoded in UTF-8. So as
# an example, 0x677E is the same as "E69DBE" (but somewhat easier to produce and check).
#
# In addition, the following symbols can also be used instead of actual codes in the
# second element of a subarray:
# :nomap (no mapping, just copy input to output), :nomap0 (same as :nomap, but low priority),
# :undef (input code undefined in the destination encoding),
# :invalid (input code is an invalid byte sequence in the source encoding),
# :func_ii, :func_si, :func_io, :func_so (conversion by function with specific call
# convention).
#
# The fourth argument specifies the overall structure of the encoding. For examples,
# see ValidEncoding below. This is used to cross-check the data in the third argument
# and to automatically add :undef and :invalid mappings where necessary.
#
# The fifth argument gives the ascii-compatibility of the transcoding. See
# rb_transcoder_asciicompat_type_t in transcode_data.h for details. In most
# cases, this argument can be left out.
#
def transcode_tblgen(from, to, map, valid_encoding=UnspecifiedValidEncoding,
                     ascii_compatibility='asciicompat_converter')
  map, tree_name, real_tree_name, max_input = transcode_tbl_only(from, to, map, valid_encoding)
  transcoder_name = "rb_#{tree_name}"
  TRANSCODERS << transcoder_name
  input_unit_length = UnitLength[from]
  # A string code is a run of hex digit pairs; anything else counts as one byte.
  output_lengths = map.map {|_, v| String === v ? v.length/2 : 1 }
  max_output = output_lengths.max
  struct_lines = [
    "static const rb_transcoder",
    "#{transcoder_name} = {",
    "    #{c_esc from}, #{c_esc to}, #{real_tree_name},",
    "    TRANSCODE_TABLE_INFO,",
    "    #{input_unit_length}, /* input_unit_length */",
    "    #{max_input}, /* max_input */",
    "    #{max_output}, /* max_output */",
    "    #{ascii_compatibility}, /* asciicompat_type */",
    "    0, NULL, NULL, /* state_size, state_init, state_fini */",
    "    NULL, NULL, NULL, NULL,",
    "    NULL, NULL, NULL",
    "};",
  ]
  TRANSCODE_GENERATED_TRANSCODER_CODE << struct_lines.join("\n") << "\n"
  ''
end
906
# Generates the lookup node for the ActionMap +am+ into the global byte and
# word arrays, using +name_hint+ as its name when given.  Returns an empty
# string so that the call can be embedded in a template.
def transcode_generate_node(am, name_hint=nil)
  if VERBOSE_MODE
    STDERR.puts "converter for #{name_hint}"
  end
  am.gennode(TRANSCODE_GENERATED_BYTES_CODE, TRANSCODE_GENERATED_WORDS_CODE, name_hint)
  ''
end
912
# Returns the complete generated C source: the byte array, the word array,
# the TRANSCODE_TABLE_INFO macro that describes both, and the accumulated
# rb_transcoder structures.
def transcode_generated_code
  bytes = TRANSCODE_GENERATED_BYTES_CODE
  words = TRANSCODE_GENERATED_WORDS_CODE
  table_info = "\#define TRANSCODE_TABLE_INFO " \
    "#{OUTPUT_PREFIX}byte_array, #{bytes.length}, " \
    "#{OUTPUT_PREFIX}word_array, #{words.length}, " \
    "((int)sizeof(unsigned int))\n"
  bytes.to_s + words.to_s + table_info + TRANSCODE_GENERATED_TRANSCODER_CODE
end
922
# Returns the C statements that register every transcoder generated so far,
# one rb_register_transcoder call per line.
def transcode_register_code
  TRANSCODERS.each_with_object(''.dup) {|transcoder_name, code|
    code << "    rb_register_transcoder(&#{transcoder_name});\n"
  }
end
930
# Size in bytes of one input unit of a source encoding.  Encodings that are
# not listed use 1.
UnitLength = Hash.new(1)
%w[UTF-16BE UTF-16LE].each {|enc| UnitLength[enc] = 2 }
%w[UTF-32BE UTF-32LE].each {|enc| UnitLength[enc] = 4 }
938
# Valid byte patterns, keyed by encoding name or by a generic label such as
# '1byte'.  A pattern uses the notation accepted by ActionMap.parse_to_rects:
# alternatives are separated by whitespace, and each byte position is either
# two hex digits or a {..} set of bytes and byte ranges.
# transcode_compile_tree looks the pattern of the source encoding up here
# when no pattern is passed explicitly; set_valid_byte_pattern adds entries.
ValidEncoding = {
  '1byte'        => '{00-ff}',
  '2byte'        => '{00-ff}{00-ff}',
  '4byte'        => '{00-ff}{00-ff}{00-ff}{00-ff}',
  'US-ASCII'     => '{00-7f}',
  'UTF-8'        => '{00-7f}
                     {c2-df}{80-bf}
                          e0{a0-bf}{80-bf}
                     {e1-ec}{80-bf}{80-bf}
                          ed{80-9f}{80-bf}
                     {ee-ef}{80-bf}{80-bf}
                          f0{90-bf}{80-bf}{80-bf}
                     {f1-f3}{80-bf}{80-bf}{80-bf}
                          f4{80-8f}{80-bf}{80-bf}',
  'UTF-16BE'     => '{00-d7,e0-ff}{00-ff}
                     {d8-db}{00-ff}{dc-df}{00-ff}',
  'UTF-16LE'     => '{00-ff}{00-d7,e0-ff}
                     {00-ff}{d8-db}{00-ff}{dc-df}',
  'UTF-32BE'     => '0000{00-d7,e0-ff}{00-ff}
                     00{01-10}{00-ff}{00-ff}',
  'UTF-32LE'     => '{00-ff}{00-d7,e0-ff}0000
                     {00-ff}{00-ff}{01-10}00',
  'EUC-JP'       => '{00-7f}
                     {a1-fe}{a1-fe}
                     8e{a1-fe}
                     8f{a1-fe}{a1-fe}',
  'CP51932'      => '{00-7f}
                     {a1-fe}{a1-fe}
                     8e{a1-fe}',
  'EUC-JIS-2004' => '{00-7f}
                     {a1-fe}{a1-fe}
                     8e{a1-fe}
                     8f{a1-fe}{a1-fe}',
  'Shift_JIS'    => '{00-7f}
                     {81-9f,e0-fc}{40-7e,80-fc}
                     {a1-df}',
  'EUC-KR'       => '{00-7f}
                     {a1-fe}{a1-fe}',
  'CP949'        => '{00-7f}
                     {81-fe}{41-5a,61-7a,81-fe}',
  'Big5'         => '{00-7f}
                     {81-fe}{40-7e,a1-fe}',
  'EUC-TW'       => '{00-7f}
                     {a1-fe}{a1-fe}
                     8e{a1-b0}{a1-fe}{a1-fe}',
  'GBK'          => '{00-80}
                     {81-fe}{40-7e,80-fe}',
  'GB18030'      => '{00-7f}
                     {81-fe}{40-7e,80-fe}
                     {81-fe}{30-39}{81-fe}{30-39}',
}
990
# Returns the valid byte pattern registered for +enc+ in the ValidEncoding
# table.  Raises KeyError (from Hash#fetch) when +enc+ is not registered.
def ValidEncoding(enc)
  ValidEncoding.fetch(enc)
end
994
# Registers the valid byte pattern of +encoding+.  +pattern_or_label+ is
# either a key already present in ValidEncoding, whose pattern is reused, or
# the pattern itself.  Returns the registered pattern.
#
# Raises ArgumentError when +encoding+ already has a different pattern.
def set_valid_byte_pattern(encoding, pattern_or_label)
  pattern = ValidEncoding[pattern_or_label] || pattern_or_label
  current = ValidEncoding[encoding]
  if current && current != pattern
    raise ArgumentError, "trying to change valid byte pattern for encoding #{encoding} from #{ValidEncoding[encoding]} to #{pattern}"
  end
  ValidEncoding[encoding] = pattern
end
1007
# The following may be used in different places, so keep them here for the moment.
# Each call registers the encoding on the left with the byte pattern that is
# already registered under the label on the right.
set_valid_byte_pattern 'ASCII-8BIT', '1byte'
set_valid_byte_pattern 'Windows-31J', 'Shift_JIS'
set_valid_byte_pattern 'eucJP-ms', 'EUC-JP'
1012
# Returns the signature text recorded in the generated file for one input:
# the quoted file name, the length of +src+ and its String#sum checksum.
def make_signature(filename, src)
  fields = ["src=#{filename.dump}", "len=#{src.length}", "checksum=#{src.sum}"]
  fields.join(", ")
end
1016
# Command line entry point: expands the ERB template given as the first
# non-option argument and writes the result, preceded by a signature header,
# to --output (or to standard output).
if __FILE__ == $0
  start_time = Time.now

  output_filename = nil
  verbose_mode = false
  force_mode = false

  op = OptionParser.new
  op.def_option("--help", "show help message") { puts op; exit 0 }
  op.def_option("--verbose", "verbose mode") { verbose_mode = true }
  op.def_option("--force", "force table generation") { force_mode = true }
  op.def_option("--output=FILE", "specify output file") {|arg| output_filename = arg }
  op.parse!

  VERBOSE_MODE = verbose_mode

  # OUTPUT_PREFIX is the leading identifier characters of the output file's
  # base name, without leading underscores and with exactly one trailing
  # underscore; it prefixes the names of the generated C arrays.
  OUTPUT_FILENAME = output_filename
  OUTPUT_PREFIX = output_filename ? File.basename(output_filename)[/\A[A-Za-z0-9_]*/] : "".dup
  OUTPUT_PREFIX.sub!(/\A_+/, '')
  OUTPUT_PREFIX.sub!(/_*\z/, '_')

  TRANSCODE_GENERATED_BYTES_CODE = ArrayCode.new("unsigned char", "#{OUTPUT_PREFIX}byte_array")
  TRANSCODE_GENERATED_WORDS_CODE = ArrayCode.new("unsigned int", "#{OUTPUT_PREFIX}word_array")

  # The template's directory becomes $srcdir and is added to the load path so
  # that the template can require files that live next to it.
  arg = ARGV.shift
  $srcdir = File.dirname(arg)
  $:.unshift $srcdir unless $:.include? $srcdir
  src = File.read(arg)
  src.force_encoding("ascii-8bit") if src.respond_to? :force_encoding
  this_script = File.read(__FILE__)
  this_script.force_encoding("ascii-8bit") if this_script.respond_to? :force_encoding

  # The signature identifies the inputs (this script and the template) by
  # name, length and checksum.
  base_signature = "/* autogenerated. */\n".dup
  base_signature << "/* #{make_signature(File.basename(__FILE__), this_script)} */\n"
  base_signature << "/* #{make_signature(File.basename(arg), src)} */\n"

  # Up-to-date check: rebuild the signature from the files named in the first
  # paragraph of the existing output; when it is unchanged, only touch the
  # output file and stop.
  if !force_mode && output_filename && File.readable?(output_filename)
    old_signature = File.open(output_filename) {|f| f.gets("").chomp }
    chk_signature = base_signature.dup
    old_signature.each_line {|line|
      if %r{/\* src="([0-9a-z_.-]+)",} =~ line
        name = $1
        next if name == File.basename(arg) || name == File.basename(__FILE__)
        path = File.join($srcdir, name)
        if File.readable? path
          chk_signature << "/* #{make_signature(name, File.read(path))} */\n"
        end
      end
    }
    if old_signature == chk_signature
      now = Time.now
      File.utime(now, now, output_filename)
      STDERR.puts "already up-to-date: #{output_filename}" if VERBOSE_MODE
      exit
    end
  end

  if VERBOSE_MODE
    if output_filename
      STDERR.puts "generating #{output_filename} ..."
    end
  end

  # Snapshot the loaded features before and after the template runs; the
  # difference is the set of files the template required.
  libs1 = $".dup
  if ERB.instance_method(:initialize).parameters.assoc(:key) # Ruby 2.6+
    erb = ERB.new(src, trim_mode: '%')
  else
    erb = ERB.new(src, nil, '%')
  end
  erb.filename = arg
  # The template is evaluated with this scope's binding, so it can see the
  # local variables defined above.
  erb_result = erb.result(binding)
  libs2 = $".dup

  # Add a signature line for every newly required file found in $srcdir.
  libs = libs2 - libs1
  lib_sigs = ''.dup
  libs.each {|lib|
    lib = File.basename(lib)
    path = File.join($srcdir, lib)
    if File.readable? path
      lib_sigs << "/* #{make_signature(lib, File.read(path))} */\n"
    end
  }

  result = ''.dup
  result << base_signature
  result << lib_sigs
  result << "\n"
  result << erb_result
  result << "\n"

  if output_filename
    # Write to a temporary name and rename it into place.
    new_filename = output_filename + ".new"
    FileUtils.mkdir_p(File.dirname(output_filename))
    File.open(new_filename, "wb") {|f| f << result }
    File.rename(new_filename, output_filename)
    tms = Process.times
    elapsed = Time.now - start_time
    STDERR.puts "done.  (#{'%.2f' % tms.utime}user #{'%.2f' % tms.stime}system #{'%.2f' % elapsed}elapsed)" if VERBOSE_MODE
  else
    print result
  end
end
1119