# frozen_string_literal: true

require 'optparse'
require 'erb'
require 'fileutils'
require 'pp'

# Fallback definition, only installed when Array does not already respond to #product.
class Array
  unless [].respond_to? :product
    def product(*args)
      if args.empty?
        self.map {|e| [e] }
      else
        result = []
        self.each {|e0|
          result.concat args.first.product(*args[1..-1]).map {|es| [e0, *es] }
        }
        result
      end
    end
  end
end

# Fallback definition, only installed when String does not already respond to #start_with?.
class String
  unless "".respond_to? :start_with?
    def start_with?(*prefixes)
      prefixes.each {|prefix|
        return true if prefix.length <= self.length && prefix == self[0, prefix.length]
      }
      false
    end
  end
end

# Number of words appended to the word array per lookup node
# (the offsets reference and the infos reference, see generate_lookup_node).
NUM_ELEM_BYTELOOKUP = 2

# Maps a single character to its escaped form inside a C string literal.
C_ESC = {
  "\\" => "\\\\",
  '"' => '\"',
  "\n" => '\n',
}

# Bytes 0x00-0x1f (unless already listed above) and 0x7f-0xff become octal escapes.
0x00.upto(0x1f) {|ch| C_ESC[[ch].pack("C")] ||= "\\%03o" % ch }
0x7f.upto(0xff) {|ch| C_ESC[[ch].pack("C")] = "\\%03o" % ch }
C_ESC_PAT = Regexp.union(*C_ESC.keys)

# Returns +str+ wrapped in double quotes with every character listed in
# C_ESC replaced by its escape sequence.
def c_esc(str)
  '"' + str.gsub(C_ESC_PAT) { C_ESC[$&] } + '"'
end

# Matches exactly two hexadecimal digits (one byte).
HEX2 = /(?:[0-9A-Fa-f]{2})/

# Accumulates the textual body of a C array together with its element count.
class ArrayCode
  # type: C element type text, name: C identifier of the array.
  def initialize(type, name)
    @type = type
    @name = name
    @len = 0
    @content = ''.dup
  end

  # Number of elements added so far.
  def length
    @len
  end

  # Appends the C source text +str+ and accounts for +num+ more elements.
  def insert_at_last(num, str)
    # newnum = self.length + num
    @content << str
    @len += num
  end

  # Renders the "static const" array definition.
  def to_s
    <<"End"
static const #{@type}
#{@name}[#{@len}] = {
#{@content}};
End
  end
end

# Leaf of an action tree: wraps the action value selected for a byte sequence.
# Equality and hashing are delegated to the wrapped value.
class Action
  def initialize(value)
    @value = value
  end
  attr_reader :value

  def hash
    @value.hash
  end

  def eql?(other)
    self.class == other.class &&
    @value == other.value
  end
  alias == eql?
end

# Inner edge of an action tree: the byte range byte_min..byte_max leads to child_tree.
# The hash is computed once in the constructor and also used as a cheap
# first check in eql?.
class Branch
  def initialize(byte_min, byte_max, child_tree)
    @byte_min = byte_min
    @byte_max = byte_max
    @child_tree = child_tree
    @hash = byte_min.hash ^ byte_max.hash ^ child_tree.hash
  end
  attr_reader :byte_min, :byte_max, :child_tree, :hash

  def eql?(other)
    self.class == other.class &&
    @hash == other.hash &&
    @byte_min == other.byte_min &&
    @byte_max == other.byte_max &&
    @child_tree == other.child_tree
  end
  alias == eql?
end

# A tree that maps byte sequences to actions, plus the code that renders the
# tree as C lookup tables. A tree is either an Action (leaf) or an Array of
# Branch objects.
class ActionMap
  # Converts a mapping (pattern => action) into a list of "rects":
  # [min_hex, max_hex, action] triples of upper-case hex strings.
  #
  # Accepted patterns (as matched below):
  #   (empset)                 - ignored
  #   (empstr)                 - the empty byte sequence
  #   a run of hex byte pairs  - one exact byte sequence
  #   whitespace separated sequences of hex bytes and {..} byte sets,
  #   e.g. "{00-7f}" or "e0{a0-bf}{80-bf}"
  #
  # Raises ArgumentError for any other pattern.
  def self.parse_to_rects(mapping)
    rects = []
    mapping.each {|pat, action|
      pat = pat.to_s
      if /\A\s*\(empset\)\s*\z/ =~ pat
        next
      elsif /\A\s*\(empstr\)\s*\z/ =~ pat
        rects << ['', '', action]
      elsif /\A\s*(#{HEX2}+)\s*\z/o =~ pat
        hex = $1.upcase
        rects << [hex, hex, action]
      elsif /\A\s*((#{HEX2}|\{#{HEX2}(?:-#{HEX2})?(,#{HEX2}(?:-#{HEX2})?)*\})+(\s+|\z))*\z/o =~ pat
        pat = pat.upcase
        pat.scan(/\S+/) {
          pat1 = $&
          # One entry per byte position; each entry lists the contiguous
          # [min, max] ranges allowed at that position.
          ranges_list = []
          pat1.scan(/#{HEX2}|\{([^\}]*)\}/o) {
            ranges_list << []
            if !$1
              ranges_list.last << [$&,$&]
            else
              set = {}
              $1.scan(/(#{HEX2})(?:-(#{HEX2}))?/o) {
                if !$2
                  c = $1.to_i(16)
                  set[c] = true
                else
                  b = $1.to_i(16)
                  e = $2.to_i(16)
                  b.upto(e) {|byte| set[byte] = true }
                end
              }
              # Collapse the set of bytes into maximal contiguous ranges.
              i = nil
              0.upto(256) {|j|
                if set[j]
                  if !i
                    i = j
                  end
                  if !set[j+1]
                    ranges_list.last << ["%02X" % i, "%02X" % j]
                    i = nil
                  end
                end
              }
            end
          }
          # Every combination of per-position ranges yields one rect.
          first_ranges = ranges_list.shift
          first_ranges.product(*ranges_list).each {|range_list|
            min = range_list.map {|x, y| x }.join
            max = range_list.map {|x, y| y }.join
            rects << [min, max, action]
          }
        }
      else
        raise ArgumentError, "invalid pattern: #{pat.inspect}"
      end
    }
    rects
  end

  # Returns the single action contained in +actions0+.
  # When several distinct actions remain, :nomap0 is dropped first
  # (it is the low priority one); anything still ambiguous raises ArgumentError.
  def self.unambiguous_action(actions0)
    actions = actions0.uniq
    if actions.length == 1
      actions[0]
    else
      actions.delete(:nomap0)
      if actions.length == 1
        actions[0]
      else
        raise ArgumentError, "ambiguous actions: #{actions0.inspect}"
      end
    end
  end

  # Builds a tree from rects, resolving each leaf with unambiguous_action.
  def self.build_tree(rects)
    expand(rects) {|prefix, actions|
      unambiguous_action(actions)
    }
  end

  # Parses +mapping+ (pattern => action) into an ActionMap.
  def self.parse(mapping)
    rects = parse_to_rects(mapping)
    tree = build_tree(rects)
    self.new(tree)
  end

  # Merges two or more rect lists. For each leaf the block receives the
  # prefix followed by one array of actions per input list and returns the
  # action to store. Raises ArgumentError with fewer than two lists.
  def self.merge_rects(*rects_list)
    if rects_list.length < 2
      raise ArgumentError, "not enough arguments"
    end

    all_rects = []
    rects_list.each_with_index {|rects, i|
      all_rects.concat rects.map {|min, max, action| [min, max, [i, action]] }
    }

    tree = expand(all_rects) {|prefix, actions|
      args = Array.new(rects_list.length) { [] }
      actions.each {|i, action|
        args[i] << action
      }
      yield(prefix, *args)
    }

    self.new(tree)
  end

  # Like merge_rects, but takes mappings and parses them first.
  def self.merge(*mappings, &block)
    merge_rects(*mappings.map {|m| parse_to_rects(m) }, &block)
  end

  # Merges exactly two mappings. Actions are replaced by integer indexes
  # while the tree is expanded; indexes below +boundary+ belong to map1,
  # the others to map2. The block receives (prefix, actions1, actions2).
  def self.merge2(map1, map2, &block)
    rects1 = parse_to_rects(map1)
    rects2 = parse_to_rects(map2)

    actions = []
    all_rects = []

    rects1.each {|rect|
      _, _, action = rect
      rect[2] = actions.length
      actions << action
      all_rects << rect
    }

    boundary = actions.length

    rects2.each {|rect|
      _, _, action = rect
      rect[2] = actions.length
      actions << action
      all_rects << rect
    }

    tree = expand(all_rects) {|prefix, as0|
      as1 = []
      as2 = []
      as0.each {|i|
        if i < boundary
          as1 << actions[i]
        else
          as2 << actions[i]
        end
      }
      yield(prefix, as1, as2)
    }

    self.new(tree)
  end

  # Expands rects into a tree. Rects with min == max ("singletons") are kept
  # in @singleton_rects, sorted so that the smallest sequence is the last
  # element, and are consumed with pop/last while recursing; the remaining
  # "region" rects are passed down explicitly.
  def self.expand(rects, &block)
    #numsing = numreg = 0
    #rects.each {|min, max, action| if min == max then numsing += 1 else numreg += 1 end }
    #puts "#{numsing} singleton mappings and #{numreg} region mappings."
    singleton_rects = []
    region_rects = []
    rects.each {|rect|
      min, max, = rect
      if min == max
        singleton_rects << rect
      else
        region_rects << rect
      end
    }
    @singleton_rects = singleton_rects.sort_by {|min, max, action| min }
    @singleton_rects.reverse!
    ret = expand_rec("", region_rects, &block)
    @singleton_rects = nil
    ret
  end

  # Scratch hash shared by expand_rec and each_firstbyte_range; each user
  # clears it before handing control to the other.
  TMPHASH = {}

  # Recursive worker of expand. Returns an Action when +prefix+ is a complete
  # sequence, an Array of Branch otherwise, or the (empty) region_rects when
  # nothing is mapped below +prefix+.
  # Raises ArgumentError when a sequence is both complete and a proper prefix
  # of another one.
  def self.expand_rec(prefix, region_rects, &block)
    return region_rects if region_rects.empty? && !((s_rect = @singleton_rects.last) && s_rect[0].start_with?(prefix))
    if region_rects.empty? ? s_rect[0].length == prefix.length : region_rects[0][0].empty?
      h = TMPHASH
      while (s_rect = @singleton_rects.last) && s_rect[0].start_with?(prefix)
        min, _, action = @singleton_rects.pop
        raise ArgumentError, "ambiguous pattern: #{prefix}" if min.length != prefix.length
        h[action] = true
      end
      region_rects.each {|region_min, _, region_action|
        raise ArgumentError, "ambiguous pattern: #{prefix}" if !region_min.empty?
        h[region_action] = true
      }
      tree = Action.new(block.call(prefix, h.keys))
      h.clear
    else
      tree = []
      each_firstbyte_range(prefix, region_rects) {|byte_min, byte_max, r_rects2|
        if byte_min == byte_max
          prefix2 = prefix + "%02X" % byte_min
        else
          prefix2 = prefix + "{%02X-%02X}" % [byte_min, byte_max]
        end
        child_tree = expand_rec(prefix2, r_rects2, &block)
        tree << Branch.new(byte_min, byte_max, child_tree)
      }
    end
    return tree
  end

  # Yields (byte_min, byte_max, rest_rects) for each range of first bytes
  # below +prefix+, interleaving the region rects with the pending singleton
  # rects. The yielded rest_rects have their first byte stripped.
  # NOTE(review): the last loop does not pop @singleton_rects itself; it
  # relies on the block (expand_rec) consuming the singleton it was given.
  def self.each_firstbyte_range(prefix, region_rects)
    index_from = TMPHASH

    region_ary = []
    region_rects.each {|min, max, action|
      raise ArgumentError, "ambiguous pattern: #{prefix}" if min.empty?
      min_firstbyte = min[0,2].to_i(16)
      min_rest = min[2..-1]
      max_firstbyte = max[0,2].to_i(16)
      max_rest = max[2..-1]
      region_ary << [min_firstbyte, max_firstbyte, [min_rest, max_rest, action]]
      index_from[min_firstbyte] = true
      index_from[max_firstbyte+1] = true
    }

    # Boundaries sorted in descending order, so that pop returns the smallest.
    byte_from = Array.new(index_from.size)
    bytes = index_from.keys
    bytes.sort!
    bytes.reverse!
    bytes.each_with_index {|byte, i|
      index_from[byte] = i
      byte_from[i] = byte
    }

    region_rects_ary = Array.new(index_from.size) { [] }
    region_ary.each {|min_firstbyte, max_firstbyte, rest_elt|
      index_from[min_firstbyte].downto(index_from[max_firstbyte+1]+1) {|i|
        region_rects_ary[i] << rest_elt
      }
    }

    index_from.clear

    r_rects = region_rects_ary.pop
    region_byte = byte_from.pop
    prev_r_start = region_byte
    prev_r_rects = []
    while r_rects && (s_rect = @singleton_rects.last) && (seq = s_rect[0]).start_with?(prefix)
      singleton_byte = seq[prefix.length, 2].to_i(16)
      min_byte = singleton_byte < region_byte ? singleton_byte : region_byte
      if prev_r_start < min_byte && !prev_r_rects.empty?
        yield prev_r_start, min_byte-1, prev_r_rects
      end
      if region_byte < singleton_byte
        prev_r_start = region_byte
        prev_r_rects = r_rects
        r_rects = region_rects_ary.pop
        region_byte = byte_from.pop
      elsif region_byte > singleton_byte
        yield singleton_byte, singleton_byte, prev_r_rects
        prev_r_start = singleton_byte+1
      else # region_byte == singleton_byte
        prev_r_start = region_byte+1
        prev_r_rects = r_rects
        r_rects = region_rects_ary.pop
        region_byte = byte_from.pop
        yield singleton_byte, singleton_byte, prev_r_rects
      end
    end

    while r_rects
      if prev_r_start < region_byte && !prev_r_rects.empty?
        yield prev_r_start, region_byte-1, prev_r_rects
      end
      prev_r_start = region_byte
      prev_r_rects = r_rects
      r_rects = region_rects_ary.pop
      region_byte = byte_from.pop
    end

    while (s_rect = @singleton_rects.last) && (seq = s_rect[0]).start_with?(prefix)
      singleton_byte = seq[prefix.length, 2].to_i(16)
      yield singleton_byte, singleton_byte, []
    end
  end

  # tree: an Action or an Array of Branch.
  def initialize(tree)
    @tree = tree
  end

  def inspect
    "\#<#{self.class}:" +
    @tree.inspect +
    ">"
  end

  # Depth of +tree+: 0 for an Action, otherwise 1 + the deepest child.
  def max_input_length_rec(tree)
    case tree
    when Action
      0
    else
      tree.map {|branch|
        max_input_length_rec(branch.child_tree)
      }.max + 1
    end
  end

  # Length in bytes of the longest input sequence in this map.
  def max_input_length
    max_input_length_rec(@tree)
  end

  # The action for the empty sequence when the whole tree is a single
  # Action, nil otherwise.
  def empty_action
    if @tree.kind_of? Action
      @tree.value
    else
      nil
    end
  end

  # Memo tables shared by all ActionMap instances: identical offset tables
  # and identical info tables are emitted only once.
  OffsetsMemo = {}
  InfosMemo = {}

  # Formats "min, max," followed by offsets[min..max], 16 entries per line
  # in two groups of 8.
  def format_offsets(min, max, offsets)
    offsets = offsets[min..max]
    code = "%d, %d,\n" % [min, max]
    0.step(offsets.length-1,16) {|i|
      code << " "
      code << offsets[i,8].map {|off| "%3d," % off }.join('')
      if i+8 < offsets.length
        code << " "
        code << offsets[i+8,8].map {|off| "%3d," % off }.join('')
      end
      code << "\n"
    }
    code
  end

  # C macro names already handed out by str_name.
  UsedName = {}

  # hex string => macro name of the already emitted byte string.
  StrMemo = {}

  # Picks an unused macro name for the byte string +bytes+ (hex): first one
  # derived from its identifier-like characters, then one derived from the
  # hex text, finally one derived from the current byte array length.
  def str_name(bytes)
    size = @bytes_code.length
    rawbytes = [bytes].pack("H*")

    n = nil
    if !n && !(suf = rawbytes.gsub(/[^A-Za-z0-9_]/, '')).empty? && !UsedName[nn = "str1_" + suf] then n = nn end
    if !n && !UsedName[nn = "str1_" + bytes] then n = nn end
    n ||= "str1s_#{size}"

    StrMemo[bytes] = n
    UsedName[n] = true
    n
  end

  # Emits the byte string +bytes+ (hex) into the byte array, once, and
  # returns the macro name that refers to it.
  def gen_str(bytes)
    if n = StrMemo[bytes]
      n
    else
      len = bytes.length/2
      size = @bytes_code.length
      n = str_name(bytes)
      @bytes_code.insert_at_last(1 + len,
        "\#define #{n} makeSTR1(#{size})\n" +
        " makeSTR1LEN(#{len})," + bytes.gsub(/../, ' 0x\&,') + "\n\n")
      n
    end
  end

  # Translates one action into the C expression stored in the word array.
  # Raises RuntimeError for an action it does not recognise.
  def generate_info(info)
    case info
    when :nomap, :nomap0
      # :nomap0 is low priority.  it never collides.
      "NOMAP"
    when :undef
      "UNDEF"
    when :invalid
      "INVALID"
    when :func_ii
      "FUNii"
    when :func_si
      "FUNsi"
    when :func_io
      "FUNio"
    when :func_so
      "FUNso"
    when /\A(#{HEX2})\z/o
      "o1(0x#$1)"
    when /\A(#{HEX2})(#{HEX2})\z/o
      "o2(0x#$1,0x#$2)"
    when /\A(#{HEX2})(#{HEX2})(#{HEX2})\z/o
      "o3(0x#$1,0x#$2,0x#$3)"
    when /funsio\((\d+)\)/
      "funsio(#{$1})"
    when /\A(#{HEX2})(3[0-9])(#{HEX2})(3[0-9])\z/o
      "g4(0x#$1,0x#$2,0x#$3,0x#$4)"
    when /\A(f[0-7])(#{HEX2})(#{HEX2})(#{HEX2})\z/o
      "o4(0x#$1,0x#$2,0x#$3,0x#$4)"
    when /\A(#{HEX2}){4,259}\z/o
      gen_str(info.upcase)
    when /\A\/\*BYTE_LOOKUP\*\// # pointer to BYTE_LOOKUP structure
      $'.to_s
    else
      raise "unexpected action: #{info.inspect}"
    end
  end

  # Formats the info expressions right-aligned, 4 per line when they are at
  # most 16 characters wide, otherwise 2 per line.
  def format_infos(infos)
    infos = infos.map {|info| generate_info(info) }
    maxlen = infos.map {|info| info.length }.max
    columns = maxlen <= 16 ? 4 : 2
    code = "".dup
    0.step(infos.length-1, columns) {|i|
      code << " "
      is = infos[i,columns]
      is.each {|info|
        code << sprintf(" %#{maxlen}s,", info)
      }
      code << "\n"
    }
    code
  end

  # Emits one lookup node called +name+ for +table+ (256 actions indexed by
  # byte): an offsets table into the byte array, an infos table into the
  # word array (both shared through the memo tables), and the node itself.
  def generate_lookup_node(name, table)
    bytes_code = @bytes_code
    words_code = @words_code
    offsets = []
    infos = []
    infomap = {}
    min = max = nil
    table.each_with_index {|action, byte|
      action ||= :invalid
      if action != :invalid
        min = byte if !min
        max = byte
      end
      unless o = infomap[action]
        infomap[action] = o = infos.length
        infos[o] = action
      end
      offsets[byte] = o
    }
    infomap.clear
    if !min
      min = max = 0
    end

    offsets_key = [min, max, offsets[min..max]]
    if n = OffsetsMemo[offsets_key]
      offsets_name = n
    else
      offsets_name = "#{name}_offsets"
      OffsetsMemo[offsets_key] = offsets_name
      size = bytes_code.length
      bytes_code.insert_at_last(2+max-min+1,
        "\#define #{offsets_name} #{size}\n" +
        format_offsets(min,max,offsets) + "\n")
    end

    if n = InfosMemo[infos]
      infos_name = n
    else
      infos_name = "#{name}_infos"
      InfosMemo[infos] = infos_name

      size = words_code.length
      words_code.insert_at_last(infos.length,
        "\#define #{infos_name} WORDINDEX2INFO(#{size})\n" +
        format_infos(infos) + "\n")
    end

    size = words_code.length
    words_code.insert_at_last(NUM_ELEM_BYTELOOKUP,
      "\#define #{name} WORDINDEX2INFO(#{size})\n" +
      <<"End" + "\n")
  #{offsets_name},
  #{infos_name},
End
  end

  # tree => name of the node already generated for it.
  PreMemo = {}
  # Suffix of the next automatically chosen node name. generate_node
  # advances it in place with succ!, so it has to be an unfrozen string
  # even though this file uses frozen string literals.
  NextName = "a".dup

  # Generates the lookup node for @tree (and, recursively, for its
  # sub-trees) and returns its name. Without +name_hint+ a name of the form
  # "fun_<suffix>" is chosen automatically.
  def generate_node(name_hint=nil)
    if n = PreMemo[@tree]
      return n
    end

    table = Array.new(0x100, :invalid)
    @tree.each {|branch|
      byte_min, byte_max, child_tree = branch.byte_min, branch.byte_max, branch.child_tree
      rest = ActionMap.new(child_tree)
      if a = rest.empty_action
        table.fill(a, byte_min..byte_max)
      else
        name_hint2 = nil
        if name_hint
          name_hint2 = "#{name_hint}_#{byte_min == byte_max ? '%02X' % byte_min : '%02Xto%02X' % [byte_min, byte_max]}"
        end
        v = "/*BYTE_LOOKUP*/" + rest.gennode(@bytes_code, @words_code, name_hint2)
        table.fill(v, byte_min..byte_max)
      end
    }

    if !name_hint
      name_hint = "fun_" + NextName
      NextName.succ!
    end

    PreMemo[@tree] = name_hint

    generate_lookup_node(name_hint, table)
    name_hint
  end

  # Entry point of code generation: emits this map into +bytes_code+ and
  # +words_code+ (objects responding to length and insert_at_last, such as
  # ArrayCode) and returns the name of the root node.
  def gennode(bytes_code, words_code, name_hint=nil)
    @bytes_code = bytes_code
    @words_code = words_code
    name = generate_node(name_hint)
    @bytes_code = nil
    @words_code = nil
    return name
  end
end

# Converts (csid, index) to a lower-case hex string for the 'mskanji' scheme.
# Raises RuntimeError ("invalid byte sequence") for rows/columns outside the
# ranges checked below.
def citrus_mskanji_cstomb(csid, index)
  case csid
  when 0
    index
  when 1
    index + 0x80
  when 2, 3
    row = index >> 8
    raise "invalid byte sequence" if row < 0x21
    if csid == 3
      if row <= 0x2F
        offset = (row == 0x22 || row >= 0x26) ? 0xED : 0xF0
      elsif row >= 0x4D && row <= 0x7E
        offset = 0xCE
      else
        raise "invalid byte sequence"
      end
    else
      raise "invalid byte sequence" if row > 0x97
      offset = (row < 0x5F) ? 0x81 : 0xC1
    end
    col = index & 0xFF
    raise "invalid byte sequence" if (col < 0x21 || col > 0x7E)

    row -= 0x21
    col -= 0x21
    if (row & 1) == 0
      col += 0x40
      col += 1 if (col >= 0x7F)
    else
      col += 0x9F
    end
    row = row / 2 + offset
    (row << 8) | col
  end.to_s(16)
end

# Converts (csid, index) to a lower-case hex string for the 'euc' scheme.
# Raises ArgumentError for a csid other than the four handled below.
def citrus_euc_cstomb(csid, index)
  case csid
  when 0x0000
    index
  when 0x8080
    index | 0x8080
  when 0x0080
    index | 0x8E80
  when 0x8000
    index | 0x8F8080
  else
    raise ArgumentError, "unknown csid: #{csid.inspect}"
  end.to_s(16)
end

# Converts (csid, index) to a lower-case hex string for the 'stateless_iso' scheme.
def citrus_stateless_iso_cstomb(csid, index)
  (index | 0x8080 | (csid << 16)).to_s(16)
end

# Dispatches on the encoding scheme name +ces+. Returns nil for a scheme
# other than 'mskanji', 'euc' and 'stateless_iso'.
def citrus_cstomb(ces, csid, index)
  case ces
  when 'mskanji'
    citrus_mskanji_cstomb(csid, index)
  when 'euc'
    citrus_euc_cstomb(csid, index)
  when 'stateless_iso'
    citrus_stateless_iso_cstomb(csid, index)
  end
end

# Sub-directory names below $srcdir; chosen by matching the start of the map source name.
SUBDIR = %w/APPLE AST BIG5 CNS CP EBCDIC EMOJI GB GEORGIAN ISO646 ISO-8859 JIS KAZAKH KOI KS MISC TCVN/

# Reads the comma separated map sources +mapsrcs+ below $srcdir and returns
# a table of [from, to] pairs: names starting with "UCS" give
# [codepoint, hex string] pairs, all others give [hex string, codepoint].
# Only the lines between BEGIN_MAP and END_MAP are read.
# Raises RuntimeError for a line in an unknown notation.
def citrus_decode_mapsrc(ces, csid, mapsrcs)
  table = []
  mapsrcs.split(',').each do |mapsrc|
    path = [$srcdir]
    mode = nil
    if mapsrc.rindex(/UCS(?:@[A-Z]+)?/, 0)
      mode = :from_ucs
      from = mapsrc[$&.size+1..-1]
      path << SUBDIR.find{|x| from.rindex(x, 0) }
    else
      mode = :to_ucs
      path << SUBDIR.find{|x| mapsrc.rindex(x, 0) }
    end
    if /\bUCS@(BMP|SMP|SIP|TIP|SSP)\b/ =~ mapsrc
      plane = {"BMP"=>0, "SMP"=>1, "SIP"=>2, "TIP"=>3, "SSP"=>14}[$1]
    else
      plane = 0
    end
    plane <<= 16
    path << mapsrc.gsub(':', '@')
    path = File.join(*path)
    path << ".src"
    path[path.rindex('/')] = '%'
    STDERR.puts 'load mapsrc %s' % path if VERBOSE_MODE
    # File.open rather than Kernel#open: the path must only ever name a file.
    File.open(path, 'rb') do |f|
      f.each_line do |l|
        break if /^BEGIN_MAP/ =~ l
      end
      f.each_line do |l|
        next if /^\s*(?:#|$)/ =~ l
        break if /^END_MAP/ =~ l
        case mode
        when :from_ucs
          case l
          when /0x(\w+)\s*-\s*0x(\w+)\s*=\s*INVALID/
            # Citrus OOB_MODE
          when /(0x\w+)\s*=\s*(0x\w+)/
            table << [plane | $1.hex, citrus_cstomb(ces, csid, $2.hex)]
          else
            raise "unknown notation '%s'"% l.chomp
          end
        when :to_ucs
          case l
          when /(0x\w+)\s*=\s*(0x\w+)/
            table << [citrus_cstomb(ces, csid, $1.hex), plane | $2.hex]
          else
            raise "unknown notation '%s'"% l.chomp
          end
        end
      end
    end
  end
  return table
end

# Reads $srcdir/ucm/<path> and returns [to_ucs, from_ucs], two lists of
# pairs. Lines whose trailing "|n" digit is 0 go into both lists, 3 only
# into to_ucs, 1 only into from_ucs. Single-codepoint lines that map a
# codepoint below 128 to the same byte value are skipped.
def import_ucm(path)
  to_ucs = []
  from_ucs = []
  File.foreach(File.join($srcdir, "ucm", path)) do |line|
    uc, bs, fb = nil
    if /^<U([0-9a-fA-F]+)>\s*([\+0-9a-fA-Fx\\]+)\s*\|(\d)/ =~ line
      uc = $1.hex
      bs = $2.delete('x\\')
      fb = $3.to_i
      next if uc < 128 && uc == bs.hex
    elsif /^([<U0-9a-fA-F>+]+)\s*([\+0-9a-fA-Fx\\]+)\s*\|(\d)/ =~ line
      # Several codepoints: store them as the hex of their UTF-8 encoding.
      uc = $1.scan(/[0-9a-fA-F]+>/).map(&:hex).pack("U*").unpack("H*")[0]
      bs = $2.delete('x\\')
      fb = $3.to_i
    end
    to_ucs << [bs, uc] if fb == 0 || fb == 3
    from_ucs << [uc, bs] if fb == 0 || fb == 1
  end
  [to_ucs, from_ucs]
end

# Returns a copy of +map+ in which every Integer key or value is replaced
# by the upper-case hex string of its UTF-8 encoding.
def encode_utf8(map)
  r = []
  map.each {|k, v|
    # integer means UTF-8 encoded sequence.
    k = [k].pack("U").unpack("H*")[0].upcase if Integer === k
    v = [v].pack("U").unpack("H*")[0].upcase if Integer === v
    r << [k,v]
  }
  r
end

# Sentinel default meaning "look the pattern up in ValidEncoding by source encoding name".
UnspecifiedValidEncoding = Object.new

# Compiles +map+ into lookup tables named after +name+ and returns
# [defined_name, max_input]. When a valid_encoding pattern is available,
# valid sequences missing from the map become :undef, and a mapped sequence
# outside the pattern raises RuntimeError ("invalid mapping").
def transcode_compile_tree(name, from, map, valid_encoding)
  map = encode_utf8(map)
  h = {}
  map.each {|k, v|
    h[k] = v unless h[k] # use first mapping
  }
  if valid_encoding.equal? UnspecifiedValidEncoding
    valid_encoding = ValidEncoding.fetch(from)
  end
  if valid_encoding
    am = ActionMap.merge2(h, {valid_encoding => :undef}) {|prefix, as1, as2|
      a1 = as1.empty? ? nil : ActionMap.unambiguous_action(as1)
      a2 = as2.empty? ? nil : ActionMap.unambiguous_action(as2)
      if !a2
        raise "invalid mapping: #{prefix}"
      end
      a1 || a2
    }
  else
    am = ActionMap.parse(h)
  end
  h.clear

  max_input = am.max_input_length
  defined_name = am.gennode(TRANSCODE_GENERATED_BYTES_CODE, TRANSCODE_GENERATED_WORDS_CODE, name)
  return defined_name, max_input
end

# Names of the transcoders defined so far, and their accumulated C definitions.
TRANSCODERS = []
TRANSCODE_GENERATED_TRANSCODER_CODE = ''.dup

# Generates only the lookup tables for a conversion (no rb_transcoder
# definition). Returns [map, tree_name, real_tree_name, max_input].
def transcode_tbl_only(from, to, map, valid_encoding=UnspecifiedValidEncoding)
  if VERBOSE_MODE
    if from.empty? || to.empty?
      STDERR.puts "converter for #{from.empty? ? to : from}"
    else
      STDERR.puts "converter from #{from} to #{to}"
    end
  end
  id_from = from.tr('^0-9A-Za-z', '_')
  id_to = to.tr('^0-9A-Za-z', '_')
  if from == "UTF-8"
    tree_name = "to_#{id_to}"
  elsif to == "UTF-8"
    tree_name = "from_#{id_from}"
  else
    tree_name = "from_#{id_from}_to_#{id_to}"
  end
  real_tree_name, max_input = transcode_compile_tree(tree_name, from, map, valid_encoding)
  return map, tree_name, real_tree_name, max_input
end

#
# call-seq:
#   transcode_tblgen(from_name, to_name, map [, valid_encoding_check [, ascii_compatibility]]) -> ''
#
# Returns an empty string just in case the result is used somewhere.
# Stores the actual product for later output with transcode_generated_code and
# transcode_register_code.
#
# The first argument is a string that will be used for the source (from) encoding.
# The second argument is a string that will be used for the target (to) encoding.
#
# The third argument is the actual data, a map represented as an array of two-element
# arrays. Each element of the array stands for one character being converted. The
# first element of each subarray is the code of the character in the source encoding,
# the second element of each subarray is the code of the character in the target encoding.
#
# Each code (i.e.
# byte sequence) is represented as a string of hexadecimal characters
# of even length. Codes can also be represented as integers (usually in the form 0x...),
# in which case they are interpreted as Unicode codepoints encoded in UTF-8. So as
# an example, 0x677E is the same as "E69DBE" (but somewhat easier to produce and check).
#
# In addition, the following symbols can also be used instead of actual codes in the
# second element of a subarray:
# :nomap (no mapping, just copy input to output), :nomap0 (same as :nomap, but low priority),
# :undef (input code undefined in the destination encoding),
# :invalid (input code is an invalid byte sequence in the source encoding),
# :func_ii, :func_si, :func_io, :func_so (conversion by function with specific call
# convention).
#
# The fourth argument specifies the overall structure of the encoding. For examples,
# see ValidEncoding below. This is used to cross-check the data in the third argument
# and to automatically add :undef and :invalid mappings where necessary.
#
# The fifth argument gives the ascii-compatibility of the transcoding. See
# rb_transcoder_asciicompat_type_t in transcode_data.h for details. In most
# cases, this argument can be left out.
#
def transcode_tblgen(from, to, map, valid_encoding=UnspecifiedValidEncoding,
                     ascii_compatibility='asciicompat_converter')
  map, tree_name, real_tree_name, max_input = transcode_tbl_only(from, to, map, valid_encoding)
  transcoder_name = "rb_#{tree_name}"
  TRANSCODERS << transcoder_name
  input_unit_length = UnitLength[from]
  # Longest output in bytes; symbol actions count as one byte.
  max_output = map.map {|k,v| String === v ? v.length/2 : 1 }.max
  transcoder_code = <<"End"
static const rb_transcoder
#{transcoder_name} = {
  #{c_esc from}, #{c_esc to}, #{real_tree_name},
  TRANSCODE_TABLE_INFO,
  #{input_unit_length}, /* input_unit_length */
  #{max_input}, /* max_input */
  #{max_output}, /* max_output */
  #{ascii_compatibility}, /* asciicompat_type */
  0, NULL, NULL, /* state_size, state_init, state_fini */
  NULL, NULL, NULL, NULL,
  NULL, NULL, NULL
};
End
  TRANSCODE_GENERATED_TRANSCODER_CODE << transcoder_code
  ''
end

# Emits the lookup tables of the ActionMap +am+ into the generated arrays.
# Returns an empty string, like transcode_tblgen.
def transcode_generate_node(am, name_hint=nil)
  STDERR.puts "converter for #{name_hint}" if VERBOSE_MODE
  am.gennode(TRANSCODE_GENERATED_BYTES_CODE, TRANSCODE_GENERATED_WORDS_CODE, name_hint)
  ''
end

# Returns all C code generated so far: the byte array, the word array, the
# TRANSCODE_TABLE_INFO macro and the rb_transcoder definitions.
def transcode_generated_code
  TRANSCODE_GENERATED_BYTES_CODE.to_s +
    TRANSCODE_GENERATED_WORDS_CODE.to_s +
    "\#define TRANSCODE_TABLE_INFO " +
    "#{OUTPUT_PREFIX}byte_array, #{TRANSCODE_GENERATED_BYTES_CODE.length}, " +
    "#{OUTPUT_PREFIX}word_array, #{TRANSCODE_GENERATED_WORDS_CODE.length}, " +
    "((int)sizeof(unsigned int))\n" +
    TRANSCODE_GENERATED_TRANSCODER_CODE
end

# Returns one rb_register_transcoder call per transcoder defined so far.
def transcode_register_code
  code = ''.dup
  TRANSCODERS.each {|transcoder_name|
    code << " rb_register_transcoder(&#{transcoder_name});\n"
  }
  code
end

# Input unit length in bytes per source encoding name; 1 unless listed.
UnitLength = {
  'UTF-16BE' => 2,
  'UTF-16LE' => 2,
  'UTF-32BE' => 4,
  'UTF-32LE' => 4,
}
UnitLength.default = 1

# Valid byte sequence patterns per encoding name (or generic label), in the
# pattern syntax accepted by ActionMap.parse_to_rects. Alternatives are
# separated by whitespace; the amount of whitespace is not significant.
ValidEncoding = {
  '1byte'        => '{00-ff}',
  '2byte'        => '{00-ff}{00-ff}',
  '4byte'        => '{00-ff}{00-ff}{00-ff}{00-ff}',
  'US-ASCII'     => '{00-7f}',
  'UTF-8'        => '{00-7f}
                     {c2-df}{80-bf}
                     e0{a0-bf}{80-bf}
                     {e1-ec}{80-bf}{80-bf}
                     ed{80-9f}{80-bf}
                     {ee-ef}{80-bf}{80-bf}
                     f0{90-bf}{80-bf}{80-bf}
                     {f1-f3}{80-bf}{80-bf}{80-bf}
                     f4{80-8f}{80-bf}{80-bf}',
  'UTF-16BE'     => '{00-d7,e0-ff}{00-ff}
                     {d8-db}{00-ff}{dc-df}{00-ff}',
  'UTF-16LE'     => '{00-ff}{00-d7,e0-ff}
                     {00-ff}{d8-db}{00-ff}{dc-df}',
  'UTF-32BE'     => '0000{00-d7,e0-ff}{00-ff}
                     00{01-10}{00-ff}{00-ff}',
  'UTF-32LE'     => '{00-ff}{00-d7,e0-ff}0000
                     {00-ff}{00-ff}{01-10}00',
  'EUC-JP'       => '{00-7f}
                     {a1-fe}{a1-fe}
                     8e{a1-fe}
                     8f{a1-fe}{a1-fe}',
  'CP51932'      => '{00-7f}
                     {a1-fe}{a1-fe}
                     8e{a1-fe}',
  'EUC-JIS-2004' => '{00-7f}
                     {a1-fe}{a1-fe}
                     8e{a1-fe}
                     8f{a1-fe}{a1-fe}',
  'Shift_JIS'    => '{00-7f}
                     {81-9f,e0-fc}{40-7e,80-fc}
                     {a1-df}',
  'EUC-KR'       => '{00-7f}
                     {a1-fe}{a1-fe}',
  'CP949'        => '{00-7f}
                     {81-fe}{41-5a,61-7a,81-fe}',
  'Big5'         => '{00-7f}
                     {81-fe}{40-7e,a1-fe}',
  'EUC-TW'       => '{00-7f}
                     {a1-fe}{a1-fe}
                     8e{a1-b0}{a1-fe}{a1-fe}',
  'GBK'          => '{00-80}
                     {81-fe}{40-7e,80-fe}',
  'GB18030'      => '{00-7f}
                     {81-fe}{40-7e,80-fe}
                     {81-fe}{30-39}{81-fe}{30-39}',
}

# Returns the pattern registered for +enc+; raises KeyError when there is none.
def ValidEncoding(enc)
  ValidEncoding.fetch(enc)
end

# Registers the valid byte pattern of +encoding+. +pattern_or_label+ is
# either a key of ValidEncoding (its pattern is reused) or a pattern.
# Raises ArgumentError when +encoding+ already has a different pattern.
def set_valid_byte_pattern(encoding, pattern_or_label)
  pattern =
    if ValidEncoding[pattern_or_label]
      ValidEncoding[pattern_or_label]
    else
      pattern_or_label
    end
  if ValidEncoding[encoding] and ValidEncoding[encoding]!=pattern
    raise ArgumentError, "trying to change valid byte pattern for encoding #{encoding} from #{ValidEncoding[encoding]} to #{pattern}"
  end
  ValidEncoding[encoding] = pattern
end

# the following may be used in different places, so keep them here for the moment
set_valid_byte_pattern 'ASCII-8BIT', '1byte'
set_valid_byte_pattern 'Windows-31J', 'Shift_JIS'
set_valid_byte_pattern 'eucJP-ms', 'EUC-JP'

# Returns the signature text (file name, length, checksum) that is embedded
# in the generated file to detect changed inputs.
def make_signature(filename, src)
  "src=#{filename.dump}, len=#{src.length}, checksum=#{src.sum}"
end

if __FILE__ == $0
  start_time = Time.now

  output_filename = nil
  verbose_mode = false
  force_mode = false

  op = OptionParser.new
  op.def_option("--help", "show help message") { puts op; exit 0 }
  op.def_option("--verbose", "verbose mode") { verbose_mode = true }
  op.def_option("--force", "force table generation") { force_mode = true }
  op.def_option("--output=FILE", "specify output file") {|arg| output_filename = arg }
  op.parse!

  VERBOSE_MODE = verbose_mode

  OUTPUT_FILENAME = output_filename
  OUTPUT_PREFIX = output_filename ? File.basename(output_filename)[/\A[A-Za-z0-9_]*/] : "".dup
  OUTPUT_PREFIX.sub!(/\A_+/, '')
  OUTPUT_PREFIX.sub!(/_*\z/, '_')

  TRANSCODE_GENERATED_BYTES_CODE = ArrayCode.new("unsigned char", "#{OUTPUT_PREFIX}byte_array")
  TRANSCODE_GENERATED_WORDS_CODE = ArrayCode.new("unsigned int", "#{OUTPUT_PREFIX}word_array")

  arg = ARGV.shift
  # Without an input file there is nothing to do; report it instead of
  # failing later with a TypeError from File.dirname(nil).
  unless arg
    STDERR.puts "missing input file"
    STDERR.puts op
    exit 1
  end
  $srcdir = File.dirname(arg)
  $:.unshift $srcdir unless $:.include? $srcdir
  src = File.read(arg)
  src.force_encoding("ascii-8bit") if src.respond_to? :force_encoding
  this_script = File.read(__FILE__)
  this_script.force_encoding("ascii-8bit") if this_script.respond_to? :force_encoding

  base_signature = "/* autogenerated. */\n".dup
  base_signature << "/* #{make_signature(File.basename(__FILE__), this_script)} */\n"
  base_signature << "/* #{make_signature(File.basename(arg), src)} */\n"

  # Skip regeneration when the signatures at the top of the existing output
  # still match this script, the input and every library it recorded.
  if !force_mode && output_filename && File.readable?(output_filename)
    # gets("") reads the first paragraph; it returns nil for an empty file.
    old_signature = File.open(output_filename) {|f| f.gets("").to_s.chomp }
    chk_signature = base_signature.dup
    old_signature.each_line {|line|
      if %r{/\* src="([0-9a-z_.-]+)",} =~ line
        name = $1
        next if name == File.basename(arg) || name == File.basename(__FILE__)
        path = File.join($srcdir, name)
        if File.readable? path
          chk_signature << "/* #{make_signature(name, File.read(path))} */\n"
        end
      end
    }
    if old_signature == chk_signature
      now = Time.now
      File.utime(now, now, output_filename)
      STDERR.puts "already up-to-date: #{output_filename}" if VERBOSE_MODE
      exit
    end
  end

  if VERBOSE_MODE
    if output_filename
      STDERR.puts "generating #{output_filename} ..."
    end
  end

  # Features loaded while the template runs are recorded in the signature.
  libs1 = $".dup
  if ERB.instance_method(:initialize).parameters.assoc(:key) # Ruby 2.6+
    erb = ERB.new(src, trim_mode: '%')
  else
    erb = ERB.new(src, nil, '%')
  end
  erb.filename = arg
  erb_result = erb.result(binding)
  libs2 = $".dup

  libs = libs2 - libs1
  lib_sigs = ''.dup
  libs.each {|lib|
    lib = File.basename(lib)
    path = File.join($srcdir, lib)
    if File.readable? path
      lib_sigs << "/* #{make_signature(lib, File.read(path))} */\n"
    end
  }

  result = ''.dup
  result << base_signature
  result << lib_sigs
  result << "\n"
  result << erb_result
  result << "\n"

  if output_filename
    # Write to a temporary name first, then rename over the target.
    new_filename = output_filename + ".new"
    FileUtils.mkdir_p(File.dirname(output_filename))
    File.open(new_filename, "wb") {|f| f << result }
    File.rename(new_filename, output_filename)
    tms = Process.times
    elapsed = Time.now - start_time
    STDERR.puts "done.  (#{'%.2f' % tms.utime}user #{'%.2f' % tms.stime}system #{'%.2f' % elapsed}elapsed)" if VERBOSE_MODE
  else
    print result
  end
end