1# 2# Copyright (c) 2001-2018, PostgreSQL Global Development Group 3# 4# src/backend/utils/mb/Unicode/convutils.pm 5 6package convutils; 7 8use strict; 9 10use Carp; 11use Exporter 'import'; 12 13our @EXPORT = 14 qw( NONE TO_UNICODE FROM_UNICODE BOTH read_source print_conversion_tables); 15 16# Constants used in the 'direction' field of the character maps 17use constant { 18 NONE => 0, 19 TO_UNICODE => 1, 20 FROM_UNICODE => 2, 21 BOTH => 3 22}; 23 24####################################################################### 25# read_source - common routine to read source file 26# 27# fname ; input file name 28# 29sub read_source 30{ 31 my ($fname) = @_; 32 my @r; 33 34 open(my $in, '<', $fname) || die("cannot open $fname"); 35 36 while (<$in>) 37 { 38 next if (/^#/); 39 chop; 40 41 next if (/^$/); # Ignore empty lines 42 43 next if (/^0x([0-9A-F]+)\s+(#.*)$/); 44 45 # The Unicode source files have three columns 46 # 1: The "foreign" code (in hex) 47 # 2: Unicode code point (in hex) 48 # 3: Unicode name 49 if (!/^0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)\s+(#.*)$/) 50 { 51 print STDERR "READ ERROR at line $. in $fname: $_\n"; 52 exit; 53 } 54 my $out = { 55 code => hex($1), 56 ucs => hex($2), 57 comment => $4, 58 direction => BOTH, 59 f => $fname, 60 l => $. 61 }; 62 63 # Ignore pure ASCII mappings. PostgreSQL character conversion code 64 # never even passes these to the conversion code. 65 next if ($out->{code} < 0x80 || $out->{ucs} < 0x80); 66 67 push(@r, $out); 68 } 69 close($in); 70 71 return \@r; 72} 73 74################################################################## 75# print_conversion_tables - output mapping tables 76# 77# print_conversion_tables($this_script, $csname, \%charset) 78# 79# this_script - the name of the *caller script* of this feature 80# csname - character set name other than ucs 81# charset - ref to character set array 82# 83# Input character set array format: 84# 85# Each element in the character set array is a hash. Each hash has the following fields: 86# direction - BOTH, TO_UNICODE, or FROM_UNICODE (or NONE, to ignore the entry altogether) 87# ucs - Unicode code point 88# ucs_second - Second Unicode code point, if this is a "combined" character. 89# code - Byte sequence in the "other" character set, as an integer 90# comment - Text representation of the character 91# f - Source filename 92# l - Line number in source file 93# 94sub print_conversion_tables 95{ 96 my ($this_script, $csname, $charset) = @_; 97 98 print_conversion_tables_direction($this_script, $csname, FROM_UNICODE, 99 $charset); 100 print_conversion_tables_direction($this_script, $csname, TO_UNICODE, 101 $charset); 102 return; 103} 104 105############################################################################# 106# INTERNAL ROUTINES 107 108####################################################################### 109# print_conversion_tables_direction - write the whole content of C source of radix tree 110# 111# print_conversion_tables_direction($this_script, $csname, $direction, \%charset, $tblwidth) 112# 113# this_script - the name of the *caller script* of this feature 114# csname - character set name other than ucs 115# direction - desired direction, TO_UNICODE or FROM_UNICODE 116# charset - ref to character set array 117# 118sub print_conversion_tables_direction 119{ 120 my ($this_script, $csname, $direction, $charset) = @_; 121 122 my $fname; 123 my $tblname; 124 if ($direction == TO_UNICODE) 125 { 126 $fname = lc("${csname}_to_utf8.map"); 127 $tblname = lc("${csname}_to_unicode_tree"); 128 129 print "- Writing ${csname}=>UTF8 conversion table: $fname\n"; 130 } 131 else 132 { 133 $fname = lc("utf8_to_${csname}.map"); 134 $tblname = lc("${csname}_from_unicode_tree"); 135 136 print "- Writing UTF8=>${csname} conversion table: $fname\n"; 137 } 138 139 open(my $out, '>', $fname) || die("cannot open $fname"); 140 141 print $out "/* src/backend/utils/mb/Unicode/$fname */\n"; 142 print $out "/* This file is generated by $this_script */\n\n"; 143 144 # Collect regular, non-combined, mappings, and create the radix tree from them. 145 my $charmap = &make_charmap($out, $charset, $direction, 0); 146 print_radix_table($out, $tblname, $charmap); 147 148 # Collect combined characters, and create combined character table (if any) 149 my $charmap_combined = &make_charmap_combined($charset, $direction); 150 151 if (scalar @{$charmap_combined} > 0) 152 { 153 if ($direction == TO_UNICODE) 154 { 155 print_to_utf8_combined_map($out, $csname, $charmap_combined, 1); 156 } 157 else 158 { 159 print_from_utf8_combined_map($out, $csname, $charmap_combined, 1); 160 } 161 } 162 163 close($out); 164 return; 165} 166 167sub print_from_utf8_combined_map 168{ 169 my ($out, $charset, $table, $verbose) = @_; 170 171 my $last_comment = ""; 172 173 printf $out "\n/* Combined character map */\n"; 174 printf $out 175 "static const pg_utf_to_local_combined ULmap${charset}_combined[ %d ] = {", 176 scalar(@$table); 177 my $first = 1; 178 foreach my $i (sort { $a->{utf8} <=> $b->{utf8} } @$table) 179 { 180 print($out ",") if (!$first); 181 $first = 0; 182 print $out "\t/* $last_comment */" 183 if ($verbose && $last_comment ne ""); 184 185 printf $out "\n {0x%08x, 0x%08x, 0x%04x}", 186 $i->{utf8}, $i->{utf8_second}, $i->{code}; 187 if ($verbose >= 2) 188 { 189 $last_comment = 190 sprintf("%s:%d %s", $i->{f}, $i->{l}, $i->{comment}); 191 } 192 elsif ($verbose >= 1) 193 { 194 $last_comment = $i->{comment}; 195 } 196 } 197 print $out "\t/* $last_comment */" if ($verbose && $last_comment ne ""); 198 print $out "\n};\n"; 199 return; 200} 201 202sub print_to_utf8_combined_map 203{ 204 my ($out, $charset, $table, $verbose) = @_; 205 206 my $last_comment = ""; 207 208 printf $out "\n/* Combined character map */\n"; 209 printf $out 210 "static const pg_local_to_utf_combined LUmap${charset}_combined[ %d ] = {", 211 scalar(@$table); 212 213 my $first = 1; 214 foreach my $i (sort { $a->{code} <=> $b->{code} } @$table) 215 { 216 print($out ",") if (!$first); 217 $first = 0; 218 print $out "\t/* $last_comment */" 219 if ($verbose && $last_comment ne ""); 220 221 printf $out "\n {0x%04x, 0x%08x, 0x%08x}", 222 $i->{code}, $i->{utf8}, $i->{utf8_second}; 223 224 if ($verbose >= 2) 225 { 226 $last_comment = 227 sprintf("%s:%d %s", $i->{f}, $i->{l}, $i->{comment}); 228 } 229 elsif ($verbose >= 1) 230 { 231 $last_comment = $i->{comment}; 232 } 233 } 234 print $out "\t/* $last_comment */" if ($verbose && $last_comment ne ""); 235 print $out "\n};\n"; 236 return; 237} 238 239####################################################################### 240# print_radix_table(<output handle>, <table name>, <charmap hash ref>) 241# 242# Input: A hash, mapping an input character to an output character. 243# 244# Constructs a radix tree from the hash, and prints it out as a C-struct. 245# 246sub print_radix_table 247{ 248 my ($out, $tblname, $c) = @_; 249 250 ### 251 ### Build radix trees in memory, for 1-, 2-, 3- and 4-byte inputs. Each 252 ### radix tree is represented as a nested hash, each hash indexed by 253 ### input byte 254 ### 255 my %b1map; 256 my %b2map; 257 my %b3map; 258 my %b4map; 259 foreach my $in (keys %$c) 260 { 261 my $out = $c->{$in}; 262 263 if ($in <= 0xff) 264 { 265 $b1map{$in} = $out; 266 } 267 elsif ($in <= 0xffff) 268 { 269 my $b1 = $in >> 8; 270 my $b2 = $in & 0xff; 271 272 $b2map{$b1}{$b2} = $out; 273 } 274 elsif ($in <= 0xffffff) 275 { 276 my $b1 = $in >> 16; 277 my $b2 = ($in >> 8) & 0xff; 278 my $b3 = $in & 0xff; 279 280 $b3map{$b1}{$b2}{$b3} = $out; 281 } 282 elsif ($in <= 0xffffffff) 283 { 284 my $b1 = $in >> 24; 285 my $b2 = ($in >> 16) & 0xff; 286 my $b3 = ($in >> 8) & 0xff; 287 my $b4 = $in & 0xff; 288 289 $b4map{$b1}{$b2}{$b3}{$b4} = $out; 290 } 291 else 292 { 293 die sprintf("up to 4 byte code is supported: %x", $in); 294 } 295 } 296 297 my @segments; 298 299 ### 300 ### Build a linear list of "segments", from the nested hashes. 301 ### 302 ### Each segment is a lookup table, keyed by the next byte in the input. 303 ### The segments are written out physically to one big array in the final 304 ### step, but logically, they form a radix tree. Or rather, four radix 305 ### trees: one for 1-byte inputs, another for 2-byte inputs, 3-byte 306 ### inputs, and 4-byte inputs. 307 ### 308 ### Each segment is represented by a hash with following fields: 309 ### 310 ### comment => <string to output as a comment> 311 ### label => <label that can be used to refer to this segment from elsewhere> 312 ### values => <a hash, keyed by byte, 0-0xff> 313 ### 314 ### Entries in 'values' can be integers (for leaf-level segments), or 315 ### string labels, pointing to a segment with that label. Any missing 316 ### values are treated as zeros. If 'values' hash is missing altogether, 317 ### it's treated as all-zeros. 318 ### 319 ### Subsequent steps will enrich the segments with more fields. 320 ### 321 322 # Add the segments for the radix trees themselves. 323 push @segments, 324 build_segments_from_tree("Single byte table", "1-byte", 1, \%b1map); 325 push @segments, 326 build_segments_from_tree("Two byte table", "2-byte", 2, \%b2map); 327 push @segments, 328 build_segments_from_tree("Three byte table", "3-byte", 3, \%b3map); 329 push @segments, 330 build_segments_from_tree("Four byte table", "4-byte", 4, \%b4map); 331 332 ### 333 ### Find min and max index used in each level of each tree. 334 ### 335 ### These are stored separately, and we can then leave out the unused 336 ### parts of every segment. (When using the resulting tree, you must 337 ### check each input byte against the min and max.) 338 ### 339 my %min_idx; 340 my %max_idx; 341 foreach my $seg (@segments) 342 { 343 my $this_min = $min_idx{ $seg->{depth} }->{ $seg->{level} }; 344 my $this_max = $max_idx{ $seg->{depth} }->{ $seg->{level} }; 345 346 foreach my $i (keys %{ $seg->{values} }) 347 { 348 $this_min = $i if (!defined $this_min || $i < $this_min); 349 $this_max = $i if (!defined $this_max || $i > $this_max); 350 } 351 352 $min_idx{ $seg->{depth} }{ $seg->{level} } = $this_min; 353 $max_idx{ $seg->{depth} }{ $seg->{level} } = $this_max; 354 } 355 356 # Copy the mins and max's back to every segment, for convenience. 357 foreach my $seg (@segments) 358 { 359 $seg->{min_idx} = $min_idx{ $seg->{depth} }{ $seg->{level} }; 360 $seg->{max_idx} = $max_idx{ $seg->{depth} }{ $seg->{level} }; 361 } 362 363 ### 364 ### Prepend a dummy all-zeros map to the beginning. 365 ### 366 ### A 0 is an invalid value anywhere in the table, and this allows us to 367 ### point to 0 offset from any table, to get a 0 result. 368 ### 369 370 # Find the max range between min and max indexes in any of the segments. 371 my $widest_range = 0; 372 foreach my $seg (@segments) 373 { 374 my $this_range = $seg->{max_idx} - $seg->{min_idx}; 375 $widest_range = $this_range if ($this_range > $widest_range); 376 } 377 378 unshift @segments, 379 { 380 header => "Dummy map, for invalid values", 381 min_idx => 0, 382 max_idx => $widest_range 383 }; 384 385 ### 386 ### Eliminate overlapping zeros 387 ### 388 ### For each segment, if there are zero values at the end of, and there 389 ### are also zero values at the beginning of the next segment, we can 390 ### overlay the tail of this segment with the head of next segment, to 391 ### save space. 392 ### 393 ### To achieve that, we subtract the 'max_idx' of each segment with the 394 ### amount of zeros that can be overlaid. 395 ### 396 for (my $j = 0; $j < $#segments - 1; $j++) 397 { 398 my $seg = $segments[$j]; 399 my $nextseg = $segments[ $j + 1 ]; 400 401 # Count the number of zero values at the end of this segment. 402 my $this_trail_zeros = 0; 403 for ( 404 my $i = $seg->{max_idx}; 405 $i >= $seg->{min_idx} && !$seg->{values}->{$i}; 406 $i--) 407 { 408 $this_trail_zeros++; 409 } 410 411 # Count the number of zeros at the beginning of next segment. 412 my $next_lead_zeros = 0; 413 for ( 414 my $i = $nextseg->{min_idx}; 415 $i <= $nextseg->{max_idx} && !$nextseg->{values}->{$i}; 416 $i++) 417 { 418 $next_lead_zeros++; 419 } 420 421 # How many zeros in common? 422 my $overlaid_trail_zeros = 423 ($this_trail_zeros > $next_lead_zeros) 424 ? $next_lead_zeros 425 : $this_trail_zeros; 426 427 $seg->{overlaid_trail_zeros} = $overlaid_trail_zeros; 428 $seg->{max_idx} = $seg->{max_idx} - $overlaid_trail_zeros; 429 } 430 431 ### 432 ### Replace label references with real offsets. 433 ### 434 ### So far, the non-leaf segments have referred to other segments by 435 ### their labels. Replace them with numerical offsets from the beginning 436 ### of the final array. You cannot move, add, or remove segments after 437 ### this step, as that would invalidate the offsets calculated here! 438 ### 439 my $flatoff = 0; 440 my %segmap; 441 442 # First pass: assign offsets to each segment, and build hash 443 # of label => offset. 444 foreach my $seg (@segments) 445 { 446 $seg->{offset} = $flatoff; 447 $segmap{ $seg->{label} } = $flatoff; 448 $flatoff += $seg->{max_idx} - $seg->{min_idx} + 1; 449 } 450 my $tblsize = $flatoff; 451 452 # Second pass: look up the offset of each label reference in the hash. 453 foreach my $seg (@segments) 454 { 455 while (my ($i, $val) = each %{ $seg->{values} }) 456 { 457 if (!($val =~ /^[0-9,.E]+$/)) 458 { 459 my $segoff = $segmap{$val}; 460 if ($segoff) 461 { 462 $seg->{values}->{$i} = $segoff; 463 } 464 else 465 { 466 die "no segment with label $val"; 467 } 468 } 469 } 470 } 471 472 # Also look up the positions of the roots in the table. 473 my $b1root = $segmap{"1-byte"}; 474 my $b2root = $segmap{"2-byte"}; 475 my $b3root = $segmap{"3-byte"}; 476 my $b4root = $segmap{"4-byte"}; 477 478 # And the lower-upper values of each level in each radix tree. 479 my $b1_lower = $min_idx{1}{1}; 480 my $b1_upper = $max_idx{1}{1}; 481 482 my $b2_1_lower = $min_idx{2}{1}; 483 my $b2_1_upper = $max_idx{2}{1}; 484 my $b2_2_lower = $min_idx{2}{2}; 485 my $b2_2_upper = $max_idx{2}{2}; 486 487 my $b3_1_lower = $min_idx{3}{1}; 488 my $b3_1_upper = $max_idx{3}{1}; 489 my $b3_2_lower = $min_idx{3}{2}; 490 my $b3_2_upper = $max_idx{3}{2}; 491 my $b3_3_lower = $min_idx{3}{3}; 492 my $b3_3_upper = $max_idx{3}{3}; 493 494 my $b4_1_lower = $min_idx{4}{1}; 495 my $b4_1_upper = $max_idx{4}{1}; 496 my $b4_2_lower = $min_idx{4}{2}; 497 my $b4_2_upper = $max_idx{4}{2}; 498 my $b4_3_lower = $min_idx{4}{3}; 499 my $b4_3_upper = $max_idx{4}{3}; 500 my $b4_4_lower = $min_idx{4}{4}; 501 my $b4_4_upper = $max_idx{4}{4}; 502 503 ### 504 ### Find the maximum value in the whole table, to determine if we can 505 ### use uint16 or if we need to use uint32. 506 ### 507 my $max_val = 0; 508 foreach my $seg (@segments) 509 { 510 foreach my $val (values %{ $seg->{values} }) 511 { 512 $max_val = $val if ($val > $max_val); 513 } 514 } 515 516 my $datatype = ($max_val <= 0xffff) ? "uint16" : "uint32"; 517 518 # For formatting, determine how many values we can fit on a single 519 # line, and how wide each value needs to be to align nicely. 520 my $vals_per_line; 521 my $colwidth; 522 523 if ($max_val <= 0xffff) 524 { 525 $vals_per_line = 8; 526 $colwidth = 4; 527 } 528 elsif ($max_val <= 0xffffff) 529 { 530 $vals_per_line = 4; 531 $colwidth = 6; 532 } 533 else 534 { 535 $vals_per_line = 4; 536 $colwidth = 8; 537 } 538 539 ### 540 ### Print the struct and array. 541 ### 542 printf $out "static const $datatype ${tblname}_table[$tblsize];\n"; 543 printf $out "\n"; 544 printf $out "static const pg_mb_radix_tree $tblname =\n"; 545 printf $out "{\n"; 546 if ($datatype eq "uint16") 547 { 548 print $out " ${tblname}_table,\n"; 549 print $out " NULL, /* 32-bit table not used */\n"; 550 } 551 if ($datatype eq "uint32") 552 { 553 print $out " NULL, /* 16-bit table not used */\n"; 554 print $out " ${tblname}_table,\n"; 555 } 556 printf $out "\n"; 557 printf $out " 0x%04x, /* offset of table for 1-byte inputs */\n", 558 $b1root; 559 printf $out " 0x%02x, /* b1_lower */\n", $b1_lower; 560 printf $out " 0x%02x, /* b1_upper */\n", $b1_upper; 561 printf $out "\n"; 562 printf $out " 0x%04x, /* offset of table for 2-byte inputs */\n", 563 $b2root; 564 printf $out " 0x%02x, /* b2_1_lower */\n", $b2_1_lower; 565 printf $out " 0x%02x, /* b2_1_upper */\n", $b2_1_upper; 566 printf $out " 0x%02x, /* b2_2_lower */\n", $b2_2_lower; 567 printf $out " 0x%02x, /* b2_2_upper */\n", $b2_2_upper; 568 printf $out "\n"; 569 printf $out " 0x%04x, /* offset of table for 3-byte inputs */\n", 570 $b3root; 571 printf $out " 0x%02x, /* b3_1_lower */\n", $b3_1_lower; 572 printf $out " 0x%02x, /* b3_1_upper */\n", $b3_1_upper; 573 printf $out " 0x%02x, /* b3_2_lower */\n", $b3_2_lower; 574 printf $out " 0x%02x, /* b3_2_upper */\n", $b3_2_upper; 575 printf $out " 0x%02x, /* b3_3_lower */\n", $b3_3_lower; 576 printf $out " 0x%02x, /* b3_3_upper */\n", $b3_3_upper; 577 printf $out "\n"; 578 printf $out " 0x%04x, /* offset of table for 3-byte inputs */\n", 579 $b4root; 580 printf $out " 0x%02x, /* b4_1_lower */\n", $b4_1_lower; 581 printf $out " 0x%02x, /* b4_1_upper */\n", $b4_1_upper; 582 printf $out " 0x%02x, /* b4_2_lower */\n", $b4_2_lower; 583 printf $out " 0x%02x, /* b4_2_upper */\n", $b4_2_upper; 584 printf $out " 0x%02x, /* b4_3_lower */\n", $b4_3_lower; 585 printf $out " 0x%02x, /* b4_3_upper */\n", $b4_3_upper; 586 printf $out " 0x%02x, /* b4_4_lower */\n", $b4_4_lower; 587 printf $out " 0x%02x /* b4_4_upper */\n", $b4_4_upper; 588 print $out "};\n"; 589 print $out "\n"; 590 print $out "static const $datatype ${tblname}_table[$tblsize] =\n"; 591 print $out "{"; 592 my $off = 0; 593 594 foreach my $seg (@segments) 595 { 596 printf $out "\n"; 597 printf $out " /*** %s - offset 0x%05x ***/\n", $seg->{header}, $off; 598 printf $out "\n"; 599 600 for (my $i = $seg->{min_idx}; $i <= $seg->{max_idx};) 601 { 602 603 # Print the next line's worth of values. 604 # XXX pad to begin at a nice boundary 605 printf $out " /* %02x */ ", $i; 606 for (my $j = 0; 607 $j < $vals_per_line && $i <= $seg->{max_idx}; $j++) 608 { 609 my $val = $seg->{values}->{$i}; 610 611 printf $out " 0x%0*x", $colwidth, $val; 612 $off++; 613 if ($off != $tblsize) 614 { 615 print $out ","; 616 } 617 $i++; 618 } 619 print $out "\n"; 620 } 621 if ($seg->{overlaid_trail_zeros}) 622 { 623 printf $out 624 " /* $seg->{overlaid_trail_zeros} trailing zero values shared with next segment */\n"; 625 } 626 } 627 628 # Sanity check. 629 if ($off != $tblsize) { die "table size didn't match!"; } 630 631 print $out "};\n"; 632 return; 633} 634 635### 636sub build_segments_from_tree 637{ 638 my ($header, $rootlabel, $depth, $map) = @_; 639 640 my @segments; 641 642 if (%{$map}) 643 { 644 @segments = 645 build_segments_recurse($header, $rootlabel, "", 1, $depth, $map); 646 647 # Sort the segments into "breadth-first" order. Not strictly required, 648 # but makes the maps nicer to read. 649 @segments = 650 sort { $a->{level} cmp $b->{level} or $a->{path} cmp $b->{path} } 651 @segments; 652 } 653 654 return @segments; 655} 656 657### 658sub build_segments_recurse 659{ 660 my ($header, $label, $path, $level, $depth, $map) = @_; 661 662 my @segments; 663 664 if ($level == $depth) 665 { 666 push @segments, 667 { 668 header => $header . ", leaf: ${path}xx", 669 label => $label, 670 level => $level, 671 depth => $depth, 672 path => $path, 673 values => $map 674 }; 675 } 676 else 677 { 678 my %children; 679 680 while (my ($i, $val) = each %$map) 681 { 682 my $childpath = $path . sprintf("%02x", $i); 683 my $childlabel = "$depth-level-$level-$childpath"; 684 685 push @segments, 686 build_segments_recurse($header, $childlabel, $childpath, 687 $level + 1, $depth, $val); 688 $children{$i} = $childlabel; 689 } 690 691 push @segments, 692 { 693 header => $header . ", byte #$level: ${path}xx", 694 label => $label, 695 level => $level, 696 depth => $depth, 697 path => $path, 698 values => \%children 699 }; 700 } 701 return @segments; 702} 703 704####################################################################### 705# make_charmap - convert charset table to charmap hash 706# 707# make_charmap(\@charset, $direction) 708# charset - ref to charset table : see print_conversion_tables 709# direction - conversion direction 710# 711sub make_charmap 712{ 713 my ($out, $charset, $direction, $verbose) = @_; 714 715 croak "unacceptable direction : $direction" 716 if ($direction != TO_UNICODE && $direction != FROM_UNICODE); 717 718 # In verbose mode, print a large comment with the source and comment of 719 # each character 720 if ($verbose) 721 { 722 print $out "/*\n"; 723 print $out "<src> <dst> <file>:<lineno> <comment>\n"; 724 } 725 726 my %charmap; 727 foreach my $c (@$charset) 728 { 729 730 # combined characters are handled elsewhere 731 next if (defined $c->{ucs_second}); 732 733 next if ($c->{direction} != $direction && $c->{direction} != BOTH); 734 735 my ($src, $dst) = 736 $direction == TO_UNICODE 737 ? ($c->{code}, ucs2utf($c->{ucs})) 738 : (ucs2utf($c->{ucs}), $c->{code}); 739 740 # check for duplicate source codes 741 if (defined $charmap{$src}) 742 { 743 printf STDERR 744 "Error: duplicate source code on %s:%d: 0x%04x => 0x%04x, 0x%04x\n", 745 $c->{f}, $c->{l}, $src, $charmap{$src}, $dst; 746 exit; 747 } 748 $charmap{$src} = $dst; 749 750 if ($verbose) 751 { 752 printf $out "0x%04x 0x%04x %s:%d %s\n", $src, $dst, $c->{f}, 753 $c->{l}, $c->{comment}; 754 } 755 } 756 if ($verbose) 757 { 758 print $out "*/\n\n"; 759 } 760 761 return \%charmap; 762} 763 764####################################################################### 765# make_charmap_combined - convert charset table to charmap hash 766# with checking duplicate source code 767# 768# make_charmap_combined(\@charset, $direction) 769# charset - ref to charset table : see print_conversion_tables 770# direction - conversion direction 771# 772sub make_charmap_combined 773{ 774 my ($charset, $direction) = @_; 775 776 croak "unacceptable direction : $direction" 777 if ($direction != TO_UNICODE && $direction != FROM_UNICODE); 778 779 my @combined; 780 foreach my $c (@$charset) 781 { 782 next if ($c->{direction} != $direction && $c->{direction} != BOTH); 783 784 if (defined $c->{ucs_second}) 785 { 786 my $entry = { 787 utf8 => ucs2utf($c->{ucs}), 788 utf8_second => ucs2utf($c->{ucs_second}), 789 code => $c->{code}, 790 comment => $c->{comment}, 791 f => $c->{f}, 792 l => $c->{l} 793 }; 794 push @combined, $entry; 795 } 796 } 797 798 return \@combined; 799} 800 801####################################################################### 802# convert UCS-4 to UTF-8 803# 804sub ucs2utf 805{ 806 my ($ucs) = @_; 807 my $utf; 808 809 if ($ucs <= 0x007f) 810 { 811 $utf = $ucs; 812 } 813 elsif ($ucs > 0x007f && $ucs <= 0x07ff) 814 { 815 $utf = (($ucs & 0x003f) | 0x80) | ((($ucs >> 6) | 0xc0) << 8); 816 } 817 elsif ($ucs > 0x07ff && $ucs <= 0xffff) 818 { 819 $utf = 820 ((($ucs >> 12) | 0xe0) << 16) | 821 (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80); 822 } 823 else 824 { 825 $utf = 826 ((($ucs >> 18) | 0xf0) << 24) | 827 (((($ucs & 0x3ffff) >> 12) | 0x80) << 16) | 828 (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80); 829 } 830 return $utf; 831} 832 8331; 834