1# 2# Copyright (c) 2001-2020, PostgreSQL Global Development Group 3# 4# src/backend/utils/mb/Unicode/convutils.pm 5 6package convutils; 7 8use strict; 9use warnings; 10 11use Carp; 12use Exporter 'import'; 13 14our @EXPORT = 15 qw( NONE TO_UNICODE FROM_UNICODE BOTH read_source print_conversion_tables); 16 17# Constants used in the 'direction' field of the character maps 18use constant { 19 NONE => 0, 20 TO_UNICODE => 1, 21 FROM_UNICODE => 2, 22 BOTH => 3 23}; 24 25####################################################################### 26# read_source - common routine to read source file 27# 28# fname ; input file name 29# 30sub read_source 31{ 32 my ($fname) = @_; 33 my @r; 34 35 open(my $in, '<', $fname) || die("cannot open $fname"); 36 37 while (<$in>) 38 { 39 next if (/^#/); 40 chop; 41 42 next if (/^$/); # Ignore empty lines 43 44 next if (/^0x([0-9A-F]+)\s+(#.*)$/); 45 46 # The Unicode source files have three columns 47 # 1: The "foreign" code (in hex) 48 # 2: Unicode code point (in hex) 49 # 3: Unicode name 50 if (!/^0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)\s+(#.*)$/) 51 { 52 print STDERR "READ ERROR at line $. in $fname: $_\n"; 53 exit; 54 } 55 my $out = { 56 code => hex($1), 57 ucs => hex($2), 58 comment => $4, 59 direction => BOTH, 60 f => $fname, 61 l => $. 62 }; 63 64 # Ignore pure ASCII mappings. PostgreSQL character conversion code 65 # never even passes these to the conversion code. 66 next if ($out->{code} < 0x80 || $out->{ucs} < 0x80); 67 68 push(@r, $out); 69 } 70 close($in); 71 72 return \@r; 73} 74 75################################################################## 76# print_conversion_tables - output mapping tables 77# 78# print_conversion_tables($this_script, $csname, \%charset) 79# 80# this_script - the name of the *caller script* of this feature 81# csname - character set name other than ucs 82# charset - ref to character set array 83# 84# Input character set array format: 85# 86# Each element in the character set array is a hash. Each hash has the following fields: 87# direction - BOTH, TO_UNICODE, or FROM_UNICODE (or NONE, to ignore the entry altogether) 88# ucs - Unicode code point 89# ucs_second - Second Unicode code point, if this is a "combined" character. 90# code - Byte sequence in the "other" character set, as an integer 91# comment - Text representation of the character 92# f - Source filename 93# l - Line number in source file 94# 95sub print_conversion_tables 96{ 97 my ($this_script, $csname, $charset) = @_; 98 99 print_conversion_tables_direction($this_script, $csname, FROM_UNICODE, 100 $charset); 101 print_conversion_tables_direction($this_script, $csname, TO_UNICODE, 102 $charset); 103 return; 104} 105 106############################################################################# 107# INTERNAL ROUTINES 108 109####################################################################### 110# print_conversion_tables_direction - write the whole content of C source of radix tree 111# 112# print_conversion_tables_direction($this_script, $csname, $direction, \%charset, $tblwidth) 113# 114# this_script - the name of the *caller script* of this feature 115# csname - character set name other than ucs 116# direction - desired direction, TO_UNICODE or FROM_UNICODE 117# charset - ref to character set array 118# 119sub print_conversion_tables_direction 120{ 121 my ($this_script, $csname, $direction, $charset) = @_; 122 123 my $fname; 124 my $tblname; 125 if ($direction == TO_UNICODE) 126 { 127 $fname = lc("${csname}_to_utf8.map"); 128 $tblname = lc("${csname}_to_unicode_tree"); 129 130 print "- Writing ${csname}=>UTF8 conversion table: $fname\n"; 131 } 132 else 133 { 134 $fname = lc("utf8_to_${csname}.map"); 135 $tblname = lc("${csname}_from_unicode_tree"); 136 137 print "- Writing UTF8=>${csname} conversion table: $fname\n"; 138 } 139 140 open(my $out, '>', $fname) || die("cannot open $fname"); 141 142 print $out "/* src/backend/utils/mb/Unicode/$fname */\n"; 143 print $out "/* This file is generated by $this_script */\n\n"; 144 145 # Collect regular, non-combined, mappings, and create the radix tree from them. 146 my $charmap = &make_charmap($out, $charset, $direction, 0); 147 print_radix_table($out, $tblname, $charmap); 148 149 # Collect combined characters, and create combined character table (if any) 150 my $charmap_combined = &make_charmap_combined($charset, $direction); 151 152 if (scalar @{$charmap_combined} > 0) 153 { 154 if ($direction == TO_UNICODE) 155 { 156 print_to_utf8_combined_map($out, $csname, $charmap_combined, 1); 157 } 158 else 159 { 160 print_from_utf8_combined_map($out, $csname, $charmap_combined, 1); 161 } 162 } 163 164 close($out); 165 return; 166} 167 168sub print_from_utf8_combined_map 169{ 170 my ($out, $charset, $table, $verbose) = @_; 171 172 my $last_comment = ""; 173 174 printf $out "\n/* Combined character map */\n"; 175 printf $out 176 "static const pg_utf_to_local_combined ULmap${charset}_combined[ %d ] = {", 177 scalar(@$table); 178 my $first = 1; 179 foreach my $i (sort { $a->{utf8} <=> $b->{utf8} } @$table) 180 { 181 print($out ",") if (!$first); 182 $first = 0; 183 print $out "\t/* $last_comment */" 184 if ($verbose && $last_comment ne ""); 185 186 printf $out "\n {0x%08x, 0x%08x, 0x%04x}", 187 $i->{utf8}, $i->{utf8_second}, $i->{code}; 188 if ($verbose >= 2) 189 { 190 $last_comment = 191 sprintf("%s:%d %s", $i->{f}, $i->{l}, $i->{comment}); 192 } 193 elsif ($verbose >= 1) 194 { 195 $last_comment = $i->{comment}; 196 } 197 } 198 print $out "\t/* $last_comment */" if ($verbose && $last_comment ne ""); 199 print $out "\n};\n"; 200 return; 201} 202 203sub print_to_utf8_combined_map 204{ 205 my ($out, $charset, $table, $verbose) = @_; 206 207 my $last_comment = ""; 208 209 printf $out "\n/* Combined character map */\n"; 210 printf $out 211 "static const pg_local_to_utf_combined LUmap${charset}_combined[ %d ] = {", 212 scalar(@$table); 213 214 my $first = 1; 215 foreach my $i (sort { $a->{code} <=> $b->{code} } @$table) 216 { 217 print($out ",") if (!$first); 218 $first = 0; 219 print $out "\t/* $last_comment */" 220 if ($verbose && $last_comment ne ""); 221 222 printf $out "\n {0x%04x, 0x%08x, 0x%08x}", 223 $i->{code}, $i->{utf8}, $i->{utf8_second}; 224 225 if ($verbose >= 2) 226 { 227 $last_comment = 228 sprintf("%s:%d %s", $i->{f}, $i->{l}, $i->{comment}); 229 } 230 elsif ($verbose >= 1) 231 { 232 $last_comment = $i->{comment}; 233 } 234 } 235 print $out "\t/* $last_comment */" if ($verbose && $last_comment ne ""); 236 print $out "\n};\n"; 237 return; 238} 239 240####################################################################### 241# print_radix_table(<output handle>, <table name>, <charmap hash ref>) 242# 243# Input: A hash, mapping an input character to an output character. 244# 245# Constructs a radix tree from the hash, and prints it out as a C-struct. 246# 247sub print_radix_table 248{ 249 my ($out, $tblname, $c) = @_; 250 251 ### 252 ### Build radix trees in memory, for 1-, 2-, 3- and 4-byte inputs. Each 253 ### radix tree is represented as a nested hash, each hash indexed by 254 ### input byte 255 ### 256 my %b1map; 257 my %b2map; 258 my %b3map; 259 my %b4map; 260 foreach my $in (keys %$c) 261 { 262 my $out = $c->{$in}; 263 264 if ($in <= 0xff) 265 { 266 $b1map{$in} = $out; 267 } 268 elsif ($in <= 0xffff) 269 { 270 my $b1 = $in >> 8; 271 my $b2 = $in & 0xff; 272 273 $b2map{$b1}{$b2} = $out; 274 } 275 elsif ($in <= 0xffffff) 276 { 277 my $b1 = $in >> 16; 278 my $b2 = ($in >> 8) & 0xff; 279 my $b3 = $in & 0xff; 280 281 $b3map{$b1}{$b2}{$b3} = $out; 282 } 283 elsif ($in <= 0xffffffff) 284 { 285 my $b1 = $in >> 24; 286 my $b2 = ($in >> 16) & 0xff; 287 my $b3 = ($in >> 8) & 0xff; 288 my $b4 = $in & 0xff; 289 290 $b4map{$b1}{$b2}{$b3}{$b4} = $out; 291 } 292 else 293 { 294 die sprintf("up to 4 byte code is supported: %x", $in); 295 } 296 } 297 298 my @segments; 299 300 ### 301 ### Build a linear list of "segments", from the nested hashes. 302 ### 303 ### Each segment is a lookup table, keyed by the next byte in the input. 304 ### The segments are written out physically to one big array in the final 305 ### step, but logically, they form a radix tree. Or rather, four radix 306 ### trees: one for 1-byte inputs, another for 2-byte inputs, 3-byte 307 ### inputs, and 4-byte inputs. 308 ### 309 ### Each segment is represented by a hash with following fields: 310 ### 311 ### comment => <string to output as a comment> 312 ### label => <label that can be used to refer to this segment from elsewhere> 313 ### values => <a hash, keyed by byte, 0-0xff> 314 ### 315 ### Entries in 'values' can be integers (for leaf-level segments), or 316 ### string labels, pointing to a segment with that label. Any missing 317 ### values are treated as zeros. If 'values' hash is missing altogether, 318 ### it's treated as all-zeros. 319 ### 320 ### Subsequent steps will enrich the segments with more fields. 321 ### 322 323 # Add the segments for the radix trees themselves. 324 push @segments, 325 build_segments_from_tree("Single byte table", "1-byte", 1, \%b1map); 326 push @segments, 327 build_segments_from_tree("Two byte table", "2-byte", 2, \%b2map); 328 push @segments, 329 build_segments_from_tree("Three byte table", "3-byte", 3, \%b3map); 330 push @segments, 331 build_segments_from_tree("Four byte table", "4-byte", 4, \%b4map); 332 333 ### 334 ### Find min and max index used in each level of each tree. 335 ### 336 ### These are stored separately, and we can then leave out the unused 337 ### parts of every segment. (When using the resulting tree, you must 338 ### check each input byte against the min and max.) 339 ### 340 my %min_idx; 341 my %max_idx; 342 foreach my $seg (@segments) 343 { 344 my $this_min = $min_idx{ $seg->{depth} }->{ $seg->{level} }; 345 my $this_max = $max_idx{ $seg->{depth} }->{ $seg->{level} }; 346 347 foreach my $i (keys %{ $seg->{values} }) 348 { 349 $this_min = $i if (!defined $this_min || $i < $this_min); 350 $this_max = $i if (!defined $this_max || $i > $this_max); 351 } 352 353 $min_idx{ $seg->{depth} }{ $seg->{level} } = $this_min; 354 $max_idx{ $seg->{depth} }{ $seg->{level} } = $this_max; 355 } 356 357 # Copy the mins and max's back to every segment, for convenience. 358 foreach my $seg (@segments) 359 { 360 $seg->{min_idx} = $min_idx{ $seg->{depth} }{ $seg->{level} }; 361 $seg->{max_idx} = $max_idx{ $seg->{depth} }{ $seg->{level} }; 362 } 363 364 ### 365 ### Prepend a dummy all-zeros map to the beginning. 366 ### 367 ### A 0 is an invalid value anywhere in the table, and this allows us to 368 ### point to 0 offset from any table, to get a 0 result. 369 ### 370 371 # Find the max range between min and max indexes in any of the segments. 372 my $widest_range = 0; 373 foreach my $seg (@segments) 374 { 375 my $this_range = $seg->{max_idx} - $seg->{min_idx}; 376 $widest_range = $this_range if ($this_range > $widest_range); 377 } 378 379 unshift @segments, 380 { 381 header => "Dummy map, for invalid values", 382 min_idx => 0, 383 max_idx => $widest_range 384 }; 385 386 ### 387 ### Eliminate overlapping zeros 388 ### 389 ### For each segment, if there are zero values at the end of, and there 390 ### are also zero values at the beginning of the next segment, we can 391 ### overlay the tail of this segment with the head of next segment, to 392 ### save space. 393 ### 394 ### To achieve that, we subtract the 'max_idx' of each segment with the 395 ### amount of zeros that can be overlaid. 396 ### 397 for (my $j = 0; $j < $#segments - 1; $j++) 398 { 399 my $seg = $segments[$j]; 400 my $nextseg = $segments[ $j + 1 ]; 401 402 # Count the number of zero values at the end of this segment. 403 my $this_trail_zeros = 0; 404 for ( 405 my $i = $seg->{max_idx}; 406 $i >= $seg->{min_idx} && !$seg->{values}->{$i}; 407 $i--) 408 { 409 $this_trail_zeros++; 410 } 411 412 # Count the number of zeros at the beginning of next segment. 413 my $next_lead_zeros = 0; 414 for ( 415 my $i = $nextseg->{min_idx}; 416 $i <= $nextseg->{max_idx} && !$nextseg->{values}->{$i}; 417 $i++) 418 { 419 $next_lead_zeros++; 420 } 421 422 # How many zeros in common? 423 my $overlaid_trail_zeros = 424 ($this_trail_zeros > $next_lead_zeros) 425 ? $next_lead_zeros 426 : $this_trail_zeros; 427 428 $seg->{overlaid_trail_zeros} = $overlaid_trail_zeros; 429 $seg->{max_idx} = $seg->{max_idx} - $overlaid_trail_zeros; 430 } 431 432 ### 433 ### Replace label references with real offsets. 434 ### 435 ### So far, the non-leaf segments have referred to other segments by 436 ### their labels. Replace them with numerical offsets from the beginning 437 ### of the final array. You cannot move, add, or remove segments after 438 ### this step, as that would invalidate the offsets calculated here! 439 ### 440 my $flatoff = 0; 441 my %segmap; 442 443 # First pass: assign offsets to each segment, and build hash 444 # of label => offset. 445 foreach my $seg (@segments) 446 { 447 $seg->{offset} = $flatoff; 448 $segmap{ $seg->{label} } = $flatoff; 449 $flatoff += $seg->{max_idx} - $seg->{min_idx} + 1; 450 } 451 my $tblsize = $flatoff; 452 453 # Second pass: look up the offset of each label reference in the hash. 454 foreach my $seg (@segments) 455 { 456 while (my ($i, $val) = each %{ $seg->{values} }) 457 { 458 if (!($val =~ /^[0-9,.E]+$/)) 459 { 460 my $segoff = $segmap{$val}; 461 if ($segoff) 462 { 463 $seg->{values}->{$i} = $segoff; 464 } 465 else 466 { 467 die "no segment with label $val"; 468 } 469 } 470 } 471 } 472 473 # Also look up the positions of the roots in the table. 474 my $b1root = $segmap{"1-byte"}; 475 my $b2root = $segmap{"2-byte"}; 476 my $b3root = $segmap{"3-byte"}; 477 my $b4root = $segmap{"4-byte"}; 478 479 # And the lower-upper values of each level in each radix tree. 480 my $b1_lower = $min_idx{1}{1}; 481 my $b1_upper = $max_idx{1}{1}; 482 483 my $b2_1_lower = $min_idx{2}{1}; 484 my $b2_1_upper = $max_idx{2}{1}; 485 my $b2_2_lower = $min_idx{2}{2}; 486 my $b2_2_upper = $max_idx{2}{2}; 487 488 my $b3_1_lower = $min_idx{3}{1}; 489 my $b3_1_upper = $max_idx{3}{1}; 490 my $b3_2_lower = $min_idx{3}{2}; 491 my $b3_2_upper = $max_idx{3}{2}; 492 my $b3_3_lower = $min_idx{3}{3}; 493 my $b3_3_upper = $max_idx{3}{3}; 494 495 my $b4_1_lower = $min_idx{4}{1}; 496 my $b4_1_upper = $max_idx{4}{1}; 497 my $b4_2_lower = $min_idx{4}{2}; 498 my $b4_2_upper = $max_idx{4}{2}; 499 my $b4_3_lower = $min_idx{4}{3}; 500 my $b4_3_upper = $max_idx{4}{3}; 501 my $b4_4_lower = $min_idx{4}{4}; 502 my $b4_4_upper = $max_idx{4}{4}; 503 504 ### 505 ### Find the maximum value in the whole table, to determine if we can 506 ### use uint16 or if we need to use uint32. 507 ### 508 my $max_val = 0; 509 foreach my $seg (@segments) 510 { 511 foreach my $val (values %{ $seg->{values} }) 512 { 513 $max_val = $val if ($val > $max_val); 514 } 515 } 516 517 my $datatype = ($max_val <= 0xffff) ? "uint16" : "uint32"; 518 519 # For formatting, determine how many values we can fit on a single 520 # line, and how wide each value needs to be to align nicely. 521 my $vals_per_line; 522 my $colwidth; 523 524 if ($max_val <= 0xffff) 525 { 526 $vals_per_line = 8; 527 $colwidth = 4; 528 } 529 elsif ($max_val <= 0xffffff) 530 { 531 $vals_per_line = 4; 532 $colwidth = 6; 533 } 534 else 535 { 536 $vals_per_line = 4; 537 $colwidth = 8; 538 } 539 540 ### 541 ### Print the struct and array. 542 ### 543 printf $out "static const $datatype ${tblname}_table[$tblsize];\n"; 544 printf $out "\n"; 545 printf $out "static const pg_mb_radix_tree $tblname =\n"; 546 printf $out "{\n"; 547 if ($datatype eq "uint16") 548 { 549 print $out " ${tblname}_table,\n"; 550 print $out " NULL, /* 32-bit table not used */\n"; 551 } 552 if ($datatype eq "uint32") 553 { 554 print $out " NULL, /* 16-bit table not used */\n"; 555 print $out " ${tblname}_table,\n"; 556 } 557 printf $out "\n"; 558 printf $out " 0x%04x, /* offset of table for 1-byte inputs */\n", 559 $b1root; 560 printf $out " 0x%02x, /* b1_lower */\n", $b1_lower; 561 printf $out " 0x%02x, /* b1_upper */\n", $b1_upper; 562 printf $out "\n"; 563 printf $out " 0x%04x, /* offset of table for 2-byte inputs */\n", 564 $b2root; 565 printf $out " 0x%02x, /* b2_1_lower */\n", $b2_1_lower; 566 printf $out " 0x%02x, /* b2_1_upper */\n", $b2_1_upper; 567 printf $out " 0x%02x, /* b2_2_lower */\n", $b2_2_lower; 568 printf $out " 0x%02x, /* b2_2_upper */\n", $b2_2_upper; 569 printf $out "\n"; 570 printf $out " 0x%04x, /* offset of table for 3-byte inputs */\n", 571 $b3root; 572 printf $out " 0x%02x, /* b3_1_lower */\n", $b3_1_lower; 573 printf $out " 0x%02x, /* b3_1_upper */\n", $b3_1_upper; 574 printf $out " 0x%02x, /* b3_2_lower */\n", $b3_2_lower; 575 printf $out " 0x%02x, /* b3_2_upper */\n", $b3_2_upper; 576 printf $out " 0x%02x, /* b3_3_lower */\n", $b3_3_lower; 577 printf $out " 0x%02x, /* b3_3_upper */\n", $b3_3_upper; 578 printf $out "\n"; 579 printf $out " 0x%04x, /* offset of table for 3-byte inputs */\n", 580 $b4root; 581 printf $out " 0x%02x, /* b4_1_lower */\n", $b4_1_lower; 582 printf $out " 0x%02x, /* b4_1_upper */\n", $b4_1_upper; 583 printf $out " 0x%02x, /* b4_2_lower */\n", $b4_2_lower; 584 printf $out " 0x%02x, /* b4_2_upper */\n", $b4_2_upper; 585 printf $out " 0x%02x, /* b4_3_lower */\n", $b4_3_lower; 586 printf $out " 0x%02x, /* b4_3_upper */\n", $b4_3_upper; 587 printf $out " 0x%02x, /* b4_4_lower */\n", $b4_4_lower; 588 printf $out " 0x%02x /* b4_4_upper */\n", $b4_4_upper; 589 print $out "};\n"; 590 print $out "\n"; 591 print $out "static const $datatype ${tblname}_table[$tblsize] =\n"; 592 print $out "{"; 593 my $off = 0; 594 595 foreach my $seg (@segments) 596 { 597 printf $out "\n"; 598 printf $out " /*** %s - offset 0x%05x ***/\n", $seg->{header}, $off; 599 printf $out "\n"; 600 601 for (my $i = $seg->{min_idx}; $i <= $seg->{max_idx};) 602 { 603 604 # Print the next line's worth of values. 605 # XXX pad to begin at a nice boundary 606 printf $out " /* %02x */ ", $i; 607 for (my $j = 0; 608 $j < $vals_per_line && $i <= $seg->{max_idx}; $j++) 609 { 610 my $val = $seg->{values}->{$i}; 611 612 printf $out " 0x%0*x", $colwidth, $val; 613 $off++; 614 if ($off != $tblsize) 615 { 616 print $out ","; 617 } 618 $i++; 619 } 620 print $out "\n"; 621 } 622 if ($seg->{overlaid_trail_zeros}) 623 { 624 printf $out 625 " /* $seg->{overlaid_trail_zeros} trailing zero values shared with next segment */\n"; 626 } 627 } 628 629 # Sanity check. 630 if ($off != $tblsize) { die "table size didn't match!"; } 631 632 print $out "};\n"; 633 return; 634} 635 636### 637sub build_segments_from_tree 638{ 639 my ($header, $rootlabel, $depth, $map) = @_; 640 641 my @segments; 642 643 if (%{$map}) 644 { 645 @segments = 646 build_segments_recurse($header, $rootlabel, "", 1, $depth, $map); 647 648 # Sort the segments into "breadth-first" order. Not strictly required, 649 # but makes the maps nicer to read. 650 @segments = 651 sort { $a->{level} cmp $b->{level} or $a->{path} cmp $b->{path} } 652 @segments; 653 } 654 655 return @segments; 656} 657 658### 659sub build_segments_recurse 660{ 661 my ($header, $label, $path, $level, $depth, $map) = @_; 662 663 my @segments; 664 665 if ($level == $depth) 666 { 667 push @segments, 668 { 669 header => $header . ", leaf: ${path}xx", 670 label => $label, 671 level => $level, 672 depth => $depth, 673 path => $path, 674 values => $map 675 }; 676 } 677 else 678 { 679 my %children; 680 681 while (my ($i, $val) = each %$map) 682 { 683 my $childpath = $path . sprintf("%02x", $i); 684 my $childlabel = "$depth-level-$level-$childpath"; 685 686 push @segments, 687 build_segments_recurse($header, $childlabel, $childpath, 688 $level + 1, $depth, $val); 689 $children{$i} = $childlabel; 690 } 691 692 push @segments, 693 { 694 header => $header . ", byte #$level: ${path}xx", 695 label => $label, 696 level => $level, 697 depth => $depth, 698 path => $path, 699 values => \%children 700 }; 701 } 702 return @segments; 703} 704 705####################################################################### 706# make_charmap - convert charset table to charmap hash 707# 708# make_charmap(\@charset, $direction) 709# charset - ref to charset table : see print_conversion_tables 710# direction - conversion direction 711# 712sub make_charmap 713{ 714 my ($out, $charset, $direction, $verbose) = @_; 715 716 croak "unacceptable direction : $direction" 717 if ($direction != TO_UNICODE && $direction != FROM_UNICODE); 718 719 # In verbose mode, print a large comment with the source and comment of 720 # each character 721 if ($verbose) 722 { 723 print $out "/*\n"; 724 print $out "<src> <dst> <file>:<lineno> <comment>\n"; 725 } 726 727 my %charmap; 728 foreach my $c (@$charset) 729 { 730 731 # combined characters are handled elsewhere 732 next if (defined $c->{ucs_second}); 733 734 next if ($c->{direction} != $direction && $c->{direction} != BOTH); 735 736 my ($src, $dst) = 737 $direction == TO_UNICODE 738 ? ($c->{code}, ucs2utf($c->{ucs})) 739 : (ucs2utf($c->{ucs}), $c->{code}); 740 741 # check for duplicate source codes 742 if (defined $charmap{$src}) 743 { 744 printf STDERR 745 "Error: duplicate source code on %s:%d: 0x%04x => 0x%04x, 0x%04x\n", 746 $c->{f}, $c->{l}, $src, $charmap{$src}, $dst; 747 exit; 748 } 749 $charmap{$src} = $dst; 750 751 if ($verbose) 752 { 753 printf $out "0x%04x 0x%04x %s:%d %s\n", $src, $dst, $c->{f}, 754 $c->{l}, $c->{comment}; 755 } 756 } 757 if ($verbose) 758 { 759 print $out "*/\n\n"; 760 } 761 762 return \%charmap; 763} 764 765####################################################################### 766# make_charmap_combined - convert charset table to charmap hash 767# with checking duplicate source code 768# 769# make_charmap_combined(\@charset, $direction) 770# charset - ref to charset table : see print_conversion_tables 771# direction - conversion direction 772# 773sub make_charmap_combined 774{ 775 my ($charset, $direction) = @_; 776 777 croak "unacceptable direction : $direction" 778 if ($direction != TO_UNICODE && $direction != FROM_UNICODE); 779 780 my @combined; 781 foreach my $c (@$charset) 782 { 783 next if ($c->{direction} != $direction && $c->{direction} != BOTH); 784 785 if (defined $c->{ucs_second}) 786 { 787 my $entry = { 788 utf8 => ucs2utf($c->{ucs}), 789 utf8_second => ucs2utf($c->{ucs_second}), 790 code => $c->{code}, 791 comment => $c->{comment}, 792 f => $c->{f}, 793 l => $c->{l} 794 }; 795 push @combined, $entry; 796 } 797 } 798 799 return \@combined; 800} 801 802####################################################################### 803# convert UCS-4 to UTF-8 804# 805sub ucs2utf 806{ 807 my ($ucs) = @_; 808 my $utf; 809 810 if ($ucs <= 0x007f) 811 { 812 $utf = $ucs; 813 } 814 elsif ($ucs > 0x007f && $ucs <= 0x07ff) 815 { 816 $utf = (($ucs & 0x003f) | 0x80) | ((($ucs >> 6) | 0xc0) << 8); 817 } 818 elsif ($ucs > 0x07ff && $ucs <= 0xffff) 819 { 820 $utf = 821 ((($ucs >> 12) | 0xe0) << 16) | 822 (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80); 823 } 824 else 825 { 826 $utf = 827 ((($ucs >> 18) | 0xf0) << 24) | 828 (((($ucs & 0x3ffff) >> 12) | 0x80) << 16) | 829 (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80); 830 } 831 return $utf; 832} 833 8341; 835