1# 2# Copyright (c) 2001-2017, PostgreSQL Global Development Group 3# 4# src/backend/utils/mb/Unicode/convutils.pm 5 6package convutils; 7 8use strict; 9 10use Exporter 'import'; 11 12our @EXPORT = 13 qw( NONE TO_UNICODE FROM_UNICODE BOTH read_source print_conversion_tables); 14 15# Constants used in the 'direction' field of the character maps 16use constant { 17 NONE => 0, 18 TO_UNICODE => 1, 19 FROM_UNICODE => 2, 20 BOTH => 3 }; 21 22####################################################################### 23# read_source - common routine to read source file 24# 25# fname ; input file name 26# 27sub read_source 28{ 29 my ($fname) = @_; 30 my @r; 31 32 open(my $in, '<', $fname) || die("cannot open $fname"); 33 34 while (<$in>) 35 { 36 next if (/^#/); 37 chop; 38 39 next if (/^$/); # Ignore empty lines 40 41 next if (/^0x([0-9A-F]+)\s+(#.*)$/); 42 43 # The Unicode source files have three columns 44 # 1: The "foreign" code (in hex) 45 # 2: Unicode code point (in hex) 46 # 3: Unicode name 47 if (!/^0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)\s+(#.*)$/) 48 { 49 print STDERR "READ ERROR at line $. in $fname: $_\n"; 50 exit; 51 } 52 my $out = { 53 code => hex($1), 54 ucs => hex($2), 55 comment => $4, 56 direction => BOTH, 57 f => $fname, 58 l => $. }; 59 60 # Ignore pure ASCII mappings. PostgreSQL character conversion code 61 # never even passes these to the conversion code. 62 next if ($out->{code} < 0x80 || $out->{ucs} < 0x80); 63 64 push(@r, $out); 65 } 66 close($in); 67 68 return \@r; 69} 70 71################################################################## 72# print_conversion_tables - output mapping tables 73# 74# print_conversion_tables($this_script, $csname, \%charset) 75# 76# this_script - the name of the *caller script* of this feature 77# csname - character set name other than ucs 78# charset - ref to character set array 79# 80# Input character set array format: 81# 82# Each element in the character set array is a hash. Each hash has the following fields: 83# direction - BOTH, TO_UNICODE, or FROM_UNICODE (or NONE, to ignore the entry altogether) 84# ucs - Unicode code point 85# ucs_second - Second Unicode code point, if this is a "combined" character. 86# code - Byte sequence in the "other" character set, as an integer 87# comment - Text representation of the character 88# f - Source filename 89# l - Line number in source file 90# 91sub print_conversion_tables 92{ 93 my ($this_script, $csname, $charset) = @_; 94 95 print_conversion_tables_direction($this_script, $csname, FROM_UNICODE, 96 $charset); 97 print_conversion_tables_direction($this_script, $csname, TO_UNICODE, 98 $charset); 99} 100 101############################################################################# 102# INTERNAL ROUTINES 103 104####################################################################### 105# print_conversion_tables_direction - write the whole content of C source of radix tree 106# 107# print_conversion_tables_direction($this_script, $csname, $direction, \%charset, $tblwidth) 108# 109# this_script - the name of the *caller script* of this feature 110# csname - character set name other than ucs 111# direction - desired direction, TO_UNICODE or FROM_UNICODE 112# charset - ref to character set array 113# 114sub print_conversion_tables_direction 115{ 116 my ($this_script, $csname, $direction, $charset) = @_; 117 118 my $fname; 119 my $tblname; 120 if ($direction == TO_UNICODE) 121 { 122 $fname = lc("${csname}_to_utf8.map"); 123 $tblname = lc("${csname}_to_unicode_tree"); 124 125 print "- Writing ${csname}=>UTF8 conversion table: $fname\n"; 126 } 127 else 128 { 129 $fname = lc("utf8_to_${csname}.map"); 130 $tblname = lc("${csname}_from_unicode_tree"); 131 132 print "- Writing UTF8=>${csname} conversion table: $fname\n"; 133 } 134 135 open(my $out, '>', $fname) || die("cannot open $fname"); 136 137 print $out "/* src/backend/utils/mb/Unicode/$fname */\n"; 138 print $out "/* This file is generated by $this_script */\n\n"; 139 140# Collect regular, non-combined, mappings, and create the radix tree from them. 141 my $charmap = &make_charmap($out, $charset, $direction, 0); 142 print_radix_table($out, $tblname, $charmap); 143 144 # Collect combined characters, and create combined character table (if any) 145 my $charmap_combined = &make_charmap_combined($charset, $direction); 146 147 if (scalar @{$charmap_combined} > 0) 148 { 149 if ($direction == TO_UNICODE) 150 { 151 print_to_utf8_combined_map($out, $csname, $charmap_combined, 1); 152 } 153 else 154 { 155 print_from_utf8_combined_map($out, $csname, $charmap_combined, 1); 156 } 157 } 158 159 close($out); 160} 161 162sub print_from_utf8_combined_map 163{ 164 my ($out, $charset, $table, $verbose) = @_; 165 166 my $last_comment = ""; 167 168 printf $out "\n/* Combined character map */\n"; 169 printf $out 170"static const pg_utf_to_local_combined ULmap${charset}_combined[ %d ] = {", 171 scalar(@$table); 172 my $first = 1; 173 foreach my $i (sort { $a->{utf8} <=> $b->{utf8} } @$table) 174 { 175 print($out ",") if (!$first); 176 $first = 0; 177 print $out "\t/* $last_comment */" 178 if ($verbose && $last_comment ne ""); 179 180 printf $out "\n {0x%08x, 0x%08x, 0x%04x}", 181 $i->{utf8}, $i->{utf8_second}, $i->{code}; 182 if ($verbose >= 2) 183 { 184 $last_comment = 185 sprintf("%s:%d %s", $i->{f}, $i->{l}, $i->{comment}); 186 } 187 elsif ($verbose >= 1) 188 { 189 $last_comment = $i->{comment}; 190 } 191 } 192 print $out "\t/* $last_comment */" if ($verbose && $last_comment ne ""); 193 print $out "\n};\n"; 194} 195 196sub print_to_utf8_combined_map 197{ 198 my ($out, $charset, $table, $verbose) = @_; 199 200 my $last_comment = ""; 201 202 printf $out "\n/* Combined character map */\n"; 203 printf $out 204"static const pg_local_to_utf_combined LUmap${charset}_combined[ %d ] = {", 205 scalar(@$table); 206 207 my $first = 1; 208 foreach my $i (sort { $a->{code} <=> $b->{code} } @$table) 209 { 210 print($out ",") if (!$first); 211 $first = 0; 212 print $out "\t/* $last_comment */" 213 if ($verbose && $last_comment ne ""); 214 215 printf $out "\n {0x%04x, 0x%08x, 0x%08x}", 216 $i->{code}, $i->{utf8}, $i->{utf8_second}; 217 218 if ($verbose >= 2) 219 { 220 $last_comment = 221 sprintf("%s:%d %s", $i->{f}, $i->{l}, $i->{comment}); 222 } 223 elsif ($verbose >= 1) 224 { 225 $last_comment = $i->{comment}; 226 } 227 } 228 print $out "\t/* $last_comment */" if ($verbose && $last_comment ne ""); 229 print $out "\n};\n"; 230} 231 232####################################################################### 233# print_radix_table(<output handle>, <table name>, <charmap hash ref>) 234# 235# Input: A hash, mapping an input character to an output character. 236# 237# Constructs a radix tree from the hash, and prints it out as a C-struct. 238# 239sub print_radix_table 240{ 241 my ($out, $tblname, $c) = @_; 242 243 ### 244 ### Build radix trees in memory, for 1-, 2-, 3- and 4-byte inputs. Each 245 ### radix tree is represented as a nested hash, each hash indexed by 246 ### input byte 247 ### 248 my %b1map; 249 my %b2map; 250 my %b3map; 251 my %b4map; 252 foreach my $in (keys %$c) 253 { 254 my $out = $c->{$in}; 255 256 if ($in < 0x100) 257 { 258 $b1map{$in} = $out; 259 } 260 elsif ($in < 0x10000) 261 { 262 my $b1 = $in >> 8; 263 my $b2 = $in & 0xff; 264 265 $b2map{$b1}{$b2} = $out; 266 } 267 elsif ($in < 0x1000000) 268 { 269 my $b1 = $in >> 16; 270 my $b2 = ($in >> 8) & 0xff; 271 my $b3 = $in & 0xff; 272 273 $b3map{$b1}{$b2}{$b3} = $out; 274 } 275 elsif ($in < 0x100000000) 276 { 277 my $b1 = $in >> 24; 278 my $b2 = ($in >> 16) & 0xff; 279 my $b3 = ($in >> 8) & 0xff; 280 my $b4 = $in & 0xff; 281 282 $b4map{$b1}{$b2}{$b3}{$b4} = $out; 283 } 284 else 285 { 286 die sprintf("up to 4 byte code is supported: %x", $in); 287 } 288 } 289 290 my @segments; 291 292 ### 293 ### Build a linear list of "segments", from the nested hashes. 294 ### 295 ### Each segment is a lookup table, keyed by the next byte in the input. 296 ### The segments are written out physically to one big array in the final 297 ### step, but logically, they form a radix tree. Or rather, four radix 298 ### trees: one for 1-byte inputs, another for 2-byte inputs, 3-byte 299 ### inputs, and 4-byte inputs. 300 ### 301 ### Each segment is represented by a hash with following fields: 302 ### 303 ### comment => <string to output as a comment> 304 ### label => <label that can be used to refer to this segment from elsewhere> 305 ### values => <a hash, keyed by byte, 0-0xff> 306 ### 307 ### Entries in 'values' can be integers (for leaf-level segments), or 308 ### string labels, pointing to a segment with that label. Any missing 309 ### values are treated as zeros. If 'values' hash is missing altogether, 310 ### it's treated as all-zeros. 311 ### 312 ### Subsequent steps will enrich the segments with more fields. 313 ### 314 315 # Add the segments for the radix trees themselves. 316 push @segments, 317 build_segments_from_tree("Single byte table", "1-byte", 1, \%b1map); 318 push @segments, 319 build_segments_from_tree("Two byte table", "2-byte", 2, \%b2map); 320 push @segments, 321 build_segments_from_tree("Three byte table", "3-byte", 3, \%b3map); 322 push @segments, 323 build_segments_from_tree("Four byte table", "4-byte", 4, \%b4map); 324 325 ### 326 ### Find min and max index used in each level of each tree. 327 ### 328 ### These are stored separately, and we can then leave out the unused 329 ### parts of every segment. (When using the resulting tree, you must 330 ### check each input byte against the min and max.) 331 ### 332 my %min_idx; 333 my %max_idx; 334 foreach my $seg (@segments) 335 { 336 my $this_min = $min_idx{ $seg->{depth} }->{ $seg->{level} }; 337 my $this_max = $max_idx{ $seg->{depth} }->{ $seg->{level} }; 338 339 foreach my $i (keys %{ $seg->{values} }) 340 { 341 $this_min = $i if (!defined $this_min || $i < $this_min); 342 $this_max = $i if (!defined $this_max || $i > $this_max); 343 } 344 345 $min_idx{ $seg->{depth} }{ $seg->{level} } = $this_min; 346 $max_idx{ $seg->{depth} }{ $seg->{level} } = $this_max; 347 } 348 349 # Copy the mins and max's back to every segment, for convenience. 350 foreach my $seg (@segments) 351 { 352 $seg->{min_idx} = $min_idx{ $seg->{depth} }{ $seg->{level} }; 353 $seg->{max_idx} = $max_idx{ $seg->{depth} }{ $seg->{level} }; 354 } 355 356 ### 357 ### Prepend a dummy all-zeros map to the beginning. 358 ### 359 ### A 0 is an invalid value anywhere in the table, and this allows us to 360 ### point to 0 offset from any table, to get a 0 result. 361 ### 362 363 # Find the max range between min and max indexes in any of the segments. 364 my $widest_range = 0; 365 foreach my $seg (@segments) 366 { 367 my $this_range = $seg->{max_idx} - $seg->{min_idx}; 368 $widest_range = $this_range if ($this_range > $widest_range); 369 } 370 371 unshift @segments, 372 { header => "Dummy map, for invalid values", 373 min_idx => 0, 374 max_idx => $widest_range }; 375 376 ### 377 ### Eliminate overlapping zeros 378 ### 379 ### For each segment, if there are zero values at the end of, and there 380 ### are also zero values at the beginning of the next segment, we can 381 ### overlay the tail of this segment with the head of next segment, to 382 ### save space. 383 ### 384 ### To achieve that, we subtract the 'max_idx' of each segment with the 385 ### amount of zeros that can be overlaid. 386 ### 387 for (my $j = 0; $j < $#segments - 1; $j++) 388 { 389 my $seg = $segments[$j]; 390 my $nextseg = $segments[ $j + 1 ]; 391 392 # Count the number of zero values at the end of this segment. 393 my $this_trail_zeros = 0; 394 for ( 395 my $i = $seg->{max_idx}; 396 $i >= $seg->{min_idx} && !$seg->{values}->{$i}; 397 $i--) 398 { 399 $this_trail_zeros++; 400 } 401 402 # Count the number of zeros at the beginning of next segment. 403 my $next_lead_zeros = 0; 404 for ( 405 my $i = $nextseg->{min_idx}; 406 $i <= $nextseg->{max_idx} && !$nextseg->{values}->{$i}; 407 $i++) 408 { 409 $next_lead_zeros++; 410 } 411 412 # How many zeros in common? 413 my $overlaid_trail_zeros = 414 ($this_trail_zeros > $next_lead_zeros) 415 ? $next_lead_zeros 416 : $this_trail_zeros; 417 418 $seg->{overlaid_trail_zeros} = $overlaid_trail_zeros; 419 $seg->{max_idx} = $seg->{max_idx} - $overlaid_trail_zeros; 420 } 421 422 ### 423 ### Replace label references with real offsets. 424 ### 425 ### So far, the non-leaf segments have referred to other segments by 426 ### their labels. Replace them with numerical offsets from the beginning 427 ### of the final array. You cannot move, add, or remove segments after 428 ### this step, as that would invalidate the offsets calculated here! 429 ### 430 my $flatoff = 0; 431 my %segmap; 432 433 # First pass: assign offsets to each segment, and build hash 434 # of label => offset. 435 foreach my $seg (@segments) 436 { 437 $seg->{offset} = $flatoff; 438 $segmap{ $seg->{label} } = $flatoff; 439 $flatoff += $seg->{max_idx} - $seg->{min_idx} + 1; 440 } 441 my $tblsize = $flatoff; 442 443 # Second pass: look up the offset of each label reference in the hash. 444 foreach my $seg (@segments) 445 { 446 while (my ($i, $val) = each %{ $seg->{values} }) 447 { 448 if (!($val =~ /^[0-9,.E]+$/)) 449 { 450 my $segoff = $segmap{$val}; 451 if ($segoff) 452 { 453 $seg->{values}->{$i} = $segoff; 454 } 455 else 456 { 457 die "no segment with label $val"; 458 } 459 } 460 } 461 } 462 463 # Also look up the positions of the roots in the table. 464 my $b1root = $segmap{"1-byte"}; 465 my $b2root = $segmap{"2-byte"}; 466 my $b3root = $segmap{"3-byte"}; 467 my $b4root = $segmap{"4-byte"}; 468 469 # And the lower-upper values of each level in each radix tree. 470 my $b1_lower = $min_idx{1}{1}; 471 my $b1_upper = $max_idx{1}{1}; 472 473 my $b2_1_lower = $min_idx{2}{1}; 474 my $b2_1_upper = $max_idx{2}{1}; 475 my $b2_2_lower = $min_idx{2}{2}; 476 my $b2_2_upper = $max_idx{2}{2}; 477 478 my $b3_1_lower = $min_idx{3}{1}; 479 my $b3_1_upper = $max_idx{3}{1}; 480 my $b3_2_lower = $min_idx{3}{2}; 481 my $b3_2_upper = $max_idx{3}{2}; 482 my $b3_3_lower = $min_idx{3}{3}; 483 my $b3_3_upper = $max_idx{3}{3}; 484 485 my $b4_1_lower = $min_idx{4}{1}; 486 my $b4_1_upper = $max_idx{4}{1}; 487 my $b4_2_lower = $min_idx{4}{2}; 488 my $b4_2_upper = $max_idx{4}{2}; 489 my $b4_3_lower = $min_idx{4}{3}; 490 my $b4_3_upper = $max_idx{4}{3}; 491 my $b4_4_lower = $min_idx{4}{4}; 492 my $b4_4_upper = $max_idx{4}{4}; 493 494 ### 495 ### Find the maximum value in the whole table, to determine if we can 496 ### use uint16 or if we need to use uint32. 497 ### 498 my $max_val = 0; 499 foreach my $seg (@segments) 500 { 501 foreach my $val (values %{ $seg->{values} }) 502 { 503 $max_val = $val if ($val > $max_val); 504 } 505 } 506 507 my $datatype = ($max_val <= 0xffff) ? "uint16" : "uint32"; 508 509 # For formatting, determine how many values we can fit on a single 510 # line, and how wide each value needs to be to align nicely. 511 my $vals_per_line; 512 my $colwidth; 513 514 if ($max_val <= 0xffff) 515 { 516 $vals_per_line = 8; 517 $colwidth = 4; 518 } 519 elsif ($max_val <= 0xffffff) 520 { 521 $vals_per_line = 4; 522 $colwidth = 6; 523 } 524 else 525 { 526 $vals_per_line = 4; 527 $colwidth = 8; 528 } 529 530 ### 531 ### Print the struct and array. 532 ### 533 printf $out "static const $datatype ${tblname}_table[$tblsize];\n"; 534 printf $out "\n"; 535 printf $out "static const pg_mb_radix_tree $tblname =\n"; 536 printf $out "{\n"; 537 if ($datatype eq "uint16") 538 { 539 print $out " ${tblname}_table,\n"; 540 print $out " NULL, /* 32-bit table not used */\n"; 541 } 542 if ($datatype eq "uint32") 543 { 544 print $out " NULL, /* 16-bit table not used */\n"; 545 print $out " ${tblname}_table,\n"; 546 } 547 printf $out "\n"; 548 printf $out " 0x%04x, /* offset of table for 1-byte inputs */\n", 549 $b1root; 550 printf $out " 0x%02x, /* b1_lower */\n", $b1_lower; 551 printf $out " 0x%02x, /* b1_upper */\n", $b1_upper; 552 printf $out "\n"; 553 printf $out " 0x%04x, /* offset of table for 2-byte inputs */\n", 554 $b2root; 555 printf $out " 0x%02x, /* b2_1_lower */\n", $b2_1_lower; 556 printf $out " 0x%02x, /* b2_1_upper */\n", $b2_1_upper; 557 printf $out " 0x%02x, /* b2_2_lower */\n", $b2_2_lower; 558 printf $out " 0x%02x, /* b2_2_upper */\n", $b2_2_upper; 559 printf $out "\n"; 560 printf $out " 0x%04x, /* offset of table for 3-byte inputs */\n", 561 $b3root; 562 printf $out " 0x%02x, /* b3_1_lower */\n", $b3_1_lower; 563 printf $out " 0x%02x, /* b3_1_upper */\n", $b3_1_upper; 564 printf $out " 0x%02x, /* b3_2_lower */\n", $b3_2_lower; 565 printf $out " 0x%02x, /* b3_2_upper */\n", $b3_2_upper; 566 printf $out " 0x%02x, /* b3_3_lower */\n", $b3_3_lower; 567 printf $out " 0x%02x, /* b3_3_upper */\n", $b3_3_upper; 568 printf $out "\n"; 569 printf $out " 0x%04x, /* offset of table for 3-byte inputs */\n", 570 $b4root; 571 printf $out " 0x%02x, /* b4_1_lower */\n", $b4_1_lower; 572 printf $out " 0x%02x, /* b4_1_upper */\n", $b4_1_upper; 573 printf $out " 0x%02x, /* b4_2_lower */\n", $b4_2_lower; 574 printf $out " 0x%02x, /* b4_2_upper */\n", $b4_2_upper; 575 printf $out " 0x%02x, /* b4_3_lower */\n", $b4_3_lower; 576 printf $out " 0x%02x, /* b4_3_upper */\n", $b4_3_upper; 577 printf $out " 0x%02x, /* b4_4_lower */\n", $b4_4_lower; 578 printf $out " 0x%02x /* b4_4_upper */\n", $b4_4_upper; 579 print $out "};\n"; 580 print $out "\n"; 581 print $out "static const $datatype ${tblname}_table[$tblsize] =\n"; 582 print $out "{"; 583 my $off = 0; 584 585 foreach my $seg (@segments) 586 { 587 printf $out "\n"; 588 printf $out " /*** %s - offset 0x%05x ***/\n", $seg->{header}, $off; 589 printf $out "\n"; 590 591 for (my $i = $seg->{min_idx}; $i <= $seg->{max_idx};) 592 { 593 594 # Print the next line's worth of values. 595 # XXX pad to begin at a nice boundary 596 printf $out " /* %02x */ ", $i; 597 for (my $j = 0; 598 $j < $vals_per_line && $i <= $seg->{max_idx}; $j++) 599 { 600 my $val = $seg->{values}->{$i}; 601 602 printf $out " 0x%0*x", $colwidth, $val; 603 $off++; 604 if ($off != $tblsize) 605 { 606 print $out ","; 607 } 608 $i++; 609 } 610 print $out "\n"; 611 } 612 if ($seg->{overlaid_trail_zeros}) 613 { 614 printf $out 615" /* $seg->{overlaid_trail_zeros} trailing zero values shared with next segment */\n"; 616 } 617 } 618 619 # Sanity check. 620 if ($off != $tblsize) { die "table size didn't match!"; } 621 622 print $out "};\n"; 623} 624 625### 626sub build_segments_from_tree 627{ 628 my ($header, $rootlabel, $depth, $map) = @_; 629 630 my @segments; 631 632 if (%{$map}) 633 { 634 @segments = 635 build_segments_recurse($header, $rootlabel, "", 1, $depth, $map); 636 637 # Sort the segments into "breadth-first" order. Not strictly required, 638 # but makes the maps nicer to read. 639 @segments = 640 sort { $a->{level} cmp $b->{level} or $a->{path} cmp $b->{path} } 641 @segments; 642 } 643 644 return @segments; 645} 646 647### 648sub build_segments_recurse 649{ 650 my ($header, $label, $path, $level, $depth, $map) = @_; 651 652 my @segments; 653 654 if ($level == $depth) 655 { 656 push @segments, 657 { header => $header . ", leaf: ${path}xx", 658 label => $label, 659 level => $level, 660 depth => $depth, 661 path => $path, 662 values => $map }; 663 } 664 else 665 { 666 my %children; 667 668 while (my ($i, $val) = each %$map) 669 { 670 my $childpath = $path . sprintf("%02x", $i); 671 my $childlabel = "$depth-level-$level-$childpath"; 672 673 push @segments, 674 build_segments_recurse($header, $childlabel, $childpath, 675 $level + 1, $depth, $val); 676 $children{$i} = $childlabel; 677 } 678 679 push @segments, 680 { header => $header . ", byte #$level: ${path}xx", 681 label => $label, 682 level => $level, 683 depth => $depth, 684 path => $path, 685 values => \%children }; 686 } 687 return @segments; 688} 689 690####################################################################### 691# make_charmap - convert charset table to charmap hash 692# 693# make_charmap(\@charset, $direction) 694# charset - ref to charset table : see print_conversion_tables 695# direction - conversion direction 696# 697sub make_charmap 698{ 699 my ($out, $charset, $direction, $verbose) = @_; 700 701 die "unacceptable direction : $direction" 702 if ($direction != TO_UNICODE && $direction != FROM_UNICODE); 703 704 # In verbose mode, print a large comment with the source and comment of 705 # each character 706 if ($verbose) 707 { 708 print $out "/*\n"; 709 print $out "<src> <dst> <file>:<lineno> <comment>\n"; 710 } 711 712 my %charmap; 713 foreach my $c (@$charset) 714 { 715 716 # combined characters are handled elsewhere 717 next if (defined $c->{ucs_second}); 718 719 next if ($c->{direction} != $direction && $c->{direction} != BOTH); 720 721 my ($src, $dst) = 722 $direction == TO_UNICODE 723 ? ($c->{code}, ucs2utf($c->{ucs})) 724 : (ucs2utf($c->{ucs}), $c->{code}); 725 726 # check for duplicate source codes 727 if (defined $charmap{$src}) 728 { 729 printf STDERR 730"Error: duplicate source code on %s:%d: 0x%04x => 0x%04x, 0x%04x\n", 731 $c->{f}, $c->{l}, $src, $charmap{$src}, $dst; 732 exit; 733 } 734 $charmap{$src} = $dst; 735 736 if ($verbose) 737 { 738 printf $out "0x%04x 0x%04x %s:%d %s\n", $src, $dst, $c->{f}, 739 $c->{l}, $c->{comment}; 740 } 741 } 742 if ($verbose) 743 { 744 print $out "*/\n\n"; 745 } 746 747 return \%charmap; 748} 749 750####################################################################### 751# make_charmap_combined - convert charset table to charmap hash 752# with checking duplicate source code 753# 754# make_charmap_combined(\@charset, $direction) 755# charset - ref to charset table : see print_conversion_tables 756# direction - conversion direction 757# 758sub make_charmap_combined 759{ 760 my ($charset, $direction) = @_; 761 762 die "unacceptable direction : $direction" 763 if ($direction != TO_UNICODE && $direction != FROM_UNICODE); 764 765 my @combined; 766 foreach my $c (@$charset) 767 { 768 next if ($c->{direction} != $direction && $c->{direction} != BOTH); 769 770 if (defined $c->{ucs_second}) 771 { 772 my $entry = { 773 utf8 => ucs2utf($c->{ucs}), 774 utf8_second => ucs2utf($c->{ucs_second}), 775 code => $c->{code}, 776 comment => $c->{comment}, 777 f => $c->{f}, 778 l => $c->{l} }; 779 push @combined, $entry; 780 } 781 } 782 783 return \@combined; 784} 785 786####################################################################### 787# convert UCS-4 to UTF-8 788# 789sub ucs2utf 790{ 791 my ($ucs) = @_; 792 my $utf; 793 794 if ($ucs <= 0x007f) 795 { 796 $utf = $ucs; 797 } 798 elsif ($ucs > 0x007f && $ucs <= 0x07ff) 799 { 800 $utf = (($ucs & 0x003f) | 0x80) | ((($ucs >> 6) | 0xc0) << 8); 801 } 802 elsif ($ucs > 0x07ff && $ucs <= 0xffff) 803 { 804 $utf = 805 ((($ucs >> 12) | 0xe0) << 16) | 806 (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80); 807 } 808 else 809 { 810 $utf = 811 ((($ucs >> 18) | 0xf0) << 24) | 812 (((($ucs & 0x3ffff) >> 12) | 0x80) << 16) | 813 (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80); 814 } 815 return ($utf); 816} 817 8181; 819