#! /usr/bin/perl
#
# Copyright (c) 2001-2020, PostgreSQL Global Development Group
#
# src/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl
#
# Generate UTF-8 <--> EUC_JP code conversion tables from
# map files provided by Unicode organization.
# Unfortunately it is prohibited by the organization
# to distribute the map files. So if you try to use this script,
# you have to obtain CP932.TXT and JIS0212.TXT from the
# organization's ftp site.
#
# Overview of the steps below:
#   1. Load JIS0212.TXT and turn each code into its 3-byte EUC_JP form.
#   2. Load CP932.TXT, convert each SJIS code to JIS with sjis2jis(), and
#      turn the result into its EUC_JP form.
#   3. Adjust the conversion direction of selected SJIS-derived entries.
#   4. Append a hand-maintained list of extra mappings.
#   5. Hand the combined list to print_conversion_tables().

use strict;
use warnings;

use convutils;

my $this_script = 'src/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl';

# Every entry is a hash reference with (at least) the keys 'ucs', 'code'
# and 'direction'; entries that come from CP932.TXT also get 'sjis'.
my @mapping;

# Load JIS0212.TXT
my $jis0212 = read_source("JIS0212.TXT");

foreach my $i (@$jis0212)
{
    # We have a different mapping for this in the EUC_JP to UTF-8 direction.
    $i->{direction} = FROM_UNICODE if $i->{code} == 0x2243;
    $i->{direction} = TO_UNICODE   if $i->{code} == 0x2271;

    # Entries whose Unicode code point is below 0x80 are left out.
    next if $i->{ucs} < 0x080;

    # Set the high bit of both bytes and prepend 0x8f, i.e. the
    # three-byte EUC form (code set 3).
    $i->{code} |= 0x8f8080;

    push @mapping, $i;
}

# Load CP932.TXT.
my $ct932 = read_source("CP932.TXT");

foreach my $i (@$ct932)
{
    my $sjis = $i->{code};

    # We have a different mapping for this in the EUC_JP to UTF-8 direction.
    next if $sjis == 0xeefa || $sjis == 0xeefb || $sjis == 0xeefc;

    # SJIS codes below 0xa1 are not taken from this file.
    next if $sjis < 0xa1;

    my $jis = sjis2jis($sjis);

    # Choose the EUC_JP framing for the converted code:
    #   single-byte result       -> 0x8e lead byte
    #   SJIS code >= 0xeffd      -> 0x8f lead byte plus high bits (3 bytes)
    #   everything else          -> high bits only (2 bytes)
    my $euc_bits =
        $jis < 0x100     ? 0x8e00
      : $sjis >= 0xeffd  ? 0x8f8080
      :                    0x8080;

    $i->{code} = $jis | $euc_bits;

    # Remember the SJIS code for later.
    $i->{sjis} = $sjis;

    push @mapping, $i;
}

# SJIS codes that are only used in the EUC_JP to UTF-8 direction
# (in addition to the range 0xfa4a..0xfa53 tested below).
my %to_unicode_only = map { $_ => 1 } (
    0x8790, 0x8791, 0x8792,
    0x8795, 0x8796, 0x8797,
    0x879a, 0x879b, 0x879c);

foreach my $i (@mapping)
{
    my $sjis = $i->{sjis};

    # Entries loaded from JIS0212.TXT carry no SJIS code.  None of the
    # rules below can apply to them, and comparing an undefined value
    # would only produce "uninitialized value" warnings.
    next unless defined $sjis;

    if (   $sjis >= 0xed00 && $sjis <= 0xeef9
        || $sjis >= 0xfa54 && $sjis <= 0xfa56
        || $sjis >= 0xfa58 && $sjis <= 0xfc4b)
    {
        # These SJIS characters are excluded completely.
        $i->{direction} = NONE;
    }
    elsif ($sjis == 0xeefa || $sjis == 0xeefb || $sjis == 0xeefc)
    {
        # These SJIS characters are only in the UTF-8 to EUC_JP table.
        #
        # NOTE(review): the CP932 loop above already skips these three
        # codes, so with the current filter no entry reaches this branch.
        # It is kept unchanged; confirm which of the two is intended.
        $i->{direction} = FROM_UNICODE;
    }
    elsif ($to_unicode_only{$sjis} || ($sjis >= 0xfa4a && $sjis <= 0xfa53))
    {
        $i->{direction} = TO_UNICODE;
    }
}

# Hand-maintained additions, valid in both directions.
# Each row is [ ucs, code, comment ].
push @mapping,
  map { +{ direction => BOTH, ucs => $_->[0], code => $_->[1], comment => $_->[2] } } (
    [ 0x4efc, 0x8ff4af, '# CJK(4EFC)' ],
    [ 0x50f4, 0x8ff4b0, '# CJK(50F4)' ],
    [ 0x51EC, 0x8ff4b1, '# CJK(51EC)' ],
    [ 0x5307, 0x8ff4b2, '# CJK(5307)' ],
    [ 0x5324, 0x8ff4b3, '# CJK(5324)' ],
    [ 0x548A, 0x8ff4b5, '# CJK(548A)' ],
    [ 0x5759, 0x8ff4b6, '# CJK(5759)' ],
    [ 0x589E, 0x8ff4b9, '# CJK(589E)' ],
    [ 0x5BEC, 0x8ff4ba, '# CJK(5BEC)' ],
    [ 0x5CF5, 0x8ff4bb, '# CJK(5CF5)' ],
    [ 0x5D53, 0x8ff4bc, '# CJK(5D53)' ],
    [ 0x5FB7, 0x8ff4be, '# CJK(5FB7)' ],
    [ 0x6085, 0x8ff4bf, '# CJK(6085)' ],
    [ 0x6120, 0x8ff4c0, '# CJK(6120)' ],
    [ 0x654E, 0x8ff4c1, '# CJK(654E)' ],
    [ 0x663B, 0x8ff4c2, '# CJK(663B)' ],
    [ 0x6665, 0x8ff4c3, '# CJK(6665)' ],
    [ 0x6801, 0x8ff4c6, '# CJK(6801)' ],
    [ 0x6A6B, 0x8ff4c9, '# CJK(6A6B)' ],
    [ 0x6AE2, 0x8ff4ca, '# CJK(6AE2)' ],
    [ 0x6DF2, 0x8ff4cc, '# CJK(6DF2)' ],
    [ 0x6DF8, 0x8ff4cb, '# CJK(6DF8)' ],
    [ 0x7028, 0x8ff4cd, '# CJK(7028)' ],
    [ 0x70BB, 0x8ff4ae, '# CJK(70BB)' ],
    [ 0x7501, 0x8ff4d0, '# CJK(7501)' ],
    [ 0x7682, 0x8ff4d1, '# CJK(7682)' ],
    [ 0x769E, 0x8ff4d2, '# CJK(769E)' ],
    [ 0x7930, 0x8ff4d4, '# CJK(7930)' ],
    [ 0x7AE7, 0x8ff4d9, '# CJK(7AE7)' ],
    [ 0x7DA0, 0x8ff4dc, '# CJK(7DA0)' ],
    [ 0x7DD6, 0x8ff4dd, '# CJK(7DD6)' ],
    [ 0x8362, 0x8ff4df, '# CJK(8362)' ],
    [ 0x85B0, 0x8ff4e1, '# CJK(85B0)' ],
    [ 0x8807, 0x8ff4e4, '# CJK(8807)' ],
    [ 0x8B7F, 0x8ff4e6, '# CJK(8B7F)' ],
    [ 0x8CF4, 0x8ff4e7, '# CJK(8CF4)' ],
    [ 0x8D76, 0x8ff4e8, '# CJK(8D76)' ],
    [ 0x90DE, 0x8ff4ec, '# CJK(90DE)' ],
    [ 0x9115, 0x8ff4ee, '# CJK(9115)' ],
    [ 0x9592, 0x8ff4f1, '# CJK(9592)' ],
    [ 0x973B, 0x8ff4f4, '# CJK(973B)' ],
    [ 0x974D, 0x8ff4f5, '# CJK(974D)' ],
    [ 0x9751, 0x8ff4f6, '# CJK(9751)' ],
    [ 0x999E, 0x8ff4fa, '# CJK(999E)' ],
    [ 0x9AD9, 0x8ff4fb, '# CJK(9AD9)' ],
    [ 0x9B72, 0x8ff4fc, '# CJK(9B72)' ],
    [ 0x9ED1, 0x8ff4fe, '# CJK(9ED1)' ],
    [ 0xF929, 0x8ff4c5, '# CJK COMPATIBILITY IDEOGRAPH-F929' ],
    [ 0xF9DC, 0x8ff4f2, '# CJK COMPATIBILITY IDEOGRAPH-F9DC' ],
    [ 0xFA0E, 0x8ff4b4, '# CJK COMPATIBILITY IDEOGRAPH-FA0E' ],
    [ 0xFA0F, 0x8ff4b7, '# CJK COMPATIBILITY IDEOGRAPH-FA0F' ],
    [ 0xFA10, 0x8ff4b8, '# CJK COMPATIBILITY IDEOGRAPH-FA10' ],
    [ 0xFA11, 0x8ff4bd, '# CJK COMPATIBILITY IDEOGRAPH-FA11' ],
    [ 0xFA12, 0x8ff4c4, '# CJK COMPATIBILITY IDEOGRAPH-FA12' ],
    [ 0xFA13, 0x8ff4c7, '# CJK COMPATIBILITY IDEOGRAPH-FA13' ],
    [ 0xFA14, 0x8ff4c8, '# CJK COMPATIBILITY IDEOGRAPH-FA14' ],
    [ 0xFA15, 0x8ff4ce, '# CJK COMPATIBILITY IDEOGRAPH-FA15' ],
    [ 0xFA16, 0x8ff4cf, '# CJK COMPATIBILITY IDEOGRAPH-FA16' ],
    [ 0xFA17, 0x8ff4d3, '# CJK COMPATIBILITY IDEOGRAPH-FA17' ],
    [ 0xFA18, 0x8ff4d5, '# CJK COMPATIBILITY IDEOGRAPH-FA18' ],
    [ 0xFA19, 0x8ff4d6, '# CJK COMPATIBILITY IDEOGRAPH-FA19' ],
    [ 0xFA1A, 0x8ff4d7, '# CJK COMPATIBILITY IDEOGRAPH-FA1A' ],
    [ 0xFA1B, 0x8ff4d8, '# CJK COMPATIBILITY IDEOGRAPH-FA1B' ],
    [ 0xFA1C, 0x8ff4da, '# CJK COMPATIBILITY IDEOGRAPH-FA1C' ],
    [ 0xFA1D, 0x8ff4db, '# CJK COMPATIBILITY IDEOGRAPH-FA1D' ],
    [ 0xFA1E, 0x8ff4de, '# CJK COMPATIBILITY IDEOGRAPH-FA1E' ],
    [ 0xFA1F, 0x8ff4e0, '# CJK COMPATIBILITY IDEOGRAPH-FA1F' ],
    [ 0xFA20, 0x8ff4e2, '# CJK COMPATIBILITY IDEOGRAPH-FA20' ],
    [ 0xFA21, 0x8ff4e3, '# CJK COMPATIBILITY IDEOGRAPH-FA21' ],
    [ 0xFA22, 0x8ff4e5, '# CJK COMPATIBILITY IDEOGRAPH-FA22' ],
    [ 0xFA23, 0x8ff4e9, '# CJK COMPATIBILITY IDEOGRAPH-FA23' ],
    [ 0xFA24, 0x8ff4ea, '# CJK COMPATIBILITY IDEOGRAPH-FA24' ],
    [ 0xFA25, 0x8ff4eb, '# CJK COMPATIBILITY IDEOGRAPH-FA25' ],
    [ 0xFA26, 0x8ff4ed, '# CJK COMPATIBILITY IDEOGRAPH-FA26' ],
    [ 0xFA27, 0x8ff4ef, '# CJK COMPATIBILITY IDEOGRAPH-FA27' ],
    [ 0xFA28, 0x8ff4f0, '# CJK COMPATIBILITY IDEOGRAPH-FA28' ],
    [ 0xFA29, 0x8ff4f3, '# CJK COMPATIBILITY IDEOGRAPH-FA29' ],
    [ 0xFA2A, 0x8ff4f7, '# CJK COMPATIBILITY IDEOGRAPH-FA2A' ],
    [ 0xFA2B, 0x8ff4f8, '# CJK COMPATIBILITY IDEOGRAPH-FA2B' ],
    [ 0xFA2C, 0x8ff4f9, '# CJK COMPATIBILITY IDEOGRAPH-FA2C' ],
    [ 0xFA2D, 0x8ff4fd, '# CJK COMPATIBILITY IDEOGRAPH-FA2D' ],
    [ 0xFF07, 0x8ff4a9, '# FULLWIDTH APOSTROPHE' ],
    [ 0xFFE4, 0x8fa2c3, '# FULLWIDTH BROKEN BAR' ],
  );

# additional conversions for EUC_JP -> UTF-8 conversion
push @mapping,
  map { +{ direction => TO_UNICODE, ucs => $_->[0], code => $_->[1], comment => $_->[2] } } (
    [ 0x2116, 0x8ff4ac, '# NUMERO SIGN' ],
    [ 0x2121, 0x8ff4ad, '# TELEPHONE SIGN' ],
    [ 0x3231, 0x8ff4ab, '# PARENTHESIZED IDEOGRAPH STOCK' ],
  );

print_conversion_tables($this_script, "EUC_JP", \@mapping);


#######################################################################
# sjis2jis ; SJIS => JIS conversion
#
# Takes one numeric SJIS code and returns the corresponding JIS code.
# Values up to and including 0x100 are returned unchanged.  Larger values
# are split into a high and a low byte, flattened into a linear position
# with 0xbc slots per high-byte value, and re-split into rows of 0x5e
# with 0x21 added to both the row and the column.
sub sjis2jis
{
    my ($sjis) = @_;

    return $sjis if ($sjis <= 0x100);

    my $hi = $sjis >> 8;
    my $lo = $sjis & 0xff;

    # Close the one-code gap in the low byte at 0x80 and rebase it so
    # that 0x40 becomes 0.
    $lo-- if $lo >= 0x80;
    $lo -= 0x40;

    # Close the gap in the high byte at 0xe0 and rebase it so that
    # 0x81 becomes 0.
    $hi -= 0x40 if $hi >= 0xe0;
    $hi -= 0x81;

    my $pos = $lo + $hi * 0xbc;

    if ($pos >= 114 * 0x5e && $pos <= 115 * 0x5e + 0x1b)
    {

        # This region (115-ku) is out of range of JIS code but for
        # convenient to generate code in EUC CODESET 3, move this to
        # seemingly duplicate region (83-84-ku).
        $pos = $pos - ((31 * 0x5e) + 12);

        # after 85-ku 82-ten needs to be moved 2 codepoints
        #
        # NOTE(review): this threshold multiplies by 0x5c while every
        # other row computation in this routine uses 0x5e -- confirm
        # that this is intentional before touching it.
        $pos = $pos - 2 if ($pos >= 84 * 0x5c + 82);
    }

    # "/" is a floating-point division here; the fractional part is
    # discarded when "<<" converts its left operand to an integer below.
    my $row = $pos / 0x5e;
    my $col = $pos % 0x5e;

    return $col + 0x21 + (($row + 0x21) << 8);
}