#! /usr/bin/perl
#
# Copyright (c) 2001-2016, PostgreSQL Global Development Group
#
# src/backend/utils/mb/Unicode/UCS_to_BIG5.pl
#
# Generate UTF-8 <--> BIG5 conversion tables from
# map files provided by the Unicode organization.
# Unfortunately it is prohibited by the organization
# to distribute the map files. So if you try to use this script,
# you have to obtain the map files from the organization's ftp site.
# ftp://www.unicode.org/Public/MAPPINGS/
#
# Our "big5" comes from BIG5.TXT, with the addition of the characters
# in the range 0xf9d6-0xf9dc from CP950.TXT.
#
# BIG5.TXT format:
#		 BIG5 code in hex
#		 UCS-2 code in hex
#		 # and Unicode name (not used in this script)
#
# CP950.TXT format:
#		 CP950 code in hex
#		 UCS-2 code in hex
#		 # and Unicode name (not used in this script)

use strict;
use warnings;

# Provides ucs2utf(); the call sites below treat its result as a number
# (it is sorted with <=> and printed with %04x).
require "ucs2utf.pl";

#
# Read both map files once.  Each element is a [ code, ucs ] pair, kept in
# file order (BIG5.TXT first) so that "first mapping wins" below.
# Only the ETEN extended characters in the range 0xf9d6 - 0xf9dc are
# picked from CP950.TXT.
#
my @mappings = (
	read_mappings("BIG5.TXT"),
	read_mappings("CP950.TXT", 0xf9d6, 0xf9dc));

#
# first, generate UTF8 --> BIG5 table
#
my %big5_for_utf;

for my $pair (@mappings)
{
	my ($code, $ucs) = @$pair;
	my $utf = ucs2utf($ucs);

	# Keyed by UTF8 value: the first code seen for a given character wins.
	if (exists $big5_for_utf{$utf})
	{
		printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
		next;
	}
	$big5_for_utf{$utf} = $code;
}

write_map("utf8_to_big5.map", "pg_utf_to_local ULmapBIG5", \%big5_for_utf);

#
# then generate BIG5 --> UTF8 table
#
my %utf_for_big5;

for my $pair (@mappings)
{
	my ($code, $ucs) = @$pair;

	# This table is keyed by BIG5 code, so the duplicate check must look at
	# the code.  Several codes may legitimately map to the same character.
	if (exists $utf_for_big5{$code})
	{
		printf STDERR "Warning: duplicate code: %04x\n", $code;
		next;
	}
	$utf_for_big5{$code} = ucs2utf($ucs);
}

write_map("big5_to_utf8.map", "pg_local_to_utf LUmapBIG5", \%utf_for_big5);

#
# read_mappings(in_file [, min_code [, max_code]])
#
# Parse a map file whose non-comment lines are "code ucs [# name]" with both
# numbers in hex.  Returns a list of [ code, ucs ] array refs, in file order,
# for every line with code >= 0x80 and ucs >= 0x0080.  When min_code and/or
# max_code are given, only codes inside that inclusive range are returned.
# Dies if the file cannot be opened.
#
sub read_mappings
{
	my ($in_file, $min_code, $max_code) = @_;
	my @pairs;

	open(my $in, '<', $in_file) || die("cannot open $in_file: $!");

	while (my $line = <$in>)
	{
		# chomp, not chop: a final line without a newline must stay intact.
		chomp $line;
		next if $line =~ /^#/;

		my ($c, $u) = split ' ', $line;

		# Skip blank lines and lines without two hex fields (for example
		# entries whose second field is a "#UNDEFINED" comment).  Such lines
		# never produce a mapping, and hex() would warn about them.
		next unless defined $u;
		next unless $c =~ /\A(?:0x)?[0-9a-f]+\z/i;
		next unless $u =~ /\A(?:0x)?[0-9a-f]+\z/i;

		my $code = hex($c);
		my $ucs  = hex($u);

		next if $code < 0x80 || $ucs < 0x0080;
		next if defined $min_code && $code < $min_code;
		next if defined $max_code && $code > $max_code;

		push @pairs, [ $code, $ucs ];
	}
	close($in);

	return @pairs;
}

#
# write_map(file, decl, map)
#
# Write the hash referenced by map to file as a C array named by decl
# ("type name"), one {key, value} entry per line in ascending numeric key
# order.  The array size is taken from the hash itself, so it always matches
# the number of entries written, and the last entry has no trailing comma.
# Dies if the file cannot be opened or fully written.
#
sub write_map
{
	my ($file, $decl, $map) = @_;

	my @keys = sort { $a <=> $b } keys %$map;
	my @entries =
	  map { sprintf(" {0x%04x, 0x%04x}", $_, $map->{$_}) } @keys;

	open(my $out, '>', $file) || die("cannot open $file: $!");

	print $out "/* src/backend/utils/mb/Unicode/$file */\n\n";
	printf $out "static const %s[ %d ] = {\n", $decl, scalar(@keys);
	print $out join(",\n", @entries), "\n" if @entries;
	print $out "};\n";

	# Buffered write errors only surface at close time.
	close($out) || die("cannot write $file: $!");

	return;
}