1#! /usr/bin/perl
2#
3# Copyright (c) 2001-2016, PostgreSQL Global Development Group
4#
5# src/backend/utils/mb/Unicode/UCS_to_BIG5.pl
6#
# Generate UTF-8 <--> BIG5 conversion tables from
# map files provided by the Unicode organization.
# Unfortunately, the organization prohibits redistribution
# of the map files, so to use this script you have to
# obtain them yourself from the organization's ftp site:
12# ftp://www.unicode.org/Public/MAPPINGS/
13#
14# Our "big5" comes from BIG5.TXT, with the addition of the characters
15# in the range 0xf9d6-0xf9dc from CP950.TXT.
16#
17# BIG5.TXT format:
18#		 BIG5 code in hex
19#		 UCS-2 code in hex
20#		 # and Unicode name (not used in this script)
21#
22# CP950.TXT format:
23#		 CP950 code in hex
24#		 UCS-2 code in hex
25#		 # and Unicode name (not used in this script)
26
27
28require "ucs2utf.pl";
29
30
31#
32# first, generate UTF8 --> BIG5 table
33#
# Build the UTF8 --> BIG5 table and write it to utf8_to_big5.map.
#
# Input:  BIG5.TXT and CP950.TXT in the current directory.
# Output: utf8_to_big5.map, a C array of {utf8, big5} pairs sorted by the
#         UTF-8 value.
#
# Everything lives in a bare block with lexical variables so that this half
# of the script neither depends on nor disturbs global state used elsewhere.
{
	use strict;
	use warnings;

	# Value returned by ucs2utf() for a code point => BIG5 code.
	my %big5_of;

	# Read one mapping file and add its entries to %big5_of.
	#
	#   $in_file  - name of the mapping file to read
	#   $min_code - lowest BIG5 code to accept
	#   $max_code - highest BIG5 code to accept (optional; no limit if omitted)
	#
	# Entries whose UCS code is below 0x0080 are ignored.  If two entries
	# produce the same UTF-8 value, the first one wins and a warning is
	# printed on STDERR.  Dies if the file cannot be opened.
	my $load_map = sub {
		my ($in_file, $min_code, $max_code) = @_;

		open(my $in, '<', $in_file) or die("cannot open $in_file: $!");

		while (my $line = <$in>)
		{
			# chomp, not chop: a final line without a newline must not
			# lose its last character.
			chomp $line;
			next if $line =~ /^#/;

			# Fields: code in hex, UCS code in hex, then a comment we ignore.
			my ($c, $u) = split ' ', $line;
			next unless defined $u;    # blank or truncated line

			my $code = hex($c);
			my $ucs  = hex($u);

			next if $code < $min_code || $ucs < 0x0080;
			next if defined $max_code && $code > $max_code;

			my $utf = ucs2utf($ucs);
			if (exists $big5_of{$utf})
			{
				printf STDERR "Warning: duplicate UTF8: %04x\n", $ucs;
				next;
			}
			$big5_of{$utf} = $code;
		}
		close($in);
		return;
	};

	$load_map->("BIG5.TXT", 0x80);

	# Pick only the ETEN extended characters in the range 0xf9d6 - 0xf9dc
	# from CP950.TXT
	$load_map->("CP950.TXT", 0xf9d6, 0xf9dc);

	my $file = "utf8_to_big5.map";
	open(my $out, '>', $file) or die("cannot open $file: $!");

	# Sort numerically so the generated C table is ordered by UTF-8 value.
	my @utfs  = sort { $a <=> $b } keys %big5_of;
	my $count = scalar @utfs;

	print $out "/* src/backend/utils/mb/Unicode/$file */\n\n";
	print $out "static const pg_utf_to_local ULmapBIG5[ $count ] = {\n";

	for my $i (0 .. $#utfs)
	{
		# Every initializer except the last is followed by a comma.
		my $sep = ($i == $#utfs) ? "" : ",";
		printf $out "  {0x%04x, 0x%04x}%s\n", $utfs[$i], $big5_of{ $utfs[$i] },
		  $sep;
	}

	print $out "};\n";

	# Buffered write errors only surface at close time, so check it.
	close($out) or die("cannot write $file: $!");
}
120
121#
122# then generate BIG5 --> UTF8 table
123#
# Build the BIG5 --> UTF8 table and write it to big5_to_utf8.map.
#
# Input:  BIG5.TXT and CP950.TXT in the current directory.
# Output: big5_to_utf8.map, a C array of {big5, utf8} pairs sorted by the
#         BIG5 code.
#
# Everything lives in a bare block with lexical variables so that this half
# of the script does not depend on whatever state earlier code left in the
# global %array / $count.
{
	use strict;
	use warnings;

	# BIG5 code => value returned by ucs2utf() for the mapped code point.
	my %utf_of;

	# Read one mapping file and add its entries to %utf_of.
	#
	#   $in_file  - name of the mapping file to read
	#   $min_code - lowest BIG5 code to accept
	#   $max_code - highest BIG5 code to accept (optional; no limit if omitted)
	#
	# Entries whose UCS code is below 0x0080 are ignored.  The table is keyed
	# by BIG5 code, so a duplicate is a repeated *code*: the first one wins
	# and a warning is printed on STDERR.  Dies if the file cannot be opened.
	my $load_map = sub {
		my ($in_file, $min_code, $max_code) = @_;

		open(my $in, '<', $in_file) or die("cannot open $in_file: $!");

		while (my $line = <$in>)
		{
			# chomp, not chop: a final line without a newline must not
			# lose its last character.
			chomp $line;
			next if $line =~ /^#/;

			# Fields: code in hex, UCS code in hex, then a comment we ignore.
			my ($c, $u) = split ' ', $line;
			next unless defined $u;    # blank or truncated line

			my $code = hex($c);
			my $ucs  = hex($u);

			next if $code < $min_code || $ucs < 0x0080;
			next if defined $max_code && $code > $max_code;

			# Check the key we are about to store under ($code), not the
			# UTF-8 value: several codes may legitimately share one UTF-8
			# value in this direction.
			if (exists $utf_of{$code})
			{
				printf STDERR "Warning: duplicate BIG5: %04x\n", $code;
				next;
			}
			$utf_of{$code} = ucs2utf($ucs);
		}
		close($in);
		return;
	};

	$load_map->("BIG5.TXT", 0x80);

	# Pick only the ETEN extended characters in the range 0xf9d6 - 0xf9dc
	# from CP950.TXT
	$load_map->("CP950.TXT", 0xf9d6, 0xf9dc);

	my $file = "big5_to_utf8.map";
	open(my $out, '>', $file) or die("cannot open $file: $!");

	# Sort numerically so the generated C table is ordered by BIG5 code.
	my @codes = sort { $a <=> $b } keys %utf_of;
	my $count = scalar @codes;

	print $out "/* src/backend/utils/mb/Unicode/$file */\n\n";
	print $out "static const pg_local_to_utf LUmapBIG5[ $count ] = {\n";

	for my $i (0 .. $#codes)
	{
		# Every initializer except the last is followed by a comma.
		my $sep = ($i == $#codes) ? "" : ",";
		printf $out "  {0x%04x, 0x%04x}%s\n", $codes[$i], $utf_of{ $codes[$i] },
		  $sep;
	}

	print $out "};\n";

	# Buffered write errors only surface at close time, so check it.
	close($out) or die("cannot write $file: $!");
}
209