1#! /usr/bin/perl
2#
3# Copyright (c) 2001-2020, PostgreSQL Global Development Group
4#
5# src/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl
6#
# Generate UTF-8 <--> EUC_JP code conversion tables from
# map files provided by the Unicode organization.
# Unfortunately, the organization prohibits redistribution of the
# map files, so to use this script you have to obtain CP932.TXT and
# JIS0212.TXT from the organization's FTP site.
13
14use strict;
15use warnings;
16
17use convutils;
18
# Path of this script, passed to print_conversion_tables() at the end.
my $this_script = 'src/backend/utils/mb/Unicode/UCS_to_EUC_JP.pl';

# Collects every entry that ends up in the EUC_JP conversion tables.
my @mapping;

# First source: JIS0212.TXT
my $jis0212 = read_source("JIS0212.TXT");

for my $entry (@{$jis0212})
{
	my $code = $entry->{code};

	# Two codes are restricted to a single conversion direction, because
	# a different mapping is used for them in the opposite direction.
	if ($code == 0x2243)
	{
		$entry->{direction} = FROM_UNICODE;
	}
	elsif ($code == 0x2271)
	{
		$entry->{direction} = TO_UNICODE;
	}

	# Entries whose Unicode code point is below 0x80 are not added.
	next if $entry->{ucs} < 0x080;

	# OR-ing with 0x8f8080 puts 0x8f into the third byte and sets the top
	# bit of each of the two low bytes.
	$entry->{code} = $code | 0x8f8080;

	push @mapping, $entry;
}
51
# Second source: CP932.TXT.  Its codes are SJIS and get converted here.
my $ct932 = read_source("CP932.TXT");

for my $entry (@{$ct932})
{
	my $sjis = $entry->{code};

	# These three codes are not taken from CP932.TXT: we have a different
	# mapping for them in the EUC_JP to UTF-8 direction.
	next if $sjis == 0xeefa || $sjis == 0xeefb || $sjis == 0xeefc;

	# Codes below 0xa1 are not added to the mapping.
	next if $sjis < 0xa1;

	my $jis = sjis2jis($sjis);

	# Pick the bits to OR into the converted code: 0x8e00 when the
	# converted value is below 0x100, 0x8f8080 when the SJIS code is
	# 0xeffd or above, and 0x8080 for everything else.
	my $euc_bits;
	if ($jis < 0x100)
	{
		$euc_bits = 0x8e00;
	}
	elsif ($sjis >= 0xeffd)
	{
		$euc_bits = 0x8f8080;
	}
	else
	{
		$euc_bits = 0x8080;
	}
	$entry->{code} = $jis | $euc_bits;

	# Keep the original SJIS code; the direction fix-up loop keys on it.
	$entry->{sjis} = $sjis;

	push @mapping, $entry;
}
82
# Fix up the conversion direction of the entries that came from CP932.TXT,
# keyed on the SJIS code remembered for each of them.
foreach my $i (@mapping)
{
	my $sjis = $i->{sjis};

	# Within this script only the CP932.TXT loop stores {sjis}; the other
	# entries have none.  None of the tests below could match them, so
	# skip them up front instead of comparing undef numerically, which
	# emits an "uninitialized value" warning per comparison under
	# "use warnings".
	next unless defined $sjis;

	# These SJIS characters are excluded completely.
	if (   ($sjis >= 0xed00 && $sjis <= 0xeef9)
		|| ($sjis >= 0xfa54 && $sjis <= 0xfa56)
		|| ($sjis >= 0xfa58 && $sjis <= 0xfc4b))
	{
		$i->{direction} = NONE;
		next;
	}

	# These SJIS characters are only in the UTF-8 to EUC_JP table
	if ($sjis == 0xeefa || $sjis == 0xeefb || $sjis == 0xeefc)
	{
		$i->{direction} = FROM_UNICODE;
		next;
	}

	# These SJIS characters are only in the EUC_JP to UTF-8 table
	if (   $sjis == 0x8790
		|| $sjis == 0x8791
		|| $sjis == 0x8792
		|| $sjis == 0x8795
		|| $sjis == 0x8796
		|| $sjis == 0x8797
		|| $sjis == 0x879a
		|| $sjis == 0x879b
		|| $sjis == 0x879c
		|| ($sjis >= 0xfa4a && $sjis <= 0xfa53))
	{
		$i->{direction} = TO_UNICODE;
	}
}
118
# Hard-coded entries appended to the mapping, one row per entry:
#   [ direction, Unicode code point, EUC_JP code, comment ]
my @extra_rows = (
	[ BOTH, 0x4efc, 0x8ff4af, '# CJK(4EFC)' ],
	[ BOTH, 0x50f4, 0x8ff4b0, '# CJK(50F4)' ],
	[ BOTH, 0x51EC, 0x8ff4b1, '# CJK(51EC)' ],
	[ BOTH, 0x5307, 0x8ff4b2, '# CJK(5307)' ],
	[ BOTH, 0x5324, 0x8ff4b3, '# CJK(5324)' ],
	[ BOTH, 0x548A, 0x8ff4b5, '# CJK(548A)' ],
	[ BOTH, 0x5759, 0x8ff4b6, '# CJK(5759)' ],
	[ BOTH, 0x589E, 0x8ff4b9, '# CJK(589E)' ],
	[ BOTH, 0x5BEC, 0x8ff4ba, '# CJK(5BEC)' ],
	[ BOTH, 0x5CF5, 0x8ff4bb, '# CJK(5CF5)' ],
	[ BOTH, 0x5D53, 0x8ff4bc, '# CJK(5D53)' ],
	[ BOTH, 0x5FB7, 0x8ff4be, '# CJK(5FB7)' ],
	[ BOTH, 0x6085, 0x8ff4bf, '# CJK(6085)' ],
	[ BOTH, 0x6120, 0x8ff4c0, '# CJK(6120)' ],
	[ BOTH, 0x654E, 0x8ff4c1, '# CJK(654E)' ],
	[ BOTH, 0x663B, 0x8ff4c2, '# CJK(663B)' ],
	[ BOTH, 0x6665, 0x8ff4c3, '# CJK(6665)' ],
	[ BOTH, 0x6801, 0x8ff4c6, '# CJK(6801)' ],
	[ BOTH, 0x6A6B, 0x8ff4c9, '# CJK(6A6B)' ],
	[ BOTH, 0x6AE2, 0x8ff4ca, '# CJK(6AE2)' ],
	[ BOTH, 0x6DF2, 0x8ff4cc, '# CJK(6DF2)' ],
	[ BOTH, 0x6DF8, 0x8ff4cb, '# CJK(6DF8)' ],
	[ BOTH, 0x7028, 0x8ff4cd, '# CJK(7028)' ],
	[ BOTH, 0x70BB, 0x8ff4ae, '# CJK(70BB)' ],
	[ BOTH, 0x7501, 0x8ff4d0, '# CJK(7501)' ],
	[ BOTH, 0x7682, 0x8ff4d1, '# CJK(7682)' ],
	[ BOTH, 0x769E, 0x8ff4d2, '# CJK(769E)' ],
	[ BOTH, 0x7930, 0x8ff4d4, '# CJK(7930)' ],
	[ BOTH, 0x7AE7, 0x8ff4d9, '# CJK(7AE7)' ],
	[ BOTH, 0x7DA0, 0x8ff4dc, '# CJK(7DA0)' ],
	[ BOTH, 0x7DD6, 0x8ff4dd, '# CJK(7DD6)' ],
	[ BOTH, 0x8362, 0x8ff4df, '# CJK(8362)' ],
	[ BOTH, 0x85B0, 0x8ff4e1, '# CJK(85B0)' ],
	[ BOTH, 0x8807, 0x8ff4e4, '# CJK(8807)' ],
	[ BOTH, 0x8B7F, 0x8ff4e6, '# CJK(8B7F)' ],
	[ BOTH, 0x8CF4, 0x8ff4e7, '# CJK(8CF4)' ],
	[ BOTH, 0x8D76, 0x8ff4e8, '# CJK(8D76)' ],
	[ BOTH, 0x90DE, 0x8ff4ec, '# CJK(90DE)' ],
	[ BOTH, 0x9115, 0x8ff4ee, '# CJK(9115)' ],
	[ BOTH, 0x9592, 0x8ff4f1, '# CJK(9592)' ],
	[ BOTH, 0x973B, 0x8ff4f4, '# CJK(973B)' ],
	[ BOTH, 0x974D, 0x8ff4f5, '# CJK(974D)' ],
	[ BOTH, 0x9751, 0x8ff4f6, '# CJK(9751)' ],
	[ BOTH, 0x999E, 0x8ff4fa, '# CJK(999E)' ],
	[ BOTH, 0x9AD9, 0x8ff4fb, '# CJK(9AD9)' ],
	[ BOTH, 0x9B72, 0x8ff4fc, '# CJK(9B72)' ],
	[ BOTH, 0x9ED1, 0x8ff4fe, '# CJK(9ED1)' ],
	[ BOTH, 0xF929, 0x8ff4c5, '# CJK COMPATIBILITY IDEOGRAPH-F929' ],
	[ BOTH, 0xF9DC, 0x8ff4f2, '# CJK COMPATIBILITY IDEOGRAPH-F9DC' ],
	[ BOTH, 0xFA0E, 0x8ff4b4, '# CJK COMPATIBILITY IDEOGRAPH-FA0E' ],
	[ BOTH, 0xFA0F, 0x8ff4b7, '# CJK COMPATIBILITY IDEOGRAPH-FA0F' ],
	[ BOTH, 0xFA10, 0x8ff4b8, '# CJK COMPATIBILITY IDEOGRAPH-FA10' ],
	[ BOTH, 0xFA11, 0x8ff4bd, '# CJK COMPATIBILITY IDEOGRAPH-FA11' ],
	[ BOTH, 0xFA12, 0x8ff4c4, '# CJK COMPATIBILITY IDEOGRAPH-FA12' ],
	[ BOTH, 0xFA13, 0x8ff4c7, '# CJK COMPATIBILITY IDEOGRAPH-FA13' ],
	[ BOTH, 0xFA14, 0x8ff4c8, '# CJK COMPATIBILITY IDEOGRAPH-FA14' ],
	[ BOTH, 0xFA15, 0x8ff4ce, '# CJK COMPATIBILITY IDEOGRAPH-FA15' ],
	[ BOTH, 0xFA16, 0x8ff4cf, '# CJK COMPATIBILITY IDEOGRAPH-FA16' ],
	[ BOTH, 0xFA17, 0x8ff4d3, '# CJK COMPATIBILITY IDEOGRAPH-FA17' ],
	[ BOTH, 0xFA18, 0x8ff4d5, '# CJK COMPATIBILITY IDEOGRAPH-FA18' ],
	[ BOTH, 0xFA19, 0x8ff4d6, '# CJK COMPATIBILITY IDEOGRAPH-FA19' ],
	[ BOTH, 0xFA1A, 0x8ff4d7, '# CJK COMPATIBILITY IDEOGRAPH-FA1A' ],
	[ BOTH, 0xFA1B, 0x8ff4d8, '# CJK COMPATIBILITY IDEOGRAPH-FA1B' ],
	[ BOTH, 0xFA1C, 0x8ff4da, '# CJK COMPATIBILITY IDEOGRAPH-FA1C' ],
	[ BOTH, 0xFA1D, 0x8ff4db, '# CJK COMPATIBILITY IDEOGRAPH-FA1D' ],
	[ BOTH, 0xFA1E, 0x8ff4de, '# CJK COMPATIBILITY IDEOGRAPH-FA1E' ],
	[ BOTH, 0xFA1F, 0x8ff4e0, '# CJK COMPATIBILITY IDEOGRAPH-FA1F' ],
	[ BOTH, 0xFA20, 0x8ff4e2, '# CJK COMPATIBILITY IDEOGRAPH-FA20' ],
	[ BOTH, 0xFA21, 0x8ff4e3, '# CJK COMPATIBILITY IDEOGRAPH-FA21' ],
	[ BOTH, 0xFA22, 0x8ff4e5, '# CJK COMPATIBILITY IDEOGRAPH-FA22' ],
	[ BOTH, 0xFA23, 0x8ff4e9, '# CJK COMPATIBILITY IDEOGRAPH-FA23' ],
	[ BOTH, 0xFA24, 0x8ff4ea, '# CJK COMPATIBILITY IDEOGRAPH-FA24' ],
	[ BOTH, 0xFA25, 0x8ff4eb, '# CJK COMPATIBILITY IDEOGRAPH-FA25' ],
	[ BOTH, 0xFA26, 0x8ff4ed, '# CJK COMPATIBILITY IDEOGRAPH-FA26' ],
	[ BOTH, 0xFA27, 0x8ff4ef, '# CJK COMPATIBILITY IDEOGRAPH-FA27' ],
	[ BOTH, 0xFA28, 0x8ff4f0, '# CJK COMPATIBILITY IDEOGRAPH-FA28' ],
	[ BOTH, 0xFA29, 0x8ff4f3, '# CJK COMPATIBILITY IDEOGRAPH-FA29' ],
	[ BOTH, 0xFA2A, 0x8ff4f7, '# CJK COMPATIBILITY IDEOGRAPH-FA2A' ],
	[ BOTH, 0xFA2B, 0x8ff4f8, '# CJK COMPATIBILITY IDEOGRAPH-FA2B' ],
	[ BOTH, 0xFA2C, 0x8ff4f9, '# CJK COMPATIBILITY IDEOGRAPH-FA2C' ],
	[ BOTH, 0xFA2D, 0x8ff4fd, '# CJK COMPATIBILITY IDEOGRAPH-FA2D' ],
	[ BOTH, 0xFF07, 0x8ff4a9, '# FULLWIDTH APOSTROPHE' ],
	[ BOTH, 0xFFE4, 0x8fa2c3, '# FULLWIDTH BROKEN BAR' ],

	# additional conversions for EUC_JP -> UTF-8 conversion
	[ TO_UNICODE, 0x2116, 0x8ff4ac, '# NUMERO SIGN' ],
	[ TO_UNICODE, 0x2121, 0x8ff4ad, '# TELEPHONE SIGN' ],
	[ TO_UNICODE, 0x3231, 0x8ff4ab, '# PARENTHESIZED IDEOGRAPH STOCK' ]);

# Expand each row into the hash form used by the other mapping entries,
# keeping the rows in the order listed above.
foreach my $row (@extra_rows)
{
	my ($direction, $ucs, $code, $comment) = @{$row};

	push @mapping,
	  {
		direction => $direction,
		ucs       => $ucs,
		code      => $code,
		comment   => $comment
	  };
}

# Emit the conversion tables built from the collected mapping.
print_conversion_tables($this_script, "EUC_JP", \@mapping);
640
641
#######################################################################
# sjis2jis ; SJIS => JIS conversion
#
# Takes a numeric SJIS code and returns the converted numeric code.
# Codes up to and including 0x100 are returned unchanged.
sub sjis2jis
{
	my ($sjis) = @_;

	return $sjis if ($sjis <= 0x100);

	my $hi = $sjis >> 8;
	my $lo = $sjis & 0xff;

	# Turn the low byte into a zero-based offset: values from 0x80 up are
	# moved down by one, then the 0x40 base is removed.
	if ($lo >= 0x80) { $lo--; }
	$lo -= 0x40;

	# Likewise for the high byte: values from 0xe0 up are moved down by
	# 0x40, then the 0x81 base is removed.
	if ($hi >= 0xe0) { $hi -= 0x40; }
	$hi -= 0x81;

	# Linear position; each high-byte value spans 0xbc positions.
	my $pos = $lo + $hi * 0xbc;

	if ($pos >= 114 * 0x5e && $pos <= 115 * 0x5e + 0x1b)
	{

		# This region (115-ku) is outside the range of JIS codes, but to
		# make it convenient to generate codes in EUC CODESET 3, move it
		# to a seemingly duplicate region (83-84-ku).
		$pos = $pos - ((31 * 0x5e) + 12);

		# after 85-ku 82-ten needs to be moved 2 codepoints
		# NOTE(review): this threshold is written with 0x5c, while the
		# row width used everywhere else is 0x5e.  Left untouched because
		# changing it would alter the generated tables - confirm intent.
		$pos = $pos - 2 if ($pos >= 84 * 0x5c + 82);
	}

	# Split the position into a row and a column of 0x5e cells each.  The
	# row must be an integer; take it explicitly with int() instead of
	# relying on "<<" to truncate a fractional quotient.
	my $hi2 = int($pos / 0x5e);
	my $lo2 = $pos % 0x5e;

	# Both bytes are offset by 0x21 in the result.
	return $lo2 + 0x21 + (($hi2 + 0x21) << 8);
}
678