1#
2# Copyright (c) 2001-2017, PostgreSQL Global Development Group
3#
4# src/backend/utils/mb/Unicode/convutils.pm
5
6package convutils;
7
8use strict;
9
10use Exporter 'import';
11
12our @EXPORT =
13  qw( NONE TO_UNICODE FROM_UNICODE BOTH read_source print_conversion_tables);
14
15# Constants used in the 'direction' field of the character maps
16use constant {
17	NONE         => 0,
18	TO_UNICODE   => 1,
19	FROM_UNICODE => 2,
20	BOTH         => 3 };
21
22#######################################################################
23# read_source - common routine to read source file
24#
25# fname ; input file name
26#
27sub read_source
28{
29	my ($fname) = @_;
30	my @r;
31
32	open(my $in, '<', $fname) || die("cannot open $fname");
33
34	while (<$in>)
35	{
36		next if (/^#/);
37		chop;
38
39		next if (/^$/);    # Ignore empty lines
40
41		next if (/^0x([0-9A-F]+)\s+(#.*)$/);
42
43		# The Unicode source files have three columns
44		# 1: The "foreign" code (in hex)
45		# 2: Unicode code point (in hex)
46		# 3: Unicode name
47		if (!/^0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)\s+(#.*)$/)
48		{
49			print STDERR "READ ERROR at line $. in $fname: $_\n";
50			exit;
51		}
52		my $out = {
53			code      => hex($1),
54			ucs       => hex($2),
55			comment   => $4,
56			direction => BOTH,
57			f         => $fname,
58			l         => $. };
59
60		# Ignore pure ASCII mappings. PostgreSQL character conversion code
61		# never even passes these to the conversion code.
62		next if ($out->{code} < 0x80 || $out->{ucs} < 0x80);
63
64		push(@r, $out);
65	}
66	close($in);
67
68	return \@r;
69}
70
71##################################################################
72# print_conversion_tables - output mapping tables
73#
74# print_conversion_tables($this_script, $csname, \%charset)
75#
76# this_script - the name of the *caller script* of this feature
77# csname      - character set name other than ucs
78# charset     - ref to character set array
79#
80# Input character set array format:
81#
82# Each element in the character set array is a hash. Each hash has the following fields:
83#   direction  - BOTH, TO_UNICODE, or FROM_UNICODE (or NONE, to ignore the entry altogether)
84#   ucs        - Unicode code point
85#   ucs_second - Second Unicode code point, if this is a "combined" character.
86#   code       - Byte sequence in the "other" character set, as an integer
87#   comment    - Text representation of the character
88#   f          - Source filename
89#   l          - Line number in source file
90#
91sub print_conversion_tables
92{
93	my ($this_script, $csname, $charset) = @_;
94
95	print_conversion_tables_direction($this_script, $csname, FROM_UNICODE,
96		$charset);
97	print_conversion_tables_direction($this_script, $csname, TO_UNICODE,
98		$charset);
99}
100
101#############################################################################
102# INTERNAL ROUTINES
103
104#######################################################################
105# print_conversion_tables_direction - write the whole content of C source of radix tree
106#
107# print_conversion_tables_direction($this_script, $csname, $direction, \%charset, $tblwidth)
108#
109# this_script - the name of the *caller script* of this feature
110# csname      - character set name other than ucs
111# direction   - desired direction, TO_UNICODE or FROM_UNICODE
112# charset     - ref to character set array
113#
114sub print_conversion_tables_direction
115{
116	my ($this_script, $csname, $direction, $charset) = @_;
117
118	my $fname;
119	my $tblname;
120	if ($direction == TO_UNICODE)
121	{
122		$fname   = lc("${csname}_to_utf8.map");
123		$tblname = lc("${csname}_to_unicode_tree");
124
125		print "- Writing ${csname}=>UTF8 conversion table: $fname\n";
126	}
127	else
128	{
129		$fname   = lc("utf8_to_${csname}.map");
130		$tblname = lc("${csname}_from_unicode_tree");
131
132		print "- Writing UTF8=>${csname} conversion table: $fname\n";
133	}
134
135	open(my $out, '>', $fname) || die("cannot open $fname");
136
137	print $out "/* src/backend/utils/mb/Unicode/$fname */\n";
138	print $out "/* This file is generated by $this_script */\n\n";
139
140# Collect regular, non-combined, mappings, and create the radix tree from them.
141	my $charmap = &make_charmap($out, $charset, $direction, 0);
142	print_radix_table($out, $tblname, $charmap);
143
144   # Collect combined characters, and create combined character table (if any)
145	my $charmap_combined = &make_charmap_combined($charset, $direction);
146
147	if (scalar @{$charmap_combined} > 0)
148	{
149		if ($direction == TO_UNICODE)
150		{
151			print_to_utf8_combined_map($out, $csname, $charmap_combined, 1);
152		}
153		else
154		{
155			print_from_utf8_combined_map($out, $csname, $charmap_combined, 1);
156		}
157	}
158
159	close($out);
160}
161
162sub print_from_utf8_combined_map
163{
164	my ($out, $charset, $table, $verbose) = @_;
165
166	my $last_comment = "";
167
168	printf $out "\n/* Combined character map */\n";
169	printf $out
170"static const pg_utf_to_local_combined ULmap${charset}_combined[ %d ] = {",
171	  scalar(@$table);
172	my $first = 1;
173	foreach my $i (sort { $a->{utf8} <=> $b->{utf8} } @$table)
174	{
175		print($out ",") if (!$first);
176		$first = 0;
177		print $out "\t/* $last_comment */"
178		  if ($verbose && $last_comment ne "");
179
180		printf $out "\n  {0x%08x, 0x%08x, 0x%04x}",
181		  $i->{utf8}, $i->{utf8_second}, $i->{code};
182		if ($verbose >= 2)
183		{
184			$last_comment =
185			  sprintf("%s:%d %s", $i->{f}, $i->{l}, $i->{comment});
186		}
187		elsif ($verbose >= 1)
188		{
189			$last_comment = $i->{comment};
190		}
191	}
192	print $out "\t/* $last_comment */" if ($verbose && $last_comment ne "");
193	print $out "\n};\n";
194}
195
196sub print_to_utf8_combined_map
197{
198	my ($out, $charset, $table, $verbose) = @_;
199
200	my $last_comment = "";
201
202	printf $out "\n/* Combined character map */\n";
203	printf $out
204"static const pg_local_to_utf_combined LUmap${charset}_combined[ %d ] = {",
205	  scalar(@$table);
206
207	my $first = 1;
208	foreach my $i (sort { $a->{code} <=> $b->{code} } @$table)
209	{
210		print($out ",") if (!$first);
211		$first = 0;
212		print $out "\t/* $last_comment */"
213		  if ($verbose && $last_comment ne "");
214
215		printf $out "\n  {0x%04x, 0x%08x, 0x%08x}",
216		  $i->{code}, $i->{utf8}, $i->{utf8_second};
217
218		if ($verbose >= 2)
219		{
220			$last_comment =
221			  sprintf("%s:%d %s", $i->{f}, $i->{l}, $i->{comment});
222		}
223		elsif ($verbose >= 1)
224		{
225			$last_comment = $i->{comment};
226		}
227	}
228	print $out "\t/* $last_comment */" if ($verbose && $last_comment ne "");
229	print $out "\n};\n";
230}
231
232#######################################################################
233# print_radix_table(<output handle>, <table name>, <charmap hash ref>)
234#
235# Input: A hash, mapping an input character to an output character.
236#
237# Constructs a radix tree from the hash, and prints it out as a C-struct.
238#
239sub print_radix_table
240{
241	my ($out, $tblname, $c) = @_;
242
243	###
244	### Build radix trees in memory, for 1-, 2-, 3- and 4-byte inputs. Each
245	### radix tree is represented as a nested hash, each hash indexed by
246	### input byte
247	###
248	my %b1map;
249	my %b2map;
250	my %b3map;
251	my %b4map;
252	foreach my $in (keys %$c)
253	{
254		my $out = $c->{$in};
255
256		if ($in < 0x100)
257		{
258			$b1map{$in} = $out;
259		}
260		elsif ($in < 0x10000)
261		{
262			my $b1 = $in >> 8;
263			my $b2 = $in & 0xff;
264
265			$b2map{$b1}{$b2} = $out;
266		}
267		elsif ($in < 0x1000000)
268		{
269			my $b1 = $in >> 16;
270			my $b2 = ($in >> 8) & 0xff;
271			my $b3 = $in & 0xff;
272
273			$b3map{$b1}{$b2}{$b3} = $out;
274		}
275		elsif ($in < 0x100000000)
276		{
277			my $b1 = $in >> 24;
278			my $b2 = ($in >> 16) & 0xff;
279			my $b3 = ($in >> 8) & 0xff;
280			my $b4 = $in & 0xff;
281
282			$b4map{$b1}{$b2}{$b3}{$b4} = $out;
283		}
284		else
285		{
286			die sprintf("up to 4 byte code is supported: %x", $in);
287		}
288	}
289
290	my @segments;
291
292	###
293	### Build a linear list of "segments", from the nested hashes.
294	###
295	### Each segment is a lookup table, keyed by the next byte in the input.
296	### The segments are written out physically to one big array in the final
297	### step, but logically, they form a radix tree. Or rather, four radix
298	### trees: one for 1-byte inputs, another for 2-byte inputs, 3-byte
299	### inputs, and 4-byte inputs.
300	###
301	### Each segment is represented by a hash with following fields:
302	###
303	### comment => <string to output as a comment>
304	### label => <label that can be used to refer to this segment from elsewhere>
305	### values => <a hash, keyed by byte, 0-0xff>
306	###
307	### Entries in 'values' can be integers (for leaf-level segments), or
308	### string labels, pointing to a segment with that label. Any missing
309	### values are treated as zeros. If 'values' hash is missing altogether,
310	### it's treated as all-zeros.
311	###
312	### Subsequent steps will enrich the segments with more fields.
313	###
314
315	# Add the segments for the radix trees themselves.
316	push @segments,
317	  build_segments_from_tree("Single byte table", "1-byte", 1, \%b1map);
318	push @segments,
319	  build_segments_from_tree("Two byte table", "2-byte", 2, \%b2map);
320	push @segments,
321	  build_segments_from_tree("Three byte table", "3-byte", 3, \%b3map);
322	push @segments,
323	  build_segments_from_tree("Four byte table", "4-byte", 4, \%b4map);
324
325	###
326	### Find min and max index used in each level of each tree.
327	###
328	### These are stored separately, and we can then leave out the unused
329	### parts of every segment. (When using the resulting tree, you must
330	### check each input byte against the min and max.)
331	###
332	my %min_idx;
333	my %max_idx;
334	foreach my $seg (@segments)
335	{
336		my $this_min = $min_idx{ $seg->{depth} }->{ $seg->{level} };
337		my $this_max = $max_idx{ $seg->{depth} }->{ $seg->{level} };
338
339		foreach my $i (keys %{ $seg->{values} })
340		{
341			$this_min = $i if (!defined $this_min || $i < $this_min);
342			$this_max = $i if (!defined $this_max || $i > $this_max);
343		}
344
345		$min_idx{ $seg->{depth} }{ $seg->{level} } = $this_min;
346		$max_idx{ $seg->{depth} }{ $seg->{level} } = $this_max;
347	}
348
349	# Copy the mins and max's back to every segment, for convenience.
350	foreach my $seg (@segments)
351	{
352		$seg->{min_idx} = $min_idx{ $seg->{depth} }{ $seg->{level} };
353		$seg->{max_idx} = $max_idx{ $seg->{depth} }{ $seg->{level} };
354	}
355
356	###
357	### Prepend a dummy all-zeros map to the beginning.
358	###
359	### A 0 is an invalid value anywhere in the table, and this allows us to
360	### point to 0 offset from any table, to get a 0 result.
361	###
362
363	# Find the max range between min and max indexes in any of the segments.
364	my $widest_range = 0;
365	foreach my $seg (@segments)
366	{
367		my $this_range = $seg->{max_idx} - $seg->{min_idx};
368		$widest_range = $this_range if ($this_range > $widest_range);
369	}
370
371	unshift @segments,
372	  { header  => "Dummy map, for invalid values",
373		min_idx => 0,
374		max_idx => $widest_range };
375
376	###
377	### Eliminate overlapping zeros
378	###
379	### For each segment, if there are zero values at the end of, and there
380	### are also zero values at the beginning of the next segment, we can
381	### overlay the tail of this segment with the head of next segment, to
382	### save space.
383	###
384	### To achieve that, we subtract the 'max_idx' of each segment with the
385	### amount of zeros that can be overlaid.
386	###
387	for (my $j = 0; $j < $#segments - 1; $j++)
388	{
389		my $seg     = $segments[$j];
390		my $nextseg = $segments[ $j + 1 ];
391
392		# Count the number of zero values at the end of this segment.
393		my $this_trail_zeros = 0;
394		for (
395			my $i = $seg->{max_idx};
396			$i >= $seg->{min_idx} && !$seg->{values}->{$i};
397			$i--)
398		{
399			$this_trail_zeros++;
400		}
401
402		# Count the number of zeros at the beginning of next segment.
403		my $next_lead_zeros = 0;
404		for (
405			my $i = $nextseg->{min_idx};
406			$i <= $nextseg->{max_idx} && !$nextseg->{values}->{$i};
407			$i++)
408		{
409			$next_lead_zeros++;
410		}
411
412		# How many zeros in common?
413		my $overlaid_trail_zeros =
414		  ($this_trail_zeros > $next_lead_zeros)
415		  ? $next_lead_zeros
416		  : $this_trail_zeros;
417
418		$seg->{overlaid_trail_zeros} = $overlaid_trail_zeros;
419		$seg->{max_idx} = $seg->{max_idx} - $overlaid_trail_zeros;
420	}
421
422	###
423	### Replace label references with real offsets.
424	###
425	### So far, the non-leaf segments have referred to other segments by
426	### their labels. Replace them with numerical offsets from the beginning
427	### of the final array. You cannot move, add, or remove segments after
428	### this step, as that would invalidate the offsets calculated here!
429	###
430	my $flatoff = 0;
431	my %segmap;
432
433	# First pass: assign offsets to each segment, and build hash
434	# of label => offset.
435	foreach my $seg (@segments)
436	{
437		$seg->{offset} = $flatoff;
438		$segmap{ $seg->{label} } = $flatoff;
439		$flatoff += $seg->{max_idx} - $seg->{min_idx} + 1;
440	}
441	my $tblsize = $flatoff;
442
443	# Second pass: look up the offset of each label reference in the hash.
444	foreach my $seg (@segments)
445	{
446		while (my ($i, $val) = each %{ $seg->{values} })
447		{
448			if (!($val =~ /^[0-9,.E]+$/))
449			{
450				my $segoff = $segmap{$val};
451				if ($segoff)
452				{
453					$seg->{values}->{$i} = $segoff;
454				}
455				else
456				{
457					die "no segment with label $val";
458				}
459			}
460		}
461	}
462
463	# Also look up the positions of the roots in the table.
464	my $b1root = $segmap{"1-byte"};
465	my $b2root = $segmap{"2-byte"};
466	my $b3root = $segmap{"3-byte"};
467	my $b4root = $segmap{"4-byte"};
468
469	# And the lower-upper values of each level in each radix tree.
470	my $b1_lower = $min_idx{1}{1};
471	my $b1_upper = $max_idx{1}{1};
472
473	my $b2_1_lower = $min_idx{2}{1};
474	my $b2_1_upper = $max_idx{2}{1};
475	my $b2_2_lower = $min_idx{2}{2};
476	my $b2_2_upper = $max_idx{2}{2};
477
478	my $b3_1_lower = $min_idx{3}{1};
479	my $b3_1_upper = $max_idx{3}{1};
480	my $b3_2_lower = $min_idx{3}{2};
481	my $b3_2_upper = $max_idx{3}{2};
482	my $b3_3_lower = $min_idx{3}{3};
483	my $b3_3_upper = $max_idx{3}{3};
484
485	my $b4_1_lower = $min_idx{4}{1};
486	my $b4_1_upper = $max_idx{4}{1};
487	my $b4_2_lower = $min_idx{4}{2};
488	my $b4_2_upper = $max_idx{4}{2};
489	my $b4_3_lower = $min_idx{4}{3};
490	my $b4_3_upper = $max_idx{4}{3};
491	my $b4_4_lower = $min_idx{4}{4};
492	my $b4_4_upper = $max_idx{4}{4};
493
494	###
495	### Find the maximum value in the whole table, to determine if we can
496	### use uint16 or if we need to use uint32.
497	###
498	my $max_val = 0;
499	foreach my $seg (@segments)
500	{
501		foreach my $val (values %{ $seg->{values} })
502		{
503			$max_val = $val if ($val > $max_val);
504		}
505	}
506
507	my $datatype = ($max_val <= 0xffff) ? "uint16" : "uint32";
508
509	# For formatting, determine how many values we can fit on a single
510	# line, and how wide each value needs to be to align nicely.
511	my $vals_per_line;
512	my $colwidth;
513
514	if ($max_val <= 0xffff)
515	{
516		$vals_per_line = 8;
517		$colwidth      = 4;
518	}
519	elsif ($max_val <= 0xffffff)
520	{
521		$vals_per_line = 4;
522		$colwidth      = 6;
523	}
524	else
525	{
526		$vals_per_line = 4;
527		$colwidth      = 8;
528	}
529
530	###
531	### Print the struct and array.
532	###
533	printf $out "static const $datatype ${tblname}_table[$tblsize];\n";
534	printf $out "\n";
535	printf $out "static const pg_mb_radix_tree $tblname =\n";
536	printf $out "{\n";
537	if ($datatype eq "uint16")
538	{
539		print $out "  ${tblname}_table,\n";
540		print $out "  NULL, /* 32-bit table not used */\n";
541	}
542	if ($datatype eq "uint32")
543	{
544		print $out "  NULL, /* 16-bit table not used */\n";
545		print $out "  ${tblname}_table,\n";
546	}
547	printf $out "\n";
548	printf $out "  0x%04x, /* offset of table for 1-byte inputs */\n",
549	  $b1root;
550	printf $out "  0x%02x, /* b1_lower */\n", $b1_lower;
551	printf $out "  0x%02x, /* b1_upper */\n", $b1_upper;
552	printf $out "\n";
553	printf $out "  0x%04x, /* offset of table for 2-byte inputs */\n",
554	  $b2root;
555	printf $out "  0x%02x, /* b2_1_lower */\n", $b2_1_lower;
556	printf $out "  0x%02x, /* b2_1_upper */\n", $b2_1_upper;
557	printf $out "  0x%02x, /* b2_2_lower */\n", $b2_2_lower;
558	printf $out "  0x%02x, /* b2_2_upper */\n", $b2_2_upper;
559	printf $out "\n";
560	printf $out "  0x%04x, /* offset of table for 3-byte inputs */\n",
561	  $b3root;
562	printf $out "  0x%02x, /* b3_1_lower */\n", $b3_1_lower;
563	printf $out "  0x%02x, /* b3_1_upper */\n", $b3_1_upper;
564	printf $out "  0x%02x, /* b3_2_lower */\n", $b3_2_lower;
565	printf $out "  0x%02x, /* b3_2_upper */\n", $b3_2_upper;
566	printf $out "  0x%02x, /* b3_3_lower */\n", $b3_3_lower;
567	printf $out "  0x%02x, /* b3_3_upper */\n", $b3_3_upper;
568	printf $out "\n";
569	printf $out "  0x%04x, /* offset of table for 3-byte inputs */\n",
570	  $b4root;
571	printf $out "  0x%02x, /* b4_1_lower */\n", $b4_1_lower;
572	printf $out "  0x%02x, /* b4_1_upper */\n", $b4_1_upper;
573	printf $out "  0x%02x, /* b4_2_lower */\n", $b4_2_lower;
574	printf $out "  0x%02x, /* b4_2_upper */\n", $b4_2_upper;
575	printf $out "  0x%02x, /* b4_3_lower */\n", $b4_3_lower;
576	printf $out "  0x%02x, /* b4_3_upper */\n", $b4_3_upper;
577	printf $out "  0x%02x, /* b4_4_lower */\n", $b4_4_lower;
578	printf $out "  0x%02x  /* b4_4_upper */\n", $b4_4_upper;
579	print $out "};\n";
580	print $out "\n";
581	print $out "static const $datatype ${tblname}_table[$tblsize] =\n";
582	print $out "{";
583	my $off = 0;
584
585	foreach my $seg (@segments)
586	{
587		printf $out "\n";
588		printf $out "  /*** %s - offset 0x%05x ***/\n", $seg->{header}, $off;
589		printf $out "\n";
590
591		for (my $i = $seg->{min_idx}; $i <= $seg->{max_idx};)
592		{
593
594			# Print the next line's worth of values.
595			# XXX pad to begin at a nice boundary
596			printf $out "  /* %02x */ ", $i;
597			for (my $j = 0;
598				$j < $vals_per_line && $i <= $seg->{max_idx}; $j++)
599			{
600				my $val = $seg->{values}->{$i};
601
602				printf $out " 0x%0*x", $colwidth, $val;
603				$off++;
604				if ($off != $tblsize)
605				{
606					print $out ",";
607				}
608				$i++;
609			}
610			print $out "\n";
611		}
612		if ($seg->{overlaid_trail_zeros})
613		{
614			printf $out
615"    /* $seg->{overlaid_trail_zeros} trailing zero values shared with next segment */\n";
616		}
617	}
618
619	# Sanity check.
620	if ($off != $tblsize) { die "table size didn't match!"; }
621
622	print $out "};\n";
623}
624
625###
626sub build_segments_from_tree
627{
628	my ($header, $rootlabel, $depth, $map) = @_;
629
630	my @segments;
631
632	if (%{$map})
633	{
634		@segments =
635		  build_segments_recurse($header, $rootlabel, "", 1, $depth, $map);
636
637		# Sort the segments into "breadth-first" order. Not strictly required,
638		# but makes the maps nicer to read.
639		@segments =
640		  sort { $a->{level} cmp $b->{level} or $a->{path} cmp $b->{path} }
641		  @segments;
642	}
643
644	return @segments;
645}
646
647###
648sub build_segments_recurse
649{
650	my ($header, $label, $path, $level, $depth, $map) = @_;
651
652	my @segments;
653
654	if ($level == $depth)
655	{
656		push @segments,
657		  { header => $header . ", leaf: ${path}xx",
658			label  => $label,
659			level  => $level,
660			depth  => $depth,
661			path   => $path,
662			values => $map };
663	}
664	else
665	{
666		my %children;
667
668		while (my ($i, $val) = each %$map)
669		{
670			my $childpath = $path . sprintf("%02x", $i);
671			my $childlabel = "$depth-level-$level-$childpath";
672
673			push @segments,
674			  build_segments_recurse($header, $childlabel, $childpath,
675				$level + 1, $depth, $val);
676			$children{$i} = $childlabel;
677		}
678
679		push @segments,
680		  { header => $header . ", byte #$level: ${path}xx",
681			label  => $label,
682			level  => $level,
683			depth  => $depth,
684			path   => $path,
685			values => \%children };
686	}
687	return @segments;
688}
689
690#######################################################################
691# make_charmap - convert charset table to charmap hash
692#
693# make_charmap(\@charset, $direction)
694# charset     - ref to charset table : see print_conversion_tables
695# direction   - conversion direction
696#
697sub make_charmap
698{
699	my ($out, $charset, $direction, $verbose) = @_;
700
701	die "unacceptable direction : $direction"
702	  if ($direction != TO_UNICODE && $direction != FROM_UNICODE);
703
704	# In verbose mode, print a large comment with the source and comment of
705	# each character
706	if ($verbose)
707	{
708		print $out "/*\n";
709		print $out "<src>  <dst>    <file>:<lineno> <comment>\n";
710	}
711
712	my %charmap;
713	foreach my $c (@$charset)
714	{
715
716		# combined characters are handled elsewhere
717		next if (defined $c->{ucs_second});
718
719		next if ($c->{direction} != $direction && $c->{direction} != BOTH);
720
721		my ($src, $dst) =
722		  $direction == TO_UNICODE
723		  ? ($c->{code}, ucs2utf($c->{ucs}))
724		  : (ucs2utf($c->{ucs}), $c->{code});
725
726		# check for duplicate source codes
727		if (defined $charmap{$src})
728		{
729			printf STDERR
730"Error: duplicate source code on %s:%d: 0x%04x => 0x%04x, 0x%04x\n",
731			  $c->{f}, $c->{l}, $src, $charmap{$src}, $dst;
732			exit;
733		}
734		$charmap{$src} = $dst;
735
736		if ($verbose)
737		{
738			printf $out "0x%04x 0x%04x %s:%d %s\n", $src, $dst, $c->{f},
739			  $c->{l}, $c->{comment};
740		}
741	}
742	if ($verbose)
743	{
744		print $out "*/\n\n";
745	}
746
747	return \%charmap;
748}
749
750#######################################################################
751# make_charmap_combined - convert charset table to charmap hash
752#     with checking duplicate source code
753#
754# make_charmap_combined(\@charset, $direction)
755# charset     - ref to charset table : see print_conversion_tables
756# direction   - conversion direction
757#
758sub make_charmap_combined
759{
760	my ($charset, $direction) = @_;
761
762	die "unacceptable direction : $direction"
763	  if ($direction != TO_UNICODE && $direction != FROM_UNICODE);
764
765	my @combined;
766	foreach my $c (@$charset)
767	{
768		next if ($c->{direction} != $direction && $c->{direction} != BOTH);
769
770		if (defined $c->{ucs_second})
771		{
772			my $entry = {
773				utf8        => ucs2utf($c->{ucs}),
774				utf8_second => ucs2utf($c->{ucs_second}),
775				code        => $c->{code},
776				comment     => $c->{comment},
777				f           => $c->{f},
778				l           => $c->{l} };
779			push @combined, $entry;
780		}
781	}
782
783	return \@combined;
784}
785
786#######################################################################
787# convert UCS-4 to UTF-8
788#
789sub ucs2utf
790{
791	my ($ucs) = @_;
792	my $utf;
793
794	if ($ucs <= 0x007f)
795	{
796		$utf = $ucs;
797	}
798	elsif ($ucs > 0x007f && $ucs <= 0x07ff)
799	{
800		$utf = (($ucs & 0x003f) | 0x80) | ((($ucs >> 6) | 0xc0) << 8);
801	}
802	elsif ($ucs > 0x07ff && $ucs <= 0xffff)
803	{
804		$utf =
805		  ((($ucs >> 12) | 0xe0) << 16) |
806		  (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80);
807	}
808	else
809	{
810		$utf =
811		  ((($ucs >> 18) | 0xf0) << 24) |
812		  (((($ucs & 0x3ffff) >> 12) | 0x80) << 16) |
813		  (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80);
814	}
815	return ($utf);
816}
817
8181;
819