1#
2# Copyright (c) 2001-2020, PostgreSQL Global Development Group
3#
4# src/backend/utils/mb/Unicode/convutils.pm
5
6package convutils;
7
8use strict;
9use warnings;
10
11use Carp;
12use Exporter 'import';
13
14our @EXPORT =
15  qw( NONE TO_UNICODE FROM_UNICODE BOTH read_source print_conversion_tables);
16
17# Constants used in the 'direction' field of the character maps
18use constant {
19	NONE         => 0,
20	TO_UNICODE   => 1,
21	FROM_UNICODE => 2,
22	BOTH         => 3
23};
24
25#######################################################################
26# read_source - common routine to read source file
27#
28# fname ; input file name
29#
30sub read_source
31{
32	my ($fname) = @_;
33	my @r;
34
35	open(my $in, '<', $fname) || die("cannot open $fname");
36
37	while (<$in>)
38	{
39		next if (/^#/);
40		chop;
41
42		next if (/^$/);    # Ignore empty lines
43
44		next if (/^0x([0-9A-F]+)\s+(#.*)$/);
45
46		# The Unicode source files have three columns
47		# 1: The "foreign" code (in hex)
48		# 2: Unicode code point (in hex)
49		# 3: Unicode name
50		if (!/^0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)\s+(#.*)$/)
51		{
52			print STDERR "READ ERROR at line $. in $fname: $_\n";
53			exit;
54		}
55		my $out = {
56			code      => hex($1),
57			ucs       => hex($2),
58			comment   => $4,
59			direction => BOTH,
60			f         => $fname,
61			l         => $.
62		};
63
64		# Ignore pure ASCII mappings. PostgreSQL character conversion code
65		# never even passes these to the conversion code.
66		next if ($out->{code} < 0x80 || $out->{ucs} < 0x80);
67
68		push(@r, $out);
69	}
70	close($in);
71
72	return \@r;
73}
74
75##################################################################
76# print_conversion_tables - output mapping tables
77#
78# print_conversion_tables($this_script, $csname, \%charset)
79#
80# this_script - the name of the *caller script* of this feature
81# csname      - character set name other than ucs
82# charset     - ref to character set array
83#
84# Input character set array format:
85#
86# Each element in the character set array is a hash. Each hash has the following fields:
87#   direction  - BOTH, TO_UNICODE, or FROM_UNICODE (or NONE, to ignore the entry altogether)
88#   ucs        - Unicode code point
89#   ucs_second - Second Unicode code point, if this is a "combined" character.
90#   code       - Byte sequence in the "other" character set, as an integer
91#   comment    - Text representation of the character
92#   f          - Source filename
93#   l          - Line number in source file
94#
95sub print_conversion_tables
96{
97	my ($this_script, $csname, $charset) = @_;
98
99	print_conversion_tables_direction($this_script, $csname, FROM_UNICODE,
100		$charset);
101	print_conversion_tables_direction($this_script, $csname, TO_UNICODE,
102		$charset);
103	return;
104}
105
106#############################################################################
107# INTERNAL ROUTINES
108
109#######################################################################
110# print_conversion_tables_direction - write the whole content of C source of radix tree
111#
112# print_conversion_tables_direction($this_script, $csname, $direction, \%charset, $tblwidth)
113#
114# this_script - the name of the *caller script* of this feature
115# csname      - character set name other than ucs
116# direction   - desired direction, TO_UNICODE or FROM_UNICODE
117# charset     - ref to character set array
118#
119sub print_conversion_tables_direction
120{
121	my ($this_script, $csname, $direction, $charset) = @_;
122
123	my $fname;
124	my $tblname;
125	if ($direction == TO_UNICODE)
126	{
127		$fname   = lc("${csname}_to_utf8.map");
128		$tblname = lc("${csname}_to_unicode_tree");
129
130		print "- Writing ${csname}=>UTF8 conversion table: $fname\n";
131	}
132	else
133	{
134		$fname   = lc("utf8_to_${csname}.map");
135		$tblname = lc("${csname}_from_unicode_tree");
136
137		print "- Writing UTF8=>${csname} conversion table: $fname\n";
138	}
139
140	open(my $out, '>', $fname) || die("cannot open $fname");
141
142	print $out "/* src/backend/utils/mb/Unicode/$fname */\n";
143	print $out "/* This file is generated by $this_script */\n\n";
144
145	# Collect regular, non-combined, mappings, and create the radix tree from them.
146	my $charmap = &make_charmap($out, $charset, $direction, 0);
147	print_radix_table($out, $tblname, $charmap);
148
149	# Collect combined characters, and create combined character table (if any)
150	my $charmap_combined = &make_charmap_combined($charset, $direction);
151
152	if (scalar @{$charmap_combined} > 0)
153	{
154		if ($direction == TO_UNICODE)
155		{
156			print_to_utf8_combined_map($out, $csname, $charmap_combined, 1);
157		}
158		else
159		{
160			print_from_utf8_combined_map($out, $csname, $charmap_combined, 1);
161		}
162	}
163
164	close($out);
165	return;
166}
167
168sub print_from_utf8_combined_map
169{
170	my ($out, $charset, $table, $verbose) = @_;
171
172	my $last_comment = "";
173
174	printf $out "\n/* Combined character map */\n";
175	printf $out
176	  "static const pg_utf_to_local_combined ULmap${charset}_combined[ %d ] = {",
177	  scalar(@$table);
178	my $first = 1;
179	foreach my $i (sort { $a->{utf8} <=> $b->{utf8} } @$table)
180	{
181		print($out ",") if (!$first);
182		$first = 0;
183		print $out "\t/* $last_comment */"
184		  if ($verbose && $last_comment ne "");
185
186		printf $out "\n  {0x%08x, 0x%08x, 0x%04x}",
187		  $i->{utf8}, $i->{utf8_second}, $i->{code};
188		if ($verbose >= 2)
189		{
190			$last_comment =
191			  sprintf("%s:%d %s", $i->{f}, $i->{l}, $i->{comment});
192		}
193		elsif ($verbose >= 1)
194		{
195			$last_comment = $i->{comment};
196		}
197	}
198	print $out "\t/* $last_comment */" if ($verbose && $last_comment ne "");
199	print $out "\n};\n";
200	return;
201}
202
203sub print_to_utf8_combined_map
204{
205	my ($out, $charset, $table, $verbose) = @_;
206
207	my $last_comment = "";
208
209	printf $out "\n/* Combined character map */\n";
210	printf $out
211	  "static const pg_local_to_utf_combined LUmap${charset}_combined[ %d ] = {",
212	  scalar(@$table);
213
214	my $first = 1;
215	foreach my $i (sort { $a->{code} <=> $b->{code} } @$table)
216	{
217		print($out ",") if (!$first);
218		$first = 0;
219		print $out "\t/* $last_comment */"
220		  if ($verbose && $last_comment ne "");
221
222		printf $out "\n  {0x%04x, 0x%08x, 0x%08x}",
223		  $i->{code}, $i->{utf8}, $i->{utf8_second};
224
225		if ($verbose >= 2)
226		{
227			$last_comment =
228			  sprintf("%s:%d %s", $i->{f}, $i->{l}, $i->{comment});
229		}
230		elsif ($verbose >= 1)
231		{
232			$last_comment = $i->{comment};
233		}
234	}
235	print $out "\t/* $last_comment */" if ($verbose && $last_comment ne "");
236	print $out "\n};\n";
237	return;
238}
239
240#######################################################################
241# print_radix_table(<output handle>, <table name>, <charmap hash ref>)
242#
243# Input: A hash, mapping an input character to an output character.
244#
245# Constructs a radix tree from the hash, and prints it out as a C-struct.
246#
247sub print_radix_table
248{
249	my ($out, $tblname, $c) = @_;
250
251	###
252	### Build radix trees in memory, for 1-, 2-, 3- and 4-byte inputs. Each
253	### radix tree is represented as a nested hash, each hash indexed by
254	### input byte
255	###
256	my %b1map;
257	my %b2map;
258	my %b3map;
259	my %b4map;
260	foreach my $in (keys %$c)
261	{
262		my $out = $c->{$in};
263
264		if ($in <= 0xff)
265		{
266			$b1map{$in} = $out;
267		}
268		elsif ($in <= 0xffff)
269		{
270			my $b1 = $in >> 8;
271			my $b2 = $in & 0xff;
272
273			$b2map{$b1}{$b2} = $out;
274		}
275		elsif ($in <= 0xffffff)
276		{
277			my $b1 = $in >> 16;
278			my $b2 = ($in >> 8) & 0xff;
279			my $b3 = $in & 0xff;
280
281			$b3map{$b1}{$b2}{$b3} = $out;
282		}
283		elsif ($in <= 0xffffffff)
284		{
285			my $b1 = $in >> 24;
286			my $b2 = ($in >> 16) & 0xff;
287			my $b3 = ($in >> 8) & 0xff;
288			my $b4 = $in & 0xff;
289
290			$b4map{$b1}{$b2}{$b3}{$b4} = $out;
291		}
292		else
293		{
294			die sprintf("up to 4 byte code is supported: %x", $in);
295		}
296	}
297
298	my @segments;
299
300	###
301	### Build a linear list of "segments", from the nested hashes.
302	###
303	### Each segment is a lookup table, keyed by the next byte in the input.
304	### The segments are written out physically to one big array in the final
305	### step, but logically, they form a radix tree. Or rather, four radix
306	### trees: one for 1-byte inputs, another for 2-byte inputs, 3-byte
307	### inputs, and 4-byte inputs.
308	###
309	### Each segment is represented by a hash with following fields:
310	###
311	### comment => <string to output as a comment>
312	### label => <label that can be used to refer to this segment from elsewhere>
313	### values => <a hash, keyed by byte, 0-0xff>
314	###
315	### Entries in 'values' can be integers (for leaf-level segments), or
316	### string labels, pointing to a segment with that label. Any missing
317	### values are treated as zeros. If 'values' hash is missing altogether,
318	### it's treated as all-zeros.
319	###
320	### Subsequent steps will enrich the segments with more fields.
321	###
322
323	# Add the segments for the radix trees themselves.
324	push @segments,
325	  build_segments_from_tree("Single byte table", "1-byte", 1, \%b1map);
326	push @segments,
327	  build_segments_from_tree("Two byte table", "2-byte", 2, \%b2map);
328	push @segments,
329	  build_segments_from_tree("Three byte table", "3-byte", 3, \%b3map);
330	push @segments,
331	  build_segments_from_tree("Four byte table", "4-byte", 4, \%b4map);
332
333	###
334	### Find min and max index used in each level of each tree.
335	###
336	### These are stored separately, and we can then leave out the unused
337	### parts of every segment. (When using the resulting tree, you must
338	### check each input byte against the min and max.)
339	###
340	my %min_idx;
341	my %max_idx;
342	foreach my $seg (@segments)
343	{
344		my $this_min = $min_idx{ $seg->{depth} }->{ $seg->{level} };
345		my $this_max = $max_idx{ $seg->{depth} }->{ $seg->{level} };
346
347		foreach my $i (keys %{ $seg->{values} })
348		{
349			$this_min = $i if (!defined $this_min || $i < $this_min);
350			$this_max = $i if (!defined $this_max || $i > $this_max);
351		}
352
353		$min_idx{ $seg->{depth} }{ $seg->{level} } = $this_min;
354		$max_idx{ $seg->{depth} }{ $seg->{level} } = $this_max;
355	}
356
357	# Copy the mins and max's back to every segment, for convenience.
358	foreach my $seg (@segments)
359	{
360		$seg->{min_idx} = $min_idx{ $seg->{depth} }{ $seg->{level} };
361		$seg->{max_idx} = $max_idx{ $seg->{depth} }{ $seg->{level} };
362	}
363
364	###
365	### Prepend a dummy all-zeros map to the beginning.
366	###
367	### A 0 is an invalid value anywhere in the table, and this allows us to
368	### point to 0 offset from any table, to get a 0 result.
369	###
370
371	# Find the max range between min and max indexes in any of the segments.
372	my $widest_range = 0;
373	foreach my $seg (@segments)
374	{
375		my $this_range = $seg->{max_idx} - $seg->{min_idx};
376		$widest_range = $this_range if ($this_range > $widest_range);
377	}
378
379	unshift @segments,
380	  {
381		header  => "Dummy map, for invalid values",
382		min_idx => 0,
383		max_idx => $widest_range
384	  };
385
386	###
387	### Eliminate overlapping zeros
388	###
389	### For each segment, if there are zero values at the end of, and there
390	### are also zero values at the beginning of the next segment, we can
391	### overlay the tail of this segment with the head of next segment, to
392	### save space.
393	###
394	### To achieve that, we subtract the 'max_idx' of each segment with the
395	### amount of zeros that can be overlaid.
396	###
397	for (my $j = 0; $j < $#segments - 1; $j++)
398	{
399		my $seg     = $segments[$j];
400		my $nextseg = $segments[ $j + 1 ];
401
402		# Count the number of zero values at the end of this segment.
403		my $this_trail_zeros = 0;
404		for (
405			my $i = $seg->{max_idx};
406			$i >= $seg->{min_idx} && !$seg->{values}->{$i};
407			$i--)
408		{
409			$this_trail_zeros++;
410		}
411
412		# Count the number of zeros at the beginning of next segment.
413		my $next_lead_zeros = 0;
414		for (
415			my $i = $nextseg->{min_idx};
416			$i <= $nextseg->{max_idx} && !$nextseg->{values}->{$i};
417			$i++)
418		{
419			$next_lead_zeros++;
420		}
421
422		# How many zeros in common?
423		my $overlaid_trail_zeros =
424		  ($this_trail_zeros > $next_lead_zeros)
425		  ? $next_lead_zeros
426		  : $this_trail_zeros;
427
428		$seg->{overlaid_trail_zeros} = $overlaid_trail_zeros;
429		$seg->{max_idx} = $seg->{max_idx} - $overlaid_trail_zeros;
430	}
431
432	###
433	### Replace label references with real offsets.
434	###
435	### So far, the non-leaf segments have referred to other segments by
436	### their labels. Replace them with numerical offsets from the beginning
437	### of the final array. You cannot move, add, or remove segments after
438	### this step, as that would invalidate the offsets calculated here!
439	###
440	my $flatoff = 0;
441	my %segmap;
442
443	# First pass: assign offsets to each segment, and build hash
444	# of label => offset.
445	foreach my $seg (@segments)
446	{
447		$seg->{offset} = $flatoff;
448		$segmap{ $seg->{label} } = $flatoff;
449		$flatoff += $seg->{max_idx} - $seg->{min_idx} + 1;
450	}
451	my $tblsize = $flatoff;
452
453	# Second pass: look up the offset of each label reference in the hash.
454	foreach my $seg (@segments)
455	{
456		while (my ($i, $val) = each %{ $seg->{values} })
457		{
458			if (!($val =~ /^[0-9,.E]+$/))
459			{
460				my $segoff = $segmap{$val};
461				if ($segoff)
462				{
463					$seg->{values}->{$i} = $segoff;
464				}
465				else
466				{
467					die "no segment with label $val";
468				}
469			}
470		}
471	}
472
473	# Also look up the positions of the roots in the table.
474	my $b1root = $segmap{"1-byte"};
475	my $b2root = $segmap{"2-byte"};
476	my $b3root = $segmap{"3-byte"};
477	my $b4root = $segmap{"4-byte"};
478
479	# And the lower-upper values of each level in each radix tree.
480	my $b1_lower = $min_idx{1}{1};
481	my $b1_upper = $max_idx{1}{1};
482
483	my $b2_1_lower = $min_idx{2}{1};
484	my $b2_1_upper = $max_idx{2}{1};
485	my $b2_2_lower = $min_idx{2}{2};
486	my $b2_2_upper = $max_idx{2}{2};
487
488	my $b3_1_lower = $min_idx{3}{1};
489	my $b3_1_upper = $max_idx{3}{1};
490	my $b3_2_lower = $min_idx{3}{2};
491	my $b3_2_upper = $max_idx{3}{2};
492	my $b3_3_lower = $min_idx{3}{3};
493	my $b3_3_upper = $max_idx{3}{3};
494
495	my $b4_1_lower = $min_idx{4}{1};
496	my $b4_1_upper = $max_idx{4}{1};
497	my $b4_2_lower = $min_idx{4}{2};
498	my $b4_2_upper = $max_idx{4}{2};
499	my $b4_3_lower = $min_idx{4}{3};
500	my $b4_3_upper = $max_idx{4}{3};
501	my $b4_4_lower = $min_idx{4}{4};
502	my $b4_4_upper = $max_idx{4}{4};
503
504	###
505	### Find the maximum value in the whole table, to determine if we can
506	### use uint16 or if we need to use uint32.
507	###
508	my $max_val = 0;
509	foreach my $seg (@segments)
510	{
511		foreach my $val (values %{ $seg->{values} })
512		{
513			$max_val = $val if ($val > $max_val);
514		}
515	}
516
517	my $datatype = ($max_val <= 0xffff) ? "uint16" : "uint32";
518
519	# For formatting, determine how many values we can fit on a single
520	# line, and how wide each value needs to be to align nicely.
521	my $vals_per_line;
522	my $colwidth;
523
524	if ($max_val <= 0xffff)
525	{
526		$vals_per_line = 8;
527		$colwidth      = 4;
528	}
529	elsif ($max_val <= 0xffffff)
530	{
531		$vals_per_line = 4;
532		$colwidth      = 6;
533	}
534	else
535	{
536		$vals_per_line = 4;
537		$colwidth      = 8;
538	}
539
540	###
541	### Print the struct and array.
542	###
543	printf $out "static const $datatype ${tblname}_table[$tblsize];\n";
544	printf $out "\n";
545	printf $out "static const pg_mb_radix_tree $tblname =\n";
546	printf $out "{\n";
547	if ($datatype eq "uint16")
548	{
549		print $out "  ${tblname}_table,\n";
550		print $out "  NULL, /* 32-bit table not used */\n";
551	}
552	if ($datatype eq "uint32")
553	{
554		print $out "  NULL, /* 16-bit table not used */\n";
555		print $out "  ${tblname}_table,\n";
556	}
557	printf $out "\n";
558	printf $out "  0x%04x, /* offset of table for 1-byte inputs */\n",
559	  $b1root;
560	printf $out "  0x%02x, /* b1_lower */\n", $b1_lower;
561	printf $out "  0x%02x, /* b1_upper */\n", $b1_upper;
562	printf $out "\n";
563	printf $out "  0x%04x, /* offset of table for 2-byte inputs */\n",
564	  $b2root;
565	printf $out "  0x%02x, /* b2_1_lower */\n", $b2_1_lower;
566	printf $out "  0x%02x, /* b2_1_upper */\n", $b2_1_upper;
567	printf $out "  0x%02x, /* b2_2_lower */\n", $b2_2_lower;
568	printf $out "  0x%02x, /* b2_2_upper */\n", $b2_2_upper;
569	printf $out "\n";
570	printf $out "  0x%04x, /* offset of table for 3-byte inputs */\n",
571	  $b3root;
572	printf $out "  0x%02x, /* b3_1_lower */\n", $b3_1_lower;
573	printf $out "  0x%02x, /* b3_1_upper */\n", $b3_1_upper;
574	printf $out "  0x%02x, /* b3_2_lower */\n", $b3_2_lower;
575	printf $out "  0x%02x, /* b3_2_upper */\n", $b3_2_upper;
576	printf $out "  0x%02x, /* b3_3_lower */\n", $b3_3_lower;
577	printf $out "  0x%02x, /* b3_3_upper */\n", $b3_3_upper;
578	printf $out "\n";
579	printf $out "  0x%04x, /* offset of table for 3-byte inputs */\n",
580	  $b4root;
581	printf $out "  0x%02x, /* b4_1_lower */\n", $b4_1_lower;
582	printf $out "  0x%02x, /* b4_1_upper */\n", $b4_1_upper;
583	printf $out "  0x%02x, /* b4_2_lower */\n", $b4_2_lower;
584	printf $out "  0x%02x, /* b4_2_upper */\n", $b4_2_upper;
585	printf $out "  0x%02x, /* b4_3_lower */\n", $b4_3_lower;
586	printf $out "  0x%02x, /* b4_3_upper */\n", $b4_3_upper;
587	printf $out "  0x%02x, /* b4_4_lower */\n", $b4_4_lower;
588	printf $out "  0x%02x  /* b4_4_upper */\n", $b4_4_upper;
589	print $out "};\n";
590	print $out "\n";
591	print $out "static const $datatype ${tblname}_table[$tblsize] =\n";
592	print $out "{";
593	my $off = 0;
594
595	foreach my $seg (@segments)
596	{
597		printf $out "\n";
598		printf $out "  /*** %s - offset 0x%05x ***/\n", $seg->{header}, $off;
599		printf $out "\n";
600
601		for (my $i = $seg->{min_idx}; $i <= $seg->{max_idx};)
602		{
603
604			# Print the next line's worth of values.
605			# XXX pad to begin at a nice boundary
606			printf $out "  /* %02x */ ", $i;
607			for (my $j = 0;
608				$j < $vals_per_line && $i <= $seg->{max_idx}; $j++)
609			{
610				my $val = $seg->{values}->{$i};
611
612				printf $out " 0x%0*x", $colwidth, $val;
613				$off++;
614				if ($off != $tblsize)
615				{
616					print $out ",";
617				}
618				$i++;
619			}
620			print $out "\n";
621		}
622		if ($seg->{overlaid_trail_zeros})
623		{
624			printf $out
625			  "    /* $seg->{overlaid_trail_zeros} trailing zero values shared with next segment */\n";
626		}
627	}
628
629	# Sanity check.
630	if ($off != $tblsize) { die "table size didn't match!"; }
631
632	print $out "};\n";
633	return;
634}
635
636###
637sub build_segments_from_tree
638{
639	my ($header, $rootlabel, $depth, $map) = @_;
640
641	my @segments;
642
643	if (%{$map})
644	{
645		@segments =
646		  build_segments_recurse($header, $rootlabel, "", 1, $depth, $map);
647
648		# Sort the segments into "breadth-first" order. Not strictly required,
649		# but makes the maps nicer to read.
650		@segments =
651		  sort { $a->{level} cmp $b->{level} or $a->{path} cmp $b->{path} }
652		  @segments;
653	}
654
655	return @segments;
656}
657
658###
659sub build_segments_recurse
660{
661	my ($header, $label, $path, $level, $depth, $map) = @_;
662
663	my @segments;
664
665	if ($level == $depth)
666	{
667		push @segments,
668		  {
669			header => $header . ", leaf: ${path}xx",
670			label  => $label,
671			level  => $level,
672			depth  => $depth,
673			path   => $path,
674			values => $map
675		  };
676	}
677	else
678	{
679		my %children;
680
681		while (my ($i, $val) = each %$map)
682		{
683			my $childpath = $path . sprintf("%02x", $i);
684			my $childlabel = "$depth-level-$level-$childpath";
685
686			push @segments,
687			  build_segments_recurse($header, $childlabel, $childpath,
688				$level + 1, $depth, $val);
689			$children{$i} = $childlabel;
690		}
691
692		push @segments,
693		  {
694			header => $header . ", byte #$level: ${path}xx",
695			label  => $label,
696			level  => $level,
697			depth  => $depth,
698			path   => $path,
699			values => \%children
700		  };
701	}
702	return @segments;
703}
704
705#######################################################################
706# make_charmap - convert charset table to charmap hash
707#
708# make_charmap(\@charset, $direction)
709# charset     - ref to charset table : see print_conversion_tables
710# direction   - conversion direction
711#
712sub make_charmap
713{
714	my ($out, $charset, $direction, $verbose) = @_;
715
716	croak "unacceptable direction : $direction"
717	  if ($direction != TO_UNICODE && $direction != FROM_UNICODE);
718
719	# In verbose mode, print a large comment with the source and comment of
720	# each character
721	if ($verbose)
722	{
723		print $out "/*\n";
724		print $out "<src>  <dst>    <file>:<lineno> <comment>\n";
725	}
726
727	my %charmap;
728	foreach my $c (@$charset)
729	{
730
731		# combined characters are handled elsewhere
732		next if (defined $c->{ucs_second});
733
734		next if ($c->{direction} != $direction && $c->{direction} != BOTH);
735
736		my ($src, $dst) =
737		  $direction == TO_UNICODE
738		  ? ($c->{code}, ucs2utf($c->{ucs}))
739		  : (ucs2utf($c->{ucs}), $c->{code});
740
741		# check for duplicate source codes
742		if (defined $charmap{$src})
743		{
744			printf STDERR
745			  "Error: duplicate source code on %s:%d: 0x%04x => 0x%04x, 0x%04x\n",
746			  $c->{f}, $c->{l}, $src, $charmap{$src}, $dst;
747			exit;
748		}
749		$charmap{$src} = $dst;
750
751		if ($verbose)
752		{
753			printf $out "0x%04x 0x%04x %s:%d %s\n", $src, $dst, $c->{f},
754			  $c->{l}, $c->{comment};
755		}
756	}
757	if ($verbose)
758	{
759		print $out "*/\n\n";
760	}
761
762	return \%charmap;
763}
764
765#######################################################################
766# make_charmap_combined - convert charset table to charmap hash
767#     with checking duplicate source code
768#
769# make_charmap_combined(\@charset, $direction)
770# charset     - ref to charset table : see print_conversion_tables
771# direction   - conversion direction
772#
773sub make_charmap_combined
774{
775	my ($charset, $direction) = @_;
776
777	croak "unacceptable direction : $direction"
778	  if ($direction != TO_UNICODE && $direction != FROM_UNICODE);
779
780	my @combined;
781	foreach my $c (@$charset)
782	{
783		next if ($c->{direction} != $direction && $c->{direction} != BOTH);
784
785		if (defined $c->{ucs_second})
786		{
787			my $entry = {
788				utf8        => ucs2utf($c->{ucs}),
789				utf8_second => ucs2utf($c->{ucs_second}),
790				code        => $c->{code},
791				comment     => $c->{comment},
792				f           => $c->{f},
793				l           => $c->{l}
794			};
795			push @combined, $entry;
796		}
797	}
798
799	return \@combined;
800}
801
802#######################################################################
803# convert UCS-4 to UTF-8
804#
805sub ucs2utf
806{
807	my ($ucs) = @_;
808	my $utf;
809
810	if ($ucs <= 0x007f)
811	{
812		$utf = $ucs;
813	}
814	elsif ($ucs > 0x007f && $ucs <= 0x07ff)
815	{
816		$utf = (($ucs & 0x003f) | 0x80) | ((($ucs >> 6) | 0xc0) << 8);
817	}
818	elsif ($ucs > 0x07ff && $ucs <= 0xffff)
819	{
820		$utf =
821		  ((($ucs >> 12) | 0xe0) << 16) |
822		  (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80);
823	}
824	else
825	{
826		$utf =
827		  ((($ucs >> 18) | 0xf0) << 24) |
828		  (((($ucs & 0x3ffff) >> 12) | 0x80) << 16) |
829		  (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80);
830	}
831	return $utf;
832}
833
8341;
835