1#
2# Copyright (c) 2001-2018, PostgreSQL Global Development Group
3#
4# src/backend/utils/mb/Unicode/convutils.pm
5
6package convutils;
7
8use strict;
9
10use Carp;
11use Exporter 'import';
12
13our @EXPORT =
14  qw( NONE TO_UNICODE FROM_UNICODE BOTH read_source print_conversion_tables);
15
16# Constants used in the 'direction' field of the character maps
17use constant {
18	NONE         => 0,
19	TO_UNICODE   => 1,
20	FROM_UNICODE => 2,
21	BOTH         => 3
22};
23
24#######################################################################
25# read_source - common routine to read source file
26#
27# fname ; input file name
28#
29sub read_source
30{
31	my ($fname) = @_;
32	my @r;
33
34	open(my $in, '<', $fname) || die("cannot open $fname");
35
36	while (<$in>)
37	{
38		next if (/^#/);
39		chop;
40
41		next if (/^$/);    # Ignore empty lines
42
43		next if (/^0x([0-9A-F]+)\s+(#.*)$/);
44
45		# The Unicode source files have three columns
46		# 1: The "foreign" code (in hex)
47		# 2: Unicode code point (in hex)
48		# 3: Unicode name
49		if (!/^0x([0-9A-Fa-f]+)\s+0x([0-9A-Fa-f]+)\s+(#.*)$/)
50		{
51			print STDERR "READ ERROR at line $. in $fname: $_\n";
52			exit;
53		}
54		my $out = {
55			code      => hex($1),
56			ucs       => hex($2),
57			comment   => $4,
58			direction => BOTH,
59			f         => $fname,
60			l         => $.
61		};
62
63		# Ignore pure ASCII mappings. PostgreSQL character conversion code
64		# never even passes these to the conversion code.
65		next if ($out->{code} < 0x80 || $out->{ucs} < 0x80);
66
67		push(@r, $out);
68	}
69	close($in);
70
71	return \@r;
72}
73
74##################################################################
75# print_conversion_tables - output mapping tables
76#
77# print_conversion_tables($this_script, $csname, \%charset)
78#
79# this_script - the name of the *caller script* of this feature
80# csname      - character set name other than ucs
81# charset     - ref to character set array
82#
83# Input character set array format:
84#
85# Each element in the character set array is a hash. Each hash has the following fields:
86#   direction  - BOTH, TO_UNICODE, or FROM_UNICODE (or NONE, to ignore the entry altogether)
87#   ucs        - Unicode code point
88#   ucs_second - Second Unicode code point, if this is a "combined" character.
89#   code       - Byte sequence in the "other" character set, as an integer
90#   comment    - Text representation of the character
91#   f          - Source filename
92#   l          - Line number in source file
93#
94sub print_conversion_tables
95{
96	my ($this_script, $csname, $charset) = @_;
97
98	print_conversion_tables_direction($this_script, $csname, FROM_UNICODE,
99		$charset);
100	print_conversion_tables_direction($this_script, $csname, TO_UNICODE,
101		$charset);
102	return;
103}
104
105#############################################################################
106# INTERNAL ROUTINES
107
108#######################################################################
109# print_conversion_tables_direction - write the whole content of C source of radix tree
110#
111# print_conversion_tables_direction($this_script, $csname, $direction, \%charset, $tblwidth)
112#
113# this_script - the name of the *caller script* of this feature
114# csname      - character set name other than ucs
115# direction   - desired direction, TO_UNICODE or FROM_UNICODE
116# charset     - ref to character set array
117#
118sub print_conversion_tables_direction
119{
120	my ($this_script, $csname, $direction, $charset) = @_;
121
122	my $fname;
123	my $tblname;
124	if ($direction == TO_UNICODE)
125	{
126		$fname   = lc("${csname}_to_utf8.map");
127		$tblname = lc("${csname}_to_unicode_tree");
128
129		print "- Writing ${csname}=>UTF8 conversion table: $fname\n";
130	}
131	else
132	{
133		$fname   = lc("utf8_to_${csname}.map");
134		$tblname = lc("${csname}_from_unicode_tree");
135
136		print "- Writing UTF8=>${csname} conversion table: $fname\n";
137	}
138
139	open(my $out, '>', $fname) || die("cannot open $fname");
140
141	print $out "/* src/backend/utils/mb/Unicode/$fname */\n";
142	print $out "/* This file is generated by $this_script */\n\n";
143
144	# Collect regular, non-combined, mappings, and create the radix tree from them.
145	my $charmap = &make_charmap($out, $charset, $direction, 0);
146	print_radix_table($out, $tblname, $charmap);
147
148	# Collect combined characters, and create combined character table (if any)
149	my $charmap_combined = &make_charmap_combined($charset, $direction);
150
151	if (scalar @{$charmap_combined} > 0)
152	{
153		if ($direction == TO_UNICODE)
154		{
155			print_to_utf8_combined_map($out, $csname, $charmap_combined, 1);
156		}
157		else
158		{
159			print_from_utf8_combined_map($out, $csname, $charmap_combined, 1);
160		}
161	}
162
163	close($out);
164	return;
165}
166
167sub print_from_utf8_combined_map
168{
169	my ($out, $charset, $table, $verbose) = @_;
170
171	my $last_comment = "";
172
173	printf $out "\n/* Combined character map */\n";
174	printf $out
175	  "static const pg_utf_to_local_combined ULmap${charset}_combined[ %d ] = {",
176	  scalar(@$table);
177	my $first = 1;
178	foreach my $i (sort { $a->{utf8} <=> $b->{utf8} } @$table)
179	{
180		print($out ",") if (!$first);
181		$first = 0;
182		print $out "\t/* $last_comment */"
183		  if ($verbose && $last_comment ne "");
184
185		printf $out "\n  {0x%08x, 0x%08x, 0x%04x}",
186		  $i->{utf8}, $i->{utf8_second}, $i->{code};
187		if ($verbose >= 2)
188		{
189			$last_comment =
190			  sprintf("%s:%d %s", $i->{f}, $i->{l}, $i->{comment});
191		}
192		elsif ($verbose >= 1)
193		{
194			$last_comment = $i->{comment};
195		}
196	}
197	print $out "\t/* $last_comment */" if ($verbose && $last_comment ne "");
198	print $out "\n};\n";
199	return;
200}
201
202sub print_to_utf8_combined_map
203{
204	my ($out, $charset, $table, $verbose) = @_;
205
206	my $last_comment = "";
207
208	printf $out "\n/* Combined character map */\n";
209	printf $out
210	  "static const pg_local_to_utf_combined LUmap${charset}_combined[ %d ] = {",
211	  scalar(@$table);
212
213	my $first = 1;
214	foreach my $i (sort { $a->{code} <=> $b->{code} } @$table)
215	{
216		print($out ",") if (!$first);
217		$first = 0;
218		print $out "\t/* $last_comment */"
219		  if ($verbose && $last_comment ne "");
220
221		printf $out "\n  {0x%04x, 0x%08x, 0x%08x}",
222		  $i->{code}, $i->{utf8}, $i->{utf8_second};
223
224		if ($verbose >= 2)
225		{
226			$last_comment =
227			  sprintf("%s:%d %s", $i->{f}, $i->{l}, $i->{comment});
228		}
229		elsif ($verbose >= 1)
230		{
231			$last_comment = $i->{comment};
232		}
233	}
234	print $out "\t/* $last_comment */" if ($verbose && $last_comment ne "");
235	print $out "\n};\n";
236	return;
237}
238
239#######################################################################
240# print_radix_table(<output handle>, <table name>, <charmap hash ref>)
241#
242# Input: A hash, mapping an input character to an output character.
243#
244# Constructs a radix tree from the hash, and prints it out as a C-struct.
245#
246sub print_radix_table
247{
248	my ($out, $tblname, $c) = @_;
249
250	###
251	### Build radix trees in memory, for 1-, 2-, 3- and 4-byte inputs. Each
252	### radix tree is represented as a nested hash, each hash indexed by
253	### input byte
254	###
255	my %b1map;
256	my %b2map;
257	my %b3map;
258	my %b4map;
259	foreach my $in (keys %$c)
260	{
261		my $out = $c->{$in};
262
263		if ($in <= 0xff)
264		{
265			$b1map{$in} = $out;
266		}
267		elsif ($in <= 0xffff)
268		{
269			my $b1 = $in >> 8;
270			my $b2 = $in & 0xff;
271
272			$b2map{$b1}{$b2} = $out;
273		}
274		elsif ($in <= 0xffffff)
275		{
276			my $b1 = $in >> 16;
277			my $b2 = ($in >> 8) & 0xff;
278			my $b3 = $in & 0xff;
279
280			$b3map{$b1}{$b2}{$b3} = $out;
281		}
282		elsif ($in <= 0xffffffff)
283		{
284			my $b1 = $in >> 24;
285			my $b2 = ($in >> 16) & 0xff;
286			my $b3 = ($in >> 8) & 0xff;
287			my $b4 = $in & 0xff;
288
289			$b4map{$b1}{$b2}{$b3}{$b4} = $out;
290		}
291		else
292		{
293			die sprintf("up to 4 byte code is supported: %x", $in);
294		}
295	}
296
297	my @segments;
298
299	###
300	### Build a linear list of "segments", from the nested hashes.
301	###
302	### Each segment is a lookup table, keyed by the next byte in the input.
303	### The segments are written out physically to one big array in the final
304	### step, but logically, they form a radix tree. Or rather, four radix
305	### trees: one for 1-byte inputs, another for 2-byte inputs, 3-byte
306	### inputs, and 4-byte inputs.
307	###
308	### Each segment is represented by a hash with following fields:
309	###
310	### comment => <string to output as a comment>
311	### label => <label that can be used to refer to this segment from elsewhere>
312	### values => <a hash, keyed by byte, 0-0xff>
313	###
314	### Entries in 'values' can be integers (for leaf-level segments), or
315	### string labels, pointing to a segment with that label. Any missing
316	### values are treated as zeros. If 'values' hash is missing altogether,
317	### it's treated as all-zeros.
318	###
319	### Subsequent steps will enrich the segments with more fields.
320	###
321
322	# Add the segments for the radix trees themselves.
323	push @segments,
324	  build_segments_from_tree("Single byte table", "1-byte", 1, \%b1map);
325	push @segments,
326	  build_segments_from_tree("Two byte table", "2-byte", 2, \%b2map);
327	push @segments,
328	  build_segments_from_tree("Three byte table", "3-byte", 3, \%b3map);
329	push @segments,
330	  build_segments_from_tree("Four byte table", "4-byte", 4, \%b4map);
331
332	###
333	### Find min and max index used in each level of each tree.
334	###
335	### These are stored separately, and we can then leave out the unused
336	### parts of every segment. (When using the resulting tree, you must
337	### check each input byte against the min and max.)
338	###
339	my %min_idx;
340	my %max_idx;
341	foreach my $seg (@segments)
342	{
343		my $this_min = $min_idx{ $seg->{depth} }->{ $seg->{level} };
344		my $this_max = $max_idx{ $seg->{depth} }->{ $seg->{level} };
345
346		foreach my $i (keys %{ $seg->{values} })
347		{
348			$this_min = $i if (!defined $this_min || $i < $this_min);
349			$this_max = $i if (!defined $this_max || $i > $this_max);
350		}
351
352		$min_idx{ $seg->{depth} }{ $seg->{level} } = $this_min;
353		$max_idx{ $seg->{depth} }{ $seg->{level} } = $this_max;
354	}
355
356	# Copy the mins and max's back to every segment, for convenience.
357	foreach my $seg (@segments)
358	{
359		$seg->{min_idx} = $min_idx{ $seg->{depth} }{ $seg->{level} };
360		$seg->{max_idx} = $max_idx{ $seg->{depth} }{ $seg->{level} };
361	}
362
363	###
364	### Prepend a dummy all-zeros map to the beginning.
365	###
366	### A 0 is an invalid value anywhere in the table, and this allows us to
367	### point to 0 offset from any table, to get a 0 result.
368	###
369
370	# Find the max range between min and max indexes in any of the segments.
371	my $widest_range = 0;
372	foreach my $seg (@segments)
373	{
374		my $this_range = $seg->{max_idx} - $seg->{min_idx};
375		$widest_range = $this_range if ($this_range > $widest_range);
376	}
377
378	unshift @segments,
379	  {
380		header  => "Dummy map, for invalid values",
381		min_idx => 0,
382		max_idx => $widest_range
383	  };
384
385	###
386	### Eliminate overlapping zeros
387	###
388	### For each segment, if there are zero values at the end of, and there
389	### are also zero values at the beginning of the next segment, we can
390	### overlay the tail of this segment with the head of next segment, to
391	### save space.
392	###
393	### To achieve that, we subtract the 'max_idx' of each segment with the
394	### amount of zeros that can be overlaid.
395	###
396	for (my $j = 0; $j < $#segments - 1; $j++)
397	{
398		my $seg     = $segments[$j];
399		my $nextseg = $segments[ $j + 1 ];
400
401		# Count the number of zero values at the end of this segment.
402		my $this_trail_zeros = 0;
403		for (
404			my $i = $seg->{max_idx};
405			$i >= $seg->{min_idx} && !$seg->{values}->{$i};
406			$i--)
407		{
408			$this_trail_zeros++;
409		}
410
411		# Count the number of zeros at the beginning of next segment.
412		my $next_lead_zeros = 0;
413		for (
414			my $i = $nextseg->{min_idx};
415			$i <= $nextseg->{max_idx} && !$nextseg->{values}->{$i};
416			$i++)
417		{
418			$next_lead_zeros++;
419		}
420
421		# How many zeros in common?
422		my $overlaid_trail_zeros =
423		  ($this_trail_zeros > $next_lead_zeros)
424		  ? $next_lead_zeros
425		  : $this_trail_zeros;
426
427		$seg->{overlaid_trail_zeros} = $overlaid_trail_zeros;
428		$seg->{max_idx} = $seg->{max_idx} - $overlaid_trail_zeros;
429	}
430
431	###
432	### Replace label references with real offsets.
433	###
434	### So far, the non-leaf segments have referred to other segments by
435	### their labels. Replace them with numerical offsets from the beginning
436	### of the final array. You cannot move, add, or remove segments after
437	### this step, as that would invalidate the offsets calculated here!
438	###
439	my $flatoff = 0;
440	my %segmap;
441
442	# First pass: assign offsets to each segment, and build hash
443	# of label => offset.
444	foreach my $seg (@segments)
445	{
446		$seg->{offset} = $flatoff;
447		$segmap{ $seg->{label} } = $flatoff;
448		$flatoff += $seg->{max_idx} - $seg->{min_idx} + 1;
449	}
450	my $tblsize = $flatoff;
451
452	# Second pass: look up the offset of each label reference in the hash.
453	foreach my $seg (@segments)
454	{
455		while (my ($i, $val) = each %{ $seg->{values} })
456		{
457			if (!($val =~ /^[0-9,.E]+$/))
458			{
459				my $segoff = $segmap{$val};
460				if ($segoff)
461				{
462					$seg->{values}->{$i} = $segoff;
463				}
464				else
465				{
466					die "no segment with label $val";
467				}
468			}
469		}
470	}
471
472	# Also look up the positions of the roots in the table.
473	my $b1root = $segmap{"1-byte"};
474	my $b2root = $segmap{"2-byte"};
475	my $b3root = $segmap{"3-byte"};
476	my $b4root = $segmap{"4-byte"};
477
478	# And the lower-upper values of each level in each radix tree.
479	my $b1_lower = $min_idx{1}{1};
480	my $b1_upper = $max_idx{1}{1};
481
482	my $b2_1_lower = $min_idx{2}{1};
483	my $b2_1_upper = $max_idx{2}{1};
484	my $b2_2_lower = $min_idx{2}{2};
485	my $b2_2_upper = $max_idx{2}{2};
486
487	my $b3_1_lower = $min_idx{3}{1};
488	my $b3_1_upper = $max_idx{3}{1};
489	my $b3_2_lower = $min_idx{3}{2};
490	my $b3_2_upper = $max_idx{3}{2};
491	my $b3_3_lower = $min_idx{3}{3};
492	my $b3_3_upper = $max_idx{3}{3};
493
494	my $b4_1_lower = $min_idx{4}{1};
495	my $b4_1_upper = $max_idx{4}{1};
496	my $b4_2_lower = $min_idx{4}{2};
497	my $b4_2_upper = $max_idx{4}{2};
498	my $b4_3_lower = $min_idx{4}{3};
499	my $b4_3_upper = $max_idx{4}{3};
500	my $b4_4_lower = $min_idx{4}{4};
501	my $b4_4_upper = $max_idx{4}{4};
502
503	###
504	### Find the maximum value in the whole table, to determine if we can
505	### use uint16 or if we need to use uint32.
506	###
507	my $max_val = 0;
508	foreach my $seg (@segments)
509	{
510		foreach my $val (values %{ $seg->{values} })
511		{
512			$max_val = $val if ($val > $max_val);
513		}
514	}
515
516	my $datatype = ($max_val <= 0xffff) ? "uint16" : "uint32";
517
518	# For formatting, determine how many values we can fit on a single
519	# line, and how wide each value needs to be to align nicely.
520	my $vals_per_line;
521	my $colwidth;
522
523	if ($max_val <= 0xffff)
524	{
525		$vals_per_line = 8;
526		$colwidth      = 4;
527	}
528	elsif ($max_val <= 0xffffff)
529	{
530		$vals_per_line = 4;
531		$colwidth      = 6;
532	}
533	else
534	{
535		$vals_per_line = 4;
536		$colwidth      = 8;
537	}
538
539	###
540	### Print the struct and array.
541	###
542	printf $out "static const $datatype ${tblname}_table[$tblsize];\n";
543	printf $out "\n";
544	printf $out "static const pg_mb_radix_tree $tblname =\n";
545	printf $out "{\n";
546	if ($datatype eq "uint16")
547	{
548		print $out "  ${tblname}_table,\n";
549		print $out "  NULL, /* 32-bit table not used */\n";
550	}
551	if ($datatype eq "uint32")
552	{
553		print $out "  NULL, /* 16-bit table not used */\n";
554		print $out "  ${tblname}_table,\n";
555	}
556	printf $out "\n";
557	printf $out "  0x%04x, /* offset of table for 1-byte inputs */\n",
558	  $b1root;
559	printf $out "  0x%02x, /* b1_lower */\n", $b1_lower;
560	printf $out "  0x%02x, /* b1_upper */\n", $b1_upper;
561	printf $out "\n";
562	printf $out "  0x%04x, /* offset of table for 2-byte inputs */\n",
563	  $b2root;
564	printf $out "  0x%02x, /* b2_1_lower */\n", $b2_1_lower;
565	printf $out "  0x%02x, /* b2_1_upper */\n", $b2_1_upper;
566	printf $out "  0x%02x, /* b2_2_lower */\n", $b2_2_lower;
567	printf $out "  0x%02x, /* b2_2_upper */\n", $b2_2_upper;
568	printf $out "\n";
569	printf $out "  0x%04x, /* offset of table for 3-byte inputs */\n",
570	  $b3root;
571	printf $out "  0x%02x, /* b3_1_lower */\n", $b3_1_lower;
572	printf $out "  0x%02x, /* b3_1_upper */\n", $b3_1_upper;
573	printf $out "  0x%02x, /* b3_2_lower */\n", $b3_2_lower;
574	printf $out "  0x%02x, /* b3_2_upper */\n", $b3_2_upper;
575	printf $out "  0x%02x, /* b3_3_lower */\n", $b3_3_lower;
576	printf $out "  0x%02x, /* b3_3_upper */\n", $b3_3_upper;
577	printf $out "\n";
578	printf $out "  0x%04x, /* offset of table for 3-byte inputs */\n",
579	  $b4root;
580	printf $out "  0x%02x, /* b4_1_lower */\n", $b4_1_lower;
581	printf $out "  0x%02x, /* b4_1_upper */\n", $b4_1_upper;
582	printf $out "  0x%02x, /* b4_2_lower */\n", $b4_2_lower;
583	printf $out "  0x%02x, /* b4_2_upper */\n", $b4_2_upper;
584	printf $out "  0x%02x, /* b4_3_lower */\n", $b4_3_lower;
585	printf $out "  0x%02x, /* b4_3_upper */\n", $b4_3_upper;
586	printf $out "  0x%02x, /* b4_4_lower */\n", $b4_4_lower;
587	printf $out "  0x%02x  /* b4_4_upper */\n", $b4_4_upper;
588	print $out "};\n";
589	print $out "\n";
590	print $out "static const $datatype ${tblname}_table[$tblsize] =\n";
591	print $out "{";
592	my $off = 0;
593
594	foreach my $seg (@segments)
595	{
596		printf $out "\n";
597		printf $out "  /*** %s - offset 0x%05x ***/\n", $seg->{header}, $off;
598		printf $out "\n";
599
600		for (my $i = $seg->{min_idx}; $i <= $seg->{max_idx};)
601		{
602
603			# Print the next line's worth of values.
604			# XXX pad to begin at a nice boundary
605			printf $out "  /* %02x */ ", $i;
606			for (my $j = 0;
607				$j < $vals_per_line && $i <= $seg->{max_idx}; $j++)
608			{
609				my $val = $seg->{values}->{$i};
610
611				printf $out " 0x%0*x", $colwidth, $val;
612				$off++;
613				if ($off != $tblsize)
614				{
615					print $out ",";
616				}
617				$i++;
618			}
619			print $out "\n";
620		}
621		if ($seg->{overlaid_trail_zeros})
622		{
623			printf $out
624			  "    /* $seg->{overlaid_trail_zeros} trailing zero values shared with next segment */\n";
625		}
626	}
627
628	# Sanity check.
629	if ($off != $tblsize) { die "table size didn't match!"; }
630
631	print $out "};\n";
632	return;
633}
634
635###
636sub build_segments_from_tree
637{
638	my ($header, $rootlabel, $depth, $map) = @_;
639
640	my @segments;
641
642	if (%{$map})
643	{
644		@segments =
645		  build_segments_recurse($header, $rootlabel, "", 1, $depth, $map);
646
647		# Sort the segments into "breadth-first" order. Not strictly required,
648		# but makes the maps nicer to read.
649		@segments =
650		  sort { $a->{level} cmp $b->{level} or $a->{path} cmp $b->{path} }
651		  @segments;
652	}
653
654	return @segments;
655}
656
657###
658sub build_segments_recurse
659{
660	my ($header, $label, $path, $level, $depth, $map) = @_;
661
662	my @segments;
663
664	if ($level == $depth)
665	{
666		push @segments,
667		  {
668			header => $header . ", leaf: ${path}xx",
669			label  => $label,
670			level  => $level,
671			depth  => $depth,
672			path   => $path,
673			values => $map
674		  };
675	}
676	else
677	{
678		my %children;
679
680		while (my ($i, $val) = each %$map)
681		{
682			my $childpath = $path . sprintf("%02x", $i);
683			my $childlabel = "$depth-level-$level-$childpath";
684
685			push @segments,
686			  build_segments_recurse($header, $childlabel, $childpath,
687				$level + 1, $depth, $val);
688			$children{$i} = $childlabel;
689		}
690
691		push @segments,
692		  {
693			header => $header . ", byte #$level: ${path}xx",
694			label  => $label,
695			level  => $level,
696			depth  => $depth,
697			path   => $path,
698			values => \%children
699		  };
700	}
701	return @segments;
702}
703
704#######################################################################
705# make_charmap - convert charset table to charmap hash
706#
707# make_charmap(\@charset, $direction)
708# charset     - ref to charset table : see print_conversion_tables
709# direction   - conversion direction
710#
711sub make_charmap
712{
713	my ($out, $charset, $direction, $verbose) = @_;
714
715	croak "unacceptable direction : $direction"
716	  if ($direction != TO_UNICODE && $direction != FROM_UNICODE);
717
718	# In verbose mode, print a large comment with the source and comment of
719	# each character
720	if ($verbose)
721	{
722		print $out "/*\n";
723		print $out "<src>  <dst>    <file>:<lineno> <comment>\n";
724	}
725
726	my %charmap;
727	foreach my $c (@$charset)
728	{
729
730		# combined characters are handled elsewhere
731		next if (defined $c->{ucs_second});
732
733		next if ($c->{direction} != $direction && $c->{direction} != BOTH);
734
735		my ($src, $dst) =
736		  $direction == TO_UNICODE
737		  ? ($c->{code}, ucs2utf($c->{ucs}))
738		  : (ucs2utf($c->{ucs}), $c->{code});
739
740		# check for duplicate source codes
741		if (defined $charmap{$src})
742		{
743			printf STDERR
744			  "Error: duplicate source code on %s:%d: 0x%04x => 0x%04x, 0x%04x\n",
745			  $c->{f}, $c->{l}, $src, $charmap{$src}, $dst;
746			exit;
747		}
748		$charmap{$src} = $dst;
749
750		if ($verbose)
751		{
752			printf $out "0x%04x 0x%04x %s:%d %s\n", $src, $dst, $c->{f},
753			  $c->{l}, $c->{comment};
754		}
755	}
756	if ($verbose)
757	{
758		print $out "*/\n\n";
759	}
760
761	return \%charmap;
762}
763
764#######################################################################
765# make_charmap_combined - convert charset table to charmap hash
766#     with checking duplicate source code
767#
768# make_charmap_combined(\@charset, $direction)
769# charset     - ref to charset table : see print_conversion_tables
770# direction   - conversion direction
771#
772sub make_charmap_combined
773{
774	my ($charset, $direction) = @_;
775
776	croak "unacceptable direction : $direction"
777	  if ($direction != TO_UNICODE && $direction != FROM_UNICODE);
778
779	my @combined;
780	foreach my $c (@$charset)
781	{
782		next if ($c->{direction} != $direction && $c->{direction} != BOTH);
783
784		if (defined $c->{ucs_second})
785		{
786			my $entry = {
787				utf8        => ucs2utf($c->{ucs}),
788				utf8_second => ucs2utf($c->{ucs_second}),
789				code        => $c->{code},
790				comment     => $c->{comment},
791				f           => $c->{f},
792				l           => $c->{l}
793			};
794			push @combined, $entry;
795		}
796	}
797
798	return \@combined;
799}
800
801#######################################################################
802# convert UCS-4 to UTF-8
803#
804sub ucs2utf
805{
806	my ($ucs) = @_;
807	my $utf;
808
809	if ($ucs <= 0x007f)
810	{
811		$utf = $ucs;
812	}
813	elsif ($ucs > 0x007f && $ucs <= 0x07ff)
814	{
815		$utf = (($ucs & 0x003f) | 0x80) | ((($ucs >> 6) | 0xc0) << 8);
816	}
817	elsif ($ucs > 0x07ff && $ucs <= 0xffff)
818	{
819		$utf =
820		  ((($ucs >> 12) | 0xe0) << 16) |
821		  (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80);
822	}
823	else
824	{
825		$utf =
826		  ((($ucs >> 18) | 0xf0) << 24) |
827		  (((($ucs & 0x3ffff) >> 12) | 0x80) << 16) |
828		  (((($ucs & 0x0fc0) >> 6) | 0x80) << 8) | (($ucs & 0x003f) | 0x80);
829	}
830	return $utf;
831}
832
8331;
834