1#!/usr/bin/perl
2#
3# Generate a composition table, using Unicode data files as input
4#
5# Input: UnicodeData.txt and CompositionExclusions.txt
6# Output: unicode_norm_table.h
7#
8# Copyright (c) 2000-2017, PostgreSQL Global Development Group
9
10use strict;
11use warnings;
12
my $output_file = "unicode_norm_table.h";

my $FH;

# Collect the codepoints that must never be produced by re-composition.
# Only lines of CompositionExclusions.txt that begin with a run of hex
# digits contribute an entry; every other line is ignored.
my @composition_exclusion_codes = ();
open($FH, '<', "CompositionExclusions.txt")
  or die "Could not open CompositionExclusions.txt: $!.";
while (defined(my $exclusion_line = readline($FH)))
{
	next unless $exclusion_line =~ /^([[:xdigit:]]+)/;
	push @composition_exclusion_codes, $1;
}
close $FH;
29
# Load UnicodeData.txt.  Every row we keep is stored twice: appended to
# @characters (preserving file order) and indexed by its code value in
# %character_hash.  Both containers share the same hash reference.
my @characters     = ();
my %character_hash = ();
open($FH, '<', "UnicodeData.txt")
  or die "Could not open UnicodeData.txt: $!.";
while (defined(my $record = readline($FH)))
{
	# Rows are ';'-separated.  Pick out the three fields of interest:
	#   field 0 - Unicode code value
	#   field 3 - Canonical Combining Class
	#   field 5 - Character Decomposition Mapping
	my ($code, $class, $decomp) = (split(';', $record))[ 0, 3, 5 ];

	# Codepoints above U+10FFFF cannot be represented in 4 bytes in UTF-8,
	# and PostgreSQL doesn't support longer UTF-8 characters, so drop them.
	# (Pro forma only: the data file currently has no such entries.)
	next if hex($code) > 0x10FFFF;

	# To keep the table small, omit characters that have neither a
	# decomposition nor a non-zero combining class.
	next if $class eq '0' && $decomp eq '';

	my $entry = { code => $code, class => $class, decomp => $decomp };
	push(@characters, $entry);
	$character_hash{$code} = $entry;
}
close $FH;
64
# Number of entries that will be emitted into UnicodeDecompMain; it is
# interpolated into the array declaration at the end of the preamble.
my $num_characters = @characters;

# Create the output file and emit the fixed preamble: file header, the
# struct definition, the flag macros, and the opening of the main table.
open(my $OUTPUT, '>', $output_file)
  or die "Could not open output file $output_file: $!\n";

print {$OUTPUT} <<"END_OF_PREAMBLE";
/*-------------------------------------------------------------------------
 *
 * unicode_norm_table.h
 *	  Composition table used for Unicode normalization
 *
 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/common/unicode_norm_table.h
 *
 *-------------------------------------------------------------------------
 */

/*
 * File auto-generated by src/common/unicode/generate-unicode_norm_table.pl,
 * do not edit. There is deliberately not an #ifndef PG_UNICODE_NORM_TABLE_H
 * here.
 */
typedef struct
{
	uint32		codepoint;		/* Unicode codepoint */
	uint8		comb_class;		/* combining class of character */
	uint8		dec_size_flags; /* size and flags of decomposition code list */
	uint16		dec_index;		/* index into UnicodeDecomp_codepoints, or the
								 * decomposition itself if DECOMP_INLINE */
} pg_unicode_decomposition;

#define DECOMP_NO_COMPOSE	0x80	/* don't use for re-composition */
#define DECOMP_INLINE		0x40	/* decomposition is stored inline in dec_index */

#define DECOMPOSITION_SIZE(x) ((x)->dec_size_flags & 0x3F)
#define DECOMPOSITION_NO_COMPOSE(x) (((x)->dec_size_flags & DECOMP_NO_COMPOSE) != 0)
#define DECOMPOSITION_IS_INLINE(x) (((x)->dec_size_flags & DECOMP_INLINE) != 0)

/* Table of Unicode codepoints and their decompositions */
static const pg_unicode_decomposition UnicodeDecompMain[$num_characters] =
{
END_OF_PREAMBLE
110
# $decomp_index is the running offset into the UnicodeDecomp_codepoints
# array, and $decomp_string accumulates the text of that array's
# initializer.  Both are used after this loop, when the array is printed.
my $decomp_index  = 0;
my $decomp_string = "";

# Turn the exclusion list into a lookup table once, so that each character
# costs a single hash probe instead of a scan over the whole list.
my %is_composition_excluded =
  map { $_ => 1 } @composition_exclusion_codes;

# Emit one UnicodeDecompMain initializer per collected character.  Iterate
# by index so that the "is this the last entry?" test needs no look-ahead
# into the array before the loop starts.
foreach my $char_idx (0 .. $#characters)
{
	my $char   = $characters[$char_idx];
	my $code   = $char->{code};
	my $class  = $char->{class};
	my $decomp = $char->{decomp};

	# The character decomposition mapping field in UnicodeData.txt is a list
	# of unicode codepoints, separated by space. But it can be prefixed with
	# so-called compatibility formatting tag, like "<compat>", or "<font>".
	# The entries with compatibility formatting tags should not be used for
	# re-composing characters during normalization, so flag them in the table.
	# (The tag doesn't matter, only whether there is a tag or not)
	#
	# s/// returns the number of tags removed, which is non-zero exactly
	# when a tag was present.
	my $compat = ($decomp =~ s/<[^>]*>//g) ? 1 : 0;

	my @decomp_elts = split(" ", $decomp);

	# Number of codepoints in the decomposition, counted before the first
	# one is split off below.
	my $decomp_size = scalar(@decomp_elts);

	my $first_decomp = shift @decomp_elts;

	my $flags   = "";
	my $comment = "";

	# Only two-codepoint decompositions are examined for the no-compose
	# flag.  The first matching reason wins.
	if ($decomp_size == 2)
	{
		if ($compat)
		{
			$flags .= " | DECOMP_NO_COMPOSE";
			$comment = "compatibility mapping";
		}
		elsif ($character_hash{$first_decomp}
			&& $character_hash{$first_decomp}->{class} != 0)
		{
			$flags .= " | DECOMP_NO_COMPOSE";
			$comment = "non-starter decomposition";
		}
		elsif ($is_composition_excluded{$code})
		{
			$flags .= " | DECOMP_NO_COMPOSE";
			$comment = "in exclusion list";
		}
	}

	if ($decomp_size == 0)
	{
		print $OUTPUT "\t{0x$code, $class, 0$flags, 0}";
	}
	elsif ($decomp_size == 1 && length($first_decomp) <= 4)
	{

		# The decomposition consists of a single codepoint, and it fits
		# in a uint16 (at most four hex digits), so we can store it
		# "inline" in the main table.
		$flags .= " | DECOMP_INLINE";
		print $OUTPUT "\t{0x$code, $class, 1$flags, 0x$first_decomp}";
	}
	else
	{
		print $OUTPUT
		  "\t{0x$code, $class, $decomp_size$flags, $decomp_index}";

		# Save the decomposition into the dedicated area that will be
		# written afterwards: one line per character, labelled with its
		# starting index and listing all of its codepoints.
		$decomp_string .= ",\n" if ($decomp_string ne "");
		$decomp_string .= "\t /* $decomp_index */ "
		  . join(", ", map { "0x$_" } ($first_decomp, @decomp_elts));

		$decomp_index = $decomp_index + $decomp_size;
	}

	# Print a comma after all items except the last one.
	print $OUTPUT "," unless ($char_idx == $#characters);
	if ($comment ne "")
	{

		# If the line is wide already, indent the comment with one tab,
		# otherwise with two. This is to make the output match the way
		# pgindent would mangle it. (This is quite hacky. To do this
		# properly, we should actually track how long the line is so far,
		# but this works for now.)
		print $OUTPUT "\t" if ($decomp_index < 10);

		print $OUTPUT "\t/* $comment */";
	}
	print $OUTPUT "\n";
}
print $OUTPUT "\n};\n\n";
221
# Print the array of decomposed codes.  $decomp_index is by now the total
# number of codepoints accumulated in $decomp_string, so it doubles as the
# array size.
print $OUTPUT <<HEADER;
/* codepoints array  */
static const uint32 UnicodeDecomp_codepoints[$decomp_index] =
{
$decomp_string
};
HEADER

# Output is buffered, so a failed write (for example a full disk) may only
# be reported when the handle is closed.  Check it, rather than silently
# leaving a truncated header behind.
close $OUTPUT
  or die "Could not close output file $output_file: $!\n";
232