#!/usr/bin/perl
#
# Generate a composition table, using Unicode data files as input
#
# Input: UnicodeData.txt and CompositionExclusions.txt
# Output: unicode_norm_table.h
#
# Copyright (c) 2000-2017, PostgreSQL Global Development Group

use strict;
use warnings;

my $output_file = "unicode_norm_table.h";

# Read the set of codes that should be excluded from re-composition.
# Only lines starting with a hexadecimal codepoint are data lines; anything
# else (comments, blank lines) is ignored.  The codes are kept as hash keys
# so that the per-character membership test below is a constant-time lookup
# instead of a scan over the whole list.
my %composition_exclusion = ();
open(my $exclusions_fh, '<', "CompositionExclusions.txt")
  or die "Could not open CompositionExclusions.txt: $!.";
while (my $line = <$exclusions_fh>)
{
	if ($line =~ /^([[:xdigit:]]+)/)
	{
		$composition_exclusion{$1} = 1;
	}
}
close $exclusions_fh;

# Read entries from UnicodeData.txt into a list, and a hash table. We need
# three fields from each row: the codepoint, canonical combining class,
# and character decomposition mapping
my @characters     = ();
my %character_hash = ();
open(my $unicode_data_fh, '<', "UnicodeData.txt")
  or die "Could not open UnicodeData.txt: $!.";
while (my $line = <$unicode_data_fh>)
{

	# Split the line into its semicolon-separated fields, and pick the
	# ones we need:
	# - Unicode code value
	# - Canonical Combining Class
	# - Character Decomposition Mapping
	my @elts   = split(';', $line);
	my $code   = $elts[0];
	my $class  = $elts[3];
	my $decomp = $elts[5];

	# Skip codepoints above U+10FFFF. They cannot be represented in 4 bytes
	# in UTF-8, and PostgreSQL doesn't support UTF-8 characters longer than
	# 4 bytes. (This is just pro forma, as there aren't any such entries in
	# the data file, currently.)
	next if hex($code) > 0x10FFFF;

	# Skip characters with no decompositions and a class of 0, to reduce the
	# table size.
	next if $class eq '0' && $decomp eq '';

	my %char_entry = (code => $code, class => $class, decomp => $decomp);
	push(@characters, \%char_entry);
	$character_hash{$code} = \%char_entry;
}
close $unicode_data_fh;

# Without any entries we could neither size the table nor find its last
# element below, so fail with a clear message rather than an obscure
# dereference error.
die "No usable entries found in UnicodeData.txt\n" unless @characters;

my $num_characters = scalar @characters;

# Start writing out the output file
open my $OUTPUT, '>', $output_file
  or die "Could not open output file $output_file: $!\n";

print $OUTPUT <<HEADER;
/*-------------------------------------------------------------------------
 *
 * unicode_norm_table.h
 *	  Composition table used for Unicode normalization
 *
 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/common/unicode_norm_table.h
 *
 *-------------------------------------------------------------------------
 */

/*
 * File auto-generated by src/common/unicode/generate-unicode_norm_table.pl,
 * do not edit. There is deliberately not an #ifndef PG_UNICODE_NORM_TABLE_H
 * here.
 */
typedef struct
{
	uint32		codepoint;		/* Unicode codepoint */
	uint8		comb_class;		/* combining class of character */
	uint8		dec_size_flags; /* size and flags of decomposition code list */
	uint16		dec_index;		/* index into UnicodeDecomp_codepoints, or the
								 * decomposition itself if DECOMP_INLINE */
} pg_unicode_decomposition;

#define DECOMP_NO_COMPOSE	0x80	/* don't use for re-composition */
#define DECOMP_INLINE		0x40	/* decomposition is stored inline in dec_index */

#define DECOMPOSITION_SIZE(x) ((x)->dec_size_flags & 0x3F)
#define DECOMPOSITION_NO_COMPOSE(x) (((x)->dec_size_flags & DECOMP_NO_COMPOSE) != 0)
#define DECOMPOSITION_IS_INLINE(x) (((x)->dec_size_flags & DECOMP_INLINE) != 0)

/* Table of Unicode codepoints and their decompositions */
static const pg_unicode_decomposition UnicodeDecompMain[$num_characters] =
{
HEADER

# Running offset into the UnicodeDecomp_codepoints array, and the text of
# that array's initializer, accumulated while the main table is printed.
my $decomp_index  = 0;
my $decomp_string = "";

# Remembered so that we know which entry must not be followed by a comma.
my $last_code = $characters[-1]->{code};
foreach my $char (@characters)
{
	my $code   = $char->{code};
	my $class  = $char->{class};
	my $decomp = $char->{decomp};

	# The character decomposition mapping field in UnicodeData.txt is a list
	# of unicode codepoints, separated by space. But it can be prefixed with
	# so-called compatibility formatting tag, like "<compat>", or "<font>".
	# The entries with compatibility formatting tags should not be used for
	# re-composing characters during normalization, so flag them in the table.
	# (The tag doesn't matter, only whether there is a tag or not)
	my $compat = 0;
	if ($decomp =~ /\<.*\>/)
	{
		$compat = 1;

		# Strip each "<...>" tag individually, leaving only the codepoints.
		$decomp =~ s/<[^>]*>//g;
	}
	my @decomp_elts = split(" ", $decomp);

	# Number of codepoints in the decomposition.  Take it before shifting
	# off the first element, which is handled separately below.
	my $decomp_size = scalar(@decomp_elts);

	my $first_decomp = shift @decomp_elts;

	my $flags   = "";
	my $comment = "";

	# Only decompositions of exactly two codepoints are candidates for
	# re-composition, so only those need the DECOMP_NO_COMPOSE decision.
	if ($decomp_size == 2)
	{

		# Should this be used for recomposition?
		if ($compat)
		{
			$flags .= " | DECOMP_NO_COMPOSE";
			$comment = "compatibility mapping";
		}
		elsif ($character_hash{$first_decomp}
			&& $character_hash{$first_decomp}->{class} != 0)
		{
			$flags .= " | DECOMP_NO_COMPOSE";
			$comment = "non-starter decomposition";
		}
		elsif (exists $composition_exclusion{$code})
		{
			$flags .= " | DECOMP_NO_COMPOSE";
			$comment = "in exclusion list";
		}
	}

	if ($decomp_size == 0)
	{
		print $OUTPUT "\t{0x$code, $class, 0$flags, 0}";
	}
	elsif ($decomp_size == 1 && length($first_decomp) <= 4)
	{

		# The decomposition consists of a single codepoint, and it fits
		# in a uint16, so we can store it "inline" in the main table.
		$flags .= " | DECOMP_INLINE";
		print $OUTPUT "\t{0x$code, $class, 1$flags, 0x$first_decomp}";
	}
	else
	{
		print $OUTPUT
		  "\t{0x$code, $class, $decomp_size$flags, $decomp_index}";

		# Now save the decompositions into a dedicated area that will
		# be written afterwards.  First build the entry dedicated to
		# a sub-table with the code and decomposition.
		$decomp_string .= ",\n" if ($decomp_string ne "");

		$decomp_string .= "\t /* $decomp_index */ 0x$first_decomp";
		foreach my $elt (@decomp_elts)
		{
			$decomp_string .= ", 0x$elt";
		}

		$decomp_index = $decomp_index + $decomp_size;
	}

	# Print a comma after all items except the last one.
	print $OUTPUT "," unless ($code eq $last_code);
	if ($comment ne "")
	{

		# If the line is wide already, indent the comment with one tab,
		# otherwise with two.  This is to make the output match the way
		# pgindent would mangle it.  (This is quite hacky.  To do this
		# properly, we should actually track how long the line is so far,
		# but this works for now.)
		print $OUTPUT "\t" if ($decomp_index < 10);

		print $OUTPUT "\t/* $comment */";
	}
	print $OUTPUT "\n";
}
print $OUTPUT "\n};\n\n";

# Print the array of decomposed codes.
print $OUTPUT <<HEADER;
/* codepoints array */
static const uint32 UnicodeDecomp_codepoints[$decomp_index] =
{
$decomp_string
};
HEADER

# Buffered write errors only surface at close time, so check it; otherwise
# a truncated header file could be left behind unnoticed.
close $OUTPUT
  or die "Could not close output file $output_file: $!\n";