<?php
# Copyright (C) 2004 Brion Vibber <brion@pobox.com>
# http://www.mediawiki.org/
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
# http://www.gnu.org/copyleft/gpl.html

/**
 * This script generates UtfNormalData.inc and UtfNormalDataK.inc from the
 * Unicode Character Database and supplementary files.
 *
 * Input files, opened by relative path:
 *  - DerivedNormalizationProps.txt (NFC_QC property values)
 *  - CompositionExclusions.txt (code points never used as a composition result)
 *  - UnicodeData.txt (combining classes and decomposition mappings)
 *
 * The helpers codepointToUtf8(), hexSequenceToUtf8() and escapeSingleString()
 * are not defined in this file; presumably they come from UtfNormalUtil.php.
 *
 * @addtogroup UtfNormal
 * @access private
 */

/** */

if (php_sapi_name() != 'cli') {
    die("Run me from the command line please.\n");
}

require_once 'include/Unicode/UtfNormalUtil.php';

$in = fopen("DerivedNormalizationProps.txt", "rt");
if (!$in) {
    print "Can't open DerivedNormalizationProps.txt for reading.\n";
    print "If necessary, fetch this file from the internet:\n";
    # Point at the file that is actually missing, not CompositionExclusions.txt.
    print "http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt\n";
    exit(-1);
}
print "Initializing normalization quick check tables...\n";
$checkNFC = array();
while (false !== ($line = fgets($in))) {
    $matches = array();
    # Matches "XXXX ; NFC_QC; V" and "XXXX..YYYY ; NFC_QC; V" with V being M or N.
    # The range separator is a literal "..", so the dots are escaped.
    if (preg_match('/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*(NFC_QC)\s*;\s*([MN])/', $line, $matches)) {
        $first = $matches[1];
        # Group 2 is empty when the line names a single code point.
        $last = $matches[2] ? $matches[2] : $first;
        $value = $matches[4];

        $lastCode = hexdec($last);
        for ($i = hexdec($first); $i <= $lastCode; $i++) {
            $char = codepointToUtf8($i);
            $checkNFC[$char] = $value;
        }
    }
}
fclose($in);

$in = fopen("CompositionExclusions.txt", "rt");
if (!$in) {
    print "Can't open CompositionExclusions.txt for reading.\n";
    print "If necessary, fetch this file from the internet:\n";
    print "http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt\n";
    exit(-1);
}
$exclude = array();
while (false !== ($line = fgets($in))) {
    # Only lines that start with a hex code point are entries; anything else
    # (comments, blank lines) does not match and is skipped.
    if (preg_match('/^([0-9A-F]+)/i', $line, $matches)) {
        $codepoint = $matches[1];
        $source = codepointToUtf8(hexdec($codepoint));
        $exclude[$source] = true;
    }
}
fclose($in);

$in = fopen("UnicodeData.txt", "rt");
if (!$in) {
    print "Can't open UnicodeData.txt for reading.\n";
    print "If necessary, fetch this file from the internet:\n";
    print "http://www.unicode.org/Public/UNIDATA/UnicodeData.txt\n";
    exit(-1);
}

$compatibilityDecomp = array();
$canonicalDecomp = array();
$canonicalComp = array();
$combiningClass = array();
$total = 0;
$compat = 0;
$canon = 0;

print "Reading character definitions...\n";
while (false !== ($line = fgets($in))) {
    # explode() replaces split(), which was removed in PHP 7.0; for a literal
    # ';' separator the two produce the same fields.
    $columns = explode(';', $line);
    if (count($columns) < 6) {
        # Blank or truncated line: fields 3 and 5 are not available, so there
        # is nothing to record for it.
        continue;
    }
    $codepoint = $columns[0];
    $canonicalCombiningClass = intval($columns[3]);
    $decompositionMapping = $columns[5];

    $source = codepointToUtf8(hexdec($codepoint));

    # Only non-zero combining classes are stored.
    if ($canonicalCombiningClass !== 0) {
        $combiningClass[$source] = $canonicalCombiningClass;
    }

    if ($decompositionMapping === '') {
        continue;
    }
    if (preg_match('/^<(.+)> (.*)$/', $decompositionMapping, $matches)) {
        # Compatibility decomposition: drop the leading "<tag> " and keep the
        # code point sequence.
        $canonical = false;
        $decompositionMapping = $matches[2];
        $compat++;
    } else {
        $canonical = true;
        $canon++;
    }
    $total++;
    $dest = hexSequenceToUtf8($decompositionMapping);

    # Every mapping goes into the compatibility table; canonical ones are also
    # recorded in the canonical table.
    $compatibilityDecomp[$source] = $dest;
    if ($canonical) {
        $canonicalDecomp[$source] = $dest;
        # The reverse (composition) entry is skipped for excluded code points.
        if (empty($exclude[$source])) {
            $canonicalComp[$dest] = $source;
        }
    }
}
fclose($in);

print "Recursively expanding canonical mappings...\n";
# Each pass substitutes, inside every mapping, any multi-byte UTF-8 sequence
# that has a mapping of its own. Passes repeat until one changes nothing.
$pass = 1;
do {
    print "pass $pass\n";
    $changed = 0;
    foreach ($canonicalDecomp as $source => $dest) {
        $newDest = preg_replace_callback(
            '/([\xc0-\xff][\x80-\xbf]+)/',
            'callbackCanonical',
            $dest
        );
        if ($newDest === $dest) {
            continue;
        }
        $changed++;
        $canonicalDecomp[$source] = $newDest;
    }
    $pass++;
} while ($changed > 0);

print "Recursively expanding compatibility mappings...\n";
# Same fixed-point expansion as above, against the compatibility table.
$pass = 1;
do {
    print "pass $pass\n";
    $changed = 0;
    foreach ($compatibilityDecomp as $source => $dest) {
        $newDest = preg_replace_callback(
            '/([\xc0-\xff][\x80-\xbf]+)/',
            'callbackCompat',
            $dest
        );
        if ($newDest === $dest) {
            continue;
        }
        $changed++;
        $compatibilityDecomp[$source] = $newDest;
    }
    $pass++;
} while ($changed > 0);

print "$total decomposition mappings ($canon canonical, $compat compatibility)\n";

$out = fopen("UtfNormalData.inc", "wt");
if ($out) {
    $serCombining = escapeSingleString(serialize($combiningClass));
    $serComp = escapeSingleString(serialize($canonicalComp));
    $serCanon = escapeSingleString(serialize($canonicalDecomp));
    $serCheckNFC = escapeSingleString(serialize($checkNFC));
    # The open and close tags are split across a concatenation so that they do
    # not appear literally in this source file.
    $outdata = "<" . "?php
/**
 * This file was automatically generated -- do not edit!
 * Run UtfNormalGenerate.php to create this file again (make clean && make)
 */
/** */
global \$utfCombiningClass, \$utfCanonicalComp, \$utfCanonicalDecomp, \$utfCheckNFC;
\$utfCombiningClass = unserialize( '$serCombining' );
\$utfCanonicalComp = unserialize( '$serComp' );
\$utfCanonicalDecomp = unserialize( '$serCanon' );
\$utfCheckNFC = unserialize( '$serCheckNFC' );
?" . ">\n";
    fputs($out, $outdata);
    fclose($out);
    print "Wrote out UtfNormalData.inc\n";
} else {
    print "Can't create file UtfNormalData.inc\n";
    exit(-1);
}


$out = fopen("UtfNormalDataK.inc", "wt");
if ($out) {
    $serCompat = escapeSingleString(serialize($compatibilityDecomp));
    $outdata = "<" . "?php
/**
 * This file was automatically generated -- do not edit!
 * Run UtfNormalGenerate.php to create this file again (make clean && make)
 */
/** */
global \$utfCompatibilityDecomp;
\$utfCompatibilityDecomp = unserialize( '$serCompat' );
?" . ">\n";
    fputs($out, $outdata);
    fclose($out);
    print "Wrote out UtfNormalDataK.inc\n";
    exit(0);
} else {
    print "Can't create file UtfNormalDataK.inc\n";
    exit(-1);
}

# ---------------

/**
 * preg_replace_callback() handler for the canonical expansion passes.
 *
 * @param array $matches Regex match; index 1 holds the matched byte sequence.
 * @return string The entry of the global $canonicalDecomp table for that
 *                sequence, or the sequence itself when there is no entry.
 */
function callbackCanonical($matches)
{
    global $canonicalDecomp;
    if (isset($canonicalDecomp[$matches[1]])) {
        return $canonicalDecomp[$matches[1]];
    }
    return $matches[1];
}

/**
 * preg_replace_callback() handler for the compatibility expansion passes.
 *
 * @param array $matches Regex match; index 1 holds the matched byte sequence.
 * @return string The entry of the global $compatibilityDecomp table for that
 *                sequence, or the sequence itself when there is no entry.
 */
function callbackCompat($matches)
{
    global $compatibilityDecomp;
    if (isset($compatibilityDecomp[$matches[1]])) {
        return $compatibilityDecomp[$matches[1]];
    }
    return $matches[1];
}