1<?php
2# Copyright (C) 2004 Brion Vibber <brion@pobox.com>
3# http://www.mediawiki.org/
4#
5# This program is free software; you can redistribute it and/or modify
6# it under the terms of the GNU General Public License as published by
7# the Free Software Foundation; either version 2 of the License, or
8# (at your option) any later version.
9#
10# This program is distributed in the hope that it will be useful,
11# but WITHOUT ANY WARRANTY; without even the implied warranty of
12# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13# GNU General Public License for more details.
14#
15# You should have received a copy of the GNU General Public License along
16# with this program; if not, write to the Free Software Foundation, Inc.,
17# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18# http://www.gnu.org/copyleft/gpl.html
19
20/**
 * This script generates UtfNormalData.inc and UtfNormalDataK.inc from the
 * Unicode Character Database and supplementary files.
23 *
24 * @addtogroup UtfNormal
25 * @access private
26 */
27
28/** */
29
# This generator is a maintenance tool: bail out unless invoked through the CLI SAPI.
if ('cli' != php_sapi_name()) {
    exit("Run me from the command line please.\n");
}
33
34require_once 'include/Unicode/UtfNormalUtil.php';
35
# Build the NFC quick-check table from DerivedNormalizationProps.txt.
# Only lines carrying the NFC_QC property with value M or N are used; each one
# names a single hexadecimal code point or a "first..last" range. Every code
# point covered is stored in $checkNFC, keyed by the string returned from
# codepointToUtf8(), with the property value (M or N) as the entry.
$in = fopen("DerivedNormalizationProps.txt", "rt");
if (!$in) {
    print "Can't open DerivedNormalizationProps.txt for reading.\n";
    print "If necessary, fetch this file from the internet:\n";
    print "http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt\n";
    exit(-1);
}
print "Initializing normalization quick check tables...\n";
$checkNFC = array();
while (false !== ($line = fgets($in))) {
    $matches = array();
    # The range separator is a literal "..", so both dots are escaped.
    if (preg_match('/^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*(NFC_QC)\s*;\s*([MN])/', $line, $matches)) {
        $first = hexdec($matches[1]);
        # Group 2 is empty when the line names a single code point rather than a range.
        $last = ($matches[2] !== '') ? hexdec($matches[2]) : $first;
        $value = $matches[4];
        for ($i = $first; $i <= $last; $i++) {
            $checkNFC[codepointToUtf8($i)] = $value;
        }
    }
}
fclose($in);
60
# Record every code point listed in CompositionExclusions.txt. A line counts
# when it begins with hexadecimal digits; anything else is skipped. The result
# is a set ($exclude) keyed by the string returned from codepointToUtf8().
$in = fopen("CompositionExclusions.txt", "rt");
if (!$in) {
    print "Can't open CompositionExclusions.txt for reading.\n";
    print "If necessary, fetch this file from the internet:\n";
    print "http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt\n";
    exit(-1);
}
$exclude = array();
for ($line = fgets($in); $line !== false; $line = fgets($in)) {
    if (!preg_match('/^([0-9A-F]+)/i', $line, $matches)) {
        continue;
    }
    $codepoint = $matches[1];
    $source = codepointToUtf8(hexdec($codepoint));
    $exclude[$source] = true;
}
fclose($in);
77
# Parse UnicodeData.txt, a semicolon-separated table with one character per line.
# Column 0 is the hexadecimal code point, column 3 the canonical combining
# class and column 5 the decomposition mapping. This pass fills:
#   $combiningClass      - non-zero combining classes, keyed by character
#   $canonicalDecomp     - canonical decompositions only
#   $compatibilityDecomp - canonical and compatibility decompositions
#   $canonicalComp       - reverse map (decomposition => character) for
#                          canonical mappings not listed in $exclude
$in = fopen("UnicodeData.txt", "rt");
if (!$in) {
    print "Can't open UnicodeData.txt for reading.\n";
    print "If necessary, fetch this file from the internet:\n";
    print "http://www.unicode.org/Public/UNIDATA/UnicodeData.txt\n";
    exit(-1);
}

$compatibilityDecomp = array();
$canonicalDecomp = array();
$canonicalComp = array();
$combiningClass = array();
$total = 0;
$compat = 0;
$canon = 0;

print "Reading character definitions...\n";
while (false !== ($line = fgets($in))) {
    # split() no longer exists as of PHP 7; the delimiter is a plain string,
    # so explode() is the direct replacement.
    $columns = explode(';', $line);
    if (count($columns) < 6) {
        # Blank or truncated line: it has no decomposition column to read.
        continue;
    }
    $codepoint = $columns[0];
    $canonicalCombiningClass = intval($columns[3]);
    $decompositionMapping = $columns[5];

    $source = codepointToUtf8(hexdec($codepoint));

    if ($canonicalCombiningClass != 0) {
        $combiningClass[$source] = $canonicalCombiningClass;
    }

    if ($decompositionMapping === '') {
        continue;
    }
    if (preg_match('/^<(.+)> (.*)$/', $decompositionMapping, $matches)) {
        # A leading <tag> marks a compatibility decomposition; keep only the
        # code point sequence that follows it.
        $canonical = false;
        $decompositionMapping = $matches[2];
        $compat++;
    } else {
        $canonical = true;
        $canon++;
    }
    $total++;
    $dest = hexSequenceToUtf8($decompositionMapping);

    # Canonical mappings are recorded in the compatibility table as well.
    $compatibilityDecomp[$source] = $dest;
    if ($canonical) {
        $canonicalDecomp[$source] = $dest;
        if (empty($exclude[$source])) {
            $canonicalComp[$dest] = $source;
        }
    }
}
fclose($in);
133
# A canonical mapping may itself contain characters that have mappings.
# Keep substituting over the whole table until a full pass changes nothing.
print "Recursively expanding canonical mappings...\n";
$pass = 1;
do {
    print "pass $pass\n";
    $changed = 0;
    foreach ($canonicalDecomp as $source => $dest) {
        # Each lead byte plus its continuation bytes is handed to
        # callbackCanonical(), which looks it up in $canonicalDecomp.
        $expanded = preg_replace_callback(
            '/([\xc0-\xff][\x80-\xbf]+)/',
            'callbackCanonical',
            $dest
        );
        if ($expanded !== $dest) {
            $canonicalDecomp[$source] = $expanded;
            $changed++;
        }
    }
    $pass++;
} while ($changed > 0);
154
# Same fixed-point expansion as for the canonical table, applied to the
# compatibility table: repeat until a full pass leaves every entry untouched.
print "Recursively expanding compatibility mappings...\n";
$pass = 1;
do {
    print "pass $pass\n";
    $changed = 0;
    foreach ($compatibilityDecomp as $source => $dest) {
        # Each lead byte plus its continuation bytes is handed to
        # callbackCompat(), which looks it up in $compatibilityDecomp.
        $expanded = preg_replace_callback(
            '/([\xc0-\xff][\x80-\xbf]+)/',
            'callbackCompat',
            $dest
        );
        if ($expanded !== $dest) {
            $compatibilityDecomp[$source] = $expanded;
            $changed++;
        }
    }
    $pass++;
} while ($changed > 0);
175
print "$total decomposition mappings ($canon canonical, $compat compatibility)\n";

# Write UtfNormalData.inc: a PHP file that rebuilds the combining class,
# canonical composition, canonical decomposition and NFC quick-check tables
# from their serialized form.
$out = fopen("UtfNormalData.inc", "wt");
if (!$out) {
    print "Can't create file UtfNormalData.inc\n";
    exit(-1);
}
# Each table is serialized, then passed through escapeSingleString() before
# being embedded inside a single-quoted literal in the generated file.
$serCombining = escapeSingleString(serialize($combiningClass));
$serComp = escapeSingleString(serialize($canonicalComp));
$serCanon = escapeSingleString(serialize($canonicalDecomp));
$serCheckNFC = escapeSingleString(serialize($checkNFC));
# The opening and closing PHP tags are split across concatenations so they are
# never spelled out whole inside this script.
$outdata = "<" . "?php
/**
 * This file was automatically generated -- do not edit!
 * Run UtfNormalGenerate.php to create this file again (make clean && make)
 */
/** */
global \$utfCombiningClass, \$utfCanonicalComp, \$utfCanonicalDecomp, \$utfCheckNFC;
\$utfCombiningClass = unserialize( '$serCombining' );
\$utfCanonicalComp = unserialize( '$serComp' );
\$utfCanonicalDecomp = unserialize( '$serCanon' );
\$utfCheckNFC = unserialize( '$serCheckNFC' );
?" . ">\n";
fputs($out, $outdata);
fclose($out);
print "Wrote out UtfNormalData.inc\n";
203
204
# Write UtfNormalDataK.inc: a PHP file that rebuilds the compatibility
# decomposition table from its serialized form. This is the final step, so
# the script ends here with status 0 on success.
$out = fopen("UtfNormalDataK.inc", "wt");
if (!$out) {
    print "Can't create file UtfNormalDataK.inc\n";
    exit(-1);
}
$serCompat = escapeSingleString(serialize($compatibilityDecomp));
# The opening and closing PHP tags are split across concatenations so they are
# never spelled out whole inside this script.
$outdata = "<" . "?php
/**
 * This file was automatically generated -- do not edit!
 * Run UtfNormalGenerate.php to create this file again (make clean && make)
 */
/** */
global \$utfCompatibilityDecomp;
\$utfCompatibilityDecomp = unserialize( '$serCompat' );
?" . ">\n";
fputs($out, $outdata);
fclose($out);
print "Wrote out UtfNormalDataK.inc\n";
exit(0);
225
226# ---------------
227
/**
 * preg_replace_callback() handler used while expanding canonical mappings.
 *
 * @param array $matches Match data; index 1 holds the captured character.
 * @return string The entry of the global $canonicalDecomp table for that
 *                character when one is set, otherwise the character unchanged.
 */
function callbackCanonical($matches)
{
    global $canonicalDecomp;
    $char = $matches[1];
    return isset($canonicalDecomp[$char]) ? $canonicalDecomp[$char] : $char;
}
236
/**
 * preg_replace_callback() handler used while expanding compatibility mappings.
 *
 * @param array $matches Match data; index 1 holds the captured character.
 * @return string The entry of the global $compatibilityDecomp table for that
 *                character when one is set, otherwise the character unchanged.
 */
function callbackCompat($matches)
{
    global $compatibilityDecomp;
    $char = $matches[1];
    return isset($compatibilityDecomp[$char]) ? $compatibilityDecomp[$char] : $char;
}
245