1# uniParse.tcl --
2#
3#	This program parses the UnicodeData file and generates the
4#	corresponding source file with compressed character
5#	data tables.  The input to this program should be the latest
6#	UnicodeData file from:
7#	    ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
8#
9# Copyright (c) 1998-1999 by Scriptics Corporation.
10# All rights reserved.
11
12
13namespace eval uni {
14    set shift 8;		# number of bits of data within a page
15				# This value can be adjusted to find the
16				# best split to minimize table size
17
18    variable pMap;		# map from page to page index, each entry is
19				# an index into the pages table, indexed by
20				# page number
21    variable pages;		# map from page index to page info, each
22				# entry is a list of indices into the groups
23				# table, the list is indexed by the offset
24    variable groups;		# list of character info values, indexed by
25				# group number, initialized with the
26				# unassigned character group
27
28    variable categories {
29	Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp
30	Cc Cf Co Cs Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So
31    };				# Ordered list of character categories, must
32				# match the enumeration in the header file.
33}
34
35proc uni::getValue {items index} {
36    variable categories
37
38    # Extract character info
39
40    set category [lindex $items 2]
41    if {[scan [lindex $items 12] %x toupper] == 1} {
42	set toupper [expr {$index - $toupper}]
43    } else {
44	set toupper 0
45    }
46    if {[scan [lindex $items 13] %x tolower] == 1} {
47	set tolower [expr {$tolower - $index}]
48    } else {
49	set tolower 0
50    }
51    if {[scan [lindex $items 14] %x totitle] == 1} {
52	set totitle [expr {$index - $totitle}]
53    } elseif {$tolower} {
54	set totitle 0
55    } else {
56	set totitle $toupper
57    }
58
59    set categoryIndex [lsearch -exact $categories $category]
60    if {$categoryIndex < 0} {
61	puts "Unexpected character category: $index($category)"
62	set categoryIndex 0
63    }
64
65    return [list $categoryIndex $toupper $tolower $totitle]
66}
67
68proc uni::getGroup {value} {
69    variable groups
70
71    set gIndex [lsearch -exact $groups $value]
72    if {$gIndex == -1} {
73	set gIndex [llength $groups]
74	lappend groups $value
75    }
76    return $gIndex
77}
78
79proc uni::addPage {info} {
80    variable pMap
81    variable pages
82
83    set pIndex [lsearch -exact $pages $info]
84    if {$pIndex == -1} {
85	set pIndex [llength $pages]
86	lappend pages $info
87    }
88    lappend pMap $pIndex
89    return
90}
91
92proc uni::buildTables {data} {
93    variable shift
94
95    variable pMap {}
96    variable pages {}
97    variable groups {{0 0 0 0}}
98    variable next 0
99    set info {}			;# temporary page info
100
101    set mask [expr {(1 << $shift) - 1}]
102
103    foreach line [split $data \n] {
104	if {$line eq ""} {
105	    if {!($next & $mask)} {
106		# next character is already on page boundary
107		continue
108	    }
109	    # fill remaining page
110	    set line [format %X [expr {($next-1)|$mask}]]
111	    append line ";;Cn;0;ON;;;;;N;;;;;\n"
112	}
113
114	set items [split $line \;]
115
116	scan [lindex $items 0] %x index
117	if {$index > 0x2ffff} then {
118	    # Ignore non-BMP characters, as long as Tcl doesn't support them
119	    #continue
120	}
121	set index [format %d $index]
122
123	set gIndex [getGroup [getValue $items $index]]
124
125	# Since the input table omits unassigned characters, these will
126	# show up as gaps in the index sequence.  There are a few special cases
127	# where the gaps correspond to a uniform block of assigned characters.
128	# These are indicated as such in the character name.
129
130	# Enter all unassigned characters up to the current character.
131	if {($index > $next) \
132		&& ![regexp "Last>$" [lindex $items 1]]} {
133	    for {} {$next < $index} {incr next} {
134		lappend info 0
135		if {($next & $mask) == $mask} {
136		    addPage $info
137		    set info {}
138		}
139	    }
140	}
141
142	# Enter all assigned characters up to the current character
143	for {set i $next} {$i <= $index} {incr i} {
144	    # Add the group index to the info for the current page
145	    lappend info $gIndex
146
147	    # If this is the last entry in the page, add the page
148	    if {($i & $mask) == $mask} {
149		addPage $info
150		set info {}
151	    }
152	}
153	set next [expr {$index + 1}]
154    }
155    return
156}
157
158proc uni::main {} {
159    global argc argv0 argv
160    variable pMap
161    variable pages
162    variable groups
163    variable shift
164    variable next
165    variable max_delta
166
167    if {$argc != 3} {
168	puts stderr "\nusage: $argv0 <datafile> <version> <outfile>\n"
169	exit 1
170    }
171
172    set f [open [lindex $argv 0] r]
173    set data [read $f]
174    close $f
175
176    buildTables $data
177    puts "X = [llength $pMap]  Y= [llength $pages]  A= [llength $groups]"
178    set size [expr {[llength $pMap] + ([llength $pages]<<$shift)}]
179    puts "shift = $shift, space = $size"
180
181    set f [open [lindex $argv 2] w]
182    fconfigure $f -translation lf
183    puts $f "/*
184 * [lindex $argv 2] --
185 *
186 *	Declarations of Unicode [lindex $argv 1] character information tables.  This
187 *	file is automatically generated by a modified version of the
188 *	tools/uniParse.tcl script from the Tcl sources.
189 *
190 *	Do not modify this file by hand!
191 *
192 * Copyright (c) 1998 by Scriptics Corporation.
193 * All rights reserved.
194 */
195
196#include <config.h>
197
198#include <xapian/unicode.h>
199
200/*
201 * A 16-bit Unicode character is split into two parts in order to index
202 * into the following tables.  The lower OFFSET_BITS comprise an offset
203 * into a page of characters.  The upper bits comprise the page number.
204 */
205
206#define OFFSET_BITS $shift
207
208/*
209 * The pageMap is indexed by page number and returns an alternate page number
210 * that identifies a unique page of characters.  Many Unicode characters map
211 * to the same alternate page number.
212 */
213
214static const unsigned char pageMap\[\] = {"
215    set line "    "
216    set last [expr {[llength $pMap] - 1}]
217    for {set i 0} {$i <= $last} {incr i} {
218#	if {$i == [expr {0x10000 >> $shift}]} {
219#	    set line [string trimright $line " \t,"]
220#	    puts $f $line
221#	    set lastpage [expr {[lindex $line end] >> $shift}]
222#	    puts stdout "lastpage: $lastpage"
223#	    puts $f "#if TCL_UTF_MAX > 3"
224#	    set line "    ,"
225#	}
226	append line [lindex $pMap $i]
227	if {$i != $last} {
228	    append line ", "
229	}
230	if {[string length $line] > 70} {
231	    puts $f [string trimright $line]
232	    set line "    "
233	}
234    }
235    puts $f $line
236#    puts $f "#endif /* TCL_UTF_MAX > 3 */"
237    puts $f "};
238
239/*
240 * The groupMap is indexed by combining the alternate page number with
241 * the page offset and returns a group number that identifies a unique
242 * set of character attributes.
243 */
244
245static const unsigned char groupMap\[\] = {"
246    set line "    "
247    set lasti [expr {[llength $pages] - 1}]
248    for {set i 0} {$i <= $lasti} {incr i} {
249	set page [lindex $pages $i]
250	set lastj [expr {[llength $page] - 1}]
251#	if {$i == ($lastpage + 1)} {
252#	    puts $f [string trimright $line " \t,"]
253#	    puts $f "#if TCL_UTF_MAX > 3"
254#	    set line "    ,"
255#	}
256	for {set j 0} {$j <= $lastj} {incr j} {
257	    append line [lindex $page $j]
258	    if {$j != $lastj || $i != $lasti} {
259		append line ", "
260	    }
261	    if {[string length $line] > 70} {
262		puts $f [string trimright $line]
263		set line "    "
264	    }
265	}
266    }
267    puts $f $line
268#    puts $f "#endif /* TCL_UTF_MAX > 3 */"
269    puts $f "};
270
271/*
272 * Each group represents a unique set of character attributes.  The attributes
273 * are encoded into a 32-bit value as follows:
274 *
275 * Bits 0-4	Character category: see the constants listed below.
276 *
277 * Bits 5-7	Case delta type: 000 = identity
278 *				 010 = add delta for lower
279 *				 011 = add delta for lower, add 1 for title
280 *				 100 = subtract delta for title/upper
281 *				 101 = sub delta for upper, sub 1 for title
282 *				 110 = sub delta for upper, add delta for lower
283 *
284 * Bits 8-31	Case delta: delta for case conversions.  This should be the
285 *			    highest field so we can easily sign extend.
286 */
287
288static const int groups\[\] = {"
289    set line "    "
290    set last [expr {[llength $groups] - 1}]
291    set max_delta -1
292    for {set i 0} {$i <= $last} {incr i} {
293	foreach {type toupper tolower totitle} [lindex $groups $i] {}
294
295	# Compute the case conversion type and delta
296
297	if {$totitle} {
298	    if {$totitle == $toupper} {
299		# subtract delta for title or upper
300		set case 4
301		set delta $toupper
302		if {$tolower} {
303		    error "New case conversion type needed: $toupper $tolower $totitle"
304		}
305	    } elseif {$toupper} {
306		# subtract delta for upper, subtract 1 for title
307		set case 5
308		set delta $toupper
309		if {($totitle != 1) || $tolower} {
310		    error "New case conversion type needed: $toupper $tolower $totitle"
311		}
312	    } else {
313		# add delta for lower, add 1 for title
314		set case 3
315		set delta $tolower
316		if {$totitle != -1} {
317		    error "New case conversion type needed: $toupper $tolower $totitle"
318		}
319	    }
320	} elseif {$toupper} {
321	    # subtract delta for upper, add delta for lower
322	    set case 6
323	    set delta $toupper
324	    if {$tolower != $toupper} {
325		error "New case conversion type needed: $toupper $tolower $totitle"
326	    }
327	} elseif {$tolower} {
328	    # add delta for lower
329	    set case 2
330	    set delta $tolower
331	} else {
332	    # noop
333	    set case 0
334	    set delta 0
335	}
336
337	if {$delta >= (1 << 23) || $delta < -(1<<23)} {
338	    error "delta $delta out of range"
339	}
340	if {$delta > $max_delta} {
341	    set max_delta $delta
342	} elseif {-$delta > $max_delta} {
343	    set max_delta [expr {-$delta}]
344	}
345	append line [expr {($delta << 8) | ($case << 5) | $type}]
346	if {$i != $last} {
347	    append line ", "
348	}
349	if {[string length $line] > 65} {
350	    puts $f [string trimright $line]
351	    set line "    "
352	}
353    }
354    puts "max_delta = $max_delta"
355    puts $f $line
356    puts -nonewline $f "};
357
358#if 0
359
360#if TCL_UTF_MAX > 3
361#   define UNICODE_OUT_OF_RANGE(ch) (((ch) & 0x1fffff) >= [format 0x%x $next])
362#else
363#   define UNICODE_OUT_OF_RANGE(ch) (((ch) & 0x1f0000) != 0)
364#endif
365
366/*
367 * The following constants are used to determine the category of a
368 * Unicode character.
369 */
370
371enum {
372    UNASSIGNED,
373    UPPERCASE_LETTER,
374    LOWERCASE_LETTER,
375    TITLECASE_LETTER,
376    MODIFIER_LETTER,
377    OTHER_LETTER,
378    NON_SPACING_MARK,
379    ENCLOSING_MARK,
380    COMBINING_SPACING_MARK,
381    DECIMAL_DIGIT_NUMBER,
382    LETTER_NUMBER,
383    OTHER_NUMBER,
384    SPACE_SEPARATOR,
385    LINE_SEPARATOR,
386    PARAGRAPH_SEPARATOR,
387    CONTROL,
388    FORMAT,
389    PRIVATE_USE,
390    SURROGATE,
391    CONNECTOR_PUNCTUATION,
392    DASH_PUNCTUATION,
393    OPEN_PUNCTUATION,
394    CLOSE_PUNCTUATION,
395    INITIAL_QUOTE_PUNCTUATION,
396    FINAL_QUOTE_PUNCTUATION,
397    OTHER_PUNCTUATION,
398    MATH_SYMBOL,
399    CURRENCY_SYMBOL,
400    MODIFIER_SYMBOL,
401    OTHER_SYMBOL
402};
403
404/*
405 * The following macros extract the fields of the character info.  The
406 * GetDelta() macro is complicated because we can't rely on the C compiler
407 * to do sign extension on right shifts.
408 */
409
410#define GetCaseType(info) (((info) & 0xe0) >> 5)
411#define GetCategory(ch) (GetUniCharInfo(ch) & 0x1f)
412#define GetDelta(info) ((info) >> 8)
413#endif
414
415/** Extract information about a Unicode character.
416 *
417 *  This function extracts the information about a character from the
418 *  Unicode character tables.
419 */
420int
421Xapian::Unicode::Internal::get_character_info(unsigned ch) XAPIAN_NOEXCEPT
422{
423    if (rare(ch >= 0x110000)) {
424	// Categorise non-Unicode values as UNASSIGNED with no case variants.
425	return Xapian::Unicode::UNASSIGNED;
426    }
427    auto group = (pageMap\[int(ch) >> OFFSET_BITS\] << OFFSET_BITS) |
428		 ((ch) & ((1 << OFFSET_BITS) - 1));
429    return groups\[groupMap\[group\]\];
430}
431"
432
433    close $f
434}
435
436uni::main
437
438return
439