1# uniParse.tcl --
2#
3#	This program parses the UnicodeData file and generates the
4#	corresponding tclUniData.c file with compressed character
5#	data tables.  The input to this program should be the latest
6#	UnicodeData file from:
7#	    ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
8#
9# Copyright (c) 1998-1999 Scriptics Corporation.
10# All rights reserved.
11
12
13namespace eval uni {
14    set shift 5;		# number of bits of data within a page
15				# This value can be adjusted to find the
16				# best split to minimize table size
17
18    variable pMap;		# map from page to page index, each entry is
19				# an index into the pages table, indexed by
20				# page number
21    variable pages;		# map from page index to page info, each
22				# entry is a list of indices into the groups
23				# table, the list is indexed by the offset
24    variable groups;		# list of character info values, indexed by
25				# group number, initialized with the
26				# unassigned character group
27
28    variable categories {
29	Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp
30	Cc Cf Co Cs Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So
31    };				# Ordered list of character categories, must
32				# match the enumeration in the header file.
33}
34
35proc uni::getValue {items index} {
36    variable categories
37
38    # Extract character info
39
40    set category [lindex $items 2]
41    if {[scan [lindex $items 12] %x toupper] == 1} {
42	set toupper [expr {$index - $toupper}]
43    } else {
44	set toupper 0
45    }
46    if {[scan [lindex $items 13] %x tolower] == 1} {
47	set tolower [expr {$tolower - $index}]
48    } else {
49	set tolower 0
50    }
51    if {[scan [lindex $items 14] %x totitle] == 1} {
52	set totitle [expr {$index - $totitle}]
53    } elseif {$tolower} {
54	set totitle 0
55    } else {
56	set totitle $toupper
57    }
58
59    set categoryIndex [lsearch -exact $categories $category]
60    if {$categoryIndex < 0} {
61	error "Unexpected character category: $index($category)"
62    }
63
64    return [list $categoryIndex $toupper $tolower $totitle]
65}
66
67proc uni::getGroup {value} {
68    variable groups
69
70    set gIndex [lsearch -exact $groups $value]
71    if {$gIndex < 0} {
72	set gIndex [llength $groups]
73	lappend groups $value
74    }
75    return $gIndex
76}
77
78proc uni::addPage {info} {
79    variable pMap
80    variable pages
81    variable shift
82
83    set pIndex [lsearch -exact $pages $info]
84    if {$pIndex < 0} {
85	set pIndex [llength $pages]
86	lappend pages $info
87    }
88    lappend pMap [expr {$pIndex << $shift}]
89    return
90}
91
92proc uni::buildTables {data} {
93    variable shift
94
95    variable pMap {}
96    variable pages {}
97    variable groups {{0 0 0 0}}
98    variable next 0
99    set info {}			;# temporary page info
100
101    set mask [expr {(1 << $shift) - 1}]
102
103    foreach line [split $data \n] {
104	if {$line eq ""} {
105	    if {!($next & $mask)} {
106		# next character is already on page boundary
107		continue
108	    }
109	    # fill remaining page
110	    set line [format %X [expr {($next-1)|$mask}]]
111	    append line ";;Cn;0;ON;;;;;N;;;;;\n"
112	}
113
114	set items [split $line \;]
115
116	scan [lindex $items 0] %x index
117	if {$index > 0x3FFFF} then {
118	    # Ignore characters > plane 3
119	    continue
120	}
121	set index [format %d $index]
122
123	set gIndex [getGroup [getValue $items $index]]
124
125	# Since the input table omits unassigned characters, these will
126	# show up as gaps in the index sequence.  There are a few special cases
127	# where the gaps correspond to a uniform block of assigned characters.
128	# These are indicated as such in the character name.
129
130	# Enter all unassigned characters up to the current character.
131	if {($index > $next) \
132		&& ![regexp "Last>$" [lindex $items 1]]} {
133	    for {} {$next < $index} {incr next} {
134		lappend info 0
135		if {($next & $mask) == $mask} {
136		    addPage $info
137		    set info {}
138		}
139	    }
140	}
141
142	# Enter all assigned characters up to the current character
143	for {set i $next} {$i <= $index} {incr i} {
144	    # Add the group index to the info for the current page
145	    lappend info $gIndex
146
147	    # If this is the last entry in the page, add the page
148	    if {($i & $mask) == $mask} {
149		addPage $info
150		set info {}
151	    }
152	}
153	set next [expr {$index + 1}]
154    }
155    return
156}
157
158proc uni::main {} {
159    global argc argv0 argv
160    variable pMap
161    variable pages
162    variable groups
163    variable shift
164    variable next
165
166    if {$argc != 2} {
167	puts stderr "\nusage: $argv0 <datafile> <outdir>\n"
168	exit 1
169    }
170    set f [open [lindex $argv 0] r]
171    set data [read $f]
172    close $f
173
174    buildTables $data
175    puts "X = [llength $pMap]  Y= [llength $pages]  A= [llength $groups]"
176    set size [expr {[llength $pMap]*2 + ([llength $pages]<<$shift)}]
177    puts "shift = $shift, space = $size"
178
179    set f [open [file join [lindex $argv 1] tclUniData.c] w]
180    fconfigure $f -translation lf -encoding utf-8
181    puts $f "/*
182 * tclUniData.c --
183 *
184 *	Declarations of Unicode character information tables.  This file is
185 *	automatically generated by the tools/uniParse.tcl script.  Do not
186 *	modify this file by hand.
187 *
188 * Copyright (c) 1998 Scriptics Corporation.
189 * All rights reserved.
190 */
191
192/*
193 * A 16-bit Unicode character is split into two parts in order to index
194 * into the following tables.  The lower OFFSET_BITS comprise an offset
195 * into a page of characters.  The upper bits comprise the page number.
196 */
197
198#define OFFSET_BITS $shift
199
200/*
201 * The pageMap is indexed by page number and returns an alternate page number
202 * that identifies a unique page of characters.  Many Unicode characters map
203 * to the same alternate page number.
204 */
205
206static const unsigned short pageMap\[\] = {"
207    set line "    "
208    set last [expr {[llength $pMap] - 1}]
209    for {set i 0} {$i <= $last} {incr i} {
210	if {$i == [expr {0x10000 >> $shift}]} {
211	    set line [string trimright $line " \t,"]
212	    puts $f $line
213	    set lastpage [expr {[lindex $line end] >> $shift}]
214	    puts stdout "lastpage: $lastpage"
215	    puts $f "#if TCL_UTF_MAX > 3 || TCL_MAJOR_VERSION > 8 || TCL_MINOR_VERSION > 6"
216	    set line "    ,"
217	}
218	append line [lindex $pMap $i]
219	if {$i != $last} {
220	    append line ", "
221	}
222	if {[string length $line] > 70} {
223	    puts $f [string trimright $line]
224	    set line "    "
225	}
226    }
227    puts $f $line
228    puts $f "#endif /* TCL_UTF_MAX > 3 */"
229    puts $f "};
230
231/*
232 * The groupMap is indexed by combining the alternate page number with
233 * the page offset and returns a group number that identifies a unique
234 * set of character attributes.
235 */
236
237static const unsigned char groupMap\[\] = {"
238    set line "    "
239    set lasti [expr {[llength $pages] - 1}]
240    for {set i 0} {$i <= $lasti} {incr i} {
241	set page [lindex $pages $i]
242	set lastj [expr {[llength $page] - 1}]
243	if {$i == ($lastpage + 1)} {
244	    puts $f [string trimright $line " \t,"]
245	    puts $f "#if TCL_UTF_MAX > 3 || TCL_MAJOR_VERSION > 8 || TCL_MINOR_VERSION > 6"
246	    set line "    ,"
247	}
248	for {set j 0} {$j <= $lastj} {incr j} {
249	    append line [lindex $page $j]
250	    if {$j != $lastj || $i != $lasti} {
251		append line ", "
252	    }
253	    if {[string length $line] > 70} {
254		puts $f [string trimright $line]
255		set line "    "
256	    }
257	}
258    }
259    puts $f $line
260    puts $f "#endif /* TCL_UTF_MAX > 3 */"
261    puts $f "};
262
263/*
264 * Each group represents a unique set of character attributes.  The attributes
265 * are encoded into a 32-bit value as follows:
266 *
267 * Bits 0-4	Character category: see the constants listed below.
268 *
269 * Bits 5-7	Case delta type: 000 = identity
270 *				 010 = add delta for lower
271 *				 011 = add delta for lower, add 1 for title
272 *				 100 = subtract delta for title/upper
273 *				 101 = sub delta for upper, sub 1 for title
274 *				 110 = sub delta for upper, add delta for lower
275 *				 111 = subtract delta for upper
276 *
277 * Bits 8-31	Case delta: delta for case conversions.  This should be the
278 *			    highest field so we can easily sign extend.
279 */
280
281static const int groups\[\] = {"
282    set line "    "
283    set last [expr {[llength $groups] - 1}]
284    for {set i 0} {$i <= $last} {incr i} {
285	foreach {type toupper tolower totitle} [lindex $groups $i] {}
286
287	# Compute the case conversion type and delta
288
289	if {$totitle} {
290	    if {$totitle == $toupper} {
291		# subtract delta for title or upper
292		set case 4
293		set delta $toupper
294		if {$tolower} {
295		    error "New case conversion type needed: $toupper $tolower $totitle"
296		}
297	    } elseif {$toupper} {
298		# subtract delta for upper, subtract 1 for title
299		set case 5
300		set delta $toupper
301		if {($totitle != 1) || $tolower} {
302		    error "New case conversion type needed: $toupper $tolower $totitle"
303		}
304	    } else {
305		# add delta for lower, add 1 for title
306		set case 3
307		set delta $tolower
308		if {$totitle != -1} {
309		    error "New case conversion type needed: $toupper $tolower $totitle"
310		}
311	    }
312	} elseif {$toupper} {
313	    set delta $toupper
314	    if {$tolower == $toupper} {
315		# subtract delta for upper, add delta for lower
316		set case 6
317	    } elseif {!$tolower} {
318		# subtract delta for upper
319		set case 7
320	    } else {
321		error "New case conversion type needed: $toupper $tolower $totitle"
322	    }
323	} elseif {$tolower} {
324	    # add delta for lower
325	    set case 2
326	    set delta $tolower
327	} else {
328	    # noop
329	    set case 0
330	    set delta 0
331	}
332
333	append line [expr {($delta << 8) | ($case << 5) | $type}]
334	if {$i != $last} {
335	    append line ", "
336	}
337	if {[string length $line] > 65} {
338	    puts $f [string trimright $line]
339	    set line "    "
340	}
341    }
342    puts $f $line
343    puts -nonewline $f "};
344
345#if TCL_UTF_MAX > 3 || TCL_MAJOR_VERSION > 8 || TCL_MINOR_VERSION > 6
346#   define UNICODE_OUT_OF_RANGE(ch) (((ch) & 0x1FFFFF) >= [format 0x%X $next])
347#else
348#   define UNICODE_OUT_OF_RANGE(ch) (((ch) & 0x1F0000) != 0)
349#endif
350
351/*
352 * The following constants are used to determine the category of a
353 * Unicode character.
354 */
355
356enum {
357    UNASSIGNED,
358    UPPERCASE_LETTER,
359    LOWERCASE_LETTER,
360    TITLECASE_LETTER,
361    MODIFIER_LETTER,
362    OTHER_LETTER,
363    NON_SPACING_MARK,
364    ENCLOSING_MARK,
365    COMBINING_SPACING_MARK,
366    DECIMAL_DIGIT_NUMBER,
367    LETTER_NUMBER,
368    OTHER_NUMBER,
369    SPACE_SEPARATOR,
370    LINE_SEPARATOR,
371    PARAGRAPH_SEPARATOR,
372    CONTROL,
373    FORMAT,
374    PRIVATE_USE,
375    SURROGATE,
376    CONNECTOR_PUNCTUATION,
377    DASH_PUNCTUATION,
378    OPEN_PUNCTUATION,
379    CLOSE_PUNCTUATION,
380    INITIAL_QUOTE_PUNCTUATION,
381    FINAL_QUOTE_PUNCTUATION,
382    OTHER_PUNCTUATION,
383    MATH_SYMBOL,
384    CURRENCY_SYMBOL,
385    MODIFIER_SYMBOL,
386    OTHER_SYMBOL
387};
388
389/*
390 * The following macros extract the fields of the character info.  The
391 * GetDelta() macro is complicated because we can't rely on the C compiler
392 * to do sign extension on right shifts.
393 */
394
395#define GetCaseType(info) (((info) & 0xE0) >> 5)
396#define GetCategory(ch) (GetUniCharInfo(ch) & 0x1F)
397#define GetDelta(info) ((info) >> 8)
398
399/*
400 * This macro extracts the information about a character from the
401 * Unicode character tables.
402 */
403
404#if TCL_UTF_MAX > 3 || TCL_MAJOR_VERSION > 8 || TCL_MINOR_VERSION > 6
405#   define GetUniCharInfo(ch) (groups\[groupMap\[pageMap\[((ch) & 0x1FFFFF) >> OFFSET_BITS\] | ((ch) & ((1 << OFFSET_BITS)-1))\]\])
406#else
407#   define GetUniCharInfo(ch) (groups\[groupMap\[pageMap\[((ch) & 0xFFFF) >> OFFSET_BITS\] | ((ch) & ((1 << OFFSET_BITS)-1))\]\])
408#endif
409"
410
411    close $f
412}
413
414uni::main
415
416return
417