1# uniParse.tcl -- 2# 3# This program parses the UnicodeData file and generates the 4# corresponding tclUniData.c file with compressed character 5# data tables. The input to this program should be the latest 6# UnicodeData file from: 7# ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt 8# 9# Copyright (c) 1998-1999 Scriptics Corporation. 10# All rights reserved. 11 12 13namespace eval uni { 14 set shift 5; # number of bits of data within a page 15 # This value can be adjusted to find the 16 # best split to minimize table size 17 18 variable pMap; # map from page to page index, each entry is 19 # an index into the pages table, indexed by 20 # page number 21 variable pages; # map from page index to page info, each 22 # entry is a list of indices into the groups 23 # table, the list is indexed by the offset 24 variable groups; # list of character info values, indexed by 25 # group number, initialized with the 26 # unassigned character group 27 28 variable categories { 29 Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp 30 Cc Cf Co Cs Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So 31 }; # Ordered list of character categories, must 32 # match the enumeration in the header file. 33} 34 35proc uni::getValue {items index} { 36 variable categories 37 38 # Extract character info 39 40 set category [lindex $items 2] 41 if {[scan [lindex $items 12] %x toupper] == 1} { 42 set toupper [expr {$index - $toupper}] 43 } else { 44 set toupper 0 45 } 46 if {[scan [lindex $items 13] %x tolower] == 1} { 47 set tolower [expr {$tolower - $index}] 48 } else { 49 set tolower 0 50 } 51 if {[scan [lindex $items 14] %x totitle] == 1} { 52 set totitle [expr {$index - $totitle}] 53 } elseif {$tolower} { 54 set totitle 0 55 } else { 56 set totitle $toupper 57 } 58 59 set categoryIndex [lsearch -exact $categories $category] 60 if {$categoryIndex < 0} { 61 error "Unexpected character category: $index($category)" 62 } 63 64 return [list $categoryIndex $toupper $tolower $totitle] 65} 66 67proc uni::getGroup {value} { 68 variable groups 69 70 set gIndex [lsearch -exact $groups $value] 71 if {$gIndex < 0} { 72 set gIndex [llength $groups] 73 lappend groups $value 74 } 75 return $gIndex 76} 77 78proc uni::addPage {info} { 79 variable pMap 80 variable pages 81 variable shift 82 83 set pIndex [lsearch -exact $pages $info] 84 if {$pIndex < 0} { 85 set pIndex [llength $pages] 86 lappend pages $info 87 } 88 lappend pMap [expr {$pIndex << $shift}] 89 return 90} 91 92proc uni::buildTables {data} { 93 variable shift 94 95 variable pMap {} 96 variable pages {} 97 variable groups {{0 0 0 0}} 98 variable next 0 99 set info {} ;# temporary page info 100 101 set mask [expr {(1 << $shift) - 1}] 102 103 foreach line [split $data \n] { 104 if {$line eq ""} { 105 if {!($next & $mask)} { 106 # next character is already on page boundary 107 continue 108 } 109 # fill remaining page 110 set line [format %X [expr {($next-1)|$mask}]] 111 append line ";;Cn;0;ON;;;;;N;;;;;\n" 112 } 113 114 set items [split $line \;] 115 116 scan [lindex $items 0] %x index 117 if {$index > 0x3FFFF} then { 118 # Ignore characters > plane 3 119 continue 120 } 121 set index [format %d $index] 122 123 set gIndex [getGroup [getValue $items $index]] 124 125 # Since the input table omits unassigned characters, these will 126 # show up as gaps in the index sequence. There are a few special cases 127 # where the gaps correspond to a uniform block of assigned characters. 128 # These are indicated as such in the character name. 129 130 # Enter all unassigned characters up to the current character. 131 if {($index > $next) \ 132 && ![regexp "Last>$" [lindex $items 1]]} { 133 for {} {$next < $index} {incr next} { 134 lappend info 0 135 if {($next & $mask) == $mask} { 136 addPage $info 137 set info {} 138 } 139 } 140 } 141 142 # Enter all assigned characters up to the current character 143 for {set i $next} {$i <= $index} {incr i} { 144 # Add the group index to the info for the current page 145 lappend info $gIndex 146 147 # If this is the last entry in the page, add the page 148 if {($i & $mask) == $mask} { 149 addPage $info 150 set info {} 151 } 152 } 153 set next [expr {$index + 1}] 154 } 155 return 156} 157 158proc uni::main {} { 159 global argc argv0 argv 160 variable pMap 161 variable pages 162 variable groups 163 variable shift 164 variable next 165 166 if {$argc != 2} { 167 puts stderr "\nusage: $argv0 <datafile> <outdir>\n" 168 exit 1 169 } 170 set f [open [lindex $argv 0] r] 171 set data [read $f] 172 close $f 173 174 buildTables $data 175 puts "X = [llength $pMap] Y= [llength $pages] A= [llength $groups]" 176 set size [expr {[llength $pMap]*2 + ([llength $pages]<<$shift)}] 177 puts "shift = $shift, space = $size" 178 179 set f [open [file join [lindex $argv 1] tclUniData.c] w] 180 fconfigure $f -translation lf -encoding utf-8 181 puts $f "/* 182 * tclUniData.c -- 183 * 184 * Declarations of Unicode character information tables. This file is 185 * automatically generated by the tools/uniParse.tcl script. Do not 186 * modify this file by hand. 187 * 188 * Copyright (c) 1998 Scriptics Corporation. 189 * All rights reserved. 190 */ 191 192/* 193 * A 16-bit Unicode character is split into two parts in order to index 194 * into the following tables. The lower OFFSET_BITS comprise an offset 195 * into a page of characters. The upper bits comprise the page number. 196 */ 197 198#define OFFSET_BITS $shift 199 200/* 201 * The pageMap is indexed by page number and returns an alternate page number 202 * that identifies a unique page of characters. Many Unicode characters map 203 * to the same alternate page number. 204 */ 205 206static const unsigned short pageMap\[\] = {" 207 set line " " 208 set last [expr {[llength $pMap] - 1}] 209 for {set i 0} {$i <= $last} {incr i} { 210 if {$i == [expr {0x10000 >> $shift}]} { 211 set line [string trimright $line " \t,"] 212 puts $f $line 213 set lastpage [expr {[lindex $line end] >> $shift}] 214 puts stdout "lastpage: $lastpage" 215 puts $f "#if TCL_UTF_MAX > 3 || TCL_MAJOR_VERSION > 8 || TCL_MINOR_VERSION > 6" 216 set line " ," 217 } 218 append line [lindex $pMap $i] 219 if {$i != $last} { 220 append line ", " 221 } 222 if {[string length $line] > 70} { 223 puts $f [string trimright $line] 224 set line " " 225 } 226 } 227 puts $f $line 228 puts $f "#endif /* TCL_UTF_MAX > 3 */" 229 puts $f "}; 230 231/* 232 * The groupMap is indexed by combining the alternate page number with 233 * the page offset and returns a group number that identifies a unique 234 * set of character attributes. 235 */ 236 237static const unsigned char groupMap\[\] = {" 238 set line " " 239 set lasti [expr {[llength $pages] - 1}] 240 for {set i 0} {$i <= $lasti} {incr i} { 241 set page [lindex $pages $i] 242 set lastj [expr {[llength $page] - 1}] 243 if {$i == ($lastpage + 1)} { 244 puts $f [string trimright $line " \t,"] 245 puts $f "#if TCL_UTF_MAX > 3 || TCL_MAJOR_VERSION > 8 || TCL_MINOR_VERSION > 6" 246 set line " ," 247 } 248 for {set j 0} {$j <= $lastj} {incr j} { 249 append line [lindex $page $j] 250 if {$j != $lastj || $i != $lasti} { 251 append line ", " 252 } 253 if {[string length $line] > 70} { 254 puts $f [string trimright $line] 255 set line " " 256 } 257 } 258 } 259 puts $f $line 260 puts $f "#endif /* TCL_UTF_MAX > 3 */" 261 puts $f "}; 262 263/* 264 * Each group represents a unique set of character attributes. The attributes 265 * are encoded into a 32-bit value as follows: 266 * 267 * Bits 0-4 Character category: see the constants listed below. 268 * 269 * Bits 5-7 Case delta type: 000 = identity 270 * 010 = add delta for lower 271 * 011 = add delta for lower, add 1 for title 272 * 100 = subtract delta for title/upper 273 * 101 = sub delta for upper, sub 1 for title 274 * 110 = sub delta for upper, add delta for lower 275 * 111 = subtract delta for upper 276 * 277 * Bits 8-31 Case delta: delta for case conversions. This should be the 278 * highest field so we can easily sign extend. 279 */ 280 281static const int groups\[\] = {" 282 set line " " 283 set last [expr {[llength $groups] - 1}] 284 for {set i 0} {$i <= $last} {incr i} { 285 foreach {type toupper tolower totitle} [lindex $groups $i] {} 286 287 # Compute the case conversion type and delta 288 289 if {$totitle} { 290 if {$totitle == $toupper} { 291 # subtract delta for title or upper 292 set case 4 293 set delta $toupper 294 if {$tolower} { 295 error "New case conversion type needed: $toupper $tolower $totitle" 296 } 297 } elseif {$toupper} { 298 # subtract delta for upper, subtract 1 for title 299 set case 5 300 set delta $toupper 301 if {($totitle != 1) || $tolower} { 302 error "New case conversion type needed: $toupper $tolower $totitle" 303 } 304 } else { 305 # add delta for lower, add 1 for title 306 set case 3 307 set delta $tolower 308 if {$totitle != -1} { 309 error "New case conversion type needed: $toupper $tolower $totitle" 310 } 311 } 312 } elseif {$toupper} { 313 set delta $toupper 314 if {$tolower == $toupper} { 315 # subtract delta for upper, add delta for lower 316 set case 6 317 } elseif {!$tolower} { 318 # subtract delta for upper 319 set case 7 320 } else { 321 error "New case conversion type needed: $toupper $tolower $totitle" 322 } 323 } elseif {$tolower} { 324 # add delta for lower 325 set case 2 326 set delta $tolower 327 } else { 328 # noop 329 set case 0 330 set delta 0 331 } 332 333 append line [expr {($delta << 8) | ($case << 5) | $type}] 334 if {$i != $last} { 335 append line ", " 336 } 337 if {[string length $line] > 65} { 338 puts $f [string trimright $line] 339 set line " " 340 } 341 } 342 puts $f $line 343 puts -nonewline $f "}; 344 345#if TCL_UTF_MAX > 3 || TCL_MAJOR_VERSION > 8 || TCL_MINOR_VERSION > 6 346# define UNICODE_OUT_OF_RANGE(ch) (((ch) & 0x1FFFFF) >= [format 0x%X $next]) 347#else 348# define UNICODE_OUT_OF_RANGE(ch) (((ch) & 0x1F0000) != 0) 349#endif 350 351/* 352 * The following constants are used to determine the category of a 353 * Unicode character. 354 */ 355 356enum { 357 UNASSIGNED, 358 UPPERCASE_LETTER, 359 LOWERCASE_LETTER, 360 TITLECASE_LETTER, 361 MODIFIER_LETTER, 362 OTHER_LETTER, 363 NON_SPACING_MARK, 364 ENCLOSING_MARK, 365 COMBINING_SPACING_MARK, 366 DECIMAL_DIGIT_NUMBER, 367 LETTER_NUMBER, 368 OTHER_NUMBER, 369 SPACE_SEPARATOR, 370 LINE_SEPARATOR, 371 PARAGRAPH_SEPARATOR, 372 CONTROL, 373 FORMAT, 374 PRIVATE_USE, 375 SURROGATE, 376 CONNECTOR_PUNCTUATION, 377 DASH_PUNCTUATION, 378 OPEN_PUNCTUATION, 379 CLOSE_PUNCTUATION, 380 INITIAL_QUOTE_PUNCTUATION, 381 FINAL_QUOTE_PUNCTUATION, 382 OTHER_PUNCTUATION, 383 MATH_SYMBOL, 384 CURRENCY_SYMBOL, 385 MODIFIER_SYMBOL, 386 OTHER_SYMBOL 387}; 388 389/* 390 * The following macros extract the fields of the character info. The 391 * GetDelta() macro is complicated because we can't rely on the C compiler 392 * to do sign extension on right shifts. 393 */ 394 395#define GetCaseType(info) (((info) & 0xE0) >> 5) 396#define GetCategory(ch) (GetUniCharInfo(ch) & 0x1F) 397#define GetDelta(info) ((info) >> 8) 398 399/* 400 * This macro extracts the information about a character from the 401 * Unicode character tables. 402 */ 403 404#if TCL_UTF_MAX > 3 || TCL_MAJOR_VERSION > 8 || TCL_MINOR_VERSION > 6 405# define GetUniCharInfo(ch) (groups\[groupMap\[pageMap\[((ch) & 0x1FFFFF) >> OFFSET_BITS\] | ((ch) & ((1 << OFFSET_BITS)-1))\]\]) 406#else 407# define GetUniCharInfo(ch) (groups\[groupMap\[pageMap\[((ch) & 0xFFFF) >> OFFSET_BITS\] | ((ch) & ((1 << OFFSET_BITS)-1))\]\]) 408#endif 409" 410 411 close $f 412} 413 414uni::main 415 416return 417