1# uniParse.tcl -- 2# 3# This program parses the UnicodeData file and generates the 4# corresponding source file with compressed character 5# data tables. The input to this program should be the latest 6# UnicodeData file from: 7# ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt 8# 9# Copyright (c) 1998-1999 by Scriptics Corporation. 10# All rights reserved. 11 12 13namespace eval uni { 14 set shift 8; # number of bits of data within a page 15 # This value can be adjusted to find the 16 # best split to minimize table size 17 18 variable pMap; # map from page to page index, each entry is 19 # an index into the pages table, indexed by 20 # page number 21 variable pages; # map from page index to page info, each 22 # entry is a list of indices into the groups 23 # table, the list is indexed by the offset 24 variable groups; # list of character info values, indexed by 25 # group number, initialized with the 26 # unassigned character group 27 28 variable categories { 29 Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp 30 Cc Cf Co Cs Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So 31 }; # Ordered list of character categories, must 32 # match the enumeration in the header file. 33} 34 35proc uni::getValue {items index} { 36 variable categories 37 38 # Extract character info 39 40 set category [lindex $items 2] 41 if {[scan [lindex $items 12] %x toupper] == 1} { 42 set toupper [expr {$index - $toupper}] 43 } else { 44 set toupper 0 45 } 46 if {[scan [lindex $items 13] %x tolower] == 1} { 47 set tolower [expr {$tolower - $index}] 48 } else { 49 set tolower 0 50 } 51 if {[scan [lindex $items 14] %x totitle] == 1} { 52 set totitle [expr {$index - $totitle}] 53 } elseif {$tolower} { 54 set totitle 0 55 } else { 56 set totitle $toupper 57 } 58 59 set categoryIndex [lsearch -exact $categories $category] 60 if {$categoryIndex < 0} { 61 puts "Unexpected character category: $index($category)" 62 set categoryIndex 0 63 } 64 65 return [list $categoryIndex $toupper $tolower $totitle] 66} 67 68proc uni::getGroup {value} { 69 variable groups 70 71 set gIndex [lsearch -exact $groups $value] 72 if {$gIndex == -1} { 73 set gIndex [llength $groups] 74 lappend groups $value 75 } 76 return $gIndex 77} 78 79proc uni::addPage {info} { 80 variable pMap 81 variable pages 82 83 set pIndex [lsearch -exact $pages $info] 84 if {$pIndex == -1} { 85 set pIndex [llength $pages] 86 lappend pages $info 87 } 88 lappend pMap $pIndex 89 return 90} 91 92proc uni::buildTables {data} { 93 variable shift 94 95 variable pMap {} 96 variable pages {} 97 variable groups {{0 0 0 0}} 98 variable next 0 99 set info {} ;# temporary page info 100 101 set mask [expr {(1 << $shift) - 1}] 102 103 foreach line [split $data \n] { 104 if {$line eq ""} { 105 if {!($next & $mask)} { 106 # next character is already on page boundary 107 continue 108 } 109 # fill remaining page 110 set line [format %X [expr {($next-1)|$mask}]] 111 append line ";;Cn;0;ON;;;;;N;;;;;\n" 112 } 113 114 set items [split $line \;] 115 116 scan [lindex $items 0] %x index 117 if {$index > 0x2ffff} then { 118 # Ignore non-BMP characters, as long as Tcl doesn't support them 119 #continue 120 } 121 set index [format %d $index] 122 123 set gIndex [getGroup [getValue $items $index]] 124 125 # Since the input table omits unassigned characters, these will 126 # show up as gaps in the index sequence. There are a few special cases 127 # where the gaps correspond to a uniform block of assigned characters. 128 # These are indicated as such in the character name. 129 130 # Enter all unassigned characters up to the current character. 131 if {($index > $next) \ 132 && ![regexp "Last>$" [lindex $items 1]]} { 133 for {} {$next < $index} {incr next} { 134 lappend info 0 135 if {($next & $mask) == $mask} { 136 addPage $info 137 set info {} 138 } 139 } 140 } 141 142 # Enter all assigned characters up to the current character 143 for {set i $next} {$i <= $index} {incr i} { 144 # Add the group index to the info for the current page 145 lappend info $gIndex 146 147 # If this is the last entry in the page, add the page 148 if {($i & $mask) == $mask} { 149 addPage $info 150 set info {} 151 } 152 } 153 set next [expr {$index + 1}] 154 } 155 return 156} 157 158proc uni::main {} { 159 global argc argv0 argv 160 variable pMap 161 variable pages 162 variable groups 163 variable shift 164 variable next 165 variable max_delta 166 167 if {$argc != 3} { 168 puts stderr "\nusage: $argv0 <datafile> <version> <outfile>\n" 169 exit 1 170 } 171 172 set f [open [lindex $argv 0] r] 173 set data [read $f] 174 close $f 175 176 buildTables $data 177 puts "X = [llength $pMap] Y= [llength $pages] A= [llength $groups]" 178 set size [expr {[llength $pMap] + ([llength $pages]<<$shift)}] 179 puts "shift = $shift, space = $size" 180 181 set f [open [lindex $argv 2] w] 182 fconfigure $f -translation lf 183 puts $f "/* 184 * [lindex $argv 2] -- 185 * 186 * Declarations of Unicode [lindex $argv 1] character information tables. This 187 * file is automatically generated by a modified version of the 188 * tools/uniParse.tcl script from the Tcl sources. 189 * 190 * Do not modify this file by hand! 191 * 192 * Copyright (c) 1998 by Scriptics Corporation. 193 * All rights reserved. 194 */ 195 196#include <config.h> 197 198#include <xapian/unicode.h> 199 200/* 201 * A 16-bit Unicode character is split into two parts in order to index 202 * into the following tables. The lower OFFSET_BITS comprise an offset 203 * into a page of characters. The upper bits comprise the page number. 204 */ 205 206#define OFFSET_BITS $shift 207 208/* 209 * The pageMap is indexed by page number and returns an alternate page number 210 * that identifies a unique page of characters. Many Unicode characters map 211 * to the same alternate page number. 212 */ 213 214static const unsigned char pageMap\[\] = {" 215 set line " " 216 set last [expr {[llength $pMap] - 1}] 217 for {set i 0} {$i <= $last} {incr i} { 218# if {$i == [expr {0x10000 >> $shift}]} { 219# set line [string trimright $line " \t,"] 220# puts $f $line 221# set lastpage [expr {[lindex $line end] >> $shift}] 222# puts stdout "lastpage: $lastpage" 223# puts $f "#if TCL_UTF_MAX > 3" 224# set line " ," 225# } 226 append line [lindex $pMap $i] 227 if {$i != $last} { 228 append line ", " 229 } 230 if {[string length $line] > 70} { 231 puts $f [string trimright $line] 232 set line " " 233 } 234 } 235 puts $f $line 236# puts $f "#endif /* TCL_UTF_MAX > 3 */" 237 puts $f "}; 238 239/* 240 * The groupMap is indexed by combining the alternate page number with 241 * the page offset and returns a group number that identifies a unique 242 * set of character attributes. 243 */ 244 245static const unsigned char groupMap\[\] = {" 246 set line " " 247 set lasti [expr {[llength $pages] - 1}] 248 for {set i 0} {$i <= $lasti} {incr i} { 249 set page [lindex $pages $i] 250 set lastj [expr {[llength $page] - 1}] 251# if {$i == ($lastpage + 1)} { 252# puts $f [string trimright $line " \t,"] 253# puts $f "#if TCL_UTF_MAX > 3" 254# set line " ," 255# } 256 for {set j 0} {$j <= $lastj} {incr j} { 257 append line [lindex $page $j] 258 if {$j != $lastj || $i != $lasti} { 259 append line ", " 260 } 261 if {[string length $line] > 70} { 262 puts $f [string trimright $line] 263 set line " " 264 } 265 } 266 } 267 puts $f $line 268# puts $f "#endif /* TCL_UTF_MAX > 3 */" 269 puts $f "}; 270 271/* 272 * Each group represents a unique set of character attributes. The attributes 273 * are encoded into a 32-bit value as follows: 274 * 275 * Bits 0-4 Character category: see the constants listed below. 276 * 277 * Bits 5-7 Case delta type: 000 = identity 278 * 010 = add delta for lower 279 * 011 = add delta for lower, add 1 for title 280 * 100 = subtract delta for title/upper 281 * 101 = sub delta for upper, sub 1 for title 282 * 110 = sub delta for upper, add delta for lower 283 * 284 * Bits 8-31 Case delta: delta for case conversions. This should be the 285 * highest field so we can easily sign extend. 286 */ 287 288static const int groups\[\] = {" 289 set line " " 290 set last [expr {[llength $groups] - 1}] 291 set max_delta -1 292 for {set i 0} {$i <= $last} {incr i} { 293 foreach {type toupper tolower totitle} [lindex $groups $i] {} 294 295 # Compute the case conversion type and delta 296 297 if {$totitle} { 298 if {$totitle == $toupper} { 299 # subtract delta for title or upper 300 set case 4 301 set delta $toupper 302 if {$tolower} { 303 error "New case conversion type needed: $toupper $tolower $totitle" 304 } 305 } elseif {$toupper} { 306 # subtract delta for upper, subtract 1 for title 307 set case 5 308 set delta $toupper 309 if {($totitle != 1) || $tolower} { 310 error "New case conversion type needed: $toupper $tolower $totitle" 311 } 312 } else { 313 # add delta for lower, add 1 for title 314 set case 3 315 set delta $tolower 316 if {$totitle != -1} { 317 error "New case conversion type needed: $toupper $tolower $totitle" 318 } 319 } 320 } elseif {$toupper} { 321 # subtract delta for upper, add delta for lower 322 set case 6 323 set delta $toupper 324 if {$tolower != $toupper} { 325 error "New case conversion type needed: $toupper $tolower $totitle" 326 } 327 } elseif {$tolower} { 328 # add delta for lower 329 set case 2 330 set delta $tolower 331 } else { 332 # noop 333 set case 0 334 set delta 0 335 } 336 337 if {$delta >= (1 << 23) || $delta < -(1<<23)} { 338 error "delta $delta out of range" 339 } 340 if {$delta > $max_delta} { 341 set max_delta $delta 342 } elseif {-$delta > $max_delta} { 343 set max_delta [expr {-$delta}] 344 } 345 append line [expr {($delta << 8) | ($case << 5) | $type}] 346 if {$i != $last} { 347 append line ", " 348 } 349 if {[string length $line] > 65} { 350 puts $f [string trimright $line] 351 set line " " 352 } 353 } 354 puts "max_delta = $max_delta" 355 puts $f $line 356 puts -nonewline $f "}; 357 358#if 0 359 360#if TCL_UTF_MAX > 3 361# define UNICODE_OUT_OF_RANGE(ch) (((ch) & 0x1fffff) >= [format 0x%x $next]) 362#else 363# define UNICODE_OUT_OF_RANGE(ch) (((ch) & 0x1f0000) != 0) 364#endif 365 366/* 367 * The following constants are used to determine the category of a 368 * Unicode character. 369 */ 370 371enum { 372 UNASSIGNED, 373 UPPERCASE_LETTER, 374 LOWERCASE_LETTER, 375 TITLECASE_LETTER, 376 MODIFIER_LETTER, 377 OTHER_LETTER, 378 NON_SPACING_MARK, 379 ENCLOSING_MARK, 380 COMBINING_SPACING_MARK, 381 DECIMAL_DIGIT_NUMBER, 382 LETTER_NUMBER, 383 OTHER_NUMBER, 384 SPACE_SEPARATOR, 385 LINE_SEPARATOR, 386 PARAGRAPH_SEPARATOR, 387 CONTROL, 388 FORMAT, 389 PRIVATE_USE, 390 SURROGATE, 391 CONNECTOR_PUNCTUATION, 392 DASH_PUNCTUATION, 393 OPEN_PUNCTUATION, 394 CLOSE_PUNCTUATION, 395 INITIAL_QUOTE_PUNCTUATION, 396 FINAL_QUOTE_PUNCTUATION, 397 OTHER_PUNCTUATION, 398 MATH_SYMBOL, 399 CURRENCY_SYMBOL, 400 MODIFIER_SYMBOL, 401 OTHER_SYMBOL 402}; 403 404/* 405 * The following macros extract the fields of the character info. The 406 * GetDelta() macro is complicated because we can't rely on the C compiler 407 * to do sign extension on right shifts. 408 */ 409 410#define GetCaseType(info) (((info) & 0xe0) >> 5) 411#define GetCategory(ch) (GetUniCharInfo(ch) & 0x1f) 412#define GetDelta(info) ((info) >> 8) 413#endif 414 415/** Extract information about a Unicode character. 416 * 417 * This function extracts the information about a character from the 418 * Unicode character tables. 419 */ 420int 421Xapian::Unicode::Internal::get_character_info(unsigned ch) XAPIAN_NOEXCEPT 422{ 423 if (rare(ch >= 0x110000)) { 424 // Categorise non-Unicode values as UNASSIGNED with no case variants. 425 return Xapian::Unicode::UNASSIGNED; 426 } 427 auto group = (pageMap\[int(ch) >> OFFSET_BITS\] << OFFSET_BITS) | 428 ((ch) & ((1 << OFFSET_BITS) - 1)); 429 return groups\[groupMap\[group\]\]; 430} 431" 432 433 close $f 434} 435 436uni::main 437 438return 439