1#lang racket/base 2 3;; This script parses UnicodeData.txt (the standard Unicode database, 4;; available from the web) and other such files, and it produces 5;; "schuchar.inc" and "schustr.inc". The former is used by 6;; scheme_isalpha, etc., and thus `char-alphabetic?', etc. The latter 7;; is used for string operations. 8 9;; Run as 10;; racket mk-uchar.rkt 11;; in the script's directory, and have a copy of UnicodeData.txt, etc. 12;; in the "Unicode" directory. The file schuchar.inc will be 13;; overwritten. 14 15(require racket/list) 16 17(define mark-cats '("Mn" "Mc" "Me")) 18(define letter-cats '("Lu" "Ll" "Lt" "Lm" "Lo")) 19(define digit-cats '("Nd" "No" "Nl")) 20(define space-cats '("Zl" "Zs" "Zp")) 21(define punc-cats '("Pc" "Pd" "Ps" "Pe" "Pi" "Pf" "Po")) 22(define sym-cats '("Sm" "Sc" "Sk" "So")) 23(define sympart-non-cats '("Ps" "Pe" "Pi" "Pf" "Zl" "Zs" "Zp")) 24(define graphic-cats (append mark-cats 25 letter-cats 26 digit-cats 27 punc-cats 28 sym-cats)) 29 30(define cases (cons (make-hash) (box 0))) 31 32(define (indirect t v limit) 33 (let ([r (hash-ref (car t) v (lambda () #f))]) 34 (or r 35 (let ([r (unbox (cdr t))]) 36 (set-box! (cdr t) (add1 r)) 37 (hash-set! (car t) v r) 38 (when (r . > . limit) 39 (error "too many indirects")) 40 r)))) 41 42(define (combine . l) 43 ;; The scheme_is...() macros in scheme.h must match 44 ;; the bit layout produced here 45 (let loop ([l l][v 0]) 46 (if (null? l) 47 v 48 (loop (cdr l) (bitwise-ior (arithmetic-shift v 1) 49 (if (car l) 50 1 51 0)))))) 52 53(define (combine-case up down title fold combining) 54 (indirect cases (list up down title fold combining) 256)) 55 56(define general-categories (make-hash)) 57(define (combine-cat cat) 58 (hash-ref general-categories cat 59 (lambda () 60 (let ([v (hash-count general-categories)]) 61 (hash-set! general-categories cat v) 62 v)))) 63;; So they're in order: 64(with-input-from-file "schgencat.h" 65 (lambda () 66 (let loop () 67 (let ([l (read-line)]) 68 (unless (eof-object? l) 69 (let ([m (regexp-match #rx"mzu_([A-Z][a-z])" l)]) 70 (when m 71 (combine-cat (cadr m)))) 72 (loop)))))) 73 74(define hexes (map char->integer (string->list "0123456789abcdefABCDEF"))) 75 76(define combining-class-ht (make-hasheq)) 77 78;; In principle, adjust this number to tune the result, but 79;; the macros for accessing the table (in scheme.h) need to 80;; be updated accordingly. 81;; In practice, it's unlikely that anything will ever work 82;; much better than 8. 83(define low-bits 8) 84 85(define low (sub1 (expt 2 low-bits))) 86(define hi-count (expt 2 (- 21 low-bits))) 87(define hi (arithmetic-shift (sub1 hi-count) low-bits)) 88 89(define top (make-vector hi-count #f)) 90(define top2 (make-vector hi-count #f)) 91(define top3 (make-vector hi-count #f)) 92 93(define range-bottom 0) 94(define range-top -1) 95(define range-v -1) 96(define range-v2 -1) 97(define range-v3 -1) 98(define ranges null) 99 100(define ccount 0) 101 102(define (map1 c v v2 v3 cc) 103 (hash-set! combining-class-ht c cc) 104 (set! ccount (add1 ccount)) 105 (if (= c (add1 range-top)) 106 (begin 107 (unless (and (= v range-v) 108 (= v2 range-v2) 109 (= v3 range-v3)) 110 (set! range-v -1)) 111 (set! range-top c)) 112 (begin 113 ;; Drop surrogate from range. 114 ;; At the time of implementation, the following 115 ;; was never executed, because #D7FF wasn't mapped: 116 (when (and (< range-bottom #xD800) 117 (> range-top #xD800)) 118 (set! ranges (cons (list range-bottom #xD7FF (range-v . > . -1)) 119 ranges)) 120 (set! range-bottom #xE000)) 121 ;; ... but this one was executed. 122 (when (= range-bottom #xD800) 123 (set! range-bottom #xE000)) 124 (set! ranges (cons (list range-bottom range-top (range-v . > . -1)) 125 ranges)) 126 (set! range-bottom c) 127 (set! range-top c) 128 (set! range-v v) 129 (set! range-v2 v2) 130 (set! range-v3 v3))) 131 (let ([top-index (arithmetic-shift c (- low-bits))]) 132 (let ([vec (vector-ref top top-index)] 133 [vec2 (vector-ref top2 top-index)] 134 [vec3 (vector-ref top3 top-index)]) 135 (unless vec 136 (vector-set! top top-index (make-vector (add1 low)))) 137 (unless vec2 138 (vector-set! top2 top-index (make-vector (add1 low)))) 139 (unless vec3 140 (vector-set! top3 top-index (make-vector (add1 low)))) 141 (let ([vec (vector-ref top top-index)] 142 [vec2 (vector-ref top2 top-index)] 143 [vec3 (vector-ref top3 top-index)]) 144 (vector-set! vec (bitwise-and c low) v) 145 (vector-set! vec2 (bitwise-and c low) v2) 146 (vector-set! vec3 (bitwise-and c low) v3))))) 147 148(define (mapn c from v v2 v3 cc) 149 (if (= c from) 150 (map1 c v v2 v3 cc) 151 (begin 152 (map1 from v v2 v3 cc) 153 (mapn c (add1 from) v v2 v3 cc)))) 154 155(define (set-compose-initial! c) 156 (let ([top-index (arithmetic-shift c (- low-bits))]) 157 (let ([vec (vector-ref top top-index)] 158 [i (bitwise-and c low) ]) 159 (vector-set! vec i (bitwise-ior #x8000 (vector-ref vec i)))))) 160 161(define (string->codes s) 162 (let ([m (regexp-match #rx"^[^0-9A-F]*([0-9A-F]+)" s)]) 163 (if m 164 (cons (string->number (cadr m) 16) 165 (string->codes (substring s (string-length (car m))))) 166 null))) 167 168;; This code assumes that Final_Sigma is the only condition that we care about: 169(define case-foldings (make-hash)) 170(define special-case-foldings (make-hash)) 171(call-with-input-file "Unicode/CaseFolding.txt" 172 (lambda (i) 173 (let loop () 174 (let ([l (read-line i)]) 175 (unless (eof-object? l) 176 (let ([m (regexp-match #rx"^([0-9A-F]+); *([CSF]) *;([^;]*)" l)]) 177 (when m 178 (let ([code (string->number (cadr m) 16)] 179 [variant (list-ref m 2)] 180 [folded (string->codes (list-ref m 3))]) 181 (if (string=? variant "F") 182 (hash-set! special-case-foldings code folded) 183 (hash-set! case-foldings code (car folded)))))) 184 (loop)))))) 185 186;; This code assumes that Final_Sigma is the only condition that we care about: 187(define special-casings (make-hash)) 188(define-struct special-casing (lower upper title folding final-sigma?) #:mutable) 189(call-with-input-file "Unicode/SpecialCasing.txt" 190 (lambda (i) 191 (let loop () 192 (let ([l (read-line i)]) 193 (unless (eof-object? l) 194 (let ([m (regexp-match #rx"^([0-9A-F]+);([^;]*);([^;]*);([^;]*);([^;]*)" l)]) 195 (when (and m 196 (regexp-match #rx"^(?:(?: *Final_Sigma *)|(?: *))(?:$|[;#].*)" (list-ref m 5))) 197 (let ([code (string->number (cadr m) 16)] 198 [lower (string->codes (list-ref m 2))] 199 [upper (string->codes (list-ref m 4))] 200 [title (string->codes (list-ref m 3))] 201 [final-sigma? (and (regexp-match #rx"Final_Sigma" (list-ref m 5)) #t)]) 202 (let ([folding (list (hash-ref case-foldings code (lambda () code)))]) 203 (hash-set! special-casings code (make-special-casing lower upper title folding final-sigma?)))))) 204 (loop)))))) 205 206(define lower-case (make-hash)) 207(define upper-case (make-hash)) 208(define alphabetic (make-hash)) 209(define case-ignorable (make-hash)) 210 211(with-input-from-file "Unicode/DerivedCoreProperties.txt" 212 (lambda () 213 (let loop () 214 (let ([l (read-line)]) 215 (unless (eof-object? l) 216 (let ([m (regexp-match #rx"^([0-9A-F.]+) *; ((Lower|Upper)case|Alphabetic|Case_Ignorable)" l)]) 217 (when m 218 (let* ([start (string->number (car (regexp-match #rx"^[0-9A-F]+" (car m))) 16)] 219 [end (let ([m (regexp-match #rx"^[0-9A-F]+[.][.]([0-9A-F]+)" (car m))]) 220 (if m 221 (string->number (cadr m) 16) 222 start))] 223 [t (cond 224 [(string=? (caddr m) "Lowercase") lower-case] 225 [(string=? (caddr m) "Uppercase") upper-case] 226 [(string=? (caddr m) "Alphabetic") alphabetic] 227 [(string=? (caddr m) "Case_Ignorable") case-ignorable] 228 [else (error "unknown property section")])]) 229 (let loop ([i start]) 230 (hash-set! t i #t) 231 (unless (= i end) 232 (loop (add1 i))))))) 233 (loop)))))) 234 235(define white_spaces (make-hash)) 236 237(with-input-from-file "Unicode/PropList.txt" 238 (lambda () 239 (let loop () 240 (let ([l (read-line)]) 241 (unless (eof-object? l) 242 (let ([m (regexp-match #rx"^([0-9A-F.]+) *; White_Space" l)]) 243 (when m 244 (let* ([start (string->number (car (regexp-match #rx"^[0-9A-F]+" (car m))) 16)] 245 [end (let ([m (regexp-match #rx"^[0-9A-F]+[.][.]([0-9A-F]+)" (car m))]) 246 (if m 247 (string->number (cadr m) 16) 248 start))]) 249 (let loop ([i start]) 250 (hash-set! white_spaces i #t) 251 (unless (= i end) 252 (loop (add1 i))))))) 253 (loop)))))) 254 255(define decomp-ht (make-hasheq)) 256(define k-decomp-ht (make-hasheq)) 257(define compose-initial-ht (make-hasheq)) 258(define compose-map (make-hash)) 259(define do-not-compose-ht (make-hash)) 260 261(with-input-from-file "Unicode/CompositionExclusions.txt" 262 (lambda () 263 (let loop () 264 (let ([l (read-line)]) 265 (unless (eof-object? l) 266 (let ([m (regexp-match #rx"^([0-9A-F.]+)" l)]) 267 (when m 268 (let ([code (string->number (car m) 16)]) 269 (hash-set! do-not-compose-ht code #t)))) 270 (loop)))))) 271 272(define (composition-key a b) 273 ;; If `a` and `b` are both in the BMP (i.e., both fit in 16 bits), 274 ;; map to a 32-bit key. 275 (bitwise-ior (arithmetic-shift (bitwise-and a #xFFFF) 16) 276 (bitwise-and b #xFFFF) 277 (arithmetic-shift 278 (bitwise-ior (arithmetic-shift (arithmetic-shift a -16) 279 5) 280 (arithmetic-shift b -16)) 281 32))) 282 283(define (composition-key-first k) 284 (bitwise-ior (bitwise-and (arithmetic-shift k -16) #xFFFF) 285 (arithmetic-shift (arithmetic-shift k -37) 16))) 286 287(define (extract-decomp decomp code) 288 (if (string=? decomp "") 289 #f 290 (let ([m (regexp-match #rx"^([0-9A-F]+) ?([0-9A-F]*)$" decomp)]) 291 (if m 292 ;; Canonical decomp 293 (let ([a (string->number (cadr m) 16)] 294 [b (if (string=? "" (caddr m)) 295 0 296 (string->number (caddr m) 16))]) 297 ;; Canonical composition? 298 (when (and (positive? b) 299 (not (hash-ref do-not-compose-ht 300 code 301 (lambda () #f)))) 302 (hash-set! compose-initial-ht a #t) 303 (let ([key (composition-key a b)]) 304 (when (hash-ref compose-map key (lambda () #f)) 305 (error 'decomp "composition already mapped: ~x for: ~x" key code)) 306 (hash-set! compose-map key code))) 307 (hash-set! decomp-ht code (cons a b)) 308 #t) 309 ;; Compatibility decomp 310 (let ([seq 311 (let loop ([str (cadr (regexp-match #rx"^<[^>]*> *(.*)$" decomp))]) 312 (let ([m (regexp-match #rx"^([0-9A-F]+) *(.*)$" str)]) 313 (if m 314 (cons (string->number (cadr m) 16) 315 (loop (caddr m))) 316 null)))]) 317 (hash-set! k-decomp-ht code seq) 318 #t))))) 319 320(define default-casing (make-hash)) 321 322(call-with-input-file "Unicode/UnicodeData.txt" 323 (lambda (i) 324 (let loop ([prev-code 0]) 325 (let ([l (read-line i)]) 326 (unless (eof-object? l) 327 (let ([m (regexp-match #rx"^([0-9A-F]+);([^;]*);([^;]*);([^;]*);[^;]*;([^;]*);[^;]*;[^;]*;([^;]*);[^;]*;[^;]*;[^;]*;([^;]*);([^;]*);([^;]*)" 328 l)]) 329 (unless m 330 (printf "no match: ~a\n" l)) 331 (let ([code (string->number (cadr m) 16)] 332 [name (caddr m)] 333 [cat (cadddr m)] 334 [combining (string->number (cadddr (cdr m)))] 335 [decomp (cadddr (cddr m))] 336 [numeric (cadddr (cdddr m))] 337 [up (string->number (cadddr (cddddr m)) 16)] 338 [down (string->number (cadddr (cddddr (cdr m))) 16)] 339 [title (string->number (cadddr (cddddr (cddr m))) 16)]) 340 (hash-set! default-casing code (list down up title)) 341 (let ([alphabetic? (hash-ref alphabetic code #f)] 342 [numeric? (not (string=? numeric ""))] 343 [symbolic? (member cat sym-cats)] 344 [punctuation? (member cat punc-cats)]) 345 (mapn code 346 (if (regexp-match #rx", Last>" name) 347 (add1 prev-code) 348 code) 349 ;; The booleans below are in most-siginficant-bit-first order 350 (combine 351 ;; Decomposition 352 (extract-decomp decomp code) 353 ;; special-casing 354 (or (hash-ref special-casings code (lambda () #f)) 355 (hash-ref special-case-foldings code (lambda () #f))) 356 ;; case-ignoreable 357 (hash-ref case-ignorable code #f) 358 ;; graphic 359 (or alphabetic? 360 numeric? 361 symbolic? 362 punctuation? 363 (member cat graphic-cats)) 364 ;; lowercase: 365 (hash-ref lower-case code (lambda () #f)) 366 #; 367 (and (not (<= #x2000 code #x2FFF)) 368 (not down) 369 (or up 370 (regexp-match #rx"SMALL LETTER" name) 371 (regexp-match #rx"SMALL LIGATURE" name))) 372 ;; uppercase; 373 (hash-ref upper-case code (lambda () #f)) 374 #; 375 (and (not (<= #x2000 code #x2FFF)) 376 (not up) 377 (or down 378 (regexp-match #rx"CAPITAL LETTER" name) 379 (regexp-match #rx"CAPITAL LIGATURE" name))) 380 ;; titlecase: 381 (string=? cat "Lt") 382 ;; letter 383 alphabetic? 384 #; 385 (member cat letter-cats) 386 ;; digit 387 numeric? 388 #; 389 (member cat digit-cats) 390 ;; SOMETHING - this bit not yet used 391 #f 392 ;; whitespace 393 (hash-ref white_spaces code #f) 394 #; 395 (or (member cat space-cats) 396 (member code '(#x9 #xa #xb #xc #xd #x85))) 397 ;; control 398 (or (<= #x0000 code #x001F) 399 (<= #x007F code #x009F)) 400 ;; punctuation 401 punctuation? 402 ;; symbol 403 symbolic? 404 ;; blank 405 (or (string=? cat "Zs") 406 (= code #x9))) 407 ;; Cases 408 (combine-case 409 (if up (- up code) 0) 410 (if down (- down code) 0) 411 (if title (- title code) 0) 412 (let ([case-fold (hash-ref case-foldings code (lambda () #f))]) 413 (if case-fold (- case-fold code) 0)) 414 combining) 415 ;; Category 416 (combine-cat cat) 417 ;; Combining class - used again to filter initial composes 418 combining)) 419 (loop code)))))))) 420 421(hash-for-each compose-initial-ht 422 (lambda (k v) 423 ;; A canonical decomposition that starts with a non-0 combining 424 ;; class is not re-created in a canonical composition. There 425 ;; are only two such leading character as of Unicode 4.0: 426 ;; U+0308 and U+0F71. 427 (when (zero? (hash-ref combining-class-ht k)) 428 (set-compose-initial! k)))) 429 430;; Remove compositions from compose map that start with 431;; a character whose combining class is not 0. As of Unicode 432;; 4.0, there are only four of these: U+0344, U+0F73, 433;; U+0F75, and U+0F81. 434(for-each (lambda (k) 435 (let ([a (composition-key-first k)]) 436 (unless (zero? (hash-ref combining-class-ht a)) 437 (hash-remove! compose-map k)))) 438 (hash-map compose-map (lambda (k v) k))) 439 440(define k-decomp-map-ht (make-hasheq)) 441(define k-decomp-strs-ht (make-hash)) 442(define k-decomp-strs-len 0) 443(define k-decomp-strs null) 444 445(define (fold-decomp s) 446 (cond 447 [(empty? s) empty] 448 [(empty? (cdr s)) 449 (let ([code (car s)]) 450 (let ([v (hash-ref decomp-ht code (lambda () #f))]) 451 (if v 452 (if (zero? (cdr v)) 453 (fold-decomp (list (car v))) 454 (fold-decomp (list (car v) (cdr v)))) 455 (let ([v (hash-ref k-decomp-ht code (lambda () #f))]) 456 (if v 457 (fold-decomp v) 458 (list code))))))] 459 [else (append (fold-decomp (list (car s))) 460 (fold-decomp (cdr s)))])) 461 462(for-each 463 (lambda (p) 464 (let* ([code (car p)] 465 [seq (fold-decomp (cdr p))] 466 [pos (hash-ref k-decomp-strs-ht seq 467 (lambda () 468 (begin0 469 k-decomp-strs-len 470 (hash-set! k-decomp-strs-ht seq 471 k-decomp-strs-len) 472 (set! k-decomp-strs 473 (append (reverse seq) k-decomp-strs)) 474 (set! k-decomp-strs-len (+ k-decomp-strs-len 475 (length seq))))))]) 476 (hash-set! k-decomp-map-ht code (cons pos (length seq))))) 477 ;; Sort to keep it deterministic: 478 (sort (hash-map k-decomp-ht cons) 479 (lambda (a b) (< (car a) (car b))))) 480 481 482(define vectors (make-hash)) 483(define vectors2 (make-hash)) 484(define vectors3 (make-hash)) 485 486(define pos 0) 487(define pos2 0) 488(define pos3 0) 489 490(current-output-port (open-output-file "schuchar.inc" #:exists 'truncate/replace)) 491 492(define (hash-vectors! top vectors get-pos set-pos!) 493 (let loop ([i 0]) 494 (unless (= i hi-count) 495 (let ([vec (vector-ref top i)]) 496 (when vec 497 (unless (hash-ref vectors vec (lambda () #f)) 498 (set-pos! (add1 (get-pos))) 499 (hash-set! vectors vec (get-pos)))) 500 (loop (add1 i)))))) 501 502(hash-vectors! top vectors (lambda () pos) (lambda (v) (set! pos v))) 503(hash-vectors! top2 vectors2 (lambda () pos2) (lambda (v) (set! pos2 v))) 504(hash-vectors! top3 vectors3 (lambda () pos3) (lambda (v) (set! pos3 v))) 505 506;; copy folding special cases to the special-cases table, if not there already: 507(hash-for-each special-case-foldings 508 (lambda (k v) 509 (let ([sc (hash-ref special-casings k (lambda () 510 (define d (hash-ref default-casing k)) 511 (let ([sc (make-special-casing 512 (list (or (car d) k)) 513 (list (or (cadr d) k)) 514 (list (or (caddr d) k)) 515 (list k) 516 #f)]) 517 (hash-set! special-casings k sc) 518 sc)))]) 519 (set-special-casing-folding! sc v)))) 520 521(define world-count (expt 2 10)) 522 523(printf "/* Generated by mk-uchar.rkt */\n\n") 524 525(printf "/* Character count: ~a */\n" ccount) 526(printf "/* Total bytes for all tables: ~a */\n\n" 527 (+ (* (add1 low) 528 (* 2 (add1 (length (hash-map vectors cons))))) 529 (* (add1 low) 530 (* 1 (add1 (length (hash-map vectors2 cons))))) 531 (* (add1 low) 532 (* 1 (add1 (length (hash-map vectors3 cons))))) 533 (* (hash-count decomp-ht) 534 8) 535 (* (hash-count compose-map) 536 2) 537 (* (hash-count k-decomp-map-ht) (+ 4 1 2)) 538 (* 2 k-decomp-strs-len) 539 (* 4 4 (unbox (cdr cases))) 540 (* 4 (* 2 hi-count)))) 541 542(printf (string-append 543 "/* Each of the following maps a character to a value\n" 544 " via the scheme_uchar_find() macro in scheme.h. */\n\n")) 545 546(printf "/* Character properties: */\n") 547(printf "READ_ONLY unsigned short *scheme_uchar_table[~a];\n" hi-count) 548 549(printf "\n/* Character case mapping as index into scheme_uchar_ups, etc.: */\n") 550(printf "READ_ONLY unsigned char *scheme_uchar_cases_table[~a];\n" hi-count) 551 552(printf "\n/* Character general categories: */\n") 553(printf "READ_ONLY unsigned char *scheme_uchar_cats_table[~a];\n" hi-count) 554 555(printf "\n/* The udata... arrays are used by init_uchar_table to fill the above mappings.*/\n\n") 556 557(define print-row 558 (lambda (vec name pos hex?) 559 (printf " /* ~a */\n" name) 560 (let loop ([i 0]) 561 (printf (if hex? " 0x~x~a" " ~a~a") 562 (or (vector-ref vec i) "0") 563 (if (and (= name pos) 564 (= i low)) 565 "" ",")) 566 (when (zero? (modulo (add1 i) 16)) 567 (newline)) 568 (unless (= i low) 569 (loop (add1 i)))))) 570 571(define (print-table type suffix vectors pos hex?) 572 (printf "READ_ONLY static unsigned ~a udata~a[] = {\n" type suffix) 573 (print-row (make-vector (add1 low) 0) 0 pos hex?) 574 (map (lambda (p) 575 (print-row (car p) (cdr p) pos hex?)) 576 (sort (hash-map vectors cons) 577 (lambda (a b) (< (cdr a) (cdr b))))) 578 (printf "};\n")) 579(print-table "short" "" vectors pos #t) 580(printf "\n") 581(print-table "char" "_cases" vectors2 pos2 #f) 582(print-table "char" "_cats" vectors3 pos3 #f) 583 584(printf "\n/* Case mapping size: ~a */\n" (hash-count (car cases))) 585(printf "/* Find an index into the ups, downs, etc. table for a character\n") 586(printf " by using scheme_uchar_cases_table; then, the value at the index\n") 587(printf " is relative to the original character (except for combining class,\n") 588(printf " of course). */\n") 589 590(define (print-shift t end select type name) 591 (printf "\nREAD_ONLY ~a scheme_uchar_~a[] = {\n" type name) 592 (for-each (lambda (p) 593 (printf " ~a~a" 594 (select (car p)) 595 (if (= (cdr p) (sub1 end)) 596 "" 597 ",")) 598 (when (zero? (modulo (add1 (cdr p)) 16)) 599 (newline))) 600 (sort (hash-map t cons) 601 (lambda (a b) (< (cdr a) (cdr b))))) 602 (printf " };\n")) 603 604(print-shift (car cases) (unbox (cdr cases)) car "int" "ups") 605(print-shift (car cases) (unbox (cdr cases)) cadr "int" "downs") 606(print-shift (car cases) (unbox (cdr cases)) caddr "int" "titles") 607(print-shift (car cases) (unbox (cdr cases)) cadddr "int" "folds") 608(print-shift (car cases) (unbox (cdr cases)) (lambda (x) (cadddr (cdr x))) "unsigned char" "combining_classes") 609 610(let ([l (sort (hash-map general-categories cons) 611 (lambda (a b) (< (cdr a) (cdr b))))]) 612 (printf "\n#define NUM_GENERAL_CATEGORIES ~a\n" (length l)) 613 (printf "READ_ONLY static const char *general_category_names[] = {") 614 (for-each (lambda (c) 615 (printf (if (zero? (cdr c)) 616 "\n ~s" 617 ",\n ~s") 618 (string-downcase (car c)))) 619 l) 620 (printf "\n};\n")) 621 622(set! ranges (cons (list range-bottom range-top (range-v . > . -1)) 623 ranges)) 624 625(printf "\n#define NUM_UCHAR_RANGES ~a\n" (length ranges)) 626(printf "\n#define URANGE_VARIES 0x40000000\n") 627(printf "READ_ONLY static int mapped_uchar_ranges[] = {\n") 628(for-each (lambda (r) 629 (printf " 0x~x, 0x~x~a~a\n" 630 (car r) 631 (cadr r) 632 (if (caddr r) "" " | URANGE_VARIES") 633 (if (= (cadr r) range-top) 634 "" 635 ","))) 636 (reverse ranges)) 637(printf "};\n") 638 639(printf "\nstatic void init_uchar_table(void)\n{\n") 640(printf " int i;\n\n") 641(printf " for (i = 0; i < ~a; i++) { \n" hi-count) 642(printf " scheme_uchar_table[i] = udata;\n") 643(printf " scheme_uchar_cases_table[i] = udata_cases;\n") 644(printf " scheme_uchar_cats_table[i] = udata_cats;\n") 645(printf " }\n") 646(printf "\n") 647(define (print-init top vectors suffix) 648 (let loop ([i 0]) 649 (unless (= i hi-count) 650 (let ([vec (vector-ref top i)]) 651 (if vec 652 (let ([same-count (let loop ([j (add1 i)]) 653 (if (equal? vec (vector-ref top j)) 654 (loop (add1 j)) 655 (- j i)))] 656 [vec-pos (* (add1 low) (hash-ref vectors vec))]) 657 (if (> same-count 4) 658 (begin 659 (printf " for (i = ~a; i < ~a; i++) {\n" 660 i (+ i same-count)) 661 (printf " scheme_uchar~a_table[i] = udata~a + ~a;\n" 662 suffix suffix 663 vec-pos) 664 (printf " }\n") 665 (loop (+ same-count i))) 666 (begin 667 (printf " scheme_uchar~a_table[~a] = udata~a + ~a;\n" 668 suffix 669 i 670 suffix 671 vec-pos) 672 (loop (add1 i))))) 673 (loop (add1 i))))))) 674(print-init top vectors "") 675(print-init top2 vectors2 "_cases") 676(print-init top3 vectors3 "_cats") 677(printf "}\n") 678 679;; ---------------------------------------- 680 681(current-output-port (open-output-file "schustr.inc" #:exists 'truncate/replace)) 682 683(printf "/* Generated by mk-uchar.rkt */\n\n") 684 685(define specials null) 686(define special-count 0) 687(define (register-special l) 688 (let ([l (reverse l)]) 689 (unless (let loop ([l l][specials specials]) 690 (cond 691 [(null? l) #t] 692 [(null? specials) #f] 693 [(= (car l) (car specials)) (loop (cdr l) (cdr specials))] 694 [else #f])) 695 (set! specials (append l specials)) 696 (set! special-count (+ special-count (length l)))) 697 (- special-count (length l)))) 698 699(printf "#define NUM_SPECIAL_CASINGS ~a\n\n" (hash-count special-casings)) 700(printf "READ_ONLY static int uchar_special_casings[] = {\n") 701(printf " /* code, down len, off, up len, off, title len, off, fold len, off, final-sigma? */\n") 702(let ([n (hash-count special-casings)]) 703 (for-each (lambda (p) 704 (set! n (sub1 n)) 705 (let ([code (car p)] 706 [sc (cdr p)]) 707 (let ([lower-start (register-special (special-casing-lower sc))] 708 [upper-start (register-special (special-casing-upper sc))] 709 [title-start (register-special (special-casing-title sc))] 710 [folding-start (register-special (special-casing-folding sc))]) 711 (printf " ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a, ~a~a" 712 code 713 (length (special-casing-lower sc)) lower-start 714 (length (special-casing-upper sc)) upper-start 715 (length (special-casing-title sc)) title-start 716 (length (special-casing-folding sc)) folding-start 717 (if (special-casing-final-sigma? sc) 1 0) 718 (if (zero? n) " " ",\n"))))) 719 (sort (hash-map special-casings cons) 720 (lambda (a b) (< (car a) (car b)))))) 721(printf "};\n") 722(printf "\n/* Offsets in scheme_uchar_special_casings point into here: */\n") 723(printf "READ_ONLY static int uchar_special_casing_data[] = {\n ") 724(let ([n 0]) 725 (for-each (lambda (v) 726 (printf 727 (cond 728 [(zero? n) "~a"] 729 [(zero? (modulo n 16)) ",\n ~a"] 730 [else ", ~a"]) 731 v) 732 (set! n (add1 n))) 733 (reverse specials))) 734(printf " };\n") 735 736(printf "\n#define SPECIAL_CASE_FOLD_MAX ~a\n" (apply 737 max 738 (hash-map 739 special-casings 740 (lambda (k v) 741 (length (special-casing-folding v)))))) 742 743 744 745 746(let () 747 (define (make-composes-table ps) 748 (list->vector (sort ps (lambda (a b) (< (car a) (car b)))))) 749 750 (define canon-composes 751 (make-composes-table (for/list ([(k v) (in-hash compose-map)] 752 #:when (k . <= . #xFFFFFFFF)) 753 (cons k v)))) 754 (define count (vector-length canon-composes)) 755 756 (define long-canon-composes 757 (make-composes-table (for/list ([(k v) (in-hash compose-map)] 758 #:when (k . > . #xFFFFFFFF)) 759 (cons k v)))) 760 (define long-count (vector-length long-canon-composes)) 761 762 (define-values (all-composes decomp-vector long-composes) 763 (let ([decomp-pos-ht (make-hasheq)] 764 [counter count] 765 [extra null] 766 [long-counter 0] 767 [longs null]) 768 (hash-for-each decomp-ht 769 (lambda (k v) 770 ;; Use table of composed shorts: 771 (let ([key (composition-key (car v) (cdr v))]) 772 (let ([pos 773 (if (and ((car v) . <= . #xFFFF) 774 ((cdr v) . <= . #xFFFF)) 775 (if (hash-ref compose-map key (lambda () #f)) 776 ;; Find index in comp vector: 777 (let loop ([i 0]) 778 (if (= key (car (vector-ref canon-composes i))) 779 i 780 (loop (add1 i)))) 781 ;; Add to compose table: 782 (begin0 783 counter 784 (set! extra (cons (cons key #f) extra)) 785 (set! counter (add1 counter)))) 786 ;; Use table of long+long sequences: 787 (begin 788 (set! long-counter (add1 long-counter)) 789 (set! longs (cons (cdr v) (cons (car v) longs))) 790 (- long-counter)))]) 791 (hash-set! decomp-pos-ht k pos)))) 792 ;; sort: 793 #t) 794 (values 795 (list->vector (append (vector->list canon-composes) 796 (reverse extra))) 797 (list->vector 798 (sort (hash-map decomp-pos-ht cons) 799 (lambda (a b) (< (car a) (car b))))) 800 (list->vector (reverse longs))))) 801 802 (printf "\n/* Subset of ~a decompositions used for canonical composition: */\n" 803 (vector-length all-composes)) 804 (printf "#define COMPOSE_TABLE_SIZE ~a\n\n" count) 805 806 (let ([print-compose-data 807 (lambda (type suffix which composes count hex? row-len) 808 (printf "READ_ONLY static ~a utable_~a[] = {\n" 809 type suffix) 810 (let loop ([i 0]) 811 (let ([v (which (vector-ref composes i))]) 812 (if (= i (sub1 count)) 813 (printf (format " ~a\n};\n" (if hex? "0x~x" "~a")) v) 814 (begin 815 (printf (format " ~a," (if hex? "0x~x" "~a")) v) 816 (when (zero? (modulo (add1 i) row-len)) 817 (newline)) 818 (loop (add1 i)))))))]) 819 (printf "/* utable_compose_pairs contains BMP pairs that form a canonical decomposition.\n") 820 (printf " The first COMPOSE_TABLE_SIZE are also canonical compositions, and they are\n") 821 (printf " sorted, so that a binary search can find the pair; the utable_compose_result\n") 822 (printf " table is in parallel for those COMPOSE_TABLE_SIZE to indicate the composed\n") 823 (printf " characters. Use scheme_needs_maybe_compose() from scheme.h to check whether\n") 824 (printf " a character might start a canonical decomposition. A zero as the second element\n") 825 (printf " of a composition means that it is a singleton decomposition.\n") 826 (printf " The entire utable_compose_pairs table is referenced by utable_decomp_indices\n") 827 (printf " to map characters to canonical decompositions.\n") 828 (printf " None of the [de]composition tables includes Hangol. */\n") 829 (print-compose-data "unsigned int" "compose_pairs" car all-composes (vector-length all-composes) #t 8) 830 (print-compose-data "unsigned int" "compose_result" cdr canon-composes count #t 8) 831 (printf "\n") 832 (printf "/* utable_compose_long_pairs contains a sequence of character pairs where at\n") 833 (printf " least one is outside the BMP, so it doesn't fit in utable_compose_pairs.\n") 834 (printf " Negative values in utable_decomp_indices map to this table; add one to\n") 835 (printf " the mapped index, negate, then multiply by 2 to find the pair. */\n") 836 (print-compose-data "unsigned int" "compose_long_pairs" values long-composes (vector-length long-composes) #t 8) 837 (printf "\n") 838 (printf "/* utable_canon_compose_long_pairs repeats information from utable_compose_long_pairs,\n") 839 (printf " but for canonical compositions only. The two characters are combined by putting the\n") 840 (printf " lower 16 bits of the combined numbers in the low 32 bits, and then the next higher 10\n") 841 (printf " bits provide the remaining 5 bits of each character, and the array is sorted. The\n") 842 (printf " canon_compose_long_result array provides in parellel the composed character. */\n") 843 (printf "#define LONG_COMPOSE_TABLE_SIZE ~a\n\n" long-count) 844 (print-compose-data "mzlonglong" "canon_compose_long_pairs" car long-canon-composes long-count #t 8) 845 (print-compose-data "unsigned int" "canon_compose_long_result" cdr long-canon-composes long-count #t 8) 846 (printf "\n") 847 (printf "/* utable_decomp_keys identifies characters that have a canonical decomposition;\n") 848 (printf " it is sorted, so binary search can be used, but use scheme_needs_decompose()\n") 849 (printf " from scheme.h to first determine whether a character may have a mapping in this table.\n") 850 (printf " (If scheme_needs_decompose(), may instead have a mapping in the kompat table.).\n") 851 (printf " The parallel utable_decomp_indices maps the corresponding character in this table\n") 852 (printf " to a composition pair in either utable_compose_pairs (when the index is positive) or\n") 853 (printf " utable_long_compose_pairs (when the index is negative). */\n") 854 (printf "#define DECOMPOSE_TABLE_SIZE ~a\n\n" (vector-length decomp-vector)) 855 (print-compose-data "unsigned int" "decomp_keys" car decomp-vector (vector-length decomp-vector) #t 8) 856 (print-compose-data "short" "decomp_indices" cdr decomp-vector (vector-length decomp-vector) #f 8) 857 858 (let ([k-decomp-vector 859 (list->vector 860 (sort (hash-map k-decomp-map-ht cons) 861 (lambda (a b) (< (car a) (car b)))))]) 862 (printf "\n") 863 (printf "/* utable_kompat_decomp_keys identifies characters that have a compatibility decomposition;\n") 864 (printf " it is sorted, and scheme_needs_decompose() is true for every key (but a character\n") 865 (printf " with scheme_needs_decompose(), may instead have a mapping in the canonical table.).\n") 866 (printf " The parallel utable_kompat_decomp_indices maps the corresponding character in this table\n") 867 (printf " to a composition string in kompat_decomp_strs with a length determined by the\n") 868 (printf " utable_kompat_decomp_lens table. The decomposition never contains characters that need\n") 869 (printf " further decomposition. */\n") 870 (printf "\n#define KOMPAT_DECOMPOSE_TABLE_SIZE ~a\n\n" (vector-length k-decomp-vector)) 871 (print-compose-data "unsigned int" "kompat_decomp_keys" car k-decomp-vector (vector-length k-decomp-vector) #t 8) 872 (print-compose-data "char" "kompat_decomp_lens" cddr 873 k-decomp-vector (vector-length k-decomp-vector) #f 24) 874 (print-compose-data "short" "kompat_decomp_indices" cadr 875 k-decomp-vector (vector-length k-decomp-vector) #f 16) 876 (let ([l (list->vector (reverse k-decomp-strs))]) 877 (print-compose-data "unsigned short" "kompat_decomp_strs" values l (vector-length l) #t 8))))) 878