1;;; 2;;; gen-unicode.scm - generate unicode-handling tables 3;;; 4;;; Originally written by Shiro Kawai, 2011 5;;; Public Domain - use as you like. 6;;; 7 8;; Reads Unicode data tables and generates various source files. 9 10;; This script can serve three operations. 11;; 12;; (1) Generate unicode-data.scm from Unicode character database 13;; It is only necessary when new version of Unicode is published, 14;; and the resulting unicode-data.scm is checked in to the source 15;; tree so that other developers don't need UCD. 16;; 17;; gosh ./gen-unicode.scm --import <unicode-database-directory> unicode-data.scm 18;; 19;; (2) Generate source files from unicode-data.scm 20;; To reduce runtime overhead, unicode properties are saved in binary 21;; tables, within the following two generated source files: 22;; 23;; char_attr.c - General category and case mappings. 24;; gauche/priv/unicode_attr.h - Grapheme break, word break, East-asian width 25;; 26;; This is done when you build from git source tree. 27;; 28;; gosh ./gen-unicode.scm --compile unicode-data.scm 29;; 30;; (3) Fetch Unicode character database files. This is for developer's 31;; convenience. 32;; 33;; gosh ./gen-unicode.scm --fetch <unicode-database-directory> [<unicode-version> 34;; Note that the original content in <unicode-database-directory> is 35;; overwritten. 36;; You might need to adjust code for newer version of Unicode. 37;; See doc/HOWTO-unicode.txt for the details. 38 39(use srfi-1) 40(use srfi-13) 41(use srfi-42) 42(use text.csv) 43(use util.match) 44(use file.util) 45(use gauche.dictionary) 46(use gauche.sequence) 47(use gauche.record) 48(use gauche.charconv) 49(use gauche.uvector) 50 51;; needs to load from source 52(add-load-path "../lib" :relative) 53(use text.unicode.ucd) 54(use text.unicode.codeset) 55 56;; rfc.http is only required by '--fetch' operation. We don't want to load 57;; it during build process, so let's autoload it. 58(autoload rfc.http http-get) 59 60 61;; We generate four kind of lookup structures. Each structure consists 62;; of various types of tables in order to reduce the size. 63;; 64;; * General categories and case bits 65;; 66;; This structure maps Unicode codepoint to a byte containing the 67;; following info: 68;; 69;; bit 7-6 - Alphabetic and case info 70;; 00 : non-alphabetic char 71;; 01 : lowercase alphabetic char 72;; 10 : uppercase alphabetic char 73;; 11 : caseless or titlecase alphabetic char 74;; bit 5 - Reserved 75;; bit 4-0 - General category (Lu, Nd, Cc, etc.) 76;; See ucd-general-categories 77;; and the enum in src/gauche/char_attr.h 78;; 79;; For U+0000 to U+1ffff, we have a single table that directly maps 80;; the codepoint to the above byte. 81;; 82;; unsigned char ucs_general_category_00000[0x20000] (131072 bytes) 83;; 84;; Above U+20000 the mappings are sparse and mostly contiguous, so we 85;; generate a C code that performs a binary search. 86;; 87;; unsigned char ucs_general_category_20000(ScmChar code) 88;; 89;; * Case mapping 90;; 91;; Almost all characters that needs case mappings are below #xffff. In 92;; Unicode 6.0, the only exceptions are 80 characters in U+104xx range 93;; (Deseret). 94;; 95;; Among case-mapped characters, almost all of them requires a simple 96;; rule---that is, either a character can be lowercased by adding some 97;; small value to the codepoint, or a character can be upcased and 98;; titlecased by adding some small value to the codepoint. These have 99;; simple casemap entries. Characters that requires more complex handling 100;; have extended casemap entries. 101;; 102;; The tables first maps a codepoint into 16bit entry. The entry 103;; represents either a simple casemap entry, or an index to an 104;; extended casemap entry. 105;; 106;; bit15 == 0: 107;; This is a simple entry. uppercase and titlecase is the same, 108;; no special case mappings, and conversion between 109;; (upper,title) <-> lower is simply done by adding the given offset. 110;; 111;; If bit14 is 0, this letter is uppercase. Converting to uppercase and 112;; titlecase is noop. Converting to lowercase is to add 113;; the offset. 114;; If bit14 is 1, this letter is lowercase. Converting to lowercase is 115;; noop. Converting to uppercase and titlecase is to add 116;; the offset. 117;; bit13-bit0, singned integer offset [-8192, 8191] 118;; 119;; bit15 == 1: 120;; This is an extended entry (except #xffff). 121;; bit14-bit0 is an index to an extended entry table. 122;; 123;; #xffff indicates empty entry. 124;; 125;; Characters that require case mapping tend to cluster, so we use two-staged 126;; table to lookup the 16-bit entry from the codepoint below U+10000. 127;; 128;; The bit 15-8 is the index of this table: 129;; 130;; static unsiged char casemap_000[256] 131;; 132;; If the value is 255, the character doesn't have case mappings. 133;; Otherwise, let V be the value of the above lookup, the entry 134;; can be looked up by the following table: 135;; 136;; static unsiged char casemap_subtable[V][<lower 8 bit of codepoint>]; 137;; 138;; We only need to have 18 subtables. 139;; 140;; * Digit values 141;; 142;; Characters with categrory Nd has associated numeric value 0..9. 143;; A set of decimal numeric characters for 0..9 are always contiguous. 144;; As of Unicode 6.2, we have 42 of such sets. 145;; 146;; * Break properties 147;; 148;; This structure maps codepoint to the Grapheme_Cluster_Break, 149;; Word_Break and Sentence_Break properties. 150;; 151;; See http://www.unicode.org/reports/tr29/ 152;; http://www.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakProperty.txt 153;; http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakProperty.txt 154;; http://www.unicode.org/Public/UNIDATA/auxiliary/SentenceBreakProperty.txt 155;; 156;; An entry is 8-bit, indicating the character's Word_Break and 157;; Grapheme_Break properties. In unicode_attr.h, they are 158;; prefixed with WB_ and GB_, respectively. 159;; Note that WB_CR, WB_LF, WB_Single_Quote, WB_Double_Quote, 160;; GB_CR and GB_LF values are *not* stored in the table. Each 161;; property value is assigned to single character and we check 162;; them separately. 163;; 164;; bit7-4: Word_Break property 165;; 10 CR (not stored in the table) 166;; 11 LF (not stored in the table) 167;; 0 Newline 168;; 1 Extend 169;; 2 Regional_Indicator 170;; 3 Format 171;; 4 Katakana 172;; 5 Hebrew_Letter 173;; 6 ALetter 174;; 12 Single_Quote (not stored in the table) 175;; 13 Double_Quote (not stored in the table) 176;; 7 MidLetter 177;; 8 MidNum 178;; 9 MidNumLet 179;; a Numeric 180;; b ExtendNumLet 181;; c WSegSpace 182;; d ZWJ 183;; e Other 184;; bit3-0: Graphene_Break property 185;; 10 CR (not stored in the table) 186;; 11 LF (not stored in the table) 187;; 0 Control 188;; 1 Extend 189;; 2 Regional_Indicator 190;; 3 Prepend 191;; 4 SpacingMark 192;; 5 L 193;; 6 V 194;; 7 T 195;; 8 LV 196;; 9 LVT 197;; a ZWJ 198;; b Other 199;; 200;; Codepoints below U+20000 are looked up by two-staged tables. 201;; First, look up this table with (codepoint >> 8). 202;; 203;; static unsigned char break_table[0x200] 204;; 205;; If the value is 255, both properties are 'Other'. 206;; Otherwise, the value is an index to the secondary table. 207;; 208;; static unsigned char break_subtable[index][256] 209;; 210;; The value of this table encodes WB and GB properties. 211;; 212;; Codepoints on or above U+20000 are all 'Other', except the following 213;; ranges. They are handled specially in the lookup procedure. 214;; 215;; E0001 GB_Control, WB_Format 216;; E0020..E007F GB_Control, WB_Format 217;; E0100..E01EF GB_Extend, WB_Extend 218;; 219 220;;; 221;;; Main entry 222;;; 223 224(define (main args) 225 (match (cdr args) 226 [("--fetch" dir . maybe-version) (apply fetch-ucd dir maybe-version)] 227 [("--import" dir ucdfile) 228 (unless (file-is-directory? dir) 229 (exit 1 "Directory required, but got: ~a" dir)) 230 (with-output-to-file ucdfile 231 (cut ucd-save-db (ucd-parse-files dir)))] 232 [("--compile" ucdfile) 233 (unless (file-exists? ucdfile) 234 (exit 1 "Couldn't open unicode data file: ~a" ucdfile)) 235 (generate-tables (call-with-input-file ucdfile ucd-load-db))] 236 [else 237 (exit 1 "Usage:\n\ 238 gen-unicode.scm --fetch <unicode-database-dir> [<unicode-version>]\n\ 239 gen-unicode.scm --import <unicode-database-dir> <ucd-file>\n\ 240 gen-unicode.scm --compile <unicode-data.scm>")]) 241 0) 242 243;;; 244;;; Fetching 245;;; 246(define (fetch-ucd dir :optional (version #f)) 247 (let ([path (if version 248 #"/Public/~|version|/ucd" 249 "/Public/UCD/latest/ucd")] 250 [datafiles '("UnicodeData.txt" 251 "SpecialCasing.txt" 252 "PropList.txt" 253 "EastAsianWidth.txt" 254 "auxiliary/GraphemeBreakProperty.txt" 255 "auxiliary/SentenceBreakProperty.txt" 256 "auxiliary/WordBreakProperty.txt")]) 257 (make-directory* (build-path dir "auxiliary")) 258 (dolist [f datafiles] 259 (let1 p (build-path dir f) 260 (display #"Getting ~|f|... ") (flush) 261 (call-with-output-file p 262 (^o (http-get "www.unicode.org" (build-path path f) 263 :sink o :flusher (^ _ #t)))) 264 (display "done\n"))))) 265 266;;; 267;;; Generate source file 268;;; 269 270(define (generate-tables db) 271 (receive (char_attr.p char_attr.c) (sys-mkstemp "char_attr.c.") 272 (receive (unicode_attr.p unicode_attr.h) (sys-mkstemp "unicode_attr.h.") 273 (with-output-to-port char_attr.p 274 (^() (preamble db) 275 (generate-category-tables db) 276 (generate-case-tables db) 277 (generate-digit-value-tables db))) 278 (with-output-to-port unicode_attr.p 279 (^() (preamble db) 280 (generate-break-tables db) 281 (generate-width-tables db))) 282 (close-port char_attr.p) 283 (close-port unicode_attr.p) 284 (sys-rename char_attr.c "char_attr.c") 285 (sys-rename unicode_attr.h "gauche/priv/unicode_attr.h")))) 286 287(define (preamble db) 288 (print "/* Generated automatically from Unicode character database */") 289 (print "/* See src/gen-unicode.scm for the description of data structures. */") 290 (print #"/* Unicode version ~(ucd-version db). Do not edit. */") 291 ) 292 293;; NB: The caracter category tables are generated for each supported 294;; internal encodings. The lookup function is defined in gauche/char_*.h, 295;; and must be in sync with the tables generated here. 296(define (generate-category-tables db) 297 (define (emit-entry e) 298 (if e 299 (cond 300 [(ucd-entry-uppercase e) (format #t " U(~a)," (ucd-entry-category e))] 301 [(ucd-entry-lowercase e) (format #t " L(~a)," (ucd-entry-category e))] 302 [(ucd-entry-alphabetic e) (format #t " A(~a)," (ucd-entry-category e))] 303 [else (format #t " ~a," (ucd-entry-category e))]) 304 (format #t " Cn,"))) 305 (define (emit-columns start end p) 306 (do-ec (:parallel (: k start end) 307 (:integers i)) 308 (begin 309 (cond [(zero? (mod i 256)) (format #t "\n /* ~5,'0x - */\n" k)] 310 [(zero? (mod i 8)) (newline)]) 311 (p k))) 312 (print)) 313 (define (ucs->entry ucs) (ucd-get-entry db ucs)) 314 315 ;; Setup 316 (dolist [c (ucd-general-categories)] 317 (format #t "#define ~a SCM_CHAR_CATEGORY_~a\n" c c)) 318 (print "#undef A") 319 (print "#undef U") 320 (print "#undef L") 321 (print "#define A(x) ((x)|SCM_CHAR_ALPHABETIC_BITS)") 322 (print "#define U(x) ((x)|SCM_CHAR_UPPERCASE_BITS)") 323 (print "#define L(x) ((x)|SCM_CHAR_LOWERCASE_BITS)") 324 325 ;; utf-8 ('none' is also here, for we treat it as latin-1.) 326 (let ([code-sets (build-code-sets db 'utf8 (ucd-general-categories))]) 327 (print "#if !defined(GAUCHE_CHAR_ENCODING_EUC_JP) && !defined(GAUCHE_CHAR_ENCODING_SJIS)") 328 ;; U+0000 - U+1ffff : direct table lookup 329 (display "static unsigned char ucs_general_category_00000[] = {") 330 (emit-columns 0 #x20000 (^i (emit-entry (ucs->entry i)))) 331 (print "};") 332 ;; Over U+20000 - binary search 333 (print) 334 (print "static unsigned char ucs_general_category_20000(ScmChar code)") 335 (print "{") 336 (print " /*") 337 (for-each (^e (format #t " ~6x ~a\n" (car e) (cdr e))) 338 (ucd-get-category-ranges db)) 339 (print " */") 340 (generate-bisect (coerce-to <vector> (ucd-get-category-ranges db)) 341 (^e (format "return ~a;" 342 (case (cdr e) 343 [(Lo) "A(Lo)"] 344 [(#f) "Cn"] 345 [else => identity])))) 346 (print "}") 347 (for-each 348 (^s (dump-code-set-in-C (cdr s))) 349 (sort (hash-table->alist code-sets) string<? (^x (x->string (car x))))) 350 (print "#endif /*defined(GAUCHE_CHAR_ENCODING_UTF_8)*/") 351 (print)) 352 353 ;; euc-jp 354 (let ([code-sets (build-code-sets db 'eucjp (ucd-general-categories))]) 355 (print "#if defined(GAUCHE_CHAR_ENCODING_EUC_JP)") 356 (display "static unsigned char eucjp_general_category_G0[] = {") 357 (emit-columns 0 #x80 (^i (emit-entry (eucjp->ucd-entry db i)))) 358 (print "};") 359 (display "static unsigned char eucjp_general_category_G1[] = {") 360 (dotimes [z (- #xff #xa1)] 361 (emit-columns (+ (ash (+ z #xa1) 8) #xa1) 362 (+ (ash (+ z #xa1) 8) #xff) 363 (^i (emit-entry (eucjp->ucd-entry db i))))) 364 (print "};") 365 (display "static unsigned char eucjp_general_category_G2[] = {") 366 (emit-columns #x8ea1 #x8ee0 (^i (emit-entry (eucjp->ucd-entry db i)))) 367 (print "};") 368 (display "static unsigned char eucjp_general_category_G3[] = {") 369 (dotimes [z (- #xff #xa1)] 370 (emit-columns (+ (ash (+ z #xa1) 8) #x8f00a1) 371 (+ (ash (+ z #xa1) 8) #x8f00ff) 372 (^i (emit-entry (eucjp->ucd-entry db i))))) 373 (print "};") 374 (for-each 375 (^s (dump-code-set-in-C (cdr s))) 376 (sort (hash-table->alist code-sets) string<? (^x (x->string (car x))))) 377 (print "#endif /*defined(GAUCHE_CHAR_ENCODING_EUC_JP)*/") 378 (print)) 379 380 ;; sjis 381 (let ([code-sets (build-code-sets db 'sjis (ucd-general-categories))]) 382 (print "#if defined(GAUCHE_CHAR_ENCODING_SJIS)") 383 (display "static unsigned char sjis_general_category_00[] = {") 384 (emit-columns 0 #x80 (^i (emit-entry (sjis->ucd-entry db i)))) 385 (print "};") 386 (display "static unsigned char sjis_general_category_a0[] = {") 387 (emit-columns #xa0 #xdf (^i (emit-entry (sjis->ucd-entry db i)))) 388 (print "};") 389 (display "static unsigned char sjis_general_category_8000[] = {") 390 (do-ec (: z #x8000 #xa000 256) 391 (emit-columns (+ z #x40) 392 (+ z #xfd) 393 (^i (emit-entry (sjis->ucd-entry db i))))) 394 (print "};") 395 (display "static unsigned char sjis_general_category_e000[] = {") 396 (do-ec (: z #xe000 #x10000 256) 397 (emit-columns (+ z #x40) 398 (+ z #xfd) 399 (^i (emit-entry (sjis->ucd-entry db i))))) 400 (print "};") 401 (for-each 402 (^s (dump-code-set-in-C (cdr s))) 403 (sort (hash-table->alist code-sets) string<? (^x (x->string (car x))))) 404 (print "#endif /*defined(GAUCHE_CHAR_ENCODING_SJIS)*/") 405 (print)) 406 407 ;; Bind predefined charset 408 ;; The CAT entry can be just a symbol or (alias symbol). 409 (print "static void init_predefined_charsets() {") 410 (dolist [cat (append (ucd-general-categories) 411 '(L LC M N P S Z C 412 (LETTER L) ASCII_LETTER 413 (DIGIT Nd) ASCII_DIGIT 414 LETTER_DIGIT ASCII_LETTER_DIGIT 415 (LOWER Ll) ASCII_LOWER 416 (UPPER Lu) ASCII_UPPER 417 (TITLE Lt) 418 GRAPHIC ASCII_GRAPHIC 419 PRINTING ASCII_PRINTING 420 (PUNCTUATION P) ASCII_PUNCTUATION 421 (SYMBOL S) ASCII_SYMBOL 422 (ISO_CONTROL Cc) ASCII_CONTROL 423 HEX_DIGIT 424 WHITESPACE ASCII_WHITESPACE 425 BLANK ASCII_BLANK 426 ASCII EMPTY 427 ASCII_WORD (WORD ASCII_WORD)))] 428 (let1 setname (if (pair? cat) (car cat) cat) 429 (print #" predef_sets[SCM_CHAR_SET_~|setname|] =") 430 (if (pair? cat) 431 (print #" predef_sets[SCM_CHAR_SET_~(cadr cat)];") 432 (print #" make_charset_~|setname|();")))) 433 (print "}") 434 435 ;; Teardown 436 (dolist [c (ucd-general-categories)] 437 (format #t "#undef ~a\n" c)) 438 (format #t "#undef A\n") 439 (format #t "#undef U\n") 440 (format #t "#undef L\n") 441 (print)) 442 443(define (generate-case-tables db) 444 (define subtables '()) 445 (define extended '()) 446 ;; returns a list of (code . ucd-entry-case-map) with U+HHHxx where HHH 447 ;; is given to the hi arg. 448 (define (gather-entries hb) 449 (fold-ec '() (: lb 256) (ucd-get-entry db (+ (* hb 256) lb)) 450 (^(e seed) (if (or (not e) (not (ucd-entry-case-map e))) 451 seed 452 (acons (+ (* hb 256) lb) (ucd-entry-case-map e) seed))))) 453 ;; ucd-entry-alist :: ((code . ucd-entry-case-map) ...) 454 ;; subtable :: (start-code . #(....)) 455 ;; returns (start-code . subtable-number) 456 (define (gen-subtable start-code ucd-entry-alist) 457 (if (null? ucd-entry-alist) 458 `(,start-code . 255) 459 (let* ([table-num (length subtables)] 460 [vec (make-vector 256 #f)]) 461 (dolist [p ucd-entry-alist] 462 (when (ucd-extended-case-map? (cdr p)) 463 (push! extended (cdr p))) 464 (set! (~ vec (logand (car p) #xff)) (cdr p))) 465 (push! subtables (cons start-code vec)) 466 (cons start-code table-num)))) 467 ;; 468 (define (gather-toptable) 469 (list-ec (: hi 256) (gen-subtable (* hi 256) (gather-entries hi)))) 470 471 ;; 472 (define (emit-subtable subtable) 473 (format #t " /* ~4,'0x - ~4,'0x */\n" 474 (car subtable) 475 (+ (car subtable) 255)) 476 (format #t " {\n") 477 (dotimes [n 256] 478 (format #t " ~20a, /* 0x~4,'0x */\n" 479 (if-let1 cmap (~ (cdr subtable) n) 480 (if (ucd-simple-case-map? cmap) 481 (format "~a(~a)" 482 (if (eq? (ucd-simple-case-map-case cmap) 'upper) 483 "TOLOWER" 484 "TOUPPER") 485 (ucd-simple-case-map-offset cmap)) 486 (format "EXTENDED(~a)" 487 (find-index (cut eq? cmap <>) extended))) 488 "NO_CASE_MAPPING") 489 (+ (car subtable) n))) 490 (format #t " },\n")) 491 492 (define (emit-subtables) 493 (format #t "static unsigned short casemap_subtable[][256] = {\n") 494 (for-each emit-subtable (reverse subtables)) 495 (format #t "};\n\n")) 496 497 (define (emit-toptable subtable-num-alist) 498 (let1 vec (make-vector 256 255) 499 (dolist [s subtable-num-alist] 500 (set! (~ vec (div (car s) 256)) (cdr s))) 501 (format #t "static unsigned char casemap_000[] = {") 502 (dotimes [i 256] 503 (when (zero? (mod i 16)) (newline)) 504 (format #t " ~3d," (~ vec i))) 505 (format #t "\n};\n"))) 506 507 (define (emit-extended-case-maps) 508 (format #t "static ScmCharCaseMap extended_casemaps[] = {\n") 509 (dolist [e extended] 510 (let ([simple (ucd-extended-case-map-simple-map e)] 511 [special (ucd-extended-case-map-special-map e)]) 512 (format #t " { ~s,~s,~s," 513 (or (~ simple 0) 0) 514 (or (~ simple 1) 0) 515 (or (~ simple 2) 0)) 516 (if special 517 (format #t "~a,~a,~a" 518 (extended-special-array (~ special 0)) 519 (extended-special-array (~ special 1)) 520 (extended-special-array (~ special 2))) 521 (format #t "{-1},{-1},{-1}")) 522 (format #t "}, /* ~4,'0x */\n" (ucd-extended-case-map-code e)))) 523 (format #t "};\n\n")) 524 (define (extended-special-array lis) 525 (apply format "{~a,~a,~a,-1}" (map (^i (list-ref lis i -1)) '(0 1 2)))) 526 527 ;; body of generate-case-tables 528 (print "#define NO_CASE_MAPPING SCM_CHAR_NO_CASE_MAPPING") 529 (print "#define TOLOWER(x) SCM_CHAR_CASEMAP_TOLOWER(x)") 530 (print "#define TOUPPER(x) SCM_CHAR_CASEMAP_TOUPPER(x)") 531 (print "#define EXTENDED(x) SCM_CHAR_CASEMAP_EXTENDED(x)") 532 (let1 subtable-num-alist (gather-toptable) 533 (emit-extended-case-maps) 534 (emit-subtables) 535 (emit-toptable subtable-num-alist)) 536 ) 537 538(define (generate-bisect entries gen-value) 539 (define (bisect lo hi indent) 540 (if (= (+ lo 1) hi) 541 (format #t "~v,a~a\n" (* 2 indent) " " (gen-value (~ entries lo))) 542 (let1 mid (div (+ lo hi) 2) 543 (format #t "~v,aif (code < 0x~x) {\n" 544 (* 2 indent) " " (car (~ entries mid))) 545 (bisect lo mid (+ indent 1)) 546 (format #t "~v,a} else {\n" (* 2 indent) " ") 547 (bisect mid hi (+ indent 1)) 548 (format #t "~v,a}\n" (* 2 indent) " ")))) 549 (bisect 0 (size-of entries) 1)) 550 551;; Digit-value tables. Note that we'll have a shortcut for the first 552;; chunk [0x30, 0x39], so we only generate for the second chunk and after 553(define (generate-digit-value-tables db) 554 (let1 ranges ($ reverse 555 $ fold (^[p s] (match s 556 [() `(,p (0 . #f))] 557 [((_ . last) . rest) 558 (if (= (car p) last) 559 `(,p ,@s) 560 `(,p (,last . #f) ,@s))])) 561 '() 562 $ cdr $ (cut sort-by <> car) $ filter identity 563 $ ucd-map-entries db 564 (^[code entry] 565 (and (eq? (ucd-entry-category entry) 'Nd) 566 (zero? (ucd-entry-digit-value entry)) 567 (cons code (+ code 10))))) 568 569 (print) 570 (print "static int ucs_digit_value(ScmChar code)") 571 (print "{") 572 (dolist [r ranges] 573 (format #t " /* ~5,'0x- ~a */\n" (car r) (if (cdr r) "Nd" "* "))) 574 (generate-bisect (coerce-to <vector> ranges) 575 (^e (if (cdr e) 576 (format "return (code - 0x~x);" (car e)) 577 "return -1;"))) 578 (print "}"))) 579 580;; Predefined character sets 581 582(define (for-each-char-code/none proc) 583 (dotimes [n 256] (proc n n))) 584 585(define (for-each-char-code/ucs proc) 586 (dotimes [n #x2ffff] (proc n n)) 587 (proc #xe0000 #xe007f) ; Language tags; Cf 588 (proc #xe0100 #xe01ef) ; Variation selectors; Mn 589 (proc #xf0000 #xffffd) ; Private use; Co 590 (proc #x100000 #x10fffd) ; Private use; Co 591 ) 592 593(define (for-each-char-code/euc-jp proc) 594 (dotimes [n 256] (proc n n)) 595 (do-ec (: n #x8ea1 #x8eff) (proc n n)) 596 (do-ec (: n #xa1a1 #xfeff) (proc n n)) 597 (do-ec (: n #x8fa1a1 #x8ffef7) (proc n n))) 598 599(define (for-each-char-code/sjis proc) 600 (dotimes [n 128] (proc n n)) 601 (do-ec (: n #xa0 #xe0) (proc n n)) 602 (do-ec (: n #xfd #x100) (proc n n)) 603 (do-ec (: n #x8140 #x9ffd) (proc n n)) 604 (do-ec (: n #xe040 #xfcfd) (proc n n))) 605 606(define (build-code-sets db encoding categories) 607 (define sets (make-hash-table 'eq?)) 608 (define walker (ecase encoding 609 [(none) for-each-char-code/none] 610 [(utf8) for-each-char-code/ucs] 611 [(eucjp) for-each-char-code/euc-jp] 612 [(sjis) for-each-char-code/sjis])) 613 (define get-entry 614 (let ([eucjp-entry-cache (make-hash-table 'eqv?)] 615 [sjis-entry-cache (make-hash-table 'eqv?)]) 616 (ecase encoding 617 [(none utf8) ucd-get-entry] 618 [(eucjp) (^[db n] 619 (if-let1 e (hash-table-get eucjp-entry-cache n #f) 620 (if (eq? e 'nothing) #f e) 621 (rlet1 e (eucjp->ucd-entry db n) 622 (hash-table-put! eucjp-entry-cache n (or e 'nothing)))))] 623 [(sjis) (^[db n] 624 (if-let1 e (hash-table-get sjis-entry-cache n #f) 625 (if (eq? e 'nothing) #f e) 626 (rlet1 e (sjis->ucd-entry db n) 627 (hash-table-put! sjis-entry-cache n (or e 'nothing)))))] 628 ))) 629 (define (register set cat n m) 630 (and-let1 e (get-entry db n) 631 (if (eq? cat (ucd-entry-category e)) 632 (if (= n m) 633 (add-code! set n) 634 (add-code-range! set n m))))) 635 ;; general category charsets (char-set:Lt etc.) 636 (dolist [cat categories] 637 (let1 cs (make <char-code-set> :name cat) 638 (walker (cut register cs cat <> <>)) 639 (hash-table-put! sets cat cs))) 640 ;; general category class charsets (char-set:L etc.) 641 (dolist [gcats (group-collection categories 642 :key (^c (string-ref (symbol->string c) 0)))] 643 (let1 cs 644 (make <char-code-set> 645 :name (string->symbol (substring (symbol->string (car gcats)) 0 1))) 646 (for-each (^c (walker (cut register cs c <> <>))) gcats) 647 (hash-table-put! sets (~ cs'name) cs))) 648 ;; srfi-14 charsets (the ones that has equivalent set in general category set 649 ;; is handled implicitly. 650 (hash-table-put! sets 'ASCII_UPPER 651 (rlet1 cs (make <char-code-set> :name 'ASCII_UPPER) 652 (add-code-range! cs 653 (char->integer #\A) 654 (char->integer #\Z)))) 655 (hash-table-put! sets 'ASCII_LOWER 656 (rlet1 cs (make <char-code-set> :name 'ASCII_LOWER) 657 (add-code-range! cs 658 (char->integer #\a) 659 (char->integer #\z)))) 660 (hash-table-put! sets 'ASCII_LETTER 661 (code-set-union 'ASCII_LETTER 662 (hash-table-ref sets 'ASCII_UPPER) 663 (hash-table-ref sets 'ASCII_LOWER))) 664 (hash-table-put! sets 'ASCII_DIGIT 665 (rlet1 cs (make <char-code-set> :name 'ASCII_DIGIT) 666 (add-code-range! cs 667 (char->integer #\0) 668 (char->integer #\9)))) 669 (hash-table-put! sets 'LETTER_DIGIT 670 (code-set-union 'LETTER_DIGIT 671 (hash-table-ref sets 'L) 672 (hash-table-ref sets 'Nd))) 673 (hash-table-put! sets 'ASCII_LETTER_DIGIT 674 (code-set-union 'ASCII_LETTER_DIGIT 675 (hash-table-ref sets 'ASCII_LETTER) 676 (hash-table-ref sets 'ASCII_DIGIT))) 677 (hash-table-put! sets 'ASCII_WHITESPACE 678 (rlet1 cs (make <char-code-set> :name 'ASCII_WHITESPACE) 679 (add-code-range! cs 9 13) ;TAB,LF,LTAB,FF,CR 680 (add-code! cs (char->integer #\space)))) 681 (hash-table-put! sets 'WHITESPACE 682 (code-set-union 'WHITESPACE 683 (hash-table-ref sets 'ASCII_WHITESPACE) 684 (hash-table-ref sets 'Z))) 685 (hash-table-put! sets 'ASCII_BLANK 686 (rlet1 cs (make <char-code-set> :name 'ASCII_BLANK) 687 (add-code! cs 9) ;TAB 688 (add-code! cs (char->integer #\space)))) 689 (hash-table-put! sets 'BLANK 690 (code-set-union 'BLANK 691 (hash-table-ref sets 'ASCII_BLANK) 692 (hash-table-ref sets 'Zs))) 693 (hash-table-put! sets 'ASCII_PUNCTUATION 694 (rlet1 cs (make <char-code-set> :name 'ASCII_PUNCTUATION) 695 (add-code! cs 33) ;! 696 (add-code! cs 34) ;" 697 (add-code! cs 35) ;# 698 (add-code! cs 37) ;% 699 (add-code! cs 38) ;& 700 (add-code! cs 39) ;' 701 (add-code! cs 40) ;( 702 (add-code! cs 41) ;) 703 (add-code! cs 42) ;* 704 (add-code! cs 44) ;, 705 (add-code! cs 45) ;- 706 (add-code! cs 46) ;. 707 (add-code! cs 47) ;/ 708 (add-code! cs 58) ;: 709 (add-code! cs 59) ;; 710 (add-code! cs 63) ;? 711 (add-code! cs 64) ;@ 712 (add-code! cs 91) ;[ 713 (add-code! cs 92) ;\ 714 (add-code! cs 93) ;) 715 (add-code! cs 95) ;_ 716 (add-code! cs 123) ;{ 717 (add-code! cs 125) ;} 718 )) 719 (hash-table-put! sets 'ASCII_SYMBOL 720 (rlet1 cs (make <char-code-set> :name 'ASCII_SYMBOL) 721 (add-code! cs 36) ;$ 722 (add-code! cs 43) ;+ 723 (add-code! cs 60) ;< 724 (add-code! cs 61) ;= 725 (add-code! cs 62) ;> 726 (add-code! cs 94) ;^ 727 (add-code! cs 96) ;` 728 (add-code! cs 124) ;| 729 (add-code! cs 126) ;~ 730 )) 731 (hash-table-put! sets 'GRAPHIC 732 (code-set-union 'GRAPHIC 733 (hash-table-ref sets 'L) 734 (hash-table-ref sets 'N) 735 (hash-table-ref sets 'P) 736 (hash-table-ref sets 'S))) 737 (hash-table-put! sets 'ASCII_GRAPHIC 738 (code-set-union 'ASCII_GRAPHIC 739 (hash-table-ref sets 'ASCII_LETTER) 740 (hash-table-ref sets 'ASCII_DIGIT) 741 (hash-table-ref sets 'ASCII_PUNCTUATION) 742 (hash-table-ref sets 'ASCII_SYMBOL))) 743 (hash-table-put! sets 'PRINTING 744 (code-set-union 'PRINTING 745 (hash-table-ref sets 'GRAPHIC) 746 (hash-table-ref sets 'WHITESPACE))) 747 (hash-table-put! sets 'ASCII_PRINTING 748 (code-set-union 'ASCII_PRINTING 749 (hash-table-ref sets 'ASCII_GRAPHIC) 750 (hash-table-ref sets 'ASCII_WHITESPACE))) 751 (hash-table-put! sets 'ASCII_CONTROL 752 (rlet1 cs (make <char-code-set> :name 'ASCII_CONTROL) 753 (add-code-range! cs #x00 #x1f) 754 (add-code! cs #x7f))) 755 (hash-table-put! sets 'HEX_DIGIT 756 (rlet1 cs (make <char-code-set> :name 'HEX_DIGIT) 757 (add-code-range! cs 758 (char->integer #\0) 759 (char->integer #\9)) 760 (add-code-range! cs 761 (char->integer #\A) 762 (char->integer #\F)) 763 (add-code-range! cs 764 (char->integer #\a) 765 (char->integer #\f)))) 766 (hash-table-put! sets 'ASCII 767 (rlet1 cs (make <char-code-set> :name 'ASCII) 768 (add-code-range! cs 0 127))) 769 (hash-table-put! sets 'ASCII_WORD 770 (rlet1 cs (make <char-code-set> :name 'ASCII_WORD) 771 (add-code-range! cs 772 (char->integer #\0) 773 (char->integer #\9)) 774 (add-code-range! cs 775 (char->integer #\A) 776 (char->integer #\Z)) 777 (add-code-range! cs 778 (char->integer #\a) 779 (char->integer #\z)) 780 (add-code! cs (char->integer #\_)))) 781 (hash-table-put! sets 'LC 782 (code-set-union 'LC 783 (hash-table-ref sets 'Lt) 784 (hash-table-ref sets 'Ll) 785 (hash-table-ref sets 'Lu))) 786 (hash-table-put! sets 'EMPTY (make <char-code-set> :name 'EMPTY)) 787 sets) 788 789;; Break property values 790;; This goes to src/gauche/priv/unicode_attr.h 791;; 792;; The data structure is two-level tables that maps codepoint 0-1FFFFF to 793;; an octet. 794;; 795;; The first level, break_table, is 512-length byte vector. 796;; Upper 9bit of codepoint is used to index this table. If the entry 797;; is 255, the break property of that codepoint takes the default value. 798;; 799;; Otherwise, the first table entry is the subtable, break_subtable[N][], 800;; and the lower 8bit of codepoint is used to index the subtable. 801;; it returns an octet, in which upper 4 bit is for grapheme break property 802;; and lower 4 bit is for word break property. 803 804(define (generate-break-tables db) 805 (define subtable-count 0) 806 807 (define (gen-entry code) 808 (let1 e (ucd-get-break-property db code) 809 (if (not e) 810 (format #t " BREAK_ENTRY(GB_Other, WB_Other),\n") 811 (format #t " BREAK_ENTRY(GB_~a, WB_~a),\n" 812 (let1 gp (ucd-break-property-grapheme e) 813 (if (memq gp '(CR LF)) 814 'Other 815 gp)) 816 (let1 wp (ucd-break-property-word e) 817 (if (memq wp '(CR LF Single_Quote Double_Quote)) 818 'Other 819 wp)))))) 820 821 ;; returns subtable-number 822 (define (gen-subtable start-code) 823 (if (any?-ec (: lb 256) (ucd-get-break-property db (+ start-code lb))) 824 (rlet1 subtable-num subtable-count 825 (format #t " {\n") 826 (do-ec (: lb 256) (gen-entry (+ start-code lb))) 827 (format #t " },\n") 828 (inc! subtable-count)) 829 255)) 830 831 ;; Generate table of symbols 832 (define (gen-symbol-table constants prefix) 833 (receive (normals specials) (break not constants) 834 (for-each-with-index (^(i c) (format #t "#define ~a_~a ~a\n" prefix c i)) 835 normals) 836 (for-each-with-index (^(i c) (format #t "#define ~a_~a ~a\n" prefix c 837 (+ 16 i))) 838 (cdr specials))) 839 (format #t "static void init_~a_symbols(ScmModule *mod) {\n" prefix) 840 (for-each (^c 841 (and c 842 (format #t 843 "Scm_DefineConst(mod, \ 844 SCM_SYMBOL(SCM_INTERN(\"~a_~a\")),\ 845 SCM_MAKE_INT(~a_~a));\n" 846 prefix c prefix c))) 847 constants) 848 (print "}")) 849 850 (print) 851 (gen-symbol-table (ucd-grapheme-break-properties) "GB") 852 (gen-symbol-table (ucd-word-break-properties) "WB") 853 (print) 854 (print "#define BREAK_ENTRY(g, w) (((g)<<4)|(w))") 855 (print) 856 (print "static unsigned char break_subtable[][256] = {") 857 (let1 nlist (list-ec (: n 512) (gen-subtable (* n 256))) 858 (print "};") 859 (print) 860 (format #t "static unsigned char break_table[] = {") 861 (do-ec (:parallel (: n nlist) (:integers i)) 862 (begin (when (zero? (mod i 8)) (format #t "\n ")) 863 (format #t " ~3d," n))) 864 (format #t "\n};\n")) 865 ) 866 867;; EastAsianWidth property values 868;; This goes to src/gauche/priv/unicode_attr.h 869;; 870;; The data structure is two-level tables that maps codepoint 0-1FFFFF to 871;; an octet. 872;; 873;; The first level, width_table, is 512-length byte vector. 874;; Upper 9bit of codepoint is used to index this table. If the entry 875;; is one of WIDTH_x values, all codepoints of that range has that property 876;; value. 877;; 878;; Otherwise, the entry value - NUM_WIDTH_PROPERTIES points to the subtable, 879;; width_subtable[N][]. It is a vecto of nibbles; to look up, first take 880;; 1-7bit of the codepoint and look up an octet; if the LSB of codepoint 881;; is 0, take lower nibble; otherwise, take upper nibble. 882 883(define (generate-width-tables db) 884 (define prop-count (length (ucd-east-asian-widths))) 885 (define subtable-count prop-count) 886 (define width-table (~ db'width-table)) 887 888 ;; returns subtable-number 889 (define (gen-subtable start-code) 890 (let1 e (dict-get width-table start-code 'N) 891 (if (any?-ec (: lb 256) 892 (not (eq? (dict-get width-table (+ start-code lb) 'N) e))) 893 (rlet1 subtable-num subtable-count 894 (print " {") 895 (do-ec (: c 128) 896 (let ([w0 (dict-get width-table (+ start-code (* c 2)) 'N)] 897 [w1 (dict-get width-table (+ start-code (* c 2) 1) 'N)]) 898 (print #" WIDTH_ENTRY(WIDTH_~|w1|, WIDTH_~|w0|),"))) 899 (print " },") 900 (inc! subtable-count)) 901 ;; all codepoints of this range shares the value. 902 (find-index (cut eqv? e <>) (ucd-east-asian-widths))))) 903 904 ;; Generate table of symbols 905 (define (gen-symbol-table) 906 (for-each-with-index (^(i c) (format #t "#define WIDTH_~a ~a\n" c i)) 907 (ucd-east-asian-widths)) 908 (print #"#define NUM_WIDTH_PROPERTIES ~|prop-count|") 909 (print) 910 (print "static void init_WIDTH_symbols(ScmModule *mod) {") 911 (print #" ScmObj v = Scm_MakeVector(~|prop-count|, SCM_FALSE);") 912 (for-each-with-index 913 (^[i c] 914 (print #" SCM_VECTOR_ELEMENT(v, ~|i|) = SCM_INTERN(\"~|c|\");")) 915 (ucd-east-asian-widths)) 916 (print " Scm_DefineConst(mod, \ 917 SCM_SYMBOL(SCM_INTERN(\"*east-asian-widths*\")), \ 918 v);") 919 (print "}")) 920 921 (print) 922 (gen-symbol-table) 923 (print) 924 (print "#define WIDTH_ENTRY(a, b) (((a)<<4) | (b))") 925 (print) 926 (print "static unsigned char width_subtable[][128] = {") 927 (let1 nlist (list-ec (: n 512) (gen-subtable (* n 256))) 928 (print "};") 929 (print) 930 (format #t "static unsigned char width_table[] = {") 931 (do-ec (:parallel (: n nlist) (:integers i)) 932 (begin (when (zero? (mod i 8)) (format #t "\n ")) 933 (format #t " ~3d," n))) 934 (format #t "\n};\n")) 935 ) 936