1;;;
2;;;  gen-unicode.scm - generate unicode-handling tables
3;;;
4;;;    Originally written by Shiro Kawai, 2011
5;;;    Public Domain - use as you like.
6;;;
7
8;; Reads Unicode data tables and generates various source files.
9
10;; This script can serve three operations.
11;;
12;; (1) Generate unicode-data.scm from Unicode character database
13;;   It is only necessary when new version of Unicode is published,
14;;   and the resulting unicode-data.scm is checked in to the source
15;;   tree so that other developers don't need UCD.
16;;
17;;    gosh ./gen-unicode.scm --import <unicode-database-directory> unicode-data.scm
18;;
19;; (2) Generate source files from unicode-data.scm
20;;   To reduce runtime overhead, unicode properties are saved in binary
21;;   tables, within the following two generated source files:
22;;
23;;   char_attr.c                - General category and case mappings.
24;;   gauche/priv/unicode_attr.h - Grapheme break, word break, East-asian width
25;;
26;;   This is done when you build from git source tree.
27;;
28;;    gosh ./gen-unicode.scm --compile unicode-data.scm
29;;
30;; (3) Fetch Unicode character database files.  This is for developer's
31;;   convenience.
32;;
33;;    gosh ./gen-unicode.scm --fetch <unicode-database-directory> [<unicode-version>
34;;   Note that the original content in <unicode-database-directory> is
35;;   overwritten.
36;;   You might need to adjust code for newer version of Unicode.
37;;   See doc/HOWTO-unicode.txt for the details.
38
39(use srfi-1)
40(use srfi-13)
41(use srfi-42)
42(use text.csv)
43(use util.match)
44(use file.util)
45(use gauche.dictionary)
46(use gauche.sequence)
47(use gauche.record)
48(use gauche.charconv)
49(use gauche.uvector)
50
51;; needs to load from source
52(add-load-path "../lib" :relative)
53(use text.unicode.ucd)
54(use text.unicode.codeset)
55
56;; rfc.http is only required by '--fetch' operation.  We don't want to load
57;; it during build process, so let's autoload it.
58(autoload rfc.http http-get)
59
60
61;; We generate four kind of lookup structures.  Each structure consists
62;; of various types of tables in order to reduce the size.
63;;
64;; * General categories and case bits
65;;
66;;    This structure maps Unicode codepoint to a byte containing the
67;;    following info:
68;;
69;;    bit 7-6 - Alphabetic and case info
70;;                00 : non-alphabetic char
71;;                01 : lowercase alphabetic char
72;;                10 : uppercase alphabetic char
73;;                11 : caseless or titlecase alphabetic char
74;;    bit 5   - Reserved
75;;    bit 4-0 - General category (Lu, Nd, Cc, etc.)
76;;              See ucd-general-categories
77;;              and the enum in src/gauche/char_attr.h
78;;
79;;    For U+0000 to U+1ffff, we have a single table that directly maps
80;;    the codepoint to the above byte.
81;;
82;;      unsigned char ucs_general_category_00000[0x20000]  (131072 bytes)
83;;
84;;    Above U+20000 the mappings are sparse and mostly contiguous, so we
85;;    generate a C code that performs a binary search.
86;;
87;;      unsigned char ucs_general_category_20000(ScmChar code)
88;;
89;; * Case mapping
90;;
91;;    Almost all characters that needs case mappings are below #xffff.  In
92;;    Unicode 6.0, the only exceptions are 80 characters in U+104xx range
93;;    (Deseret).
94;;
95;;    Among case-mapped characters, almost all of them requires a simple
96;;    rule---that is, either a character can be lowercased by adding some
97;;    small value to the codepoint, or a character can be upcased and
98;;    titlecased by adding some small value to the codepoint.  These have
99;;    simple casemap entries.  Characters that requires more complex handling
100;;    have extended casemap entries.
101;;
102;;    The tables first maps a codepoint into 16bit entry.  The entry
103;;    represents either a simple casemap entry, or an index to an
104;;    extended casemap entry.
105;;
106;;    bit15 == 0:
107;;      This is a simple entry.  uppercase and titlecase is the same,
108;;      no special case mappings, and conversion between
109;;      (upper,title) <-> lower is simply done by adding the given offset.
110;;
111;;      If bit14 is 0, this letter is uppercase.  Converting to uppercase and
112;;                     titlecase is noop.  Converting to lowercase is to add
113;;                    the offset.
114;;      If bit14 is 1, this letter is lowercase.  Converting to lowercase is
115;;                     noop.  Converting to uppercase and titlecase is to add
116;;                     the offset.
117;;      bit13-bit0, singned integer offset [-8192, 8191]
118;;
119;;    bit15 == 1:
120;;      This is an extended entry (except #xffff).
121;;      bit14-bit0 is an index to an extended entry table.
122;;
123;;    #xffff indicates empty entry.
124;;
125;;    Characters that require case mapping tend to cluster, so we use two-staged
126;;    table to lookup the 16-bit entry from the codepoint below U+10000.
127;;
128;;    The bit 15-8 is the index of this table:
129;;
130;;       static unsiged char casemap_000[256]
131;;
132;;    If the value is 255, the character doesn't have case mappings.
133;;    Otherwise, let V be the value of the above lookup, the entry
134;;    can be looked up by the following table:
135;;
136;;       static unsiged char casemap_subtable[V][<lower 8 bit of codepoint>];
137;;
138;;    We only need to have 18 subtables.
139;;
140;; * Digit values
141;;
142;;    Characters with categrory Nd has associated numeric value 0..9.
143;;    A set of decimal numeric characters for 0..9 are always contiguous.
144;;    As of Unicode 6.2, we have 42 of such sets.
145;;
146;; * Break properties
147;;
148;;    This structure maps codepoint to the Grapheme_Cluster_Break,
149;;    Word_Break and Sentence_Break properties.
150;;
151;;    See http://www.unicode.org/reports/tr29/
152;;    http://www.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakProperty.txt
153;;    http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakProperty.txt
154;;    http://www.unicode.org/Public/UNIDATA/auxiliary/SentenceBreakProperty.txt
155;;
156;;    An entry is 8-bit, indicating the character's Word_Break and
157;;    Grapheme_Break properties.  In unicode_attr.h, they are
158;;    prefixed with WB_ and GB_, respectively.
159;;    Note that WB_CR, WB_LF, WB_Single_Quote, WB_Double_Quote,
160;;    GB_CR and GB_LF values are *not* stored in the table.  Each
161;;    property value is assigned to single character and we check
162;;    them separately.
163;;
164;;    bit7-4:  Word_Break property
165;;             10    CR     (not stored in the table)
166;;             11    LF     (not stored in the table)
167;;              0    Newline
168;;              1    Extend
169;;              2    Regional_Indicator
170;;              3    Format
171;;              4    Katakana
172;;              5    Hebrew_Letter
173;;              6    ALetter
174;;             12    Single_Quote (not stored in the table)
175;;             13    Double_Quote (not stored in the table)
176;;              7    MidLetter
177;;              8    MidNum
178;;              9    MidNumLet
179;;              a    Numeric
180;;              b    ExtendNumLet
181;;              c    WSegSpace
182;;              d    ZWJ
183;;              e    Other
184;;    bit3-0:  Graphene_Break property
185;;             10    CR (not stored in the table)
186;;             11    LF (not stored in the table)
187;;              0    Control
188;;              1    Extend
189;;              2    Regional_Indicator
190;;              3    Prepend
191;;              4    SpacingMark
192;;              5    L
193;;              6    V
194;;              7    T
195;;              8    LV
196;;              9    LVT
197;;              a    ZWJ
198;;              b    Other
199;;
200;;   Codepoints below U+20000 are looked up by two-staged tables.
201;;   First, look up this table with (codepoint >> 8).
202;;
203;;     static unsigned char break_table[0x200]
204;;
205;;   If the value is 255, both properties are 'Other'.
206;;   Otherwise, the value is an index to the secondary table.
207;;
208;;     static unsigned char break_subtable[index][256]
209;;
210;;   The value of this table encodes WB and GB properties.
211;;
212;;   Codepoints on or above U+20000 are all 'Other', except the following
213;;   ranges.  They are handled specially in the lookup procedure.
214;;
215;;    E0001          GB_Control, WB_Format
216;;    E0020..E007F   GB_Control, WB_Format
217;;    E0100..E01EF   GB_Extend, WB_Extend
218;;
219
220;;;
221;;;  Main entry
222;;;
223
224(define (main args)
225  (match (cdr args)
226    [("--fetch" dir . maybe-version) (apply fetch-ucd dir maybe-version)]
227    [("--import" dir ucdfile)
228     (unless (file-is-directory? dir)
229       (exit 1 "Directory required, but got: ~a" dir))
230     (with-output-to-file ucdfile
231       (cut ucd-save-db (ucd-parse-files dir)))]
232    [("--compile" ucdfile)
233     (unless (file-exists? ucdfile)
234       (exit 1 "Couldn't open unicode data file: ~a" ucdfile))
235     (generate-tables (call-with-input-file ucdfile ucd-load-db))]
236    [else
237     (exit 1 "Usage:\n\
238              gen-unicode.scm --fetch <unicode-database-dir> [<unicode-version>]\n\
239              gen-unicode.scm --import <unicode-database-dir> <ucd-file>\n\
240              gen-unicode.scm --compile <unicode-data.scm>")])
241  0)
242
243;;;
244;;; Fetching
245;;;
246(define (fetch-ucd dir :optional (version #f))
247  (let ([path (if version
248                #"/Public/~|version|/ucd"
249                "/Public/UCD/latest/ucd")]
250        [datafiles   '("UnicodeData.txt"
251                       "SpecialCasing.txt"
252                       "PropList.txt"
253                       "EastAsianWidth.txt"
254                       "auxiliary/GraphemeBreakProperty.txt"
255                       "auxiliary/SentenceBreakProperty.txt"
256                       "auxiliary/WordBreakProperty.txt")])
257    (make-directory* (build-path dir "auxiliary"))
258    (dolist [f datafiles]
259      (let1 p (build-path dir f)
260        (display #"Getting ~|f|... ") (flush)
261        (call-with-output-file p
262          (^o (http-get "www.unicode.org" (build-path path f)
263                        :sink o :flusher (^ _ #t))))
264        (display "done\n")))))
265
266;;;
267;;;  Generate source file
268;;;
269
270(define (generate-tables db)
271  (receive (char_attr.p char_attr.c) (sys-mkstemp "char_attr.c.")
272    (receive (unicode_attr.p unicode_attr.h) (sys-mkstemp "unicode_attr.h.")
273      (with-output-to-port char_attr.p
274        (^() (preamble db)
275          (generate-category-tables db)
276          (generate-case-tables db)
277          (generate-digit-value-tables db)))
278      (with-output-to-port unicode_attr.p
279        (^() (preamble db)
280          (generate-break-tables db)
281          (generate-width-tables db)))
282      (close-port char_attr.p)
283      (close-port unicode_attr.p)
284      (sys-rename char_attr.c "char_attr.c")
285      (sys-rename unicode_attr.h "gauche/priv/unicode_attr.h"))))
286
287(define (preamble db)
288  (print "/* Generated automatically from Unicode character database */")
289  (print "/* See src/gen-unicode.scm for the description of data structures. */")
290  (print #"/* Unicode version ~(ucd-version db).  Do not edit. */")
291  )
292
293;; NB: The caracter category tables are generated for each supported
294;; internal encodings.  The lookup function is defined in gauche/char_*.h,
295;; and must be in sync with the tables generated here.
296(define (generate-category-tables db)
297  (define (emit-entry e)
298    (if e
299      (cond
300       [(ucd-entry-uppercase e)  (format #t " U(~a)," (ucd-entry-category e))]
301       [(ucd-entry-lowercase e)  (format #t " L(~a)," (ucd-entry-category e))]
302       [(ucd-entry-alphabetic e) (format #t " A(~a)," (ucd-entry-category e))]
303       [else                 (format #t "    ~a," (ucd-entry-category e))])
304      (format #t "    Cn,")))
305  (define (emit-columns start end p)
306    (do-ec (:parallel (: k start end)
307                      (:integers i))
308           (begin
309             (cond [(zero? (mod i 256)) (format #t "\n /* ~5,'0x - */\n" k)]
310                   [(zero? (mod i 8))  (newline)])
311             (p k)))
312    (print))
313  (define (ucs->entry ucs) (ucd-get-entry db ucs))
314
315  ;; Setup
316  (dolist [c (ucd-general-categories)]
317    (format #t "#define ~a SCM_CHAR_CATEGORY_~a\n" c c))
318  (print "#undef A")
319  (print "#undef U")
320  (print "#undef L")
321  (print "#define A(x) ((x)|SCM_CHAR_ALPHABETIC_BITS)")
322  (print "#define U(x) ((x)|SCM_CHAR_UPPERCASE_BITS)")
323  (print "#define L(x) ((x)|SCM_CHAR_LOWERCASE_BITS)")
324
325  ;; utf-8 ('none' is also here, for we treat it as latin-1.)
326  (let ([code-sets (build-code-sets db 'utf8 (ucd-general-categories))])
327    (print "#if !defined(GAUCHE_CHAR_ENCODING_EUC_JP) && !defined(GAUCHE_CHAR_ENCODING_SJIS)")
328    ;;   U+0000 - U+1ffff : direct table lookup
329    (display "static unsigned char ucs_general_category_00000[] = {")
330    (emit-columns 0 #x20000 (^i (emit-entry (ucs->entry i))))
331    (print "};")
332    ;;   Over U+20000 - binary search
333    (print)
334    (print "static unsigned char ucs_general_category_20000(ScmChar code)")
335    (print "{")
336    (print "  /*")
337    (for-each (^e (format #t "    ~6x ~a\n" (car e) (cdr e)))
338              (ucd-get-category-ranges db))
339    (print "  */")
340    (generate-bisect (coerce-to <vector> (ucd-get-category-ranges db))
341                     (^e (format "return ~a;"
342                                 (case (cdr e)
343                                   [(Lo) "A(Lo)"]
344                                   [(#f) "Cn"]
345                                   [else => identity]))))
346    (print "}")
347    (for-each
348     (^s (dump-code-set-in-C (cdr s)))
349     (sort (hash-table->alist code-sets) string<? (^x (x->string (car x)))))
350    (print "#endif /*defined(GAUCHE_CHAR_ENCODING_UTF_8)*/")
351    (print))
352
353  ;; euc-jp
354  (let ([code-sets (build-code-sets db 'eucjp (ucd-general-categories))])
355    (print "#if defined(GAUCHE_CHAR_ENCODING_EUC_JP)")
356    (display "static unsigned char eucjp_general_category_G0[] = {")
357    (emit-columns 0 #x80 (^i (emit-entry (eucjp->ucd-entry db i))))
358    (print "};")
359    (display "static unsigned char eucjp_general_category_G1[] = {")
360    (dotimes [z (- #xff #xa1)]
361      (emit-columns (+ (ash (+ z #xa1) 8) #xa1)
362                    (+ (ash (+ z #xa1) 8) #xff)
363                    (^i (emit-entry (eucjp->ucd-entry db i)))))
364    (print "};")
365    (display "static unsigned char eucjp_general_category_G2[] = {")
366    (emit-columns #x8ea1 #x8ee0 (^i (emit-entry (eucjp->ucd-entry db i))))
367    (print "};")
368    (display "static unsigned char eucjp_general_category_G3[] = {")
369    (dotimes [z (- #xff #xa1)]
370      (emit-columns (+ (ash (+ z #xa1) 8) #x8f00a1)
371                    (+ (ash (+ z #xa1) 8) #x8f00ff)
372                    (^i (emit-entry (eucjp->ucd-entry db i)))))
373    (print "};")
374    (for-each
375     (^s (dump-code-set-in-C (cdr s)))
376     (sort (hash-table->alist code-sets) string<? (^x (x->string (car x)))))
377    (print "#endif /*defined(GAUCHE_CHAR_ENCODING_EUC_JP)*/")
378    (print))
379
380  ;; sjis
381  (let ([code-sets (build-code-sets db 'sjis (ucd-general-categories))])
382    (print "#if defined(GAUCHE_CHAR_ENCODING_SJIS)")
383    (display "static unsigned char sjis_general_category_00[] = {")
384    (emit-columns 0 #x80 (^i (emit-entry (sjis->ucd-entry db i))))
385    (print "};")
386    (display "static unsigned char sjis_general_category_a0[] = {")
387    (emit-columns #xa0 #xdf (^i (emit-entry (sjis->ucd-entry db i))))
388    (print "};")
389    (display "static unsigned char sjis_general_category_8000[] = {")
390    (do-ec (: z #x8000 #xa000 256)
391           (emit-columns (+ z #x40)
392                         (+ z #xfd)
393                         (^i (emit-entry (sjis->ucd-entry db i)))))
394    (print "};")
395    (display "static unsigned char sjis_general_category_e000[] = {")
396    (do-ec (: z #xe000 #x10000 256)
397           (emit-columns (+ z #x40)
398                         (+ z #xfd)
399                         (^i (emit-entry (sjis->ucd-entry db i)))))
400    (print "};")
401    (for-each
402     (^s (dump-code-set-in-C (cdr s)))
403     (sort (hash-table->alist code-sets) string<? (^x (x->string (car x)))))
404    (print "#endif /*defined(GAUCHE_CHAR_ENCODING_SJIS)*/")
405    (print))
406
407  ;; Bind predefined charset
408  ;; The CAT entry can be just a symbol or (alias symbol).
409  (print "static void init_predefined_charsets() {")
410  (dolist [cat (append (ucd-general-categories)
411                       '(L LC M N P S Z C
412                         (LETTER L) ASCII_LETTER
413                         (DIGIT Nd) ASCII_DIGIT
414                         LETTER_DIGIT ASCII_LETTER_DIGIT
415                         (LOWER Ll) ASCII_LOWER
416                         (UPPER Lu) ASCII_UPPER
417                         (TITLE Lt)
418                         GRAPHIC ASCII_GRAPHIC
419                         PRINTING ASCII_PRINTING
420                         (PUNCTUATION P) ASCII_PUNCTUATION
421                         (SYMBOL S) ASCII_SYMBOL
422                         (ISO_CONTROL Cc) ASCII_CONTROL
423                         HEX_DIGIT
424                         WHITESPACE ASCII_WHITESPACE
425                         BLANK ASCII_BLANK
426                         ASCII EMPTY
427                         ASCII_WORD (WORD ASCII_WORD)))]
428    (let1 setname (if (pair? cat) (car cat) cat)
429      (print #"  predef_sets[SCM_CHAR_SET_~|setname|] =")
430      (if (pair? cat)
431        (print #"    predef_sets[SCM_CHAR_SET_~(cadr cat)];")
432        (print #"    make_charset_~|setname|();"))))
433  (print "}")
434
435  ;; Teardown
436  (dolist [c (ucd-general-categories)]
437    (format #t "#undef ~a\n" c))
438  (format #t "#undef A\n")
439  (format #t "#undef U\n")
440  (format #t "#undef L\n")
441  (print))
442
443(define (generate-case-tables db)
444  (define subtables '())
445  (define extended '())
446  ;; returns a list of (code . ucd-entry-case-map) with U+HHHxx where HHH
447  ;; is given to the hi arg.
448  (define (gather-entries hb)
449    (fold-ec '() (: lb 256) (ucd-get-entry db (+ (* hb 256) lb))
450             (^(e seed) (if (or (not e) (not (ucd-entry-case-map e)))
451                          seed
452                          (acons (+ (* hb 256) lb) (ucd-entry-case-map e) seed)))))
453  ;; ucd-entry-alist :: ((code . ucd-entry-case-map) ...)
454  ;; subtable :: (start-code . #(....))
455  ;; returns (start-code . subtable-number)
456  (define (gen-subtable start-code ucd-entry-alist)
457    (if (null? ucd-entry-alist)
458      `(,start-code . 255)
459      (let* ([table-num (length subtables)]
460             [vec   (make-vector 256 #f)])
461        (dolist [p ucd-entry-alist]
462          (when (ucd-extended-case-map? (cdr p))
463            (push! extended (cdr p)))
464          (set! (~ vec (logand (car p) #xff)) (cdr p)))
465        (push! subtables (cons start-code vec))
466        (cons start-code table-num))))
467  ;;
468  (define (gather-toptable)
469    (list-ec (: hi 256) (gen-subtable (* hi 256) (gather-entries hi))))
470
471  ;;
472  (define (emit-subtable subtable)
473    (format #t "  /* ~4,'0x - ~4,'0x */\n"
474            (car subtable)
475            (+ (car subtable) 255))
476    (format #t "  {\n")
477    (dotimes [n 256]
478      (format #t "    ~20a,   /* 0x~4,'0x */\n"
479              (if-let1 cmap (~ (cdr subtable) n)
480                (if (ucd-simple-case-map? cmap)
481                  (format "~a(~a)"
482                          (if (eq? (ucd-simple-case-map-case cmap) 'upper)
483                            "TOLOWER"
484                            "TOUPPER")
485                          (ucd-simple-case-map-offset cmap))
486                  (format "EXTENDED(~a)"
487                          (find-index (cut eq? cmap <>) extended)))
488                "NO_CASE_MAPPING")
489              (+ (car subtable) n)))
490    (format #t "  },\n"))
491
492  (define (emit-subtables)
493    (format #t "static unsigned short casemap_subtable[][256] = {\n")
494    (for-each emit-subtable (reverse subtables))
495    (format #t "};\n\n"))
496
497  (define (emit-toptable subtable-num-alist)
498    (let1 vec (make-vector 256 255)
499      (dolist [s subtable-num-alist]
500        (set! (~ vec (div (car s) 256)) (cdr s)))
501      (format #t "static unsigned char casemap_000[] = {")
502      (dotimes [i 256]
503        (when (zero? (mod i 16)) (newline))
504        (format #t " ~3d," (~ vec i)))
505      (format #t "\n};\n")))
506
507  (define (emit-extended-case-maps)
508    (format #t "static ScmCharCaseMap extended_casemaps[] = {\n")
509    (dolist [e extended]
510      (let ([simple  (ucd-extended-case-map-simple-map e)]
511            [special (ucd-extended-case-map-special-map e)])
512        (format #t "  { ~s,~s,~s,"
513                (or (~ simple 0) 0)
514                (or (~ simple 1) 0)
515                (or (~ simple 2) 0))
516        (if special
517          (format #t "~a,~a,~a"
518                  (extended-special-array (~ special 0))
519                  (extended-special-array (~ special 1))
520                  (extended-special-array (~ special 2)))
521          (format #t "{-1},{-1},{-1}"))
522        (format #t "}, /* ~4,'0x */\n" (ucd-extended-case-map-code e))))
523    (format #t "};\n\n"))
524  (define (extended-special-array lis)
525    (apply format "{~a,~a,~a,-1}" (map (^i (list-ref lis i -1)) '(0 1 2))))
526
527  ;; body of generate-case-tables
528  (print "#define NO_CASE_MAPPING SCM_CHAR_NO_CASE_MAPPING")
529  (print "#define TOLOWER(x) SCM_CHAR_CASEMAP_TOLOWER(x)")
530  (print "#define TOUPPER(x) SCM_CHAR_CASEMAP_TOUPPER(x)")
531  (print "#define EXTENDED(x) SCM_CHAR_CASEMAP_EXTENDED(x)")
532  (let1 subtable-num-alist (gather-toptable)
533    (emit-extended-case-maps)
534    (emit-subtables)
535    (emit-toptable subtable-num-alist))
536  )
537
538(define (generate-bisect entries gen-value)
539  (define (bisect lo hi indent)
540    (if (= (+ lo 1) hi)
541      (format #t "~v,a~a\n" (* 2 indent) " " (gen-value (~ entries lo)))
542      (let1 mid (div (+ lo hi) 2)
543        (format #t "~v,aif (code < 0x~x) {\n"
544                (* 2 indent) " " (car (~ entries mid)))
545        (bisect lo mid (+ indent 1))
546        (format #t "~v,a} else {\n" (* 2 indent) " ")
547        (bisect mid hi (+ indent 1))
548        (format #t "~v,a}\n" (* 2 indent) " "))))
549  (bisect 0 (size-of entries) 1))
550
551;; Digit-value tables.  Note that we'll have a shortcut for the first
552;; chunk [0x30, 0x39], so we only generate for the second chunk and after
553(define (generate-digit-value-tables db)
554  (let1 ranges ($ reverse
555                  $ fold (^[p s] (match s
556                                   [() `(,p (0 . #f))]
557                                   [((_ . last) . rest)
558                                    (if (= (car p) last)
559                                      `(,p ,@s)
560                                      `(,p (,last . #f) ,@s))]))
561                  '()
562                  $ cdr $ (cut sort-by <> car) $ filter identity
563                  $ ucd-map-entries db
564                  (^[code entry]
565                    (and (eq? (ucd-entry-category entry) 'Nd)
566                         (zero? (ucd-entry-digit-value entry))
567                         (cons code (+ code 10)))))
568
569    (print)
570    (print "static int ucs_digit_value(ScmChar code)")
571    (print "{")
572    (dolist [r ranges]
573      (format #t "  /* ~5,'0x- ~a */\n" (car r) (if (cdr r) "Nd" "* ")))
574    (generate-bisect (coerce-to <vector> ranges)
575                     (^e (if (cdr e)
576                           (format "return (code - 0x~x);" (car e))
577                           "return -1;")))
578    (print "}")))
579
580;; Predefined character sets
581
582(define (for-each-char-code/none proc)
583  (dotimes [n 256] (proc n n)))
584
585(define (for-each-char-code/ucs proc)
586  (dotimes [n #x2ffff] (proc n n))
587  (proc #xe0000 #xe007f) ; Language tags; Cf
588  (proc #xe0100 #xe01ef) ; Variation selectors; Mn
589  (proc #xf0000 #xffffd) ; Private use; Co
590  (proc #x100000 #x10fffd) ; Private use; Co
591  )
592
593(define (for-each-char-code/euc-jp proc)
594  (dotimes [n 256] (proc n n))
595  (do-ec (: n #x8ea1 #x8eff) (proc n n))
596  (do-ec (: n #xa1a1 #xfeff) (proc n n))
597  (do-ec (: n #x8fa1a1 #x8ffef7) (proc n n)))
598
599(define (for-each-char-code/sjis proc)
600  (dotimes [n 128] (proc n n))
601  (do-ec (: n #xa0 #xe0) (proc n n))
602  (do-ec (: n #xfd #x100) (proc n n))
603  (do-ec (: n #x8140 #x9ffd) (proc n n))
604  (do-ec (: n #xe040 #xfcfd) (proc n n)))
605
606(define (build-code-sets db encoding categories)
607  (define sets (make-hash-table 'eq?))
608  (define walker (ecase encoding
609                   [(none) for-each-char-code/none]
610                   [(utf8) for-each-char-code/ucs]
611                   [(eucjp) for-each-char-code/euc-jp]
612                   [(sjis) for-each-char-code/sjis]))
613  (define get-entry
614    (let ([eucjp-entry-cache (make-hash-table 'eqv?)]
615          [sjis-entry-cache (make-hash-table 'eqv?)])
616      (ecase encoding
617        [(none utf8) ucd-get-entry]
618        [(eucjp) (^[db n]
619                   (if-let1 e (hash-table-get eucjp-entry-cache n #f)
620                     (if (eq? e 'nothing) #f e)
621                     (rlet1 e (eucjp->ucd-entry db n)
622                       (hash-table-put! eucjp-entry-cache n (or e 'nothing)))))]
623        [(sjis) (^[db n]
624                  (if-let1 e (hash-table-get sjis-entry-cache n #f)
625                    (if (eq? e 'nothing) #f e)
626                    (rlet1 e (sjis->ucd-entry db n)
627                      (hash-table-put! sjis-entry-cache n (or e 'nothing)))))]
628        )))
629  (define (register set cat n m)
630    (and-let1 e (get-entry db n)
631      (if (eq? cat (ucd-entry-category e))
632        (if (= n m)
633          (add-code! set n)
634          (add-code-range! set n m)))))
635  ;; general category charsets (char-set:Lt etc.)
636  (dolist [cat categories]
637    (let1 cs (make <char-code-set> :name cat)
638      (walker (cut register cs cat <> <>))
639      (hash-table-put! sets cat cs)))
640  ;; general category class charsets (char-set:L etc.)
641  (dolist [gcats (group-collection categories
642                                  :key (^c (string-ref (symbol->string c) 0)))]
643    (let1 cs
644        (make <char-code-set>
645          :name (string->symbol (substring (symbol->string (car gcats)) 0 1)))
646      (for-each (^c (walker (cut register cs c <> <>))) gcats)
647      (hash-table-put! sets (~ cs'name) cs)))
648  ;; srfi-14 charsets (the ones that has equivalent set in general category set
649  ;; is handled implicitly.
650  (hash-table-put! sets 'ASCII_UPPER
651                   (rlet1 cs (make <char-code-set> :name 'ASCII_UPPER)
652                     (add-code-range! cs
653                                      (char->integer #\A)
654                                      (char->integer #\Z))))
655  (hash-table-put! sets 'ASCII_LOWER
656                   (rlet1 cs (make <char-code-set> :name 'ASCII_LOWER)
657                     (add-code-range! cs
658                                      (char->integer #\a)
659                                      (char->integer #\z))))
660  (hash-table-put! sets 'ASCII_LETTER
661                   (code-set-union 'ASCII_LETTER
662                                   (hash-table-ref sets 'ASCII_UPPER)
663                                   (hash-table-ref sets 'ASCII_LOWER)))
664  (hash-table-put! sets 'ASCII_DIGIT
665                   (rlet1 cs (make <char-code-set> :name 'ASCII_DIGIT)
666                     (add-code-range! cs
667                                      (char->integer #\0)
668                                      (char->integer #\9))))
669  (hash-table-put! sets 'LETTER_DIGIT
670                   (code-set-union 'LETTER_DIGIT
671                                   (hash-table-ref sets 'L)
672                                   (hash-table-ref sets 'Nd)))
673  (hash-table-put! sets 'ASCII_LETTER_DIGIT
674                   (code-set-union 'ASCII_LETTER_DIGIT
675                                   (hash-table-ref sets 'ASCII_LETTER)
676                                   (hash-table-ref sets 'ASCII_DIGIT)))
677  (hash-table-put! sets 'ASCII_WHITESPACE
678                   (rlet1 cs (make <char-code-set> :name 'ASCII_WHITESPACE)
679                     (add-code-range! cs 9 13) ;TAB,LF,LTAB,FF,CR
680                     (add-code! cs (char->integer #\space))))
681  (hash-table-put! sets 'WHITESPACE
682                   (code-set-union 'WHITESPACE
683                                   (hash-table-ref sets 'ASCII_WHITESPACE)
684                                   (hash-table-ref sets 'Z)))
685  (hash-table-put! sets 'ASCII_BLANK
686                   (rlet1 cs (make <char-code-set> :name 'ASCII_BLANK)
687                     (add-code! cs 9) ;TAB
688                     (add-code! cs (char->integer #\space))))
689  (hash-table-put! sets 'BLANK
690                   (code-set-union 'BLANK
691                                   (hash-table-ref sets 'ASCII_BLANK)
692                                   (hash-table-ref sets 'Zs)))
693  (hash-table-put! sets 'ASCII_PUNCTUATION
694                   (rlet1 cs (make <char-code-set> :name 'ASCII_PUNCTUATION)
695                     (add-code! cs 33)  ;!
696                     (add-code! cs 34)  ;"
697                     (add-code! cs 35)  ;#
698                     (add-code! cs 37)  ;%
699                     (add-code! cs 38)  ;&
700                     (add-code! cs 39)  ;'
701                     (add-code! cs 40)  ;(
702                     (add-code! cs 41)  ;)
703                     (add-code! cs 42)  ;*
704                     (add-code! cs 44)  ;,
705                     (add-code! cs 45)  ;-
706                     (add-code! cs 46)  ;.
707                     (add-code! cs 47)  ;/
708                     (add-code! cs 58)  ;:
709                     (add-code! cs 59)  ;;
710                     (add-code! cs 63)  ;?
711                     (add-code! cs 64)  ;@
712                     (add-code! cs 91)  ;[
713                     (add-code! cs 92)  ;\
714                     (add-code! cs 93)  ;)
715                     (add-code! cs 95)  ;_
716                     (add-code! cs 123) ;{
717                     (add-code! cs 125) ;}
718                     ))
719  (hash-table-put! sets 'ASCII_SYMBOL
720                   (rlet1 cs (make <char-code-set> :name 'ASCII_SYMBOL)
721                     (add-code! cs 36)  ;$
722                     (add-code! cs 43)  ;+
723                     (add-code! cs 60)  ;<
724                     (add-code! cs 61)  ;=
725                     (add-code! cs 62)  ;>
726                     (add-code! cs 94)  ;^
727                     (add-code! cs 96)  ;`
728                     (add-code! cs 124) ;|
729                     (add-code! cs 126) ;~
730                     ))
731  (hash-table-put! sets 'GRAPHIC
732                   (code-set-union 'GRAPHIC
733                                   (hash-table-ref sets 'L)
734                                   (hash-table-ref sets 'N)
735                                   (hash-table-ref sets 'P)
736                                   (hash-table-ref sets 'S)))
737  (hash-table-put! sets 'ASCII_GRAPHIC
738                   (code-set-union 'ASCII_GRAPHIC
739                                   (hash-table-ref sets 'ASCII_LETTER)
740                                   (hash-table-ref sets 'ASCII_DIGIT)
741                                   (hash-table-ref sets 'ASCII_PUNCTUATION)
742                                   (hash-table-ref sets 'ASCII_SYMBOL)))
743  (hash-table-put! sets 'PRINTING
744                   (code-set-union 'PRINTING
745                                   (hash-table-ref sets 'GRAPHIC)
746                                   (hash-table-ref sets 'WHITESPACE)))
747  (hash-table-put! sets 'ASCII_PRINTING
748                   (code-set-union 'ASCII_PRINTING
749                                   (hash-table-ref sets 'ASCII_GRAPHIC)
750                                   (hash-table-ref sets 'ASCII_WHITESPACE)))
751  (hash-table-put! sets 'ASCII_CONTROL
752                   (rlet1 cs (make <char-code-set> :name 'ASCII_CONTROL)
753                     (add-code-range! cs #x00 #x1f)
754                     (add-code! cs #x7f)))
755  (hash-table-put! sets 'HEX_DIGIT
756                   (rlet1 cs (make <char-code-set> :name 'HEX_DIGIT)
757                     (add-code-range! cs
758                                      (char->integer #\0)
759                                      (char->integer #\9))
760                     (add-code-range! cs
761                                      (char->integer #\A)
762                                      (char->integer #\F))
763                     (add-code-range! cs
764                                      (char->integer #\a)
765                                      (char->integer #\f))))
766  (hash-table-put! sets 'ASCII
767                   (rlet1 cs (make <char-code-set> :name 'ASCII)
768                     (add-code-range! cs 0 127)))
769  (hash-table-put! sets 'ASCII_WORD
770                   (rlet1 cs (make <char-code-set> :name 'ASCII_WORD)
771                     (add-code-range! cs
772                                      (char->integer #\0)
773                                      (char->integer #\9))
774                     (add-code-range! cs
775                                      (char->integer #\A)
776                                      (char->integer #\Z))
777                     (add-code-range! cs
778                                      (char->integer #\a)
779                                      (char->integer #\z))
780                     (add-code! cs (char->integer #\_))))
781  (hash-table-put! sets 'LC
782                   (code-set-union 'LC
783                                   (hash-table-ref sets 'Lt)
784                                   (hash-table-ref sets 'Ll)
785                                   (hash-table-ref sets 'Lu)))
786  (hash-table-put! sets 'EMPTY (make <char-code-set> :name 'EMPTY))
787  sets)
788
789;; Break property values
790;; This goes to src/gauche/priv/unicode_attr.h
791;;
792;; The data structure is two-level tables that maps codepoint 0-1FFFFF to
793;; an octet.
794;;
795;; The first level, break_table, is 512-length byte vector.
796;; Upper 9bit of codepoint is used to index this table.  If the entry
797;; is 255, the break property of that codepoint takes the default value.
798;;
799;; Otherwise, the first table entry is the subtable, break_subtable[N][],
800;; and the lower 8bit of codepoint is used to index the subtable.
801;; it returns an octet, in which upper 4 bit is for grapheme break property
802;; and lower 4 bit is for word break property.
803
804(define (generate-break-tables db)
805  (define subtable-count 0)
806
807  (define (gen-entry code)
808    (let1 e (ucd-get-break-property db code)
809      (if (not e)
810        (format #t "    BREAK_ENTRY(GB_Other, WB_Other),\n")
811        (format #t "    BREAK_ENTRY(GB_~a, WB_~a),\n"
812                (let1 gp (ucd-break-property-grapheme e)
813                  (if (memq gp '(CR LF))
814                    'Other
815                    gp))
816                (let1 wp (ucd-break-property-word e)
817                  (if (memq wp '(CR LF Single_Quote Double_Quote))
818                    'Other
819                    wp))))))
820
821  ;; returns subtable-number
822  (define (gen-subtable start-code)
823    (if (any?-ec (: lb 256) (ucd-get-break-property db (+ start-code lb)))
824      (rlet1 subtable-num subtable-count
825        (format #t "  {\n")
826        (do-ec (: lb 256) (gen-entry (+ start-code lb)))
827        (format #t "  },\n")
828        (inc! subtable-count))
829      255))
830
831  ;; Generate table of symbols
832  (define (gen-symbol-table constants prefix)
833    (receive (normals specials) (break not constants)
834      (for-each-with-index (^(i c) (format #t "#define ~a_~a ~a\n" prefix c i))
835                           normals)
836      (for-each-with-index (^(i c) (format #t "#define ~a_~a ~a\n" prefix c
837                                           (+ 16 i)))
838                           (cdr specials)))
839    (format #t "static void init_~a_symbols(ScmModule *mod) {\n" prefix)
840    (for-each (^c
841               (and c
842                    (format #t
843                            "Scm_DefineConst(mod, \
844                          SCM_SYMBOL(SCM_INTERN(\"~a_~a\")),\
845                          SCM_MAKE_INT(~a_~a));\n"
846                            prefix c prefix c)))
847              constants)
848    (print "}"))
849
850  (print)
851  (gen-symbol-table (ucd-grapheme-break-properties) "GB")
852  (gen-symbol-table (ucd-word-break-properties) "WB")
853  (print)
854  (print "#define BREAK_ENTRY(g, w)  (((g)<<4)|(w))")
855  (print)
856  (print "static unsigned char break_subtable[][256] = {")
857  (let1 nlist (list-ec (: n 512) (gen-subtable (* n 256)))
858    (print "};")
859    (print)
860    (format #t "static unsigned char break_table[] = {")
861    (do-ec (:parallel (: n nlist) (:integers i))
862           (begin (when (zero? (mod i 8)) (format #t "\n   "))
863                  (format #t " ~3d," n)))
864    (format #t "\n};\n"))
865  )
866
867;; EastAsianWidth property values
868;; This goes to src/gauche/priv/unicode_attr.h
869;;
870;; The data structure is two-level tables that maps codepoint 0-1FFFFF to
871;; an octet.
872;;
873;; The first level, width_table, is 512-length byte vector.
874;; Upper 9bit of codepoint is used to index this table.  If the entry
875;; is one of WIDTH_x values, all codepoints of that range has that property
876;; value.
877;;
878;; Otherwise, the entry value - NUM_WIDTH_PROPERTIES points to the subtable,
879;; width_subtable[N][].  It is a vecto of nibbles; to look up, first take
880;; 1-7bit of the codepoint and look up an octet; if the LSB of codepoint
881;; is 0, take lower nibble; otherwise, take upper nibble.
882
883(define (generate-width-tables db)
884  (define prop-count (length (ucd-east-asian-widths)))
885  (define subtable-count prop-count)
886  (define width-table (~ db'width-table))
887
888  ;; returns subtable-number
889  (define (gen-subtable start-code)
890    (let1 e (dict-get width-table start-code 'N)
891      (if (any?-ec (: lb 256)
892                   (not (eq? (dict-get width-table (+ start-code lb) 'N) e)))
893        (rlet1 subtable-num subtable-count
894          (print "  {")
895          (do-ec (: c 128)
896                 (let ([w0 (dict-get width-table (+ start-code (* c 2)) 'N)]
897                       [w1 (dict-get width-table (+ start-code (* c 2) 1) 'N)])
898                   (print #"    WIDTH_ENTRY(WIDTH_~|w1|, WIDTH_~|w0|),")))
899          (print "  },")
900          (inc! subtable-count))
901        ;; all codepoints of this range shares the value.
902        (find-index (cut eqv? e <>) (ucd-east-asian-widths)))))
903
904  ;; Generate table of symbols
905  (define (gen-symbol-table)
906    (for-each-with-index (^(i c) (format #t "#define WIDTH_~a ~a\n" c i))
907                         (ucd-east-asian-widths))
908    (print #"#define NUM_WIDTH_PROPERTIES ~|prop-count|")
909    (print)
910    (print "static void init_WIDTH_symbols(ScmModule *mod) {")
911    (print #"  ScmObj v = Scm_MakeVector(~|prop-count|, SCM_FALSE);")
912    (for-each-with-index
913     (^[i c]
914       (print #"  SCM_VECTOR_ELEMENT(v, ~|i|) = SCM_INTERN(\"~|c|\");"))
915     (ucd-east-asian-widths))
916    (print "  Scm_DefineConst(mod, \
917                 SCM_SYMBOL(SCM_INTERN(\"*east-asian-widths*\")), \
918                 v);")
919    (print "}"))
920
921  (print)
922  (gen-symbol-table)
923  (print)
924  (print "#define WIDTH_ENTRY(a, b)  (((a)<<4) | (b))")
925  (print)
926  (print "static unsigned char width_subtable[][128] = {")
927  (let1 nlist (list-ec (: n 512) (gen-subtable (* n 256)))
928    (print "};")
929    (print)
930    (format #t "static unsigned char width_table[] = {")
931    (do-ec (:parallel (: n nlist) (:integers i))
932           (begin (when (zero? (mod i 8)) (format #t "\n   "))
933                  (format #t " ~3d," n)))
934    (format #t "\n};\n"))
935  )
936