1#lang racket/base
2
3;; This script parses UnicodeData.txt (the standard Unicode database,
4;; available from the web) and other such files, and it produces
5;; "schuchar.inc" and "schustr.inc". The former is used by
6;; scheme_isalpha, etc., and thus `char-alphabetic?', etc. The latter
7;; is used for string operations.
8
9;; Run as
10;;   racket mk-uchar.rkt
11;; in the script's directory, and have a copy of UnicodeData.txt, etc.
12;; in the "Unicode" directory. The file schuchar.inc will be
13;; overwritten.
14
15(require racket/list)
16
17(define mark-cats '("Mn" "Mc" "Me"))
18(define letter-cats '("Lu" "Ll" "Lt" "Lm" "Lo"))
19(define digit-cats '("Nd" "No" "Nl"))
20(define space-cats '("Zl" "Zs" "Zp"))
21(define punc-cats '("Pc" "Pd" "Ps" "Pe" "Pi" "Pf" "Po"))
22(define sym-cats '("Sm" "Sc" "Sk" "So"))
23(define sympart-non-cats '("Ps" "Pe" "Pi" "Pf" "Zl" "Zs" "Zp"))
24(define graphic-cats (append mark-cats
25			     letter-cats
26			     digit-cats
27			     punc-cats
28			     sym-cats))
29
30(define cases (cons (make-hash) (box 0)))
31
32(define (indirect t v limit)
33  (let ([r (hash-ref (car t) v (lambda () #f))])
34    (or r
35	(let ([r (unbox (cdr t))])
36	  (set-box! (cdr t) (add1 r))
37	  (hash-set! (car t) v r)
38	  (when (r . > . limit)
39	    (error "too many indirects"))
40	  r))))
41
42(define (combine . l)
43  ;; The scheme_is...() macros in scheme.h must match
44  ;;  the bit layout produced here
45  (let loop ([l l][v 0])
46    (if (null? l)
47	v
48	(loop (cdr l) (bitwise-ior (arithmetic-shift v 1)
49				   (if (car l)
50				       1
51				       0))))))
52
53(define (combine-case up down title fold combining)
54  (indirect cases (list up down title fold combining) 256))
55
56(define general-categories (make-hash))
57(define (combine-cat cat)
58  (hash-ref general-categories cat
59            (lambda ()
60              (let ([v (hash-count general-categories)])
61                (hash-set! general-categories cat v)
62                v))))
63;; So they're in order:
64(with-input-from-file "schgencat.h"
65  (lambda ()
66    (let loop ()
67      (let ([l (read-line)])
68	(unless (eof-object? l)
69	  (let ([m (regexp-match #rx"mzu_([A-Z][a-z])" l)])
70	    (when m
71	      (combine-cat (cadr m))))
72	  (loop))))))
73
74(define hexes (map char->integer (string->list "0123456789abcdefABCDEF")))
75
76(define combining-class-ht (make-hasheq))
77
78;; In principle, adjust this number to tune the result, but
79;;  the macros for accessing the table (in scheme.h) need to
80;;  be updated accordingly.
81;; In practice, it's unlikely that anything will ever work
82;;  much better than 8.
83(define low-bits 8)
84
85(define low (sub1 (expt 2 low-bits)))
86(define hi-count (expt 2 (- 21 low-bits)))
87(define hi (arithmetic-shift (sub1 hi-count) low-bits))
88
89(define top (make-vector hi-count #f))
90(define top2 (make-vector hi-count #f))
91(define top3 (make-vector hi-count #f))
92
93(define range-bottom 0)
94(define range-top -1)
95(define range-v -1)
96(define range-v2 -1)
97(define range-v3 -1)
98(define ranges null)
99
100(define ccount 0)
101
102(define (map1 c v v2 v3 cc)
103  (hash-set! combining-class-ht c cc)
104  (set! ccount (add1 ccount))
105  (if (= c (add1 range-top))
106      (begin
107	(unless (and (= v range-v)
108		     (= v2 range-v2)
109		     (= v3 range-v3))
110	  (set! range-v -1))
111	(set! range-top c))
112      (begin
113	;; Drop surrogate from range.
114	;;  At the time of implementation, the following
115	;;  was never executed, because #D7FF wasn't mapped:
116	(when (and (< range-bottom #xD800)
117		   (> range-top #xD800))
118	  (set! ranges (cons (list range-bottom #xD7FF (range-v . > . -1))
119			     ranges))
120	  (set! range-bottom #xE000))
121	;; ... but this one was executed.
122	(when (= range-bottom #xD800)
123	  (set! range-bottom #xE000))
124	(set! ranges (cons (list range-bottom range-top (range-v . > . -1))
125			   ranges))
126	(set! range-bottom c)
127	(set! range-top c)
128	(set! range-v v)
129	(set! range-v2 v2)
130	(set! range-v3 v3)))
131  (let ([top-index (arithmetic-shift c (- low-bits))])
132    (let ([vec (vector-ref top top-index)]
133	  [vec2 (vector-ref top2 top-index)]
134	  [vec3 (vector-ref top3 top-index)])
135      (unless vec
136	(vector-set! top top-index (make-vector (add1 low))))
137      (unless vec2
138	(vector-set! top2 top-index (make-vector (add1 low))))
139      (unless vec3
140	(vector-set! top3 top-index (make-vector (add1 low))))
141      (let ([vec (vector-ref top top-index)]
142	    [vec2 (vector-ref top2 top-index)]
143	    [vec3 (vector-ref top3 top-index)])
144	(vector-set! vec (bitwise-and c low) v)
145	(vector-set! vec2 (bitwise-and c low) v2)
146	(vector-set! vec3 (bitwise-and c low) v3)))))
147
148(define (mapn c from v v2 v3 cc)
149  (if (= c from)
150      (map1 c v v2 v3 cc)
151      (begin
152	(map1 from v v2 v3 cc)
153	(mapn c (add1 from) v v2 v3 cc))))
154
155(define (set-compose-initial! c)
156  (let ([top-index (arithmetic-shift c (- low-bits))])
157    (let ([vec (vector-ref top top-index)]
158	  [i (bitwise-and c low) ])
159      (vector-set! vec i (bitwise-ior #x8000 (vector-ref vec i))))))
160
161(define (string->codes s)
162  (let ([m (regexp-match #rx"^[^0-9A-F]*([0-9A-F]+)" s)])
163    (if m
164	(cons (string->number (cadr m) 16)
165	      (string->codes (substring s (string-length (car m)))))
166	null)))
167
168;; This code assumes that Final_Sigma is the only condition that we care about:
169(define case-foldings (make-hash))
170(define special-case-foldings (make-hash))
171(call-with-input-file "Unicode/CaseFolding.txt"
172  (lambda (i)
173    (let loop ()
174      (let ([l (read-line i)])
175	(unless (eof-object? l)
176	  (let ([m (regexp-match #rx"^([0-9A-F]+); *([CSF]) *;([^;]*)" l)])
177	    (when m
178	      (let ([code (string->number (cadr m) 16)]
179		    [variant (list-ref m 2)]
180		    [folded (string->codes (list-ref m 3))])
181		(if (string=? variant "F")
182		    (hash-set! special-case-foldings code folded)
183		    (hash-set! case-foldings code (car folded))))))
184	  (loop))))))
185
186;; This code assumes that Final_Sigma is the only condition that we care about:
187(define special-casings (make-hash))
188(define-struct special-casing (lower upper title folding final-sigma?) #:mutable)
189(call-with-input-file "Unicode/SpecialCasing.txt"
190  (lambda (i)
191    (let loop ()
192      (let ([l (read-line i)])
193	(unless (eof-object? l)
194	  (let ([m (regexp-match #rx"^([0-9A-F]+);([^;]*);([^;]*);([^;]*);([^;]*)" l)])
195	    (when (and m
196		       (regexp-match #rx"^(?:(?: *Final_Sigma *)|(?: *))(?:$|[;#].*)" (list-ref m 5)))
197	      (let ([code (string->number (cadr m) 16)]
198		    [lower (string->codes (list-ref m 2))]
199		    [upper (string->codes (list-ref m 4))]
200		    [title (string->codes (list-ref m 3))]
201		    [final-sigma? (and (regexp-match #rx"Final_Sigma" (list-ref m 5)) #t)])
202		(let ([folding (list (hash-ref case-foldings code (lambda () code)))])
203		  (hash-set! special-casings code (make-special-casing lower upper title folding final-sigma?))))))
204	  (loop))))))
205
206(define lower-case  (make-hash))
207(define upper-case  (make-hash))
208(define alphabetic  (make-hash))
209(define case-ignorable (make-hash))
210
211(with-input-from-file "Unicode/DerivedCoreProperties.txt"
212  (lambda ()
213    (let loop ()
214      (let ([l (read-line)])
215	(unless (eof-object? l)
216	  (let ([m (regexp-match #rx"^([0-9A-F.]+) *; ((Lower|Upper)case|Alphabetic|Case_Ignorable)" l)])
217	    (when m
218	      (let* ([start (string->number (car (regexp-match #rx"^[0-9A-F]+" (car m))) 16)]
219		     [end (let ([m (regexp-match #rx"^[0-9A-F]+[.][.]([0-9A-F]+)" (car m))])
220			    (if m
221				(string->number (cadr m) 16)
222				start))]
223		     [t (cond
224                         [(string=? (caddr m) "Lowercase") lower-case]
225                         [(string=? (caddr m) "Uppercase") upper-case]
226                         [(string=? (caddr m) "Alphabetic") alphabetic]
227                         [(string=? (caddr m) "Case_Ignorable") case-ignorable]
228                         [else (error "unknown property section")])])
229		(let loop ([i start])
230		  (hash-set! t i #t)
231		  (unless (= i end)
232		    (loop (add1 i)))))))
233	  (loop))))))
234
235(define white_spaces (make-hash))
236
237(with-input-from-file "Unicode/PropList.txt"
238  (lambda ()
239    (let loop ()
240      (let ([l (read-line)])
241	(unless (eof-object? l)
242	  (let ([m (regexp-match #rx"^([0-9A-F.]+) *; White_Space" l)])
243	    (when m
244	      (let* ([start (string->number (car (regexp-match #rx"^[0-9A-F]+" (car m))) 16)]
245		     [end (let ([m (regexp-match #rx"^[0-9A-F]+[.][.]([0-9A-F]+)" (car m))])
246			    (if m
247				(string->number (cadr m) 16)
248				start))])
249		(let loop ([i start])
250		  (hash-set! white_spaces i #t)
251		  (unless (= i end)
252		    (loop (add1 i)))))))
253	  (loop))))))
254
255(define decomp-ht (make-hasheq))
256(define k-decomp-ht (make-hasheq))
257(define compose-initial-ht (make-hasheq))
258(define compose-map (make-hash))
259(define do-not-compose-ht (make-hash))
260
261(with-input-from-file "Unicode/CompositionExclusions.txt"
262  (lambda ()
263    (let loop ()
264      (let ([l (read-line)])
265	(unless (eof-object? l)
266	  (let ([m (regexp-match #rx"^([0-9A-F.]+)" l)])
267	    (when m
268	      (let ([code (string->number (car m) 16)])
269		(hash-set! do-not-compose-ht code #t))))
270	  (loop))))))
271
272(define (composition-key a b)
273  ;; If `a` and `b` are both in the BMP (i.e., both fit in 16 bits),
274  ;; map to a 32-bit key.
275  (bitwise-ior (arithmetic-shift (bitwise-and a #xFFFF) 16)
276               (bitwise-and b #xFFFF)
277               (arithmetic-shift
278                (bitwise-ior (arithmetic-shift (arithmetic-shift a -16)
279                                               5)
280                             (arithmetic-shift b -16))
281                32)))
282
283(define (composition-key-first k)
284  (bitwise-ior (bitwise-and (arithmetic-shift k -16) #xFFFF)
285               (arithmetic-shift (arithmetic-shift k -37) 16)))
286
287(define (extract-decomp decomp code)
288  (if (string=? decomp "")
289      #f
290      (let ([m (regexp-match #rx"^([0-9A-F]+) ?([0-9A-F]*)$" decomp)])
291	(if m
292	    ;; Canonical decomp
293	    (let ([a (string->number (cadr m) 16)]
294		  [b (if (string=? "" (caddr m))
295			 0
296			 (string->number (caddr m) 16))])
297	      ;; Canonical composition?
298	      (when (and (positive? b)
299			 (not (hash-ref do-not-compose-ht
300                                        code
301                                        (lambda () #f))))
302		(hash-set! compose-initial-ht a #t)
303		(let ([key (composition-key a b)])
304		  (when (hash-ref compose-map key (lambda () #f))
305		    (error 'decomp "composition already mapped: ~x for: ~x" key code))
306		  (hash-set! compose-map key code)))
307	      (hash-set! decomp-ht code (cons a b))
308	      #t)
309	    ;; Compatibility decomp
310	    (let ([seq
311		   (let loop ([str (cadr (regexp-match #rx"^<[^>]*> *(.*)$" decomp))])
312		     (let ([m (regexp-match #rx"^([0-9A-F]+) *(.*)$" str)])
313		       (if m
314			   (cons (string->number (cadr m) 16)
315				 (loop (caddr m)))
316			   null)))])
317	      (hash-set! k-decomp-ht code seq)
318	      #t)))))
319
320(define default-casing (make-hash))
321
322(call-with-input-file "Unicode/UnicodeData.txt"
323  (lambda (i)
324    (let loop ([prev-code 0])
325      (let ([l (read-line i)])
326	(unless (eof-object? l)
327	  (let ([m (regexp-match #rx"^([0-9A-F]+);([^;]*);([^;]*);([^;]*);[^;]*;([^;]*);[^;]*;[^;]*;([^;]*);[^;]*;[^;]*;[^;]*;([^;]*);([^;]*);([^;]*)"
328				 l)])
329	    (unless m
330	      (printf "no match: ~a\n" l))
331	    (let ([code (string->number (cadr m) 16)]
332		  [name (caddr m)]
333		  [cat (cadddr m)]
334		  [combining (string->number (cadddr (cdr m)))]
335		  [decomp (cadddr (cddr m))]
336                  [numeric (cadddr (cdddr m))]
337		  [up (string->number (cadddr (cddddr m)) 16)]
338		  [down (string->number (cadddr (cddddr (cdr m))) 16)]
339		  [title (string->number (cadddr (cddddr (cddr m))) 16)])
340              (hash-set! default-casing code (list down up title))
341              (let ([alphabetic? (hash-ref alphabetic code #f)]
342                    [numeric? (not (string=? numeric ""))]
343                    [symbolic? (member cat sym-cats)]
344                    [punctuation? (member cat punc-cats)])
345                (mapn code
346                      (if (regexp-match #rx", Last>" name)
347                          (add1 prev-code)
348                          code)
349                      ;; The booleans below are in most-siginficant-bit-first order
350                      (combine
351                       ;; Decomposition
352                       (extract-decomp decomp code)
353                       ;; special-casing
354                       (or (hash-ref special-casings code (lambda () #f))
355                           (hash-ref special-case-foldings code (lambda () #f)))
356                       ;; case-ignoreable
357                       (hash-ref case-ignorable code #f)
358                       ;; graphic
359                       (or alphabetic?
360                           numeric?
361                           symbolic?
362                           punctuation?
363                           (member cat graphic-cats))
364                       ;; lowercase:
365                       (hash-ref lower-case code (lambda () #f))
366                       #;
367                       (and (not (<= #x2000 code #x2FFF))
368                            (not down)
369                            (or up
370                                (regexp-match #rx"SMALL LETTER" name)
371                                (regexp-match #rx"SMALL LIGATURE" name)))
372                       ;; uppercase;
373                       (hash-ref upper-case code (lambda () #f))
374                       #;
375                       (and (not (<= #x2000 code #x2FFF))
376                            (not up)
377                            (or down
378                                (regexp-match #rx"CAPITAL LETTER" name)
379                                (regexp-match #rx"CAPITAL LIGATURE" name)))
380                       ;; titlecase:
381                       (string=? cat "Lt")
382                       ;; letter
383                       alphabetic?
384                       #;
385                       (member cat letter-cats)
386                       ;; digit
387                       numeric?
388                       #;
389                       (member cat digit-cats)
390                       ;; SOMETHING - this bit not yet used
391                       #f
392                       ;; whitespace
393                       (hash-ref white_spaces code #f)
394                       #;
395                       (or (member cat space-cats)
396                       (member code '(#x9 #xa #xb #xc #xd #x85)))
397                       ;; control
398                       (or (<= #x0000 code #x001F)
399                           (<= #x007F code #x009F))
400                       ;; punctuation
401                       punctuation?
402                       ;; symbol
403                       symbolic?
404                       ;; blank
405                       (or (string=? cat "Zs")
406                           (= code #x9)))
407                      ;; Cases
408                      (combine-case
409                       (if up (- up code) 0)
410                       (if down (- down code) 0)
411                       (if title (- title code) 0)
412                       (let ([case-fold (hash-ref case-foldings code (lambda () #f))])
413                         (if case-fold (- case-fold code) 0))
414                       combining)
415                      ;; Category
416                      (combine-cat cat)
417                      ;; Combining class - used again to filter initial composes
418                      combining))
419	      (loop code))))))))
420
421(hash-for-each compose-initial-ht
422               (lambda (k v)
423                 ;; A canonical decomposition that starts with a non-0 combining
424                 ;;  class is not re-created in a canonical composition. There
425                 ;;  are only two such leading character as of Unicode 4.0:
426                 ;;  U+0308 and U+0F71.
427                 (when (zero? (hash-ref combining-class-ht k))
428                   (set-compose-initial! k))))
429
430;; Remove compositions from compose map that start with
431;;  a character whose combining class is not 0. As of Unicode
432;;  4.0, there are only four of these: U+0344, U+0F73,
433;;  U+0F75, and U+0F81.
434(for-each (lambda (k)
435	    (let ([a (composition-key-first k)])
436	      (unless (zero? (hash-ref combining-class-ht a))
437		(hash-remove! compose-map k))))
438	  (hash-map compose-map (lambda (k v) k)))
439
440(define k-decomp-map-ht (make-hasheq))
441(define k-decomp-strs-ht (make-hash))
442(define k-decomp-strs-len 0)
443(define k-decomp-strs null)
444
445(define (fold-decomp s)
446  (cond
447   [(empty? s) empty]
448   [(empty? (cdr s))
449    (let ([code (car s)])
450      (let ([v (hash-ref decomp-ht code (lambda () #f))])
451	(if v
452	    (if (zero? (cdr v))
453		(fold-decomp (list (car v)))
454		(fold-decomp (list (car v) (cdr v))))
455	    (let ([v (hash-ref k-decomp-ht code (lambda () #f))])
456	      (if v
457		  (fold-decomp v)
458		  (list code))))))]
459   [else (append (fold-decomp (list (car s)))
460		 (fold-decomp (cdr s)))]))
461
462(for-each
463 (lambda (p)
464   (let* ([code (car p)]
465	  [seq (fold-decomp (cdr p))]
466	  [pos (hash-ref k-decomp-strs-ht seq
467                         (lambda ()
468                           (begin0
469                            k-decomp-strs-len
470                            (hash-set! k-decomp-strs-ht seq
471                                       k-decomp-strs-len)
472                            (set! k-decomp-strs
473                                  (append (reverse seq) k-decomp-strs))
474                            (set! k-decomp-strs-len (+ k-decomp-strs-len
475                                                       (length seq))))))])
476     (hash-set! k-decomp-map-ht code (cons pos (length seq)))))
477 ;; Sort to keep it deterministic:
478 (sort (hash-map k-decomp-ht cons)
479       (lambda (a b) (< (car a) (car b)))))
480
481
482(define vectors (make-hash))
483(define vectors2 (make-hash))
484(define vectors3 (make-hash))
485
486(define pos 0)
487(define pos2 0)
488(define pos3 0)
489
490(current-output-port (open-output-file "schuchar.inc" #:exists 'truncate/replace))
491
492(define (hash-vectors! top vectors get-pos set-pos!)
493  (let loop ([i 0])
494    (unless (= i hi-count)
495      (let ([vec (vector-ref top i)])
496	(when vec
497	  (unless (hash-ref vectors vec (lambda () #f))
498	    (set-pos! (add1 (get-pos)))
499	    (hash-set! vectors vec (get-pos))))
500	(loop (add1 i))))))
501
502(hash-vectors! top vectors (lambda () pos) (lambda (v) (set! pos v)))
503(hash-vectors! top2 vectors2 (lambda () pos2) (lambda (v) (set! pos2 v)))
504(hash-vectors! top3 vectors3 (lambda () pos3) (lambda (v) (set! pos3 v)))
505
506;; copy folding special cases to the special-cases table, if not there already:
507(hash-for-each special-case-foldings
508               (lambda (k v)
509                 (let ([sc (hash-ref special-casings k (lambda ()
510                                                         (define d (hash-ref default-casing k))
511                                                         (let ([sc (make-special-casing
512                                                                    (list (or (car d) k))
513                                                                    (list (or (cadr d) k))
514                                                                    (list (or (caddr d) k))
515                                                                    (list k)
516                                                                    #f)])
517                                                           (hash-set! special-casings k sc)
518                                                           sc)))])
519                   (set-special-casing-folding! sc v))))
520
521(define world-count (expt 2 10))
522
523(printf "/* Generated by mk-uchar.rkt */\n\n")
524
525(printf "/* Character count: ~a */\n" ccount)
526(printf "/* Total bytes for all tables: ~a */\n\n"
527	(+ (* (add1 low)
528	      (* 2 (add1 (length (hash-map vectors cons)))))
529	   (* (add1 low)
530	      (* 1 (add1 (length (hash-map vectors2 cons)))))
531	   (* (add1 low)
532	      (* 1 (add1 (length (hash-map vectors3 cons)))))
533	   (* (hash-count decomp-ht)
534	      8)
535	   (* (hash-count compose-map)
536	      2)
537	   (* (hash-count k-decomp-map-ht) (+ 4 1 2))
538	   (* 2 k-decomp-strs-len)
539	   (* 4 4 (unbox (cdr cases)))
540	   (* 4 (* 2 hi-count))))
541
542(printf (string-append
543	 "/* Each of the following maps a character to a value\n"
544	 "   via the scheme_uchar_find() macro in scheme.h. */\n\n"))
545
546(printf "/* Character properties: */\n")
547(printf "READ_ONLY unsigned short *scheme_uchar_table[~a];\n" hi-count)
548
549(printf "\n/* Character case mapping as index into scheme_uchar_ups, etc.: */\n")
550(printf "READ_ONLY unsigned char *scheme_uchar_cases_table[~a];\n" hi-count)
551
552(printf "\n/* Character general categories: */\n")
553(printf "READ_ONLY unsigned char *scheme_uchar_cats_table[~a];\n" hi-count)
554
555(printf "\n/* The udata... arrays are used by init_uchar_table to fill the above mappings.*/\n\n")
556
557(define print-row
558 (lambda (vec name pos hex?)
559   (printf " /* ~a */\n" name)
560   (let loop ([i 0])
561     (printf (if hex? " 0x~x~a" " ~a~a")
562	     (or (vector-ref vec i) "0")
563	     (if (and (= name pos)
564		      (= i low))
565		 "" ","))
566     (when (zero? (modulo (add1 i) 16))
567	 (newline))
568     (unless (= i low)
569       (loop (add1 i))))))
570
571(define (print-table type suffix vectors pos hex?)
572  (printf "READ_ONLY static unsigned ~a udata~a[] = {\n" type suffix)
573  (print-row (make-vector (add1 low) 0) 0 pos hex?)
574  (map (lambda (p)
575	 (print-row (car p) (cdr p) pos hex?))
576       (sort (hash-map vectors cons)
577             (lambda (a b) (< (cdr a) (cdr b)))))
578  (printf "};\n"))
579(print-table "short" "" vectors pos #t)
580(printf "\n")
581(print-table "char" "_cases" vectors2 pos2 #f)
582(print-table "char" "_cats" vectors3 pos3 #f)
583
584(printf "\n/* Case mapping size: ~a */\n" (hash-count (car cases)))
585(printf "/* Find an index into the ups, downs, etc. table for a character\n")
586(printf "   by using scheme_uchar_cases_table; then, the value at the index\n")
587(printf "   is relative to the original character (except for combining class,\n")
588(printf "   of course). */\n")
589
590(define (print-shift t end select type name)
591  (printf "\nREAD_ONLY ~a scheme_uchar_~a[] = {\n" type name)
592  (for-each (lambda (p)
593	      (printf " ~a~a"
594		      (select (car p))
595		      (if (= (cdr p) (sub1 end))
596			  ""
597			  ","))
598	      (when (zero? (modulo (add1 (cdr p)) 16))
599		(newline)))
600	    (sort (hash-map t cons)
601                  (lambda (a b) (< (cdr a) (cdr b)))))
602  (printf " };\n"))
603
604(print-shift (car cases) (unbox (cdr cases)) car "int" "ups")
605(print-shift (car cases) (unbox (cdr cases)) cadr "int" "downs")
606(print-shift (car cases) (unbox (cdr cases)) caddr "int" "titles")
607(print-shift (car cases) (unbox (cdr cases)) cadddr "int" "folds")
608(print-shift (car cases) (unbox (cdr cases)) (lambda (x) (cadddr (cdr x))) "unsigned char" "combining_classes")
609
610(let ([l (sort (hash-map general-categories cons)
611               (lambda (a b) (< (cdr a) (cdr b))))])
612  (printf "\n#define NUM_GENERAL_CATEGORIES ~a\n" (length l))
613  (printf "READ_ONLY static const char *general_category_names[] = {")
614  (for-each (lambda (c)
615	      (printf (if (zero? (cdr c))
616			  "\n  ~s"
617			  ",\n  ~s")
618		      (string-downcase (car c))))
619	    l)
620  (printf "\n};\n"))
621
622(set! ranges (cons (list range-bottom range-top (range-v . > . -1))
623		   ranges))
624
625(printf "\n#define NUM_UCHAR_RANGES ~a\n" (length ranges))
626(printf "\n#define URANGE_VARIES 0x40000000\n")
627(printf "READ_ONLY static int mapped_uchar_ranges[] = {\n")
628(for-each (lambda (r)
629	    (printf "  0x~x, 0x~x~a~a\n"
630		    (car r)
631		    (cadr r)
632		    (if (caddr r) "" " | URANGE_VARIES")
633		    (if (= (cadr r) range-top)
634			""
635			",")))
636	  (reverse ranges))
637(printf "};\n")
638
639(printf "\nstatic void init_uchar_table(void)\n{\n")
640(printf "  int i;\n\n")
641(printf "  for (i = 0; i < ~a; i++) { \n" hi-count)
642(printf "    scheme_uchar_table[i] = udata;\n")
643(printf "    scheme_uchar_cases_table[i] = udata_cases;\n")
644(printf "    scheme_uchar_cats_table[i] = udata_cats;\n")
645(printf "  }\n")
646(printf "\n")
647(define (print-init top vectors suffix)
648  (let loop ([i 0])
649    (unless (= i hi-count)
650      (let ([vec (vector-ref top i)])
651	(if vec
652	    (let ([same-count (let loop ([j (add1 i)])
653				(if (equal? vec (vector-ref top j))
654				    (loop (add1 j))
655				    (- j i)))]
656		  [vec-pos (* (add1 low) (hash-ref vectors vec))])
657	      (if (> same-count 4)
658		  (begin
659		    (printf "  for (i = ~a; i < ~a; i++) {\n"
660			    i (+ i same-count))
661		    (printf "    scheme_uchar~a_table[i] = udata~a + ~a;\n"
662			    suffix suffix
663			    vec-pos)
664		    (printf "  }\n")
665		    (loop (+ same-count i)))
666		  (begin
667		    (printf "  scheme_uchar~a_table[~a] = udata~a + ~a;\n"
668			    suffix
669			    i
670			    suffix
671			    vec-pos)
672		    (loop (add1 i)))))
673	    (loop (add1 i)))))))
674(print-init top vectors "")
675(print-init top2 vectors2 "_cases")
676(print-init top3 vectors3 "_cats")
677(printf "}\n")
678
679;; ----------------------------------------
680
681(current-output-port (open-output-file "schustr.inc" #:exists 'truncate/replace))
682
683(printf "/* Generated by mk-uchar.rkt */\n\n")
684
685(define specials null)
686(define special-count 0)
687(define (register-special l)
688  (let ([l (reverse l)])
689    (unless (let loop ([l l][specials specials])
690	      (cond
691	       [(null? l) #t]
692	       [(null? specials) #f]
693	       [(= (car l) (car specials)) (loop (cdr l) (cdr specials))]
694	       [else #f]))
695      (set! specials (append l specials))
696      (set! special-count (+ special-count (length l))))
697    (- special-count (length l))))
698
699(printf "#define NUM_SPECIAL_CASINGS ~a\n\n" (hash-count special-casings))
700(printf "READ_ONLY static int uchar_special_casings[] = {\n")
701(printf "  /* code,  down len, off,  up len, off,  title len, off,  fold len, off,  final-sigma? */\n")
702(let ([n (hash-count special-casings)])
703  (for-each (lambda (p)
704	      (set! n (sub1 n))
705	      (let ([code (car p)]
706		    [sc (cdr p)])
707		(let ([lower-start (register-special (special-casing-lower sc))]
708		      [upper-start (register-special (special-casing-upper sc))]
709		      [title-start (register-special (special-casing-title sc))]
710		      [folding-start (register-special (special-casing-folding sc))])
711		  (printf "  ~a,  ~a, ~a,  ~a, ~a,  ~a, ~a,  ~a, ~a,  ~a~a"
712			  code
713			  (length (special-casing-lower sc)) lower-start
714			  (length (special-casing-upper sc)) upper-start
715			  (length (special-casing-title sc)) title-start
716			  (length (special-casing-folding sc)) folding-start
717			  (if (special-casing-final-sigma? sc) 1 0)
718			  (if (zero? n) " " ",\n")))))
719	    (sort (hash-map special-casings cons)
720                  (lambda (a b) (< (car a) (car b))))))
721(printf "};\n")
722(printf "\n/* Offsets in scheme_uchar_special_casings point into here: */\n")
723(printf "READ_ONLY static int uchar_special_casing_data[] = {\n  ")
724(let ([n 0])
725  (for-each (lambda (v)
726	      (printf
727	       (cond
728		[(zero? n) "~a"]
729		[(zero? (modulo n 16)) ",\n  ~a"]
730		[else ", ~a"])
731	       v)
732	      (set! n (add1 n)))
733	    (reverse specials)))
734(printf " };\n")
735
736(printf "\n#define SPECIAL_CASE_FOLD_MAX ~a\n" (apply
737						max
738						(hash-map
739						 special-casings
740						 (lambda (k v)
741						   (length (special-casing-folding v))))))
742
743
744
745
746(let ()
747  (define (make-composes-table ps)
748    (list->vector (sort ps (lambda (a b) (< (car a) (car b))))))
749
750  (define canon-composes
751    (make-composes-table (for/list ([(k v) (in-hash compose-map)]
752                                    #:when (k . <= . #xFFFFFFFF))
753                           (cons k v))))
754  (define count (vector-length canon-composes))
755
756  (define long-canon-composes
757    (make-composes-table (for/list ([(k v) (in-hash compose-map)]
758                                    #:when (k . > . #xFFFFFFFF))
759                           (cons k v))))
760  (define long-count (vector-length long-canon-composes))
761
762  (define-values (all-composes decomp-vector long-composes)
763    (let ([decomp-pos-ht (make-hasheq)]
764	  [counter count]
765	  [extra null]
766	  [long-counter 0]
767	  [longs null])
768      (hash-for-each decomp-ht
769                     (lambda (k v)
770                       ;; Use table of composed shorts:
771                       (let ([key (composition-key (car v) (cdr v))])
772                         (let ([pos
773                                (if (and ((car v) . <= . #xFFFF)
774                                         ((cdr v) . <= . #xFFFF))
775                                    (if (hash-ref compose-map key (lambda () #f))
776                                        ;; Find index in comp vector:
777                                        (let loop ([i 0])
778                                          (if (= key (car (vector-ref canon-composes i)))
779                                              i
780                                              (loop (add1 i))))
781                                        ;; Add to compose table:
782                                        (begin0
783                                         counter
784                                         (set! extra (cons (cons key #f) extra))
785                                         (set! counter (add1 counter))))
786                                    ;; Use table of long+long sequences:
787                                    (begin
788                                      (set! long-counter (add1 long-counter))
789                                      (set! longs (cons (cdr v) (cons (car v) longs)))
790                                      (- long-counter)))])
791                           (hash-set! decomp-pos-ht k pos))))
792                     ;; sort:
793                     #t)
794      (values
795       (list->vector (append (vector->list canon-composes)
796			     (reverse extra)))
797       (list->vector
798	(sort (hash-map decomp-pos-ht cons)
799              (lambda (a b) (< (car a) (car b)))))
800       (list->vector (reverse longs)))))
801
802  (printf "\n/* Subset of ~a decompositions used for canonical composition: */\n"
803	  (vector-length all-composes))
804  (printf "#define COMPOSE_TABLE_SIZE ~a\n\n" count)
805
806  (let ([print-compose-data
807	 (lambda (type suffix which composes count hex? row-len)
808	   (printf "READ_ONLY static ~a utable_~a[] = {\n"
809		   type suffix)
810	   (let loop ([i 0])
811	     (let ([v (which (vector-ref composes i))])
812	       (if (= i (sub1 count))
813		   (printf (format " ~a\n};\n" (if hex? "0x~x" "~a")) v)
814		   (begin
815		     (printf (format " ~a," (if hex? "0x~x" "~a")) v)
816		     (when (zero? (modulo (add1 i) row-len))
817		       (newline))
818		     (loop (add1 i)))))))])
819    (printf "/* utable_compose_pairs contains BMP pairs that form a canonical decomposition.\n")
820    (printf "   The first COMPOSE_TABLE_SIZE are also canonical compositions, and they are\n")
821    (printf "   sorted, so that a binary search can find the pair; the utable_compose_result\n")
822    (printf "   table is in parallel for those COMPOSE_TABLE_SIZE to indicate the composed\n")
823    (printf "   characters. Use scheme_needs_maybe_compose() from scheme.h to check whether\n")
824    (printf "   a character might start a canonical decomposition. A zero as the second element\n")
825    (printf "   of a composition means that it is a singleton decomposition.\n")
826    (printf "   The entire utable_compose_pairs table is referenced by utable_decomp_indices\n")
827    (printf "   to map characters to canonical decompositions.\n")
828    (printf "   None of the [de]composition tables includes Hangol. */\n")
829    (print-compose-data "unsigned int" "compose_pairs" car all-composes (vector-length all-composes) #t 8)
830    (print-compose-data "unsigned int" "compose_result" cdr canon-composes count #t 8)
831    (printf "\n")
832    (printf "/* utable_compose_long_pairs contains a sequence of character pairs where at\n")
833    (printf "   least one is outside the BMP, so it doesn't fit in utable_compose_pairs.\n")
834    (printf "   Negative values in utable_decomp_indices map to this table; add one to\n")
835    (printf "   the mapped index, negate, then multiply by 2 to find the pair. */\n")
836    (print-compose-data "unsigned int" "compose_long_pairs" values long-composes (vector-length long-composes) #t 8)
837    (printf "\n")
838    (printf "/* utable_canon_compose_long_pairs repeats information from utable_compose_long_pairs,\n")
839    (printf "   but for canonical compositions only. The two characters are combined by putting the\n")
840    (printf "   lower 16 bits of the combined numbers in the low 32 bits, and then the next higher 10\n")
841    (printf "   bits provide the remaining 5 bits of each character, and the array is sorted. The\n")
842    (printf "   canon_compose_long_result array provides in parellel the composed character. */\n")
843    (printf "#define LONG_COMPOSE_TABLE_SIZE ~a\n\n" long-count)
844    (print-compose-data "mzlonglong" "canon_compose_long_pairs" car long-canon-composes long-count #t 8)
845    (print-compose-data "unsigned int" "canon_compose_long_result" cdr long-canon-composes long-count #t 8)
846    (printf "\n")
847    (printf "/* utable_decomp_keys identifies characters that have a canonical decomposition;\n")
848    (printf "   it is sorted, so binary search can be used, but use scheme_needs_decompose()\n")
849    (printf "   from scheme.h to first determine whether a character may have a mapping in this table.\n")
850    (printf "   (If scheme_needs_decompose(), may instead have a mapping in the kompat table.).\n")
851    (printf "   The parallel utable_decomp_indices maps the corresponding character in this table\n")
852    (printf "   to a composition pair in either utable_compose_pairs (when the index is positive) or\n")
853    (printf "   utable_long_compose_pairs (when the index is negative). */\n")
854    (printf "#define DECOMPOSE_TABLE_SIZE ~a\n\n" (vector-length decomp-vector))
855    (print-compose-data "unsigned int" "decomp_keys" car decomp-vector (vector-length decomp-vector) #t 8)
856    (print-compose-data "short" "decomp_indices" cdr decomp-vector (vector-length decomp-vector) #f 8)
857
858    (let ([k-decomp-vector
859	   (list->vector
860	    (sort (hash-map k-decomp-map-ht cons)
861                  (lambda (a b) (< (car a) (car b)))))])
862      (printf "\n")
863      (printf "/* utable_kompat_decomp_keys identifies characters that have a compatibility decomposition;\n")
864      (printf "   it is sorted, and scheme_needs_decompose() is true for every key (but a character\n")
865      (printf "   with scheme_needs_decompose(), may instead have a mapping in the canonical table.).\n")
866      (printf "   The parallel utable_kompat_decomp_indices maps the corresponding character in this table\n")
867      (printf "   to a composition string in kompat_decomp_strs with a length determined by the\n")
868      (printf "   utable_kompat_decomp_lens table. The decomposition never contains characters that need\n")
869      (printf "   further decomposition. */\n")
870      (printf "\n#define KOMPAT_DECOMPOSE_TABLE_SIZE ~a\n\n"  (vector-length k-decomp-vector))
871      (print-compose-data "unsigned int" "kompat_decomp_keys" car k-decomp-vector (vector-length k-decomp-vector) #t 8)
872      (print-compose-data "char" "kompat_decomp_lens" cddr
873			  k-decomp-vector (vector-length k-decomp-vector) #f 24)
874      (print-compose-data "short" "kompat_decomp_indices" cadr
875			  k-decomp-vector (vector-length k-decomp-vector) #f 16)
876      (let ([l (list->vector (reverse k-decomp-strs))])
877	(print-compose-data "unsigned short" "kompat_decomp_strs" values l (vector-length l) #t 8)))))
878