1;;;
2;;; srfi-14.scm - character set
3;;;
4;;;   Copyright (c) 2000-2020  Shiro Kawai  <shiro@acm.org>
5;;;
6;;;   Redistribution and use in source and binary forms, with or without
7;;;   modification, are permitted provided that the following conditions
8;;;   are met:
9;;;
10;;;   1. Redistributions of source code must retain the above copyright
11;;;      notice, this list of conditions and the following disclaimer.
12;;;
13;;;   2. Redistributions in binary form must reproduce the above copyright
14;;;      notice, this list of conditions and the following disclaimer in the
15;;;      documentation and/or other materials provided with the distribution.
16;;;
17;;;   3. Neither the name of the authors nor the names of its contributors
18;;;      may be used to endorse or promote products derived from this
19;;;      software without specific prior written permission.
20;;;
21;;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22;;;   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23;;;   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24;;;   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25;;;   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26;;;   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
27;;;   TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28;;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29;;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30;;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31;;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32;;;
33
34;; Basic operators are built in the Gauche kernel.  This module
35;; defines auxiliary procedures.
36;;
37;; Some of the procedures are implemented with no optimization
38;; effort.   I included them just for completeness.
39;; If you think you need faster operation, please optimize it and
40;; send me a patch.
41;;
42;; ucs-range->char-set performs poorly if the given charcter code
43;; is outside ASCII and the native character encoding is not utf-8.
44
45;; Functions supported by the core interpreter:
46;;    char-set char-set? char-set-contains? char-set-copy
47;;    char-set-complement char-set-complement!
48;;    %char-set-equal? %char-set-add-chars! %char-set-add-range!
49;;    %char-set-add! %char-set-ranges
50;; Functions not in SRFI but defined here
51;;    integer-range->char-set integer-range->char-set!
52
53(define-module srfi-14
54  (export char-set= char-set<=
55          char-set-cursor char-set-ref char-set-cursor-next end-of-char-set?
56          char-set-fold char-set-unfold char-set-unfold!
57          char-set-for-each char-set-map
58          list->char-set list->char-set! string->char-set string->char-set!
59          char-set-filter char-set-filter!
60          ucs-range->char-set ucs-range->char-set!
61          integer-range->char-set integer-range->char-set!
62          ->char-set
63          char-set-count char-set->list char-set->string
64          char-set-every char-set-any
65          char-set-adjoin char-set-adjoin! char-set-delete char-set-delete!
66          char-set-complement char-set-complement!
67          char-set-union char-set-union!
68          char-set-intersection char-set-intersection!
69          char-set-difference char-set-difference!
70          char-set-xor char-set-xor!
71          char-set-diff+intersection char-set-diff+intersection!
72          char-set:lower-case char-set:upper-case char-set:title-case
73          char-set:letter char-set:digit char-set:letter+digit
74          char-set:graphic char-set:printing char-set:whitespace
75          char-set:iso-control char-set:punctuation char-set:symbol
76          char-set:hex-digit char-set:blank char-set:ascii
77          char-set:empty char-set:full
78
79          ;; The followings are defined in core
80          char-set char-set? char-set-contains? char-set-copy
81          char-set-hash char-set-size
82          char-set-complement char-set-complement!
83          ))
84(select-module srfi-14)
85
86;; used in ->char-set
87(autoload gauche.lazy x->lseq)
88
89;; some built-in support
90(define %char-set-equal? (with-module gauche.internal %char-set-equal?))
91(define %char-set<=? (with-module gauche.internal %char-set<=?))
92(define %char-set-add-chars! (with-module gauche.internal %char-set-add-chars!))
93(define %char-set-add-range! (with-module gauche.internal %char-set-add-range!))
94(define %char-set-add! (with-module gauche.internal %char-set-add!))
95(define %char-set-ranges (with-module gauche.internal %char-set-ranges))
96(define %char-set-predefined (with-module gauche.internal %char-set-predefined))
97
98(define char-set-complement  (with-module gauche char-set-complement))
99(define char-set-complement! (with-module gauche char-set-complement!))
100
101;; Predefined charsets - built-in
102
103;;-------------------------------------------------------------------
104;; Comparison
105(define char-set=
106  (case-lambda [() #t]
107               [args (every %char-set-equal? args (cdr args))]))
108
109(define char-set<=
110  (case-lambda [() #t]
111               [args (every %char-set<=? args (cdr args))]))
112
113;;-------------------------------------------------------------------
114;; Iteration
115;;
116
117;; slow implementation.  minimal error check.
118;; cursor === (code . ranges) | #f
119
120(define (char-set-cursor cs)
121  (let1 ranges (%char-set-ranges cs)
122    (if (null? ranges) #f (cons (caar ranges) ranges))))
123
124(define (char-set-ref cs cursor)
125  (if (and (pair? cursor) (integer? (car cursor)))
126    (integer->char (car cursor))
127    (error "bad character-set cursor:" cursor)))
128
129(define (char-set-cursor-next cs cursor)
130  (if (pair? cursor)
131    (let ([code (car cursor)]
132          [range (cadr cursor)])
133      (cond [(< code (cdr range))  (cons (+ code 1) (cdr cursor))]
134            [(null? (cddr cursor)) #f]
135            [else (cons (caaddr cursor) (cddr cursor))]))
136    (error "bad character-set cursor:" cursor)))
137
138(define (end-of-char-set? cursor) (not cursor))
139
140;; functional ops
141
142(define (char-set-fold kons knil cs)
143  (let loop ([cursor (char-set-cursor cs)]
144             [result knil])
145    (if (end-of-char-set? cursor)
146      result
147      (loop (char-set-cursor-next cs cursor)
148            (kons (char-set-ref cs cursor) result)))))
149
150(define (char-set-unfold! pred fun gen seed base)
151  (let loop ([seed seed])
152    (if (pred seed)
153      base
154      (let1 c (fun seed)
155        (%char-set-add-range! base c c)
156        (loop (gen seed))))))
157
158(define (char-set-unfold pred fun gen seed :optional (base char-set:empty))
159  (char-set-unfold! pred fun gen seed (char-set-copy base)))
160
161(define (char-set-for-each proc cs)
162  (let loop ([cursor (char-set-cursor cs)])
163    (unless (end-of-char-set? cursor)
164      (proc (char-set-ref cs cursor))
165      (loop (char-set-cursor-next cs cursor)))))
166
167(define (char-set-map proc cs)
168  (rlet1 new (char-set)
169    (let loop ([cursor (char-set-cursor cs)])
170      (unless (end-of-char-set? cursor)
171        (let1 c (proc (char-set-ref cs cursor))
172          (%char-set-add-range! new c c)
173          (loop (char-set-cursor-next cs cursor)))))))
174
175;;-------------------------------------------------------------------
176;; Construction
177;;
178
179;; char-set-copy : native
180;; char-set : native
181
182(define (list->char-set chars :optional (base char-set:empty))
183  (%char-set-add-chars! (char-set-copy base) chars))
184
185(define (list->char-set! chars base)
186  (%char-set-add-chars! base chars))
187
188(define (string->char-set str :optional (base char-set:empty))
189  (%char-set-add-chars! (char-set-copy base) (string->list str)))
190
191(define (string->char-set! str base)
192  (%char-set-add-chars! base (string->list str)))
193
194(define (char-set-filter pred cs :optional (base char-set:empty))
195  (char-set-filter! pred cs (char-set-copy base)))
196
197(define (char-set-filter! pred cs base)
198  (let loop ([cursor (char-set-cursor cs)])
199    (if (end-of-char-set? cursor)
200      base
201      (let1 c (char-set-ref cs cursor)
202        (when (pred c) (%char-set-add-range! base c c))
203        (loop (char-set-cursor-next cs cursor))))))
204
205(define (integer-range->char-set low upper :optional (error? #f)
206                                                     (base char-set:empty))
207  (integer-range->char-set! low upper error? (char-set-copy base)))
208
209(define (integer-range->char-set! low upper error? base)
210  (when (< low 0)
211    (if error?
212      (error "argument out of range:" low)
213      (set! low 0)))
214  (when (> upper (+ *char-code-max* 1))
215    (if error?
216      (error "argument out of range:" upper)
217      (set! upper (+ *char-code-max* 1))))
218  (%char-set-add-range! base low (- upper 1)))
219
220(define (ucs-range->char-set low upper :optional (error? #f)
221                                                 (base char-set:empty))
222  (ucs-range->char-set! low upper error? (char-set-copy base)))
223
224(define ucs-range->char-set!
225  (if (eq? (gauche-character-encoding) 'utf-8)
226    integer-range->char-set!
227    (lambda (low upper error? base)
228      (when (< low 0)
229        (if error?
230          (error "argument out of range:" low)
231          (set! low 0)))
232      (if (< upper 128)
233        (%char-set-add-range! base low (- upper 1))
234        (begin
235          (when (< low 128)
236            (%char-set-add-range! base low 128)
237            (set! low 128))
238          (do ([i low (+ i 1)])
239              [(>= i upper) base]
240            (let1 c (ucs->char i)
241              (if c
242                (%char-set-add-range! base c c)
243                (if error?
244                  (errorf "unicode character #\\u~8,'0x is not supported in the native character set (~a)"
245                          i (gauche-character-encoding)))))
246            )))
247      )
248    ))
249
250(define (->char-set obj)
251  (cond [(list? obj)   (list->char-set obj)]
252        [(string? obj) (string->char-set obj)]
253        [(char-set? obj) obj]
254        [(char? obj) (char-set obj)]
255        [(is-a? obj <collection>) (list->char-set (x->lseq obj))]
256        [else (errorf "cannot coerse ~s into a char-set" obj)]))
257
258;;-------------------------------------------------------------------
259;; Querying
260;;
261
262(autoload "srfi-14/query" char-set-count
263                          char-set->list char-set->string
264                          char-set-every char-set-any)
265
266;;-------------------------------------------------------------------
267;; Algebra
268;;
269
270(autoload "srfi-14/set"   char-set-adjoin char-set-adjoin!
271                          char-set-delete char-set-delete!
272                          char-set-union char-set-union!
273                          char-set-intersection char-set-intersection!
274                          char-set-difference char-set-difference!
275                          char-set-xor char-set-xor!
276                          char-set-diff+intersection
277                          char-set-diff+intersection!)
278
279