1;;; 2;;; srfi-14.scm - character set 3;;; 4;;; Copyright (c) 2000-2020 Shiro Kawai <shiro@acm.org> 5;;; 6;;; Redistribution and use in source and binary forms, with or without 7;;; modification, are permitted provided that the following conditions 8;;; are met: 9;;; 10;;; 1. Redistributions of source code must retain the above copyright 11;;; notice, this list of conditions and the following disclaimer. 12;;; 13;;; 2. Redistributions in binary form must reproduce the above copyright 14;;; notice, this list of conditions and the following disclaimer in the 15;;; documentation and/or other materials provided with the distribution. 16;;; 17;;; 3. Neither the name of the authors nor the names of its contributors 18;;; may be used to endorse or promote products derived from this 19;;; software without specific prior written permission. 20;;; 21;;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22;;; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23;;; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24;;; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25;;; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26;;; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 27;;; TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 28;;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 29;;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 30;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 31;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32;;; 33 34;; Basic operators are built in the Gauche kernel. This module 35;; defines auxiliary procedures. 36;; 37;; Some of the procedures are implemented with no optimization 38;; effort. I included them just for completeness. 39;; If you think you need faster operation, please optimize it and 40;; send me a patch. 41;; 42;; ucs-range->char-set performs poorly if the given charcter code 43;; is outside ASCII and the native character encoding is not utf-8. 44 45;; Functions supported by the core interpreter: 46;; char-set char-set? char-set-contains? char-set-copy 47;; char-set-complement char-set-complement! 48;; %char-set-equal? %char-set-add-chars! %char-set-add-range! 49;; %char-set-add! %char-set-ranges 50;; Functions not in SRFI but defined here 51;; integer-range->char-set integer-range->char-set! 52 53(define-module srfi-14 54 (export char-set= char-set<= 55 char-set-cursor char-set-ref char-set-cursor-next end-of-char-set? 56 char-set-fold char-set-unfold char-set-unfold! 57 char-set-for-each char-set-map 58 list->char-set list->char-set! string->char-set string->char-set! 59 char-set-filter char-set-filter! 60 ucs-range->char-set ucs-range->char-set! 61 integer-range->char-set integer-range->char-set! 62 ->char-set 63 char-set-count char-set->list char-set->string 64 char-set-every char-set-any 65 char-set-adjoin char-set-adjoin! char-set-delete char-set-delete! 66 char-set-complement char-set-complement! 67 char-set-union char-set-union! 68 char-set-intersection char-set-intersection! 69 char-set-difference char-set-difference! 70 char-set-xor char-set-xor! 71 char-set-diff+intersection char-set-diff+intersection! 72 char-set:lower-case char-set:upper-case char-set:title-case 73 char-set:letter char-set:digit char-set:letter+digit 74 char-set:graphic char-set:printing char-set:whitespace 75 char-set:iso-control char-set:punctuation char-set:symbol 76 char-set:hex-digit char-set:blank char-set:ascii 77 char-set:empty char-set:full 78 79 ;; The followings are defined in core 80 char-set char-set? char-set-contains? char-set-copy 81 char-set-hash char-set-size 82 char-set-complement char-set-complement! 83 )) 84(select-module srfi-14) 85 86;; used in ->char-set 87(autoload gauche.lazy x->lseq) 88 89;; some built-in support 90(define %char-set-equal? (with-module gauche.internal %char-set-equal?)) 91(define %char-set<=? (with-module gauche.internal %char-set<=?)) 92(define %char-set-add-chars! (with-module gauche.internal %char-set-add-chars!)) 93(define %char-set-add-range! (with-module gauche.internal %char-set-add-range!)) 94(define %char-set-add! (with-module gauche.internal %char-set-add!)) 95(define %char-set-ranges (with-module gauche.internal %char-set-ranges)) 96(define %char-set-predefined (with-module gauche.internal %char-set-predefined)) 97 98(define char-set-complement (with-module gauche char-set-complement)) 99(define char-set-complement! (with-module gauche char-set-complement!)) 100 101;; Predefined charsets - built-in 102 103;;------------------------------------------------------------------- 104;; Comparison 105(define char-set= 106 (case-lambda [() #t] 107 [args (every %char-set-equal? args (cdr args))])) 108 109(define char-set<= 110 (case-lambda [() #t] 111 [args (every %char-set<=? args (cdr args))])) 112 113;;------------------------------------------------------------------- 114;; Iteration 115;; 116 117;; slow implementation. minimal error check. 118;; cursor === (code . ranges) | #f 119 120(define (char-set-cursor cs) 121 (let1 ranges (%char-set-ranges cs) 122 (if (null? ranges) #f (cons (caar ranges) ranges)))) 123 124(define (char-set-ref cs cursor) 125 (if (and (pair? cursor) (integer? (car cursor))) 126 (integer->char (car cursor)) 127 (error "bad character-set cursor:" cursor))) 128 129(define (char-set-cursor-next cs cursor) 130 (if (pair? cursor) 131 (let ([code (car cursor)] 132 [range (cadr cursor)]) 133 (cond [(< code (cdr range)) (cons (+ code 1) (cdr cursor))] 134 [(null? (cddr cursor)) #f] 135 [else (cons (caaddr cursor) (cddr cursor))])) 136 (error "bad character-set cursor:" cursor))) 137 138(define (end-of-char-set? cursor) (not cursor)) 139 140;; functional ops 141 142(define (char-set-fold kons knil cs) 143 (let loop ([cursor (char-set-cursor cs)] 144 [result knil]) 145 (if (end-of-char-set? cursor) 146 result 147 (loop (char-set-cursor-next cs cursor) 148 (kons (char-set-ref cs cursor) result))))) 149 150(define (char-set-unfold! pred fun gen seed base) 151 (let loop ([seed seed]) 152 (if (pred seed) 153 base 154 (let1 c (fun seed) 155 (%char-set-add-range! base c c) 156 (loop (gen seed)))))) 157 158(define (char-set-unfold pred fun gen seed :optional (base char-set:empty)) 159 (char-set-unfold! pred fun gen seed (char-set-copy base))) 160 161(define (char-set-for-each proc cs) 162 (let loop ([cursor (char-set-cursor cs)]) 163 (unless (end-of-char-set? cursor) 164 (proc (char-set-ref cs cursor)) 165 (loop (char-set-cursor-next cs cursor))))) 166 167(define (char-set-map proc cs) 168 (rlet1 new (char-set) 169 (let loop ([cursor (char-set-cursor cs)]) 170 (unless (end-of-char-set? cursor) 171 (let1 c (proc (char-set-ref cs cursor)) 172 (%char-set-add-range! new c c) 173 (loop (char-set-cursor-next cs cursor))))))) 174 175;;------------------------------------------------------------------- 176;; Construction 177;; 178 179;; char-set-copy : native 180;; char-set : native 181 182(define (list->char-set chars :optional (base char-set:empty)) 183 (%char-set-add-chars! (char-set-copy base) chars)) 184 185(define (list->char-set! chars base) 186 (%char-set-add-chars! base chars)) 187 188(define (string->char-set str :optional (base char-set:empty)) 189 (%char-set-add-chars! (char-set-copy base) (string->list str))) 190 191(define (string->char-set! str base) 192 (%char-set-add-chars! base (string->list str))) 193 194(define (char-set-filter pred cs :optional (base char-set:empty)) 195 (char-set-filter! pred cs (char-set-copy base))) 196 197(define (char-set-filter! pred cs base) 198 (let loop ([cursor (char-set-cursor cs)]) 199 (if (end-of-char-set? cursor) 200 base 201 (let1 c (char-set-ref cs cursor) 202 (when (pred c) (%char-set-add-range! base c c)) 203 (loop (char-set-cursor-next cs cursor)))))) 204 205(define (integer-range->char-set low upper :optional (error? #f) 206 (base char-set:empty)) 207 (integer-range->char-set! low upper error? (char-set-copy base))) 208 209(define (integer-range->char-set! low upper error? base) 210 (when (< low 0) 211 (if error? 212 (error "argument out of range:" low) 213 (set! low 0))) 214 (when (> upper (+ *char-code-max* 1)) 215 (if error? 216 (error "argument out of range:" upper) 217 (set! upper (+ *char-code-max* 1)))) 218 (%char-set-add-range! base low (- upper 1))) 219 220(define (ucs-range->char-set low upper :optional (error? #f) 221 (base char-set:empty)) 222 (ucs-range->char-set! low upper error? (char-set-copy base))) 223 224(define ucs-range->char-set! 225 (if (eq? (gauche-character-encoding) 'utf-8) 226 integer-range->char-set! 227 (lambda (low upper error? base) 228 (when (< low 0) 229 (if error? 230 (error "argument out of range:" low) 231 (set! low 0))) 232 (if (< upper 128) 233 (%char-set-add-range! base low (- upper 1)) 234 (begin 235 (when (< low 128) 236 (%char-set-add-range! base low 128) 237 (set! low 128)) 238 (do ([i low (+ i 1)]) 239 [(>= i upper) base] 240 (let1 c (ucs->char i) 241 (if c 242 (%char-set-add-range! base c c) 243 (if error? 244 (errorf "unicode character #\\u~8,'0x is not supported in the native character set (~a)" 245 i (gauche-character-encoding))))) 246 ))) 247 ) 248 )) 249 250(define (->char-set obj) 251 (cond [(list? obj) (list->char-set obj)] 252 [(string? obj) (string->char-set obj)] 253 [(char-set? obj) obj] 254 [(char? obj) (char-set obj)] 255 [(is-a? obj <collection>) (list->char-set (x->lseq obj))] 256 [else (errorf "cannot coerse ~s into a char-set" obj)])) 257 258;;------------------------------------------------------------------- 259;; Querying 260;; 261 262(autoload "srfi-14/query" char-set-count 263 char-set->list char-set->string 264 char-set-every char-set-any) 265 266;;------------------------------------------------------------------- 267;; Algebra 268;; 269 270(autoload "srfi-14/set" char-set-adjoin char-set-adjoin! 271 char-set-delete char-set-delete! 272 char-set-union char-set-union! 273 char-set-intersection char-set-intersection! 274 char-set-difference char-set-difference! 275 char-set-xor char-set-xor! 276 char-set-diff+intersection 277 char-set-diff+intersection!) 278 279