1; -*- Mode: Scheme; Syntax: Scheme; Package: Scheme; -*- 2; Part of Scheme 48 1.9. See file COPYING for notices and license. 3 4; Authors: Mike Sperber 5 6(define (encode-scalar-value encoding value buffer count) 7 (let-syntax ((encode 8 (syntax-rules () 9 ((encode ?encode-proc) 10 (call-with-values 11 (lambda () 12 (?encode-proc value buffer count)) 13 (lambda (encoding-ok? out-of-space? count) 14 (values #t encoding-ok? out-of-space? count))))))) 15 (enum-case 16 text-encoding-option encoding 17 ((us-ascii) (encode encode-scalar-value/us-ascii)) 18 ((latin-1) (encode encode-scalar-value/latin-1)) 19 ((utf-8) (encode encode-scalar-value/utf-8)) 20 ((utf-16le) (encode encode-scalar-value/utf-16le)) 21 ((utf-16be) (encode encode-scalar-value/utf-16be)) 22 ((utf-32le) (encode encode-scalar-value/utf-32le)) 23 ((utf-32be) (encode encode-scalar-value/utf-32be)) 24 (else 25 (values #f #f #f 0))))) 26 27(define (decode-scalar-value encoding buffer count) 28 (let-syntax ((decode 29 (syntax-rules () 30 ((decode ?decode-proc) 31 (call-with-values 32 (lambda () (?decode-proc buffer count)) 33 (lambda (ok? incomplete? value count) 34 (values #t ok? incomplete? value count))))))) 35 (enum-case 36 text-encoding-option encoding 37 ((us-ascii) (decode decode-scalar-value/us-ascii)) 38 ((latin-1) (decode decode-scalar-value/latin-1)) 39 ((utf-8) (decode decode-scalar-value/utf-8)) 40 ((utf-16le) (decode decode-scalar-value/utf-16le)) 41 ((utf-16be) (decode decode-scalar-value/utf-16be)) 42 ((utf-32le) (decode decode-scalar-value/utf-32le)) 43 ((utf-32be) (decode decode-scalar-value/utf-32be)) 44 (else 45 (values #f #f #f 0 0))))) 46 47;; US-ASCII 48 49;; This is mainly needed because it might be the default locale 50;; encoding reported by the OS. 51 52(define (encode-scalar-value/us-ascii value buffer count) 53 (cond 54 ((< count 1) 55 (values #t #t 1)) 56 ((< value 128) 57 (buffer-set! buffer 0 value) 58 (values #t #f 1)) 59 (else 60 (values #f #f 0)))) 61 62(define (decode-scalar-value/us-ascii buffer count) 63 (values #t ; OK? 64 #f ; incomplete? 65 (buffer-ref buffer 0) 66 1)) 67 68; Latin-1 69 70(define (encode-scalar-value/latin-1 value buffer count) 71 (cond 72 ((< count 1) 73 (values #t #t 1)) 74 ((< value 256) 75 (buffer-set! buffer 0 value) 76 (values #t #f 1)) 77 (else 78 (values #f #f 0)))) 79 80(define (decode-scalar-value/latin-1 buffer count) 81 (values #t ; OK? 82 #f ; incomplete? 83 (buffer-ref buffer 0) 84 1)) 85; UTF-8 86 87(define (encode-scalar-value/utf-8 value buffer count) 88 (cond 89 ((<= value #x7f) 90 (if (>= count 1) 91 (begin 92 (buffer-set! buffer 0 value) 93 (values #t #f 1)) 94 (values #t #t 1))) 95 ((<= value #x7ff) 96 (if (>= count 2) 97 (begin 98 (buffer-set! 99 buffer 0 100 (+ #xc0 101 (logical-shift-right (bitwise-and value #b11111000000) 102 6))) 103 (buffer-set! 104 buffer 1 105 (+ #x80 106 (bitwise-and value #b111111))) 107 (values #t #f 2)) 108 (values #t #t 2))) 109 ((<= value #xffff) 110 (if (>= count 3) 111 (begin 112 (buffer-set! 113 buffer 0 114 (+ #xe0 115 (logical-shift-right (bitwise-and value #b1111000000000000) 116 12))) 117 (buffer-set! 118 buffer 1 119 (+ #x80 120 (logical-shift-right (bitwise-and value #b111111000000) 121 6))) 122 (buffer-set! 123 buffer 2 124 (+ #x80 125 (bitwise-and value #b111111))) 126 (values #t #f 3)) 127 (values #t #t 3))) 128 (else 129 (if (>= count 4) 130 (begin 131 (buffer-set! 132 buffer 0 133 (+ #xf0 134 (logical-shift-right (bitwise-and value #b111000000000000000000) 135 18))) 136 (buffer-set! 137 buffer 1 138 (+ #x80 139 (logical-shift-right (bitwise-and value #b111111000000000000) 140 12))) 141 (buffer-set! 142 buffer 2 143 (+ #x80 144 (logical-shift-right (bitwise-and value #b111111000000) 145 6))) 146 (buffer-set! 147 buffer 3 148 (+ #x80 149 (bitwise-and value #b111111))) 150 (values #t #f 4)) 151 (values #t #t 4))))) 152 153 154; The table, and the associated decoding algorithm, is from 155; Richard Gillam: "Unicode Demystified", chapter 14 156 157(define *utf-8-state-table* 158 '#(;; state 0 159 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 1 1 1 1 2 2 3 -1 160 ;; state 1 161 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 0 0 0 0 0 0 0 0 -2 -2 -2 -2 -2 -2 -2 -2 162 ;; state 2 163 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 1 1 1 1 1 1 1 1 -2 -2 -2 -2 -2 -2 -2 -2 164 ;; state 3 165 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 2 2 2 2 2 2 2 2 -2 -2 -2 -2 -2 -2 -2 -2)) 166 167(define *utf-8-masks* '#(#x7f #x1f #x0f #x07)) 168 169; We don't check for non-shortest-form UTF-8. Too bad. 170 171(define (decode-scalar-value/utf-8 buffer count) 172 (let loop ((q 0) (state 0) (mask 0) (scalar-value 0)) 173 (if (< q count) 174 (let* ((c (buffer-ref buffer q)) 175 (state (vector-ref *utf-8-state-table* 176 (+ (shift-left state 5) ; (* state 32) 177 (arithmetic-shift-right c 3))))) 178 (case state 179 ((0) 180 (let ((scalar-value (+ scalar-value 181 (bitwise-and c #x7f)))) 182 (if (scalar-value? scalar-value) 183 (values #t #f scalar-value (+ q 1)) 184 (values #f #f 0 0)))) 185 ((1 2 3) 186 (loop (+ 1 q) state #x3f 187 (shift-left (+ scalar-value 188 (bitwise-and c 189 (if (= 0 mask) 190 (vector-ref *utf-8-masks* state) 191 mask))) 192 6))) 193 ((-2 -1) 194 (values #f #f 0 0)) 195 (else ; this can't happen 196 (values #f #f 0 0)))) 197 (values #t #t 0 (+ 1 q))))) 198 199; UTF-16 200 201(define (buffer-set-word16/le! buffer index word) 202 (buffer-set! buffer index 203 (bitwise-and #b11111111 word)) 204 (buffer-set! buffer (+ index 1) 205 (logical-shift-right word 8))) 206 207(define (buffer-set-word16/be! buffer index word) 208 (buffer-set! buffer index 209 (logical-shift-right word 8)) 210 (buffer-set! buffer (+ index 1) 211 (bitwise-and #b11111111 word))) 212 213(define (make-encode-scalar-value/utf-16 buffer-set-word16!) 214 (lambda (value buffer count) 215 (if (<= value #xffff) 216 (if (< count 2) 217 (values #t #t 2) 218 (begin 219 (buffer-set-word16! buffer 0 value) 220 (values #t #f 2))) 221 (if (< count 4) 222 (values #t #t 4) 223 (begin 224 (buffer-set-word16! 225 buffer 0 226 (+ (logical-shift-right value 10) #xd7c0)) 227 (buffer-set-word16! 228 buffer 2 229 (+ (bitwise-and value #x3ff) #xdc00)) 230 (values #t #f 4)))))) 231 232(define encode-scalar-value/utf-16le 233 (make-encode-scalar-value/utf-16 buffer-set-word16/le!)) 234(define encode-scalar-value/utf-16be 235 (make-encode-scalar-value/utf-16 buffer-set-word16/be!)) 236 237(define (buffer-ref-word16/le codes index) 238 (+ (buffer-ref codes index) 239 (shift-left (buffer-ref codes (+ index 1)) 8))) 240 241(define (buffer-ref-word16/be codes index) 242 (+ (shift-left (buffer-ref codes index) 8) 243 (buffer-ref codes (+ index 1)))) 244 245(define (make-decode-scalar-value/utf-16 buffer-ref-word16) 246 (lambda (buffer count) 247 (if (< count 2) 248 (values #t #t 0 2) 249 (let ((word0 (buffer-ref-word16 buffer 0))) 250 (cond 251 ((or (< word0 #xd800) 252 (> word0 #xdfff)) 253 (values #t #f word0 2)) 254 ((< count 4) 255 (values #t #t 0 4)) 256 ((<= word0 #xdbff) 257 (let ((word1 (buffer-ref-word16 buffer 2 ))) 258 (if (and (>= word1 #xdc00) 259 (<= word1 #xdfff)) 260 (values #t #f 261 (+ (shift-left (- word0 #xd7c0) 10) 262 (bitwise-and word1 #x3ff)) 263 4) 264 (values #f #f 0 0)))) 265 (else 266 (values #f #f 0 0))))))) 267 268(define decode-scalar-value/utf-16le 269 (make-decode-scalar-value/utf-16 buffer-ref-word16/le)) 270(define decode-scalar-value/utf-16be 271 (make-decode-scalar-value/utf-16 buffer-ref-word16/be)) 272 273; UTF-32 274 275(define (encode-scalar-value/utf-32le value buffer count) 276 (if (< count 4) 277 (values #t #t 4) 278 (begin 279 (buffer-set! buffer 0 280 (bitwise-and value #xff)) 281 (buffer-set! buffer 1 282 (logical-shift-right 283 (bitwise-and value #xff00) 284 8)) 285 (buffer-set! buffer 2 286 (logical-shift-right 287 (bitwise-and value #xff0000) 288 16)) 289 (buffer-set! buffer 3 290 (logical-shift-right value 24)) 291 (values #t #f 4)))) 292 293(define (encode-scalar-value/utf-32be value buffer count) 294 (if (< count 4) 295 (values #t #t 4) 296 (begin 297 (buffer-set! buffer 0 298 (logical-shift-right value 24)) 299 (buffer-set! buffer 1 300 (logical-shift-right 301 (bitwise-and value #xff0000) 302 16)) 303 (buffer-set! buffer 2 304 (logical-shift-right 305 (bitwise-and value #xff00) 306 8)) 307 (buffer-set! buffer 3 308 (bitwise-and value #xff)) 309 (values #t #f 4)))) 310 311(define (decode-scalar-value/utf-32le buffer count) 312 (if (< count 4) 313 (values #t #t 0 4) 314 (let ((code-point 315 (+ (buffer-ref buffer 0) 316 (shift-left (buffer-ref buffer 1) 317 8) 318 (shift-left (buffer-ref buffer 2) 319 16) 320 (shift-left (buffer-ref buffer 3) 321 24)))) 322 (if (scalar-value? code-point) 323 (values #t #f 324 code-point 325 4) 326 (values #f #f 0 0))))) 327 328(define (decode-scalar-value/utf-32be buffer count) 329 (if (< count 4) 330 (values #t #t 0 4) 331 (let ((code-point 332 (+ (shift-left (buffer-ref buffer 0) 333 24) 334 (shift-left (buffer-ref buffer 1) 335 16) 336 (shift-left 337 (buffer-ref buffer 2) 338 8) 339 (buffer-ref buffer 3)))) 340 (if (scalar-value? code-point) 341 (values #t #f 342 code-point 343 4) 344 (values #f #f 0 0))))) 345 346; Utilities 347 348(define (scalar-value? x) 349 (and (>= x 0) 350 (or (<= x #xd7ff) 351 (and (>= x #xe000) (<= x #x10ffff))))) 352 353(define (buffer-ref b i) 354 (unsigned-byte-ref (address+ b i))) 355 356(define (buffer-set! b i v) 357 (unsigned-byte-set! (address+ b i) v)) 358