1; -*- Mode: Scheme; Syntax: Scheme; Package: Scheme; -*-
2; Part of Scheme 48 1.9.  See file COPYING for notices and license.
3
4; Authors: Mike Sperber
5
6(define (encode-scalar-value encoding value buffer count)
7  (let-syntax ((encode
8		(syntax-rules ()
9		  ((encode ?encode-proc)
10		   (call-with-values
11		       (lambda ()
12			 (?encode-proc value buffer count))
13		     (lambda (encoding-ok? out-of-space? count)
14		       (values #t encoding-ok? out-of-space? count)))))))
15    (enum-case
16     text-encoding-option encoding
17     ((us-ascii) (encode encode-scalar-value/us-ascii))
18     ((latin-1) (encode encode-scalar-value/latin-1))
19     ((utf-8) (encode encode-scalar-value/utf-8))
20     ((utf-16le) (encode encode-scalar-value/utf-16le))
21     ((utf-16be) (encode encode-scalar-value/utf-16be))
22     ((utf-32le) (encode encode-scalar-value/utf-32le))
23     ((utf-32be) (encode encode-scalar-value/utf-32be))
24     (else
25      (values #f #f #f 0)))))
26
27(define (decode-scalar-value encoding buffer count)
28  (let-syntax ((decode
29		(syntax-rules ()
30		  ((decode ?decode-proc)
31		   (call-with-values
32		       (lambda () (?decode-proc buffer count))
33		     (lambda (ok? incomplete? value count)
34		       (values #t ok? incomplete? value count)))))))
35    (enum-case
36     text-encoding-option encoding
37     ((us-ascii) (decode decode-scalar-value/us-ascii))
38     ((latin-1) (decode decode-scalar-value/latin-1))
39     ((utf-8) (decode decode-scalar-value/utf-8))
40     ((utf-16le) (decode decode-scalar-value/utf-16le))
41     ((utf-16be) (decode decode-scalar-value/utf-16be))
42     ((utf-32le) (decode decode-scalar-value/utf-32le))
43     ((utf-32be) (decode decode-scalar-value/utf-32be))
44     (else
45      (values #f #f #f 0 0)))))
46
47;; US-ASCII
48
49;; This is mainly needed because it might be the default locale
50;; encoding reported by the OS.
51
52(define (encode-scalar-value/us-ascii value buffer count)
53  (cond
54   ((< count 1)
55    (values #t #t 1))
56   ((< value 128)
57    (buffer-set! buffer 0 value)
58    (values #t #f 1))
59   (else
60    (values #f #f 0))))
61
62(define (decode-scalar-value/us-ascii buffer count)
63  (values #t ; OK?
64	  #f ; incomplete?
65	  (buffer-ref buffer 0)
66	  1))
67
68; Latin-1
69
70(define (encode-scalar-value/latin-1 value buffer count)
71  (cond
72   ((< count 1)
73    (values #t #t 1))
74   ((< value 256)
75    (buffer-set! buffer 0 value)
76    (values #t #f 1))
77   (else
78    (values #f #f 0))))
79
80(define (decode-scalar-value/latin-1 buffer count)
81  (values #t ; OK?
82	  #f ; incomplete?
83	  (buffer-ref buffer 0)
84	  1))
85; UTF-8
86
87(define (encode-scalar-value/utf-8 value buffer count)
88  (cond
89   ((<= value #x7f)
90    (if (>= count 1)
91	(begin
92	  (buffer-set! buffer 0 value)
93	  (values #t #f 1))
94	(values #t #t 1)))
95   ((<= value #x7ff)
96    (if (>= count 2)
97	(begin
98	  (buffer-set!
99	   buffer 0
100	   (+ #xc0
101	      (logical-shift-right (bitwise-and value #b11111000000)
102				   6)))
103	  (buffer-set!
104	   buffer 1
105	   (+ #x80
106	      (bitwise-and value #b111111)))
107	  (values #t #f 2))
108	(values #t #t 2)))
109   ((<= value #xffff)
110    (if (>= count 3)
111	(begin
112	  (buffer-set!
113	   buffer 0
114	   (+ #xe0
115	      (logical-shift-right (bitwise-and value #b1111000000000000)
116				   12)))
117	  (buffer-set!
118	   buffer 1
119	   (+ #x80
120	      (logical-shift-right (bitwise-and value #b111111000000)
121				   6)))
122	  (buffer-set!
123	   buffer 2
124	   (+ #x80
125	      (bitwise-and value #b111111)))
126	  (values #t #f 3))
127	(values #t #t 3)))
128   (else
129    (if (>= count 4)
130	(begin
131	  (buffer-set!
132	   buffer 0
133	   (+ #xf0
134	      (logical-shift-right (bitwise-and value #b111000000000000000000)
135				   18)))
136	  (buffer-set!
137	   buffer 1
138	   (+ #x80
139	      (logical-shift-right (bitwise-and value #b111111000000000000)
140				   12)))
141	  (buffer-set!
142	   buffer 2
143	   (+ #x80
144	      (logical-shift-right (bitwise-and value #b111111000000)
145				   6)))
146	  (buffer-set!
147	   buffer 3
148	   (+ #x80
149	      (bitwise-and value #b111111)))
150	  (values #t #f 4))
151	(values #t #t 4)))))
152
153
154; The table, and the associated decoding algorithm, is from
155; Richard Gillam: "Unicode Demystified", chapter 14
156
157(define *utf-8-state-table*
158  '#(;; state 0
159     0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 1 1 1 1 2 2 3 -1
160       ;; state 1
161       -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 0 0 0 0 0 0 0 0 -2 -2 -2 -2 -2 -2 -2 -2
162       ;; state 2
163       -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 1 1 1 1 1 1 1 1 -2 -2 -2 -2 -2 -2 -2 -2
164       ;; state 3
165       -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 2 2 2 2 2 2 2 2 -2 -2 -2 -2 -2 -2 -2 -2))
166
167(define *utf-8-masks* '#(#x7f #x1f #x0f #x07))
168
169; We don't check for non-shortest-form UTF-8.  Too bad.
170
171(define (decode-scalar-value/utf-8 buffer count)
172  (let loop ((q 0) (state 0) (mask 0) (scalar-value 0))
173    (if (< q count)
174	(let* ((c (buffer-ref buffer q))
175	       (state (vector-ref *utf-8-state-table*
176				  (+ (shift-left state 5)	; (* state 32)
177				     (arithmetic-shift-right c 3)))))
178	  (case state
179	    ((0)
180	     (let ((scalar-value (+ scalar-value
181				    (bitwise-and c #x7f))))
182	       (if (scalar-value? scalar-value)
183		   (values #t #f scalar-value (+ q 1))
184		   (values #f #f 0 0))))
185	    ((1 2 3)
186	     (loop (+ 1 q) state #x3f
187		   (shift-left (+ scalar-value
188				  (bitwise-and c
189					       (if (= 0 mask)
190						   (vector-ref *utf-8-masks* state)
191						   mask)))
192			       6)))
193	    ((-2 -1)
194	     (values #f #f 0 0))
195	    (else ; this can't happen
196	     (values #f #f 0 0))))
197	(values #t #t 0 (+ 1 q)))))
198
199; UTF-16
200
201(define (buffer-set-word16/le! buffer index word)
202  (buffer-set! buffer index
203	       (bitwise-and #b11111111 word))
204  (buffer-set! buffer (+ index 1)
205	       (logical-shift-right word 8)))
206
207(define (buffer-set-word16/be! buffer index word)
208  (buffer-set! buffer index
209	       (logical-shift-right word 8))
210  (buffer-set! buffer (+ index 1)
211	       (bitwise-and #b11111111 word)))
212
213(define (make-encode-scalar-value/utf-16 buffer-set-word16!)
214  (lambda (value buffer count)
215    (if (<= value #xffff)
216	(if (< count 2)
217	    (values #t #t 2)
218	    (begin
219	      (buffer-set-word16! buffer 0 value)
220	      (values #t #f 2)))
221	(if (< count 4)
222	    (values #t #t 4)
223	    (begin
224	      (buffer-set-word16!
225	       buffer 0
226	       (+ (logical-shift-right value 10) #xd7c0))
227	      (buffer-set-word16!
228	       buffer 2
229	       (+ (bitwise-and value #x3ff) #xdc00))
230	      (values #t #f 4))))))
231
232(define encode-scalar-value/utf-16le
233  (make-encode-scalar-value/utf-16 buffer-set-word16/le!))
234(define encode-scalar-value/utf-16be
235  (make-encode-scalar-value/utf-16 buffer-set-word16/be!))
236
237(define (buffer-ref-word16/le codes index)
238  (+ (buffer-ref codes index)
239     (shift-left (buffer-ref codes (+ index 1)) 8)))
240
241(define (buffer-ref-word16/be codes index)
242  (+ (shift-left (buffer-ref codes index) 8)
243     (buffer-ref codes (+ index 1))))
244
245(define (make-decode-scalar-value/utf-16 buffer-ref-word16)
246  (lambda (buffer count)
247    (if (< count 2)
248	(values #t #t 0 2)
249	(let ((word0 (buffer-ref-word16 buffer 0)))
250	  (cond
251	   ((or (< word0 #xd800)
252		(> word0 #xdfff))
253	    (values #t #f word0 2))
254	   ((< count 4)
255	    (values #t #t 0 4))
256	   ((<= word0 #xdbff)
257	    (let ((word1 (buffer-ref-word16 buffer 2 )))
258	      (if (and (>= word1 #xdc00)
259		       (<= word1 #xdfff))
260		  (values #t #f
261			  (+ (shift-left (- word0 #xd7c0) 10)
262			     (bitwise-and word1 #x3ff))
263			  4)
264		  (values #f #f 0 0))))
265	   (else
266	    (values #f #f 0 0)))))))
267
268(define decode-scalar-value/utf-16le
269  (make-decode-scalar-value/utf-16 buffer-ref-word16/le))
270(define decode-scalar-value/utf-16be
271  (make-decode-scalar-value/utf-16 buffer-ref-word16/be))
272
273; UTF-32
274
275(define (encode-scalar-value/utf-32le value buffer count)
276  (if (< count 4)
277      (values #t #t 4)
278      (begin
279	(buffer-set! buffer 0
280		     (bitwise-and value #xff))
281	(buffer-set! buffer 1
282		     (logical-shift-right
283		      (bitwise-and value #xff00)
284		      8))
285	(buffer-set! buffer 2
286		     (logical-shift-right
287		      (bitwise-and value #xff0000)
288		      16))
289	(buffer-set! buffer 3
290		     (logical-shift-right value 24))
291	(values #t #f 4))))
292
293(define (encode-scalar-value/utf-32be value buffer count)
294  (if (< count 4)
295      (values #t #t 4)
296      (begin
297	(buffer-set! buffer 0
298		     (logical-shift-right value 24))
299	(buffer-set! buffer 1
300		     (logical-shift-right
301		      (bitwise-and value #xff0000)
302		      16))
303	(buffer-set! buffer 2
304		     (logical-shift-right
305		      (bitwise-and value #xff00)
306		      8))
307	(buffer-set! buffer 3
308		     (bitwise-and value #xff))
309	(values #t #f 4))))
310
311(define (decode-scalar-value/utf-32le buffer count)
312  (if (< count 4)
313      (values #t #t 0 4)
314      (let ((code-point
315	     (+ (buffer-ref buffer 0)
316		(shift-left (buffer-ref buffer 1)
317			    8)
318		(shift-left (buffer-ref buffer 2)
319			    16)
320		(shift-left (buffer-ref buffer 3)
321			    24))))
322	(if (scalar-value? code-point)
323	    (values #t #f
324		    code-point
325		    4)
326	    (values #f #f 0 0)))))
327
328(define (decode-scalar-value/utf-32be buffer count)
329  (if (< count 4)
330      (values #t #t 0 4)
331      (let ((code-point
332	     (+ (shift-left (buffer-ref buffer 0)
333			    24)
334		(shift-left (buffer-ref buffer 1)
335			    16)
336		(shift-left
337		 (buffer-ref buffer 2)
338		 8)
339		(buffer-ref buffer 3))))
340	(if (scalar-value? code-point)
341	    (values #t #f
342		    code-point
343		    4)
344	    (values #f #f 0 0)))))
345
346; Utilities
347
348(define (scalar-value? x)
349  (and (>= x 0)
350       (or (<= x #xd7ff)
351	   (and (>= x #xe000) (<= x #x10ffff)))))
352
353(define (buffer-ref b i)
354  (unsigned-byte-ref (address+ b i)))
355
356(define (buffer-set! b i v)
357  (unsigned-byte-set! (address+ b i) v))
358