1****************
2*	SNOBOL4 functions to implement UTF-encoded unicode handling
3*	$Id: utf.sno,v 1.5 2020-10-31 03:07:36 phil Exp $
4
5*	Phil Budne <phil@Ultimate.COM>
6*	September 1996
7*	updated for 4/5/6 byte runes June 1997
8*	updated to note 5/6 byte runes illegal, October 2020
9*		outlawed in https://tools.ietf.org/html/rfc3629#section-3
10
11*	Most of this is just AWFUL, but this is just a proof of
12*	concept, and like all SNOBOL there is always a smaller, faster
13*	and more perverse way to do anything.
14
15****************
16*	TODO;
17*
18*	UTFNOTANY
19*	UTFSPAN
20*	UTFBREAK
21*	UTFBREAKX
22
23****************
24*	Pattern to match one UTF encoded rune
25
26*	helper function to pattern that matches a range of byte values.
27*	the true SNO-head might implement this as a function that
28*	takes strings of SPAN('01') ARBNO('x') directly!!
29	DEFINE("RANGE(START,END)")			:(ERANGE)
30RANGE
31*	SPITBOL compiles ANY() into a bitmap;
32	&ALPHABET ARB (CHAR(START) ARB CHAR(END)) . RANGE :F(FRETURN)
33*	SPITBOL compiles tables once;
34	RANGE = ANY(RANGE)				:(RETURN)
35*	for SNOBOL4; create big pattern
36*	GE(START,END)					:S(FRETURN)
37*	RANGE = CHAR(START)
38*RANGE1	GE(STAT,END)					:S(RETURN)
39*	START = START + 1
40*	RANGE = RANGE | CHAR(START)			:(RANGE1)
41*
42ERANGE
43
44*	single byte:	0xxxxxxx	values 0..0x7f
45	UTF.T1 = RANGE(0,127)
46
47*	two-byte leader: 110xxxxx	values 0x80..0x7ff
48	UTF.T2 = RANGE(192,223)
49
50*	three-byte leader: 1110xxxx	values 0x800..0xffff
51	UTF.T3 = RANGE(224,239)
52
53*	NEW in UTF-8???
54
55*	four-byte leader: 11110xxx	values 0x10000..0x1fffff
56	UTF.T4 = RANGE(240,247)
57
58* described in https://tools.ietf.org/html/rfc2279
59* outlawed in https://tools.ietf.org/html/rfc3629#section-3
60*	five-byte leader: 111110xx	values 0x200000..0x3ffffff
61	UTF.T5 = RANGE(248,251)
62
63* described in https://tools.ietf.org/html/rfc2279
64* outlawed in https://tools.ietf.org/html/rfc3629#section-3
65*	five-byte leader: 1111110x	values 0x4000000..0x7fffffff
66	UTF.T6 = RANGE(252,253)
67
68*	subsequent byte: 10xxxxxx
69	UTF.Tx = RANGE(128,191)
70
71*	match first byte, then do table lookup (bad entries contain FAIL?)
72*	on LEN(1) $ FIRSTBYTE?
73
74	UTF.RUNE = FENCE
75+			(UTF.T1 |
76+			 UTF.T2 UTF.Tx |
77+			 UTF.T3 UTF.Tx UTF.Tx |
78+			 UTF.T4 UTF.Tx UTF.Tx UTF.Tx |
79+			 UTF.T5 UTF.Tx UTF.Tx UTF.Tx UTF.Tx |
80+			 UTF.T6 UTF.Tx UTF.Tx UTF.Tx UTF.Tx UTF.Tx)
81
82	UTF.RUNE0 = POS(0) UTF.RUNE
83
84*	pattern to move to next sync char;
85*	UTF.SYNC = BREAKX(UTF.T1 UTF.T2 UTF.T3 UTF.T4 UTF.T5 UTF.T6)
86
87
88****************
89*	UTF analog of ARB
90	UTFARB = ARBNO(UTF.RUNE)
91
92****************
93*	UTF analog of LEN
94	DEFINE("UTFLEN(N)")				:(EUTFLEN)
95UTFLEN	LT(N,0)						:(FRETURN)
96	UTFLEN =
97UTFLEN2	EQ(N,0)						:S(RETURN)
98	UTFLEN = UTFLEN UTF.RUNE
99	N = N - 1					:(UTFLEN2)
100EUTFLEN
101
102****************
103*	UTF analog for ANY(STR)
104	DEFINE("UTFANY(STR)R")				:(EUTFANY)
105UTFANY	STR UTF.RUNE0 . UTFANY =			:F(FRETURN)
106UTFANY2	IDENT(STR)					:S(RETURN)
107	STR UTF.RUNE0 . R =				:F(FRETURN)
108	UTFANY = UTFANY | R				:(UTFANY2)
109EUTFANY
110
111****************
112*	UTF analog for CHAR(N)
113
114	DEFINE("RUNE(N)")				:(ERUNE)
115RUNE	LT(N,0)						:S(FRETURN)
116	GT(N,127)					:S(RUNE2)
117*	single byte rune;
118	RUNE = CHAR(N)					:(RETURN)
119RUNE2	GT(N,2047)					:S(RUNE3)
120*	two-byte rune;
121	RUNE = CHAR(192 + (N / 64))
122+		CHAR(128 + REMDR(N, 64))		:(RETURN)
123*	three-byte rune;
124RUNE3	GT(N,65535)					:S(RUNE4)
125* illegal to encode D800-DFFF -- reserved for UTF-16 surrogates
126	GE(N,55296) LE(N,57343)				:S(FRETURN)
127	RUNE = CHAR(224 + N / 4096)
128+		CHAR(128 + REMDR(N / 64, 64))
129+		CHAR(128 + REMDR(N, 64))		:(RETURN)
130* four-byte rune;
131RUNE4	GT(N,2097151)					:S(RUNE5)
132	RUNE = CHAR(240 + N / 262144)
133+		CHAR(128 + REMDR(N / 4096, 64))
134+		CHAR(128 + REMDR(N / 64, 64))
135+		CHAR(128 + REMDR(N, 64))		:(RETURN)
136* five-byte rune (no longer legal);
137RUNE5	GT(N,67108863)					:S(RUNE6)
138	RUNE = CHAR(248 + N / 16777216)
139+		CHAR(128 + REMDR(N / 262144, 64))
140+		CHAR(128 + REMDR(N / 4096, 64))
141+		CHAR(128 + REMDR(N / 64, 64))
142+		CHAR(128 + REMDR(N, 64))		:(RETURN)
143* six-byte rune (no longer legal);
144RUNE6	GT(N,2147483647)				:S(FRETURN)
145	RUNE = CHAR(252 + N / 1073741824)
146+		CHAR(128 + REMDR(N / 16777216, 64))
147+		CHAR(128 + REMDR(N / 262144, 64))
148+		CHAR(128 + REMDR(N / 4096, 64))
149+		CHAR(128 + REMDR(N / 64, 64))
150+		CHAR(128 + REMDR(N, 64))		:(RETURN)
151ERUNE
152
153****************
154*	UTF analog for SIZE(STR)
155
156	DEFINE("UTFSIZE(STR)")				:(EUTFSIZE)
157UTFSIZE	STR UTF.RUNE =					:F(UTFSIZ2)
158	UTFSIZE = UTFSIZE + 1				:(UTFSIZE)
159UTFSIZ2	IDENT(STR)					:S(RETURN)F(FRETURN)
160EUTFSIZE
161
162
163****************
164*	UTF analog for REPLACE()
165
166	DEFINE("UTFREPLACE(IN,FROM,TO)T,R1,R2")		:(EUTFREPLACE)
167UTFREPLACE
168	T = TABLE()
169*	peel runes off input and output languages one at a time
170UTFREP1	FROM UTF.RUNE0 . R1 =				:F(UTFREP2)
171	TO UTF.RUNE0 . R2 =				:F(UTFREP2)
172	T<R1> = R2					:(UTFREP1)
173*	both input and output languages should now be empty
174UTFREP2	DIFFER(FROM)					:S(FRETURN)
175	DIFFER(TO)					:S(FRETURN)
176*	freeze table for SNOBOL4+?
177*	peel runes input one at a time, feed thru table
178	UTFREPLACE =
179UTFREP3	IN UTF.RUNE0 . R1 =				:F(UTFREP4)
180	UTFREPLACE = UTFREPLACE (IDENT(T<R1>) R1, T<R1>) :(UTFREP3)
181UTFREP4	IDENT(IN)					:S(RETURN)F(FRETURN)
182EUTFREPLACE
183
184****************************************************************
185* tests
186*
187*	&ANCHOR = 1
188*	S = "Hello World!!"
189*
190*	OUTPUT = UTFREPLACE(S,&UCASE "!",&LCASE "?")
191*
192*L	S UTF.RUNE $ OUTPUT =				:S(L)
193*
194*	"HELLO WORLD!" UTFARB . OUTPUT RPOS(0)
195*	"Hello World!" UTFARB UTFANY(&LCASE) . OUTPUT
196*	"Hello World!" UTFANY(&UCASE) . OUTPUT
197*
198*	OUTPUT = RUNE(0)
199*	OUTPUT = RUNE(127)
200*	OUTPUT = '-------'
201*	OUTPUT = RUNE(128)
202*	OUTPUT = RUNE(255)
203*	OUTPUT = RUNE(2047)
204*	OUTPUT = '-------'
205*	OUTPUT = RUNE(2048)
206*	OUTPUT = RUNE(65535)
207*	OUTPUT = '-------'
208*	OUTPUT = RUNE(65536)
209*	OUTPUT = RUNE(2097151)
210*	OUTPUT = '-------'
211*	OUTPUT = RUNE(2097152)
212*	OUTPUT = RUNE(67108863)
213*	OUTPUT = '-------'
214*	OUTPUT = RUNE(67108864)
215*	OUTPUT = RUNE(2147483647)
216*
217*END
218