1**************** 2* SNOBOL4 functions to implement UTF-encoded unicode handling 3* $Id: utf.sno,v 1.5 2020-10-31 03:07:36 phil Exp $ 4 5* Phil Budne <phil@Ultimate.COM> 6* September 1996 7* updated for 4/5/6 byte runes June 1997 8* updated to note 5/6 byte runes illegal, October 2020 9* outlawed in https://tools.ietf.org/html/rfc3629#section-3 10 11* Most of this is just AWFUL, but this is just a proof of 12* concept, and like all SNOBOL there is always a smaller, faster 13* and more perverse way to do anything. 14 15**************** 16* TODO; 17* 18* UTFNOTANY 19* UTFSPAN 20* UTFBREAK 21* UTFBREAKX 22 23**************** 24* Pattern to match one UTF encoded rune 25 26* helper function to pattern that matches a range of byte values. 27* the true SNO-head might implement this as a function that 28* takes strings of SPAN('01') ARBNO('x') directly!! 29 DEFINE("RANGE(START,END)") :(ERANGE) 30RANGE 31* SPITBOL compiles ANY() into a bitmap; 32 &ALPHABET ARB (CHAR(START) ARB CHAR(END)) . RANGE :F(FRETURN) 33* SPITBOL compiles tables once; 34 RANGE = ANY(RANGE) :(RETURN) 35* for SNOBOL4; create big pattern 36* GE(START,END) :S(FRETURN) 37* RANGE = CHAR(START) 38*RANGE1 GE(STAT,END) :S(RETURN) 39* START = START + 1 40* RANGE = RANGE | CHAR(START) :(RANGE1) 41* 42ERANGE 43 44* single byte: 0xxxxxxx values 0..0x7f 45 UTF.T1 = RANGE(0,127) 46 47* two-byte leader: 110xxxxx values 0x80..0x7ff 48 UTF.T2 = RANGE(192,223) 49 50* three-byte leader: 1110xxxx values 0x800..0xffff 51 UTF.T3 = RANGE(224,239) 52 53* NEW in UTF-8??? 54 55* four-byte leader: 11110xxx values 0x10000..0x1fffff 56 UTF.T4 = RANGE(240,247) 57 58* described in https://tools.ietf.org/html/rfc2279 59* outlawed in https://tools.ietf.org/html/rfc3629#section-3 60* five-byte leader: 111110xx values 0x200000..0x3ffffff 61 UTF.T5 = RANGE(248,251) 62 63* described in https://tools.ietf.org/html/rfc2279 64* outlawed in https://tools.ietf.org/html/rfc3629#section-3 65* five-byte leader: 1111110x values 0x4000000..0x7fffffff 66 UTF.T6 = RANGE(252,253) 67 68* subsequent byte: 10xxxxxx 69 UTF.Tx = RANGE(128,191) 70 71* match first byte, then do table lookup (bad entries contain FAIL?) 72* on LEN(1) $ FIRSTBYTE? 73 74 UTF.RUNE = FENCE 75+ (UTF.T1 | 76+ UTF.T2 UTF.Tx | 77+ UTF.T3 UTF.Tx UTF.Tx | 78+ UTF.T4 UTF.Tx UTF.Tx UTF.Tx | 79+ UTF.T5 UTF.Tx UTF.Tx UTF.Tx UTF.Tx | 80+ UTF.T6 UTF.Tx UTF.Tx UTF.Tx UTF.Tx UTF.Tx) 81 82 UTF.RUNE0 = POS(0) UTF.RUNE 83 84* pattern to move to next sync char; 85* UTF.SYNC = BREAKX(UTF.T1 UTF.T2 UTF.T3 UTF.T4 UTF.T5 UTF.T6) 86 87 88**************** 89* UTF analog of ARB 90 UTFARB = ARBNO(UTF.RUNE) 91 92**************** 93* UTF analog of LEN 94 DEFINE("UTFLEN(N)") :(EUTFLEN) 95UTFLEN LT(N,0) :(FRETURN) 96 UTFLEN = 97UTFLEN2 EQ(N,0) :S(RETURN) 98 UTFLEN = UTFLEN UTF.RUNE 99 N = N - 1 :(UTFLEN2) 100EUTFLEN 101 102**************** 103* UTF analog for ANY(STR) 104 DEFINE("UTFANY(STR)R") :(EUTFANY) 105UTFANY STR UTF.RUNE0 . UTFANY = :F(FRETURN) 106UTFANY2 IDENT(STR) :S(RETURN) 107 STR UTF.RUNE0 . R = :F(FRETURN) 108 UTFANY = UTFANY | R :(UTFANY2) 109EUTFANY 110 111**************** 112* UTF analog for CHAR(N) 113 114 DEFINE("RUNE(N)") :(ERUNE) 115RUNE LT(N,0) :S(FRETURN) 116 GT(N,127) :S(RUNE2) 117* single byte rune; 118 RUNE = CHAR(N) :(RETURN) 119RUNE2 GT(N,2047) :S(RUNE3) 120* two-byte rune; 121 RUNE = CHAR(192 + (N / 64)) 122+ CHAR(128 + REMDR(N, 64)) :(RETURN) 123* three-byte rune; 124RUNE3 GT(N,65535) :S(RUNE4) 125* illegal to encode D800-DFFF -- reserved for UTF-16 surrogates 126 GE(N,55296) LE(N,57343) :S(FRETURN) 127 RUNE = CHAR(224 + N / 4096) 128+ CHAR(128 + REMDR(N / 64, 64)) 129+ CHAR(128 + REMDR(N, 64)) :(RETURN) 130* four-byte rune; 131RUNE4 GT(N,2097151) :S(RUNE5) 132 RUNE = CHAR(240 + N / 262144) 133+ CHAR(128 + REMDR(N / 4096, 64)) 134+ CHAR(128 + REMDR(N / 64, 64)) 135+ CHAR(128 + REMDR(N, 64)) :(RETURN) 136* five-byte rune (no longer legal); 137RUNE5 GT(N,67108863) :S(RUNE6) 138 RUNE = CHAR(248 + N / 16777216) 139+ CHAR(128 + REMDR(N / 262144, 64)) 140+ CHAR(128 + REMDR(N / 4096, 64)) 141+ CHAR(128 + REMDR(N / 64, 64)) 142+ CHAR(128 + REMDR(N, 64)) :(RETURN) 143* six-byte rune (no longer legal); 144RUNE6 GT(N,2147483647) :S(FRETURN) 145 RUNE = CHAR(252 + N / 1073741824) 146+ CHAR(128 + REMDR(N / 16777216, 64)) 147+ CHAR(128 + REMDR(N / 262144, 64)) 148+ CHAR(128 + REMDR(N / 4096, 64)) 149+ CHAR(128 + REMDR(N / 64, 64)) 150+ CHAR(128 + REMDR(N, 64)) :(RETURN) 151ERUNE 152 153**************** 154* UTF analog for SIZE(STR) 155 156 DEFINE("UTFSIZE(STR)") :(EUTFSIZE) 157UTFSIZE STR UTF.RUNE = :F(UTFSIZ2) 158 UTFSIZE = UTFSIZE + 1 :(UTFSIZE) 159UTFSIZ2 IDENT(STR) :S(RETURN)F(FRETURN) 160EUTFSIZE 161 162 163**************** 164* UTF analog for REPLACE() 165 166 DEFINE("UTFREPLACE(IN,FROM,TO)T,R1,R2") :(EUTFREPLACE) 167UTFREPLACE 168 T = TABLE() 169* peel runes off input and output languages one at a time 170UTFREP1 FROM UTF.RUNE0 . R1 = :F(UTFREP2) 171 TO UTF.RUNE0 . R2 = :F(UTFREP2) 172 T<R1> = R2 :(UTFREP1) 173* both input and output languages should now be empty 174UTFREP2 DIFFER(FROM) :S(FRETURN) 175 DIFFER(TO) :S(FRETURN) 176* freeze table for SNOBOL4+? 177* peel runes input one at a time, feed thru table 178 UTFREPLACE = 179UTFREP3 IN UTF.RUNE0 . R1 = :F(UTFREP4) 180 UTFREPLACE = UTFREPLACE (IDENT(T<R1>) R1, T<R1>) :(UTFREP3) 181UTFREP4 IDENT(IN) :S(RETURN)F(FRETURN) 182EUTFREPLACE 183 184**************************************************************** 185* tests 186* 187* &ANCHOR = 1 188* S = "Hello World!!" 189* 190* OUTPUT = UTFREPLACE(S,&UCASE "!",&LCASE "?") 191* 192*L S UTF.RUNE $ OUTPUT = :S(L) 193* 194* "HELLO WORLD!" UTFARB . OUTPUT RPOS(0) 195* "Hello World!" UTFARB UTFANY(&LCASE) . OUTPUT 196* "Hello World!" UTFANY(&UCASE) . OUTPUT 197* 198* OUTPUT = RUNE(0) 199* OUTPUT = RUNE(127) 200* OUTPUT = '-------' 201* OUTPUT = RUNE(128) 202* OUTPUT = RUNE(255) 203* OUTPUT = RUNE(2047) 204* OUTPUT = '-------' 205* OUTPUT = RUNE(2048) 206* OUTPUT = RUNE(65535) 207* OUTPUT = '-------' 208* OUTPUT = RUNE(65536) 209* OUTPUT = RUNE(2097151) 210* OUTPUT = '-------' 211* OUTPUT = RUNE(2097152) 212* OUTPUT = RUNE(67108863) 213* OUTPUT = '-------' 214* OUTPUT = RUNE(67108864) 215* OUTPUT = RUNE(2147483647) 216* 217*END 218