1 /*
2  * The authors of this software are Rob Pike and Ken Thompson.
3  *              Copyright (c) 2002 by Lucent Technologies.
4  * Permission to use, copy, modify, and distribute this software for any
5  * purpose without fee is hereby granted, provided that this entire notice
6  * is included in all copies of any software which is or includes a copy
7  * or modification of this software and in all copies of the supporting
8  * documentation for such software.
9  * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
10  * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
11  * ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
12  * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
13  */
14 #include <stdlib.h>
15 #include <string.h>
16 
17 #include "utf.h"
18 #include "utfdata.h"
19 
20 #define nelem(a) (int)(sizeof (a) / sizeof (a)[0])
21 
22 typedef unsigned char uchar;
23 
24 enum
25 {
26 	Bit1	= 7,
27 	Bitx	= 6,
28 	Bit2	= 5,
29 	Bit3	= 4,
30 	Bit4	= 3,
31 	Bit5	= 2,
32 
33 	T1	= ((1<<(Bit1+1))-1) ^ 0xFF,	/* 0000 0000 */
34 	Tx	= ((1<<(Bitx+1))-1) ^ 0xFF,	/* 1000 0000 */
35 	T2	= ((1<<(Bit2+1))-1) ^ 0xFF,	/* 1100 0000 */
36 	T3	= ((1<<(Bit3+1))-1) ^ 0xFF,	/* 1110 0000 */
37 	T4	= ((1<<(Bit4+1))-1) ^ 0xFF,	/* 1111 0000 */
38 	T5	= ((1<<(Bit5+1))-1) ^ 0xFF,	/* 1111 1000 */
39 
40 	Rune1	= (1<<(Bit1+0*Bitx))-1,		/* 0000 0000 0000 0000 0111 1111 */
41 	Rune2	= (1<<(Bit2+1*Bitx))-1,		/* 0000 0000 0000 0111 1111 1111 */
42 	Rune3	= (1<<(Bit3+2*Bitx))-1,		/* 0000 0000 1111 1111 1111 1111 */
43 	Rune4	= (1<<(Bit4+3*Bitx))-1,		/* 0001 1111 1111 1111 1111 1111 */
44 
45 	Maskx	= (1<<Bitx)-1,			/* 0011 1111 */
46 	Testx	= Maskx ^ 0xFF,			/* 1100 0000 */
47 
48 	Bad	= Runeerror
49 };
50 
51 int
chartorune(Rune * rune,const char * str)52 chartorune(Rune *rune, const char *str)
53 {
54 	int c, c1, c2, c3;
55 	int l;
56 
57 	/* overlong null character */
58 	if((uchar)str[0] == 0xc0 && (uchar)str[1] == 0x80) {
59 		*rune = 0;
60 		return 2;
61 	}
62 
63 	/*
64 	 * one character sequence
65 	 *	00000-0007F => T1
66 	 */
67 	c = *(uchar*)str;
68 	if(c < Tx) {
69 		*rune = c;
70 		return 1;
71 	}
72 
73 	/*
74 	 * two character sequence
75 	 *	0080-07FF => T2 Tx
76 	 */
77 	c1 = *(uchar*)(str+1) ^ Tx;
78 	if(c1 & Testx)
79 		goto bad;
80 	if(c < T3) {
81 		if(c < T2)
82 			goto bad;
83 		l = ((c << Bitx) | c1) & Rune2;
84 		if(l <= Rune1)
85 			goto bad;
86 		*rune = l;
87 		return 2;
88 	}
89 
90 	/*
91 	 * three character sequence
92 	 *	0800-FFFF => T3 Tx Tx
93 	 */
94 	c2 = *(uchar*)(str+2) ^ Tx;
95 	if(c2 & Testx)
96 		goto bad;
97 	if(c < T4) {
98 		l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
99 		if(l <= Rune2)
100 			goto bad;
101 		*rune = l;
102 		return 3;
103 	}
104 
105 	/*
106 	 * four character sequence
107 	 *	10000-10FFFF => T4 Tx Tx Tx
108 	 */
109 	if(UTFmax >= 4) {
110 		c3 = *(uchar*)(str+3) ^ Tx;
111 		if(c3 & Testx)
112 			goto bad;
113 		if(c < T5) {
114 			l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
115 			if(l <= Rune3)
116 				goto bad;
117 			if(l > Runemax)
118 				goto bad;
119 			*rune = l;
120 			return 4;
121 		}
122 	}
123 
124 	/*
125 	 * bad decoding
126 	 */
127 bad:
128 	*rune = Bad;
129 	return 1;
130 }
131 
132 int
runetochar(char * str,const Rune * rune)133 runetochar(char *str, const Rune *rune)
134 {
135 	int c = *rune;
136 
137 	/* overlong null character */
138 	if (c == 0) {
139 		str[0] = (char)0xc0;
140 		str[1] = (char)0x80;
141 		return 2;
142 	}
143 
144 	/*
145 	 * one character sequence
146 	 *	00000-0007F => 00-7F
147 	 */
148 	if(c <= Rune1) {
149 		str[0] = c;
150 		return 1;
151 	}
152 
153 	/*
154 	 * two character sequence
155 	 *	00080-007FF => T2 Tx
156 	 */
157 	if(c <= Rune2) {
158 		str[0] = T2 | (c >> 1*Bitx);
159 		str[1] = Tx | (c & Maskx);
160 		return 2;
161 	}
162 
163 	/*
164 	 * three character sequence
165 	 *	00800-0FFFF => T3 Tx Tx
166 	 */
167 	if(c > Runemax)
168 		c = Runeerror;
169 	if(c <= Rune3) {
170 		str[0] = T3 |  (c >> 2*Bitx);
171 		str[1] = Tx | ((c >> 1*Bitx) & Maskx);
172 		str[2] = Tx |  (c & Maskx);
173 		return 3;
174 	}
175 
176 	/*
177 	 * four character sequence
178 	 *	010000-1FFFFF => T4 Tx Tx Tx
179 	 */
180 	str[0] = T4 |  (c >> 3*Bitx);
181 	str[1] = Tx | ((c >> 2*Bitx) & Maskx);
182 	str[2] = Tx | ((c >> 1*Bitx) & Maskx);
183 	str[3] = Tx |  (c & Maskx);
184 	return 4;
185 }
186 
187 int
runelen(int c)188 runelen(int c)
189 {
190 	Rune rune;
191 	char str[10];
192 
193 	rune = c;
194 	return runetochar(str, &rune);
195 }
196 
197 int
utflen(const char * s)198 utflen(const char *s)
199 {
200 	int c;
201 	int n;
202 	Rune rune;
203 
204 	n = 0;
205 	for(;;) {
206 		c = *(uchar*)s;
207 		if(c < Runeself) {
208 			if(c == 0)
209 				return n;
210 			s++;
211 		} else
212 			s += chartorune(&rune, s);
213 		n++;
214 	}
215 }
216 
217 static const Rune *
ucd_bsearch(Rune c,const Rune * t,int n,int ne)218 ucd_bsearch(Rune c, const Rune *t, int n, int ne)
219 {
220 	const Rune *p;
221 	int m;
222 
223 	while(n > 1) {
224 		m = n/2;
225 		p = t + m*ne;
226 		if(c >= p[0]) {
227 			t = p;
228 			n = n-m;
229 		} else
230 			n = m;
231 	}
232 	if(n && c >= t[0])
233 		return t;
234 	return 0;
235 }
236 
237 Rune
tolowerrune(Rune c)238 tolowerrune(Rune c)
239 {
240 	const Rune *p;
241 
242 	p = ucd_bsearch(c, ucd_tolower2, nelem(ucd_tolower2)/3, 3);
243 	if(p && c >= p[0] && c <= p[1])
244 		return c + p[2];
245 	p = ucd_bsearch(c, ucd_tolower1, nelem(ucd_tolower1)/2, 2);
246 	if(p && c == p[0])
247 		return c + p[1];
248 	return c;
249 }
250 
251 Rune
toupperrune(Rune c)252 toupperrune(Rune c)
253 {
254 	const Rune *p;
255 
256 	p = ucd_bsearch(c, ucd_toupper2, nelem(ucd_toupper2)/3, 3);
257 	if(p && c >= p[0] && c <= p[1])
258 		return c + p[2];
259 	p = ucd_bsearch(c, ucd_toupper1, nelem(ucd_toupper1)/2, 2);
260 	if(p && c == p[0])
261 		return c + p[1];
262 	return c;
263 }
264 
265 int
islowerrune(Rune c)266 islowerrune(Rune c)
267 {
268 	const Rune *p;
269 
270 	p = ucd_bsearch(c, ucd_toupper2, nelem(ucd_toupper2)/3, 3);
271 	if(p && c >= p[0] && c <= p[1])
272 		return 1;
273 	p = ucd_bsearch(c, ucd_toupper1, nelem(ucd_toupper1)/2, 2);
274 	if(p && c == p[0])
275 		return 1;
276 	return 0;
277 }
278 
279 int
isupperrune(Rune c)280 isupperrune(Rune c)
281 {
282 	const Rune *p;
283 
284 	p = ucd_bsearch(c, ucd_tolower2, nelem(ucd_tolower2)/3, 3);
285 	if(p && c >= p[0] && c <= p[1])
286 		return 1;
287 	p = ucd_bsearch(c, ucd_tolower1, nelem(ucd_tolower1)/2, 2);
288 	if(p && c == p[0])
289 		return 1;
290 	return 0;
291 }
292 
293 int
isalpharune(Rune c)294 isalpharune(Rune c)
295 {
296 	const Rune *p;
297 
298 	p = ucd_bsearch(c, ucd_alpha2, nelem(ucd_alpha2)/2, 2);
299 	if(p && c >= p[0] && c <= p[1])
300 		return 1;
301 	p = ucd_bsearch(c, ucd_alpha1, nelem(ucd_alpha1), 1);
302 	if(p && c == p[0])
303 		return 1;
304 	return 0;
305 }
306