1 /*
2  * The authors of this software are Rob Pike and Ken Thompson.
3  *              Copyright (c) 2002 by Lucent Technologies.
4  * Permission to use, copy, modify, and distribute this software for any
5  * purpose without fee is hereby granted, provided that this entire notice
6  * is included in all copies of any software which is or includes a copy
7  * or modification of this software and in all copies of the supporting
8  * documentation for such software.
9  * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
10  * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
11  * ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
12  * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
13  */
14 #include <stdarg.h>
15 #include <string.h>
16 #include "plan9.h"
17 #include "utf.h"
18 
/*
 * Bit-layout constants shared by the UTF-8 routines in this file.
 *
 *	BitN	number of payload bits in the lead byte of an N-byte sequence
 *	Bitx	number of payload bits in each continuation byte
 *	TN, Tx	high-order tag pattern of a lead / continuation byte
 *	RuneN	largest value representable in an N-byte sequence
 */
19 enum
20 {
21 	Bit1	= 7,
22 	Bitx	= 6,
23 	Bit2	= 5,
24 	Bit3	= 4,
25 	Bit4	= 3,
26 	Bit5	= 2,
27 
	/* tag bytes: all bits above position BitN of the low byte are set */
28 	T1	= ((1<<(Bit1+1))-1) ^ 0xFF,	/* 0000 0000 */
29 	Tx	= ((1<<(Bitx+1))-1) ^ 0xFF,	/* 1000 0000 */
30 	T2	= ((1<<(Bit2+1))-1) ^ 0xFF,	/* 1100 0000 */
31 	T3	= ((1<<(Bit3+1))-1) ^ 0xFF,	/* 1110 0000 */
32 	T4	= ((1<<(Bit4+1))-1) ^ 0xFF,	/* 1111 0000 */
33 	T5	= ((1<<(Bit5+1))-1) ^ 0xFF,	/* 1111 1000 */
34 
	/* upper bounds: BitN lead bits plus (N-1)*Bitx continuation bits */
35 	Rune1	= (1<<(Bit1+0*Bitx))-1,		/* 0000 0000 0000 0000 0111 1111 */
36 	Rune2	= (1<<(Bit2+1*Bitx))-1,		/* 0000 0000 0000 0111 1111 1111 */
37 	Rune3	= (1<<(Bit3+2*Bitx))-1,		/* 0000 0000 1111 1111 1111 1111 */
38 	Rune4	= (1<<(Bit4+3*Bitx))-1,		/* 0001 1111 1111 1111 1111 1111 */
39 
	/*
	 * Maskx selects the payload of a continuation byte.  Testx is applied
	 * to (byte ^ Tx): the result is zero only when the byte's two high
	 * bits are exactly 10.
	 */
40 	Maskx	= (1<<Bitx)-1,			/* 0011 1111 */
41 	Testx	= Maskx ^ 0xFF,			/* 1100 0000 */
42 
	/* value chartorune stores in *rune when the input is malformed */
43 	Bad	= Runeerror
44 };
45 
46 int
chartorune(Rune * rune,char * str)47 chartorune(Rune *rune, char *str)
48 {
49 	int c, c1, c2, c3;
50 	long l;
51 
52 	/*
53 	 * one character sequence
54 	 *	00000-0007F => T1
55 	 */
56 	c = *(uchar*)str;
57 	if(c < Tx) {
58 		*rune = c;
59 		return 1;
60 	}
61 
62 	/*
63 	 * two character sequence
64 	 *	0080-07FF => T2 Tx
65 	 */
66 	c1 = *(uchar*)(str+1) ^ Tx;
67 	if(c1 & Testx)
68 		goto bad;
69 	if(c < T3) {
70 		if(c < T2)
71 			goto bad;
72 		l = ((c << Bitx) | c1) & Rune2;
73 		if(l <= Rune1)
74 			goto bad;
75 		*rune = l;
76 		return 2;
77 	}
78 
79 	/*
80 	 * three character sequence
81 	 *	0800-FFFF => T3 Tx Tx
82 	 */
83 	c2 = *(uchar*)(str+2) ^ Tx;
84 	if(c2 & Testx)
85 		goto bad;
86 	if(c < T4) {
87 		l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
88 		if(l <= Rune2)
89 			goto bad;
90 		*rune = l;
91 		return 3;
92 	}
93 
94 	/*
95 	 * four character sequence
96 	 *	10000-10FFFF => T4 Tx Tx Tx
97 	 */
98 	if(UTFmax >= 4) {
99 		c3 = *(uchar*)(str+3) ^ Tx;
100 		if(c3 & Testx)
101 			goto bad;
102 		if(c < T5) {
103 			l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
104 			if(l <= Rune3)
105 				goto bad;
106 			if(l > Runemax)
107 				goto bad;
108 			*rune = l;
109 			return 4;
110 		}
111 	}
112 
113 	/*
114 	 * bad decoding
115 	 */
116 bad:
117 	*rune = Bad;
118 	return 1;
119 }
120 
121 int
runetochar(char * str,Rune * rune)122 runetochar(char *str, Rune *rune)
123 {
124 	long c;
125 
126 	/*
127 	 * one character sequence
128 	 *	00000-0007F => 00-7F
129 	 */
130 	c = *rune;
131 	if(c <= Rune1) {
132 		str[0] = c;
133 		return 1;
134 	}
135 
136 	/*
137 	 * two character sequence
138 	 *	00080-007FF => T2 Tx
139 	 */
140 	if(c <= Rune2) {
141 		str[0] = T2 | (c >> 1*Bitx);
142 		str[1] = Tx | (c & Maskx);
143 		return 2;
144 	}
145 
146 	/*
147 	 * three character sequence
148 	 *	00800-0FFFF => T3 Tx Tx
149 	 */
150 	if(c > Runemax)
151 		c = Runeerror;
152 	if(c <= Rune3) {
153 		str[0] = T3 |  (c >> 2*Bitx);
154 		str[1] = Tx | ((c >> 1*Bitx) & Maskx);
155 		str[2] = Tx |  (c & Maskx);
156 		return 3;
157 	}
158 
159 	/*
160 	 * four character sequence
161 	 *	010000-1FFFFF => T4 Tx Tx Tx
162 	 */
163 	str[0] = T4 |  (c >> 3*Bitx);
164 	str[1] = Tx | ((c >> 2*Bitx) & Maskx);
165 	str[2] = Tx | ((c >> 1*Bitx) & Maskx);
166 	str[3] = Tx |  (c & Maskx);
167 	return 4;
168 }
169 
170 int
runelen(long c)171 runelen(long c)
172 {
173 	Rune rune;
174 	char str[10];
175 
176 	rune = c;
177 	return runetochar(str, &rune);
178 }
179 
180 int
runenlen(Rune * r,int nrune)181 runenlen(Rune *r, int nrune)
182 {
183 	int nb, c;
184 
185 	nb = 0;
186 	while(nrune--) {
187 		c = *r++;
188 		if(c <= Rune1)
189 			nb++;
190 		else
191 		if(c <= Rune2)
192 			nb += 2;
193 		else
194 		if(c <= Rune3 || c > Runemax)
195 			nb += 3;
196 		else
197 			nb += 4;
198 	}
199 	return nb;
200 }
201 
202 int
fullrune(char * str,int n)203 fullrune(char *str, int n)
204 {
205 	int c;
206 
207 	if(n <= 0)
208 		return 0;
209 	c = *(uchar*)str;
210 	if(c < Tx)
211 		return 1;
212 	if(c < T3)
213 		return n >= 2;
214 	if(UTFmax == 3 || c < T4)
215 		return n >= 3;
216 	return n >= 4;
217 }
218