1 /*
2  * The authors of this software are Rob Pike and Ken Thompson.
3  *              Copyright (c) 2002 by Lucent Technologies.
4  *              Portions Copyright (c) 2009 The Go Authors.  All rights reserved.
5  * Permission to use, copy, modify, and distribute this software for any
6  * purpose without fee is hereby granted, provided that this entire notice
7  * is included in all copies of any software which is or includes a copy
8  * or modification of this software and in all copies of the supporting
9  * documentation for such software.
10  * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
11  * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
12  * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
13  * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
14  */
15 #include "phonenumbers/utf/utf.h"
16 #include "phonenumbers/utf/utfdef.h"
17 
18 enum
19 {
20 	Bit1	= 7,
21 	Bitx	= 6,
22 	Bit2	= 5,
23 	Bit3	= 4,
24 	Bit4	= 3,
25 	Bit5	= 2,
26 
27 	T1	= ((1<<(Bit1+1))-1) ^ 0xFF,	/* 0000 0000 */
28 	Tx	= ((1<<(Bitx+1))-1) ^ 0xFF,	/* 1000 0000 */
29 	T2	= ((1<<(Bit2+1))-1) ^ 0xFF,	/* 1100 0000 */
30 	T3	= ((1<<(Bit3+1))-1) ^ 0xFF,	/* 1110 0000 */
31 	T4	= ((1<<(Bit4+1))-1) ^ 0xFF,	/* 1111 0000 */
32 	T5	= ((1<<(Bit5+1))-1) ^ 0xFF,	/* 1111 1000 */
33 
34 	Rune1	= (1<<(Bit1+0*Bitx))-1,		/* 0000 0000 0111 1111 */
35 	Rune2	= (1<<(Bit2+1*Bitx))-1,		/* 0000 0111 1111 1111 */
36 	Rune3	= (1<<(Bit3+2*Bitx))-1,		/* 1111 1111 1111 1111 */
37 	Rune4	= (1<<(Bit4+3*Bitx))-1,		/* 0001 1111 1111 1111 1111 1111 */
38 
39 	Maskx	= (1<<Bitx)-1,			/* 0011 1111 */
40 	Testx	= Maskx ^ 0xFF,			/* 1100 0000 */
41 
42 	SurrogateMin	= 0xD800,
43 	SurrogateMax	= 0xDFFF,
44 
45 	Bad	= Runeerror,
46 };
47 
48 /*
49  * Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24
50  * This is a slower but "safe" version of the old chartorune
51  * that works on strings that are not necessarily null-terminated.
52  *
53  * If you know for sure that your string is null-terminated,
54  * chartorune will be a bit faster.
55  *
 * It is guaranteed not to access any byte at or beyond
 * str+length (the end of the buffer).  This is to avoid
58  * possible access violations.  If the string appears to be
59  * well-formed but incomplete (i.e., to get the whole Rune
60  * we'd need to read past str+length) then we'll set the Rune
61  * to Bad and return 0.
62  *
63  * Note that if we have decoding problems for other
64  * reasons, we return 1 instead of 0.
65  */
66 int
charntorune(Rune * rune,const char * str,int length)67 charntorune(Rune *rune, const char *str, int length)
68 {
69 	int c, c1, c2, c3;
70 	long l;
71 
72 	/* When we're not allowed to read anything */
73 	if(length <= 0) {
74 		goto badlen;
75 	}
76 
77 	/*
78 	 * one character sequence (7-bit value)
79 	 *	00000-0007F => T1
80 	 */
81 	c = *(uchar*)str;
82 	if(c < Tx) {
83 		*rune = (Rune)c;
84 		return 1;
85 	}
86 
87 	// If we can't read more than one character we must stop
88 	if(length <= 1) {
89 		goto badlen;
90 	}
91 
92 	/*
93 	 * two character sequence (11-bit value)
94 	 *	0080-07FF => T2 Tx
95 	 */
96 	c1 = *(uchar*)(str+1) ^ Tx;
97 	if(c1 & Testx)
98 		goto bad;
99 	if(c < T3) {
100 		if(c < T2)
101 			goto bad;
102 		l = ((c << Bitx) | c1) & Rune2;
103 		if(l <= Rune1)
104 			goto bad;
105 		*rune = (Rune)l;
106 		return 2;
107 	}
108 
109 	// If we can't read more than two characters we must stop
110 	if(length <= 2) {
111 		goto badlen;
112 	}
113 
114 	/*
115 	 * three character sequence (16-bit value)
116 	 *	0800-FFFF => T3 Tx Tx
117 	 */
118 	c2 = *(uchar*)(str+2) ^ Tx;
119 	if(c2 & Testx)
120 		goto bad;
121 	if(c < T4) {
122 		l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
123 		if(l <= Rune2)
124 			goto bad;
125 		if (SurrogateMin <= l && l <= SurrogateMax)
126 			goto bad;
127 		*rune = (Rune)l;
128 		return 3;
129 	}
130 
131 	if (length <= 3)
132 		goto badlen;
133 
134 	/*
135 	 * four character sequence (21-bit value)
136 	 *	10000-1FFFFF => T4 Tx Tx Tx
137 	 */
138 	c3 = *(uchar*)(str+3) ^ Tx;
139 	if (c3 & Testx)
140 		goto bad;
141 	if (c < T5) {
142 		l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
143 		if (l <= Rune3 || l > Runemax)
144 			goto bad;
145 		*rune = (Rune)l;
146 		return 4;
147 	}
148 
149 	// Support for 5-byte or longer UTF-8 would go here, but
150 	// since we don't have that, we'll just fall through to bad.
151 
152 	/*
153 	 * bad decoding
154 	 */
155 bad:
156 	*rune = Bad;
157 	return 1;
158 badlen:
159 	*rune = Bad;
160 	return 0;
161 
162 }
163 
164 
165 /*
166  * This is the older "unsafe" version, which works fine on
167  * null-terminated strings.
168  */
169 int
chartorune(Rune * rune,const char * str)170 chartorune(Rune *rune, const char *str)
171 {
172 	int c, c1, c2, c3;
173 	long l;
174 
175 	/*
176 	 * one character sequence
177 	 *	00000-0007F => T1
178 	 */
179 	c = *(uchar*)str;
180 	if(c < Tx) {
181 		*rune = (Rune)c;
182 		return 1;
183 	}
184 
185 	/*
186 	 * two character sequence
187 	 *	0080-07FF => T2 Tx
188 	 */
189 	c1 = *(uchar*)(str+1) ^ Tx;
190 	if(c1 & Testx)
191 		goto bad;
192 	if(c < T3) {
193 		if(c < T2)
194 			goto bad;
195 		l = ((c << Bitx) | c1) & Rune2;
196 		if(l <= Rune1)
197 			goto bad;
198 		*rune = (Rune)l;
199 		return 2;
200 	}
201 
202 	/*
203 	 * three character sequence
204 	 *	0800-FFFF => T3 Tx Tx
205 	 */
206 	c2 = *(uchar*)(str+2) ^ Tx;
207 	if(c2 & Testx)
208 		goto bad;
209 	if(c < T4) {
210 		l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
211 		if(l <= Rune2)
212 			goto bad;
213 		if (SurrogateMin <= l && l <= SurrogateMax)
214 			goto bad;
215 		*rune = (Rune)l;
216 		return 3;
217 	}
218 
219 	/*
220 	 * four character sequence (21-bit value)
221 	 *	10000-1FFFFF => T4 Tx Tx Tx
222 	 */
223 	c3 = *(uchar*)(str+3) ^ Tx;
224 	if (c3 & Testx)
225 		goto bad;
226 	if (c < T5) {
227 		l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
228 		if (l <= Rune3 || l > Runemax)
229 			goto bad;
230 		*rune = (Rune)l;
231 		return 4;
232 	}
233 
234 	/*
235 	 * Support for 5-byte or longer UTF-8 would go here, but
236 	 * since we don't have that, we'll just fall through to bad.
237 	 */
238 
239 	/*
240 	 * bad decoding
241 	 */
242 bad:
243 	*rune = Bad;
244 	return 1;
245 }
246 
247 int
isvalidcharntorune(const char * str,int length,Rune * rune,int * consumed)248 isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed)
249 {
250 	*consumed = charntorune(rune, str, length);
251 	return *rune != Runeerror || *consumed == 3;
252 }
253 
254 int
runetochar(char * str,const Rune * rune)255 runetochar(char *str, const Rune *rune)
256 {
257 	/* Runes are signed, so convert to unsigned for range check. */
258 	unsigned long c;
259 
260 	/*
261 	 * one character sequence
262 	 *	00000-0007F => 00-7F
263 	 */
264 	c = *rune;
265 	if(c <= Rune1) {
266 		str[0] = (char)c;
267 		return 1;
268 	}
269 
270 	/*
271 	 * two character sequence
272 	 *	0080-07FF => T2 Tx
273 	 */
274 	if(c <= Rune2) {
275 		str[0] = (char)(T2 | (c >> 1*Bitx));
276 		str[1] = (char)(Tx | (c & Maskx));
277 		return 2;
278 	}
279 
280 	/*
281 	 * If the Rune is out of range or a surrogate half, convert it to the error rune.
282 	 * Do this test here because the error rune encodes to three bytes.
283 	 * Doing it earlier would duplicate work, since an out of range
284 	 * Rune wouldn't have fit in one or two bytes.
285 	 */
286 	if (c > Runemax)
287 		c = Runeerror;
288 	if (SurrogateMin <= c && c <= SurrogateMax)
289 		c = Runeerror;
290 
291 	/*
292 	 * three character sequence
293 	 *	0800-FFFF => T3 Tx Tx
294 	 */
295 	if (c <= Rune3) {
296 		str[0] = (char)(T3 |  (c >> 2*Bitx));
297 		str[1] = (char)(Tx | ((c >> 1*Bitx) & Maskx));
298 		str[2] = (char)(Tx |  (c & Maskx));
299 		return 3;
300 	}
301 
302 	/*
303 	 * four character sequence (21-bit value)
304 	 *     10000-1FFFFF => T4 Tx Tx Tx
305 	 */
306 	str[0] = (char)(T4 | (c >> 3*Bitx));
307 	str[1] = (char)(Tx | ((c >> 2*Bitx) & Maskx));
308 	str[2] = (char)(Tx | ((c >> 1*Bitx) & Maskx));
309 	str[3] = (char)(Tx | (c & Maskx));
310 	return 4;
311 }
312 
313 int
runelen(Rune rune)314 runelen(Rune rune)
315 {
316 	char str[10];
317 
318 	return runetochar(str, &rune);
319 }
320 
321 int
runenlen(const Rune * r,int nrune)322 runenlen(const Rune *r, int nrune)
323 {
324 	int nb, c;
325 
326 	nb = 0;
327 	while(nrune--) {
328 		c = (int)*r++;
329 		if (c <= Rune1)
330 			nb++;
331 		else if (c <= Rune2)
332 			nb += 2;
333 		else if (c <= Rune3)
334 			nb += 3;
335 		else /* assert(c <= Rune4) */
336 			nb += 4;
337 	}
338 	return nb;
339 }
340 
341 int
fullrune(const char * str,int n)342 fullrune(const char *str, int n)
343 {
344 	if (n > 0) {
345 		int c = *(uchar*)str;
346 		if (c < Tx)
347 			return 1;
348 		if (n > 1) {
349 			if (c < T3)
350 				return 1;
351 			if (n > 2) {
352 				if (c < T4 || n > 3)
353 					return 1;
354 			}
355 		}
356 	}
357 	return 0;
358 }
359