1 /*
2  * The authors of this software are Rob Pike and Ken Thompson.
3  *              Copyright (c) 2002 by Lucent Technologies.
4  * Permission to use, copy, modify, and distribute this software for any
5  * purpose without fee is hereby granted, provided that this entire notice
6  * is included in all copies of any software which is or includes a copy
7  * or modification of this software and in all copies of the supporting
8  * documentation for such software.
9  * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
10  * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
11  * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
12  * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
13  */
14 #include <stdarg.h>
15 #include <string.h>
16 #include "third_party/utf/utf.h"
17 #include "third_party/utf/utfdef.h"
18 
19 enum {
20   Bit1 = 7,
21   Bitx = 6,
22   Bit2 = 5,
23   Bit3 = 4,
24   Bit4 = 3,
25   Bit5 = 2,
26 
27   T1 = ((1 << (Bit1 + 1)) - 1) ^ 0xFF, /* 0000 0000 */
28   Tx = ((1 << (Bitx + 1)) - 1) ^ 0xFF, /* 1000 0000 */
29   T2 = ((1 << (Bit2 + 1)) - 1) ^ 0xFF, /* 1100 0000 */
30   T3 = ((1 << (Bit3 + 1)) - 1) ^ 0xFF, /* 1110 0000 */
31   T4 = ((1 << (Bit4 + 1)) - 1) ^ 0xFF, /* 1111 0000 */
32   T5 = ((1 << (Bit5 + 1)) - 1) ^ 0xFF, /* 1111 1000 */
33 
34   Rune1 = (1 << (Bit1 + 0 * Bitx)) - 1, /* 0000 0000 0111 1111 */
35   Rune2 = (1 << (Bit2 + 1 * Bitx)) - 1, /* 0000 0111 1111 1111 */
36   Rune3 = (1 << (Bit3 + 2 * Bitx)) - 1, /* 1111 1111 1111 1111 */
37   Rune4 = (1 << (Bit4 + 3 * Bitx)) - 1,
38   /* 0001 1111 1111 1111 1111 1111 */
39 
40   Maskx = (1 << Bitx) - 1, /* 0011 1111 */
41   Testx = Maskx ^ 0xFF,    /* 1100 0000 */
42 
43   Bad = Runeerror,
44 };
45 
46 /*
47  * Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24
48  * This is a slower but "safe" version of the old chartorune
49  * that works on strings that are not necessarily null-terminated.
50  *
51  * If you know for sure that your string is null-terminated,
52  * chartorune will be a bit faster.
53  *
54  * It is guaranteed not to attempt to access "length"
55  * past the incoming pointer.  This is to avoid
56  * possible access violations.  If the string appears to be
57  * well-formed but incomplete (i.e., to get the whole Rune
58  * we'd need to read past str+length) then we'll set the Rune
59  * to Bad and return 0.
60  *
61  * Note that if we have decoding problems for other
62  * reasons, we return 1 instead of 0.
63  */
charntorune(Rune * rune,const char * str,int length)64 int charntorune(Rune *rune, const char *str, int length) {
65   int c, c1, c2, c3;
66   long l;
67 
68   /* When we're not allowed to read anything */
69   if (length <= 0) {
70     goto badlen;
71   }
72 
73   /*
74    * one character sequence (7-bit value)
75    *	00000-0007F => T1
76    */
77   c = *(uchar *)str;
78   if (c < Tx) {
79     *rune = c;
80     return 1;
81   }
82 
83   // If we can't read more than one character we must stop
84   if (length <= 1) {
85     goto badlen;
86   }
87 
88   /*
89    * two character sequence (11-bit value)
90    *	0080-07FF => T2 Tx
91    */
92   c1 = *(uchar *)(str + 1) ^ Tx;
93   if (c1 & Testx)
94     goto bad;
95   if (c < T3) {
96     if (c < T2)
97       goto bad;
98     l = ((c << Bitx) | c1) & Rune2;
99     if (l <= Rune1)
100       goto bad;
101     *rune = l;
102     return 2;
103   }
104 
105   // If we can't read more than two characters we must stop
106   if (length <= 2) {
107     goto badlen;
108   }
109 
110   /*
111    * three character sequence (16-bit value)
112    *	0800-FFFF => T3 Tx Tx
113    */
114   c2 = *(uchar *)(str + 2) ^ Tx;
115   if (c2 & Testx)
116     goto bad;
117   if (c < T4) {
118     l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
119     if (l <= Rune2)
120       goto bad;
121     *rune = l;
122     return 3;
123   }
124 
125   if (length <= 3)
126     goto badlen;
127 
128   /*
129    * four character sequence (21-bit value)
130    *	10000-1FFFFF => T4 Tx Tx Tx
131    */
132   c3 = *(uchar *)(str + 3) ^ Tx;
133   if (c3 & Testx)
134     goto bad;
135   if (c < T5) {
136     l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
137     if (l <= Rune3)
138       goto bad;
139     if (l > Runemax)
140       goto bad;
141     *rune = l;
142     return 4;
143   }
144 
145   // Support for 5-byte or longer UTF-8 would go here, but
146   // since we don't have that, we'll just fall through to bad.
147 
148   /*
149    * bad decoding
150    */
151 bad:
152   *rune = Bad;
153   return 1;
154 badlen:
155   *rune = Bad;
156   return 0;
157 }
158 
159 /*
160  * This is the older "unsafe" version, which works fine on
161  * null-terminated strings.
162  */
chartorune(Rune * rune,const char * str)163 int chartorune(Rune *rune, const char *str) {
164   int c, c1, c2, c3;
165   long l;
166 
167   /*
168    * one character sequence
169    *	00000-0007F => T1
170    */
171   c = *(uchar *)str;
172   if (c < Tx) {
173     *rune = c;
174     return 1;
175   }
176 
177   /*
178    * two character sequence
179    *	0080-07FF => T2 Tx
180    */
181   c1 = *(uchar *)(str + 1) ^ Tx;
182   if (c1 & Testx)
183     goto bad;
184   if (c < T3) {
185     if (c < T2)
186       goto bad;
187     l = ((c << Bitx) | c1) & Rune2;
188     if (l <= Rune1)
189       goto bad;
190     *rune = l;
191     return 2;
192   }
193 
194   /*
195    * three character sequence
196    *	0800-FFFF => T3 Tx Tx
197    */
198   c2 = *(uchar *)(str + 2) ^ Tx;
199   if (c2 & Testx)
200     goto bad;
201   if (c < T4) {
202     l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
203     if (l <= Rune2)
204       goto bad;
205     *rune = l;
206     return 3;
207   }
208 
209   /*
210    * four character sequence (21-bit value)
211    *	10000-1FFFFF => T4 Tx Tx Tx
212    */
213   c3 = *(uchar *)(str + 3) ^ Tx;
214   if (c3 & Testx)
215     goto bad;
216   if (c < T5) {
217     l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
218     if (l <= Rune3)
219       goto bad;
220     if (l > Runemax)
221       goto bad;
222     *rune = l;
223     return 4;
224   }
225 
226   /*
227    * Support for 5-byte or longer UTF-8 would go here, but
228    * since we don't have that, we'll just fall through to bad.
229    */
230 
231   /*
232    * bad decoding
233    */
234 bad:
235   *rune = Bad;
236   return 1;
237 }
238 
/*
 * isvalidcharntorune: decode one Rune from at most "length" bytes of str
 * with charntorune and report whether the decode was valid.
 *
 * *rune receives the decoded value and *consumed the value returned by
 * charntorune.  The result is 1 when *rune is not Runeerror, or when
 * exactly 3 bytes were consumed (charntorune only reports 3 for a
 * successful decode, so the input really encoded Runeerror); otherwise 0.
 */
int isvalidcharntorune(const char *str, int length, Rune *rune, int *consumed) {
  int used = charntorune(rune, str, length);

  *consumed = used;
  if (*rune != Runeerror) {
    return 1;
  }
  return used == 3;
}
243 
runetochar(char * str,const Rune * rune)244 int runetochar(char *str, const Rune *rune) {
245   /* Runes are signed, so convert to unsigned for range check. */
246   unsigned long c;
247 
248   /*
249    * one character sequence
250    *	00000-0007F => 00-7F
251    */
252   c = *rune;
253   if (c <= Rune1) {
254     str[0] = c;
255     return 1;
256   }
257 
258   /*
259    * two character sequence
260    *	0080-07FF => T2 Tx
261    */
262   if (c <= Rune2) {
263     str[0] = T2 | (c >> 1 * Bitx);
264     str[1] = Tx | (c & Maskx);
265     return 2;
266   }
267 
268   /*
269    * If the Rune is out of range, convert it to the error rune.
270    * Do this test here because the error rune encodes to three bytes.
271    * Doing it earlier would duplicate work, since an out of range
272    * Rune wouldn't have fit in one or two bytes.
273    */
274   if (c > Runemax)
275     c = Runeerror;
276 
277   /*
278    * three character sequence
279    *	0800-FFFF => T3 Tx Tx
280    */
281   if (c <= Rune3) {
282     str[0] = T3 | (c >> 2 * Bitx);
283     str[1] = Tx | ((c >> 1 * Bitx) & Maskx);
284     str[2] = Tx | (c & Maskx);
285     return 3;
286   }
287 
288   /*
289    * four character sequence (21-bit value)
290    *     10000-1FFFFF => T4 Tx Tx Tx
291    */
292   str[0] = T4 | (c >> 3 * Bitx);
293   str[1] = Tx | ((c >> 2 * Bitx) & Maskx);
294   str[2] = Tx | ((c >> 1 * Bitx) & Maskx);
295   str[3] = Tx | (c & Maskx);
296   return 4;
297 }
298 
/*
 * runelen: number of bytes runetochar produces for the given Rune.
 * The Rune is encoded into a scratch buffer that is then discarded;
 * only the length is kept.
 */
int runelen(Rune rune) {
  char scratch[10];
  int nbytes;

  nbytes = runetochar(scratch, &rune);
  return nbytes;
}
304 
runenlen(const Rune * r,int nrune)305 int runenlen(const Rune *r, int nrune) {
306   int nb;
307   ulong c; /* Rune is signed, so use unsigned for range check. */
308 
309   nb = 0;
310   while (nrune--) {
311     c = *r++;
312     if (c <= Rune1)
313       nb++;
314     else if (c <= Rune2)
315       nb += 2;
316     else if (c <= Rune3)
317       nb += 3;
318     else if (c <= Runemax)
319       nb += 4;
320     else
321       nb += 3; /* Runeerror = 0xFFFD, see runetochar */
322   }
323   return nb;
324 }
325 
fullrune(const char * str,int n)326 int fullrune(const char *str, int n) {
327   if (n > 0) {
328     int c = *(uchar *)str;
329     if (c < Tx)
330       return 1;
331     if (n > 1) {
332       if (c < T3)
333         return 1;
334       if (n > 2) {
335         if (c < T4 || n > 3)
336           return 1;
337       }
338     }
339   }
340   return 0;
341 }
342