1 #ifndef _LINUX_UTF_H
2 #define _LINUX_UTF_H
3 
4 #include <asm/unaligned.h>
5 
/**
 * utf8_to_utf16le - convert a NUL-terminated UTF-8 string to UTF-16LE
 * @s: UTF-8 input; conversion stops at the first NUL byte
 * @cp: output buffer (may be unaligned); at most @len 16-bit code units
 *	are stored through it, so it must have room for that many
 * @len: maximum number of UTF-16 code units to produce
 *
 * Note that @len limits the number of code units written, not the number
 * of input bytes consumed: a multi-byte UTF-8 sequence costs one unit
 * (or two, for a surrogate pair).  The output is not NUL-terminated.
 *
 * Returns the number of 16-bit code units stored, or -1 if a malformed
 * or unsupported sequence is found before the limit is reached.
 */
static inline int utf8_to_utf16le(const char *s, __le16 *cp, unsigned len)
{
	int	count = 0;
	u8	c;
	u16	uchar;

	/*
	 * this insists on correct encodings, though not minimal ones
	 * for 2-byte and 3-byte sequences.  4-byte sequences are only
	 * accepted for code points that really need a surrogate pair.
	 */
	while (len != 0 && (c = (u8) *s++) != 0) {
		if ((c & 0x80)) {
			/*
			 * 2-byte sequence:
			 * 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
			 */
			if ((c & 0xe0) == 0xc0) {
				uchar = (c & 0x1f) << 6;

				c = (u8) *s++;
				if ((c & 0xc0) != 0x80)
					goto fail;
				c &= 0x3f;
				uchar |= c;

			/*
			 * 3-byte sequence (most CJKV characters):
			 * zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
			 */
			} else if ((c & 0xf0) == 0xe0) {
				uchar = (c & 0x0f) << 12;

				c = (u8) *s++;
				if ((c & 0xc0) != 0x80)
					goto fail;
				c &= 0x3f;
				uchar |= c << 6;

				c = (u8) *s++;
				if ((c & 0xc0) != 0x80)
					goto fail;
				c &= 0x3f;
				uchar |= c;

				/* no bogus surrogates */
				if (0xd800 <= uchar && uchar <= 0xdfff)
					goto fail;

			/*
			 * 4-byte sequence (surrogate pairs):
			 * 110110wwwwzzzzyy + 110111yyyyxxxxxx
			 *     = 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
			 * (uuuuu = wwww + 1)
			 */
			} else if ((c & 0xf8) == 0xf0) {
				/* unsigned long: the code point needs 21 bits */
				unsigned long	ucs4;

				ucs4 = (unsigned long) (c & 0x07) << 18;

				c = (u8) *s++;
				if ((c & 0xc0) != 0x80)
					goto fail;
				ucs4 |= (unsigned long) (c & 0x3f) << 12;

				c = (u8) *s++;
				if ((c & 0xc0) != 0x80)
					goto fail;
				ucs4 |= (unsigned long) (c & 0x3f) << 6;

				c = (u8) *s++;
				if ((c & 0xc0) != 0x80)
					goto fail;
				ucs4 |= (unsigned long) (c & 0x3f);

				/*
				 * only code points that actually need a
				 * surrogate pair; anything else is overlong
				 * or beyond the end of Unicode
				 */
				if (ucs4 < 0x10000 || ucs4 > 0x10ffff)
					goto fail;

				/* never store just half of a pair */
				if (len < 2)
					break;

				/* high surrogate here, low one falls through */
				ucs4 -= 0x10000;
				put_unaligned_le16((u16) (0xd800 | (ucs4 >> 10)),
						cp++);
				count++;
				len--;
				uchar = (u16) (0xdc00 | (ucs4 & 0x3ff));

			} else {
				/* stray continuation byte or invalid lead */
				goto fail;
			}
		} else {
			/* plain 7-bit ASCII */
			uchar = c;
		}
		put_unaligned_le16(uchar, cp++);
		count++;
		len--;
	}
	return count;
fail:
	return -1;
}
74 
75 #endif /* _LINUX_UTF_H */
76