1 #ifndef _LINUX_UTF_H 2 #define _LINUX_UTF_H 3 4 #include <asm/unaligned.h> 5 utf8_to_utf16le(const char * s,__le16 * cp,unsigned len)6static inline int utf8_to_utf16le(const char *s, __le16 *cp, unsigned len) 7 { 8 int count = 0; 9 u8 c; 10 u16 uchar; 11 12 /* 13 * this insists on correct encodings, though not minimal ones. 14 * BUT it currently rejects legit 4-byte UTF-8 code points, 15 * which need surrogate pairs. (Unicode 3.1 can use them.) 16 */ 17 while (len != 0 && (c = (u8) *s++) != 0) { 18 if ((c & 0x80)) { 19 /* 20 * 2-byte sequence: 21 * 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx 22 */ 23 if ((c & 0xe0) == 0xc0) { 24 uchar = (c & 0x1f) << 6; 25 26 c = (u8) *s++; 27 if ((c & 0xc0) != 0x80) 28 goto fail; 29 c &= 0x3f; 30 uchar |= c; 31 32 /* 33 * 3-byte sequence (most CJKV characters): 34 * zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx 35 */ 36 } else if ((c & 0xf0) == 0xe0) { 37 uchar = (c & 0x0f) << 12; 38 39 c = (u8) *s++; 40 if ((c & 0xc0) != 0x80) 41 goto fail; 42 c &= 0x3f; 43 uchar |= c << 6; 44 45 c = (u8) *s++; 46 if ((c & 0xc0) != 0x80) 47 goto fail; 48 c &= 0x3f; 49 uchar |= c; 50 51 /* no bogus surrogates */ 52 if (0xd800 <= uchar && uchar <= 0xdfff) 53 goto fail; 54 55 /* 56 * 4-byte sequence (surrogate pairs, currently rare): 57 * 11101110wwwwzzzzyy + 110111yyyyxxxxxx 58 * = 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx 59 * (uuuuu = wwww + 1) 60 * FIXME accept the surrogate code points (only) 61 */ 62 } else 63 goto fail; 64 } else 65 uchar = c; 66 put_unaligned_le16(uchar, cp++); 67 count++; 68 len--; 69 } 70 return count; 71 fail: 72 return -1; 73 } 74 75 #endif /* _LINUX_UTF_H */ 76