1 /* $XFree86: xc/lib/X11/lcUniConv/utf8.h,v 1.3 2000/11/28 18:50:07 dawes Exp $ */
2 
3 /*
4  * UTF-8
5  */
6 
7 /* Specification: RFC 2279 */
8 
9 static int
utf8_mbtowc(conv_t conv,ucs4_t * pwc,const unsigned char * s,int n)10 utf8_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)
11 {
12   unsigned char c = s[0];
13 
14   if (c < 0x80) {
15     *pwc = c;
16     return 1;
17   } else if (c < 0xc2) {
18     return RET_ILSEQ;
19   } else if (c < 0xe0) {
20     if (n < 2)
21       return RET_TOOFEW(0);
22     if (!((s[1] ^ 0x80) < 0x40))
23       return RET_ILSEQ;
24     *pwc = ((ucs4_t) (c & 0x1f) << 6)
25            | (ucs4_t) (s[1] ^ 0x80);
26     return 2;
27   } else if (c < 0xf0) {
28     if (n < 3)
29       return RET_TOOFEW(0);
30     if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
31           && (c >= 0xe1 || s[1] >= 0xa0)))
32       return RET_ILSEQ;
33     *pwc = ((ucs4_t) (c & 0x0f) << 12)
34            | ((ucs4_t) (s[1] ^ 0x80) << 6)
35            | (ucs4_t) (s[2] ^ 0x80);
36     return 3;
37   } else if (c < 0xf8) {
38     if (n < 4)
39       return RET_TOOFEW(0);
40     if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
41           && (s[3] ^ 0x80) < 0x40
42           && (c >= 0xf1 || s[1] >= 0x90)))
43       return RET_ILSEQ;
44     *pwc = ((ucs4_t) (c & 0x07) << 18)
45            | ((ucs4_t) (s[1] ^ 0x80) << 12)
46            | ((ucs4_t) (s[2] ^ 0x80) << 6)
47            | (ucs4_t) (s[3] ^ 0x80);
48     return 4;
49   } else if (c < 0xfc) {
50     if (n < 5)
51       return RET_TOOFEW(0);
52     if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
53           && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40
54           && (c >= 0xf9 || s[1] >= 0x88)))
55       return RET_ILSEQ;
56     *pwc = ((ucs4_t) (c & 0x03) << 24)
57            | ((ucs4_t) (s[1] ^ 0x80) << 18)
58            | ((ucs4_t) (s[2] ^ 0x80) << 12)
59            | ((ucs4_t) (s[3] ^ 0x80) << 6)
60            | (ucs4_t) (s[4] ^ 0x80);
61     return 5;
62   } else if (c < 0xfe) {
63     if (n < 6)
64       return RET_TOOFEW(0);
65     if (!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
66           && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40
67           && (s[5] ^ 0x80) < 0x40
68           && (c >= 0xfd || s[1] >= 0x84)))
69       return RET_ILSEQ;
70     *pwc = ((ucs4_t) (c & 0x01) << 30)
71            | ((ucs4_t) (s[1] ^ 0x80) << 24)
72            | ((ucs4_t) (s[2] ^ 0x80) << 18)
73            | ((ucs4_t) (s[3] ^ 0x80) << 12)
74            | ((ucs4_t) (s[4] ^ 0x80) << 6)
75            | (ucs4_t) (s[5] ^ 0x80);
76     return 6;
77   } else
78     return RET_ILSEQ;
79 }
80 
81 static int
utf8_wctomb(conv_t conv,unsigned char * r,ucs4_t wc,int n)82 utf8_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n) /* n == 0 is acceptable */
83 {
84   int count;
85   if (wc < 0x80)
86     count = 1;
87   else if (wc < 0x800)
88     count = 2;
89   else if (wc < 0x10000)
90     count = 3;
91   else if (wc < 0x200000)
92     count = 4;
93   else if (wc < 0x4000000)
94     count = 5;
95   else if (wc <= 0x7fffffff)
96     count = 6;
97   else
98     return RET_ILSEQ;
99   if (n < count)
100     return RET_TOOSMALL;
101   switch (count) { /* note: code falls through cases! */
102     case 6: r[5] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x4000000;
103     case 5: r[4] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x200000;
104     case 4: r[3] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x10000;
105     case 3: r[2] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0x800;
106     case 2: r[1] = 0x80 | (wc & 0x3f); wc = wc >> 6; wc |= 0xc0;
107     case 1: r[0] = wc;
108   }
109   return count;
110 }
111