1 /*
2     UTF8 simple decoder/encoder
3     Copyright (C) 2004 Oleg Bondar
4 
5     This program is free software; you can redistribute it and/or modify
6     it under the terms of the GNU General Public License as published by
7     the Free Software Foundation; either version 2 of the License, or
8     (at your option) any later version.
9 
10     This program is distributed in the hope that it will be useful,
11     but WITHOUT ANY WARRANTY; without even the implied warranty of
12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13     GNU General Public License for more details.
14 
15     You should have received a copy of the GNU General Public License
16     along with this program; if not, write to the Free Software
17     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18 */
19 
20 #include "utf8.h"
21 
22 /* http://www.cl.cam.ac.uk/~mgk25/unicode.html
23 
24 UTF-8 has the following properties:
25 
26     UCS characters U+0000 to U+007F (ASCII) are encoded simply as bytes 0x00 to
27     0x7F (ASCII compatibility). This means that files and strings which contain
28     only 7-bit ASCII characters have the same encoding under both ASCII and UTF-8.
29 
30     All UCS characters >U+007F are encoded as a sequence of several bytes, each of
31     which has the most significant bit set. Therefore, no ASCII byte (0x00-0x7F)
32     can appear as part of any other character.
33 
34     The first byte of a multibyte sequence that represents a non-ASCII character is
35     always in the range 0xC0 to 0xFD and it indicates how many bytes follow for
36     this character. All further bytes in a multibyte sequence are in the range 0x80
37     to 0xBF. This allows easy resynchronization and makes the encoding stateless
38     and robust against missing bytes.
39 
    All possible 2^31 UCS codes can be encoded.
41 
42     UTF-8 encoded characters may theoretically be up to six bytes long, however
43     16-bit BMP characters are only up to three bytes long.
44 
45     The sorting order of Bigendian UCS-4 byte strings is preserved.
46 
47     The bytes 0xFE and 0xFF are never used in the UTF-8 encoding.
48 
49 The following byte sequences are used to represent a character. The sequence to
50 be used depends on the Unicode number of the character:
51 
52 
53 U-00000000 - U-0000007F:  0xxxxxxx
54 U-00000080 - U-000007FF:  110xxxxx 10xxxxxx
55 U-00000800 - U-0000FFFF:  1110xxxx 10xxxxxx 10xxxxxx
56 U-00010000 - U-001FFFFF:  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
57 U-00200000 - U-03FFFFFF:  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
58 U-04000000 - U-7FFFFFFF:  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
59 
60 The xxx bit positions are filled with the bits of the character code number in
61 binary representation. The rightmost x bit is the least-significant bit. Only
62 the shortest possible multibyte sequence which can represent the code number of
63 the character can be used. Note that in multibyte sequences, the number of
64 leading 1 bits in the first byte is identical to the number of bytes in the
65 entire sequence.
66 */
67 
68 /* assume: wchar_t == unsigned short == UCS-2 (2 byte long) */
69 
70 /* encode UCS-2 into UTF-8 */
/*
 * Encode the zero-terminated wide string `ws` as UTF-8 into `utf8s`.
 *
 * Each element of `ws` is converted on its own:
 *   value <= 0x007F  ->  1 octet
 *   value <= 0x07FF  ->  2 octets
 *   value <= 0xFFFF  ->  3 octets
 *   anything larger  ->  a single '_' placeholder (does not fit in UCS-2)
 *
 * The output is always NUL-terminated.  No bounds checking is done: the
 * destination must have room for up to 3 chars per input element plus the
 * terminator.
 *
 * Returns `utf8s`.
 */
char *encode_UTF8(char *utf8s, wchar_t *ws)
{
    char *out = utf8s;

    for (; *ws; ++ws) {
        wchar_t c = *ws;

        if (c > 0xFFFF) {
            /* needs >= 4 octets -- not representable in UCS-2 */
            *out++ = '_';
        } else if (c > 0x07FF) {
            /* 3 octets: 1110xxxx 10xxxxxx 10xxxxxx */
            out[0] = 0xE0 | ((c >> 12) & 0x0F);
            out[1] = 0x80 | ((c >> 6) & 0x3F);
            out[2] = 0x80 | (c & 0x3F);
            out += 3;
        } else if (c > 0x007F) {
            /* 2 octets: 110xxxxx 10xxxxxx */
            out[0] = 0xC0 | ((c >> 6) & 0x1F);
            out[1] = 0x80 | (c & 0x3F);
            out += 2;
        } else {
            /* 1 octet: copied through unchanged */
            *out++ = c;
        }
    }

    *out = 0;
    return utf8s;
}
97 
/* returns the number of chars needed to hold ws converted from UCS-2 to UTF-8 (terminating NUL not included) */
99 #if 0
/*
 * Count the chars needed to store the zero-terminated wide string `ws`
 * as UTF-8, using the same per-element widths as encode_UTF8():
 * 2 chars for 0x0080..0x07FF, 3 chars for 0x0800..0xFFFF and 1 char for
 * everything else (ASCII, and values above 0xFFFF which become a
 * one-char placeholder).
 *
 * The terminating NUL is NOT included in the result.
 */
int UTF8_strsize(wchar_t *ws)
{
    int total = 0;

    for (; *ws; ++ws) {
        wchar_t c = *ws;

        if (c > 0x07FF && c <= 0xFFFF) {
            total += 3;
        } else if (c > 0x007F && c <= 0x07FF) {
            total += 2;
        } else {
            total += 1;
        }
    }
    return total;
}
113 #endif
114 /*
115 0 0000  4 0100  8 1000  C 1100
116 1 0001  5 0101  9 1001  D 1101
117 2 0010  6 0110  A 1010  E 1110
118 3 0011  7 0111  B 1011  F 1111
119  */
120 
121 /* decode UTF-8 into UCS-2 */
/*
 * Decode the NUL-terminated UTF-8 string `utf8s` into the wide string `ws`.
 *
 * Exactly one output element is written per decoded unit:
 *   - 1-, 2- and 3-octet sequences produce the encoded value;
 *   - a lead byte announcing 4 or more octets (0xF0..0xFF) cannot fit in
 *     UCS-2: the lead byte and all continuation bytes that follow it are
 *     skipped and a single '_' is stored;
 *   - a malformed sequence (missing continuation byte, or a stray
 *     continuation byte) produces a lowercase letter derived from the
 *     offending byte (byte % 26 + 'a').
 *
 * The output is always zero-terminated.  No bounds checking is done: every
 * loop iteration consumes at least one input byte and writes one element,
 * so `ws` needs room for one element per input byte plus the terminator in
 * the worst case.
 *
 * Returns `ws`.
 */
wchar_t *decode_UTF8(wchar_t *ws, unsigned char *utf8s)
{
    wchar_t *wc = ws;

    while (*utf8s) {
        if (!(*utf8s & 0x80)) {                     /* 1 octet */
            *wc = *utf8s++;
        }
        else if ((*utf8s & 0xE0) == 0xC0) {         /* 2 octets */
            *wc = (*utf8s++ & 0x1F) << 6;           /* 1st */
            if ((*utf8s & 0xC0) == 0x80) {          /* 2nd */
                *wc |= *utf8s++ & 0x3F;
            } else {
                /* bad UTF-8; the offending byte is left for the next pass */
                *wc = *utf8s % 26 + 'a';
            }
        }
        else if ((*utf8s & 0xF0) == 0xE0) {         /* 3 octets */
            *wc = (*utf8s++ & 0x0F) << 12;          /* 1st */
            if ((*utf8s & 0xC0) == 0x80) {          /* 2nd */
                *wc |= (*utf8s++ & 0x3F) << 6;
                if ((*utf8s & 0xC0) == 0x80) {      /* 3rd */
                    *wc |= *utf8s++ & 0x3F;
                } else {
                    *wc = *utf8s % 26 + 'a';        /* bad UTF-8 */
                }
            } else {
                *wc = *utf8s % 26 + 'a';            /* bad UTF-8 */
            }
        }
        else if ((*utf8s & 0xF0) == 0xF0) {         /* >= 4 octets -- not fit in UCS-2 */
            /*
             * The mask test must use '&'.  With '*' this branch could never
             * be taken and every byte of a long sequence was decoded as a
             * separate bogus character.
             */
            ++utf8s;
            while (*utf8s && ((*utf8s & 0xC0) == 0x80)) {
                ++utf8s;
            }
            *wc = '_';
        }
        else {                                      /* stray continuation byte */
            *wc = *utf8s % 26 + 'a';
            ++utf8s;
        }
        ++wc;
    }
    *wc = 0;
    return ws;
}
157 
158 /* returns number of UCS-2 characters in utf8s */
159 #if 0
/*
 * Count the wide characters that the NUL-terminated UTF-8 string `utf8s`
 * decodes to.
 *
 * Counting rules:
 *   - an ASCII byte counts as one character;
 *   - a 2- or 3-octet lead byte counts as one character and consumes up to
 *     1 or 2 continuation bytes respectively, stopping early if the
 *     sequence is cut short;
 *   - a lead byte announcing 4 or more octets (0xF0..0xFF) counts as one
 *     character together with every continuation byte that follows it;
 *   - each stray continuation byte counts as one character, because
 *     decode_UTF8() emits one output character per such byte.
 *
 * The scan never steps past the terminating NUL, even when the last
 * sequence is truncated, and always advances by at least one byte per
 * iteration.
 */
int UTF8_strlen(char *utf8s)
{
    /* work on unsigned bytes so the bit tests do not depend on char signedness */
    const unsigned char *p = (const unsigned char *)utf8s;
    int count = 0;

    while (*p) {
        unsigned char lead = *p++;      /* always consume the current byte */
        int expected = 0;               /* continuation bytes still allowed */

        ++count;

        if (!(lead & 0x80)) {
            continue;                   /* 1 octet */
        }
        if ((lead & 0xE0) == 0xC0) {
            expected = 1;               /* 2 octets */
        } else if ((lead & 0xF0) == 0xE0) {
            expected = 2;               /* 3 octets */
        } else if ((lead & 0xF0) == 0xF0) {
            /* >= 4 octets -- not fit in UCS-2: swallow the whole tail */
            while ((*p & 0xC0) == 0x80) {
                ++p;
            }
            continue;
        }
        /* else: stray continuation byte, nothing more to consume */

        /* NUL is not a continuation byte, so this cannot overrun the string */
        while (expected-- > 0 && (*p & 0xC0) == 0x80) {
            ++p;
        }
    }
    return count;
}
173 #endif
174