1 /*
2
3 UTF-8 (unicode transformation format) encoding/decoding layer
4
5 copyright (c) 2003, 2004, 2005 squell <squell@alumina.nl>
6
7 use, modification, copying and distribution of this software is permitted
8 under the conditions described in the file 'COPYING'.
9
10 Usage:
11
12 The namespace 'utf8' defines a couple of things.
13
14 utf8::length(ptr)
     - returns the number of code points in a null-terminated utf8 string
16 utf8::length(begin, end)
17 - returns the number of code points in [begin, end)
18 utf8::encode(begin, end, result)
19 - encodes the code points in [begin, end) to UTF8, storing in result
20 utf8::decode(begin, end, result)
21 - decodes the code units in [begin, end) to UCS4, storing in result
22
23 'ptr', 'begin' and 'end' should be Input Iterators,
   'result' should be an Output Iterator.
25
26 UTF8 decoding is safe with respect to overlong sequences, surrogate pairs,
27 and so on, and passes Markus Kuhn's UTF8 stress test:
28
29 http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
30
31 Types for rolling your own;
32
33 utf8::mbbuf
34 - a buffer that is large enough to hold a single multibyte character
35 utf8::encoder<OutputIterator>
36 utf8::decoder<OutputIterator[, ErrorFunction]>
37 - encoding and decoding. see class definitions below.
38
39 decoder takes an optional second template argument, which is a function
40 pointer or function object type called for illegal input sequences,
41 as if matching this signature:
42
43 template<class utfIter> wchar error(utfIter p, wchar ucs)
44
45 the first argument points past the offending multibyte sequence, and ucs
46 is a suggested resolution: wchar(-1) for stray bytes, or the decoded value
47 for overlong or otherwise invalid sequences. The default error handler is
48 inline and returns U+FFFD - REPLACEMENT CHARACTER for all arguments.
49
50 Example:
51
52 int utf8_wctomb(char* s, wchar_t wc)
53 {
54 if(!s) return 0;
55 utf8::encoder<char*> enc(s);
56 enc.put(wc);
57 return enc.base() - s;
58 }
59
60 */
61 #ifndef __ZF_UTF8_HPP
62 #define __ZF_UTF8_HPP
63
64 #if __STDC_VERSION__ >= 199901L || __cplusplus >= 201103L
65 # include <stdint.h>
66 #endif
67
namespace utf8 {

  // some definitions

  // NOTE(review): __STDC_VERSION__ is a C macro; a C++ compiler that leaves
  // it undefined takes the 'unsigned long' branch. The condition is kept
  // as-is so the type of 'wchar' does not change for existing users.
#if __STDC_VERSION__ >= 199901L
  typedef uint_fast32_t wchar;                   // ucs-4 symbol holder
#else
  typedef unsigned long wchar;                   // ucs-4 symbol holder
#endif

  typedef char mbbuf[6];                         // maximum symbol length

  // common internal definitions
  // - made a template class so the static table can be defined in the header

  template<class T = void> class base {
  public:
      // default handler for illegal input: ignores both arguments and
      // always answers U+FFFD - REPLACEMENT CHARACTER
      template<class utfIter> struct error {
          inline wchar operator()(utfIter, wchar)
          { return 0xFFFDu; }
      };
  protected:
      // the cast only compiles when T is void
      base() { (T)void("ensure T == void"); }

      // first code point that needs more than 'n' bytes (see definition)
      static inline wchar max(unsigned n);

      // sequence length for a lead byte, indexed by its low six bits
      static const unsigned char dectab[64];
  };

  template<class utfIter>                        // invalid code handler (func)
  inline wchar default_error(utfIter p, wchar ucs)
  {
      return base<>::error<utfIter>()(p, ucs);
  }

  // encoding and decoding primitive classes

  // decoder - reads code units from [begin, end) and yields code points.
  // 'Error' is invoked as errf(position, suggestion) for illegal input and
  // its result is delivered in place of the offending sequence.
  template<class utfIter, class Error = base<>::error<utfIter> >
  class decoder : base<> {
      utfIter p, end;
      Error   errf;
  public:
      decoder(utfIter begin, utfIter end = utfIter(), Error e = Error())
      : p(begin), end(end), errf(e) { }

      // current read position
      utfIter& base() { return p; }

      // fetch the next code point; false when the input is exhausted,
      // in which case the argument is left untouched
      inline bool i_get(wchar&);
      bool get(wchar&);
  };

  // encoder - writes code points as UTF-8 code units through 'out'.
  template<class utfIter>
  class encoder : base<> {
      utfIter p;
  public:
      encoder(utfIter out)
      : p(out) { }

      // current write position
      utfIter& base() { return p; }

      inline void i_put(wchar);
      void put(wchar);
  };

  // primitive functions

  template<class utfIter>
    unsigned long length(utfIter begin, utfIter end);

  template<class utfIter>
    unsigned long length(utfIter begin);

  template<class utfIter, class charIter>
    charIter decode(utfIter begin, utfIter end, charIter out);

  template<class utfIter, class charIter>
    utfIter encode(charIter begin, charIter end, utfIter out);

  // encode & decode definitions

  /*
     Upper bound (exclusive) of the code points that fit in a sequence of
     n bytes: 2 ** [ 6(n-1) + 7-n ] == 2 ** (5n+1). Only meaningful for
     n >= 2; the single-byte case is special-cased by the callers.
  */
  template<class T>
  inline wchar base<T>::max(unsigned n)
  {
      return 1ul << (n*5+1);
  }

  /*
     Symbol length table. Entry i describes lead byte 0xC0+i: the total
     number of bytes in the sequence it starts. The two trailing zeros
     stand for 0xFE and 0xFF, which never start a sequence.
  */
  template<class T>
  const unsigned char base<T>::dectab[64] = {
      2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,         /* 0xC0 .. 0xCF */
      2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,         /* 0xD0 .. 0xDF */
      3,3,3,3,3,3,3,3,  3,3,3,3,3,3,3,3,         /* 0xE0 .. 0xEF */
      4,4,4,4,4,4,4,4,  5,5,5,5,6,6,0,0          /* 0xF0 .. 0xFF */
  };

  /*
     Decode one code point into 'out'.

     Returns false (leaving 'out' alone) only when no input is left.
     Otherwise returns true and 'out' holds either the decoded symbol or
     whatever the error handler answered:
       - errf(p, wchar(-1)) for stray or truncated sequences
       - errf(p, ucs)       for overlong forms, UTF-16 surrogates and
                            U+FFFE / U+FFFF, with ucs the decoded value
     A byte that breaks off a sequence is not consumed, so it is examined
     again on the next call.
  */
  template<class utfIter, class Error>
  inline bool decoder<utfIter,Error>::i_get(wchar& out)
  {
      if(p == end)
          return false;

      unsigned char c = *p++;

      if((c & 0x80) == 0) {                      /* 0x00 .. 0x7F         */
          out = c;
          return true;
      }

      /* 10xxxxxx can not start a sequence; neither can 0xFE and 0xFF */
      int seq = (c & 0x40) ? dectab[c & 0x3F] : 0;
      if(seq == 0) {
          out = errf(p, wchar(-1));
          return true;
      }

      wchar ucs = c & ((1<<(8-seq))-1);          /* payload of lead byte */
      --seq;                                     /* trail bytes to go    */
      wchar lng = max(seq);                      /* smallest legal value */
      if(seq == 1)
          lng <<= 1;                             /* fix 2 byte minimum   */

      /* trail bytes are 10xxxxxx: flipping bit 7 leaves 0x00 .. 0x3F */
      while(p != end && (c = static_cast<unsigned char>(*p ^ 0x80)) <= 0x3F) {
          ++p;
          ucs = ucs << 6 | c;
          if(--seq)
              continue;

          if((ucs&~1ul    ) == 0xFFFE ||         /* illegal unicode?     */
             (ucs&~0x7FFul) == 0xD800 ||         /* utf-16 surrogate?    */
             (ucs < lng   )            )         /* was overlong?        */
              out = errf(p, ucs);
          else
              out = ucs;
          return true;
      }

      out = errf(p, wchar(-1));                  /* sequence cut short   */
      return true;
  }

  /*
     Encode one code point. Only the low 31 bits of 'ucs' are used; the
     mask is applied before picking the sequence length, so a value with
     high bits set can no longer come out as an overlong sequence.
  */
  template<class utfIter>
  inline void encoder<utfIter>::i_put(wchar ucs)
  {
      ucs &= (1ul << 31) - 1;

      if(ucs < 0x80) {
          *p++ = ucs;
          return;
      }

      int n = 2;                                 /* determine bytes      */
      while(max(n) <= ucs)
          ++n;

      /* lead byte: n high bits set; every following byte starts with 10 */
      unsigned char prefix = 0xFF^(0xFF >> n);
      while(n--) {
          *p++ = prefix | ((ucs >> n*6) & 0x3F);
          prefix = 0x80;
      }
  }

  // non-inline versions.

  template<class utfIter, class Error>
  bool decoder<utfIter,Error>::get(wchar& ucs)
  {
      return i_get(ucs);
  }

  template<class utfIter>
  void encoder<utfIter>::put(wchar ucs)
  {
      i_put(ucs);
  }

  // sequence routines

  // number of code points in [begin, end)
  template<class utfIter>
  unsigned long length(utfIter begin, utfIter end)
  {
      decoder<utfIter> utf(begin, end);
      unsigned long cnt = 0;
      wchar wc;
      while(utf.i_get(wc))
          ++cnt;
      return cnt;
  }

  // number of code points before the terminating zero. Also stops when
  // the decoder reports end of input, which can happen for iterators
  // whose default-constructed value is a real end marker.
  template<class utfIter>
  unsigned long length(utfIter begin)
  {
      decoder<utfIter> utf(begin);
      unsigned long cnt = 0;
      wchar wc = 0;
      while(utf.i_get(wc) && wc != 0)
          ++cnt;
      return cnt;
  }

  // decode the code units in [begin, end), storing code points in 'out';
  // returns the advanced output iterator
  template<class utfIter, class charIter>
  charIter decode(utfIter begin, utfIter end, charIter out)
  {
      decoder<utfIter> utf(begin, end);
      wchar wc;
      while(utf.i_get(wc))
          *out++ = wc;
      return out;
  }

  // encode the code points in [begin, end), storing code units in 'out';
  // returns the advanced output iterator
  template<class utfIter, class charIter>
  utfIter encode(charIter begin, charIter end, utfIter out)
  {
      encoder<utfIter> utf(out);
      while(begin != end)
          utf.i_put(*begin++);
      return utf.base();
  }

}
277
278 #endif
279
280 /*
281
282 UTF-8, terse
283 Any byte with bit7 clear is that byte.
284
 Any byte with bit7 set belongs to a sequence of bytes.
286 All but the first have bit6 clear, the remaining 6bits carrying data.
287
 The starting byte consists of a number of set bits (counted from msb side),
289 which indicate the number of bytes in this sequence (at least two),
290 followed by a zero bit, followed by a few bits of data.
291
292 */
293
294
295