1 /*
2 
3   UTF-8 (unicode transformation format) encoding/decoding layer
4 
5   copyright (c) 2003, 2004, 2005 squell <squell@alumina.nl>
6 
7   use, modification, copying and distribution of this software is permitted
8   under the conditions described in the file 'COPYING'.
9 
10   Usage:
11 
12   The namespace 'utf8' defines a couple of things.
13 
14   utf8::length(ptr)
15      - returns the number of code points in a null-terminated utf8 string
16   utf8::length(begin, end)
17      - returns the number of code points in [begin, end)
18   utf8::encode(begin, end, result)
19      - encodes the code points in [begin, end) to UTF8, storing in result
20   utf8::decode(begin, end, result)
21      - decodes the code units in [begin, end) to UCS4, storing in result
22 
23   'ptr', 'begin' and 'end' should be Input Iterators,
24   'result' should be an Output Iterator.
25 
26   UTF8 decoding is safe with respect to overlong sequences, surrogate pairs,
27   and so on, and passes Markus Kuhn's UTF8 stress test:
28 
29       http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
30 
31   Types for rolling your own;
32 
33   utf8::mbbuf
34      - a buffer that is large enough to hold a single multibyte character
35   utf8::encoder<OutputIterator>
36   utf8::decoder<InputIterator[, ErrorFunction]>
37      - encoding and decoding. see class definitions below.
38 
39   decoder takes an optional second template argument, which is a function
40   pointer or function object type called for illegal input sequences,
41   as if matching this signature:
42 
43       template<class utfIter> wchar error(utfIter p, wchar ucs)
44 
45   the first argument points past the offending multibyte sequence, and ucs
46   is a suggested resolution: wchar(-1) for stray bytes, or the decoded value
47   for overlong or otherwise invalid sequences. The default error handler is
48   inline and returns U+FFFD - REPLACEMENT CHARACTER for all arguments.
49 
50   Example:
51 
52      int utf8_wctomb(char* s, wchar_t wc)
53      {
54          if(!s) return 0;
55          utf8::encoder<char*> enc(s);
56          enc.put(wc);
57          return enc.base() - s;
58      }
59 
60 */
61 #ifndef __ZF_UTF8_HPP
62 #define __ZF_UTF8_HPP
63 
64 #if __STDC_VERSION__ >= 199901L || __cplusplus >= 201103L
65 #  include <stdint.h>
66 #endif
67 
namespace utf8 {

  // some definitions

#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
typedef uint_fast32_t wchar;                 // ucs-4 symbol holder
#else
typedef unsigned long wchar;                 // ucs-4 symbol holder
#endif

typedef char mbbuf[6];                       // maximum symbol length

 // common internal definitions
 // - made a template class so the static table can be defined in the header

template<class T = void> class base {
public:
    // default handler for illegal input: ignores both arguments and
    // substitutes U+FFFD - REPLACEMENT CHARACTER
    template<class utfIter> struct error {
        inline wchar operator()(utfIter, wchar) const
        { return 0xFFFDu; }
    };
protected:
    // the cast only compiles when T is void, i.e. for base<>
    base() { (T)void("ensure T == void"); }

    // first code point that no longer fits in a sequence of n bytes
    static inline wchar max(unsigned n);

    // sequence length for a lead byte 0xC0..0xFF, indexed by (byte & 0x3F);
    // 0 marks the bytes that can never start a sequence (0xFE, 0xFF)
    static const unsigned char dectab[];
};

template<class utfIter>                      // invalid code handler (func)
inline wchar default_error(utfIter p, wchar ucs)
{
    return base<>::error<utfIter>()(p, ucs);
}

 // encoding and decoding primitive classes

 // decoder: reads code units from [begin, end) and produces code points.
 // - 'end' defaults to a default-constructed iterator
 // - 'Error' is called as errf(position, suggestion) for illegal input and
 //   its result is delivered in place of the offending sequence

template<class utfIter, class Error = base<>::error<utfIter> >
class decoder : base<> {
    utfIter p, end;
    Error errf;
public:
    decoder(utfIter begin, utfIter end = utfIter(), Error e = Error())
    : p(begin), end(end), errf(e) { }

    utfIter& base() { return p; }            // current read position

    // both store the next code point and return true, or return false
    // (leaving the argument untouched) when the input is exhausted
    inline bool i_get(wchar&);
    bool get(wchar&);
};

 // encoder: writes the UTF-8 form of code points through an output iterator

template<class utfIter>
class encoder : base<> {
    utfIter p;
public:
    encoder(utfIter out)
    : p(out) { }

    utfIter& base() { return p; }            // current write position

    inline void i_put(wchar);
    void put(wchar);
};

 // primitive functions

template<class utfIter>
unsigned long length(utfIter begin, utfIter end);

template<class utfIter>
unsigned long length(utfIter begin);

template<class utfIter, class charIter>
charIter decode(utfIter begin, utfIter end, charIter out);

template<class utfIter, class charIter>
utfIter encode(charIter begin, charIter end, utfIter out);

 // encode & decode definitions

template<class T>
inline wchar base<T>::max(unsigned n)           /* max. code of seq. n */
{
    return 1ul << (n*5+1);                      /* 2 ** [ 6(n-1) + 7-n ] */
}

template<class T>
const unsigned char base<T>::dectab[64] = {     /* symbol length table */
    2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,
    3,3,3,3,3,3,3,3,
    3,3,3,3,3,3,3,3,
    4,4,4,4,4,4,4,4,
    5,5,5,5,6,6,0,0,
};

 // decode one code point.
 // - returns false, without touching 'out', when the input is exhausted
 // - otherwise stores a code point (or the error handler's result) and
 //   returns true
 // - the handler gets -1 as suggestion for stray bytes and for truncated
 //   sequences, and the decoded value for overlong forms, surrogates and
 //   U+FFFE/U+FFFF
 // - a byte that ends a sequence prematurely is not consumed; the next
 //   call decodes it

template<class utfIter, class Error>
inline bool decoder<utfIter,Error>::i_get(wchar& out)
{
    if(p == end)                                /* nothing left to read */
        return false;

    unsigned char c = *p++;
    if(c < 0x80)                                /* 0x00 .. 0x7F */
        return out = c, true;

    /* only 0xC0 .. 0xFD can start a sequence */
    int seq = (c >= 0xC0) ? dectab[c & 0x3F] : 0;
    if(!seq)                                    /* stray byte */
        return out = errf(p, -1), true;

    wchar ucs = c & ((1<<(8-seq))-1);           /* mask off good bits */
    wchar lng = max(--seq);                     /* smallest legal value */
    if(seq == 1)
        lng <<= 1;                              /* fix 2 byte minimum */

    /* seq now counts the continuation bytes still expected */
    while(p != end && (c = *p^0x80) <= 0x3F) {
        ++p;
        ucs = ucs << 6 | c;
        if(--seq)
            continue;

        const bool illegal   = (ucs&~1ul    ) == 0xFFFE;  /* U+FFFE, U+FFFF */
        const bool surrogate = (ucs&~0x7FFul) == 0xD800;  /* utf-16 surrogate */
        const bool overlong  = (ucs < lng);

        if(illegal || surrogate || overlong)
            return out = errf(p, ucs), true;
        return out = ucs, true;
    }
    return out = errf(p, -1), true;             /* sequence cut short */
}

 // encode one code point.
 // - only the low 31 bits of 'ucs' can be represented; the value is reduced
 //   to them *before* the sequence length is chosen, so that the shortest
 //   form is always written (never an overlong sequence)

template<class utfIter>
inline void encoder<utfIter>::i_put(wchar ucs)
{
    ucs &= (1ul << 31) - 1;

    if(ucs < 0x80) {
        *p++ = ucs;
        return;
    }

    int n = 2;                                  /* determine bytes */
    while(max(n) <= ucs)
        ++n;

    unsigned char c = 0xFF^(0xFF >> n);         /* lead byte marker */
    while(n--) {
        *p++ = c | ((ucs >> n*6) & 0x3F);
        c = 0x80;                               /* continuation marker */
    }
}

  // non-inline versions.

template<class utfIter, class Error>
bool decoder<utfIter,Error>::get(wchar& ucs)
{
    return i_get(ucs);
}

template<class utfIter>
void encoder<utfIter>::put(wchar ucs)
{
    i_put(ucs);
}

  // sequence routines

 // number of code points in [begin, end); every illegal sequence counts
 // as one

template<class utfIter>
unsigned long length(utfIter begin, utfIter end)
{
    decoder<utfIter> utf(begin, end);
    unsigned long cnt = 0;
    for(wchar wc; utf.i_get(wc); ) ++cnt;
    return cnt;
}

 // number of code points before the terminating NUL.
 // - also stops when the decoder reports end of input, i.e. when the
 //   iterator compares equal to a default-constructed one (a null pointer,
 //   an exhausted stream iterator); without that check the loop would
 //   test a stale or uninitialised value

template<class utfIter>
unsigned long length(utfIter begin)
{
    decoder<utfIter> utf(begin);
    unsigned long cnt = 0;
    wchar wc = 0;
    while(utf.i_get(wc) && wc != 0)
        ++cnt;
    return cnt;
}

 // decode [begin, end), writing the code points through 'out';
 // returns the advanced output iterator

template<class utfIter, class charIter>
charIter decode(utfIter begin, utfIter end, charIter out)
{
    decoder<utfIter> utf(begin, end);
    wchar wc;
    while( utf.i_get(wc) )
        *out++ = wc;
    return out;
}

 // encode the code points in [begin, end), writing code units through
 // 'out'; returns the advanced output iterator

template<class utfIter, class charIter>
utfIter encode(charIter begin, charIter end, utfIter out)
{
    encoder<utfIter> utf(out);
    while(begin != end)
        utf.i_put(*begin++);
    return utf.base();
}

}
277 
278 #endif
279 
280 /*
281 
282  UTF-8, terse
283  Any byte with bit7 clear is that byte.
284 
285  Any byte with bit7 set belongs to a sequence of bytes.
286    All but the first have bit6 clear, the remaining 6bits carrying data.
287 
288    The starting byte consists of a number of bits (counted from msb side),
289    which indicate the number of bytes in this sequence (at least two),
290    followed by a zero bit, followed by a few bits of data.
291 
292 */
293 
294 
295