1 /*  GRAPHITE2 LICENSING
2 
3     Copyright 2011, SIL International
4     All rights reserved.
5 
6     This library is free software; you can redistribute it and/or modify
7     it under the terms of the GNU Lesser General Public License as published
8     by the Free Software Foundation; either version 2.1 of License, or
9     (at your option) any later version.
10 
11     This program is distributed in the hope that it will be useful,
12     but WITHOUT ANY WARRANTY; without even the implied warranty of
13     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14     Lesser General Public License for more details.
15 
16     You should also have received a copy of the GNU Lesser General Public
17     License along with this library in the file named "LICENSE".
18     If not, write to the Free Software Foundation, 51 Franklin Street,
19     Suite 500, Boston, MA 02110-1335, USA or visit their web page on the
20     internet at http://www.fsf.org/licenses/lgpl.html.
21 
22 Alternatively, the contents of this file may be used under the terms of the
23 Mozilla Public License (http://mozilla.org/MPL) or the GNU General Public
24 License, as published by the Free Software Foundation, either version 2
25 of the License or (at your option) any later version.
26 */
27 #pragma once
28 
29 #include <cstdlib>
30 #include "inc/Main.h"
31 
32 namespace graphite2 {
33 
34 typedef uint32  uchar_t;
35 
36 template <int N>
37 struct _utf_codec
38 {
39     typedef uchar_t codeunit_t;
40 
41     static void     put(codeunit_t * cp, const uchar_t , int8 & len) throw();
42     static uchar_t  get(const codeunit_t * cp, int8 & len) throw();
43     static bool     validate(const codeunit_t * s, const codeunit_t * const e) throw();
44 };
45 
46 
47 template <>
48 struct _utf_codec<32>
49 {
50 private:
51     static const uchar_t    limit = 0x110000;
52 public:
53     typedef uint32  codeunit_t;
54 
55     inline
56     static void put(codeunit_t * cp, const uchar_t usv, int8 & l) throw()
57     {
58         *cp = usv; l = 1;
59     }
60 
61     inline
62     static uchar_t get(const codeunit_t * cp, int8 & l) throw()
63     {
64         if (cp[0] < limit)  { l = 1;  return cp[0]; }
65         else                { l = -1; return 0xFFFD; }
66     }
67 
68     inline
69     static bool validate(const codeunit_t * s, const codeunit_t * const e) throw()
70     {
71         return s <= e;
72     }
73 };
74 
75 
76 template <>
77 struct _utf_codec<16>
78 {
79 private:
80     static const int32  lead_offset      = 0xD800 - (0x10000 >> 10);
81     static const int32  surrogate_offset = 0x10000 - (0xD800 << 10) - 0xDC00;
82 public:
83     typedef uint16  codeunit_t;
84 
85     inline
86     static void put(codeunit_t * cp, const uchar_t usv, int8 & l) throw()
87     {
88         if (usv < 0x10000)  { l = 1; cp[0] = codeunit_t(usv); }
89         else
90         {
91             cp[0] = codeunit_t(lead_offset + (usv >> 10));
92             cp[1] = codeunit_t(0xDC00 + (usv & 0x3FF));
93             l = 2;
94         }
95     }
96 
97     inline
98     static uchar_t get(const codeunit_t * cp, int8 & l) throw()
99     {
100         const uint32    uh = cp[0];
101         l = 1;
102 
103         if (uh < 0xD800|| uh > 0xDFFF) { return uh; }
104         if (uh > 0xDBFF) { l = -1; return 0xFFFD; }
105         const uint32 ul = cp[1];
106         if (ul < 0xDC00 || ul > 0xDFFF) { l = -1; return 0xFFFD; }
107         ++l;
108         return (uh<<10) + ul + surrogate_offset;
109     }
110 
111     inline
112     static bool validate(const codeunit_t * s, const codeunit_t * const e) throw()
113     {
114         const ptrdiff_t n = e-s;
115         if (n <= 0) return n == 0;
116         const uint32 u = *(e-1); // Get the last codepoint
117         return (u < 0xD800 || u > 0xDBFF);
118     }
119 };
120 
121 
122 template <>
123 struct _utf_codec<8>
124 {
125 private:
126     static const int8 sz_lut[16];
127     static const byte mask_lut[5];
128     static const uchar_t    limit = 0x110000;
129 
130 public:
131     typedef uint8   codeunit_t;
132 
133     inline
134     static void put(codeunit_t * cp, const uchar_t usv, int8 & l) throw()
135     {
136         if (usv < 0x80)     {l = 1; cp[0] = usv; return; }
137         if (usv < 0x0800)   {l = 2; cp[0] = 0xC0 + (usv >> 6);  cp[1] = 0x80 + (usv & 0x3F); return; }
138         if (usv < 0x10000)  {l = 3; cp[0] = 0xE0 + (usv >> 12); cp[1] = 0x80 + ((usv >> 6) & 0x3F);  cp[2] = 0x80 + (usv & 0x3F); return; }
139         else                {l = 4; cp[0] = 0xF0 + (usv >> 18); cp[1] = 0x80 + ((usv >> 12) & 0x3F); cp[2] = 0x80 + ((usv >> 6) & 0x3F); cp[3] = 0x80 + (usv & 0x3F); return; }
140     }
141 
142     inline
143     static uchar_t get(const codeunit_t * cp, int8 & l) throw()
144     {
145         const int8 seq_sz = sz_lut[*cp >> 4];
146         uchar_t u = *cp & mask_lut[seq_sz];
147         l = 1;
148         bool toolong = false;
149 
150         switch(seq_sz) {
151             case 4:     u <<= 6; u |= *++cp & 0x3F; if (*cp >> 6 != 2) break; ++l; toolong  = (u < 0x10); GR_FALLTHROUGH;
152                 // no break
153             case 3:     u <<= 6; u |= *++cp & 0x3F; if (*cp >> 6 != 2) break; ++l; toolong |= (u < 0x20); GR_FALLTHROUGH;
154                 // no break
155             case 2:     u <<= 6; u |= *++cp & 0x3F; if (*cp >> 6 != 2) break; ++l; toolong |= (u < 0x80); GR_FALLTHROUGH;
156                 // no break
157             case 1:     break;
158             case 0:     l = -1; return 0xFFFD;
159         }
160 
161         if (l != seq_sz || toolong  || u >= limit)
162         {
163             l = -l;
164             return 0xFFFD;
165         }
166         return u;
167     }
168 
169     inline
170     static bool validate(const codeunit_t * s, const codeunit_t * const e) throw()
171     {
172         const ptrdiff_t n = e-s;
173         if (n <= 0) return n == 0;
174         s += (n-1);
175         if (*s < 0x80) return true;
176         if (*s >= 0xC0) return false;
177         if (n == 1) return true;
178         if (*--s < 0x80) return true;
179         if (*s >= 0xE0) return false;
180         if (n == 2 || *s >= 0xC0) return true;
181         if (*--s < 0x80) return true;
182         if (*s >= 0xF0) return false;
183         return true;
184     }
185 
186 };
187 
188 
189 template <typename C>
190 class _utf_iterator
191 {
192     typedef _utf_codec<sizeof(C)*8> codec;
193 
194     C             * cp;
195     mutable int8    sl;
196 
197 public:
198     typedef C           codeunit_type;
199     typedef uchar_t     value_type;
200     typedef uchar_t   * pointer;
201 
202     class reference
203     {
204         const _utf_iterator & _i;
205 
206         reference(const _utf_iterator & i): _i(i) {}
207     public:
208         operator value_type () const throw ()                   { return codec::get(_i.cp, _i.sl); }
209         reference & operator = (const value_type usv) throw()   { codec::put(_i.cp, usv, _i.sl); return *this; }
210 
211         friend class _utf_iterator;
212     };
213 
214 
215     _utf_iterator(const void * us=0)    : cp(reinterpret_cast<C *>(const_cast<void *>(us))), sl(1) { }
216 
217     _utf_iterator   & operator ++ ()    { cp += abs(sl); return *this; }
218     _utf_iterator   operator ++ (int)   { _utf_iterator tmp(*this); operator++(); return tmp; }
219 
220     bool operator == (const _utf_iterator & rhs) const throw() { return cp >= rhs.cp; }
221     bool operator != (const _utf_iterator & rhs) const throw() { return !operator==(rhs); }
222 
223     reference   operator * () const throw() { return *this; }
224     pointer     operator ->() const throw() { return &operator *(); }
225 
226     operator codeunit_type * () const throw() { return cp; }
227 
228     bool error() const throw()  { return sl < 1; }
229     bool validate(const _utf_iterator & e)  { return codec::validate(cp, e.cp); }
230 };
231 
232 template <typename C>
233 struct utf
234 {
235     typedef typename _utf_codec<sizeof(C)*8>::codeunit_t codeunit_t;
236 
237     typedef _utf_iterator<C>        iterator;
238     typedef _utf_iterator<const C>  const_iterator;
239 
240     inline
241     static bool validate(codeunit_t * s, codeunit_t * e) throw() {
242         return _utf_codec<sizeof(C)*8>::validate(s,e);
243     }
244 };
245 
246 
247 typedef utf<uint32> utf32;
248 typedef utf<uint16> utf16;
249 typedef utf<uint8>  utf8;
250 
251 } // namespace graphite2
252