1 /* GRAPHITE2 LICENSING 2 3 Copyright 2011, SIL International 4 All rights reserved. 5 6 This library is free software; you can redistribute it and/or modify 7 it under the terms of the GNU Lesser General Public License as published 8 by the Free Software Foundation; either version 2.1 of License, or 9 (at your option) any later version. 10 11 This program is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 Lesser General Public License for more details. 15 16 You should also have received a copy of the GNU Lesser General Public 17 License along with this library in the file named "LICENSE". 18 If not, write to the Free Software Foundation, 51 Franklin Street, 19 Suite 500, Boston, MA 02110-1335, USA or visit their web page on the 20 internet at http://www.fsf.org/licenses/lgpl.html. 21 22 Alternatively, the contents of this file may be used under the terms of the 23 Mozilla Public License (http://mozilla.org/MPL) or the GNU General Public 24 License, as published by the Free Software Foundation, either version 2 25 of the License or (at your option) any later version. 26 */ 27 #pragma once 28 29 #include <cstdlib> 30 #include "inc/Main.h" 31 32 namespace graphite2 { 33 34 typedef uint32 uchar_t; 35 36 template <int N> 37 struct _utf_codec 38 { 39 typedef uchar_t codeunit_t; 40 41 static void put(codeunit_t * cp, const uchar_t , int8 & len) throw(); 42 static uchar_t get(const codeunit_t * cp, int8 & len) throw(); 43 static bool validate(const codeunit_t * s, const codeunit_t * const e) throw(); 44 }; 45 46 47 template <> 48 struct _utf_codec<32> 49 { 50 private: 51 static const uchar_t limit = 0x110000; 52 public: 53 typedef uint32 codeunit_t; 54 55 inline 56 static void put(codeunit_t * cp, const uchar_t usv, int8 & l) throw() 57 { 58 *cp = usv; l = 1; 59 } 60 61 inline 62 static uchar_t get(const codeunit_t * cp, int8 & l) throw() 63 { 64 if (cp[0] < limit) { l = 1; return cp[0]; } 65 else { l = -1; return 0xFFFD; } 66 } 67 68 inline 69 static bool validate(const codeunit_t * s, const codeunit_t * const e) throw() 70 { 71 return s <= e; 72 } 73 }; 74 75 76 template <> 77 struct _utf_codec<16> 78 { 79 private: 80 static const int32 lead_offset = 0xD800 - (0x10000 >> 10); 81 static const int32 surrogate_offset = 0x10000 - (0xD800 << 10) - 0xDC00; 82 public: 83 typedef uint16 codeunit_t; 84 85 inline 86 static void put(codeunit_t * cp, const uchar_t usv, int8 & l) throw() 87 { 88 if (usv < 0x10000) { l = 1; cp[0] = codeunit_t(usv); } 89 else 90 { 91 cp[0] = codeunit_t(lead_offset + (usv >> 10)); 92 cp[1] = codeunit_t(0xDC00 + (usv & 0x3FF)); 93 l = 2; 94 } 95 } 96 97 inline 98 static uchar_t get(const codeunit_t * cp, int8 & l) throw() 99 { 100 const uint32 uh = cp[0]; 101 l = 1; 102 103 if (uh < 0xD800|| uh > 0xDFFF) { return uh; } 104 if (uh > 0xDBFF) { l = -1; return 0xFFFD; } 105 const uint32 ul = cp[1]; 106 if (ul < 0xDC00 || ul > 0xDFFF) { l = -1; return 0xFFFD; } 107 ++l; 108 return (uh<<10) + ul + surrogate_offset; 109 } 110 111 inline 112 static bool validate(const codeunit_t * s, const codeunit_t * const e) throw() 113 { 114 const ptrdiff_t n = e-s; 115 if (n <= 0) return n == 0; 116 const uint32 u = *(e-1); // Get the last codepoint 117 return (u < 0xD800 || u > 0xDBFF); 118 } 119 }; 120 121 122 template <> 123 struct _utf_codec<8> 124 { 125 private: 126 static const int8 sz_lut[16]; 127 static const byte mask_lut[5]; 128 static const uchar_t limit = 0x110000; 129 130 public: 131 typedef uint8 codeunit_t; 132 133 inline 134 static void put(codeunit_t * cp, const uchar_t usv, int8 & l) throw() 135 { 136 if (usv < 0x80) {l = 1; cp[0] = usv; return; } 137 if (usv < 0x0800) {l = 2; cp[0] = 0xC0 + (usv >> 6); cp[1] = 0x80 + (usv & 0x3F); return; } 138 if (usv < 0x10000) {l = 3; cp[0] = 0xE0 + (usv >> 12); cp[1] = 0x80 + ((usv >> 6) & 0x3F); cp[2] = 0x80 + (usv & 0x3F); return; } 139 else {l = 4; cp[0] = 0xF0 + (usv >> 18); cp[1] = 0x80 + ((usv >> 12) & 0x3F); cp[2] = 0x80 + ((usv >> 6) & 0x3F); cp[3] = 0x80 + (usv & 0x3F); return; } 140 } 141 142 inline 143 static uchar_t get(const codeunit_t * cp, int8 & l) throw() 144 { 145 const int8 seq_sz = sz_lut[*cp >> 4]; 146 uchar_t u = *cp & mask_lut[seq_sz]; 147 l = 1; 148 bool toolong = false; 149 150 switch(seq_sz) { 151 case 4: u <<= 6; u |= *++cp & 0x3F; if (*cp >> 6 != 2) break; ++l; toolong = (u < 0x10); GR_FALLTHROUGH; 152 // no break 153 case 3: u <<= 6; u |= *++cp & 0x3F; if (*cp >> 6 != 2) break; ++l; toolong |= (u < 0x20); GR_FALLTHROUGH; 154 // no break 155 case 2: u <<= 6; u |= *++cp & 0x3F; if (*cp >> 6 != 2) break; ++l; toolong |= (u < 0x80); GR_FALLTHROUGH; 156 // no break 157 case 1: break; 158 case 0: l = -1; return 0xFFFD; 159 } 160 161 if (l != seq_sz || toolong || u >= limit) 162 { 163 l = -l; 164 return 0xFFFD; 165 } 166 return u; 167 } 168 169 inline 170 static bool validate(const codeunit_t * s, const codeunit_t * const e) throw() 171 { 172 const ptrdiff_t n = e-s; 173 if (n <= 0) return n == 0; 174 s += (n-1); 175 if (*s < 0x80) return true; 176 if (*s >= 0xC0) return false; 177 if (n == 1) return true; 178 if (*--s < 0x80) return true; 179 if (*s >= 0xE0) return false; 180 if (n == 2 || *s >= 0xC0) return true; 181 if (*--s < 0x80) return true; 182 if (*s >= 0xF0) return false; 183 return true; 184 } 185 186 }; 187 188 189 template <typename C> 190 class _utf_iterator 191 { 192 typedef _utf_codec<sizeof(C)*8> codec; 193 194 C * cp; 195 mutable int8 sl; 196 197 public: 198 typedef C codeunit_type; 199 typedef uchar_t value_type; 200 typedef uchar_t * pointer; 201 202 class reference 203 { 204 const _utf_iterator & _i; 205 206 reference(const _utf_iterator & i): _i(i) {} 207 public: 208 operator value_type () const throw () { return codec::get(_i.cp, _i.sl); } 209 reference & operator = (const value_type usv) throw() { codec::put(_i.cp, usv, _i.sl); return *this; } 210 211 friend class _utf_iterator; 212 }; 213 214 215 _utf_iterator(const void * us=0) : cp(reinterpret_cast<C *>(const_cast<void *>(us))), sl(1) { } 216 217 _utf_iterator & operator ++ () { cp += abs(sl); return *this; } 218 _utf_iterator operator ++ (int) { _utf_iterator tmp(*this); operator++(); return tmp; } 219 220 bool operator == (const _utf_iterator & rhs) const throw() { return cp >= rhs.cp; } 221 bool operator != (const _utf_iterator & rhs) const throw() { return !operator==(rhs); } 222 223 reference operator * () const throw() { return *this; } 224 pointer operator ->() const throw() { return &operator *(); } 225 226 operator codeunit_type * () const throw() { return cp; } 227 228 bool error() const throw() { return sl < 1; } 229 bool validate(const _utf_iterator & e) { return codec::validate(cp, e.cp); } 230 }; 231 232 template <typename C> 233 struct utf 234 { 235 typedef typename _utf_codec<sizeof(C)*8>::codeunit_t codeunit_t; 236 237 typedef _utf_iterator<C> iterator; 238 typedef _utf_iterator<const C> const_iterator; 239 240 inline 241 static bool validate(codeunit_t * s, codeunit_t * e) throw() { 242 return _utf_codec<sizeof(C)*8>::validate(s,e); 243 } 244 }; 245 246 247 typedef utf<uint32> utf32; 248 typedef utf<uint16> utf16; 249 typedef utf<uint8> utf8; 250 251 } // namespace graphite2 252