1 /*
2 * FTGL - OpenGL font library
3 *
4 * Copyright (c) 2008 Daniel Remenak <dtremenak@users.sourceforge.net>
5 *
6 * Portions derived from ConvertUTF.c Copyright (C) 2001-2004 Unicode, Inc
7 * Unicode, Inc. hereby grants the right to freely use the information
8 * supplied in this file in the creation of products supporting the
9 * Unicode Standard, and to make copies of this file in any form
10 * for internal or external distribution as long as this notice
11 * remains attached.
12 *
13 * Permission is hereby granted, free of charge, to any person obtaining
14 * a copy of this software and associated documentation files (the
15 * "Software"), to deal in the Software without restriction, including
16 * without limitation the rights to use, copy, modify, merge, publish,
17 * distribute, sublicense, and/or sell copies of the Software, and to
18 * permit persons to whom the Software is furnished to do so, subject to
19 * the following conditions:
20 *
21 * The above copyright notice and this permission notice shall be
22 * included in all copies or substantial portions of the Software.
23 *
24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
27 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
28 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
29 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
30 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31 */
32
33 #ifndef __FTUnicode__
34 #define __FTUnicode__
35
/**
 * Forward iterator over the Unicode code points of a raw, caller-owned
 * string buffer.
 *
 * The encoding is selected from the size of the element type T:
 *   - sizeof(T) == 1 : UTF-8
 *   - sizeof(T) == 2 : UTF-16 (surrogate pairs are combined)
 *   - anything else  : each element is taken as one code point (UTF-32 /
 *                      UCS-4)
 *
 * Elements larger than one byte must already be in the correct endian
 * order for the current architecture. The iterator never copies or owns
 * the buffer and has no notion of its length; callers normally stop when
 * the dereferenced value is 0.
 */
template <typename T>
class FTUnicodeStringItr
{
    public:
        /**
         * Constructor. Also reads the first character and stores it.
         *
         * @param string  The buffer to iterate. No copy is made. A null
         *                pointer yields an iterator whose current
         *                character is 0.
         */
        FTUnicodeStringItr(const T* string)
        :   curPos(string),
            curChar(0),
            nextPos(string)
        {
            // Pre-increment: the post-increment form would copy *this
            // before every member has been given a value.
            ++(*this);
        }

        /**
         * Pre-increment operator. Reads the next unicode character and
         * sets the state appropriately.
         *
         * Note - the buffer length is unknown, so the caller must not
         * advance past the terminator. Incrementing an iterator that was
         * built from a null pointer is a no-op.
         *
         * @return  A reference to this iterator.
         */
        FTUnicodeStringItr& operator++()
        {
            if (nextPos == 0)
            {
                // Null buffer: behave like an empty string.
                curChar = 0;
                return *this;
            }

            curPos = nextPos;

            switch (sizeof(T))
            {
                case 1: // UTF-8
                    readUTF8();
                    break;
                case 2: // UTF-16
                    readUTF16();
                    break;
                case 4: // UTF-32
                    // fall through
                default: // error condition really, but give it a shot anyway
                    curChar = *nextPos++;
            }
            return *this;
        }

        /**
         * Post-increment operator. Reads the next character and sets
         * the state appropriately.
         *
         * @return  A copy of the iterator as it was before advancing.
         */
        FTUnicodeStringItr operator++(int)
        {
            FTUnicodeStringItr previous = *this;
            ++(*this);
            return previous;
        }

        /**
         * Equality operator. Two FTUnicodeStringItrs are considered equal
         * if they point at the same buffer position.
         */
        bool operator==(const FTUnicodeStringItr& right) const
        {
            return curPos == right.curPos;
        }

        /**
         * Inequality operator; the exact negation of operator==.
         */
        bool operator!=(const FTUnicodeStringItr& right) const
        {
            return !(*this == right);
        }

        /**
         * Dereference operator.
         *
         * @return  The unicode codepoint of the character currently pointed
         *          to by the FTUnicodeStringItr.
         */
        unsigned int operator*() const
        {
            return curChar;
        }

        /**
         * Buffer-fetching getter. You can use this to retrieve the buffer
         * starting at the currently-iterated character for functions which
         * require a Unicode string as input.
         *
         * @return  Pointer to the first element of the current character.
         */
        const T* getBufferFromHere() const { return curPos; }

    private:
        /**
         * Helper function for reading a single UTF8 character from the
         * string. Stores the result in curChar and advances nextPos.
         */
        void readUTF8();

        /**
         * Helper function for reading a single UTF16 character from the
         * string. Stores the result in curChar and advances nextPos.
         */
        void readUTF16();

        /**
         * The buffer position of the first element in the current character.
         */
        const T* curPos;

        /**
         * The character stored at the current buffer position (prefetched on
         * increment, so there's no penalty for dereferencing more than once).
         */
        unsigned int curChar;

        /**
         * The buffer position of the first element in the next character.
         */
        const T* nextPos;

        // unicode magic numbers
        static const char utf8bytes[256];
        static const unsigned long offsetsFromUTF8[6];
        static const unsigned long highSurrogateStart;
        static const unsigned long highSurrogateEnd;
        static const unsigned long lowSurrogateStart;
        static const unsigned long lowSurrogateEnd;
        static const unsigned long highSurrogateShift;
        static const unsigned long lowSurrogateBase;
};

/* Total length in bytes of a UTF-8 sequence, indexed by its first byte.
 * 0x00-0xBF -> 1, 0xC0-0xDF -> 2, 0xE0-0xEF -> 3, 0xF0-0xF7 -> 4,
 * 0xF8-0xFB -> 5, 0xFC-0xFF -> 6 (5 and 6 byte forms are illegal UTF-8
 * but are still walked). */
template <typename T>
const char FTUnicodeStringItr<T>::utf8bytes[256] = {
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,5,5,5,5,6,6,6,6
};

/* Magic values subtracted from the accumulated bytes during UTF-8
 * conversion, indexed by (sequence length - 1). Each value removes the
 * marker bits of the lead byte and of every continuation byte in one
 * step. */
template <typename T>
const unsigned long FTUnicodeStringItr<T>::offsetsFromUTF8[6] = {
    0x00000000UL, 0x00003080UL, 0x000E2080UL,
    0x03C82080UL, 0xFA082080UL, 0x82082080UL
};

// Get a UTF-8 character; leave the tracking pointer at the start of the
// next character.
//
// Every byte is read as unsigned char so that instantiations over a
// signed char type do not sign-extend bytes >= 0x80.
//
// A sequence that is cut short (the next byte is not of the form
// 10xxxxxx, e.g. the string terminator) yields U+FFFD and leaves the
// offending byte unread, so a truncated string cannot drag the iterator
// past its terminator. Overlong forms and stray continuation bytes are
// not rejected.
template <typename T>
inline void FTUnicodeStringItr<T>::readUTF8()
{
    const unsigned int lead = static_cast<unsigned char>(*nextPos);
    const unsigned int sequenceLength =
        static_cast<unsigned char>(utf8bytes[lead]);

    unsigned int ch = lead;
    ++nextPos;

    for (unsigned int i = 1; i < sequenceLength; ++i)
    {
        const unsigned int trail = static_cast<unsigned char>(*nextPos);
        if ((trail & 0xC0u) != 0x80u)
        {
            // Truncated sequence: report the replacement character and
            // let the next read start at the byte that broke it.
            curChar = 0xFFFDu;
            return;
        }
        ch = (ch << 6) + trail;
        ++nextPos;
    }

    curChar = static_cast<unsigned int>(ch - offsetsFromUTF8[sequenceLength - 1]);
}

// Magic numbers for UTF-16 conversions
template <typename T>
const unsigned long FTUnicodeStringItr<T>::highSurrogateStart = 0xD800;
template <typename T>
const unsigned long FTUnicodeStringItr<T>::highSurrogateEnd = 0xDBFF;
template <typename T>
const unsigned long FTUnicodeStringItr<T>::lowSurrogateStart = 0xDC00;
template <typename T>
const unsigned long FTUnicodeStringItr<T>::lowSurrogateEnd = 0xDFFF;
template <typename T>
const unsigned long FTUnicodeStringItr<T>::highSurrogateShift = 10;
template <typename T>
const unsigned long FTUnicodeStringItr<T>::lowSurrogateBase = 0x0010000UL;

// Get a UTF-16 character; leave the tracking pointer at the start of the
// next character.
//
// A high surrogate followed by a low surrogate is combined into a single
// code point above U+FFFF. An unpaired surrogate is returned as-is.
// Units are masked to 16 bits so signed 16-bit element types work too.
template <typename T>
inline void FTUnicodeStringItr<T>::readUTF16()
{
    unsigned int ch = static_cast<unsigned int>(*nextPos++) & 0xFFFFu;

    // if we have the first half of the surrogate pair
    if (ch >= highSurrogateStart && ch <= highSurrogateEnd)
    {
        // Peek at the unit that FOLLOWS the high surrogate (nextPos, not
        // curPos, which still points at the high surrogate itself).
        const unsigned int ch2 = static_cast<unsigned int>(*nextPos) & 0xFFFFu;

        // complete the surrogate pair
        if (ch2 >= lowSurrogateStart && ch2 <= lowSurrogateEnd)
        {
            ch = static_cast<unsigned int>(
                    ((ch - highSurrogateStart) << highSurrogateShift)
                    + (ch2 - lowSurrogateStart) + lowSurrogateBase);
            ++nextPos;
        }
    }
    curChar = ch;
}
236
237 #endif
238