1 /*
2  * FTGL - OpenGL font library
3  *
4  * Copyright (c) 2008 Daniel Remenak <dtremenak@users.sourceforge.net>
5  *
6  * Portions derived from ConvertUTF.c Copyright (C) 2001-2004 Unicode, Inc
7  *   Unicode, Inc. hereby grants the right to freely use the information
8  *   supplied in this file in the creation of products supporting the
9  *   Unicode Standard, and to make copies of this file in any form
10  *   for internal or external distribution as long as this notice
11  *   remains attached.
12  *
13  * Permission is hereby granted, free of charge, to any person obtaining
14  * a copy of this software and associated documentation files (the
15  * "Software"), to deal in the Software without restriction, including
16  * without limitation the rights to use, copy, modify, merge, publish,
17  * distribute, sublicense, and/or sell copies of the Software, and to
18  * permit persons to whom the Software is furnished to do so, subject to
19  * the following conditions:
20  *
21  * The above copyright notice and this permission notice shall be
22  * included in all copies or substantial portions of the Software.
23  *
24  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
27  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
28  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
29  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
30  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31  */
32 
33 #ifndef    __FTUnicode__
34 #define    __FTUnicode__
35 
/**
 * Iterator that walks a Unicode string one code point at a time.
 *
 * The size of the element type T selects how the buffer is decoded:
 *   - sizeof(T) == 1: UTF-8
 *   - sizeof(T) == 2: UTF-16 (which also covers UCS-2)
 *   - anything else:  every element is taken as one complete code point
 *                     (UTF-32 / UCS-4)
 *
 * Encodings with elements larger than one byte must already be in the
 * correct endian order for the current architecture.  The iterator neither
 * copies nor owns the buffer, and it performs no bounds checking.
 */
template <typename T>
class FTUnicodeStringItr
{
public:
    /**
     * Constructor.  Also reads the first character and stores it, so the
     * iterator can be dereferenced immediately.
     *
     * @param string  The buffer to iterate.  No copy is made, so the buffer
     *                must stay valid for as long as the iterator is used.
     */
    FTUnicodeStringItr(const T* string)
        : curPos(string), curChar(0), nextPos(string)
    {
        // Pre-increment on purpose: the post-increment form would first
        // copy the iterator before any character has been decoded.
        ++(*this);
    }

    /**
     * Pre-increment operator.  Reads the next unicode character and sets
     * the state appropriately.
     * Note - not protected against overruns.
     *
     * @return  A reference to this iterator, now positioned on the next
     *          character.
     */
    FTUnicodeStringItr& operator++()
    {
        // The character about to be decoded starts where the previous
        // one ended.
        curPos = nextPos;

        // Pick the decoder from the element width.
        switch (sizeof(T))
        {
            case 1: // UTF-8
                readUTF8();
                break;
            case 2: // UTF-16
                readUTF16();
                break;
            case 4: // UTF-32
                // fall through
            default: // error condition really, but give it a shot anyway
                curChar = *nextPos++;
                break;
        }
        return *this;
    }

    /**
     * Post-increment operator.  Reads the next character and sets
     * the state appropriately.
     * Note - not protected against overruns.
     *
     * @return  A copy of the iterator as it was before advancing.
     */
    FTUnicodeStringItr operator++(int)
    {
        FTUnicodeStringItr temp = *this;
        ++*this;
        return temp;
    }

    /**
     * Equality operator.  Two FTUnicodeStringItrs are considered equal
     * if they have the same current buffer and buffer position.
     */
    bool operator==(const FTUnicodeStringItr& right) const
    {
        return curPos == right.getBufferFromHere();
    }

    /**
     * Inequality operator.  The exact negation of operator==.
     */
    bool operator!=(const FTUnicodeStringItr& right) const
    {
        return !(*this == right);
    }

    /**
     * Dereference operator.
     *
     * @return  The unicode codepoint of the character currently pointed
     * to by the FTUnicodeStringItr.
     */
    unsigned int operator*() const
    {
        return curChar;
    }

    /**
     * Buffer-fetching getter.  You can use this to retrieve the buffer
     * starting at the currently-iterated character for functions which
     * require a Unicode string as input.
     *
     * @return  Pointer to the first element of the current character.
     */
    const T* getBufferFromHere() const { return curPos; }

private:
    /**
     * Helper function for reading a single UTF8 character from the string.
     * Updates internal state appropriately.
     */
    void readUTF8();

    /**
     * Helper function for reading a single UTF16 character from the string.
     * Updates internal state appropriately.
     */
    void readUTF16();

    /**
     * The buffer position of the first element in the current character.
     */
    const T* curPos;

    /**
     * The character stored at the current buffer position (prefetched on
     * increment, so there's no penalty for dereferencing more than once).
     */
    unsigned int curChar;

    /**
     * The buffer position of the first element in the next character.
     */
    const T* nextPos;

    // unicode magic numbers (defined after the class)
    static const char utf8bytes[256];
    static const unsigned long offsetsFromUTF8[6];
    static const unsigned long highSurrogateStart;
    static const unsigned long highSurrogateEnd;
    static const unsigned long lowSurrogateStart;
    static const unsigned long lowSurrogateEnd;
    static const unsigned long highSurrogateShift;
    static const unsigned long lowSurrogateBase;
};
160 
161 /* The first character in a UTF8 sequence indicates how many bytes
162  * to read (among other things) */
163 template <typename T>
164 const char FTUnicodeStringItr<T>::utf8bytes[256] = {
165   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
166   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
167   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
168   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
169   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
170   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
171   2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
172   3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,5,5,5,5,6,6,6,6
173 };
174 
175 /* Magic values subtracted from a buffer value during UTF8 conversion.
176  * This table contains as many values as there might be trailing bytes
177  * in a UTF-8 sequence. */
178 template <typename T>
179 const unsigned long FTUnicodeStringItr<T>::offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
180   0x03C82080UL, 0xFA082080UL, 0x82082080UL };
181 
182 // get a UTF8 character; leave the tracking pointer at the start of the
183 // next character
184 // not protected against invalid UTF8
185 template <typename T>
readUTF8()186 inline void FTUnicodeStringItr<T>::readUTF8()
187 {
188     unsigned int ch = 0;
189     unsigned int extraBytesToRead = utf8bytes[(unsigned char)(*nextPos)];
190     // falls through
191     switch (extraBytesToRead)
192     {
193           case 6: ch += *nextPos++; ch <<= 6; /* remember, illegal UTF-8 */
194           case 5: ch += *nextPos++; ch <<= 6; /* remember, illegal UTF-8 */
195           case 4: ch += *nextPos++; ch <<= 6;
196           case 3: ch += *nextPos++; ch <<= 6;
197           case 2: ch += *nextPos++; ch <<= 6;
198           case 1: ch += *nextPos++;
199     }
200     ch -= offsetsFromUTF8[extraBytesToRead-1];
201     curChar = ch;
202 }
203 
204 // Magic numbers for UTF-16 conversions
205 template <typename T>
206 const unsigned long FTUnicodeStringItr<T>::highSurrogateStart = 0xD800;
207 template <typename T>
208 const unsigned long FTUnicodeStringItr<T>::highSurrogateEnd = 0xDBFF;
209 template <typename T>
210 const unsigned long FTUnicodeStringItr<T>::lowSurrogateStart = 0xDC00;
211 template <typename T>
212 const unsigned long FTUnicodeStringItr<T>::lowSurrogateEnd = 0xDFFF;
213 template <typename T>
214 const unsigned long FTUnicodeStringItr<T>::highSurrogateShift = 10;
215 template <typename T>
216 const unsigned long FTUnicodeStringItr<T>::lowSurrogateBase = 0x0010000UL;
217 
218 template <typename T>
readUTF16()219 inline void FTUnicodeStringItr<T>::readUTF16()
220 {
221     unsigned int ch = *nextPos++;
222     // if we have the first half of the surrogate pair
223     if (ch >= highSurrogateStart && ch <= highSurrogateEnd)
224     {
225         unsigned int ch2 = *curPos;
226         // complete the surrogate pair
227         if (ch2 >= lowSurrogateStart && ch2 <= lowSurrogateEnd)
228         {
229             ch = ((ch - highSurrogateStart) << highSurrogateShift)
230                 + (ch2 - lowSurrogateStart) + lowSurrogateBase;
231             ++nextPos;
232         }
233     }
234     curChar = ch;
235 }
236 
237 #endif
238