1 /*
2  * Copyright © 2011,2012,2014  Google, Inc.
3  *
4  *  This is part of HarfBuzz, a text shaping library.
5  *
6  * Permission is hereby granted, without written agreement and without
7  * license or royalty fees, to use, copy, modify, and distribute this
8  * software and its documentation for any purpose, provided that the
9  * above copyright notice and the following two paragraphs appear in
10  * all copies of this software.
11  *
12  * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
13  * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
14  * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
15  * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
16  * DAMAGE.
17  *
18  * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
19  * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
20  * FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
21  * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
22  * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
23  *
24  * Google Author(s): Behdad Esfahbod
25  */
26 
27 #ifndef HB_UTF_PRIVATE_HH
28 #define HB_UTF_PRIVATE_HH
29 
30 #include "hb-private.hh"
31 
32 
33 struct hb_utf8_t
34 {
35   typedef uint8_t codepoint_t;
36 
37   static inline const uint8_t *
nexthb_utf8_t38   next (const uint8_t *text,
39 	const uint8_t *end,
40 	hb_codepoint_t *unicode,
41 	hb_codepoint_t replacement)
42   {
43     /* Written to only accept well-formed sequences.
44      * Based on ideas from ICU's U8_NEXT.
45      * Generates one "replacement" for each ill-formed byte. */
46 
47     hb_codepoint_t c = *text++;
48 
49     if (c > 0x7Fu)
50     {
51       if (hb_in_range<hb_codepoint_t> (c, 0xC2u, 0xDFu)) /* Two-byte */
52       {
53 	unsigned int t1;
54 	if (likely (text < end &&
55 		    (t1 = text[0] - 0x80u) <= 0x3Fu))
56 	{
57 	  c = ((c&0x1Fu)<<6) | t1;
58 	  text++;
59 	}
60 	else
61 	  goto error;
62       }
63       else if (hb_in_range<hb_codepoint_t> (c, 0xE0u, 0xEFu)) /* Three-byte */
64       {
65 	unsigned int t1, t2;
66 	if (likely (1 < end - text &&
67 		    (t1 = text[0] - 0x80u) <= 0x3Fu &&
68 		    (t2 = text[1] - 0x80u) <= 0x3Fu))
69 	{
70 	  c = ((c&0xFu)<<12) | (t1<<6) | t2;
71 	  if (unlikely (c < 0x0800u || hb_in_range<hb_codepoint_t> (c, 0xD800u, 0xDFFFu)))
72 	    goto error;
73 	  text += 2;
74 	}
75 	else
76 	  goto error;
77       }
78       else if (hb_in_range<hb_codepoint_t> (c, 0xF0u, 0xF4u)) /* Four-byte */
79       {
80 	unsigned int t1, t2, t3;
81 	if (likely (2 < end - text &&
82 		    (t1 = text[0] - 0x80u) <= 0x3Fu &&
83 		    (t2 = text[1] - 0x80u) <= 0x3Fu &&
84 		    (t3 = text[2] - 0x80u) <= 0x3Fu))
85 	{
86 	  c = ((c&0x7u)<<18) | (t1<<12) | (t2<<6) | t3;
87 	  if (unlikely (!hb_in_range<hb_codepoint_t> (c, 0x10000u, 0x10FFFFu)))
88 	    goto error;
89 	  text += 3;
90 	}
91 	else
92 	  goto error;
93       }
94       else
95 	goto error;
96     }
97 
98     *unicode = c;
99     return text;
100 
101   error:
102     *unicode = replacement;
103     return text;
104   }
105 
106   static inline const uint8_t *
prevhb_utf8_t107   prev (const uint8_t *text,
108 	const uint8_t *start,
109 	hb_codepoint_t *unicode,
110 	hb_codepoint_t replacement)
111   {
112     const uint8_t *end = text--;
113     while (start < text && (*text & 0xc0) == 0x80 && end - text < 4)
114       text--;
115 
116     if (likely (next (text, end, unicode, replacement) == end))
117       return text;
118 
119     *unicode = replacement;
120     return end - 1;
121   }
122 
123   static inline unsigned int
strlenhb_utf8_t124   strlen (const uint8_t *text)
125   {
126     return ::strlen ((const char *) text);
127   }
128 };
129 
130 
131 struct hb_utf16_t
132 {
133   typedef uint16_t codepoint_t;
134 
135   static inline const uint16_t *
nexthb_utf16_t136   next (const uint16_t *text,
137 	const uint16_t *end,
138 	hb_codepoint_t *unicode,
139 	hb_codepoint_t replacement)
140   {
141     hb_codepoint_t c = *text++;
142 
143     if (likely (!hb_in_range<hb_codepoint_t> (c, 0xD800u, 0xDFFFu)))
144     {
145       *unicode = c;
146       return text;
147     }
148 
149     if (likely (c <= 0xDBFFu && text < end))
150     {
151       /* High-surrogate in c */
152       hb_codepoint_t l = *text;
153       if (likely (hb_in_range<hb_codepoint_t> (l, 0xDC00u, 0xDFFFu)))
154       {
155 	/* Low-surrogate in l */
156 	*unicode = (c << 10) + l - ((0xD800u << 10) - 0x10000u + 0xDC00u);
157 	 text++;
158 	 return text;
159       }
160     }
161 
162     /* Lonely / out-of-order surrogate. */
163     *unicode = replacement;
164     return text;
165   }
166 
167   static inline const uint16_t *
prevhb_utf16_t168   prev (const uint16_t *text,
169 	const uint16_t *start,
170 	hb_codepoint_t *unicode,
171 	hb_codepoint_t replacement)
172   {
173     hb_codepoint_t c = *--text;
174 
175     if (likely (!hb_in_range<hb_codepoint_t> (c, 0xD800u, 0xDFFFu)))
176     {
177       *unicode = c;
178       return text;
179     }
180 
181     if (likely (c >= 0xDC00u && start < text))
182     {
183       /* Low-surrogate in c */
184       hb_codepoint_t h = text[-1];
185       if (likely (hb_in_range<hb_codepoint_t> (h, 0xD800u, 0xDBFFu)))
186       {
187         /* High-surrogate in h */
188         *unicode = (h << 10) + c - ((0xD800u << 10) - 0x10000u + 0xDC00u);
189         text--;
190         return text;
191       }
192     }
193 
194     /* Lonely / out-of-order surrogate. */
195     *unicode = replacement;
196     return text;
197   }
198 
199 
200   static inline unsigned int
strlenhb_utf16_t201   strlen (const uint16_t *text)
202   {
203     unsigned int l = 0;
204     while (*text++) l++;
205     return l;
206   }
207 };
208 
209 
210 template <bool validate=true>
211 struct hb_utf32_t
212 {
213   typedef uint32_t codepoint_t;
214 
215   static inline const uint32_t *
nexthb_utf32_t216   next (const uint32_t *text,
217 	const uint32_t *end HB_UNUSED,
218 	hb_codepoint_t *unicode,
219 	hb_codepoint_t replacement)
220   {
221     hb_codepoint_t c = *unicode = *text++;
222     if (validate && unlikely (c >= 0xD800u && (c <= 0xDFFFu || c > 0x10FFFFu)))
223       *unicode = replacement;
224     return text;
225   }
226 
227   static inline const uint32_t *
prevhb_utf32_t228   prev (const uint32_t *text,
229 	const uint32_t *start HB_UNUSED,
230 	hb_codepoint_t *unicode,
231 	hb_codepoint_t replacement)
232   {
233     hb_codepoint_t c = *unicode = *--text;
234     if (validate && unlikely (c >= 0xD800u && (c <= 0xDFFFu || c > 0x10FFFFu)))
235       *unicode = replacement;
236     return text;
237   }
238 
239   static inline unsigned int
strlenhb_utf32_t240   strlen (const uint32_t *text)
241   {
242     unsigned int l = 0;
243     while (*text++) l++;
244     return l;
245   }
246 };
247 
248 
249 struct hb_latin1_t
250 {
251   typedef uint8_t codepoint_t;
252 
253   static inline const uint8_t *
nexthb_latin1_t254   next (const uint8_t *text,
255 	const uint8_t *end HB_UNUSED,
256 	hb_codepoint_t *unicode,
257 	hb_codepoint_t replacement HB_UNUSED)
258   {
259     *unicode = *text++;
260     return text;
261   }
262 
263   static inline const uint8_t *
prevhb_latin1_t264   prev (const uint8_t *text,
265 	const uint8_t *start HB_UNUSED,
266 	hb_codepoint_t *unicode,
267 	hb_codepoint_t replacement)
268   {
269     *unicode = *--text;
270     return text;
271   }
272 
273   static inline unsigned int
strlenhb_latin1_t274   strlen (const uint8_t *text)
275   {
276     unsigned int l = 0;
277     while (*text++) l++;
278     return l;
279   }
280 };
281 
282 #endif /* HB_UTF_PRIVATE_HH */
283