1 /* Pango
2 * pango-script.c: Script tag handling
3 *
4 * Copyright (C) 2002 Red Hat Software
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Library General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Library General Public License for more details.
15 *
16 * You should have received a copy of the GNU Library General Public
17 * License along with this library; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
20 *
21 * Implementation of pango_script_iter is derived from ICU:
22 *
23 * icu/sources/common/usc_impl.c
24 *
25 **********************************************************************
26 * Copyright (C) 1999-2002, International Business Machines
27 * Corporation and others. All Rights Reserved.
28 **********************************************************************
29 *
30 * Permission is hereby granted, free of charge, to any person obtaining a
31 * copy of this software and associated documentation files (the
32 * "Software"), to deal in the Software without restriction, including
33 * without limitation the rights to use, copy, modify, merge, publish,
34 * distribute, and/or sell copies of the Software, and to permit persons
35 * to whom the Software is furnished to do so, provided that the above
36 * copyright notice(s) and this permission notice appear in all copies of
37 * the Software and that both the above copyright notice(s) and this
38 * permission notice appear in supporting documentation.
39 *
40 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
41 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
42 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
43 * OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
44 * HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL
45 * INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING
46 * FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
47 * NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
48 * WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
49 *
50 * Except as contained in this notice, the name of a copyright holder
51 * shall not be used in advertising or otherwise to promote the sale, use
52 * or other dealings in this Software without prior written authorization
53 * of the copyright holder.
54 */
55
56 #include "config.h"
57 #include <stdlib.h>
58 #include <string.h>
59
60 #include "pango-script.h"
61 #include "pango-script-private.h"
62
63 /**
64 * pango_script_for_unichar:
65 * @ch: a Unicode character
66 *
67 * Looks up the script for a particular character.
68 *
69 * The script of a character is defined by Unicode Standard Annex \#24.
70 * No check is made for @ch being a valid Unicode character; if you pass
71 * in invalid character, the result is undefined.
72 *
73 * Note that while the return type of this function is declared
74 * as `PangoScript`, as of Pango 1.18, this function simply returns
75 * the return value of g_unichar_get_script(). Callers must be
76 * prepared to handle unknown values.
77 *
78 * Return value: the `PangoScript` for the character.
79 *
80 * Since: 1.4
81 * Deprecated: 1.44. Use g_unichar_get_script()
82 **/
83 PangoScript
pango_script_for_unichar(gunichar ch)84 pango_script_for_unichar (gunichar ch)
85 {
86 return (PangoScript)g_unichar_get_script (ch);
87 }
88
89 /**********************************************************************/
90
91 static PangoScriptIter *pango_script_iter_copy (PangoScriptIter *iter);
92
G_DEFINE_BOXED_TYPE(PangoScriptIter,pango_script_iter,pango_script_iter_copy,pango_script_iter_free)93 G_DEFINE_BOXED_TYPE (PangoScriptIter,
94 pango_script_iter,
95 pango_script_iter_copy,
96 pango_script_iter_free)
97
98 PangoScriptIter *
99 _pango_script_iter_init (PangoScriptIter *iter,
100 const char *text,
101 int length)
102 {
103 iter->text_start = text;
104 if (length >= 0)
105 iter->text_end = text + length;
106 else
107 iter->text_end = text + strlen (text);
108
109 iter->script_start = text;
110 iter->script_end = text;
111 iter->script_code = PANGO_SCRIPT_COMMON;
112
113 iter->paren_sp = -1;
114
115 pango_script_iter_next (iter);
116
117 return iter;
118 }
119
120 /**
121 * pango_script_iter_new:
122 * @text: a UTF-8 string
123 * @length: length of @text, or -1 if @text is nul-terminated.
124 *
125 * Create a new `PangoScriptIter`, used to break a string of
126 * Unicode text into runs by Unicode script.
127 *
128 * No copy is made of @text, so the caller needs to make
129 * sure it remains valid until the iterator is freed with
130 * [method@Pango.ScriptIter.free].
131 *
132 * Return value: the new script iterator, initialized
133 * to point at the first range in the text, which should be
134 * freed with [method@Pango.ScriptIter.free]. If the string is
135 * empty, it will point at an empty range.
136 *
137 * Since: 1.4
138 **/
139 PangoScriptIter *
pango_script_iter_new(const char * text,int length)140 pango_script_iter_new (const char *text,
141 int length)
142 {
143 return _pango_script_iter_init (g_slice_new (PangoScriptIter), text, length);
144 }
145
146 static PangoScriptIter *
pango_script_iter_copy(PangoScriptIter * iter)147 pango_script_iter_copy (PangoScriptIter *iter)
148 {
149 return g_slice_dup (PangoScriptIter, iter);
150 }
151
152 void
_pango_script_iter_fini(PangoScriptIter * iter)153 _pango_script_iter_fini (PangoScriptIter *iter)
154 {
155 }
156
157 /**
158 * pango_script_iter_free:
159 * @iter: a `PangoScriptIter`
160 *
161 * Frees a `PangoScriptIter`.
162 *
163 * Since: 1.4
164 */
165 void
pango_script_iter_free(PangoScriptIter * iter)166 pango_script_iter_free (PangoScriptIter *iter)
167 {
168 _pango_script_iter_fini (iter);
169 g_slice_free (PangoScriptIter, iter);
170 }
171
172 /**
173 * pango_script_iter_get_range:
174 * @iter: a `PangoScriptIter`
175 * @start: (out) (optional): location to store start position of the range
176 * @end: (out) (optional): location to store end position of the range
177 * @script: (out) (optional): location to store script for range
178 *
179 * Gets information about the range to which @iter currently points.
180 * The range is the set of locations p where *start <= p < *end.
181 * (That is, it doesn't include the character stored at *end)
182 *
183 * Note that while the type of the @script argument is declared
184 * as `PangoScript`, as of Pango 1.18, this function simply returns
185 * GUnicodeScript values. Callers must be prepared to handle unknown
186 * values.
187 *
188 * Since: 1.4
189 */
190 void
pango_script_iter_get_range(PangoScriptIter * iter,const char ** start,const char ** end,PangoScript * script)191 pango_script_iter_get_range (PangoScriptIter *iter,
192 const char **start,
193 const char **end,
194 PangoScript *script)
195 {
196 if (start)
197 *start = iter->script_start;
198 if (end)
199 *end = iter->script_end;
200 if (script)
201 *script = iter->script_code;
202 }
203
204 static const gunichar paired_chars[] = {
205 0x0028, 0x0029, /* ascii paired punctuation */
206 0x003c, 0x003e,
207 0x005b, 0x005d,
208 0x007b, 0x007d,
209 0x00ab, 0x00bb, /* guillemets */
210 0x0f3a, 0x0f3b, /* tibetan */
211 0x0f3c, 0x0f3d,
212 0x169b, 0x169c, /* ogham */
213 0x2018, 0x2019, /* general punctuation */
214 0x201c, 0x201d,
215 0x2039, 0x203a,
216 0x2045, 0x2046,
217 0x207d, 0x207e,
218 0x208d, 0x208e,
219 0x27e6, 0x27e7, /* math */
220 0x27e8, 0x27e9,
221 0x27ea, 0x27eb,
222 0x27ec, 0x27ed,
223 0x27ee, 0x27ef,
224 0x2983, 0x2984,
225 0x2985, 0x2986,
226 0x2987, 0x2988,
227 0x2989, 0x298a,
228 0x298b, 0x298c,
229 0x298d, 0x298e,
230 0x298f, 0x2990,
231 0x2991, 0x2992,
232 0x2993, 0x2994,
233 0x2995, 0x2996,
234 0x2997, 0x2998,
235 0x29fc, 0x29fd,
236 0x2e02, 0x2e03,
237 0x2e04, 0x2e05,
238 0x2e09, 0x2e0a,
239 0x2e0c, 0x2e0d,
240 0x2e1c, 0x2e1d,
241 0x2e20, 0x2e21,
242 0x2e22, 0x2e23,
243 0x2e24, 0x2e25,
244 0x2e26, 0x2e27,
245 0x2e28, 0x2e29,
246 0x3008, 0x3009, /* chinese paired punctuation */
247 0x300a, 0x300b,
248 0x300c, 0x300d,
249 0x300e, 0x300f,
250 0x3010, 0x3011,
251 0x3014, 0x3015,
252 0x3016, 0x3017,
253 0x3018, 0x3019,
254 0x301a, 0x301b,
255 0xfe59, 0xfe5a,
256 0xfe5b, 0xfe5c,
257 0xfe5d, 0xfe5e,
258 0xff08, 0xff09,
259 0xff3b, 0xff3d,
260 0xff5b, 0xff5d,
261 0xff5f, 0xff60,
262 0xff62, 0xff63
263 };
264
265 static int
get_pair_index(gunichar ch)266 get_pair_index (gunichar ch)
267 {
268 int lower = 0;
269 int upper = G_N_ELEMENTS (paired_chars) - 1;
270
271 while (lower <= upper)
272 {
273 int mid = (lower + upper) / 2;
274
275 if (ch < paired_chars[mid])
276 upper = mid - 1;
277 else if (ch > paired_chars[mid])
278 lower = mid + 1;
279 else
280 return mid;
281 }
282
283 return -1;
284 }
285
/* duplicated in pango-language.c */
/* A "real" script is anything ranked above INHERITED that is not UNKNOWN. */
#define REAL_SCRIPT(sc) \
  ((sc) > PANGO_SCRIPT_INHERITED && (sc) != PANGO_SCRIPT_UNKNOWN)

/* Two scripts are compatible unless both are real scripts and they differ. */
#define SAME_SCRIPT(a, b) \
  (!(REAL_SCRIPT (a) && REAL_SCRIPT (b)) || (a) == (b))

/* Opening characters occupy the even slots of paired_chars. */
#define IS_OPEN(idx) (!((idx) & 1))
294
295 /**
296 * pango_script_iter_next:
297 * @iter: a `PangoScriptIter`
298 *
299 * Advances a `PangoScriptIter` to the next range.
300 *
301 * If @iter is already at the end, it is left unchanged
302 * and %FALSE is returned.
303 *
304 * Return value: %TRUE if @iter was successfully advanced
305 *
306 * Since: 1.4
307 */
308 gboolean
pango_script_iter_next(PangoScriptIter * iter)309 pango_script_iter_next (PangoScriptIter *iter)
310 {
311 int start_sp;
312
313 if (iter->script_end == iter->text_end)
314 return FALSE;
315
316 start_sp = iter->paren_sp;
317 iter->script_code = PANGO_SCRIPT_COMMON;
318 iter->script_start = iter->script_end;
319
320 for (; iter->script_end < iter->text_end; iter->script_end = g_utf8_next_char (iter->script_end))
321 {
322 gunichar ch = g_utf8_get_char (iter->script_end);
323 PangoScript sc;
324 int pair_index;
325
326 sc = (PangoScript)g_unichar_get_script (ch);
327 if (sc != PANGO_SCRIPT_COMMON)
328 pair_index = -1;
329 else
330 pair_index = get_pair_index (ch);
331
332 /*
333 * Paired character handling:
334 *
335 * if it's an open character, push it onto the stack.
336 * if it's a close character, find the matching open on the
337 * stack, and use that script code. Any non-matching open
338 * characters above it on the stack will be poped.
339 */
340 if (pair_index >= 0)
341 {
342 if (IS_OPEN (pair_index))
343 {
344 /*
345 * If the paren stack is full, empty it. This
346 * means that deeply nested paired punctuation
347 * characters will be ignored, but that's an unusual
348 * case, and it's better to ignore them than to
349 * write off the end of the stack...
350 */
351 if (++iter->paren_sp >= PAREN_STACK_DEPTH)
352 iter->paren_sp = 0;
353
354 iter->paren_stack[iter->paren_sp].pair_index = pair_index;
355 iter->paren_stack[iter->paren_sp].script_code = iter->script_code;
356 }
357 else if (iter->paren_sp >= 0)
358 {
359 int pi = pair_index & ~1;
360
361 while (iter->paren_sp >= 0 && iter->paren_stack[iter->paren_sp].pair_index != pi)
362 iter->paren_sp--;
363
364 if (iter->paren_sp < start_sp)
365 start_sp = iter->paren_sp;
366
367 if (iter->paren_sp >= 0)
368 sc = iter->paren_stack[iter->paren_sp].script_code;
369 }
370 }
371
372 if (SAME_SCRIPT (iter->script_code, sc))
373 {
374 if (!REAL_SCRIPT (iter->script_code) && REAL_SCRIPT (sc))
375 {
376 iter->script_code = sc;
377
378 /*
379 * now that we have a final script code, fix any open
380 * characters we pushed before we knew the script code.
381 */
382 while (start_sp < iter->paren_sp)
383 iter->paren_stack[++start_sp].script_code = iter->script_code;
384 }
385
386 /*
387 * if this character is a close paired character,
388 * pop it from the stack
389 */
390 if (pair_index >= 0 && !IS_OPEN (pair_index) && iter->paren_sp >= 0)
391 {
392 iter->paren_sp--;
393
394 if (iter->paren_sp < start_sp)
395 start_sp = iter->paren_sp;
396 }
397 }
398 else
399 {
400 /* Different script, we're done */
401 break;
402 }
403 }
404
405 return TRUE;
406 }
407
408 /**********************************************************
409 * End of code from ICU
410 **********************************************************/
411