1 /* Pango
2  * pango-script.c: Script tag handling
3  *
4  * Copyright (C) 2002 Red Hat Software
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Library General Public
8  * License as published by the Free Software Foundation; either
9  * version 2 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
14  * Library General Public License for more details.
15  *
16  * You should have received a copy of the GNU Library General Public
17  * License along with this library; if not, write to the
18  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19  * Boston, MA 02111-1307, USA.
20  *
21  * Implementation of pango_script_iter is derived from ICU:
22  *
23  *  icu/sources/common/usc_impl.c
24  *
25  **********************************************************************
26  *   Copyright (C) 1999-2002, International Business Machines
27  *   Corporation and others.  All Rights Reserved.
28  **********************************************************************
29  *
30  * Permission is hereby granted, free of charge, to any person obtaining a
31  * copy of this software and associated documentation files (the
32  * "Software"), to deal in the Software without restriction, including
33  * without limitation the rights to use, copy, modify, merge, publish,
34  * distribute, and/or sell copies of the Software, and to permit persons
35  * to whom the Software is furnished to do so, provided that the above
36  * copyright notice(s) and this permission notice appear in all copies of
37  * the Software and that both the above copyright notice(s) and this
38  * permission notice appear in supporting documentation.
39  *
40  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
41  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
42  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
43  * OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
44  * HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL
45  * INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING
46  * FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
47  * NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
48  * WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
49  *
50  * Except as contained in this notice, the name of a copyright holder
51  * shall not be used in advertising or otherwise to promote the sale, use
52  * or other dealings in this Software without prior written authorization
53  * of the copyright holder.
54  */
55 
56 #include "config.h"
57 #include <stdlib.h>
58 #include <string.h>
59 
60 #include "pango-script.h"
61 #include "pango-script-private.h"
62 
63 /**
64  * pango_script_for_unichar:
65  * @ch: a Unicode character
66  *
67  * Looks up the script for a particular character.
68  *
69  * The script of a character is defined by Unicode Standard Annex \#24.
70  * No check is made for @ch being a valid Unicode character; if you pass
71  * in invalid character, the result is undefined.
72  *
73  * Note that while the return type of this function is declared
74  * as `PangoScript`, as of Pango 1.18, this function simply returns
75  * the return value of g_unichar_get_script(). Callers must be
76  * prepared to handle unknown values.
77  *
78  * Return value: the `PangoScript` for the character.
79  *
80  * Since: 1.4
81  * Deprecated: 1.44. Use g_unichar_get_script()
82  **/
83 PangoScript
pango_script_for_unichar(gunichar ch)84 pango_script_for_unichar (gunichar ch)
85 {
86   return (PangoScript)g_unichar_get_script (ch);
87 }
88 
89 /**********************************************************************/
90 
91 static PangoScriptIter *pango_script_iter_copy (PangoScriptIter *iter);
92 
G_DEFINE_BOXED_TYPE(PangoScriptIter,pango_script_iter,pango_script_iter_copy,pango_script_iter_free)93 G_DEFINE_BOXED_TYPE (PangoScriptIter,
94                      pango_script_iter,
95                      pango_script_iter_copy,
96                      pango_script_iter_free)
97 
98 PangoScriptIter *
99 _pango_script_iter_init (PangoScriptIter *iter,
100 	                 const char      *text,
101 			 int              length)
102 {
103   iter->text_start = text;
104   if (length >= 0)
105     iter->text_end = text + length;
106   else
107     iter->text_end = text + strlen (text);
108 
109   iter->script_start = text;
110   iter->script_end = text;
111   iter->script_code = PANGO_SCRIPT_COMMON;
112 
113   iter->paren_sp = -1;
114 
115   pango_script_iter_next (iter);
116 
117   return iter;
118 }
119 
120 /**
121  * pango_script_iter_new:
122  * @text: a UTF-8 string
123  * @length: length of @text, or -1 if @text is nul-terminated.
124  *
125  * Create a new `PangoScriptIter`, used to break a string of
126  * Unicode text into runs by Unicode script.
127  *
128  * No copy is made of @text, so the caller needs to make
129  * sure it remains valid until the iterator is freed with
130  * [method@Pango.ScriptIter.free].
131  *
132  * Return value: the new script iterator, initialized
133  *  to point at the first range in the text, which should be
134  *  freed with [method@Pango.ScriptIter.free]. If the string is
135  *  empty, it will point at an empty range.
136  *
137  * Since: 1.4
138  **/
139 PangoScriptIter *
pango_script_iter_new(const char * text,int length)140 pango_script_iter_new (const char *text,
141 		       int         length)
142 {
143   return _pango_script_iter_init (g_slice_new (PangoScriptIter), text, length);
144 }
145 
146 static PangoScriptIter *
pango_script_iter_copy(PangoScriptIter * iter)147 pango_script_iter_copy (PangoScriptIter *iter)
148 {
149   return g_slice_dup (PangoScriptIter, iter);
150 }
151 
152 void
_pango_script_iter_fini(PangoScriptIter * iter)153 _pango_script_iter_fini (PangoScriptIter *iter)
154 {
155 }
156 
157 /**
158  * pango_script_iter_free:
159  * @iter: a `PangoScriptIter`
160  *
161  * Frees a `PangoScriptIter`.
162  *
163  * Since: 1.4
164  */
165 void
pango_script_iter_free(PangoScriptIter * iter)166 pango_script_iter_free (PangoScriptIter *iter)
167 {
168   _pango_script_iter_fini (iter);
169   g_slice_free (PangoScriptIter, iter);
170 }
171 
172 /**
173  * pango_script_iter_get_range:
174  * @iter: a `PangoScriptIter`
175  * @start: (out) (optional): location to store start position of the range
176  * @end: (out) (optional): location to store end position of the range
177  * @script: (out) (optional): location to store script for range
178  *
179  * Gets information about the range to which @iter currently points.
180  * The range is the set of locations p where *start <= p < *end.
181  * (That is, it doesn't include the character stored at *end)
182  *
183  * Note that while the type of the @script argument is declared
184  * as `PangoScript`, as of Pango 1.18, this function simply returns
185  * GUnicodeScript values. Callers must be prepared to handle unknown
186  * values.
187  *
188  * Since: 1.4
189  */
190 void
pango_script_iter_get_range(PangoScriptIter * iter,const char ** start,const char ** end,PangoScript * script)191 pango_script_iter_get_range (PangoScriptIter  *iter,
192                              const char      **start,
193                              const char      **end,
194                              PangoScript      *script)
195 {
196   if (start)
197     *start = iter->script_start;
198   if (end)
199     *end = iter->script_end;
200   if (script)
201     *script = iter->script_code;
202 }
203 
204 static const gunichar paired_chars[] = {
205   0x0028, 0x0029, /* ascii paired punctuation */
206   0x003c, 0x003e,
207   0x005b, 0x005d,
208   0x007b, 0x007d,
209   0x00ab, 0x00bb, /* guillemets */
210   0x0f3a, 0x0f3b, /* tibetan */
211   0x0f3c, 0x0f3d,
212   0x169b, 0x169c, /* ogham */
213   0x2018, 0x2019, /* general punctuation */
214   0x201c, 0x201d,
215   0x2039, 0x203a,
216   0x2045, 0x2046,
217   0x207d, 0x207e,
218   0x208d, 0x208e,
219   0x27e6, 0x27e7, /* math */
220   0x27e8, 0x27e9,
221   0x27ea, 0x27eb,
222   0x27ec, 0x27ed,
223   0x27ee, 0x27ef,
224   0x2983, 0x2984,
225   0x2985, 0x2986,
226   0x2987, 0x2988,
227   0x2989, 0x298a,
228   0x298b, 0x298c,
229   0x298d, 0x298e,
230   0x298f, 0x2990,
231   0x2991, 0x2992,
232   0x2993, 0x2994,
233   0x2995, 0x2996,
234   0x2997, 0x2998,
235   0x29fc, 0x29fd,
236   0x2e02, 0x2e03,
237   0x2e04, 0x2e05,
238   0x2e09, 0x2e0a,
239   0x2e0c, 0x2e0d,
240   0x2e1c, 0x2e1d,
241   0x2e20, 0x2e21,
242   0x2e22, 0x2e23,
243   0x2e24, 0x2e25,
244   0x2e26, 0x2e27,
245   0x2e28, 0x2e29,
246   0x3008, 0x3009, /* chinese paired punctuation */
247   0x300a, 0x300b,
248   0x300c, 0x300d,
249   0x300e, 0x300f,
250   0x3010, 0x3011,
251   0x3014, 0x3015,
252   0x3016, 0x3017,
253   0x3018, 0x3019,
254   0x301a, 0x301b,
255   0xfe59, 0xfe5a,
256   0xfe5b, 0xfe5c,
257   0xfe5d, 0xfe5e,
258   0xff08, 0xff09,
259   0xff3b, 0xff3d,
260   0xff5b, 0xff5d,
261   0xff5f, 0xff60,
262   0xff62, 0xff63
263 };
264 
265 static int
get_pair_index(gunichar ch)266 get_pair_index (gunichar ch)
267 {
268   int lower = 0;
269   int upper = G_N_ELEMENTS (paired_chars) - 1;
270 
271   while (lower <= upper)
272     {
273       int mid = (lower + upper) / 2;
274 
275       if (ch < paired_chars[mid])
276 	upper = mid - 1;
277       else if (ch > paired_chars[mid])
278 	lower = mid + 1;
279       else
280 	return mid;
281     }
282 
283   return -1;
284 }
285 
/* duplicated in pango-language.c */
/* A script counts as "real" when its value lies above
 * PANGO_SCRIPT_INHERITED and it is not PANGO_SCRIPT_UNKNOWN.
 */
#define REAL_SCRIPT(sc) \
  ((sc) > PANGO_SCRIPT_INHERITED && (sc) != PANGO_SCRIPT_UNKNOWN)

/* Two scripts are compatible within one run if they are equal, or if
 * at least one of them is not a real script.
 */
#define SAME_SCRIPT(sc_a, sc_b) \
  ((sc_a) == (sc_b) || !REAL_SCRIPT (sc_a) || !REAL_SCRIPT (sc_b))

/* Opening characters occupy the even slots of paired_chars. */
#define IS_OPEN(idx) (!((idx) & 1))
294 
295 /**
296  * pango_script_iter_next:
297  * @iter: a `PangoScriptIter`
298  *
299  * Advances a `PangoScriptIter` to the next range.
300  *
301  * If @iter is already at the end, it is left unchanged
302  * and %FALSE is returned.
303  *
304  * Return value: %TRUE if @iter was successfully advanced
305  *
306  * Since: 1.4
307  */
308 gboolean
pango_script_iter_next(PangoScriptIter * iter)309 pango_script_iter_next (PangoScriptIter *iter)
310 {
311   int start_sp;
312 
313   if (iter->script_end == iter->text_end)
314     return FALSE;
315 
316   start_sp = iter->paren_sp;
317   iter->script_code = PANGO_SCRIPT_COMMON;
318   iter->script_start = iter->script_end;
319 
320   for (; iter->script_end < iter->text_end; iter->script_end = g_utf8_next_char (iter->script_end))
321     {
322       gunichar ch = g_utf8_get_char (iter->script_end);
323       PangoScript sc;
324       int pair_index;
325 
326       sc = (PangoScript)g_unichar_get_script (ch);
327       if (sc != PANGO_SCRIPT_COMMON)
328 	pair_index = -1;
329       else
330 	pair_index = get_pair_index (ch);
331 
332       /*
333        * Paired character handling:
334        *
335        * if it's an open character, push it onto the stack.
336        * if it's a close character, find the matching open on the
337        * stack, and use that script code. Any non-matching open
338        * characters above it on the stack will be poped.
339        */
340       if (pair_index >= 0)
341 	{
342 	  if (IS_OPEN (pair_index))
343 	    {
344 	      /*
345 	       * If the paren stack is full, empty it. This
346 	       * means that deeply nested paired punctuation
347 	       * characters will be ignored, but that's an unusual
348 	       * case, and it's better to ignore them than to
349 	       * write off the end of the stack...
350 	       */
351 	      if (++iter->paren_sp >= PAREN_STACK_DEPTH)
352 		iter->paren_sp = 0;
353 
354 	      iter->paren_stack[iter->paren_sp].pair_index = pair_index;
355 	      iter->paren_stack[iter->paren_sp].script_code = iter->script_code;
356 	    }
357 	  else if (iter->paren_sp >= 0)
358 	    {
359 	      int pi = pair_index & ~1;
360 
361 	      while (iter->paren_sp >= 0 && iter->paren_stack[iter->paren_sp].pair_index != pi)
362 		iter->paren_sp--;
363 
364 	      if (iter->paren_sp < start_sp)
365 		start_sp = iter->paren_sp;
366 
367 	      if (iter->paren_sp >= 0)
368 		sc = iter->paren_stack[iter->paren_sp].script_code;
369 	    }
370 	}
371 
372       if (SAME_SCRIPT (iter->script_code, sc))
373 	{
374 	  if (!REAL_SCRIPT (iter->script_code) && REAL_SCRIPT (sc))
375 	    {
376 	      iter->script_code = sc;
377 
378 	      /*
379 	       * now that we have a final script code, fix any open
380 	       * characters we pushed before we knew the script code.
381 	       */
382 	      while (start_sp < iter->paren_sp)
383 		iter->paren_stack[++start_sp].script_code = iter->script_code;
384 	    }
385 
386 	  /*
387 	   * if this character is a close paired character,
388 	   * pop it from the stack
389 	   */
390 	  if (pair_index >= 0 && !IS_OPEN (pair_index) && iter->paren_sp >= 0)
391 	    {
392 	      iter->paren_sp--;
393 
394 	      if (iter->paren_sp < start_sp)
395 		start_sp = iter->paren_sp;
396 	    }
397 	}
398       else
399 	{
400 	  /* Different script, we're done */
401 	  break;
402 	}
403     }
404 
405   return TRUE;
406 }
407 
408 /**********************************************************
409  * End of code from ICU
410  **********************************************************/
411