1 /*
2  * Copyright © 2004 Noah Levitt
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms of the GNU General Public License as published by the
6  * Free Software Foundation; either version 3 of the License, or (at your
7  * option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful, but
10  * WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12  * General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License along
15  * with this program; if not, write to the Free Software Foundation, Inc.,
16  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
17  */
18 
19 #include <config.h>
20 #include <glib.h>
21 #include <string.h>
22 
23 #include <glib/gi18n-lib.h>
24 
25 #include "gucharmap.h"
26 #include "gucharmap-private.h"
27 
28 #include "unicode-scripts.h"
29 
30 typedef struct
31 {
32   gunichar start;
33   gunichar end;
34   gint index;   /* index of @start in the codepoint list */
35 }
36 UnicodeRange;
37 
38 struct _GucharmapScriptCodepointListPrivate
39 {
40   GPtrArray *ranges;
41 };
42 
43 static void gucharmap_script_codepoint_list_class_init (GucharmapScriptCodepointListClass *klass);
44 static void gucharmap_script_codepoint_list_init       (GucharmapScriptCodepointList      *list);
45 
G_DEFINE_TYPE(GucharmapScriptCodepointList,gucharmap_script_codepoint_list,GUCHARMAP_TYPE_CODEPOINT_LIST)46 G_DEFINE_TYPE (GucharmapScriptCodepointList, gucharmap_script_codepoint_list, GUCHARMAP_TYPE_CODEPOINT_LIST)
47 
48 static gint
49 find_script (const gchar *script)
50 {
51   gint min, mid, max;
52 
53   min = 0;
54   max = G_N_ELEMENTS (unicode_script_list_offsets) - 1;
55 
56   while (max >= min)
57     {
58       mid = (min + max) / 2;
59 
60       if (strcmp (script, unicode_script_list_strings + unicode_script_list_offsets[mid]) > 0)
61         min = mid + 1;
62       else if (strcmp (script, unicode_script_list_strings + unicode_script_list_offsets[mid]) < 0)
63         max = mid - 1;
64       else
65         return mid;
66     }
67 
68   return -1;
69 }
70 
71 /* *ranges should be freed by caller */
72 /* adds unlisted characters to the "Unknown" script */
73 static gboolean
get_chars_for_script(const gchar * script,UnicodeRange ** ranges,gint * size)74 get_chars_for_script (const gchar            *script,
75                       UnicodeRange          **ranges,
76                       gint                   *size)
77 {
78   gint i, j, index;
79   gint script_index, unknown_script_index;
80   gint prev_end;
81 
82   script_index = find_script (script);
83   unknown_script_index = find_script ("Unknown");
84   if (script_index == -1)
85     return FALSE;
86 
87   j = 0;
88 
89   if (script_index == unknown_script_index)
90     {
91       prev_end = -1;
92       for (i = 0;  i < G_N_ELEMENTS (unicode_scripts);  i++)
93 	{
94 	  if (unicode_scripts[i].start > prev_end + 1)
95 	    j++;
96 	  prev_end = unicode_scripts[i].end;
97 	}
98       if (unicode_scripts[i-1].end < UNICHAR_MAX)
99 	j++;
100     }
101 
102   for (i = 0;  i < G_N_ELEMENTS (unicode_scripts);  i++)
103     if (unicode_scripts[i].script_index == script_index)
104       j++;
105 
106   *size = j;
107   *ranges = g_new (UnicodeRange, *size);
108 
109   j = 0, index = 0, prev_end = -1;
110 
111   for (i = 0;  i < G_N_ELEMENTS (unicode_scripts);  i++)
112     {
113       if (script_index == unknown_script_index)
114 	{
115 	  if (unicode_scripts[i].start > prev_end + 1)
116 	    {
117 	      (*ranges)[j].start = prev_end + 1;
118 	      (*ranges)[j].end = unicode_scripts[i].start - 1;
119 	      (*ranges)[j].index = index;
120 
121 	      index += (*ranges)[j].end - (*ranges)[j].start + 1;
122 	      j++;
123 	    }
124 
125 	  prev_end = unicode_scripts[i].end;
126 	}
127 
128       if (unicode_scripts[i].script_index == script_index)
129 	{
130 	  (*ranges)[j].start = unicode_scripts[i].start;
131 	  (*ranges)[j].end = unicode_scripts[i].end;
132 	  (*ranges)[j].index = index;
133 
134 	  index += (*ranges)[j].end - (*ranges)[j].start + 1;
135 	  j++;
136 	}
137     }
138 
139   if (script_index == unknown_script_index)
140     {
141       if (unicode_scripts[i-1].end < UNICHAR_MAX)
142 	{
143 	  (*ranges)[j].start = unicode_scripts[i-1].end + 1;
144 	  (*ranges)[j].end = UNICHAR_MAX;
145 	  (*ranges)[j].index = index;
146 	  j++;
147 	}
148     }
149 
150 
151   g_assert (j == *size);
152 
153   return TRUE;
154 }
155 
156 static void
ensure_initialized(GucharmapScriptCodepointList * guscl)157 ensure_initialized (GucharmapScriptCodepointList *guscl)
158 {
159   GucharmapScriptCodepointListPrivate *priv = guscl->priv;
160   gboolean success;
161 
162   if (priv->ranges != NULL)
163     return;
164 
165   success = gucharmap_script_codepoint_list_set_script (guscl, "Latin");
166 
167   g_assert (success);
168 }
169 
170 static gunichar
get_char(GucharmapCodepointList * list,gint index)171 get_char (GucharmapCodepointList *list,
172           gint                    index)
173 {
174   GucharmapScriptCodepointList *guscl = GUCHARMAP_SCRIPT_CODEPOINT_LIST (list);
175   GucharmapScriptCodepointListPrivate *priv = guscl->priv;
176   gint min, mid, max;
177 
178   ensure_initialized (guscl);
179 
180   min = 0;
181   max = priv->ranges->len - 1;
182 
183   while (max >= min)
184     {
185       UnicodeRange *range;
186 
187       mid = (min + max) / 2;
188       range = (UnicodeRange *) (priv->ranges->pdata[mid]);
189 
190       if (index > range->index + range->end - range->start)
191         min = mid + 1;
192       else if (index < range->index)
193         max = mid - 1;
194       else
195         return range->start + index - range->index;
196     }
197 
198   return (gunichar)(-1);
199 }
200 
201 /* XXX: linear search */
202 static gint
get_index(GucharmapCodepointList * list,gunichar wc)203 get_index (GucharmapCodepointList *list,
204            gunichar                wc)
205 {
206   GucharmapScriptCodepointList *guscl = GUCHARMAP_SCRIPT_CODEPOINT_LIST (list);
207   GucharmapScriptCodepointListPrivate *priv = guscl->priv;
208   gint i;
209 
210   ensure_initialized (guscl);
211 
212   for (i = 0;  i < priv->ranges->len;  i++)
213     {
214       UnicodeRange *range = (UnicodeRange *) priv->ranges->pdata[i];
215       if (wc >= range->start && wc <= range->end)
216         return range->index + wc - range->start;
217     }
218 
219   return -1;
220 }
221 
222 static gint
get_last_index(GucharmapCodepointList * list)223 get_last_index (GucharmapCodepointList *list)
224 {
225   GucharmapScriptCodepointList *guscl = GUCHARMAP_SCRIPT_CODEPOINT_LIST (list);
226   GucharmapScriptCodepointListPrivate *priv = guscl->priv;
227   UnicodeRange *last_range;
228 
229   ensure_initialized (guscl);
230 
231   last_range = (UnicodeRange *) (priv->ranges->pdata[priv->ranges->len-1]);
232 
233   return last_range->index + last_range->end - last_range->start;
234 }
235 
236 static void
clear_ranges(GPtrArray * ranges)237 clear_ranges (GPtrArray *ranges)
238 {
239   guint i, n;
240 
241   n = ranges->len;
242   for (i = 0; i < n; ++i)
243     g_free (g_ptr_array_index (ranges, i));
244 
245   g_ptr_array_set_size (ranges, 0);
246 }
247 
248 static void
gucharmap_script_codepoint_list_finalize(GObject * object)249 gucharmap_script_codepoint_list_finalize (GObject *object)
250 {
251   GucharmapScriptCodepointList *guscl = GUCHARMAP_SCRIPT_CODEPOINT_LIST (object);
252   GucharmapScriptCodepointListPrivate *priv = guscl->priv;
253 
254   if (priv->ranges)
255     {
256       clear_ranges (priv->ranges);
257       g_ptr_array_free (priv->ranges, TRUE);
258     }
259 
260   G_OBJECT_CLASS (gucharmap_script_codepoint_list_parent_class)->finalize (object);
261 }
262 
263 static void
gucharmap_script_codepoint_list_class_init(GucharmapScriptCodepointListClass * clazz)264 gucharmap_script_codepoint_list_class_init (GucharmapScriptCodepointListClass *clazz)
265 {
266   GucharmapCodepointListClass *codepoint_list_class = GUCHARMAP_CODEPOINT_LIST_CLASS (clazz);
267   GObjectClass *gobject_class = G_OBJECT_CLASS (clazz);
268 
269   _gucharmap_intl_ensure_initialized ();
270 
271   g_type_class_add_private (codepoint_list_class, sizeof (GucharmapScriptCodepointListPrivate));
272 
273   codepoint_list_class->get_char = get_char;
274   codepoint_list_class->get_index = get_index;
275   codepoint_list_class->get_last_index = get_last_index;
276 
277   gobject_class->finalize = gucharmap_script_codepoint_list_finalize;
278 }
279 
280 static void
gucharmap_script_codepoint_list_init(GucharmapScriptCodepointList * guscl)281 gucharmap_script_codepoint_list_init (GucharmapScriptCodepointList *guscl)
282 {
283   guscl->priv = G_TYPE_INSTANCE_GET_PRIVATE (guscl, GUCHARMAP_TYPE_SCRIPT_CODEPOINT_LIST, GucharmapScriptCodepointListPrivate);
284 }
285 
286 /**
287  * gucharmap_script_codepoint_list_new:
288  *
289  * Creates a new script codepoint list. The default script is Latin.
290  *
291  * Return value: the newly-created #GucharmapCodepointList. Use
292  * g_object_unref() to free the result.
293  **/
294 GucharmapCodepointList *
gucharmap_script_codepoint_list_new(void)295 gucharmap_script_codepoint_list_new (void)
296 {
297   return GUCHARMAP_CODEPOINT_LIST (g_object_new (gucharmap_script_codepoint_list_get_type (), NULL));
298 }
299 
300 /**
301  * gucharmap_script_codepoint_list_set_script:
302  * @list: a GucharmapScriptCodepointList
303  * @script: the script name
304  *
305  * Sets the script for the codepoint list.
306  *
307  * Return value: %TRUE on success, %FALSE if there is no such script, in
308  * which case the script is not changed.
309  **/
310 gboolean
gucharmap_script_codepoint_list_set_script(GucharmapScriptCodepointList * list,const gchar * script)311 gucharmap_script_codepoint_list_set_script (GucharmapScriptCodepointList *list,
312                                             const gchar                  *script)
313 {
314   const gchar *scripts[2];
315 
316   scripts[0] = script;
317   scripts[1] = NULL;
318 
319   return gucharmap_script_codepoint_list_set_scripts (list, scripts);
320 }
321 
322 /**
323  * gucharmap_script_codepoint_list_set_scripts:
324  * @list: a GucharmapScriptCodepointList
325  * @scripts: NULL-terminated array of script names
326  *
327  * Sets multiple scripts for the codepoint list. Codepoints are sorted
328  * according to their order in @scripts.
329  *
330  * Return value: %TRUE on success, %FALSE if any of the scripts don’t
331  * exist, in which case the script is not changed.
332  **/
333 gboolean
gucharmap_script_codepoint_list_set_scripts(GucharmapScriptCodepointList * list,const gchar ** scripts)334 gucharmap_script_codepoint_list_set_scripts (GucharmapScriptCodepointList  *list,
335 	                                     const gchar                  **scripts)
336 {
337   GucharmapScriptCodepointListPrivate *priv = list->priv;
338   UnicodeRange *ranges;
339   gint i, j, size;
340 
341   if (priv->ranges)
342     clear_ranges (priv->ranges);
343   else
344     priv->ranges = g_ptr_array_new ();
345 
346   for (i = 0;  scripts[i];  i++)
347     if (get_chars_for_script (scripts[i], &ranges, &size))
348       {
349         for (j = 0;  j < size;  j++)
350           g_ptr_array_add (priv->ranges, g_memdup (ranges + j, sizeof (ranges[j])));
351         g_free (ranges);
352       }
353     else
354       {
355         g_ptr_array_free (priv->ranges, TRUE);
356         return FALSE;
357       }
358 
359   return TRUE;
360 }
361 
362 /**
363  * gucharmap_script_codepoint_list_append_script:
364  * @list: a GucharmapScriptCodepointList
365  * @script: the script name
366  *
367  * Appends the characters in @script to the codepoint list.
368  *
369  * Return value: %TRUE on success, %FALSE if there is no such script, in
370  * which case the codepoint list is not changed.
371  **/
372 gboolean
gucharmap_script_codepoint_list_append_script(GucharmapScriptCodepointList * list,const gchar * script)373 gucharmap_script_codepoint_list_append_script (GucharmapScriptCodepointList  *list,
374                                                const gchar                   *script)
375 {
376   GucharmapScriptCodepointListPrivate *priv = list->priv;
377   UnicodeRange *ranges;
378   gint j, size, index0;
379 
380   if (priv->ranges == NULL)
381     priv->ranges = g_ptr_array_new ();
382 
383   if (priv->ranges->len > 0)
384     {
385       UnicodeRange *last_range = (UnicodeRange *) priv->ranges->pdata[priv->ranges->len - 1];
386       index0 = last_range->index  + last_range->end - last_range->start + 1;
387     }
388   else
389     index0 = 0;
390 
391   if (get_chars_for_script (script, &ranges, &size))
392     {
393       for (j = 0;  j < size;  j++)
394         {
395           UnicodeRange *range = g_memdup (ranges + j, sizeof (ranges[j]));
396           range->index += index0;
397           g_ptr_array_add (priv->ranges, range);
398         }
399       g_free (ranges);
400 
401       return TRUE;
402     }
403 
404   return FALSE;
405 }
406 
407 /**
408  * gucharmap_unicode_list_scripts:
409  *
410  * Returns an array of untranslated script names.
411  *
412  * The strings in the array are owned by gucharmap and should not be
413  * modified or free; the array itself however is allocated and should
414  * be freed with g_free().
415  *
416  * Returns: (transfer container): a newly allocated %NULL-terminated array of strings
417  **/
418 const gchar **
gucharmap_unicode_list_scripts(void)419 gucharmap_unicode_list_scripts (void)
420 {
421   const char **scripts;
422   guint i;
423 
424   scripts = (const char **) g_new (char*, G_N_ELEMENTS (unicode_script_list_offsets) + 1);
425   for (i = 0; i < G_N_ELEMENTS (unicode_script_list_offsets); ++i)
426     {
427       scripts[i] = unicode_script_list_strings + unicode_script_list_offsets[i];
428     }
429   scripts[i] = NULL;
430 
431   return scripts;
432 }
433 
434 /**
435  * gucharmap_unicode_get_script_for_char:
436  * @wc: a character
437  *
438  * Return value: The English (untranslated) name of the script to which the
439  * character belongs. Characters that don't belong to an actual script
440  * return %"Unknown".
441  **/
442 const gchar *
gucharmap_unicode_get_script_for_char(gunichar wc)443 gucharmap_unicode_get_script_for_char (gunichar wc)
444 {
445   gint min = 0;
446   gint mid;
447   gint max = sizeof (unicode_scripts) / sizeof (UnicodeScript) - 1;
448 
449   if (wc > UNICHAR_MAX)
450     return NULL;
451 
452   while (max >= min)
453     {
454       mid = (min + max) / 2;
455       if (wc > unicode_scripts[mid].end)
456         min = mid + 1;
457       else if (wc < unicode_scripts[mid].start)
458         max = mid - 1;
459       else
460         return unicode_script_list_strings + unicode_script_list_offsets[unicode_scripts[mid].script_index];
461     }
462 
463   /* Unicode assigns "Unknown" as the script name for any character not
464    * specifically listed in Scripts.txt */
465   return N_("Unknown");
466 }
467