1 /*
2 * Copyright © 2004 Noah Levitt
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of the GNU General Public License as published by the
6 * Free Software Foundation; either version 3 of the License, or (at your
7 * option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License along
15 * with this program; if not, write to the Free Software Foundation, Inc.,
16 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
17 */
18
19 #include <config.h>
20 #include <glib.h>
21 #include <string.h>
22
23 #include <glib/gi18n-lib.h>
24
25 #include "gucharmap.h"
26 #include "gucharmap-private.h"
27
28 #include "unicode-scripts.h"
29
/* A contiguous run of codepoints, both ends inclusive, together with the
 * position of its first codepoint in the flattened codepoint list. */
typedef struct
{
  gunichar start; /* first codepoint of the range */
  gunichar end; /* last codepoint of the range (inclusive) */
  gint index; /* index of @start in the codepoint list */
}
UnicodeRange;
37
/* Instance-private state of a GucharmapScriptCodepointList. */
struct _GucharmapScriptCodepointListPrivate
{
  /* Array of individually heap-allocated UnicodeRange pointers.
   * ensure_initialized() treats a NULL value as "no script chosen yet". */
  GPtrArray *ranges;
};
42
/* Forward declarations of the class and instance initializers that are
 * defined further down in this file. */
static void gucharmap_script_codepoint_list_class_init (GucharmapScriptCodepointListClass *klass);
static void gucharmap_script_codepoint_list_init (GucharmapScriptCodepointList *list);

/* Defines the GucharmapScriptCodepointList type with
 * GUCHARMAP_TYPE_CODEPOINT_LIST as its parent type. */
G_DEFINE_TYPE (GucharmapScriptCodepointList, gucharmap_script_codepoint_list, GUCHARMAP_TYPE_CODEPOINT_LIST)
47
48 static gint
49 find_script (const gchar *script)
50 {
51 gint min, mid, max;
52
53 min = 0;
54 max = G_N_ELEMENTS (unicode_script_list_offsets) - 1;
55
56 while (max >= min)
57 {
58 mid = (min + max) / 2;
59
60 if (strcmp (script, unicode_script_list_strings + unicode_script_list_offsets[mid]) > 0)
61 min = mid + 1;
62 else if (strcmp (script, unicode_script_list_strings + unicode_script_list_offsets[mid]) < 0)
63 max = mid - 1;
64 else
65 return mid;
66 }
67
68 return -1;
69 }
70
71 /* *ranges should be freed by caller */
72 /* adds unlisted characters to the "Unknown" script */
73 static gboolean
get_chars_for_script(const gchar * script,UnicodeRange ** ranges,gint * size)74 get_chars_for_script (const gchar *script,
75 UnicodeRange **ranges,
76 gint *size)
77 {
78 gint i, j, index;
79 gint script_index, unknown_script_index;
80 gint prev_end;
81
82 script_index = find_script (script);
83 unknown_script_index = find_script ("Unknown");
84 if (script_index == -1)
85 return FALSE;
86
87 j = 0;
88
89 if (script_index == unknown_script_index)
90 {
91 prev_end = -1;
92 for (i = 0; i < G_N_ELEMENTS (unicode_scripts); i++)
93 {
94 if (unicode_scripts[i].start > prev_end + 1)
95 j++;
96 prev_end = unicode_scripts[i].end;
97 }
98 if (unicode_scripts[i-1].end < UNICHAR_MAX)
99 j++;
100 }
101
102 for (i = 0; i < G_N_ELEMENTS (unicode_scripts); i++)
103 if (unicode_scripts[i].script_index == script_index)
104 j++;
105
106 *size = j;
107 *ranges = g_new (UnicodeRange, *size);
108
109 j = 0, index = 0, prev_end = -1;
110
111 for (i = 0; i < G_N_ELEMENTS (unicode_scripts); i++)
112 {
113 if (script_index == unknown_script_index)
114 {
115 if (unicode_scripts[i].start > prev_end + 1)
116 {
117 (*ranges)[j].start = prev_end + 1;
118 (*ranges)[j].end = unicode_scripts[i].start - 1;
119 (*ranges)[j].index = index;
120
121 index += (*ranges)[j].end - (*ranges)[j].start + 1;
122 j++;
123 }
124
125 prev_end = unicode_scripts[i].end;
126 }
127
128 if (unicode_scripts[i].script_index == script_index)
129 {
130 (*ranges)[j].start = unicode_scripts[i].start;
131 (*ranges)[j].end = unicode_scripts[i].end;
132 (*ranges)[j].index = index;
133
134 index += (*ranges)[j].end - (*ranges)[j].start + 1;
135 j++;
136 }
137 }
138
139 if (script_index == unknown_script_index)
140 {
141 if (unicode_scripts[i-1].end < UNICHAR_MAX)
142 {
143 (*ranges)[j].start = unicode_scripts[i-1].end + 1;
144 (*ranges)[j].end = UNICHAR_MAX;
145 (*ranges)[j].index = index;
146 j++;
147 }
148 }
149
150
151 g_assert (j == *size);
152
153 return TRUE;
154 }
155
156 static void
ensure_initialized(GucharmapScriptCodepointList * guscl)157 ensure_initialized (GucharmapScriptCodepointList *guscl)
158 {
159 GucharmapScriptCodepointListPrivate *priv = guscl->priv;
160 gboolean success;
161
162 if (priv->ranges != NULL)
163 return;
164
165 success = gucharmap_script_codepoint_list_set_script (guscl, "Latin");
166
167 g_assert (success);
168 }
169
170 static gunichar
get_char(GucharmapCodepointList * list,gint index)171 get_char (GucharmapCodepointList *list,
172 gint index)
173 {
174 GucharmapScriptCodepointList *guscl = GUCHARMAP_SCRIPT_CODEPOINT_LIST (list);
175 GucharmapScriptCodepointListPrivate *priv = guscl->priv;
176 gint min, mid, max;
177
178 ensure_initialized (guscl);
179
180 min = 0;
181 max = priv->ranges->len - 1;
182
183 while (max >= min)
184 {
185 UnicodeRange *range;
186
187 mid = (min + max) / 2;
188 range = (UnicodeRange *) (priv->ranges->pdata[mid]);
189
190 if (index > range->index + range->end - range->start)
191 min = mid + 1;
192 else if (index < range->index)
193 max = mid - 1;
194 else
195 return range->start + index - range->index;
196 }
197
198 return (gunichar)(-1);
199 }
200
201 /* XXX: linear search */
202 static gint
get_index(GucharmapCodepointList * list,gunichar wc)203 get_index (GucharmapCodepointList *list,
204 gunichar wc)
205 {
206 GucharmapScriptCodepointList *guscl = GUCHARMAP_SCRIPT_CODEPOINT_LIST (list);
207 GucharmapScriptCodepointListPrivate *priv = guscl->priv;
208 gint i;
209
210 ensure_initialized (guscl);
211
212 for (i = 0; i < priv->ranges->len; i++)
213 {
214 UnicodeRange *range = (UnicodeRange *) priv->ranges->pdata[i];
215 if (wc >= range->start && wc <= range->end)
216 return range->index + wc - range->start;
217 }
218
219 return -1;
220 }
221
222 static gint
get_last_index(GucharmapCodepointList * list)223 get_last_index (GucharmapCodepointList *list)
224 {
225 GucharmapScriptCodepointList *guscl = GUCHARMAP_SCRIPT_CODEPOINT_LIST (list);
226 GucharmapScriptCodepointListPrivate *priv = guscl->priv;
227 UnicodeRange *last_range;
228
229 ensure_initialized (guscl);
230
231 last_range = (UnicodeRange *) (priv->ranges->pdata[priv->ranges->len-1]);
232
233 return last_range->index + last_range->end - last_range->start;
234 }
235
236 static void
clear_ranges(GPtrArray * ranges)237 clear_ranges (GPtrArray *ranges)
238 {
239 guint i, n;
240
241 n = ranges->len;
242 for (i = 0; i < n; ++i)
243 g_free (g_ptr_array_index (ranges, i));
244
245 g_ptr_array_set_size (ranges, 0);
246 }
247
248 static void
gucharmap_script_codepoint_list_finalize(GObject * object)249 gucharmap_script_codepoint_list_finalize (GObject *object)
250 {
251 GucharmapScriptCodepointList *guscl = GUCHARMAP_SCRIPT_CODEPOINT_LIST (object);
252 GucharmapScriptCodepointListPrivate *priv = guscl->priv;
253
254 if (priv->ranges)
255 {
256 clear_ranges (priv->ranges);
257 g_ptr_array_free (priv->ranges, TRUE);
258 }
259
260 G_OBJECT_CLASS (gucharmap_script_codepoint_list_parent_class)->finalize (object);
261 }
262
263 static void
gucharmap_script_codepoint_list_class_init(GucharmapScriptCodepointListClass * clazz)264 gucharmap_script_codepoint_list_class_init (GucharmapScriptCodepointListClass *clazz)
265 {
266 GucharmapCodepointListClass *codepoint_list_class = GUCHARMAP_CODEPOINT_LIST_CLASS (clazz);
267 GObjectClass *gobject_class = G_OBJECT_CLASS (clazz);
268
269 _gucharmap_intl_ensure_initialized ();
270
271 g_type_class_add_private (codepoint_list_class, sizeof (GucharmapScriptCodepointListPrivate));
272
273 codepoint_list_class->get_char = get_char;
274 codepoint_list_class->get_index = get_index;
275 codepoint_list_class->get_last_index = get_last_index;
276
277 gobject_class->finalize = gucharmap_script_codepoint_list_finalize;
278 }
279
280 static void
gucharmap_script_codepoint_list_init(GucharmapScriptCodepointList * guscl)281 gucharmap_script_codepoint_list_init (GucharmapScriptCodepointList *guscl)
282 {
283 guscl->priv = G_TYPE_INSTANCE_GET_PRIVATE (guscl, GUCHARMAP_TYPE_SCRIPT_CODEPOINT_LIST, GucharmapScriptCodepointListPrivate);
284 }
285
286 /**
287 * gucharmap_script_codepoint_list_new:
288 *
289 * Creates a new script codepoint list. The default script is Latin.
290 *
291 * Return value: the newly-created #GucharmapCodepointList. Use
292 * g_object_unref() to free the result.
293 **/
294 GucharmapCodepointList *
gucharmap_script_codepoint_list_new(void)295 gucharmap_script_codepoint_list_new (void)
296 {
297 return GUCHARMAP_CODEPOINT_LIST (g_object_new (gucharmap_script_codepoint_list_get_type (), NULL));
298 }
299
300 /**
301 * gucharmap_script_codepoint_list_set_script:
302 * @list: a GucharmapScriptCodepointList
303 * @script: the script name
304 *
305 * Sets the script for the codepoint list.
306 *
307 * Return value: %TRUE on success, %FALSE if there is no such script, in
308 * which case the script is not changed.
309 **/
310 gboolean
gucharmap_script_codepoint_list_set_script(GucharmapScriptCodepointList * list,const gchar * script)311 gucharmap_script_codepoint_list_set_script (GucharmapScriptCodepointList *list,
312 const gchar *script)
313 {
314 const gchar *scripts[2];
315
316 scripts[0] = script;
317 scripts[1] = NULL;
318
319 return gucharmap_script_codepoint_list_set_scripts (list, scripts);
320 }
321
322 /**
323 * gucharmap_script_codepoint_list_set_scripts:
324 * @list: a GucharmapScriptCodepointList
325 * @scripts: NULL-terminated array of script names
326 *
327 * Sets multiple scripts for the codepoint list. Codepoints are sorted
328 * according to their order in @scripts.
329 *
330 * Return value: %TRUE on success, %FALSE if any of the scripts don’t
331 * exist, in which case the script is not changed.
332 **/
333 gboolean
gucharmap_script_codepoint_list_set_scripts(GucharmapScriptCodepointList * list,const gchar ** scripts)334 gucharmap_script_codepoint_list_set_scripts (GucharmapScriptCodepointList *list,
335 const gchar **scripts)
336 {
337 GucharmapScriptCodepointListPrivate *priv = list->priv;
338 UnicodeRange *ranges;
339 gint i, j, size;
340
341 if (priv->ranges)
342 clear_ranges (priv->ranges);
343 else
344 priv->ranges = g_ptr_array_new ();
345
346 for (i = 0; scripts[i]; i++)
347 if (get_chars_for_script (scripts[i], &ranges, &size))
348 {
349 for (j = 0; j < size; j++)
350 g_ptr_array_add (priv->ranges, g_memdup (ranges + j, sizeof (ranges[j])));
351 g_free (ranges);
352 }
353 else
354 {
355 g_ptr_array_free (priv->ranges, TRUE);
356 return FALSE;
357 }
358
359 return TRUE;
360 }
361
362 /**
363 * gucharmap_script_codepoint_list_append_script:
364 * @list: a GucharmapScriptCodepointList
365 * @script: the script name
366 *
367 * Appends the characters in @script to the codepoint list.
368 *
369 * Return value: %TRUE on success, %FALSE if there is no such script, in
370 * which case the codepoint list is not changed.
371 **/
372 gboolean
gucharmap_script_codepoint_list_append_script(GucharmapScriptCodepointList * list,const gchar * script)373 gucharmap_script_codepoint_list_append_script (GucharmapScriptCodepointList *list,
374 const gchar *script)
375 {
376 GucharmapScriptCodepointListPrivate *priv = list->priv;
377 UnicodeRange *ranges;
378 gint j, size, index0;
379
380 if (priv->ranges == NULL)
381 priv->ranges = g_ptr_array_new ();
382
383 if (priv->ranges->len > 0)
384 {
385 UnicodeRange *last_range = (UnicodeRange *) priv->ranges->pdata[priv->ranges->len - 1];
386 index0 = last_range->index + last_range->end - last_range->start + 1;
387 }
388 else
389 index0 = 0;
390
391 if (get_chars_for_script (script, &ranges, &size))
392 {
393 for (j = 0; j < size; j++)
394 {
395 UnicodeRange *range = g_memdup (ranges + j, sizeof (ranges[j]));
396 range->index += index0;
397 g_ptr_array_add (priv->ranges, range);
398 }
399 g_free (ranges);
400
401 return TRUE;
402 }
403
404 return FALSE;
405 }
406
407 /**
408 * gucharmap_unicode_list_scripts:
409 *
410 * Returns an array of untranslated script names.
411 *
412 * The strings in the array are owned by gucharmap and should not be
413 * modified or free; the array itself however is allocated and should
414 * be freed with g_free().
415 *
416 * Returns: (transfer container): a newly allocated %NULL-terminated array of strings
417 **/
418 const gchar **
gucharmap_unicode_list_scripts(void)419 gucharmap_unicode_list_scripts (void)
420 {
421 const char **scripts;
422 guint i;
423
424 scripts = (const char **) g_new (char*, G_N_ELEMENTS (unicode_script_list_offsets) + 1);
425 for (i = 0; i < G_N_ELEMENTS (unicode_script_list_offsets); ++i)
426 {
427 scripts[i] = unicode_script_list_strings + unicode_script_list_offsets[i];
428 }
429 scripts[i] = NULL;
430
431 return scripts;
432 }
433
434 /**
435 * gucharmap_unicode_get_script_for_char:
436 * @wc: a character
437 *
438 * Return value: The English (untranslated) name of the script to which the
439 * character belongs. Characters that don't belong to an actual script
440 * return %"Unknown".
441 **/
442 const gchar *
gucharmap_unicode_get_script_for_char(gunichar wc)443 gucharmap_unicode_get_script_for_char (gunichar wc)
444 {
445 gint min = 0;
446 gint mid;
447 gint max = sizeof (unicode_scripts) / sizeof (UnicodeScript) - 1;
448
449 if (wc > UNICHAR_MAX)
450 return NULL;
451
452 while (max >= min)
453 {
454 mid = (min + max) / 2;
455 if (wc > unicode_scripts[mid].end)
456 min = mid + 1;
457 else if (wc < unicode_scripts[mid].start)
458 max = mid - 1;
459 else
460 return unicode_script_list_strings + unicode_script_list_offsets[unicode_scripts[mid].script_index];
461 }
462
463 /* Unicode assigns "Unknown" as the script name for any character not
464 * specifically listed in Scripts.txt */
465 return N_("Unknown");
466 }
467