1 /*
2 * Copyright (C) 2004, 2005 Jean-Yves Lefort
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. Neither the name of Jean-Yves Lefort nor the names of its contributors
14 * may be used to endorse or promote products derived from this software
15 * without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
18 * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
19 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
20 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
22 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
24 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
26 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 #include "config.h"
33 #include <string.h>
34 #include <errno.h>
35 #include <time.h>
36 #include <stdlib.h>
37 #include <limits.h>
38 #include <glib.h>
39 #include <glib/gi18n-lib.h>
40 #include "translate.h"
41
42 #include "translate-sgml-entities-private.h"
43
44 static gunichar translate_sgml_ref_get_unichar (const char *ref);
45
46 /**
47 * translate_ascii_strcase_equal:
48 * @s1: a nul-terminated string.
49 * @s2: a nul-terminated string.
50 *
51 * Compares two strings, ignoring the case of ASCII characters of both
52 * strings, and returns %TRUE if they are equal. It can be passed to
53 * g_hash_table_new() as the @key_equal_func parameter, when using
54 * strings as case-insensitive keys in a #GHashTable.
55 *
56 * Return value: %TRUE if the two strings match.
57 **/
58 gboolean
translate_ascii_strcase_equal(gconstpointer s1,gconstpointer s2)59 translate_ascii_strcase_equal (gconstpointer s1, gconstpointer s2)
60 {
61 return g_ascii_strcasecmp(s1, s2) == 0;
62 }
63
64 /**
65 * translate_ascii_strcase_hash:
66 * @key: a string key.
67 *
68 * Converts a string to a hash value, ignoring the case of ASCII
69 * characters of the string. It can be passed to g_hash_table_new() as
70 * the @hash_func parameter, when using strings as case-insensitive
71 * keys in a #GHashTable.
72 *
73 * Return value: a hash value corresponding to the key.
74 **/
75 unsigned int
translate_ascii_strcase_hash(gconstpointer key)76 translate_ascii_strcase_hash (gconstpointer key)
77 {
78 const char *p = key;
79 unsigned int h = g_ascii_tolower(*p);
80
81 if (h)
82 for (p++; *p; p++)
83 h = (h << 5) - h + g_ascii_tolower(*p);
84
85 return h;
86 }
87
88 /**
89 * translate_ascii_strcasestr:
90 * @big: a nul-terminated string, which may not be encoded in UTF-8.
91 * @little: the nul-terminated string to search for, which may not be
92 * encoded in UTF-8.
93 *
94 * Locates the first occurrence of @little in @big, ignoring the case
95 * of ASCII characters of both strings.
96 *
97 * Return value: if @little is an empty string, @big is returned; if
98 * @little occurs nowhere in @big, %NULL is returned; otherwise a
99 * pointer to the first character of the first occurrence of @little
100 * in @big is returned.
101 **/
102 char *
translate_ascii_strcasestr(const char * big,const char * little)103 translate_ascii_strcasestr (const char *big, const char *little)
104 {
105 g_return_val_if_fail(big != NULL, NULL);
106 g_return_val_if_fail(little != NULL, NULL);
107
108 return translate_ascii_strcasestr_len(big, -1, little);
109 }
110
/**
 * translate_ascii_strcasestr_len:
 * @big: a nul-terminated string, which may not be encoded in UTF-8.
 * @big_len: length of @big in bytes, or -1 if @big is nul-terminated.
 * @little: the nul-terminated string to search for, which may not be
 * encoded in UTF-8.
 *
 * Searches at most @big_len bytes of @big for the first occurrence of
 * @little, treating upper and lower case ASCII characters as
 * identical.
 *
 * Return value: @big if @little is an empty string; %NULL if @little
 * does not occur in @big; otherwise a pointer to the first character
 * of the first occurrence of @little in @big.
 **/
char *
translate_ascii_strcasestr_len (const char *big,
                                unsigned int big_len,
                                const char *little)
{
  char *haystack;
  char *needle;
  char *hit;
  char *result = NULL;

  g_return_val_if_fail(big != NULL, NULL);
  g_return_val_if_fail(little != NULL, NULL);

  /* search lowercased copies, so that a plain strstr() can be used */
  haystack = g_ascii_strdown(big, (int) big_len);
  needle = g_ascii_strdown(little, -1);

  hit = strstr(haystack, needle);
  if (hit != NULL)
    /* translate the offset in the copy back into the caller's string */
    result = (char *) big + (hit - haystack);

  g_free(needle);
  g_free(haystack);

  return result;
}
151
/**
 * translate_ascii_strcasecoll:
 * @s1: a nul-terminated string, which may not be encoded in UTF-8.
 * @s2: a nul-terminated string, which may not be encoded in UTF-8.
 *
 * Compares two strings for ordering using the linguistically correct
 * rules for the current locale, ignoring the case of ASCII characters
 * of both strings.
 *
 * If @s1 or @s2 is %NULL, a critical warning is logged and 0 is
 * returned.
 *
 * Return value: an integer greater than, equal to, or less than 0,
 * according as @s1 is greater than, equal to, or less than @s2.
 **/
int
translate_ascii_strcasecoll (const char *s1, const char *s2)
{
  char *lower_s1;
  char *lower_s2;
  int coll;

  /*
   * Without these guards, g_ascii_strdown() would return NULL for a
   * NULL argument and that NULL would then be handed to strcoll().
   */
  g_return_val_if_fail(s1 != NULL, 0);
  g_return_val_if_fail(s2 != NULL, 0);

  /* collate lowercased copies, leaving the caller's strings untouched */
  lower_s1 = g_ascii_strdown(s1, -1);
  lower_s2 = g_ascii_strdown(s2, -1);

  coll = strcoll(lower_s1, lower_s2);

  g_free(lower_s1);
  g_free(lower_s2);

  return coll;
}
181
182 /**
183 * translate_utf8_strcasecoll:
184 * @s1: a nul-terminated string.
185 * @s2: a nul-terminated string.
186 *
187 * Compares two UTF-8 strings for ordering using the linguistically
188 * correct rules for the current locale, ignoring the case of both
189 * strings.
190 *
191 * Return value: an integer greater than, equal to, or less than 0,
192 * according as @s1 is greater than, equal to, or less than @s2.
193 **/
194 int
translate_utf8_strcasecoll(const char * s1,const char * s2)195 translate_utf8_strcasecoll (const char *s1, const char *s2)
196 {
197 char *folded_s1;
198 char *folded_s2;
199 int coll;
200
201 g_return_val_if_fail(s1 != NULL, 0);
202 g_return_val_if_fail(s2 != NULL, 0);
203
204 folded_s1 = g_utf8_casefold(s1, -1);
205 folded_s2 = g_utf8_casefold(s2, -1);
206
207 coll = g_utf8_collate(folded_s1, folded_s2);
208
209 g_free(folded_s1);
210 g_free(folded_s2);
211
212 return coll;
213 }
214
215 /**
216 * translate_utf8_strcmp:
217 * @s1: a nul-terminated string.
218 * @s2: a nul-terminated string.
219 *
220 * Compares two UTF-8 strings for ordering.
221 *
222 * Return value: an integer greater than, equal to, or less than 0,
223 * according as @s1 is greater than, equal to, or less than @s2.
224 **/
225 int
translate_utf8_strcmp(const char * s1,const char * s2)226 translate_utf8_strcmp (const char *s1, const char *s2)
227 {
228 char *normalized_s1;
229 char *normalized_s2;
230 int cmp;
231
232 g_return_val_if_fail(s1 != NULL, 0);
233 g_return_val_if_fail(s2 != NULL, 0);
234
235 normalized_s1 = g_utf8_normalize(s1, -1, G_NORMALIZE_ALL);
236 normalized_s2 = g_utf8_normalize(s2, -1, G_NORMALIZE_ALL);
237
238 cmp = strcmp(normalized_s1, normalized_s2);
239
240 g_free(normalized_s1);
241 g_free(normalized_s2);
242
243 return cmp;
244 }
245
246 /**
247 * translate_utf8_strcasecmp:
248 * @s1: a nul-terminated string.
249 * @s2: a nul-terminated string.
250 *
251 * Compares two UTF-8 strings for ordering, ignoring the case of both
252 * strings.
253 *
254 * Return value: an integer greater than, equal to, or less than 0,
255 * according as @s1 is greater than, equal to, or less than @s2.
256 **/
257 int
translate_utf8_strcasecmp(const char * s1,const char * s2)258 translate_utf8_strcasecmp (const char *s1, const char *s2)
259 {
260 char *normalized_s1;
261 char *normalized_s2;
262 char *case_normalized_s1;
263 char *case_normalized_s2;
264 int cmp;
265
266 g_return_val_if_fail(s1 != NULL, 0);
267 g_return_val_if_fail(s2 != NULL, 0);
268
269 normalized_s1 = g_utf8_normalize(s1, -1, G_NORMALIZE_ALL);
270 normalized_s2 = g_utf8_normalize(s2, -1, G_NORMALIZE_ALL);
271 case_normalized_s1 = g_utf8_casefold(normalized_s1, -1);
272 case_normalized_s2 = g_utf8_casefold(normalized_s2, -1);
273
274 cmp = strcmp(case_normalized_s1, case_normalized_s2);
275
276 g_free(normalized_s1);
277 g_free(normalized_s2);
278 g_free(case_normalized_s1);
279 g_free(case_normalized_s2);
280
281 return cmp;
282 }
283
/**
 * translate_time:
 *
 * Returns the current time, issuing a warning if an error occurs.
 *
 * Return value: the number of seconds since 0 hours, 0 minutes, 0
 * seconds, January 1, 1970, Coordinated Universal Time, or 0 if an
 * error has occurred.
 **/
time_t
translate_time (void)
{
  time_t now;

  now = time(NULL);

  /*
   * time() reports failure by returning (time_t) -1. Testing for
   * "now < 0" would never be true where time_t is unsigned.
   */
  if (now == (time_t) -1)
    {
      g_warning(_("cannot get current time: %s"), g_strerror(errno));
      now = 0;
    }

  return now;
}
307
308 static gunichar
translate_sgml_ref_get_unichar(const char * ref)309 translate_sgml_ref_get_unichar (const char *ref)
310 {
311 g_return_val_if_fail(ref != NULL, 0);
312
313 if (*ref == '#')
314 { /* numeric reference */
315 const char *nptr;
316 int base;
317
318 if (*(ref + 1) == 'x' || *(ref + 1) == 'X')
319 { /* hexadecimal number */
320 nptr = ref + 2;
321 base = 16;
322 }
323 else
324 { /* decimal number */
325 nptr = ref + 1;
326 base = 10;
327 }
328
329 if (*nptr)
330 {
331 char *end;
332 unsigned long code;
333
334 code = strtoul(nptr, &end, base);
335 if (*end == 0) /* could convert */
336 return code;
337 }
338 }
339 else
340 { /* entity reference */
341 int i;
342
343 for (i = 0; i < G_N_ELEMENTS(entities); i++)
344 if (! strcmp(ref, entities[i].name))
345 return entities[i].character;
346 }
347
348 return 0; /* invalid reference */
349 }
350
351 /**
352 * translate_sgml_ref_expand:
353 * @str: a nul-terminated string.
354 *
355 * Parses @str, expanding its SGML character references and XHTML
356 * character entities into their Unicode character value.
357 *
358 * Numerical SGML character references as well as XHTML entities are
359 * supported. Unsupported entities will be inserted verbatim into the
360 * result.
361 *
362 * Return value: the expansion of str. The returned string should be
363 * freed when no longer needed.
364 **/
365 char *
translate_sgml_ref_expand(const char * str)366 translate_sgml_ref_expand (const char *str)
367 {
368 GString *unescaped;
369 const char *start;
370
371 g_return_val_if_fail(str != NULL, NULL);
372
373 unescaped = g_string_new(NULL);
374
375 while ((start = strchr(str, '&')))
376 {
377 const char *end;
378 gunichar c;
379
380 end = strpbrk(start + 1, "; &\t\n");
381 if (! end)
382 end = strchr(start + 1, 0);
383
384 {
385 char ref[end - start];
386
387 strncpy(ref, start + 1, end - start - 1);
388 ref[end - start - 1] = 0;
389
390 c = translate_sgml_ref_get_unichar(ref);
391 }
392
393 if (*end == ';') /* semicolon is part of entity, skip it */
394 end++;
395
396 g_string_append_len(unescaped, str, start - str);
397 if (c)
398 g_string_append_unichar(unescaped, c);
399 else /* invalid reference, append it raw */
400 g_string_append_len(unescaped, start, end - start);
401
402 str = end;
403 }
404
405 g_string_append(unescaped, str);
406
407 return g_string_free(unescaped, FALSE);
408 }
409
410 /**
411 * translate_utf8_strpbrk:
412 * @p: a nul-terminated string.
413 * @len: length of @p in bytes, or -1 if @p is nul-terminated.
414 * @charset: the set of characters to search for.
415 *
416 * Locates in @p the first occurrence of any character in the string
417 * @charset.
418 *
419 * Return value: the first occurrence of any character of @charset in
420 * @p, or %NULL if no characters from @charset occur anywhere in @p.
421 **/
422 char *
translate_utf8_strpbrk(const char * p,gssize len,const char * charset)423 translate_utf8_strpbrk (const char *p, gssize len, const char *charset)
424 {
425 g_return_val_if_fail(p != NULL, NULL);
426 g_return_val_if_fail(charset != NULL, NULL);
427
428 for (; *charset; charset = g_utf8_next_char(charset))
429 {
430 char *match;
431
432 match = g_utf8_strchr(p, len, g_utf8_get_char(charset));
433 if (match)
434 return match;
435 }
436
437 return NULL;
438 }
439
440 /**
441 * translate_utf8_strrpbrk:
442 * @p: a nul-terminated string.
443 * @len: length of @p in bytes, or -1 if @p is nul-terminated.
444 * @charset: the set of characters to search for.
445 *
446 * Locates in @p the last occurrence of any character in the string
447 * @charset.
448 *
449 * Return value: the last occurrence of any character of @charset in
450 * @p, or %NULL if no characters from @charset occur anywhere in @p.
451 **/
452 char *
translate_utf8_strrpbrk(const char * p,gssize len,const char * charset)453 translate_utf8_strrpbrk (const char *p, gssize len, const char *charset)
454 {
455 g_return_val_if_fail(p != NULL, NULL);
456 g_return_val_if_fail(charset != NULL, NULL);
457
458 for (; *charset; charset = g_utf8_next_char(charset))
459 {
460 char *match;
461
462 match = g_utf8_strrchr(p, len, g_utf8_get_char(charset));
463 if (match)
464 return match;
465 }
466
467 return NULL;
468 }
469