1 /*
2  * Copyright (C) 2004, 2005 Jean-Yves Lefort
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. Neither the name of Jean-Yves Lefort nor the names of its contributors
14  *    may be used to endorse or promote products derived from this software
15  *    without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
18  * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
19  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
20  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
22  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
24  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
26  * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 #include "config.h"
33 #include <string.h>
34 #include <errno.h>
35 #include <time.h>
36 #include <stdlib.h>
37 #include <limits.h>
38 #include <glib.h>
39 #include <glib/gi18n-lib.h>
40 #include "translate.h"
41 
42 #include "translate-sgml-entities-private.h"
43 
44 static gunichar translate_sgml_ref_get_unichar (const char *ref);
45 
46 /**
47  * translate_ascii_strcase_equal:
48  * @s1: a nul-terminated string.
49  * @s2: a nul-terminated string.
50  *
51  * Compares two strings, ignoring the case of ASCII characters of both
52  * strings, and returns %TRUE if they are equal. It can be passed to
53  * g_hash_table_new() as the @key_equal_func parameter, when using
54  * strings as case-insensitive keys in a #GHashTable.
55  *
56  * Return value: %TRUE if the two strings match.
57  **/
58 gboolean
translate_ascii_strcase_equal(gconstpointer s1,gconstpointer s2)59 translate_ascii_strcase_equal (gconstpointer s1, gconstpointer s2)
60 {
61   return g_ascii_strcasecmp(s1, s2) == 0;
62 }
63 
64 /**
65  * translate_ascii_strcase_hash:
66  * @key: a string key.
67  *
68  * Converts a string to a hash value, ignoring the case of ASCII
69  * characters of the string. It can be passed to g_hash_table_new() as
70  * the @hash_func parameter, when using strings as case-insensitive
71  * keys in a #GHashTable.
72  *
73  * Return value: a hash value corresponding to the key.
74  **/
75 unsigned int
translate_ascii_strcase_hash(gconstpointer key)76 translate_ascii_strcase_hash (gconstpointer key)
77 {
78   const char *p = key;
79   unsigned int h = g_ascii_tolower(*p);
80 
81   if (h)
82     for (p++; *p; p++)
83       h = (h << 5) - h + g_ascii_tolower(*p);
84 
85   return h;
86 }
87 
88 /**
89  * translate_ascii_strcasestr:
90  * @big: a nul-terminated string, which may not be encoded in UTF-8.
91  * @little: the nul-terminated string to search for, which may not be
92  * encoded in UTF-8.
93  *
94  * Locates the first occurrence of @little in @big, ignoring the case
95  * of ASCII characters of both strings.
96  *
97  * Return value: if @little is an empty string, @big is returned; if
98  * @little occurs nowhere in @big, %NULL is returned; otherwise a
99  * pointer to the first character of the first occurrence of @little
100  * in @big is returned.
101  **/
102 char *
translate_ascii_strcasestr(const char * big,const char * little)103 translate_ascii_strcasestr (const char *big, const char *little)
104 {
105   g_return_val_if_fail(big != NULL, NULL);
106   g_return_val_if_fail(little != NULL, NULL);
107 
108   return translate_ascii_strcasestr_len(big, -1, little);
109 }
110 
/**
 * translate_ascii_strcasestr_len:
 * @big: the string to search in, which need not be encoded in UTF-8.
 * @big_len: length of @big in bytes, or -1 if @big is nul-terminated.
 * @little: the nul-terminated string to search for, which need not be
 * encoded in UTF-8.
 *
 * Finds the first occurrence of @little within the first @big_len
 * bytes of @big, treating ASCII letters that differ only in case as
 * equal.
 *
 * The search is performed on ASCII-lowercased copies of both strings;
 * the offset of the match in the copy is then applied to @big.
 *
 * Return value: @big if @little is an empty string; %NULL if @little
 * does not occur in @big (or if an argument is %NULL); otherwise a
 * pointer to the first character of the first occurrence of @little
 * in @big.
 **/
char *
translate_ascii_strcasestr_len (const char *big,
                                unsigned int big_len,
                                const char *little)
{
  char *haystack;
  char *needle;
  char *hit;
  char *result = NULL;

  g_return_val_if_fail(big != NULL, NULL);
  g_return_val_if_fail(little != NULL, NULL);

  needle = g_ascii_strdown(little, -1);
  haystack = g_ascii_strdown(big, (int) big_len);

  hit = strstr(haystack, needle);
  if (hit != NULL)
    /* translate the position in the lowercased copy back into big */
    result = (char *) big + (hit - haystack);

  g_free(needle);
  g_free(haystack);

  return result;
}
151 
/**
 * translate_ascii_strcasecoll:
 * @s1: a nul-terminated string, which need not be encoded in UTF-8.
 * @s2: a nul-terminated string, which need not be encoded in UTF-8.
 *
 * Compares two strings for ordering using the collation rules of the
 * current locale (strcoll()), ignoring the case of ASCII characters
 * of both strings.
 *
 * Return value: an integer greater than, equal to, or less than 0,
 * according as @s1 is greater than, equal to, or less than @s2. If
 * either argument is %NULL, a critical warning is issued and 0 is
 * returned.
 **/
int
translate_ascii_strcasecoll (const char *s1, const char *s2)
{
  char *lower_s1;
  char *lower_s2;
  int coll;

  /*
   * Without these guards a NULL argument makes g_ascii_strdown()
   * return NULL, which strcoll() would then dereference.
   */
  g_return_val_if_fail(s1 != NULL, 0);
  g_return_val_if_fail(s2 != NULL, 0);

  lower_s1 = g_ascii_strdown(s1, -1);
  lower_s2 = g_ascii_strdown(s2, -1);

  coll = strcoll(lower_s1, lower_s2);

  g_free(lower_s1);
  g_free(lower_s2);

  return coll;
}
181 
182 /**
183  * translate_utf8_strcasecoll:
184  * @s1: a nul-terminated string.
185  * @s2: a nul-terminated string.
186  *
187  * Compares two UTF-8 strings for ordering using the linguistically
188  * correct rules for the current locale, ignoring the case of both
189  * strings.
190  *
191  * Return value: an integer greater than, equal to, or less than 0,
192  * according as @s1 is greater than, equal to, or less than @s2.
193  **/
194 int
translate_utf8_strcasecoll(const char * s1,const char * s2)195 translate_utf8_strcasecoll (const char *s1, const char *s2)
196 {
197   char *folded_s1;
198   char *folded_s2;
199   int coll;
200 
201   g_return_val_if_fail(s1 != NULL, 0);
202   g_return_val_if_fail(s2 != NULL, 0);
203 
204   folded_s1 = g_utf8_casefold(s1, -1);
205   folded_s2 = g_utf8_casefold(s2, -1);
206 
207   coll = g_utf8_collate(folded_s1, folded_s2);
208 
209   g_free(folded_s1);
210   g_free(folded_s2);
211 
212   return coll;
213 }
214 
215 /**
216  * translate_utf8_strcmp:
217  * @s1: a nul-terminated string.
218  * @s2: a nul-terminated string.
219  *
220  * Compares two UTF-8 strings for ordering.
221  *
222  * Return value: an integer greater than, equal to, or less than 0,
223  * according as @s1 is greater than, equal to, or less than @s2.
224  **/
225 int
translate_utf8_strcmp(const char * s1,const char * s2)226 translate_utf8_strcmp (const char *s1, const char *s2)
227 {
228   char *normalized_s1;
229   char *normalized_s2;
230   int cmp;
231 
232   g_return_val_if_fail(s1 != NULL, 0);
233   g_return_val_if_fail(s2 != NULL, 0);
234 
235   normalized_s1 = g_utf8_normalize(s1, -1, G_NORMALIZE_ALL);
236   normalized_s2 = g_utf8_normalize(s2, -1, G_NORMALIZE_ALL);
237 
238   cmp = strcmp(normalized_s1, normalized_s2);
239 
240   g_free(normalized_s1);
241   g_free(normalized_s2);
242 
243   return cmp;
244 }
245 
246 /**
247  * translate_utf8_strcasecmp:
248  * @s1: a nul-terminated string.
249  * @s2: a nul-terminated string.
250  *
251  * Compares two UTF-8 strings for ordering, ignoring the case of both
252  * strings.
253  *
254  * Return value: an integer greater than, equal to, or less than 0,
255  * according as @s1 is greater than, equal to, or less than @s2.
256  **/
257 int
translate_utf8_strcasecmp(const char * s1,const char * s2)258 translate_utf8_strcasecmp (const char *s1, const char *s2)
259 {
260   char *normalized_s1;
261   char *normalized_s2;
262   char *case_normalized_s1;
263   char *case_normalized_s2;
264   int cmp;
265 
266   g_return_val_if_fail(s1 != NULL, 0);
267   g_return_val_if_fail(s2 != NULL, 0);
268 
269   normalized_s1 = g_utf8_normalize(s1, -1, G_NORMALIZE_ALL);
270   normalized_s2 = g_utf8_normalize(s2, -1, G_NORMALIZE_ALL);
271   case_normalized_s1 = g_utf8_casefold(normalized_s1, -1);
272   case_normalized_s2 = g_utf8_casefold(normalized_s2, -1);
273 
274   cmp = strcmp(case_normalized_s1, case_normalized_s2);
275 
276   g_free(normalized_s1);
277   g_free(normalized_s2);
278   g_free(case_normalized_s1);
279   g_free(case_normalized_s2);
280 
281   return cmp;
282 }
283 
/**
 * translate_time:
 *
 * Queries the current calendar time with time(). If the value
 * obtained is negative, a warning carrying the description of the
 * current errno is logged and 0 is returned instead.
 *
 * Return value: the number of seconds since 0 hours, 0 minutes, 0
 * seconds, January 1, 1970, Coordinated Universal Time, or 0 if an
 * error has occurred.
 **/
time_t
translate_time (void)
{
  time_t now = time(NULL);

  if (now >= 0)
    return now;

  g_warning(_("cannot get current time: %s"), g_strerror(errno));

  return 0;
}
307 
308 static gunichar
translate_sgml_ref_get_unichar(const char * ref)309 translate_sgml_ref_get_unichar (const char *ref)
310 {
311   g_return_val_if_fail(ref != NULL, 0);
312 
313   if (*ref == '#')
314     {				/* numeric reference */
315       const char *nptr;
316       int base;
317 
318       if (*(ref + 1) == 'x' || *(ref + 1) == 'X')
319 	{			/* hexadecimal number */
320 	  nptr = ref + 2;
321 	  base = 16;
322 	}
323       else
324 	{			/* decimal number */
325 	  nptr = ref + 1;
326 	  base = 10;
327 	}
328 
329       if (*nptr)
330 	{
331 	  char *end;
332 	  unsigned long code;
333 
334 	  code = strtoul(nptr, &end, base);
335 	  if (*end == 0)	/* could convert */
336 	    return code;
337 	}
338     }
339   else
340     {				/* entity reference */
341       int i;
342 
343       for (i = 0; i < G_N_ELEMENTS(entities); i++)
344 	if (! strcmp(ref, entities[i].name))
345 	  return entities[i].character;
346     }
347 
348   return 0;			/* invalid reference */
349 }
350 
351 /**
352  * translate_sgml_ref_expand:
353  * @str: a nul-terminated string.
354  *
355  * Parses @str, expanding its SGML character references and XHTML
356  * character entities into their Unicode character value.
357  *
358  * Numerical SGML character references as well as XHTML entities are
359  * supported. Unsupported entities will be inserted verbatim into the
360  * result.
361  *
362  * Return value: the expansion of str. The returned string should be
363  * freed when no longer needed.
364  **/
365 char *
translate_sgml_ref_expand(const char * str)366 translate_sgml_ref_expand (const char *str)
367 {
368   GString *unescaped;
369   const char *start;
370 
371   g_return_val_if_fail(str != NULL, NULL);
372 
373   unescaped = g_string_new(NULL);
374 
375   while ((start = strchr(str, '&')))
376     {
377       const char *end;
378       gunichar c;
379 
380       end = strpbrk(start + 1, "; &\t\n");
381       if (! end)
382 	end = strchr(start + 1, 0);
383 
384       {
385 	char ref[end - start];
386 
387 	strncpy(ref, start + 1, end - start - 1);
388 	ref[end - start - 1] = 0;
389 
390 	c = translate_sgml_ref_get_unichar(ref);
391       }
392 
393       if (*end == ';')		/* semicolon is part of entity, skip it */
394 	end++;
395 
396       g_string_append_len(unescaped, str, start - str);
397       if (c)
398 	g_string_append_unichar(unescaped, c);
399       else			/* invalid reference, append it raw */
400 	g_string_append_len(unescaped, start, end - start);
401 
402       str = end;
403     }
404 
405   g_string_append(unescaped, str);
406 
407   return g_string_free(unescaped, FALSE);
408 }
409 
410 /**
411  * translate_utf8_strpbrk:
412  * @p: a nul-terminated string.
413  * @len: length of @p in bytes, or -1 if @p is nul-terminated.
414  * @charset: the set of characters to search for.
415  *
416  * Locates in @p the first occurrence of any character in the string
417  * @charset.
418  *
419  * Return value: the first occurrence of any character of @charset in
420  * @p, or %NULL if no characters from @charset occur anywhere in @p.
421  **/
422 char *
translate_utf8_strpbrk(const char * p,gssize len,const char * charset)423 translate_utf8_strpbrk (const char *p, gssize len, const char *charset)
424 {
425   g_return_val_if_fail(p != NULL, NULL);
426   g_return_val_if_fail(charset != NULL, NULL);
427 
428   for (; *charset; charset = g_utf8_next_char(charset))
429     {
430       char *match;
431 
432       match = g_utf8_strchr(p, len, g_utf8_get_char(charset));
433       if (match)
434 	return match;
435     }
436 
437   return NULL;
438 }
439 
440 /**
441  * translate_utf8_strrpbrk:
442  * @p: a nul-terminated string.
443  * @len: length of @p in bytes, or -1 if @p is nul-terminated.
444  * @charset: the set of characters to search for.
445  *
446  * Locates in @p the last occurrence of any character in the string
447  * @charset.
448  *
449  * Return value: the last occurrence of any character of @charset in
450  * @p, or %NULL if no characters from @charset occur anywhere in @p.
451  **/
452 char *
translate_utf8_strrpbrk(const char * p,gssize len,const char * charset)453 translate_utf8_strrpbrk (const char *p, gssize len, const char *charset)
454 {
455   g_return_val_if_fail(p != NULL, NULL);
456   g_return_val_if_fail(charset != NULL, NULL);
457 
458   for (; *charset; charset = g_utf8_next_char(charset))
459     {
460       char *match;
461 
462       match = g_utf8_strrchr(p, len, g_utf8_get_char(charset));
463       if (match)
464 	return match;
465     }
466 
467   return NULL;
468 }
469