1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
2 /*
3  * Copyright (C) 2013 Intel Corporation
4  *
5  * This library is free software: you can redistribute it and/or modify it
6  * under the terms of the GNU Lesser General Public License as published by
7  * the Free Software Foundation.
8  *
9  * This library is distributed in the hope that it will be useful, but
10  * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
11  * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
12  * for more details.
13  *
14  * You should have received a copy of the GNU Lesser General Public License
15  * along with this library. If not, see <http://www.gnu.org/licenses/>.
16  *
17  * Authors: Tristan Van Berkom <tristanvb@openismus.com>
18  */
19 
20 /**
21  * SECTION: e-collator
22  * @include: libedataserver/libedataserver.h
23  * @short_description: Collation services for locale sensitive sorting
24  *
25  * The #ECollator is a wrapper object around ICU collation services and
26  * provides features to sort words in locale specific ways. The collator
27  * also provides some API for determining features of the active alphabet
28  * in the user's locale, and which words should be sorted under which
29  * letter in the user's alphabet.
30  */
31 
32 #include "evolution-data-server-config.h"
33 
34 #include <stdio.h>
35 #include <string.h>
36 
37 /* ICU includes */
38 #include <unicode/uclean.h>
39 #include <unicode/ucol.h>
40 #include <unicode/ustring.h>
41 
42 #include "e-collator.h"
43 #include "e-alphabet-index-private.h"
44 #include "e-transliterator-private.h"
45 
/* Number of UChar units in the stack buffer that e_collator_generate_key()
 * hands to convert_to_ustring() for the UTF-8 to UTF-16 conversion */
#define CONVERT_BUFFER_LEN        512
/* Size in bytes of the stack buffer that e_collator_generate_key() tries
 * first when asking ICU for a sort key */
#define COLLATION_KEY_BUFFER_LEN  1024
/* Size in bytes of the stack buffer receiving the canonicalized locale
 * name in canonicalize_locale() */
#define LOCALE_BUFFER_LEN         256

/* Set to 1 to compile print_available_locales() and have it called each
 * time a collator is created */
#define ENABLE_DEBUGGING 0
51 
/* Defines e_collator_error_quark(); presumably this backs the
 * E_COLLATOR_ERROR domain declared in e-collator.h (not visible here) */
G_DEFINE_QUARK (e-collator-error-quark, e_collator_error)

/* Registers ECollator as a boxed type: "copying" takes a reference with
 * e_collator_ref() and "freeing" drops one with e_collator_unref() */
G_DEFINE_BOXED_TYPE (ECollator,
		     e_collator,
		     e_collator_ref,
		     e_collator_unref)
58 
/* Instance data behind the reference counted #ECollator boxed type */
struct _ECollator
{
	/* ICU collator obtained from ucol_open(), closed when the last
	 * reference is dropped */
	UCollator       *coll;
	/* Only modified through g_atomic_int_inc() / g_atomic_int_dec_and_test() */
	volatile gint    ref_count;

	/* Alphabetic index created for the locale's language, followed by the
	 * values fetched from it once at construction time; @labels is
	 * released with g_strfreev() */
	EAlphabetIndex  *alpha_index;
	gchar          **labels;
	gint             n_labels;
	gint             underflow;
	gint             inflow;
	gint             overflow;

	/* Only created when the language is "zh", otherwise NULL */
	ETransliterator *transliterator;
};
73 
74 /*****************************************************
75  *                ICU Helper Functions               *
76  *****************************************************/
77 #if ENABLE_DEBUGGING
78 static void
print_available_locales(void)79 print_available_locales (void)
80 {
81 	UErrorCode status = U_ZERO_ERROR;
82 	UChar result[100];
83 	gchar printable[100 * 4];
84 	gint count, i;
85 
86 	u_init (&status);
87 
88 	g_printerr ("List of available locales (default locale is: %s)\n", uloc_getDefault ());
89 
90 	count = uloc_countAvailable ();
91 	for (i = 0; i < count; i++) {
92 		UEnumeration *keywords;
93 		const gchar *keyword;
94 
95 		uloc_getDisplayName (uloc_getAvailable (i), NULL, result, 100, &status);
96 
97 		u_austrncpy (printable, result, sizeof (printable));
98 
99 		/* print result */
100 		g_printerr ("\t%s - %s", uloc_getAvailable (i), printable);
101 
102 		keywords = uloc_openKeywords (uloc_getAvailable (i), &status);
103 		if (keywords) {
104 			UErrorCode kstatus = U_ZERO_ERROR;
105 
106 			g_printerr ("[");
107 
108 			while ((keyword = uenum_next (keywords, NULL, &kstatus)) != NULL)
109 				g_printerr (" %s ", keyword);
110 
111 			g_printerr ("]");
112 
113 			uenum_close (keywords);
114 		}
115 		g_printerr ("\n");
116 	}
117 }
118 #endif
119 
120 static gchar *
canonicalize_locale(const gchar * posix_locale,gchar ** language_code,gchar ** country_code,GError ** error)121 canonicalize_locale (const gchar *posix_locale,
122                      gchar **language_code,
123                      gchar **country_code,
124                      GError **error)
125 {
126 	UErrorCode status = U_ZERO_ERROR;
127 	gchar  locale_buffer[LOCALE_BUFFER_LEN];
128 	gchar  language_buffer[8];
129 	gchar  country_buffer[8];
130 	gchar *icu_locale;
131 	gchar *final_locale;
132 	gint   len;
133 	const gchar *collation_type = NULL;
134 
135 	if (posix_locale && (
136 	    g_ascii_strcasecmp (posix_locale, "C") == 0 ||
137 	    g_ascii_strcasecmp (posix_locale, "POSIX") == 0))
138 		posix_locale = "en_US_POSIX";
139 
140 	len = uloc_canonicalize (posix_locale, locale_buffer, LOCALE_BUFFER_LEN, &status);
141 
142 	if (U_FAILURE (status)) {
143 		g_set_error (
144 			error, E_COLLATOR_ERROR,
145 			E_COLLATOR_ERROR_INVALID_LOCALE,
146 			"Failed to interpret locale '%s' (%s)",
147 			posix_locale,
148 			u_errorName (status));
149 		return NULL;
150 	}
151 
152 	if (len > LOCALE_BUFFER_LEN) {
153 		icu_locale = g_malloc (len);
154 
155 		uloc_canonicalize (posix_locale, icu_locale, len, &status);
156 	} else {
157 		icu_locale = g_strndup (locale_buffer, len);
158 	}
159 
160 	status = U_ZERO_ERROR;
161 	len = uloc_getLanguage (icu_locale, language_buffer, 8, &status);
162 	if (U_FAILURE (status)) {
163 		g_set_error (
164 			error, E_COLLATOR_ERROR,
165 			E_COLLATOR_ERROR_INVALID_LOCALE,
166 			"Failed to interpret language for locale '%s': %s",
167 			icu_locale,
168 			u_errorName (status));
169 		g_free (icu_locale);
170 		return NULL;
171 	}
172 
173 	status = U_ZERO_ERROR;
174 	len = uloc_getCountry (icu_locale, country_buffer, 8, &status);
175 	if (U_FAILURE (status)) {
176 		g_set_error (
177 			error, E_COLLATOR_ERROR,
178 			E_COLLATOR_ERROR_INVALID_LOCALE,
179 			"Failed to interpret country for locale '%s': %s",
180 			icu_locale,
181 			u_errorName (status));
182 		g_free (icu_locale);
183 		return NULL;
184 	}
185 
186 	/* Add 'phonebook' tailoring to certain locales */
187 	if (len < 8 &&
188 	    (strcmp (language_buffer, "de") == 0 ||
189 	     strcmp (language_buffer, "fi") == 0)) {
190 
191 		collation_type = "phonebook";
192 	}
193 
194 	if (collation_type != NULL)
195 		final_locale = g_strconcat (icu_locale, "@collation=", collation_type, NULL);
196 	else {
197 		final_locale = icu_locale;
198 		icu_locale = NULL;
199 	}
200 
201 	g_free (icu_locale);
202 
203 	if (language_code)
204 		*language_code = g_strdup (language_buffer);
205 
206 	if (country_code)
207 		*country_code = g_strdup (country_buffer);
208 
209 	return final_locale;
210 }
211 
212 /* All purpose character encoding function, encodes text
213  * to a UChar from UTF-8 and first ensures that the string
214  * is valid UTF-8
215  */
216 static const UChar *
convert_to_ustring(const gchar * string,UChar * buffer,gint buffer_len,gint * result_len,UChar ** free_me,GError ** error)217 convert_to_ustring (const gchar *string,
218                     UChar *buffer,
219                     gint buffer_len,
220                     gint *result_len,
221                     UChar **free_me,
222                     GError **error)
223 {
224 	UErrorCode status = U_ZERO_ERROR;
225 	const gchar *source_utf8;
226 	gchar *alloc_utf8 = NULL;
227 	gint   converted_len = 0;
228 	UChar *converted_buffer;
229 
230 	/* First make sure we're dealing with utf8 */
231 	if (g_utf8_validate (string, -1, NULL))
232 		source_utf8 = string;
233 	else {
234 		alloc_utf8 = e_util_utf8_make_valid (string);
235 		source_utf8 = alloc_utf8;
236 	}
237 
238 	/* First pass, try converting to UChar in the given buffer */
239 	converted_buffer = u_strFromUTF8Lenient (
240 		buffer,
241 		buffer_len,
242 		&converted_len,
243 		source_utf8,
244 		-1,
245 		&status);
246 
247 	/* Set the result length right away... */
248 	*result_len = converted_len;
249 
250 	if (U_FAILURE (status)) {
251 		converted_buffer = NULL;
252 		goto out;
253 	}
254 
255 	/* Second pass, allocate a buffer big enough and then convert */
256 	if (converted_len > buffer_len) {
257 		*free_me = g_new (UChar, converted_len);
258 
259 		converted_buffer = u_strFromUTF8Lenient (
260 			*free_me,
261 			converted_len,
262 			NULL,
263 			source_utf8,
264 			-1,
265 			&status);
266 
267 		if (U_FAILURE (status)) {
268 			g_free (*free_me);
269 			*free_me = NULL;
270 			converted_buffer = NULL;
271 			goto out;
272 		}
273 	}
274 
275  out:
276 	g_free (alloc_utf8);
277 
278 	if (U_FAILURE (status))
279 		g_set_error (
280 			error, E_COLLATOR_ERROR,
281 			E_COLLATOR_ERROR_CONVERSION,
282 			"Error occured while converting character encoding (%s)",
283 			u_errorName (status));
284 
285 	return converted_buffer;
286 }
287 
288 /*****************************************************
289  *                        API                        *
290  *****************************************************/
291 
292 /**
293  * e_collator_new:
294  * @locale: The locale under which to sort
295  * @error: A location to store a #GError from the #E_COLLATOR_ERROR domain
296  *
297  * Creates a new #ECollator for the given @locale,
298  * the returned collator should be freed with e_collator_unref().
299  *
300  * Returns: (transfer full): A newly created #ECollator.
301  *
302  * Since: 3.12
303  */
304 ECollator *
e_collator_new(const gchar * locale,GError ** error)305 e_collator_new (const gchar *locale,
306                 GError **error)
307 {
308 	return e_collator_new_interpret_country (locale, NULL, error);
309 }
310 
311 /**
312  * e_collator_new_interpret_country:
313  * @locale: The locale under which to sort
314  * @country_code: (optional) (out) (transfer full): A location to store the interpreted country code from @locale
315  * @error: A location to store a #GError from the #E_COLLATOR_ERROR domain
316  *
317  * Creates a new #ECollator for the given @locale,
318  * the returned collator should be freed with e_collator_unref().
319  *
320  * In addition, this also reliably interprets the country
321  * code from the @locale string and stores it to @country_code.
322  *
323  * Returns: (transfer full): A newly created #ECollator.
324  *
325  * Since: 3.12
326  */
327 ECollator *
e_collator_new_interpret_country(const gchar * locale,gchar ** country_code,GError ** error)328 e_collator_new_interpret_country (const gchar *locale,
329                                   gchar **country_code,
330                                   GError **error)
331 {
332 	ECollator *collator;
333 	UCollator *coll;
334 	UErrorCode status = U_ZERO_ERROR;
335 	gchar     *icu_locale;
336 	gchar     *language_code = NULL;
337 	gchar     *local_country_code = NULL;
338 
339 	g_return_val_if_fail (locale && locale[0], NULL);
340 
341 #if ENABLE_DEBUGGING
342 	print_available_locales ();
343 #endif
344 
345 	icu_locale = canonicalize_locale (
346 		locale,
347 		&language_code,
348 		&local_country_code,
349 		error);
350 	if (!icu_locale)
351 		return NULL;
352 
353 	coll = ucol_open (icu_locale, &status);
354 
355 	if (U_FAILURE (status)) {
356 		g_set_error (
357 			error, E_COLLATOR_ERROR,
358 			E_COLLATOR_ERROR_OPEN,
359 			"Unable to open collator for locale '%s' (%s)",
360 			icu_locale,
361 			u_errorName (status));
362 
363 		g_free (language_code);
364 		g_free (local_country_code);
365 		g_free (icu_locale);
366 		ucol_close (coll);
367 		return NULL;
368 	}
369 
370 	g_free (icu_locale);
371 
372 	ucol_setStrength (coll, UCOL_DEFAULT_STRENGTH);
373 
374 	collator = g_slice_new0 (ECollator);
375 	collator->coll = coll;
376 	collator->ref_count = 1;
377 
378 	/* In Chinese we use transliteration services to sort latin
379 	 * names interleaved with Chinese names in a latin AlphabeticIndex
380 	 */
381 	if (g_strcmp0 (language_code, "zh") == 0)
382 		collator->transliterator = _e_transliterator_cxx_new ("Han-Latin");
383 
384 	collator->alpha_index = _e_alphabet_index_cxx_new_for_language (language_code);
385 	collator->labels = _e_alphabet_index_cxx_get_labels (
386 		collator->alpha_index,
387 		&collator->n_labels,
388 		&collator->underflow,
389 		&collator->inflow,
390 		&collator->overflow);
391 
392 	g_free (language_code);
393 
394 	if (country_code)
395 		*country_code = local_country_code;
396 	else
397 		g_free (local_country_code);
398 
399 	return collator;
400 }
401 
402 /**
403  * e_collator_ref:
404  * @collator: An #ECollator
405  *
406  * Increases the reference count of @collator.
407  *
408  * Returns: (transfer full): @collator
409  *
410  * Since: 3.12
411  */
412 ECollator *
e_collator_ref(ECollator * collator)413 e_collator_ref (ECollator *collator)
414 {
415 	g_return_val_if_fail (collator != NULL, NULL);
416 
417 	g_atomic_int_inc (&collator->ref_count);
418 
419 	return collator;
420 }
421 
422 /**
423  * e_collator_unref:
424  * @collator: An #ECollator
425  *
426  * Decreases the reference count of @collator.
427  * If the reference count reaches 0 then the collator is freed
428  *
429  * Since: 3.12
430  */
431 void
e_collator_unref(ECollator * collator)432 e_collator_unref (ECollator *collator)
433 {
434 	g_return_if_fail (collator != NULL);
435 
436 	if (g_atomic_int_dec_and_test (&collator->ref_count)) {
437 
438 		if (collator->coll)
439 			ucol_close (collator->coll);
440 
441 		_e_alphabet_index_cxx_free (collator->alpha_index);
442 		g_strfreev (collator->labels);
443 
444 		/* The transliterator is only used for specialized sorting in some locales,
445 		 * notably Chinese locales
446 		 */
447 		if (collator->transliterator)
448 			_e_transliterator_cxx_free (collator->transliterator);
449 
450 		g_slice_free (ECollator, collator);
451 	}
452 }
453 
454 /**
455  * e_collator_generate_key:
456  * @collator: An #ECollator
457  * @str: The string to generate a collation key for
458  * @error: A location to store a #GError from the #E_COLLATOR_ERROR domain
459  *
460  * Generates a collation key for @str, the result of comparing
461  * two collation keys with strcmp() will be the same result
462  * of calling e_collator_collate() on the same original strings.
463  *
464  * This function will first ensure that @str is valid UTF-8 encoded.
465  *
466  * Returns: (transfer full): A collation key for @str, or %NULL on failure with @error set.
467  *
468  * Since: 3.12
469  */
470 gchar *
e_collator_generate_key(ECollator * collator,const gchar * str,GError ** error)471 e_collator_generate_key (ECollator *collator,
472                          const gchar *str,
473                          GError **error)
474 {
475 	UChar  source_buffer[CONVERT_BUFFER_LEN];
476 	UChar *free_me = NULL;
477 	const UChar *source;
478 	gchar stack_buffer[COLLATION_KEY_BUFFER_LEN];
479 	gchar *collation_key;
480 	gint key_len, source_len = 0;
481 	gint alphabet_index;
482 	gchar *translit_str = NULL;
483 	const gchar *input_str;
484 
485 	g_return_val_if_fail (collator != NULL, NULL);
486 	g_return_val_if_fail (str != NULL, NULL);
487 
488 	/* We may need to perform a conversion before generating the sort key */
489 	if (collator->transliterator) {
490 		translit_str = _e_transliterator_cxx_transliterate (collator->transliterator, str);
491 		input_str = translit_str;
492 	} else {
493 		input_str = str;
494 	}
495 
496 	source = convert_to_ustring (
497 		input_str,
498 		source_buffer,
499 		CONVERT_BUFFER_LEN,
500 		&source_len,
501 		&free_me,
502 		error);
503 
504 	if (!source) {
505 		g_free (translit_str);
506 		g_free (free_me);
507 		return NULL;
508 	}
509 
510 	/* Get the numerical index for this string */
511 	alphabet_index = _e_alphabet_index_cxx_get_index (collator->alpha_index, input_str);
512 
513 	/* First try to generate a key in a predefined buffer size */
514 	key_len = ucol_getSortKey (
515 		collator->coll, source, source_len,
516 		(guchar *) stack_buffer, COLLATION_KEY_BUFFER_LEN);
517 
518 	if (key_len > COLLATION_KEY_BUFFER_LEN) {
519 
520 		/* Stack buffer wasn't large enough, regenerate into a new buffer
521 		 * (add a byte for a trailing NULL char)
522 		 *
523 		 * Note we allocate 4 extra chars to hold the prefixed alphabetic
524 		 * index into the first 4 charachters (the 5th extra char is the trailing
525 		 * null character).
526 		 */
527 		collation_key = g_malloc (key_len + 5);
528 
529 		/* Format the alphabetic index into the first 4 chars */
530 		snprintf (collation_key, key_len, "%03d-", alphabet_index);
531 
532 		/* Get the sort key and put it in &collation_key[4] */
533 		ucol_getSortKey (
534 			collator->coll, source, source_len,
535 			(guchar *)(collation_key + 4), key_len);
536 
537 		/* Just being paranoid, make sure we're null terminated since the API
538 		 * doesn't specify if the result length is null character inclusive
539 		 */
540 		collation_key[key_len + 4] = '\0';
541 	} else {
542 		GString *string = g_string_new (NULL);
543 
544 		/* Format the alphabetic index into the first 4 chars */
545 		g_string_append_printf (string, "%03d-", alphabet_index);
546 
547 		/* Insert the rest of the sort key from the stack buffer into the allocated buffer */
548 		g_string_insert_len (string, 4, stack_buffer, key_len);
549 
550 		collation_key = g_string_free (string, FALSE);
551 	}
552 
553 	g_free (free_me);
554 	g_free (translit_str);
555 
556 	return (gchar *) collation_key;
557 }
558 
559 /**
560  * e_collator_generate_key_for_index:
561  * @collator: An #ECollator
562  * @index: An index into the alphabetic labels
563  *
564  * Generates a sort key for the given alphabetic @index.
565  *
566  * The generated sort key is guaranteed to sort below
567  * any sort keys for words beginning with any variant of
568  * the given letter.
569  *
570  * For instance, a sort key generated for the index 5 of
571  * a latin alphabet, where the fifth index is 'E' will sort
572  * below any sort keys generated for words starting with
573  * the characters 'e', 'E', 'é', 'É', 'è' or 'È'. It will also
574  * sort above any sort keys generated for words starting with
575  * the characters 'd' or 'D'.
576  *
577  * Returns: (transfer full): A sort key for the given index
578  *
579  * Since: 3.12
580  */
581 gchar *
e_collator_generate_key_for_index(ECollator * collator,gint index)582 e_collator_generate_key_for_index (ECollator *collator,
583                                    gint index)
584 {
585 	g_return_val_if_fail (collator != NULL, NULL);
586 	g_return_val_if_fail (index >= 0 && index < collator->n_labels, NULL);
587 
588 	return g_strdup_printf ("%03d", index);
589 }
590 
591 /**
592  * e_collator_collate:
593  * @collator: An #ECollator
594  * @str_a: (nullable): A string to compare
595  * @str_b: (nullable): The string to compare with @str_a
596  * @result: (out): A location to store the comparison result
597  * @error: A location to store a #GError from the #E_COLLATOR_ERROR domain
598  *
599  * Compares @str_a with @str_b, the order of strings is determined by the parameters of @collator.
600  *
601  * The @result will be set to integer less than, equal to, or greater than zero if @str_a is found,
602  * respectively, to be less than, to match, or be greater than @str_b.
603  *
604  * Either @str_a or @str_b can be %NULL, %NULL strings are considered to sort below other strings.
605  *
606  * This function will first ensure that both strings are valid UTF-8.
607  *
608  * Returns: %TRUE on success, otherwise if %FALSE is returned then @error will be set.
609  *
610  * Since: 3.12
611  */
612 gboolean
e_collator_collate(ECollator * collator,const gchar * str_a,const gchar * str_b,gint * result,GError ** error)613 e_collator_collate (ECollator *collator,
614                     const gchar *str_a,
615                     const gchar *str_b,
616                     gint *result,
617                     GError **error)
618 {
619 	gchar *sort_key_a, *sort_key_b;
620 
621 	g_return_val_if_fail (collator != NULL, -1);
622 	g_return_val_if_fail (result != NULL, -1);
623 
624 	if (!str_a || !str_b) {
625 		*result = g_strcmp0 (str_a, str_b);
626 		return TRUE;
627 	}
628 
629 	sort_key_a = e_collator_generate_key (collator, str_a, error);
630 	if (!sort_key_a)
631 		return FALSE;
632 
633 	sort_key_b = e_collator_generate_key (collator, str_b, error);
634 	if (!sort_key_b) {
635 		g_free (sort_key_a);
636 		return FALSE;
637 	}
638 
639 	*result = strcmp (sort_key_a, sort_key_b);
640 
641 	g_free (sort_key_a);
642 	g_free (sort_key_b);
643 
644 	return TRUE;
645 }
646 
647 /**
648  * e_collator_get_index_labels:
649  * @collator: An #ECollator
650  * @n_labels: (out): The number of labels/indexes available for @collator
651  * @underflow: (optional) (out): The underflow index, for any words which sort below the active alphabet(s)
652  * @inflow: (optional) (out): The inflow index, for any words which sort between the active alphabets (if there is more than one)
653  * @overflow: (optional) (out): The overflow index, for any words which sort above the active alphabet(s)
654  *
655  * Fetches the displayable labels and index positions for the active alphabet.
656  *
657  * Returns: (array zero-terminated=1) (element-type utf8) (transfer none):
658  *   The array of displayable labels for each index in the active alphabet(s).
659  *
660  * Since: 3.12
661  */
662 const gchar *const  *
e_collator_get_index_labels(ECollator * collator,gint * n_labels,gint * underflow,gint * inflow,gint * overflow)663 e_collator_get_index_labels (ECollator *collator,
664                              gint *n_labels,
665                              gint *underflow,
666                              gint *inflow,
667                              gint *overflow)
668 {
669 	g_return_val_if_fail (collator != NULL, NULL);
670 
671 	if (n_labels)
672 		*n_labels = collator->n_labels;
673 	if (underflow)
674 		*underflow = collator->underflow;
675 	if (inflow)
676 		*inflow = collator->inflow;
677 	if (overflow)
678 		*overflow = collator->overflow;
679 
680 	return (const gchar *const  *) collator->labels;
681 }
682 
683 /**
684  * e_collator_get_index:
685  * @collator: An #ECollator
686  * @str: A string
687  *
688  * Checks which index, as determined by e_collator_get_index_labels(),
689  * that @str should sort under.
690  *
691  * Returns: The alphabetic index under which @str would sort
692  *
693  * Since: 3.12
694  */
695 gint
e_collator_get_index(ECollator * collator,const gchar * str)696 e_collator_get_index (ECollator *collator,
697                       const gchar *str)
698 {
699 	gint index;
700 	gchar *translit_str = NULL;
701 	const gchar *input_str;
702 
703 	g_return_val_if_fail (collator != NULL, -1);
704 	g_return_val_if_fail (str != NULL, -1);
705 
706 	/* We may need to perform a conversion before generating the sort key */
707 	if (collator->transliterator) {
708 		translit_str = _e_transliterator_cxx_transliterate (collator->transliterator, str);
709 		input_str = translit_str;
710 	} else {
711 		input_str = str;
712 	}
713 
714 	index = _e_alphabet_index_cxx_get_index (collator->alpha_index, input_str);
715 
716 	g_free (translit_str);
717 
718 	return index;
719 }
720