1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
2 /*
3 * Copyright (C) 2013 Intel Corporation
4 *
5 * This library is free software: you can redistribute it and/or modify it
6 * under the terms of the GNU Lesser General Public License as published by
7 * the Free Software Foundation.
8 *
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
11 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
12 * for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public License
15 * along with this library. If not, see <http://www.gnu.org/licenses/>.
16 *
17 * Authors: Tristan Van Berkom <tristanvb@openismus.com>
18 */
19
20 /**
21 * SECTION: e-collator
22 * @include: libedataserver/libedataserver.h
23 * @short_description: Collation services for locale sensitive sorting
24 *
25 * The #ECollator is a wrapper object around ICU collation services and
26 * provides features to sort words in locale specific ways. The collator
27 * also provides some API for determining features of the active alphabet
28 * in the user's locale, and which words should be sorted under which
29 * letter in the user's alphabet.
30 */
31
32 #include "evolution-data-server-config.h"
33
34 #include <stdio.h>
35 #include <string.h>
36
37 /* ICU includes */
38 #include <unicode/uclean.h>
39 #include <unicode/ucol.h>
40 #include <unicode/ustring.h>
41
42 #include "e-collator.h"
43 #include "e-alphabet-index-private.h"
44 #include "e-transliterator-private.h"
45
/* Sizes of the on-stack buffers that are tried before falling back to the heap */
#define CONVERT_BUFFER_LEN        512   /* UChar units for UTF-8 -> UChar conversion */
#define COLLATION_KEY_BUFFER_LEN  1024  /* bytes for an ICU sort key */
#define LOCALE_BUFFER_LEN         256   /* bytes for a canonicalized locale name */

/* Set to 1 to compile in print_available_locales() and call it on collator creation */
#define ENABLE_DEBUGGING 0
51
52 G_DEFINE_QUARK (e-collator-error-quark, e_collator_error)
53
54 G_DEFINE_BOXED_TYPE (ECollator,
55 e_collator,
56 e_collator_ref,
57 e_collator_unref)
58
59 struct _ECollator
60 {
61 UCollator *coll;
62 volatile gint ref_count;
63
64 EAlphabetIndex *alpha_index;
65 gchar **labels;
66 gint n_labels;
67 gint underflow;
68 gint inflow;
69 gint overflow;
70
71 ETransliterator *transliterator;
72 };
73
74 /*****************************************************
75 * ICU Helper Functions *
76 *****************************************************/
#if ENABLE_DEBUGGING
/* Debugging aid: prints every locale ICU reports as available to stderr,
 * together with its display name and the keywords it supports.
 */
static void
print_available_locales (void)
{
	UErrorCode status = U_ZERO_ERROR;
	gint count, i;

	u_init (&status);
	if (U_FAILURE (status)) {
		g_printerr ("Failed to initialize ICU (%s)\n", u_errorName (status));
		return;
	}

	g_printerr ("List of available locales (default locale is: %s)\n", uloc_getDefault ());

	count = uloc_countAvailable ();
	for (i = 0; i < count; i++) {
		const gchar *locale = uloc_getAvailable (i);
		/* Both buffers are zero filled and offered one element short, so
		 * they stay NUL terminated even when the name gets truncated
		 */
		UChar result[100] = { 0 };
		gchar printable[100 * 4] = { 0 };
		UEnumeration *keywords;
		const gchar *keyword;

		/* ICU functions return immediately when handed a failed status,
		 * so every locale has to start from a clean one
		 */
		status = U_ZERO_ERROR;
		uloc_getDisplayName (locale, NULL, result, G_N_ELEMENTS (result) - 1, &status);

		u_austrncpy (printable, result, sizeof (printable) - 1);

		/* print result */
		g_printerr ("\t%s - %s", locale, printable);

		status = U_ZERO_ERROR;
		keywords = uloc_openKeywords (locale, &status);
		if (keywords) {
			UErrorCode kstatus = U_ZERO_ERROR;

			g_printerr ("[");

			while ((keyword = uenum_next (keywords, NULL, &kstatus)) != NULL)
				g_printerr (" %s ", keyword);

			g_printerr ("]");

			uenum_close (keywords);
		}
		g_printerr ("\n");
	}
}
#endif
119
120 static gchar *
canonicalize_locale(const gchar * posix_locale,gchar ** language_code,gchar ** country_code,GError ** error)121 canonicalize_locale (const gchar *posix_locale,
122 gchar **language_code,
123 gchar **country_code,
124 GError **error)
125 {
126 UErrorCode status = U_ZERO_ERROR;
127 gchar locale_buffer[LOCALE_BUFFER_LEN];
128 gchar language_buffer[8];
129 gchar country_buffer[8];
130 gchar *icu_locale;
131 gchar *final_locale;
132 gint len;
133 const gchar *collation_type = NULL;
134
135 if (posix_locale && (
136 g_ascii_strcasecmp (posix_locale, "C") == 0 ||
137 g_ascii_strcasecmp (posix_locale, "POSIX") == 0))
138 posix_locale = "en_US_POSIX";
139
140 len = uloc_canonicalize (posix_locale, locale_buffer, LOCALE_BUFFER_LEN, &status);
141
142 if (U_FAILURE (status)) {
143 g_set_error (
144 error, E_COLLATOR_ERROR,
145 E_COLLATOR_ERROR_INVALID_LOCALE,
146 "Failed to interpret locale '%s' (%s)",
147 posix_locale,
148 u_errorName (status));
149 return NULL;
150 }
151
152 if (len > LOCALE_BUFFER_LEN) {
153 icu_locale = g_malloc (len);
154
155 uloc_canonicalize (posix_locale, icu_locale, len, &status);
156 } else {
157 icu_locale = g_strndup (locale_buffer, len);
158 }
159
160 status = U_ZERO_ERROR;
161 len = uloc_getLanguage (icu_locale, language_buffer, 8, &status);
162 if (U_FAILURE (status)) {
163 g_set_error (
164 error, E_COLLATOR_ERROR,
165 E_COLLATOR_ERROR_INVALID_LOCALE,
166 "Failed to interpret language for locale '%s': %s",
167 icu_locale,
168 u_errorName (status));
169 g_free (icu_locale);
170 return NULL;
171 }
172
173 status = U_ZERO_ERROR;
174 len = uloc_getCountry (icu_locale, country_buffer, 8, &status);
175 if (U_FAILURE (status)) {
176 g_set_error (
177 error, E_COLLATOR_ERROR,
178 E_COLLATOR_ERROR_INVALID_LOCALE,
179 "Failed to interpret country for locale '%s': %s",
180 icu_locale,
181 u_errorName (status));
182 g_free (icu_locale);
183 return NULL;
184 }
185
186 /* Add 'phonebook' tailoring to certain locales */
187 if (len < 8 &&
188 (strcmp (language_buffer, "de") == 0 ||
189 strcmp (language_buffer, "fi") == 0)) {
190
191 collation_type = "phonebook";
192 }
193
194 if (collation_type != NULL)
195 final_locale = g_strconcat (icu_locale, "@collation=", collation_type, NULL);
196 else {
197 final_locale = icu_locale;
198 icu_locale = NULL;
199 }
200
201 g_free (icu_locale);
202
203 if (language_code)
204 *language_code = g_strdup (language_buffer);
205
206 if (country_code)
207 *country_code = g_strdup (country_buffer);
208
209 return final_locale;
210 }
211
212 /* All purpose character encoding function, encodes text
213 * to a UChar from UTF-8 and first ensures that the string
214 * is valid UTF-8
215 */
216 static const UChar *
convert_to_ustring(const gchar * string,UChar * buffer,gint buffer_len,gint * result_len,UChar ** free_me,GError ** error)217 convert_to_ustring (const gchar *string,
218 UChar *buffer,
219 gint buffer_len,
220 gint *result_len,
221 UChar **free_me,
222 GError **error)
223 {
224 UErrorCode status = U_ZERO_ERROR;
225 const gchar *source_utf8;
226 gchar *alloc_utf8 = NULL;
227 gint converted_len = 0;
228 UChar *converted_buffer;
229
230 /* First make sure we're dealing with utf8 */
231 if (g_utf8_validate (string, -1, NULL))
232 source_utf8 = string;
233 else {
234 alloc_utf8 = e_util_utf8_make_valid (string);
235 source_utf8 = alloc_utf8;
236 }
237
238 /* First pass, try converting to UChar in the given buffer */
239 converted_buffer = u_strFromUTF8Lenient (
240 buffer,
241 buffer_len,
242 &converted_len,
243 source_utf8,
244 -1,
245 &status);
246
247 /* Set the result length right away... */
248 *result_len = converted_len;
249
250 if (U_FAILURE (status)) {
251 converted_buffer = NULL;
252 goto out;
253 }
254
255 /* Second pass, allocate a buffer big enough and then convert */
256 if (converted_len > buffer_len) {
257 *free_me = g_new (UChar, converted_len);
258
259 converted_buffer = u_strFromUTF8Lenient (
260 *free_me,
261 converted_len,
262 NULL,
263 source_utf8,
264 -1,
265 &status);
266
267 if (U_FAILURE (status)) {
268 g_free (*free_me);
269 *free_me = NULL;
270 converted_buffer = NULL;
271 goto out;
272 }
273 }
274
275 out:
276 g_free (alloc_utf8);
277
278 if (U_FAILURE (status))
279 g_set_error (
280 error, E_COLLATOR_ERROR,
281 E_COLLATOR_ERROR_CONVERSION,
282 "Error occured while converting character encoding (%s)",
283 u_errorName (status));
284
285 return converted_buffer;
286 }
287
288 /*****************************************************
289 * API *
290 *****************************************************/
291
292 /**
293 * e_collator_new:
294 * @locale: The locale under which to sort
295 * @error: A location to store a #GError from the #E_COLLATOR_ERROR domain
296 *
297 * Creates a new #ECollator for the given @locale,
298 * the returned collator should be freed with e_collator_unref().
299 *
300 * Returns: (transfer full): A newly created #ECollator.
301 *
302 * Since: 3.12
303 */
304 ECollator *
e_collator_new(const gchar * locale,GError ** error)305 e_collator_new (const gchar *locale,
306 GError **error)
307 {
308 return e_collator_new_interpret_country (locale, NULL, error);
309 }
310
311 /**
312 * e_collator_new_interpret_country:
313 * @locale: The locale under which to sort
314 * @country_code: (optional) (out) (transfer full): A location to store the interpreted country code from @locale
315 * @error: A location to store a #GError from the #E_COLLATOR_ERROR domain
316 *
317 * Creates a new #ECollator for the given @locale,
318 * the returned collator should be freed with e_collator_unref().
319 *
320 * In addition, this also reliably interprets the country
321 * code from the @locale string and stores it to @country_code.
322 *
323 * Returns: (transfer full): A newly created #ECollator.
324 *
325 * Since: 3.12
326 */
327 ECollator *
e_collator_new_interpret_country(const gchar * locale,gchar ** country_code,GError ** error)328 e_collator_new_interpret_country (const gchar *locale,
329 gchar **country_code,
330 GError **error)
331 {
332 ECollator *collator;
333 UCollator *coll;
334 UErrorCode status = U_ZERO_ERROR;
335 gchar *icu_locale;
336 gchar *language_code = NULL;
337 gchar *local_country_code = NULL;
338
339 g_return_val_if_fail (locale && locale[0], NULL);
340
341 #if ENABLE_DEBUGGING
342 print_available_locales ();
343 #endif
344
345 icu_locale = canonicalize_locale (
346 locale,
347 &language_code,
348 &local_country_code,
349 error);
350 if (!icu_locale)
351 return NULL;
352
353 coll = ucol_open (icu_locale, &status);
354
355 if (U_FAILURE (status)) {
356 g_set_error (
357 error, E_COLLATOR_ERROR,
358 E_COLLATOR_ERROR_OPEN,
359 "Unable to open collator for locale '%s' (%s)",
360 icu_locale,
361 u_errorName (status));
362
363 g_free (language_code);
364 g_free (local_country_code);
365 g_free (icu_locale);
366 ucol_close (coll);
367 return NULL;
368 }
369
370 g_free (icu_locale);
371
372 ucol_setStrength (coll, UCOL_DEFAULT_STRENGTH);
373
374 collator = g_slice_new0 (ECollator);
375 collator->coll = coll;
376 collator->ref_count = 1;
377
378 /* In Chinese we use transliteration services to sort latin
379 * names interleaved with Chinese names in a latin AlphabeticIndex
380 */
381 if (g_strcmp0 (language_code, "zh") == 0)
382 collator->transliterator = _e_transliterator_cxx_new ("Han-Latin");
383
384 collator->alpha_index = _e_alphabet_index_cxx_new_for_language (language_code);
385 collator->labels = _e_alphabet_index_cxx_get_labels (
386 collator->alpha_index,
387 &collator->n_labels,
388 &collator->underflow,
389 &collator->inflow,
390 &collator->overflow);
391
392 g_free (language_code);
393
394 if (country_code)
395 *country_code = local_country_code;
396 else
397 g_free (local_country_code);
398
399 return collator;
400 }
401
402 /**
403 * e_collator_ref:
404 * @collator: An #ECollator
405 *
406 * Increases the reference count of @collator.
407 *
408 * Returns: (transfer full): @collator
409 *
410 * Since: 3.12
411 */
412 ECollator *
e_collator_ref(ECollator * collator)413 e_collator_ref (ECollator *collator)
414 {
415 g_return_val_if_fail (collator != NULL, NULL);
416
417 g_atomic_int_inc (&collator->ref_count);
418
419 return collator;
420 }
421
422 /**
423 * e_collator_unref:
424 * @collator: An #ECollator
425 *
426 * Decreases the reference count of @collator.
427 * If the reference count reaches 0 then the collator is freed
428 *
429 * Since: 3.12
430 */
431 void
e_collator_unref(ECollator * collator)432 e_collator_unref (ECollator *collator)
433 {
434 g_return_if_fail (collator != NULL);
435
436 if (g_atomic_int_dec_and_test (&collator->ref_count)) {
437
438 if (collator->coll)
439 ucol_close (collator->coll);
440
441 _e_alphabet_index_cxx_free (collator->alpha_index);
442 g_strfreev (collator->labels);
443
444 /* The transliterator is only used for specialized sorting in some locales,
445 * notably Chinese locales
446 */
447 if (collator->transliterator)
448 _e_transliterator_cxx_free (collator->transliterator);
449
450 g_slice_free (ECollator, collator);
451 }
452 }
453
454 /**
455 * e_collator_generate_key:
456 * @collator: An #ECollator
457 * @str: The string to generate a collation key for
458 * @error: A location to store a #GError from the #E_COLLATOR_ERROR domain
459 *
460 * Generates a collation key for @str, the result of comparing
461 * two collation keys with strcmp() will be the same result
462 * of calling e_collator_collate() on the same original strings.
463 *
464 * This function will first ensure that @str is valid UTF-8 encoded.
465 *
466 * Returns: (transfer full): A collation key for @str, or %NULL on failure with @error set.
467 *
468 * Since: 3.12
469 */
470 gchar *
e_collator_generate_key(ECollator * collator,const gchar * str,GError ** error)471 e_collator_generate_key (ECollator *collator,
472 const gchar *str,
473 GError **error)
474 {
475 UChar source_buffer[CONVERT_BUFFER_LEN];
476 UChar *free_me = NULL;
477 const UChar *source;
478 gchar stack_buffer[COLLATION_KEY_BUFFER_LEN];
479 gchar *collation_key;
480 gint key_len, source_len = 0;
481 gint alphabet_index;
482 gchar *translit_str = NULL;
483 const gchar *input_str;
484
485 g_return_val_if_fail (collator != NULL, NULL);
486 g_return_val_if_fail (str != NULL, NULL);
487
488 /* We may need to perform a conversion before generating the sort key */
489 if (collator->transliterator) {
490 translit_str = _e_transliterator_cxx_transliterate (collator->transliterator, str);
491 input_str = translit_str;
492 } else {
493 input_str = str;
494 }
495
496 source = convert_to_ustring (
497 input_str,
498 source_buffer,
499 CONVERT_BUFFER_LEN,
500 &source_len,
501 &free_me,
502 error);
503
504 if (!source) {
505 g_free (translit_str);
506 g_free (free_me);
507 return NULL;
508 }
509
510 /* Get the numerical index for this string */
511 alphabet_index = _e_alphabet_index_cxx_get_index (collator->alpha_index, input_str);
512
513 /* First try to generate a key in a predefined buffer size */
514 key_len = ucol_getSortKey (
515 collator->coll, source, source_len,
516 (guchar *) stack_buffer, COLLATION_KEY_BUFFER_LEN);
517
518 if (key_len > COLLATION_KEY_BUFFER_LEN) {
519
520 /* Stack buffer wasn't large enough, regenerate into a new buffer
521 * (add a byte for a trailing NULL char)
522 *
523 * Note we allocate 4 extra chars to hold the prefixed alphabetic
524 * index into the first 4 charachters (the 5th extra char is the trailing
525 * null character).
526 */
527 collation_key = g_malloc (key_len + 5);
528
529 /* Format the alphabetic index into the first 4 chars */
530 snprintf (collation_key, key_len, "%03d-", alphabet_index);
531
532 /* Get the sort key and put it in &collation_key[4] */
533 ucol_getSortKey (
534 collator->coll, source, source_len,
535 (guchar *)(collation_key + 4), key_len);
536
537 /* Just being paranoid, make sure we're null terminated since the API
538 * doesn't specify if the result length is null character inclusive
539 */
540 collation_key[key_len + 4] = '\0';
541 } else {
542 GString *string = g_string_new (NULL);
543
544 /* Format the alphabetic index into the first 4 chars */
545 g_string_append_printf (string, "%03d-", alphabet_index);
546
547 /* Insert the rest of the sort key from the stack buffer into the allocated buffer */
548 g_string_insert_len (string, 4, stack_buffer, key_len);
549
550 collation_key = g_string_free (string, FALSE);
551 }
552
553 g_free (free_me);
554 g_free (translit_str);
555
556 return (gchar *) collation_key;
557 }
558
559 /**
560 * e_collator_generate_key_for_index:
561 * @collator: An #ECollator
562 * @index: An index into the alphabetic labels
563 *
564 * Generates a sort key for the given alphabetic @index.
565 *
566 * The generated sort key is guaranteed to sort below
567 * any sort keys for words beginning with any variant of
568 * the given letter.
569 *
570 * For instance, a sort key generated for the index 5 of
571 * a latin alphabet, where the fifth index is 'E' will sort
572 * below any sort keys generated for words starting with
573 * the characters 'e', 'E', 'é', 'É', 'è' or 'È'. It will also
574 * sort above any sort keys generated for words starting with
575 * the characters 'd' or 'D'.
576 *
577 * Returns: (transfer full): A sort key for the given index
578 *
579 * Since: 3.12
580 */
581 gchar *
e_collator_generate_key_for_index(ECollator * collator,gint index)582 e_collator_generate_key_for_index (ECollator *collator,
583 gint index)
584 {
585 g_return_val_if_fail (collator != NULL, NULL);
586 g_return_val_if_fail (index >= 0 && index < collator->n_labels, NULL);
587
588 return g_strdup_printf ("%03d", index);
589 }
590
591 /**
592 * e_collator_collate:
593 * @collator: An #ECollator
594 * @str_a: (nullable): A string to compare
595 * @str_b: (nullable): The string to compare with @str_a
596 * @result: (out): A location to store the comparison result
597 * @error: A location to store a #GError from the #E_COLLATOR_ERROR domain
598 *
599 * Compares @str_a with @str_b, the order of strings is determined by the parameters of @collator.
600 *
601 * The @result will be set to integer less than, equal to, or greater than zero if @str_a is found,
602 * respectively, to be less than, to match, or be greater than @str_b.
603 *
604 * Either @str_a or @str_b can be %NULL, %NULL strings are considered to sort below other strings.
605 *
606 * This function will first ensure that both strings are valid UTF-8.
607 *
608 * Returns: %TRUE on success, otherwise if %FALSE is returned then @error will be set.
609 *
610 * Since: 3.12
611 */
612 gboolean
e_collator_collate(ECollator * collator,const gchar * str_a,const gchar * str_b,gint * result,GError ** error)613 e_collator_collate (ECollator *collator,
614 const gchar *str_a,
615 const gchar *str_b,
616 gint *result,
617 GError **error)
618 {
619 gchar *sort_key_a, *sort_key_b;
620
621 g_return_val_if_fail (collator != NULL, -1);
622 g_return_val_if_fail (result != NULL, -1);
623
624 if (!str_a || !str_b) {
625 *result = g_strcmp0 (str_a, str_b);
626 return TRUE;
627 }
628
629 sort_key_a = e_collator_generate_key (collator, str_a, error);
630 if (!sort_key_a)
631 return FALSE;
632
633 sort_key_b = e_collator_generate_key (collator, str_b, error);
634 if (!sort_key_b) {
635 g_free (sort_key_a);
636 return FALSE;
637 }
638
639 *result = strcmp (sort_key_a, sort_key_b);
640
641 g_free (sort_key_a);
642 g_free (sort_key_b);
643
644 return TRUE;
645 }
646
647 /**
648 * e_collator_get_index_labels:
649 * @collator: An #ECollator
650 * @n_labels: (out): The number of labels/indexes available for @collator
651 * @underflow: (optional) (out): The underflow index, for any words which sort below the active alphabet(s)
652 * @inflow: (optional) (out): The inflow index, for any words which sort between the active alphabets (if there is more than one)
653 * @overflow: (optional) (out): The overflow index, for any words which sort above the active alphabet(s)
654 *
655 * Fetches the displayable labels and index positions for the active alphabet.
656 *
657 * Returns: (array zero-terminated=1) (element-type utf8) (transfer none):
658 * The array of displayable labels for each index in the active alphabet(s).
659 *
660 * Since: 3.12
661 */
662 const gchar *const *
e_collator_get_index_labels(ECollator * collator,gint * n_labels,gint * underflow,gint * inflow,gint * overflow)663 e_collator_get_index_labels (ECollator *collator,
664 gint *n_labels,
665 gint *underflow,
666 gint *inflow,
667 gint *overflow)
668 {
669 g_return_val_if_fail (collator != NULL, NULL);
670
671 if (n_labels)
672 *n_labels = collator->n_labels;
673 if (underflow)
674 *underflow = collator->underflow;
675 if (inflow)
676 *inflow = collator->inflow;
677 if (overflow)
678 *overflow = collator->overflow;
679
680 return (const gchar *const *) collator->labels;
681 }
682
683 /**
684 * e_collator_get_index:
685 * @collator: An #ECollator
686 * @str: A string
687 *
688 * Checks which index, as determined by e_collator_get_index_labels(),
689 * that @str should sort under.
690 *
691 * Returns: The alphabetic index under which @str would sort
692 *
693 * Since: 3.12
694 */
695 gint
e_collator_get_index(ECollator * collator,const gchar * str)696 e_collator_get_index (ECollator *collator,
697 const gchar *str)
698 {
699 gint index;
700 gchar *translit_str = NULL;
701 const gchar *input_str;
702
703 g_return_val_if_fail (collator != NULL, -1);
704 g_return_val_if_fail (str != NULL, -1);
705
706 /* We may need to perform a conversion before generating the sort key */
707 if (collator->transliterator) {
708 translit_str = _e_transliterator_cxx_transliterate (collator->transliterator, str);
709 input_str = translit_str;
710 } else {
711 input_str = str;
712 }
713
714 index = _e_alphabet_index_cxx_get_index (collator->alpha_index, input_str);
715
716 g_free (translit_str);
717
718 return index;
719 }
720