1 /* Localization of proper names. 2 Copyright (C) 2006-2010 Free Software Foundation, Inc. 3 Written by Bruno Haible <bruno@clisp.org>, 2006. 4 5 This program is free software: you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation; either version 3 of the License, or 8 (at your option) any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 17 18 #include <config.h> 19 20 /* Specification. */ 21 #include "propername.h" 22 23 #include <ctype.h> 24 #include <stdbool.h> 25 #include <stdio.h> 26 #include <stdlib.h> 27 #include <string.h> 28 #if HAVE_ICONV 29 # include <iconv.h> 30 #endif 31 32 #include "trim.h" 33 #include "mbchar.h" 34 #include "mbuiter.h" 35 #include "localcharset.h" 36 #include "c-strcase.h" 37 #include "xstriconv.h" 38 #include "xalloc.h" 39 #include "gettext.h" 40 41 42 /* Tests whether STRING contains trim (SUB), starting and ending at word 43 boundaries. 44 Here, instead of implementing Unicode Standard Annex #29 for determining 45 word boundaries, we assume that trim (SUB) starts and ends with words and 46 only test whether the part before it ends with a non-word and the part 47 after it starts with a non-word. */ 48 static bool 49 mbsstr_trimmed_wordbounded (const char *string, const char *sub) 50 { 51 char *tsub = trim (sub); 52 bool found = false; 53 54 for (; *string != '\0';) 55 { 56 const char *tsub_in_string = mbsstr (string, tsub); 57 if (tsub_in_string == NULL) 58 break; 59 else 60 { 61 if (MB_CUR_MAX > 1) 62 { 63 mbui_iterator_t string_iter; 64 bool word_boundary_before; 65 bool word_boundary_after; 66 67 mbui_init (string_iter, string); 68 word_boundary_before = true; 69 if (mbui_cur_ptr (string_iter) < tsub_in_string) 70 { 71 mbchar_t last_char_before_tsub; 72 do 73 { 74 if (!mbui_avail (string_iter)) 75 abort (); 76 last_char_before_tsub = mbui_cur (string_iter); 77 mbui_advance (string_iter); 78 } 79 while (mbui_cur_ptr (string_iter) < tsub_in_string); 80 if (mb_isalnum (last_char_before_tsub)) 81 word_boundary_before = false; 82 } 83 84 mbui_init (string_iter, tsub_in_string); 85 { 86 mbui_iterator_t tsub_iter; 87 88 for (mbui_init (tsub_iter, tsub); 89 mbui_avail (tsub_iter); 90 mbui_advance (tsub_iter)) 91 { 92 if (!mbui_avail (string_iter)) 93 abort (); 94 mbui_advance (string_iter); 95 } 96 } 97 word_boundary_after = true; 98 if (mbui_avail (string_iter)) 99 { 100 mbchar_t first_char_after_tsub = mbui_cur (string_iter); 101 if (mb_isalnum (first_char_after_tsub)) 102 word_boundary_after = false; 103 } 104 105 if (word_boundary_before && word_boundary_after) 106 { 107 found = true; 108 break; 109 } 110 111 mbui_init (string_iter, tsub_in_string); 112 if (!mbui_avail (string_iter)) 113 break; 114 string = tsub_in_string + mb_len (mbui_cur (string_iter)); 115 } 116 else 117 { 118 bool word_boundary_before; 119 const char *p; 120 bool word_boundary_after; 121 122 word_boundary_before = true; 123 if (string < tsub_in_string) 124 if (isalnum ((unsigned char) tsub_in_string[-1])) 125 word_boundary_before = false; 126 127 p = tsub_in_string + strlen (tsub); 128 word_boundary_after = true; 129 if (*p != '\0') 130 if (isalnum ((unsigned char) *p)) 131 word_boundary_after = false; 132 133 if (word_boundary_before && word_boundary_after) 134 { 135 found = true; 136 break; 137 } 138 139 if (*tsub_in_string == '\0') 140 break; 141 string = tsub_in_string + 1; 142 } 143 } 144 } 145 free (tsub); 146 return found; 147 } 148 149 /* Return the localization of NAME. NAME is written in ASCII. */ 150 151 const char * 152 proper_name (const char *name) 153 { 154 /* See whether there is a translation. */ 155 const char *translation = gettext (name); 156 157 if (translation != name) 158 { 159 /* See whether the translation contains the original name. */ 160 if (mbsstr_trimmed_wordbounded (translation, name)) 161 return translation; 162 else 163 { 164 /* Return "TRANSLATION (NAME)". */ 165 char *result = 166 XNMALLOC (strlen (translation) + 2 + strlen (name) + 1 + 1, char); 167 168 sprintf (result, "%s (%s)", translation, name); 169 return result; 170 } 171 } 172 else 173 return name; 174 } 175 176 /* Return the localization of a name whose original writing is not ASCII. 177 NAME_UTF8 is the real name, written in UTF-8 with octal or hexadecimal 178 escape sequences. NAME_ASCII is a fallback written only with ASCII 179 characters. */ 180 181 const char * 182 proper_name_utf8 (const char *name_ascii, const char *name_utf8) 183 { 184 /* See whether there is a translation. */ 185 const char *translation = gettext (name_ascii); 186 187 /* Try to convert NAME_UTF8 to the locale encoding. */ 188 const char *locale_code = locale_charset (); 189 char *alloc_name_converted = NULL; 190 char *alloc_name_converted_translit = NULL; 191 const char *name_converted = NULL; 192 const char *name_converted_translit = NULL; 193 const char *name; 194 195 if (c_strcasecmp (locale_code, "UTF-8") != 0) 196 { 197 #if HAVE_ICONV 198 name_converted = alloc_name_converted = 199 xstr_iconv (name_utf8, "UTF-8", locale_code); 200 201 # if (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2 \ 202 || _LIBICONV_VERSION >= 0x0105 203 { 204 char *converted_translit; 205 206 size_t len = strlen (locale_code); 207 char *locale_code_translit = XNMALLOC (len + 10 + 1, char); 208 memcpy (locale_code_translit, locale_code, len); 209 memcpy (locale_code_translit + len, "//TRANSLIT", 10 + 1); 210 211 converted_translit = 212 xstr_iconv (name_utf8, "UTF-8", locale_code_translit); 213 214 free (locale_code_translit); 215 216 if (converted_translit != NULL) 217 { 218 # if !_LIBICONV_VERSION 219 /* Don't use the transliteration if it added question marks. 220 glibc's transliteration falls back to question marks; libiconv's 221 transliteration does not. 222 mbschr is equivalent to strchr in this case. */ 223 if (strchr (converted_translit, '?') != NULL) 224 free (converted_translit); 225 else 226 # endif 227 name_converted_translit = alloc_name_converted_translit = 228 converted_translit; 229 } 230 } 231 # endif 232 #endif 233 } 234 else 235 { 236 name_converted = name_utf8; 237 name_converted_translit = name_utf8; 238 } 239 240 /* The name in locale encoding. */ 241 name = (name_converted != NULL ? name_converted : 242 name_converted_translit != NULL ? name_converted_translit : 243 name_ascii); 244 245 /* See whether we have a translation. Some translators have not understood 246 that they should use the UTF-8 form of the name, if possible. So if the 247 translator provided a no-op translation, we ignore it. */ 248 if (strcmp (translation, name_ascii) != 0) 249 { 250 /* See whether the translation contains the original name. */ 251 if (mbsstr_trimmed_wordbounded (translation, name_ascii) 252 || (name_converted != NULL 253 && mbsstr_trimmed_wordbounded (translation, name_converted)) 254 || (name_converted_translit != NULL 255 && mbsstr_trimmed_wordbounded (translation, name_converted_translit))) 256 { 257 if (alloc_name_converted != NULL) 258 free (alloc_name_converted); 259 if (alloc_name_converted_translit != NULL) 260 free (alloc_name_converted_translit); 261 return translation; 262 } 263 else 264 { 265 /* Return "TRANSLATION (NAME)". */ 266 char *result = 267 XNMALLOC (strlen (translation) + 2 + strlen (name) + 1 + 1, char); 268 269 sprintf (result, "%s (%s)", translation, name); 270 271 if (alloc_name_converted != NULL) 272 free (alloc_name_converted); 273 if (alloc_name_converted_translit != NULL) 274 free (alloc_name_converted_translit); 275 return result; 276 } 277 } 278 else 279 { 280 if (alloc_name_converted != NULL && alloc_name_converted != name) 281 free (alloc_name_converted); 282 if (alloc_name_converted_translit != NULL 283 && alloc_name_converted_translit != name) 284 free (alloc_name_converted_translit); 285 return name; 286 } 287 } 288 289 #ifdef TEST1 290 # include <locale.h> 291 int 292 main (int argc, char *argv[]) 293 { 294 setlocale (LC_ALL, ""); 295 if (mbsstr_trimmed_wordbounded (argv[1], argv[2])) 296 printf("found\n"); 297 return 0; 298 } 299 #endif 300 301 #ifdef TEST2 302 # include <locale.h> 303 # include <stdio.h> 304 int 305 main (int argc, char *argv[]) 306 { 307 setlocale (LC_ALL, ""); 308 printf ("%s\n", proper_name_utf8 ("Franc,ois Pinard", "Fran\303\247ois Pinard")); 309 return 0; 310 } 311 #endif 312