xref: /dragonfly/contrib/grep/lib/propername.c (revision ed36d35d)
1 /* Localization of proper names.
2    Copyright (C) 2006-2015 Free Software Foundation, Inc.
3    Written by Bruno Haible <bruno@clisp.org>, 2006.
4 
5    This program is free software: you can redistribute it and/or modify
6    it under the terms of the GNU General Public License as published by
7    the Free Software Foundation; either version 3 of the License, or
8    (at your option) any later version.
9 
10    This program is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13    GNU General Public License for more details.
14 
15    You should have received a copy of the GNU General Public License
16    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
17 
/* Without this pragma, gcc 4.7.0 20111124 mistakenly suggests that
   the proper_name function might be a candidate for attribute 'const'.  */
20 #if (__GNUC__ == 4 && 6 <= __GNUC_MINOR__) || 4 < __GNUC__
21 # pragma GCC diagnostic ignored "-Wsuggest-attribute=const"
22 #endif
23 
24 #include <config.h>
25 
26 /* Specification.  */
27 #include "propername.h"
28 
29 #include <ctype.h>
30 #include <stdbool.h>
31 #include <stdio.h>
32 #include <stdlib.h>
33 #include <string.h>
34 #if HAVE_ICONV
35 # include <iconv.h>
36 #endif
37 
38 #include "trim.h"
39 #include "mbchar.h"
40 #include "mbuiter.h"
41 #include "localcharset.h"
42 #include "c-strcase.h"
43 #include "xstriconv.h"
44 #include "xalloc.h"
45 #include "gettext.h"
46 
47 
48 /* Tests whether STRING contains trim (SUB), starting and ending at word
49    boundaries.
50    Here, instead of implementing Unicode Standard Annex #29 for determining
51    word boundaries, we assume that trim (SUB) starts and ends with words and
52    only test whether the part before it ends with a non-word and the part
53    after it starts with a non-word.  */
54 static bool
55 mbsstr_trimmed_wordbounded (const char *string, const char *sub)
56 {
57   char *tsub = trim (sub);
58   bool found = false;
59 
60   for (; *string != '\0';)
61     {
62       const char *tsub_in_string = mbsstr (string, tsub);
63       if (tsub_in_string == NULL)
64         break;
65       else
66         {
67           if (MB_CUR_MAX > 1)
68             {
69               mbui_iterator_t string_iter;
70               bool word_boundary_before;
71               bool word_boundary_after;
72 
73               mbui_init (string_iter, string);
74               word_boundary_before = true;
75               if (mbui_cur_ptr (string_iter) < tsub_in_string)
76                 {
77                   mbchar_t last_char_before_tsub;
78                   do
79                     {
80                       if (!mbui_avail (string_iter))
81                         abort ();
82                       last_char_before_tsub = mbui_cur (string_iter);
83                       mbui_advance (string_iter);
84                     }
85                   while (mbui_cur_ptr (string_iter) < tsub_in_string);
86                   if (mb_isalnum (last_char_before_tsub))
87                     word_boundary_before = false;
88                 }
89 
90               mbui_init (string_iter, tsub_in_string);
91               {
92                 mbui_iterator_t tsub_iter;
93 
94                 for (mbui_init (tsub_iter, tsub);
95                      mbui_avail (tsub_iter);
96                      mbui_advance (tsub_iter))
97                   {
98                     if (!mbui_avail (string_iter))
99                       abort ();
100                     mbui_advance (string_iter);
101                   }
102               }
103               word_boundary_after = true;
104               if (mbui_avail (string_iter))
105                 {
106                   mbchar_t first_char_after_tsub = mbui_cur (string_iter);
107                   if (mb_isalnum (first_char_after_tsub))
108                     word_boundary_after = false;
109                 }
110 
111               if (word_boundary_before && word_boundary_after)
112                 {
113                   found = true;
114                   break;
115                 }
116 
117               mbui_init (string_iter, tsub_in_string);
118               if (!mbui_avail (string_iter))
119                 break;
120               string = tsub_in_string + mb_len (mbui_cur (string_iter));
121             }
122           else
123             {
124               bool word_boundary_before;
125               const char *p;
126               bool word_boundary_after;
127 
128               word_boundary_before = true;
129               if (string < tsub_in_string)
130                 if (isalnum ((unsigned char) tsub_in_string[-1]))
131                   word_boundary_before = false;
132 
133               p = tsub_in_string + strlen (tsub);
134               word_boundary_after = true;
135               if (*p != '\0')
136                 if (isalnum ((unsigned char) *p))
137                   word_boundary_after = false;
138 
139               if (word_boundary_before && word_boundary_after)
140                 {
141                   found = true;
142                   break;
143                 }
144 
145               if (*tsub_in_string == '\0')
146                 break;
147               string = tsub_in_string + 1;
148             }
149         }
150     }
151   free (tsub);
152   return found;
153 }
154 
155 /* Return the localization of NAME.  NAME is written in ASCII.  */
156 
157 const char *
158 proper_name (const char *name)
159 {
160   /* See whether there is a translation.   */
161   const char *translation = gettext (name);
162 
163   if (translation != name)
164     {
165       /* See whether the translation contains the original name.  */
166       if (mbsstr_trimmed_wordbounded (translation, name))
167         return translation;
168       else
169         {
170           /* Return "TRANSLATION (NAME)".  */
171           char *result =
172             XNMALLOC (strlen (translation) + 2 + strlen (name) + 1 + 1, char);
173 
174           sprintf (result, "%s (%s)", translation, name);
175           return result;
176         }
177     }
178   else
179     return name;
180 }
181 
182 /* Return the localization of a name whose original writing is not ASCII.
183    NAME_UTF8 is the real name, written in UTF-8 with octal or hexadecimal
184    escape sequences.  NAME_ASCII is a fallback written only with ASCII
185    characters.  */
186 
187 const char *
188 proper_name_utf8 (const char *name_ascii, const char *name_utf8)
189 {
190   /* See whether there is a translation.   */
191   const char *translation = gettext (name_ascii);
192 
193   /* Try to convert NAME_UTF8 to the locale encoding.  */
194   const char *locale_code = locale_charset ();
195   char *alloc_name_converted = NULL;
196   char *alloc_name_converted_translit = NULL;
197   const char *name_converted = NULL;
198   const char *name_converted_translit = NULL;
199   const char *name;
200 
201   if (c_strcasecmp (locale_code, "UTF-8") != 0)
202     {
203 #if HAVE_ICONV
204       name_converted = alloc_name_converted =
205         xstr_iconv (name_utf8, "UTF-8", locale_code);
206 
207 # if (((__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) || __GLIBC__ > 2) \
208       && !defined __UCLIBC__) \
209      || _LIBICONV_VERSION >= 0x0105
210       {
211         char *converted_translit;
212 
213         size_t len = strlen (locale_code);
214         char *locale_code_translit = XNMALLOC (len + 10 + 1, char);
215         memcpy (locale_code_translit, locale_code, len);
216         memcpy (locale_code_translit + len, "//TRANSLIT", 10 + 1);
217 
218         converted_translit =
219           xstr_iconv (name_utf8, "UTF-8", locale_code_translit);
220 
221         free (locale_code_translit);
222 
223         if (converted_translit != NULL)
224           {
225 #  if !_LIBICONV_VERSION
226             /* Don't use the transliteration if it added question marks.
227                glibc's transliteration falls back to question marks; libiconv's
228                transliteration does not.
229                mbschr is equivalent to strchr in this case.  */
230             if (strchr (converted_translit, '?') != NULL)
231               free (converted_translit);
232             else
233 #  endif
234               name_converted_translit = alloc_name_converted_translit =
235                 converted_translit;
236           }
237       }
238 # endif
239 #endif
240     }
241   else
242     {
243       name_converted = name_utf8;
244       name_converted_translit = name_utf8;
245     }
246 
247   /* The name in locale encoding.  */
248   name = (name_converted != NULL ? name_converted :
249           name_converted_translit != NULL ? name_converted_translit :
250           name_ascii);
251 
252   /* See whether we have a translation.  Some translators have not understood
253      that they should use the UTF-8 form of the name, if possible.  So if the
254      translator provided a no-op translation, we ignore it.  */
255   if (strcmp (translation, name_ascii) != 0)
256     {
257       /* See whether the translation contains the original name.  */
258       if (mbsstr_trimmed_wordbounded (translation, name_ascii)
259           || (name_converted != NULL
260               && mbsstr_trimmed_wordbounded (translation, name_converted))
261           || (name_converted_translit != NULL
262               && mbsstr_trimmed_wordbounded (translation, name_converted_translit)))
263         {
264           if (alloc_name_converted != NULL)
265             free (alloc_name_converted);
266           if (alloc_name_converted_translit != NULL)
267             free (alloc_name_converted_translit);
268           return translation;
269         }
270       else
271         {
272           /* Return "TRANSLATION (NAME)".  */
273           char *result =
274             XNMALLOC (strlen (translation) + 2 + strlen (name) + 1 + 1, char);
275 
276           sprintf (result, "%s (%s)", translation, name);
277 
278           if (alloc_name_converted != NULL)
279             free (alloc_name_converted);
280           if (alloc_name_converted_translit != NULL)
281             free (alloc_name_converted_translit);
282           return result;
283         }
284     }
285   else
286     {
287       if (alloc_name_converted != NULL && alloc_name_converted != name)
288         free (alloc_name_converted);
289       if (alloc_name_converted_translit != NULL
290           && alloc_name_converted_translit != name)
291         free (alloc_name_converted_translit);
292       return name;
293     }
294 }
295 
#ifdef TEST1
# include <locale.h>
/* Test driver for mbsstr_trimmed_wordbounded.
   Usage: PROGRAM STRING SUB
   Prints "found" when STRING contains the trimmed SUB at word boundaries.
   Returns 0 after a search, 1 when the two arguments are missing.  */
int
main (int argc, char *argv[])
{
  /* Both operands are required; argv[1] or argv[2] would otherwise be
     NULL (or out of range) and the search would dereference it.  */
  if (argc < 3)
    {
      fprintf (stderr, "Usage: %s STRING SUB\n",
               argc > 0 && argv[0] != NULL ? argv[0] : "propername");
      return 1;
    }

  setlocale (LC_ALL, "");
  if (mbsstr_trimmed_wordbounded (argv[1], argv[2]))
    printf("found\n");
  return 0;
}
#endif
307 
#ifdef TEST2
# include <locale.h>
# include <stdio.h>
/* Test driver for proper_name_utf8: prints the localized form of a fixed
   sample name under the locale taken from the environment.  */
int
main (int argc, char *argv[])
{
  const char *localized;

  /* Select the user's locale before any lookup or conversion happens.  */
  setlocale (LC_ALL, "");

  localized =
    proper_name_utf8 ("Franc,ois Pinard", "Fran\303\247ois Pinard");
  printf ("%s\n", localized);
  return 0;
}
#endif
319