/* Titlecase mapping for UTF-8/UTF-16/UTF-32 substrings (locale dependent).
   Copyright (C) 2009-2014 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2009.

   This program is free software: you can redistribute it and/or modify it
   under the terms of the GNU Lesser General Public License as published
   by the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
17 
/* Quoting the Unicode standard, section "Default Case Algorithms":
     Find the word boundaries in X according to Unicode Standard Annex #29,
     “Text Boundaries.” For each word boundary, find the first cased character
     F following the word boundary. If F exists, map F to Titlecase_Mapping(F);
     then map all characters C between F and the following word boundary to
     Lowercase_Mapping(C).  */
24 
25 UNIT *
FUNC(const UNIT * s,size_t n,casing_prefix_context_t prefix_context,casing_suffix_context_t suffix_context,const char * iso639_language,uninorm_t nf,UNIT * resultbuf,size_t * lengthp)26 FUNC (const UNIT *s, size_t n,
27       casing_prefix_context_t prefix_context,
28       casing_suffix_context_t suffix_context,
29       const char *iso639_language,
30       uninorm_t nf,
31       UNIT *resultbuf, size_t *lengthp)
32 {
33   /* The result being accumulated.  */
34   UNIT *result;
35   size_t length;
36   size_t allocated;
37   /* An array containing the word break positions.  */
38   char *wordbreaks;
39 
40   /* Initialize the accumulator.  */
41   if (nf != NULL || resultbuf == NULL)
42     {
43       result = NULL;
44       allocated = 0;
45     }
46   else
47     {
48       result = resultbuf;
49       allocated = *lengthp;
50     }
51   length = 0;
52 
53   /* Initialize the word breaks array.  */
54   if (n > 0)
55     {
56       wordbreaks = (char *) malloc (n);
57       if (wordbreaks == NULL)
58         {
59           errno = ENOMEM;
60           goto fail2;
61         }
62       U_WORDBREAKS (s, n, wordbreaks);
63     }
64   else
65     wordbreaks = NULL;
66 
67   {
68     const UNIT *s_end = s + n;
69     const char *wp = wordbreaks;
70 
71     /* When considering the string as segmented by word boundaries: For each
72        such segment:
73         - In the first part, we are searching for the first cased character.
74           In this state, in_word_first_part = true, and no conversion takes
75           place.
76         - In the second part, we are converting every character: the first
77           among these characters to title case, the other ones to lower case.
78           In this state, in_word_first_part = false.  */
79     bool in_word_first_part = true;
80 
81     /* Helper for evaluating the FINAL_SIGMA condition:
82        Last character that was not case-ignorable.  */
83     ucs4_t last_char_except_ignorable =
84       prefix_context.last_char_except_ignorable;
85 
86     /* Helper for evaluating the AFTER_SOFT_DOTTED and AFTER_I conditions:
87        Last character that was of combining class 230 ("Above") or 0.  */
88     ucs4_t last_char_normal_or_above =
89       prefix_context.last_char_normal_or_above;
90 
91     while (s < s_end)
92       {
93         /* Fetch the next character.  */
94         ucs4_t uc;
95         int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s);
96 
97         ucs4_t (*single_character_map) (ucs4_t);
98         size_t offset_in_rule; /* offset in 'struct special_casing_rule' */
99 
100         ucs4_t mapped_uc[3];
101         unsigned int mapped_count;
102 
103         if (*wp)
104           /* Crossing a word boundary.  */
105           in_word_first_part = true;
106 
107         /* Determine single_character_map, offset_in_rule.
108            There are three possibilities:
109              - uc should not be converted.
110              - uc should be titlecased.
111              - uc should be lowercased.  */
112         if (in_word_first_part)
113           {
114             if (uc_is_cased (uc))
115               {
116                 /* uc is to be titlecased.  */
117                 single_character_map = uc_totitle;
118                 offset_in_rule = offsetof (struct special_casing_rule, title[0]);
119                 in_word_first_part = false;
120               }
121             else
122               {
123                 /* uc is not converted.  */
124                 single_character_map = NULL;
125                 offset_in_rule = 0;
126               }
127           }
128         else
129           {
130             /* uc is to be lowercased.  */
131             single_character_map = uc_tolower;
132             offset_in_rule = offsetof (struct special_casing_rule, lower[0]);
133           }
134 
135         /* Actually map uc.  */
136         if (single_character_map == NULL)
137           {
138             mapped_uc[0] = uc;
139             mapped_count = 1;
140             goto found_mapping;
141           }
142 
143         if (uc < 0x10000)
144           {
145             /* Look first in the special-casing table.  */
146             char code[3];
147 
148             code[0] = (uc >> 8) & 0xff;
149             code[1] = uc & 0xff;
150 
151             for (code[2] = 0; ; code[2]++)
152               {
153                 const struct special_casing_rule *rule =
154                   gl_unicase_special_lookup (code, 3);
155 
156                 if (rule == NULL)
157                   break;
158 
159                 /* Test if the condition applies.  */
160                 /* Does the language apply?  */
161                 if (rule->language[0] == '\0'
162                     || (iso639_language != NULL
163                         && iso639_language[0] == rule->language[0]
164                         && iso639_language[1] == rule->language[1]))
165                   {
166                     /* Does the context apply?  */
167                     int context = rule->context;
168                     bool applies;
169 
170                     if (context < 0)
171                       context = - context;
172                     switch (context)
173                       {
174                       case SCC_ALWAYS:
175                         applies = true;
176                         break;
177 
178                       case SCC_FINAL_SIGMA:
179                         /* "Before" condition: preceded by a sequence
180                            consisting of a cased letter and a case-ignorable
181                            sequence.
182                            "After" condition: not followed by a sequence
183                            consisting of a case-ignorable sequence and then a
184                            cased letter.  */
185                         /* Test the "before" condition.  */
186                         applies = uc_is_cased (last_char_except_ignorable);
187                         /* Test the "after" condition.  */
188                         if (applies)
189                           {
190                             const UNIT *s2 = s + count;
191                             for (;;)
192                               {
193                                 if (s2 < s_end)
194                                   {
195                                     ucs4_t uc2;
196                                     int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
197                                     /* Our uc_is_case_ignorable function is
198                                        known to return false for all cased
199                                        characters.  So we can call
200                                        uc_is_case_ignorable first.  */
201                                     if (!uc_is_case_ignorable (uc2))
202                                       {
203                                         applies = ! uc_is_cased (uc2);
204                                         break;
205                                       }
206                                     s2 += count2;
207                                   }
208                                 else
209                                   {
210                                     applies = ! uc_is_cased (suffix_context.first_char_except_ignorable);
211                                     break;
212                                   }
213                               }
214                           }
215                         break;
216 
217                       case SCC_AFTER_SOFT_DOTTED:
218                         /* "Before" condition: There is a Soft_Dotted character
219                            before it, with no intervening character of
220                            combining class 0 or 230 (Above).  */
221                         /* Test the "before" condition.  */
222                         applies = uc_is_property_soft_dotted (last_char_normal_or_above);
223                         break;
224 
225                       case SCC_MORE_ABOVE:
226                         /* "After" condition: followed by a character of
227                            combining class 230 (Above) with no intervening
228                            character of combining class 0 or 230 (Above).  */
229                         /* Test the "after" condition.  */
230                         {
231                           const UNIT *s2 = s + count;
232                           applies = false;
233                           for (;;)
234                             {
235                               if (s2 < s_end)
236                                 {
237                                   ucs4_t uc2;
238                                   int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
239                                   int ccc = uc_combining_class (uc2);
240                                   if (ccc == UC_CCC_A)
241                                     {
242                                       applies = true;
243                                       break;
244                                     }
245                                   if (ccc == UC_CCC_NR)
246                                     break;
247                                   s2 += count2;
248                                 }
249                               else
250                                 {
251                                   applies = ((suffix_context.bits & SCC_MORE_ABOVE_MASK) != 0);
252                                   break;
253                                 }
254                             }
255                         }
256                         break;
257 
258                       case SCC_BEFORE_DOT:
259                         /* "After" condition: followed by COMBINING DOT ABOVE
260                            (U+0307). Any sequence of characters with a
261                            combining class that is neither 0 nor 230 may
262                            intervene between the current character and the
263                            combining dot above.  */
264                         /* Test the "after" condition.  */
265                         {
266                           const UNIT *s2 = s + count;
267                           applies = false;
268                           for (;;)
269                             {
270                               if (s2 < s_end)
271                                 {
272                                   ucs4_t uc2;
273                                   int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
274                                   if (uc2 == 0x0307) /* COMBINING DOT ABOVE */
275                                     {
276                                       applies = true;
277                                       break;
278                                     }
279                                   {
280                                     int ccc = uc_combining_class (uc2);
281                                     if (ccc == UC_CCC_A || ccc == UC_CCC_NR)
282                                       break;
283                                   }
284                                   s2 += count2;
285                                 }
286                               else
287                                 {
288                                   applies = ((suffix_context.bits & SCC_BEFORE_DOT_MASK) != 0);
289                                   break;
290                                 }
291                             }
292                         }
293                         break;
294 
295                       case SCC_AFTER_I:
296                         /* "Before" condition: There is an uppercase I before
297                            it, and there is no intervening character of
298                            combining class 0 or 230 (Above).  */
299                         /* Test the "before" condition.  */
300                         applies = (last_char_normal_or_above == 'I');
301                         break;
302 
303                       default:
304                         abort ();
305                       }
306                     if (rule->context < 0)
307                       applies = !applies;
308 
309                     if (applies)
310                       {
311                         /* The rule applies.
312                            Look up the mapping (0 to 3 characters).  */
313                         const unsigned short *mapped_in_rule =
314                           (const unsigned short *)((const char *)rule + offset_in_rule);
315 
316                         if (mapped_in_rule[0] == 0)
317                           mapped_count = 0;
318                         else
319                           {
320                             mapped_uc[0] = mapped_in_rule[0];
321                             if (mapped_in_rule[1] == 0)
322                               mapped_count = 1;
323                             else
324                               {
325                                 mapped_uc[1] = mapped_in_rule[1];
326                                 if (mapped_in_rule[2] == 0)
327                                   mapped_count = 2;
328                                 else
329                                   {
330                                     mapped_uc[2] = mapped_in_rule[2];
331                                     mapped_count = 3;
332                                   }
333                               }
334                           }
335                         goto found_mapping;
336                       }
337                   }
338 
339                 /* Optimization: Save a hash table lookup in the next round.  */
340                 if (!rule->has_next)
341                   break;
342               }
343           }
344 
345         /* No special-cased mapping.  So use the locale and context independent
346            mapping.  */
347         mapped_uc[0] = single_character_map (uc);
348         mapped_count = 1;
349 
350        found_mapping:
351         /* Found the mapping: uc maps to mapped_uc[0..mapped_count-1].  */
352         {
353           unsigned int i;
354 
355           for (i = 0; i < mapped_count; i++)
356             {
357               ucs4_t muc = mapped_uc[i];
358 
359               /* Append muc to the result accumulator.  */
360               if (length < allocated)
361                 {
362                   int ret = U_UCTOMB (result + length, muc, allocated - length);
363                   if (ret == -1)
364                     {
365                       errno = EINVAL;
366                       goto fail1;
367                     }
368                   if (ret >= 0)
369                     {
370                       length += ret;
371                       goto done_appending;
372                     }
373                 }
374               {
375                 size_t old_allocated = allocated;
376                 size_t new_allocated = 2 * old_allocated;
377                 if (new_allocated < 64)
378                   new_allocated = 64;
379                 if (new_allocated < old_allocated) /* integer overflow? */
380                   abort ();
381                 {
382                   UNIT *larger_result;
383                   if (result == NULL)
384                     {
385                       larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT));
386                       if (larger_result == NULL)
387                         {
388                           errno = ENOMEM;
389                           goto fail1;
390                         }
391                     }
392                   else if (result == resultbuf)
393                     {
394                       larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT));
395                       if (larger_result == NULL)
396                         {
397                           errno = ENOMEM;
398                           goto fail1;
399                         }
400                       U_CPY (larger_result, resultbuf, length);
401                     }
402                   else
403                     {
404                       larger_result =
405                         (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
406                       if (larger_result == NULL)
407                         {
408                           errno = ENOMEM;
409                           goto fail1;
410                         }
411                     }
412                   result = larger_result;
413                   allocated = new_allocated;
414                   {
415                     int ret = U_UCTOMB (result + length, muc, allocated - length);
416                     if (ret == -1)
417                       {
418                         errno = EINVAL;
419                         goto fail1;
420                       }
421                     if (ret < 0)
422                       abort ();
423                     length += ret;
424                     goto done_appending;
425                   }
426                 }
427               }
428              done_appending: ;
429             }
430         }
431 
432         if (!uc_is_case_ignorable (uc))
433           last_char_except_ignorable = uc;
434 
435         {
436           int ccc = uc_combining_class (uc);
437           if (ccc == UC_CCC_A || ccc == UC_CCC_NR)
438             last_char_normal_or_above = uc;
439         }
440 
441         s += count;
442         wp += count;
443       }
444   }
445 
446   free (wordbreaks);
447 
448   if (nf != NULL)
449     {
450       /* Finally, normalize the result.  */
451       UNIT *normalized_result;
452 
453       normalized_result = U_NORMALIZE (nf, result, length, resultbuf, lengthp);
454       if (normalized_result == NULL)
455         goto fail2;
456 
457       free (result);
458       return normalized_result;
459     }
460 
461   if (length == 0)
462     {
463       if (result == NULL)
464         {
465           /* Return a non-NULL value.  NULL means error.  */
466           result = (UNIT *) malloc (1);
467           if (result == NULL)
468             {
469               errno = ENOMEM;
470               goto fail2;
471             }
472         }
473     }
474   else if (result != resultbuf && length < allocated)
475     {
476       /* Shrink the allocated memory if possible.  */
477       UNIT *memory;
478 
479       memory = (UNIT *) realloc (result, length * sizeof (UNIT));
480       if (memory != NULL)
481         result = memory;
482     }
483 
484   *lengthp = length;
485   return result;
486 
487  fail1:
488   {
489     int saved_errno = errno;
490     free (wordbreaks);
491     errno = saved_errno;
492   }
493  fail2:
494   if (result != resultbuf)
495     {
496       int saved_errno = errno;
497       free (result);
498       errno = saved_errno;
499     }
500   return NULL;
501 }
502