/* Titlecase mapping for UTF-8/UTF-16/UTF-32 substrings (locale dependent).
   Copyright (C) 2009-2021 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2009.

   This file is free software.
   It is dual-licensed under "the GNU LGPLv3+ or the GNU GPLv2+".
   You can redistribute it and/or modify it under either
     - the terms of the GNU Lesser General Public License as published
       by the Free Software Foundation; either version 3, or (at your
       option) any later version, or
     - the terms of the GNU General Public License as published by the
       Free Software Foundation; either version 2, or (at your option)
       any later version, or
     - the same dual license "the GNU LGPLv3+ or the GNU GPLv2+".

   This file is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License and the GNU General Public License
   for more details.

   You should have received a copy of the GNU Lesser General Public
   License and of the GNU General Public License along with this
   program.  If not, see <https://www.gnu.org/licenses/>.  */

/* Quoting the Unicode standard, section "Default Case Algorithms":
     Find the word boundaries in X according to Unicode Standard Annex #29,
     “Text Boundaries.” For each word boundary, find the first cased character
     F following the word boundary. If F exists, map F to Titlecase_Mapping(F);
     then map all characters C between F and the following word boundary to
     Lowercase_Mapping(C).  */

33 UNIT *
FUNC(const UNIT * s,size_t n,casing_prefix_context_t prefix_context,casing_suffix_context_t suffix_context,const char * iso639_language,uninorm_t nf,UNIT * resultbuf,size_t * lengthp)34 FUNC (const UNIT *s, size_t n,
35       casing_prefix_context_t prefix_context,
36       casing_suffix_context_t suffix_context,
37       const char *iso639_language,
38       uninorm_t nf,
39       UNIT *resultbuf, size_t *lengthp)
40 {
41   /* The result being accumulated.  */
42   UNIT *result;
43   size_t length;
44   size_t allocated;
45   /* An array containing the word break positions.  */
46   char *wordbreaks;
47 
48   /* Initialize the accumulator.  */
49   if (nf != NULL || resultbuf == NULL)
50     {
51       result = NULL;
52       allocated = 0;
53     }
54   else
55     {
56       result = resultbuf;
57       allocated = *lengthp;
58     }
59   length = 0;
60 
61   /* Initialize the word breaks array.  */
62   if (n > 0)
63     {
64       wordbreaks = (char *) malloc (n);
65       if (wordbreaks == NULL)
66         {
67           errno = ENOMEM;
68           goto fail2;
69         }
70       U_WORDBREAKS (s, n, wordbreaks);
71     }
72   else
73     wordbreaks = NULL;
74 
75   {
76     const UNIT *s_end = s + n;
77     const char *wp = wordbreaks;
78 
79     /* When considering the string as segmented by word boundaries: For each
80        such segment:
81         - In the first part, we are searching for the first cased character.
82           In this state, in_word_first_part = true, and no conversion takes
83           place.
84         - In the second part, we are converting every character: the first
85           among these characters to title case, the other ones to lower case.
86           In this state, in_word_first_part = false.  */
87     bool in_word_first_part = true;
88 
89     /* Helper for evaluating the FINAL_SIGMA condition:
90        Last character that was not case-ignorable.  */
91     ucs4_t last_char_except_ignorable =
92       prefix_context.last_char_except_ignorable;
93 
94     /* Helper for evaluating the AFTER_SOFT_DOTTED and AFTER_I conditions:
95        Last character that was of combining class 230 ("Above") or 0.  */
96     ucs4_t last_char_normal_or_above =
97       prefix_context.last_char_normal_or_above;
98 
99     while (s < s_end)
100       {
101         /* Fetch the next character.  */
102         ucs4_t uc;
103         int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s);
104 
105         ucs4_t (*single_character_map) (ucs4_t);
106         size_t offset_in_rule; /* offset in 'struct special_casing_rule' */
107 
108         ucs4_t mapped_uc[3];
109         unsigned int mapped_count;
110 
111         if (*wp)
112           /* Crossing a word boundary.  */
113           in_word_first_part = true;
114 
115         /* Determine single_character_map, offset_in_rule.
116            There are three possibilities:
117              - uc should not be converted.
118              - uc should be titlecased.
119              - uc should be lowercased.  */
120         if (in_word_first_part)
121           {
122             if (uc_is_cased (uc))
123               {
124                 /* uc is to be titlecased.  */
125                 single_character_map = uc_totitle;
126                 offset_in_rule = offsetof (struct special_casing_rule, title[0]);
127                 in_word_first_part = false;
128               }
129             else
130               {
131                 /* uc is not converted.  */
132                 single_character_map = NULL;
133                 offset_in_rule = 0;
134               }
135           }
136         else
137           {
138             /* uc is to be lowercased.  */
139             single_character_map = uc_tolower;
140             offset_in_rule = offsetof (struct special_casing_rule, lower[0]);
141           }
142 
143         /* Actually map uc.  */
144         if (single_character_map == NULL)
145           {
146             mapped_uc[0] = uc;
147             mapped_count = 1;
148             goto found_mapping;
149           }
150 
151         if (uc < 0x10000)
152           {
153             /* Look first in the special-casing table.  */
154             char code[3];
155 
156             code[0] = (uc >> 8) & 0xff;
157             code[1] = uc & 0xff;
158 
159             for (code[2] = 0; ; code[2]++)
160               {
161                 const struct special_casing_rule *rule =
162                   gl_unicase_special_lookup (code, 3);
163 
164                 if (rule == NULL)
165                   break;
166 
167                 /* Test if the condition applies.  */
168                 /* Does the language apply?  */
169                 if (rule->language[0] == '\0'
170                     || (iso639_language != NULL
171                         && iso639_language[0] == rule->language[0]
172                         && iso639_language[1] == rule->language[1]))
173                   {
174                     /* Does the context apply?  */
175                     int context = rule->context;
176                     bool applies;
177 
178                     if (context < 0)
179                       context = - context;
180                     switch (context)
181                       {
182                       case SCC_ALWAYS:
183                         applies = true;
184                         break;
185 
186                       case SCC_FINAL_SIGMA:
187                         /* "Before" condition: preceded by a sequence
188                            consisting of a cased letter and a case-ignorable
189                            sequence.
190                            "After" condition: not followed by a sequence
191                            consisting of a case-ignorable sequence and then a
192                            cased letter.  */
193                         /* Test the "before" condition.  */
194                         applies = uc_is_cased (last_char_except_ignorable);
195                         /* Test the "after" condition.  */
196                         if (applies)
197                           {
198                             const UNIT *s2 = s + count;
199                             for (;;)
200                               {
201                                 if (s2 < s_end)
202                                   {
203                                     ucs4_t uc2;
204                                     int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
205                                     /* Our uc_is_case_ignorable function is
206                                        known to return false for all cased
207                                        characters.  So we can call
208                                        uc_is_case_ignorable first.  */
209                                     if (!uc_is_case_ignorable (uc2))
210                                       {
211                                         applies = ! uc_is_cased (uc2);
212                                         break;
213                                       }
214                                     s2 += count2;
215                                   }
216                                 else
217                                   {
218                                     applies = ! uc_is_cased (suffix_context.first_char_except_ignorable);
219                                     break;
220                                   }
221                               }
222                           }
223                         break;
224 
225                       case SCC_AFTER_SOFT_DOTTED:
226                         /* "Before" condition: There is a Soft_Dotted character
227                            before it, with no intervening character of
228                            combining class 0 or 230 (Above).  */
229                         /* Test the "before" condition.  */
230                         applies = uc_is_property_soft_dotted (last_char_normal_or_above);
231                         break;
232 
233                       case SCC_MORE_ABOVE:
234                         /* "After" condition: followed by a character of
235                            combining class 230 (Above) with no intervening
236                            character of combining class 0 or 230 (Above).  */
237                         /* Test the "after" condition.  */
238                         {
239                           const UNIT *s2 = s + count;
240                           applies = false;
241                           for (;;)
242                             {
243                               if (s2 < s_end)
244                                 {
245                                   ucs4_t uc2;
246                                   int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
247                                   int ccc = uc_combining_class (uc2);
248                                   if (ccc == UC_CCC_A)
249                                     {
250                                       applies = true;
251                                       break;
252                                     }
253                                   if (ccc == UC_CCC_NR)
254                                     break;
255                                   s2 += count2;
256                                 }
257                               else
258                                 {
259                                   applies = ((suffix_context.bits & SCC_MORE_ABOVE_MASK) != 0);
260                                   break;
261                                 }
262                             }
263                         }
264                         break;
265 
266                       case SCC_BEFORE_DOT:
267                         /* "After" condition: followed by COMBINING DOT ABOVE
268                            (U+0307). Any sequence of characters with a
269                            combining class that is neither 0 nor 230 may
270                            intervene between the current character and the
271                            combining dot above.  */
272                         /* Test the "after" condition.  */
273                         {
274                           const UNIT *s2 = s + count;
275                           applies = false;
276                           for (;;)
277                             {
278                               if (s2 < s_end)
279                                 {
280                                   ucs4_t uc2;
281                                   int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
282                                   if (uc2 == 0x0307) /* COMBINING DOT ABOVE */
283                                     {
284                                       applies = true;
285                                       break;
286                                     }
287                                   {
288                                     int ccc = uc_combining_class (uc2);
289                                     if (ccc == UC_CCC_A || ccc == UC_CCC_NR)
290                                       break;
291                                   }
292                                   s2 += count2;
293                                 }
294                               else
295                                 {
296                                   applies = ((suffix_context.bits & SCC_BEFORE_DOT_MASK) != 0);
297                                   break;
298                                 }
299                             }
300                         }
301                         break;
302 
303                       case SCC_AFTER_I:
304                         /* "Before" condition: There is an uppercase I before
305                            it, and there is no intervening character of
306                            combining class 0 or 230 (Above).  */
307                         /* Test the "before" condition.  */
308                         applies = (last_char_normal_or_above == 'I');
309                         break;
310 
311                       default:
312                         abort ();
313                       }
314                     if (rule->context < 0)
315                       applies = !applies;
316 
317                     if (applies)
318                       {
319                         /* The rule applies.
320                            Look up the mapping (0 to 3 characters).  */
321                         const unsigned short *mapped_in_rule =
322                           (const unsigned short *)((const char *)rule + offset_in_rule);
323 
324                         if (mapped_in_rule[0] == 0)
325                           mapped_count = 0;
326                         else
327                           {
328                             mapped_uc[0] = mapped_in_rule[0];
329                             if (mapped_in_rule[1] == 0)
330                               mapped_count = 1;
331                             else
332                               {
333                                 mapped_uc[1] = mapped_in_rule[1];
334                                 if (mapped_in_rule[2] == 0)
335                                   mapped_count = 2;
336                                 else
337                                   {
338                                     mapped_uc[2] = mapped_in_rule[2];
339                                     mapped_count = 3;
340                                   }
341                               }
342                           }
343                         goto found_mapping;
344                       }
345                   }
346 
347                 /* Optimization: Save a hash table lookup in the next round.  */
348                 if (!rule->has_next)
349                   break;
350               }
351           }
352 
353         /* No special-cased mapping.  So use the locale and context independent
354            mapping.  */
355         mapped_uc[0] = single_character_map (uc);
356         mapped_count = 1;
357 
358        found_mapping:
359         /* Found the mapping: uc maps to mapped_uc[0..mapped_count-1].  */
360         {
361           unsigned int i;
362 
363           for (i = 0; i < mapped_count; i++)
364             {
365               ucs4_t muc = mapped_uc[i];
366 
367               /* Append muc to the result accumulator.  */
368               if (length < allocated)
369                 {
370                   int ret = U_UCTOMB (result + length, muc, allocated - length);
371                   if (ret == -1)
372                     {
373                       errno = EINVAL;
374                       goto fail1;
375                     }
376                   if (ret >= 0)
377                     {
378                       length += ret;
379                       goto done_appending;
380                     }
381                 }
382               {
383                 size_t old_allocated = allocated;
384                 size_t new_allocated = 2 * old_allocated;
385                 if (new_allocated < 64)
386                   new_allocated = 64;
387                 if (new_allocated < old_allocated) /* integer overflow? */
388                   abort ();
389                 {
390                   UNIT *larger_result;
391                   if (result == NULL)
392                     {
393                       larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT));
394                       if (larger_result == NULL)
395                         {
396                           errno = ENOMEM;
397                           goto fail1;
398                         }
399                     }
400                   else if (result == resultbuf)
401                     {
402                       larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT));
403                       if (larger_result == NULL)
404                         {
405                           errno = ENOMEM;
406                           goto fail1;
407                         }
408                       U_CPY (larger_result, resultbuf, length);
409                     }
410                   else
411                     {
412                       larger_result =
413                         (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
414                       if (larger_result == NULL)
415                         {
416                           errno = ENOMEM;
417                           goto fail1;
418                         }
419                     }
420                   result = larger_result;
421                   allocated = new_allocated;
422                   {
423                     int ret = U_UCTOMB (result + length, muc, allocated - length);
424                     if (ret == -1)
425                       {
426                         errno = EINVAL;
427                         goto fail1;
428                       }
429                     if (ret < 0)
430                       abort ();
431                     length += ret;
432                     goto done_appending;
433                   }
434                 }
435               }
436              done_appending: ;
437             }
438         }
439 
440         if (!uc_is_case_ignorable (uc))
441           last_char_except_ignorable = uc;
442 
443         {
444           int ccc = uc_combining_class (uc);
445           if (ccc == UC_CCC_A || ccc == UC_CCC_NR)
446             last_char_normal_or_above = uc;
447         }
448 
449         s += count;
450         wp += count;
451       }
452   }
453 
454   free (wordbreaks);
455 
456   if (nf != NULL)
457     {
458       /* Finally, normalize the result.  */
459       UNIT *normalized_result;
460 
461       normalized_result = U_NORMALIZE (nf, result, length, resultbuf, lengthp);
462       if (normalized_result == NULL)
463         goto fail2;
464 
465       free (result);
466       return normalized_result;
467     }
468 
469   if (length == 0)
470     {
471       if (result == NULL)
472         {
473           /* Return a non-NULL value.  NULL means error.  */
474           result = (UNIT *) malloc (1);
475           if (result == NULL)
476             {
477               errno = ENOMEM;
478               goto fail2;
479             }
480         }
481     }
482   else if (result != resultbuf && length < allocated)
483     {
484       /* Shrink the allocated memory if possible.  */
485       UNIT *memory;
486 
487       memory = (UNIT *) realloc (result, length * sizeof (UNIT));
488       if (memory != NULL)
489         result = memory;
490     }
491 
492   *lengthp = length;
493   return result;
494 
495  fail1:
496   {
497     int saved_errno = errno;
498     free (wordbreaks);
499     errno = saved_errno;
500   }
501  fail2:
502   if (result != resultbuf)
503     {
504       int saved_errno = errno;
505       free (result);
506       errno = saved_errno;
507     }
508   return NULL;
509 }

/*
 * Local Variables:
 * coding: utf-8
 * End:
 */