/* Titlecase mapping for UTF-8/UTF-16/UTF-32 substrings (locale dependent).
   Copyright (C) 2009-2014 Free Software Foundation, Inc.
   Written by Bruno Haible <bruno@clisp.org>, 2009.

   This program is free software: you can redistribute it and/or modify it
   under the terms of the GNU Lesser General Public License as published
   by the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */

/* Quoting the Unicode standard, section "Default Case Algorithms":
     Find the word boundaries in X according to Unicode Standard Annex #29,
     “Text Boundaries.” For each word boundary, find the first cased character
     F following the word boundary. If F exists, map F to Titlecase_Mapping(F);
     then map all characters C between F and the following word boundary to
     Lowercase_Mapping(C).  */

25 UNIT *
FUNC(const UNIT * s,size_t n,casing_prefix_context_t prefix_context,casing_suffix_context_t suffix_context,const char * iso639_language,uninorm_t nf,UNIT * resultbuf,size_t * lengthp)26 FUNC (const UNIT *s, size_t n,
27 casing_prefix_context_t prefix_context,
28 casing_suffix_context_t suffix_context,
29 const char *iso639_language,
30 uninorm_t nf,
31 UNIT *resultbuf, size_t *lengthp)
32 {
33 /* The result being accumulated. */
34 UNIT *result;
35 size_t length;
36 size_t allocated;
37 /* An array containing the word break positions. */
38 char *wordbreaks;
39
40 /* Initialize the accumulator. */
41 if (nf != NULL || resultbuf == NULL)
42 {
43 result = NULL;
44 allocated = 0;
45 }
46 else
47 {
48 result = resultbuf;
49 allocated = *lengthp;
50 }
51 length = 0;
52
53 /* Initialize the word breaks array. */
54 if (n > 0)
55 {
56 wordbreaks = (char *) malloc (n);
57 if (wordbreaks == NULL)
58 {
59 errno = ENOMEM;
60 goto fail2;
61 }
62 U_WORDBREAKS (s, n, wordbreaks);
63 }
64 else
65 wordbreaks = NULL;
66
67 {
68 const UNIT *s_end = s + n;
69 const char *wp = wordbreaks;
70
71 /* When considering the string as segmented by word boundaries: For each
72 such segment:
73 - In the first part, we are searching for the first cased character.
74 In this state, in_word_first_part = true, and no conversion takes
75 place.
76 - In the second part, we are converting every character: the first
77 among these characters to title case, the other ones to lower case.
78 In this state, in_word_first_part = false. */
79 bool in_word_first_part = true;
80
81 /* Helper for evaluating the FINAL_SIGMA condition:
82 Last character that was not case-ignorable. */
83 ucs4_t last_char_except_ignorable =
84 prefix_context.last_char_except_ignorable;
85
86 /* Helper for evaluating the AFTER_SOFT_DOTTED and AFTER_I conditions:
87 Last character that was of combining class 230 ("Above") or 0. */
88 ucs4_t last_char_normal_or_above =
89 prefix_context.last_char_normal_or_above;
90
91 while (s < s_end)
92 {
93 /* Fetch the next character. */
94 ucs4_t uc;
95 int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s);
96
97 ucs4_t (*single_character_map) (ucs4_t);
98 size_t offset_in_rule; /* offset in 'struct special_casing_rule' */
99
100 ucs4_t mapped_uc[3];
101 unsigned int mapped_count;
102
103 if (*wp)
104 /* Crossing a word boundary. */
105 in_word_first_part = true;
106
107 /* Determine single_character_map, offset_in_rule.
108 There are three possibilities:
109 - uc should not be converted.
110 - uc should be titlecased.
111 - uc should be lowercased. */
112 if (in_word_first_part)
113 {
114 if (uc_is_cased (uc))
115 {
116 /* uc is to be titlecased. */
117 single_character_map = uc_totitle;
118 offset_in_rule = offsetof (struct special_casing_rule, title[0]);
119 in_word_first_part = false;
120 }
121 else
122 {
123 /* uc is not converted. */
124 single_character_map = NULL;
125 offset_in_rule = 0;
126 }
127 }
128 else
129 {
130 /* uc is to be lowercased. */
131 single_character_map = uc_tolower;
132 offset_in_rule = offsetof (struct special_casing_rule, lower[0]);
133 }
134
135 /* Actually map uc. */
136 if (single_character_map == NULL)
137 {
138 mapped_uc[0] = uc;
139 mapped_count = 1;
140 goto found_mapping;
141 }
142
143 if (uc < 0x10000)
144 {
145 /* Look first in the special-casing table. */
146 char code[3];
147
148 code[0] = (uc >> 8) & 0xff;
149 code[1] = uc & 0xff;
150
151 for (code[2] = 0; ; code[2]++)
152 {
153 const struct special_casing_rule *rule =
154 gl_unicase_special_lookup (code, 3);
155
156 if (rule == NULL)
157 break;
158
159 /* Test if the condition applies. */
160 /* Does the language apply? */
161 if (rule->language[0] == '\0'
162 || (iso639_language != NULL
163 && iso639_language[0] == rule->language[0]
164 && iso639_language[1] == rule->language[1]))
165 {
166 /* Does the context apply? */
167 int context = rule->context;
168 bool applies;
169
170 if (context < 0)
171 context = - context;
172 switch (context)
173 {
174 case SCC_ALWAYS:
175 applies = true;
176 break;
177
178 case SCC_FINAL_SIGMA:
179 /* "Before" condition: preceded by a sequence
180 consisting of a cased letter and a case-ignorable
181 sequence.
182 "After" condition: not followed by a sequence
183 consisting of a case-ignorable sequence and then a
184 cased letter. */
185 /* Test the "before" condition. */
186 applies = uc_is_cased (last_char_except_ignorable);
187 /* Test the "after" condition. */
188 if (applies)
189 {
190 const UNIT *s2 = s + count;
191 for (;;)
192 {
193 if (s2 < s_end)
194 {
195 ucs4_t uc2;
196 int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
197 /* Our uc_is_case_ignorable function is
198 known to return false for all cased
199 characters. So we can call
200 uc_is_case_ignorable first. */
201 if (!uc_is_case_ignorable (uc2))
202 {
203 applies = ! uc_is_cased (uc2);
204 break;
205 }
206 s2 += count2;
207 }
208 else
209 {
210 applies = ! uc_is_cased (suffix_context.first_char_except_ignorable);
211 break;
212 }
213 }
214 }
215 break;
216
217 case SCC_AFTER_SOFT_DOTTED:
218 /* "Before" condition: There is a Soft_Dotted character
219 before it, with no intervening character of
220 combining class 0 or 230 (Above). */
221 /* Test the "before" condition. */
222 applies = uc_is_property_soft_dotted (last_char_normal_or_above);
223 break;
224
225 case SCC_MORE_ABOVE:
226 /* "After" condition: followed by a character of
227 combining class 230 (Above) with no intervening
228 character of combining class 0 or 230 (Above). */
229 /* Test the "after" condition. */
230 {
231 const UNIT *s2 = s + count;
232 applies = false;
233 for (;;)
234 {
235 if (s2 < s_end)
236 {
237 ucs4_t uc2;
238 int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
239 int ccc = uc_combining_class (uc2);
240 if (ccc == UC_CCC_A)
241 {
242 applies = true;
243 break;
244 }
245 if (ccc == UC_CCC_NR)
246 break;
247 s2 += count2;
248 }
249 else
250 {
251 applies = ((suffix_context.bits & SCC_MORE_ABOVE_MASK) != 0);
252 break;
253 }
254 }
255 }
256 break;
257
258 case SCC_BEFORE_DOT:
259 /* "After" condition: followed by COMBINING DOT ABOVE
260 (U+0307). Any sequence of characters with a
261 combining class that is neither 0 nor 230 may
262 intervene between the current character and the
263 combining dot above. */
264 /* Test the "after" condition. */
265 {
266 const UNIT *s2 = s + count;
267 applies = false;
268 for (;;)
269 {
270 if (s2 < s_end)
271 {
272 ucs4_t uc2;
273 int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
274 if (uc2 == 0x0307) /* COMBINING DOT ABOVE */
275 {
276 applies = true;
277 break;
278 }
279 {
280 int ccc = uc_combining_class (uc2);
281 if (ccc == UC_CCC_A || ccc == UC_CCC_NR)
282 break;
283 }
284 s2 += count2;
285 }
286 else
287 {
288 applies = ((suffix_context.bits & SCC_BEFORE_DOT_MASK) != 0);
289 break;
290 }
291 }
292 }
293 break;
294
295 case SCC_AFTER_I:
296 /* "Before" condition: There is an uppercase I before
297 it, and there is no intervening character of
298 combining class 0 or 230 (Above). */
299 /* Test the "before" condition. */
300 applies = (last_char_normal_or_above == 'I');
301 break;
302
303 default:
304 abort ();
305 }
306 if (rule->context < 0)
307 applies = !applies;
308
309 if (applies)
310 {
311 /* The rule applies.
312 Look up the mapping (0 to 3 characters). */
313 const unsigned short *mapped_in_rule =
314 (const unsigned short *)((const char *)rule + offset_in_rule);
315
316 if (mapped_in_rule[0] == 0)
317 mapped_count = 0;
318 else
319 {
320 mapped_uc[0] = mapped_in_rule[0];
321 if (mapped_in_rule[1] == 0)
322 mapped_count = 1;
323 else
324 {
325 mapped_uc[1] = mapped_in_rule[1];
326 if (mapped_in_rule[2] == 0)
327 mapped_count = 2;
328 else
329 {
330 mapped_uc[2] = mapped_in_rule[2];
331 mapped_count = 3;
332 }
333 }
334 }
335 goto found_mapping;
336 }
337 }
338
339 /* Optimization: Save a hash table lookup in the next round. */
340 if (!rule->has_next)
341 break;
342 }
343 }
344
345 /* No special-cased mapping. So use the locale and context independent
346 mapping. */
347 mapped_uc[0] = single_character_map (uc);
348 mapped_count = 1;
349
350 found_mapping:
351 /* Found the mapping: uc maps to mapped_uc[0..mapped_count-1]. */
352 {
353 unsigned int i;
354
355 for (i = 0; i < mapped_count; i++)
356 {
357 ucs4_t muc = mapped_uc[i];
358
359 /* Append muc to the result accumulator. */
360 if (length < allocated)
361 {
362 int ret = U_UCTOMB (result + length, muc, allocated - length);
363 if (ret == -1)
364 {
365 errno = EINVAL;
366 goto fail1;
367 }
368 if (ret >= 0)
369 {
370 length += ret;
371 goto done_appending;
372 }
373 }
374 {
375 size_t old_allocated = allocated;
376 size_t new_allocated = 2 * old_allocated;
377 if (new_allocated < 64)
378 new_allocated = 64;
379 if (new_allocated < old_allocated) /* integer overflow? */
380 abort ();
381 {
382 UNIT *larger_result;
383 if (result == NULL)
384 {
385 larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT));
386 if (larger_result == NULL)
387 {
388 errno = ENOMEM;
389 goto fail1;
390 }
391 }
392 else if (result == resultbuf)
393 {
394 larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT));
395 if (larger_result == NULL)
396 {
397 errno = ENOMEM;
398 goto fail1;
399 }
400 U_CPY (larger_result, resultbuf, length);
401 }
402 else
403 {
404 larger_result =
405 (UNIT *) realloc (result, new_allocated * sizeof (UNIT));
406 if (larger_result == NULL)
407 {
408 errno = ENOMEM;
409 goto fail1;
410 }
411 }
412 result = larger_result;
413 allocated = new_allocated;
414 {
415 int ret = U_UCTOMB (result + length, muc, allocated - length);
416 if (ret == -1)
417 {
418 errno = EINVAL;
419 goto fail1;
420 }
421 if (ret < 0)
422 abort ();
423 length += ret;
424 goto done_appending;
425 }
426 }
427 }
428 done_appending: ;
429 }
430 }
431
432 if (!uc_is_case_ignorable (uc))
433 last_char_except_ignorable = uc;
434
435 {
436 int ccc = uc_combining_class (uc);
437 if (ccc == UC_CCC_A || ccc == UC_CCC_NR)
438 last_char_normal_or_above = uc;
439 }
440
441 s += count;
442 wp += count;
443 }
444 }
445
446 free (wordbreaks);
447
448 if (nf != NULL)
449 {
450 /* Finally, normalize the result. */
451 UNIT *normalized_result;
452
453 normalized_result = U_NORMALIZE (nf, result, length, resultbuf, lengthp);
454 if (normalized_result == NULL)
455 goto fail2;
456
457 free (result);
458 return normalized_result;
459 }
460
461 if (length == 0)
462 {
463 if (result == NULL)
464 {
465 /* Return a non-NULL value. NULL means error. */
466 result = (UNIT *) malloc (1);
467 if (result == NULL)
468 {
469 errno = ENOMEM;
470 goto fail2;
471 }
472 }
473 }
474 else if (result != resultbuf && length < allocated)
475 {
476 /* Shrink the allocated memory if possible. */
477 UNIT *memory;
478
479 memory = (UNIT *) realloc (result, length * sizeof (UNIT));
480 if (memory != NULL)
481 result = memory;
482 }
483
484 *lengthp = length;
485 return result;
486
487 fail1:
488 {
489 int saved_errno = errno;
490 free (wordbreaks);
491 errno = saved_errno;
492 }
493 fail2:
494 if (result != resultbuf)
495 {
496 int saved_errno = errno;
497 free (result);
498 errno = saved_errno;
499 }
500 return NULL;
501 }
502