1 /* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */
2
3 /*
4 * Word breaking in a Unicode sequence. Designed to be used in a
5 * generic text renderer.
6 *
7 * Copyright (C) 2013-2016 Tom Hacohen <tom at stosb dot com>
8 *
9 * This software is provided 'as-is', without any express or implied
10 * warranty. In no event will the author be held liable for any damages
11 * arising from the use of this software.
12 *
13 * Permission is granted to anyone to use this software for any purpose,
14 * including commercial applications, and to alter it and redistribute
15 * it freely, subject to the following restrictions:
16 *
17 * 1. The origin of this software must not be misrepresented; you must
18 * not claim that you wrote the original software. If you use this
19 * software in a product, an acknowledgement in the product
20 * documentation would be appreciated but is not required.
21 * 2. Altered source versions must be plainly marked as such, and must
22 * not be misrepresented as being the original software.
23 * 3. This notice may not be removed or altered from any source
24 * distribution.
25 *
26 * The main reference is Unicode Standard Annex 29 (UAX #29):
27 * <URL:http://unicode.org/reports/tr29>
28 *
29 * When this library was designed, this annex was at Revision 17, for
30 * Unicode 6.0.0:
31 * <URL:http://www.unicode.org/reports/tr29/tr29-17.html>
32 *
33 * This library has been updated according to Revision 29, for
34 * Unicode 9.0.0:
35 * <URL:http://www.unicode.org/reports/tr29/tr29-29.html>
36 *
37 * The Unicode Terms of Use are available at
38 * <URL:http://www.unicode.org/copyright.html>
39 */
40
41 /**
42 * @file wordbreak.c
43 *
44 * Implementation of the word breaking algorithm as described in Unicode
45 * Standard Annex 29.
46 *
47 * @author Tom Hacohen
48 */
49
50 #include <assert.h>
51 #include <stddef.h>
52 #include <string.h>
53 #include "unibreakdef.h"
54 #include "wordbreak.h"
55 #include "wordbreakdata.c"
56
57 #define ARRAY_LEN(x) (sizeof(x) / sizeof(x[0]))
58
59 /**
60 * Initializes the wordbreak internals. It currently does nothing, but
61 * it may in the future.
62 */
init_wordbreak(void)63 void init_wordbreak(void)
64 {
65 }
66
67 /**
68 * Gets the word breaking class of a character.
69 *
70 * @param ch character to check
71 * @param wbp pointer to the wbp breaking properties array
72 * @param len size of the wbp array in number of items
73 * @return the word breaking class if found; \c WBP_Any otherwise
74 */
get_char_wb_class(utf32_t ch,const struct WordBreakProperties * wbp,size_t len)75 static enum WordBreakClass get_char_wb_class(
76 utf32_t ch,
77 const struct WordBreakProperties *wbp,
78 size_t len)
79 {
80 int min = 0;
81 int max = len - 1;
82 int mid;
83
84 do
85 {
86 mid = (min + max) / 2;
87
88 if (ch < wbp[mid].start)
89 max = mid - 1;
90 else if (ch > wbp[mid].end)
91 min = mid + 1;
92 else
93 return wbp[mid].prop;
94 }
95 while (min <= max);
96
97 return WBP_Any;
98 }
99
100 /**
101 * Sets the word break types to a specific value in a range.
102 *
103 * It sets the inside chars to #WORDBREAK_INSIDEACHAR and the rest to brkType.
104 * Assumes \a brks is initialized - all the cells with #WORDBREAK_NOBREAK are
105 * cells that we really don't want to break after.
106 *
107 * @param[in] s input string
108 * @param[out] brks breaks array to fill
109 * @param[in] posStart start position
110 * @param[in] posEnd end position (exclusive)
111 * @param[in] len length of the string
112 * @param[in] brkType breaks type to use
113 * @param[in] get_next_char function to get the next UTF-32 character
114 */
set_brks_to(const void * s,char * brks,size_t posStart,size_t posEnd,size_t len,char brkType,get_next_char_t get_next_char)115 static void set_brks_to(
116 const void *s,
117 char *brks,
118 size_t posStart,
119 size_t posEnd,
120 size_t len,
121 char brkType,
122 get_next_char_t get_next_char)
123 {
124 size_t posNext = posStart;
125 while (posNext < posEnd)
126 {
127 utf32_t ch;
128 ch = get_next_char(s, len, &posNext);
129 (void)ch;
130 assert(ch != EOS);
131 for (; posStart < posNext - 1; ++posStart)
132 brks[posStart] = WORDBREAK_INSIDEACHAR;
133 assert(posStart == posNext - 1);
134
135 /* Only set it if we haven't set it not to break before. */
136 if (brks[posStart] != WORDBREAK_NOBREAK)
137 brks[posStart] = brkType;
138 posStart = posNext;
139 }
140 }
141
142 /* Checks to see if the class is newline, CR, or LF (rules WB3a and b). */
143 #define IS_WB3ab(cls) ((cls == WBP_Newline) || (cls == WBP_CR) || \
144 (cls == WBP_LF))
145
146 /**
147 * Sets the word breaking information for a generic input string.
148 *
149 * @param[in] s input string
150 * @param[in] len length of the input
151 * @param[in] lang language of the input (reserved for future use)
152 * @param[out] brks pointer to the output breaking data, containing
153 * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
154 * #WORDBREAK_INSIDEACHAR
155 * @param[in] get_next_char function to get the next UTF-32 character
156 */
set_wordbreaks(const void * s,size_t len,const char * lang,char * brks,get_next_char_t get_next_char)157 static void set_wordbreaks(
158 const void *s,
159 size_t len,
160 const char *lang,
161 char *brks,
162 get_next_char_t get_next_char)
163 {
164 /* Counter of how many time we cam across RI */
165 int riCounter = 0;
166 enum WordBreakClass wbcLast = WBP_Undefined;
167 /* wbcSeqStart is the class that started the current sequence.
168 * WBP_Undefined is a special case that means "sot".
169 * This value is the class that is at the start of the current rule
170 * matching sequence. For example, in case of Numeric+MidNum+Numeric
171 * it'll be Numeric all the way.
172 */
173 enum WordBreakClass wbcSeqStart = WBP_Undefined;
174 utf32_t ch;
175 size_t posNext = 0;
176 size_t posCur = 0;
177 size_t posLast = 0;
178
179 /* TODO: Language-specific specialization. */
180 (void) lang;
181
182 /* Init brks. */
183 memset(brks, WORDBREAK_BREAK, len);
184
185 ch = get_next_char(s, len, &posNext);
186
187 while (ch != EOS)
188 {
189 enum WordBreakClass wbcCur;
190 wbcCur = get_char_wb_class(ch, wb_prop_default,
191 ARRAY_LEN(wb_prop_default));
192
193 switch (wbcCur)
194 {
195 case WBP_CR:
196 /* WB3b */
197 set_brks_to(s, brks, posLast, posCur, len,
198 WORDBREAK_BREAK, get_next_char);
199 wbcSeqStart = wbcCur;
200 posLast = posCur;
201 break;
202
203 case WBP_LF:
204 if (wbcSeqStart == WBP_CR) /* WB3 */
205 {
206 set_brks_to(s, brks, posLast, posCur, len,
207 WORDBREAK_NOBREAK, get_next_char);
208 wbcSeqStart = wbcCur;
209 posLast = posCur;
210 break;
211 }
212 #ifndef __has_attribute
213 # define __has_attribute(x) 0
214 #endif
215 #if __has_attribute(fallthrough)
216 __attribute__((fallthrough));
217 #endif
218 /* Fall off */
219
220 case WBP_Newline:
221 /* WB3a,3b */
222 set_brks_to(s, brks, posLast, posCur, len,
223 WORDBREAK_BREAK, get_next_char);
224 wbcSeqStart = wbcCur;
225 posLast = posCur;
226 break;
227
228 case WBP_E_Base_GAZ:
229 case WBP_Glue_After_Zwj:
230 /* WB3c */
231 if (wbcLast == WBP_ZWJ)
232 {
233 set_brks_to(s, brks, posLast, posCur, len,
234 WORDBREAK_NOBREAK, get_next_char);
235 }
236 /* No rule found, reset */
237 else
238 {
239 set_brks_to(s, brks, posLast, posCur, len,
240 WORDBREAK_BREAK, get_next_char);
241 }
242 wbcSeqStart = wbcCur;
243 posLast = posCur;
244 break;
245
246 case WBP_ZWJ:
247 case WBP_Extend:
248 case WBP_Format:
249 /* WB4 - If not the first char/after a newline (WB3a,3b), skip
250 * this class, set it to be the same as the prev, and mark
251 * brks not to break before them. */
252 if ((wbcSeqStart == WBP_Undefined) || IS_WB3ab(wbcSeqStart))
253 {
254 set_brks_to(s, brks, posLast, posCur, len,
255 WORDBREAK_BREAK, get_next_char);
256 wbcSeqStart = wbcCur;
257 posLast = posCur;
258 }
259 else
260 {
261 /* It's surely not the first */
262 brks[posCur - 1] = WORDBREAK_NOBREAK;
263 /* WB3c precedes 4, so no intervening Extend chars allowed. */
264 if (wbcSeqStart != WBP_ZWJ)
265 {
266 /* "inherit" the previous class. */
267 wbcCur = wbcLast;
268 }
269 }
270 break;
271
272 case WBP_Katakana:
273 if ((wbcSeqStart == WBP_Katakana) || /* WB13 */
274 (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
275 {
276 set_brks_to(s, brks, posLast, posCur, len,
277 WORDBREAK_NOBREAK, get_next_char);
278 }
279 /* No rule found, reset */
280 else
281 {
282 set_brks_to(s, brks, posLast, posCur, len,
283 WORDBREAK_BREAK, get_next_char);
284 }
285 wbcSeqStart = wbcCur;
286 posLast = posCur;
287 break;
288
289 case WBP_Hebrew_Letter:
290 case WBP_ALetter:
291 if ((wbcSeqStart == WBP_Hebrew_Letter) &&
292 (wbcLast == WBP_Double_Quote)) /* WB7b,c */
293 {
294 if (wbcCur == WBP_Hebrew_Letter)
295 {
296 set_brks_to(s, brks, posLast, posCur, len,
297 WORDBREAK_NOBREAK, get_next_char);
298 }
299 else
300 {
301 set_brks_to(s, brks, posLast, posCur, len,
302 WORDBREAK_BREAK, get_next_char);
303 }
304 }
305 else if (((wbcSeqStart == WBP_ALetter) ||
306 (wbcSeqStart == WBP_Hebrew_Letter)) || /* WB5,6,7 */
307 (wbcLast == WBP_Numeric) || /* WB10 */
308 (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
309 {
310 set_brks_to(s, brks, posLast, posCur, len,
311 WORDBREAK_NOBREAK, get_next_char);
312 }
313 /* No rule found, reset */
314 else
315 {
316 set_brks_to(s, brks, posLast, posCur, len,
317 WORDBREAK_BREAK, get_next_char);
318 }
319 wbcSeqStart = wbcCur;
320 posLast = posCur;
321 break;
322
323 case WBP_Single_Quote:
324 if (wbcLast == WBP_Hebrew_Letter) /* WB7a */
325 {
326 set_brks_to(s, brks, posLast, posCur, len,
327 WORDBREAK_NOBREAK, get_next_char);
328 wbcSeqStart = wbcCur;
329 posLast = posCur;
330 }
331 #ifndef __has_attribute
332 # define __has_attribute(x) 0
333 #endif
334 #if __has_attribute(fallthrough)
335 __attribute__((fallthrough));
336 #endif
337 /* No break on purpose */
338 case WBP_MidNumLet:
339 if (((wbcLast == WBP_ALetter) ||
340 (wbcLast == WBP_Hebrew_Letter)) || /* WB6,7 */
341 (wbcLast == WBP_Numeric)) /* WB11,12 */
342 {
343 /* Go on */
344 }
345 else
346 {
347 set_brks_to(s, brks, posLast, posCur, len,
348 WORDBREAK_BREAK, get_next_char);
349 wbcSeqStart = wbcCur;
350 posLast = posCur;
351 }
352 break;
353
354 case WBP_MidLetter:
355 if ((wbcLast == WBP_ALetter) ||
356 (wbcLast == WBP_Hebrew_Letter)) /* WB6,7 */
357 {
358 /* Go on */
359 }
360 else
361 {
362 set_brks_to(s, brks, posLast, posCur, len,
363 WORDBREAK_BREAK, get_next_char);
364 wbcSeqStart = wbcCur;
365 posLast = posCur;
366 }
367 break;
368
369 case WBP_MidNum:
370 if (wbcLast == WBP_Numeric) /* WB11,12 */
371 {
372 /* Go on */
373 }
374 else
375 {
376 set_brks_to(s, brks, posLast, posCur, len,
377 WORDBREAK_BREAK, get_next_char);
378 wbcSeqStart = wbcCur;
379 posLast = posCur;
380 }
381 break;
382
383 case WBP_Numeric:
384 if ((wbcSeqStart == WBP_Numeric) || /* WB8,11,12 */
385 ((wbcLast == WBP_ALetter) ||
386 (wbcLast == WBP_Hebrew_Letter)) || /* WB9 */
387 (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
388 {
389 set_brks_to(s, brks, posLast, posCur, len,
390 WORDBREAK_NOBREAK, get_next_char);
391 }
392 /* No rule found, reset */
393 else
394 {
395 set_brks_to(s, brks, posLast, posCur, len,
396 WORDBREAK_BREAK, get_next_char);
397 }
398 wbcSeqStart = wbcCur;
399 posLast = posCur;
400 break;
401
402 case WBP_ExtendNumLet:
403 /* WB13a,13b */
404 if ((wbcSeqStart == wbcLast) &&
405 ((wbcLast == WBP_ALetter) ||
406 (wbcLast == WBP_Hebrew_Letter) ||
407 (wbcLast == WBP_Numeric) ||
408 (wbcLast == WBP_Katakana) ||
409 (wbcLast == WBP_ExtendNumLet)))
410 {
411 set_brks_to(s, brks, posLast, posCur, len,
412 WORDBREAK_NOBREAK, get_next_char);
413 }
414 /* No rule found, reset */
415 else
416 {
417 set_brks_to(s, brks, posLast, posCur, len,
418 WORDBREAK_BREAK, get_next_char);
419 }
420 wbcSeqStart = wbcCur;
421 posLast = posCur;
422 break;
423
424 case WBP_E_Base:
425 /* No rule found, reset */
426 set_brks_to(s, brks, posLast, posCur, len,
427 WORDBREAK_BREAK, get_next_char);
428 wbcSeqStart = wbcCur;
429 posLast = posCur;
430 break;
431
432 case WBP_E_Modifier:
433 /* WB14 */
434 if ((wbcLast == WBP_E_Base) ||
435 (wbcLast == WBP_E_Base_GAZ))
436 {
437 set_brks_to(s, brks, posLast, posCur, len,
438 WORDBREAK_NOBREAK, get_next_char);
439 }
440 /* No rule found, reset */
441 else
442 {
443 set_brks_to(s, brks, posLast, posCur, len,
444 WORDBREAK_BREAK, get_next_char);
445 }
446 wbcSeqStart = wbcCur;
447 posLast = posCur;
448 break;
449
450 case WBP_Regional_Indicator:
451 /* WB15,16 */
452 if ((wbcSeqStart == WBP_Regional_Indicator) &&
453 ((riCounter % 2) == 1))
454 {
455 set_brks_to(s, brks, posLast, posCur, len,
456 WORDBREAK_NOBREAK, get_next_char);
457 riCounter = 0; /* Reset the sequence */
458 }
459 /* No rule found, reset */
460 else
461 {
462 set_brks_to(s, brks, posLast, posCur, len,
463 WORDBREAK_BREAK, get_next_char);
464 riCounter = 1;
465 }
466 wbcSeqStart = wbcCur;
467 posLast = posCur;
468 break;
469
470 case WBP_Double_Quote:
471 if (wbcLast == WBP_Hebrew_Letter) /* WB7b,c */
472 {
473 /* Go on */
474 }
475 else
476 {
477 set_brks_to(s, brks, posLast, posCur, len,
478 WORDBREAK_BREAK, get_next_char);
479 wbcSeqStart = wbcCur;
480 posLast = posCur;
481 }
482 break;
483
484 case WBP_Any:
485 /* Allow breaks and reset */
486 set_brks_to(s, brks, posLast, posCur, len,
487 WORDBREAK_BREAK, get_next_char);
488 wbcSeqStart = wbcCur;
489 posLast = posCur;
490 break;
491
492 default:
493 /* Error, should never get here! */
494 assert(0);
495 break;
496 }
497
498 wbcLast = wbcCur;
499 posCur = posNext;
500 ch = get_next_char(s, len, &posNext);
501 }
502
503 /* WB2 */
504 set_brks_to(s, brks, posLast, posNext, len,
505 WORDBREAK_BREAK, get_next_char);
506 }
507
508 /**
509 * Sets the word breaking information for a UTF-8 input string.
510 *
511 * @param[in] s input UTF-8 string
512 * @param[in] len length of the input
513 * @param[in] lang language of the input (reserved for future use)
514 * @param[out] brks pointer to the output breaking data, containing
515 * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
516 * #WORDBREAK_INSIDEACHAR
517 */
set_wordbreaks_utf8(const utf8_t * s,size_t len,const char * lang,char * brks)518 void set_wordbreaks_utf8(
519 const utf8_t *s,
520 size_t len,
521 const char *lang,
522 char *brks)
523 {
524 set_wordbreaks(s, len, lang, brks,
525 (get_next_char_t)ub_get_next_char_utf8);
526 }
527
528 /**
529 * Sets the word breaking information for a UTF-16 input string.
530 *
531 * @param[in] s input UTF-16 string
532 * @param[in] len length of the input
533 * @param[in] lang language of the input (reserved for future use)
534 * @param[out] brks pointer to the output breaking data, containing
535 * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
536 * #WORDBREAK_INSIDEACHAR
537 */
set_wordbreaks_utf16(const utf16_t * s,size_t len,const char * lang,char * brks)538 void set_wordbreaks_utf16(
539 const utf16_t *s,
540 size_t len,
541 const char *lang,
542 char *brks)
543 {
544 set_wordbreaks(s, len, lang, brks,
545 (get_next_char_t)ub_get_next_char_utf16);
546 }
547
548 /**
549 * Sets the word breaking information for a UTF-32 input string.
550 *
551 * @param[in] s input UTF-32 string
552 * @param[in] len length of the input
553 * @param[in] lang language of the input (reserved for future use)
554 * @param[out] brks pointer to the output breaking data, containing
555 * #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
556 * #WORDBREAK_INSIDEACHAR
557 */
set_wordbreaks_utf32(const utf32_t * s,size_t len,const char * lang,char * brks)558 void set_wordbreaks_utf32(
559 const utf32_t *s,
560 size_t len,
561 const char *lang,
562 char *brks)
563 {
564 set_wordbreaks(s, len, lang, brks,
565 (get_next_char_t)ub_get_next_char_utf32);
566 }
567