1 /* vim: set expandtab tabstop=4 softtabstop=4 shiftwidth=4: */
2 
3 /*
4  * Word breaking in a Unicode sequence.  Designed to be used in a
5  * generic text renderer.
6  *
7  * Copyright (C) 2013-2016 Tom Hacohen <tom at stosb dot com>
8  *
9  * This software is provided 'as-is', without any express or implied
10  * warranty.  In no event will the author be held liable for any damages
11  * arising from the use of this software.
12  *
13  * Permission is granted to anyone to use this software for any purpose,
14  * including commercial applications, and to alter it and redistribute
15  * it freely, subject to the following restrictions:
16  *
17  * 1. The origin of this software must not be misrepresented; you must
18  *    not claim that you wrote the original software.  If you use this
19  *    software in a product, an acknowledgement in the product
20  *    documentation would be appreciated but is not required.
21  * 2. Altered source versions must be plainly marked as such, and must
22  *    not be misrepresented as being the original software.
23  * 3. This notice may not be removed or altered from any source
24  *    distribution.
25  *
26  * The main reference is Unicode Standard Annex 29 (UAX #29):
27  *      <URL:http://unicode.org/reports/tr29>
28  *
29  * When this library was designed, this annex was at Revision 17, for
30  * Unicode 6.0.0:
31  *      <URL:http://www.unicode.org/reports/tr29/tr29-17.html>
32  *
33  * This library has been updated according to Revision 29, for
34  * Unicode 9.0.0:
35  *      <URL:http://www.unicode.org/reports/tr29/tr29-29.html>
36  *
37  * The Unicode Terms of Use are available at
38  *      <URL:http://www.unicode.org/copyright.html>
39  */
40 
41 /**
42  * @file    wordbreak.c
43  *
44  * Implementation of the word breaking algorithm as described in Unicode
45  * Standard Annex 29.
46  *
47  * @author  Tom Hacohen
48  */
49 
50 #include <assert.h>
51 #include <stddef.h>
52 #include <string.h>
53 #include "unibreakdef.h"
54 #include "wordbreak.h"
55 #include "wordbreakdata.c"
56 
57 #define ARRAY_LEN(x) (sizeof(x) / sizeof(x[0]))
58 
59 /**
60  * Initializes the wordbreak internals.  It currently does nothing, but
61  * it may in the future.
62  */
init_wordbreak(void)63 void init_wordbreak(void)
64 {
65 }
66 
67 /**
68  * Gets the word breaking class of a character.
69  *
70  * @param ch   character to check
71  * @param wbp  pointer to the wbp breaking properties array
72  * @param len  size of the wbp array in number of items
73  * @return     the word breaking class if found; \c WBP_Any otherwise
74  */
get_char_wb_class(utf32_t ch,const struct WordBreakProperties * wbp,size_t len)75 static enum WordBreakClass get_char_wb_class(
76         utf32_t ch,
77         const struct WordBreakProperties *wbp,
78         size_t len)
79 {
80     int min = 0;
81     int max = len - 1;
82     int mid;
83 
84     do
85     {
86         mid = (min + max) / 2;
87 
88         if (ch < wbp[mid].start)
89             max = mid - 1;
90         else if (ch > wbp[mid].end)
91             min = mid + 1;
92         else
93             return wbp[mid].prop;
94     }
95     while (min <= max);
96 
97     return WBP_Any;
98 }
99 
100 /**
101  * Sets the word break types to a specific value in a range.
102  *
103  * It sets the inside chars to #WORDBREAK_INSIDEACHAR and the rest to brkType.
104  * Assumes \a brks is initialized - all the cells with #WORDBREAK_NOBREAK are
105  * cells that we really don't want to break after.
106  *
107  * @param[in]  s             input string
108  * @param[out] brks          breaks array to fill
109  * @param[in]  posStart      start position
110  * @param[in]  posEnd        end position (exclusive)
111  * @param[in]  len           length of the string
112  * @param[in]  brkType       breaks type to use
113  * @param[in] get_next_char  function to get the next UTF-32 character
114  */
set_brks_to(const void * s,char * brks,size_t posStart,size_t posEnd,size_t len,char brkType,get_next_char_t get_next_char)115 static void set_brks_to(
116         const void *s,
117         char *brks,
118         size_t posStart,
119         size_t posEnd,
120         size_t len,
121         char brkType,
122         get_next_char_t get_next_char)
123 {
124     size_t posNext = posStart;
125     while (posNext < posEnd)
126     {
127         utf32_t ch;
128         ch = get_next_char(s, len, &posNext);
129         (void)ch;
130         assert(ch != EOS);
131         for (; posStart < posNext - 1; ++posStart)
132             brks[posStart] = WORDBREAK_INSIDEACHAR;
133         assert(posStart == posNext - 1);
134 
135         /* Only set it if we haven't set it not to break before. */
136         if (brks[posStart] != WORDBREAK_NOBREAK)
137             brks[posStart] = brkType;
138         posStart = posNext;
139     }
140 }
141 
142 /* Checks to see if the class is newline, CR, or LF (rules WB3a and b). */
143 #define IS_WB3ab(cls) ((cls == WBP_Newline) || (cls == WBP_CR) || \
144                        (cls == WBP_LF))
145 
146 /**
147  * Sets the word breaking information for a generic input string.
148  *
149  * @param[in]  s             input string
150  * @param[in]  len           length of the input
151  * @param[in]  lang          language of the input (reserved for future use)
152  * @param[out] brks          pointer to the output breaking data, containing
153  *                           #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
154  *                           #WORDBREAK_INSIDEACHAR
155  * @param[in] get_next_char  function to get the next UTF-32 character
156  */
set_wordbreaks(const void * s,size_t len,const char * lang,char * brks,get_next_char_t get_next_char)157 static void set_wordbreaks(
158         const void *s,
159         size_t len,
160         const char *lang,
161         char *brks,
162         get_next_char_t get_next_char)
163 {
164     /* Counter of how many time we cam across RI */
165     int riCounter = 0;
166     enum WordBreakClass wbcLast = WBP_Undefined;
167     /* wbcSeqStart is the class that started the current sequence.
168      * WBP_Undefined is a special case that means "sot".
169      * This value is the class that is at the start of the current rule
170      * matching sequence. For example, in case of Numeric+MidNum+Numeric
171      * it'll be Numeric all the way.
172      */
173     enum WordBreakClass wbcSeqStart = WBP_Undefined;
174     utf32_t ch;
175     size_t posNext = 0;
176     size_t posCur = 0;
177     size_t posLast = 0;
178 
179     /* TODO: Language-specific specialization. */
180     (void) lang;
181 
182     /* Init brks. */
183     memset(brks, WORDBREAK_BREAK, len);
184 
185     ch = get_next_char(s, len, &posNext);
186 
187     while (ch != EOS)
188     {
189         enum WordBreakClass wbcCur;
190         wbcCur = get_char_wb_class(ch, wb_prop_default,
191                                    ARRAY_LEN(wb_prop_default));
192 
193         switch (wbcCur)
194         {
195         case WBP_CR:
196             /* WB3b */
197             set_brks_to(s, brks, posLast, posCur, len,
198                         WORDBREAK_BREAK, get_next_char);
199             wbcSeqStart = wbcCur;
200             posLast = posCur;
201             break;
202 
203         case WBP_LF:
204             if (wbcSeqStart == WBP_CR) /* WB3 */
205             {
206                 set_brks_to(s, brks, posLast, posCur, len,
207                             WORDBREAK_NOBREAK, get_next_char);
208                 wbcSeqStart = wbcCur;
209                 posLast = posCur;
210                 break;
211             }
212 #ifndef __has_attribute
213 # define __has_attribute(x) 0
214 #endif
215 #if __has_attribute(fallthrough)
216            __attribute__((fallthrough));
217 #endif
218            /* Fall off */
219 
220         case WBP_Newline:
221             /* WB3a,3b */
222             set_brks_to(s, brks, posLast, posCur, len,
223                         WORDBREAK_BREAK, get_next_char);
224             wbcSeqStart = wbcCur;
225             posLast = posCur;
226             break;
227 
228         case WBP_E_Base_GAZ:
229         case WBP_Glue_After_Zwj:
230             /* WB3c */
231             if (wbcLast == WBP_ZWJ)
232             {
233                set_brks_to(s, brks, posLast, posCur, len,
234                        WORDBREAK_NOBREAK, get_next_char);
235             }
236             /* No rule found, reset */
237             else
238             {
239                 set_brks_to(s, brks, posLast, posCur, len,
240                             WORDBREAK_BREAK, get_next_char);
241             }
242             wbcSeqStart = wbcCur;
243             posLast = posCur;
244             break;
245 
246         case WBP_ZWJ:
247         case WBP_Extend:
248         case WBP_Format:
249             /* WB4 - If not the first char/after a newline (WB3a,3b), skip
250              * this class, set it to be the same as the prev, and mark
251              * brks not to break before them. */
252             if ((wbcSeqStart == WBP_Undefined) || IS_WB3ab(wbcSeqStart))
253             {
254                 set_brks_to(s, brks, posLast, posCur, len,
255                             WORDBREAK_BREAK, get_next_char);
256                 wbcSeqStart = wbcCur;
257                 posLast = posCur;
258             }
259             else
260             {
261                 /* It's surely not the first */
262                 brks[posCur - 1] = WORDBREAK_NOBREAK;
263                 /* WB3c precedes 4, so no intervening Extend chars allowed. */
264                 if (wbcSeqStart != WBP_ZWJ)
265                 {
266                     /* "inherit" the previous class. */
267                     wbcCur = wbcLast;
268                 }
269             }
270             break;
271 
272         case WBP_Katakana:
273             if ((wbcSeqStart == WBP_Katakana) || /* WB13 */
274                     (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
275             {
276                 set_brks_to(s, brks, posLast, posCur, len,
277                             WORDBREAK_NOBREAK, get_next_char);
278             }
279             /* No rule found, reset */
280             else
281             {
282                 set_brks_to(s, brks, posLast, posCur, len,
283                             WORDBREAK_BREAK, get_next_char);
284             }
285             wbcSeqStart = wbcCur;
286             posLast = posCur;
287             break;
288 
289         case WBP_Hebrew_Letter:
290         case WBP_ALetter:
291             if ((wbcSeqStart == WBP_Hebrew_Letter) &&
292                     (wbcLast == WBP_Double_Quote)) /* WB7b,c */
293             {
294                if (wbcCur == WBP_Hebrew_Letter)
295                  {
296                      set_brks_to(s, brks, posLast, posCur, len,
297                              WORDBREAK_NOBREAK, get_next_char);
298                  }
299                else
300                  {
301                      set_brks_to(s, brks, posLast, posCur, len,
302                              WORDBREAK_BREAK, get_next_char);
303                  }
304             }
305             else if (((wbcSeqStart == WBP_ALetter) ||
306                         (wbcSeqStart == WBP_Hebrew_Letter)) || /* WB5,6,7 */
307                     (wbcLast == WBP_Numeric) || /* WB10 */
308                     (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
309             {
310                 set_brks_to(s, brks, posLast, posCur, len,
311                             WORDBREAK_NOBREAK, get_next_char);
312             }
313             /* No rule found, reset */
314             else
315             {
316                 set_brks_to(s, brks, posLast, posCur, len,
317                             WORDBREAK_BREAK, get_next_char);
318             }
319             wbcSeqStart = wbcCur;
320             posLast = posCur;
321             break;
322 
323         case WBP_Single_Quote:
324             if (wbcLast == WBP_Hebrew_Letter) /* WB7a */
325             {
326                 set_brks_to(s, brks, posLast, posCur, len,
327                             WORDBREAK_NOBREAK, get_next_char);
328                 wbcSeqStart = wbcCur;
329                 posLast = posCur;
330             }
331 #ifndef __has_attribute
332 # define __has_attribute(x) 0
333 #endif
334 #if __has_attribute(fallthrough)
335            __attribute__((fallthrough));
336 #endif
337            /* No break on purpose */
338         case WBP_MidNumLet:
339             if (((wbcLast == WBP_ALetter) ||
340                         (wbcLast == WBP_Hebrew_Letter)) || /* WB6,7 */
341                     (wbcLast == WBP_Numeric)) /* WB11,12 */
342             {
343                 /* Go on */
344             }
345             else
346             {
347                 set_brks_to(s, brks, posLast, posCur, len,
348                             WORDBREAK_BREAK, get_next_char);
349                 wbcSeqStart = wbcCur;
350                 posLast = posCur;
351             }
352             break;
353 
354         case WBP_MidLetter:
355             if ((wbcLast == WBP_ALetter) ||
356                     (wbcLast == WBP_Hebrew_Letter)) /* WB6,7 */
357             {
358                 /* Go on */
359             }
360             else
361             {
362                 set_brks_to(s, brks, posLast, posCur, len,
363                             WORDBREAK_BREAK, get_next_char);
364                 wbcSeqStart = wbcCur;
365                 posLast = posCur;
366             }
367             break;
368 
369         case WBP_MidNum:
370             if (wbcLast == WBP_Numeric) /* WB11,12 */
371             {
372                 /* Go on */
373             }
374             else
375             {
376                 set_brks_to(s, brks, posLast, posCur, len,
377                             WORDBREAK_BREAK, get_next_char);
378                 wbcSeqStart = wbcCur;
379                 posLast = posCur;
380             }
381             break;
382 
383         case WBP_Numeric:
384             if ((wbcSeqStart == WBP_Numeric) || /* WB8,11,12 */
385                     ((wbcLast == WBP_ALetter) ||
386                      (wbcLast == WBP_Hebrew_Letter)) || /* WB9 */
387                     (wbcSeqStart == WBP_ExtendNumLet)) /* WB13b */
388             {
389                 set_brks_to(s, brks, posLast, posCur, len,
390                             WORDBREAK_NOBREAK, get_next_char);
391             }
392             /* No rule found, reset */
393             else
394             {
395                 set_brks_to(s, brks, posLast, posCur, len,
396                             WORDBREAK_BREAK, get_next_char);
397             }
398             wbcSeqStart = wbcCur;
399             posLast = posCur;
400             break;
401 
402         case WBP_ExtendNumLet:
403             /* WB13a,13b */
404             if ((wbcSeqStart == wbcLast) &&
405                 ((wbcLast == WBP_ALetter) ||
406                  (wbcLast == WBP_Hebrew_Letter) ||
407                  (wbcLast == WBP_Numeric) ||
408                  (wbcLast == WBP_Katakana) ||
409                  (wbcLast == WBP_ExtendNumLet)))
410             {
411                 set_brks_to(s, brks, posLast, posCur, len,
412                             WORDBREAK_NOBREAK, get_next_char);
413             }
414             /* No rule found, reset */
415             else
416             {
417                 set_brks_to(s, brks, posLast, posCur, len,
418                             WORDBREAK_BREAK, get_next_char);
419             }
420             wbcSeqStart = wbcCur;
421             posLast = posCur;
422             break;
423 
424         case WBP_E_Base:
425             /* No rule found, reset */
426             set_brks_to(s, brks, posLast, posCur, len,
427                     WORDBREAK_BREAK, get_next_char);
428             wbcSeqStart = wbcCur;
429             posLast = posCur;
430             break;
431 
432         case WBP_E_Modifier:
433             /* WB14 */
434             if ((wbcLast == WBP_E_Base) ||
435                 (wbcLast == WBP_E_Base_GAZ))
436             {
437                 set_brks_to(s, brks, posLast, posCur, len,
438                             WORDBREAK_NOBREAK, get_next_char);
439             }
440             /* No rule found, reset */
441             else
442             {
443                 set_brks_to(s, brks, posLast, posCur, len,
444                             WORDBREAK_BREAK, get_next_char);
445             }
446             wbcSeqStart = wbcCur;
447             posLast = posCur;
448             break;
449 
450         case WBP_Regional_Indicator:
451             /* WB15,16 */
452             if ((wbcSeqStart == WBP_Regional_Indicator) &&
453                 ((riCounter % 2) == 1))
454             {
455                 set_brks_to(s, brks, posLast, posCur, len,
456                         WORDBREAK_NOBREAK, get_next_char);
457                 riCounter = 0; /* Reset the sequence */
458             }
459             /* No rule found, reset */
460             else
461             {
462                 set_brks_to(s, brks, posLast, posCur, len,
463                             WORDBREAK_BREAK, get_next_char);
464                 riCounter = 1;
465             }
466             wbcSeqStart = wbcCur;
467             posLast = posCur;
468             break;
469 
470         case WBP_Double_Quote:
471             if (wbcLast == WBP_Hebrew_Letter) /* WB7b,c */
472             {
473                /* Go on */
474             }
475             else
476             {
477                 set_brks_to(s, brks, posLast, posCur, len,
478                             WORDBREAK_BREAK, get_next_char);
479                 wbcSeqStart = wbcCur;
480                 posLast = posCur;
481             }
482             break;
483 
484         case WBP_Any:
485             /* Allow breaks and reset */
486             set_brks_to(s, brks, posLast, posCur, len,
487                         WORDBREAK_BREAK, get_next_char);
488             wbcSeqStart = wbcCur;
489             posLast = posCur;
490             break;
491 
492         default:
493             /* Error, should never get here! */
494             assert(0);
495             break;
496         }
497 
498         wbcLast = wbcCur;
499         posCur = posNext;
500         ch = get_next_char(s, len, &posNext);
501     }
502 
503     /* WB2 */
504     set_brks_to(s, brks, posLast, posNext, len,
505                 WORDBREAK_BREAK, get_next_char);
506 }
507 
508 /**
509  * Sets the word breaking information for a UTF-8 input string.
510  *
511  * @param[in]  s     input UTF-8 string
512  * @param[in]  len   length of the input
513  * @param[in]  lang  language of the input (reserved for future use)
514  * @param[out] brks  pointer to the output breaking data, containing
515  *                   #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
516  *                   #WORDBREAK_INSIDEACHAR
517  */
set_wordbreaks_utf8(const utf8_t * s,size_t len,const char * lang,char * brks)518 void set_wordbreaks_utf8(
519         const utf8_t *s,
520         size_t len,
521         const char *lang,
522         char *brks)
523 {
524     set_wordbreaks(s, len, lang, brks,
525                    (get_next_char_t)ub_get_next_char_utf8);
526 }
527 
528 /**
529  * Sets the word breaking information for a UTF-16 input string.
530  *
531  * @param[in]  s     input UTF-16 string
532  * @param[in]  len   length of the input
533  * @param[in]  lang  language of the input (reserved for future use)
534  * @param[out] brks  pointer to the output breaking data, containing
535  *                   #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
536  *                   #WORDBREAK_INSIDEACHAR
537  */
set_wordbreaks_utf16(const utf16_t * s,size_t len,const char * lang,char * brks)538 void set_wordbreaks_utf16(
539         const utf16_t *s,
540         size_t len,
541         const char *lang,
542         char *brks)
543 {
544     set_wordbreaks(s, len, lang, brks,
545                    (get_next_char_t)ub_get_next_char_utf16);
546 }
547 
548 /**
549  * Sets the word breaking information for a UTF-32 input string.
550  *
551  * @param[in]  s     input UTF-32 string
552  * @param[in]  len   length of the input
553  * @param[in]  lang  language of the input (reserved for future use)
554  * @param[out] brks  pointer to the output breaking data, containing
555  *                   #WORDBREAK_BREAK, #WORDBREAK_NOBREAK, or
556  *                   #WORDBREAK_INSIDEACHAR
557  */
set_wordbreaks_utf32(const utf32_t * s,size_t len,const char * lang,char * brks)558 void set_wordbreaks_utf32(
559         const utf32_t *s,
560         size_t len,
561         const char *lang,
562         char *brks)
563 {
564     set_wordbreaks(s, len, lang, brks,
565                    (get_next_char_t)ub_get_next_char_utf32);
566 }
567