1 /* Word breaks in UTF-8/UTF-16/UTF-32 strings.  -*- coding: utf-8 -*-
2    Copyright (C) 2009-2020 Free Software Foundation, Inc.
3    Written by Bruno Haible <bruno@clisp.org>, 2009.
4 
5    This program is free software: you can redistribute it and/or modify it
6    under the terms of the GNU General Public License as published
7    by the Free Software Foundation; either version 3 of the License, or
8    (at your option) any later version.
9 
10    This program is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13    General Public License for more details.
14 
15    You should have received a copy of the GNU General Public License
16    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17 
18 void
19 FUNC (const UNIT *s, size_t n, char *p)
20 {
21   if (n > 0)
22     {
23       const UNIT *s_end = s + n;
24 
25       /* Word break property of the last character.
26          -1 at the very beginning of the string.  */
27       int last_char_prop = -1;
28 
29       /* Format and Extend characters are ignored; this means, the mostly used
30          unit is the complex character (= character with subsequent ignored
31          characters).
32          Word break property of the last complex character.
33          -1 at the very beginning of the string.  */
34       int last_compchar_prop = -1;
35       char *last_compchar_ptr = NULL;
36 
37       /* For recognizing rules involving 3 complex characters:
38          Word break property of the second-to-last complex character.
39          -1 at the very beginning of the string.  */
40       int secondlast_compchar_prop = -1;
41 
42       size_t ri_count = 0;
43 
44       /* Don't break inside multibyte characters.  */
45       memset (p, 0, n);
46 
47       while (s < s_end)
48         {
49           ucs4_t uc;
50           int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s);
51           int prop = uc_wordbreak_property (uc);
52 
53           /* No break at the start of the string.  */
54           if (last_char_prop >= 0)
55             {
56               /* No break between CR and LF (WB3).  */
57               if (last_char_prop == WBP_CR && prop == WBP_LF)
58                 /* *p = 0 */;
59               /* Break before and after newlines (WB3a, WB3b).  */
60               else if ((last_char_prop == WBP_CR
61                         || last_char_prop == WBP_LF
62                         || last_char_prop == WBP_NEWLINE)
63                        || (prop == WBP_CR
64                            || prop == WBP_LF
65                            || prop == WBP_NEWLINE))
66                 *p = 1;
67               /* No break within emoji zwj sequence (WB3c).  */
68               else if (last_char_prop == WBP_ZWJ &&
69                        (prop == WBP_GAZ || prop == WBP_EBG))
70                 /* *p = 0 */;
71               /* Ignore Format and Extend characters.  */
72               else if (!(prop == WBP_EXTEND || prop == WBP_FORMAT || prop == WBP_ZWJ))
73                 {
74                   /* No break in these situations (see UAX #29):
75 
76                       secondlast          last             current
77 
78     (ALetter | HL)   (MidLetter | MidNumLet | SQ) × (ALetter | HL)      (WB7)
79     (ALetter | HL) × (MidLetter | MidNumLet | SQ)   (ALetter | HL)      (WB6)
80                   Numeric   (MidNum | MidNumLet | SQ)    × Numeric      (WB11)
81                   Numeric × (MidNum | MidNumLet | SQ)      Numeric      (WB12)
82                                                         HL × DQ HL      (WB7b)
83                                                         HL DQ × HL      (WB7c)
84                                                 ^ (RI RI)* RI × RI      (WB15)
85                                             [^RI] (RI RI)* RI × RI      (WB16)
86                    */
87                   /* No break across certain punctuation.  Also, disable word
88                      breaks that were recognized earlier (due to lookahead of
89                      only one complex character).  */
90                   if (((prop == WBP_ALETTER
91                         || prop == WBP_HL)
92                        && (last_compchar_prop == WBP_MIDLETTER
93                            || last_compchar_prop == WBP_MIDNUMLET
94                            || last_compchar_prop == WBP_SQ)
95                        && (secondlast_compchar_prop == WBP_ALETTER
96                            || secondlast_compchar_prop == WBP_HL))
97                       || (prop == WBP_NUMERIC
98                           && (last_compchar_prop == WBP_MIDNUM
99                               || last_compchar_prop == WBP_MIDNUMLET
100                               || last_compchar_prop == WBP_SQ)
101                           && secondlast_compchar_prop == WBP_NUMERIC)
102                       || (prop == WBP_HL
103                           && last_compchar_prop == WBP_DQ
104                           && secondlast_compchar_prop == WBP_HL))
105                     {
106                       *last_compchar_ptr = 0;
107                       /* *p = 0; */
108                     }
109                   /* Break before RI, if odd number of RI's are
110                      preceding (WB15, WB16).  */
111                   else if (last_compchar_prop == WBP_RI && prop == WBP_RI)
112                     {
113                       if (ri_count % 2 == 0)
114                         *p = 1;
115                       /* else *p = 0 */
116                     }
117                   /* Break after Format and Extend character.  */
118                   else if (last_compchar_prop == WBP_EXTEND
119                            || last_compchar_prop == WBP_FORMAT)
120                     *p = 1;
121                   else
122                     {
123                       int last_compchar_index =
124                         uniwbrk_prop_index[last_compchar_prop];
125                       int index = uniwbrk_prop_index[prop];
126 
127                       /* Break between unknown pair (WB999).  */
128                       if (last_compchar_index < 0 || index < 0)
129                         *p = 1;
130                       /* Perform a single table lookup.  */
131                       else if (uniwbrk_table[last_compchar_index][index])
132                         *p = 1;
133                       /* else *p = 0; */
134                     }
135                 }
136             }
137 
138           last_char_prop = prop;
139 
140           /* Ignore Format and Extend characters, except at the
141              start of the line.  */
142           if (last_compchar_prop < 0
143               || last_compchar_prop == WBP_CR
144               || last_compchar_prop == WBP_LF
145               || last_compchar_prop == WBP_NEWLINE
146               || !(prop == WBP_EXTEND || prop == WBP_FORMAT || prop == WBP_ZWJ))
147             {
148               secondlast_compchar_prop = last_compchar_prop;
149               last_compchar_prop = prop;
150               last_compchar_ptr = p;
151 
152               if (prop == WBP_RI)
153                 ri_count++;
154               else
155                 ri_count = 0;
156             }
157 
158           s += count;
159           p += count;
160         }
161     }
162 }
163