1 /* Word breaks in UTF-8/UTF-16/UTF-32 strings. -*- coding: utf-8 -*- 2 Copyright (C) 2009-2020 Free Software Foundation, Inc. 3 Written by Bruno Haible <bruno@clisp.org>, 2009. 4 5 This program is free software: you can redistribute it and/or modify it 6 under the terms of the GNU General Public License as published 7 by the Free Software Foundation; either version 3 of the License, or 8 (at your option) any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with this program. If not, see <https://www.gnu.org/licenses/>. */ 17 18 void 19 FUNC (const UNIT *s, size_t n, char *p) 20 { 21 if (n > 0) 22 { 23 const UNIT *s_end = s + n; 24 25 /* Word break property of the last character. 26 -1 at the very beginning of the string. */ 27 int last_char_prop = -1; 28 29 /* Format and Extend characters are ignored; this means, the mostly used 30 unit is the complex character (= character with subsequent ignored 31 characters). 32 Word break property of the last complex character. 33 -1 at the very beginning of the string. */ 34 int last_compchar_prop = -1; 35 char *last_compchar_ptr = NULL; 36 37 /* For recognizing rules involving 3 complex characters: 38 Word break property of the second-to-last complex character. 39 -1 at the very beginning of the string. */ 40 int secondlast_compchar_prop = -1; 41 42 size_t ri_count = 0; 43 44 /* Don't break inside multibyte characters. */ 45 memset (p, 0, n); 46 47 while (s < s_end) 48 { 49 ucs4_t uc; 50 int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s); 51 int prop = uc_wordbreak_property (uc); 52 53 /* No break at the start of the string. */ 54 if (last_char_prop >= 0) 55 { 56 /* No break between CR and LF (WB3). */ 57 if (last_char_prop == WBP_CR && prop == WBP_LF) 58 /* *p = 0 */; 59 /* Break before and after newlines (WB3a, WB3b). */ 60 else if ((last_char_prop == WBP_CR 61 || last_char_prop == WBP_LF 62 || last_char_prop == WBP_NEWLINE) 63 || (prop == WBP_CR 64 || prop == WBP_LF 65 || prop == WBP_NEWLINE)) 66 *p = 1; 67 /* No break within emoji zwj sequence (WB3c). */ 68 else if (last_char_prop == WBP_ZWJ && 69 (prop == WBP_GAZ || prop == WBP_EBG)) 70 /* *p = 0 */; 71 /* Ignore Format and Extend characters. */ 72 else if (!(prop == WBP_EXTEND || prop == WBP_FORMAT || prop == WBP_ZWJ)) 73 { 74 /* No break in these situations (see UAX #29): 75 76 secondlast last current 77 78 (ALetter | HL) (MidLetter | MidNumLet | SQ) × (ALetter | HL) (WB7) 79 (ALetter | HL) × (MidLetter | MidNumLet | SQ) (ALetter | HL) (WB6) 80 Numeric (MidNum | MidNumLet | SQ) × Numeric (WB11) 81 Numeric × (MidNum | MidNumLet | SQ) Numeric (WB12) 82 HL × DQ HL (WB7b) 83 HL DQ × HL (WB7c) 84 ^ (RI RI)* RI × RI (WB15) 85 [^RI] (RI RI)* RI × RI (WB16) 86 */ 87 /* No break across certain punctuation. Also, disable word 88 breaks that were recognized earlier (due to lookahead of 89 only one complex character). */ 90 if (((prop == WBP_ALETTER 91 || prop == WBP_HL) 92 && (last_compchar_prop == WBP_MIDLETTER 93 || last_compchar_prop == WBP_MIDNUMLET 94 || last_compchar_prop == WBP_SQ) 95 && (secondlast_compchar_prop == WBP_ALETTER 96 || secondlast_compchar_prop == WBP_HL)) 97 || (prop == WBP_NUMERIC 98 && (last_compchar_prop == WBP_MIDNUM 99 || last_compchar_prop == WBP_MIDNUMLET 100 || last_compchar_prop == WBP_SQ) 101 && secondlast_compchar_prop == WBP_NUMERIC) 102 || (prop == WBP_HL 103 && last_compchar_prop == WBP_DQ 104 && secondlast_compchar_prop == WBP_HL)) 105 { 106 *last_compchar_ptr = 0; 107 /* *p = 0; */ 108 } 109 /* Break before RI, if odd number of RI's are 110 preceding (WB15, WB16). */ 111 else if (last_compchar_prop == WBP_RI && prop == WBP_RI) 112 { 113 if (ri_count % 2 == 0) 114 *p = 1; 115 /* else *p = 0 */ 116 } 117 /* Break after Format and Extend character. */ 118 else if (last_compchar_prop == WBP_EXTEND 119 || last_compchar_prop == WBP_FORMAT) 120 *p = 1; 121 else 122 { 123 int last_compchar_index = 124 uniwbrk_prop_index[last_compchar_prop]; 125 int index = uniwbrk_prop_index[prop]; 126 127 /* Break between unknown pair (WB999). */ 128 if (last_compchar_index < 0 || index < 0) 129 *p = 1; 130 /* Perform a single table lookup. */ 131 else if (uniwbrk_table[last_compchar_index][index]) 132 *p = 1; 133 /* else *p = 0; */ 134 } 135 } 136 } 137 138 last_char_prop = prop; 139 140 /* Ignore Format and Extend characters, except at the 141 start of the line. */ 142 if (last_compchar_prop < 0 143 || last_compchar_prop == WBP_CR 144 || last_compchar_prop == WBP_LF 145 || last_compchar_prop == WBP_NEWLINE 146 || !(prop == WBP_EXTEND || prop == WBP_FORMAT || prop == WBP_ZWJ)) 147 { 148 secondlast_compchar_prop = last_compchar_prop; 149 last_compchar_prop = prop; 150 last_compchar_ptr = p; 151 152 if (prop == WBP_RI) 153 ri_count++; 154 else 155 ri_count = 0; 156 } 157 158 s += count; 159 p += count; 160 } 161 } 162 } 163