1 /* Line breaking of UTF-8 strings. 2 Copyright (C) 2001-2003, 2006-2014 Free Software Foundation, Inc. 3 Written by Bruno Haible <bruno@clisp.org>, 2001. 4 5 This program is free software: you can redistribute it and/or modify it 6 under the terms of the GNU Lesser General Public License as published 7 by the Free Software Foundation; either version 3 of the License, or 8 (at your option) any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 Lesser General Public License for more details. 14 15 You should have received a copy of the GNU Lesser General Public License 16 along with this program. If not, see <http://www.gnu.org/licenses/>. */ 17 18 #include <config.h> 19 20 /* Specification. */ 21 #include "unilbrk.h" 22 23 #include <stdlib.h> 24 #include <string.h> 25 26 #include "unilbrk/lbrktables.h" 27 #include "uniwidth/cjk.h" 28 #include "unistr.h" 29 30 void 31 u8_possible_linebreaks (const uint8_t *s, size_t n, const char *encoding, char *p) 32 { 33 int LBP_AI_REPLACEMENT = (is_cjk_encoding (encoding) ? LBP_ID : LBP_AL); 34 const uint8_t *s_end = s + n; 35 int last_prop = LBP_BK; /* line break property of last non-space character */ 36 char *seen_space = NULL; /* Was a space seen after the last non-space character? */ 37 char *seen_space2 = NULL; /* At least two spaces after the last non-space? */ 38 39 /* Don't break inside multibyte characters. */ 40 memset (p, UC_BREAK_PROHIBITED, n); 41 42 while (s < s_end) 43 { 44 ucs4_t uc; 45 int count = u8_mbtouc_unsafe (&uc, s, s_end - s); 46 int prop = unilbrkprop_lookup (uc); 47 48 if (prop == LBP_BK) 49 { 50 /* Mandatory break. */ 51 *p = UC_BREAK_MANDATORY; 52 last_prop = LBP_BK; 53 seen_space = NULL; 54 seen_space2 = NULL; 55 } 56 else 57 { 58 char *q; 59 60 /* Resolve property values whose behaviour is not fixed. */ 61 switch (prop) 62 { 63 case LBP_AI: 64 /* Resolve ambiguous. */ 65 prop = LBP_AI_REPLACEMENT; 66 break; 67 case LBP_CB: 68 /* This is arbitrary. */ 69 prop = LBP_ID; 70 break; 71 case LBP_SA: 72 /* We don't handle complex scripts yet. 73 Treat LBP_SA like LBP_XX. */ 74 case LBP_XX: 75 /* This is arbitrary. */ 76 prop = LBP_AL; 77 break; 78 } 79 80 /* Deal with spaces and combining characters. */ 81 q = p; 82 if (prop == LBP_SP) 83 { 84 /* Don't break just before a space. */ 85 *p = UC_BREAK_PROHIBITED; 86 seen_space2 = seen_space; 87 seen_space = p; 88 } 89 else if (prop == LBP_ZW) 90 { 91 /* Don't break just before a zero-width space. */ 92 *p = UC_BREAK_PROHIBITED; 93 last_prop = LBP_ZW; 94 seen_space = NULL; 95 seen_space2 = NULL; 96 } 97 else if (prop == LBP_CM) 98 { 99 /* Don't break just before a combining character, except immediately after a 100 zero-width space. */ 101 if (last_prop == LBP_ZW) 102 { 103 /* Break after zero-width space. */ 104 *p = UC_BREAK_POSSIBLE; 105 /* A combining character turns a preceding space into LBP_ID. */ 106 last_prop = LBP_ID; 107 } 108 else 109 { 110 *p = UC_BREAK_PROHIBITED; 111 /* A combining character turns a preceding space into LBP_ID. */ 112 if (seen_space != NULL) 113 { 114 q = seen_space; 115 seen_space = seen_space2; 116 prop = LBP_ID; 117 goto lookup_via_table; 118 } 119 } 120 } 121 else 122 { 123 lookup_via_table: 124 /* prop must be usable as an index for table 7.3 of UTR #14. */ 125 if (!(prop >= 0 && prop < sizeof (unilbrk_table) / sizeof (unilbrk_table[0]))) 126 abort (); 127 128 if (last_prop == LBP_BK) 129 { 130 /* Don't break at the beginning of a line. */ 131 *q = UC_BREAK_PROHIBITED; 132 } 133 else if (last_prop == LBP_ZW) 134 { 135 /* Break after zero-width space. */ 136 *q = UC_BREAK_POSSIBLE; 137 } 138 else 139 { 140 switch (unilbrk_table [last_prop] [prop]) 141 { 142 case D: 143 *q = UC_BREAK_POSSIBLE; 144 break; 145 case I: 146 *q = (seen_space != NULL ? UC_BREAK_POSSIBLE : UC_BREAK_PROHIBITED); 147 break; 148 case P: 149 *q = UC_BREAK_PROHIBITED; 150 break; 151 default: 152 abort (); 153 } 154 } 155 last_prop = prop; 156 seen_space = NULL; 157 seen_space2 = NULL; 158 } 159 } 160 161 s += count; 162 p += count; 163 } 164 } 165 166 167 #ifdef TEST 168 169 #include <stdio.h> 170 #include <string.h> 171 172 /* Read the contents of an input stream, and return it, terminated with a NUL 173 byte. */ 174 char * 175 read_file (FILE *stream) 176 { 177 #define BUFSIZE 4096 178 char *buf = NULL; 179 int alloc = 0; 180 int size = 0; 181 int count; 182 183 while (! feof (stream)) 184 { 185 if (size + BUFSIZE > alloc) 186 { 187 alloc = alloc + alloc / 2; 188 if (alloc < size + BUFSIZE) 189 alloc = size + BUFSIZE; 190 buf = realloc (buf, alloc); 191 if (buf == NULL) 192 { 193 fprintf (stderr, "out of memory\n"); 194 exit (1); 195 } 196 } 197 count = fread (buf + size, 1, BUFSIZE, stream); 198 if (count == 0) 199 { 200 if (ferror (stream)) 201 { 202 perror ("fread"); 203 exit (1); 204 } 205 } 206 else 207 size += count; 208 } 209 buf = realloc (buf, size + 1); 210 if (buf == NULL) 211 { 212 fprintf (stderr, "out of memory\n"); 213 exit (1); 214 } 215 buf[size] = '\0'; 216 return buf; 217 #undef BUFSIZE 218 } 219 220 int 221 main (int argc, char * argv[]) 222 { 223 if (argc == 1) 224 { 225 /* Display all the break opportunities in the input string. */ 226 char *input = read_file (stdin); 227 int length = strlen (input); 228 char *breaks = malloc (length); 229 int i; 230 231 u8_possible_linebreaks ((uint8_t *) input, length, "UTF-8", breaks); 232 233 for (i = 0; i < length; i++) 234 { 235 switch (breaks[i]) 236 { 237 case UC_BREAK_POSSIBLE: 238 /* U+2027 in UTF-8 encoding */ 239 putc (0xe2, stdout); putc (0x80, stdout); putc (0xa7, stdout); 240 break; 241 case UC_BREAK_MANDATORY: 242 /* U+21B2 (or U+21B5) in UTF-8 encoding */ 243 putc (0xe2, stdout); putc (0x86, stdout); putc (0xb2, stdout); 244 break; 245 case UC_BREAK_PROHIBITED: 246 break; 247 default: 248 abort (); 249 } 250 putc (input[i], stdout); 251 } 252 253 free (breaks); 254 255 return 0; 256 } 257 else 258 return 1; 259 } 260 261 #endif /* TEST */ 262