/*
 * Copyright 2017 Patrick O. Perry.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
17 #include <assert.h>
18 #include <stdio.h>
19 #include "private/graphbreak.h"
20 #include "utf8lite.h"
21
22
/*
 * Step the forward scan by one character. Expects a variable named `scan`
 * (pointer to the scanner) in the enclosing scope.
 *
 * The iterator position before the step is recorded in scan->ptr; it is
 * the boundary just before the character the iterator lands on. scan->prop
 * then receives that character's grapheme break property, or -1 when the
 * iterator could not advance.
 */
#define NEXT() \
	do { \
		scan->ptr = scan->iter.ptr; \
		if (!utf8lite_text_iter_advance(&scan->iter)) { \
			scan->prop = -1; \
		} else { \
			scan->prop = graph_break(scan->iter.current); \
		} \
	} while (0)
32
33
utf8lite_graphscan_make(struct utf8lite_graphscan * scan,const struct utf8lite_text * text)34 void utf8lite_graphscan_make(struct utf8lite_graphscan *scan,
35 const struct utf8lite_text *text)
36 {
37 utf8lite_text_iter_make(&scan->iter, text);
38 utf8lite_graphscan_reset(scan);
39 }
40
41
utf8lite_graphscan_reset(struct utf8lite_graphscan * scan)42 void utf8lite_graphscan_reset(struct utf8lite_graphscan *scan)
43 {
44 utf8lite_text_iter_reset(&scan->iter);
45 scan->current.text.ptr = (uint8_t *)scan->iter.ptr;
46 scan->current.text.attr = (scan->iter.text_attr
47 & ~UTF8LITE_TEXT_SIZE_MASK);
48 NEXT();
49 }
50
51
utf8lite_graphscan_skip(struct utf8lite_graphscan * scan)52 void utf8lite_graphscan_skip(struct utf8lite_graphscan *scan)
53 {
54 utf8lite_text_iter_skip(&scan->iter);
55 scan->current.text.ptr = (uint8_t *)scan->iter.ptr;
56 scan->current.text.attr = (scan->iter.text_attr
57 & ~UTF8LITE_TEXT_SIZE_MASK);
58 scan->prop = -1;
59 }
60
61
utf8lite_graphscan_advance(struct utf8lite_graphscan * scan)62 int utf8lite_graphscan_advance(struct utf8lite_graphscan *scan)
63 {
64 scan->current.text.ptr = (uint8_t *)scan->ptr;
65 scan->current.text.attr = (scan->iter.text_attr
66 & ~UTF8LITE_TEXT_SIZE_MASK);
67 Start:
68 // GB2: Break at the end of text
69 if (scan->prop < 0) {
70 goto Break;
71 }
72
73 switch ((enum graph_break_prop)scan->prop) {
74 case GRAPH_BREAK_CR:
75 NEXT();
76 goto CR;
77
78 case GRAPH_BREAK_CONTROL:
79 case GRAPH_BREAK_LF:
80 // Break after controls
81 // GB4: (Newline | LF) +
82 NEXT();
83 goto Break;
84
85 case GRAPH_BREAK_L:
86 NEXT();
87 goto L;
88
89 case GRAPH_BREAK_LV:
90 case GRAPH_BREAK_V:
91 NEXT();
92 goto V;
93
94 case GRAPH_BREAK_LVT:
95 case GRAPH_BREAK_T:
96 NEXT();
97 goto T;
98
99 case GRAPH_BREAK_PREPEND:
100 NEXT();
101 goto Prepend;
102
103 case GRAPH_BREAK_EXTENDED_PICTOGRAPHIC:
104 NEXT();
105 goto Extended_Pictographic;
106
107 case GRAPH_BREAK_REGIONAL_INDICATOR:
108 NEXT();
109 goto Regional_Indicator;
110
111 case GRAPH_BREAK_EXTEND:
112 case GRAPH_BREAK_SPACINGMARK:
113 case GRAPH_BREAK_ZWJ:
114 case GRAPH_BREAK_OTHER:
115 NEXT();
116 goto MaybeBreak;
117 }
118
119 assert(0 && "unhandled grapheme break property");
120
121 CR:
122 // GB3: Do not break within CRLF
123 // GB4: Otherwise break after controls
124 if (scan->prop == GRAPH_BREAK_LF) {
125 NEXT();
126 }
127 goto Break;
128
129 L:
130 // GB6: Do not break Hangul syllable sequences.
131 switch (scan->prop) {
132 case GRAPH_BREAK_L:
133 NEXT();
134 goto L;
135
136 case GRAPH_BREAK_V:
137 case GRAPH_BREAK_LV:
138 NEXT();
139 goto V;
140
141 case GRAPH_BREAK_LVT:
142 NEXT();
143 goto T;
144
145 default:
146 goto MaybeBreak;
147 }
148
149 V:
150 // GB7: Do not break Hangul syllable sequences.
151 switch (scan->prop) {
152 case GRAPH_BREAK_V:
153 NEXT();
154 goto V;
155
156 case GRAPH_BREAK_T:
157 NEXT();
158 goto T;
159
160 default:
161 goto MaybeBreak;
162 }
163
164 T:
165 // GB8: Do not break Hangul syllable sequences.
166 switch (scan->prop) {
167 case GRAPH_BREAK_T:
168 NEXT();
169 goto T;
170
171 default:
172 goto MaybeBreak;
173 }
174
175 Prepend:
176 switch (scan->prop) {
177 case GRAPH_BREAK_CONTROL:
178 case GRAPH_BREAK_CR:
179 case GRAPH_BREAK_LF:
180 // GB5: break before controls
181 goto Break;
182
183 default:
184 // GB9b: do not break after Prepend characters.
185 goto Start;
186 }
187
188 Extended_Pictographic:
189 // GB9: Do not break before extending characters
190 while (scan->prop == GRAPH_BREAK_EXTEND) {
191 NEXT();
192 }
193 // GB9: Do not break before ZWJ
194 if (scan->prop == GRAPH_BREAK_ZWJ) {
195 NEXT();
196 // GB11: Do not break within emoji modifier sequences
197 // or emoji zwj sequences.
198 if (scan->prop == GRAPH_BREAK_EXTENDED_PICTOGRAPHIC) {
199 NEXT();
200 goto Extended_Pictographic;
201 }
202 }
203 goto MaybeBreak;
204
205 Regional_Indicator:
206 // Do not break within emoji flag sequences. That is, do not break
207 // between regional indicator (RI) symbols if there is an odd number
208 // of RI characters before the break point
209 if (scan->prop == GRAPH_BREAK_REGIONAL_INDICATOR) {
210 // GB12/13: [^RI] RI * RI
211 NEXT();
212 }
213 goto MaybeBreak;
214
215 MaybeBreak:
216 // GB9: Do not break before extending characters or ZWJ.
217 // GB9a: Do not break before SpacingMark [extended grapheme clusters]
218 // GB999: Otherwise, break everywhere
219 switch (scan->prop) {
220 case GRAPH_BREAK_EXTEND:
221 case GRAPH_BREAK_SPACINGMARK:
222 case GRAPH_BREAK_ZWJ:
223 NEXT();
224 goto MaybeBreak;
225
226 default:
227 goto Break;
228 }
229
230 Break:
231 scan->current.text.attr |= (size_t)(scan->ptr - scan->current.text.ptr);
232 return (scan->ptr == scan->current.text.ptr) ? 0 : 1;
233 }
234
/*
 * Step the backward scan by one character. Expects a text iterator named
 * `prev` and an int named `prop` in the enclosing scope.
 *
 * `prop` receives the grapheme break property of the character `prev`
 * lands on, or -1 when the iterator could not retreat.
 */
#define PREV() \
	do { \
		if (!utf8lite_text_iter_retreat(&prev)) { \
			prop = -1; \
		} else { \
			prop = graph_break(prev.current); \
		} \
	} while (0)
243
244
regional_indicator_odd(const struct utf8lite_text_iter * prev)245 static int regional_indicator_odd(const struct utf8lite_text_iter *prev)
246 {
247 struct utf8lite_text_iter it = *prev;
248 int odd = 1, prop;
249
250 while (utf8lite_text_iter_retreat(&it)) {
251 prop = graph_break(it.current);
252 if (prop == GRAPH_BREAK_REGIONAL_INDICATOR) {
253 odd = odd ? 0 : 1;
254 } else {
255 return odd;
256 }
257 }
258
259 return odd;
260 }
261
262
follows_extended_pictographic(const struct utf8lite_text_iter * prev)263 static int follows_extended_pictographic(const struct utf8lite_text_iter *prev)
264 {
265 struct utf8lite_text_iter it = *prev;
266 int prop;
267
268 while (utf8lite_text_iter_retreat(&it)) {
269 prop = graph_break(it.current);
270 switch (prop) {
271 case GRAPH_BREAK_EXTENDED_PICTOGRAPHIC:
272 return 1;
273 case GRAPH_BREAK_EXTEND:
274 break;
275 default:
276 return 0;
277 }
278 }
279
280 return 0;
281 }
282
283
utf8lite_graphscan_retreat(struct utf8lite_graphscan * scan)284 int utf8lite_graphscan_retreat(struct utf8lite_graphscan *scan)
285 {
286 struct utf8lite_text_iter prev;
287 int prop;
288
289 // see if there is a previous character
290 prev = scan->iter;
291 if (!utf8lite_text_iter_retreat(&prev)) {
292 // already at the start
293 return 0;
294 }
295
296 // if so, start of current grapheme becomes end of previous
297 scan->current.text.attr = (scan->iter.text_attr
298 & ~UTF8LITE_TEXT_SIZE_MASK);
299 scan->ptr = scan->current.text.ptr;
300
301 // position iter after the last character, prev before
302 while (prev.ptr != scan->ptr) {
303 scan->iter = prev;
304 utf8lite_text_iter_retreat(&prev);
305 }
306
307 // update iterator property
308 if (scan->iter.current < 0) {
309 scan->prop = -1;
310 } else {
311 scan->prop = graph_break(scan->iter.current);
312 }
313
314 if (prev.current < 0) {
315 prop = -1;
316 } else {
317 prop = graph_break(prev.current);
318 }
319
320 Start:
321 // at the start of the text
322 if (prop < 0) {
323 goto Break;
324 }
325
326 switch ((enum graph_break_prop)prop) {
327 case GRAPH_BREAK_CONTROL:
328 case GRAPH_BREAK_CR:
329 // GB4: Break after controls
330 PREV();
331 goto Break;
332
333 case GRAPH_BREAK_LF:
334 PREV();
335 goto LF;
336
337 case GRAPH_BREAK_L:
338 case GRAPH_BREAK_LV:
339 case GRAPH_BREAK_LVT:
340 PREV();
341 goto L;
342
343 case GRAPH_BREAK_V:
344 PREV();
345 goto V;
346
347 case GRAPH_BREAK_T:
348 PREV();
349 goto T;
350
351 case GRAPH_BREAK_EXTEND:
352 case GRAPH_BREAK_SPACINGMARK:
353 case GRAPH_BREAK_ZWJ:
354 PREV();
355 goto Extend;
356
357 case GRAPH_BREAK_EXTENDED_PICTOGRAPHIC:
358 PREV();
359 goto Extended_Pictographic;
360
361 case GRAPH_BREAK_REGIONAL_INDICATOR:
362 PREV();
363 goto Regional_Indicator;
364
365 case GRAPH_BREAK_PREPEND:
366 case GRAPH_BREAK_OTHER:
367 PREV();
368 goto MaybeBreak;
369 }
370 assert(0 && "unhandled graph break property");
371
372 LF:
373 // GB3: Do not break between a CR and LF
374 // GB4: Otherwise, break after controls.
375 if (prop == GRAPH_BREAK_CR) {
376 PREV();
377 }
378 goto Break;
379
380 L:
381 // GB6: Do not break Hangul syllable sequences
382 switch (prop) {
383 case GRAPH_BREAK_L:
384 PREV();
385 goto L;
386
387 default:
388 goto MaybeBreak;
389 }
390
391 V:
392 // GB6, GB7: Do not break Hangul syllable sequences
393 switch (prop) {
394 case GRAPH_BREAK_V:
395 PREV();
396 goto V;
397
398 case GRAPH_BREAK_L:
399 case GRAPH_BREAK_LV:
400 PREV();
401 goto L;
402
403 default:
404 goto MaybeBreak;
405 }
406
407 T:
408 // GB6, GB7, GB8: Do not break Hangul syllable sequences
409 switch (prop) {
410 case GRAPH_BREAK_LV:
411 case GRAPH_BREAK_LVT:
412 PREV();
413 goto L;
414
415 case GRAPH_BREAK_V:
416 PREV();
417 goto V;
418
419 case GRAPH_BREAK_T:
420 PREV();
421 goto T;
422
423 default:
424 goto MaybeBreak;
425 }
426
427 Extend:
428 switch (prop) {
429 // GB4: Break after controls
430 case GRAPH_BREAK_CONTROL:
431 case GRAPH_BREAK_CR:
432 case GRAPH_BREAK_LF:
433 goto Break;
434
435 // GB9: Do not break before extending characters or ZWJ.
436 // GB9a: Do not break before SpacingMarks
437 default:
438 goto Start;
439 }
440
441 Extended_Pictographic:
442 // GB11: Do not break within emoji modifier sequences or
443 // emoji zwj sequences.
444 if (prop == GRAPH_BREAK_ZWJ && follows_extended_pictographic(&prev)) {
445 PREV(); // ZWJ
446 while (prop == GRAPH_BREAK_EXTEND) { // Extend*
447 PREV();
448 }
449
450 PREV();
451 goto Extended_Pictographic;
452 }
453 goto MaybeBreak;
454
455 Regional_Indicator:
456 // GB12, GB13: Do not break within emoji flag sequences
457 if (prop == GRAPH_BREAK_REGIONAL_INDICATOR) {
458 if (regional_indicator_odd(&prev)) {
459 PREV();
460 }
461 }
462 goto MaybeBreak;
463
464 MaybeBreak:
465 switch (prop) {
466 // GB9b: Do not break after Prepend characters
467 case GRAPH_BREAK_PREPEND:
468 PREV();
469 goto MaybeBreak;
470
471 default:
472 goto Break;
473 }
474
475 Break:
476 scan->current.text.ptr = (uint8_t *)prev.ptr;
477 scan->current.text.attr |= (size_t)(scan->ptr - scan->current.text.ptr);
478 return (scan->ptr == scan->current.text.ptr) ? 0 : 1;
479 }
480