1 /*
2  * Copyright 2017 Patrick O. Perry.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <assert.h>
18 #include <stdio.h>
19 #include "private/graphbreak.h"
20 #include "utf8lite.h"
21 
22 
/*
 * Step the enclosing function's `scan` forward by one code point.
 *
 * scan->ptr is set to the iterator position *before* the step. scan->prop
 * then receives the grapheme break property of the code point that was
 * just read, or -1 when the iterator reports that it could not advance.
 */
#define NEXT() \
	do { \
		scan->ptr = scan->iter.ptr; \
		scan->prop = utf8lite_text_iter_advance(&scan->iter) \
			? (int)graph_break(scan->iter.current) \
			: -1; \
	} while (0)
32 
33 
utf8lite_graphscan_make(struct utf8lite_graphscan * scan,const struct utf8lite_text * text)34 void utf8lite_graphscan_make(struct utf8lite_graphscan *scan,
35 			     const struct utf8lite_text *text)
36 {
37 	utf8lite_text_iter_make(&scan->iter, text);
38 	utf8lite_graphscan_reset(scan);
39 }
40 
41 
utf8lite_graphscan_reset(struct utf8lite_graphscan * scan)42 void utf8lite_graphscan_reset(struct utf8lite_graphscan *scan)
43 {
44 	utf8lite_text_iter_reset(&scan->iter);
45 	scan->current.text.ptr = (uint8_t *)scan->iter.ptr;
46 	scan->current.text.attr = (scan->iter.text_attr
47 				   & ~UTF8LITE_TEXT_SIZE_MASK);
48 	NEXT();
49 }
50 
51 
utf8lite_graphscan_skip(struct utf8lite_graphscan * scan)52 void utf8lite_graphscan_skip(struct utf8lite_graphscan *scan)
53 {
54 	utf8lite_text_iter_skip(&scan->iter);
55 	scan->current.text.ptr = (uint8_t *)scan->iter.ptr;
56 	scan->current.text.attr = (scan->iter.text_attr
57 				   & ~UTF8LITE_TEXT_SIZE_MASK);
58 	scan->prop = -1;
59 }
60 
61 
utf8lite_graphscan_advance(struct utf8lite_graphscan * scan)62 int utf8lite_graphscan_advance(struct utf8lite_graphscan *scan)
63 {
64 	scan->current.text.ptr = (uint8_t *)scan->ptr;
65 	scan->current.text.attr = (scan->iter.text_attr
66 				   & ~UTF8LITE_TEXT_SIZE_MASK);
67 Start:
68 	// GB2: Break at the end of text
69 	if (scan->prop < 0) {
70 		goto Break;
71 	}
72 
73 	switch ((enum graph_break_prop)scan->prop) {
74 	case GRAPH_BREAK_CR:
75 		NEXT();
76 		goto CR;
77 
78 	case GRAPH_BREAK_CONTROL:
79 	case GRAPH_BREAK_LF:
80 		// Break after controls
81 		// GB4: (Newline | LF) +
82 		NEXT();
83 		goto Break;
84 
85 	case GRAPH_BREAK_L:
86 		NEXT();
87 		goto L;
88 
89 	case GRAPH_BREAK_LV:
90 	case GRAPH_BREAK_V:
91 		NEXT();
92 		goto V;
93 
94 	case GRAPH_BREAK_LVT:
95 	case GRAPH_BREAK_T:
96 		NEXT();
97 		goto T;
98 
99 	case GRAPH_BREAK_PREPEND:
100 		NEXT();
101 		goto Prepend;
102 
103 	case GRAPH_BREAK_EXTENDED_PICTOGRAPHIC:
104 		NEXT();
105 		goto Extended_Pictographic;
106 
107 	case GRAPH_BREAK_REGIONAL_INDICATOR:
108 		NEXT();
109 		goto Regional_Indicator;
110 
111 	case GRAPH_BREAK_EXTEND:
112 	case GRAPH_BREAK_SPACINGMARK:
113 	case GRAPH_BREAK_ZWJ:
114 	case GRAPH_BREAK_OTHER:
115 		NEXT();
116 		goto MaybeBreak;
117 	}
118 
119 	assert(0 && "unhandled grapheme break property");
120 
121 CR:
122 	// GB3: Do not break within CRLF
123 	// GB4: Otherwise break after controls
124 	if (scan->prop == GRAPH_BREAK_LF) {
125 		NEXT();
126 	}
127 	goto Break;
128 
129 L:
130 	// GB6: Do not break Hangul syllable sequences.
131 	switch (scan->prop) {
132 	case GRAPH_BREAK_L:
133 		NEXT();
134 		goto L;
135 
136 	case GRAPH_BREAK_V:
137 	case GRAPH_BREAK_LV:
138 		NEXT();
139 		goto V;
140 
141 	case GRAPH_BREAK_LVT:
142 		NEXT();
143 		goto T;
144 
145 	default:
146 		goto MaybeBreak;
147 	}
148 
149 V:
150 	// GB7: Do not break Hangul syllable sequences.
151 	switch (scan->prop) {
152 	case GRAPH_BREAK_V:
153 		NEXT();
154 		goto V;
155 
156 	case GRAPH_BREAK_T:
157 		NEXT();
158 		goto T;
159 
160 	default:
161 		goto MaybeBreak;
162 	}
163 
164 T:
165 	// GB8: Do not break Hangul syllable sequences.
166 	switch (scan->prop) {
167 	case GRAPH_BREAK_T:
168 		NEXT();
169 		goto T;
170 
171 	default:
172 		goto MaybeBreak;
173 	}
174 
175 Prepend:
176 	switch (scan->prop) {
177 	case GRAPH_BREAK_CONTROL:
178 	case GRAPH_BREAK_CR:
179 	case GRAPH_BREAK_LF:
180 		// GB5: break before controls
181 		goto Break;
182 
183 	default:
184 		// GB9b: do not break after Prepend characters.
185 		goto Start;
186 	}
187 
188 Extended_Pictographic:
189 	// GB9:  Do not break before extending characters
190 	while (scan->prop == GRAPH_BREAK_EXTEND) {
191 		NEXT();
192 	}
193     // GB9: Do not break before ZWJ
194     if (scan->prop == GRAPH_BREAK_ZWJ) {
195         NEXT();
196         // GB11: Do not break within emoji modifier sequences
197         // or emoji zwj sequences.
198         if (scan->prop == GRAPH_BREAK_EXTENDED_PICTOGRAPHIC) {
199             NEXT();
200             goto Extended_Pictographic;
201         }
202     }
203 	goto MaybeBreak;
204 
205 Regional_Indicator:
206 	// Do not break within emoji flag sequences. That is, do not break
207 	// between regional indicator (RI) symbols if there is an odd number
208 	// of RI characters before the break point
209 	if (scan->prop == GRAPH_BREAK_REGIONAL_INDICATOR) {
210 		// GB12/13: [^RI] RI * RI
211 		NEXT();
212 	}
213 	goto MaybeBreak;
214 
215 MaybeBreak:
216 	// GB9: Do not break before extending characters or ZWJ.
217 	// GB9a: Do not break before SpacingMark [extended grapheme clusters]
218 	// GB999: Otherwise, break everywhere
219 	switch (scan->prop) {
220 	case GRAPH_BREAK_EXTEND:
221 	case GRAPH_BREAK_SPACINGMARK:
222 	case GRAPH_BREAK_ZWJ:
223 		NEXT();
224 		goto MaybeBreak;
225 
226 	default:
227 		goto Break;
228 	}
229 
230 Break:
231 	scan->current.text.attr |= (size_t)(scan->ptr - scan->current.text.ptr);
232 	return (scan->ptr == scan->current.text.ptr) ? 0 : 1;
233 }
234 
/*
 * Step the enclosing function's local iterator `prev` back by one code
 * point and store in the local `prop` the grapheme break property of the
 * code point it now holds, or -1 when the iterator reports that it could
 * not retreat.
 */
#define PREV() \
	do { \
		prop = utf8lite_text_iter_retreat(&prev) \
			? (int)graph_break(prev.current) \
			: -1; \
	} while (0)
243 
244 
regional_indicator_odd(const struct utf8lite_text_iter * prev)245 static int regional_indicator_odd(const struct utf8lite_text_iter *prev)
246 {
247 	struct utf8lite_text_iter it = *prev;
248 	int odd = 1, prop;
249 
250 	while (utf8lite_text_iter_retreat(&it)) {
251 		prop = graph_break(it.current);
252 		if (prop == GRAPH_BREAK_REGIONAL_INDICATOR) {
253 			odd = odd ? 0 : 1;
254 		} else {
255 			return odd;
256 		}
257 	}
258 
259 	return odd;
260 }
261 
262 
follows_extended_pictographic(const struct utf8lite_text_iter * prev)263 static int follows_extended_pictographic(const struct utf8lite_text_iter *prev)
264 {
265 	struct utf8lite_text_iter it = *prev;
266 	int prop;
267 
268 	while (utf8lite_text_iter_retreat(&it)) {
269 		prop = graph_break(it.current);
270 		switch (prop) {
271 		case GRAPH_BREAK_EXTENDED_PICTOGRAPHIC:
272 			return 1;
273 		case GRAPH_BREAK_EXTEND:
274 			break;
275 		default:
276 			return 0;
277 		}
278 	}
279 
280 	return 0;
281 }
282 
283 
utf8lite_graphscan_retreat(struct utf8lite_graphscan * scan)284 int utf8lite_graphscan_retreat(struct utf8lite_graphscan *scan)
285 {
286 	struct utf8lite_text_iter prev;
287 	int prop;
288 
289 	// see if there is a previous character
290 	prev = scan->iter;
291 	if (!utf8lite_text_iter_retreat(&prev)) {
292 		// already at the start
293 		return 0;
294 	}
295 
296 	// if so, start of current grapheme becomes end of previous
297 	scan->current.text.attr = (scan->iter.text_attr
298 				   & ~UTF8LITE_TEXT_SIZE_MASK);
299 	scan->ptr = scan->current.text.ptr;
300 
301 	// position iter after the last character, prev before
302 	while (prev.ptr != scan->ptr) {
303 		scan->iter = prev;
304 		utf8lite_text_iter_retreat(&prev);
305 	}
306 
307 	// update iterator property
308 	if (scan->iter.current < 0) {
309 		scan->prop = -1;
310 	} else {
311 		scan->prop = graph_break(scan->iter.current);
312 	}
313 
314 	if (prev.current < 0) {
315 		prop = -1;
316 	} else {
317 		prop = graph_break(prev.current);
318 	}
319 
320 Start:
321 	// at the start of the text
322 	if (prop < 0) {
323 		goto Break;
324 	}
325 
326 	switch ((enum graph_break_prop)prop) {
327 	case GRAPH_BREAK_CONTROL:
328 	case GRAPH_BREAK_CR:
329 		// GB4: Break after controls
330 		PREV();
331 		goto Break;
332 
333 	case GRAPH_BREAK_LF:
334 		PREV();
335 		goto LF;
336 
337 	case GRAPH_BREAK_L:
338 	case GRAPH_BREAK_LV:
339 	case GRAPH_BREAK_LVT:
340 		PREV();
341 		goto L;
342 
343 	case GRAPH_BREAK_V:
344 		PREV();
345 		goto V;
346 
347 	case GRAPH_BREAK_T:
348 		PREV();
349 		goto T;
350 
351 	case GRAPH_BREAK_EXTEND:
352 	case GRAPH_BREAK_SPACINGMARK:
353 	case GRAPH_BREAK_ZWJ:
354 		PREV();
355 		goto Extend;
356 
357 	case GRAPH_BREAK_EXTENDED_PICTOGRAPHIC:
358 		PREV();
359 		goto Extended_Pictographic;
360 
361 	case GRAPH_BREAK_REGIONAL_INDICATOR:
362 		PREV();
363 		goto Regional_Indicator;
364 
365 	case GRAPH_BREAK_PREPEND:
366 	case GRAPH_BREAK_OTHER:
367 		PREV();
368 		goto MaybeBreak;
369 	}
370 	assert(0 && "unhandled graph break property");
371 
372 LF:
373 	// GB3: Do not break between a CR and LF
374 	// GB4: Otherwise, break after controls.
375 	if (prop == GRAPH_BREAK_CR) {
376 		PREV();
377 	}
378 	goto Break;
379 
380 L:
381 	// GB6: Do not break Hangul syllable sequences
382 	switch (prop) {
383 	case GRAPH_BREAK_L:
384 		PREV();
385 		goto L;
386 
387 	default:
388 		goto MaybeBreak;
389 	}
390 
391 V:
392 	// GB6, GB7: Do not break Hangul syllable sequences
393 	switch (prop) {
394 	case GRAPH_BREAK_V:
395 		PREV();
396 		goto V;
397 
398 	case GRAPH_BREAK_L:
399 	case GRAPH_BREAK_LV:
400 		PREV();
401 		goto L;
402 
403 	default:
404 		goto MaybeBreak;
405 	}
406 
407 T:
408 	// GB6, GB7, GB8: Do not break Hangul syllable sequences
409 	switch (prop) {
410 	case GRAPH_BREAK_LV:
411 	case GRAPH_BREAK_LVT:
412 		PREV();
413 		goto L;
414 
415 	case GRAPH_BREAK_V:
416 		PREV();
417 		goto V;
418 
419 	case GRAPH_BREAK_T:
420 		PREV();
421 		goto T;
422 
423 	default:
424 		goto MaybeBreak;
425 	}
426 
427 Extend:
428 	switch (prop) {
429 	// GB4: Break after controls
430 	case GRAPH_BREAK_CONTROL:
431 	case GRAPH_BREAK_CR:
432 	case GRAPH_BREAK_LF:
433 		goto Break;
434 
435 	// GB9: Do not break before extending characters or ZWJ.
436 	// GB9a: Do not break before SpacingMarks
437 	default:
438 		goto Start;
439 	}
440 
441 Extended_Pictographic:
442     // GB11: Do not break within emoji modifier sequences or
443     // emoji zwj sequences.
444 	if (prop == GRAPH_BREAK_ZWJ && follows_extended_pictographic(&prev)) {
445         PREV(); // ZWJ
446 		while (prop == GRAPH_BREAK_EXTEND) { // Extend*
447 			PREV();
448 		}
449 
450         PREV();
451         goto Extended_Pictographic;
452 	}
453     goto MaybeBreak;
454 
455 Regional_Indicator:
456 	// GB12, GB13: Do not break within emoji flag sequences
457 	if (prop == GRAPH_BREAK_REGIONAL_INDICATOR) {
458 		if (regional_indicator_odd(&prev)) {
459 			PREV();
460 		}
461 	}
462 	goto MaybeBreak;
463 
464 MaybeBreak:
465 	switch (prop) {
466 	// GB9b: Do not break after Prepend characters
467 	case GRAPH_BREAK_PREPEND:
468 		PREV();
469 		goto MaybeBreak;
470 
471 	default:
472 		goto Break;
473 	}
474 
475 Break:
476 	scan->current.text.ptr = (uint8_t *)prev.ptr;
477 	scan->current.text.attr |= (size_t)(scan->ptr - scan->current.text.ptr);
478 	return (scan->ptr == scan->current.text.ptr) ? 0 : 1;
479 }
480