1 /*
2 * break.c - an implementation of Unicode line breaking algorithm.
3 *
4 * Copyright (C) 2009-2012 by Hatuka*nezumi - IKEDA Soji.
5 *
6 * This file is part of the Sombok Package. This program is free
7 * software; you can redistribute it and/or modify it under the terms of
8 * either the GNU General Public License or the Artistic License, as
9 * specified in the README file.
10 *
11 */
12
13 #include "sombok_constants.h"
14 #include "sombok.h"
15
16 extern propval_t *linebreak_rules[];
17 extern size_t linebreak_rulessiz;
18
19 /**
20 * @defgroup linebreak_break break
21 * @brief Perform line breaking algorithm
22 *@{*/
23
24 static
_user(linebreak_t * lbobj,unistr_t * str)25 gcstring_t *_user(linebreak_t * lbobj, unistr_t * str)
26 {
27 gcstring_t *result;
28
29 if (str == NULL)
30 return NULL;
31 else if (lbobj->user_func == NULL ||
32 ((result = (*(lbobj->user_func)) (lbobj, str)) == NULL &&
33 !lbobj->errnum)) {
34 if ((result = gcstring_newcopy(str, lbobj)) == NULL)
35 lbobj->errnum = errno ? errno : ENOMEM;
36 }
37 return result;
38 }
39
40 static
_prep_sub(linebreak_t * lbobj,unistr_t * substr,unistr_t * text,size_t findex)41 gcstring_t *_prep_sub(linebreak_t * lbobj, unistr_t * substr,
42 unistr_t * text, size_t findex)
43 {
44 unistr_t unistr = { NULL, 0 };
45 gcstring_t *ret, *s;
46 unichar_t *prev_str;
47 size_t prev_len;
48 gcstring_t *(*func) (linebreak_t *, void *, unistr_t *, unistr_t *);
49 void *data;
50
51 if ((func = lbobj->prep_func[findex]) == NULL) {
52 if ((ret = gcstring_newcopy(substr, lbobj)) == NULL)
53 lbobj->errnum = errno ? errno : ENOMEM;
54 return ret;
55 }
56 if (lbobj->prep_data == NULL)
57 data = NULL;
58 else
59 data = lbobj->prep_data[findex];
60
61 if ((ret = gcstring_new(NULL, lbobj)) == NULL) {
62 lbobj->errnum = errno ? errno : ENOMEM;
63 return NULL;
64 }
65
66 prev_str = substr->str;
67 prev_len = substr->len;
68 while (1) {
69 /* Pass I: search. */
70 unistr.str = prev_str;
71 unistr.len = prev_len;
72 gcstring_destroy((*func) (lbobj, data, &unistr, text));
73 /* - no match: stop searching. */
74 if (unistr.str == NULL)
75 break;
76 /* - buffer may be modified: abort. */
77 if (unistr.len < 0 ||
78 unistr.str < text->str ||
79 text->str + text->len < unistr.str + unistr.len) {
80 gcstring_destroy(ret);
81 lbobj->errnum = EINVAL;
82 return NULL;
83 }
84 /* - out of range: stop searching. */
85 if (unistr.str < substr->str ||
86 substr->str + substr->len < unistr.str + unistr.len)
87 break;
88
89 /* apply next callback to unmatched part. */
90 if (prev_str <= unistr.str) {
91 unistr_t us;
92 us.len = unistr.str - prev_str;
93 us.str = prev_str;
94 if ((s = _prep_sub(lbobj, &us, text, findex + 1)) == NULL) {
95 gcstring_destroy(ret);
96 return NULL;
97 }
98 if (gcstring_append(ret, s) == NULL) {
99 gcstring_destroy(ret);
100 gcstring_destroy(s);
101 lbobj->errnum = errno ? errno : ENOMEM;
102 return NULL;
103 }
104 gcstring_destroy(s);
105 }
106
107 /* Pass II: process matched string. */
108 if ((s = (*func) (lbobj, data, &unistr, NULL)) == NULL) {
109 if (lbobj->errnum != 0) {
110 gcstring_destroy(ret);
111 return NULL;
112 }
113 if ((s = gcstring_newcopy(&unistr, lbobj)) == NULL) {
114 gcstring_destroy(ret);
115 lbobj->errnum = errno ? errno : ENOMEM;
116 return NULL;
117 }
118 }
119 if (gcstring_append(ret, s) == NULL) {
120 gcstring_destroy(ret);
121 gcstring_destroy(s);
122 lbobj->errnum = errno ? errno : ENOMEM;
123 return NULL;
124 }
125 gcstring_destroy(s);
126
127 /* skip zero length match to avoid infinite loop. */
128 if (unistr.len == 0) {
129 if (substr->str + substr->len <= unistr.str) {
130 prev_str = unistr.str;
131 prev_len = 0;
132 break;
133 } else {
134 prev_str = unistr.str + 1;
135 prev_len = substr->str + substr->len - prev_str;
136 continue;
137 }
138 }
139 prev_str = unistr.str + unistr.len;
140 prev_len = substr->str + substr->len - prev_str;
141 }
142
143 /* apply next callback to the rest of string. */
144 if (prev_str < substr->str + substr->len) {
145 unistr.str = prev_str;
146 unistr.len = prev_len;
147 if ((s = _prep_sub(lbobj, &unistr, text, findex + 1)) == NULL) {
148 gcstring_destroy(ret);
149 return NULL;
150 }
151 if (gcstring_append(ret, s) == NULL) {
152 gcstring_destroy(ret);
153 gcstring_destroy(s);
154 lbobj->errnum = errno ? errno : ENOMEM;
155 return NULL;
156 }
157 gcstring_destroy(s);
158 }
159
160 return ret;
161 }
162
163 static
_prep(linebreak_t * lbobj,unistr_t * text)164 gcstring_t *_prep(linebreak_t * lbobj, unistr_t * text)
165 {
166 gcstring_t *ret;
167
168 if (lbobj->prep_func == NULL) {
169 if ((ret = gcstring_newcopy(text, lbobj)) == NULL)
170 lbobj->errnum = errno ? errno : ENOMEM;
171 return ret;
172 }
173 return _prep_sub(lbobj, text, text, 0);
174 }
175
176 static
_format(linebreak_t * lbobj,linebreak_state_t action,gcstring_t * str)177 gcstring_t *_format(linebreak_t * lbobj, linebreak_state_t action,
178 gcstring_t * str)
179 {
180 gcstring_t *result;
181
182 if (str == NULL)
183 return NULL;
184 else if (lbobj->format_func == NULL ||
185 ((result =
186 (*(lbobj->format_func)) (lbobj, action, str)) == NULL &&
187 !lbobj->errnum)) {
188 if ((result = gcstring_copy(str)) == NULL)
189 lbobj->errnum = errno ? errno : ENOMEM;
190 }
191 return result;
192 }
193
194 static
_sizing(linebreak_t * lbobj,double len,gcstring_t * pre,gcstring_t * spc,gcstring_t * str)195 double _sizing(linebreak_t * lbobj, double len,
196 gcstring_t * pre, gcstring_t * spc, gcstring_t * str)
197 {
198 double ret;
199
200 if (lbobj->sizing_func == NULL ||
201 ((ret = (*(lbobj->sizing_func)) (lbobj, len, pre, spc, str))
202 < 0.0 && !lbobj->errnum)) {
203 if (spc != NULL)
204 len += (double) spc->gclen;
205 if (str != NULL)
206 len += (double) str->gclen;
207 return len;
208 }
209 return ret;
210 }
211
212 static
_urgent_break(linebreak_t * lbobj,gcstring_t * str)213 gcstring_t *_urgent_break(linebreak_t * lbobj, gcstring_t * str)
214 {
215 gcstring_t *result;
216
217 if (lbobj->urgent_func == NULL ||
218 ((result = (*(lbobj->urgent_func)) (lbobj, str)) == NULL &&
219 !lbobj->errnum)) {
220 if ((result = gcstring_copy(str)) == NULL)
221 lbobj->errnum = errno ? errno : ENOMEM;
222 }
223 return result;
224 }
225
/*
 * Destroy a grapheme cluster string and clear the variable that held it.
 *
 * NOTE: this deliberately still expands to TWO statements.  Used as the
 * unbraced body of an if/else, only the gcstring_destroy() call is governed
 * by the condition and the assignment of NULL runs unconditionally.  Always
 * use it inside braces unless that effect is wanted.
 */
#define gcstring_DESTROY(gcstr) \
    gcstring_destroy(gcstr); gcstr = NULL;

/*
 * Abort _break_partial() when the expression x evaluates to NULL.
 *
 * Sets lbobj->errnum (unless it is already set), releases every working
 * string of the calling function and all results collected so far, then
 * returns NULL from the calling function.  It relies on the caller's local
 * names: lbobj, str, bufStr, bufSpc, results, reslen, s, t, beforeFrg, fmt
 * and broken.
 *
 * Wrapped in do { ... } while (0) so that it behaves as one statement and
 * must be followed by a semicolon.
 */
#define IF_NULL_THEN_ABORT(x) \
    do { \
        if ((x) == NULL) { \
            size_t abort_idx_; \
            if (lbobj->errnum == 0) \
                lbobj->errnum = errno ? errno : EINVAL; \
            gcstring_destroy(str); \
            gcstring_destroy(bufStr); \
            gcstring_destroy(bufSpc); \
            for (abort_idx_ = 0; abort_idx_ < reslen; abort_idx_++) \
                gcstring_destroy(results[abort_idx_]); \
            free(results); \
            gcstring_destroy(s); \
            gcstring_destroy(t); \
            gcstring_destroy(beforeFrg); \
            gcstring_destroy(fmt); \
            gcstring_destroy(broken); \
            return NULL; \
        } \
    } while (0)
247
/** @fn propval_t linebreak_lbrule(propval_t b_idx, propval_t a_idx)
 * @deprecated Use linebreak_get_lbrule().
 *
 * Get the breaking rule between two classes.
 *
 * Given two line breaking classes, return the breaking rule determined by
 * internal data.
 * @param[in] b_idx line breaking class of the before side.
 * @param[in] a_idx line breaking class of the after side.
 * @return line breaking action: MANDATORY, DIRECT, INDIRECT or PROHIBITED.
 * If the action could not be determined, DIRECT is returned.
 *
 * @note This function gives only an approximate description of line
 * breaking behavior.  In particular, it does not give a meaningful value
 * for classes AI and CJ.
 * See also linebreak_get_lbrule().
 *
 */
266 static
_lbruleinfo(propval_t b_idx,propval_t a_idx)267 propval_t _lbruleinfo(propval_t b_idx, propval_t a_idx)
268 {
269 propval_t result = PROP_UNKNOWN;
270
271 if (b_idx < 0 || linebreak_rulessiz <= b_idx ||
272 a_idx < 0 || linebreak_rulessiz <= a_idx);
273 else
274 result = linebreak_rules[b_idx][a_idx];
275 if (result == PROP_UNKNOWN)
276 return LINEBREAK_ACTION_DIRECT;
277 return result;
278 }
279
/*
 * Approximate line breaking action between two line breaking classes.
 *
 * b_idx is the class before the candidate break, a_idx the class after it.
 * Classes are first resolved (LB1, LB9, LB10), then a simplified form of
 * LB25 is applied, and finally the rule table is consulted.  AI and CJ are
 * not resolved here, and HL is not folded into AL.
 */
propval_t linebreak_lbrule(propval_t b_idx, propval_t a_idx)
{
    int b_closes_or_num, b_prefix, b_num_lead;

    /* Before side.  LB1: SA, SG and XX become AL (AI and CJ cannot be
     * resolved).  LB10: CM becomes AL. */
    if (b_idx == LB_SA || b_idx == LB_SG || b_idx == LB_XX ||
        b_idx == LB_CM)
        b_idx = LB_AL;

    /* After side. */
    if (a_idx == LB_SA || a_idx == LB_SG || a_idx == LB_XX) {
        /* LB1 */
        a_idx = LB_AL;
    } else if (a_idx == LB_CM) {
        /* LB9: treat X CM as if it were X, except after BK, CR, LF, NL, SP
         * or ZW. */
        if (!(b_idx == LB_BK || b_idx == LB_CR || b_idx == LB_LF ||
              b_idx == LB_NL || b_idx == LB_SP || b_idx == LB_ZW))
            return LINEBREAK_ACTION_PROHIBITED;

        /* XXX Legacy-CM rule cannot be applied. */

        /* LB10: treat any remaining combining mark as AL.  The before side
         * is one of the six classes tested above here, so it needs no
         * further resolution. */
        a_idx = LB_AL;
    }

    /* LB25, simplified:
     *   (CL|CP|NU) x (PO|PR)
     *   (PO|PR) x (OP|NU)
     *   (HY|IS|NU|SY) x NU
     */
    b_closes_or_num = (b_idx == LB_CL || b_idx == LB_CP || b_idx == LB_NU);
    b_prefix = (b_idx == LB_PO || b_idx == LB_PR);
    b_num_lead = (b_idx == LB_HY || b_idx == LB_IS || b_idx == LB_NU ||
                  b_idx == LB_SY);

    if (b_closes_or_num && (a_idx == LB_PO || a_idx == LB_PR))
        return LINEBREAK_ACTION_PROHIBITED;
    if (b_prefix && (a_idx == LB_OP || a_idx == LB_NU))
        return LINEBREAK_ACTION_PROHIBITED;
    if (b_num_lead && a_idx == LB_NU)
        return LINEBREAK_ACTION_PROHIBITED;

    return _lbruleinfo(b_idx, a_idx);
}
356
357 /** @fn gcstring_t** linebreak_break_partial(linebreak_t *lbobj, unistr_t *input)
358 *
359 * Perform line breaking algorithm with incremental inputs.
360 *
361 * @param[in] lbobj linebreak object.
362 * @param[in] input Unicode string; give NULL to specify end of input.
363 * @return array of (partial) broken grapheme cluster strings terminated by NULL.
364 * If internal error occurred, lbobj->errnum is set then NULL is returned.
365 */
366 static
_break_partial(linebreak_t * lbobj,unistr_t * input,size_t * lenp,int eot)367 gcstring_t **_break_partial(linebreak_t * lbobj, unistr_t * input,
368 size_t * lenp, int eot)
369 {
370 int state;
371 gcstring_t *str = NULL, *bufStr = NULL, *bufSpc = NULL;
372 double bufCols;
373 size_t bBeg, bLen, bCM, bSpc, aCM, urgEnd;
374 gcstring_t **results = NULL;
375 size_t reslen = 0;
376
377 gcstring_t *s = NULL, *t = NULL, *beforeFrg = NULL, *fmt = NULL,
378 *broken = NULL;
379 unistr_t unistr;
380 size_t i;
381 gcstring_t empty = { NULL, 0, NULL, 0, 0, lbobj };
382
383 /***
384 *** Unread and additional input.
385 ***/
386
387 unistr.str = lbobj->unread.str;
388 unistr.len = lbobj->unread.len;
389 lbobj->unread.str = NULL;
390 lbobj->unread.len = 0;
391 if (input != NULL && input->len != 0) {
392 unichar_t *_u;
393 if ((_u = realloc(unistr.str,
394 sizeof(unichar_t) * (unistr.len + input->len)))
395 == NULL) {
396 lbobj->errnum = errno;
397 free(unistr.str);
398 return NULL;
399 } else
400 unistr.str = _u;
401 memcpy(unistr.str + unistr.len, input->str,
402 sizeof(unichar_t) * input->len);
403 unistr.len += input->len;
404 }
405
406 /***
407 *** Preprocessing.
408 ***/
409
410 /* perform user breaking */
411 if (lbobj->user_func != NULL)
412 str = _user(lbobj, &unistr);
413 else
414 str = _prep(lbobj, &unistr);
415 free(unistr.str);
416 if (str == NULL)
417 return NULL;
418
419 /* South East Asian complex breaking. */
420 errno = 0;
421 linebreak_southeastasian_flagbreak(str);
422 if (errno) {
423 lbobj->errnum = errno;
424 gcstring_DESTROY(str);
425 return NULL;
426 }
427
428 /* LB21a (as of 6.1.0): HL (HY | BA) × [^ CB] */
429 if (str != NULL && str->gclen) {
430 propval_t lbc;
431
432 for (i = 0; i < str->gclen; i++) {
433 /* HL */
434 if ((lbc = gcstring_lbclass(str, i)) == LB_HL &&
435 gcstring_lbclass_ext(str, i) == lbc)
436 /* avoid non-CM grapheme extenders */
437 i++;
438 else
439 continue;
440 /* CM* */
441 while (i < str->gclen && gcstring_lbclass(str, i) == LB_CM)
442 i++;
443 if (str->gclen <= i)
444 break;
445
446 /* (HY|BA) */
447 if (((lbc = gcstring_lbclass(str, i)) == LB_HY ||
448 lbc == LB_BA) && gcstring_lbclass_ext(str, i) == lbc)
449 /* avoid non-CM grapheme extenders */
450 i++;
451 else
452 continue;
453 /* CM* */
454 while (i < str->gclen && gcstring_lbclass(str, i) == LB_CM)
455 i++;
456 if (str->gclen <= i)
457 break;
458
459 /* [^CB] */
460 switch (gcstring_lbclass(str, i)) {
461 /* prohibit break by default */
462 case LB_BK: /* LB6 */
463 case LB_CR:
464 case LB_LF:
465 case LB_NL:
466 case LB_SP: /* LB7 */
467 case LB_ZW:
468 case LB_CM: /* LB9 */
469 case LB_WJ: /* LB11 */
470 /* allow break by default */
471 case LB_CB: /* LB20 */
472 continue;
473 }
474
475 if (!str->gcstr[i].flag)
476 str->gcstr[i].flag = LINEBREAK_FLAG_PROHIBIT_BEFORE;
477 }
478 }
479
480 /* LB25: not break in (PR|PO)? (OP|HY)? NU (NU|SY|IS)* (CL|CP)? (PR|PO)? */
481 /* FIXME:Avoid non-CM grapheme extenders */
482 if (str != NULL && str->gclen) {
483 size_t st, et;
484
485 for (i = 0; i < str->gclen; i++) {
486 st = et = (size_t) - 1;
487
488 /* (PR|PO)? */
489 switch (gcstring_lbclass(str, i)) {
490 case LB_PR:
491 case LB_PO:
492 if (st == (size_t) - 1)
493 st = i;
494 LB25_PRPO_PREFIX:
495 i++;
496 /* CM* */
497 while (i < str->gclen && gcstring_lbclass(str, i) == LB_CM)
498 i++;
499 if (str->gclen <= i)
500 goto LB25_BREAK;
501 }
502
503 /* (OP|HY)? */
504 switch (gcstring_lbclass(str, i)) {
505 case LB_OP:
506 case LB_HY:
507 if (st == (size_t) - 1)
508 st = i;
509 LB25_OPHY_PREFIX:
510 i++;
511 /* CM* */
512 while (i < str->gclen && gcstring_lbclass(str, i) == LB_CM)
513 i++;
514 if (str->gclen <= i) {
515 if (eot)
516 goto LB25_BREAK;
517 else
518 goto LB25_FOUND; /* save possible partial sequence. */
519 }
520 }
521
522 /* NU (NU|SY|IS)* */
523 switch (gcstring_lbclass(str, i)) {
524 case LB_NU:
525 if (st == (size_t) - 1)
526 st = i;
527 i++;
528 /* (NU|SY|IS|CM)* */
529 while (i < str->gclen)
530 switch (gcstring_lbclass(str, i)) {
531 case LB_NU:
532 case LB_SY:
533 case LB_IS:
534 case LB_CM:
535 i++;
536 break;
537
538 /* (CL|CP) */
539 case LB_CL:
540 case LB_CP:
541 goto LB25_CLCP_SUFFIX;
542
543 /* (PR|PO) */
544 case LB_PR:
545 case LB_PO:
546 goto LB25_PRPO_SUFFIX;
547
548 default:
549 goto LB25_FOUND;
550 }
551 if (str->gclen <= i)
552 goto LB25_FOUND;
553 break;
554
555 case LB_PR:
556 case LB_PO:
557 st = i;
558 goto LB25_PRPO_PREFIX;
559
560 case LB_OP:
561 case LB_HY:
562 st = i;
563 goto LB25_OPHY_PREFIX;
564
565 default:
566 continue;
567 }
568
569 /* (CL|CP)? */
570 switch (gcstring_lbclass(str, i)) {
571 case LB_CL:
572 case LB_CP:
573 LB25_CLCP_SUFFIX:
574 i++;
575 /* CM* */
576 while (i < str->gclen && gcstring_lbclass(str, i) == LB_CM)
577 i++;
578 if (str->gclen <= i)
579 goto LB25_FOUND;
580 }
581
582 /* (PR|PO)? */
583 switch (gcstring_lbclass(str, i)) {
584 case LB_PR:
585 case LB_PO:
586 LB25_PRPO_SUFFIX:
587 et = i;
588 i++;
589 /* CM* */
590 while (i < str->gclen && gcstring_lbclass(str, i) == LB_CM)
591 i++;
592 if (str->gclen <= i)
593 goto LB25_FOUND;
594 }
595
596 LB25_FOUND:
597 for (st++; st < i; st++) {
598 if (!str->gcstr[st].flag)
599 str->gcstr[st].flag = LINEBREAK_FLAG_PROHIBIT_BEFORE;
600 }
601 /* match may be overwrapped */
602 if (et != (size_t) - 1) {
603 i = st = et;
604 et = (size_t) - 1;
605 goto LB25_PRPO_PREFIX;
606 }
607 }
608 LB25_BREAK:
609 ;
610 }
611
612 /***
613 *** Initialize status.
614 ***/
615
616 str->pos = 0;
617
618 /*
619 * Line buffer.
620 * bufStr: Unbreakable text fragment.
621 * bufSpc: Trailing spaces.
622 * bufCols: Columns of bufStr: can be differ from gcstring_columns().
623 * state: Start of text/paragraph status.
624 * 0: Start of text not done.
625 * 1: Start of text done while start of paragraph not done.
626 * 2: Start of paragraph done while end of paragraph not done.
627 */
628 state = lbobj->state;
629
630 unistr.str = lbobj->bufstr.str;
631 unistr.len = lbobj->bufstr.len;
632 lbobj->bufstr.str = NULL;
633 lbobj->bufstr.len = 0;
634 IF_NULL_THEN_ABORT(bufStr = gcstring_new(&unistr, lbobj));
635
636 unistr.str = lbobj->bufspc.str;
637 unistr.len = lbobj->bufspc.len;
638 lbobj->bufspc.str = NULL;
639 lbobj->bufspc.len = 0;
640 IF_NULL_THEN_ABORT(bufSpc = gcstring_new(&unistr, lbobj));
641
642 bufCols = lbobj->bufcols;
643
644 /*
645 * Indexes and flags
646 * bBeg: Start of unbreakable text fragment.
647 * bLen: Length of unbreakable text fragment.
648 * bSpc: Length of trailing spaces.
649 * urgEnd: End of substring broken by urgent breaking.
650 *
651 * ...read...| before :CM | spaces | after :CM |...unread...|
652 * ^ ->bCM<- ^ ->aCM<- ^
653 * |<-- bLen -->|<- bSpc ->| ^ |
654 * bBeg candidate str->pos end of
655 * breaking input
656 * point
657 * `read' positions shall never be read again.
658 */
659 bBeg = bLen = bCM = bSpc = aCM = urgEnd = 0;
660
661 /* Result. */
662 IF_NULL_THEN_ABORT(results = malloc(sizeof(gcstring_t **)));
663 results[0] = NULL;
664
665 while (1) {
666 /***
667 *** Chop off a pair of unbreakable character clusters from text.
668 ***/
669 int action = 0;
670 propval_t lbc;
671 double newcols;
672
673 /* Go ahead reading input. */
674 while (!gcstring_eos(str)) {
675 lbc = gcstring_lbclass(str, str->pos);
676
677 /**
678 ** Append SP/ZW/eop to ``before'' buffer.
679 **/
680 switch (lbc) {
681 /* - Explicit breaks and non-breaks */
682
683 /* LB7(1): × SP+ */
684 case LB_SP:
685 gcstring_next(str);
686 bSpc++;
687
688 /* End of input. */
689 continue; /* while (!gcstring_eos(str)) */
690
691 /* - Mandatory breaks */
692
693 /* LB4 - LB7: × SP* (BK | CR LF | CR | LF | NL) ! */
694 case LB_BK:
695 case LB_CR:
696 case LB_LF:
697 case LB_NL:
698 gcstring_next(str);
699 bSpc++;
700 goto last_CHARACTER_PAIR; /* while (!gcstring_eos(str)) */
701
702 /* - Explicit breaks and non-breaks */
703
704 /* LB7(2): × (SP* ZW+)+ */
705 case LB_ZW:
706 gcstring_next(str);
707 bLen += bSpc + 1;
708 bCM = 0;
709 bSpc = 0;
710
711 /* End of input */
712 continue; /* while (!gcstring_eos(str)) */
713 }
714
715 /**
716 ** Then fill ``after'' buffer.
717 **/
718
719 gcstring_next(str);
720
721 /* skip to end of unbreakable fragment by user/complex/urgent
722 * breaking. */
723 while (!gcstring_eos(str) && str->gcstr[str->pos].flag &
724 LINEBREAK_FLAG_PROHIBIT_BEFORE)
725 gcstring_next(str);
726
727 /* - Combining marks */
728 /* LB9: Treat X CM+ as if it were X
729 * where X is anything except BK, CR, LF, NL, SP or ZW
730 * (NB: Some CM characters may be single grapheme cluster
731 * since they have Grapheme_Cluster_Break property Control.) */
732 while (!gcstring_eos(str) &&
733 gcstring_lbclass(str, str->pos) == LB_CM) {
734 gcstring_next(str);
735 aCM++;
736 }
737
738 /* - Start of text */
739
740 /* LB2: sot × */
741 if (0 < bLen || 0 < bSpc)
742 break; /* while (!gcstring_eos(str)) */
743
744 /* shift buffers. */
745 bLen = str->pos - bBeg;
746 bSpc = 0;
747 bCM = aCM;
748 aCM = 0;
749 } /* while (!gcstring_eos(str)) */
750 last_CHARACTER_PAIR:
751
752 /***
753 *** Determin line breaking action by classes of adjacent characters.
754 ***/
755
756 /* Mandatory break. */
757 if (0 < bSpc &&
758 (lbc = gcstring_lbclass(str, bBeg + bLen + bSpc - 1)) != LB_SP
759 && (lbc != LB_CR || eot || !gcstring_eos(str))) {
760 /* CR at end of input may be part of CR LF therefore not be eop. */
761 action = LINEBREAK_ACTION_MANDATORY;
762 /* LB11, LB12 and tailorable rules LB13 - LB31.
763 * Or urgent breaking. */
764 } else if (bBeg + bLen + bSpc < str->pos) {
765 if (str->gcstr[bBeg + bLen + bSpc].flag &
766 LINEBREAK_FLAG_ALLOW_BEFORE)
767 action = LINEBREAK_ACTION_DIRECT;
768 else if (str->gcstr[bBeg + bLen + bSpc].flag &
769 LINEBREAK_FLAG_PROHIBIT_BEFORE)
770 action = LINEBREAK_ACTION_PROHIBITED;
771 else if (lbobj->options & LINEBREAK_OPTION_BREAK_INDENT &&
772 bLen == 0 && 0 < bSpc)
773 /* Allow break at sot or after breaking,
774 * although rules don't tell it obviously. */
775 action = LINEBREAK_ACTION_DIRECT;
776 else {
777 propval_t blbc, albc;
778 size_t btail;
779
780 if (bLen == 0)
781 btail = bBeg + bSpc - 1; /* before buffer is SP only. */
782 else
783 btail = bBeg + bLen - bCM - 1; /* LB9 */
784
785 blbc = gcstring_lbclass_ext(str, btail);
786 switch (blbc) {
787 /* (SG and XX are already resolved). */
788 /* LB1: Resolve AI and CJ. */
789 case LB_AI:
790 blbc = (lbobj->options &
791 LINEBREAK_OPTION_EASTASIAN_CONTEXT) ?
792 LB_ID : LB_AL;
793 break;
794 case LB_CJ:
795 blbc = (lbobj->options &
796 LINEBREAK_OPTION_NONSTARTER_LOOSE) ?
797 LB_ID : LB_NS;
798 break;
799 /* LB1: SA is resolved to AL. */
800 case LB_SA:
801 blbc = LB_AL;
802 break;
803 /* LB10: Treat any remaining CM+ as if it were AL. */
804 case LB_CM:
805 blbc = LB_AL;
806 break;
807 #if 0
808 /* (As of 6.1.0): Treat HL as AL. */
809 case LB_HL:
810 blbc = LB_AL;
811 break;
812 #endif
813 /* Optionally, treat hangul syllable as if it were AL. */
814 case LB_H2:
815 case LB_H3:
816 case LB_JL:
817 case LB_JV:
818 case LB_JT:
819 if (lbobj->options & LINEBREAK_OPTION_HANGUL_AS_AL)
820 blbc = LB_AL;
821 break;
822 }
823
824 albc = gcstring_lbclass(str, bBeg + bLen + bSpc);
825 switch (albc) {
826 /* (SG and XX are already resolved). */
827 /* LB1: Resolve AI and CJ. */
828 case LB_AI:
829 albc = (lbobj->options &
830 LINEBREAK_OPTION_EASTASIAN_CONTEXT) ?
831 LB_ID : LB_AL;
832 break;
833 case LB_CJ:
834 albc = (lbobj->options &
835 LINEBREAK_OPTION_NONSTARTER_LOOSE) ?
836 LB_ID : LB_NS;
837 break;
838 /* LB1: SA is resolved to AL. */
839 case LB_SA:
840 albc = LB_AL;
841 break;
842 /* LB10: Treat any remaining CM+ as if it were AL. */
843 case LB_CM:
844 albc = LB_AL;
845 break;
846 #if 0
847 /* (As of 6.1.0): Treat HL as AL. */
848 case LB_HL:
849 albc = LB_AL;
850 break;
851 #endif
852 /* Optionally, treat hangul syllable as if it were AL. */
853 case LB_H2:
854 case LB_H3:
855 case LB_JL:
856 case LB_JV:
857 case LB_JT:
858 if (lbobj->options & LINEBREAK_OPTION_HANGUL_AS_AL)
859 albc = LB_AL;
860 break;
861 }
862
863 action = _lbruleinfo(blbc, albc);
864 }
865
866 /* Check prohibited break. */
867 if (action == LINEBREAK_ACTION_PROHIBITED ||
868 (action == LINEBREAK_ACTION_INDIRECT && bSpc == 0)) {
869 /* When conjunction is expected to exceed charmax,
870 * try urgent breaking. */
871 if (urgEnd < bBeg + bLen + bSpc &&
872 0 < lbobj->charmax &&
873 lbobj->charmax < str->gcstr[str->pos - 1].idx +
874 str->gcstr[str->pos - 1].len - str->gcstr[bBeg].idx) {
875 size_t charmax, chars;
876
877 IF_NULL_THEN_ABORT(s = gcstring_substr(str, bBeg,
878 str->pos -
879 bBeg));
880 IF_NULL_THEN_ABORT(broken = _urgent_break(lbobj, s));
881 gcstring_DESTROY(s);
882
883 /* If any of urgently broken fragments still
884 * exceed CharactersMax, force chop them. */
885 charmax = lbobj->charmax;
886 broken->pos = 0;
887 chars = gcstring_next(broken)->len;
888 while (!gcstring_eos(broken)) {
889 if (broken->gcstr[broken->pos].flag &
890 LINEBREAK_FLAG_ALLOW_BEFORE)
891 chars = 0;
892 else if (charmax <
893 chars + broken->gcstr[broken->pos].len) {
894 broken->gcstr[broken->pos].flag |=
895 LINEBREAK_FLAG_ALLOW_BEFORE;
896 chars = 0;
897 } else
898 chars += broken->gcstr[broken->pos].len;
899 gcstring_next(broken);
900 } /* while (!gcstring_eos(broken)) */
901
902 urgEnd = broken->gclen;
903 gcstring_replace(str, 0, str->pos, broken);
904 gcstring_DESTROY(broken);
905 str->pos = 0;
906 bBeg = bLen = bCM = bSpc = aCM = 0;
907 continue; /* while (1) */
908 }
909
910 /* if (urgEnd < ...) */
911 /* Otherwise, fragments may be conjuncted safely. Read more. */
912 bLen = str->pos - bBeg;
913 bSpc = 0;
914 bCM = aCM;
915 aCM = 0;
916 continue; /* while (1) */
917 } /* if (action == ...) */
918 } /* if (0 < bSpc && ...) */
919 /***
920 *** Check end of input.
921 ***/
922 if (!eot && str->gclen <= bBeg + bLen + bSpc) {
923 /* Save status then output partial result. */
924 lbobj->bufstr.str = bufStr->str;
925 lbobj->bufstr.len = bufStr->len;
926 bufStr->str = NULL;
927 bufStr->len = 0;
928 gcstring_DESTROY(bufStr);
929
930 lbobj->bufspc.str = bufSpc->str;
931 lbobj->bufspc.len = bufSpc->len;
932 bufSpc->str = NULL;
933 bufSpc->len = 0;
934 gcstring_DESTROY(bufSpc);
935
936 lbobj->bufcols = bufCols;
937
938 s = gcstring_substr(str, bBeg, str->gclen - bBeg);
939 lbobj->unread.str = s->str;
940 lbobj->unread.len = s->len;
941 s->str = NULL;
942 s->len = 0;
943 gcstring_DESTROY(s);
944
945 lbobj->state = state;
946
947 /* clenup. */
948 gcstring_DESTROY(str);
949
950 if (lenp != NULL)
951 *lenp = reslen;
952 return results;
953 }
954
955 /* After all, possible actions are MANDATORY and arbitrary. */
956
957 /***
958 *** Examine line breaking action
959 ***/
960
961 IF_NULL_THEN_ABORT(beforeFrg = gcstring_substr(str, bBeg, bLen));
962
963 if (state == LINEBREAK_STATE_NONE) { /* sot undone. */
964 /* Process start of text. */
965 IF_NULL_THEN_ABORT(fmt = _format(lbobj, LINEBREAK_STATE_SOT,
966 beforeFrg));
967 if (gcstring_cmp(beforeFrg, fmt) != 0) {
968 s = gcstring_substr(str, bBeg + bLen, bSpc);
969 gcstring_append(fmt, s);
970 gcstring_DESTROY(s);
971 s = gcstring_substr(str, bBeg + bLen + bSpc,
972 str->pos - (bBeg + bLen + bSpc));
973 gcstring_append(fmt, s);
974 gcstring_DESTROY(s);
975 gcstring_replace(str, 0, str->pos, fmt);
976 str->pos = 0;
977 bBeg = bLen = bCM = bSpc = aCM = 0;
978 urgEnd = 0;
979
980 state = LINEBREAK_STATE_SOT_FORMAT;
981 gcstring_DESTROY(fmt);
982 gcstring_DESTROY(beforeFrg);
983
984 continue; /* while (1) */
985 }
986 gcstring_DESTROY(fmt);
987 state = LINEBREAK_STATE_SOL;
988 } else if (state == LINEBREAK_STATE_SOT_FORMAT)
989 state = LINEBREAK_STATE_SOL;
990 else if (state == LINEBREAK_STATE_SOT) { /* sop undone. */
991 /* Process start of paragraph. */
992 IF_NULL_THEN_ABORT(fmt = _format(lbobj, LINEBREAK_STATE_SOP,
993 beforeFrg));
994 if (gcstring_cmp(beforeFrg, fmt) != 0) {
995 s = gcstring_substr(str, bBeg + bLen, bSpc);
996 gcstring_append(fmt, s);
997 gcstring_DESTROY(s);
998 s = gcstring_substr(str, bBeg + bLen + bSpc,
999 str->pos - (bBeg + bLen + bSpc));
1000 gcstring_append(fmt, s);
1001 gcstring_DESTROY(s);
1002 gcstring_replace(str, 0, str->pos, fmt);
1003 str->pos = 0;
1004 bBeg = bLen = bCM = bSpc = aCM = 0;
1005 urgEnd = 0;
1006
1007 state = LINEBREAK_STATE_SOP_FORMAT;
1008 gcstring_DESTROY(fmt);
1009 gcstring_DESTROY(beforeFrg);
1010
1011 continue; /* while (1) */
1012 }
1013 gcstring_DESTROY(fmt);
1014 state = LINEBREAK_STATE_SOP;
1015 } else if (state == LINEBREAK_STATE_SOP_FORMAT)
1016 state = LINEBREAK_STATE_SOP;
1017
1018 /***
1019 *** Check if arbitrary break is needed.
1020 ***/
1021 newcols = _sizing(lbobj, bufCols, bufStr, bufSpc, beforeFrg);
1022 if (newcols < 0.0) {
1023 IF_NULL_THEN_ABORT(NULL);
1024 }
1025 if (0 < lbobj->colmax && lbobj->colmax < newcols) {
1026 newcols = _sizing(lbobj, 0.0, &empty, &empty, beforeFrg);
1027 if (newcols < 0.0) {
1028 IF_NULL_THEN_ABORT(NULL);
1029 }
1030
1031 /**
1032 ** When arbitrary break is expected to generate a line shorter
1033 ** than colmin or, beforeFrg will exceed colmax, try urgent
1034 ** breaking.
1035 **/
1036 if (urgEnd < bBeg + bLen + bSpc) {
1037 broken = NULL;
1038
1039 if (0.0 < bufCols && bufCols < lbobj->colmin) {
1040 gcstring_replace(beforeFrg, 0, 0, bufSpc);
1041 gcstring_replace(beforeFrg, 0, 0, bufStr);
1042 gcstring_shrink(bufSpc, 0);
1043 gcstring_shrink(bufStr, 0);
1044 bufCols = 0.0;
1045 IF_NULL_THEN_ABORT(broken = _urgent_break(lbobj,
1046 beforeFrg));
1047 } else if (lbobj->colmax < newcols) {
1048 IF_NULL_THEN_ABORT(broken = _urgent_break(lbobj,
1049 beforeFrg));
1050 }
1051
1052 if (broken != NULL) {
1053 s = gcstring_substr(str, bBeg + bLen, bSpc);
1054 gcstring_append(broken, s);
1055 gcstring_DESTROY(s);
1056 gcstring_replace(str, 0, bBeg + bLen + bSpc, broken);
1057 str->pos = 0;
1058 urgEnd = broken->gclen;
1059 bBeg = bLen = bCM = bSpc = aCM = 0;
1060 gcstring_DESTROY(broken);
1061
1062 gcstring_DESTROY(beforeFrg);
1063 continue; /* while (1) */
1064 }
1065 }
1066
1067 /**
1068 ** Otherwise, process arbitrary break.
1069 **/
1070 if (bufStr->len || bufSpc->len) {
1071 gcstring_t **r;
1072
1073 IF_NULL_THEN_ABORT(r = realloc(results,
1074 sizeof(gcstring_t *) *
1075 (reslen + 2)));
1076 (results = r)[reslen + 1] = NULL;
1077 IF_NULL_THEN_ABORT(s = _format(lbobj, LINEBREAK_STATE_LINE,
1078 bufStr));
1079 IF_NULL_THEN_ABORT(t = _format(lbobj, LINEBREAK_STATE_EOL,
1080 bufSpc));
1081 IF_NULL_THEN_ABORT(results[reslen] =
1082 gcstring_concat(s, t));
1083 reslen++;
1084 gcstring_DESTROY(s);
1085 gcstring_DESTROY(t);
1086
1087 IF_NULL_THEN_ABORT(fmt =
1088 _format(lbobj, LINEBREAK_STATE_SOL,
1089 beforeFrg));
1090 if (gcstring_cmp(beforeFrg, fmt) != 0) {
1091 gcstring_DESTROY(beforeFrg);
1092 beforeFrg = fmt;
1093 newcols =
1094 _sizing(lbobj, 0.0, &empty, &empty, beforeFrg);
1095 if (newcols < 0.0) {
1096 IF_NULL_THEN_ABORT(NULL);
1097 }
1098 } else
1099 gcstring_DESTROY(fmt);
1100 }
1101 gcstring_shrink(bufStr, 0);
1102 gcstring_append(bufStr, beforeFrg);
1103
1104 gcstring_shrink(bufSpc, 0);
1105 s = gcstring_substr(str, bBeg + bLen, bSpc);
1106 gcstring_append(bufSpc, s);
1107 gcstring_DESTROY(s);
1108
1109 bufCols = newcols;
1110 /***
1111 *** Arbitrary break is not needed.
1112 ***/
1113 } else {
1114 gcstring_append(bufStr, bufSpc);
1115 gcstring_append(bufStr, beforeFrg);
1116
1117 gcstring_shrink(bufSpc, 0);
1118 s = gcstring_substr(str, bBeg + bLen, bSpc);
1119 gcstring_append(bufSpc, s);
1120 gcstring_DESTROY(s);
1121
1122 bufCols = newcols;
1123 } /* if (0 < lbobj->colmax ... ) */
1124
1125 gcstring_DESTROY(beforeFrg);
1126
1127 /***
1128 *** Mandatory break or end-of-text.
1129 ***/
1130 if (eot && str->gclen <= bBeg + bLen + bSpc)
1131 break; /* while (1) */
1132
1133 if (action == LINEBREAK_ACTION_MANDATORY) {
1134 /* Process mandatory break. */
1135 gcstring_t **r;
1136
1137 IF_NULL_THEN_ABORT(r = realloc(results,
1138 sizeof(gcstring_t *) *
1139 (reslen + 2)));
1140 (results = r)[reslen + 1] = NULL;
1141 IF_NULL_THEN_ABORT(s = _format(lbobj, LINEBREAK_STATE_LINE,
1142 bufStr));
1143 IF_NULL_THEN_ABORT(t = _format(lbobj, LINEBREAK_STATE_EOP,
1144 bufSpc));
1145 IF_NULL_THEN_ABORT(results[reslen] = gcstring_concat(s, t));
1146 reslen++;
1147 gcstring_DESTROY(s);
1148 gcstring_DESTROY(t);
1149
1150 /* eop done then sop must be carried out. */
1151 state = LINEBREAK_STATE_SOT;
1152
1153 gcstring_shrink(bufStr, 0);
1154 gcstring_shrink(bufSpc, 0);
1155 bufCols = 0.0;
1156 }
1157
1158 /***
1159 *** Shift buffers.
1160 ***/
1161 bBeg += bLen + bSpc;
1162 bLen = str->pos - bBeg;
1163 bSpc = 0;
1164 bCM = aCM;
1165 aCM = 0;
1166 } /* while (1) */
1167
1168 /***
1169 *** Process end of text.
1170 ***/
1171 {
1172 gcstring_t **r;
1173
1174 IF_NULL_THEN_ABORT(r = realloc(results,
1175 sizeof(gcstring_t *) * (reslen +
1176 2)));
1177 (results = r)[reslen + 1] = NULL;
1178 IF_NULL_THEN_ABORT(s =
1179 _format(lbobj, LINEBREAK_STATE_LINE, bufStr));
1180 IF_NULL_THEN_ABORT(t =
1181 _format(lbobj, LINEBREAK_STATE_EOT, bufSpc));
1182 IF_NULL_THEN_ABORT(results[reslen] = gcstring_concat(s, t));
1183 reslen++;
1184 gcstring_DESTROY(s);
1185 gcstring_DESTROY(t);
1186 }
1187
1188 /* clenup. */
1189 gcstring_DESTROY(str);
1190 gcstring_DESTROY(bufStr);
1191 gcstring_DESTROY(bufSpc);
1192
1193 /* Reset status then return the rest of result. */
1194 linebreak_reset(lbobj);
1195
1196 if (lenp != NULL)
1197 *lenp = reslen;
1198 return results;
1199 }
1200
linebreak_break_partial(linebreak_t * lbobj,unistr_t * input)1201 gcstring_t **linebreak_break_partial(linebreak_t * lbobj, unistr_t * input)
1202 {
1203 return _break_partial(lbobj, input, NULL, (input == NULL));
1204 }
1205
1206 /**
1207 * Perform line breaking algorithm on complete input.
1208 *
1209 * This function will consume heap size proportional to input size.
1210 * linebreak_break() is highly recommended.
1211 *
1212 * @param[in] lbobj linebreak object.
1213 * @param[in] input Unicode string.
1214 * @return array of broken grapheme cluster strings terminated by NULL.
1215 * If internal error occurred, lbobj->errnum is set then NULL is returned.
1216 */
linebreak_break_fast(linebreak_t * lbobj,unistr_t * input)1217 gcstring_t **linebreak_break_fast(linebreak_t * lbobj, unistr_t * input)
1218 {
1219 gcstring_t **ret;
1220
1221 if (input == NULL) {
1222 if ((ret = malloc(sizeof(gcstring_t *))) == NULL)
1223 lbobj->errnum = errno ? errno : ENOMEM;
1224 else
1225 ret[0] = NULL;
1226 return ret;
1227 }
1228
1229 return _break_partial(lbobj, input, NULL, 1);
1230 }
1231
1232 #define PARTIAL_LENGTH (1000)
1233
1234 /** Perform line breaking algorithm on complete input.
1235 *
1236 * This function will consume constant size of heap.
1237 *
1238 * @param[in] lbobj linebreak object.
1239 * @param[in] input Unicode string.
1240 * @return array of broken grapheme cluster strings terminated by NULL.
1241 * If internal error occurred, lbobj->errnum is set then NULL is returned.
1242 */
linebreak_break(linebreak_t * lbobj,unistr_t * input)1243 gcstring_t **linebreak_break(linebreak_t * lbobj, unistr_t * input)
1244 {
1245 unistr_t unistr = { NULL, 0 };
1246 gcstring_t **ret, **appe, **r;
1247 size_t i, j, k, retlen, appelen;
1248
1249 if ((ret = malloc(sizeof(gcstring_t *))) == NULL) {
1250 lbobj->errnum = errno ? errno : ENOMEM;
1251 return NULL;
1252 } else
1253 ret[0] = NULL;
1254 if (input == NULL)
1255 return ret;
1256 retlen = 0;
1257
1258 unistr.len = PARTIAL_LENGTH;
1259 for (k = 0; PARTIAL_LENGTH < input->len - k; k += PARTIAL_LENGTH) {
1260 unistr.str = input->str + k;
1261 if ((appe = _break_partial(lbobj, &unistr, &appelen, 0)) == NULL) {
1262 for (i = 0; i < retlen; i++)
1263 gcstring_destroy(ret[i]);
1264 free(ret);
1265 return NULL;
1266 }
1267 if (appelen) {
1268 if ((r = realloc(ret,
1269 sizeof(gcstring_t *) *
1270 (retlen + appelen + 1))) == NULL) {
1271 lbobj->errnum = errno ? errno : ENOMEM;
1272 for (i = 0; i < retlen; i++)
1273 gcstring_destroy(ret[i]);
1274 free(ret);
1275 for (j = 0; j < appelen; j++)
1276 gcstring_destroy(appe[j]);
1277 free(appe);
1278 return NULL;
1279 } else
1280 ret = r;
1281 memcpy(ret + retlen, appe,
1282 sizeof(gcstring_t *) * (appelen + 1));
1283 retlen += appelen;
1284 }
1285 free(appe);
1286 }
1287 unistr.len = input->len - k;
1288 unistr.str = input->str + k;
1289 if (k < input->len) {
1290 if ((appe = _break_partial(lbobj, &unistr, &appelen, 1)) == NULL) {
1291 for (i = 0; i < retlen; i++)
1292 gcstring_destroy(ret[i]);
1293 free(ret);
1294 return NULL;
1295 }
1296 if (appelen) {
1297 if ((r = realloc(ret,
1298 sizeof(gcstring_t *) *
1299 (retlen + appelen + 1))) == NULL) {
1300 lbobj->errnum = errno ? errno : ENOMEM;
1301 for (i = 0; i < retlen; i++)
1302 gcstring_destroy(ret[i]);
1303 free(ret);
1304 for (j = 0; j < appelen; j++)
1305 gcstring_destroy(appe[j]);
1306 free(appe);
1307 return NULL;
1308 } else
1309 ret = r;
1310 memcpy(ret + retlen, appe,
1311 sizeof(gcstring_t *) * (appelen + 1));
1312 retlen += appelen;
1313 }
1314 free(appe);
1315 }
1316
1317 return ret;
1318 }
1319
1320 /** Perform line breaking algorithm on UTF-8 text
1321 *
1322 * This function will consume constant size of heap.
1323 *
1324 * @param[in] lbobj linebreak object.
1325 * @param[in] input UTF-8 string, must not be NULL.
1326 * @param[in] len length of UTF-8 string.
1327 * @param[in] check check input. See sombok_decode_utf8().
1328 * @return array of broken grapheme cluster strings terminated by NULL.
1329 * If internal error occurred, lbobj->errnum is set then NULL is returned.
1330 */
linebreak_break_from_utf8(linebreak_t * lbobj,char * input,size_t len,int check)1331 gcstring_t **linebreak_break_from_utf8(linebreak_t * lbobj,
1332 char *input, size_t len, int check)
1333 {
1334 unistr_t unistr = { NULL, 0 };
1335 gcstring_t **ret;
1336
1337 if (input == NULL) {
1338 lbobj->errnum = EINVAL;
1339 return NULL;
1340 }
1341 if (sombok_decode_utf8(&unistr, 0, input, len, check) == NULL)
1342 return NULL;
1343
1344 ret = linebreak_break(lbobj, &unistr);
1345 free(unistr.str);
1346 return ret;
1347 }
1348
/**
 * Free an array of results.
 *
 * @param[in] result array of grapheme cluster strings terminated by NULL.
 * If NULL, nothing is done.
 * @param[in] deep if non-zero, each element is destroyed with
 * gcstring_destroy() before the array itself is freed.
 */
void linebreak_free_result(gcstring_t ** result, int deep)
{
    gcstring_t **item;

    if (result != NULL) {
        if (deep) {
            /* Walk up to, but not including, the NULL terminator. */
            item = result;
            while (*item != NULL) {
                gcstring_destroy(*item);
                item++;
            }
        }
        free(result);
    }
}
1360