1 /*
2 * gcstring.c - implementation of grapheme cluster string.
3 *
4 * Copyright (C) 2009-2012 by Hatuka*nezumi - IKEDA Soji.
5 *
6 * This file is part of the Sombok Package. This program is free
7 * software; you can redistribute it and/or modify it under the terms of
8 * either the GNU General Public License or the Artistic License, as
9 * specified in the README file.
10 *
11 */
12
13 #include "sombok_constants.h"
14 #include "sombok.h"
15
16 /** @defgroup gcstring gcstring
17 * @brief Grapheme cluster string
18 *@{*/
19
/*
 * Number of columns contributed by a character whose East_Asian_Width
 * class is e, for linebreak object o:
 *  - EA_A: 2 when LINEBREAK_OPTION_EASTASIAN_CONTEXT is set in o->options,
 *    otherwise 1;
 *  - EA_F, EA_W: 2;
 *  - EA_Z, EA_ZA, EA_ZW: 0;
 *  - anything else: 1.
 * Note that e is evaluated more than once.
 */
#define eaw2col(o, e) \
    ((e) == EA_A ? \
     (((o)->options & LINEBREAK_OPTION_EASTASIAN_CONTEXT) ? 2 : 1) : \
     (((e) == EA_F || (e) == EA_W)? 2: \
      (((e) == EA_Z || (e) == EA_ZA || (e) == EA_ZW)? 0: 1)))
/*
 * True when Grapheme_Cluster_Break value g is one of Extend, SpacingMark
 * or Virama.  Note that g is evaluated more than once.
 */
#define IS_EXTENDER(g) \
    ((g) == GB_Extend || (g) == GB_SpacingMark || (g) == GB_Virama)
27
/*
 * Determine the grapheme cluster that starts at str->str[pos] and store
 * its properties into *gc:
 *  - gc->len:  number of Unicode characters in the cluster;
 *  - gc->col:  number of columns of the cluster;
 *  - gc->lbc:  line breaking class of the first character (SA is resolved
 *              to CM or AL unless kept for Thai under USE_LIBTHAI);
 *  - gc->elbc: line breaking class taken from a following character of
 *              the cluster, or PROP_UNKNOWN if none was recorded.
 * gc->idx and gc->flag are not touched.
 *
 * If str is NULL or empty, a zero-length cluster with unknown classes is
 * stored.  Otherwise pos is not range-checked: str->str[pos] is read
 * unconditionally, so the caller must pass pos < str->len.
 */
static
void _gcinfo(linebreak_t * obj, unistr_t * str, size_t pos, gcchar_t * gc)
{
    propval_t glbc = PROP_UNKNOWN, elbc = PROP_UNKNOWN;
    /*
     * glen: characters consumed so far.
     * gcol: columns of the current grapheme base.
     * pcol: columns accumulated from bases replaced after a Prepend.
     * ecol: columns accumulated from extenders.
     */
    size_t glen, gcol, pcol, ecol;
    /* gcb is the GCB of the last consumed character, ngcb of the next. */
    propval_t lbc, eaw, gcb, ngcb, scr;

    if (!str || !str->str || !str->len) {
        gc->len = 0;
        gc->col = 0;
        gc->lbc = PROP_UNKNOWN;
        gc->elbc = PROP_UNKNOWN;
        return;
    }

    /* Consume the first character of the cluster. */
    linebreak_charprop(obj, str->str[pos], &lbc, &eaw, &gcb, &scr);
    pos++;
    glen = 1;

    if (gcb == GB_V || gcb == GB_T)
        /* isolated hangul jamo is wide, though part of them are
         * neutral (N). */
        gcol = 2;
    else
        gcol = eaw2col(obj, eaw);

    /* Resolve the class of the first character: SA becomes CM for
     * extenders and AL otherwise, except for Thai with USE_LIBTHAI. */
    if (lbc != LB_SA)
        glbc = lbc;
#ifdef USE_LIBTHAI
    else if (scr == SC_Thai)
        glbc = lbc;
#endif                          /* USE_LIBTHAI */
    else if (IS_EXTENDER(gcb))
        glbc = LB_CM;
    else
        glbc = LB_AL;

    switch (gcb) {
    case GB_LF:                /* GB5 */
        break;                  /* switch (gcb) */

    case GB_CR:                /* GB3, GB4, GB5 */
        /* Only a directly following LF is joined to CR. */
        if (pos < str->len) {
            linebreak_charprop(obj, str->str[pos], NULL, &eaw, &gcb, NULL);
            if (gcb == GB_LF) {
                pos++;
                glen++;
                gcol += eaw2col(obj, eaw);
            }
        }
        break;                  /* switch (gcb) */

    case GB_Control:           /* GB4 */
        break;                  /* switch (gcb) */

    default:
        pcol = 0;
        ecol = 0;
        /* Keep consuming characters while a no-break rule applies between
         * the last consumed character (gcb) and the next one (ngcb). */
        while (pos < str->len) {        /* GB2 */
            linebreak_charprop(obj, str->str[pos], &lbc, &eaw, &ngcb,
                               &scr);

            /* Legacy-CM: Treat SP CM+ as if it were ID.  cf. [UAX #14] 9.1. */
            if (glbc == LB_SP) {
                if ((obj->options & LINEBREAK_OPTION_LEGACY_CM) &&
                    IS_EXTENDER(ngcb) &&
                    (lbc == LB_CM || lbc == LB_SA)) {
                    /* From now on glbc is no longer SP, so following
                     * iterations go through the branches below. */
                    glbc = LB_ID;

                    /* isolated "wide" nonspacing marks will be wide. */
                    if (eaw == EA_ZW &&
                        (obj->options &
                         LINEBREAK_OPTION_WIDE_NONSPACING_W)) {
                        if (gcol < 2)
                            gcol = 2;
                    }
#if 0                           /* XXX */
                    else if (eaw == EA_ZA &&
                             (obj->options &
                              LINEBREAK_OPTION_WIDE_NONSPACING_A)) {
                        if (gcol < 2)
                            gcol = 2;
                    }
#endif                          /* 0 */
                    else
                        ecol += eaw2col(obj, eaw);
                } else
                    /* prevent degenerate case. */
                    break;      /* while (pos < str->len) */
            }
            /* GB5 */
            else if (ngcb == GB_Control || ngcb == GB_CR || ngcb == GB_LF)
                break;          /* while (pos < str->len) */
            /* GB6 - GB8 */
            /*
             * Assume hangul syllable block is always wide, while most of
             * isolated junseong (gcb:V) and jongseong (gcb:T) are neutral
             * (eaw:N).
             */
            else if ((gcb == GB_L &&
                      (ngcb == GB_L || ngcb == GB_V || ngcb == GB_LV ||
                       ngcb == GB_LVT)) ||
                     ((gcb == GB_LV || gcb == GB_V) &&
                      (ngcb == GB_V || ngcb == GB_T)) ||
                     ((gcb == GB_LVT || gcb == GB_T) && ngcb == GB_T)) {
                gcol = 2;
                elbc = lbc;
            }
            /* GB8a */
            else if (gcb == GB_Regional_Indicator &&
                     ngcb == GB_Regional_Indicator) {
                /* Fold pending extender columns into the base width. */
                gcol += ecol + eaw2col(obj, eaw);
                ecol = 0;
                elbc = lbc;
            }
            /* GB9, GB9a */
            else if (IS_EXTENDER(ngcb)) {
                ecol += eaw2col(obj, eaw);
                /* CM in grapheme extender is ignored.  Virama is CM. */
                /* SA in g. ext. is resolved to CM so it is ignored. */
                if (lbc != LB_CM && lbc != LB_SA)
                    elbc = lbc;
            }
            /* GB9b */
            else if (gcb == GB_Prepend) {
                /* Here, next char shall grapheme base (or additional prepend
                 * character), since its GCB property is neither Control,
                 * Extend, SpacingMark, and Virama */
                if (lbc != LB_SA)
                    elbc = lbc;
#ifdef USE_LIBTHAI
                else if (scr == SC_Thai)
                    elbc = lbc; /* SA char in g. base is not resolved... */
#endif                          /* USE_LIBTHAI */
                else
                    elbc = LB_AL;       /* ...or resolved to AL. */
                /* The width so far is kept in pcol; gcol restarts with
                 * the width of the new base. */
                pcol += gcol;
                if (ngcb == GB_V || ngcb == GB_T)
                    /* isolated hangul jamo with prepend character, though
                     * it may be degenerate case. */
                    gcol = 2;
                else
                    gcol = eaw2col(obj, eaw);
            }
            /* Virama rule: \p{ccc:Virama} × \p{gc:Letter} */
            else if (gcb == GB_Virama && ngcb == GB_OtherLetter &&
                     obj->options & LINEBREAK_OPTION_VIRAMA_AS_JOINER) {
                /* OtherLetter is not grapheme extender. */
                gcol += ecol + eaw2col(obj, eaw);
                ecol = 0;
                if (lbc != LB_SA)
                    elbc = lbc;
#ifdef USE_LIBTHAI
                else if (scr == SC_Thai)
                    elbc = lbc; /* SA char in g. base is not resolved... */
#endif                          /* USE_LIBTHAI */
                else
                    elbc = LB_AL;       /* ...or resolved to AL. */
            }
            /* GB10 */
            else
                break;          /* while (pos < str->len) */

            /* The next character belongs to this cluster: consume it. */
            pos++;
            glen++;
            gcb = ngcb;
        }                       /* while (pos < str->len) */
        /* Total width is base + prepended bases + extenders. */
        gcol += pcol + ecol;
        break;                  /* switch (gcb) */
    }                           /* switch (gcb) */

    gc->len = glen;
    gc->col = gcol;
    gc->lbc = glbc;
    gc->elbc = elbc;
}
204
205 /*
206 * Exports
207 */
208
209 /** Constructor
210 *
211 * Create new grapheme cluster string from Unicode string.
212 * Use gcstring_newcopy() if you wish to copy buffer of Unicode string.
213 * @param[in] unistr Unicode string. NULL may be given as zero-length string.
214 * @param[in] lbobj linebreak object.
215 * @return New grapheme cluster string sharing str buffer with unistr.
216 * If error occurred, errno is set then NULL is returned.
217 *
218 * option bits of lbobj:
219 * - if LINEBREAK_OPTION_EASTASIAN_CONTEXT bit is set,
220 * LB_AI and EA_A are resolved to LB_ID and EA_F. Otherwise, LB_AL and EA_N,
221 * respectively.
222 * - if LINEBREAK_OPTION_LEGACY_CM bit is set,
223 * combining mark lead by a SPACE is isolated combining mark (ID).
224 * Otherwise, such sequences are treated as degenerate cases.
225 * - if LINEBREAK_OPTION_VIRAMA_AS_JOINER bit is set,
226 * virama and other letter are not broken.
227 */
gcstring_new(unistr_t * unistr,linebreak_t * lbobj)228 gcstring_t *gcstring_new(unistr_t * unistr, linebreak_t * lbobj)
229 {
230 gcstring_t *gcstr;
231 size_t len;
232
233 if ((gcstr = malloc(sizeof(gcstring_t))) == NULL)
234 return NULL;
235 gcstr->str = NULL;
236 gcstr->len = 0;
237 gcstr->gcstr = NULL;
238 gcstr->gclen = 0;
239 gcstr->pos = 0;
240 if (lbobj == NULL) {
241 if ((gcstr->lbobj = linebreak_new(NULL)) == NULL) {
242 free(gcstr);
243 return NULL;
244 }
245 } else
246 gcstr->lbobj = linebreak_incref(lbobj);
247
248 if (unistr == NULL || unistr->str == NULL || unistr->len == 0)
249 return gcstr;
250 gcstr->str = unistr->str;
251 gcstr->len = len = unistr->len;
252
253 if (len) {
254 size_t pos;
255 gcchar_t *gc, *_g;
256
257 if ((gcstr->gcstr = malloc(sizeof(gcchar_t) * len)) == NULL) {
258 gcstr->str = NULL;
259 gcstring_destroy(gcstr);
260 return NULL;
261 }
262 for (pos = 0, gc = gcstr->gcstr;
263 pos < len;
264 pos += gc->len, gcstr->gclen++, gc++) {
265 gc->flag = 0;
266 gc->idx = pos;
267 _gcinfo(gcstr->lbobj, unistr, pos, gc);
268 }
269 if ((_g = realloc(gcstr->gcstr, sizeof(gcchar_t) * gcstr->gclen))
270 == NULL) {
271 gcstr->str = NULL;
272 gcstring_destroy(gcstr);
273 return NULL;
274 } else
275 gcstr->gcstr = _g;
276 }
277
278 return gcstr;
279 }
280
281 /** Constructor copying Unicode string.
282 *
283 * Create new grapheme cluster string from Unicode string.
284 * Use gcstring_new() if you wish not to copy buffer of Unicode string.
285 * @param[in] str Unicode string. NULL may be given as zero-length string.
286 * @param[in] lbobj linebreak object.
287 * @return New grapheme cluster string.
288 * If error occurred, errno is set then NULL is returned.
289 */
gcstring_newcopy(unistr_t * str,linebreak_t * lbobj)290 gcstring_t *gcstring_newcopy(unistr_t * str, linebreak_t * lbobj)
291 {
292 unistr_t unistr = { NULL, 0 };
293
294 if (str->str && str->len) {
295 if ((unistr.str = malloc(sizeof(unichar_t) * str->len)) == NULL)
296 return NULL;
297 memcpy(unistr.str, str->str, sizeof(unichar_t) * str->len);
298 unistr.len = str->len;
299 }
300 return gcstring_new(&unistr, lbobj);
301 }
302
303 /** Constructor from UTF-8 string
304 *
305 * Create new grapheme cluster string from UTF-8 string.
306 * @param[in] str buffer of UTF-8 string, must not be NULL.
307 * @param[in] len length of UTF-8 string.
308 * @param[in] check check input. See sombok_decode_utf8().
309 * @param[in] lbobj linebreak object.
310 * @return New grapheme cluster string.
311 * If error occurred, errno is set then NULL is returned.
312 * Source string buffer would not be modified.
313 */
gcstring_new_from_utf8(char * str,size_t len,int check,linebreak_t * lbobj)314 gcstring_t *gcstring_new_from_utf8(char *str, size_t len, int check,
315 linebreak_t * lbobj)
316 {
317 unistr_t unistr = { NULL, 0 };
318
319 if (str == NULL) {
320 errno = EINVAL;
321 return NULL;
322 }
323 if (sombok_decode_utf8(&unistr, 0, str, len, check) == NULL)
324 return NULL;
325
326 return gcstring_new(&unistr, lbobj);
327 }
328
329 /** Destructor
330 *
331 * Free memories allocated for grapheme cluster string.
332 * @param[in] gcstr grapheme cluster string.
333 * @return none.
334 * If gcstr was NULL, do nothing.
335 */
gcstring_destroy(gcstring_t * gcstr)336 void gcstring_destroy(gcstring_t * gcstr)
337 {
338 if (gcstr == NULL)
339 return;
340 free(gcstr->str);
341 free(gcstr->gcstr);
342 linebreak_destroy(gcstr->lbobj);
343 free(gcstr);
344 }
345
346 /** Copy Constructor
347 *
348 * Create deep copy of grapheme cluster string.
349 * @param[in] gcstr grapheme cluster string, must not be NULL.
350 * @return deep copy of grapheme cluster string.
351 * If error occurred, errno is set then NULL is returned.
352 */
gcstring_copy(gcstring_t * gcstr)353 gcstring_t *gcstring_copy(gcstring_t * gcstr)
354 {
355 gcstring_t *new;
356 unichar_t *newstr = NULL;
357 gcchar_t *newgcstr = NULL;
358
359 if (gcstr == NULL)
360 return (errno = EINVAL), NULL;
361
362 if ((new = malloc(sizeof(gcstring_t))) == NULL)
363 return NULL;
364 memcpy(new, gcstr, sizeof(gcstring_t));
365
366 if (gcstr->str && gcstr->len) {
367 if ((newstr = malloc(sizeof(unichar_t) * gcstr->len)) == NULL) {
368 free(new);
369 return NULL;
370 }
371 memcpy(newstr, gcstr->str, sizeof(unichar_t) * gcstr->len);
372 }
373 new->str = newstr;
374 if (gcstr->gcstr && gcstr->gclen) {
375 if ((newgcstr = malloc(sizeof(gcchar_t) * gcstr->gclen)) == NULL) {
376 free(new->str);
377 free(new);
378 return NULL;
379 }
380 memcpy(newgcstr, gcstr->gcstr, sizeof(gcchar_t) * gcstr->gclen);
381 }
382 new->gcstr = newgcstr;
383 if (gcstr->lbobj == NULL) {
384 if ((new->lbobj = linebreak_new(NULL)) == NULL) {
385 gcstring_destroy(new);
386 return NULL;
387 }
388 } else
389 new->lbobj = linebreak_incref(gcstr->lbobj);
390 new->pos = 0;
391
392 return new;
393 }
394
395 /** Append
396 *
397 * Modify grapheme cluster string by appending another string.
398 * @param[in] gcstr target grapheme cluster string, must not be NULL.
399 * @param[in] appe grapheme cluster string to be appended.
400 * NULL means null string therefore gcstr won't be modified.
401 * @return Modified grapheme cluster string gcstr itself (not a copy).
402 * If error occurred, errno is set then NULL is returned.
403 */
gcstring_append(gcstring_t * gcstr,gcstring_t * appe)404 gcstring_t *gcstring_append(gcstring_t * gcstr, gcstring_t * appe)
405 {
406 unistr_t ustr = { NULL, 0 };
407
408 if (gcstr == NULL)
409 return (errno = EINVAL), NULL;
410 if (appe == NULL || appe->str == NULL || appe->len == 0)
411 return gcstr;
412 if (gcstr->gclen && appe->gclen) {
413 size_t aidx, alen, blen, newlen, newgclen, i;
414 unsigned char bflag;
415 gcstring_t *cstr;
416 unichar_t *_u;
417 gcchar_t *_g;
418
419 aidx = gcstr->gcstr[gcstr->gclen - 1].idx;
420 alen = gcstr->gcstr[gcstr->gclen - 1].len;
421 blen = appe->gcstr[0].len;
422 bflag = appe->gcstr[0].flag;
423
424 if ((ustr.str = malloc(sizeof(unichar_t) * (alen + blen))) == NULL)
425 return NULL;
426 memcpy(ustr.str, gcstr->str + aidx, sizeof(unichar_t) * alen);
427 memcpy(ustr.str + alen, appe->str, sizeof(unichar_t) * blen);
428 ustr.len = alen + blen;
429 if ((cstr = gcstring_new(&ustr, gcstr->lbobj)) == NULL) {
430 free(ustr.str);
431 return NULL;
432 }
433
434 newlen = gcstr->len + appe->len;
435 newgclen = gcstr->gclen - 1 + cstr->gclen + appe->gclen - 1;
436 if ((_u = realloc(gcstr->str, sizeof(unichar_t) * newlen)) == NULL) {
437 gcstring_destroy(cstr);
438 return NULL;
439 } else
440 gcstr->str = _u;
441 if ((_g = realloc(gcstr->gcstr,
442 sizeof(gcchar_t) * newgclen)) == NULL) {
443 gcstring_destroy(cstr);
444 return NULL;
445 } else
446 gcstr->gcstr = _g;
447 memcpy(gcstr->str + gcstr->len, appe->str,
448 sizeof(unichar_t) * appe->len);
449 for (i = 0; i < cstr->gclen; i++) {
450 gcchar_t *gc = gcstr->gcstr + gcstr->gclen - 1 + i;
451
452 gc->idx = cstr->gcstr[i].idx + aidx;
453 gc->len = cstr->gcstr[i].len;
454 gc->col = cstr->gcstr[i].col;
455 gc->lbc = cstr->gcstr[i].lbc;
456 gc->elbc = cstr->gcstr[i].elbc;
457 if (aidx + alen == gc->idx) /* Restore flag if possible */
458 gc->flag = bflag;
459 }
460 for (i = 1; i < appe->gclen; i++) {
461 gcchar_t *gc =
462 gcstr->gcstr + gcstr->gclen - 1 + cstr->gclen + i - 1;
463 gc->idx = appe->gcstr[i].idx - blen + aidx + cstr->len;
464 gc->len = appe->gcstr[i].len;
465 gc->col = appe->gcstr[i].col;
466 gc->lbc = appe->gcstr[i].lbc;
467 gc->elbc = appe->gcstr[i].elbc;
468 gc->flag = appe->gcstr[i].flag;
469 }
470
471 gcstr->len = newlen;
472 gcstr->gclen = newgclen;
473 gcstring_destroy(cstr);
474 } else if (appe->gclen) {
475 if ((gcstr->str = malloc(sizeof(unichar_t) * appe->len)) == NULL)
476 return NULL;
477 if ((gcstr->gcstr =
478 malloc(sizeof(gcchar_t) * appe->gclen)) == NULL) {
479 free(gcstr->str);
480 return NULL;
481 }
482 memcpy(gcstr->str, appe->str, sizeof(unichar_t) * appe->len);
483 gcstr->len = appe->len;
484 memcpy(gcstr->gcstr, appe->gcstr, sizeof(gcchar_t) * appe->gclen);
485 gcstr->gclen = appe->gclen;
486
487 gcstr->pos = 0;
488 }
489
490 return gcstr;
491 }
492
493 /** Compare
494 *
495 * Compare grapheme cluster strings.
496 * @param[in] a grapheme cluster string.
497 * @param[in] b grapheme cluster string.
498 * @return positive, zero or negative value when a is greater, equal to, lesser than b, respectively.
499 */
gcstring_cmp(gcstring_t * a,gcstring_t * b)500 int gcstring_cmp(gcstring_t * a, gcstring_t * b)
501 {
502 size_t i;
503
504 if (!a->len || !b->len)
505 return (a->len ? 1 : 0) - (b->len ? 1 : 0);
506 for (i = 0; i < a->len && i < b->len; i++)
507 if (a->str[i] != b->str[i])
508 return a->str[i] - b->str[i];
509 return a->len - b->len;
510 }
511
512 /** Number of Columns
513 *
514 * Returns number of columns of grapheme cluster strings determined by built-in character database according to UAX #11.
515 * @param[in] gcstr grapheme cluster string. NULL may mean null string.
516 * @return Number of columns.
517 */
gcstring_columns(gcstring_t * gcstr)518 size_t gcstring_columns(gcstring_t * gcstr)
519 {
520 size_t col, i;
521
522 if (gcstr == NULL)
523 return 0;
524 for (col = 0, i = 0; i < gcstr->gclen; i++)
525 col += gcstr->gcstr[i].col;
526 return col;
527 }
528
529 /** Concatenate
530 *
531 * Create new grapheme cluster string which is concatination of two strings.
532 * @param[in] gcstr grapheme cluster string, must not be NULL.
533 * @param[in] appe grapheme cluster string to be appended. NULL means null
534 * string.
535 * @return New grapheme cluster string.
536 * If error occurred, errno is set then NULL is returned.
537 */
gcstring_concat(gcstring_t * gcstr,gcstring_t * appe)538 gcstring_t *gcstring_concat(gcstring_t * gcstr, gcstring_t * appe)
539 {
540 gcstring_t *new;
541 size_t pos;
542
543 if (gcstr == NULL)
544 return (errno = EINVAL), NULL;
545 pos = gcstr->pos;
546 if ((new = gcstring_copy(gcstr)) == NULL)
547 return NULL;
548 new->pos = pos;
549 return gcstring_append(new, appe);
550 }
551
552 /** Iterator
553 *
554 * Returns pointer to next grapheme cluster of grapheme cluster string.
555 * Next position will be incremented.
556 * @param[in] gcstr grapheme cluster string.
557 * @return Pointer to grapheme cluster.
558 * If pointer was already at end of the string, NULL will be returned.
559 */
gcstring_next(gcstring_t * gcstr)560 gcchar_t *gcstring_next(gcstring_t * gcstr)
561 {
562 if (gcstr->gclen <= gcstr->pos)
563 return NULL;
564 return gcstr->gcstr + (gcstr->pos++);
565 }
566
567 /** Set Next Position
568 *
569 * Set next position of grapheme cluster string.
570 * @param[in] gcstr grapheme cluster string.
571 * @param[in] pos New position.
572 * @return none.
573 * If pos is out of range of string, position won't be updated.
574 *
575 * @todo On next major release, pos would be ssize_t, not int.
576 */
gcstring_setpos(gcstring_t * gcstr,int pos)577 void gcstring_setpos(gcstring_t * gcstr, int pos)
578 {
579 if (pos < 0)
580 pos += gcstr->gclen;
581 if (pos < 0 || gcstr->gclen < pos)
582 return;
583 gcstr->pos = pos;
584 }
585
586 /** Shrink
587 *
588 * Modify grapheme cluster string to shrink its length.
589 * Length is specified by number of grapheme clusters.
590 * @param[in] gcstr grapheme cluster string.
591 * @param[in] length New length.
592 * @return none.
593 * If gcstr was NULL, do nothing.
594 *
595 * @todo On next major release, length would be ssize_t, not int.
596 */
gcstring_shrink(gcstring_t * gcstr,int length)597 void gcstring_shrink(gcstring_t * gcstr, int length)
598 {
599 if (gcstr == NULL)
600 return;
601
602 if (length < 0)
603 length += gcstr->gclen;
604
605 if (length <= 0) {
606 free(gcstr->str);
607 gcstr->str = NULL;
608 gcstr->len = 0;
609 free(gcstr->gcstr);
610 gcstr->gcstr = NULL;
611 gcstr->gclen = 0;
612 } else if (gcstr->gclen <= length)
613 return;
614 else {
615 gcstr->len = gcstr->gcstr[length].idx;
616 gcstr->gclen = length;
617 }
618 }
619
620 /** Substring
621 *
622 * Returns substring of grapheme cluster string.
623 * Offset and length are specified by number of grapheme clusters.
624 * @param[in] gcstr grapheme cluster string. Must not be NULL.
625 * @param[in] offset Offset of substring.
626 * @param[in] length Length of substring.
627 * @return (newly allocated) substring.
628 * If error occurred, errno is set to non-zero then NULL is returned.
629 *
630 * @todo On next major release, offset and length would be ssize_t, not int.
631 */
gcstring_substr(gcstring_t * gcstr,int offset,int length)632 gcstring_t *gcstring_substr(gcstring_t * gcstr, int offset, int length)
633 {
634 gcstring_t *new;
635 size_t ulength, i;
636
637 if (gcstr == NULL)
638 return (errno = EINVAL), NULL;
639
640 /* adjust offset. */
641 if (offset < 0)
642 offset += gcstr->gclen;
643 if (offset < 0) {
644 length += offset;
645 offset = 0;
646 }
647 if (length < 0)
648 length += gcstr->gclen - offset;
649
650 if (length < 0 || gcstr->gclen < offset) /* out of range */
651 return gcstring_new(NULL, gcstr->lbobj);
652
653 if (gcstr->gclen == offset)
654 length = 0;
655 else if (gcstr->gclen <= offset + length)
656 length = gcstr->gclen - offset;
657
658 /* create substring. */
659
660 if (gcstr->gclen == offset)
661 ulength = 0;
662 else if (gcstr->gclen <= offset + length)
663 ulength = gcstr->len - gcstr->gcstr[offset].idx;
664 else
665 ulength =
666 gcstr->gcstr[offset + length].idx - gcstr->gcstr[offset].idx;
667
668 if ((new = gcstring_new(NULL, gcstr->lbobj)) == NULL)
669 return NULL;
670
671 if (ulength == 0);
672 else if ((new->str = malloc(sizeof(unichar_t) * ulength)) == NULL) {
673 gcstring_destroy(new);
674 return NULL;
675 }
676 if (length == 0);
677 else if ((new->gcstr = malloc(sizeof(gcchar_t) * length)) == NULL) {
678 free(new->str);
679 gcstring_destroy(new);
680 return NULL;
681 }
682 if (ulength != 0)
683 memcpy(new->str, gcstr->str + gcstr->gcstr[offset].idx,
684 sizeof(unichar_t) * ulength);
685 new->len = ulength;
686 for (i = 0; i < length; i++) {
687 memcpy(new->gcstr + i, gcstr->gcstr + offset + i,
688 sizeof(gcchar_t));
689 new->gcstr[i].idx -= gcstr->gcstr[offset].idx;
690 }
691 new->gclen = length;
692
693 return new;
694 }
695
696 /** Replace substring
697 *
698 * Replace substring og grapheme cluster string.
699 * Offset and length are specified by number of grapheme clusters.
700 * @param[in,out] gcstr grapheme cluster string. Must not be NULL.
701 * @param[in] offset Offset of substring.
702 * @param[in] length Length of substring.
703 * offset and length must not be out of range.
704 * @param[in] replacement If this was not NULL, modify grapheme cluster string by replacing substring with it.
705 * @return modified gcstr itself (not a copy of it).
706 * If error occurred, errno is set to non-zero then NULL is returned.
707 *
708 * @todo On next major release, offset and length would be ssize_t, not int.
709 */
gcstring_replace(gcstring_t * gcstr,int offset,int length,gcstring_t * replacement)710 gcstring_t *gcstring_replace(gcstring_t * gcstr, int offset, int length,
711 gcstring_t * replacement)
712 {
713 gcstring_t *tail;
714
715 if (gcstr == NULL)
716 return (errno = EINVAL), NULL;
717
718 /* without replacement: meaningless. return immedately. */
719 if (replacement == NULL)
720 return gcstr;
721
722 /* adjust offset. */
723 if (offset < 0)
724 offset += gcstr->gclen;
725 if (offset < 0) {
726 length += offset;
727 offset = 0;
728 }
729 if (length < 0)
730 length += gcstr->gclen - offset;
731
732 if (length < 0 || gcstr->gclen < offset) /* out of range */
733 return (errno = EINVAL), NULL;
734
735 if (gcstr->gclen == offset)
736 length = 0;
737 else if (gcstr->gclen <= offset + length)
738 length = gcstr->gclen - offset;
739
740 /* returns modified gcstr itself. */
741
742 if ((tail = gcstring_substr(gcstr, offset + length,
743 gcstr->gclen - (offset + length))) == NULL)
744 return NULL;
745 gcstring_shrink(gcstr, offset);
746 if (gcstring_append(gcstr, replacement) == NULL) {
747 gcstring_destroy(tail);
748 return NULL;
749 }
750 if (gcstring_append(gcstr, tail) == NULL) {
751 gcstring_destroy(tail);
752 return NULL;
753 }
754 gcstring_destroy(tail);
755 return gcstr;
756 }
757
758 /** Get Line Breaking Class of grapheme base
759 *
760 * Get UAX #14 line breaking class of grapheme base.
761 * @param[in] gcstr grapheme cluster string, must not be NULL.
762 * @param[in] pos position.
763 * @return line breaking class property value.
764 *
765 * @note Introduced by sombok 2.2.
766 */
gcstring_lbclass(gcstring_t * gcstr,int pos)767 propval_t gcstring_lbclass(gcstring_t * gcstr, int pos)
768 {
769 if (pos < 0)
770 pos += gcstr->gclen;
771 if (pos < 0 || gcstr->gclen == 0 || gcstr->gclen <= pos)
772 return PROP_UNKNOWN;
773 return gcstr->gcstr[pos].lbc;
774 }
775
776 /** Get Line Breaking Class of grapheme extender
777 *
778 * Get UAX #14 line breaking class of grapheme extender.
779 * If it is CM, get one of grapheme base.
780 * @param[in] gcstr grapheme cluster string, must not be NULL.
781 * @param[in] pos position.
782 * @return line breaking class property value.
783 *
784 * @note Introduced by sombok 2.2.
785 */
gcstring_lbclass_ext(gcstring_t * gcstr,int pos)786 propval_t gcstring_lbclass_ext(gcstring_t * gcstr, int pos)
787 {
788 propval_t lbc;
789
790 if (pos < 0)
791 pos += gcstr->gclen;
792 if (pos < 0 || gcstr->gclen == 0 || gcstr->gclen <= pos)
793 return PROP_UNKNOWN;
794 if ((lbc = gcstr->gcstr[pos].elbc) == PROP_UNKNOWN)
795 lbc = gcstr->gcstr[pos].lbc;
796 return lbc;
797 }
798