1 /*
2  * gcstring.c - implementation of grapheme cluster string.
3  *
4  * Copyright (C) 2009-2012 by Hatuka*nezumi - IKEDA Soji.
5  *
6  * This file is part of the Sombok Package.  This program is free
7  * software; you can redistribute it and/or modify it under the terms of
8  * either the GNU General Public License or the Artistic License, as
9  * specified in the README file.
10  *
11  */
12 
13 #include "sombok_constants.h"
14 #include "sombok.h"
15 
16 /** @defgroup gcstring gcstring
17  * @brief Grapheme cluster string
18  *@{*/
19 
/*
 * Number of columns taken by a character whose East Asian Width class is e,
 * as seen by linebreak object o:
 * - fullwidth (F) and wide (W) characters take 2 columns,
 * - nonspacing classes (Z, ZA, ZW) take no column,
 * - ambiguous (A) characters take 2 columns if the
 *   LINEBREAK_OPTION_EASTASIAN_CONTEXT bit of o->options is set, else 1,
 * - anything else takes 1 column.
 * Note that e is evaluated more than once.
 */
#define eaw2col(o, e) \
    (((e) == EA_F || (e) == EA_W) ? 2 : \
     ((e) == EA_Z || (e) == EA_ZA || (e) == EA_ZW) ? 0 : \
     ((e) != EA_A) ? 1 : \
     (((o)->options & LINEBREAK_OPTION_EASTASIAN_CONTEXT) ? 2 : 1))
/*
 * True if Grapheme_Cluster_Break value g is one of those treated as
 * grapheme extender in this file (Extend, SpacingMark or Virama).
 */
#define IS_EXTENDER(g) \
    ((g) == GB_Virama || (g) == GB_SpacingMark || (g) == GB_Extend)
27 
28 static
_gcinfo(linebreak_t * obj,unistr_t * str,size_t pos,gcchar_t * gc)29 void _gcinfo(linebreak_t * obj, unistr_t * str, size_t pos, gcchar_t * gc)
30 {
31     propval_t glbc = PROP_UNKNOWN, elbc = PROP_UNKNOWN;
32     size_t glen, gcol, pcol, ecol;
33     propval_t lbc, eaw, gcb, ngcb, scr;
34 
35     if (!str || !str->str || !str->len) {
36 	gc->len = 0;
37 	gc->col = 0;
38 	gc->lbc = PROP_UNKNOWN;
39 	gc->elbc = PROP_UNKNOWN;
40 	return;
41     }
42 
43     linebreak_charprop(obj, str->str[pos], &lbc, &eaw, &gcb, &scr);
44     pos++;
45     glen = 1;
46 
47     if (gcb == GB_V || gcb == GB_T)
48 	/* isolated hangul jamo is wide, though part of them are
49 	 * neutral (N). */
50 	gcol = 2;
51     else
52 	gcol = eaw2col(obj, eaw);
53 
54     if (lbc != LB_SA)
55 	glbc = lbc;
56 #ifdef USE_LIBTHAI
57     else if (scr == SC_Thai)
58 	glbc = lbc;
59 #endif				/* USE_LIBTHAI */
60     else if (IS_EXTENDER(gcb))
61 	glbc = LB_CM;
62     else
63 	glbc = LB_AL;
64 
65     switch (gcb) {
66     case GB_LF:		/* GB5 */
67 	break;			/* switch (gcb) */
68 
69     case GB_CR:		/* GB3, GB4, GB5 */
70 	if (pos < str->len) {
71 	    linebreak_charprop(obj, str->str[pos], NULL, &eaw, &gcb, NULL);
72 	    if (gcb == GB_LF) {
73 		pos++;
74 		glen++;
75 		gcol += eaw2col(obj, eaw);
76 	    }
77 	}
78 	break;			/* switch (gcb) */
79 
80     case GB_Control:		/* GB4 */
81 	break;			/* switch (gcb) */
82 
83     default:
84 	pcol = 0;
85 	ecol = 0;
86 	while (pos < str->len) {	/* GB2 */
87 	    linebreak_charprop(obj, str->str[pos], &lbc, &eaw, &ngcb,
88 			       &scr);
89 
90 	    /* Legacy-CM: Treat SP CM+ as if it were ID.  cf. [UAX #14] 9.1. */
91 	    if (glbc == LB_SP) {
92 		if ((obj->options & LINEBREAK_OPTION_LEGACY_CM) &&
93 		    IS_EXTENDER(ngcb) &&
94 		    (lbc == LB_CM || lbc == LB_SA)) {
95 		    glbc = LB_ID;
96 
97 		    /* isolated "wide" nonspacing marks will be wide. */
98 		    if (eaw == EA_ZW &&
99 			(obj->options &
100 			 LINEBREAK_OPTION_WIDE_NONSPACING_W)) {
101 			if (gcol < 2)
102 			    gcol = 2;
103 		    }
104 #if 0 /* XXX */
105 		    else if (eaw == EA_ZA &&
106 			     (obj->options &
107 			      LINEBREAK_OPTION_WIDE_NONSPACING_A)) {
108 			if (gcol < 2)
109 			    gcol = 2;
110 		    }
111 #endif /* 0 */
112 		    else
113 			ecol += eaw2col(obj, eaw);
114 		} else
115 		    /* prevent degenerate case. */
116 		    break;	/* while (pos < str->len) */
117 	    }
118 	    /* GB5 */
119 	    else if (ngcb == GB_Control || ngcb == GB_CR || ngcb == GB_LF)
120 		break;		/* while (pos < str->len) */
121 	    /* GB6 - GB8 */
122 	    /*
123 	     * Assume hangul syllable block is always wide, while most of
124 	     * isolated junseong (gcb:V) and jongseong (gcb:T) are neutral
125 	     * (eaw:N).
126 	     */
127 	    else if ((gcb == GB_L &&
128 		      (ngcb == GB_L || ngcb == GB_V || ngcb == GB_LV ||
129 		       ngcb == GB_LVT)) ||
130 		     ((gcb == GB_LV || gcb == GB_V) &&
131 		      (ngcb == GB_V || ngcb == GB_T)) ||
132 		     ((gcb == GB_LVT || gcb == GB_T) && ngcb == GB_T)) {
133 		gcol = 2;
134 		elbc = lbc;
135 	    }
136 	    /* GB8a */
137 	    else if (gcb == GB_Regional_Indicator &&
138 		     ngcb == GB_Regional_Indicator) {
139 		gcol += ecol + eaw2col(obj, eaw);
140 		ecol = 0;
141 		elbc = lbc;
142 	    }
143 	    /* GB9, GB9a */
144 	    else if (IS_EXTENDER(ngcb)) {
145 		ecol += eaw2col(obj, eaw);
146 		/* CM in grapheme extender is ignored.  Virama is CM. */
147 		/* SA in g. ext. is resolved to CM so it is ignored. */
148 		if (lbc != LB_CM && lbc != LB_SA)
149 		    elbc = lbc;
150 	    }
151 	    /* GB9b */
152 	    else if (gcb == GB_Prepend) {
153 		/* Here, next char shall grapheme base (or additional prepend
154 		 * character), since its GCB property is neither Control,
155 		 * Extend, SpacingMark, and Virama */
156 		if (lbc != LB_SA)
157 		    elbc = lbc;
158 #ifdef USE_LIBTHAI
159 		else if (scr == SC_Thai)
160 		    elbc = lbc;	/* SA char in g. base is not resolved... */
161 #endif				/* USE_LIBTHAI */
162 		else
163 		    elbc = LB_AL;	/* ...or resolved to AL. */
164 		pcol += gcol;
165 		if (ngcb == GB_V || ngcb == GB_T)
166 		    /* isolated hangul jamo with prepend character, though
167 		     * it may be degenerate case. */
168 		    gcol = 2;
169 		else
170 		    gcol = eaw2col(obj, eaw);
171 	    }
172 	    /* Virama rule: \p{ccc:Virama} × \p{gc:Letter} */
173 	    else if (gcb == GB_Virama && ngcb == GB_OtherLetter &&
174 		     obj->options & LINEBREAK_OPTION_VIRAMA_AS_JOINER) {
175 		/* OtherLetter is not grapheme extender. */
176 		gcol += ecol + eaw2col(obj, eaw);
177 		ecol = 0;
178 		if (lbc != LB_SA)
179 		    elbc = lbc;
180 #ifdef USE_LIBTHAI
181 		else if (scr == SC_Thai)
182 		    elbc = lbc;	/* SA char in g. base is not resolved... */
183 #endif				/* USE_LIBTHAI */
184 		else
185 		    elbc = LB_AL;	/* ...or resolved to AL. */
186 	    }
187 	    /* GB10 */
188 	    else
189 		break;		/* while (pos < str->len) */
190 
191 	    pos++;
192 	    glen++;
193 	    gcb = ngcb;
194 	}			/* while (pos < str->len) */
195 	gcol += pcol + ecol;
196 	break;			/* switch (gcb) */
197     }				/* switch (gcb) */
198 
199     gc->len = glen;
200     gc->col = gcol;
201     gc->lbc = glbc;
202     gc->elbc = elbc;
203 }
204 
205 /*
206  * Exports
207  */
208 
209 /** Constructor
210  *
211  * Create new grapheme cluster string from Unicode string.
212  * Use gcstring_newcopy() if you wish to copy buffer of Unicode string.
213  * @param[in] unistr Unicode string.  NULL may be given as zero-length string.
214  * @param[in] lbobj linebreak object.
215  * @return New grapheme cluster string sharing str buffer with unistr.
216  * If error occurred, errno is set then NULL is returned.
217  *
218  * option bits of lbobj:
219  * - if LINEBREAK_OPTION_EASTASIAN_CONTEXT bit is set,
220  *   LB_AI and EA_A are resolved to LB_ID and EA_F. Otherwise, LB_AL and EA_N,
221  *   respectively.
222  * - if LINEBREAK_OPTION_LEGACY_CM bit is set,
223  *   combining mark lead by a SPACE is isolated combining mark (ID).
224  *   Otherwise, such sequences are treated as degenerate cases.
225  * - if LINEBREAK_OPTION_VIRAMA_AS_JOINER bit is set,
226  *   virama and other letter are not broken.
227  */
gcstring_new(unistr_t * unistr,linebreak_t * lbobj)228 gcstring_t *gcstring_new(unistr_t * unistr, linebreak_t * lbobj)
229 {
230     gcstring_t *gcstr;
231     size_t len;
232 
233     if ((gcstr = malloc(sizeof(gcstring_t))) == NULL)
234 	return NULL;
235     gcstr->str = NULL;
236     gcstr->len = 0;
237     gcstr->gcstr = NULL;
238     gcstr->gclen = 0;
239     gcstr->pos = 0;
240     if (lbobj == NULL) {
241 	if ((gcstr->lbobj = linebreak_new(NULL)) == NULL) {
242 	    free(gcstr);
243 	    return NULL;
244 	}
245     } else
246 	gcstr->lbobj = linebreak_incref(lbobj);
247 
248     if (unistr == NULL || unistr->str == NULL || unistr->len == 0)
249 	return gcstr;
250     gcstr->str = unistr->str;
251     gcstr->len = len = unistr->len;
252 
253     if (len) {
254 	size_t pos;
255 	gcchar_t *gc, *_g;
256 
257 	if ((gcstr->gcstr = malloc(sizeof(gcchar_t) * len)) == NULL) {
258 	    gcstr->str = NULL;
259 	    gcstring_destroy(gcstr);
260 	    return NULL;
261 	}
262 	for (pos = 0, gc = gcstr->gcstr;
263 	     pos < len;
264 	     pos += gc->len, gcstr->gclen++, gc++) {
265 	    gc->flag = 0;
266 	    gc->idx = pos;
267 	    _gcinfo(gcstr->lbobj, unistr, pos, gc);
268 	}
269 	if ((_g = realloc(gcstr->gcstr, sizeof(gcchar_t) * gcstr->gclen))
270 	    == NULL) {
271 	    gcstr->str = NULL;
272 	    gcstring_destroy(gcstr);
273 	    return NULL;
274 	} else
275 	    gcstr->gcstr = _g;
276     }
277 
278     return gcstr;
279 }
280 
281 /** Constructor copying Unicode string.
282  *
283  * Create new grapheme cluster string from Unicode string.
284  * Use gcstring_new() if you wish not to copy buffer of Unicode string.
285  * @param[in] str Unicode string.  NULL may be given as zero-length string.
286  * @param[in] lbobj linebreak object.
287  * @return New grapheme cluster string.
288  * If error occurred, errno is set then NULL is returned.
289  */
gcstring_newcopy(unistr_t * str,linebreak_t * lbobj)290 gcstring_t *gcstring_newcopy(unistr_t * str, linebreak_t * lbobj)
291 {
292     unistr_t unistr = { NULL, 0 };
293 
294     if (str->str && str->len) {
295 	if ((unistr.str = malloc(sizeof(unichar_t) * str->len)) == NULL)
296 	    return NULL;
297 	memcpy(unistr.str, str->str, sizeof(unichar_t) * str->len);
298 	unistr.len = str->len;
299     }
300     return gcstring_new(&unistr, lbobj);
301 }
302 
303 /** Constructor from UTF-8 string
304  *
305  * Create new grapheme cluster string from UTF-8 string.
306  * @param[in] str buffer of UTF-8 string, must not be NULL.
307  * @param[in] len length of UTF-8 string.
308  * @param[in] check check input.  See sombok_decode_utf8().
309  * @param[in] lbobj linebreak object.
310  * @return New grapheme cluster string.
311  * If error occurred, errno is set then NULL is returned.
312  * Source string buffer would not be modified.
313  */
gcstring_new_from_utf8(char * str,size_t len,int check,linebreak_t * lbobj)314 gcstring_t *gcstring_new_from_utf8(char *str, size_t len, int check,
315 				   linebreak_t * lbobj)
316 {
317     unistr_t unistr = { NULL, 0 };
318 
319     if (str == NULL) {
320 	errno = EINVAL;
321 	return NULL;
322     }
323     if (sombok_decode_utf8(&unistr, 0, str, len, check) == NULL)
324 	return NULL;
325 
326     return gcstring_new(&unistr, lbobj);
327 }
328 
329 /** Destructor
330  *
331  * Free memories allocated for grapheme cluster string.
332  * @param[in] gcstr grapheme cluster string.
333  * @return none.
334  * If gcstr was NULL, do nothing.
335  */
gcstring_destroy(gcstring_t * gcstr)336 void gcstring_destroy(gcstring_t * gcstr)
337 {
338     if (gcstr == NULL)
339 	return;
340     free(gcstr->str);
341     free(gcstr->gcstr);
342     linebreak_destroy(gcstr->lbobj);
343     free(gcstr);
344 }
345 
346 /** Copy Constructor
347  *
348  * Create deep copy of grapheme cluster string.
349  * @param[in] gcstr grapheme cluster string, must not be NULL.
350  * @return deep copy of grapheme cluster string.
351  * If error occurred, errno is set then NULL is returned.
352  */
gcstring_copy(gcstring_t * gcstr)353 gcstring_t *gcstring_copy(gcstring_t * gcstr)
354 {
355     gcstring_t *new;
356     unichar_t *newstr = NULL;
357     gcchar_t *newgcstr = NULL;
358 
359     if (gcstr == NULL)
360 	return (errno = EINVAL), NULL;
361 
362     if ((new = malloc(sizeof(gcstring_t))) == NULL)
363 	return NULL;
364     memcpy(new, gcstr, sizeof(gcstring_t));
365 
366     if (gcstr->str && gcstr->len) {
367 	if ((newstr = malloc(sizeof(unichar_t) * gcstr->len)) == NULL) {
368 	    free(new);
369 	    return NULL;
370 	}
371 	memcpy(newstr, gcstr->str, sizeof(unichar_t) * gcstr->len);
372     }
373     new->str = newstr;
374     if (gcstr->gcstr && gcstr->gclen) {
375 	if ((newgcstr = malloc(sizeof(gcchar_t) * gcstr->gclen)) == NULL) {
376 	    free(new->str);
377 	    free(new);
378 	    return NULL;
379 	}
380 	memcpy(newgcstr, gcstr->gcstr, sizeof(gcchar_t) * gcstr->gclen);
381     }
382     new->gcstr = newgcstr;
383     if (gcstr->lbobj == NULL) {
384 	if ((new->lbobj = linebreak_new(NULL)) == NULL) {
385 	    gcstring_destroy(new);
386 	    return NULL;
387 	}
388     } else
389 	new->lbobj = linebreak_incref(gcstr->lbobj);
390     new->pos = 0;
391 
392     return new;
393 }
394 
395 /** Append
396  *
397  * Modify grapheme cluster string by appending another string.
398  * @param[in] gcstr target grapheme cluster string, must not be NULL.
399  * @param[in] appe grapheme cluster string to be appended.
400  * NULL means null string therefore gcstr won't be modified.
401  * @return Modified grapheme cluster string gcstr itself (not a copy).
402  * If error occurred, errno is set then NULL is returned.
403  */
gcstring_append(gcstring_t * gcstr,gcstring_t * appe)404 gcstring_t *gcstring_append(gcstring_t * gcstr, gcstring_t * appe)
405 {
406     unistr_t ustr = { NULL, 0 };
407 
408     if (gcstr == NULL)
409 	return (errno = EINVAL), NULL;
410     if (appe == NULL || appe->str == NULL || appe->len == 0)
411 	return gcstr;
412     if (gcstr->gclen && appe->gclen) {
413 	size_t aidx, alen, blen, newlen, newgclen, i;
414 	unsigned char bflag;
415 	gcstring_t *cstr;
416 	unichar_t *_u;
417 	gcchar_t *_g;
418 
419 	aidx = gcstr->gcstr[gcstr->gclen - 1].idx;
420 	alen = gcstr->gcstr[gcstr->gclen - 1].len;
421 	blen = appe->gcstr[0].len;
422 	bflag = appe->gcstr[0].flag;
423 
424 	if ((ustr.str = malloc(sizeof(unichar_t) * (alen + blen))) == NULL)
425 	    return NULL;
426 	memcpy(ustr.str, gcstr->str + aidx, sizeof(unichar_t) * alen);
427 	memcpy(ustr.str + alen, appe->str, sizeof(unichar_t) * blen);
428 	ustr.len = alen + blen;
429 	if ((cstr = gcstring_new(&ustr, gcstr->lbobj)) == NULL) {
430 	    free(ustr.str);
431 	    return NULL;
432 	}
433 
434 	newlen = gcstr->len + appe->len;
435 	newgclen = gcstr->gclen - 1 + cstr->gclen + appe->gclen - 1;
436 	if ((_u = realloc(gcstr->str, sizeof(unichar_t) * newlen)) == NULL) {
437 	    gcstring_destroy(cstr);
438 	    return NULL;
439 	} else
440 	    gcstr->str = _u;
441 	if ((_g = realloc(gcstr->gcstr,
442 			  sizeof(gcchar_t) * newgclen)) == NULL) {
443 	    gcstring_destroy(cstr);
444 	    return NULL;
445 	} else
446 	    gcstr->gcstr = _g;
447 	memcpy(gcstr->str + gcstr->len, appe->str,
448 	       sizeof(unichar_t) * appe->len);
449 	for (i = 0; i < cstr->gclen; i++) {
450 	    gcchar_t *gc = gcstr->gcstr + gcstr->gclen - 1 + i;
451 
452 	    gc->idx = cstr->gcstr[i].idx + aidx;
453 	    gc->len = cstr->gcstr[i].len;
454 	    gc->col = cstr->gcstr[i].col;
455 	    gc->lbc = cstr->gcstr[i].lbc;
456 	    gc->elbc = cstr->gcstr[i].elbc;
457 	    if (aidx + alen == gc->idx)	/* Restore flag if possible */
458 		gc->flag = bflag;
459 	}
460 	for (i = 1; i < appe->gclen; i++) {
461 	    gcchar_t *gc =
462 		gcstr->gcstr + gcstr->gclen - 1 + cstr->gclen + i - 1;
463 	    gc->idx = appe->gcstr[i].idx - blen + aidx + cstr->len;
464 	    gc->len = appe->gcstr[i].len;
465 	    gc->col = appe->gcstr[i].col;
466 	    gc->lbc = appe->gcstr[i].lbc;
467 	    gc->elbc = appe->gcstr[i].elbc;
468 	    gc->flag = appe->gcstr[i].flag;
469 	}
470 
471 	gcstr->len = newlen;
472 	gcstr->gclen = newgclen;
473 	gcstring_destroy(cstr);
474     } else if (appe->gclen) {
475 	if ((gcstr->str = malloc(sizeof(unichar_t) * appe->len)) == NULL)
476 	    return NULL;
477 	if ((gcstr->gcstr =
478 	     malloc(sizeof(gcchar_t) * appe->gclen)) == NULL) {
479 	    free(gcstr->str);
480 	    return NULL;
481 	}
482 	memcpy(gcstr->str, appe->str, sizeof(unichar_t) * appe->len);
483 	gcstr->len = appe->len;
484 	memcpy(gcstr->gcstr, appe->gcstr, sizeof(gcchar_t) * appe->gclen);
485 	gcstr->gclen = appe->gclen;
486 
487 	gcstr->pos = 0;
488     }
489 
490     return gcstr;
491 }
492 
493 /** Compare
494  *
495  * Compare grapheme cluster strings.
496  * @param[in] a grapheme cluster string.
497  * @param[in] b grapheme cluster string.
498  * @return positive, zero or negative value when a is greater, equal to, lesser than b, respectively.
499  */
gcstring_cmp(gcstring_t * a,gcstring_t * b)500 int gcstring_cmp(gcstring_t * a, gcstring_t * b)
501 {
502     size_t i;
503 
504     if (!a->len || !b->len)
505 	return (a->len ? 1 : 0) - (b->len ? 1 : 0);
506     for (i = 0; i < a->len && i < b->len; i++)
507 	if (a->str[i] != b->str[i])
508 	    return a->str[i] - b->str[i];
509     return a->len - b->len;
510 }
511 
512 /** Number of Columns
513  *
514  * Returns number of columns of grapheme cluster strings determined by built-in character database according to UAX #11.
515  * @param[in] gcstr grapheme cluster string. NULL may mean null string.
516  * @return Number of columns.
517  */
gcstring_columns(gcstring_t * gcstr)518 size_t gcstring_columns(gcstring_t * gcstr)
519 {
520     size_t col, i;
521 
522     if (gcstr == NULL)
523 	return 0;
524     for (col = 0, i = 0; i < gcstr->gclen; i++)
525 	col += gcstr->gcstr[i].col;
526     return col;
527 }
528 
529 /** Concatenate
530  *
531  * Create new grapheme cluster string which is concatination of two strings.
532  * @param[in] gcstr grapheme cluster string, must not be NULL.
533  * @param[in] appe grapheme cluster string to be appended.  NULL means null
534  * string.
535  * @return New grapheme cluster string.
536  * If error occurred, errno is set then NULL is returned.
537  */
gcstring_concat(gcstring_t * gcstr,gcstring_t * appe)538 gcstring_t *gcstring_concat(gcstring_t * gcstr, gcstring_t * appe)
539 {
540     gcstring_t *new;
541     size_t pos;
542 
543     if (gcstr == NULL)
544 	return (errno = EINVAL), NULL;
545     pos = gcstr->pos;
546     if ((new = gcstring_copy(gcstr)) == NULL)
547 	return NULL;
548     new->pos = pos;
549     return gcstring_append(new, appe);
550 }
551 
552 /** Iterator
553  *
554  * Returns pointer to next grapheme cluster of grapheme cluster string.
555  * Next position will be incremented.
556  * @param[in] gcstr grapheme cluster string.
557  * @return Pointer to grapheme cluster.
558  * If pointer was already at end of the string, NULL will be returned.
559  */
gcstring_next(gcstring_t * gcstr)560 gcchar_t *gcstring_next(gcstring_t * gcstr)
561 {
562     if (gcstr->gclen <= gcstr->pos)
563 	return NULL;
564     return gcstr->gcstr + (gcstr->pos++);
565 }
566 
567 /** Set Next Position
568  *
569  * Set next position of grapheme cluster string.
570  * @param[in] gcstr grapheme cluster string.
571  * @param[in] pos New position.
572  * @return none.
573  * If pos is out of range of string, position won't be updated.
574  *
575  * @todo On next major release, pos would be ssize_t, not int.
576  */
gcstring_setpos(gcstring_t * gcstr,int pos)577 void gcstring_setpos(gcstring_t * gcstr, int pos)
578 {
579     if (pos < 0)
580 	pos += gcstr->gclen;
581     if (pos < 0 || gcstr->gclen < pos)
582 	return;
583     gcstr->pos = pos;
584 }
585 
586 /** Shrink
587  *
588  * Modify grapheme cluster string to shrink its length.
589  * Length is specified by number of grapheme clusters.
590  * @param[in] gcstr grapheme cluster string.
591  * @param[in] length New length.
592  * @return none.
593  * If gcstr was NULL, do nothing.
594  *
595  * @todo On next major release, length would be ssize_t, not int.
596  */
gcstring_shrink(gcstring_t * gcstr,int length)597 void gcstring_shrink(gcstring_t * gcstr, int length)
598 {
599     if (gcstr == NULL)
600 	return;
601 
602     if (length < 0)
603 	length += gcstr->gclen;
604 
605     if (length <= 0) {
606 	free(gcstr->str);
607 	gcstr->str = NULL;
608 	gcstr->len = 0;
609 	free(gcstr->gcstr);
610 	gcstr->gcstr = NULL;
611 	gcstr->gclen = 0;
612     } else if (gcstr->gclen <= length)
613 	return;
614     else {
615 	gcstr->len = gcstr->gcstr[length].idx;
616 	gcstr->gclen = length;
617     }
618 }
619 
620 /** Substring
621  *
622  * Returns substring of grapheme cluster string.
623  * Offset and length are specified by number of grapheme clusters.
624  * @param[in] gcstr grapheme cluster string.  Must not be NULL.
625  * @param[in] offset Offset of substring.
626  * @param[in] length Length of substring.
627  * @return (newly allocated) substring.
628  * If error occurred, errno is set to non-zero then NULL is returned.
629  *
630  * @todo On next major release, offset and length would be ssize_t, not int.
631  */
gcstring_substr(gcstring_t * gcstr,int offset,int length)632 gcstring_t *gcstring_substr(gcstring_t * gcstr, int offset, int length)
633 {
634     gcstring_t *new;
635     size_t ulength, i;
636 
637     if (gcstr == NULL)
638 	return (errno = EINVAL), NULL;
639 
640     /* adjust offset. */
641     if (offset < 0)
642 	offset += gcstr->gclen;
643     if (offset < 0) {
644 	length += offset;
645 	offset = 0;
646     }
647     if (length < 0)
648 	length += gcstr->gclen - offset;
649 
650     if (length < 0 || gcstr->gclen < offset)	/* out of range */
651 	return gcstring_new(NULL, gcstr->lbobj);
652 
653     if (gcstr->gclen == offset)
654 	length = 0;
655     else if (gcstr->gclen <= offset + length)
656 	length = gcstr->gclen - offset;
657 
658     /* create substring. */
659 
660     if (gcstr->gclen == offset)
661 	ulength = 0;
662     else if (gcstr->gclen <= offset + length)
663 	ulength = gcstr->len - gcstr->gcstr[offset].idx;
664     else
665 	ulength =
666 	    gcstr->gcstr[offset + length].idx - gcstr->gcstr[offset].idx;
667 
668     if ((new = gcstring_new(NULL, gcstr->lbobj)) == NULL)
669 	return NULL;
670 
671     if (ulength == 0);
672     else if ((new->str = malloc(sizeof(unichar_t) * ulength)) == NULL) {
673 	gcstring_destroy(new);
674 	return NULL;
675     }
676     if (length == 0);
677     else if ((new->gcstr = malloc(sizeof(gcchar_t) * length)) == NULL) {
678 	free(new->str);
679 	gcstring_destroy(new);
680 	return NULL;
681     }
682     if (ulength != 0)
683 	memcpy(new->str, gcstr->str + gcstr->gcstr[offset].idx,
684 	       sizeof(unichar_t) * ulength);
685     new->len = ulength;
686     for (i = 0; i < length; i++) {
687 	memcpy(new->gcstr + i, gcstr->gcstr + offset + i,
688 	       sizeof(gcchar_t));
689 	new->gcstr[i].idx -= gcstr->gcstr[offset].idx;
690     }
691     new->gclen = length;
692 
693     return new;
694 }
695 
696 /** Replace substring
697  *
698  * Replace substring og grapheme cluster string.
699  * Offset and length are specified by number of grapheme clusters.
700  * @param[in,out] gcstr grapheme cluster string.  Must not be NULL.
701  * @param[in] offset Offset of substring.
702  * @param[in] length Length of substring.
703  * offset and length must not be out of range.
704  * @param[in] replacement If this was not NULL, modify grapheme cluster string by replacing substring with it.
705  * @return modified gcstr itself (not a copy of it).
706  * If error occurred, errno is set to non-zero then NULL is returned.
707  *
708  * @todo On next major release, offset and length would be ssize_t, not int.
709  */
gcstring_replace(gcstring_t * gcstr,int offset,int length,gcstring_t * replacement)710 gcstring_t *gcstring_replace(gcstring_t * gcstr, int offset, int length,
711 			     gcstring_t * replacement)
712 {
713     gcstring_t *tail;
714 
715     if (gcstr == NULL)
716 	return (errno = EINVAL), NULL;
717 
718     /* without replacement: meaningless. return immedately. */
719     if (replacement == NULL)
720 	return gcstr;
721 
722     /* adjust offset. */
723     if (offset < 0)
724 	offset += gcstr->gclen;
725     if (offset < 0) {
726 	length += offset;
727 	offset = 0;
728     }
729     if (length < 0)
730 	length += gcstr->gclen - offset;
731 
732     if (length < 0 || gcstr->gclen < offset)	/* out of range */
733 	return (errno = EINVAL), NULL;
734 
735     if (gcstr->gclen == offset)
736 	length = 0;
737     else if (gcstr->gclen <= offset + length)
738 	length = gcstr->gclen - offset;
739 
740     /* returns modified gcstr itself. */
741 
742     if ((tail = gcstring_substr(gcstr, offset + length,
743 				gcstr->gclen - (offset + length))) == NULL)
744 	return NULL;
745     gcstring_shrink(gcstr, offset);
746     if (gcstring_append(gcstr, replacement) == NULL) {
747 	gcstring_destroy(tail);
748 	return NULL;
749     }
750     if (gcstring_append(gcstr, tail) == NULL) {
751 	gcstring_destroy(tail);
752 	return NULL;
753     }
754     gcstring_destroy(tail);
755     return gcstr;
756 }
757 
758 /** Get Line Breaking Class of grapheme base
759  *
760  * Get UAX #14 line breaking class of grapheme base.
761  * @param[in] gcstr grapheme cluster string, must not be NULL.
762  * @param[in] pos position.
763  * @return line breaking class property value.
764  *
765  * @note Introduced by sombok 2.2.
766  */
gcstring_lbclass(gcstring_t * gcstr,int pos)767 propval_t gcstring_lbclass(gcstring_t * gcstr, int pos)
768 {
769     if (pos < 0)
770 	pos += gcstr->gclen;
771     if (pos < 0 || gcstr->gclen == 0 || gcstr->gclen <= pos)
772 	return PROP_UNKNOWN;
773     return gcstr->gcstr[pos].lbc;
774 }
775 
776 /** Get Line Breaking Class of grapheme extender
777  *
778  * Get UAX #14 line breaking class of grapheme extender.
779  * If it is CM, get one of grapheme base.
780  * @param[in] gcstr grapheme cluster string, must not be NULL.
781  * @param[in] pos position.
782  * @return line breaking class property value.
783  *
784  * @note Introduced by sombok 2.2.
785  */
gcstring_lbclass_ext(gcstring_t * gcstr,int pos)786 propval_t gcstring_lbclass_ext(gcstring_t * gcstr, int pos)
787 {
788     propval_t lbc;
789 
790     if (pos < 0)
791 	pos += gcstr->gclen;
792     if (pos < 0 || gcstr->gclen == 0 || gcstr->gclen <= pos)
793 	return PROP_UNKNOWN;
794     if ((lbc = gcstr->gcstr[pos].elbc) == PROP_UNKNOWN)
795 	lbc = gcstr->gcstr[pos].lbc;
796     return lbc;
797 }
798