1 /*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version 2
5 * of the License, or (at your option) any later version.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 *
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software Foundation,
14 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
15 *
16 * The Original Code is Copyright (C) 2011 Blender Foundation.
17 * Code from gutf8.c Copyright (C) 1999 Tom Tromey
18 * Copyright (C) 2000 Red Hat, Inc.
19 * All rights reserved.
20 */
21
22 /** \file
23 * \ingroup bli
24 */
25
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <wchar.h>
30 #include <wctype.h>
31 #include <wcwidth.h>
32
33 #include "BLI_utildefines.h"
34
35 #include "BLI_string_utf8.h" /* own include */
36
37 #ifdef __GNUC__
38 # pragma GCC diagnostic error "-Wsign-conversion"
39 #endif
40
41 // #define DEBUG_STRSIZE
42
/* Lookup table copied from glib's gutf8.c.
 *
 * Indexed by the value of the first byte of a sequence, it gives the number of bytes
 * that sequence occupies (lead byte included):
 * - 0x00..0xbf: 1 (ascii and continuation bytes).
 * - 0xc0..0xdf: 2, 0xe0..0xef: 3, 0xf0..0xf7: 4, 0xf8..0xfb: 5, 0xfc..0xfd: 6.
 * - Note: last two values (0xfe and 0xff) are forbidden in utf-8,
 *   so they are considered 1 byte length too.
 *
 * The table only has 256 entries, the index must be in the 0..255 range. */
static const size_t utf8_skip_data[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1,
};
56
57 /* from libswish3, originally called u8_isvalid(),
58 * modified to return the index of the bad character (byte index not utf).
59 * http://svn.swish-e.org/libswish3/trunk/src/libswish3/utf8.c r3044 - campbell */
60
61 /* based on the valid_utf8 routine from the PCRE library by Philip Hazel
62 *
63 * length is in bytes, since without knowing whether the string is valid
64 * it's hard to know how many characters there are! */
65
66 /**
67 * Find first utf-8 invalid byte in given \a str, of \a length bytes.
68 *
69 * \return the offset of the first invalid byte.
70 */
BLI_utf8_invalid_byte(const char * str,size_t length)71 ptrdiff_t BLI_utf8_invalid_byte(const char *str, size_t length)
72 {
73 const unsigned char *p, *perr, *pend = (const unsigned char *)str + length;
74 unsigned char c;
75 int ab;
76
77 for (p = (const unsigned char *)str; p < pend; p++, length--) {
78 c = *p;
79 perr = p; /* Erroneous char is always the first of an invalid utf8 sequence... */
80 if (ELEM(c, 0xfe, 0xff, 0x00)) {
81 /* Those three values are not allowed in utf8 string. */
82 goto utf8_error;
83 }
84 if (c < 128) {
85 continue;
86 }
87 if ((c & 0xc0) != 0xc0) {
88 goto utf8_error;
89 }
90
91 /* Note that since we always increase p (and decrease length) by one byte in main loop,
92 * we only add/subtract extra utf8 bytes in code below
93 * (ab number, aka number of bytes remaining in the utf8 sequence after the initial one). */
94 ab = (int)utf8_skip_data[c] - 1;
95 if (length <= ab) {
96 goto utf8_error;
97 }
98
99 /* Check top bits in the second byte */
100 p++;
101 length--;
102 if ((*p & 0xc0) != 0x80) {
103 goto utf8_error;
104 }
105
106 /* Check for overlong sequences for each different length */
107 switch (ab) {
108 case 1:
109 /* Check for xx00 000x */
110 if ((c & 0x3e) == 0) {
111 goto utf8_error;
112 }
113 continue; /* We know there aren't any more bytes to check */
114
115 case 2:
116 /* Check for 1110 0000, xx0x xxxx */
117 if (c == 0xe0 && (*p & 0x20) == 0) {
118 goto utf8_error;
119 }
120 /* Some special cases, see section 5 of utf-8 decoder stress-test by Markus Kuhn
121 * (https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt). */
122 /* From section 5.1 (and 5.2) */
123 if (c == 0xed) {
124 if (*p == 0xa0 && *(p + 1) == 0x80) {
125 goto utf8_error;
126 }
127 if (*p == 0xad && *(p + 1) == 0xbf) {
128 goto utf8_error;
129 }
130 if (*p == 0xae && *(p + 1) == 0x80) {
131 goto utf8_error;
132 }
133 if (*p == 0xaf && *(p + 1) == 0xbf) {
134 goto utf8_error;
135 }
136 if (*p == 0xb0 && *(p + 1) == 0x80) {
137 goto utf8_error;
138 }
139 if (*p == 0xbe && *(p + 1) == 0x80) {
140 goto utf8_error;
141 }
142 if (*p == 0xbf && *(p + 1) == 0xbf) {
143 goto utf8_error;
144 }
145 }
146 /* From section 5.3 */
147 if (c == 0xef) {
148 if (*p == 0xbf && *(p + 1) == 0xbe) {
149 goto utf8_error;
150 }
151 if (*p == 0xbf && *(p + 1) == 0xbf) {
152 goto utf8_error;
153 }
154 }
155 break;
156
157 case 3:
158 /* Check for 1111 0000, xx00 xxxx */
159 if (c == 0xf0 && (*p & 0x30) == 0) {
160 goto utf8_error;
161 }
162 break;
163
164 case 4:
165 /* Check for 1111 1000, xx00 0xxx */
166 if (c == 0xf8 && (*p & 0x38) == 0) {
167 goto utf8_error;
168 }
169 break;
170
171 case 5:
172 /* Check for 1111 1100, xx00 00xx */
173 if (c == 0xfc && (*p & 0x3c) == 0) {
174 goto utf8_error;
175 }
176 break;
177 }
178
179 /* Check for valid bytes after the 2nd, if any; all must start 10 */
180 while (--ab > 0) {
181 p++;
182 length--;
183 if ((*p & 0xc0) != 0x80) {
184 goto utf8_error;
185 }
186 }
187 }
188
189 return -1;
190
191 utf8_error:
192
193 return ((const char *)perr - (const char *)str);
194 }
195
/**
 * Strip, in place, every byte that #BLI_utf8_invalid_byte reports as invalid.
 *
 * \param str: null terminated buffer, modified in place.
 * \param length: length of \a str in bytes, `str[length]` must be the null terminator.
 * \return number of stripped bytes.
 */
int BLI_utf8_invalid_strip(char *str, size_t length)
{
  int stripped = 0;

  BLI_assert(str[length] == '\0');

  for (;;) {
    const ptrdiff_t bad_offset = BLI_utf8_invalid_byte(str, length);
    if (bad_offset == -1) {
      /* What remains is valid. */
      break;
    }

    /* Jump to the offending byte, `length` becomes the number of bytes following it. */
    str += bad_offset;
    length -= (size_t)(bad_offset + 1);
    stripped++;

    if (length == 0) {
      /* The offending byte is the last one: terminate the string right there. */
      *str = '\0';
      break;
    }

    /* Close the gap and keep looking, `+ 1` to move the null terminator as well. */
    memmove(str, str + 1, length + 1);
  }

  return stripped;
}
225
/* Compatible with BLI_strncpy, but ensure no partial utf8 chars are copied.
 *
 * Copies whole utf8 sequences from `src` to `dst` while the next sequence still leaves room
 * for the null terminator (tracked by `maxncpy`), then null terminates `dst`.
 *
 * `dst`, `src` and `maxncpy` must be modifiable lvalues: all three are updated by the copy
 * (`dst` ends up pointing at the written terminator).
 *
 * The lead byte is converted to `unsigned char` before indexing `utf8_skip_data`:
 * where plain `char` is signed, bytes >= 0x80 would otherwise be a negative index and
 * read outside of the table.
 *
 * Note: the sequence length is taken from the lead byte only, truncated or otherwise invalid
 * sequences in `src` are not detected. */

#define BLI_STR_UTF8_CPY(dst, src, maxncpy) \
  do { \
    size_t utf8_size; \
    while (*src != '\0' && (utf8_size = utf8_skip_data[(unsigned char)*src]) < maxncpy) { \
      maxncpy -= utf8_size; \
      switch (utf8_size) { \
        case 6: \
          *dst++ = *src++; \
          ATTR_FALLTHROUGH; \
        case 5: \
          *dst++ = *src++; \
          ATTR_FALLTHROUGH; \
        case 4: \
          *dst++ = *src++; \
          ATTR_FALLTHROUGH; \
        case 3: \
          *dst++ = *src++; \
          ATTR_FALLTHROUGH; \
        case 2: \
          *dst++ = *src++; \
          ATTR_FALLTHROUGH; \
        case 1: \
          *dst++ = *src++; \
      } \
    } \
    *dst = '\0'; \
  } while (0)
256
/**
 * Copy \a src into \a dst like BLI_strncpy, but stop before a multi-byte sequence
 * that would not fit entirely, \a dst is always null terminated.
 *
 * \param maxncpy: size of \a dst in bytes (terminator included), must not be zero.
 * \return \a dst.
 */
char *BLI_strncpy_utf8(char *__restrict dst, const char *__restrict src, size_t maxncpy)
{
  /* The copy macro advances `dst`, keep the start of the buffer for the return value. */
  char *const dst_start = dst;

  BLI_assert(maxncpy != 0);

#ifdef DEBUG_STRSIZE
  memset(dst, 0xff, sizeof(*dst) * maxncpy);
#endif

  /* Invalid utf8 sequences in `src` are currently not handled in any special way. */
  BLI_STR_UTF8_CPY(dst, src, maxncpy);

  return dst_start;
}
272
/**
 * Same copy as #BLI_strncpy_utf8, returning the copied length instead of the destination.
 *
 * \param maxncpy: size of \a dst in bytes (terminator included), must not be zero.
 * \return number of bytes written to \a dst, null terminator excluded.
 */
size_t BLI_strncpy_utf8_rlen(char *__restrict dst, const char *__restrict src, size_t maxncpy)
{
  /* The copy macro leaves `dst` on the terminator it wrote,
   * the distance to the start of the buffer is the copied length. */
  const char *const dst_start = dst;

  BLI_assert(maxncpy != 0);

#ifdef DEBUG_STRSIZE
  memset(dst, 0xff, sizeof(*dst) * maxncpy);
#endif

  /* Invalid utf8 sequences in `src` are currently not handled in any special way. */
  BLI_STR_UTF8_CPY(dst, src, maxncpy);

  return (size_t)(dst - dst_start);
}
288
289 #undef BLI_STR_UTF8_CPY
290
291 /* --------------------------------------------------------------------------*/
292 /* wchar_t / utf8 functions */
293
BLI_strncpy_wchar_as_utf8(char * __restrict dst,const wchar_t * __restrict src,const size_t maxncpy)294 size_t BLI_strncpy_wchar_as_utf8(char *__restrict dst,
295 const wchar_t *__restrict src,
296 const size_t maxncpy)
297 {
298 const size_t maxlen = maxncpy - 1;
299 /* 6 is max utf8 length of an unicode char. */
300 const int64_t maxlen_secured = (int64_t)maxlen - 6;
301 size_t len = 0;
302
303 BLI_assert(maxncpy != 0);
304
305 #ifdef DEBUG_STRSIZE
306 memset(dst, 0xff, sizeof(*dst) * maxncpy);
307 #endif
308
309 while (*src && len <= maxlen_secured) {
310 len += BLI_str_utf8_from_unicode((uint)*src++, dst + len);
311 }
312
313 /* We have to be more careful for the last six bytes,
314 * to avoid buffer overflow in case utf8-encoded char would be too long for our dst buffer. */
315 while (*src) {
316 char t[6];
317 size_t l = BLI_str_utf8_from_unicode((uint)*src++, t);
318 BLI_assert(l <= 6);
319 if (len + l > maxlen) {
320 break;
321 }
322 memcpy(dst + len, t, l);
323 len += l;
324 }
325
326 dst[len] = '\0';
327
328 return len;
329 }
330
331 /* wchar len in utf8 */
BLI_wstrlen_utf8(const wchar_t * src)332 size_t BLI_wstrlen_utf8(const wchar_t *src)
333 {
334 size_t len = 0;
335
336 while (*src) {
337 len += BLI_str_utf8_from_unicode((uint)*src++, NULL);
338 }
339
340 return len;
341 }
342
/**
 * Count the utf8 characters of the null terminated string \a strc.
 *
 * Bytes that cannot start a sequence count as one character each.
 *
 * \param r_len_bytes: set to the length of \a strc in bytes.
 * \return the number of characters.
 */
size_t BLI_strlen_utf8_ex(const char *strc, size_t *r_len_bytes)
{
  const char *strc_orig = strc;
  size_t len = 0;

  while (*strc) {
    int step = BLI_str_utf8_size_safe(strc);
    int i;

    /* The step comes from the lead byte only: when the string ends in the middle of a
     * multi-byte sequence, stop at the terminator instead of jumping over it and reading
     * past the end of the string. The truncated sequence still counts as one character. */
    for (i = 1; i < step; i++) {
      if (strc[i] == '\0') {
        step = i;
        break;
      }
    }

    strc += step;
    len++;
  }

  *r_len_bytes = (size_t)(strc - strc_orig);
  return len;
}
355
/**
 * Number of utf8 characters in the null terminated string \a strc,
 * see #BLI_strlen_utf8_ex (the byte length is computed and discarded).
 */
size_t BLI_strlen_utf8(const char *strc)
{
  size_t len_bytes_unused;
  const size_t len = BLI_strlen_utf8_ex(strc, &len_bytes_unused);

  return len;
}
361
/**
 * Count the utf8 characters of \a strc, reading at most \a maxlen bytes.
 *
 * Counting stops at the null terminator, or before the first character that does not fit
 * entirely in \a maxlen bytes. Bytes that cannot start a sequence count as one character each.
 *
 * \param r_len_bytes: set to the number of bytes covered by the counted characters.
 * \return the number of characters.
 */
size_t BLI_strnlen_utf8_ex(const char *strc, const size_t maxlen, size_t *r_len_bytes)
{
  const char *strc_orig = strc;
  const char *strc_end = strc + maxlen;
  size_t len = 0;

  /* Check the bound before reading: the buffer is not required to be null terminated
   * within `maxlen` bytes, so the byte at `strc_end` must not be accessed. */
  while (strc < strc_end && *strc) {
    size_t step = (size_t)BLI_str_utf8_size_safe(strc);
    size_t i;

    if (step > (size_t)(strc_end - strc)) {
      /* This character would extend past `maxlen`. */
      break;
    }

    /* The step comes from the lead byte only: when the string ends in the middle of a
     * multi-byte sequence, stop at the terminator instead of jumping over it.
     * The truncated sequence still counts as one character. */
    for (i = 1; i < step; i++) {
      if (strc[i] == '\0') {
        step = i;
        break;
      }
    }

    strc += step;
    len++;
  }

  *r_len_bytes = (size_t)(strc - strc_orig);
  return len;
}
380
/**
 * Bounded variant of #BLI_strlen_utf8, see #BLI_strnlen_utf8_ex
 * (the byte length is computed and discarded).
 *
 * \param strc: the string to measure the length.
 * \param maxlen: the maximum number of bytes of \a strc to look at.
 * \return the unicode length (not in bytes!)
 */
size_t BLI_strnlen_utf8(const char *strc, const size_t maxlen)
{
  size_t len_bytes_unused;
  const size_t len = BLI_strnlen_utf8_ex(strc, maxlen, &len_bytes_unused);

  return len;
}
391
BLI_strncpy_wchar_from_utf8(wchar_t * __restrict dst_w,const char * __restrict src_c,const size_t maxncpy)392 size_t BLI_strncpy_wchar_from_utf8(wchar_t *__restrict dst_w,
393 const char *__restrict src_c,
394 const size_t maxncpy)
395 {
396 const size_t maxlen = maxncpy - 1;
397 size_t len = 0;
398
399 BLI_assert(maxncpy != 0);
400
401 #ifdef DEBUG_STRSIZE
402 memset(dst_w, 0xff, sizeof(*dst_w) * maxncpy);
403 #endif
404
405 while (*src_c && len != maxlen) {
406 size_t step = 0;
407 uint unicode = BLI_str_utf8_as_unicode_and_size(src_c, &step);
408 if (unicode != BLI_UTF8_ERR) {
409 /* TODO: `wchar_t` type is an implementation-defined and may represent
410 * 16-bit or 32-bit depending on operating system.
411 * So the ideal would be to do the corresponding encoding.
412 * But for now just assert that it has no conflicting use. */
413 BLI_assert(step <= sizeof(wchar_t));
414 *dst_w = (wchar_t)unicode;
415 src_c += step;
416 }
417 else {
418 *dst_w = '?';
419 src_c = BLI_str_find_next_char_utf8(src_c, NULL);
420 }
421 dst_w++;
422 len++;
423 }
424
425 *dst_w = 0;
426
427 return len;
428 }
429
430 /* end wchar_t / utf8 functions */
431 /* --------------------------------------------------------------------------*/
432
/* count columns that character/string occupies, based on wcwidth.c */

/**
 * Column width of the code point \a ucs, forwarded as-is from `mk_wcwidth`.
 */
int BLI_wcwidth(char32_t ucs)
{
  const int columns = mk_wcwidth(ucs);

  return columns;
}
439
/**
 * Column width of the utf32 string \a pwcs, \a n is passed through to `mk_wcswidth`
 * whose result is forwarded as-is.
 */
int BLI_wcswidth(const char32_t *pwcs, size_t n)
{
  const int columns = mk_wcswidth(pwcs, n);

  return columns;
}
444
BLI_str_utf8_char_width(const char * p)445 int BLI_str_utf8_char_width(const char *p)
446 {
447 uint unicode = BLI_str_utf8_as_unicode(p);
448 if (unicode == BLI_UTF8_ERR) {
449 return -1;
450 }
451
452 return BLI_wcwidth((char32_t)unicode);
453 }
454
BLI_str_utf8_char_width_safe(const char * p)455 int BLI_str_utf8_char_width_safe(const char *p)
456 {
457 int columns;
458
459 uint unicode = BLI_str_utf8_as_unicode(p);
460 if (unicode == BLI_UTF8_ERR) {
461 return 1;
462 }
463
464 columns = BLI_wcwidth((char32_t)unicode);
465
466 return (columns < 0) ? 1 : columns;
467 }
468
469 /* --------------------------------------------------------------------------*/
470
471 /* copied from glib's gutf8.c, added 'Err' arg */
472
473 /* note, glib uses uint for unicode, best we do the same,
474 * though we don't typedef it - campbell */
475
/* Classify the first byte of a utf8 sequence.
 *
 * - Char: value of the first byte, expected to be in the 0..255 range.
 * - Mask: set to the bits of the first byte that carry payload.
 * - Len: set to the length of the sequence in bytes (1..6).
 * - Err: value given to Len when Char cannot start a sequence
 *   (0x80..0xbf, 0xfe and 0xff), Mask is left untouched in that case.
 *
 * Mask and Len must be lvalues. The arguments are expanded without parentheses and
 * Char is evaluated several times, so only pass plain variables or constants. */
#define UTF8_COMPUTE(Char, Mask, Len, Err) \
  if (Char < 128) { \
    Len = 1; \
    Mask = 0x7f; \
  } \
  else if ((Char & 0xe0) == 0xc0) { \
    Len = 2; \
    Mask = 0x1f; \
  } \
  else if ((Char & 0xf0) == 0xe0) { \
    Len = 3; \
    Mask = 0x0f; \
  } \
  else if ((Char & 0xf8) == 0xf0) { \
    Len = 4; \
    Mask = 0x07; \
  } \
  else if ((Char & 0xfc) == 0xf8) { \
    Len = 5; \
    Mask = 0x03; \
  } \
  else if ((Char & 0xfe) == 0xfc) { \
    Len = 6; \
    Mask = 0x01; \
  } \
  else { \
    Len = Err; /* -1 is the typical error value or 1 to skip */ \
  } \
  (void)0
505
/* Same as glib define but added an 'Err' arg.
 *
 * Decode the sequence of Len bytes starting at Chars into Result.
 *
 * - Result: receives the decoded value, starting from the payload bits of the first byte
 *   (selected by Mask) and adding 6 bits per following byte.
 * - Count: scratch variable used as byte index.
 * - Mask, Len: as computed by UTF8_COMPUTE for the first byte.
 * - Err: value given to Result when a following byte is not a continuation byte
 *   (10xx xxxx), decoding stops at that byte.
 *
 * Result and Count must be lvalues. */
#define UTF8_GET(Result, Chars, Count, Mask, Len, Err) \
  (Result) = (Chars)[0] & (Mask); \
  for ((Count) = 1; (Count) < (Len); ++(Count)) { \
    if (((Chars)[(Count)] & 0xc0) != 0x80) { \
      (Result) = Err; \
      break; \
    } \
    (Result) <<= 6; \
    (Result) |= ((Chars)[(Count)] & 0x3f); \
  } \
  (void)0
518
/**
 * Size in bytes of the utf8 character starting at \a p, based on its first byte only.
 *
 * \return the size (1..6), or -1 when the first byte cannot start a utf8 sequence.
 */
int BLI_str_utf8_size(const char *p)
{
  const unsigned char lead = (unsigned char)*p;
  int size;
  /* Only needed to satisfy the macro. */
  int mask_unused = 0;

  UTF8_COMPUTE(lead, mask_unused, size, -1);

  (void)mask_unused; /* quiet warning */

  return size;
}
532
/**
 * Variant of #BLI_str_utf8_size to use when errors should be skipped.
 *
 * \return the size in bytes (1..6) of the utf8 character starting at \a p,
 * or 1 when the first byte cannot start a utf8 sequence.
 */
int BLI_str_utf8_size_safe(const char *p)
{
  const unsigned char lead = (unsigned char)*p;
  int size;
  /* Only needed to satisfy the macro. */
  int mask_unused = 0;

  UTF8_COMPUTE(lead, mask_unused, size, 1);

  (void)mask_unused; /* quiet warning */

  return size;
}
545
546 /* was g_utf8_get_char */
547 /**
548 * BLI_str_utf8_as_unicode:
549 * \param p: a pointer to Unicode character encoded as UTF-8
550 *
551 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
552 * If \a p does not point to a valid UTF-8 encoded character, results are
553 * undefined. If you are not sure that the bytes are complete
554 * valid Unicode characters, you should use g_utf8_get_char_validated()
555 * instead.
556 *
557 * Return value: the resulting character
558 */
BLI_str_utf8_as_unicode(const char * p)559 uint BLI_str_utf8_as_unicode(const char *p)
560 {
561 int i, len;
562 uint mask = 0;
563 uint result;
564 const unsigned char c = (unsigned char)*p;
565
566 UTF8_COMPUTE(c, mask, len, -1);
567 if (UNLIKELY(len == -1)) {
568 return BLI_UTF8_ERR;
569 }
570 UTF8_GET(result, p, i, mask, len, BLI_UTF8_ERR);
571
572 return result;
573 }
574
575 /* variant that increments the length */
BLI_str_utf8_as_unicode_and_size(const char * __restrict p,size_t * __restrict index)576 uint BLI_str_utf8_as_unicode_and_size(const char *__restrict p, size_t *__restrict index)
577 {
578 int i, len;
579 uint mask = 0;
580 uint result;
581 const unsigned char c = (unsigned char)*p;
582
583 UTF8_COMPUTE(c, mask, len, -1);
584 if (UNLIKELY(len == -1)) {
585 return BLI_UTF8_ERR;
586 }
587 UTF8_GET(result, p, i, mask, len, BLI_UTF8_ERR);
588 *index += (size_t)len;
589 return result;
590 }
591
BLI_str_utf8_as_unicode_and_size_safe(const char * __restrict p,size_t * __restrict index)592 uint BLI_str_utf8_as_unicode_and_size_safe(const char *__restrict p, size_t *__restrict index)
593 {
594 int i, len;
595 uint mask = 0;
596 uint result;
597 const unsigned char c = (unsigned char)*p;
598
599 UTF8_COMPUTE(c, mask, len, -1);
600 if (UNLIKELY(len == -1)) {
601 *index += 1;
602 return c;
603 }
604 UTF8_GET(result, p, i, mask, len, BLI_UTF8_ERR);
605 *index += (size_t)len;
606 return result;
607 }
608
609 /* another variant that steps over the index,
610 * note, currently this also falls back to latin1 for text drawing. */
BLI_str_utf8_as_unicode_step(const char * __restrict p,size_t * __restrict index)611 uint BLI_str_utf8_as_unicode_step(const char *__restrict p, size_t *__restrict index)
612 {
613 int i, len;
614 uint mask = 0;
615 uint result;
616 unsigned char c;
617
618 p += *index;
619 c = (unsigned char)*p;
620
621 UTF8_COMPUTE(c, mask, len, -1);
622 if (UNLIKELY(len == -1)) {
623 /* when called with NULL end, result will never be NULL,
624 * checks for a NULL character */
625 const char *p_next = BLI_str_find_next_char_utf8(p, NULL);
626 /* will never return the same pointer unless '\0',
627 * eternal loop is prevented */
628 *index += (size_t)(p_next - p);
629 return BLI_UTF8_ERR;
630 }
631
632 /* this is tricky since there are a few ways we can bail out of bad unicode
633 * values, 3 possible solutions. */
634 #if 0
635 UTF8_GET(result, p, i, mask, len, BLI_UTF8_ERR);
636 #elif 1
637 /* WARNING: this is NOT part of glib, or supported by similar functions.
638 * this is added for text drawing because some filepaths can have latin1
639 * characters */
640 UTF8_GET(result, p, i, mask, len, BLI_UTF8_ERR);
641 if (result == BLI_UTF8_ERR) {
642 len = 1;
643 result = *p;
644 }
645 /* end warning! */
646 #else
647 /* without a fallback like '?', text drawing will stop on this value */
648 UTF8_GET(result, p, i, mask, len, '?');
649 #endif
650
651 *index += (size_t)len;
652 return result;
653 }
654
655 /* was g_unichar_to_utf8 */
656 /**
657 * BLI_str_utf8_from_unicode:
658 * \param c: a Unicode character code
659 * \param outbuf: output buffer, must have at least 6 bytes of space.
660 * If %NULL, the length will be computed and returned
661 * and nothing will be written to outbuf.
662 *
663 * Converts a single character to UTF-8.
664 *
665 * \return number of bytes written
666 */
BLI_str_utf8_from_unicode(uint c,char * outbuf)667 size_t BLI_str_utf8_from_unicode(uint c, char *outbuf)
668 {
669 /* If this gets modified, also update the copy in g_string_insert_unichar() */
670 uint len = 0;
671 uint first;
672 uint i;
673
674 if (c < 0x80) {
675 first = 0;
676 len = 1;
677 }
678 else if (c < 0x800) {
679 first = 0xc0;
680 len = 2;
681 }
682 else if (c < 0x10000) {
683 first = 0xe0;
684 len = 3;
685 }
686 else if (c < 0x200000) {
687 first = 0xf0;
688 len = 4;
689 }
690 else if (c < 0x4000000) {
691 first = 0xf8;
692 len = 5;
693 }
694 else {
695 first = 0xfc;
696 len = 6;
697 }
698
699 if (outbuf) {
700 for (i = len - 1; i > 0; i--) {
701 outbuf[i] = (c & 0x3f) | 0x80;
702 c >>= 6;
703 }
704 outbuf[0] = c | first;
705 }
706
707 return len;
708 }
709
BLI_str_utf8_as_utf32(char32_t * __restrict dst_w,const char * __restrict src_c,const size_t maxncpy)710 size_t BLI_str_utf8_as_utf32(char32_t *__restrict dst_w,
711 const char *__restrict src_c,
712 const size_t maxncpy)
713 {
714 const size_t maxlen = maxncpy - 1;
715 size_t len = 0;
716
717 BLI_assert(maxncpy != 0);
718
719 #ifdef DEBUG_STRSIZE
720 memset(dst_w, 0xff, sizeof(*dst_w) * maxncpy);
721 #endif
722
723 while (*src_c && len != maxlen) {
724 size_t step = 0;
725 uint unicode = BLI_str_utf8_as_unicode_and_size(src_c, &step);
726 if (unicode != BLI_UTF8_ERR) {
727 *dst_w = unicode;
728 src_c += step;
729 }
730 else {
731 *dst_w = '?';
732 src_c = BLI_str_find_next_char_utf8(src_c, NULL);
733 }
734 dst_w++;
735 len++;
736 }
737
738 *dst_w = 0;
739
740 return len;
741 }
742
BLI_str_utf32_as_utf8(char * __restrict dst,const char32_t * __restrict src,const size_t maxncpy)743 size_t BLI_str_utf32_as_utf8(char *__restrict dst,
744 const char32_t *__restrict src,
745 const size_t maxncpy)
746 {
747 const size_t maxlen = maxncpy - 1;
748 /* 6 is max utf8 length of an unicode char. */
749 const int64_t maxlen_secured = (int64_t)maxlen - 6;
750 size_t len = 0;
751
752 BLI_assert(maxncpy != 0);
753
754 #ifdef DEBUG_STRSIZE
755 memset(dst, 0xff, sizeof(*dst) * maxncpy);
756 #endif
757
758 while (*src && len <= maxlen_secured) {
759 len += BLI_str_utf8_from_unicode((uint)*src++, dst + len);
760 }
761
762 /* We have to be more careful for the last six bytes,
763 * to avoid buffer overflow in case utf8-encoded char would be too long for our dst buffer. */
764 while (*src) {
765 char t[6];
766 size_t l = BLI_str_utf8_from_unicode((uint)*src++, t);
767 BLI_assert(l <= 6);
768 if (len + l > maxlen) {
769 break;
770 }
771 memcpy(dst + len, t, l);
772 len += l;
773 }
774
775 dst[len] = '\0';
776
777 return len;
778 }
779
780 /* utf32 len in utf8 */
BLI_str_utf32_as_utf8_len(const char32_t * src)781 size_t BLI_str_utf32_as_utf8_len(const char32_t *src)
782 {
783 size_t len = 0;
784
785 while (*src) {
786 len += BLI_str_utf8_from_unicode((uint)*src++, NULL);
787 }
788
789 return len;
790 }
791
/* was g_utf8_find_prev_char */
/**
 * Given a position \a p within the utf8 encoded string \a str, find the start of the
 * closest character that begins before \a p.
 *
 * \a p does not have to be at the beginning of a character. No check is made to see if the
 * character found is actually valid other than it starts with an appropriate byte
 * (any byte that is not a continuation byte).
 *
 * \param str: pointer to the beginning of a utf8 encoded string.
 * \param p: pointer to some position within \a str.
 * \return a pointer to the found character,
 * or NULL if no character starts in \a str before \a p.
 */
char *BLI_str_find_prev_char_utf8(const char *str, const char *p)
{
  /* Test the bound before stepping back, so that no pointer before the start of `str`
   * is ever formed (that would be undefined behavior, even without dereferencing it). */
  while (p > str) {
    p--;
    if ((*p & 0xc0) != 0x80) {
      return (char *)p;
    }
  }
  return NULL;
}
817
/* was g_utf8_find_next_char */
/**
 * Find the start of the next utf8 character in the string after \a p.
 *
 * \a p does not have to be at the beginning of a character. No check is made to see if the
 * character found is actually valid other than it starts with an appropriate byte
 * (any byte that is not a continuation byte).
 *
 * \param p: a pointer to a position within a utf8 encoded string.
 * \param end: a pointer to the byte following the end of the string,
 * or NULL to indicate that the string is null terminated.
 * \return a pointer to the found character, or NULL when \a end is reached first.
 * When \a p points to a null byte it is not advanced (with a NULL \a end, \a p itself
 * is returned).
 */
char *BLI_str_find_next_char_utf8(const char *p, const char *end)
{
  if (*p != '\0') {
    /* Step over the current byte, then over any continuation byte that follows. */
    p++;
    if (end != NULL) {
      while (p < end && (*p & 0xc0) == 0x80) {
        p++;
      }
    }
    else {
      /* The null terminator is not a continuation byte, it stops the scan. */
      while ((*p & 0xc0) == 0x80) {
        p++;
      }
    }
  }

  if (p == end) {
    return NULL;
  }
  return (char *)p;
}
849
/* was g_utf8_prev_char */
/**
 * Find the start of the utf8 character preceding \a p.
 *
 * \a p does not have to be at the beginning of a character. No check is made to see if the
 * character found is actually valid other than it starts with an appropriate byte
 * (any byte that is not a continuation byte).
 *
 * There is no lower bound on the backwards scan: if \a p might be the first character of
 * the string, use #BLI_str_find_prev_char_utf8 instead.
 *
 * \param p: a pointer to a position within a utf8 encoded string.
 * \return a pointer to the found character.
 */
char *BLI_str_prev_char_utf8(const char *p)
{
  /* Step back at least once, then for as long as a continuation byte is found. */
  do {
    p--;
  } while ((*p & 0xc0) == 0x80);

  return (char *)p;
}
873 /* end glib copy */
874
/**
 * Partition the null terminated string \a str at the first delimiter found
 * when scanning from the left, see #BLI_str_partition_ex_utf8.
 */
size_t BLI_str_partition_utf8(const char *str,
                              const uint delim[],
                              const char **sep,
                              const char **suf)
{
  /* No explicit end: the whole null terminated string is searched. */
  const char *const end = NULL;
  const bool from_right = false;

  return BLI_str_partition_ex_utf8(str, end, delim, sep, suf, from_right);
}
882
/**
 * Partition the null terminated string \a str at the first delimiter found
 * when scanning from the right, see #BLI_str_partition_ex_utf8.
 */
size_t BLI_str_rpartition_utf8(const char *str,
                               const uint delim[],
                               const char **sep,
                               const char **suf)
{
  /* No explicit end: the whole null terminated string is searched. */
  const char *const end = NULL;
  const bool from_right = true;

  return BLI_str_partition_ex_utf8(str, end, delim, sep, suf, from_right);
}
890
/**
 * Split \a str around the first character found in \a delim.
 *
 * \param str: the string to search.
 * \param end: optional end of the searched range, NULL to use the whole null terminated string.
 * \param delim: array of delimiter code points, terminated by a zero entry.
 * \param sep: set to the delimiter found in \a str, NULL if there is none.
 * \param suf: set to the character following the delimiter, NULL if there is none.
 * \param from_right: when true the characters are visited from the last one backwards,
 * otherwise from the first one forwards.
 * \return the byte offset of the delimiter in \a str when one is found. Otherwise the length in
 * bytes of the searched range, this includes the case where a character fails to decode.
 */
size_t BLI_str_partition_ex_utf8(const char *str,
                                 const char *end,
                                 const uint delim[],
                                 const char **sep,
                                 const char **suf,
                                 const bool from_right)
{
  const uint *d;
  const size_t str_len = end ? (size_t)(end - str) : strlen(str);
  /* Byte offset of the next character, only meaningful when scanning forwards. */
  size_t index;

  /* Note that here, we assume end points to a valid utf8 char! */
  BLI_assert(end == NULL || (end >= str && (BLI_str_utf8_as_unicode(end) != BLI_UTF8_ERR)));

  /* Start with the end of the range as suffix, when scanning backwards
   * it is what follows the first visited character. */
  *suf = (char *)(str + str_len);

  /* `*sep` is the loop cursor: the start of the character being tested. */
  for (*sep = (char *)(from_right ? BLI_str_find_prev_char_utf8(str, str + str_len) : str),
      index = 0;
       *sep >= str && (!end || *sep < end) && **sep != '\0';
       *sep = (char *)(from_right ? BLI_str_find_prev_char_utf8(str, *sep) : str + index)) {
    /* Decoding also advances `index` past the character. */
    const uint c = BLI_str_utf8_as_unicode_and_size(*sep, &index);

    if (c == BLI_UTF8_ERR) {
      /* Give up on the first character that fails to decode. */
      *suf = *sep = NULL;
      break;
    }

    for (d = delim; *d != '\0'; d++) {
      if (*d == c) {
        /* *suf is already correct in case from_right is true. */
        if (!from_right) {
          *suf = (char *)(str + index);
        }
        return (size_t)(*sep - str);
      }
    }

    *suf = *sep; /* Useful in 'from_right' case! */
  }

  /* No delimiter found (or decoding error). */
  *suf = *sep = NULL;
  return str_len;
}
934
935 /* -------------------------------------------------------------------- */
936 /** \name Offset Conversion in Strings
937 * \{ */
938
/**
 * Convert a byte offset in \a str into a character index.
 *
 * \param offset: byte offset, expected to be on a character boundary within \a str.
 * \return the number of characters starting before \a offset.
 */
int BLI_str_utf8_offset_to_index(const char *str, int offset)
{
  int index = 0, pos = 0;

  /* Use `<` and the error-skipping size so the loop always moves forward and ends:
   * an invalid byte is stepped over as a single character instead of moving `pos` backwards,
   * and an offset that is not on a character boundary cannot be jumped over forever. */
  while (pos < offset) {
    pos += BLI_str_utf8_size_safe(str + pos);
    index++;
  }
  return index;
}
948
/**
 * Convert a character index in the null terminated string \a str into a byte offset.
 *
 * \param index: character index.
 * \return the byte offset of the character at \a index, or of the end of the string
 * when \a str has fewer characters than that.
 */
int BLI_str_utf8_offset_from_index(const char *str, int index)
{
  int offset = 0, pos = 0;

  /* Stop at the null terminator so an index past the last character does not walk out of
   * the string, and use the error-skipping size so an invalid byte is stepped over as a
   * single character instead of moving `offset` backwards. */
  while (str[offset] != '\0' && pos < index) {
    offset += BLI_str_utf8_size_safe(str + offset);
    pos++;
  }
  return offset;
}
958
/**
 * Convert a byte offset in \a str into a column.
 *
 * \return the sum of the widths (#BLI_str_utf8_char_width_safe) of the characters
 * starting before \a offset.
 */
int BLI_str_utf8_offset_to_column(const char *str, int offset)
{
  int column = 0;
  int pos;

  for (pos = 0; pos < offset; pos += BLI_str_utf8_size_safe(str + pos)) {
    column += BLI_str_utf8_char_width_safe(str + pos);
  }

  return column;
}
968
/**
 * Convert a column in the null terminated string \a str into a byte offset.
 *
 * Characters are consumed while the columns they occupy fit entirely within \a column,
 * the walk also stops at the end of the string.
 *
 * \return the byte offset reached.
 */
int BLI_str_utf8_offset_from_column(const char *str, int column)
{
  int offset = 0;
  /* Columns covered so far. */
  int pos = 0;

  while (str[offset] != '\0' && pos < column) {
    const int col = BLI_str_utf8_char_width_safe(str + offset);

    if (pos + col > column) {
      /* This character would go past the requested column. */
      break;
    }
    pos += col;
    offset += BLI_str_utf8_size_safe(str + offset);
  }

  return offset;
}
982
983 /** \} */
984