1 /*
2  * This program is free software; you can redistribute it and/or
3  * modify it under the terms of the GNU General Public License
4  * as published by the Free Software Foundation; either version 2
5  * of the License, or (at your option) any later version.
6  *
7  * This program is distributed in the hope that it will be useful,
8  * but WITHOUT ANY WARRANTY; without even the implied warranty of
9  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10  * GNU General Public License for more details.
11  *
12  * You should have received a copy of the GNU General Public License
13  * along with this program; if not, write to the Free Software Foundation,
14  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
15  *
16  * The Original Code is Copyright (C) 2011 Blender Foundation.
17  * Code from gutf8.c Copyright (C) 1999 Tom Tromey
18  *                   Copyright (C) 2000 Red Hat, Inc.
19  * All rights reserved.
20  */
21 
22 /** \file
23  * \ingroup bli
24  */
25 
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <wchar.h>
30 #include <wctype.h>
31 #include <wcwidth.h>
32 
33 #include "BLI_utildefines.h"
34 
35 #include "BLI_string_utf8.h" /* own include */
36 
37 #ifdef __GNUC__
38 #  pragma GCC diagnostic error "-Wsign-conversion"
39 #endif
40 
41 // #define DEBUG_STRSIZE
42 
/* Table copied from glib's gutf8.c: indexed by the first byte of a sequence,
 * it gives the number of bytes that sequence is expected to occupy.
 *
 * Note: the last two values (0xfe and 0xff) are forbidden in utf-8,
 * so they are considered 1 byte length too. */
static const size_t utf8_skip_data[256] = {
    /* 0x00 - 0xbf: ASCII and continuation bytes, all stepped over one byte at a time. */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xc0 - 0xdf: two byte sequences. */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 - 0xef: three, 0xf0 - 0xf7: four, 0xf8 - 0xfb: five, 0xfc - 0xfd: six,
     * 0xfe - 0xff: forbidden, counted as one. */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1,
};
56 
57 /* from libswish3, originally called u8_isvalid(),
58  * modified to return the index of the bad character (byte index not utf).
59  * http://svn.swish-e.org/libswish3/trunk/src/libswish3/utf8.c r3044 - campbell */
60 
61 /* based on the valid_utf8 routine from the PCRE library by Philip Hazel
62  *
63  * length is in bytes, since without knowing whether the string is valid
64  * it's hard to know how many characters there are! */
65 
66 /**
67  * Find first utf-8 invalid byte in given \a str, of \a length bytes.
68  *
69  * \return the offset of the first invalid byte.
70  */
BLI_utf8_invalid_byte(const char * str,size_t length)71 ptrdiff_t BLI_utf8_invalid_byte(const char *str, size_t length)
72 {
73   const unsigned char *p, *perr, *pend = (const unsigned char *)str + length;
74   unsigned char c;
75   int ab;
76 
77   for (p = (const unsigned char *)str; p < pend; p++, length--) {
78     c = *p;
79     perr = p; /* Erroneous char is always the first of an invalid utf8 sequence... */
80     if (ELEM(c, 0xfe, 0xff, 0x00)) {
81       /* Those three values are not allowed in utf8 string. */
82       goto utf8_error;
83     }
84     if (c < 128) {
85       continue;
86     }
87     if ((c & 0xc0) != 0xc0) {
88       goto utf8_error;
89     }
90 
91     /* Note that since we always increase p (and decrease length) by one byte in main loop,
92      * we only add/subtract extra utf8 bytes in code below
93      * (ab number, aka number of bytes remaining in the utf8 sequence after the initial one). */
94     ab = (int)utf8_skip_data[c] - 1;
95     if (length <= ab) {
96       goto utf8_error;
97     }
98 
99     /* Check top bits in the second byte */
100     p++;
101     length--;
102     if ((*p & 0xc0) != 0x80) {
103       goto utf8_error;
104     }
105 
106     /* Check for overlong sequences for each different length */
107     switch (ab) {
108       case 1:
109         /* Check for xx00 000x */
110         if ((c & 0x3e) == 0) {
111           goto utf8_error;
112         }
113         continue; /* We know there aren't any more bytes to check */
114 
115       case 2:
116         /* Check for 1110 0000, xx0x xxxx */
117         if (c == 0xe0 && (*p & 0x20) == 0) {
118           goto utf8_error;
119         }
120         /* Some special cases, see section 5 of utf-8 decoder stress-test by Markus Kuhn
121          * (https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt). */
122         /* From section 5.1 (and 5.2) */
123         if (c == 0xed) {
124           if (*p == 0xa0 && *(p + 1) == 0x80) {
125             goto utf8_error;
126           }
127           if (*p == 0xad && *(p + 1) == 0xbf) {
128             goto utf8_error;
129           }
130           if (*p == 0xae && *(p + 1) == 0x80) {
131             goto utf8_error;
132           }
133           if (*p == 0xaf && *(p + 1) == 0xbf) {
134             goto utf8_error;
135           }
136           if (*p == 0xb0 && *(p + 1) == 0x80) {
137             goto utf8_error;
138           }
139           if (*p == 0xbe && *(p + 1) == 0x80) {
140             goto utf8_error;
141           }
142           if (*p == 0xbf && *(p + 1) == 0xbf) {
143             goto utf8_error;
144           }
145         }
146         /* From section 5.3 */
147         if (c == 0xef) {
148           if (*p == 0xbf && *(p + 1) == 0xbe) {
149             goto utf8_error;
150           }
151           if (*p == 0xbf && *(p + 1) == 0xbf) {
152             goto utf8_error;
153           }
154         }
155         break;
156 
157       case 3:
158         /* Check for 1111 0000, xx00 xxxx */
159         if (c == 0xf0 && (*p & 0x30) == 0) {
160           goto utf8_error;
161         }
162         break;
163 
164       case 4:
165         /* Check for 1111 1000, xx00 0xxx */
166         if (c == 0xf8 && (*p & 0x38) == 0) {
167           goto utf8_error;
168         }
169         break;
170 
171       case 5:
172         /* Check for 1111 1100, xx00 00xx */
173         if (c == 0xfc && (*p & 0x3c) == 0) {
174           goto utf8_error;
175         }
176         break;
177     }
178 
179     /* Check for valid bytes after the 2nd, if any; all must start 10 */
180     while (--ab > 0) {
181       p++;
182       length--;
183       if ((*p & 0xc0) != 0x80) {
184         goto utf8_error;
185       }
186     }
187   }
188 
189   return -1;
190 
191 utf8_error:
192 
193   return ((const char *)perr - (const char *)str);
194 }
195 
/**
 * Remove, in place, every byte of \a str that #BLI_utf8_invalid_byte reports as invalid.
 *
 * \param str: buffer to clean, `str[length]` is expected to be the NUL terminator.
 * \param length: number of bytes in \a str, terminator excluded.
 * \return number of stripped bytes.
 */
int BLI_utf8_invalid_strip(char *str, size_t length)
{
  int stripped = 0;

  BLI_assert(str[length] == '\0');

  for (;;) {
    const ptrdiff_t bad_offset = BLI_utf8_invalid_byte(str, length);
    if (bad_offset == -1) {
      break;
    }

    /* Jump onto the bad byte, `length` becomes the number of bytes following it. */
    str += bad_offset;
    length -= (size_t)(bad_offset + 1);
    stripped++;

    if (length == 0) {
      /* The bad byte was the last one, simply terminate the string here. */
      *str = '\0';
      break;
    }

    /* Close the gap (the extra byte moved is the terminator) and keep scanning. */
    memmove(str, str + 1, length + 1);
  }

  return stripped;
}
225 
/* Compatible with BLI_strncpy, but ensures no partial utf8 chars are copied. */

/**
 * Copy whole utf-8 characters from \a src to \a dst while they fit in \a maxncpy bytes
 * (terminator included), then NUL-terminate \a dst.
 *
 * All three arguments must be modifiable lvalues: \a dst and \a src are advanced past the
 * copied bytes and \a maxncpy is decreased by the number of bytes copied.
 *
 * - The lead byte is converted to `unsigned char` before indexing `utf8_skip_data`,
 *   a plain `char` may be negative for bytes >= 0x80 and would index before the table.
 * - A multi-byte sequence cut short by the terminator stops the copy,
 *   so \a src is never read past its NUL byte.
 */
#define BLI_STR_UTF8_CPY(dst, src, maxncpy) \
  { \
    size_t utf8_size; \
    while (*(src) != '\0' && \
           (utf8_size = utf8_skip_data[(unsigned char)*(src)]) < (maxncpy)) { \
      size_t utf8_i; \
      /* Check that all expected bytes exist before the terminator. */ \
      for (utf8_i = 1; utf8_i < utf8_size && (src)[utf8_i] != '\0'; utf8_i++) { \
        /* pass */ \
      } \
      if (utf8_i != utf8_size) { \
        break; \
      } \
      (maxncpy) -= utf8_size; \
      for (; utf8_i != 0; utf8_i--) { \
        *(dst)++ = *(src)++; \
      } \
    } \
    *(dst) = '\0'; \
  } \
  (void)0
256 
/**
 * Copy \a src into \a dst through #BLI_STR_UTF8_CPY, \a maxncpy being the size of \a dst
 * in bytes (must not be zero).
 *
 * \return \a dst.
 */
char *BLI_strncpy_utf8(char *__restrict dst, const char *__restrict src, size_t maxncpy)
{
  /* The copy macro advances `dst`, remember where it started. */
  char *const dst_start = dst;

  BLI_assert(maxncpy != 0);

#ifdef DEBUG_STRSIZE
  memset(dst, 0xff, sizeof(*dst) * maxncpy);
#endif

  /* NOTE: no attempt is made to validate the utf8 chars of `src`. */
  BLI_STR_UTF8_CPY(dst, src, maxncpy);

  return dst_start;
}
272 
/**
 * Same copy as #BLI_strncpy_utf8 (\a maxncpy is the size of \a dst in bytes, not zero).
 *
 * \return the number of bytes written to \a dst, terminator excluded.
 */
size_t BLI_strncpy_utf8_rlen(char *__restrict dst, const char *__restrict src, size_t maxncpy)
{
  /* The copy macro advances `dst`, the distance covered is the copied length. */
  char *const dst_start = dst;

  BLI_assert(maxncpy != 0);

#ifdef DEBUG_STRSIZE
  memset(dst, 0xff, sizeof(*dst) * maxncpy);
#endif

  /* NOTE: no attempt is made to validate the utf8 chars of `src`. */
  BLI_STR_UTF8_CPY(dst, src, maxncpy);

  return (size_t)(dst - dst_start);
}
288 
289 #undef BLI_STR_UTF8_CPY
290 
291 /* --------------------------------------------------------------------------*/
292 /* wchar_t / utf8 functions  */
293 
BLI_strncpy_wchar_as_utf8(char * __restrict dst,const wchar_t * __restrict src,const size_t maxncpy)294 size_t BLI_strncpy_wchar_as_utf8(char *__restrict dst,
295                                  const wchar_t *__restrict src,
296                                  const size_t maxncpy)
297 {
298   const size_t maxlen = maxncpy - 1;
299   /* 6 is max utf8 length of an unicode char. */
300   const int64_t maxlen_secured = (int64_t)maxlen - 6;
301   size_t len = 0;
302 
303   BLI_assert(maxncpy != 0);
304 
305 #ifdef DEBUG_STRSIZE
306   memset(dst, 0xff, sizeof(*dst) * maxncpy);
307 #endif
308 
309   while (*src && len <= maxlen_secured) {
310     len += BLI_str_utf8_from_unicode((uint)*src++, dst + len);
311   }
312 
313   /* We have to be more careful for the last six bytes,
314    * to avoid buffer overflow in case utf8-encoded char would be too long for our dst buffer. */
315   while (*src) {
316     char t[6];
317     size_t l = BLI_str_utf8_from_unicode((uint)*src++, t);
318     BLI_assert(l <= 6);
319     if (len + l > maxlen) {
320       break;
321     }
322     memcpy(dst + len, t, l);
323     len += l;
324   }
325 
326   dst[len] = '\0';
327 
328   return len;
329 }
330 
331 /* wchar len in utf8 */
BLI_wstrlen_utf8(const wchar_t * src)332 size_t BLI_wstrlen_utf8(const wchar_t *src)
333 {
334   size_t len = 0;
335 
336   while (*src) {
337     len += BLI_str_utf8_from_unicode((uint)*src++, NULL);
338   }
339 
340   return len;
341 }
342 
/**
 * Count the characters of the NUL-terminated string \a strc, stepping with
 * #BLI_str_utf8_size_safe (an invalid lead byte counts as one character of one byte).
 *
 * \param r_len_bytes: set to the number of bytes that were stepped over.
 * \return the number of steps taken (characters).
 */
size_t BLI_strlen_utf8_ex(const char *strc, size_t *r_len_bytes)
{
  const char *cursor = strc;
  size_t count = 0;

  while (*cursor) {
    cursor += BLI_str_utf8_size_safe(cursor);
    count++;
  }

  *r_len_bytes = (size_t)(cursor - strc);
  return count;
}
355 
/**
 * Same as #BLI_strlen_utf8_ex, for callers that do not need the byte length.
 *
 * \return the character count reported by #BLI_strlen_utf8_ex.
 */
size_t BLI_strlen_utf8(const char *strc)
{
  size_t len_bytes_unused;
  const size_t len_chars = BLI_strlen_utf8_ex(strc, &len_bytes_unused);

  return len_chars;
}
361 
/**
 * Count the characters of \a strc that fit entirely in its first \a maxlen bytes,
 * stopping early at a NUL byte. Steps are computed with #BLI_str_utf8_size_safe.
 *
 * \param r_len_bytes: set to the number of bytes covered by the counted characters.
 * \return the number of characters counted.
 */
size_t BLI_strnlen_utf8_ex(const char *strc, const size_t maxlen, size_t *r_len_bytes)
{
  const char *cursor = strc;
  const char *const limit = strc + maxlen;
  size_t count = 0;

  for (;;) {
    const size_t step = (size_t)BLI_str_utf8_size_safe(cursor);
    if (*cursor == '\0') {
      break;
    }
    /* Do not count a character that would cross the byte limit. */
    if (cursor + step > limit) {
      break;
    }
    cursor += step;
    count++;
  }

  *r_len_bytes = (size_t)(cursor - strc);
  return count;
}
380 
/**
 * Same as #BLI_strnlen_utf8_ex, for callers that do not need the byte length.
 *
 * \param strc: the string to measure the length.
 * \param maxlen: maximum number of bytes of \a strc to consider.
 * \return the unicode length (not in bytes!)
 */
size_t BLI_strnlen_utf8(const char *strc, const size_t maxlen)
{
  size_t len_bytes_unused;
  const size_t len_chars = BLI_strnlen_utf8_ex(strc, maxlen, &len_bytes_unused);

  return len_chars;
}
391 
BLI_strncpy_wchar_from_utf8(wchar_t * __restrict dst_w,const char * __restrict src_c,const size_t maxncpy)392 size_t BLI_strncpy_wchar_from_utf8(wchar_t *__restrict dst_w,
393                                    const char *__restrict src_c,
394                                    const size_t maxncpy)
395 {
396   const size_t maxlen = maxncpy - 1;
397   size_t len = 0;
398 
399   BLI_assert(maxncpy != 0);
400 
401 #ifdef DEBUG_STRSIZE
402   memset(dst_w, 0xff, sizeof(*dst_w) * maxncpy);
403 #endif
404 
405   while (*src_c && len != maxlen) {
406     size_t step = 0;
407     uint unicode = BLI_str_utf8_as_unicode_and_size(src_c, &step);
408     if (unicode != BLI_UTF8_ERR) {
409       /* TODO: `wchar_t` type is an implementation-defined and may represent
410        * 16-bit or 32-bit depending on operating system.
411        * So the ideal would be to do the corresponding encoding.
412        * But for now just assert that it has no conflicting use. */
413       BLI_assert(step <= sizeof(wchar_t));
414       *dst_w = (wchar_t)unicode;
415       src_c += step;
416     }
417     else {
418       *dst_w = '?';
419       src_c = BLI_str_find_next_char_utf8(src_c, NULL);
420     }
421     dst_w++;
422     len++;
423   }
424 
425   *dst_w = 0;
426 
427   return len;
428 }
429 
430 /* end wchar_t / utf8 functions  */
431 /* --------------------------------------------------------------------------*/
432 
433 /* count columns that character/string occupies, based on wcwidth.c */
434 
/**
 * Thin wrapper: forwards \a ucs to `mk_wcwidth` and returns its result unchanged.
 */
int BLI_wcwidth(char32_t ucs)
{
  const int columns = mk_wcwidth(ucs);

  return columns;
}
439 
/**
 * Thin wrapper: forwards \a pwcs and \a n to `mk_wcswidth` and returns its result unchanged.
 */
int BLI_wcswidth(const char32_t *pwcs, size_t n)
{
  const int columns = mk_wcswidth(pwcs, n);

  return columns;
}
444 
BLI_str_utf8_char_width(const char * p)445 int BLI_str_utf8_char_width(const char *p)
446 {
447   uint unicode = BLI_str_utf8_as_unicode(p);
448   if (unicode == BLI_UTF8_ERR) {
449     return -1;
450   }
451 
452   return BLI_wcwidth((char32_t)unicode);
453 }
454 
BLI_str_utf8_char_width_safe(const char * p)455 int BLI_str_utf8_char_width_safe(const char *p)
456 {
457   int columns;
458 
459   uint unicode = BLI_str_utf8_as_unicode(p);
460   if (unicode == BLI_UTF8_ERR) {
461     return 1;
462   }
463 
464   columns = BLI_wcwidth((char32_t)unicode);
465 
466   return (columns < 0) ? 1 : columns;
467 }
468 
469 /* --------------------------------------------------------------------------*/
470 
471 /* copied from glib's gutf8.c, added 'Err' arg */
472 
473 /* note, glib uses uint for unicode, best we do the same,
474  * though we don't typedef it - campbell */
475 
/**
 * Classify the first byte of a utf-8 sequence.
 *
 * \param Char: the first byte, expected as an unsigned value (evaluated several times).
 * \param Mask: lvalue, receives the mask selecting the payload bits of \a Char
 * (left untouched when \a Char is not a valid first byte).
 * \param Len: lvalue, receives the sequence length in bytes (1..6),
 * or \a Err when \a Char is not a valid first byte.
 * \param Err: value stored in \a Len on error, -1 is the typical error value or 1 to skip.
 *
 * Wrapped in `do { } while (0)` with parenthesized arguments so the macro behaves as a
 * single statement, including inside an unbraced `if` / `else`.
 */
#define UTF8_COMPUTE(Char, Mask, Len, Err) \
  do { \
    if ((Char) < 128) { \
      (Len) = 1; \
      (Mask) = 0x7f; \
    } \
    else if (((Char) & 0xe0) == 0xc0) { \
      (Len) = 2; \
      (Mask) = 0x1f; \
    } \
    else if (((Char) & 0xf0) == 0xe0) { \
      (Len) = 3; \
      (Mask) = 0x0f; \
    } \
    else if (((Char) & 0xf8) == 0xf0) { \
      (Len) = 4; \
      (Mask) = 0x07; \
    } \
    else if (((Char) & 0xfc) == 0xf8) { \
      (Len) = 5; \
      (Mask) = 0x03; \
    } \
    else if (((Char) & 0xfe) == 0xfc) { \
      (Len) = 6; \
      (Mask) = 0x01; \
    } \
    else { \
      (Len) = (Err); \
    } \
  } while (0)
505 
/**
 * Same as the glib define, with an added \a Err argument.
 *
 * Decode the \a Len bytes starting at \a Chars into \a Result.
 *
 * \param Result: lvalue, receives the decoded value, or \a Err as soon as one of the bytes
 * after the first is not a continuation byte (top bits `10`).
 * \param Chars: the bytes to decode.
 * \param Count: lvalue used as byte index, holds the index of the offending byte on error
 * and \a Len (or 1 when \a Len is smaller) on success.
 * \param Mask, Len: as computed by #UTF8_COMPUTE for the first byte.
 *
 * Wrapped in `do { } while (0)` so the whole macro is a single statement,
 * including inside an unbraced `if` / `else`.
 */
#define UTF8_GET(Result, Chars, Count, Mask, Len, Err) \
  do { \
    (Result) = (Chars)[0] & (Mask); \
    for ((Count) = 1; (Count) < (Len); ++(Count)) { \
      if (((Chars)[(Count)] & 0xc0) != 0x80) { \
        (Result) = (Err); \
        break; \
      } \
      (Result) <<= 6; \
      (Result) |= ((Chars)[(Count)] & 0x3f); \
    } \
  } while (0)
518 
/* Uses the glib-derived macros, but is not itself a glib function. */
/**
 * \return the length in bytes (1..6) announced by the first byte at \a p,
 * or -1 when that byte cannot start a utf-8 sequence.
 */
int BLI_str_utf8_size(const char *p)
{
  const unsigned char lead = (unsigned char)*p;
  int size;
  int mask_unused = 0;

  UTF8_COMPUTE(lead, mask_unused, size, -1);

  /* Only the length is of interest here, quiet the unused warning. */
  (void)mask_unused;

  return size;
}
532 
/**
 * Variant of #BLI_str_utf8_size to use when errors should be skipped.
 *
 * \return the length in bytes (1..6) announced by the first byte at \a p,
 * or 1 when that byte cannot start a utf-8 sequence.
 */
int BLI_str_utf8_size_safe(const char *p)
{
  const unsigned char lead = (unsigned char)*p;
  int size;
  int mask_unused = 0;

  UTF8_COMPUTE(lead, mask_unused, size, 1);

  /* Only the length is of interest here, quiet the unused warning. */
  (void)mask_unused;

  return size;
}
545 
546 /* was g_utf8_get_char */
547 /**
548  * BLI_str_utf8_as_unicode:
549  * \param p: a pointer to Unicode character encoded as UTF-8
550  *
551  * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
552  * If \a p does not point to a valid UTF-8 encoded character, results are
553  * undefined. If you are not sure that the bytes are complete
554  * valid Unicode characters, you should use g_utf8_get_char_validated()
555  * instead.
556  *
557  * Return value: the resulting character
558  */
BLI_str_utf8_as_unicode(const char * p)559 uint BLI_str_utf8_as_unicode(const char *p)
560 {
561   int i, len;
562   uint mask = 0;
563   uint result;
564   const unsigned char c = (unsigned char)*p;
565 
566   UTF8_COMPUTE(c, mask, len, -1);
567   if (UNLIKELY(len == -1)) {
568     return BLI_UTF8_ERR;
569   }
570   UTF8_GET(result, p, i, mask, len, BLI_UTF8_ERR);
571 
572   return result;
573 }
574 
575 /* variant that increments the length */
BLI_str_utf8_as_unicode_and_size(const char * __restrict p,size_t * __restrict index)576 uint BLI_str_utf8_as_unicode_and_size(const char *__restrict p, size_t *__restrict index)
577 {
578   int i, len;
579   uint mask = 0;
580   uint result;
581   const unsigned char c = (unsigned char)*p;
582 
583   UTF8_COMPUTE(c, mask, len, -1);
584   if (UNLIKELY(len == -1)) {
585     return BLI_UTF8_ERR;
586   }
587   UTF8_GET(result, p, i, mask, len, BLI_UTF8_ERR);
588   *index += (size_t)len;
589   return result;
590 }
591 
BLI_str_utf8_as_unicode_and_size_safe(const char * __restrict p,size_t * __restrict index)592 uint BLI_str_utf8_as_unicode_and_size_safe(const char *__restrict p, size_t *__restrict index)
593 {
594   int i, len;
595   uint mask = 0;
596   uint result;
597   const unsigned char c = (unsigned char)*p;
598 
599   UTF8_COMPUTE(c, mask, len, -1);
600   if (UNLIKELY(len == -1)) {
601     *index += 1;
602     return c;
603   }
604   UTF8_GET(result, p, i, mask, len, BLI_UTF8_ERR);
605   *index += (size_t)len;
606   return result;
607 }
608 
609 /* another variant that steps over the index,
610  * note, currently this also falls back to latin1 for text drawing. */
BLI_str_utf8_as_unicode_step(const char * __restrict p,size_t * __restrict index)611 uint BLI_str_utf8_as_unicode_step(const char *__restrict p, size_t *__restrict index)
612 {
613   int i, len;
614   uint mask = 0;
615   uint result;
616   unsigned char c;
617 
618   p += *index;
619   c = (unsigned char)*p;
620 
621   UTF8_COMPUTE(c, mask, len, -1);
622   if (UNLIKELY(len == -1)) {
623     /* when called with NULL end, result will never be NULL,
624      * checks for a NULL character */
625     const char *p_next = BLI_str_find_next_char_utf8(p, NULL);
626     /* will never return the same pointer unless '\0',
627      * eternal loop is prevented */
628     *index += (size_t)(p_next - p);
629     return BLI_UTF8_ERR;
630   }
631 
632   /* this is tricky since there are a few ways we can bail out of bad unicode
633    * values, 3 possible solutions. */
634 #if 0
635   UTF8_GET(result, p, i, mask, len, BLI_UTF8_ERR);
636 #elif 1
637   /* WARNING: this is NOT part of glib, or supported by similar functions.
638    * this is added for text drawing because some filepaths can have latin1
639    * characters */
640   UTF8_GET(result, p, i, mask, len, BLI_UTF8_ERR);
641   if (result == BLI_UTF8_ERR) {
642     len = 1;
643     result = *p;
644   }
645   /* end warning! */
646 #else
647   /* without a fallback like '?', text drawing will stop on this value */
648   UTF8_GET(result, p, i, mask, len, '?');
649 #endif
650 
651   *index += (size_t)len;
652   return result;
653 }
654 
655 /* was g_unichar_to_utf8 */
656 /**
657  * BLI_str_utf8_from_unicode:
658  * \param c: a Unicode character code
659  * \param outbuf: output buffer, must have at least 6 bytes of space.
660  *       If %NULL, the length will be computed and returned
661  *       and nothing will be written to outbuf.
662  *
663  * Converts a single character to UTF-8.
664  *
665  * \return number of bytes written
666  */
BLI_str_utf8_from_unicode(uint c,char * outbuf)667 size_t BLI_str_utf8_from_unicode(uint c, char *outbuf)
668 {
669   /* If this gets modified, also update the copy in g_string_insert_unichar() */
670   uint len = 0;
671   uint first;
672   uint i;
673 
674   if (c < 0x80) {
675     first = 0;
676     len = 1;
677   }
678   else if (c < 0x800) {
679     first = 0xc0;
680     len = 2;
681   }
682   else if (c < 0x10000) {
683     first = 0xe0;
684     len = 3;
685   }
686   else if (c < 0x200000) {
687     first = 0xf0;
688     len = 4;
689   }
690   else if (c < 0x4000000) {
691     first = 0xf8;
692     len = 5;
693   }
694   else {
695     first = 0xfc;
696     len = 6;
697   }
698 
699   if (outbuf) {
700     for (i = len - 1; i > 0; i--) {
701       outbuf[i] = (c & 0x3f) | 0x80;
702       c >>= 6;
703     }
704     outbuf[0] = c | first;
705   }
706 
707   return len;
708 }
709 
BLI_str_utf8_as_utf32(char32_t * __restrict dst_w,const char * __restrict src_c,const size_t maxncpy)710 size_t BLI_str_utf8_as_utf32(char32_t *__restrict dst_w,
711                              const char *__restrict src_c,
712                              const size_t maxncpy)
713 {
714   const size_t maxlen = maxncpy - 1;
715   size_t len = 0;
716 
717   BLI_assert(maxncpy != 0);
718 
719 #ifdef DEBUG_STRSIZE
720   memset(dst_w, 0xff, sizeof(*dst_w) * maxncpy);
721 #endif
722 
723   while (*src_c && len != maxlen) {
724     size_t step = 0;
725     uint unicode = BLI_str_utf8_as_unicode_and_size(src_c, &step);
726     if (unicode != BLI_UTF8_ERR) {
727       *dst_w = unicode;
728       src_c += step;
729     }
730     else {
731       *dst_w = '?';
732       src_c = BLI_str_find_next_char_utf8(src_c, NULL);
733     }
734     dst_w++;
735     len++;
736   }
737 
738   *dst_w = 0;
739 
740   return len;
741 }
742 
BLI_str_utf32_as_utf8(char * __restrict dst,const char32_t * __restrict src,const size_t maxncpy)743 size_t BLI_str_utf32_as_utf8(char *__restrict dst,
744                              const char32_t *__restrict src,
745                              const size_t maxncpy)
746 {
747   const size_t maxlen = maxncpy - 1;
748   /* 6 is max utf8 length of an unicode char. */
749   const int64_t maxlen_secured = (int64_t)maxlen - 6;
750   size_t len = 0;
751 
752   BLI_assert(maxncpy != 0);
753 
754 #ifdef DEBUG_STRSIZE
755   memset(dst, 0xff, sizeof(*dst) * maxncpy);
756 #endif
757 
758   while (*src && len <= maxlen_secured) {
759     len += BLI_str_utf8_from_unicode((uint)*src++, dst + len);
760   }
761 
762   /* We have to be more careful for the last six bytes,
763    * to avoid buffer overflow in case utf8-encoded char would be too long for our dst buffer. */
764   while (*src) {
765     char t[6];
766     size_t l = BLI_str_utf8_from_unicode((uint)*src++, t);
767     BLI_assert(l <= 6);
768     if (len + l > maxlen) {
769       break;
770     }
771     memcpy(dst + len, t, l);
772     len += l;
773   }
774 
775   dst[len] = '\0';
776 
777   return len;
778 }
779 
780 /* utf32 len in utf8 */
BLI_str_utf32_as_utf8_len(const char32_t * src)781 size_t BLI_str_utf32_as_utf8_len(const char32_t *src)
782 {
783   size_t len = 0;
784 
785   while (*src) {
786     len += BLI_str_utf8_from_unicode((uint)*src++, NULL);
787   }
788 
789   return len;
790 }
791 
/* was g_utf8_find_prev_char */
/**
 * BLI_str_find_prev_char_utf8:
 * \param str: pointer to the beginning of a UTF-8 encoded string
 * \param p: pointer to some position within \a str
 *
 * Given a position \a p with a UTF-8 encoded string \a str, find the start
 * of the previous UTF-8 character starting before \a p. Returns %NULL if no
 * UTF-8 characters are present in \a str before \a p
 *
 * \a p does not have to be at the beginning of a UTF-8 character. No check
 * is made to see if the character found is actually valid other than
 * it starts with an appropriate byte.
 *
 * Return value: a pointer to the found character or %NULL.
 */
char *BLI_str_find_prev_char_utf8(const char *str, const char *p)
{
  /* Test the bound before stepping back, so the pointer is never moved before `str`
   * (forming such a pointer is undefined, even when it is not dereferenced). */
  while (p > str) {
    p--;
    /* Anything but a continuation byte (top bits `10`) starts a character. */
    if ((*p & 0xc0) != 0x80) {
      return (char *)p;
    }
  }
  return NULL;
}
817 
/* was g_utf8_find_next_char */
/**
 * BLI_str_find_next_char_utf8:
 * \param p: a pointer to a position within a UTF-8 encoded string
 * \param end: a pointer to the byte following the end of the string,
 * or %NULL to indicate that the string is nul-terminated.
 *
 * Finds the start of the next UTF-8 character in the string after \a p
 *
 * \a p does not have to be at the beginning of a UTF-8 character. No check
 * is made to see if the character found is actually valid other than
 * it starts with an appropriate byte.
 *
 * Return value: a pointer to the found character, \a p itself when it points to a
 * NUL byte, or %NULL when the search reached \a end.
 */
char *BLI_str_find_next_char_utf8(const char *p, const char *end)
{
  if (*p != '\0') {
    /* Step off the current byte, then skip continuation bytes (top bits `10`). */
    p++;
    if (end != NULL) {
      while (p < end && (*p & 0xc0) == 0x80) {
        p++;
      }
    }
    else {
      while ((*p & 0xc0) == 0x80) {
        p++;
      }
    }
  }

  if (p == end) {
    return NULL;
  }
  return (char *)p;
}
849 
/* was g_utf8_prev_char */
/**
 * BLI_str_prev_char_utf8:
 * \param p: a pointer to a position within a UTF-8 encoded string
 *
 * Finds the previous UTF-8 character in the string before \a p
 *
 * \a p does not have to be at the beginning of a UTF-8 character. No check
 * is made to see if the character found is actually valid other than
 * it starts with an appropriate byte. The search has no lower bound: if \a p might be
 * the first character of the string, use #BLI_str_find_prev_char_utf8 instead.
 *
 * Return value: a pointer to the found character.
 */
char *BLI_str_prev_char_utf8(const char *p)
{
  /* Walk back until a byte that is not a continuation byte (top bits `10`). */
  do {
    p--;
  } while ((*p & 0xc0) == 0x80);

  return (char *)p;
}
873 /* end glib copy */
874 
/**
 * Left-to-right partition of the whole NUL-terminated \a str:
 * forwards to #BLI_str_partition_ex_utf8 with no end pointer and `from_right` disabled.
 *
 * \return the value returned by #BLI_str_partition_ex_utf8.
 */
size_t BLI_str_partition_utf8(const char *str,
                              const uint delim[],
                              const char **sep,
                              const char **suf)
{
  const bool from_right = false;

  return BLI_str_partition_ex_utf8(str, NULL, delim, sep, suf, from_right);
}
882 
/**
 * Right-to-left partition of the whole NUL-terminated \a str:
 * forwards to #BLI_str_partition_ex_utf8 with no end pointer and `from_right` enabled.
 *
 * \return the value returned by #BLI_str_partition_ex_utf8.
 */
size_t BLI_str_rpartition_utf8(const char *str,
                               const uint delim[],
                               const char **sep,
                               const char **suf)
{
  const bool from_right = true;

  return BLI_str_partition_ex_utf8(str, NULL, delim, sep, suf, from_right);
}
890 
BLI_str_partition_ex_utf8(const char * str,const char * end,const uint delim[],const char ** sep,const char ** suf,const bool from_right)891 size_t BLI_str_partition_ex_utf8(const char *str,
892                                  const char *end,
893                                  const uint delim[],
894                                  const char **sep,
895                                  const char **suf,
896                                  const bool from_right)
897 {
898   const uint *d;
899   const size_t str_len = end ? (size_t)(end - str) : strlen(str);
900   size_t index;
901 
902   /* Note that here, we assume end points to a valid utf8 char! */
903   BLI_assert(end == NULL || (end >= str && (BLI_str_utf8_as_unicode(end) != BLI_UTF8_ERR)));
904 
905   *suf = (char *)(str + str_len);
906 
907   for (*sep = (char *)(from_right ? BLI_str_find_prev_char_utf8(str, str + str_len) : str),
908       index = 0;
909        *sep >= str && (!end || *sep < end) && **sep != '\0';
910        *sep = (char *)(from_right ? BLI_str_find_prev_char_utf8(str, *sep) : str + index)) {
911     const uint c = BLI_str_utf8_as_unicode_and_size(*sep, &index);
912 
913     if (c == BLI_UTF8_ERR) {
914       *suf = *sep = NULL;
915       break;
916     }
917 
918     for (d = delim; *d != '\0'; d++) {
919       if (*d == c) {
920         /* *suf is already correct in case from_right is true. */
921         if (!from_right) {
922           *suf = (char *)(str + index);
923         }
924         return (size_t)(*sep - str);
925       }
926     }
927 
928     *suf = *sep; /* Useful in 'from_right' case! */
929   }
930 
931   *suf = *sep = NULL;
932   return str_len;
933 }
934 
935 /* -------------------------------------------------------------------- */
936 /** \name Offset Conversion in Strings
937  * \{ */
938 
/**
 * Convert a byte \a offset in \a str into a character index.
 *
 * Characters are stepped over with #BLI_str_utf8_size_safe (an invalid first byte counts
 * as a one byte character) and the walk stops as soon as \a offset is reached or passed.
 * So an invalid byte or an offset pointing inside a character terminates too,
 * instead of stepping backward or skipping over \a offset forever.
 *
 * \param str: utf-8 string, NOTE(review): assumed to hold at least \a offset bytes.
 * \param offset: byte offset, values <= 0 give index 0.
 * \return number of characters starting before \a offset.
 */
int BLI_str_utf8_offset_to_index(const char *str, int offset)
{
  int index = 0, pos = 0;
  while (pos < offset) {
    pos += BLI_str_utf8_size_safe(str + pos);
    index++;
  }
  return index;
}
948 
/**
 * Convert a character \a index in \a str into a byte offset.
 *
 * Characters are stepped over with #BLI_str_utf8_size_safe (an invalid first byte counts
 * as a one byte character), so the offset never moves backward. No step is taken from a
 * NUL byte: an \a index larger than the number of characters yields the offset reached
 * at that point instead of walking on past the terminator.
 *
 * \param str: NUL-terminated utf-8 string.
 * \param index: character index, values <= 0 give offset 0.
 * \return byte offset of the character at \a index.
 */
int BLI_str_utf8_offset_from_index(const char *str, int index)
{
  int offset = 0, pos = 0;
  while (pos < index && str[offset] != '\0') {
    offset += BLI_str_utf8_size_safe(str + offset);
    pos++;
  }
  return offset;
}
958 
/**
 * Convert a byte \a offset in \a str into a column: the sum of the
 * #BLI_str_utf8_char_width_safe widths of the characters starting before \a offset.
 *
 * \return the column, 0 when \a offset is <= 0.
 */
int BLI_str_utf8_offset_to_column(const char *str, int offset)
{
  int column = 0;
  int pos;

  for (pos = 0; pos < offset; pos += BLI_str_utf8_size_safe(str + pos)) {
    column += BLI_str_utf8_char_width_safe(str + pos);
  }

  return column;
}
968 
/**
 * Convert a \a column in \a str into a byte offset: characters are stepped over while their
 * accumulated #BLI_str_utf8_char_width_safe widths do not exceed \a column,
 * stopping early at the NUL terminator.
 *
 * \return the byte offset reached.
 */
int BLI_str_utf8_offset_from_column(const char *str, int column)
{
  int offset = 0;
  int columns_used = 0;

  while (str[offset] != '\0' && columns_used < column) {
    const int width = BLI_str_utf8_char_width_safe(str + offset);

    /* A character that would cross the requested column is not stepped over. */
    if (columns_used + width > column) {
      break;
    }
    columns_used += width;
    offset += BLI_str_utf8_size_safe(str + offset);
  }

  return offset;
}
982 
983 /** \} */
984