1 /**********************************************************************
2 Freeciv - Copyright (C) 1996 - A Kjeldberg, L Gregersen, P Unold
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License as published by
5 the Free Software Foundation; either version 2, or (at your option)
6 any later version.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12 ***********************************************************************/
13
14 #ifdef HAVE_CONFIG_H
15 #include <fc_config.h>
16 #endif
17
#include <stdarg.h>
#include <stdlib.h>
#include <string.h>
20
21 /* utility */
22 #include "log.h"
23 #include "mem.h"
24 #include "support.h"
25
26 #include "fc_utf8.h"
27
28
/* The length of a character for external use (at least 1 to avoid infinite
 * loops). See also fc_ut8_next_char().
 *
 * The table is indexed by the first byte of a character. Unlike
 * fc_utf8_char_size[], bytes that cannot start a character (continuation
 * bytes 10xxxxxx and the unused top of the range) are given a length of 1
 * instead of 0, so that stepping by the table value always advances. */
const char fc_utf8_skip[256] = {
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00000000 to 00001111. */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00010000 to 00011111. */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00100000 to 00101111. */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00110000 to 00111111. */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 01000000 to 01001111. */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 01010000 to 01011111. */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 01100000 to 01101111. */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 01110000 to 01111111. */
  /* Continuation bytes: not a character start, skipped one byte at a time. */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10000000 to 10001111. */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10010000 to 10011111. */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10100000 to 10101111. */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10110000 to 10111111. */
  /* Lead bytes of multi-byte characters. */
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 11000000 to 11001111. */
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 11010000 to 11011111. */
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 11100000 to 11101111. */
#ifdef USE_6_BYTES_CHAR
  /* Also accepts the 5 and 6 bytes long forms. */
  4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1 /* 11110000 to 11111111. */
#else
  4, 4, 4, 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1 /* 11110000 to 11111111. */
#endif /* USE_6_BYTES_CHAR */
};
53
/* The length of a character for internal use (0 means an invalid start of
 * a character).
 *
 * The table is indexed by the first byte of a character. Continuation
 * bytes (10xxxxxx) and the unused top of the range are marked 0, which the
 * validation functions of this file treat as "invalid character".
 *
 * NOTE(review): 0xC0, 0xC1 and 0xF5 to 0xF7 are listed here with lengths 2
 * and 4, although RFC 3629 never allows them as the first byte of a
 * well-formed sequence - confirm whether accepting them is intended. */
static const char fc_utf8_char_size[256] = {
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00000000 to 00001111. */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00010000 to 00011111. */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00100000 to 00101111. */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00110000 to 00111111. */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 01000000 to 01001111. */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 01010000 to 01011111. */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 01100000 to 01101111. */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 01110000 to 01111111. */
  /* Continuation bytes: they never start a character. */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 10000000 to 10001111. */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 10010000 to 10011111. */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 10100000 to 10101111. */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 10110000 to 10111111. */
  /* Lead bytes of multi-byte characters. */
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 11000000 to 11001111. */
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 11010000 to 11011111. */
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 11100000 to 11101111. */
#ifdef USE_6_BYTES_CHAR
  /* Also accepts the 5 and 6 bytes long forms. */
  4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 /* 11110000 to 11111111. */
#else
  4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 /* 11110000 to 11111111. */
#endif /* USE_6_BYTES_CHAR */
};
78
/* Looks up in fc_utf8_char_size[] the length, in bytes, of the character
 * whose first byte is at 'utf8_char' (0 if that byte cannot start a
 * character).
 * The argument and the whole expansion are parenthesized so that the macro
 * also works with an expression such as 'ptr + 1'; without the parentheses
 * the '+ 1' would be added to the byte value instead of the pointer. The
 * argument is evaluated once and is only read, hence the const cast. */
#define FC_UTF8_CHAR_SIZE(utf8_char)                                        \
  (fc_utf8_char_size[*(const unsigned char *) (utf8_char)])

/* UTF-8 encoding of the replacement character.
 * NB: sizeof(FC_UTF8_REP_CHAR) is 4, not 3: it counts the '\0' which ends
 * the string literal. */
#define FC_UTF8_REP_CHAR "\xef\xbf\xbd" /* U+FFFD. */
83
84
/****************************************************************************
  Tells whether the 'size' bytes starting at 'utf8_char' form one UTF-8
  character: 'size' must be at least 1 and every byte following the first
  one must be a continuation byte (10xxxxxx).

  The first byte itself is not inspected here: the caller is expected to
  have derived 'size' from it. The scan stops at the first byte which is
  not a continuation byte, so it never runs past a terminating '\0'.
****************************************************************************/
static inline bool base_fc_utf8_char_validate(const char *utf8_char,
                                              char size)
{
  const unsigned char *bytes = (const unsigned char *) utf8_char;
  char checked = 1; /* Number of bytes accepted so far, first one included. */

  /* Accept continuation bytes one by one until the character is complete
   * or a byte doesn't match 10xxxxxx. */
  while (checked < size && 0x80 == (0xC0 & bytes[checked])) {
    checked++;
  }

  /* A size of 0 (or less) denotes an invalid first byte. */
  return (1 <= size && checked >= size);
}
106
/****************************************************************************
  UTF-8-safe variant of fc_strlcpy() base function, truncating flavour.

  Copies 'src' to 'dest', a buffer of 'n' bytes. The copy stops before the
  first invalid UTF-8 character of 'src', or before the first character
  which doesn't fit entirely in the 'n - 1' first bytes of 'dest'. The
  result is always terminated by '\0', unless 'n' is 0, in which case
  nothing is written.

  Returns strlen(src).
****************************************************************************/
static inline size_t base_fc_utf8_strlcpy_trunc(char *dest, const char *src,
                                                size_t n)
{
  const char *end;
  size_t len;

  if (0 == n) {
    /* No room, not even for the terminating '\0'. */
    return strlen(src);
  }

  /* Only 'n - 1' bytes can hold characters: the last one is kept for the
   * terminating '\0'. Validating 'n' bytes would allow 'len == n' and
   * then a write one byte after the end of 'dest'. */
  (void) fc_utf8_validate_len(src, n - 1, &end);
  len = end - src;
  fc_assert(len < n);
  if (0 < len) {
    memcpy(dest, src, len);
  }
  dest[len] = '\0';
  return strlen(src);
}
125
126 /****************************************************************************
127 UTF-8-safe variant of fc_strlcpy() base function.
128 ****************************************************************************/
base_fc_utf8_strlcpy_rep(char * dest,const char * src,size_t n)129 static inline size_t base_fc_utf8_strlcpy_rep(char *dest, const char *src,
130 size_t n)
131 {
132 const char *end;
133 size_t src_len, len;
134
135 fc_assert_ret_val(NULL != src, 0);
136
137 src_len = strlen(src);
138 while (TRUE) {
139 if (fc_utf8_validate_len(src, n, &end)) {
140 /* Valid UTF-8. */
141 len = end - src;
142
143 fc_assert(len < n);
144
145 if (0 < len) {
146 memcpy(dest, src, len);
147 }
148 dest[len] = '\0'; /* Valid UTF-8 string part. */
149 return src_len;
150 } else {
151 /* '*end' is not a valid UTF-8 character. */
152 len = end - src;
153
154 fc_assert(len < n);
155
156 if (0 < len) {
157 memcpy(dest, src, len);
158 }
159
160 n -= len;
161 dest += len;
162
163 /* Try to insert the replacement character. */
164 len = sizeof(FC_UTF8_REP_CHAR);
165 if (n > len) {
166 memcpy(dest, FC_UTF8_REP_CHAR, len);
167 n -= len;
168 dest += len;
169 }
170
171 if (1 == n) {
172 *dest = '\0';
173 return src_len; /* End of 'dest' reached. */
174 }
175
176 /* Jump to next character in src. */
177 src = fc_utf8_find_next_char(end);
178 if (src == NULL || *src == '\0') {
179 *dest = '\0';
180 return src_len; /* End of 'src' reached. */
181 }
182 }
183 }
184 fc_assert(FALSE); /* Shouldn't occur! */
185 return src_len;
186 }
187
188
189 /****************************************************************************
190 Returns TRUE if the character beginning at the pointer 'utf8_char' is
191 a valid UTF-8 character.
192 ****************************************************************************/
fc_utf8_char_validate(const char * utf8_char)193 bool fc_utf8_char_validate(const char *utf8_char)
194 {
195 fc_assert_ret_val(NULL != utf8_char, FALSE);
196
197 return base_fc_utf8_char_validate(utf8_char, FC_UTF8_CHAR_SIZE(utf8_char));
198 }
199
/****************************************************************************
  Jump to next UTF-8 character start: returns a pointer to the first byte
  after 'utf8_char' which can start a character. As '\0' can start a
  character, the search never goes past the end of the string.

  If 'utf8_char' already points to the terminating '\0', it is returned
  unchanged instead of reading after the end of the string.

  NB: This function can return an invalid UTF-8 character. Check with
  fc_utf8_char_validate() to be sure.
****************************************************************************/
char *fc_utf8_find_next_char(const char *utf8_char)
{
  fc_assert_ret_val(NULL != utf8_char, NULL);

  if ('\0' == *utf8_char) {
    /* End of string: there is no next character to jump to. */
    return (char *) utf8_char;
  }

  /* Skip the current byte, then every byte which cannot start a
   * character. */
  do {
    utf8_char++;
  } while (0 == FC_UTF8_CHAR_SIZE(utf8_char));
  return (char *) utf8_char;
}
215
/****************************************************************************
  Jump to previous UTF-8 character start in the limit of the 'utf8_string'
  pointer. If no character is found, returns 'utf8_string'.

  The byte at 'utf8_string' itself is never read: it is returned whatever
  its value is.

  NB: This function can return an invalid UTF-8 character. Check with
  fc_utf8_char_validate() to be sure.
****************************************************************************/
char *fc_utf8_find_prev_char(const char *utf8_char, const char *utf8_string)
{
  fc_assert_ret_val(NULL != utf8_char, NULL);

  /* Step back only after having checked the limit, so that no pointer
   * before the beginning of the string is ever computed. */
  while (utf8_char > utf8_string) {
    utf8_char--;
    if (utf8_char > utf8_string && 0 != FC_UTF8_CHAR_SIZE(utf8_char)) {
      return (char *) utf8_char;
    }
  }
  return (char *) utf8_string;
}
234
235
236 /****************************************************************************
237 Returns TRUE if the string 'utf8_string' contains only valid UTF-8
238 characters. If 'end' is not NULL, the end of the valid string will be
239 stored there, even if it returns TRUE.
240
241 See also fc_utf8_validate_len().
242 ****************************************************************************/
fc_utf8_validate(const char * utf8_string,const char ** end)243 bool fc_utf8_validate(const char *utf8_string, const char **end)
244 {
245 char size;
246
247 fc_assert_ret_val(NULL != utf8_string, FALSE);
248
249 while ('\0' != *utf8_string) {
250 size = FC_UTF8_CHAR_SIZE(utf8_string);
251 if (!base_fc_utf8_char_validate(utf8_string, size)) {
252 if (NULL != end) {
253 *end = utf8_string;
254 }
255 return FALSE;
256 }
257 utf8_string += size;
258 }
259 if (NULL != end) {
260 *end = utf8_string;
261 }
262 return TRUE;
263 }
264
265 /****************************************************************************
266 Returns TRUE if the string 'utf8_string' contains only valid UTF-8
267 characters in the limit of the length (in bytes) 'byte_len'. If 'end' is
268 not NULL, the end of the valid string will be stored there, even if it
269 returns TRUE.
270
271 See also fc_utf8_validate().
272 ****************************************************************************/
fc_utf8_validate_len(const char * utf8_string,size_t byte_len,const char ** end)273 bool fc_utf8_validate_len(const char *utf8_string, size_t byte_len,
274 const char **end)
275 {
276 unsigned char size;
277
278 fc_assert_ret_val(NULL != utf8_string, FALSE);
279
280 while ('\0' != *utf8_string) {
281 size = FC_UTF8_CHAR_SIZE(utf8_string);
282
283 if (!base_fc_utf8_char_validate(utf8_string, size)) {
284 if (NULL != end) {
285 *end = utf8_string;
286 }
287 return FALSE;
288 }
289
290 if (size > byte_len) {
291 if (NULL != end) {
292 *end = utf8_string;
293 }
294 return FALSE;
295 } else {
296 byte_len -= size;
297 }
298
299 utf8_string += size;
300 }
301 if (NULL != end) {
302 *end = utf8_string;
303 }
304
305 return TRUE;
306 }
307
/****************************************************************************
  Cuts the string 'utf8_string' in place, by writing a '\0' over the first
  byte of its first invalid UTF-8 character. A string which is already
  valid is left untouched. Returns 'utf8_string'.

  See also fc_utf8_validate(), fc_utf8_validate_trunc_len(),
  and fc_utf8_validate_trunc_dup().
****************************************************************************/
char *fc_utf8_validate_trunc(char *utf8_string)
{
  const char *stop;

  fc_assert_ret_val(NULL != utf8_string, NULL);

  if (!fc_utf8_validate(utf8_string, &stop)) {
    /* 'stop' points inside 'utf8_string', on the invalid character. */
    utf8_string[stop - utf8_string] = '\0';
  }

  return utf8_string;
}
327
/****************************************************************************
  Cuts the string 'utf8_string' in place, by writing a '\0' over the first
  byte of the first character which is not valid UTF-8 or which crosses
  the limit of 'byte_len' bytes. A string accepted by
  fc_utf8_validate_len() is left untouched. Returns 'utf8_string'.

  See also fc_utf8_validate_trunc(), fc_utf8_validate_trunc_dup(),
  and fc_utf8_validate_rep_len().
****************************************************************************/
char *fc_utf8_validate_trunc_len(char *utf8_string, size_t byte_len)
{
  const char *stop;

  fc_assert_ret_val(NULL != utf8_string, NULL);

  if (!fc_utf8_validate_len(utf8_string, byte_len, &stop)) {
    /* 'stop' points inside 'utf8_string', on the rejected character. */
    utf8_string[stop - utf8_string] = '\0';
  }
  return utf8_string;
}
346
/****************************************************************************
  Returns a copy, allocated with fc_malloc(), of the part of 'utf8_string'
  which precedes its first invalid UTF-8 character (the whole string if it
  is valid). 'utf8_string' is not modified.

  See also fc_utf8_validate_trunc(), fc_utf8_validate_trunc_len(),
  and fc_utf8_validate_rep_dup().
****************************************************************************/
char *fc_utf8_validate_trunc_dup(const char *utf8_string)
{
  const char *valid_end;
  size_t valid_len;
  char *copy;

  fc_assert_ret_val(NULL != utf8_string, NULL);

  /* Only the position of the end of the valid part matters here. */
  (void) fc_utf8_validate(utf8_string, &valid_end);
  valid_len = valid_end - utf8_string;

  copy = fc_malloc(valid_len + 1); /* Keep a spot for '\0'. */
  memcpy(copy, utf8_string, valid_len);
  copy[valid_len] = '\0';

  return copy;
}
370
/****************************************************************************
  Transform 'utf8_string', a buffer of 'byte_len' bytes, by replacing all
  its invalid characters with the replacement character. As a replacement
  can be longer than what it replaces, the end of the string is truncated
  when the result doesn't fit in 'byte_len' bytes. Nothing is done when
  'byte_len' is 0. Returns 'utf8_string'.

  See also fc_utf8_validate_len(), fc_utf8_validate_trunc(),
  and fc_utf8_validate_rep_dup().
****************************************************************************/
char *fc_utf8_validate_rep_len(char *utf8_string, size_t byte_len)
{
  fc_assert_ret_val(NULL != utf8_string, NULL);

  if (0 < byte_len) {
    /* The string is rebuilt from a temporary copy. The copy lives on the
     * heap rather than in a variable length array, because 'byte_len' is
     * chosen by the caller and could exhaust the stack.
     * NOTE(review): assumes memory from fc_malloc() can be released with
     * free(), and, like the other users of fc_malloc() in this file, that
     * its result doesn't need to be checked - confirm against mem.h. */
    char *copy = fc_malloc(byte_len);

    fc_strlcpy(copy, utf8_string, byte_len);
    base_fc_utf8_strlcpy_rep(utf8_string, copy, byte_len);
    free(copy);
  }
  return utf8_string;
}
391
392 /****************************************************************************
393 Duplicate 'utf8_string' and replace all invalid characters with the
394 replacement character.
395
396 See also fc_utf8_validate_rep_len(), and fc_utf8_validate_trunc_dup().
397 ****************************************************************************/
fc_utf8_validate_rep_dup(const char * utf8_string)398 char *fc_utf8_validate_rep_dup(const char *utf8_string)
399 {
400 char *ret;
401 const char *utf8_char;
402 size_t size = 1; /* '\0'. */
403 char char_size;
404
405 fc_assert_ret_val(NULL != utf8_string, NULL);
406
407 /* Check needed size. */
408 utf8_char = utf8_string;
409 while ('\0' != *utf8_char) {
410 char_size = FC_UTF8_CHAR_SIZE(utf8_char);
411 if (base_fc_utf8_char_validate(utf8_char, char_size)) {
412 /* Normal valid character. */
413 size += char_size;
414 utf8_char += char_size;
415 } else {
416 /* Replacement character. */
417 size += sizeof(FC_UTF8_REP_CHAR);
418 /* Find next character. */
419 do {
420 utf8_char++;
421 } while (0 == FC_UTF8_CHAR_SIZE(utf8_char));
422 }
423 }
424
425 /* Do the allocation. */
426 ret = fc_malloc(size);
427 base_fc_utf8_strlcpy_rep(ret, utf8_string, size);
428
429 return ret;
430 }
431
/****************************************************************************
  Returns the number of characters in the string 'utf8_string'. To know the
  number of used bytes, use strlen() instead.

  The string is walked character by character with fc_ut8_next_char().

  NB: 'utf8_string' must be UTF-8 valid (see fc_utf8_validate()), or the
  behaviour of this function will be unknown.
****************************************************************************/
size_t fc_utf8_strlen(const char *utf8_string)
{
  size_t count = 0;

  fc_assert_ret_val(NULL != utf8_string, 0);

  while ('\0' != *utf8_string) {
    utf8_string = fc_ut8_next_char(utf8_string);
    count++;
  }
  return count;
}
450
451
/****************************************************************************
  This is a variant of fc_strlcpy() to ensure the result will be a valid
  UTF-8 string. It truncates the string at the first UTF-8 invalid
  character.

  'dest' and 'src' must not be NULL and 'n' must not be 0: these conditions
  are checked with fc_assert_ret_val(), using -1 as failure value. The copy
  itself, and the returned value, are those of
  base_fc_utf8_strlcpy_trunc().

  See also fc_strlcpy(), fc_utf8_strlcpy_rep().
****************************************************************************/
size_t fc_utf8_strlcpy_trunc(char *dest, const char *src, size_t n)
{
  size_t result;

  fc_assert_ret_val(NULL != dest, -1);
  fc_assert_ret_val(NULL != src, -1);
  fc_assert_ret_val(0 < n, -1);

  result = base_fc_utf8_strlcpy_trunc(dest, src, n);
  return result;
}
467
/****************************************************************************
  This is a variant of fc_strlcpy() to ensure the result will be a valid
  UTF-8 string. Unlike fc_utf8_strlcpy_trunc(), it replaces the invalid
  characters by the replacement character, instead of truncating the string.

  'dest' and 'src' must not be NULL and 'n' must not be 0: these conditions
  are checked with fc_assert_ret_val(), using -1 as failure value. The copy
  itself, and the returned value, are those of base_fc_utf8_strlcpy_rep().

  See also fc_strlcpy(), fc_utf8_strlcpy_trunc().
****************************************************************************/
size_t fc_utf8_strlcpy_rep(char *dest, const char *src, size_t n)
{
  size_t result;

  fc_assert_ret_val(NULL != dest, -1);
  fc_assert_ret_val(NULL != src, -1);
  fc_assert_ret_val(0 < n, -1);

  result = base_fc_utf8_strlcpy_rep(dest, src, n);
  return result;
}
483
/****************************************************************************
  This is a variant of fc_strlcat() to ensure the result will be a valid
  UTF-8 string. It truncates the string at the first UTF-8 invalid
  character.

  'src' is appended after the current content of 'dest', a buffer of 'n'
  bytes which must hold a string shorter than 'n'. Returns the length of
  that content, plus the value returned by base_fc_utf8_strlcpy_trunc()
  for the appended part.

  NB: This function doesn't perform anything on the already edited part of
  the string 'dest', which can contain invalid UTF-8 characters.

  See also fc_strlcat(), fc_utf8_strlcat_rep().
****************************************************************************/
size_t fc_utf8_strlcat_trunc(char *dest, const char *src, size_t n)
{
  size_t used, appended;

  fc_assert_ret_val(NULL != dest, -1);
  fc_assert_ret_val(NULL != src, -1);
  fc_assert_ret_val(0 < n, -1);

  used = strlen(dest);
  fc_assert_ret_val(used < n, -1);

  /* Write right after the existing content, in the room which is left. */
  appended = base_fc_utf8_strlcpy_trunc(dest + used, src, n - used);
  return used + appended;
}
506
/****************************************************************************
  This is a variant of fc_strlcat() to ensure the result will be a valid
  UTF-8 string. Unlike fc_utf8_strlcat_trunc(), it replaces the invalid
  characters by the replacement character, instead of truncating the string.

  'src' is appended after the current content of 'dest', a buffer of 'n'
  bytes which must hold a string shorter than 'n'. Returns the length of
  that content, plus the value returned by base_fc_utf8_strlcpy_rep() for
  the appended part.

  NB: This function doesn't perform anything on the already edited part of
  the string 'dest', which can contain invalid UTF-8 characters.

  See also fc_strlcat(), fc_utf8_strlcat_trunc().
****************************************************************************/
size_t fc_utf8_strlcat_rep(char *dest, const char *src, size_t n)
{
  size_t used, appended;

  fc_assert_ret_val(NULL != dest, -1);
  fc_assert_ret_val(NULL != src, -1);
  fc_assert_ret_val(0 < n, -1);

  used = strlen(dest);
  fc_assert_ret_val(used < n, -1);

  /* Write right after the existing content, in the room which is left. */
  appended = base_fc_utf8_strlcpy_rep(dest + used, src, n - used);
  return used + appended;
}
529
/****************************************************************************
  This is a variant of fc_snprintf() to ensure the result will be a valid
  UTF-8 string. It truncates the string at the first UTF-8 invalid
  character.

  The variable arguments are forwarded to fc_utf8_vsnprintf_trunc(), whose
  result is returned.

  See also fc_snprintf(), fc_utf8_snprintf_rep().
****************************************************************************/
int fc_utf8_snprintf_trunc(char *str, size_t n, const char *format, ...)
{
  va_list ap;
  int written;

  va_start(ap, format);
  written = fc_utf8_vsnprintf_trunc(str, n, format, ap);
  va_end(ap);

  return written;
}
547
/****************************************************************************
  This is a variant of fc_snprintf() to ensure the result will be a valid
  UTF-8 string. Unlike fc_utf8_snprintf_trunc(), it replaces the invalid
  characters by the replacement character, instead of truncating the string.

  The variable arguments are forwarded to fc_utf8_vsnprintf_rep(), whose
  result is returned.

  See also fc_snprintf(), fc_utf8_snprintf_trunc().
****************************************************************************/
int fc_utf8_snprintf_rep(char *str, size_t n, const char *format, ...)
{
  va_list ap;
  int written;

  va_start(ap, format);
  written = fc_utf8_vsnprintf_rep(str, n, format, ap);
  va_end(ap);

  return written;
}
565
/****************************************************************************
  This is a variant of fc_vsnprintf() to ensure the result will be a valid
  UTF-8 string. It truncates the string at the first UTF-8 invalid
  character.

  Returns the value returned by fc_vsnprintf() if the formatted string is
  valid UTF-8. Otherwise, the string is cut and the returned value is -1
  if fc_vsnprintf() returned -1, else the length in bytes of the string
  which is kept.

  See also fc_vsnprintf(), fc_utf8_vsnprintf_rep().
****************************************************************************/
int fc_utf8_vsnprintf_trunc(char *str, size_t n, const char *format,
                            va_list args)
{
  const char *stop;
  int ret;

  fc_assert_ret_val(NULL != str, -1);
  fc_assert_ret_val(0 < n, -1);
  fc_assert_ret_val(NULL != format, -1);

  ret = fc_vsnprintf(str, n, format, args);
  if (!fc_utf8_validate(str, &stop)) {
    /* Truncate at last valid UTF-8 character. */
    str[stop - str] = '\0';
    if (-1 != ret) {
      ret = stop - str;
    }
  }
  /* Else the string is already valid UTF-8. */

  return ret;
}
593
/****************************************************************************
  This is a variant of fc_vsnprintf() to ensure the result will be a valid
  UTF-8 string. Unlike fc_utf8_vsnprintf_trunc(), it replaces the invalid
  characters by the replacement character, instead of truncating the string.

  Returns the value returned by fc_vsnprintf() if the formatted string is
  valid UTF-8. Otherwise, the returned value is -1 if fc_vsnprintf()
  returned -1, else the length in bytes of the repaired string.

  See also fc_vsnprintf(), fc_utf8_vsnprintf_trunc().
****************************************************************************/
int fc_utf8_vsnprintf_rep(char *str, size_t n, const char *format,
                          va_list args)
{
  const char *stop;
  int ret;

  fc_assert_ret_val(NULL != str, -1);
  fc_assert_ret_val(0 < n, -1);
  fc_assert_ret_val(NULL != format, -1);

  ret = fc_vsnprintf(str, n, format, args);
  if (!fc_utf8_validate(str, &stop)) {
    /* Everything before 'stop' is valid: only repair the rest, within
     * the room remaining in the buffer. */
    size_t valid_len = stop - str;

    (void) fc_utf8_validate_rep_len(str + valid_len, n - valid_len);
    if (-1 != ret) {
      ret = (int) strlen(str);
    }
  }
  /* Else the string is already valid UTF-8. */

  return ret;
}
620
/****************************************************************************
  This is a variant of cat_snprintf() to ensure the result will be a valid
  UTF-8 string. It truncates the string at the first UTF-8 invalid
  character.

  The formatted text is appended after the current content of 'str', a
  buffer of 'n' bytes which must hold a string shorter than 'n'. Returns -1
  if fc_utf8_vsnprintf_trunc() returned -1, else its result plus the
  length of the content 'str' had before the call.

  NB: This function doesn't perform anything on the already edited part of
  the string 'str', which can contain invalid UTF-8 characters.

  See also cat_snprintf(), cat_utf8_snprintf_rep().
****************************************************************************/
int cat_utf8_snprintf_trunc(char *str, size_t n, const char *format, ...)
{
  va_list ap;
  size_t used;
  int added;

  fc_assert_ret_val(NULL != format, -1);
  fc_assert_ret_val(NULL != str, -1);
  fc_assert_ret_val(0 < n, -1);

  used = strlen(str);
  fc_assert_ret_val(used < n, -1);

  /* Format right after the existing content, in the room which is left. */
  va_start(ap, format);
  added = fc_utf8_vsnprintf_trunc(str + used, n - used, format, ap);
  va_end(ap);

  if (-1 == added) {
    return -1;
  }
  return (int) (added + used);
}
650
/****************************************************************************
  This is a variant of cat_snprintf() to ensure the result will be a valid
  UTF-8 string. Unlike cat_utf8_snprintf_trunc(), it replaces the invalid
  characters by the replacement character, instead of truncating the string.

  The formatted text is appended after the current content of 'str', a
  buffer of 'n' bytes which must hold a string shorter than 'n'. Returns -1
  if fc_utf8_vsnprintf_rep() returned -1, else its result plus the length
  of the content 'str' had before the call.

  NB: This function doesn't perform anything on the already edited part of
  the string 'str', which can contain invalid UTF-8 characters.

  See also cat_snprintf(), cat_utf8_snprintf_trunc().
****************************************************************************/
int cat_utf8_snprintf_rep(char *str, size_t n, const char *format, ...)
{
  va_list ap;
  size_t used;
  int added;

  fc_assert_ret_val(NULL != format, -1);
  fc_assert_ret_val(NULL != str, -1);
  fc_assert_ret_val(0 < n, -1);

  used = strlen(str);
  fc_assert_ret_val(used < n, -1);

  /* Format right after the existing content, in the room which is left. */
  va_start(ap, format);
  added = fc_utf8_vsnprintf_rep(str + used, n - used, format, ap);
  va_end(ap);

  if (-1 == added) {
    return -1;
  }
  return (int) (added + used);
}
680