1 /**********************************************************************
2  Freeciv - Copyright (C) 1996 - A Kjeldberg, L Gregersen, P Unold
3    This program is free software; you can redistribute it and/or modify
4    it under the terms of the GNU General Public License as published by
5    the Free Software Foundation; either version 2, or (at your option)
6    any later version.
7 
8    This program is distributed in the hope that it will be useful,
9    but WITHOUT ANY WARRANTY; without even the implied warranty of
10    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11    GNU General Public License for more details.
12 ***********************************************************************/
13 
14 #ifdef HAVE_CONFIG_H
15 #include <fc_config.h>
16 #endif
17 
18 #include <stdarg.h>
19 #include <string.h>
20 
21 /* utility */
22 #include "log.h"
23 #include "mem.h"
24 #include "support.h"
25 
26 #include "fc_utf8.h"
27 
28 
/* Number of bytes to step over for each possible first byte, for external
 * use. Every entry is at least 1, so walking a string with this table always
 * makes progress, even on bytes that cannot start a character.
 * See also fc_ut8_next_char(). */
const char fc_utf8_skip[256] = {
  /* 0x00 - 0x7F: single byte characters. */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00 - 0x0F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x10 - 0x1F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x20 - 0x2F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x30 - 0x3F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40 - 0x4F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x50 - 0x5F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x60 - 0x6F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x70 - 0x7F */

  /* 0x80 - 0xBF: bytes of the form 10xxxxxx, stepped over one at a time. */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x80 - 0x8F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x90 - 0x9F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xA0 - 0xAF */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xB0 - 0xBF */

  /* 0xC0 - 0xDF: first byte of a 2 byte sequence. */
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xC0 - 0xCF */
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xD0 - 0xDF */

  /* 0xE0 - 0xEF: first byte of a 3 byte sequence. */
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 0xE0 - 0xEF */

  /* 0xF0 - 0xFF: 4 byte sequences, plus 5 and 6 byte ones when
   * USE_6_BYTES_CHAR is defined; everything else is stepped over as 1. */
#ifdef USE_6_BYTES_CHAR
  4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1  /* 0xF0 - 0xFF */
#else
  4, 4, 4, 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1  /* 0xF0 - 0xFF */
#endif /* USE_6_BYTES_CHAR */
};
53 
/* Expected byte length of a character, indexed by its first byte, for
 * internal use. Unlike fc_utf8_skip[], a value of 0 marks a byte that cannot
 * start a character (bytes of the form 10xxxxxx, and the lead bytes that are
 * not supported by the current build). */
static const char fc_utf8_char_size[256] = {
  /* 0x00 - 0x7F: single byte characters. */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00 - 0x0F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x10 - 0x1F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x20 - 0x2F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x30 - 0x3F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40 - 0x4F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x50 - 0x5F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x60 - 0x6F */
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x70 - 0x7F */

  /* 0x80 - 0xBF: never the start of a character. */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80 - 0x8F */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x90 - 0x9F */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xA0 - 0xAF */
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xB0 - 0xBF */

  /* 0xC0 - 0xDF: first byte of a 2 byte sequence. */
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xC0 - 0xCF */
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0xD0 - 0xDF */

  /* 0xE0 - 0xEF: first byte of a 3 byte sequence. */
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* 0xE0 - 0xEF */

  /* 0xF0 - 0xFF: 4 byte sequences, plus 5 and 6 byte ones when
   * USE_6_BYTES_CHAR is defined; everything else is invalid. */
#ifdef USE_6_BYTES_CHAR
  4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0  /* 0xF0 - 0xFF */
#else
  4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0  /* 0xF0 - 0xFF */
#endif /* USE_6_BYTES_CHAR */
};

/* Looks up fc_utf8_char_size[] with the byte 'utf8_char' points to. The
 * argument and the whole expansion are parenthesized, so that an expression
 * argument such as 'p + 1' reads the byte at that address instead of adding
 * 1 to the byte at 'p'. The cast keeps the const qualifier of the argument. */
#define FC_UTF8_CHAR_SIZE(utf8_char) \
  (fc_utf8_char_size[*(const unsigned char *) (utf8_char)])

/* U+FFFD, the replacement character. NB: sizeof(FC_UTF8_REP_CHAR) is 4,
 * because it counts the terminating '\0' of the literal. */
#define FC_UTF8_REP_CHAR "\xef\xbf\xbd"
83 
84 
85 /****************************************************************************
86   Returns TRUE if the character beginning at the pointer 'utf8_char' of size
87   'size' is a valid UTF-8 character.
88 ****************************************************************************/
base_fc_utf8_char_validate(const char * utf8_char,char size)89 static inline bool base_fc_utf8_char_validate(const char *utf8_char,
90                                               char size)
91 {
92   if (1 < size) {
93     do {
94       utf8_char++;
95       if (0x80 != (0xC0 & *(unsigned char *) utf8_char)) {
96         /* Not a valid byte of the sequence. */
97         return FALSE;
98       }
99       size--;
100     } while (1 < size);
101     return TRUE;
102   } else {
103     return (1 == size);
104   }
105 }
106 
/****************************************************************************
  UTF-8-safe variant of fc_strlcpy() base function, truncating flavour.

  Copies the leading part of 'src' made of valid UTF-8 characters into
  'dest', a buffer of 'n' bytes. At most 'n' - 1 bytes are copied, a
  character is never cut in the middle, and 'dest' is always
  '\0'-terminated unless 'n' is 0 (then 'dest' is left untouched).

  Returns strlen(src), whatever amount has been copied.
****************************************************************************/
static inline size_t base_fc_utf8_strlcpy_trunc(char *dest, const char *src,
                                                size_t n)
{
  const char *end;
  size_t len;

  if (0 == n) {
    /* Not even room for the terminator. */
    return strlen(src);
  }

  /* Keep one byte of 'dest' for the terminator: fc_utf8_validate_len()
   * accepts up to the given number of bytes, so passing 'n' itself would
   * allow 'len == n' and a write at 'dest[n]'. */
  (void) fc_utf8_validate_len(src, n - 1, &end);
  len = end - src;
  fc_assert(len < n);
  if (0 < len) {
    memcpy(dest, src, len);
  }
  dest[len] = '\0';
  return strlen(src);
}
125 
126 /****************************************************************************
127   UTF-8-safe variant of fc_strlcpy() base function.
128 ****************************************************************************/
base_fc_utf8_strlcpy_rep(char * dest,const char * src,size_t n)129 static inline size_t base_fc_utf8_strlcpy_rep(char *dest, const char *src,
130                                               size_t n)
131 {
132   const char *end;
133   size_t src_len, len;
134 
135   fc_assert_ret_val(NULL != src, 0);
136 
137   src_len = strlen(src);
138   while (TRUE) {
139     if (fc_utf8_validate_len(src, n, &end)) {
140       /* Valid UTF-8. */
141       len = end - src;
142 
143       fc_assert(len < n);
144 
145       if (0 < len) {
146         memcpy(dest, src, len);
147       }
148       dest[len] = '\0'; /* Valid UTF-8 string part. */
149       return src_len;
150     } else {
151       /* '*end' is not a valid UTF-8 character. */
152       len = end - src;
153 
154       fc_assert(len < n);
155 
156       if (0 < len) {
157         memcpy(dest, src, len);
158       }
159 
160       n -= len;
161       dest += len;
162 
163       /* Try to insert the replacement character. */
164       len = sizeof(FC_UTF8_REP_CHAR);
165       if (n > len) {
166         memcpy(dest, FC_UTF8_REP_CHAR, len);
167         n -= len;
168         dest += len;
169       }
170 
171       if (1 == n) {
172         *dest = '\0';
173         return src_len; /* End of 'dest' reached. */
174       }
175 
176       /* Jump to next character in src. */
177       src = fc_utf8_find_next_char(end);
178       if (src == NULL || *src == '\0') {
179         *dest = '\0';
180         return src_len; /* End of 'src' reached. */
181       }
182     }
183   }
184   fc_assert(FALSE);     /* Shouldn't occur! */
185   return src_len;
186 }
187 
188 
189 /****************************************************************************
190   Returns TRUE if the character beginning at the pointer 'utf8_char' is
191   a valid UTF-8 character.
192 ****************************************************************************/
fc_utf8_char_validate(const char * utf8_char)193 bool fc_utf8_char_validate(const char *utf8_char)
194 {
195   fc_assert_ret_val(NULL != utf8_char, FALSE);
196 
197   return base_fc_utf8_char_validate(utf8_char, FC_UTF8_CHAR_SIZE(utf8_char));
198 }
199 
/****************************************************************************
  Jump to the start of the next UTF-8 character: moves at least one byte
  forward, then steps over every byte which cannot start a character.

  NB: The pointer is advanced before anything is tested, so 'utf8_char'
  shouldn't point to the terminating '\0' of the string.

  NB: This function can return an invalid UTF-8 character. Check with
  fc_utf8_char_validate() to be sure.
****************************************************************************/
char *fc_utf8_find_next_char(const char *utf8_char)
{
  const char *next;

  fc_assert_ret_val(NULL != utf8_char, NULL);

  next = utf8_char + 1;
  while (0 == FC_UTF8_CHAR_SIZE(next)) {
    next++;
  }

  return (char *) next;
}
215 
/****************************************************************************
  Jump to the start of the previous UTF-8 character, without going before
  the 'utf8_string' pointer. If no character start is found after
  'utf8_string', or if 'utf8_char' isn't after 'utf8_string', returns
  'utf8_string'.

  NB: This function can return an invalid UTF-8 character. Check with
  fc_utf8_char_validate() to be sure.
****************************************************************************/
char *fc_utf8_find_prev_char(const char *utf8_char, const char *utf8_string)
{
  fc_assert_ret_val(NULL != utf8_char, NULL);

  /* Compare before stepping back, so that no pointer before 'utf8_string'
   * is ever formed (e.g. when called with 'utf8_char == utf8_string'). */
  while (utf8_char > utf8_string) {
    utf8_char--;
    if (utf8_char == utf8_string
        || 0 != FC_UTF8_CHAR_SIZE(utf8_char)) {
      /* Either the limit, returned without looking at its byte, or the
       * start of a character. */
      return (char *) utf8_char;
    }
  }

  return (char *) utf8_string;
}
234 
235 
236 /****************************************************************************
237   Returns TRUE if the string 'utf8_string' contains only valid UTF-8
238   characters. If 'end' is not NULL, the end of the valid string will be
239   stored there, even if it returns TRUE.
240 
241   See also fc_utf8_validate_len().
242 ****************************************************************************/
fc_utf8_validate(const char * utf8_string,const char ** end)243 bool fc_utf8_validate(const char *utf8_string, const char **end)
244 {
245   char size;
246 
247   fc_assert_ret_val(NULL != utf8_string, FALSE);
248 
249   while ('\0' != *utf8_string) {
250     size = FC_UTF8_CHAR_SIZE(utf8_string);
251     if (!base_fc_utf8_char_validate(utf8_string, size)) {
252       if (NULL != end) {
253         *end = utf8_string;
254       }
255       return FALSE;
256     }
257     utf8_string += size;
258   }
259   if (NULL != end) {
260     *end = utf8_string;
261   }
262   return TRUE;
263 }
264 
265 /****************************************************************************
266   Returns TRUE if the string 'utf8_string' contains only valid UTF-8
267   characters in the limit of the length (in bytes) 'byte_len'. If 'end' is
268   not NULL, the end of the valid string will be stored there, even if it
269   returns TRUE.
270 
271   See also fc_utf8_validate().
272 ****************************************************************************/
fc_utf8_validate_len(const char * utf8_string,size_t byte_len,const char ** end)273 bool fc_utf8_validate_len(const char *utf8_string, size_t byte_len,
274                           const char **end)
275 {
276   unsigned char size;
277 
278   fc_assert_ret_val(NULL != utf8_string, FALSE);
279 
280   while ('\0' != *utf8_string) {
281     size = FC_UTF8_CHAR_SIZE(utf8_string);
282 
283     if (!base_fc_utf8_char_validate(utf8_string, size)) {
284       if (NULL != end) {
285         *end = utf8_string;
286       }
287       return FALSE;
288     }
289 
290     if (size > byte_len) {
291       if (NULL != end) {
292         *end = utf8_string;
293       }
294       return FALSE;
295     } else {
296       byte_len -= size;
297     }
298 
299     utf8_string += size;
300   }
301   if (NULL != end) {
302     *end = utf8_string;
303   }
304 
305   return TRUE;
306 }
307 
/****************************************************************************
  Cut the string 'utf8_string' in place, right before its first invalid
  UTF-8 character. A fully valid string is left untouched.
  Returns 'utf8_string', or NULL if 'utf8_string' is NULL.

  See also fc_utf8_validate(), fc_utf8_validate_trunc_len(),
  and fc_utf8_validate_trunc_dup().
****************************************************************************/
char *fc_utf8_validate_trunc(char *utf8_string)
{
  const char *stop;

  fc_assert_ret_val(NULL != utf8_string, NULL);

  if (!fc_utf8_validate(utf8_string, &stop)) {
    /* 'stop' points to the first invalid character. */
    utf8_string[stop - utf8_string] = '\0';
  }

  return utf8_string;
}
327 
/****************************************************************************
  Cut the string 'utf8_string' in place, right before the first character
  which is not valid UTF-8 or which doesn't fit in the limit of 'byte_len'
  bytes. A string passing fc_utf8_validate_len() is left untouched.
  Returns 'utf8_string', or NULL if 'utf8_string' is NULL.

  See also fc_utf8_validate_trunc(), fc_utf8_validate_trunc_dup(),
  and fc_utf8_validate_rep_len().
****************************************************************************/
char *fc_utf8_validate_trunc_len(char *utf8_string, size_t byte_len)
{
  const char *stop;

  fc_assert_ret_val(NULL != utf8_string, NULL);

  if (!fc_utf8_validate_len(utf8_string, byte_len, &stop)) {
    /* 'stop' points to the first rejected character. */
    utf8_string[stop - utf8_string] = '\0';
  }

  return utf8_string;
}
346 
/****************************************************************************
  Returns a newly allocated (with fc_malloc()) copy of the leading part of
  'utf8_string' which is valid UTF-8, i.e. the string truncated at its
  first invalid character. Returns NULL if 'utf8_string' is NULL.

  See also fc_utf8_validate_trunc(), fc_utf8_validate_trunc_len(),
  and fc_utf8_validate_rep_dup().
****************************************************************************/
char *fc_utf8_validate_trunc_dup(const char *utf8_string)
{
  const char *stop;
  size_t valid_len;
  char *copy;

  fc_assert_ret_val(NULL != utf8_string, NULL);

  /* Only the position of the first invalid character matters here. */
  (void) fc_utf8_validate(utf8_string, &stop);
  valid_len = stop - utf8_string;

  copy = fc_malloc(valid_len + 1);      /* One more byte for '\0'. */
  memcpy(copy, utf8_string, valid_len);
  copy[valid_len] = '\0';

  return copy;
}
370 
/****************************************************************************
  Rewrite 'utf8_string' in place, a buffer of 'byte_len' bytes, replacing
  its invalid characters with the replacement character. The content is
  first saved with fc_strlcpy() into a temporary buffer of 'byte_len' bytes,
  then copied back with the replacing copy function. Nothing is done when
  'byte_len' is 0. Returns 'utf8_string', or NULL if 'utf8_string' is NULL.

  NOTE(review): the temporary buffer is a variable length array of
  'byte_len' bytes; callers presumably only pass moderate sizes.

  See also fc_utf8_validate_len(), fc_utf8_validate_trunc(),
  and fc_utf8_validate_rep_dup().
****************************************************************************/
char *fc_utf8_validate_rep_len(char *utf8_string, size_t byte_len)
{
  fc_assert_ret_val(NULL != utf8_string, NULL);

  if (0 == byte_len) {
    return utf8_string;
  }

  {
    char scratch[byte_len];

    fc_strlcpy(scratch, utf8_string, byte_len);
    base_fc_utf8_strlcpy_rep(utf8_string, scratch, byte_len);
  }

  return utf8_string;
}
391 
392 /****************************************************************************
393   Duplicate 'utf8_string' and replace all invalid characters with the
394   replacement character.
395 
396   See also fc_utf8_validate_rep_len(), and fc_utf8_validate_trunc_dup().
397 ****************************************************************************/
fc_utf8_validate_rep_dup(const char * utf8_string)398 char *fc_utf8_validate_rep_dup(const char *utf8_string)
399 {
400   char *ret;
401   const char *utf8_char;
402   size_t size = 1;      /* '\0'. */
403   char char_size;
404 
405   fc_assert_ret_val(NULL != utf8_string, NULL);
406 
407   /* Check needed size. */
408   utf8_char = utf8_string;
409   while ('\0' != *utf8_char) {
410     char_size = FC_UTF8_CHAR_SIZE(utf8_char);
411     if (base_fc_utf8_char_validate(utf8_char, char_size)) {
412       /* Normal valid character. */
413       size += char_size;
414       utf8_char += char_size;
415     } else {
416       /* Replacement character. */
417       size += sizeof(FC_UTF8_REP_CHAR);
418       /* Find next character. */
419       do {
420         utf8_char++;
421       } while (0 == FC_UTF8_CHAR_SIZE(utf8_char));
422     }
423   }
424 
425   /* Do the allocation. */
426   ret = fc_malloc(size);
427   base_fc_utf8_strlcpy_rep(ret, utf8_string, size);
428 
429   return ret;
430 }
431 
/****************************************************************************
  Returns the number of characters of the string 'utf8_string', counted by
  jumping from character to character with fc_ut8_next_char(). To get the
  number of used bytes, use strlen() instead. Returns 0 if 'utf8_string'
  is NULL.

  NB: 'utf8_string' must be UTF-8 valid (see fc_utf8_validate()), or the
  behaviour of this function will be unknown.
****************************************************************************/
size_t fc_utf8_strlen(const char *utf8_string)
{
  size_t count = 0;

  fc_assert_ret_val(NULL != utf8_string, 0);

  while ('\0' != *utf8_string) {
    utf8_string = fc_ut8_next_char(utf8_string);
    count++;
  }

  return count;
}
450 
451 
/****************************************************************************
  Variant of fc_strlcpy() ensuring that the result is a valid UTF-8
  string: the copy of 'src' into 'dest' (a buffer of 'n' bytes) stops at
  the first invalid UTF-8 character.

  Returns the value of the base copy function (the length of 'src'), or
  -1 converted to size_t if 'dest' or 'src' is NULL or if 'n' is 0.

  See also fc_strlcpy(), fc_utf8_strlcpy_rep().
****************************************************************************/
size_t fc_utf8_strlcpy_trunc(char *dest, const char *src, size_t n)
{
  size_t src_len;

  fc_assert_ret_val(NULL != dest, -1);
  fc_assert_ret_val(NULL != src, -1);
  fc_assert_ret_val(0 < n, -1);

  src_len = base_fc_utf8_strlcpy_trunc(dest, src, n);

  return src_len;
}
467 
/****************************************************************************
  Variant of fc_strlcpy() ensuring that the result is a valid UTF-8
  string. Unlike fc_utf8_strlcpy_trunc(), the invalid characters of 'src'
  are replaced by the replacement character, instead of ending the copy.

  Returns the value of the base copy function (the length of 'src'), or
  -1 converted to size_t if 'dest' or 'src' is NULL or if 'n' is 0.

  See also fc_strlcpy(), fc_utf8_strlcpy_trunc().
****************************************************************************/
size_t fc_utf8_strlcpy_rep(char *dest, const char *src, size_t n)
{
  size_t src_len;

  fc_assert_ret_val(NULL != dest, -1);
  fc_assert_ret_val(NULL != src, -1);
  fc_assert_ret_val(0 < n, -1);

  src_len = base_fc_utf8_strlcpy_rep(dest, src, n);

  return src_len;
}
483 
/****************************************************************************
  Variant of fc_strlcat() ensuring that the appended part is a valid UTF-8
  string: 'src' is appended to 'dest' (a buffer of 'n' bytes) up to its
  first invalid UTF-8 character.

  Returns the initial length of 'dest' plus the value of the base copy
  function, or -1 converted to size_t if an argument is NULL, if 'n' is 0
  or if 'dest' already holds 'n' bytes or more.

  NB: This function doesn't perform anything on the part of 'dest' which
  is already there, which can contain invalid UTF-8 characters.

  See also fc_strlcat(), fc_utf8_strlcat_rep().
****************************************************************************/
size_t fc_utf8_strlcat_trunc(char *dest, const char *src, size_t n)
{
  size_t len;
  size_t appended;

  fc_assert_ret_val(NULL != dest, -1);
  fc_assert_ret_val(NULL != src, -1);
  fc_assert_ret_val(0 < n, -1);

  len = strlen(dest);
  fc_assert_ret_val(len < n, -1);

  /* Write after the current content, in the room which is left. */
  appended = base_fc_utf8_strlcpy_trunc(dest + len, src, n - len);

  return len + appended;
}
506 
/****************************************************************************
  Variant of fc_strlcat() ensuring that the appended part is a valid UTF-8
  string. Unlike fc_utf8_strlcat_trunc(), the invalid characters of 'src'
  are replaced by the replacement character, instead of ending the copy.

  Returns the initial length of 'dest' plus the value of the base copy
  function, or -1 converted to size_t if an argument is NULL, if 'n' is 0
  or if 'dest' already holds 'n' bytes or more.

  NB: This function doesn't perform anything on the part of 'dest' which
  is already there, which can contain invalid UTF-8 characters.

  See also fc_strlcat(), fc_utf8_strlcat_trunc().
****************************************************************************/
size_t fc_utf8_strlcat_rep(char *dest, const char *src, size_t n)
{
  size_t len;
  size_t appended;

  fc_assert_ret_val(NULL != dest, -1);
  fc_assert_ret_val(NULL != src, -1);
  fc_assert_ret_val(0 < n, -1);

  len = strlen(dest);
  fc_assert_ret_val(len < n, -1);

  /* Write after the current content, in the room which is left. */
  appended = base_fc_utf8_strlcpy_rep(dest + len, src, n - len);

  return len + appended;
}
529 
/****************************************************************************
  Variant of fc_snprintf() ensuring that the result is a valid UTF-8
  string, by truncating it at the first invalid UTF-8 character. All the
  work is forwarded to fc_utf8_vsnprintf_trunc(), whose value is returned.

  See also fc_snprintf(), fc_utf8_snprintf_rep().
****************************************************************************/
int fc_utf8_snprintf_trunc(char *str, size_t n, const char *format, ...)
{
  va_list ap;
  int written;

  va_start(ap, format);
  written = fc_utf8_vsnprintf_trunc(str, n, format, ap);
  va_end(ap);

  return written;
}
547 
/****************************************************************************
  Variant of fc_snprintf() ensuring that the result is a valid UTF-8
  string. Unlike fc_utf8_snprintf_trunc(), the invalid characters are
  replaced by the replacement character, instead of truncating the string.
  All the work is forwarded to fc_utf8_vsnprintf_rep(), whose value is
  returned.

  See also fc_snprintf(), fc_utf8_snprintf_trunc().
****************************************************************************/
int fc_utf8_snprintf_rep(char *str, size_t n, const char *format, ...)
{
  va_list ap;
  int written;

  va_start(ap, format);
  written = fc_utf8_vsnprintf_rep(str, n, format, ap);
  va_end(ap);

  return written;
}
565 
/****************************************************************************
  Variant of fc_vsnprintf() ensuring that the result is a valid UTF-8
  string, by truncating it at the first invalid UTF-8 character.

  Returns the value of fc_vsnprintf() when its output is already valid.
  Otherwise, returns -1 if fc_vsnprintf() returned -1, else the length in
  bytes of the truncated string. Returns -1 if 'str' or 'format' is NULL
  or if 'n' is 0.

  See also fc_vsnprintf(), fc_utf8_vsnprintf_rep().
****************************************************************************/
int fc_utf8_vsnprintf_trunc(char *str, size_t n, const char *format,
                            va_list args)
{
  const char *stop;
  int ret;

  fc_assert_ret_val(NULL != str, -1);
  fc_assert_ret_val(0 < n, -1);
  fc_assert_ret_val(NULL != format, -1);

  ret = fc_vsnprintf(str, n, format, args);

  if (fc_utf8_validate(str, &stop)) {
    /* Already valid UTF-8. */
    return ret;
  }

  /* Cut right after the last valid UTF-8 character. */
  str[stop - str] = '\0';
  if (-1 == ret) {
    return -1;
  }
  return stop - str;
}
593 
/****************************************************************************
  Variant of fc_vsnprintf() ensuring that the result is a valid UTF-8
  string. Unlike fc_utf8_vsnprintf_trunc(), the invalid characters are
  replaced by the replacement character, instead of truncating the string.

  Returns the value of fc_vsnprintf() when its output is already valid.
  Otherwise, returns -1 if fc_vsnprintf() returned -1, else the length in
  bytes of the fixed string. Returns -1 if 'str' or 'format' is NULL or if
  'n' is 0.

  See also fc_vsnprintf(), fc_utf8_vsnprintf_trunc().
****************************************************************************/
int fc_utf8_vsnprintf_rep(char *str, size_t n, const char *format,
                          va_list args)
{
  const char *stop;
  int ret;

  fc_assert_ret_val(NULL != str, -1);
  fc_assert_ret_val(0 < n, -1);
  fc_assert_ret_val(NULL != format, -1);

  ret = fc_vsnprintf(str, n, format, args);

  if (!fc_utf8_validate(str, &stop)) {
    /* Everything before 'stop' is valid: only fix the remaining part, in
     * the room which is left for it. */
    size_t valid_len = stop - str;

    (void) fc_utf8_validate_rep_len(str + valid_len, n - valid_len);
    if (-1 != ret) {
      ret = (int)strlen(str);
    }
  }

  return ret;
}
620 
/****************************************************************************
  Variant of cat_snprintf() ensuring that the appended part is a valid
  UTF-8 string, by truncating it at the first invalid UTF-8 character. The
  formatted text is written after the current content of 'str', a buffer
  of 'n' bytes.

  Returns -1 if fc_utf8_vsnprintf_trunc() returned -1, else its value plus
  the initial length of 'str'. Returns -1 if 'format' or 'str' is NULL, if
  'n' is 0 or if 'str' already holds 'n' bytes or more.

  NB: This function doesn't perform anything on the part of 'str' which is
  already there, which can contain invalid UTF-8 characters.

  See also cat_snprintf(), cat_utf8_snprintf_rep().
****************************************************************************/
int cat_utf8_snprintf_trunc(char *str, size_t n, const char *format, ...)
{
  va_list args;
  size_t len;
  int ret;

  fc_assert_ret_val(NULL != format, -1);
  fc_assert_ret_val(NULL != str, -1);
  fc_assert_ret_val(0 < n, -1);

  len = strlen(str);
  fc_assert_ret_val(len < n, -1);

  va_start(args, format);
  ret = fc_utf8_vsnprintf_trunc(str + len, n - len, format, args);
  va_end(args);

  if (-1 == ret) {
    return -1;
  }
  return (int)(ret + len);
}
650 
/****************************************************************************
  Variant of cat_snprintf() ensuring that the appended part is a valid
  UTF-8 string. Unlike cat_utf8_snprintf_trunc(), the invalid characters
  are replaced by the replacement character, instead of truncating the
  string. The formatted text is written after the current content of
  'str', a buffer of 'n' bytes.

  Returns -1 if fc_utf8_vsnprintf_rep() returned -1, else its value plus
  the initial length of 'str'. Returns -1 if 'format' or 'str' is NULL, if
  'n' is 0 or if 'str' already holds 'n' bytes or more.

  NB: This function doesn't perform anything on the part of 'str' which is
  already there, which can contain invalid UTF-8 characters.

  See also cat_snprintf(), cat_utf8_snprintf_trunc().
****************************************************************************/
int cat_utf8_snprintf_rep(char *str, size_t n, const char *format, ...)
{
  va_list args;
  size_t len;
  int ret;

  fc_assert_ret_val(NULL != format, -1);
  fc_assert_ret_val(NULL != str, -1);
  fc_assert_ret_val(0 < n, -1);

  len = strlen(str);
  fc_assert_ret_val(len < n, -1);

  va_start(args, format);
  ret = fc_utf8_vsnprintf_rep(str + len, n - len, format, args);
  va_end(args);

  if (-1 == ret) {
    return -1;
  }
  return (int)(ret + len);
}
680