1 /**
2 * @file
3 */
4
5 /*
6 All original material Copyright (C) 2002-2013 UFO: Alien Invasion.
7
8 Copyright (C) 1997-2001 Id Software, Inc.
9
10 This program is free software; you can redistribute it and/or
11 modify it under the terms of the GNU General Public License
12 as published by the Free Software Foundation; either version 2
13 of the License, or (at your option) any later version.
14
15 This program is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
18
19 See the GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License
22 along with this program; if not, write to the Free Software
23 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
24 */
25
26 #include "utf8.h"
27 #include <string.h>
28
/**
 * @brief Remove one complete (possibly multibyte) UTF-8 character from a string.
 * @param[in,out] s Start of the zero-terminated string; edited in place
 * @param[in] pos Index of the character to remove, counted in UTF-8 characters
 * (not in bytes) from the start of the string
 * @return Number of bytes removed; 0 if @c pos refers to the string terminator
 * @sa UTF8_char_offset_to_byte_offset
 */
int UTF8_delete_char_at (char* s, int pos)
{
	/* Translate the character index into a byte index first. */
	const int byteOffset = UTF8_char_offset_to_byte_offset(s, pos);
	int first = byteOffset;
	int end = byteOffset;

	/* Walk backwards until we stand on the first byte of the sequence. */
	for (; first > 0 && UTF8_CONTINUATION_BYTE(s[first]); first--) {
	}

	/* Walk forwards over the current byte and every continuation byte that
	 * follows it. Nothing is skipped when we already are at the terminator. */
	if (s[end] != '\0') {
		end++;
		while (s[end] != '\0' && UTF8_CONTINUATION_BYTE(s[end]))
			end++;
	}

	/* Close the gap. Source and destination overlap, so memmove is required.
	 * The "+ 1" carries the terminator along. */
	memmove(s + first, s + end, strlen(s + end) + 1);

	return end - first;
}
54
/**
 * @brief Encode a Unicode code point as UTF-8 and insert it into a string.
 * @param[in,out] s Start of the zero-terminated string; edited in place
 * @param[in] n Size of the buffer behind @c s in bytes
 * @param[in] pos Insert position, counted in UTF-8 characters (not in bytes)
 * @param[in] c Unicode code point as 32-bit integer
 * @return Number of bytes inserted, or 0 if @c UTF8_encoded_len reports the
 * code point as not encodable or the result would not fit into @c n bytes
 * @sa UTF8_encoded_len
 * @sa UTF8_char_offset_to_byte_offset
 */
int UTF8_insert_char_at (char* s, int n, int pos, int c)
{
	/* Translate the character index into a byte index first. */
	const int at = UTF8_char_offset_to_byte_offset(s, pos);
	const int encLen = UTF8_encoded_len(c);
	/* Bytes from the insert position to the end, terminator included. */
	const int restLen = strlen(&s[at]) + 1;
	char* const dst = s + at;

	/* Refuse codes that cannot be encoded and inserts that overflow the buffer. */
	if (encLen == 0 || at + restLen + encLen > n)
		return 0;

	/* Open a gap of encLen bytes; the terminator moves with the tail. */
	memmove(dst + encLen, dst, restLen);

	/* Fill the gap. The encoded length selects the lead byte pattern, each
	 * following byte carries six payload bits. */
	switch (encLen) {
	case 1:
		dst[0] = c;
		break;
	case 2:	/* up to 11 bits: 5 + 6 */
		dst[0] = 0xc0 | (c >> 6);
		dst[1] = 0x80 | (c & 0x3f);
		break;
	case 3:	/* up to 16 bits: 4 + 6 + 6 */
		dst[0] = 0xe0 | (c >> 12);
		dst[1] = 0x80 | ((c >> 6) & 0x3f);
		dst[2] = 0x80 | (c & 0x3f);
		break;
	case 4:	/* up to 21 bits: 3 + 6 + 6 + 6 */
		dst[0] = 0xf0 | (c >> 18);
		dst[1] = 0x80 | ((c >> 12) & 0x3f);
		dst[2] = 0x80 | ((c >> 6) & 0x3f);
		dst[3] = 0x80 | (c & 0x3f);
		break;
	default:
		break;
	}

	return encLen;
}
98
/**
 * @brief Length in bytes of the UTF-8 sequence introduced by the given byte.
 * @param[in] c First byte of the sequence
 * @return 1 to 4 for a lead byte, or 0 if the byte cannot start a sequence
 * (a continuation byte, or a lead byte of the obsolete 5 and 6 byte forms)
 * @todo This does not solve the truncation problem for decomposed characters.
 * For example a code for "a" followed by a code for "put dots above the
 * previous character": the "a" is reported as a character of length 1 even
 * though the code that follows is part of its visual appearance and should
 * not be cut off separately. Fortunately decomposed characters are rarely used.
 */
int UTF8_char_len (unsigned char c)
{
	/* 0xxxxxxx: plain ASCII */
	if ((c & 0x80) == 0x00)
		return 1;
	/* 110xxxxx: lead byte of a two byte sequence */
	if ((c & 0xe0) == 0xc0)
		return 2;
	/* 1110xxxx: lead byte of a three byte sequence */
	if ((c & 0xf0) == 0xe0)
		return 3;
	/* 11110xxx: lead byte of a four byte sequence */
	if ((c & 0xf8) == 0xf0)
		return 4;
	/* 10xxxxxx is a continuation byte; 11111xxx belonged to the 5 and 6 byte
	 * sequences that UTF-8 no longer permits. */
	return 0;
}
125
/**
 * @brief Decode the UTF-8 character at the read position and step over it.
 * @param[in,out] str Address of the read pointer. The string itself is not
 * modified. On success the pointer is advanced by the length of the decoded
 * sequence, on failure it is left where it was.
 * @return The decoded code point, or -1 at the end of the string or if the
 * bytes at the read position are not a valid UTF-8 sequence
 */
int UTF8_next (const char** str)
{
	const unsigned char* bytes = (const unsigned char*)*str;
	const unsigned char lead = bytes[0];
	size_t seqLen;
	size_t k;
	int lowest;	/* smallest code point that needs seqLen bytes */
	int code;

	/* End of string: nothing to decode. */
	if (lead == '\0')
		return -1;

	/* The lead byte determines the sequence length and holds the top bits. */
	if (lead < 0x80) {
		seqLen = 1;
		lowest = 0;
		code = lead;
	} else if (lead < 0xC0 || lead >= 0xF8) {
		/* A continuation byte cannot start a sequence, and lead bytes of
		 * five or six byte sequences are rejected. */
		return -1;
	} else if (lead < 0xE0) {
		seqLen = 2;
		lowest = 0x80;
		code = lead & 0x1F;
	} else if (lead < 0xF0) {
		seqLen = 3;
		lowest = 0x800;
		code = lead & 0x0F;
	} else {
		seqLen = 4;
		lowest = 0x10000;
		code = lead & 0x07;
	}

	/* Every following byte must be a continuation byte and adds six bits.
	 * The string terminator is not one, so a truncated sequence stops here. */
	for (k = 1; k < seqLen; k++) {
		const unsigned char trail = bytes[k];
		if (!UTF8_CONTINUATION_BYTE(trail))
			return -1;
		code = (code << 6) | (trail & 0x3F);
	}

	/* Reject overlong encodings, the surrogate range and everything beyond
	 * the last Unicode code point. */
	if (code < lowest || (code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000)
		return -1;

	*str += seqLen;
	return code;
}
183
/**
 * @brief Calculate how long a Unicode code point (such as returned by
 * SDL key events in unicode mode) would be in UTF-8 encoding.
 * @param[in] c Unicode code point as 32-bit integer
 * @return Number of bytes (1 to 4) needed to encode @c c, or 0 if @c c is
 * negative or above 0x10FFFF and therefore cannot be encoded
 * @note Code points in the surrogate range (0xD800 - 0xDFFF) are not filtered
 * out here; they are reported with a length of 3.
 */
int UTF8_encoded_len (int c)
{
	/* A negative value is not a code point. Without this check it would be
	 * reported as a one byte character and callers would store its truncated
	 * low byte into the string. */
	if (c < 0)
		return 0;
	if (c <= 0x7F)
		return 1;
	if (c <= 0x07FF)
		return 2;
	if (c <= 0xFFFF)
		return 3;
	if (c <= 0x10FFFF) /* highest defined Unicode code */
		return 4;
	return 0;
}
200
/**
 * @brief Count the number of characters (not the number of bytes) of a zero terminated string
 * @param[in] str Start of the string
 * @return Number of UTF-8 characters before the terminator
 * @note the \\0 termination character is not counted
 * @note to count the number of bytes, use strlen
 * @note A byte that cannot start a UTF-8 sequence is counted as one character
 * of one byte, and a sequence that is cut short by the terminator is counted
 * as one character.
 * @sa strlen
 * @sa UTF8_char_len
 */
size_t UTF8_strlen (const char* str)
{
	size_t result = 0;

	while (str[0] != '\0') {
		int n = UTF8_char_len((unsigned char)*str);

		/* UTF8_char_len returns 0 for a byte that is no lead byte. Step over
		 * it as a single byte, otherwise this loop would never advance. */
		if (n <= 0)
			n = 1;

		/* Advance byte by byte so that a sequence truncated by the terminator
		 * cannot carry the pointer beyond the end of the string. */
		while (n > 0 && str[0] != '\0') {
			str++;
			n--;
		}
		result++;
	}
	return result;
}
218
/**
 * @brief Convert UTF-8 character offset to a byte offset in the given string.
 * @param[in] str Start of the string
 * @param[in] pos UTF-8 character offset from the start
 * @return offset of the first byte of the UTF-8 character at that offset;
 * 0 if @c pos is zero or negative
 * @note If there aren't enough UTF-8 characters, returns the offset of the NULL terminator.
 * @note A byte that cannot start a UTF-8 sequence is treated as one character
 * of one byte, and a sequence that is cut short by the terminator ends at the
 * terminator.
 * @sa UTF8_char_len
 */
int UTF8_char_offset_to_byte_offset (char* str, int pos)
{
	int result = 0;

	while (pos > 0 && str[result] != '\0') {
		int n = UTF8_char_len((unsigned char)str[result]);

		/* UTF8_char_len returns 0 for a byte that is no lead byte. Step over
		 * it as a single byte; otherwise the offset would stay on this byte
		 * for every larger pos and never reach the terminator. */
		if (n <= 0)
			n = 1;

		/* Advance byte by byte so that a sequence truncated by the terminator
		 * cannot produce an offset beyond the end of the string. */
		while (n > 0 && str[result] != '\0') {
			result++;
			n--;
		}
		pos--;
	}
	return result;
}
239
/**
 * @brief UTF8 capable string copy function
 * @param[out] dest Pointer to the output string
 * @param[in] src Pointer to the zero terminated input string
 * @param[in] limit Size of the output buffer in bytes, terminator included.
 * At most @c limit - 1 bytes of @c src are copied.
 * @return dest pointer
 * @note The output is always zero terminated, unless @c limit is 0, in which
 * case nothing is written at all.
 * @note If the input has to be truncated, a multibyte character that would
 * only fit partially is dropped completely.
 */
char* UTF8_strncpyz (char* dest, const char* src, size_t limit)
{
	size_t length;

	/* No room for even the terminator. Bail out here, because limit - 1
	 * below would wrap around to SIZE_MAX and disable the truncation. */
	if (limit == 0)
		return dest;

	length = strlen(src);
	if (length > limit - 1) {
		length = limit - 1;
		/* The last byte that still fits belongs to a multibyte character.
		 * Find out whether that character fits as a whole. */
		if (length > 0 && (unsigned char) src[length - 1] >= 0x80) {
			size_t i = length - 1;
			/* back up to the lead byte of that character */
			while ((i > 0) && UTF8_CONTINUATION_BYTE((unsigned char) src[i]))
				i--;
			/* the character reaches beyond the cut: drop it entirely */
			if (UTF8_char_len((unsigned char) src[i]) + i > length)
				length = i;
		}
	}

	memcpy(dest, src, length);
	dest[length] = '\0';

	return dest;
}
268