1 /**
2  * @file
3  */
4 
5 /*
6 All original material Copyright (C) 2002-2013 UFO: Alien Invasion.
7 
8 Copyright (C) 1997-2001 Id Software, Inc.
9 
10 This program is free software; you can redistribute it and/or
11 modify it under the terms of the GNU General Public License
12 as published by the Free Software Foundation; either version 2
13 of the License, or (at your option) any later version.
14 
15 This program is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
18 
19 See the GNU General Public License for more details.
20 
21 You should have received a copy of the GNU General Public License
22 along with this program; if not, write to the Free Software
23 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
24 */
25 
26 #include "utf8.h"
27 #include <string.h>
28 
/**
 * @brief Remove one complete (possibly multibyte) character from a string, in place.
 * @param[in,out] s Start of the zero-terminated string; the bytes behind the
 * removed character (terminator included) are shifted down over it
 * @param[in] pos UTF-8 char offset from the start (not the byte offset)
 * @return Number of bytes deleted. This is 0 when the converted offset lands
 * on the string terminator.
 * @sa UTF8_char_offset_to_byte_offset
 */
int UTF8_delete_char_at (char* s, int pos)
{
	/* Translate the character index into a byte index first */
	const int offset = UTF8_char_offset_to_byte_offset(s, pos);
	int begin;
	int end;

	/* If the byte index points into the middle of a sequence, walk back
	 * until a non-continuation byte (or the string start) is reached. */
	for (begin = offset; begin > 0; begin--) {
		if (!UTF8_CONTINUATION_BYTE(s[begin]))
			break;
	}

	/* Step over one byte and every continuation byte that follows it,
	 * unless we are already sitting on the terminator. */
	end = offset;
	if (s[end] != 0) {
		do {
			end++;
		} while (s[end] != 0 && UTF8_CONTINUATION_BYTE(s[end]));
	}

	/* Source and destination overlap, so memmove (not memcpy) is required.
	 * The + 1 carries the terminator along. */
	memmove(s + begin, s + end, strlen(s + end) + 1);

	return end - begin;
}
54 
/**
 * @brief Insert a (possibly multibyte) UTF-8 character into a string.
 * @param[in,out] s Start of the zero-terminated string; it is modified in place
 * @param[in] n Buffer size of the string in bytes (terminator included)
 * @param[in] pos UTF-8 char offset from the start (not the byte offset)
 * @param[in] c Unicode code as 32-bit integer
 * @return Number of bytes added. 0 is returned and the string is left untouched
 * if @c c is negative, if UTF8_encoded_len reports that @c c cannot be encoded,
 * or if the enlarged string would not fit into @c n bytes.
 * @sa UTF8_encoded_len
 * @sa UTF8_char_offset_to_byte_offset
 */
int UTF8_insert_char_at (char* s, int n, int pos, int c)
{
	/* A negative value is not a Unicode code point. Reject it here instead
	 * of storing its truncated low byte into the string. */
	if (c < 0)
		return 0;

	const int utf8len = UTF8_encoded_len(c);
	if (utf8len == 0)
		return 0;

	/* Convert the UTF-8 char offset to byte offset */
	pos = UTF8_char_offset_to_byte_offset(s, pos);

	/* Bytes from the insertion point up to and including the terminator */
	const int tail = (int)(strlen(&s[pos]) + 1);

	/* pos + tail is the number of bytes in use right now; the encoded
	 * character has to fit into the buffer on top of that. */
	if (pos + tail + utf8len > n)
		return 0;

	/* Insertion: move up rest of string. Also moves string terminator. */
	memmove(&s[pos + utf8len], &s[pos], tail);

	if (c <= 0x7f) {
		s[pos] = (char)c;
	} else if (c <= 0x7ff) {						/* c has 11 bits */
		s[pos] = (char)(0xc0 | (c >> 6));				/* high 5 bits */
		s[pos + 1] = (char)(0x80 | (c & 0x3f));			/* low 6 bits */
	} else if (c <= 0xffff) {						/* c has 16 bits */
		s[pos] = (char)(0xe0 | (c >> 12));				/* high 4 bits */
		s[pos + 1] = (char)(0x80 | ((c >> 6) & 0x3f));	/* mid 6 bits */
		s[pos + 2] = (char)(0x80 | (c & 0x3f));			/* low 6 bits */
	} else if (c <= 0x10ffff) {						/* c has 21 bits */
		s[pos] = (char)(0xf0 | (c >> 18));				/* high 3 bits */
		s[pos + 1] = (char)(0x80 | ((c >> 12) & 0x3f));	/* mid 6 bits */
		s[pos + 2] = (char)(0x80 | ((c >> 6) & 0x3f));	/* mid 6 bits */
		s[pos + 3] = (char)(0x80 | (c & 0x3f));			/* low 6 bits */
	}

	return utf8len;
}
98 
/**
 * @brief Length of the UTF-8 character that starts with the given byte.
 * @param[in] c First byte of the (potential) UTF-8 sequence
 * @return Number of bytes in the encoded character (1 to 4), or 0 if the byte
 * cannot start a UTF-8 sequence (a continuation byte, or a byte of 0xf8 and above)
 * @todo Using this does not solve the truncation problem in case of
 * decomposed characters. For example a code for "a" followed by a
 * code for "put dots above previous character": the "a" will be reported
 * as a character of length 1 by this function, even though the code
 * that follows is part of its visual appearance and should not be
 * cut off separately. Fortunately decomposed characters are rarely used.
 */
int UTF8_char_len (unsigned char c)
{
	/* 0xxxxxxx: plain ASCII */
	if ((c & 0x80) == 0x00)
		return 1;
	/* 110xxxxx: lead byte of a two byte sequence */
	if ((c & 0xe0) == 0xc0)
		return 2;
	/* 1110xxxx: lead byte of a three byte sequence */
	if ((c & 0xf0) == 0xe0)
		return 3;
	/* 11110xxx: lead byte of a four byte sequence */
	if ((c & 0xf8) == 0xf0)
		return 4;
	/* What remains is 10xxxxxx (a continuation byte) and 11111xxx. The
	 * latter used to introduce 5 and 6 byte sequences, which are no
	 * longer valid UTF-8. */
	return 0;
}
125 
/**
 * @brief Decode the next UTF-8 character from the given string
 * @param[in,out] str Address of the read pointer into the source string. The string
 * itself is not touched. On success the pointer is advanced by the length of the
 * decoded character; on error it is left where it was.
 * @return The decoded Unicode code point, or -1 on error. Errors are: the pointer is
 * at the string terminator, the first byte cannot start a sequence, a continuation
 * byte is missing, the encoding is longer than necessary for the value, the value is
 * a UTF-16 surrogate (0xD800 to 0xDFFF) or it lies above 0x10FFFF.
 */
int UTF8_next (const char** str)
{
	const unsigned char* bytes = (const unsigned char*)(*str);
	const unsigned char lead = bytes[0];
	size_t count;	/* total bytes of the sequence */
	size_t k;
	int lowest;		/* smallest value that needs this many bytes */
	int code;

	if (lead == '\0')
		return -1;

	/* Classify the lead byte and pull out its payload bits */
	if ((lead & 0x80) == 0x00) {
		count = 1;
		lowest = 0;
		code = lead;
	} else if ((lead & 0xE0) == 0xC0) {
		count = 2;
		lowest = 0x80;
		code = lead & 0x1F;
	} else if ((lead & 0xF0) == 0xE0) {
		count = 3;
		lowest = 0x800;
		code = lead & 0x0F;
	} else if ((lead & 0xF8) == 0xF0) {
		count = 4;
		lowest = 0x10000;
		code = lead & 0x07;
	} else {
		/* stray continuation byte, or 0xF8 and above */
		return -1;
	}

	/* Append six payload bits per continuation byte. The string terminator
	 * is not a continuation byte, so a sequence that is cut short ends here. */
	for (k = 1; k < count; k++) {
		const unsigned char follow = bytes[k];
		if (!UTF8_CONTINUATION_BYTE(follow))
			return -1;
		code = (code << 6) | (follow & 0x3F);
	}

	/* Refuse overlong encodings, surrogates and values beyond the Unicode range */
	if (code < lowest || (code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000)
		return -1;

	*str += count;
	return code;
}
183 
/**
 * @brief Calculate how long a Unicode code point (such as returned by
 * SDL key events in unicode mode) would be in UTF-8 encoding.
 * @param[in] c Unicode code point
 * @return Number of bytes (1 to 4) needed to encode @c c, or 0 if @c c is
 * negative or above 0x10FFFF and therefore cannot be encoded
 */
int UTF8_encoded_len (int c)
{
	/* Negative values are not code points. Without this check they would
	 * pass the first comparison below and be reported as one byte long. */
	if (c < 0)
		return 0;
	if (c <= 0x7F)
		return 1;
	if (c <= 0x07FF)
		return 2;
	if (c <= 0xFFFF)
		return 3;
	if (c <= 0x10FFFF)  /* highest defined Unicode code */
		return 4;
	return 0;
}
200 
/**
 * @brief Count the number of characters (not the number of bytes) of a zero-terminated string
 * @param[in] str Start of the zero-terminated string
 * @return Number of UTF-8 characters in the string
 * @note the \\0 termination character is not counted
 * @note to count the number of bytes, use strlen
 * @note A byte that cannot start a UTF-8 sequence is counted as one character, and a
 * multibyte sequence that is cut short by the terminator is counted as one character.
 * @sa strlen
 * @sa UTF8_char_len
 */
size_t UTF8_strlen (const char* str)
{
	size_t result = 0;

	while (str[0] != '\0') {
		int n = UTF8_char_len((unsigned char)*str);

		/* UTF8_char_len reports 0 for a byte that cannot start a sequence.
		 * Consume that byte on its own, otherwise the scan would never advance. */
		if (n == 0)
			n = 1;

		/* Step over the character byte by byte so that a sequence that is
		 * cut short cannot carry the pointer past the terminator. */
		do {
			str++;
			n--;
		} while (n > 0 && str[0] != '\0');

		result++;
	}
	return result;
}
218 
/**
 * @brief Convert UTF-8 character offset to a byte offset in the given string.
 * @param[in] str Start of the zero-terminated string (it is only read)
 * @param[in] pos UTF-8 character offset from the start; values of 0 and below yield 0
 * @return offset of the first byte of the UTF-8 character at that offset
 * @note If there aren't enough UTF-8 characters, returns the offset of the NUL terminator.
 * @note A byte that cannot start a UTF-8 sequence is treated as a character of one byte.
 * The returned offset never lies beyond the terminator, even if the last multibyte
 * sequence of the string is incomplete.
 * @sa UTF8_char_len
 */
int UTF8_char_offset_to_byte_offset (char* str, int pos)
{
	int result = 0;

	while (pos > 0 && str[result] != '\0') {
		int n = UTF8_char_len((unsigned char)str[result]);

		/* UTF8_char_len reports 0 for a byte that cannot start a sequence.
		 * Skip that single byte, otherwise the offset would stall on it. */
		if (n == 0)
			n = 1;

		/* Advance byte by byte and stop at the terminator, so a sequence
		 * that is cut short cannot push the offset out of the string. */
		do {
			result++;
			n--;
		} while (n > 0 && str[result] != '\0');

		pos--;
	}
	return result;
}
239 
/**
 * @brief UTF8 capable string copy function
 * @param[out] dest Pointer to the output string
 * @param[in] src Pointer to the zero-terminated input string
 * @param[in] limit Size of the output buffer in bytes. At most limit - 1 bytes
 * are copied, followed by the terminator. If @c limit is 0 nothing is written.
 * @return dest pointer
 * @note If the input has to be shortened, it is never cut in the middle of a
 * multibyte character: a character that does not fit completely is dropped.
 * @sa UTF8_char_len
 */
char* UTF8_strncpyz (char* dest, const char* src, size_t limit)
{
	size_t length;

	/* There is no room for even the terminator. Bail out before limit - 1
	 * wraps around to a huge value and disables the truncation below. */
	if (limit == 0)
		return dest;

	length = strlen(src);
	if (length > limit - 1) {
		length = limit - 1;
		/* Only a non-ASCII byte at the cut can belong to a split character */
		if (length > 0 && (unsigned char) src[length - 1] >= 0x80) {
			/* Walk back to the byte that starts the last character */
			size_t i = length - 1;
			while ((i > 0) && UTF8_CONTINUATION_BYTE((unsigned char) src[i]))
				i--;
			/* Drop that character if its full encoding reaches past the cut */
			if ((size_t) UTF8_char_len((unsigned char) src[i]) + i > length)
				length = i;
		}
	}

	memcpy(dest, src, length);
	dest[length] = '\0';

	return dest;
}
268