1 /* vifm
2  * Copyright (C) 2011 xaizek.
3  *
4  * This program is free software; you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation; either version 2 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write to the Free Software
16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
17  */
18 
19 #include "utf8.h"
20 
21 #ifdef _WIN32
22 #include <windows.h>
23 #endif
24 
25 #include <assert.h> /* assert() */
26 #include <stddef.h> /* size_t wchar_t */
27 #include <stdlib.h> /* malloc() */
28 #include <string.h> /* strlen() */
29 
30 #include "../compat/reallocarray.h"
31 #include "macros.h"
32 #include "utils.h"
33 
34 static size_t guess_char_width(char c);
35 static wchar_t utf8_char_to_wchar(const char str[], size_t char_width);
36 static size_t chrsw(const char str[], size_t char_width);
37 
/* Computes length in bytes of the utf-8 character at the start of str.
 * Returns 0 for an empty string, the full length of the sequence (2, 3 or 4)
 * when the lead byte is followed by the required number of continuation bytes,
 * and 1 in all other cases (single-byte character or a broken sequence). */
size_t
utf8_chrw(const char str[])
{
	size_t width;
	size_t i;

	if(str[0] == '\0')
	{
		return 0;
	}

	width = guess_char_width(str[0]);

	/* Each trailing byte has to look like 10xxxxxx.  Scanning stops on the first
	 * byte that doesn't, so the terminating null character is never skipped. */
	for(i = 1; i < width; ++i)
	{
		if((str[i] & 0xc0) != 0x80)
		{
			return 1;
		}
	}
	return width;
}
53 
/* Guesses length in bytes of a utf-8 sequence by looking at its lead byte
 * only: 110xxxxx announces two bytes, 1110xxxx three bytes and 11110xxx four
 * bytes.  Any other byte is reported as being one byte wide. */
static size_t
guess_char_width(char c)
{
	/* Work with the raw byte value to avoid depending on signedness of char. */
	const unsigned int lead = (unsigned char)c;

	if(lead < 0xc0U || lead > 0xf7U)
	{
		return 1;
	}
	if(lead < 0xe0U)
	{
		return 2;
	}
	return (lead < 0xf0U) ? 3 : 4;
}
66 
/* Finds out how many bytes from the beginning of str fit into
 * max_screen_width terminal columns.  The result always ends on a character
 * boundary: a character that doesn't fit completely is left out. */
size_t
utf8_strsnlen(const char str[], size_t max_screen_width)
{
	const char *p = str;
	size_t room = max_screen_width;

	while(room != 0 && *p != '\0')
	{
		const size_t nbytes = utf8_chrw(p);
		const size_t ncols = chrsw(p, nbytes);
		if(ncols > room)
		{
			break;
		}

		room -= ncols;
		p += nbytes;
	}

	return (size_t)(p - str);
}
85 
/* Counts utf-8 characters (not bytes) in str.  Counting stops early if a
 * character claims more bytes than are left in the string. */
size_t
utf8_nstrlen(const char str[])
{
	size_t remaining = strlen(str);
	size_t count;

	for(count = 0; remaining != 0; ++count)
	{
		const size_t nbytes = utf8_chrw(str);
		if(nbytes > remaining)
		{
			/* Leaving via break skips the increment, so this character isn't
			 * counted. */
			break;
		}

		str += nbytes;
		remaining -= nbytes;
	}

	return count;
}
105 
/* Computes length in bytes of the longest prefix of str that consists of whole
 * characters and occupies at most max_screen_width terminal columns.  A
 * character that claims more bytes than are left in the string ends the
 * scan. */
size_t
utf8_nstrsnlen(const char str[], size_t max_screen_width)
{
	const char *const end = str + strlen(str);
	const char *p = str;

	while(p != end && max_screen_width > 0)
	{
		size_t ncols;
		const size_t nbytes = utf8_chrw(p);
		if(nbytes > (size_t)(end - p))
		{
			break;
		}

		ncols = chrsw(p, nbytes);
		if(ncols > max_screen_width)
		{
			break;
		}

		max_screen_width -= ncols;
		p += nbytes;
	}

	return (size_t)(p - str);
}
133 
/* Decodes one utf-8 encoded character of char_width bytes that starts at str
 * into a wide character.  char_width is expected to be in the range [1; 4],
 * which is checked by assertions.  Returns the decoded value. */
static wchar_t
utf8_char_to_wchar(const char str[], size_t char_width)
{
	/* Payload bits of the lead byte indexed by character width.  Element 0 is a
	 * placeholder that lets the width be used as an index directly. */
	static const int masks[] = { 0x00, 0xff, 0x1f, 0x0f, 0x07 };

	wchar_t result;
	size_t i;

	assert(char_width != 0 && "There are no zero width utf-8 characters.");
	assert(char_width < ARRAY_LEN(masks) && "Too long utf-8 character.");

	result = str[0] & masks[char_width];

	/* Each continuation byte contributes its six low bits.  The loop is bounded
	 * by an index rather than by pre-decrementing char_width, so a zero width
	 * that slips through when assertions are compiled out can't wrap the counter
	 * around and read far beyond the string. */
	for(i = 1; i < char_width; ++i)
	{
		result = (result << 6) | (str[i] & 0x3f);
	}

	return result;
}
154 
/* Sums up terminal widths of all characters of str.  Returns the total number
 * of columns. */
size_t
utf8_strsw(const char str[])
{
	size_t columns = 0;
	const char *p = str;

	while(*p != '\0')
	{
		const size_t nbytes = utf8_chrw(p);
		columns += chrsw(p, nbytes);
		p += nbytes;
	}

	return columns;
}
168 
/* Sums up terminal widths of all characters of str, where a tabulation
 * character advances the running width to the next multiple of tab_stops.
 * tab_stops must be positive (checked by an assertion).  Returns the total
 * number of columns. */
size_t
utf8_strsw_with_tabs(const char str[], int tab_stops)
{
	size_t columns = 0U;
	const char *p;

	assert(tab_stops > 0 && "Non-positive number of tab stops.");

	for(p = str; *p != '\0'; )
	{
		const size_t nbytes = utf8_chrw(p);
		size_t step;

		if(nbytes == 1 && *p == '\t')
		{
			/* Distance from the current column to the next tab stop. */
			step = tab_stops - columns%tab_stops;
		}
		else
		{
			step = chrsw(p, nbytes);
		}

		columns += step;
		p += nbytes;
	}

	return columns;
}
195 
/* Computes width in terminal columns of the first character of str.  Returns
 * 0 for an empty string. */
size_t
utf8_chrsw(const char str[])
{
	const size_t char_width = utf8_chrw(str);
	if(char_width == 0)
	{
		/* utf8_chrw() reports an empty string as zero bytes.  Passing that on to
		 * chrsw() would hit the assertion in utf8_char_to_wchar() that forbids
		 * zero-width characters, so nothing occupies zero columns here. */
		return 0;
	}

	return chrsw(str, char_width);
}
201 
/* Computes width in terminal columns of the utf-8 character that starts at str
 * and is char_width bytes long.  When vifm_wcwidth() yields -1 for the decoded
 * character, it's counted as one column wide. */
static size_t
chrsw(const char str[], size_t char_width)
{
	const size_t columns = vifm_wcwidth(utf8_char_to_wchar(str, char_width));
	if(columns == (size_t)-1)
	{
		return 1;
	}
	return columns;
}
210 
/* Computes by how much the number of bytes in str exceeds the number of utf-8
 * characters in it. */
size_t
utf8_stro(const char str[])
{
	size_t bytes = 0;
	size_t chars = 0;

	while(str[bytes] != '\0')
	{
		bytes += utf8_chrw(str + bytes);
		++chars;
	}

	return bytes - chars;
}
223 
/* Computes by how much the number of bytes in str exceeds the number of
 * terminal columns it occupies.  The difference is accumulated in unsigned
 * arithmetic, so it wraps around should the string take more columns than
 * bytes. */
size_t
utf8_strso(const char str[])
{
	size_t overhead = 0;
	const char *p;
	size_t nbytes;

	for(p = str; *p != '\0'; p += nbytes)
	{
		nbytes = utf8_chrw(p);
		overhead += nbytes - chrsw(p, nbytes);
	}

	return overhead;
}
237 
/* Copies as many whole utf-8 characters of src as fit into dst, whose capacity
 * is dst_len bytes, and null-terminates the result.  Nothing is written when
 * dst_len is zero.  Returns number of bytes written including the terminating
 * null character (0 when dst_len is zero). */
size_t
utf8_strcpy(char dst[], const char src[], size_t dst_len)
{
	size_t copied = 0U;

	if(dst_len == 0U)
	{
		return 0U;
	}

	while(src[copied] != '\0' && dst_len - copied > 1U)
	{
		size_t nbytes = utf8_chrw(src + copied);
		/* One byte is reserved for the terminator, hence ">=" and not ">". */
		if(nbytes >= dst_len - copied)
		{
			break;
		}

		for(; nbytes != 0; --nbytes)
		{
			dst[copied] = src[copied];
			++copied;
		}
	}

	dst[copied] = '\0';
	return copied + 1U;
}
264 
265 #ifdef _WIN32
266 
267 wchar_t
utf8_first_char(const char utf8[])268 utf8_first_char(const char utf8[])
269 {
270 	const size_t len = strlen(utf8);
271 	wchar_t wc;
272 	(void)MultiByteToWideChar(CP_UTF8, 0, utf8, len, &wc, 1);
273 	return wc;
274 }
275 
276 wchar_t *
utf8_to_utf16(const char utf8[])277 utf8_to_utf16(const char utf8[])
278 {
279 	const size_t len = strlen(utf8);
280 	const int size = MultiByteToWideChar(CP_UTF8, 0, utf8, len, NULL, 0);
281 	wchar_t *const utf16 = reallocarray(NULL, size + 1, sizeof(wchar_t));
282 	(void)MultiByteToWideChar(CP_UTF8, 0, utf8, len, utf16, size);
283 	utf16[size] = L'\0';
284 	return utf16;
285 }
286 
287 size_t
utf8_widen_len(const char utf8[])288 utf8_widen_len(const char utf8[])
289 {
290 	return MultiByteToWideChar(CP_UTF8, 0, utf8, strlen(utf8), NULL, 0);
291 }
292 
293 char *
utf8_from_utf16(const wchar_t utf16[])294 utf8_from_utf16(const wchar_t utf16[])
295 {
296 	const size_t len = wcslen(utf16);
297 	const int size = WideCharToMultiByte(CP_UTF8, 0, utf16, len, NULL, 0, NULL,
298 			NULL);
299 	char *const utf8 = malloc(size + 1);
300 	(void)WideCharToMultiByte(CP_UTF8, 0, utf16, len, utf8, size, NULL, NULL);
301 	utf8[size] = '\0';
302 	return utf8;
303 }
304 
305 #endif
306 
307 /* vim: set tabstop=2 softtabstop=2 shiftwidth=2 noexpandtab cinoptions-=(0 : */
308 /* vim: set cinoptions+=t0 filetype=c : */
309