1 /* vifm
2 * Copyright (C) 2011 xaizek.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
17 */
18
19 #include "utf8.h"
20
21 #ifdef _WIN32
22 #include <windows.h>
23 #endif
24
25 #include <assert.h> /* assert() */
26 #include <stddef.h> /* size_t wchar_t */
27 #include <stdlib.h> /* malloc() */
28 #include <string.h> /* strlen() */
29
30 #include "../compat/reallocarray.h"
31 #include "macros.h"
32 #include "utils.h"
33
34 static size_t guess_char_width(char c);
35 static wchar_t utf8_char_to_wchar(const char str[], size_t char_width);
36 static size_t chrsw(const char str[], size_t char_width);
37
/* Measures the first character of str in bytes.  Returns 0 for an empty
 * string, 2-4 when the lead byte announces that many bytes and all of the
 * bytes that follow are continuation bytes (10xxxxxx), and 1 otherwise. */
size_t
utf8_chrw(const char str[])
{
	size_t announced;
	size_t i;

	if(str[0] == '\0')
	{
		return 0;
	}

	announced = guess_char_width(str[0]);
	/* Stop at the first byte that isn't a continuation byte, which also keeps
	 * us from reading past the terminating null character. */
	for(i = 1; i < announced; ++i)
	{
		if((str[i] & 0xc0) != 0x80)
		{
			return 1;
		}
	}
	return announced;
}
53
/* Guesses length in bytes of a utf-8 character from its first byte alone.
 * Returns 2 for 110xxxxx, 3 for 1110xxxx, 4 for 11110xxx and 1 for anything
 * else. */
static size_t
guess_char_width(char c)
{
	const unsigned char byte = (unsigned char)c;

	switch(byte >> 4)
	{
		case 0xc:
		case 0xd:
			return 2;
		case 0xe:
			return 3;
		case 0xf:
			/* Only 11110xxx starts a four byte sequence. */
			return ((byte & 0x08) == 0) ? 4 : 1;
		default:
			return 1;
	}
}
66
/* Finds out how many leading bytes of str can be displayed in at most
 * max_screen_width terminal columns without cutting a character in the
 * middle.  Returns that number of bytes. */
size_t
utf8_strsnlen(const char str[], size_t max_screen_width)
{
	const char *cur = str;
	size_t columns_left = max_screen_width;

	while(columns_left != 0 && *cur != '\0')
	{
		const size_t bytes = utf8_chrw(cur);
		const size_t columns = chrsw(cur, bytes);
		if(columns > columns_left)
		{
			/* The next character doesn't fit on the screen. */
			break;
		}

		columns_left -= columns;
		cur += bytes;
	}

	return (size_t)(cur - str);
}
85
/* Counts characters (as opposed to bytes) in a utf-8 string.  Returns the
 * count. */
size_t
utf8_nstrlen(const char str[])
{
	size_t bytes_left = strlen(str);
	size_t count;

	for(count = 0; bytes_left != 0; ++count)
	{
		const size_t bytes = utf8_chrw(str);
		if(bytes > bytes_left)
		{
			/* Never step over the end of the string. */
			break;
		}

		str += bytes;
		bytes_left -= bytes;
	}

	return count;
}
105
/* Same idea as utf8_strsnlen(), but additionally never advances past the
 * end of the string as reported by strlen().  Returns number of leading
 * bytes of str that fit in max_screen_width terminal columns. */
size_t
utf8_nstrsnlen(const char str[], size_t max_screen_width)
{
	const char *const end = str + strlen(str);
	const char *cur = str;

	while(cur != end && max_screen_width > 0)
	{
		size_t columns;
		const size_t bytes = utf8_chrw(cur);
		if(bytes > (size_t)(end - cur))
		{
			/* Character claims more bytes than there are left. */
			break;
		}

		columns = chrsw(cur, bytes);
		if(columns > max_screen_width)
		{
			/* Character is too wide for the remaining space. */
			break;
		}

		max_screen_width -= columns;
		cur += bytes;
	}

	return (size_t)(cur - str);
}
133
/* Decodes single utf-8 encoded character of char_width bytes (1-4) that
 * starts at str.  Returns the decoded value as a wide character. */
static wchar_t
utf8_char_to_wchar(const char str[], size_t char_width)
{
	/* Payload masks of the first byte indexed by character width.  Element
	 * zero is a dummy that allows using char_width as an index directly. */
	static const int masks[] = { 0x00, 0xff, 0x1f, 0x0f, 0x07 };

	wchar_t code;
	size_t i;

	assert(char_width != 0 && "There are no zero width utf-8 characters.");
	assert(char_width < ARRAY_LEN(masks) && "Too long utf-8 character.");

	code = str[0] & masks[char_width];
	/* Each following byte contributes its six low bits. */
	for(i = 1; i < char_width; ++i)
	{
		code = (code << 6) | (str[i] & 0x3f);
	}

	return code;
}
154
/* Sums up on-screen widths of all characters of a utf-8 string.  Returns
 * number of terminal columns the string takes. */
size_t
utf8_strsw(const char str[])
{
	size_t columns = 0;
	size_t bytes;

	for(; *str != '\0'; str += bytes)
	{
		bytes = utf8_chrw(str);
		columns += chrsw(str, bytes);
	}

	return columns;
}
168
/* Computes on-screen width of a utf-8 string, where each tabulation
 * character advances to the next multiple of tab_stops columns.  tab_stops
 * must be positive.  Returns number of terminal columns. */
size_t
utf8_strsw_with_tabs(const char str[], int tab_stops)
{
	size_t columns = 0U;

	assert(tab_stops > 0 && "Non-positive number of tab stops.");

	while(*str != '\0')
	{
		const size_t bytes = utf8_chrw(str);
		const int is_tab = (bytes == 1 && *str == '\t');

		/* A tab fills the space up to the next tab stop, everything else has
		 * its regular width. */
		columns += is_tab ? tab_stops - columns%tab_stops : chrsw(str, bytes);
		str += bytes;
	}

	return columns;
}
195
/* Computes on-screen width of the first character of str.  For an empty
 * string utf8_chrw() yields zero, which trips the assertion in
 * utf8_char_to_wchar(), so str is expected to be non-empty. */
size_t
utf8_chrsw(const char str[])
{
	const size_t bytes = utf8_chrw(str);
	return chrsw(str, bytes);
}
201
/* Computes how many terminal columns are taken by the character of
 * char_width bytes that starts at str.  A (size_t)-1 result of
 * vifm_wcwidth() is reported as width of 1. */
static size_t
chrsw(const char str[], size_t char_width)
{
	const size_t columns = vifm_wcwidth(utf8_char_to_wchar(str, char_width));
	if(columns == (size_t)-1)
	{
		return 1;
	}
	return columns;
}
210
/* Computes byte overhead of a utf-8 string, that is how many more bytes
 * than characters it contains.  Returns the difference. */
size_t
utf8_stro(const char str[])
{
	const char *cur = str;
	size_t chars = 0;

	while(*cur != '\0')
	{
		cur += utf8_chrw(cur);
		++chars;
	}

	/* Total bytes minus one byte per character. */
	return (size_t)(cur - str) - chars;
}
223
/* Computes screen overhead of a utf-8 string: the sum over all characters
 * of the difference between size in bytes and width on the screen.  The
 * arithmetic is performed in size_t. */
size_t
utf8_strso(const char str[])
{
	size_t extra = 0;
	size_t bytes;

	for(; *str != '\0'; str += bytes)
	{
		size_t columns;

		bytes = utf8_chrw(str);
		columns = chrsw(str, bytes);
		/* Equals (bytes - 1) - (columns - 1) in unsigned arithmetic. */
		extra += bytes - columns;
	}

	return extra;
}
237
/* Copies as many whole utf-8 characters from src to dst as fit in a buffer
 * of dst_len bytes while leaving room for the terminating null character,
 * which is always written when dst_len isn't zero.  Returns number of bytes
 * written including the null character, or 0 if dst_len is zero. */
size_t
utf8_strcpy(char dst[], const char src[], size_t dst_len)
{
	char *out = dst;
	/* Free space in dst, including the byte reserved for the terminator. */
	size_t room = dst_len;

	if(dst_len == 0U)
	{
		return 0U;
	}

	while(room > 1U && *src != '\0')
	{
		size_t i;
		const size_t bytes = utf8_chrw(src);
		if(bytes >= room)
		{
			/* The character wouldn't leave space for the terminator. */
			break;
		}

		for(i = 0; i < bytes; ++i)
		{
			out[i] = src[i];
		}
		out += bytes;
		src += bytes;
		room -= bytes;
	}

	*out = '\0';
	return (size_t)(out - dst) + 1U;
}
264
265 #ifdef _WIN32
266
267 wchar_t
utf8_first_char(const char utf8[])268 utf8_first_char(const char utf8[])
269 {
270 const size_t len = strlen(utf8);
271 wchar_t wc;
272 (void)MultiByteToWideChar(CP_UTF8, 0, utf8, len, &wc, 1);
273 return wc;
274 }
275
276 wchar_t *
utf8_to_utf16(const char utf8[])277 utf8_to_utf16(const char utf8[])
278 {
279 const size_t len = strlen(utf8);
280 const int size = MultiByteToWideChar(CP_UTF8, 0, utf8, len, NULL, 0);
281 wchar_t *const utf16 = reallocarray(NULL, size + 1, sizeof(wchar_t));
282 (void)MultiByteToWideChar(CP_UTF8, 0, utf8, len, utf16, size);
283 utf16[size] = L'\0';
284 return utf16;
285 }
286
287 size_t
utf8_widen_len(const char utf8[])288 utf8_widen_len(const char utf8[])
289 {
290 return MultiByteToWideChar(CP_UTF8, 0, utf8, strlen(utf8), NULL, 0);
291 }
292
293 char *
utf8_from_utf16(const wchar_t utf16[])294 utf8_from_utf16(const wchar_t utf16[])
295 {
296 const size_t len = wcslen(utf16);
297 const int size = WideCharToMultiByte(CP_UTF8, 0, utf16, len, NULL, 0, NULL,
298 NULL);
299 char *const utf8 = malloc(size + 1);
300 (void)WideCharToMultiByte(CP_UTF8, 0, utf16, len, utf8, size, NULL, NULL);
301 utf8[size] = '\0';
302 return utf8;
303 }
304
305 #endif
306
307 /* vim: set tabstop=2 softtabstop=2 shiftwidth=2 noexpandtab cinoptions-=(0 : */
308 /* vim: set cinoptions+=t0 filetype=c : */
309