1 /* utf8.c - Translate to/from UTF8.
2
3 Copyright (C) 1999 Tom Tromey
4
5 The Gnome Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Library General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
9
10 The Gnome Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Library General Public License for more details.
14
15 You should have received a copy of the GNU Library General Public
16 License along with the Gnome Library; see the file COPYING.LIB. If not,
17 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA. */
19
20 #include <config.h>
21
22 #include <stdlib.h>
23 #ifdef HAVE_LANGINFO_H
24 #include <langinfo.h>
25 #endif
26 #include <string.h>
27
28 #include "unicode.h"
29 #include "convert.h"
30 #include "utf8.h"
31
/* Skip backwards to the previous utf8 character.

   START is the beginning of the buffer and P points just past the
   character to step over.  Returns a pointer to the first byte of the
   character that precedes P.  Returns NULL when there is nothing to
   step back over (P is at or before START) or when six continuation
   bytes have been skipped, which is longer than any sequence this
   file produces.  If the scan reaches START, START is returned
   without checking what kind of byte it holds.  */
char *
unicode_previous_utf8 (const char *start, const char *p)
{
  int count = 6;

  /* Nothing precedes the start of the buffer; stepping back from here
     would form (and return) a pointer in front of START.  */
  if (p <= start)
    return NULL;

  for (--p; p > start && count; --p, --count)
    {
      /* Any byte that is not of the form 10xxxxxx begins a character.  */
      if ((*p & 0xc0) != 0x80)
        break;
    }
  return count ? (char *) p : NULL;
}
44
/* Skip over a utf8 character.

   Returns a pointer to the byte that follows the character starting
   at P: the lead byte is stepped over, then every 10xxxxxx
   continuation byte after it.  When P already points at the
   terminating NUL it is returned unchanged.  The parameter is const
   but the result is not, so the caller gets back a pointer of
   whatever constness it needs.  */
char *
unicode_next_utf8 (const char *p)
{
  const char *cursor = p;

  if (*cursor == '\0')
    return (char *) cursor;

  do
    ++cursor;
  while ((*cursor & 0xc0) == 0x80);

  return (char *) cursor;
}
58
/* Return the length, in characters, of a UTF8 string.

   P is the string and MAX is the number of bytes that may be
   examined; a negative MAX means "scan up to the terminating NUL".
   Counting stops at a NUL byte or after MAX bytes, whichever comes
   first.  A character that starts inside the limit but is cut off by
   it is still counted.  No byte at or beyond P + MAX is read, so P
   does not have to be NUL-terminated when MAX is non-negative.  */
int
unicode_strlen (const char *p, int max)
{
  const char *start = p;
  int len = 0;

  while ((max < 0 || p - start < max) && *p)
    {
      /* Step over the lead byte, then over any 10xxxxxx continuation
         bytes that are still inside the limit.  */
      ++p;
      while ((max < 0 || p - start < max) && (*p & 0xc0) == 0x80)
        ++p;
      ++len;
    }
  return len;
}
81
/* Return pointer to the last UTF-8 sequence in a string.

   For the empty string P itself is returned (it points at the
   terminating NUL).  Otherwise the result is whatever
   unicode_previous_utf8 reports when stepping back from the
   terminator.  */
char *
unicode_last_utf8 (const char *p)
{
  const char *end;

  if (*p == '\0')
    return (char *) p;

  end = p + strlen (p);
  return unicode_previous_utf8 (p, end);
}
94
95
96 char *
unicode_get_utf8(const char * p,unicode_char_t * result)97 unicode_get_utf8 (const char *p, unicode_char_t *result)
98 {
99 int i, mask = 0, len;
100 unsigned char c = (unsigned char) *p;
101
102 UTF8_COMPUTE (c, mask, len);
103 if (len == -1)
104 return NULL;
105 UTF8_GET (*result, p, i, mask, len);
106 if (*result == (unicode_char_t) -1)
107 return NULL;
108 return (char *) p + len;
109 }
110
/* unicode_offset_to_index

   Convert a character offset into a byte index: returns the number of
   bytes occupied by the first OFFSET characters of SRC.  If SRC holds
   fewer characters than that, the scan stops at the terminating NUL
   and the byte length of the string is returned.  A negative OFFSET
   does not reach zero while counting down, so in practice it also
   yields the full byte length.  */
size_t
unicode_offset_to_index (const char *src, int offset)
{
  const char *cursor;
  int remaining;

  for (cursor = src, remaining = offset;
       remaining != 0 && *cursor != '\0';
       --remaining)
    cursor = unicode_next_utf8 (cursor);

  return cursor - src;
}
123
/* unicode_index_to_offset

   Convert a byte index into a character offset: returns how many
   characters of SRC begin before byte INDEX.  The scan also stops at
   the terminating NUL, so an INDEX past the end of the string gives
   the total number of characters.  */
size_t
unicode_index_to_offset (const char *src, int index)
{
  const char *cursor = src;
  size_t count = 0;

  for (; *cursor != '\0' && (cursor - src) < index; ++count)
    cursor = unicode_next_utf8 (cursor);

  return count;
}
139
/* unicode_strncpy

   Copy at most N characters (not bytes) of the UTF8 string SRC into
   DEST and NUL-terminate the result.  Unlike strncpy, the result is
   always terminated and no padding is written.  DEST must have room
   for the copied bytes plus the terminator and must not overlap SRC.
   Returns DEST.  */
char *
unicode_strncpy (char *dest, const char *src, size_t n)
{
  const char *s = src;
  size_t nbytes;

  /* Find the byte position just past the N-th character, or the end
     of the string if it is shorter than that.  */
  while (n && *s)
    {
      s = unicode_next_utf8 (s);
      n--;
    }

  /* The scan never steps over a NUL, so exactly this many bytes of
     SRC are valid; a plain memcpy is all that is needed.  */
  nbytes = (size_t) (s - src);
  memcpy (dest, src, nbytes);
  dest[nbytes] = 0;
  return dest;
}
155
/* Work out which character set is in use and whether it is UTF-8.

   The CHARSET environment variable is consulted first, then
   nl_langinfo when <langinfo.h> provides a codeset item.  If A is
   non-NULL and *A is still NULL, *A receives the first charset name
   found, or "US-ASCII" when none is found.  The pointer stored in *A
   refers to storage returned by getenv or nl_langinfo, or to a string
   literal; the caller must not free or modify it.

   Returns 1 when a UTF-8 charset was detected, 0 otherwise.  */
static int
unicode_get_charset_internal (char **a)
{
  char *charset = getenv ("CHARSET");

  if (charset && a && ! *a)
    *a = charset;

  /* The environment value only has to contain "UTF-8"; the
     nl_langinfo value below has to match it exactly.  */
  if (charset && strstr (charset, "UTF-8"))
    return 1;

  /* Test with `defined': in a bare `#elif CODESET' the name evaluates
     to 0 whenever it is an enumerator, a macro that expands to itself
     or a macro whose value is 0, which silently removes the
     nl_langinfo lookup.  */
#if defined (_NL_CTYPE_CODESET_NAME)
  charset = nl_langinfo (_NL_CTYPE_CODESET_NAME);
#elif defined (CODESET)
  charset = nl_langinfo (CODESET);
#else
  charset = NULL;
#endif

  if (charset)
    {
      if (a && ! *a)
        *a = charset;
      if (strcmp (charset, "UTF-8") == 0)
        return 1;
    }

  if (a && ! *a)
    *a = "US-ASCII";
  /* Assume this for compatibility at present.  */
  return 0;
}
192
/* Result of the first charset lookup.  utf8_locale_cache stays at -1
   until unicode_get_charset has run once; after that it holds the
   value that call returned and utf8_charset_cache holds the charset
   name that went with it.  */
static int utf8_locale_cache = -1;
static char *utf8_charset_cache = NULL;

/* Report whether the current charset is UTF-8.

   The lookup is done by unicode_get_charset_internal on the first
   call only; later calls return the cached answer.  If CHARSET is
   non-NULL, *CHARSET is set to the cached charset name.  Returns the
   cached value from unicode_get_charset_internal.  */
int
unicode_get_charset (char **charset)
{
  if (utf8_locale_cache == -1)
    utf8_locale_cache = unicode_get_charset_internal (&utf8_charset_cache);

  if (charset != NULL)
    *charset = utf8_charset_cache;

  return utf8_locale_cache;
}
210
/* Return the display width of the UTF8 string P.

   At present this is simply the character count reported by
   unicode_strlen for the whole string.

   FIXME: characters with the doublewidth property should count for
   2, and nonspacing combining characters for 0.  */
int
unicode_string_width (const char *p)
{
  int columns = unicode_strlen (p, -1);

  return columns;
}
221
/* Fit STRING into a field of WIDTH characters and store the
   NUL-terminated result in DEST.

   A STRING narrower than WIDTH is copied and followed by spaces up to
   WIDTH; a wider one is cut down to its first WIDTH characters.
   Widths are measured with unicode_string_width.  A negative WIDTH is
   treated as 0.  DEST must have room for the kept bytes of STRING,
   the padding and the terminator.  Only the kept bytes are ever
   written, so an over-long STRING cannot run past a buffer sized for
   the field.

   RIGHT is accepted for interface compatibility but is not consulted;
   padding always follows the text.  */
void
unicode_pad_string (char *dest, int right, int width, const char *string)
{
  size_t nbytes;
  int pad;

  (void) right;

  if (width < 0)
    width = 0;

  pad = width - unicode_string_width (string);
  if (pad < 0)
    {
      /* Too wide: keep the bytes of the first WIDTH characters.  The
         cut point must come from the requested width, not from the
         (negative) remainder.  */
      nbytes = unicode_offset_to_index (string, width);
      pad = 0;
    }
  else
    nbytes = strlen (string);

  /* memmove so that padding a string in place (DEST == STRING) is
     well defined.  */
  memmove (dest, string, nbytes);
  dest += nbytes;

  while (pad > 0)
    {
      *dest++ = ' ';
      pad--;
    }
  *dest = 0;
}
246
247 /* unicode_strchr */
248
249 char *
unicode_strchr(const char * p,unicode_char_t c)250 unicode_strchr(const char *p, unicode_char_t c)
251 {
252 char ch[10];
253 int first, len, i;
254
255 if (c < 0x80) return strchr(p, c);
256
257 if (c < 0x800) {
258 first = 0xc0;
259 len = 2;
260 } else if (c < 0x10000)
261 {
262 first = 0xe0;
263 len = 3;
264 }
265 else if (c < 0x200000)
266 {
267 first = 0xf0;
268 len = 4;
269 }
270 else if (c < 0x4000000)
271 {
272 first = 0xf8;
273 len = 5;
274 }
275 else
276 {
277 first = 0xfc;
278 len = 6;
279 }
280
281 for (i = len - 1; i > 0 ; --i)
282 {
283 ch[i] = (c & 0x3f) | 0x80;
284 c >>= 6;
285 }
286 ch[0] = c | first;
287 ch[len] = 0;
288
289 return strstr(p, ch);
290 }
291
#if 0
/* unicode_strrchr
 *
 * Find the last occurrence of the character C in the UTF8 string P.
 * Values below 0x80 are handed to strrchr; anything larger is encoded
 * as a multi-byte sequence and located by repeating strstr until it
 * finds no further match (libc has no strrstr).  Returns a pointer to
 * the last match or NULL when there is none.
 *
 * Still compiled out: enabling it adds a new external symbol.
 * NOTE(review): confirm the public header declares it before removing
 * the #if 0.
 */

char *
unicode_strrchr (const char *p, unicode_char_t c)
{
  char ch[10];
  int first, len, i;
  const char *hit;
  const char *last = NULL;

  if (c < 0x80)
    return strrchr (p, c);

  if (c < 0x800)
    {
      first = 0xc0;
      len = 2;
    }
  else if (c < 0x10000)
    {
      first = 0xe0;
      len = 3;
    }
  else if (c < 0x200000)
    {
      first = 0xf0;
      len = 4;
    }
  else if (c < 0x4000000)
    {
      first = 0xf8;
      len = 5;
    }
  else
    {
      first = 0xfc;
      len = 6;
    }

  /* Continuation bytes take six bits each, back to front; the
     remaining bits go into the lead byte.  */
  for (i = len - 1; i > 0; --i)
    {
      ch[i] = (c & 0x3f) | 0x80;
      c >>= 6;
    }
  ch[0] = c | first;
  ch[len] = 0;

  /* A lead byte never appears inside its own sequence, so matches
     cannot overlap and the search can resume LEN bytes further on.  */
  for (hit = strstr (p, ch); hit != NULL; hit = strstr (hit + len, ch))
    last = hit;

  return (char *) last;
}
#endif
343