1 /* utf8.c - Translate to/from UTF8.
2 
3    Copyright (C) 1999 Tom Tromey
4 
5    The Gnome Library is free software; you can redistribute it and/or
6    modify it under the terms of the GNU Library General Public License as
7    published by the Free Software Foundation; either version 2 of the
8    License, or (at your option) any later version.
9 
10    The Gnome Library is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13    Library General Public License for more details.
14 
15    You should have received a copy of the GNU Library General Public
16    License along with the Gnome Library; see the file COPYING.LIB.  If not,
17    write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18    Boston, MA 02111-1307, USA.  */
19 
20 #include <config.h>
21 
22 #include <stdlib.h>
23 #ifdef HAVE_LANGINFO_H
24 #include <langinfo.h>
25 #endif
26 #include <string.h>
27 
28 #include "unicode.h"
29 #include "convert.h"
30 #include "utf8.h"
31 
/* Step backwards from P to the first byte of the preceding UTF-8
   character.

   START is the beginning of the buffer; the scan never reads or moves
   before it.  P points just past the character of interest.

   Returns a pointer to the first byte at or before P - 1 that is not a
   continuation byte (10xxxxxx), or START itself if the scan reaches the
   beginning of the buffer.  Returns NULL when P is at or before START
   (there is no previous character), or when six continuation bytes in
   a row are found without reaching a lead byte.  */
char *
unicode_previous_utf8 (const char *start, const char *p)
{
  /* Give up after stepping over this many continuation bytes.  */
  int count = 6;

  /* Nothing precedes START; without this check the decrement below
     would form a pointer before the buffer and hand it back.  */
  if (p <= start)
    return NULL;

  for (--p; p > start && count; --p, --count)
    {
      /* Anything that is not 10xxxxxx begins a character.  */
      if ((*p & 0xc0) != 0x80)
        break;
    }
  return count ? (char *) p : NULL;
}
44 
/* Advance past one UTF-8 character.

   If P points at the terminating NUL it is returned unchanged.
   Otherwise the byte at P is skipped together with every continuation
   byte (10xxxxxx) that follows it.

   P is accepted as a pointer to const, and the result is cast back to
   a plain char * so that callers holding a writable buffer can use it
   directly.  */
char *
unicode_next_utf8 (const char *p)
{
  const char *cursor = p;

  if (*cursor == '\0')
    return (char *) cursor;

  /* Always consume the first byte, then any trailing 10xxxxxx bytes.  */
  do
    ++cursor;
  while ((*cursor & 0xc0) == 0x80);

  return (char *) cursor;
}
58 
/* Count the characters in the UTF-8 string P.

   MAX limits how many bytes of P are considered; a negative MAX means
   "no limit", in which case counting stops only at the terminating
   NUL.  A character that begins before the MAX-byte limit but extends
   past it is still counted.  An empty string always yields 0.  */
int
unicode_strlen (const char *p, int max)
{
  const char *cursor = p;
  int count = 0;

  if (*p == '\0')
    return 0;

  for (;;)
    {
      /* Before each character: stop once the byte limit is reached.  */
      if (max >= 0 && cursor - p >= max)
        break;

      cursor = unicode_next_utf8 (cursor);
      count++;

      /* After each character: stop at the end of the string ...  */
      if (*cursor == '\0')
        break;
      /* ... or when the character just counted ran past the limit
         (for instance, the buffer ends mid-character).  */
      if (max > 0 && cursor - p > max)
        break;
    }

  return count;
}
81 
/* Locate the final UTF-8 character of the NUL-terminated string P.

   For an empty string P itself is returned.  Otherwise the result is
   whatever unicode_previous_utf8 reports when asked to step back from
   the terminating NUL, with P as the lower bound of the search.  */
char *
unicode_last_utf8 (const char *p)
{
  const char *end;

  if (*p == '\0')
    return (char *) p;

  end = p + strlen (p);
  return unicode_previous_utf8 (p, end);
}
94 
95 
96 char *
unicode_get_utf8(const char * p,unicode_char_t * result)97 unicode_get_utf8 (const char *p, unicode_char_t *result)
98 {
99   int i, mask = 0, len;
100   unsigned char c = (unsigned char) *p;
101 
102   UTF8_COMPUTE (c, mask, len);
103   if (len == -1)
104     return NULL;
105   UTF8_GET (*result, p, i, mask, len);
106   if (*result == (unicode_char_t) -1)
107     return NULL;
108   return (char *) p + len;
109 }
110 
/* unicode_offset_to_index

   Convert a character offset into a byte index: walk OFFSET characters
   forward from SRC (stopping early at the terminating NUL) and return
   how many bytes were passed over.

   The walk ends only when the remaining count hits exactly zero or the
   string ends, so a negative OFFSET runs to the end of the string.  */
size_t
unicode_offset_to_index (const char *src, int offset)
{
  const char *cursor = src;
  int remaining;

  for (remaining = offset; remaining != 0 && *cursor != '\0'; remaining--)
    cursor = unicode_next_utf8 (cursor);

  return cursor - src;
}
123 
/* unicode_index_to_offset

   Convert a byte index into a character offset: count the characters
   of SRC that start before byte position INDEX.  Counting also stops
   at the terminating NUL, so an INDEX beyond the end of the string
   yields the total number of characters.  */
size_t
unicode_index_to_offset (const char *src, int index)
{
  const char *cursor;
  size_t chars = 0;

  for (cursor = src;
       *cursor != '\0' && (cursor - src) < index;
       cursor = unicode_next_utf8 (cursor))
    chars++;

  return chars;
}
139 
/* unicode_strncpy

   Copy at most N characters (not bytes) of the UTF-8 string SRC into
   DEST and NUL-terminate the result.  Unlike strncpy, DEST is always
   terminated and is never zero-padded.

   DEST must have room for the copied bytes plus the terminator; the
   byte count depends on how wide the first N characters of SRC are.

   Returns DEST.  */
char *
unicode_strncpy (char *dest, const char *src, size_t n)
{
  const char *end = src;
  size_t bytes;

  /* Find the byte position just past the Nth character (or the end
     of SRC, whichever comes first).  */
  while (n && *end)
    {
      end = unicode_next_utf8 (end);
      n--;
    }

  /* The length is known exactly and contains no NUL, so a plain
     memcpy is sufficient.  */
  bytes = end - src;
  memcpy (dest, src, bytes);
  dest[bytes] = 0;
  return dest;
}
155 
/* Determine the character set in use and whether it is UTF-8.

   The CHARSET environment variable is consulted first; it counts as
   UTF-8 if it contains the substring "UTF-8".  Otherwise, where
   <langinfo.h> provides a codeset item, nl_langinfo is asked and its
   answer must equal "UTF-8" exactly.

   If A is non-NULL and *A is NULL on entry, *A is set to the first
   charset name found (CHARSET, then nl_langinfo), or to "US-ASCII" if
   neither source supplied one.  An *A that is already non-NULL is left
   alone.  The stored pointer refers to the environment, to
   nl_langinfo's storage, or to a string literal, so the caller must
   not modify or free it.

   Returns 1 if the charset is UTF-8, 0 otherwise.  */
static int
unicode_get_charset_internal (char **a)
{
  char *charset = getenv ("CHARSET");

  if (charset && a && ! *a)
    *a = charset;

  if (charset && strstr (charset, "UTF-8"))
    return 1;

  /* Pick whichever codeset item this platform offers.  Use defined()
     for both: a bare "#elif CODESET" tests the macro's value, which
     wrongly skips the branch if CODESET expands to 0 or to a
     non-macro identifier.  */
#if defined (_NL_CTYPE_CODESET_NAME)
  charset = nl_langinfo (_NL_CTYPE_CODESET_NAME);
#elif defined (CODESET)
  charset = nl_langinfo (CODESET);
#else
  charset = NULL;
#endif

  if (charset)
    {
      if (a && ! *a)
        *a = charset;
      if (strcmp (charset, "UTF-8") == 0)
        return 1;
    }

  if (a && ! *a)
    *a = "US-ASCII";
  /* Assume this for compatibility at present.  */
  return 0;
}
192 
/* Cached answer from unicode_get_charset_internal; -1 means the
   lookup has not been performed yet.  */
static int utf8_locale_cache = -1;
/* Charset name recorded by that same lookup.  */
static char *utf8_charset_cache = NULL;

/* Report the current charset, performing the lookup only on the first
   call and serving later calls from the cache.

   If CHARSET is non-NULL, *CHARSET receives the cached charset name.
   Returns the cached result of unicode_get_charset_internal.  */
int
unicode_get_charset (char **charset)
{
  if (utf8_locale_cache == -1)
    utf8_locale_cache = unicode_get_charset_internal (&utf8_charset_cache);

  if (charset != NULL)
    *charset = utf8_charset_cache;

  return utf8_locale_cache;
}
210 
/* Return the display width of the UTF-8 string P.

   At present this is simply the character count reported by
   unicode_strlen with no byte limit.

   FIXME: characters with the doublewidth property should count for 2,
          and nonspacing combining characters should count for 0.  */
int
unicode_string_width (const char *p)
{
  const int no_limit = -1;

  return unicode_strlen (p, no_limit);
}
221 
/* Write STRING into DEST, made exactly WIDTH characters wide.

   If STRING is narrower than WIDTH, spaces are appended until it is
   WIDTH characters wide.  If it is wider, only its first WIDTH
   characters are copied (none at all when WIDTH is zero or negative).
   The result is always NUL-terminated.

   Widths are measured with unicode_string_width.  DEST must be large
   enough for the bytes written plus the terminator.

   RIGHT is accepted for interface compatibility but is not consulted;
   padding always goes after the text.  */
void
unicode_pad_string (char *dest, int right, int width, const char *string)
{
  int chars = unicode_string_width (string);
  size_t bytes;
  int padding;

  (void) right;

  if (chars > width)
    {
      /* Too wide: keep only the bytes of the first WIDTH characters.
         The cut position must come from the requested width itself,
         not from the (negative) difference between the two widths.  */
      bytes = width > 0 ? unicode_offset_to_index (string, width) : 0;
      padding = 0;
    }
  else
    {
      bytes = strlen (string);
      padding = width - chars;
    }

  /* Copy only what is kept, so a truncated result never writes the
     whole of STRING into DEST first.  */
  memcpy (dest, string, bytes);
  dest += bytes;

  while (padding-- > 0)
    *dest++ = ' ';

  *dest = 0;
}
246 
247 /* unicode_strchr */
248 
249 char *
unicode_strchr(const char * p,unicode_char_t c)250 unicode_strchr(const char *p, unicode_char_t c)
251 {
252   char ch[10];
253   int first, len, i;
254 
255   if (c < 0x80) return strchr(p, c);
256 
257   if (c < 0x800) {
258     first = 0xc0;
259     len = 2;
260   } else if (c < 0x10000)
261     {
262       first = 0xe0;
263       len = 3;
264     }
265   else if (c < 0x200000)
266     {
267       first = 0xf0;
268       len = 4;
269     }
270   else if (c < 0x4000000)
271     {
272       first = 0xf8;
273       len = 5;
274     }
275   else
276     {
277       first = 0xfc;
278       len = 6;
279     }
280 
281   for (i = len - 1; i > 0 ; --i)
282     {
283       ch[i] = (c & 0x3f) | 0x80;
284       c >>= 6;
285     }
286   ch[0] = c | first;
287   ch[len] = 0;
288 
289   return strstr(p, ch);
290 }
291 
#if 0
/* unicode_strrchr
 *
 * Reverse counterpart of unicode_strchr: search the UTF-8 string P
 * for the character C starting from the end.
 *
 * This is compiled out at the moment because the multi-byte case
 * calls strrstr (presumably a reverse strstr), and there is no such
 * function in libc.
 *
 * One needs writing before this block can be enabled.
 */

char *
unicode_strrchr(const char *p, unicode_char_t c)
{
  char ch[10];
  int first, len, i;

  /* Single-byte characters can use the libc routine directly.  */
  if (c < 0x80) return strrchr(p, c);

  /* Choose the encoded length and the lead-byte marker bits from the
     magnitude of C.  */
  if (c < 0x800) {
    first = 0xc0;
    len = 2;
  } else if (c < 0x10000)
    {
      first = 0xe0;
      len = 3;
    }
  else if (c < 0x200000)
    {
      first = 0xf0;
      len = 4;
    }
  else if (c < 0x4000000)
    {
      first = 0xf8;
      len = 5;
    }
  else
    {
      first = 0xfc;
      len = 6;
    }

  /* Fill the continuation bytes from the end, six bits at a time.  */
  for (i = len - 1; i > 0 ; --i)
    {
      ch[i] = (c & 0x3f) | 0x80;
      c >>= 6;
    }
  /* The remaining high bits go into the lead byte.  */
  ch[0] = c | first;
  ch[len] = 0;

  /* strrstr does not exist in libc; see the note at the top.  */
  return strrstr(p, ch);
}
343