1 /*
2  * ustring.c: Unicode string routines
3  */
4 
5 #include <wchar.h>
6 #include <stdlib.h>
7 #include <assert.h>
8 #include <time.h>
9 #include "halibut.h"
10 
ustrdup(wchar_t const * s)11 wchar_t *ustrdup(wchar_t const *s) {
12     wchar_t *r;
13     if (s) {
14 	r = snewn(1+ustrlen(s), wchar_t);
15 	ustrcpy(r, s);
16     } else {
17 	r = snew(wchar_t);
18 	*r = 0;
19     }
20     return r;
21 }
22 
ustrtoa_internal(wchar_t const * s,char * outbuf,int size,int charset,int careful)23 static char *ustrtoa_internal(wchar_t const *s, char *outbuf, int size,
24 			      int charset, int careful) {
25     int len, ret, err;
26     charset_state state = CHARSET_INIT_STATE;
27 
28     if (!s) {
29 	*outbuf = '\0';
30 	return outbuf;
31     }
32 
33     len = ustrlen(s);
34     size--;			       /* leave room for terminating NUL */
35     *outbuf = '\0';
36     while (len > 0) {
37 	err = 0;
38 	ret = charset_from_unicode(&s, &len, outbuf, size, charset, &state,
39 				   (careful ? &err : NULL));
40 	if (err)
41 	    return NULL;
42 	if (!ret)
43 	    return outbuf;
44 	size -= ret;
45 	outbuf += ret;
46 	*outbuf = '\0';
47     }
48     /*
49      * Clean up
50      */
51     ret = charset_from_unicode(NULL, 0, outbuf, size, charset, &state, NULL);
52     size -= ret;
53     outbuf += ret;
54     *outbuf = '\0';
55     return outbuf;
56 }
57 
ustrtoa(wchar_t const * s,char * outbuf,int size,int charset)58 char *ustrtoa(wchar_t const *s, char *outbuf, int size, int charset) {
59     return ustrtoa_internal(s, outbuf, size, charset, FALSE);
60 }
61 
ustrtoa_careful(wchar_t const * s,char * outbuf,int size,int charset)62 char *ustrtoa_careful(wchar_t const *s, char *outbuf, int size, int charset) {
63     return ustrtoa_internal(s, outbuf, size, charset, TRUE);
64 }
65 
ustrfroma(char const * s,wchar_t * outbuf,int size,int charset)66 wchar_t *ustrfroma(char const *s, wchar_t *outbuf, int size, int charset) {
67     int len, ret;
68     charset_state state = CHARSET_INIT_STATE;
69 
70     if (!s) {
71 	*outbuf = L'\0';
72 	return outbuf;
73     }
74 
75     len = strlen(s);
76     size--;			       /* allow for terminating NUL */
77     *outbuf = L'\0';
78     while (len > 0) {
79 	ret = charset_to_unicode(&s, &len, outbuf, size,
80 				 charset, &state, NULL, 0);
81 	if (!ret)
82 	    return outbuf;
83 	outbuf += ret;
84 	size -= ret;
85 	*outbuf = L'\0';
86     }
87     return outbuf;
88 }
89 
utoa_internal_dup(wchar_t const * s,int charset,int * lenp,int careful)90 char *utoa_internal_dup(wchar_t const *s, int charset, int *lenp, int careful)
91 {
92     char *outbuf;
93     int outpos, outlen, len, ret, err;
94     charset_state state = CHARSET_INIT_STATE;
95 
96     if (!s) {
97 	return dupstr("");
98     }
99 
100     len = ustrlen(s);
101 
102     outlen = len + 10;
103     outbuf = snewn(outlen, char);
104 
105     outpos = 0;
106     outbuf[outpos] = '\0';
107 
108     while (len > 0) {
109 	err = 0;
110 	ret = charset_from_unicode(&s, &len,
111 				   outbuf + outpos, outlen - outpos - 1,
112 				   charset, &state, (careful ? &err : NULL));
113 	if (err) {
114 	    sfree(outbuf);
115 	    return NULL;
116 	}
117 	if (!ret) {
118 	    outlen = outlen * 3 / 2;
119 	    outbuf = sresize(outbuf, outlen, char);
120 	}
121 	outpos += ret;
122 	outbuf[outpos] = '\0';
123     }
124     /*
125      * Clean up
126      */
127     outlen = outpos + 32;
128     outbuf = sresize(outbuf, outlen, char);
129     ret = charset_from_unicode(NULL, 0,
130 			       outbuf + outpos, outlen - outpos + 1,
131 			       charset, &state, NULL);
132     outpos += ret;
133     outbuf[outpos] = '\0';
134     if (lenp)
135 	*lenp = outpos;
136     return outbuf;
137 }
138 
utoa_dup(wchar_t const * s,int charset)139 char *utoa_dup(wchar_t const *s, int charset)
140 {
141     return utoa_internal_dup(s, charset, NULL, FALSE);
142 }
143 
utoa_dup_len(wchar_t const * s,int charset,int * len)144 char *utoa_dup_len(wchar_t const *s, int charset, int *len)
145 {
146     return utoa_internal_dup(s, charset, len, FALSE);
147 }
148 
utoa_careful_dup(wchar_t const * s,int charset)149 char *utoa_careful_dup(wchar_t const *s, int charset)
150 {
151     return utoa_internal_dup(s, charset, NULL, TRUE);
152 }
153 
ufroma_dup(char const * s,int charset)154 wchar_t *ufroma_dup(char const *s, int charset) {
155     int len;
156     wchar_t *buf = NULL;
157 
158     len = strlen(s) + 1;
159     do {
160 	buf = sresize(buf, len, wchar_t);
161 	ustrfroma(s, buf, len, charset);
162 	len = (3 * len) / 2 + 1;       /* this guarantees a strict increase */
163     } while (ustrlen(buf) >= len-1);
164 
165     buf = sresize(buf, ustrlen(buf)+1, wchar_t);
166     return buf;
167 }
168 
utoa_locale_dup(wchar_t const * s)169 char *utoa_locale_dup(wchar_t const *s)
170 {
171     /*
172      * This variant uses the C library locale.
173      */
174     char *ret;
175     int len, outlen;
176     size_t siz;
177 
178     len = ustrlen(s);
179 
180     outlen = 1 + MB_CUR_MAX * len;
181     ret = snewn(outlen+1, char);
182 
183     siz = wcstombs(ret, s, outlen);
184 
185     if (siz) {
186 	assert(siz <= (size_t)(outlen));
187 	ret[siz] = '\0';
188 	ret = sresize(ret, siz+1, char);
189 	return ret;
190     }
191 
192     /*
193      * If that failed, try a different strategy (which we will also
194      * attempt in the total absence of wcstombs). Retrieve the
195      * locale's charset from nl_langinfo or equivalent, and use
196      * normal utoa_dup.
197      */
198     return utoa_dup(s, charset_from_locale());
199 }
200 
ufroma_locale_dup(char const * s)201 wchar_t *ufroma_locale_dup(char const *s)
202 {
203     /*
204      * This variant uses the C library locale.
205      */
206     wchar_t *ret;
207     int len, outlen;
208     size_t siz;
209 
210     len = strlen(s);
211 
212     outlen = 1 + 2*len;
213     ret = snewn(outlen+1, wchar_t);  /* be conservative */
214 
215     siz = mbstowcs(ret, s, outlen);
216 
217     if (siz) {
218 	assert(siz <= (size_t)(outlen));
219 	ret[siz] = L'\0';
220 	ret = sresize(ret, siz+1, wchar_t);
221 	return ret;
222     }
223 
224     /*
225      * If that failed, try a different strategy (which we will also
226      * attempt in the total absence of wcstombs). Retrieve the
227      * locale's charset from nl_langinfo or equivalent, and use
228      * normal ufroma_dup.
229      */
230     return ufroma_dup(s, charset_from_locale());
231 }
232 
/*
 * Return the number of wide characters in `s' before its
 * terminating L'\0'. `s' must not be NULL.
 */
int ustrlen(wchar_t const *s) {
    wchar_t const *end = s;

    while (*end != L'\0')
        end++;
    return (int)(end - s);
}
238 
/*
 * Step over the wide string starting at `s' and its terminator,
 * returning a pointer to the element immediately after the L'\0'.
 */
wchar_t *uadv(wchar_t *s) {
    int skip = ustrlen(s) + 1;         /* characters plus terminator */

    return s + skip;
}
242 
/*
 * Copy the wide string `source', including its terminating L'\0',
 * into `dest'. Returns `dest'.
 */
wchar_t *ustrcpy(wchar_t *dest, wchar_t const *source) {
    wchar_t *out = dest;

    /* The assignment copies the terminator too before the loop ends. */
    while ((*out = *source) != L'\0') {
        out++;
        source++;
    }
    return dest;
}
250 
/*
 * Copy at most `n' wide characters from `source' into `dest', in
 * the manner of strncpy:
 *
 *  - exactly `n' elements of `dest' are written, never more;
 *  - if `source' is shorter than `n', the remaining elements are
 *    filled with L'\0';
 *  - if `source' has `n' or more characters, `dest' is NOT
 *    terminated;
 *  - nothing is written when `n' is zero or negative.
 *
 * Returns `dest'.
 */
wchar_t *ustrncpy(wchar_t *dest, wchar_t const *source, int n) {
    int i;

    for (i = 0; i < n; i++) {
        dest[i] = *source;
        /* Once the terminator is reached, keep re-copying it as padding. */
        if (*source)
            source++;
    }
    return dest;
}
259 
/*
 * Compare two wide strings element by element. Returns -1, 0 or +1
 * according to whether `lhs' sorts before, equal to, or after
 * `rhs'.
 *
 * NULL pointers are accepted: two NULLs compare equal, and a NULL
 * sorts before any non-NULL string.
 */
int ustrcmp(wchar_t *lhs, wchar_t *rhs) {
    if (!lhs)
        return rhs ? -1 : 0;
    if (!rhs)
        return +1;

    /*
     * Advance over the common prefix. If *lhs is nonzero and equal
     * to *rhs then *rhs is nonzero too, so one test suffices.
     */
    for (; *lhs && *lhs == *rhs; lhs++, rhs++)
        continue;

    if (*lhs == *rhs)
        return 0;
    return (*lhs < *rhs) ? -1 : 1;
}
272 
/*
 * Return the lower-case form of `c'.
 *
 * With HAS_TOWLOWER defined this defers to towlower(); otherwise
 * only the ASCII letters 'A'..'Z' are mapped and everything else is
 * returned unchanged. L'\0' is always returned as itself.
 */
wchar_t utolower(wchar_t c) {
    /* NUL must map to NUL: this property is needed by ustricmp */
    if (c == L'\0')
        return c;
#ifdef HAS_TOWLOWER
    return towlower(c);
#else
    return (c >= 'A' && c <= 'Z') ? c + ('a'-'A') : c;
#endif
}
284 
/*
 * Return nonzero if `c' is alphabetic.
 *
 * With HAS_ISWALPHA defined this defers to iswalpha(); otherwise
 * only the ASCII letters count, and the result is exactly 0 or 1.
 */
int uisalpha(wchar_t c) {
#ifdef HAS_ISWALPHA
    return iswalpha(c);
#else
    if (c >= 'A' && c <= 'Z')
        return 1;
    if (c >= 'a' && c <= 'z')
        return 1;
    return 0;
#endif
}
292 
/*
 * Case-insensitive comparison of two wide strings: each pair of
 * characters is passed through utolower() before being compared.
 * Returns -1, 0 or +1. Neither argument may be NULL.
 */
int ustricmp(wchar_t const *lhs, wchar_t const *rhs) {
    for (;; lhs++, rhs++) {
        wchar_t l = utolower(*lhs);
        wchar_t r = utolower(*rhs);

        if (l != r)
            return (l < r) ? -1 : 1;
        if (!l)
            return 0;                  /* both strings ended together */
    }
}
304 
/*
 * Case-insensitive comparison of at most `maxlen' leading
 * characters of two wide strings, using utolower() on each
 * character. Returns -1, 0 or +1.
 *
 * If `maxlen' is zero or negative nothing is examined and the
 * result is 0.
 */
int ustrnicmp(wchar_t const *lhs, wchar_t const *rhs, int maxlen) {
    wchar_t l = 0, r = 0;
    int i;

    for (i = 0; i < maxlen; i++) {
        l = utolower(lhs[i]);
        r = utolower(rhs[i]);
        /* Stop at the first difference or at a shared terminator. */
        if (l != r || l == L'\0')
            break;
    }

    if (l == r)
        return 0;
    return (l < r) ? -1 : 1;
}
317 
/*
 * Convert the wide string `s' to lower case in place, applying
 * utolower() to each character. Returns `s'.
 */
wchar_t *ustrlow(wchar_t *s) {
    wchar_t *cursor;

    for (cursor = s; *cursor; cursor++)
        *cursor = utolower(*cursor);
    return s;
}
326 
/*
 * Parse a decimal integer from the start of the wide string `s'.
 *
 * An optional single leading '-' is accepted, followed by as many
 * ASCII digits as are present; parsing stops at the first
 * non-digit. A string with no digits gives 0. No overflow checking
 * is done.
 */
int utoi(wchar_t const *s) {
    int negative = (*s == L'-');
    int value = 0;

    if (negative)
        s++;

    for (; *s >= L'0' && *s <= L'9'; s++) {
        value *= 10;
        value += (*s - '0');
    }

    return negative ? -value : value;
}
345 
utof(wchar_t const * s)346 double utof(wchar_t const *s)
347 {
348     char *cs = utoa_dup(s, CS_ASCII);
349     double ret = atof(cs);
350     sfree(cs);
351     return ret;
352 }
353 
utob(wchar_t const * s)354 int utob(wchar_t const *s) {
355     if (!ustricmp(s, L"yes") || !ustricmp(s, L"y") ||
356 	!ustricmp(s, L"true") || !ustricmp(s, L"t"))
357 	return TRUE;
358     return FALSE;
359 }
360 
/*
 * Return 1 if `c' is one of the ASCII digits '0'..'9', else 0.
 */
int uisdigit(wchar_t c) {
    if (c < L'0')
        return 0;
    return c <= L'9';
}
364 
365 #define USTRFTIME_DELTA 128
ustrftime_internal(rdstring * rs,char formatchr,const struct tm * timespec)366 static void ustrftime_internal(rdstring *rs, char formatchr,
367 			       const struct tm *timespec)
368 {
369     /*
370      * strftime has the entertaining property that it returns 0
371      * _either_ on out-of-space _or_ on successful generation of
372      * the empty string. Hence we must ensure our format can never
373      * generate the empty string. Somebody throw a custard pie at
374      * whoever was responsible for that. Please?
375      */
376 
377 #ifdef HAS_WCSFTIME
378     wchar_t *buf = NULL;
379     wchar_t fmt[4];
380     int size, ret;
381 
382     fmt[0] = L' ';
383     fmt[1] = L'%';
384     /* Format chars are all ASCII, so conversion to Unicode is no problem */
385     fmt[2] = formatchr;
386     fmt[3] = L'\0';
387 
388     size = 0;
389     do {
390 	size += USTRFTIME_DELTA;
391 	buf = sresize(buf, size, wchar_t);
392 	ret = (int) wcsftime(buf, size, fmt, timespec);
393     } while (ret == 0);
394 
395     rdadds(rs, buf+1);
396     sfree(buf);
397 #else
398     char *buf = NULL;
399     wchar_t *cvtbuf;
400     char fmt[4];
401     int size, ret;
402 
403     fmt[0] = ' ';
404     fmt[1] = '%';
405     fmt[2] = formatchr;
406     fmt[3] = '\0';
407 
408     size = 0;
409     do {
410 	size += USTRFTIME_DELTA;
411 	buf = sresize(buf, size, char);
412 	ret = (int) strftime(buf, size, fmt, timespec);
413     } while (ret == 0);
414 
415     cvtbuf = ufroma_locale_dup(buf+1);
416     rdadds(rs, cvtbuf);
417     sfree(cvtbuf);
418     sfree(buf);
419 #endif
420 }
421 
ustrftime(const wchar_t * wfmt,const struct tm * timespec)422 wchar_t *ustrftime(const wchar_t *wfmt, const struct tm *timespec)
423 {
424     rdstring rs = { 0, 0, NULL };
425 
426     if (!wfmt)
427 	wfmt = L"%c";
428 
429     while (*wfmt) {
430 	if (wfmt[0] == L'%' && wfmt[1] == L'%') {
431 	    rdadd(&rs, L'%');
432 	    wfmt += 2;
433 	} else if (wfmt[0] == L'%' && wfmt[1]) {
434 	    ustrftime_internal(&rs, wfmt[1], timespec);
435 	    wfmt += 2;
436 	} else {
437 	    rdadd(&rs, wfmt[0]);
438 	    wfmt++;
439 	}
440     }
441 
442     return rdtrim(&rs);
443 }
444 
445 /*
446  * Determine whether a Unicode string can be translated into a
447  * given charset without any missing characters.
448  */
cvt_ok(int charset,const wchar_t * s)449 int cvt_ok(int charset, const wchar_t *s)
450 {
451     char buf[256];
452     charset_state state = CHARSET_INIT_STATE;
453     int err, len = ustrlen(s);
454 
455     err = 0;
456     while (len > 0) {
457 	(void)charset_from_unicode(&s, &len, buf, lenof(buf),
458 				   charset, &state, &err);
459 	if (err)
460 	    return FALSE;
461     }
462     return TRUE;
463 }
464 
465 /*
466  * Wrapper around charset_from_localenc which accepts the charset
467  * name as a wide string (since that happens to be more useful).
468  * Also throws a Halibut error and falls back to CS_ASCII if the
469  * charset is unrecognised, meaning the rest of the program can
470  * rely on always getting a valid charset id back from this
471  * function.
472  */
charset_from_ustr(filepos * fpos,const wchar_t * name)473 int charset_from_ustr(filepos *fpos, const wchar_t *name)
474 {
475     char *csname;
476     int charset;
477 
478     csname = utoa_dup(name, CS_ASCII);
479     charset = charset_from_localenc(csname);
480 
481     if (charset == CS_NONE) {
482 	charset = CS_ASCII;
483 	err_charset(fpos, name);
484     }
485 
486     sfree(csname);
487     return charset;
488 }
489