xref: /netbsd/external/bsd/nvi/dist/common/conv.c (revision 822d8d66)
1 /*	$NetBSD: conv.c,v 1.11 2019/10/24 18:17:14 kamil Exp $ */
2 /*-
3  * Copyright (c) 1993, 1994
4  *	The Regents of the University of California.  All rights reserved.
5  * Copyright (c) 1993, 1994, 1995, 1996
6  *	Keith Bostic.  All rights reserved.
7  *
8  * See the LICENSE file for redistribution information.
9  */
10 
11 #include "config.h"
12 
13 #include <sys/cdefs.h>
14 #if 0
15 #ifndef lint
16 static const char sccsid[] = "Id: conv.c,v 1.27 2001/08/18 21:41:41 skimo Exp  (Berkeley) Date: 2001/08/18 21:41:41 ";
17 #endif /* not lint */
18 #else
19 __RCSID("$NetBSD: conv.c,v 1.11 2019/10/24 18:17:14 kamil Exp $");
20 #endif
21 
22 #include <sys/types.h>
23 #include <sys/queue.h>
24 #include <sys/time.h>
25 
26 #include <bitstring.h>
27 #include <errno.h>
28 #include <limits.h>
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
32 #include <unistd.h>
33 
34 #include "common.h"
35 
36 #if defined(USE_WIDECHAR) && defined(USE_ICONV)
37 #include <langinfo.h>
38 #include <iconv.h>
39 
40 #define LANGCODESET	nl_langinfo(CODESET)
41 #else
42 #define LANGCODESET	""
43 #endif
44 
45 #include <locale.h>
46 
47 #ifdef USE_WIDECHAR
48 #ifdef USE_ICONV
49 static int
raw2int(SCR * sp,const char * str,ssize_t len,CONVWIN * cw,size_t * tolen,const CHAR_T ** dst)50 raw2int(SCR *sp, const char * str, ssize_t len, CONVWIN *cw, size_t *tolen,
51 	const CHAR_T **dst)
52 {
53     int i;
54     CHAR_T **tostr = (CHAR_T **)(void *)&cw->bp1;
55     size_t  *blen = &cw->blen1;
56 
57     BINC_RETW(NULL, *tostr, *blen, len);
58 
59     *tolen = len;
60     for (i = 0; i < len; ++i) {
61 	CHAR_T w = (u_char)str[i];
62 	memcpy((*tostr) + i, &w, sizeof(**tostr));
63     }
64 
65     *dst = cw->bp1;
66 
67     return 0;
68 }
69 #endif
70 
/*
 * Conversion-error policy.
 *
 * Without ERROR_ON_CONVERT a failed conversion is papered over:
 *   HANDLE_ICONV_ERROR copies one unit from *i to *o, advances both
 *	pointers and decrements both remaining-counts (ol, il);
 *   HANDLE_MBR_ERROR stores s in d, zeroes the mbstate_t mbs and sets
 *	n to 1.
 *
 * With ERROR_ON_CONVERT both jump to the caller's `err' label.  The macros
 * stay function-like in that configuration too, because every call site
 * passes arguments; an object-like definition would expand to
 * `goto err(...)', which does not compile.
 */
#ifndef ERROR_ON_CONVERT
#define HANDLE_ICONV_ERROR(o, i, ol, il) do {				\
		*(o)++ = *(i)++;					\
		(ol)--; (il)--;						\
	} while (/*CONSTCOND*/0)
#define HANDLE_MBR_ERROR(n, mbs, d, s) do {				\
		(d) = (s);						\
		MEMSET(&(mbs), 0, 1); 					\
		(n) = 1; 						\
	} while (/*CONSTCOND*/0)
#else
#define HANDLE_ICONV_ERROR(o, i, ol, il)	goto err
#define	HANDLE_MBR_ERROR(n, mbs, d, s)		goto err
#endif
85 
#define CONV_BUFFER_SIZE    512
/*
 * CONVERT --
 *	Run iconv on the bytes at str and stage the output in the caller's
 *	`buffer' array (CONV_BUFFER_SIZE bytes).
 *
 *	str/left  input pointer and remaining byte count; both are advanced
 *		  by iconv (and by the error handler).
 *	src	  set to `buffer' when output was produced.
 *	len	  set to the number of bytes placed in `buffer'.
 *
 * The expansion relies on `id' (iconv_t), `buffer', `error' and an `err'
 * label existing in the expanding function.  If nothing at all could be
 * produced, `error' is set to -left and control jumps to `err'.
 *
 * The error handler is skipped when the staging buffer is completely full
 * (outleft == 0): it stores one byte through bp, which would land one past
 * the end of `buffer'.  In that case the full buffer is handed back and the
 * offending input is met again on the next CONVERT, with room available.
 */
#ifdef USE_ICONV
#define CONVERT(str, left, src, len)				    	\
    do {								\
	size_t outleft;							\
	char *bp = buffer;						\
	outleft = CONV_BUFFER_SIZE;					\
	errno = 0;							\
	if (iconv(id, (char **)(void *)&str, &left, &bp, &outleft) 	\
	    == (size_t)-1 && errno != E2BIG && outleft != 0)		\
		HANDLE_ICONV_ERROR(bp, str, outleft, left);		\
	if ((len = CONV_BUFFER_SIZE - outleft) == 0) {			\
	    error = -left;						\
	    goto err;							\
	}				    				\
	src = buffer;							\
    } while (0)
#endif
108 
109 static int
default_char2int(SCR * sp,const char * str,ssize_t len,CONVWIN * cw,size_t * tolen,const CHAR_T ** dst,const char * enc)110 default_char2int(SCR *sp, const char * str, ssize_t len, CONVWIN *cw,
111 		size_t *tolen, const CHAR_T **dst, const char *enc)
112 {
113     int j;
114     size_t i = 0;
115     CHAR_T **tostr = (CHAR_T **)(void *)&cw->bp1;
116     size_t  *blen = &cw->blen1;
117     mbstate_t mbs;
118     size_t   n;
119     ssize_t  nlen = len;
120     const char *src = (const char *)str;
121     int		error = 1;
122 #ifdef USE_ICONV
123     iconv_t	id = (iconv_t)-1;
124     char	buffer[CONV_BUFFER_SIZE];
125     size_t	left = len;
126 #endif
127 
128     MEMSET(&mbs, 0, 1);
129     BINC_RETW(NULL, *tostr, *blen, nlen);
130 
131 #ifdef USE_ICONV
132     if (strcmp(nl_langinfo(CODESET), enc)) {
133 	id = iconv_open(nl_langinfo(CODESET), enc);
134 	if (id == (iconv_t)-1)
135 	    goto err;
136 	CONVERT(str, left, src, len);
137     }
138 #endif
139 
140     for (i = 0, j = 0; j < len; ) {
141 	CHAR_T w;
142 	n = mbrtowc(&w, src + j, len - j, &mbs);
143 	memcpy((*tostr) + i, &w, sizeof(**tostr));
144 	/* NULL character converted */
145 	if (n == (size_t)-2) error = -(len - j);
146 	if (n == (size_t)-1 || n == (size_t)-2) {
147 	    HANDLE_MBR_ERROR(n, mbs, w, src[j]);
148 	    memcpy((*tostr) + i, &w, sizeof(**tostr));
149 	}
150 	if (n == 0) n = 1;
151 	j += n;
152 	if (++i >= *blen) {
153 	    nlen += 256;
154 	    BINC_GOTOW(NULL, *tostr, *blen, nlen);
155 	}
156 #ifdef USE_ICONV
157 	if (id != (iconv_t)-1 && j == len && left) {
158 	    CONVERT(str, left, src, len);
159 	    j = 0;
160 	}
161 #endif
162     }
163     *tolen = i;
164 
165 #ifdef USE_ICONV
166     if (id != (iconv_t)-1)
167 	iconv_close(id);
168 #endif
169 
170     *dst = cw->bp1;
171 
172     return 0;
173 alloc_err:
174 #ifdef USE_ICONV
175 err:
176     if (id != (iconv_t)-1)
177 	iconv_close(id);
178 #endif
179     *tolen = i;
180     *dst = cw->bp1;
181 
182     return error;
183 }
184 
185 static int
fe_char2int(SCR * sp,const char * str,ssize_t len,CONVWIN * cw,size_t * tolen,const CHAR_T ** dst)186 fe_char2int(SCR *sp, const char * str, ssize_t len, CONVWIN *cw,
187 	    size_t *tolen, const CHAR_T **dst)
188 {
189     return default_char2int(sp, str, len, cw, tolen, dst, O_STR(sp, O_FILEENCODING));
190 }
191 
192 static int
ie_char2int(SCR * sp,const char * str,ssize_t len,CONVWIN * cw,size_t * tolen,const CHAR_T ** dst)193 ie_char2int(SCR *sp, const char * str, ssize_t len, CONVWIN *cw,
194 	    size_t *tolen, const CHAR_T **dst)
195 {
196     return default_char2int(sp, str, len, cw, tolen, dst, O_STR(sp, O_INPUTENCODING));
197 }
198 
199 static int
cs_char2int(SCR * sp,const char * str,ssize_t len,CONVWIN * cw,size_t * tolen,const CHAR_T ** dst)200 cs_char2int(SCR *sp, const char * str, ssize_t len, CONVWIN *cw,
201 	    size_t *tolen, const CHAR_T **dst)
202 {
203     return default_char2int(sp, str, len, cw, tolen, dst, LANGCODESET);
204 }
205 
206 #ifdef USE_ICONV
207 static int
CHAR_T_int2char(SCR * sp,const CHAR_T * str,ssize_t len,CONVWIN * cw,size_t * tolen,const char ** dst)208 CHAR_T_int2char(SCR *sp, const CHAR_T * str, ssize_t len, CONVWIN *cw,
209 	size_t *tolen, const char **dst)
210 {
211     *tolen = len * sizeof(CHAR_T);
212     *dst = (const char *)(const void *)str;
213 
214     return 0;
215 }
216 
217 static int
CHAR_T_char2int(SCR * sp,const char * str,ssize_t len,CONVWIN * cw,size_t * tolen,const CHAR_T ** dst)218 CHAR_T_char2int(SCR *sp, const char * str, ssize_t len, CONVWIN *cw,
219 	size_t *tolen, const CHAR_T **dst)
220 {
221     *tolen = len / sizeof(CHAR_T);
222     *dst = (const CHAR_T*) str;
223 
224     return 0;
225 }
226 
227 static int
int2raw(SCR * sp,const CHAR_T * str,ssize_t len,CONVWIN * cw,size_t * tolen,const char ** dst)228 int2raw(SCR *sp, const CHAR_T * str, ssize_t len, CONVWIN *cw, size_t *tolen,
229 	const char **dst)
230 {
231     int i;
232     char **tostr = (char **)(void *)&cw->bp1;
233     size_t  *blen = &cw->blen1;
234 
235     BINC_RETC(NULL, *tostr, *blen, len);
236 
237     *tolen = len;
238     for (i = 0; i < len; ++i) {
239 	CHAR_T w;
240 	memcpy(&w, str + i, sizeof(w));
241 	(*tostr)[i] = w;
242     }
243 
244     *dst = cw->bp1;
245 
246     return 0;
247 }
248 #endif
249 
250 static int
default_int2char(SCR * sp,const CHAR_T * str,ssize_t len,CONVWIN * cw,size_t * tolen,const char ** pdst,const char * enc)251 default_int2char(SCR *sp, const CHAR_T * str, ssize_t len, CONVWIN *cw,
252 		size_t *tolen, const char **pdst, const char *enc)
253 {
254     size_t i, j = 0;
255     char **tostr = (char **)(void *)&cw->bp1;
256     size_t  *blen = &cw->blen1;
257     mbstate_t mbs;
258     size_t n;
259     ssize_t  nlen = len + MB_CUR_MAX;
260     char *dst;
261     size_t buflen;
262 #ifdef USE_ICONV
263     int		offset = 0;
264     char	buffer[CONV_BUFFER_SIZE];
265     iconv_t	id = (iconv_t)-1;
266 #endif
267 
268 /* convert first len bytes of buffer and append it to cw->bp
269  * len is adjusted => 0
270  * offset contains the offset in cw->bp and is adjusted
271  * cw->bp is grown as required
272  */
273 #ifdef USE_ICONV
274 #define CONVERT2(_buffer, lenp, cw, offset)				\
275     do {								\
276 	const char *bp = _buffer;					\
277 	size_t ret;							\
278 	do {								\
279 	    size_t outleft = cw->blen1 - offset;			\
280 	    char *obp = (char *)cw->bp1 + offset;		    	\
281 	    if (cw->blen1 < offset + MB_CUR_MAX) {		    	\
282 		nlen += 256;						\
283 		BINC_GOTOC(NULL, cw->bp1, cw->blen1, nlen);		\
284 	    }						    		\
285 	    errno = 0;						    	\
286 	    ret = iconv(id, (char **)(void *)&bp, lenp, &obp, &outleft);\
287 	    if (ret == (size_t)-1 && errno != E2BIG) 			\
288 		    HANDLE_ICONV_ERROR(obp, bp, outleft, len);		\
289 	    offset = cw->blen1 - outleft;			        \
290 	} while (ret != 0);					        \
291     } while (0)
292 #endif
293 
294     MEMSET(&mbs, 0, 1);
295     BINC_RETC(NULL, *tostr, *blen, nlen);
296     dst = *tostr; buflen = *blen;
297 
298 #ifdef USE_ICONV
299     if (strcmp(nl_langinfo(CODESET), enc)) {
300 	id = iconv_open(enc, nl_langinfo(CODESET));
301 	if (id == (iconv_t)-1)
302 	    goto err;
303 	dst = buffer; buflen = CONV_BUFFER_SIZE;
304     }
305 #endif
306 
307     for (i = 0, j = 0; i < (size_t)len; ++i) {
308 	CHAR_T w;
309 	memcpy(&w, str + i, sizeof(w));
310 	n = wcrtomb(dst + j, w, &mbs);
311 	if (n == (size_t)-1)
312 	   HANDLE_MBR_ERROR(n, mbs, dst[j], w);
313 	j += n;
314 	if (buflen < j + MB_CUR_MAX) {
315 #ifdef USE_ICONV
316 	    if (id != (iconv_t)-1) {
317 		CONVERT2(buffer, &j, cw, offset);
318 	    } else
319 #endif
320 	    {
321 		nlen += 256;
322 		BINC_RETC(NULL, *tostr, *blen, nlen);
323 		dst = *tostr; buflen = *blen;
324 	    }
325 	}
326     }
327 
328     n = wcrtomb(dst + j, L'\0', &mbs);
329     j += n - 1;				/* don't count NUL at the end */
330     *tolen = j;
331 
332 #ifdef USE_ICONV
333     if (id != (iconv_t)-1) {
334 	CONVERT2(buffer, &j, cw, offset);
335 	CONVERT2(NULL, NULL, cw, offset);  /* back to the initial state */
336 	*tolen = offset;
337 	iconv_close(id);
338     }
339 #endif
340 
341     *pdst = cw->bp1;
342 
343     return 0;
344 #ifdef USE_ICONV
345 alloc_err:
346 err:
347     if (id != (iconv_t)-1)
348 	iconv_close(id);
349     *tolen = j;
350     *pdst = cw->bp1;
351 
352     return 1;
353 #endif
354 }
355 
356 static int
fe_int2char(SCR * sp,const CHAR_T * str,ssize_t len,CONVWIN * cw,size_t * tolen,const char ** dst)357 fe_int2char(SCR *sp, const CHAR_T * str, ssize_t len, CONVWIN *cw,
358 	    size_t *tolen, const char **dst)
359 {
360     return default_int2char(sp, str, len, cw, tolen, dst, O_STR(sp, O_FILEENCODING));
361 }
362 
363 static int
cs_int2char(SCR * sp,const CHAR_T * str,ssize_t len,CONVWIN * cw,size_t * tolen,const char ** dst)364 cs_int2char(SCR *sp, const CHAR_T * str, ssize_t len, CONVWIN *cw,
365 	    size_t *tolen, const char **dst)
366 {
367     return default_int2char(sp, str, len, cw, tolen, dst, LANGCODESET);
368 }
369 
370 #endif
371 
372 
373 void
conv_init(SCR * orig,SCR * sp)374 conv_init (SCR *orig, SCR *sp)
375 {
376     if (orig != NULL)
377 	MEMCPY(&sp->conv, &orig->conv, 1);
378     else {
379 	setlocale(LC_ALL, "");
380 #ifdef USE_WIDECHAR
381 	sp->conv.sys2int = cs_char2int;
382 	sp->conv.int2sys = cs_int2char;
383 	sp->conv.file2int = fe_char2int;
384 	sp->conv.int2file = fe_int2char;
385 	sp->conv.input2int = ie_char2int;
386 #ifdef USE_ICONV
387 	o_set(sp, O_FILEENCODING, OS_STRDUP, nl_langinfo(CODESET), 0);
388 	o_set(sp, O_INPUTENCODING, OS_STRDUP, nl_langinfo(CODESET), 0);
389 #endif
390 #endif
391     }
392 }
393 
394 int
conv_enc(SCR * sp,int option,const char * enc)395 conv_enc (SCR *sp, int option, const char *enc)
396 {
397 #if defined(USE_WIDECHAR) && defined(USE_ICONV)
398     iconv_t id;
399     char2wchar_t    *c2w;
400     wchar2char_t    *w2c;
401 
402     switch (option) {
403     case O_FILEENCODING:
404 	c2w = &sp->conv.file2int;
405 	w2c = &sp->conv.int2file;
406 	break;
407     case O_INPUTENCODING:
408 	c2w = &sp->conv.input2int;
409 	w2c = NULL;
410 	break;
411     default:
412 	c2w = NULL;
413 	w2c = NULL;
414 	break;
415     }
416 
417     if (!*enc) {
418 	if (c2w) *c2w = raw2int;
419 	if (w2c) *w2c = int2raw;
420 	return 0;
421     }
422 
423     if (!strcmp(enc, "WCHAR_T")) {
424 	if (c2w) *c2w = CHAR_T_char2int;
425 	if (w2c) *w2c = CHAR_T_int2char;
426 	return 0;
427     }
428 
429     id = iconv_open(enc, nl_langinfo(CODESET));
430     if (id == (iconv_t)-1)
431 	goto err;
432     iconv_close(id);
433     id = iconv_open(nl_langinfo(CODESET), enc);
434     if (id == (iconv_t)-1)
435 	goto err;
436     iconv_close(id);
437 
438     switch (option) {
439     case O_FILEENCODING:
440 	*c2w = fe_char2int;
441 	*w2c = fe_int2char;
442 	break;
443     case O_INPUTENCODING:
444 	*c2w = ie_char2int;
445 	break;
446     }
447 
448     F_CLR(sp, SC_CONV_ERROR);
449     F_SET(sp, SC_SCR_REFORMAT);
450 
451     return 0;
452 err:
453     switch (option) {
454     case O_FILEENCODING:
455 	msgq(sp, M_ERR,
456 	    "321|File encoding conversion not supported");
457 	break;
458     case O_INPUTENCODING:
459 	msgq(sp, M_ERR,
460 	    "322|Input encoding conversion not supported");
461 	break;
462     }
463 #endif
464     return 1;
465 }
466