1 #if !defined(lint) && !defined(DOS)
2 static char rcsid[] = "$Id: utf8.c 1184 2008-12-16 23:52:15Z hubert@u.washington.edu $";
3 #endif
4 
5 /*
6  * ========================================================================
7  * Copyright 2013-2021 Eduardo Chappa
8  * Copyright 2006-2008 University of Washington
9  *
10  * Licensed under the Apache License, Version 2.0 (the "License");
11  * you may not use this file except in compliance with the License.
12  * You may obtain a copy of the License at
13  *
14  *     http://www.apache.org/licenses/LICENSE-2.0
15  *
16  * ========================================================================
17  */
18 
19 
20 /* includable WITHOUT dependency on c-client */
21 #include "../../c-client/mail.h"
22 #include "../../c-client/utf8.h"
23 
24 #ifdef _WINDOWS
25 /* wingdi.h uses ERROR (!) and we aren't using the c-client ERROR so... */
26 #undef ERROR
27 #else
28 #define _XOPEN_SOURCE
29 #endif
30 
31 #include <system.h>
32 
33 #include "../../c-client/fs.h"
34 
35 /* includable WITHOUT dependency on pico */
36 #include "../../pico/keydefs.h"
37 
38 #include "../osdep/collate.h"
39 #include "../filttype.h"
40 
41 #include "utf8.h"
42 
43 #include <stdarg.h>
44 
45 
46 unsigned single_width_chars_a_to_b(UCS *, int, int);
47 
48 
/*
 * Name of the locale's charset as recorded by set_locale_charmap().
 * Consulted by convert_to_utf8() as one of its candidate source charsets.
 */
static char locale_charmap[50];

/* Non-zero when the display is UTF-8 capable; set by init_utf8_display(). */
static int   native_utf8;
/*
 * Table handed to init_utf8_display(); wtomb() passes it (cast to
 * unsigned short *) to ucs4_rmaplen()/ucs4_rmapbuf() when the display
 * is not UTF-8 capable.
 */
static void *display_data;
53 
54 void
init_utf8_display(int utf8,void * rmap)55 init_utf8_display(int utf8, void *rmap)
56 {
57     native_utf8 = utf8;
58     display_data = rmap;
59 }
60 
61 
62 /*
63  * Argument is a UCS-4 wide character.
64  * Returns the environment dependent cell width of the
65  * character when printed to the screen.
66  * This will be -1 if the character is not printable.
67  * It will be >= zero if it is printable.
68  *
69  * Note that in the case it is not printable but it is still sent to
70  * Writechar, Writechar will print a '?' with width 1.
71  */
72 int
wcellwidth(UCS ucs)73 wcellwidth(UCS ucs)
74 {
75     char dummy[32];
76     long w;
77 
78     /*
79      * We believe that on modern unix systems wchar_t is a UCS-4 character.
80      * That's the assumption here.
81      */
82 
83     if(native_utf8){			/* display is UTF-8 capable */
84 	w = ucs4_width((unsigned long) ucs);
85 	return((w & U4W_ERROR) ? -1 : w);
86     }
87     else if(display_data){
88 	if(wtomb(dummy, ucs) < 0)
89 	  return(-1);
90 	else{
91 	    w = ucs4_width((unsigned long) ucs);
92 	    return((w & U4W_ERROR) ? -1 : w);
93 	}
94     }
95 #if !defined(_WINDOWS) && HAVE_WCWIDTH
96     else
97       return(wcwidth((wchar_t) ucs));
98 #else
99     return(0);
100 #endif
101 }
102 
103 /* ambiguous width zone character function. We use the Windows code until
104  * we find a better way to do it in general.
105  */
106 int
pith_ucs4width(UCS ucs)107 pith_ucs4width(UCS ucs)
108 {
109   return (ucs >= 0x2100) ? 2 : 1;
110 #if !defined(_WINDOWS) && HAVE_WCWIDTH
111   return wcwidth((wchar_t) ucs);
112 #else
113   return (ucs >= 0x2100) ? 2 : 1;
114 #endif /* _WINDOWS */
115 }
116 
117 /*
118  * Argument is a UCS-4 wide character.
119  * It is converted to the multibyte version (for example UTF8 or EUC-JP).
120  * Dest is a buffer at least xx chars wide where the multi-byte version
121  * of the wide character will be written.
122  * The returned value is the number of bytes written to dest or -1
123  * if the conversion can't be done.
124  */
125 int
wtomb(char * dest,UCS ucs)126 wtomb(char *dest, UCS ucs)
127 {
128     int rv;
129     /*
130      * We believe that on modern unix systems wchar_t is a UCS-4 character.
131      * That's the assumption here.
132      */
133 
134     if(native_utf8){
135 	unsigned char *newdptr;
136 
137 	newdptr = utf8_put((unsigned char *) dest, (unsigned long) ucs);
138 	return((newdptr == (unsigned char *) dest) ? -1 : newdptr - (unsigned char *) dest);
139     }
140     else if(display_data){
141 	unsigned long ucs4;
142 	int           ret;
143 
144 	ucs4 = (unsigned long) ucs;
145 	ret = ucs4_rmaplen(&ucs4, 1, (unsigned short *) display_data, 0);
146 	if(ret >= 0)
147 	  ucs4_rmapbuf((unsigned char *) dest, &ucs4, 1, (unsigned short *) display_data, 0);
148 	else
149 	  ret = -1;
150 
151 	return(ret);
152     }
153     else
154 #if defined(HAVE_WCRTOMB)
155        rv = wcrtomb(dest, (wchar_t) ucs, NULL);
156 #elif defined(HAVE_WCTOMB)
157        rv = wctomb(dest, (wchar_t) ucs);
158 #else
159        rv = -1;
160 #endif
161    return rv;
162 }
163 
164 
165 /*
166  * This function does not necessarily update inputp and remaining_octets, so
167  * don't rely on that. The c-client version does but the other doesn't.
168  */
169 UCS
mbtow(void * input_cs,unsigned char ** inputp,unsigned long * remaining_octets)170 mbtow(void *input_cs, unsigned char **inputp, unsigned long *remaining_octets)
171 {
172     UCS ucs;
173 
174     if(input_cs){
175 	CHARSET *cast_input_cs;
176 
177 	cast_input_cs = (CHARSET *) input_cs;
178 
179 	switch((ucs = (UCS) ucs4_cs_get(cast_input_cs, inputp, remaining_octets))){
180 	  case U8G_ENDSTRG:
181 	  case U8G_ENDSTRI:
182 	    return(CCONV_NEEDMORE);
183 
184 	  default:
185 	    if(ucs & U8G_ERROR || ucs == UBOGON)
186 	      return(CCONV_BADCHAR);
187 
188 	    return(ucs);
189 	}
190     }
191     else{
192 	size_t ret;
193 	wchar_t w;
194 
195 	/*
196 	 * Warning:  input_cs and remaining_octets are unused in this
197 	 * half of the if/else.
198 	 *
199 	 * Unfortunately, we can't tell the difference between a source string
200 	 * that is just not long enough and one that has characters that can't
201 	 * be converted even though it is long enough. We return NEEDMORE in both cases.
202 	 */
203 	ret = mbstowcs(&w, (char *) (*inputp), 1);
204 	if(ret == (size_t)(-1))
205 	  return(CCONV_NEEDMORE);
206 	else{
207 	  ucs = (UCS) w;
208 	  return(ucs);
209 	}
210     }
211 }
212 
213 
214 void
set_locale_charmap(char * charmap)215 set_locale_charmap(char *charmap)
216 {
217     if(charmap){
218 	strncpy(locale_charmap, charmap, sizeof(locale_charmap));
219 	locale_charmap[sizeof(locale_charmap)-1] = '\0';
220     }
221     else
222       locale_charmap[0] = '\0';
223 }
224 
225 
226 /*
227  * This ensures that the string is UTF-8. If str is already a UTF-8 string,
228  * NULL is returned. Otherwise, an allocated string which is UTF-8 is returned.
229  * The caller is responsible for freeing the returned value.
230  *
231  * Args  str     -- the string to convert
232  */
233 char *
convert_to_utf8(char * str,char * fromcharset,int flags)234 convert_to_utf8(char *str, char *fromcharset, int flags)
235 {
236     char          *ret = NULL;
237     char          *fcharset;
238     SIZEDTEXT      src, result;
239     const CHARSET *cs = NULL;
240     int            try;
241 
242     src.data = (unsigned char *) str;
243     src.size = strlen(str);
244 
245     /* already UTF-8, return NULL */
246     if(!(flags & CU8_NOINFER)
247        && (cs = utf8_infercharset(&src))
248        && (cs->type == CT_ASCII || cs->type == CT_UTF8))
249       return(ret);
250 
251     try = 1;
252     while(try < 5){
253 	switch(try){
254 	  case 1:
255 	    fcharset = fromcharset;
256 	    if(fcharset && strucmp("UTF-8", fcharset) != 0)
257 	      break;	/* give it a try */
258 	    else
259 	      try++;	/* fall through */
260 
261 	  case 2:
262 	    if(!(flags & CU8_NOINFER)){
263 		fcharset = cs ? cs->name : NULL;
264 		if(fcharset && strucmp("UTF-8", fcharset) != 0)
265 		  break;
266 		else
267 		  try++;	/* fall through */
268 	    }
269 	    else
270 	      try++;	/* fall through */
271 
272 	  case 3:
273 	    fcharset = locale_charmap;
274 	    if(fcharset && strucmp("UTF-8", fcharset) != 0)
275 	      break;
276 	    else
277 	      try++;	/* fall through */
278 
279 	  default:
280 	    fcharset = "ISO-8859-1";		/* this will "work" */
281 	    break;
282 	}
283 
284 	memset(&result, 0, sizeof(result));
285 
286 	if(fcharset && utf8_text(&src, fcharset, &result, 0L)){
287 	    if(!(result.size == src.size && result.data == src.data)){
288 		ret = (char *) fs_get((result.size+1) * sizeof(char));
289 		strncpy(ret, (char *) result.data, result.size);
290 		ret[result.size] = '\0';
291 	    }
292 	    /* else no conversion necessary */
293 
294 	    if(result.data && result.data != src.data)
295 	      fs_give((void **) &result.data);
296 	    result.size = 0;
297 
298 	    return(ret);
299 	}
300 
301 	try++;
302     }
303 
304     /* won't make it to here */
305     return(ret);
306 }
307 
308 
309 /*
310  * Convert from UTF-8 to user's locale charset.
311  * This actually uses the wtomb routine to do the conversion, and that
312  * relies on setup_for_input_output having been called.
313  * If no conversion is necessary, NULL is returned, otherwise an allocated
314  * string in the locale charset is returned and the caller is responsible
315  * for freeing it.
316  */
317 char *
convert_to_locale(char * utf8str)318 convert_to_locale(char *utf8str)
319 {
320 #define CHNK 500
321     char *inp, *ret = NULL;
322     CBUF_S cb;
323     int alloced;
324     size_t i = 0;
325 
326     if(native_utf8 || !utf8str || !utf8str[0])
327       return(NULL);
328 
329     cb.cbuf[0] = '\0';
330     cb.cbufp = cb.cbufend = cb.cbuf;
331     inp = utf8str;
332 
333     alloced = CHNK;
334     ret = (char *) fs_get(alloced * sizeof(char));
335 
336     /*
337      * There's gotta be a better way to do this but utf8_to_locale was
338      * available and everything looks like a nail when all you have
339      * is a hammer.
340      */
341     while(*inp){
342 	/*
343 	 * We're placing the outgoing stream of characters in ret, a multi-byte
344 	 * array of characters in the user's locale charset. See if there is
345 	 * enough room for the next wide characters worth of output chars
346 	 * and allocate more space if not.
347 	 */
348         if((alloced - i) < MAX(MB_LEN_MAX,32)){
349 	    alloced += CHNK;
350 	    fs_resize((void **) &ret, alloced * sizeof(char));
351 	}
352 
353         i += utf8_to_locale((int) *inp++, &cb,
354                            (unsigned char *) &ret[i], alloced - i);
355     }
356 
357     fs_resize((void **) &ret, i + 1);
358 
359     ret[i] = '\0';
360 
361     return(ret);
362 }
363 
364 
365 /*
366  * Pass in a stream of UTF-8 characters in 'c' and return obuf
367  * filled in with multi-byte characters. The return value is the
368  * number of valid characters in obuf to be used.
369  */
370 int
utf8_to_locale(int c,CBUF_S * cb,unsigned char obuf[],size_t obuf_size)371 utf8_to_locale(int c, CBUF_S *cb, unsigned char obuf[], size_t obuf_size)
372 {
373     int outchars = 0;
374 
375     if(!(cb && cb->cbufp))
376       return(0);
377 
378     if(cb->cbufp < cb->cbuf+sizeof(cb->cbuf)){
379 	unsigned char *inputp;
380 	unsigned long remaining_octets;
381 	UCS ucs;
382 
383 	*(cb->cbufp)++ = (unsigned char) c;
384 	inputp = cb->cbuf;
385 	remaining_octets = (cb->cbufp - cb->cbuf) * sizeof(unsigned char);
386 	ucs = (UCS) utf8_get(&inputp, &remaining_octets);
387 
388 	switch(ucs){
389 	  case U8G_ENDSTRG:	/* incomplete character, wait */
390 	  case U8G_ENDSTRI:	/* incomplete character, wait */
391 	    break;
392 
393 	  default:
394 	    if(ucs & U8G_ERROR || ucs == UBOGON){
395 		/*
396 		 * None of these cases is supposed to happen. If it
397 		 * does happen then the input stream isn't UTF-8
398 		 * so something is wrong. Treat each character in the
399 		 * input buffer as a separate error character and
400 		 * print a '?' for each.
401 		 */
402 		for(inputp = cb->cbuf; inputp < cb->cbufp; inputp++)
403 		  obuf[outchars++] = '?';
404 
405 		cb->cbufp = cb->cbuf;
406 	    }
407 	    else{
408 		if(ucs >= 0x80 && wcellwidth(ucs) < 0){
409 		    /*
410 		     * This happens when we have a UTF-8 character that
411 		     * we aren't able to print in our locale. For example,
412 		     * if the locale is setup with the terminal
413 		     * expecting ISO-8859-1 characters then there are
414 		     * lots of UTF-8 characters that can't be printed.
415 		     * Print a '?' instead.
416 		     */
417 		    obuf[outchars++] = '?';
418 		}
419 		else{
420 		    /*
421 		     * Convert the ucs into the multibyte
422 		     * character that corresponds to the
423 		     * ucs in the users locale.
424 		     */
425 		    outchars = wtomb((char *) obuf, ucs);
426 		    if(outchars < 0){
427 			obuf[0] = '?';
428 			outchars = 1;
429 		    }
430 		}
431 
432 		/* update the input buffer */
433 		if(inputp >= cb->cbufp)	/* this should be the case */
434 		  cb->cbufp = cb->cbuf;
435 		else{		/* extra chars for some reason? */
436 		    unsigned char *q, *newcbufp;
437 
438 		    newcbufp = (cb->cbufp - inputp) + cb->cbuf;
439 		    q = cb->cbuf;
440 		    while(inputp < cb->cbufp)
441 		      *q++ = *inputp++;
442 
443 		    cb->cbufp = newcbufp;
444 		}
445 	    }
446 
447 	    break;
448 	}
449     }
450     else{			/* error */
451 	obuf[0] = '?';
452 	outchars = 1;
453 	cb->cbufp = cb->cbuf;	/* start over */
454     }
455 
456     return(outchars);
457 }
458 
459 
460 /*
461  * Returns the screen cells width of the UCS-4 string argument.
462  * The source string is zero terminated.
463  */
464 unsigned
ucs4_str_width(UCS * ucsstr)465 ucs4_str_width(UCS *ucsstr)
466 {
467     unsigned width = 0;
468     int w;
469 
470     if(ucsstr)
471       while(*ucsstr){
472 	w = wcellwidth(*ucsstr++);
473 	if(w != U4W_CTLSRGT)
474 	  width += (w < 0 ? 1 : w);
475       }
476 
477     return width;
478 }
479 
480 
481 /*
482  * Returns the screen cells width of the UCS-4 string argument
483  * from ucsstr[a] through (inclusive) ucsstr[b].
484  * No checking is done to make sure a starts in the middle
485  * of a UCS-4 array.
486  */
487 unsigned
ucs4_str_width_a_to_b(UCS * ucsstr,int a,int b)488 ucs4_str_width_a_to_b(UCS *ucsstr, int a, int b)
489 {
490     unsigned width = 0;
491     int i, w;
492 
493     if(ucsstr)
494       for(i = a; i <= b && ucsstr[i]; i++){
495 	w = wcellwidth(ucsstr[i]);
496 	if(w != U4W_CTLSRGT)
497 	  width += (w < 0 ? 1 : w);
498       }
499 
500     return width;
501 }
502 
503 
504 /*
505  * Returns the screen cells width of the UCS-4 string argument
506  * from ustart through (exclusive) uend.
507  * No checking is done to make sure it starts in the middle
508  * of a UCS-4 array.
509  */
510 unsigned
ucs4_str_width_ptr_to_ptr(UCS * ustart,UCS * uend)511 ucs4_str_width_ptr_to_ptr(UCS *ustart, UCS *uend)
512 {
513     UCS *u;
514     unsigned width = 0;
515     int w;
516 
517     if(!ustart)
518       return width;
519 
520     if(ustart)
521       for(u = ustart; u < uend; u++){
522 	w = wcellwidth(*u);
523 	if(w != U4W_CTLSRGT)
524 	  width += (w < 0 ? 1 : w);
525       }
526 
527     return(width);
528 }
529 
530 
531 /*
532  * Return the largest possible pointer into ucs4str so that the width
533  * of the string from ucs4str to the pointer (exclusive)
534  * is maxwidth or less. Also stops at a null character.
535  */
536 UCS *
ucs4_particular_width(UCS * ucs4str,int maxwidth)537 ucs4_particular_width(UCS *ucs4str, int maxwidth)
538 {
539     UCS *u;
540     int w_consumed = 0, w, done = 0;
541 
542     u = ucs4str;
543 
544     if(u)
545       while(!done && *u && w_consumed <= maxwidth){
546 	w = wcellwidth(*u);
547 	w = (w >= 0 ? w : 1);
548 	if(w_consumed + w <= maxwidth){
549 	    w_consumed += w;
550 	    ++u;
551 	}
552 	else
553 	  ++done;
554       }
555 
556     return(u);
557 }
558 
559 
560 /*
561  * Convert and copy a UTF-8 string into a UCS-4 NULL
562  * terminated array. Just like cpystr only it converts
563  * from UTF-8 to UCS-4.
564  *
565  * Returned UCS-4 string needs to be freed by caller.
566  */
567 UCS *
utf8_to_ucs4_cpystr(char * utf8src)568 utf8_to_ucs4_cpystr(char *utf8src)
569 {
570     size_t         retsize;
571     UCS           *ret = NULL;
572     UCS            ucs;
573     unsigned long  remaining_octets;
574     unsigned char *readptr;
575     size_t         arrayindex;
576 
577     /*
578      * We don't know how big to allocate the return array
579      * because variable numbers of octets in the src array
580      * will combine to make UCS-4 characters. The number of
581      * UCS-4 characters is less than or equal to the number
582      * of src characters, though.
583      */
584 
585     if(!utf8src)
586       return NULL;
587 
588     retsize = strlen(utf8src) + 1;
589 
590     ret = (UCS *) fs_get(retsize * sizeof(*ret));
591     memset(ret, 0, retsize * sizeof(*ret));
592 
593     readptr = (unsigned char *) utf8src;
594     remaining_octets = retsize-1;
595     arrayindex = 0;
596 
597     while(remaining_octets > 0 && *readptr && arrayindex < retsize-1){
598 	ucs = (UCS) utf8_get(&readptr, &remaining_octets);
599 
600 	if(ucs & U8G_ERROR || ucs == UBOGON)
601 	  remaining_octets = 0;
602 	else
603 	  ret[arrayindex++] = ucs;
604     }
605 
606     ret[arrayindex] = '\0';
607 
608     /* get rid of excess size */
609     if(arrayindex+1 < retsize)
610       fs_resize((void **) &ret, (arrayindex + 1) * sizeof(*ret));
611 
612     return ret;
613 }
614 
615 
616 /*
617  * Convert and copy a UCS-4 zero-terminated array into a UTF-8 NULL
618  * terminated string. Just like cpystr only it converts
619  * from UCS-4 to UTF-8.
620  *
621  * Returned UTF-8 string needs to be freed by caller.
622  */
623 char *
ucs4_to_utf8_cpystr(UCS * ucs4src)624 ucs4_to_utf8_cpystr(UCS *ucs4src)
625 {
626     unsigned char *ret = NULL;
627     unsigned char *writeptr;
628     int            i;
629 
630     if(!ucs4src)
631       return NULL;
632 
633     /*
634      * Over-allocate and then resize at the end.
635      */
636 
637     /* count characters in source */
638     for(i = 0; ucs4src[i]; i++)
639       ;
640 
641     ret = (unsigned char *) fs_get((6*i + 1) * sizeof(*ret));
642     memset(ret, 0, (6*i + 1) * sizeof(*ret));
643 
644     writeptr = ret;
645     for(i = 0; ucs4src[i]; i++)
646       writeptr = utf8_put(writeptr, (unsigned long) ucs4src[i]);
647 
648     /* get rid of excess size */
649     fs_resize((void **) &ret, (writeptr - ret + 1) * sizeof(*ret));
650 
651     return ((char *) ret);
652 }
653 
654 
655 /*
656  * Similar to above but copy a fixed number of source
657  * characters instead of going until null terminator.
658  */
659 char *
ucs4_to_utf8_cpystr_n(UCS * ucs4src,int ucs4src_len)660 ucs4_to_utf8_cpystr_n(UCS *ucs4src, int ucs4src_len)
661 {
662     unsigned char *ret = NULL;
663     unsigned char *writeptr;
664     int            i;
665 
666     if(!ucs4src)
667       return NULL;
668 
669     /*
670      * Over-allocate and then resize at the end.
671      */
672 
673     ret = (unsigned char *) fs_get((6*ucs4src_len + 1) * sizeof(*ret));
674     memset(ret, 0, (6*ucs4src_len + 1) * sizeof(*ret));
675 
676     writeptr = ret;
677     for(i = 0; i < ucs4src_len; i++)
678       writeptr = utf8_put(writeptr, (unsigned long) ucs4src[i]);
679 
680     /* get rid of excess size */
681     fs_resize((void **) &ret, (writeptr - ret + 1) * sizeof(*ret));
682 
683     return ((char *) ret);
684 }
685 
686 /*
687  * Similar to above but copy what is possible to a
688  * string of a size at most the given retlen.
689  */
690 char *
ucs4_to_utf8_n_cpystr(UCS * ucs4src,int retlen)691 ucs4_to_utf8_n_cpystr(UCS *ucs4src, int retlen)
692 {
693     unsigned char *ret = NULL;
694     unsigned char *writeptr;
695     int            i, oldlen, len;
696 
697     if(!ucs4src)
698       return NULL;
699 
700     /*
701      * Over-allocate and then resize at the end.
702      */
703 
704     /* count characters in source */
705     for(i = 0; ucs4src[i]; i++)
706       ;
707 
708     ret = (unsigned char *) fs_get((6*i + 1) * sizeof(unsigned char));
709     memset(ret, 0, (6*i + 1) * sizeof(unsigned char));
710 
711     writeptr = ret;
712     oldlen = len = 0;
713     for(i = 0; ucs4src[i] && (len < retlen); i++){
714       oldlen = len;
715       writeptr = utf8_put(writeptr, (unsigned long) ucs4src[i]);
716       len = strlen(ret);
717     }
718     if(len > retlen){
719       ret[oldlen] = '\0';
720       len = oldlen;
721     }
722 
723     /* get rid of excess size */
724     fs_resize((void **) &ret, (len + 1) * sizeof(unsigned char));
725 
726     return ((char *) ret);
727 }
728 
729 
730 #ifdef _WINDOWS
731 /*
732  * Convert a UTF-8 argument into an LPTSTR version
733  * of that argument. The result is allocated here
734  * and should be freed by the caller.
735  */
736 LPTSTR
utf8_to_lptstr(LPSTR arg_utf8)737 utf8_to_lptstr(LPSTR arg_utf8)
738 {
739      int lptstr_len;
740      LPTSTR lptstr_ret = NULL;
741 
742      lptstr_len = MultiByteToWideChar( CP_UTF8, 0, arg_utf8, -1, NULL, 0 );
743      if(lptstr_len > 0)
744      {
745          lptstr_ret = (LPTSTR)fs_get(lptstr_len * sizeof(TCHAR));
746          lptstr_len = MultiByteToWideChar( CP_UTF8, 0,
747              arg_utf8, -1, lptstr_ret, lptstr_len );
748      }
749 
750      if(!lptstr_len)
751      {
752          /* check GetLastError()? */
753          lptstr_ret = (LPTSTR)fs_get(sizeof(TCHAR));
754          lptstr_ret[0] = 0;
755      }
756 
757      return lptstr_ret;
758 }
759 
760 
761 /*
762  * Convert an LPTSTR argument into a UTF-8 version
763  * of that argument. The result is allocated here
764  * and should be freed by the caller.
765  */
766 LPSTR
lptstr_to_utf8(LPTSTR arg_lptstr)767 lptstr_to_utf8(LPTSTR arg_lptstr)
768 {
769      int utf8str_len;
770      LPSTR utf8str_ret = NULL;
771 
772      utf8str_len = WideCharToMultiByte( CP_UTF8, 0, arg_lptstr, -1, NULL, 0, NULL, NULL );
773      if(utf8str_len > 0)
774      {
775          utf8str_ret = (LPSTR)fs_get(utf8str_len * sizeof(CHAR));
776          utf8str_len = WideCharToMultiByte( CP_UTF8, 0,
777              arg_lptstr, -1, utf8str_ret, utf8str_len, NULL, NULL );
778      }
779 
780      if(!utf8str_len)
781      {
782          /* check GetLastError()? */
783          utf8str_ret = (LPSTR)fs_get(sizeof(CHAR));
784          utf8str_ret[0] = 0;
785      }
786 
787      return utf8str_ret;
788 }
789 
790 
791 /*
792  * Convert a UCS4 argument into an LPTSTR version
793  * of that argument. The result is allocated here
794  * and should be freed by the caller.
795  */
796 LPTSTR
ucs4_to_lptstr(UCS * arg_ucs4)797 ucs4_to_lptstr(UCS *arg_ucs4)
798 {
799     LPTSTR ret_lptstr = NULL;
800     size_t len;
801     size_t i;
802 
803     if(arg_ucs4){
804 	len = ucs4_strlen(arg_ucs4);
805 	ret_lptstr = (LPTSTR) fs_get((len+1) * sizeof(TCHAR));
806 	/* bogus conversion ignores UTF-16 */
807 	for(i = 0; i < len; i++)
808 	  ret_lptstr[i] = arg_ucs4[i];
809 
810 	ret_lptstr[len] = '\0';
811     }
812 
813     return(ret_lptstr);
814 }
815 
816 
817 /*
818  * Convert an LPTSTR argument into a UCS4 version
819  * of that argument. The result is MemAlloc'd here
820  * and should be freed by the caller.
821  */
822 UCS *
lptstr_to_ucs4(LPTSTR arg_lptstr)823 lptstr_to_ucs4(LPTSTR arg_lptstr)
824 {
825     UCS *ret_ucs4 = NULL;
826     size_t len;
827     size_t i;
828 
829     if(arg_lptstr){
830 	len = _tcslen(arg_lptstr);
831 	ret_ucs4 = (UCS *) fs_get((len+1)*sizeof(UCS));
832 	/* bogus conversion ignores UTF-16 */
833 	for(i = 0; i < len; i++)
834 	  ret_ucs4[i] = arg_lptstr[i];
835 
836 	ret_ucs4[len] = '\0';
837     }
838 
839     return(ret_ucs4);
840 }
841 
842 #endif /* _WINDOWS */
843 
844 
845 /*
846  * Pass in a stream of UTF-8 characters 1-at-a-time in 'c' and return obuf
847  * 1-at-a-time filled in with UCS characters. The return value is the
848  * number of valid characters in obuf to be used. It can only
849  * be 1 or 0 characters since we're only getting one UTF-8 character
850  * at a time.
851  */
852 int
utf8_to_ucs4_oneatatime(int c,CBUF_S * cb,UCS * obuf,int * obufwidth)853 utf8_to_ucs4_oneatatime(int c, CBUF_S *cb, UCS *obuf, int *obufwidth)
854 {
855     int  width = 0, outchars = 0;
856 
857     if(!(cb && cb->cbufp))
858       return(0);
859 
860     if(cb->cbufp < cb->cbuf+sizeof(cb->cbuf)){
861 	unsigned char *inputp;
862 	unsigned long remaining_octets;
863 	UCS ucs;
864 
865 	*cb->cbufp++ = (unsigned char) c;
866 	inputp = cb->cbuf;
867 	remaining_octets = (cb->cbufp - cb->cbuf) * sizeof(unsigned char);
868 	ucs = (UCS) utf8_get(&inputp, &remaining_octets);
869 
870 	switch(ucs){
871 	  case U8G_ENDSTRG:	/* incomplete character, wait */
872 	  case U8G_ENDSTRI:	/* incomplete character, wait */
873 	    break;
874 
875 	  default:
876 	    if(ucs & U8G_ERROR || ucs == UBOGON){
877 		/*
878 		 * None of these cases is supposed to happen. If it
879 		 * does happen then the input stream isn't UTF-8
880 		 * so something is wrong.
881 		 */
882 		outchars++;
883 		*obuf = '?';
884 		cb->cbufp = cb->cbuf;
885 		width = 1;
886 	    }
887 	    else{
888 		outchars++;
889 		if(ucs < 0x80 && ucs >= 0x20)
890 		  width = 1;
891 
892 		if(ucs >= 0x80 && (width=wcellwidth(ucs)) < 0){
893 		    /*
894 		     * This happens when we have a UTF-8 character that
895 		     * we aren't able to print in our locale. For example,
896 		     * if the locale is setup with the terminal
897 		     * expecting ISO-8859-1 characters then there are
898 		     * lots of UTF-8 characters that can't be printed.
899 		     * Print a '?' instead.
900 		     * Don't think this should happen in Windows.
901 		     */
902 		    *obuf = '?';
903 		}
904 		else{
905 		    *obuf = ucs;
906 		}
907 
908 		/* update the input buffer */
909 		if(inputp >= cb->cbufp)	/* this should be the case */
910 		  cb->cbufp = cb->cbuf;
911 		else{		/* extra chars for some reason? */
912 		    unsigned char *q, *newcbufp;
913 
914 		    newcbufp = (cb->cbufp - inputp) + cb->cbuf;
915 		    q = cb->cbuf;
916 		    while(inputp < cb->cbufp)
917 		      *q++ = *inputp++;
918 
919 		    cb->cbufp = newcbufp;
920 		}
921 	    }
922 
923 	    break;
924 	}
925     }
926     else{			/* error */
927 	*obuf = '?';
928 	outchars = 1;
929 	width = 1;
930 	cb->cbufp = cb->cbuf;	/* start over */
931     }
932 
933     if(obufwidth)
934       *obufwidth = width;
935 
936     return(outchars);
937 }
938 
939 
940 /*
941  * Return an allocated copy of a zero-terminated UCS-4 string.
942  */
943 UCS *
ucs4_cpystr(UCS * ucs4src)944 ucs4_cpystr(UCS *ucs4src)
945 {
946     size_t         arraysize;
947     UCS           *ret = NULL;
948     size_t         i;
949 
950     if(!ucs4src)
951       return NULL;
952 
953     arraysize = ucs4_strlen(ucs4src);
954 
955     ret = (UCS *) fs_get((arraysize+1) * sizeof(*ret));
956     memset(ret, 0, (arraysize+1) * sizeof(*ret));
957 
958     for(i = 0; i < arraysize; i++)
959       ret[i] = ucs4src[i];
960 
961     return ret;
962 }
963 
964 
965 UCS *
ucs4_strncpy(UCS * ucs4dst,UCS * ucs4src,size_t n)966 ucs4_strncpy(UCS *ucs4dst, UCS *ucs4src, size_t n)
967 {
968     size_t i;
969 
970     if(ucs4src && ucs4dst){
971 	for(i = 0; i < n; i++){
972 	    ucs4dst[i] = ucs4src[i];
973 	    if(ucs4dst[i] == '\0')
974 	      break;
975 	}
976     }
977 
978     return ucs4dst;
979 }
980 
981 
982 UCS *
ucs4_strncat(UCS * ucs4dst,UCS * ucs4src,size_t n)983 ucs4_strncat(UCS *ucs4dst, UCS *ucs4src, size_t n)
984 {
985     size_t i;
986     UCS *u;
987 
988     if(ucs4src && ucs4dst){
989 	for(u = ucs4dst; *u; u++)
990 	  ;
991 
992 	for(i = 0; i < n; i++){
993 	    u[i] = ucs4src[i];
994 	    if(u[i] == '\0')
995 	      break;
996 	}
997 
998 	if(i == n)
999 	  u[i] = '\0';
1000     }
1001 
1002     return ucs4dst;
1003 }
1004 
1005 
1006 /*
1007  * Like strlen only this returns the number of non-zero characters
1008  * in a zero-terminated UCS-4 array.
1009  */
1010 size_t
ucs4_strlen(UCS * ucs4str)1011 ucs4_strlen(UCS *ucs4str)
1012 {
1013     size_t i = 0;
1014 
1015     if(ucs4str)
1016       while(ucs4str[i])
1017         i++;
1018 
1019     return(i);
1020 }
1021 
1022 
1023 int
ucs4_strcmp(UCS * s1,UCS * s2)1024 ucs4_strcmp(UCS *s1, UCS *s2)
1025 {
1026     for(; *s1 == *s2; s1++, s2++)
1027       if(*s1 == '\0')
1028         return 0;
1029 
1030     return((*s1 < *s2) ? -1 : 1);
1031 }
1032 
1033 
1034 UCS *
ucs4_strchr(UCS * s,UCS c)1035 ucs4_strchr(UCS *s, UCS c)
1036 {
1037     if(!s)
1038       return NULL;
1039 
1040     while(*s && *s != c)
1041       s++;
1042 
1043     if(*s || !c)
1044       return s;
1045     else
1046       return NULL;
1047 }
1048 
1049 
1050 UCS *
ucs4_strrchr(UCS * s,UCS c)1051 ucs4_strrchr(UCS *s, UCS c)
1052 {
1053     UCS *ret = NULL;
1054 
1055     if(!s)
1056       return ret;
1057 
1058     while(*s){
1059 	if(*s == c)
1060 	  ret = s;
1061 
1062 	s++;
1063     }
1064 
1065     return ret;
1066 }
1067 
1068 
1069 /*
1070  * Returns the screen cells width of the UTF-8 string argument.
1071  */
1072 unsigned
utf8_width(char * str)1073 utf8_width(char *str)
1074 {
1075     unsigned width = 0;
1076     int this_width;
1077     UCS ucs;
1078     unsigned long remaining_octets;
1079     char *readptr;
1080 
1081     if(!(str && *str))
1082       return(width);
1083 
1084     readptr = str;
1085     remaining_octets = readptr ? strlen(readptr) : 0;
1086 
1087     while(remaining_octets > 0 && *readptr){
1088 
1089 	ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1090 
1091 	if(ucs & U8G_ERROR || ucs == UBOGON){
1092 	    /*
1093 	     * This should not happen, but do something to handle it anyway.
1094 	     * Treat each character as a single width character, which is what should
1095 	     * probably happen when we actually go to write it out.
1096 	     */
1097 	    remaining_octets--;
1098 	    readptr++;
1099 	    this_width = 1;
1100 	}
1101 	else{
1102 	    this_width = wcellwidth(ucs);
1103 
1104 	    /*
1105 	     * If this_width is -1 that means we can't print this character
1106 	     * with our current locale. Writechar will print a '?'.
1107 	     */
1108 	    if(this_width < 0)
1109 	      this_width = 1;
1110 	}
1111 
1112 	width += (unsigned) this_width;
1113     }
1114 
1115     return(width);
1116 }
1117 
1118 
1119 /*
1120  * Copy UTF-8 characters from src into dst.
1121  * This is intended to be used if you want to truncate a string at
1122  * the start instead of the end. For example, you have a long string
1123  * like
1124  *       this_is_a_long_string
1125  * but not enough space to fit it into a particular field. You want to
1126  * end up with
1127  *             s_a_long_string
1128  * where that fits in a particular width. Perhaps you'd use this with ...
1129  * to get
1130  *          ...s_a_long_string
1131  * This right adjusts the end of the string in the width space and
1132  * cuts it off at the start. If there is enough width for the whole
1133  * string it will copy the string into dst with no padding.
1134  *
1135  * Copy enough characters so that the result will have screen width of
1136  * want_width screen cells in current locale.
1137  *
1138  * Dstlen is the available space in dst. No more than dstlen bytes will be written
1139  *   to dst. This is just for protection, it shouldn't be relied on to
1140  *   do anything useful. Dstlen should be large enough. Otherwise you'll get
1141  *   characters truncated in the middle or something like that.
1142  *
1143  * Returned value is the number of bytes written to dst, not including
1144  *   the possible terminating null.
1145  *
1146  * If we can't hit want_width exactly because of double width characters
1147  *   then we will pad the end of the string with space in order to make
1148  *   the width exact.
1149  */
1150 size_t
utf8_to_width_rhs(char * dst,char * src,size_t dstlen,unsigned want_width)1151 utf8_to_width_rhs(char *dst,		/* destination buffer */
1152 		  char *src,		/* source string */
1153 		  size_t dstlen,	/* space in dest */
1154 		  unsigned want_width)	/* desired screen width */
1155 {
1156     int this_width;
1157     unsigned width_consumed = 0;
1158     UCS ucs;
1159     unsigned long remaining_octets;
1160     char *readptr, *goodreadptr, *savereadptr, *endptr;
1161     size_t nb = 0;
1162 
1163     if(!src){
1164 	if(dstlen > 0)
1165 	  dst[0] = '\0';
1166 
1167 	return nb;
1168     }
1169 
1170     /*
1171      * Start at the end of the source string and go backwards until we
1172      * get to the desired width, but not more than the width.
1173      */
1174     readptr = src + strlen(src);
1175     endptr = readptr;
1176     goodreadptr = readptr;
1177     width_consumed = 0;
1178     savereadptr = readptr;
1179 
1180     for(readptr = savereadptr-1; readptr >= src && width_consumed < want_width && (endptr - readptr) < dstlen;
1181 	readptr = savereadptr-1){
1182 
1183 	savereadptr = readptr;
1184 	remaining_octets = goodreadptr - readptr;
1185 	ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1186 
1187 	/*
1188 	 * Handling the error case is tough because an error will be the normal thing that
1189 	 * happens as we back through the string. So we're just going to punt on the
1190 	 * error for now.
1191 	 */
1192 	if(!(ucs & U8G_ERROR || ucs == UBOGON)){
1193 	    if(remaining_octets > 0){
1194 		/*
1195 		 * This means there are some bad octets after this good
1196 		 * character so things are not going to work out well.
1197 		 * Bail out.
1198 		 */
1199 		savereadptr = src;	/* we're done */
1200 	    }
1201 	    else{
1202 		this_width = wcellwidth(ucs);
1203 
1204 		if(this_width < 0)
1205 		  this_width = 1;
1206 
1207 		if(width_consumed + (unsigned) this_width <= want_width){  /* ok */
1208 		    width_consumed += (unsigned) this_width;
1209 		    goodreadptr = savereadptr;
1210 		}
1211 		else
1212 		  savereadptr = src;	/* we're done */
1213 	    }
1214 	}
1215     }
1216 
1217     /*
1218      * Copy characters from goodreadptr to endptr into dst.
1219      */
1220     nb = MIN(endptr-goodreadptr, dstlen-1);
1221     strncpy(dst, goodreadptr, nb);
1222     dst[nb] = '\0';
1223 
1224     /*
1225      * Pad out with spaces in order to hit width exactly.
1226      */
1227     while(width_consumed < want_width && nb < dstlen-1){
1228 	dst[nb++] = ' ';
1229 	dst[nb] = '\0';
1230 	width_consumed++;
1231     }
1232 
1233     return nb;
1234 }
1235 
1236 
/*
 * Helper for utf8_snprintf.
 * Read the run of decimal digits at *fmtp and leave *fmtp pointing at the
 * first character that is not a digit. Returns the value of the digits.
 * Digits that would take the value past nine decimal places are skipped
 * instead of being accumulated so that the int result cannot overflow.
 */
static int
utf8_snprintf_number(char **fmtp)
{
    int n = 0;

    while(**fmtp >= '0' && **fmtp <= '9'){
	if(n < 100000000)
	  n = n * 10 + (**fmtp - '0');

	(*fmtp)++;
    }

    return n;
}


/*
 * Helper for utf8_snprintf.
 * Write a single printf conversion specification into spec (speclen bytes):
 * '%', the flags string, the width if width >= 0, '.' and the precision
 * if precision >= 0, the length modifier character if modifier is
 * non-zero, and finally the conversion character conv.
 */
static void
utf8_snprintf_spec(char *spec, size_t speclen, const char *flags,
		   int width, int precision, int modifier, int conv)
{
    char wbuf[20], pbuf[20], mbuf[2];

    wbuf[0] = pbuf[0] = mbuf[0] = '\0';

    if(width >= 0)
      snprintf(wbuf, sizeof(wbuf), "%d", width);

    if(precision >= 0)
      snprintf(pbuf, sizeof(pbuf), ".%d", precision);

    if(modifier){
	mbuf[0] = (char) modifier;
	mbuf[1] = '\0';
    }

    snprintf(spec, speclen, "%%%s%s%s%s%c", flags, wbuf, pbuf, mbuf, conv);
}


/*
 * The arguments being converted are UTF-8 strings.
 * This routine attempts to make it possible to use screen cell
 * widths in a format specifier. In a one-byte per screen cell
 * world we might have used %10.10s to cause a string to occupy
 * 10 screen positions. Since the width and precision are really
 * referring to numbers of bytes instead of screen positions that
 * won't work with UTF-8 input. We emulate that behavior with
 * the format string %w. %m.nw means to use the m and n as
 * screen width indicators instead of bytes indicators.
 *
 * There is no reason to use this routine unless you want to use
 * min field width or precision with the specifier. A plain %w without
 * widths is equivalent exactly to a plain %s in a regular printf.
 *
 * Double-width characters complicate things. It may not be possible
 * to satisfy the request exactly. For example, %3w for an input
 * string that is made up of two double-width characters.
 * This routine will arbitrarily use a space character for padding if
 * needed to make the width come out correctly where a half of a
 * double-width character would have been needed. We'll see how
 * that works for us.
 *
 * %w only works for strings (it's a %s replacement).
 *
 * The other conversions handled are d i o x X u c s f e E g G p and %%.
 * They are handed to the system snprintf one at a time. The h and l
 * length modifiers are honored for the integer conversions (an l
 * argument is fetched as a long) and L is honored for the floating
 * conversions (the argument is fetched as a long double). A length
 * modifier on any other conversion is ignored. Width and precision may
 * be given as '*'. A negative '*' value is treated as if the width or
 * precision had not been given at all. Any other conversion character
 * trips an assert.
 *
 * Buffer overflow is handled by the size argument. %.30s will work
 * to limit a particular string to 30 bytes, but you lose that
 * ability with %w, since it may write more than precision bytes
 * in order to get to the desired width. It is best to choose
 * size large enough so that it doesn't come into play, otherwise
 * it may be possible to get partial UTF-8 characters because of
 * the truncation. The terminating null is only written if there is
 * room for it, so a result that fills dest completely is not
 * null terminated.
 *
 * The return value isn't quite the same as the return value
 * of snprintf. It is the number of bytes written, not counting
 * the trailing null, just like snprintf. However, if it is
 * truncated due to size then the output is size, not the
 * number of characters that would have been written.
 */
int
utf8_snprintf(char *dest, size_t size, char *fmt, ...)
{
    char    newfmt[100], flagstr[6], *q, *pdest, *end;
    char   *input_str;
    unsigned got_width;
    size_t  room;
    int     more_flags, ret, w;
    int     min_field_width, field_precision, modifier;
    int     flags_minus, flags_plus, flags_space, flags_zero, flags_pound;
    va_list args;

    newfmt[0] = '\0';

    pdest = dest;

#define IS_ROOM_IN_DEST(n_more_chars)			\
    ((pdest - dest + (n_more_chars) <= size) ? 1 : 0)

    /*
     * Strategy: Look through the fmt string for %w's. Replace the
     * %w's in the format string with %s's but with possibly different
     * width and precision arguments which will make it come out right.
     *
     * We can't simply hand an altered format string to vsnprintf
     * because a %*w needs the integer argument the * refers to
     * changed, and there is no way to modify or remove an argument.
     * So the result is constructed one piece at a time, pasting
     * together the pieces from the different conversions. Every
     * conversion is rebuilt in newfmt with literal numbers in place
     * of any '*' and printed with its own snprintf call.
     */
    va_start(args, fmt);

    while(*fmt && IS_ROOM_IN_DEST(1)){

	/* ordinary character, the loop test already checked for room */
	if(*fmt != '%'){
	    *pdest++ = *fmt++;
	    continue;
	}

	fmt++;			/* step over the '%' */

	min_field_width = field_precision = -1;
	modifier = 0;
	flags_minus = flags_plus = flags_space = flags_zero = flags_pound = 0;

	/* flags */
	more_flags = 1;
	while(more_flags){
	    switch(*fmt){
	      case '-':
		flags_minus++;
		fmt++;
		break;

	      case '+':
		flags_plus++;
		fmt++;
		break;

	      case ' ':
		flags_space++;
		fmt++;
		break;

	      case '0':
		flags_zero++;
		fmt++;
		break;

	      case '#':
		flags_pound++;
		fmt++;
		break;

	      default:
		more_flags = 0;
		break;
	    }
	}

	/* the flags that were seen, each one once, in a fixed order */
	q = flagstr;
	if(flags_minus)
	  *q++ = '-';
	if(flags_plus)
	  *q++ = '+';
	if(flags_space)
	  *q++ = ' ';
	if(flags_zero)
	  *q++ = '0';
	if(flags_pound)
	  *q++ = '#';

	*q = '\0';

	/* minimum field width */
	if(*fmt == '*'){
	    min_field_width = va_arg(args, int);
	    fmt++;
	}
	else if(*fmt >= '0' && *fmt <= '9')
	  min_field_width = utf8_snprintf_number(&fmt);

	/* field precision */
	if(*fmt == '.'){
	    fmt++;
	    if(*fmt == '*'){
		field_precision = va_arg(args, int);
		fmt++;
	    }
	    else if(*fmt >= '0' && *fmt <= '9')
	      field_precision = utf8_snprintf_number(&fmt);
	}

	/* length modifier */
	if(*fmt == 'h' || *fmt == 'l' || *fmt == 'L')
	  modifier = *fmt++;

	/* at least one byte, the loop test guarantees it */
	room = size - (pdest-dest);

	/* conversion character */
	switch(*fmt){
	  case 'w':
	    /*
	     * work with va_arg(char *) to figure out width
	     * and precision needed to produce the screen width
	     * and precision asked for in %w using some of the
	     * utf8 width routines we have.
	     */
	    input_str = va_arg(args, char *);

	    w = 0;
	    if(field_precision >= 0 || min_field_width >= 0)
	      w = utf8_width(input_str);

	    if(field_precision >= 0){
		if(w <= field_precision)
		  field_precision = -1;  /* print it all */
		else{
		    /*
		     * We need to cut off some of the input_str
		     * in this case. The precision becomes the number
		     * of bytes that fit in the requested cells.
		     */
		    end = utf8_count_forw_width(input_str, field_precision, &got_width);
		    field_precision = (int) (end - input_str);
		    /* new w with this field_precision */
		    w = got_width;
		}
	    }

	    /*
	     * need some padding, the byte width is the bytes that
	     * will be printed plus one space for each missing cell
	     */
	    if(min_field_width >= 0){
		int nbytes, npad;

		nbytes = (field_precision >= 0) ? field_precision : (int) strlen(input_str);
		npad = (min_field_width > w) ? (min_field_width - w) : 0;
		min_field_width = nbytes + npad;
	    }

	    utf8_snprintf_spec(newfmt, sizeof(newfmt), flagstr,
			       min_field_width, field_precision, 0, 's');
	    snprintf(pdest, room, newfmt, input_str);
	    pdest += strlen(pdest);
	    break;

	  case '\0':
	    /* fmt ended inside a specifier, stay on the terminator */
	    fmt--;
	    break;

	  case 'd': case 'i': case 'o':
	  case 'x': case 'X': case 'u':
	    if(modifier == 'l'){
		/* the caller pushed a long, so fetch and print a long */
		utf8_snprintf_spec(newfmt, sizeof(newfmt), flagstr,
				   min_field_width, field_precision, 'l', *fmt);
		snprintf(pdest, room, newfmt, va_arg(args, long));
	    }
	    else{
		/* short arguments arrive promoted to int */
		utf8_snprintf_spec(newfmt, sizeof(newfmt), flagstr,
				   min_field_width, field_precision,
				   (modifier == 'h') ? 'h' : 0, *fmt);
		snprintf(pdest, room, newfmt, va_arg(args, int));
	    }

	    pdest += strlen(pdest);
	    break;

	  case 'c':
	    utf8_snprintf_spec(newfmt, sizeof(newfmt), flagstr,
			       min_field_width, field_precision, 0, *fmt);
	    snprintf(pdest, room, newfmt, va_arg(args, int));
	    pdest += strlen(pdest);
	    break;

	  case 's':
	    input_str = va_arg(args, char *);
	    utf8_snprintf_spec(newfmt, sizeof(newfmt), flagstr,
			       min_field_width, field_precision, 0, *fmt);
	    snprintf(pdest, room, newfmt, input_str);
	    pdest += strlen(pdest);
	    break;

	  case 'f': case 'e': case 'E':
	  case 'g': case 'G':
	    if(modifier == 'L'){
		utf8_snprintf_spec(newfmt, sizeof(newfmt), flagstr,
				   min_field_width, field_precision, 'L', *fmt);
		snprintf(pdest, room, newfmt, va_arg(args, long double));
	    }
	    else{
		utf8_snprintf_spec(newfmt, sizeof(newfmt), flagstr,
				   min_field_width, field_precision, 0, *fmt);
		snprintf(pdest, room, newfmt, va_arg(args, double));
	    }

	    pdest += strlen(pdest);
	    break;

	  case 'p':
	    utf8_snprintf_spec(newfmt, sizeof(newfmt), flagstr,
			       min_field_width, field_precision, 0, *fmt);
	    snprintf(pdest, room, newfmt, va_arg(args, void *));
	    pdest += strlen(pdest);
	    break;

	  case '%':
	    if(IS_ROOM_IN_DEST(1))
	      *pdest++ =  '%';

	    break;

	  default:
	    /* didn't think of this type */
	    assert(0);
	    break;
	}

	fmt++;
    }

    ret = pdest - dest;

    if(IS_ROOM_IN_DEST(1))
      *pdest++ = '\0';

    va_end(args);

    return ret;
}
1583 
1584 
1585 /*
1586  * Copy UTF-8 characters from src into dst.
1587  * Copy enough characters so that the result will have (<=) screen width of
1588  * want_width screen cells in current locale.
1589  *
1590  * Dstlen is the available space in dst. No more than dstlen bytes will be written
1591  *   to dst.
1592  *
1593  * Returned value is the number of bytes written to dst, not including
1594  *   the possible terminating null.
1595  * Got_width is another returned value. It is the width in screen cells of
1596  *   the string placed in dst. It will be the same as want_width if there
1597  *   are enough characters in the src to do that and if the character widths
1598  *   hit the width exactly. It will be less than want_width if we run out
1599  *   of src characters or if the next character width would skip over the
1600  *   width we want, because it is double width.
1601  *
1602  * Zero width characters are collected and included at the end of the string.
1603  *   That is, if we make it to want_width but there is still a zero length
1604  *   character sitting in src, we add that to dst. This might be an accent
1605  *   or something like that.
1606  */
1607 size_t
utf8_to_width(char * dst,char * src,size_t dstlen,unsigned want_width,unsigned * got_width)1608 utf8_to_width(char *dst,		/* destination buffer */
1609 	      char *src,		/* source string */
1610 	      size_t dstlen,		/* space in dst */
1611 	      unsigned want_width,	/* desired screen width */
1612 	      unsigned *got_width)	/* returned screen width in dst */
1613 {
1614     int this_width;
1615     unsigned width_consumed = 0;
1616     UCS ucs;
1617     unsigned long remaining_octets;
1618     char *writeptr, *readptr, *savereadptr, *endptr;
1619     int ran_out_of_space = 0;
1620 
1621     readptr = src;
1622 
1623     remaining_octets = readptr ? strlen(readptr) : 0;
1624 
1625     writeptr = dst;
1626     endptr = writeptr + dstlen;
1627 
1628     if(readptr && writeptr){
1629       while(width_consumed <= want_width && remaining_octets > 0 && writeptr < dst + dstlen && !ran_out_of_space){
1630 	savereadptr = readptr;
1631 	ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1632 
1633 	if(ucs & U8G_ERROR || ucs == UBOGON)
1634 	  remaining_octets = 0;
1635 	else{
1636 	  this_width = wcellwidth(ucs);
1637 
1638 	  /*
1639 	   * If this_width is -1 that means we can't print this character
1640 	   * with our current locale. Writechar will print a '?'.
1641 	   */
1642 	  if(this_width < 0)
1643 	    this_width = 1;
1644 
1645 	  if(width_consumed + (unsigned) this_width <= want_width){
1646 	    /* append this utf8 character to dst if it will fit */
1647 	    if(writeptr + (readptr - savereadptr) < endptr){
1648 	      width_consumed += this_width;
1649 	      while(savereadptr < readptr)
1650 	        *writeptr++ = *savereadptr++;
1651 	    }
1652 	    else
1653 	      ran_out_of_space++;	/* no more utf8 to dst */
1654 	  }
1655 	  else
1656 	    remaining_octets = 0;	/* we're done */
1657 	}
1658       }
1659 
1660       if(writeptr < endptr)
1661         *writeptr = '\0';
1662     }
1663 
1664     if(got_width)
1665       *got_width = width_consumed;
1666 
1667     return(writeptr ? (writeptr - dst) : 0);
1668 }
1669 
1670 
1671 /*
1672  * Str is a UTF-8 string.
1673  * Count forward width screencell positions and return a pointer to the
1674  * end of the string that is width wide.
1675  * The returned pointer points at the next character (where the null would
1676  * be placed).
1677  *
1678  * Got_width is another returned value. It is the width in screen cells of
1679  *   the string from str to the returned pointer. It will be the same as
1680  *   want_width if there are enough characters in the str to do that
1681  *   and if the character widths hit the width exactly. It will be less
1682  *   than want_width if we run out of characters or if the next character
1683  *   width would skip over the width we want, because it is double width.
1684  */
1685 char *
utf8_count_forw_width(char * str,unsigned want_width,unsigned * got_width)1686 utf8_count_forw_width(char *str, unsigned want_width, unsigned *got_width)
1687 {
1688     int this_width;
1689     unsigned width_consumed = 0;
1690     UCS ucs;
1691     unsigned long remaining_octets;
1692     char *readptr;
1693     char *retptr;
1694 
1695     retptr = readptr = str;
1696 
1697     remaining_octets = readptr ? strlen(readptr) : 0;
1698 
1699     while(width_consumed <= want_width && remaining_octets > 0){
1700 
1701 	ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1702 
1703 	if(ucs & U8G_ERROR || ucs == UBOGON){
1704 	    /*
1705 	     * This should not happen, but do something to handle it anyway.
1706 	     * Treat each character as a single width character, which is what should
1707 	     * probably happen when we actually go to write it out.
1708 	     */
1709 	    remaining_octets--;
1710 	    readptr++;
1711 	    this_width = 1;
1712 	}
1713 	else{
1714 	    this_width = wcellwidth(ucs);
1715 
1716 	    /*
1717 	     * If this_width is -1 that means we can't print this character
1718 	     * with our current locale. Writechar will print a '?'.
1719 	     */
1720 	    if(this_width < 0)
1721 	      this_width = 1;
1722 	}
1723 
1724 	if(width_consumed + (unsigned) this_width <= want_width){
1725 	    width_consumed += (unsigned) this_width;
1726 	    retptr = readptr;
1727 	}
1728 	else
1729 	  remaining_octets = 0;	/* we're done */
1730     }
1731 
1732     if(got_width)
1733       *got_width = width_consumed;
1734 
1735     return(retptr);
1736 }
1737 
1738 
1739 /*
1740  * Copy a null terminator into a UTF-8 string in place so that the string is
1741  * no more than a certain screen width wide. If the string is already less
1742  * than or equal in width to the requested width, no change is made.
1743  *
1744  * The actual width accomplished is returned. Note that it may be less than
1745  * max_width due to double width characters as well as due to the fact that
1746  * it fits wholly in the max_width.
1747  *
1748  * Returned value is the actual screen width of str when done.
1749  *
1750  * A side effect is that a terminating null may have been written into
1751  * the passed in string.
1752  */
1753 unsigned
utf8_truncate(char * str,unsigned max_width)1754 utf8_truncate(char *str, unsigned max_width)
1755 {
1756     int this_width;
1757     unsigned width_consumed = 0;
1758     UCS ucs;
1759     unsigned long remaining_octets;
1760     char *readptr, *savereadptr;
1761 
1762     readptr = str;
1763 
1764     remaining_octets = readptr ? strlen(readptr) : 0;
1765 
1766     if(readptr){
1767       while(width_consumed <= max_width && remaining_octets > 0){
1768 
1769 	savereadptr = readptr;
1770 	ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1771 
1772 	if(ucs & U8G_ERROR || ucs == UBOGON){
1773 	    /*
1774 	     * This should not happen, but do something to handle it anyway.
1775 	     * Treat each character as a single width character, which is what should
1776 	     * probably happen when we actually go to write it out.
1777 	     */
1778 	    remaining_octets--;
1779 	    readptr++;
1780 	    this_width = 1;
1781 	}
1782 	else{
1783 	    this_width = wcellwidth(ucs);
1784 
1785 	    /*
1786 	     * If this_width is -1 that means we can't print this character
1787 	     * with our current locale. Writechar will print a '?'.
1788 	     */
1789 	    if(this_width < 0)
1790 	      this_width = 1;
1791 	}
1792 
1793 	if(width_consumed + (unsigned) this_width <= max_width){
1794 	    width_consumed += (unsigned) this_width;
1795 	}
1796 	else{
1797 	    remaining_octets = 0;	/* we're done */
1798 	    *savereadptr = '\0';
1799 	}
1800       }
1801     }
1802 
1803     return(width_consumed);
1804 }
1805 
1806 
/*
 * Copy UTF-8 characters from src into dst.
 * Copy enough characters so that the result will have screen width of
 * want_width screen cells in current locale.
 * If there aren't enough characters in src to get to want_width, pad
 * with spaces. When left_adjust is set the text stays at the left and
 * the spaces are appended. Otherwise the text is moved to the right and
 * the spaces go in front of it.
 *
 * Dstlen is the available space in dst. No more than dstlen bytes will be written
 *   to dst. Dst will be null terminated if there is enough room, but not
 *   if that would overflow dst's len. If the padding doesn't all fit, as
 *   many spaces as do fit are used.
 *   If dst is NULL nothing is written and 0 is returned.
 *
 * Returned value is the number of bytes written to dst, not including
 *   the possible terminating null.
 */
size_t
utf8_pad_to_width(char *dst,		/* destination buffer */
		  char *src,		/* source string */
		  size_t dstlen,	/* space in dst */
		  unsigned want_width,	/* desired screen width */
		  int left_adjust)	/* adjust left or right in want_width columns */
{
    unsigned got_width = 0;
    size_t   len_left, bytes_used, howmany;

    if(!dst)
      return(0);

    bytes_used = utf8_to_width(dst, src, dstlen, want_width, &got_width);
    len_left = dstlen - bytes_used;

    /* one space per missing cell, but no more than will fit */
    howmany = want_width - got_width;
    if(howmany > len_left)
      howmany = len_left;

    if(howmany > 0){
	if(left_adjust){
	    /*
	     * Add padding to end of string. Simply append
	     * the needed number of spaces, or however many will fit
	     * if we don't have enough space.
	     */
	    memset(dst + bytes_used, ' ', howmany);
	}
	else{
	    /*
	     * Add padding to start of string. Slide the existing
	     * string over (the regions overlap, so memmove) and
	     * fill the gap with spaces.
	     */
	    memmove(dst + howmany, dst, bytes_used);
	    memset(dst, ' ', howmany);
	}

	bytes_used += howmany;
    }

    if(bytes_used < dstlen)
      dst[bytes_used] = '\0';

    return(bytes_used);
}
1874 
1875 
1876 /*
1877  * Str is a UTF-8 string.
1878  * Start_here is a pointer into the string. It points one position past
1879  * the last byte that should be considered a part of the length string.
1880  * Count back want_width screencell positions and return a pointer to the
1881  * start of the string that is want_width wide and ends with start_here.
1882  *
1883  * Since characters may be more than one cell width wide we may end up
1884  * skipping over the exact width. That is, if we need to we'll go back
1885  * too far (by one cell width). Account for that in the call by looking
1886  * at got_width.
1887  *
1888  * Note that this call gives a possible got_width == want_width+1 as
1889  * opposed to utf8_count_forw_width which gives got_width == want-1 instead.
1890  * That was just what was needed at the time, maybe it needs to be
1891  * optional.
1892  */
1893 char *
utf8_count_back_width(char * str,char * start_here,unsigned want_width,unsigned * got_width)1894 utf8_count_back_width(char *str, char *start_here, unsigned want_width, unsigned *got_width)
1895 {
1896     unsigned width_consumed = 0;
1897     int this_width;
1898     UCS ucs;
1899     unsigned long remaining_octets;
1900     char *ptr, *savereadptr, *goodreadptr;
1901 
1902     savereadptr = start_here;
1903     goodreadptr = start_here;
1904 
1905     for(ptr = savereadptr - 1; width_consumed < want_width && ptr >= str; ptr = savereadptr - 1){
1906 
1907 	savereadptr = ptr;
1908 	remaining_octets = goodreadptr - ptr;
1909 	ucs = (UCS) utf8_get((unsigned char **) &ptr, &remaining_octets);
1910 
1911 	if(!(ucs & U8G_ERROR || ucs == UBOGON)){
1912 	  if(remaining_octets > 0){
1913 	      /*
1914 	       * This means there are some bad octets after this good
1915 	       * character so things are not going to work out well.
1916 	       * Bail out.
1917 	       */
1918 	      savereadptr = str;	/* we're done */
1919 	  }
1920 	  else{
1921 	    this_width = wcellwidth(ucs);
1922 
1923 	    /*
1924 	     * If this_width is -1 that means we can't print this character
1925 	     * with our current locale. Writechar will print a '?'.
1926 	     */
1927 	    if(this_width < 0)
1928 	      this_width = 1;
1929 
1930 	    width_consumed += (unsigned) this_width;
1931 	    goodreadptr = savereadptr;
1932 	  }
1933         }
1934     }
1935 
1936     if(got_width)
1937       *got_width = width_consumed;
1938 
1939     return(savereadptr);
1940 }
1941 
1942 
/*----------------------------------------------------------------------
  Append at most n characters of s at the place *d points to, moving
  *d along as it goes so that the caller is left holding the end of
  the copied text.

  If the end of s is reached within n characters the terminating null
  is copied too and *d is left pointing at that null, ready for the
  next append. If n runs out first no null is written and *d points
  just past the last character copied.

  The point is to avoid going over a string a second time just to find
  its end (i.e., strcpy(t, x); t += strlen(t)).

  This doesn't really belong here but it is used here.
 ----*/
void
sstrncpy(char **d, char *s, int n)
{
    for(; n > 0; n--, s++){
	**d = *s;
	if(**d == '\0')
	  return;		/* copied the terminator, stay on it */

	++*d;
    }
}
1958 
1959 
1960 /*
1961  * If use_system_routines is set then NULL is the return value and it is
1962  * not an error. Display_charmap and keyboard_charmap should come over as
1963  * malloced strings and will be filled in with the result.
1964  *
1965  * Returns a void pointer to the input_cs CHARSET which is
1966  * passed to mbtow via kbseq().
1967  * If !use_system_routines && NULL is returned, that is an error and err should
1968  * have a message.
1969  * display_charmap and keyboard_charmap should be malloced data and may be
1970  * realloced and changed here.
1971  */
1972 int
setup_for_input_output(int use_system_routines,char ** display_charmap,char ** keyboard_charmap,void ** input_cs_arg,char ** err)1973 setup_for_input_output(int use_system_routines, char **display_charmap,
1974 		       char **keyboard_charmap, void **input_cs_arg, char **err)
1975 {
1976     const CHARSET *cs;
1977     const CHARSET *input_cs = NULL;
1978     int already_tried = 0;
1979     int supported = 0;
1980     char buf[1000];
1981 
1982 #define cpstr(s) strcpy((char *)fs_get(1+strlen(s)), s)
1983 
1984     if(err)
1985       *err = NULL;
1986 
1987     if(!display_charmap || !keyboard_charmap || !input_cs_arg){
1988 	*err = cpstr("Bad call to setup_for_input_output");
1989 	return(-1);
1990     }
1991 
1992     if(use_system_routines){
1993 #if	PREREQ_FOR_SYS_TRANSLATION
1994 	char *dcm;
1995 
1996 	dcm = nl_langinfo_codeset_wrapper();
1997 	dcm = dcm ? dcm : "US-ASCII";
1998 
1999 	init_utf8_display(0, NULL);
2000 	if(*display_charmap){
2001 	    if(dcm && strucmp(*display_charmap, dcm)){
2002 		snprintf(buf, sizeof(buf),
2003 		 _("Display character set \"%s\" is ignored when using system translation"),
2004 		     *display_charmap);
2005 
2006 		*err = cpstr(buf);
2007 	    }
2008 
2009 	    fs_give((void **) display_charmap);
2010 	}
2011 
2012 	if(*keyboard_charmap){
2013 	    if(!*err && dcm && strucmp(*keyboard_charmap, dcm)){
2014 		snprintf(buf, sizeof(buf),
2015 		 _("Keyboard character set \"%s\" is ignored when using system translation"),
2016 		     *keyboard_charmap);
2017 
2018 		*err = cpstr(buf);
2019 	    }
2020 
2021 	    fs_give((void **) keyboard_charmap);
2022 	}
2023 
2024 	*display_charmap = cpstr(dcm);
2025 	*keyboard_charmap = cpstr(dcm);
2026 #else
2027 	*err = cpstr("Bad call to setup_for_input_output");
2028 #endif
2029 
2030 	*input_cs_arg = NULL;
2031 	return(0);
2032     }
2033 
2034 
2035 try_again1:
2036     if(!(*display_charmap))
2037       *display_charmap = cpstr("US-ASCII");
2038 
2039     if(!(*keyboard_charmap))
2040       *keyboard_charmap = cpstr(*display_charmap);
2041 
2042     if(*keyboard_charmap){
2043 	supported = input_charset_is_supported(*keyboard_charmap);
2044 
2045 	if(supported){
2046 	    if(!strucmp(*keyboard_charmap, "utf-8"))
2047 	      input_cs = utf8_charset(*keyboard_charmap);
2048 	    else if((cs = utf8_charset(*keyboard_charmap)) != NULL)
2049 	      input_cs = cs;
2050 	}
2051 	else{
2052 	    if(err && !*err){
2053 		int iso2022jp = 0;
2054 
2055 		if(!strucmp(*keyboard_charmap, "ISO-2022-JP"))
2056 		  iso2022jp = 1;
2057 
2058 		snprintf(buf, sizeof(buf),
2059 		     /* TRANSLATORS: The first argument is the name of the character
2060 		        set the user is trying to use (which is unsupported by alpine).
2061 			The second argument is " (except for posting)" if they are
2062 			trying to use ISO-2022-JP for something other than posting. */
2063 		     _("Character set \"%s\" is unsupported%s, using US-ASCII"),
2064 		     *keyboard_charmap,
2065 		     iso2022jp ? _(" (except for posting)") : "");
2066 
2067 		*err = cpstr(buf);
2068 	    }
2069 
2070 	    input_cs = NULL;
2071 	    fs_give((void **) keyboard_charmap);
2072 	    *keyboard_charmap = cpstr("US-ASCII");
2073 	    if(!already_tried){
2074 		already_tried++;
2075 		goto try_again1;
2076 	    }
2077 	}
2078     }
2079 
2080 
2081 try_again2:
2082     if(!(*display_charmap))
2083       *display_charmap = cpstr("US-ASCII");
2084 
2085     if(*display_charmap){
2086 	supported = output_charset_is_supported(*display_charmap);
2087 	if(supported){
2088 	    if(!strucmp(*display_charmap, "utf-8"))
2089 	      init_utf8_display(1, NULL);
2090 	    else if((cs = utf8_charset(*display_charmap)) != NULL)
2091 	      init_utf8_display(0, utf8_rmap_gen(cs, NULL));
2092 	}
2093 	else{
2094 	    if(err && !*err){
2095 		int iso2022jp = 0;
2096 
2097 		if(!strucmp(*display_charmap, "ISO-2022-JP"))
2098 		  iso2022jp = 1;
2099 
2100 		snprintf(buf, sizeof(buf),
2101 		     _("Character set \"%s\" is unsupported%s, using US-ASCII"),
2102 		     *display_charmap,
2103 		     iso2022jp ? _(" (except for posting)") : "");
2104 
2105 		*err = cpstr(buf);
2106 	    }
2107 
2108 	    fs_give((void **) display_charmap);
2109 	    if(!already_tried){
2110 		already_tried++;
2111 		goto try_again2;
2112 	    }
2113 	}
2114     }
2115     else{
2116 	if(err && !*err)
2117 	  *err = cpstr(_("Help, can't figure out display character set or even use US-ASCII."));
2118     }
2119 
2120 #undef cpstr
2121 
2122     *input_cs_arg = (void *) input_cs;
2123 
2124     return(0);
2125 }
2126 
2127 
2128 int
input_charset_is_supported(char * input_charset)2129 input_charset_is_supported(char *input_charset)
2130 {
2131     const CHARSET *cs;
2132 
2133     if(!(input_charset && *input_charset))
2134       return 0;
2135 
2136     if(!strucmp(input_charset, "utf-8"))
2137       return 1;
2138 
2139     if((cs = utf8_charset(input_charset)) != NULL){
2140 
2141 	/*
2142 	 * This was true 2006-09-25.
2143 	 */
2144 	switch(cs->type){
2145 	  case CT_ASCII: case CT_1BYTE0: case CT_1BYTE:
2146 	  case CT_1BYTE8: case CT_EUC: case CT_DBYTE:
2147 	  case CT_DBYTE2: case CT_SJIS: case CT_UCS2:
2148 	  case CT_UCS4: case CT_UTF16:
2149 	    return 1;
2150 	    break;
2151 
2152 	  default:
2153 	    break;
2154 	}
2155     }
2156 
2157     return 0;
2158 }
2159 
2160 
2161 int
output_charset_is_supported(char * output_charset)2162 output_charset_is_supported(char *output_charset)
2163 {
2164     const CHARSET *cs;
2165 
2166     if(!(output_charset && *output_charset))
2167       return 0;
2168 
2169     if(!strucmp(output_charset, "utf-8"))
2170       return 1;
2171 
2172     if((cs = utf8_charset(output_charset)) != NULL && utf8_rmap_gen(cs, NULL))
2173       return 1;
2174 
2175     return 0;
2176 }
2177 
2178 
/*
 * Decide whether the named character set is acceptable for posting.
 *
 * Result: 1 if the name is non-empty and either matches "ISO-2022-JP"
 *         (compared with strucmp) or is accepted by
 *         output_charset_is_supported(); 0 otherwise.
 */
int
posting_charset_is_supported(char *posting_charset)
{
    if(!posting_charset || !*posting_charset)
      return(0);

    /* ISO-2022-JP is accepted here without asking the output check */
    if(strucmp(posting_charset, "ISO-2022-JP") == 0)
      return(1);

    return(output_charset_is_supported(posting_charset) ? 1 : 0);
}
2186 
2187 
/*
 * This function is only defined in this special case and so calls
 * to it should be wrapped in the same macro conditionals.
 *
 * Returns the default display charset for a UNIX terminal emulator.
 * It is what nl_langinfo(CODESET) should return, but nl_langinfo is
 * wrapped because some implementations are known to behave strangely.
 *
 * If the reported name is not accepted by output_charset_is_supported()
 * a handful of known spellings are translated (the various names for
 * plain ASCII, "UTF8", "EUCJP", "EUCKP", "SJIS" and anything containing
 * "8859" followed by a part number, optionally after one separator
 * character).
 *
 * Result: a charset name accepted by output_charset_is_supported(), or
 *         NULL if none could be worked out. The string is whatever
 *         nl_langinfo() returned, a string literal, or a static buffer
 *         that the next call overwrites; the caller must not free it.
 */
#if !defined(_WINDOWS) && HAVE_LANGINFO_H && defined(CODESET)
char *
nl_langinfo_codeset_wrapper(void)
{
    char *ret = NULL;
    char *p;

    ret = nl_langinfo(CODESET);

    /*
     * If the value returned from nl_langinfo() is not a real charset,
     * see if we can figure out what they meant. If we can't figure it
     * out return NULL and let the caller decide what to do.
     */
    if(ret && *ret && !output_charset_is_supported(ret)){
	if(!strcmp("ANSI_X3.4-1968", ret)
	   || !strcmp("646", ret)
	   || !strcmp("ASCII", ret)
	   || !strcmp("C", ret)
	   || !strcmp("POSIX", ret))
	  ret = "US-ASCII";
	else if(!strucmp(ret, "UTF8"))
	  ret = "UTF-8";
	else if(!strucmp(ret, "EUCJP"))
	  ret = "EUC-JP";
	else if(!strucmp(ret, "EUCKP"))
	  ret = "EUC-KP";
	else if(!strucmp(ret, "SJIS"))
	  ret = "SHIFT-JIS";
	else if((p = strstr(ret, "8859")) != NULL){
	    /* check for digits after 8859 */
	    p += 4;

	    /*
	     * Allow a single separator between "8859" and the part
	     * number, but never step over the terminating nul: a name
	     * that ends right after "8859" has no part number at all.
	     * (The casts keep isdigit() defined for bytes >= 0x80.)
	     */
	    if(*p && !isdigit((unsigned char) *p))
	      p++;

	    if(isdigit((unsigned char) *p)){
		static char buf[12];

		/* "ISO-8859-" is 9 chars, leaving room for 2 digits + nul */
		memset(buf, 0, sizeof(buf));
		strncpy(buf, "ISO-8859-", sizeof(buf));
		buf[9] = *p++;
		if(isdigit((unsigned char) *p))
		  buf[10] = *p;

		ret = buf;
	    }
	}
    }

    /* whatever we ended up with still has to be usable */
    if(ret && !output_charset_is_supported(ret))
      ret = NULL;

    return(ret);
}
#endif
2253 
2254 
2255 /*
2256  * Convert the "orig" string from UTF-8 to "charset". If no conversion is
2257  * needed the return value will point to orig. If a conversion is done,
2258  * the return string should be freed by the caller.
2259  * If not possible, returns NULL.
2260  */
2261 char *
utf8_to_charset(char * orig,char * charset,int report_err)2262 utf8_to_charset(char *orig, char *charset, int report_err)
2263 {
2264     SIZEDTEXT src, dst;
2265     char *ret = orig;
2266 
2267     if(!charset || !charset[0] || !orig || !orig[0] || !strucmp(charset, "utf-8"))
2268       return ret;
2269 
2270     src.size = strlen(orig);
2271     src.data = (unsigned char *) orig;
2272 
2273     if(!strucmp(charset, "us-ascii")){
2274 	size_t i;
2275 
2276 	for(i = 0; i < src.size; i++)
2277 	  if(src.data[i] & 0x80)
2278 	    return NULL;
2279 
2280 	return ret;
2281     }
2282 
2283     /*
2284      * This works for ISO-2022-JP because of special code in utf8_cstext
2285      * but not for other 2022 charsets.
2286      */
2287     memset(&dst, 0, sizeof(dst));
2288     if(utf8_cstext(&src, charset, &dst, report_err ? 0 : '?') && dst.size > 0 && dst.data)
2289       ret = (char *) dst.data;		/* c-client already null terminates it */
2290     else
2291       ret = NULL;
2292 
2293     if((unsigned char *) ret != dst.data && dst.data)
2294       fs_give((void **) &dst.data);
2295 
2296     return ret;
2297 }
2298 
2299 
/*
 *      Turn a number into a string with commas
 *
 * Args: number -- The long to be turned into a string.
 *
 * Result: pointer to static string representing number with a comma
 * between each group of three digits, e.g. 1234567 -> "1,234,567".
 * Negative numbers get a leading '-' followed by the grouped digits.
 *
 * The strings live in a ring of three static buffers, so up to 3
 * comatose results can be in use at once; the fourth call overwrites
 * the first result. The caller must not free the result.
 */
char *
comatose(long int number)
{
    static char   buf[3][50];
    static int    whichbuf = 0;
    char          tmp[sizeof(buf[0])];
    char         *p = tmp + sizeof(tmp);
    unsigned long mag;
    int           ndigits = 0;

    whichbuf = (whichbuf + 1) % 3;

    /*
     * Work on the magnitude as an unsigned value so that the most
     * negative long can be handled without overflowing.
     */
    mag = (number < 0) ? 0UL - (unsigned long) number
		       : (unsigned long) number;

    /*
     * Build the text backwards from the least significant digit so
     * that the grouping is right however many digits there are.
     */
    *--p = '\0';
    do{
	if(ndigits > 0 && ndigits % 3 == 0)
	  *--p = ',';

	*--p = (char) ('0' + (int) (mag % 10));
	mag /= 10;
	ndigits++;

	/*
	 * Each pass needs at most 2 chars and the sign needs 1 more.
	 * 50 chars is plenty for a 64-bit long, the size check is only
	 * a safety net for wider types.
	 */
    }while(mag != 0 && p - tmp > 2);

    if(number < 0)
      *--p = '-';

    snprintf(buf[whichbuf], sizeof(buf[0]), "%s", p);

    return(buf[whichbuf]);
}
2344 
2345 
/*
 * Like comatose() but leave out the commas: format the number as plain
 * decimal text.
 *
 * Result: pointer to one of three static buffers used in rotation, so
 * up to 3 results can be in use at once.
 */
char *
tose(long int number)
{
    static char ring[3][50];
    static int  slot = 0;
    char       *out;

    /* step to the next buffer in the ring */
    slot = (slot + 1) % 3;
    out = ring[slot];

    snprintf(out, sizeof(ring[0]), "%ld", number);

    return(out);
}
2359 
2360 
/*
 * line_paint - where the real work of managing what is displayed gets done.
 *
 * Builds the display array displ->dl from the virtual line displ->vl,
 * sends the characters that changed (compared with displ->olddl) to
 * the screen through displ->writechar, and leaves the cursor at the
 * screen position that corresponds to "offset".
 *
 * Args: offset -- cursor position, as an index into displ->vl
 *       displ  -- the line being edited. This routine reads vl, vused,
 *                 dwid, dlen, row and col, calls the movecursor and
 *                 writechar callbacks, and updates vbase, dl and olddl.
 *       passwd -- may be NULL. If it points at a non-zero value the
 *                 text is concealed, see the comment at the top of the
 *                 function body. A value of 10 is changed to 11 here.
 *
 * Result: none
 */
void
line_paint(int offset,			/* current dot offset into vl */
	   struct display_line *displ,
	   int *passwd)			/* flag to hide display of chars */
{
    int i, w, w2, already_got_one = 0;
    int vfirst, vlast, dfirst, dlast, vi, di;
    int new_vbase;
    unsigned (*width_a_to_b)(UCS *, int, int);

    /*
     * Set passwd to 10 in caller if you want to conceal the
     * password but not print asterisks for feedback.
     *
     * Set passwd to 1 in caller to conceal by printing asterisks.
     */
    if(passwd && *passwd >= 10){	/* don't show asterisks */
	if(*passwd > 10)
	  return;
	else
	  *passwd = 11;		/* only blat once */

	/*
	 * Blank the field and put the cursor back at its start.
	 * The loop body runs for i = 0 through dwid, so dwid+1
	 * spaces are written.
	 */
	i = 0;
	(*displ->movecursor)(displ->row, displ->col);
	while(i++ <= displ->dwid)
	  (*displ->writechar)(' ');

	(*displ->movecursor)(displ->row, displ->col);
	return;
    }

    /*
     * Pick the width function. When concealing with asterisks every
     * character is counted as one cell, otherwise the real cell widths
     * are used.
     */
    if(passwd && *passwd)
      width_a_to_b = single_width_chars_a_to_b;
    else
      width_a_to_b = ucs4_str_width_a_to_b;

    /*
     * vl is the virtual line (the actual data). We operate on it by typing
     * characters to be added and deleting and so forth. In this routine we
     * copy a subset of those UCS-4 characters in vl into dl, the display
     * array, and show that subset on the screen.
     *
     * Offset is the location of the cursor in vl.
     *
     * We will display the string starting from vbase.
     * We have dwid screen cells to work in.
     * We may have to adjust vbase in order to display the
     * part of the string that contains the cursor.
     *
     * We'll make the display look like
     *   vl    a b c d e f g h i j k l m
     *             xxxxxxxxxxxxx  <- width dwid window
     *             < d e f g h >
     *               |
     *             vbase
     * The < will be there if vbase > 0.
     * The > will be there if the string from vbase to the
     * end can't all fit in the window.
     */

    /* start from an empty display array, dl is rebuilt on every call */
    memset(displ->dl, 0, displ->dlen * sizeof(UCS));

    /*
     * Adjust vbase so offset is not out of the window to the right.
     * (The +2 in w + 2 is for a possible " >" if the string goes past
     *  the right hand edge of the window and if the last visible character
     * is double wide. We don't want the offset to be under that > character.)
     */
    for(w = (*width_a_to_b)(displ->vl, displ->vbase, offset);
	displ->dwid > 1 &&
	w + 2 + (displ->vbase ? 1 : 0) > displ->dwid;
        w = (*width_a_to_b)(displ->vl, displ->vbase, offset)){
	/*
	 * offset is off the window to the right
	 * It looks like   a b c d e f g h
	 *                   |         |
	 *               vbase         offset
	 * and offset is either past the right edge,
	 * or right at the right edge (and maybe under >),
	 * or one before right at the edge (and maybe on space
	 * for half a character).
	 *
	 * Since the characters may be double width it is slightly
	 * complicated to figure out how far to increase vbase.
	 * We're going to scoot over past width w/2 characters and
	 * then see if that's sufficient.
	 */
	new_vbase = displ->vbase + 1;
	for(w2 = (*width_a_to_b)(displ->vl, displ->vbase+1, new_vbase);
	    w2 < displ->dwid/2;
	    w2 = (*width_a_to_b)(displ->vl, displ->vbase+1, new_vbase))
	  new_vbase++;

	displ->vbase = new_vbase;
    }

    /* adjust so offset is not out of the window to the left */
    while(displ->vbase > 0 && displ->vbase >= offset){
	/* add about dwid/2 more width */
	new_vbase = displ->vbase - 1;
	for(w2 = (*width_a_to_b)(displ->vl, new_vbase, displ->vbase);
	    w2 < (displ->dwid+1)/2 && new_vbase > 0;
	    w2 = (*width_a_to_b)(displ->vl, new_vbase, displ->vbase))
	  new_vbase--;

	/*
	 * but don't let it get too small, recheck off right end
	 *
	 * NOTE(review): the first measurement below starts at new_vbase
	 * but the re-measurement after each new_vbase++ starts at
	 * displ->vbase, which this loop does not change. That looks
	 * unintended -- confirm before relying on this loop moving
	 * new_vbase more than one step.
	 */
	for(w = (*width_a_to_b)(displ->vl, new_vbase, offset);
	    w + 2 + (new_vbase ? 1 : 0) > displ->dwid;
	    w = (*width_a_to_b)(displ->vl, displ->vbase, offset))
	  new_vbase++;

	displ->vbase = MAX(new_vbase, 0);
    }

    /*
     * If exactly one character is scrolled off to the left and it is a
     * single cell wide (or we're printing asterisks), show it instead
     * of a '<' cue. Presumably this is because the cue would use up
     * the same single cell.
     */
    if(displ->vbase == 1 && ((passwd && *passwd) || wcellwidth(displ->vl[0]) == 1))
      displ->vbase = 0;

    /*
     * vfirst..vlast is the range of vl that will be shown,
     * dfirst..dlast is where it lands in dl.
     */
    vfirst = displ->vbase;
    dfirst = 0;
    if(displ->vbase > 0){			/* off screen cue left */
	dfirst = 1;				/* index which matches vfirst */
	displ->dl[0] = '<';
    }

    vlast = displ->vused-1;			/* end */
    w = (*width_a_to_b)(displ->vl, vfirst, vlast);

    if(displ->dwid > 0 && w + dfirst > displ->dwid){			/* off window right */

	/* find last ucs character to be printed */
	while(w + dfirst > displ->dwid - 1)	/* -1 for > */
	  w = (*width_a_to_b)(displ->vl, vfirst, --vlast);

	/* worry about double-width characters */
	if(w + dfirst == displ->dwid - 1){	/* no prob, hit it exactly */
	    dlast = dfirst + vlast - vfirst + 1;	/* +1 for > */
	    displ->dl[dlast] = '>';
	}
	else{
	    /* came up short of the edge, pad with a space before the > */
	    dlast = dfirst + vlast - vfirst + 1;
	    displ->dl[dlast++] = ' ';
	    displ->dl[dlast] = '>';
	}
    }
    else
      dlast = dfirst + vlast - vfirst;

    /*
     * Copy the relevant part of the virtual line into the display line.
     */
    for(vi = vfirst, di = dfirst; vi <= vlast; vi++, di++)
      if(passwd && *passwd)
        displ->dl[di] = '*';		/* to conceal password */
      else
        displ->dl[di] = displ->vl[vi];

    /*
     * Add spaces to clear the rest of the line.
     * We have dwid total space to fill.
     */
    w = (*width_a_to_b)(displ->dl, 0, dlast);	/* width through dlast */
    for(di = dlast+1, i = displ->dwid - w; i > 0 ; i--)
      displ->dl[di++] = ' ';

    /*
     * Draw from left to right, skipping until we get to
     * something that is different. Characters may be different
     * widths than they were initially so paint from there the
     * rest of the way.
     */
    for(di = 0; displ->dl[di]; di++){
	if(already_got_one || displ->dl[di] != displ->olddl[di]){
	    /* move cursor first time */
	    if(!already_got_one++){
		w = (di > 0) ? (*width_a_to_b)(displ->dl, 0, di-1) : 0;
		(*displ->movecursor)(displ->row, displ->col + w);
	    }

	    (*displ->writechar)(displ->dl[di]);
	    displ->olddl[di] = displ->dl[di];
	}
    }

    /* zero what is left of olddl beyond the end of the new dl */
    memset(&displ->olddl[di], 0, (displ->dlen - di) * sizeof(UCS));

    /*
     * Move the cursor to the offset.
     *
     * The offset is relative to the start of the virtual array. We need
     * to find the location on the screen. The offset into the display array
     * will be offset-vbase+dfirst. We want to be at the start of that
     * character, so we need to find the width of all the characters up
     * to that point.
     */
    w = (offset > 0) ? (*width_a_to_b)(displ->dl, 0, offset-displ->vbase+dfirst-1) : 0;

    (*displ->movecursor)(displ->row, displ->col + w);
}
2562 
2563 
2564 /*
2565  * This is just like ucs4_str_width_a_to_b() except all of the characters
2566  * are assumed to be of width 1. This is for printing out *'s when user
2567  * enters a password, while still managing to use the same code to do the
2568  * display.
2569  */
2570 unsigned
single_width_chars_a_to_b(UCS * ucsstr,int a,int b)2571 single_width_chars_a_to_b(UCS *ucsstr, int a, int b)
2572 {
2573     unsigned width = 0;
2574     int i;
2575 
2576     if(ucsstr)
2577       for(i = a; i <= b && ucsstr[i]; i++)
2578 	width++;
2579 
2580     return width;
2581 }
2582