/* RCS revision identifier for this file; the definition is compiled out for lint and DOS builds. */
#if !defined(lint) && !defined(DOS)
static char rcsid[] = "$Id: utf8.c 1184 2008-12-16 23:52:15Z hubert@u.washington.edu $";
#endif
4
5 /*
6 * ========================================================================
7 * Copyright 2013-2021 Eduardo Chappa
8 * Copyright 2006-2008 University of Washington
9 *
10 * Licensed under the Apache License, Version 2.0 (the "License");
11 * you may not use this file except in compliance with the License.
12 * You may obtain a copy of the License at
13 *
14 * http://www.apache.org/licenses/LICENSE-2.0
15 *
16 * ========================================================================
17 */
18
19
20 /* includable WITHOUT dependency on c-client */
21 #include "../../c-client/mail.h"
22 #include "../../c-client/utf8.h"
23
24 #ifdef _WINDOWS
25 /* wingdi.h uses ERROR (!) and we aren't using the c-client ERROR so... */
26 #undef ERROR
27 #else
28 #define _XOPEN_SOURCE
29 #endif
30
31 #include <system.h>
32
33 #include "../../c-client/fs.h"
34
35 /* includable WITHOUT dependency on pico */
36 #include "../../pico/keydefs.h"
37
38 #include "../osdep/collate.h"
39 #include "../filttype.h"
40
41 #include "utf8.h"
42
43 #include <stdarg.h>
44
45
/* Forward declaration; the definition is not in this part of the file. */
unsigned single_width_chars_a_to_b(UCS *, int, int);


/* Charset name recorded by set_locale_charmap() (empty string if unset); tried by convert_to_utf8(). */
static char locale_charmap[50];

/* Non-zero means the display is treated as UTF-8 capable; set by init_utf8_display(). */
static int native_utf8;
/* Pointer recorded by init_utf8_display(); wtomb() hands it to ucs4_rmaplen()/ucs4_rmapbuf(). */
static void *display_data;
53
54 void
init_utf8_display(int utf8,void * rmap)55 init_utf8_display(int utf8, void *rmap)
56 {
57 native_utf8 = utf8;
58 display_data = rmap;
59 }
60
61
62 /*
63 * Argument is a UCS-4 wide character.
64 * Returns the environment dependent cell width of the
65 * character when printed to the screen.
66 * This will be -1 if the character is not printable.
67 * It will be >= zero if it is printable.
68 *
69 * Note that in the case it is not printable but it is still sent to
70 * Writechar, Writechar will print a '?' with width 1.
71 */
72 int
wcellwidth(UCS ucs)73 wcellwidth(UCS ucs)
74 {
75 char dummy[32];
76 long w;
77
78 /*
79 * We believe that on modern unix systems wchar_t is a UCS-4 character.
80 * That's the assumption here.
81 */
82
83 if(native_utf8){ /* display is UTF-8 capable */
84 w = ucs4_width((unsigned long) ucs);
85 return((w & U4W_ERROR) ? -1 : w);
86 }
87 else if(display_data){
88 if(wtomb(dummy, ucs) < 0)
89 return(-1);
90 else{
91 w = ucs4_width((unsigned long) ucs);
92 return((w & U4W_ERROR) ? -1 : w);
93 }
94 }
95 #if !defined(_WINDOWS) && HAVE_WCWIDTH
96 else
97 return(wcwidth((wchar_t) ucs));
98 #else
99 return(0);
100 #endif
101 }
102
103 /* ambiguous width zone character function. We use the Windows code until
104 * we find a better way to do it in general.
105 */
106 int
pith_ucs4width(UCS ucs)107 pith_ucs4width(UCS ucs)
108 {
109 return (ucs >= 0x2100) ? 2 : 1;
110 #if !defined(_WINDOWS) && HAVE_WCWIDTH
111 return wcwidth((wchar_t) ucs);
112 #else
113 return (ucs >= 0x2100) ? 2 : 1;
114 #endif /* _WINDOWS */
115 }
116
117 /*
118 * Argument is a UCS-4 wide character.
119 * It is converted to the multibyte version (for example UTF8 or EUC-JP).
120 * Dest is a buffer at least xx chars wide where the multi-byte version
121 * of the wide character will be written.
122 * The returned value is the number of bytes written to dest or -1
123 * if the conversion can't be done.
124 */
125 int
wtomb(char * dest,UCS ucs)126 wtomb(char *dest, UCS ucs)
127 {
128 int rv;
129 /*
130 * We believe that on modern unix systems wchar_t is a UCS-4 character.
131 * That's the assumption here.
132 */
133
134 if(native_utf8){
135 unsigned char *newdptr;
136
137 newdptr = utf8_put((unsigned char *) dest, (unsigned long) ucs);
138 return((newdptr == (unsigned char *) dest) ? -1 : newdptr - (unsigned char *) dest);
139 }
140 else if(display_data){
141 unsigned long ucs4;
142 int ret;
143
144 ucs4 = (unsigned long) ucs;
145 ret = ucs4_rmaplen(&ucs4, 1, (unsigned short *) display_data, 0);
146 if(ret >= 0)
147 ucs4_rmapbuf((unsigned char *) dest, &ucs4, 1, (unsigned short *) display_data, 0);
148 else
149 ret = -1;
150
151 return(ret);
152 }
153 else
154 #if defined(HAVE_WCRTOMB)
155 rv = wcrtomb(dest, (wchar_t) ucs, NULL);
156 #elif defined(HAVE_WCTOMB)
157 rv = wctomb(dest, (wchar_t) ucs);
158 #else
159 rv = -1;
160 #endif
161 return rv;
162 }
163
164
165 /*
166 * This function does not necessarily update inputp and remaining_octets, so
167 * don't rely on that. The c-client version does but the other doesn't.
168 */
169 UCS
mbtow(void * input_cs,unsigned char ** inputp,unsigned long * remaining_octets)170 mbtow(void *input_cs, unsigned char **inputp, unsigned long *remaining_octets)
171 {
172 UCS ucs;
173
174 if(input_cs){
175 CHARSET *cast_input_cs;
176
177 cast_input_cs = (CHARSET *) input_cs;
178
179 switch((ucs = (UCS) ucs4_cs_get(cast_input_cs, inputp, remaining_octets))){
180 case U8G_ENDSTRG:
181 case U8G_ENDSTRI:
182 return(CCONV_NEEDMORE);
183
184 default:
185 if(ucs & U8G_ERROR || ucs == UBOGON)
186 return(CCONV_BADCHAR);
187
188 return(ucs);
189 }
190 }
191 else{
192 size_t ret;
193 wchar_t w;
194
195 /*
196 * Warning: input_cs and remaining_octets are unused in this
197 * half of the if/else.
198 *
199 * Unfortunately, we can't tell the difference between a source string
200 * that is just not long enough and one that has characters that can't
201 * be converted even though it is long enough. We return NEEDMORE in both cases.
202 */
203 ret = mbstowcs(&w, (char *) (*inputp), 1);
204 if(ret == (size_t)(-1))
205 return(CCONV_NEEDMORE);
206 else{
207 ucs = (UCS) w;
208 return(ucs);
209 }
210 }
211 }
212
213
214 void
set_locale_charmap(char * charmap)215 set_locale_charmap(char *charmap)
216 {
217 if(charmap){
218 strncpy(locale_charmap, charmap, sizeof(locale_charmap));
219 locale_charmap[sizeof(locale_charmap)-1] = '\0';
220 }
221 else
222 locale_charmap[0] = '\0';
223 }
224
225
226 /*
227 * This ensures that the string is UTF-8. If str is already a UTF-8 string,
228 * NULL is returned. Otherwise, an allocated string which is UTF-8 is returned.
229 * The caller is responsible for freeing the returned value.
230 *
231 * Args str -- the string to convert
232 */
233 char *
convert_to_utf8(char * str,char * fromcharset,int flags)234 convert_to_utf8(char *str, char *fromcharset, int flags)
235 {
236 char *ret = NULL;
237 char *fcharset;
238 SIZEDTEXT src, result;
239 const CHARSET *cs = NULL;
240 int try;
241
242 src.data = (unsigned char *) str;
243 src.size = strlen(str);
244
245 /* already UTF-8, return NULL */
246 if(!(flags & CU8_NOINFER)
247 && (cs = utf8_infercharset(&src))
248 && (cs->type == CT_ASCII || cs->type == CT_UTF8))
249 return(ret);
250
251 try = 1;
252 while(try < 5){
253 switch(try){
254 case 1:
255 fcharset = fromcharset;
256 if(fcharset && strucmp("UTF-8", fcharset) != 0)
257 break; /* give it a try */
258 else
259 try++; /* fall through */
260
261 case 2:
262 if(!(flags & CU8_NOINFER)){
263 fcharset = cs ? cs->name : NULL;
264 if(fcharset && strucmp("UTF-8", fcharset) != 0)
265 break;
266 else
267 try++; /* fall through */
268 }
269 else
270 try++; /* fall through */
271
272 case 3:
273 fcharset = locale_charmap;
274 if(fcharset && strucmp("UTF-8", fcharset) != 0)
275 break;
276 else
277 try++; /* fall through */
278
279 default:
280 fcharset = "ISO-8859-1"; /* this will "work" */
281 break;
282 }
283
284 memset(&result, 0, sizeof(result));
285
286 if(fcharset && utf8_text(&src, fcharset, &result, 0L)){
287 if(!(result.size == src.size && result.data == src.data)){
288 ret = (char *) fs_get((result.size+1) * sizeof(char));
289 strncpy(ret, (char *) result.data, result.size);
290 ret[result.size] = '\0';
291 }
292 /* else no conversion necessary */
293
294 if(result.data && result.data != src.data)
295 fs_give((void **) &result.data);
296 result.size = 0;
297
298 return(ret);
299 }
300
301 try++;
302 }
303
304 /* won't make it to here */
305 return(ret);
306 }
307
308
309 /*
310 * Convert from UTF-8 to user's locale charset.
311 * This actually uses the wtomb routine to do the conversion, and that
312 * relies on setup_for_input_output having been called.
313 * If no conversion is necessary, NULL is returned, otherwise an allocated
314 * string in the locale charset is returned and the caller is responsible
315 * for freeing it.
316 */
317 char *
convert_to_locale(char * utf8str)318 convert_to_locale(char *utf8str)
319 {
320 #define CHNK 500
321 char *inp, *ret = NULL;
322 CBUF_S cb;
323 int alloced;
324 size_t i = 0;
325
326 if(native_utf8 || !utf8str || !utf8str[0])
327 return(NULL);
328
329 cb.cbuf[0] = '\0';
330 cb.cbufp = cb.cbufend = cb.cbuf;
331 inp = utf8str;
332
333 alloced = CHNK;
334 ret = (char *) fs_get(alloced * sizeof(char));
335
336 /*
337 * There's gotta be a better way to do this but utf8_to_locale was
338 * available and everything looks like a nail when all you have
339 * is a hammer.
340 */
341 while(*inp){
342 /*
343 * We're placing the outgoing stream of characters in ret, a multi-byte
344 * array of characters in the user's locale charset. See if there is
345 * enough room for the next wide characters worth of output chars
346 * and allocate more space if not.
347 */
348 if((alloced - i) < MAX(MB_LEN_MAX,32)){
349 alloced += CHNK;
350 fs_resize((void **) &ret, alloced * sizeof(char));
351 }
352
353 i += utf8_to_locale((int) *inp++, &cb,
354 (unsigned char *) &ret[i], alloced - i);
355 }
356
357 fs_resize((void **) &ret, i + 1);
358
359 ret[i] = '\0';
360
361 return(ret);
362 }
363
364
/*
 * Pass in a stream of UTF-8 characters in 'c' and return obuf
 * filled in with multi-byte characters. The return value is the
 * number of valid characters in obuf to be used.
 *
 * Args          c -- the next octet of the UTF-8 input stream
 *              cb -- holds the octets of a not yet complete character
 *                    between calls; cb->cbufp points just past the
 *                    octets collected so far
 *            obuf -- receives the converted bytes
 *       obuf_size -- NOTE(review): never consulted in this function, so
 *                    the caller has to supply a buffer that is big enough
 *
 * Returns 0 if cb or cb->cbufp is NULL, or while the character is still
 * incomplete.
 */
int
utf8_to_locale(int c, CBUF_S *cb, unsigned char obuf[], size_t obuf_size)
{
    int outchars = 0;

    if(!(cb && cb->cbufp))
      return(0);

    /* is there still room in the accumulation buffer for this octet? */
    if(cb->cbufp < cb->cbuf+sizeof(cb->cbuf)){
        unsigned char *inputp;
        unsigned long remaining_octets;
        UCS ucs;

        /* append the octet and try to decode everything collected so far */
        *(cb->cbufp)++ = (unsigned char) c;
        inputp = cb->cbuf;
        remaining_octets = (cb->cbufp - cb->cbuf) * sizeof(unsigned char);
        ucs = (UCS) utf8_get(&inputp, &remaining_octets);

        switch(ucs){
          case U8G_ENDSTRG:	/* incomplete character, wait */
          case U8G_ENDSTRI:	/* incomplete character, wait */
            break;

          default:
            if(ucs & U8G_ERROR || ucs == UBOGON){
                /*
                 * None of these cases is supposed to happen. If it
                 * does happen then the input stream isn't UTF-8
                 * so something is wrong. Treat each character in the
                 * input buffer as a separate error character and
                 * print a '?' for each.
                 */
                for(inputp = cb->cbuf; inputp < cb->cbufp; inputp++)
                  obuf[outchars++] = '?';

                cb->cbufp = cb->cbuf;
            }
            else{
                if(ucs >= 0x80 && wcellwidth(ucs) < 0){
                    /*
                     * This happens when we have a UTF-8 character that
                     * we aren't able to print in our locale. For example,
                     * if the locale is setup with the terminal
                     * expecting ISO-8859-1 characters then there are
                     * lots of UTF-8 characters that can't be printed.
                     * Print a '?' instead.
                     */
                    obuf[outchars++] = '?';
                }
                else{
                    /*
                     * Convert the ucs into the multibyte
                     * character that corresponds to the
                     * ucs in the users locale.
                     */
                    outchars = wtomb((char *) obuf, ucs);
                    if(outchars < 0){
                        obuf[0] = '?';
                        outchars = 1;
                    }
                }

                /* update the input buffer */
                if(inputp >= cb->cbufp)	/* this should be the case */
                  cb->cbufp = cb->cbuf;
                else{			/* extra chars for some reason? */
                    unsigned char *q, *newcbufp;

                    /* slide the unconsumed octets down to the front of cbuf */
                    newcbufp = (cb->cbufp - inputp) + cb->cbuf;
                    q = cb->cbuf;
                    while(inputp < cb->cbufp)
                      *q++ = *inputp++;

                    cb->cbufp = newcbufp;
                }
            }

            break;
        }
    }
    else{			/* error */
        /* accumulation buffer is full: emit one '?' and drop what was collected */
        obuf[0] = '?';
        outchars = 1;
        cb->cbufp = cb->cbuf;	/* start over */
    }

    return(outchars);
}
458
459
460 /*
461 * Returns the screen cells width of the UCS-4 string argument.
462 * The source string is zero terminated.
463 */
464 unsigned
ucs4_str_width(UCS * ucsstr)465 ucs4_str_width(UCS *ucsstr)
466 {
467 unsigned width = 0;
468 int w;
469
470 if(ucsstr)
471 while(*ucsstr){
472 w = wcellwidth(*ucsstr++);
473 if(w != U4W_CTLSRGT)
474 width += (w < 0 ? 1 : w);
475 }
476
477 return width;
478 }
479
480
481 /*
482 * Returns the screen cells width of the UCS-4 string argument
483 * from ucsstr[a] through (inclusive) ucsstr[b].
484 * No checking is done to make sure a starts in the middle
485 * of a UCS-4 array.
486 */
487 unsigned
ucs4_str_width_a_to_b(UCS * ucsstr,int a,int b)488 ucs4_str_width_a_to_b(UCS *ucsstr, int a, int b)
489 {
490 unsigned width = 0;
491 int i, w;
492
493 if(ucsstr)
494 for(i = a; i <= b && ucsstr[i]; i++){
495 w = wcellwidth(ucsstr[i]);
496 if(w != U4W_CTLSRGT)
497 width += (w < 0 ? 1 : w);
498 }
499
500 return width;
501 }
502
503
504 /*
505 * Returns the screen cells width of the UCS-4 string argument
506 * from ustart through (exclusive) uend.
507 * No checking is done to make sure it starts in the middle
508 * of a UCS-4 array.
509 */
510 unsigned
ucs4_str_width_ptr_to_ptr(UCS * ustart,UCS * uend)511 ucs4_str_width_ptr_to_ptr(UCS *ustart, UCS *uend)
512 {
513 UCS *u;
514 unsigned width = 0;
515 int w;
516
517 if(!ustart)
518 return width;
519
520 if(ustart)
521 for(u = ustart; u < uend; u++){
522 w = wcellwidth(*u);
523 if(w != U4W_CTLSRGT)
524 width += (w < 0 ? 1 : w);
525 }
526
527 return(width);
528 }
529
530
531 /*
532 * Return the largest possible pointer into ucs4str so that the width
533 * of the string from ucs4str to the pointer (exclusive)
534 * is maxwidth or less. Also stops at a null character.
535 */
536 UCS *
ucs4_particular_width(UCS * ucs4str,int maxwidth)537 ucs4_particular_width(UCS *ucs4str, int maxwidth)
538 {
539 UCS *u;
540 int w_consumed = 0, w, done = 0;
541
542 u = ucs4str;
543
544 if(u)
545 while(!done && *u && w_consumed <= maxwidth){
546 w = wcellwidth(*u);
547 w = (w >= 0 ? w : 1);
548 if(w_consumed + w <= maxwidth){
549 w_consumed += w;
550 ++u;
551 }
552 else
553 ++done;
554 }
555
556 return(u);
557 }
558
559
560 /*
561 * Convert and copy a UTF-8 string into a UCS-4 NULL
562 * terminated array. Just like cpystr only it converts
563 * from UTF-8 to UCS-4.
564 *
565 * Returned UCS-4 string needs to be freed by caller.
566 */
567 UCS *
utf8_to_ucs4_cpystr(char * utf8src)568 utf8_to_ucs4_cpystr(char *utf8src)
569 {
570 size_t retsize;
571 UCS *ret = NULL;
572 UCS ucs;
573 unsigned long remaining_octets;
574 unsigned char *readptr;
575 size_t arrayindex;
576
577 /*
578 * We don't know how big to allocate the return array
579 * because variable numbers of octets in the src array
580 * will combine to make UCS-4 characters. The number of
581 * UCS-4 characters is less than or equal to the number
582 * of src characters, though.
583 */
584
585 if(!utf8src)
586 return NULL;
587
588 retsize = strlen(utf8src) + 1;
589
590 ret = (UCS *) fs_get(retsize * sizeof(*ret));
591 memset(ret, 0, retsize * sizeof(*ret));
592
593 readptr = (unsigned char *) utf8src;
594 remaining_octets = retsize-1;
595 arrayindex = 0;
596
597 while(remaining_octets > 0 && *readptr && arrayindex < retsize-1){
598 ucs = (UCS) utf8_get(&readptr, &remaining_octets);
599
600 if(ucs & U8G_ERROR || ucs == UBOGON)
601 remaining_octets = 0;
602 else
603 ret[arrayindex++] = ucs;
604 }
605
606 ret[arrayindex] = '\0';
607
608 /* get rid of excess size */
609 if(arrayindex+1 < retsize)
610 fs_resize((void **) &ret, (arrayindex + 1) * sizeof(*ret));
611
612 return ret;
613 }
614
615
616 /*
617 * Convert and copy a UCS-4 zero-terminated array into a UTF-8 NULL
618 * terminated string. Just like cpystr only it converts
619 * from UCS-4 to UTF-8.
620 *
621 * Returned UTF-8 string needs to be freed by caller.
622 */
623 char *
ucs4_to_utf8_cpystr(UCS * ucs4src)624 ucs4_to_utf8_cpystr(UCS *ucs4src)
625 {
626 unsigned char *ret = NULL;
627 unsigned char *writeptr;
628 int i;
629
630 if(!ucs4src)
631 return NULL;
632
633 /*
634 * Over-allocate and then resize at the end.
635 */
636
637 /* count characters in source */
638 for(i = 0; ucs4src[i]; i++)
639 ;
640
641 ret = (unsigned char *) fs_get((6*i + 1) * sizeof(*ret));
642 memset(ret, 0, (6*i + 1) * sizeof(*ret));
643
644 writeptr = ret;
645 for(i = 0; ucs4src[i]; i++)
646 writeptr = utf8_put(writeptr, (unsigned long) ucs4src[i]);
647
648 /* get rid of excess size */
649 fs_resize((void **) &ret, (writeptr - ret + 1) * sizeof(*ret));
650
651 return ((char *) ret);
652 }
653
654
655 /*
656 * Similar to above but copy a fixed number of source
657 * characters instead of going until null terminator.
658 */
659 char *
ucs4_to_utf8_cpystr_n(UCS * ucs4src,int ucs4src_len)660 ucs4_to_utf8_cpystr_n(UCS *ucs4src, int ucs4src_len)
661 {
662 unsigned char *ret = NULL;
663 unsigned char *writeptr;
664 int i;
665
666 if(!ucs4src)
667 return NULL;
668
669 /*
670 * Over-allocate and then resize at the end.
671 */
672
673 ret = (unsigned char *) fs_get((6*ucs4src_len + 1) * sizeof(*ret));
674 memset(ret, 0, (6*ucs4src_len + 1) * sizeof(*ret));
675
676 writeptr = ret;
677 for(i = 0; i < ucs4src_len; i++)
678 writeptr = utf8_put(writeptr, (unsigned long) ucs4src[i]);
679
680 /* get rid of excess size */
681 fs_resize((void **) &ret, (writeptr - ret + 1) * sizeof(*ret));
682
683 return ((char *) ret);
684 }
685
686 /*
687 * Similar to above but copy what is possible to a
688 * string of a size at most the given retlen.
689 */
690 char *
ucs4_to_utf8_n_cpystr(UCS * ucs4src,int retlen)691 ucs4_to_utf8_n_cpystr(UCS *ucs4src, int retlen)
692 {
693 unsigned char *ret = NULL;
694 unsigned char *writeptr;
695 int i, oldlen, len;
696
697 if(!ucs4src)
698 return NULL;
699
700 /*
701 * Over-allocate and then resize at the end.
702 */
703
704 /* count characters in source */
705 for(i = 0; ucs4src[i]; i++)
706 ;
707
708 ret = (unsigned char *) fs_get((6*i + 1) * sizeof(unsigned char));
709 memset(ret, 0, (6*i + 1) * sizeof(unsigned char));
710
711 writeptr = ret;
712 oldlen = len = 0;
713 for(i = 0; ucs4src[i] && (len < retlen); i++){
714 oldlen = len;
715 writeptr = utf8_put(writeptr, (unsigned long) ucs4src[i]);
716 len = strlen(ret);
717 }
718 if(len > retlen){
719 ret[oldlen] = '\0';
720 len = oldlen;
721 }
722
723 /* get rid of excess size */
724 fs_resize((void **) &ret, (len + 1) * sizeof(unsigned char));
725
726 return ((char *) ret);
727 }
728
729
730 #ifdef _WINDOWS
731 /*
732 * Convert a UTF-8 argument into an LPTSTR version
733 * of that argument. The result is allocated here
734 * and should be freed by the caller.
735 */
736 LPTSTR
utf8_to_lptstr(LPSTR arg_utf8)737 utf8_to_lptstr(LPSTR arg_utf8)
738 {
739 int lptstr_len;
740 LPTSTR lptstr_ret = NULL;
741
742 lptstr_len = MultiByteToWideChar( CP_UTF8, 0, arg_utf8, -1, NULL, 0 );
743 if(lptstr_len > 0)
744 {
745 lptstr_ret = (LPTSTR)fs_get(lptstr_len * sizeof(TCHAR));
746 lptstr_len = MultiByteToWideChar( CP_UTF8, 0,
747 arg_utf8, -1, lptstr_ret, lptstr_len );
748 }
749
750 if(!lptstr_len)
751 {
752 /* check GetLastError()? */
753 lptstr_ret = (LPTSTR)fs_get(sizeof(TCHAR));
754 lptstr_ret[0] = 0;
755 }
756
757 return lptstr_ret;
758 }
759
760
761 /*
762 * Convert an LPTSTR argument into a UTF-8 version
763 * of that argument. The result is allocated here
764 * and should be freed by the caller.
765 */
766 LPSTR
lptstr_to_utf8(LPTSTR arg_lptstr)767 lptstr_to_utf8(LPTSTR arg_lptstr)
768 {
769 int utf8str_len;
770 LPSTR utf8str_ret = NULL;
771
772 utf8str_len = WideCharToMultiByte( CP_UTF8, 0, arg_lptstr, -1, NULL, 0, NULL, NULL );
773 if(utf8str_len > 0)
774 {
775 utf8str_ret = (LPSTR)fs_get(utf8str_len * sizeof(CHAR));
776 utf8str_len = WideCharToMultiByte( CP_UTF8, 0,
777 arg_lptstr, -1, utf8str_ret, utf8str_len, NULL, NULL );
778 }
779
780 if(!utf8str_len)
781 {
782 /* check GetLastError()? */
783 utf8str_ret = (LPSTR)fs_get(sizeof(CHAR));
784 utf8str_ret[0] = 0;
785 }
786
787 return utf8str_ret;
788 }
789
790
791 /*
792 * Convert a UCS4 argument into an LPTSTR version
793 * of that argument. The result is allocated here
794 * and should be freed by the caller.
795 */
796 LPTSTR
ucs4_to_lptstr(UCS * arg_ucs4)797 ucs4_to_lptstr(UCS *arg_ucs4)
798 {
799 LPTSTR ret_lptstr = NULL;
800 size_t len;
801 size_t i;
802
803 if(arg_ucs4){
804 len = ucs4_strlen(arg_ucs4);
805 ret_lptstr = (LPTSTR) fs_get((len+1) * sizeof(TCHAR));
806 /* bogus conversion ignores UTF-16 */
807 for(i = 0; i < len; i++)
808 ret_lptstr[i] = arg_ucs4[i];
809
810 ret_lptstr[len] = '\0';
811 }
812
813 return(ret_lptstr);
814 }
815
816
817 /*
818 * Convert an LPTSTR argument into a UCS4 version
819 * of that argument. The result is MemAlloc'd here
820 * and should be freed by the caller.
821 */
822 UCS *
lptstr_to_ucs4(LPTSTR arg_lptstr)823 lptstr_to_ucs4(LPTSTR arg_lptstr)
824 {
825 UCS *ret_ucs4 = NULL;
826 size_t len;
827 size_t i;
828
829 if(arg_lptstr){
830 len = _tcslen(arg_lptstr);
831 ret_ucs4 = (UCS *) fs_get((len+1)*sizeof(UCS));
832 /* bogus conversion ignores UTF-16 */
833 for(i = 0; i < len; i++)
834 ret_ucs4[i] = arg_lptstr[i];
835
836 ret_ucs4[len] = '\0';
837 }
838
839 return(ret_ucs4);
840 }
841
842 #endif /* _WINDOWS */
843
844
/*
 * Pass in a stream of UTF-8 characters 1-at-a-time in 'c' and return obuf
 * 1-at-a-time filled in with UCS characters. The return value is the
 * number of valid characters in obuf to be used. It can only
 * be 1 or 0 characters since we're only getting one UTF-8 character
 * at a time.
 *
 * Args          c -- the next octet of the UTF-8 input stream
 *              cb -- holds the octets of a not yet complete character
 *                    between calls; cb->cbufp points just past the
 *                    octets collected so far
 *            obuf -- receives the completed UCS character
 *       obufwidth -- if non-NULL, receives the width computed for the
 *                    output (0 while the character is incomplete)
 *
 * Returns 0 if cb or cb->cbufp is NULL, or while the character is still
 * incomplete.
 */
int
utf8_to_ucs4_oneatatime(int c, CBUF_S *cb, UCS *obuf, int *obufwidth)
{
    int width = 0, outchars = 0;

    if(!(cb && cb->cbufp))
      return(0);

    /* is there still room in the accumulation buffer for this octet? */
    if(cb->cbufp < cb->cbuf+sizeof(cb->cbuf)){
        unsigned char *inputp;
        unsigned long remaining_octets;
        UCS ucs;

        /* append the octet and try to decode everything collected so far */
        *cb->cbufp++ = (unsigned char) c;
        inputp = cb->cbuf;
        remaining_octets = (cb->cbufp - cb->cbuf) * sizeof(unsigned char);
        ucs = (UCS) utf8_get(&inputp, &remaining_octets);

        switch(ucs){
          case U8G_ENDSTRG:	/* incomplete character, wait */
          case U8G_ENDSTRI:	/* incomplete character, wait */
            break;

          default:
            if(ucs & U8G_ERROR || ucs == UBOGON){
                /*
                 * None of these cases is supposed to happen. If it
                 * does happen then the input stream isn't UTF-8
                 * so something is wrong.
                 */
                outchars++;
                *obuf = '?';
                cb->cbufp = cb->cbuf;
                width = 1;
            }
            else{
                outchars++;
                /* printable ASCII is one cell; anything below 0x20 keeps width 0 */
                if(ucs < 0x80 && ucs >= 0x20)
                  width = 1;

                if(ucs >= 0x80 && (width=wcellwidth(ucs)) < 0){
                    /*
                     * This happens when we have a UTF-8 character that
                     * we aren't able to print in our locale. For example,
                     * if the locale is setup with the terminal
                     * expecting ISO-8859-1 characters then there are
                     * lots of UTF-8 characters that can't be printed.
                     * Print a '?' instead.
                     * Don't think this should happen in Windows.
                     *
                     * NOTE(review): width is left at the negative value
                     * from wcellwidth here, unlike the other '?' paths
                     * which report a width of 1 -- confirm callers
                     * expect that.
                     */
                    *obuf = '?';
                }
                else{
                    *obuf = ucs;
                }

                /* update the input buffer */
                if(inputp >= cb->cbufp)	/* this should be the case */
                  cb->cbufp = cb->cbuf;
                else{			/* extra chars for some reason? */
                    unsigned char *q, *newcbufp;

                    /* slide the unconsumed octets down to the front of cbuf */
                    newcbufp = (cb->cbufp - inputp) + cb->cbuf;
                    q = cb->cbuf;
                    while(inputp < cb->cbufp)
                      *q++ = *inputp++;

                    cb->cbufp = newcbufp;
                }
            }

            break;
        }
    }
    else{			/* error */
        /* accumulation buffer is full: emit one '?' and drop what was collected */
        *obuf = '?';
        outchars = 1;
        width = 1;
        cb->cbufp = cb->cbuf;	/* start over */
    }

    if(obufwidth)
      *obufwidth = width;

    return(outchars);
}
938
939
940 /*
941 * Return an allocated copy of a zero-terminated UCS-4 string.
942 */
943 UCS *
ucs4_cpystr(UCS * ucs4src)944 ucs4_cpystr(UCS *ucs4src)
945 {
946 size_t arraysize;
947 UCS *ret = NULL;
948 size_t i;
949
950 if(!ucs4src)
951 return NULL;
952
953 arraysize = ucs4_strlen(ucs4src);
954
955 ret = (UCS *) fs_get((arraysize+1) * sizeof(*ret));
956 memset(ret, 0, (arraysize+1) * sizeof(*ret));
957
958 for(i = 0; i < arraysize; i++)
959 ret[i] = ucs4src[i];
960
961 return ret;
962 }
963
964
965 UCS *
ucs4_strncpy(UCS * ucs4dst,UCS * ucs4src,size_t n)966 ucs4_strncpy(UCS *ucs4dst, UCS *ucs4src, size_t n)
967 {
968 size_t i;
969
970 if(ucs4src && ucs4dst){
971 for(i = 0; i < n; i++){
972 ucs4dst[i] = ucs4src[i];
973 if(ucs4dst[i] == '\0')
974 break;
975 }
976 }
977
978 return ucs4dst;
979 }
980
981
982 UCS *
ucs4_strncat(UCS * ucs4dst,UCS * ucs4src,size_t n)983 ucs4_strncat(UCS *ucs4dst, UCS *ucs4src, size_t n)
984 {
985 size_t i;
986 UCS *u;
987
988 if(ucs4src && ucs4dst){
989 for(u = ucs4dst; *u; u++)
990 ;
991
992 for(i = 0; i < n; i++){
993 u[i] = ucs4src[i];
994 if(u[i] == '\0')
995 break;
996 }
997
998 if(i == n)
999 u[i] = '\0';
1000 }
1001
1002 return ucs4dst;
1003 }
1004
1005
1006 /*
1007 * Like strlen only this returns the number of non-zero characters
1008 * in a zero-terminated UCS-4 array.
1009 */
1010 size_t
ucs4_strlen(UCS * ucs4str)1011 ucs4_strlen(UCS *ucs4str)
1012 {
1013 size_t i = 0;
1014
1015 if(ucs4str)
1016 while(ucs4str[i])
1017 i++;
1018
1019 return(i);
1020 }
1021
1022
1023 int
ucs4_strcmp(UCS * s1,UCS * s2)1024 ucs4_strcmp(UCS *s1, UCS *s2)
1025 {
1026 for(; *s1 == *s2; s1++, s2++)
1027 if(*s1 == '\0')
1028 return 0;
1029
1030 return((*s1 < *s2) ? -1 : 1);
1031 }
1032
1033
1034 UCS *
ucs4_strchr(UCS * s,UCS c)1035 ucs4_strchr(UCS *s, UCS c)
1036 {
1037 if(!s)
1038 return NULL;
1039
1040 while(*s && *s != c)
1041 s++;
1042
1043 if(*s || !c)
1044 return s;
1045 else
1046 return NULL;
1047 }
1048
1049
1050 UCS *
ucs4_strrchr(UCS * s,UCS c)1051 ucs4_strrchr(UCS *s, UCS c)
1052 {
1053 UCS *ret = NULL;
1054
1055 if(!s)
1056 return ret;
1057
1058 while(*s){
1059 if(*s == c)
1060 ret = s;
1061
1062 s++;
1063 }
1064
1065 return ret;
1066 }
1067
1068
1069 /*
1070 * Returns the screen cells width of the UTF-8 string argument.
1071 */
1072 unsigned
utf8_width(char * str)1073 utf8_width(char *str)
1074 {
1075 unsigned width = 0;
1076 int this_width;
1077 UCS ucs;
1078 unsigned long remaining_octets;
1079 char *readptr;
1080
1081 if(!(str && *str))
1082 return(width);
1083
1084 readptr = str;
1085 remaining_octets = readptr ? strlen(readptr) : 0;
1086
1087 while(remaining_octets > 0 && *readptr){
1088
1089 ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1090
1091 if(ucs & U8G_ERROR || ucs == UBOGON){
1092 /*
1093 * This should not happen, but do something to handle it anyway.
1094 * Treat each character as a single width character, which is what should
1095 * probably happen when we actually go to write it out.
1096 */
1097 remaining_octets--;
1098 readptr++;
1099 this_width = 1;
1100 }
1101 else{
1102 this_width = wcellwidth(ucs);
1103
1104 /*
1105 * If this_width is -1 that means we can't print this character
1106 * with our current locale. Writechar will print a '?'.
1107 */
1108 if(this_width < 0)
1109 this_width = 1;
1110 }
1111
1112 width += (unsigned) this_width;
1113 }
1114
1115 return(width);
1116 }
1117
1118
1119 /*
1120 * Copy UTF-8 characters from src into dst.
1121 * This is intended to be used if you want to truncate a string at
1122 * the start instead of the end. For example, you have a long string
1123 * like
1124 * this_is_a_long_string
1125 * but not enough space to fit it into a particular field. You want to
1126 * end up with
1127 * s_a_long_string
1128 * where that fits in a particular width. Perhaps you'd use this with ...
1129 * to get
1130 * ...s_a_long_string
1131 * This right adjusts the end of the string in the width space and
1132 * cuts it off at the start. If there is enough width for the whole
1133 * string it will copy the string into dst with no padding.
1134 *
1135 * Copy enough characters so that the result will have screen width of
1136 * want_width screen cells in current locale.
1137 *
1138 * Dstlen is the available space in dst. No more than dstlen bytes will be written
1139 * to dst. This is just for protection, it shouldn't be relied on to
1140 * do anything useful. Dstlen should be large enough. Otherwise you'll get
1141 * characters truncated in the middle or something like that.
1142 *
1143 * Returned value is the number of bytes written to dst, not including
1144 * the possible terminating null.
1145 *
1146 * If we can't hit want_width exactly because of double width characters
1147 * then we will pad the end of the string with space in order to make
1148 * the width exact.
1149 */
1150 size_t
utf8_to_width_rhs(char * dst,char * src,size_t dstlen,unsigned want_width)1151 utf8_to_width_rhs(char *dst, /* destination buffer */
1152 char *src, /* source string */
1153 size_t dstlen, /* space in dest */
1154 unsigned want_width) /* desired screen width */
1155 {
1156 int this_width;
1157 unsigned width_consumed = 0;
1158 UCS ucs;
1159 unsigned long remaining_octets;
1160 char *readptr, *goodreadptr, *savereadptr, *endptr;
1161 size_t nb = 0;
1162
1163 if(!src){
1164 if(dstlen > 0)
1165 dst[0] = '\0';
1166
1167 return nb;
1168 }
1169
1170 /*
1171 * Start at the end of the source string and go backwards until we
1172 * get to the desired width, but not more than the width.
1173 */
1174 readptr = src + strlen(src);
1175 endptr = readptr;
1176 goodreadptr = readptr;
1177 width_consumed = 0;
1178 savereadptr = readptr;
1179
1180 for(readptr = savereadptr-1; readptr >= src && width_consumed < want_width && (endptr - readptr) < dstlen;
1181 readptr = savereadptr-1){
1182
1183 savereadptr = readptr;
1184 remaining_octets = goodreadptr - readptr;
1185 ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1186
1187 /*
1188 * Handling the error case is tough because an error will be the normal thing that
1189 * happens as we back through the string. So we're just going to punt on the
1190 * error for now.
1191 */
1192 if(!(ucs & U8G_ERROR || ucs == UBOGON)){
1193 if(remaining_octets > 0){
1194 /*
1195 * This means there are some bad octets after this good
1196 * character so things are not going to work out well.
1197 * Bail out.
1198 */
1199 savereadptr = src; /* we're done */
1200 }
1201 else{
1202 this_width = wcellwidth(ucs);
1203
1204 if(this_width < 0)
1205 this_width = 1;
1206
1207 if(width_consumed + (unsigned) this_width <= want_width){ /* ok */
1208 width_consumed += (unsigned) this_width;
1209 goodreadptr = savereadptr;
1210 }
1211 else
1212 savereadptr = src; /* we're done */
1213 }
1214 }
1215 }
1216
1217 /*
1218 * Copy characters from goodreadptr to endptr into dst.
1219 */
1220 nb = MIN(endptr-goodreadptr, dstlen-1);
1221 strncpy(dst, goodreadptr, nb);
1222 dst[nb] = '\0';
1223
1224 /*
1225 * Pad out with spaces in order to hit width exactly.
1226 */
1227 while(width_consumed < want_width && nb < dstlen-1){
1228 dst[nb++] = ' ';
1229 dst[nb] = '\0';
1230 width_consumed++;
1231 }
1232
1233 return nb;
1234 }
1235
1236
1237 /*
1238 * The arguments being converted are UTF-8 strings.
1239 * This routine attempts to make it possible to use screen cell
1240 * widths in a format specifier. In a one-byte per screen cell
1241 * world we might have used %10.10s to cause a string to occupy
1242 * 10 screen positions. Since the width and precision are really
1243 * referring to numbers of bytes instead of screen positions that
1244 * won't work with UTF-8 input. We emulate that behavior with
1245 * the format string %w. %m.nw means to use the m and n as
1246 * screen width indicators instead of bytes indicators.
1247 *
1248 * There is no reason to use this routine unless you want to use
 * min field width or precision with the specifier. A plain %w without
1250 * widths is equivalent exactly to a plain %s in a regular printf.
1251 *
1252 * Double-width characters complicate things. It may not be possible
1253 * to satisfy the request exactly. For example, %3w for an input
1254 * string that is made up of two double-width characters.
1255 * This routine will arbitrarily use a trailing space character if
1256 * needed to make the width come out correctly where a half of a
1257 * double-width character would have been needed. We'll see how
1258 * that works for us.
1259 *
1260 * %w only works for strings (it's a %s replacement).
1261 *
1262 * Buffer overflow is handled by the size argument. %.30s will work
1263 * to limit a particular string to 30 bytes, but you lose that
1264 * ability with %w, since it may write more than precision bytes
1265 * in order to get to the desired width. It is best to choose
1266 * size large enough so that it doesn't come into play, otherwise
1267 * it may be possible to get partial UTF-8 characters because of
1268 * the truncation.
1269 *
1270 * The return value isn't quite the same as the return value
1271 * of snprintf. It is the number of bytes written, not counting
1272 * the trailing null, just like snprintf. However, if it is
1273 * truncated due to size then the output is size, not the
1274 * number of characters that would have been written.
1275 */
1276 int
utf8_snprintf(char * dest,size_t size,char * fmt,...)1277 utf8_snprintf(char *dest, size_t size, char *fmt, ...)
1278 {
1279 char newfmt[100], buf[20], *q, *pdest, *width_str, *end;
1280 char *start_of_specifier;
1281 char *input_str;
1282 int int_arg;
1283 double double_arg;
1284 void *ptr_arg;
1285 unsigned got_width;
1286 int more_flags, ret, w;
1287 int min_field_width, field_precision, modifier;
1288 int flags_minus, flags_plus, flags_space, flags_zero, flags_pound;
1289 va_list args;
1290
1291 newfmt[0] = '\0';
1292 q = newfmt;
1293
1294 pdest = dest;
1295
1296 #define IS_ROOM_IN_DEST(n_more_chars) \
1297 ((pdest - dest + (n_more_chars) <= size) ? 1 : 0)
1298
1299 /*
1300 * Strategy: Look through the fmt string for %w's. Replace the
1301 * %w's in the format string with %s's but with possibly different
1302 * width and precision arguments which will make it come out right.
1303 * Then call the regular system vsnprintf with the altered format
1304 * string but same arguments.
1305 *
1306 * That would be nice but it doesn't quite work. Why? Because a
1307 * %*w will need to have the value in the integer argument the *
1308 * refers to modified. Can't do it as far as I can tell. Or we could
1309 * remove the integer argument somehow before calling printf. Can't
1310 * do it. Or we could somehow add an additional conversion specifier
1311 * that caused nothing to be printed but ate up the integer arg.
1312 * Can't figure out how to do that either.
1313 *
1314 * Since we can't figure out how to do it, the alternative is to
1315 * construct the result one piece at a time, pasting together the
1316 * pieces from the different conversions.
1317 */
1318 va_start(args, fmt);
1319
1320 while(*fmt && IS_ROOM_IN_DEST(1)){
1321 if(*fmt == '%'){
1322 start_of_specifier = fmt++;
1323
1324 min_field_width = field_precision = -1;
1325 flags_minus = flags_plus = flags_space = flags_zero = flags_pound = 0;
1326
1327 /* flags */
1328 more_flags = 1;
1329 while(more_flags){
1330 switch(*fmt){
1331 case '-':
1332 flags_minus++;
1333 fmt++;
1334 break;
1335
1336 case '+':
1337 flags_plus++;
1338 fmt++;
1339 break;
1340
1341 case ' ':
1342 flags_space++;
1343 fmt++;
1344 break;
1345
1346 case '0':
1347 flags_zero++;
1348 fmt++;
1349 break;
1350
1351 case '#':
1352 flags_pound++;
1353 fmt++;
1354 break;
1355
1356 default:
1357 more_flags = 0;
1358 break;
1359 }
1360 }
1361
1362 /* minimum field width */
1363 if(*fmt == '*'){
1364 min_field_width = va_arg(args, int);
1365 fmt++;
1366 }
1367 else if(*fmt >= '0' && *fmt <= '9'){
1368 width_str = fmt;
1369 while (*fmt >= '0' && *fmt <= '9')
1370 fmt++;
1371
1372 strncpy(buf, width_str, MIN(fmt-width_str,sizeof(buf)));
1373 if(sizeof(buf) > fmt-width_str)
1374 buf[fmt-width_str] = '\0';
1375
1376 buf[sizeof(buf)-1] = '\0';
1377
1378 min_field_width = atoi(width_str);
1379 }
1380
1381 /* field precision */
1382 if(*fmt == '.'){
1383 fmt++;
1384 if(*fmt == '*'){
1385 field_precision = va_arg(args, int);
1386 fmt++;
1387 }
1388 else if(*fmt >= '0' && *fmt <= '9'){
1389 width_str = fmt;
1390 while (*fmt >= '0' && *fmt <= '9')
1391 fmt++;
1392
1393 strncpy(buf, width_str, MIN(fmt-width_str,sizeof(buf)));
1394 if(sizeof(buf) > fmt-width_str)
1395 buf[fmt-width_str] = '\0';
1396
1397 buf[sizeof(buf)-1] = '\0';
1398
1399 field_precision = atoi(width_str);
1400 }
1401 }
1402
1403 /* length modifier */
1404 if(*fmt == 'h' || *fmt == 'l' || *fmt == 'L')
1405 modifier = *fmt++;
1406
1407 /* conversion character */
1408 switch(*fmt){
1409 case 'w':
1410 /*
1411 * work with va_arg(char *) to figure out width
1412 * and precision needed to produce the screen width
1413 * and precision asked for in %w using some of the
1414 * utf8 width routines we have.
1415 */
1416
1417 input_str = va_arg(args, char *);
1418 if(field_precision >=0 || min_field_width >= 0)
1419 w = utf8_width(input_str);
1420
1421 if(field_precision >= 0){
1422 if(w <= field_precision)
1423 field_precision = -1; /* print it all */
1424 else{
1425 /*
1426 * We need to cut off some of the input_str
1427 * in this case.
1428 */
1429 end = utf8_count_forw_width(input_str, field_precision, &got_width);
1430 field_precision = (int) (end - input_str);
1431 /* new w with this field_precision */
1432 w = got_width;
1433 }
1434 }
1435
1436 /* need some padding */
1437 if(min_field_width >= 0)
1438 min_field_width = ((field_precision >= 0) ? field_precision : strlen(input_str)) +
1439 MAX(0, min_field_width - w);
1440
1441 /*
1442 * Now we just need to get the new format string
1443 * set correctly in newfmt.
1444 */
1445 q = newfmt;
1446 if(q-newfmt < sizeof(newfmt))
1447 *q++ = '%';
1448
1449 if(flags_minus && q-newfmt < sizeof(newfmt))
1450 *q++ = '-';
1451 if(flags_plus && q-newfmt < sizeof(newfmt))
1452 *q++ = '+';
1453 if(flags_space && q-newfmt < sizeof(newfmt))
1454 *q++ = ' ';
1455 if(flags_zero && q-newfmt < sizeof(newfmt))
1456 *q++ = '0';
1457 if(flags_pound && q-newfmt < sizeof(newfmt))
1458 *q++ = '#';
1459
1460 if(min_field_width >= 0){
1461 snprintf(buf, sizeof(buf), "%d", min_field_width);
1462 sstrncpy(&q, buf, sizeof(newfmt)-(q-newfmt));
1463 }
1464
1465 if(field_precision >= 0){
1466 if(q-newfmt < sizeof(newfmt))
1467 *q++ = '.';
1468
1469 snprintf(buf, sizeof(buf), "%d", field_precision);
1470 sstrncpy(&q, buf, sizeof(newfmt)-(q-newfmt));
1471 }
1472
1473 if(q-newfmt < sizeof(newfmt))
1474 *q++ = 's';
1475
1476 if(q-newfmt < sizeof(newfmt))
1477 *q++ = '\0';
1478
1479 snprintf(pdest, size - (pdest-dest), newfmt, input_str);
1480 pdest += strlen(pdest);
1481
1482 break;
1483
1484 case '\0':
1485 fmt--;
1486 break;
1487
1488 default:
1489 /* make a new format which leaves out the dynamic '*' arguments */
1490 q = newfmt;
1491 if(q-newfmt < sizeof(newfmt))
1492 *q++ = '%';
1493
1494 if(flags_minus && q-newfmt < sizeof(newfmt))
1495 *q++ = '-';
1496 if(flags_plus && q-newfmt < sizeof(newfmt))
1497 *q++ = '+';
1498 if(flags_space && q-newfmt < sizeof(newfmt))
1499 *q++ = ' ';
1500 if(flags_zero && q-newfmt < sizeof(newfmt))
1501 *q++ = '0';
1502 if(flags_pound && q-newfmt < sizeof(newfmt))
1503 *q++ = '#';
1504
1505 if(min_field_width >= 0){
1506 snprintf(buf, sizeof(buf), "%d", min_field_width);
1507 sstrncpy(&q, buf, sizeof(newfmt)-(q-newfmt));
1508 }
1509
1510 if(field_precision >= 0){
1511 if(q-newfmt < sizeof(newfmt))
1512 *q++ = '.';
1513
1514 snprintf(buf, sizeof(buf), "%d", field_precision);
1515 sstrncpy(&q, buf, sizeof(newfmt)-(q-newfmt));
1516 }
1517
1518 if(q-newfmt < sizeof(newfmt))
1519 *q++ = *fmt;
1520
1521 if(q-newfmt < sizeof(newfmt))
1522 *q++ = '\0';
1523
1524 switch(*fmt){
1525 case 'd': case 'i': case 'o':
1526 case 'x': case 'X': case 'u': case 'c':
1527 int_arg = va_arg(args, int);
1528 snprintf(pdest, size - (pdest-dest), newfmt, int_arg);
1529 pdest += strlen(pdest);
1530 break;
1531
1532 case 's':
1533 input_str = va_arg(args, char *);
1534 snprintf(pdest, size - (pdest-dest), newfmt, input_str);
1535 pdest += strlen(pdest);
1536 break;
1537
1538 case 'f': case 'e': case 'E':
1539 case 'g': case 'G':
1540 double_arg = va_arg(args, double);
1541 snprintf(pdest, size - (pdest-dest), newfmt, double_arg);
1542 pdest += strlen(pdest);
1543 break;
1544
1545 case 'p':
1546 ptr_arg = va_arg(args, void *);
1547 snprintf(pdest, size - (pdest-dest), newfmt, ptr_arg);
1548 pdest += strlen(pdest);
1549 break;
1550
1551 case '%':
1552 if(IS_ROOM_IN_DEST(1))
1553 *pdest++ = '%';
1554
1555 break;
1556
1557 default:
1558 /* didn't think of this type */
1559 assert(0);
1560 break;
1561 }
1562
1563 break;
1564 }
1565
1566 fmt++;
1567 }
1568 else{
1569 if(IS_ROOM_IN_DEST(1))
1570 *pdest++ = *fmt++;
1571 }
1572 }
1573
1574 ret = pdest - dest;
1575
1576 if(IS_ROOM_IN_DEST(1))
1577 *pdest++ = '\0';
1578
1579 va_end(args);
1580
1581 return ret;
1582 }
1583
1584
1585 /*
1586 * Copy UTF-8 characters from src into dst.
1587 * Copy enough characters so that the result will have (<=) screen width of
1588 * want_width screen cells in current locale.
1589 *
1590 * Dstlen is the available space in dst. No more than dstlen bytes will be written
1591 * to dst.
1592 *
1593 * Returned value is the number of bytes written to dst, not including
1594 * the possible terminating null.
1595 * Got_width is another returned value. It is the width in screen cells of
1596 * the string placed in dst. It will be the same as want_width if there
1597 * are enough characters in the src to do that and if the character widths
1598 * hit the width exactly. It will be less than want_width if we run out
1599 * of src characters or if the next character width would skip over the
1600 * width we want, because it is double width.
1601 *
1602 * Zero width characters are collected and included at the end of the string.
1603 * That is, if we make it to want_width but there is still a zero length
1604 * character sitting in src, we add that to dst. This might be an accent
1605 * or something like that.
1606 */
1607 size_t
utf8_to_width(char * dst,char * src,size_t dstlen,unsigned want_width,unsigned * got_width)1608 utf8_to_width(char *dst, /* destination buffer */
1609 char *src, /* source string */
1610 size_t dstlen, /* space in dst */
1611 unsigned want_width, /* desired screen width */
1612 unsigned *got_width) /* returned screen width in dst */
1613 {
1614 int this_width;
1615 unsigned width_consumed = 0;
1616 UCS ucs;
1617 unsigned long remaining_octets;
1618 char *writeptr, *readptr, *savereadptr, *endptr;
1619 int ran_out_of_space = 0;
1620
1621 readptr = src;
1622
1623 remaining_octets = readptr ? strlen(readptr) : 0;
1624
1625 writeptr = dst;
1626 endptr = writeptr + dstlen;
1627
1628 if(readptr && writeptr){
1629 while(width_consumed <= want_width && remaining_octets > 0 && writeptr < dst + dstlen && !ran_out_of_space){
1630 savereadptr = readptr;
1631 ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1632
1633 if(ucs & U8G_ERROR || ucs == UBOGON)
1634 remaining_octets = 0;
1635 else{
1636 this_width = wcellwidth(ucs);
1637
1638 /*
1639 * If this_width is -1 that means we can't print this character
1640 * with our current locale. Writechar will print a '?'.
1641 */
1642 if(this_width < 0)
1643 this_width = 1;
1644
1645 if(width_consumed + (unsigned) this_width <= want_width){
1646 /* append this utf8 character to dst if it will fit */
1647 if(writeptr + (readptr - savereadptr) < endptr){
1648 width_consumed += this_width;
1649 while(savereadptr < readptr)
1650 *writeptr++ = *savereadptr++;
1651 }
1652 else
1653 ran_out_of_space++; /* no more utf8 to dst */
1654 }
1655 else
1656 remaining_octets = 0; /* we're done */
1657 }
1658 }
1659
1660 if(writeptr < endptr)
1661 *writeptr = '\0';
1662 }
1663
1664 if(got_width)
1665 *got_width = width_consumed;
1666
1667 return(writeptr ? (writeptr - dst) : 0);
1668 }
1669
1670
1671 /*
1672 * Str is a UTF-8 string.
1673 * Count forward width screencell positions and return a pointer to the
1674 * end of the string that is width wide.
1675 * The returned pointer points at the next character (where the null would
1676 * be placed).
1677 *
1678 * Got_width is another returned value. It is the width in screen cells of
1679 * the string from str to the returned pointer. It will be the same as
1680 * want_width if there are enough characters in the str to do that
1681 * and if the character widths hit the width exactly. It will be less
1682 * than want_width if we run out of characters or if the next character
1683 * width would skip over the width we want, because it is double width.
1684 */
1685 char *
utf8_count_forw_width(char * str,unsigned want_width,unsigned * got_width)1686 utf8_count_forw_width(char *str, unsigned want_width, unsigned *got_width)
1687 {
1688 int this_width;
1689 unsigned width_consumed = 0;
1690 UCS ucs;
1691 unsigned long remaining_octets;
1692 char *readptr;
1693 char *retptr;
1694
1695 retptr = readptr = str;
1696
1697 remaining_octets = readptr ? strlen(readptr) : 0;
1698
1699 while(width_consumed <= want_width && remaining_octets > 0){
1700
1701 ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1702
1703 if(ucs & U8G_ERROR || ucs == UBOGON){
1704 /*
1705 * This should not happen, but do something to handle it anyway.
1706 * Treat each character as a single width character, which is what should
1707 * probably happen when we actually go to write it out.
1708 */
1709 remaining_octets--;
1710 readptr++;
1711 this_width = 1;
1712 }
1713 else{
1714 this_width = wcellwidth(ucs);
1715
1716 /*
1717 * If this_width is -1 that means we can't print this character
1718 * with our current locale. Writechar will print a '?'.
1719 */
1720 if(this_width < 0)
1721 this_width = 1;
1722 }
1723
1724 if(width_consumed + (unsigned) this_width <= want_width){
1725 width_consumed += (unsigned) this_width;
1726 retptr = readptr;
1727 }
1728 else
1729 remaining_octets = 0; /* we're done */
1730 }
1731
1732 if(got_width)
1733 *got_width = width_consumed;
1734
1735 return(retptr);
1736 }
1737
1738
1739 /*
1740 * Copy a null terminator into a UTF-8 string in place so that the string is
1741 * no more than a certain screen width wide. If the string is already less
1742 * than or equal in width to the requested width, no change is made.
1743 *
1744 * The actual width accomplished is returned. Note that it may be less than
1745 * max_width due to double width characters as well as due to the fact that
1746 * it fits wholly in the max_width.
1747 *
1748 * Returned value is the actual screen width of str when done.
1749 *
1750 * A side effect is that a terminating null may have been written into
1751 * the passed in string.
1752 */
1753 unsigned
utf8_truncate(char * str,unsigned max_width)1754 utf8_truncate(char *str, unsigned max_width)
1755 {
1756 int this_width;
1757 unsigned width_consumed = 0;
1758 UCS ucs;
1759 unsigned long remaining_octets;
1760 char *readptr, *savereadptr;
1761
1762 readptr = str;
1763
1764 remaining_octets = readptr ? strlen(readptr) : 0;
1765
1766 if(readptr){
1767 while(width_consumed <= max_width && remaining_octets > 0){
1768
1769 savereadptr = readptr;
1770 ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);
1771
1772 if(ucs & U8G_ERROR || ucs == UBOGON){
1773 /*
1774 * This should not happen, but do something to handle it anyway.
1775 * Treat each character as a single width character, which is what should
1776 * probably happen when we actually go to write it out.
1777 */
1778 remaining_octets--;
1779 readptr++;
1780 this_width = 1;
1781 }
1782 else{
1783 this_width = wcellwidth(ucs);
1784
1785 /*
1786 * If this_width is -1 that means we can't print this character
1787 * with our current locale. Writechar will print a '?'.
1788 */
1789 if(this_width < 0)
1790 this_width = 1;
1791 }
1792
1793 if(width_consumed + (unsigned) this_width <= max_width){
1794 width_consumed += (unsigned) this_width;
1795 }
1796 else{
1797 remaining_octets = 0; /* we're done */
1798 *savereadptr = '\0';
1799 }
1800 }
1801 }
1802
1803 return(width_consumed);
1804 }
1805
1806
/*
 * Copy UTF-8 characters from src into dst.
 * Copy enough characters so that the result will have screen width of
 * want_width screen cells in current locale (utf8_to_width does that part).
 * If there aren't enough characters in src to get to want_width, pad with
 * spaces: on the right if left_adjust is set, otherwise on the left.
 *
 * Dstlen is the available space in dst. No more than dstlen bytes will be written
 * to dst, so fewer spaces than needed are added if they don't all fit.
 * Dst will be null terminated if there is enough room, but not
 * if that would overflow dst's len.
 *
 * Returned value is the number of bytes written to dst, not including
 * the possible terminating null. A NULL dst returns 0 and writes nothing.
 */
size_t
utf8_pad_to_width(char *dst,		/* destination buffer */
		  char *src,		/* source string */
		  size_t dstlen,	/* space in dst */
		  unsigned want_width,	/* desired screen width */
		  int left_adjust)	/* adjust left or right in want_width columns */
{
    unsigned got_width = 0;
    size_t   bytes_used, len_left, need_more, howmany;

    if(!dst)
      return(0);

    bytes_used = utf8_to_width(dst, src, dstlen, want_width, &got_width);
    len_left   = dstlen - bytes_used;

    /* each missing screen cell costs one byte (a space) */
    need_more = (got_width < want_width) ? (size_t) (want_width - got_width) : 0;
    howmany   = (need_more < len_left) ? need_more : len_left;

    if(howmany > 0){
        if(left_adjust){
            /*
             * Add padding to end of string. Simply append
             * the needed number of spaces, or however many will fit
             * if we don't have enough space.
             */
            memset(dst + bytes_used, ' ', howmany);
        }
        else{
            /*
             * Add padding to start of string: slide the existing
             * string over (the regions overlap, hence memmove) and
             * fill the gap in front with spaces.
             */
            memmove(dst + howmany, dst, bytes_used);
            memset(dst, ' ', howmany);
        }

        bytes_used += howmany;
    }

    if(bytes_used < dstlen)
      dst[bytes_used] = '\0';

    return(bytes_used);
}
1874
1875
1876 /*
1877 * Str is a UTF-8 string.
1878 * Start_here is a pointer into the string. It points one position past
1879 * the last byte that should be considered a part of the length string.
1880 * Count back want_width screencell positions and return a pointer to the
1881 * start of the string that is want_width wide and ends with start_here.
1882 *
1883 * Since characters may be more than one cell width wide we may end up
1884 * skipping over the exact width. That is, if we need to we'll go back
1885 * too far (by one cell width). Account for that in the call by looking
1886 * at got_width.
1887 *
1888 * Note that this call gives a possible got_width == want_width+1 as
1889 * opposed to utf8_count_forw_width which gives got_width == want-1 instead.
1890 * That was just what was needed at the time, maybe it needs to be
1891 * optional.
1892 */
1893 char *
utf8_count_back_width(char * str,char * start_here,unsigned want_width,unsigned * got_width)1894 utf8_count_back_width(char *str, char *start_here, unsigned want_width, unsigned *got_width)
1895 {
1896 unsigned width_consumed = 0;
1897 int this_width;
1898 UCS ucs;
1899 unsigned long remaining_octets;
1900 char *ptr, *savereadptr, *goodreadptr;
1901
1902 savereadptr = start_here;
1903 goodreadptr = start_here;
1904
1905 for(ptr = savereadptr - 1; width_consumed < want_width && ptr >= str; ptr = savereadptr - 1){
1906
1907 savereadptr = ptr;
1908 remaining_octets = goodreadptr - ptr;
1909 ucs = (UCS) utf8_get((unsigned char **) &ptr, &remaining_octets);
1910
1911 if(!(ucs & U8G_ERROR || ucs == UBOGON)){
1912 if(remaining_octets > 0){
1913 /*
1914 * This means there are some bad octets after this good
1915 * character so things are not going to work out well.
1916 * Bail out.
1917 */
1918 savereadptr = str; /* we're done */
1919 }
1920 else{
1921 this_width = wcellwidth(ucs);
1922
1923 /*
1924 * If this_width is -1 that means we can't print this character
1925 * with our current locale. Writechar will print a '?'.
1926 */
1927 if(this_width < 0)
1928 this_width = 1;
1929
1930 width_consumed += (unsigned) this_width;
1931 goodreadptr = savereadptr;
1932 }
1933 }
1934 }
1935
1936 if(got_width)
1937 *got_width = width_consumed;
1938
1939 return(savereadptr);
1940 }
1941
1942
/*----------------------------------------------------------------------
     Copy at most n bytes of the source string s to *d, advancing *d as
     it goes so that it ends up pointing at the end of the copied text.

     If the end of s is reached within n bytes the null is copied too
     and *d is left pointing at that null. If n runs out first, no null
     is written and *d points just past the last byte copied.

     Motivation for this is to avoid twice passing over a string that's
     being appended to twice (i.e., strcpy(t, x); t += strlen(t))

     This doesn't really belong here but it is used here.
  ----*/
void
sstrncpy(char **d, char *s, int n)
{
    for(; n > 0; n--){
        **d = *s;
        if(*s == '\0')
          return;		/* leave *d on the null */

        s++;
        (*d)++;
    }
}
1958
1959
1960 /*
1961 * If use_system_routines is set then NULL is the return value and it is
1962 * not an error. Display_charmap and keyboard_charmap should come over as
1963 * malloced strings and will be filled in with the result.
1964 *
1965 * Returns a void pointer to the input_cs CHARSET which is
1966 * passed to mbtow via kbseq().
1967 * If !use_system_routines && NULL is returned, that is an error and err should
1968 * have a message.
1969 * display_charmap and keyboard_charmap should be malloced data and may be
1970 * realloced and changed here.
1971 */
1972 int
setup_for_input_output(int use_system_routines,char ** display_charmap,char ** keyboard_charmap,void ** input_cs_arg,char ** err)1973 setup_for_input_output(int use_system_routines, char **display_charmap,
1974 char **keyboard_charmap, void **input_cs_arg, char **err)
1975 {
1976 const CHARSET *cs;
1977 const CHARSET *input_cs = NULL;
1978 int already_tried = 0;
1979 int supported = 0;
1980 char buf[1000];
1981
1982 #define cpstr(s) strcpy((char *)fs_get(1+strlen(s)), s)
1983
1984 if(err)
1985 *err = NULL;
1986
1987 if(!display_charmap || !keyboard_charmap || !input_cs_arg){
1988 *err = cpstr("Bad call to setup_for_input_output");
1989 return(-1);
1990 }
1991
1992 if(use_system_routines){
1993 #if PREREQ_FOR_SYS_TRANSLATION
1994 char *dcm;
1995
1996 dcm = nl_langinfo_codeset_wrapper();
1997 dcm = dcm ? dcm : "US-ASCII";
1998
1999 init_utf8_display(0, NULL);
2000 if(*display_charmap){
2001 if(dcm && strucmp(*display_charmap, dcm)){
2002 snprintf(buf, sizeof(buf),
2003 _("Display character set \"%s\" is ignored when using system translation"),
2004 *display_charmap);
2005
2006 *err = cpstr(buf);
2007 }
2008
2009 fs_give((void **) display_charmap);
2010 }
2011
2012 if(*keyboard_charmap){
2013 if(!*err && dcm && strucmp(*keyboard_charmap, dcm)){
2014 snprintf(buf, sizeof(buf),
2015 _("Keyboard character set \"%s\" is ignored when using system translation"),
2016 *keyboard_charmap);
2017
2018 *err = cpstr(buf);
2019 }
2020
2021 fs_give((void **) keyboard_charmap);
2022 }
2023
2024 *display_charmap = cpstr(dcm);
2025 *keyboard_charmap = cpstr(dcm);
2026 #else
2027 *err = cpstr("Bad call to setup_for_input_output");
2028 #endif
2029
2030 *input_cs_arg = NULL;
2031 return(0);
2032 }
2033
2034
2035 try_again1:
2036 if(!(*display_charmap))
2037 *display_charmap = cpstr("US-ASCII");
2038
2039 if(!(*keyboard_charmap))
2040 *keyboard_charmap = cpstr(*display_charmap);
2041
2042 if(*keyboard_charmap){
2043 supported = input_charset_is_supported(*keyboard_charmap);
2044
2045 if(supported){
2046 if(!strucmp(*keyboard_charmap, "utf-8"))
2047 input_cs = utf8_charset(*keyboard_charmap);
2048 else if((cs = utf8_charset(*keyboard_charmap)) != NULL)
2049 input_cs = cs;
2050 }
2051 else{
2052 if(err && !*err){
2053 int iso2022jp = 0;
2054
2055 if(!strucmp(*keyboard_charmap, "ISO-2022-JP"))
2056 iso2022jp = 1;
2057
2058 snprintf(buf, sizeof(buf),
2059 /* TRANSLATORS: The first argument is the name of the character
2060 set the user is trying to use (which is unsupported by alpine).
2061 The second argument is " (except for posting)" if they are
2062 trying to use ISO-2022-JP for something other than posting. */
2063 _("Character set \"%s\" is unsupported%s, using US-ASCII"),
2064 *keyboard_charmap,
2065 iso2022jp ? _(" (except for posting)") : "");
2066
2067 *err = cpstr(buf);
2068 }
2069
2070 input_cs = NULL;
2071 fs_give((void **) keyboard_charmap);
2072 *keyboard_charmap = cpstr("US-ASCII");
2073 if(!already_tried){
2074 already_tried++;
2075 goto try_again1;
2076 }
2077 }
2078 }
2079
2080
2081 try_again2:
2082 if(!(*display_charmap))
2083 *display_charmap = cpstr("US-ASCII");
2084
2085 if(*display_charmap){
2086 supported = output_charset_is_supported(*display_charmap);
2087 if(supported){
2088 if(!strucmp(*display_charmap, "utf-8"))
2089 init_utf8_display(1, NULL);
2090 else if((cs = utf8_charset(*display_charmap)) != NULL)
2091 init_utf8_display(0, utf8_rmap_gen(cs, NULL));
2092 }
2093 else{
2094 if(err && !*err){
2095 int iso2022jp = 0;
2096
2097 if(!strucmp(*display_charmap, "ISO-2022-JP"))
2098 iso2022jp = 1;
2099
2100 snprintf(buf, sizeof(buf),
2101 _("Character set \"%s\" is unsupported%s, using US-ASCII"),
2102 *display_charmap,
2103 iso2022jp ? _(" (except for posting)") : "");
2104
2105 *err = cpstr(buf);
2106 }
2107
2108 fs_give((void **) display_charmap);
2109 if(!already_tried){
2110 already_tried++;
2111 goto try_again2;
2112 }
2113 }
2114 }
2115 else{
2116 if(err && !*err)
2117 *err = cpstr(_("Help, can't figure out display character set or even use US-ASCII."));
2118 }
2119
2120 #undef cpstr
2121
2122 *input_cs_arg = (void *) input_cs;
2123
2124 return(0);
2125 }
2126
2127
2128 int
input_charset_is_supported(char * input_charset)2129 input_charset_is_supported(char *input_charset)
2130 {
2131 const CHARSET *cs;
2132
2133 if(!(input_charset && *input_charset))
2134 return 0;
2135
2136 if(!strucmp(input_charset, "utf-8"))
2137 return 1;
2138
2139 if((cs = utf8_charset(input_charset)) != NULL){
2140
2141 /*
2142 * This was true 2006-09-25.
2143 */
2144 switch(cs->type){
2145 case CT_ASCII: case CT_1BYTE0: case CT_1BYTE:
2146 case CT_1BYTE8: case CT_EUC: case CT_DBYTE:
2147 case CT_DBYTE2: case CT_SJIS: case CT_UCS2:
2148 case CT_UCS4: case CT_UTF16:
2149 return 1;
2150 break;
2151
2152 default:
2153 break;
2154 }
2155 }
2156
2157 return 0;
2158 }
2159
2160
2161 int
output_charset_is_supported(char * output_charset)2162 output_charset_is_supported(char *output_charset)
2163 {
2164 const CHARSET *cs;
2165
2166 if(!(output_charset && *output_charset))
2167 return 0;
2168
2169 if(!strucmp(output_charset, "utf-8"))
2170 return 1;
2171
2172 if((cs = utf8_charset(output_charset)) != NULL && utf8_rmap_gen(cs, NULL))
2173 return 1;
2174
2175 return 0;
2176 }
2177
2178
/*
 * Returns 1 if posting_charset can be used for posting, else 0.
 * "ISO-2022-JP" is accepted in addition to anything
 * output_charset_is_supported accepts. A NULL or empty name is not
 * supported.
 */
int
posting_charset_is_supported(char *posting_charset)
{
    if(!posting_charset || !*posting_charset)
      return(0);

    if(!strucmp(posting_charset, "ISO-2022-JP"))
      return(1);

    return(output_charset_is_supported(posting_charset) ? 1 : 0);
}
2186
2187
/*
 * This function is only defined in this special case and so calls
 * to it should be wrapped in the same macro conditionals.
 *
 * Returns the default display charset for a UNIX terminal emulator,
 * it is what nl_langinfo(CODESET) should return but we need to
 * wrap nl_langinfo because we know of strange behaving implementations.
 *
 * The returned pointer is either what nl_langinfo returned, a string
 * constant, or a static buffer inside this function (for the ISO-8859-n
 * names), so it must not be freed and may change on the next call.
 * NULL is returned if no supported charset name could be worked out.
 */
#if	!defined(_WINDOWS) && HAVE_LANGINFO_H && defined(CODESET)
char *
nl_langinfo_codeset_wrapper(void)
{
    char *ret = NULL;

    ret = nl_langinfo(CODESET);

    /*
     * If the value returned from nl_langinfo() is not a real charset,
     * see if we can figure out what they meant. If we can't figure it
     * out return NULL and let the caller decide what to do.
     */
    if(ret && *ret && !output_charset_is_supported(ret)){
        char *p;

        if(!strcmp("ANSI_X3.4-1968", ret)
           || !strcmp("646", ret)
           || !strcmp("ASCII", ret)
           || !strcmp("C", ret)
           || !strcmp("POSIX", ret))
          ret = "US-ASCII";
        else if(!strucmp(ret, "UTF8"))
          ret = "UTF-8";
        else if(!strucmp(ret, "EUCJP"))
          ret = "EUC-JP";
        else if(!strucmp(ret, "EUCKR"))
          ret = "EUC-KR";
        else if(!strucmp(ret, "EUCKP"))	/* NOTE(review): probably meant EUCKR, kept as it was */
          ret = "EUC-KP";
        else if(!strucmp(ret, "SJIS"))
          ret = "SHIFT-JIS";
        else if((p = strstr(ret, "8859")) != NULL){

            /* check for digits after 8859, allowing one separator character */
            p += 4;
            if(*p && !isdigit((unsigned char) *p))
              p++;			/* never steps past the terminating null */

            if(isdigit((unsigned char) *p)){
                static char buf[12];

                /* "ISO-8859-" is 9 bytes, then one or two digits, then null */
                memset(buf, 0, sizeof(buf));
                strncpy(buf, "ISO-8859-", sizeof(buf));
                buf[9] = *p++;
                if(isdigit((unsigned char) *p))
                  buf[10] = *p;

                ret = buf;
            }
        }
    }

    /* whatever we came up with still has to be something we can display */
    if(ret && !output_charset_is_supported(ret))
      ret = NULL;

    return(ret);
}
#endif
2253
2254
2255 /*
2256 * Convert the "orig" string from UTF-8 to "charset". If no conversion is
2257 * needed the return value will point to orig. If a conversion is done,
2258 * the return string should be freed by the caller.
2259 * If not possible, returns NULL.
2260 */
2261 char *
utf8_to_charset(char * orig,char * charset,int report_err)2262 utf8_to_charset(char *orig, char *charset, int report_err)
2263 {
2264 SIZEDTEXT src, dst;
2265 char *ret = orig;
2266
2267 if(!charset || !charset[0] || !orig || !orig[0] || !strucmp(charset, "utf-8"))
2268 return ret;
2269
2270 src.size = strlen(orig);
2271 src.data = (unsigned char *) orig;
2272
2273 if(!strucmp(charset, "us-ascii")){
2274 size_t i;
2275
2276 for(i = 0; i < src.size; i++)
2277 if(src.data[i] & 0x80)
2278 return NULL;
2279
2280 return ret;
2281 }
2282
2283 /*
2284 * This works for ISO-2022-JP because of special code in utf8_cstext
2285 * but not for other 2022 charsets.
2286 */
2287 memset(&dst, 0, sizeof(dst));
2288 if(utf8_cstext(&src, charset, &dst, report_err ? 0 : '?') && dst.size > 0 && dst.data)
2289 ret = (char *) dst.data; /* c-client already null terminates it */
2290 else
2291 ret = NULL;
2292
2293 if((unsigned char *) ret != dst.data && dst.data)
2294 fs_give((void **) &dst.data);
2295
2296 return ret;
2297 }
2298
2299
/*
 * Turn a number into a string with commas
 *
 *   Args: number -- The long to be turned into a string.
 *
 * Result: pointer to static string representing number with commas
 *         separating each group of three digits. Negative numbers get a
 *         leading '-'.
 *         Can use up to 3 comatose results at once.
 */
char *
comatose(long int number)
{
    static char    buf[3][50];
    static int     whichbuf = 0;
    char           digits[3 * sizeof(long) + 2];  /* plenty for any long in decimal */
    unsigned long  magnitude;
    char          *b, *last;
    int            ndigits, i;

    whichbuf = (whichbuf + 1) % 3;

    b    = buf[whichbuf];
    last = b + sizeof(buf[0]) - 1;	/* reserved for the null */

    /* take the magnitude in unsigned arithmetic so the most negative long is ok */
    magnitude = (number < 0) ? (0UL - (unsigned long) number)
                             : (unsigned long) number;

    ndigits = snprintf(digits, sizeof(digits), "%lu", magnitude);
    if(ndigits < 0)
      ndigits = 0;
    else if(ndigits >= (int) sizeof(digits))
      ndigits = (int) sizeof(digits) - 1;

    if(number < 0)
      *b++ = '-';

    for(i = 0; i < ndigits; i++){
        /* a comma goes wherever a multiple of three digits remains */
        if(i > 0 && (ndigits - i) % 3 == 0 && b < last)
          *b++ = ',';

        if(b < last)
          *b++ = digits[i];
    }

    *b = '\0';

    return(buf[whichbuf]);
}
2344
2345
/*
 * Like comatose but leave out the commas.
 *
 * Result: pointer to static string with the decimal form of number.
 *         Can use up to 3 tose results at once.
 */
char *
tose(long int number)
{
    static char  results[3][50];
    static int   slot = 0;
    char        *out;

    slot = (slot + 1) % 3;
    out  = results[slot];

    snprintf(out, sizeof(results[0]), "%ld", number);

    return(out);
}
2359
2360
/*
 * line_paint - where the real work of managing what is displayed gets done.
 *
 * Repaints the one-line field described by displ so that the part of the
 * virtual line (displ->vl) which contains the cursor is visible, and then
 * leaves the screen cursor on the cell that corresponds to offset.
 *
 * Args:  offset -- index of the cursor within displ->vl
 *        displ  -- the field being painted. Reads vl, vused, dwid, dlen,
 *                  row and col; may change vbase; rewrites dl and olddl.
 *                  All output goes through the displ->movecursor and
 *                  displ->writechar callbacks.
 *        passwd -- may be NULL. If it points to a non-zero value the
 *                  characters are concealed (see the comment at the top
 *                  of the function body for the meaning of the values).
 *                  A value of 10 is changed to 11 by this function.
 */
void
line_paint(int offset,			/* current dot offset into vl */
	   struct display_line *displ,
	   int *passwd)			/* flag to hide display of chars */
{
    int i, w, w2, already_got_one = 0;
    int vfirst, vlast, dfirst, dlast, vi, di;
    int new_vbase;
    /* width calculator: real cell widths, or 1 per char when concealing */
    unsigned (*width_a_to_b)(UCS *, int, int);

    /*
     * Set passwd to 10 in caller if you want to conceal the
     * password but not print asterisks for feedback.
     *
     * Set passwd to 1 in caller to conceal by printing asterisks.
     */
    if(passwd && *passwd >= 10){		/* don't show asterisks */
	if(*passwd > 10)
	  return;
	else
	  *passwd = 11;		/* only blat once */

	/*
	 * Blank the field once and park the cursor at its start.
	 * NOTE(review): the loop below writes dwid+1 spaces, not dwid --
	 * confirm that the extra cell is intended.
	 */
	i = 0;
	(*displ->movecursor)(displ->row, displ->col);
	while(i++ <= displ->dwid)
	  (*displ->writechar)(' ');

	(*displ->movecursor)(displ->row, displ->col);
	return;
    }

    /* when concealing, every character is drawn as a single '*' cell */
    if(passwd && *passwd)
      width_a_to_b = single_width_chars_a_to_b;
    else
      width_a_to_b = ucs4_str_width_a_to_b;

    /*
     * vl is the virtual line (the actual data). We operate on it by typing
     * characters to be added and deleting and so forth. In this routine we
     * copy a subset of those UCS-4 characters in vl into dl, the display
     * array, and show that subset on the screen.
     *
     * Offset is the location of the cursor in vl.
     *
     * We will display the string starting from vbase.
     * We have dwid screen cells to work in.
     * We may have to adjust vbase in order to display the
     * part of the string that contains the cursor.
     *
     * We'll make the display look like
     *   vl    a b c d e f g h i j k l m
     *             xxxxxxxxxxxxx  <- width dwid window
     *             < d e f g h >
     *               |
     *             vbase
     * The < will be there if vbase > 0.
     * The > will be there if the string from vbase to the
     * end can't all fit in the window.
     */

    /* start from an empty display array; it is rebuilt below */
    memset(displ->dl, 0, displ->dlen * sizeof(UCS));

    /*
     * Adjust vbase so offset is not out of the window to the right.
     * (The +2 in w + 2 is for a possible " >" if the string goes past
     * the right hand edge of the window and if the last visible character
     * is double wide. We don't want the offset to be under that > character.)
     */
    for(w = (*width_a_to_b)(displ->vl, displ->vbase, offset);
	displ->dwid > 1 &&
	w + 2 + (displ->vbase ? 1 : 0) > displ->dwid;
	w = (*width_a_to_b)(displ->vl, displ->vbase, offset)){
	/*
	 * offset is off the window to the right
	 * It looks like   a b c d e f g h
	 *                 |             |
	 *               vbase         offset
	 * and offset is either past the right edge,
	 * or right at the right edge (and maybe under >),
	 * or one before right at the edge (and maybe on space
	 * for half a character).
	 *
	 * Since the characters may be double width it is slightly
	 * complicated to figure out how far to increase vbase.
	 * We're going to scoot over past width w/2 characters and
	 * then see if that's sufficient.
	 */
	new_vbase = displ->vbase + 1;
	for(w2 = (*width_a_to_b)(displ->vl, displ->vbase+1, new_vbase);
	    w2 < displ->dwid/2;
	    w2 = (*width_a_to_b)(displ->vl, displ->vbase+1, new_vbase))
	  new_vbase++;

	displ->vbase = new_vbase;
    }

    /* adjust so offset is not out of the window to the left */
    while(displ->vbase > 0 && displ->vbase >= offset){
	/* add about dwid/2 more width */
	new_vbase = displ->vbase - 1;
	for(w2 = (*width_a_to_b)(displ->vl, new_vbase, displ->vbase);
	    w2 < (displ->dwid+1)/2 && new_vbase > 0;
	    w2 = (*width_a_to_b)(displ->vl, new_vbase, displ->vbase))
	  new_vbase--;

	/*
	 * but don't let it get too small, recheck off right end
	 *
	 * NOTE(review): the update expression measures from displ->vbase
	 * while the initializer and the condition use new_vbase; since
	 * displ->vbase does not change inside this loop the re-measured
	 * width does not follow new_vbase. Looks like it may have been
	 * meant to be new_vbase -- confirm before changing.
	 */
	for(w = (*width_a_to_b)(displ->vl, new_vbase, offset);
	    w + 2 + (new_vbase ? 1 : 0) > displ->dwid;
	    w = (*width_a_to_b)(displ->vl, displ->vbase, offset))
	  new_vbase++;

	displ->vbase = MAX(new_vbase, 0);
    }

    /*
     * If only one single-width character would be hidden on the left,
     * show it rather than spending that cell on the '<' cue.
     */
    if(displ->vbase == 1 && ((passwd && *passwd) || wcellwidth(displ->vl[0]) == 1))
      displ->vbase = 0;

    vfirst = displ->vbase;
    dfirst = 0;
    if(displ->vbase > 0){			/* off screen cue left */
	dfirst = 1;				/* index which matches vfirst */
	displ->dl[0] = '<';
    }

    vlast = displ->vused-1;			/* end */
    w = (*width_a_to_b)(displ->vl, vfirst, vlast);

    if(displ->dwid > 0 && w + dfirst > displ->dwid){	/* off window right */

	/* find last ucs character to be printed */
	while(w + dfirst > displ->dwid - 1)		/* -1 for > */
	  w = (*width_a_to_b)(displ->vl, vfirst, --vlast);

	/* worry about double-width characters */
	if(w + dfirst == displ->dwid - 1){	/* no prob, hit it exactly */
	    dlast = dfirst + vlast - vfirst + 1;	/* +1 for > */
	    displ->dl[dlast] = '>';
	}
	else{
	    /* one cell short of the edge: pad with a space before the > */
	    dlast = dfirst + vlast - vfirst + 1;
	    displ->dl[dlast++] = ' ';
	    displ->dl[dlast] = '>';
	}
    }
    else
      dlast = dfirst + vlast - vfirst;

    /*
     * Copy the relevant part of the virtual line into the display line.
     */
    for(vi = vfirst, di = dfirst; vi <= vlast; vi++, di++)
      if(passwd && *passwd)
	displ->dl[di] = '*';		/* to conceal password */
      else
	displ->dl[di] = displ->vl[vi];

    /*
     * Add spaces to clear the rest of the line.
     * We have dwid total space to fill.
     */
    w = (*width_a_to_b)(displ->dl, 0, dlast);	/* width through dlast */
    for(di = dlast+1, i = displ->dwid - w; i > 0 ; i--)
      displ->dl[di++] = ' ';

    /*
     * Draw from left to right, skipping until we get to
     * something that is different. Characters may be different
     * widths than they were initially so paint from there the
     * rest of the way.
     */
    for(di = 0; displ->dl[di]; di++){
	if(already_got_one || displ->dl[di] != displ->olddl[di]){
	    /* move cursor first time */
	    if(!already_got_one++){
		w = (di > 0) ? (*width_a_to_b)(displ->dl, 0, di-1) : 0;
		(*displ->movecursor)(displ->row, displ->col + w);
	    }

	    (*displ->writechar)(displ->dl[di]);
	    displ->olddl[di] = displ->dl[di];	/* remember what is on screen */
	}
    }

    /* zero the part of olddl beyond what dl now holds */
    memset(&displ->olddl[di], 0, (displ->dlen - di) * sizeof(UCS));

    /*
     * Move the cursor to the offset.
     *
     * The offset is relative to the start of the virtual array. We need
     * to find the location on the screen. The offset into the display array
     * will be offset-vbase+dfirst. We want to be at the start of that
     * character, so we need to find the width of all the characters up
     * to that point.
     */
    w = (offset > 0) ? (*width_a_to_b)(displ->dl, 0, offset-displ->vbase+dfirst-1) : 0;

    (*displ->movecursor)(displ->row, displ->col + w);
}
2562
2563
2564 /*
2565 * This is just like ucs4_str_width_a_to_b() except all of the characters
2566 * are assumed to be of width 1. This is for printing out *'s when user
2567 * enters a password, while still managing to use the same code to do the
2568 * display.
2569 */
2570 unsigned
single_width_chars_a_to_b(UCS * ucsstr,int a,int b)2571 single_width_chars_a_to_b(UCS *ucsstr, int a, int b)
2572 {
2573 unsigned width = 0;
2574 int i;
2575
2576 if(ucsstr)
2577 for(i = a; i <= b && ucsstr[i]; i++)
2578 width++;
2579
2580 return width;
2581 }
2582