pith/charconv/utf8.c

#if !defined(lint) && !defined(DOS)
static char rcsid[] = "$Id: utf8.c 1184 2008-12-16 23:52:15Z hubert@u.washington.edu $";
#endif

/*
 * ========================================================================
 * Copyright 2013-2021 Eduardo Chappa
 * Copyright 2006-2008 University of Washington
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * ========================================================================
 */


/* includable WITHOUT dependency on c-client */
#include "../../c-client/mail.h"
#include "../../c-client/utf8.h"

#ifdef _WINDOWS
/* wingdi.h uses ERROR (!) and we aren't using the c-client ERROR so... */
#undef ERROR
#else
#define _XOPEN_SOURCE
#endif

#include <system.h>

#include "../../c-client/fs.h"

/* includable WITHOUT dependency on pico */
#include "../../pico/keydefs.h"

#include "../osdep/collate.h"
#include "../filttype.h"

#include "utf8.h"

#include <stdarg.h>


/* Prototype only; the definition is not in this section of the file. */
unsigned single_width_chars_a_to_b(UCS *, int, int);


/* Character set name saved by set_locale_charmap(); always null terminated. */
static char locale_charmap[50];

/* Display configuration saved by init_utf8_display(). */
static int   native_utf8;	/* non-zero selects the UTF-8 branch in wcellwidth()/wtomb() */
static void *display_data;	/* wtomb() casts this to (unsigned short *) for ucs4_rmaplen()/ucs4_rmapbuf() */

/*
 * Remember how characters are to be delivered to the display.
 *
 * Args  utf8 -- saved in native_utf8; non-zero makes wcellwidth() and
 *               wtomb() take their UTF-8 branch
 *       rmap -- saved in display_data; when native_utf8 is zero and this
 *               is non-NULL, wtomb() hands it to ucs4_rmaplen() and
 *               ucs4_rmapbuf() as an (unsigned short *) map
 */
void
init_utf8_display(int utf8, void *rmap)
{
    display_data = rmap;
    native_utf8  = utf8;
}


/*
 * Return the number of screen cells the UCS-4 character ucs occupies
 * on the current display.
 *
 * The result is -1 if the character is not printable and >= 0 if it is.
 * Even when -1 is returned, a caller that still passes the character to
 * Writechar will get a '?' of width 1 on the screen.
 *
 * When neither a UTF-8 display nor a display map has been registered
 * with init_utf8_display(), the answer comes from wcwidth() where that
 * is available (not on Windows) and is 0 otherwise.
 */
int
wcellwidth(UCS ucs)
{
    char scratch[32];
    long cells;

    /*
     * We believe that on modern unix systems wchar_t is a UCS-4 character.
     * That's the assumption here.
     */

    if(native_utf8 || display_data){
        /*
         * Without a UTF-8 display the character first has to be
         * representable through the display map.
         */
        if(!native_utf8 && wtomb(scratch, ucs) < 0)
          return(-1);

        cells = ucs4_width((unsigned long) ucs);
        if(cells & U4W_ERROR)
          return(-1);

        return(cells);
    }

#if !defined(_WINDOWS) && HAVE_WCWIDTH
    return(wcwidth((wchar_t) ucs));
#else
    return(0);
#endif
}

/*
 * Width, in screen cells, to assume for a character in the ambiguous
 * width zone: 2 for code points at or above U+2100, 1 for anything
 * below that.
 *
 * This is the rule that was written for Windows. It is applied on every
 * platform until a better general method is found. The wcwidth() based
 * alternative that used to follow the first return statement could
 * never execute, so it has been dropped.
 */
int
pith_ucs4width(UCS ucs)
{
    return((ucs >= 0x2100) ? 2 : 1);
}

/*
 * Convert the UCS-4 character ucs to its multi-byte form (for example
 * UTF-8 or EUC-JP) and store the octets in dest.
 *
 * Which encoding is produced depends on init_utf8_display():
 *   - UTF-8 display:    utf8_put() writes the UTF-8 sequence
 *   - display map set:  ucs4_rmaplen()/ucs4_rmapbuf() use that map
 *   - neither:          wcrtomb() or wctomb(), whichever was configured
 *
 * Dest must be big enough for the longest multi-byte character; no
 * length is passed in, so nothing here can check that.
 *
 * Returns the number of octets written to dest, or -1 if the
 * conversion can't be done.
 */
int
wtomb(char *dest, UCS ucs)
{
    /*
     * We believe that on modern unix systems wchar_t is a UCS-4 character.
     * That's the assumption here.
     */

    if(native_utf8){
        unsigned char *start = (unsigned char *) dest;
        unsigned char *end   = utf8_put(start, (unsigned long) ucs);

        /* utf8_put() handing back the start pointer means nothing was written */
        if(end == start)
          return(-1);

        return(end - start);
    }

    if(display_data){
        unsigned short *rmap = (unsigned short *) display_data;
        unsigned long   ucs4 = (unsigned long) ucs;
        int             nbytes;

        nbytes = ucs4_rmaplen(&ucs4, 1, rmap, 0);
        if(nbytes < 0)
          return(-1);

        ucs4_rmapbuf((unsigned char *) dest, &ucs4, 1, rmap, 0);
        return(nbytes);
    }

#if defined(HAVE_WCRTOMB)
    return(wcrtomb(dest, (wchar_t) ucs, NULL));
#elif defined(HAVE_WCTOMB)
    return(wctomb(dest, (wchar_t) ucs));
#else
    return(-1);
#endif
}


/*
 * This function does not necessarily update inputp and remaining_octets, so
 * don't rely on that. The c-client version does but the other doesn't.
 */
UCS
mbtow(void *input_cs, unsigned char **inputp, unsigned long *remaining_octets)
{
    UCS ucs;

    if(input_cs){
	CHARSET *cast_input_cs;

	cast_input_cs = (CHARSET *) input_cs;

	switch((ucs = (UCS) ucs4_cs_get(cast_input_cs, inputp, remaining_octets))){
	  case U8G_ENDSTRG:
	  case U8G_ENDSTRI:
	    return(CCONV_NEEDMORE);

	  default:
	    if(ucs & U8G_ERROR || ucs == UBOGON)
	      return(CCONV_BADCHAR);

	    return(ucs);
	}
    }
    else{
	size_t ret;
	wchar_t w;

	/*
	 * Warning:  input_cs and remaining_octets are unused in this
	 * half of the if/else.
	 *
	 * Unfortunately, we can't tell the difference between a source string
	 * that is just not long enough and one that has characters that can't
	 * be converted even though it is long enough. We return NEEDMORE in both cases.
	 */
	ret = mbstowcs(&w, (char *) (*inputp), 1);
	if(ret == (size_t)(-1))
	  return(CCONV_NEEDMORE);
	else{
	  ucs = (UCS) w;
	  return(ucs);
	}
    }
}


/*
 * Save the name of the locale's character set in locale_charmap.
 * A NULL charmap clears the saved name. Names that do not fit are
 * truncated; the saved name is always null terminated.
 */
void
set_locale_charmap(char *charmap)
{
    if(!charmap){
        locale_charmap[0] = '\0';
        return;
    }

    strncpy(locale_charmap, charmap, sizeof(locale_charmap));
    locale_charmap[sizeof(locale_charmap)-1] = '\0';
}


/*
 * This ensures that the string is UTF-8. If str is already a UTF-8 string
 * (or is NULL), NULL is returned. Otherwise, an allocated string which is
 * UTF-8 is returned. The caller is responsible for freeing the returned
 * value.
 *
 * Args  str         -- the string to convert
 *       fromcharset -- charset str is believed to be in, may be NULL
 *       flags       -- CU8_NOINFER turns off guessing the charset from
 *                      the contents of str
 *
 * The source charsets are tried in this order until utf8_text() accepts
 * one: fromcharset, the inferred charset (unless CU8_NOINFER), the
 * charset saved by set_locale_charmap(), and finally ISO-8859-1.
 * A candidate named "UTF-8" is skipped.
 */
char *
convert_to_utf8(char *str, char *fromcharset, int flags)
{
    char          *ret = NULL;
    char          *fcharset;
    SIZEDTEXT      src, result;
    const CHARSET *cs = NULL;
    int            try;

    /* nothing to convert; this also keeps strlen() away from NULL */
    if(!str)
      return(ret);

    src.data = (unsigned char *) str;
    src.size = strlen(str);

    /* already UTF-8, return NULL */
    if(!(flags & CU8_NOINFER)
       && (cs = utf8_infercharset(&src))
       && (cs->type == CT_ASCII || cs->type == CT_UTF8))
      return(ret);

    try = 1;
    while(try < 5){
        /*
         * Each case either picks a candidate charset and breaks out to
         * try it, or bumps try and falls into the next case.
         */
        switch(try){
          case 1:
            fcharset = fromcharset;
            if(fcharset && strucmp("UTF-8", fcharset) != 0)
              break;	/* give it a try */
            else
              try++;	/* fall through */

          case 2:
            if(!(flags & CU8_NOINFER)){
                fcharset = cs ? cs->name : NULL;
                if(fcharset && strucmp("UTF-8", fcharset) != 0)
                  break;
                else
                  try++;	/* fall through */
            }
            else
              try++;	/* fall through */

          case 3:
            fcharset = locale_charmap;
            if(fcharset && strucmp("UTF-8", fcharset) != 0)
              break;
            else
              try++;	/* fall through */

          default:
            fcharset = "ISO-8859-1";		/* this will "work" */
            break;
        }

        memset(&result, 0, sizeof(result));

        if(fcharset && utf8_text(&src, fcharset, &result, 0L)){
            /* utf8_text() handing back the source itself means no change */
            if(!(result.size == src.size && result.data == src.data)){
                ret = (char *) fs_get((result.size+1) * sizeof(char));
                strncpy(ret, (char *) result.data, result.size);
                ret[result.size] = '\0';
            }
            /* else no conversion necessary */

            if(result.data && result.data != src.data)
              fs_give((void **) &result.data);
            result.size = 0;

            return(ret);
        }

        try++;
    }

    /* won't make it to here */
    return(ret);
}


/*
 * Convert from UTF-8 to user's locale charset.
 *
 * The work is done one octet at a time by utf8_to_locale(), which in
 * turn uses wtomb(), so the display has to have been set up (see
 * init_utf8_display) before this is useful.
 *
 * Returns NULL if no conversion is necessary, that is if the display is
 * native UTF-8 or utf8str is NULL or empty. Otherwise an allocated
 * string in the locale charset is returned and the caller is
 * responsible for freeing it.
 */
char *
convert_to_locale(char *utf8str)
{
#define CHNK 500
    char   *out = NULL;
    char   *p;
    CBUF_S  cb;
    int     alloced;
    size_t  used = 0;

    if(native_utf8 || !utf8str || !utf8str[0])
      return(NULL);

    /* start with an empty accumulation buffer */
    cb.cbuf[0] = '\0';
    cb.cbufp = cb.cbufend = cb.cbuf;

    alloced = CHNK;
    out = (char *) fs_get(alloced * sizeof(char));

    for(p = utf8str; *p; p++){
        /*
         * Make sure there is space for whatever the next input octet
         * may cause to be written; grow the output by a chunk if not.
         */
        if((alloced - used) < MAX(MB_LEN_MAX,32)){
            alloced += CHNK;
            fs_resize((void **) &out, alloced * sizeof(char));
        }

        used += utf8_to_locale((int) *p, &cb,
                               (unsigned char *) &out[used], alloced - used);
    }

    /* give back what wasn't needed and terminate the result */
    fs_resize((void **) &out, used + 1);
    out[used] = '\0';

    return(out);
}


/*
 * Pass in a stream of UTF-8 characters in 'c', one octet per call, and
 * return obuf filled in with multi-byte characters. The octets are
 * collected in cb until utf8_get() can decode a whole character.
 *
 * The return value is the number of valid characters in obuf to be
 * used. It is 0 while a character is still incomplete and also when cb
 * or cb->cbufp is NULL.
 *
 * Input that does not decode as UTF-8 produces one '?' per collected
 * octet. A character at or above 0x80 for which wcellwidth() returns a
 * negative value, or which wtomb() can't convert, produces a single '?'.
 *
 * NOTE(review): obuf_size is never consulted in this function, so the
 * caller has to make sure obuf is large enough for any result.
 */
int
utf8_to_locale(int c, CBUF_S *cb, unsigned char obuf[], size_t obuf_size)
{
    int outchars = 0;

    if(!(cb && cb->cbufp))
      return(0);

    /* only accept another octet if the accumulation buffer has room */
    if(cb->cbufp < cb->cbuf+sizeof(cb->cbuf)){
	unsigned char *inputp;
	unsigned long remaining_octets;
	UCS ucs;

	/* append the octet, then try to decode everything collected so far */
	*(cb->cbufp)++ = (unsigned char) c;
	inputp = cb->cbuf;
	remaining_octets = (cb->cbufp - cb->cbuf) * sizeof(unsigned char);
	ucs = (UCS) utf8_get(&inputp, &remaining_octets);

	switch(ucs){
	  case U8G_ENDSTRG:	/* incomplete character, wait */
	  case U8G_ENDSTRI:	/* incomplete character, wait */
	    break;

	  default:
	    if(ucs & U8G_ERROR || ucs == UBOGON){
		/*
		 * None of these cases is supposed to happen. If it
		 * does happen then the input stream isn't UTF-8
		 * so something is wrong. Treat each character in the
		 * input buffer as a separate error character and
		 * print a '?' for each.
		 */
		for(inputp = cb->cbuf; inputp < cb->cbufp; inputp++)
		  obuf[outchars++] = '?';

		/* discard what was collected */
		cb->cbufp = cb->cbuf;
	    }
	    else{
		if(ucs >= 0x80 && wcellwidth(ucs) < 0){
		    /*
		     * This happens when we have a UTF-8 character that
		     * we aren't able to print in our locale. For example,
		     * if the locale is setup with the terminal
		     * expecting ISO-8859-1 characters then there are
		     * lots of UTF-8 characters that can't be printed.
		     * Print a '?' instead.
		     */
		    obuf[outchars++] = '?';
		}
		else{
		    /*
		     * Convert the ucs into the multibyte
		     * character that corresponds to the
		     * ucs in the users locale.
		     */
		    outchars = wtomb((char *) obuf, ucs);
		    if(outchars < 0){
			obuf[0] = '?';
			outchars = 1;
		    }
		}

		/* update the input buffer */
		if(inputp >= cb->cbufp)	/* this should be the case */
		  cb->cbufp = cb->cbuf;
		else{		/* extra chars for some reason? */
		    unsigned char *q, *newcbufp;

		    /* slide the octets that were not consumed to the front */
		    newcbufp = (cb->cbufp - inputp) + cb->cbuf;
		    q = cb->cbuf;
		    while(inputp < cb->cbufp)
		      *q++ = *inputp++;

		    cb->cbufp = newcbufp;
		}
	    }

	    break;
	}
    }
    else{			/* error */
	/* buffer is full without a decodable character: emit '?' and reset */
	obuf[0] = '?';
	outchars = 1;
	cb->cbufp = cb->cbuf;	/* start over */
    }

    return(outchars);
}


/*
 * Add up the screen cell widths, as reported by wcellwidth(), of the
 * characters in the zero terminated UCS-4 string ucsstr.
 *
 * A character with a negative width is counted as one cell and a width
 * equal to U4W_CTLSRGT is not counted. A NULL ucsstr has width 0.
 */
unsigned
ucs4_str_width(UCS *ucsstr)
{
    unsigned  total = 0;
    UCS      *p;
    int       cells;

    if(!ucsstr)
      return total;

    for(p = ucsstr; *p; p++){
        cells = wcellwidth(*p);
        if(cells == U4W_CTLSRGT)
          continue;

        total += (cells < 0 ? 1 : cells);
    }

    return total;
}


/*
 * Add up the screen cell widths of ucsstr[a] through ucsstr[b]
 * (inclusive), stopping early at a zero element.
 *
 * A character with a negative width is counted as one cell and a width
 * equal to U4W_CTLSRGT is not counted. Nothing verifies that index a is
 * inside the array.
 */
unsigned
ucs4_str_width_a_to_b(UCS *ucsstr, int a, int b)
{
    unsigned total = 0;
    int      idx, cells;

    if(!ucsstr)
      return total;

    for(idx = a; idx <= b; idx++){
        if(!ucsstr[idx])
          break;

        cells = wcellwidth(ucsstr[idx]);
        if(cells == U4W_CTLSRGT)
          continue;

        total += (cells < 0 ? 1 : cells);
    }

    return total;
}


/*
 * Add up the screen cell widths of the UCS-4 characters from ustart up
 * to, but not including, uend. Zero elements do not stop the scan.
 *
 * A character with a negative width is counted as one cell and a width
 * equal to U4W_CTLSRGT is not counted. Nothing verifies that ustart and
 * uend point into the same array. A NULL ustart has width 0.
 */
unsigned
ucs4_str_width_ptr_to_ptr(UCS *ustart, UCS *uend)
{
    unsigned  total = 0;
    UCS      *p;
    int       cells;

    if(!ustart)
      return total;

    p = ustart;
    while(p < uend){
        cells = wcellwidth(*p++);
        if(cells != U4W_CTLSRGT)
          total += (cells < 0 ? 1 : cells);
    }

    return(total);
}


/*
 * Find how much of ucs4str fits in maxwidth screen cells.
 *
 * Returns the furthest pointer into ucs4str such that the characters
 * before it (the pointed-to one excluded) are no wider than maxwidth.
 * The scan also stops at the terminating zero. Characters with a
 * negative width count as one cell. A NULL ucs4str is returned as is.
 */
UCS *
ucs4_particular_width(UCS *ucs4str, int maxwidth)
{
    UCS *u;
    int  used = 0, cells;

    if(!ucs4str)
      return(ucs4str);

    for(u = ucs4str; *u && used <= maxwidth; u++){
        cells = wcellwidth(*u);
        if(cells < 0)
          cells = 1;

        /* this character would go past the limit, leave u pointing at it */
        if(used + cells > maxwidth)
          break;

        used += cells;
    }

    return(u);
}


/*
 * Make a UCS-4 copy of a UTF-8 string. Just like cpystr only it
 * converts from UTF-8 to UCS-4.
 *
 * Decoding stops at the first sequence utf8_get() can't decode (or
 * that decodes to UBOGON); whatever was converted up to that point is
 * returned.
 *
 * Returns NULL for a NULL utf8src, otherwise a zero terminated UCS-4
 * array which needs to be freed by the caller.
 */
UCS *
utf8_to_ucs4_cpystr(char *utf8src)
{
    size_t         capacity, count = 0;
    UCS           *out;
    UCS            ucs;
    unsigned long  octets_left;
    unsigned char *rp;

    if(!utf8src)
      return NULL;

    /*
     * Several source octets may combine into a single UCS-4 character,
     * so there can't be more characters than octets. Allocate for that
     * worst case (plus the terminator) and trim afterwards.
     */
    octets_left = strlen(utf8src);
    capacity = octets_left + 1;

    out = (UCS *) fs_get(capacity * sizeof(*out));
    memset(out, 0, capacity * sizeof(*out));

    rp = (unsigned char *) utf8src;

    while(octets_left > 0 && *rp && count < capacity-1){
        ucs = (UCS) utf8_get(&rp, &octets_left);

        if(ucs & U8G_ERROR || ucs == UBOGON)
          break;

        out[count++] = ucs;
    }

    out[count] = '\0';

    /* get rid of excess size */
    if(count+1 < capacity)
      fs_resize((void **) &out, (count + 1) * sizeof(*out));

    return out;
}


/*
 * Make a UTF-8 copy of a zero terminated UCS-4 array. Just like cpystr
 * only it converts from UCS-4 to UTF-8.
 *
 * Returns NULL for a NULL ucs4src, otherwise a null terminated UTF-8
 * string which needs to be freed by the caller.
 */
char *
ucs4_to_utf8_cpystr(UCS *ucs4src)
{
    unsigned char *buf, *wp;
    UCS           *rp;
    int            nchars = 0;

    if(!ucs4src)
      return NULL;

    /* count characters in source */
    while(ucs4src[nchars])
      nchars++;

    /*
     * Allow six octets for every character plus a terminator, zero the
     * whole thing so the result ends up terminated, and trim at the end.
     */
    buf = (unsigned char *) fs_get((6*nchars + 1) * sizeof(*buf));
    memset(buf, 0, (6*nchars + 1) * sizeof(*buf));

    wp = buf;
    for(rp = ucs4src; *rp; rp++)
      wp = utf8_put(wp, (unsigned long) *rp);

    /* get rid of excess size */
    fs_resize((void **) &buf, (wp - buf + 1) * sizeof(*buf));

    return((char *) buf);
}


/*
 * Make a UTF-8 copy of the first ucs4src_len elements of a UCS-4
 * array. Unlike ucs4_to_utf8_cpystr() a zero element does not end the
 * conversion; exactly ucs4src_len elements are handed to utf8_put().
 *
 * Returns NULL for a NULL ucs4src, otherwise a null terminated UTF-8
 * string which needs to be freed by the caller.
 */
char *
ucs4_to_utf8_cpystr_n(UCS *ucs4src, int ucs4src_len)
{
    unsigned char *buf, *wp;
    int            k = 0;

    if(!ucs4src)
      return NULL;

    /*
     * Allow six octets for every character plus a terminator, zero the
     * whole thing so the result ends up terminated, and trim at the end.
     */
    buf = (unsigned char *) fs_get((6*ucs4src_len + 1) * sizeof(*buf));
    memset(buf, 0, (6*ucs4src_len + 1) * sizeof(*buf));

    wp = buf;
    while(k < ucs4src_len)
      wp = utf8_put(wp, (unsigned long) ucs4src[k++]);

    /* get rid of excess size */
    fs_resize((void **) &buf, (wp - buf + 1) * sizeof(*buf));

    return((char *) buf);
}

/*
 * Make a UTF-8 copy of a zero terminated UCS-4 array, keeping only as
 * many whole characters as fit in retlen octets (the terminating null
 * is not counted in retlen).
 *
 * Returns NULL for a NULL ucs4src, otherwise a null terminated UTF-8
 * string of at most retlen octets which needs to be freed by the caller.
 */
char *
ucs4_to_utf8_n_cpystr(UCS *ucs4src, int retlen)
{
    unsigned char *ret = NULL;
    unsigned char *writeptr;
    int            i, oldlen, len;

    if(!ucs4src)
      return NULL;

    /*
     * Over-allocate and then resize at the end.
     */

    /* count characters in source */
    for(i = 0; ucs4src[i]; i++)
      ;

    ret = (unsigned char *) fs_get((6*i + 1) * sizeof(unsigned char));
    memset(ret, 0, (6*i + 1) * sizeof(unsigned char));

    writeptr = ret;
    oldlen = len = 0;
    for(i = 0; ucs4src[i] && (len < retlen); i++){
        oldlen = len;
        writeptr = utf8_put(writeptr, (unsigned long) ucs4src[i]);

        /*
         * The length so far is just how far the write pointer has
         * moved; there is no need to rescan the output with strlen()
         * after every character. This assumes utf8_put() never emits
         * a zero octet for a non-zero character.
         */
        len = (int) (writeptr - ret);
    }

    /* the last character went past the limit, cut it off again */
    if(len > retlen){
        ret[oldlen] = '\0';
        len = oldlen;
    }

    /* get rid of excess size */
    fs_resize((void **) &ret, (len + 1) * sizeof(unsigned char));

    return ((char *) ret);
}


#ifdef _WINDOWS
/*
 * Convert a UTF-8 argument into an LPTSTR version
 * of that argument. The result is allocated here
 * and should be freed by the caller.
 *
 * If MultiByteToWideChar() fails, an allocated empty string is
 * returned.
 */
LPTSTR
utf8_to_lptstr(LPSTR arg_utf8)
{
     int lptstr_len;
     LPTSTR lptstr_ret = NULL;

     /* first call only asks how many TCHARs are needed, terminator included */
     lptstr_len = MultiByteToWideChar( CP_UTF8, 0, arg_utf8, -1, NULL, 0 );
     if(lptstr_len > 0)
     {
         lptstr_ret = (LPTSTR)fs_get(lptstr_len * sizeof(TCHAR));
         lptstr_len = MultiByteToWideChar( CP_UTF8, 0,
             arg_utf8, -1, lptstr_ret, lptstr_len );
     }

     if(!lptstr_len)
     {
         /* check GetLastError()? */

         /*
          * If the second call is the one that failed, a buffer of at
          * least one TCHAR already exists. Reuse it instead of
          * allocating again, which would leak the first buffer.
          */
         if(!lptstr_ret)
           lptstr_ret = (LPTSTR)fs_get(sizeof(TCHAR));

         lptstr_ret[0] = 0;
     }

     return lptstr_ret;
}


/*
 * Convert an LPTSTR argument into a UTF-8 version
 * of that argument. The result is allocated here
 * and should be freed by the caller.
 *
 * If WideCharToMultiByte() fails, an allocated empty string is
 * returned.
 */
LPSTR
lptstr_to_utf8(LPTSTR arg_lptstr)
{
     int utf8str_len;
     LPSTR utf8str_ret = NULL;

     /* first call only asks how many octets are needed, terminator included */
     utf8str_len = WideCharToMultiByte( CP_UTF8, 0, arg_lptstr, -1, NULL, 0, NULL, NULL );
     if(utf8str_len > 0)
     {
         utf8str_ret = (LPSTR)fs_get(utf8str_len * sizeof(CHAR));
         utf8str_len = WideCharToMultiByte( CP_UTF8, 0,
             arg_lptstr, -1, utf8str_ret, utf8str_len, NULL, NULL );
     }

     if(!utf8str_len)
     {
         /* check GetLastError()? */

         /*
          * If the second call is the one that failed, a buffer of at
          * least one CHAR already exists. Reuse it instead of
          * allocating again, which would leak the first buffer.
          */
         if(!utf8str_ret)
           utf8str_ret = (LPSTR)fs_get(sizeof(CHAR));

         utf8str_ret[0] = 0;
     }

     return utf8str_ret;
}


/*
 * Make an LPTSTR copy of a zero terminated UCS-4 array.
 *
 * Each UCS-4 element is simply assigned to one TCHAR; no UTF-16
 * encoding is done.
 *
 * Returns NULL for a NULL argument, otherwise an allocated string that
 * should be freed by the caller.
 */
LPTSTR
ucs4_to_lptstr(UCS *arg_ucs4)
{
    LPTSTR out;
    size_t n, k;

    if(!arg_ucs4)
      return(NULL);

    n = ucs4_strlen(arg_ucs4);
    out = (LPTSTR) fs_get((n+1) * sizeof(TCHAR));

    /* bogus conversion ignores UTF-16 */
    for(k = 0; k < n; k++)
      out[k] = arg_ucs4[k];

    out[n] = '\0';

    return(out);
}


/*
 * Make a zero terminated UCS-4 copy of an LPTSTR string.
 *
 * Each TCHAR is simply assigned to one UCS-4 element; UTF-16 sequences
 * are not decoded.
 *
 * Returns NULL for a NULL argument, otherwise an array allocated with
 * fs_get() that should be freed by the caller.
 */
UCS *
lptstr_to_ucs4(LPTSTR arg_lptstr)
{
    UCS    *out;
    size_t  n, k;

    if(!arg_lptstr)
      return(NULL);

    n = _tcslen(arg_lptstr);
    out = (UCS *) fs_get((n+1)*sizeof(UCS));

    /* bogus conversion ignores UTF-16 */
    for(k = 0; k < n; k++)
      out[k] = arg_lptstr[k];

    out[n] = '\0';

    return(out);
}

#endif /* _WINDOWS */


/*
 * Pass in a stream of UTF-8 characters 1-at-a-time in 'c' and return obuf
 * 1-at-a-time filled in with UCS characters. The octets are collected in
 * cb until utf8_get() can decode a whole character.
 *
 * The return value is the number of valid characters in obuf to be
 * used. It can only be 1 or 0 characters since we're only getting one
 * UTF-8 character at a time. It is also 0 when cb or cb->cbufp is NULL,
 * and in that case *obufwidth is left untouched.
 *
 * If obufwidth is not NULL the width chosen for the character is stored
 * there: 1 for 0x20 through 0x7f and for the '?' used on a decoding
 * error, 0 for characters below 0x20 and while a character is still
 * incomplete, and the result of wcellwidth() for everything at or
 * above 0x80.
 *
 * NOTE(review): when wcellwidth() returns a negative value *obuf is set
 * to '?' but the negative width is what gets reported -- confirm that
 * callers expect that.
 */
int
utf8_to_ucs4_oneatatime(int c, CBUF_S *cb, UCS *obuf, int *obufwidth)
{
    int  width = 0, outchars = 0;

    if(!(cb && cb->cbufp))
      return(0);

    /* only accept another octet if the accumulation buffer has room */
    if(cb->cbufp < cb->cbuf+sizeof(cb->cbuf)){
	unsigned char *inputp;
	unsigned long remaining_octets;
	UCS ucs;

	/* append the octet, then try to decode everything collected so far */
	*cb->cbufp++ = (unsigned char) c;
	inputp = cb->cbuf;
	remaining_octets = (cb->cbufp - cb->cbuf) * sizeof(unsigned char);
	ucs = (UCS) utf8_get(&inputp, &remaining_octets);

	switch(ucs){
	  case U8G_ENDSTRG:	/* incomplete character, wait */
	  case U8G_ENDSTRI:	/* incomplete character, wait */
	    break;

	  default:
	    if(ucs & U8G_ERROR || ucs == UBOGON){
		/*
		 * None of these cases is supposed to happen. If it
		 * does happen then the input stream isn't UTF-8
		 * so something is wrong.
		 */
		outchars++;
		*obuf = '?';
		cb->cbufp = cb->cbuf;
		width = 1;
	    }
	    else{
		outchars++;
		/* printable ASCII is one cell wide; below 0x20 width stays 0 */
		if(ucs < 0x80 && ucs >= 0x20)
		  width = 1;

		if(ucs >= 0x80 && (width=wcellwidth(ucs)) < 0){
		    /*
		     * This happens when we have a UTF-8 character that
		     * we aren't able to print in our locale. For example,
		     * if the locale is setup with the terminal
		     * expecting ISO-8859-1 characters then there are
		     * lots of UTF-8 characters that can't be printed.
		     * Print a '?' instead.
		     * Don't think this should happen in Windows.
		     */
		    *obuf = '?';
		}
		else{
		    *obuf = ucs;
		}

		/* update the input buffer */
		if(inputp >= cb->cbufp)	/* this should be the case */
		  cb->cbufp = cb->cbuf;
		else{		/* extra chars for some reason? */
		    unsigned char *q, *newcbufp;

		    /* slide the octets that were not consumed to the front */
		    newcbufp = (cb->cbufp - inputp) + cb->cbuf;
		    q = cb->cbuf;
		    while(inputp < cb->cbufp)
		      *q++ = *inputp++;

		    cb->cbufp = newcbufp;
		}
	    }

	    break;
	}
    }
    else{			/* error */
	/* buffer is full without a decodable character: emit '?' and reset */
	*obuf = '?';
	outchars = 1;
	width = 1;
	cb->cbufp = cb->cbuf;	/* start over */
    }

    if(obufwidth)
      *obufwidth = width;

    return(outchars);
}


/*
 * Duplicate a zero terminated UCS-4 string.
 *
 * Returns NULL for a NULL ucs4src, otherwise a copy allocated with
 * fs_get(), terminator included.
 */
UCS *
ucs4_cpystr(UCS *ucs4src)
{
    size_t  n;
    UCS    *copy, *dst, *src, *stop;

    if(!ucs4src)
      return NULL;

    n = ucs4_strlen(ucs4src);

    /* zeroing the whole array takes care of the terminator */
    copy = (UCS *) fs_get((n+1) * sizeof(*copy));
    memset(copy, 0, (n+1) * sizeof(*copy));

    dst  = copy;
    src  = ucs4src;
    stop = ucs4src + n;
    while(src < stop)
      *dst++ = *src++;

    return copy;
}


/*
 * Copy at most n elements of the zero terminated UCS-4 string ucs4src
 * into ucs4dst, stopping after the terminator has been copied.
 *
 * If ucs4src has n or more characters the result is not terminated, and
 * the rest of ucs4dst is never padded with zeros. Nothing is copied if
 * either pointer is NULL.
 *
 * Returns ucs4dst.
 */
UCS *
ucs4_strncpy(UCS *ucs4dst, UCS *ucs4src, size_t n)
{
    size_t k;

    if(!ucs4src || !ucs4dst)
      return ucs4dst;

    for(k = 0; k < n; k++)
      if((ucs4dst[k] = ucs4src[k]) == '\0')
        break;

    return ucs4dst;
}


/*
 * Append at most n characters of the zero terminated UCS-4 string
 * ucs4src to the end of the zero terminated UCS-4 string ucs4dst.
 * The result is always terminated, so ucs4dst needs room for up to
 * n + 1 more elements. Nothing happens if either pointer is NULL.
 *
 * Returns ucs4dst.
 */
UCS *
ucs4_strncat(UCS *ucs4dst, UCS *ucs4src, size_t n)
{
    size_t  k;
    UCS    *tail;

    if(!(ucs4src && ucs4dst))
      return ucs4dst;

    /* find the terminator of the destination */
    tail = ucs4dst;
    while(*tail)
      tail++;

    k = 0;
    while(k < n){
        tail[k] = ucs4src[k];
        if(tail[k] == '\0')	/* copied the source's terminator, done */
          return ucs4dst;

        k++;
    }

    /* all n characters were used, add the terminator ourselves */
    tail[n] = '\0';

    return ucs4dst;
}


/*
 * The strlen of UCS-4 arrays: count the elements that come before the
 * terminating zero. A NULL pointer has length 0.
 */
size_t
ucs4_strlen(UCS *ucs4str)
{
    UCS *p;

    if(!ucs4str)
      return(0);

    for(p = ucs4str; *p; p++)
      ;

    return((size_t) (p - ucs4str));
}


/*
 * Compare two zero terminated UCS-4 strings element by element.
 *
 * Returns 0 if they are the same, -1 if the first element that differs
 * is smaller in s1 and 1 if it is larger. Neither pointer may be NULL.
 */
int
ucs4_strcmp(UCS *s1, UCS *s2)
{
    while(*s1 == *s2){
        if(*s1 == '\0')
          return 0;

        s1++;
        s2++;
    }

    return((*s1 < *s2) ? -1 : 1);
}


/*
 * Find the first occurrence of c in the zero terminated UCS-4 string s.
 *
 * Returns a pointer to the matching element, or NULL if there is none
 * or s is NULL. Looking for zero finds the terminator.
 */
UCS *
ucs4_strchr(UCS *s, UCS c)
{
    if(!s)
      return NULL;

    for(; *s; s++)
      if(*s == c)
        return s;

    /* s is at the terminator now, which only matches a search for zero */
    return(c ? NULL : s);
}


/*
 * Find the last occurrence of c in the zero terminated UCS-4 string s.
 *
 * Returns a pointer to the matching element, or NULL if there is none
 * or s is NULL. The terminator is never examined, so looking for zero
 * returns NULL.
 */
UCS *
ucs4_strrchr(UCS *s, UCS c)
{
    UCS *last = NULL;
    UCS *p;

    if(s)
      for(p = s; *p; p++)
        if(*p == c)
          last = p;

    return last;
}


/*
 * Returns the screen cells width of the UTF-8 string argument.
 *
 * Each character utf8_get() decodes contributes its wcellwidth(), or 1
 * if that is negative. Anything that can't be decoded, or that decodes
 * to UBOGON, contributes 1. A NULL or empty str has width 0.
 */
unsigned
utf8_width(char *str)
{
    unsigned width = 0;
    int this_width;
    UCS ucs;
    unsigned long remaining_octets;
    char *readptr, *before;

    if(!(str && *str))
      return(width);

    readptr = str;
    remaining_octets = strlen(readptr);

    while(remaining_octets > 0 && *readptr){

        before = readptr;
        ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);

        if(ucs & U8G_ERROR || ucs == UBOGON){
            /*
             * This should not happen, but do something to handle it anyway.
             * Treat each character as a single width character, which is what should
             * probably happen when we actually go to write it out.
             *
             * Only step over an octet ourselves if utf8_get() did not
             * consume anything. If it already moved past a character
             * (one that decoded to UBOGON), stepping again would skip
             * the next octet, and at the end of the string it would wrap
             * remaining_octets around and run readptr past the
             * terminator.
             */
            if(readptr == before){
                remaining_octets--;
                readptr++;
            }

            this_width = 1;
        }
        else{
            this_width = wcellwidth(ucs);

            /*
             * If this_width is -1 that means we can't print this character
             * with our current locale. Writechar will print a '?'.
             */
            if(this_width < 0)
              this_width = 1;
        }

        width += (unsigned) this_width;
    }

    return(width);
}


/*
 * Copy UTF-8 characters from src into dst.
 * This is intended to be used if you want to truncate a string at
 * the start instead of the end. For example, you have a long string
 * like
 *       this_is_a_long_string
 * but not enough space to fit it into a particular field. You want to
 * end up with
 *             s_a_long_string
 * where that fits in a particular width. Perhaps you'd use this with ...
 * to get
 *          ...s_a_long_string
 * This right adjusts the end of the string in the width space and
 * cuts it off at the start. If there is enough width for the whole
 * string it will copy the string into dst with no padding.
 *
 * Copy enough characters so that the result will have screen width of
 * want_width screen cells in current locale.
 *
 * Dstlen is the available space in dst. No more than dstlen bytes will be written
 *   to dst. This is just for protection, it shouldn't be relied on to
 *   do anything useful. Dstlen should be large enough. Otherwise you'll get
 *   characters truncated in the middle or something like that.
 *
 * Returned value is the number of bytes written to dst, not including
 *   the possible terminating null.
 *
 * If we can't hit want_width exactly because of double width characters
 *   then we will pad the end of the string with space in order to make
 *   the width exact.
 */
size_t
utf8_to_width_rhs(char *dst,		/* destination buffer */
		  char *src,		/* source string */
		  size_t dstlen,	/* space in dest */
		  unsigned want_width)	/* desired screen width */
{
    int this_width;
    unsigned width_consumed = 0;
    UCS ucs;
    unsigned long remaining_octets;
    char *readptr, *goodreadptr, *savereadptr, *endptr;
    size_t nb = 0;

    /*
     * Without a destination there is nothing that can be written. The
     * check matters because dstlen-1 below would wrap around for a
     * dstlen of zero and allow writing into a buffer with no space.
     */
    if(!dst || dstlen == 0)
      return nb;

    if(!src){
	dst[0] = '\0';
	return nb;
    }

    /*
     * Start at the end of the source string and go backwards until we
     * get to the desired width, but not more than the width.
     */
    readptr = src + strlen(src);
    endptr = readptr;
    goodreadptr = readptr;	/* start of the part known to fit so far */
    width_consumed = 0;
    savereadptr = readptr;

    /*
     * Each pass backs up one octet from savereadptr and tries to decode
     * a character made of exactly the octets between there and
     * goodreadptr. Setting savereadptr to src ends the loop on the next
     * test because readptr then becomes src-1.
     */
    for(readptr = savereadptr-1; readptr >= src && width_consumed < want_width && (endptr - readptr) < dstlen;
	readptr = savereadptr-1){

	savereadptr = readptr;
	remaining_octets = goodreadptr - readptr;
	ucs = (UCS) utf8_get((unsigned char **) &readptr, &remaining_octets);

	/*
	 * Handling the error case is tough because an error will be the normal thing that
	 * happens as we back through the string. So we're just going to punt on the
	 * error for now.
	 */
	if(!(ucs & U8G_ERROR || ucs == UBOGON)){
	    if(remaining_octets > 0){
		/*
		 * This means there are some bad octets after this good
		 * character so things are not going to work out well.
		 * Bail out.
		 */
		savereadptr = src;	/* we're done */
	    }
	    else{
		this_width = wcellwidth(ucs);

		if(this_width < 0)
		  this_width = 1;

		if(width_consumed + (unsigned) this_width <= want_width){  /* ok */
		    width_consumed += (unsigned) this_width;
		    goodreadptr = savereadptr;
		}
		else
		  savereadptr = src;	/* we're done */
	    }
	}
    }

    /*
     * Copy characters from goodreadptr to endptr into dst.
     */
    nb = MIN(endptr-goodreadptr, dstlen-1);
    strncpy(dst, goodreadptr, nb);
    dst[nb] = '\0';

    /*
     * Pad out with spaces in order to hit width exactly.
     */
    while(width_consumed < want_width && nb < dstlen-1){
	dst[nb++] = ' ';
	dst[nb] = '\0';
	width_consumed++;
    }

    return nb;
}


/*
 * The arguments being converted are UTF-8 strings.
 * This routine attempts to make it possible to use screen cell
 * widths in a format specifier. In a one-byte per screen cell
 * world we might have used %10.10s to cause a string to occupy
 * 10 screen positions. Since the width and precision are really
 * referring to numbers of bytes instead of screen positions that
 * won't work with UTF-8 input. We emulate that behavior with
 * the format string %w. %m.nw means to use the m and n as
 * screen width indicators instead of bytes indicators.
 *
 * There is no reason to use this routine unless you want to use
 * min field width or precision with the specifier. A plain %w without
 * widths is equivalent exactly to a plain %s in a regular printf.
 *
 * Double-width characters complicate things. It may not be possible
 * to satisfy the request exactly. For example, %3w for an input
 * string that is made up of two double-width characters.
 * This routine will arbitrarily use a trailing space character if
 * needed to make the width come out correctly where a half of a
 * double-width character would have been needed. We'll see how
 * that works for us.
 *
 * %w only works for strings (it's a %s replacement).
 *
 * Buffer overflow is handled by the size argument. %.30s will work
 * to limit a particular string to 30 bytes, but you lose that
 * ability with %w, since it may write more than precision bytes
 * in order to get to the desired width. It is best to choose
 * size large enough so that it doesn't come into play, otherwise
 * it may be possible to get partial UTF-8 characters because of
 * the truncation.
 *
 * The return value isn't quite the same as the return value
 * of snprintf. It is the number of bytes written, not counting
 * the trailing null, just like snprintf. However, if it is
 * truncated due to size then the output is size, not the
 * number of characters that would have been written.
 */
int
utf8_snprintf(char *dest, size_t size, char *fmt, ...)
{
    char    newfmt[100], *q, *pdest, *end;
    char   *input_str;
    unsigned got_width;
    int     more_flags, ret, w;
    int     min_field_width, field_precision, modifier;
    int     out_conv, out_modifier;
    int     flags_minus, flags_plus, flags_space, flags_zero, flags_pound;
    va_list args;

    newfmt[0] = '\0';
    pdest = dest;

#define IS_ROOM_IN_DEST(n_more_chars)			\
    ((pdest - dest + (n_more_chars) <= size) ? 1 : 0)

    /*
     * Strategy: It would be nice to rewrite each %w in the format as
     * a %s with adjusted width and precision and then hand the whole
     * thing to vsnprintf. That doesn't work because the integer
     * arguments consumed by '*' cannot be modified or removed from
     * the argument list. So instead the result is constructed one
     * piece at a time: every conversion is parsed here, its '*'
     * arguments are folded into literal numbers, and the single
     * conversion is formatted with snprintf onto the end of dest.
     */
    va_start(args, fmt);

    while(*fmt && IS_ROOM_IN_DEST(1)){
	/* ordinary characters are copied straight through */
	if(*fmt != '%'){
	    *pdest++ = *fmt++;
	    continue;
	}

	fmt++;				/* step over the '%' */

	min_field_width = field_precision = -1;
	modifier = 0;
	w = 0;
	input_str = NULL;
	flags_minus = flags_plus = flags_space = flags_zero = flags_pound = 0;

	/* flags */
	for(more_flags = 1; more_flags; ){
	    switch(*fmt){
	      case '-':
		flags_minus++;
		fmt++;
		break;

	      case '+':
		flags_plus++;
		fmt++;
		break;

	      case ' ':
		flags_space++;
		fmt++;
		break;

	      case '0':
		flags_zero++;
		fmt++;
		break;

	      case '#':
		flags_pound++;
		fmt++;
		break;

	      default:
		more_flags = 0;
		break;
	    }
	}

	/* minimum field width, either '*' (an int argument) or digits */
	if(*fmt == '*'){
	    min_field_width = va_arg(args, int);
	    fmt++;
	}
	else if(*fmt >= '0' && *fmt <= '9'){
	    min_field_width = atoi(fmt);	/* atoi stops at first non-digit */
	    while(*fmt >= '0' && *fmt <= '9')
	      fmt++;
	}

	/* field precision, same two forms after a '.' */
	if(*fmt == '.'){
	    fmt++;
	    if(*fmt == '*'){
		field_precision = va_arg(args, int);
		fmt++;
	    }
	    else if(*fmt >= '0' && *fmt <= '9'){
		field_precision = atoi(fmt);
		while(*fmt >= '0' && *fmt <= '9')
		  fmt++;
	    }
	}

	/* length modifier */
	if(*fmt == 'h' || *fmt == 'l' || *fmt == 'L')
	  modifier = *fmt++;

	/*
	 * Conversion character. Decide what conversion and modifier go
	 * into the rewritten specifier. Only 'l' on integer conversions
	 * and 'L' on floating conversions change the type of the
	 * argument that has to be fetched, so those are the ones that
	 * are carried over. Other modifiers are left out, as before.
	 */
	out_conv = *fmt;
	out_modifier = 0;

	switch(*fmt){
	  case '\0':
	    /* format ended inside a specifier, fmt stays on the null */
	    continue;

	  case '%':
	    if(IS_ROOM_IN_DEST(1))
	      *pdest++ =  '%';

	    fmt++;
	    continue;

	  case 'w':
	    /*
	     * Translate the screen width and precision asked for
	     * into the byte width and precision that %s needs in
	     * order to produce them.
	     */
	    input_str = va_arg(args, char *);
	    if(field_precision >= 0 || min_field_width >= 0)
	      w = utf8_width(input_str);

	    if(field_precision >= 0){
		if(w <= field_precision)
		  field_precision = -1;	/* print it all */
		else{
		    /* cut input_str off at the requested screen width */
		    end = utf8_count_forw_width(input_str, field_precision, &got_width);
		    field_precision = (int) (end - input_str);
		    /* new w with this field_precision */
		    w = got_width;
		}
	    }

	    /* bytes that will be printed plus the cells still missing */
	    if(min_field_width >= 0){
		int nbytes, pad;

		nbytes = (field_precision >= 0) ? field_precision
						: (int) strlen(input_str);
		pad = min_field_width - w;
		min_field_width = nbytes + ((pad > 0) ? pad : 0);
	    }

	    out_conv = 's';
	    break;

	  case 'd': case 'i': case 'o':
	  case 'x': case 'X': case 'u':
	    if(modifier == 'l')
	      out_modifier = 'l';

	    break;

	  case 'f': case 'e': case 'E':
	  case 'g': case 'G':
	    if(modifier == 'L')
	      out_modifier = 'L';

	    break;

	  default:
	    break;
	}

	/*
	 * Build the single-conversion format in newfmt with the dynamic
	 * '*' arguments replaced by their values. The longest possible
	 * result (5 flags, two ints, modifier, conversion) is far
	 * shorter than newfmt.
	 */
	q = newfmt;
	*q++ = '%';
	if(flags_minus)
	  *q++ = '-';
	if(flags_plus)
	  *q++ = '+';
	if(flags_space)
	  *q++ = ' ';
	if(flags_zero)
	  *q++ = '0';
	if(flags_pound)
	  *q++ = '#';

	if(min_field_width >= 0)
	  q += snprintf(q, sizeof(newfmt)-(q-newfmt), "%d", min_field_width);

	if(field_precision >= 0)
	  q += snprintf(q, sizeof(newfmt)-(q-newfmt), ".%d", field_precision);

	if(out_modifier)
	  *q++ = (char) out_modifier;

	*q++ = (char) out_conv;
	*q = '\0';

	/*
	 * Fetch the argument with the type the conversion calls for and
	 * append the formatted piece. Pdest is only advanced when
	 * something was actually written at pdest.
	 */
	switch(out_conv){
	  case 'd': case 'i': case 'o':
	  case 'x': case 'X': case 'u':
	    if(out_modifier == 'l')
	      snprintf(pdest, size - (pdest-dest), newfmt, va_arg(args, long));
	    else
	      snprintf(pdest, size - (pdest-dest), newfmt, va_arg(args, int));

	    pdest += strlen(pdest);
	    break;

	  case 'c':
	    snprintf(pdest, size - (pdest-dest), newfmt, va_arg(args, int));
	    pdest += strlen(pdest);
	    break;

	  case 's':
	    if(*fmt != 'w')		/* %w already fetched its string */
	      input_str = va_arg(args, char *);

	    snprintf(pdest, size - (pdest-dest), newfmt, input_str);
	    pdest += strlen(pdest);
	    break;

	  case 'f': case 'e': case 'E':
	  case 'g': case 'G':
	    if(out_modifier == 'L')
	      snprintf(pdest, size - (pdest-dest), newfmt, va_arg(args, long double));
	    else
	      snprintf(pdest, size - (pdest-dest), newfmt, va_arg(args, double));

	    pdest += strlen(pdest);
	    break;

	  case 'p':
	    snprintf(pdest, size - (pdest-dest), newfmt, va_arg(args, void *));
	    pdest += strlen(pdest);
	    break;

	  default:
	    /* didn't think of this type */
	    assert(0);
	    break;
	}

	fmt++;
    }

    ret = pdest - dest;

    /* the terminating null is only added if there is still room */
    if(IS_ROOM_IN_DEST(1))
      *pdest++ = '\0';

    va_end(args);

    return ret;
}


/*
 * Copy UTF-8 characters from src into dst.
 * Copy enough characters so that the result will have (<=) screen width of
 * want_width screen cells in current locale.
 *
 * Dstlen is the available space in dst. No more than dstlen bytes will be written
 *   to dst.
 *
 * Returned value is the number of bytes written to dst, not including
 *   the possible terminating null.
 * Got_width is another returned value. It is the width in screen cells of
 *   the string placed in dst. It will be the same as want_width if there
 *   are enough characters in the src to do that and if the character widths
 *   hit the width exactly. It will be less than want_width if we run out
 *   of src characters or if the next character width would skip over the
 *   width we want, because it is double width.
 *
 * Zero width characters are collected and included at the end of the string.
 *   That is, if we make it to want_width but there is still a zero length
 *   character sitting in src, we add that to dst. This might be an accent
 *   or something like that.
 */
size_t
utf8_to_width(char *dst,		/* destination buffer */
	      char *src,		/* source string */
	      size_t dstlen,		/* space in dst */
	      unsigned want_width,	/* desired screen width */
	      unsigned *got_width)	/* returned screen width in dst */
{
    unsigned cells_used = 0;
    unsigned long octets_left;
    char *in, *out, *char_start, *limit;
    UCS ucs;
    int cells;

    in = src;
    out = dst;
    octets_left = src ? strlen(src) : 0;

    if(src && dst){
	limit = dst + dstlen;

	/*
	 * The test is <= so that zero width characters which follow
	 * the last character that fits are still picked up.
	 */
	while(cells_used <= want_width && octets_left > 0 && out < limit){
	    char_start = in;
	    ucs = (UCS) utf8_get((unsigned char **) &in, &octets_left);

	    /* stop at the first thing that doesn't decode */
	    if(ucs & U8G_ERROR || ucs == UBOGON)
	      break;

	    /*
	     * A width of -1 means the character can't be printed in
	     * the current locale. Writechar will print a '?' for it.
	     */
	    cells = wcellwidth(ucs);
	    if(cells < 0)
	      cells = 1;

	    /* this character would take us past the requested width */
	    if(cells_used + (unsigned) cells > want_width)
	      break;

	    /* no room for this character plus a terminating null */
	    if(out + (in - char_start) >= limit)
	      break;

	    cells_used += cells;
	    while(char_start < in)
	      *out++ = *char_start++;
	}

	if(out < limit)
	  *out = '\0';
    }

    if(got_width)
      *got_width = cells_used;

    return(out ? (out - dst) : 0);
}


/*
 * Str is a UTF-8 string.
 * Count forward width screencell positions and return a pointer to the
 * end of the string that is width wide.
 * The returned pointer points at the next character (where the null would
 * be placed).
 *
 * Got_width is another returned value. It is the width in screen cells of
 *   the string from str to the returned pointer. It will be the same as
 *   want_width if there are enough characters in the str to do that
 *   and if the character widths hit the width exactly. It will be less
 *   than want_width if we run out of characters or if the next character
 *   width would skip over the width we want, because it is double width.
 */
char *
utf8_count_forw_width(char *str, unsigned want_width, unsigned *got_width)
{
    unsigned cells_used = 0;
    unsigned long octets_left;
    char *scan, *fit_end;
    UCS ucs;
    int cells;

    scan = fit_end = str;
    octets_left = str ? strlen(str) : 0;

    while(cells_used <= want_width && octets_left > 0){
	ucs = (UCS) utf8_get((unsigned char **) &scan, &octets_left);

	if(ucs & U8G_ERROR || ucs == UBOGON){
	    /*
	     * Shouldn't happen. Step over a single octet and count it
	     * as one cell, which is probably what happens when it is
	     * actually written out.
	     */
	    octets_left--;
	    scan++;
	    cells = 1;
	}
	else{
	    /* unprintable (-1) shows up as a one cell '?' */
	    cells = wcellwidth(ucs);
	    if(cells < 0)
	      cells = 1;
	}

	/* this character doesn't fit, we're done */
	if(cells_used + (unsigned) cells > want_width)
	  break;

	cells_used += (unsigned) cells;
	fit_end = scan;
    }

    if(got_width)
      *got_width = cells_used;

    return(fit_end);
}


/*
 * Copy a null terminator into a UTF-8 string in place so that the string is
 * no more than a certain screen width wide. If the string is already less
 * than or equal in width to the requested width, no change is made.
 *
 * The actual width accomplished is returned. Note that it may be less than
 * max_width due to double width characters as well as due to the fact that
 * it fits wholly in the max_width.
 *
 * Returned value is the actual screen width of str when done.
 *
 * A side effect is that a terminating null may have been written into
 * the passed in string.
 */
unsigned
utf8_truncate(char *str, unsigned max_width)
{
    unsigned cells_used = 0;
    unsigned long octets_left;
    char *scan, *char_start;
    UCS ucs;
    int cells;

    if(!str)
      return(cells_used);

    scan = str;
    octets_left = strlen(str);

    while(cells_used <= max_width && octets_left > 0){
	char_start = scan;
	ucs = (UCS) utf8_get((unsigned char **) &scan, &octets_left);

	if(ucs & U8G_ERROR || ucs == UBOGON){
	    /*
	     * Shouldn't happen. Step over a single octet and count it
	     * as one cell, which is probably what happens when it is
	     * actually written out.
	     */
	    octets_left--;
	    scan++;
	    cells = 1;
	}
	else{
	    /* unprintable (-1) shows up as a one cell '?' */
	    cells = wcellwidth(ucs);
	    if(cells < 0)
	      cells = 1;
	}

	/* first character that doesn't fit: chop the string right here */
	if(cells_used + (unsigned) cells > max_width){
	    *char_start = '\0';
	    break;
	}

	cells_used += (unsigned) cells;
    }

    return(cells_used);
}


/*
 * Copy UTF-8 characters from src into dst.
 * Copy enough characters so that the result will have screen width of
 * want_width screen cells in current locale.
 * If there aren't enough characters in src to get to want_width, pad on
 * left or right according to left_adjust argument.
 *
 * Dstlen is the available space in dst. No more than dstlen bytes will be written
 *   to dst. Dst will be null terminated if there is enough room, but not
 *   if that would overflow dst's len.
 *
 * Returned value is the number of bytes written to dst, not including
 *   the possible terminating null.
 */
size_t
utf8_pad_to_width(char *dst,		/* destination buffer */
		  char *src,		/* source string */
		  size_t dstlen,	/* space in dst */
		  unsigned want_width,	/* desired screen width */
		  int left_adjust)	/* adjust left or right in want_width columns */
{
    unsigned got_width = 0;
    size_t   bytes_used, room, pad;

    /* utf8_to_width copes with a missing dst, the padding code can't */
    if(!dst)
      return(0);

    bytes_used = utf8_to_width(dst, src, dstlen, want_width, &got_width);
    room = dstlen - bytes_used;

    /* one space per missing screen cell, limited to what fits in dst */
    pad = (want_width > got_width) ? (size_t) (want_width - got_width) : 0;
    if(pad > room)
      pad = room;

    if(pad > 0){
	if(left_adjust){
	    /* text stays put, spaces are appended after it */
	    memset(dst + bytes_used, ' ', pad);
	}
	else{
	    /* slide the text right (regions overlap), spaces go in front */
	    memmove(dst + pad, dst, bytes_used);
	    memset(dst, ' ', pad);
	}

	bytes_used += pad;
    }

    /* terminate only if that doesn't overflow dst */
    if(bytes_used < dstlen)
      dst[bytes_used] = '\0';

    return(bytes_used);
}


/*
 * Str is a UTF-8 string.
 * Start_here is a pointer into the string. It points one position past
 * the last byte that should be considered a part of the length string.
 * Count back want_width screencell positions and return a pointer to the
 * start of the string that is want_width wide and ends with start_here.
 *
 * Since characters may be more than one cell width wide we may end up
 * skipping over the exact width. That is, if we need to we'll go back
 * too far (by one cell width). Account for that in the call by looking
 * at got_width.
 *
 * Note that this call gives a possible got_width == want_width+1 as
 * opposed to utf8_count_forw_width which gives got_width == want-1 instead.
 * That was just what was needed at the time, maybe it needs to be
 * optional.
 */
char *
utf8_count_back_width(char *str, char *start_here, unsigned want_width, unsigned *got_width)
{
    unsigned cells_used = 0;
    unsigned long octets_left;
    char *cursor, *probe, *char_end;
    UCS ucs;
    int cells;

    cursor = start_here;	/* what gets returned */
    char_end = start_here;	/* start of the last character accepted */

    while(cells_used < want_width && cursor > str){
	/*
	 * Back up one octet and try to decode everything from there
	 * up to the start of the previously accepted character.
	 */
	cursor--;
	probe = cursor;
	octets_left = char_end - probe;
	ucs = (UCS) utf8_get((unsigned char **) &probe, &octets_left);

	/* not a complete character yet, keep backing up */
	if(ucs & U8G_ERROR || ucs == UBOGON)
	  continue;

	if(octets_left > 0){
	    /*
	     * A good character followed by octets that aren't part of
	     * it. Things are not going to work out well so bail out.
	     */
	    cursor = str;
	    break;
	}

	/* unprintable (-1) shows up as a one cell '?' */
	cells = wcellwidth(ucs);
	if(cells < 0)
	  cells = 1;

	cells_used += (unsigned) cells;
	char_end = cursor;
    }

    if(got_width)
      *got_width = cells_used;

    return(cursor);
}


/*----------------------------------------------------------------------
  copy the source string onto the destination string returning with
  the destination string pointer at the end of the destination text

  motivation for this is to avoid twice passing over a string that's
  being appended to twice (i.e., strcpy(t, x); t += strlen(t))

  This doesn't really belong here but it is used here.
 ----*/
void
sstrncpy(char **d, char *s, int n)
{
    int room;

    /*
     * Copy at most n characters. When the source's null is reached it
     * is copied too and *d is left pointing at it. If n runs out first
     * nothing is terminated and *d points just past the last copy.
     */
    for(room = n; room > 0; room--){
	**d = *s;
	if(*s == '\0')
	  return;

	s++;
	(*d)++;
    }
}


/*
 * Set up the character sets used for keyboard input and for display.
 *
 * Display_charmap and keyboard_charmap should come over as malloced
 * strings (or NULL) and may be freed and replaced here with the names
 * that end up being used.
 *
 * If use_system_routines is set, both are replaced by the codeset
 * reported by nl_langinfo_codeset_wrapper() (US-ASCII if it reports
 * nothing) and *input_cs_arg is set to NULL, which is not an error.
 *
 * Otherwise *input_cs_arg is set to the input CHARSET which is passed
 * to mbtow via kbseq(). An unsupported keyboard or display character
 * set is replaced by US-ASCII and the first such problem is reported
 * through err.
 *
 * Err may be NULL. If it isn't, *err is either NULL or an allocated
 * message the caller should free.
 *
 * Returns -1 if one of the required pointer arguments is missing,
 * 0 otherwise (even if *err contains a warning).
 */
int
setup_for_input_output(int use_system_routines, char **display_charmap,
		       char **keyboard_charmap, void **input_cs_arg, char **err)
{
    const CHARSET *cs;
    const CHARSET *input_cs = NULL;
    int kb_retried = 0;		/* keyboard already fell back to US-ASCII */
    int disp_retried = 0;	/* display already fell back to US-ASCII */
    int supported = 0;
    char buf[1000];

#define cpstr(s) strcpy((char *)fs_get(1+strlen(s)), s)

    if(err)
      *err = NULL;

    if(!display_charmap || !keyboard_charmap || !input_cs_arg){
	if(err)
	  *err = cpstr("Bad call to setup_for_input_output");

	return(-1);
    }

    if(use_system_routines){
#if	PREREQ_FOR_SYS_TRANSLATION
	char *dcm;

	dcm = nl_langinfo_codeset_wrapper();
	dcm = dcm ? dcm : "US-ASCII";

	init_utf8_display(0, NULL);
	if(*display_charmap){
	    if(err && !*err && dcm && strucmp(*display_charmap, dcm)){
		snprintf(buf, sizeof(buf),
		 _("Display character set \"%s\" is ignored when using system translation"),
		     *display_charmap);

		*err = cpstr(buf);
	    }

	    fs_give((void **) display_charmap);
	}

	if(*keyboard_charmap){
	    if(err && !*err && dcm && strucmp(*keyboard_charmap, dcm)){
		snprintf(buf, sizeof(buf),
		 _("Keyboard character set \"%s\" is ignored when using system translation"),
		     *keyboard_charmap);

		*err = cpstr(buf);
	    }

	    fs_give((void **) keyboard_charmap);
	}

	*display_charmap = cpstr(dcm);
	*keyboard_charmap = cpstr(dcm);
#else
	if(err && !*err)
	  *err = cpstr("Bad call to setup_for_input_output");
#endif

	*input_cs_arg = NULL;
	return(0);
    }


    /* keyboard character set, which defaults to the display's */
try_again1:
    if(!(*display_charmap))
      *display_charmap = cpstr("US-ASCII");

    if(!(*keyboard_charmap))
      *keyboard_charmap = cpstr(*display_charmap);

    if(*keyboard_charmap){
	supported = input_charset_is_supported(*keyboard_charmap);

	if(supported){
	    if(!strucmp(*keyboard_charmap, "utf-8"))
	      input_cs = utf8_charset(*keyboard_charmap);
	    else if((cs = utf8_charset(*keyboard_charmap)) != NULL)
	      input_cs = cs;
	}
	else{
	    if(err && !*err){
		int iso2022jp = 0;

		if(!strucmp(*keyboard_charmap, "ISO-2022-JP"))
		  iso2022jp = 1;

		snprintf(buf, sizeof(buf),
		     /* TRANSLATORS: The first argument is the name of the character
		        set the user is trying to use (which is unsupported by alpine).
			The second argument is " (except for posting)" if they are
			trying to use ISO-2022-JP for something other than posting. */
		     _("Character set \"%s\" is unsupported%s, using US-ASCII"),
		     *keyboard_charmap,
		     iso2022jp ? _(" (except for posting)") : "");

		*err = cpstr(buf);
	    }

	    input_cs = NULL;
	    fs_give((void **) keyboard_charmap);
	    *keyboard_charmap = cpstr("US-ASCII");
	    if(!kb_retried){
		kb_retried++;
		goto try_again1;
	    }
	}
    }


    /*
     * Display character set. This has its own retry flag so that the
     * fallback to US-ASCII still happens when the keyboard character
     * set has already used up its retry above.
     */
try_again2:
    if(!(*display_charmap))
      *display_charmap = cpstr("US-ASCII");

    if(*display_charmap){
	supported = output_charset_is_supported(*display_charmap);
	if(supported){
	    if(!strucmp(*display_charmap, "utf-8"))
	      init_utf8_display(1, NULL);
	    else if((cs = utf8_charset(*display_charmap)) != NULL)
	      init_utf8_display(0, utf8_rmap_gen(cs, NULL));
	}
	else{
	    if(err && !*err){
		int iso2022jp = 0;

		if(!strucmp(*display_charmap, "ISO-2022-JP"))
		  iso2022jp = 1;

		snprintf(buf, sizeof(buf),
		     _("Character set \"%s\" is unsupported%s, using US-ASCII"),
		     *display_charmap,
		     iso2022jp ? _(" (except for posting)") : "");

		*err = cpstr(buf);
	    }

	    fs_give((void **) display_charmap);
	    if(!disp_retried){
		disp_retried++;
		goto try_again2;
	    }
	}
    }
    else{
	if(err && !*err)
	  *err = cpstr(_("Help, can't figure out display character set or even use US-ASCII."));
    }

#undef cpstr

    *input_cs_arg = (void *) input_cs;

    return(0);
}


int
input_charset_is_supported(char *input_charset)
{
    const CHARSET *cs;

    /* no name, no support */
    if(!input_charset || !*input_charset)
      return 0;

    /* UTF-8 is accepted without looking it up */
    if(strucmp(input_charset, "utf-8") == 0)
      return 1;

    cs = utf8_charset(input_charset);
    if(cs == NULL)
      return 0;

    /*
     * Charset types accepted for input. The list was
     * correct as of 2006-09-25.
     */
    switch(cs->type){
      case CT_ASCII:
      case CT_1BYTE0:
      case CT_1BYTE:
      case CT_1BYTE8:
      case CT_EUC:
      case CT_DBYTE:
      case CT_DBYTE2:
      case CT_SJIS:
      case CT_UCS2:
      case CT_UCS4:
      case CT_UTF16:
	return 1;

      default:
	return 0;
    }
}


int
output_charset_is_supported(char *output_charset)
{
    const CHARSET *cs;

    /* no name, no support */
    if(output_charset == NULL || *output_charset == '\0')
      return 0;

    /* UTF-8 is accepted without looking it up */
    if(strucmp(output_charset, "utf-8") == 0)
      return 1;

    /* otherwise it has to be a known charset with a reverse map */
    cs = utf8_charset(output_charset);

    return((cs != NULL && utf8_rmap_gen(cs, NULL)) ? 1 : 0);
}


int
posting_charset_is_supported(char *posting_charset)
{
    if(!posting_charset || !*posting_charset)
      return(0);

    /* ISO-2022-JP is accepted for posting without the output check */
    if(!strucmp(posting_charset, "ISO-2022-JP"))
      return(1);

    return(output_charset_is_supported(posting_charset) ? 1 : 0);
}


/*
 * This function is only defined in this special case and so calls
 * to it should be wrapped in the same macro conditionals.
 *
 * Returns the default display charset for a UNIX terminal emulator,
 * it is what nl_langinfo(CODESET) should return but we need to
 * wrap nl_langinfo because we know of strange behaving implementations.
 */
#if !defined(_WINDOWS) && HAVE_LANGINFO_H && defined(CODESET)
char *
nl_langinfo_codeset_wrapper(void)
{
    char *ret = NULL;
    char *p;

    ret = nl_langinfo(CODESET);

    /*
     * If the value returned from nl_langinfo() is not a real charset,
     * see if we can figure out what they meant. If we can't figure it
     * out return NULL and let the caller decide what to do.
     */
    if(ret && *ret && !output_charset_is_supported(ret)){
	if(!strcmp("ANSI_X3.4-1968", ret)
	   || !strcmp("646", ret)
	   || !strcmp("ASCII", ret)
	   || !strcmp("C", ret)
	   || !strcmp("POSIX", ret))
	  ret = "US-ASCII";
	else if(!strucmp(ret, "UTF8"))
	  ret = "UTF-8";
	else if(!strucmp(ret, "EUCJP"))
	  ret = "EUC-JP";
	else if(!strucmp(ret, "EUCKR"))
	  ret = "EUC-KR";
	else if(!strucmp(ret, "EUCKP"))
	  /* NOTE(review): kept as it was, presumably EUC-KR was meant - confirm */
	  ret = "EUC-KP";
	else if(!strucmp(ret, "SJIS"))
	  ret = "SHIFT-JIS";
	else if((p = strstr(ret, "8859")) != NULL){
	    /* check for digits after 8859 */
	    p += 4;

	    /*
	     * Allow one separator character between 8859 and the part
	     * number, but don't step over the end of the string.
	     */
	    if(*p && !isdigit((unsigned char) *p))
	      p++;

	    if(isdigit((unsigned char) *p)){
		/* static because the result is returned to the caller */
		static char buf[12];

		memset(buf, 0, sizeof(buf));
		strncpy(buf, "ISO-8859-", sizeof(buf));
		buf[9] = *p++;
		if(isdigit((unsigned char) *p))
		  buf[10] = *p;

		ret = buf;
	    }
	}
    }

    /* whatever we came up with still has to be usable for output */
    if(ret && !output_charset_is_supported(ret))
      ret = NULL;

    return(ret);
}
#endif


/*
 * Convert the "orig" string from UTF-8 to "charset". If no conversion is
 * needed the return value will point to orig. If a conversion is done,
 * the return string should be freed by the caller.
 * If not possible, returns NULL.
 */
char *
utf8_to_charset(char *orig, char *charset, int report_err)
{
    SIZEDTEXT in, out;
    unsigned char *octet, *stop;

    /* nothing to convert, or nothing to convert to */
    if(!(charset && charset[0] && orig && orig[0]))
      return orig;

    /* already in the requested charset */
    if(!strucmp(charset, "utf-8"))
      return orig;

    in.size = strlen(orig);
    in.data = (unsigned char *) orig;

    if(!strucmp(charset, "us-ascii")){
	/* pure 7-bit text is its own conversion, anything else can't be done */
	stop = in.data + in.size;
	for(octet = in.data; octet < stop; octet++)
	  if(*octet & 0x80)
	    return NULL;

	return orig;
    }

    /*
     * This works for ISO-2022-JP because of special code in utf8_cstext
     * but not for other 2022 charsets.
     */
    memset(&out, 0, sizeof(out));
    if(utf8_cstext(&in, charset, &out, report_err ? 0 : '?') && out.size > 0 && out.data)
      return (char *) out.data;		/* c-client already null terminates it */

    /* failed, don't leak whatever might have been allocated */
    if(out.data)
      fs_give((void **) &out.data);

    return NULL;
}


/*
 *      Turn a number into a string with commas
 *
 * Args: number -- The long to be turned into a string.
 *
 * Result: pointer to static string representing number with a comma
 * between each group of three digits, and a leading '-' if the number
 * is negative.
 * Can use up to 3 comatose results at once.
 */
char *
comatose(long int number)
{
    static char buf[3][50];
    static int whichbuf = 0;
    char        digits[3 * sizeof(long) + 2];	/* plenty for any long */
    unsigned long magnitude;
    size_t      ndigits, i;
    char       *b, *last;

    whichbuf = (whichbuf + 1) % 3;

    b = buf[whichbuf];
    last = b + sizeof(buf[0]) - 1;		/* reserved for the null */

    /* do the work unsigned so that the most negative long is ok too */
    magnitude = (number < 0) ? 0UL - (unsigned long) number
			     : (unsigned long) number;

    snprintf(digits, sizeof(digits), "%lu", magnitude);
    ndigits = strlen(digits);

    if(number < 0)
      *b++ = '-';

    for(i = 0; i < ndigits && b < last; i++){
	/* comma whenever a whole number of groups of three remains */
	if(i > 0 && (ndigits - i) % 3 == 0)
	  *b++ = ',';

	if(b < last)
	  *b++ = digits[i];
    }

    *b = '\0';

    return(buf[whichbuf]);
}


/*
 * Like comatose() but leave out the commas: format number as a plain
 * decimal string.
 *
 * Result: pointer to one of three static buffers used in rotation, so
 * up to 3 results can be in use at once.
 */
char *
tose(long int number)
{
    static char ring[3][50];
    static int  slot = 0;
    char       *out;

    /* advance to the next buffer in the rotation before writing */
    slot = (slot + 1) % 3;
    out = ring[slot];

    snprintf(out, sizeof(ring[0]), "%ld", number);

    return(out);
}


/*
 * line_paint - where the real work of managing what is displayed gets done.
 *
 * Args: offset -- index of the cursor (dot) in displ->vl
 *       displ  -- display state. This routine reads vl, vused, dwid, dlen,
 *                 row and col, may change vbase, rebuilds dl, updates olddl
 *                 to match what was written, and does all output through
 *                 the movecursor and writechar callbacks.
 *       passwd -- may be NULL. If it points to a non-zero value below 10
 *                 every character is shown as '*'. If it points to 10 the
 *                 field is blanked once and the value is changed to 11;
 *                 for any value above 10 nothing is drawn at all.
 *
 * Result: the visible part of the line is repainted starting at the first
 * cell that differs from olddl, and the cursor is left at the screen
 * position that corresponds to offset.
 */
void
line_paint(int offset,			/* current dot offset into vl */
	   struct display_line *displ,
	   int *passwd)			/* flag to hide display of chars */
{
    int i, w, w2, already_got_one = 0;
    int vfirst, vlast, dfirst, dlast, vi, di;
    int new_vbase;
    unsigned (*width_a_to_b)(UCS *, int, int);

    /*
     * Set passwd to 10 in caller if you want to conceal the
     * password but not print asterisks for feedback.
     *
     * Set passwd to 1 in caller to conceal by printing asterisks.
     */
    if(passwd && *passwd >= 10){	/* don't show asterisks */
	if(*passwd > 10)
	  return;
	else
	  *passwd = 11;		/* only blat once */

	/*
	 * NOTE(review): i runs from 0 through dwid, so this writes
	 * dwid+1 spaces -- confirm the extra cell is intentional.
	 */
	i = 0;
	(*displ->movecursor)(displ->row, displ->col);
	while(i++ <= displ->dwid)
	  (*displ->writechar)(' ');

	(*displ->movecursor)(displ->row, displ->col);
	return;
    }

    /*
     * When asterisks are shown every character occupies one cell, so
     * use the width function that counts characters instead of the one
     * that measures real cell widths.
     */
    if(passwd && *passwd)
      width_a_to_b = single_width_chars_a_to_b;
    else
      width_a_to_b = ucs4_str_width_a_to_b;

    /*
     * vl is the virtual line (the actual data). We operate on it by typing
     * characters to be added and deleting and so forth. In this routine we
     * copy a subset of those UCS-4 characters in vl into dl, the display
     * array, and show that subset on the screen.
     *
     * Offset is the location of the cursor in vl.
     *
     * We will display the string starting from vbase.
     * We have dwid screen cells to work in.
     * We may have to adjust vbase in order to display the
     * part of the string that contains the cursor.
     *
     * We'll make the display look like
     *   vl    a b c d e f g h i j k l m
     *             xxxxxxxxxxxxx  <- width dwid window
     *             < d e f g h >
     *               |
     *             vbase
     * The < will be there if vbase > 0.
     * The > will be there if the string from vbase to the
     * end can't all fit in the window.
     */

    memset(displ->dl, 0, displ->dlen * sizeof(UCS));

    /*
     * Adjust vbase so offset is not out of the window to the right.
     * (The +2 in w + 2 is for a possible " >" if the string goes past
     *  the right hand edge of the window and if the last visible character
     * is double wide. We don't want the offset to be under that > character.)
     */
    for(w = (*width_a_to_b)(displ->vl, displ->vbase, offset);
	displ->dwid > 1 &&
	w + 2 + (displ->vbase ? 1 : 0) > displ->dwid;
        w = (*width_a_to_b)(displ->vl, displ->vbase, offset)){
	/*
	 * offset is off the window to the right
	 * It looks like   a b c d e f g h
	 *                   |         |
	 *               vbase         offset
	 * and offset is either past the right edge,
	 * or right at the right edge (and maybe under >),
	 * or one before right at the edge (and maybe on space
	 * for half a character).
	 *
	 * Since the characters may be double width it is slightly
	 * complicated to figure out how far to increase vbase.
	 * We're going to scoot over past width w/2 characters and
	 * then see if that's sufficient.
	 */
	new_vbase = displ->vbase + 1;
	for(w2 = (*width_a_to_b)(displ->vl, displ->vbase+1, new_vbase);
	    w2 < displ->dwid/2;
	    w2 = (*width_a_to_b)(displ->vl, displ->vbase+1, new_vbase))
	  new_vbase++;

	displ->vbase = new_vbase;
    }

    /* adjust so offset is not out of the window to the left */
    while(displ->vbase > 0 && displ->vbase >= offset){
	/* add about dwid/2 more width */
	new_vbase = displ->vbase - 1;
	for(w2 = (*width_a_to_b)(displ->vl, new_vbase, displ->vbase);
	    w2 < (displ->dwid+1)/2 && new_vbase > 0;
	    w2 = (*width_a_to_b)(displ->vl, new_vbase, displ->vbase))
	  new_vbase--;

	/*
	 * but don't let it get too small, recheck off right end
	 *
	 * The width has to be re-measured from the candidate new_vbase
	 * each time it moves; displ->vbase does not change inside this
	 * loop, so measuring from it would make the test constant.
	 */
	for(w = (*width_a_to_b)(displ->vl, new_vbase, offset);
	    w + 2 + (new_vbase ? 1 : 0) > displ->dwid;
	    w = (*width_a_to_b)(displ->vl, new_vbase, offset))
	  new_vbase++;

	displ->vbase = MAX(new_vbase, 0);
    }

    /*
     * If only a single one-cell character would be hidden, show it:
     * the '<' cue would take up that one cell anyway.
     */
    if(displ->vbase == 1 && ((passwd && *passwd) || wcellwidth(displ->vl[0]) == 1))
      displ->vbase = 0;

    vfirst = displ->vbase;
    dfirst = 0;
    if(displ->vbase > 0){			/* off screen cue left */
	dfirst = 1;				/* index which matches vfirst */
	displ->dl[0] = '<';
    }

    vlast = displ->vused-1;			/* end */
    w = (*width_a_to_b)(displ->vl, vfirst, vlast);

    if(displ->dwid > 0 && w + dfirst > displ->dwid){			/* off window right */

	/* find last ucs character to be printed */
	while(w + dfirst > displ->dwid - 1)	/* -1 for > */
	  w = (*width_a_to_b)(displ->vl, vfirst, --vlast);

	/* worry about double-width characters */
	if(w + dfirst == displ->dwid - 1){	/* no prob, hit it exactly */
	    dlast = dfirst + vlast - vfirst + 1;	/* +1 for > */
	    displ->dl[dlast] = '>';
	}
	else{
	    /* one cell short of the edge: pad with a space before the > */
	    dlast = dfirst + vlast - vfirst + 1;
	    displ->dl[dlast++] = ' ';
	    displ->dl[dlast] = '>';
	}
    }
    else
      dlast = dfirst + vlast - vfirst;

    /*
     * Copy the relevant part of the virtual line into the display line.
     */
    for(vi = vfirst, di = dfirst; vi <= vlast; vi++, di++)
      if(passwd && *passwd)
        displ->dl[di] = '*';		/* to conceal password */
      else
        displ->dl[di] = displ->vl[vi];

    /*
     * Add spaces to clear the rest of the line.
     * We have dwid total space to fill.
     */
    w = (*width_a_to_b)(displ->dl, 0, dlast);	/* width through dlast */
    for(di = dlast+1, i = displ->dwid - w; i > 0 ; i--)
      displ->dl[di++] = ' ';

    /*
     * Draw from left to right, skipping until we get to
     * something that is different. Characters may be different
     * widths than they were initially so paint from there the
     * rest of the way.
     */
    for(di = 0; displ->dl[di]; di++){
	if(already_got_one || displ->dl[di] != displ->olddl[di]){
	    /* move cursor first time */
	    if(!already_got_one++){
		w = (di > 0) ? (*width_a_to_b)(displ->dl, 0, di-1) : 0;
		(*displ->movecursor)(displ->row, displ->col + w);
	    }

	    (*displ->writechar)(displ->dl[di]);
	    displ->olddl[di] = displ->dl[di];
	}
    }

    /* nothing is displayed past di, so forget the old contents there */
    memset(&displ->olddl[di], 0, (displ->dlen - di) * sizeof(UCS));

    /*
     * Move the cursor to the offset.
     *
     * The offset is relative to the start of the virtual array. We need
     * to find the location on the screen. The offset into the display array
     * will be offset-vbase+dfirst. We want to be at the start of that
     * character, so we need to find the width of all the characters up
     * to that point.
     */
    w = (offset > 0) ? (*width_a_to_b)(displ->dl, 0, offset-displ->vbase+dfirst-1) : 0;

    (*displ->movecursor)(displ->row, displ->col + w);
}


/*
 * Counterpart of ucs4_str_width_a_to_b() that treats every character as
 * one cell wide. It lets the password display, which prints a '*' for
 * each character typed, share the same painting code as normal text.
 *
 * Args: ucsstr -- UCS-4 string, may be NULL
 *       a, b   -- first and last index (inclusive) to count
 *
 * Result: number of characters in ucsstr[a..b], stopping early at a
 * zero character; 0 if ucsstr is NULL or a > b.
 */
unsigned
single_width_chars_a_to_b(UCS *ucsstr, int a, int b)
{
    unsigned count;
    int pos;

    if(!ucsstr)
      return 0;

    count = 0;
    pos = a;
    while(pos <= b && ucsstr[pos]){
	count++;
	pos++;
    }

    return count;
}