tthsum/tthsum/utf8.h

#ifndef INCLUDED_UTF8_H
#define INCLUDED_UTF8_H
/* vim: set ts=8 sts=4 sw=4 tw=80 noet: */
/*======================================================================
Copyright (C) 2004,2005,2009 Walter Doekes <walter+tthsum@wjd.nu>
This file is part of tthsum.

tthsum is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

tthsum is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with tthsum.  If not, see <http://www.gnu.org/licenses/>.
======================================================================*/

/**
 * Conversion routines to and from UTF8 and UCS (UNICODE).
 *
 * The ANSI-C mbstowcs and wcstombs functions aren't sufficient to do
 * UTF8 and UCS conversion on machines that do not have the proper
 * locales set up.
 * The functions declared herein (this file and utf8.c) behave
 * similarly to mbstowcs and wcstombs, but work without a proper
 * locale set up.
 *
 * See also: mbstowcs(3), wcstombs(3), unicode(7), utf8(7).
 *
 * Last modified: 2009-05-02
 */

#include <stddef.h>

#ifdef __cplusplus
extern "C" {
#endif

/**
 * Convert a UTF8 multi-byte string to a UCS wide-character string.
 * The conversion stops at the first 0 (null) character encountered.
 *
 * - If dest is NULL, utf8towcs returns the size required for the full
 *   string (without the terminating 0 (null) character); n is ignored.
 * - If dest is not NULL, utf8towcs tries to write at most n characters
 *   to dest. The number of characters written (without the terminating
 *   0 (null)) is returned.
 * - If src is NULL, behaviour is undefined.
 * - If src contains an invalid UTF8 sequence, utf8towcs returns
 *   (size_t)-1. If there is room, a terminating 0 (null) character
 *   will be placed after the last successfully converted character in
 *   dest.
 *
 * NOTE(review): whether dest is null-terminated when exactly n
 * characters are written is not stated here; presumably it is not, as
 * with mbstowcs(3) -- confirm against utf8.c.
 */
size_t utf8towcs(wchar_t* dest, const char* src, size_t n);

/**
 * Allocate a wide-character string and perform utf8towcs on it, i.e.
 * convert the UTF8 multi-byte string src into newly allocated memory.
 * Free the allocated memory with free().
 *
 * NOTE(review): presumably the allocated string is handed back through
 * *dest and the return value mirrors utf8towcs (including (size_t)-1 on
 * error); the state of *dest on failure is not documented here --
 * confirm against utf8.c.
 */
size_t utf8toawcs(wchar_t** dest, const char* src);

/**
 * Convert a UCS wide-character string to a UTF8 multi-byte string.
 * The conversion stops at the first 0 (null) character encountered.
 *
 * - If dest is NULL, wcstoutf8 returns the size required for the full
 *   string (without the terminating 0 (null) character); n is ignored.
 * - If dest is not NULL, wcstoutf8 tries to write at most n characters
 *   to dest. The number of characters written (without the terminating
 *   0 (null)) is returned.
 * - If src is NULL, behaviour is undefined.
 * - If src contains an invalid UCS sequence, wcstoutf8 returns
 *   (size_t)-1. If there is room, a terminating 0 (null) character
 *   will be placed after the last successfully converted character in
 *   dest.
 *   The Unicode standard specifies no characters above 0x10ffff, so a
 *   UTF8 encoding will at most contain 4 instead of 6 characters per
 *   wide character.
 *
 * NOTE(review): whether dest is null-terminated when exactly n
 * characters are written is not stated here; presumably it is not, as
 * with wcstombs(3) -- confirm against utf8.c.
 */
size_t wcstoutf8(char* dest, const wchar_t* src, size_t n);

/**
 * Allocate a UTF8 multi-byte string and perform wcstoutf8 on it, i.e.
 * convert the UCS wide-character string src into newly allocated
 * memory.
 * Free the allocated memory with free().
 *
 * NOTE(review): presumably the allocated string is handed back through
 * *dest and the return value mirrors wcstoutf8 (including (size_t)-1 on
 * error); the state of *dest on failure is not documented here --
 * confirm against utf8.c.
 */
size_t wcstoautf8(char** dest, const wchar_t* src);

#ifdef __cplusplus
} /* extern "C" */
#endif /* __cplusplus */

#endif /* INCLUDED_UTF8_H */