libunicode/libunicode-0.4/unicode.h

/* unicode.h - Header file for Unicode library.

   Copyright (C) 1999, 2000 Tom Tromey

   The Gnome Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Library General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   The Gnome Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public
   License along with the Gnome Library; see the file COPYING.LIB.  If not,
   write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
   Boston, MA 02111-1307, USA.  */

#ifndef UNICODE_H
#define UNICODE_H

#include <limits.h>      /* For UINT_MAX */
#include <stdlib.h>      /* For size_t */
#include <sys/types.h>   /* For ssize_t */

/* We need the error codes so we can see if EILSEQ exists.  */
#include <errno.h>

/* System headers are pulled in before the linkage block is opened so
   that they are never parsed with C linkage forced on them.  */
#ifdef __cplusplus
extern "C"
{
#endif

#ifndef EILSEQ
/* On some systems, like SunOS and NetBSD, EILSEQ is not defined.
   Supply a private fallback there so that the name is always
   available to users of this header.  The replacement is
   parenthesized so that the negative constant always expands as a
   single operand.  NOTE(review): the value itself looks arbitrary;
   presumably it was picked so as not to collide with a real errno
   value -- confirm.  */
#  define EILSEQ (-2323)
#endif

/* The type used to hold a single Unicode character.  It has to be an
   unsigned integer type at least 32 bits wide.  `unsigned int' is
   used wherever it is wide enough, which leaves the type unchanged
   on the platforms this header already worked on; `unsigned long',
   which the C standard guarantees to be at least 32 bits wide, is
   the fallback for implementations with a narrower int.  UINT_MAX
   comes from <limits.h>.  */
#if UINT_MAX >= 0xFFFFFFFFUL
typedef unsigned int unicode_char_t;
#else
typedef unsigned long unicode_char_t;
#endif

/* These are the possible character classifications, grouped here by
   broad class.  NOTE(review): the names appear to follow the Unicode
   general categories, and the values are presumably what
   `unicode_type' returns -- confirm against the implementation.  */

/* Control, format and other special code points.  */
#define UNICODE_CONTROL               0
#define UNICODE_FORMAT                1
#define UNICODE_UNASSIGNED            2
#define UNICODE_PRIVATE_USE           3
#define UNICODE_SURROGATE             4

/* Letters.  */
#define UNICODE_LOWERCASE_LETTER      5
#define UNICODE_MODIFIER_LETTER       6
#define UNICODE_OTHER_LETTER          7
#define UNICODE_TITLECASE_LETTER      8
#define UNICODE_UPPERCASE_LETTER      9

/* Marks.  */
#define UNICODE_COMBINING_MARK        10
#define UNICODE_ENCLOSING_MARK        11
#define UNICODE_NON_SPACING_MARK      12

/* Numbers.  */
#define UNICODE_DECIMAL_NUMBER        13
#define UNICODE_LETTER_NUMBER         14
#define UNICODE_OTHER_NUMBER          15

/* Punctuation.  */
#define UNICODE_CONNECT_PUNCTUATION   16
#define UNICODE_DASH_PUNCTUATION      17
#define UNICODE_CLOSE_PUNCTUATION     18
#define UNICODE_FINAL_PUNCTUATION     19
#define UNICODE_INITIAL_PUNCTUATION   20
#define UNICODE_OTHER_PUNCTUATION     21
#define UNICODE_OPEN_PUNCTUATION      22

/* Symbols.  */
#define UNICODE_CURRENCY_SYMBOL       23
#define UNICODE_MODIFIER_SYMBOL       24
#define UNICODE_MATH_SYMBOL           25
#define UNICODE_OTHER_SYMBOL          26

/* Separators.  */
#define UNICODE_LINE_SEPARATOR        27
#define UNICODE_PARAGRAPH_SEPARATOR   28
#define UNICODE_SPACE_SEPARATOR       29

/* Call this to initialize the library.  Takes no arguments and
   returns nothing.  NOTE(review): presumably this has to run before
   any other function declared in this header is used -- confirm
   against the implementation.  */
void unicode_init (void);

/* Query the charset of the current locale.  Returns 1 if the current
   locale uses the UTF-8 charset.  If CHARSET is not null, sets
   *CHARSET to the name of the current locale's charset.  That string
   is statically allocated, so the caller must not free it.
   NOTE(review): the return value for a non-UTF-8 locale is not
   stated here; presumably 0 -- confirm.  */
int unicode_get_charset (char **charset);

/* These are analogs of the <ctype.h> classification functions,
   taking a Unicode character instead of a `char' value and returning
   an int.  NOTE(review): presumably nonzero means the character
   belongs to the class and zero means it does not, as in <ctype.h>
   -- confirm.  The last three (`unicode_istitle',
   `unicode_isdefined' and `unicode_iswide') have no <ctype.h>
   counterpart; NOTE(review): judging by their names they test for
   title case, for an assigned code point and for a double-width
   character respectively -- confirm.  */
int unicode_isalnum (unicode_char_t c);
int unicode_isalpha (unicode_char_t c);
int unicode_iscntrl (unicode_char_t c);
int unicode_isdigit (unicode_char_t c);
int unicode_isgraph (unicode_char_t c);
int unicode_islower (unicode_char_t c);
int unicode_isprint (unicode_char_t c);
int unicode_ispunct (unicode_char_t c);
int unicode_isspace (unicode_char_t c);
int unicode_isupper (unicode_char_t c);
int unicode_isxdigit (unicode_char_t c);
int unicode_istitle (unicode_char_t c);
int unicode_isdefined (unicode_char_t c);
int unicode_iswide (unicode_char_t c);

/* More <ctype.h> analogs.  These convert between the three cases:
   upper, lower and title.  Each takes a Unicode character and
   returns a Unicode character.  See the Unicode book to understand
   title case.  NOTE(review): presumably C is returned unchanged when
   it has no mapping to the requested case, as with toupper and
   tolower -- confirm.  */
unicode_char_t unicode_toupper (unicode_char_t c);
unicode_char_t unicode_tolower (unicode_char_t c);
unicode_char_t unicode_totitle (unicode_char_t c);

/* If C is a digit (according to `unicode_isdigit'), then return its
   numeric value.  Otherwise return -1, so a negative result always
   means "not a digit".  */
int unicode_digit_value (unicode_char_t c);

/* If C is a hex digit (according to `unicode_isxdigit'), then return
   its numeric value.  Otherwise return -1, so a negative result
   always means "not a hex digit".  */
int unicode_xdigit_value (unicode_char_t c);

/* Return the Unicode character type of a given character, C.
   NOTE(review): the result is presumably one of the UNICODE_*
   classification constants defined earlier in this header --
   confirm.  */
int unicode_type (unicode_char_t c);

/* Step backwards to the start of a Utf-8 character.  If P points to
   the middle of a Utf-8 character, this function returns a pointer
   to the first byte of the character.  If P points to the start of a
   Utf-8 character, this function returns a pointer to the first byte
   of the previous character.  If P does not point to a Utf-8
   character, NULL is returned.  START bounds the search; in no case
   will a value before START be returned.  Note that the result is
   not const-qualified even though both arguments are.  */
char *unicode_previous_utf8 (const char *start, const char *p);

/* Return a pointer to the first byte of the next Utf-8 character
   after P.  This works whether P points to the start or to the
   middle of a Utf-8 character.  P is assumed to be nul-terminated.
   NOTE(review): the result when P already points at the terminating
   nul is not stated here -- confirm before relying on it.  */
char *unicode_next_utf8 (const char *p);

/* Return the length, in characters (not bytes), of P, a UTF-8
   string.  MAX is the maximum number of bytes to examine.  If MAX is
   less than 0, then P is assumed to be nul-terminated.
   NOTE(review): whether a nul byte also ends the count when MAX is
   non-negative is not stated here -- confirm.  */
int unicode_strlen (const char *p, int max);

/* Returns the visual width, in character-size units, of P, a string.
   This value may be used for tabulation.  NOTE(review): P is
   presumably nul-terminated UTF-8, and the width presumably differs
   from the character count because characters for which
   `unicode_iswide' is true take more than one unit -- confirm.  */
int unicode_string_width (const char *p);

/* Fetch the next Utf-8 character from P into *RESULT, and return a
   pointer to the start of the next Utf-8 character.  If P is not
   well formed, will return NULL.  NOTE(review): whether *RESULT is
   written in the NULL case is not stated -- do not rely on its value
   after a failure until confirmed.  */
char *unicode_get_utf8 (const char *p, unicode_char_t *result);

/* Returns the offset within the string P, in bytes, of the character
   offset given in OFFSET; that is, converts a character position
   into a byte position.  NOTE(review): OFFSET is a signed int while
   the result is a size_t; the behavior for a negative OFFSET, or for
   one beyond the end of P, is not stated -- confirm.  */

size_t unicode_offset_to_index(const char *p, int offset);

/* Returns the offset within the string P, in characters, of the byte
   offset given in OFFSET; that is, converts a byte position into a
   character position.  NOTE(review): OFFSET is a signed int while
   the result is a size_t; the behavior for a negative OFFSET, for
   one beyond the end of P, or for one that falls in the middle of a
   character is not stated -- confirm.  */

size_t unicode_index_to_offset(const char *p, int offset);

/* Returns a pointer to the _last_ non-nul Utf-8 character within the
   string P.  NOTE(review): the result for an empty string is not
   stated -- confirm.  */

char *unicode_last_utf8(const char *p);

/* Copies N characters from SRC to DEST.  NOTE(review): given the
   rest of this API, N presumably counts Utf-8 characters rather than
   bytes, and the return value is presumably DEST as with strncpy;
   nul-termination and padding behavior are not stated.  Confirm all
   of these against the implementation.  No size is passed for DEST,
   so the caller has to provide a large enough buffer.  */

char *unicode_strncpy(char *dest, const char *src, size_t n);

/* Find the UTF-8 character corresponding to CH in the string P.
   These functions are the equivalents of strchr and strrchr.
   NOTE(review): so presumably the first and the last occurrence
   respectively are returned, or NULL when CH does not occur in P --
   confirm.  */

char *unicode_strchr(const char *p, unicode_char_t ch);
char *unicode_strrchr(const char *p, unicode_char_t ch);

/* Pads STRING to fill out a requested visual width, WIDTH.
   NOTE(review): the result is presumably written to DEST, RIGHT
   presumably selects the side on which the padding is added, and
   WIDTH is presumably measured in the units that
   `unicode_string_width' reports -- confirm.  No size is passed for
   DEST, so the caller has to provide a large enough buffer.  */

void unicode_pad_string(char *dest, int right, int width, const char *string);

/* Compute canonical ordering of a string in-place.  This rearranges
   decomposed characters in the string according to their combining
   classes.  STRING is an array of Unicode characters, not Utf-8
   bytes, and is modified directly; LEN is its length
   (NOTE(review): presumably counted in characters -- confirm).  See
   the Unicode manual for more information.  */
void unicode_canonical_ordering (unicode_char_t *string, size_t len);

/* Compute canonical decomposition of a character, CH.  Returns
   malloc()d string of Unicode characters; the caller owns it and
   releases it with free().  *RESULT_LEN is set to the resulting
   length of the string.  NOTE(review): what is returned when the
   allocation fails, and whether the result is terminated in any way
   beyond the length in *RESULT_LEN, is not stated -- confirm.  */
unicode_char_t *unicode_canonical_decomposition (unicode_char_t ch,
						 size_t *result_len);

/* An opaque type used by the iconv workalike.  It is a pointer to
   `struct unicode_iconv_i', which this header declares but does not
   define, so callers can only pass the handle around and never look
   inside it.  */
typedef struct unicode_iconv_i *unicode_iconv_t;

/* Create a new iconv conversion instance.  TOCODE is the destination
   charset, FROMCODE is the source charset.  On failure -- a charset
   name is not recognized, or out of memory -- returns -1 converted to
   the handle type, i.e. (unicode_iconv_t) -1, so the result has to be
   compared against that value rather than against NULL.  Can set
   errno to ENOMEM or EINVAL.  */
unicode_iconv_t unicode_iconv_open (const char *tocode, const char *fromcode);

/* Close an iconv conversion instance, CD.  NOTE(review): CD is
   presumably a handle obtained from `unicode_iconv_open', and the
   int result is presumably 0 on success and -1 on error, as with
   iconv_close; neither is stated here -- confirm.  */
int unicode_iconv_close (unicode_iconv_t cd);

/* Convert characters from INBUF into OUTBUF using the conversion
   instance CD.  The buffer pointers and the byte counts INBYTESLEFT
   and OUTBYTESLEFT are in/out parameters and are updated by this
   function.  Returns -1 and sets errno on error (including E2BIG if
   not enough room left in output buffer).  Otherwise returns number
   of conversions performed; this can be 0.  Note that on some systems
   EILSEQ (a possible error code) is not defined.  On such systems
   this header supplies its own definition of EILSEQ, so callers can
   always test errno against that name.  */
ssize_t unicode_iconv (unicode_iconv_t cd,
		       const char **inbuf, size_t *inbytesleft,
		       char **outbuf, size_t *outbytesleft);

#ifdef __cplusplus
}
#endif

#endif /* UNICODE_H */