/* unicode.h - Header file for Unicode library.

   Copyright (C) 1999, 2000 Tom Tromey

   The Gnome Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Library General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   The Gnome Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public
   License along with the Gnome Library; see the file COPYING.LIB.  If not,
   write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
   Boston, MA 02111-1307, USA.  */

#ifndef UNICODE_H
#define UNICODE_H

/* System headers are pulled in before the `extern "C"' block below so
   that their own declarations never receive C linkage by accident when
   this header is compiled as C++.  */
#include <stdlib.h>		/* For size_t */
#include <sys/types.h>		/* For ssize_t */

/* We need the error codes so we can see if EILSEQ exists.  */
#include <errno.h>

#ifndef EILSEQ
/* On some systems, like SunOS and NetBSD, EILSEQ is not defined.
   Supply a fallback value; it is parenthesized so that the negative
   constant expands safely inside any expression.  */
# define EILSEQ (-2323)
#endif

#ifdef __cplusplus
extern "C"
{
#endif

/* The type used to hold a single Unicode character.
   FIXME: assumes `unsigned int' is at least 32 bits wide.  */
typedef unsigned int unicode_char_t;

/* These are the possible character classifications.
   NOTE(review): presumably these are the values returned by
   `unicode_type' -- confirm against the implementation.  */
#define UNICODE_CONTROL 0
#define UNICODE_FORMAT 1
#define UNICODE_UNASSIGNED 2
#define UNICODE_PRIVATE_USE 3
#define UNICODE_SURROGATE 4
#define UNICODE_LOWERCASE_LETTER 5
#define UNICODE_MODIFIER_LETTER 6
#define UNICODE_OTHER_LETTER 7
#define UNICODE_TITLECASE_LETTER 8
#define UNICODE_UPPERCASE_LETTER 9
#define UNICODE_COMBINING_MARK 10
#define UNICODE_ENCLOSING_MARK 11
#define UNICODE_NON_SPACING_MARK 12
#define UNICODE_DECIMAL_NUMBER 13
#define UNICODE_LETTER_NUMBER 14
#define UNICODE_OTHER_NUMBER 15
#define UNICODE_CONNECT_PUNCTUATION 16
#define UNICODE_DASH_PUNCTUATION 17
#define UNICODE_CLOSE_PUNCTUATION 18
#define UNICODE_FINAL_PUNCTUATION 19
#define UNICODE_INITIAL_PUNCTUATION 20
#define UNICODE_OTHER_PUNCTUATION 21
#define UNICODE_OPEN_PUNCTUATION 22
#define UNICODE_CURRENCY_SYMBOL 23
#define UNICODE_MODIFIER_SYMBOL 24
#define UNICODE_MATH_SYMBOL 25
#define UNICODE_OTHER_SYMBOL 26
#define UNICODE_LINE_SEPARATOR 27
#define UNICODE_PARAGRAPH_SEPARATOR 28
#define UNICODE_SPACE_SEPARATOR 29

/* Call this to initialize the library.  */
void unicode_init (void);

/* Returns 1 if current locale uses UTF-8 charset.  If CHARSET is
   not null, sets *CHARSET to the name of the current locale's
   charset.  This value is statically allocated, so the caller must
   not free it.  */
int unicode_get_charset (char **charset);

/* These are all analogs of the <ctype.h> functions.  */
int unicode_isalnum (unicode_char_t c);
int unicode_isalpha (unicode_char_t c);
int unicode_iscntrl (unicode_char_t c);
int unicode_isdigit (unicode_char_t c);
int unicode_isgraph (unicode_char_t c);
int unicode_islower (unicode_char_t c);
int unicode_isprint (unicode_char_t c);
int unicode_ispunct (unicode_char_t c);
int unicode_isspace (unicode_char_t c);
int unicode_isupper (unicode_char_t c);
int unicode_isxdigit (unicode_char_t c);
int unicode_istitle (unicode_char_t c);
int unicode_isdefined (unicode_char_t c);
int unicode_iswide (unicode_char_t c);

/* More <ctype.h> functions.  These convert between the three cases.
   See the Unicode book to understand title case.  */
unicode_char_t unicode_toupper (unicode_char_t c);
unicode_char_t unicode_tolower (unicode_char_t c);
unicode_char_t unicode_totitle (unicode_char_t c);

/* If C is a digit (according to `unicode_isdigit'), then return its
   numeric value.  Otherwise return -1.  */
int unicode_digit_value (unicode_char_t c);

/* If C is a hex digit (according to `unicode_isxdigit'), then return
   its numeric value.  Otherwise return -1.  */
int unicode_xdigit_value (unicode_char_t c);

/* Return the Unicode character type of a given character.  */
int unicode_type (unicode_char_t c);

/* If P points to the middle of a UTF-8 character, this function
   returns a pointer to the first byte of the character.  If P points
   to the start of a UTF-8 character, this function returns a pointer
   to the first byte of the previous character.  If P does not point
   to a UTF-8 character, NULL is returned.  START bounds the search;
   in no case will a value before START be returned.  */
char *unicode_previous_utf8 (const char *start, const char *p);

/* Return a pointer to the first byte of the next UTF-8 character
   after P.  This works whether P points to the start or to the middle
   of a UTF-8 character.  P is assumed to be nul-terminated.  */
char *unicode_next_utf8 (const char *p);

/* Return the length, in characters, of P, a UTF-8 string.  MAX is the
   maximum number of bytes to examine.  If MAX is less than 0, then P
   is assumed to be nul-terminated.  */
int unicode_strlen (const char *p, int max);

/* Returns the visual width, in character-size units, of P, a string.
   This value may be used for tabulation.  */
int unicode_string_width (const char *p);

/* Fetch the next UTF-8 character from P into RESULT, and return a
   pointer to the start of the next UTF-8 character.  If P is not well
   formed, will return NULL.  */
char *unicode_get_utf8 (const char *p, unicode_char_t *result);

/* Returns the offset within the string P, in bytes, of the character
   offset OFFSET.  */
size_t unicode_offset_to_index (const char *p, int offset);

/* Returns the offset within the string P, in characters, of the byte
   offset OFFSET.  */
size_t unicode_index_to_offset (const char *p, int offset);

/* Returns a pointer to the _last_ non-nul UTF-8 character within the
   string P.  */
char *unicode_last_utf8 (const char *p);

/* Copies N characters from SRC to DEST.  */
char *unicode_strncpy (char *dest, const char *src, size_t n);

/* Find the UTF-8 character corresponding to CH, in string P.  These
   functions are equivalents of strchr and strrchr.  */
char *unicode_strchr (const char *p, unicode_char_t ch);
char *unicode_strrchr (const char *p, unicode_char_t ch);

/* Pads STRING to fill out a requested visual width, WIDTH.
   NOTE(review): DEST presumably receives the padded result and RIGHT
   presumably selects the side that is padded -- confirm against the
   implementation.  */
void unicode_pad_string (char *dest, int right, int width,
			 const char *string);

/* Compute canonical ordering of a string in-place.  This rearranges
   decomposed characters in the string according to their combining
   classes.  See the Unicode manual for more information.  */
void unicode_canonical_ordering (unicode_char_t *string, size_t len);

/* Compute canonical decomposition of a character.  Returns malloc()d
   string of Unicode characters (presumably the caller's to free).
   RESULT_LEN is set to the resulting length of the string.  */
unicode_char_t *unicode_canonical_decomposition (unicode_char_t ch,
						 size_t *result_len);

/* An opaque type used by the iconv workalike.  */
typedef struct unicode_iconv_i *unicode_iconv_t;

/* Create a new iconv conversion instance.  TOCODE is the destination
   charset, FROMCODE is the source charset.  Returns
   (unicode_iconv_t) -1 if a charset name is not recognized or if out
   of memory.  Can set errno to ENOMEM or EINVAL.  */
unicode_iconv_t unicode_iconv_open (const char *tocode,
				    const char *fromcode);

/* Close an iconv conversion instance.  */
int unicode_iconv_close (unicode_iconv_t cd);

/* Convert characters from INBUF into OUTBUF.  Parameters are in/out
   and are updated by this function.  Returns -1 and sets errno on
   error (including E2BIG if not enough room left in output buffer).
   Otherwise returns number of conversions performed; this can be 0.
   Note that on some systems EILSEQ (a possible error code) is not
   defined.  On such systems this header supplies its own fallback
   definition of EILSEQ (see the top of this file).  */
ssize_t unicode_iconv (unicode_iconv_t cd,
		       const char **inbuf, size_t *inbytesleft,
		       char **outbuf, size_t *outbytesleft);

#ifdef __cplusplus
}
#endif

#endif /* UNICODE_H */