1 /* unicode.h - Header file for Unicode library.
2 
3    Copyright (C) 1999, 2000 Tom Tromey
4 
5    The Gnome Library is free software; you can redistribute it and/or
6    modify it under the terms of the GNU Library General Public License as
7    published by the Free Software Foundation; either version 2 of the
8    License, or (at your option) any later version.
9 
10    The Gnome Library is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13    Library General Public License for more details.
14 
15    You should have received a copy of the GNU Library General Public
16    License along with the Gnome Library; see the file COPYING.LIB.  If not,
17    write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18    Boston, MA 02111-1307, USA.  */
19 
20 #ifndef UNICODE_H
21 #define UNICODE_H
22 
23 #ifdef __cplusplus
24 extern "C"
25 {
26 #endif
27 
28 #include <stdlib.h>      /* For size_t */
29 #include <sys/types.h>   /* For ssize_t */
30 
31 /* We need the error codes so we can see if EILSEQ exists.  */
32 #include <errno.h>
33 
34 #ifndef EILSEQ
35 /* On some systems, like SunOS and NetBSD, EILSEQ is not defined.  */
36 #  define EILSEQ -2323
37 #endif
38 
39 /* FIXME: assumes 32-bit int.  */
40 typedef unsigned int unicode_char_t;
41 
42 /* These are the possible character classifications.  */
43 #define UNICODE_CONTROL 0
44 #define UNICODE_FORMAT 1
45 #define UNICODE_UNASSIGNED 2
46 #define UNICODE_PRIVATE_USE 3
47 #define UNICODE_SURROGATE 4
48 #define UNICODE_LOWERCASE_LETTER 5
49 #define UNICODE_MODIFIER_LETTER 6
50 #define UNICODE_OTHER_LETTER 7
51 #define UNICODE_TITLECASE_LETTER 8
52 #define UNICODE_UPPERCASE_LETTER 9
53 #define UNICODE_COMBINING_MARK 10
54 #define UNICODE_ENCLOSING_MARK 11
55 #define UNICODE_NON_SPACING_MARK 12
56 #define UNICODE_DECIMAL_NUMBER 13
57 #define UNICODE_LETTER_NUMBER 14
58 #define UNICODE_OTHER_NUMBER 15
59 #define UNICODE_CONNECT_PUNCTUATION 16
60 #define UNICODE_DASH_PUNCTUATION 17
61 #define UNICODE_CLOSE_PUNCTUATION 18
62 #define UNICODE_FINAL_PUNCTUATION 19
63 #define UNICODE_INITIAL_PUNCTUATION 20
64 #define UNICODE_OTHER_PUNCTUATION 21
65 #define UNICODE_OPEN_PUNCTUATION 22
66 #define UNICODE_CURRENCY_SYMBOL 23
67 #define UNICODE_MODIFIER_SYMBOL 24
68 #define UNICODE_MATH_SYMBOL 25
69 #define UNICODE_OTHER_SYMBOL 26
70 #define UNICODE_LINE_SEPARATOR 27
71 #define UNICODE_PARAGRAPH_SEPARATOR 28
72 #define UNICODE_SPACE_SEPARATOR 29
73 
74 /* Call this to initialize the library.  */
75 void unicode_init (void);
76 
77 /* Returns 1 if current locale uses UTF-8 charset.  If CHARSET is
78    not null, sets *CHARSET to the name of the current locale's
79    charset.  This value is statically allocated.  */
80 int unicode_get_charset (char **charset);
81 
82 /* These are all analogs of the <ctype.h> functions.  */
83 int unicode_isalnum (unicode_char_t c);
84 int unicode_isalpha (unicode_char_t c);
85 int unicode_iscntrl (unicode_char_t c);
86 int unicode_isdigit (unicode_char_t c);
87 int unicode_isgraph (unicode_char_t c);
88 int unicode_islower (unicode_char_t c);
89 int unicode_isprint (unicode_char_t c);
90 int unicode_ispunct (unicode_char_t c);
91 int unicode_isspace (unicode_char_t c);
92 int unicode_isupper (unicode_char_t c);
93 int unicode_isxdigit (unicode_char_t c);
94 int unicode_istitle (unicode_char_t c);
95 int unicode_isdefined (unicode_char_t c);
96 int unicode_iswide (unicode_char_t c);
97 
98 /* More <ctype.h> functions.  These convert between the three cases.
99    See the Unicode book to understand title case.  */
100 unicode_char_t unicode_toupper (unicode_char_t c);
101 unicode_char_t unicode_tolower (unicode_char_t c);
102 unicode_char_t unicode_totitle (unicode_char_t c);
103 
104 /* If C is a digit (according to `unicode_isdigit'), then return its
105    numeric value.  Otherwise return -1.  */
106 int unicode_digit_value (unicode_char_t c);
107 
108 /* If C is a hex digit (according to `unicode_isxdigit'), then return
109    its numeric value.  Otherwise return -1.  */
110 int unicode_xdigit_value (unicode_char_t c);
111 
112 /* Return the Unicode character type of a given character.  */
113 int unicode_type (unicode_char_t c);
114 
115 /* If P points to the middle of a Utf-8 character, this function
116    returns a pointer to the first byte of the character.  If P points
117    to the start of a Utf-8 character, this function returns a pointer
118    to the first byte of the previous character.  If P does not point
119    to a Utf-8 character, NULL is returned.  START bounds the search;
120    in no case will a value before START be returned.  */
121 char *unicode_previous_utf8 (const char *start, const char *p);
122 
123 /* Return a pointer to the first byte of the next Utf-8 character
124    after P.  This works whether P points to the start or to the middle
125    of a Utf-8 character.  P is assumed to be nul-terminated.  */
126 char *unicode_next_utf8 (const char *p);
127 
128 /* Return the length, in characters, of P, a UTF-8 string.  MAX is the
129    maximum number of bytes to examine.  If MAX is less than 0, then P
130    is assumed to be nul-terminated.  */
131 int unicode_strlen (const char *p, int max);
132 
133 /* Returns the visual width, in character-size units, of P, a string.
134    This value may be used for tabulation.  */
135 int unicode_string_width (const char *p);
136 
137 /* Fetch the next Utf-8 character from P into RESULT, and return a
138    pointer to the start of the next Utf-8 character.  If P is not well
139    formed, will return NULL.  */
140 char *unicode_get_utf8 (const char *p, unicode_char_t *result);
141 
142 /* Returns the offset within the string, in bytes, of the character offset
143    given. */
144 
145 size_t unicode_offset_to_index(const char *p, int offset);
146 
147 /* Returns the offset within the string, in characters, of the byte offset
148    given. */
149 
150 size_t unicode_index_to_offset(const char *p, int offset);
151 
152 /* Returns a pointer to the _last_ non-NULL utf-8 within the string */
153 
154 char *unicode_last_utf8(const char *p);
155 
156 /* Copies n characters from src to dest */
157 
158 char *unicode_strncpy(char *dest, const char *src, size_t n);
159 
160 /* Find the UTF-8 character corresponding to ch, in string p. These
161    functions are equivilants to strchr and strrchr */
162 
163 char *unicode_strchr(const char *p, unicode_char_t ch);
164 char *unicode_strrchr(const char *p, unicode_char_t ch);
165 
166 /* Pads a string to fill out a requested visual width */
167 
168 void unicode_pad_string(char *dest, int right, int width, const char *string);
169 
170 /* Compute canonical ordering of a string in-place.  This rearranges
171    decomposed characters in the string according to their combining
172    classes.  See the Unicode manual for more information.  */
173 void unicode_canonical_ordering (unicode_char_t *string, size_t len);
174 
175 /* Compute canonical decomposition of a character.  Returns malloc()d
176    string of Unicode characters.  RESULT_LEN is set to the resulting
177    length of the string.  */
178 unicode_char_t *unicode_canonical_decomposition (unicode_char_t ch,
179 						 size_t *result_len);
180 
181 /* An opaque type used by the iconv workalike.  */
182 typedef struct unicode_iconv_i *unicode_iconv_t;
183 
184 /* Create a new iconv conversion instance.  TOCODE is the destination
185    charset, FROMCODE is the source charset.  Returns -1 if a charset
186    name is not recognized or if out of memory.  Can set errno to
187    ENOMEM or EINVAL.  */
188 unicode_iconv_t unicode_iconv_open (const char *tocode, const char *fromcode);
189 
190 /* Close an iconv conversion instance.  */
191 int unicode_iconv_close (unicode_iconv_t cd);
192 
193 /* Convert characters from INBUF into OUTBUF.  Parameters are in/out
194    and are updated by this function.  Returns -1 and sets errno on
195    error (including E2BIG if not enough room left in output buffer).
196    Otherwise returns number of conversions performed; this can be 0.
197    Note that on some systems EILSEQ (a possible error code) is not
198    defined.  On such systems we use EBADMSG instead.  */
199 ssize_t unicode_iconv (unicode_iconv_t cd,
200 		       const char **inbuf, size_t *inbytesleft,
201 		       char **outbuf, size_t *outbytesleft);
202 
203 #ifdef __cplusplus
204 }
205 #endif
206 
207 #endif /* UNICODE_H */
208