1 /*
2  * Copyright 2005 John M Bell <jmb202@ecs.soton.ac.uk>
3  *
4  * This file is part of NetSurf, http://www.netsurf-browser.org/
5  *
6  * NetSurf is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; version 2 of the License.
9  *
10  * NetSurf is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 /** \file
20  * UTF-8 manipulation functions (interface).
21  */
22 
23 #ifndef _NETSURF_UTILS_UTF8_H_
24 #define _NETSURF_UTILS_UTF8_H_
25 
26 #include <stdbool.h>
27 #include <stdint.h>
28 
29 #include "utils/errors.h"
30 
31 /**
32  * Convert a UTF-8 multibyte sequence into a single UCS4 character
33  *
34  * RFC3629 restricts UTF-8 to the range reachable by UTF-16 (up to U+10FFFF).
35  * This function conforms to the older, wider RFC2279, however.
36  *
37  * \param[in] s The sequence to process
38  * \param[in] l Length of sequence (NOTE(review): presumably in bytes; confirm)
39  * \return UCS4 character
40  */
41 uint32_t utf8_to_ucs4(const char *s, size_t l);
42 
43 /**
44  * Convert a single UCS4 character into a UTF-8 multibyte sequence
45  *
46  * RFC3629 restricts UTF-8 to the range reachable by UTF-16 (up to U+10FFFF).
47  * This function conforms to the older, wider RFC2279, however.
48  *
49  * \param[in]  c  The character to process (0 <= c <= 0x7FFFFFFF)
50  * \param[out] s  Pointer to 6 byte long output buffer
51  * \return Length of multibyte sequence (NOTE(review): presumably bytes; confirm)
52  */
53 size_t utf8_from_ucs4(uint32_t c, char *s);
54 
55 
56 /**
57  * Calculate the length (in characters) of a NUL-terminated UTF-8 string
58  *
59  * \param[in] s  The string
60  * \return Length of string, in characters
61  */
62 size_t utf8_length(const char *s);
63 
64 /**
65  * Calculate the length (in characters) of a bounded UTF-8 string
66  *
67  * \param[in] s  The string
68  * \param[in] l  Maximum length of input (in bytes)
69  * \return Length of string, in characters
70  */
71 size_t utf8_bounded_length(const char *s, size_t l);
72 
73 /**
74  * Calculate the length (in bytes) of up to c characters of a bounded string
75  *
76  * \param[in] s  The UTF-8 string
77  * \param[in] l  Maximum length of input (in bytes)
78  * \param[in] c  Maximum number of characters to measure
79  * \return Length of string, in bytes
80  */
81 size_t utf8_bounded_byte_length(const char *s, size_t l, size_t c);
82 
83 /**
84  * Calculate the length (in bytes) of a single UTF-8 character
85  *
86  * \param[in] s  Pointer to start of character
87  * \return Length of character, in bytes
88  */
89 size_t utf8_char_byte_length(const char *s);
90 
91 
92 /**
93  * Find the previous legal UTF-8 character in a string
94  *
95  * \param[in] s  The string
96  * \param[in] o  Offset in the string to start at (presumably bytes; confirm)
97  * \return Offset of first byte of previous legal character
98  */
99 size_t utf8_prev(const char *s, size_t o);
100 
101 /**
102  * Find the next legal UTF-8 character in a string
103  *
104  * \param[in] s  The string
105  * \param[in] l  Maximum offset in string
106  * \param[in] o  Offset in the string to start at (presumably bytes; confirm)
107  * \return Offset of first byte of next legal character
108  */
109 size_t utf8_next(const char *s, size_t l, size_t o);
110 
111 
112 /**
113  * Convert a UTF-8 string into the named encoding
114  *
115  * \param[in]  string  The NUL-terminated string to convert
116  * \param[in]  encname The encoding name (suitable for passing to iconv)
117  * \param[in]  len     Length of input string to consider (in bytes), or 0
118  * \param[out] result  Receives heap-allocated result (presumably caller frees)
119  * \return standard nserror value
120  */
121 nserror utf8_to_enc(const char *string, const char *encname,
122 		size_t len, char **result);
123 
124 /**
125  * Convert a string in the named encoding into a UTF-8 string
126  *
127  * \param[in]  string  The NUL-terminated string to convert
128  * \param[in]  encname The encoding name (suitable for passing to iconv)
129  * \param[in]  len     Length of input string to consider (in bytes), or 0
130  * \param[out] result  Receives heap-allocated result (presumably caller frees)
131  * \param[out] result_len Length of the data placed in result (bytes? confirm)
132  * \return standard nserror value
133  */
134 nserror utf8_from_enc(const char *string, const char *encname,
135 		size_t len, char **result, size_t *result_len);
136 
137 /**
138  * Convert a UTF-8 encoded string into a string of the given encoding,
139  * applying HTML escape sequences where necessary.
140  *
141  * \param[in]  string   String to convert (NUL-terminated)
142  * \param[in]  encname  Name of encoding to convert to
143  * \param[in]  len      Length, in bytes, of the input string, or 0
144  * \param[out] result   Pointer to location to receive result
145  * \return standard nserror value
146  */
147 nserror utf8_to_html(const char *string, const char *encname,
148 		size_t len, char **result);
149 
150 /**
151  * Save the given UTF-8 text to a file, converting to local encoding.
152  *
153  * \param[in] utf8_text  text to save to file
154  * \param[in] path       pathname to save to
155  * \return true iff the save succeeded
156  */
157 bool utf8_save_text(const char *utf8_text, const char *path);
158 
159 
160 /**
161  * Finalise the UTF-8 library; the outcome is reported as an nserror value
162  */
163 nserror utf8_finalise(void);
164 
165 #endif
166