1 /* 2 * Copyright 2005 John M Bell <jmb202@ecs.soton.ac.uk> 3 * 4 * This file is part of NetSurf, http://www.netsurf-browser.org/ 5 * 6 * NetSurf is free software; you can redistribute it and/or modify 7 * it under the terms of the GNU General Public License as published by 8 * the Free Software Foundation; version 2 of the License. 9 * 10 * NetSurf is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * GNU General Public License for more details. 14 * 15 * You should have received a copy of the GNU General Public License 16 * along with this program. If not, see <http://www.gnu.org/licenses/>. 17 */ 18 19 /** \file 20 * UTF-8 manipulation functions (interface). 21 */ 22 23 #ifndef _NETSURF_UTILS_UTF8_H_ 24 #define _NETSURF_UTILS_UTF8_H_ 25 26 #include <stdbool.h> 27 #include <stdint.h> 28 29 #include "utils/errors.h" 30 31 /** 32 * Convert a UTF-8 multibyte sequence into a single UCS4 character 33 * 34 * Encoding of UCS values outside the UTF-16 plane has been removed from 35 * RFC3629. This function conforms to RFC2279, however. 36 * 37 * \param[in] s The sequence to process 38 * \param[in] l Length of sequence 39 * \return UCS4 character 40 */ 41 uint32_t utf8_to_ucs4(const char *s, size_t l); 42 43 /** 44 * Convert a single UCS4 character into a UTF-8 multibyte sequence 45 * 46 * Encoding of UCS values outside the UTF-16 plane has been removed from 47 * RFC3629. This function conforms to RFC2279, however. 48 * 49 * \param c The character to process (0 <= c <= 0x7FFFFFFF) 50 * \param s Pointer to 6 byte long output buffer 51 * \return Length of multibyte sequence 52 */ 53 size_t utf8_from_ucs4(uint32_t c, char *s); 54 55 56 /** 57 * Calculate the length (in characters) of a NULL-terminated UTF-8 string 58 * 59 * \param s The string 60 * \return Length of string 61 */ 62 size_t utf8_length(const char *s); 63 64 /** 65 * Calculated the length (in characters) of a bounded UTF-8 string 66 * 67 * \param s The string 68 * \param l Maximum length of input (in bytes) 69 * \return Length of string, in characters 70 */ 71 size_t utf8_bounded_length(const char *s, size_t l); 72 73 /** 74 * Calculate the length (in bytes) of a bounded UTF-8 string 75 * 76 * \param s The string 77 * \param l Maximum length of input (in bytes) 78 * \param c Maximum number of characters to measure 79 * \return Length of string, in bytes 80 */ 81 size_t utf8_bounded_byte_length(const char *s, size_t l, size_t c); 82 83 /** 84 * Calculate the length (in bytes) of a UTF-8 character 85 * 86 * \param s Pointer to start of character 87 * \return Length of character, in bytes 88 */ 89 size_t utf8_char_byte_length(const char *s); 90 91 92 /** 93 * Find previous legal UTF-8 char in string 94 * 95 * \param s The string 96 * \param o Offset in the string to start at 97 * \return Offset of first byte of previous legal character 98 */ 99 size_t utf8_prev(const char *s, size_t o); 100 101 /** 102 * Find next legal UTF-8 char in string 103 * 104 * \param s The string 105 * \param l Maximum offset in string 106 * \param o Offset in the string to start at 107 * \return Offset of first byte of next legal character 108 */ 109 size_t utf8_next(const char *s, size_t l, size_t o); 110 111 112 /** 113 * Convert a UTF8 string into the named encoding 114 * 115 * \param string The NULL-terminated string to convert 116 * \param encname The encoding name (suitable for passing to iconv) 117 * \param len Length of input string to consider (in bytes), or 0 118 * \param result Pointer to location to store result (allocated on heap) 119 * \return standard nserror value 120 */ 121 nserror utf8_to_enc(const char *string, const char *encname, 122 size_t len, char **result); 123 124 /** 125 * Convert a string in the named encoding into a UTF-8 string 126 * 127 * \param string The NULL-terminated string to convert 128 * \param encname The encoding name (suitable for passing to iconv) 129 * \param len Length of input string to consider (in bytes), or 0 130 * \param result Pointer to location to store result (allocated on heap) 131 * \param result_len The length of the data placed in result. 132 * \return standard nserror value 133 */ 134 nserror utf8_from_enc(const char *string, const char *encname, 135 size_t len, char **result, size_t *result_len); 136 137 /** 138 * Convert a UTF-8 encoded string into a string of the given encoding, 139 * applying HTML escape sequences where necessary. 140 * 141 * \param string String to convert (NUL-terminated) 142 * \param encname Name of encoding to convert to 143 * \param len Length, in bytes, of the input string, or 0 144 * \param result Pointer to location to receive result 145 * \return standard nserror code 146 */ 147 nserror utf8_to_html(const char *string, const char *encname, 148 size_t len, char **result); 149 150 /** 151 * Save the given utf8 text to a file, converting to local encoding. 152 * 153 * \param utf8_text text to save to file 154 * \param path pathname to save to 155 * \return true iff the save succeeded 156 */ 157 bool utf8_save_text(const char *utf8_text, const char *path); 158 159 160 /** 161 * Finalise the UTF-8 library 162 */ 163 nserror utf8_finalise(void); 164 165 #endif 166