1 /* character.h -- header file for the character module. 2 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 3 National Institute of Advanced Industrial Science and Technology (AIST) 4 Registration Number H15PRO112 5 6 This file is part of the m17n library. 7 8 The m17n library is free software; you can redistribute it and/or 9 modify it under the terms of the GNU Lesser General Public License 10 as published by the Free Software Foundation; either version 2.1 of 11 the License, or (at your option) any later version. 12 13 The m17n library is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 Lesser General Public License for more details. 17 18 You should have received a copy of the GNU Lesser General Public 19 License along with the m17n library; if not, write to the Free 20 Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 21 Boston, MA 02110-1301 USA. */ 22 23 #ifndef _M17N_CHARACTER_H_ 24 #define _M17N_CHARACTER_H_ 25 26 /* UTF-8 format 27 28 0-7F 0xxxxxxx 29 80-7FF 110xxxxx 10xxxxxx 30 800-FFFF 1110xxxx 10xxxxxx 10xxxxxx 31 10000-1FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 32 200000-3FFFFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 33 4000000-7FFFFFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 34 35 Unicode range: 36 0-10FFFF 0 - 11110uuu 10uuxxxx 10xxxxxx 10xxxxxx (uuuuu <= 0x10) 37 38 */ 39 40 #define MAX_UTF8_CHAR_BYTES 6 41 #define MAX_UNICODE_CHAR_BYTES 4 42 43 #define USHORT_SIZE (sizeof (unsigned short)) 44 #define UINT_SIZE (sizeof (unsigned int)) 45 46 /* Return how many bytes one unit (char, short, or int) in FORMAT 47 occupies. */ 48 49 #define UNIT_BYTES(format) \ 50 ((format) <= MTEXT_FORMAT_UTF_8 ? 1 \ 51 : (format) <= MTEXT_FORMAT_UTF_16BE ? USHORT_SIZE \ 52 : UINT_SIZE) 53 54 /* Return how many units (char, short, or int) C will occupy in 55 MText->data. If C is not in the supported range, return 0. */ 56 57 #define CHAR_UNITS_ASCII(c) ((c) < 0x80) 58 59 #define CHAR_UNITS_UTF8(c) \ 60 ((c) < 0x80 ? 1 \ 61 : (c) < 0x800 ? 2 \ 62 : (c) < 0x10000 ? 3 \ 63 : (c) < 0x200000 ? 4 \ 64 : (c) < 0x4000000 ? 5 \ 65 : 6) 66 67 #define CHAR_UNITS_UTF16(c) ((c) < 0x110000 ? (2 - ((c) < 0x10000)) : 0) 68 69 #define CHAR_UNITS_UTF32(c) 1 70 71 #define CHAR_UNITS(c, format) \ 72 ((format) <= MTEXT_FORMAT_UTF_8 ? CHAR_UNITS_UTF8 (c) \ 73 : (format) <= MTEXT_FORMAT_UTF_16BE ? CHAR_UNITS_UTF16 (c) \ 74 : CHAR_UNITS_UTF32 (c)) 75 76 #define CHAR_BYTES CHAR_UNITS_UTF8 77 78 #define CHAR_UNITS_AT_UTF8(p) \ 79 (!(*(p) & 0x80) ? 1 \ 80 : !(*(p) & 0x20) ? 2 \ 81 : !(*(p) & 0x10) ? 3 \ 82 : !(*(p) & 0x08) ? 4 \ 83 : !(*(p) & 0x04) ? 5 \ 84 : !(*(p) & 0x02) ? 6 \ 85 : 0) 86 87 #define CHAR_UNITS_AT_UTF16(p) \ 88 (2 - (*(unsigned short *) (p) < 0xD800 \ 89 || *(unsigned short *) (p) >= 0xDC00)) 90 91 #define CHAR_UNITS_AT(mt, p) \ 92 ((mt)->format <= MTEXT_FORMAT_UTF_8 ? CHAR_UNITS_AT_UTF8 (p) \ 93 : (mt)->format <= MTEXT_FORMAT_UTF_16BE ? CHAR_UNITS_AT_UTF16 (p) \ 94 : 1) 95 96 #define CHAR_BYTES_AT CHAR_UNITS_AT_UTF8 97 98 #define CHAR_UNITS_BY_HEAD_UTF8(c) \ 99 (!((c) & 0x80) ? 1 \ 100 : !((c) & 0x20) ? 2 \ 101 : !((c) & 0x10) ? 3 \ 102 : !((c) & 0x08) ? 4 \ 103 : !((c) & 0x04) ? 5 \ 104 : !((c) & 0x02) ? 6 \ 105 : 0) 106 107 #define CHAR_UNITS_BY_HEAD_UTF16(c) \ 108 (2 - ((unsigned short) (c) < 0xD800 || (unsigned short) (c) >= 0xDC00)) 109 110 #define CHAR_UNITS_BY_HEAD(c, format) \ 111 ((format) <= MTEXT_FORMAT_UTF_8 ? CHAR_UNITS_BY_HEAD_UTF8 (c) \ 112 : (format) <= MTEXT_FORMAT_UTF_16BE ? CHAR_UNITS_BY_HEAD_UTF16 (c) \ 113 : 1) 114 115 #define CHAR_BYTES_BY_HEAD CHAR_UNITS_BY_HEAD_UTF8 116 117 #define STRING_CHAR_UTF8(p) \ 118 (!((p)[0] & 0x80) ? (p)[0] \ 119 : !((p)[0] & 0x20) ? ((((p)[0] & 0x1F) << 6) \ 120 | ((p)[1] & 0x3F)) \ 121 : !((p)[0] & 0x10) ? ((((p)[0] & 0x0F) << 12) \ 122 | (((p)[1] & 0x3F) << 6) \ 123 | ((p)[2] & 0x3F)) \ 124 : !((p)[0] & 0x08) ? ((((p)[0] & 0x07) << 18) \ 125 | (((p)[1] & 0x3F) << 12) \ 126 | (((p)[2] & 0x3F) << 6) \ 127 | ((p)[3] & 0x3F)) \ 128 : !((p)[0] & 0x04) ? ((((p)[0] & 0x03) << 24) \ 129 | (((p)[1] & 0x3F) << 18) \ 130 | (((p)[2] & 0x3F) << 12) \ 131 | (((p)[3] & 0x3F) << 6) \ 132 | ((p)[4] & 0x3F)) \ 133 : ((((p)[0] & 0x01) << 30) \ 134 | (((p)[1] & 0x3F) << 24) \ 135 | (((p)[2] & 0x3F) << 18) \ 136 | (((p)[3] & 0x3F) << 12) \ 137 | (((p)[4] & 0x3F) << 6) \ 138 | ((p)[5] & 0x3F))) 139 140 #define STRING_CHAR_UTF16(p) \ 141 (((unsigned short) (p)[0] < 0xD800 || (unsigned short) (p)[0] >= 0xDC00) \ 142 ? (p)[0] \ 143 : ((((p)[0] - 0xD800) << 10) + ((p)[1] - 0xDC00) + 0x10000)) 144 145 146 #define STRING_CHAR STRING_CHAR_UTF8 147 148 149 #define STRING_CHAR_ADVANCE_UTF8(p) \ 150 (!(*(p) & 0x80) ? ((p)++, (p)[-1]) \ 151 : !(*(p) & 0x20) ? ((p) += 2, ((((p)[-2] & 0x1F) << 6) \ 152 | ((p)[-1] & 0x3F))) \ 153 : !(*(p) & 0x10) ? ((p) += 3, ((((p)[-3] & 0x0F) << 12) \ 154 | (((p)[-2] & 0x3F) << 6) \ 155 | ((p)[-1] & 0x3F))) \ 156 : !(*(p) & 0x08) ? ((p) += 4, ((((p)[-4] & 0x07) << 18) \ 157 | (((p)[-3] & 0x3F) << 12) \ 158 | (((p)[-2] & 0x3F) << 6) \ 159 | ((p)[-1] & 0x3F))) \ 160 : !(*(p) & 0x04) ? ((p) += 5, ((((p)[-5] & 0x03) << 24) \ 161 | (((p)[-4] & 0x3F) << 18) \ 162 | (((p)[-3] & 0x3F) << 12) \ 163 | (((p)[-2] & 0x3F) << 6) \ 164 | ((p)[-1] & 0x3F))) \ 165 : ((p) += 6, ((((p)[-6] & 0x01) << 30) \ 166 | (((p)[-5] & 0x3F) << 24) \ 167 | (((p)[-4] & 0x3F) << 18) \ 168 | (((p)[-3] & 0x3F) << 12) \ 169 | (((p)[-2] & 0x3F) << 6) \ 170 | ((p)[-1] & 0x3F)))) 171 172 #define STRING_CHAR_ADVANCE_UTF16(p) \ 173 (((p)[0] < 0xD800 || (p)[0] >= 0xDC00) \ 174 ? ((p)++, (p)[-1]) \ 175 : ((p) += 2, ((((p)[-2] - 0xD800) << 10) + ((p)[-1] - 0xDC00) + 0x10000))) 176 177 #define STRING_CHAR_ADVANCE STRING_CHAR_ADVANCE_UTF8 178 179 #define STRING_CHAR_AND_UNITS_UTF8(p, bytes) \ 180 (!((p)[0] & 0x80) ? ((bytes) = 1, (p)[0]) \ 181 : !((p)[0] & 0x20) ? ((bytes) = 2, \ 182 ((((p)[0] & 0x1F) << 6) \ 183 | ((p)[1] & 0x3F))) \ 184 : !((p)[0] & 0x10) ? ((bytes) = 3, \ 185 ((((p)[0] & 0x0F) << 12) \ 186 | (((p)[1] & 0x3F) << 6) \ 187 | ((p)[2] & 0x3F))) \ 188 : !((p)[0] & 0x08) ? ((bytes) = 4, \ 189 ((((p)[0] & 0x07) << 18) \ 190 | (((p)[1] & 0x3F) << 12) \ 191 | (((p)[2] & 0x3F) << 6) \ 192 | ((p)[3] & 0x3F))) \ 193 : !((p)[0] & 0x04) ? ((bytes) = 5, \ 194 ((((p)[0] & 0x03) << 24) \ 195 | (((p)[1] & 0x3F) << 18) \ 196 | (((p)[2] & 0x3F) << 12) \ 197 | (((p)[3] & 0x3F) << 6) \ 198 | ((p)[4] & 0x3F))) \ 199 : ((bytes) = 6, \ 200 ((((p)[0] & 0x01) << 30) \ 201 | (((p)[1] & 0x3F) << 24) \ 202 | (((p)[2] & 0x3F) << 18) \ 203 | (((p)[3] & 0x3F) << 12) \ 204 | (((p)[4] & 0x3F) << 6) \ 205 | ((p)[5] & 0x3F)))) 206 207 #define STRING_CHAR_AND_UNITS_UTF16(p, units) \ 208 (((unsigned short) (p)[0] < 0xD800 || (unsigned short) (p)[0] >= 0xDC00) \ 209 ? ((units) = 1, (p)[0]) \ 210 : ((units) = 2, \ 211 (((p)[0] - 0xD800) << 10) + ((p)[1] - 0xDC00) + 0x10000)) 212 213 #define STRING_CHAR_AND_UNITS(p, units, format) \ 214 ((format) <= MTEXT_FORMAT_UTF_8 \ 215 ? STRING_CHAR_AND_UNITS_UTF8 (p, units) \ 216 : (format) <= MTEXT_FORMAT_UTF_16BE \ 217 ? STRING_CHAR_AND_UNITS_UTF16 (p, units) \ 218 : ((units) = 1, ((unsigned) (p))[0])) 219 220 221 #define STRING_CHAR_AND_BYTES STRING_CHAR_AND_UNITS_UTF8 222 223 #define CHAR_STRING_UTF8(c, p) \ 224 ((c) < 0x80 \ 225 ? ((p)[0] = (c), 1) \ 226 : (c) < 0x800 ? ((p)[0] = (0xC0 | ((c) >> 6)), \ 227 (p)[1] = (0x80 | ((c) & 0x3F)), \ 228 2) \ 229 : (c) < 0x10000 ? ((p)[0] = (0xE0 | ((c) >> 12)), \ 230 (p)[1] = (0x80 | (((c) >> 6) & 0x3F)), \ 231 (p)[2] = (0x80 | ((c) & 0x3F)), \ 232 3) \ 233 : (c) < 0x200000 ? ((p)[0] = (0xF0 | ((c) >> 18)), \ 234 (p)[1] = (0x80 | (((c) >> 12) & 0x3F)), \ 235 (p)[2] = (0x80 | (((c) >> 6) & 0x3F)), \ 236 (p)[3] = (0x80 | ((c) & 0x3F)), \ 237 4) \ 238 : (c) < 0x4000000 ? ((p)[0] = 0xF8, \ 239 (p)[1] = (0x80 | ((c) >> 18)), \ 240 (p)[2] = (0x80 | (((c) >> 12) & 0x3F)), \ 241 (p)[3] = (0x80 | (((c) >> 6) & 0x3F)), \ 242 (p)[4] = (0x80 | ((c) & 0x3F)), \ 243 5) \ 244 : ((p)[0] = (0xFC | ((c) >> 30)), \ 245 (p)[1] = (0x80 | (((c) >> 24) & 0x3F)), \ 246 (p)[2] = (0x80 | (((c) >> 18) & 0x3F)), \ 247 (p)[3] = (0x80 | (((c) >> 12) & 0x3F)), \ 248 (p)[4] = (0x80 | (((c) >> 6) & 0x3F)), \ 249 (p)[5] = (0x80 | ((c) & 0x3F)), \ 250 6)) 251 252 #define CHAR_STRING_UTF16(c, p) \ 253 ((c) < 0x10000 ? (p)[0] = (c), 1 \ 254 : (p[0] = (((c) - 0x10000) >> 10) + 0xD800, \ 255 p[1] = (((c) - 0x10000) & 0x3FF) + 0xDC00, \ 256 2)) 257 258 #define CHAR_STRING CHAR_STRING_UTF8 259 260 #define CHAR_HEAD_P_UTF8(p) \ 261 ((*(p) & 0xC0) != 0x80) 262 263 #define CHAR_HEAD_P_UTF16(p) \ 264 (*(unsigned short *) (p) < 0xDC00 \ 265 || *(unsigned short *) (p) >= 0xE000) 266 267 #define CHAR_HEAD_P CHAR_HEAD_P_UTF8 268 269 /** Locale-safe version of tolower (). It works only for an ASCII 270 character. */ 271 #define TOLOWER(c) (((c) >= 'A' && (c) <= 'Z') ? (c) + 32 : (c)) 272 273 /** Locale-safe version of toupper (). It works only for an ASCII 274 character. */ 275 #define TOUPPER(c) (((c) >= 'a' && (c) <= 'z') ? (c) - 32 : (c)) 276 277 /** Locale-safe version of isupper (). It works only for an ASCII 278 character. */ 279 #define ISUPPER(c) ((c) >= 'A' && (c) <= 'Z') 280 281 /** Locale-safe version of isalnum (). It works only for an ASCII 282 character. */ 283 #define ISALNUM(c) \ 284 (((c) >= 'A' && (c) <= 'Z') \ 285 || ((c) >= 'a' && (c) <= 'z') \ 286 || ((c) >= '0' && (c) <= '9')) 287 288 extern void mchar__define_prop (MSymbol key, MSymbol type, void *mdb); 289 290 #endif /* not _M17N_CHARACTER_H_ */ 291