/* Multibyte character data type.
   Copyright (C) 2001, 2005 Free Software Foundation, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software Foundation,
   Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */

/* Written by Bruno Haible <bruno@clisp.org>.  */

/* A multibyte character is a short subsequence of a char* string,
   representing a single wide character.

   We use multibyte characters instead of wide characters because of
   the following goals:
   1) correct multibyte handling, i.e. operate according to the LC_CTYPE
      locale,
   2) ease of maintenance, i.e. the maintainer need not know all details
      of the ISO C 99 standard,
   3) don't fail grossly if the input is not in the encoding set by the
      locale, because often different encodings are in use in the same
      countries (ISO-8859-1/UTF-8, EUC-JP/Shift_JIS, ...),
   4) fast in the case of ASCII characters,
   5) portability, i.e. don't make unportable assumptions about wchar_t.

   Multibyte characters are only accessed through the mb* macros.

   mb_ptr (mbc)
     returns a pointer to the beginning of the multibyte sequence.

   mb_len (mbc)
     returns the number of bytes occupied by the multibyte sequence.
     Always > 0.

   mb_iseq (mbc, sc)
     returns true if mbc is the standard ASCII character sc.

   mb_isnul (mbc)
     returns true if mbc is the nul character.

   mb_cmp (mbc1, mbc2)
     returns a positive, zero, or negative value depending on whether mbc1
     sorts after, same or before mbc2.

   mb_casecmp (mbc1, mbc2)
     returns a positive, zero, or negative value depending on whether mbc1
     sorts after, same or before mbc2, modulo upper/lowercase conversion.

   mb_equal (mbc1, mbc2)
     returns true if mbc1 and mbc2 are equal.

   mb_caseequal (mbc1, mbc2)
     returns true if mbc1 and mbc2 are equal modulo upper/lowercase conversion.

   mb_isalnum (mbc)
     returns true if mbc is alphanumeric.

   mb_isalpha (mbc)
     returns true if mbc is alphabetic.

   mb_isascii(mbc)
     returns true if mbc is plain ASCII.

   mb_isblank (mbc)
     returns true if mbc is a blank.

   mb_iscntrl (mbc)
     returns true if mbc is a control character.

   mb_isdigit (mbc)
     returns true if mbc is a decimal digit.

   mb_isgraph (mbc)
     returns true if mbc is a graphic character.

   mb_islower (mbc)
     returns true if mbc is lowercase.

   mb_isprint (mbc)
     returns true if mbc is a printable character.

   mb_ispunct (mbc)
     returns true if mbc is a punctuation character.

   mb_isspace (mbc)
     returns true if mbc is a space character.

   mb_isupper (mbc)
     returns true if mbc is uppercase.

   mb_isxdigit (mbc)
     returns true if mbc is a hexadecimal digit.

   mb_width (mbc)
     returns the number of columns on the output device occupied by mbc.
     Always >= 0.

   mb_putc (mbc, stream)
     outputs mbc on stream, a byte oriented FILE stream opened for output.

   mb_setascii (&mbc, sc)
     assigns the standard ASCII character sc to mbc.

   mb_copy (&destmbc, &srcmbc)
     copies srcmbc to destmbc.

   Here are the function prototypes of the macros.

   extern const char *	mb_ptr (const mbchar_t mbc);
   extern size_t	mb_len (const mbchar_t mbc);
   extern bool		mb_iseq (const mbchar_t mbc, char sc);
   extern bool		mb_isnul (const mbchar_t mbc);
   extern int		mb_cmp (const mbchar_t mbc1, const mbchar_t mbc2);
   extern int		mb_casecmp (const mbchar_t mbc1, const mbchar_t mbc2);
   extern bool		mb_equal (const mbchar_t mbc1, const mbchar_t mbc2);
   extern bool		mb_caseequal (const mbchar_t mbc1, const mbchar_t mbc2);
   extern bool		mb_isalnum (const mbchar_t mbc);
   extern bool		mb_isalpha (const mbchar_t mbc);
   extern bool		mb_isascii (const mbchar_t mbc);
   extern bool		mb_isblank (const mbchar_t mbc);
   extern bool		mb_iscntrl (const mbchar_t mbc);
   extern bool		mb_isdigit (const mbchar_t mbc);
   extern bool		mb_isgraph (const mbchar_t mbc);
   extern bool		mb_islower (const mbchar_t mbc);
   extern bool		mb_isprint (const mbchar_t mbc);
   extern bool		mb_ispunct (const mbchar_t mbc);
   extern bool		mb_isspace (const mbchar_t mbc);
   extern bool		mb_isupper (const mbchar_t mbc);
   extern bool		mb_isxdigit (const mbchar_t mbc);
   extern int		mb_width (const mbchar_t mbc);
   extern void		mb_putc (const mbchar_t mbc, FILE *stream);
   extern void		mb_setascii (mbchar_t *new, char sc);
   extern void		mb_copy (mbchar_t *new, const mbchar_t *old);
 */

#ifndef _MBCHAR_H
#define _MBCHAR_H 1

#include <stdbool.h>
#include <string.h>

/* Tru64 with Desktop Toolkit C has a bug: <stdio.h> must be included before
   <wchar.h>.
   BSD/OS 4.1 has a bug: <stdio.h> and <time.h> must be included before
   <wchar.h>.  */
#include <stdio.h>
#include <time.h>
#include <wchar.h>

#include <wctype.h>

#define MBCHAR_BUF_SIZE 24

struct mbchar
{
  const char *ptr;	/* pointer to current character */
  size_t bytes;		/* number of bytes of current character, > 0 */
  bool wc_valid;	/* true if wc is a valid wide character */
  wchar_t wc;		/* if wc_valid: the current character */
  char buf[MBCHAR_BUF_SIZE]; /* room for the bytes, used for file input only */
};

/* EOF (not a real character) is represented with bytes = 0 and
   wc_valid = false.  */

typedef struct mbchar mbchar_t;

/* Access the current character.  */
#define mb_ptr(mbc) ((mbc).ptr)
#define mb_len(mbc) ((mbc).bytes)

/* Comparison of characters.  */

/* Three-way comparison of two wide characters: returns -1, 0 or 1.
   Comparing instead of subtracting avoids signed integer overflow, and the
   wrong sign that results from it, when the two values are far apart or
   when wchar_t is an unsigned type as wide as 'int'.  */
static inline int
mb_cmp_wc_aux (wchar_t wc1, wchar_t wc2)
{
  return (wc1 > wc2) - (wc1 < wc2);
}

/* Like mb_cmp_wc_aux, but compares the towlower() images of the two wide
   characters.  Returns -1, 0 or 1.  */
static inline int
mb_casecmp_wc_aux (wchar_t wc1, wchar_t wc2)
{
  wint_t lc1 = towlower (wc1);
  wint_t lc2 = towlower (wc2);
  return (lc1 > lc2) - (lc1 < lc2);
}

#define mb_iseq(mbc, sc) ((mbc).wc_valid && (mbc).wc == (sc))
#define mb_isnul(mbc) ((mbc).wc_valid && (mbc).wc == 0)
/* Ordering used by mb_cmp and mb_casecmp:
   - two valid characters are ordered by their wide character value,
   - a valid character sorts before an invalid one,
   - two invalid characters are ordered by their bytes; when one byte
     sequence is a prefix of the other, the shorter one sorts first.  */
#define mb_cmp(mbc1, mbc2) \
  ((mbc1).wc_valid							\
   ? ((mbc2).wc_valid							\
      ? mb_cmp_wc_aux ((mbc1).wc, (mbc2).wc)				\
      : -1)								\
   : ((mbc2).wc_valid							\
      ? 1								\
      : (mbc1).bytes == (mbc2).bytes					\
        ? memcmp ((mbc1).ptr, (mbc2).ptr, (mbc1).bytes)			\
        : (mbc1).bytes < (mbc2).bytes					\
          ? (memcmp ((mbc1).ptr, (mbc2).ptr, (mbc1).bytes) > 0 ? 1 : -1) \
          : (memcmp ((mbc1).ptr, (mbc2).ptr, (mbc2).bytes) >= 0 ? 1 : -1)))
#define mb_casecmp(mbc1, mbc2) \
  ((mbc1).wc_valid							\
   ? ((mbc2).wc_valid							\
      ? mb_casecmp_wc_aux ((mbc1).wc, (mbc2).wc)			\
      : -1)								\
   : ((mbc2).wc_valid							\
      ? 1								\
      : (mbc1).bytes == (mbc2).bytes					\
        ? memcmp ((mbc1).ptr, (mbc2).ptr, (mbc1).bytes)			\
        : (mbc1).bytes < (mbc2).bytes					\
          ? (memcmp ((mbc1).ptr, (mbc2).ptr, (mbc1).bytes) > 0 ? 1 : -1) \
          : (memcmp ((mbc1).ptr, (mbc2).ptr, (mbc2).bytes) >= 0 ? 1 : -1)))
/* Two characters are equal if both are valid and have the same wide
   character value, or otherwise if they consist of the same bytes.  */
#define mb_equal(mbc1, mbc2) \
  ((mbc1).wc_valid && (mbc2).wc_valid					\
   ? (mbc1).wc == (mbc2).wc						\
   : (mbc1).bytes == (mbc2).bytes					\
     && memcmp ((mbc1).ptr, (mbc2).ptr, (mbc1).bytes) == 0)
#define mb_caseequal(mbc1, mbc2) \
  ((mbc1).wc_valid && (mbc2).wc_valid					\
   ? towlower ((mbc1).wc) == towlower ((mbc2).wc)			\
   : (mbc1).bytes == (mbc2).bytes					\
     && memcmp ((mbc1).ptr, (mbc2).ptr, (mbc1).bytes) == 0)

/* <ctype.h>, <wctype.h> classification.
   An invalid character belongs to none of the classes.  */
#define mb_isascii(mbc) \
  ((mbc).wc_valid && (mbc).wc >= 0 && (mbc).wc <= 127)
#define mb_isalnum(mbc) ((mbc).wc_valid && iswalnum ((mbc).wc))
#define mb_isalpha(mbc) ((mbc).wc_valid && iswalpha ((mbc).wc))
#define mb_isblank(mbc) ((mbc).wc_valid && iswblank ((mbc).wc))
#define mb_iscntrl(mbc) ((mbc).wc_valid && iswcntrl ((mbc).wc))
#define mb_isdigit(mbc) ((mbc).wc_valid && iswdigit ((mbc).wc))
#define mb_isgraph(mbc) ((mbc).wc_valid && iswgraph ((mbc).wc))
#define mb_islower(mbc) ((mbc).wc_valid && iswlower ((mbc).wc))
#define mb_isprint(mbc) ((mbc).wc_valid && iswprint ((mbc).wc))
#define mb_ispunct(mbc) ((mbc).wc_valid && iswpunct ((mbc).wc))
#define mb_isspace(mbc) ((mbc).wc_valid && iswspace ((mbc).wc))
#define mb_isupper(mbc) ((mbc).wc_valid && iswupper ((mbc).wc))
#define mb_isxdigit(mbc) ((mbc).wc_valid && iswxdigit ((mbc).wc))

/* Extra <wchar.h> function.  */

/* Unprintable characters appear as a small box of width 1.  */
#define MB_UNPRINTABLE_WIDTH 1

static inline int
mb_width_aux (wint_t wc)
{
  int w = wcwidth (wc);
  /* For unprintable characters, arbitrarily return 0 for control characters
     and MB_UNPRINTABLE_WIDTH otherwise.  */
  return (w >= 0 ? w : iswcntrl (wc) ? 0 : MB_UNPRINTABLE_WIDTH);
}

/* An invalid character is displayed like an unprintable one.  */
#define mb_width(mbc) \
  ((mbc).wc_valid ? mb_width_aux ((mbc).wc) : MB_UNPRINTABLE_WIDTH)

/* Output.  */
#define mb_putc(mbc, stream)  fwrite ((mbc).ptr, 1, (mbc).bytes, (stream))

/* Assignment.
   The single byte is stored in the character's own buffer, and ptr is made
   to point to it.  */
#define mb_setascii(mbc, sc) \
  ((mbc)->ptr = (mbc)->buf, (mbc)->bytes = 1, (mbc)->wc_valid = true, \
   (mbc)->wc = (mbc)->buf[0] = (sc))

/* Copying a character.
   If OLD's bytes live in its own buffer, they are copied into NEW's buffer
   and NEW's ptr points there; otherwise NEW shares OLD's external pointer.
   The wide character value is copied only when it is valid.  */
static inline void
mb_copy (mbchar_t *new, const mbchar_t *old)
{
  if (old->ptr == &old->buf[0])
    {
      memcpy (&new->buf[0], &old->buf[0], old->bytes);
      new->ptr = &new->buf[0];
    }
  else
    new->ptr = old->ptr;
  new->bytes = old->bytes;
  if ((new->wc_valid = old->wc_valid))
    new->wc = old->wc;
}


/* is_basic(c) tests whether the single-byte character c is in the
   ISO C "basic character set".
   This is a convenience function, and is in this file only to share code
   between mbiter_multi.h and mbfile_multi.h.  */
#if (' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) \
    && ('%' == 37) && ('&' == 38) && ('\'' == 39) && ('(' == 40) \
    && (')' == 41) && ('*' == 42) && ('+' == 43) && (',' == 44) \
    && ('-' == 45) && ('.' == 46) && ('/' == 47) && ('0' == 48) \
    && ('1' == 49) && ('2' == 50) && ('3' == 51) && ('4' == 52) \
    && ('5' == 53) && ('6' == 54) && ('7' == 55) && ('8' == 56) \
    && ('9' == 57) && (':' == 58) && (';' == 59) && ('<' == 60) \
    && ('=' == 61) && ('>' == 62) && ('?' == 63) && ('A' == 65) \
    && ('B' == 66) && ('C' == 67) && ('D' == 68) && ('E' == 69) \
    && ('F' == 70) && ('G' == 71) && ('H' == 72) && ('I' == 73) \
    && ('J' == 74) && ('K' == 75) && ('L' == 76) && ('M' == 77) \
    && ('N' == 78) && ('O' == 79) && ('P' == 80) && ('Q' == 81) \
    && ('R' == 82) && ('S' == 83) && ('T' == 84) && ('U' == 85) \
    && ('V' == 86) && ('W' == 87) && ('X' == 88) && ('Y' == 89) \
    && ('Z' == 90) && ('[' == 91) && ('\\' == 92) && (']' == 93) \
    && ('^' == 94) && ('_' == 95) && ('a' == 97) && ('b' == 98) \
    && ('c' == 99) && ('d' == 100) && ('e' == 101) && ('f' == 102) \
    && ('g' == 103) && ('h' == 104) && ('i' == 105) && ('j' == 106) \
    && ('k' == 107) && ('l' == 108) && ('m' == 109) && ('n' == 110) \
    && ('o' == 111) && ('p' == 112) && ('q' == 113) && ('r' == 114) \
    && ('s' == 115) && ('t' == 116) && ('u' == 117) && ('v' == 118) \
    && ('w' == 119) && ('x' == 120) && ('y' == 121) && ('z' == 122) \
    && ('{' == 123) && ('|' == 124) && ('}' == 125) && ('~' == 126)
/* The character set is ISO-646, not EBCDIC. */
# define IS_BASIC_ASCII 1

extern unsigned int is_basic_table[];

/* Bit table lookup: the top 3 bits of the byte select the table word, the
   low 5 bits select the bit within that word.  */
static inline bool
is_basic (char c)
{
  return (is_basic_table [(unsigned char) c >> 5] >> ((unsigned char) c & 31))
	 & 1;
}

#else

static inline bool
is_basic (char c)
{
  switch (c)
    {
    case '\t': case '\v': case '\f':
    case ' ': case '!': case '"': case '#': case '%':
    case '&': case '\'': case '(': case ')': case '*':
    case '+': case ',': case '-': case '.': case '/':
    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
    case ':': case ';': case '<': case '=': case '>':
    case '?':
    case 'A': case 'B': case 'C': case 'D': case 'E':
    case 'F': case 'G': case 'H': case 'I': case 'J':
    case 'K': case 'L': case 'M': case 'N': case 'O':
    case 'P': case 'Q': case 'R': case 'S': case 'T':
    case 'U': case 'V': case 'W': case 'X': case 'Y':
    case 'Z':
    case '[': case '\\': case ']': case '^': case '_':
    case 'a': case 'b': case 'c': case 'd': case 'e':
    case 'f': case 'g': case 'h': case 'i': case 'j':
    case 'k': case 'l': case 'm': case 'n': case 'o':
    case 'p': case 'q': case 'r': case 's': case 't':
    case 'u': case 'v': case 'w': case 'x': case 'y':
    case 'z': case '{': case '|': case '}': case '~':
      return true;
    default:
      return false;
    }
}

#endif

#endif /* _MBCHAR_H */