1 /* 2 * charset.h - Character set implementation 3 * 4 * Copyright (c) 2000-2020 Shiro Kawai <shiro@acm.org> 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * 3. Neither the name of the authors nor the names of its contributors 18 * may be used to endorse or promote products derived from this 19 * software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 27 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 28 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 29 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 30 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 31 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #ifndef GAUCHE_CHARSET_H 35 #define GAUCHE_CHARSET_H 36 37 /* We implement char-sets as hybrid of bitmap and binary tree. 38 * 39 * Bitmap is used for "small" characters, i.e. characters between 40 * U+0000 and U+007F. There, each bit represents whether the 41 * character is in the set (1) or not (0). 42 * 43 * For larger characters, we keep the range of included chars 44 * in a binary tree. For each entry, its key is the start char code 45 * and its value is the end char code (inclusive). 46 * For example, if the character set has characters between 47 * U+3040 and U+30FF, and U+4E00 and U+9FBF, then the tree has 48 * the following entries: 49 * #x3040 => #x30ff, #x4e00 => #x9fbf. 50 * 51 * We have mutable char-set (default) and immutable or frozen char-set. 52 * Mutable char-set uses ScmTreeCore for the large characters. Immutable 53 * char-set uses flat u32vector, sorted by the key and accessed by binary 54 * search. 55 */ 56 57 #define SCM_CHAR_SET_SMALL_CHARS 128 58 59 struct ScmCharSetRec { 60 SCM_HEADER; 61 ScmBits small[SCM_BITS_NUM_WORDS(SCM_CHAR_SET_SMALL_CHARS)]; 62 u_int flags; 63 union { 64 ScmTreeCore tree; 65 struct { 66 ScmSize size; /* size of vec. # of entries is half of this */ 67 const uint32_t *vec; 68 uint32_t ivec[2]; /* if size==2, vec points here */ 69 } frozen; 70 } large; 71 }; 72 73 typedef enum { 74 SCM_CHAR_SET_LARGE = 1, 75 SCM_CHAR_SET_IMMUTABLE = 2, 76 } ScmCharSetType; 77 78 SCM_CLASS_DECL(Scm_CharSetClass); 79 #define SCM_CLASS_CHAR_SET (&Scm_CharSetClass) 80 #define SCM_CHAR_SET(obj) ((ScmCharSet*)obj) 81 #define SCM_CHAR_SET_P(obj) SCM_XTYPEP(obj, SCM_CLASS_CHAR_SET) 82 83 #define SCM_CHAR_SET_LARGE_P(obj) \ 84 (SCM_CHAR_SET(obj)->flags & SCM_CHAR_SET_LARGE) 85 #define SCM_CHAR_SET_IMMUTABLE_P(obj) \ 86 (SCM_CHAR_SET(obj)->flags & SCM_CHAR_SET_IMMUTABLE) 87 88 /* for backward compatibility. deprecated. */ 89 #define SCM_CLASS_CHARSET SCM_CLASS_CHAR_SET 90 #define SCM_CHARSET(obj) SCM_CHAR_SET(obj) 91 #define SCM_CHARSETP(obj) SCM_CHAR_SET_P(obj) 92 93 SCM_EXTERN ScmObj Scm_MakeEmptyCharSet(void); 94 SCM_EXTERN ScmObj Scm_MakeImmutableCharSet(const ScmBits *small, 95 const uint32_t *vec, 96 size_t size); 97 SCM_EXTERN ScmObj Scm_CharSetCopy(ScmCharSet *src); 98 SCM_EXTERN ScmObj Scm_CharSetFreeze(ScmCharSet *src); 99 SCM_EXTERN ScmObj Scm_CharSetFreezeX(ScmCharSet *src); 100 SCM_EXTERN int Scm_CharSetEq(ScmCharSet *x, ScmCharSet *y); 101 SCM_EXTERN int Scm_CharSetLE(ScmCharSet *x, ScmCharSet *y); 102 SCM_EXTERN ScmObj Scm_CharSetAddRange(ScmCharSet *cs, 103 ScmChar from, ScmChar to); 104 SCM_EXTERN ScmObj Scm_CharSetAdd(ScmCharSet *dest, ScmCharSet *src); 105 SCM_EXTERN ScmObj Scm_CharSetComplement(ScmCharSet *cs); 106 SCM_EXTERN ScmObj Scm_CharSetCaseFold(ScmCharSet *cs); 107 SCM_EXTERN ScmObj Scm_CharSetRanges(ScmCharSet *cs); 108 SCM_EXTERN ScmObj Scm_CharSetRead(ScmPort *input, int *complement_p, 109 int error_p, int bracket_syntax); 110 SCM_EXTERN int Scm_CharSetParseCategory(ScmPort *input, char ch); 111 112 SCM_EXTERN int Scm_CharSetContains(ScmCharSet *cs, ScmChar c); 113 SCM_EXTERN void Scm_CharSetDump(ScmCharSet *cs, ScmPort *port); 114 115 #endif /*GAUCHE_CHARSET_H*/ 116