/*
 * charset.h - Character set implementation
 *
 *   Copyright (c) 2000-2020  Shiro Kawai  <shiro@acm.org>
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *
 *   3. Neither the name of the authors nor the names of its contributors
 *      may be used to endorse or promote products derived from this
 *      software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 *   TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 *   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 *   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 *   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 *   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

34 #ifndef GAUCHE_CHARSET_H
35 #define GAUCHE_CHARSET_H
36 
/* We implement char-sets as a hybrid of a bitmap and a binary tree.
 *
 * The bitmap is used for "small" characters, i.e. characters between
 * U+0000 and U+007F.  There, each bit represents whether the
 * character is in the set (1) or not (0).
 *
 * For larger characters, we keep the ranges of included chars
 * in a binary tree.  For each entry, its key is the start char code
 * and its value is the end char code (inclusive).
 * For example, if the character set has characters between
 * U+3040 and U+30FF, and between U+4E00 and U+9FBF, then the tree has
 * the following entries:
 *   #x3040 => #x30ff, #x4e00 => #x9fbf.
 *
 * There are mutable char-sets (the default) and immutable, or frozen,
 * char-sets.  A mutable char-set uses ScmTreeCore for the large
 * characters.  An immutable char-set uses a flat u32vector, sorted by
 * the key and accessed by binary search.
 */

/* Number of "small" character codes (U+0000..U+007F) that are tracked
   in the bitmap rather than in the range structure. */
#define SCM_CHAR_SET_SMALL_CHARS 128

59 struct ScmCharSetRec {
60     SCM_HEADER;
61     ScmBits small[SCM_BITS_NUM_WORDS(SCM_CHAR_SET_SMALL_CHARS)];
62     u_int flags;
63     union {
64         ScmTreeCore tree;
65         struct {
66             ScmSize size; /* size of vec.  # of entries is half of this */
67             const uint32_t *vec;
68             uint32_t ivec[2]; /* if size==2, vec points here */
69         } frozen;
70     } large;
71 };
72 
/*
 * Bit flags kept in the `flags' member of a char-set and tested by
 * SCM_CHAR_SET_LARGE_P and SCM_CHAR_SET_IMMUTABLE_P.  The values are
 * distinct bits, so they can be combined with bitwise OR.
 */
typedef enum {
    /* NOTE(review): presumably set when the char-set holds characters
       outside the small bitmap range -- confirm in the implementation. */
    SCM_CHAR_SET_LARGE = 1,
    /* The char-set is immutable (frozen). */
    SCM_CHAR_SET_IMMUTABLE = 2,
} ScmCharSetType;

/* Class object for char-sets, and the class pointer that
   SCM_CHAR_SET_P passes to SCM_XTYPEP. */
SCM_CLASS_DECL(Scm_CharSetClass);
#define SCM_CLASS_CHAR_SET  (&Scm_CharSetClass)
/* Cast OBJ to ScmCharSet*.  No type check is performed.  The argument
   is parenthesized so that an arbitrary expression (e.g. pointer
   arithmetic or a conditional) is cast as a whole. */
#define SCM_CHAR_SET(obj)   ((ScmCharSet*)(obj))
/* Type predicate: tests OBJ against SCM_CLASS_CHAR_SET via SCM_XTYPEP. */
#define SCM_CHAR_SET_P(obj) SCM_XTYPEP(obj, SCM_CLASS_CHAR_SET)

/* Flag tests on a char-set.  Each expands to the masked flag word, so the
   result is zero when the flag is clear and the flag's own bit value
   (not necessarily 1) when it is set. */
#define SCM_CHAR_SET_LARGE_P(obj) \
    (SCM_CHAR_SET(obj)->flags & SCM_CHAR_SET_LARGE)
#define SCM_CHAR_SET_IMMUTABLE_P(obj) \
    (SCM_CHAR_SET(obj)->flags & SCM_CHAR_SET_IMMUTABLE)

/* Old spellings kept for backward compatibility.  Deprecated: each one
   simply expands to the corresponding SCM_CHAR_SET* macro. */
#define SCM_CLASS_CHARSET   SCM_CLASS_CHAR_SET
#define SCM_CHARSET(obj)    SCM_CHAR_SET(obj)
#define SCM_CHARSETP(obj)   SCM_CHAR_SET_P(obj)

93 SCM_EXTERN ScmObj Scm_MakeEmptyCharSet(void);
94 SCM_EXTERN ScmObj Scm_MakeImmutableCharSet(const ScmBits *small,
95                                            const uint32_t *vec,
96                                            size_t size);
97 SCM_EXTERN ScmObj Scm_CharSetCopy(ScmCharSet *src);
98 SCM_EXTERN ScmObj Scm_CharSetFreeze(ScmCharSet *src);
99 SCM_EXTERN ScmObj Scm_CharSetFreezeX(ScmCharSet *src);
100 SCM_EXTERN int    Scm_CharSetEq(ScmCharSet *x, ScmCharSet *y);
101 SCM_EXTERN int    Scm_CharSetLE(ScmCharSet *x, ScmCharSet *y);
102 SCM_EXTERN ScmObj Scm_CharSetAddRange(ScmCharSet *cs,
103                                       ScmChar from, ScmChar to);
104 SCM_EXTERN ScmObj Scm_CharSetAdd(ScmCharSet *dest, ScmCharSet *src);
105 SCM_EXTERN ScmObj Scm_CharSetComplement(ScmCharSet *cs);
106 SCM_EXTERN ScmObj Scm_CharSetCaseFold(ScmCharSet *cs);
107 SCM_EXTERN ScmObj Scm_CharSetRanges(ScmCharSet *cs);
108 SCM_EXTERN ScmObj Scm_CharSetRead(ScmPort *input, int *complement_p,
109                                   int error_p, int bracket_syntax);
110 SCM_EXTERN int    Scm_CharSetParseCategory(ScmPort *input, char ch);
111 
112 SCM_EXTERN int    Scm_CharSetContains(ScmCharSet *cs, ScmChar c);
113 SCM_EXTERN void   Scm_CharSetDump(ScmCharSet *cs, ScmPort *port);
114 
115 #endif /*GAUCHE_CHARSET_H*/
116