1 /*
2  * stringP.h - String private API
3  *
4  *   Copyright (c) 2020  Shiro Kawai  <shiro@acm.org>
5  *
6  *   Redistribution and use in source and binary forms, with or without
7  *   modification, are permitted provided that the following conditions
8  *   are met:
9  *
10  *   1. Redistributions of source code must retain the above copyright
11  *      notice, this list of conditions and the following disclaimer.
12  *
13  *   2. Redistributions in binary form must reproduce the above copyright
14  *      notice, this list of conditions and the following disclaimer in the
15  *      documentation and/or other materials provided with the distribution.
16  *
17  *   3. Neither the name of the authors nor the names of its contributors
18  *      may be used to endorse or promote products derived from this
19  *      software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
27  *   TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28  *   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29  *   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30  *   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31  *   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #ifndef GAUCHE_PRIV_STRINGP_H
35 #define GAUCHE_PRIV_STRINGP_H
36 
37 /* String cursor */
38 
39 /* If the offset is less than or equal to this, we use small cursor that
40    fits in ScmObj.  Otherwise we allocate ScmStringCursorLarge. */
41 #if SIZEOF_LONG == 4
42 #define SCM_STRING_CURSOR_SMALL_OFFSET_MAX  ((1L<<24)-1)
43 #else
44 #define SCM_STRING_CURSOR_SMALL_OFFSET_MAX  ((1L<<56)-1)
45 #endif
46 
47 struct ScmStringCursorLargeRec {
48     SCM_HEADER;
49     const char *start;
50     ScmSmallInt offset;	 /* in bytes, relative to string body start
51                             offset is non-negative. */
52 };
53 
54 #define SCM_STRING_CURSOR_FITS_SMALL_P(off) \
55     ((off) <= SCM_STRING_CURSOR_SMALL_OFFSET_MAX)
56 
57 #define SCM_STRING_CURSOR_LARGE_P(obj) \
58     SCM_XTYPEP(obj, SCM_CLASS_STRING_CURSOR_LARGE)
59 #define SCM_STRING_CURSOR_LARGE(obj)       ((ScmStringCursorLarge*)obj)
60 #define SCM_STRING_CURSOR_LARGE_OFFSET(obj)      \
61     (SCM_STRING_CURSOR_LARGE(obj)->offset)
62 #define SCM_STRING_CURSOR_LARGE_POINTER(sb, obj) \
63     (SCM_STRING_BODY_START(sb) + SCM_STRING_CURSOR_LARGE_OFFSET(obj))
64 #define SCM_STRING_CURSOR_LARGE_START(obj)       \
65     (SCM_STRING_CURSOR_LARGE(obj)->start)
66 
67 #define SCM_MAKE_STRING_CURSOR_SMALL(obj) \
68     SCM_OBJ(((uintptr_t)(obj) << 8) + 0x1b)
69 #define SCM_STRING_CURSOR_SMALL_P(obj)        (SCM_TAG8(obj) == 0x1b)
70 #define SCM_STRING_CURSOR_SMALL_OFFSET(obj) \
71     ((ScmSmallInt)(SCM_WORD(obj) >> 8))
72 #define SCM_STRING_CURSOR_SMALL_POINTER(sb, obj) \
73     (SCM_STRING_BODY_START(sb) + SCM_STRING_CURSOR_SMALL_OFFSET(obj))
74 
75 #define SCM_STRING_CURSOR_P(obj) \
76     (SCM_STRING_CURSOR_SMALL_P(obj)||SCM_STRING_CURSOR_LARGE_P(obj))
77 
78 /* String index
79  *
80  *  Attaching an index to a StringBody allows O(1) random access.
81  *  We don't worry too much about the constant factor; space-efficiency
82  *  is also a concern, for indexing is more important for long strings.
83  */
84 
85 typedef union ScmStringIndexRec {
86     /* We use element size according to the length of string body.
87        The first entry encodes the entry size and shift value.
88        (We might try index24 and/or index48.)
89 
90        RRSSS000 - index8
91        RRSSS001 - index16
92        RRSSS011 - index32
93        RRSSS101 - index64
94 
95        'SSS' encodes a value S, where an index is created for
96        every 1<<(S+1) characters.
97 
98        'RR' is reserved.  Currently 00.
99 
100        This octet is repeated to fill the first entry.  So, if the index
101        is index32 and S is 6, the first word is actually #x32323232.
102        This allows us to read the first byte to find out the actual
103        index type, regardless of endianness.
104 
105        The index array has the signature in the first element, and the total
106        lenght of the index array in the second.  The actual index array
107        begins from the third element.  Given the character index N,
108        indexX[(N>>shift)+2] contains the byte position of the character.
109      */
110     const uint8_t  signature;
111     const uint8_t  index8[1];
112     const uint16_t index16[1];
113     const uint32_t index32[1];
114     const uint64_t index64[1];
115 } ScmStringIndex;
116 
117 enum {
118     STRING_INDEX8 = 0,
119     STRING_INDEX16 = 1,
120     STRING_INDEX32 = 3,
121     STRING_INDEX64 = 5
122 };
123 
124 #define STRING_INDEX(p)       ((ScmStringIndex*)(p))
125 #define STRING_INDEX_TYPE(p)  (STRING_INDEX(p)->signature & 0x07)
126 #define STRING_INDEX_SHIFT(p) (((STRING_INDEX(p)->signature>>3)&0x07)+1)
127 #define STRING_INDEX_INTERVAL(p) (1L<<STRING_INDEX_SHIFT(p))
128 
129 #define STRING_INDEX_SIGNATURE(s, t)  (((((s)-1)&0x7)<<3)|((t)&0x07))
130 
131 #define SCM_STRING_BODY_HAS_INDEX(sb) ((sb)->index != NULL)
132 
133 SCM_EXTERN void Scm_StringBodyBuildIndex(ScmStringBody *sb);
134 SCM_EXTERN void Scm_StringBodyIndexDump(const ScmStringBody *sb, ScmPort *port);
135 
136 #endif /*GAUCHE_PRIV_STRINGP_H*/
137