1 /* 2 * stringP.h - String private API 3 * 4 * Copyright (c) 2020 Shiro Kawai <shiro@acm.org> 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * 3. Neither the name of the authors nor the names of its contributors 18 * may be used to endorse or promote products derived from this 19 * software without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 27 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 28 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 29 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 30 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 31 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 #ifndef GAUCHE_PRIV_STRINGP_H 35 #define GAUCHE_PRIV_STRINGP_H 36 37 /* String cursor */ 38 39 /* If the offset is less than or equal to this, we use small cursor that 40 fits in ScmObj. Otherwise we allocate ScmStringCursorLarge. */ 41 #if SIZEOF_LONG == 4 42 #define SCM_STRING_CURSOR_SMALL_OFFSET_MAX ((1L<<24)-1) 43 #else 44 #define SCM_STRING_CURSOR_SMALL_OFFSET_MAX ((1L<<56)-1) 45 #endif 46 47 struct ScmStringCursorLargeRec { 48 SCM_HEADER; 49 const char *start; 50 ScmSmallInt offset; /* in bytes, relative to string body start 51 offset is non-negative. */ 52 }; 53 54 #define SCM_STRING_CURSOR_FITS_SMALL_P(off) \ 55 ((off) <= SCM_STRING_CURSOR_SMALL_OFFSET_MAX) 56 57 #define SCM_STRING_CURSOR_LARGE_P(obj) \ 58 SCM_XTYPEP(obj, SCM_CLASS_STRING_CURSOR_LARGE) 59 #define SCM_STRING_CURSOR_LARGE(obj) ((ScmStringCursorLarge*)obj) 60 #define SCM_STRING_CURSOR_LARGE_OFFSET(obj) \ 61 (SCM_STRING_CURSOR_LARGE(obj)->offset) 62 #define SCM_STRING_CURSOR_LARGE_POINTER(sb, obj) \ 63 (SCM_STRING_BODY_START(sb) + SCM_STRING_CURSOR_LARGE_OFFSET(obj)) 64 #define SCM_STRING_CURSOR_LARGE_START(obj) \ 65 (SCM_STRING_CURSOR_LARGE(obj)->start) 66 67 #define SCM_MAKE_STRING_CURSOR_SMALL(obj) \ 68 SCM_OBJ(((uintptr_t)(obj) << 8) + 0x1b) 69 #define SCM_STRING_CURSOR_SMALL_P(obj) (SCM_TAG8(obj) == 0x1b) 70 #define SCM_STRING_CURSOR_SMALL_OFFSET(obj) \ 71 ((ScmSmallInt)(SCM_WORD(obj) >> 8)) 72 #define SCM_STRING_CURSOR_SMALL_POINTER(sb, obj) \ 73 (SCM_STRING_BODY_START(sb) + SCM_STRING_CURSOR_SMALL_OFFSET(obj)) 74 75 #define SCM_STRING_CURSOR_P(obj) \ 76 (SCM_STRING_CURSOR_SMALL_P(obj)||SCM_STRING_CURSOR_LARGE_P(obj)) 77 78 /* String index 79 * 80 * Attaching an index to a StringBody allows O(1) random access. 81 * We don't worry too much about the constant factor; space-efficiency 82 * is also a concern, for indexing is more important for long strings. 83 */ 84 85 typedef union ScmStringIndexRec { 86 /* We use element size according to the length of string body. 87 The first entry encodes the entry size and shift value. 88 (We might try index24 and/or index48.) 89 90 RRSSS000 - index8 91 RRSSS001 - index16 92 RRSSS011 - index32 93 RRSSS101 - index64 94 95 'SSS' encodes a value S, where an index is created for 96 every 1<<(S+1) characters. 97 98 'RR' is reserved. Currently 00. 99 100 This octet is repeated to fill the first entry. So, if the index 101 is index32 and S is 6, the first word is actually #x32323232. 102 This allows us to read the first byte to find out the actual 103 index type, regardless of endianness. 104 105 The index array has the signature in the first element, and the total 106 lenght of the index array in the second. The actual index array 107 begins from the third element. Given the character index N, 108 indexX[(N>>shift)+2] contains the byte position of the character. 109 */ 110 const uint8_t signature; 111 const uint8_t index8[1]; 112 const uint16_t index16[1]; 113 const uint32_t index32[1]; 114 const uint64_t index64[1]; 115 } ScmStringIndex; 116 117 enum { 118 STRING_INDEX8 = 0, 119 STRING_INDEX16 = 1, 120 STRING_INDEX32 = 3, 121 STRING_INDEX64 = 5 122 }; 123 124 #define STRING_INDEX(p) ((ScmStringIndex*)(p)) 125 #define STRING_INDEX_TYPE(p) (STRING_INDEX(p)->signature & 0x07) 126 #define STRING_INDEX_SHIFT(p) (((STRING_INDEX(p)->signature>>3)&0x07)+1) 127 #define STRING_INDEX_INTERVAL(p) (1L<<STRING_INDEX_SHIFT(p)) 128 129 #define STRING_INDEX_SIGNATURE(s, t) (((((s)-1)&0x7)<<3)|((t)&0x07)) 130 131 #define SCM_STRING_BODY_HAS_INDEX(sb) ((sb)->index != NULL) 132 133 SCM_EXTERN void Scm_StringBodyBuildIndex(ScmStringBody *sb); 134 SCM_EXTERN void Scm_StringBodyIndexDump(const ScmStringBody *sb, ScmPort *port); 135 136 #endif /*GAUCHE_PRIV_STRINGP_H*/ 137