1 /*
2 * String cache.
3 *
4 * Provides a cache to optimize indexed string lookups. The cache keeps
5 * track of (byte offset, char offset) states for a fixed number of strings.
6 * Otherwise we'd need to scan from either end of the string, as we store
7 * strings in (extended) UTF-8.
8 */
9
10 #include "duk_internal.h"
11
12 /*
13 * Delete references to given hstring from the heap string cache.
14 *
15 * String cache references are 'weak': they are not counted towards
16 * reference counts, nor serve as roots for mark-and-sweep. When an
17 * object is about to be freed, such references need to be removed.
18 */
19
duk_heap_strcache_string_remove(duk_heap * heap,duk_hstring * h)20 DUK_INTERNAL void duk_heap_strcache_string_remove(duk_heap *heap, duk_hstring *h) {
21 duk_uint_t i;
22 for (i = 0; i < DUK_HEAP_STRCACHE_SIZE; i++) {
23 duk_strcache_entry *c = heap->strcache + i;
24 if (c->h == h) {
25 DUK_DD(DUK_DDPRINT("deleting weak strcache reference to hstring %p from heap %p",
26 (void *) h, (void *) heap));
27 c->h = NULL;
28
29 /* XXX: the string shouldn't appear twice, but we now loop to the
30 * end anyway; if fixed, add a looping assertion to ensure there
31 * is no duplicate.
32 */
33 }
34 }
35 }
36
37 /*
38 * String scanning helpers
39 *
40 * All bytes other than UTF-8 continuation bytes ([0x80,0xbf]) are
41 * considered to contribute a character. This must match how string
42 * character length is computed.
43 */
44
duk__scan_forwards(const duk_uint8_t * p,const duk_uint8_t * q,duk_uint_fast32_t n)45 DUK_LOCAL const duk_uint8_t *duk__scan_forwards(const duk_uint8_t *p, const duk_uint8_t *q, duk_uint_fast32_t n) {
46 while (n > 0) {
47 for (;;) {
48 p++;
49 if (p >= q) {
50 return NULL;
51 }
52 if ((*p & 0xc0) != 0x80) {
53 break;
54 }
55 }
56 n--;
57 }
58 return p;
59 }
60
duk__scan_backwards(const duk_uint8_t * p,const duk_uint8_t * q,duk_uint_fast32_t n)61 DUK_LOCAL const duk_uint8_t *duk__scan_backwards(const duk_uint8_t *p, const duk_uint8_t *q, duk_uint_fast32_t n) {
62 while (n > 0) {
63 for (;;) {
64 p--;
65 if (p < q) {
66 return NULL;
67 }
68 if ((*p & 0xc0) != 0x80) {
69 break;
70 }
71 }
72 n--;
73 }
74 return p;
75 }
76
77 /*
78 * Convert char offset to byte offset
79 *
80 * Avoid using the string cache if possible: for ASCII strings byte and
81 * char offsets are equal and for short strings direct scanning may be
82 * better than using the string cache (which may evict a more important
83 * entry).
84 *
85 * Typing now assumes 32-bit string byte/char offsets (duk_uint_fast32_t).
86 * Better typing might be to use duk_size_t.
87 *
88 * Caller should ensure 'char_offset' is within the string bounds [0,charlen]
89 * (endpoint is inclusive). If this is not the case, no memory unsafe
90 * behavior will happen but an error will be thrown.
91 */
92
duk_heap_strcache_offset_char2byte(duk_hthread * thr,duk_hstring * h,duk_uint_fast32_t char_offset)93 DUK_INTERNAL duk_uint_fast32_t duk_heap_strcache_offset_char2byte(duk_hthread *thr, duk_hstring *h, duk_uint_fast32_t char_offset) {
94 duk_heap *heap;
95 duk_strcache_entry *sce;
96 duk_uint_fast32_t byte_offset;
97 duk_uint_t i;
98 duk_bool_t use_cache;
99 duk_uint_fast32_t dist_start, dist_end, dist_sce;
100 duk_uint_fast32_t char_length;
101 const duk_uint8_t *p_start;
102 const duk_uint8_t *p_end;
103 const duk_uint8_t *p_found;
104
105 /*
106 * For ASCII strings, the answer is simple.
107 */
108
109 if (DUK_LIKELY(DUK_HSTRING_IS_ASCII(h))) {
110 return char_offset;
111 }
112
113 char_length = (duk_uint_fast32_t) DUK_HSTRING_GET_CHARLEN(h);
114 DUK_ASSERT(char_offset <= char_length);
115
116 if (DUK_LIKELY(DUK_HSTRING_IS_ASCII(h))) {
117 /* Must recheck because the 'is ascii' flag may be set
118 * lazily. Alternatively, we could just compare charlen
119 * to bytelen.
120 */
121 return char_offset;
122 }
123
124 /*
125 * For non-ASCII strings, we need to scan forwards or backwards
126 * from some starting point. The starting point may be the start
127 * or end of the string, or some cached midpoint in the string
128 * cache.
129 *
130 * For "short" strings we simply scan without checking or updating
131 * the cache. For longer strings we check and update the cache as
132 * necessary, inserting a new cache entry if none exists.
133 */
134
135 DUK_DDD(DUK_DDDPRINT("non-ascii string %p, char_offset=%ld, clen=%ld, blen=%ld",
136 (void *) h, (long) char_offset,
137 (long) DUK_HSTRING_GET_CHARLEN(h),
138 (long) DUK_HSTRING_GET_BYTELEN(h)));
139
140 heap = thr->heap;
141 sce = NULL;
142 use_cache = (char_length > DUK_HEAP_STRINGCACHE_NOCACHE_LIMIT);
143
144 if (use_cache) {
145 #if defined(DUK_USE_DEBUG_LEVEL) && (DUK_USE_DEBUG_LEVEL >= 2)
146 DUK_DDD(DUK_DDDPRINT("stringcache before char2byte (using cache):"));
147 for (i = 0; i < DUK_HEAP_STRCACHE_SIZE; i++) {
148 duk_strcache_entry *c = heap->strcache + i;
149 DUK_DDD(DUK_DDDPRINT(" [%ld] -> h=%p, cidx=%ld, bidx=%ld",
150 (long) i, (void *) c->h, (long) c->cidx, (long) c->bidx));
151 }
152 #endif
153
154 for (i = 0; i < DUK_HEAP_STRCACHE_SIZE; i++) {
155 duk_strcache_entry *c = heap->strcache + i;
156
157 if (c->h == h) {
158 sce = c;
159 break;
160 }
161 }
162 }
163
164 /*
165 * Scan from shortest distance:
166 * - start of string
167 * - end of string
168 * - cache entry (if exists)
169 */
170
171 DUK_ASSERT(DUK_HSTRING_GET_CHARLEN(h) >= char_offset);
172 dist_start = char_offset;
173 dist_end = char_length - char_offset;
174 dist_sce = 0; DUK_UNREF(dist_sce); /* initialize for debug prints, needed if sce==NULL */
175
176 p_start = (const duk_uint8_t *) DUK_HSTRING_GET_DATA(h);
177 p_end = (const duk_uint8_t *) (p_start + DUK_HSTRING_GET_BYTELEN(h));
178 p_found = NULL;
179
180 if (sce) {
181 if (char_offset >= sce->cidx) {
182 dist_sce = char_offset - sce->cidx;
183 if ((dist_sce <= dist_start) && (dist_sce <= dist_end)) {
184 DUK_DDD(DUK_DDDPRINT("non-ascii string, use_cache=%ld, sce=%p:%ld:%ld, "
185 "dist_start=%ld, dist_end=%ld, dist_sce=%ld => "
186 "scan forwards from sce",
187 (long) use_cache, (void *) (sce ? sce->h : NULL),
188 (sce ? (long) sce->cidx : (long) -1),
189 (sce ? (long) sce->bidx : (long) -1),
190 (long) dist_start, (long) dist_end, (long) dist_sce));
191
192 p_found = duk__scan_forwards(p_start + sce->bidx,
193 p_end,
194 dist_sce);
195 goto scan_done;
196 }
197 } else {
198 dist_sce = sce->cidx - char_offset;
199 if ((dist_sce <= dist_start) && (dist_sce <= dist_end)) {
200 DUK_DDD(DUK_DDDPRINT("non-ascii string, use_cache=%ld, sce=%p:%ld:%ld, "
201 "dist_start=%ld, dist_end=%ld, dist_sce=%ld => "
202 "scan backwards from sce",
203 (long) use_cache, (void *) (sce ? sce->h : NULL),
204 (sce ? (long) sce->cidx : (long) -1),
205 (sce ? (long) sce->bidx : (long) -1),
206 (long) dist_start, (long) dist_end, (long) dist_sce));
207
208 p_found = duk__scan_backwards(p_start + sce->bidx,
209 p_start,
210 dist_sce);
211 goto scan_done;
212 }
213 }
214 }
215
216 /* no sce, or sce scan not best */
217
218 if (dist_start <= dist_end) {
219 DUK_DDD(DUK_DDDPRINT("non-ascii string, use_cache=%ld, sce=%p:%ld:%ld, "
220 "dist_start=%ld, dist_end=%ld, dist_sce=%ld => "
221 "scan forwards from string start",
222 (long) use_cache, (void *) (sce ? sce->h : NULL),
223 (sce ? (long) sce->cidx : (long) -1),
224 (sce ? (long) sce->bidx : (long) -1),
225 (long) dist_start, (long) dist_end, (long) dist_sce));
226
227 p_found = duk__scan_forwards(p_start,
228 p_end,
229 dist_start);
230 } else {
231 DUK_DDD(DUK_DDDPRINT("non-ascii string, use_cache=%ld, sce=%p:%ld:%ld, "
232 "dist_start=%ld, dist_end=%ld, dist_sce=%ld => "
233 "scan backwards from string end",
234 (long) use_cache, (void *) (sce ? sce->h : NULL),
235 (sce ? (long) sce->cidx : (long) -1),
236 (sce ? (long) sce->bidx : (long) -1),
237 (long) dist_start, (long) dist_end, (long) dist_sce));
238
239 p_found = duk__scan_backwards(p_end,
240 p_start,
241 dist_end);
242 }
243
244 scan_done:
245
246 if (DUK_UNLIKELY(p_found == NULL)) {
247 /* Scan error: this shouldn't normally happen; it could happen if
248 * string is not valid UTF-8 data, and clen/blen are not consistent
249 * with the scanning algorithm.
250 */
251 goto scan_error;
252 }
253
254 DUK_ASSERT(p_found >= p_start);
255 DUK_ASSERT(p_found <= p_end); /* may be equal */
256 byte_offset = (duk_uint32_t) (p_found - p_start);
257
258 DUK_DDD(DUK_DDDPRINT("-> string %p, cidx %ld -> bidx %ld",
259 (void *) h, (long) char_offset, (long) byte_offset));
260
261 /*
262 * Update cache entry (allocating if necessary), and move the
263 * cache entry to the first place (in an "LRU" policy).
264 */
265
266 if (use_cache) {
267 /* update entry, allocating if necessary */
268 if (!sce) {
269 sce = heap->strcache + DUK_HEAP_STRCACHE_SIZE - 1; /* take last entry */
270 sce->h = h;
271 }
272 DUK_ASSERT(sce != NULL);
273 sce->bidx = (duk_uint32_t) (p_found - p_start);
274 sce->cidx = (duk_uint32_t) char_offset;
275
276 /* LRU: move our entry to first */
277 if (sce > &heap->strcache[0]) {
278 /*
279 * A C
280 * B A
281 * C <- sce ==> B
282 * D D
283 */
284 duk_strcache_entry tmp;
285
286 tmp = *sce;
287 duk_memmove((void *) (&heap->strcache[1]),
288 (const void *) (&heap->strcache[0]),
289 (size_t) (((char *) sce) - ((char *) &heap->strcache[0])));
290 heap->strcache[0] = tmp;
291
292 /* 'sce' points to the wrong entry here, but is no longer used */
293 }
294 #if defined(DUK_USE_DEBUG_LEVEL) && (DUK_USE_DEBUG_LEVEL >= 2)
295 DUK_DDD(DUK_DDDPRINT("stringcache after char2byte (using cache):"));
296 for (i = 0; i < DUK_HEAP_STRCACHE_SIZE; i++) {
297 duk_strcache_entry *c = heap->strcache + i;
298 DUK_DDD(DUK_DDDPRINT(" [%ld] -> h=%p, cidx=%ld, bidx=%ld",
299 (long) i, (void *) c->h, (long) c->cidx, (long) c->bidx));
300 }
301 #endif
302 }
303
304 return byte_offset;
305
306 scan_error:
307 DUK_ERROR_INTERNAL(thr);
308 DUK_WO_NORETURN(return 0;);
309 }
310