/* * String cache. * * Provides a cache to optimize indexed string lookups. The cache keeps * track of (byte offset, char offset) states for a fixed number of strings. * Otherwise we'd need to scan from either end of the string, as we store * strings in (extended) UTF-8. */ #include "duk_internal.h" /* * Delete references to given hstring from the heap string cache. * * String cache references are 'weak': they are not counted towards * reference counts, nor serve as roots for mark-and-sweep. When an * object is about to be freed, such references need to be removed. */ DUK_INTERNAL void duk_heap_strcache_string_remove(duk_heap *heap, duk_hstring *h) { duk_uint_t i; for (i = 0; i < DUK_HEAP_STRCACHE_SIZE; i++) { duk_strcache_entry *c = heap->strcache + i; if (c->h == h) { DUK_DD(DUK_DDPRINT("deleting weak strcache reference to hstring %p from heap %p", (void *) h, (void *) heap)); c->h = NULL; /* XXX: the string shouldn't appear twice, but we now loop to the * end anyway; if fixed, add a looping assertion to ensure there * is no duplicate. */ } } } /* * String scanning helpers * * All bytes other than UTF-8 continuation bytes ([0x80,0xbf]) are * considered to contribute a character. This must match how string * character length is computed. */ DUK_LOCAL const duk_uint8_t *duk__scan_forwards(const duk_uint8_t *p, const duk_uint8_t *q, duk_uint_fast32_t n) { while (n > 0) { for (;;) { p++; if (p >= q) { return NULL; } if ((*p & 0xc0) != 0x80) { break; } } n--; } return p; } DUK_LOCAL const duk_uint8_t *duk__scan_backwards(const duk_uint8_t *p, const duk_uint8_t *q, duk_uint_fast32_t n) { while (n > 0) { for (;;) { p--; if (p < q) { return NULL; } if ((*p & 0xc0) != 0x80) { break; } } n--; } return p; } /* * Convert char offset to byte offset * * Avoid using the string cache if possible: for ASCII strings byte and * char offsets are equal and for short strings direct scanning may be * better than using the string cache (which may evict a more important * entry). * * Typing now assumes 32-bit string byte/char offsets (duk_uint_fast32_t). * Better typing might be to use duk_size_t. * * Caller should ensure 'char_offset' is within the string bounds [0,charlen] * (endpoint is inclusive). If this is not the case, no memory unsafe * behavior will happen but an error will be thrown. */ DUK_INTERNAL duk_uint_fast32_t duk_heap_strcache_offset_char2byte(duk_hthread *thr, duk_hstring *h, duk_uint_fast32_t char_offset) { duk_heap *heap; duk_strcache_entry *sce; duk_uint_fast32_t byte_offset; duk_uint_t i; duk_bool_t use_cache; duk_uint_fast32_t dist_start, dist_end, dist_sce; duk_uint_fast32_t char_length; const duk_uint8_t *p_start; const duk_uint8_t *p_end; const duk_uint8_t *p_found; /* * For ASCII strings, the answer is simple. */ if (DUK_LIKELY(DUK_HSTRING_IS_ASCII(h))) { return char_offset; } char_length = (duk_uint_fast32_t) DUK_HSTRING_GET_CHARLEN(h); DUK_ASSERT(char_offset <= char_length); if (DUK_LIKELY(DUK_HSTRING_IS_ASCII(h))) { /* Must recheck because the 'is ascii' flag may be set * lazily. Alternatively, we could just compare charlen * to bytelen. */ return char_offset; } /* * For non-ASCII strings, we need to scan forwards or backwards * from some starting point. The starting point may be the start * or end of the string, or some cached midpoint in the string * cache. * * For "short" strings we simply scan without checking or updating * the cache. For longer strings we check and update the cache as * necessary, inserting a new cache entry if none exists. */ DUK_DDD(DUK_DDDPRINT("non-ascii string %p, char_offset=%ld, clen=%ld, blen=%ld", (void *) h, (long) char_offset, (long) DUK_HSTRING_GET_CHARLEN(h), (long) DUK_HSTRING_GET_BYTELEN(h))); heap = thr->heap; sce = NULL; use_cache = (char_length > DUK_HEAP_STRINGCACHE_NOCACHE_LIMIT); if (use_cache) { #if defined(DUK_USE_DEBUG_LEVEL) && (DUK_USE_DEBUG_LEVEL >= 2) DUK_DDD(DUK_DDDPRINT("stringcache before char2byte (using cache):")); for (i = 0; i < DUK_HEAP_STRCACHE_SIZE; i++) { duk_strcache_entry *c = heap->strcache + i; DUK_DDD(DUK_DDDPRINT(" [%ld] -> h=%p, cidx=%ld, bidx=%ld", (long) i, (void *) c->h, (long) c->cidx, (long) c->bidx)); } #endif for (i = 0; i < DUK_HEAP_STRCACHE_SIZE; i++) { duk_strcache_entry *c = heap->strcache + i; if (c->h == h) { sce = c; break; } } } /* * Scan from shortest distance: * - start of string * - end of string * - cache entry (if exists) */ DUK_ASSERT(DUK_HSTRING_GET_CHARLEN(h) >= char_offset); dist_start = char_offset; dist_end = char_length - char_offset; dist_sce = 0; DUK_UNREF(dist_sce); /* initialize for debug prints, needed if sce==NULL */ p_start = (const duk_uint8_t *) DUK_HSTRING_GET_DATA(h); p_end = (const duk_uint8_t *) (p_start + DUK_HSTRING_GET_BYTELEN(h)); p_found = NULL; if (sce) { if (char_offset >= sce->cidx) { dist_sce = char_offset - sce->cidx; if ((dist_sce <= dist_start) && (dist_sce <= dist_end)) { DUK_DDD(DUK_DDDPRINT("non-ascii string, use_cache=%ld, sce=%p:%ld:%ld, " "dist_start=%ld, dist_end=%ld, dist_sce=%ld => " "scan forwards from sce", (long) use_cache, (void *) (sce ? sce->h : NULL), (sce ? (long) sce->cidx : (long) -1), (sce ? (long) sce->bidx : (long) -1), (long) dist_start, (long) dist_end, (long) dist_sce)); p_found = duk__scan_forwards(p_start + sce->bidx, p_end, dist_sce); goto scan_done; } } else { dist_sce = sce->cidx - char_offset; if ((dist_sce <= dist_start) && (dist_sce <= dist_end)) { DUK_DDD(DUK_DDDPRINT("non-ascii string, use_cache=%ld, sce=%p:%ld:%ld, " "dist_start=%ld, dist_end=%ld, dist_sce=%ld => " "scan backwards from sce", (long) use_cache, (void *) (sce ? sce->h : NULL), (sce ? (long) sce->cidx : (long) -1), (sce ? (long) sce->bidx : (long) -1), (long) dist_start, (long) dist_end, (long) dist_sce)); p_found = duk__scan_backwards(p_start + sce->bidx, p_start, dist_sce); goto scan_done; } } } /* no sce, or sce scan not best */ if (dist_start <= dist_end) { DUK_DDD(DUK_DDDPRINT("non-ascii string, use_cache=%ld, sce=%p:%ld:%ld, " "dist_start=%ld, dist_end=%ld, dist_sce=%ld => " "scan forwards from string start", (long) use_cache, (void *) (sce ? sce->h : NULL), (sce ? (long) sce->cidx : (long) -1), (sce ? (long) sce->bidx : (long) -1), (long) dist_start, (long) dist_end, (long) dist_sce)); p_found = duk__scan_forwards(p_start, p_end, dist_start); } else { DUK_DDD(DUK_DDDPRINT("non-ascii string, use_cache=%ld, sce=%p:%ld:%ld, " "dist_start=%ld, dist_end=%ld, dist_sce=%ld => " "scan backwards from string end", (long) use_cache, (void *) (sce ? sce->h : NULL), (sce ? (long) sce->cidx : (long) -1), (sce ? (long) sce->bidx : (long) -1), (long) dist_start, (long) dist_end, (long) dist_sce)); p_found = duk__scan_backwards(p_end, p_start, dist_end); } scan_done: if (DUK_UNLIKELY(p_found == NULL)) { /* Scan error: this shouldn't normally happen; it could happen if * string is not valid UTF-8 data, and clen/blen are not consistent * with the scanning algorithm. */ goto scan_error; } DUK_ASSERT(p_found >= p_start); DUK_ASSERT(p_found <= p_end); /* may be equal */ byte_offset = (duk_uint32_t) (p_found - p_start); DUK_DDD(DUK_DDDPRINT("-> string %p, cidx %ld -> bidx %ld", (void *) h, (long) char_offset, (long) byte_offset)); /* * Update cache entry (allocating if necessary), and move the * cache entry to the first place (in an "LRU" policy). */ if (use_cache) { /* update entry, allocating if necessary */ if (!sce) { sce = heap->strcache + DUK_HEAP_STRCACHE_SIZE - 1; /* take last entry */ sce->h = h; } DUK_ASSERT(sce != NULL); sce->bidx = (duk_uint32_t) (p_found - p_start); sce->cidx = (duk_uint32_t) char_offset; /* LRU: move our entry to first */ if (sce > &heap->strcache[0]) { /* * A C * B A * C <- sce ==> B * D D */ duk_strcache_entry tmp; tmp = *sce; duk_memmove((void *) (&heap->strcache[1]), (const void *) (&heap->strcache[0]), (size_t) (((char *) sce) - ((char *) &heap->strcache[0]))); heap->strcache[0] = tmp; /* 'sce' points to the wrong entry here, but is no longer used */ } #if defined(DUK_USE_DEBUG_LEVEL) && (DUK_USE_DEBUG_LEVEL >= 2) DUK_DDD(DUK_DDDPRINT("stringcache after char2byte (using cache):")); for (i = 0; i < DUK_HEAP_STRCACHE_SIZE; i++) { duk_strcache_entry *c = heap->strcache + i; DUK_DDD(DUK_DDDPRINT(" [%ld] -> h=%p, cidx=%ld, bidx=%ld", (long) i, (void *) c->h, (long) c->cidx, (long) c->bidx)); } #endif } return byte_offset; scan_error: DUK_ERROR_INTERNAL(thr); DUK_WO_NORETURN(return 0;); }