1 /*
2  *  String cache.
3  *
4  *  Provides a cache to optimize indexed string lookups.  The cache keeps
5  *  track of (byte offset, char offset) states for a fixed number of strings.
6  *  Otherwise we'd need to scan from either end of the string, as we store
7  *  strings in (extended) UTF-8.
8  */
9 
10 #include "duk_internal.h"
11 
12 /*
13  *  Delete references to given hstring from the heap string cache.
14  *
15  *  String cache references are 'weak': they are not counted towards
16  *  reference counts, nor serve as roots for mark-and-sweep.  When an
17  *  object is about to be freed, such references need to be removed.
18  */
19 
duk_heap_strcache_string_remove(duk_heap * heap,duk_hstring * h)20 DUK_INTERNAL void duk_heap_strcache_string_remove(duk_heap *heap, duk_hstring *h) {
21 	duk_uint_t i;
22 	for (i = 0; i < DUK_HEAP_STRCACHE_SIZE; i++) {
23 		duk_strcache_entry *c = heap->strcache + i;
24 		if (c->h == h) {
25 			DUK_DD(DUK_DDPRINT("deleting weak strcache reference to hstring %p from heap %p",
26 			                   (void *) h, (void *) heap));
27 			c->h = NULL;
28 
29 			/* XXX: the string shouldn't appear twice, but we now loop to the
30 			 * end anyway; if fixed, add a looping assertion to ensure there
31 			 * is no duplicate.
32 			 */
33 		}
34 	}
35 }
36 
37 /*
38  *  String scanning helpers
39  *
40  *  All bytes other than UTF-8 continuation bytes ([0x80,0xbf]) are
41  *  considered to contribute a character.  This must match how string
42  *  character length is computed.
43  */
44 
duk__scan_forwards(const duk_uint8_t * p,const duk_uint8_t * q,duk_uint_fast32_t n)45 DUK_LOCAL const duk_uint8_t *duk__scan_forwards(const duk_uint8_t *p, const duk_uint8_t *q, duk_uint_fast32_t n) {
46 	while (n > 0) {
47 		for (;;) {
48 			p++;
49 			if (p >= q) {
50 				return NULL;
51 			}
52 			if ((*p & 0xc0) != 0x80) {
53 				break;
54 			}
55 		}
56 		n--;
57 	}
58 	return p;
59 }
60 
duk__scan_backwards(const duk_uint8_t * p,const duk_uint8_t * q,duk_uint_fast32_t n)61 DUK_LOCAL const duk_uint8_t *duk__scan_backwards(const duk_uint8_t *p, const duk_uint8_t *q, duk_uint_fast32_t n) {
62 	while (n > 0) {
63 		for (;;) {
64 			p--;
65 			if (p < q) {
66 				return NULL;
67 			}
68 			if ((*p & 0xc0) != 0x80) {
69 				break;
70 			}
71 		}
72 		n--;
73 	}
74 	return p;
75 }
76 
77 /*
78  *  Convert char offset to byte offset
79  *
80  *  Avoid using the string cache if possible: for ASCII strings byte and
81  *  char offsets are equal and for short strings direct scanning may be
82  *  better than using the string cache (which may evict a more important
83  *  entry).
84  *
85  *  Typing now assumes 32-bit string byte/char offsets (duk_uint_fast32_t).
86  *  Better typing might be to use duk_size_t.
87  *
88  *  Caller should ensure 'char_offset' is within the string bounds [0,charlen]
89  *  (endpoint is inclusive).  If this is not the case, no memory unsafe
90  *  behavior will happen but an error will be thrown.
91  */
92 
duk_heap_strcache_offset_char2byte(duk_hthread * thr,duk_hstring * h,duk_uint_fast32_t char_offset)93 DUK_INTERNAL duk_uint_fast32_t duk_heap_strcache_offset_char2byte(duk_hthread *thr, duk_hstring *h, duk_uint_fast32_t char_offset) {
94 	duk_heap *heap;
95 	duk_strcache_entry *sce;
96 	duk_uint_fast32_t byte_offset;
97 	duk_uint_t i;
98 	duk_bool_t use_cache;
99 	duk_uint_fast32_t dist_start, dist_end, dist_sce;
100 	duk_uint_fast32_t char_length;
101 	const duk_uint8_t *p_start;
102 	const duk_uint8_t *p_end;
103 	const duk_uint8_t *p_found;
104 
105 	/*
106 	 *  For ASCII strings, the answer is simple.
107 	 */
108 
109 	if (DUK_LIKELY(DUK_HSTRING_IS_ASCII(h))) {
110 		return char_offset;
111 	}
112 
113 	char_length = (duk_uint_fast32_t) DUK_HSTRING_GET_CHARLEN(h);
114 	DUK_ASSERT(char_offset <= char_length);
115 
116 	if (DUK_LIKELY(DUK_HSTRING_IS_ASCII(h))) {
117 		/* Must recheck because the 'is ascii' flag may be set
118 		 * lazily.  Alternatively, we could just compare charlen
119 		 * to bytelen.
120 		 */
121 		return char_offset;
122 	}
123 
124 	/*
125 	 *  For non-ASCII strings, we need to scan forwards or backwards
126 	 *  from some starting point.  The starting point may be the start
127 	 *  or end of the string, or some cached midpoint in the string
128 	 *  cache.
129 	 *
130 	 *  For "short" strings we simply scan without checking or updating
131 	 *  the cache.  For longer strings we check and update the cache as
132 	 *  necessary, inserting a new cache entry if none exists.
133 	 */
134 
135 	DUK_DDD(DUK_DDDPRINT("non-ascii string %p, char_offset=%ld, clen=%ld, blen=%ld",
136 	                     (void *) h, (long) char_offset,
137 	                     (long) DUK_HSTRING_GET_CHARLEN(h),
138 	                     (long) DUK_HSTRING_GET_BYTELEN(h)));
139 
140 	heap = thr->heap;
141 	sce = NULL;
142 	use_cache = (char_length > DUK_HEAP_STRINGCACHE_NOCACHE_LIMIT);
143 
144 	if (use_cache) {
145 #if defined(DUK_USE_DEBUG_LEVEL) && (DUK_USE_DEBUG_LEVEL >= 2)
146 		DUK_DDD(DUK_DDDPRINT("stringcache before char2byte (using cache):"));
147 		for (i = 0; i < DUK_HEAP_STRCACHE_SIZE; i++) {
148 			duk_strcache_entry *c = heap->strcache + i;
149 			DUK_DDD(DUK_DDDPRINT("  [%ld] -> h=%p, cidx=%ld, bidx=%ld",
150 			                     (long) i, (void *) c->h, (long) c->cidx, (long) c->bidx));
151 		}
152 #endif
153 
154 		for (i = 0; i < DUK_HEAP_STRCACHE_SIZE; i++) {
155 			duk_strcache_entry *c = heap->strcache + i;
156 
157 			if (c->h == h) {
158 				sce = c;
159 				break;
160 			}
161 		}
162 	}
163 
164 	/*
165 	 *  Scan from shortest distance:
166 	 *    - start of string
167 	 *    - end of string
168 	 *    - cache entry (if exists)
169 	 */
170 
171 	DUK_ASSERT(DUK_HSTRING_GET_CHARLEN(h) >= char_offset);
172 	dist_start = char_offset;
173 	dist_end = char_length - char_offset;
174 	dist_sce = 0; DUK_UNREF(dist_sce);  /* initialize for debug prints, needed if sce==NULL */
175 
176 	p_start = (const duk_uint8_t *) DUK_HSTRING_GET_DATA(h);
177 	p_end = (const duk_uint8_t *) (p_start + DUK_HSTRING_GET_BYTELEN(h));
178 	p_found = NULL;
179 
180 	if (sce) {
181 		if (char_offset >= sce->cidx) {
182 			dist_sce = char_offset - sce->cidx;
183 			if ((dist_sce <= dist_start) && (dist_sce <= dist_end)) {
184 				DUK_DDD(DUK_DDDPRINT("non-ascii string, use_cache=%ld, sce=%p:%ld:%ld, "
185 				                     "dist_start=%ld, dist_end=%ld, dist_sce=%ld => "
186 				                     "scan forwards from sce",
187 				                     (long) use_cache, (void *) (sce ? sce->h : NULL),
188 				                     (sce ? (long) sce->cidx : (long) -1),
189 				                     (sce ? (long) sce->bidx : (long) -1),
190 				                     (long) dist_start, (long) dist_end, (long) dist_sce));
191 
192 				p_found = duk__scan_forwards(p_start + sce->bidx,
193 				                             p_end,
194 				                             dist_sce);
195 				goto scan_done;
196 			}
197 		} else {
198 			dist_sce = sce->cidx - char_offset;
199 			if ((dist_sce <= dist_start) && (dist_sce <= dist_end)) {
200 				DUK_DDD(DUK_DDDPRINT("non-ascii string, use_cache=%ld, sce=%p:%ld:%ld, "
201 				                     "dist_start=%ld, dist_end=%ld, dist_sce=%ld => "
202 				                     "scan backwards from sce",
203 				                     (long) use_cache, (void *) (sce ? sce->h : NULL),
204 				                     (sce ? (long) sce->cidx : (long) -1),
205 				                     (sce ? (long) sce->bidx : (long) -1),
206 				                     (long) dist_start, (long) dist_end, (long) dist_sce));
207 
208 				p_found = duk__scan_backwards(p_start + sce->bidx,
209 				                              p_start,
210 				                              dist_sce);
211 				goto scan_done;
212 			}
213 		}
214 	}
215 
216 	/* no sce, or sce scan not best */
217 
218 	if (dist_start <= dist_end) {
219 		DUK_DDD(DUK_DDDPRINT("non-ascii string, use_cache=%ld, sce=%p:%ld:%ld, "
220 		                     "dist_start=%ld, dist_end=%ld, dist_sce=%ld => "
221 		                     "scan forwards from string start",
222 		                     (long) use_cache, (void *) (sce ? sce->h : NULL),
223 		                     (sce ? (long) sce->cidx : (long) -1),
224 		                     (sce ? (long) sce->bidx : (long) -1),
225 		                     (long) dist_start, (long) dist_end, (long) dist_sce));
226 
227 		p_found = duk__scan_forwards(p_start,
228 		                             p_end,
229 		                             dist_start);
230 	} else {
231 		DUK_DDD(DUK_DDDPRINT("non-ascii string, use_cache=%ld, sce=%p:%ld:%ld, "
232 		                     "dist_start=%ld, dist_end=%ld, dist_sce=%ld => "
233 		                     "scan backwards from string end",
234 		                     (long) use_cache, (void *) (sce ? sce->h : NULL),
235 		                     (sce ? (long) sce->cidx : (long) -1),
236 		                     (sce ? (long) sce->bidx : (long) -1),
237 		                     (long) dist_start, (long) dist_end, (long) dist_sce));
238 
239 		p_found = duk__scan_backwards(p_end,
240 		                              p_start,
241 		                              dist_end);
242 	}
243 
244  scan_done:
245 
246 	if (DUK_UNLIKELY(p_found == NULL)) {
247 		/* Scan error: this shouldn't normally happen; it could happen if
248 		 * string is not valid UTF-8 data, and clen/blen are not consistent
249 		 * with the scanning algorithm.
250 		 */
251 		goto scan_error;
252 	}
253 
254 	DUK_ASSERT(p_found >= p_start);
255 	DUK_ASSERT(p_found <= p_end);  /* may be equal */
256 	byte_offset = (duk_uint32_t) (p_found - p_start);
257 
258 	DUK_DDD(DUK_DDDPRINT("-> string %p, cidx %ld -> bidx %ld",
259 	                     (void *) h, (long) char_offset, (long) byte_offset));
260 
261 	/*
262 	 *  Update cache entry (allocating if necessary), and move the
263 	 *  cache entry to the first place (in an "LRU" policy).
264 	 */
265 
266 	if (use_cache) {
267 		/* update entry, allocating if necessary */
268 		if (!sce) {
269 			sce = heap->strcache + DUK_HEAP_STRCACHE_SIZE - 1;  /* take last entry */
270 			sce->h = h;
271 		}
272 		DUK_ASSERT(sce != NULL);
273 		sce->bidx = (duk_uint32_t) (p_found - p_start);
274 		sce->cidx = (duk_uint32_t) char_offset;
275 
276 		/* LRU: move our entry to first */
277 		if (sce > &heap->strcache[0]) {
278 			/*
279 			 *   A                  C
280 			 *   B                  A
281 			 *   C <- sce    ==>    B
282 			 *   D                  D
283 			 */
284 			duk_strcache_entry tmp;
285 
286 			tmp = *sce;
287 			duk_memmove((void *) (&heap->strcache[1]),
288 			            (const void *) (&heap->strcache[0]),
289 			            (size_t) (((char *) sce) - ((char *) &heap->strcache[0])));
290 			heap->strcache[0] = tmp;
291 
292 			/* 'sce' points to the wrong entry here, but is no longer used */
293 		}
294 #if defined(DUK_USE_DEBUG_LEVEL) && (DUK_USE_DEBUG_LEVEL >= 2)
295 		DUK_DDD(DUK_DDDPRINT("stringcache after char2byte (using cache):"));
296 		for (i = 0; i < DUK_HEAP_STRCACHE_SIZE; i++) {
297 			duk_strcache_entry *c = heap->strcache + i;
298 			DUK_DDD(DUK_DDDPRINT("  [%ld] -> h=%p, cidx=%ld, bidx=%ld",
299 			                     (long) i, (void *) c->h, (long) c->cidx, (long) c->bidx));
300 		}
301 #endif
302 	}
303 
304 	return byte_offset;
305 
306  scan_error:
307 	DUK_ERROR_INTERNAL(thr);
308 	DUK_WO_NORETURN(return 0;);
309 }
310