1 /*
2  * Copyright (c) 2003
3  *      David Leonard.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. Neither the name of David Leonard nor the names of its contributors
14  *    may be used to endorse or promote products derived from this software
15  *    without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
20  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
21  * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
22  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
23  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
27  * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28  * POSSIBILITY OF SUCH DAMAGE.
29  */
30 
31 #if HAVE_CONFIG_H
32 # include <config.h>
33 #endif
34 
35 #if STDC_HEADERS
36 # include <stdio.h>
37 #endif
38 
39 #include <see/type.h>
40 #include <see/string.h>
41 #include <see/try.h>
42 #include <see/mem.h>
43 #include <see/intern.h>
44 #include <see/error.h>
45 #include <see/interpreter.h>
46 #include <see/system.h>
47 #include <see/object.h>
48 
49 #include "stringdefs.h"
50 #include "dprint.h"
51 
52 /*
53  * Internalised strings.
54  *
55  * 'Interning' a string means to replace it with a unique pointer
56  * to a previously stored string.
57  * When a previously unseen string is interned, a copy is made.
58  * The result is that string comparison becomes a pointer comparison
59  * (much faster) and is primarily used for identifiers.
60  *
61  * This module uses a three-level intern strategy: the first level is the
62  * static list of library strings generated by string.defs. The second level
63  * is the application-wide "global" intern table, which applications must set
64  * up early and not change after the creation of any interpreter. The
65  * third level is the interpreter-local intern cache.
66  *
67  * This strategy allow the sharing of application static strings,
68  * while avoiding the need for mutual exclusion techniques between
69  * interpreters (since the library and application static strings are
70  * read-only).
71  */
72 
73 #define HASHTABSZ	257
74 #define HASHLENMAX	8		/* prefix of string hashed on */
75 
76 struct intern {				/* element in the intern hash table */
77 	struct intern *next;
78 	struct SEE_string *string;
79 };
80 typedef struct intern *(intern_tab_t[HASHTABSZ]);
81 
82 /* Prototypes */
83 static struct intern *  make(struct SEE_interpreter *, struct SEE_string *);
84 static unsigned int     hash(const struct SEE_string *);
85 static struct intern ** find(intern_tab_t *, struct SEE_string *,
86 			     unsigned int);
87 static int internalized(struct SEE_interpreter *interp,
88 			const struct SEE_string *s);
89 static void global_init(void);
90 
91 /** System-wide intern table */
92 static intern_tab_t	global_intern_tab;
93 static int		global_intern_tab_initialized;
94 
95 #ifndef NDEBUG
96 static int		global_intern_tab_locked = 0;
97 int			SEE_debug_intern;
98 #endif
99 
100 #ifndef NDEBUG
101 static int
string_only_contains_ascii(s)102 string_only_contains_ascii(s)
103 	const char *s;
104 {
105 	for (; *s; s++)
106 		if (*s & 0x80)
107 			return 0;
108 	return 1;
109 }
110 #endif
111 
112 /**
113  * Make an intern entry in the hash table containing the string s,
114  *  and flag s as being interned.
115  */
116 static struct intern *
make(interp,s)117 make(interp, s)
118 	struct SEE_interpreter *interp;		/* may be NULL */
119 	struct SEE_string *s;
120 {
121 	struct intern *i;
122 
123 	i = SEE_NEW(interp, struct intern);
124 	i->string = s;
125 	s->flags |= SEE_STRING_FLAG_INTERNED;
126 	i->next = NULL;
127 	return i;
128 }
129 
130 /** Compute the hash value of a UTF-16 string */
131 static unsigned int
hash(s)132 hash(s)
133 	const struct SEE_string *s;
134 {
135 	unsigned int j, h = 0;
136 	for (j = 0; j < HASHLENMAX && j < s->length; j++)
137 		h = (h << 1) ^ s->data[j];
138 	return h % HASHTABSZ;
139 }
140 
141 /**
142  * Compute the hash value of an ASCII string.
143  * Returns the same value as if the ASCII string had been converted to a
144  * UTF-16 string and passed to hash().
145  * Assumes the input string is indeed ASCII (bytes in 0x00..0x7f).
146  */
147 static unsigned int
hash_ascii(s,lenret)148 hash_ascii(s, lenret)
149 	const char *s;
150 	unsigned int *lenret;
151 {
152 	unsigned int j, h = 0;
153 	const char *t;
154 
155 	for (j = 0, t = s; j < HASHLENMAX && *t; j++, t++)
156 		h = (h << 1) ^ *t;
157 	while (*t) t++;
158 	*lenret = t - s;
159 	return h % HASHTABSZ;
160 }
161 
162 /** Find an interned string */
163 static struct intern **
find(intern_tab,s,hash)164 find(intern_tab, s, hash)
165 	intern_tab_t *intern_tab;
166 	struct SEE_string *s;
167 	unsigned int hash;
168 {
169 	struct intern **x;
170 
171 	x = &(*intern_tab)[hash];
172 	while (*x && SEE_string_cmp((*x)->string, s) != 0)
173 		x = &((*x)->next);
174 	return x;
175 }
176 
177 /**
178  * Returns true if a SEE_string matches the ASCII string s.
179  * Assumes the string s is ASCII (0x00..0x7f).
180  */
181 static int
ascii_eq(str,s)182 ascii_eq(str, s)
183 	struct SEE_string *str;
184 	const char *s;
185 {
186 	unsigned int len = str->length;
187 	SEE_char_t *c = str->data;
188 
189 	while (len--)
190 		if (!*s)
191 			return 0;
192 		else if (*c++ != *s++)
193 			return 0;
194 	return *s == 0;
195 }
196 
197 /** Find an interned ASCII string */
198 static struct intern **
find_ascii(intern_tab,s,hash)199 find_ascii(intern_tab, s, hash)
200 	intern_tab_t *intern_tab;
201 	const char *s;
202 	unsigned int hash;
203 {
204 	struct intern **x;
205 
206 	x = &(*intern_tab)[hash];
207 	while (*x && !ascii_eq((*x)->string, s))
208 		x = &((*x)->next);
209 	return x;
210 }
211 
212 /** Create an interpreter-local intern table */
213 void
_SEE_intern_init(interp)214 _SEE_intern_init(interp)
215 	struct SEE_interpreter *interp;
216 {
217 	intern_tab_t *intern_tab;
218 	unsigned int i;
219 
220 	global_init();
221 #ifndef NDEBUG
222 	global_intern_tab_locked = 1;
223 #endif
224 
225 	intern_tab = SEE_NEW(interp, intern_tab_t);
226 	for (i = 0; i < HASHTABSZ; i++)
227 		(*intern_tab)[i] = NULL;
228 
229 	interp->intern_tab = intern_tab;
230 }
231 
232 /* Returns true if the string is already internalized */
233 static int
internalized(interp,s)234 internalized(interp, s)
235 	struct SEE_interpreter *interp;
236 	const struct SEE_string *s;
237 {
238 	/*
239 	 * A string is internalized if
240 	 *  - is already internalized in this interpreter or the global hash
241 	 *  - is one of the static resource strings
242 	 */
243 	return
244 	    ((!s->interpreter || s->interpreter == interp) &&
245 	     (s->flags & SEE_STRING_FLAG_INTERNED)) ||
246 	    (s >= STRn(0) && s < STRn(SEE_nstringtab));
247 }
248 
249 /**
250  * Intern a string relative to an interpreter. Also reads the global table
251  * Note that a different pointer to s is *ALWAYS* returned unless s was
252  * originally returned by SEE_intern. In other words, on the first occurrence
253  * of a string, it is duplicated. This makes it safe to intern strings that
254  * will be later grown. However, you MUST not alter content of strings
255  * returned from this function.
256  */
257 struct SEE_string *
SEE_intern(interp,s)258 SEE_intern(interp, s)
259 	struct SEE_interpreter *interp;
260 	struct SEE_string *s;
261 {
262 	struct intern **x;
263 	unsigned int h;
264 #ifndef NDEBUG
265 	const char *where = NULL;
266 # define WHERE(f) do { if (SEE_debug_intern) where=f; } while (0)
267 #else
268 # define WHERE(f) /* nothing */
269 #endif
270 
271 	/* Allow interning NULL to lessen the number of error checks */
272 	if (!s)
273 	    return NULL;
274 
275 	if (internalized(interp, s)) {
276 #ifndef NDEBUG
277 		if (SEE_debug_intern) {
278 		    dprintf("INTERN ");
279 		    dprints(s);
280 		    dprintf(" -> %p [interned]\n", s);
281 		}
282 #endif
283 		return s;
284 	}
285 
286 	/* If the string is from another interpreter, then it must
287 	 * have been intern'd already. This is to prevent race conditions
288 	 * with string whose content is changing. */
289 	SEE_ASSERT(interp, !s->interpreter || s->interpreter == interp ||
290 		(s->flags & SEE_STRING_FLAG_INTERNED));
291 
292 	/* Look in system-wide intern table first */
293 	h = hash(s);
294 	x = find(&global_intern_tab, s, h);
295 	WHERE("global");
296 	if (!*x) {
297 		x = find(interp->intern_tab, s, h);
298 		WHERE("local");
299 		if (!*x) {
300 			*x = make(interp, _SEE_string_dup_fix(interp, s));
301 			WHERE("new");
302 		}
303 	}
304 #ifndef NDEBUG
305 	if (SEE_debug_intern) {
306 	    dprintf("INTERN ");
307 	    dprints(s);
308 	    dprintf(" -> %p [%s h=%d]\n", (*x)->string, where, h);
309 	}
310 #endif
311 	return (*x)->string;
312 
313 }
314 
315 /** Efficiently converts an ASCII C string into an internalised SEE string */
316 struct SEE_string *
SEE_intern_ascii(interp,s)317 SEE_intern_ascii(interp, s)
318 	struct SEE_interpreter *interp;
319 	const char *s;
320 {
321 	struct SEE_string *str;
322 	const char *t;
323 	SEE_char_t *c;
324 	unsigned int h, len;
325 	struct intern **x;
326 #ifndef NDEBUG
327 	const char *where = NULL;
328 #endif
329 
330 	SEE_ASSERT(interp, s != NULL);
331 	SEE_ASSERT(interp, string_only_contains_ascii(s));
332 
333 	h = hash_ascii(s, &len);
334 	x = find_ascii(&global_intern_tab, s, h);
335 	WHERE("global");
336 	if (!*x) {
337 	    x = find_ascii(interp->intern_tab, s, h);
338 	    WHERE("local");
339 	    if (!*x) {
340 		WHERE("new");
341 		str = SEE_NEW(interp, struct SEE_string);
342 		str->length = len;
343 		str->data = SEE_NEW_STRING_ARRAY(interp, SEE_char_t, len);
344 		for (c = str->data, t = s; *t;)
345 			*c++ = *t++;
346 		str->interpreter = interp;
347 		str->stringclass = NULL;
348 		str->flags = 0;
349 	    	SEE_ASSERT(interp, hash(str) == h);
350 		*x = make(interp, str);
351 		}
352 	    }
353 #ifndef NDEBUG
354 	if (SEE_debug_intern)
355 	    dprintf("INTERN %s -> %p [%s h=%d ascii]\n",
356 		s, (*x)->string, where, h);
357 #endif
358 	return (*x)->string;
359 }
360 
361 /*
362  * Interns a string, and frees the original string.
363  */
364 void
SEE_intern_and_free(interp,sp)365 SEE_intern_and_free(interp, sp)
366 	struct SEE_interpreter *interp;
367 	struct SEE_string **sp;
368 {
369 	struct SEE_string *is;
370 
371 	is = SEE_intern(interp, *sp);
372 	SEE_ASSERT(interp, is != *sp);
373 #ifndef NDEBUG
374 	if (SEE_debug_intern) {
375 	    dprintf("INTERN ");
376 	    dprints(*sp);
377 	    dprintf(" -> %p [hit & free]\n", is);
378 	}
379 #endif
380 	SEE_string_free(interp, sp);
381 	*sp = is;
382 }
383 
384 static void
global_init()385 global_init()
386 {
387 	unsigned int i, h;
388 	struct intern **x;
389 
390 	if (global_intern_tab_initialized)
391 		return;
392 
393 	/* Add all the predefined strings to the global intern table */
394 	for (i = 0; i < SEE_nstringtab; i++) {
395 		h = hash(STRn(i));
396 		x = find(&global_intern_tab, STRn(i), h);
397 		if (*x == NULL)
398 			*x = make(NULL, STRn(i));
399 	}
400 	global_intern_tab_initialized = 1;
401 }
402 
403 /**
404  * Adds an ASCII string into the system-wide intern table if
405  * not already there.
406  * Should not be called after any interpeters are created.
407  */
408 struct SEE_string *
SEE_intern_global(s)409 SEE_intern_global(s)
410 	const char *s;
411 {
412 	struct SEE_string *str;
413 	const char *t;
414 	SEE_char_t *c;
415 	unsigned int h, len;
416 	struct intern **x;
417 
418 #ifndef NDEBUG
419 	if (global_intern_tab_locked)
420 		SEE_ABORT(NULL, "SEE_intern_global: table is now read-only");
421 #endif
422 	global_init();
423 
424 	h = hash_ascii(s, &len);
425 	x = find_ascii(&global_intern_tab, s, h);
426 	if (*x) return (*x)->string;
427 
428 	str = SEE_NEW(NULL, struct SEE_string);
429 	str->length = len;
430 	str->data = SEE_NEW_STRING_ARRAY(NULL, SEE_char_t, len);
431 	for (c = str->data, t = s; *t;)
432 		*c++ = *t++;
433 	str->interpreter = NULL;
434 	str->stringclass = NULL;
435 	str->flags = 0;
436 	*x = make(NULL, str);
437 	return (*x)->string;
438 }
439 
440 /**
441  * Raises an assertion failure if the passed string is not internalised.
442  * This function used by the SEE_OBJECT_*() macros.
443  */
444 struct SEE_string *
_SEE_intern_assert(interp,s)445 _SEE_intern_assert(interp, s)
446 	struct SEE_interpreter *interp;
447         struct SEE_string *s;
448 {
449 #ifndef NDEBUG
450 	if (s)
451 	    SEE_ASSERT(interp, internalized(interp, s));
452 #endif
453 	return s;
454 }
455