1 /*
2 * Copyright (c) 2003
3 * David Leonard. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. Neither the name of David Leonard nor the names of its contributors
14 * may be used to endorse or promote products derived from this software
15 * without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
20 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
21 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
22 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
23 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
27 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 * POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 #if HAVE_CONFIG_H
32 # include <config.h>
33 #endif
34
35 #if STDC_HEADERS
36 # include <stdio.h>
37 # include <stdarg.h>
38 #endif
39
40 #if HAVE_STRING_H
41 # include <string.h>
42 #endif
43
44 #include <see/mem.h>
45 #include <see/type.h>
46 #include <see/string.h>
47 #include <see/object.h>
48 #include <see/system.h>
49 #include <see/error.h>
50 #include <see/try.h>
51 #include <see/interpreter.h>
52 #include <see/value.h>
53
54 #include "stringdefs.h"
55 #include "printf.h"
56
/* Forward declarations of the file-local helpers defined further down. */
static void growby(struct SEE_string *str, unsigned int extra);
static void simple_growby(struct SEE_string *str, unsigned int extra);
static void string_append_int(struct SEE_string *str, unsigned int value);
60
61 static struct SEE_stringclass fixed_stringclass = {
62 0 /* growby */
63 };
64
/*
 * Growability helpers.  Every use of the macro argument is parenthesised
 * so that any pointer expression (for example &ss->string) may be passed.
 *
 * IS_GROWABLE(s)     - non-zero when s has a string class that provides
 *                      a growby method
 * MAKE_UNGROWABLE(s) - clears s's string class, which makes
 *                      IS_GROWABLE(s) false from then on
 * ASSERT_GROWABLE(s) - asserts through SEE_ASSERT() that s is growable
 */
#define IS_GROWABLE(s)      ((s)->stringclass && (s)->stringclass->growby)
#define MAKE_UNGROWABLE(s)  ((s)->stringclass = 0)
#define ASSERT_GROWABLE(s)  SEE_ASSERT((s)->interpreter, IS_GROWABLE(s))
68
/*
 * Strings.
 *
 * Strings are arrays of 16-bit characters with UTF-16 encoding.
 * Because the ECMAScript standard never needs strings to be
 * interpreted in their full Unicode form (such as UCS-4), the
 * implementation here maintains them as arrays of 16-bit
 * unsigned integers.
 */
78
79 /*
80 * Grows a string's data[] array by the given length increment
81 */
82 static void
growby(s,extra)83 growby(s, extra)
84 struct SEE_string *s;
85 unsigned int extra;
86 {
87 if (!IS_GROWABLE(s))
88 SEE_error_throw_string(s->interpreter, s->interpreter->Error,
89 STR(no_string_space));
90 (*s->stringclass->growby)(s, extra);
91 }
92
93 /*
94 * Creates a new, growable string containing a copy of an existing one.
95 */
96 struct SEE_string *
SEE_string_dup(interp,s)97 SEE_string_dup(interp, s)
98 struct SEE_interpreter *interp;
99 const struct SEE_string *s;
100 {
101 struct SEE_string *cp;
102
103 cp = SEE_string_new(interp, s->length);
104 SEE_string_append(cp, s);
105 return cp;
106 }
107
108 /*
109 * Returns an ungrowable string.
110 * Returns the given string if it is of the same interpreter and ungrowable,
111 * oterwise creates a new, ungrowable copy.
112 */
113 struct SEE_string *
_SEE_string_dup_fix(interp,s)114 _SEE_string_dup_fix(interp, s)
115 struct SEE_interpreter *interp;
116 struct SEE_string *s;
117 {
118 struct SEE_string *cp;
119
120 if (s->interpreter == interp && !IS_GROWABLE(s))
121 return s;
122 if (!s->length)
123 return STR(empty_string);
124 cp = SEE_NEW(interp, struct SEE_string);
125 cp->length = s->length;
126 cp->data = SEE_NEW_STRING_ARRAY(interp, SEE_char_t, cp->length);
127 memcpy(cp->data, s->data, sizeof *cp->data * cp->length);
128 cp->interpreter = interp;
129 cp->flags = 0;
130 MAKE_UNGROWABLE(cp);
131 return cp;
132 }
133
/*
 * Returns a string suitable for simultaneous use by several
 * interpreters: the ungrowable form of s, obtained from
 * _SEE_string_dup_fix() with the NULL interpreter as owner.
 */
struct SEE_string *
SEE_string_fix(struct SEE_string *s)
{
    struct SEE_string *shared;

    shared = _SEE_string_dup_fix(NULL, s);
    return shared;
}
145
146 /*
147 * Creates a new (ungrowable) string that is a substring of another.
148 * Raises an error if the substring indixies are out of bounds.
149 * The source string (s) may continue to be grown, but should not
150 * be changed.
151 */
152 struct SEE_string *
SEE_string_substr(interp,s,start,len)153 SEE_string_substr(interp, s, start, len)
154 struct SEE_interpreter *interp;
155 struct SEE_string *s;
156 int start, len;
157 {
158 struct SEE_string *subs;
159
160 if (start < 0
161 || len < 0
162 || (unsigned int)(start + len) > s->length)
163 SEE_error_throw_string(interp, interp->Error, STR(bad_arg));
164
165 subs = SEE_NEW(interp, struct SEE_string);
166 subs->length = len;
167 subs->data = s->data + start;
168 subs->interpreter = interp;
169 subs->flags = 0;
170 subs->stringclass = &fixed_stringclass;
171 return subs;
172 }
173
174 /*
175 * Compares two strings, a and b, lexicographically by UTF-16. Returns
176 * -1 if a < b
177 * 0 if a = b
178 * +1 if a > b
179 */
180 int
SEE_string_cmp(a,b)181 SEE_string_cmp(a, b)
182 const struct SEE_string *a, *b;
183 {
184 const SEE_char_t *ap, *bp;
185 unsigned int alen, blen;
186
187 if (a == b)
188 return 0;
189
190 ap = a->data; alen = a->length;
191 bp = b->data; blen = b->length;
192
193 while (alen && blen && *ap == *bp) {
194 alen--;
195 blen--;
196 ap++;
197 bp++;
198 }
199 if (!alen) {
200 if (!blen)
201 return 0;
202 return -1;
203 }
204 if (!blen)
205 return 1;
206 return (*ap < *bp) ? -1 : 1;
207 }
208
209 /*
210 * Compares a SEE string with an ASCII string.
211 * Returns -1,0,+1 just like SEE_string_cmp().
212 * Non-ASCII parts of b compare higher than any Unicode codepoint in a.
213 */
214 int
SEE_string_cmp_ascii(a,b)215 SEE_string_cmp_ascii(a, b)
216 const struct SEE_string *a;
217 const char *b;
218 {
219 unsigned int i;
220
221 for (i = 0; i < a->length && b[i]; i++) {
222 if (b[i] & 0x80)
223 return -1;
224 if (a->data[i] != b[i])
225 return a->data[i] < b[i] ? -1 : 1;
226 }
227 if (i == a->length)
228 return b[i] == 0 ? 0 : -1;
229 return 1;
230 }
231
232 /*
233 * Appends character c to the end of string s.
234 */
235 void
SEE_string_addch(s,c)236 SEE_string_addch(s, c)
237 struct SEE_string *s;
238 int c; /* promoted SEE_char_t */
239 {
240 ASSERT_GROWABLE(s);
241 growby(s, 1);
242 s->data[s->length++] = c;
243 }
244
245 /*
246 * Appends a unicode codepoint to the end of strings s.
247 */
248 void
SEE_string_append_unicode(s,c)249 SEE_string_append_unicode(s, c)
250 struct SEE_string *s;
251 SEE_unicode_t c;
252 {
253 if (c < 0x10000)
254 SEE_string_addch(s, (SEE_char_t)(c & 0xffff));
255 else {
256 /* RFC2781: UTF-16 encoding */
257 c -= 0x10000;
258 SEE_string_addch(s, (SEE_char_t)(0xd800 | (c >> 10 & 0x3ff)));
259 SEE_string_addch(s, (SEE_char_t)(0xdc00 | (c & 0x3ff)));
260 }
261 }
262
263 /*
264 * Appends string t to the end of string s.
265 */
266 void
SEE_string_append(s,t)267 SEE_string_append(s, t)
268 struct SEE_string *s;
269 const struct SEE_string *t;
270 {
271 ASSERT_GROWABLE(s);
272 if (t->length) {
273 growby(s, t->length);
274 memcpy(s->data + s->length, t->data,
275 t->length * sizeof (SEE_char_t));
276 s->length += t->length;
277 }
278 }
279
280 /*
281 * Appends 7-bit ascii string to the end of string s.
282 */
283 void
SEE_string_append_ascii(s,ascii)284 SEE_string_append_ascii(s, ascii)
285 struct SEE_string *s;
286 const char *ascii;
287 {
288 const char *p;
289
290 ASSERT_GROWABLE(s);
291 for (p = ascii; *p; p++)
292 SEE_ASSERT(s->interpreter, !(*p & 0x80));
293 if (p - ascii) {
294 growby(s, p - ascii);
295 for (p = ascii; *p; p++)
296 s->data[s->length++] = *p;
297 }
298 }
299
/*
 * Appends the decimal representation of the signed integer i to the
 * end of string s.  Negative values are prefixed with '-'.
 * s must be growable.
 */
void
SEE_string_append_int(struct SEE_string *s, int i)
{
    unsigned int magnitude;

    ASSERT_GROWABLE(s);
    if (i < 0) {
        /*
         * Negate in unsigned arithmetic.  Evaluating -i overflows
         * (undefined behaviour) when i is INT_MIN, whereas
         * 0u - (unsigned int)i is well defined and gives the
         * magnitude for every negative i.
         */
        magnitude = 0u - (unsigned int)i;
        SEE_string_addch(s, '-');
    } else {
        magnitude = (unsigned int)i;
    }
    string_append_int(s, magnitude);
}
315
316 static void
string_append_int(s,i)317 string_append_int(s, i)
318 struct SEE_string *s;
319 unsigned int i;
320 {
321 if (i >= 10)
322 string_append_int(s, i / 10);
323 growby(s, 1);
324 s->data[s->length++] = (i % 10) + '0';
325 }
326
327 /*
328 * Converts a UTF-16 string to UTF-8 and write to a stdio file.
329 * Returns 0 on success, like fputs().
330 * Returns EOF on write error, like fputs().
331 * Throws exception on conversion error, unlike fputs().
332 * Ref: RFC2279, RFC2781
333 */
334 int
SEE_string_fputs(s,f)335 SEE_string_fputs(s, f)
336 const struct SEE_string *s;
337 FILE *f;
338 {
339 unsigned int i;
340 SEE_char_t ch, ch2;
341 struct SEE_interpreter *interp = s->interpreter;
342
343 #define OUTPUT(c) do { if (fputc(c, f) == EOF) goto error; } while (0)
344
345 for (i = 0; i < s->length; i++) {
346 ch = s->data[i];
347 if ((ch & 0xff80) == 0)
348 OUTPUT(ch & 0x7f);
349 else if ((ch & 0xf800) == 0) {
350 OUTPUT(0xc0 | ((ch >> 6) & 0x1f));
351 OUTPUT(0x80 | (ch & 0x3f));
352 } else if ((ch & 0xfc00) != 0xd800) {
353 OUTPUT(0xe0 | ((ch >> 12) & 0x0f));
354 OUTPUT(0x80 | ((ch >> 6) & 0x3f));
355 OUTPUT(0x80 | (ch & 0x3f));
356 } else {
357 if (i == s->length - 1)
358 SEE_error_throw_string(interp, interp->Error,
359 STR(bad_utf16_string));
360 ch2 = s->data[++i];
361 if ((ch2 & 0xfc00) != 0xdc00)
362 SEE_error_throw_string(interp, interp->Error,
363 STR(bad_utf16_string));
364 ch = (ch & 0x03ff) + 0x0040;
365 OUTPUT(0xf0 | ((ch >> 8) & 0x07));
366 OUTPUT(0x80 | ((ch >> 2) & 0x3f));
367 OUTPUT(0x80 | ((ch & 0x3) << 4) |
368 ((ch2 & 0x03c0) >> 6));
369 OUTPUT(0x80 | (ch2 & 0x3f));
370 }
371 }
372 return 0;
373 error:
374 return EOF;
375 #undef OUTPUT
376 }
377
378 /*------------------------------------------------------------
379 * The simple string class
380 */
381 struct simple_string {
382 struct SEE_string string;
383 struct SEE_growable grow;
384 };
385
386 /*
387 * Grows the string storage to have at least current+extra elements of storage.
388 * Simple strings never shrink.
389 */
390 static void
simple_growby(s,extra)391 simple_growby(s, extra)
392 struct SEE_string *s;
393 unsigned int extra;
394 {
395 struct simple_string *ss = (struct simple_string *)s;
396 unsigned int len_save;
397
398 /*
399 * The grow_to API increments the length, but the growby contract
400 * is simply to ensure that the length can be incremented up
401 * to that point. So we save the length before calling grow_to().
402 */
403 len_save = ss->string.length;
404 SEE_grow_to(s->interpreter, &ss->grow, ss->string.length + extra);
405 ss->string.length = len_save;
406 }
407
408 static struct SEE_stringclass simple_stringclass = {
409 simple_growby /* growby */
410 };
411
412 /*
413 * Constrycts a new, empty string.
414 * Storage is pre-allocated for the number of UTF-16 characters indicated
415 * by the 'space' argument.
416 */
417 struct SEE_string *
SEE_string_new(interp,space)418 SEE_string_new(interp, space)
419 struct SEE_interpreter *interp;
420 unsigned int space;
421 {
422 struct simple_string *ss = SEE_NEW(interp, struct simple_string);
423
424 ss->string.interpreter = interp;
425 ss->string.flags = 0;
426 SEE_GROW_INIT(interp, &ss->grow, ss->string.data, ss->string.length);
427 ss->grow.is_string = 1;
428 ss->string.stringclass = &simple_stringclass;
429 if (space)
430 simple_growby((struct SEE_string *)ss, space);
431 return (struct SEE_string *)ss;
432 }
433
/*
 * Creates a string using vsprintf-like arguments.
 * A fresh string is made with SEE_string_new() and handed to
 * _SEE_vsprintf() together with fmt and ap.
 */
struct SEE_string *
SEE_string_vsprintf(struct SEE_interpreter *interp, const char *fmt,
    va_list ap)
{
    struct SEE_string *result = SEE_string_new(interp, 0);

    _SEE_vsprintf(interp, result, fmt, ap);
    return result;
}
449
/*
 * Creates a string using sprintf-like arguments.
 * Variadic front end to SEE_string_vsprintf().
 */
struct SEE_string *
SEE_string_sprintf(struct SEE_interpreter *interp, const char *fmt, ...)
{
    struct SEE_string *formatted;
    va_list args;

    va_start(args, fmt);
    formatted = SEE_string_vsprintf(interp, fmt, args);
    va_end(args);

    return formatted;
}
464
465 /**
466 * Returns a quoted, escaped string, suitable for lexical analysis.
467 */
468 struct SEE_string *
SEE_string_literal(interp,s)469 SEE_string_literal(interp, s)
470 struct SEE_interpreter *interp;
471 const struct SEE_string *s;
472 {
473 struct SEE_string *lit;
474 unsigned int i;
475 SEE_char_t c;
476
477 if (s == NULL)
478 return NULL;
479
480 lit = SEE_string_new(interp, 0);
481 SEE_string_addch(lit, '\"');
482 for (i = 0; i < s->length; i++) {
483 c = s->data[i];
484 switch (c) {
485 case 0x0008: SEE_string_addch(lit, '\\');
486 SEE_string_addch(lit, 'b');
487 break;
488 case 0x0009: SEE_string_addch(lit, '\\');
489 SEE_string_addch(lit, 't');
490 break;
491 case 0x000a: SEE_string_addch(lit, '\\');
492 SEE_string_addch(lit, 'n');
493 break;
494 case 0x000b: SEE_string_addch(lit, '\\');
495 SEE_string_addch(lit, 'v');
496 break;
497 case 0x000c: SEE_string_addch(lit, '\\');
498 SEE_string_addch(lit, 'f');
499 break;
500 case 0x000d: SEE_string_addch(lit, '\\');
501 SEE_string_addch(lit, 'r');
502 break;
503 case '\\':
504 case '\"': SEE_string_addch(lit, '\\');
505 SEE_string_addch(lit, c);
506 break;
507 default:
508 if (c >= 0x20 && c < 0x7f)
509 SEE_string_addch(lit, c);
510 else if (c < 0x100) {
511 SEE_string_addch(lit, '\\');
512 SEE_string_addch(lit, 'x');
513 SEE_string_addch(lit, SEE_hexstr_lowercase[(c >> 4) & 0xf]);
514 SEE_string_addch(lit, SEE_hexstr_lowercase[ c & 0xf]);
515 } else {
516 SEE_string_addch(lit, '\\');
517 SEE_string_addch(lit, 'u');
518 SEE_string_addch(lit, SEE_hexstr_lowercase[(c >>12) & 0xf]);
519 SEE_string_addch(lit, SEE_hexstr_lowercase[(c >> 8) & 0xf]);
520 SEE_string_addch(lit, SEE_hexstr_lowercase[(c >> 4) & 0xf]);
521 SEE_string_addch(lit, SEE_hexstr_lowercase[ c & 0xf]);
522 }
523 }
524 }
525 SEE_string_addch(lit, '\"');
526 return lit;
527 }
528
529 /*
530 * Frees a string. The caller must know that the string data is not in
531 * use in any other place. That includes by substring references, and
532 * the piggybacking side effects of SEE_string_concat.
533 */
534 void
SEE_string_free(interp,sp)535 SEE_string_free(interp, sp)
536 struct SEE_interpreter *interp;
537 struct SEE_string **sp;
538 {
539 if (*sp && (*sp)->interpreter == interp) {
540 SEE_free(interp, (void **)&(*sp)->data);
541 SEE_free(interp, (void **)sp);
542 }
543 }
544
545 /*
546 * Returns the number of characters a UTF-8 representation would take.
547 * Does not include the trailing nul
548 */
549 SEE_size_t
SEE_string_utf8_size(interp,s)550 SEE_string_utf8_size(interp, s)
551 struct SEE_interpreter *interp;
552 const struct SEE_string *s;
553 {
554 SEE_size_t len;
555 unsigned int i;
556 SEE_char_t ch, ch2;
557
558 len = 0;
559 for (i = 0; i < s->length; i++) {
560 ch = s->data[i];
561 if ((ch & 0xff80) == 0)
562 len += 1;
563 else if ((ch & 0xf800) == 0)
564 len += 2;
565 else if ((ch & 0xfc00) != 0xd800)
566 len += 3;
567 else {
568 if (i == s->length - 1)
569 SEE_error_throw_string(interp, interp->Error,
570 STR(bad_utf16_string));
571 ch2 = s->data[++i];
572 if ((ch2 & 0xfc00) != 0xdc00)
573 SEE_error_throw_string(interp, interp->Error,
574 STR(bad_utf16_string));
575 len += 4;
576 }
577 }
578 return len;
579 }
580
581 /*
582 * Converts a SEE string into a UTF8 buffer.
583 * Throws a RangeError if the decoded string, including terminating nul,
584 * would exceed the size of the given buffer.
585 * If the string itself is illegally formed, a generic Error is thrown.
586 */
587 void
SEE_string_toutf8(interp,buf,buflen,s)588 SEE_string_toutf8(interp, buf, buflen, s)
589 struct SEE_interpreter *interp;
590 char *buf;
591 SEE_size_t buflen;
592 const struct SEE_string *s;
593 {
594 unsigned int i;
595 SEE_char_t ch, ch2;
596
597 #define OUTPUT(c) do { \
598 if (buflen <= 1) goto toolong; \
599 *buf++ = (c); \
600 buflen--; \
601 } while (0)
602
603 for (i = 0; i < s->length; i++) {
604 ch = s->data[i];
605 if ((ch & 0xff80) == 0)
606 OUTPUT(ch & 0x7f);
607 else if ((ch & 0xf800) == 0) {
608 OUTPUT(0xc0 | ((ch >> 6) & 0x1f));
609 OUTPUT(0x80 | (ch & 0x3f));
610 } else if ((ch & 0xfc00) != 0xd800) {
611 OUTPUT(0xe0 | ((ch >> 12) & 0x0f));
612 OUTPUT(0x80 | ((ch >> 6) & 0x3f));
613 OUTPUT(0x80 | (ch & 0x3f));
614 } else {
615 if (i == s->length - 1)
616 SEE_error_throw_string(interp, interp->Error,
617 STR(bad_utf16_string));
618 ch2 = s->data[++i];
619 if ((ch2 & 0xfc00) != 0xdc00)
620 SEE_error_throw_string(interp, interp->Error,
621 STR(bad_utf16_string));
622 ch = (ch & 0x03ff) + 0x0040;
623 OUTPUT(0xf0 | ((ch >> 8) & 0x07));
624 OUTPUT(0x80 | ((ch >> 2) & 0x3f));
625 OUTPUT(0x80 | ((ch & 0x3) << 4) |
626 ((ch2 & 0x03c0) >> 6));
627 OUTPUT(0x80 | (ch2 & 0x3f));
628 }
629 }
630
631 if (buflen < 1) goto toolong;
632 *buf = '\0';
633 return;
634
635 toolong:
636 SEE_error_throw_string(interp, interp->RangeError,
637 STR(string_limit_reached));
638 }
639 #undef OUTPUT
640
641 /*
642 * Extends a string, marking the original string as ungrowable.
643 */
644 static struct SEE_string *
simple_concat(interp,a,b)645 simple_concat(interp, a, b)
646 struct SEE_interpreter *interp;
647 struct simple_string *a;
648 const struct SEE_string *b;
649 {
650 struct simple_string *cp;
651
652 /* Copy a to cp, carefully moving the SEE_growable structure */
653 cp = SEE_NEW(interp, struct simple_string);
654 memcpy(cp, a, sizeof (struct simple_string));
655 cp->grow.data_ptr = /* (void**) */&cp->string.data;
656 cp->grow.length_ptr = &cp->string.length;
657
658 /* Invalidate a so that it can no longer grow */
659 a->grow.data_ptr = NULL;
660 a->grow.length_ptr = NULL;
661 MAKE_UNGROWABLE(&a->string);
662
663 SEE_string_append(&cp->string, b);
664 return (struct SEE_string *)cp;
665 }
666
667 /*
668 * Concatenates two strings together and return the resulting string.
669 * May return one of the original strings, or a new string altogether.
670 * May modify a. String b will not be modified, but it may be returned.
671 */
672 struct SEE_string *
SEE_string_concat(interp,a,b)673 SEE_string_concat(interp, a, b)
674 struct SEE_interpreter *interp;
675 struct SEE_string *a, *b;
676 {
677 struct SEE_string *s;
678
679 if (a->length == 0)
680 return b;
681 if (b->length == 0)
682 return a;
683
684 if (a->stringclass == &simple_stringclass)
685 return simple_concat(interp, (struct simple_string *)a, b);
686
687 s = SEE_string_new(interp, a->length + b->length);
688 if (a->length)
689 memcpy(s->data, a->data, a->length * sizeof (SEE_char_t));
690 if (b->length)
691 memcpy(s->data + a->length, b->data,
692 b->length * sizeof (SEE_char_t));
693 s->length = a->length + b->length;
694 return s;
695 }
696