1 /*
2 * string.c - string implementation
3 *
4 * Copyright (c) 2000-2020 Shiro Kawai <shiro@acm.org>
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the authors nor the names of its contributors
18 * may be used to endorse or promote products derived from this
19 * software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
27 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 #define LIBGAUCHE_BODY
35 #include "gauche.h"
36 #include "gauche/priv/stringP.h"
37
38 #include <string.h>
39 #include <ctype.h>
40
41 void Scm_DStringDump(FILE *out, ScmDString *dstr);
42 static ScmObj make_string_cursor(ScmString *src, const char *cursor);
43
44 static void string_print(ScmObj obj, ScmPort *port, ScmWriteContext *ctx);
45 SCM_DEFINE_BUILTIN_CLASS(Scm_StringClass, string_print, NULL, NULL, NULL,
46 SCM_CLASS_SEQUENCE_CPL);
47
48 #define CHECK_SIZE(siz) \
49 do { \
50 if ((siz) > SCM_STRING_MAX_SIZE) { \
51 Scm_Error("string size too big: %ld", (siz)); \
52 } \
53 } while (0)
54
55 /* Internal primitive constructor. LEN can be negative if the string
56 is incomplete. */
make_str(ScmSmallInt len,ScmSmallInt siz,const char * p,u_long flags,const void * index)57 static ScmString *make_str(ScmSmallInt len, ScmSmallInt siz,
58 const char *p, u_long flags,
59 const void *index)
60 {
61 if (len < 0) flags |= SCM_STRING_INCOMPLETE;
62 if (flags & SCM_STRING_INCOMPLETE) len = siz;
63
64 if (siz > SCM_STRING_MAX_SIZE) {
65 Scm_Error("string size too big: %ld", siz);
66 }
67 if (len > siz) {
68 Scm_Error("string length (%ld) exceeds size (%ld)", len, siz);
69 }
70
71 ScmString *s = SCM_NEW(ScmString);
72 SCM_SET_CLASS(s, SCM_CLASS_STRING);
73 s->body = NULL;
74 s->initialBody.flags = flags & SCM_STRING_FLAG_MASK;
75 s->initialBody.length = len;
76 s->initialBody.size = siz;
77 s->initialBody.start = p;
78 s->initialBody.index = index;
79 return s;
80 }
81
82 #define DUMP_LENGTH 50
83
84 /* for debug */
Scm_StringDump(FILE * out,ScmObj str)85 void Scm_StringDump(FILE *out, ScmObj str)
86 {
87 const ScmStringBody *b = SCM_STRING_BODY(str);
88 ScmSmallInt s = SCM_STRING_BODY_SIZE(b);
89 const char *p = SCM_STRING_BODY_START(b);
90
91 fprintf(out, "STR(len=%ld,siz=%ld) \"", SCM_STRING_BODY_LENGTH(b), s);
92 for (int i=0; i < DUMP_LENGTH && s > 0;) {
93 int n = SCM_CHAR_NFOLLOWS(*p) + 1;
94 for (; n > 0 && s > 0; p++, n--, s--, i++) {
95 putc(*p, out);
96 }
97 }
98 if (s > 0) {
99 fputs("...\"\n", out);
100 } else {
101 fputs("\"\n", out);
102 }
103 }
104
105 /* Like GC_strndup, but we don't require the source string to be
106 NUL-terminated (instead, we trust the caller that the size
107 argument is in valid range.) */
Scm_StrdupPartial(const char * src,size_t size)108 char *Scm_StrdupPartial(const char *src, size_t size)
109 {
110 char *dst = SCM_NEW_ATOMIC_ARRAY(char, size+1);
111 memcpy(dst, src, size);
112 dst[size] = '\0';
113 return dst;
114 }
115
116 /*
117 * Multibyte length calculation
118 */
119
120 /* We have multiple similar functions, due to performance reasons. */
121
122 /* Calculate both length and size of C-string str.
123 If str is incomplete, *plen gets -1. */
count_size_and_length(const char * str,ScmSmallInt * psize,ScmSmallInt * plen)124 static inline ScmSmallInt count_size_and_length(const char *str,
125 ScmSmallInt *psize, /* out */
126 ScmSmallInt *plen) /* out */
127 {
128 char c;
129 int incomplete = FALSE;
130 const char *p = str;
131 ScmSmallInt size = 0, len = 0;
132 while ((c = *p++) != 0) {
133 int i = SCM_CHAR_NFOLLOWS(c);
134 len++;
135 size += i+1;
136
137 ScmChar ch;
138 SCM_CHAR_GET(p-1, ch);
139 if (ch == SCM_CHAR_INVALID) incomplete = TRUE;
140 /* Check every octet to avoid skipping over terminating NUL. */
141 while (i-- > 0) {
142 if (!*p++) { incomplete = TRUE; goto eos; }
143 }
144 }
145 eos:
146 if (incomplete) len = -1;
147 *psize = size;
148 *plen = len;
149 return len;
150 }
151
152 /* Calculate length of known size string. str can contain NUL character. */
count_length(const char * str,ScmSmallInt size)153 static inline ScmSmallInt count_length(const char *str, ScmSmallInt size)
154 {
155 ScmSmallInt count = 0;
156 while (size-- > 0) {
157 unsigned char c = (unsigned char)*str;
158 int i = SCM_CHAR_NFOLLOWS(c);
159 if (i < 0 || i > size) return -1;
160 ScmChar ch;
161 SCM_CHAR_GET(str, ch);
162 if (ch == SCM_CHAR_INVALID) return -1;
163 count++;
164 str += i+1;
165 size -= i;
166 }
167 return count;
168 }
169
170 /* Returns length of string, starts from str and end at stop.
171 If stop is NULL, str is regarded as C-string (NUL terminated).
172 If the string is incomplete, returns -1. */
Scm_MBLen(const char * str,const char * stop)173 ScmSmallInt Scm_MBLen(const char *str, const char *stop)
174 {
175 ScmSmallInt size = (stop == NULL)? (ScmSmallInt)strlen(str) : (stop - str);
176 ScmSmallInt len = count_length(str, size);
177 if (len > SCM_STRING_MAX_LENGTH) {
178 Scm_Error("Scm_MBLen: length too big: %ld", len);
179 }
180 return len;
181 }
182
183 /*----------------------------------------------------------------
184 * Cursors
185 */
186
cursor_print(ScmObj obj,ScmPort * port,ScmWriteContext * mode SCM_UNUSED)187 static void cursor_print(ScmObj obj, ScmPort *port,
188 ScmWriteContext *mode SCM_UNUSED)
189 {
190 Scm_Printf(port, "#<string-cursor-large %ld>",
191 SCM_STRING_CURSOR_LARGE_OFFSET(obj));
192 }
193
194 static ScmClass *cursor_cpl[] = {
195 SCM_CLASS_STATIC_PTR(Scm_StringCursorClass),
196 SCM_CLASS_STATIC_PTR(Scm_TopClass),
197 NULL
198 };
199
200 SCM_DEFINE_BUILTIN_CLASS_SIMPLE(Scm_StringCursorClass, NULL);
201 SCM_DEFINE_BUILTIN_CLASS(Scm_StringCursorLargeClass, cursor_print, NULL, NULL,
202 NULL, cursor_cpl);
203
204 /* Common routine to get hold of the pointer from string cursor.
205 Returns NULL if SC isn't a string cursor.
206 Raise an error if sc is not in the range. */
string_cursor_ptr(const ScmStringBody * sb,ScmObj sc)207 static inline const char *string_cursor_ptr(const ScmStringBody *sb, ScmObj sc)
208 {
209 const char *ptr = NULL;
210 if (SCM_STRING_CURSOR_LARGE_P(sc)) {
211 if (SCM_STRING_BODY_START(sb) != SCM_STRING_CURSOR_LARGE_START(sc)) {
212 Scm_Error("invalid cursor (made for string '%s'): %S",
213 SCM_STRING_CURSOR_LARGE_START(sc), sc);
214 }
215 ptr = SCM_STRING_CURSOR_LARGE_POINTER(sb, sc);
216 } else if (SCM_STRING_CURSOR_SMALL_P(sc)) {
217 ptr = SCM_STRING_CURSOR_SMALL_POINTER(sb, sc);
218 } else {
219 return NULL;
220 }
221 if (ptr < SCM_STRING_BODY_START(sb) ||
222 ptr > SCM_STRING_BODY_END(sb)) {
223 Scm_Error("cursor out of range: %S", sc);
224 }
225 return ptr;
226 }
227
228 /* Returns -1 if sc isn't a cursor. No range check performed. */
string_cursor_offset(ScmObj sc)229 static inline ScmSmallInt string_cursor_offset(ScmObj sc) {
230 if (SCM_STRING_CURSOR_LARGE_P(sc)) {
231 return SCM_STRING_CURSOR_LARGE_OFFSET(sc);
232 } else if (SCM_STRING_CURSOR_SMALL_P(sc)) {
233 return SCM_STRING_CURSOR_SMALL_OFFSET(sc);
234 } else {
235 return -1;
236 }
237 }
238
239 /*----------------------------------------------------------------
240 * Constructors
241 */
242
243 /* General constructor. */
Scm_MakeString(const char * str,ScmSmallInt size,ScmSmallInt len,u_long flags)244 ScmObj Scm_MakeString(const char *str, ScmSmallInt size, ScmSmallInt len,
245 u_long flags)
246 {
247 flags &= ~SCM_STRING_TERMINATED;
248
249 if (size < 0) {
250 count_size_and_length(str, &size, &len);
251 flags |= SCM_STRING_TERMINATED;
252 } else {
253 if (len < 0) len = count_length(str, size);
254 }
255 /* Range of size and len will be checked in make_str */
256
257 ScmString *s;
258 if (flags & SCM_STRING_COPYING) {
259 flags |= SCM_STRING_TERMINATED; /* SCM_STRDUP_PARTIAL terminates the result str */
260 s = make_str(len, size, SCM_STRDUP_PARTIAL(str, size), flags, NULL);
261 } else {
262 s = make_str(len, size, str, flags, NULL);
263 }
264 return SCM_OBJ(s);
265 }
266
Scm_MakeFillString(ScmSmallInt len,ScmChar fill)267 ScmObj Scm_MakeFillString(ScmSmallInt len, ScmChar fill)
268 {
269 if (len < 0) Scm_Error("length out of range: %ld", len);
270 ScmSmallInt csize = SCM_CHAR_NBYTES(fill);
271 CHECK_SIZE(csize*len);
272 char *ptr = SCM_NEW_ATOMIC2(char *, csize*len+1);
273 char *p = ptr;
274 for (ScmSmallInt i=0; i<len; i++, p+=csize) {
275 SCM_CHAR_PUT(p, fill);
276 }
277 ptr[csize*len] = '\0';
278 return SCM_OBJ(make_str(len, csize*len, ptr, SCM_STRING_TERMINATED, NULL));
279 }
280
Scm_ListToString(ScmObj chars)281 ScmObj Scm_ListToString(ScmObj chars)
282 {
283 ScmSmallInt size = 0, len = 0;
284
285 ScmObj cp;
286 SCM_FOR_EACH(cp, chars) {
287 if (!SCM_CHARP(SCM_CAR(cp)))
288 Scm_Error("character required, but got %S", SCM_CAR(cp));
289 ScmChar ch = SCM_CHAR_VALUE(SCM_CAR(cp));
290 size += SCM_CHAR_NBYTES(ch);
291 len++;
292 CHECK_SIZE(size);
293 }
294 char *buf = SCM_NEW_ATOMIC2(char *, size+1);
295 char *bufp = buf;
296 SCM_FOR_EACH(cp, chars) {
297 ScmChar ch = SCM_CHAR_VALUE(SCM_CAR(cp));
298 SCM_CHAR_PUT(bufp, ch);
299 bufp += SCM_CHAR_NBYTES(ch);
300 }
301 *bufp = '\0';
302 return Scm_MakeString(buf, size, len, 0);
303 }
304
305 /* Extract string as C-string. This one guarantees to return
306 mutable string (we always copy) */
Scm_GetString(ScmString * str)307 char *Scm_GetString(ScmString *str)
308 {
309 const ScmStringBody *b = SCM_STRING_BODY(str);
310 return SCM_STRDUP_PARTIAL(SCM_STRING_BODY_START(b), SCM_STRING_BODY_SIZE(b));
311 }
312
313 /* Common routine for Scm_GetStringConst and Scm_GetStringContent */
get_string_from_body(const ScmStringBody * b)314 static const char *get_string_from_body(const ScmStringBody *b)
315 {
316 ScmSmallInt size = SCM_STRING_BODY_SIZE(b);
317 if (SCM_STRING_BODY_HAS_FLAG(b, SCM_STRING_TERMINATED)) {
318 /* we can use string data as C-string */
319 return SCM_STRING_BODY_START(b);
320 } else {
321 char *p = SCM_STRDUP_PARTIAL(SCM_STRING_BODY_START(b), size);
322 /* kludge! This breaks 'const' qualification, but we know
323 this is an idempotent operation from the outside. Note that
324 this is safe even multiple threads execute this part
325 simultaneously. */
326 ((ScmStringBody*)b)->start = p; /* discard const qualifier */
327 ((ScmStringBody*)b)->flags |= SCM_STRING_TERMINATED;
328 return p;
329 }
330 }
331
332 /* Extract string as C-string. Returned string is immutable,
333 so we can directly return the body of the string. We do not
334 allow string containing NUL to be passed to C world, for it
335 would be a security risk.
336 TODO: Let the string body have a flag so that we don't need
337 to scan the string every time.
338 */
Scm_GetStringConst(ScmString * str)339 const char *Scm_GetStringConst(ScmString *str)
340 {
341 const ScmStringBody *b = SCM_STRING_BODY(str);
342 if (memchr(SCM_STRING_BODY_START(b), 0, SCM_STRING_BODY_SIZE(b))) {
343 Scm_Error("A string containing NUL character is not allowed: %S",
344 SCM_OBJ(str));
345 }
346 return get_string_from_body(b);
347 }
348
349 /* Atomically extracts C-string, length, size, and incomplete flag.
350 MT-safe. */
351 /* NB: Output parameters are int's for the ABI compatibility. */
Scm_GetStringContent(ScmString * str,ScmSmallInt * psize,ScmSmallInt * plength,u_long * pflags)352 const char *Scm_GetStringContent(ScmString *str,
353 ScmSmallInt *psize, /* out */
354 ScmSmallInt *plength, /* out */
355 u_long *pflags) /* out */
356 {
357 const ScmStringBody *b = SCM_STRING_BODY(str);
358 if (psize) *psize = SCM_STRING_BODY_SIZE(b);
359 if (plength) *plength = SCM_STRING_BODY_LENGTH(b);
360 if (pflags) *pflags = SCM_STRING_BODY_FLAGS(b);
361 return get_string_from_body(b);
362 }
363
364
365 /* Copy string. You can modify the flags of the newly created string
366 by FLAGS and MASK arguments; for the bits set in MASK, corresponding
367 bits in FLAGS are copied to the new string, and for other bits, the
368 original flags are copied.
369
370 The typical semantics of copy-string is achieved by passing 0 to
371 FLAGS and SCM_STRING_IMMUTABLE to MASK (i.e. reset IMMUTABLE flag,
372 and keep other flags intact.
373
374 NB: This routine doesn't check whether specified flag is valid
375 with the string content, i.e. you can drop INCOMPLETE flag with
376 copying, while the string content won't be checked if it consists
377 valid complete string. */
Scm_CopyStringWithFlags(ScmString * x,u_long flags,u_long mask)378 ScmObj Scm_CopyStringWithFlags(ScmString *x, u_long flags, u_long mask)
379 {
380 const ScmStringBody *b = SCM_STRING_BODY(x);
381 ScmSmallInt size = SCM_STRING_BODY_SIZE(b);
382 ScmSmallInt len = SCM_STRING_BODY_LENGTH(b);
383 const char *start = SCM_STRING_BODY_START(b);
384 const void *index = b->index;
385 u_long newflags = ((SCM_STRING_BODY_FLAGS(b) & ~mask)
386 | (flags & mask));
387
388 return SCM_OBJ(make_str(len, size, start, newflags, index));
389 }
390
391 /* OBSOLETED */
Scm_StringCompleteToIncomplete(ScmString * x)392 ScmObj Scm_StringCompleteToIncomplete(ScmString *x)
393 {
394 Scm_Warn("Obsoleted C API Scm_StringCompleteToIncomplete called");
395 ScmObj proc = SCM_UNDEFINED;
396 SCM_BIND_PROC(proc, "string-complete->incomplete", Scm_GaucheModule());
397 return Scm_ApplyRec1(proc, SCM_OBJ(x));
398 }
399
400 /* OBSOLETED */
Scm_StringIncompleteToComplete(ScmString * x,int handling,ScmChar substitute)401 ScmObj Scm_StringIncompleteToComplete(ScmString *x,
402 int handling,
403 ScmChar substitute)
404 {
405 Scm_Warn("Obsoleted C API Scm_StringIncompleteToComplete called");
406 ScmObj proc = SCM_UNDEFINED;
407 SCM_BIND_PROC(proc, "string-incomplete->complete", Scm_GaucheModule());
408 ScmObj r;
409 if (handling == SCM_ILLEGAL_CHAR_REJECT) {
410 r = Scm_ApplyRec1(proc, SCM_OBJ(x));
411 } else if (handling == SCM_ILLEGAL_CHAR_OMIT) {
412 r = Scm_ApplyRec2(proc, SCM_OBJ(x), SCM_MAKE_KEYWORD("omit"));
413 } else {
414 r = Scm_ApplyRec2(proc, SCM_OBJ(x), SCM_MAKE_CHAR(substitute));
415 }
416 return r;
417 }
418
419 /*----------------------------------------------------------------
420 * Comparison
421 */
422
423 /* TODO: merge Equal and Cmp API; required generic comparison protocol */
Scm_StringEqual(ScmString * x,ScmString * y)424 int Scm_StringEqual(ScmString *x, ScmString *y)
425 {
426 const ScmStringBody *xb = SCM_STRING_BODY(x);
427 const ScmStringBody *yb = SCM_STRING_BODY(y);
428 if ((SCM_STRING_BODY_FLAGS(xb)^SCM_STRING_BODY_FLAGS(yb))&SCM_STRING_INCOMPLETE) {
429 return FALSE;
430 }
431 if (SCM_STRING_BODY_SIZE(xb) != SCM_STRING_BODY_SIZE(yb)) {
432 return FALSE;
433 }
434 return (memcmp(SCM_STRING_BODY_START(xb),
435 SCM_STRING_BODY_START(yb),
436 SCM_STRING_BODY_SIZE(xb)) == 0? TRUE : FALSE);
437 }
438
Scm_StringCmp(ScmString * x,ScmString * y)439 int Scm_StringCmp(ScmString *x, ScmString *y)
440 {
441 const ScmStringBody *xb = SCM_STRING_BODY(x);
442 const ScmStringBody *yb = SCM_STRING_BODY(y);
443 ScmSmallInt sizx = SCM_STRING_BODY_SIZE(xb);
444 ScmSmallInt sizy = SCM_STRING_BODY_SIZE(yb);
445 ScmSmallInt siz = (sizx < sizy)? sizx : sizy;
446 int r = memcmp(SCM_STRING_BODY_START(xb), SCM_STRING_BODY_START(yb), siz);
447 if (r == 0) {
448 if (sizx == sizy) {
449 if (SCM_STRING_BODY_INCOMPLETE_P(xb)) {
450 if (SCM_STRING_BODY_INCOMPLETE_P(yb)) return 0;
451 else return 1;
452 } else {
453 if (SCM_STRING_BODY_INCOMPLETE_P(yb)) return -1;
454 else return 0;
455 }
456 }
457 if (sizx < sizy) return -1;
458 else return 1;
459 } else if (r < 0) {
460 return -1;
461 } else {
462 return 1;
463 }
464 }
465
466 /* single-byte case insensitive comparison */
sb_strcasecmp(const char * px,ScmSmallInt sizx,const char * py,ScmSmallInt sizy)467 static int sb_strcasecmp(const char *px, ScmSmallInt sizx,
468 const char *py, ScmSmallInt sizy)
469 {
470 for (; sizx > 0 && sizy > 0; sizx--, sizy--, px++, py++) {
471 char cx = tolower((u_char)*px);
472 char cy = tolower((u_char)*py);
473 if (cx == cy) continue;
474 return (cx - cy);
475 }
476 if (sizx > 0) return 1;
477 if (sizy > 0) return -1;
478 return 0;
479 }
480
481 /* multi-byte case insensitive comparison */
mb_strcasecmp(const char * px,ScmSmallInt lenx,const char * py,ScmSmallInt leny)482 static int mb_strcasecmp(const char *px, ScmSmallInt lenx,
483 const char *py, ScmSmallInt leny)
484 {
485 int ix, iy;
486 for (; lenx > 0 && leny > 0; lenx--, leny--, px+=ix, py+=iy) {
487 int cx, cy;
488 SCM_CHAR_GET(px, cx);
489 SCM_CHAR_GET(py, cy);
490 int ccx = SCM_CHAR_UPCASE(cx);
491 int ccy = SCM_CHAR_UPCASE(cy);
492 if (ccx != ccy) return (ccx - ccy);
493 ix = SCM_CHAR_NBYTES(cx);
494 iy = SCM_CHAR_NBYTES(cy);
495 }
496 if (lenx > 0) return 1;
497 if (leny > 0) return -1;
498 return 0;
499 }
500
Scm_StringCiCmp(ScmString * x,ScmString * y)501 int Scm_StringCiCmp(ScmString *x, ScmString *y)
502 {
503 const ScmStringBody *xb = SCM_STRING_BODY(x);
504 const ScmStringBody *yb = SCM_STRING_BODY(y);
505
506 if ((SCM_STRING_BODY_FLAGS(xb)^SCM_STRING_BODY_FLAGS(yb))&SCM_STRING_INCOMPLETE) {
507 Scm_Error("cannot compare incomplete strings in case-insensitive way: %S, %S",
508 SCM_OBJ(x), SCM_OBJ(y));
509 }
510 ScmSmallInt sizx = SCM_STRING_BODY_SIZE(xb);
511 ScmSmallInt lenx = SCM_STRING_BODY_LENGTH(xb);
512 ScmSmallInt sizy = SCM_STRING_BODY_SIZE(yb);
513 ScmSmallInt leny = SCM_STRING_BODY_LENGTH(yb);
514 const char *px = SCM_STRING_BODY_START(xb);
515 const char *py = SCM_STRING_BODY_START(yb);
516
517 if (sizx == lenx && sizy == leny) {
518 return sb_strcasecmp(px, sizx, py, sizy);
519 } else {
520 return mb_strcasecmp(px, lenx, py, leny);
521 }
522 }
523
524 /*----------------------------------------------------------------
525 * Reference
526 */
527
528 /* Advance ptr for NCHARS characters. Args assumed in boundary. */
forward_pos(const ScmStringBody * body,const char * current,ScmSmallInt nchars)529 static inline const char *forward_pos(const ScmStringBody *body,
530 const char *current,
531 ScmSmallInt nchars)
532 {
533 if (body && (SCM_STRING_BODY_SINGLE_BYTE_P(body) ||
534 SCM_STRING_BODY_INCOMPLETE_P(body))) {
535 return current + nchars;
536 }
537
538 while (nchars--) {
539 int n = SCM_CHAR_NFOLLOWS(*current);
540 current += n + 1;
541 }
542 return current;
543 }
544
545 /* Index -> ptr. Args assumed in boundary. */
index2ptr(const ScmStringBody * body,ScmSmallInt nchars)546 static const char *index2ptr(const ScmStringBody *body,
547 ScmSmallInt nchars)
548 {
549 if (body->index == NULL) {
550 return forward_pos(body, SCM_STRING_BODY_START(body), nchars);
551 }
552 ScmStringIndex *index = STRING_INDEX(body->index);
553 ScmSmallInt off = 0;
554 ScmSmallInt array_off = (nchars>>STRING_INDEX_SHIFT(index))+1;
555 /* If array_off is 1, we don't need lookup - the character is in the
556 first segment. */
557 if (array_off > 1) {
558 switch (STRING_INDEX_TYPE(index)) {
559 case STRING_INDEX8:
560 SCM_ASSERT(array_off < (ScmSmallInt)index->index8[1]);
561 off = index->index8[array_off];
562 break;
563 case STRING_INDEX16:
564 SCM_ASSERT(array_off < (ScmSmallInt)index->index16[1]);
565 off = index->index16[array_off];
566 break;
567 case STRING_INDEX32:
568 SCM_ASSERT(array_off < (ScmSmallInt)index->index32[1]);
569 off = index->index32[array_off];
570 break;
571 case STRING_INDEX64:
572 SCM_ASSERT(array_off < (ScmSmallInt)index->index64[1]);
573 off = index->index64[array_off];
574 break;
575 default:
576 Scm_Panic("String index contains unrecognized signature (%02x). "
577 "Possible memory corruption. Aborting...",
578 index->signature);
579 }
580 }
581 return forward_pos(body,
582 SCM_STRING_BODY_START(body) + off,
583 nchars & (STRING_INDEX_INTERVAL(index)-1));
584 }
585
586
587 /* string-ref.
588 * If POS is out of range,
589 * - returns SCM_CHAR_INVALID if range_error is FALSE
590 * - raise error otherwise.
591 * This differs from Scheme version, which takes an optional 'fallback'
592 * argument which will be returned when POS is out-of-range. We can't
593 * have the same semantics since the return type is limited.
594 */
Scm_StringRef(ScmString * str,ScmSmallInt pos,int range_error)595 ScmChar Scm_StringRef(ScmString *str, ScmSmallInt pos, int range_error)
596 {
597 const ScmStringBody *b = SCM_STRING_BODY(str);
598 ScmSmallInt len = SCM_STRING_BODY_LENGTH(b);
599
600 /* we can't allow string-ref on incomplete strings, since it may yield
601 invalid character object. */
602 if (SCM_STRING_BODY_INCOMPLETE_P(b)) {
603 Scm_Error("incomplete string not allowed : %S", str);
604 }
605 if (pos < 0 || pos >= len) {
606 if (range_error) {
607 Scm_Error("argument out of range: %ld", pos);
608 } else {
609 return SCM_CHAR_INVALID;
610 }
611 }
612
613 const char *p = NULL;
614 if (SCM_STRING_BODY_SINGLE_BYTE_P(b)) {
615 p = SCM_STRING_BODY_START(b) + pos;
616 } else {
617 p = index2ptr(b, pos);
618 }
619
620 if (SCM_STRING_BODY_SINGLE_BYTE_P(b)) {
621 return (ScmChar)(*(unsigned char *)p);
622 } else {
623 ScmChar c;
624 SCM_CHAR_GET(p, c);
625 return c;
626 }
627 }
628
629 /* The meaning and rationale of range_error is the same as Scm_StringRef.
630 * Returns -1 if OFFSET is out-of-range and RANGE_ERROR is FALSE.
631 * (Because of this, the return type is not ScmByte but int.
632 */
Scm_StringByteRef(ScmString * str,ScmSmallInt offset,int range_error)633 int Scm_StringByteRef(ScmString *str, ScmSmallInt offset, int range_error)
634 {
635 const ScmStringBody *b = SCM_STRING_BODY(str);
636 if (offset < 0 || offset >= SCM_STRING_BODY_SIZE(b)) {
637 if (range_error) {
638 Scm_Error("argument out of range: %ld", offset);
639 } else {
640 return -1;
641 }
642 }
643 return (ScmByte)SCM_STRING_BODY_START(b)[offset];
644 }
645
646 /* External interface of index2ptr. Returns the pointer to the
647 offset-th character in str. */
648 /* NB: this function allows offset == length of the string; in that
649 case, the return value points the location past the string body,
650 but it is necessary sometimes to do a pointer arithmetic with the
651 returned values. */
Scm_StringBodyPosition(const ScmStringBody * b,ScmSmallInt offset)652 const char *Scm_StringBodyPosition(const ScmStringBody *b, ScmSmallInt offset)
653 {
654 if (offset < 0 || offset > SCM_STRING_BODY_LENGTH(b)) {
655 Scm_Error("argument out of range: %ld", offset);
656 }
657 return index2ptr(b, offset);
658 }
659
660 /* This is old API and now DEPRECATED. It's difficult to use this safely,
661 since you don't have a way to get the string length consistent at the
662 moment you call this function. Use Scm_StringBodyPosition instead. */
Scm_StringPosition(ScmString * str,ScmSmallInt offset)663 const char *Scm_StringPosition(ScmString *str, ScmSmallInt offset)
664 {
665 return Scm_StringBodyPosition(SCM_STRING_BODY(str), offset);
666 }
667
668 /*----------------------------------------------------------------
669 * Concatenation
670 */
671
Scm_StringAppend2(ScmString * x,ScmString * y)672 ScmObj Scm_StringAppend2(ScmString *x, ScmString *y)
673 {
674 const ScmStringBody *xb = SCM_STRING_BODY(x);
675 const ScmStringBody *yb = SCM_STRING_BODY(y);
676 ScmSmallInt sizex = SCM_STRING_BODY_SIZE(xb);
677 ScmSmallInt lenx = SCM_STRING_BODY_LENGTH(xb);
678 ScmSmallInt sizey = SCM_STRING_BODY_SIZE(yb);
679 ScmSmallInt leny = SCM_STRING_BODY_LENGTH(yb);
680 CHECK_SIZE(sizex+sizey);
681 u_long flags = 0;
682 char *p = SCM_NEW_ATOMIC2(char *,sizex + sizey + 1);
683
684 memcpy(p, xb->start, sizex);
685 memcpy(p+sizex, yb->start, sizey);
686 p[sizex + sizey] = '\0';
687 flags |= SCM_STRING_TERMINATED;
688
689 if (SCM_STRING_BODY_INCOMPLETE_P(xb) || SCM_STRING_BODY_INCOMPLETE_P(yb)) {
690 flags |= SCM_STRING_INCOMPLETE; /* yields incomplete string */
691 }
692 return SCM_OBJ(make_str(lenx+leny, sizex+sizey, p, flags, NULL));
693 }
694
Scm_StringAppendC(ScmString * x,const char * str,ScmSmallInt sizey,ScmSmallInt leny)695 ScmObj Scm_StringAppendC(ScmString *x, const char *str,
696 ScmSmallInt sizey, ScmSmallInt leny)
697 {
698 const ScmStringBody *xb = SCM_STRING_BODY(x);
699 ScmSmallInt sizex = SCM_STRING_BODY_SIZE(xb);
700 ScmSmallInt lenx = SCM_STRING_BODY_LENGTH(xb);
701 u_long flags = 0;
702
703 if (sizey < 0) count_size_and_length(str, &sizey, &leny);
704 else if (leny < 0) leny = count_length(str, sizey);
705 CHECK_SIZE(sizex+sizey);
706
707 char *p = SCM_NEW_ATOMIC2(char *, sizex + sizey + 1);
708 memcpy(p, xb->start, sizex);
709 memcpy(p+sizex, str, sizey);
710 p[sizex+sizey] = '\0';
711 flags |= SCM_STRING_TERMINATED;
712
713 if (SCM_STRING_BODY_INCOMPLETE_P(xb) || leny < 0) {
714 flags |= SCM_STRING_INCOMPLETE;
715 }
716 return SCM_OBJ(make_str(lenx + leny, sizex + sizey, p, flags, NULL));
717 }
718
Scm_StringAppend(ScmObj strs)719 ScmObj Scm_StringAppend(ScmObj strs)
720 {
721 #define BODY_ARRAY_SIZE 32
722 ScmSmallInt size = 0, len = 0;
723 u_long flags = 0;
724 const ScmStringBody *bodies_s[BODY_ARRAY_SIZE], **bodies;
725
726 /* It is trickier than it appears, since the strings may be modified
727 by another thread during we're dealing with it. So in the first
728 pass to sum up the lengths of strings, we extract the string bodies
729 and save it. */
730 ScmSmallInt numstrs = Scm_Length(strs);
731 if (numstrs < 0) Scm_Error("improper list not allowed: %S", strs);
732 if (numstrs > BODY_ARRAY_SIZE) {
733 bodies = SCM_NEW_ARRAY(const ScmStringBody*, numstrs);
734 } else {
735 bodies = bodies_s;
736 }
737
738 ScmSmallInt i = 0;
739 ScmObj cp;
740 SCM_FOR_EACH(cp, strs) {
741 const ScmStringBody *b;
742 if (!SCM_STRINGP(SCM_CAR(cp))) {
743 Scm_Error("string required, but got %S", SCM_CAR(cp));
744 }
745 b = SCM_STRING_BODY(SCM_CAR(cp));
746 size += SCM_STRING_BODY_SIZE(b);
747 len += SCM_STRING_BODY_LENGTH(b);
748 CHECK_SIZE(size);
749 if (SCM_STRING_BODY_INCOMPLETE_P(b)) {
750 flags |= SCM_STRING_INCOMPLETE;
751 }
752 bodies[i++] = b;
753 }
754
755 char *buf = SCM_NEW_ATOMIC2(char *, size+1);
756 char *bufp = buf;
757 for (i=0; i<numstrs; i++) {
758 const ScmStringBody *b = bodies[i];
759 memcpy(bufp, SCM_STRING_BODY_START(b), SCM_STRING_BODY_SIZE(b));
760 bufp += SCM_STRING_BODY_SIZE(b);
761 }
762 *bufp = '\0';
763 bodies = NULL; /* to help GC */
764 flags |= SCM_STRING_TERMINATED;
765 return SCM_OBJ(make_str(len, size, buf, flags, NULL));
766 #undef BODY_ARRAY_SIZE
767 }
768
Scm_StringJoin(ScmObj strs,ScmString * delim,int grammar)769 ScmObj Scm_StringJoin(ScmObj strs, ScmString *delim, int grammar)
770 {
771 #define BODY_ARRAY_SIZE 32
772 ScmSmallInt size = 0, len = 0;
773 u_long flags = 0;
774 const ScmStringBody *bodies_s[BODY_ARRAY_SIZE], **bodies;
775
776 ScmSmallInt nstrs = Scm_Length(strs);
777 if (nstrs < 0) Scm_Error("improper list not allowed: %S", strs);
778 if (nstrs == 0) {
779 if (grammar == SCM_STRING_JOIN_STRICT_INFIX) {
780 Scm_Error("can't join empty list of strings with strict-infix grammar");
781 }
782 return SCM_MAKE_STR("");
783 }
784
785 if (nstrs > BODY_ARRAY_SIZE) {
786 bodies = SCM_NEW_ARRAY(const ScmStringBody *, nstrs);
787 } else {
788 bodies = bodies_s;
789 }
790
791 const ScmStringBody *dbody = SCM_STRING_BODY(delim);
792 ScmSmallInt dsize = SCM_STRING_BODY_SIZE(dbody);
793 ScmSmallInt dlen = SCM_STRING_BODY_LENGTH(dbody);
794 if (SCM_STRING_BODY_INCOMPLETE_P(dbody)) {
795 flags |= SCM_STRING_INCOMPLETE;
796 }
797
798 ScmSmallInt i = 0, ndelim;
799 ScmObj cp;
800 SCM_FOR_EACH(cp, strs) {
801 const ScmStringBody *b;
802 if (!SCM_STRINGP(SCM_CAR(cp))) {
803 Scm_Error("string required, but got %S", SCM_CAR(cp));
804 }
805 b = SCM_STRING_BODY(SCM_CAR(cp));
806 size += SCM_STRING_BODY_SIZE(b);
807 len += SCM_STRING_BODY_LENGTH(b);
808 CHECK_SIZE(size);
809 if (SCM_STRING_BODY_INCOMPLETE_P(b)) {
810 flags |= SCM_STRING_INCOMPLETE;
811 }
812 bodies[i++] = b;
813 }
814 if (grammar == SCM_STRING_JOIN_INFIX
815 || grammar == SCM_STRING_JOIN_STRICT_INFIX) {
816 ndelim = nstrs - 1;
817 } else {
818 ndelim = nstrs;
819 }
820 size += dsize * ndelim;
821 len += dlen * ndelim;
822 CHECK_SIZE(size);
823
824 char *buf = SCM_NEW_ATOMIC2(char *, size+1);
825 char *bufp = buf;
826 if (grammar == SCM_STRING_JOIN_PREFIX) {
827 memcpy(bufp, SCM_STRING_BODY_START(dbody), dsize);
828 bufp += dsize;
829 }
830 for (i=0; i<nstrs; i++) {
831 const ScmStringBody *b = bodies[i];
832 memcpy(bufp, SCM_STRING_BODY_START(b), SCM_STRING_BODY_SIZE(b));
833 bufp += SCM_STRING_BODY_SIZE(b);
834 if (i < nstrs-1) {
835 memcpy(bufp, SCM_STRING_BODY_START(dbody), dsize);
836 bufp += dsize;
837 }
838 }
839 if (grammar == SCM_STRING_JOIN_SUFFIX) {
840 memcpy(bufp, SCM_STRING_BODY_START(dbody), dsize);
841 bufp += dsize;
842 }
843 *bufp = '\0';
844 bodies = NULL; /* to help GC */
845 flags |= SCM_STRING_TERMINATED;
846 return SCM_OBJ(make_str(len, size, buf, flags, NULL));
847 #undef BODY_ARRAY_SIZE
848 }
849
850 /*----------------------------------------------------------------
851 * Mutation
852 */
853
854 /*
855 * String mutation is extremely heavy operation in Gauche,
856 * and only provided for compatibility to RnRS. At C API level
857 * there's no point in using string mutation at all. A single
858 * API, which replaces the string body, is provided at C level.
859 */
860
Scm_StringReplaceBody(ScmString * str,const ScmStringBody * newbody)861 ScmObj Scm_StringReplaceBody(ScmString *str, const ScmStringBody *newbody)
862 {
863 if (SCM_STRING_IMMUTABLE_P(str)) {
864 Scm_Error("attempted to modify an immutable string: %S", str);
865 }
866
867 /* Atomically replaces the str's body (no MT hazard) */
868 str->body = newbody;
869
870 /* TODO: If the initialBody of str isn't shared,
871 nullify str->initialBody.start so that the original string is
872 GCed. It should be done after implementing 'shared' flag
873 into the string body. */
874 return SCM_OBJ(str);
875 }
876
877 /*----------------------------------------------------------------
878 * Substring
879 */
880
substring(const ScmStringBody * xb,ScmSmallInt start,ScmSmallInt end,int byterange,int immutable)881 static ScmObj substring(const ScmStringBody *xb,
882 ScmSmallInt start, ScmSmallInt end,
883 int byterange, int immutable)
884 {
885 ScmSmallInt len = byterange? SCM_STRING_BODY_SIZE(xb) : SCM_STRING_BODY_LENGTH(xb);
886 u_long flags = SCM_STRING_BODY_FLAGS(xb);
887 if (!immutable) flags &= ~SCM_STRING_IMMUTABLE;
888
889 SCM_CHECK_START_END(start, end, len);
890
891 if (byterange) {
892 if (end != len) flags &= ~SCM_STRING_TERMINATED;
893 flags |= SCM_STRING_INCOMPLETE;
894 return SCM_OBJ(make_str(end - start,
895 end - start,
896 SCM_STRING_BODY_START(xb) + start,
897 flags, NULL));
898 } else {
899 const char *s, *e;
900 s = index2ptr(xb, start);
901 if (len == end) {
902 e = SCM_STRING_BODY_END(xb);
903 } else {
904 /* kludge - if we don't have index, forward_pos is faster. */
905 if (start > 0 && xb->index == NULL) {
906 e = forward_pos(xb, s, end - start);
907 } else {
908 e = index2ptr(xb, end);
909 }
910 flags &= ~SCM_STRING_TERMINATED;
911 }
912 return SCM_OBJ(make_str(end - start,
913 (ScmSmallInt)(e - s), s, flags, NULL));
914 }
915 }
916
substring_cursor(const ScmStringBody * xb,const char * start,const char * end,int immutable)917 static ScmObj substring_cursor(const ScmStringBody *xb,
918 const char *start,
919 const char *end,
920 int immutable)
921 {
922 u_long flags = SCM_STRING_BODY_FLAGS(xb);
923 if (!immutable) flags &= ~SCM_STRING_IMMUTABLE;
924
925 if (start < SCM_STRING_BODY_START(xb) ||
926 start > SCM_STRING_BODY_END(xb)) {
927 Scm_Error("start argument out of range: %S", start);
928 }
929 else if (end > SCM_STRING_BODY_END(xb)) {
930 Scm_Error("end argument out of range: %S", end);
931 } else if (end < start) {
932 Scm_Error("end argument must be greater than or "
933 "equal to the start argument: %S vs %S", end, start);
934 }
935
936 if (end != SCM_STRING_BODY_END(xb)) {
937 flags &= ~SCM_STRING_TERMINATED;
938 }
939
940 ScmSmallInt len;
941 if (SCM_STRING_BODY_SINGLE_BYTE_P(xb)) {
942 len = (ScmSmallInt)(end - start);
943 } else {
944 len = Scm_MBLen(start, end);
945 }
946
947 return SCM_OBJ(make_str(len,
948 (ScmSmallInt)(end - start),
949 start, flags, NULL));
950 }
951
Scm_Substring(ScmString * x,ScmSmallInt start,ScmSmallInt end,int byterangep)952 ScmObj Scm_Substring(ScmString *x, ScmSmallInt start, ScmSmallInt end,
953 int byterangep)
954 {
955 return substring(SCM_STRING_BODY(x), start, end, byterangep, FALSE);
956 }
957
958 /* Auxiliary procedure to support optional start/end parameter specified
959 in lots of SRFI-13 functions. If start and end is specified and restricts
960 string range, call substring. Otherwise returns x itself.
961 If input string is immutable, the result is also immutable. If the caller
962 needs a mutable string it should call CopyString anyway, for the caller
963 doesn't know if the input string is just passed through.
964 */
Scm_MaybeSubstring(ScmString * x,ScmObj start,ScmObj end)965 ScmObj Scm_MaybeSubstring(ScmString *x, ScmObj start, ScmObj end)
966 {
967 const ScmStringBody *xb = SCM_STRING_BODY(x);
968 int no_start = SCM_UNBOUNDP(start) || SCM_UNDEFINEDP(start) || SCM_FALSEP(start);
969 int no_end = SCM_UNBOUNDP(end) || SCM_UNDEFINEDP(end) || SCM_FALSEP(end);
970 ScmSmallInt istart = -1, iend = -1, ostart = -1, oend = -1;
971
972 int immutable = SCM_STRING_BODY_HAS_FLAG(xb, SCM_STRING_IMMUTABLE);
973
974 if (no_start)
975 istart = 0;
976 else if (SCM_STRING_CURSOR_P(start))
977 ostart = string_cursor_offset(start);
978 else if (SCM_INTP(start))
979 istart = SCM_INT_VALUE(start);
980 else
981 Scm_Error("exact integer or cursor required for start, but got %S", start);
982
983 if (no_end) {
984 if (istart == 0 || ostart == 0) {
985 return SCM_OBJ(x);
986 }
987 iend = SCM_STRING_BODY_LENGTH(xb);
988 } else if (SCM_STRING_CURSOR_P(end))
989 oend = string_cursor_offset(end);
990 else if (SCM_INTP(end))
991 iend = SCM_INT_VALUE(end);
992 else
993 Scm_Error("exact integer or cursor required for end, but got %S", end);
994
995 if (no_start && oend != -1) {
996 return substring_cursor(xb,
997 SCM_STRING_BODY_START(xb),
998 SCM_STRING_BODY_START(xb) + oend,
999 immutable);
1000 }
1001 if (ostart != -1 && oend != -1) {
1002 return substring_cursor(xb,
1003 SCM_STRING_BODY_START(xb) + ostart,
1004 SCM_STRING_BODY_START(xb) + oend,
1005 immutable);
1006 }
1007 if (ostart != -1 && no_end) {
1008 return substring_cursor(xb,
1009 SCM_STRING_BODY_START(xb) + ostart,
1010 SCM_STRING_BODY_END(xb),
1011 immutable);
1012 }
1013
1014 if (ostart != -1) {
1015 istart = Scm_GetInteger(Scm_StringCursorIndex(x, start));
1016 }
1017 if (oend != -1) {
1018 iend = Scm_GetInteger(Scm_StringCursorIndex(x, end));
1019 }
1020
1021 return substring(xb, istart, iend, FALSE, immutable);
1022 }
1023
1024 /*----------------------------------------------------------------
1025 * Search & parse
1026 */
1027
1028 /* Boyer-Moore string search. assuming siz1 > siz2, siz2 < 256. */
boyer_moore(const char * ss1,ScmSmallInt siz1,const char * ss2,ScmSmallInt siz2)1029 static ScmSmallInt boyer_moore(const char *ss1, ScmSmallInt siz1,
1030 const char *ss2, ScmSmallInt siz2)
1031 {
1032 unsigned char shift[256];
1033 for (ScmSmallInt i=0; i<256; i++) { shift[i] = siz2; }
1034 for (ScmSmallInt j=0; j<siz2-1; j++) {
1035 shift[(unsigned char)ss2[j]] = siz2-j-1;
1036 }
1037 for (ScmSmallInt i=siz2-1; i<siz1; i+=shift[(unsigned char)ss1[i]]) {
1038 ScmSmallInt j, k;
1039 for (j=siz2-1, k = i; j>=0 && ss1[k] == ss2[j]; j--, k--)
1040 ;
1041 if (j == -1) return k+1;
1042 }
1043 return -1;
1044 }
1045
boyer_moore_reverse(const char * ss1,ScmSmallInt siz1,const char * ss2,ScmSmallInt siz2)1046 static ScmSmallInt boyer_moore_reverse(const char *ss1, ScmSmallInt siz1,
1047 const char *ss2, ScmSmallInt siz2)
1048 {
1049 unsigned char shift[256];
1050 for (ScmSmallInt i=0; i<256; i++) { shift[i] = siz2; }
1051 for (ScmSmallInt j=siz2-1; j>0; j--) {
1052 shift[(unsigned char)ss2[j]] = j;
1053 }
1054 for (ScmSmallInt i=siz1-siz2+1; i>=0; i-=shift[(unsigned char)ss1[i]]) {
1055 ScmSmallInt j, k;
1056 for (j=0, k = i; j<siz2 && ss1[k] == ss2[j]; j++, k++)
1057 ;
1058 if (j == siz2) return i;
1059 }
1060 return -1;
1061 }
1062
1063 /* Primitive routines to search a substring s2 within s1.
1064 Returns NOT_FOUND if not found, FOUND_BOTH_INDEX if both byte index
1065 (*bi) and character index (*ci) is calculted, FOUND_BYTE_INDEX
1066 if only byte index is calculated.
1067
1068 When the encoding is utf-8 or none, we can scan a string as if it is just
1069 a bytestring. The only caveat is that, with utf-8, we need to calculate
1070 character index after we find the match. It is still a total win, for
1071 finding out non-matches using Boyer-Moore is a lot faster than naive way.
1072
1073 If the encoding is EUC-JP or SJIS, we can only use Boyer-Moore when we
1074 know the strings have single-byte only. Multibyte strings of those
1075 encodings can have spurious matches when compared bytewise.
1076 */
1077
1078 /* return value of string_scan */
1079 #define NOT_FOUND 0 /* string not found */
1080 #define FOUND_BOTH_INDEX 1 /* string found, and both indexes are calculated */
1081 #define FOUND_BYTE_INDEX 2 /* string found, and only byte index is calc'd */
1082
1083 /* In utf-8 multibyte case, we only count byte index and let the caller
1084 figure out the character index. In other encodings we can always find
1085 both index. */
1086 #if defined(GAUCHE_CHAR_ENCODING_UTF_8)
1087 #define FOUND_MAYBE_BOTH FOUND_BYTE_INDEX
1088 #else
1089 #define FOUND_MAYBE_BOTH FOUND_BOTH_INDEX
1090 #endif
1091
1092 /* In euc-jp and sjis case, we use faster method only when (size == len). */
1093 #if defined(GAUCHE_CHAR_ENCODING_EUC_JP) || defined(GAUCHE_CHAR_ENCODING_SJIS)
1094 #define BYTEWISE_SEARCHABLE(siz, len) ((siz) == (len))
1095 #define MULTIBYTE_NAIVE_SEARCH_NEEDED 1
1096 #else
1097 #define BYTEWISE_SEARCHABLE(siz, len) TRUE
1098 #define MULTIBYTE_NAIVE_SEARCH_NEEDED 0
1099 #endif
1100
1101 /* glibc has memrchr, but we need to provide fallback anyway and
1102 we don't need it to be highly tuned, so we just roll our own. */
my_memrchr(const void * s,int c,size_t n)1103 static const void *my_memrchr(const void *s, int c, size_t n)
1104 {
1105 const char *p = (const char*)s + n - 1;
1106 for (;p >= (const char*)s; p--) {
1107 if ((int)*p == c) return p;
1108 }
1109 return NULL;
1110 }
1111
1112 /* NB: len1 and len2 only used in certain internal CES. */
string_search(const char * s1,ScmSmallInt siz1,ScmSmallInt len1 SCM_UNUSED,const char * s2,ScmSmallInt siz2,ScmSmallInt len2 SCM_UNUSED,ScmSmallInt * bi,ScmSmallInt * ci)1113 static int string_search(const char *s1, ScmSmallInt siz1,
1114 ScmSmallInt len1 SCM_UNUSED,
1115 const char *s2, ScmSmallInt siz2,
1116 ScmSmallInt len2 SCM_UNUSED,
1117 ScmSmallInt *bi /* out */,
1118 ScmSmallInt *ci /* out */)
1119 {
1120 if (siz2 == 0) {
1121 *bi = *ci = 0;
1122 return FOUND_BOTH_INDEX;
1123 }
1124
1125 /* Single-byte case. */
1126 if (BYTEWISE_SEARCHABLE(siz1, len1)) {
1127 if (siz2 == 1) {
1128 /* Single ASCII character search case. This is a huge win. */
1129 const char *z = memchr(s1, s2[0], siz1);
1130 if (z) { *bi = *ci = z - s1; return FOUND_MAYBE_BOTH; }
1131 else return NOT_FOUND;
1132 }
1133 if (BYTEWISE_SEARCHABLE(siz2, len2)) {
1134 ScmSmallInt i;
1135 /* Shortcut for single-byte strings */
1136 if (siz1 < siz2) return NOT_FOUND;
1137 if (siz1 < 256 || siz2 >= 256) {
1138 /* brute-force search */
1139 for (i=0; i<=siz1-siz2; i++) {
1140 if (memcmp(s2, s1+i, siz2) == 0) break;
1141 }
1142 if (i == siz1-siz2+1) return NOT_FOUND;
1143 } else {
1144 i = boyer_moore(s1, siz1, s2, siz2);
1145 if (i < 0) return NOT_FOUND;
1146 }
1147 *bi = *ci = i;
1148 return FOUND_MAYBE_BOTH;
1149 }
1150 /* FALLTHROUGH */
1151 }
1152
1153 #if MULTIBYTE_NAIVE_SEARCH_NEEDED
1154 /* Multibyte case. */
1155 if (len1 >= len2) {
1156 const char *sp = s1;
1157 for (ScmSmallInt i=0; i<=len1-len2; i++) {
1158 if (memcmp(sp, s2, siz2) == 0) {
1159 *bi = (ScmSmallInt)(sp - s1);
1160 *ci = i;
1161 return FOUND_BOTH_INDEX;
1162 }
1163 sp += SCM_CHAR_NFOLLOWS(*sp) + 1;
1164 }
1165 }
1166 #endif /*MULTIBYTE_NAIVE_SEARCH_NEEDED*/
1167 return NOT_FOUND;
1168 }
1169
1170 /* NB: len2 is only used in some internal CES */
string_search_reverse(const char * s1,ScmSmallInt siz1,ScmSmallInt len1,const char * s2,ScmSmallInt siz2,ScmSmallInt len2 SCM_UNUSED,ScmSmallInt * bi,ScmSmallInt * ci)1171 static int string_search_reverse(const char *s1, ScmSmallInt siz1,
1172 ScmSmallInt len1,
1173 const char *s2, ScmSmallInt siz2,
1174 ScmSmallInt len2 SCM_UNUSED,
1175 ScmSmallInt *bi /* out */,
1176 ScmSmallInt *ci /* out */)
1177 {
1178 if (siz2 == 0) {
1179 *bi = siz1;
1180 *ci = len1;
1181 return FOUND_BOTH_INDEX;
1182 }
1183
1184 /* Single-byte case. */
1185 if (BYTEWISE_SEARCHABLE(siz1, len1)) {
1186 if (siz2 == 1) {
1187 /* Single ASCII character search case. This is a huge win. */
1188 const char *z = my_memrchr(s1, s2[0], siz1);
1189 if (z) { *bi = *ci = z - s1; return FOUND_MAYBE_BOTH; }
1190 else return NOT_FOUND;
1191 }
1192 if (BYTEWISE_SEARCHABLE(siz2, len2)) {
1193 ScmSmallInt i;
1194 /* short cut for single-byte strings */
1195 if (siz1 < siz2) return NOT_FOUND;
1196 if (siz1 < 256 || siz2 >= 256) {
1197 /* brute-force search */
1198 for (i=siz1-siz2; i>=0; i--) {
1199 if (memcmp(s2, s1+i, siz2) == 0) break;
1200 }
1201 if (i < 0) return NOT_FOUND;
1202 } else {
1203 i = boyer_moore_reverse(s1, siz1, s2, siz2);
1204 if (i < 0) return NOT_FOUND;
1205 }
1206 *bi = *ci = i;
1207 return FOUND_MAYBE_BOTH;
1208 } else {
1209 return NOT_FOUND; /* sbstring can't contain mbstring. */
1210 }
1211 }
1212
1213 #if MULTIBYTE_NAIVE_SEARCH_NEEDED
1214 /* Multibyte case. */
1215 if (len1 >= len2) {
1216 const char *sp = s1 + siz1, *p;
1217 for (ScmSmallInt i=0; i<len2; i++) {
1218 SCM_CHAR_BACKWARD(sp, s1, p);
1219 SCM_ASSERT(*p);
1220 sp = p;
1221 }
1222 for (ScmSmallInt i=len1-len2; i>=0; i--) {
1223 if (memcmp(sp, s2, siz2) == 0) {
1224 *bi = (ScmSmallInt)(sp - s1);
1225 *ci = i;
1226 return FOUND_BOTH_INDEX;
1227 }
1228 SCM_CHAR_BACKWARD(sp, s1, p);
1229 sp = p;
1230 }
1231 }
1232 #endif /*MULTIBYTE_NAIVE_SEARCH_NEEDED*/
1233 return NOT_FOUND;
1234 }
1235
1236 /* Scan s2 in s1, and calculates appropriate return value(s) according to
1237 retmode. Returns # of values, 1 or 2.
1238
1239 SCM_STRING_SCAN_INDEX : v1 <- the index of s1
1240 s1 = "abcde" and s2 = "cd" => 2
1241 SCM_STRING_SCAN_CURSOR : v1 <- the cursor of s1
1242 s1 = "abcde" and s2 = "cd" => #<string-cursor 2>
1243 SCM_STRING_SCAN_BEFORE : v1 <- substring of s1 before s2
1244 s1 = "abcde" and s2 = "cd" => "ab"
1245 SCM_STRING_SCAN_AFTER : v1 <- substring of s1 after s2
1246 s1 = "abcde" and s2 = "cd" => "e"
1247 SCM_STRING_SCAN_BEFORE2 : v1 <- substring of s1 before s2, v2 <- rest
1248 s1 = "abcde" and s2 = "cd" => "ab" and "cde"
1249 SCM_STRING_SCAN_AFTER2 : v1 <- substring of s1 up to s2, v2 <- rest
1250 s1 = "abcde" and s2 = "cd" => "abcd" and "e"
1251 SCM_STRING_SCAN_BOTH : v1 <- substring of s1 before, v2 <- after s2
1252 s1 = "abcde" and s2 = "cd" => "ab" and "e"
1253 */
string_scan(ScmString * ss1,const char * s2,ScmSmallInt siz2,ScmSmallInt len2,int incomplete2,int retmode,int (* searcher)(const char *,ScmSmallInt,ScmSmallInt,const char *,ScmSmallInt,ScmSmallInt,ScmSmallInt *,ScmSmallInt *),ScmObj * v1,ScmObj * v2)1254 static int string_scan(ScmString *ss1, const char *s2,
1255 ScmSmallInt siz2, ScmSmallInt len2,
1256 int incomplete2,
1257 int retmode,
1258 int (*searcher)(const char*, ScmSmallInt, ScmSmallInt,
1259 const char*, ScmSmallInt, ScmSmallInt,
1260 ScmSmallInt*, ScmSmallInt*),
1261 ScmObj *v1, /* out */
1262 ScmObj *v2) /* out */
1263 {
1264 ScmSmallInt bi = 0, ci = 0;
1265 const ScmStringBody *sb = SCM_STRING_BODY(ss1);
1266 const char *s1 = SCM_STRING_BODY_START(sb);
1267 ScmSmallInt siz1 = SCM_STRING_BODY_SIZE(sb);
1268 ScmSmallInt len1 = SCM_STRING_BODY_LENGTH(sb);
1269
1270 if (retmode < 0 || retmode >= SCM_STRING_SCAN_NUM_RETMODES) {
1271 Scm_Error("return mode out fo range: %d", retmode);
1272 }
1273
1274 int incomplete =
1275 (SCM_STRING_BODY_INCOMPLETE_P(sb) || incomplete2)
1276 ? SCM_STRING_INCOMPLETE : 0;
1277
1278 /* prefiltering - if both string is complete, and s1 is sbstring
1279 and s2 is mbstring, we know there's no match. */
1280 int retcode =
1281 (!incomplete && (siz1 == len1) && (siz2 != len2))
1282 ? NOT_FOUND
1283 : searcher(s1, siz1, len1, s2, siz2, len2, &bi, &ci);
1284
1285 if (retcode == NOT_FOUND) {
1286 switch (retmode) {
1287 case SCM_STRING_SCAN_INDEX:
1288 case SCM_STRING_SCAN_CURSOR:
1289 case SCM_STRING_SCAN_BEFORE:
1290 case SCM_STRING_SCAN_AFTER:
1291 *v1 = SCM_FALSE;
1292 return 1;
1293 default:
1294 *v1 = SCM_FALSE;
1295 *v2 = SCM_FALSE;
1296 return 2;
1297 }
1298 }
1299
1300 if (retmode != SCM_STRING_SCAN_CURSOR
1301 && (retcode == FOUND_BYTE_INDEX && !incomplete)) {
1302 ci = count_length(s1, bi);
1303 }
1304
1305 switch (retmode) {
1306 case SCM_STRING_SCAN_INDEX:
1307 *v1 = Scm_MakeInteger(ci);
1308 return 1;
1309 case SCM_STRING_SCAN_CURSOR:
1310 *v1 = make_string_cursor(ss1, s1 + bi);
1311 return 1;
1312 case SCM_STRING_SCAN_BEFORE:
1313 *v1 = Scm_MakeString(s1, bi, ci, incomplete);
1314 return 1;
1315 case SCM_STRING_SCAN_AFTER:
1316 *v1 = Scm_MakeString(s1+bi+siz2, siz1-bi-siz2,
1317 len1-ci-len2, incomplete);
1318 return 1;
1319 case SCM_STRING_SCAN_BEFORE2:
1320 *v1 = Scm_MakeString(s1, bi, ci, incomplete);
1321 *v2 = Scm_MakeString(s1+bi, siz1-bi, len1-ci, incomplete);
1322 return 2;
1323 case SCM_STRING_SCAN_AFTER2:
1324 *v1 = Scm_MakeString(s1, bi+siz2, ci+len2, incomplete);
1325 *v2 = Scm_MakeString(s1+bi+siz2, siz1-bi-siz2,
1326 len1-ci-len2, incomplete);
1327 return 2;
1328 case SCM_STRING_SCAN_BOTH:
1329 *v1 = Scm_MakeString(s1, bi, ci, incomplete);;
1330 *v2 = Scm_MakeString(s1+bi+siz2, siz1-bi-siz2,
1331 len1-ci-len2, incomplete);
1332 return 2;
1333 }
1334 return 0; /* dummy */
1335 }
1336
Scm_StringScan(ScmString * s1,ScmString * s2,int retmode)1337 ScmObj Scm_StringScan(ScmString *s1, ScmString *s2, int retmode)
1338 {
1339 ScmObj v1, v2;
1340 const ScmStringBody *s2b = SCM_STRING_BODY(s2);
1341 int nvals = string_scan(s1,
1342 SCM_STRING_BODY_START(s2b),
1343 SCM_STRING_BODY_SIZE(s2b),
1344 SCM_STRING_BODY_LENGTH(s2b),
1345 SCM_STRING_BODY_INCOMPLETE_P(s2b),
1346 retmode, string_search, &v1, &v2);
1347 if (nvals == 1) return v1;
1348 else return Scm_Values2(v1, v2);
1349 }
1350
Scm_StringScanChar(ScmString * s1,ScmChar ch,int retmode)1351 ScmObj Scm_StringScanChar(ScmString *s1, ScmChar ch, int retmode)
1352 {
1353 ScmObj v1, v2;
1354 char buf[SCM_CHAR_MAX_BYTES];
1355 SCM_CHAR_PUT(buf, ch);
1356 int nvals = string_scan(s1, buf, SCM_CHAR_NBYTES(ch), 1, FALSE, retmode,
1357 string_search, &v1, &v2);
1358 if (nvals == 1) return v1;
1359 else return Scm_Values2(v1, v2);
1360 }
1361
Scm_StringScanRight(ScmString * s1,ScmString * s2,int retmode)1362 ScmObj Scm_StringScanRight(ScmString *s1, ScmString *s2, int retmode)
1363 {
1364 ScmObj v1, v2;
1365 const ScmStringBody *s2b = SCM_STRING_BODY(s2);
1366 int nvals = string_scan(s1,
1367 SCM_STRING_BODY_START(s2b),
1368 SCM_STRING_BODY_SIZE(s2b),
1369 SCM_STRING_BODY_LENGTH(s2b),
1370 SCM_STRING_BODY_INCOMPLETE_P(s2b),
1371 retmode, string_search_reverse, &v1, &v2);
1372 if (nvals == 1) return v1;
1373 else return Scm_Values2(v1, v2);
1374 }
1375
Scm_StringScanCharRight(ScmString * s1,ScmChar ch,int retmode)1376 ScmObj Scm_StringScanCharRight(ScmString *s1, ScmChar ch, int retmode)
1377 {
1378 ScmObj v1, v2;
1379 char buf[SCM_CHAR_MAX_BYTES];
1380 SCM_CHAR_PUT(buf, ch);
1381 int nvals = string_scan(s1, buf, SCM_CHAR_NBYTES(ch), 1, FALSE, retmode,
1382 string_search_reverse, &v1, &v2);
1383 if (nvals == 1) return v1;
1384 else return Scm_Values2(v1, v2);
1385 }
1386
1387 #undef NOT_FOUND
1388 #undef FOUND_BOTH_INDEX
1389 #undef FOUND_BYTE_INDEX
1390 #undef FOUND_MAYBE_BOTH
1391 #undef BYTEWISE_SEARCHABLE
1392 #undef MULTIBYTE_NAIVE_SEARCH_NEEDED
1393
1394 /* Split string by char. Char itself is not included in the result.
1395 If LIMIT >= 0, up to that number of matches are considered (i.e.
1396 up to LIMIT+1 strings are returned). LIMIT < 0 makes the number
1397 of matches unlimited.
1398 TODO: If CH is a utf-8 multi-byte char, Boyer-Moore skip table is
1399 calculated every time we call string_scan, which is a waste. Some
1400 mechanism to cache the skip table would be nice.
1401 */
Scm_StringSplitByCharWithLimit(ScmString * str,ScmChar ch,int limit)1402 ScmObj Scm_StringSplitByCharWithLimit(ScmString *str, ScmChar ch, int limit)
1403 {
1404 char buf[SCM_CHAR_MAX_BYTES];
1405 int nb = SCM_CHAR_NBYTES(ch);
1406 ScmObj head = SCM_NIL, tail = SCM_NIL;
1407
1408 if (limit == 0) return SCM_LIST1(SCM_OBJ(str)); /* trivial case */
1409
1410 SCM_CHAR_PUT(buf, ch);
1411
1412 for (;;) {
1413 ScmObj v1, v2;
1414 (void)string_scan(str, buf, nb, 1, FALSE, SCM_STRING_SCAN_BOTH,
1415 string_search, &v1, &v2);
1416 if (SCM_FALSEP(v1)) {
1417 SCM_APPEND1(head, tail, SCM_OBJ(str));
1418 break;
1419 } else {
1420 SCM_APPEND1(head, tail, v1);
1421 if (--limit == 0) { SCM_APPEND1(head, tail, v2); break; }
1422 }
1423 str = SCM_STRING(v2);
1424 }
1425 return head;
1426 }
1427
1428 /* For ABI compatibility - On 1.0, let's make this have limit arg and
1429 drop Scm_StringSplitByCharWithLimit. */
Scm_StringSplitByChar(ScmString * str,ScmChar ch)1430 ScmObj Scm_StringSplitByChar(ScmString *str, ScmChar ch)
1431 {
1432 return Scm_StringSplitByCharWithLimit(str, ch, -1);
1433 }
1434
1435 /*----------------------------------------------------------------
1436 * Miscellaneous functions
1437 */
1438
Scm_StringToList(ScmString * str)1439 ScmObj Scm_StringToList(ScmString *str)
1440 {
1441 const ScmStringBody *b = SCM_STRING_BODY(str);
1442 ScmObj start = SCM_NIL, end = SCM_NIL;
1443 const char *bufp = SCM_STRING_BODY_START(b);
1444 ScmSmallInt len = SCM_STRING_BODY_LENGTH(b);
1445
1446 if (SCM_STRING_BODY_INCOMPLETE_P(b))
1447 Scm_Error("incomplete string not supported: %S", str);
1448 while (len-- > 0) {
1449 ScmChar ch;
1450 SCM_CHAR_GET(bufp, ch);
1451 bufp += SCM_CHAR_NBYTES(ch);
1452 SCM_APPEND1(start, end, SCM_MAKE_CHAR(ch));
1453 }
1454 return start;
1455 }
1456
1457 /* Convert cstring array to a list of Scheme strings. Cstring array
1458 can be NULL terminated (in case size < 0) or its size is explicitly
1459 specified (size >= 0). FLAGS is passed to Scm_MakeString. */
Scm_CStringArrayToList(const char ** array,ScmSmallInt size,u_long flags)1460 ScmObj Scm_CStringArrayToList(const char **array, ScmSmallInt size, u_long flags)
1461 {
1462 ScmObj h = SCM_NIL, t = SCM_NIL;
1463 if (size < 0) {
1464 for (;*array; array++) {
1465 ScmObj s = Scm_MakeString(*array, -1, -1, flags);
1466 SCM_APPEND1(h, t, s);
1467 }
1468 } else {
1469 for (ScmSmallInt i=0; i<size; i++, array++) {
1470 ScmObj s = Scm_MakeString(*array, -1, -1, flags);
1471 SCM_APPEND1(h, t, s);
1472 }
1473 }
1474 return h;
1475 }
1476
1477 /* common routine for Scm_ListTo[Const]CStringArray */
list_to_cstring_array_check(ScmObj lis,int errp)1478 static ScmSmallInt list_to_cstring_array_check(ScmObj lis, int errp)
1479 {
1480 ScmObj lp;
1481 ScmSmallInt len = 0;
1482 SCM_FOR_EACH(lp, lis) {
1483 if (!SCM_STRINGP(SCM_CAR(lp))) {
1484 if (errp) Scm_Error("a proper list of strings is required, but the list contains non-string element: %S", SCM_CAR(lp));
1485 else return -1;
1486 }
1487 len++;
1488 }
1489 return len;
1490 }
1491
1492 /* Convert list of Scheme strings into C const char* string array, NULL
1493 terminated.
1494 If errp == FALSE, returns NULL on error.
1495 otherwise, signals an error. */
Scm_ListToConstCStringArray(ScmObj lis,int errp)1496 const char **Scm_ListToConstCStringArray(ScmObj lis, int errp)
1497 {
1498 ScmSmallInt len = list_to_cstring_array_check(lis, errp);
1499 if (len < 0) return NULL;
1500 const char **array = SCM_NEW_ARRAY(const char*, len+1);
1501 const char **p = array;
1502 ScmObj lp;
1503 SCM_FOR_EACH(lp, lis) {
1504 *p++ = Scm_GetStringConst(SCM_STRING(SCM_CAR(lp)));
1505 }
1506 *p = NULL; /* termination */
1507 return array;
1508 }
1509
1510 /* Convert list of Scheme strings into C char* string array, NULL
1511 terminated.
1512 If errp == FALSE, returns NULL on error.
1513 otherwise, signals an error.
1514 If provided, alloc is used to allocate both a pointer array and char
1515 arrays. Otherwise, SCM_ALLOC is used. */
Scm_ListToCStringArray(ScmObj lis,int errp,void * (* alloc)(size_t))1516 char **Scm_ListToCStringArray(ScmObj lis, int errp, void *(*alloc)(size_t))
1517 {
1518 char **array, **p;
1519 ScmSmallInt len = list_to_cstring_array_check(lis, errp);
1520 if (len < 0) return NULL;
1521
1522 if (alloc) {
1523 p = array = (char **)alloc((len+1) * sizeof(char *));
1524 ScmObj lp;
1525 SCM_FOR_EACH(lp, lis) {
1526 const char *s = Scm_GetStringConst(SCM_STRING(SCM_CAR(lp)));
1527 *p = (char *)alloc(strlen(s) + 1);
1528 strcpy(*p, s);
1529 p++;
1530 }
1531 } else {
1532 p = array = SCM_NEW_ARRAY(char*, len+1);
1533 ScmObj lp;
1534 SCM_FOR_EACH(lp, lis) {
1535 *p++ = Scm_GetString(SCM_STRING(SCM_CAR(lp)));
1536 }
1537 }
1538 *p = NULL; /* termination */
1539 return array;
1540 }
1541
1542 /*----------------------------------------------------------------
1543 * printer
1544 */
string_putc(ScmChar ch,ScmPort * port,int bytemode)1545 static inline void string_putc(ScmChar ch, ScmPort *port, int bytemode)
1546 {
1547 char buf[6];
1548 switch (ch) {
1549 case '\\': SCM_PUTZ("\\\\", -1, port); break;
1550 case '"': SCM_PUTZ("\\\"", -1, port); break;
1551 case '\n': SCM_PUTZ("\\n", -1, port); break;
1552 case '\t': SCM_PUTZ("\\t", -1, port); break;
1553 case '\r': SCM_PUTZ("\\r", -1, port); break;
1554 case '\f': SCM_PUTZ("\\f", -1, port); break;
1555 case '\0': SCM_PUTZ("\\0", -1, port); break;
1556 default:
1557 if (ch < ' ' || ch == 0x7f || (bytemode && ch >= 0x80)) {
1558 /* TODO: Should we provide 'legacy-compatible writer mode,
1559 which does not use ';' terminator? */
1560 snprintf(buf, 6, "\\x%02x;", (unsigned char)ch);
1561 SCM_PUTZ(buf, -1, port);
1562 } else {
1563 SCM_PUTC(ch, port);
1564 }
1565 }
1566 }
1567
string_print(ScmObj obj,ScmPort * port,ScmWriteContext * ctx)1568 static void string_print(ScmObj obj, ScmPort *port, ScmWriteContext *ctx)
1569 {
1570 ScmString *str = SCM_STRING(obj);
1571 if (Scm_WriteContextMode(ctx) == SCM_WRITE_DISPLAY) {
1572 SCM_PUTS(str, port);
1573 } else {
1574 const ScmStringBody *b = SCM_STRING_BODY(str);
1575 if (SCM_STRING_BODY_SINGLE_BYTE_P(b)) {
1576 const char *cp = SCM_STRING_BODY_START(b);
1577 ScmSmallInt size = SCM_STRING_BODY_SIZE(b);
1578 if (SCM_STRING_BODY_INCOMPLETE_P(b)) {
1579 /* TODO: Should we provide legacy-compatible writer mode,
1580 which puts #*"..." instead? */
1581 SCM_PUTZ("#**\"", -1, port);
1582 } else {
1583 SCM_PUTC('"', port);
1584 }
1585 while (size--) {
1586 string_putc(*cp++, port, SCM_STRING_BODY_INCOMPLETE_P(b));
1587 }
1588 } else {
1589 const char *cp = SCM_STRING_BODY_START(b);
1590 ScmSmallInt len = SCM_STRING_BODY_LENGTH(b);
1591
1592 SCM_PUTC('"', port);
1593 while (len--) {
1594 ScmChar ch;
1595 SCM_CHAR_GET(cp, ch);
1596 string_putc(ch, port, FALSE);
1597 cp += SCM_CHAR_NBYTES(ch);
1598 }
1599 }
1600 SCM_PUTC('"', port);
1601 }
1602 }
1603
1604 /*==================================================================
1605 *
1606 * String index building
1607 *
1608 */
1609
string_body_index_needed(const ScmStringBody * sb)1610 static int string_body_index_needed(const ScmStringBody *sb)
1611 {
1612 return (!SCM_STRING_BODY_SINGLE_BYTE_P(sb)
1613 && !SCM_STRING_BODY_INCOMPLETE_P(sb)
1614 && SCM_STRING_BODY_SIZE(sb) >= 64);
1615 }
1616
Scm_StringBodyFastIndexableP(const ScmStringBody * sb)1617 int Scm_StringBodyFastIndexableP(const ScmStringBody *sb)
1618 {
1619 return (!string_body_index_needed(sb)
1620 || SCM_STRING_BODY_HAS_INDEX(sb));
1621 }
1622
compute_index_size(const ScmStringBody * sb,int interval)1623 static size_t compute_index_size(const ScmStringBody *sb, int interval)
1624 {
1625 ScmSmallInt len = SCM_STRING_BODY_LENGTH(sb);
1626 /* We don't store the first entry (0th character == 0th byte), and
1627 we use two extra entry for the signature and index_size. So
1628 we need +1. */
1629 return ((len + interval - 1)/interval) + 1;
1630 }
1631
build_index_array(const ScmStringBody * sb)1632 static void *build_index_array(const ScmStringBody *sb)
1633 {
1634 /* Signature byte is repeated in the first element of the vector */
1635 #define SIG8(type,sig) (type)(sig)
1636 #define SIG16(type,sig) ((type)((sig)<<8)|(sig))
1637 #define SIG32(type,sig) ((type)(SIG16(type,sig)<<16)|SIG16(type,sig))
1638 #define SIG64(type,sig) ((type)(SIG32(type,sig)<<32)|SIG32(type,sig))
1639
1640 #define BUILD_ARRAY(type_, typeenum_, shift_, sigrep_) \
1641 do { \
1642 int interval = 1 << (shift_); \
1643 size_t index_size = compute_index_size(sb, interval); \
1644 type_ *vec = SCM_NEW_ATOMIC_ARRAY(type_, index_size); \
1645 u_long sig = STRING_INDEX_SIGNATURE(shift_, typeenum_); \
1646 vec[0] = sigrep_(type_,sig); \
1647 vec[1] = (type_)index_size; \
1648 const char *p = SCM_STRING_BODY_START(sb); \
1649 for (size_t i = 2; i < index_size; i++) { \
1650 const char *q = forward_pos(sb, p, interval); \
1651 vec[i] = (type_)(q - SCM_STRING_BODY_START(sb)); \
1652 p = q; \
1653 } \
1654 return vec; \
1655 } while (0)
1656
1657 /* Technically we can use index8 even if size is bigger than 256,
1658 as long as the last indexed character is within the range. But
1659 checking it is too much. */
1660 if (sb->size < 256) {
1661 BUILD_ARRAY(uint8_t, STRING_INDEX8, 4, SIG8);
1662 } else if (sb->size < 8192) {
1663 /* 32 chars interval */
1664 BUILD_ARRAY(uint16_t, STRING_INDEX16, 5, SIG16);
1665 } else if (sb->size < 65536) {
1666 /* 64 chars interval */
1667 BUILD_ARRAY(uint16_t, STRING_INDEX16, 6, SIG16);
1668 }
1669 #if SIZEOF_LONG == 4
1670 else {
1671 /* 128 chars interval */
1672 BUILD_ARRAY(uint32_t, STRING_INDEX32, 7, SIG32);
1673 }
1674 #else /* SIZEOF_LONG != 4 */
1675 else if (sb->size < (1L<<32)) {
1676 /* 128 chars interval */
1677 BUILD_ARRAY(uint32_t, STRING_INDEX32, 7, SIG32);
1678 } else {
1679 /* 256 chars interval */
1680 BUILD_ARRAY(uint64_t, STRING_INDEX64, 8, SIG64);
1681 }
1682 #endif
1683 #undef BUILD_ARRAY
1684 }
1685
Scm_StringBodyBuildIndex(ScmStringBody * sb)1686 void Scm_StringBodyBuildIndex(ScmStringBody *sb)
1687 {
1688 if (!string_body_index_needed(sb) || SCM_STRING_BODY_HAS_INDEX(sb)) return;
1689 /* This is idempotent, atomic operation; no need to lock. */
1690 sb->index = build_index_array(sb);
1691 }
1692
1693 /* For debugging */
Scm_StringBodyIndexDump(const ScmStringBody * sb,ScmPort * port)1694 void Scm_StringBodyIndexDump(const ScmStringBody *sb, ScmPort *port)
1695 {
1696 ScmStringIndex *index = STRING_INDEX(sb->index);
1697 if (index == NULL) {
1698 Scm_Printf(port, "(nil)\n");
1699 return;
1700 }
1701 int interval = STRING_INDEX_INTERVAL(index);
1702 size_t index_size = 0;
1703
1704 switch (STRING_INDEX_TYPE(index)) {
1705 case STRING_INDEX8:
1706 Scm_Printf(port, "index8 ");
1707 index_size = (size_t)index->index8[1];
1708 break;
1709 case STRING_INDEX16:
1710 Scm_Printf(port, "index16 ");
1711 index_size = (size_t)index->index16[1];
1712 break;
1713 case STRING_INDEX32:
1714 Scm_Printf(port, "index32 ");
1715 index_size = (size_t)index->index32[1];
1716 break;
1717 case STRING_INDEX64:
1718 Scm_Printf(port, "index64 ");
1719 index_size = (size_t)index->index64[1];
1720 break;
1721 default:
1722 Scm_Printf(port, "unknown(%02x) ", (uint8_t)STRING_INDEX_TYPE(index));
1723 }
1724 Scm_Printf(port, " interval %d size %d\n", interval, index_size-1);
1725 Scm_Printf(port, " 0 0\n");
1726 for (size_t i = 2; i < index_size; i++) {
1727 switch (STRING_INDEX_TYPE(index)) {
1728 case STRING_INDEX8:
1729 Scm_Printf(port, " %8ld %8u\n", i-1, index->index8[i]); break;
1730 case STRING_INDEX16:
1731 Scm_Printf(port, " %8ld %8u\n", i-1, index->index16[i]); break;
1732 case STRING_INDEX32:
1733 Scm_Printf(port, " %8ld %8u\n", i-1, index->index32[i]); break;
1734 case STRING_INDEX64:
1735 Scm_Printf(port, " %8ld %8lu\n",i-1, index->index64[i]); break;
1736 }
1737 }
1738 }
1739
1740
1741 /*==================================================================
1742 *
1743 * String pointer (OBSOLETED)
1744 *
1745 */
1746
1747 #if GAUCHE_STRING_POINTER
1748
1749 SCM_DEFINE_BUILTIN_CLASS_SIMPLE(Scm_StringPointerClass, NULL);
1750
Scm_MakeStringPointer(ScmString * src,ScmSmallInt index,ScmSmallInt start,ScmSmallInt end)1751 ScmObj Scm_MakeStringPointer(ScmString *src, ScmSmallInt index,
1752 ScmSmallInt start, ScmSmallInt end)
1753 {
1754 const ScmStringBody *srcb = SCM_STRING_BODY(src);
1755 ScmSmallInt len = SCM_STRING_BODY_LENGTH(srcb);
1756 ScmSmallInt effective_size;
1757 const char *sptr, *ptr, *eptr;
1758
1759 SCM_CHECK_START_END(start, end, len);
1760 while (index < 0) index += (end - start) + 1;
1761 if (index > (end - start)) goto badindex;
1762
1763 sptr = forward_pos(srcb, SCM_STRING_BODY_START(srcb), start);
1764 ptr = forward_pos(srcb, sptr, index);
1765 if (end == len) {
1766 eptr = SCM_STRING_BODY_END(srcb);
1767 } else {
1768 eptr = forward_pos(srcb, sptr, end - start);
1769 }
1770 effective_size = eptr - sptr;
1771
1772 ScmStringPointer *sp = SCM_NEW(ScmStringPointer);
1773 SCM_SET_CLASS(sp, SCM_CLASS_STRING_POINTER);
1774 sp->length = (SCM_STRING_BODY_INCOMPLETE_P(srcb)? -1 : (end-start));
1775 sp->size = effective_size;
1776 sp->start = sptr;
1777 sp->index = index;
1778 sp->current = ptr;
1779 return SCM_OBJ(sp);
1780 badindex:
1781 Scm_Error("index out of range: %ld", index);
1782 return SCM_UNDEFINED;
1783 }
1784
Scm_StringPointerRef(ScmStringPointer * sp)1785 ScmObj Scm_StringPointerRef(ScmStringPointer *sp)
1786 {
1787 ScmChar ch;
1788 if (sp->length < 0 || sp->size == sp->length) {
1789 if (sp->index >= sp->size) return SCM_EOF;
1790 ch = *(const unsigned char*)sp->current;
1791 } else {
1792 if (sp->index >= sp->length) return SCM_EOF;
1793 SCM_CHAR_GET(sp->current, ch);
1794 }
1795 return SCM_MAKE_CHAR(ch);
1796 }
1797
Scm_StringPointerNext(ScmStringPointer * sp)1798 ScmObj Scm_StringPointerNext(ScmStringPointer *sp)
1799 {
1800 ScmChar ch;
1801 if (sp->length < 0 || sp->size == sp->length) {
1802 if (sp->index >= sp->size) return SCM_EOF;
1803 sp->index++;
1804 ch = *(const unsigned char*)sp->current++;
1805 } else {
1806 if (sp->index >= sp->length) return SCM_EOF;
1807 SCM_CHAR_GET(sp->current, ch);
1808 sp->index++;
1809 sp->current += SCM_CHAR_NFOLLOWS(*sp->current) + 1;
1810 }
1811 return SCM_MAKE_CHAR(ch);
1812 }
1813
Scm_StringPointerPrev(ScmStringPointer * sp)1814 ScmObj Scm_StringPointerPrev(ScmStringPointer *sp)
1815 {
1816 ScmChar ch;
1817 if (sp->index <= 0) return SCM_EOF;
1818 if (sp->length < 0 || sp->size == sp->length) {
1819 sp->index--;
1820 ch = *(const unsigned char*)--sp->current;
1821 } else {
1822 const char *prev;
1823 SCM_CHAR_BACKWARD(sp->current, sp->start, prev);
1824 SCM_ASSERT(prev != NULL);
1825 SCM_CHAR_GET(prev, ch);
1826 sp->index--;
1827 sp->current = prev;
1828 }
1829 return SCM_MAKE_CHAR(ch);
1830 }
1831
Scm_StringPointerSet(ScmStringPointer * sp,ScmSmallInt index)1832 ScmObj Scm_StringPointerSet(ScmStringPointer *sp, ScmSmallInt index)
1833 {
1834 if (index < 0) goto badindex;
1835 /* NB: Safe to cast index, for too large index would be rejected
1836 by the check. */
1837 if (sp->length < 0 || sp->size == sp->length) {
1838 if (index > sp->size) goto badindex;
1839 sp->index = index;
1840 sp->current = sp->start + index;
1841 } else {
1842 if (index > sp->length) goto badindex;
1843 sp->index = index;
1844 sp->current = forward_pos(NULL, sp->start, index);
1845 }
1846 return SCM_OBJ(sp);
1847 badindex:
1848 Scm_Error("index out of range: %ld", index);
1849 return SCM_UNDEFINED;
1850 }
1851
Scm_StringPointerSubstring(ScmStringPointer * sp,int afterp)1852 ScmObj Scm_StringPointerSubstring(ScmStringPointer *sp, int afterp)
1853 {
1854 /* TODO: set SCM_STRING_TERMINATED if applicable. */
1855 if (sp->length < 0) {
1856 if (afterp)
1857 return SCM_OBJ(make_str(-1, sp->size - sp->index, sp->current,
1858 0, NULL));
1859 else
1860 return SCM_OBJ(make_str(-1, sp->index, sp->start, 0, NULL));
1861 } else {
1862 if (afterp)
1863 return SCM_OBJ(make_str(sp->length - sp->index,
1864 sp->start + sp->size - sp->current,
1865 sp->current, 0, NULL));
1866 else
1867 return SCM_OBJ(make_str(sp->index,
1868 sp->current - sp->start,
1869 sp->start, 0, NULL));
1870 }
1871 }
1872
1873 /* Copy string pointer.
1874 Thanks to Alex Shinn (foof@synthcode.com) */
Scm_StringPointerCopy(ScmStringPointer * sp1)1875 ScmObj Scm_StringPointerCopy(ScmStringPointer *sp1)
1876 {
1877 ScmStringPointer *sp2 = SCM_NEW(ScmStringPointer);
1878 SCM_SET_CLASS(sp2, SCM_CLASS_STRING_POINTER);
1879 sp2->length = sp1->length;
1880 sp2->size = sp1->size;
1881 sp2->start = sp1->start;
1882 sp2->index = sp1->index;
1883 sp2->current = sp1->current;
1884 return SCM_OBJ(sp2);
1885 }
1886
1887 /* Dump string pointer info for debugging.
1888 Thanks to Alex Shinn (foof@synthcode.com) */
Scm_StringPointerDump(ScmStringPointer * sp1)1889 void Scm_StringPointerDump(ScmStringPointer *sp1)
1890 {
1891 Scm_Printf(SCM_CUROUT,
1892 "<sp addr: %p len: %d size: %d start: %p index: %d cur: %p>\n",
1893 sp1, sp1->length, sp1->size, sp1->start, sp1->index,
1894 sp1->current);
1895 }
1896
1897 #endif /*GAUCHE_STRING_POINTER*/
1898
1899 /*==================================================================
1900 *
1901 * String cursor API
1902 *
1903 */
1904
1905 /* Public interface */
Scm_StringCursorP(ScmObj obj)1906 int Scm_StringCursorP(ScmObj obj)
1907 {
1908 return SCM_STRING_CURSOR_P(obj);
1909 }
1910
make_string_cursor(ScmString * src,const char * ptr)1911 static ScmObj make_string_cursor(ScmString *src, const char *ptr)
1912 {
1913 const ScmStringBody *srcb = SCM_STRING_BODY(src);
1914
1915 if (ptr < SCM_STRING_BODY_START(srcb) ||
1916 ptr > SCM_STRING_BODY_END(srcb)) {
1917 Scm_Error("cursor out of range of %S: %ld",
1918 SCM_OBJ(src),
1919 (ScmSmallInt)(ptr - SCM_STRING_BODY_START(srcb)));
1920 }
1921
1922 ScmSmallInt offset = (ScmSmallInt)(ptr - SCM_STRING_BODY_START(srcb));
1923 if (!SCM_VM_RUNTIME_FLAG_IS_SET(Scm_VM(), SCM_SAFE_STRING_CURSORS) &&
1924 SCM_STRING_CURSOR_FITS_SMALL_P(offset)) {
1925 return SCM_MAKE_STRING_CURSOR_SMALL(offset);
1926 }
1927
1928 ScmStringCursorLarge *sc = SCM_NEW(ScmStringCursorLarge);
1929 SCM_SET_CLASS(sc, SCM_CLASS_STRING_CURSOR_LARGE);
1930 sc->offset = offset;
1931 sc->start = SCM_STRING_BODY_START(srcb);
1932 return SCM_OBJ(sc);
1933 }
1934
Scm_MakeStringCursorFromIndex(ScmString * src,ScmSmallInt index)1935 ScmObj Scm_MakeStringCursorFromIndex(ScmString *src, ScmSmallInt index)
1936 {
1937 const ScmStringBody *srcb = SCM_STRING_BODY(src);
1938 ScmSmallInt len = SCM_STRING_BODY_LENGTH(srcb);
1939 if (index < 0 || index > len) {
1940 Scm_Error("index out of range: %ld", index);
1941 }
1942 return make_string_cursor(src, index2ptr(srcb, index));
1943 }
1944
Scm_MakeStringCursorEnd(ScmString * src)1945 ScmObj Scm_MakeStringCursorEnd(ScmString *src)
1946 {
1947 const ScmStringBody *srcb = SCM_STRING_BODY(src);
1948
1949 ScmSmallInt offset = SCM_STRING_BODY_END(srcb) - SCM_STRING_BODY_START(srcb);
1950 if (!SCM_VM_RUNTIME_FLAG_IS_SET(Scm_VM(), SCM_SAFE_STRING_CURSORS) &&
1951 SCM_STRING_CURSOR_FITS_SMALL_P(offset)) {
1952 return SCM_MAKE_STRING_CURSOR_SMALL(offset);
1953 }
1954 ScmStringCursorLarge *sc = SCM_NEW(ScmStringCursorLarge);
1955 SCM_SET_CLASS(sc, SCM_CLASS_STRING_CURSOR_LARGE);
1956 sc->offset = offset;
1957 sc->start = SCM_STRING_BODY_START(srcb);
1958 return SCM_OBJ(sc);
1959 }
1960
Scm_StringCursorIndex(ScmString * src,ScmObj sc)1961 ScmObj Scm_StringCursorIndex(ScmString *src, ScmObj sc)
1962 {
1963 if (SCM_INTP(sc) || SCM_BIGNUMP(sc)) {
1964 return sc; /* no validation */
1965 }
1966
1967 const ScmStringBody *srcb = SCM_STRING_BODY(src);
1968 const char *ptr = NULL;
1969
1970 if ((ptr = string_cursor_ptr(srcb, sc)) == NULL) {
1971 Scm_Error("must be either an index or a cursor: %S", sc);
1972 }
1973
1974 if (SCM_STRING_BODY_SINGLE_BYTE_P(srcb) ||
1975 SCM_STRING_BODY_INCOMPLETE_P(srcb)) {
1976 return SCM_MAKE_INT(ptr - SCM_STRING_BODY_START(srcb));
1977 }
1978
1979 const char *current = SCM_STRING_BODY_START(srcb);
1980 ScmSmallInt len = SCM_STRING_BODY_LENGTH(srcb);
1981 ScmSmallInt index = 0;
1982 while (index < len && current < ptr) {
1983 current += SCM_CHAR_NFOLLOWS(*current) + 1;
1984 index++;
1985 }
1986 if (current != ptr) {
1987 Scm_Error("cursor not pointed at the beginning of a character: %S", sc);
1988 }
1989
1990 return SCM_MAKE_INT(index);
1991 }
1992
Scm_StringCursorForward(ScmString * s,ScmObj sc,int nchars)1993 ScmObj Scm_StringCursorForward(ScmString* s, ScmObj sc, int nchars)
1994 {
1995 if (nchars < 0) {
1996 Scm_Error("nchars is negative: %ld", nchars);
1997 }
1998
1999 if (SCM_INTEGERP(sc)) {
2000 return Scm_MakeStringCursorFromIndex(s, Scm_GetInteger(sc) + nchars);
2001 }
2002
2003 const ScmStringBody *srcb = SCM_STRING_BODY(s);
2004 const char *ptr = string_cursor_ptr(srcb, sc);
2005 if (ptr == NULL) {
2006 Scm_Error("must be either an index or a cursor: %S", sc);
2007 }
2008 return make_string_cursor(s, forward_pos(srcb, ptr, nchars));
2009 }
2010
Scm_StringCursorBack(ScmString * s,ScmObj sc,int nchars)2011 ScmObj Scm_StringCursorBack(ScmString* s, ScmObj sc, int nchars)
2012 {
2013 if (nchars < 0) {
2014 Scm_Error("nchars is negative: %ld", nchars);
2015 }
2016
2017 if (SCM_INTP(sc) || SCM_BIGNUMP(sc)) {
2018 return Scm_MakeStringCursorFromIndex(s, Scm_GetInteger(sc) - nchars);
2019 }
2020
2021 const ScmStringBody *srcb = SCM_STRING_BODY(s);
2022 const char *ptr = string_cursor_ptr(srcb, sc);
2023 if (ptr == NULL) {
2024 Scm_Error("must be either an index or a cursor: %S", sc);
2025 }
2026
2027 if (SCM_STRING_BODY_SINGLE_BYTE_P(srcb) ||
2028 SCM_STRING_BODY_INCOMPLETE_P(srcb)) {
2029 return make_string_cursor(s, ptr - nchars);
2030 }
2031
2032 while (nchars--) {
2033 const char *prev;
2034 SCM_CHAR_BACKWARD(ptr, SCM_STRING_BODY_START(srcb), prev);
2035 if (!prev) {
2036 Scm_Error("nchars out of range: %ld", nchars);
2037 }
2038 ptr = prev;
2039 }
2040
2041 return make_string_cursor(s, ptr);
2042 }
2043
Scm_StringRefCursor(ScmString * s,ScmObj sc,int range_error)2044 ScmChar Scm_StringRefCursor(ScmString* s, ScmObj sc, int range_error)
2045 {
2046 if (SCM_INTP(sc)) {
2047 return Scm_StringRef(s, SCM_INT_VALUE(sc), range_error);
2048 }
2049
2050 const ScmStringBody *srcb = SCM_STRING_BODY(s);
2051 const char *ptr = string_cursor_ptr(srcb, sc);
2052 if (ptr == NULL) {
2053 Scm_Error("must be either an index or a cursor: %S", sc);
2054 }
2055 if (ptr == SCM_STRING_BODY_END(srcb)) {
2056 if (range_error) {
2057 Scm_Error("cursor is at the end: %S", sc);
2058 } else {
2059 return SCM_CHAR_INVALID;
2060 }
2061 }
2062 ScmChar ch;
2063 SCM_CHAR_GET(ptr, ch);
2064 return ch;
2065 }
2066
Scm_SubstringCursor(ScmString * str,ScmObj start_scm,ScmObj end_scm)2067 ScmObj Scm_SubstringCursor(ScmString *str,
2068 ScmObj start_scm, ScmObj end_scm)
2069 {
2070 const ScmStringBody *sb = SCM_STRING_BODY(str);
2071 const char *start = string_cursor_ptr(sb, start_scm);
2072 const char *end = string_cursor_ptr(sb, end_scm);
2073
2074 if (start && end) {
2075 return substring_cursor(sb, start, end, FALSE);
2076 }
2077
2078 return substring(SCM_STRING_BODY(str),
2079 Scm_GetInteger(Scm_StringCursorIndex(str, start_scm)),
2080 Scm_GetInteger(Scm_StringCursorIndex(str, end_scm)),
2081 FALSE, FALSE);
2082 }
2083
Scm_StringCursorCompare(ScmObj sc1,ScmObj sc2,int (* numcmp)(ScmObj,ScmObj))2084 int Scm_StringCursorCompare(ScmObj sc1, ScmObj sc2,
2085 int (*numcmp)(ScmObj, ScmObj))
2086 {
2087 /*
2088 * Handle indexes separately, we can't mix index and cursor
2089 * because cursor is byte offset, not index.
2090 */
2091 if (SCM_INTP(sc1) && SCM_INTP(sc2)) {
2092 return numcmp(sc1, sc2);
2093 }
2094
2095 ScmSmallInt i1 = string_cursor_offset(sc1);
2096 ScmSmallInt i2 = string_cursor_offset(sc2);
2097 if (i1 < 0 || i2 < 0) {
2098 Scm_Error("arguments must be either both cursors or both indexes: %S vs %S", sc1, sc2);
2099 }
2100 return numcmp(SCM_MAKE_INT(i1), SCM_MAKE_INT(i2));
2101 }
2102
2103 /*==================================================================
2104 *
2105 * Dynamic strings
2106 *
2107 */
2108
2109 /* I used to use realloc() to grow the storage; now I avoid it, for
2110 Boehm GC's realloc almost always copies the original content and
2111 we don't get any benefit.
2112 The growing string is kept in the chained chunks. The size of
2113 chunk getting bigger as the string grows, until a certain threshold.
2114 The memory for actual chunks and the chain is allocated separately,
2115 in order to use SCM_NEW_ATOMIC.
2116 */
2117
2118 /* NB: it is important that DString functions don't call any
2119 * time-consuming procedures except memory allocation. Some of
2120 * mutex code in other parts relies on that fact.
2121 */
2122
2123 /* maximum chunk size */
2124 #define DSTRING_MAX_CHUNK_SIZE 8180
2125
Scm_DStringInit(ScmDString * dstr)2126 void Scm_DStringInit(ScmDString *dstr)
2127 {
2128 dstr->init.bytes = 0;
2129 dstr->anchor = dstr->tail = NULL;
2130 dstr->current = dstr->init.data;
2131 dstr->end = dstr->current + SCM_DSTRING_INIT_CHUNK_SIZE;
2132 dstr->lastChunkSize = SCM_DSTRING_INIT_CHUNK_SIZE;
2133 dstr->length = 0;
2134 }
2135
Scm_DStringSize(ScmDString * dstr)2136 ScmSmallInt Scm_DStringSize(ScmDString *dstr)
2137 {
2138 ScmSmallInt size;
2139 if (dstr->tail) {
2140 size = dstr->init.bytes;
2141 dstr->tail->chunk->bytes = dstr->current - dstr->tail->chunk->data;
2142 for (ScmDStringChain *chain = dstr->anchor; chain; chain = chain->next) {
2143 size += chain->chunk->bytes;
2144 }
2145 } else {
2146 size = dstr->init.bytes = dstr->current - dstr->init.data;
2147 }
2148 if (size > SCM_STRING_MAX_SIZE) {
2149 Scm_Error("Scm_DStringSize: size exceeded the range: %ld", size);
2150 }
2151 return size;
2152 }
2153
newChunk(ScmSmallInt size)2154 static ScmDStringChunk *newChunk(ScmSmallInt size)
2155 {
2156 return SCM_NEW_ATOMIC2(ScmDStringChunk*,
2157 (sizeof(ScmDStringChunk)
2158 +size-SCM_DSTRING_INIT_CHUNK_SIZE));
2159 }
2160
Scm__DStringRealloc(ScmDString * dstr,ScmSmallInt minincr)2161 void Scm__DStringRealloc(ScmDString *dstr, ScmSmallInt minincr)
2162 {
2163 /* sets the byte count of the last chunk */
2164 if (dstr->tail) {
2165 dstr->tail->chunk->bytes = dstr->current - dstr->tail->chunk->data;
2166 } else {
2167 dstr->init.bytes = dstr->current - dstr->init.data;
2168 }
2169
2170 /* determine the size of the new chunk. the increase factor 3 is
2171 somewhat arbitrary, determined by rudimental benchmarking. */
2172 ScmSmallInt newsize = dstr->lastChunkSize * 3;
2173 if (newsize > DSTRING_MAX_CHUNK_SIZE) {
2174 newsize = DSTRING_MAX_CHUNK_SIZE;
2175 }
2176 if (newsize < minincr) {
2177 newsize = minincr;
2178 }
2179
2180 ScmDStringChunk *newchunk = newChunk(newsize);
2181 newchunk->bytes = 0;
2182 ScmDStringChain *newchain = SCM_NEW(ScmDStringChain);
2183
2184 newchain->next = NULL;
2185 newchain->chunk = newchunk;
2186 if (dstr->tail) {
2187 dstr->tail->next = newchain;
2188 dstr->tail = newchain;
2189 } else {
2190 dstr->anchor = dstr->tail = newchain;
2191 }
2192 dstr->current = newchunk->data;
2193 dstr->end = newchunk->data + newsize;
2194 dstr->lastChunkSize = newsize;
2195 }
2196
2197 /* Retrieve accumulated string. */
dstring_getz(ScmDString * dstr,ScmSmallInt * psiz,ScmSmallInt * plen,int noalloc)2198 static const char *dstring_getz(ScmDString *dstr,
2199 ScmSmallInt *psiz,
2200 ScmSmallInt *plen,
2201 int noalloc)
2202 {
2203 ScmSmallInt size, len;
2204 char *buf;
2205 if (dstr->anchor == NULL) {
2206 /* we only have one chunk */
2207 size = dstr->current - dstr->init.data;
2208 CHECK_SIZE(size);
2209 len = dstr->length;
2210 if (noalloc) {
2211 buf = dstr->init.data;
2212 } else {
2213 buf = SCM_STRDUP_PARTIAL(dstr->init.data, size);
2214 }
2215 } else {
2216 ScmDStringChain *chain = dstr->anchor;
2217 char *bptr;
2218
2219 size = Scm_DStringSize(dstr);
2220 CHECK_SIZE(size);
2221 len = dstr->length;
2222 bptr = buf = SCM_NEW_ATOMIC2(char*, size+1);
2223
2224 memcpy(bptr, dstr->init.data, dstr->init.bytes);
2225 bptr += dstr->init.bytes;
2226 for (; chain; chain = chain->next) {
2227 memcpy(bptr, chain->chunk->data, chain->chunk->bytes);
2228 bptr += chain->chunk->bytes;
2229 }
2230 *bptr = '\0';
2231 }
2232 if (len < 0) len = count_length(buf, size);
2233 if (plen) *plen = len;
2234 if (psiz) *psiz = size;
2235 return buf;
2236 }
2237
Scm_DStringGet(ScmDString * dstr,u_long flags)2238 ScmObj Scm_DStringGet(ScmDString *dstr, u_long flags)
2239 {
2240 ScmSmallInt len, size;
2241 const char *str = dstring_getz(dstr, &size, &len, FALSE);
2242 return SCM_OBJ(make_str(len, size, str, flags|SCM_STRING_TERMINATED, NULL));
2243 }
2244
2245 /* For conveninence. Note that dstr may already contain NUL byte in it,
2246 in that case you'll get chopped string. */
Scm_DStringGetz(ScmDString * dstr)2247 const char *Scm_DStringGetz(ScmDString *dstr)
2248 {
2249 ScmSmallInt len, size;
2250 return dstring_getz(dstr, &size, &len, FALSE);
2251 }
2252
2253 /* Concatenate all chains in DString into one chunk. Externally nothing
2254 really changes, but this can be used to optimize allocation. */
Scm_DStringWeld(ScmDString * dstr)2255 void Scm_DStringWeld(ScmDString *dstr)
2256 {
2257 if (dstr->anchor == NULL) return; /* nothing to do */
2258 ScmDStringChain *chain = dstr->anchor;
2259 ScmSmallInt size = Scm_DStringSize(dstr);
2260 ScmSmallInt bufsiz = size + (dstr->end - dstr->current);
2261 ScmDStringChunk *newchunk = newChunk(bufsiz);
2262 newchunk->bytes = size;
2263 char *bptr = newchunk->data;
2264 memcpy(bptr, dstr->init.data, dstr->init.bytes);
2265 bptr += dstr->init.bytes;
2266 for (; chain; chain = chain->next) {
2267 memcpy(bptr, chain->chunk->data, chain->chunk->bytes);
2268 bptr += chain->chunk->bytes;
2269 }
2270 dstr->init.bytes = 0;
2271 dstr->anchor->chunk = newchunk;
2272 dstr->anchor->next = NULL;
2273 dstr->tail = dstr->anchor;
2274 dstr->current = newchunk->data + size;
2275 dstr->end = newchunk->data + bufsiz;
2276 dstr->lastChunkSize = bufsiz;
2277 }
2278
2279 /* Returns the current content of DString, along with byte size and character
2280 length. The returned pointer may not be NUL-terminated.
2281
2282 Unlike Scm_DStringGet[z], returned pointer can directly points into
2283 the internal buffer of Scm_DString; especially, this never allocates
2284 if DString only uses initial buffer. The caller should be aware that
2285 the returned content may be altered by further DString operation. */
Scm_DStringPeek(ScmDString * dstr,ScmSmallInt * size,ScmSmallInt * len)2286 const char *Scm_DStringPeek(ScmDString *dstr,
2287 ScmSmallInt *size, ScmSmallInt *len)
2288 {
2289 Scm_DStringWeld(dstr);
2290 if (dstr->anchor == NULL) {
2291 if (size) *size = dstr->current - dstr->init.data;
2292 if (len) *len = dstr->length;
2293 return dstr->init.data;
2294 } else {
2295 if (size) *size = dstr->anchor->chunk->bytes;
2296 if (len) *len = dstr->length;
2297 return dstr->anchor->chunk->data;
2298 }
2299 }
2300
Scm_DStringPutz(ScmDString * dstr,const char * str,ScmSmallInt size)2301 void Scm_DStringPutz(ScmDString *dstr, const char *str, ScmSmallInt size)
2302 {
2303 if (size < 0) size = strlen(str);
2304 if (dstr->current + size > dstr->end) {
2305 Scm__DStringRealloc(dstr, size);
2306 }
2307 memcpy(dstr->current, str, size);
2308 dstr->current += size;
2309 if (dstr->length >= 0) {
2310 ScmSmallInt len = count_length(str, size);
2311 if (len >= 0) dstr->length += len;
2312 else dstr->length = -1;
2313 }
2314 }
2315
Scm_DStringAdd(ScmDString * dstr,ScmString * str)2316 void Scm_DStringAdd(ScmDString *dstr, ScmString *str)
2317 {
2318 const ScmStringBody *b = SCM_STRING_BODY(str);
2319 ScmSmallInt size = SCM_STRING_BODY_SIZE(b);
2320 if (size == 0) return;
2321 if (dstr->current + size > dstr->end) {
2322 Scm__DStringRealloc(dstr, size);
2323 }
2324 memcpy(dstr->current, SCM_STRING_BODY_START(b), size);
2325 dstr->current += size;
2326 if (dstr->length >= 0 && !SCM_STRING_BODY_INCOMPLETE_P(b)) {
2327 dstr->length += SCM_STRING_BODY_LENGTH(b);
2328 } else {
2329 dstr->length = -1;
2330 }
2331 }
2332
Scm_DStringPutb(ScmDString * ds,char byte)2333 void Scm_DStringPutb(ScmDString *ds, char byte)
2334 {
2335 SCM_DSTRING_PUTB(ds, byte);
2336 }
2337
Scm_DStringPutc(ScmDString * ds,ScmChar ch)2338 void Scm_DStringPutc(ScmDString *ds, ScmChar ch)
2339 {
2340 SCM_DSTRING_PUTC(ds, ch);
2341 }
2342
2343 /* Truncate DString at the specified size.
2344 Returns after-truncation size (it may be smaller than newsize if
2345 the original DString isn't as large as newsize. */
Scm_DStringTruncate(ScmDString * dstr,ScmSmallInt newsize)2346 ScmSmallInt Scm_DStringTruncate(ScmDString *dstr, ScmSmallInt newsize)
2347 {
2348 ScmSmallInt origsize = Scm_DStringSize(dstr);
2349
2350 if (newsize < dstr->init.bytes) {
2351 dstr->init.bytes = newsize;
2352 dstr->anchor = NULL;
2353 dstr->tail = NULL;
2354 dstr->current = dstr->init.data + newsize;
2355 dstr->end = dstr->init.data + SCM_DSTRING_INIT_CHUNK_SIZE;
2356 } else {
2357 if (newsize >= origsize) return origsize;
2358 ScmDStringChain *chain = dstr->anchor;
2359 ScmSmallInt ss = dstr->init.bytes;
2360 for (; chain; chain = chain->next) {
2361 if (newsize < ss + chain->chunk->bytes) {
2362 /* truncate this chunk */
2363 if (chain == dstr->tail) {
2364 chain->chunk->bytes = newsize - ss;
2365 dstr->current = chain->chunk->data + newsize - ss;
2366 } else {
2367 dstr->lastChunkSize = chain->chunk->bytes;
2368 dstr->end = chain->chunk->data + chain->chunk->bytes;
2369 chain->chunk->bytes = newsize - ss;
2370 chain->next = NULL;
2371 dstr->tail = chain;
2372 dstr->current = chain->chunk->data + newsize - ss;
2373 }
2374 break;
2375 }
2376 ss += chain->chunk->bytes;
2377 }
2378 SCM_ASSERT(chain != NULL);
2379 }
2380
2381 /* If we accumulated only ASCII, we can adjust length as well. */
2382 if (dstr->length == origsize || newsize == 0) dstr->length = newsize;
2383 else dstr->length = -1;
2384 return newsize;
2385 }
2386
2387
2388 /* for debug */
Scm_DStringDump(FILE * out,ScmDString * dstr)2389 void Scm_DStringDump(FILE *out, ScmDString *dstr)
2390 {
2391 fprintf(out, "DString %p\n", dstr);
2392 if (dstr->anchor) {
2393 fprintf(out, " chunk0[%3ld] = \"", dstr->init.bytes);
2394 SCM_IGNORE_RESULT(fwrite(dstr->init.data, 1, dstr->init.bytes, out));
2395 fprintf(out, "\"\n");
2396 ScmDStringChain *chain = dstr->anchor;
2397 for (int i=1; chain; chain = chain->next, i++) {
2398 ScmSmallInt size = (chain->next? chain->chunk->bytes : (dstr->current - dstr->tail->chunk->data));
2399 fprintf(out, " chunk%d[%3ld] = \"", i, size);
2400 SCM_IGNORE_RESULT(fwrite(chain->chunk->data, 1, size, out));
2401 fprintf(out, "\"\n");
2402 }
2403 } else {
2404 ScmSmallInt size = dstr->current - dstr->init.data;
2405 fprintf(out, " chunk0[%3ld] = \"", size);
2406 SCM_IGNORE_RESULT(fwrite(dstr->init.data, 1, size, out));
2407 fprintf(out, "\"\n");
2408 }
2409 }
2410