1 /*
2  * string.c - string implementation
3  *
4  *   Copyright (c) 2000-2020  Shiro Kawai  <shiro@acm.org>
5  *
6  *   Redistribution and use in source and binary forms, with or without
7  *   modification, are permitted provided that the following conditions
8  *   are met:
9  *
10  *   1. Redistributions of source code must retain the above copyright
11  *      notice, this list of conditions and the following disclaimer.
12  *
13  *   2. Redistributions in binary form must reproduce the above copyright
14  *      notice, this list of conditions and the following disclaimer in the
15  *      documentation and/or other materials provided with the distribution.
16  *
17  *   3. Neither the name of the authors nor the names of its contributors
18  *      may be used to endorse or promote products derived from this
19  *      software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
27  *   TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28  *   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29  *   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30  *   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31  *   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #define LIBGAUCHE_BODY
35 #include "gauche.h"
36 #include "gauche/priv/stringP.h"
37 
38 #include <string.h>
39 #include <ctype.h>
40 
41 void Scm_DStringDump(FILE *out, ScmDString *dstr);
42 static ScmObj make_string_cursor(ScmString *src, const char *cursor);
43 
44 static void string_print(ScmObj obj, ScmPort *port, ScmWriteContext *ctx);
45 SCM_DEFINE_BUILTIN_CLASS(Scm_StringClass, string_print, NULL, NULL, NULL,
46                          SCM_CLASS_SEQUENCE_CPL);
47 
48 #define CHECK_SIZE(siz)                                         \
49     do {                                                        \
50         if ((siz) > SCM_STRING_MAX_SIZE) {                      \
51             Scm_Error("string size too big: %ld", (siz));       \
52         }                                                       \
53     } while (0)
54 
55 /* Internal primitive constructor.   LEN can be negative if the string
56    is incomplete. */
make_str(ScmSmallInt len,ScmSmallInt siz,const char * p,u_long flags,const void * index)57 static ScmString *make_str(ScmSmallInt len, ScmSmallInt siz,
58                            const char *p, u_long flags,
59                            const void *index)
60 {
61     if (len < 0) flags |= SCM_STRING_INCOMPLETE;
62     if (flags & SCM_STRING_INCOMPLETE) len = siz;
63 
64     if (siz > SCM_STRING_MAX_SIZE) {
65         Scm_Error("string size too big: %ld", siz);
66     }
67     if (len > siz) {
68         Scm_Error("string length (%ld) exceeds size (%ld)", len, siz);
69     }
70 
71     ScmString *s = SCM_NEW(ScmString);
72     SCM_SET_CLASS(s, SCM_CLASS_STRING);
73     s->body = NULL;
74     s->initialBody.flags = flags & SCM_STRING_FLAG_MASK;
75     s->initialBody.length = len;
76     s->initialBody.size = siz;
77     s->initialBody.start = p;
78     s->initialBody.index = index;
79     return s;
80 }
81 
82 #define DUMP_LENGTH   50
83 
84 /* for debug */
Scm_StringDump(FILE * out,ScmObj str)85 void Scm_StringDump(FILE *out, ScmObj str)
86 {
87     const ScmStringBody *b = SCM_STRING_BODY(str);
88     ScmSmallInt s = SCM_STRING_BODY_SIZE(b);
89     const char *p = SCM_STRING_BODY_START(b);
90 
91     fprintf(out, "STR(len=%ld,siz=%ld) \"", SCM_STRING_BODY_LENGTH(b), s);
92     for (int i=0; i < DUMP_LENGTH && s > 0;) {
93         int n = SCM_CHAR_NFOLLOWS(*p) + 1;
94         for (; n > 0 && s > 0; p++, n--, s--, i++) {
95             putc(*p, out);
96         }
97     }
98     if (s > 0) {
99         fputs("...\"\n", out);
100     } else {
101         fputs("\"\n", out);
102     }
103 }
104 
105 /* Like GC_strndup, but we don't require the source string to be
106    NUL-terminated (instead, we trust the caller that the size
107    argument is in valid range.) */
Scm_StrdupPartial(const char * src,size_t size)108 char *Scm_StrdupPartial(const char *src, size_t size)
109 {
110     char *dst = SCM_NEW_ATOMIC_ARRAY(char, size+1);
111     memcpy(dst, src, size);
112     dst[size] = '\0';
113     return dst;
114 }
115 
116 /*
117  * Multibyte length calculation
118  */
119 
120 /* We have multiple similar functions, due to performance reasons. */
121 
122 /* Calculate both length and size of C-string str.
123    If str is incomplete, *plen gets -1. */
count_size_and_length(const char * str,ScmSmallInt * psize,ScmSmallInt * plen)124 static inline ScmSmallInt count_size_and_length(const char *str,
125                                                 ScmSmallInt *psize, /* out */
126                                                 ScmSmallInt *plen)  /* out */
127 {
128     char c;
129     int incomplete = FALSE;
130     const char *p = str;
131     ScmSmallInt size = 0, len = 0;
132     while ((c = *p++) != 0) {
133         int i = SCM_CHAR_NFOLLOWS(c);
134         len++;
135         size += i+1;
136 
137         ScmChar ch;
138         SCM_CHAR_GET(p-1, ch);
139         if (ch == SCM_CHAR_INVALID) incomplete = TRUE;
140         /* Check every octet to avoid skipping over terminating NUL. */
141         while (i-- > 0) {
142             if (!*p++) { incomplete = TRUE; goto eos; }
143         }
144     }
145   eos:
146     if (incomplete) len = -1;
147     *psize = size;
148     *plen = len;
149     return len;
150 }
151 
152 /* Calculate length of known size string.  str can contain NUL character. */
count_length(const char * str,ScmSmallInt size)153 static inline ScmSmallInt count_length(const char *str, ScmSmallInt size)
154 {
155     ScmSmallInt count = 0;
156     while (size-- > 0) {
157         unsigned char c = (unsigned char)*str;
158         int i = SCM_CHAR_NFOLLOWS(c);
159         if (i < 0 || i > size) return -1;
160         ScmChar ch;
161         SCM_CHAR_GET(str, ch);
162         if (ch == SCM_CHAR_INVALID) return -1;
163         count++;
164         str += i+1;
165         size -= i;
166     }
167     return count;
168 }
169 
170 /* Returns length of string, starts from str and end at stop.
171    If stop is NULL, str is regarded as C-string (NUL terminated).
172    If the string is incomplete, returns -1. */
Scm_MBLen(const char * str,const char * stop)173 ScmSmallInt Scm_MBLen(const char *str, const char *stop)
174 {
175     ScmSmallInt size = (stop == NULL)? (ScmSmallInt)strlen(str) : (stop - str);
176     ScmSmallInt len = count_length(str, size);
177     if (len > SCM_STRING_MAX_LENGTH) {
178         Scm_Error("Scm_MBLen: length too big: %ld", len);
179     }
180     return len;
181 }
182 
183 /*----------------------------------------------------------------
184  * Cursors
185  */
186 
cursor_print(ScmObj obj,ScmPort * port,ScmWriteContext * mode SCM_UNUSED)187 static void cursor_print(ScmObj obj, ScmPort *port,
188                          ScmWriteContext *mode SCM_UNUSED)
189 {
190     Scm_Printf(port, "#<string-cursor-large %ld>",
191                SCM_STRING_CURSOR_LARGE_OFFSET(obj));
192 }
193 
194 static ScmClass *cursor_cpl[] = {
195     SCM_CLASS_STATIC_PTR(Scm_StringCursorClass),
196     SCM_CLASS_STATIC_PTR(Scm_TopClass),
197     NULL
198 };
199 
200 SCM_DEFINE_BUILTIN_CLASS_SIMPLE(Scm_StringCursorClass, NULL);
201 SCM_DEFINE_BUILTIN_CLASS(Scm_StringCursorLargeClass, cursor_print, NULL, NULL,
202                          NULL, cursor_cpl);
203 
204 /* Common routine to get hold of the pointer from string cursor.
205    Returns NULL if SC isn't a string cursor.
206    Raise an error if sc is not in the range. */
string_cursor_ptr(const ScmStringBody * sb,ScmObj sc)207 static inline const char *string_cursor_ptr(const ScmStringBody *sb, ScmObj sc)
208 {
209     const char *ptr = NULL;
210     if (SCM_STRING_CURSOR_LARGE_P(sc)) {
211         if (SCM_STRING_BODY_START(sb) != SCM_STRING_CURSOR_LARGE_START(sc)) {
212             Scm_Error("invalid cursor (made for string '%s'): %S",
213                       SCM_STRING_CURSOR_LARGE_START(sc), sc);
214         }
215         ptr = SCM_STRING_CURSOR_LARGE_POINTER(sb, sc);
216     } else if (SCM_STRING_CURSOR_SMALL_P(sc)) {
217         ptr = SCM_STRING_CURSOR_SMALL_POINTER(sb, sc);
218     } else {
219         return NULL;
220     }
221     if (ptr < SCM_STRING_BODY_START(sb) ||
222         ptr > SCM_STRING_BODY_END(sb)) {
223         Scm_Error("cursor out of range: %S", sc);
224     }
225     return ptr;
226 }
227 
228 /* Returns -1 if sc isn't a cursor.  No range check performed. */
string_cursor_offset(ScmObj sc)229 static inline ScmSmallInt string_cursor_offset(ScmObj sc) {
230     if (SCM_STRING_CURSOR_LARGE_P(sc)) {
231         return SCM_STRING_CURSOR_LARGE_OFFSET(sc);
232     } else if (SCM_STRING_CURSOR_SMALL_P(sc)) {
233         return SCM_STRING_CURSOR_SMALL_OFFSET(sc);
234     } else {
235         return -1;
236     }
237 }
238 
239 /*----------------------------------------------------------------
240  * Constructors
241  */
242 
243 /* General constructor. */
Scm_MakeString(const char * str,ScmSmallInt size,ScmSmallInt len,u_long flags)244 ScmObj Scm_MakeString(const char *str, ScmSmallInt size, ScmSmallInt len,
245                       u_long flags)
246 {
247     flags &= ~SCM_STRING_TERMINATED;
248 
249     if (size < 0) {
250         count_size_and_length(str, &size, &len);
251         flags |= SCM_STRING_TERMINATED;
252     } else {
253         if (len < 0) len = count_length(str, size);
254     }
255     /* Range of size and len will be checked in make_str */
256 
257     ScmString *s;
258     if (flags & SCM_STRING_COPYING) {
259         flags |= SCM_STRING_TERMINATED; /* SCM_STRDUP_PARTIAL terminates the result str */
260         s = make_str(len, size, SCM_STRDUP_PARTIAL(str, size), flags, NULL);
261     } else {
262         s = make_str(len, size, str, flags, NULL);
263     }
264     return SCM_OBJ(s);
265 }
266 
Scm_MakeFillString(ScmSmallInt len,ScmChar fill)267 ScmObj Scm_MakeFillString(ScmSmallInt len, ScmChar fill)
268 {
269     if (len < 0) Scm_Error("length out of range: %ld", len);
270     ScmSmallInt csize = SCM_CHAR_NBYTES(fill);
271     CHECK_SIZE(csize*len);
272     char *ptr = SCM_NEW_ATOMIC2(char *, csize*len+1);
273     char *p = ptr;
274     for (ScmSmallInt i=0; i<len; i++, p+=csize) {
275         SCM_CHAR_PUT(p, fill);
276     }
277     ptr[csize*len] = '\0';
278     return SCM_OBJ(make_str(len, csize*len, ptr, SCM_STRING_TERMINATED, NULL));
279 }
280 
Scm_ListToString(ScmObj chars)281 ScmObj Scm_ListToString(ScmObj chars)
282 {
283     ScmSmallInt size = 0, len = 0;
284 
285     ScmObj cp;
286     SCM_FOR_EACH(cp, chars) {
287         if (!SCM_CHARP(SCM_CAR(cp)))
288             Scm_Error("character required, but got %S", SCM_CAR(cp));
289         ScmChar ch = SCM_CHAR_VALUE(SCM_CAR(cp));
290         size += SCM_CHAR_NBYTES(ch);
291         len++;
292         CHECK_SIZE(size);
293     }
294     char *buf = SCM_NEW_ATOMIC2(char *, size+1);
295     char *bufp = buf;
296     SCM_FOR_EACH(cp, chars) {
297         ScmChar ch = SCM_CHAR_VALUE(SCM_CAR(cp));
298         SCM_CHAR_PUT(bufp, ch);
299         bufp += SCM_CHAR_NBYTES(ch);
300     }
301     *bufp = '\0';
302     return Scm_MakeString(buf, size, len, 0);
303 }
304 
305 /* Extract string as C-string.  This one guarantees to return
306    mutable string (we always copy) */
Scm_GetString(ScmString * str)307 char *Scm_GetString(ScmString *str)
308 {
309     const ScmStringBody *b = SCM_STRING_BODY(str);
310     return SCM_STRDUP_PARTIAL(SCM_STRING_BODY_START(b), SCM_STRING_BODY_SIZE(b));
311 }
312 
313 /* Common routine for Scm_GetStringConst and Scm_GetStringContent */
get_string_from_body(const ScmStringBody * b)314 static const char *get_string_from_body(const ScmStringBody *b)
315 {
316     ScmSmallInt size = SCM_STRING_BODY_SIZE(b);
317     if (SCM_STRING_BODY_HAS_FLAG(b, SCM_STRING_TERMINATED)) {
318         /* we can use string data as C-string */
319         return SCM_STRING_BODY_START(b);
320     } else {
321         char *p = SCM_STRDUP_PARTIAL(SCM_STRING_BODY_START(b), size);
322         /* kludge! This breaks 'const' qualification, but we know
323            this is an idempotent operation from the outside.  Note that
324            this is safe even multiple threads execute this part
325            simultaneously. */
326         ((ScmStringBody*)b)->start = p; /* discard const qualifier */
327         ((ScmStringBody*)b)->flags |= SCM_STRING_TERMINATED;
328         return p;
329     }
330 }
331 
332 /* Extract string as C-string.  Returned string is immutable,
333    so we can directly return the body of the string.  We do not
334    allow string containing NUL to be passed to C world, for it
335    would be a security risk.
336    TODO: Let the string body have a flag so that we don't need
337    to scan the string every time.
338 */
Scm_GetStringConst(ScmString * str)339 const char *Scm_GetStringConst(ScmString *str)
340 {
341     const ScmStringBody *b = SCM_STRING_BODY(str);
342     if (memchr(SCM_STRING_BODY_START(b), 0, SCM_STRING_BODY_SIZE(b))) {
343         Scm_Error("A string containing NUL character is not allowed: %S",
344                   SCM_OBJ(str));
345     }
346     return get_string_from_body(b);
347 }
348 
349 /* Atomically extracts C-string, length, size, and incomplete flag.
350    MT-safe. */
351 /* NB: Output parameters are int's for the ABI compatibility. */
Scm_GetStringContent(ScmString * str,ScmSmallInt * psize,ScmSmallInt * plength,u_long * pflags)352 const char *Scm_GetStringContent(ScmString *str,
353                                  ScmSmallInt *psize,   /* out */
354                                  ScmSmallInt *plength, /* out */
355                                  u_long *pflags)       /* out */
356 {
357     const ScmStringBody *b = SCM_STRING_BODY(str);
358     if (psize)   *psize = SCM_STRING_BODY_SIZE(b);
359     if (plength) *plength = SCM_STRING_BODY_LENGTH(b);
360     if (pflags) *pflags = SCM_STRING_BODY_FLAGS(b);
361     return get_string_from_body(b);
362 }
363 
364 
365 /* Copy string.  You can modify the flags of the newly created string
366    by FLAGS and MASK arguments; for the bits set in MASK, corresponding
367    bits in FLAGS are copied to the new string, and for other bits, the
368    original flags are copied.
369 
370    The typical semantics of copy-string is achieved by passing 0 to
371    FLAGS and SCM_STRING_IMMUTABLE to MASK (i.e. reset IMMUTABLE flag,
372    and keep other flags intact.
373 
374    NB: This routine doesn't check whether specified flag is valid
375    with the string content, i.e. you can drop INCOMPLETE flag with
376    copying, while the string content won't be checked if it consists
377    valid complete string. */
Scm_CopyStringWithFlags(ScmString * x,u_long flags,u_long mask)378 ScmObj Scm_CopyStringWithFlags(ScmString *x, u_long flags, u_long mask)
379 {
380     const ScmStringBody *b = SCM_STRING_BODY(x);
381     ScmSmallInt size = SCM_STRING_BODY_SIZE(b);
382     ScmSmallInt len  = SCM_STRING_BODY_LENGTH(b);
383     const char *start = SCM_STRING_BODY_START(b);
384     const void *index = b->index;
385     u_long newflags = ((SCM_STRING_BODY_FLAGS(b) & ~mask)
386                        | (flags & mask));
387 
388     return SCM_OBJ(make_str(len, size, start, newflags, index));
389 }
390 
391 /* OBSOLETED */
Scm_StringCompleteToIncomplete(ScmString * x)392 ScmObj Scm_StringCompleteToIncomplete(ScmString *x)
393 {
394     Scm_Warn("Obsoleted C API Scm_StringCompleteToIncomplete called");
395     ScmObj proc = SCM_UNDEFINED;
396     SCM_BIND_PROC(proc, "string-complete->incomplete", Scm_GaucheModule());
397     return Scm_ApplyRec1(proc, SCM_OBJ(x));
398 }
399 
400 /* OBSOLETED */
Scm_StringIncompleteToComplete(ScmString * x,int handling,ScmChar substitute)401 ScmObj Scm_StringIncompleteToComplete(ScmString *x,
402                                       int handling,
403                                       ScmChar substitute)
404 {
405     Scm_Warn("Obsoleted C API Scm_StringIncompleteToComplete called");
406     ScmObj proc = SCM_UNDEFINED;
407     SCM_BIND_PROC(proc, "string-incomplete->complete", Scm_GaucheModule());
408     ScmObj r;
409     if (handling == SCM_ILLEGAL_CHAR_REJECT) {
410         r = Scm_ApplyRec1(proc, SCM_OBJ(x));
411     } else if (handling == SCM_ILLEGAL_CHAR_OMIT) {
412         r = Scm_ApplyRec2(proc, SCM_OBJ(x), SCM_MAKE_KEYWORD("omit"));
413     } else {
414         r = Scm_ApplyRec2(proc, SCM_OBJ(x), SCM_MAKE_CHAR(substitute));
415     }
416     return r;
417 }
418 
419 /*----------------------------------------------------------------
420  * Comparison
421  */
422 
423 /* TODO: merge Equal and Cmp API; required generic comparison protocol */
Scm_StringEqual(ScmString * x,ScmString * y)424 int Scm_StringEqual(ScmString *x, ScmString *y)
425 {
426     const ScmStringBody *xb = SCM_STRING_BODY(x);
427     const ScmStringBody *yb = SCM_STRING_BODY(y);
428     if ((SCM_STRING_BODY_FLAGS(xb)^SCM_STRING_BODY_FLAGS(yb))&SCM_STRING_INCOMPLETE) {
429         return FALSE;
430     }
431     if (SCM_STRING_BODY_SIZE(xb) != SCM_STRING_BODY_SIZE(yb)) {
432         return FALSE;
433     }
434     return (memcmp(SCM_STRING_BODY_START(xb),
435                    SCM_STRING_BODY_START(yb),
436                    SCM_STRING_BODY_SIZE(xb)) == 0? TRUE : FALSE);
437 }
438 
Scm_StringCmp(ScmString * x,ScmString * y)439 int Scm_StringCmp(ScmString *x, ScmString *y)
440 {
441     const ScmStringBody *xb = SCM_STRING_BODY(x);
442     const ScmStringBody *yb = SCM_STRING_BODY(y);
443     ScmSmallInt sizx = SCM_STRING_BODY_SIZE(xb);
444     ScmSmallInt sizy = SCM_STRING_BODY_SIZE(yb);
445     ScmSmallInt siz = (sizx < sizy)? sizx : sizy;
446     int r = memcmp(SCM_STRING_BODY_START(xb), SCM_STRING_BODY_START(yb), siz);
447     if (r == 0) {
448         if (sizx == sizy) {
449             if (SCM_STRING_BODY_INCOMPLETE_P(xb)) {
450                 if (SCM_STRING_BODY_INCOMPLETE_P(yb)) return 0;
451                 else                                  return 1;
452             } else {
453                 if (SCM_STRING_BODY_INCOMPLETE_P(yb)) return -1;
454                 else                                  return 0;
455             }
456         }
457         if (sizx < sizy)  return -1;
458         else              return 1;
459     } else if (r < 0) {
460         return -1;
461     } else {
462         return 1;
463     }
464 }
465 
466 /* single-byte case insensitive comparison */
sb_strcasecmp(const char * px,ScmSmallInt sizx,const char * py,ScmSmallInt sizy)467 static int sb_strcasecmp(const char *px, ScmSmallInt sizx,
468                          const char *py, ScmSmallInt sizy)
469 {
470     for (; sizx > 0 && sizy > 0; sizx--, sizy--, px++, py++) {
471         char cx = tolower((u_char)*px);
472         char cy = tolower((u_char)*py);
473         if (cx == cy) continue;
474         return (cx - cy);
475     }
476     if (sizx > 0) return 1;
477     if (sizy > 0) return -1;
478     return 0;
479 }
480 
481 /* multi-byte case insensitive comparison */
mb_strcasecmp(const char * px,ScmSmallInt lenx,const char * py,ScmSmallInt leny)482 static int mb_strcasecmp(const char *px, ScmSmallInt lenx,
483                          const char *py, ScmSmallInt leny)
484 {
485     int ix, iy;
486     for (; lenx > 0 && leny > 0; lenx--, leny--, px+=ix, py+=iy) {
487         int cx, cy;
488         SCM_CHAR_GET(px, cx);
489         SCM_CHAR_GET(py, cy);
490         int ccx = SCM_CHAR_UPCASE(cx);
491         int ccy = SCM_CHAR_UPCASE(cy);
492         if (ccx != ccy) return (ccx - ccy);
493         ix = SCM_CHAR_NBYTES(cx);
494         iy = SCM_CHAR_NBYTES(cy);
495     }
496     if (lenx > 0) return 1;
497     if (leny > 0) return -1;
498     return 0;
499 }
500 
Scm_StringCiCmp(ScmString * x,ScmString * y)501 int Scm_StringCiCmp(ScmString *x, ScmString *y)
502 {
503     const ScmStringBody *xb = SCM_STRING_BODY(x);
504     const ScmStringBody *yb = SCM_STRING_BODY(y);
505 
506     if ((SCM_STRING_BODY_FLAGS(xb)^SCM_STRING_BODY_FLAGS(yb))&SCM_STRING_INCOMPLETE) {
507         Scm_Error("cannot compare incomplete strings in case-insensitive way: %S, %S",
508                   SCM_OBJ(x), SCM_OBJ(y));
509     }
510     ScmSmallInt sizx = SCM_STRING_BODY_SIZE(xb);
511     ScmSmallInt lenx = SCM_STRING_BODY_LENGTH(xb);
512     ScmSmallInt sizy = SCM_STRING_BODY_SIZE(yb);
513     ScmSmallInt leny = SCM_STRING_BODY_LENGTH(yb);
514     const char *px = SCM_STRING_BODY_START(xb);
515     const char *py = SCM_STRING_BODY_START(yb);
516 
517     if (sizx == lenx && sizy == leny) {
518         return sb_strcasecmp(px, sizx, py, sizy);
519     } else {
520         return mb_strcasecmp(px, lenx, py, leny);
521     }
522 }
523 
524 /*----------------------------------------------------------------
525  * Reference
526  */
527 
528 /* Advance ptr for NCHARS characters.  Args assumed in boundary. */
forward_pos(const ScmStringBody * body,const char * current,ScmSmallInt nchars)529 static inline const char *forward_pos(const ScmStringBody *body,
530                                       const char *current,
531                                       ScmSmallInt nchars)
532 {
533     if (body && (SCM_STRING_BODY_SINGLE_BYTE_P(body) ||
534                  SCM_STRING_BODY_INCOMPLETE_P(body))) {
535         return current + nchars;
536     }
537 
538     while (nchars--) {
539         int n = SCM_CHAR_NFOLLOWS(*current);
540         current += n + 1;
541     }
542     return current;
543 }
544 
545 /* Index -> ptr.  Args assumed in boundary. */
index2ptr(const ScmStringBody * body,ScmSmallInt nchars)546 static const char *index2ptr(const ScmStringBody *body,
547                              ScmSmallInt nchars)
548 {
549     if (body->index == NULL) {
550         return forward_pos(body, SCM_STRING_BODY_START(body), nchars);
551     }
552     ScmStringIndex *index = STRING_INDEX(body->index);
553     ScmSmallInt off = 0;
554     ScmSmallInt array_off = (nchars>>STRING_INDEX_SHIFT(index))+1;
555     /* If array_off is 1, we don't need lookup - the character is in the
556        first segment. */
557     if (array_off > 1) {
558         switch (STRING_INDEX_TYPE(index)) {
559         case STRING_INDEX8:
560             SCM_ASSERT(array_off < (ScmSmallInt)index->index8[1]);
561             off = index->index8[array_off];
562             break;
563         case STRING_INDEX16:
564             SCM_ASSERT(array_off < (ScmSmallInt)index->index16[1]);
565             off = index->index16[array_off];
566             break;
567         case STRING_INDEX32:
568             SCM_ASSERT(array_off < (ScmSmallInt)index->index32[1]);
569             off = index->index32[array_off];
570             break;
571         case STRING_INDEX64:
572             SCM_ASSERT(array_off < (ScmSmallInt)index->index64[1]);
573             off = index->index64[array_off];
574             break;
575         default:
576             Scm_Panic("String index contains unrecognized signature (%02x). "
577                       "Possible memory corruption.  Aborting...",
578                       index->signature);
579         }
580     }
581     return forward_pos(body,
582                        SCM_STRING_BODY_START(body) + off,
583                        nchars & (STRING_INDEX_INTERVAL(index)-1));
584 }
585 
586 
587 /* string-ref.
588  * If POS is out of range,
589  *   - returns SCM_CHAR_INVALID if range_error is FALSE
590  *   - raise error otherwise.
591  * This differs from Scheme version, which takes an optional 'fallback'
592  * argument which will be returned when POS is out-of-range.  We can't
593  * have the same semantics since the return type is limited.
594  */
Scm_StringRef(ScmString * str,ScmSmallInt pos,int range_error)595 ScmChar Scm_StringRef(ScmString *str, ScmSmallInt pos, int range_error)
596 {
597     const ScmStringBody *b = SCM_STRING_BODY(str);
598     ScmSmallInt len = SCM_STRING_BODY_LENGTH(b);
599 
600     /* we can't allow string-ref on incomplete strings, since it may yield
601        invalid character object. */
602     if (SCM_STRING_BODY_INCOMPLETE_P(b)) {
603         Scm_Error("incomplete string not allowed : %S", str);
604     }
605     if (pos < 0 || pos >= len) {
606         if (range_error) {
607             Scm_Error("argument out of range: %ld", pos);
608         } else {
609             return SCM_CHAR_INVALID;
610         }
611     }
612 
613     const char *p = NULL;
614     if (SCM_STRING_BODY_SINGLE_BYTE_P(b)) {
615         p = SCM_STRING_BODY_START(b) + pos;
616     } else {
617         p = index2ptr(b, pos);
618     }
619 
620     if (SCM_STRING_BODY_SINGLE_BYTE_P(b)) {
621         return (ScmChar)(*(unsigned char *)p);
622     } else {
623         ScmChar c;
624         SCM_CHAR_GET(p, c);
625         return c;
626     }
627 }
628 
629 /* The meaning and rationale of range_error is the same as Scm_StringRef.
630  * Returns -1 if OFFSET is out-of-range and RANGE_ERROR is FALSE.
631  * (Because of this, the return type is not ScmByte but int.
632  */
Scm_StringByteRef(ScmString * str,ScmSmallInt offset,int range_error)633 int Scm_StringByteRef(ScmString *str, ScmSmallInt offset, int range_error)
634 {
635     const ScmStringBody *b = SCM_STRING_BODY(str);
636     if (offset < 0 || offset >= SCM_STRING_BODY_SIZE(b)) {
637         if (range_error) {
638             Scm_Error("argument out of range: %ld", offset);
639         } else {
640             return -1;
641         }
642     }
643     return (ScmByte)SCM_STRING_BODY_START(b)[offset];
644 }
645 
646 /* External interface of index2ptr.  Returns the pointer to the
647    offset-th character in str. */
648 /* NB: this function allows offset == length of the string; in that
649    case, the return value points the location past the string body,
650    but it is necessary sometimes to do a pointer arithmetic with the
651    returned values. */
Scm_StringBodyPosition(const ScmStringBody * b,ScmSmallInt offset)652 const char *Scm_StringBodyPosition(const ScmStringBody *b, ScmSmallInt offset)
653 {
654     if (offset < 0 || offset > SCM_STRING_BODY_LENGTH(b)) {
655         Scm_Error("argument out of range: %ld", offset);
656     }
657     return index2ptr(b, offset);
658 }
659 
660 /* This is old API and now DEPRECATED.  It's difficult to use this safely,
661    since you don't have a way to get the string length consistent at the
662    moment you call this function.   Use Scm_StringBodyPosition instead. */
Scm_StringPosition(ScmString * str,ScmSmallInt offset)663 const char *Scm_StringPosition(ScmString *str, ScmSmallInt offset)
664 {
665     return Scm_StringBodyPosition(SCM_STRING_BODY(str), offset);
666 }
667 
668 /*----------------------------------------------------------------
669  * Concatenation
670  */
671 
Scm_StringAppend2(ScmString * x,ScmString * y)672 ScmObj Scm_StringAppend2(ScmString *x, ScmString *y)
673 {
674     const ScmStringBody *xb = SCM_STRING_BODY(x);
675     const ScmStringBody *yb = SCM_STRING_BODY(y);
676     ScmSmallInt sizex = SCM_STRING_BODY_SIZE(xb);
677     ScmSmallInt lenx = SCM_STRING_BODY_LENGTH(xb);
678     ScmSmallInt sizey = SCM_STRING_BODY_SIZE(yb);
679     ScmSmallInt leny = SCM_STRING_BODY_LENGTH(yb);
680     CHECK_SIZE(sizex+sizey);
681     u_long flags = 0;
682     char *p = SCM_NEW_ATOMIC2(char *,sizex + sizey + 1);
683 
684     memcpy(p, xb->start, sizex);
685     memcpy(p+sizex, yb->start, sizey);
686     p[sizex + sizey] = '\0';
687     flags |= SCM_STRING_TERMINATED;
688 
689     if (SCM_STRING_BODY_INCOMPLETE_P(xb) || SCM_STRING_BODY_INCOMPLETE_P(yb)) {
690         flags |= SCM_STRING_INCOMPLETE; /* yields incomplete string */
691     }
692     return SCM_OBJ(make_str(lenx+leny, sizex+sizey, p, flags, NULL));
693 }
694 
Scm_StringAppendC(ScmString * x,const char * str,ScmSmallInt sizey,ScmSmallInt leny)695 ScmObj Scm_StringAppendC(ScmString *x, const char *str,
696                          ScmSmallInt sizey, ScmSmallInt leny)
697 {
698     const ScmStringBody *xb = SCM_STRING_BODY(x);
699     ScmSmallInt sizex = SCM_STRING_BODY_SIZE(xb);
700     ScmSmallInt lenx = SCM_STRING_BODY_LENGTH(xb);
701     u_long flags = 0;
702 
703     if (sizey < 0) count_size_and_length(str, &sizey, &leny);
704     else if (leny < 0) leny = count_length(str, sizey);
705     CHECK_SIZE(sizex+sizey);
706 
707     char *p = SCM_NEW_ATOMIC2(char *, sizex + sizey + 1);
708     memcpy(p, xb->start, sizex);
709     memcpy(p+sizex, str, sizey);
710     p[sizex+sizey] = '\0';
711     flags |= SCM_STRING_TERMINATED;
712 
713     if (SCM_STRING_BODY_INCOMPLETE_P(xb) || leny < 0) {
714         flags |= SCM_STRING_INCOMPLETE;
715     }
716     return SCM_OBJ(make_str(lenx + leny, sizex + sizey, p, flags, NULL));
717 }
718 
Scm_StringAppend(ScmObj strs)719 ScmObj Scm_StringAppend(ScmObj strs)
720 {
721 #define BODY_ARRAY_SIZE 32
722     ScmSmallInt size = 0, len = 0;
723     u_long flags = 0;
724     const ScmStringBody *bodies_s[BODY_ARRAY_SIZE], **bodies;
725 
726     /* It is trickier than it appears, since the strings may be modified
727        by another thread during we're dealing with it.  So in the first
728        pass to sum up the lengths of strings, we extract the string bodies
729        and save it.  */
730     ScmSmallInt numstrs = Scm_Length(strs);
731     if (numstrs < 0) Scm_Error("improper list not allowed: %S", strs);
732     if (numstrs > BODY_ARRAY_SIZE) {
733         bodies = SCM_NEW_ARRAY(const ScmStringBody*, numstrs);
734     } else {
735         bodies = bodies_s;
736     }
737 
738     ScmSmallInt i = 0;
739     ScmObj cp;
740     SCM_FOR_EACH(cp, strs) {
741         const ScmStringBody *b;
742         if (!SCM_STRINGP(SCM_CAR(cp))) {
743             Scm_Error("string required, but got %S", SCM_CAR(cp));
744         }
745         b = SCM_STRING_BODY(SCM_CAR(cp));
746         size += SCM_STRING_BODY_SIZE(b);
747         len += SCM_STRING_BODY_LENGTH(b);
748         CHECK_SIZE(size);
749         if (SCM_STRING_BODY_INCOMPLETE_P(b)) {
750             flags |= SCM_STRING_INCOMPLETE;
751         }
752         bodies[i++] = b;
753     }
754 
755     char *buf = SCM_NEW_ATOMIC2(char *, size+1);
756     char *bufp = buf;
757     for (i=0; i<numstrs; i++) {
758         const ScmStringBody *b = bodies[i];
759         memcpy(bufp, SCM_STRING_BODY_START(b), SCM_STRING_BODY_SIZE(b));
760         bufp += SCM_STRING_BODY_SIZE(b);
761     }
762     *bufp = '\0';
763     bodies = NULL;              /* to help GC */
764     flags |= SCM_STRING_TERMINATED;
765     return SCM_OBJ(make_str(len, size, buf, flags, NULL));
766 #undef BODY_ARRAY_SIZE
767 }
768 
Scm_StringJoin(ScmObj strs,ScmString * delim,int grammar)769 ScmObj Scm_StringJoin(ScmObj strs, ScmString *delim, int grammar)
770 {
771 #define BODY_ARRAY_SIZE 32
772     ScmSmallInt size = 0, len = 0;
773     u_long flags = 0;
774     const ScmStringBody *bodies_s[BODY_ARRAY_SIZE], **bodies;
775 
776     ScmSmallInt nstrs = Scm_Length(strs);
777     if (nstrs < 0) Scm_Error("improper list not allowed: %S", strs);
778     if (nstrs == 0) {
779         if (grammar == SCM_STRING_JOIN_STRICT_INFIX) {
780             Scm_Error("can't join empty list of strings with strict-infix grammar");
781         }
782         return SCM_MAKE_STR("");
783     }
784 
785     if (nstrs > BODY_ARRAY_SIZE) {
786         bodies = SCM_NEW_ARRAY(const ScmStringBody *, nstrs);
787     } else {
788         bodies = bodies_s;
789     }
790 
791     const ScmStringBody *dbody = SCM_STRING_BODY(delim);
792     ScmSmallInt dsize = SCM_STRING_BODY_SIZE(dbody);
793     ScmSmallInt dlen  = SCM_STRING_BODY_LENGTH(dbody);
794     if (SCM_STRING_BODY_INCOMPLETE_P(dbody)) {
795         flags |= SCM_STRING_INCOMPLETE;
796     }
797 
798     ScmSmallInt i = 0, ndelim;
799     ScmObj cp;
800     SCM_FOR_EACH(cp, strs) {
801         const ScmStringBody *b;
802         if (!SCM_STRINGP(SCM_CAR(cp))) {
803             Scm_Error("string required, but got %S", SCM_CAR(cp));
804         }
805         b = SCM_STRING_BODY(SCM_CAR(cp));
806         size += SCM_STRING_BODY_SIZE(b);
807         len  += SCM_STRING_BODY_LENGTH(b);
808         CHECK_SIZE(size);
809         if (SCM_STRING_BODY_INCOMPLETE_P(b)) {
810             flags |= SCM_STRING_INCOMPLETE;
811         }
812         bodies[i++] = b;
813     }
814     if (grammar == SCM_STRING_JOIN_INFIX
815         || grammar == SCM_STRING_JOIN_STRICT_INFIX) {
816         ndelim = nstrs - 1;
817     } else {
818         ndelim = nstrs;
819     }
820     size += dsize * ndelim;
821     len += dlen * ndelim;
822     CHECK_SIZE(size);
823 
824     char *buf = SCM_NEW_ATOMIC2(char *, size+1);
825     char *bufp = buf;
826     if (grammar == SCM_STRING_JOIN_PREFIX) {
827         memcpy(bufp, SCM_STRING_BODY_START(dbody), dsize);
828         bufp += dsize;
829     }
830     for (i=0; i<nstrs; i++) {
831         const ScmStringBody *b = bodies[i];
832         memcpy(bufp, SCM_STRING_BODY_START(b), SCM_STRING_BODY_SIZE(b));
833         bufp += SCM_STRING_BODY_SIZE(b);
834         if (i < nstrs-1) {
835             memcpy(bufp, SCM_STRING_BODY_START(dbody), dsize);
836             bufp += dsize;
837         }
838     }
839     if (grammar == SCM_STRING_JOIN_SUFFIX) {
840         memcpy(bufp, SCM_STRING_BODY_START(dbody), dsize);
841         bufp += dsize;
842     }
843     *bufp = '\0';
844     bodies = NULL;              /* to help GC */
845     flags |= SCM_STRING_TERMINATED;
846     return SCM_OBJ(make_str(len, size, buf, flags, NULL));
847 #undef BODY_ARRAY_SIZE
848 }
849 
850 /*----------------------------------------------------------------
851  * Mutation
852  */
853 
854 /*
855  * String mutation is extremely heavy operation in Gauche,
856  * and only provided for compatibility to RnRS.  At C API level
857  * there's no point in using string mutation at all.  A single
858  * API, which replaces the string body, is provided at C level.
859  */
860 
Scm_StringReplaceBody(ScmString * str,const ScmStringBody * newbody)861 ScmObj Scm_StringReplaceBody(ScmString *str, const ScmStringBody *newbody)
862 {
863     if (SCM_STRING_IMMUTABLE_P(str)) {
864         Scm_Error("attempted to modify an immutable string: %S", str);
865     }
866 
867     /* Atomically replaces the str's body (no MT hazard) */
868     str->body = newbody;
869 
870     /* TODO: If the initialBody of str isn't shared,
871        nullify str->initialBody.start so that the original string is
872        GCed.  It should be done after implementing 'shared' flag
873        into the string body. */
874     return SCM_OBJ(str);
875 }
876 
877 /*----------------------------------------------------------------
878  * Substring
879  */
880 
substring(const ScmStringBody * xb,ScmSmallInt start,ScmSmallInt end,int byterange,int immutable)881 static ScmObj substring(const ScmStringBody *xb,
882                         ScmSmallInt start, ScmSmallInt end,
883                         int byterange, int immutable)
884 {
885     ScmSmallInt len = byterange? SCM_STRING_BODY_SIZE(xb) : SCM_STRING_BODY_LENGTH(xb);
886     u_long flags = SCM_STRING_BODY_FLAGS(xb);
887     if (!immutable) flags &= ~SCM_STRING_IMMUTABLE;
888 
889     SCM_CHECK_START_END(start, end, len);
890 
891     if (byterange) {
892         if (end != len) flags &= ~SCM_STRING_TERMINATED;
893         flags |= SCM_STRING_INCOMPLETE;
894         return SCM_OBJ(make_str(end - start,
895                                 end - start,
896                                 SCM_STRING_BODY_START(xb) + start,
897                                 flags, NULL));
898     } else {
899         const char *s, *e;
900         s = index2ptr(xb, start);
901         if (len == end) {
902             e = SCM_STRING_BODY_END(xb);
903         } else {
904             /* kludge - if we don't have index, forward_pos is faster. */
905             if (start > 0 && xb->index == NULL) {
906                 e = forward_pos(xb, s, end - start);
907             } else {
908                 e = index2ptr(xb, end);
909             }
910             flags &= ~SCM_STRING_TERMINATED;
911         }
912         return SCM_OBJ(make_str(end - start,
913                                 (ScmSmallInt)(e - s), s, flags, NULL));
914     }
915 }
916 
substring_cursor(const ScmStringBody * xb,const char * start,const char * end,int immutable)917 static ScmObj substring_cursor(const ScmStringBody *xb,
918                                const char *start,
919                                const char *end,
920                                int immutable)
921 {
922     u_long flags = SCM_STRING_BODY_FLAGS(xb);
923     if (!immutable) flags &= ~SCM_STRING_IMMUTABLE;
924 
925     if (start < SCM_STRING_BODY_START(xb) ||
926         start > SCM_STRING_BODY_END(xb)) {
927         Scm_Error("start argument out of range: %S", start);
928     }
929     else if (end > SCM_STRING_BODY_END(xb)) {
930         Scm_Error("end argument out of range: %S", end);
931     } else if (end < start) {
932         Scm_Error("end argument must be greater than or "
933                   "equal to the start argument: %S vs %S", end, start);
934     }
935 
936     if (end != SCM_STRING_BODY_END(xb)) {
937         flags &= ~SCM_STRING_TERMINATED;
938     }
939 
940     ScmSmallInt len;
941     if (SCM_STRING_BODY_SINGLE_BYTE_P(xb)) {
942         len = (ScmSmallInt)(end - start);
943     } else {
944         len = Scm_MBLen(start, end);
945     }
946 
947     return SCM_OBJ(make_str(len,
948                             (ScmSmallInt)(end - start),
949                             start, flags, NULL));
950 }
951 
Scm_Substring(ScmString * x,ScmSmallInt start,ScmSmallInt end,int byterangep)952 ScmObj Scm_Substring(ScmString *x, ScmSmallInt start, ScmSmallInt end,
953                      int byterangep)
954 {
955     return substring(SCM_STRING_BODY(x), start, end, byterangep, FALSE);
956 }
957 
958 /* Auxiliary procedure to support optional start/end parameter specified
959    in lots of SRFI-13 functions.   If start and end is specified and restricts
960    string range, call substring.  Otherwise returns x itself.
961    If input string is immutable, the result is also immutable.  If the caller
962    needs a mutable string it should call CopyString anyway, for the caller
963    doesn't know if the input string is just passed through.
964 */
Scm_MaybeSubstring(ScmString * x,ScmObj start,ScmObj end)965 ScmObj Scm_MaybeSubstring(ScmString *x, ScmObj start, ScmObj end)
966 {
967     const ScmStringBody *xb = SCM_STRING_BODY(x);
968     int no_start = SCM_UNBOUNDP(start) || SCM_UNDEFINEDP(start) || SCM_FALSEP(start);
969     int no_end = SCM_UNBOUNDP(end) || SCM_UNDEFINEDP(end) || SCM_FALSEP(end);
970     ScmSmallInt istart = -1, iend = -1, ostart = -1, oend = -1;
971 
972     int immutable = SCM_STRING_BODY_HAS_FLAG(xb, SCM_STRING_IMMUTABLE);
973 
974     if (no_start)
975         istart = 0;
976     else if (SCM_STRING_CURSOR_P(start))
977         ostart = string_cursor_offset(start);
978     else if (SCM_INTP(start))
979         istart = SCM_INT_VALUE(start);
980     else
981         Scm_Error("exact integer or cursor required for start, but got %S", start);
982 
983     if (no_end) {
984         if (istart == 0 || ostart == 0) {
985             return SCM_OBJ(x);
986         }
987         iend = SCM_STRING_BODY_LENGTH(xb);
988     } else if (SCM_STRING_CURSOR_P(end))
989         oend = string_cursor_offset(end);
990     else if (SCM_INTP(end))
991         iend = SCM_INT_VALUE(end);
992     else
993         Scm_Error("exact integer or cursor required for end, but got %S", end);
994 
995     if (no_start && oend != -1) {
996         return substring_cursor(xb,
997                                 SCM_STRING_BODY_START(xb),
998                                 SCM_STRING_BODY_START(xb) + oend,
999                                 immutable);
1000     }
1001     if (ostart != -1 && oend != -1) {
1002         return substring_cursor(xb,
1003                                 SCM_STRING_BODY_START(xb) + ostart,
1004                                 SCM_STRING_BODY_START(xb) + oend,
1005                                 immutable);
1006     }
1007     if (ostart != -1 && no_end) {
1008         return substring_cursor(xb,
1009                                 SCM_STRING_BODY_START(xb) + ostart,
1010                                 SCM_STRING_BODY_END(xb),
1011                                 immutable);
1012     }
1013 
1014     if (ostart != -1) {
1015         istart = Scm_GetInteger(Scm_StringCursorIndex(x, start));
1016     }
1017     if (oend != -1) {
1018         iend = Scm_GetInteger(Scm_StringCursorIndex(x, end));
1019     }
1020 
1021     return substring(xb, istart, iend, FALSE, immutable);
1022 }
1023 
1024 /*----------------------------------------------------------------
1025  * Search & parse
1026  */
1027 
1028 /* Boyer-Moore string search.  assuming siz1 > siz2, siz2 < 256. */
boyer_moore(const char * ss1,ScmSmallInt siz1,const char * ss2,ScmSmallInt siz2)1029 static ScmSmallInt boyer_moore(const char *ss1, ScmSmallInt siz1,
1030                                const char *ss2, ScmSmallInt siz2)
1031 {
1032     unsigned char shift[256];
1033     for (ScmSmallInt i=0; i<256; i++) { shift[i] = siz2; }
1034     for (ScmSmallInt j=0; j<siz2-1; j++) {
1035         shift[(unsigned char)ss2[j]] = siz2-j-1;
1036     }
1037     for (ScmSmallInt i=siz2-1; i<siz1; i+=shift[(unsigned char)ss1[i]]) {
1038         ScmSmallInt j, k;
1039         for (j=siz2-1, k = i; j>=0 && ss1[k] == ss2[j]; j--, k--)
1040             ;
1041         if (j == -1) return k+1;
1042     }
1043     return -1;
1044 }
1045 
boyer_moore_reverse(const char * ss1,ScmSmallInt siz1,const char * ss2,ScmSmallInt siz2)1046 static ScmSmallInt boyer_moore_reverse(const char *ss1, ScmSmallInt siz1,
1047                                        const char *ss2, ScmSmallInt siz2)
1048 {
1049     unsigned char shift[256];
1050     for (ScmSmallInt i=0; i<256; i++) { shift[i] = siz2; }
1051     for (ScmSmallInt j=siz2-1; j>0; j--) {
1052         shift[(unsigned char)ss2[j]] = j;
1053     }
1054     for (ScmSmallInt i=siz1-siz2+1; i>=0; i-=shift[(unsigned char)ss1[i]]) {
1055         ScmSmallInt j, k;
1056         for (j=0, k = i; j<siz2 && ss1[k] == ss2[j]; j++, k++)
1057             ;
1058         if (j == siz2) return i;
1059     }
1060     return -1;
1061 }
1062 
1063 /* Primitive routines to search a substring s2 within s1.
1064    Returns NOT_FOUND if not found, FOUND_BOTH_INDEX if both byte index
1065    (*bi) and character index (*ci) is calculted, FOUND_BYTE_INDEX
1066    if only byte index is calculated.
1067 
1068    When the encoding is utf-8 or none, we can scan a string as if it is just
1069    a bytestring.   The only caveat is that, with utf-8, we need to calculate
1070    character index after we find the match.  It is still a total win, for
1071    finding out non-matches using Boyer-Moore is a lot faster than naive way.
1072 
1073    If the encoding is EUC-JP or SJIS, we can only use Boyer-Moore when we
1074    know the strings have single-byte only.  Multibyte strings of those
1075    encodings can have spurious matches when compared bytewise.
1076  */
1077 
1078 /* return value of string_scan */
1079 #define NOT_FOUND 0         /* string not found */
1080 #define FOUND_BOTH_INDEX 1  /* string found, and both indexes are calculated */
1081 #define FOUND_BYTE_INDEX 2  /* string found, and only byte index is calc'd */
1082 
1083 /* In utf-8 multibyte case, we only count byte index and let the caller
1084    figure out the character index.  In other encodings we can always find
1085    both index. */
1086 #if defined(GAUCHE_CHAR_ENCODING_UTF_8)
1087 #define FOUND_MAYBE_BOTH FOUND_BYTE_INDEX
1088 #else
1089 #define FOUND_MAYBE_BOTH FOUND_BOTH_INDEX
1090 #endif
1091 
1092 /* In euc-jp and sjis case, we use faster method only when (size == len). */
1093 #if defined(GAUCHE_CHAR_ENCODING_EUC_JP) || defined(GAUCHE_CHAR_ENCODING_SJIS)
1094 #define BYTEWISE_SEARCHABLE(siz, len)  ((siz) == (len))
1095 #define MULTIBYTE_NAIVE_SEARCH_NEEDED 1
1096 #else
1097 #define BYTEWISE_SEARCHABLE(siz, len)  TRUE
1098 #define MULTIBYTE_NAIVE_SEARCH_NEEDED 0
1099 #endif
1100 
1101 /* glibc has memrchr, but we need to provide fallback anyway and
1102    we don't need it to be highly tuned, so we just roll our own. */
my_memrchr(const void * s,int c,size_t n)1103 static const void *my_memrchr(const void *s, int c, size_t n)
1104 {
1105     const char *p = (const char*)s + n - 1;
1106     for (;p >= (const char*)s; p--) {
1107         if ((int)*p == c) return p;
1108     }
1109     return NULL;
1110 }
1111 
1112 /* NB: len1 and len2 only used in certain internal CES. */
string_search(const char * s1,ScmSmallInt siz1,ScmSmallInt len1 SCM_UNUSED,const char * s2,ScmSmallInt siz2,ScmSmallInt len2 SCM_UNUSED,ScmSmallInt * bi,ScmSmallInt * ci)1113 static int string_search(const char *s1, ScmSmallInt siz1,
1114                          ScmSmallInt len1 SCM_UNUSED,
1115                          const char *s2, ScmSmallInt siz2,
1116                          ScmSmallInt len2 SCM_UNUSED,
1117                          ScmSmallInt *bi /* out */,
1118                          ScmSmallInt *ci /* out */)
1119 {
1120     if (siz2 == 0) {
1121         *bi = *ci = 0;
1122         return FOUND_BOTH_INDEX;
1123     }
1124 
1125     /* Single-byte case. */
1126     if (BYTEWISE_SEARCHABLE(siz1, len1)) {
1127         if (siz2 == 1) {
1128             /* Single ASCII character search case.  This is a huge win. */
1129             const char *z = memchr(s1, s2[0], siz1);
1130             if (z) { *bi = *ci = z - s1; return FOUND_MAYBE_BOTH; }
1131             else return NOT_FOUND;
1132         }
1133         if (BYTEWISE_SEARCHABLE(siz2, len2)) {
1134             ScmSmallInt i;
1135             /* Shortcut for single-byte strings */
1136             if (siz1 < siz2) return NOT_FOUND;
1137             if (siz1 < 256 || siz2 >= 256) {
1138                 /* brute-force search */
1139                 for (i=0; i<=siz1-siz2; i++) {
1140                     if (memcmp(s2, s1+i, siz2) == 0) break;
1141                 }
1142                 if (i == siz1-siz2+1) return NOT_FOUND;
1143             } else {
1144                 i = boyer_moore(s1, siz1, s2, siz2);
1145                 if (i < 0) return NOT_FOUND;
1146             }
1147             *bi = *ci = i;
1148             return FOUND_MAYBE_BOTH;
1149         }
1150         /* FALLTHROUGH */
1151     }
1152 
1153 #if MULTIBYTE_NAIVE_SEARCH_NEEDED
1154     /* Multibyte case. */
1155     if (len1 >= len2) {
1156         const char *sp = s1;
1157         for (ScmSmallInt i=0; i<=len1-len2; i++) {
1158             if (memcmp(sp, s2, siz2) == 0) {
1159                 *bi = (ScmSmallInt)(sp - s1);
1160                 *ci = i;
1161                 return FOUND_BOTH_INDEX;
1162             }
1163             sp += SCM_CHAR_NFOLLOWS(*sp) + 1;
1164         }
1165     }
1166 #endif /*MULTIBYTE_NAIVE_SEARCH_NEEDED*/
1167     return NOT_FOUND;
1168 }
1169 
1170 /* NB: len2 is only used in some internal CES */
string_search_reverse(const char * s1,ScmSmallInt siz1,ScmSmallInt len1,const char * s2,ScmSmallInt siz2,ScmSmallInt len2 SCM_UNUSED,ScmSmallInt * bi,ScmSmallInt * ci)1171 static int string_search_reverse(const char *s1, ScmSmallInt siz1,
1172                                  ScmSmallInt len1,
1173                                  const char *s2, ScmSmallInt siz2,
1174                                  ScmSmallInt len2 SCM_UNUSED,
1175                                  ScmSmallInt *bi /* out */,
1176                                  ScmSmallInt *ci /* out */)
1177 {
1178     if (siz2 == 0) {
1179         *bi = siz1;
1180         *ci = len1;
1181         return FOUND_BOTH_INDEX;
1182     }
1183 
1184     /* Single-byte case. */
1185     if (BYTEWISE_SEARCHABLE(siz1, len1)) {
1186         if (siz2 == 1) {
1187             /* Single ASCII character search case.  This is a huge win. */
1188             const char *z = my_memrchr(s1, s2[0], siz1);
1189             if (z) { *bi = *ci = z - s1; return FOUND_MAYBE_BOTH; }
1190             else return NOT_FOUND;
1191         }
1192         if (BYTEWISE_SEARCHABLE(siz2, len2)) {
1193             ScmSmallInt i;
1194             /* short cut for single-byte strings */
1195             if (siz1 < siz2) return NOT_FOUND;
1196             if (siz1 < 256 || siz2 >= 256) {
1197                 /* brute-force search */
1198                 for (i=siz1-siz2; i>=0; i--) {
1199                     if (memcmp(s2, s1+i, siz2) == 0) break;
1200                 }
1201                 if (i < 0) return NOT_FOUND;
1202             } else {
1203                 i = boyer_moore_reverse(s1, siz1, s2, siz2);
1204                 if (i < 0) return NOT_FOUND;
1205             }
1206             *bi = *ci = i;
1207             return FOUND_MAYBE_BOTH;
1208         } else {
1209             return NOT_FOUND;   /* sbstring can't contain mbstring. */
1210         }
1211     }
1212 
1213 #if MULTIBYTE_NAIVE_SEARCH_NEEDED
1214     /* Multibyte case. */
1215     if (len1 >= len2) {
1216         const char *sp = s1 + siz1, *p;
1217         for (ScmSmallInt i=0; i<len2; i++) {
1218             SCM_CHAR_BACKWARD(sp, s1, p);
1219             SCM_ASSERT(*p);
1220             sp = p;
1221         }
1222         for (ScmSmallInt i=len1-len2; i>=0; i--) {
1223             if (memcmp(sp, s2, siz2) == 0) {
1224                 *bi = (ScmSmallInt)(sp - s1);
1225                 *ci = i;
1226                 return FOUND_BOTH_INDEX;
1227             }
1228             SCM_CHAR_BACKWARD(sp, s1, p);
1229             sp = p;
1230         }
1231     }
1232 #endif /*MULTIBYTE_NAIVE_SEARCH_NEEDED*/
1233     return NOT_FOUND;
1234 }
1235 
1236 /* Scan s2 in s1, and calculates appropriate return value(s) according to
1237    retmode.  Returns # of values, 1 or 2.
1238 
1239    SCM_STRING_SCAN_INDEX  : v1 <- the index of s1
1240         s1 = "abcde" and s2 = "cd" => 2
1241    SCM_STRING_SCAN_CURSOR : v1 <- the cursor of s1
1242         s1 = "abcde" and s2 = "cd" => #<string-cursor 2>
1243    SCM_STRING_SCAN_BEFORE : v1 <- substring of s1 before s2
1244         s1 = "abcde" and s2 = "cd" => "ab"
1245    SCM_STRING_SCAN_AFTER  : v1 <- substring of s1 after s2
1246         s1 = "abcde" and s2 = "cd" => "e"
1247    SCM_STRING_SCAN_BEFORE2 : v1 <- substring of s1 before s2, v2 <- rest
1248        s1 = "abcde" and s2 = "cd" => "ab" and "cde"
1249    SCM_STRING_SCAN_AFTER2 : v1 <- substring of s1 up to s2, v2 <- rest
1250        s1 = "abcde" and s2 = "cd" => "abcd" and "e"
1251    SCM_STRING_SCAN_BOTH   : v1 <- substring of s1 before, v2 <- after s2
1252        s1 = "abcde" and s2 = "cd" => "ab" and "e"
1253 */
string_scan(ScmString * ss1,const char * s2,ScmSmallInt siz2,ScmSmallInt len2,int incomplete2,int retmode,int (* searcher)(const char *,ScmSmallInt,ScmSmallInt,const char *,ScmSmallInt,ScmSmallInt,ScmSmallInt *,ScmSmallInt *),ScmObj * v1,ScmObj * v2)1254 static int string_scan(ScmString *ss1, const char *s2,
1255                        ScmSmallInt siz2, ScmSmallInt len2,
1256                        int incomplete2,
1257                        int retmode,
1258                        int (*searcher)(const char*, ScmSmallInt, ScmSmallInt,
1259                                        const char*, ScmSmallInt, ScmSmallInt,
1260                                        ScmSmallInt*, ScmSmallInt*),
1261                        ScmObj *v1,        /* out */
1262                        ScmObj *v2)        /* out */
1263 {
1264     ScmSmallInt bi = 0, ci = 0;
1265     const ScmStringBody *sb = SCM_STRING_BODY(ss1);
1266     const char *s1 = SCM_STRING_BODY_START(sb);
1267     ScmSmallInt siz1 = SCM_STRING_BODY_SIZE(sb);
1268     ScmSmallInt len1 = SCM_STRING_BODY_LENGTH(sb);
1269 
1270     if (retmode < 0 || retmode >= SCM_STRING_SCAN_NUM_RETMODES) {
1271         Scm_Error("return mode out fo range: %d", retmode);
1272     }
1273 
1274     int incomplete =
1275         (SCM_STRING_BODY_INCOMPLETE_P(sb) || incomplete2)
1276         ? SCM_STRING_INCOMPLETE : 0;
1277 
1278     /* prefiltering - if both string is complete, and s1 is sbstring
1279        and s2 is mbstring, we know there's no match.  */
1280     int retcode =
1281         (!incomplete && (siz1 == len1) && (siz2 != len2))
1282         ? NOT_FOUND
1283         : searcher(s1, siz1, len1, s2, siz2, len2, &bi, &ci);
1284 
1285     if (retcode == NOT_FOUND) {
1286         switch (retmode) {
1287         case SCM_STRING_SCAN_INDEX:
1288         case SCM_STRING_SCAN_CURSOR:
1289         case SCM_STRING_SCAN_BEFORE:
1290         case SCM_STRING_SCAN_AFTER:
1291             *v1 = SCM_FALSE;
1292             return 1;
1293         default:
1294             *v1 = SCM_FALSE;
1295             *v2 = SCM_FALSE;
1296             return 2;
1297         }
1298     }
1299 
1300     if (retmode != SCM_STRING_SCAN_CURSOR
1301         && (retcode == FOUND_BYTE_INDEX && !incomplete)) {
1302         ci = count_length(s1, bi);
1303     }
1304 
1305     switch (retmode) {
1306     case SCM_STRING_SCAN_INDEX:
1307         *v1 = Scm_MakeInteger(ci);
1308         return 1;
1309     case SCM_STRING_SCAN_CURSOR:
1310         *v1 = make_string_cursor(ss1, s1 + bi);
1311         return 1;
1312     case SCM_STRING_SCAN_BEFORE:
1313         *v1 = Scm_MakeString(s1, bi, ci, incomplete);
1314         return 1;
1315     case SCM_STRING_SCAN_AFTER:
1316         *v1 = Scm_MakeString(s1+bi+siz2, siz1-bi-siz2,
1317                              len1-ci-len2, incomplete);
1318         return 1;
1319     case SCM_STRING_SCAN_BEFORE2:
1320         *v1 = Scm_MakeString(s1, bi, ci, incomplete);
1321         *v2 = Scm_MakeString(s1+bi, siz1-bi, len1-ci, incomplete);
1322         return 2;
1323     case SCM_STRING_SCAN_AFTER2:
1324         *v1 = Scm_MakeString(s1, bi+siz2, ci+len2, incomplete);
1325         *v2 = Scm_MakeString(s1+bi+siz2, siz1-bi-siz2,
1326                              len1-ci-len2, incomplete);
1327         return 2;
1328     case SCM_STRING_SCAN_BOTH:
1329         *v1 = Scm_MakeString(s1, bi, ci, incomplete);;
1330         *v2 = Scm_MakeString(s1+bi+siz2, siz1-bi-siz2,
1331                              len1-ci-len2, incomplete);
1332         return 2;
1333     }
1334     return 0;       /* dummy */
1335 }
1336 
Scm_StringScan(ScmString * s1,ScmString * s2,int retmode)1337 ScmObj Scm_StringScan(ScmString *s1, ScmString *s2, int retmode)
1338 {
1339     ScmObj v1, v2;
1340     const ScmStringBody *s2b = SCM_STRING_BODY(s2);
1341     int nvals = string_scan(s1,
1342                             SCM_STRING_BODY_START(s2b),
1343                             SCM_STRING_BODY_SIZE(s2b),
1344                             SCM_STRING_BODY_LENGTH(s2b),
1345                             SCM_STRING_BODY_INCOMPLETE_P(s2b),
1346                             retmode, string_search, &v1, &v2);
1347     if (nvals == 1) return v1;
1348     else return Scm_Values2(v1, v2);
1349 }
1350 
Scm_StringScanChar(ScmString * s1,ScmChar ch,int retmode)1351 ScmObj Scm_StringScanChar(ScmString *s1, ScmChar ch, int retmode)
1352 {
1353     ScmObj v1, v2;
1354     char buf[SCM_CHAR_MAX_BYTES];
1355     SCM_CHAR_PUT(buf, ch);
1356     int nvals = string_scan(s1, buf, SCM_CHAR_NBYTES(ch), 1, FALSE, retmode,
1357                             string_search, &v1, &v2);
1358     if (nvals == 1) return v1;
1359     else return Scm_Values2(v1, v2);
1360 }
1361 
Scm_StringScanRight(ScmString * s1,ScmString * s2,int retmode)1362 ScmObj Scm_StringScanRight(ScmString *s1, ScmString *s2, int retmode)
1363 {
1364     ScmObj v1, v2;
1365     const ScmStringBody *s2b = SCM_STRING_BODY(s2);
1366     int nvals = string_scan(s1,
1367                             SCM_STRING_BODY_START(s2b),
1368                             SCM_STRING_BODY_SIZE(s2b),
1369                             SCM_STRING_BODY_LENGTH(s2b),
1370                             SCM_STRING_BODY_INCOMPLETE_P(s2b),
1371                             retmode, string_search_reverse, &v1, &v2);
1372     if (nvals == 1) return v1;
1373     else return Scm_Values2(v1, v2);
1374 }
1375 
Scm_StringScanCharRight(ScmString * s1,ScmChar ch,int retmode)1376 ScmObj Scm_StringScanCharRight(ScmString *s1, ScmChar ch, int retmode)
1377 {
1378     ScmObj v1, v2;
1379     char buf[SCM_CHAR_MAX_BYTES];
1380     SCM_CHAR_PUT(buf, ch);
1381     int nvals = string_scan(s1, buf, SCM_CHAR_NBYTES(ch), 1, FALSE, retmode,
1382                             string_search_reverse, &v1, &v2);
1383     if (nvals == 1) return v1;
1384     else return Scm_Values2(v1, v2);
1385 }
1386 
1387 #undef NOT_FOUND
1388 #undef FOUND_BOTH_INDEX
1389 #undef FOUND_BYTE_INDEX
1390 #undef FOUND_MAYBE_BOTH
1391 #undef BYTEWISE_SEARCHABLE
1392 #undef MULTIBYTE_NAIVE_SEARCH_NEEDED
1393 
1394 /* Split string by char.  Char itself is not included in the result.
1395    If LIMIT >= 0, up to that number of matches are considered (i.e.
1396    up to LIMIT+1 strings are returned).   LIMIT < 0 makes the number
1397    of matches unlimited.
1398    TODO: If CH is a utf-8 multi-byte char, Boyer-Moore skip table is
1399    calculated every time we call string_scan, which is a waste.  Some
1400    mechanism to cache the skip table would be nice.
1401 */
Scm_StringSplitByCharWithLimit(ScmString * str,ScmChar ch,int limit)1402 ScmObj Scm_StringSplitByCharWithLimit(ScmString *str, ScmChar ch, int limit)
1403 {
1404     char buf[SCM_CHAR_MAX_BYTES];
1405     int nb = SCM_CHAR_NBYTES(ch);
1406     ScmObj head = SCM_NIL, tail = SCM_NIL;
1407 
1408     if (limit == 0) return SCM_LIST1(SCM_OBJ(str)); /* trivial case */
1409 
1410     SCM_CHAR_PUT(buf, ch);
1411 
1412     for (;;) {
1413         ScmObj v1, v2;
1414         (void)string_scan(str, buf, nb, 1, FALSE, SCM_STRING_SCAN_BOTH,
1415                           string_search, &v1, &v2);
1416         if (SCM_FALSEP(v1)) {
1417             SCM_APPEND1(head, tail, SCM_OBJ(str));
1418             break;
1419         } else {
1420             SCM_APPEND1(head, tail, v1);
1421             if (--limit == 0) { SCM_APPEND1(head, tail, v2); break; }
1422         }
1423         str = SCM_STRING(v2);
1424     }
1425     return head;
1426 }
1427 
1428 /* For ABI compatibility - On 1.0, let's make this have limit arg and
1429    drop Scm_StringSplitByCharWithLimit.  */
Scm_StringSplitByChar(ScmString * str,ScmChar ch)1430 ScmObj Scm_StringSplitByChar(ScmString *str, ScmChar ch)
1431 {
1432     return Scm_StringSplitByCharWithLimit(str, ch, -1);
1433 }
1434 
1435 /*----------------------------------------------------------------
1436  * Miscellaneous functions
1437  */
1438 
Scm_StringToList(ScmString * str)1439 ScmObj Scm_StringToList(ScmString *str)
1440 {
1441     const ScmStringBody *b = SCM_STRING_BODY(str);
1442     ScmObj start = SCM_NIL, end = SCM_NIL;
1443     const char *bufp = SCM_STRING_BODY_START(b);
1444     ScmSmallInt len = SCM_STRING_BODY_LENGTH(b);
1445 
1446     if (SCM_STRING_BODY_INCOMPLETE_P(b))
1447         Scm_Error("incomplete string not supported: %S", str);
1448     while (len-- > 0) {
1449         ScmChar ch;
1450         SCM_CHAR_GET(bufp, ch);
1451         bufp += SCM_CHAR_NBYTES(ch);
1452         SCM_APPEND1(start, end, SCM_MAKE_CHAR(ch));
1453     }
1454     return start;
1455 }
1456 
1457 /* Convert cstring array to a list of Scheme strings.  Cstring array
1458    can be NULL terminated (in case size < 0) or its size is explicitly
1459    specified (size >= 0).  FLAGS is passed to Scm_MakeString. */
Scm_CStringArrayToList(const char ** array,ScmSmallInt size,u_long flags)1460 ScmObj Scm_CStringArrayToList(const char **array, ScmSmallInt size, u_long flags)
1461 {
1462     ScmObj h = SCM_NIL, t = SCM_NIL;
1463     if (size < 0) {
1464         for (;*array; array++) {
1465             ScmObj s = Scm_MakeString(*array, -1, -1, flags);
1466             SCM_APPEND1(h, t, s);
1467         }
1468     } else {
1469         for (ScmSmallInt i=0; i<size; i++, array++) {
1470             ScmObj s = Scm_MakeString(*array, -1, -1, flags);
1471             SCM_APPEND1(h, t, s);
1472         }
1473     }
1474     return h;
1475 }
1476 
1477 /* common routine for Scm_ListTo[Const]CStringArray */
list_to_cstring_array_check(ScmObj lis,int errp)1478 static ScmSmallInt list_to_cstring_array_check(ScmObj lis, int errp)
1479 {
1480     ScmObj lp;
1481     ScmSmallInt len = 0;
1482     SCM_FOR_EACH(lp, lis) {
1483         if (!SCM_STRINGP(SCM_CAR(lp))) {
1484             if (errp) Scm_Error("a proper list of strings is required, but the list contains non-string element: %S", SCM_CAR(lp));
1485             else return -1;
1486         }
1487         len++;
1488     }
1489     return len;
1490 }
1491 
1492 /* Convert list of Scheme strings into C const char* string array, NULL
1493    terminated.
1494    If errp == FALSE, returns NULL on error.
1495    otherwise, signals an error. */
Scm_ListToConstCStringArray(ScmObj lis,int errp)1496 const char **Scm_ListToConstCStringArray(ScmObj lis, int errp)
1497 {
1498     ScmSmallInt len = list_to_cstring_array_check(lis, errp);
1499     if (len < 0) return NULL;
1500     const char **array = SCM_NEW_ARRAY(const char*, len+1);
1501     const char **p = array;
1502     ScmObj lp;
1503     SCM_FOR_EACH(lp, lis) {
1504         *p++ = Scm_GetStringConst(SCM_STRING(SCM_CAR(lp)));
1505     }
1506     *p = NULL;                  /* termination */
1507     return array;
1508 }
1509 
1510 /* Convert list of Scheme strings into C char* string array, NULL
1511    terminated.
1512    If errp == FALSE, returns NULL on error.
1513    otherwise, signals an error.
1514    If provided, alloc is used to allocate both a pointer array and char
1515    arrays.  Otherwise, SCM_ALLOC is used. */
Scm_ListToCStringArray(ScmObj lis,int errp,void * (* alloc)(size_t))1516 char **Scm_ListToCStringArray(ScmObj lis, int errp, void *(*alloc)(size_t))
1517 {
1518     char **array, **p;
1519     ScmSmallInt len = list_to_cstring_array_check(lis, errp);
1520     if (len < 0) return NULL;
1521 
1522     if (alloc) {
1523         p = array = (char **)alloc((len+1) * sizeof(char *));
1524         ScmObj lp;
1525         SCM_FOR_EACH(lp, lis) {
1526             const char *s = Scm_GetStringConst(SCM_STRING(SCM_CAR(lp)));
1527             *p = (char *)alloc(strlen(s) + 1);
1528             strcpy(*p, s);
1529             p++;
1530         }
1531     } else {
1532         p = array = SCM_NEW_ARRAY(char*, len+1);
1533         ScmObj lp;
1534         SCM_FOR_EACH(lp, lis) {
1535             *p++ = Scm_GetString(SCM_STRING(SCM_CAR(lp)));
1536         }
1537     }
1538     *p = NULL;                  /* termination */
1539     return array;
1540 }
1541 
1542 /*----------------------------------------------------------------
1543  * printer
1544  */
string_putc(ScmChar ch,ScmPort * port,int bytemode)1545 static inline void string_putc(ScmChar ch, ScmPort *port, int bytemode)
1546 {
1547     char buf[6];
1548     switch (ch) {
1549     case '\\': SCM_PUTZ("\\\\", -1, port); break;
1550     case '"':  SCM_PUTZ("\\\"", -1, port); break;
1551     case '\n': SCM_PUTZ("\\n", -1, port); break;
1552     case '\t': SCM_PUTZ("\\t", -1, port); break;
1553     case '\r': SCM_PUTZ("\\r", -1, port); break;
1554     case '\f': SCM_PUTZ("\\f", -1, port); break;
1555     case '\0': SCM_PUTZ("\\0", -1, port); break;
1556     default:
1557         if (ch < ' ' || ch == 0x7f || (bytemode && ch >= 0x80)) {
1558             /* TODO: Should we provide 'legacy-compatible writer mode,
1559                which does not use ';' terminator? */
1560             snprintf(buf, 6, "\\x%02x;", (unsigned char)ch);
1561             SCM_PUTZ(buf, -1, port);
1562         } else {
1563             SCM_PUTC(ch, port);
1564         }
1565     }
1566 }
1567 
string_print(ScmObj obj,ScmPort * port,ScmWriteContext * ctx)1568 static void string_print(ScmObj obj, ScmPort *port, ScmWriteContext *ctx)
1569 {
1570     ScmString *str = SCM_STRING(obj);
1571     if (Scm_WriteContextMode(ctx) == SCM_WRITE_DISPLAY) {
1572         SCM_PUTS(str, port);
1573     } else {
1574         const ScmStringBody *b = SCM_STRING_BODY(str);
1575         if (SCM_STRING_BODY_SINGLE_BYTE_P(b)) {
1576             const char *cp = SCM_STRING_BODY_START(b);
1577             ScmSmallInt size = SCM_STRING_BODY_SIZE(b);
1578             if (SCM_STRING_BODY_INCOMPLETE_P(b)) {
1579                 /* TODO: Should we provide legacy-compatible writer mode,
1580                    which puts #*"..." instead? */
1581                 SCM_PUTZ("#**\"", -1, port);
1582             } else {
1583                 SCM_PUTC('"', port);
1584             }
1585             while (size--) {
1586                 string_putc(*cp++, port, SCM_STRING_BODY_INCOMPLETE_P(b));
1587             }
1588         } else {
1589             const char *cp = SCM_STRING_BODY_START(b);
1590             ScmSmallInt len = SCM_STRING_BODY_LENGTH(b);
1591 
1592             SCM_PUTC('"', port);
1593             while (len--) {
1594                 ScmChar ch;
1595                 SCM_CHAR_GET(cp, ch);
1596                 string_putc(ch, port, FALSE);
1597                 cp += SCM_CHAR_NBYTES(ch);
1598             }
1599         }
1600         SCM_PUTC('"', port);
1601     }
1602 }
1603 
1604 /*==================================================================
1605  *
1606  * String index building
1607  *
1608  */
1609 
string_body_index_needed(const ScmStringBody * sb)1610 static int string_body_index_needed(const ScmStringBody *sb)
1611 {
1612     return (!SCM_STRING_BODY_SINGLE_BYTE_P(sb)
1613             && !SCM_STRING_BODY_INCOMPLETE_P(sb)
1614             && SCM_STRING_BODY_SIZE(sb) >= 64);
1615 }
1616 
Scm_StringBodyFastIndexableP(const ScmStringBody * sb)1617 int Scm_StringBodyFastIndexableP(const ScmStringBody *sb)
1618 {
1619     return (!string_body_index_needed(sb)
1620             || SCM_STRING_BODY_HAS_INDEX(sb));
1621 }
1622 
compute_index_size(const ScmStringBody * sb,int interval)1623 static size_t compute_index_size(const ScmStringBody *sb, int interval)
1624 {
1625     ScmSmallInt len = SCM_STRING_BODY_LENGTH(sb);
1626     /* We don't store the first entry (0th character == 0th byte), and
1627        we use two extra entry for the signature and index_size.  So
1628        we need +1. */
1629     return ((len + interval - 1)/interval) + 1;
1630 }
1631 
build_index_array(const ScmStringBody * sb)1632 static void *build_index_array(const ScmStringBody *sb)
1633 {
1634     /* Signature byte is repeated in the first element of the vector */
1635 #define SIG8(type,sig)    (type)(sig)
1636 #define SIG16(type,sig)   ((type)((sig)<<8)|(sig))
1637 #define SIG32(type,sig)   ((type)(SIG16(type,sig)<<16)|SIG16(type,sig))
1638 #define SIG64(type,sig)   ((type)(SIG32(type,sig)<<32)|SIG32(type,sig))
1639 
1640 #define BUILD_ARRAY(type_, typeenum_, shift_, sigrep_)                  \
1641     do {                                                                \
1642         int interval = 1 << (shift_);                                   \
1643         size_t index_size = compute_index_size(sb, interval);           \
1644         type_ *vec = SCM_NEW_ATOMIC_ARRAY(type_, index_size);           \
1645         u_long sig = STRING_INDEX_SIGNATURE(shift_, typeenum_);         \
1646         vec[0] = sigrep_(type_,sig);                                    \
1647         vec[1] = (type_)index_size;                                     \
1648         const char *p = SCM_STRING_BODY_START(sb);                      \
1649         for (size_t i = 2; i < index_size; i++) {                       \
1650             const char *q = forward_pos(sb, p, interval);               \
1651             vec[i] = (type_)(q - SCM_STRING_BODY_START(sb));            \
1652             p = q;                                                      \
1653         }                                                               \
1654         return vec;                                                     \
1655     } while (0)
1656 
1657     /* Technically we can use index8 even if size is bigger than 256,
1658        as long as the last indexed character is within the range.  But
1659        checking it is too much. */
1660     if (sb->size < 256) {
1661         BUILD_ARRAY(uint8_t, STRING_INDEX8, 4, SIG8);
1662     } else if (sb->size < 8192) {
1663         /* 32 chars interval */
1664         BUILD_ARRAY(uint16_t, STRING_INDEX16, 5, SIG16);
1665     } else if (sb->size < 65536) {
1666         /* 64 chars interval */
1667         BUILD_ARRAY(uint16_t, STRING_INDEX16, 6, SIG16);
1668     }
1669 #if SIZEOF_LONG == 4
1670     else {
1671         /* 128 chars interval */
1672         BUILD_ARRAY(uint32_t, STRING_INDEX32, 7, SIG32);
1673     }
1674 #else /* SIZEOF_LONG != 4 */
1675     else if (sb->size < (1L<<32)) {
1676         /* 128 chars interval */
1677         BUILD_ARRAY(uint32_t, STRING_INDEX32, 7, SIG32);
1678     } else {
1679         /* 256 chars interval */
1680         BUILD_ARRAY(uint64_t, STRING_INDEX64, 8, SIG64);
1681     }
1682 #endif
1683 #undef BUILD_ARRAY
1684 }
1685 
Scm_StringBodyBuildIndex(ScmStringBody * sb)1686 void Scm_StringBodyBuildIndex(ScmStringBody *sb)
1687 {
1688     if (!string_body_index_needed(sb) || SCM_STRING_BODY_HAS_INDEX(sb)) return;
1689     /* This is idempotent, atomic operation; no need to lock.  */
1690     sb->index = build_index_array(sb);
1691 }
1692 
1693 /* For debugging */
Scm_StringBodyIndexDump(const ScmStringBody * sb,ScmPort * port)1694 void Scm_StringBodyIndexDump(const ScmStringBody *sb, ScmPort *port)
1695 {
1696     ScmStringIndex *index = STRING_INDEX(sb->index);
1697     if (index == NULL) {
1698         Scm_Printf(port, "(nil)\n");
1699         return;
1700     }
1701     int interval = STRING_INDEX_INTERVAL(index);
1702     size_t index_size = 0;
1703 
1704     switch (STRING_INDEX_TYPE(index)) {
1705     case STRING_INDEX8:
1706         Scm_Printf(port, "index8  ");
1707         index_size = (size_t)index->index8[1];
1708         break;
1709     case STRING_INDEX16:
1710         Scm_Printf(port, "index16 ");
1711         index_size = (size_t)index->index16[1];
1712         break;
1713     case STRING_INDEX32:
1714         Scm_Printf(port, "index32 ");
1715         index_size = (size_t)index->index32[1];
1716         break;
1717     case STRING_INDEX64:
1718         Scm_Printf(port, "index64 ");
1719         index_size = (size_t)index->index64[1];
1720         break;
1721     default:
1722         Scm_Printf(port, "unknown(%02x) ", (uint8_t)STRING_INDEX_TYPE(index));
1723     }
1724     Scm_Printf(port, " interval %d  size %d\n", interval, index_size-1);
1725     Scm_Printf(port, "        0         0\n");
1726     for (size_t i = 2; i < index_size; i++) {
1727         switch (STRING_INDEX_TYPE(index)) {
1728         case STRING_INDEX8:
1729             Scm_Printf(port, " %8ld  %8u\n", i-1, index->index8[i]); break;
1730         case STRING_INDEX16:
1731             Scm_Printf(port, " %8ld  %8u\n", i-1, index->index16[i]); break;
1732         case STRING_INDEX32:
1733             Scm_Printf(port, " %8ld  %8u\n", i-1, index->index32[i]); break;
1734         case STRING_INDEX64:
1735             Scm_Printf(port, " %8ld  %8lu\n",i-1, index->index64[i]); break;
1736         }
1737     }
1738 }
1739 
1740 
1741 /*==================================================================
1742  *
1743  * String pointer (OBSOLETED)
1744  *
1745  */
1746 
1747 #if GAUCHE_STRING_POINTER
1748 
1749 SCM_DEFINE_BUILTIN_CLASS_SIMPLE(Scm_StringPointerClass, NULL);
1750 
Scm_MakeStringPointer(ScmString * src,ScmSmallInt index,ScmSmallInt start,ScmSmallInt end)1751 ScmObj Scm_MakeStringPointer(ScmString *src, ScmSmallInt index,
1752                              ScmSmallInt start, ScmSmallInt end)
1753 {
1754     const ScmStringBody *srcb = SCM_STRING_BODY(src);
1755     ScmSmallInt len = SCM_STRING_BODY_LENGTH(srcb);
1756     ScmSmallInt effective_size;
1757     const char *sptr, *ptr, *eptr;
1758 
1759     SCM_CHECK_START_END(start, end, len);
1760     while (index < 0) index += (end - start) + 1;
1761     if (index > (end - start)) goto badindex;
1762 
1763     sptr = forward_pos(srcb, SCM_STRING_BODY_START(srcb), start);
1764     ptr = forward_pos(srcb, sptr, index);
1765     if (end == len) {
1766         eptr = SCM_STRING_BODY_END(srcb);
1767     } else {
1768         eptr = forward_pos(srcb, sptr, end - start);
1769     }
1770     effective_size = eptr - sptr;
1771 
1772     ScmStringPointer *sp = SCM_NEW(ScmStringPointer);
1773     SCM_SET_CLASS(sp, SCM_CLASS_STRING_POINTER);
1774     sp->length = (SCM_STRING_BODY_INCOMPLETE_P(srcb)? -1 : (end-start));
1775     sp->size = effective_size;
1776     sp->start = sptr;
1777     sp->index = index;
1778     sp->current = ptr;
1779     return SCM_OBJ(sp);
1780   badindex:
1781     Scm_Error("index out of range: %ld", index);
1782     return SCM_UNDEFINED;
1783 }
1784 
Scm_StringPointerRef(ScmStringPointer * sp)1785 ScmObj Scm_StringPointerRef(ScmStringPointer *sp)
1786 {
1787     ScmChar ch;
1788     if (sp->length < 0 || sp->size == sp->length) {
1789         if (sp->index >= sp->size) return SCM_EOF;
1790         ch = *(const unsigned char*)sp->current;
1791     } else {
1792         if (sp->index >= sp->length) return SCM_EOF;
1793         SCM_CHAR_GET(sp->current, ch);
1794     }
1795     return SCM_MAKE_CHAR(ch);
1796 }
1797 
Scm_StringPointerNext(ScmStringPointer * sp)1798 ScmObj Scm_StringPointerNext(ScmStringPointer *sp)
1799 {
1800     ScmChar ch;
1801     if (sp->length < 0 || sp->size == sp->length) {
1802         if (sp->index >= sp->size) return SCM_EOF;
1803         sp->index++;
1804         ch = *(const unsigned char*)sp->current++;
1805     } else {
1806         if (sp->index >= sp->length) return SCM_EOF;
1807         SCM_CHAR_GET(sp->current, ch);
1808         sp->index++;
1809         sp->current += SCM_CHAR_NFOLLOWS(*sp->current) + 1;
1810     }
1811     return SCM_MAKE_CHAR(ch);
1812 }
1813 
Scm_StringPointerPrev(ScmStringPointer * sp)1814 ScmObj Scm_StringPointerPrev(ScmStringPointer *sp)
1815 {
1816     ScmChar ch;
1817     if (sp->index <= 0) return SCM_EOF;
1818     if (sp->length < 0 || sp->size == sp->length) {
1819         sp->index--;
1820         ch = *(const unsigned char*)--sp->current;
1821     } else {
1822         const char *prev;
1823         SCM_CHAR_BACKWARD(sp->current, sp->start, prev);
1824         SCM_ASSERT(prev != NULL);
1825         SCM_CHAR_GET(prev, ch);
1826         sp->index--;
1827         sp->current = prev;
1828     }
1829     return SCM_MAKE_CHAR(ch);
1830 }
1831 
Scm_StringPointerSet(ScmStringPointer * sp,ScmSmallInt index)1832 ScmObj Scm_StringPointerSet(ScmStringPointer *sp, ScmSmallInt index)
1833 {
1834     if (index < 0) goto badindex;
1835     /* NB: Safe to cast index, for too large index would be rejected
1836        by the check. */
1837     if (sp->length < 0 || sp->size == sp->length) {
1838         if (index > sp->size) goto badindex;
1839         sp->index = index;
1840         sp->current = sp->start + index;
1841     } else {
1842         if (index > sp->length) goto badindex;
1843         sp->index = index;
1844         sp->current = forward_pos(NULL, sp->start, index);
1845     }
1846     return SCM_OBJ(sp);
1847   badindex:
1848     Scm_Error("index out of range: %ld", index);
1849     return SCM_UNDEFINED;
1850 }
1851 
Scm_StringPointerSubstring(ScmStringPointer * sp,int afterp)1852 ScmObj Scm_StringPointerSubstring(ScmStringPointer *sp, int afterp)
1853 {
1854     /* TODO: set SCM_STRING_TERMINATED if applicable. */
1855     if (sp->length < 0) {
1856         if (afterp)
1857             return SCM_OBJ(make_str(-1, sp->size - sp->index, sp->current,
1858                                     0, NULL));
1859         else
1860             return SCM_OBJ(make_str(-1, sp->index, sp->start, 0, NULL));
1861     } else {
1862         if (afterp)
1863             return SCM_OBJ(make_str(sp->length - sp->index,
1864                                     sp->start + sp->size - sp->current,
1865                                     sp->current, 0, NULL));
1866         else
1867             return SCM_OBJ(make_str(sp->index,
1868                                     sp->current - sp->start,
1869                                     sp->start, 0, NULL));
1870     }
1871 }
1872 
1873 /* Copy string pointer.
1874    Thanks to Alex Shinn (foof@synthcode.com) */
Scm_StringPointerCopy(ScmStringPointer * sp1)1875 ScmObj Scm_StringPointerCopy(ScmStringPointer *sp1)
1876 {
1877     ScmStringPointer *sp2 = SCM_NEW(ScmStringPointer);
1878     SCM_SET_CLASS(sp2, SCM_CLASS_STRING_POINTER);
1879     sp2->length  = sp1->length;
1880     sp2->size    = sp1->size;
1881     sp2->start   = sp1->start;
1882     sp2->index   = sp1->index;
1883     sp2->current = sp1->current;
1884     return SCM_OBJ(sp2);
1885 }
1886 
1887 /* Dump string pointer info for debugging.
1888    Thanks to Alex Shinn (foof@synthcode.com) */
Scm_StringPointerDump(ScmStringPointer * sp1)1889 void Scm_StringPointerDump(ScmStringPointer *sp1)
1890 {
1891     Scm_Printf(SCM_CUROUT,
1892                "<sp addr: %p len: %d size: %d start: %p index: %d cur: %p>\n",
1893                sp1, sp1->length, sp1->size, sp1->start, sp1->index,
1894                sp1->current);
1895 }
1896 
1897 #endif /*GAUCHE_STRING_POINTER*/
1898 
1899 /*==================================================================
1900  *
1901  * String cursor API
1902  *
1903  */
1904 
1905 /* Public interface */
Scm_StringCursorP(ScmObj obj)1906 int Scm_StringCursorP(ScmObj obj)
1907 {
1908     return SCM_STRING_CURSOR_P(obj);
1909 }
1910 
make_string_cursor(ScmString * src,const char * ptr)1911 static ScmObj make_string_cursor(ScmString *src, const char *ptr)
1912 {
1913     const ScmStringBody *srcb = SCM_STRING_BODY(src);
1914 
1915     if (ptr < SCM_STRING_BODY_START(srcb) ||
1916         ptr > SCM_STRING_BODY_END(srcb)) {
1917         Scm_Error("cursor out of range of %S: %ld",
1918                   SCM_OBJ(src),
1919                   (ScmSmallInt)(ptr - SCM_STRING_BODY_START(srcb)));
1920     }
1921 
1922     ScmSmallInt offset = (ScmSmallInt)(ptr - SCM_STRING_BODY_START(srcb));
1923     if (!SCM_VM_RUNTIME_FLAG_IS_SET(Scm_VM(), SCM_SAFE_STRING_CURSORS) &&
1924         SCM_STRING_CURSOR_FITS_SMALL_P(offset)) {
1925         return SCM_MAKE_STRING_CURSOR_SMALL(offset);
1926     }
1927 
1928     ScmStringCursorLarge *sc = SCM_NEW(ScmStringCursorLarge);
1929     SCM_SET_CLASS(sc, SCM_CLASS_STRING_CURSOR_LARGE);
1930     sc->offset = offset;
1931     sc->start = SCM_STRING_BODY_START(srcb);
1932     return SCM_OBJ(sc);
1933 }
1934 
Scm_MakeStringCursorFromIndex(ScmString * src,ScmSmallInt index)1935 ScmObj Scm_MakeStringCursorFromIndex(ScmString *src, ScmSmallInt index)
1936 {
1937     const ScmStringBody *srcb = SCM_STRING_BODY(src);
1938     ScmSmallInt len = SCM_STRING_BODY_LENGTH(srcb);
1939     if (index < 0 || index > len) {
1940         Scm_Error("index out of range: %ld", index);
1941     }
1942     return make_string_cursor(src, index2ptr(srcb, index));
1943 }
1944 
Scm_MakeStringCursorEnd(ScmString * src)1945 ScmObj Scm_MakeStringCursorEnd(ScmString *src)
1946 {
1947     const ScmStringBody *srcb = SCM_STRING_BODY(src);
1948 
1949     ScmSmallInt offset = SCM_STRING_BODY_END(srcb) - SCM_STRING_BODY_START(srcb);
1950     if (!SCM_VM_RUNTIME_FLAG_IS_SET(Scm_VM(), SCM_SAFE_STRING_CURSORS) &&
1951         SCM_STRING_CURSOR_FITS_SMALL_P(offset)) {
1952         return SCM_MAKE_STRING_CURSOR_SMALL(offset);
1953     }
1954     ScmStringCursorLarge *sc = SCM_NEW(ScmStringCursorLarge);
1955     SCM_SET_CLASS(sc, SCM_CLASS_STRING_CURSOR_LARGE);
1956     sc->offset = offset;
1957     sc->start = SCM_STRING_BODY_START(srcb);
1958     return SCM_OBJ(sc);
1959 }
1960 
Scm_StringCursorIndex(ScmString * src,ScmObj sc)1961 ScmObj Scm_StringCursorIndex(ScmString *src, ScmObj sc)
1962 {
1963     if (SCM_INTP(sc) || SCM_BIGNUMP(sc)) {
1964         return sc;              /* no validation */
1965     }
1966 
1967     const ScmStringBody *srcb = SCM_STRING_BODY(src);
1968     const char          *ptr  = NULL;
1969 
1970     if ((ptr = string_cursor_ptr(srcb, sc)) == NULL) {
1971         Scm_Error("must be either an index or a cursor: %S", sc);
1972     }
1973 
1974     if (SCM_STRING_BODY_SINGLE_BYTE_P(srcb) ||
1975         SCM_STRING_BODY_INCOMPLETE_P(srcb)) {
1976         return SCM_MAKE_INT(ptr - SCM_STRING_BODY_START(srcb));
1977     }
1978 
1979     const char *current = SCM_STRING_BODY_START(srcb);
1980     ScmSmallInt len     = SCM_STRING_BODY_LENGTH(srcb);
1981     ScmSmallInt index   = 0;
1982     while (index < len && current < ptr) {
1983         current += SCM_CHAR_NFOLLOWS(*current) + 1;
1984         index++;
1985     }
1986     if (current != ptr) {
1987         Scm_Error("cursor not pointed at the beginning of a character: %S", sc);
1988     }
1989 
1990     return SCM_MAKE_INT(index);
1991 }
1992 
Scm_StringCursorForward(ScmString * s,ScmObj sc,int nchars)1993 ScmObj Scm_StringCursorForward(ScmString* s, ScmObj sc, int nchars)
1994 {
1995     if (nchars < 0) {
1996         Scm_Error("nchars is negative: %ld", nchars);
1997     }
1998 
1999     if (SCM_INTEGERP(sc)) {
2000         return Scm_MakeStringCursorFromIndex(s, Scm_GetInteger(sc) + nchars);
2001     }
2002 
2003     const ScmStringBody *srcb = SCM_STRING_BODY(s);
2004     const char *ptr = string_cursor_ptr(srcb, sc);
2005     if (ptr == NULL) {
2006         Scm_Error("must be either an index or a cursor: %S", sc);
2007     }
2008     return make_string_cursor(s, forward_pos(srcb, ptr, nchars));
2009 }
2010 
Scm_StringCursorBack(ScmString * s,ScmObj sc,int nchars)2011 ScmObj Scm_StringCursorBack(ScmString* s, ScmObj sc, int nchars)
2012 {
2013     if (nchars < 0) {
2014         Scm_Error("nchars is negative: %ld", nchars);
2015     }
2016 
2017     if (SCM_INTP(sc) || SCM_BIGNUMP(sc)) {
2018         return Scm_MakeStringCursorFromIndex(s, Scm_GetInteger(sc) - nchars);
2019     }
2020 
2021     const ScmStringBody *srcb = SCM_STRING_BODY(s);
2022     const char *ptr = string_cursor_ptr(srcb, sc);
2023     if (ptr == NULL) {
2024         Scm_Error("must be either an index or a cursor: %S", sc);
2025     }
2026 
2027     if (SCM_STRING_BODY_SINGLE_BYTE_P(srcb) ||
2028         SCM_STRING_BODY_INCOMPLETE_P(srcb)) {
2029         return make_string_cursor(s, ptr - nchars);
2030     }
2031 
2032     while (nchars--) {
2033         const char *prev;
2034         SCM_CHAR_BACKWARD(ptr, SCM_STRING_BODY_START(srcb), prev);
2035         if (!prev) {
2036             Scm_Error("nchars out of range: %ld", nchars);
2037         }
2038         ptr = prev;
2039     }
2040 
2041     return make_string_cursor(s, ptr);
2042 }
2043 
Scm_StringRefCursor(ScmString * s,ScmObj sc,int range_error)2044 ScmChar Scm_StringRefCursor(ScmString* s, ScmObj sc, int range_error)
2045 {
2046     if (SCM_INTP(sc)) {
2047         return Scm_StringRef(s, SCM_INT_VALUE(sc), range_error);
2048     }
2049 
2050     const ScmStringBody *srcb = SCM_STRING_BODY(s);
2051     const char *ptr = string_cursor_ptr(srcb, sc);
2052     if (ptr == NULL) {
2053         Scm_Error("must be either an index or a cursor: %S", sc);
2054     }
2055     if (ptr == SCM_STRING_BODY_END(srcb)) {
2056         if (range_error) {
2057             Scm_Error("cursor is at the end: %S", sc);
2058         } else {
2059             return SCM_CHAR_INVALID;
2060         }
2061     }
2062     ScmChar ch;
2063     SCM_CHAR_GET(ptr, ch);
2064     return ch;
2065 }
2066 
Scm_SubstringCursor(ScmString * str,ScmObj start_scm,ScmObj end_scm)2067 ScmObj Scm_SubstringCursor(ScmString *str,
2068                            ScmObj start_scm, ScmObj end_scm)
2069 {
2070     const ScmStringBody *sb = SCM_STRING_BODY(str);
2071     const char *start = string_cursor_ptr(sb, start_scm);
2072     const char *end   = string_cursor_ptr(sb, end_scm);
2073 
2074     if (start && end) {
2075         return substring_cursor(sb, start, end, FALSE);
2076     }
2077 
2078     return substring(SCM_STRING_BODY(str),
2079                      Scm_GetInteger(Scm_StringCursorIndex(str, start_scm)),
2080                      Scm_GetInteger(Scm_StringCursorIndex(str, end_scm)),
2081                      FALSE, FALSE);
2082 }
2083 
Scm_StringCursorCompare(ScmObj sc1,ScmObj sc2,int (* numcmp)(ScmObj,ScmObj))2084 int Scm_StringCursorCompare(ScmObj sc1, ScmObj sc2,
2085                             int (*numcmp)(ScmObj, ScmObj))
2086 {
2087     /*
2088      * Handle indexes separately, we can't mix index and cursor
2089      * because cursor is byte offset, not index.
2090      */
2091     if (SCM_INTP(sc1) && SCM_INTP(sc2)) {
2092         return numcmp(sc1, sc2);
2093     }
2094 
2095     ScmSmallInt i1 = string_cursor_offset(sc1);
2096     ScmSmallInt i2 = string_cursor_offset(sc2);
2097     if (i1 < 0 || i2 < 0) {
2098         Scm_Error("arguments must be either both cursors or both indexes: %S vs %S", sc1, sc2);
2099     }
2100     return numcmp(SCM_MAKE_INT(i1), SCM_MAKE_INT(i2));
2101 }
2102 
2103 /*==================================================================
2104  *
2105  * Dynamic strings
2106  *
2107  */
2108 
2109 /* I used to use realloc() to grow the storage; now I avoid it, for
2110    Boehm GC's realloc almost always copies the original content and
2111    we don't get any benefit.
2112    The growing string is kept in the chained chunks.  The size of
2113    chunk getting bigger as the string grows, until a certain threshold.
2114    The memory for actual chunks and the chain is allocated separately,
2115    in order to use SCM_NEW_ATOMIC.
2116  */
2117 
2118 /* NB: it is important that DString functions don't call any
2119  * time-consuming procedures except memory allocation.   Some of
2120  * mutex code in other parts relies on that fact.
2121  */
2122 
2123 /* maximum chunk size */
2124 #define DSTRING_MAX_CHUNK_SIZE  8180
2125 
Scm_DStringInit(ScmDString * dstr)2126 void Scm_DStringInit(ScmDString *dstr)
2127 {
2128     dstr->init.bytes = 0;
2129     dstr->anchor = dstr->tail = NULL;
2130     dstr->current = dstr->init.data;
2131     dstr->end = dstr->current + SCM_DSTRING_INIT_CHUNK_SIZE;
2132     dstr->lastChunkSize = SCM_DSTRING_INIT_CHUNK_SIZE;
2133     dstr->length = 0;
2134 }
2135 
Scm_DStringSize(ScmDString * dstr)2136 ScmSmallInt Scm_DStringSize(ScmDString *dstr)
2137 {
2138     ScmSmallInt size;
2139     if (dstr->tail) {
2140         size = dstr->init.bytes;
2141         dstr->tail->chunk->bytes = dstr->current - dstr->tail->chunk->data;
2142         for (ScmDStringChain *chain = dstr->anchor; chain; chain = chain->next) {
2143             size += chain->chunk->bytes;
2144         }
2145     } else {
2146         size = dstr->init.bytes = dstr->current - dstr->init.data;
2147     }
2148     if (size > SCM_STRING_MAX_SIZE) {
2149         Scm_Error("Scm_DStringSize: size exceeded the range: %ld", size);
2150     }
2151     return size;
2152 }
2153 
newChunk(ScmSmallInt size)2154 static ScmDStringChunk *newChunk(ScmSmallInt size)
2155 {
2156     return SCM_NEW_ATOMIC2(ScmDStringChunk*,
2157                            (sizeof(ScmDStringChunk)
2158                             +size-SCM_DSTRING_INIT_CHUNK_SIZE));
2159 }
2160 
Scm__DStringRealloc(ScmDString * dstr,ScmSmallInt minincr)2161 void Scm__DStringRealloc(ScmDString *dstr, ScmSmallInt minincr)
2162 {
2163     /* sets the byte count of the last chunk */
2164     if (dstr->tail) {
2165         dstr->tail->chunk->bytes = dstr->current - dstr->tail->chunk->data;
2166     } else {
2167         dstr->init.bytes = dstr->current - dstr->init.data;
2168     }
2169 
2170     /* determine the size of the new chunk.  the increase factor 3 is
2171        somewhat arbitrary, determined by rudimental benchmarking. */
2172     ScmSmallInt newsize = dstr->lastChunkSize * 3;
2173     if (newsize > DSTRING_MAX_CHUNK_SIZE) {
2174         newsize = DSTRING_MAX_CHUNK_SIZE;
2175     }
2176     if (newsize < minincr) {
2177         newsize = minincr;
2178     }
2179 
2180     ScmDStringChunk *newchunk = newChunk(newsize);
2181     newchunk->bytes = 0;
2182     ScmDStringChain *newchain = SCM_NEW(ScmDStringChain);
2183 
2184     newchain->next = NULL;
2185     newchain->chunk = newchunk;
2186     if (dstr->tail) {
2187         dstr->tail->next = newchain;
2188         dstr->tail = newchain;
2189     } else {
2190         dstr->anchor = dstr->tail = newchain;
2191     }
2192     dstr->current = newchunk->data;
2193     dstr->end = newchunk->data + newsize;
2194     dstr->lastChunkSize = newsize;
2195 }
2196 
2197 /* Retrieve accumulated string. */
dstring_getz(ScmDString * dstr,ScmSmallInt * psiz,ScmSmallInt * plen,int noalloc)2198 static const char *dstring_getz(ScmDString *dstr,
2199                                 ScmSmallInt *psiz,
2200                                 ScmSmallInt *plen,
2201                                 int noalloc)
2202 {
2203     ScmSmallInt size, len;
2204     char *buf;
2205     if (dstr->anchor == NULL) {
2206         /* we only have one chunk */
2207         size = dstr->current - dstr->init.data;
2208         CHECK_SIZE(size);
2209         len = dstr->length;
2210         if (noalloc) {
2211             buf = dstr->init.data;
2212         } else {
2213             buf = SCM_STRDUP_PARTIAL(dstr->init.data, size);
2214         }
2215     } else {
2216         ScmDStringChain *chain = dstr->anchor;
2217         char *bptr;
2218 
2219         size = Scm_DStringSize(dstr);
2220         CHECK_SIZE(size);
2221         len = dstr->length;
2222         bptr = buf = SCM_NEW_ATOMIC2(char*, size+1);
2223 
2224         memcpy(bptr, dstr->init.data, dstr->init.bytes);
2225         bptr += dstr->init.bytes;
2226         for (; chain; chain = chain->next) {
2227             memcpy(bptr, chain->chunk->data, chain->chunk->bytes);
2228             bptr += chain->chunk->bytes;
2229         }
2230         *bptr = '\0';
2231     }
2232     if (len < 0) len = count_length(buf, size);
2233     if (plen) *plen = len;
2234     if (psiz) *psiz = size;
2235     return buf;
2236 }
2237 
Scm_DStringGet(ScmDString * dstr,u_long flags)2238 ScmObj Scm_DStringGet(ScmDString *dstr, u_long flags)
2239 {
2240     ScmSmallInt len, size;
2241     const char *str = dstring_getz(dstr, &size, &len, FALSE);
2242     return SCM_OBJ(make_str(len, size, str, flags|SCM_STRING_TERMINATED, NULL));
2243 }
2244 
2245 /* For conveninence.   Note that dstr may already contain NUL byte in it,
2246    in that case you'll get chopped string. */
Scm_DStringGetz(ScmDString * dstr)2247 const char *Scm_DStringGetz(ScmDString *dstr)
2248 {
2249     ScmSmallInt len, size;
2250     return dstring_getz(dstr, &size, &len, FALSE);
2251 }
2252 
2253 /* Concatenate all chains in DString into one chunk.  Externally nothing
2254    really changes, but this can be used to optimize allocation. */
Scm_DStringWeld(ScmDString * dstr)2255 void Scm_DStringWeld(ScmDString *dstr)
2256 {
2257     if (dstr->anchor == NULL) return; /* nothing to do */
2258     ScmDStringChain *chain = dstr->anchor;
2259     ScmSmallInt size = Scm_DStringSize(dstr);
2260     ScmSmallInt bufsiz = size + (dstr->end - dstr->current);
2261     ScmDStringChunk *newchunk = newChunk(bufsiz);
2262     newchunk->bytes = size;
2263     char *bptr = newchunk->data;
2264     memcpy(bptr, dstr->init.data, dstr->init.bytes);
2265     bptr += dstr->init.bytes;
2266     for (; chain; chain = chain->next) {
2267         memcpy(bptr, chain->chunk->data, chain->chunk->bytes);
2268         bptr += chain->chunk->bytes;
2269     }
2270     dstr->init.bytes = 0;
2271     dstr->anchor->chunk = newchunk;
2272     dstr->anchor->next = NULL;
2273     dstr->tail = dstr->anchor;
2274     dstr->current = newchunk->data + size;
2275     dstr->end = newchunk->data + bufsiz;
2276     dstr->lastChunkSize = bufsiz;
2277 }
2278 
2279 /* Returns the current content of DString, along with byte size and character
2280    length. The returned pointer may not be NUL-terminated.
2281 
2282    Unlike Scm_DStringGet[z], returned pointer can directly points into
2283    the internal buffer of Scm_DString; especially, this never allocates
2284    if DString only uses initial buffer.  The caller should be aware that
2285    the returned content may be altered by further DString operation. */
Scm_DStringPeek(ScmDString * dstr,ScmSmallInt * size,ScmSmallInt * len)2286 const char *Scm_DStringPeek(ScmDString *dstr,
2287                             ScmSmallInt *size, ScmSmallInt *len)
2288 {
2289     Scm_DStringWeld(dstr);
2290     if (dstr->anchor == NULL) {
2291         if (size) *size = dstr->current - dstr->init.data;
2292         if (len)  *len = dstr->length;
2293         return dstr->init.data;
2294     } else {
2295         if (size) *size = dstr->anchor->chunk->bytes;
2296         if (len)  *len = dstr->length;
2297         return dstr->anchor->chunk->data;
2298     }
2299 }
2300 
Scm_DStringPutz(ScmDString * dstr,const char * str,ScmSmallInt size)2301 void Scm_DStringPutz(ScmDString *dstr, const char *str, ScmSmallInt size)
2302 {
2303     if (size < 0) size = strlen(str);
2304     if (dstr->current + size > dstr->end) {
2305         Scm__DStringRealloc(dstr, size);
2306     }
2307     memcpy(dstr->current, str, size);
2308     dstr->current += size;
2309     if (dstr->length >= 0) {
2310         ScmSmallInt len = count_length(str, size);
2311         if (len >= 0) dstr->length += len;
2312         else dstr->length = -1;
2313     }
2314 }
2315 
Scm_DStringAdd(ScmDString * dstr,ScmString * str)2316 void Scm_DStringAdd(ScmDString *dstr, ScmString *str)
2317 {
2318     const ScmStringBody *b = SCM_STRING_BODY(str);
2319     ScmSmallInt size = SCM_STRING_BODY_SIZE(b);
2320     if (size == 0) return;
2321     if (dstr->current + size > dstr->end) {
2322         Scm__DStringRealloc(dstr, size);
2323     }
2324     memcpy(dstr->current, SCM_STRING_BODY_START(b), size);
2325     dstr->current += size;
2326     if (dstr->length >= 0 && !SCM_STRING_BODY_INCOMPLETE_P(b)) {
2327         dstr->length += SCM_STRING_BODY_LENGTH(b);
2328     } else {
2329         dstr->length = -1;
2330     }
2331 }
2332 
Scm_DStringPutb(ScmDString * ds,char byte)2333 void Scm_DStringPutb(ScmDString *ds, char byte)
2334 {
2335     SCM_DSTRING_PUTB(ds, byte);
2336 }
2337 
Scm_DStringPutc(ScmDString * ds,ScmChar ch)2338 void Scm_DStringPutc(ScmDString *ds, ScmChar ch)
2339 {
2340     SCM_DSTRING_PUTC(ds, ch);
2341 }
2342 
2343 /* Truncate DString at the specified size.
2344    Returns after-truncation size (it may be smaller than newsize if
2345    the original DString isn't as large as newsize. */
Scm_DStringTruncate(ScmDString * dstr,ScmSmallInt newsize)2346 ScmSmallInt Scm_DStringTruncate(ScmDString *dstr, ScmSmallInt newsize)
2347 {
2348     ScmSmallInt origsize = Scm_DStringSize(dstr);
2349 
2350     if (newsize < dstr->init.bytes) {
2351         dstr->init.bytes = newsize;
2352         dstr->anchor = NULL;
2353         dstr->tail = NULL;
2354         dstr->current = dstr->init.data + newsize;
2355         dstr->end = dstr->init.data + SCM_DSTRING_INIT_CHUNK_SIZE;
2356     } else {
2357         if (newsize >= origsize) return origsize;
2358         ScmDStringChain *chain = dstr->anchor;
2359         ScmSmallInt ss = dstr->init.bytes;
2360         for (; chain; chain = chain->next) {
2361             if (newsize < ss + chain->chunk->bytes) {
2362                 /* truncate this chunk */
2363                 if (chain == dstr->tail) {
2364                     chain->chunk->bytes = newsize - ss;
2365                     dstr->current = chain->chunk->data + newsize - ss;
2366                 } else {
2367                     dstr->lastChunkSize = chain->chunk->bytes;
2368                     dstr->end = chain->chunk->data + chain->chunk->bytes;
2369                     chain->chunk->bytes = newsize - ss;
2370                     chain->next = NULL;
2371                     dstr->tail = chain;
2372                     dstr->current = chain->chunk->data + newsize - ss;
2373                 }
2374                 break;
2375             }
2376             ss += chain->chunk->bytes;
2377         }
2378         SCM_ASSERT(chain != NULL);
2379     }
2380 
2381     /* If we accumulated only ASCII, we can adjust length as well. */
2382     if (dstr->length == origsize || newsize == 0) dstr->length = newsize;
2383     else                                          dstr->length = -1;
2384     return newsize;
2385 }
2386 
2387 
2388 /* for debug */
Scm_DStringDump(FILE * out,ScmDString * dstr)2389 void Scm_DStringDump(FILE *out, ScmDString *dstr)
2390 {
2391     fprintf(out, "DString %p\n", dstr);
2392     if (dstr->anchor) {
2393         fprintf(out, "  chunk0[%3ld] = \"", dstr->init.bytes);
2394         SCM_IGNORE_RESULT(fwrite(dstr->init.data, 1, dstr->init.bytes, out));
2395         fprintf(out, "\"\n");
2396         ScmDStringChain *chain = dstr->anchor;
2397         for (int i=1; chain; chain = chain->next, i++) {
2398             ScmSmallInt size = (chain->next? chain->chunk->bytes : (dstr->current - dstr->tail->chunk->data));
2399             fprintf(out, "  chunk%d[%3ld] = \"", i, size);
2400             SCM_IGNORE_RESULT(fwrite(chain->chunk->data, 1, size, out));
2401             fprintf(out, "\"\n");
2402         }
2403     } else {
2404         ScmSmallInt size = dstr->current - dstr->init.data;
2405         fprintf(out, "  chunk0[%3ld] = \"", size);
2406         SCM_IGNORE_RESULT(fwrite(dstr->init.data, 1, size, out));
2407         fprintf(out, "\"\n");
2408     }
2409 }
2410