1 /*
2  * read.c - reader
3  *
4  *   Copyright (c) 2000-2020  Shiro Kawai  <shiro@acm.org>
5  *
6  *   Redistribution and use in source and binary forms, with or without
7  *   modification, are permitted provided that the following conditions
8  *   are met:
9  *
10  *   1. Redistributions of source code must retain the above copyright
11  *      notice, this list of conditions and the following disclaimer.
12  *
13  *   2. Redistributions in binary form must reproduce the above copyright
14  *      notice, this list of conditions and the following disclaimer in the
15  *      documentation and/or other materials provided with the distribution.
16  *
17  *   3. Neither the name of the authors nor the names of its contributors
18  *      may be used to endorse or promote products derived from this
19  *      software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
27  *   TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28  *   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29  *   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30  *   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31  *   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <stdio.h>
35 #include <ctype.h>
36 #include <math.h>
37 #define LIBGAUCHE_BODY
38 #include "gauche.h"
39 #include "gauche/char_attr.h"
40 #include "gauche/priv/portP.h"
41 #include "gauche/priv/builtin-syms.h"
42 #include "gauche/priv/readerP.h"
43 
44 /*
45  * READ
46  */
47 
48 static void   read_context_flush(ScmReadContext *ctx);
49 static ScmObj read_internal(ScmPort *port, ScmReadContext *ctx);
50 static ScmObj read_item(ScmPort *port, ScmReadContext *ctx);
51 static ScmObj read_list(ScmPort *port, ScmChar closer, ScmReadContext *ctx);
52 static ScmObj read_vector(ScmPort *port, ScmChar closer, ScmReadContext *ctx);
53 static ScmObj read_string(ScmPort *port, int incompletep, ScmReadContext *ctx);
54 static ScmObj read_quoted(ScmPort *port, ScmObj quoter, ScmReadContext *ctx);
55 static ScmObj read_char(ScmPort *port, ScmReadContext *ctx);
56 static ScmObj read_word(ScmPort *port, ScmChar initial, ScmReadContext *ctx,
57                         int temp_case_fold, int include_hash_sign);
58 static ScmObj read_symbol(ScmPort *port, ScmChar initial, ScmReadContext *ctx);
59 static ScmObj read_immediate_symbol(ScmPort *port, ScmChar initial,
60                                     int interned, const char *preceding,
61                                     ScmReadContext *ctx);
62 static ScmObj read_number(ScmPort *port, ScmChar initial,
63                           int radix, /* #<radix>r case */
64                           ScmReadContext *ctx);
65 static ScmObj read_symbol_or_number(ScmPort *port, ScmChar initial, ScmReadContext *ctx);
66 static ScmObj read_escaped_symbol(ScmPort *port, ScmChar delim, int interned,
67                                   ScmReadContext *ctx);
68 static ScmObj read_keyword(ScmPort *port, ScmReadContext *ctx);
69 static ScmObj read_regexp(ScmPort *port);
70 static ScmObj read_charset(ScmPort *port);
71 static ScmObj read_sharp_comma(ScmPort *port, ScmReadContext *ctx);
72 static ScmObj read_sharp_asterisk(ScmPort *port, ScmReadContext *ctx);
73 static ScmObj process_sharp_comma(ScmPort *port, ScmObj key, ScmObj args,
74                                   ScmReadContext *ctx, int has_ref);
75 static ScmObj read_shebang(ScmPort *port, ScmReadContext *ctx);
76 static ScmObj read_num_prefixed(ScmPort *port, ScmChar ch, ScmReadContext *ctx);
77 static ScmObj read_sharp_word(ScmPort *port, char c, ScmReadContext *ctx);
78 
79 /* Table of 'read-time constructor' in SRFI-10 */
80 static struct {
81     ScmHashTable *table;
82     ScmInternalMutex mutex;
83 } readCtorData;
84 
85 /* Table of hash-bang directive */
86 static struct {
87     ScmHashTable *table;
88     ScmInternalMutex mutex;
89 } hashBangData;
90 
91 /* Parameter location for default reader mode */
92 static ScmPrimitiveParameter *defaultReadContext;
93 
94 /*----------------------------------------------------------------
95  * Entry points
96  *   Note: Entire read operation are done while locking the input port.
97  *   So we can use 'unsafe' version of port operations inside this file.
98  *   The lock is removed if reader routine signals an error.  It is OK
99  *   to call read routine recursively.
100  */
Scm_ReadWithContext(ScmObj port,ScmReadContext * ctx)101 ScmObj Scm_ReadWithContext(ScmObj port, ScmReadContext *ctx)
102 {
103     ScmVM *vm = Scm_VM();
104     volatile ScmObj r = SCM_NIL;
105     if (!SCM_PORTP(port) || SCM_PORT_DIR(port) != SCM_PORT_INPUT) {
106         Scm_Error("input port required: %S", port);
107     }
108     if (!(ctx->flags & RCTX_RECURSIVELY)) {
109         ctx->table = NULL;
110         ctx->pending = SCM_NIL;
111     }
112     if (PORT_LOCKED(SCM_PORT(port), vm)) {
113         r = read_item(SCM_PORT(port), ctx);
114     } else {
115         PORT_LOCK(SCM_PORT(port), vm);
116         PORT_SAFE_CALL(SCM_PORT(port),
117                        r = read_item(SCM_PORT(port), ctx), /*no cleanup*/);
118         PORT_UNLOCK(SCM_PORT(port));
119     }
120     if (!(ctx->flags & RCTX_RECURSIVELY)) {
121         read_context_flush(ctx);
122     }
123     return r;
124 }
125 
Scm_Read(ScmObj port)126 ScmObj Scm_Read(ScmObj port)
127 {
128     return Scm_ReadWithContext(port, Scm_MakeReadContext(NULL));
129 }
130 
131 /* Convenience functions */
Scm_ReadFromString(ScmString * str)132 ScmObj Scm_ReadFromString(ScmString *str)
133 {
134     ScmObj inp = Scm_MakeInputStringPort(str, TRUE);
135     ScmReadContext *ctx = Scm_MakeReadContext(NULL);
136     ScmObj r = read_item(SCM_PORT(inp), ctx);
137     read_context_flush(ctx);
138     return r;
139 }
140 
Scm_ReadFromCString(const char * cstr)141 ScmObj Scm_ReadFromCString(const char *cstr)
142 {
143     ScmObj s = SCM_MAKE_STR_IMMUTABLE(cstr);
144     ScmObj inp = Scm_MakeInputStringPort(SCM_STRING(s), TRUE);
145     ScmReadContext *ctx = Scm_MakeReadContext(NULL);
146     ScmObj r = read_item(SCM_PORT(inp), ctx);
147     read_context_flush(ctx);
148     return r;
149 }
150 
Scm_ReadListWithContext(ScmObj port,ScmChar closer,ScmReadContext * ctx)151 ScmObj Scm_ReadListWithContext(ScmObj port, ScmChar closer, ScmReadContext *ctx)
152 {
153     ScmVM *vm = Scm_VM();
154     volatile ScmObj r = SCM_NIL;
155     if (!SCM_PORTP(port) || SCM_PORT_DIR(port) != SCM_PORT_INPUT) {
156         Scm_Error("input port required: %S", port);
157     }
158     if (!(ctx->flags & RCTX_RECURSIVELY)) {
159         ctx->table = NULL;
160         ctx->pending = SCM_NIL;
161     }
162     if (PORT_LOCKED(SCM_PORT(port), vm)) {
163         r = read_list(SCM_PORT(port), closer, ctx);
164     } else {
165         PORT_LOCK(SCM_PORT(port), vm);
166         PORT_SAFE_CALL(SCM_PORT(port),
167                        r = read_list(SCM_PORT(port), closer, ctx),
168                        /*no cleanup*/);
169         PORT_UNLOCK(SCM_PORT(port));
170     }
171     if (!(ctx->flags & RCTX_RECURSIVELY)) {
172         read_context_flush(ctx);
173     }
174     return r;
175 }
176 
Scm_ReadList(ScmObj port,ScmChar closer)177 ScmObj Scm_ReadList(ScmObj port, ScmChar closer)
178 {
179     ScmReadContext *ctx = Scm_MakeReadContext(NULL);
180     return Scm_ReadListWithContext(port, closer, ctx);
181 }
182 
183 /*----------------------------------------------------------------
184  * Read context
185  */
186 
Scm_CurrentReadContext()187 ScmReadContext *Scm_CurrentReadContext()
188 {
189     ScmObj c = Scm_PrimitiveParameterRef(Scm_VM(), defaultReadContext);
190     SCM_ASSERT(SCM_READ_CONTEXT_P(c));
191     return SCM_READ_CONTEXT(c);
192 }
193 
Scm_SetCurrentReadContext(ScmReadContext * ctx)194 ScmReadContext *Scm_SetCurrentReadContext(ScmReadContext *ctx)
195 {
196     ScmObj p = Scm_PrimitiveParameterSet(Scm_VM(), defaultReadContext,
197                                          SCM_OBJ(ctx));
198     SCM_ASSERT(SCM_READ_CONTEXT_P(p));
199     return SCM_READ_CONTEXT(p);
200 }
201 
make_read_context(ScmReadContext * proto)202 static ScmReadContext *make_read_context(ScmReadContext *proto)
203 {
204     ScmReadContext *ctx = SCM_NEW(ScmReadContext);
205     SCM_SET_CLASS(ctx, SCM_CLASS_READ_CONTEXT);
206     ctx->flags = proto ? proto->flags : RCTX_SOURCE_INFO;
207     ctx->table = NULL;
208     ctx->pending = SCM_NIL;
209     return ctx;
210 }
211 
Scm_MakeReadContext(ScmReadContext * proto)212 ScmReadContext *Scm_MakeReadContext(ScmReadContext *proto)
213 {
214     return make_read_context(proto? proto : Scm_CurrentReadContext());
215 }
216 
read_context_print(ScmObj obj,ScmPort * port,ScmWriteContext * ctx SCM_UNUSED)217 static void read_context_print(ScmObj obj, ScmPort *port,
218                                ScmWriteContext *ctx SCM_UNUSED)
219 {
220     Scm_Printf(port, "#<read-context %p>", obj);
221 }
222 
223 SCM_DEFINE_BUILTIN_CLASS_SIMPLE(Scm_ReadContextClass, read_context_print);
224 
225 /* public api */
Scm_ReadContextLiteralImmutable(ScmReadContext * ctx)226 int Scm_ReadContextLiteralImmutable(ScmReadContext *ctx)
227 {
228     return (ctx->flags & RCTX_LITERAL_IMMUTABLE);
229 }
230 
231 /*----------------------------------------------------------------
232  * Error
233  */
234 
Scm_ReadError(ScmPort * port,const char * msg,...)235 void Scm_ReadError(ScmPort *port, const char *msg, ...)
236 {
237     ScmObj ostr = Scm_MakeOutputStringPort(TRUE);
238     ScmObj name = Scm_PortName(port);
239     ScmSize line = Scm_PortLine(port);
240 
241     Scm_Printf(SCM_PORT(ostr), "Read error at %S:",
242                SCM_STRINGP(name)? name : SCM_OBJ(SCM_MAKE_STR("??")));
243     if (line >= 0) {
244         Scm_Printf(SCM_PORT(ostr), "line %d: ", line);
245     }
246     va_list ap;
247     va_start(ap, msg);
248     Scm_Vprintf(SCM_PORT(ostr), msg, ap, TRUE);
249     va_end(ap);
250 
251     ScmObj rerr = Scm_MakeReadError(Scm_GetOutputString(SCM_PORT(ostr), 0),
252                                     port, line);
253     Scm_Raise(rerr, 0);
254 }
255 
256 /*----------------------------------------------------------------
257  * Read reference
258  */
259 
260 /* Read reference is a proxy object to for referenced object (#N=).
261  */
262 
263 static void read_reference_print(ScmObj obj, ScmPort *port,
264                                  ScmWriteContext *ctx);
265 SCM_DEFINE_BUILTIN_CLASS_SIMPLE(Scm_ReadReferenceClass, read_reference_print);
266 
Scm_MakeReadReference(void)267 ScmObj Scm_MakeReadReference(void)
268 {
269     ScmReadReference *a = SCM_NEW(ScmReadReference);
270     SCM_SET_CLASS(a, SCM_CLASS_READ_REFERENCE);
271     a->value = SCM_UNBOUND;
272     return SCM_OBJ(a);
273 }
274 
read_reference_print(ScmObj obj SCM_UNUSED,ScmPort * port,ScmWriteContext * ctx SCM_UNUSED)275 static void read_reference_print(ScmObj obj SCM_UNUSED, ScmPort *port,
276                                  ScmWriteContext *ctx SCM_UNUSED)
277 {
278     Scm_Printf(port, "#<read-reference>");
279 }
280 
ref_push(ScmReadContext * ctx,ScmObj obj,ScmObj finisher)281 static void ref_push(ScmReadContext *ctx, ScmObj obj, ScmObj finisher)
282 {
283     ctx->pending = Scm_Acons(obj, finisher, ctx->pending);
284 }
285 
ref_val(ScmObj ref)286 static ScmObj ref_val(ScmObj ref)
287 {
288     if (!SCM_READ_REFERENCE_REALIZED(ref)) {
289         Scm_Error("reader encontered unresolved read reference.  Implementation error?");
290     }
291     return SCM_READ_REFERENCE(ref)->value;
292 }
293 
ref_register(ScmReadContext * ctx,ScmObj obj,int refnum)294 static ScmObj ref_register(ScmReadContext *ctx, ScmObj obj, int refnum)
295 {
296     SCM_ASSERT(ctx->table);
297     Scm_HashTableSet(ctx->table, SCM_MAKE_INT(refnum), obj, 0);
298     return obj;
299 }
300 
301 /* ctx->pending contains an assoc list of objects who contains read reference
302    which should be resolved.
303    The car of each entry is the object that needs to be fixed, and the
304    cdr of eacy entry may contain a finisher procedure (if the object is
305    created by read-time constructor.
306 */
read_context_flush(ScmReadContext * ctx)307 static void read_context_flush(ScmReadContext *ctx)
308 {
309     ScmObj cp, ep;
310 
311     SCM_FOR_EACH(cp, ctx->pending) {
312         ScmObj entry = SCM_CAR(cp);
313         SCM_ASSERT(SCM_PAIRP(entry));
314         ScmObj obj = SCM_CAR(entry);
315         ScmObj finisher = SCM_CDR(entry);
316 
317         if (!SCM_FALSEP(finisher)) {
318             Scm_ApplyRec(finisher, SCM_LIST1(obj));
319         } else if (SCM_PAIRP(obj)) {
320             SCM_FOR_EACH(ep, obj) {
321                 if (SCM_READ_REFERENCE_P(SCM_CAR(ep))) {
322                     SCM_SET_CAR_UNCHECKED(ep, ref_val(SCM_CAR(ep)));
323                 }
324                 if (SCM_READ_REFERENCE_P(SCM_CDR(ep))) {
325                     /* in case we have (... . #N#) */
326                     SCM_SET_CDR_UNCHECKED(ep, ref_val(SCM_CDR(ep)));
327                     break;
328                 }
329             }
330         } else if (SCM_VECTORP(obj)) {
331             int i, len = SCM_VECTOR_SIZE(obj);
332             for (i=0; i<len; i++) {
333                 ep = SCM_VECTOR_ELEMENT(obj, i);
334                 if (SCM_READ_REFERENCE_P(ep)) {
335                     SCM_VECTOR_ELEMENTS(obj)[i] = ref_val(ep);
336                 }
337             }
338         } else {
339             Scm_Error("read_context_flush: recursive reference only supported with vector and lists");
340         }
341     }
342 }
343 
344 /*----------------------------------------------------------------
345  * Miscellaneous routines
346  */
347 
348 /* Table of initial 128 bytes of ASCII characters to dispatch for
349    special meanings.
350     bit 0 : a valid constituent char of words
351     bit 1 : candidate of case folding
352 */
353 static const unsigned char ctypes[] = {
354     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
355     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
356  /*     !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
357     0,  1,  0,  0,  1,  1,  1,  0,  0,  0,  1,  1,  0,  1,  1,  1,
358  /* 0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
359     1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  0,  1,  1,  1,  1,
360  /* @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
361     1,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
362  /* P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
363     3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  0,  0,  0,  1,  1,
364  /* `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
365     0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
366  /* p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~   ^? */
367     1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  0,
368 };
369 
370 /* For characters >= 0x80, we follow R[67]RS. */
char_word_constituent(int c,int include_hash_sign)371 inline static int char_word_constituent(int c, int include_hash_sign)
372 {
373     if (c < 128) {
374         return ((c >= 0 && (ctypes[(unsigned char)c]&1))
375                 || (c == '#' && include_hash_sign));
376     } else if (c == 0x200c || c == 0x200d) {
377         return TRUE;
378     } else {
379         switch (Scm_CharGeneralCategory(c)) {
380         case SCM_CHAR_CATEGORY_Lu:
381         case SCM_CHAR_CATEGORY_Ll:
382         case SCM_CHAR_CATEGORY_Lt:
383         case SCM_CHAR_CATEGORY_Lm:
384         case SCM_CHAR_CATEGORY_Lo:
385         case SCM_CHAR_CATEGORY_Mn:
386         case SCM_CHAR_CATEGORY_Nl:
387         case SCM_CHAR_CATEGORY_No:
388         case SCM_CHAR_CATEGORY_Pd:
389         case SCM_CHAR_CATEGORY_Pc:
390         case SCM_CHAR_CATEGORY_Po:
391         case SCM_CHAR_CATEGORY_Sc:
392         case SCM_CHAR_CATEGORY_Sm:
393         case SCM_CHAR_CATEGORY_Sk:
394         case SCM_CHAR_CATEGORY_So:
395         case SCM_CHAR_CATEGORY_Co: return TRUE;
396         default: return FALSE;
397         }
398     }
399 }
400 
char_word_case_fold(int c)401 inline static int char_word_case_fold(int c)
402 {
403     /*NB: Should we adopt full-fledged case folding for #!fold-case mode?
404       It seems overkill to me.  For now, we only handles ASCII range.*/
405     return (c >= 0 && c < 128 && (ctypes[(unsigned char)c]&2));
406 }
407 
408 /* R7RS 7.1.1 <delimiter> */
char_is_delimiter(ScmChar ch)409 static int char_is_delimiter(ScmChar ch)
410 {
411     return ((ch < 0x80 && strchr("()\";| \t\n\r", ch) != NULL)
412             || SCM_CHAR_EXTRA_WHITESPACE(ch));
413 }
414 
read_nested_comment(ScmPort * port,ScmReadContext * ctx SCM_UNUSED)415 static void read_nested_comment(ScmPort *port,
416                                 ScmReadContext *ctx SCM_UNUSED)
417 {
418     int nesting = 0;
419     ScmSize line = Scm_PortLine(port);
420 
421     for (ScmChar c = Scm_GetcUnsafe(port);;) {
422         switch (c) {
423         case '#': {
424             ScmChar c1 = Scm_GetcUnsafe(port);
425             if (c1 == '|')   { nesting++; break; }
426             else if (c1 == EOF) goto eof;
427             else c = c1;
428             continue;
429         }
430         case '|': {
431             ScmChar c1 = Scm_GetcUnsafe(port);
432             if (c1 == '#') {
433                 if (nesting-- == 0) {
434                     return;
435                 }
436                 break;
437             }
438             else if (c1 == EOF) goto eof;
439             else c = c1;
440             continue;
441         }
442         case EOF:
443           eof:
444             Scm_ReadError(port, "encountered EOF inside nested multi-line comment (comment begins at line %d)", line);
445         default:
446             break;
447         }
448         c = Scm_GetcUnsafe(port);
449     }
450 }
451 
read_comment(ScmPort * port)452 static void read_comment(ScmPort *port) /* leading semicolon is already read */
453 {
454     for (;;) {
455         /* NB: comment may contain unexpected character code.
456            for the safety, we read bytes here. */
457         int c = Scm_GetbUnsafe(port);
458         if (c == '\n' || c == EOF) break;
459     }
460 }
461 
skipws(ScmPort * port,ScmReadContext * ctx SCM_UNUSED)462 static int skipws(ScmPort *port, ScmReadContext *ctx SCM_UNUSED)
463 {
464     for (;;) {
465         int c = Scm_GetcUnsafe(port);
466         if (c == EOF) return c;
467         if (c <= 127) {
468             if (isspace(c)) continue;
469             if (c == ';') {
470                 read_comment(port);
471                 continue;
472             }
473             return c;
474         }
475         else if (!SCM_CHAR_EXTRA_WHITESPACE(c)) return c;
476     }
477 }
478 
reject_in_r7(ScmPort * port,ScmReadContext * ctx SCM_UNUSED,const char * token)479 static void reject_in_r7(ScmPort *port,
480                          ScmReadContext *ctx SCM_UNUSED,
481                          const char *token)
482 {
483     if (SCM_EQ(Scm_GetPortReaderLexicalMode(port), SCM_SYM_STRICT_R7)) {
484         Scm_ReadError(port,
485                       "lexical syntax %s isn't allowed in strict R7RS mode",
486                       token);
487     }
488 }
489 
read_internal(ScmPort * port,ScmReadContext * ctx)490 static ScmObj read_internal(ScmPort *port, ScmReadContext *ctx)
491 {
492     int c = skipws(port, ctx);
493     switch (c) {
494     case '(':
495         return read_list(port, ')', ctx);
496     case '"':
497         return read_string(port, FALSE, ctx);
498     case '#':
499         {
500             int c1 = Scm_GetcUnsafe(port);
501             switch (c1) {
502             case EOF:
503                 Scm_ReadError(port, "premature #-sequence at EOF");
504                 return SCM_UNDEFINED; /* dummy */
505             case 't':; case 'T': return read_sharp_word(port, 't', ctx);
506             case 'f':; case 'F': return read_sharp_word(port, 'f', ctx);
507             case 's':; case 'S': return read_sharp_word(port, 's', ctx);
508             case 'u':; case 'U': return read_sharp_word(port, 'u', ctx);
509             case 'c':; case 'C': return read_sharp_word(port, 'c', ctx);
510             case '(':
511                 return read_vector(port, ')', ctx);
512             case '\\':
513                 return read_char(port, ctx);
514             case 'x':; case 'X':; case 'o':; case 'O':;
515             case 'b':; case 'B':; case 'd':; case 'D':;
516             case 'e':; case 'E':; case 'i':; case 'I':;
517                 Scm_UngetcUnsafe(c1, port);
518                 return read_number(port, c, 0, ctx); /* let StringToNumber handle radix prefix */
519             case '!':
520                 /* #! is either a script shebang or a reader directive */
521                 return read_shebang(port, ctx);
522             case '"': {
523                 /* #"..." - string interpolation  */
524                 reject_in_r7(port, ctx, "#\"...\"");
525                 Scm_UngetcUnsafe(c1, port);
526                 ScmObj form = read_item(port, ctx);
527                 return process_sharp_comma(port,
528                                            SCM_SYM_STRING_INTERPOLATE,
529                                            SCM_LIST2(form, SCM_FALSE),
530                                            ctx, FALSE);
531             }
532             case '/':
533                 /* #/.../ literal regexp */
534                 reject_in_r7(port, ctx, "#/.../");
535                 return read_regexp(port);
536             case '[':
537                 /* #[...] literal charset */
538                 reject_in_r7(port, ctx, "#[...]");
539                 return read_charset(port);
540             case ',':
541                 /* #,(form) - SRFI-10 read-time macro */
542                 return read_sharp_comma(port, ctx);
543             case '|':
544                 /* #| - block comment (SRFI-30)
545                    it is equivalent to whitespace, so we return #<undef> */
546                 read_nested_comment(port, ctx);
547                 return SCM_UNDEFINED;
548             case '`': {
549                 /* #`"..." - Legacy string interpolation syntax */
550                 reject_in_r7(port, ctx, "#`\"...\"");
551                 ScmObj form = read_item(port, ctx);
552                 return process_sharp_comma(port,
553                                            SCM_SYM_STRING_INTERPOLATE,
554                                            SCM_LIST2(form, SCM_TRUE),
555                                            ctx, FALSE);
556             }
557             case '?': {
558                 /* #? - debug directives */
559                 reject_in_r7(port, ctx, "#?");
560                 int c2 = Scm_GetcUnsafe(port);
561                 switch (c2) {
562                 case '=': {
563                     /* #?=form - debug print */
564                     ScmObj form = read_item(port, ctx);
565                     return SCM_LIST2(SCM_SYM_DEBUG_PRINT, form);
566                 }
567                 case ',': {
568                     /* #?,form - debug funcall */
569                     ScmObj form = read_item(port, ctx);
570                     return SCM_LIST2(SCM_SYM_DEBUG_FUNCALL, form);
571                 }
572                 case EOF:
573                     return SCM_EOF;
574                 default:
575                     Scm_ReadError(port, "unsupported #?-syntax: #?%C", c2);
576                     return SCM_UNDEFINED; /* dummy */
577                 }
578             }
579             case '0': case '1': case '2': case '3': case '4':
580             case '5': case '6': case '7': case '8': case '9':
581                 /* #N#, #N= or #Nr form */
582                 return read_num_prefixed(port, c1, ctx);
583             case '*': {
584                 reject_in_r7(port, ctx, "#*");
585                 /* #**"...." byte string
586                    #*01001001 for bit vector. */
587                 return read_sharp_asterisk(port, ctx);
588             }
589             case ':': {
590                 reject_in_r7(port, ctx, "#:");
591                 /* #:name - uninterned symbol */
592                 return read_immediate_symbol(port, Scm_GetcUnsafe(port),
593                                              FALSE, "#:", ctx);
594             }
595             case ';': {
596                 /* #;expr - comment out sexpr */
597                 int orig = ctx->flags;
598                 ctx->flags |= RCTX_DISABLE_CTOR;
599                 read_item(port, ctx); /* read and discard */
600                 ctx->flags = orig;
601                 return SCM_UNDEFINED; /* indicate this is a comment */
602             }
603             default:
604                 Scm_ReadError(port, "unsupported #-syntax: #%C", c1);
605                 return SCM_UNDEFINED; /* dummy */
606             }
607         }
608     case '\'': return read_quoted(port, SCM_SYM_QUOTE, ctx);
609     case '`': return read_quoted(port, SCM_SYM_QUASIQUOTE, ctx);
610     case ':':
611         /* Would be removed once we make keywords a subtype of symbols. */
612         if (SCM_EQ(Scm_GetPortReaderLexicalMode(port), SCM_SYM_STRICT_R7)) {
613             return read_symbol(port, c, ctx);
614         } else {
615             return read_keyword(port, ctx);
616         }
617     case ',':
618         {
619             int c1 = Scm_GetcUnsafe(port);
620             if (c1 == EOF) {
621                 Scm_ReadError(port, "unterminated unquote");
622                 return SCM_UNDEFINED; /* dummy */
623             } else if (c1 == '@') {
624                 return read_quoted(port, SCM_SYM_UNQUOTE_SPLICING, ctx);
625             } else {
626                 Scm_UngetcUnsafe(c1, port);
627                 return read_quoted(port, SCM_SYM_UNQUOTE, ctx);
628             }
629         }
630     case '|':
631         return read_escaped_symbol(port, '|', TRUE, ctx);
632     case '[':
633         /* TODO: make it customizable */
634         reject_in_r7(port, ctx, "[]");
635         return read_list(port, ']', ctx);
636     case '{':
637         reject_in_r7(port, ctx, "{}");
638         return read_list(port, '}', ctx);
639     case '+':; case '-':
640         /* Note: R5RS doesn't permit identifiers beginning with '+' or '-',
641            but some Scheme programs use such identifiers. */
642         return read_symbol_or_number(port, c, ctx);
643     case '.':
644         {
645             int c1 = Scm_GetcUnsafe(port);
646             if (!char_word_constituent(c1, FALSE)) {
647                 Scm_ReadError(port, "dot in wrong context");
648             }
649             Scm_UngetcUnsafe(c1, port);
650             return read_symbol_or_number(port, c, ctx);
651         }
652     case '0':; case '1':; case '2':; case '3':; case '4':;
653     case '5':; case '6':; case '7':; case '8':; case '9':;
654         /* Note: R5RS doesn't permit identifiers beginning with digits,
655            but some Scheme programs use such identifiers. */
656         return read_symbol_or_number(port, c, ctx);
657     case ')':; case ']':; case '}':;
658         Scm_ReadError(port, "extra close parenthesis `%c'", c);
659         return SCM_UNDEFINED;   /* dummy */
660     case EOF:
661         return SCM_EOF;
662     default:
663         return read_symbol(port, c, ctx);
664     }
665 }
666 
read_item(ScmPort * port,ScmReadContext * ctx)667 static ScmObj read_item(ScmPort *port, ScmReadContext *ctx)
668 {
669     for (;;) {
670         ScmObj obj = read_internal(port, ctx);
671         if (!SCM_UNDEFINEDP(obj)) return obj;
672     }
673 }
674 
675 /*--------------------------------------------------------
676  * Common routine to handle hex-digit escape \xNN etc.
677  */
678 /* TODO: Redesign these for clearer and simpler API. */
679 
680 /*
681    Hex-escape sequence can appear in the following places:
682 
683    - character literals      #\x1b
684    - string literals         "\x1b;("
685    - symbol literals         |\x1b;|
686    - character set literals  #[\x1b;-\x1f;]
687    - regexp literals         #/\x1b;+ /
688 
689    Legacy Gauche supports fixed-digit syntax.
690 
691    \xN{2}   - char code up to 256 in _native encoding_
692    \uN{4}   - unicode codepoint in BMP
693    \UN{8}   - unicode codepoint
694 
695    New (R7RS) syntax has variable number of digits, terminated by ';'
696 
697    \xN{1,}; - unicode codepoint
698 
699    There's an ambiguity in '\x' case - we first try to read it as R7RS syntax,
700    then falls back to legacy syntax, by default.  Because of this backtrack,
701    the API is a bit complicated.
702 
703    Scm_ReadXdigitsFromString reads from char* buffer, up to buflen octets.
704    The preceding backslash and a character 'x', 'u' or 'U' should already
705    be read.  The character is passed to KEY.  It also receives
706    ScmReaderLexicalMode.
707 
708    Character literal doesn't take terminating ';'.  TERMINATOR flag
709    indicates whether we expect terminator or not.
710 
711    NEXTBUF is an output variable points to the next character on success.
712    Returns the retrieved decoded character, or SCM_CHAR_INVALID on error.
713 
714    Scm_ReadXdigitsFromPort is a convenience routine on top of
715    Scm_ReadXdigitsFromString, when the caller is reading from a port.
716    Since we need look ahead arbitrary number of characters, we can't just
717    return a retrieved character.  The procedure takes ScmDString to which
718    the read characters (the decoded character, plus remaining hexdigit
719    characters) are accumulated.   When the escape syntax is invalid,
720    it returns #<string> of prefetched characters.  On success, it returns
721    #t.
722 */
723 
Scm_ReadXdigitsFromString(const char * buf,int buflen,ScmChar key,ScmObj mode,int terminator,const char ** nextbuf)724 ScmChar Scm_ReadXdigitsFromString(const char *buf,
725                                   int buflen,
726                                   ScmChar key, /* x, u or U */
727                                   ScmObj mode, /* Reader lexical mode */
728                                   int terminator, /* TRUE expecting ';' */
729                                   const char **nextbuf)
730 {
731     int legacy_fallback = FALSE;
732 
733     if (key == 'x' && !SCM_EQ(mode, SCM_SYM_LEGACY)) {
734         int val = 0, i;
735         int overflow = FALSE;
736         for (i=0; i<buflen; i++) {
737             if (isxdigit(buf[i])) {
738                 val = val*16 + Scm_DigitToInt(buf[i], 16, FALSE);
739                 if (val > 0x10ffff) overflow = TRUE;
740             } else if (terminator && buf[i] == ';' && i > 0) {
741                 /* R7RS syntax */
742                 *nextbuf = buf+i+1;
743                 return overflow? SCM_CHAR_INVALID : Scm_UcsToChar(val);
744             } else if (terminator && i < 2) {
745                 return SCM_CHAR_INVALID;
746             } else {
747                 break;
748             }
749         }
750         if (!terminator && i == buflen) {
751             *nextbuf = buf+i;
752             return overflow? SCM_CHAR_INVALID:Scm_UcsToChar(val);
753         }
754         /* Fallback to legacy syntax */
755         legacy_fallback = TRUE;
756     }
757     if (SCM_EQ(mode, SCM_SYM_STRICT_R7)) return SCM_CHAR_INVALID;
758     if (key == 'x' && SCM_EQ(mode, SCM_SYM_WARN_LEGACY)) {
759         Scm_Warn("Legacy \\x hex-escape: \\x%c%c", buf[0], buf[1]);
760     }
761 
762     {
763         int val = 0;
764         int ndigits = (key == 'u')? 4 : (key == 'x')? 2 : 8;
765         if (ndigits > buflen) return SCM_CHAR_INVALID;
766         for (int i=0; i<ndigits; i++) {
767             if (!isxdigit(buf[i])) return SCM_CHAR_INVALID;
768             val = val * 16 + Scm_DigitToInt(buf[i], 16, FALSE);
769         }
770         *nextbuf = buf + ndigits;
771         if (!legacy_fallback && key != 'x') val = Scm_UcsToChar(val);
772         return val;
773     }
774 }
775 
776 /* On success, parsed char and other prefetched hexdigits are added
777    to BUF and returns #t.  Otherwise the prefetched string is returned. */
Scm_ReadXdigitsFromPort(ScmPort * port,int key,ScmObj mode,int incompletep,ScmDString * buf)778 ScmObj Scm_ReadXdigitsFromPort(ScmPort *port, int key, ScmObj mode,
779                                int incompletep, ScmDString *buf)
780 {
781     ScmDString ds;
782 
783     Scm_DStringInit(&ds);
784     for (;;) {
785         int ch = Scm_GetcUnsafe(port);
786         if (ch == ';') {
787             Scm_DStringPutc(&ds, ch);
788             break;
789         }
790         if (ch == EOF || ch >= 0x80 || !isxdigit(ch)) {
791             Scm_UngetcUnsafe(ch, port);
792             break;
793         }
794         Scm_DStringPutc(&ds, ch);
795     }
796 
797     ScmSmallInt numchars;
798     const char *chars = Scm_DStringPeek(&ds, &numchars, NULL);
799     const char *next;
800 
801     int r = Scm_ReadXdigitsFromString(chars, numchars, key, mode, TRUE, &next);
802     if (r != SCM_CHAR_INVALID) {
803         if (incompletep) Scm_DStringPutb(buf, r);
804         else             Scm_DStringPutc(buf, r);
805         if (next - chars < numchars) {
806             Scm_DStringPutz(buf, next, numchars - (next-chars));
807         }
808         return SCM_TRUE;
809     } else {
810         return Scm_MakeString(chars, numchars, -1, SCM_STRING_COPYING);
811     }
812 }
813 
814 /* Another utility procedure.   Read sequence of digits from PORT and made it
815    long and return.  CH can be prefetched digit char; if you don't
816    have one, pass SCM_CHAR_INVALID.  The first nondigit char read
817    is stored in NEXT, and number of digits read is stored in NUMREAD.
818    If the number overflows LONG_MAX or there's no digits to be read,
819    returns -1 (you can check NUMREAD to see which is the case).
820    NB: This may throw an error if ch is neither a valid digit char
821    nor SCM_CHAR_INVALID. */
822 /* NB: Call for refactoring, using Scm_ParseDigitsAsLong  */
Scm_ReadDigitsAsLong(ScmPort * port,ScmChar ch,int radix,ScmChar * next,ScmSize * numread)823 long Scm_ReadDigitsAsLong(ScmPort *port, ScmChar ch, int radix,
824                           ScmChar *next, /*out*/
825                           ScmSize *numread /*out*/)
826 {
827     long val = 0;
828     ScmSize nchars = 0;
829     if (ch != SCM_CHAR_INVALID) {
830         int v = Scm_DigitToInt(ch, radix, FALSE);
831         if (v < 0) {
832             /* This is for safety net.  The caller should filter out
833                this case. */
834             Scm_ReadError(port, "Digit char expected, but got %C", ch);
835         }
836         val = v;
837     }
838     for (;;) {
839         ch = Scm_Getc(port);
840         nchars++;
841         int v = Scm_DigitToInt(ch, radix, FALSE);
842         if (v < 0) {            /* EOF case is covered here */
843             *next = ch;
844             *numread = nchars;
845             return val;
846         }
847         if (val >= (LONG_MAX/radix+1)) {
848             /* we'll overflow */
849             *next = ch;
850             *numread = nchars;
851             return -1;
852         }
853         val = val*radix+v;
854     }
855     /* NOTREACHED */
856 }
857 
858 /* Read long digits from BUF up to LEN.  BUF must only contain single-byte
859    chars. */
Scm_ParseDigitsAsLong(const char * buf,ScmSize len,int radix,ScmSize * numread)860 long Scm_ParseDigitsAsLong(const char *buf, ScmSize len, int radix,
861                            ScmSize *numread) /*out*/
862 {
863     long val = 0;
864     ScmSize nchars = 0;
865     for (; nchars < len; nchars++, buf++) {
866         int v = Scm_DigitToInt((ScmChar)*buf, radix, FALSE);
867         if (v < 0) {
868             *numread = nchars;
869             return val;
870         }
871         if (val >= (LONG_MAX/radix+1)) {
872             /* we'll overflow */
873             *numread = nchars;
874             return -1;
875         }
876         val = val*radix+v;
877     }
878     *numread = nchars;
879     return (nchars == 0)? -1 : val;
880 }
881 
882 /*----------------------------------------------------------------
883  * List
884  */
885 
886 /* Internal read_list.  returns whether the list contains unresolved
887    reference or not within the flag has_ref */
read_list_int(ScmPort * port,ScmChar closer,ScmReadContext * ctx,int * has_ref,int start_line)888 static ScmObj read_list_int(ScmPort *port, ScmChar closer,
889                             ScmReadContext *ctx, int *has_ref, int start_line)
890 {
891     ScmObj start = SCM_NIL, last = SCM_NIL;
892     int dot_seen = FALSE, ref_seen = FALSE;
893 
894     for (;;) {
895         int c = skipws(port, ctx);
896         if (c == EOF) goto eoferr;
897         if (c == closer) {
898             *has_ref = !!ref_seen;
899             return start;
900         }
901 
902         /* NB: We need to keep reading after we see the dotted pair, since
903            there can be a case like (a . b #;c), which is valid syntax.
904            We only reject when we see a next item after reading dotted pair. */
905         ScmObj item;
906         if (c == '.') {
907             if (dot_seen) goto baddot;
908             int c2 = Scm_GetcUnsafe(port);
909             if (c2 == closer) {
910                 goto baddot;
911             } else if (c2 == EOF) {
912                 goto eoferr;
913             } else if (!char_word_constituent(c2, FALSE)) {
914                 /* can be a dot pair at the end */
915                 if (start == SCM_NIL) goto baddot;
916                 Scm_UngetcUnsafe(c2, port);
917                 item = read_item(port, ctx);
918                 if (SCM_READ_REFERENCE_P(item)) ref_seen = TRUE;
919                 SCM_SET_CDR_UNCHECKED(last, item);
920                 dot_seen = TRUE;
921                 continue;
922             }
923             Scm_UngetcUnsafe(c2, port);
924             item = read_symbol_or_number(port, c, ctx);
925         } else {
926             Scm_UngetcUnsafe(c, port);
927             item = read_internal(port, ctx);
928             if (SCM_UNDEFINEDP(item)) continue; /* it was just a comment */
929             if (dot_seen) goto baddot;
930             if (SCM_READ_REFERENCE_P(item)) ref_seen = TRUE;
931         }
932         SCM_APPEND1(start, last, item);
933     }
934   eoferr:
935     if (start_line >= 0) {
936         Scm_ReadError(port, "EOF inside a list (starting from line %d)",
937                       start_line);
938     } else {
939         Scm_ReadError(port, "EOF inside a list");
940     }
941   baddot:
942     Scm_ReadError(port, "bad dot syntax");
943     return SCM_NIL;             /* dummy */
944 }
945 
read_list(ScmPort * port,ScmChar closer,ScmReadContext * ctx)946 static ScmObj read_list(ScmPort *port, ScmChar closer, ScmReadContext *ctx)
947 {
948     int has_ref;
949     ScmSize line = -1;
950 
951     if (ctx->flags & RCTX_SOURCE_INFO) line = Scm_PortLine(port);
952 
953     ScmObj r = read_list_int(port, closer, ctx, &has_ref, line);
954 
955     if (SCM_PAIRP(r) && (ctx->flags & RCTX_SOURCE_INFO) && line >= 0) {
956         /* Swap the head of the list for an extended pair to record
957            source-code info.*/
958         r = Scm_ExtendedCons(SCM_CAR(r), SCM_CDR(r));
959         Scm_PairAttrSet(SCM_PAIR(r), SCM_SYM_SOURCE_INFO,
960                         SCM_LIST2(Scm_PortName(port), SCM_MAKE_INT(line)));
961     }
962 
963     if (has_ref) ref_push(ctx, r, SCM_FALSE);
964     return r;
965 }
966 
read_vector(ScmPort * port,ScmChar closer,ScmReadContext * ctx)967 static ScmObj read_vector(ScmPort *port, ScmChar closer, ScmReadContext *ctx)
968 {
969     int has_ref;
970     ScmSize line = -1;
971     ScmObj r;
972 
973     if (ctx->flags & RCTX_SOURCE_INFO) line = Scm_PortLine(port);
974     r = read_list_int(port, closer, ctx, &has_ref, line);
975     r = Scm_ListToVector(r, 0, -1);
976     if (has_ref) ref_push(ctx, r, SCM_FALSE);
977     return r;
978 }
979 
read_quoted(ScmPort * port,ScmObj quoter,ScmReadContext * ctx)980 static ScmObj read_quoted(ScmPort *port, ScmObj quoter, ScmReadContext *ctx)
981 {
982     ScmSize line = -1;
983     ScmObj r;
984 
985     if (ctx->flags & RCTX_SOURCE_INFO) line = Scm_PortLine(port);
986     ScmObj item = read_item(port, ctx);
987     if (SCM_EOFP(item)) Scm_ReadError(port, "unterminated quote");
988     if (line >= 0) {
989         r = Scm_ExtendedCons(quoter, Scm_Cons(item, SCM_NIL));
990         Scm_PairAttrSet(SCM_PAIR(r), SCM_SYM_SOURCE_INFO,
991                         SCM_LIST2(Scm_PortName(port), SCM_MAKE_INT(line)));
992     } else {
993         r = Scm_Cons(quoter, Scm_Cons(item, SCM_NIL));
994     }
995     if (SCM_READ_REFERENCE_P(item)) ref_push(ctx, SCM_CDR(r), SCM_FALSE);
996     return r;
997 }
998 
999 /*----------------------------------------------------------------
1000  * String
1001  */
1002 
1003 /* Handling \xNN;, \uNNNN, \UNNNNNNNN escapes.  To make it easier
1004    to support both legacy \xN{2} syntax and new \xN{1,} syntax,
1005    we read-ahead as many hexdigits as possible first, then parse it,
1006    and append the result to BUF, along with any unused digits. */
read_string_xdigits(ScmPort * port,int key,int incompletep,ScmDString * buf)1007 static void read_string_xdigits(ScmPort *port, int key,
1008                                 int incompletep, ScmDString *buf)
1009 {
1010     ScmObj bad = Scm_ReadXdigitsFromPort(port, key,
1011                                          Scm_GetPortReaderLexicalMode(port),
1012                                          incompletep, buf);
1013     if (SCM_STRINGP(bad)) {
1014         /* skip chars to the end of string, so that the reader will read
1015            after the erroneous string */
1016         for (;;) {
1017             int c;
1018             if (incompletep) c = Scm_GetbUnsafe(port);
1019             else c = Scm_GetcUnsafe(port);
1020             if (c == EOF || c == '"') break;
1021             if (c == '\\') {
1022                 if (incompletep) c = Scm_GetbUnsafe(port);
1023                 else c = Scm_GetcUnsafe(port);
1024             }
1025         }
1026         Scm_ReadError(port,
1027                       "Bad \\%c escape sequence in a string literal: `\\%c%A'",
1028                       key, key, bad);
1029     }
1030 }
1031 
read_string(ScmPort * port,int incompletep,ScmReadContext * ctx SCM_UNUSED)1032 static ScmObj read_string(ScmPort *port, int incompletep,
1033                           ScmReadContext *ctx SCM_UNUSED)
1034 {
1035     int c = 0;
1036     ScmDString ds;
1037     Scm_DStringInit(&ds);
1038 
1039 #define FETCH(var)                                      \
1040     if (incompletep) { var = Scm_GetbUnsafe(port); }    \
1041     else             { var = Scm_GetcUnsafe(port); }
1042 #define ACCUMULATE(var)                                 \
1043     if (incompletep) { SCM_DSTRING_PUTB(&ds, var); }    \
1044     else             { SCM_DSTRING_PUTC(&ds, var); }
1045 #define INTRALINE_WS(var)                               \
1046     ((var)==' ' || (var)=='\t' || SCM_CHAR_EXTRA_WHITESPACE_INTRALINE(var))
1047 
1048     for (;;) {
1049         FETCH(c);
1050         switch (c) {
1051         case EOF: goto eof_exit;
1052         case '"': goto finish;
1053         backslash:
1054         case '\\': {
1055             FETCH(c);
1056             switch (c) {
1057             case EOF: goto eof_exit;
1058             case 'n': ACCUMULATE('\n'); break;
1059             case 'r': ACCUMULATE('\r'); break;
1060             case 'f': ACCUMULATE('\f'); break;
1061             case 't': ACCUMULATE('\t'); break;
1062             case 'a': ACCUMULATE(0x07); break; /* alarm */
1063             case 'b': ACCUMULATE(0x08); break; /* backspace */
1064             case '\\': ACCUMULATE('\\'); break;
1065             case '0': ACCUMULATE(0); break;
1066             case 'x': {
1067                 read_string_xdigits(port, 'x', incompletep, &ds);
1068                 break;
1069             }
1070             case 'u': case 'U': {
1071                 if (SCM_EQ(Scm_GetPortReaderLexicalMode(port),
1072                            SCM_SYM_STRICT_R7)) {
1073                     Scm_ReadError(port, "\\%c in string literal isn't allowed "
1074                                   "in strinct-r7rs mode", c);
1075                 }
1076                 read_string_xdigits(port, c, incompletep, &ds);
1077                 break;
1078             }
1079                 /* R6RS-style line continuation handling*/
1080             line_continuation:
1081             case ' ':
1082             case '\t': {
1083                 for (;;) {
1084                     FETCH(c);
1085                     if (c == EOF)  goto eof_exit;
1086                     if (c == '\r') goto cont_cr;
1087                     if (c == '\n') goto cont_lf;
1088                     if (!INTRALINE_WS(c)) goto cont_err;
1089                 }
1090             }
1091                 /*FALLTHROUGH*/
1092             cont_cr:
1093             case '\r': {
1094                 FETCH(c);
1095                 if (c == EOF)  goto eof_exit;
1096                 if (c == '"')  goto finish;
1097                 if (c == '\\') goto backslash;
1098                 if (c != '\n' && !INTRALINE_WS(c)) {
1099                     ACCUMULATE(c);
1100                     break;
1101                 }
1102             }
1103                 /*FALLTHROUGH*/
1104             cont_lf:
1105             case '\n': {
1106                 for (;;) {
1107                     FETCH(c);
1108                     if (c == EOF)  goto eof_exit;
1109                     if (c == '"')  goto finish;
1110                     if (c == '\\') goto backslash;
1111                     if (!INTRALINE_WS(c)) {
1112                         ACCUMULATE(c);
1113                         break;
1114                     }
1115                 }
1116                 break;
1117             }
1118             default:
1119                 if (SCM_CHAR_EXTRA_WHITESPACE_INTRALINE(c)) goto line_continuation;
1120                 else ACCUMULATE(c);
1121             }
1122             break;
1123         }
1124         default: ACCUMULATE(c); break;
1125         }
1126     }
1127  eof_exit:
1128     Scm_ReadError(port, "EOF encountered in a string literal: %S",
1129                   Scm_DStringGet(&ds, 0));
1130  cont_err:
1131     Scm_ReadError(port, "Invalid line continuation sequence in a string literal: %S...",
1132                   Scm_DStringGet(&ds, 0));
1133  finish:;
1134     int flags = ((incompletep? SCM_STRING_INCOMPLETE:0) | SCM_STRING_IMMUTABLE);
1135     return Scm_DStringGet(&ds, flags);
1136 }
1137 
1138 /*----------------------------------------------------------------
1139  * Character
1140  */
1141 
1142 static struct char_name {
1143     const char *name;
1144     u_int size;
1145     ScmObj ch;
1146 } char_names[] = {
1147 #define DEFCHAR(name, char) \
1148     { #name, sizeof(#name)-1, SCM_MAKE_CHAR(char) }
1149     DEFCHAR(alarm,   0x07),
1150     DEFCHAR(backspace, 0x08),
1151     DEFCHAR(space,   ' '),
1152     DEFCHAR(newline, '\n'),
1153     DEFCHAR(nl,      '\n'),
1154     DEFCHAR(lf,      '\n'),
1155     DEFCHAR(return,  '\r'),
1156     DEFCHAR(cr,      '\r'),
1157     DEFCHAR(tab,     '\t'),
1158     DEFCHAR(ht,      '\t'),
1159     DEFCHAR(page,    '\f'),
1160     DEFCHAR(escape,  0x1b),
1161     DEFCHAR(esc,     0x1b),
1162     DEFCHAR(delete,  0x7f),
1163     DEFCHAR(del,     0x7f),
1164     DEFCHAR(null,    0),
1165     { NULL, 0, 0 }
1166 };
1167 
read_char(ScmPort * port,ScmReadContext * ctx)1168 static ScmObj read_char(ScmPort *port, ScmReadContext *ctx)
1169 {
1170     ScmString *name;
1171 
1172     int c = Scm_GetcUnsafe(port);
1173     switch (c) {
1174     case EOF:
1175         Scm_ReadError(port, "EOF encountered in character literal");
1176         break;
1177     case '(':; case ')':; case '[':; case ']':; case '{':; case '}':;
1178     case '"':; case ' ':; case '\\':; case '|':; case ';':;
1179     case '#':;
1180         return SCM_MAKE_CHAR(c);
1181     default: {
1182         /* need to read word to see if it is a character name */
1183         name = SCM_STRING(read_word(port, c, ctx, TRUE, FALSE));
1184 
1185         if (SCM_EQ(Scm_GetPortReaderLexicalMode(port), SCM_SYM_STRICT_R7)) {
1186             ScmChar following = Scm_GetcUnsafe(port);
1187             if (!char_is_delimiter(following)) {
1188                 Scm_Error("Character literal isn't delimited: #\\%A%C ...",
1189                           name, following);
1190             }
1191             Scm_UngetcUnsafe(following, port);
1192         }
1193 
1194         ScmSmallInt namelen, namesize;
1195         const char *cname = Scm_GetStringContent(name, &namesize, &namelen,
1196                                                  NULL);
1197         if (namelen == 1) {
1198             return SCM_MAKE_CHAR(c);
1199         }
1200         if (namelen != namesize) {
1201             /* no character name contains multibyte chars */
1202 
1203             goto unknown;
1204         }
1205 
1206         ScmObj lexmode = Scm_GetPortReaderLexicalMode(port);
1207 
1208         /* handle #\x1f etc. */
1209         if (cname[0] == 'x' && isxdigit(cname[1])) {
1210             const char *nextptr;
1211             ScmChar code = Scm_ReadXdigitsFromString(cname+1, namesize-1, 'x',
1212                                                      lexmode, FALSE, &nextptr);
1213             if (code == SCM_CHAR_INVALID || *nextptr != '\0') goto unknown;
1214             return SCM_MAKE_CHAR(code);
1215         }
1216         /* handle legacy #\uxxxx or #\uxxxxxxxx */
1217         if ((cname[0] == 'u') && isxdigit(cname[1])
1218             && (!SCM_EQ(lexmode, SCM_SYM_STRICT_R7))) {
1219             if (namesize >= 5 && namesize <= 9) {
1220                 const char *nextptr;
1221                 /* NB: We want to allow variable number of digits, so
1222                    we pass 'x' as key (instead of 'u') here. */
1223                 ScmChar code = Scm_ReadXdigitsFromString(cname+1, namesize-1,
1224                                                          'x', lexmode, FALSE,
1225                                                          &nextptr);
1226                 if (code >= 0 && *nextptr == '\0') {
1227                     return SCM_MAKE_CHAR(code);
1228                 }
1229             }
1230             /* if we come here, it's an error. */
1231             Scm_ReadError(port, "Bad UCS character code: #\\%s", cname);
1232         }
1233 
1234         struct char_name *cntab = char_names;
1235         while (cntab->name) {
1236             if (cntab->size == (unsigned)namesize
1237                 && strncmp(cntab->name, cname, namesize) == 0) {
1238                 return cntab->ch;
1239             }
1240             cntab++;
1241         }
1242       unknown:
1243         Scm_ReadError(port, "Unknown character name: #\\%A", name);
1244     }
1245     }
1246     return SCM_UNDEFINED;       /* dummy */
1247 }
1248 
1249 /*----------------------------------------------------------------
1250  * Symbols and Numbers
1251  */
1252 
1253 /* Reads a sequence of word-constituent characters from PORT, and returns
1254    ScmString.  INITIAL may be a readahead character, or SCM_CHAR_INVALID
1255    if there's none.  TEMP_CASE_FOLD turns on case-fold mode regardless of
1256    the read context setting.  INCLUDE_HASH_SIGN allows '#' to appear in
1257    the word.
1258 */
read_word(ScmPort * port,ScmChar initial,ScmReadContext * ctx SCM_UNUSED,int temp_case_fold,int include_hash_sign)1259 static ScmObj read_word(ScmPort *port, ScmChar initial,
1260                         ScmReadContext *ctx SCM_UNUSED,
1261                         int temp_case_fold, int include_hash_sign)
1262 {
1263     int case_fold = temp_case_fold || SCM_PORT_CASE_FOLDING(port);
1264     ScmDString ds;
1265     Scm_DStringInit(&ds);
1266     if (initial != SCM_CHAR_INVALID) {
1267         if (case_fold && char_word_case_fold(initial)) initial = tolower(initial);
1268         SCM_DSTRING_PUTC(&ds, initial);
1269     }
1270 
1271     for (;;) {
1272         int c = Scm_GetcUnsafe(port);
1273         if (c == EOF || !char_word_constituent(c, include_hash_sign)) {
1274             Scm_UngetcUnsafe(c, port);
1275             return Scm_DStringGet(&ds, 0);
1276         }
1277         if (case_fold && char_word_case_fold(c)) c = tolower(c);
1278         SCM_DSTRING_PUTC(&ds, c);
1279     }
1280 }
1281 
1282 /* Kaveat: We don't allow '#' in symbols, but we need to read '#'
1283    for numbers.  To allow weird identifers like '1+', we need to read the
1284    word as a number fist and convert it to a symbol when the read word
1285    can't be interpreted as a number.  For the consistency, we read
1286    with '#' and then check it in read_symbol, too. */
check_valid_symbol(ScmString * s)1287 static void check_valid_symbol(ScmString *s)
1288 {
1289     ScmObj r = Scm_StringScanChar(s, SCM_CHAR('#'), SCM_STRING_SCAN_INDEX);
1290     if (!SCM_FALSEP(r)) {
1291         Scm_Error("invalid symbol name: %S", SCM_OBJ(s));
1292     }
1293 }
1294 
1295 /* Read a symbol starting with INITIAL (assuming unescaped), interned. */
read_symbol(ScmPort * port,ScmChar initial,ScmReadContext * ctx)1296 static ScmObj read_symbol(ScmPort *port, ScmChar initial, ScmReadContext *ctx)
1297 {
1298     ScmString *s = SCM_STRING(read_word(port, initial, ctx, FALSE, TRUE));
1299     check_valid_symbol(s);
1300     return Scm_Intern(s);
1301 }
1302 
1303 /* This is called when a symbol (either bare or escaped) is expected
1304    starting with INITIAL char; e.g. after #: or #!.  An error is thrown
1305    if a symbol isn't immediately followed.  PRECEDING is used in the
1306    error message. */
read_immediate_symbol(ScmPort * port,ScmChar initial,int interned,const char * preceding,ScmReadContext * ctx)1307 static ScmObj read_immediate_symbol(ScmPort *port, ScmChar initial,
1308                                     int interned, const char *preceding,
1309                                     ScmReadContext *ctx)
1310 {
1311     if (initial == '|') {
1312         return read_escaped_symbol(port, initial, interned, ctx);
1313     } else if (char_word_constituent(initial, FALSE)) {
1314         ScmString *s = SCM_STRING(read_word(port, initial, ctx, FALSE, TRUE));
1315         check_valid_symbol(s);
1316         /* we have to exclude numbers.  a bit ugly - call for cleanup */
1317         if (!(isdigit(initial) || initial == '+' || initial == '-')
1318             || SCM_FALSEP(Scm_StringToNumber(s, 10, 0))) {
1319             return Scm_MakeSymbol(s, interned);
1320         }
1321     }
1322     /* If we come here, we have invalid syntax. */
1323     if (initial == EOF) {
1324         Scm_ReadError(port, "'%s' followed by nothing", preceding);
1325     } else {
1326         Scm_ReadError(port, "invalid %s syntax near '%s%C'", preceding,
1327                       preceding, initial);
1328     }
1329     return SCM_UNDEFINED;   /* dummy */
1330 }
1331 
read_number(ScmPort * port,ScmChar initial,int radix,ScmReadContext * ctx)1332 static ScmObj read_number(ScmPort *port, ScmChar initial, int radix,
1333                           ScmReadContext *ctx)
1334 {
1335     ScmString *s = SCM_STRING(read_word(port, initial, ctx, FALSE, TRUE));
1336     u_long flags = radix >=2 ? SCM_NUMBER_FORMAT_ALT_RADIX : 0;
1337     if (SCM_EQ(Scm_GetPortReaderLexicalMode(port), SCM_SYM_STRICT_R7)) {
1338         flags |= SCM_NUMBER_FORMAT_STRICT_R7RS;
1339     }
1340     int default_radix = radix >= 2? radix : 10;
1341     ScmObj num = Scm_StringToNumber(s, default_radix, flags);
1342     if (num == SCM_FALSE) {
1343         if (radix >= 2) {
1344             /* In this case, we've read #<radix>r syntax */
1345             Scm_ReadError(port, "bad numeric format: \"#%dr%A\"", radix, s);
1346         } else {
1347             Scm_ReadError(port, "bad numeric format: %S", s);
1348         }
1349     }
1350     return num;
1351 }
1352 
read_symbol_or_number(ScmPort * port,ScmChar initial,ScmReadContext * ctx)1353 static ScmObj read_symbol_or_number(ScmPort *port, ScmChar initial, ScmReadContext *ctx)
1354 {
1355     ScmString *s = SCM_STRING(read_word(port, initial, ctx, FALSE, TRUE));
1356     u_long flags = 0;
1357     if (SCM_EQ(Scm_GetPortReaderLexicalMode(port), SCM_SYM_STRICT_R7)) {
1358         flags |= SCM_NUMBER_FORMAT_STRICT_R7RS;
1359     }
1360     ScmObj num = Scm_StringToNumber(s, 10, flags);
1361     if (num != SCM_FALSE) return num;
1362     check_valid_symbol(s);
1363     return Scm_Intern(s);
1364 }
1365 
read_keyword(ScmPort * port,ScmReadContext * ctx)1366 static ScmObj read_keyword(ScmPort *port, ScmReadContext *ctx)
1367 {
1368     int c2 = Scm_GetcUnsafe(port);
1369 
1370     if (c2 == '|') {
1371         ScmObj name = read_escaped_symbol(port, c2, FALSE, ctx); /* read as uninterned */
1372         return Scm_MakeKeyword(SCM_SYMBOL_NAME(name));
1373     } else {
1374         Scm_UngetcUnsafe(c2, port);
1375         ScmObj name = read_word(port, SCM_CHAR_INVALID, ctx, FALSE, FALSE);
1376         return Scm_MakeKeyword(SCM_STRING(name));
1377     }
1378 }
1379 
read_escaped_symbol(ScmPort * port,ScmChar delim,int interned,ScmReadContext * ctx SCM_UNUSED)1380 static ScmObj read_escaped_symbol(ScmPort *port, ScmChar delim, int interned,
1381                                   ScmReadContext *ctx SCM_UNUSED)
1382 {
1383     ScmDString ds;
1384     Scm_DStringInit(&ds);
1385     ScmObj xmode = Scm_GetPortReaderLexicalMode(port);
1386 
1387     for (;;) {
1388         int c = Scm_GetcUnsafe(port);
1389         if (c == EOF) {
1390             goto err;
1391         } else if (c == delim) {
1392             ScmString *s = SCM_STRING(Scm_DStringGet(&ds, 0));
1393             return Scm_MakeSymbol(s, interned);
1394         } else if (c == '\\') {
1395             /* CL-style single escape */
1396             c = Scm_GetcUnsafe(port);
1397             if (c == EOF) goto err;
1398             if (SCM_EQ(xmode, SCM_SYM_LEGACY)) {
1399                 SCM_DSTRING_PUTC(&ds, c);
1400             } else {
1401                 switch (c) {
1402                 case 'x': {
1403                     /* R7RS-style hex escape. */
1404                     ScmObj bad = Scm_ReadXdigitsFromPort(port, 'x', xmode,
1405                                                          FALSE, &ds);
1406                     if (SCM_STRINGP(bad)) {
1407                         Scm_ReadError(port, "invalid hex escape in a symbol literal: \\x%C", bad);
1408                     }
1409                     break;
1410                 }
1411                 case '\\': case '|': SCM_DSTRING_PUTC(&ds, c); break;
1412                 case 'a': SCM_DSTRING_PUTC(&ds, '\a'); break;
1413                 case 'b': SCM_DSTRING_PUTC(&ds, '\b'); break;
1414                 case 't': SCM_DSTRING_PUTC(&ds, '\t'); break;
1415                 case 'n': SCM_DSTRING_PUTC(&ds, '\n'); break;
1416                 case 'r': SCM_DSTRING_PUTC(&ds, '\r'); break;
1417                 default:
1418                     if (SCM_EQ(xmode, SCM_SYM_STRICT_R7)) {
1419                         Scm_ReadError(port, "invalid backslash-escape in a symbol literal: \\%A", SCM_MAKE_CHAR(c));
1420                     } else {
1421                         SCM_DSTRING_PUTC(&ds, c);
1422                     }
1423                 }
1424             }
1425         } else {
1426             SCM_DSTRING_PUTC(&ds, c);
1427         }
1428     }
1429   err:
1430     Scm_ReadError(port, "unterminated escaped symbol: |%s ...",
1431                   Scm_DStringGetz(&ds));
1432     return SCM_UNDEFINED; /* dummy */
1433 }
1434 
1435 /*----------------------------------------------------------------
1436  * Regexp & charset
1437  */
1438 
1439 /* gauche extension :  #/regexp/ */
read_regexp(ScmPort * port)1440 static ScmObj read_regexp(ScmPort *port)
1441 {
1442     ScmDString ds;
1443     Scm_DStringInit(&ds);
1444     for (;;) {
1445         ScmChar c = Scm_GetcUnsafe(port);
1446         if (c == SCM_CHAR_INVALID) {
1447             Scm_ReadError(port, "unterminated literal regexp");
1448         }
1449         if (c == '\\') {
1450             /* NB: We "eat" a backslash before '/', since it is only dealt
1451                with the reader and nothing to do with regexp parser itself. */
1452             ScmChar c1 = Scm_GetcUnsafe(port);
1453             if (c1 == SCM_CHAR_INVALID) {
1454                 Scm_ReadError(port, "unterminated literal regexp");
1455             }
1456             if (c1 != '/') SCM_DSTRING_PUTC(&ds, c);
1457             SCM_DSTRING_PUTC(&ds, c1);
1458         } else if (c == '/') {
1459             /* Read one more char to see if we have a flag */
1460             int flags = 0;
1461             c = Scm_GetcUnsafe(port);
1462             if (c == 'i') flags |= SCM_REGEXP_CASE_FOLD;
1463             else          Scm_UngetcUnsafe(c, port);
1464             return Scm_RegComp(SCM_STRING(Scm_DStringGet(&ds, 0)), flags);
1465         } else {
1466             SCM_DSTRING_PUTC(&ds, c);
1467         }
1468     }
1469 }
1470 
1471 /* gauche extension :  #[charset] */
read_charset(ScmPort * port)1472 static ScmObj read_charset(ScmPort *port)
1473 {
1474     ScmCharSet *cs = SCM_CHAR_SET(Scm_CharSetRead(port, NULL, TRUE, FALSE));
1475     return Scm_CharSetFreezeX(cs); /* literal is immutable */
1476 }
1477 
1478 /*----------------------------------------------------------------
1479  * Numeric prefixed (#N#, #N=, #Nr)
1480  */
1481 
read_num_prefixed(ScmPort * port,ScmChar ch,ScmReadContext * ctx)1482 static ScmObj read_num_prefixed(ScmPort *port, ScmChar ch, ScmReadContext *ctx)
1483 {
1484     ScmObj e = SCM_UNBOUND;
1485     ScmChar ch2;
1486     ScmSize nread;
1487     long prefix = Scm_ReadDigitsAsLong(port, ch, 10, &ch2, &nread);
1488 
1489     if (ch2 == EOF) Scm_ReadError(port, "unterminated reference form (#digits)");
1490     if (prefix < 0) Scm_ReadError(port, "#-prefix number overflow");
1491 
1492     switch (ch2) {
1493     case '#':
1494         /* #digit# - back reference */
1495         if (ctx->table == NULL
1496             || SCM_UNBOUNDP(e = Scm_HashTableRef(ctx->table,
1497                                                  Scm_MakeInteger(prefix),
1498                                                  SCM_UNBOUND))) {
1499             Scm_ReadError(port, "invalid reference number in #%d#", prefix);
1500         }
1501         if (SCM_READ_REFERENCE_P(e) && SCM_READ_REFERENCE_REALIZED(e)) {
1502             return SCM_READ_REFERENCE(e)->value;
1503         } else {
1504             return e;
1505         }
1506     case '=':
1507         /* #digit= - register */
1508         {
1509             ScmObj ref = Scm_MakeReadReference();
1510             if (ctx->table == NULL) {
1511                 ctx->table =
1512                     SCM_HASH_TABLE(Scm_MakeHashTableSimple(SCM_HASH_EQV, 0));
1513             }
1514             if (!SCM_UNBOUNDP(Scm_HashTableRef(ctx->table,
1515                                                Scm_MakeInteger(prefix),
1516                                                SCM_UNBOUND))) {
1517                 Scm_ReadError(port, "duplicate back-reference number in #%d=",
1518                               prefix);
1519             }
1520             ref_register(ctx, ref, prefix);
1521             ScmObj val = read_item(port, ctx);
1522             if (ref == val) {
1523                 /* an edge case: #0=#0# */
1524                 Scm_ReadError(port, "indeterminate read reference: #%d=#%d#",
1525                               prefix, prefix);
1526             }
1527             SCM_READ_REFERENCE(ref)->value = val;
1528             return val;
1529         }
1530     case 'r': case 'R':
1531         /* #digitR - radix */
1532         if (SCM_EQ(Scm_GetPortReaderLexicalMode(port), SCM_SYM_STRICT_R7)) {
1533             Scm_ReadError(port, "Radix prefix isn't allowed in strict R7RS mode.");
1534         }
1535         if (prefix < SCM_RADIX_MIN || prefix > SCM_RADIX_MAX) {
1536             Scm_ReadError(port, "Radix prefix out of range: radix in #<radix>R must be between %d and %d inclusive, but got: %d",
1537                           SCM_RADIX_MIN, SCM_RADIX_MAX, prefix);
1538             return SCM_UNDEFINED; /* dummy */
1539         } else {
1540             int ch = Scm_GetcUnsafe(port);
1541             if (ch == EOF) {
1542                 Scm_ReadError(port, "Premature end of radix-prefixed number: #%dr",
1543                               prefix);
1544             }
1545             return read_number(port, ch, prefix, ctx);
1546         }
1547     default:
1548         Scm_ReadError(port, "invalid numeric prefix (#, =, r or R is expected) : #%d%A", prefix, SCM_MAKE_CHAR(ch2));
1549         return SCM_UNDEFINED;
1550     }
1551 }
1552 
1553 /*----------------------------------------------------------------
1554  * SRFI-10 support
1555  */
1556 
1557 /* NB: The 'module' argument of DefineReaderCtor and GetReaderCtor may
1558    be used in future to make reader-ctor binding associated with modules.
1559    For now, it is not used and the caller should pass SCM_FALSE. */
1560 
Scm_DefineReaderCtor(ScmObj symbol,ScmObj proc,ScmObj finisher,ScmObj module SCM_UNUSED)1561 ScmObj Scm_DefineReaderCtor(ScmObj symbol, ScmObj proc, ScmObj finisher,
1562                             ScmObj module SCM_UNUSED /*reserved*/)
1563 {
1564     if (!SCM_PROCEDUREP(proc)) {
1565         Scm_Error("procedure required, but got %S", proc);
1566     }
1567     ScmObj pair = Scm_Cons(proc, finisher);
1568     (void)SCM_INTERNAL_MUTEX_LOCK(readCtorData.mutex);
1569     Scm_HashTableSet(readCtorData.table, symbol, pair, 0);
1570     (void)SCM_INTERNAL_MUTEX_UNLOCK(readCtorData.mutex);
1571     return SCM_UNDEFINED;
1572 }
1573 
Scm_GetReaderCtor(ScmObj symbol,ScmObj module SCM_UNUSED)1574 ScmObj Scm_GetReaderCtor(ScmObj symbol, ScmObj module SCM_UNUSED/*reserved*/)
1575 {
1576     (void)SCM_INTERNAL_MUTEX_LOCK(readCtorData.mutex);
1577     ScmObj r = Scm_HashTableRef(readCtorData.table, symbol, SCM_FALSE);
1578     (void)SCM_INTERNAL_MUTEX_UNLOCK(readCtorData.mutex);
1579     return r;
1580 }
1581 
read_sharp_comma(ScmPort * port,ScmReadContext * ctx)1582 static ScmObj read_sharp_comma(ScmPort *port, ScmReadContext *ctx)
1583 {
1584     ScmChar next = Scm_GetcUnsafe(port);
1585     if (next != '(') {
1586         Scm_ReadError(port, "bad #,-form: '(' should be followed, but got %C",
1587                       next);
1588     }
1589 
1590     int has_ref;
1591     ScmSize line = -1;
1592     if (ctx->flags & RCTX_SOURCE_INFO) line = Scm_PortLine(port);
1593 
1594     ScmObj form = read_list_int(port, ')', ctx, &has_ref, line);
1595     int len = Scm_Length(form);
1596     if (len <= 0) {
1597         Scm_ReadError(port, "bad #,-form: #,%S", form);
1598     }
1599     ScmObj r = process_sharp_comma(port, SCM_CAR(form), SCM_CDR(form), ctx,
1600                                    has_ref);
1601     return r;
1602 }
1603 
process_sharp_comma(ScmPort * port,ScmObj key,ScmObj args,ScmReadContext * ctx,int has_ref)1604 static ScmObj process_sharp_comma(ScmPort *port, ScmObj key, ScmObj args,
1605                                   ScmReadContext *ctx, int has_ref)
1606 {
1607     if (ctx->flags & RCTX_DISABLE_CTOR) return SCM_FALSE;
1608 
1609     (void)SCM_INTERNAL_MUTEX_LOCK(readCtorData.mutex);
1610     ScmObj e = Scm_HashTableRef(readCtorData.table, key, SCM_FALSE);
1611     (void)SCM_INTERNAL_MUTEX_UNLOCK(readCtorData.mutex);
1612 
1613     if (!SCM_PAIRP(e)) Scm_ReadError(port, "unknown #,-key: %S", key);
1614     ScmObj r = Scm_ApplyRec(SCM_CAR(e), args);
1615     if (has_ref) ref_push(ctx, r, SCM_CDR(e));
1616     return r;
1617 }
1618 
reader_ctor(ScmObj * args,int nargs,void * data SCM_UNUSED)1619 static ScmObj reader_ctor(ScmObj *args, int nargs, void *data SCM_UNUSED)
1620 {
1621     ScmObj optarg = (nargs > 2? args[2] : SCM_FALSE);
1622     return Scm_DefineReaderCtor(args[0], args[1], optarg, SCM_FALSE);
1623 }
1624 
1625 /*----------------------------------------------------------------
1626  * #!-support
1627  */
1628 
Scm_DefineReaderDirective(ScmObj symbol,ScmObj proc)1629 ScmObj Scm_DefineReaderDirective(ScmObj symbol, ScmObj proc)
1630 {
1631     (void)SCM_INTERNAL_MUTEX_LOCK(hashBangData.mutex);
1632     Scm_HashTableSet(hashBangData.table, symbol, proc, 0);
1633     (void)SCM_INTERNAL_MUTEX_UNLOCK(hashBangData.mutex);
1634     return SCM_UNDEFINED;
1635 }
1636 
read_shebang(ScmPort * port,ScmReadContext * ctx)1637 static ScmObj read_shebang(ScmPort *port, ScmReadContext *ctx)
1638 {
1639     /* If '#!' appears at the beginning of the input port, and it is
1640        followed by '/' or a space or a newline, then we consider it as
1641        the beginning of shebang and discard the entire line.
1642        Otherwise, we take this as #!<identifier> directive as
1643        specified in R6RS, and calls appropriate handler.
1644 
1645        R6RS is actually not very clean at this corner.  It requires
1646        distinct modes to parse a script (which always begins with
1647        shebang) and to parse R6RS program (in which '#!'  always
1648        introduces #!<identifier> token).  There's no way for just one
1649        parser that strictly covers both situation.
1650     */
1651     int c2 = Scm_GetcUnsafe(port);
1652     if (Scm_PortBytes(port) == 3 && (c2 == '/' || c2 == ' ' || c2 == '\n')) {
1653         /* shebang */
1654         for (;;) {
1655             if (c2 == '\n') return SCM_UNDEFINED;
1656             if (c2 == EOF) return SCM_EOF;
1657             c2 = Scm_GetcUnsafe(port);
1658         }
1659         /*NOTREACHED*/
1660     } else {
1661         ScmObj id = read_immediate_symbol(port, c2, TRUE, "#!", ctx);
1662         (void)SCM_INTERNAL_MUTEX_LOCK(hashBangData.mutex);
1663         ScmObj e = Scm_HashTableRef(hashBangData.table, id, SCM_FALSE);
1664         (void)SCM_INTERNAL_MUTEX_UNLOCK(hashBangData.mutex);
1665         if (SCM_FALSEP(e)) {
1666             Scm_Warn("Ignoring unrecognized hash-bang directive: #!%S", id);
1667             return SCM_UNDEFINED;
1668         }
1669         /* Reader directive may return zero or one value.  When it returns
1670            no values, we call Scm_VMSetResult to adjust the number of values.
1671          */
1672         ScmObj r = Scm_ApplyRec3(e, id, SCM_OBJ(port), SCM_OBJ(ctx));
1673         if (Scm_VMGetNumResults(Scm_VM()) == 1) return r;
1674         else { Scm_VMSetResult(SCM_UNDEFINED); return SCM_UNDEFINED; }
1675     }
1676 }
1677 
1678 /*----------------------------------------------------------------
1679  * #f, #t, #false, #true, and UVector literals
1680  */
1681 
1682 /* Pre-0.9.4 reader.  #t and #f delimit themselves (except '1', '3' or '6'
1683    follows '#f'.)  I doubt any code breaks if we change that, but there
1684    may be a data files around that somehow relies on this behavior.  So
1685    we keep this in 'legacy' reader mode.  */
read_sharp_word_legacy(ScmPort * port,char ch,ScmReadContext * ctx)1686 static ScmObj read_sharp_word_legacy(ScmPort *port, char ch, ScmReadContext *ctx)
1687 {
1688     ScmChar c1, c2 = SCM_CHAR_INVALID;
1689     char *tag = NULL;
1690 
1691     if (ch == 't') return SCM_TRUE;
1692 
1693     c1 = Scm_GetcUnsafe(port);
1694     if (ch == 'f') {
1695         if (c1 != '1' && c1 != '3' && c1 != '6') {
1696             Scm_UngetcUnsafe(c1, port);
1697             return SCM_FALSE;
1698         }
1699         c2 = Scm_GetcUnsafe(port);
1700         if (c1 == '3' && c2 == '2') tag = "f32";
1701         else if (c1 == '6' && c2 == '4') tag = "f64";
1702         else if (c1 == '1' && c2 == '6') tag = "f16";
1703     } else {
1704         if (c1 == '8') tag = (ch == 's')? "s8" : "u8";
1705         else if (c1 == '1') {
1706             c2 = Scm_GetcUnsafe(port);
1707             if (c2 == '6') tag = (ch == 's')? "s16" : "u16";
1708         }
1709         else if (c1 == '3') {
1710             c2 = Scm_GetcUnsafe(port);
1711             if (c2 == '2') tag = (ch == 's')? "s32" : "u32";
1712         }
1713         else if (c1 == '6') {
1714             c2 = Scm_GetcUnsafe(port);
1715             if (c2 == '4') tag = (ch == 's')? "s64" : "u64";
1716         }
1717     }
1718     if (tag == NULL) {
1719         char buf[SCM_CHAR_MAX_BYTES*4], *bufp = buf;
1720         *bufp++ = ch;
1721         SCM_CHAR_PUT(bufp, c1);
1722         bufp += SCM_CHAR_NBYTES(c1);
1723         if (c2 != SCM_CHAR_INVALID) {
1724             SCM_CHAR_PUT(bufp, c2);
1725             bufp += SCM_CHAR_NBYTES(c2);
1726         }
1727         *bufp = '\0';
1728         Scm_ReadError(port, "invalid uniform vector tag: %s", buf);
1729     }
1730     return Scm_ReadUVector(port, tag, ctx);
1731 }
1732 
1733 /* A 'new' version, friendly to R7RS */
read_sharp_word_1(ScmPort * port,char ch,ScmReadContext * ctx)1734 static ScmObj read_sharp_word_1(ScmPort *port, char ch, ScmReadContext *ctx)
1735 {
1736     ScmString *s = SCM_STRING(read_word(port, ch, ctx, TRUE, FALSE));
1737     const char *w = Scm_GetStringConst(s);
1738     const char *tag = NULL;
1739 
1740     switch (ch) {
1741     case 'c':
1742         if (strcmp(w, "c32") == 0
1743             || strcmp(w, "c64") == 0
1744             || strcmp(w, "c128") == 0) {
1745             tag = w;
1746         }
1747         break;
1748     case 'f':
1749         if (strcmp(w, "f16") == 0
1750             || strcmp(w, "f32") == 0
1751             || strcmp(w, "f64") == 0) {
1752             tag = w;
1753         } else if (w[1] == '\0' || strcmp(w, "false") == 0) {
1754             return SCM_FALSE;
1755         }
1756         break;
1757     case 's':
1758         if (strcmp(w, "s8") == 0
1759             || strcmp(w, "s16") == 0
1760             || strcmp(w, "s32") == 0
1761             || strcmp(w, "s64") == 0) {
1762             tag = w;
1763         }
1764         break;
1765     case 'u':
1766         if (strcmp(w, "u8") == 0
1767             || strcmp(w, "u16") == 0
1768             || strcmp(w, "u32") == 0
1769             || strcmp(w, "u64") == 0) {
1770             tag = w;
1771         }
1772         break;
1773     case 't':
1774         if (w[1] == '\0' || strcmp(w, "true") == 0) {
1775             return SCM_TRUE;
1776         }
1777     }
1778     if (tag == NULL) {
1779         Scm_ReadError(port, "invalid #-token: #%s", w);
1780     }
1781     return Scm_ReadUVector(port, tag, ctx);
1782 }
1783 
read_sharp_word(ScmPort * port,char ch,ScmReadContext * ctx)1784 static ScmObj read_sharp_word(ScmPort *port, char ch, ScmReadContext *ctx)
1785 {
1786     if (SCM_EQ(Scm_GetPortReaderLexicalMode(port), SCM_SYM_LEGACY)) {
1787         return read_sharp_word_legacy(port, ch, ctx);
1788     } else {
1789         return read_sharp_word_1(port, ch, ctx);
1790     }
1791 }
1792 
1793 /*----------------------------------------------------------------
1794  * #**"....", #*1010010...
1795  */
read_sharp_asterisk(ScmPort * port,ScmReadContext * ctx)1796 static ScmObj read_sharp_asterisk(ScmPort *port, ScmReadContext *ctx)
1797 {
1798     /* We used to use #*"..." for incomplete string literals, but it turned
1799        out it can be read as an zero-length bitvector #* followed by an
1800        ordinary string (since a double quote is a <delimiter> in R7RS).
1801 
1802        So we changed the literal to #**"..." (yeah, ugly, but incomplete
1803        strings are anomalies and shouldn't be used often).
1804 
1805        The reader behavior can be customized by reader-lexical-mode:
1806 
1807          permissive (default) - For the next release, #*"..." is read as
1808              an incomplete string, without warning.  In future, it warns.
1809          legacy - #*"..." is read as an incomplete string, without warning.
1810          warn-legacy - #*"..." is read as an incomplete string, but warns.
1811          strict-r7 - #*"..." is read as a zero-length bitvector followed by
1812                      a string.  #**"..." is rejected.
1813     */
1814 
1815     int c = Scm_GetcUnsafe(port);
1816     if (c == '*') {
1817         c = Scm_GetcUnsafe(port);
1818         if (c == '"') return read_string(port, TRUE, ctx);
1819         Scm_ReadError(port, "Invalid #* syntax: #**%C", c);
1820     }
1821     if (c == '"') {
1822         ScmObj m = Scm_GetPortReaderLexicalMode(port);
1823         if (SCM_EQ(m, SCM_SYM_PERMISSIVE)) {
1824             /* TRANSIENT: We switch this default behavior to warn in future */
1825             return read_string(port, TRUE, ctx);
1826         }
1827         if (SCM_EQ(m, SCM_SYM_LEGACY)) {
1828             return read_string(port, TRUE, ctx);
1829         }
1830         if (SCM_EQ(m, SCM_SYM_WARN_LEGACY)) {
1831             Scm_Warn("Deprecated incomplete string syntax #*\"...\" at %A:%ld",
1832                      Scm_PortName(port), Scm_PortLine(port));
1833             return read_string(port, TRUE, ctx);
1834         }
1835         if (SCM_EQ(m, SCM_SYM_STRICT_R7)) {
1836             Scm_UngetcUnsafe(c, port);
1837             return Scm_MakeBitvector(0, SCM_FALSE);
1838         }
1839         Scm_Panic("invalid reader-lexical-mode");
1840     }
1841 
1842     ScmDString ds;
1843     Scm_DStringInit(&ds);
1844     for (;;) {
1845         switch (c) {
1846         case EOF:
1847             goto finish;
1848         case '(': case ')': case '"': case '|': case ';':
1849             Scm_UngetcUnsafe(c, port);
1850             goto finish;
1851         case '0': case '1':
1852             Scm_DStringPutc(&ds, c);
1853             break;
1854         default:
1855             if ((SCM_CHAR_ASCII_P(c) && isspace(c))
1856                 || (!SCM_CHAR_ASCII_P(c) && SCM_CHAR_EXTRA_WHITESPACE(c))) {
1857                 Scm_UngetcUnsafe(c, port);
1858                 goto finish;
1859             }
1860             Scm_ReadError(port, "Invalid char in bitvector literal: %C", c);
1861         }
1862         c = Scm_GetcUnsafe(port);
1863     }
1864 
1865  finish:;
1866     ScmObj s = Scm_DStringGet(&ds, 0);
1867     return Scm_StringToBitvector(SCM_STRING(s), FALSE);
1868 }
1869 
1870 /* OBSOLETED: gauche.uvector used to call this to set up reader pointer.
1871    Now it is read in src/vector.c.   We keep this entry for ABI compatibility.
1872    Remove on 1.0 release. */
Scm__InstallReadUvectorHook(ScmObj (* f)(ScmPort *,const char *,ScmReadContext *)SCM_UNUSED)1873 void Scm__InstallReadUvectorHook(ScmObj (*f)(ScmPort*, const char*, ScmReadContext *) SCM_UNUSED)
1874 {
1875 }
1876 
1877 
1878 /*----------------------------------------------------------------
1879  * Initialization
1880  */
1881 
Scm__InitRead(void)1882 void Scm__InitRead(void)
1883 {
1884     readCtorData.table =
1885         SCM_HASH_TABLE(Scm_MakeHashTableSimple(SCM_HASH_EQ, 0));
1886     (void)SCM_INTERNAL_MUTEX_INIT(readCtorData.mutex);
1887     Scm_DefineReaderCtor(SCM_SYM_DEFINE_READER_CTOR,
1888                          Scm_MakeSubr(reader_ctor, NULL, 2, 1,
1889                                       SCM_SYM_DEFINE_READER_CTOR),
1890                          SCM_FALSE, SCM_FALSE);
1891 
1892     hashBangData.table =
1893         SCM_HASH_TABLE(Scm_MakeHashTableSimple(SCM_HASH_EQ, 0));
1894     (void)SCM_INTERNAL_MUTEX_INIT(hashBangData.mutex);
1895 
1896     defaultReadContext =
1897         Scm_MakePrimitiveParameter(SCM_CLASS_PRIMITIVE_PARAMETER, SCM_FALSE,
1898                                    SCM_OBJ(make_read_context(NULL)),
1899                                    0);
1900 }
1901