1 /*
2 * read.c - reader
3 *
4 * Copyright (c) 2000-2020 Shiro Kawai <shiro@acm.org>
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the authors nor the names of its contributors
18 * may be used to endorse or promote products derived from this
19 * software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
27 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 #include <stdio.h>
35 #include <ctype.h>
36 #include <math.h>
37 #define LIBGAUCHE_BODY
38 #include "gauche.h"
39 #include "gauche/char_attr.h"
40 #include "gauche/priv/portP.h"
41 #include "gauche/priv/builtin-syms.h"
42 #include "gauche/priv/readerP.h"
43
44 /*
45 * READ
46 */
47
48 static void read_context_flush(ScmReadContext *ctx);
49 static ScmObj read_internal(ScmPort *port, ScmReadContext *ctx);
50 static ScmObj read_item(ScmPort *port, ScmReadContext *ctx);
51 static ScmObj read_list(ScmPort *port, ScmChar closer, ScmReadContext *ctx);
52 static ScmObj read_vector(ScmPort *port, ScmChar closer, ScmReadContext *ctx);
53 static ScmObj read_string(ScmPort *port, int incompletep, ScmReadContext *ctx);
54 static ScmObj read_quoted(ScmPort *port, ScmObj quoter, ScmReadContext *ctx);
55 static ScmObj read_char(ScmPort *port, ScmReadContext *ctx);
56 static ScmObj read_word(ScmPort *port, ScmChar initial, ScmReadContext *ctx,
57 int temp_case_fold, int include_hash_sign);
58 static ScmObj read_symbol(ScmPort *port, ScmChar initial, ScmReadContext *ctx);
59 static ScmObj read_immediate_symbol(ScmPort *port, ScmChar initial,
60 int interned, const char *preceding,
61 ScmReadContext *ctx);
62 static ScmObj read_number(ScmPort *port, ScmChar initial,
63 int radix, /* #<radix>r case */
64 ScmReadContext *ctx);
65 static ScmObj read_symbol_or_number(ScmPort *port, ScmChar initial, ScmReadContext *ctx);
66 static ScmObj read_escaped_symbol(ScmPort *port, ScmChar delim, int interned,
67 ScmReadContext *ctx);
68 static ScmObj read_keyword(ScmPort *port, ScmReadContext *ctx);
69 static ScmObj read_regexp(ScmPort *port);
70 static ScmObj read_charset(ScmPort *port);
71 static ScmObj read_sharp_comma(ScmPort *port, ScmReadContext *ctx);
72 static ScmObj read_sharp_asterisk(ScmPort *port, ScmReadContext *ctx);
73 static ScmObj process_sharp_comma(ScmPort *port, ScmObj key, ScmObj args,
74 ScmReadContext *ctx, int has_ref);
75 static ScmObj read_shebang(ScmPort *port, ScmReadContext *ctx);
76 static ScmObj read_num_prefixed(ScmPort *port, ScmChar ch, ScmReadContext *ctx);
77 static ScmObj read_sharp_word(ScmPort *port, char c, ScmReadContext *ctx);
78
79 /* Table of 'read-time constructor' in SRFI-10 */
80 static struct {
81 ScmHashTable *table;
82 ScmInternalMutex mutex;
83 } readCtorData;
84
85 /* Table of hash-bang directive */
86 static struct {
87 ScmHashTable *table;
88 ScmInternalMutex mutex;
89 } hashBangData;
90
91 /* Parameter location for default reader mode */
92 static ScmPrimitiveParameter *defaultReadContext;
93
94 /*----------------------------------------------------------------
95 * Entry points
96 * Note: Entire read operation are done while locking the input port.
97 * So we can use 'unsafe' version of port operations inside this file.
98 * The lock is removed if reader routine signals an error. It is OK
99 * to call read routine recursively.
100 */
Scm_ReadWithContext(ScmObj port,ScmReadContext * ctx)101 ScmObj Scm_ReadWithContext(ScmObj port, ScmReadContext *ctx)
102 {
103 ScmVM *vm = Scm_VM();
104 volatile ScmObj r = SCM_NIL;
105 if (!SCM_PORTP(port) || SCM_PORT_DIR(port) != SCM_PORT_INPUT) {
106 Scm_Error("input port required: %S", port);
107 }
108 if (!(ctx->flags & RCTX_RECURSIVELY)) {
109 ctx->table = NULL;
110 ctx->pending = SCM_NIL;
111 }
112 if (PORT_LOCKED(SCM_PORT(port), vm)) {
113 r = read_item(SCM_PORT(port), ctx);
114 } else {
115 PORT_LOCK(SCM_PORT(port), vm);
116 PORT_SAFE_CALL(SCM_PORT(port),
117 r = read_item(SCM_PORT(port), ctx), /*no cleanup*/);
118 PORT_UNLOCK(SCM_PORT(port));
119 }
120 if (!(ctx->flags & RCTX_RECURSIVELY)) {
121 read_context_flush(ctx);
122 }
123 return r;
124 }
125
Scm_Read(ScmObj port)126 ScmObj Scm_Read(ScmObj port)
127 {
128 return Scm_ReadWithContext(port, Scm_MakeReadContext(NULL));
129 }
130
131 /* Convenience functions */
Scm_ReadFromString(ScmString * str)132 ScmObj Scm_ReadFromString(ScmString *str)
133 {
134 ScmObj inp = Scm_MakeInputStringPort(str, TRUE);
135 ScmReadContext *ctx = Scm_MakeReadContext(NULL);
136 ScmObj r = read_item(SCM_PORT(inp), ctx);
137 read_context_flush(ctx);
138 return r;
139 }
140
Scm_ReadFromCString(const char * cstr)141 ScmObj Scm_ReadFromCString(const char *cstr)
142 {
143 ScmObj s = SCM_MAKE_STR_IMMUTABLE(cstr);
144 ScmObj inp = Scm_MakeInputStringPort(SCM_STRING(s), TRUE);
145 ScmReadContext *ctx = Scm_MakeReadContext(NULL);
146 ScmObj r = read_item(SCM_PORT(inp), ctx);
147 read_context_flush(ctx);
148 return r;
149 }
150
Scm_ReadListWithContext(ScmObj port,ScmChar closer,ScmReadContext * ctx)151 ScmObj Scm_ReadListWithContext(ScmObj port, ScmChar closer, ScmReadContext *ctx)
152 {
153 ScmVM *vm = Scm_VM();
154 volatile ScmObj r = SCM_NIL;
155 if (!SCM_PORTP(port) || SCM_PORT_DIR(port) != SCM_PORT_INPUT) {
156 Scm_Error("input port required: %S", port);
157 }
158 if (!(ctx->flags & RCTX_RECURSIVELY)) {
159 ctx->table = NULL;
160 ctx->pending = SCM_NIL;
161 }
162 if (PORT_LOCKED(SCM_PORT(port), vm)) {
163 r = read_list(SCM_PORT(port), closer, ctx);
164 } else {
165 PORT_LOCK(SCM_PORT(port), vm);
166 PORT_SAFE_CALL(SCM_PORT(port),
167 r = read_list(SCM_PORT(port), closer, ctx),
168 /*no cleanup*/);
169 PORT_UNLOCK(SCM_PORT(port));
170 }
171 if (!(ctx->flags & RCTX_RECURSIVELY)) {
172 read_context_flush(ctx);
173 }
174 return r;
175 }
176
Scm_ReadList(ScmObj port,ScmChar closer)177 ScmObj Scm_ReadList(ScmObj port, ScmChar closer)
178 {
179 ScmReadContext *ctx = Scm_MakeReadContext(NULL);
180 return Scm_ReadListWithContext(port, closer, ctx);
181 }
182
183 /*----------------------------------------------------------------
184 * Read context
185 */
186
Scm_CurrentReadContext()187 ScmReadContext *Scm_CurrentReadContext()
188 {
189 ScmObj c = Scm_PrimitiveParameterRef(Scm_VM(), defaultReadContext);
190 SCM_ASSERT(SCM_READ_CONTEXT_P(c));
191 return SCM_READ_CONTEXT(c);
192 }
193
Scm_SetCurrentReadContext(ScmReadContext * ctx)194 ScmReadContext *Scm_SetCurrentReadContext(ScmReadContext *ctx)
195 {
196 ScmObj p = Scm_PrimitiveParameterSet(Scm_VM(), defaultReadContext,
197 SCM_OBJ(ctx));
198 SCM_ASSERT(SCM_READ_CONTEXT_P(p));
199 return SCM_READ_CONTEXT(p);
200 }
201
make_read_context(ScmReadContext * proto)202 static ScmReadContext *make_read_context(ScmReadContext *proto)
203 {
204 ScmReadContext *ctx = SCM_NEW(ScmReadContext);
205 SCM_SET_CLASS(ctx, SCM_CLASS_READ_CONTEXT);
206 ctx->flags = proto ? proto->flags : RCTX_SOURCE_INFO;
207 ctx->table = NULL;
208 ctx->pending = SCM_NIL;
209 return ctx;
210 }
211
Scm_MakeReadContext(ScmReadContext * proto)212 ScmReadContext *Scm_MakeReadContext(ScmReadContext *proto)
213 {
214 return make_read_context(proto? proto : Scm_CurrentReadContext());
215 }
216
read_context_print(ScmObj obj,ScmPort * port,ScmWriteContext * ctx SCM_UNUSED)217 static void read_context_print(ScmObj obj, ScmPort *port,
218 ScmWriteContext *ctx SCM_UNUSED)
219 {
220 Scm_Printf(port, "#<read-context %p>", obj);
221 }
222
223 SCM_DEFINE_BUILTIN_CLASS_SIMPLE(Scm_ReadContextClass, read_context_print);
224
225 /* public api */
Scm_ReadContextLiteralImmutable(ScmReadContext * ctx)226 int Scm_ReadContextLiteralImmutable(ScmReadContext *ctx)
227 {
228 return (ctx->flags & RCTX_LITERAL_IMMUTABLE);
229 }
230
231 /*----------------------------------------------------------------
232 * Error
233 */
234
Scm_ReadError(ScmPort * port,const char * msg,...)235 void Scm_ReadError(ScmPort *port, const char *msg, ...)
236 {
237 ScmObj ostr = Scm_MakeOutputStringPort(TRUE);
238 ScmObj name = Scm_PortName(port);
239 ScmSize line = Scm_PortLine(port);
240
241 Scm_Printf(SCM_PORT(ostr), "Read error at %S:",
242 SCM_STRINGP(name)? name : SCM_OBJ(SCM_MAKE_STR("??")));
243 if (line >= 0) {
244 Scm_Printf(SCM_PORT(ostr), "line %d: ", line);
245 }
246 va_list ap;
247 va_start(ap, msg);
248 Scm_Vprintf(SCM_PORT(ostr), msg, ap, TRUE);
249 va_end(ap);
250
251 ScmObj rerr = Scm_MakeReadError(Scm_GetOutputString(SCM_PORT(ostr), 0),
252 port, line);
253 Scm_Raise(rerr, 0);
254 }
255
256 /*----------------------------------------------------------------
257 * Read reference
258 */
259
260 /* Read reference is a proxy object to for referenced object (#N=).
261 */
262
263 static void read_reference_print(ScmObj obj, ScmPort *port,
264 ScmWriteContext *ctx);
265 SCM_DEFINE_BUILTIN_CLASS_SIMPLE(Scm_ReadReferenceClass, read_reference_print);
266
Scm_MakeReadReference(void)267 ScmObj Scm_MakeReadReference(void)
268 {
269 ScmReadReference *a = SCM_NEW(ScmReadReference);
270 SCM_SET_CLASS(a, SCM_CLASS_READ_REFERENCE);
271 a->value = SCM_UNBOUND;
272 return SCM_OBJ(a);
273 }
274
read_reference_print(ScmObj obj SCM_UNUSED,ScmPort * port,ScmWriteContext * ctx SCM_UNUSED)275 static void read_reference_print(ScmObj obj SCM_UNUSED, ScmPort *port,
276 ScmWriteContext *ctx SCM_UNUSED)
277 {
278 Scm_Printf(port, "#<read-reference>");
279 }
280
ref_push(ScmReadContext * ctx,ScmObj obj,ScmObj finisher)281 static void ref_push(ScmReadContext *ctx, ScmObj obj, ScmObj finisher)
282 {
283 ctx->pending = Scm_Acons(obj, finisher, ctx->pending);
284 }
285
ref_val(ScmObj ref)286 static ScmObj ref_val(ScmObj ref)
287 {
288 if (!SCM_READ_REFERENCE_REALIZED(ref)) {
289 Scm_Error("reader encontered unresolved read reference. Implementation error?");
290 }
291 return SCM_READ_REFERENCE(ref)->value;
292 }
293
ref_register(ScmReadContext * ctx,ScmObj obj,int refnum)294 static ScmObj ref_register(ScmReadContext *ctx, ScmObj obj, int refnum)
295 {
296 SCM_ASSERT(ctx->table);
297 Scm_HashTableSet(ctx->table, SCM_MAKE_INT(refnum), obj, 0);
298 return obj;
299 }
300
301 /* ctx->pending contains an assoc list of objects who contains read reference
302 which should be resolved.
303 The car of each entry is the object that needs to be fixed, and the
304 cdr of eacy entry may contain a finisher procedure (if the object is
305 created by read-time constructor.
306 */
read_context_flush(ScmReadContext * ctx)307 static void read_context_flush(ScmReadContext *ctx)
308 {
309 ScmObj cp, ep;
310
311 SCM_FOR_EACH(cp, ctx->pending) {
312 ScmObj entry = SCM_CAR(cp);
313 SCM_ASSERT(SCM_PAIRP(entry));
314 ScmObj obj = SCM_CAR(entry);
315 ScmObj finisher = SCM_CDR(entry);
316
317 if (!SCM_FALSEP(finisher)) {
318 Scm_ApplyRec(finisher, SCM_LIST1(obj));
319 } else if (SCM_PAIRP(obj)) {
320 SCM_FOR_EACH(ep, obj) {
321 if (SCM_READ_REFERENCE_P(SCM_CAR(ep))) {
322 SCM_SET_CAR_UNCHECKED(ep, ref_val(SCM_CAR(ep)));
323 }
324 if (SCM_READ_REFERENCE_P(SCM_CDR(ep))) {
325 /* in case we have (... . #N#) */
326 SCM_SET_CDR_UNCHECKED(ep, ref_val(SCM_CDR(ep)));
327 break;
328 }
329 }
330 } else if (SCM_VECTORP(obj)) {
331 int i, len = SCM_VECTOR_SIZE(obj);
332 for (i=0; i<len; i++) {
333 ep = SCM_VECTOR_ELEMENT(obj, i);
334 if (SCM_READ_REFERENCE_P(ep)) {
335 SCM_VECTOR_ELEMENTS(obj)[i] = ref_val(ep);
336 }
337 }
338 } else {
339 Scm_Error("read_context_flush: recursive reference only supported with vector and lists");
340 }
341 }
342 }
343
344 /*----------------------------------------------------------------
345 * Miscellaneous routines
346 */
347
348 /* Table of initial 128 bytes of ASCII characters to dispatch for
349 special meanings.
350 bit 0 : a valid constituent char of words
351 bit 1 : candidate of case folding
352 */
353 static const unsigned char ctypes[] = {
354 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
355 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
356 /* ! " # $ % & ' ( ) * + , - . / */
357 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1,
358 /* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
359 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
360 /* @ A B C D E F G H I J K L M N O */
361 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
362 /* P Q R S T U V W X Y Z [ \ ] ^ _ */
363 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 1, 1,
364 /* ` a b c d e f g h i j k l m n o */
365 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
366 /* p q r s t u v w x y z { | } ~ ^? */
367 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0,
368 };
369
370 /* For characters >= 0x80, we follow R[67]RS. */
char_word_constituent(int c,int include_hash_sign)371 inline static int char_word_constituent(int c, int include_hash_sign)
372 {
373 if (c < 128) {
374 return ((c >= 0 && (ctypes[(unsigned char)c]&1))
375 || (c == '#' && include_hash_sign));
376 } else if (c == 0x200c || c == 0x200d) {
377 return TRUE;
378 } else {
379 switch (Scm_CharGeneralCategory(c)) {
380 case SCM_CHAR_CATEGORY_Lu:
381 case SCM_CHAR_CATEGORY_Ll:
382 case SCM_CHAR_CATEGORY_Lt:
383 case SCM_CHAR_CATEGORY_Lm:
384 case SCM_CHAR_CATEGORY_Lo:
385 case SCM_CHAR_CATEGORY_Mn:
386 case SCM_CHAR_CATEGORY_Nl:
387 case SCM_CHAR_CATEGORY_No:
388 case SCM_CHAR_CATEGORY_Pd:
389 case SCM_CHAR_CATEGORY_Pc:
390 case SCM_CHAR_CATEGORY_Po:
391 case SCM_CHAR_CATEGORY_Sc:
392 case SCM_CHAR_CATEGORY_Sm:
393 case SCM_CHAR_CATEGORY_Sk:
394 case SCM_CHAR_CATEGORY_So:
395 case SCM_CHAR_CATEGORY_Co: return TRUE;
396 default: return FALSE;
397 }
398 }
399 }
400
char_word_case_fold(int c)401 inline static int char_word_case_fold(int c)
402 {
403 /*NB: Should we adopt full-fledged case folding for #!fold-case mode?
404 It seems overkill to me. For now, we only handles ASCII range.*/
405 return (c >= 0 && c < 128 && (ctypes[(unsigned char)c]&2));
406 }
407
408 /* R7RS 7.1.1 <delimiter> */
char_is_delimiter(ScmChar ch)409 static int char_is_delimiter(ScmChar ch)
410 {
411 return ((ch < 0x80 && strchr("()\";| \t\n\r", ch) != NULL)
412 || SCM_CHAR_EXTRA_WHITESPACE(ch));
413 }
414
read_nested_comment(ScmPort * port,ScmReadContext * ctx SCM_UNUSED)415 static void read_nested_comment(ScmPort *port,
416 ScmReadContext *ctx SCM_UNUSED)
417 {
418 int nesting = 0;
419 ScmSize line = Scm_PortLine(port);
420
421 for (ScmChar c = Scm_GetcUnsafe(port);;) {
422 switch (c) {
423 case '#': {
424 ScmChar c1 = Scm_GetcUnsafe(port);
425 if (c1 == '|') { nesting++; break; }
426 else if (c1 == EOF) goto eof;
427 else c = c1;
428 continue;
429 }
430 case '|': {
431 ScmChar c1 = Scm_GetcUnsafe(port);
432 if (c1 == '#') {
433 if (nesting-- == 0) {
434 return;
435 }
436 break;
437 }
438 else if (c1 == EOF) goto eof;
439 else c = c1;
440 continue;
441 }
442 case EOF:
443 eof:
444 Scm_ReadError(port, "encountered EOF inside nested multi-line comment (comment begins at line %d)", line);
445 default:
446 break;
447 }
448 c = Scm_GetcUnsafe(port);
449 }
450 }
451
read_comment(ScmPort * port)452 static void read_comment(ScmPort *port) /* leading semicolon is already read */
453 {
454 for (;;) {
455 /* NB: comment may contain unexpected character code.
456 for the safety, we read bytes here. */
457 int c = Scm_GetbUnsafe(port);
458 if (c == '\n' || c == EOF) break;
459 }
460 }
461
skipws(ScmPort * port,ScmReadContext * ctx SCM_UNUSED)462 static int skipws(ScmPort *port, ScmReadContext *ctx SCM_UNUSED)
463 {
464 for (;;) {
465 int c = Scm_GetcUnsafe(port);
466 if (c == EOF) return c;
467 if (c <= 127) {
468 if (isspace(c)) continue;
469 if (c == ';') {
470 read_comment(port);
471 continue;
472 }
473 return c;
474 }
475 else if (!SCM_CHAR_EXTRA_WHITESPACE(c)) return c;
476 }
477 }
478
reject_in_r7(ScmPort * port,ScmReadContext * ctx SCM_UNUSED,const char * token)479 static void reject_in_r7(ScmPort *port,
480 ScmReadContext *ctx SCM_UNUSED,
481 const char *token)
482 {
483 if (SCM_EQ(Scm_GetPortReaderLexicalMode(port), SCM_SYM_STRICT_R7)) {
484 Scm_ReadError(port,
485 "lexical syntax %s isn't allowed in strict R7RS mode",
486 token);
487 }
488 }
489
read_internal(ScmPort * port,ScmReadContext * ctx)490 static ScmObj read_internal(ScmPort *port, ScmReadContext *ctx)
491 {
492 int c = skipws(port, ctx);
493 switch (c) {
494 case '(':
495 return read_list(port, ')', ctx);
496 case '"':
497 return read_string(port, FALSE, ctx);
498 case '#':
499 {
500 int c1 = Scm_GetcUnsafe(port);
501 switch (c1) {
502 case EOF:
503 Scm_ReadError(port, "premature #-sequence at EOF");
504 return SCM_UNDEFINED; /* dummy */
505 case 't':; case 'T': return read_sharp_word(port, 't', ctx);
506 case 'f':; case 'F': return read_sharp_word(port, 'f', ctx);
507 case 's':; case 'S': return read_sharp_word(port, 's', ctx);
508 case 'u':; case 'U': return read_sharp_word(port, 'u', ctx);
509 case 'c':; case 'C': return read_sharp_word(port, 'c', ctx);
510 case '(':
511 return read_vector(port, ')', ctx);
512 case '\\':
513 return read_char(port, ctx);
514 case 'x':; case 'X':; case 'o':; case 'O':;
515 case 'b':; case 'B':; case 'd':; case 'D':;
516 case 'e':; case 'E':; case 'i':; case 'I':;
517 Scm_UngetcUnsafe(c1, port);
518 return read_number(port, c, 0, ctx); /* let StringToNumber handle radix prefix */
519 case '!':
520 /* #! is either a script shebang or a reader directive */
521 return read_shebang(port, ctx);
522 case '"': {
523 /* #"..." - string interpolation */
524 reject_in_r7(port, ctx, "#\"...\"");
525 Scm_UngetcUnsafe(c1, port);
526 ScmObj form = read_item(port, ctx);
527 return process_sharp_comma(port,
528 SCM_SYM_STRING_INTERPOLATE,
529 SCM_LIST2(form, SCM_FALSE),
530 ctx, FALSE);
531 }
532 case '/':
533 /* #/.../ literal regexp */
534 reject_in_r7(port, ctx, "#/.../");
535 return read_regexp(port);
536 case '[':
537 /* #[...] literal charset */
538 reject_in_r7(port, ctx, "#[...]");
539 return read_charset(port);
540 case ',':
541 /* #,(form) - SRFI-10 read-time macro */
542 return read_sharp_comma(port, ctx);
543 case '|':
544 /* #| - block comment (SRFI-30)
545 it is equivalent to whitespace, so we return #<undef> */
546 read_nested_comment(port, ctx);
547 return SCM_UNDEFINED;
548 case '`': {
549 /* #`"..." - Legacy string interpolation syntax */
550 reject_in_r7(port, ctx, "#`\"...\"");
551 ScmObj form = read_item(port, ctx);
552 return process_sharp_comma(port,
553 SCM_SYM_STRING_INTERPOLATE,
554 SCM_LIST2(form, SCM_TRUE),
555 ctx, FALSE);
556 }
557 case '?': {
558 /* #? - debug directives */
559 reject_in_r7(port, ctx, "#?");
560 int c2 = Scm_GetcUnsafe(port);
561 switch (c2) {
562 case '=': {
563 /* #?=form - debug print */
564 ScmObj form = read_item(port, ctx);
565 return SCM_LIST2(SCM_SYM_DEBUG_PRINT, form);
566 }
567 case ',': {
568 /* #?,form - debug funcall */
569 ScmObj form = read_item(port, ctx);
570 return SCM_LIST2(SCM_SYM_DEBUG_FUNCALL, form);
571 }
572 case EOF:
573 return SCM_EOF;
574 default:
575 Scm_ReadError(port, "unsupported #?-syntax: #?%C", c2);
576 return SCM_UNDEFINED; /* dummy */
577 }
578 }
579 case '0': case '1': case '2': case '3': case '4':
580 case '5': case '6': case '7': case '8': case '9':
581 /* #N#, #N= or #Nr form */
582 return read_num_prefixed(port, c1, ctx);
583 case '*': {
584 reject_in_r7(port, ctx, "#*");
585 /* #**"...." byte string
586 #*01001001 for bit vector. */
587 return read_sharp_asterisk(port, ctx);
588 }
589 case ':': {
590 reject_in_r7(port, ctx, "#:");
591 /* #:name - uninterned symbol */
592 return read_immediate_symbol(port, Scm_GetcUnsafe(port),
593 FALSE, "#:", ctx);
594 }
595 case ';': {
596 /* #;expr - comment out sexpr */
597 int orig = ctx->flags;
598 ctx->flags |= RCTX_DISABLE_CTOR;
599 read_item(port, ctx); /* read and discard */
600 ctx->flags = orig;
601 return SCM_UNDEFINED; /* indicate this is a comment */
602 }
603 default:
604 Scm_ReadError(port, "unsupported #-syntax: #%C", c1);
605 return SCM_UNDEFINED; /* dummy */
606 }
607 }
608 case '\'': return read_quoted(port, SCM_SYM_QUOTE, ctx);
609 case '`': return read_quoted(port, SCM_SYM_QUASIQUOTE, ctx);
610 case ':':
611 /* Would be removed once we make keywords a subtype of symbols. */
612 if (SCM_EQ(Scm_GetPortReaderLexicalMode(port), SCM_SYM_STRICT_R7)) {
613 return read_symbol(port, c, ctx);
614 } else {
615 return read_keyword(port, ctx);
616 }
617 case ',':
618 {
619 int c1 = Scm_GetcUnsafe(port);
620 if (c1 == EOF) {
621 Scm_ReadError(port, "unterminated unquote");
622 return SCM_UNDEFINED; /* dummy */
623 } else if (c1 == '@') {
624 return read_quoted(port, SCM_SYM_UNQUOTE_SPLICING, ctx);
625 } else {
626 Scm_UngetcUnsafe(c1, port);
627 return read_quoted(port, SCM_SYM_UNQUOTE, ctx);
628 }
629 }
630 case '|':
631 return read_escaped_symbol(port, '|', TRUE, ctx);
632 case '[':
633 /* TODO: make it customizable */
634 reject_in_r7(port, ctx, "[]");
635 return read_list(port, ']', ctx);
636 case '{':
637 reject_in_r7(port, ctx, "{}");
638 return read_list(port, '}', ctx);
639 case '+':; case '-':
640 /* Note: R5RS doesn't permit identifiers beginning with '+' or '-',
641 but some Scheme programs use such identifiers. */
642 return read_symbol_or_number(port, c, ctx);
643 case '.':
644 {
645 int c1 = Scm_GetcUnsafe(port);
646 if (!char_word_constituent(c1, FALSE)) {
647 Scm_ReadError(port, "dot in wrong context");
648 }
649 Scm_UngetcUnsafe(c1, port);
650 return read_symbol_or_number(port, c, ctx);
651 }
652 case '0':; case '1':; case '2':; case '3':; case '4':;
653 case '5':; case '6':; case '7':; case '8':; case '9':;
654 /* Note: R5RS doesn't permit identifiers beginning with digits,
655 but some Scheme programs use such identifiers. */
656 return read_symbol_or_number(port, c, ctx);
657 case ')':; case ']':; case '}':;
658 Scm_ReadError(port, "extra close parenthesis `%c'", c);
659 return SCM_UNDEFINED; /* dummy */
660 case EOF:
661 return SCM_EOF;
662 default:
663 return read_symbol(port, c, ctx);
664 }
665 }
666
read_item(ScmPort * port,ScmReadContext * ctx)667 static ScmObj read_item(ScmPort *port, ScmReadContext *ctx)
668 {
669 for (;;) {
670 ScmObj obj = read_internal(port, ctx);
671 if (!SCM_UNDEFINEDP(obj)) return obj;
672 }
673 }
674
675 /*--------------------------------------------------------
676 * Common routine to handle hex-digit escape \xNN etc.
677 */
678 /* TODO: Redesign these for clearer and simpler API. */
679
680 /*
681 Hex-escape sequence can appear in the following places:
682
683 - character literals #\x1b
684 - string literals "\x1b;("
685 - symbol literals |\x1b;|
686 - character set literals #[\x1b;-\x1f;]
687 - regexp literals #/\x1b;+ /
688
689 Legacy Gauche supports fixed-digit syntax.
690
691 \xN{2} - char code up to 256 in _native encoding_
692 \uN{4} - unicode codepoint in BMP
693 \UN{8} - unicode codepoint
694
695 New (R7RS) syntax has variable number of digits, terminated by ';'
696
697 \xN{1,}; - unicode codepoint
698
699 There's an ambiguity in '\x' case - we first try to read it as R7RS syntax,
700 then falls back to legacy syntax, by default. Because of this backtrack,
701 the API is a bit complicated.
702
703 Scm_ReadXdigitsFromString reads from char* buffer, up to buflen octets.
704 The preceding backslash and a character 'x', 'u' or 'U' should already
705 be read. The character is passed to KEY. It also receives
706 ScmReaderLexicalMode.
707
708 Character literal doesn't take terminating ';'. TERMINATOR flag
709 indicates whether we expect terminator or not.
710
711 NEXTBUF is an output variable points to the next character on success.
712 Returns the retrieved decoded character, or SCM_CHAR_INVALID on error.
713
714 Scm_ReadXdigitsFromPort is a convenience routine on top of
715 Scm_ReadXdigitsFromString, when the caller is reading from a port.
716 Since we need look ahead arbitrary number of characters, we can't just
717 return a retrieved character. The procedure takes ScmDString to which
718 the read characters (the decoded character, plus remaining hexdigit
719 characters) are accumulated. When the escape syntax is invalid,
720 it returns #<string> of prefetched characters. On success, it returns
721 #t.
722 */
723
Scm_ReadXdigitsFromString(const char * buf,int buflen,ScmChar key,ScmObj mode,int terminator,const char ** nextbuf)724 ScmChar Scm_ReadXdigitsFromString(const char *buf,
725 int buflen,
726 ScmChar key, /* x, u or U */
727 ScmObj mode, /* Reader lexical mode */
728 int terminator, /* TRUE expecting ';' */
729 const char **nextbuf)
730 {
731 int legacy_fallback = FALSE;
732
733 if (key == 'x' && !SCM_EQ(mode, SCM_SYM_LEGACY)) {
734 int val = 0, i;
735 int overflow = FALSE;
736 for (i=0; i<buflen; i++) {
737 if (isxdigit(buf[i])) {
738 val = val*16 + Scm_DigitToInt(buf[i], 16, FALSE);
739 if (val > 0x10ffff) overflow = TRUE;
740 } else if (terminator && buf[i] == ';' && i > 0) {
741 /* R7RS syntax */
742 *nextbuf = buf+i+1;
743 return overflow? SCM_CHAR_INVALID : Scm_UcsToChar(val);
744 } else if (terminator && i < 2) {
745 return SCM_CHAR_INVALID;
746 } else {
747 break;
748 }
749 }
750 if (!terminator && i == buflen) {
751 *nextbuf = buf+i;
752 return overflow? SCM_CHAR_INVALID:Scm_UcsToChar(val);
753 }
754 /* Fallback to legacy syntax */
755 legacy_fallback = TRUE;
756 }
757 if (SCM_EQ(mode, SCM_SYM_STRICT_R7)) return SCM_CHAR_INVALID;
758 if (key == 'x' && SCM_EQ(mode, SCM_SYM_WARN_LEGACY)) {
759 Scm_Warn("Legacy \\x hex-escape: \\x%c%c", buf[0], buf[1]);
760 }
761
762 {
763 int val = 0;
764 int ndigits = (key == 'u')? 4 : (key == 'x')? 2 : 8;
765 if (ndigits > buflen) return SCM_CHAR_INVALID;
766 for (int i=0; i<ndigits; i++) {
767 if (!isxdigit(buf[i])) return SCM_CHAR_INVALID;
768 val = val * 16 + Scm_DigitToInt(buf[i], 16, FALSE);
769 }
770 *nextbuf = buf + ndigits;
771 if (!legacy_fallback && key != 'x') val = Scm_UcsToChar(val);
772 return val;
773 }
774 }
775
776 /* On success, parsed char and other prefetched hexdigits are added
777 to BUF and returns #t. Otherwise the prefetched string is returned. */
Scm_ReadXdigitsFromPort(ScmPort * port,int key,ScmObj mode,int incompletep,ScmDString * buf)778 ScmObj Scm_ReadXdigitsFromPort(ScmPort *port, int key, ScmObj mode,
779 int incompletep, ScmDString *buf)
780 {
781 ScmDString ds;
782
783 Scm_DStringInit(&ds);
784 for (;;) {
785 int ch = Scm_GetcUnsafe(port);
786 if (ch == ';') {
787 Scm_DStringPutc(&ds, ch);
788 break;
789 }
790 if (ch == EOF || ch >= 0x80 || !isxdigit(ch)) {
791 Scm_UngetcUnsafe(ch, port);
792 break;
793 }
794 Scm_DStringPutc(&ds, ch);
795 }
796
797 ScmSmallInt numchars;
798 const char *chars = Scm_DStringPeek(&ds, &numchars, NULL);
799 const char *next;
800
801 int r = Scm_ReadXdigitsFromString(chars, numchars, key, mode, TRUE, &next);
802 if (r != SCM_CHAR_INVALID) {
803 if (incompletep) Scm_DStringPutb(buf, r);
804 else Scm_DStringPutc(buf, r);
805 if (next - chars < numchars) {
806 Scm_DStringPutz(buf, next, numchars - (next-chars));
807 }
808 return SCM_TRUE;
809 } else {
810 return Scm_MakeString(chars, numchars, -1, SCM_STRING_COPYING);
811 }
812 }
813
814 /* Another utility procedure. Read sequence of digits from PORT and made it
815 long and return. CH can be prefetched digit char; if you don't
816 have one, pass SCM_CHAR_INVALID. The first nondigit char read
817 is stored in NEXT, and number of digits read is stored in NUMREAD.
818 If the number overflows LONG_MAX or there's no digits to be read,
819 returns -1 (you can check NUMREAD to see which is the case).
820 NB: This may throw an error if ch is neither a valid digit char
821 nor SCM_CHAR_INVALID. */
822 /* NB: Call for refactoring, using Scm_ParseDigitsAsLong */
Scm_ReadDigitsAsLong(ScmPort * port,ScmChar ch,int radix,ScmChar * next,ScmSize * numread)823 long Scm_ReadDigitsAsLong(ScmPort *port, ScmChar ch, int radix,
824 ScmChar *next, /*out*/
825 ScmSize *numread /*out*/)
826 {
827 long val = 0;
828 ScmSize nchars = 0;
829 if (ch != SCM_CHAR_INVALID) {
830 int v = Scm_DigitToInt(ch, radix, FALSE);
831 if (v < 0) {
832 /* This is for safety net. The caller should filter out
833 this case. */
834 Scm_ReadError(port, "Digit char expected, but got %C", ch);
835 }
836 val = v;
837 }
838 for (;;) {
839 ch = Scm_Getc(port);
840 nchars++;
841 int v = Scm_DigitToInt(ch, radix, FALSE);
842 if (v < 0) { /* EOF case is covered here */
843 *next = ch;
844 *numread = nchars;
845 return val;
846 }
847 if (val >= (LONG_MAX/radix+1)) {
848 /* we'll overflow */
849 *next = ch;
850 *numread = nchars;
851 return -1;
852 }
853 val = val*radix+v;
854 }
855 /* NOTREACHED */
856 }
857
858 /* Read long digits from BUF up to LEN. BUF must only contain single-byte
859 chars. */
Scm_ParseDigitsAsLong(const char * buf,ScmSize len,int radix,ScmSize * numread)860 long Scm_ParseDigitsAsLong(const char *buf, ScmSize len, int radix,
861 ScmSize *numread) /*out*/
862 {
863 long val = 0;
864 ScmSize nchars = 0;
865 for (; nchars < len; nchars++, buf++) {
866 int v = Scm_DigitToInt((ScmChar)*buf, radix, FALSE);
867 if (v < 0) {
868 *numread = nchars;
869 return val;
870 }
871 if (val >= (LONG_MAX/radix+1)) {
872 /* we'll overflow */
873 *numread = nchars;
874 return -1;
875 }
876 val = val*radix+v;
877 }
878 *numread = nchars;
879 return (nchars == 0)? -1 : val;
880 }
881
882 /*----------------------------------------------------------------
883 * List
884 */
885
886 /* Internal read_list. returns whether the list contains unresolved
887 reference or not within the flag has_ref */
read_list_int(ScmPort * port,ScmChar closer,ScmReadContext * ctx,int * has_ref,int start_line)888 static ScmObj read_list_int(ScmPort *port, ScmChar closer,
889 ScmReadContext *ctx, int *has_ref, int start_line)
890 {
891 ScmObj start = SCM_NIL, last = SCM_NIL;
892 int dot_seen = FALSE, ref_seen = FALSE;
893
894 for (;;) {
895 int c = skipws(port, ctx);
896 if (c == EOF) goto eoferr;
897 if (c == closer) {
898 *has_ref = !!ref_seen;
899 return start;
900 }
901
902 /* NB: We need to keep reading after we see the dotted pair, since
903 there can be a case like (a . b #;c), which is valid syntax.
904 We only reject when we see a next item after reading dotted pair. */
905 ScmObj item;
906 if (c == '.') {
907 if (dot_seen) goto baddot;
908 int c2 = Scm_GetcUnsafe(port);
909 if (c2 == closer) {
910 goto baddot;
911 } else if (c2 == EOF) {
912 goto eoferr;
913 } else if (!char_word_constituent(c2, FALSE)) {
914 /* can be a dot pair at the end */
915 if (start == SCM_NIL) goto baddot;
916 Scm_UngetcUnsafe(c2, port);
917 item = read_item(port, ctx);
918 if (SCM_READ_REFERENCE_P(item)) ref_seen = TRUE;
919 SCM_SET_CDR_UNCHECKED(last, item);
920 dot_seen = TRUE;
921 continue;
922 }
923 Scm_UngetcUnsafe(c2, port);
924 item = read_symbol_or_number(port, c, ctx);
925 } else {
926 Scm_UngetcUnsafe(c, port);
927 item = read_internal(port, ctx);
928 if (SCM_UNDEFINEDP(item)) continue; /* it was just a comment */
929 if (dot_seen) goto baddot;
930 if (SCM_READ_REFERENCE_P(item)) ref_seen = TRUE;
931 }
932 SCM_APPEND1(start, last, item);
933 }
934 eoferr:
935 if (start_line >= 0) {
936 Scm_ReadError(port, "EOF inside a list (starting from line %d)",
937 start_line);
938 } else {
939 Scm_ReadError(port, "EOF inside a list");
940 }
941 baddot:
942 Scm_ReadError(port, "bad dot syntax");
943 return SCM_NIL; /* dummy */
944 }
945
read_list(ScmPort * port,ScmChar closer,ScmReadContext * ctx)946 static ScmObj read_list(ScmPort *port, ScmChar closer, ScmReadContext *ctx)
947 {
948 int has_ref;
949 ScmSize line = -1;
950
951 if (ctx->flags & RCTX_SOURCE_INFO) line = Scm_PortLine(port);
952
953 ScmObj r = read_list_int(port, closer, ctx, &has_ref, line);
954
955 if (SCM_PAIRP(r) && (ctx->flags & RCTX_SOURCE_INFO) && line >= 0) {
956 /* Swap the head of the list for an extended pair to record
957 source-code info.*/
958 r = Scm_ExtendedCons(SCM_CAR(r), SCM_CDR(r));
959 Scm_PairAttrSet(SCM_PAIR(r), SCM_SYM_SOURCE_INFO,
960 SCM_LIST2(Scm_PortName(port), SCM_MAKE_INT(line)));
961 }
962
963 if (has_ref) ref_push(ctx, r, SCM_FALSE);
964 return r;
965 }
966
read_vector(ScmPort * port,ScmChar closer,ScmReadContext * ctx)967 static ScmObj read_vector(ScmPort *port, ScmChar closer, ScmReadContext *ctx)
968 {
969 int has_ref;
970 ScmSize line = -1;
971 ScmObj r;
972
973 if (ctx->flags & RCTX_SOURCE_INFO) line = Scm_PortLine(port);
974 r = read_list_int(port, closer, ctx, &has_ref, line);
975 r = Scm_ListToVector(r, 0, -1);
976 if (has_ref) ref_push(ctx, r, SCM_FALSE);
977 return r;
978 }
979
read_quoted(ScmPort * port,ScmObj quoter,ScmReadContext * ctx)980 static ScmObj read_quoted(ScmPort *port, ScmObj quoter, ScmReadContext *ctx)
981 {
982 ScmSize line = -1;
983 ScmObj r;
984
985 if (ctx->flags & RCTX_SOURCE_INFO) line = Scm_PortLine(port);
986 ScmObj item = read_item(port, ctx);
987 if (SCM_EOFP(item)) Scm_ReadError(port, "unterminated quote");
988 if (line >= 0) {
989 r = Scm_ExtendedCons(quoter, Scm_Cons(item, SCM_NIL));
990 Scm_PairAttrSet(SCM_PAIR(r), SCM_SYM_SOURCE_INFO,
991 SCM_LIST2(Scm_PortName(port), SCM_MAKE_INT(line)));
992 } else {
993 r = Scm_Cons(quoter, Scm_Cons(item, SCM_NIL));
994 }
995 if (SCM_READ_REFERENCE_P(item)) ref_push(ctx, SCM_CDR(r), SCM_FALSE);
996 return r;
997 }
998
999 /*----------------------------------------------------------------
1000 * String
1001 */
1002
1003 /* Handling \xNN;, \uNNNN, \UNNNNNNNN escapes. To make it easier
1004 to support both legacy \xN{2} syntax and new \xN{1,} syntax,
1005 we read-ahead as many hexdigits as possible first, then parse it,
1006 and append the result to BUF, along with any unused digits. */
read_string_xdigits(ScmPort * port,int key,int incompletep,ScmDString * buf)1007 static void read_string_xdigits(ScmPort *port, int key,
1008 int incompletep, ScmDString *buf)
1009 {
1010 ScmObj bad = Scm_ReadXdigitsFromPort(port, key,
1011 Scm_GetPortReaderLexicalMode(port),
1012 incompletep, buf);
1013 if (SCM_STRINGP(bad)) {
1014 /* skip chars to the end of string, so that the reader will read
1015 after the erroneous string */
1016 for (;;) {
1017 int c;
1018 if (incompletep) c = Scm_GetbUnsafe(port);
1019 else c = Scm_GetcUnsafe(port);
1020 if (c == EOF || c == '"') break;
1021 if (c == '\\') {
1022 if (incompletep) c = Scm_GetbUnsafe(port);
1023 else c = Scm_GetcUnsafe(port);
1024 }
1025 }
1026 Scm_ReadError(port,
1027 "Bad \\%c escape sequence in a string literal: `\\%c%A'",
1028 key, key, bad);
1029 }
1030 }
1031
read_string(ScmPort * port,int incompletep,ScmReadContext * ctx SCM_UNUSED)1032 static ScmObj read_string(ScmPort *port, int incompletep,
1033 ScmReadContext *ctx SCM_UNUSED)
1034 {
1035 int c = 0;
1036 ScmDString ds;
1037 Scm_DStringInit(&ds);
1038
1039 #define FETCH(var) \
1040 if (incompletep) { var = Scm_GetbUnsafe(port); } \
1041 else { var = Scm_GetcUnsafe(port); }
1042 #define ACCUMULATE(var) \
1043 if (incompletep) { SCM_DSTRING_PUTB(&ds, var); } \
1044 else { SCM_DSTRING_PUTC(&ds, var); }
1045 #define INTRALINE_WS(var) \
1046 ((var)==' ' || (var)=='\t' || SCM_CHAR_EXTRA_WHITESPACE_INTRALINE(var))
1047
1048 for (;;) {
1049 FETCH(c);
1050 switch (c) {
1051 case EOF: goto eof_exit;
1052 case '"': goto finish;
1053 backslash:
1054 case '\\': {
1055 FETCH(c);
1056 switch (c) {
1057 case EOF: goto eof_exit;
1058 case 'n': ACCUMULATE('\n'); break;
1059 case 'r': ACCUMULATE('\r'); break;
1060 case 'f': ACCUMULATE('\f'); break;
1061 case 't': ACCUMULATE('\t'); break;
1062 case 'a': ACCUMULATE(0x07); break; /* alarm */
1063 case 'b': ACCUMULATE(0x08); break; /* backspace */
1064 case '\\': ACCUMULATE('\\'); break;
1065 case '0': ACCUMULATE(0); break;
1066 case 'x': {
1067 read_string_xdigits(port, 'x', incompletep, &ds);
1068 break;
1069 }
1070 case 'u': case 'U': {
1071 if (SCM_EQ(Scm_GetPortReaderLexicalMode(port),
1072 SCM_SYM_STRICT_R7)) {
1073 Scm_ReadError(port, "\\%c in string literal isn't allowed "
1074 "in strinct-r7rs mode", c);
1075 }
1076 read_string_xdigits(port, c, incompletep, &ds);
1077 break;
1078 }
1079 /* R6RS-style line continuation handling*/
1080 line_continuation:
1081 case ' ':
1082 case '\t': {
1083 for (;;) {
1084 FETCH(c);
1085 if (c == EOF) goto eof_exit;
1086 if (c == '\r') goto cont_cr;
1087 if (c == '\n') goto cont_lf;
1088 if (!INTRALINE_WS(c)) goto cont_err;
1089 }
1090 }
1091 /*FALLTHROUGH*/
1092 cont_cr:
1093 case '\r': {
1094 FETCH(c);
1095 if (c == EOF) goto eof_exit;
1096 if (c == '"') goto finish;
1097 if (c == '\\') goto backslash;
1098 if (c != '\n' && !INTRALINE_WS(c)) {
1099 ACCUMULATE(c);
1100 break;
1101 }
1102 }
1103 /*FALLTHROUGH*/
1104 cont_lf:
1105 case '\n': {
1106 for (;;) {
1107 FETCH(c);
1108 if (c == EOF) goto eof_exit;
1109 if (c == '"') goto finish;
1110 if (c == '\\') goto backslash;
1111 if (!INTRALINE_WS(c)) {
1112 ACCUMULATE(c);
1113 break;
1114 }
1115 }
1116 break;
1117 }
1118 default:
1119 if (SCM_CHAR_EXTRA_WHITESPACE_INTRALINE(c)) goto line_continuation;
1120 else ACCUMULATE(c);
1121 }
1122 break;
1123 }
1124 default: ACCUMULATE(c); break;
1125 }
1126 }
1127 eof_exit:
1128 Scm_ReadError(port, "EOF encountered in a string literal: %S",
1129 Scm_DStringGet(&ds, 0));
1130 cont_err:
1131 Scm_ReadError(port, "Invalid line continuation sequence in a string literal: %S...",
1132 Scm_DStringGet(&ds, 0));
1133 finish:;
1134 int flags = ((incompletep? SCM_STRING_INCOMPLETE:0) | SCM_STRING_IMMUTABLE);
1135 return Scm_DStringGet(&ds, flags);
1136 }
1137
1138 /*----------------------------------------------------------------
1139 * Character
1140 */
1141
1142 static struct char_name {
1143 const char *name;
1144 u_int size;
1145 ScmObj ch;
1146 } char_names[] = {
1147 #define DEFCHAR(name, char) \
1148 { #name, sizeof(#name)-1, SCM_MAKE_CHAR(char) }
1149 DEFCHAR(alarm, 0x07),
1150 DEFCHAR(backspace, 0x08),
1151 DEFCHAR(space, ' '),
1152 DEFCHAR(newline, '\n'),
1153 DEFCHAR(nl, '\n'),
1154 DEFCHAR(lf, '\n'),
1155 DEFCHAR(return, '\r'),
1156 DEFCHAR(cr, '\r'),
1157 DEFCHAR(tab, '\t'),
1158 DEFCHAR(ht, '\t'),
1159 DEFCHAR(page, '\f'),
1160 DEFCHAR(escape, 0x1b),
1161 DEFCHAR(esc, 0x1b),
1162 DEFCHAR(delete, 0x7f),
1163 DEFCHAR(del, 0x7f),
1164 DEFCHAR(null, 0),
1165 { NULL, 0, 0 }
1166 };
1167
read_char(ScmPort * port,ScmReadContext * ctx)1168 static ScmObj read_char(ScmPort *port, ScmReadContext *ctx)
1169 {
1170 ScmString *name;
1171
1172 int c = Scm_GetcUnsafe(port);
1173 switch (c) {
1174 case EOF:
1175 Scm_ReadError(port, "EOF encountered in character literal");
1176 break;
1177 case '(':; case ')':; case '[':; case ']':; case '{':; case '}':;
1178 case '"':; case ' ':; case '\\':; case '|':; case ';':;
1179 case '#':;
1180 return SCM_MAKE_CHAR(c);
1181 default: {
1182 /* need to read word to see if it is a character name */
1183 name = SCM_STRING(read_word(port, c, ctx, TRUE, FALSE));
1184
1185 if (SCM_EQ(Scm_GetPortReaderLexicalMode(port), SCM_SYM_STRICT_R7)) {
1186 ScmChar following = Scm_GetcUnsafe(port);
1187 if (!char_is_delimiter(following)) {
1188 Scm_Error("Character literal isn't delimited: #\\%A%C ...",
1189 name, following);
1190 }
1191 Scm_UngetcUnsafe(following, port);
1192 }
1193
1194 ScmSmallInt namelen, namesize;
1195 const char *cname = Scm_GetStringContent(name, &namesize, &namelen,
1196 NULL);
1197 if (namelen == 1) {
1198 return SCM_MAKE_CHAR(c);
1199 }
1200 if (namelen != namesize) {
1201 /* no character name contains multibyte chars */
1202
1203 goto unknown;
1204 }
1205
1206 ScmObj lexmode = Scm_GetPortReaderLexicalMode(port);
1207
1208 /* handle #\x1f etc. */
1209 if (cname[0] == 'x' && isxdigit(cname[1])) {
1210 const char *nextptr;
1211 ScmChar code = Scm_ReadXdigitsFromString(cname+1, namesize-1, 'x',
1212 lexmode, FALSE, &nextptr);
1213 if (code == SCM_CHAR_INVALID || *nextptr != '\0') goto unknown;
1214 return SCM_MAKE_CHAR(code);
1215 }
1216 /* handle legacy #\uxxxx or #\uxxxxxxxx */
1217 if ((cname[0] == 'u') && isxdigit(cname[1])
1218 && (!SCM_EQ(lexmode, SCM_SYM_STRICT_R7))) {
1219 if (namesize >= 5 && namesize <= 9) {
1220 const char *nextptr;
1221 /* NB: We want to allow variable number of digits, so
1222 we pass 'x' as key (instead of 'u') here. */
1223 ScmChar code = Scm_ReadXdigitsFromString(cname+1, namesize-1,
1224 'x', lexmode, FALSE,
1225 &nextptr);
1226 if (code >= 0 && *nextptr == '\0') {
1227 return SCM_MAKE_CHAR(code);
1228 }
1229 }
1230 /* if we come here, it's an error. */
1231 Scm_ReadError(port, "Bad UCS character code: #\\%s", cname);
1232 }
1233
1234 struct char_name *cntab = char_names;
1235 while (cntab->name) {
1236 if (cntab->size == (unsigned)namesize
1237 && strncmp(cntab->name, cname, namesize) == 0) {
1238 return cntab->ch;
1239 }
1240 cntab++;
1241 }
1242 unknown:
1243 Scm_ReadError(port, "Unknown character name: #\\%A", name);
1244 }
1245 }
1246 return SCM_UNDEFINED; /* dummy */
1247 }
1248
1249 /*----------------------------------------------------------------
1250 * Symbols and Numbers
1251 */
1252
1253 /* Reads a sequence of word-constituent characters from PORT, and returns
1254 ScmString. INITIAL may be a readahead character, or SCM_CHAR_INVALID
1255 if there's none. TEMP_CASE_FOLD turns on case-fold mode regardless of
1256 the read context setting. INCLUDE_HASH_SIGN allows '#' to appear in
1257 the word.
1258 */
read_word(ScmPort * port,ScmChar initial,ScmReadContext * ctx SCM_UNUSED,int temp_case_fold,int include_hash_sign)1259 static ScmObj read_word(ScmPort *port, ScmChar initial,
1260 ScmReadContext *ctx SCM_UNUSED,
1261 int temp_case_fold, int include_hash_sign)
1262 {
1263 int case_fold = temp_case_fold || SCM_PORT_CASE_FOLDING(port);
1264 ScmDString ds;
1265 Scm_DStringInit(&ds);
1266 if (initial != SCM_CHAR_INVALID) {
1267 if (case_fold && char_word_case_fold(initial)) initial = tolower(initial);
1268 SCM_DSTRING_PUTC(&ds, initial);
1269 }
1270
1271 for (;;) {
1272 int c = Scm_GetcUnsafe(port);
1273 if (c == EOF || !char_word_constituent(c, include_hash_sign)) {
1274 Scm_UngetcUnsafe(c, port);
1275 return Scm_DStringGet(&ds, 0);
1276 }
1277 if (case_fold && char_word_case_fold(c)) c = tolower(c);
1278 SCM_DSTRING_PUTC(&ds, c);
1279 }
1280 }
1281
1282 /* Kaveat: We don't allow '#' in symbols, but we need to read '#'
1283 for numbers. To allow weird identifers like '1+', we need to read the
1284 word as a number fist and convert it to a symbol when the read word
1285 can't be interpreted as a number. For the consistency, we read
1286 with '#' and then check it in read_symbol, too. */
check_valid_symbol(ScmString * s)1287 static void check_valid_symbol(ScmString *s)
1288 {
1289 ScmObj r = Scm_StringScanChar(s, SCM_CHAR('#'), SCM_STRING_SCAN_INDEX);
1290 if (!SCM_FALSEP(r)) {
1291 Scm_Error("invalid symbol name: %S", SCM_OBJ(s));
1292 }
1293 }
1294
1295 /* Read a symbol starting with INITIAL (assuming unescaped), interned. */
read_symbol(ScmPort * port,ScmChar initial,ScmReadContext * ctx)1296 static ScmObj read_symbol(ScmPort *port, ScmChar initial, ScmReadContext *ctx)
1297 {
1298 ScmString *s = SCM_STRING(read_word(port, initial, ctx, FALSE, TRUE));
1299 check_valid_symbol(s);
1300 return Scm_Intern(s);
1301 }
1302
1303 /* This is called when a symbol (either bare or escaped) is expected
1304 starting with INITIAL char; e.g. after #: or #!. An error is thrown
1305 if a symbol isn't immediately followed. PRECEDING is used in the
1306 error message. */
read_immediate_symbol(ScmPort * port,ScmChar initial,int interned,const char * preceding,ScmReadContext * ctx)1307 static ScmObj read_immediate_symbol(ScmPort *port, ScmChar initial,
1308 int interned, const char *preceding,
1309 ScmReadContext *ctx)
1310 {
1311 if (initial == '|') {
1312 return read_escaped_symbol(port, initial, interned, ctx);
1313 } else if (char_word_constituent(initial, FALSE)) {
1314 ScmString *s = SCM_STRING(read_word(port, initial, ctx, FALSE, TRUE));
1315 check_valid_symbol(s);
1316 /* we have to exclude numbers. a bit ugly - call for cleanup */
1317 if (!(isdigit(initial) || initial == '+' || initial == '-')
1318 || SCM_FALSEP(Scm_StringToNumber(s, 10, 0))) {
1319 return Scm_MakeSymbol(s, interned);
1320 }
1321 }
1322 /* If we come here, we have invalid syntax. */
1323 if (initial == EOF) {
1324 Scm_ReadError(port, "'%s' followed by nothing", preceding);
1325 } else {
1326 Scm_ReadError(port, "invalid %s syntax near '%s%C'", preceding,
1327 preceding, initial);
1328 }
1329 return SCM_UNDEFINED; /* dummy */
1330 }
1331
read_number(ScmPort * port,ScmChar initial,int radix,ScmReadContext * ctx)1332 static ScmObj read_number(ScmPort *port, ScmChar initial, int radix,
1333 ScmReadContext *ctx)
1334 {
1335 ScmString *s = SCM_STRING(read_word(port, initial, ctx, FALSE, TRUE));
1336 u_long flags = radix >=2 ? SCM_NUMBER_FORMAT_ALT_RADIX : 0;
1337 if (SCM_EQ(Scm_GetPortReaderLexicalMode(port), SCM_SYM_STRICT_R7)) {
1338 flags |= SCM_NUMBER_FORMAT_STRICT_R7RS;
1339 }
1340 int default_radix = radix >= 2? radix : 10;
1341 ScmObj num = Scm_StringToNumber(s, default_radix, flags);
1342 if (num == SCM_FALSE) {
1343 if (radix >= 2) {
1344 /* In this case, we've read #<radix>r syntax */
1345 Scm_ReadError(port, "bad numeric format: \"#%dr%A\"", radix, s);
1346 } else {
1347 Scm_ReadError(port, "bad numeric format: %S", s);
1348 }
1349 }
1350 return num;
1351 }
1352
read_symbol_or_number(ScmPort * port,ScmChar initial,ScmReadContext * ctx)1353 static ScmObj read_symbol_or_number(ScmPort *port, ScmChar initial, ScmReadContext *ctx)
1354 {
1355 ScmString *s = SCM_STRING(read_word(port, initial, ctx, FALSE, TRUE));
1356 u_long flags = 0;
1357 if (SCM_EQ(Scm_GetPortReaderLexicalMode(port), SCM_SYM_STRICT_R7)) {
1358 flags |= SCM_NUMBER_FORMAT_STRICT_R7RS;
1359 }
1360 ScmObj num = Scm_StringToNumber(s, 10, flags);
1361 if (num != SCM_FALSE) return num;
1362 check_valid_symbol(s);
1363 return Scm_Intern(s);
1364 }
1365
read_keyword(ScmPort * port,ScmReadContext * ctx)1366 static ScmObj read_keyword(ScmPort *port, ScmReadContext *ctx)
1367 {
1368 int c2 = Scm_GetcUnsafe(port);
1369
1370 if (c2 == '|') {
1371 ScmObj name = read_escaped_symbol(port, c2, FALSE, ctx); /* read as uninterned */
1372 return Scm_MakeKeyword(SCM_SYMBOL_NAME(name));
1373 } else {
1374 Scm_UngetcUnsafe(c2, port);
1375 ScmObj name = read_word(port, SCM_CHAR_INVALID, ctx, FALSE, FALSE);
1376 return Scm_MakeKeyword(SCM_STRING(name));
1377 }
1378 }
1379
read_escaped_symbol(ScmPort * port,ScmChar delim,int interned,ScmReadContext * ctx SCM_UNUSED)1380 static ScmObj read_escaped_symbol(ScmPort *port, ScmChar delim, int interned,
1381 ScmReadContext *ctx SCM_UNUSED)
1382 {
1383 ScmDString ds;
1384 Scm_DStringInit(&ds);
1385 ScmObj xmode = Scm_GetPortReaderLexicalMode(port);
1386
1387 for (;;) {
1388 int c = Scm_GetcUnsafe(port);
1389 if (c == EOF) {
1390 goto err;
1391 } else if (c == delim) {
1392 ScmString *s = SCM_STRING(Scm_DStringGet(&ds, 0));
1393 return Scm_MakeSymbol(s, interned);
1394 } else if (c == '\\') {
1395 /* CL-style single escape */
1396 c = Scm_GetcUnsafe(port);
1397 if (c == EOF) goto err;
1398 if (SCM_EQ(xmode, SCM_SYM_LEGACY)) {
1399 SCM_DSTRING_PUTC(&ds, c);
1400 } else {
1401 switch (c) {
1402 case 'x': {
1403 /* R7RS-style hex escape. */
1404 ScmObj bad = Scm_ReadXdigitsFromPort(port, 'x', xmode,
1405 FALSE, &ds);
1406 if (SCM_STRINGP(bad)) {
1407 Scm_ReadError(port, "invalid hex escape in a symbol literal: \\x%C", bad);
1408 }
1409 break;
1410 }
1411 case '\\': case '|': SCM_DSTRING_PUTC(&ds, c); break;
1412 case 'a': SCM_DSTRING_PUTC(&ds, '\a'); break;
1413 case 'b': SCM_DSTRING_PUTC(&ds, '\b'); break;
1414 case 't': SCM_DSTRING_PUTC(&ds, '\t'); break;
1415 case 'n': SCM_DSTRING_PUTC(&ds, '\n'); break;
1416 case 'r': SCM_DSTRING_PUTC(&ds, '\r'); break;
1417 default:
1418 if (SCM_EQ(xmode, SCM_SYM_STRICT_R7)) {
1419 Scm_ReadError(port, "invalid backslash-escape in a symbol literal: \\%A", SCM_MAKE_CHAR(c));
1420 } else {
1421 SCM_DSTRING_PUTC(&ds, c);
1422 }
1423 }
1424 }
1425 } else {
1426 SCM_DSTRING_PUTC(&ds, c);
1427 }
1428 }
1429 err:
1430 Scm_ReadError(port, "unterminated escaped symbol: |%s ...",
1431 Scm_DStringGetz(&ds));
1432 return SCM_UNDEFINED; /* dummy */
1433 }
1434
1435 /*----------------------------------------------------------------
1436 * Regexp & charset
1437 */
1438
1439 /* gauche extension : #/regexp/ */
read_regexp(ScmPort * port)1440 static ScmObj read_regexp(ScmPort *port)
1441 {
1442 ScmDString ds;
1443 Scm_DStringInit(&ds);
1444 for (;;) {
1445 ScmChar c = Scm_GetcUnsafe(port);
1446 if (c == SCM_CHAR_INVALID) {
1447 Scm_ReadError(port, "unterminated literal regexp");
1448 }
1449 if (c == '\\') {
1450 /* NB: We "eat" a backslash before '/', since it is only dealt
1451 with the reader and nothing to do with regexp parser itself. */
1452 ScmChar c1 = Scm_GetcUnsafe(port);
1453 if (c1 == SCM_CHAR_INVALID) {
1454 Scm_ReadError(port, "unterminated literal regexp");
1455 }
1456 if (c1 != '/') SCM_DSTRING_PUTC(&ds, c);
1457 SCM_DSTRING_PUTC(&ds, c1);
1458 } else if (c == '/') {
1459 /* Read one more char to see if we have a flag */
1460 int flags = 0;
1461 c = Scm_GetcUnsafe(port);
1462 if (c == 'i') flags |= SCM_REGEXP_CASE_FOLD;
1463 else Scm_UngetcUnsafe(c, port);
1464 return Scm_RegComp(SCM_STRING(Scm_DStringGet(&ds, 0)), flags);
1465 } else {
1466 SCM_DSTRING_PUTC(&ds, c);
1467 }
1468 }
1469 }
1470
1471 /* gauche extension : #[charset] */
read_charset(ScmPort * port)1472 static ScmObj read_charset(ScmPort *port)
1473 {
1474 ScmCharSet *cs = SCM_CHAR_SET(Scm_CharSetRead(port, NULL, TRUE, FALSE));
1475 return Scm_CharSetFreezeX(cs); /* literal is immutable */
1476 }
1477
1478 /*----------------------------------------------------------------
1479 * Numeric prefixed (#N#, #N=, #Nr)
1480 */
1481
read_num_prefixed(ScmPort * port,ScmChar ch,ScmReadContext * ctx)1482 static ScmObj read_num_prefixed(ScmPort *port, ScmChar ch, ScmReadContext *ctx)
1483 {
1484 ScmObj e = SCM_UNBOUND;
1485 ScmChar ch2;
1486 ScmSize nread;
1487 long prefix = Scm_ReadDigitsAsLong(port, ch, 10, &ch2, &nread);
1488
1489 if (ch2 == EOF) Scm_ReadError(port, "unterminated reference form (#digits)");
1490 if (prefix < 0) Scm_ReadError(port, "#-prefix number overflow");
1491
1492 switch (ch2) {
1493 case '#':
1494 /* #digit# - back reference */
1495 if (ctx->table == NULL
1496 || SCM_UNBOUNDP(e = Scm_HashTableRef(ctx->table,
1497 Scm_MakeInteger(prefix),
1498 SCM_UNBOUND))) {
1499 Scm_ReadError(port, "invalid reference number in #%d#", prefix);
1500 }
1501 if (SCM_READ_REFERENCE_P(e) && SCM_READ_REFERENCE_REALIZED(e)) {
1502 return SCM_READ_REFERENCE(e)->value;
1503 } else {
1504 return e;
1505 }
1506 case '=':
1507 /* #digit= - register */
1508 {
1509 ScmObj ref = Scm_MakeReadReference();
1510 if (ctx->table == NULL) {
1511 ctx->table =
1512 SCM_HASH_TABLE(Scm_MakeHashTableSimple(SCM_HASH_EQV, 0));
1513 }
1514 if (!SCM_UNBOUNDP(Scm_HashTableRef(ctx->table,
1515 Scm_MakeInteger(prefix),
1516 SCM_UNBOUND))) {
1517 Scm_ReadError(port, "duplicate back-reference number in #%d=",
1518 prefix);
1519 }
1520 ref_register(ctx, ref, prefix);
1521 ScmObj val = read_item(port, ctx);
1522 if (ref == val) {
1523 /* an edge case: #0=#0# */
1524 Scm_ReadError(port, "indeterminate read reference: #%d=#%d#",
1525 prefix, prefix);
1526 }
1527 SCM_READ_REFERENCE(ref)->value = val;
1528 return val;
1529 }
1530 case 'r': case 'R':
1531 /* #digitR - radix */
1532 if (SCM_EQ(Scm_GetPortReaderLexicalMode(port), SCM_SYM_STRICT_R7)) {
1533 Scm_ReadError(port, "Radix prefix isn't allowed in strict R7RS mode.");
1534 }
1535 if (prefix < SCM_RADIX_MIN || prefix > SCM_RADIX_MAX) {
1536 Scm_ReadError(port, "Radix prefix out of range: radix in #<radix>R must be between %d and %d inclusive, but got: %d",
1537 SCM_RADIX_MIN, SCM_RADIX_MAX, prefix);
1538 return SCM_UNDEFINED; /* dummy */
1539 } else {
1540 int ch = Scm_GetcUnsafe(port);
1541 if (ch == EOF) {
1542 Scm_ReadError(port, "Premature end of radix-prefixed number: #%dr",
1543 prefix);
1544 }
1545 return read_number(port, ch, prefix, ctx);
1546 }
1547 default:
1548 Scm_ReadError(port, "invalid numeric prefix (#, =, r or R is expected) : #%d%A", prefix, SCM_MAKE_CHAR(ch2));
1549 return SCM_UNDEFINED;
1550 }
1551 }
1552
1553 /*----------------------------------------------------------------
1554 * SRFI-10 support
1555 */
1556
1557 /* NB: The 'module' argument of DefineReaderCtor and GetReaderCtor may
1558 be used in future to make reader-ctor binding associated with modules.
1559 For now, it is not used and the caller should pass SCM_FALSE. */
1560
Scm_DefineReaderCtor(ScmObj symbol,ScmObj proc,ScmObj finisher,ScmObj module SCM_UNUSED)1561 ScmObj Scm_DefineReaderCtor(ScmObj symbol, ScmObj proc, ScmObj finisher,
1562 ScmObj module SCM_UNUSED /*reserved*/)
1563 {
1564 if (!SCM_PROCEDUREP(proc)) {
1565 Scm_Error("procedure required, but got %S", proc);
1566 }
1567 ScmObj pair = Scm_Cons(proc, finisher);
1568 (void)SCM_INTERNAL_MUTEX_LOCK(readCtorData.mutex);
1569 Scm_HashTableSet(readCtorData.table, symbol, pair, 0);
1570 (void)SCM_INTERNAL_MUTEX_UNLOCK(readCtorData.mutex);
1571 return SCM_UNDEFINED;
1572 }
1573
Scm_GetReaderCtor(ScmObj symbol,ScmObj module SCM_UNUSED)1574 ScmObj Scm_GetReaderCtor(ScmObj symbol, ScmObj module SCM_UNUSED/*reserved*/)
1575 {
1576 (void)SCM_INTERNAL_MUTEX_LOCK(readCtorData.mutex);
1577 ScmObj r = Scm_HashTableRef(readCtorData.table, symbol, SCM_FALSE);
1578 (void)SCM_INTERNAL_MUTEX_UNLOCK(readCtorData.mutex);
1579 return r;
1580 }
1581
read_sharp_comma(ScmPort * port,ScmReadContext * ctx)1582 static ScmObj read_sharp_comma(ScmPort *port, ScmReadContext *ctx)
1583 {
1584 ScmChar next = Scm_GetcUnsafe(port);
1585 if (next != '(') {
1586 Scm_ReadError(port, "bad #,-form: '(' should be followed, but got %C",
1587 next);
1588 }
1589
1590 int has_ref;
1591 ScmSize line = -1;
1592 if (ctx->flags & RCTX_SOURCE_INFO) line = Scm_PortLine(port);
1593
1594 ScmObj form = read_list_int(port, ')', ctx, &has_ref, line);
1595 int len = Scm_Length(form);
1596 if (len <= 0) {
1597 Scm_ReadError(port, "bad #,-form: #,%S", form);
1598 }
1599 ScmObj r = process_sharp_comma(port, SCM_CAR(form), SCM_CDR(form), ctx,
1600 has_ref);
1601 return r;
1602 }
1603
process_sharp_comma(ScmPort * port,ScmObj key,ScmObj args,ScmReadContext * ctx,int has_ref)1604 static ScmObj process_sharp_comma(ScmPort *port, ScmObj key, ScmObj args,
1605 ScmReadContext *ctx, int has_ref)
1606 {
1607 if (ctx->flags & RCTX_DISABLE_CTOR) return SCM_FALSE;
1608
1609 (void)SCM_INTERNAL_MUTEX_LOCK(readCtorData.mutex);
1610 ScmObj e = Scm_HashTableRef(readCtorData.table, key, SCM_FALSE);
1611 (void)SCM_INTERNAL_MUTEX_UNLOCK(readCtorData.mutex);
1612
1613 if (!SCM_PAIRP(e)) Scm_ReadError(port, "unknown #,-key: %S", key);
1614 ScmObj r = Scm_ApplyRec(SCM_CAR(e), args);
1615 if (has_ref) ref_push(ctx, r, SCM_CDR(e));
1616 return r;
1617 }
1618
reader_ctor(ScmObj * args,int nargs,void * data SCM_UNUSED)1619 static ScmObj reader_ctor(ScmObj *args, int nargs, void *data SCM_UNUSED)
1620 {
1621 ScmObj optarg = (nargs > 2? args[2] : SCM_FALSE);
1622 return Scm_DefineReaderCtor(args[0], args[1], optarg, SCM_FALSE);
1623 }
1624
1625 /*----------------------------------------------------------------
1626 * #!-support
1627 */
1628
Scm_DefineReaderDirective(ScmObj symbol,ScmObj proc)1629 ScmObj Scm_DefineReaderDirective(ScmObj symbol, ScmObj proc)
1630 {
1631 (void)SCM_INTERNAL_MUTEX_LOCK(hashBangData.mutex);
1632 Scm_HashTableSet(hashBangData.table, symbol, proc, 0);
1633 (void)SCM_INTERNAL_MUTEX_UNLOCK(hashBangData.mutex);
1634 return SCM_UNDEFINED;
1635 }
1636
read_shebang(ScmPort * port,ScmReadContext * ctx)1637 static ScmObj read_shebang(ScmPort *port, ScmReadContext *ctx)
1638 {
1639 /* If '#!' appears at the beginning of the input port, and it is
1640 followed by '/' or a space or a newline, then we consider it as
1641 the beginning of shebang and discard the entire line.
1642 Otherwise, we take this as #!<identifier> directive as
1643 specified in R6RS, and calls appropriate handler.
1644
1645 R6RS is actually not very clean at this corner. It requires
1646 distinct modes to parse a script (which always begins with
1647 shebang) and to parse R6RS program (in which '#!' always
1648 introduces #!<identifier> token). There's no way for just one
1649 parser that strictly covers both situation.
1650 */
1651 int c2 = Scm_GetcUnsafe(port);
1652 if (Scm_PortBytes(port) == 3 && (c2 == '/' || c2 == ' ' || c2 == '\n')) {
1653 /* shebang */
1654 for (;;) {
1655 if (c2 == '\n') return SCM_UNDEFINED;
1656 if (c2 == EOF) return SCM_EOF;
1657 c2 = Scm_GetcUnsafe(port);
1658 }
1659 /*NOTREACHED*/
1660 } else {
1661 ScmObj id = read_immediate_symbol(port, c2, TRUE, "#!", ctx);
1662 (void)SCM_INTERNAL_MUTEX_LOCK(hashBangData.mutex);
1663 ScmObj e = Scm_HashTableRef(hashBangData.table, id, SCM_FALSE);
1664 (void)SCM_INTERNAL_MUTEX_UNLOCK(hashBangData.mutex);
1665 if (SCM_FALSEP(e)) {
1666 Scm_Warn("Ignoring unrecognized hash-bang directive: #!%S", id);
1667 return SCM_UNDEFINED;
1668 }
1669 /* Reader directive may return zero or one value. When it returns
1670 no values, we call Scm_VMSetResult to adjust the number of values.
1671 */
1672 ScmObj r = Scm_ApplyRec3(e, id, SCM_OBJ(port), SCM_OBJ(ctx));
1673 if (Scm_VMGetNumResults(Scm_VM()) == 1) return r;
1674 else { Scm_VMSetResult(SCM_UNDEFINED); return SCM_UNDEFINED; }
1675 }
1676 }
1677
1678 /*----------------------------------------------------------------
1679 * #f, #t, #false, #true, and UVector literals
1680 */
1681
1682 /* Pre-0.9.4 reader. #t and #f delimit themselves (except '1', '3' or '6'
1683 follows '#f'.) I doubt any code breaks if we change that, but there
1684 may be a data files around that somehow relies on this behavior. So
1685 we keep this in 'legacy' reader mode. */
read_sharp_word_legacy(ScmPort * port,char ch,ScmReadContext * ctx)1686 static ScmObj read_sharp_word_legacy(ScmPort *port, char ch, ScmReadContext *ctx)
1687 {
1688 ScmChar c1, c2 = SCM_CHAR_INVALID;
1689 char *tag = NULL;
1690
1691 if (ch == 't') return SCM_TRUE;
1692
1693 c1 = Scm_GetcUnsafe(port);
1694 if (ch == 'f') {
1695 if (c1 != '1' && c1 != '3' && c1 != '6') {
1696 Scm_UngetcUnsafe(c1, port);
1697 return SCM_FALSE;
1698 }
1699 c2 = Scm_GetcUnsafe(port);
1700 if (c1 == '3' && c2 == '2') tag = "f32";
1701 else if (c1 == '6' && c2 == '4') tag = "f64";
1702 else if (c1 == '1' && c2 == '6') tag = "f16";
1703 } else {
1704 if (c1 == '8') tag = (ch == 's')? "s8" : "u8";
1705 else if (c1 == '1') {
1706 c2 = Scm_GetcUnsafe(port);
1707 if (c2 == '6') tag = (ch == 's')? "s16" : "u16";
1708 }
1709 else if (c1 == '3') {
1710 c2 = Scm_GetcUnsafe(port);
1711 if (c2 == '2') tag = (ch == 's')? "s32" : "u32";
1712 }
1713 else if (c1 == '6') {
1714 c2 = Scm_GetcUnsafe(port);
1715 if (c2 == '4') tag = (ch == 's')? "s64" : "u64";
1716 }
1717 }
1718 if (tag == NULL) {
1719 char buf[SCM_CHAR_MAX_BYTES*4], *bufp = buf;
1720 *bufp++ = ch;
1721 SCM_CHAR_PUT(bufp, c1);
1722 bufp += SCM_CHAR_NBYTES(c1);
1723 if (c2 != SCM_CHAR_INVALID) {
1724 SCM_CHAR_PUT(bufp, c2);
1725 bufp += SCM_CHAR_NBYTES(c2);
1726 }
1727 *bufp = '\0';
1728 Scm_ReadError(port, "invalid uniform vector tag: %s", buf);
1729 }
1730 return Scm_ReadUVector(port, tag, ctx);
1731 }
1732
1733 /* A 'new' version, friendly to R7RS */
read_sharp_word_1(ScmPort * port,char ch,ScmReadContext * ctx)1734 static ScmObj read_sharp_word_1(ScmPort *port, char ch, ScmReadContext *ctx)
1735 {
1736 ScmString *s = SCM_STRING(read_word(port, ch, ctx, TRUE, FALSE));
1737 const char *w = Scm_GetStringConst(s);
1738 const char *tag = NULL;
1739
1740 switch (ch) {
1741 case 'c':
1742 if (strcmp(w, "c32") == 0
1743 || strcmp(w, "c64") == 0
1744 || strcmp(w, "c128") == 0) {
1745 tag = w;
1746 }
1747 break;
1748 case 'f':
1749 if (strcmp(w, "f16") == 0
1750 || strcmp(w, "f32") == 0
1751 || strcmp(w, "f64") == 0) {
1752 tag = w;
1753 } else if (w[1] == '\0' || strcmp(w, "false") == 0) {
1754 return SCM_FALSE;
1755 }
1756 break;
1757 case 's':
1758 if (strcmp(w, "s8") == 0
1759 || strcmp(w, "s16") == 0
1760 || strcmp(w, "s32") == 0
1761 || strcmp(w, "s64") == 0) {
1762 tag = w;
1763 }
1764 break;
1765 case 'u':
1766 if (strcmp(w, "u8") == 0
1767 || strcmp(w, "u16") == 0
1768 || strcmp(w, "u32") == 0
1769 || strcmp(w, "u64") == 0) {
1770 tag = w;
1771 }
1772 break;
1773 case 't':
1774 if (w[1] == '\0' || strcmp(w, "true") == 0) {
1775 return SCM_TRUE;
1776 }
1777 }
1778 if (tag == NULL) {
1779 Scm_ReadError(port, "invalid #-token: #%s", w);
1780 }
1781 return Scm_ReadUVector(port, tag, ctx);
1782 }
1783
read_sharp_word(ScmPort * port,char ch,ScmReadContext * ctx)1784 static ScmObj read_sharp_word(ScmPort *port, char ch, ScmReadContext *ctx)
1785 {
1786 if (SCM_EQ(Scm_GetPortReaderLexicalMode(port), SCM_SYM_LEGACY)) {
1787 return read_sharp_word_legacy(port, ch, ctx);
1788 } else {
1789 return read_sharp_word_1(port, ch, ctx);
1790 }
1791 }
1792
1793 /*----------------------------------------------------------------
1794 * #**"....", #*1010010...
1795 */
read_sharp_asterisk(ScmPort * port,ScmReadContext * ctx)1796 static ScmObj read_sharp_asterisk(ScmPort *port, ScmReadContext *ctx)
1797 {
1798 /* We used to use #*"..." for incomplete string literals, but it turned
1799 out it can be read as an zero-length bitvector #* followed by an
1800 ordinary string (since a double quote is a <delimiter> in R7RS).
1801
1802 So we changed the literal to #**"..." (yeah, ugly, but incomplete
1803 strings are anomalies and shouldn't be used often).
1804
1805 The reader behavior can be customized by reader-lexical-mode:
1806
1807 permissive (default) - For the next release, #*"..." is read as
1808 an incomplete string, without warning. In future, it warns.
1809 legacy - #*"..." is read as an incomplete string, without warning.
1810 warn-legacy - #*"..." is read as an incomplete string, but warns.
1811 strict-r7 - #*"..." is read as a zero-length bitvector followed by
1812 a string. #**"..." is rejected.
1813 */
1814
1815 int c = Scm_GetcUnsafe(port);
1816 if (c == '*') {
1817 c = Scm_GetcUnsafe(port);
1818 if (c == '"') return read_string(port, TRUE, ctx);
1819 Scm_ReadError(port, "Invalid #* syntax: #**%C", c);
1820 }
1821 if (c == '"') {
1822 ScmObj m = Scm_GetPortReaderLexicalMode(port);
1823 if (SCM_EQ(m, SCM_SYM_PERMISSIVE)) {
1824 /* TRANSIENT: We switch this default behavior to warn in future */
1825 return read_string(port, TRUE, ctx);
1826 }
1827 if (SCM_EQ(m, SCM_SYM_LEGACY)) {
1828 return read_string(port, TRUE, ctx);
1829 }
1830 if (SCM_EQ(m, SCM_SYM_WARN_LEGACY)) {
1831 Scm_Warn("Deprecated incomplete string syntax #*\"...\" at %A:%ld",
1832 Scm_PortName(port), Scm_PortLine(port));
1833 return read_string(port, TRUE, ctx);
1834 }
1835 if (SCM_EQ(m, SCM_SYM_STRICT_R7)) {
1836 Scm_UngetcUnsafe(c, port);
1837 return Scm_MakeBitvector(0, SCM_FALSE);
1838 }
1839 Scm_Panic("invalid reader-lexical-mode");
1840 }
1841
1842 ScmDString ds;
1843 Scm_DStringInit(&ds);
1844 for (;;) {
1845 switch (c) {
1846 case EOF:
1847 goto finish;
1848 case '(': case ')': case '"': case '|': case ';':
1849 Scm_UngetcUnsafe(c, port);
1850 goto finish;
1851 case '0': case '1':
1852 Scm_DStringPutc(&ds, c);
1853 break;
1854 default:
1855 if ((SCM_CHAR_ASCII_P(c) && isspace(c))
1856 || (!SCM_CHAR_ASCII_P(c) && SCM_CHAR_EXTRA_WHITESPACE(c))) {
1857 Scm_UngetcUnsafe(c, port);
1858 goto finish;
1859 }
1860 Scm_ReadError(port, "Invalid char in bitvector literal: %C", c);
1861 }
1862 c = Scm_GetcUnsafe(port);
1863 }
1864
1865 finish:;
1866 ScmObj s = Scm_DStringGet(&ds, 0);
1867 return Scm_StringToBitvector(SCM_STRING(s), FALSE);
1868 }
1869
1870 /* OBSOLETED: gauche.uvector used to call this to set up reader pointer.
1871 Now it is read in src/vector.c. We keep this entry for ABI compatibility.
1872 Remove on 1.0 release. */
Scm__InstallReadUvectorHook(ScmObj (* f)(ScmPort *,const char *,ScmReadContext *)SCM_UNUSED)1873 void Scm__InstallReadUvectorHook(ScmObj (*f)(ScmPort*, const char*, ScmReadContext *) SCM_UNUSED)
1874 {
1875 }
1876
1877
1878 /*----------------------------------------------------------------
1879 * Initialization
1880 */
1881
Scm__InitRead(void)1882 void Scm__InitRead(void)
1883 {
1884 readCtorData.table =
1885 SCM_HASH_TABLE(Scm_MakeHashTableSimple(SCM_HASH_EQ, 0));
1886 (void)SCM_INTERNAL_MUTEX_INIT(readCtorData.mutex);
1887 Scm_DefineReaderCtor(SCM_SYM_DEFINE_READER_CTOR,
1888 Scm_MakeSubr(reader_ctor, NULL, 2, 1,
1889 SCM_SYM_DEFINE_READER_CTOR),
1890 SCM_FALSE, SCM_FALSE);
1891
1892 hashBangData.table =
1893 SCM_HASH_TABLE(Scm_MakeHashTableSimple(SCM_HASH_EQ, 0));
1894 (void)SCM_INTERNAL_MUTEX_INIT(hashBangData.mutex);
1895
1896 defaultReadContext =
1897 Scm_MakePrimitiveParameter(SCM_CLASS_PRIMITIVE_PARAMETER, SCM_FALSE,
1898 SCM_OBJ(make_read_context(NULL)),
1899 0);
1900 }
1901