1 /*===========================================================================
2 * Filename : read.c
3 * About : S-Expression reader
4 *
5 * Copyright (C) 2000-2005 Shiro Kawai <shiro AT acm.org>
6 * Copyright (C) 2005 Kazuki Ohta <mover AT hct.zaq.ne.jp>
7 * Copyright (C) 2005-2006 Jun Inoue <jun.lambda AT gmail.com>
8 * Copyright (C) 2005-2006 YAMAMOTO Kengo <yamaken AT bp.iij4u.or.jp>
9 * Copyright (c) 2007-2008 SigScheme Project <uim-en AT googlegroups.com>
10 *
11 * All rights reserved.
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 *
17 * 1. Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * 2. Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in the
21 * documentation and/or other materials provided with the distribution.
22 * 3. Neither the name of authors nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS
27 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
28 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR
30 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
33 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
34 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
35 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
36 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 ===========================================================================*/
38
39 /*
40 * FIXME: Support full R6RS characters once the specification has been
41 * finalized. -- YamaKen 2007-04-03
42 *
43 * - Support new escapes in string (\<linefeed> and \<space>)
44 * - Support character category validation for identifiers
45 * - Disable #\newline on R6RS-compatible mode
46 * - Confirm symbol escape syntax (not defined in R6RS yet)
47 */
48
49 /*
50 * ChangeLog
51 *
52 * 2005-06-18 kzk Copied from read.c of Gauche 0.8.5 and modified for
53 * SigScheme.
54 * 2005-11-01
55 * ...
56 * 2006-02-03 YamaKen Add SRFI-75 support, introduce safe and low-consumptive
57 * stack management, table-based char classification, and
58 * overall rewrite.
59 * 2007-01-20 YamaKen Revise SRFI-75 support into R6RS (R5.92RS) characters.
60 *
61 */
62
63 /* TODO: replace with character class sequence expression-based tokenizer */
64
65 /*
66 * R5RS: 7.1.1 Lexical structure
67 *
68 * <token> --> <identifier> | <boolean> | <number> | <character> | <string>
69 * | ( | ) | #( | ' | ` | , | ,@ | .
70 * <delimiter> --> <whitespace> | ( | ) | " | ;
71 * <whitespace> --> <space or newline>
72 * <comment> --> ; <all subsequent characters up to a
73 * line break>
74 * <atmosphere> --> <whitespace> | <comment>
75 * <intertoken space> --> <atmosphere>*
76 *
77 * <identifier> --> <initial> <subsequent>* | <peculiar identifier>
78 * <initial> --> <letter> | <special initial>
79 * <letter> --> a | b | c | ... | z
80 *
81 * <special initial> --> ! | $ | % | & | * | / | : | < | = | > | ? | ^ | _ | ~
82 * <subsequent> --> <initial> | <digit> | <special subsequent>
83 * <digit> --> 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9
84 * <special subsequent> --> + | - | . | @
85 * <peculiar identifier> --> + | - | ...
86 * <syntactic keyword> --> <expression keyword>
87 * | else | => | define
88 * | unquote | unquote-splicing
89 * <expression keyword> --> quote | lambda | if
90 * | set! | begin | cond | and | or | case
91 * | let | let* | letrec | do | delay
92 * | quasiquote
93 *
94 * `<variable> => <'any <identifier> that isn't
95 * also a <syntactic keyword>>
96 *
97 * <boolean> --> #t | #f
98 * <character> --> #\ <any character>
99 * | #\ <character name>
100 * <character name> --> space | newline
101 *
102 * <string> --> " <string element>* "
103 * <string element> --> <any character other than " or \>
104 * | \" | \\
105 *
106 * <number> --> <num 2>| <num 8>
107 * | <num 10>| <num 16>
108 *
109 *
110 * <num R> --> <prefix R> <complex R>
111 * <complex R> --> <real R> | <real R> @ <real R>
112 * | <real R> + <ureal R> i | <real R> - <ureal R> i
113 * | <real R> + i | <real R> - i
114 * | + <ureal R> i | - <ureal R> i | + i | - i
115 * <real R> --> <sign> <ureal R>
116 * <ureal R> --> <uinteger R>
117 * | <uinteger R> / <uinteger R>
118 * | <decimal R>
119 * <decimal 10> --> <uinteger 10> <suffix>
120 * | . <digit 10>+ #* <suffix>
121 * | <digit 10>+ . <digit 10>* #* <suffix>
122 * | <digit 10>+ #+ . #* <suffix>
123 * <uinteger R> --> <digit R>+ #*
124 * <prefix R> --> <radix R> <exactness>
125 * | <exactness> <radix R>
126 *
127 * <suffix> --> <empty>
128 * | <exponent marker> <sign> <digit 10>+
129 * <exponent marker> --> e | s | f | d | l
130 * <sign> --> <empty> | + | -
131 * <exactness> --> <empty> | #i | #e
132 * <radix 2> --> #b
133 * <radix 8> --> #o
134 * <radix 10> --> <empty> | #d
135 * <radix 16> --> #x
136 * <digit 2> --> 0 | 1
137 * <digit 8> --> 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7
138 * <digit 10> --> <digit>
139 * <digit 16> --> <digit 10> | a | b | c | d | e | f
140 */
141
142 /*
143 * Although R5RS defined number literals as above, SigScheme only supports
144 * these truncated forms. See "R5RS conformance: Numbers: Literals" section of
145 * doc/spec.txt.
146 *
147 * <number> --> <num 2>| <num 8>
148 * | <num 10>| <num 16>
149 *
150 * <num R> --> <prefix R> <complex R>
151 * <complex R> --> <real R>
152 * <real R> --> <sign> <ureal R>
153 * <ureal R> --> <uinteger R>
154 * <uinteger R> --> <digit R>+ #* ;; '#' must not occur
155 * <prefix R> --> <radix R>
156 *
157 * <sign> --> <empty> | + | -
158 * <radix 2> --> #b
159 * <radix 8> --> #o
160 * <radix 10> --> <empty> | #d
161 * <radix 16> --> #x
162 * <digit 2> --> 0 | 1
163 * <digit 8> --> 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7
164 * <digit 10> --> <digit>
165 * <digit 16> --> <digit 10> | a | b | c | d | e | f
166 * <digit> --> 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9
167 */
168
169 #include <config.h>
170
171 #include <limits.h>
172 #include <stdlib.h>
173 #include <string.h>
174 #if (HAVE_STRCASECMP && HAVE_STRINGS_H)
175 #include <strings.h>
176 #endif
177
178 #include "sigscheme.h"
179 #include "sigschemeinternal.h"
180
181 /*=======================================
182 File Local Macro Definitions
183 =======================================*/
184 #define OK 0
185 #define TOKEN_BUF_EXCEEDED (-1)
186
187 /* can accept "backspace" of R5RS and "x0010FFFF" of R6RS characters */
188 #define CHAR_LITERAL_LEN_MAX (sizeof("backspace") - sizeof(""))
189
190 /* #b-010101... */
191 #define INT_LITERAL_LEN_MAX (sizeof("-0") + SCM_INT_BITS - sizeof(""))
192
193 #define DISCARD_LOOKAHEAD(port) (scm_port_get_char(port))
194
195 /* accepts SCM_ICHAR_EOF */
196 #define ICHAR_ASCII_CLASS(c) \
197 (ICHAR_ASCIIP(c) ? scm_char_class_table[c] : SCM_CH_INVALID)
198 #define ICHAR_CLASS(c) \
199 ((127 < (c)) ? SCM_CH_NONASCII \
200 : (((c) < 0) ? SCM_CH_INVALID : scm_char_class_table[c]))
201
202 /*=======================================
203 File Local Type Definitions
204 =======================================*/
205 enum ScmLexerState {
206 LEX_ST_NORMAL,
207 LEX_ST_COMMENT
208 };
209
210 enum ScmCharClass {
211 /* ASCII */
212 SCM_CH_INVALID = 0,
213 SCM_CH_CONTROL = 1 << 0, /* iscntrl(3) + backslash */
214 SCM_CH_WHITESPACE = 1 << 1, /* [ \t\n\r\v\f] */
215 SCM_CH_DIGIT = 1 << 2, /* [0-9] */
216 SCM_CH_HEX_LETTER = 1 << 3, /* [a-fA-F] */
217 SCM_CH_NONHEX_LETTER = 1 << 4, /* [g-zG-Z] */
218 SCM_CH_SPECIAL_INITIAL = 1 << 5, /* [!$%&*\/:<=>?^_~] */
219 SCM_CH_SPECIAL_SUBSEQUENT = 1 << 6, /* [-+\.@] */
220 /* currently '.' is not included in SCM_CH_TOKEN_INITIAL */
221 SCM_CH_TOKEN_INITIAL = 1 << 7, /* [()#'`,\"\|\{\}\[\]] */
222
223 SCM_CH_LETTER = SCM_CH_HEX_LETTER | SCM_CH_NONHEX_LETTER,
224 SCM_CH_HEX_DIGIT = SCM_CH_DIGIT | SCM_CH_HEX_LETTER,
225 SCM_CH_INITIAL = SCM_CH_LETTER | SCM_CH_SPECIAL_INITIAL,
226 SCM_CH_SUBSEQUENT = SCM_CH_INITIAL | SCM_CH_DIGIT,
227 SCM_CH_PECULIAR_IDENTIFIER_CAND = SCM_CH_SPECIAL_SUBSEQUENT,
228 SCM_CH_DELIMITER
229 = SCM_CH_CONTROL | SCM_CH_WHITESPACE | SCM_CH_TOKEN_INITIAL,
230
231 /* beyond ASCII */
232 SCM_CH_ASCII = 0 << 8,
233 SCM_CH_8BIT = 1 << 8,
234 SCM_CH_MULTIBYTE = 1 << 9,
235
236 SCM_CH_NONASCII = SCM_CH_8BIT | SCM_CH_MULTIBYTE
237 };
238
239 /*=======================================
240 Variable Definitions
241 =======================================*/
242 static const unsigned char scm_char_class_table[] = {
243 SCM_CH_CONTROL, /* 0 nul */
244 SCM_CH_CONTROL, /* 1 x01 */
245 SCM_CH_CONTROL, /* 2 x02 */
246 SCM_CH_CONTROL, /* 3 x03 */
247 SCM_CH_CONTROL, /* 4 x04 */
248 SCM_CH_CONTROL, /* 5 x05 */
249 SCM_CH_CONTROL, /* 6 x06 */
250 SCM_CH_CONTROL, /* 7 alarm */
251 SCM_CH_CONTROL, /* 8 backspace */
252 SCM_CH_CONTROL | SCM_CH_WHITESPACE, /* 9 tab */
253 SCM_CH_CONTROL | SCM_CH_WHITESPACE, /* 10 newline */
254 SCM_CH_CONTROL | SCM_CH_WHITESPACE, /* 11 vtab */
255 SCM_CH_CONTROL | SCM_CH_WHITESPACE, /* 12 page */
256 SCM_CH_CONTROL | SCM_CH_WHITESPACE, /* 13 return */
257 SCM_CH_CONTROL, /* 14 x0e */
258 SCM_CH_CONTROL, /* 15 x0f */
259 SCM_CH_CONTROL, /* 16 x10 */
260 SCM_CH_CONTROL, /* 17 x11 */
261 SCM_CH_CONTROL, /* 18 x12 */
262 SCM_CH_CONTROL, /* 19 x13 */
263 SCM_CH_CONTROL, /* 20 x14 */
264 SCM_CH_CONTROL, /* 21 x15 */
265 SCM_CH_CONTROL, /* 22 x16 */
266 SCM_CH_CONTROL, /* 23 x17 */
267 SCM_CH_CONTROL, /* 24 x18 */
268 SCM_CH_CONTROL, /* 25 x19 */
269 SCM_CH_CONTROL, /* 26 x1a */
270 SCM_CH_CONTROL, /* 27 esc */
271 SCM_CH_CONTROL, /* 28 x1c */
272 SCM_CH_CONTROL, /* 29 x1d */
273 SCM_CH_CONTROL, /* 30 x1e */
274 SCM_CH_CONTROL, /* 31 x1f */
275 SCM_CH_WHITESPACE, /* 32 space */
276 SCM_CH_SPECIAL_INITIAL, /* 33 ! */
277 SCM_CH_TOKEN_INITIAL, /* 34 " */
278 SCM_CH_TOKEN_INITIAL, /* 35 # */
279 SCM_CH_SPECIAL_INITIAL, /* 36 $ */
280 SCM_CH_SPECIAL_INITIAL, /* 37 % */
281 SCM_CH_SPECIAL_INITIAL, /* 38 & */
282 SCM_CH_TOKEN_INITIAL, /* 39 ' */
283 SCM_CH_TOKEN_INITIAL, /* 40 ( */
284 SCM_CH_TOKEN_INITIAL, /* 41 ) */
285 SCM_CH_SPECIAL_INITIAL, /* 42 * */
286 SCM_CH_SPECIAL_SUBSEQUENT, /* 43 + */
287 SCM_CH_TOKEN_INITIAL, /* 44 , */
288 SCM_CH_SPECIAL_SUBSEQUENT, /* 45 - */
289 SCM_CH_SPECIAL_SUBSEQUENT /* | SCM_CH_TOKEN_INITIAL */, /* 46 . */
290 SCM_CH_SPECIAL_INITIAL, /* 47 / */
291 SCM_CH_DIGIT, /* 48 0 */
292 SCM_CH_DIGIT, /* 49 1 */
293 SCM_CH_DIGIT, /* 50 2 */
294 SCM_CH_DIGIT, /* 51 3 */
295 SCM_CH_DIGIT, /* 52 4 */
296 SCM_CH_DIGIT, /* 53 5 */
297 SCM_CH_DIGIT, /* 54 6 */
298 SCM_CH_DIGIT, /* 55 7 */
299 SCM_CH_DIGIT, /* 56 8 */
300 SCM_CH_DIGIT, /* 57 9 */
301 SCM_CH_SPECIAL_INITIAL, /* 58 : */
302 SCM_CH_TOKEN_INITIAL, /* 59 ; */
303 SCM_CH_SPECIAL_INITIAL, /* 60 < */
304 SCM_CH_SPECIAL_INITIAL, /* 61 = */
305 SCM_CH_SPECIAL_INITIAL, /* 62 > */
306 SCM_CH_SPECIAL_INITIAL, /* 63 ? */
307 SCM_CH_SPECIAL_SUBSEQUENT, /* 64 @ */
308 SCM_CH_HEX_LETTER, /* 65 A */
309 SCM_CH_HEX_LETTER, /* 66 B */
310 SCM_CH_HEX_LETTER, /* 67 C */
311 SCM_CH_HEX_LETTER, /* 68 D */
312 SCM_CH_HEX_LETTER, /* 69 E */
313 SCM_CH_HEX_LETTER, /* 70 F */
314 SCM_CH_NONHEX_LETTER, /* 71 G */
315 SCM_CH_NONHEX_LETTER, /* 72 H */
316 SCM_CH_NONHEX_LETTER, /* 73 I */
317 SCM_CH_NONHEX_LETTER, /* 74 J */
318 SCM_CH_NONHEX_LETTER, /* 75 K */
319 SCM_CH_NONHEX_LETTER, /* 76 L */
320 SCM_CH_NONHEX_LETTER, /* 77 M */
321 SCM_CH_NONHEX_LETTER, /* 78 N */
322 SCM_CH_NONHEX_LETTER, /* 79 O */
323 SCM_CH_NONHEX_LETTER, /* 80 P */
324 SCM_CH_NONHEX_LETTER, /* 81 Q */
325 SCM_CH_NONHEX_LETTER, /* 82 R */
326 SCM_CH_NONHEX_LETTER, /* 83 S */
327 SCM_CH_NONHEX_LETTER, /* 84 T */
328 SCM_CH_NONHEX_LETTER, /* 85 U */
329 SCM_CH_NONHEX_LETTER, /* 86 V */
330 SCM_CH_NONHEX_LETTER, /* 87 W */
331 SCM_CH_NONHEX_LETTER, /* 88 X */
332 SCM_CH_NONHEX_LETTER, /* 89 Y */
333 SCM_CH_NONHEX_LETTER, /* 90 Z */
334 SCM_CH_TOKEN_INITIAL, /* 91 [ */
335 SCM_CH_CONTROL, /* 92 \\ */
336 SCM_CH_TOKEN_INITIAL, /* 93 ] */
337 SCM_CH_SPECIAL_INITIAL, /* 94 ^ */
338 SCM_CH_SPECIAL_INITIAL, /* 95 _ */
339 SCM_CH_TOKEN_INITIAL, /* 96 ` */
340 SCM_CH_HEX_LETTER, /* 97 a */
341 SCM_CH_HEX_LETTER, /* 98 b */
342 SCM_CH_HEX_LETTER, /* 99 c */
343 SCM_CH_HEX_LETTER, /* 100 d */
344 SCM_CH_HEX_LETTER, /* 101 e */
345 SCM_CH_HEX_LETTER, /* 102 f */
346 SCM_CH_NONHEX_LETTER, /* 103 g */
347 SCM_CH_NONHEX_LETTER, /* 104 h */
348 SCM_CH_NONHEX_LETTER, /* 105 i */
349 SCM_CH_NONHEX_LETTER, /* 106 j */
350 SCM_CH_NONHEX_LETTER, /* 107 k */
351 SCM_CH_NONHEX_LETTER, /* 108 l */
352 SCM_CH_NONHEX_LETTER, /* 109 m */
353 SCM_CH_NONHEX_LETTER, /* 110 n */
354 SCM_CH_NONHEX_LETTER, /* 111 o */
355 SCM_CH_NONHEX_LETTER, /* 112 p */
356 SCM_CH_NONHEX_LETTER, /* 113 q */
357 SCM_CH_NONHEX_LETTER, /* 114 r */
358 SCM_CH_NONHEX_LETTER, /* 115 s */
359 SCM_CH_NONHEX_LETTER, /* 116 t */
360 SCM_CH_NONHEX_LETTER, /* 117 u */
361 SCM_CH_NONHEX_LETTER, /* 118 v */
362 SCM_CH_NONHEX_LETTER, /* 119 w */
363 SCM_CH_NONHEX_LETTER, /* 120 x */
364 SCM_CH_NONHEX_LETTER, /* 121 y */
365 SCM_CH_NONHEX_LETTER, /* 122 z */
366 SCM_CH_TOKEN_INITIAL, /* 123 { */
367 SCM_CH_TOKEN_INITIAL, /* 124 | */
368 SCM_CH_TOKEN_INITIAL, /* 125 } */
369 SCM_CH_SPECIAL_INITIAL, /* 126 ~ */
370 SCM_CH_CONTROL, /* 127 delete */
371 };
372
373 /*=======================================
374 File Local Function Declarations
375 =======================================*/
376 static scm_ichar_t skip_comment_and_space(ScmObj port);
377 static size_t read_token(ScmObj port, int *err,
378 char *buf, size_t buf_size, enum ScmCharClass delim);
379
380 static ScmObj read_sexpression(ScmObj port);
381 static ScmObj read_list(ScmObj port, scm_ichar_t closing_paren);
382 #if SCM_USE_R6RS_CHARS
383 static scm_ichar_t parse_unicode_sequence(const char *seq, int len);
384 static scm_ichar_t read_unicode_sequence(ScmObj port);
385 #endif /* SCM_USE_R6RS_CHARS */
386 #if SCM_USE_CHAR
387 static ScmObj read_char(ScmObj port);
388 #endif /* SCM_USE_CHAR */
389 #if SCM_USE_STRING
390 static ScmObj read_string(ScmObj port);
391 #endif /* SCM_USE_STRING */
392 static ScmObj read_symbol(ScmObj port);
393 static ScmObj read_number_or_peculiar(ScmObj port);
394 #if SCM_USE_NUMBER
395 static ScmObj parse_number(ScmObj port,
396 char *buf, size_t buf_size, char prefix);
397 static ScmObj read_number(ScmObj port, char prefix);
398 #endif /* SCM_USE_NUMBER */
399 static ScmObj read_quoted(ScmObj port, ScmObj quoter);
400
401 /*=======================================
402 Function Definitions
403 =======================================*/
404 /*===========================================================================
405 S-Expression Parser
406 ===========================================================================*/
407 SCM_EXPORT ScmObj
scm_read(ScmObj port)408 scm_read(ScmObj port)
409 {
410 ScmObj sexp;
411 DECLARE_INTERNAL_FUNCTION("scm_read");
412
413 sexp = read_sexpression(port);
414 #if SCM_DEBUG
415 if ((scm_debug_categories() & SCM_DBG_READ) && !EOFP(sexp)) {
416 scm_write(scm_err, sexp);
417 scm_port_newline(scm_err);
418 }
419 #endif
420
421 return sexp;
422 }
423
424 static scm_ichar_t
skip_comment_and_space(ScmObj port)425 skip_comment_and_space(ScmObj port)
426 {
427 scm_ichar_t c;
428 int state;
429
430 for (state = LEX_ST_NORMAL;;) {
431 c = scm_port_peek_char(port);
432 switch (state) {
433 case LEX_ST_NORMAL:
434 if (c == ';')
435 state = LEX_ST_COMMENT;
436 else if (!ICHAR_WHITESPACEP(c) || c == SCM_ICHAR_EOF)
437 return c; /* peeked */
438 break;
439
440 case LEX_ST_COMMENT:
441 if (c == '\n' || c == '\r')
442 state = LEX_ST_NORMAL;
443 else if (c == SCM_ICHAR_EOF)
444 return c; /* peeked */
445 break;
446 }
447 scm_port_get_char(port); /* skip the char */
448 }
449 }
450
451 static size_t
read_token(ScmObj port,int * err,char * buf,size_t buf_size,enum ScmCharClass delim)452 read_token(ScmObj port, int *err,
453 char *buf, size_t buf_size, enum ScmCharClass delim)
454 {
455 #if SCM_USE_R6RS_CHARS
456 ScmCharCodec *codec;
457 #endif
458 enum ScmCharClass ch_class;
459 scm_ichar_t c;
460 size_t len;
461 char *p, *buf_last;
462 DECLARE_INTERNAL_FUNCTION("read");
463
464 buf_last = &buf[buf_size - sizeof("")];
465 for (p = buf;;) {
466 c = scm_port_peek_char(port);
467 ch_class = ICHAR_CLASS(c);
468 CDBG((SCM_DBG_PARSER, "c = ~C", c));
469
470 if (p == buf) {
471 if (c == SCM_ICHAR_EOF)
472 ERR("unexpected EOF at a token");
473 } else {
474 if (ch_class & delim || c == SCM_ICHAR_EOF) {
475 *err = OK;
476 break;
477 }
478 }
479
480 if (ch_class & SCM_CH_NONASCII) {
481 #if SCM_USE_R6RS_CHARS
482 if (buf_last <= p + SCM_MB_MAX_LEN) {
483 *err = TOKEN_BUF_EXCEEDED;
484 break;
485 }
486 codec = scm_port_codec(port);
487 if (SCM_CHARCODEC_CCS(codec) != SCM_CCS_UNICODE)
488 ERR("non-ASCII char in token on a non-Unicode port: 0x~MX",
489 (scm_int_t)c);
490 /* canonicalize internal Unicode encoding */
491 p = SCM_CHARCODEC_INT2STR(scm_identifier_codec, p, c,
492 SCM_MB_STATELESS);
493 #else
494 ERR("non-ASCII char in token: 0x~X", (int)c);
495 #endif
496 } else {
497 if (p == buf_last) {
498 *err = TOKEN_BUF_EXCEEDED;
499 break;
500 }
501 *p++ = c;
502 }
503 DISCARD_LOOKAHEAD(port);
504 }
505
506 *p = '\0';
507 len = p - buf;
508 return len;
509 }
510
511 static ScmObj
read_sexpression(ScmObj port)512 read_sexpression(ScmObj port)
513 {
514 #if SCM_USE_VECTOR
515 ScmObj ret;
516 #endif
517 enum ScmCharClass ch_class;
518 scm_ichar_t c;
519 DECLARE_INTERNAL_FUNCTION("read");
520
521 CDBG((SCM_DBG_PARSER, "read_sexpression"));
522
523 for (;;) {
524 c = skip_comment_and_space(port);
525
526 CDBG((SCM_DBG_PARSER, "read_sexpression c = ~C", c));
527
528 ch_class = ICHAR_CLASS(c);
529 if (ch_class & (SCM_CH_INITIAL | SCM_CH_NONASCII))
530 return read_symbol(port);
531
532 if (ch_class & (SCM_CH_DIGIT | SCM_CH_PECULIAR_IDENTIFIER_CAND))
533 return read_number_or_peculiar(port);
534
535 /* case labels are ordered by appearance rate and penalty cost */
536 SCM_ASSERT(ch_class == SCM_CH_TOKEN_INITIAL || c == SCM_ICHAR_EOF);
537 SCM_ASSERT(c != ';');
538 DISCARD_LOOKAHEAD(port);
539 switch (c) {
540 case '(':
541 return read_list(port, ')');
542
543 #if SCM_USE_STRING
544 case '\"':
545 return read_string(port);
546 #endif
547
548 case '\'':
549 return read_quoted(port, SYM_QUOTE);
550
551 case '#':
552 c = scm_port_get_char(port);
553 switch (c) {
554 case 't':
555 return SCM_TRUE;
556 case 'f':
557 return SCM_FALSE;
558 #if SCM_USE_VECTOR
559 case '(':
560 ret = scm_p_list2vector(read_list(port, ')'));
561 #if SCM_CONST_VECTOR_LITERAL
562 SCM_VECTOR_SET_IMMUTABLE(ret);
563 #endif
564 return ret;
565 #endif /* SCM_USE_VECTOR */
566 #if SCM_USE_CHAR
567 case '\\':
568 return read_char(port);
569 #endif
570 #if SCM_USE_NUMBER
571 /* TODO: support exactness prefixes 'i' and 'e' */
572 case 'b': case 'o': case 'd': case 'x':
573 return read_number(port, c);
574 #endif
575 case SCM_ICHAR_EOF:
576 ERR("EOF in #");
577 /* NOTREACHED */
578 default:
579 ERR("unsupported # notation: ~C", c);
580 /* NOTREACHED */
581 }
582 /* NOTREACHED */
583
584 case '`':
585 return read_quoted(port, SYM_QUASIQUOTE);
586
587 case ',':
588 c = scm_port_peek_char(port);
589 switch (c) {
590 case SCM_ICHAR_EOF:
591 ERR("EOF in unquote");
592 /* NOTREACHED */
593
594 case '@':
595 DISCARD_LOOKAHEAD(port);
596 return read_quoted(port, SYM_UNQUOTE_SPLICING);
597
598 default:
599 return read_quoted(port, SYM_UNQUOTE);
600 }
601 /* NOTREACHED */
602
603 case ')':
604 ERR("unexpected ')'");
605 /* NOTREACHED */
606
607 case SCM_ICHAR_EOF:
608 return SCM_EOF;
609
610 case '|':
611 case '[':
612 case ']':
613 case '{':
614 case '}':
615 ERR("reserved notation: ~C", c);
616 /* NOTREACHED */
617
618 default:
619 SCM_NOTREACHED;
620 }
621 }
622 }
623
624 static ScmObj
read_list(ScmObj port,scm_ichar_t closing_paren)625 read_list(ScmObj port, scm_ichar_t closing_paren)
626 {
627 ScmObj lst, elm, cdr;
628 ScmQueue q;
629 #if SCM_DEBUG
630 ScmBaseCharPort *basecport;
631 size_t start_line, cur_line;
632 #endif
633 scm_ichar_t c;
634 int err;
635 char dot_buf[sizeof("...")];
636 DECLARE_INTERNAL_FUNCTION("read");
637
638 #if SCM_DEBUG
639 CDBG((SCM_DBG_PARSER, "read_list"));
640 basecport = SCM_PORT_TRY_DYNAMIC_CAST(ScmBaseCharPort,
641 SCM_PORT_IMPL(port));
642 start_line = (basecport) ? ScmBaseCharPort_line_number(basecport) : 0;
643 #endif
644
645 for (lst = SCM_NULL, SCM_QUEUE_POINT_TO(q, lst);
646 ;
647 #if SCM_CONST_LIST_LITERAL
648 SCM_QUEUE_CONST_ADD(q, elm)
649 #else
650 SCM_QUEUE_ADD(q, elm)
651 #endif
652 )
653 {
654 c = skip_comment_and_space(port);
655
656 CDBG((SCM_DBG_PARSER, "read_list c = [~C]", c));
657
658 if (c == SCM_ICHAR_EOF) {
659 #if SCM_DEBUG
660 if (basecport && start_line) {
661 cur_line = ScmBaseCharPort_line_number(basecport);
662 ERR("EOF inside list at line ~ZU (started from line ~ZU)",
663 cur_line, start_line);
664 } else
665 #endif
666 ERR("EOF inside list");
667 } else if (c == closing_paren) {
668 DISCARD_LOOKAHEAD(port);
669 return lst;
670 } else if (c == '.') {
671 /* Since expressions that beginning with a dot are limited to '.',
672 * '...' and numbers in R5RS (See "7.1.1 Lexical structure"), the
673 * fixed size buffer can safely buffer them. */
674 read_token(port, &err, dot_buf, sizeof(dot_buf), SCM_CH_DELIMITER);
675
676 if (dot_buf[1] == '\0') {
677 #if !SCM_STRICT_R5RS
678 /* Although implicit delimiter around the dot is allowd by
679 * R5RS, some other implementation doesn't parse so
680 * (e.g. '("foo"."bar") is parsed as 3 element list which 2nd
681 * elem is dot as symbol). To avoid introducing such
682 * incompatibility problem into codes of SigScheme users,
683 * require explicit whitespace around the dot. */
684 c = scm_port_peek_char(port);
685 if (!ICHAR_WHITESPACEP(c))
686 ERR("implicit dot delimitation is disabled to avoid compatibility problem");
687 #endif
688 if (NULLP(lst))
689 ERR(".(dot) at the start of the list");
690
691 cdr = read_sexpression(port);
692 c = skip_comment_and_space(port);
693 DISCARD_LOOKAHEAD(port);
694 if (c != closing_paren)
695 ERR("bad dot syntax");
696
697 SCM_QUEUE_SLOPPY_APPEND(q, cdr);
698 return lst;
699 } else if (strcmp(dot_buf, "...") == 0) {
700 elm = SYM_ELLIPSIS;
701 } else {
702 ERR("bad dot syntax");
703 }
704 } else {
705 elm = read_sexpression(port);
706 }
707 }
708 }
709
710 #if SCM_USE_R6RS_CHARS
711 static scm_ichar_t
parse_unicode_sequence(const char * seq,int len)712 parse_unicode_sequence(const char *seq, int len)
713 {
714 scm_ichar_t c;
715 int err;
716 DECLARE_INTERNAL_FUNCTION("read");
717
718 /* reject ordinary char literal and invalid signed hexadecimal */
719 if (len < 2 || seq[0] != 'x' || !ICHAR_HEXA_NUMERICP(seq[1]))
720 return -1;
721
722 c = scm_string2number(&seq[1], 16, &err);
723 SCM_ASSERT(c >= 0);
724 if (err)
725 return -1;
726
727 /* R6RS: 3.2.6 Strings
728 * the sequence of <digit 16>s forms a hexadecimal number between 0 and
729 * #x10FFFF excluding the range [#xD800, #xDFFF] */
730 if (!ICHAR_VALID_UNICODEP(c))
731 ERR("invalid Unicode value: 0x~MX", (scm_int_t)c);
732
733 return c;
734 }
735
736 static scm_ichar_t
read_unicode_sequence(ScmObj port)737 read_unicode_sequence(ScmObj port)
738 {
739 int err;
740 scm_ichar_t c;
741 size_t len;
742 char seq[sizeof("x0010ffff")];
743 DECLARE_INTERNAL_FUNCTION("read");
744
745 seq[0] = 'x';
746 len = read_token(port, &err, &seq[1], sizeof(seq) - sizeof((char)'x'),
747 SCM_CH_DELIMITER);
748 if (err == TOKEN_BUF_EXCEEDED)
749 goto err;
750
751 c = parse_unicode_sequence(seq, len + sizeof((char)'x'));
752 if (c < 0)
753 goto err;
754
755 return c;
756
757 err:
758 ERR("invalid hex scalar value");
759 }
760 #endif /* SCM_USE_R6RS_CHARS */
761
762 #if SCM_USE_CHAR
763 static ScmObj
read_char(ScmObj port)764 read_char(ScmObj port)
765 {
766 const ScmSpecialCharInfo *info;
767 size_t len;
768 scm_ichar_t c, next;
769 #if SCM_USE_R6RS_CHARS
770 scm_ichar_t unicode;
771 #endif
772 int err;
773 char buf[CHAR_LITERAL_LEN_MAX + sizeof("")];
774 DECLARE_INTERNAL_FUNCTION("read");
775
776 /* raw char (multibyte-ready) */
777 c = scm_port_get_char(port);
778 next = scm_port_peek_char(port);
779 if (ICHAR_ASCII_CLASS(next) & SCM_CH_DELIMITER || next == SCM_ICHAR_EOF) {
780 #if !SCM_USE_R6RS_CHARS
781 if (!ICHAR_ASCIIP(c))
782 ERR("invalid character literal");
783 #endif
784 return MAKE_CHAR(c);
785 }
786
787 buf[0] = c;
788 len = read_token(port, &err, &buf[1], sizeof(buf) - 1, SCM_CH_DELIMITER);
789 if (err == TOKEN_BUF_EXCEEDED)
790 ERR("invalid character literal");
791
792 CDBG((SCM_DBG_PARSER, "read_char: ch = ~S", buf));
793
794 #if SCM_USE_R6RS_CHARS
795 unicode = parse_unicode_sequence(buf, len + sizeof((char)c));
796 if (0 <= unicode)
797 return MAKE_CHAR(unicode);
798 #endif
799 /* named chars */
800 for (info = scm_special_char_table; info->esc_seq; info++) {
801 /*
802 * R5RS: 6.3.4 Characters
803 * Case is significant in #\<character>, but not in #\<character name>.
804 */
805 if (strcasecmp(buf, info->lex_rep) == 0)
806 return MAKE_CHAR(info->code);
807 }
808 ERR("invalid character literal: #\\~S", buf);
809 }
810 #endif /* SCM_USE_CHAR */
811
812 #if SCM_USE_STRING
813 static ScmObj
read_string(ScmObj port)814 read_string(ScmObj port)
815 {
816 ScmObj obj;
817 const ScmSpecialCharInfo *info;
818 ScmCharCodec *codec;
819 scm_int_t len;
820 scm_ichar_t c;
821 char *p;
822 size_t offset;
823 ScmLBuf(char) lbuf;
824 char init_buf[SCM_INITIAL_STRING_BUF_SIZE];
825 DECLARE_INTERNAL_FUNCTION("read");
826
827 CDBG((SCM_DBG_PARSER, "read_string"));
828
829 LBUF_INIT(lbuf, init_buf, sizeof(init_buf));
830 codec = scm_port_codec(port);
831
832 for (offset = 0, p = LBUF_BUF(lbuf), len = 0;
833 ;
834 offset = p - LBUF_BUF(lbuf), len++)
835 {
836 c = scm_port_get_char(port);
837
838 CDBG((SCM_DBG_PARSER, "read_string c = ~C", c));
839
840 switch (c) {
841 case SCM_ICHAR_EOF:
842 LBUF_FREE(lbuf);
843 ERR("EOF in string");
844 /* NOTREACHED */
845
846 case '\"':
847 LBUF_EXTEND(lbuf, SCM_LBUF_F_STRING, offset + 1);
848 LBUF_BUF(lbuf)[offset] = '\0';
849 obj = MAKE_IMMUTABLE_STRING_COPYING(LBUF_BUF(lbuf), len);
850 LBUF_FREE(lbuf);
851 return obj;
852
853 case '\\':
854 c = scm_port_get_char(port);
855 #if SCM_USE_R6RS_CHARS
856 if (c == 'x') {
857 c = read_unicode_sequence(port);
858 LBUF_EXTEND(lbuf, SCM_LBUF_F_STRING,
859 offset + SCM_MB_CHAR_BUF_SIZE);
860 p = &LBUF_BUF(lbuf)[offset];
861 p = SCM_CHARCODEC_INT2STR(codec, p, c, SCM_MB_STATELESS);
862 if (!p)
863 ERR("invalid inline hex escape in string: 0x~MX",
864 (scm_int_t)c);
865 c = scm_port_get_char(port);
866 if (c != ';')
867 ERR("inline hex escape must be followed by ';'");
868 goto found;
869 } else
870 #endif
871 {
872 /* escape sequences */
873 for (info = scm_special_char_table; info->esc_seq; info++) {
874 if (strlen(info->esc_seq) == 2 && c == info->esc_seq[1]) {
875 LBUF_EXTEND(lbuf, SCM_LBUF_F_STRING, offset + 1);
876 p = &LBUF_BUF(lbuf)[offset];
877 *p++ = info->code;
878 goto found;
879 }
880 }
881 }
882 ERR("invalid escape sequence in string: \\~C", c);
883 found:
884 break;
885
886 default:
887 LBUF_EXTEND(lbuf, SCM_LBUF_F_STRING,
888 offset + SCM_MB_CHAR_BUF_SIZE);
889 p = &LBUF_BUF(lbuf)[offset];
890 #if SCM_USE_R6RS_CHARS
891 /* FIXME: support stateful encoding */
892 p = SCM_CHARCODEC_INT2STR(codec, p, c, SCM_MB_STATELESS);
893 if (!p)
894 ERR("invalid char in string: 0x~MX", (scm_int_t)c);
895 #else
896 *p++ = c;
897 #endif
898 break;
899 }
900 #if !SCM_USE_NULL_CAPABLE_STRING
901 if (c == '\0')
902 ERR(SCM_ERRMSG_NULL_IN_STRING);
903 #endif
904 }
905 #if 0
906 LBUF_END(lbuf)[-1] = '\0';
907 ERR("too long string: \"~S\"", LBUF_BUF(lbuf));
908 #endif
909 /* NOTREACHED */
910 }
911 #endif /* SCM_USE_STRING */
912
913 static ScmObj
read_symbol(ScmObj port)914 read_symbol(ScmObj port)
915 {
916 ScmObj sym;
917 size_t offset, tail_len;
918 int err;
919 ScmLBuf(char) lbuf;
920 char init_buf[SCM_INITIAL_SYMBOL_BUF_SIZE];
921
922 CDBG((SCM_DBG_PARSER, "read_symbol"));
923
924 LBUF_INIT(lbuf, init_buf, sizeof(init_buf));
925
926 for (offset = 0;;) {
927 tail_len = read_token(port, &err,
928 &LBUF_BUF(lbuf)[offset],
929 LBUF_SIZE(lbuf) - offset,
930 SCM_CH_DELIMITER);
931 if (err != TOKEN_BUF_EXCEEDED)
932 break;
933 offset += tail_len;
934 LBUF_EXTEND(lbuf, SCM_LBUF_F_SYMBOL,
935 LBUF_SIZE(lbuf) + SCM_MB_CHAR_BUF_SIZE);
936 }
937
938 sym = scm_intern(LBUF_BUF(lbuf));
939 LBUF_FREE(lbuf);
940
941 return sym;
942 }
943
944 static ScmObj
read_number_or_peculiar(ScmObj port)945 read_number_or_peculiar(ScmObj port)
946 {
947 scm_ichar_t c;
948 int err;
949 char buf[INT_LITERAL_LEN_MAX + sizeof("")];
950 DECLARE_INTERNAL_FUNCTION("read");
951
952 CDBG((SCM_DBG_PARSER, "read"));
953
954 c = scm_port_peek_char(port);
955 SCM_ASSERT(ICHAR_ASCII_CLASS(c)
956 & (SCM_CH_DIGIT | SCM_CH_PECULIAR_IDENTIFIER_CAND));
957
958 #if SCM_USE_NUMBER
959 if (ICHAR_NUMERICP(c))
960 return read_number(port, 'd');
961
962 if (c == '+' || c == '-') {
963 read_token(port, &err, buf, sizeof(buf), SCM_CH_DELIMITER);
964 if (err == TOKEN_BUF_EXCEEDED)
965 ERR("invalid number literal");
966
967 /* '+' or '-' */
968 if (!buf[1])
969 return scm_intern(buf);
970
971 return parse_number(port, buf, sizeof(buf), 'd');
972 }
973 #endif /* SCM_USE_NUMBER */
974
975 if (c == '.') {
976 read_token(port, &err, buf, sizeof(buf), SCM_CH_DELIMITER);
977 if (strcmp(buf, "...") == 0)
978 return SYM_ELLIPSIS;
979 /* TODO: support numeric expressions when the numeric tower is
980 implemented */
981 ERR("invalid identifier: ~S", buf);
982 }
983
984 if (c == '@')
985 ERR("invalid identifier starting with @");
986
987 return read_symbol(port);
988 }
989
990 #if SCM_USE_NUMBER
991 /* reads 'b123' part of #b123 */
992 static ScmObj
parse_number(ScmObj port,char * buf,size_t buf_size,char prefix)993 parse_number(ScmObj port, char *buf, size_t buf_size, char prefix)
994 {
995 scm_int_t number;
996 int radix;
997 scm_bool err;
998 DECLARE_INTERNAL_FUNCTION("read");
999
1000 switch (prefix) {
1001 case 'b': radix = 2; break;
1002 case 'o': radix = 8; break;
1003 case 'd': radix = 10; break;
1004 case 'x': radix = 16; break;
1005 default:
1006 goto err;
1007 }
1008
1009 number = scm_string2number(buf, radix, &err);
1010 if (!err)
1011 return MAKE_INT(number);
1012
1013 err:
1014 ERR("ill-formatted number: #~C~S", (scm_ichar_t)prefix, buf);
1015 }
1016
1017 static ScmObj
read_number(ScmObj port,char prefix)1018 read_number(ScmObj port, char prefix)
1019 {
1020 int err;
1021 char buf[INT_LITERAL_LEN_MAX + sizeof("")];
1022 DECLARE_INTERNAL_FUNCTION("read");
1023
1024 read_token(port, &err, buf, sizeof(buf), SCM_CH_DELIMITER);
1025 if (err == TOKEN_BUF_EXCEEDED)
1026 ERR("invalid number literal");
1027
1028 return parse_number(port, buf, sizeof(buf), prefix);
1029 }
1030 #endif /* SCM_USE_NUMBER */
1031
1032 static ScmObj
read_quoted(ScmObj port,ScmObj quoter)1033 read_quoted(ScmObj port, ScmObj quoter)
1034 {
1035 ScmObj obj;
1036 DECLARE_INTERNAL_FUNCTION("read");
1037
1038 obj = read_sexpression(port);
1039 if (EOFP(obj))
1040 ERR("EOF in ~a", quoter);
1041
1042 return SCM_LIST_2(quoter, obj);
1043 }
1044
1045 /*===========================================================================
1046 R5RS : 6.6 Input and Output : 6.6.2 Input
1047 ===========================================================================*/
1048 SCM_EXPORT ScmObj
scm_p_read(ScmObj args)1049 scm_p_read(ScmObj args)
1050 {
1051 ScmObj port;
1052 DECLARE_FUNCTION("read", procedure_variadic_0);
1053
1054 port = scm_prepare_port(args, scm_in);
1055 return scm_read(port);
1056 }
1057