1 /*===========================================================================
2  *  Filename : read.c
3  *  About    : S-Expression reader
4  *
5  *  Copyright (C) 2000-2005 Shiro Kawai <shiro AT acm.org>
6  *  Copyright (C) 2005      Kazuki Ohta <mover AT hct.zaq.ne.jp>
7  *  Copyright (C) 2005-2006 Jun Inoue <jun.lambda AT gmail.com>
8  *  Copyright (C) 2005-2006 YAMAMOTO Kengo <yamaken AT bp.iij4u.or.jp>
9  *  Copyright (c) 2007-2008 SigScheme Project <uim-en AT googlegroups.com>
10  *
11  *  All rights reserved.
12  *
13  *  Redistribution and use in source and binary forms, with or without
14  *  modification, are permitted provided that the following conditions
15  *  are met:
16  *
17  *  1. Redistributions of source code must retain the above copyright
18  *     notice, this list of conditions and the following disclaimer.
19  *  2. Redistributions in binary form must reproduce the above copyright
20  *     notice, this list of conditions and the following disclaimer in the
21  *     documentation and/or other materials provided with the distribution.
22  *  3. Neither the name of authors nor the names of its contributors
23  *     may be used to endorse or promote products derived from this software
24  *     without specific prior written permission.
25  *
26  *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS
27  *  IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
28  *  THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29  *  PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR
30  *  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31  *  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32  *  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
33  *  OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
34  *  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
35  *  OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
36  *  ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 ===========================================================================*/
38 
39 /*
40  * FIXME: Support full R6RS characters once the specification has been
41  * finalized.  -- YamaKen 2007-04-03
42  *
43  * - Support new escapes in string (\<linefeed> and \<space>)
44  * - Support character category validation for identifiers
45  * - Disable #\newline on R6RS-compatible mode
46  * - Confirm symbol escape syntax (not defined in R6RS yet)
47  */
48 
49 /*
50  * ChangeLog
51  *
52  * 2005-06-18 kzk      Copied from read.c of Gauche 0.8.5 and modified for
53  *                     SigScheme.
54  * 2005-11-01
55  *    ...
56  * 2006-02-03 YamaKen  Add SRFI-75 support, introduce safe and low-consumptive
57  *                     stack management, table-based char classification, and
58  *                     overall rewrite.
59  * 2007-01-20 YamaKen  Revise SRFI-75 support into R6RS (R5.92RS) characters.
60  *
61  */
62 
63 /* TODO: replace with character class sequence expression-based tokenizer */
64 
65 /*
66  * R5RS: 7.1.1 Lexical structure
67  *
68  * <token> --> <identifier> | <boolean> | <number> | <character> | <string>
69  *      | ( | ) | #( | ' | ` | , | ,@ | .
70  * <delimiter> --> <whitespace> | ( | ) | " | ;
71  * <whitespace> --> <space or newline>
72  * <comment> --> ;  <all subsequent characters up to a
73  *                  line break>
74  * <atmosphere> --> <whitespace> | <comment>
75  * <intertoken space> --> <atmosphere>*
76  *
77  * <identifier> --> <initial> <subsequent>* | <peculiar identifier>
78  * <initial> --> <letter> | <special initial>
79  * <letter> --> a | b | c | ... | z
80  *
81  * <special initial> --> ! | $ | % | & | * | / | : | < | = | > | ? | ^ | _ | ~
82  * <subsequent> --> <initial> | <digit> | <special subsequent>
83  * <digit> --> 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9
84  * <special subsequent> --> + | - | . | @
85  * <peculiar identifier> --> + | - | ...
86  * <syntactic keyword> --> <expression keyword>
87  *      | else | => | define
88  *      | unquote | unquote-splicing
89  * <expression keyword> --> quote | lambda | if
90  *      | set! | begin | cond | and | or | case
91  *      | let | let* | letrec | do | delay
92  *      | quasiquote
93  *
94  * `<variable> => <'any <identifier> that isn't
95  *                 also a <syntactic keyword>>
96  *
97  * <boolean> --> #t | #f
98  * <character> --> #\ <any character>
99  *      | #\ <character name>
100  * <character name> --> space | newline
101  *
102  * <string> --> " <string element>* "
103  * <string element> --> <any character other than " or \>
104  *      | \" | \\
105  *
106  * <number> --> <num 2>| <num 8>
107  *      | <num 10>| <num 16>
108  *
109  *
110  * <num R> --> <prefix R> <complex R>
111  * <complex R> --> <real R> | <real R> @ <real R>
112  *     | <real R> + <ureal R> i | <real R> - <ureal R> i
113  *     | <real R> + i | <real R> - i
114  *     | + <ureal R> i | - <ureal R> i | + i | - i
115  * <real R> --> <sign> <ureal R>
116  * <ureal R> --> <uinteger R>
117  *     | <uinteger R> / <uinteger R>
118  *     | <decimal R>
119  * <decimal 10> --> <uinteger 10> <suffix>
120  *     | . <digit 10>+ #* <suffix>
121  *     | <digit 10>+ . <digit 10>* #* <suffix>
122  *     | <digit 10>+ #+ . #* <suffix>
123  * <uinteger R> --> <digit R>+ #*
124  * <prefix R> --> <radix R> <exactness>
125  *     | <exactness> <radix R>
126  *
127  * <suffix> --> <empty>
128  *     | <exponent marker> <sign> <digit 10>+
129  * <exponent marker> --> e | s | f | d | l
130  * <sign> --> <empty>  | + |  -
131  * <exactness> --> <empty> | #i | #e
132  * <radix 2> --> #b
133  * <radix 8> --> #o
134  * <radix 10> --> <empty> | #d
135  * <radix 16> --> #x
136  * <digit 2> --> 0 | 1
137  * <digit 8> --> 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7
138  * <digit 10> --> <digit>
139  * <digit 16> --> <digit 10> | a | b | c | d | e | f
140  */
141 
142 /*
143  * Although R5RS defined number literals as above, SigScheme only supports
144  * these truncated forms. See "R5RS conformance: Numbers: Literals" section of
145  * doc/spec.txt.
146  *
147  * <number> --> <num 2>| <num 8>
148  *      | <num 10>| <num 16>
149  *
150  * <num R> --> <prefix R> <complex R>
151  * <complex R> --> <real R>
152  * <real R> --> <sign> <ureal R>
153  * <ureal R> --> <uinteger R>
154  * <uinteger R> --> <digit R>+ #*   ;; '#' must not occur
155  * <prefix R> --> <radix R>
156  *
157  * <sign> --> <empty>  | + |  -
158  * <radix 2> --> #b
159  * <radix 8> --> #o
160  * <radix 10> --> <empty> | #d
161  * <radix 16> --> #x
162  * <digit 2> --> 0 | 1
163  * <digit 8> --> 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7
164  * <digit 10> --> <digit>
165  * <digit 16> --> <digit 10> | a | b | c | d | e | f
166  * <digit> --> 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9
167  */
168 
169 #include <config.h>
170 
171 #include <limits.h>
172 #include <stdlib.h>
173 #include <string.h>
174 #if (HAVE_STRCASECMP && HAVE_STRINGS_H)
175 #include <strings.h>
176 #endif
177 
178 #include "sigscheme.h"
179 #include "sigschemeinternal.h"
180 
181 /*=======================================
182   File Local Macro Definitions
183 =======================================*/
184 #define OK 0
185 #define TOKEN_BUF_EXCEEDED (-1)
186 
187 /* can accept "backspace" of R5RS and "x0010FFFF" of R6RS characters */
188 #define CHAR_LITERAL_LEN_MAX (sizeof("backspace") - sizeof(""))
189 
190 /* #b-010101... */
191 #define INT_LITERAL_LEN_MAX (sizeof("-0") + SCM_INT_BITS - sizeof(""))
192 
193 #define DISCARD_LOOKAHEAD(port) (scm_port_get_char(port))
194 
195 /* accepts SCM_ICHAR_EOF */
196 #define ICHAR_ASCII_CLASS(c)                                                 \
197     (ICHAR_ASCIIP(c) ? scm_char_class_table[c] : SCM_CH_INVALID)
198 #define ICHAR_CLASS(c)                                                       \
199     ((127 < (c)) ? SCM_CH_NONASCII                                           \
200                  : (((c) < 0) ? SCM_CH_INVALID : scm_char_class_table[c]))
201 
202 /*=======================================
203   File Local Type Definitions
204 =======================================*/
205 enum ScmLexerState {
206     LEX_ST_NORMAL,
207     LEX_ST_COMMENT
208 };
209 
210 enum ScmCharClass {
211     /* ASCII */
212     SCM_CH_INVALID            = 0,
213     SCM_CH_CONTROL            = 1 << 0, /* iscntrl(3) + backslash */
214     SCM_CH_WHITESPACE         = 1 << 1, /* [ \t\n\r\v\f] */
215     SCM_CH_DIGIT              = 1 << 2, /* [0-9] */
216     SCM_CH_HEX_LETTER         = 1 << 3, /* [a-fA-F] */
217     SCM_CH_NONHEX_LETTER      = 1 << 4, /* [g-zG-Z] */
218     SCM_CH_SPECIAL_INITIAL    = 1 << 5, /* [!$%&*\/:<=>?^_~] */
219     SCM_CH_SPECIAL_SUBSEQUENT = 1 << 6, /* [-+\.@] */
220     /* currently '.' is not included in SCM_CH_TOKEN_INITIAL */
221     SCM_CH_TOKEN_INITIAL      = 1 << 7, /* [()#'`,\"\|\{\}\[\]] */
222 
223     SCM_CH_LETTER     = SCM_CH_HEX_LETTER | SCM_CH_NONHEX_LETTER,
224     SCM_CH_HEX_DIGIT  = SCM_CH_DIGIT | SCM_CH_HEX_LETTER,
225     SCM_CH_INITIAL    = SCM_CH_LETTER | SCM_CH_SPECIAL_INITIAL,
226     SCM_CH_SUBSEQUENT = SCM_CH_INITIAL | SCM_CH_DIGIT,
227     SCM_CH_PECULIAR_IDENTIFIER_CAND = SCM_CH_SPECIAL_SUBSEQUENT,
228     SCM_CH_DELIMITER
229         = SCM_CH_CONTROL | SCM_CH_WHITESPACE | SCM_CH_TOKEN_INITIAL,
230 
231     /* beyond ASCII */
232     SCM_CH_ASCII              = 0 << 8,
233     SCM_CH_8BIT               = 1 << 8,
234     SCM_CH_MULTIBYTE          = 1 << 9,
235 
236     SCM_CH_NONASCII           = SCM_CH_8BIT | SCM_CH_MULTIBYTE
237 };
238 
239 /*=======================================
240   Variable Definitions
241 =======================================*/
242 static const unsigned char scm_char_class_table[] = {
243     SCM_CH_CONTROL,            /*   0  nul       */
244     SCM_CH_CONTROL,            /*   1  x01       */
245     SCM_CH_CONTROL,            /*   2  x02       */
246     SCM_CH_CONTROL,            /*   3  x03       */
247     SCM_CH_CONTROL,            /*   4  x04       */
248     SCM_CH_CONTROL,            /*   5  x05       */
249     SCM_CH_CONTROL,            /*   6  x06       */
250     SCM_CH_CONTROL,            /*   7  alarm     */
251     SCM_CH_CONTROL,            /*   8  backspace */
252     SCM_CH_CONTROL | SCM_CH_WHITESPACE, /*   9  tab       */
253     SCM_CH_CONTROL | SCM_CH_WHITESPACE, /*  10  newline   */
254     SCM_CH_CONTROL | SCM_CH_WHITESPACE, /*  11  vtab      */
255     SCM_CH_CONTROL | SCM_CH_WHITESPACE, /*  12  page      */
256     SCM_CH_CONTROL | SCM_CH_WHITESPACE, /*  13  return    */
257     SCM_CH_CONTROL,            /*  14  x0e       */
258     SCM_CH_CONTROL,            /*  15  x0f       */
259     SCM_CH_CONTROL,            /*  16  x10       */
260     SCM_CH_CONTROL,            /*  17  x11       */
261     SCM_CH_CONTROL,            /*  18  x12       */
262     SCM_CH_CONTROL,            /*  19  x13       */
263     SCM_CH_CONTROL,            /*  20  x14       */
264     SCM_CH_CONTROL,            /*  21  x15       */
265     SCM_CH_CONTROL,            /*  22  x16       */
266     SCM_CH_CONTROL,            /*  23  x17       */
267     SCM_CH_CONTROL,            /*  24  x18       */
268     SCM_CH_CONTROL,            /*  25  x19       */
269     SCM_CH_CONTROL,            /*  26  x1a       */
270     SCM_CH_CONTROL,            /*  27  esc       */
271     SCM_CH_CONTROL,            /*  28  x1c       */
272     SCM_CH_CONTROL,            /*  29  x1d       */
273     SCM_CH_CONTROL,            /*  30  x1e       */
274     SCM_CH_CONTROL,            /*  31  x1f       */
275     SCM_CH_WHITESPACE,         /*  32  space     */
276     SCM_CH_SPECIAL_INITIAL,    /*  33  !         */
277     SCM_CH_TOKEN_INITIAL,      /*  34  "         */
278     SCM_CH_TOKEN_INITIAL,      /*  35  #         */
279     SCM_CH_SPECIAL_INITIAL,    /*  36  $         */
280     SCM_CH_SPECIAL_INITIAL,    /*  37  %         */
281     SCM_CH_SPECIAL_INITIAL,    /*  38  &         */
282     SCM_CH_TOKEN_INITIAL,      /*  39  '         */
283     SCM_CH_TOKEN_INITIAL,      /*  40  (         */
284     SCM_CH_TOKEN_INITIAL,      /*  41  )         */
285     SCM_CH_SPECIAL_INITIAL,    /*  42  *         */
286     SCM_CH_SPECIAL_SUBSEQUENT, /*  43  +         */
287     SCM_CH_TOKEN_INITIAL,      /*  44  ,         */
288     SCM_CH_SPECIAL_SUBSEQUENT, /*  45  -         */
289     SCM_CH_SPECIAL_SUBSEQUENT /* | SCM_CH_TOKEN_INITIAL */, /*  46  .        */
290     SCM_CH_SPECIAL_INITIAL,    /*  47  /         */
291     SCM_CH_DIGIT,              /*  48  0         */
292     SCM_CH_DIGIT,              /*  49  1         */
293     SCM_CH_DIGIT,              /*  50  2         */
294     SCM_CH_DIGIT,              /*  51  3         */
295     SCM_CH_DIGIT,              /*  52  4         */
296     SCM_CH_DIGIT,              /*  53  5         */
297     SCM_CH_DIGIT,              /*  54  6         */
298     SCM_CH_DIGIT,              /*  55  7         */
299     SCM_CH_DIGIT,              /*  56  8         */
300     SCM_CH_DIGIT,              /*  57  9         */
301     SCM_CH_SPECIAL_INITIAL,    /*  58  :         */
302     SCM_CH_TOKEN_INITIAL,      /*  59  ;         */
303     SCM_CH_SPECIAL_INITIAL,    /*  60  <         */
304     SCM_CH_SPECIAL_INITIAL,    /*  61  =         */
305     SCM_CH_SPECIAL_INITIAL,    /*  62  >         */
306     SCM_CH_SPECIAL_INITIAL,    /*  63  ?         */
307     SCM_CH_SPECIAL_SUBSEQUENT, /*  64  @         */
308     SCM_CH_HEX_LETTER,         /*  65  A         */
309     SCM_CH_HEX_LETTER,         /*  66  B         */
310     SCM_CH_HEX_LETTER,         /*  67  C         */
311     SCM_CH_HEX_LETTER,         /*  68  D         */
312     SCM_CH_HEX_LETTER,         /*  69  E         */
313     SCM_CH_HEX_LETTER,         /*  70  F         */
314     SCM_CH_NONHEX_LETTER,      /*  71  G         */
315     SCM_CH_NONHEX_LETTER,      /*  72  H         */
316     SCM_CH_NONHEX_LETTER,      /*  73  I         */
317     SCM_CH_NONHEX_LETTER,      /*  74  J         */
318     SCM_CH_NONHEX_LETTER,      /*  75  K         */
319     SCM_CH_NONHEX_LETTER,      /*  76  L         */
320     SCM_CH_NONHEX_LETTER,      /*  77  M         */
321     SCM_CH_NONHEX_LETTER,      /*  78  N         */
322     SCM_CH_NONHEX_LETTER,      /*  79  O         */
323     SCM_CH_NONHEX_LETTER,      /*  80  P         */
324     SCM_CH_NONHEX_LETTER,      /*  81  Q         */
325     SCM_CH_NONHEX_LETTER,      /*  82  R         */
326     SCM_CH_NONHEX_LETTER,      /*  83  S         */
327     SCM_CH_NONHEX_LETTER,      /*  84  T         */
328     SCM_CH_NONHEX_LETTER,      /*  85  U         */
329     SCM_CH_NONHEX_LETTER,      /*  86  V         */
330     SCM_CH_NONHEX_LETTER,      /*  87  W         */
331     SCM_CH_NONHEX_LETTER,      /*  88  X         */
332     SCM_CH_NONHEX_LETTER,      /*  89  Y         */
333     SCM_CH_NONHEX_LETTER,      /*  90  Z         */
334     SCM_CH_TOKEN_INITIAL,      /*  91  [         */
335     SCM_CH_CONTROL,            /*  92  \\        */
336     SCM_CH_TOKEN_INITIAL,      /*  93  ]         */
337     SCM_CH_SPECIAL_INITIAL,    /*  94  ^         */
338     SCM_CH_SPECIAL_INITIAL,    /*  95  _         */
339     SCM_CH_TOKEN_INITIAL,      /*  96  `         */
340     SCM_CH_HEX_LETTER,         /*  97  a         */
341     SCM_CH_HEX_LETTER,         /*  98  b         */
342     SCM_CH_HEX_LETTER,         /*  99  c         */
343     SCM_CH_HEX_LETTER,         /* 100  d         */
344     SCM_CH_HEX_LETTER,         /* 101  e         */
345     SCM_CH_HEX_LETTER,         /* 102  f         */
346     SCM_CH_NONHEX_LETTER,      /* 103  g         */
347     SCM_CH_NONHEX_LETTER,      /* 104  h         */
348     SCM_CH_NONHEX_LETTER,      /* 105  i         */
349     SCM_CH_NONHEX_LETTER,      /* 106  j         */
350     SCM_CH_NONHEX_LETTER,      /* 107  k         */
351     SCM_CH_NONHEX_LETTER,      /* 108  l         */
352     SCM_CH_NONHEX_LETTER,      /* 109  m         */
353     SCM_CH_NONHEX_LETTER,      /* 110  n         */
354     SCM_CH_NONHEX_LETTER,      /* 111  o         */
355     SCM_CH_NONHEX_LETTER,      /* 112  p         */
356     SCM_CH_NONHEX_LETTER,      /* 113  q         */
357     SCM_CH_NONHEX_LETTER,      /* 114  r         */
358     SCM_CH_NONHEX_LETTER,      /* 115  s         */
359     SCM_CH_NONHEX_LETTER,      /* 116  t         */
360     SCM_CH_NONHEX_LETTER,      /* 117  u         */
361     SCM_CH_NONHEX_LETTER,      /* 118  v         */
362     SCM_CH_NONHEX_LETTER,      /* 119  w         */
363     SCM_CH_NONHEX_LETTER,      /* 120  x         */
364     SCM_CH_NONHEX_LETTER,      /* 121  y         */
365     SCM_CH_NONHEX_LETTER,      /* 122  z         */
366     SCM_CH_TOKEN_INITIAL,      /* 123  {         */
367     SCM_CH_TOKEN_INITIAL,      /* 124  |         */
368     SCM_CH_TOKEN_INITIAL,      /* 125  }         */
369     SCM_CH_SPECIAL_INITIAL,    /* 126  ~         */
370     SCM_CH_CONTROL,            /* 127  delete    */
371 };
372 
373 /*=======================================
374   File Local Function Declarations
375 =======================================*/
376 static scm_ichar_t skip_comment_and_space(ScmObj port);
377 static size_t read_token(ScmObj port, int *err,
378                          char *buf, size_t buf_size, enum ScmCharClass delim);
379 
380 static ScmObj read_sexpression(ScmObj port);
381 static ScmObj read_list(ScmObj port, scm_ichar_t closing_paren);
382 #if SCM_USE_R6RS_CHARS
383 static scm_ichar_t parse_unicode_sequence(const char *seq, int len);
384 static scm_ichar_t read_unicode_sequence(ScmObj port);
385 #endif /* SCM_USE_R6RS_CHARS */
386 #if SCM_USE_CHAR
387 static ScmObj read_char(ScmObj port);
388 #endif /* SCM_USE_CHAR */
389 #if SCM_USE_STRING
390 static ScmObj read_string(ScmObj port);
391 #endif /* SCM_USE_STRING */
392 static ScmObj read_symbol(ScmObj port);
393 static ScmObj read_number_or_peculiar(ScmObj port);
394 #if SCM_USE_NUMBER
395 static ScmObj parse_number(ScmObj port,
396                            char *buf, size_t buf_size, char prefix);
397 static ScmObj read_number(ScmObj port, char prefix);
398 #endif /* SCM_USE_NUMBER */
399 static ScmObj read_quoted(ScmObj port, ScmObj quoter);
400 
401 /*=======================================
402   Function Definitions
403 =======================================*/
404 /*===========================================================================
405   S-Expression Parser
406 ===========================================================================*/
407 SCM_EXPORT ScmObj
scm_read(ScmObj port)408 scm_read(ScmObj port)
409 {
410     ScmObj sexp;
411     DECLARE_INTERNAL_FUNCTION("scm_read");
412 
413     sexp = read_sexpression(port);
414 #if SCM_DEBUG
415     if ((scm_debug_categories() & SCM_DBG_READ) && !EOFP(sexp)) {
416         scm_write(scm_err, sexp);
417         scm_port_newline(scm_err);
418     }
419 #endif
420 
421     return sexp;
422 }
423 
424 static scm_ichar_t
skip_comment_and_space(ScmObj port)425 skip_comment_and_space(ScmObj port)
426 {
427     scm_ichar_t c;
428     int state;
429 
430     for (state = LEX_ST_NORMAL;;) {
431         c = scm_port_peek_char(port);
432         switch (state) {
433         case LEX_ST_NORMAL:
434             if (c == ';')
435                 state = LEX_ST_COMMENT;
436             else if (!ICHAR_WHITESPACEP(c) || c == SCM_ICHAR_EOF)
437                 return c;  /* peeked */
438             break;
439 
440         case LEX_ST_COMMENT:
441             if (c == '\n' || c == '\r')
442                 state = LEX_ST_NORMAL;
443             else if (c == SCM_ICHAR_EOF)
444                 return c;  /* peeked */
445             break;
446         }
447         scm_port_get_char(port);  /* skip the char */
448     }
449 }
450 
451 static size_t
read_token(ScmObj port,int * err,char * buf,size_t buf_size,enum ScmCharClass delim)452 read_token(ScmObj port, int *err,
453            char *buf, size_t buf_size, enum ScmCharClass delim)
454 {
455 #if SCM_USE_R6RS_CHARS
456     ScmCharCodec *codec;
457 #endif
458     enum ScmCharClass ch_class;
459     scm_ichar_t c;
460     size_t len;
461     char *p, *buf_last;
462     DECLARE_INTERNAL_FUNCTION("read");
463 
464     buf_last = &buf[buf_size - sizeof("")];
465     for (p = buf;;) {
466         c = scm_port_peek_char(port);
467         ch_class = ICHAR_CLASS(c);
468         CDBG((SCM_DBG_PARSER, "c = ~C", c));
469 
470         if (p == buf) {
471             if (c == SCM_ICHAR_EOF)
472                 ERR("unexpected EOF at a token");
473         } else {
474             if (ch_class & delim || c == SCM_ICHAR_EOF) {
475                 *err = OK;
476                 break;
477             }
478         }
479 
480         if (ch_class & SCM_CH_NONASCII) {
481 #if SCM_USE_R6RS_CHARS
482             if (buf_last <= p + SCM_MB_MAX_LEN) {
483                 *err = TOKEN_BUF_EXCEEDED;
484                 break;
485             }
486             codec = scm_port_codec(port);
487             if (SCM_CHARCODEC_CCS(codec) != SCM_CCS_UNICODE)
488                 ERR("non-ASCII char in token on a non-Unicode port: 0x~MX",
489                     (scm_int_t)c);
490             /* canonicalize internal Unicode encoding */
491             p = SCM_CHARCODEC_INT2STR(scm_identifier_codec, p, c,
492                                       SCM_MB_STATELESS);
493 #else
494             ERR("non-ASCII char in token: 0x~X", (int)c);
495 #endif
496         } else {
497             if (p == buf_last) {
498                 *err = TOKEN_BUF_EXCEEDED;
499                 break;
500             }
501             *p++ = c;
502         }
503         DISCARD_LOOKAHEAD(port);
504     }
505 
506     *p = '\0';
507     len = p - buf;
508     return len;
509 }
510 
511 static ScmObj
read_sexpression(ScmObj port)512 read_sexpression(ScmObj port)
513 {
514 #if SCM_USE_VECTOR
515     ScmObj ret;
516 #endif
517     enum ScmCharClass ch_class;
518     scm_ichar_t c;
519     DECLARE_INTERNAL_FUNCTION("read");
520 
521     CDBG((SCM_DBG_PARSER, "read_sexpression"));
522 
523     for (;;) {
524         c = skip_comment_and_space(port);
525 
526         CDBG((SCM_DBG_PARSER, "read_sexpression c = ~C", c));
527 
528         ch_class = ICHAR_CLASS(c);
529         if (ch_class & (SCM_CH_INITIAL | SCM_CH_NONASCII))
530             return read_symbol(port);
531 
532         if (ch_class & (SCM_CH_DIGIT | SCM_CH_PECULIAR_IDENTIFIER_CAND))
533             return read_number_or_peculiar(port);
534 
535         /* case labels are ordered by appearance rate and penalty cost */
536         SCM_ASSERT(ch_class == SCM_CH_TOKEN_INITIAL || c == SCM_ICHAR_EOF);
537         SCM_ASSERT(c != ';');
538         DISCARD_LOOKAHEAD(port);
539         switch (c) {
540         case '(':
541             return read_list(port, ')');
542 
543 #if SCM_USE_STRING
544         case '\"':
545             return read_string(port);
546 #endif
547 
548         case '\'':
549             return read_quoted(port, SYM_QUOTE);
550 
551         case '#':
552             c = scm_port_get_char(port);
553             switch (c) {
554             case 't':
555                 return SCM_TRUE;
556             case 'f':
557                 return SCM_FALSE;
558 #if SCM_USE_VECTOR
559             case '(':
560                 ret = scm_p_list2vector(read_list(port, ')'));
561 #if SCM_CONST_VECTOR_LITERAL
562                 SCM_VECTOR_SET_IMMUTABLE(ret);
563 #endif
564                 return ret;
565 #endif /* SCM_USE_VECTOR */
566 #if SCM_USE_CHAR
567             case '\\':
568                 return read_char(port);
569 #endif
570 #if SCM_USE_NUMBER
571             /* TODO: support exactness prefixes 'i' and 'e' */
572             case 'b': case 'o': case 'd': case 'x':
573                 return read_number(port, c);
574 #endif
575             case SCM_ICHAR_EOF:
576                 ERR("EOF in #");
577                 /* NOTREACHED */
578             default:
579                 ERR("unsupported # notation: ~C", c);
580                 /* NOTREACHED */
581             }
582             /* NOTREACHED */
583 
584         case '`':
585             return read_quoted(port, SYM_QUASIQUOTE);
586 
587         case ',':
588             c = scm_port_peek_char(port);
589             switch (c) {
590             case SCM_ICHAR_EOF:
591                 ERR("EOF in unquote");
592                 /* NOTREACHED */
593 
594             case '@':
595                 DISCARD_LOOKAHEAD(port);
596                 return read_quoted(port, SYM_UNQUOTE_SPLICING);
597 
598             default:
599                 return read_quoted(port, SYM_UNQUOTE);
600             }
601             /* NOTREACHED */
602 
603         case ')':
604             ERR("unexpected ')'");
605             /* NOTREACHED */
606 
607         case SCM_ICHAR_EOF:
608             return SCM_EOF;
609 
610         case '|':
611         case '[':
612         case ']':
613         case '{':
614         case '}':
615             ERR("reserved notation: ~C", c);
616             /* NOTREACHED */
617 
618         default:
619             SCM_NOTREACHED;
620         }
621     }
622 }
623 
624 static ScmObj
read_list(ScmObj port,scm_ichar_t closing_paren)625 read_list(ScmObj port, scm_ichar_t closing_paren)
626 {
627     ScmObj lst, elm, cdr;
628     ScmQueue q;
629 #if SCM_DEBUG
630     ScmBaseCharPort *basecport;
631     size_t start_line, cur_line;
632 #endif
633     scm_ichar_t c;
634     int err;
635     char dot_buf[sizeof("...")];
636     DECLARE_INTERNAL_FUNCTION("read");
637 
638 #if SCM_DEBUG
639     CDBG((SCM_DBG_PARSER, "read_list"));
640     basecport = SCM_PORT_TRY_DYNAMIC_CAST(ScmBaseCharPort,
641                                           SCM_PORT_IMPL(port));
642     start_line = (basecport) ? ScmBaseCharPort_line_number(basecport) : 0;
643 #endif
644 
645     for (lst = SCM_NULL, SCM_QUEUE_POINT_TO(q, lst);
646          ;
647 #if SCM_CONST_LIST_LITERAL
648          SCM_QUEUE_CONST_ADD(q, elm)
649 #else
650          SCM_QUEUE_ADD(q, elm)
651 #endif
652          )
653     {
654         c = skip_comment_and_space(port);
655 
656         CDBG((SCM_DBG_PARSER, "read_list c = [~C]", c));
657 
658         if (c == SCM_ICHAR_EOF) {
659 #if SCM_DEBUG
660             if (basecport && start_line) {
661                 cur_line = ScmBaseCharPort_line_number(basecport);
662                 ERR("EOF inside list at line ~ZU (started from line ~ZU)",
663                     cur_line, start_line);
664             } else
665 #endif
666                 ERR("EOF inside list");
667         } else if (c == closing_paren) {
668             DISCARD_LOOKAHEAD(port);
669             return lst;
670         } else if (c == '.') {
671             /* Since expressions that beginning with a dot are limited to '.',
672              * '...' and numbers in R5RS (See "7.1.1 Lexical structure"), the
673              * fixed size buffer can safely buffer them. */
674             read_token(port, &err, dot_buf, sizeof(dot_buf), SCM_CH_DELIMITER);
675 
676             if (dot_buf[1] == '\0') {
677 #if !SCM_STRICT_R5RS
678                 /* Although implicit delimiter around the dot is allowd by
679                  * R5RS, some other implementation doesn't parse so
680                  * (e.g. '("foo"."bar") is parsed as 3 element list which 2nd
681                  * elem is dot as symbol). To avoid introducing such
682                  * incompatibility problem into codes of SigScheme users,
683                  * require explicit whitespace around the dot. */
684                 c = scm_port_peek_char(port);
685                 if (!ICHAR_WHITESPACEP(c))
686                     ERR("implicit dot delimitation is disabled to avoid compatibility problem");
687 #endif
688                 if (NULLP(lst))
689                     ERR(".(dot) at the start of the list");
690 
691                 cdr = read_sexpression(port);
692                 c = skip_comment_and_space(port);
693                 DISCARD_LOOKAHEAD(port);
694                 if (c != closing_paren)
695                     ERR("bad dot syntax");
696 
697                 SCM_QUEUE_SLOPPY_APPEND(q, cdr);
698                 return lst;
699             } else if (strcmp(dot_buf, "...") == 0) {
700                 elm = SYM_ELLIPSIS;
701             } else {
702                 ERR("bad dot syntax");
703             }
704         } else {
705             elm = read_sexpression(port);
706         }
707     }
708 }
709 
710 #if SCM_USE_R6RS_CHARS
711 static scm_ichar_t
parse_unicode_sequence(const char * seq,int len)712 parse_unicode_sequence(const char *seq, int len)
713 {
714     scm_ichar_t c;
715     int err;
716     DECLARE_INTERNAL_FUNCTION("read");
717 
718     /* reject ordinary char literal and invalid signed hexadecimal */
719     if (len < 2 || seq[0] != 'x' || !ICHAR_HEXA_NUMERICP(seq[1]))
720         return -1;
721 
722     c = scm_string2number(&seq[1], 16, &err);
723     SCM_ASSERT(c >= 0);
724     if (err)
725         return -1;
726 
727     /* R6RS: 3.2.6 Strings
728      * the sequence of <digit 16>s forms a hexadecimal number between 0 and
729      * #x10FFFF excluding the range [#xD800, #xDFFF] */
730     if (!ICHAR_VALID_UNICODEP(c))
731         ERR("invalid Unicode value: 0x~MX", (scm_int_t)c);
732 
733     return c;
734 }
735 
736 static scm_ichar_t
read_unicode_sequence(ScmObj port)737 read_unicode_sequence(ScmObj port)
738 {
739     int err;
740     scm_ichar_t c;
741     size_t len;
742     char seq[sizeof("x0010ffff")];
743     DECLARE_INTERNAL_FUNCTION("read");
744 
745     seq[0] = 'x';
746     len = read_token(port, &err, &seq[1], sizeof(seq) - sizeof((char)'x'),
747                      SCM_CH_DELIMITER);
748     if (err == TOKEN_BUF_EXCEEDED)
749         goto err;
750 
751     c = parse_unicode_sequence(seq, len + sizeof((char)'x'));
752     if (c < 0)
753         goto err;
754 
755     return c;
756 
757  err:
758     ERR("invalid hex scalar value");
759 }
760 #endif /* SCM_USE_R6RS_CHARS */
761 
762 #if SCM_USE_CHAR
763 static ScmObj
read_char(ScmObj port)764 read_char(ScmObj port)
765 {
766     const ScmSpecialCharInfo *info;
767     size_t len;
768     scm_ichar_t c, next;
769 #if SCM_USE_R6RS_CHARS
770     scm_ichar_t unicode;
771 #endif
772     int err;
773     char buf[CHAR_LITERAL_LEN_MAX + sizeof("")];
774     DECLARE_INTERNAL_FUNCTION("read");
775 
776     /* raw char (multibyte-ready) */
777     c = scm_port_get_char(port);
778     next = scm_port_peek_char(port);
779     if (ICHAR_ASCII_CLASS(next) & SCM_CH_DELIMITER || next == SCM_ICHAR_EOF) {
780 #if !SCM_USE_R6RS_CHARS
781         if (!ICHAR_ASCIIP(c))
782             ERR("invalid character literal");
783 #endif
784         return MAKE_CHAR(c);
785     }
786 
787     buf[0] = c;
788     len = read_token(port, &err, &buf[1], sizeof(buf) - 1, SCM_CH_DELIMITER);
789     if (err == TOKEN_BUF_EXCEEDED)
790         ERR("invalid character literal");
791 
792     CDBG((SCM_DBG_PARSER, "read_char: ch = ~S", buf));
793 
794 #if SCM_USE_R6RS_CHARS
795     unicode = parse_unicode_sequence(buf, len + sizeof((char)c));
796     if (0 <= unicode)
797         return MAKE_CHAR(unicode);
798 #endif
799     /* named chars */
800     for (info = scm_special_char_table; info->esc_seq; info++) {
801         /*
802          * R5RS: 6.3.4 Characters
803          * Case is significant in #\<character>, but not in #\<character name>.
804          */
805         if (strcasecmp(buf, info->lex_rep) == 0)
806             return MAKE_CHAR(info->code);
807     }
808     ERR("invalid character literal: #\\~S", buf);
809 }
810 #endif /* SCM_USE_CHAR */
811 
812 #if SCM_USE_STRING
813 static ScmObj
read_string(ScmObj port)814 read_string(ScmObj port)
815 {
816     ScmObj obj;
817     const ScmSpecialCharInfo *info;
818     ScmCharCodec *codec;
819     scm_int_t len;
820     scm_ichar_t c;
821     char *p;
822     size_t offset;
823     ScmLBuf(char) lbuf;
824     char init_buf[SCM_INITIAL_STRING_BUF_SIZE];
825     DECLARE_INTERNAL_FUNCTION("read");
826 
827     CDBG((SCM_DBG_PARSER, "read_string"));
828 
829     LBUF_INIT(lbuf, init_buf, sizeof(init_buf));
830     codec = scm_port_codec(port);
831 
832     for (offset = 0, p = LBUF_BUF(lbuf), len = 0;
833          ;
834          offset = p - LBUF_BUF(lbuf), len++)
835     {
836         c = scm_port_get_char(port);
837 
838         CDBG((SCM_DBG_PARSER, "read_string c = ~C", c));
839 
840         switch (c) {
841         case SCM_ICHAR_EOF:
842             LBUF_FREE(lbuf);
843             ERR("EOF in string");
844             /* NOTREACHED */
845 
846         case '\"':
847             LBUF_EXTEND(lbuf, SCM_LBUF_F_STRING, offset + 1);
848             LBUF_BUF(lbuf)[offset] = '\0';
849             obj = MAKE_IMMUTABLE_STRING_COPYING(LBUF_BUF(lbuf), len);
850             LBUF_FREE(lbuf);
851             return obj;
852 
853         case '\\':
854             c = scm_port_get_char(port);
855 #if SCM_USE_R6RS_CHARS
856             if (c == 'x') {
857                 c = read_unicode_sequence(port);
858                 LBUF_EXTEND(lbuf, SCM_LBUF_F_STRING,
859                             offset + SCM_MB_CHAR_BUF_SIZE);
860                 p = &LBUF_BUF(lbuf)[offset];
861                 p = SCM_CHARCODEC_INT2STR(codec, p, c, SCM_MB_STATELESS);
862                 if (!p)
863                     ERR("invalid inline hex escape in string: 0x~MX",
864                         (scm_int_t)c);
865                 c = scm_port_get_char(port);
866                 if (c != ';')
867                     ERR("inline hex escape must be followed by ';'");
868                 goto found;
869             } else
870 #endif
871             {
872                 /* escape sequences */
873                 for (info = scm_special_char_table; info->esc_seq; info++) {
874                     if (strlen(info->esc_seq) == 2 && c == info->esc_seq[1]) {
875                         LBUF_EXTEND(lbuf, SCM_LBUF_F_STRING, offset + 1);
876                         p = &LBUF_BUF(lbuf)[offset];
877                         *p++ = info->code;
878                         goto found;
879                     }
880                 }
881             }
882             ERR("invalid escape sequence in string: \\~C", c);
883         found:
884             break;
885 
886         default:
887             LBUF_EXTEND(lbuf, SCM_LBUF_F_STRING,
888                         offset + SCM_MB_CHAR_BUF_SIZE);
889             p = &LBUF_BUF(lbuf)[offset];
890 #if SCM_USE_R6RS_CHARS
891             /* FIXME: support stateful encoding */
892             p = SCM_CHARCODEC_INT2STR(codec, p, c, SCM_MB_STATELESS);
893             if (!p)
894                 ERR("invalid char in string: 0x~MX", (scm_int_t)c);
895 #else
896             *p++ = c;
897 #endif
898             break;
899         }
900 #if !SCM_USE_NULL_CAPABLE_STRING
901         if (c == '\0')
902             ERR(SCM_ERRMSG_NULL_IN_STRING);
903 #endif
904     }
905 #if 0
906     LBUF_END(lbuf)[-1] = '\0';
907     ERR("too long string: \"~S\"", LBUF_BUF(lbuf));
908 #endif
909     /* NOTREACHED */
910 }
911 #endif /* SCM_USE_STRING */
912 
913 static ScmObj
read_symbol(ScmObj port)914 read_symbol(ScmObj port)
915 {
916     ScmObj sym;
917     size_t offset, tail_len;
918     int err;
919     ScmLBuf(char) lbuf;
920     char init_buf[SCM_INITIAL_SYMBOL_BUF_SIZE];
921 
922     CDBG((SCM_DBG_PARSER, "read_symbol"));
923 
924     LBUF_INIT(lbuf, init_buf, sizeof(init_buf));
925 
926     for (offset = 0;;) {
927         tail_len = read_token(port, &err,
928                               &LBUF_BUF(lbuf)[offset],
929                               LBUF_SIZE(lbuf) - offset,
930                               SCM_CH_DELIMITER);
931         if (err != TOKEN_BUF_EXCEEDED)
932             break;
933         offset += tail_len;
934         LBUF_EXTEND(lbuf, SCM_LBUF_F_SYMBOL,
935                     LBUF_SIZE(lbuf) + SCM_MB_CHAR_BUF_SIZE);
936     }
937 
938     sym = scm_intern(LBUF_BUF(lbuf));
939     LBUF_FREE(lbuf);
940 
941     return sym;
942 }
943 
944 static ScmObj
read_number_or_peculiar(ScmObj port)945 read_number_or_peculiar(ScmObj port)
946 {
947     scm_ichar_t c;
948     int err;
949     char buf[INT_LITERAL_LEN_MAX + sizeof("")];
950     DECLARE_INTERNAL_FUNCTION("read");
951 
952     CDBG((SCM_DBG_PARSER, "read"));
953 
954     c = scm_port_peek_char(port);
955     SCM_ASSERT(ICHAR_ASCII_CLASS(c)
956                & (SCM_CH_DIGIT | SCM_CH_PECULIAR_IDENTIFIER_CAND));
957 
958 #if SCM_USE_NUMBER
959     if (ICHAR_NUMERICP(c))
960         return read_number(port, 'd');
961 
962     if (c == '+' || c == '-') {
963         read_token(port, &err, buf, sizeof(buf), SCM_CH_DELIMITER);
964         if (err == TOKEN_BUF_EXCEEDED)
965             ERR("invalid number literal");
966 
967         /* '+' or '-' */
968         if (!buf[1])
969             return scm_intern(buf);
970 
971         return parse_number(port, buf, sizeof(buf), 'd');
972     }
973 #endif /* SCM_USE_NUMBER */
974 
975     if (c == '.') {
976         read_token(port, &err, buf, sizeof(buf), SCM_CH_DELIMITER);
977         if (strcmp(buf, "...") == 0)
978             return SYM_ELLIPSIS;
979         /* TODO: support numeric expressions when the numeric tower is
980            implemented */
981         ERR("invalid identifier: ~S", buf);
982     }
983 
984     if (c == '@')
985         ERR("invalid identifier starting with @");
986 
987     return read_symbol(port);
988 }
989 
990 #if SCM_USE_NUMBER
991 /* reads 'b123' part of #b123 */
992 static ScmObj
parse_number(ScmObj port,char * buf,size_t buf_size,char prefix)993 parse_number(ScmObj port, char *buf, size_t buf_size, char prefix)
994 {
995     scm_int_t number;
996     int radix;
997     scm_bool err;
998     DECLARE_INTERNAL_FUNCTION("read");
999 
1000     switch (prefix) {
1001     case 'b': radix = 2;  break;
1002     case 'o': radix = 8;  break;
1003     case 'd': radix = 10; break;
1004     case 'x': radix = 16; break;
1005     default:
1006         goto err;
1007     }
1008 
1009     number = scm_string2number(buf, radix, &err);
1010     if (!err)
1011         return MAKE_INT(number);
1012 
1013  err:
1014     ERR("ill-formatted number: #~C~S", (scm_ichar_t)prefix, buf);
1015 }
1016 
1017 static ScmObj
read_number(ScmObj port,char prefix)1018 read_number(ScmObj port, char prefix)
1019 {
1020     int err;
1021     char buf[INT_LITERAL_LEN_MAX + sizeof("")];
1022     DECLARE_INTERNAL_FUNCTION("read");
1023 
1024     read_token(port, &err, buf, sizeof(buf), SCM_CH_DELIMITER);
1025     if (err == TOKEN_BUF_EXCEEDED)
1026         ERR("invalid number literal");
1027 
1028     return parse_number(port, buf, sizeof(buf), prefix);
1029 }
1030 #endif /* SCM_USE_NUMBER */
1031 
1032 static ScmObj
read_quoted(ScmObj port,ScmObj quoter)1033 read_quoted(ScmObj port, ScmObj quoter)
1034 {
1035     ScmObj obj;
1036     DECLARE_INTERNAL_FUNCTION("read");
1037 
1038     obj = read_sexpression(port);
1039     if (EOFP(obj))
1040         ERR("EOF in ~a", quoter);
1041 
1042     return SCM_LIST_2(quoter, obj);
1043 }
1044 
1045 /*===========================================================================
1046   R5RS : 6.6 Input and Output : 6.6.2 Input
1047 ===========================================================================*/
1048 SCM_EXPORT ScmObj
scm_p_read(ScmObj args)1049 scm_p_read(ScmObj args)
1050 {
1051     ScmObj port;
1052     DECLARE_FUNCTION("read", procedure_variadic_0);
1053 
1054     port = scm_prepare_port(args, scm_in);
1055     return scm_read(port);
1056 }
1057