1 /* Copyright (C) 1995-1997, 1999-2001, 2003, 2004, 2006-2012, 2014-2019
2  *   Free Software Foundation, Inc.
3  *
4  * This library is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU Lesser General Public License
6  * as published by the Free Software Foundation; either version 3 of
7  * the License, or (at your option) any later version.
8  *
9  * This library is distributed in the hope that it will be useful, but
10  * WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12  * Lesser General Public License for more details.
13  *
14  * You should have received a copy of the GNU Lesser General Public
15  * License along with this library; if not, write to the Free Software
16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17  * 02110-1301 USA
18  */
19 
20 
21 
22 
23 #ifdef HAVE_CONFIG_H
24 # include <config.h>
25 #endif
26 
27 #include <stdio.h>
28 #include <string.h>
29 #include <unistd.h>
30 #include <unicase.h>
31 #include <unictype.h>
32 #include <c-strcase.h>
33 #include <c-ctype.h>
34 #include <alloca.h>
35 
36 #include "libguile/_scm.h"
37 #include "libguile/bytevectors.h"
38 #include "libguile/chars.h"
39 #include "libguile/eval.h"
40 #include "libguile/arrays.h"
41 #include "libguile/bitvectors.h"
42 #include "libguile/keywords.h"
43 #include "libguile/alist.h"
44 #include "libguile/srcprop.h"
45 #include "libguile/hashtab.h"
46 #include "libguile/hash.h"
47 #include "libguile/ports.h"
48 #include "libguile/ports-internal.h"
49 #include "libguile/fports.h"
50 #include "libguile/strings.h"
51 #include "libguile/strports.h"
52 #include "libguile/vectors.h"
53 #include "libguile/validate.h"
54 #include "libguile/srfi-4.h"
55 #include "libguile/srfi-13.h"
56 
57 #include "libguile/read.h"
58 #include "libguile/private-options.h"
59 
60 
61 
62 
63 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
64 SCM_SYMBOL (scm_keyword_prefix, "prefix");
65 SCM_SYMBOL (scm_keyword_postfix, "postfix");
66 SCM_SYMBOL (sym_nil, "nil");
67 SCM_SYMBOL (sym_ISO_8859_1, "ISO-8859-1");
68 
69 /* SRFI-105 curly infix expression support */
70 SCM_SYMBOL (sym_nfx, "$nfx$");
71 SCM_SYMBOL (sym_bracket_list, "$bracket-list$");
72 SCM_SYMBOL (sym_bracket_apply, "$bracket-apply$");
73 
74 scm_t_option scm_read_opts[] =
75   {
76     { SCM_OPTION_BOOLEAN, "copy", 0,
77       "Copy source code expressions." },
78     { SCM_OPTION_BOOLEAN, "positions", 1,
79       "Record positions of source code expressions." },
80     { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
81       "Convert symbols to lower case."},
82     { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
83       "Style of keyword recognition: #f, 'prefix or 'postfix."},
84     { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
85       "Use R6RS variable-length character and string hex escapes."},
86     { SCM_OPTION_BOOLEAN, "square-brackets", 1,
87       "Treat `[' and `]' as parentheses, for R6RS compatibility."},
88     { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
89       "In strings, consume leading whitespace after an escaped end-of-line."},
90     { SCM_OPTION_BOOLEAN, "curly-infix", 0,
91       "Support SRFI-105 curly infix expressions."},
92     { SCM_OPTION_BOOLEAN, "r7rs-symbols", 0,
93       "Support R7RS |...| symbol notation."},
94     { 0, },
95   };
96 
97 /* Internal read options structure.  This is initialized by 'scm_read'
98    from the global and per-port read options, and a pointer is passed
99    down to all helper functions. */
100 
101 enum t_keyword_style
102   {
103     KEYWORD_STYLE_HASH_PREFIX,
104     KEYWORD_STYLE_PREFIX,
105     KEYWORD_STYLE_POSTFIX
106   };
107 
108 struct t_read_opts
109 {
110   enum t_keyword_style keyword_style;
111   unsigned int copy_source_p        : 1;
112   unsigned int record_positions_p   : 1;
113   unsigned int case_insensitive_p   : 1;
114   unsigned int r6rs_escapes_p       : 1;
115   unsigned int square_brackets_p    : 1;
116   unsigned int hungry_eol_escapes_p : 1;
117   unsigned int curly_infix_p        : 1;
118   unsigned int neoteric_p           : 1;
119   unsigned int r7rs_symbols_p       : 1;
120 };
121 
122 typedef struct t_read_opts scm_t_read_opts;
123 
124 
125 /*
126   Give meaningful error messages for errors
127 
128   We use the format
129 
130   FILE:LINE:COL: MESSAGE
131   This happened in ....
132 
133   This is not standard GNU format, but the test-suite likes the real
134   message to be in front.
135 
136  */
137 
138 
139 void
scm_i_input_error(char const * function,SCM port,const char * message,SCM arg)140 scm_i_input_error (char const *function,
141 		   SCM port, const char *message, SCM arg)
142 {
143   SCM fn = (scm_is_string (SCM_FILENAME(port))
144 	    ? SCM_FILENAME(port)
145 	    : scm_from_utf8_string ("#<unknown port>"));
146 
147   SCM string_port = scm_open_output_string ();
148   SCM string = SCM_EOL;
149   scm_simple_format (string_port,
150 		     scm_from_utf8_string ("~A:~S:~S: ~A"),
151 		     scm_list_4 (fn,
152 				 scm_sum (scm_port_line (port), SCM_INUM1),
153 				 scm_sum (scm_port_column (port), SCM_INUM1),
154 				 scm_from_utf8_string (message)));
155 
156   string = scm_get_output_string (string_port);
157   scm_close_output_port (string_port);
158   scm_error_scm (scm_from_utf8_symbol ("read-error"),
159 		 function? scm_from_utf8_string (function) : SCM_BOOL_F,
160 		 string,
161 		 arg,
162 		 SCM_BOOL_F);
163 }
164 
165 
166 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
167             (SCM setting),
168 	    "Option interface for the read options. Instead of using\n"
169 	    "this procedure directly, use the procedures @code{read-enable},\n"
170 	    "@code{read-disable}, @code{read-set!} and @code{read-options}.")
171 #define FUNC_NAME s_scm_read_options
172 {
173   SCM ans = scm_options (setting,
174 			 scm_read_opts,
175 			 FUNC_NAME);
176   if (SCM_COPY_SOURCE_P)
177     SCM_RECORD_POSITIONS_P = 1;
178   return ans;
179 }
180 #undef FUNC_NAME
181 
182 /* A fluid referring to an association list mapping extra hash
183    characters to procedures.  */
184 static SCM *scm_i_read_hash_procedures;
185 
186 static SCM
scm_i_read_hash_procedures_ref(void)187 scm_i_read_hash_procedures_ref (void)
188 {
189   return scm_fluid_ref (*scm_i_read_hash_procedures);
190 }
191 
192 static void
scm_i_read_hash_procedures_set_x(SCM value)193 scm_i_read_hash_procedures_set_x (SCM value)
194 {
195   scm_fluid_set_x (*scm_i_read_hash_procedures, value);
196 }
197 
198 
199 /* Token readers.  */
200 
201 
202 /* Size of the C buffer used to read symbols and numbers.  */
203 #define READER_BUFFER_SIZE            128
204 
205 /* Number of 32-bit codepoints in the buffer used to read strings.  */
206 #define READER_STRING_BUFFER_SIZE     128
207 
208 /* The maximum size of Scheme character names.  */
209 #define READER_CHAR_NAME_MAX_SIZE      50
210 
211 /* The maximum size of reader directive names.  */
212 #define READER_DIRECTIVE_NAME_MAX_SIZE 50
213 
214 
215 /* `isblank' is only in C99.  */
216 #define CHAR_IS_BLANK_(_chr)					\
217   (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n')	\
218    || ((_chr) == '\f') || ((_chr) == '\r'))
219 
220 #ifdef MSDOS
221 # define CHAR_IS_BLANK(_chr)			\
222   ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
223 #else
224 # define CHAR_IS_BLANK CHAR_IS_BLANK_
225 #endif
226 
227 
228 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
229    structure'').  */
230 #define CHAR_IS_R5RS_DELIMITER(c)				\
231   (CHAR_IS_BLANK (c)						\
232    || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')
233 
234 #define CHAR_IS_DELIMITER(c)                                    \
235   (CHAR_IS_R5RS_DELIMITER (c)                                   \
236    || (((c) == ']' || (c) == '[') && (opts->square_brackets_p   \
237                                       || opts->curly_infix_p))  \
238    || (((c) == '}' || (c) == '{') && opts->curly_infix_p))
239 
240 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
241    Structure''.  */
242 #define CHAR_IS_EXPONENT_MARKER(_chr)				\
243   (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f')	\
244    || ((_chr) == 'd') || ((_chr) == 'l'))
245 
246 /* Read an SCSH block comment.  */
247 static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
248 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
249 static SCM scm_read_commented_expression (scm_t_wchar, SCM, scm_t_read_opts *);
250 static SCM scm_read_shebang (scm_t_wchar, SCM, scm_t_read_opts *);
251 static SCM scm_get_hash_procedure (int);
252 
253 /* Read from PORT until a delimiter (e.g., a whitespace) is read.  Put the
254    result in the pre-allocated buffer BUF.  Return zero if the whole token has
255    fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
256    bytes actually read.  */
257 static int
read_token(SCM port,scm_t_read_opts * opts,char * buf,size_t buf_size,size_t * read)258 read_token (SCM port, scm_t_read_opts *opts,
259             char *buf, size_t buf_size, size_t *read)
260 {
261    *read = 0;
262 
263    while (*read < buf_size)
264      {
265        int chr;
266 
267        chr = scm_get_byte_or_eof (port);
268 
269        if (chr == EOF)
270         return 0;
271       else if (CHAR_IS_DELIMITER (chr))
272         {
273           scm_unget_byte (chr, port);
274           return 0;
275         }
276       else
277         {
278           *buf = (char) chr;
279           buf++, (*read)++;
280         }
281      }
282 
283    return 1;
284  }
285 
286 /* Like `read_token', but return either BUFFER, or a GC-allocated buffer
287    if the token doesn't fit in BUFFER_SIZE bytes.  */
288 static char *
read_complete_token(SCM port,scm_t_read_opts * opts,char * buffer,size_t buffer_size,size_t * read)289 read_complete_token (SCM port, scm_t_read_opts *opts,
290                      char *buffer, size_t buffer_size, size_t *read)
291 {
292   int overflow = 0;
293   size_t bytes_read, overflow_size = 0;
294   char *overflow_buffer = NULL;
295 
296   do
297     {
298       overflow = read_token (port, opts, buffer, buffer_size, &bytes_read);
299       if (bytes_read == 0)
300         break;
301       if (overflow || overflow_size != 0)
302         {
303           if (overflow_size == 0)
304             {
305               overflow_buffer = scm_gc_malloc_pointerless (bytes_read, "read");
306               memcpy (overflow_buffer, buffer, bytes_read);
307               overflow_size = bytes_read;
308             }
309           else
310             {
311 	      char *new_buf =
312 		scm_gc_malloc_pointerless (overflow_size + bytes_read, "read");
313 
314 	      memcpy (new_buf, overflow_buffer, overflow_size);
315               memcpy (new_buf + overflow_size, buffer, bytes_read);
316 
317 	      overflow_buffer = new_buf;
318               overflow_size += bytes_read;
319             }
320         }
321     }
322   while (overflow);
323 
324   if (overflow_size)
325     *read = overflow_size;
326   else
327     *read = bytes_read;
328 
329   return (overflow_size > 0 ? overflow_buffer : buffer);
330 }
331 
332 /* Skip whitespace from PORT and return the first non-whitespace character
333    read.  Raise an error on end-of-file.  */
334 static int
flush_ws(SCM port,scm_t_read_opts * opts,const char * eoferr)335 flush_ws (SCM port, scm_t_read_opts *opts, const char *eoferr)
336 {
337   scm_t_wchar c;
338   while (1)
339     switch (c = scm_getc (port))
340       {
341       case EOF:
342       goteof:
343 	if (eoferr)
344 	  {
345 	    scm_i_input_error (eoferr,
346 			       port,
347 			       "end of file",
348 			       SCM_EOL);
349 	  }
350 	return c;
351 
352       case ';':
353       lp:
354 	switch (c = scm_getc (port))
355 	  {
356 	  case EOF:
357 	    goto goteof;
358 	  default:
359 	    goto lp;
360 	  case SCM_LINE_INCREMENTORS:
361 	    break;
362 	  }
363 	break;
364 
365       case '#':
366 	switch (c = scm_getc (port))
367 	  {
368 	  case EOF:
369 	    eoferr = "read_sharp";
370 	    goto goteof;
371 	  case '!':
372 	    scm_read_shebang (c, port, opts);
373 	    break;
374 	  case ';':
375 	    scm_read_commented_expression (c, port, opts);
376 	    break;
377 	  case '|':
378 	    if (scm_is_false (scm_get_hash_procedure (c)))
379 	      {
380 		scm_read_r6rs_block_comment (c, port);
381 		break;
382 	      }
383 	    /* fall through */
384 	  default:
385 	    scm_ungetc (c, port);
386 	    return '#';
387 	  }
388 	break;
389 
390       case SCM_LINE_INCREMENTORS:
391       case SCM_SINGLE_SPACES:
392       case '\t':
393 	break;
394 
395       default:
396 	return c;
397       }
398 
399   return 0;
400 }
401 
402 
403 
404 /* Token readers.  */
405 
406 static SCM scm_read_expression (SCM port, scm_t_read_opts *opts);
407 static SCM scm_read_sharp (int chr, SCM port, scm_t_read_opts *opts,
408                            long line, int column);
409 
410 
411 static SCM
maybe_annotate_source(SCM x,SCM port,scm_t_read_opts * opts,long line,int column)412 maybe_annotate_source (SCM x, SCM port, scm_t_read_opts *opts,
413                        long line, int column)
414 {
415   /* This condition can be caused by a user calling
416      set-port-column!.  */
417   if (line < 0 || column < 0)
418     return x;
419 
420   if (opts->record_positions_p)
421     scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
422   return x;
423 }
424 
425 static SCM
scm_read_sexp(scm_t_wchar chr,SCM port,scm_t_read_opts * opts)426 scm_read_sexp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
427 #define FUNC_NAME "scm_i_lreadparen"
428 {
429   int c;
430   SCM tmp, tl, ans = SCM_EOL;
431   const int curly_list_p = (chr == '{') && opts->curly_infix_p;
432   const int terminating_char = ((chr == '{') ? '}'
433                                 : ((chr == '[') ? ']'
434                                    : ')'));
435 
436   /* Need to capture line and column numbers here. */
437   long line = scm_to_long (scm_port_line (port));
438   int column = scm_to_int (scm_port_column (port)) - 1;
439 
440   c = flush_ws (port, opts, FUNC_NAME);
441   if (terminating_char == c)
442     return SCM_EOL;
443 
444   scm_ungetc (c, port);
445   tmp = scm_read_expression (port, opts);
446 
447   /* Note that it is possible for scm_read_expression to return
448      scm_sym_dot, but not as part of a dotted pair: as in #{.}#.  So
449      check that it's a real dot by checking `c'.  */
450   if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
451     {
452       ans = scm_read_expression (port, opts);
453       if (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
454 	scm_i_input_error (FUNC_NAME, port, "missing close paren",
455 			   SCM_EOL);
456       return ans;
457     }
458 
459   /* Build the head of the list structure. */
460   ans = tl = scm_cons (tmp, SCM_EOL);
461 
462   while (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
463     {
464       SCM new_tail;
465 
466       if (c == ')' || (c == ']' && opts->square_brackets_p)
467           || ((c == '}' || c == ']') && opts->curly_infix_p))
468         scm_i_input_error (FUNC_NAME, port,
469                            "in pair: mismatched close paren: ~A",
470                            scm_list_1 (SCM_MAKE_CHAR (c)));
471 
472       scm_ungetc (c, port);
473       tmp = scm_read_expression (port, opts);
474 
475       /* See above note about scm_sym_dot.  */
476       if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
477 	{
478 	  SCM_SETCDR (tl, scm_read_expression (port, opts));
479 
480 	  c = flush_ws (port, opts, FUNC_NAME);
481 	  if (terminating_char != c)
482 	    scm_i_input_error (FUNC_NAME, port,
483 			       "in pair: missing close paren", SCM_EOL);
484 	  break;
485 	}
486 
487       new_tail = scm_cons (tmp, SCM_EOL);
488       SCM_SETCDR (tl, new_tail);
489       tl = new_tail;
490     }
491 
492   if (curly_list_p)
493     {
494       /* In addition to finding the length, 'scm_ilength' checks for
495          improper or circular lists, in which case it returns -1. */
496       int len = scm_ilength (ans);
497 
498       /* The (len == 0) case is handled above */
499       if (len == 1)
500         /* Return directly to avoid re-annotating the element's source
501            location with the position of the outer brace.  Also, it
502            might not be possible to annotate the element. */
503         return scm_car (ans);  /* {e} => e */
504       else if (len == 2)
505         ;  /* Leave the list unchanged: {e1 e2} => (e1 e2) */
506       else if (len >= 3 && (len & 1))
507         {
508           /* It's a proper list whose length is odd and at least 3.  If
509              the elements at odd indices (the infix operator positions)
510              are all 'equal?', then it's a simple curly-infix list.
511              Otherwise it's a mixed curly-infix list. */
512           SCM op = scm_cadr (ans);
513 
514           /* Check to see if the elements at odd indices are 'equal?' */
515           for (tl = scm_cdddr (ans); ; tl = scm_cddr (tl))
516             {
517               if (scm_is_null (tl))
518                 {
519                   /* Convert simple curly-infix list to prefix:
520                      {a <op> b <op> ...} => (<op> a b ...) */
521                   tl = ans;
522                   while (scm_is_pair (scm_cdr (tl)))
523                     {
524                       tmp = scm_cddr (tl);
525                       SCM_SETCDR (tl, tmp);
526                       tl = tmp;
527                     }
528                   ans = scm_cons (op, ans);
529                   break;
530                 }
531               else if (scm_is_false (scm_equal_p (op, scm_car (tl))))
532                 {
533                   /* Mixed curly-infix list: {e ...} => ($nfx$ e ...) */
534                   ans = scm_cons (sym_nfx, ans);
535                   break;
536                 }
537             }
538         }
539       else
540         /* Mixed curly-infix (possibly improper) list:
541            {e . tail} => ($nfx$ e . tail) */
542         ans = scm_cons (sym_nfx, ans);
543     }
544 
545   return maybe_annotate_source (ans, port, opts, line, column);
546 }
547 #undef FUNC_NAME
548 
549 
550 /* Read a hexadecimal number NDIGITS in length.  Put its value into the variable
551    C.  If TERMINATOR is non-null, terminate early if the TERMINATOR character is
552    found.  */
553 #define SCM_READ_HEX_ESCAPE(ndigits, terminator)                   \
554   do                                                               \
555     {                                                              \
556       scm_t_wchar a;                                               \
557       size_t i = 0;                                                \
558       c = 0;                                                       \
559       while (i < ndigits)                                          \
560         {                                                          \
561           a = scm_getc (port);                                     \
562           if (a == EOF)                                            \
563             goto str_eof;                                          \
564           if (terminator                                           \
565               && (a == (scm_t_wchar) terminator)                   \
566               && (i > 0))                                          \
567             break;                                                 \
568           if ('0' <= a && a <= '9')                                \
569             a -= '0';                                              \
570           else if ('A' <= a && a <= 'F')                           \
571             a = a - 'A' + 10;                                      \
572           else if ('a' <= a && a <= 'f')                           \
573             a = a - 'a' + 10;                                      \
574           else                                                     \
575             {                                                      \
576               c = a;                                               \
577               goto bad_escaped;                                    \
578             }                                                      \
579           c = c * 16 + a;                                          \
580           i ++;                                                    \
581         }                                                          \
582     } while (0)
583 
584 static void
skip_intraline_whitespace(SCM port)585 skip_intraline_whitespace (SCM port)
586 {
587   scm_t_wchar c;
588 
589   do
590     {
591       c = scm_getc (port);
592       if (c == EOF)
593         return;
594     }
595   while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
596 
597   scm_ungetc (c, port);
598 }
599 
600 /* Read either a double-quoted string or an R7RS-style symbol delimited
601    by vertical lines, depending on the value of 'chr' ('"' or '|').
602    Regardless, the result is always returned as a string.  */
603 static SCM
scm_read_string_like_syntax(int chr,SCM port,scm_t_read_opts * opts)604 scm_read_string_like_syntax (int chr, SCM port, scm_t_read_opts *opts)
605 #define FUNC_NAME "scm_lreadr"
606 {
607   /* For strings smaller than C_STR, this function creates only one Scheme
608      object (the string returned).  */
609 
610   SCM str = SCM_EOL;
611   size_t c_str_len = 0;
612   scm_t_wchar c, c_str[READER_STRING_BUFFER_SIZE];
613 
614   /* Need to capture line and column numbers here. */
615   long line = scm_to_long (scm_port_line (port));
616   int column = scm_to_int (scm_port_column (port)) - 1;
617 
618   while (chr != (c = scm_getc (port)))
619     {
620       if (c == EOF)
621         {
622         str_eof:
623           scm_i_input_error (FUNC_NAME, port,
624                              (chr == '|'
625                               ? "end of file in symbol"
626                               : "end of file in string constant"),
627                              SCM_EOL);
628         }
629 
630       if (c_str_len + 1 >= READER_STRING_BUFFER_SIZE)
631 	{
632 	  str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
633 	  c_str_len = 0;
634 	}
635 
636       if (c == '\\')
637         {
638           switch (c = scm_getc (port))
639             {
640             case EOF:
641               goto str_eof;
642             case '|':
643             case '\\':
644             case '(':  /* Accept "\(" for use at the beginning of lines
645 			  in multiline strings to avoid confusing emacs
646 			  lisp modes.  */
647               break;
648             case '\n':
649               if (opts->hungry_eol_escapes_p)
650                 skip_intraline_whitespace (port);
651               continue;
652             case '0':
653               c = '\0';
654               break;
655             case 'f':
656               c = '\f';
657               break;
658             case 'n':
659               c = '\n';
660               break;
661             case 'r':
662               c = '\r';
663               break;
664             case 't':
665               c = '\t';
666               break;
667             case 'a':
668               c = '\007';
669               break;
670             case 'v':
671               c = '\v';
672               break;
673             case 'b':
674               c = '\010';
675               break;
676             case 'x':
677               if (opts->r6rs_escapes_p || chr == '|')
678                 SCM_READ_HEX_ESCAPE (10, ';');
679               else
680                 SCM_READ_HEX_ESCAPE (2, '\0');
681               break;
682             case 'u':
683               if (!opts->r6rs_escapes_p)
684                 {
685                   SCM_READ_HEX_ESCAPE (4, '\0');
686                   break;
687                 }
688             case 'U':
689               if (!opts->r6rs_escapes_p)
690                 {
691                   SCM_READ_HEX_ESCAPE (6, '\0');
692                   break;
693                 }
694             default:
695               if (c == chr)
696                 break;
697             bad_escaped:
698               scm_i_input_error (FUNC_NAME, port,
699                                  "illegal character in escape sequence: ~S",
700                                  scm_list_1 (SCM_MAKE_CHAR (c)));
701             }
702         }
703 
704       c_str[c_str_len++] = c;
705     }
706 
707   if (scm_is_null (str))
708     /* Fast path: we got a string that fits in C_STR.  */
709     str = scm_from_utf32_stringn (c_str, c_str_len);
710   else
711     {
712       if (c_str_len > 0)
713 	str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
714 
715       str = scm_string_concatenate_reverse (str, SCM_UNDEFINED, SCM_UNDEFINED);
716     }
717 
718   return maybe_annotate_source (str, port, opts, line, column);
719 }
720 #undef FUNC_NAME
721 
722 static SCM
scm_read_string(int chr,SCM port,scm_t_read_opts * opts)723 scm_read_string (int chr, SCM port, scm_t_read_opts *opts)
724 {
725   return scm_read_string_like_syntax (chr, port, opts);
726 }
727 
728 static SCM
scm_read_r7rs_symbol(int chr,SCM port,scm_t_read_opts * opts)729 scm_read_r7rs_symbol (int chr, SCM port, scm_t_read_opts *opts)
730 {
731   return scm_string_to_symbol (scm_read_string_like_syntax (chr, port, opts));
732 }
733 
734 static SCM
scm_read_number(scm_t_wchar chr,SCM port,scm_t_read_opts * opts)735 scm_read_number (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
736 {
737   SCM result, str = SCM_EOL;
738   char local_buffer[READER_BUFFER_SIZE], *buffer;
739   size_t bytes_read;
740 
741   /* Need to capture line and column numbers here. */
742   long line = scm_to_long (scm_port_line (port));
743   int column = scm_to_int (scm_port_column (port)) - 1;
744 
745   scm_ungetc (chr, port);
746   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
747 				&bytes_read);
748 
749   str = scm_from_port_stringn (buffer, bytes_read, port);
750 
751   result = scm_string_to_number (str, SCM_UNDEFINED);
752   if (scm_is_false (result))
753     {
754       /* Return a symbol instead of a number */
755       if (opts->case_insensitive_p)
756         str = scm_string_downcase_x (str);
757       result = scm_string_to_symbol (str);
758     }
759   else if (SCM_NIMP (result))
760     result = maybe_annotate_source (result, port, opts, line, column);
761 
762   scm_set_port_column_x (port,
763                          scm_sum (scm_port_column (port),
764                                   scm_string_length (str)));
765   return result;
766 }
767 
768 static SCM
scm_read_mixed_case_symbol(scm_t_wchar chr,SCM port,scm_t_read_opts * opts)769 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
770 {
771   SCM result;
772   int ends_with_colon = 0;
773   size_t bytes_read;
774   int postfix = (opts->keyword_style == KEYWORD_STYLE_POSTFIX);
775   char local_buffer[READER_BUFFER_SIZE], *buffer;
776   SCM str;
777 
778   scm_ungetc (chr, port);
779   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
780 				&bytes_read);
781   if (bytes_read > 0)
782     ends_with_colon = buffer[bytes_read - 1] == ':';
783 
784   if (postfix && ends_with_colon && (bytes_read > 1))
785     {
786       str = scm_from_port_stringn (buffer, bytes_read - 1, port);
787 
788       if (opts->case_insensitive_p)
789         str = scm_string_downcase_x (str);
790       result = scm_symbol_to_keyword (scm_string_to_symbol (str));
791     }
792   else
793     {
794       str = scm_from_port_stringn (buffer, bytes_read, port);
795 
796       if (opts->case_insensitive_p)
797         str = scm_string_downcase_x (str);
798       result = scm_string_to_symbol (str);
799     }
800 
801   scm_set_port_column_x (port,
802                          scm_sum (scm_port_column (port),
803                                   scm_string_length (str)));
804   return result;
805 }
806 
807 static SCM
scm_read_number_and_radix(scm_t_wchar chr,SCM port,scm_t_read_opts * opts)808 scm_read_number_and_radix (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
809 #define FUNC_NAME "scm_lreadr"
810 {
811   SCM result;
812   size_t read;
813   char local_buffer[READER_BUFFER_SIZE], *buffer;
814   unsigned int radix;
815   SCM str;
816 
817   switch (chr)
818     {
819     case 'B':
820     case 'b':
821       radix = 2;
822       break;
823 
824     case 'o':
825     case 'O':
826       radix = 8;
827       break;
828 
829     case 'd':
830     case 'D':
831       radix = 10;
832       break;
833 
834     case 'x':
835     case 'X':
836       radix = 16;
837       break;
838 
839     default:
840       scm_ungetc (chr, port);
841       scm_ungetc ('#', port);
842       radix = 10;
843     }
844 
845   buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
846 				&read);
847 
848   str = scm_from_port_stringn (buffer, read, port);
849 
850   result = scm_string_to_number (str, scm_from_uint (radix));
851 
852   scm_set_port_column_x (port,
853                          scm_sum (scm_port_column (port),
854                                   scm_string_length (str)));
855 
856   if (scm_is_true (result))
857     return result;
858 
859   scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
860 
861   return SCM_BOOL_F;
862 }
863 #undef FUNC_NAME
864 
865 static SCM
scm_read_quote(int chr,SCM port,scm_t_read_opts * opts)866 scm_read_quote (int chr, SCM port, scm_t_read_opts *opts)
867 {
868   SCM p;
869   long line = scm_to_long (scm_port_line (port));
870   int column = scm_to_int (scm_port_column (port)) - 1;
871 
872   switch (chr)
873     {
874     case '`':
875       p = scm_sym_quasiquote;
876       break;
877 
878     case '\'':
879       p = scm_sym_quote;
880       break;
881 
882     case ',':
883       {
884 	scm_t_wchar c;
885 
886 	c = scm_getc (port);
887 	if ('@' == c)
888 	  p = scm_sym_uq_splicing;
889 	else
890 	  {
891 	    scm_ungetc (c, port);
892 	    p = scm_sym_unquote;
893 	  }
894 	break;
895       }
896 
897     default:
898       fprintf (stderr, "%s: unhandled quote character (%i)\n",
899 	       "scm_read_quote", chr);
900       abort ();
901     }
902 
903   p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
904   return maybe_annotate_source (p, port, opts, line, column);
905 }
906 
907 SCM_SYMBOL (sym_syntax, "syntax");
908 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
909 SCM_SYMBOL (sym_unsyntax, "unsyntax");
910 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
911 
912 static SCM
scm_read_syntax(int chr,SCM port,scm_t_read_opts * opts)913 scm_read_syntax (int chr, SCM port, scm_t_read_opts *opts)
914 {
915   SCM p;
916   long line = scm_to_long (scm_port_line (port));
917   int column = scm_to_int (scm_port_column (port)) - 1;
918 
919   switch (chr)
920     {
921     case '`':
922       p = sym_quasisyntax;
923       break;
924 
925     case '\'':
926       p = sym_syntax;
927       break;
928 
929     case ',':
930       {
931 	int c;
932 
933 	c = scm_getc (port);
934 	if ('@' == c)
935 	  p = sym_unsyntax_splicing;
936 	else
937 	  {
938 	    scm_ungetc (c, port);
939 	    p = sym_unsyntax;
940 	  }
941 	break;
942       }
943 
944     default:
945       fprintf (stderr, "%s: unhandled syntax character (%i)\n",
946 	       "scm_read_syntax", chr);
947       abort ();
948     }
949 
950   p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
951   return maybe_annotate_source (p, port, opts, line, column);
952 }
953 
954 static SCM
scm_read_nil(int chr,SCM port,scm_t_read_opts * opts)955 scm_read_nil (int chr, SCM port, scm_t_read_opts *opts)
956 {
957   SCM id = scm_read_mixed_case_symbol (chr, port, opts);
958 
959   if (!scm_is_eq (id, sym_nil))
960     scm_i_input_error ("scm_read_nil", port,
961                        "unexpected input while reading #nil: ~a",
962                        scm_list_1 (id));
963 
964   return SCM_ELISP_NIL;
965 }
966 
967 static SCM
scm_read_semicolon_comment(int chr,SCM port)968 scm_read_semicolon_comment (int chr, SCM port)
969 {
970   int c;
971 
972   /* We use the get_byte here because there is no need to get the
973      locale correct with comment input. This presumes that newline
974      always represents itself no matter what the encoding is.  */
975   for (c = scm_get_byte_or_eof (port);
976        (c != EOF) && (c != '\n');
977        c = scm_get_byte_or_eof (port));
978 
979   return SCM_UNSPECIFIED;
980 }
981 
982 /* If the EXPECTED_CHARS are the next ones available from PORT, then
983    consume them and return 1.  Otherwise leave the port position where
984    it was and return 0.  EXPECTED_CHARS should be all lowercase, and
985    will be matched case-insensitively against the characters read from
986    PORT. */
987 static int
try_read_ci_chars(SCM port,const char * expected_chars)988 try_read_ci_chars (SCM port, const char *expected_chars)
989 {
990   int num_chars_wanted = strlen (expected_chars);
991   int num_chars_read = 0;
992   char *chars_read = alloca (num_chars_wanted);
993   int c;
994 
995   while (num_chars_read < num_chars_wanted)
996     {
997       c = scm_getc (port);
998       if (c == EOF)
999         break;
1000       else if (c_tolower (c) != expected_chars[num_chars_read])
1001         {
1002           scm_ungetc (c, port);
1003           break;
1004         }
1005       else
1006         chars_read[num_chars_read++] = c;
1007     }
1008 
1009   if (num_chars_read == num_chars_wanted)
1010     return 1;
1011   else
1012     {
1013       while (num_chars_read > 0)
1014         scm_ungetc (chars_read[--num_chars_read], port);
1015       return 0;
1016     }
1017 }
1018 
1019 
1020 /* Sharp readers, i.e. readers called after a `#' sign has been read.  */
1021 
1022 static SCM
scm_read_boolean(int chr,SCM port)1023 scm_read_boolean (int chr, SCM port)
1024 {
1025   switch (chr)
1026     {
1027     case 't':
1028     case 'T':
1029       try_read_ci_chars (port, "rue");
1030       return SCM_BOOL_T;
1031 
1032     case 'f':
1033     case 'F':
1034       try_read_ci_chars (port, "alse");
1035       return SCM_BOOL_F;
1036     }
1037 
1038   return SCM_UNSPECIFIED;
1039 }
1040 
1041 static SCM
scm_read_character(scm_t_wchar chr,SCM port,scm_t_read_opts * opts)1042 scm_read_character (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
1043 #define FUNC_NAME "scm_lreadr"
1044 {
1045   char buffer[READER_CHAR_NAME_MAX_SIZE];
1046   SCM charname;
1047   size_t charname_len, bytes_read;
1048   scm_t_wchar cp;
1049   int overflow;
1050   scm_t_port *pt;
1051 
1052   overflow = read_token (port, opts, buffer, READER_CHAR_NAME_MAX_SIZE,
1053                          &bytes_read);
1054   if (overflow)
1055     scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
1056 
1057   if (bytes_read == 0)
1058     {
1059       chr = scm_getc (port);
1060       if (chr == EOF)
1061 	scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
1062 			   "while reading character", SCM_EOL);
1063 
1064       /* CHR must be a token delimiter, like a whitespace.  */
1065       return (SCM_MAKE_CHAR (chr));
1066     }
1067 
1068   pt = SCM_PORT (port);
1069 
1070   /* Simple ASCII characters can be processed immediately.  Also, simple
1071      ISO-8859-1 characters can be processed immediately if the encoding for this
1072      port is ISO-8859-1.  */
1073   if (bytes_read == 1 &&
1074       ((unsigned char) buffer[0] <= 127
1075        || scm_is_eq (pt->encoding, sym_ISO_8859_1)))
1076     {
1077       scm_set_port_column_x (port, scm_sum (scm_port_column (port), SCM_INUM1));
1078       return SCM_MAKE_CHAR (buffer[0]);
1079     }
1080 
1081   /* Otherwise, convert the buffer into a proper scheme string for
1082      processing.  */
1083   charname = scm_from_port_stringn (buffer, bytes_read, port);
1084   charname_len = scm_i_string_length (charname);
1085   scm_set_port_column_x (port,
1086                          scm_sum (scm_port_column (port),
1087                                   scm_from_size_t (charname_len)));
1088   cp = scm_i_string_ref (charname, 0);
1089   if (charname_len == 1)
1090     return SCM_MAKE_CHAR (cp);
1091 
1092   /* Ignore dotted circles, which may be used to keep combining characters from
1093      combining with the backslash in #\charname.  */
1094   if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
1095     return scm_i_make_char (scm_i_string_ref (charname, 1));
1096 
1097   if (cp >= '0' && cp < '8')
1098     {
1099       /* Dirk:FIXME::  This type of character syntax is not R5RS
1100        * compliant.  Further, it should be verified that the constant
1101        * does only consist of octal digits.  */
1102       SCM p = scm_string_to_number (charname, scm_from_uint (8));
1103       if (SCM_I_INUMP (p))
1104         {
1105           scm_t_wchar c = scm_to_uint32 (p);
1106           if (SCM_IS_UNICODE_CHAR (c))
1107             return SCM_MAKE_CHAR (c);
1108           else
1109             scm_i_input_error (FUNC_NAME, port,
1110                                "out-of-range octal character escape: ~a",
1111                                scm_list_1 (charname));
1112         }
1113     }
1114 
1115   if (cp == 'x' && (charname_len > 1))
1116     {
1117       SCM p;
1118 
1119       /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1120       p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
1121                                 scm_from_uint (16));
1122       if (SCM_I_INUMP (p))
1123         {
1124           scm_t_wchar c = scm_to_uint32 (p);
1125           if (SCM_IS_UNICODE_CHAR (c))
1126             return SCM_MAKE_CHAR (c);
1127           else
1128             scm_i_input_error (FUNC_NAME, port,
1129                                "out-of-range hex character escape: ~a",
1130                                scm_list_1 (charname));
1131         }
1132     }
1133 
1134   /* The names of characters should never have non-Latin1
1135      characters.  */
1136   if (scm_i_is_narrow_string (charname)
1137       || scm_i_try_narrow_string (charname))
1138     { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
1139                                        charname_len);
1140       if (scm_is_true (ch))
1141         return ch;
1142     }
1143 
1144   scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
1145 		     scm_list_1 (charname));
1146 
1147   return SCM_UNSPECIFIED;
1148 }
1149 #undef FUNC_NAME
1150 
1151 static SCM
scm_read_keyword(int chr,SCM port,scm_t_read_opts * opts)1152 scm_read_keyword (int chr, SCM port, scm_t_read_opts *opts)
1153 {
1154   SCM symbol;
1155 
1156   /* Read the symbol that comprises the keyword.  Doing this instead of
1157      invoking a specific symbol reader function allows `scm_read_keyword ()'
1158      to adapt to the delimiters currently valid of symbols.
1159 
1160      XXX: This implementation allows sloppy syntaxes like `#:  key'.  */
1161   symbol = scm_read_expression (port, opts);
1162   if (!scm_is_symbol (symbol))
1163     scm_i_input_error ("scm_read_keyword", port,
1164 		       "keyword prefix `~a' not followed by a symbol: ~s",
1165 		       scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1166 
1167   return (scm_symbol_to_keyword (symbol));
1168 }
1169 
1170 static SCM
scm_read_vector(int chr,SCM port,scm_t_read_opts * opts,long line,int column)1171 scm_read_vector (int chr, SCM port, scm_t_read_opts *opts,
1172                  long line, int column)
1173 {
1174   /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1175      guarantee that it's going to do what we want.  After all, this is an
1176      implementation detail of `scm_read_vector ()', not a desirable
1177      property.  */
1178   return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port, opts)),
1179                                 port, opts, line, column);
1180 }
1181 
1182 /* Helper used by scm_read_array */
1183 static int
read_decimal_integer(SCM port,int c,ssize_t * resp)1184 read_decimal_integer (SCM port, int c, ssize_t *resp)
1185 {
1186   ssize_t sign = 1;
1187   ssize_t res = 0;
1188   int got_it = 0;
1189 
1190   if (c == '-')
1191     {
1192       sign = -1;
1193       c = scm_getc (port);
1194     }
1195 
1196   while ('0' <= c && c <= '9')
1197     {
1198       if (((SSIZE_MAX - (c-'0')) / 10) <= res)
1199         scm_i_input_error ("read_decimal_integer", port,
1200                            "number too large", SCM_EOL);
1201       res = 10*res + c-'0';
1202       got_it = 1;
1203       c = scm_getc (port);
1204     }
1205 
1206   if (got_it)
1207     *resp = sign * res;
1208   return c;
1209 }
1210 
1211 /* Read an array.  This function can also read vectors and uniform
1212    vectors.  Also, the conflict between '#f' and '#f32' and '#f64' is
1213    handled here.
1214 
1215    C is the first character read after the '#'. */
1216 static SCM
scm_read_array(int c,SCM port,scm_t_read_opts * opts,long line,int column)1217 scm_read_array (int c, SCM port, scm_t_read_opts *opts, long line, int column)
1218 {
1219   ssize_t rank;
1220   scm_t_wchar tag_buf[8];
1221   int tag_len;
1222 
1223   SCM tag, shape = SCM_BOOL_F, elements, array;
1224 
1225   /* XXX - shortcut for ordinary vectors.  Shouldn't be necessary but
1226      the array code can not deal with zero-length dimensions yet, and
1227      we want to allow zero-length vectors, of course. */
1228   if (c == '(')
1229     return scm_read_vector (c, port, opts, line, column);
1230 
1231   /* Disambiguate between '#f' and uniform floating point vectors. */
1232   if (c == 'f')
1233     {
1234       c = scm_getc (port);
1235       if (c != '3' && c != '6')
1236 	{
1237           if (c == 'a' && try_read_ci_chars (port, "lse"))
1238             return SCM_BOOL_F;
1239           else if (c != EOF)
1240             scm_ungetc (c, port);
1241 	  return SCM_BOOL_F;
1242 	}
1243       rank = 1;
1244       tag_buf[0] = 'f';
1245       tag_len = 1;
1246       goto continue_reading_tag;
1247     }
1248 
1249   /* Read rank. */
1250   rank = 1;
1251   c = read_decimal_integer (port, c, &rank);
1252   if (rank < 0)
1253     scm_i_input_error (NULL, port, "array rank must be non-negative",
1254 		       SCM_EOL);
1255 
1256   /* Read tag. */
1257   tag_len = 0;
1258  continue_reading_tag:
1259   while (c != EOF && c != '(' && c != '@' && c != ':'
1260          && tag_len < sizeof tag_buf / sizeof tag_buf[0])
1261     {
1262       tag_buf[tag_len++] = c;
1263       c = scm_getc (port);
1264     }
1265   if (tag_len == 0)
1266     tag = SCM_BOOL_T;
1267   else
1268     {
1269       tag = scm_string_to_symbol (scm_from_utf32_stringn (tag_buf, tag_len));
1270       if (tag_len == sizeof tag_buf / sizeof tag_buf[0])
1271         scm_i_input_error (NULL, port, "invalid array tag, starting with: ~a",
1272                            scm_list_1 (tag));
1273     }
1274 
1275   /* Read shape. */
1276   if (c == '@' || c == ':')
1277     {
1278       shape = SCM_EOL;
1279 
1280       do
1281 	{
1282 	  ssize_t lbnd = 0, len = 0;
1283 	  SCM s;
1284 
1285 	  if (c == '@')
1286 	    {
1287 	      c = scm_getc (port);
1288 	      c = read_decimal_integer (port, c, &lbnd);
1289 	    }
1290 
1291 	  s = scm_from_ssize_t (lbnd);
1292 
1293 	  if (c == ':')
1294 	    {
1295 	      c = scm_getc (port);
1296 	      c = read_decimal_integer (port, c, &len);
1297 	      if (len < 0)
1298 		scm_i_input_error (NULL, port,
1299 				   "array length must be non-negative",
1300 				   SCM_EOL);
1301 
1302 	      s = scm_list_2 (s, scm_from_ssize_t (lbnd+len-1));
1303 	    }
1304 
1305 	  shape = scm_cons (s, shape);
1306 	} while (c == '@' || c == ':');
1307 
1308       shape = scm_reverse_x (shape, SCM_EOL);
1309     }
1310 
1311   /* Read nested lists of elements. */
1312   if (c != '(')
1313     scm_i_input_error (NULL, port,
1314 		       "missing '(' in vector or array literal",
1315 		       SCM_EOL);
1316   elements = scm_read_sexp (c, port, opts);
1317 
1318   if (scm_is_false (shape))
1319     shape = scm_from_ssize_t (rank);
1320   else if (scm_ilength (shape) != rank)
1321     scm_i_input_error
1322       (NULL, port,
1323        "the number of shape specifications must match the array rank",
1324        SCM_EOL);
1325 
1326   /* Handle special print syntax of rank zero arrays; see
1327      scm_i_print_array for a rationale. */
1328   if (rank == 0)
1329     {
1330       if (!scm_is_pair (elements))
1331 	scm_i_input_error (NULL, port,
1332 			   "too few elements in array literal, need 1",
1333 			   SCM_EOL);
1334       if (!scm_is_null (SCM_CDR (elements)))
1335 	scm_i_input_error (NULL, port,
1336 			   "too many elements in array literal, want 1",
1337 			   SCM_EOL);
1338       elements = SCM_CAR (elements);
1339     }
1340 
1341   /* Construct array, annotate with source location, and return. */
1342   array = scm_list_to_typed_array (tag, shape, elements);
1343   return maybe_annotate_source (array, port, opts, line, column);
1344 }
1345 
1346 static SCM
scm_read_srfi4_vector(int chr,SCM port,scm_t_read_opts * opts,long line,int column)1347 scm_read_srfi4_vector (int chr, SCM port, scm_t_read_opts *opts,
1348                        long line, int column)
1349 {
1350   return scm_read_array (chr, port, opts, line, column);
1351 }
1352 
1353 static SCM
scm_read_bytevector(scm_t_wchar chr,SCM port,scm_t_read_opts * opts,long line,int column)1354 scm_read_bytevector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1355                      long line, int column)
1356 {
1357   chr = scm_getc (port);
1358   if (chr != 'u')
1359     goto syntax;
1360 
1361   chr = scm_getc (port);
1362   if (chr != '8')
1363     goto syntax;
1364 
1365   chr = scm_getc (port);
1366   if (chr != '(')
1367     goto syntax;
1368 
1369   return maybe_annotate_source
1370     (scm_u8_list_to_bytevector (scm_read_sexp (chr, port, opts)),
1371      port, opts, line, column);
1372 
1373  syntax:
1374   scm_i_input_error ("read_bytevector", port,
1375 		     "invalid bytevector prefix",
1376 		     SCM_MAKE_CHAR (chr));
1377   return SCM_UNSPECIFIED;
1378 }
1379 
1380 static SCM
scm_read_guile_bit_vector(scm_t_wchar chr,SCM port,scm_t_read_opts * opts,long line,int column)1381 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1382                            long line, int column)
1383 {
1384   /* Read the `#*10101'-style read syntax for bit vectors in Guile.  This is
1385      terribly inefficient but who cares?  */
1386   SCM s_bits = SCM_EOL;
1387 
1388   for (chr = scm_getc (port);
1389        (chr != EOF) && ((chr == '0') || (chr == '1'));
1390        chr = scm_getc (port))
1391     {
1392       s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1393     }
1394 
1395   if (chr != EOF)
1396     scm_ungetc (chr, port);
1397 
1398   return maybe_annotate_source
1399     (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
1400      port, opts, line, column);
1401 }
1402 
1403 static SCM
scm_read_scsh_block_comment(scm_t_wchar chr,SCM port)1404 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1405 {
1406   int bang_seen = 0;
1407 
1408   for (;;)
1409     {
1410       int c = scm_getc (port);
1411 
1412       if (c == EOF)
1413 	scm_i_input_error ("skip_block_comment", port,
1414 			   "unterminated `#! ... !#' comment", SCM_EOL);
1415 
1416       if (c == '!')
1417 	bang_seen = 1;
1418       else if (c == '#' && bang_seen)
1419 	break;
1420       else
1421 	bang_seen = 0;
1422     }
1423 
1424   return SCM_UNSPECIFIED;
1425 }
1426 
1427 static void set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts,
1428                                          int value);
1429 static void set_port_square_brackets_p (SCM port, scm_t_read_opts *opts,
1430                                         int value);
1431 static void set_port_curly_infix_p (SCM port, scm_t_read_opts *opts,
1432                                     int value);
1433 static void set_port_r6rs_hex_escapes_p (SCM port, scm_t_read_opts *opts,
1434                                          int value);
1435 static void set_port_hungry_eol_escapes_p (SCM port, scm_t_read_opts *opts,
1436                                            int value);
1437 static void set_port_keyword_style (SCM port, scm_t_read_opts *opts,
1438                                     enum t_keyword_style value);
1439 
1440 static SCM
scm_read_shebang(scm_t_wchar chr,SCM port,scm_t_read_opts * opts)1441 scm_read_shebang (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
1442 {
1443   char name[READER_DIRECTIVE_NAME_MAX_SIZE + 1];
1444   int c;
1445   int i = 0;
1446 
1447   while (i <= READER_DIRECTIVE_NAME_MAX_SIZE)
1448     {
1449       c = scm_getc (port);
1450       if (c == EOF)
1451 	scm_i_input_error ("skip_block_comment", port,
1452 			   "unterminated `#! ... !#' comment", SCM_EOL);
1453       else if (('a' <= c && c <= 'z') || ('0' <= c && c <= '9') || c == '-')
1454         name[i++] = c;
1455       else if (CHAR_IS_DELIMITER (c))
1456         {
1457           scm_ungetc (c, port);
1458           name[i] = '\0';
1459           if (0 == strcmp ("r6rs", name))
1460             {
1461               set_port_case_insensitive_p (port, opts, 0);
1462               set_port_r6rs_hex_escapes_p (port, opts, 1);
1463               set_port_square_brackets_p (port, opts, 1);
1464               set_port_keyword_style (port, opts, KEYWORD_STYLE_HASH_PREFIX);
1465               set_port_hungry_eol_escapes_p (port, opts, 1);
1466             }
1467           else if (0 == strcmp ("fold-case", name))
1468             set_port_case_insensitive_p (port, opts, 1);
1469           else if (0 == strcmp ("no-fold-case", name))
1470             set_port_case_insensitive_p (port, opts, 0);
1471           else if (0 == strcmp ("curly-infix", name))
1472             set_port_curly_infix_p (port, opts, 1);
1473           else if (0 == strcmp ("curly-infix-and-bracket-lists", name))
1474             {
1475               set_port_curly_infix_p (port, opts, 1);
1476               set_port_square_brackets_p (port, opts, 0);
1477             }
1478           else
1479             break;
1480 
1481           return SCM_UNSPECIFIED;
1482         }
1483       else
1484         {
1485           scm_ungetc (c, port);
1486           break;
1487         }
1488     }
1489   while (i > 0)
1490     scm_ungetc (name[--i], port);
1491   return scm_read_scsh_block_comment (chr, port);
1492 }
1493 
1494 static SCM
scm_read_r6rs_block_comment(scm_t_wchar chr,SCM port)1495 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1496 {
1497   /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1498      nested.  So care must be taken.  */
1499   int nesting_level = 1;
1500 
1501   int a = scm_getc (port);
1502 
1503   if (a == EOF)
1504     scm_i_input_error ("scm_read_r6rs_block_comment", port,
1505                        "unterminated `#| ... |#' comment", SCM_EOL);
1506 
1507   while (nesting_level > 0)
1508     {
1509       int b = scm_getc (port);
1510 
1511       if (b == EOF)
1512 	scm_i_input_error ("scm_read_r6rs_block_comment", port,
1513 			   "unterminated `#| ... |#' comment", SCM_EOL);
1514 
1515       if (a == '|' && b == '#')
1516         {
1517           nesting_level--;
1518           b = EOF;
1519         }
1520       else if (a == '#' && b == '|')
1521         {
1522           nesting_level++;
1523           b = EOF;
1524         }
1525 
1526       a = b;
1527     }
1528 
1529   return SCM_UNSPECIFIED;
1530 }
1531 
1532 static SCM
scm_read_commented_expression(scm_t_wchar chr,SCM port,scm_t_read_opts * opts)1533 scm_read_commented_expression (scm_t_wchar chr, SCM port,
1534                                scm_t_read_opts *opts)
1535 {
1536   scm_t_wchar c;
1537 
1538   c = flush_ws (port, opts, (char *) NULL);
1539   if (EOF == c)
1540     scm_i_input_error ("read_commented_expression", port,
1541                        "no expression after #; comment", SCM_EOL);
1542   scm_ungetc (c, port);
1543   scm_read_expression (port, opts);
1544   return SCM_UNSPECIFIED;
1545 }
1546 
1547 static SCM
scm_read_extended_symbol(scm_t_wchar chr,SCM port)1548 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1549 {
1550   /* Guile's extended symbol read syntax looks like this:
1551 
1552        #{This is all a symbol name}#
1553 
1554      So here, CHR is expected to be `{'.  */
1555   int saw_brace = 0;
1556   size_t len = 0;
1557   SCM buf = scm_i_make_string (1024, NULL, 0);
1558 
1559   /* No need to scm_i_string_start_writing (), as the string isn't
1560      visible to any other thread.  */
1561 
1562   while ((chr = scm_getc (port)) != EOF)
1563     {
1564       if (saw_brace)
1565 	{
1566 	  if (chr == '#')
1567 	    {
1568 	      break;
1569 	    }
1570 	  else
1571 	    {
1572 	      saw_brace = 0;
1573 	      scm_i_string_set_x (buf, len++, '}');
1574 	    }
1575 	}
1576 
1577       if (chr == '}')
1578 	saw_brace = 1;
1579       else if (chr == '\\')
1580         {
1581           /* It used to be that print.c would print extended-read-syntax
1582              symbols with backslashes before "non-standard" chars, but
1583              this routine wouldn't do anything with those escapes.
1584              Bummer.  What we've done is to change print.c to output
1585              R6RS hex escapes for those characters, relying on the fact
1586              that the extended read syntax would never put a `\' before
1587              an `x'.  For now, we just ignore other instances of
1588              backslash in the string.  */
1589           switch ((chr = scm_getc (port)))
1590             {
1591             case EOF:
1592               goto done;
1593             case 'x':
1594               {
1595                 scm_t_wchar c;
1596 
1597                 SCM_READ_HEX_ESCAPE (10, ';');
1598                 scm_i_string_set_x (buf, len++, c);
1599                 break;
1600 
1601               str_eof:
1602                 chr = EOF;
1603                 goto done;
1604 
1605               bad_escaped:
1606                 scm_i_string_stop_writing ();
1607                 scm_i_input_error ("scm_read_extended_symbol", port,
1608                                    "illegal character in escape sequence: ~S",
1609                                    scm_list_1 (SCM_MAKE_CHAR (c)));
1610                 break;
1611               }
1612             default:
1613 	      scm_i_string_set_x (buf, len++, chr);
1614               break;
1615             }
1616         }
1617       else
1618         scm_i_string_set_x (buf, len++, chr);
1619 
1620       if (len >= scm_i_string_length (buf) - 2)
1621 	{
1622 	  SCM addy;
1623 
1624 	  addy = scm_i_make_string (1024, NULL, 0);
1625 	  buf = scm_string_append (scm_list_2 (buf, addy));
1626 	  len = 0;
1627 	}
1628     }
1629 
1630  done:
1631   if (chr == EOF)
1632     scm_i_input_error ("scm_read_extended_symbol", port,
1633                        "end of file while reading symbol", SCM_EOL);
1634 
1635   return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1636 }
1637 
1638 
1639 
1640 /* Top-level token readers, i.e., dispatchers.  */
1641 
1642 static SCM
scm_read_sharp_extension(int chr,SCM port,scm_t_read_opts * opts)1643 scm_read_sharp_extension (int chr, SCM port, scm_t_read_opts *opts)
1644 {
1645   SCM proc;
1646 
1647   proc = scm_get_hash_procedure (chr);
1648   if (scm_is_true (scm_procedure_p (proc)))
1649     {
1650       long line = scm_to_long (scm_port_line (port));
1651       int column = scm_to_int (scm_port_column (port)) - 2;
1652       SCM got;
1653 
1654       got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1655 
1656       if (opts->record_positions_p && SCM_NIMP (got)
1657           && !scm_i_has_source_properties (got))
1658         scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1659 
1660       return got;
1661     }
1662 
1663   return SCM_UNSPECIFIED;
1664 }
1665 
1666 /* The reader for the sharp `#' character.  It basically dispatches reads
1667    among the above token readers.   */
1668 static SCM
scm_read_sharp(scm_t_wchar chr,SCM port,scm_t_read_opts * opts,long line,int column)1669 scm_read_sharp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1670                 long line, int column)
1671 #define FUNC_NAME "scm_lreadr"
1672 {
1673   SCM result;
1674 
1675   chr = scm_getc (port);
1676 
1677   result = scm_read_sharp_extension (chr, port, opts);
1678   if (!scm_is_eq (result, SCM_UNSPECIFIED))
1679     return result;
1680 
1681   switch (chr)
1682     {
1683     case '\\':
1684       return (scm_read_character (chr, port, opts));
1685     case '(':
1686       return (scm_read_vector (chr, port, opts, line, column));
1687     case 's':
1688     case 'u':
1689     case 'f':
1690     case 'c':
1691       /* This one may return either a boolean or an SRFI-4 vector.  */
1692       return (scm_read_srfi4_vector (chr, port, opts, line, column));
1693     case 'v':
1694       return (scm_read_bytevector (chr, port, opts, line, column));
1695     case '*':
1696       return (scm_read_guile_bit_vector (chr, port, opts, line, column));
1697     case 't':
1698     case 'T':
1699     case 'F':
1700       return (scm_read_boolean (chr, port));
1701     case ':':
1702       return (scm_read_keyword (chr, port, opts));
1703     case '0': case '1': case '2': case '3': case '4':
1704     case '5': case '6': case '7': case '8': case '9':
1705     case '@':
1706       return (scm_read_array (chr, port, opts, line, column));
1707 
1708     case 'i':
1709     case 'e':
1710     case 'b':
1711     case 'B':
1712     case 'o':
1713     case 'O':
1714     case 'd':
1715     case 'D':
1716     case 'x':
1717     case 'X':
1718     case 'I':
1719     case 'E':
1720       return (scm_read_number_and_radix (chr, port, opts));
1721     case '{':
1722       return (scm_read_extended_symbol (chr, port));
1723     case '!':
1724       return (scm_read_shebang (chr, port, opts));
1725     case ';':
1726       return (scm_read_commented_expression (chr, port, opts));
1727     case '`':
1728     case '\'':
1729     case ',':
1730       return (scm_read_syntax (chr, port, opts));
1731     case 'n':
1732       return (scm_read_nil (chr, port, opts));
1733     default:
1734       result = scm_read_sharp_extension (chr, port, opts);
1735       if (scm_is_eq (result, SCM_UNSPECIFIED))
1736 	{
1737 	  /* To remain compatible with 1.8 and earlier, the following
1738 	     characters have lower precedence than `read-hash-extend'
1739 	     characters.  */
1740 	  switch (chr)
1741 	    {
1742 	    case '|':
1743 	      return scm_read_r6rs_block_comment (chr, port);
1744 	    default:
1745 	      scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1746 				 scm_list_1 (SCM_MAKE_CHAR (chr)));
1747 	    }
1748 	}
1749       else
1750 	return result;
1751     }
1752 
1753   return SCM_UNSPECIFIED;
1754 }
1755 #undef FUNC_NAME
1756 
1757 static SCM
read_inner_expression(SCM port,scm_t_read_opts * opts)1758 read_inner_expression (SCM port, scm_t_read_opts *opts)
1759 #define FUNC_NAME "read_inner_expression"
1760 {
1761   while (1)
1762     {
1763       scm_t_wchar chr;
1764 
1765       chr = scm_getc (port);
1766 
1767       switch (chr)
1768 	{
1769 	case SCM_WHITE_SPACES:
1770 	case SCM_LINE_INCREMENTORS:
1771 	  break;
1772 	case ';':
1773 	  (void) scm_read_semicolon_comment (chr, port);
1774 	  break;
1775         case '{':
1776           if (opts->curly_infix_p)
1777             {
1778               if (opts->neoteric_p)
1779                 return scm_read_sexp (chr, port, opts);
1780               else
1781                 {
1782                   SCM expr;
1783 
1784                   /* Enable neoteric expressions within curly braces */
1785                   opts->neoteric_p = 1;
1786                   expr = scm_read_sexp (chr, port, opts);
1787                   opts->neoteric_p = 0;
1788                   return expr;
1789                 }
1790             }
1791           else
1792             return scm_read_mixed_case_symbol (chr, port, opts);
1793 	case '[':
1794           if (opts->square_brackets_p)
1795             return scm_read_sexp (chr, port, opts);
1796           else if (opts->curly_infix_p)
1797             {
1798               /* The syntax of neoteric expressions requires that '[' be
1799                  a delimiter when curly-infix is enabled, so it cannot
1800                  be part of an unescaped symbol.  We might as well do
1801                  something useful with it, so we adopt Kawa's convention:
1802                  [...] => ($bracket-list$ ...) */
1803               long line = scm_to_long (scm_port_line (port));
1804               int column = scm_to_int (scm_port_column (port)) - 1;
1805               return maybe_annotate_source
1806                 (scm_cons (sym_bracket_list, scm_read_sexp (chr, port, opts)),
1807                  port, opts, line, column);
1808             }
1809           else
1810             return scm_read_mixed_case_symbol (chr, port, opts);
1811 	case '(':
1812 	  return (scm_read_sexp (chr, port, opts));
1813 	case '"':
1814 	  return (scm_read_string (chr, port, opts));
1815         case '|':
1816           if (opts->r7rs_symbols_p)
1817             return scm_read_r7rs_symbol (chr, port, opts);
1818           else
1819             return scm_read_mixed_case_symbol (chr, port, opts);
1820 	case '\'':
1821 	case '`':
1822 	case ',':
1823 	  return (scm_read_quote (chr, port, opts));
1824 	case '#':
1825 	  {
1826             long line = scm_to_long (scm_port_line (port));
1827             int column = scm_to_int (scm_port_column (port)) - 1;
1828 	    SCM result = scm_read_sharp (chr, port, opts, line, column);
1829 	    if (scm_is_eq (result, SCM_UNSPECIFIED))
1830 	      /* We read a comment or some such.  */
1831 	      break;
1832 	    else
1833 	      return result;
1834 	  }
1835 	case ')':
1836 	  scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1837 	  break;
1838         case '}':
1839           if (opts->curly_infix_p)
1840             scm_i_input_error (FUNC_NAME, port, "unexpected \"}\"", SCM_EOL);
1841           else
1842             return scm_read_mixed_case_symbol (chr, port, opts);
1843 	case ']':
1844           if (opts->square_brackets_p)
1845             scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1846           /* otherwise fall through */
1847 	case EOF:
1848 	  return SCM_EOF_VAL;
1849 	case ':':
1850 	  if (opts->keyword_style == KEYWORD_STYLE_PREFIX)
1851 	    return scm_symbol_to_keyword (scm_read_expression (port, opts));
1852 	  /* Fall through.  */
1853 
1854 	default:
1855 	  {
1856 	    if (((chr >= '0') && (chr <= '9'))
1857 		|| (strchr ("+-.", chr)))
1858 	      return (scm_read_number (chr, port, opts));
1859 	    else
1860 	      return (scm_read_mixed_case_symbol (chr, port, opts));
1861 	  }
1862 	}
1863     }
1864 }
1865 #undef FUNC_NAME
1866 
1867 static SCM
scm_read_expression(SCM port,scm_t_read_opts * opts)1868 scm_read_expression (SCM port, scm_t_read_opts *opts)
1869 #define FUNC_NAME "scm_read_expression"
1870 {
1871   if (!opts->neoteric_p)
1872     return read_inner_expression (port, opts);
1873   else
1874     {
1875       long line = 0;
1876       int column = 0;
1877       SCM expr;
1878 
1879       if (opts->record_positions_p)
1880         {
1881           /* We need to get the position of the first non-whitespace
1882              character in order to correctly annotate neoteric
1883              expressions.  For example, for the expression 'f(x)', the
1884              first call to 'read_inner_expression' reads the 'f' (which
1885              cannot be annotated), and then we later read the '(x)' and
1886              use it to construct the new list (f x). */
1887           int c = flush_ws (port, opts, (char *) NULL);
1888           if (c == EOF)
1889             return SCM_EOF_VAL;
1890           scm_ungetc (c, port);
1891           line = scm_to_long (scm_port_line (port));
1892           column = scm_to_int (scm_port_column (port));
1893         }
1894 
1895       expr = read_inner_expression (port, opts);
1896 
1897       /* 'expr' is the first component of the neoteric expression.  Now
1898          we loop, and as long as the next character is '(', '[', or '{',
1899          (without any intervening whitespace), we use it to construct a
1900          new expression.  For example, f{n - 1}(x) => ((f (- n 1)) x). */
1901       for (;;)
1902         {
1903           int chr = scm_getc (port);
1904 
1905           if (chr == '(')
1906             /* e(...) => (e ...) */
1907             expr = scm_cons (expr, scm_read_sexp (chr, port, opts));
1908           else if (chr == '[')
1909             /* e[...] => ($bracket-apply$ e ...) */
1910             expr = scm_cons (sym_bracket_apply,
1911                              scm_cons (expr,
1912                                        scm_read_sexp (chr, port, opts)));
1913           else if (chr == '{')
1914             {
1915               SCM arg = scm_read_sexp (chr, port, opts);
1916 
1917               if (scm_is_null (arg))
1918                 expr = scm_list_1 (expr);       /* e{} => (e) */
1919               else
1920                 expr = scm_list_2 (expr, arg);  /* e{...} => (e {...}) */
1921             }
1922           else
1923             {
1924               if (chr != EOF)
1925                 scm_ungetc (chr, port);
1926               break;
1927             }
1928           maybe_annotate_source (expr, port, opts, line, column);
1929         }
1930       return expr;
1931     }
1932 }
1933 #undef FUNC_NAME
1934 
1935 
1936 /* Actual reader.  */
1937 
1938 static void init_read_options (SCM port, scm_t_read_opts *opts);
1939 
1940 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1941             (SCM port),
1942 	    "Read an s-expression from the input port @var{port}, or from\n"
1943 	    "the current input port if @var{port} is not specified.\n"
1944 	    "Any whitespace before the next token is discarded.")
1945 #define FUNC_NAME s_scm_read
1946 {
1947   scm_t_read_opts opts;
1948   int c;
1949 
1950   if (SCM_UNBNDP (port))
1951     port = scm_current_input_port ();
1952   SCM_VALIDATE_OPINPORT (1, port);
1953 
1954   init_read_options (port, &opts);
1955 
1956   c = flush_ws (port, &opts, (char *) NULL);
1957   if (EOF == c)
1958     return SCM_EOF_VAL;
1959   scm_ungetc (c, port);
1960 
1961   return (scm_read_expression (port, &opts));
1962 }
1963 #undef FUNC_NAME
1964 
1965 
1966 
1967 
1968 /* Manipulate the read-hash-procedures alist.  This could be written in
1969    Scheme, but maybe it will also be used by C code during initialisation.  */
1970 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1971             (SCM chr, SCM proc),
1972 	    "Install the procedure @var{proc} for reading expressions\n"
1973 	    "starting with the character sequence @code{#} and @var{chr}.\n"
1974 	    "@var{proc} will be called with two arguments:  the character\n"
1975 	    "@var{chr} and the port to read further data from. The object\n"
1976 	    "returned will be the return value of @code{read}. \n"
1977 	    "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1978 	    )
1979 #define FUNC_NAME s_scm_read_hash_extend
1980 {
1981   SCM this;
1982   SCM prev;
1983 
1984   SCM_VALIDATE_CHAR (1, chr);
1985   SCM_ASSERT (scm_is_false (proc)
1986 	      || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1987 	      proc, SCM_ARG2, FUNC_NAME);
1988 
1989   /* Check if chr is already in the alist.  */
1990   this = scm_i_read_hash_procedures_ref ();
1991   prev = SCM_BOOL_F;
1992   while (1)
1993     {
1994       if (scm_is_null (this))
1995 	{
1996 	  /* not found, so add it to the beginning.  */
1997 	  if (scm_is_true (proc))
1998 	    {
1999               SCM new = scm_cons (scm_cons (chr, proc),
2000                                   scm_i_read_hash_procedures_ref ());
2001 	      scm_i_read_hash_procedures_set_x (new);
2002 	    }
2003 	  break;
2004 	}
2005       if (scm_is_eq (chr, SCM_CAAR (this)))
2006 	{
2007 	  /* already in the alist.  */
2008 	  if (scm_is_false (proc))
2009 	    {
2010 	      /* remove it.  */
2011 	      if (scm_is_false (prev))
2012 		{
2013                   SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
2014 		  scm_i_read_hash_procedures_set_x (rest);
2015 		}
2016 	      else
2017 		scm_set_cdr_x (prev, SCM_CDR (this));
2018 	    }
2019 	  else
2020 	    {
2021 	      /* replace it.  */
2022 	      scm_set_cdr_x (SCM_CAR (this), proc);
2023 	    }
2024 	  break;
2025 	}
2026       prev = this;
2027       this = SCM_CDR (this);
2028     }
2029 
2030   return SCM_UNSPECIFIED;
2031 }
2032 #undef FUNC_NAME
2033 
2034 /* Recover the read-hash procedure corresponding to char c.  */
2035 static SCM
scm_get_hash_procedure(int c)2036 scm_get_hash_procedure (int c)
2037 {
2038   SCM rest = scm_i_read_hash_procedures_ref ();
2039 
2040   while (1)
2041     {
2042       if (scm_is_null (rest))
2043 	return SCM_BOOL_F;
2044 
2045       if (SCM_CHAR (SCM_CAAR (rest)) == c)
2046 	return SCM_CDAR (rest);
2047 
2048       rest = SCM_CDR (rest);
2049     }
2050 }
2051 
2052 static int
is_encoding_char(char c)2053 is_encoding_char (char c)
2054 {
2055   if (c >= 'a' && c <= 'z') return 1;
2056   if (c >= 'A' && c <= 'Z') return 1;
2057   if (c >= '0' && c <= '9') return 1;
2058   return strchr ("_-.:/,+=()", c) != NULL;
2059 }
2060 
2061 /* Maximum size of an encoding name.  This is a bit more than the
2062    longest name listed at
2063    <http://www.iana.org/assignments/character-sets> ("ISO-2022-JP-2", 13
2064    characters.)  */
2065 #define ENCODING_NAME_MAX_SIZE 20
2066 
2067 /* Number of bytes at the beginning or end of a file that are scanned
2068    for a "coding:" declaration.  */
2069 #define SCM_ENCODING_SEARCH_SIZE (500 + ENCODING_NAME_MAX_SIZE)
2070 
2071 
2072 /* Search the SCM_ENCODING_SEARCH_SIZE bytes of a file for an Emacs-like
2073    coding declaration.  Returns either NULL or a string whose storage
2074    has been allocated with `scm_gc_malloc'.  */
2075 char *
scm_i_scan_for_encoding(SCM port)2076 scm_i_scan_for_encoding (SCM port)
2077 {
2078   scm_t_port *pt;
2079   SCM buf;
2080   char header[SCM_ENCODING_SEARCH_SIZE+1];
2081   size_t cur, bytes_read, encoding_length, i;
2082   char *encoding = NULL;
2083   char *pos, *encoding_start;
2084   int in_comment;
2085 
2086   pt = SCM_PORT (port);
2087   buf = pt->read_buf;
2088 
2089   if (pt->rw_random)
2090     scm_flush (port);
2091 
2092   if (scm_port_buffer_can_take (buf, &cur) == 0)
2093     {
2094       /* We can use the read buffer, and thus avoid a seek. */
2095       buf = scm_fill_input (port, 0, &cur, &bytes_read);
2096       if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
2097         bytes_read = SCM_ENCODING_SEARCH_SIZE;
2098 
2099       if (bytes_read <= 1)
2100         /* An unbuffered port -- don't scan.  */
2101         return NULL;
2102 
2103       memcpy (header, scm_port_buffer_take_pointer (buf, cur), bytes_read);
2104       header[bytes_read] = '\0';
2105     }
2106   else if (pt->rw_random)
2107     {
2108       /* The port is seekable.  This is OK but grubbing in the read
2109          buffer is better, so this case is just a fallback.  */
2110       bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
2111       header[bytes_read] = '\0';
2112       scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
2113     }
2114   else
2115     /* No input available and not seekable; scan fails.  */
2116     return NULL;
2117 
2118   /* search past "coding[:=]" */
2119   pos = header;
2120   while (1)
2121     {
2122       if ((pos = strstr(pos, "coding")) == NULL)
2123         return NULL;
2124 
2125       pos += strlen ("coding");
2126       if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
2127           (*pos == ':' || *pos == '='))
2128         {
2129           pos ++;
2130           break;
2131         }
2132     }
2133 
2134   /* skip spaces */
2135   while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
2136 	 (*pos == ' ' || *pos == '\t'))
2137     pos ++;
2138 
2139   if (pos - header >= SCM_ENCODING_SEARCH_SIZE - ENCODING_NAME_MAX_SIZE)
2140     /* We found the "coding:" string, but there is probably not enough
2141        room to store an encoding name in its entirety, so ignore it.
2142        This makes sure we do not end up returning a truncated encoding
2143        name.  */
2144     return NULL;
2145 
2146   /* grab the next token */
2147   encoding_start = pos;
2148   i = 0;
2149   while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
2150          && encoding_start + i - header < bytes_read
2151 	 && is_encoding_char (encoding_start[i]))
2152     i++;
2153 
2154   encoding_length = i;
2155   if (encoding_length == 0)
2156     return NULL;
2157 
2158   encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
2159 
2160   /* push backwards to make sure we were in a comment */
2161   in_comment = 0;
2162   pos = encoding_start;
2163   while (pos >= header)
2164     {
2165       if (*pos == ';')
2166 	{
2167 	  in_comment = 1;
2168 	  break;
2169 	}
2170       else if (*pos == '\n' || pos == header)
2171 	{
2172 	  /* This wasn't in a semicolon comment. Check for a
2173 	   hash-bang comment. */
2174 	  char *beg = strstr (header, "#!");
2175 	  char *end = strstr (header, "!#");
2176 	  if (beg < encoding_start && encoding_start + encoding_length <= end)
2177 	    in_comment = 1;
2178 	  break;
2179 	}
2180       else
2181         {
2182           pos --;
2183           continue;
2184         }
2185     }
2186   if (!in_comment)
2187     /* This wasn't in a comment */
2188     return NULL;
2189 
2190   return encoding;
2191 }
2192 
2193 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
2194             (SCM port),
2195             "Scans the port for an Emacs-like character coding declaration\n"
2196             "near the top of the contents of a port with random-accessible contents.\n"
2197             "The coding declaration is of the form\n"
2198             "@code{coding: XXXXX} and must appear in a scheme comment.\n"
2199             "\n"
2200             "Returns a string containing the character encoding of the file\n"
2201             "if a declaration was found, or @code{#f} otherwise.\n")
2202 #define FUNC_NAME s_scm_file_encoding
2203 {
2204   char *enc;
2205   SCM s_enc;
2206 
2207   SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
2208 
2209   enc = scm_i_scan_for_encoding (port);
2210   if (enc == NULL)
2211     return SCM_BOOL_F;
2212   else
2213     {
2214       /* It's not obvious what encoding to use here, but latin1 has the
2215          advantage of never causing a decoding error, and a valid
2216          encoding name should be ASCII anyway. */
2217       s_enc = scm_string_upcase (scm_from_latin1_string (enc));
2218       return s_enc;
2219     }
2220 
2221   return SCM_BOOL_F;
2222 }
2223 #undef FUNC_NAME
2224 
2225 
2226 /* Per-port read options.
2227 
2228    We store per-port read options in the 'port-read-options' port
2229    property, which is stored in the internal port structure.  The value
2230    stored is a single integer that contains a two-bit field for each
2231    read option.
2232 
2233    If a bit field contains READ_OPTION_INHERIT (3), that indicates that
2234    the applicable value should be inherited from the corresponding
2235    global read option.  Otherwise, the bit field contains the value of
2236    the read option.  For boolean read options that have been set
2237    per-port, the possible values are 0 or 1.  If the 'keyword_style'
2238    read option has been set per-port, its possible values are those in
2239    'enum t_keyword_style'. */
2240 
2241 /* Key to read options in port properties. */
2242 SCM_SYMBOL (sym_port_read_options, "port-read-options");
2243 
2244 /* Offsets of bit fields for each per-port override */
2245 #define READ_OPTION_COPY_SOURCE_P          0
2246 #define READ_OPTION_RECORD_POSITIONS_P     2
2247 #define READ_OPTION_CASE_INSENSITIVE_P     4
2248 #define READ_OPTION_KEYWORD_STYLE          6
2249 #define READ_OPTION_R6RS_ESCAPES_P         8
2250 #define READ_OPTION_SQUARE_BRACKETS_P     10
2251 #define READ_OPTION_HUNGRY_EOL_ESCAPES_P  12
2252 #define READ_OPTION_CURLY_INFIX_P         14
2253 #define READ_OPTION_R7RS_SYMBOLS_P        16
2254 
2255 /* The total width in bits of the per-port overrides */
2256 #define READ_OPTIONS_NUM_BITS             18
2257 
2258 #define READ_OPTIONS_INHERIT_ALL  ((1UL << READ_OPTIONS_NUM_BITS) - 1)
2259 #define READ_OPTIONS_MAX_VALUE    READ_OPTIONS_INHERIT_ALL
2260 
2261 #define READ_OPTION_MASK     3
2262 #define READ_OPTION_INHERIT  3
2263 
2264 static void
set_port_read_option(SCM port,int option,int new_value)2265 set_port_read_option (SCM port, int option, int new_value)
2266 {
2267   SCM scm_read_options;
2268   unsigned int read_options;
2269 
2270   new_value &= READ_OPTION_MASK;
2271 
2272   scm_read_options = scm_i_port_property (port, sym_port_read_options);
2273   if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2274     read_options = scm_to_uint (scm_read_options);
2275   else
2276     read_options = READ_OPTIONS_INHERIT_ALL;
2277   read_options &= ~(READ_OPTION_MASK << option);
2278   read_options |= new_value << option;
2279   scm_read_options = scm_from_uint (read_options);
2280   scm_i_set_port_property_x (port, sym_port_read_options, scm_read_options);
2281 }
2282 
2283 /* Set OPTS and PORT's case-insensitivity according to VALUE. */
2284 static void
set_port_case_insensitive_p(SCM port,scm_t_read_opts * opts,int value)2285 set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts, int value)
2286 {
2287   value = !!value;
2288   opts->case_insensitive_p = value;
2289   set_port_read_option (port, READ_OPTION_CASE_INSENSITIVE_P, value);
2290 }
2291 
2292 /* Set OPTS and PORT's square_brackets_p option according to VALUE. */
2293 static void
set_port_square_brackets_p(SCM port,scm_t_read_opts * opts,int value)2294 set_port_square_brackets_p (SCM port, scm_t_read_opts *opts, int value)
2295 {
2296   value = !!value;
2297   opts->square_brackets_p = value;
2298   set_port_read_option (port, READ_OPTION_SQUARE_BRACKETS_P, value);
2299 }
2300 
2301 /* Set OPTS and PORT's curly_infix_p option according to VALUE. */
2302 static void
set_port_curly_infix_p(SCM port,scm_t_read_opts * opts,int value)2303 set_port_curly_infix_p (SCM port, scm_t_read_opts *opts, int value)
2304 {
2305   value = !!value;
2306   opts->curly_infix_p = value;
2307   set_port_read_option (port, READ_OPTION_CURLY_INFIX_P, value);
2308 }
2309 
2310 /* Set OPTS and PORT's r6rs_hex_escapes_p option according to VALUE. */
2311 static void
set_port_r6rs_hex_escapes_p(SCM port,scm_t_read_opts * opts,int value)2312 set_port_r6rs_hex_escapes_p (SCM port, scm_t_read_opts *opts, int value)
2313 {
2314   value = !!value;
2315   opts->r6rs_escapes_p = value;
2316   set_port_read_option (port, READ_OPTION_R6RS_ESCAPES_P, value);
2317 }
2318 
2319 static void
set_port_hungry_eol_escapes_p(SCM port,scm_t_read_opts * opts,int value)2320 set_port_hungry_eol_escapes_p (SCM port, scm_t_read_opts *opts, int value)
2321 {
2322   value = !!value;
2323   opts->hungry_eol_escapes_p = value;
2324   set_port_read_option (port, READ_OPTION_HUNGRY_EOL_ESCAPES_P, value);
2325 }
2326 
2327 static void
set_port_keyword_style(SCM port,scm_t_read_opts * opts,enum t_keyword_style value)2328 set_port_keyword_style (SCM port, scm_t_read_opts *opts, enum t_keyword_style value)
2329 {
2330   opts->keyword_style = value;
2331   set_port_read_option (port, READ_OPTION_KEYWORD_STYLE, value);
2332 }
2333 
2334 /* Initialize OPTS based on PORT's read options and the global read
2335    options. */
2336 static void
init_read_options(SCM port,scm_t_read_opts * opts)2337 init_read_options (SCM port, scm_t_read_opts *opts)
2338 {
2339   SCM val, scm_read_options;
2340   unsigned int read_options, x;
2341 
2342   scm_read_options = scm_i_port_property (port, sym_port_read_options);
2343 
2344   if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2345     read_options = scm_to_uint (scm_read_options);
2346   else
2347     read_options = READ_OPTIONS_INHERIT_ALL;
2348 
2349   x = READ_OPTION_MASK & (read_options >> READ_OPTION_KEYWORD_STYLE);
2350   if (x == READ_OPTION_INHERIT)
2351     {
2352       val = SCM_PACK (SCM_KEYWORD_STYLE);
2353       if (scm_is_eq (val, scm_keyword_prefix))
2354         x = KEYWORD_STYLE_PREFIX;
2355       else if (scm_is_eq (val, scm_keyword_postfix))
2356         x = KEYWORD_STYLE_POSTFIX;
2357       else
2358         x = KEYWORD_STYLE_HASH_PREFIX;
2359     }
2360   opts->keyword_style = x;
2361 
2362 #define RESOLVE_BOOLEAN_OPTION(NAME, name)                              \
2363   do                                                                    \
2364     {                                                                   \
2365       x = READ_OPTION_MASK & (read_options >> READ_OPTION_ ## NAME);    \
2366       if (x == READ_OPTION_INHERIT)                                     \
2367         x = !!SCM_ ## NAME;                                             \
2368           opts->name = x;                                               \
2369     }                                                                   \
2370   while (0)
2371 
2372   RESOLVE_BOOLEAN_OPTION (COPY_SOURCE_P,        copy_source_p);
2373   RESOLVE_BOOLEAN_OPTION (RECORD_POSITIONS_P,   record_positions_p);
2374   RESOLVE_BOOLEAN_OPTION (CASE_INSENSITIVE_P,   case_insensitive_p);
2375   RESOLVE_BOOLEAN_OPTION (R6RS_ESCAPES_P,       r6rs_escapes_p);
2376   RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P,    square_brackets_p);
2377   RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P, hungry_eol_escapes_p);
2378   RESOLVE_BOOLEAN_OPTION (CURLY_INFIX_P,        curly_infix_p);
2379   RESOLVE_BOOLEAN_OPTION (R7RS_SYMBOLS_P,       r7rs_symbols_p);
2380 
2381 #undef RESOLVE_BOOLEAN_OPTION
2382 
2383   opts->neoteric_p = 0;
2384 }
2385 
2386 void
scm_init_read()2387 scm_init_read ()
2388 {
2389   SCM read_hash_procs;
2390 
2391   read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
2392 
2393   scm_i_read_hash_procedures =
2394     SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
2395 
2396   scm_init_opts (scm_read_options, scm_read_opts);
2397 #include "libguile/read.x"
2398 }
2399 
2400 /*
2401   Local Variables:
2402   c-file-style: "gnu"
2403   End:
2404 */
2405