1 /* Copyright (C) 1995-1997, 1999-2001, 2003, 2004, 2006-2012, 2014-2019
2 * Free Software Foundation, Inc.
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
18 */
19
20
21
22
23 #ifdef HAVE_CONFIG_H
24 # include <config.h>
25 #endif
26
27 #include <stdio.h>
28 #include <string.h>
29 #include <unistd.h>
30 #include <unicase.h>
31 #include <unictype.h>
32 #include <c-strcase.h>
33 #include <c-ctype.h>
34 #include <alloca.h>
35
36 #include "libguile/_scm.h"
37 #include "libguile/bytevectors.h"
38 #include "libguile/chars.h"
39 #include "libguile/eval.h"
40 #include "libguile/arrays.h"
41 #include "libguile/bitvectors.h"
42 #include "libguile/keywords.h"
43 #include "libguile/alist.h"
44 #include "libguile/srcprop.h"
45 #include "libguile/hashtab.h"
46 #include "libguile/hash.h"
47 #include "libguile/ports.h"
48 #include "libguile/ports-internal.h"
49 #include "libguile/fports.h"
50 #include "libguile/strings.h"
51 #include "libguile/strports.h"
52 #include "libguile/vectors.h"
53 #include "libguile/validate.h"
54 #include "libguile/srfi-4.h"
55 #include "libguile/srfi-13.h"
56
57 #include "libguile/read.h"
58 #include "libguile/private-options.h"
59
60
61
62
63 SCM_GLOBAL_SYMBOL (scm_sym_dot, ".");
64 SCM_SYMBOL (scm_keyword_prefix, "prefix");
65 SCM_SYMBOL (scm_keyword_postfix, "postfix");
66 SCM_SYMBOL (sym_nil, "nil");
67 SCM_SYMBOL (sym_ISO_8859_1, "ISO-8859-1");
68
69 /* SRFI-105 curly infix expression support */
70 SCM_SYMBOL (sym_nfx, "$nfx$");
71 SCM_SYMBOL (sym_bracket_list, "$bracket-list$");
72 SCM_SYMBOL (sym_bracket_apply, "$bracket-apply$");
73
74 scm_t_option scm_read_opts[] =
75 {
76 { SCM_OPTION_BOOLEAN, "copy", 0,
77 "Copy source code expressions." },
78 { SCM_OPTION_BOOLEAN, "positions", 1,
79 "Record positions of source code expressions." },
80 { SCM_OPTION_BOOLEAN, "case-insensitive", 0,
81 "Convert symbols to lower case."},
82 { SCM_OPTION_SCM, "keywords", (scm_t_bits) SCM_BOOL_F_BITS,
83 "Style of keyword recognition: #f, 'prefix or 'postfix."},
84 { SCM_OPTION_BOOLEAN, "r6rs-hex-escapes", 0,
85 "Use R6RS variable-length character and string hex escapes."},
86 { SCM_OPTION_BOOLEAN, "square-brackets", 1,
87 "Treat `[' and `]' as parentheses, for R6RS compatibility."},
88 { SCM_OPTION_BOOLEAN, "hungry-eol-escapes", 0,
89 "In strings, consume leading whitespace after an escaped end-of-line."},
90 { SCM_OPTION_BOOLEAN, "curly-infix", 0,
91 "Support SRFI-105 curly infix expressions."},
92 { SCM_OPTION_BOOLEAN, "r7rs-symbols", 0,
93 "Support R7RS |...| symbol notation."},
94 { 0, },
95 };
96
97 /* Internal read options structure. This is initialized by 'scm_read'
98 from the global and per-port read options, and a pointer is passed
99 down to all helper functions. */
100
101 enum t_keyword_style
102 {
103 KEYWORD_STYLE_HASH_PREFIX,
104 KEYWORD_STYLE_PREFIX,
105 KEYWORD_STYLE_POSTFIX
106 };
107
108 struct t_read_opts
109 {
110 enum t_keyword_style keyword_style;
111 unsigned int copy_source_p : 1;
112 unsigned int record_positions_p : 1;
113 unsigned int case_insensitive_p : 1;
114 unsigned int r6rs_escapes_p : 1;
115 unsigned int square_brackets_p : 1;
116 unsigned int hungry_eol_escapes_p : 1;
117 unsigned int curly_infix_p : 1;
118 unsigned int neoteric_p : 1;
119 unsigned int r7rs_symbols_p : 1;
120 };
121
122 typedef struct t_read_opts scm_t_read_opts;
123
124
125 /*
126 Give meaningful error messages for errors
127
128 We use the format
129
130 FILE:LINE:COL: MESSAGE
131 This happened in ....
132
133 This is not standard GNU format, but the test-suite likes the real
134 message to be in front.
135
136 */
137
138
139 void
scm_i_input_error(char const * function,SCM port,const char * message,SCM arg)140 scm_i_input_error (char const *function,
141 SCM port, const char *message, SCM arg)
142 {
143 SCM fn = (scm_is_string (SCM_FILENAME(port))
144 ? SCM_FILENAME(port)
145 : scm_from_utf8_string ("#<unknown port>"));
146
147 SCM string_port = scm_open_output_string ();
148 SCM string = SCM_EOL;
149 scm_simple_format (string_port,
150 scm_from_utf8_string ("~A:~S:~S: ~A"),
151 scm_list_4 (fn,
152 scm_sum (scm_port_line (port), SCM_INUM1),
153 scm_sum (scm_port_column (port), SCM_INUM1),
154 scm_from_utf8_string (message)));
155
156 string = scm_get_output_string (string_port);
157 scm_close_output_port (string_port);
158 scm_error_scm (scm_from_utf8_symbol ("read-error"),
159 function? scm_from_utf8_string (function) : SCM_BOOL_F,
160 string,
161 arg,
162 SCM_BOOL_F);
163 }
164
165
166 SCM_DEFINE (scm_read_options, "read-options-interface", 0, 1, 0,
167 (SCM setting),
168 "Option interface for the read options. Instead of using\n"
169 "this procedure directly, use the procedures @code{read-enable},\n"
170 "@code{read-disable}, @code{read-set!} and @code{read-options}.")
171 #define FUNC_NAME s_scm_read_options
172 {
173 SCM ans = scm_options (setting,
174 scm_read_opts,
175 FUNC_NAME);
176 if (SCM_COPY_SOURCE_P)
177 SCM_RECORD_POSITIONS_P = 1;
178 return ans;
179 }
180 #undef FUNC_NAME
181
182 /* A fluid referring to an association list mapping extra hash
183 characters to procedures. */
184 static SCM *scm_i_read_hash_procedures;
185
186 static SCM
scm_i_read_hash_procedures_ref(void)187 scm_i_read_hash_procedures_ref (void)
188 {
189 return scm_fluid_ref (*scm_i_read_hash_procedures);
190 }
191
192 static void
scm_i_read_hash_procedures_set_x(SCM value)193 scm_i_read_hash_procedures_set_x (SCM value)
194 {
195 scm_fluid_set_x (*scm_i_read_hash_procedures, value);
196 }
197
198
199 /* Token readers. */
200
201
202 /* Size of the C buffer used to read symbols and numbers. */
203 #define READER_BUFFER_SIZE 128
204
205 /* Number of 32-bit codepoints in the buffer used to read strings. */
206 #define READER_STRING_BUFFER_SIZE 128
207
208 /* The maximum size of Scheme character names. */
209 #define READER_CHAR_NAME_MAX_SIZE 50
210
211 /* The maximum size of reader directive names. */
212 #define READER_DIRECTIVE_NAME_MAX_SIZE 50
213
214
215 /* `isblank' is only in C99. */
216 #define CHAR_IS_BLANK_(_chr) \
217 (((_chr) == ' ') || ((_chr) == '\t') || ((_chr) == '\n') \
218 || ((_chr) == '\f') || ((_chr) == '\r'))
219
220 #ifdef MSDOS
221 # define CHAR_IS_BLANK(_chr) \
222 ((CHAR_IS_BLANK_ (chr)) || ((_chr) == 26))
223 #else
224 # define CHAR_IS_BLANK CHAR_IS_BLANK_
225 #endif
226
227
228 /* R5RS one-character delimiters (see section 7.1.1, ``Lexical
229 structure''). */
230 #define CHAR_IS_R5RS_DELIMITER(c) \
231 (CHAR_IS_BLANK (c) \
232 || (c) == ')' || (c) == '(' || (c) == ';' || (c) == '"')
233
234 #define CHAR_IS_DELIMITER(c) \
235 (CHAR_IS_R5RS_DELIMITER (c) \
236 || (((c) == ']' || (c) == '[') && (opts->square_brackets_p \
237 || opts->curly_infix_p)) \
238 || (((c) == '}' || (c) == '{') && opts->curly_infix_p))
239
240 /* Exponent markers, as defined in section 7.1.1 of R5RS, ``Lexical
241 Structure''. */
242 #define CHAR_IS_EXPONENT_MARKER(_chr) \
243 (((_chr) == 'e') || ((_chr) == 's') || ((_chr) == 'f') \
244 || ((_chr) == 'd') || ((_chr) == 'l'))
245
246 /* Read an SCSH block comment. */
247 static SCM scm_read_scsh_block_comment (scm_t_wchar, SCM);
248 static SCM scm_read_r6rs_block_comment (scm_t_wchar, SCM);
249 static SCM scm_read_commented_expression (scm_t_wchar, SCM, scm_t_read_opts *);
250 static SCM scm_read_shebang (scm_t_wchar, SCM, scm_t_read_opts *);
251 static SCM scm_get_hash_procedure (int);
252
253 /* Read from PORT until a delimiter (e.g., a whitespace) is read. Put the
254 result in the pre-allocated buffer BUF. Return zero if the whole token has
255 fewer than BUF_SIZE bytes, non-zero otherwise. READ will be set the number of
256 bytes actually read. */
257 static int
read_token(SCM port,scm_t_read_opts * opts,char * buf,size_t buf_size,size_t * read)258 read_token (SCM port, scm_t_read_opts *opts,
259 char *buf, size_t buf_size, size_t *read)
260 {
261 *read = 0;
262
263 while (*read < buf_size)
264 {
265 int chr;
266
267 chr = scm_get_byte_or_eof (port);
268
269 if (chr == EOF)
270 return 0;
271 else if (CHAR_IS_DELIMITER (chr))
272 {
273 scm_unget_byte (chr, port);
274 return 0;
275 }
276 else
277 {
278 *buf = (char) chr;
279 buf++, (*read)++;
280 }
281 }
282
283 return 1;
284 }
285
286 /* Like `read_token', but return either BUFFER, or a GC-allocated buffer
287 if the token doesn't fit in BUFFER_SIZE bytes. */
288 static char *
read_complete_token(SCM port,scm_t_read_opts * opts,char * buffer,size_t buffer_size,size_t * read)289 read_complete_token (SCM port, scm_t_read_opts *opts,
290 char *buffer, size_t buffer_size, size_t *read)
291 {
292 int overflow = 0;
293 size_t bytes_read, overflow_size = 0;
294 char *overflow_buffer = NULL;
295
296 do
297 {
298 overflow = read_token (port, opts, buffer, buffer_size, &bytes_read);
299 if (bytes_read == 0)
300 break;
301 if (overflow || overflow_size != 0)
302 {
303 if (overflow_size == 0)
304 {
305 overflow_buffer = scm_gc_malloc_pointerless (bytes_read, "read");
306 memcpy (overflow_buffer, buffer, bytes_read);
307 overflow_size = bytes_read;
308 }
309 else
310 {
311 char *new_buf =
312 scm_gc_malloc_pointerless (overflow_size + bytes_read, "read");
313
314 memcpy (new_buf, overflow_buffer, overflow_size);
315 memcpy (new_buf + overflow_size, buffer, bytes_read);
316
317 overflow_buffer = new_buf;
318 overflow_size += bytes_read;
319 }
320 }
321 }
322 while (overflow);
323
324 if (overflow_size)
325 *read = overflow_size;
326 else
327 *read = bytes_read;
328
329 return (overflow_size > 0 ? overflow_buffer : buffer);
330 }
331
332 /* Skip whitespace from PORT and return the first non-whitespace character
333 read. Raise an error on end-of-file. */
334 static int
flush_ws(SCM port,scm_t_read_opts * opts,const char * eoferr)335 flush_ws (SCM port, scm_t_read_opts *opts, const char *eoferr)
336 {
337 scm_t_wchar c;
338 while (1)
339 switch (c = scm_getc (port))
340 {
341 case EOF:
342 goteof:
343 if (eoferr)
344 {
345 scm_i_input_error (eoferr,
346 port,
347 "end of file",
348 SCM_EOL);
349 }
350 return c;
351
352 case ';':
353 lp:
354 switch (c = scm_getc (port))
355 {
356 case EOF:
357 goto goteof;
358 default:
359 goto lp;
360 case SCM_LINE_INCREMENTORS:
361 break;
362 }
363 break;
364
365 case '#':
366 switch (c = scm_getc (port))
367 {
368 case EOF:
369 eoferr = "read_sharp";
370 goto goteof;
371 case '!':
372 scm_read_shebang (c, port, opts);
373 break;
374 case ';':
375 scm_read_commented_expression (c, port, opts);
376 break;
377 case '|':
378 if (scm_is_false (scm_get_hash_procedure (c)))
379 {
380 scm_read_r6rs_block_comment (c, port);
381 break;
382 }
383 /* fall through */
384 default:
385 scm_ungetc (c, port);
386 return '#';
387 }
388 break;
389
390 case SCM_LINE_INCREMENTORS:
391 case SCM_SINGLE_SPACES:
392 case '\t':
393 break;
394
395 default:
396 return c;
397 }
398
399 return 0;
400 }
401
402
403
404 /* Token readers. */
405
406 static SCM scm_read_expression (SCM port, scm_t_read_opts *opts);
407 static SCM scm_read_sharp (int chr, SCM port, scm_t_read_opts *opts,
408 long line, int column);
409
410
411 static SCM
maybe_annotate_source(SCM x,SCM port,scm_t_read_opts * opts,long line,int column)412 maybe_annotate_source (SCM x, SCM port, scm_t_read_opts *opts,
413 long line, int column)
414 {
415 /* This condition can be caused by a user calling
416 set-port-column!. */
417 if (line < 0 || column < 0)
418 return x;
419
420 if (opts->record_positions_p)
421 scm_i_set_source_properties_x (x, line, column, SCM_FILENAME (port));
422 return x;
423 }
424
425 static SCM
scm_read_sexp(scm_t_wchar chr,SCM port,scm_t_read_opts * opts)426 scm_read_sexp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
427 #define FUNC_NAME "scm_i_lreadparen"
428 {
429 int c;
430 SCM tmp, tl, ans = SCM_EOL;
431 const int curly_list_p = (chr == '{') && opts->curly_infix_p;
432 const int terminating_char = ((chr == '{') ? '}'
433 : ((chr == '[') ? ']'
434 : ')'));
435
436 /* Need to capture line and column numbers here. */
437 long line = scm_to_long (scm_port_line (port));
438 int column = scm_to_int (scm_port_column (port)) - 1;
439
440 c = flush_ws (port, opts, FUNC_NAME);
441 if (terminating_char == c)
442 return SCM_EOL;
443
444 scm_ungetc (c, port);
445 tmp = scm_read_expression (port, opts);
446
447 /* Note that it is possible for scm_read_expression to return
448 scm_sym_dot, but not as part of a dotted pair: as in #{.}#. So
449 check that it's a real dot by checking `c'. */
450 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
451 {
452 ans = scm_read_expression (port, opts);
453 if (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
454 scm_i_input_error (FUNC_NAME, port, "missing close paren",
455 SCM_EOL);
456 return ans;
457 }
458
459 /* Build the head of the list structure. */
460 ans = tl = scm_cons (tmp, SCM_EOL);
461
462 while (terminating_char != (c = flush_ws (port, opts, FUNC_NAME)))
463 {
464 SCM new_tail;
465
466 if (c == ')' || (c == ']' && opts->square_brackets_p)
467 || ((c == '}' || c == ']') && opts->curly_infix_p))
468 scm_i_input_error (FUNC_NAME, port,
469 "in pair: mismatched close paren: ~A",
470 scm_list_1 (SCM_MAKE_CHAR (c)));
471
472 scm_ungetc (c, port);
473 tmp = scm_read_expression (port, opts);
474
475 /* See above note about scm_sym_dot. */
476 if (c == '.' && scm_is_eq (scm_sym_dot, tmp))
477 {
478 SCM_SETCDR (tl, scm_read_expression (port, opts));
479
480 c = flush_ws (port, opts, FUNC_NAME);
481 if (terminating_char != c)
482 scm_i_input_error (FUNC_NAME, port,
483 "in pair: missing close paren", SCM_EOL);
484 break;
485 }
486
487 new_tail = scm_cons (tmp, SCM_EOL);
488 SCM_SETCDR (tl, new_tail);
489 tl = new_tail;
490 }
491
492 if (curly_list_p)
493 {
494 /* In addition to finding the length, 'scm_ilength' checks for
495 improper or circular lists, in which case it returns -1. */
496 int len = scm_ilength (ans);
497
498 /* The (len == 0) case is handled above */
499 if (len == 1)
500 /* Return directly to avoid re-annotating the element's source
501 location with the position of the outer brace. Also, it
502 might not be possible to annotate the element. */
503 return scm_car (ans); /* {e} => e */
504 else if (len == 2)
505 ; /* Leave the list unchanged: {e1 e2} => (e1 e2) */
506 else if (len >= 3 && (len & 1))
507 {
508 /* It's a proper list whose length is odd and at least 3. If
509 the elements at odd indices (the infix operator positions)
510 are all 'equal?', then it's a simple curly-infix list.
511 Otherwise it's a mixed curly-infix list. */
512 SCM op = scm_cadr (ans);
513
514 /* Check to see if the elements at odd indices are 'equal?' */
515 for (tl = scm_cdddr (ans); ; tl = scm_cddr (tl))
516 {
517 if (scm_is_null (tl))
518 {
519 /* Convert simple curly-infix list to prefix:
520 {a <op> b <op> ...} => (<op> a b ...) */
521 tl = ans;
522 while (scm_is_pair (scm_cdr (tl)))
523 {
524 tmp = scm_cddr (tl);
525 SCM_SETCDR (tl, tmp);
526 tl = tmp;
527 }
528 ans = scm_cons (op, ans);
529 break;
530 }
531 else if (scm_is_false (scm_equal_p (op, scm_car (tl))))
532 {
533 /* Mixed curly-infix list: {e ...} => ($nfx$ e ...) */
534 ans = scm_cons (sym_nfx, ans);
535 break;
536 }
537 }
538 }
539 else
540 /* Mixed curly-infix (possibly improper) list:
541 {e . tail} => ($nfx$ e . tail) */
542 ans = scm_cons (sym_nfx, ans);
543 }
544
545 return maybe_annotate_source (ans, port, opts, line, column);
546 }
547 #undef FUNC_NAME
548
549
550 /* Read a hexadecimal number NDIGITS in length. Put its value into the variable
551 C. If TERMINATOR is non-null, terminate early if the TERMINATOR character is
552 found. */
553 #define SCM_READ_HEX_ESCAPE(ndigits, terminator) \
554 do \
555 { \
556 scm_t_wchar a; \
557 size_t i = 0; \
558 c = 0; \
559 while (i < ndigits) \
560 { \
561 a = scm_getc (port); \
562 if (a == EOF) \
563 goto str_eof; \
564 if (terminator \
565 && (a == (scm_t_wchar) terminator) \
566 && (i > 0)) \
567 break; \
568 if ('0' <= a && a <= '9') \
569 a -= '0'; \
570 else if ('A' <= a && a <= 'F') \
571 a = a - 'A' + 10; \
572 else if ('a' <= a && a <= 'f') \
573 a = a - 'a' + 10; \
574 else \
575 { \
576 c = a; \
577 goto bad_escaped; \
578 } \
579 c = c * 16 + a; \
580 i ++; \
581 } \
582 } while (0)
583
584 static void
skip_intraline_whitespace(SCM port)585 skip_intraline_whitespace (SCM port)
586 {
587 scm_t_wchar c;
588
589 do
590 {
591 c = scm_getc (port);
592 if (c == EOF)
593 return;
594 }
595 while (c == '\t' || uc_is_general_category (c, UC_SPACE_SEPARATOR));
596
597 scm_ungetc (c, port);
598 }
599
600 /* Read either a double-quoted string or an R7RS-style symbol delimited
601 by vertical lines, depending on the value of 'chr' ('"' or '|').
602 Regardless, the result is always returned as a string. */
603 static SCM
scm_read_string_like_syntax(int chr,SCM port,scm_t_read_opts * opts)604 scm_read_string_like_syntax (int chr, SCM port, scm_t_read_opts *opts)
605 #define FUNC_NAME "scm_lreadr"
606 {
607 /* For strings smaller than C_STR, this function creates only one Scheme
608 object (the string returned). */
609
610 SCM str = SCM_EOL;
611 size_t c_str_len = 0;
612 scm_t_wchar c, c_str[READER_STRING_BUFFER_SIZE];
613
614 /* Need to capture line and column numbers here. */
615 long line = scm_to_long (scm_port_line (port));
616 int column = scm_to_int (scm_port_column (port)) - 1;
617
618 while (chr != (c = scm_getc (port)))
619 {
620 if (c == EOF)
621 {
622 str_eof:
623 scm_i_input_error (FUNC_NAME, port,
624 (chr == '|'
625 ? "end of file in symbol"
626 : "end of file in string constant"),
627 SCM_EOL);
628 }
629
630 if (c_str_len + 1 >= READER_STRING_BUFFER_SIZE)
631 {
632 str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
633 c_str_len = 0;
634 }
635
636 if (c == '\\')
637 {
638 switch (c = scm_getc (port))
639 {
640 case EOF:
641 goto str_eof;
642 case '|':
643 case '\\':
644 case '(': /* Accept "\(" for use at the beginning of lines
645 in multiline strings to avoid confusing emacs
646 lisp modes. */
647 break;
648 case '\n':
649 if (opts->hungry_eol_escapes_p)
650 skip_intraline_whitespace (port);
651 continue;
652 case '0':
653 c = '\0';
654 break;
655 case 'f':
656 c = '\f';
657 break;
658 case 'n':
659 c = '\n';
660 break;
661 case 'r':
662 c = '\r';
663 break;
664 case 't':
665 c = '\t';
666 break;
667 case 'a':
668 c = '\007';
669 break;
670 case 'v':
671 c = '\v';
672 break;
673 case 'b':
674 c = '\010';
675 break;
676 case 'x':
677 if (opts->r6rs_escapes_p || chr == '|')
678 SCM_READ_HEX_ESCAPE (10, ';');
679 else
680 SCM_READ_HEX_ESCAPE (2, '\0');
681 break;
682 case 'u':
683 if (!opts->r6rs_escapes_p)
684 {
685 SCM_READ_HEX_ESCAPE (4, '\0');
686 break;
687 }
688 case 'U':
689 if (!opts->r6rs_escapes_p)
690 {
691 SCM_READ_HEX_ESCAPE (6, '\0');
692 break;
693 }
694 default:
695 if (c == chr)
696 break;
697 bad_escaped:
698 scm_i_input_error (FUNC_NAME, port,
699 "illegal character in escape sequence: ~S",
700 scm_list_1 (SCM_MAKE_CHAR (c)));
701 }
702 }
703
704 c_str[c_str_len++] = c;
705 }
706
707 if (scm_is_null (str))
708 /* Fast path: we got a string that fits in C_STR. */
709 str = scm_from_utf32_stringn (c_str, c_str_len);
710 else
711 {
712 if (c_str_len > 0)
713 str = scm_cons (scm_from_utf32_stringn (c_str, c_str_len), str);
714
715 str = scm_string_concatenate_reverse (str, SCM_UNDEFINED, SCM_UNDEFINED);
716 }
717
718 return maybe_annotate_source (str, port, opts, line, column);
719 }
720 #undef FUNC_NAME
721
722 static SCM
scm_read_string(int chr,SCM port,scm_t_read_opts * opts)723 scm_read_string (int chr, SCM port, scm_t_read_opts *opts)
724 {
725 return scm_read_string_like_syntax (chr, port, opts);
726 }
727
728 static SCM
scm_read_r7rs_symbol(int chr,SCM port,scm_t_read_opts * opts)729 scm_read_r7rs_symbol (int chr, SCM port, scm_t_read_opts *opts)
730 {
731 return scm_string_to_symbol (scm_read_string_like_syntax (chr, port, opts));
732 }
733
734 static SCM
scm_read_number(scm_t_wchar chr,SCM port,scm_t_read_opts * opts)735 scm_read_number (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
736 {
737 SCM result, str = SCM_EOL;
738 char local_buffer[READER_BUFFER_SIZE], *buffer;
739 size_t bytes_read;
740
741 /* Need to capture line and column numbers here. */
742 long line = scm_to_long (scm_port_line (port));
743 int column = scm_to_int (scm_port_column (port)) - 1;
744
745 scm_ungetc (chr, port);
746 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
747 &bytes_read);
748
749 str = scm_from_port_stringn (buffer, bytes_read, port);
750
751 result = scm_string_to_number (str, SCM_UNDEFINED);
752 if (scm_is_false (result))
753 {
754 /* Return a symbol instead of a number */
755 if (opts->case_insensitive_p)
756 str = scm_string_downcase_x (str);
757 result = scm_string_to_symbol (str);
758 }
759 else if (SCM_NIMP (result))
760 result = maybe_annotate_source (result, port, opts, line, column);
761
762 scm_set_port_column_x (port,
763 scm_sum (scm_port_column (port),
764 scm_string_length (str)));
765 return result;
766 }
767
768 static SCM
scm_read_mixed_case_symbol(scm_t_wchar chr,SCM port,scm_t_read_opts * opts)769 scm_read_mixed_case_symbol (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
770 {
771 SCM result;
772 int ends_with_colon = 0;
773 size_t bytes_read;
774 int postfix = (opts->keyword_style == KEYWORD_STYLE_POSTFIX);
775 char local_buffer[READER_BUFFER_SIZE], *buffer;
776 SCM str;
777
778 scm_ungetc (chr, port);
779 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
780 &bytes_read);
781 if (bytes_read > 0)
782 ends_with_colon = buffer[bytes_read - 1] == ':';
783
784 if (postfix && ends_with_colon && (bytes_read > 1))
785 {
786 str = scm_from_port_stringn (buffer, bytes_read - 1, port);
787
788 if (opts->case_insensitive_p)
789 str = scm_string_downcase_x (str);
790 result = scm_symbol_to_keyword (scm_string_to_symbol (str));
791 }
792 else
793 {
794 str = scm_from_port_stringn (buffer, bytes_read, port);
795
796 if (opts->case_insensitive_p)
797 str = scm_string_downcase_x (str);
798 result = scm_string_to_symbol (str);
799 }
800
801 scm_set_port_column_x (port,
802 scm_sum (scm_port_column (port),
803 scm_string_length (str)));
804 return result;
805 }
806
807 static SCM
scm_read_number_and_radix(scm_t_wchar chr,SCM port,scm_t_read_opts * opts)808 scm_read_number_and_radix (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
809 #define FUNC_NAME "scm_lreadr"
810 {
811 SCM result;
812 size_t read;
813 char local_buffer[READER_BUFFER_SIZE], *buffer;
814 unsigned int radix;
815 SCM str;
816
817 switch (chr)
818 {
819 case 'B':
820 case 'b':
821 radix = 2;
822 break;
823
824 case 'o':
825 case 'O':
826 radix = 8;
827 break;
828
829 case 'd':
830 case 'D':
831 radix = 10;
832 break;
833
834 case 'x':
835 case 'X':
836 radix = 16;
837 break;
838
839 default:
840 scm_ungetc (chr, port);
841 scm_ungetc ('#', port);
842 radix = 10;
843 }
844
845 buffer = read_complete_token (port, opts, local_buffer, sizeof local_buffer,
846 &read);
847
848 str = scm_from_port_stringn (buffer, read, port);
849
850 result = scm_string_to_number (str, scm_from_uint (radix));
851
852 scm_set_port_column_x (port,
853 scm_sum (scm_port_column (port),
854 scm_string_length (str)));
855
856 if (scm_is_true (result))
857 return result;
858
859 scm_i_input_error (FUNC_NAME, port, "unknown # object", SCM_EOL);
860
861 return SCM_BOOL_F;
862 }
863 #undef FUNC_NAME
864
865 static SCM
scm_read_quote(int chr,SCM port,scm_t_read_opts * opts)866 scm_read_quote (int chr, SCM port, scm_t_read_opts *opts)
867 {
868 SCM p;
869 long line = scm_to_long (scm_port_line (port));
870 int column = scm_to_int (scm_port_column (port)) - 1;
871
872 switch (chr)
873 {
874 case '`':
875 p = scm_sym_quasiquote;
876 break;
877
878 case '\'':
879 p = scm_sym_quote;
880 break;
881
882 case ',':
883 {
884 scm_t_wchar c;
885
886 c = scm_getc (port);
887 if ('@' == c)
888 p = scm_sym_uq_splicing;
889 else
890 {
891 scm_ungetc (c, port);
892 p = scm_sym_unquote;
893 }
894 break;
895 }
896
897 default:
898 fprintf (stderr, "%s: unhandled quote character (%i)\n",
899 "scm_read_quote", chr);
900 abort ();
901 }
902
903 p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
904 return maybe_annotate_source (p, port, opts, line, column);
905 }
906
907 SCM_SYMBOL (sym_syntax, "syntax");
908 SCM_SYMBOL (sym_quasisyntax, "quasisyntax");
909 SCM_SYMBOL (sym_unsyntax, "unsyntax");
910 SCM_SYMBOL (sym_unsyntax_splicing, "unsyntax-splicing");
911
912 static SCM
scm_read_syntax(int chr,SCM port,scm_t_read_opts * opts)913 scm_read_syntax (int chr, SCM port, scm_t_read_opts *opts)
914 {
915 SCM p;
916 long line = scm_to_long (scm_port_line (port));
917 int column = scm_to_int (scm_port_column (port)) - 1;
918
919 switch (chr)
920 {
921 case '`':
922 p = sym_quasisyntax;
923 break;
924
925 case '\'':
926 p = sym_syntax;
927 break;
928
929 case ',':
930 {
931 int c;
932
933 c = scm_getc (port);
934 if ('@' == c)
935 p = sym_unsyntax_splicing;
936 else
937 {
938 scm_ungetc (c, port);
939 p = sym_unsyntax;
940 }
941 break;
942 }
943
944 default:
945 fprintf (stderr, "%s: unhandled syntax character (%i)\n",
946 "scm_read_syntax", chr);
947 abort ();
948 }
949
950 p = scm_cons2 (p, scm_read_expression (port, opts), SCM_EOL);
951 return maybe_annotate_source (p, port, opts, line, column);
952 }
953
954 static SCM
scm_read_nil(int chr,SCM port,scm_t_read_opts * opts)955 scm_read_nil (int chr, SCM port, scm_t_read_opts *opts)
956 {
957 SCM id = scm_read_mixed_case_symbol (chr, port, opts);
958
959 if (!scm_is_eq (id, sym_nil))
960 scm_i_input_error ("scm_read_nil", port,
961 "unexpected input while reading #nil: ~a",
962 scm_list_1 (id));
963
964 return SCM_ELISP_NIL;
965 }
966
967 static SCM
scm_read_semicolon_comment(int chr,SCM port)968 scm_read_semicolon_comment (int chr, SCM port)
969 {
970 int c;
971
972 /* We use the get_byte here because there is no need to get the
973 locale correct with comment input. This presumes that newline
974 always represents itself no matter what the encoding is. */
975 for (c = scm_get_byte_or_eof (port);
976 (c != EOF) && (c != '\n');
977 c = scm_get_byte_or_eof (port));
978
979 return SCM_UNSPECIFIED;
980 }
981
982 /* If the EXPECTED_CHARS are the next ones available from PORT, then
983 consume them and return 1. Otherwise leave the port position where
984 it was and return 0. EXPECTED_CHARS should be all lowercase, and
985 will be matched case-insensitively against the characters read from
986 PORT. */
987 static int
try_read_ci_chars(SCM port,const char * expected_chars)988 try_read_ci_chars (SCM port, const char *expected_chars)
989 {
990 int num_chars_wanted = strlen (expected_chars);
991 int num_chars_read = 0;
992 char *chars_read = alloca (num_chars_wanted);
993 int c;
994
995 while (num_chars_read < num_chars_wanted)
996 {
997 c = scm_getc (port);
998 if (c == EOF)
999 break;
1000 else if (c_tolower (c) != expected_chars[num_chars_read])
1001 {
1002 scm_ungetc (c, port);
1003 break;
1004 }
1005 else
1006 chars_read[num_chars_read++] = c;
1007 }
1008
1009 if (num_chars_read == num_chars_wanted)
1010 return 1;
1011 else
1012 {
1013 while (num_chars_read > 0)
1014 scm_ungetc (chars_read[--num_chars_read], port);
1015 return 0;
1016 }
1017 }
1018
1019
1020 /* Sharp readers, i.e. readers called after a `#' sign has been read. */
1021
1022 static SCM
scm_read_boolean(int chr,SCM port)1023 scm_read_boolean (int chr, SCM port)
1024 {
1025 switch (chr)
1026 {
1027 case 't':
1028 case 'T':
1029 try_read_ci_chars (port, "rue");
1030 return SCM_BOOL_T;
1031
1032 case 'f':
1033 case 'F':
1034 try_read_ci_chars (port, "alse");
1035 return SCM_BOOL_F;
1036 }
1037
1038 return SCM_UNSPECIFIED;
1039 }
1040
1041 static SCM
scm_read_character(scm_t_wchar chr,SCM port,scm_t_read_opts * opts)1042 scm_read_character (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
1043 #define FUNC_NAME "scm_lreadr"
1044 {
1045 char buffer[READER_CHAR_NAME_MAX_SIZE];
1046 SCM charname;
1047 size_t charname_len, bytes_read;
1048 scm_t_wchar cp;
1049 int overflow;
1050 scm_t_port *pt;
1051
1052 overflow = read_token (port, opts, buffer, READER_CHAR_NAME_MAX_SIZE,
1053 &bytes_read);
1054 if (overflow)
1055 scm_i_input_error (FUNC_NAME, port, "character name too long", SCM_EOL);
1056
1057 if (bytes_read == 0)
1058 {
1059 chr = scm_getc (port);
1060 if (chr == EOF)
1061 scm_i_input_error (FUNC_NAME, port, "unexpected end of file "
1062 "while reading character", SCM_EOL);
1063
1064 /* CHR must be a token delimiter, like a whitespace. */
1065 return (SCM_MAKE_CHAR (chr));
1066 }
1067
1068 pt = SCM_PORT (port);
1069
1070 /* Simple ASCII characters can be processed immediately. Also, simple
1071 ISO-8859-1 characters can be processed immediately if the encoding for this
1072 port is ISO-8859-1. */
1073 if (bytes_read == 1 &&
1074 ((unsigned char) buffer[0] <= 127
1075 || scm_is_eq (pt->encoding, sym_ISO_8859_1)))
1076 {
1077 scm_set_port_column_x (port, scm_sum (scm_port_column (port), SCM_INUM1));
1078 return SCM_MAKE_CHAR (buffer[0]);
1079 }
1080
1081 /* Otherwise, convert the buffer into a proper scheme string for
1082 processing. */
1083 charname = scm_from_port_stringn (buffer, bytes_read, port);
1084 charname_len = scm_i_string_length (charname);
1085 scm_set_port_column_x (port,
1086 scm_sum (scm_port_column (port),
1087 scm_from_size_t (charname_len)));
1088 cp = scm_i_string_ref (charname, 0);
1089 if (charname_len == 1)
1090 return SCM_MAKE_CHAR (cp);
1091
1092 /* Ignore dotted circles, which may be used to keep combining characters from
1093 combining with the backslash in #\charname. */
1094 if (cp == SCM_CODEPOINT_DOTTED_CIRCLE && charname_len == 2)
1095 return scm_i_make_char (scm_i_string_ref (charname, 1));
1096
1097 if (cp >= '0' && cp < '8')
1098 {
1099 /* Dirk:FIXME:: This type of character syntax is not R5RS
1100 * compliant. Further, it should be verified that the constant
1101 * does only consist of octal digits. */
1102 SCM p = scm_string_to_number (charname, scm_from_uint (8));
1103 if (SCM_I_INUMP (p))
1104 {
1105 scm_t_wchar c = scm_to_uint32 (p);
1106 if (SCM_IS_UNICODE_CHAR (c))
1107 return SCM_MAKE_CHAR (c);
1108 else
1109 scm_i_input_error (FUNC_NAME, port,
1110 "out-of-range octal character escape: ~a",
1111 scm_list_1 (charname));
1112 }
1113 }
1114
1115 if (cp == 'x' && (charname_len > 1))
1116 {
1117 SCM p;
1118
1119 /* Convert from hex, skipping the initial 'x' character in CHARNAME */
1120 p = scm_string_to_number (scm_c_substring (charname, 1, charname_len),
1121 scm_from_uint (16));
1122 if (SCM_I_INUMP (p))
1123 {
1124 scm_t_wchar c = scm_to_uint32 (p);
1125 if (SCM_IS_UNICODE_CHAR (c))
1126 return SCM_MAKE_CHAR (c);
1127 else
1128 scm_i_input_error (FUNC_NAME, port,
1129 "out-of-range hex character escape: ~a",
1130 scm_list_1 (charname));
1131 }
1132 }
1133
1134 /* The names of characters should never have non-Latin1
1135 characters. */
1136 if (scm_i_is_narrow_string (charname)
1137 || scm_i_try_narrow_string (charname))
1138 { SCM ch = scm_i_charname_to_char (scm_i_string_chars (charname),
1139 charname_len);
1140 if (scm_is_true (ch))
1141 return ch;
1142 }
1143
1144 scm_i_input_error (FUNC_NAME, port, "unknown character name ~a",
1145 scm_list_1 (charname));
1146
1147 return SCM_UNSPECIFIED;
1148 }
1149 #undef FUNC_NAME
1150
1151 static SCM
scm_read_keyword(int chr,SCM port,scm_t_read_opts * opts)1152 scm_read_keyword (int chr, SCM port, scm_t_read_opts *opts)
1153 {
1154 SCM symbol;
1155
1156 /* Read the symbol that comprises the keyword. Doing this instead of
1157 invoking a specific symbol reader function allows `scm_read_keyword ()'
1158 to adapt to the delimiters currently valid of symbols.
1159
1160 XXX: This implementation allows sloppy syntaxes like `#: key'. */
1161 symbol = scm_read_expression (port, opts);
1162 if (!scm_is_symbol (symbol))
1163 scm_i_input_error ("scm_read_keyword", port,
1164 "keyword prefix `~a' not followed by a symbol: ~s",
1165 scm_list_2 (SCM_MAKE_CHAR (chr), symbol));
1166
1167 return (scm_symbol_to_keyword (symbol));
1168 }
1169
1170 static SCM
scm_read_vector(int chr,SCM port,scm_t_read_opts * opts,long line,int column)1171 scm_read_vector (int chr, SCM port, scm_t_read_opts *opts,
1172 long line, int column)
1173 {
1174 /* Note: We call `scm_read_sexp ()' rather than READER here in order to
1175 guarantee that it's going to do what we want. After all, this is an
1176 implementation detail of `scm_read_vector ()', not a desirable
1177 property. */
1178 return maybe_annotate_source (scm_vector (scm_read_sexp (chr, port, opts)),
1179 port, opts, line, column);
1180 }
1181
1182 /* Helper used by scm_read_array */
1183 static int
read_decimal_integer(SCM port,int c,ssize_t * resp)1184 read_decimal_integer (SCM port, int c, ssize_t *resp)
1185 {
1186 ssize_t sign = 1;
1187 ssize_t res = 0;
1188 int got_it = 0;
1189
1190 if (c == '-')
1191 {
1192 sign = -1;
1193 c = scm_getc (port);
1194 }
1195
1196 while ('0' <= c && c <= '9')
1197 {
1198 if (((SSIZE_MAX - (c-'0')) / 10) <= res)
1199 scm_i_input_error ("read_decimal_integer", port,
1200 "number too large", SCM_EOL);
1201 res = 10*res + c-'0';
1202 got_it = 1;
1203 c = scm_getc (port);
1204 }
1205
1206 if (got_it)
1207 *resp = sign * res;
1208 return c;
1209 }
1210
1211 /* Read an array. This function can also read vectors and uniform
1212 vectors. Also, the conflict between '#f' and '#f32' and '#f64' is
1213 handled here.
1214
1215 C is the first character read after the '#'. */
1216 static SCM
scm_read_array(int c,SCM port,scm_t_read_opts * opts,long line,int column)1217 scm_read_array (int c, SCM port, scm_t_read_opts *opts, long line, int column)
1218 {
1219 ssize_t rank;
1220 scm_t_wchar tag_buf[8];
1221 int tag_len;
1222
1223 SCM tag, shape = SCM_BOOL_F, elements, array;
1224
1225 /* XXX - shortcut for ordinary vectors. Shouldn't be necessary but
1226 the array code can not deal with zero-length dimensions yet, and
1227 we want to allow zero-length vectors, of course. */
1228 if (c == '(')
1229 return scm_read_vector (c, port, opts, line, column);
1230
1231 /* Disambiguate between '#f' and uniform floating point vectors. */
1232 if (c == 'f')
1233 {
1234 c = scm_getc (port);
1235 if (c != '3' && c != '6')
1236 {
1237 if (c == 'a' && try_read_ci_chars (port, "lse"))
1238 return SCM_BOOL_F;
1239 else if (c != EOF)
1240 scm_ungetc (c, port);
1241 return SCM_BOOL_F;
1242 }
1243 rank = 1;
1244 tag_buf[0] = 'f';
1245 tag_len = 1;
1246 goto continue_reading_tag;
1247 }
1248
1249 /* Read rank. */
1250 rank = 1;
1251 c = read_decimal_integer (port, c, &rank);
1252 if (rank < 0)
1253 scm_i_input_error (NULL, port, "array rank must be non-negative",
1254 SCM_EOL);
1255
1256 /* Read tag. */
1257 tag_len = 0;
1258 continue_reading_tag:
1259 while (c != EOF && c != '(' && c != '@' && c != ':'
1260 && tag_len < sizeof tag_buf / sizeof tag_buf[0])
1261 {
1262 tag_buf[tag_len++] = c;
1263 c = scm_getc (port);
1264 }
1265 if (tag_len == 0)
1266 tag = SCM_BOOL_T;
1267 else
1268 {
1269 tag = scm_string_to_symbol (scm_from_utf32_stringn (tag_buf, tag_len));
1270 if (tag_len == sizeof tag_buf / sizeof tag_buf[0])
1271 scm_i_input_error (NULL, port, "invalid array tag, starting with: ~a",
1272 scm_list_1 (tag));
1273 }
1274
1275 /* Read shape. */
1276 if (c == '@' || c == ':')
1277 {
1278 shape = SCM_EOL;
1279
1280 do
1281 {
1282 ssize_t lbnd = 0, len = 0;
1283 SCM s;
1284
1285 if (c == '@')
1286 {
1287 c = scm_getc (port);
1288 c = read_decimal_integer (port, c, &lbnd);
1289 }
1290
1291 s = scm_from_ssize_t (lbnd);
1292
1293 if (c == ':')
1294 {
1295 c = scm_getc (port);
1296 c = read_decimal_integer (port, c, &len);
1297 if (len < 0)
1298 scm_i_input_error (NULL, port,
1299 "array length must be non-negative",
1300 SCM_EOL);
1301
1302 s = scm_list_2 (s, scm_from_ssize_t (lbnd+len-1));
1303 }
1304
1305 shape = scm_cons (s, shape);
1306 } while (c == '@' || c == ':');
1307
1308 shape = scm_reverse_x (shape, SCM_EOL);
1309 }
1310
1311 /* Read nested lists of elements. */
1312 if (c != '(')
1313 scm_i_input_error (NULL, port,
1314 "missing '(' in vector or array literal",
1315 SCM_EOL);
1316 elements = scm_read_sexp (c, port, opts);
1317
1318 if (scm_is_false (shape))
1319 shape = scm_from_ssize_t (rank);
1320 else if (scm_ilength (shape) != rank)
1321 scm_i_input_error
1322 (NULL, port,
1323 "the number of shape specifications must match the array rank",
1324 SCM_EOL);
1325
1326 /* Handle special print syntax of rank zero arrays; see
1327 scm_i_print_array for a rationale. */
1328 if (rank == 0)
1329 {
1330 if (!scm_is_pair (elements))
1331 scm_i_input_error (NULL, port,
1332 "too few elements in array literal, need 1",
1333 SCM_EOL);
1334 if (!scm_is_null (SCM_CDR (elements)))
1335 scm_i_input_error (NULL, port,
1336 "too many elements in array literal, want 1",
1337 SCM_EOL);
1338 elements = SCM_CAR (elements);
1339 }
1340
1341 /* Construct array, annotate with source location, and return. */
1342 array = scm_list_to_typed_array (tag, shape, elements);
1343 return maybe_annotate_source (array, port, opts, line, column);
1344 }
1345
1346 static SCM
scm_read_srfi4_vector(int chr,SCM port,scm_t_read_opts * opts,long line,int column)1347 scm_read_srfi4_vector (int chr, SCM port, scm_t_read_opts *opts,
1348 long line, int column)
1349 {
1350 return scm_read_array (chr, port, opts, line, column);
1351 }
1352
1353 static SCM
scm_read_bytevector(scm_t_wchar chr,SCM port,scm_t_read_opts * opts,long line,int column)1354 scm_read_bytevector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1355 long line, int column)
1356 {
1357 chr = scm_getc (port);
1358 if (chr != 'u')
1359 goto syntax;
1360
1361 chr = scm_getc (port);
1362 if (chr != '8')
1363 goto syntax;
1364
1365 chr = scm_getc (port);
1366 if (chr != '(')
1367 goto syntax;
1368
1369 return maybe_annotate_source
1370 (scm_u8_list_to_bytevector (scm_read_sexp (chr, port, opts)),
1371 port, opts, line, column);
1372
1373 syntax:
1374 scm_i_input_error ("read_bytevector", port,
1375 "invalid bytevector prefix",
1376 SCM_MAKE_CHAR (chr));
1377 return SCM_UNSPECIFIED;
1378 }
1379
1380 static SCM
scm_read_guile_bit_vector(scm_t_wchar chr,SCM port,scm_t_read_opts * opts,long line,int column)1381 scm_read_guile_bit_vector (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1382 long line, int column)
1383 {
1384 /* Read the `#*10101'-style read syntax for bit vectors in Guile. This is
1385 terribly inefficient but who cares? */
1386 SCM s_bits = SCM_EOL;
1387
1388 for (chr = scm_getc (port);
1389 (chr != EOF) && ((chr == '0') || (chr == '1'));
1390 chr = scm_getc (port))
1391 {
1392 s_bits = scm_cons ((chr == '0') ? SCM_BOOL_F : SCM_BOOL_T, s_bits);
1393 }
1394
1395 if (chr != EOF)
1396 scm_ungetc (chr, port);
1397
1398 return maybe_annotate_source
1399 (scm_bitvector (scm_reverse_x (s_bits, SCM_EOL)),
1400 port, opts, line, column);
1401 }
1402
1403 static SCM
scm_read_scsh_block_comment(scm_t_wchar chr,SCM port)1404 scm_read_scsh_block_comment (scm_t_wchar chr, SCM port)
1405 {
1406 int bang_seen = 0;
1407
1408 for (;;)
1409 {
1410 int c = scm_getc (port);
1411
1412 if (c == EOF)
1413 scm_i_input_error ("skip_block_comment", port,
1414 "unterminated `#! ... !#' comment", SCM_EOL);
1415
1416 if (c == '!')
1417 bang_seen = 1;
1418 else if (c == '#' && bang_seen)
1419 break;
1420 else
1421 bang_seen = 0;
1422 }
1423
1424 return SCM_UNSPECIFIED;
1425 }
1426
1427 static void set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts,
1428 int value);
1429 static void set_port_square_brackets_p (SCM port, scm_t_read_opts *opts,
1430 int value);
1431 static void set_port_curly_infix_p (SCM port, scm_t_read_opts *opts,
1432 int value);
1433 static void set_port_r6rs_hex_escapes_p (SCM port, scm_t_read_opts *opts,
1434 int value);
1435 static void set_port_hungry_eol_escapes_p (SCM port, scm_t_read_opts *opts,
1436 int value);
1437 static void set_port_keyword_style (SCM port, scm_t_read_opts *opts,
1438 enum t_keyword_style value);
1439
1440 static SCM
scm_read_shebang(scm_t_wchar chr,SCM port,scm_t_read_opts * opts)1441 scm_read_shebang (scm_t_wchar chr, SCM port, scm_t_read_opts *opts)
1442 {
1443 char name[READER_DIRECTIVE_NAME_MAX_SIZE + 1];
1444 int c;
1445 int i = 0;
1446
1447 while (i <= READER_DIRECTIVE_NAME_MAX_SIZE)
1448 {
1449 c = scm_getc (port);
1450 if (c == EOF)
1451 scm_i_input_error ("skip_block_comment", port,
1452 "unterminated `#! ... !#' comment", SCM_EOL);
1453 else if (('a' <= c && c <= 'z') || ('0' <= c && c <= '9') || c == '-')
1454 name[i++] = c;
1455 else if (CHAR_IS_DELIMITER (c))
1456 {
1457 scm_ungetc (c, port);
1458 name[i] = '\0';
1459 if (0 == strcmp ("r6rs", name))
1460 {
1461 set_port_case_insensitive_p (port, opts, 0);
1462 set_port_r6rs_hex_escapes_p (port, opts, 1);
1463 set_port_square_brackets_p (port, opts, 1);
1464 set_port_keyword_style (port, opts, KEYWORD_STYLE_HASH_PREFIX);
1465 set_port_hungry_eol_escapes_p (port, opts, 1);
1466 }
1467 else if (0 == strcmp ("fold-case", name))
1468 set_port_case_insensitive_p (port, opts, 1);
1469 else if (0 == strcmp ("no-fold-case", name))
1470 set_port_case_insensitive_p (port, opts, 0);
1471 else if (0 == strcmp ("curly-infix", name))
1472 set_port_curly_infix_p (port, opts, 1);
1473 else if (0 == strcmp ("curly-infix-and-bracket-lists", name))
1474 {
1475 set_port_curly_infix_p (port, opts, 1);
1476 set_port_square_brackets_p (port, opts, 0);
1477 }
1478 else
1479 break;
1480
1481 return SCM_UNSPECIFIED;
1482 }
1483 else
1484 {
1485 scm_ungetc (c, port);
1486 break;
1487 }
1488 }
1489 while (i > 0)
1490 scm_ungetc (name[--i], port);
1491 return scm_read_scsh_block_comment (chr, port);
1492 }
1493
1494 static SCM
scm_read_r6rs_block_comment(scm_t_wchar chr,SCM port)1495 scm_read_r6rs_block_comment (scm_t_wchar chr, SCM port)
1496 {
1497 /* Unlike SCSH-style block comments, SRFI-30/R6RS block comments may be
1498 nested. So care must be taken. */
1499 int nesting_level = 1;
1500
1501 int a = scm_getc (port);
1502
1503 if (a == EOF)
1504 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1505 "unterminated `#| ... |#' comment", SCM_EOL);
1506
1507 while (nesting_level > 0)
1508 {
1509 int b = scm_getc (port);
1510
1511 if (b == EOF)
1512 scm_i_input_error ("scm_read_r6rs_block_comment", port,
1513 "unterminated `#| ... |#' comment", SCM_EOL);
1514
1515 if (a == '|' && b == '#')
1516 {
1517 nesting_level--;
1518 b = EOF;
1519 }
1520 else if (a == '#' && b == '|')
1521 {
1522 nesting_level++;
1523 b = EOF;
1524 }
1525
1526 a = b;
1527 }
1528
1529 return SCM_UNSPECIFIED;
1530 }
1531
1532 static SCM
scm_read_commented_expression(scm_t_wchar chr,SCM port,scm_t_read_opts * opts)1533 scm_read_commented_expression (scm_t_wchar chr, SCM port,
1534 scm_t_read_opts *opts)
1535 {
1536 scm_t_wchar c;
1537
1538 c = flush_ws (port, opts, (char *) NULL);
1539 if (EOF == c)
1540 scm_i_input_error ("read_commented_expression", port,
1541 "no expression after #; comment", SCM_EOL);
1542 scm_ungetc (c, port);
1543 scm_read_expression (port, opts);
1544 return SCM_UNSPECIFIED;
1545 }
1546
1547 static SCM
scm_read_extended_symbol(scm_t_wchar chr,SCM port)1548 scm_read_extended_symbol (scm_t_wchar chr, SCM port)
1549 {
1550 /* Guile's extended symbol read syntax looks like this:
1551
1552 #{This is all a symbol name}#
1553
1554 So here, CHR is expected to be `{'. */
1555 int saw_brace = 0;
1556 size_t len = 0;
1557 SCM buf = scm_i_make_string (1024, NULL, 0);
1558
1559 /* No need to scm_i_string_start_writing (), as the string isn't
1560 visible to any other thread. */
1561
1562 while ((chr = scm_getc (port)) != EOF)
1563 {
1564 if (saw_brace)
1565 {
1566 if (chr == '#')
1567 {
1568 break;
1569 }
1570 else
1571 {
1572 saw_brace = 0;
1573 scm_i_string_set_x (buf, len++, '}');
1574 }
1575 }
1576
1577 if (chr == '}')
1578 saw_brace = 1;
1579 else if (chr == '\\')
1580 {
1581 /* It used to be that print.c would print extended-read-syntax
1582 symbols with backslashes before "non-standard" chars, but
1583 this routine wouldn't do anything with those escapes.
1584 Bummer. What we've done is to change print.c to output
1585 R6RS hex escapes for those characters, relying on the fact
1586 that the extended read syntax would never put a `\' before
1587 an `x'. For now, we just ignore other instances of
1588 backslash in the string. */
1589 switch ((chr = scm_getc (port)))
1590 {
1591 case EOF:
1592 goto done;
1593 case 'x':
1594 {
1595 scm_t_wchar c;
1596
1597 SCM_READ_HEX_ESCAPE (10, ';');
1598 scm_i_string_set_x (buf, len++, c);
1599 break;
1600
1601 str_eof:
1602 chr = EOF;
1603 goto done;
1604
1605 bad_escaped:
1606 scm_i_string_stop_writing ();
1607 scm_i_input_error ("scm_read_extended_symbol", port,
1608 "illegal character in escape sequence: ~S",
1609 scm_list_1 (SCM_MAKE_CHAR (c)));
1610 break;
1611 }
1612 default:
1613 scm_i_string_set_x (buf, len++, chr);
1614 break;
1615 }
1616 }
1617 else
1618 scm_i_string_set_x (buf, len++, chr);
1619
1620 if (len >= scm_i_string_length (buf) - 2)
1621 {
1622 SCM addy;
1623
1624 addy = scm_i_make_string (1024, NULL, 0);
1625 buf = scm_string_append (scm_list_2 (buf, addy));
1626 len = 0;
1627 }
1628 }
1629
1630 done:
1631 if (chr == EOF)
1632 scm_i_input_error ("scm_read_extended_symbol", port,
1633 "end of file while reading symbol", SCM_EOL);
1634
1635 return (scm_string_to_symbol (scm_c_substring (buf, 0, len)));
1636 }
1637
1638
1639
1640 /* Top-level token readers, i.e., dispatchers. */
1641
1642 static SCM
scm_read_sharp_extension(int chr,SCM port,scm_t_read_opts * opts)1643 scm_read_sharp_extension (int chr, SCM port, scm_t_read_opts *opts)
1644 {
1645 SCM proc;
1646
1647 proc = scm_get_hash_procedure (chr);
1648 if (scm_is_true (scm_procedure_p (proc)))
1649 {
1650 long line = scm_to_long (scm_port_line (port));
1651 int column = scm_to_int (scm_port_column (port)) - 2;
1652 SCM got;
1653
1654 got = scm_call_2 (proc, SCM_MAKE_CHAR (chr), port);
1655
1656 if (opts->record_positions_p && SCM_NIMP (got)
1657 && !scm_i_has_source_properties (got))
1658 scm_i_set_source_properties_x (got, line, column, SCM_FILENAME (port));
1659
1660 return got;
1661 }
1662
1663 return SCM_UNSPECIFIED;
1664 }
1665
1666 /* The reader for the sharp `#' character. It basically dispatches reads
1667 among the above token readers. */
1668 static SCM
scm_read_sharp(scm_t_wchar chr,SCM port,scm_t_read_opts * opts,long line,int column)1669 scm_read_sharp (scm_t_wchar chr, SCM port, scm_t_read_opts *opts,
1670 long line, int column)
1671 #define FUNC_NAME "scm_lreadr"
1672 {
1673 SCM result;
1674
1675 chr = scm_getc (port);
1676
1677 result = scm_read_sharp_extension (chr, port, opts);
1678 if (!scm_is_eq (result, SCM_UNSPECIFIED))
1679 return result;
1680
1681 switch (chr)
1682 {
1683 case '\\':
1684 return (scm_read_character (chr, port, opts));
1685 case '(':
1686 return (scm_read_vector (chr, port, opts, line, column));
1687 case 's':
1688 case 'u':
1689 case 'f':
1690 case 'c':
1691 /* This one may return either a boolean or an SRFI-4 vector. */
1692 return (scm_read_srfi4_vector (chr, port, opts, line, column));
1693 case 'v':
1694 return (scm_read_bytevector (chr, port, opts, line, column));
1695 case '*':
1696 return (scm_read_guile_bit_vector (chr, port, opts, line, column));
1697 case 't':
1698 case 'T':
1699 case 'F':
1700 return (scm_read_boolean (chr, port));
1701 case ':':
1702 return (scm_read_keyword (chr, port, opts));
1703 case '0': case '1': case '2': case '3': case '4':
1704 case '5': case '6': case '7': case '8': case '9':
1705 case '@':
1706 return (scm_read_array (chr, port, opts, line, column));
1707
1708 case 'i':
1709 case 'e':
1710 case 'b':
1711 case 'B':
1712 case 'o':
1713 case 'O':
1714 case 'd':
1715 case 'D':
1716 case 'x':
1717 case 'X':
1718 case 'I':
1719 case 'E':
1720 return (scm_read_number_and_radix (chr, port, opts));
1721 case '{':
1722 return (scm_read_extended_symbol (chr, port));
1723 case '!':
1724 return (scm_read_shebang (chr, port, opts));
1725 case ';':
1726 return (scm_read_commented_expression (chr, port, opts));
1727 case '`':
1728 case '\'':
1729 case ',':
1730 return (scm_read_syntax (chr, port, opts));
1731 case 'n':
1732 return (scm_read_nil (chr, port, opts));
1733 default:
1734 result = scm_read_sharp_extension (chr, port, opts);
1735 if (scm_is_eq (result, SCM_UNSPECIFIED))
1736 {
1737 /* To remain compatible with 1.8 and earlier, the following
1738 characters have lower precedence than `read-hash-extend'
1739 characters. */
1740 switch (chr)
1741 {
1742 case '|':
1743 return scm_read_r6rs_block_comment (chr, port);
1744 default:
1745 scm_i_input_error (FUNC_NAME, port, "Unknown # object: ~S",
1746 scm_list_1 (SCM_MAKE_CHAR (chr)));
1747 }
1748 }
1749 else
1750 return result;
1751 }
1752
1753 return SCM_UNSPECIFIED;
1754 }
1755 #undef FUNC_NAME
1756
1757 static SCM
read_inner_expression(SCM port,scm_t_read_opts * opts)1758 read_inner_expression (SCM port, scm_t_read_opts *opts)
1759 #define FUNC_NAME "read_inner_expression"
1760 {
1761 while (1)
1762 {
1763 scm_t_wchar chr;
1764
1765 chr = scm_getc (port);
1766
1767 switch (chr)
1768 {
1769 case SCM_WHITE_SPACES:
1770 case SCM_LINE_INCREMENTORS:
1771 break;
1772 case ';':
1773 (void) scm_read_semicolon_comment (chr, port);
1774 break;
1775 case '{':
1776 if (opts->curly_infix_p)
1777 {
1778 if (opts->neoteric_p)
1779 return scm_read_sexp (chr, port, opts);
1780 else
1781 {
1782 SCM expr;
1783
1784 /* Enable neoteric expressions within curly braces */
1785 opts->neoteric_p = 1;
1786 expr = scm_read_sexp (chr, port, opts);
1787 opts->neoteric_p = 0;
1788 return expr;
1789 }
1790 }
1791 else
1792 return scm_read_mixed_case_symbol (chr, port, opts);
1793 case '[':
1794 if (opts->square_brackets_p)
1795 return scm_read_sexp (chr, port, opts);
1796 else if (opts->curly_infix_p)
1797 {
1798 /* The syntax of neoteric expressions requires that '[' be
1799 a delimiter when curly-infix is enabled, so it cannot
1800 be part of an unescaped symbol. We might as well do
1801 something useful with it, so we adopt Kawa's convention:
1802 [...] => ($bracket-list$ ...) */
1803 long line = scm_to_long (scm_port_line (port));
1804 int column = scm_to_int (scm_port_column (port)) - 1;
1805 return maybe_annotate_source
1806 (scm_cons (sym_bracket_list, scm_read_sexp (chr, port, opts)),
1807 port, opts, line, column);
1808 }
1809 else
1810 return scm_read_mixed_case_symbol (chr, port, opts);
1811 case '(':
1812 return (scm_read_sexp (chr, port, opts));
1813 case '"':
1814 return (scm_read_string (chr, port, opts));
1815 case '|':
1816 if (opts->r7rs_symbols_p)
1817 return scm_read_r7rs_symbol (chr, port, opts);
1818 else
1819 return scm_read_mixed_case_symbol (chr, port, opts);
1820 case '\'':
1821 case '`':
1822 case ',':
1823 return (scm_read_quote (chr, port, opts));
1824 case '#':
1825 {
1826 long line = scm_to_long (scm_port_line (port));
1827 int column = scm_to_int (scm_port_column (port)) - 1;
1828 SCM result = scm_read_sharp (chr, port, opts, line, column);
1829 if (scm_is_eq (result, SCM_UNSPECIFIED))
1830 /* We read a comment or some such. */
1831 break;
1832 else
1833 return result;
1834 }
1835 case ')':
1836 scm_i_input_error (FUNC_NAME, port, "unexpected \")\"", SCM_EOL);
1837 break;
1838 case '}':
1839 if (opts->curly_infix_p)
1840 scm_i_input_error (FUNC_NAME, port, "unexpected \"}\"", SCM_EOL);
1841 else
1842 return scm_read_mixed_case_symbol (chr, port, opts);
1843 case ']':
1844 if (opts->square_brackets_p)
1845 scm_i_input_error (FUNC_NAME, port, "unexpected \"]\"", SCM_EOL);
1846 /* otherwise fall through */
1847 case EOF:
1848 return SCM_EOF_VAL;
1849 case ':':
1850 if (opts->keyword_style == KEYWORD_STYLE_PREFIX)
1851 return scm_symbol_to_keyword (scm_read_expression (port, opts));
1852 /* Fall through. */
1853
1854 default:
1855 {
1856 if (((chr >= '0') && (chr <= '9'))
1857 || (strchr ("+-.", chr)))
1858 return (scm_read_number (chr, port, opts));
1859 else
1860 return (scm_read_mixed_case_symbol (chr, port, opts));
1861 }
1862 }
1863 }
1864 }
1865 #undef FUNC_NAME
1866
1867 static SCM
scm_read_expression(SCM port,scm_t_read_opts * opts)1868 scm_read_expression (SCM port, scm_t_read_opts *opts)
1869 #define FUNC_NAME "scm_read_expression"
1870 {
1871 if (!opts->neoteric_p)
1872 return read_inner_expression (port, opts);
1873 else
1874 {
1875 long line = 0;
1876 int column = 0;
1877 SCM expr;
1878
1879 if (opts->record_positions_p)
1880 {
1881 /* We need to get the position of the first non-whitespace
1882 character in order to correctly annotate neoteric
1883 expressions. For example, for the expression 'f(x)', the
1884 first call to 'read_inner_expression' reads the 'f' (which
1885 cannot be annotated), and then we later read the '(x)' and
1886 use it to construct the new list (f x). */
1887 int c = flush_ws (port, opts, (char *) NULL);
1888 if (c == EOF)
1889 return SCM_EOF_VAL;
1890 scm_ungetc (c, port);
1891 line = scm_to_long (scm_port_line (port));
1892 column = scm_to_int (scm_port_column (port));
1893 }
1894
1895 expr = read_inner_expression (port, opts);
1896
1897 /* 'expr' is the first component of the neoteric expression. Now
1898 we loop, and as long as the next character is '(', '[', or '{',
1899 (without any intervening whitespace), we use it to construct a
1900 new expression. For example, f{n - 1}(x) => ((f (- n 1)) x). */
1901 for (;;)
1902 {
1903 int chr = scm_getc (port);
1904
1905 if (chr == '(')
1906 /* e(...) => (e ...) */
1907 expr = scm_cons (expr, scm_read_sexp (chr, port, opts));
1908 else if (chr == '[')
1909 /* e[...] => ($bracket-apply$ e ...) */
1910 expr = scm_cons (sym_bracket_apply,
1911 scm_cons (expr,
1912 scm_read_sexp (chr, port, opts)));
1913 else if (chr == '{')
1914 {
1915 SCM arg = scm_read_sexp (chr, port, opts);
1916
1917 if (scm_is_null (arg))
1918 expr = scm_list_1 (expr); /* e{} => (e) */
1919 else
1920 expr = scm_list_2 (expr, arg); /* e{...} => (e {...}) */
1921 }
1922 else
1923 {
1924 if (chr != EOF)
1925 scm_ungetc (chr, port);
1926 break;
1927 }
1928 maybe_annotate_source (expr, port, opts, line, column);
1929 }
1930 return expr;
1931 }
1932 }
1933 #undef FUNC_NAME
1934
1935
1936 /* Actual reader. */
1937
1938 static void init_read_options (SCM port, scm_t_read_opts *opts);
1939
1940 SCM_DEFINE (scm_read, "read", 0, 1, 0,
1941 (SCM port),
1942 "Read an s-expression from the input port @var{port}, or from\n"
1943 "the current input port if @var{port} is not specified.\n"
1944 "Any whitespace before the next token is discarded.")
1945 #define FUNC_NAME s_scm_read
1946 {
1947 scm_t_read_opts opts;
1948 int c;
1949
1950 if (SCM_UNBNDP (port))
1951 port = scm_current_input_port ();
1952 SCM_VALIDATE_OPINPORT (1, port);
1953
1954 init_read_options (port, &opts);
1955
1956 c = flush_ws (port, &opts, (char *) NULL);
1957 if (EOF == c)
1958 return SCM_EOF_VAL;
1959 scm_ungetc (c, port);
1960
1961 return (scm_read_expression (port, &opts));
1962 }
1963 #undef FUNC_NAME
1964
1965
1966
1967
1968 /* Manipulate the read-hash-procedures alist. This could be written in
1969 Scheme, but maybe it will also be used by C code during initialisation. */
1970 SCM_DEFINE (scm_read_hash_extend, "read-hash-extend", 2, 0, 0,
1971 (SCM chr, SCM proc),
1972 "Install the procedure @var{proc} for reading expressions\n"
1973 "starting with the character sequence @code{#} and @var{chr}.\n"
1974 "@var{proc} will be called with two arguments: the character\n"
1975 "@var{chr} and the port to read further data from. The object\n"
1976 "returned will be the return value of @code{read}. \n"
1977 "Passing @code{#f} for @var{proc} will remove a previous setting. \n"
1978 )
1979 #define FUNC_NAME s_scm_read_hash_extend
1980 {
1981 SCM this;
1982 SCM prev;
1983
1984 SCM_VALIDATE_CHAR (1, chr);
1985 SCM_ASSERT (scm_is_false (proc)
1986 || scm_is_eq (scm_procedure_p (proc), SCM_BOOL_T),
1987 proc, SCM_ARG2, FUNC_NAME);
1988
1989 /* Check if chr is already in the alist. */
1990 this = scm_i_read_hash_procedures_ref ();
1991 prev = SCM_BOOL_F;
1992 while (1)
1993 {
1994 if (scm_is_null (this))
1995 {
1996 /* not found, so add it to the beginning. */
1997 if (scm_is_true (proc))
1998 {
1999 SCM new = scm_cons (scm_cons (chr, proc),
2000 scm_i_read_hash_procedures_ref ());
2001 scm_i_read_hash_procedures_set_x (new);
2002 }
2003 break;
2004 }
2005 if (scm_is_eq (chr, SCM_CAAR (this)))
2006 {
2007 /* already in the alist. */
2008 if (scm_is_false (proc))
2009 {
2010 /* remove it. */
2011 if (scm_is_false (prev))
2012 {
2013 SCM rest = SCM_CDR (scm_i_read_hash_procedures_ref ());
2014 scm_i_read_hash_procedures_set_x (rest);
2015 }
2016 else
2017 scm_set_cdr_x (prev, SCM_CDR (this));
2018 }
2019 else
2020 {
2021 /* replace it. */
2022 scm_set_cdr_x (SCM_CAR (this), proc);
2023 }
2024 break;
2025 }
2026 prev = this;
2027 this = SCM_CDR (this);
2028 }
2029
2030 return SCM_UNSPECIFIED;
2031 }
2032 #undef FUNC_NAME
2033
2034 /* Recover the read-hash procedure corresponding to char c. */
2035 static SCM
scm_get_hash_procedure(int c)2036 scm_get_hash_procedure (int c)
2037 {
2038 SCM rest = scm_i_read_hash_procedures_ref ();
2039
2040 while (1)
2041 {
2042 if (scm_is_null (rest))
2043 return SCM_BOOL_F;
2044
2045 if (SCM_CHAR (SCM_CAAR (rest)) == c)
2046 return SCM_CDAR (rest);
2047
2048 rest = SCM_CDR (rest);
2049 }
2050 }
2051
2052 static int
is_encoding_char(char c)2053 is_encoding_char (char c)
2054 {
2055 if (c >= 'a' && c <= 'z') return 1;
2056 if (c >= 'A' && c <= 'Z') return 1;
2057 if (c >= '0' && c <= '9') return 1;
2058 return strchr ("_-.:/,+=()", c) != NULL;
2059 }
2060
2061 /* Maximum size of an encoding name. This is a bit more than the
2062 longest name listed at
2063 <http://www.iana.org/assignments/character-sets> ("ISO-2022-JP-2", 13
2064 characters.) */
2065 #define ENCODING_NAME_MAX_SIZE 20
2066
2067 /* Number of bytes at the beginning or end of a file that are scanned
2068 for a "coding:" declaration. */
2069 #define SCM_ENCODING_SEARCH_SIZE (500 + ENCODING_NAME_MAX_SIZE)
2070
2071
2072 /* Search the SCM_ENCODING_SEARCH_SIZE bytes of a file for an Emacs-like
2073 coding declaration. Returns either NULL or a string whose storage
2074 has been allocated with `scm_gc_malloc'. */
2075 char *
scm_i_scan_for_encoding(SCM port)2076 scm_i_scan_for_encoding (SCM port)
2077 {
2078 scm_t_port *pt;
2079 SCM buf;
2080 char header[SCM_ENCODING_SEARCH_SIZE+1];
2081 size_t cur, bytes_read, encoding_length, i;
2082 char *encoding = NULL;
2083 char *pos, *encoding_start;
2084 int in_comment;
2085
2086 pt = SCM_PORT (port);
2087 buf = pt->read_buf;
2088
2089 if (pt->rw_random)
2090 scm_flush (port);
2091
2092 if (scm_port_buffer_can_take (buf, &cur) == 0)
2093 {
2094 /* We can use the read buffer, and thus avoid a seek. */
2095 buf = scm_fill_input (port, 0, &cur, &bytes_read);
2096 if (bytes_read > SCM_ENCODING_SEARCH_SIZE)
2097 bytes_read = SCM_ENCODING_SEARCH_SIZE;
2098
2099 if (bytes_read <= 1)
2100 /* An unbuffered port -- don't scan. */
2101 return NULL;
2102
2103 memcpy (header, scm_port_buffer_take_pointer (buf, cur), bytes_read);
2104 header[bytes_read] = '\0';
2105 }
2106 else if (pt->rw_random)
2107 {
2108 /* The port is seekable. This is OK but grubbing in the read
2109 buffer is better, so this case is just a fallback. */
2110 bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE);
2111 header[bytes_read] = '\0';
2112 scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
2113 }
2114 else
2115 /* No input available and not seekable; scan fails. */
2116 return NULL;
2117
2118 /* search past "coding[:=]" */
2119 pos = header;
2120 while (1)
2121 {
2122 if ((pos = strstr(pos, "coding")) == NULL)
2123 return NULL;
2124
2125 pos += strlen ("coding");
2126 if (pos - header >= SCM_ENCODING_SEARCH_SIZE ||
2127 (*pos == ':' || *pos == '='))
2128 {
2129 pos ++;
2130 break;
2131 }
2132 }
2133
2134 /* skip spaces */
2135 while (pos - header <= SCM_ENCODING_SEARCH_SIZE &&
2136 (*pos == ' ' || *pos == '\t'))
2137 pos ++;
2138
2139 if (pos - header >= SCM_ENCODING_SEARCH_SIZE - ENCODING_NAME_MAX_SIZE)
2140 /* We found the "coding:" string, but there is probably not enough
2141 room to store an encoding name in its entirety, so ignore it.
2142 This makes sure we do not end up returning a truncated encoding
2143 name. */
2144 return NULL;
2145
2146 /* grab the next token */
2147 encoding_start = pos;
2148 i = 0;
2149 while (encoding_start + i - header <= SCM_ENCODING_SEARCH_SIZE
2150 && encoding_start + i - header < bytes_read
2151 && is_encoding_char (encoding_start[i]))
2152 i++;
2153
2154 encoding_length = i;
2155 if (encoding_length == 0)
2156 return NULL;
2157
2158 encoding = scm_gc_strndup (encoding_start, encoding_length, "encoding");
2159
2160 /* push backwards to make sure we were in a comment */
2161 in_comment = 0;
2162 pos = encoding_start;
2163 while (pos >= header)
2164 {
2165 if (*pos == ';')
2166 {
2167 in_comment = 1;
2168 break;
2169 }
2170 else if (*pos == '\n' || pos == header)
2171 {
2172 /* This wasn't in a semicolon comment. Check for a
2173 hash-bang comment. */
2174 char *beg = strstr (header, "#!");
2175 char *end = strstr (header, "!#");
2176 if (beg < encoding_start && encoding_start + encoding_length <= end)
2177 in_comment = 1;
2178 break;
2179 }
2180 else
2181 {
2182 pos --;
2183 continue;
2184 }
2185 }
2186 if (!in_comment)
2187 /* This wasn't in a comment */
2188 return NULL;
2189
2190 return encoding;
2191 }
2192
2193 SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
2194 (SCM port),
2195 "Scans the port for an Emacs-like character coding declaration\n"
2196 "near the top of the contents of a port with random-accessible contents.\n"
2197 "The coding declaration is of the form\n"
2198 "@code{coding: XXXXX} and must appear in a scheme comment.\n"
2199 "\n"
2200 "Returns a string containing the character encoding of the file\n"
2201 "if a declaration was found, or @code{#f} otherwise.\n")
2202 #define FUNC_NAME s_scm_file_encoding
2203 {
2204 char *enc;
2205 SCM s_enc;
2206
2207 SCM_VALIDATE_OPINPORT (SCM_ARG1, port);
2208
2209 enc = scm_i_scan_for_encoding (port);
2210 if (enc == NULL)
2211 return SCM_BOOL_F;
2212 else
2213 {
2214 /* It's not obvious what encoding to use here, but latin1 has the
2215 advantage of never causing a decoding error, and a valid
2216 encoding name should be ASCII anyway. */
2217 s_enc = scm_string_upcase (scm_from_latin1_string (enc));
2218 return s_enc;
2219 }
2220
2221 return SCM_BOOL_F;
2222 }
2223 #undef FUNC_NAME
2224
2225
2226 /* Per-port read options.
2227
2228 We store per-port read options in the 'port-read-options' port
2229 property, which is stored in the internal port structure. The value
2230 stored is a single integer that contains a two-bit field for each
2231 read option.
2232
2233 If a bit field contains READ_OPTION_INHERIT (3), that indicates that
2234 the applicable value should be inherited from the corresponding
2235 global read option. Otherwise, the bit field contains the value of
2236 the read option. For boolean read options that have been set
2237 per-port, the possible values are 0 or 1. If the 'keyword_style'
2238 read option has been set per-port, its possible values are those in
2239 'enum t_keyword_style'. */
2240
2241 /* Key to read options in port properties. */
2242 SCM_SYMBOL (sym_port_read_options, "port-read-options");
2243
2244 /* Offsets of bit fields for each per-port override */
2245 #define READ_OPTION_COPY_SOURCE_P 0
2246 #define READ_OPTION_RECORD_POSITIONS_P 2
2247 #define READ_OPTION_CASE_INSENSITIVE_P 4
2248 #define READ_OPTION_KEYWORD_STYLE 6
2249 #define READ_OPTION_R6RS_ESCAPES_P 8
2250 #define READ_OPTION_SQUARE_BRACKETS_P 10
2251 #define READ_OPTION_HUNGRY_EOL_ESCAPES_P 12
2252 #define READ_OPTION_CURLY_INFIX_P 14
2253 #define READ_OPTION_R7RS_SYMBOLS_P 16
2254
2255 /* The total width in bits of the per-port overrides */
2256 #define READ_OPTIONS_NUM_BITS 18
2257
2258 #define READ_OPTIONS_INHERIT_ALL ((1UL << READ_OPTIONS_NUM_BITS) - 1)
2259 #define READ_OPTIONS_MAX_VALUE READ_OPTIONS_INHERIT_ALL
2260
2261 #define READ_OPTION_MASK 3
2262 #define READ_OPTION_INHERIT 3
2263
2264 static void
set_port_read_option(SCM port,int option,int new_value)2265 set_port_read_option (SCM port, int option, int new_value)
2266 {
2267 SCM scm_read_options;
2268 unsigned int read_options;
2269
2270 new_value &= READ_OPTION_MASK;
2271
2272 scm_read_options = scm_i_port_property (port, sym_port_read_options);
2273 if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2274 read_options = scm_to_uint (scm_read_options);
2275 else
2276 read_options = READ_OPTIONS_INHERIT_ALL;
2277 read_options &= ~(READ_OPTION_MASK << option);
2278 read_options |= new_value << option;
2279 scm_read_options = scm_from_uint (read_options);
2280 scm_i_set_port_property_x (port, sym_port_read_options, scm_read_options);
2281 }
2282
2283 /* Set OPTS and PORT's case-insensitivity according to VALUE. */
2284 static void
set_port_case_insensitive_p(SCM port,scm_t_read_opts * opts,int value)2285 set_port_case_insensitive_p (SCM port, scm_t_read_opts *opts, int value)
2286 {
2287 value = !!value;
2288 opts->case_insensitive_p = value;
2289 set_port_read_option (port, READ_OPTION_CASE_INSENSITIVE_P, value);
2290 }
2291
2292 /* Set OPTS and PORT's square_brackets_p option according to VALUE. */
2293 static void
set_port_square_brackets_p(SCM port,scm_t_read_opts * opts,int value)2294 set_port_square_brackets_p (SCM port, scm_t_read_opts *opts, int value)
2295 {
2296 value = !!value;
2297 opts->square_brackets_p = value;
2298 set_port_read_option (port, READ_OPTION_SQUARE_BRACKETS_P, value);
2299 }
2300
2301 /* Set OPTS and PORT's curly_infix_p option according to VALUE. */
2302 static void
set_port_curly_infix_p(SCM port,scm_t_read_opts * opts,int value)2303 set_port_curly_infix_p (SCM port, scm_t_read_opts *opts, int value)
2304 {
2305 value = !!value;
2306 opts->curly_infix_p = value;
2307 set_port_read_option (port, READ_OPTION_CURLY_INFIX_P, value);
2308 }
2309
2310 /* Set OPTS and PORT's r6rs_hex_escapes_p option according to VALUE. */
2311 static void
set_port_r6rs_hex_escapes_p(SCM port,scm_t_read_opts * opts,int value)2312 set_port_r6rs_hex_escapes_p (SCM port, scm_t_read_opts *opts, int value)
2313 {
2314 value = !!value;
2315 opts->r6rs_escapes_p = value;
2316 set_port_read_option (port, READ_OPTION_R6RS_ESCAPES_P, value);
2317 }
2318
2319 static void
set_port_hungry_eol_escapes_p(SCM port,scm_t_read_opts * opts,int value)2320 set_port_hungry_eol_escapes_p (SCM port, scm_t_read_opts *opts, int value)
2321 {
2322 value = !!value;
2323 opts->hungry_eol_escapes_p = value;
2324 set_port_read_option (port, READ_OPTION_HUNGRY_EOL_ESCAPES_P, value);
2325 }
2326
2327 static void
set_port_keyword_style(SCM port,scm_t_read_opts * opts,enum t_keyword_style value)2328 set_port_keyword_style (SCM port, scm_t_read_opts *opts, enum t_keyword_style value)
2329 {
2330 opts->keyword_style = value;
2331 set_port_read_option (port, READ_OPTION_KEYWORD_STYLE, value);
2332 }
2333
2334 /* Initialize OPTS based on PORT's read options and the global read
2335 options. */
2336 static void
init_read_options(SCM port,scm_t_read_opts * opts)2337 init_read_options (SCM port, scm_t_read_opts *opts)
2338 {
2339 SCM val, scm_read_options;
2340 unsigned int read_options, x;
2341
2342 scm_read_options = scm_i_port_property (port, sym_port_read_options);
2343
2344 if (scm_is_unsigned_integer (scm_read_options, 0, READ_OPTIONS_MAX_VALUE))
2345 read_options = scm_to_uint (scm_read_options);
2346 else
2347 read_options = READ_OPTIONS_INHERIT_ALL;
2348
2349 x = READ_OPTION_MASK & (read_options >> READ_OPTION_KEYWORD_STYLE);
2350 if (x == READ_OPTION_INHERIT)
2351 {
2352 val = SCM_PACK (SCM_KEYWORD_STYLE);
2353 if (scm_is_eq (val, scm_keyword_prefix))
2354 x = KEYWORD_STYLE_PREFIX;
2355 else if (scm_is_eq (val, scm_keyword_postfix))
2356 x = KEYWORD_STYLE_POSTFIX;
2357 else
2358 x = KEYWORD_STYLE_HASH_PREFIX;
2359 }
2360 opts->keyword_style = x;
2361
2362 #define RESOLVE_BOOLEAN_OPTION(NAME, name) \
2363 do \
2364 { \
2365 x = READ_OPTION_MASK & (read_options >> READ_OPTION_ ## NAME); \
2366 if (x == READ_OPTION_INHERIT) \
2367 x = !!SCM_ ## NAME; \
2368 opts->name = x; \
2369 } \
2370 while (0)
2371
2372 RESOLVE_BOOLEAN_OPTION (COPY_SOURCE_P, copy_source_p);
2373 RESOLVE_BOOLEAN_OPTION (RECORD_POSITIONS_P, record_positions_p);
2374 RESOLVE_BOOLEAN_OPTION (CASE_INSENSITIVE_P, case_insensitive_p);
2375 RESOLVE_BOOLEAN_OPTION (R6RS_ESCAPES_P, r6rs_escapes_p);
2376 RESOLVE_BOOLEAN_OPTION (SQUARE_BRACKETS_P, square_brackets_p);
2377 RESOLVE_BOOLEAN_OPTION (HUNGRY_EOL_ESCAPES_P, hungry_eol_escapes_p);
2378 RESOLVE_BOOLEAN_OPTION (CURLY_INFIX_P, curly_infix_p);
2379 RESOLVE_BOOLEAN_OPTION (R7RS_SYMBOLS_P, r7rs_symbols_p);
2380
2381 #undef RESOLVE_BOOLEAN_OPTION
2382
2383 opts->neoteric_p = 0;
2384 }
2385
2386 void
scm_init_read()2387 scm_init_read ()
2388 {
2389 SCM read_hash_procs;
2390
2391 read_hash_procs = scm_make_fluid_with_default (SCM_EOL);
2392
2393 scm_i_read_hash_procedures =
2394 SCM_VARIABLE_LOC (scm_c_define ("%read-hash-procedures", read_hash_procs));
2395
2396 scm_init_opts (scm_read_options, scm_read_opts);
2397 #include "libguile/read.x"
2398 }
2399
2400 /*
2401 Local Variables:
2402 c-file-style: "gnu"
2403 End:
2404 */
2405