1#lang racket/base
2(require "config.rkt"
3         "special.rkt"
4         "wrap.rkt"
5         "readtable.rkt"
6         "consume.rkt"
7         "accum-string.rkt"
8         "error.rkt"
9         "digit.rkt")
10
11(provide read-string
12         read-here-string)
13
14(define (read-string in config #:mode [mode 'string])
15  (define source (read-config-source config))
16  (define-values (open-end-line open-end-col open-end-pos) (port-next-location in))
17  (define accum-str (accum-string-init! config))
18  (define (bad-end c)
19    (cond
20     [(eof-object? c)
21      (reader-error in config #:due-to c #:end-pos open-end-pos "expected a closing `\"`")]
22     [else
23      (reader-error in config #:due-to c
24                    "found non-character while reading a ~a"
25                    mode)]))
26  (let loop ()
27    (define c (read-char/special in config source))
28    ;; Note: readtable is not used for a closing " or other string decisions
29    (cond
30     [(not (char? c))
31      (bad-end c)]
32     [(char=? #\\ c)
33      (define escaping-c c)
34      (define escaped-c (read-char/special in config source))
35      (when (not (char? escaped-c))
36        (bad-end escaped-c))
37      (define (unknown-error)
38        (reader-error in config
39                      "unknown escape sequence `~a~a` in ~a"
40                      escaping-c escaped-c
41                      mode))
42      (case escaped-c
43        [(#\\ #\" #\')
44         (accum-string-add! accum-str escaped-c)]
45        [(#\a) (accum-string-add! accum-str #\u7)]
46        [(#\b) (accum-string-add! accum-str #\backspace)]
47        [(#\t) (accum-string-add! accum-str #\tab)]
48        [(#\n) (accum-string-add! accum-str #\newline)]
49        [(#\v) (accum-string-add! accum-str #\vtab)]
50        [(#\f) (accum-string-add! accum-str #\page)]
51        [(#\r) (accum-string-add! accum-str #\return)]
52        [(#\e) (accum-string-add! accum-str #\u1B)]
53        [(#\newline) (void)]
54        [(#\return)
55         (define maybe-newline-c (peek-char/special in config 0 source))
56         (when (eqv? maybe-newline-c #\newline)
57           (consume-char in maybe-newline-c))
58         (void)]
59        [(#\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7)
60         ;; Octal (valid if <= 255)
61         (define pos (accum-string-count accum-str))
62         (accum-string-add! accum-str escaped-c)
63         (define init-v (digit->number escaped-c))
64         (define v (read-digits in config accum-str #:base 8 #:max-count 2
65                                #:init init-v
66                                #:zero-digits-result init-v))
67         (unless (v . <= . 255)
68           (reader-error in config
69                         "escape sequence `~a~a` is out of range in ~a"
70                         escaping-c (accum-string-get! accum-str config #:start-pos pos)
71                         mode))
72         (set-accum-string-count! accum-str pos)
73         (accum-string-add! accum-str (integer->char v))]
74        [(#\x)
75         ;; Hex, two characters (always valid)
76         (define pos (accum-string-count accum-str))
77         (define v (read-digits in config accum-str #:base 16 #:max-count 2))
78         (unless (integer? v) (no-hex-digits in config v escaping-c escaped-c))
79         (set-accum-string-count! accum-str pos)
80         (accum-string-add! accum-str (integer->char v))]
81        [(#\u)
82         ;; Hex, four characters (valid if not surrogate or if surrogate pair)
83         (unless (eq? mode 'string) (unknown-error))
84         (define pos (accum-string-count accum-str))
85         (define v (read-digits in config accum-str #:base 16 #:max-count 4))
86         (unless (integer? v) (no-hex-digits in config v escaping-c escaped-c))
87         (cond
88          [(or (v . < . #xD800) (v . > . #xDFFF))
89           ;; Normal \u escape
90           (set-accum-string-count! accum-str pos)
91           (accum-string-add! accum-str (integer->char v))]
92          [else
93           ;; Maybe a surrogate-pair encoding
94           (define (next!)
95             (define next-c (read-char/special in config source))
96             (when (char? next-c)
97               (accum-string-add! accum-str next-c))
98             next-c)
99           (define v2
100             (let ([next-c (next!)])
101               (cond
102                [(char=? next-c #\\)
103                 (define next-c (next!))
104                 (cond
105                  [(char=? next-c #\u)
106                   (define v2 (read-digits in config accum-str #:base 16 #:max-count 4))
107                   (cond
108                    [(integer? v2)
109                     (and (v2 . >= . #xDC00)
110                          (v2 . <= . #xDFFF)
111                          v2)]
112                    [else v2])]    ; maybe EOF
113                  [else next-c])]  ; maybe EOF
114                [else next-c]))) ; maybe EOF
115           (cond
116            [(integer? v2)
117             (define combined-v (+ (arithmetic-shift (- v #xD800) 10)
118                                   (- v2 #xDC00)
119                                   #x10000))
120             (cond
121              [(combined-v . > . #x10FFFF)
122               (reader-error in config
123                             "escape sequence `~au~a` is out of range in string"
124                             escaping-c (accum-string-get! accum-str config #:start-pos pos))]
125              [else
126               (set-accum-string-count! accum-str pos)
127               (accum-string-add! accum-str (integer->char combined-v))])]
128            [else
129             (reader-error in config
130                           #:due-to v2
131                           "bad or incomplete surrogate-style encoding at `~au~a`"
132                           escaping-c (accum-string-get! accum-str config #:start-pos pos))])])]
133        [(#\U)
134         (unless (eq? mode 'string) (unknown-error))
135         (define pos (accum-string-count accum-str))
136         (define v (read-digits in config accum-str #:base 16 #:max-count 8))
137         (unless (integer? v) (no-hex-digits in config v escaping-c escaped-c))
138         (cond
139          [(and (or (v . < . #xD800) (v . > . #xDFFF))
140                (v . <= . #x10FFFF))
141           (set-accum-string-count! accum-str pos)
142           (accum-string-add! accum-str (integer->char v))]
143          [else
144           (reader-error in config
145                         "escape sequence `~aU~a` is out of range in string"
146                         escaping-c (accum-string-get! accum-str config #:start-pos pos))])]
147        [else (unknown-error)])
148      (loop)]
149     [(char=? #\" c)
150      null]
151     [else
152      (when (eq? mode '|byte string|)
153        (unless (byte? (char->integer c))
154          (reader-error in config
155                        "character `~a` is out of range in byte string"
156                        c)))
157      (accum-string-add! accum-str c)
158      (loop)]))
159  (define str (if (eq? mode '|byte string|)
160                  (accum-string-get-bytes! accum-str config)
161                  (accum-string-get! accum-str config)))
162  (wrap str
163        in
164        config
165        str))
166
167;; ----------------------------------------
168
169(define (read-here-string in config)
170  (define source (read-config-source config))
171  (define-values (open-end-line open-end-col open-end-pos) (port-next-location in))
172  (define accum-str (accum-string-init! config))
173
174  ;; Parse terminator
175  (define full-terminator
176    (cons
177     #\newline ;; assumption below that this character is first
178     (let loop ()
179       (define c (read-char/special in config source))
180       (cond
181         [(eof-object? c)
182          (reader-error in config #:due-to c
183                        "found end-of-file after `#<<` and before a newline")]
184         [(not (char? c))
185          (reader-error in config #:due-to c
186                        "found non-character while reading `#<<`")]
187         [(char=? c #\newline) null]
188         [else (cons c (loop))]))))
189
190  ;; Get string content.
191  ;; We just saw a newline that could be considered the start of an
192  ;; immediate `full-terminator`.
193  (let loop ([terminator (cdr full-terminator)] [terminator-accum null])
194    (define c (read-char/special in config source))
195    (cond
196     [(eof-object? c)
197      (unless (null? terminator)
198        (reader-error in config #:due-to c #:end-pos open-end-pos
199                      "found end-of-file before terminating `~a`"
200                      (list->string (cdr full-terminator))))]
201     [(not (char? c))
202      (reader-error in config #:due-to c
203                    "found non-character while reading `#<<`")]
204     [(and (pair? terminator)
205           (char=? c (car terminator)))
206      (loop (cdr terminator) (cons (car terminator) terminator-accum))]
207     [(and (null? terminator)
208           (char=? c #\newline))
209      (void)]
210     [else
211      (unless (null? terminator-accum)
212        (for ([c (in-list (reverse terminator-accum))])
213          (accum-string-add! accum-str c)))
214      (cond
215        [(char=? c #\newline)
216         ;; assume `full-terminator` starts with #\newline
217         (loop (cdr full-terminator) (list #\newline))]
218        [else
219         (accum-string-add! accum-str c)
220         (loop full-terminator null)])]))
221
222  ;; Done
223  (define str (accum-string-get! accum-str config))
224  (wrap str
225        in
226        config
227        str))
228
229;; ----------------------------------------
230
231(define (no-hex-digits in config c escaping-c escaped-c)
232  (reader-error in config
233                #:due-to c
234                "no hex digit following `~a~a`"
235                escaping-c escaped-c))
236