1;;;
2;;; parse.scm - utilities to parse input
3;;;
4;;;   Copyright (c) 2000-2020  Shiro Kawai  <shiro@acm.org>
5;;;
6;;;   Redistribution and use in source and binary forms, with or without
7;;;   modification, are permitted provided that the following conditions
8;;;   are met:
9;;;
10;;;   1. Redistributions of source code must retain the above copyright
11;;;      notice, this list of conditions and the following disclaimer.
12;;;
13;;;   2. Redistributions in binary form must reproduce the above copyright
14;;;      notice, this list of conditions and the following disclaimer in the
15;;;      documentation and/or other materials provided with the distribution.
16;;;
17;;;   3. Neither the name of the authors nor the names of its contributors
18;;;      may be used to endorse or promote products derived from this
19;;;      software without specific prior written permission.
20;;;
21;;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22;;;   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23;;;   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24;;;   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25;;;   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26;;;   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
27;;;   TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28;;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29;;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30;;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31;;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32;;;
33
34;; This module implements the input parsing utilities described in Oleg's site
35;;  http://pobox.com/~oleg/ftp
36;; (follow the link of "Scheme Code -> Input parsing")
37;;
38;; The functions are API compatible with Oleg's library.  The performance of
39;; this module is critical for the programs that parses large amount of
40;; text, so the code here is tuned specifially to the Gauche compiler
41;; to generate fast code.  (NB: We use some undocumented, experimenal
42;; features.  Do not copy these techniques casually; we'll change the feature
43;; at any time.)
44
45(define-module text.parse
46  (use srfi-13)
47  (use srfi-14)
48  (use util.match)
49  (export find-string-from-port?
50          assert-curr-char
51          skip-until
52          skip-while
53          peek-next-char
54          next-token
55          next-token-of
56          read-string)
57  )
58(select-module text.parse)
59
60(define ppp port-position-prefix)       ;for conciseness
61
62(define (find-string-from-port? str in-port :optional (max-no-chars #f))
63
64  (if (string-null? str)
65    0                               ;special case
66    (let ((restart (make-kmp-restart-vector str))
67          (pattern (list->vector (string->list str)))
68          (patlen  (string-length str)))
69
70      (define (scan patpos count char)
71        (cond ((eof-object? char) #f)
72              ((char=? char (vector-ref pattern patpos))
73               (if (= patpos (- patlen 1))
74                 count
75                 (scan (+ patpos 1) (+ count 1) (read-char in-port))))
76              ((and max-no-chars (>= count max-no-chars)) #f)
77              ((= patpos 0)
78               (scan 0 (+ count 1) (read-char in-port)))
79              (else
80               (scan (vector-ref restart patpos) count char))
81              ))
82
83      (scan 0 1 (read-char in-port))
84      )))
85
86;; Given CHAR-LIST, returns a predicate that takes CHAR-LIST and a character,
87;; and see if a character is included in the CHAR-LIST.
88;; Oleg's original utility only allows characters and symbol *eof* in
89;; CHAR-LIST.  We allow a single character set, or a list of mixture of
90;; characters, character sets and symbol *eof*.
91;;
92;; This function, and the resulting predicate, can be called frequently
93;; (it's O(n) where n is the size of the input to parse).  So we avoid
94;; allocation, including closure creation.  Inlining char-list-predicate
95;; also allows compile-time evaluation in majority of cases when char-list
96;; is constant.
97(define-inline (char-list-predicate char-list)
98  (cond
99   [(char-set? char-list) char-list-contains?/char-set]
100   [(not (list? char-list))
101    (error "CHAR-LIST must be a char-set or a list of characters, \
102            char-sets and/or symbol '*eof*" char-list)]
103   [(and (pair? char-list)              ; this pattern is generated by the
104         (char-set? (car char-list))    ; compiler macros.
105         (pair? (cdr char-list))
106         (null? (cddr char-list))
107         (eq? '*eof* (cadr char-list)))
108    char-list-contains?/char-set/eof]
109   [(memq '*eof* char-list)
110    (if (every character-or-eof? char-list)
111      char-list-contains?/chars/eof
112      char-list-contains?/eof)]
113   [(every char? char-list) char-list-contains?/chars]
114   [else char-list-contains?]))
115
116(define character-or-eof? (any-pred eof-object? char?))
117
118(define (char-list-contains?/char-set char-list char)
119  (and (char? char) (char-set-contains? char-list char)))
120(define (char-list-contains?/char-set/eof char-list char) ; (#[...] *eof*)
121  (or (eof-object? char)
122      (and (char? char) (char-set-contains? (car char-list) char))))
123(define (char-list-contains?/empty char-list char) #f)
124(define (char-list-contains?/chars char-list char) (memv char char-list))
125(define (char-list-contains?/chars/eof char-list char)
126  (or (eof-object? char) (memv char char-list)))
127(define (char-list-contains?/eof char-list char)
128  (or (eof-object? char) (char-list-contains? char-list char)))
129(define (char-list-contains? char-list char) ;generic version
130  (let loop ((cs char-list))
131    (if (null? cs)
132      #f
133      (or (eqv? (car cs) char)
134          (and (char-set? (car cs))
135               (char-list-contains?/char-set (car cs) char))
136          (loop (cdr cs))))))
137
138;; Common routine for the compiler macros.
139(eval-when (:compile-toplevel :load-toplevel)
140  (define (prefold-char-list char-list)
141    (and (list? char-list)
142         (cond [(every char? char-list) (apply char-set char-list)]
143               [(every (any-pred char? (cut eq? <> '*eof*)) char-list)
144                (list (apply char-set (delete '*eof* char-list)) '*eof*)]
145               [else #f])))
146
147  (define (prefold-macro-1 form r c)
148    (match form
149      [(op ('quote cs) . args)
150       (or (and-let* ([cs. (prefold-char-list cs)])
151             `(,(r (symbol-append '% op)) ',cs. ,@args))
152           `(,(r (symbol-append '% op)) ',cs ,@args))]
153      [(op . x) `(,(r (symbol-append '% op)) ,@x)]))
154
155  (define (prefold-macro-2 form r c)
156    (match form
157      [(op ('quote cs1) ('quote cs2) . args)
158       (or (and-let* ([cs1. (prefold-char-list cs1)]
159                      [cs2. (prefold-char-list cs2)])
160             `(,(r (symbol-append '% op)) ',cs1. ',cs2. ,@args))
161           `(,(r (symbol-append '% op)) ',cs1 ',cs2 ,@args))]
162      [(op . x) `(,(r (symbol-append '% op)) ,@x)]))
163  )
164
165;; ASSERT-CURR-CHAR <char-list> <string> :optional <port>
166(define-inline (%assert-curr-char char-list string
167                                  :optional (port (current-input-port)))
168  (define pred (char-list-predicate char-list))
169  (rlet1 c (read-char port)
170    (unless (pred char-list c)
171      (errorf "~awrong character ~s ~a. ~s expected."
172              (ppp port) c string char-list))))
173
174(define-hybrid-syntax assert-curr-char
175  %assert-curr-char (er-macro-transformer prefold-macro-1))
176
177;; SKIP-UNTIL <char-list/number/pred> :optional <port>
178(define-inline (%skip-until char-list/number/pred
179                            :optional (port (current-input-port)))
180  (cond
181   [(number? char-list/number/pred)
182    (skip-until/number char-list/number/pred port)]
183   [(procedure? char-list/number/pred)
184    (skip-until/pred char-list/number/pred port)]
185   [else
186    (skip-until/char-list (char-list-predicate char-list/number/pred)
187                          char-list/number/pred port)]))
188
189(define-hybrid-syntax skip-until
190  %skip-until (er-macro-transformer prefold-macro-1))
191
192(define (skip-until/number num port)
193  (and (<= 1 num)
194       (let loop ([i 1] [c (read-char port)])
195         (cond [(eof-object? c) (errorf "~aunexpected EOF" (ppp port))]
196               [(>= i num) #f]
197               [else (loop (+ i 1) (read-char port))]))))
198
199(define-inline (skip-until/common pred port)
200  (let loop ([c (read-char port)])
201    (cond [(pred c) c]
202          [(eof-object? c) (errorf "~aunexpected EOF" (ppp port))]
203          [else (loop (read-char port))])))
204(define skip-until/pred skip-until/common);trick to prevent excessive inlining
205(define (skip-until/char-list pred char-list port)
206  (skip-until/common (cut pred char-list <>) port))
207
208
209;; SKIP-WHILE <char-list/pred> :optional <port>
210(define-inline (%skip-while char-list/pred
211                            :optional (port (current-input-port)))
212  (cond
213   [(procedure? char-list/pred) (skip-while/pred char-list/pred port)]
214   [else (skip-while/char-list (char-list-predicate char-list/pred)
215                               char-list/pred port)]))
216
217(define-hybrid-syntax skip-while
218  %skip-while (er-macro-transformer prefold-macro-1))
219
220(define-inline (skip-while/common pred port)
221  (let loop ([c (peek-char port)])
222    (cond [(pred c) (read-char port) (loop (peek-char port))]
223          [else c])))
224(define skip-while/pred skip-while/common)
225(define (skip-while/char-list pred char-list port)
226  (skip-while/common (cut pred char-list <>) port))
227
228;; PEEK-NEXT-CHAR :optional <port>
229(define-inline (peek-next-char :optional (port (current-input-port)))
230  (read-char port)
231  (peek-char port))
232
233;; NEXT-TOKEN <prefix-char-list/pred> <break-char-list/pred>
234;;            :optional <comment> <port>
235(define-inline (%next-token prefix-char-list/pred break-char-list/pred
236                            :optional (comment "unexpected EOF")
237                                      (port (current-input-port)))
238  (let1 c (skip-while prefix-char-list/pred port)
239    (if (procedure? break-char-list/pred)
240      (next-token/pred break-char-list/pred c port comment)
241      (next-token/char-list (char-list-predicate break-char-list/pred)
242                            break-char-list/pred c port comment))))
243
244(define-hybrid-syntax next-token
245  %next-token (er-macro-transformer prefold-macro-2))
246
247(define-inline (next-token/common break-pred char port errmsg)
248  (define o (open-output-string))
249  (let loop ([c char])
250    (cond [(break-pred c) (get-output-string o)]
251          [(eof-object? c) (errorf "~a~a" (ppp port) errmsg)]
252          [else (write-char c o) (read-char port) (loop (peek-char port))])))
253(define next-token/pred next-token/common)
254(define (next-token/char-list pred char-list char port errmsg)
255  (next-token/common (cut pred char-list <>) char port errmsg))
256
257;; NEXT-TOKEN-OF <char-list/pred> :optional <port>
258(define-inline (%next-token-of char-list/pred
259                              :optional (port (current-input-port)))
260  (if (procedure? char-list/pred)
261    (next-token-of/pred char-list/pred port)
262    (next-token-of/char-list (char-list-predicate char-list/pred)
263                             char-list/pred port)))
264
265(define-hybrid-syntax next-token-of
266  %next-token-of (er-macro-transformer prefold-macro-1))
267
268(define-inline (next-token-of/common pred port)
269  (define o (open-output-string))
270  (let loop ([c (peek-char port)])
271    (cond [(or (eof-object? c) (not (pred c))) (get-output-string o)]
272          [else (write-char c o) (read-char port) (loop (peek-char port))])))
273(define next-token-of/pred next-token-of/common)
274(define (next-token-of/char-list pred char-list port)
275  (next-token-of/common (cut pred char-list <>) port))
276
277
278;; read-line is built-in.
279
280;; this is slightly different from built-in read-string
281(define (read-string n :optional (port (current-input-port)))
282  (let1 s ((with-module gauche read-string) n port)
283    (if (eof-object? s)
284      ""
285      s)))
286