1;;; 2;;; parse.scm - utilities to parse input 3;;; 4;;; Copyright (c) 2000-2020 Shiro Kawai <shiro@acm.org> 5;;; 6;;; Redistribution and use in source and binary forms, with or without 7;;; modification, are permitted provided that the following conditions 8;;; are met: 9;;; 10;;; 1. Redistributions of source code must retain the above copyright 11;;; notice, this list of conditions and the following disclaimer. 12;;; 13;;; 2. Redistributions in binary form must reproduce the above copyright 14;;; notice, this list of conditions and the following disclaimer in the 15;;; documentation and/or other materials provided with the distribution. 16;;; 17;;; 3. Neither the name of the authors nor the names of its contributors 18;;; may be used to endorse or promote products derived from this 19;;; software without specific prior written permission. 20;;; 21;;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22;;; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23;;; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24;;; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25;;; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26;;; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 27;;; TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 28;;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 29;;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 30;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 31;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32;;; 33 34;; This module implements the input parsing utilities described in Oleg's site 35;; http://pobox.com/~oleg/ftp 36;; (follow the link of "Scheme Code -> Input parsing") 37;; 38;; The functions are API compatible with Oleg's library. The performance of 39;; this module is critical for the programs that parses large amount of 40;; text, so the code here is tuned specifially to the Gauche compiler 41;; to generate fast code. (NB: We use some undocumented, experimenal 42;; features. Do not copy these techniques casually; we'll change the feature 43;; at any time.) 44 45(define-module text.parse 46 (use srfi-13) 47 (use srfi-14) 48 (use util.match) 49 (export find-string-from-port? 50 assert-curr-char 51 skip-until 52 skip-while 53 peek-next-char 54 next-token 55 next-token-of 56 read-string) 57 ) 58(select-module text.parse) 59 60(define ppp port-position-prefix) ;for conciseness 61 62(define (find-string-from-port? str in-port :optional (max-no-chars #f)) 63 64 (if (string-null? str) 65 0 ;special case 66 (let ((restart (make-kmp-restart-vector str)) 67 (pattern (list->vector (string->list str))) 68 (patlen (string-length str))) 69 70 (define (scan patpos count char) 71 (cond ((eof-object? char) #f) 72 ((char=? char (vector-ref pattern patpos)) 73 (if (= patpos (- patlen 1)) 74 count 75 (scan (+ patpos 1) (+ count 1) (read-char in-port)))) 76 ((and max-no-chars (>= count max-no-chars)) #f) 77 ((= patpos 0) 78 (scan 0 (+ count 1) (read-char in-port))) 79 (else 80 (scan (vector-ref restart patpos) count char)) 81 )) 82 83 (scan 0 1 (read-char in-port)) 84 ))) 85 86;; Given CHAR-LIST, returns a predicate that takes CHAR-LIST and a character, 87;; and see if a character is included in the CHAR-LIST. 88;; Oleg's original utility only allows characters and symbol *eof* in 89;; CHAR-LIST. We allow a single character set, or a list of mixture of 90;; characters, character sets and symbol *eof*. 91;; 92;; This function, and the resulting predicate, can be called frequently 93;; (it's O(n) where n is the size of the input to parse). So we avoid 94;; allocation, including closure creation. Inlining char-list-predicate 95;; also allows compile-time evaluation in majority of cases when char-list 96;; is constant. 97(define-inline (char-list-predicate char-list) 98 (cond 99 [(char-set? char-list) char-list-contains?/char-set] 100 [(not (list? char-list)) 101 (error "CHAR-LIST must be a char-set or a list of characters, \ 102 char-sets and/or symbol '*eof*" char-list)] 103 [(and (pair? char-list) ; this pattern is generated by the 104 (char-set? (car char-list)) ; compiler macros. 105 (pair? (cdr char-list)) 106 (null? (cddr char-list)) 107 (eq? '*eof* (cadr char-list))) 108 char-list-contains?/char-set/eof] 109 [(memq '*eof* char-list) 110 (if (every character-or-eof? char-list) 111 char-list-contains?/chars/eof 112 char-list-contains?/eof)] 113 [(every char? char-list) char-list-contains?/chars] 114 [else char-list-contains?])) 115 116(define character-or-eof? (any-pred eof-object? char?)) 117 118(define (char-list-contains?/char-set char-list char) 119 (and (char? char) (char-set-contains? char-list char))) 120(define (char-list-contains?/char-set/eof char-list char) ; (#[...] *eof*) 121 (or (eof-object? char) 122 (and (char? char) (char-set-contains? (car char-list) char)))) 123(define (char-list-contains?/empty char-list char) #f) 124(define (char-list-contains?/chars char-list char) (memv char char-list)) 125(define (char-list-contains?/chars/eof char-list char) 126 (or (eof-object? char) (memv char char-list))) 127(define (char-list-contains?/eof char-list char) 128 (or (eof-object? char) (char-list-contains? char-list char))) 129(define (char-list-contains? char-list char) ;generic version 130 (let loop ((cs char-list)) 131 (if (null? cs) 132 #f 133 (or (eqv? (car cs) char) 134 (and (char-set? (car cs)) 135 (char-list-contains?/char-set (car cs) char)) 136 (loop (cdr cs)))))) 137 138;; Common routine for the compiler macros. 139(eval-when (:compile-toplevel :load-toplevel) 140 (define (prefold-char-list char-list) 141 (and (list? char-list) 142 (cond [(every char? char-list) (apply char-set char-list)] 143 [(every (any-pred char? (cut eq? <> '*eof*)) char-list) 144 (list (apply char-set (delete '*eof* char-list)) '*eof*)] 145 [else #f]))) 146 147 (define (prefold-macro-1 form r c) 148 (match form 149 [(op ('quote cs) . args) 150 (or (and-let* ([cs. (prefold-char-list cs)]) 151 `(,(r (symbol-append '% op)) ',cs. ,@args)) 152 `(,(r (symbol-append '% op)) ',cs ,@args))] 153 [(op . x) `(,(r (symbol-append '% op)) ,@x)])) 154 155 (define (prefold-macro-2 form r c) 156 (match form 157 [(op ('quote cs1) ('quote cs2) . args) 158 (or (and-let* ([cs1. (prefold-char-list cs1)] 159 [cs2. (prefold-char-list cs2)]) 160 `(,(r (symbol-append '% op)) ',cs1. ',cs2. ,@args)) 161 `(,(r (symbol-append '% op)) ',cs1 ',cs2 ,@args))] 162 [(op . x) `(,(r (symbol-append '% op)) ,@x)])) 163 ) 164 165;; ASSERT-CURR-CHAR <char-list> <string> :optional <port> 166(define-inline (%assert-curr-char char-list string 167 :optional (port (current-input-port))) 168 (define pred (char-list-predicate char-list)) 169 (rlet1 c (read-char port) 170 (unless (pred char-list c) 171 (errorf "~awrong character ~s ~a. ~s expected." 172 (ppp port) c string char-list)))) 173 174(define-hybrid-syntax assert-curr-char 175 %assert-curr-char (er-macro-transformer prefold-macro-1)) 176 177;; SKIP-UNTIL <char-list/number/pred> :optional <port> 178(define-inline (%skip-until char-list/number/pred 179 :optional (port (current-input-port))) 180 (cond 181 [(number? char-list/number/pred) 182 (skip-until/number char-list/number/pred port)] 183 [(procedure? char-list/number/pred) 184 (skip-until/pred char-list/number/pred port)] 185 [else 186 (skip-until/char-list (char-list-predicate char-list/number/pred) 187 char-list/number/pred port)])) 188 189(define-hybrid-syntax skip-until 190 %skip-until (er-macro-transformer prefold-macro-1)) 191 192(define (skip-until/number num port) 193 (and (<= 1 num) 194 (let loop ([i 1] [c (read-char port)]) 195 (cond [(eof-object? c) (errorf "~aunexpected EOF" (ppp port))] 196 [(>= i num) #f] 197 [else (loop (+ i 1) (read-char port))])))) 198 199(define-inline (skip-until/common pred port) 200 (let loop ([c (read-char port)]) 201 (cond [(pred c) c] 202 [(eof-object? c) (errorf "~aunexpected EOF" (ppp port))] 203 [else (loop (read-char port))]))) 204(define skip-until/pred skip-until/common);trick to prevent excessive inlining 205(define (skip-until/char-list pred char-list port) 206 (skip-until/common (cut pred char-list <>) port)) 207 208 209;; SKIP-WHILE <char-list/pred> :optional <port> 210(define-inline (%skip-while char-list/pred 211 :optional (port (current-input-port))) 212 (cond 213 [(procedure? char-list/pred) (skip-while/pred char-list/pred port)] 214 [else (skip-while/char-list (char-list-predicate char-list/pred) 215 char-list/pred port)])) 216 217(define-hybrid-syntax skip-while 218 %skip-while (er-macro-transformer prefold-macro-1)) 219 220(define-inline (skip-while/common pred port) 221 (let loop ([c (peek-char port)]) 222 (cond [(pred c) (read-char port) (loop (peek-char port))] 223 [else c]))) 224(define skip-while/pred skip-while/common) 225(define (skip-while/char-list pred char-list port) 226 (skip-while/common (cut pred char-list <>) port)) 227 228;; PEEK-NEXT-CHAR :optional <port> 229(define-inline (peek-next-char :optional (port (current-input-port))) 230 (read-char port) 231 (peek-char port)) 232 233;; NEXT-TOKEN <prefix-char-list/pred> <break-char-list/pred> 234;; :optional <comment> <port> 235(define-inline (%next-token prefix-char-list/pred break-char-list/pred 236 :optional (comment "unexpected EOF") 237 (port (current-input-port))) 238 (let1 c (skip-while prefix-char-list/pred port) 239 (if (procedure? break-char-list/pred) 240 (next-token/pred break-char-list/pred c port comment) 241 (next-token/char-list (char-list-predicate break-char-list/pred) 242 break-char-list/pred c port comment)))) 243 244(define-hybrid-syntax next-token 245 %next-token (er-macro-transformer prefold-macro-2)) 246 247(define-inline (next-token/common break-pred char port errmsg) 248 (define o (open-output-string)) 249 (let loop ([c char]) 250 (cond [(break-pred c) (get-output-string o)] 251 [(eof-object? c) (errorf "~a~a" (ppp port) errmsg)] 252 [else (write-char c o) (read-char port) (loop (peek-char port))]))) 253(define next-token/pred next-token/common) 254(define (next-token/char-list pred char-list char port errmsg) 255 (next-token/common (cut pred char-list <>) char port errmsg)) 256 257;; NEXT-TOKEN-OF <char-list/pred> :optional <port> 258(define-inline (%next-token-of char-list/pred 259 :optional (port (current-input-port))) 260 (if (procedure? char-list/pred) 261 (next-token-of/pred char-list/pred port) 262 (next-token-of/char-list (char-list-predicate char-list/pred) 263 char-list/pred port))) 264 265(define-hybrid-syntax next-token-of 266 %next-token-of (er-macro-transformer prefold-macro-1)) 267 268(define-inline (next-token-of/common pred port) 269 (define o (open-output-string)) 270 (let loop ([c (peek-char port)]) 271 (cond [(or (eof-object? c) (not (pred c))) (get-output-string o)] 272 [else (write-char c o) (read-char port) (loop (peek-char port))]))) 273(define next-token-of/pred next-token-of/common) 274(define (next-token-of/char-list pred char-list port) 275 (next-token-of/common (cut pred char-list <>) port)) 276 277 278;; read-line is built-in. 279 280;; this is slightly different from built-in read-string 281(define (read-string n :optional (port (current-input-port))) 282 (let1 s ((with-module gauche read-string) n port) 283 (if (eof-object? s) 284 "" 285 s))) 286