1;;; 2;;; SRFI-115 Scheme Regular Expressions 3;;; 4;;; Copyright (c) 2019 Duy Nguyen <pclouds@gmail.com> 5;;; 6;;; Redistribution and use in source and binary forms, with or without 7;;; modification, are permitted provided that the following conditions 8;;; are met: 9;;; 10;;; 1. Redistributions of source code must retain the above copyright 11;;; notice, this list of conditions and the following disclaimer. 12;;; 13;;; 2. Redistributions in binary form must reproduce the above copyright 14;;; notice, this list of conditions and the following disclaimer in the 15;;; documentation and/or other materials provided with the distribution. 16;;; 17;;; 3. Neither the name of the authors nor the names of its contributors 18;;; may be used to endorse or promote products derived from this 19;;; software without specific prior written permission. 20;;; 21;;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22;;; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23;;; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24;;; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25;;; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26;;; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 27;;; TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 28;;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 29;;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 30;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 31;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32;;; 33 34(define-module srfi-115 35 (use scheme.charset) 36 (use gauche.regexp.sre) 37 (export rx regexp regexp->sre char-set->sre valid-sre? 38 regexp-matches regexp-matches? regexp-search 39 regexp-fold regexp-extract regexp-split regexp-partition 40 regexp-replace regexp-replace-all 41 regexp-match? regexp-match-count regexp-match-submatch 42 regexp-match-submatch-start regexp-match-submatch-end 43 regexp-match->list)) 44(select-module srfi-115) 45 46(define-syntax rx 47 (syntax-rules () 48 [(_ sre ...) (regexp `(: sre ...))])) 49 50(define (regexp re) 51 (if (regexp? re) re (sre->regexp re))) 52 53(define (regexp->sre re) 54 (regexp-unparse-sre (regexp-ast re))) 55 56(define (char-set->sre cs) 57 (list (list->string (char-set->list cs)))) 58 59(define (valid-sre? sre) 60 (guard (e [(<regexp-invalid-sre> e) #f][else (raise e)]) 61 (regexp-parse-sre sre) 62 #t)) 63 64;; regexp? is already defined by gauche.regexp and is builtin. 65;; do we even need to export it in pure R7RS environment? 66 67(define (regexp-matches re str :optional start end) 68 (if (regexp? re) 69 (let1 match (regexp-search re str start end) 70 (if (and match 71 (zero? (rxmatch-start match)) 72 (eq? (rxmatch-end match) 73 (if (undefined? end) 74 (string-length str) 75 end))) 76 match 77 #f)) 78 ;; if re is not compiled yet, insert bos to take advantage of 79 ;; BOL_ANCHORED optimization. 80 (regexp-search `(: bos ,re eos) str start end))) 81 82(define (regexp-matches? re str :optional start end) 83 (if (regexp-matches re str start end) #t #f)) 84 85(define (regexp-search re str :optional start end) 86 (rxmatch (regexp re) str start end)) 87 88(define (regexp-fold re kons knil str :optional finish start end) 89 (cond 90 [(undefined? finish) 91 (regexp-fold re kons knil str (lambda (from match str acc) 92 acc))] 93 [(undefined? start) 94 (regexp-fold re kons knil str finish 0)] 95 [(undefined? end) 96 (regexp-fold re kons knil str finish 0 (string-length str))] 97 [(not (regexp? re)) 98 (regexp-fold (regexp re) kons knil str finish 0 end)] 99 [else 100 (let loop ([start start] 101 [last-end start] 102 [acc knil]) 103 (let1 match (and (< start end) 104 (regexp-search re str start end)) 105 (cond 106 [(not match) 107 (finish last-end #f str acc)] 108 [(eq? start (rxmatch-end match)) ; empty match, try again 109 (loop (+ start 1) last-end acc)] 110 [else 111 (loop (rxmatch-end match) 112 (rxmatch-end match) 113 (kons last-end match str acc))])))])) 114 115(define (regexp-extract re str . opt) 116 (apply regexp-fold re 117 (lambda (i match str acc) 118 (if (equal? (match 0) "") 119 acc 120 (cons (match 0) acc))) 121 '() 122 str 123 (lambda (i match str acc) 124 (reverse acc)) 125 opt)) 126 127(define (regexp-split rx str . o) 128 ;; start and end in indices passed to regexp-fold 129 (let ((start (if (pair? o) (car o) 0)) 130 (end (if (and (pair? o) (pair? (cdr o))) 131 (cadr o) 132 (string-length str)))) 133 (regexp-fold rx 134 (lambda (from md str a) 135 (let ((i (regexp-match-submatch-start md 0)) 136 (j (regexp-match-submatch-end md 0))) 137 (if (eqv? i j) 138 a 139 (cons j 140 (cons (substring str (car a) i) 141 (cdr a)))))) 142 (cons start '()) 143 str 144 (lambda (from md str a) 145 (reverse 146 (cons (substring str (car a) end) 147 (cdr a)))) 148 start 149 end))) 150 151(define (regexp-partition rx str . o) 152 (let ((start (if (pair? o) (car o) 0)) 153 (end (if (and (pair? o) (pair? (cdr o))) 154 (cadr o) 155 (string-length str)))) 156 (define (kons from md str a) 157 (let ((i (regexp-match-submatch-start md 0)) 158 (j (regexp-match-submatch-end md 0))) 159 (if (eqv? i j) 160 a 161 (let ((left (substring str (car a) i))) 162 (cons j 163 (cons (regexp-match-submatch md 0) 164 (cons left (cdr a)))))))) 165 (define (final from md str a) 166 (if (or (< from end) 167 (null? (cdr a))) 168 (cons (substring str (car a) end) (cdr a)) 169 (cdr a))) 170 (reverse 171 (regexp-fold rx 172 kons 173 (cons start '()) 174 str 175 final 176 start 177 end)))) 178 179;; %regexp-replace-rec takes a substitution as either a procedure or a 180;; list of symbols, numbers and strings. Wrap single symbols or 181;; strings to a list to follow this rule 182(define (transform-sub rx sub) 183 (define (check-pre-post sub) 184 (when (and (eq? sub 'pre) (assq 'pre (regexp-named-groups rx))) 185 (error "If 'pre is used in regexp-replace, there should not be a capture group also named 'pre")) 186 (when (and (eq? sub 'post) (assq 'post (regexp-named-groups rx))) 187 (error "If 'post is used in regexp-replace, there should not be a capture group also named 'post")) 188 sub) 189 190 (cond 191 [(string? sub) (list sub)] 192 [(number? sub) (list sub)] 193 [(symbol? sub) (list (check-pre-post sub))] 194 [(list? sub) (map check-pre-post sub)] 195 [else sub])) 196 197(define (%substring str start end) 198 (if (or (> start 0) end) 199 (substring str start (or end (string-length str))) 200 str)) 201 202(define (regexp-replace rx str sub :optional 203 (start 0) 204 (end #f) 205 (count 0)) 206 (let1 rx (regexp rx) 207 ((with-module gauche.internal %regexp-replace) 208 rx (%substring str start end) 209 (transform-sub rx sub) count 1))) 210 211(define (regexp-replace-all rx str sub :optional 212 (start 0) 213 (end #f)) 214 (let1 rx (regexp rx) 215 ((with-module gauche.internal %regexp-replace) 216 rx (%substring str start end) 217 (transform-sub rx sub) 0 #f))) 218 219(define regexp-match? regmatch?) 220(define (regexp-match-count match) 221 (- (rxmatch-num-matches match) 1)) 222(define regexp-match-submatch rxmatch-substring) 223(define regexp-match-submatch-start rxmatch-start) 224(define regexp-match-submatch-end rxmatch-end) 225(define regexp-match->list rxmatch-substrings) 226