1;;; 2;;; librx.scm - builtin regular-expression procedures 3;;; 4;;; Copyright (c) 2000-2020 Shiro Kawai <shiro@acm.org> 5;;; 6;;; Redistribution and use in source and binary forms, with or without 7;;; modification, are permitted provided that the following conditions 8;;; are met: 9;;; 10;;; 1. Redistributions of source code must retain the above copyright 11;;; notice, this list of conditions and the following disclaimer. 12;;; 13;;; 2. Redistributions in binary form must reproduce the above copyright 14;;; notice, this list of conditions and the following disclaimer in the 15;;; documentation and/or other materials provided with the distribution. 16;;; 17;;; 3. Neither the name of the authors nor the names of its contributors 18;;; may be used to endorse or promote products derived from this 19;;; software without specific prior written permission. 20;;; 21;;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22;;; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23;;; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 24;;; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 25;;; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 26;;; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 27;;; TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 28;;; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 29;;; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 30;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 31;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32;;; 33 34(select-module gauche) 35(use util.match) 36 37(inline-stub 38 (declcode (.include <gauche/regexp.h>))) 39 40(define-cproc regexp? (obj) ::<boolean> :constant SCM_REGEXPP) 41(define-cproc regmatch? (obj) ::<boolean> SCM_REGMATCHP) 42 43(define-cproc string->regexp (str::<string> :key (case-fold #f) (multi-line #f)) 44 (return 45 (Scm_RegComp str 46 (logior (?: (SCM_BOOL_VALUE case-fold) SCM_REGEXP_CASE_FOLD 0) 47 (?: (SCM_BOOL_VALUE multi-line) SCM_REGEXP_MULTI_LINE 0))))) 48(define-cproc regexp-ast (regexp::<regexp>) (return (-> regexp ast))) 49(define-cproc regexp-case-fold? (regexp::<regexp>) ::<boolean> 50 (return (logand (-> regexp flags) SCM_REGEXP_CASE_FOLD))) 51 52(define-cproc regexp-parse (str::<string> :key (case-fold #f) (multi-line #f)) 53 (return 54 (Scm_RegComp str 55 (logior (?: (SCM_BOOL_VALUE case-fold) SCM_REGEXP_CASE_FOLD 0) 56 (?: (SCM_BOOL_VALUE multi-line) SCM_REGEXP_MULTI_LINE 0) 57 SCM_REGEXP_PARSE_ONLY)))) 58(define-cproc regexp-compile (ast :key (multi-line #f)) 59 (return (Scm_RegCompFromAST2 ast 60 (?: (SCM_BOOL_VALUE multi-line) 61 SCM_REGEXP_MULTI_LINE 62 0)))) 63(define-cproc regexp-optimize (ast) Scm_RegOptimizeAST) 64 65(define-cproc regexp-num-groups (regexp::<regexp>) ::<int> 66 (return (-> regexp numGroups))) 67(define-cproc regexp-named-groups (regexp::<regexp>) 68 (return (-> regexp grpNames))) 69 70(define-cproc rxmatch (regexp str::<string> :optional start end) 71 (let* ([rx::ScmRegexp* NULL]) 72 (cond [(SCM_STRINGP regexp) (set! rx (SCM_REGEXP (Scm_RegComp 73 (SCM_STRING regexp) 0)))] 74 [(SCM_REGEXPP regexp) (set! rx (SCM_REGEXP regexp))] 75 [else (SCM_TYPE_ERROR regexp "regexp")]) 76 (return (Scm_RegExec rx str start end)))) 77 78(inline-stub 79 (define-cise-stmt rxmatchop 80 [(_ (exp ...)) (template exp)] 81 [(_ fn) (template `(,fn (SCM_REGMATCH match) obj))] 82 :where 83 (define (template result) 84 `(cond [(SCM_FALSEP match) (return '#f)] 85 [(SCM_REGMATCHP match) (return ,result)] 86 [else (SCM_TYPE_ERROR match "regmatch object or #f") 87 (return SCM_UNDEFINED)]))) 88 ) 89 90(define-cproc rxmatch-substring (match :optional (obj 0)) 91 (rxmatchop Scm_RegMatchSubstr)) 92(define-cproc rxmatch-start (match :optional (obj 0)) 93 (rxmatchop Scm_RegMatchStart)) 94(define-cproc rxmatch-end (match :optional (obj 0)) 95 (rxmatchop Scm_RegMatchEnd)) 96(define-cproc rxmatch-before (match :optional (obj 0)) 97 (rxmatchop Scm_RegMatchBefore)) 98(define-cproc rxmatch-after (match :optional (obj 0)) 99 (rxmatchop Scm_RegMatchAfter)) 100(define-cproc rxmatch-num-matches (match) 101 (if (SCM_FALSEP match) 102 (return (SCM_MAKE_INT 0)) 103 (rxmatchop (SCM_MAKE_INT (-> (SCM_REGMATCH match) numMatches))))) 104(define-cproc rxmatch-named-groups (match) 105 (if (SCM_FALSEP match) 106 (return SCM_NIL) 107 (rxmatchop (-> (SCM_REGMATCH match) grpNames)))) 108 109(select-module gauche.internal) 110(define-cproc %regexp-dump (rx::<regexp>) ::<void> Scm_RegDump) 111(define-cproc %regmatch-dump (rm::<regmatch>) ::<void> Scm_RegMatchDump) 112 113(define-cproc %regexp-pattern (regexp::<regexp>) 114 (setter (regexp::<regexp> pat::<string>) ::<void> 115 (set! (-> regexp pattern) (SCM_OBJ pat))) 116 (return (-> regexp pattern))) 117(define-cproc %regexp-laset (regexp::<regexp>) ; for testing 118 (return (-> regexp laset))) 119 120(select-module gauche.internal) 121;; aux routine for regexp-replace[-all] 122;; "abc\\1de\\3" => '("abc" 1 "de" 3) 123(define (%regexp-parse-subpattern sub) 124 (cond 125 [(string? sub) 126 (let loop ((sub sub) (r '())) 127 (cond [(rxmatch #/\\(?:(\d+)|k<([^>]+)>|(.))/ sub) 128 => (^m 129 (define (loop2 elem) 130 (loop (rxmatch-after m) 131 (list* elem (rxmatch-before m) r))) 132 (cond [(rxmatch-substring m 1) 133 => (^d (loop2 (string->number d)))] 134 [(rxmatch-substring m 2) 135 => (^s (loop2 (string->symbol s)))] 136 [else (loop2 (rxmatch-substring m 3))]))] 137 [else (reverse (cons sub r))]))] 138 [(procedure? sub) sub] 139 [else (error "string, procedure or list required, but got" sub)])) 140 141;; Skip the first subskip matches, then start replacing only up to 142;; subcount times (or infinite if subcount is #f). 143(define (%regexp-replace-rec rx string subpat subskip subcount) 144 (define (next-string match) 145 (let1 rest (rxmatch-after match) 146 (and (not (equal? rest "")) 147 (if (= (rxmatch-start match) (rxmatch-end match)) 148 (begin (display (string-ref rest 0)) 149 (string-copy rest 1)) 150 rest)))) 151 (if (and subcount (zero? subcount)) 152 (display string) 153 (let1 match (rxmatch rx string) 154 (cond 155 [(not match) 156 (display string)] 157 [(> subskip 0) 158 (display (rxmatch-before match)) 159 (display (rxmatch-substring match)) 160 (and-let1 next (next-string match) 161 (%regexp-replace-rec rx next subpat (- subskip 1) subcount))] 162 [else 163 (display (rxmatch-before match)) 164 (if (procedure? subpat) 165 (display (subpat match)) 166 (dolist [pat subpat] 167 (display (cond 168 [(eq? pat 'pre) (rxmatch-before match)] 169 [(eq? pat 'post) (rxmatch-after match)] 170 [(or (number? pat) (symbol? pat)) 171 (rxmatch-substring match pat)] 172 [else pat])))) 173 (and-let1 next (next-string match) 174 (%regexp-replace-rec rx next subpat subskip 175 (and subcount (- subcount 1))))])))) 176 177(define (%regexp-replace rx string subpat subskip subcount) 178 (with-output-to-string 179 (^[] 180 (%regexp-replace-rec rx string subpat subskip subcount)))) 181 182(define-in-module gauche (regexp-replace rx string sub) 183 (%regexp-replace rx string 184 (%regexp-parse-subpattern sub) 0 1)) 185 186(define-in-module gauche (regexp-replace-all rx string sub) 187 (%regexp-replace rx string 188 (%regexp-parse-subpattern sub) 0 #f)) 189 190(define (regexp-replace-driver name func-1) 191 (^[string rx sub . more] 192 (cond [(null? more) (func-1 rx string sub)] 193 [else 194 (unless (even? (length more)) 195 (errorf "~a: regexp and subsitution don't pair up" name)) 196 (let loop ([s (func-1 rx string sub)] 197 [args more]) 198 (if (null? args) 199 s 200 (loop (func-1 (car args) s (cadr args)) 201 (cddr args))))]))) 202 203(define-in-module gauche regexp-replace* 204 (regexp-replace-driver 'regexp-replace* regexp-replace)) 205 206(define-in-module gauche regexp-replace-all* 207 (regexp-replace-driver 'regexp-replace-all* regexp-replace-all)) 208 209;; Contributed from Alex Shinn; modified a bit by shiro 210(define-in-module gauche (regexp-quote str) 211 (with-string-io str 212 (^[] (let loop ((c (read-char))) 213 (unless (eof-object? c) 214 (when (char-set-contains? #[\\|\[\](){}.*+?^$] c) (write-char #\\)) 215 (write-char c) 216 (loop (read-char))))))) 217 218(define-in-module gauche (regexp->string rx) 219 (or (%regexp-pattern rx) 220 (rlet1 s (regexp-unparse (regexp-ast rx)) 221 (set! (%regexp-pattern rx) s)))) 222 223(define-in-module gauche (rxmatch->string rx str . sel) 224 (cond [(null? sel) (rxmatch-substring (rxmatch rx str))] 225 [(eq? (car sel) 'after) 226 (apply rxmatch-after (rxmatch rx str) (cdr sel))] 227 [(eq? (car sel) 'before) 228 (apply rxmatch-before (rxmatch rx str) (cdr sel))] 229 [else (rxmatch-substring (rxmatch rx str) (car sel))])) 230