1;;;
2;;; librx.scm - builtin regular-expression procedures
3;;;
4;;;   Copyright (c) 2000-2020  Shiro Kawai  <shiro@acm.org>
5;;;
6;;;   Redistribution and use in source and binary forms, with or without
7;;;   modification, are permitted provided that the following conditions
8;;;   are met:
9;;;
10;;;   1. Redistributions of source code must retain the above copyright
11;;;      notice, this list of conditions and the following disclaimer.
12;;;
13;;;   2. Redistributions in binary form must reproduce the above copyright
14;;;      notice, this list of conditions and the following disclaimer in the
15;;;      documentation and/or other materials provided with the distribution.
16;;;
17;;;   3. Neither the name of the authors nor the names of its contributors
18;;;      may be used to endorse or promote products derived from this
19;;;      software without specific prior written permission.
20;;;
21;;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22;;;   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23;;;   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24;;;   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25;;;   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26;;;   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
27;;;   TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28;;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29;;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30;;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31;;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32;;;
33
34(select-module gauche)
35(use util.match)
36
37(inline-stub
38 (declcode (.include <gauche/regexp.h>)))
39
40(define-cproc regexp? (obj)   ::<boolean> :constant SCM_REGEXPP)
41(define-cproc regmatch? (obj) ::<boolean> SCM_REGMATCHP)
42
43(define-cproc string->regexp (str::<string> :key (case-fold #f) (multi-line #f))
44  (return
45   (Scm_RegComp str
46                (logior (?: (SCM_BOOL_VALUE case-fold) SCM_REGEXP_CASE_FOLD 0)
47                        (?: (SCM_BOOL_VALUE multi-line) SCM_REGEXP_MULTI_LINE 0)))))
48(define-cproc regexp-ast (regexp::<regexp>) (return (-> regexp ast)))
49(define-cproc regexp-case-fold? (regexp::<regexp>) ::<boolean>
50  (return (logand (-> regexp flags) SCM_REGEXP_CASE_FOLD)))
51
52(define-cproc regexp-parse (str::<string> :key (case-fold #f) (multi-line #f))
53  (return
54   (Scm_RegComp str
55                (logior (?: (SCM_BOOL_VALUE case-fold) SCM_REGEXP_CASE_FOLD 0)
56                        (?: (SCM_BOOL_VALUE multi-line) SCM_REGEXP_MULTI_LINE 0)
57                        SCM_REGEXP_PARSE_ONLY))))
58(define-cproc regexp-compile (ast :key (multi-line #f))
59  (return (Scm_RegCompFromAST2 ast
60                               (?: (SCM_BOOL_VALUE multi-line)
61                                   SCM_REGEXP_MULTI_LINE
62                                   0))))
63(define-cproc regexp-optimize (ast) Scm_RegOptimizeAST)
64
65(define-cproc regexp-num-groups (regexp::<regexp>) ::<int>
66  (return (-> regexp numGroups)))
67(define-cproc regexp-named-groups (regexp::<regexp>)
68  (return (-> regexp grpNames)))
69
70(define-cproc rxmatch (regexp str::<string> :optional start end)
71  (let* ([rx::ScmRegexp* NULL])
72    (cond [(SCM_STRINGP regexp) (set! rx (SCM_REGEXP (Scm_RegComp
73                                                      (SCM_STRING regexp) 0)))]
74          [(SCM_REGEXPP regexp) (set! rx (SCM_REGEXP regexp))]
75          [else (SCM_TYPE_ERROR regexp "regexp")])
76    (return (Scm_RegExec rx str start end))))
77
78(inline-stub
79 (define-cise-stmt rxmatchop
80   [(_ (exp ...)) (template exp)]
81   [(_ fn)        (template `(,fn (SCM_REGMATCH match) obj))]
82   :where
83   (define (template result)
84     `(cond [(SCM_FALSEP match) (return '#f)]
85            [(SCM_REGMATCHP match) (return ,result)]
86            [else (SCM_TYPE_ERROR match "regmatch object or #f")
87                  (return SCM_UNDEFINED)])))
88 )
89
90(define-cproc rxmatch-substring (match :optional (obj 0))
91  (rxmatchop Scm_RegMatchSubstr))
92(define-cproc rxmatch-start (match :optional (obj 0))
93  (rxmatchop Scm_RegMatchStart))
94(define-cproc rxmatch-end (match :optional (obj 0))
95  (rxmatchop Scm_RegMatchEnd))
96(define-cproc rxmatch-before (match :optional (obj 0))
97  (rxmatchop Scm_RegMatchBefore))
98(define-cproc rxmatch-after (match :optional (obj 0))
99  (rxmatchop Scm_RegMatchAfter))
100(define-cproc rxmatch-num-matches (match)
101  (if (SCM_FALSEP match)
102    (return (SCM_MAKE_INT 0))
103    (rxmatchop (SCM_MAKE_INT (-> (SCM_REGMATCH match) numMatches)))))
104(define-cproc rxmatch-named-groups (match)
105  (if (SCM_FALSEP match)
106    (return SCM_NIL)
107    (rxmatchop (-> (SCM_REGMATCH match) grpNames))))
108
109(select-module gauche.internal)
110(define-cproc %regexp-dump (rx::<regexp>) ::<void> Scm_RegDump)
111(define-cproc %regmatch-dump (rm::<regmatch>) ::<void> Scm_RegMatchDump)
112
113(define-cproc %regexp-pattern (regexp::<regexp>)
114  (setter (regexp::<regexp> pat::<string>) ::<void>
115          (set! (-> regexp pattern) (SCM_OBJ pat)))
116  (return (-> regexp pattern)))
117(define-cproc %regexp-laset (regexp::<regexp>) ; for testing
118  (return (-> regexp laset)))
119
120(select-module gauche.internal)
121;; aux routine for regexp-replace[-all]
122;; "abc\\1de\\3" => '("abc" 1 "de" 3)
123(define (%regexp-parse-subpattern sub)
124  (cond
125   [(string? sub)
126    (let loop ((sub sub) (r '()))
127      (cond [(rxmatch #/\\(?:(\d+)|k<([^>]+)>|(.))/ sub)
128             => (^m
129                 (define (loop2 elem)
130                   (loop (rxmatch-after m)
131                         (list* elem (rxmatch-before m) r)))
132                 (cond [(rxmatch-substring m 1)
133                        => (^d (loop2 (string->number d)))]
134                       [(rxmatch-substring m 2)
135                        => (^s (loop2 (string->symbol s)))]
136                       [else (loop2 (rxmatch-substring m 3))]))]
137            [else (reverse (cons sub r))]))]
138   [(procedure? sub) sub]
139   [else (error "string, procedure or list required, but got" sub)]))
140
141;; Skip the first subskip matches, then start replacing only up to
142;; subcount times (or infinite if subcount is #f).
143(define (%regexp-replace-rec rx string subpat subskip subcount)
144  (define (next-string match)
145    (let1 rest (rxmatch-after match)
146      (and (not (equal? rest ""))
147           (if (= (rxmatch-start match) (rxmatch-end match))
148             (begin (display (string-ref rest 0))
149                    (string-copy rest 1))
150             rest))))
151  (if (and subcount (zero? subcount))
152    (display string)
153    (let1 match (rxmatch rx string)
154      (cond
155       [(not match)
156        (display string)]
157       [(> subskip 0)
158        (display (rxmatch-before match))
159        (display (rxmatch-substring match))
160        (and-let1 next (next-string match)
161          (%regexp-replace-rec rx next subpat (- subskip 1) subcount))]
162       [else
163        (display (rxmatch-before match))
164        (if (procedure? subpat)
165          (display (subpat match))
166          (dolist [pat subpat]
167            (display (cond
168                      [(eq? pat 'pre) (rxmatch-before match)]
169                      [(eq? pat 'post) (rxmatch-after match)]
170                      [(or (number? pat) (symbol? pat))
171                       (rxmatch-substring match pat)]
172                      [else pat]))))
173        (and-let1 next (next-string match)
174          (%regexp-replace-rec rx next subpat subskip
175                               (and subcount (- subcount 1))))]))))
176
177(define (%regexp-replace rx string subpat subskip subcount)
178  (with-output-to-string
179      (^[]
180        (%regexp-replace-rec rx string subpat subskip subcount))))
181
182(define-in-module gauche (regexp-replace rx string sub)
183  (%regexp-replace rx string
184                   (%regexp-parse-subpattern sub) 0 1))
185
186(define-in-module gauche (regexp-replace-all rx string sub)
187  (%regexp-replace rx string
188                   (%regexp-parse-subpattern sub) 0 #f))
189
190(define (regexp-replace-driver name func-1)
191  (^[string rx sub . more]
192    (cond [(null? more) (func-1 rx string sub)]
193          [else
194           (unless (even? (length more))
195             (errorf "~a: regexp and subsitution don't pair up" name))
196           (let loop ([s (func-1 rx string sub)]
197                      [args more])
198             (if (null? args)
199               s
200               (loop (func-1 (car args) s (cadr args))
201                     (cddr args))))])))
202
203(define-in-module gauche regexp-replace*
204  (regexp-replace-driver 'regexp-replace* regexp-replace))
205
206(define-in-module gauche regexp-replace-all*
207  (regexp-replace-driver 'regexp-replace-all* regexp-replace-all))
208
209;; Contributed from Alex Shinn; modified a bit by shiro
210(define-in-module gauche (regexp-quote str)
211  (with-string-io str
212    (^[] (let loop ((c (read-char)))
213           (unless (eof-object? c)
214             (when (char-set-contains? #[\\|\[\](){}.*+?^$] c) (write-char #\\))
215             (write-char c)
216             (loop (read-char)))))))
217
218(define-in-module gauche (regexp->string rx)
219  (or (%regexp-pattern rx)
220      (rlet1 s (regexp-unparse (regexp-ast rx))
221        (set! (%regexp-pattern rx) s))))
222
223(define-in-module gauche (rxmatch->string rx str . sel)
224  (cond [(null? sel) (rxmatch-substring (rxmatch rx str))]
225        [(eq? (car sel) 'after)
226         (apply rxmatch-after (rxmatch rx str) (cdr sel))]
227        [(eq? (car sel) 'before)
228         (apply rxmatch-before (rxmatch rx str) (cdr sel))]
229        [else (rxmatch-substring (rxmatch rx str) (car sel))]))
230