1;;;
2;;; SRFI-115  Scheme Regular Expressions
3;;;
4;;;   Copyright (c) 2019  Duy Nguyen <pclouds@gmail.com>
5;;;
6;;;   Redistribution and use in source and binary forms, with or without
7;;;   modification, are permitted provided that the following conditions
8;;;   are met:
9;;;
10;;;   1. Redistributions of source code must retain the above copyright
11;;;      notice, this list of conditions and the following disclaimer.
12;;;
13;;;   2. Redistributions in binary form must reproduce the above copyright
14;;;      notice, this list of conditions and the following disclaimer in the
15;;;      documentation and/or other materials provided with the distribution.
16;;;
17;;;   3. Neither the name of the authors nor the names of its contributors
18;;;      may be used to endorse or promote products derived from this
19;;;      software without specific prior written permission.
20;;;
21;;;   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22;;;   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23;;;   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24;;;   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25;;;   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26;;;   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
27;;;   TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28;;;   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29;;;   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30;;;   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31;;;   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32;;;
33
34(define-module srfi-115
35  (use scheme.charset)
36  (use gauche.regexp.sre)
37  (export rx regexp regexp->sre char-set->sre valid-sre?
38          regexp-matches regexp-matches? regexp-search
39          regexp-fold regexp-extract regexp-split regexp-partition
40          regexp-replace regexp-replace-all
41          regexp-match? regexp-match-count regexp-match-submatch
42          regexp-match-submatch-start regexp-match-submatch-end
43          regexp-match->list))
44(select-module srfi-115)
45
46(define-syntax rx
47  (syntax-rules ()
48    [(_ sre ...) (regexp `(: sre ...))]))
49
50(define (regexp re)
51  (if (regexp? re) re (sre->regexp re)))
52
53(define (regexp->sre re)
54  (regexp-unparse-sre (regexp-ast re)))
55
56(define (char-set->sre cs)
57  (list (list->string (char-set->list cs))))
58
59(define (valid-sre? sre)
60  (guard (e [(<regexp-invalid-sre> e) #f][else (raise e)])
61    (regexp-parse-sre sre)
62    #t))
63
64;; regexp? is already defined by gauche.regexp and is builtin.
65;; do we even need to export it in pure R7RS environment?
66
67(define (regexp-matches re str :optional start end)
68  (if (regexp? re)
69      (let1 match (regexp-search re str start end)
70        (if (and match
71                 (zero? (rxmatch-start match))
72                 (eq? (rxmatch-end match)
73                      (if (undefined? end)
74                          (string-length str)
75                          end)))
76            match
77            #f))
78      ;; if re is not compiled yet, insert bos to take advantage of
79      ;; BOL_ANCHORED optimization.
80      (regexp-search `(: bos ,re eos) str start end)))
81
82(define (regexp-matches? re str :optional start end)
83  (if (regexp-matches re str start end) #t #f))
84
85(define (regexp-search re str :optional start end)
86  (rxmatch (regexp re) str start end))
87
88(define (regexp-fold re kons knil str :optional finish start end)
89  (cond
90   [(undefined? finish)
91    (regexp-fold re kons knil str (lambda (from match str acc)
92                                    acc))]
93   [(undefined? start)
94    (regexp-fold re kons knil str finish 0)]
95   [(undefined? end)
96    (regexp-fold re kons knil str finish 0 (string-length str))]
97   [(not (regexp? re))
98    (regexp-fold (regexp re) kons knil str finish 0 end)]
99   [else
100    (let loop ([start start]
101               [last-end start]
102               [acc knil])
103      (let1 match (and (< start end)
104                       (regexp-search re str start end))
105        (cond
106         [(not match)
107          (finish last-end #f str acc)]
108         [(eq? start (rxmatch-end match)) ; empty match, try again
109          (loop (+ start 1) last-end acc)]
110         [else
111          (loop (rxmatch-end match)
112                (rxmatch-end match)
113                (kons last-end match str acc))])))]))
114
115(define (regexp-extract re str . opt)
116  (apply regexp-fold re
117         (lambda (i match str acc)
118           (if (equal? (match 0) "")
119               acc
120               (cons (match 0) acc)))
121         '()
122         str
123         (lambda (i match str acc)
124           (reverse acc))
125         opt))
126
127(define (regexp-split rx str . o)
128  ;; start and end in indices passed to regexp-fold
129  (let ((start (if (pair? o) (car o) 0))
130        (end (if (and (pair? o) (pair? (cdr o)))
131                 (cadr o)
132                 (string-length str))))
133    (regexp-fold rx
134                 (lambda (from md str a)
135                   (let ((i (regexp-match-submatch-start md 0))
136                         (j (regexp-match-submatch-end md 0)))
137                     (if (eqv? i j)
138                         a
139                         (cons j
140                               (cons (substring str (car a) i)
141                                     (cdr a))))))
142                 (cons start '())
143                 str
144                 (lambda (from md str a)
145                   (reverse
146                    (cons (substring str (car a) end)
147                          (cdr a))))
148                 start
149                 end)))
150
151(define (regexp-partition rx str . o)
152  (let ((start (if (pair? o) (car o) 0))
153        (end (if (and (pair? o) (pair? (cdr o)))
154                 (cadr o)
155                 (string-length str))))
156    (define (kons from md str a)
157      (let ((i (regexp-match-submatch-start md 0))
158            (j (regexp-match-submatch-end md 0)))
159        (if (eqv? i j)
160            a
161            (let ((left (substring str (car a) i)))
162              (cons j
163                    (cons (regexp-match-submatch md 0)
164                          (cons left (cdr a))))))))
165    (define (final from md str a)
166      (if (or (< from end)
167              (null? (cdr a)))
168          (cons (substring str (car a) end) (cdr a))
169          (cdr a)))
170    (reverse
171     (regexp-fold rx
172                  kons
173                  (cons start '())
174                  str
175                  final
176                  start
177                  end))))
178
179;; %regexp-replace-rec takes a substitution as either a procedure or a
180;; list of symbols, numbers and strings. Wrap single symbols or
181;; strings to a list to follow this rule
182(define (transform-sub rx sub)
183  (define (check-pre-post sub)
184    (when (and (eq? sub 'pre) (assq 'pre (regexp-named-groups rx)))
185      (error "If 'pre is used in regexp-replace, there should not be a capture group also named 'pre"))
186    (when (and (eq? sub 'post) (assq 'post (regexp-named-groups rx)))
187      (error "If 'post is used in regexp-replace, there should not be a capture group also named 'post"))
188    sub)
189
190  (cond
191   [(string? sub) (list sub)]
192   [(number? sub) (list sub)]
193   [(symbol? sub) (list (check-pre-post sub))]
194   [(list? sub) (map check-pre-post sub)]
195   [else sub]))
196
197(define (%substring str start end)
198  (if (or (> start 0) end)
199      (substring str start (or end (string-length str)))
200      str))
201
202(define (regexp-replace rx str sub :optional
203                        (start 0)
204                        (end #f)
205                        (count 0))
206  (let1 rx (regexp rx)
207    ((with-module gauche.internal %regexp-replace)
208     rx (%substring str start end)
209     (transform-sub rx sub) count 1)))
210
211(define (regexp-replace-all rx str sub :optional
212                            (start 0)
213                            (end #f))
214  (let1 rx (regexp rx)
215    ((with-module gauche.internal %regexp-replace)
216     rx (%substring str start end)
217     (transform-sub rx sub) 0 #f)))
218
219(define regexp-match? regmatch?)
220(define (regexp-match-count match)
221  (- (rxmatch-num-matches match) 1))
222(define regexp-match-submatch rxmatch-substring)
223(define regexp-match-submatch-start rxmatch-start)
224(define regexp-match-submatch-end rxmatch-end)
225(define regexp-match->list rxmatch-substrings)
226