1#lang racket/base
2(require (prefix-in rx: "main.rkt"))
3
4(define-syntax-rule (test expr v)
5  (let ([b expr])
6    (unless (equal? b v)
7      (error 'test "failed: ~s => ~s" 'expr b))))
8
9(test (rx:regexp-match "" (open-input-string "123") 4)
10      #f)
11(test (rx:regexp-match-peek "" (open-input-string "123") 4)
12      #f)
13
14(for* ([succeed? '(#f #t)]
15       [char '(#\x #\u3BB)])
16  (for ([N '(1 100 1000 1023 1024 10000)])
17    (for ([M (list 0 (quotient N 2))])
18      (define o (open-output-bytes))
19      (log-error "N = ~a, M = ~a" N M)
20      (void (rx:regexp-match-positions "y"
21                                       (string-append
22                                        (make-string N char)
23                                        (if succeed? "y" ""))
24                                       M
25                                       (+ N (if succeed? 1 0))
26                                       o))
27      (test (string-length (get-output-string o)) (- N M)))))
28
29;; Test bounded byte consumption on failure:
30(let ([is (open-input-string "barfoo")])
31  (test (list (rx:regexp-match "^foo" is 0 3) (read-char is)) '(#f #\f)))
32(let ([is (open-input-string "barfoo")])
33  (test (list (rx:regexp-match "foo" is 0 3) (read-char is)) '(#f #\f)))
34
35;; Don't consume bytes that corresponds to a prefix:
36(let ()
37  (define in (open-input-string "a\nb\nc\n"))
38  (define rx:.n (rx:byte-regexp #"(?m:^.\n)"))
39  (test (rx:regexp-match rx:.n in 0 #f #f #"") '(#"a\n"))
40  (test (rx:regexp-match rx:.n in 0 #f #f #"\n") '(#"b\n"))
41  (test (rx:regexp-match rx:.n in 0 #f #f #"\n") '(#"c\n")))
42
43(let ()
44  (define in (open-input-bytes #" a b c "))
45
46  (define discard (open-output-bytes))
47  (rx:regexp-match "[abc]" in 0 3 discard #"")
48  (test (get-output-bytes discard) #" ")
49
50  (define discard2 (open-output-bytes))
51  (rx:regexp-match "[abc]" in 0 1 discard2 #"")
52  (test (get-output-bytes discard2) #" "))
53
54;; Input streams that are large enough for bytes to be discarded along the way
55(test (rx:regexp-match #"(.)x" (open-input-string (string-append (make-string 50000 #\y) "x")))
56      '(#"yx" #"y"))
57(test (rx:regexp-match-positions #"(.)x" (open-input-string (string-append (make-string 50000 #\y) "x")))
58      '((49999 . 50001) (49999 . 50000)))
59(test (rx:regexp-match "(.)x" (string-append (make-string 50000 #\y) "x"))
60      '("yx" "y"))
61(test (rx:regexp-match-positions "(.)x" (string-append (make-string 50000 #\y) "x"))
62      '((49999 . 50001) (49999 . 50000)))
63(test (rx:regexp-match "(.)\u3BC" (string-append (make-string 50000 #\u3BB) "\u3BC"))
64      '("\u3BB\u3BC" "\u3BB"))
65(test (rx:regexp-match-positions "(.)\u3BC" (string-append (make-string 50000 #\y) "\u3BC"))
66      '((49999 . 50001) (49999 . 50000)))
67
68(test (rx:regexp-match-positions #"<([abc])(>)?" "<a + <b = <c" 3)
69      '((5 . 7) (6 . 7) #f))
70(test (rx:regexp-match-positions "[abc]" " a b c " 2)
71      '((3 . 4)))
72(test (rx:regexp-match-positions "(?m:^.\n)" "a\nb\nc\n" 2 6 #f #"\n")
73      '((2 . 4)))
74(test (rx:regexp-match-positions "(?:(?m:^$))(?<=..)" "ge \n TLambda-tc\n\n ;; (extend Γ o Γx-s\n extend\n\n ;;" 29 #f #f #"\n")
75      '((46 . 46)))
76
77;; Match groups spans prefix:
78(test (rx:regexp-match-positions (rx:regexp "(?<=(..))") "aaa" 0 #f #f #"x")
79      '((1 . 1) (-1 . 1)))
80(test (rx:regexp-match-positions (rx:regexp "(?<=(.))") "aaa" 0 #f #f #"x")
81      '((0 . 0) (-1 . 0)))
82(test (rx:regexp-match-positions (rx:byte-regexp #"(?<=(.))") #"aaa" 0 #f #f #"x")
83      '((0 . 0) (-1 . 0)))
84(test (rx:regexp-match-peek-positions (rx:byte-regexp #"(?<=(.))") (open-input-bytes #"aaa") 0 #f #f #"x")
85      '((0 . 0) (-1 . 0)))
86(test (rx:regexp-match-positions (rx:regexp "(?<=(.))") "aaa" 0 #f #f (string->bytes/utf-8 "\u3BB"))
87      '((0 . 0) (-1 . 0)))
88(test (rx:regexp-match-positions (rx:regexp "(?<=(.)).") "\u03BBaa" 0 #f #f (string->bytes/utf-8 "\u3BC"))
89      '((0 . 1) (-1 . 0)))
90(test (rx:regexp-match (rx:regexp "(?<=(.))") "aaa" 0 #f #f (string->bytes/utf-8 "\u3BB"))
91      '("" "\u3BB"))
92(test (rx:regexp-match (rx:regexp "(?<=(.)).") "aaa" 0 #f #f (string->bytes/utf-8 "\u3BB"))
93      '("a" "\u3BB"))
94(test (rx:regexp-match (rx:regexp "(?<=(.)).") "\u03BBaa" 0 #f #f (string->bytes/utf-8 "\u3BC"))
95      '("\u3BB" "\u3BC"))
96(test (rx:regexp-match (rx:byte-regexp #"(?<=(.))") #"aaa" 0 #f #f #"x")
97      '(#"" #"x"))
98(test (rx:regexp-match (rx:byte-regexp #"(?<=(.))..") (open-input-bytes #"abc") 0 #f #f #"x")
99      '(#"ab" #"x"))
100(test (rx:regexp-match-peek (rx:byte-regexp #"(?<=(.))..") (open-input-bytes #"abc") 0 #f #f #"x")
101      '(#"ab" #"x"))
102
103;; Replacement where match groups spans prefix:
104(test (rx:regexp-replace (rx:byte-regexp #"(?<=(.))") #"aaa" #"[\\1]" #"x")
105      #"[x]aaa")
106(test (rx:regexp-replace (rx:byte-regexp #"(?<=(..))") #"abc" #"[\\1]" #"x")
107      #"a[xa]bc")
108(test (rx:regexp-replace (rx:byte-regexp #"(?<=(.)).") #"aaa" #"[\\1]" #"x")
109      #"[x]aa")
110(test (rx:regexp-replace (rx:regexp "(?<=(.))") "aaa" "[\\1]" #"x")
111      "[x]aaa")
112(test (rx:regexp-replace (rx:regexp "(?<=(..))") "aaa" "[\\1]" #"x")
113      "a[xa]aa")
114(test (rx:regexp-replace (rx:regexp "(?<=(.)).") "aaa" "[\\1]" #"x")
115      "[x]aa")
116(test (rx:regexp-replace (rx:regexp "(?<=(.)).") "aaa" "[\\1]" #"\xFFx")
117      "[x]aa")
118(test (rx:regexp-replace (rx:regexp "(?<=(.)).") "abc" "[\\1]" #"\xFF") ; can't match non-UTF-8 prefix
119      "a[a]c")
120(test (rx:regexp-replace (rx:regexp "(?<=(..)).") "aaa" "[\\1]"
121                         (bytes-append #"\xFF"
122                                       (string->bytes/utf-8 "\u03BBx")))
123      "[\u03BBx]aa")
124(test (rx:regexp-replace (rx:regexp "(?<=(..)).") "aaa" (lambda (m m1) (string-append "{" m1 "}"))
125                         (bytes-append #"\xFF"
126                                       (string->bytes/utf-8 "\u03BBx")))
127      "{\u03BBx}aa")
128
129(test (rx:regexp-replace* "-" "zero-or-more?" "_")
130      "zero_or_more?")
131(test (rx:regexp-replace* "" "aaa" "c")
132      "cacacac")
133(test (rx:regexp-replace* (rx:regexp "(?<=(.)).") "aaa" "[\\1]" #"\xFFx")
134      "[x][a][a]")
135(test (rx:regexp-replace* (rx:regexp "(?<=(.)).") "abc" "[\\1]" #"\xFFx")
136      "[x][a][b]")
137(test (rx:regexp-replace* (rx:regexp "(?<=(..)).") "abc" "[\\1]" #"\xFFx")
138      "a[xa][ab]")
139
140(test (rx:regexp-replace* (rx:byte-regexp #"(?<=(..))") #"abc" #"[\\1]" #"x")
141      #"a[xa]b[ab]c[bc]")
142
143;; Don't get stuck waiting for an unneeded byte:
144(let ()
145  (define-values (i o) (make-pipe))
146  (write-string "1\n" o)
147  (define rx (rx:regexp "^(?:(.*?)(?:\r\n|\n))"))
148  (test (rx:regexp-match rx i) '(#"1\n" #"1")))
149(let ()
150  (define-values (i o) (make-pipe))
151  (write-string "abc" o)
152  (define rx (rx:regexp "^(ab)*"))
153  (test (rx:regexp-match rx i) '(#"ab" #"ab")))
154(let ()
155  (define-values (i o) (make-pipe))
156  (write-string "123" o)
157  (define rx (rx:pregexp "^(12)\\1|123"))
158  (test (rx:regexp-match rx i) '(#"123" #f)))
159
160;; Check for quadratic `regexp-replace*`:
161(time
162 (test (rx:regexp-replace* (rx:regexp "a") (make-string (* 1024 1024) #\a) "x")
163       (make-string (* 1024 1024) #\x)))
164
165;; ----------------------------------------
166
167(define (check rx in N [M (max 1 (quotient N 10))])
168  (define c-start (current-inexact-milliseconds))
169  (define orig-rx
170    (if (bytes? rx)
171        (for/fold ([r #f]) ([i (in-range M)])
172          (byte-pregexp rx))
173        (for/fold ([r #f]) ([i (in-range M)])
174          (pregexp rx))))
175  (define c-after-orig (current-inexact-milliseconds))
176  (define new-rx
177    (if (bytes? rx)
178        (for/fold ([r #f]) ([i (in-range M)])
179          (rx:byte-pregexp rx))
180        (for/fold ([r #f]) ([i (in-range M)])
181          (rx:pregexp rx))))
182  (define c-after-new (current-inexact-milliseconds))
183
184  (define orig-v (regexp-match orig-rx in))
185  (define new-v (rx:regexp-match new-rx in))
186  (unless (equal? orig-v new-v)
187    (error 'check
188           "failed\n  pattern: ~s\n  input: ~s\n  expected: ~s\n  got: ~s"
189           rx in orig-v new-v))
190
191  (define start (current-inexact-milliseconds))
192  (for/fold ([r #f]) ([i (in-range N)])
193    (regexp-match? orig-rx in))
194  (define after-orig (current-inexact-milliseconds))
195  (for/fold ([r #f]) ([i (in-range N)])
196    (rx:regexp-match? new-rx in))
197  (define after-new (current-inexact-milliseconds))
198
199  (define orig-c-msec (- c-after-orig c-start))
200  (define new-c-msec (- c-after-new c-after-orig))
201  (define orig-msec (- after-orig start))
202  (define new-msec (- after-new after-orig))
203
204  (unless (= N 1)
205    (parameterize ([error-print-width 64])
206      (printf "regex: ~.s\non: ~.s\n" rx in))
207
208    (define (~n n)
209      (car (regexp-match #px"^[0-9]*[.]?[0-9]{0,2}" (format "~a" n))))
210
211    (printf " compile: ~a  (~a vs. ~a) / ~a iterations\n"
212            (~n (/ new-c-msec orig-c-msec))
213            (~n orig-c-msec)
214            (~n new-c-msec)
215            M)
216    (printf " interp:  ~a  (~a vs. ~a) / ~a iterations\n"
217            (~n (/ new-msec orig-msec))
218            (~n orig-msec)
219            (~n new-msec)
220            N)))
221
222;; ----------------------------------------
223
224(check #"(?m:^aa$a.)"
225       #"abaac\nac\naa\nacacaaacd"
226       1)
227
228(check #"\\sa."
229       #"cat apple"
230       1)
231
232(check "(?>a*)a"
233       "aaa"
234       1)
235
236(check "(?:a|b)y(\\1)"
237       "ayb"
238       1)
239
240(check "!.!"
241       #"!\x80!"
242       1)
243
244(check #"\\P{Ll}"
245       #"aB"
246       1)
247
248(check #"\\p{^Ll}"
249       #"aB"
250       1)
251
252(check #".*"
253       #"abaacacaaacacaaacd"
254       100000)
255
256(check #"ab(?:a*c)*d"
257       #"abaacacaaacacaaacd"
258       100000)
259
260(check #"ab(?:a*?c)*d"
261       #"abaacacaaacacaaacd"
262       100000)
263
264(check #"ab(?:[ab]*c)*d"
265       #"abaacacaaacacaaacd"
266       100000)
267
268(define ipv6-hex "[0-9a-fA-F:]*:[0-9a-fA-F:]*")
269
270(define url-s
271  (string-append
272   "^"
273   "(?:"              ; / scheme-colon-opt
274   "([^:/?#]*)"       ; | #1 = scheme-opt
275   ":)?"              ; \
276   "(?://"            ; / slash-slash-authority-opt
277   "(?:"              ; | / user-at-opt
278   "([^/?#@]*)"       ; | | #2 = user-opt
279   "@)?"              ; | \
280   "(?:"              ;
281   "(?:\\["           ; | / #3 = ipv6-host-opt
282   "(" ipv6-hex ")"   ; | | hex-addresses
283   "\\])|"            ; | \
284   "([^/?#:]*)"       ; | #4 = host-opt
285   ")?"               ;
286   "(?::"             ; | / colon-port-opt
287   "([0-9]*)"         ; | | #5 = port-opt
288   ")?"               ; | \
289   ")?"               ; \
290   "([^?#]*)"         ; #6 = path
291   "(?:\\?"           ; / question-query-opt
292   "([^#]*)"          ; | #7 = query-opt
293   ")?"               ; \
294   "(?:#"             ; / hash-fragment-opt
295   "(.*)"             ; | #8 = fragment-opt
296   ")?"               ; \
297   "$"))
298
299(define rlo "https://racket-lang.org:80x/people.html?check=ok#end")
300
301(check (string->bytes/utf-8 url-s)
302       (string->bytes/utf-8 rlo)
303       100000)
304
305(check url-s
306       rlo
307       10000)
308
309;; all of the work is looking for a must-string
310(check #"a*b"
311       (make-bytes 1024 (char->integer #\a))
312       100000)
313