1#lang racket/base 2(require (prefix-in rx: "main.rkt")) 3 4(define-syntax-rule (test expr v) 5 (let ([b expr]) 6 (unless (equal? b v) 7 (error 'test "failed: ~s => ~s" 'expr b)))) 8 9(test (rx:regexp-match "" (open-input-string "123") 4) 10 #f) 11(test (rx:regexp-match-peek "" (open-input-string "123") 4) 12 #f) 13 14(for* ([succeed? '(#f #t)] 15 [char '(#\x #\u3BB)]) 16 (for ([N '(1 100 1000 1023 1024 10000)]) 17 (for ([M (list 0 (quotient N 2))]) 18 (define o (open-output-bytes)) 19 (log-error "N = ~a, M = ~a" N M) 20 (void (rx:regexp-match-positions "y" 21 (string-append 22 (make-string N char) 23 (if succeed? "y" "")) 24 M 25 (+ N (if succeed? 1 0)) 26 o)) 27 (test (string-length (get-output-string o)) (- N M))))) 28 29;; Test bounded byte consumption on failure: 30(let ([is (open-input-string "barfoo")]) 31 (test (list (rx:regexp-match "^foo" is 0 3) (read-char is)) '(#f #\f))) 32(let ([is (open-input-string "barfoo")]) 33 (test (list (rx:regexp-match "foo" is 0 3) (read-char is)) '(#f #\f))) 34 35;; Don't consume bytes that corresponds to a prefix: 36(let () 37 (define in (open-input-string "a\nb\nc\n")) 38 (define rx:.n (rx:byte-regexp #"(?m:^.\n)")) 39 (test (rx:regexp-match rx:.n in 0 #f #f #"") '(#"a\n")) 40 (test (rx:regexp-match rx:.n in 0 #f #f #"\n") '(#"b\n")) 41 (test (rx:regexp-match rx:.n in 0 #f #f #"\n") '(#"c\n"))) 42 43(let () 44 (define in (open-input-bytes #" a b c ")) 45 46 (define discard (open-output-bytes)) 47 (rx:regexp-match "[abc]" in 0 3 discard #"") 48 (test (get-output-bytes discard) #" ") 49 50 (define discard2 (open-output-bytes)) 51 (rx:regexp-match "[abc]" in 0 1 discard2 #"") 52 (test (get-output-bytes discard2) #" ")) 53 54;; Input streams that are large enough for bytes to be discarded along the way 55(test (rx:regexp-match #"(.)x" (open-input-string (string-append (make-string 50000 #\y) "x"))) 56 '(#"yx" #"y")) 57(test (rx:regexp-match-positions #"(.)x" (open-input-string (string-append (make-string 50000 #\y) "x"))) 58 '((49999 . 50001) (49999 . 50000))) 59(test (rx:regexp-match "(.)x" (string-append (make-string 50000 #\y) "x")) 60 '("yx" "y")) 61(test (rx:regexp-match-positions "(.)x" (string-append (make-string 50000 #\y) "x")) 62 '((49999 . 50001) (49999 . 50000))) 63(test (rx:regexp-match "(.)\u3BC" (string-append (make-string 50000 #\u3BB) "\u3BC")) 64 '("\u3BB\u3BC" "\u3BB")) 65(test (rx:regexp-match-positions "(.)\u3BC" (string-append (make-string 50000 #\y) "\u3BC")) 66 '((49999 . 50001) (49999 . 50000))) 67 68(test (rx:regexp-match-positions #"<([abc])(>)?" "<a + <b = <c" 3) 69 '((5 . 7) (6 . 7) #f)) 70(test (rx:regexp-match-positions "[abc]" " a b c " 2) 71 '((3 . 4))) 72(test (rx:regexp-match-positions "(?m:^.\n)" "a\nb\nc\n" 2 6 #f #"\n") 73 '((2 . 4))) 74(test (rx:regexp-match-positions "(?:(?m:^$))(?<=..)" "ge \n TLambda-tc\n\n ;; (extend Γ o Γx-s\n extend\n\n ;;" 29 #f #f #"\n") 75 '((46 . 46))) 76 77;; Match groups spans prefix: 78(test (rx:regexp-match-positions (rx:regexp "(?<=(..))") "aaa" 0 #f #f #"x") 79 '((1 . 1) (-1 . 1))) 80(test (rx:regexp-match-positions (rx:regexp "(?<=(.))") "aaa" 0 #f #f #"x") 81 '((0 . 0) (-1 . 0))) 82(test (rx:regexp-match-positions (rx:byte-regexp #"(?<=(.))") #"aaa" 0 #f #f #"x") 83 '((0 . 0) (-1 . 0))) 84(test (rx:regexp-match-peek-positions (rx:byte-regexp #"(?<=(.))") (open-input-bytes #"aaa") 0 #f #f #"x") 85 '((0 . 0) (-1 . 0))) 86(test (rx:regexp-match-positions (rx:regexp "(?<=(.))") "aaa" 0 #f #f (string->bytes/utf-8 "\u3BB")) 87 '((0 . 0) (-1 . 0))) 88(test (rx:regexp-match-positions (rx:regexp "(?<=(.)).") "\u03BBaa" 0 #f #f (string->bytes/utf-8 "\u3BC")) 89 '((0 . 1) (-1 . 0))) 90(test (rx:regexp-match (rx:regexp "(?<=(.))") "aaa" 0 #f #f (string->bytes/utf-8 "\u3BB")) 91 '("" "\u3BB")) 92(test (rx:regexp-match (rx:regexp "(?<=(.)).") "aaa" 0 #f #f (string->bytes/utf-8 "\u3BB")) 93 '("a" "\u3BB")) 94(test (rx:regexp-match (rx:regexp "(?<=(.)).") "\u03BBaa" 0 #f #f (string->bytes/utf-8 "\u3BC")) 95 '("\u3BB" "\u3BC")) 96(test (rx:regexp-match (rx:byte-regexp #"(?<=(.))") #"aaa" 0 #f #f #"x") 97 '(#"" #"x")) 98(test (rx:regexp-match (rx:byte-regexp #"(?<=(.))..") (open-input-bytes #"abc") 0 #f #f #"x") 99 '(#"ab" #"x")) 100(test (rx:regexp-match-peek (rx:byte-regexp #"(?<=(.))..") (open-input-bytes #"abc") 0 #f #f #"x") 101 '(#"ab" #"x")) 102 103;; Replacement where match groups spans prefix: 104(test (rx:regexp-replace (rx:byte-regexp #"(?<=(.))") #"aaa" #"[\\1]" #"x") 105 #"[x]aaa") 106(test (rx:regexp-replace (rx:byte-regexp #"(?<=(..))") #"abc" #"[\\1]" #"x") 107 #"a[xa]bc") 108(test (rx:regexp-replace (rx:byte-regexp #"(?<=(.)).") #"aaa" #"[\\1]" #"x") 109 #"[x]aa") 110(test (rx:regexp-replace (rx:regexp "(?<=(.))") "aaa" "[\\1]" #"x") 111 "[x]aaa") 112(test (rx:regexp-replace (rx:regexp "(?<=(..))") "aaa" "[\\1]" #"x") 113 "a[xa]aa") 114(test (rx:regexp-replace (rx:regexp "(?<=(.)).") "aaa" "[\\1]" #"x") 115 "[x]aa") 116(test (rx:regexp-replace (rx:regexp "(?<=(.)).") "aaa" "[\\1]" #"\xFFx") 117 "[x]aa") 118(test (rx:regexp-replace (rx:regexp "(?<=(.)).") "abc" "[\\1]" #"\xFF") ; can't match non-UTF-8 prefix 119 "a[a]c") 120(test (rx:regexp-replace (rx:regexp "(?<=(..)).") "aaa" "[\\1]" 121 (bytes-append #"\xFF" 122 (string->bytes/utf-8 "\u03BBx"))) 123 "[\u03BBx]aa") 124(test (rx:regexp-replace (rx:regexp "(?<=(..)).") "aaa" (lambda (m m1) (string-append "{" m1 "}")) 125 (bytes-append #"\xFF" 126 (string->bytes/utf-8 "\u03BBx"))) 127 "{\u03BBx}aa") 128 129(test (rx:regexp-replace* "-" "zero-or-more?" "_") 130 "zero_or_more?") 131(test (rx:regexp-replace* "" "aaa" "c") 132 "cacacac") 133(test (rx:regexp-replace* (rx:regexp "(?<=(.)).") "aaa" "[\\1]" #"\xFFx") 134 "[x][a][a]") 135(test (rx:regexp-replace* (rx:regexp "(?<=(.)).") "abc" "[\\1]" #"\xFFx") 136 "[x][a][b]") 137(test (rx:regexp-replace* (rx:regexp "(?<=(..)).") "abc" "[\\1]" #"\xFFx") 138 "a[xa][ab]") 139 140(test (rx:regexp-replace* (rx:byte-regexp #"(?<=(..))") #"abc" #"[\\1]" #"x") 141 #"a[xa]b[ab]c[bc]") 142 143;; Don't get stuck waiting for an unneeded byte: 144(let () 145 (define-values (i o) (make-pipe)) 146 (write-string "1\n" o) 147 (define rx (rx:regexp "^(?:(.*?)(?:\r\n|\n))")) 148 (test (rx:regexp-match rx i) '(#"1\n" #"1"))) 149(let () 150 (define-values (i o) (make-pipe)) 151 (write-string "abc" o) 152 (define rx (rx:regexp "^(ab)*")) 153 (test (rx:regexp-match rx i) '(#"ab" #"ab"))) 154(let () 155 (define-values (i o) (make-pipe)) 156 (write-string "123" o) 157 (define rx (rx:pregexp "^(12)\\1|123")) 158 (test (rx:regexp-match rx i) '(#"123" #f))) 159 160;; Check for quadratic `regexp-replace*`: 161(time 162 (test (rx:regexp-replace* (rx:regexp "a") (make-string (* 1024 1024) #\a) "x") 163 (make-string (* 1024 1024) #\x))) 164 165;; ---------------------------------------- 166 167(define (check rx in N [M (max 1 (quotient N 10))]) 168 (define c-start (current-inexact-milliseconds)) 169 (define orig-rx 170 (if (bytes? rx) 171 (for/fold ([r #f]) ([i (in-range M)]) 172 (byte-pregexp rx)) 173 (for/fold ([r #f]) ([i (in-range M)]) 174 (pregexp rx)))) 175 (define c-after-orig (current-inexact-milliseconds)) 176 (define new-rx 177 (if (bytes? rx) 178 (for/fold ([r #f]) ([i (in-range M)]) 179 (rx:byte-pregexp rx)) 180 (for/fold ([r #f]) ([i (in-range M)]) 181 (rx:pregexp rx)))) 182 (define c-after-new (current-inexact-milliseconds)) 183 184 (define orig-v (regexp-match orig-rx in)) 185 (define new-v (rx:regexp-match new-rx in)) 186 (unless (equal? orig-v new-v) 187 (error 'check 188 "failed\n pattern: ~s\n input: ~s\n expected: ~s\n got: ~s" 189 rx in orig-v new-v)) 190 191 (define start (current-inexact-milliseconds)) 192 (for/fold ([r #f]) ([i (in-range N)]) 193 (regexp-match? orig-rx in)) 194 (define after-orig (current-inexact-milliseconds)) 195 (for/fold ([r #f]) ([i (in-range N)]) 196 (rx:regexp-match? new-rx in)) 197 (define after-new (current-inexact-milliseconds)) 198 199 (define orig-c-msec (- c-after-orig c-start)) 200 (define new-c-msec (- c-after-new c-after-orig)) 201 (define orig-msec (- after-orig start)) 202 (define new-msec (- after-new after-orig)) 203 204 (unless (= N 1) 205 (parameterize ([error-print-width 64]) 206 (printf "regex: ~.s\non: ~.s\n" rx in)) 207 208 (define (~n n) 209 (car (regexp-match #px"^[0-9]*[.]?[0-9]{0,2}" (format "~a" n)))) 210 211 (printf " compile: ~a (~a vs. ~a) / ~a iterations\n" 212 (~n (/ new-c-msec orig-c-msec)) 213 (~n orig-c-msec) 214 (~n new-c-msec) 215 M) 216 (printf " interp: ~a (~a vs. ~a) / ~a iterations\n" 217 (~n (/ new-msec orig-msec)) 218 (~n orig-msec) 219 (~n new-msec) 220 N))) 221 222;; ---------------------------------------- 223 224(check #"(?m:^aa$a.)" 225 #"abaac\nac\naa\nacacaaacd" 226 1) 227 228(check #"\\sa." 229 #"cat apple" 230 1) 231 232(check "(?>a*)a" 233 "aaa" 234 1) 235 236(check "(?:a|b)y(\\1)" 237 "ayb" 238 1) 239 240(check "!.!" 241 #"!\x80!" 242 1) 243 244(check #"\\P{Ll}" 245 #"aB" 246 1) 247 248(check #"\\p{^Ll}" 249 #"aB" 250 1) 251 252(check #".*" 253 #"abaacacaaacacaaacd" 254 100000) 255 256(check #"ab(?:a*c)*d" 257 #"abaacacaaacacaaacd" 258 100000) 259 260(check #"ab(?:a*?c)*d" 261 #"abaacacaaacacaaacd" 262 100000) 263 264(check #"ab(?:[ab]*c)*d" 265 #"abaacacaaacacaaacd" 266 100000) 267 268(define ipv6-hex "[0-9a-fA-F:]*:[0-9a-fA-F:]*") 269 270(define url-s 271 (string-append 272 "^" 273 "(?:" ; / scheme-colon-opt 274 "([^:/?#]*)" ; | #1 = scheme-opt 275 ":)?" ; \ 276 "(?://" ; / slash-slash-authority-opt 277 "(?:" ; | / user-at-opt 278 "([^/?#@]*)" ; | | #2 = user-opt 279 "@)?" ; | \ 280 "(?:" ; 281 "(?:\\[" ; | / #3 = ipv6-host-opt 282 "(" ipv6-hex ")" ; | | hex-addresses 283 "\\])|" ; | \ 284 "([^/?#:]*)" ; | #4 = host-opt 285 ")?" ; 286 "(?::" ; | / colon-port-opt 287 "([0-9]*)" ; | | #5 = port-opt 288 ")?" ; | \ 289 ")?" ; \ 290 "([^?#]*)" ; #6 = path 291 "(?:\\?" ; / question-query-opt 292 "([^#]*)" ; | #7 = query-opt 293 ")?" ; \ 294 "(?:#" ; / hash-fragment-opt 295 "(.*)" ; | #8 = fragment-opt 296 ")?" ; \ 297 "$")) 298 299(define rlo "https://racket-lang.org:80x/people.html?check=ok#end") 300 301(check (string->bytes/utf-8 url-s) 302 (string->bytes/utf-8 rlo) 303 100000) 304 305(check url-s 306 rlo 307 10000) 308 309;; all of the work is looking for a must-string 310(check #"a*b" 311 (make-bytes 1024 (char->integer #\a)) 312 100000) 313