1#lang racket/base 2(require "../parse/ast.rkt" 3 "../common/range.rkt" 4 "match.rkt") 5 6;; Compile to a Spencer-style interpretation of a regular expression, 7;; where sequences are implemented by record chaining. Backtracking 8;; is implemented by a stack of of success continuations as needed. 9 10;; Spenser's implementation in C dispatches on records, but we compile 11;; to closures, instead. A function like `byte-matcher` allocates a 12;; closure to implement byte matching. Matcher-creation functions 13;; usually take a closure to use as the next step, so the closure tree 14;; is built bottom-up. 15 16(provide compile) 17 18(define (compile rx) 19 (let compile ([rx rx] [next-m done-m]) 20 (define-syntax-rule (mode-cond 21 #:tail tail 22 #:general general) 23 (cond 24 [(eq? next-m done-m) tail] 25 [else general])) 26 (cond 27 [(exact-integer? rx) 28 (mode-cond 29 #:tail (byte-tail-matcher rx) 30 #:general (byte-matcher rx next-m))] 31 [(bytes? rx) 32 (define len (bytes-length rx)) 33 (mode-cond 34 #:tail (bytes-tail-matcher rx len) 35 #:general (bytes-matcher rx len next-m))] 36 [(eq? rx rx:empty) 37 next-m] 38 [(eq? rx rx:never) 39 (never-matcher)] 40 [(eq? rx rx:any) 41 (mode-cond 42 #:tail (any-tail-matcher) 43 #:general (any-matcher next-m))] 44 [(rx:range? rx) 45 (define rng (compile-range (rx:range-range rx))) 46 (mode-cond 47 #:tail (range-tail-matcher rng) 48 #:general (range-matcher rng next-m))] 49 [(eq? rx rx:start) 50 (start-matcher next-m)] 51 [(eq? rx rx:end) 52 (end-matcher next-m)] 53 [(eq? rx rx:line-start) 54 (line-start-matcher next-m)] 55 [(eq? rx rx:line-end) 56 (line-end-matcher next-m)] 57 [(eq? rx rx:word-boundary) 58 (word-boundary-matcher next-m)] 59 [(eq? rx rx:not-word-boundary) 60 (not-word-boundary-matcher next-m)] 61 [(rx:sequence? rx) 62 (define rxs (rx:sequence-rxs rx)) 63 (let loop ([rxs rxs]) 64 (cond 65 [(null? rxs) next-m] 66 [else 67 (define rest-node (loop (cdr rxs))) 68 (compile (car rxs) rest-node)]))] 69 [(rx:alts? rx) 70 (alts-matcher (compile (rx:alts-rx1 rx) next-m) 71 (compile (rx:alts-rx2 rx) next-m))] 72 [(rx:maybe? rx) 73 (if (rx:maybe-non-greedy? rx) 74 (alts-matcher next-m 75 (compile (rx:maybe-rx rx) next-m)) 76 (alts-matcher (compile (rx:maybe-rx rx) next-m) 77 next-m))] 78 [(rx:repeat? rx) 79 (define actual-r-rx (rx:repeat-rx rx)) 80 ;; As a special case, handle in non-lazy `repeat` a group around 81 ;; a simple pattern: 82 (define r-rx (if (and (rx:group? actual-r-rx) 83 (not (rx:repeat-non-greedy? rx)) 84 (not (needs-backtrack? (rx:group-rx actual-r-rx)))) 85 (rx:group-rx actual-r-rx) 86 actual-r-rx)) 87 (define simple? (not (needs-backtrack? r-rx))) 88 (define group-n (and simple? 89 (rx:group? actual-r-rx) 90 (rx:group-number actual-r-rx))) 91 (define min (rx:repeat-min rx)) 92 (define max (let ([n (rx:repeat-max rx)]) 93 (if (= n +inf.0) #f n))) 94 (define r-m* (compile*/maybe r-rx min max)) 95 (cond 96 [(and r-m* 97 (not (rx:repeat-non-greedy? rx))) 98 (repeat-simple-many-matcher r-m* min max group-n next-m)] 99 [else 100 (define r-m (compile r-rx (if simple? done-m continue-m))) 101 (cond 102 [(rx:repeat-non-greedy? rx) 103 (if simple? 104 (lazy-repeat-simple-matcher r-m min max next-m) 105 (lazy-repeat-matcher r-m min max next-m))] 106 [else 107 (if simple? 108 (repeat-simple-matcher r-m min max group-n next-m) 109 (repeat-matcher r-m min max next-m))])])] 110 [(rx:group? rx) 111 (define n (rx:group-number rx)) 112 (define m (compile (rx:group-rx rx) (group-set-matcher n next-m))) 113 (group-push-matcher n m)] 114 [(rx:reference? rx) 115 (define n (rx:reference-n rx)) 116 (cond 117 [(zero? n) 118 (never-matcher)] 119 [(rx:reference-case-sensitive? rx) 120 (reference-matcher (sub1 n) next-m)] 121 [else 122 (reference-matcher/case-insensitive (sub1 n) next-m)])] 123 [(rx:cut? rx) 124 (cut-matcher (compile (rx:cut-rx rx) done-m) 125 (rx:cut-n-start rx) 126 (rx:cut-num-n rx) 127 next-m)] 128 [(rx:conditional? rx) 129 (define tst (rx:conditional-tst rx)) 130 (define m1 (compile (rx:conditional-rx1 rx) next-m)) 131 (define m2 (compile (rx:conditional-rx2 rx) next-m)) 132 (cond 133 [(rx:reference? tst) 134 (define n (sub1 (rx:reference-n tst))) 135 (conditional/reference-matcher n m1 m2)] 136 [else 137 (conditional/look-matcher (compile tst done-m) m1 m2 138 (rx:conditional-n-start rx) 139 (rx:conditional-num-n rx))])] 140 [(rx:lookahead? rx) 141 (lookahead-matcher (rx:lookahead-match? rx) 142 (compile (rx:lookahead-rx rx) done-m) 143 (rx:lookahead-n-start rx) 144 (rx:lookahead-num-n rx) 145 next-m)] 146 [(rx:lookbehind? rx) 147 (lookbehind-matcher (rx:lookbehind-match? rx) 148 (rx:lookbehind-lb-min rx) 149 (rx:lookbehind-lb-max rx) 150 (compile (rx:lookbehind-rx rx) limit-m) 151 (rx:lookbehind-n-start rx) 152 (rx:lookbehind-num-n rx) 153 next-m)] 154 [(rx:unicode-categories? rx) 155 (unicode-categories-matcher (rx:unicode-categories-symlist rx) 156 (rx:unicode-categories-match? rx) 157 next-m)] 158 [else (error 'compile/bt "internal error: unrecognized ~s" rx)]))) 159 160;; Compile a matcher repeater, if possible; the result is 161;; the repeating matcher and the (consistent) length of each match 162(define (compile*/maybe rx min max) 163 (cond 164 [(exact-integer? rx) 165 (byte-matcher* rx max)] 166 [(bytes? rx) 167 (bytes-matcher* rx max)] 168 [(eq? rx rx:any) 169 (any-matcher* max)] 170 [(rx:range? rx) 171 (range-matcher* (compile-range (rx:range-range rx)) max)] 172 [else 173 #f])) 174 175;; Determine the length of the prefix of `l` that needs backtracking: 176(define (count-backtrack-prefix l) 177 (let loop ([l l] [total 0] [non-bt 0]) 178 (cond 179 [(null? l) (- total non-bt)] 180 [(needs-backtrack? (car l)) 181 (loop (cdr l) (add1 total) 0)] 182 [else 183 (loop (cdr l) (add1 total) (add1 non-bt))]))) 184