1#lang racket/base
2(require "../parse/ast.rkt"
3         "../common/range.rkt"
4         "match.rkt")
5
6;; Compile to a Spencer-style interpretation of a regular expression,
7;; where sequences are implemented by record chaining. Backtracking
8;; is implemented by a stack of of success continuations as needed.
9
10;; Spenser's implementation in C dispatches on records, but we compile
11;; to closures, instead. A function like `byte-matcher` allocates a
12;; closure to implement byte matching. Matcher-creation functions
13;; usually take a closure to use as the next step, so the closure tree
14;; is built bottom-up.
15
16(provide compile)
17
18(define (compile rx)
19  (let compile ([rx rx] [next-m done-m])
20    (define-syntax-rule (mode-cond
21                         #:tail tail
22                         #:general general)
23      (cond
24       [(eq? next-m done-m) tail]
25       [else general]))
26    (cond
27     [(exact-integer? rx)
28      (mode-cond
29       #:tail (byte-tail-matcher rx)
30       #:general (byte-matcher rx next-m))]
31     [(bytes? rx)
32      (define len (bytes-length rx))
33      (mode-cond
34       #:tail (bytes-tail-matcher rx len)
35       #:general (bytes-matcher rx len next-m))]
36     [(eq? rx rx:empty)
37      next-m]
38     [(eq? rx rx:never)
39      (never-matcher)]
40     [(eq? rx rx:any)
41      (mode-cond
42       #:tail (any-tail-matcher)
43       #:general (any-matcher next-m))]
44     [(rx:range? rx)
45      (define rng (compile-range (rx:range-range rx)))
46      (mode-cond
47       #:tail (range-tail-matcher rng)
48       #:general (range-matcher rng next-m))]
49     [(eq? rx rx:start)
50      (start-matcher next-m)]
51     [(eq? rx rx:end)
52      (end-matcher next-m)]
53     [(eq? rx rx:line-start)
54      (line-start-matcher next-m)]
55     [(eq? rx rx:line-end)
56      (line-end-matcher next-m)]
57     [(eq? rx rx:word-boundary)
58      (word-boundary-matcher next-m)]
59     [(eq? rx rx:not-word-boundary)
60      (not-word-boundary-matcher next-m)]
61     [(rx:sequence? rx)
62      (define rxs (rx:sequence-rxs rx))
63      (let loop ([rxs rxs])
64        (cond
65         [(null? rxs) next-m]
66         [else
67          (define rest-node (loop (cdr rxs)))
68          (compile (car rxs) rest-node)]))]
69     [(rx:alts? rx)
70      (alts-matcher (compile (rx:alts-rx1 rx) next-m)
71                    (compile (rx:alts-rx2 rx) next-m))]
72     [(rx:maybe? rx)
73      (if (rx:maybe-non-greedy? rx)
74          (alts-matcher next-m
75                        (compile (rx:maybe-rx rx) next-m))
76          (alts-matcher (compile (rx:maybe-rx rx) next-m)
77                        next-m))]
78     [(rx:repeat? rx)
79      (define actual-r-rx (rx:repeat-rx rx))
80      ;; As a special case, handle in non-lazy `repeat` a group around
81      ;; a simple pattern:
82      (define r-rx (if (and (rx:group? actual-r-rx)
83                            (not (rx:repeat-non-greedy? rx))
84                            (not (needs-backtrack? (rx:group-rx actual-r-rx))))
85                       (rx:group-rx actual-r-rx)
86                       actual-r-rx))
87      (define simple? (not (needs-backtrack? r-rx)))
88      (define group-n (and simple?
89                           (rx:group? actual-r-rx)
90                           (rx:group-number actual-r-rx)))
91      (define min (rx:repeat-min rx))
92      (define max (let ([n (rx:repeat-max rx)])
93                    (if (= n +inf.0) #f n)))
94      (define r-m* (compile*/maybe r-rx min max))
95      (cond
96       [(and r-m*
97             (not (rx:repeat-non-greedy? rx)))
98        (repeat-simple-many-matcher r-m* min max group-n next-m)]
99       [else
100        (define r-m (compile r-rx (if simple? done-m continue-m)))
101        (cond
102         [(rx:repeat-non-greedy? rx)
103          (if simple?
104              (lazy-repeat-simple-matcher r-m min max next-m)
105              (lazy-repeat-matcher r-m min max next-m))]
106         [else
107          (if simple?
108              (repeat-simple-matcher r-m min max group-n next-m)
109              (repeat-matcher r-m min max next-m))])])]
110     [(rx:group? rx)
111      (define n (rx:group-number rx))
112      (define m (compile (rx:group-rx rx) (group-set-matcher n next-m)))
113      (group-push-matcher n m)]
114     [(rx:reference? rx)
115      (define n (rx:reference-n rx))
116      (cond
117       [(zero? n)
118        (never-matcher)]
119       [(rx:reference-case-sensitive? rx)
120        (reference-matcher (sub1 n) next-m)]
121       [else
122        (reference-matcher/case-insensitive (sub1 n) next-m)])]
123     [(rx:cut? rx)
124      (cut-matcher (compile (rx:cut-rx rx) done-m)
125                   (rx:cut-n-start rx)
126                   (rx:cut-num-n rx)
127                   next-m)]
128     [(rx:conditional? rx)
129      (define tst (rx:conditional-tst rx))
130      (define m1 (compile (rx:conditional-rx1 rx) next-m))
131      (define m2 (compile (rx:conditional-rx2 rx) next-m))
132      (cond
133       [(rx:reference? tst)
134        (define n (sub1 (rx:reference-n tst)))
135        (conditional/reference-matcher n m1 m2)]
136       [else
137        (conditional/look-matcher (compile tst done-m) m1 m2
138                                  (rx:conditional-n-start rx)
139                                  (rx:conditional-num-n rx))])]
140     [(rx:lookahead? rx)
141      (lookahead-matcher (rx:lookahead-match? rx)
142                         (compile (rx:lookahead-rx rx) done-m)
143                         (rx:lookahead-n-start rx)
144                         (rx:lookahead-num-n rx)
145                         next-m)]
146     [(rx:lookbehind? rx)
147      (lookbehind-matcher (rx:lookbehind-match? rx)
148                          (rx:lookbehind-lb-min rx)
149                          (rx:lookbehind-lb-max rx)
150                          (compile (rx:lookbehind-rx rx) limit-m)
151                          (rx:lookbehind-n-start rx)
152                          (rx:lookbehind-num-n rx)
153                          next-m)]
154     [(rx:unicode-categories? rx)
155      (unicode-categories-matcher (rx:unicode-categories-symlist rx)
156                                  (rx:unicode-categories-match? rx)
157                                  next-m)]
158     [else (error 'compile/bt "internal error: unrecognized ~s" rx)])))
159
160;; Compile a matcher repeater, if possible; the result is
161;; the repeating matcher and the (consistent) length of each match
162(define (compile*/maybe rx min max)
163  (cond
164   [(exact-integer? rx)
165    (byte-matcher* rx max)]
166   [(bytes? rx)
167    (bytes-matcher* rx max)]
168   [(eq? rx rx:any)
169    (any-matcher* max)]
170   [(rx:range? rx)
171    (range-matcher* (compile-range (rx:range-range rx)) max)]
172   [else
173    #f]))
174
175;; Determine the length of the prefix of `l` that needs backtracking:
176(define (count-backtrack-prefix l)
177  (let loop ([l l] [total 0] [non-bt 0])
178    (cond
179     [(null? l) (- total non-bt)]
180     [(needs-backtrack? (car l))
181      (loop (cdr l) (add1 total) 0)]
182     [else
183      (loop (cdr l) (add1 total) (add1 non-bt))])))
184