1#lang racket/base
2(require "chyte.rkt"
3         "chyte-case.rkt"
4         "ast.rkt"
5         "config.rkt"
6         "error.rkt"
7         "../common/range.rkt"
8         "class.rkt"
9         "unicode.rkt"
10         "range.rkt"
11         "case.rkt")
12
13(provide parse)
14
15(define (parse p #:px? [px? #f])
16  (define config (make-parse-config #:px? px?))
17  (define-values (rx pos) (parse-regexp p 0 config))
18  (chyte-case/eos
19   p pos
20   [(#\))
21    (parse-error p pos config "unmatched `)` in pattern")]
22   [else
23    (values rx
24            (config-group-number config)
25            (unbox (parse-config-references?-box config)))]))
26
27;; Returns (values rx position)
28(define (parse-regexp s pos config #:parse-regexp [parse-regexp (lambda (s pos config)
29                                                                  (parse-regexp s pos config))])
30  (define-values (rxs pos2) (parse-pces s pos config))
31  (chyte-case/eos
32   s pos2
33   [(#\|)
34    (define-values (rx pos3) (parse-regexp s (add1 pos2) config))
35    (values (rx-alts (rx-sequence rxs) rx (chytes-limit s)) pos3)]
36   [else
37    (values (rx-sequence rxs) pos2)]))
38
39(define (parse-regexp/maybe-empty s pos config)
40  (chyte-case/eos
41   s pos
42   [(#\))
43    (values rx:empty pos)]
44   [else
45    (parse-regexp s pos config #:parse-regexp parse-regexp/maybe-empty)]))
46
47;; Returns (values list-of-rx position)
48(define (parse-pces s pos config)
49  (cond
50   [(= pos (chytes-length s))
51    (values null pos)]
52   [else
53    (define-values (rx pos2) (parse-pce s pos config))
54    (chyte-case/eos
55     s pos2
56     [(eos)
57      (values (list rx) pos2)]
58     [(#\| #\))
59      (values (list rx) pos2)]
60     [else
61      (define-values (rxs pos3) (parse-pces s pos2 config))
62      (values (cons rx rxs) pos3)])]))
63
64;; Returns (values rx position)
65(define (parse-pce s pos config)
66  (define-values (rx pos2) (parse-atom s pos config))
67  (chyte-case/eos
68   s pos2
69   [(#\*)
70    (define-values (non-greedy? pos3) (parse-non-greedy s (add1 pos2) config))
71    (values (rx:repeat rx 0 +inf.0 non-greedy?) pos3)]
72   [(#\+)
73    (define-values (non-greedy? pos3) (parse-non-greedy s (add1 pos2) config))
74    (values (rx:repeat rx 1 +inf.0 non-greedy?) pos3)]
75   [(#\?)
76    (define-values (non-greedy? pos3) (parse-non-greedy s (add1 pos2) config))
77    (values (rx:maybe rx non-greedy?) pos3)]
78   [(#\{)
79    (cond
80     [(parse-config-px? config)
81      (define-values (n1 pos3) (parse-integer 0 s (add1 pos2) config))
82      (chyte-case/eos
83       s pos3
84       [(#\,)
85        (define-values (n2 pos4) (parse-integer 0 s (add1 pos3) config))
86        (chyte-case/eos
87         s pos4
88         [(#\})
89          (define n2* (if (= pos4 (add1 pos3)) +inf.0 n2))
90          (define-values (non-greedy? pos5) (parse-non-greedy s (add1 pos4) config))
91          (values (rx:repeat rx n1 n2* non-greedy?) pos5)]
92         [else
93          (parse-error s pos3 config "expected digit or `}` to end repetition specification started with `{`")])]
94       [(#\})
95        (define-values (non-greedy? pos4) (parse-non-greedy s (add1 pos3) config))
96        (values (rx:repeat rx n1 n1 non-greedy?) pos4)]
97       [else
98        (parse-error s pos3 config "expected digit, `,`, or `}' for repetition specification started with `{`")])]
99     [else
100      (values rx pos2)])]
101   [else
102    (values rx pos2)]))
103
104(define (parse-non-greedy s pos config)
105  (chyte-case/eos
106   s pos
107   [(#\?)
108    (values #t (check-not-nested s (add1 pos) config))]
109   [else
110    (values #f (check-not-nested s pos config))]))
111
112(define (check-not-nested s pos config)
113  (chyte-case/eos
114   s pos
115   [(#\? #\* #\+)
116    (parse-error s pos config
117                 "nested `~a` in patten"
118                 (integer->char (chytes-ref s pos)))]
119   [(#\{)
120    (when (parse-config-px? config)
121      (parse-error s pos config
122                   "nested `{` in pattern"))])
123  pos)
124
125;; Returns (values rx position)
126(define (parse-atom s pos config)
127  ;; Assumes at least one character
128  (chyte-case
129   (chytes-ref s pos)
130   [(#\|)
131    (values rx:empty pos)]
132   [(#\()
133    (parse-parenthesized-atom s (add1 pos) config)]
134   [(#\[)
135    (define-values (range pos2) (parse-range/not s (add1 pos) config))
136    (values (rx-range range (chytes-limit s)) pos2)]
137   [(#\.)
138    (define rx (if (parse-config-multi-line? config)
139                   (rx-range (range-invert (range-add empty-range (chyte #\newline))
140                                           (chytes-limit s))
141                             (chytes-limit s))
142                   rx:any))
143    (values rx (add1 pos))]
144   [(#\^)
145    (values (if (parse-config-multi-line? config) rx:line-start rx:start)
146            (add1 pos))]
147   [(#\$)
148    (values (if (parse-config-multi-line? config) rx:line-end rx:end)
149            (add1 pos))]
150   [else
151    ;; Literal or (for px mode) `\` character class
152    (parse-literal s pos config)]))
153
154;; Returns (values rx position)
155(define (parse-parenthesized-atom s pos config)
156  (chyte-case/eos
157   s pos
158   [(eos)
159    (missing-closing-error s pos config)]
160   [(#\?)
161    (define pos2 (add1 pos))
162    (chyte-case/eos
163     s pos2
164     [(eos)
165      (bad-?-sequence-error s pos2 config)]
166     [(#\>)
167      (define pre-num-groups (config-group-number config))
168      (define-values (rx pos3) (parse-regexp/maybe-empty s (add1 pos2) config))
169      (define post-num-groups (config-group-number config))
170      (values (rx-cut rx pre-num-groups (- post-num-groups pre-num-groups))
171              (check-close-paren s pos3 config))]
172     [(#\()
173      (parse-conditional s (add1 pos2) config)]
174     [(#\i #\s #\m #\- #\:)
175      (define-values (config2 pos3) (parse-mode s pos2 config))
176      (chyte-case/eos
177       s pos3
178       [(#\:)
179        (define-values (rx pos4) (parse-regexp/maybe-empty s (add1 pos3) config2))
180        (values rx (check-close-paren s pos4 config2))]
181       [else
182        (parse-error s pos3 config2 (string-append
183                                     "expected `:` or another mode after `(?` and a mode sequence;\n"
184                                     " a mode is `i`, `-i`, `m`, `-m`, `s`, or `-s`"))])]
185     [else
186      (parse-look s pos2 config)])]
187   [else
188    (define group-number (config-group-number config))
189    (define-values (rx pos2) (parse-regexp/maybe-empty s pos (config-group-number+1 config)))
190    (values (rx-group rx group-number)
191            (check-close-paren s pos2 config))]))
192
193;; Returns (values rx position)
194(define (parse-look s pos2 config)
195  ;; known that one character is available
196  (define pre-num-groups (config-group-number config))
197  (define (span-num-groups) (- (config-group-number config) pre-num-groups))
198  (chyte-case
199   (chytes-ref s pos2)
200   [(#\=)
201    (define-values (rx pos3) (parse-regexp/maybe-empty s (add1 pos2) config))
202    (values (rx:lookahead rx #t pre-num-groups (span-num-groups))
203            (check-close-paren s pos3 config))]
204   [(#\!)
205    (define-values (rx pos3) (parse-regexp/maybe-empty s (add1 pos2) config))
206    (values (rx:lookahead rx #f pre-num-groups (span-num-groups))
207            (check-close-paren s pos3 config))]
208   [(#\<)
209    (define pos2+ (add1 pos2))
210    (chyte-case/eos
211     s pos2+
212     [(eos)
213      (bad-?-sequence-error s pos2+ config)]
214     [(#\=)
215      (define-values (rx pos3) (parse-regexp/maybe-empty s (add1 pos2+) config))
216      (values (rx:lookbehind rx #t 0 0 pre-num-groups (span-num-groups))
217              (check-close-paren s pos3 config))]
218     [(#\!)
219      (define-values (rx pos3) (parse-regexp/maybe-empty s (add1 pos2+) config))
220      (values (rx:lookbehind rx #f 0 0 pre-num-groups (span-num-groups))
221              (check-close-paren s pos3 config))]
222     [else
223      (bad-?-sequence-error s pos2+ config)])]
224   [else
225    (bad-?-sequence-error s pos2 config)]))
226
227;; Returns (values rx position)
228(define (parse-conditional s pos config)
229  (define tst-pre-num-groups (config-group-number config))
230  (define-values (tst pos2) (parse-test s pos config))
231  (define tst-span-num-groups (- (config-group-number config) tst-pre-num-groups))
232  (define-values (pces pos3) (parse-pces s pos2 config))
233  (chyte-case/eos
234   s pos3
235   [(eos)
236    (missing-closing-error s pos3 config)]
237   [(#\|)
238    (define-values (pces2 pos4) (parse-pces s (add1 pos3) config))
239    (chyte-case/eos
240     s pos4
241     [(eos)
242      (missing-closing-error s pos4 config)]
243     [(#\))
244      (values (rx-conditional tst (rx-sequence pces) (rx-sequence pces2)
245                              tst-pre-num-groups tst-span-num-groups)
246              (add1 pos4))]
247     [else
248      (parse-error s pos4 config "expected `)` to close `(?(...)...` after second branch")])]
249   [(#\))
250    (values (rx-conditional tst (rx-sequence pces) rx:empty
251                            tst-pre-num-groups tst-span-num-groups)
252            (add1 pos3))]))
253
254;; Returns (values rx position)
255(define (parse-test s pos config)
256  (chyte-case/eos
257   s pos
258   [(eos)
259    (missing-closing-error s pos config)]
260   [(#\?)
261    (parse-look s (add1 pos) config)]
262   [else
263    (define c (chytes-ref s pos))
264    (cond
265     [(and (>= c (chyte #\0)) (<= c (chyte #\9)))
266      (set-box! (parse-config-references?-box config) #t)
267      (define-values (n pos3) (parse-integer 0 s pos config))
268      (unless (and (pos3 . < . (chytes-length s))
269                   (= (chytes-ref s pos3) (chyte #\))))
270        (parse-error s pos3 config "expected `)` after `(?(` followed by digits"))
271      (values (rx:reference n #f) (add1 pos3))]
272     [else
273      (parse-error s pos config "expected `(?=`, `(?!`, `(?<`, or digit after `(?(`")])]))
274
275;; Returns (values n position)
276(define (parse-integer n s pos config)
277  (cond
278   [(= pos (chytes-length s))
279    (values n pos)]
280   [else
281    (define c (chytes-ref s pos))
282    (cond
283     [(and (>= c (chyte #\0)) (<= c (chyte #\9)))
284      (define n2 (+ (* n 10) (- c (chyte #\0))))
285      (parse-integer n2 s (add1 pos) config)]
286     [else
287      (values n pos)])]))
288
289;; Returns (values rx position)
290(define (parse-literal s pos config)
291  ;; Assumes at least one character;
292  ;; we don't get here for `(`, `[`, `.`, `^`, `$`, or `|`
293  (define c (chytes-ref s pos))
294  (chyte-case
295   c
296   [(#\* #\+ #\?)
297    (parse-error s pos config "`~a` follows nothing in pattern" (integer->char c))]
298   [(#\{)
299    (cond
300     [(parse-config-px? config)
301      (parse-error s pos config "`{` follows nothing in pattern")]
302     [else (values c (add1 pos))])]
303   [(#\\)
304    ;; escaped character
305    (parse-backslash-literal s (add1 pos) config)]
306   [(#\))
307    (parse-error s pos config "unmatched `)` in pattern")]
308   [(#\] #\})
309    (cond
310     [(parse-config-px? config)
311      (parse-error s pos config "unmatched `~a` in pattern" (integer->char c))]
312     [else (values c (add1 pos))])]
313   [else
314    (cond
315     [(parse-config-case-sensitive? config)
316      (values c (add1 pos))]
317     [else
318      ;; case-insensitive char match
319      (values (rx-range (range-add* empty-range c config) (chytes-limit s))
320              (add1 pos))])]))
321
322(define (parse-backslash-literal s pos2 config)
323  (cond
324   [(= pos2 (chytes-length s))
325    ;; An "expected character after `\`" error would make more sense,
326    ;; but the old expander produced a match against the nul character
327    (values (chyte #\u0) pos2)]
328   [else
329    (define c2 (chytes-ref s pos2))
330    (cond
331     [(and (parse-config-px? config)
332           (and (>= c2 (chyte #\0)) (<= c2 (chyte #\9))))
333      (set-box! (parse-config-references?-box config) #t)
334      (define-values (n pos3) (parse-integer 0 s pos2 config))
335      (values (rx:reference n (parse-config-case-sensitive? config)) pos3)]
336     [(and (parse-config-px? config)
337           (or (and (>= c2 (chyte #\a)) (<= c2 (chyte #\z)))
338               (and (>= c2 (chyte #\A)) (<= c2 (chyte #\Z)))))
339      (chyte-case
340       c2
341       [(#\p #\P)
342        (parse-unicode-categories c2 s (add1 pos2) config)]
343       [(#\b)
344        (values rx:word-boundary (add1 pos2))]
345       [(#\B)
346        (values rx:not-word-boundary (add1 pos2))]
347       [else
348        (define-values (success? range pos3) (parse-class s pos2 config))
349        (if success?
350            (values (rx-range range (chytes-limit s)) pos3)
351            (parse-error s pos2 config "illegal alphabetic escape"))])]
352     [else
353      (values c2 (add1 pos2))])]))
354
355;; Returns (values config position)
356(define (parse-mode s pos config)
357  (chyte-case/eos
358   s pos
359   [(eos)
360    (values config pos)]
361   [(#\i)
362    (parse-mode s (add1 pos) (config-case-sensitive config #f))]
363   [(#\s)
364    (parse-mode s (add1 pos) (config-multi-line config #f))]
365   [(#\m)
366    (parse-mode s (add1 pos) (config-multi-line config #t))]
367   [(#\-)
368    (define pos2 (add1 pos))
369    (chyte-case/eos
370     s pos2
371     [(eos)
372      (values config pos)]
373     [(#\i)
374      (parse-mode s (add1 pos2) (config-case-sensitive config #t))]
375     [(#\s)
376      (parse-mode s (add1 pos2) (config-multi-line config #t))]
377     [(#\m)
378      (parse-mode s (add1 pos2) (config-multi-line config #f))]
379     [else
380      (values config pos)])]
381   [else
382    (values config pos)]))
383
384
385(define (check-close-paren s pos config)
386  (unless (and (pos . < . (chytes-length s))
387               (= (chyte #\)) (chytes-ref s pos)))
388    (parse-error s pos config "expected a closing `)`"))
389  (add1 pos))
390
391(define (missing-closing-error s pos config)
392  (parse-error s pos config "missing closing parenthesis in pattern"))
393
394(define (bad-?-sequence-error s pos config)
395  (parse-error s pos config
396               "expected `:`, `=`, `!`, `<=`, `<!`, `i`, `-i`, `m`, `-m`, `s`, or `-s` after `(?`"))
397