1#lang racket/base 2(require "chyte.rkt" 3 "chyte-case.rkt" 4 "ast.rkt" 5 "config.rkt" 6 "error.rkt" 7 "../common/range.rkt" 8 "class.rkt" 9 "unicode.rkt" 10 "range.rkt" 11 "case.rkt") 12 13(provide parse) 14 15(define (parse p #:px? [px? #f]) 16 (define config (make-parse-config #:px? px?)) 17 (define-values (rx pos) (parse-regexp p 0 config)) 18 (chyte-case/eos 19 p pos 20 [(#\)) 21 (parse-error p pos config "unmatched `)` in pattern")] 22 [else 23 (values rx 24 (config-group-number config) 25 (unbox (parse-config-references?-box config)))])) 26 27;; Returns (values rx position) 28(define (parse-regexp s pos config #:parse-regexp [parse-regexp (lambda (s pos config) 29 (parse-regexp s pos config))]) 30 (define-values (rxs pos2) (parse-pces s pos config)) 31 (chyte-case/eos 32 s pos2 33 [(#\|) 34 (define-values (rx pos3) (parse-regexp s (add1 pos2) config)) 35 (values (rx-alts (rx-sequence rxs) rx (chytes-limit s)) pos3)] 36 [else 37 (values (rx-sequence rxs) pos2)])) 38 39(define (parse-regexp/maybe-empty s pos config) 40 (chyte-case/eos 41 s pos 42 [(#\)) 43 (values rx:empty pos)] 44 [else 45 (parse-regexp s pos config #:parse-regexp parse-regexp/maybe-empty)])) 46 47;; Returns (values list-of-rx position) 48(define (parse-pces s pos config) 49 (cond 50 [(= pos (chytes-length s)) 51 (values null pos)] 52 [else 53 (define-values (rx pos2) (parse-pce s pos config)) 54 (chyte-case/eos 55 s pos2 56 [(eos) 57 (values (list rx) pos2)] 58 [(#\| #\)) 59 (values (list rx) pos2)] 60 [else 61 (define-values (rxs pos3) (parse-pces s pos2 config)) 62 (values (cons rx rxs) pos3)])])) 63 64;; Returns (values rx position) 65(define (parse-pce s pos config) 66 (define-values (rx pos2) (parse-atom s pos config)) 67 (chyte-case/eos 68 s pos2 69 [(#\*) 70 (define-values (non-greedy? pos3) (parse-non-greedy s (add1 pos2) config)) 71 (values (rx:repeat rx 0 +inf.0 non-greedy?) pos3)] 72 [(#\+) 73 (define-values (non-greedy? pos3) (parse-non-greedy s (add1 pos2) config)) 74 (values (rx:repeat rx 1 +inf.0 non-greedy?) pos3)] 75 [(#\?) 76 (define-values (non-greedy? pos3) (parse-non-greedy s (add1 pos2) config)) 77 (values (rx:maybe rx non-greedy?) pos3)] 78 [(#\{) 79 (cond 80 [(parse-config-px? config) 81 (define-values (n1 pos3) (parse-integer 0 s (add1 pos2) config)) 82 (chyte-case/eos 83 s pos3 84 [(#\,) 85 (define-values (n2 pos4) (parse-integer 0 s (add1 pos3) config)) 86 (chyte-case/eos 87 s pos4 88 [(#\}) 89 (define n2* (if (= pos4 (add1 pos3)) +inf.0 n2)) 90 (define-values (non-greedy? pos5) (parse-non-greedy s (add1 pos4) config)) 91 (values (rx:repeat rx n1 n2* non-greedy?) pos5)] 92 [else 93 (parse-error s pos3 config "expected digit or `}` to end repetition specification started with `{`")])] 94 [(#\}) 95 (define-values (non-greedy? pos4) (parse-non-greedy s (add1 pos3) config)) 96 (values (rx:repeat rx n1 n1 non-greedy?) pos4)] 97 [else 98 (parse-error s pos3 config "expected digit, `,`, or `}' for repetition specification started with `{`")])] 99 [else 100 (values rx pos2)])] 101 [else 102 (values rx pos2)])) 103 104(define (parse-non-greedy s pos config) 105 (chyte-case/eos 106 s pos 107 [(#\?) 108 (values #t (check-not-nested s (add1 pos) config))] 109 [else 110 (values #f (check-not-nested s pos config))])) 111 112(define (check-not-nested s pos config) 113 (chyte-case/eos 114 s pos 115 [(#\? #\* #\+) 116 (parse-error s pos config 117 "nested `~a` in patten" 118 (integer->char (chytes-ref s pos)))] 119 [(#\{) 120 (when (parse-config-px? config) 121 (parse-error s pos config 122 "nested `{` in pattern"))]) 123 pos) 124 125;; Returns (values rx position) 126(define (parse-atom s pos config) 127 ;; Assumes at least one character 128 (chyte-case 129 (chytes-ref s pos) 130 [(#\|) 131 (values rx:empty pos)] 132 [(#\() 133 (parse-parenthesized-atom s (add1 pos) config)] 134 [(#\[) 135 (define-values (range pos2) (parse-range/not s (add1 pos) config)) 136 (values (rx-range range (chytes-limit s)) pos2)] 137 [(#\.) 138 (define rx (if (parse-config-multi-line? config) 139 (rx-range (range-invert (range-add empty-range (chyte #\newline)) 140 (chytes-limit s)) 141 (chytes-limit s)) 142 rx:any)) 143 (values rx (add1 pos))] 144 [(#\^) 145 (values (if (parse-config-multi-line? config) rx:line-start rx:start) 146 (add1 pos))] 147 [(#\$) 148 (values (if (parse-config-multi-line? config) rx:line-end rx:end) 149 (add1 pos))] 150 [else 151 ;; Literal or (for px mode) `\` character class 152 (parse-literal s pos config)])) 153 154;; Returns (values rx position) 155(define (parse-parenthesized-atom s pos config) 156 (chyte-case/eos 157 s pos 158 [(eos) 159 (missing-closing-error s pos config)] 160 [(#\?) 161 (define pos2 (add1 pos)) 162 (chyte-case/eos 163 s pos2 164 [(eos) 165 (bad-?-sequence-error s pos2 config)] 166 [(#\>) 167 (define pre-num-groups (config-group-number config)) 168 (define-values (rx pos3) (parse-regexp/maybe-empty s (add1 pos2) config)) 169 (define post-num-groups (config-group-number config)) 170 (values (rx-cut rx pre-num-groups (- post-num-groups pre-num-groups)) 171 (check-close-paren s pos3 config))] 172 [(#\() 173 (parse-conditional s (add1 pos2) config)] 174 [(#\i #\s #\m #\- #\:) 175 (define-values (config2 pos3) (parse-mode s pos2 config)) 176 (chyte-case/eos 177 s pos3 178 [(#\:) 179 (define-values (rx pos4) (parse-regexp/maybe-empty s (add1 pos3) config2)) 180 (values rx (check-close-paren s pos4 config2))] 181 [else 182 (parse-error s pos3 config2 (string-append 183 "expected `:` or another mode after `(?` and a mode sequence;\n" 184 " a mode is `i`, `-i`, `m`, `-m`, `s`, or `-s`"))])] 185 [else 186 (parse-look s pos2 config)])] 187 [else 188 (define group-number (config-group-number config)) 189 (define-values (rx pos2) (parse-regexp/maybe-empty s pos (config-group-number+1 config))) 190 (values (rx-group rx group-number) 191 (check-close-paren s pos2 config))])) 192 193;; Returns (values rx position) 194(define (parse-look s pos2 config) 195 ;; known that one character is available 196 (define pre-num-groups (config-group-number config)) 197 (define (span-num-groups) (- (config-group-number config) pre-num-groups)) 198 (chyte-case 199 (chytes-ref s pos2) 200 [(#\=) 201 (define-values (rx pos3) (parse-regexp/maybe-empty s (add1 pos2) config)) 202 (values (rx:lookahead rx #t pre-num-groups (span-num-groups)) 203 (check-close-paren s pos3 config))] 204 [(#\!) 205 (define-values (rx pos3) (parse-regexp/maybe-empty s (add1 pos2) config)) 206 (values (rx:lookahead rx #f pre-num-groups (span-num-groups)) 207 (check-close-paren s pos3 config))] 208 [(#\<) 209 (define pos2+ (add1 pos2)) 210 (chyte-case/eos 211 s pos2+ 212 [(eos) 213 (bad-?-sequence-error s pos2+ config)] 214 [(#\=) 215 (define-values (rx pos3) (parse-regexp/maybe-empty s (add1 pos2+) config)) 216 (values (rx:lookbehind rx #t 0 0 pre-num-groups (span-num-groups)) 217 (check-close-paren s pos3 config))] 218 [(#\!) 219 (define-values (rx pos3) (parse-regexp/maybe-empty s (add1 pos2+) config)) 220 (values (rx:lookbehind rx #f 0 0 pre-num-groups (span-num-groups)) 221 (check-close-paren s pos3 config))] 222 [else 223 (bad-?-sequence-error s pos2+ config)])] 224 [else 225 (bad-?-sequence-error s pos2 config)])) 226 227;; Returns (values rx position) 228(define (parse-conditional s pos config) 229 (define tst-pre-num-groups (config-group-number config)) 230 (define-values (tst pos2) (parse-test s pos config)) 231 (define tst-span-num-groups (- (config-group-number config) tst-pre-num-groups)) 232 (define-values (pces pos3) (parse-pces s pos2 config)) 233 (chyte-case/eos 234 s pos3 235 [(eos) 236 (missing-closing-error s pos3 config)] 237 [(#\|) 238 (define-values (pces2 pos4) (parse-pces s (add1 pos3) config)) 239 (chyte-case/eos 240 s pos4 241 [(eos) 242 (missing-closing-error s pos4 config)] 243 [(#\)) 244 (values (rx-conditional tst (rx-sequence pces) (rx-sequence pces2) 245 tst-pre-num-groups tst-span-num-groups) 246 (add1 pos4))] 247 [else 248 (parse-error s pos4 config "expected `)` to close `(?(...)...` after second branch")])] 249 [(#\)) 250 (values (rx-conditional tst (rx-sequence pces) rx:empty 251 tst-pre-num-groups tst-span-num-groups) 252 (add1 pos3))])) 253 254;; Returns (values rx position) 255(define (parse-test s pos config) 256 (chyte-case/eos 257 s pos 258 [(eos) 259 (missing-closing-error s pos config)] 260 [(#\?) 261 (parse-look s (add1 pos) config)] 262 [else 263 (define c (chytes-ref s pos)) 264 (cond 265 [(and (>= c (chyte #\0)) (<= c (chyte #\9))) 266 (set-box! (parse-config-references?-box config) #t) 267 (define-values (n pos3) (parse-integer 0 s pos config)) 268 (unless (and (pos3 . < . (chytes-length s)) 269 (= (chytes-ref s pos3) (chyte #\)))) 270 (parse-error s pos3 config "expected `)` after `(?(` followed by digits")) 271 (values (rx:reference n #f) (add1 pos3))] 272 [else 273 (parse-error s pos config "expected `(?=`, `(?!`, `(?<`, or digit after `(?(`")])])) 274 275;; Returns (values n position) 276(define (parse-integer n s pos config) 277 (cond 278 [(= pos (chytes-length s)) 279 (values n pos)] 280 [else 281 (define c (chytes-ref s pos)) 282 (cond 283 [(and (>= c (chyte #\0)) (<= c (chyte #\9))) 284 (define n2 (+ (* n 10) (- c (chyte #\0)))) 285 (parse-integer n2 s (add1 pos) config)] 286 [else 287 (values n pos)])])) 288 289;; Returns (values rx position) 290(define (parse-literal s pos config) 291 ;; Assumes at least one character; 292 ;; we don't get here for `(`, `[`, `.`, `^`, `$`, or `|` 293 (define c (chytes-ref s pos)) 294 (chyte-case 295 c 296 [(#\* #\+ #\?) 297 (parse-error s pos config "`~a` follows nothing in pattern" (integer->char c))] 298 [(#\{) 299 (cond 300 [(parse-config-px? config) 301 (parse-error s pos config "`{` follows nothing in pattern")] 302 [else (values c (add1 pos))])] 303 [(#\\) 304 ;; escaped character 305 (parse-backslash-literal s (add1 pos) config)] 306 [(#\)) 307 (parse-error s pos config "unmatched `)` in pattern")] 308 [(#\] #\}) 309 (cond 310 [(parse-config-px? config) 311 (parse-error s pos config "unmatched `~a` in pattern" (integer->char c))] 312 [else (values c (add1 pos))])] 313 [else 314 (cond 315 [(parse-config-case-sensitive? config) 316 (values c (add1 pos))] 317 [else 318 ;; case-insensitive char match 319 (values (rx-range (range-add* empty-range c config) (chytes-limit s)) 320 (add1 pos))])])) 321 322(define (parse-backslash-literal s pos2 config) 323 (cond 324 [(= pos2 (chytes-length s)) 325 ;; An "expected character after `\`" error would make more sense, 326 ;; but the old expander produced a match against the nul character 327 (values (chyte #\u0) pos2)] 328 [else 329 (define c2 (chytes-ref s pos2)) 330 (cond 331 [(and (parse-config-px? config) 332 (and (>= c2 (chyte #\0)) (<= c2 (chyte #\9)))) 333 (set-box! (parse-config-references?-box config) #t) 334 (define-values (n pos3) (parse-integer 0 s pos2 config)) 335 (values (rx:reference n (parse-config-case-sensitive? config)) pos3)] 336 [(and (parse-config-px? config) 337 (or (and (>= c2 (chyte #\a)) (<= c2 (chyte #\z))) 338 (and (>= c2 (chyte #\A)) (<= c2 (chyte #\Z))))) 339 (chyte-case 340 c2 341 [(#\p #\P) 342 (parse-unicode-categories c2 s (add1 pos2) config)] 343 [(#\b) 344 (values rx:word-boundary (add1 pos2))] 345 [(#\B) 346 (values rx:not-word-boundary (add1 pos2))] 347 [else 348 (define-values (success? range pos3) (parse-class s pos2 config)) 349 (if success? 350 (values (rx-range range (chytes-limit s)) pos3) 351 (parse-error s pos2 config "illegal alphabetic escape"))])] 352 [else 353 (values c2 (add1 pos2))])])) 354 355;; Returns (values config position) 356(define (parse-mode s pos config) 357 (chyte-case/eos 358 s pos 359 [(eos) 360 (values config pos)] 361 [(#\i) 362 (parse-mode s (add1 pos) (config-case-sensitive config #f))] 363 [(#\s) 364 (parse-mode s (add1 pos) (config-multi-line config #f))] 365 [(#\m) 366 (parse-mode s (add1 pos) (config-multi-line config #t))] 367 [(#\-) 368 (define pos2 (add1 pos)) 369 (chyte-case/eos 370 s pos2 371 [(eos) 372 (values config pos)] 373 [(#\i) 374 (parse-mode s (add1 pos2) (config-case-sensitive config #t))] 375 [(#\s) 376 (parse-mode s (add1 pos2) (config-multi-line config #t))] 377 [(#\m) 378 (parse-mode s (add1 pos2) (config-multi-line config #f))] 379 [else 380 (values config pos)])] 381 [else 382 (values config pos)])) 383 384 385(define (check-close-paren s pos config) 386 (unless (and (pos . < . (chytes-length s)) 387 (= (chyte #\)) (chytes-ref s pos))) 388 (parse-error s pos config "expected a closing `)`")) 389 (add1 pos)) 390 391(define (missing-closing-error s pos config) 392 (parse-error s pos config "missing closing parenthesis in pattern")) 393 394(define (bad-?-sequence-error s pos config) 395 (parse-error s pos config 396 "expected `:`, `=`, `!`, `<=`, `<!`, `i`, `-i`, `m`, `-m`, `s`, or `-s` after `(?`")) 397