1;;;; irregex.scm -- IrRegular Expressions 2(library (irregex) 3 (export 4 ;from irregex-chicken 5 6 irregex string->irregex sre->irregex 7 string->sre maybe-string->sre 8 irregex? irregex-match-data? 9 irregex-new-matches irregex-reset-matches! 10 irregex-search irregex-search/matches irregex-match 11 irregex-search/chunked irregex-match/chunked make-irregex-chunker 12 irregex-match-substring irregex-match-subchunk 13 irregex-match-start-chunk irregex-match-start-index 14 irregex-match-end-chunk irregex-match-end-index 15 irregex-match-num-submatches irregex-match-names 16 irregex-match-valid-index? 17 irregex-fold irregex-replace irregex-replace/all 18 irregex-dfa irregex-dfa/search irregex-dfa/extract 19 irregex-nfa irregex-flags irregex-lengths irregex-names 20 irregex-num-submatches irregex-extract irregex-split 21 22 ;; add 23 irregex-fold/chunked 24 ) 25 (import (rnrs) 26 (only (rnrs r5rs (6)) modulo remainder quotient) 27 (rnrs mutable-strings) 28 (rnrs mutable-pairs)) 29 30;; based on rev:2e55ccfbba 31 32;;;; irregex.scm -- IrRegular Expressions 33;; 34;; Copyright (c) 2005-2010 Alex Shinn. All rights reserved. 35;; BSD-style license: http://synthcode.com/license.txt 36 37;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 38;; At this moment there was a loud ring at the bell, and I could 39;; hear Mrs. Hudson, our landlady, raising her voice in a wail of 40;; expostulation and dismay. 41;; 42;; "By heaven, Holmes," I said, half rising, "I believe that 43;; they are really after us." 44;; 45;; "No, it's not quite so bad as that. It is the unofficial 46;; force, -- the Baker Street irregulars." 47 48;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 49;;;; Notes 50;; 51;; This code should not require any porting - it should work out of 52;; the box in any R[45]RS Scheme implementation. Slight modifications 53;; are needed for R6RS (a separate R6RS-compatible version is included 54;; in the distribution as irregex-r6rs.scm). 55;; 56;; The goal of portability makes this code a little clumsy and 57;; inefficient. Future versions will include both cleanup and 58;; performance tuning, but you can only go so far while staying 59;; portable. AND-LET*, SRFI-9 records and custom macros would've been 60;; nice. 61 62;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 63;;;; History 64;; 65;; 0.8.2: 2010/08/28 - (...)? submatch extraction fix and alternate 66;; named submatches from Peter Bex 67;; Added irregex-split, irregex-extract, 68;; irregex-match-names and irregex-match-valid-index? 69;; to Chicken and Guile module export lists and made 70;; the latter accept named submatches. The procedures 71;; irregex-match-{start,end}-{index,chunk} now also 72;; accept named submatches, with the index argument 73;; made optional. Improved argument type checks. 74;; Disallow negative submatch index. 75;; Improve performance of backtracking matcher. 76;; Refactor charset handling into a consistent API 77;; 0.8.1: 2010/03/09 - backtracking irregex-match fix and other small fixes 78;; 0.8.0: 2010/01/20 - optimizing DFA compilation, adding SRE escapes 79;; inside PCREs, adding utility SREs 80;; 0.7.5: 2009/08/31 - adding irregex-extract and irregex-split 81;; *-fold copies match data (use *-fold/fast for speed) 82;; irregex-opt now returns an SRE 83;; 0.7.4: 2009/05/14 - empty alternates (or) and empty csets always fail, 84;; bugfix in default finalizer for irregex-fold/chunked 85;; 0.7.3: 2009/04/14 - adding irregex-fold/chunked, minor doc fixes 86;; 0.7.2: 2009/02/11 - some bugfixes, much improved documentation 87;; 0.7.1: 2008/10/30 - several bugfixes (thanks to Derick Eddington) 88;; 0.7.0: 2008/10/20 - support abstract chunked strings 89;; 0.6.2: 2008/07/26 - minor bugfixes, allow global disabling of utf8 mode, 90;; friendlier error messages in parsing, \Q..\E support 91;; 0.6.1: 2008/07/21 - added utf8 mode, more utils, bugfixes 92;; 0.6: 2008/05/01 - most of PCRE supported 93;; 0.5: 2008/04/24 - fully portable R4RS, many PCRE features implemented 94;; 0.4: 2008/04/17 - rewriting NFA to use efficient closure compilation, 95;; normal strings only, but all of the spencer tests pass 96;; 0.3: 2008/03/10 - adding DFA converter (normal strings only) 97;; 0.2: 2005/09/27 - adding irregex-opt (like elisp's regexp-opt) utility 98;; 0.1: 2005/08/18 - simple NFA interpreter over abstract chunked strings 99 100;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 101;;;; Data Structures 102 103(define irregex-tag '*irregex-tag*) 104 105(define (make-irregex dfa dfa/search dfa/extract nfa flags 106 submatches lengths names) 107 (vector irregex-tag dfa dfa/search dfa/extract nfa flags 108 submatches lengths names)) 109 110(define (irregex? obj) 111 (and (vector? obj) 112 (= 9 (vector-length obj)) 113 (eq? irregex-tag (vector-ref obj 0)))) 114 115(define (irregex-dfa x) (vector-ref x 1)) 116(define (irregex-dfa/search x) (vector-ref x 2)) 117(define (irregex-dfa/extract x) (vector-ref x 3)) 118(define (irregex-nfa x) (vector-ref x 4)) 119(define (irregex-flags x) (vector-ref x 5)) 120(define (irregex-num-submatches x) (vector-ref x 6)) 121(define (irregex-lengths x) (vector-ref x 7)) 122(define (irregex-names x) (vector-ref x 8)) 123 124(define (irregex-new-matches irx) 125 (make-irregex-match (irregex-num-submatches irx) (irregex-names irx))) 126 127(define (irregex-reset-matches! m) 128 (do ((i (- (vector-length m) 1) (- i 1))) 129 ((<= i 3) m) 130 (vector-set! m i #f))) 131 132(define (irregex-copy-matches m) 133 (and (vector? m) 134 (let ((r (make-vector (vector-length m)))) 135 (do ((i (- (vector-length m) 1) (- i 1))) 136 ((< i 0) r) 137 (vector-set! r i (vector-ref m i)))))) 138 139(define irregex-match-tag '*irregex-match-tag*) 140 141(define (irregex-match-data? obj) 142 (and (vector? obj) 143 (>= (vector-length obj) 11) 144 (eq? irregex-match-tag (vector-ref obj 0)))) 145 146(define (make-irregex-match count names) 147 (let ((res (make-vector (+ (* 4 (+ 2 count)) 3) #f))) 148 (vector-set! res 0 irregex-match-tag) 149 (vector-set! res 2 names) 150 res)) 151 152(define (irregex-match-num-submatches m) 153 (- (quotient (- (vector-length m) 3) 4) 2)) 154 155(define (irregex-match-chunker m) 156 (vector-ref m 1)) 157(define (irregex-match-names m) 158 (vector-ref m 2)) 159(define (irregex-match-chunker-set! m str) 160 (vector-set! m 1 str)) 161 162(define (%irregex-match-start-chunk m n) (vector-ref m (+ 3 (* n 4)))) 163(define (%irregex-match-start-index m n) (vector-ref m (+ 4 (* n 4)))) 164(define (%irregex-match-end-chunk m n) (vector-ref m (+ 5 (* n 4)))) 165(define (%irregex-match-end-index m n) (vector-ref m (+ 6 (* n 4)))) 166 167(define (%irregex-match-fail m) 168 (vector-ref m (- (vector-length m) 1))) 169(define (%irregex-match-fail-set! m x) 170 (vector-set! m (- (vector-length m) 1) x)) 171 172;; public interface with error checking 173(define (irregex-match-start-chunk m . opt) 174 (let ((n (irregex-match-numeric-index "irregex-match-start-chunk" m opt))) 175 (and n (%irregex-match-start-chunk m n)))) 176(define (irregex-match-start-index m . opt) 177 (let ((n (irregex-match-numeric-index "irregex-match-start-index" m opt))) 178 (and n (%irregex-match-start-index m n)))) 179(define (irregex-match-end-chunk m . opt) 180 (let ((n (irregex-match-numeric-index "irregex-match-end-chunk" m opt))) 181 (and n (%irregex-match-end-chunk m n)))) 182(define (irregex-match-end-index m . opt) 183 (let ((n (irregex-match-numeric-index "irregex-match-end-index" m opt))) 184 (and n (%irregex-match-end-index m n)))) 185 186(define (irregex-match-start-chunk-set! m n start) 187 (vector-set! m (+ 3 (* n 4)) start)) 188(define (irregex-match-start-index-set! m n start) 189 (vector-set! m (+ 4 (* n 4)) start)) 190(define (irregex-match-end-chunk-set! m n end) 191 (vector-set! m (+ 5 (* n 4)) end)) 192(define (irregex-match-end-index-set! m n end) 193 (vector-set! m (+ 6 (* n 4)) end)) 194 195;; Helper procedure to convert any type of index from a rest args list 196;; to a numeric index. Named submatches are converted to their corresponding 197;; numeric index, and numeric submatches are checked for validity. 198;; An error is raised for invalid numeric or named indices, #f is returned 199;; for defined but nonmatching indices. 200(define (irregex-match-numeric-index location m opt) 201 (cond 202 ((not (irregex-match-data? m)) 203 (error (string-append location ": not match data") m)) 204 ((not (pair? opt)) 0) 205 ((pair? (cdr opt)) 206 (apply error (string-append location ": too many arguments") m opt)) 207 (else 208 (let ((n (car opt))) 209 (if (number? n) 210 (if (and (integer? n) (exact? n)) 211 (if (irregex-match-valid-numeric-index? m n) 212 (and (irregex-match-matched-numeric-index? m n) n) 213 (error (string-append location ": not a valid index") 214 m n)) 215 (error (string-append location ": not an exact integer") n)) 216 (let lp ((ls (irregex-match-names m)) 217 (unknown? #t)) 218 (cond 219 ((null? ls) 220 (and unknown? 221 (error (string-append location ": unknown match name") n))) 222 ((eq? n (caar ls)) 223 (if (%irregex-match-start-chunk m (cdar ls)) 224 (cdar ls) 225 (lp (cdr ls) #f))) 226 (else (lp (cdr ls) unknown?))))))))) 227 228(define (irregex-match-valid-numeric-index? m n) 229 (and (>= n 0) (< (+ 3 (* n 4)) (- (vector-length m) 4)))) 230 231(define (irregex-match-matched-numeric-index? m n) 232 (and (vector-ref m (+ 4 (* n 4))) 233 #t)) 234 235(define (irregex-match-valid-named-index? m n) 236 (and (assq n (irregex-match-names m)) 237 #t)) 238 239(define (irregex-match-valid-index? m n) 240 (if (not (irregex-match-data? m)) 241 (error "irregex-match-valid-index?: not match data" m)) 242 (if (integer? n) 243 (if (not (exact? n)) 244 (error "irregex-match-valid-index?: not an exact integer" n) 245 (irregex-match-valid-numeric-index? m n)) 246 (irregex-match-valid-named-index? m n))) 247 248(define (irregex-match-substring m . opt) 249 (let* ((n (irregex-match-numeric-index "irregex-match-substring" m opt)) 250 (cnk (irregex-match-chunker m))) 251 (and n 252 ((chunker-get-substring cnk) 253 (%irregex-match-start-chunk m n) 254 (%irregex-match-start-index m n) 255 (%irregex-match-end-chunk m n) 256 (%irregex-match-end-index m n))))) 257 258(define (irregex-match-subchunk m . opt) 259 (let* ((n (irregex-match-numeric-index "irregex-match-subchunk" m opt)) 260 (cnk (irregex-match-chunker m)) 261 (get-subchunk (chunker-get-subchunk cnk))) 262 (if (not get-subchunk) 263 (error "this chunk type does not support match subchunks") 264 (and n (get-subchunk 265 (%irregex-match-start-chunk m n) 266 (%irregex-match-start-index m n) 267 (%irregex-match-end-chunk m n) 268 (%irregex-match-end-index m n)))))) 269 270;; chunkers tell us how to navigate through chained chunks of strings 271 272(define (make-irregex-chunker get-next get-str . o) 273 (let* ((get-start (or (and (pair? o) (car o)) (lambda (cnk) 0))) 274 (o (if (pair? o) (cdr o) o)) 275 (get-end (or (and (pair? o) (car o)) 276 (lambda (cnk) (string-length (get-str cnk))))) 277 (o (if (pair? o) (cdr o) o)) 278 (get-substr 279 (or (and (pair? o) (car o)) 280 (lambda (cnk1 start cnk2 end) 281 (if (eq? cnk1 cnk2) 282 (substring (get-str cnk1) start end) 283 (let loop ((cnk (get-next cnk1)) 284 (res (list (substring (get-str cnk1) 285 start 286 (get-end cnk1))))) 287 (if (eq? cnk cnk2) 288 (string-cat-reverse 289 (cons (substring (get-str cnk) 290 (get-start cnk) 291 end) 292 res)) 293 (loop (get-next cnk) 294 (cons (substring (get-str cnk) 295 (get-start cnk) 296 (get-end cnk)) 297 res)))))))) 298 (o (if (pair? o) (cdr o) o)) 299 (get-subchunk (and (pair? o) (car o)))) 300 (if (not (and (procedure? get-next) (procedure? get-str) 301 (procedure? get-start) (procedure? get-substr))) 302 (error "make-irregex-chunker: expected a procdure")) 303 (vector get-next get-str get-start get-end get-substr get-subchunk))) 304 305(define (chunker-get-next cnk) (vector-ref cnk 0)) 306(define (chunker-get-str cnk) (vector-ref cnk 1)) 307(define (chunker-get-start cnk) (vector-ref cnk 2)) 308(define (chunker-get-end cnk) (vector-ref cnk 3)) 309(define (chunker-get-substring cnk) (vector-ref cnk 4)) 310(define (chunker-get-subchunk cnk) (vector-ref cnk 5)) 311 312(define (chunker-prev-chunk cnk start end) 313 (if (eq? start end) 314 #f 315 (let ((get-next (chunker-get-next cnk))) 316 (let lp ((start start)) 317 (let ((next (get-next start))) 318 (if (eq? next end) 319 start 320 (and next (lp next)))))))) 321 322(define (chunker-prev-char cnk start end) 323 (let ((prev (chunker-prev-chunk cnk start end))) 324 (and prev 325 (string-ref ((chunker-get-str cnk) prev) 326 (- ((chunker-get-end cnk) prev) 1))))) 327 328(define (chunker-next-char cnk src) 329 (let ((next ((chunker-get-next cnk) src))) 330 (and next 331 (string-ref ((chunker-get-str cnk) next) 332 ((chunker-get-start cnk) next))))) 333 334(define (chunk-before? cnk a b) 335 (and (not (eq? a b)) 336 (let ((next ((chunker-get-next cnk) a))) 337 (and next 338 (if (eq? next b) 339 #t 340 (chunk-before? cnk next b)))))) 341 342;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 343;;;; String Utilities 344 345;; Unicode version (skip surrogates) 346(define *all-chars* 347 `(/ ,(integer->char 0) ,(integer->char #xD7FF) 348 ,(integer->char #xE000) ,(integer->char #x10FFFF))) 349 350;; ASCII version, offset to not assume 0-255 351;; (define *all-chars* `(/ ,(integer->char (- (char->integer #\space) 32)) ,(integer->char (+ (char->integer #\space) 223)))) 352 353;; set to #f to ignore even an explicit request for utf8 handling 354(define *allow-utf8-mode?* #f) 355 356;; (define *named-char-properties* '()) 357 358(define (string-scan-char str c . o) 359 (let ((end (string-length str))) 360 (let scan ((i (if (pair? o) (car o) 0))) 361 (cond ((= i end) #f) 362 ((eqv? c (string-ref str i)) i) 363 (else (scan (+ i 1))))))) 364 365(define (string-scan-char-escape str c . o) 366 (let ((end (string-length str))) 367 (let scan ((i (if (pair? o) (car o) 0))) 368 (cond ((= i end) #f) 369 ((eqv? c (string-ref str i)) i) 370 ((eqv? c #\\) (scan (+ i 2))) 371 (else (scan (+ i 1))))))) 372 373(define (string-scan-pred str pred . o) 374 (let ((end (string-length str))) 375 (let scan ((i (if (pair? o) (car o) 0))) 376 (cond ((= i end) #f) 377 ((pred (string-ref str i)) i) 378 (else (scan (+ i 1))))))) 379 380(define (string-split-char str c) 381 (let ((end (string-length str))) 382 (let lp ((i 0) (from 0) (res '())) 383 (define (collect) (cons (substring str from i) res)) 384 (cond ((>= i end) (reverse (collect))) 385 ((eqv? c (string-ref str i)) (lp (+ i 1) (+ i 1) (collect))) 386 (else (lp (+ i 1) from res)))))) 387 388(define (char-alphanumeric? c) 389 (or (char-alphabetic? c) (char-numeric? c))) 390 391(define (%substring=? a b start1 start2 len) 392 (let lp ((i 0)) 393 (cond ((>= i len) 394 #t) 395 ((char=? (string-ref a (+ start1 i)) (string-ref b (+ start2 i))) 396 (lp (+ i 1))) 397 (else 398 #f)))) 399 400;; SRFI-13 extracts 401 402(define (%%string-copy! to tstart from fstart fend) 403 (do ((i fstart (+ i 1)) 404 (j tstart (+ j 1))) 405 ((>= i fend)) 406 (string-set! to j (string-ref from i)))) 407 408(define (string-cat-reverse string-list) 409 (string-cat-reverse/aux 410 (fold (lambda (s a) (+ (string-length s) a)) 0 string-list) 411 string-list)) 412 413(define (string-cat-reverse/aux len string-list) 414 (let ((res (make-string len))) 415 (let lp ((i len) (ls string-list)) 416 (if (pair? ls) 417 (let* ((s (car ls)) 418 (slen (string-length s)) 419 (i (- i slen))) 420 (%%string-copy! res i s 0 slen) 421 (lp i (cdr ls))))) 422 res)) 423 424;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 425;;;; List Utilities 426 427;; like the one-arg IOTA case 428(define (zero-to n) 429 (if (<= n 0) 430 '() 431 (let lp ((i (- n 1)) (res '())) 432 (if (zero? i) (cons 0 res) (lp (- i 1) (cons i res)))))) 433 434;; SRFI-1 extracts (simplified 1-ary versions) 435 436(define (find-tail pred ls) 437 (let lp ((ls ls)) 438 (cond ((null? ls) #f) 439 ((pred (car ls)) ls) 440 (else (lp (cdr ls)))))) 441 442(define (last ls) 443 (if (not (pair? ls)) 444 (error "can't take last of empty list" ls) 445 (let lp ((ls ls)) 446 (if (pair? (cdr ls)) 447 (lp (cdr ls)) 448 (car ls))))) 449 450(define (any pred ls) 451 (and (pair? ls) 452 (let lp ((head (car ls)) (tail (cdr ls))) 453 (if (null? tail) 454 (pred head) 455 (or (pred head) (lp (car tail) (cdr tail))))))) 456 457(define (every pred ls) 458 (or (null? ls) 459 (let lp ((head (car ls)) (tail (cdr ls))) 460 (if (null? tail) 461 (pred head) 462 (and (pred head) (lp (car tail) (cdr tail))))))) 463 464(define (fold kons knil ls) 465 (let lp ((ls ls) (res knil)) 466 (if (null? ls) 467 res 468 (lp (cdr ls) (kons (car ls) res))))) 469 470;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 471;;;; Flags 472 473(define (bit-shr n i) 474 (quotient n (expt 2 i))) 475 476(define (bit-shl n i) 477 (* n (expt 2 i))) 478 479(define (bit-not n) (- #xFFFF n)) 480 481(define (bit-ior a b) 482 (cond 483 ((zero? a) b) 484 ((zero? b) a) 485 (else 486 (+ (if (or (odd? a) (odd? b)) 1 0) 487 (* 2 (bit-ior (quotient a 2) (quotient b 2))))))) 488 489(define (bit-and a b) 490 (cond 491 ((zero? a) 0) 492 ((zero? b) 0) 493 (else 494 (+ (if (and (odd? a) (odd? b)) 1 0) 495 (* 2 (bit-and (quotient a 2) (quotient b 2))))))) 496 497(define (integer-log n) 498 (define (b8 n r) 499 (if (>= n (bit-shl 1 8)) (b4 (bit-shr n 8) (+ r 8)) (b4 n r))) 500 (define (b4 n r) 501 (if (>= n (bit-shl 1 4)) (b2 (bit-shr n 4) (+ r 4)) (b2 n r))) 502 (define (b2 n r) 503 (if (>= n (bit-shl 1 2)) (b1 (bit-shr n 2) (+ r 2)) (b1 n r))) 504 (define (b1 n r) (if (>= n (bit-shl 1 1)) (+ r 1) r)) 505 (if (>= n (bit-shl 1 16)) (b8 (bit-shr n 16) 16) (b8 n 0))) 506 507(define (flag-set? flags i) 508 (= i (bit-and flags i))) 509(define (flag-join a b) 510 (if b (bit-ior a b) a)) 511(define (flag-clear a b) 512 (bit-and a (bit-not b))) 513 514(define ~none 0) 515(define ~searcher? 1) 516(define ~consumer? 2) 517 518;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 519;;;; Parsing Embedded SREs in PCRE Strings 520 521;; (define (with-read-from-string str i proc) 522;; (define (port-size in) 523;; (let lp ((i 0)) (if (eof-object? (read-char in)) i (lp (+ i 1))))) 524;; (let* ((len (string-length str)) 525;; (tail-len (- len i)) 526;; (in (open-input-string (substring str i len))) 527;; (sre (read in)) 528;; (unused-len (port-size in))) 529;; (close-input-port in) 530;; (proc sre (- tail-len unused-len)))) 531 532(define close-token (list 'close)) 533(define dot-token (string->symbol ".")) 534 535(define (with-read-from-string str i proc) 536 (define end (string-length str)) 537 (define (read i k) 538 (cond 539 ((>= i end) (error "unterminated embedded SRE" str)) 540 (else 541 (case (string-ref str i) 542 ((#\() 543 (let lp ((i (+ i 1)) (ls '())) 544 (read 545 i 546 (lambda (x j) 547 (cond 548 ((eq? x close-token) 549 (k (reverse ls) j)) 550 ((eq? x dot-token) 551 (if (null? ls) 552 (error "bad dotted form" str) 553 (read j (lambda (y j2) 554 (read j2 (lambda (z j3) 555 (if (not (eq? z close-token)) 556 (error "bad dotted form" str) 557 (k (append (reverse (cdr ls)) 558 (cons (car ls) y)) 559 j3)))))))) 560 (else 561 (lp j (cons x ls)))))))) 562 ((#\)) 563 (k close-token (+ i 1))) 564 ((#\;) 565 (let skip ((i (+ i 1))) 566 (if (or (>= i end) (eqv? #\newline (string-ref str i))) 567 (read (+ i 1) k) 568 (skip (+ i 1))))) 569 ((#\' #\`) 570 (read (+ i 1) 571 (lambda (sexp j) 572 (let ((q (if (eqv? #\' (string-ref str i)) 'quote 'quasiquote))) 573 (k (list q sexp) j))))) 574 ((#\,) 575 (let* ((at? (and (< (+ i 1) end) (eqv? #\@ (string-ref str (+ i 1))))) 576 (u (if at? 'uquote-splicing 'unquote)) 577 (j (if at? (+ i 2) (+ i 1)))) 578 (read j (lambda (sexp j) (k (list u sexp) j))))) 579 ((#\") 580 (let scan ((from (+ i 1)) (i (+ i 1)) (res '())) 581 (define (collect) 582 (if (= from i) res (cons (substring str from i) res))) 583 (if (>= i end) 584 (error "unterminated string in embeded SRE" str) 585 (case (string-ref str i) 586 ((#\") (k (string-cat-reverse (collect)) (+ i 1))) 587 ((#\\) (scan (+ i 1) (+ i 2) (collect))) 588 (else (scan from (+ i 1) res)))))) 589 ((#\#) 590 (case (string-ref str (+ i 1)) 591 ((#\;) 592 (read (+ i 2) (lambda (sexp j) (read j k)))) 593 ((#\\) 594 (read (+ i 2) 595 (lambda (sexp j) 596 (k (case sexp 597 ((space) #\space) 598 ((newline) #\newline) 599 (else (let ((s (if (number? sexp) 600 (number->string sexp) 601 (symbol->string sexp)))) 602 (string-ref s 0)))) 603 j)))) 604 ((#\t #\f) 605 (k (eqv? #\t (string-ref str (+ i 1))) (+ i 2))) 606 (else 607 (error "bad # syntax in simplified SRE" i)))) 608 (else 609 (cond 610 ((char-whitespace? (string-ref str i)) 611 (read (+ i 1) k)) 612 (else ;; symbol/number 613 (let scan ((j (+ i 1))) 614 (cond 615 ((or (>= j end) 616 (let ((c (string-ref str j))) 617 (or (char-whitespace? c) 618 (memv c '(#\; #\( #\) #\" #\# #\\))))) 619 (let ((str2 (substring str i j))) 620 (k (or (string->number str2) (string->symbol str2)) j))) 621 (else (scan (+ j 1)))))))))))) 622 (read i (lambda (res j) 623 (if (eq? res 'close-token) 624 (error "unexpected ')' in SRE" str j) 625 (proc res j))))) 626 627;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 628;;;; Parsing PCRE Strings 629 630(define ~save? 1) 631(define ~case-insensitive? 2) 632(define ~multi-line? 4) 633(define ~single-line? 8) 634(define ~ignore-space? 16) 635(define ~utf8? 32) 636 637(define (symbol-list->flags ls) 638 (let lp ((ls ls) (res ~none)) 639 (if (not (pair? ls)) 640 res 641 (lp (cdr ls) 642 (flag-join 643 res 644 (case (car ls) 645 ((i ci case-insensitive) ~case-insensitive?) 646 ((m multi-line) ~multi-line?) 647 ((s single-line) ~single-line?) 648 ((x ignore-space) ~ignore-space?) 649 ((u utf8) (if *allow-utf8-mode?* ~utf8? ~none)) 650 (else #f))))))) 651 652(define (maybe-string->sre obj) 653 (if (string? obj) (string->sre obj) obj)) 654 655(define (string->sre str . o) 656 (if (not (string? str)) (error "string->sre: expected a string" str)) 657 (let ((end (string-length str)) 658 (flags (symbol-list->flags o))) 659 660 (let lp ((i 0) (from 0) (flags flags) (res '()) (st '())) 661 662 ;; handle case sensitivity at the literal char/string level 663 (define (cased-char ch) 664 (if (and (flag-set? flags ~case-insensitive?) 665 (char-alphabetic? ch)) 666 `(or ,ch ,(char-altcase ch)) 667 ch)) 668 (define (cased-string str) 669 (if (flag-set? flags ~case-insensitive?) 670 (sre-sequence (map cased-char (string->list str))) 671 str)) 672 ;; accumulate the substring from..i as literal text 673 (define (collect) 674 (if (= i from) res (cons (cased-string (substring str from i)) res))) 675 ;; like collect but breaks off the last single character when 676 ;; collecting literal data, as the argument to ?/*/+ etc. 677 (define (collect/single) 678 (let* ((utf8? (flag-set? flags ~utf8?)) 679 (j (if (and utf8? (> i 1)) 680 (utf8-backup-to-initial-char str (- i 1)) 681 (- i 1)))) 682 (cond 683 ((< j from) 684 res) 685 (else 686 (let ((c (cased-char (if utf8? 687 (utf8-string-ref str j (- i j)) 688 (string-ref str j))))) 689 (cond 690 ((= j from) 691 (cons c res)) 692 (else 693 (cons c 694 (cons (cased-string (substring str from j)) 695 res))))))))) 696 ;; collects for use as a result, reversing and grouping OR 697 ;; terms, and some ugly tweaking of `function-like' groups and 698 ;; conditionals 699 (define (collect/terms) 700 (let* ((ls (collect)) 701 (func 702 (and (pair? ls) 703 (memq (last ls) 704 '(atomic if look-ahead neg-look-ahead 705 look-behind neg-look-behind 706 => submatch-named 707 w/utf8 w/noutf8)))) 708 (prefix (if (and func (memq (car func) '(=> submatch-named))) 709 (list 'submatch-named (cadr (reverse ls))) 710 (and func (list (car func))))) 711 (ls (if func 712 (if (memq (car func) '(=> submatch-named)) 713 (reverse (cddr (reverse ls))) 714 (reverse (cdr (reverse ls)))) 715 ls))) 716 (let lp ((ls ls) (term '()) (res '())) 717 (define (shift) 718 (cons (sre-sequence term) res)) 719 (cond 720 ((null? ls) 721 (let* ((res (sre-alternate (shift))) 722 (res (if (flag-set? flags ~save?) 723 (list 'submatch res) 724 res))) 725 (if prefix 726 (if (eq? 'if (car prefix)) 727 (cond 728 ((not (pair? res)) 729 'epsilon) 730 ((memq (car res) 731 '(look-ahead neg-look-ahead 732 look-behind neg-look-behind)) 733 res) 734 ((eq? 'seq (car res)) 735 `(if ,(cadr res) 736 ,(if (pair? (cdr res)) 737 (sre-sequence (cddr res)) 738 'epsilon))) 739 (else 740 `(if ,(cadadr res) 741 ,(if (pair? (cdr res)) 742 (sre-sequence (cddadr res)) 743 'epsilon) 744 ,(sre-alternate 745 (if (pair? (cdr res)) (cddr res) '()))))) 746 `(,@prefix ,res)) 747 res))) 748 ((eq? 'or (car ls)) (lp (cdr ls) '() (shift))) 749 (else (lp (cdr ls) (cons (car ls) term) res)))))) 750 (define (save) 751 (cons (cons flags (collect)) st)) 752 753 ;; main parsing 754 (if (>= i end) 755 (if (pair? st) 756 (error "unterminated parenthesis in regexp" str) 757 (collect/terms)) 758 (let ((c (string-ref str i))) 759 (case c 760 ((#\.) 761 (lp (+ i 1) (+ i 1) flags 762 (cons (if (flag-set? flags ~single-line?) 'any 'nonl) 763 (collect)) 764 st)) 765 ((#\?) 766 (let ((res (collect/single))) 767 (if (null? res) 768 (error "? can't follow empty pattern" str res) 769 (let ((x (car res))) 770 (lp (+ i 1) 771 (+ i 1) 772 flags 773 (cons 774 (if (pair? x) 775 (case (car x) 776 ((*) `(*? ,@(cdr x))) 777 ((+) `(**? 1 #f ,@(cdr x))) 778 ((?) `(?? ,@(cdr x))) 779 ((**) `(**? ,@(cdr x))) 780 ((=) `(**? ,(cadr x) ,@(cdr x))) 781 ((>=) `(**? ,(cadr x) #f ,@(cddr x))) 782 (else `(? ,x))) 783 `(? ,x)) 784 (cdr res)) 785 st))))) 786 ((#\+ #\*) 787 (let* ((res (collect/single)) 788 (x (if (pair? res) (car res) 'epsilon)) 789 (op (string->symbol (string c)))) 790 (cond 791 ((sre-repeater? x) 792 (error "duplicate repetition (e.g. **) in pattern" str res)) 793 ((sre-empty? x) 794 (error "can't repeat empty pattern (e.g. ()*)" str res)) 795 (else 796 (lp (+ i 1) (+ i 1) flags 797 (cons (list op x) (cdr res)) 798 st))))) 799 ((#\() 800 (cond 801 ((>= (+ i 1) end) 802 (error "unterminated parenthesis in regexp" str)) 803 ((not (memv (string-ref str (+ i 1)) '(#\? #\*))) ; normal case 804 (lp (+ i 1) (+ i 1) (flag-join flags ~save?) '() (save))) 805 ((>= (+ i 2) end) 806 (error "unterminated parenthesis in regexp" str)) 807 ((eqv? (string-ref str (+ i 1)) #\*) 808 (if (eqv? #\' (string-ref str (+ i 2))) 809 (with-read-from-string str (+ i 3) 810 (lambda (sre j) 811 (if (or (>= j end) (not (eqv? #\) (string-ref str j)))) 812 (error "unterminated (*'...) SRE escape" str) 813 (lp (+ j 1) (+ j 1) flags (cons sre (collect)) st)))) 814 (error "bad regexp syntax: (*FOO) not supported" str))) 815 (else ;; (?...) case 816 (case (string-ref str (+ i 2)) 817 ((#\#) 818 (let ((j (string-scan-char str #\) (+ i 3)))) 819 (lp (+ j i) (+ j 1) flags (collect) st))) 820 ((#\:) 821 (lp (+ i 3) (+ i 3) (flag-clear flags ~save?) '() (save))) 822 ((#\=) 823 (lp (+ i 3) (+ i 3) (flag-clear flags ~save?) 824 '(look-ahead) (save))) 825 ((#\!) 826 (lp (+ i 3) (+ i 3) (flag-clear flags ~save?) 827 '(neg-look-ahead) (save))) 828 ((#\<) 829 (cond 830 ((>= (+ i 3) end) 831 (error "unterminated parenthesis in regexp" str)) 832 (else 833 (case (string-ref str (+ i 3)) 834 ((#\=) 835 (lp (+ i 4) (+ i 4) (flag-clear flags ~save?) 836 '(look-behind) (save))) 837 ((#\!) 838 (lp (+ i 4) (+ i 4) (flag-clear flags ~save?) 839 '(neg-look-behind) (save))) 840 (else 841 (let ((j (and (char-alphabetic? 842 (string-ref str (+ i 3))) 843 (string-scan-char str #\> (+ i 4))))) 844 (if j 845 (lp (+ j 1) (+ j 1) (flag-clear flags ~save?) 846 `(,(string->symbol (substring str (+ i 3) j)) 847 submatch-named) 848 (save)) 849 (error "invalid (?< sequence" str)))))))) 850 ((#\>) 851 (lp (+ i 3) (+ i 3) (flag-clear flags ~save?) 852 '(atomic) (save))) 853 ;;((#\' #\P) ; named subpatterns 854 ;; ) 855 ;;((#\R) ; recursion 856 ;; ) 857 ((#\() 858 (cond 859 ((>= (+ i 3) end) 860 (error "unterminated parenthesis in regexp" str)) 861 ((char-numeric? (string-ref str (+ i 3))) 862 (let* ((j (string-scan-char str #\) (+ i 3))) 863 (n (string->number (substring str (+ i 3) j)))) 864 (if (not n) 865 (error "invalid conditional reference" str) 866 (lp (+ j 1) (+ j 1) (flag-clear flags ~save?) 867 `(,n if) (save))))) 868 ((char-alphabetic? (string-ref str (+ i 3))) 869 (let* ((j (string-scan-char str #\) (+ i 3))) 870 (s (string->symbol (substring str (+ i 3) j)))) 871 (lp (+ j 1) (+ j 1) (flag-clear flags ~save?) 872 `(,s if) (save)))) 873 (else 874 (lp (+ i 2) (+ i 2) (flag-clear flags ~save?) 875 '(if) (save))))) 876 ((#\{) 877 (error "unsupported Perl-style cluster" str)) 878 (else 879 (let ((old-flags flags)) 880 (let lp2 ((j (+ i 2)) (flags flags) (invert? #f)) 881 (define (join x) 882 ((if invert? flag-clear flag-join) flags x)) 883 (define (new-res res) 884 (let ((before (flag-set? old-flags ~utf8?)) 885 (after (flag-set? flags ~utf8?))) 886 (if (eq? before after) 887 res 888 (cons (if after 'w/utf8 'w/noutf8) res)))) 889 (cond 890 ((>= j end) 891 (error "incomplete cluster" str i)) 892 (else 893 (case (string-ref str j) 894 ((#\i) 895 (lp2 (+ j 1) (join ~case-insensitive?) invert?)) 896 ((#\m) 897 (lp2 (+ j 1) (join ~multi-line?) invert?)) 898 ((#\x) 899 (lp2 (+ j 1) (join ~ignore-space?) invert?)) 900 ((#\u) 901 (if *allow-utf8-mode?* 902 (lp2 (+ j 1) (join ~utf8?) invert?) 903 (lp2 (+ j 1) flags invert?))) 904 ((#\-) 905 (lp2 (+ j 1) flags (not invert?))) 906 ((#\)) 907 (lp (+ j 1) (+ j 1) flags (new-res (collect)) 908 st)) 909 ((#\:) 910 (lp (+ j 1) (+ j 1) flags (new-res '()) 911 (cons (cons old-flags (collect)) st))) 912 (else 913 (error "unknown regex cluster modifier" str) 914 ))))))))))) 915 ((#\)) 916 (if (null? st) 917 (error "too many )'s in regexp" str) 918 (lp (+ i 1) 919 (+ i 1) 920 (caar st) 921 (cons (collect/terms) (cdar st)) 922 (cdr st)))) 923 ((#\[) 924 (apply 925 (lambda (sre j) 926 (lp (+ j 1) (+ j 1) flags (cons sre (collect)) st)) 927 (string-parse-cset str (+ i 1) flags))) 928 ((#\{) 929 (cond 930 ((or (>= (+ i 1) end) 931 (not (or (char-numeric? (string-ref str (+ i 1))) 932 (eqv? #\, (string-ref str (+ i 1)))))) 933 (lp (+ i 1) from flags res st)) 934 (else 935 (let ((res (collect/single))) 936 (cond 937 ((null? res) 938 (error "{ can't follow empty pattern")) 939 (else 940 (let* ((x (car res)) 941 (tail (cdr res)) 942 (j (string-scan-char str #\} (+ i 1))) 943 (s2 (string-split-char (substring str (+ i 1) j) 944 #\,)) 945 (n (string->number (car s2))) 946 (m (and (pair? (cdr s2)) 947 (string->number (cadr s2))))) 948 (cond 949 ((or (not n) 950 (and (pair? (cdr s2)) 951 (not (equal? "" (cadr s2))) 952 (not m))) 953 (error "invalid {n} repetition syntax" s2)) 954 ((null? (cdr s2)) 955 (lp (+ j 1) (+ j 1) flags `((= ,n ,x) ,@tail) st)) 956 (m 957 (lp (+ j 1) (+ j 1) flags `((** ,n ,m ,x) ,@tail) st)) 958 (else 959 (lp (+ j 1) (+ j 1) flags `((>= ,n ,x) ,@tail) st) 960 ))))))))) 961 ((#\\) 962 (cond 963 ((>= (+ i 1) end) 964 (error "incomplete escape sequence" str)) 965 (else 966 (let ((c (string-ref str (+ i 1)))) 967 (case c 968 ((#\d) 969 (lp (+ i 2) (+ i 2) flags `(numeric ,@(collect)) st)) 970 ((#\D) 971 (lp (+ i 2) (+ i 2) flags `((~ numeric) ,@(collect)) st)) 972 ((#\s) 973 (lp (+ i 2) (+ i 2) flags `(space ,@(collect)) st)) 974 ((#\S) 975 (lp (+ i 2) (+ i 2) flags `((~ space) ,@(collect)) st)) 976 ((#\w) 977 (lp (+ i 2) (+ i 2) flags 978 `((or alphanumeric ("_")) ,@(collect)) st)) 979 ((#\W) 980 (lp (+ i 2) (+ i 2) flags 981 `((~ (or alphanumeric ("_"))) ,@(collect)) st)) 982 ((#\b) 983 (lp (+ i 2) (+ i 2) flags 984 `((or bow eow) ,@(collect)) st)) 985 ((#\B) 986 (lp (+ i 2) (+ i 2) flags `(nwb ,@(collect)) st)) 987 ((#\A) 988 (lp (+ i 2) (+ i 2) flags `(bos ,@(collect)) st)) 989 ((#\Z) 990 (lp (+ i 2) (+ i 2) flags 991 `((? #\newline) eos ,@(collect)) st)) 992 ((#\z) 993 (lp (+ i 2) (+ i 2) flags `(eos ,@(collect)) st)) 994 ((#\R) 995 (lp (+ i 2) (+ i 2) flags `(newline ,@(collect)) st)) 996 ((#\K) 997 (lp (+ i 2) (+ i 2) flags `(reset ,@(collect)) st)) 998 ;; these two are from Emacs and TRE, but not in PCRE 999 ((#\<) 1000 (lp (+ i 2) (+ i 2) flags `(bow ,@(collect)) st)) 1001 ((#\>) 1002 (lp (+ i 2) (+ i 2) flags `(eow ,@(collect)) st)) 1003 ((#\x) 1004 (apply 1005 (lambda (ch j) 1006 (lp (+ j 1) (+ j 1) flags `(,ch ,@(collect)) st)) 1007 (string-parse-hex-escape str (+ i 2) end))) 1008 ((#\k) 1009 (let ((c (string-ref str (+ i 2)))) 1010 (if (not (memv c '(#\< #\{ #\'))) 1011 (error "bad \\k usage, expected \\k<...>" str) 1012 (let* ((terminal (char-mirror c)) 1013 (j (string-scan-char str terminal (+ i 2))) 1014 (s (and j (substring str (+ i 3) j))) 1015 (backref 1016 (if (flag-set? flags ~case-insensitive?) 1017 'backref-ci 1018 'backref))) 1019 (if (not j) 1020 (error "unterminated named backref" str) 1021 (lp (+ j 1) (+ j 1) flags 1022 `((,backref ,(string->symbol s)) 1023 ,@(collect)) 1024 st)))))) 1025 ((#\Q) ;; \Q..\E escapes 1026 (let ((res (collect))) 1027 (let lp2 ((j (+ i 2))) 1028 (cond 1029 ((>= j end) 1030 (lp j (+ i 2) flags res st)) 1031 ((eqv? #\\ (string-ref str j)) 1032 (cond 1033 ((>= (+ j 1) end) 1034 (lp (+ j 1) (+ i 2) flags res st)) 1035 ((eqv? #\E (string-ref str (+ j 1))) 1036 (lp (+ j 2) (+ j 2) flags 1037 (cons (substring str (+ i 2) j) res) st)) 1038 (else 1039 (lp2 (+ j 2))))) 1040 (else 1041 (lp2 (+ j 1))))))) 1042 ((#\') 1043 (with-read-from-string str (+ i 2) 1044 (lambda (sre j) 1045 (lp j j flags (cons sre (collect)) st)))) 1046 ;;((#\p) ; XXXX unicode properties 1047 ;; ) 1048 ;;((#\P) 1049 ;; ) 1050 (else 1051 (cond 1052 ((char-numeric? c) 1053 (let* ((j (or (string-scan-pred 1054 str 1055 (lambda (c) (not (char-numeric? c))) 1056 (+ i 2)) 1057 end)) 1058 (backref 1059 (if (flag-set? flags ~case-insensitive?) 1060 'backref-ci 1061 'backref)) 1062 (res `((,backref ,(string->number 1063 (substring str (+ i 1) j))) 1064 ,@(collect)))) 1065 (lp j j flags res st))) 1066 ((char-alphabetic? c) 1067 (let ((cell (assv c posix-escape-sequences))) 1068 (if cell 1069 (lp (+ i 2) (+ i 2) flags 1070 (cons (cdr cell) (collect)) st) 1071 (error "unknown escape sequence" str c)))) 1072 (else 1073 (lp (+ i 2) (+ i 1) flags (collect) st))))))))) 1074 ((#\|) 1075 (lp (+ i 1) (+ i 1) flags (cons 'or (collect)) st)) 1076 ((#\^) 1077 (let ((sym (if (flag-set? flags ~multi-line?) 'bol 'bos))) 1078 (lp (+ i 1) (+ i 1) flags (cons sym (collect)) st))) 1079 ((#\$) 1080 (let ((sym (if (flag-set? flags ~multi-line?) 'eol 'eos))) 1081 (lp (+ i 1) (+ i 1) flags (cons sym (collect)) st))) 1082 ((#\space) 1083 (if (flag-set? flags ~ignore-space?) 1084 (lp (+ i 1) (+ i 1) flags (collect) st) 1085 (lp (+ i 1) from flags res st))) 1086 ((#\#) 1087 (if (flag-set? flags ~ignore-space?) 1088 (let ((j (or (string-scan-char str #\newline (+ i 1)) 1089 (- end 1)))) 1090 (lp (+ j 1) (+ j 1) flags (collect) st)) 1091 (lp (+ i 1) from flags res st))) 1092 (else 1093 (lp (+ i 1) from flags res st)))))))) 1094 1095(define posix-escape-sequences 1096 `((#\n . #\newline) 1097 (#\r . ,(integer->char (+ (char->integer #\newline) 3))) 1098 (#\t . ,(integer->char (- (char->integer #\newline) 1))) 1099 (#\a . ,(integer->char (- (char->integer #\newline) 3))) 1100 (#\e . ,(integer->char (+ (char->integer #\newline) #x11))) 1101 (#\f . ,(integer->char (+ (char->integer #\newline) 2))) 1102 )) 1103 1104(define (char-altcase c) 1105 (if (char-upper-case? c) (char-downcase c) (char-upcase c))) 1106 1107(define (char-mirror c) 1108 (case c ((#\<) #\>) ((#\{) #\}) ((#\() #\)) ((#\[) #\]) (else c))) 1109 1110(define (string-parse-hex-escape str i end) 1111 (cond 1112 ((>= i end) 1113 (error "incomplete hex escape" str i)) 1114 ((eqv? #\{ (string-ref str i)) 1115 (let ((j (string-scan-char-escape str #\} (+ i 1)))) 1116 (if (not j) 1117 (error "incomplete hex brace escape" str i) 1118 (let* ((s (substring str (+ i 1) j)) 1119 (n (string->number s 16))) 1120 (if n 1121 (list (integer->char n) j) 1122 (error "bad hex brace escape" s)))))) 1123 ((>= (+ i 1) end) 1124 (error "incomplete hex escape" str i)) 1125 (else 1126 (let* ((s (substring str i (+ i 2))) 1127 (n (string->number s 16))) 1128 (if n 1129 (list (integer->char n) (+ i 2)) 1130 (error "bad hex escape" s)))))) 1131 1132(define (string-parse-cset str start flags) 1133 (let* ((end (string-length str)) 1134 (invert? (and (< start end) (eqv? #\^ (string-ref str start)))) 1135 (utf8? (flag-set? flags ~utf8?))) 1136 (define (go i prev-char cset) 1137 (if (>= i end) 1138 (error "incomplete char set" str i end) 1139 (let ((c (string-ref str i))) 1140 (case c 1141 ((#\]) 1142 (if (cset-empty? cset) 1143 (go (+ i 1) #\] (cset-adjoin cset #\])) 1144 (let ((ci? (flag-set? flags ~case-insensitive?))) 1145 (list 1146 (let ((res (if ci? (cset-case-insensitive cset) cset))) 1147 (cset->sre (if invert? (cset-complement res) res))) 1148 i)))) 1149 ((#\-) 1150 (cond 1151 ((or (= i start) 1152 (and (= i (+ start 1)) (eqv? #\^ (string-ref str start))) 1153 (eqv? #\] (string-ref str (+ i 1)))) 1154 (go (+ i 1) c (cset-adjoin cset c))) 1155 ((not prev-char) 1156 (error "bad char-set")) 1157 (else 1158 (let ((char (string-ref str (+ i 1)))) 1159 (apply 1160 (lambda (c j) 1161 (if (char<? c prev-char) 1162 (error "inverted range in char-set" prev-char c) 1163 (go j #f (cset-union cset (range->cset prev-char c))))) 1164 (cond 1165 ((and (eqv? #\\ char) (assv char posix-escape-sequences)) 1166 => (lambda (x) (list (cdr x) (+ i 3)))) 1167 ((and (eqv? #\\ char) 1168 (eqv? (string-ref str (+ i 2)) #\x)) 1169 (string-parse-hex-escape str (+ i 3) end)) 1170 ((and utf8? (<= #x80 (char->integer char) #xFF)) 1171 (let ((len (utf8-start-char->length char))) 1172 (list (utf8-string-ref str (+ i 1) len) (+ i 1 len)))) 1173 (else 1174 (list char (+ i 2))))))))) 1175 ((#\[) 1176 (let* ((inv? (eqv? #\^ (string-ref str (+ i 1)))) 1177 (i2 (if inv? (+ i 2) (+ i 1)))) 1178 (case (string-ref str i2) 1179 ((#\:) 1180 (let ((j (string-scan-char str #\: (+ i2 1)))) 1181 (if (or (not j) (not (eqv? #\] (string-ref str (+ j 1))))) 1182 (error "incomplete character class" str) 1183 (let* ((class (sre->cset 1184 (string->symbol 1185 (substring str (+ i2 1) j)))) 1186 (class (if inv? (cset-complement class) class))) 1187 (go (+ j 2) #f (cset-union cset class)))))) 1188 ((#\= #\.) 1189 (error "collating sequences not supported" str)) 1190 (else 1191 (go (+ i 1) #\[ (cset-adjoin cset #\[)))))) 1192 ((#\\) 1193 (let ((c (string-ref str (+ i 1)))) 1194 (case c 1195 ((#\d #\D #\s #\S #\w #\W) 1196 (go (+ i 2) #f 1197 (cset-union cset 1198 (sre->cset (string->sre (string #\\ c)))))) 1199 ((#\x) 1200 (apply 1201 (lambda (ch j) 1202 (go j ch (cset-adjoin cset ch))) 1203 (string-parse-hex-escape str (+ i 2) end))) 1204 (else 1205 (let ((c (cond ((assv c posix-escape-sequences) => cdr) 1206 (else c)))) 1207 (go (+ i 2) c (cset-adjoin cset c))))))) 1208 (else 1209 (if (and utf8? (<= #x80 (char->integer c) #xFF)) 1210 (let ((len (utf8-start-char->length c))) 1211 (go (+ i len) 1212 (utf8-string-ref str i len) 1213 (cset-adjoin cset (utf8-string-ref str i len)))) 1214 (go (+ i 1) c (cset-adjoin cset c)))))))) 1215 (if invert? 1216 (go (+ start 1) 1217 #f 1218 (if (flag-set? flags ~multi-line?) 1219 (char->cset #\newline) 1220 (make-cset))) 1221 (go start #f (make-cset))))) 1222 1223;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1224;;;; UTF-8 Utilities 1225 1226;; Here are some hairy optimizations that need to be documented 1227;; better. Thanks to these, we never do any utf8 processing once the 1228;; regexp is compiled. 1229 1230;; two chars: ab..ef 1231;; a[b..xFF]|[b-d][x80..xFF]|e[x80..xFF] 1232 1233;; three chars: abc..ghi 1234;; ab[c..xFF]|a[d..xFF][x80..xFF]| 1235;; [b..f][x80..xFF][x80..xFF]| 1236;; g[x80..g][x80..xFF]|gh[x80..i] 1237 1238;; four chars: abcd..ghij 1239;; abc[d..xFF]|ab[d..xFF][x80..xFF]|a[c..xFF][x80..xFF][x80..xFF]| 1240;; [b..f][x80..xFF][x80..xFF][x80..xFF]| 1241;; g[x80..g][x80..xFF][x80..xFF]|gh[x80..h][x80..xFF]|ghi[x80..j] 1242 1243(define (high-char? c) (<= #x80 (char->integer c))) 1244 1245;; number of total bytes in a utf8 char given the 1st byte 1246 1247(define utf8-start-char->length 1248 (let ((table '#( 12491 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 0x 12501 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 1x 12511 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 2x 12521 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 3x 12531 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 4x 12541 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 5x 12551 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 6x 12561 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 7x 12571 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 8x 12581 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 9x 12591 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; ax 12601 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; bx 12612 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 ; cx 12622 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 ; dx 12633 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 ; ex 12644 4 4 4 4 4 4 4 5 5 5 5 6 6 0 0 ; fx 1265))) 1266 (lambda (c) (vector-ref table (char->integer c))))) 1267 1268(define (utf8-string-ref str i len) str) 1269 1270(define (utf8-backup-to-initial-char str i) str) 1271 1272(define (utf8-lowest-digit-of-length len) 1273 (case len 1274 ((1) 0) ((2) #xC0) ((3) #xE0) ((4) #xF0) 1275 (else (error "invalid utf8 length" len)))) 1276 1277(define (utf8-highest-digit-of-length len) 1278 (case len 1279 ((1) #x7F) ((2) #xDF) ((3) #xEF) ((4) #xF7) 1280 (else (error "invalid utf8 length" len)))) 1281 1282#| ;; NMOSH: we don't need this one 1283;; Maybe this should just modify the input? 1284(define (cset->utf8-pattern cset) 1285 (let lp ((ls (cset->plist cset)) (alts '()) (lo-cset '())) 1286 (if (null? ls) 1287 (sre-alternate (append (reverse alts) 1288 (if (null? lo-cset) 1289 '() 1290 (list (cons '/ (reverse lo-cset)))))) 1291 (if (or (high-char? (car ls)) (high-char? (cadr ls))) 1292 (lp (cddr ls) 1293 (cons (unicode-range->utf8-pattern (car ls) (cadr ls)) alts) 1294 lo-cset) 1295 (lp (cddr ls) alts (cons (cadr ls) (cons (car ls) lo-cset))))))) 1296|# 1297 1298(define (sre-adjust-utf8 sre flags) sre) 1299 1300;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1301;;;; Compilation 1302 1303(define (irregex x . o) 1304 (cond 1305 ((irregex? x) x) 1306 ((string? x) (apply string->irregex x o)) 1307 (else (apply sre->irregex x o)))) 1308 1309(define (string->irregex str . o) 1310 (apply sre->irregex (apply string->sre str o) o)) 1311 1312(define (sre->irregex sre . o) 1313 (let* ((pat-flags (symbol-list->flags o)) 1314 (sre (if *allow-utf8-mode?* 1315 (sre-adjust-utf8 sre pat-flags) 1316 sre)) 1317 (searcher? (sre-searcher? sre)) 1318 (sre-dfa (if searcher? (sre-remove-initial-bos sre) sre)) 1319 (dfa-limit (cond ((memq 'small o) 1) ((memq 'fast o) 50) (else 10))) 1320 (dfa/search 1321 (cond ((memq 'backtrack o) #f) 1322 (searcher? #t) 1323 ((sre->nfa `(seq (* any) ,sre-dfa) pat-flags) 1324 => (lambda (nfa) 1325 (nfa->dfa nfa (* dfa-limit (nfa-num-states nfa))))) 1326 (else #f))) 1327 (dfa (cond ((and dfa/search (sre->nfa sre-dfa pat-flags)) 1328 => (lambda (nfa) 1329 (nfa->dfa nfa (* dfa-limit (nfa-num-states nfa))))) 1330 (else #f))) 1331 (submatches (sre-count-submatches sre-dfa)) 1332 (extractor 1333 (and dfa dfa/search (sre-match-extractor sre-dfa submatches))) 1334 (names (sre-names sre-dfa 1 '())) 1335 (lens (sre-length-ranges sre-dfa names)) 1336 (flags (flag-join 1337 (flag-join ~none (and searcher? ~searcher?)) 1338 (and (sre-consumer? sre) ~consumer?)))) 1339 (cond 1340 (dfa 1341 (make-irregex dfa dfa/search extractor #f flags submatches lens names)) 1342 (else 1343 (let ((f (sre->procedure sre pat-flags names))) 1344 (make-irregex #f #f #f f flags submatches lens names)))))) 1345 1346;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1347;;;; SRE Analysis 1348 1349;; returns #t if the sre can ever be empty 1350(define (sre-empty? sre) 1351 (if (pair? sre) 1352 (case (car sre) 1353 ((* ? look-ahead look-behind neg-look-ahead neg-look-behind) #t) 1354 ((**) (or (not (number? (cadr sre))) (zero? (cadr sre)))) 1355 ((or) (any sre-empty? (cdr sre))) 1356 ((: seq $ submatch => submatch-named + atomic) 1357 (every sre-empty? (cdr sre))) 1358 (else #f)) 1359 (memq sre '(epsilon bos eos bol eol bow eow commit)))) 1360 1361(define (sre-any? sre) 1362 (or (eq? sre 'any) 1363 (and (pair? sre) 1364 (case (car sre) 1365 ((seq : $ submatch => submatch-named) 1366 (and (pair? (cdr sre)) (null? (cddr sre)) (sre-any? (cadr sre)))) 1367 ((or) (every sre-any? (cdr sre))) 1368 (else #f))))) 1369 1370(define (sre-repeater? sre) 1371 (and (pair? sre) 1372 (or (memq (car sre) '(* +)) 1373 (and (memq (car sre) '($ submatch => submatch-named seq :)) 1374 (pair? (cdr sre)) 1375 (null? (cddr sre)) 1376 (sre-repeater? (cadr sre)))))) 1377 1378(define (sre-searcher? sre) 1379 (if (pair? sre) 1380 (case (car sre) 1381 ((* +) (sre-any? (sre-sequence (cdr sre)))) 1382 ((seq : $ submatch => submatch-named) 1383 (and (pair? (cdr sre)) (sre-searcher? (cadr sre)))) 1384 ((or) (every sre-searcher? (cdr sre))) 1385 (else #f)) 1386 (eq? 'bos sre))) 1387 1388(define (sre-consumer? sre) 1389 (if (pair? sre) 1390 (case (car sre) 1391 ((* +) (sre-any? (sre-sequence (cdr sre)))) 1392 ((seq : $ submatch => submatch-named) 1393 (and (pair? (cdr sre)) (sre-consumer? (last sre)))) 1394 ((or) (every sre-consumer? (cdr sre))) 1395 (else #f)) 1396 (eq? 'eos sre))) 1397 1398(define (sre-has-submatches? sre) 1399 (and (pair? sre) 1400 (or (memq (car sre) '($ submatch => submatch-named)) 1401 (if (eq? 'posix-string (car sre)) 1402 (sre-has-submatches? (string->sre (cadr sre))) 1403 (any sre-has-submatches? (cdr sre)))))) 1404 1405(define (sre-count-submatches sre) 1406 (let count ((sre sre) (sum 0)) 1407 (if (pair? sre) 1408 (fold count 1409 (+ sum (case (car sre) 1410 (($ submatch => submatch-named) 1) 1411 ((dsm) (+ (cadr sre) (caddr sre))) 1412 ((posix-string) 1413 (sre-count-submatches (string->sre (cadr sre)))) 1414 (else 0))) 1415 (cdr sre)) 1416 sum))) 1417 1418(define (sre-length-ranges sre . o) 1419 (let ((names (if (pair? o) (car o) (sre-names sre 1 '()))) 1420 (sublens (make-vector (+ 1 (sre-count-submatches sre)) #f))) 1421 (vector-set! 1422 sublens 1423 0 1424 (let lp ((sre sre) (n 1) (lo 0) (hi 0) (return cons)) 1425 (define (grow i) (return (+ lo i) (and hi (+ hi i)))) 1426 (cond 1427 ((pair? sre) 1428 (if (string? (car sre)) 1429 (grow 1) 1430 (case (car sre) 1431 ((/ ~ & -) 1432 (grow 1)) 1433 ((posix-string) 1434 (lp (string->sre (cadr sre)) n lo hi return)) 1435 ((seq : w/case w/nocase atomic) 1436 (let lp2 ((ls (cdr sre)) (n n) (lo2 0) (hi2 0)) 1437 (if (null? ls) 1438 (return (+ lo lo2) (and hi hi2 (+ hi hi2))) 1439 (lp (car ls) n 0 0 1440 (lambda (lo3 hi3) 1441 (lp2 (cdr ls) 1442 (+ n (sre-count-submatches (car ls))) 1443 (+ lo2 lo3) 1444 (and hi2 hi3 (+ hi2 hi3)))))))) 1445 ((or) 1446 (let lp2 ((ls (cdr sre)) (n n) (lo2 #f) (hi2 0)) 1447 (if (null? ls) 1448 (return (+ lo (or lo2 1)) (and hi hi2 (+ hi hi2))) 1449 (lp (car ls) n 0 0 1450 (lambda (lo3 hi3) 1451 (lp2 (cdr ls) 1452 (+ n (sre-count-submatches (car ls))) 1453 (if lo2 (min lo2 lo3) lo3) 1454 (and hi2 hi3 (max hi2 hi3)))))))) 1455 ((if) 1456 (cond 1457 ((or (null? (cdr sre)) (null? (cddr sre))) 1458 (return lo hi)) 1459 (else 1460 (let ((n1 (sre-count-submatches (car sre))) 1461 (n2 (sre-count-submatches (cadr sre)))) 1462 (lp (if (or (number? (cadr sre)) (symbol? (cadr sre))) 1463 'epsilon 1464 (cadr sre)) 1465 n lo hi 1466 (lambda (lo2 hi2) 1467 (lp (caddr sre) (+ n n1) 0 0 1468 (lambda (lo3 hi3) 1469 (lp (if (pair? (cdddr sre)) 1470 (cadddr sre) 1471 'epsilon) 1472 (+ n n1 n2) 0 0 1473 (lambda (lo4 hi4) 1474 (return (+ lo2 (min lo3 lo4)) 1475 (and hi2 hi3 hi4 1476 (+ hi2 (max hi3 hi4)) 1477 )))))))))))) 1478 ((dsm) 1479 (lp (sre-sequence (cdddr sre)) (+ n (cadr sre)) lo hi return)) 1480 (($ submatch => submatch-named) 1481 (lp (sre-sequence 1482 (if (eq? 'submatch (car sre)) (cdr sre) (cddr sre))) 1483 (+ n 1) lo hi 1484 (lambda (lo2 hi2) 1485 (vector-set! sublens n (cons lo2 hi2)) 1486 (return lo2 hi2)))) 1487 ((backref backref-ci) 1488 (let ((n (cond 1489 ((number? (cadr sre)) (cadr sre)) 1490 ((assq (cadr sre) names) => cdr) 1491 (else (error "unknown backreference" (cadr sre)))))) 1492 (cond 1493 ((or (not (integer? n)) 1494 (not (< 0 n (vector-length sublens)))) 1495 (error "sre-length: invalid backreference" sre)) 1496 ((not (vector-ref sublens n)) 1497 (error "sre-length: invalid forward backreference" sre)) 1498 (else 1499 (let ((lo2 (car (vector-ref sublens n))) 1500 (hi2 (cdr (vector-ref sublens n)))) 1501 (return (+ lo lo2) (and hi hi2 (+ hi hi2)))))))) 1502 ((* *?) 1503 (lp (sre-sequence (cdr sre)) n lo hi (lambda (lo hi) #f)) 1504 (return lo #f)) 1505 ((** **?) 1506 (cond 1507 ((or (and (number? (cadr sre)) 1508 (number? (caddr sre)) 1509 (> (cadr sre) (caddr sre))) 1510 (and (not (cadr sre)) (caddr sre))) 1511 (return lo hi)) 1512 (else 1513 (if (caddr sre) 1514 (lp (sre-sequence (cdddr sre)) n 0 0 1515 (lambda (lo2 hi2) 1516 (return (+ lo (* (cadr sre) lo2)) 1517 (and hi hi2 (+ hi (* (caddr sre) hi2)))))) 1518 (lp (sre-sequence (cdddr sre)) n 0 0 1519 (lambda (lo2 hi2) 1520 (return (+ lo (* (cadr sre) lo2)) #f))))))) 1521 ((+) 1522 (lp (sre-sequence (cdr sre)) n lo hi 1523 (lambda (lo2 hi2) 1524 (return (+ lo lo2) #f)))) 1525 ((? ??) 1526 (lp (sre-sequence (cdr sre)) n lo hi 1527 (lambda (lo2 hi2) 1528 (return lo (and hi hi2 (+ hi hi2)))))) 1529 ((= =? >= >=?) 1530 (lp `(** ,(cadr sre) 1531 ,(if (memq (car sre) '(>= >=?)) #f (cadr sre)) 1532 ,@(cddr sre)) 1533 n lo hi return)) 1534 ((look-ahead neg-look-ahead look-behind neg-look-behind) 1535 (return lo hi)) 1536 (else 1537 (cond 1538 ((assq (car sre) sre-named-definitions) 1539 => (lambda (cell) 1540 (lp (apply (cdr cell) (cdr sre)) n lo hi return))) 1541 (else 1542 (error "sre-length-ranges: unknown sre operator" sre))))))) 1543 ((char? sre) 1544 (grow 1)) 1545 ((string? sre) 1546 (grow (string-length sre))) 1547 ((memq sre '(any nonl)) 1548 (grow 1)) 1549 ((memq sre '(epsilon bos eos bol eol bow eow nwb commit)) 1550 (return lo hi)) 1551 (else 1552 (let ((cell (assq sre sre-named-definitions))) 1553 (if cell 1554 (lp (if (procedure? (cdr cell)) ((cdr cell)) (cdr cell)) 1555 n lo hi return) 1556 (error "sre-length-ranges: unknown sre" sre))))))) 1557 sublens)) 1558 1559;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1560;;;; SRE Manipulation 1561 1562;; build a (seq ls ...) sre from a list 1563(define (sre-sequence ls) 1564 (cond 1565 ((null? ls) 'epsilon) 1566 ((null? (cdr ls)) (car ls)) 1567 (else (cons 'seq ls)))) 1568 1569;; build a (or ls ...) sre from a list 1570(define (sre-alternate ls) 1571 (cond 1572 ((null? ls) '(or)) 1573 ((null? (cdr ls)) (car ls)) 1574 (else (cons 'or ls)))) 1575 1576;; returns an equivalent SRE without any match information 1577(define (sre-strip-submatches sre) 1578 (if (not (pair? sre)) 1579 sre 1580 (case (car sre) 1581 (($ submatch) (sre-strip-submatches (sre-sequence (cdr sre)))) 1582 ((=> submatch-named) (sre-strip-submatches (sre-sequence (cddr sre)))) 1583 ((dsm) (sre-strip-submatches (sre-sequence (cdddr sre)))) 1584 (else (map sre-strip-submatches sre))))) 1585 1586;; given a char-set list of chars and strings, flattens them into 1587;; chars only 1588(define (sre-flatten-ranges ls) 1589 (let lp ((ls ls) (res '())) 1590 (cond 1591 ((null? ls) 1592 (reverse res)) 1593 ((string? (car ls)) 1594 (lp (append (string->list (car ls)) (cdr ls)) res)) 1595 (else 1596 (lp (cdr ls) (cons (car ls) res)))))) 1597 1598(define (sre-names sre n names) 1599 (if (not (pair? sre)) 1600 names 1601 (case (car sre) 1602 (($ submatch) 1603 (sre-names (sre-sequence (cdr sre)) (+ n 1) names)) 1604 ((=> submatch-named) 1605 (sre-names (sre-sequence (cddr sre)) 1606 (+ n 1) 1607 (cons (cons (cadr sre) n) names))) 1608 ((dsm) 1609 (sre-names (sre-sequence (cdddr sre)) (+ n (cadr sre)) names)) 1610 ((seq : or * + ? *? ?? w/case w/nocase atomic 1611 look-ahead look-behind neg-look-ahead neg-look-behind) 1612 (sre-sequence-names (cdr sre) n names)) 1613 ((= >=) 1614 (sre-sequence-names (cddr sre) n names)) 1615 ((** **?) 1616 (sre-sequence-names (cdddr sre) n names)) 1617 (else 1618 names)))) 1619 1620(define (sre-sequence-names ls n names) 1621 (if (null? ls) 1622 names 1623 (sre-sequence-names (cdr ls) 1624 (+ n (sre-count-submatches (car ls))) 1625 (sre-names (car ls) n names)))) 1626 1627(define (sre-remove-initial-bos sre) 1628 (cond 1629 ((pair? sre) 1630 (case (car sre) 1631 ((seq : $ submatch => submatch-named * +) 1632 (cond 1633 ((not (pair? (cdr sre))) 1634 sre) 1635 ((eq? 'bos (cadr sre)) 1636 (cons (car sre) (cddr sre))) 1637 (else 1638 (cons (car sre) 1639 (cons (sre-remove-initial-bos (cadr sre)) (cddr sre)))))) 1640 ((or) 1641 (sre-alternate (map sre-remove-initial-bos (cdr sre)))) 1642 (else 1643 sre))) 1644 (else 1645 sre))) 1646 1647;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1648;;;; Basic Matching 1649 1650(define irregex-basic-string-chunker 1651 (make-irregex-chunker (lambda (x) #f) 1652 car 1653 cadr 1654 caddr 1655 (lambda (src1 i src2 j) 1656 (substring (car src1) i j)))) 1657 1658(define (irregex-search x str . o) 1659 (if (not (string? str)) (error "irregex-search: not a string" str)) 1660 (let ((start (or (and (pair? o) (car o)) 0)) 1661 (end (or (and (pair? o) (pair? (cdr o)) (cadr o)) (string-length str)))) 1662 (if (not (and (integer? start) (exact? start))) 1663 (error "irregex-search: not an exact integer" start)) 1664 (if (not (and (integer? end) (exact? end))) 1665 (error "irregex-search: not an exact integer" end)) 1666 (irregex-search/chunked x 1667 irregex-basic-string-chunker 1668 (list str start end) 1669 start))) 1670 1671(define (irregex-search/chunked x cnk src . o) 1672 (let* ((irx (irregex x)) 1673 (matches (irregex-new-matches irx)) 1674 (i (if (pair? o) (car o) ((chunker-get-start cnk) src)))) 1675 (irregex-match-chunker-set! matches cnk) 1676 (irregex-search/matches irx cnk src i matches))) 1677 1678;; internal routine, can be used in loops to avoid reallocating the 1679;; match vector 1680(define (irregex-search/matches irx cnk src i matches) 1681 (cond 1682 ((irregex-dfa irx) 1683 (cond 1684 ((flag-set? (irregex-flags irx) ~searcher?) 1685 (cond 1686 ((dfa-match/longest (irregex-dfa irx) cnk src i #f #f matches 0) 1687 (irregex-match-start-chunk-set! matches 0 src) 1688 (irregex-match-start-index-set! matches 0 i) 1689 ((irregex-dfa/extract irx) 1690 cnk src i 1691 (%irregex-match-end-chunk matches 0) 1692 (%irregex-match-end-index matches 0) 1693 matches) 1694 matches) 1695 (else 1696 #f))) 1697 ((dfa-match/shortest 1698 (irregex-dfa/search irx) cnk src i matches 0) 1699 (let ((dfa (irregex-dfa irx)) 1700 (get-start (chunker-get-start cnk)) 1701 (get-end (chunker-get-end cnk)) 1702 (get-next (chunker-get-next cnk))) 1703 (let lp1 ((src src) (i i)) 1704 (let ((end (get-end src))) 1705 (let lp2 ((i i)) 1706 (cond 1707 ((dfa-match/longest dfa cnk src i #f #f matches 0) 1708 (irregex-match-start-chunk-set! matches 0 src) 1709 (irregex-match-start-index-set! matches 0 i) 1710 ((irregex-dfa/extract irx) 1711 cnk src i 1712 (%irregex-match-end-chunk matches 0) 1713 (%irregex-match-end-index matches 0) 1714 matches) 1715 matches) 1716 ((>= i end) 1717 (let ((next (get-next src))) 1718 (and next (lp1 next (get-start next))))) 1719 (else 1720 (lp2 (+ i 1))))))))) 1721 (else 1722 #f))) 1723 (else 1724 (let ((res (irregex-search/backtrack irx cnk src i matches))) 1725 (if res (%irregex-match-fail-set! res #f)) 1726 res)))) 1727 1728(define (irregex-search/backtrack irx cnk src i matches) 1729 (let ((matcher (irregex-nfa irx)) 1730 (str ((chunker-get-str cnk) src)) 1731 (end ((chunker-get-end cnk) src)) 1732 (get-next (chunker-get-next cnk)) 1733 (init (cons src i))) 1734 (if (flag-set? (irregex-flags irx) ~searcher?) 1735 (matcher cnk init src str i end matches (lambda () #f)) 1736 (let lp ((src2 src) 1737 (str str) 1738 (i i) 1739 (end end)) 1740 (cond 1741 ((matcher cnk init src2 str i end matches (lambda () #f)) 1742 (irregex-match-start-chunk-set! matches 0 src2) 1743 (irregex-match-start-index-set! matches 0 i) 1744 matches) 1745 ((< i end) 1746 (lp src2 str (+ i 1) end)) 1747 (else 1748 (let ((src2 (get-next src2))) 1749 (if src2 1750 (lp src2 1751 ((chunker-get-str cnk) src2) 1752 ((chunker-get-start cnk) src2) 1753 ((chunker-get-end cnk) src2)) 1754 #f)))))))) 1755 1756(define (irregex-match irx str . o) 1757 (if (not (string? str)) (error "irregex-match: not a string" str)) 1758 (let ((start (or (and (pair? o) (car o)) 0)) 1759 (end (or (and (pair? o) (pair? (cdr o)) (cadr o)) (string-length str)))) 1760 (if (not (and (integer? start) (exact? start))) 1761 (error "irregex-match: not an exact integer" start)) 1762 (if (not (and (integer? end) (exact? end))) 1763 (error "irregex-match: not an exact integer" end)) 1764 (irregex-match/chunked irx 1765 irregex-basic-string-chunker 1766 (list str start end)))) 1767 1768(define (irregex-match/chunked irx cnk src) 1769 (let* ((irx (irregex irx)) 1770 (matches (irregex-new-matches irx))) 1771 (irregex-match-chunker-set! matches cnk) 1772 (cond 1773 ((irregex-dfa irx) 1774 (and 1775 (dfa-match/longest 1776 (irregex-dfa irx) cnk src ((chunker-get-start cnk) src) #f #f matches 0) 1777 (= ((chunker-get-end cnk) (%irregex-match-end-chunk matches 0)) 1778 (%irregex-match-end-index matches 0)) 1779 (begin 1780 (irregex-match-start-chunk-set! matches 0 src) 1781 (irregex-match-start-index-set! matches 1782 0 1783 ((chunker-get-start cnk) src)) 1784 ((irregex-dfa/extract irx) 1785 cnk src ((chunker-get-start cnk) src) 1786 (%irregex-match-end-chunk matches 0) 1787 (%irregex-match-end-index matches 0) 1788 matches) 1789 matches))) 1790 (else 1791 (let* ((matcher (irregex-nfa irx)) 1792 (str ((chunker-get-str cnk) src)) 1793 (i ((chunker-get-start cnk) src)) 1794 (end ((chunker-get-end cnk) src)) 1795 (init (cons src i))) 1796 (let lp ((m (matcher cnk init src str i end matches (lambda () #f)))) 1797 (and m 1798 (cond 1799 ((and (not ((chunker-get-next cnk) 1800 (%irregex-match-end-chunk m 0))) 1801 (= ((chunker-get-end cnk) 1802 (%irregex-match-end-chunk m 0)) 1803 (%irregex-match-end-index m 0))) 1804 (%irregex-match-fail-set! m #f) 1805 m) 1806 ((%irregex-match-fail m) 1807 (lp ((%irregex-match-fail m)))) 1808 (else 1809 #f))))))))) 1810 1811(define (irregex-match? . args) 1812 (and (apply irregex-match args) #t)) 1813 1814;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1815;;;; DFA Matching 1816 1817;; inline these 1818(define (dfa-init-state dfa) 1819 (vector-ref dfa 0)) 1820(define (dfa-next-state dfa node) 1821 (vector-ref dfa (cdr node))) 1822(define (dfa-final-state? dfa state) 1823 (car state)) 1824 1825;; this searches for the first end index for which a match is possible 1826(define (dfa-match/shortest dfa cnk src start matches index) 1827 (let ((get-str (chunker-get-str cnk)) 1828 (get-start (chunker-get-start cnk)) 1829 (get-end (chunker-get-end cnk)) 1830 (get-next (chunker-get-next cnk))) 1831 (let lp1 ((src src) (start start) (state (dfa-init-state dfa))) 1832 (and 1833 src 1834 (let ((str (get-str src)) 1835 (end (get-end src))) 1836 (let lp2 ((i start) (state state)) 1837 (cond 1838 ((dfa-final-state? dfa state) 1839 (cond 1840 (index 1841 (irregex-match-end-chunk-set! matches index src) 1842 (irregex-match-end-index-set! matches index i))) 1843 #t) 1844 ((< i end) 1845 (let* ((ch (string-ref str i)) 1846 (next (find (lambda (x) 1847 (or (eqv? ch (car x)) 1848 (and (not (char? (car x))) 1849 (cset-contains? (car x) ch)))) 1850 (cdr state)))) 1851 (and next (lp2 (+ i 1) (dfa-next-state dfa next))))) 1852 (else 1853 (let ((next (get-next src))) 1854 (and next (lp1 next (get-start next) state))))))))))) 1855 1856;; this finds the longest match starting at a given index 1857(define (dfa-match/longest dfa cnk src start end-src end matches index) 1858 (let ((get-str (chunker-get-str cnk)) 1859 (get-start (chunker-get-start cnk)) 1860 (get-end (chunker-get-end cnk)) 1861 (get-next (chunker-get-next cnk)) 1862 (start-is-final? (dfa-final-state? dfa (dfa-init-state dfa)))) 1863 (cond 1864 (index 1865 (irregex-match-end-chunk-set! matches index #f) 1866 (irregex-match-end-index-set! matches index #f))) 1867 (let lp1 ((src src) 1868 (start start) 1869 (state (dfa-init-state dfa)) 1870 (res-src (and start-is-final? src)) 1871 (res-index (and start-is-final? start))) 1872 (let ((str (get-str src)) 1873 (end (if (eq? src end-src) end (get-end src)))) 1874 (let lp2 ((i start) 1875 (state state) 1876 (res-src res-src) 1877 (res-index res-index)) 1878 (cond 1879 ((>= i end) 1880 (cond 1881 ((and index res-src) 1882 (irregex-match-end-chunk-set! matches index res-src) 1883 (irregex-match-end-index-set! matches index res-index))) 1884 (let ((next (and (not (eq? src end-src)) (get-next src)))) 1885 (if next 1886 (lp1 next (get-start next) state res-src res-index) 1887 (and index 1888 (%irregex-match-end-chunk matches index) 1889 #t)))) 1890 (else 1891 (let* ((ch (string-ref str i)) 1892 (cell (find (lambda (x) 1893 (or (eqv? ch (car x)) 1894 (and (not (char? (car x))) 1895 (cset-contains? (car x) ch)))) 1896 (cdr state)))) 1897 (cond 1898 (cell 1899 (let ((next (dfa-next-state dfa cell))) 1900 (if (dfa-final-state? dfa next) 1901 (lp2 (+ i 1) next src (+ i 1)) 1902 (lp2 (+ i 1) next res-src res-index)))) 1903 (res-src 1904 (cond 1905 (index 1906 (irregex-match-end-chunk-set! matches index res-src) 1907 (irregex-match-end-index-set! matches index res-index))) 1908 #t) 1909 ((and index (%irregex-match-end-chunk matches index)) 1910 #t) 1911 (else 1912 #f)))))))))) 1913 1914;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 1915;;;; Named Definitions 1916 1917(define sre-named-definitions 1918 `((any . ,*all-chars*) 1919 (nonl . (- ,*all-chars* (,(string #\newline)))) 1920 (alphabetic . (/ #\a #\z #\A #\Z)) 1921 (alpha . alphabetic) 1922 (alphanumeric . (/ #\a #\z #\A #\Z #\0 #\9)) 1923 (alphanum . alphanumeric) 1924 (alnum . alphanumeric) 1925 (lower-case . (/ #\a #\z)) 1926 (lower . lower-case) 1927 (upper-case . (/ #\A #\Z)) 1928 (upper . upper-case) 1929 (numeric . (/ #\0 #\9)) 1930 (num . numeric) 1931 (digit . numeric) 1932 (punctuation . (or #\! #\" #\# #\% #\& #\' #\( #\) #\* #\, #\- #\. 1933 #\/ #\: #\; #\? #\@ #\[ #\\ #\] #\_ #\{ #\})) 1934 (punct . punctuation) 1935 (graphic 1936 . (or alphanumeric punctuation #\$ #\+ #\< #\= #\> #\^ #\` #\| #\~)) 1937 (graph . graphic) 1938 (blank . (or #\space ,(integer->char (- (char->integer #\space) 23)))) 1939 (whitespace . (or blank #\newline)) 1940 (space . whitespace) 1941 (white . whitespace) 1942 (printing or graphic whitespace) 1943 (print . printing) 1944 1945 ;; XXXX we assume a (possibly shifted) ASCII-based ordering 1946 (control . (/ ,(integer->char (- (char->integer #\space) 32)) 1947 ,(integer->char (- (char->integer #\space) 1)))) 1948 (cntrl . control) 1949 (hex-digit . (or numeric (/ #\a #\f #\A #\F))) 1950 (xdigit . hex-digit) 1951 (ascii . (/ ,(integer->char (- (char->integer #\space) 32)) 1952 ,(integer->char (+ (char->integer #\space) 95)))) 1953 (ascii-nonl . (/ ,(integer->char (- (char->integer #\space) 32)) 1954 ,(integer->char (- (char->integer #\newline) 1)) 1955 ,(integer->char (+ (char->integer #\newline) 1)) 1956 ,(integer->char (+ (char->integer #\space) 95)))) 1957 (newline . (or (seq ,(integer->char (+ (char->integer #\newline) 3)) 1958 #\newline) 1959 (/ #\newline 1960 ,(integer->char (+ (char->integer #\newline) 3))))) 1961 1962 ;; ... it's really annoying to support old Scheme48 1963 (word . (seq bow (+ (or alphanumeric #\_)) eow)) 1964 (utf8-tail-char . (/ ,(integer->char (+ (char->integer #\space) #x60)) 1965 ,(integer->char (+ (char->integer #\space) #xA1)))) 1966 (utf8-2-char . (seq (/ ,(integer->char (+ (char->integer #\space) #xA2)) 1967 ,(integer->char (+ (char->integer #\space) #xBF))) 1968 utf8-tail-char)) 1969 (utf8-3-char . (seq (/ ,(integer->char (+ (char->integer #\space) #xC0)) 1970 ,(integer->char (+ (char->integer #\space) #xCF))) 1971 utf8-tail-char 1972 utf8-tail-char)) 1973 (utf8-4-char . (seq (/ ,(integer->char (+ (char->integer #\space) #xD0)) 1974 ,(integer->char (+ (char->integer #\space) #xD7))) 1975 utf8-tail-char 1976 utf8-tail-char 1977 utf8-tail-char)) 1978 (utf8-any . (or ascii utf8-2-char utf8-3-char utf8-4-char)) 1979 (utf8-nonl . (or ascii-nonl utf8-2-char utf8-3-char utf8-4-char)) 1980 1981 ;; extended library patterns 1982 (integer . (seq (? (or #\+ #\-)) (+ numeric))) 1983 (real . (seq (+ numeric) (? #\. (+ numeric)) (? (or #\e #\E) integer))) 1984 ;; slightly more lax than R5RS, allow ->foo, etc. 1985 (symbol-initial . (or alpha ("!$%&*/:<=>?^_~"))) 1986 (symbol-subsequent . (or symbol-initial digit ("+-.@"))) 1987 (symbol . (or (seq symbol-initial (* symbol-subsequent)) 1988 (seq ("+-") (? symbol-initial (* symbol-subsequent))) 1989 (seq ".." (* ".")))) 1990 (sexp-space . (seq (* (* space) ";" (* nonl) newline) (+ space))) 1991 (string . (seq #\" (escape #\\ #\") #\")) 1992 (escape . ,(lambda (esc . o) `(* (or (~ ,esc ,@o) (seq ,esc any))))) 1993 1994 (ipv4-digit . (seq (? (/ "12")) (? numeric) numeric)) 1995 (ipv4-address . (seq ipv4-digit (= 3 #\. ipv4-digit))) 1996 ;; XXXX lax, allows multiple double-colons or < 8 terms w/o a :: 1997 (ipv6-address . (seq (** 0 4 hex-digit) 1998 (** 1 7 #\: (? #\:) (** 0 4 hex-digit)))) 1999 (ip-address . (or ipv4-address ipv6-address)) 2000 (domain-atom . (+ (or alphanumeric #\_ #\-))) 2001 (domain . (seq domain-atom (+ #\. domain-atom))) 2002 ;; XXXX now anything can be a top-level domain, but this is still handy 2003 (top-level-domain . (w/nocase (or "arpa" "com" "gov" "mil" "net" "org" 2004 "aero" "biz" "coop" "info" "museum" 2005 "name" "pro" (= 2 alpha)))) 2006 (domain/common . (seq (+ domain-atom #\.) top-level-domain)) 2007 ;;(email-local-part . (seq (+ (or (~ #\") string)))) 2008 (email-local-part . (+ (or alphanumeric #\_ #\- #\. #\+))) 2009 (email . (seq email-local-part #\@ domain)) 2010 (url-char . (or alnum #\_ #\- #\+ #\\ #\= #\~ #\. #\, #\& #\; 2011 (seq "%" hex-digit hex-digit))) 2012 (url-final-char . (or alnum #\_ #\- #\+ #\\ #\= #\~ #\& 2013 (seq "%" hex-digit hex-digit))) 2014 (http-url . (w/nocase 2015 "http" (? "s") "://" 2016 (or domain/common ipv4-address) ;; (seq "[" ipv6-address "]") 2017 (? ":" (+ numeric)) ;; port 2018 ;; path 2019 (? "/" (* url-char) 2020 (? "?" (* url-char)) ;; query 2021 (? "#" (? (* url-char) url-final-char)) ;; fragment 2022 ))) 2023 2024 )) 2025 2026;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2027;;;; SRE->NFA compilation 2028;; 2029;; An NFA state is a numbered node with a list of pattern->number 2030;; transitions, where pattern is character set range, or epsilon 2031;; (indicating an empty transition). 2032;; There may be overlapping ranges - since it's an NFA we process it 2033;; by considering all possible transitions. 2034 2035(define *nfa-presize* 128) ;; constant 2036(define *nfa-num-fields* 4) ;; constant 2037 2038(define (nfa-num-states nfa) (quotient (vector-length nfa) *nfa-num-fields*)) 2039(define (nfa-start-state nfa) (- (nfa-num-states nfa) 1)) 2040 2041(define (nfa-get-state-trans nfa i) 2042 (vector-ref nfa (* i *nfa-num-fields*))) 2043(define (nfa-set-state-trans! nfa i x) 2044 (vector-set! nfa (* i *nfa-num-fields*) x)) 2045 2046(define (nfa-get-epsilons nfa i) 2047 (vector-ref nfa (+ (* i *nfa-num-fields*) 1))) 2048(define (nfa-set-epsilons! nfa i x) 2049 (vector-set! nfa (+ (* i *nfa-num-fields*) 1) x)) 2050(define (nfa-add-epsilon! nfa i x) 2051 (let ((eps (nfa-get-epsilons nfa i))) 2052 (if (not (memq x eps)) 2053 (nfa-set-epsilons! nfa i (cons x eps))))) 2054 2055(define (nfa-get-state-closure nfa i) 2056 (vector-ref nfa (+ (* i *nfa-num-fields*) 2))) 2057(define (nfa-set-state-closure! nfa i x) 2058 (vector-set! nfa (+ (* i *nfa-num-fields*) 2) x)) 2059 2060(define (nfa-get-closure nfa mst) 2061 (cond ((assoc mst 2062 (vector-ref nfa (+ (* (nfa-multi-state-hash nfa mst) 2063 *nfa-num-fields*) 2064 (- *nfa-num-fields* 1)))) 2065 => cdr) 2066 (else #f))) 2067(define (nfa-add-closure! nfa mst x) 2068 (let ((i (+ (* (nfa-multi-state-hash nfa mst) *nfa-num-fields*) 2069 (- *nfa-num-fields* 1)))) 2070 (vector-set! nfa i (cons (cons mst x) (vector-ref nfa i))))) 2071 2072;; Compile and return the vector of NFA states (in groups of 2073;; *nfa-num-fields* packed elements). The start state will be the 2074;; last element(s) of the vector, and all remaining states will be in 2075;; descending numeric order, with state 0 being the unique accepting 2076;; state. 2077(define (sre->nfa sre init-flags) 2078 (let ((buf (make-vector (* *nfa-presize* *nfa-num-fields*) '()))) 2079 ;; we loop over an implicit sequence list 2080 (define (lp ls n flags next) 2081 (define (new-state-number state) 2082 (max n (+ 1 state))) 2083 (define (add-state! n2 trans-ls) 2084 (if (>= (* n2 *nfa-num-fields*) (vector-length buf)) 2085 (let ((tmp (make-vector (* 2 (vector-length buf)) '()))) 2086 (do ((i (- (vector-length buf) 1) (- i 1))) 2087 ((< i 0)) 2088 (vector-set! tmp i (vector-ref buf i))) 2089 (set! buf tmp))) 2090 (nfa-set-state-trans! buf n2 trans-ls) 2091 n2) 2092 (define (extend-state! next trans-cs) 2093 (and next 2094 (add-state! (new-state-number next) (cons trans-cs next)))) 2095 (define (add-char-state! next ch) 2096 (let ((alt (char-altcase ch))) 2097 (if (flag-set? flags ~case-insensitive?) 2098 (extend-state! next (cset-union (char->cset ch) (char->cset alt))) 2099 (extend-state! next (char->cset ch))))) 2100 (if (null? ls) 2101 next 2102 (cond 2103 ((or (eq? 'epsilon (car ls)) (equal? "" (car ls))) 2104 ;; chars and epsilons go directly into the transition table 2105 (let ((next (lp (cdr ls) n flags next))) 2106 (and next 2107 (let ((new (add-state! (new-state-number next) '()))) 2108 (nfa-add-epsilon! buf new next) 2109 new)))) 2110 ((string? (car ls)) 2111 ;; process literal strings a char at a time 2112 (let ((next (lp (cdr ls) n flags next))) 2113 (and next 2114 (let lp2 ((i (- (string-length (car ls)) 1)) 2115 (next next)) 2116 (if (< i 0) 2117 next 2118 (lp2 (- i 1) 2119 (add-char-state! next (string-ref (car ls) i)))) 2120 )))) 2121 ((char? (car ls)) 2122 (add-char-state! (lp (cdr ls) n flags next) (car ls))) 2123 ((symbol? (car ls)) 2124 (let ((cell (assq (car ls) sre-named-definitions))) 2125 (and cell 2126 (lp (cons (if (procedure? (cdr cell)) 2127 ((cdr cell)) 2128 (cdr cell)) 2129 (cdr ls)) 2130 n 2131 flags 2132 next)))) 2133 ((pair? (car ls)) 2134 (cond 2135 ((string? (caar ls)) ; Enumerated character set 2136 (let ((set (if (flag-set? flags ~case-insensitive?) 2137 (cset-case-insensitive (string->cset (caar ls))) 2138 (string->cset (caar ls))))) 2139 (extend-state! (lp (cdr ls) n flags next) set))) 2140 (else 2141 (case (caar ls) 2142 ((seq :) 2143 ;; for an explicit sequence, just append to the list 2144 (lp (append (cdar ls) (cdr ls)) n flags next)) 2145 ((w/case w/nocase w/utf8 w/noutf8) 2146 (let* ((next (lp (cdr ls) n flags next)) 2147 (flags ((if (memq (caar ls) '(w/case w/utf8)) 2148 flag-clear 2149 flag-join) 2150 flags 2151 (if (memq (caar ls) '(w/case w/nocase)) 2152 ~case-insensitive? 2153 ~utf8?)))) 2154 (and next 2155 (lp (cdar ls) (new-state-number next) flags next)))) 2156 ((/ - & ~) 2157 (let ((range (sre->cset (car ls) 2158 (flag-set? flags ~case-insensitive?)))) 2159 (extend-state! (lp (cdr ls) n flags next) 2160 range))) 2161 ((or) 2162 (let ((next (lp (cdr ls) n flags next))) 2163 (and 2164 next 2165 (if (null? (cdar ls)) 2166 ;; empty (or) always fails 2167 (add-state! (new-state-number next) '()) 2168 ;; compile both branches and insert epsilon 2169 ;; transitions to either 2170 (let* ((b (lp (list (sre-alternate (cddar ls))) 2171 (new-state-number next) 2172 flags 2173 next)) 2174 (a (and b 2175 (lp (list (cadar ls)) 2176 (new-state-number (max b next)) 2177 flags 2178 next)))) 2179 (and a 2180 (let ((c (add-state! (new-state-number a) 2181 '()))) 2182 (nfa-add-epsilon! buf c a) 2183 (nfa-add-epsilon! buf c b) 2184 c))))))) 2185 ((?) 2186 (let ((next (lp (cdr ls) n flags next))) 2187 ;; insert an epsilon transition directly to next 2188 (and 2189 next 2190 (let ((a (lp (cdar ls) (new-state-number next) flags next))) 2191 (if a 2192 (nfa-add-epsilon! buf a next)) 2193 a)))) 2194 ((+ *) 2195 (let ((next (lp (cdr ls) n flags next))) 2196 (and 2197 next 2198 (let* ((new (lp '(epsilon) 2199 (new-state-number next) 2200 flags 2201 next)) 2202 (a (lp (cdar ls) (new-state-number new) flags new))) 2203 (cond 2204 (a 2205 ;; for *, insert an epsilon transition as in ? above 2206 (if (eq? '* (caar ls)) 2207 (nfa-add-epsilon! buf a new)) 2208 ;; for both, insert a loop back to self 2209 (nfa-add-epsilon! buf new a))) 2210 a)))) 2211 ;; need to add these to the match extractor first, 2212 ;; but they tend to generate large DFAs 2213 ;;((=) 2214 ;; (lp (append (vector->list 2215 ;; (make-vector (cadar ls) 2216 ;; (sre-sequence (cddar ls)))) 2217 ;; (cdr ls)) 2218 ;; n flags next)) 2219 ;;((>=) 2220 ;; (lp (append (vector->list 2221 ;; (make-vector (- (cadar ls) 1) 2222 ;; (sre-sequence (cddar ls)))) 2223 ;; (cons `(+ ,@(cddar ls)) (cdr ls))) 2224 ;; n flags next)) 2225 ;;((**) 2226 ;; (lp (append (vector->list 2227 ;; (make-vector (cadar ls) 2228 ;; (sre-sequence (cdddar ls)))) 2229 ;; (map 2230 ;; (lambda (x) `(? ,x)) 2231 ;; (vector->list 2232 ;; (make-vector (- (caddar ls) (cadar ls)) 2233 ;; (sre-sequence (cdddar ls))))) 2234 ;; (cdr ls)) 2235 ;; n flags next)) 2236 ;; ignore submatches altogether 2237 (($ submatch) 2238 (lp (cons (sre-sequence (cdar ls)) (cdr ls)) n flags next)) 2239 ((=> submatch-named) 2240 (lp (cons (sre-sequence (cddar ls)) (cdr ls)) n flags next)) 2241 (else 2242 (cond 2243 ((assq (caar ls) sre-named-definitions) 2244 => (lambda (cell) 2245 (if (procedure? (cdr cell)) 2246 (lp (cons (apply (cdr cell) (cdar ls)) (cdr ls)) 2247 n flags next) 2248 (error "non-procedure in op position" (caar ls))))) 2249 (else #f))))))) 2250 (else 2251 #f)))) 2252 (let ((len (lp (list sre) 1 init-flags 0))) 2253 (and len 2254 (let ((nfa (make-vector (* *nfa-num-fields* (+ len 1))))) 2255 (do ((i (- (vector-length nfa) 1) (- i 1))) 2256 ((< i 0)) 2257 (vector-set! nfa i (vector-ref buf i))) 2258 nfa))))) 2259 2260;; We don't really want to use this, we use the closure compilation 2261;; below instead, but this is included for reference and testing the 2262;; sre->nfa conversion. 2263 2264;; (define (nfa-match nfa str) 2265;; (let lp ((ls (string->list str)) (state (car nfa)) (epsilons '())) 2266;; (if (null? ls) 2267;; (zero? (car state)) 2268;; (any (lambda (m) 2269;; (if (eq? 'epsilon (car m)) 2270;; (and (not (memv (cdr m) epsilons)) 2271;; (lp ls (assv (cdr m) nfa) (cons (cdr m) epsilons))) 2272;; (and (or (eqv? (car m) (car ls)) 2273;; (and (pair? (car m)) 2274;; (char<=? (caar m) (car ls)) 2275;; (char<=? (car ls) (cdar m)))) 2276;; (lp (cdr ls) (assv (cdr m) nfa) '())))) 2277;; (cdr state))))) 2278 2279;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2280;;;; NFA multi-state representation 2281 2282;; Cache closures in a simple hash-table keyed on the smallest state 2283;; (define (nfa-multi-state-hash nfa mst) 2284;; (car mst)) 2285 2286;; Original sorted list-based representation 2287 2288;; (define (make-nfa-multi-state nfa) 2289;; '()) 2290 2291;; (define (nfa-state->multi-state nfa state) 2292;; (list state)) 2293 2294;; (define (nfa-multi-state-copy mst) 2295;; (map (lambda (x) x) mst)) 2296 2297;; (define (list->nfa-multi-state nfa ls) 2298;; (nfa-multi-state-copy ls)) 2299 2300;; (define (nfa-multi-state-contains? mst i) 2301;; (memq i mst)) 2302 2303;; (define (nfa-multi-state-fold mst kons knil) 2304;; (fold kons knil mst)) 2305 2306;; (define (nfa-multi-state-add! mst i) 2307;; (insert-sorted i mst)) 2308 2309;; (define (nfa-multi-state-add mst i) 2310;; (insert-sorted i mst)) 2311 2312;; (define (nfa-multi-state-union a b) 2313;; (merge-sorted a b)) 2314 2315;; Sorted List Utilities 2316 2317;; (define (insert-sorted n ls) 2318;; (cond 2319;; ((null? ls) 2320;; (cons n '())) 2321;; ((<= n (car ls)) 2322;; (if (= n (car ls)) 2323;; ls 2324;; (cons n ls))) 2325;; (else 2326;; (cons (car ls) (insert-sorted n (cdr ls)))))) 2327 2328;; (define (insert-sorted! n ls) 2329;; (cond 2330;; ((null? ls) 2331;; (cons n '())) 2332;; ((<= n (car ls)) 2333;; (if (= n (car ls)) 2334;; ls 2335;; (cons n ls))) 2336;; (else 2337;; (let lp ((head ls) (tail (cdr ls))) 2338;; (cond ((or (null? tail) (< n (car tail))) 2339;; (set-cdr! head (cons n tail))) 2340;; ((> n (car tail)) 2341;; (lp tail (cdr tail))))) 2342;; ls))) 2343 2344;; (define (merge-sorted a b) 2345;; (cond ((null? a) b) 2346;; ((null? b) a) 2347;; ((< (car a) (car b)) 2348;; (cons (car a) (merge-sorted (cdr a) b))) 2349;; ((> (car a) (car b)) 2350;; (cons (car b) (merge-sorted a (cdr b)))) 2351;; (else (merge-sorted (cdr a) b)))) 2352 2353;; ========================================================= ;; 2354 2355;; Presized bit-vector based 2356 2357(define (nfa-multi-state-hash nfa mst) 2358 (modulo (vector-ref mst 0) (nfa-num-states nfa))) 2359 2360(define (make-nfa-multi-state nfa) 2361 (make-vector (quotient (+ (nfa-num-states nfa) 24 -1) 24) 0)) 2362 2363(define (nfa-state->multi-state nfa state) 2364 (nfa-multi-state-add! (make-nfa-multi-state nfa) state)) 2365 2366(define (nfa-multi-state-copy mst) 2367 (let ((res (make-vector (vector-length mst)))) 2368 (do ((i (- (vector-length mst) 1) (- i 1))) 2369 ((< i 0) res) 2370 (vector-set! res i (vector-ref mst i))))) 2371 2372(define (nfa-multi-state-contains? mst i) 2373 (let ((cell (quotient i 24)) 2374 (bit (remainder i 24))) 2375 (not (zero? (bit-and (vector-ref mst cell) (bit-shl 1 bit)))))) 2376 2377(define (nfa-multi-state-add! mst i) 2378 (let ((cell (quotient i 24)) 2379 (bit (remainder i 24))) 2380 (vector-set! mst cell (bit-ior (vector-ref mst cell) (bit-shl 1 bit))) 2381 mst)) 2382 2383(define (nfa-multi-state-add mst i) 2384 (nfa-multi-state-add! (nfa-multi-state-copy mst) i)) 2385 2386(define (nfa-multi-state-union! a b) 2387 (do ((i (- (vector-length a) 1) (- i 1))) 2388 ((< i 0) a) 2389 (vector-set! a i (bit-ior (vector-ref a i) (vector-ref b i))))) 2390 2391(define (nfa-multi-state-union a b) 2392 (nfa-multi-state-union! (nfa-multi-state-copy a) b)) 2393 2394(define (nfa-multi-state-fold mst kons knil) 2395 (let ((limit (vector-length mst))) 2396 (let lp1 ((i 0) 2397 (acc knil)) 2398 (if (>= i limit) 2399 acc 2400 (let lp2 ((n (vector-ref mst i)) 2401 (acc acc)) 2402 (if (zero? n) 2403 (lp1 (+ i 1) acc) 2404 (let* ((n2 (bit-and n (- n 1))) 2405 (n-tail (- n n2)) 2406 (bit (+ (* i 24) (integer-log n-tail)))) 2407 (lp2 n2 (kons bit acc))))))))) 2408 2409;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2410;;;; NFA->DFA compilation 2411;; 2412;; During processing, the DFA is a list of the form: 2413;; 2414;; ((NFA-states ...) accepting-state? transitions ...) 2415;; 2416;; where the transitions are as in the NFA, except there are no 2417;; epsilons, duplicate characters or overlapping char-set ranges, and 2418;; the states moved to are closures (sets of NFA states). Multiple 2419;; DFA states may be accepting states. 2420 2421(define (nfa->dfa nfa . o) 2422 (let ((max-states (and (pair? o) (car o)))) 2423 (let lp ((ls (list (nfa-cache-state-closure! nfa (nfa-start-state nfa)))) 2424 (i 0) 2425 (res '())) 2426 (cond 2427 ((null? ls) 2428 (dfa-renumber nfa (reverse res))) 2429 ((assoc (car ls) res) ;; already seen this combination of states 2430 (lp (cdr ls) i res)) 2431 ((and max-states (> i max-states)) ;; too many DFA states 2432 #f) 2433 (else 2434 (let* ((states (car ls)) 2435 (trans (nfa-state-transitions nfa states)) 2436 (accept? (and (nfa-multi-state-contains? states 0) #t))) 2437 (lp (append (map cdr trans) (cdr ls)) 2438 (+ i 1) 2439 `((,states ,accept? ,@trans) ,@res)))))))) 2440 2441;; When the conversion is complete we renumber the DFA sets-of-states 2442;; in order and convert the result to a vector for fast lookup. 2443;; Charsets containing single characters are converted to those characters 2444;; for quick matching of the literal parts in a regex. 2445(define (dfa-renumber nfa dfa) 2446 (let* ((len (length dfa)) 2447 (states (make-vector (nfa-num-states nfa) '())) 2448 (res (make-vector len))) 2449 (define (renumber mst) 2450 (cdr (assoc mst (vector-ref states (nfa-multi-state-hash nfa mst))))) 2451 (let lp ((ls dfa) (i 0)) 2452 (cond ((pair? ls) 2453 (let ((j (nfa-multi-state-hash nfa (caar ls)))) 2454 (vector-set! states j (cons (cons (caar ls) i) 2455 (vector-ref states j)))) 2456 (lp (cdr ls) (+ i 1))))) 2457 (let lp ((ls dfa) (i 0)) 2458 (cond ((pair? ls) 2459 (for-each 2460 (lambda (x) 2461 (set-car! x (maybe-cset->char (car x))) 2462 (set-cdr! x (renumber (cdr x)))) 2463 (cddar ls)) 2464 (vector-set! res i (cdar ls)) 2465 (lp (cdr ls) (+ i 1))))) 2466 res)) 2467 2468;; Extract all distinct ranges and the potential states they can transition 2469;; to from a given set of states. Any ranges that would overlap with 2470;; distinct characters are split accordingly. 2471(define (nfa-state-transitions nfa states) 2472 (let ((res (nfa-multi-state-fold 2473 states 2474 (lambda (st res) 2475 (let ((trans (nfa-get-state-trans nfa st))) 2476 (if (null? trans) 2477 res 2478 (nfa-join-transitions! nfa res (car trans) (cdr trans))))) 2479 '()))) 2480 (for-each (lambda (x) (set-cdr! x (nfa-closure nfa (cdr x)))) res) 2481 res)) 2482 2483(define (nfa-join-transitions! nfa existing elt state) 2484 (define (csets-intersect? a b) 2485 (let ((i (cset-intersection a b))) 2486 (and (not (cset-empty? i)) i))) 2487 (let lp ((ls existing) (res '())) 2488 (cond 2489 ((null? ls) 2490 (cond ; First try to find a group that includes this state 2491 ((find (lambda (x) (nfa-multi-state-contains? (cdr x) state)) existing) => 2492 (lambda (existing-state) ; If found, merge charsets with it 2493 (set-car! existing-state (cset-union (car existing-state) elt)) 2494 existing)) 2495 ;; State not seen yet? Add a new state transition 2496 (else (cons (cons elt (nfa-state->multi-state nfa state)) existing)))) 2497 ((cset=? elt (caar ls)) ; Add state to existing set for this charset 2498 (set-cdr! (car ls) (nfa-multi-state-add! (cdar ls) state)) 2499 existing) 2500 ((csets-intersect? elt (caar ls)) => ; overlapping charset, but diff state 2501 (lambda (intersection) 2502 (let* ((only-in-old (cset-difference (caar ls) elt)) 2503 (states-for-old (and (not (cset-empty? only-in-old)) 2504 (nfa-multi-state-copy (cdar ls)))) 2505 (result (if states-for-old 2506 (cons (cons only-in-old states-for-old) 2507 (append res (cdr ls))) 2508 (append res (cdr ls)))) 2509 (only-in-new (cset-difference elt (caar ls)))) 2510 ;; Add this state to the states already here and restrict to 2511 ;; the overlapping charset 2512 (set-car! (car ls) intersection) 2513 (set-cdr! (car ls) (nfa-multi-state-add! (cdar ls) state)) 2514 ;; Continue with the remaining subset of the new cset (if nonempty) 2515 (cons (car ls) 2516 (if (cset-empty? only-in-new) 2517 result 2518 (nfa-join-transitions! nfa result only-in-new state)))))) 2519 (else 2520 (lp (cdr ls) (cons (car ls) res)))))) 2521 2522(define (nfa-cache-state-closure! nfa state) 2523 (let ((cached (nfa-get-state-closure nfa state))) 2524 (cond 2525 ((not (null? cached)) 2526 cached) 2527 (else 2528 (let ((res (nfa-state-closure-internal nfa state))) 2529 (nfa-set-state-closure! nfa state res) 2530 res))))) 2531 2532;; The `closure' of a list of NFA states - all states that can be 2533;; reached from any of them using any number of epsilon transitions. 2534(define (nfa-state-closure-internal nfa state) 2535 (let lp ((ls (list state)) 2536 (res (make-nfa-multi-state nfa))) 2537 (cond 2538 ((null? ls) 2539 res) 2540 ((nfa-multi-state-contains? res (car ls)) 2541 (lp (cdr ls) res)) 2542 (else 2543 (lp (append (nfa-get-epsilons nfa (car ls)) (cdr ls)) 2544 (nfa-multi-state-add! res (car ls))))))) 2545 2546(define (nfa-closure-internal nfa states) 2547 (nfa-multi-state-fold 2548 states 2549 (lambda (st res) 2550 (nfa-multi-state-union! res (nfa-cache-state-closure! nfa st))) 2551 (make-nfa-multi-state nfa))) 2552 2553(define (nfa-closure nfa states) 2554 (or (nfa-get-closure nfa states) 2555 (let ((res (nfa-closure-internal nfa states))) 2556 (nfa-add-closure! nfa states res) 2557 res))) 2558 2559;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2560;;;; Match Extraction 2561;; 2562;; DFAs don't give us match information, so once we match and 2563;; determine the start and end, we need to recursively break the 2564;; problem into smaller DFAs to get each submatch. 2565;; 2566;; See http://compilers.iecc.com/comparch/article/07-10-026 2567 2568(define (sre-match-extractor sre num-submatches) 2569 (let* ((tmp (+ num-submatches 1)) 2570 (tmp-end-src-offset (+ 5 (* tmp 4))) 2571 (tmp-end-index-offset (+ 6 (* tmp 4)))) 2572 (let lp ((sre sre) (n 1) (submatch-deps? #f)) 2573 (cond 2574 ((not (sre-has-submatches? sre)) 2575 (if (not submatch-deps?) 2576 (lambda (cnk start i end j matches) #t) 2577 (let ((dfa (nfa->dfa (sre->nfa sre ~none)))) 2578 (lambda (cnk start i end j matches) 2579 (dfa-match/longest dfa cnk start i end j matches tmp))))) 2580 ((pair? sre) 2581 (case (car sre) 2582 ((: seq) 2583 (let* ((right (sre-sequence (cddr sre))) 2584 (match-left (lp (cadr sre) n #t)) 2585 (match-right 2586 (lp right (+ n (sre-count-submatches (cadr sre))) #t))) 2587 (lambda (cnk start i end j matches) 2588 (let lp1 ((end2 end) (j2 j) (best-src #f) (best-index #f)) 2589 (let ((limit (if (eq? start end2) 2590 i 2591 ((chunker-get-start cnk) end2)))) 2592 (let lp2 ((k j2) (best-src best-src) (best-index best-index)) 2593 (if (< k limit) 2594 (cond 2595 ((not (eq? start end2)) 2596 (let ((prev (chunker-prev-chunk cnk start end2))) 2597 (lp1 prev 2598 ((chunker-get-end cnk) prev) 2599 best-src 2600 best-index))) 2601 (best-src 2602 (vector-set! matches tmp-end-src-offset best-src) 2603 (vector-set! matches tmp-end-index-offset best-index) 2604 #t) 2605 (else 2606 #f)) 2607 (if (and (match-left cnk start i end2 k matches) 2608 (eq? end2 (vector-ref matches 2609 tmp-end-src-offset)) 2610 (eqv? k (vector-ref matches 2611 tmp-end-index-offset)) 2612 (match-right cnk end2 k end j matches)) 2613 (let ((right-src 2614 (vector-ref matches tmp-end-src-offset)) 2615 (right 2616 (vector-ref matches tmp-end-index-offset))) 2617 (cond 2618 ((and (eq? end right-src) (eqv? j right)) 2619 (vector-set! matches tmp-end-src-offset end) 2620 (vector-set! matches tmp-end-index-offset j) 2621 #t) 2622 ((or (not best-src) 2623 (if (eq? best-src right-src) 2624 (> right best-index) 2625 (chunk-before? cnk 2626 best-src 2627 right-src))) 2628 (lp2 (- k 1) right-src right)) 2629 (else 2630 (lp2 (- k 1) best-src best-index)))) 2631 (lp2 (- k 1) best-src best-index))))))))) 2632 ((or) 2633 (if (null? (cdr sre)) 2634 (lambda (cnk start i end j matches) #f) 2635 (let* ((rest (sre-alternate (cddr sre))) 2636 (match-first 2637 (lp (cadr sre) n #t)) 2638 (match-rest 2639 (lp rest 2640 (+ n (sre-count-submatches (cadr sre))) 2641 submatch-deps?))) 2642 (lambda (cnk start i end j matches) 2643 (or (and (match-first cnk start i end j matches) 2644 (eq? end (vector-ref matches tmp-end-src-offset)) 2645 (eqv? j (vector-ref matches tmp-end-index-offset))) 2646 (match-rest cnk start i end j matches)))))) 2647 ((* +) 2648 (letrec ((match-once 2649 (lp (sre-sequence (cdr sre)) n #t)) 2650 (match-all 2651 (lambda (cnk start i end j matches) 2652 (if (match-once cnk start i end j matches) 2653 (let ((src (vector-ref matches tmp-end-src-offset)) 2654 (k (vector-ref matches tmp-end-index-offset))) 2655 (if (and src (or (not (eq? start src)) (< i k))) 2656 (match-all cnk src k end j matches) 2657 #t)) 2658 (begin 2659 (vector-set! matches tmp-end-src-offset start) 2660 (vector-set! matches tmp-end-index-offset i) 2661 #t))))) 2662 (if (eq? '* (car sre)) 2663 match-all 2664 (lambda (cnk start i end j matches) 2665 (and (match-once cnk start i end j matches) 2666 (let ((src (vector-ref matches tmp-end-src-offset)) 2667 (k (vector-ref matches tmp-end-index-offset))) 2668 (match-all cnk src k end j matches))))))) 2669 ((?) 2670 (let ((match-once (lp (sre-sequence (cdr sre)) n #t))) 2671 (lambda (cnk start i end j matches) 2672 (cond 2673 ((match-once cnk start i end j matches) 2674 #t) 2675 (else 2676 (vector-set! matches tmp-end-src-offset start) 2677 (vector-set! matches tmp-end-index-offset i) 2678 #t))))) 2679 (($ submatch => submatch-named) 2680 (let ((match-one 2681 (lp (sre-sequence (if (memq (car sre) '($ submatch)) 2682 (cdr sre) 2683 (cddr sre))) 2684 (+ n 1) 2685 #t)) 2686 (start-src-offset (+ 3 (* n 4))) 2687 (start-index-offset (+ 4 (* n 4))) 2688 (end-src-offset (+ 5 (* n 4))) 2689 (end-index-offset (+ 6 (* n 4)))) 2690 (lambda (cnk start i end j matches) 2691 (cond 2692 ((match-one cnk start i end j matches) 2693 (vector-set! matches start-src-offset start) 2694 (vector-set! matches start-index-offset i) 2695 (vector-set! matches end-src-offset 2696 (vector-ref matches tmp-end-src-offset)) 2697 (vector-set! matches end-index-offset 2698 (vector-ref matches tmp-end-index-offset)) 2699 #t) 2700 (else 2701 #f))))) 2702 (else 2703 (error "unknown regexp operator" (car sre))))) 2704 (else 2705 (error "unknown regexp" sre)))))) 2706 2707;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 2708;;;; Closure Compilation 2709;; 2710;; We use this for non-regular expressions instead of an interpreted 2711;; NFA matcher. We use backtracking anyway, but this gives us more 2712;; freedom of implementation, allowing us to support patterns that 2713;; can't be represented in the above NFA representation. 2714 2715(define (sre->procedure sre . o) 2716 (define names 2717 (if (and (pair? o) (pair? (cdr o))) (cadr o) (sre-names sre 1 '()))) 2718 (let lp ((sre sre) 2719 (n 1) 2720 (flags (if (pair? o) (car o) ~none)) 2721 (next (lambda (cnk init src str i end matches fail) 2722 (irregex-match-start-chunk-set! matches 0 (car init)) 2723 (irregex-match-start-index-set! matches 0 (cdr init)) 2724 (irregex-match-end-chunk-set! matches 0 src) 2725 (irregex-match-end-index-set! matches 0 i) 2726 (%irregex-match-fail-set! matches fail) 2727 matches))) 2728 ;; XXXX this should be inlined 2729 (define (rec sre) (lp sre n flags next)) 2730 (cond 2731 ((pair? sre) 2732 (if (string? (car sre)) 2733 (sre-cset->procedure 2734 (sre->cset (car sre) (flag-set? flags ~case-insensitive?)) 2735 next) 2736 (case (car sre) 2737 ((~ - & /) 2738 (sre-cset->procedure 2739 (sre->cset sre (flag-set? flags ~case-insensitive?)) 2740 next)) 2741 ((or) 2742 (case (length (cdr sre)) 2743 ((0) (lambda (cnk init src str i end matches fail) (fail))) 2744 ((1) (rec (cadr sre))) 2745 (else 2746 (let* ((first (rec (cadr sre))) 2747 (rest (lp (sre-alternate (cddr sre)) 2748 (+ n (sre-count-submatches (cadr sre))) 2749 flags 2750 next))) 2751 (lambda (cnk init src str i end matches fail) 2752 (first cnk init src str i end matches 2753 (lambda () 2754 (rest cnk init src str i end matches fail)))))))) 2755 ((w/case) 2756 (lp (sre-sequence (cdr sre)) 2757 n 2758 (flag-clear flags ~case-insensitive?) 2759 next)) 2760 ((w/nocase) 2761 (lp (sre-sequence (cdr sre)) 2762 n 2763 (flag-join flags ~case-insensitive?) 2764 next)) 2765 ((w/utf8) 2766 (lp (sre-sequence (cdr sre)) n (flag-join flags ~utf8?) next)) 2767 ((w/noutf8) 2768 (lp (sre-sequence (cdr sre)) n (flag-clear flags ~utf8?) next)) 2769 ((seq :) 2770 (case (length (cdr sre)) 2771 ((0) next) 2772 ((1) (rec (cadr sre))) 2773 (else 2774 (let ((rest (lp (sre-sequence (cddr sre)) 2775 (+ n (sre-count-submatches (cadr sre))) 2776 flags 2777 next))) 2778 (lp (cadr sre) n flags rest))))) 2779 ((?) 2780 (let ((body (rec (sre-sequence (cdr sre))))) 2781 (lambda (cnk init src str i end matches fail) 2782 (body cnk init src str i end matches 2783 (lambda () (next cnk init src str i end matches fail)))))) 2784 ((??) 2785 (let ((body (rec (sre-sequence (cdr sre))))) 2786 (lambda (cnk init src str i end matches fail) 2787 (next cnk init src str i end matches 2788 (lambda () (body cnk init src str i end matches fail)))))) 2789 ((*) 2790 (cond 2791 ((sre-empty? (sre-sequence (cdr sre))) 2792 (error "invalid sre: empty *" sre)) 2793 (else 2794 (letrec 2795 ((body 2796 (lp (sre-sequence (cdr sre)) 2797 n 2798 flags 2799 (lambda (cnk init src str i end matches fail) 2800 (body cnk init src str i end matches 2801 (lambda () 2802 (next cnk init src str i end matches fail) 2803 )))))) 2804 (lambda (cnk init src str i end matches fail) 2805 (body cnk init src str i end matches 2806 (lambda () 2807 (next cnk init src str i end matches fail)))))))) 2808 ((*?) 2809 (cond 2810 ((sre-empty? (sre-sequence (cdr sre))) 2811 (error "invalid sre: empty *?" sre)) 2812 (else 2813 (letrec 2814 ((body 2815 (lp (sre-sequence (cdr sre)) 2816 n 2817 flags 2818 (lambda (cnk init src str i end matches fail) 2819 (next cnk init src str i end matches 2820 (lambda () 2821 (body cnk init src str i end matches fail) 2822 )))))) 2823 (lambda (cnk init src str i end matches fail) 2824 (next cnk init src str i end matches 2825 (lambda () 2826 (body cnk init src str i end matches fail)))))))) 2827 ((+) 2828 (lp (sre-sequence (cdr sre)) 2829 n 2830 flags 2831 (rec (list '* (sre-sequence (cdr sre)))))) 2832 ((=) 2833 (rec `(** ,(cadr sre) ,(cadr sre) ,@(cddr sre)))) 2834 ((>=) 2835 (rec `(** ,(cadr sre) #f ,@(cddr sre)))) 2836 ((** **?) 2837 (cond 2838 ((or (and (number? (cadr sre)) 2839 (number? (caddr sre)) 2840 (> (cadr sre) (caddr sre))) 2841 (and (not (cadr sre)) (caddr sre))) 2842 (lambda (cnk init src str i end matches fail) (fail))) 2843 (else 2844 (let* ((from (cadr sre)) 2845 (to (caddr sre)) 2846 (? (if (eq? '** (car sre)) '? '??)) 2847 (* (if (eq? '** (car sre)) '* '*?)) 2848 (sre (sre-sequence (cdddr sre))) 2849 (x-sre (sre-strip-submatches sre)) 2850 (next (if to 2851 (if (= from to) 2852 next 2853 (fold (lambda (x next) 2854 (lp `(,? ,sre) n flags next)) 2855 next 2856 (zero-to (- to from)))) 2857 (rec `(,* ,sre))))) 2858 (if (zero? from) 2859 next 2860 (lp `(seq ,@(map (lambda (x) x-sre) (zero-to (- from 1))) 2861 ,sre) 2862 n 2863 flags 2864 next)))))) 2865 ((word) 2866 (rec `(seq bow ,@(cdr sre) eow))) 2867 ((word+) 2868 (rec `(seq bow (+ (& (or alphanumeric "_") 2869 (or ,@(cdr sre)))) eow))) 2870 ((posix-string) 2871 (rec (string->sre (cadr sre)))) 2872 ((look-ahead) 2873 (let ((check 2874 (lp (sre-sequence (cdr sre)) 2875 n 2876 flags 2877 (lambda (cnk init src str i end matches fail) i)))) 2878 (lambda (cnk init src str i end matches fail) 2879 (if (check cnk init src str i end matches (lambda () #f)) 2880 (next cnk init src str i end matches fail) 2881 (fail))))) 2882 ((neg-look-ahead) 2883 (let ((check 2884 (lp (sre-sequence (cdr sre)) 2885 n 2886 flags 2887 (lambda (cnk init src str i end matches fail) i)))) 2888 (lambda (cnk init src str i end matches fail) 2889 (if (check cnk init src str i end matches (lambda () #f)) 2890 (fail) 2891 (next cnk init src str i end matches fail))))) 2892 ((look-behind neg-look-behind) 2893 (let ((check 2894 (lp (sre-sequence 2895 (cons '(* any) (append (cdr sre) '(eos)))) 2896 n 2897 flags 2898 (lambda (cnk init src str i end matches fail) i)))) 2899 (lambda (cnk init src str i end matches fail) 2900 (let* ((prev ((chunker-get-substring cnk) 2901 (car init) 2902 (cdr init) 2903 src 2904 i)) 2905 (len (string-length prev)) 2906 (src2 (list prev 0 len))) 2907 (if ((if (eq? (car sre) 'look-behind) (lambda (x) x) not) 2908 (check irregex-basic-string-chunker 2909 (cons src2 0) src2 prev 0 len matches (lambda () #f))) 2910 (next cnk init src str i end matches fail) 2911 (fail)))))) 2912 ((atomic) 2913 (let ((once 2914 (lp (sre-sequence (cdr sre)) 2915 n 2916 flags 2917 (lambda (cnk init src str i end matches fail) i)))) 2918 (lambda (cnk init src str i end matches fail) 2919 (let ((j (once cnk init src str i end matches (lambda () #f)))) 2920 (if j 2921 (next cnk init src str j end matches fail) 2922 (fail)))))) 2923 ((if) 2924 (let* ((test-submatches (sre-count-submatches (cadr sre))) 2925 (pass (lp (caddr sre) flags (+ n test-submatches) next)) 2926 (fail (if (pair? (cdddr sre)) 2927 (lp (cadddr sre) 2928 (+ n test-submatches 2929 (sre-count-submatches (caddr sre))) 2930 flags 2931 next) 2932 (lambda (cnk init src str i end matches fail) 2933 (fail))))) 2934 (cond 2935 ((or (number? (cadr sre)) (symbol? (cadr sre))) 2936 (let ((index 2937 (if (symbol? (cadr sre)) 2938 (cond 2939 ((assq (cadr sre) names) => cdr) 2940 (else 2941 (error "unknown named backref in SRE IF" sre))) 2942 (cadr sre)))) 2943 (lambda (cnk init src str i end matches fail2) 2944 (if (%irregex-match-end-chunk matches index) 2945 (pass cnk init src str i end matches fail2) 2946 (fail cnk init src str i end matches fail2))))) 2947 (else 2948 (let ((test (lp (cadr sre) n flags pass))) 2949 (lambda (cnk init src str i end matches fail2) 2950 (test cnk init src str i end matches 2951 (lambda () (fail cnk init src str i end matches fail2))) 2952 )))))) 2953 ((backref backref-ci) 2954 (let ((n (cond ((number? (cadr sre)) (cadr sre)) 2955 ((assq (cadr sre) names) => cdr) 2956 (else (error "unknown backreference" (cadr sre))))) 2957 (compare (if (or (eq? (car sre) 'backref-ci) 2958 (flag-set? flags ~case-insensitive?)) 2959 string-ci=? 2960 string=?))) 2961 (lambda (cnk init src str i end matches fail) 2962 (let ((s (irregex-match-substring matches n))) 2963 (if (not s) 2964 (fail) 2965 ;; XXXX create an abstract subchunk-compare 2966 (let lp ((src src) 2967 (str str) 2968 (i i) 2969 (end end) 2970 (j 0) 2971 (len (string-length s))) 2972 (cond 2973 ((<= len (- end i)) 2974 (cond 2975 ((compare (substring s j (string-length s)) 2976 (substring str i (+ i len))) 2977 (next cnk init src str (+ i len) end matches fail)) 2978 (else 2979 (fail)))) 2980 (else 2981 (cond 2982 ((compare (substring s j (+ j (- end i))) 2983 (substring str i end)) 2984 (let ((src2 ((chunker-get-next cnk) src))) 2985 (if src2 2986 (lp src2 2987 ((chunker-get-str cnk) src2) 2988 ((chunker-get-start cnk) src2) 2989 ((chunker-get-end cnk) src2) 2990 (+ j (- end i)) 2991 (- len (- end i))) 2992 (fail)))) 2993 (else 2994 (fail))))))))))) 2995 ((dsm) 2996 (lp (sre-sequence (cdddr sre)) (+ n (cadr sre)) flags next)) 2997 (($ submatch) 2998 (let ((body 2999 (lp (sre-sequence (cdr sre)) 3000 (+ n 1) 3001 flags 3002 (lambda (cnk init src str i end matches fail) 3003 (let ((old-source 3004 (%irregex-match-end-chunk matches n)) 3005 (old-index 3006 (%irregex-match-end-index matches n))) 3007 (irregex-match-end-chunk-set! matches n src) 3008 (irregex-match-end-index-set! matches n i) 3009 (next cnk init src str i end matches 3010 (lambda () 3011 (irregex-match-end-chunk-set! 3012 matches n old-source) 3013 (irregex-match-end-index-set! 3014 matches n old-index) 3015 (fail)))))))) 3016 (lambda (cnk init src str i end matches fail) 3017 (let ((old-source (%irregex-match-start-chunk matches n)) 3018 (old-index (%irregex-match-start-index matches n))) 3019 (irregex-match-start-chunk-set! matches n src) 3020 (irregex-match-start-index-set! matches n i) 3021 (body cnk init src str i end matches 3022 (lambda () 3023 (irregex-match-start-chunk-set! 3024 matches n old-source) 3025 (irregex-match-start-index-set! 3026 matches n old-index) 3027 (fail))))))) 3028 ((=> submatch-named) 3029 (rec `(submatch ,@(cddr sre)))) 3030 (else 3031 (error "unknown regexp operator" sre))))) 3032 ((symbol? sre) 3033 (case sre 3034 ((any) 3035 (lambda (cnk init src str i end matches fail) 3036 (if (< i end) 3037 (next cnk init src str (+ i 1) end matches fail) 3038 (let ((src2 ((chunker-get-next cnk) src))) 3039 (if src2 3040 (let ((str2 ((chunker-get-str cnk) src2)) 3041 (i2 ((chunker-get-start cnk) src2)) 3042 (end2 ((chunker-get-end cnk) src2))) 3043 (next cnk init src2 str2 (+ i2 1) end2 matches fail)) 3044 (fail)))))) 3045 ((nonl) 3046 (lambda (cnk init src str i end matches fail) 3047 (if (< i end) 3048 (if (not (eqv? #\newline (string-ref str i))) 3049 (next cnk init src str (+ i 1) end matches fail) 3050 (fail)) 3051 (let ((src2 ((chunker-get-next cnk) src))) 3052 (if src2 3053 (let ((str2 ((chunker-get-str cnk) src2)) 3054 (i2 ((chunker-get-start cnk) src2)) 3055 (end2 ((chunker-get-end cnk) src2))) 3056 (if (not (eqv? #\newline (string-ref str2 i2))) 3057 (next cnk init src2 str2 (+ i2 1) end2 matches fail) 3058 (fail))) 3059 (fail)))))) 3060 ((bos) 3061 (lambda (cnk init src str i end matches fail) 3062 (if (and (eq? src (car init)) (eqv? i (cdr init))) 3063 (next cnk init src str i end matches fail) 3064 (fail)))) 3065 ((bol) 3066 (lambda (cnk init src str i end matches fail) 3067 (if (or (and (eq? src (car init)) (eqv? i (cdr init))) 3068 (and (> i ((chunker-get-start cnk) src)) 3069 (eqv? #\newline (string-ref str (- i 1))))) 3070 (next cnk init src str i end matches fail) 3071 (fail)))) 3072 ((bow) 3073 (lambda (cnk init src str i end matches fail) 3074 (if (and (or (if (> i ((chunker-get-start cnk) src)) 3075 (not (char-alphanumeric? (string-ref str (- i 1)))) 3076 (let ((ch (chunker-prev-char cnk src end))) 3077 (and ch (not (char-alphanumeric? ch))))) 3078 (and (eq? src (car init)) (eqv? i (cdr init)))) 3079 (if (< i end) 3080 (char-alphanumeric? (string-ref str i)) 3081 (let ((next ((chunker-get-next cnk) src))) 3082 (and next 3083 (char-alphanumeric? 3084 (string-ref ((chunker-get-str cnk) next) 3085 ((chunker-get-start cnk) next))))))) 3086 (next cnk init src str i end matches fail) 3087 (fail)))) 3088 ((eos) 3089 (lambda (cnk init src str i end matches fail) 3090 (if (and (>= i end) (not ((chunker-get-next cnk) src))) 3091 (next cnk init src str i end matches fail) 3092 (fail)))) 3093 ((eol) 3094 (lambda (cnk init src str i end matches fail) 3095 (if (if (< i end) 3096 (eqv? #\newline (string-ref str i)) 3097 (let ((src2 ((chunker-get-next cnk) src))) 3098 (if (not src2) 3099 #t 3100 (eqv? #\newline 3101 (string-ref ((chunker-get-str cnk) src2) 3102 ((chunker-get-start cnk) src2)))))) 3103 (next cnk init src str i end matches fail) 3104 (fail)))) 3105 ((eow) 3106 (lambda (cnk init src str i end matches fail) 3107 (if (and (if (< i end) 3108 (not (char-alphanumeric? (string-ref str i))) 3109 (let ((ch (chunker-next-char cnk src))) 3110 (or (not ch) (not (char-alphanumeric? ch))))) 3111 (if (> i ((chunker-get-start cnk) src)) 3112 (char-alphanumeric? (string-ref str (- i 1))) 3113 (let ((prev (chunker-prev-char cnk init src))) 3114 (or (not prev) (char-alphanumeric? prev))))) 3115 (next cnk init src str i end matches fail) 3116 (fail)))) 3117 ((nwb) ;; non-word-boundary 3118 (lambda (cnk init src str i end matches fail) 3119 (let ((c1 (if (< i end) 3120 (string-ref str i) 3121 (chunker-next-char cnk src))) 3122 (c2 (if (> i ((chunker-get-start cnk) src)) 3123 (string-ref str (- i 1)) 3124 (chunker-prev-char cnk init src)))) 3125 (if (and c1 c2 3126 (if (char-alphanumeric? c1) 3127 (char-alphanumeric? c2) 3128 (not (char-alphanumeric? c2)))) 3129 (next cnk init src str i end matches fail) 3130 (fail))))) 3131 ((epsilon) 3132 next) 3133 (else 3134 (let ((cell (assq sre sre-named-definitions))) 3135 (if cell 3136 (rec (cdr cell)) 3137 (error "unknown regexp" sre)))))) 3138 ((char? sre) 3139 (if (flag-set? flags ~case-insensitive?) 3140 ;; case-insensitive 3141 (lambda (cnk init src str i end matches fail) 3142 (if (>= i end) 3143 (let lp ((src2 ((chunker-get-next cnk) src))) 3144 (if src2 3145 (let ((str2 ((chunker-get-str cnk) src2)) 3146 (i2 ((chunker-get-start cnk) src2)) 3147 (end2 ((chunker-get-end cnk) src2))) 3148 (if (>= i2 end2) 3149 (lp ((chunker-get-next cnk) src2)) 3150 (if (char-ci=? sre (string-ref str2 i2)) 3151 (next cnk init src2 str2 (+ i2 1) end2 3152 matches fail) 3153 (fail)))) 3154 (fail))) 3155 (if (char-ci=? sre (string-ref str i)) 3156 (next cnk init src str (+ i 1) end matches fail) 3157 (fail)))) 3158 ;; case-sensitive 3159 (lambda (cnk init src str i end matches fail) 3160 (if (>= i end) 3161 (let lp ((src2 ((chunker-get-next cnk) src))) 3162 (if src2 3163 (let ((str2 ((chunker-get-str cnk) src2)) 3164 (i2 ((chunker-get-start cnk) src2)) 3165 (end2 ((chunker-get-end cnk) src2))) 3166 (if (>= i2 end2) 3167 (lp ((chunker-get-next cnk) src2)) 3168 (if (char=? sre (string-ref str2 i2)) 3169 (next cnk init src2 str2 (+ i2 1) end2 3170 matches fail) 3171 (fail)))) 3172 (fail))) 3173 (if (char=? sre (string-ref str i)) 3174 (next cnk init src str (+ i 1) end matches fail) 3175 (fail)))) 3176 )) 3177 ((string? sre) 3178 (rec (sre-sequence (string->list sre))) 3179;; XXXX reintroduce faster string matching on chunks 3180;; (if (flag-set? flags ~case-insensitive?) 3181;; (rec (sre-sequence (string->list sre))) 3182;; (let ((len (string-length sre))) 3183;; (lambda (cnk init src str i end matches fail) 3184;; (if (and (<= (+ i len) end) 3185;; (%substring=? sre str 0 i len)) 3186;; (next str (+ i len) matches fail) 3187;; (fail))))) 3188 ) 3189 (else 3190 (error "unknown regexp" sre))))) 3191 3192;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3193;;;; Character Sets 3194;; 3195;; Simple character sets as lists of ranges, as used in the NFA/DFA 3196;; compilation. This is not especially efficient, but is portable and 3197;; scalable for any range of character sets. 3198 3199(define (sre-cset->procedure cset next) 3200 (lambda (cnk init src str i end matches fail) 3201 (if (< i end) 3202 (if (cset-contains? cset (string-ref str i)) 3203 (next cnk init src str (+ i 1) end matches fail) 3204 (fail)) 3205 (let ((src2 ((chunker-get-next cnk) src))) 3206 (if src2 3207 (let ((str2 ((chunker-get-str cnk) src2)) 3208 (i2 ((chunker-get-start cnk) src2)) 3209 (end2 ((chunker-get-end cnk) src2))) 3210 (if (cset-contains? cset (string-ref str2 i2)) 3211 (next cnk init src2 str2 (+ i2 1) end2 matches fail) 3212 (fail))) 3213 (fail)))))) 3214 3215(define (make-cset) (vector)) 3216(define (range->cset from to) (vector (cons from to))) 3217(define (char->cset ch) (vector (cons ch ch))) 3218(define (cset-empty? cs) (zero? (vector-length cs))) 3219(define (maybe-cset->char cs) 3220 (if (and (= (vector-length cs) 1) 3221 (char=? (car (vector-ref cs 0)) (cdr (vector-ref cs 0)))) 3222 (car (vector-ref cs 0)) 3223 cs)) 3224 3225;; Since csets are sorted, there's only one possible representation of any cset 3226(define cset=? equal?) 3227 3228(define (cset-size cs) 3229 (let ((len (vector-length cs))) 3230 (let lp ((i 0) (size 0)) 3231 (if (= i len) 3232 size 3233 (lp (+ i 1) (+ size 1 3234 (- (char->integer (cdr (vector-ref cs i))) 3235 (char->integer (car (vector-ref cs i)))))))))) 3236 3237(define (cset->plist cs) 3238 (let lp ((i (- (vector-length cs) 1)) 3239 (res '())) 3240 (if (= i -1) 3241 res 3242 (lp (- i 1) (cons (car (vector-ref cs i)) 3243 (cons (cdr (vector-ref cs i)) res)))))) 3244 3245(define (plist->cset ls) 3246 (let lp ((ls ls) (res (make-cset))) 3247 (if (null? ls) 3248 res 3249 (lp (cddr ls) (cset-union (range->cset (car ls) (cadr ls)) res))))) 3250 3251(define (string->cset s) 3252 (fold (lambda (ch cs) 3253 (cset-adjoin cs ch)) 3254 (make-cset) 3255 (string->list s))) 3256 3257(define (sre->cset sre . o) 3258 (let lp ((sre sre) (ci? (and (pair? o) (car o)))) 3259 (define (rec sre) (lp sre ci?)) 3260 (cond 3261 ((pair? sre) 3262 (if (string? (car sre)) 3263 (if ci? 3264 (cset-case-insensitive (string->cset (car sre))) 3265 (string->cset (car sre))) 3266 (case (car sre) 3267 ((~) 3268 (cset-complement 3269 (fold cset-union (rec (cadr sre)) (map rec (cddr sre))))) 3270 ((&) 3271 (fold cset-intersection (rec (cadr sre)) (map rec (cddr sre)))) 3272 ((-) 3273 (fold (lambda (x res) (cset-difference res x)) 3274 (rec (cadr sre)) 3275 (map rec (cddr sre)))) 3276 ((/) 3277 (let ((res (plist->cset (sre-flatten-ranges (cdr sre))))) 3278 (if ci? 3279 (cset-case-insensitive res) 3280 res))) 3281 ((or) 3282 (fold cset-union (rec (cadr sre)) (map rec (cddr sre)))) 3283 ((w/case) 3284 (lp (sre-alternate (cdr sre)) #f)) 3285 ((w/nocase) 3286 (lp (sre-alternate (cdr sre)) #t)) 3287 (else 3288 (error "not a valid sre char-set operator" sre))))) 3289 ((char? sre) (if ci? 3290 (cset-case-insensitive (range->cset sre sre)) 3291 (range->cset sre sre))) 3292 ((string? sre) (rec (list sre))) 3293 (else 3294 (let ((cell (assq sre sre-named-definitions))) 3295 (if cell 3296 (rec (cdr cell)) 3297 (error "not a valid sre char-set" sre))))))) 3298 3299(define (cset->sre cset) 3300 (sre-alternate 3301 (map (lambda (x) (list '/ (car x) (cdr x))) 3302 (vector->list cset)))) 3303 3304(define (cset-contains? cset ch) 3305 (let ((len (vector-length cset))) 3306 (case len 3307 ((0) #f) 3308 ((1) (let ((range (vector-ref cset 0))) 3309 (and (char<=? ch (cdr range)) (char<=? (car range) ch)))) 3310 (else (let lp ((lower 0) (upper len)) 3311 (let* ((middle (quotient (+ upper lower) 2)) 3312 (range (vector-ref cset middle))) 3313 (cond ((char<? (cdr range) ch) 3314 (let ((next (+ middle 1))) 3315 (and (< next upper) (lp next upper)))) 3316 ((char<? ch (car range)) 3317 (and (< lower middle) (lp lower middle))) 3318 (else #t)))))))) 3319 3320(define (char-ranges-union a b) 3321 (cons (if (char<=? (car a) (car b)) (car a) (car b)) 3322 (if (char>=? (cdr a) (cdr b)) (cdr a) (cdr b)))) 3323 3324(define (cset-union a b) 3325 (let union-range ((a (vector->list a)) 3326 (b (vector->list b)) 3327 (res '())) 3328 (cond 3329 ((null? a) (list->vector (reverse (append (reverse b) res)))) 3330 ((null? b) (list->vector (reverse (append (reverse a) res)))) 3331 (else 3332 (let ((a-range (car a)) 3333 (b-range (car b))) 3334 (cond 3335 ((i/char<? (next-char-idx (cdr a-range)) (car b-range)) 3336 (union-range (cdr a) b (cons a-range res))) 3337 ((i/char>? (car a-range) (next-char-idx (cdr b-range))) 3338 (union-range (cdr b) a (cons b-range res))) 3339 ((char>=? (cdr a-range) (car b-range)) 3340 (union-range (cons (char-ranges-union a-range b-range) (cdr a)) 3341 (cdr b) 3342 res)) 3343 (else (union-range (cdr a) 3344 (cons (char-ranges-union a-range b-range) (cdr b)) 3345 res)))))))) 3346 3347(define (cset-adjoin cs ch) (cset-union cs (char->cset ch))) 3348 3349(define (char-idx obj) 3350 (if (char? obj) 3351 (char->integer obj) 3352 obj)) 3353(define (i/char<? x y) 3354 (< (char-idx x) 3355 (char-idx y))) 3356(define (i/char>? x y) 3357 (> (char-idx x) 3358 (char-idx y))) 3359 3360 3361(define (next-char c) 3362 (integer->char (+ (char->integer c) 1))) 3363 3364(define (prev-char c) 3365 (integer->char (- (char->integer c) 1))) 3366 3367;; NMOSH: we need this because R6RS doesn't allow (+ #x10ffff 1) 3368;; char index. 3369(define (next-char-idx c) 3370 (+ (char->integer c) 1)) 3371 3372(define (cset-difference a b) 3373 (let diff ((a (vector->list a)) 3374 (b (vector->list b)) 3375 (res '())) 3376 (cond ((null? a) (list->vector (reverse res))) 3377 ((null? b) (list->vector (append (reverse res) a))) 3378 (else 3379 (let ((a-range (car a)) 3380 (b-range (car b))) 3381 (cond 3382 ((char<? (cdr a-range) (car b-range)) 3383 (diff (cdr a) b (cons a-range res))) 3384 ((char>? (car a-range) (cdr b-range)) 3385 (diff a (cdr b) res)) 3386 ((and (char<=? (car b-range) (car a-range)) 3387 (char>=? (cdr b-range) (cdr a-range))) 3388 (diff (cdr a) b res)) 3389 (else (let ((left (and (char<? (car a-range) (car b-range)) 3390 (cons (car a-range) 3391 (prev-char (car b-range))))) 3392 (right (and (char>? (cdr a-range) (cdr b-range)) 3393 (cons (next-char (cdr b-range)) 3394 (cdr a-range))))) 3395 (diff (if right (cons right (cdr a)) (cdr a)) 3396 b 3397 (if left (cons left res) res)))))))))) 3398 3399(define (min-char a b) 3400 (if (char<? a b) a b)) 3401 3402(define (max-char a b) 3403 (if (char<? a b) b a)) 3404 3405(define (cset-intersection a b) 3406 (let intersect ((a (vector->list a)) 3407 (b (vector->list b)) 3408 (res '())) 3409 (if (or (null? a) (null? b)) 3410 (list->vector (reverse res)) 3411 (let ((a-range (car a)) 3412 (b-range (car b))) 3413 (cond 3414 ((char<? (cdr a-range) (car b-range)) 3415 (intersect (cdr a) b res)) 3416 ((char>? (car a-range) (cdr b-range)) 3417 (intersect a (cdr b) res)) 3418 (else 3419 (let ((result (cons (max-char (car b-range) (car a-range)) 3420 (min-char (cdr a-range) (cdr b-range))))) 3421 (intersect (if (char>? (cdr a-range) (cdr result)) 3422 a (cdr a)) 3423 (if (char>? (cdr b-range) (cdr result)) 3424 b (cdr b)) 3425 (cons result res))))))))) 3426 3427(define (cset-complement a) 3428 (cset-difference (sre->cset *all-chars*) a)) 3429 3430;; This could use some optimization :) 3431(define (cset-case-insensitive a) 3432 (let lp ((ls (vector->list a)) (res '())) 3433 (cond ((null? ls) (list->vector (reverse res))) 3434 ((and (char-alphabetic? (caar ls)) 3435 (char-alphabetic? (cdar ls))) 3436 (lp (cdr ls) 3437 (reverse 3438 (vector->list 3439 (cset-union (cset-union (list->vector (reverse res)) 3440 (vector (car ls))) 3441 (range->cset (char-altcase (caar ls)) 3442 (char-altcase (cdar ls)))))))) 3443 (else (lp (cdr ls) (reverse (vector->list 3444 (cset-union (list->vector (reverse res)) 3445 (vector (car ls)))))))))) 3446 3447;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 3448;;;; Match and Replace Utilities 3449 3450(define (irregex-fold/fast irx kons knil str . o) 3451 (if (not (string? str)) (error "irregex-fold: not a string" str)) 3452 (let* ((irx (irregex irx)) 3453 (matches (irregex-new-matches irx)) 3454 (finish (or (and (pair? o) (car o)) (lambda (i acc) acc))) 3455 (start (if (and (pair? o) (pair? (cdr o))) (cadr o) 0)) 3456 (end (if (and (pair? o) (pair? (cdr o)) (pair? (cddr o))) 3457 (caddr o) 3458 (string-length str)))) 3459 (if (not (and (integer? start) (exact? start))) 3460 (error "irregex-fold: not an exact integer" start)) 3461 (if (not (and (integer? end) (exact? end))) 3462 (error "irregex-fold: not an exact integer" end)) 3463 (irregex-match-chunker-set! matches irregex-basic-string-chunker) 3464 (let lp ((i start) (acc knil)) 3465 (if (>= i end) 3466 (finish i acc) 3467 (let ((m (irregex-search/matches 3468 irx 3469 irregex-basic-string-chunker 3470 (list str i end) 3471 i 3472 matches))) 3473 (if (not m) 3474 (finish i acc) 3475 (let* ((end (%irregex-match-end-index m 0)) 3476 (acc (kons i m acc))) 3477 (irregex-reset-matches! matches) 3478 (lp end acc)))))))) 3479 3480(define (irregex-fold irx kons . args) 3481 (if (not (procedure? kons)) (error "irregex-fold: not a procedure" kons)) 3482 (let ((kons2 (lambda (i m acc) (kons i (irregex-copy-matches m) acc)))) 3483 (apply irregex-fold/fast irx kons2 args))) 3484 3485(define (irregex-fold/chunked/fast irx kons knil cnk start . o) 3486 (let* ((irx (irregex irx)) 3487 (matches (irregex-new-matches irx)) 3488 (finish (or (and (pair? o) (car o)) (lambda (src i acc) acc))) 3489 (i (if (and (pair? o) (pair? (cdr o))) 3490 (cadr o) 3491 ((chunker-get-start cnk) start)))) 3492 (if (not (integer? i)) (error "irregex-fold/chunked: not an integer" i)) 3493 (irregex-match-chunker-set! matches cnk) 3494 (let lp ((start start) (i i) (acc knil)) 3495 (if (not start) 3496 (finish start i acc) 3497 (let ((m (irregex-search/matches irx cnk start i matches))) 3498 (if (not m) 3499 (finish start i acc) 3500 (let* ((acc (kons start i m acc)) 3501 (end-src (%irregex-match-end-chunk m 0)) 3502 (end-index (%irregex-match-end-index m 0))) 3503 (irregex-reset-matches! matches) 3504 (lp end-src end-index acc)))))))) 3505 3506(define (irregex-fold/chunked irx kons . args) 3507 (if (not (procedure? kons)) (error "irregex-fold/chunked: not a procedure" kons)) 3508 (let ((kons2 (lambda (s i m acc) (kons s i (irregex-copy-matches m) acc)))) 3509 (apply irregex-fold/chunked/fast irx kons2 args))) 3510 3511(define (irregex-replace irx str . o) 3512 (if (not (string? str)) (error "irregex-replace: not a string" str)) 3513 (let ((m (irregex-search irx str))) 3514 (and 3515 m 3516 (string-cat-reverse 3517 (cons (substring str (%irregex-match-end-index m 0) (string-length str)) 3518 (append (irregex-apply-match m o) 3519 (list (substring str 0 (%irregex-match-start-index m 0))) 3520 )))))) 3521 3522(define (irregex-replace/all irx str . o) 3523 (if (not (string? str)) (error "irregex-replace/all: not a string" str)) 3524 (irregex-fold/fast 3525 irx 3526 (lambda (i m acc) 3527 (let ((m-start (%irregex-match-start-index m 0))) 3528 (append (irregex-apply-match m o) 3529 (if (>= i m-start) 3530 acc 3531 (cons (substring str i m-start) acc))))) 3532 '() 3533 str 3534 (lambda (i acc) 3535 (let ((end (string-length str))) 3536 (string-cat-reverse (if (>= i end) 3537 acc 3538 (cons (substring str i end) acc))))))) 3539 3540(define (irregex-apply-match m ls) 3541 (let lp ((ls ls) (res '())) 3542 (if (null? ls) 3543 res 3544 (cond 3545 ((integer? (car ls)) 3546 (lp (cdr ls) 3547 (cons (or (irregex-match-substring m (car ls)) "") res))) 3548 ((procedure? (car ls)) 3549 (lp (cdr ls) (cons ((car ls) m) res))) 3550 ((symbol? (car ls)) 3551 (case (car ls) 3552 ((pre) 3553 (lp (cdr ls) 3554 (cons (substring (car (%irregex-match-start-chunk m 0)) 3555 0 3556 (%irregex-match-start-index m 0)) 3557 res))) 3558 ((post) 3559 (let ((str (car (%irregex-match-start-chunk m 0)))) 3560 (lp (cdr ls) 3561 (cons (substring str 3562 (%irregex-match-end-index m 0) 3563 (string-length str)) 3564 res)))) 3565 (else 3566 (cond 3567 ((assq (car ls) (irregex-match-names m)) 3568 => (lambda (x) (lp (cons (cdr x) (cdr ls)) res))) 3569 (else 3570 (error "unknown match replacement" (car ls))))))) 3571 (else 3572 (lp (cdr ls) (cons (car ls) res))))))) 3573 3574(define (irregex-extract irx str . o) 3575 (if (not (string? str)) (error "irregex-extract: not a string" str)) 3576 (apply irregex-fold/fast 3577 irx 3578 (lambda (i m a) (cons (irregex-match-substring m) a)) 3579 '() 3580 str 3581 (lambda (i a) (reverse a)) 3582 o)) 3583 3584(define (irregex-split irx str . o) 3585 (if (not (string? str)) (error "irregex-split: not a string" str)) 3586 (let ((start (if (pair? o) (car o) 0)) 3587 (end (if (and (pair? o) (pair? (cdr o))) (cadr o) (string-length str)))) 3588 (irregex-fold/fast 3589 irx 3590 (lambda (i m a) 3591 (if (= i (%irregex-match-start-index m 0)) 3592 a 3593 (cons (substring str i (%irregex-match-start-index m 0)) a))) 3594 '() 3595 str 3596 (lambda (i a) 3597 (reverse (if (= i end) a (cons (substring str i end) a)))) 3598 start 3599 end))) 3600) 3601