1;;;; irregex.scm -- IrRegular Expressions
2(library (irregex)
3         (export
4           ;from irregex-chicken
5
6           irregex string->irregex sre->irregex
7           string->sre maybe-string->sre
8           irregex? irregex-match-data?
9           irregex-new-matches irregex-reset-matches!
10           irregex-search irregex-search/matches irregex-match
11           irregex-search/chunked irregex-match/chunked make-irregex-chunker
12           irregex-match-substring irregex-match-subchunk
13           irregex-match-start-chunk irregex-match-start-index
14           irregex-match-end-chunk irregex-match-end-index
15           irregex-match-num-submatches irregex-match-names
16           irregex-match-valid-index?
17           irregex-fold irregex-replace irregex-replace/all
18           irregex-dfa irregex-dfa/search irregex-dfa/extract
19           irregex-nfa irregex-flags irregex-lengths irregex-names
20           irregex-num-submatches irregex-extract irregex-split
21
22           ;; add
23           irregex-fold/chunked
24           )
25         (import (rnrs)
26                 (only (rnrs r5rs (6)) modulo remainder quotient)
27                 (rnrs mutable-strings)
28                 (rnrs mutable-pairs))
29
30;; based on rev:2e55ccfbba
31
32;;;; irregex.scm -- IrRegular Expressions
33;;
34;; Copyright (c) 2005-2010 Alex Shinn.  All rights reserved.
35;; BSD-style license: http://synthcode.com/license.txt
36
37;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
38;; At this moment there was a loud ring at the bell, and I could
39;; hear Mrs. Hudson, our landlady, raising her voice in a wail of
40;; expostulation and dismay.
41;;
42;; "By heaven, Holmes," I said, half rising, "I believe that
43;; they are really after us."
44;;
45;; "No, it's not quite so bad as that.  It is the unofficial
46;; force, -- the Baker Street irregulars."
47
48;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
49;;;; Notes
50;;
51;; This code should not require any porting - it should work out of
52;; the box in any R[45]RS Scheme implementation.  Slight modifications
53;; are needed for R6RS (a separate R6RS-compatible version is included
54;; in the distribution as irregex-r6rs.scm).
55;;
56;; The goal of portability makes this code a little clumsy and
57;; inefficient.  Future versions will include both cleanup and
58;; performance tuning, but you can only go so far while staying
59;; portable.  AND-LET*, SRFI-9 records and custom macros would've been
60;; nice.
61
62;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
63;;;; History
64;;
65;; 0.8.2: 2010/08/28 - (...)? submatch extraction fix and alternate
66;;                     named submatches from Peter Bex
67;;                     Added irregex-split, irregex-extract,
68;;                     irregex-match-names and irregex-match-valid-index?
69;;                     to Chicken and Guile module export lists and made
70;;                     the latter accept named submatches.  The procedures
71;;                     irregex-match-{start,end}-{index,chunk} now also
72;;                     accept named submatches, with the index argument
73;;                     made optional.  Improved argument type checks.
74;;                     Disallow negative submatch index.
75;;                     Improve performance of backtracking matcher.
76;;                     Refactor charset handling into a consistent API
77;; 0.8.1: 2010/03/09 - backtracking irregex-match fix and other small fixes
78;; 0.8.0: 2010/01/20 - optimizing DFA compilation, adding SRE escapes
79;;                     inside PCREs, adding utility SREs
80;; 0.7.5: 2009/08/31 - adding irregex-extract and irregex-split
81;;                     *-fold copies match data (use *-fold/fast for speed)
82;;                     irregex-opt now returns an SRE
83;; 0.7.4: 2009/05/14 - empty alternates (or) and empty csets always fail,
84;;                     bugfix in default finalizer for irregex-fold/chunked
85;; 0.7.3: 2009/04/14 - adding irregex-fold/chunked, minor doc fixes
86;; 0.7.2: 2009/02/11 - some bugfixes, much improved documentation
87;; 0.7.1: 2008/10/30 - several bugfixes (thanks to Derick Eddington)
88;; 0.7.0: 2008/10/20 - support abstract chunked strings
89;; 0.6.2: 2008/07/26 - minor bugfixes, allow global disabling of utf8 mode,
90;;                     friendlier error messages in parsing, \Q..\E support
91;; 0.6.1: 2008/07/21 - added utf8 mode, more utils, bugfixes
92;;   0.6: 2008/05/01 - most of PCRE supported
93;;   0.5: 2008/04/24 - fully portable R4RS, many PCRE features implemented
94;;   0.4: 2008/04/17 - rewriting NFA to use efficient closure compilation,
95;;                     normal strings only, but all of the spencer tests pass
96;;   0.3: 2008/03/10 - adding DFA converter (normal strings only)
97;;   0.2: 2005/09/27 - adding irregex-opt (like elisp's regexp-opt) utility
98;;   0.1: 2005/08/18 - simple NFA interpreter over abstract chunked strings
99
100;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
101;;;; Data Structures
102
103(define irregex-tag '*irregex-tag*)
104
105(define (make-irregex dfa dfa/search dfa/extract nfa flags
106                      submatches lengths names)
107  (vector irregex-tag dfa dfa/search dfa/extract nfa flags
108          submatches lengths names))
109
110(define (irregex? obj)
111  (and (vector? obj)
112       (= 9 (vector-length obj))
113       (eq? irregex-tag (vector-ref obj 0))))
114
115(define (irregex-dfa x) (vector-ref x 1))
116(define (irregex-dfa/search x) (vector-ref x 2))
117(define (irregex-dfa/extract x) (vector-ref x 3))
118(define (irregex-nfa x) (vector-ref x 4))
119(define (irregex-flags x) (vector-ref x 5))
120(define (irregex-num-submatches x) (vector-ref x 6))
121(define (irregex-lengths x) (vector-ref x 7))
122(define (irregex-names x) (vector-ref x 8))
123
124(define (irregex-new-matches irx)
125  (make-irregex-match (irregex-num-submatches irx) (irregex-names irx)))
126
127(define (irregex-reset-matches! m)
128  (do ((i (- (vector-length m) 1) (- i 1)))
129      ((<= i 3) m)
130    (vector-set! m i #f)))
131
132(define (irregex-copy-matches m)
133  (and (vector? m)
134       (let ((r (make-vector (vector-length m))))
135         (do ((i (- (vector-length m) 1) (- i 1)))
136             ((< i 0) r)
137           (vector-set! r i (vector-ref m i))))))
138
139(define irregex-match-tag '*irregex-match-tag*)
140
141(define (irregex-match-data? obj)
142  (and (vector? obj)
143       (>= (vector-length obj) 11)
144       (eq? irregex-match-tag (vector-ref obj 0))))
145
146(define (make-irregex-match count names)
147  (let ((res (make-vector (+ (* 4 (+ 2 count)) 3) #f)))
148    (vector-set! res 0 irregex-match-tag)
149    (vector-set! res 2 names)
150    res))
151
152(define (irregex-match-num-submatches m)
153  (- (quotient (- (vector-length m) 3) 4) 2))
154
155(define (irregex-match-chunker m)
156  (vector-ref m 1))
157(define (irregex-match-names m)
158  (vector-ref m 2))
159(define (irregex-match-chunker-set! m str)
160  (vector-set! m 1 str))
161
162(define (%irregex-match-start-chunk m n) (vector-ref m (+ 3 (* n 4))))
163(define (%irregex-match-start-index m n) (vector-ref m (+ 4 (* n 4))))
164(define (%irregex-match-end-chunk m n)   (vector-ref m (+ 5 (* n 4))))
165(define (%irregex-match-end-index m n)   (vector-ref m (+ 6 (* n 4))))
166
167(define (%irregex-match-fail m)
168  (vector-ref m (- (vector-length m) 1)))
169(define (%irregex-match-fail-set! m x)
170  (vector-set! m (- (vector-length m) 1) x))
171
172;; public interface with error checking
173(define (irregex-match-start-chunk m . opt)
174  (let ((n (irregex-match-numeric-index "irregex-match-start-chunk" m opt)))
175    (and n (%irregex-match-start-chunk m n))))
176(define (irregex-match-start-index m . opt)
177  (let ((n (irregex-match-numeric-index "irregex-match-start-index" m opt)))
178    (and n (%irregex-match-start-index m n))))
179(define (irregex-match-end-chunk m . opt)
180  (let ((n (irregex-match-numeric-index "irregex-match-end-chunk" m opt)))
181    (and n (%irregex-match-end-chunk m n))))
182(define (irregex-match-end-index m . opt)
183  (let ((n (irregex-match-numeric-index "irregex-match-end-index" m opt)))
184    (and n (%irregex-match-end-index m n))))
185
186(define (irregex-match-start-chunk-set! m n start)
187  (vector-set! m (+ 3 (* n 4)) start))
188(define (irregex-match-start-index-set! m n start)
189  (vector-set! m (+ 4 (* n 4)) start))
190(define (irregex-match-end-chunk-set! m n end)
191  (vector-set! m (+ 5 (* n 4)) end))
192(define (irregex-match-end-index-set! m n end)
193  (vector-set! m (+ 6 (* n 4)) end))
194
195;; Helper procedure to convert any type of index from a rest args list
196;; to a numeric index.  Named submatches are converted to their corresponding
197;; numeric index, and numeric submatches are checked for validity.
198;; An error is raised for invalid numeric or named indices, #f is returned
199;; for defined but nonmatching indices.
200(define (irregex-match-numeric-index location m opt)
201  (cond
202   ((not (irregex-match-data? m))
203    (error (string-append location ": not match data") m))
204   ((not (pair? opt)) 0)
205   ((pair? (cdr opt))
206    (apply error (string-append location ": too many arguments") m opt))
207   (else
208    (let ((n (car opt)))
209      (if (number? n)
210          (if (and (integer? n) (exact? n))
211              (if (irregex-match-valid-numeric-index? m n)
212                  (and (irregex-match-matched-numeric-index? m n) n)
213                  (error (string-append location ": not a valid index")
214                         m n))
215              (error (string-append location ": not an exact integer") n))
216          (let lp ((ls (irregex-match-names m))
217                   (unknown? #t))
218            (cond
219             ((null? ls)
220              (and unknown?
221                   (error (string-append location ": unknown match name") n)))
222             ((eq? n (caar ls))
223              (if (%irregex-match-start-chunk m (cdar ls))
224                  (cdar ls)
225                  (lp (cdr ls) #f)))
226             (else (lp (cdr ls) unknown?)))))))))
227
228(define (irregex-match-valid-numeric-index? m n)
229  (and (>= n 0) (< (+ 3 (* n 4)) (- (vector-length m) 4))))
230
231(define (irregex-match-matched-numeric-index? m n)
232  (and (vector-ref m (+ 4 (* n 4)))
233       #t))
234
235(define (irregex-match-valid-named-index? m n)
236  (and (assq n (irregex-match-names m))
237       #t))
238
239(define (irregex-match-valid-index? m n)
240  (if (not (irregex-match-data? m))
241      (error "irregex-match-valid-index?: not match data" m))
242  (if (integer? n)
243      (if (not (exact? n))
244          (error "irregex-match-valid-index?: not an exact integer" n)
245          (irregex-match-valid-numeric-index? m n))
246      (irregex-match-valid-named-index? m n)))
247
248(define (irregex-match-substring m . opt)
249  (let* ((n (irregex-match-numeric-index "irregex-match-substring" m opt))
250         (cnk (irregex-match-chunker m)))
251    (and n
252         ((chunker-get-substring cnk)
253          (%irregex-match-start-chunk m n)
254          (%irregex-match-start-index m n)
255          (%irregex-match-end-chunk m n)
256          (%irregex-match-end-index m n)))))
257
258(define (irregex-match-subchunk m . opt)
259  (let* ((n (irregex-match-numeric-index "irregex-match-subchunk" m opt))
260         (cnk (irregex-match-chunker m))
261         (get-subchunk (chunker-get-subchunk cnk)))
262    (if (not get-subchunk)
263        (error "this chunk type does not support match subchunks")
264        (and n (get-subchunk
265                (%irregex-match-start-chunk m n)
266                (%irregex-match-start-index m n)
267                (%irregex-match-end-chunk m n)
268                (%irregex-match-end-index m n))))))
269
270;; chunkers tell us how to navigate through chained chunks of strings
271
272(define (make-irregex-chunker get-next get-str . o)
273  (let* ((get-start (or (and (pair? o) (car o)) (lambda (cnk) 0)))
274         (o (if (pair? o) (cdr o) o))
275         (get-end (or (and (pair? o) (car o))
276                      (lambda (cnk) (string-length (get-str cnk)))))
277         (o (if (pair? o) (cdr o) o))
278         (get-substr
279          (or (and (pair? o) (car o))
280              (lambda (cnk1 start cnk2 end)
281                (if (eq? cnk1 cnk2)
282                    (substring (get-str cnk1) start end)
283                    (let loop ((cnk (get-next cnk1))
284                               (res (list (substring (get-str cnk1)
285                                                     start
286                                                     (get-end cnk1)))))
287                      (if (eq? cnk cnk2)
288                          (string-cat-reverse
289                           (cons (substring (get-str cnk)
290                                            (get-start cnk)
291                                            end)
292                                 res))
293                          (loop (get-next cnk)
294                                (cons (substring (get-str cnk)
295                                                 (get-start cnk)
296                                                 (get-end cnk))
297                                      res))))))))
298         (o (if (pair? o) (cdr o) o))
299         (get-subchunk (and (pair? o) (car o))))
300    (if (not (and (procedure? get-next) (procedure? get-str)
301                  (procedure? get-start) (procedure? get-substr)))
302        (error "make-irregex-chunker: expected a procdure"))
303    (vector get-next get-str get-start get-end get-substr get-subchunk)))
304
305(define (chunker-get-next cnk) (vector-ref cnk 0))
306(define (chunker-get-str cnk) (vector-ref cnk 1))
307(define (chunker-get-start cnk) (vector-ref cnk 2))
308(define (chunker-get-end cnk) (vector-ref cnk 3))
309(define (chunker-get-substring cnk) (vector-ref cnk 4))
310(define (chunker-get-subchunk cnk) (vector-ref cnk 5))
311
312(define (chunker-prev-chunk cnk start end)
313  (if (eq? start end)
314      #f
315      (let ((get-next (chunker-get-next cnk)))
316        (let lp ((start start))
317          (let ((next (get-next start)))
318            (if (eq? next end)
319                start
320                (and next (lp next))))))))
321
322(define (chunker-prev-char cnk start end)
323  (let ((prev (chunker-prev-chunk cnk start end)))
324    (and prev
325         (string-ref ((chunker-get-str cnk) prev)
326                     (- ((chunker-get-end cnk) prev) 1)))))
327
328(define (chunker-next-char cnk src)
329  (let ((next ((chunker-get-next cnk) src)))
330    (and next
331         (string-ref ((chunker-get-str cnk) next)
332                     ((chunker-get-start cnk) next)))))
333
334(define (chunk-before? cnk a b)
335  (and (not (eq? a b))
336       (let ((next ((chunker-get-next cnk) a)))
337         (and next
338              (if (eq? next b)
339                  #t
340                  (chunk-before? cnk next b))))))
341
342;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
343;;;; String Utilities
344
345;; Unicode version (skip surrogates)
346(define *all-chars*
347  `(/ ,(integer->char 0) ,(integer->char #xD7FF)
348      ,(integer->char #xE000) ,(integer->char #x10FFFF)))
349
350;; ASCII version, offset to not assume 0-255
351;; (define *all-chars* `(/ ,(integer->char (- (char->integer #\space) 32)) ,(integer->char (+ (char->integer #\space) 223))))
352
353;; set to #f to ignore even an explicit request for utf8 handling
354(define *allow-utf8-mode?* #f)
355
356;; (define *named-char-properties* '())
357
358(define (string-scan-char str c . o)
359  (let ((end (string-length str)))
360    (let scan ((i (if (pair? o) (car o) 0)))
361      (cond ((= i end) #f)
362            ((eqv? c (string-ref str i)) i)
363            (else (scan (+ i 1)))))))
364
365(define (string-scan-char-escape str c . o)
366  (let ((end (string-length str)))
367    (let scan ((i (if (pair? o) (car o) 0)))
368      (cond ((= i end) #f)
369            ((eqv? c (string-ref str i)) i)
370            ((eqv? c #\\) (scan (+ i 2)))
371            (else (scan (+ i 1)))))))
372
373(define (string-scan-pred str pred . o)
374  (let ((end (string-length str)))
375    (let scan ((i (if (pair? o) (car o) 0)))
376      (cond ((= i end) #f)
377            ((pred (string-ref str i)) i)
378            (else (scan (+ i 1)))))))
379
380(define (string-split-char str c)
381  (let ((end (string-length str)))
382    (let lp ((i 0) (from 0) (res '()))
383      (define (collect) (cons (substring str from i) res))
384      (cond ((>= i end) (reverse (collect)))
385            ((eqv? c (string-ref str i)) (lp (+ i 1) (+ i 1) (collect)))
386            (else (lp (+ i 1) from res))))))
387
388(define (char-alphanumeric? c)
389  (or (char-alphabetic? c) (char-numeric? c)))
390
391(define (%substring=? a b start1 start2 len)
392  (let lp ((i 0))
393    (cond ((>= i len)
394           #t)
395          ((char=? (string-ref a (+ start1 i)) (string-ref b (+ start2 i)))
396           (lp (+ i 1)))
397          (else
398           #f))))
399
400;; SRFI-13 extracts
401
402(define (%%string-copy! to tstart from fstart fend)
403  (do ((i fstart (+ i 1))
404       (j tstart (+ j 1)))
405      ((>= i fend))
406    (string-set! to j (string-ref from i))))
407
408(define (string-cat-reverse string-list)
409  (string-cat-reverse/aux
410   (fold (lambda (s a) (+ (string-length s) a)) 0 string-list)
411   string-list))
412
413(define (string-cat-reverse/aux len string-list)
414  (let ((res (make-string len)))
415    (let lp ((i len) (ls string-list))
416      (if (pair? ls)
417          (let* ((s (car ls))
418                 (slen (string-length s))
419                 (i (- i slen)))
420            (%%string-copy! res i s 0 slen)
421            (lp i (cdr ls)))))
422    res))
423
424;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
425;;;; List Utilities
426
427;; like the one-arg IOTA case
428(define (zero-to n)
429  (if (<= n 0)
430      '()
431      (let lp ((i (- n 1)) (res '()))
432        (if (zero? i) (cons 0 res) (lp (- i 1) (cons i res))))))
433
434;; SRFI-1 extracts (simplified 1-ary versions)
435
436(define (find-tail pred ls)
437  (let lp ((ls ls))
438    (cond ((null? ls) #f)
439          ((pred (car ls)) ls)
440          (else (lp (cdr ls))))))
441
442(define (last ls)
443  (if (not (pair? ls))
444      (error "can't take last of empty list" ls)
445      (let lp ((ls ls))
446        (if (pair? (cdr ls))
447            (lp (cdr ls))
448            (car ls)))))
449
450(define (any pred ls)
451  (and (pair? ls)
452       (let lp ((head (car ls)) (tail (cdr ls)))
453         (if (null? tail)
454             (pred head)
455             (or (pred head) (lp (car tail) (cdr tail)))))))
456
457(define (every pred ls)
458  (or (null? ls)
459      (let lp ((head (car ls))  (tail (cdr ls)))
460        (if (null? tail)
461            (pred head)
462            (and (pred head) (lp (car tail) (cdr tail)))))))
463
464(define (fold kons knil ls)
465  (let lp ((ls ls) (res knil))
466    (if (null? ls)
467        res
468        (lp (cdr ls) (kons (car ls) res)))))
469
470;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
471;;;; Flags
472
473(define (bit-shr n i)
474  (quotient n (expt 2 i)))
475
476(define (bit-shl n i)
477  (* n (expt 2 i)))
478
479(define (bit-not n) (- #xFFFF n))
480
481(define (bit-ior a b)
482  (cond
483   ((zero? a) b)
484   ((zero? b) a)
485   (else
486    (+ (if (or (odd? a) (odd? b)) 1 0)
487       (* 2 (bit-ior (quotient a 2) (quotient b 2)))))))
488
489(define (bit-and a b)
490  (cond
491   ((zero? a) 0)
492   ((zero? b) 0)
493   (else
494    (+ (if (and (odd? a) (odd? b)) 1 0)
495       (* 2 (bit-and (quotient a 2) (quotient b 2)))))))
496
497(define (integer-log n)
498  (define (b8 n r)
499    (if (>= n (bit-shl 1 8)) (b4 (bit-shr n 8) (+ r 8)) (b4 n r)))
500  (define (b4 n r)
501    (if (>= n (bit-shl 1 4)) (b2 (bit-shr n 4) (+ r 4)) (b2 n r)))
502  (define (b2 n r)
503    (if (>= n (bit-shl 1 2)) (b1 (bit-shr n 2) (+ r 2)) (b1 n r)))
504  (define (b1 n r) (if (>= n (bit-shl 1 1)) (+ r 1) r))
505  (if (>= n (bit-shl 1 16)) (b8 (bit-shr n 16) 16) (b8 n 0)))
506
507(define (flag-set? flags i)
508  (= i (bit-and flags i)))
509(define (flag-join a b)
510  (if b (bit-ior a b) a))
511(define (flag-clear a b)
512  (bit-and a (bit-not b)))
513
514(define ~none 0)
515(define ~searcher? 1)
516(define ~consumer? 2)
517
518;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
519;;;; Parsing Embedded SREs in PCRE Strings
520
521;; (define (with-read-from-string str i proc)
522;;   (define (port-size in)
523;;     (let lp ((i 0)) (if (eof-object? (read-char in)) i (lp (+ i 1)))))
524;;   (let* ((len (string-length str))
525;;          (tail-len (- len i))
526;;          (in (open-input-string (substring str i len)))
527;;          (sre (read in))
528;;          (unused-len (port-size in)))
529;;     (close-input-port in)
530;;     (proc sre (- tail-len unused-len))))
531
532(define close-token (list 'close))
533(define dot-token (string->symbol "."))
534
535(define (with-read-from-string str i proc)
536  (define end (string-length str))
537  (define (read i k)
538    (cond
539     ((>= i end) (error "unterminated embedded SRE" str))
540     (else
541      (case (string-ref str i)
542        ((#\()
543         (let lp ((i (+ i 1)) (ls '()))
544           (read
545            i
546            (lambda (x j)
547              (cond
548               ((eq? x close-token)
549                (k (reverse ls) j))
550               ((eq? x dot-token)
551                (if (null? ls)
552                    (error "bad dotted form" str)
553                    (read j (lambda (y j2)
554                              (read j2 (lambda (z j3)
555                                         (if (not (eq? z close-token))
556                                             (error "bad dotted form" str)
557                                             (k (append (reverse (cdr ls))
558                                                        (cons (car ls) y))
559                                                j3))))))))
560               (else
561                (lp j (cons x ls))))))))
562        ((#\))
563         (k close-token (+ i 1)))
564        ((#\;)
565         (let skip ((i (+ i 1)))
566           (if (or (>= i end) (eqv? #\newline (string-ref str i)))
567               (read (+ i 1) k)
568               (skip (+ i 1)))))
569        ((#\' #\`)
570         (read (+ i 1)
571           (lambda (sexp j)
572             (let ((q (if (eqv? #\' (string-ref str i)) 'quote 'quasiquote)))
573               (k (list q sexp) j)))))
574        ((#\,)
575         (let* ((at? (and (< (+ i 1) end) (eqv? #\@ (string-ref str (+ i 1)))))
576                (u (if at? 'uquote-splicing 'unquote))
577                (j (if at? (+ i 2) (+ i 1))))
578           (read j (lambda (sexp j) (k (list u sexp) j)))))
579        ((#\")
580         (let scan ((from (+ i 1)) (i (+ i 1)) (res '()))
581           (define (collect)
582             (if (= from i) res (cons (substring str from i) res)))
583           (if (>= i end)
584               (error "unterminated string in embeded SRE" str)
585               (case (string-ref str i)
586                 ((#\") (k (string-cat-reverse (collect)) (+ i 1)))
587                 ((#\\) (scan (+ i 1) (+ i 2) (collect)))
588                 (else (scan from (+ i 1) res))))))
589        ((#\#)
590         (case (string-ref str (+ i 1))
591           ((#\;)
592            (read (+ i 2) (lambda (sexp j) (read j k))))
593           ((#\\)
594            (read (+ i 2)
595              (lambda (sexp j)
596                (k (case sexp
597                     ((space) #\space)
598                     ((newline) #\newline)
599                     (else (let ((s (if (number? sexp)
600                                        (number->string sexp)
601                                        (symbol->string sexp))))
602                             (string-ref s 0))))
603                   j))))
604           ((#\t #\f)
605            (k (eqv? #\t (string-ref str (+ i 1))) (+ i 2)))
606           (else
607            (error "bad # syntax in simplified SRE" i))))
608        (else
609         (cond
610          ((char-whitespace? (string-ref str i))
611           (read (+ i 1) k))
612          (else ;; symbol/number
613           (let scan ((j (+ i 1)))
614             (cond
615              ((or (>= j end)
616                   (let ((c (string-ref str j)))
617                     (or (char-whitespace? c)
618                         (memv c '(#\; #\( #\) #\" #\# #\\)))))
619               (let ((str2 (substring str i j)))
620                 (k (or (string->number str2) (string->symbol str2)) j)))
621              (else (scan (+ j 1))))))))))))
622  (read i (lambda (res j)
623            (if (eq? res 'close-token)
624                (error "unexpected ')' in SRE" str j)
625                (proc res j)))))
626
627;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
628;;;; Parsing PCRE Strings
629
630(define ~save? 1)
631(define ~case-insensitive? 2)
632(define ~multi-line? 4)
633(define ~single-line? 8)
634(define ~ignore-space? 16)
635(define ~utf8? 32)
636
637(define (symbol-list->flags ls)
638  (let lp ((ls ls) (res ~none))
639    (if (not (pair? ls))
640        res
641        (lp (cdr ls)
642            (flag-join
643             res
644             (case (car ls)
645               ((i ci case-insensitive) ~case-insensitive?)
646               ((m multi-line) ~multi-line?)
647               ((s single-line) ~single-line?)
648               ((x ignore-space) ~ignore-space?)
649               ((u utf8) (if *allow-utf8-mode?* ~utf8? ~none))
650               (else #f)))))))
651
652(define (maybe-string->sre obj)
653  (if (string? obj) (string->sre obj) obj))
654
655(define (string->sre str . o)
656  (if (not (string? str)) (error "string->sre: expected a string" str))
657  (let ((end (string-length str))
658        (flags (symbol-list->flags o)))
659
660    (let lp ((i 0) (from 0) (flags flags) (res '()) (st '()))
661
662      ;; handle case sensitivity at the literal char/string level
663      (define (cased-char ch)
664        (if (and (flag-set? flags ~case-insensitive?)
665                 (char-alphabetic? ch))
666            `(or ,ch ,(char-altcase ch))
667            ch))
668      (define (cased-string str)
669        (if (flag-set? flags ~case-insensitive?)
670            (sre-sequence (map cased-char (string->list str)))
671            str))
672      ;; accumulate the substring from..i as literal text
673      (define (collect)
674        (if (= i from) res (cons (cased-string (substring str from i)) res)))
675      ;; like collect but breaks off the last single character when
676      ;; collecting literal data, as the argument to ?/*/+ etc.
677      (define (collect/single)
678        (let* ((utf8? (flag-set? flags ~utf8?))
679               (j (if (and utf8? (> i 1))
680                      (utf8-backup-to-initial-char str (- i 1))
681                      (- i 1))))
682          (cond
683           ((< j from)
684            res)
685           (else
686            (let ((c (cased-char (if utf8?
687                                     (utf8-string-ref str j (- i j))
688                                     (string-ref str j)))))
689              (cond
690               ((= j from)
691                (cons c res))
692               (else
693                (cons c
694                      (cons (cased-string (substring str from j))
695                            res)))))))))
696      ;; collects for use as a result, reversing and grouping OR
697      ;; terms, and some ugly tweaking of `function-like' groups and
698      ;; conditionals
699      (define (collect/terms)
700        (let* ((ls (collect))
701               (func
702                (and (pair? ls)
703                     (memq (last ls)
704                           '(atomic if look-ahead neg-look-ahead
705                                    look-behind neg-look-behind
706                                    => submatch-named
707                                    w/utf8 w/noutf8))))
708               (prefix (if (and func (memq (car func) '(=> submatch-named)))
709                           (list 'submatch-named (cadr (reverse ls)))
710                           (and func (list (car func)))))
711               (ls (if func
712                       (if (memq (car func) '(=> submatch-named))
713                           (reverse (cddr (reverse ls)))
714                           (reverse (cdr (reverse ls))))
715                       ls)))
716          (let lp ((ls ls) (term '()) (res '()))
717            (define (shift)
718              (cons (sre-sequence term) res))
719            (cond
720             ((null? ls)
721              (let* ((res (sre-alternate (shift)))
722                     (res (if (flag-set? flags ~save?)
723                              (list 'submatch res)
724                              res)))
725                (if prefix
726                    (if (eq? 'if (car prefix))
727                        (cond
728                         ((not (pair? res))
729                          'epsilon)
730                         ((memq (car res)
731                                '(look-ahead neg-look-ahead
732                                             look-behind neg-look-behind))
733                          res)
734                         ((eq? 'seq (car res))
735                          `(if ,(cadr res)
736                               ,(if (pair? (cdr res))
737                                    (sre-sequence (cddr res))
738                                    'epsilon)))
739                         (else
740                          `(if ,(cadadr res)
741                               ,(if (pair? (cdr res))
742                                    (sre-sequence (cddadr res))
743                                    'epsilon)
744                               ,(sre-alternate
745                                 (if (pair? (cdr res)) (cddr res) '())))))
746                        `(,@prefix ,res))
747                    res)))
748             ((eq? 'or (car ls)) (lp (cdr ls) '() (shift)))
749             (else (lp (cdr ls) (cons (car ls) term) res))))))
750      (define (save)
751        (cons (cons flags (collect)) st))
752
753      ;; main parsing
754      (if (>= i end)
755          (if (pair? st)
756              (error "unterminated parenthesis in regexp" str)
757              (collect/terms))
758          (let ((c (string-ref str i)))
759            (case c
760              ((#\.)
761               (lp (+ i 1) (+ i 1) flags
762                   (cons (if (flag-set? flags ~single-line?) 'any 'nonl)
763                         (collect))
764                   st))
765              ((#\?)
766               (let ((res (collect/single)))
767                 (if (null? res)
768                     (error "? can't follow empty pattern" str res)
769                     (let ((x (car res)))
770                       (lp (+ i 1)
771                           (+ i 1)
772                           flags
773                           (cons
774                            (if (pair? x)
775                                (case (car x)
776                                  ((*)  `(*? ,@(cdr x)))
777                                  ((+)  `(**? 1 #f ,@(cdr x)))
778                                  ((?)  `(?? ,@(cdr x)))
779                                  ((**) `(**? ,@(cdr x)))
780                                  ((=)  `(**? ,(cadr x) ,@(cdr x)))
781                                  ((>=)  `(**? ,(cadr x) #f ,@(cddr x)))
782                                  (else `(? ,x)))
783                                `(? ,x))
784                            (cdr res))
785                           st)))))
786              ((#\+ #\*)
787               (let* ((res (collect/single))
788                      (x (if (pair? res) (car res) 'epsilon))
789                      (op (string->symbol (string c))))
790                 (cond
791                  ((sre-repeater? x)
792                   (error "duplicate repetition (e.g. **) in pattern" str res))
793                  ((sre-empty? x)
794                   (error "can't repeat empty pattern (e.g. ()*)" str res))
795                  (else
796                   (lp (+ i 1) (+ i 1) flags
797                       (cons (list op x) (cdr res))
798                       st)))))
799              ((#\()
800               (cond
801                ((>= (+ i 1) end)
802                 (error "unterminated parenthesis in regexp" str))
803                ((not (memv (string-ref str (+ i 1)) '(#\? #\*))) ; normal case
804                 (lp (+ i 1) (+ i 1) (flag-join flags ~save?) '() (save)))
805                ((>= (+ i 2) end)
806                 (error "unterminated parenthesis in regexp" str))
807                ((eqv? (string-ref str (+ i 1)) #\*)
808                 (if (eqv? #\' (string-ref str (+ i 2)))
809                     (with-read-from-string str (+ i 3)
810                       (lambda (sre j)
811                         (if (or (>= j end) (not (eqv? #\) (string-ref str j))))
812                             (error "unterminated (*'...) SRE escape" str)
813                             (lp (+ j 1) (+ j 1) flags (cons sre (collect)) st))))
814                     (error "bad regexp syntax: (*FOO) not supported" str)))
815                (else                   ;; (?...) case
816                 (case (string-ref str (+ i 2))
817                   ((#\#)
818                    (let ((j (string-scan-char str #\) (+ i 3))))
819                      (lp (+ j i) (+ j 1) flags (collect) st)))
820                   ((#\:)
821                    (lp (+ i 3) (+ i 3) (flag-clear flags ~save?) '() (save)))
822                   ((#\=)
823                    (lp (+ i 3) (+ i 3) (flag-clear flags ~save?)
824                        '(look-ahead) (save)))
825                   ((#\!)
826                    (lp (+ i 3) (+ i 3) (flag-clear flags ~save?)
827                        '(neg-look-ahead) (save)))
828                   ((#\<)
829                    (cond
830                     ((>= (+ i 3) end)
831                      (error "unterminated parenthesis in regexp" str))
832                     (else
833                      (case (string-ref str (+ i 3))
834                        ((#\=)
835                         (lp (+ i 4) (+ i 4) (flag-clear flags ~save?)
836                             '(look-behind) (save)))
837                        ((#\!)
838                         (lp (+ i 4) (+ i 4) (flag-clear flags ~save?)
839                             '(neg-look-behind) (save)))
840                        (else
841                         (let ((j (and (char-alphabetic?
842                                        (string-ref str (+ i 3)))
843                                       (string-scan-char str #\> (+ i 4)))))
844                           (if j
845                               (lp (+ j 1) (+ j 1) (flag-clear flags ~save?)
846                                   `(,(string->symbol (substring str (+ i 3) j))
847                                     submatch-named)
848                                   (save))
849                               (error "invalid (?< sequence" str))))))))
850                   ((#\>)
851                    (lp (+ i 3) (+ i 3) (flag-clear flags ~save?)
852                        '(atomic) (save)))
853                   ;;((#\' #\P) ; named subpatterns
854                   ;; )
855                   ;;((#\R) ; recursion
856                   ;; )
857                   ((#\()
858                    (cond
859                     ((>= (+ i 3) end)
860                      (error "unterminated parenthesis in regexp" str))
861                     ((char-numeric? (string-ref str (+ i 3)))
862                      (let* ((j (string-scan-char str #\) (+ i 3)))
863                             (n (string->number (substring str (+ i 3) j))))
864                        (if (not n)
865                            (error "invalid conditional reference" str)
866                            (lp (+ j 1) (+ j 1) (flag-clear flags ~save?)
867                                `(,n if) (save)))))
868                     ((char-alphabetic? (string-ref str (+ i 3)))
869                      (let* ((j (string-scan-char str #\) (+ i 3)))
870                             (s (string->symbol (substring str (+ i 3) j))))
871                        (lp (+ j 1) (+ j 1) (flag-clear flags ~save?)
872                            `(,s if) (save))))
873                     (else
874                      (lp (+ i 2) (+ i 2) (flag-clear flags ~save?)
875                          '(if) (save)))))
876                   ((#\{)
877                    (error "unsupported Perl-style cluster" str))
878                   (else
879                    (let ((old-flags flags))
880                      (let lp2 ((j (+ i 2)) (flags flags) (invert? #f))
881                        (define (join x)
882                          ((if invert? flag-clear flag-join) flags x))
883                        (define (new-res res)
884                          (let ((before (flag-set? old-flags ~utf8?))
885                                (after (flag-set? flags ~utf8?)))
886                            (if (eq? before after)
887                                res
888                                (cons (if after 'w/utf8 'w/noutf8) res))))
889                        (cond
890                         ((>= j end)
891                          (error "incomplete cluster" str i))
892                         (else
893                          (case (string-ref str j)
894                            ((#\i)
895                             (lp2 (+ j 1) (join ~case-insensitive?) invert?))
896                            ((#\m)
897                             (lp2 (+ j 1) (join ~multi-line?) invert?))
898                            ((#\x)
899                             (lp2 (+ j 1) (join ~ignore-space?) invert?))
900                            ((#\u)
901                             (if *allow-utf8-mode?*
902                                 (lp2 (+ j 1) (join ~utf8?) invert?)
903                                 (lp2 (+ j 1) flags invert?)))
904                            ((#\-)
905                             (lp2 (+ j 1) flags (not invert?)))
906                            ((#\))
907                             (lp (+ j 1) (+ j 1) flags (new-res (collect))
908                                 st))
909                            ((#\:)
910                             (lp (+ j 1) (+ j 1) flags (new-res '())
911                                 (cons (cons old-flags (collect)) st)))
912                            (else
913                             (error "unknown regex cluster modifier" str)
914                             )))))))))))
915              ((#\))
916               (if (null? st)
917                   (error "too many )'s in regexp" str)
918                   (lp (+ i 1)
919                       (+ i 1)
920                       (caar st)
921                       (cons (collect/terms) (cdar st))
922                       (cdr st))))
923              ((#\[)
924               (apply
925                (lambda (sre j)
926                  (lp (+ j 1) (+ j 1) flags (cons sre (collect)) st))
927                (string-parse-cset str (+ i 1) flags)))
928              ((#\{)
929               (cond
930                ((or (>= (+ i 1) end)
931                     (not (or (char-numeric? (string-ref str (+ i 1)))
932                              (eqv? #\, (string-ref str (+ i 1))))))
933                 (lp (+ i 1) from flags res st))
934                (else
935                 (let ((res (collect/single)))
936                   (cond
937                    ((null? res)
938                     (error "{ can't follow empty pattern"))
939                    (else
940                     (let* ((x (car res))
941                            (tail (cdr res))
942                            (j (string-scan-char str #\} (+ i 1)))
943                            (s2 (string-split-char (substring str (+ i 1) j)
944                                                   #\,))
945                            (n (string->number (car s2)))
946                            (m (and (pair? (cdr s2))
947                                    (string->number (cadr s2)))))
948                       (cond
949                        ((or (not n)
950                             (and (pair? (cdr s2))
951                                  (not (equal? "" (cadr s2)))
952                                  (not m)))
953                         (error "invalid {n} repetition syntax" s2))
954                        ((null? (cdr s2))
955                         (lp (+ j 1) (+ j 1) flags `((= ,n ,x) ,@tail) st))
956                        (m
957                         (lp (+ j 1) (+ j 1) flags `((** ,n ,m ,x) ,@tail) st))
958                        (else
959                         (lp (+ j 1) (+ j 1) flags `((>= ,n ,x) ,@tail) st)
960                         )))))))))
961              ((#\\)
962               (cond
963                ((>= (+ i 1) end)
964                 (error "incomplete escape sequence" str))
965                (else
966                 (let ((c (string-ref str (+ i 1))))
967                   (case c
968                     ((#\d)
969                      (lp (+ i 2) (+ i 2) flags `(numeric ,@(collect)) st))
970                     ((#\D)
971                      (lp (+ i 2) (+ i 2) flags `((~ numeric) ,@(collect)) st))
972                     ((#\s)
973                      (lp (+ i 2) (+ i 2) flags `(space ,@(collect)) st))
974                     ((#\S)
975                      (lp (+ i 2) (+ i 2) flags `((~ space) ,@(collect)) st))
976                     ((#\w)
977                      (lp (+ i 2) (+ i 2) flags
978                          `((or alphanumeric ("_")) ,@(collect)) st))
979                     ((#\W)
980                      (lp (+ i 2) (+ i 2) flags
981                          `((~ (or alphanumeric ("_"))) ,@(collect)) st))
982                     ((#\b)
983                      (lp (+ i 2) (+ i 2) flags
984                          `((or bow eow) ,@(collect)) st))
985                     ((#\B)
986                      (lp (+ i 2) (+ i 2) flags `(nwb ,@(collect)) st))
987                     ((#\A)
988                      (lp (+ i 2) (+ i 2) flags `(bos ,@(collect)) st))
989                     ((#\Z)
990                      (lp (+ i 2) (+ i 2) flags
991                          `((? #\newline) eos ,@(collect)) st))
992                     ((#\z)
993                      (lp (+ i 2) (+ i 2) flags `(eos ,@(collect)) st))
994                     ((#\R)
995                      (lp (+ i 2) (+ i 2) flags `(newline ,@(collect)) st))
996                     ((#\K)
997                      (lp (+ i 2) (+ i 2) flags `(reset ,@(collect)) st))
998                     ;; these two are from Emacs and TRE, but not in PCRE
999                     ((#\<)
1000                      (lp (+ i 2) (+ i 2) flags `(bow ,@(collect)) st))
1001                     ((#\>)
1002                      (lp (+ i 2) (+ i 2) flags `(eow ,@(collect)) st))
1003                     ((#\x)
1004                      (apply
1005                       (lambda (ch j)
1006                         (lp (+ j 1) (+ j 1) flags `(,ch ,@(collect)) st))
1007                       (string-parse-hex-escape str (+ i 2) end)))
1008                     ((#\k)
1009                      (let ((c (string-ref str (+ i 2))))
1010                        (if (not (memv c '(#\< #\{ #\')))
1011                            (error "bad \\k usage, expected \\k<...>" str)
1012                            (let* ((terminal (char-mirror c))
1013                                   (j (string-scan-char str terminal (+ i 2)))
1014                                   (s (and j (substring str (+ i 3) j)))
1015                                   (backref
1016                                    (if (flag-set? flags ~case-insensitive?)
1017                                        'backref-ci
1018                                        'backref)))
1019                              (if (not j)
1020                                  (error "unterminated named backref" str)
1021                                  (lp (+ j 1) (+ j 1) flags
1022                                      `((,backref ,(string->symbol s))
1023                                        ,@(collect))
1024                                      st))))))
1025                     ((#\Q)  ;; \Q..\E escapes
1026                      (let ((res (collect)))
1027                        (let lp2 ((j (+ i 2)))
1028                          (cond
1029                           ((>= j end)
1030                            (lp j (+ i 2) flags res st))
1031                           ((eqv? #\\ (string-ref str j))
1032                            (cond
1033                             ((>= (+ j 1) end)
1034                              (lp (+ j 1) (+ i 2) flags res st))
1035                             ((eqv? #\E (string-ref str (+ j 1)))
1036                              (lp (+ j 2) (+ j 2) flags
1037                                  (cons (substring str (+ i 2) j) res) st))
1038                             (else
1039                              (lp2 (+ j 2)))))
1040                           (else
1041                            (lp2 (+ j 1)))))))
1042                     ((#\')
1043                      (with-read-from-string str (+ i 2)
1044                       (lambda (sre j)
1045                         (lp j j flags (cons sre (collect)) st))))
1046                     ;;((#\p)  ; XXXX unicode properties
1047                     ;; )
1048                     ;;((#\P)
1049                     ;; )
1050                     (else
1051                      (cond
1052                       ((char-numeric? c)
1053                        (let* ((j (or (string-scan-pred
1054                                       str
1055                                       (lambda (c) (not (char-numeric? c)))
1056                                       (+ i 2))
1057                                      end))
1058                               (backref
1059                                (if (flag-set? flags ~case-insensitive?)
1060                                    'backref-ci
1061                                    'backref))
1062                               (res `((,backref ,(string->number
1063                                                  (substring str (+ i 1) j)))
1064                                      ,@(collect))))
1065                          (lp j j flags res st)))
1066                       ((char-alphabetic? c)
1067                        (let ((cell (assv c posix-escape-sequences)))
1068                          (if cell
1069                              (lp (+ i 2) (+ i 2) flags
1070                                  (cons (cdr cell) (collect)) st)
1071                              (error "unknown escape sequence" str c))))
1072                       (else
1073                        (lp (+ i 2) (+ i 1) flags (collect) st)))))))))
1074              ((#\|)
1075               (lp (+ i 1) (+ i 1) flags (cons 'or (collect)) st))
1076              ((#\^)
1077               (let ((sym (if (flag-set? flags ~multi-line?) 'bol 'bos)))
1078                 (lp (+ i 1) (+ i 1) flags (cons sym (collect)) st)))
1079              ((#\$)
1080               (let ((sym (if (flag-set? flags ~multi-line?) 'eol 'eos)))
1081                 (lp (+ i 1) (+ i 1) flags (cons sym (collect)) st)))
1082              ((#\space)
1083               (if (flag-set? flags ~ignore-space?)
1084                   (lp (+ i 1) (+ i 1) flags (collect) st)
1085                   (lp (+ i 1) from flags res st)))
1086              ((#\#)
1087               (if (flag-set? flags ~ignore-space?)
1088                   (let ((j (or (string-scan-char str #\newline (+ i 1))
1089                                (- end 1))))
1090                     (lp (+ j 1) (+ j 1) flags (collect) st))
1091                   (lp (+ i 1) from flags res st)))
1092              (else
1093               (lp (+ i 1) from flags res st))))))))
1094
1095(define posix-escape-sequences
1096  `((#\n . #\newline)
1097    (#\r . ,(integer->char (+ (char->integer #\newline) 3)))
1098    (#\t . ,(integer->char (- (char->integer #\newline) 1)))
1099    (#\a . ,(integer->char (- (char->integer #\newline) 3)))
1100    (#\e . ,(integer->char (+ (char->integer #\newline) #x11)))
1101    (#\f . ,(integer->char (+ (char->integer #\newline) 2)))
1102    ))
1103
1104(define (char-altcase c)
1105  (if (char-upper-case? c) (char-downcase c) (char-upcase c)))
1106
1107(define (char-mirror c)
1108  (case c ((#\<) #\>) ((#\{) #\}) ((#\() #\)) ((#\[) #\]) (else c)))
1109
1110(define (string-parse-hex-escape str i end)
1111  (cond
1112   ((>= i end)
1113    (error "incomplete hex escape" str i))
1114   ((eqv? #\{ (string-ref str i))
1115    (let ((j (string-scan-char-escape str #\} (+ i 1))))
1116      (if (not j)
1117          (error "incomplete hex brace escape" str i)
1118          (let* ((s (substring str (+ i 1) j))
1119                 (n (string->number s 16)))
1120            (if n
1121                (list (integer->char n) j)
1122                (error "bad hex brace escape" s))))))
1123   ((>= (+ i 1) end)
1124    (error "incomplete hex escape" str i))
1125   (else
1126    (let* ((s (substring str i (+ i 2)))
1127           (n (string->number s 16)))
1128      (if n
1129          (list (integer->char n) (+ i 2))
1130          (error "bad hex escape" s))))))
1131
1132(define (string-parse-cset str start flags)
1133  (let* ((end (string-length str))
1134         (invert? (and (< start end) (eqv? #\^ (string-ref str start))))
1135         (utf8? (flag-set? flags ~utf8?)))
1136    (define (go i prev-char cset)
1137      (if (>= i end)
1138          (error "incomplete char set" str i end)
1139          (let ((c (string-ref str i)))
1140            (case c
1141              ((#\])
1142               (if (cset-empty? cset)
1143                   (go (+ i 1) #\] (cset-adjoin cset #\]))
1144                   (let ((ci? (flag-set? flags ~case-insensitive?)))
1145                     (list
1146                      (let ((res (if ci? (cset-case-insensitive cset) cset)))
1147                        (cset->sre (if invert? (cset-complement res) res)))
1148                      i))))
1149              ((#\-)
1150               (cond
1151                ((or (= i start)
1152                     (and (= i (+ start 1)) (eqv? #\^ (string-ref str start)))
1153                     (eqv? #\] (string-ref str (+ i 1))))
1154                 (go (+ i 1) c (cset-adjoin cset c)))
1155                ((not prev-char)
1156                 (error "bad char-set"))
1157                (else
1158                 (let ((char (string-ref str (+ i 1))))
1159                   (apply
1160                    (lambda (c j)
1161                      (if (char<? c prev-char)
1162                          (error "inverted range in char-set" prev-char c)
1163                          (go j #f (cset-union cset (range->cset prev-char c)))))
1164                    (cond
1165                     ((and (eqv? #\\ char) (assv char posix-escape-sequences))
1166                      => (lambda (x) (list (cdr x) (+ i 3))))
1167                     ((and (eqv? #\\ char)
1168                           (eqv? (string-ref str (+ i 2)) #\x))
1169                      (string-parse-hex-escape str (+ i 3) end))
1170                     ((and utf8? (<= #x80 (char->integer char) #xFF))
1171                      (let ((len (utf8-start-char->length char)))
1172                        (list (utf8-string-ref str (+ i 1) len) (+ i 1 len))))
1173                     (else
1174                      (list char (+ i 2)))))))))
1175              ((#\[)
1176               (let* ((inv? (eqv? #\^ (string-ref str (+ i 1))))
1177                      (i2 (if inv? (+ i 2) (+ i 1))))
1178                 (case (string-ref str i2)
1179                   ((#\:)
1180                    (let ((j (string-scan-char str #\: (+ i2 1))))
1181                      (if (or (not j) (not (eqv? #\] (string-ref str (+ j 1)))))
1182                          (error "incomplete character class" str)
1183                          (let* ((class (sre->cset
1184                                         (string->symbol
1185                                          (substring str (+ i2 1) j))))
1186                                 (class (if inv? (cset-complement class) class)))
1187                            (go (+ j 2) #f (cset-union cset class))))))
1188                   ((#\= #\.)
1189                    (error "collating sequences not supported" str))
1190                   (else
1191                    (go (+ i 1) #\[ (cset-adjoin cset #\[))))))
1192              ((#\\)
1193               (let ((c (string-ref str (+ i 1))))
1194                 (case c
1195                   ((#\d #\D #\s #\S #\w #\W)
1196                    (go (+ i 2) #f
1197                        (cset-union cset
1198                                    (sre->cset (string->sre (string #\\ c))))))
1199                   ((#\x)
1200                    (apply
1201                     (lambda (ch j)
1202                       (go j ch (cset-adjoin cset ch)))
1203                     (string-parse-hex-escape str (+ i 2) end)))
1204                   (else
1205                    (let ((c (cond ((assv c posix-escape-sequences) => cdr)
1206                                   (else c))))
1207                      (go (+ i 2) c (cset-adjoin cset c)))))))
1208              (else
1209               (if (and utf8? (<= #x80 (char->integer c) #xFF))
1210                   (let ((len (utf8-start-char->length c)))
1211                     (go (+ i len)
1212                         (utf8-string-ref str i len)
1213                         (cset-adjoin cset (utf8-string-ref str i len))))
1214                   (go (+ i 1) c (cset-adjoin cset c))))))))
1215    (if invert?
1216        (go (+ start 1)
1217            #f
1218            (if (flag-set? flags ~multi-line?)
1219                (char->cset #\newline)
1220                (make-cset)))
1221        (go start #f (make-cset)))))
1222
1223;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1224;;;; UTF-8 Utilities
1225
1226;; Here are some hairy optimizations that need to be documented
1227;; better.  Thanks to these, we never do any utf8 processing once the
1228;; regexp is compiled.
1229
1230;; two chars: ab..ef
1231;;            a[b..xFF]|[b-d][x80..xFF]|e[x80..xFF]
1232
1233;; three chars: abc..ghi
1234;;              ab[c..xFF]|a[d..xFF][x80..xFF]|
1235;;              [b..f][x80..xFF][x80..xFF]|
1236;;              g[x80..g][x80..xFF]|gh[x80..i]
1237
1238;; four chars: abcd..ghij
1239;;             abc[d..xFF]|ab[d..xFF][x80..xFF]|a[c..xFF][x80..xFF][x80..xFF]|
1240;;             [b..f][x80..xFF][x80..xFF][x80..xFF]|
1241;;             g[x80..g][x80..xFF][x80..xFF]|gh[x80..h][x80..xFF]|ghi[x80..j]
1242
1243(define (high-char? c) (<= #x80 (char->integer c)))
1244
1245;; number of total bytes in a utf8 char given the 1st byte
1246
1247(define utf8-start-char->length
1248  (let ((table '#(
12491 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 0x
12501 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 1x
12511 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 2x
12521 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 3x
12531 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 4x
12541 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 5x
12551 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 6x
12561 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 7x
12571 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 8x
12581 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; 9x
12591 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; ax
12601 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ; bx
12612 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 ; cx
12622 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 ; dx
12633 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 ; ex
12644 4 4 4 4 4 4 4 5 5 5 5 6 6 0 0 ; fx
1265)))
1266    (lambda (c) (vector-ref table (char->integer c)))))
1267
1268(define (utf8-string-ref str i len) str)
1269
1270(define (utf8-backup-to-initial-char str i) str)
1271
1272(define (utf8-lowest-digit-of-length len)
1273  (case len
1274    ((1) 0) ((2) #xC0) ((3) #xE0) ((4) #xF0)
1275    (else (error "invalid utf8 length" len))))
1276
1277(define (utf8-highest-digit-of-length len)
1278  (case len
1279    ((1) #x7F) ((2) #xDF) ((3) #xEF) ((4) #xF7)
1280    (else (error "invalid utf8 length" len))))
1281
1282#| ;; NMOSH: we don't need this one
1283;; Maybe this should just modify the input?
1284(define (cset->utf8-pattern cset)
1285  (let lp ((ls (cset->plist cset)) (alts '()) (lo-cset '()))
1286    (if (null? ls)
1287        (sre-alternate (append (reverse alts)
1288                               (if (null? lo-cset)
1289                                   '()
1290                                   (list (cons '/ (reverse lo-cset))))))
1291        (if (or (high-char? (car ls))  (high-char? (cadr ls)))
1292            (lp (cddr ls)
1293                (cons (unicode-range->utf8-pattern (car ls) (cadr ls)) alts)
1294                lo-cset)
1295            (lp (cddr ls) alts (cons (cadr ls) (cons (car ls) lo-cset)))))))
1296|#
1297
1298(define (sre-adjust-utf8 sre flags) sre)
1299
1300;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1301;;;; Compilation
1302
1303(define (irregex x . o)
1304  (cond
1305   ((irregex? x) x)
1306   ((string? x) (apply string->irregex x o))
1307   (else (apply sre->irregex x o))))
1308
1309(define (string->irregex str . o)
1310  (apply sre->irregex (apply string->sre str o) o))
1311
1312(define (sre->irregex sre . o)
1313  (let* ((pat-flags (symbol-list->flags o))
1314         (sre (if *allow-utf8-mode?*
1315                  (sre-adjust-utf8 sre pat-flags)
1316                  sre))
1317         (searcher? (sre-searcher? sre))
1318         (sre-dfa (if searcher? (sre-remove-initial-bos sre) sre))
1319         (dfa-limit (cond ((memq 'small o) 1) ((memq 'fast o) 50) (else 10)))
1320         (dfa/search
1321          (cond ((memq 'backtrack o) #f)
1322                (searcher? #t)
1323                ((sre->nfa `(seq (* any) ,sre-dfa) pat-flags)
1324                 => (lambda (nfa)
1325                      (nfa->dfa nfa (* dfa-limit (nfa-num-states nfa)))))
1326                (else #f)))
1327         (dfa (cond ((and dfa/search (sre->nfa sre-dfa pat-flags))
1328                     => (lambda (nfa)
1329                          (nfa->dfa nfa (* dfa-limit (nfa-num-states nfa)))))
1330                    (else #f)))
1331         (submatches (sre-count-submatches sre-dfa))
1332         (extractor
1333          (and dfa dfa/search (sre-match-extractor sre-dfa submatches)))
1334         (names (sre-names sre-dfa 1 '()))
1335         (lens (sre-length-ranges sre-dfa names))
1336         (flags (flag-join
1337                 (flag-join ~none (and searcher? ~searcher?))
1338                 (and (sre-consumer? sre) ~consumer?))))
1339    (cond
1340     (dfa
1341      (make-irregex dfa dfa/search extractor #f flags submatches lens names))
1342     (else
1343      (let ((f (sre->procedure sre pat-flags names)))
1344        (make-irregex #f #f #f f flags submatches lens names))))))
1345
1346;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1347;;;; SRE Analysis
1348
1349;; returns #t if the sre can ever be empty
1350(define (sre-empty? sre)
1351  (if (pair? sre)
1352      (case (car sre)
1353        ((* ? look-ahead look-behind neg-look-ahead neg-look-behind) #t)
1354        ((**) (or (not (number? (cadr sre))) (zero? (cadr sre))))
1355        ((or) (any sre-empty? (cdr sre)))
1356        ((: seq $ submatch => submatch-named + atomic)
1357         (every sre-empty? (cdr sre)))
1358        (else #f))
1359      (memq sre '(epsilon bos eos bol eol bow eow commit))))
1360
1361(define (sre-any? sre)
1362  (or (eq? sre 'any)
1363      (and (pair? sre)
1364           (case (car sre)
1365             ((seq : $ submatch => submatch-named)
1366              (and (pair? (cdr sre)) (null? (cddr sre)) (sre-any? (cadr sre))))
1367             ((or) (every sre-any? (cdr sre)))
1368             (else #f)))))
1369
1370(define (sre-repeater? sre)
1371  (and (pair? sre)
1372       (or (memq (car sre) '(* +))
1373           (and (memq (car sre) '($ submatch => submatch-named seq :))
1374                (pair? (cdr sre))
1375                (null? (cddr sre))
1376                (sre-repeater? (cadr sre))))))
1377
1378(define (sre-searcher? sre)
1379  (if (pair? sre)
1380      (case (car sre)
1381        ((* +) (sre-any? (sre-sequence (cdr sre))))
1382        ((seq : $ submatch => submatch-named)
1383         (and (pair? (cdr sre)) (sre-searcher? (cadr sre))))
1384        ((or) (every sre-searcher? (cdr sre)))
1385        (else #f))
1386      (eq? 'bos sre)))
1387
1388(define (sre-consumer? sre)
1389  (if (pair? sre)
1390      (case (car sre)
1391        ((* +) (sre-any? (sre-sequence (cdr sre))))
1392        ((seq : $ submatch => submatch-named)
1393         (and (pair? (cdr sre)) (sre-consumer? (last sre))))
1394        ((or) (every sre-consumer? (cdr sre)))
1395        (else #f))
1396      (eq? 'eos sre)))
1397
1398(define (sre-has-submatches? sre)
1399  (and (pair? sre)
1400       (or (memq (car sre) '($ submatch => submatch-named))
1401           (if (eq? 'posix-string (car sre))
1402               (sre-has-submatches? (string->sre (cadr sre)))
1403               (any sre-has-submatches? (cdr sre))))))
1404
1405(define (sre-count-submatches sre)
1406  (let count ((sre sre) (sum 0))
1407    (if (pair? sre)
1408        (fold count
1409              (+ sum (case (car sre)
1410                       (($ submatch => submatch-named) 1)
1411                       ((dsm) (+ (cadr sre) (caddr sre)))
1412                       ((posix-string)
1413                        (sre-count-submatches (string->sre (cadr sre))))
1414                       (else 0)))
1415              (cdr sre))
1416        sum)))
1417
1418(define (sre-length-ranges sre . o)
1419  (let ((names (if (pair? o) (car o) (sre-names sre 1 '())))
1420        (sublens (make-vector (+ 1 (sre-count-submatches sre)) #f)))
1421    (vector-set!
1422     sublens
1423     0
1424     (let lp ((sre sre) (n 1) (lo 0) (hi 0) (return cons))
1425       (define (grow i) (return (+ lo i) (and hi (+ hi i))))
1426       (cond
1427        ((pair? sre)
1428         (if (string? (car sre))
1429             (grow 1)
1430             (case (car sre)
1431               ((/ ~ & -)
1432                (grow 1))
1433               ((posix-string)
1434                (lp (string->sre (cadr sre)) n lo hi return))
1435               ((seq : w/case w/nocase atomic)
1436                (let lp2 ((ls (cdr sre)) (n n) (lo2 0) (hi2 0))
1437                  (if (null? ls)
1438                      (return (+ lo lo2) (and hi hi2 (+ hi hi2)))
1439                      (lp (car ls) n 0 0
1440                          (lambda (lo3 hi3)
1441                            (lp2 (cdr ls)
1442                                 (+ n (sre-count-submatches (car ls)))
1443                                 (+ lo2 lo3)
1444                                 (and hi2 hi3 (+ hi2 hi3))))))))
1445               ((or)
1446                (let lp2 ((ls (cdr sre)) (n n) (lo2 #f) (hi2 0))
1447                  (if (null? ls)
1448                      (return (+ lo (or lo2 1)) (and hi hi2 (+ hi hi2)))
1449                      (lp (car ls) n 0 0
1450                          (lambda (lo3 hi3)
1451                            (lp2 (cdr ls)
1452                                 (+ n (sre-count-submatches (car ls)))
1453                                 (if lo2 (min lo2 lo3) lo3)
1454                                 (and hi2 hi3 (max hi2 hi3))))))))
1455               ((if)
1456                (cond
1457                 ((or (null? (cdr sre)) (null? (cddr sre)))
1458                  (return lo hi))
1459                 (else
1460                  (let ((n1 (sre-count-submatches (car sre)))
1461                        (n2 (sre-count-submatches (cadr sre))))
1462                    (lp (if (or (number? (cadr sre)) (symbol? (cadr sre)))
1463                            'epsilon
1464                            (cadr sre))
1465                        n lo hi
1466                        (lambda (lo2 hi2)
1467                          (lp (caddr sre) (+ n n1) 0 0
1468                              (lambda (lo3 hi3)
1469                                (lp (if (pair? (cdddr sre))
1470                                        (cadddr sre)
1471                                        'epsilon)
1472                                    (+ n n1 n2) 0 0
1473                                    (lambda (lo4 hi4)
1474                                      (return (+ lo2 (min lo3 lo4))
1475                                              (and hi2 hi3 hi4
1476                                                   (+ hi2 (max hi3 hi4))
1477                                                   ))))))))))))
1478               ((dsm)
1479                (lp (sre-sequence (cdddr sre)) (+ n (cadr sre)) lo hi return))
1480               (($ submatch => submatch-named)
1481                (lp (sre-sequence
1482                     (if (eq? 'submatch (car sre)) (cdr sre) (cddr sre)))
1483                    (+ n 1) lo hi
1484                    (lambda (lo2 hi2)
1485                      (vector-set! sublens n (cons lo2 hi2))
1486                      (return lo2 hi2))))
1487               ((backref backref-ci)
1488                (let ((n (cond
1489                          ((number? (cadr sre)) (cadr sre))
1490                          ((assq (cadr sre) names) => cdr)
1491                          (else (error "unknown backreference" (cadr sre))))))
1492                  (cond
1493                   ((or (not (integer? n))
1494                        (not (< 0 n (vector-length sublens))))
1495                    (error "sre-length: invalid backreference" sre))
1496                   ((not (vector-ref sublens n))
1497                    (error "sre-length: invalid forward backreference" sre))
1498                   (else
1499                    (let ((lo2 (car (vector-ref sublens n)))
1500                          (hi2 (cdr (vector-ref sublens n))))
1501                      (return (+ lo lo2) (and hi hi2 (+ hi hi2))))))))
1502               ((* *?)
1503                (lp (sre-sequence (cdr sre)) n lo hi (lambda (lo hi) #f))
1504                (return lo #f))
1505               ((** **?)
1506                (cond
1507                 ((or (and (number? (cadr sre))
1508                           (number? (caddr sre))
1509                           (> (cadr sre) (caddr sre)))
1510                      (and (not (cadr sre)) (caddr sre)))
1511                  (return lo hi))
1512                 (else
1513                  (if (caddr sre)
1514                      (lp (sre-sequence (cdddr sre)) n 0 0
1515                          (lambda (lo2 hi2)
1516                            (return (+ lo (* (cadr sre) lo2))
1517                                    (and hi hi2 (+ hi (* (caddr sre) hi2))))))
1518                      (lp (sre-sequence (cdddr sre)) n 0 0
1519                          (lambda (lo2 hi2)
1520                            (return (+ lo (* (cadr sre) lo2)) #f)))))))
1521               ((+)
1522                (lp (sre-sequence (cdr sre)) n lo hi
1523                    (lambda (lo2 hi2)
1524                      (return (+ lo lo2) #f))))
1525               ((? ??)
1526                (lp (sre-sequence (cdr sre)) n lo hi
1527                    (lambda (lo2 hi2)
1528                      (return lo (and hi hi2 (+ hi hi2))))))
1529               ((= =? >= >=?)
1530                (lp `(** ,(cadr sre)
1531                         ,(if (memq (car sre) '(>= >=?)) #f (cadr sre))
1532                         ,@(cddr sre))
1533                    n lo hi return))
1534               ((look-ahead neg-look-ahead look-behind neg-look-behind)
1535                (return lo hi))
1536               (else
1537                (cond
1538                 ((assq (car sre) sre-named-definitions)
1539                  => (lambda (cell)
1540                       (lp (apply (cdr cell) (cdr sre)) n lo hi return)))
1541                 (else
1542                  (error "sre-length-ranges: unknown sre operator" sre)))))))
1543        ((char? sre)
1544         (grow 1))
1545        ((string? sre)
1546         (grow (string-length sre)))
1547        ((memq sre '(any nonl))
1548         (grow 1))
1549        ((memq sre '(epsilon bos eos bol eol bow eow nwb commit))
1550         (return lo hi))
1551        (else
1552         (let ((cell (assq sre sre-named-definitions)))
1553           (if cell
1554               (lp (if (procedure? (cdr cell)) ((cdr cell)) (cdr cell))
1555                   n lo hi return)
1556               (error "sre-length-ranges: unknown sre" sre)))))))
1557    sublens))
1558
1559;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1560;;;; SRE Manipulation
1561
1562;; build a (seq ls ...) sre from a list
1563(define (sre-sequence ls)
1564  (cond
1565   ((null? ls) 'epsilon)
1566   ((null? (cdr ls)) (car ls))
1567   (else (cons 'seq ls))))
1568
1569;; build a (or ls ...) sre from a list
1570(define (sre-alternate ls)
1571  (cond
1572   ((null? ls) '(or))
1573   ((null? (cdr ls)) (car ls))
1574   (else (cons 'or ls))))
1575
1576;; returns an equivalent SRE without any match information
1577(define (sre-strip-submatches sre)
1578  (if (not (pair? sre))
1579      sre
1580      (case (car sre)
1581        (($ submatch) (sre-strip-submatches (sre-sequence (cdr sre))))
1582        ((=> submatch-named) (sre-strip-submatches (sre-sequence (cddr sre))))
1583        ((dsm) (sre-strip-submatches (sre-sequence (cdddr sre))))
1584        (else (map sre-strip-submatches sre)))))
1585
1586;; given a char-set list of chars and strings, flattens them into
1587;; chars only
1588(define (sre-flatten-ranges ls)
1589  (let lp ((ls ls) (res '()))
1590    (cond
1591     ((null? ls)
1592      (reverse res))
1593     ((string? (car ls))
1594      (lp (append (string->list (car ls)) (cdr ls)) res))
1595     (else
1596      (lp (cdr ls) (cons (car ls) res))))))
1597
1598(define (sre-names sre n names)
1599  (if (not (pair? sre))
1600      names
1601      (case (car sre)
1602        (($ submatch)
1603         (sre-names (sre-sequence (cdr sre)) (+ n 1) names))
1604        ((=> submatch-named)
1605         (sre-names (sre-sequence (cddr sre))
1606                    (+ n 1)
1607                    (cons (cons (cadr sre) n) names)))
1608        ((dsm)
1609         (sre-names (sre-sequence (cdddr sre)) (+ n (cadr sre)) names))
1610        ((seq : or * + ? *? ?? w/case w/nocase atomic
1611          look-ahead look-behind neg-look-ahead neg-look-behind)
1612         (sre-sequence-names (cdr sre) n names))
1613        ((= >=)
1614         (sre-sequence-names (cddr sre) n names))
1615        ((** **?)
1616         (sre-sequence-names (cdddr sre) n names))
1617        (else
1618         names))))
1619
1620(define (sre-sequence-names ls n names)
1621  (if (null? ls)
1622      names
1623      (sre-sequence-names (cdr ls)
1624                          (+ n (sre-count-submatches (car ls)))
1625                          (sre-names (car ls) n names))))
1626
1627(define (sre-remove-initial-bos sre)
1628  (cond
1629   ((pair? sre)
1630    (case (car sre)
1631      ((seq : $ submatch => submatch-named * +)
1632       (cond
1633        ((not (pair? (cdr sre)))
1634         sre)
1635        ((eq? 'bos (cadr sre))
1636         (cons (car sre) (cddr sre)))
1637        (else
1638         (cons (car sre)
1639               (cons (sre-remove-initial-bos (cadr sre)) (cddr sre))))))
1640      ((or)
1641       (sre-alternate (map sre-remove-initial-bos (cdr sre))))
1642      (else
1643       sre)))
1644   (else
1645    sre)))
1646
1647;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1648;;;; Basic Matching
1649
1650(define irregex-basic-string-chunker
1651  (make-irregex-chunker (lambda (x) #f)
1652                        car
1653                        cadr
1654                        caddr
1655                        (lambda (src1 i src2 j)
1656                          (substring (car src1) i j))))
1657
1658(define (irregex-search x str . o)
1659  (if (not (string? str)) (error "irregex-search: not a string" str))
1660  (let ((start (or (and (pair? o) (car o)) 0))
1661        (end (or (and (pair? o) (pair? (cdr o)) (cadr o)) (string-length str))))
1662    (if (not (and (integer? start) (exact? start)))
1663        (error "irregex-search: not an exact integer" start))
1664    (if (not (and (integer? end) (exact? end)))
1665        (error "irregex-search: not an exact integer" end))
1666    (irregex-search/chunked x
1667                            irregex-basic-string-chunker
1668                            (list str start end)
1669                            start)))
1670
1671(define (irregex-search/chunked x cnk src . o)
1672  (let* ((irx (irregex x))
1673         (matches (irregex-new-matches irx))
1674         (i (if (pair? o) (car o) ((chunker-get-start cnk) src))))
1675    (irregex-match-chunker-set! matches cnk)
1676    (irregex-search/matches irx cnk src i matches)))
1677
1678;; internal routine, can be used in loops to avoid reallocating the
1679;; match vector
1680(define (irregex-search/matches irx cnk src i matches)
1681  (cond
1682   ((irregex-dfa irx)
1683    (cond
1684     ((flag-set? (irregex-flags irx) ~searcher?)
1685      (cond
1686       ((dfa-match/longest (irregex-dfa irx) cnk src i #f #f matches 0)
1687        (irregex-match-start-chunk-set! matches 0 src)
1688        (irregex-match-start-index-set! matches 0 i)
1689        ((irregex-dfa/extract irx)
1690         cnk src i
1691         (%irregex-match-end-chunk matches 0)
1692         (%irregex-match-end-index matches 0)
1693         matches)
1694        matches)
1695       (else
1696        #f)))
1697     ((dfa-match/shortest
1698       (irregex-dfa/search irx) cnk src i matches 0)
1699      (let ((dfa (irregex-dfa irx))
1700            (get-start (chunker-get-start cnk))
1701            (get-end (chunker-get-end cnk))
1702            (get-next (chunker-get-next cnk)))
1703        (let lp1 ((src src) (i i))
1704          (let ((end (get-end src)))
1705            (let lp2 ((i i))
1706              (cond
1707               ((dfa-match/longest dfa cnk src i #f #f matches 0)
1708                (irregex-match-start-chunk-set! matches 0 src)
1709                (irregex-match-start-index-set! matches 0 i)
1710                ((irregex-dfa/extract irx)
1711                 cnk src i
1712                 (%irregex-match-end-chunk matches 0)
1713                 (%irregex-match-end-index matches 0)
1714                 matches)
1715                matches)
1716               ((>= i end)
1717                (let ((next (get-next src)))
1718                  (and next (lp1 next (get-start next)))))
1719               (else
1720                (lp2 (+ i 1)))))))))
1721     (else
1722      #f)))
1723   (else
1724    (let ((res (irregex-search/backtrack irx cnk src i matches)))
1725      (if res (%irregex-match-fail-set! res #f))
1726      res))))
1727
1728(define (irregex-search/backtrack irx cnk src i matches)
1729  (let ((matcher (irregex-nfa irx))
1730        (str ((chunker-get-str cnk) src))
1731        (end ((chunker-get-end cnk) src))
1732        (get-next (chunker-get-next cnk))
1733        (init (cons src i)))
1734    (if (flag-set? (irregex-flags irx) ~searcher?)
1735        (matcher cnk init src str i end matches (lambda () #f))
1736        (let lp ((src2 src)
1737                 (str str)
1738                 (i i)
1739                 (end end))
1740          (cond
1741           ((matcher cnk init src2 str i end matches (lambda () #f))
1742            (irregex-match-start-chunk-set! matches 0 src2)
1743            (irregex-match-start-index-set! matches 0 i)
1744            matches)
1745           ((< i end)
1746            (lp src2 str (+ i 1) end))
1747           (else
1748            (let ((src2 (get-next src2)))
1749              (if src2
1750                  (lp src2
1751                      ((chunker-get-str cnk) src2)
1752                      ((chunker-get-start cnk) src2)
1753                      ((chunker-get-end cnk) src2))
1754                  #f))))))))
1755
1756(define (irregex-match irx str . o)
1757  (if (not (string? str)) (error "irregex-match: not a string" str))
1758  (let ((start (or (and (pair? o) (car o)) 0))
1759        (end (or (and (pair? o) (pair? (cdr o)) (cadr o)) (string-length str))))
1760    (if (not (and (integer? start) (exact? start)))
1761        (error "irregex-match: not an exact integer" start))
1762    (if (not (and (integer? end) (exact? end)))
1763        (error "irregex-match: not an exact integer" end))
1764    (irregex-match/chunked irx
1765                           irregex-basic-string-chunker
1766                           (list str start end))))
1767
1768(define (irregex-match/chunked irx cnk src)
1769  (let* ((irx (irregex irx))
1770         (matches (irregex-new-matches irx)))
1771    (irregex-match-chunker-set! matches cnk)
1772    (cond
1773     ((irregex-dfa irx)
1774      (and
1775       (dfa-match/longest
1776        (irregex-dfa irx) cnk src ((chunker-get-start cnk) src) #f #f matches 0)
1777       (= ((chunker-get-end cnk) (%irregex-match-end-chunk matches 0))
1778          (%irregex-match-end-index matches 0))
1779       (begin
1780         (irregex-match-start-chunk-set! matches 0 src)
1781         (irregex-match-start-index-set! matches
1782                                         0
1783                                         ((chunker-get-start cnk) src))
1784         ((irregex-dfa/extract irx)
1785          cnk src ((chunker-get-start cnk) src)
1786          (%irregex-match-end-chunk matches 0)
1787          (%irregex-match-end-index matches 0)
1788          matches)
1789         matches)))
1790     (else
1791      (let* ((matcher (irregex-nfa irx))
1792             (str ((chunker-get-str cnk) src))
1793             (i ((chunker-get-start cnk) src))
1794             (end ((chunker-get-end cnk) src))
1795             (init (cons src i)))
1796        (let lp ((m (matcher cnk init src str i end matches (lambda () #f))))
1797          (and m
1798               (cond
1799                ((and (not ((chunker-get-next cnk)
1800                            (%irregex-match-end-chunk m 0)))
1801                      (= ((chunker-get-end cnk)
1802                          (%irregex-match-end-chunk m 0))
1803                         (%irregex-match-end-index m 0)))
1804                 (%irregex-match-fail-set! m #f)
1805                 m)
1806                ((%irregex-match-fail m)
1807                 (lp ((%irregex-match-fail m))))
1808                (else
1809                 #f)))))))))
1810
1811(define (irregex-match? . args)
1812  (and (apply irregex-match args) #t))
1813
1814;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1815;;;; DFA Matching
1816
1817;; inline these
1818(define (dfa-init-state dfa)
1819  (vector-ref dfa 0))
1820(define (dfa-next-state dfa node)
1821  (vector-ref dfa (cdr node)))
1822(define (dfa-final-state? dfa state)
1823  (car state))
1824
1825;; this searches for the first end index for which a match is possible
1826(define (dfa-match/shortest dfa cnk src start matches index)
1827  (let ((get-str (chunker-get-str cnk))
1828        (get-start (chunker-get-start cnk))
1829        (get-end (chunker-get-end cnk))
1830        (get-next (chunker-get-next cnk)))
1831    (let lp1 ((src src) (start start) (state (dfa-init-state dfa)))
1832      (and
1833       src
1834       (let ((str (get-str src))
1835             (end (get-end src)))
1836         (let lp2 ((i start) (state state))
1837           (cond
1838            ((dfa-final-state? dfa state)
1839             (cond
1840              (index
1841               (irregex-match-end-chunk-set! matches index src)
1842               (irregex-match-end-index-set! matches index i)))
1843             #t)
1844            ((< i end)
1845             (let* ((ch (string-ref str i))
1846                    (next (find (lambda (x)
1847                                  (or (eqv? ch (car x))
1848                                      (and (not (char? (car x)))
1849                                           (cset-contains? (car x) ch))))
1850                                (cdr state))))
1851               (and next (lp2 (+ i 1) (dfa-next-state dfa next)))))
1852            (else
1853             (let ((next (get-next src)))
1854               (and next (lp1 next (get-start next) state)))))))))))
1855
1856;; this finds the longest match starting at a given index
1857(define (dfa-match/longest dfa cnk src start end-src end matches index)
1858  (let ((get-str (chunker-get-str cnk))
1859        (get-start (chunker-get-start cnk))
1860        (get-end (chunker-get-end cnk))
1861        (get-next (chunker-get-next cnk))
1862        (start-is-final? (dfa-final-state? dfa (dfa-init-state dfa))))
1863    (cond
1864     (index
1865      (irregex-match-end-chunk-set! matches index #f)
1866      (irregex-match-end-index-set! matches index #f)))
1867    (let lp1 ((src src)
1868              (start start)
1869              (state (dfa-init-state dfa))
1870              (res-src (and start-is-final? src))
1871              (res-index (and start-is-final? start)))
1872      (let ((str (get-str src))
1873            (end (if (eq? src end-src) end (get-end src))))
1874        (let lp2 ((i start)
1875                  (state state)
1876                  (res-src res-src)
1877                  (res-index res-index))
1878          (cond
1879           ((>= i end)
1880            (cond
1881             ((and index res-src)
1882              (irregex-match-end-chunk-set! matches index res-src)
1883              (irregex-match-end-index-set! matches index res-index)))
1884            (let ((next (and (not (eq? src end-src)) (get-next src))))
1885              (if next
1886                  (lp1 next (get-start next) state res-src res-index)
1887                  (and index
1888                       (%irregex-match-end-chunk matches index)
1889                       #t))))
1890           (else
1891            (let* ((ch (string-ref str i))
1892                   (cell (find (lambda (x)
1893                                 (or (eqv? ch (car x))
1894                                     (and (not (char? (car x)))
1895                                          (cset-contains? (car x) ch))))
1896                               (cdr state))))
1897              (cond
1898               (cell
1899                (let ((next (dfa-next-state dfa cell)))
1900                  (if (dfa-final-state? dfa next)
1901                      (lp2 (+ i 1) next src (+ i 1))
1902                      (lp2 (+ i 1) next res-src res-index))))
1903               (res-src
1904                (cond
1905                 (index
1906                  (irregex-match-end-chunk-set! matches index res-src)
1907                  (irregex-match-end-index-set! matches index res-index)))
1908                #t)
1909               ((and index (%irregex-match-end-chunk matches index))
1910                #t)
1911               (else
1912                #f))))))))))
1913
1914;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1915;;;; Named Definitions
1916
1917(define sre-named-definitions
1918  `((any . ,*all-chars*)
1919    (nonl . (- ,*all-chars* (,(string #\newline))))
1920    (alphabetic . (/ #\a #\z #\A #\Z))
1921    (alpha . alphabetic)
1922    (alphanumeric . (/ #\a #\z #\A #\Z #\0 #\9))
1923    (alphanum . alphanumeric)
1924    (alnum . alphanumeric)
1925    (lower-case . (/ #\a #\z))
1926    (lower . lower-case)
1927    (upper-case . (/ #\A #\Z))
1928    (upper . upper-case)
1929    (numeric . (/ #\0 #\9))
1930    (num . numeric)
1931    (digit . numeric)
1932    (punctuation . (or #\! #\" #\# #\% #\& #\' #\( #\) #\* #\, #\- #\.
1933                       #\/ #\: #\; #\? #\@ #\[ #\\ #\] #\_ #\{ #\}))
1934    (punct . punctuation)
1935    (graphic
1936     . (or alphanumeric punctuation #\$ #\+ #\< #\= #\> #\^ #\` #\| #\~))
1937    (graph . graphic)
1938    (blank . (or #\space ,(integer->char (- (char->integer #\space) 23))))
1939    (whitespace . (or blank #\newline))
1940    (space . whitespace)
1941    (white . whitespace)
1942    (printing or graphic whitespace)
1943    (print . printing)
1944
1945    ;; XXXX we assume a (possibly shifted) ASCII-based ordering
1946    (control . (/ ,(integer->char (- (char->integer #\space) 32))
1947                  ,(integer->char (- (char->integer #\space) 1))))
1948    (cntrl . control)
1949    (hex-digit . (or numeric (/ #\a #\f #\A #\F)))
1950    (xdigit . hex-digit)
1951    (ascii . (/ ,(integer->char (- (char->integer #\space) 32))
1952                ,(integer->char (+ (char->integer #\space) 95))))
1953    (ascii-nonl . (/ ,(integer->char (- (char->integer #\space) 32))
1954                     ,(integer->char (- (char->integer #\newline) 1))
1955                     ,(integer->char (+ (char->integer #\newline) 1))
1956                     ,(integer->char (+ (char->integer #\space) 95))))
1957    (newline . (or (seq ,(integer->char (+ (char->integer #\newline) 3))
1958                        #\newline)
1959                   (/ #\newline
1960                      ,(integer->char (+ (char->integer #\newline) 3)))))
1961
1962    ;; ... it's really annoying to support old Scheme48
1963    (word . (seq bow (+ (or alphanumeric #\_)) eow))
1964    (utf8-tail-char . (/ ,(integer->char (+ (char->integer #\space) #x60))
1965                         ,(integer->char (+ (char->integer #\space) #xA1))))
1966    (utf8-2-char . (seq (/ ,(integer->char (+ (char->integer #\space) #xA2))
1967                           ,(integer->char (+ (char->integer #\space) #xBF)))
1968                        utf8-tail-char))
1969    (utf8-3-char . (seq (/ ,(integer->char (+ (char->integer #\space) #xC0))
1970                           ,(integer->char (+ (char->integer #\space) #xCF)))
1971                        utf8-tail-char
1972                        utf8-tail-char))
1973    (utf8-4-char . (seq (/ ,(integer->char (+ (char->integer #\space) #xD0))
1974                           ,(integer->char (+ (char->integer #\space) #xD7)))
1975                        utf8-tail-char
1976                        utf8-tail-char
1977                        utf8-tail-char))
1978    (utf8-any . (or ascii utf8-2-char utf8-3-char utf8-4-char))
1979    (utf8-nonl . (or ascii-nonl utf8-2-char utf8-3-char utf8-4-char))
1980
1981    ;; extended library patterns
1982    (integer . (seq (? (or #\+ #\-)) (+ numeric)))
1983    (real . (seq (+ numeric) (? #\. (+ numeric)) (? (or #\e #\E) integer)))
1984    ;; slightly more lax than R5RS, allow ->foo, etc.
1985    (symbol-initial . (or alpha ("!$%&*/:<=>?^_~")))
1986    (symbol-subsequent . (or symbol-initial digit ("+-.@")))
1987    (symbol . (or (seq symbol-initial (* symbol-subsequent))
1988                  (seq ("+-") (? symbol-initial (* symbol-subsequent)))
1989                  (seq ".." (* "."))))
1990    (sexp-space . (seq (* (* space) ";" (* nonl) newline) (+ space)))
1991    (string . (seq #\" (escape #\\ #\") #\"))
1992    (escape . ,(lambda (esc . o) `(* (or (~ ,esc ,@o) (seq ,esc any)))))
1993
1994    (ipv4-digit . (seq (? (/ "12")) (? numeric) numeric))
1995    (ipv4-address . (seq ipv4-digit (= 3 #\. ipv4-digit)))
1996    ;; XXXX lax, allows multiple double-colons or < 8 terms w/o a ::
1997    (ipv6-address . (seq (** 0 4 hex-digit)
1998                         (** 1 7 #\: (? #\:) (** 0 4 hex-digit))))
1999    (ip-address . (or ipv4-address ipv6-address))
2000    (domain-atom . (+ (or alphanumeric #\_ #\-)))
2001    (domain . (seq domain-atom (+ #\. domain-atom)))
2002    ;; XXXX now anything can be a top-level domain, but this is still handy
2003    (top-level-domain . (w/nocase (or "arpa" "com" "gov" "mil" "net" "org"
2004                                      "aero" "biz" "coop" "info" "museum"
2005                                      "name" "pro" (= 2 alpha))))
2006    (domain/common . (seq (+ domain-atom #\.) top-level-domain))
2007    ;;(email-local-part . (seq (+ (or (~ #\") string))))
2008    (email-local-part . (+ (or alphanumeric #\_ #\- #\. #\+)))
2009    (email . (seq email-local-part #\@ domain))
2010    (url-char . (or alnum #\_ #\- #\+ #\\ #\= #\~ #\. #\, #\& #\;
2011                    (seq "%" hex-digit hex-digit)))
2012    (url-final-char . (or alnum #\_ #\- #\+ #\\ #\= #\~ #\&
2013                          (seq "%" hex-digit hex-digit)))
2014    (http-url . (w/nocase
2015                 "http" (? "s") "://"
2016                 (or domain/common ipv4-address) ;; (seq "[" ipv6-address "]")
2017                 (? ":" (+ numeric)) ;; port
2018                 ;; path
2019                 (? "/" (* url-char)
2020                    (? "?" (* url-char))                      ;; query
2021                    (? "#" (? (* url-char) url-final-char)) ;; fragment
2022                    )))
2023
2024    ))
2025
2026;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2027;;;; SRE->NFA compilation
2028;;
2029;; An NFA state is a numbered node with a list of pattern->number
2030;; transitions, where pattern is character set range, or epsilon
2031;; (indicating an empty transition).
2032;; There may be overlapping ranges - since it's an NFA we process it
2033;; by considering all possible transitions.
2034
2035(define *nfa-presize* 128)  ;; constant
2036(define *nfa-num-fields* 4) ;; constant
2037
2038(define (nfa-num-states nfa) (quotient (vector-length nfa) *nfa-num-fields*))
2039(define (nfa-start-state nfa) (- (nfa-num-states nfa) 1))
2040
2041(define (nfa-get-state-trans nfa i)
2042  (vector-ref nfa (* i *nfa-num-fields*)))
2043(define (nfa-set-state-trans! nfa i x)
2044  (vector-set! nfa (* i *nfa-num-fields*) x))
2045
2046(define (nfa-get-epsilons nfa i)
2047  (vector-ref nfa (+ (* i *nfa-num-fields*) 1)))
2048(define (nfa-set-epsilons! nfa i x)
2049  (vector-set! nfa (+ (* i *nfa-num-fields*) 1) x))
2050(define (nfa-add-epsilon! nfa i x)
2051  (let ((eps (nfa-get-epsilons nfa i)))
2052    (if (not (memq x eps))
2053        (nfa-set-epsilons! nfa i (cons x eps)))))
2054
2055(define (nfa-get-state-closure nfa i)
2056  (vector-ref nfa (+ (* i *nfa-num-fields*) 2)))
2057(define (nfa-set-state-closure! nfa i x)
2058  (vector-set! nfa (+ (* i *nfa-num-fields*) 2) x))
2059
2060(define (nfa-get-closure nfa mst)
2061  (cond ((assoc mst
2062                (vector-ref nfa (+ (* (nfa-multi-state-hash nfa mst)
2063                                      *nfa-num-fields*)
2064                                   (- *nfa-num-fields* 1))))
2065         => cdr)
2066        (else #f)))
2067(define (nfa-add-closure! nfa mst x)
2068  (let ((i (+ (* (nfa-multi-state-hash nfa mst) *nfa-num-fields*)
2069              (- *nfa-num-fields* 1))))
2070    (vector-set! nfa i (cons (cons mst x) (vector-ref nfa i)))))
2071
2072;; Compile and return the vector of NFA states (in groups of
2073;; *nfa-num-fields* packed elements).  The start state will be the
2074;; last element(s) of the vector, and all remaining states will be in
2075;; descending numeric order, with state 0 being the unique accepting
2076;; state.
2077(define (sre->nfa sre init-flags)
2078  (let ((buf (make-vector (* *nfa-presize* *nfa-num-fields*) '())))
2079    ;; we loop over an implicit sequence list
2080    (define (lp ls n flags next)
2081      (define (new-state-number state)
2082        (max n (+ 1 state)))
2083      (define (add-state! n2 trans-ls)
2084        (if (>= (* n2 *nfa-num-fields*) (vector-length buf))
2085            (let ((tmp (make-vector (* 2 (vector-length buf)) '())))
2086              (do ((i (- (vector-length buf) 1) (- i 1)))
2087                  ((< i 0))
2088                (vector-set! tmp i (vector-ref buf i)))
2089              (set! buf tmp)))
2090        (nfa-set-state-trans! buf n2 trans-ls)
2091        n2)
2092      (define (extend-state! next trans-cs)
2093        (and next
2094             (add-state! (new-state-number next) (cons trans-cs next))))
2095      (define (add-char-state! next ch)
2096        (let ((alt (char-altcase ch)))
2097          (if (flag-set? flags ~case-insensitive?)
2098              (extend-state! next (cset-union (char->cset ch) (char->cset alt)))
2099              (extend-state! next (char->cset ch)))))
2100      (if (null? ls)
2101          next
2102          (cond
2103           ((or (eq? 'epsilon (car ls)) (equal? "" (car ls)))
2104            ;; chars and epsilons go directly into the transition table
2105            (let ((next (lp (cdr ls) n flags next)))
2106              (and next
2107                   (let ((new (add-state! (new-state-number next) '())))
2108                     (nfa-add-epsilon! buf new next)
2109                     new))))
2110           ((string? (car ls))
2111            ;; process literal strings a char at a time
2112            (let ((next (lp (cdr ls) n flags next)))
2113              (and next
2114                   (let lp2 ((i (- (string-length (car ls)) 1))
2115                             (next next))
2116                     (if (< i 0)
2117                         next
2118                         (lp2 (- i 1)
2119                              (add-char-state! next (string-ref (car ls) i))))
2120                     ))))
2121           ((char? (car ls))
2122            (add-char-state! (lp (cdr ls) n flags next) (car ls)))
2123           ((symbol? (car ls))
2124            (let ((cell (assq (car ls) sre-named-definitions)))
2125              (and cell
2126                   (lp (cons (if (procedure? (cdr cell))
2127                                 ((cdr cell))
2128                                 (cdr cell))
2129                             (cdr ls))
2130                       n
2131                       flags
2132                       next))))
2133           ((pair? (car ls))
2134            (cond
2135             ((string? (caar ls))       ; Enumerated character set
2136              (let ((set (if (flag-set? flags ~case-insensitive?)
2137                             (cset-case-insensitive (string->cset (caar ls)))
2138                             (string->cset (caar ls)))))
2139               (extend-state! (lp (cdr ls) n flags next) set)))
2140             (else
2141              (case (caar ls)
2142                ((seq :)
2143                 ;; for an explicit sequence, just append to the list
2144                 (lp (append (cdar ls) (cdr ls)) n flags next))
2145                ((w/case w/nocase w/utf8 w/noutf8)
2146                 (let* ((next (lp (cdr ls) n flags next))
2147                        (flags ((if (memq (caar ls) '(w/case w/utf8))
2148                                    flag-clear
2149                                    flag-join)
2150                                flags
2151                                (if (memq (caar ls) '(w/case w/nocase))
2152                                    ~case-insensitive?
2153                                    ~utf8?))))
2154                   (and next
2155                        (lp (cdar ls) (new-state-number next) flags next))))
2156                ((/ - & ~)
2157                 (let ((range (sre->cset (car ls)
2158                                         (flag-set? flags ~case-insensitive?))))
2159                   (extend-state! (lp (cdr ls) n flags next)
2160                                  range)))
2161                ((or)
2162                 (let ((next (lp (cdr ls) n flags next)))
2163                   (and
2164                    next
2165                    (if (null? (cdar ls))
2166                        ;; empty (or) always fails
2167                        (add-state! (new-state-number next) '())
2168                        ;; compile both branches and insert epsilon
2169                        ;; transitions to either
2170                        (let* ((b (lp (list (sre-alternate (cddar ls)))
2171                                      (new-state-number next)
2172                                      flags
2173                                      next))
2174                               (a (and b
2175                                       (lp (list (cadar ls))
2176                                           (new-state-number (max b next))
2177                                           flags
2178                                           next))))
2179                          (and a
2180                               (let ((c (add-state! (new-state-number a)
2181                                                    '())))
2182                                 (nfa-add-epsilon! buf c a)
2183                                 (nfa-add-epsilon! buf c b)
2184                                 c)))))))
2185                ((?)
2186                 (let ((next (lp (cdr ls) n flags next)))
2187                   ;; insert an epsilon transition directly to next
2188                   (and
2189                    next
2190                    (let ((a (lp (cdar ls) (new-state-number next) flags next)))
2191                      (if a
2192                          (nfa-add-epsilon! buf a next))
2193                      a))))
2194                ((+ *)
2195                 (let ((next (lp (cdr ls) n flags next)))
2196                   (and
2197                    next
2198                    (let* ((new (lp '(epsilon)
2199                                    (new-state-number next)
2200                                    flags
2201                                    next))
2202                           (a (lp (cdar ls) (new-state-number new) flags new)))
2203                      (cond
2204                       (a
2205                        ;; for *, insert an epsilon transition as in ? above
2206                        (if (eq? '* (caar ls))
2207                            (nfa-add-epsilon! buf a new))
2208                        ;; for both, insert a loop back to self
2209                        (nfa-add-epsilon! buf new a)))
2210                      a))))
2211                ;; need to add these to the match extractor first,
2212                ;; but they tend to generate large DFAs
2213                ;;((=)
2214                ;; (lp (append (vector->list
2215                ;;              (make-vector (cadar ls)
2216                ;;                           (sre-sequence (cddar ls))))
2217                ;;             (cdr ls))
2218                ;;     n flags next))
2219                ;;((>=)
2220                ;; (lp (append (vector->list
2221                ;;              (make-vector (- (cadar ls) 1)
2222                ;;                           (sre-sequence (cddar ls))))
2223                ;;             (cons `(+ ,@(cddar ls)) (cdr ls)))
2224                ;;     n flags next))
2225                ;;((**)
2226                ;; (lp (append (vector->list
2227                ;;              (make-vector (cadar ls)
2228                ;;                           (sre-sequence (cdddar ls))))
2229                ;;             (map
2230                ;;              (lambda (x) `(? ,x))
2231                ;;              (vector->list
2232                ;;               (make-vector (- (caddar ls) (cadar ls))
2233                ;;                            (sre-sequence (cdddar ls)))))
2234                ;;             (cdr ls))
2235                ;;     n flags next))
2236                ;; ignore submatches altogether
2237                (($ submatch)
2238                 (lp (cons (sre-sequence (cdar ls)) (cdr ls)) n flags next))
2239                ((=> submatch-named)
2240                 (lp (cons (sre-sequence (cddar ls)) (cdr ls)) n flags next))
2241                (else
2242                 (cond
2243                  ((assq (caar ls) sre-named-definitions)
2244                   => (lambda (cell)
2245                        (if (procedure? (cdr cell))
2246                            (lp (cons (apply (cdr cell) (cdar ls)) (cdr ls))
2247                                n flags next)
2248                            (error "non-procedure in op position" (caar ls)))))
2249                  (else #f)))))))
2250           (else
2251            #f))))
2252    (let ((len (lp (list sre) 1 init-flags 0)))
2253      (and len
2254           (let ((nfa (make-vector (* *nfa-num-fields* (+ len 1)))))
2255             (do ((i (- (vector-length nfa) 1) (- i 1)))
2256                 ((< i 0))
2257               (vector-set! nfa i (vector-ref buf i)))
2258             nfa)))))
2259
2260;; We don't really want to use this, we use the closure compilation
2261;; below instead, but this is included for reference and testing the
2262;; sre->nfa conversion.
2263
2264;; (define (nfa-match nfa str)
2265;;   (let lp ((ls (string->list str)) (state (car nfa)) (epsilons '()))
2266;;     (if (null? ls)
2267;;         (zero? (car state))
2268;;         (any (lambda (m)
2269;;                (if (eq? 'epsilon (car m))
2270;;                    (and (not (memv (cdr m) epsilons))
2271;;                         (lp ls (assv (cdr m) nfa) (cons (cdr m) epsilons)))
2272;;                    (and (or (eqv? (car m) (car ls))
2273;;                             (and (pair? (car m))
2274;;                                  (char<=? (caar m) (car ls))
2275;;                                  (char<=? (car ls) (cdar m))))
2276;;                         (lp (cdr ls) (assv (cdr m) nfa) '()))))
2277;;              (cdr state)))))
2278
2279;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2280;;;; NFA multi-state representation
2281
2282;; Cache closures in a simple hash-table keyed on the smallest state
2283;; (define (nfa-multi-state-hash nfa mst)
2284;;   (car mst))
2285
2286;; Original sorted list-based representation
2287
2288;; (define (make-nfa-multi-state nfa)
2289;;   '())
2290
2291;; (define (nfa-state->multi-state nfa state)
2292;;   (list state))
2293
2294;; (define (nfa-multi-state-copy mst)
2295;;   (map (lambda (x) x) mst))
2296
2297;; (define (list->nfa-multi-state nfa ls)
2298;;   (nfa-multi-state-copy ls))
2299
2300;; (define (nfa-multi-state-contains? mst i)
2301;;   (memq i mst))
2302
2303;; (define (nfa-multi-state-fold mst kons knil)
2304;;   (fold kons knil mst))
2305
2306;; (define (nfa-multi-state-add! mst i)
2307;;   (insert-sorted i mst))
2308
2309;; (define (nfa-multi-state-add mst i)
2310;;   (insert-sorted i mst))
2311
2312;; (define (nfa-multi-state-union a b)
2313;;   (merge-sorted a b))
2314
2315;; Sorted List Utilities
2316
2317;; (define (insert-sorted n ls)
2318;;   (cond
2319;;    ((null? ls)
2320;;     (cons n '()))
2321;;    ((<= n (car ls))
2322;;     (if (= n (car ls))
2323;;         ls
2324;;         (cons n ls)))
2325;;    (else
2326;;     (cons (car ls) (insert-sorted n (cdr ls))))))
2327
2328;; (define (insert-sorted! n ls)
2329;;   (cond
2330;;    ((null? ls)
2331;;     (cons n '()))
2332;;    ((<= n (car ls))
2333;;     (if (= n (car ls))
2334;;         ls
2335;;         (cons n ls)))
2336;;    (else
2337;;     (let lp ((head ls) (tail (cdr ls)))
2338;;       (cond ((or (null? tail) (< n (car tail)))
2339;;              (set-cdr! head (cons n tail)))
2340;;             ((> n (car tail))
2341;;              (lp tail (cdr tail)))))
2342;;     ls)))
2343
2344;; (define (merge-sorted a b)
2345;;   (cond ((null? a) b)
2346;;         ((null? b) a)
2347;;         ((< (car a) (car b))
2348;;          (cons (car a) (merge-sorted (cdr a) b)))
2349;;         ((> (car a) (car b))
2350;;          (cons (car b) (merge-sorted a (cdr b))))
2351;;         (else (merge-sorted (cdr a) b))))
2352
2353;; ========================================================= ;;
2354
2355;; Presized bit-vector based
2356
2357(define (nfa-multi-state-hash nfa mst)
2358  (modulo (vector-ref mst 0) (nfa-num-states nfa)))
2359
2360(define (make-nfa-multi-state nfa)
2361  (make-vector (quotient (+ (nfa-num-states nfa) 24 -1) 24) 0))
2362
2363(define (nfa-state->multi-state nfa state)
2364  (nfa-multi-state-add! (make-nfa-multi-state nfa) state))
2365
2366(define (nfa-multi-state-copy mst)
2367  (let ((res (make-vector (vector-length mst))))
2368    (do ((i (- (vector-length mst) 1) (- i 1)))
2369        ((< i 0) res)
2370      (vector-set! res i (vector-ref mst i)))))
2371
2372(define (nfa-multi-state-contains? mst i)
2373  (let ((cell (quotient i 24))
2374        (bit (remainder i 24)))
2375    (not (zero? (bit-and (vector-ref mst cell) (bit-shl 1 bit))))))
2376
2377(define (nfa-multi-state-add! mst i)
2378  (let ((cell (quotient i 24))
2379        (bit (remainder i 24)))
2380    (vector-set! mst cell (bit-ior (vector-ref mst cell) (bit-shl 1 bit)))
2381    mst))
2382
2383(define (nfa-multi-state-add mst i)
2384  (nfa-multi-state-add! (nfa-multi-state-copy mst) i))
2385
2386(define (nfa-multi-state-union! a b)
2387  (do ((i (- (vector-length a) 1) (- i 1)))
2388      ((< i 0) a)
2389    (vector-set! a i (bit-ior (vector-ref a i) (vector-ref b i)))))
2390
2391(define (nfa-multi-state-union a b)
2392  (nfa-multi-state-union! (nfa-multi-state-copy a) b))
2393
2394(define (nfa-multi-state-fold mst kons knil)
2395  (let ((limit (vector-length mst)))
2396    (let lp1 ((i 0)
2397              (acc knil))
2398      (if (>= i limit)
2399          acc
2400          (let lp2 ((n (vector-ref mst i))
2401                    (acc acc))
2402            (if (zero? n)
2403                (lp1 (+ i 1) acc)
2404                (let* ((n2 (bit-and n (- n 1)))
2405                       (n-tail (- n n2))
2406                       (bit (+ (* i 24) (integer-log n-tail))))
2407                  (lp2 n2 (kons bit acc)))))))))
2408
2409;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2410;;;; NFA->DFA compilation
2411;;
2412;; During processing, the DFA is a list of the form:
2413;;
2414;;   ((NFA-states ...) accepting-state? transitions ...)
2415;;
2416;; where the transitions are as in the NFA, except there are no
2417;; epsilons, duplicate characters or overlapping char-set ranges, and
2418;; the states moved to are closures (sets of NFA states).  Multiple
2419;; DFA states may be accepting states.
2420
2421(define (nfa->dfa nfa . o)
2422  (let ((max-states (and (pair? o) (car o))))
2423    (let lp ((ls (list (nfa-cache-state-closure! nfa (nfa-start-state nfa))))
2424             (i 0)
2425             (res '()))
2426      (cond
2427       ((null? ls)
2428        (dfa-renumber nfa (reverse res)))
2429       ((assoc (car ls) res) ;; already seen this combination of states
2430        (lp (cdr ls) i res))
2431       ((and max-states (> i max-states)) ;; too many DFA states
2432        #f)
2433       (else
2434        (let* ((states (car ls))
2435               (trans (nfa-state-transitions nfa states))
2436               (accept? (and (nfa-multi-state-contains? states 0) #t)))
2437          (lp (append (map cdr trans) (cdr ls))
2438              (+ i 1)
2439              `((,states ,accept? ,@trans) ,@res))))))))
2440
2441;; When the conversion is complete we renumber the DFA sets-of-states
2442;; in order and convert the result to a vector for fast lookup.
2443;; Charsets containing single characters are converted to those characters
2444;; for quick matching of the literal parts in a regex.
2445(define (dfa-renumber nfa dfa)
2446  (let* ((len (length dfa))
2447         (states (make-vector (nfa-num-states nfa) '()))
2448         (res (make-vector len)))
2449    (define (renumber mst)
2450      (cdr (assoc mst (vector-ref states (nfa-multi-state-hash nfa mst)))))
2451    (let lp ((ls dfa) (i 0))
2452      (cond ((pair? ls)
2453             (let ((j (nfa-multi-state-hash nfa (caar ls))))
2454               (vector-set! states j (cons (cons (caar ls) i)
2455                                           (vector-ref states j))))
2456             (lp (cdr ls) (+ i 1)))))
2457    (let lp ((ls dfa) (i 0))
2458      (cond ((pair? ls)
2459             (for-each
2460              (lambda (x)
2461                (set-car! x (maybe-cset->char (car x)))
2462                (set-cdr! x (renumber (cdr x))))
2463              (cddar ls))
2464             (vector-set! res i (cdar ls))
2465             (lp (cdr ls) (+ i 1)))))
2466    res))
2467
2468;; Extract all distinct ranges and the potential states they can transition
2469;; to from a given set of states.  Any ranges that would overlap with
2470;; distinct characters are split accordingly.
2471(define (nfa-state-transitions nfa states)
2472  (let ((res (nfa-multi-state-fold
2473              states
2474              (lambda (st res)
2475                (let ((trans (nfa-get-state-trans nfa st)))
2476                  (if (null? trans)
2477                      res
2478                      (nfa-join-transitions! nfa res (car trans) (cdr trans)))))
2479              '())))
2480    (for-each (lambda (x) (set-cdr! x (nfa-closure nfa (cdr x)))) res)
2481    res))
2482
2483(define (nfa-join-transitions! nfa existing elt state)
2484  (define (csets-intersect? a b)
2485    (let ((i (cset-intersection a b)))
2486      (and (not (cset-empty? i)) i)))
2487  (let lp ((ls existing) (res '()))
2488    (cond
2489     ((null? ls)
2490      (cond       ; First try to find a group that includes this state
2491       ((find (lambda (x) (nfa-multi-state-contains? (cdr x) state)) existing) =>
2492        (lambda (existing-state)    ; If found, merge charsets with it
2493          (set-car! existing-state (cset-union (car existing-state) elt))
2494          existing))
2495       ;; State not seen yet?  Add a new state transition
2496       (else (cons (cons elt (nfa-state->multi-state nfa state)) existing))))
2497     ((cset=? elt (caar ls)) ; Add state to existing set for this charset
2498      (set-cdr! (car ls) (nfa-multi-state-add! (cdar ls) state))
2499      existing)
2500     ((csets-intersect? elt (caar ls)) => ; overlapping charset, but diff state
2501      (lambda (intersection)
2502        (let* ((only-in-old (cset-difference (caar ls) elt))
2503               (states-for-old (and (not (cset-empty? only-in-old))
2504                                    (nfa-multi-state-copy (cdar ls))))
2505               (result (if states-for-old
2506                           (cons (cons only-in-old states-for-old)
2507                                 (append res (cdr ls)))
2508                           (append res (cdr ls))))
2509               (only-in-new (cset-difference elt (caar ls))))
2510          ;; Add this state to the states already here and restrict to
2511          ;; the overlapping charset
2512          (set-car! (car ls) intersection)
2513          (set-cdr! (car ls) (nfa-multi-state-add! (cdar ls) state))
2514          ;; Continue with the remaining subset of the new cset (if nonempty)
2515          (cons (car ls)
2516                (if (cset-empty? only-in-new)
2517                    result
2518                    (nfa-join-transitions! nfa result only-in-new state))))))
2519     (else
2520      (lp (cdr ls) (cons (car ls) res))))))
2521
2522(define (nfa-cache-state-closure! nfa state)
2523  (let ((cached (nfa-get-state-closure nfa state)))
2524    (cond
2525     ((not (null? cached))
2526      cached)
2527     (else
2528      (let ((res (nfa-state-closure-internal nfa state)))
2529        (nfa-set-state-closure! nfa state res)
2530        res)))))
2531
2532;; The `closure' of a list of NFA states - all states that can be
2533;; reached from any of them using any number of epsilon transitions.
2534(define (nfa-state-closure-internal nfa state)
2535  (let lp ((ls (list state))
2536           (res (make-nfa-multi-state nfa)))
2537    (cond
2538     ((null? ls)
2539      res)
2540     ((nfa-multi-state-contains? res (car ls))
2541      (lp (cdr ls) res))
2542     (else
2543      (lp (append (nfa-get-epsilons nfa (car ls)) (cdr ls))
2544          (nfa-multi-state-add! res (car ls)))))))
2545
2546(define (nfa-closure-internal nfa states)
2547  (nfa-multi-state-fold
2548   states
2549   (lambda (st res)
2550     (nfa-multi-state-union! res (nfa-cache-state-closure! nfa st)))
2551   (make-nfa-multi-state nfa)))
2552
2553(define (nfa-closure nfa states)
2554  (or (nfa-get-closure nfa states)
2555      (let ((res (nfa-closure-internal nfa states)))
2556        (nfa-add-closure! nfa states res)
2557        res)))
2558
2559;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2560;;;; Match Extraction
2561;;
2562;; DFAs don't give us match information, so once we match and
2563;; determine the start and end, we need to recursively break the
2564;; problem into smaller DFAs to get each submatch.
2565;;
2566;; See http://compilers.iecc.com/comparch/article/07-10-026
2567
2568(define (sre-match-extractor sre num-submatches)
2569  (let* ((tmp (+ num-submatches 1))
2570         (tmp-end-src-offset (+ 5 (* tmp 4)))
2571         (tmp-end-index-offset (+ 6 (* tmp 4))))
2572    (let lp ((sre sre) (n 1) (submatch-deps? #f))
2573      (cond
2574       ((not (sre-has-submatches? sre))
2575        (if (not submatch-deps?)
2576            (lambda (cnk start i end j matches) #t)
2577            (let ((dfa (nfa->dfa (sre->nfa sre ~none))))
2578              (lambda (cnk start i end j matches)
2579                (dfa-match/longest dfa cnk start i end j matches tmp)))))
2580       ((pair? sre)
2581        (case (car sre)
2582          ((: seq)
2583           (let* ((right (sre-sequence (cddr sre)))
2584                  (match-left (lp (cadr sre) n #t))
2585                  (match-right
2586                   (lp right (+ n (sre-count-submatches (cadr sre))) #t)))
2587             (lambda (cnk start i end j matches)
2588               (let lp1 ((end2 end) (j2 j) (best-src #f) (best-index #f))
2589                 (let ((limit (if (eq? start end2)
2590                                  i
2591                                  ((chunker-get-start cnk) end2))))
2592                   (let lp2 ((k j2) (best-src best-src) (best-index best-index))
2593                     (if (< k limit)
2594                         (cond
2595                          ((not (eq? start end2))
2596                           (let ((prev (chunker-prev-chunk cnk start end2)))
2597                             (lp1 prev
2598                                  ((chunker-get-end cnk) prev)
2599                                  best-src
2600                                  best-index)))
2601                          (best-src
2602                           (vector-set! matches tmp-end-src-offset best-src)
2603                           (vector-set! matches tmp-end-index-offset best-index)
2604                           #t)
2605                          (else
2606                           #f))
2607                         (if (and (match-left cnk start i end2 k matches)
2608                                  (eq? end2 (vector-ref matches
2609                                                        tmp-end-src-offset))
2610                                  (eqv? k (vector-ref matches
2611                                                      tmp-end-index-offset))
2612                                  (match-right cnk end2 k end j matches))
2613                             (let ((right-src
2614                                    (vector-ref matches tmp-end-src-offset))
2615                                   (right
2616                                    (vector-ref matches tmp-end-index-offset)))
2617                               (cond
2618                                ((and (eq? end right-src) (eqv? j right))
2619                                 (vector-set! matches tmp-end-src-offset end)
2620                                 (vector-set! matches tmp-end-index-offset j)
2621                                 #t)
2622                                ((or (not best-src)
2623                                     (if (eq? best-src right-src)
2624                                         (> right best-index)
2625                                         (chunk-before? cnk
2626                                                        best-src
2627                                                        right-src)))
2628                                 (lp2 (- k 1) right-src right))
2629                                (else
2630                                 (lp2 (- k 1) best-src best-index))))
2631                             (lp2 (- k 1) best-src best-index)))))))))
2632          ((or)
2633           (if (null? (cdr sre))
2634               (lambda (cnk start i end j matches) #f)
2635               (let* ((rest (sre-alternate (cddr sre)))
2636                      (match-first
2637                       (lp (cadr sre) n #t))
2638                      (match-rest
2639                       (lp rest
2640                           (+ n (sre-count-submatches (cadr sre)))
2641                           submatch-deps?)))
2642                 (lambda (cnk start i end j matches)
2643                   (or (and (match-first cnk start i end j matches)
2644                            (eq? end (vector-ref matches tmp-end-src-offset))
2645                            (eqv? j (vector-ref matches tmp-end-index-offset)))
2646                       (match-rest cnk start i end j matches))))))
2647          ((* +)
2648           (letrec ((match-once
2649                     (lp (sre-sequence (cdr sre)) n #t))
2650                    (match-all
2651                     (lambda (cnk start i end j matches)
2652                       (if (match-once cnk start i end j matches)
2653                           (let ((src (vector-ref matches tmp-end-src-offset))
2654                                 (k (vector-ref matches tmp-end-index-offset)))
2655                             (if (and src (or (not (eq? start src)) (< i k)))
2656                                 (match-all cnk src k end j matches)
2657                                 #t))
2658                           (begin
2659                             (vector-set! matches tmp-end-src-offset start)
2660                             (vector-set! matches tmp-end-index-offset i)
2661                             #t)))))
2662             (if (eq? '* (car sre))
2663                 match-all
2664                 (lambda (cnk start i end j matches)
2665                   (and (match-once cnk start i end j matches)
2666                        (let ((src (vector-ref matches tmp-end-src-offset))
2667                              (k (vector-ref matches tmp-end-index-offset)))
2668                          (match-all cnk src k end j matches)))))))
2669          ((?)
2670           (let ((match-once (lp (sre-sequence (cdr sre)) n #t)))
2671             (lambda (cnk start i end j matches)
2672               (cond
2673                ((match-once cnk start i end j matches)
2674                 #t)
2675                (else
2676                 (vector-set! matches tmp-end-src-offset start)
2677                 (vector-set! matches tmp-end-index-offset i)
2678                 #t)))))
2679          (($ submatch => submatch-named)
2680           (let ((match-one
2681                  (lp (sre-sequence (if (memq (car sre) '($ submatch))
2682                                        (cdr sre)
2683                                        (cddr sre)))
2684                      (+ n 1)
2685                      #t))
2686                 (start-src-offset (+ 3 (* n 4)))
2687                 (start-index-offset (+ 4 (* n 4)))
2688                 (end-src-offset (+ 5 (* n 4)))
2689                 (end-index-offset (+ 6 (* n 4))))
2690             (lambda (cnk start i end j matches)
2691               (cond
2692                ((match-one cnk start i end j matches)
2693                 (vector-set! matches start-src-offset start)
2694                 (vector-set! matches start-index-offset i)
2695                 (vector-set! matches end-src-offset
2696                              (vector-ref matches tmp-end-src-offset))
2697                 (vector-set! matches end-index-offset
2698                              (vector-ref matches tmp-end-index-offset))
2699                 #t)
2700                (else
2701                 #f)))))
2702          (else
2703           (error "unknown regexp operator" (car sre)))))
2704       (else
2705        (error "unknown regexp" sre))))))
2706
2707;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2708;;;; Closure Compilation
2709;;
2710;; We use this for non-regular expressions instead of an interpreted
2711;; NFA matcher.  We use backtracking anyway, but this gives us more
2712;; freedom of implementation, allowing us to support patterns that
2713;; can't be represented in the above NFA representation.
2714
2715(define (sre->procedure sre . o)
2716  (define names
2717    (if (and (pair? o) (pair? (cdr o))) (cadr o) (sre-names sre 1 '())))
2718  (let lp ((sre sre)
2719           (n 1)
2720           (flags (if (pair? o) (car o) ~none))
2721           (next (lambda (cnk init src str i end matches fail)
2722                   (irregex-match-start-chunk-set! matches 0 (car init))
2723                   (irregex-match-start-index-set! matches 0 (cdr init))
2724                   (irregex-match-end-chunk-set! matches 0 src)
2725                   (irregex-match-end-index-set! matches 0 i)
2726                   (%irregex-match-fail-set! matches fail)
2727                   matches)))
2728    ;; XXXX this should be inlined
2729    (define (rec sre) (lp sre n flags next))
2730    (cond
2731     ((pair? sre)
2732      (if (string? (car sre))
2733          (sre-cset->procedure
2734           (sre->cset (car sre) (flag-set? flags ~case-insensitive?))
2735           next)
2736          (case (car sre)
2737            ((~ - & /)
2738             (sre-cset->procedure
2739              (sre->cset sre (flag-set? flags ~case-insensitive?))
2740              next))
2741            ((or)
2742             (case (length (cdr sre))
2743               ((0) (lambda (cnk init src str i end matches fail) (fail)))
2744               ((1) (rec (cadr sre)))
2745               (else
2746                (let* ((first (rec (cadr sre)))
2747                       (rest (lp (sre-alternate (cddr sre))
2748                                 (+ n (sre-count-submatches (cadr sre)))
2749                                 flags
2750                                 next)))
2751                  (lambda (cnk init src str i end matches fail)
2752                    (first cnk init src str i end matches
2753                           (lambda ()
2754                             (rest cnk init src str i end matches fail))))))))
2755            ((w/case)
2756             (lp (sre-sequence (cdr sre))
2757                 n
2758                 (flag-clear flags ~case-insensitive?)
2759                 next))
2760            ((w/nocase)
2761             (lp (sre-sequence (cdr sre))
2762                 n
2763                 (flag-join flags ~case-insensitive?)
2764                 next))
2765            ((w/utf8)
2766             (lp (sre-sequence (cdr sre)) n (flag-join flags ~utf8?) next))
2767            ((w/noutf8)
2768             (lp (sre-sequence (cdr sre)) n (flag-clear flags ~utf8?) next))
2769            ((seq :)
2770             (case (length (cdr sre))
2771               ((0) next)
2772               ((1) (rec (cadr sre)))
2773               (else
2774                (let ((rest (lp (sre-sequence (cddr sre))
2775                                (+ n (sre-count-submatches (cadr sre)))
2776                                flags
2777                                next)))
2778                  (lp (cadr sre) n flags rest)))))
2779            ((?)
2780             (let ((body (rec (sre-sequence (cdr sre)))))
2781               (lambda (cnk init src str i end matches fail)
2782                 (body cnk init src str i end matches
2783                       (lambda () (next cnk init src str i end matches fail))))))
2784            ((??)
2785             (let ((body (rec (sre-sequence (cdr sre)))))
2786               (lambda (cnk init src str i end matches fail)
2787                 (next cnk init src str i end matches
2788                       (lambda () (body cnk init src str i end matches fail))))))
2789            ((*)
2790             (cond
2791              ((sre-empty? (sre-sequence (cdr sre)))
2792               (error "invalid sre: empty *" sre))
2793              (else
2794               (letrec
2795                   ((body
2796                     (lp (sre-sequence (cdr sre))
2797                         n
2798                         flags
2799                         (lambda (cnk init src str i end matches fail)
2800                           (body cnk init src str i end matches
2801                                 (lambda ()
2802                                   (next cnk init src str i end matches fail)
2803                                   ))))))
2804                 (lambda (cnk init src str i end matches fail)
2805                   (body cnk init src str i end matches
2806                         (lambda ()
2807                           (next cnk init src str i end matches fail))))))))
2808            ((*?)
2809             (cond
2810              ((sre-empty? (sre-sequence (cdr sre)))
2811               (error "invalid sre: empty *?" sre))
2812              (else
2813               (letrec
2814                   ((body
2815                     (lp (sre-sequence (cdr sre))
2816                         n
2817                         flags
2818                         (lambda (cnk init src str i end matches fail)
2819                           (next cnk init src str i end matches
2820                                 (lambda ()
2821                                   (body cnk init src str i end matches fail)
2822                                   ))))))
2823                 (lambda (cnk init src str i end matches fail)
2824                   (next cnk init src str i end matches
2825                         (lambda ()
2826                           (body cnk init src str i end matches fail))))))))
2827            ((+)
2828             (lp (sre-sequence (cdr sre))
2829                 n
2830                 flags
2831                 (rec (list '* (sre-sequence (cdr sre))))))
2832            ((=)
2833             (rec `(** ,(cadr sre) ,(cadr sre) ,@(cddr sre))))
2834            ((>=)
2835             (rec `(** ,(cadr sre) #f ,@(cddr sre))))
2836            ((** **?)
2837             (cond
2838              ((or (and (number? (cadr sre))
2839                        (number? (caddr sre))
2840                        (> (cadr sre) (caddr sre)))
2841                   (and (not (cadr sre)) (caddr sre)))
2842               (lambda (cnk init src str i end matches fail) (fail)))
2843              (else
2844               (let* ((from (cadr sre))
2845                      (to (caddr sre))
2846                      (? (if (eq? '** (car sre)) '? '??))
2847                      (* (if (eq? '** (car sre)) '* '*?))
2848                      (sre (sre-sequence (cdddr sre)))
2849                      (x-sre (sre-strip-submatches sre))
2850                      (next (if to
2851                                (if (= from to)
2852                                    next
2853                                    (fold (lambda (x next)
2854                                            (lp `(,? ,sre) n flags next))
2855                                          next
2856                                          (zero-to (- to from))))
2857                                (rec `(,* ,sre)))))
2858                 (if (zero? from)
2859                     next
2860                     (lp `(seq ,@(map (lambda (x) x-sre) (zero-to (- from 1)))
2861                               ,sre)
2862                         n
2863                         flags
2864                         next))))))
2865            ((word)
2866             (rec `(seq bow ,@(cdr sre) eow)))
2867            ((word+)
2868             (rec `(seq bow (+ (& (or alphanumeric "_")
2869                                  (or ,@(cdr sre)))) eow)))
2870            ((posix-string)
2871             (rec (string->sre (cadr sre))))
2872            ((look-ahead)
2873             (let ((check
2874                    (lp (sre-sequence (cdr sre))
2875                        n
2876                        flags
2877                        (lambda (cnk init src str i end matches fail) i))))
2878               (lambda (cnk init src str i end matches fail)
2879                 (if (check cnk init src str i end matches (lambda () #f))
2880                     (next cnk init src str i end matches fail)
2881                     (fail)))))
2882            ((neg-look-ahead)
2883             (let ((check
2884                    (lp (sre-sequence (cdr sre))
2885                        n
2886                        flags
2887                        (lambda (cnk init src str i end matches fail) i))))
2888               (lambda (cnk init src str i end matches fail)
2889                 (if (check cnk init src str i end matches (lambda () #f))
2890                     (fail)
2891                     (next cnk init src str i end matches fail)))))
2892            ((look-behind neg-look-behind)
2893             (let ((check
2894                    (lp (sre-sequence
2895                         (cons '(* any) (append (cdr sre) '(eos))))
2896                        n
2897                        flags
2898                        (lambda (cnk init src str i end matches fail) i))))
2899               (lambda (cnk init src str i end matches fail)
2900                 (let* ((prev ((chunker-get-substring cnk)
2901                               (car init)
2902                               (cdr init)
2903                               src
2904                               i))
2905                        (len (string-length prev))
2906                        (src2 (list prev 0 len)))
2907                   (if ((if (eq? (car sre) 'look-behind) (lambda (x) x) not)
2908                        (check irregex-basic-string-chunker
2909                               (cons src2 0) src2 prev 0 len matches (lambda () #f)))
2910                       (next cnk init src str i end matches fail)
2911                       (fail))))))
2912            ((atomic)
2913             (let ((once
2914                    (lp (sre-sequence (cdr sre))
2915                        n
2916                        flags
2917                        (lambda (cnk init src str i end matches fail) i))))
2918               (lambda (cnk init src str i end matches fail)
2919                 (let ((j (once cnk init src str i end matches (lambda () #f))))
2920                   (if j
2921                       (next cnk init src str j end matches fail)
2922                       (fail))))))
2923            ((if)
2924             (let* ((test-submatches (sre-count-submatches (cadr sre)))
2925                    (pass (lp (caddr sre) flags (+ n test-submatches) next))
2926                    (fail (if (pair? (cdddr sre))
2927                              (lp (cadddr sre)
2928                                  (+ n test-submatches
2929                                     (sre-count-submatches (caddr sre)))
2930                                  flags
2931                                  next)
2932                              (lambda (cnk init src str i end matches fail)
2933                                (fail)))))
2934               (cond
2935                ((or (number? (cadr sre)) (symbol? (cadr sre)))
2936                 (let ((index
2937                        (if (symbol? (cadr sre))
2938                            (cond
2939                             ((assq (cadr sre) names) => cdr)
2940                             (else
2941                              (error "unknown named backref in SRE IF" sre)))
2942                            (cadr sre))))
2943                   (lambda (cnk init src str i end matches fail2)
2944                     (if (%irregex-match-end-chunk matches index)
2945                         (pass cnk init src str i end matches fail2)
2946                         (fail cnk init src str i end matches fail2)))))
2947                (else
2948                 (let ((test (lp (cadr sre) n flags pass)))
2949                   (lambda (cnk init src str i end matches fail2)
2950                     (test cnk init src str i end matches
2951                           (lambda () (fail cnk init src str i end matches fail2)))
2952                     ))))))
2953            ((backref backref-ci)
2954             (let ((n (cond ((number? (cadr sre)) (cadr sre))
2955                            ((assq (cadr sre) names) => cdr)
2956                            (else (error "unknown backreference" (cadr sre)))))
2957                   (compare (if (or (eq? (car sre) 'backref-ci)
2958                                    (flag-set? flags ~case-insensitive?))
2959                                string-ci=?
2960                                string=?)))
2961               (lambda (cnk init src str i end matches fail)
2962                 (let ((s (irregex-match-substring matches n)))
2963                   (if (not s)
2964                       (fail)
2965                       ;; XXXX create an abstract subchunk-compare
2966                       (let lp ((src src)
2967                                (str str)
2968                                (i i)
2969                                (end end)
2970                                (j 0)
2971                                (len (string-length s)))
2972                         (cond
2973                          ((<= len (- end i))
2974                           (cond
2975                            ((compare (substring s j (string-length s))
2976                                      (substring str i (+ i len)))
2977                             (next cnk init src str (+ i len) end matches fail))
2978                            (else
2979                             (fail))))
2980                          (else
2981                           (cond
2982                            ((compare (substring s j (+ j (- end i)))
2983                                      (substring str i end))
2984                             (let ((src2 ((chunker-get-next cnk) src)))
2985                               (if src2
2986                                   (lp src2
2987                                       ((chunker-get-str cnk) src2)
2988                                       ((chunker-get-start cnk) src2)
2989                                       ((chunker-get-end cnk) src2)
2990                                       (+ j (- end i))
2991                                       (- len (- end i)))
2992                                   (fail))))
2993                            (else
2994                             (fail)))))))))))
2995            ((dsm)
2996             (lp (sre-sequence (cdddr sre)) (+ n (cadr sre)) flags next))
2997            (($ submatch)
2998             (let ((body
2999                    (lp (sre-sequence (cdr sre))
3000                        (+ n 1)
3001                        flags
3002                        (lambda (cnk init src str i end matches fail)
3003                          (let ((old-source
3004                                 (%irregex-match-end-chunk matches n))
3005                                (old-index
3006                                 (%irregex-match-end-index matches n)))
3007                            (irregex-match-end-chunk-set! matches n src)
3008                            (irregex-match-end-index-set! matches n i)
3009                            (next cnk init src str i end matches
3010                                  (lambda ()
3011                                    (irregex-match-end-chunk-set!
3012                                     matches n old-source)
3013                                    (irregex-match-end-index-set!
3014                                     matches n old-index)
3015                                    (fail))))))))
3016               (lambda (cnk init src str i end matches fail)
3017                 (let ((old-source (%irregex-match-start-chunk matches n))
3018                       (old-index (%irregex-match-start-index matches n)))
3019                   (irregex-match-start-chunk-set! matches n src)
3020                   (irregex-match-start-index-set! matches n i)
3021                   (body cnk init src str i end matches
3022                         (lambda ()
3023                           (irregex-match-start-chunk-set!
3024                            matches n old-source)
3025                           (irregex-match-start-index-set!
3026                            matches n old-index)
3027                           (fail)))))))
3028            ((=> submatch-named)
3029             (rec `(submatch ,@(cddr sre))))
3030            (else
3031             (error "unknown regexp operator" sre)))))
3032     ((symbol? sre)
3033      (case sre
3034        ((any)
3035         (lambda (cnk init src str i end matches fail)
3036           (if (< i end)
3037               (next cnk init src str (+ i 1) end matches fail)
3038               (let ((src2 ((chunker-get-next cnk) src)))
3039                 (if src2
3040                     (let ((str2 ((chunker-get-str cnk) src2))
3041                           (i2 ((chunker-get-start cnk) src2))
3042                           (end2 ((chunker-get-end cnk) src2)))
3043                       (next cnk init src2 str2 (+ i2 1) end2 matches fail))
3044                     (fail))))))
3045        ((nonl)
3046         (lambda (cnk init src str i end matches fail)
3047           (if (< i end)
3048               (if (not (eqv? #\newline (string-ref str i)))
3049                   (next cnk init src str (+ i 1) end matches fail)
3050                   (fail))
3051               (let ((src2 ((chunker-get-next cnk) src)))
3052                 (if src2
3053                     (let ((str2 ((chunker-get-str cnk) src2))
3054                           (i2 ((chunker-get-start cnk) src2))
3055                           (end2 ((chunker-get-end cnk) src2)))
3056                       (if (not (eqv? #\newline (string-ref str2 i2)))
3057                           (next cnk init src2 str2 (+ i2 1) end2 matches fail)
3058                           (fail)))
3059                     (fail))))))
3060        ((bos)
3061         (lambda (cnk init src str i end matches fail)
3062           (if (and (eq? src (car init)) (eqv? i (cdr init)))
3063               (next cnk init src str i end matches fail)
3064               (fail))))
3065        ((bol)
3066         (lambda (cnk init src str i end matches fail)
3067           (if (or (and (eq? src (car init)) (eqv? i (cdr init)))
3068                   (and (> i ((chunker-get-start cnk) src))
3069                        (eqv? #\newline (string-ref str (- i 1)))))
3070               (next cnk init src str i end matches fail)
3071               (fail))))
3072        ((bow)
3073         (lambda (cnk init src str i end matches fail)
3074           (if (and (or (if (> i ((chunker-get-start cnk) src))
3075                            (not (char-alphanumeric? (string-ref str (- i 1))))
3076                            (let ((ch (chunker-prev-char cnk src end)))
3077                              (and ch (not (char-alphanumeric? ch)))))
3078                        (and (eq? src (car init)) (eqv? i (cdr init))))
3079                    (if (< i end)
3080                        (char-alphanumeric? (string-ref str i))
3081                        (let ((next ((chunker-get-next cnk) src)))
3082                          (and next
3083                               (char-alphanumeric?
3084                                (string-ref ((chunker-get-str cnk) next)
3085                                            ((chunker-get-start cnk) next)))))))
3086               (next cnk init src str i end matches fail)
3087               (fail))))
3088        ((eos)
3089         (lambda (cnk init src str i end matches fail)
3090           (if (and (>= i end) (not ((chunker-get-next cnk) src)))
3091               (next cnk init src str i end matches fail)
3092               (fail))))
3093        ((eol)
3094         (lambda (cnk init src str i end matches fail)
3095           (if (if (< i end)
3096                   (eqv? #\newline (string-ref str i))
3097                   (let ((src2 ((chunker-get-next cnk) src)))
3098                     (if (not src2)
3099                         #t
3100                         (eqv? #\newline
3101                               (string-ref ((chunker-get-str cnk) src2)
3102                                           ((chunker-get-start cnk) src2))))))
3103               (next cnk init src str i end matches fail)
3104               (fail))))
3105        ((eow)
3106         (lambda (cnk init src str i end matches fail)
3107           (if (and (if (< i end)
3108                        (not (char-alphanumeric? (string-ref str i)))
3109                        (let ((ch (chunker-next-char cnk src)))
3110                          (or (not ch) (not (char-alphanumeric? ch)))))
3111                    (if (> i ((chunker-get-start cnk) src))
3112                        (char-alphanumeric? (string-ref str (- i 1)))
3113                        (let ((prev (chunker-prev-char cnk init src)))
3114                          (or (not prev) (char-alphanumeric? prev)))))
3115               (next cnk init src str i end matches fail)
3116               (fail))))
3117        ((nwb)  ;; non-word-boundary
3118         (lambda (cnk init src str i end matches fail)
3119           (let ((c1 (if (< i end)
3120                         (string-ref str i)
3121                         (chunker-next-char cnk src)))
3122                 (c2 (if (> i ((chunker-get-start cnk) src))
3123                         (string-ref str (- i 1))
3124                         (chunker-prev-char cnk init src))))
3125             (if (and c1 c2
3126                      (if (char-alphanumeric? c1)
3127                          (char-alphanumeric? c2)
3128                          (not (char-alphanumeric? c2))))
3129                 (next cnk init src str i end matches fail)
3130                 (fail)))))
3131        ((epsilon)
3132         next)
3133        (else
3134         (let ((cell (assq sre sre-named-definitions)))
3135           (if cell
3136               (rec (cdr cell))
3137               (error "unknown regexp" sre))))))
3138     ((char? sre)
3139      (if (flag-set? flags ~case-insensitive?)
3140          ;; case-insensitive
3141          (lambda (cnk init src str i end matches fail)
3142            (if (>= i end)
3143                (let lp ((src2 ((chunker-get-next cnk) src)))
3144                  (if src2
3145                      (let ((str2 ((chunker-get-str cnk) src2))
3146                            (i2 ((chunker-get-start cnk) src2))
3147                            (end2 ((chunker-get-end cnk) src2)))
3148                        (if (>= i2 end2)
3149                            (lp ((chunker-get-next cnk) src2))
3150                            (if (char-ci=? sre (string-ref str2 i2))
3151                                (next cnk init src2 str2 (+ i2 1) end2
3152                                      matches fail)
3153                                (fail))))
3154                      (fail)))
3155                (if (char-ci=? sre (string-ref str i))
3156                    (next cnk init src str (+ i 1) end matches fail)
3157                    (fail))))
3158          ;; case-sensitive
3159          (lambda (cnk init src str i end matches fail)
3160            (if (>= i end)
3161                (let lp ((src2 ((chunker-get-next cnk) src)))
3162                  (if src2
3163                      (let ((str2 ((chunker-get-str cnk) src2))
3164                            (i2 ((chunker-get-start cnk) src2))
3165                            (end2 ((chunker-get-end cnk) src2)))
3166                        (if (>= i2 end2)
3167                            (lp ((chunker-get-next cnk) src2))
3168                            (if (char=? sre (string-ref str2 i2))
3169                                (next cnk init src2 str2 (+ i2 1) end2
3170                                      matches fail)
3171                                (fail))))
3172                      (fail)))
3173                (if (char=? sre (string-ref str i))
3174                    (next cnk init src str (+ i 1) end matches fail)
3175                    (fail))))
3176          ))
3177     ((string? sre)
3178      (rec (sre-sequence (string->list sre)))
3179;; XXXX reintroduce faster string matching on chunks
3180;;       (if (flag-set? flags ~case-insensitive?)
3181;;           (rec (sre-sequence (string->list sre)))
3182;;           (let ((len (string-length sre)))
3183;;             (lambda (cnk init src str i end matches fail)
3184;;               (if (and (<= (+ i len) end)
3185;;                        (%substring=? sre str 0 i len))
3186;;                   (next str (+ i len) matches fail)
3187;;                   (fail)))))
3188      )
3189     (else
3190      (error "unknown regexp" sre)))))
3191
3192;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3193;;;; Character Sets
3194;;
3195;; Simple character sets as lists of ranges, as used in the NFA/DFA
3196;; compilation.  This is not especially efficient, but is portable and
3197;; scalable for any range of character sets.
3198
3199(define (sre-cset->procedure cset next)
3200  (lambda (cnk init src str i end matches fail)
3201    (if (< i end)
3202        (if (cset-contains? cset (string-ref str i))
3203            (next cnk init src str (+ i 1) end matches fail)
3204            (fail))
3205        (let ((src2 ((chunker-get-next cnk) src)))
3206          (if src2
3207              (let ((str2 ((chunker-get-str cnk) src2))
3208                    (i2 ((chunker-get-start cnk) src2))
3209                    (end2 ((chunker-get-end cnk) src2)))
3210                (if (cset-contains? cset (string-ref str2 i2))
3211                    (next cnk init src2 str2 (+ i2 1) end2 matches fail)
3212                    (fail)))
3213              (fail))))))
3214
3215(define (make-cset) (vector))
3216(define (range->cset from to) (vector (cons from to)))
3217(define (char->cset ch) (vector (cons ch ch)))
3218(define (cset-empty? cs) (zero? (vector-length cs)))
3219(define (maybe-cset->char cs)
3220  (if (and (= (vector-length cs) 1)
3221           (char=? (car (vector-ref cs 0)) (cdr (vector-ref cs 0))))
3222      (car (vector-ref cs 0))
3223      cs))
3224
3225;; Since csets are sorted, there's only one possible representation of any cset
3226(define cset=? equal?)
3227
3228(define (cset-size cs)
3229  (let ((len (vector-length cs)))
3230   (let lp ((i 0) (size 0))
3231     (if (= i len)
3232         size
3233         (lp (+ i 1) (+ size 1
3234                        (- (char->integer (cdr (vector-ref cs i)))
3235                           (char->integer (car (vector-ref cs i))))))))))
3236
3237(define (cset->plist cs)
3238  (let lp ((i (- (vector-length cs) 1))
3239           (res '()))
3240    (if (= i -1)
3241        res
3242        (lp (- i 1) (cons (car (vector-ref cs i))
3243                          (cons (cdr (vector-ref cs i)) res))))))
3244
3245(define (plist->cset ls)
3246  (let lp ((ls ls) (res (make-cset)))
3247    (if (null? ls)
3248        res
3249        (lp (cddr ls) (cset-union (range->cset (car ls) (cadr ls)) res)))))
3250
3251(define (string->cset s)
3252  (fold (lambda (ch cs)
3253          (cset-adjoin cs ch))
3254        (make-cset)
3255        (string->list s)))
3256
3257(define (sre->cset sre . o)
3258  (let lp ((sre sre) (ci? (and (pair? o) (car o))))
3259    (define (rec sre) (lp sre ci?))
3260    (cond
3261     ((pair? sre)
3262      (if (string? (car sre))
3263          (if ci?
3264              (cset-case-insensitive (string->cset (car sre)))
3265              (string->cset (car sre)))
3266          (case (car sre)
3267            ((~)
3268             (cset-complement
3269              (fold cset-union (rec (cadr sre)) (map rec (cddr sre)))))
3270            ((&)
3271             (fold cset-intersection (rec (cadr sre)) (map rec (cddr sre))))
3272            ((-)
3273             (fold (lambda (x res) (cset-difference res x))
3274                   (rec (cadr sre))
3275                   (map rec (cddr sre))))
3276            ((/)
3277             (let ((res (plist->cset (sre-flatten-ranges (cdr sre)))))
3278               (if ci?
3279                   (cset-case-insensitive res)
3280                   res)))
3281            ((or)
3282             (fold cset-union (rec (cadr sre)) (map rec (cddr sre))))
3283            ((w/case)
3284             (lp (sre-alternate (cdr sre)) #f))
3285            ((w/nocase)
3286             (lp (sre-alternate (cdr sre)) #t))
3287            (else
3288             (error "not a valid sre char-set operator" sre)))))
3289     ((char? sre) (if ci?
3290                      (cset-case-insensitive (range->cset sre sre))
3291                      (range->cset sre sre)))
3292     ((string? sre) (rec (list sre)))
3293     (else
3294      (let ((cell (assq sre sre-named-definitions)))
3295        (if cell
3296            (rec (cdr cell))
3297            (error "not a valid sre char-set" sre)))))))
3298
3299(define (cset->sre cset)
3300  (sre-alternate
3301   (map (lambda (x) (list '/ (car x) (cdr x)))
3302        (vector->list cset))))
3303
3304(define (cset-contains? cset ch)
3305  (let ((len (vector-length cset)))
3306    (case len
3307      ((0) #f)
3308      ((1) (let ((range (vector-ref cset 0)))
3309             (and (char<=? ch (cdr range)) (char<=? (car range) ch))))
3310      (else (let lp ((lower 0) (upper len))
3311              (let* ((middle (quotient (+ upper lower) 2))
3312                     (range (vector-ref cset middle)))
3313                (cond ((char<? (cdr range) ch)
3314                       (let ((next (+ middle 1)))
3315                         (and (< next upper) (lp next upper))))
3316                      ((char<? ch (car range))
3317                       (and (< lower middle) (lp lower middle)))
3318                      (else #t))))))))
3319
3320(define (char-ranges-union a b)
3321  (cons (if (char<=? (car a) (car b)) (car a) (car b))
3322        (if (char>=? (cdr a) (cdr b)) (cdr a) (cdr b))))
3323
3324(define (cset-union a b)
3325  (let union-range ((a (vector->list a))
3326                    (b (vector->list b))
3327                    (res '()))
3328    (cond
3329     ((null? a) (list->vector (reverse (append (reverse b) res))))
3330     ((null? b) (list->vector (reverse (append (reverse a) res))))
3331     (else
3332      (let ((a-range (car a))
3333            (b-range (car b)))
3334        (cond
3335         ((i/char<? (next-char-idx (cdr a-range)) (car b-range))
3336          (union-range (cdr a) b (cons a-range res)))
3337         ((i/char>? (car a-range) (next-char-idx (cdr b-range)))
3338          (union-range (cdr b) a (cons b-range res)))
3339         ((char>=? (cdr a-range) (car b-range))
3340          (union-range (cons (char-ranges-union a-range b-range) (cdr a))
3341                       (cdr b)
3342                       res))
3343         (else (union-range (cdr a)
3344                            (cons (char-ranges-union a-range b-range) (cdr b))
3345                            res))))))))
3346
3347(define (cset-adjoin cs ch) (cset-union cs (char->cset ch)))
3348
3349(define (char-idx obj)
3350  (if (char? obj)
3351    (char->integer obj)
3352    obj))
3353(define (i/char<? x y)
3354  (< (char-idx x)
3355     (char-idx y)))
3356(define (i/char>? x y)
3357  (> (char-idx x)
3358     (char-idx y)))
3359
3360
3361(define (next-char c)
3362  (integer->char (+ (char->integer c) 1)))
3363
3364(define (prev-char c)
3365  (integer->char (- (char->integer c) 1)))
3366
3367;; NMOSH: we need this because R6RS doesn't allow (+ #x10ffff 1)
3368;;        char index.
3369(define (next-char-idx c)
3370  (+ (char->integer c) 1))
3371
3372(define (cset-difference a b)
3373  (let diff ((a (vector->list a))
3374             (b (vector->list b))
3375             (res '()))
3376    (cond ((null? a) (list->vector (reverse res)))
3377          ((null? b) (list->vector (append (reverse res) a)))
3378          (else
3379           (let ((a-range (car a))
3380                 (b-range (car b)))
3381             (cond
3382              ((char<? (cdr a-range) (car b-range))
3383               (diff (cdr a) b (cons a-range res)))
3384              ((char>? (car a-range) (cdr b-range))
3385               (diff a (cdr b) res))
3386              ((and (char<=? (car b-range) (car a-range))
3387                    (char>=? (cdr b-range) (cdr a-range)))
3388               (diff (cdr a) b res))
3389              (else (let ((left (and (char<? (car a-range) (car b-range))
3390                                     (cons (car a-range)
3391                                           (prev-char (car b-range)))))
3392                          (right (and (char>? (cdr a-range) (cdr b-range))
3393                                      (cons (next-char (cdr b-range))
3394                                            (cdr a-range)))))
3395                      (diff (if right (cons right (cdr a)) (cdr a))
3396                            b
3397                            (if left (cons left res) res))))))))))
3398
3399(define (min-char a b)
3400  (if (char<? a b) a b))
3401
3402(define (max-char a b)
3403  (if (char<? a b) b a))
3404
3405(define (cset-intersection a b)
3406  (let intersect ((a (vector->list a))
3407                  (b (vector->list b))
3408                  (res '()))
3409    (if (or (null? a) (null? b))
3410        (list->vector (reverse res))
3411        (let ((a-range (car a))
3412              (b-range (car b)))
3413          (cond
3414           ((char<? (cdr a-range) (car b-range))
3415            (intersect (cdr a) b res))
3416           ((char>? (car a-range) (cdr b-range))
3417            (intersect a (cdr b) res))
3418           (else
3419            (let ((result (cons (max-char (car b-range) (car a-range))
3420                                (min-char (cdr a-range) (cdr b-range)))))
3421              (intersect (if (char>? (cdr a-range) (cdr result))
3422                             a (cdr a))
3423                         (if (char>? (cdr b-range) (cdr result))
3424                             b (cdr b))
3425                         (cons result res)))))))))
3426
3427(define (cset-complement a)
3428  (cset-difference (sre->cset *all-chars*) a))
3429
3430;; This could use some optimization :)
3431(define (cset-case-insensitive a)
3432  (let lp ((ls (vector->list a)) (res '()))
3433    (cond ((null? ls) (list->vector (reverse res)))
3434          ((and (char-alphabetic? (caar ls))
3435                (char-alphabetic? (cdar ls)))
3436           (lp (cdr ls)
3437               (reverse
3438                (vector->list
3439                 (cset-union (cset-union (list->vector (reverse res))
3440                                         (vector (car ls)))
3441                             (range->cset (char-altcase (caar ls))
3442                                          (char-altcase (cdar ls))))))))
3443          (else (lp (cdr ls) (reverse (vector->list
3444                                       (cset-union (list->vector (reverse res))
3445                                                   (vector (car ls))))))))))
3446
3447;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3448;;;; Match and Replace Utilities
3449
3450(define (irregex-fold/fast irx kons knil str . o)
3451  (if (not (string? str)) (error "irregex-fold: not a string" str))
3452  (let* ((irx (irregex irx))
3453         (matches (irregex-new-matches irx))
3454         (finish (or (and (pair? o) (car o)) (lambda (i acc) acc)))
3455         (start (if (and (pair? o) (pair? (cdr o))) (cadr o) 0))
3456         (end (if (and (pair? o) (pair? (cdr o)) (pair? (cddr o)))
3457                  (caddr o)
3458                  (string-length str))))
3459    (if (not (and (integer? start) (exact? start)))
3460        (error "irregex-fold: not an exact integer" start))
3461    (if (not (and (integer? end) (exact? end)))
3462        (error "irregex-fold: not an exact integer" end))
3463    (irregex-match-chunker-set! matches irregex-basic-string-chunker)
3464    (let lp ((i start) (acc knil))
3465      (if (>= i end)
3466          (finish i acc)
3467          (let ((m (irregex-search/matches
3468                    irx
3469                    irregex-basic-string-chunker
3470                    (list str i end)
3471                    i
3472                    matches)))
3473            (if (not m)
3474                (finish i acc)
3475                (let* ((end (%irregex-match-end-index m 0))
3476                       (acc (kons i m acc)))
3477                  (irregex-reset-matches! matches)
3478                  (lp end acc))))))))
3479
3480(define (irregex-fold irx kons . args)
3481  (if (not (procedure? kons)) (error "irregex-fold: not a procedure" kons))
3482  (let ((kons2 (lambda (i m acc) (kons i (irregex-copy-matches m) acc))))
3483    (apply irregex-fold/fast irx kons2 args)))
3484
3485(define (irregex-fold/chunked/fast irx kons knil cnk start . o)
3486  (let* ((irx (irregex irx))
3487         (matches (irregex-new-matches irx))
3488         (finish (or (and (pair? o) (car o)) (lambda (src i acc) acc)))
3489         (i (if (and (pair? o) (pair? (cdr o)))
3490                (cadr o)
3491                ((chunker-get-start cnk) start))))
3492    (if (not (integer? i)) (error "irregex-fold/chunked: not an integer" i))
3493    (irregex-match-chunker-set! matches cnk)
3494    (let lp ((start start) (i i) (acc knil))
3495      (if (not start)
3496          (finish start i acc)
3497          (let ((m (irregex-search/matches irx cnk start i matches)))
3498            (if (not m)
3499                (finish start i acc)
3500                (let* ((acc (kons start i m acc))
3501                       (end-src (%irregex-match-end-chunk m 0))
3502                       (end-index (%irregex-match-end-index m 0)))
3503                  (irregex-reset-matches! matches)
3504                  (lp end-src end-index acc))))))))
3505
3506(define (irregex-fold/chunked irx kons . args)
3507  (if (not (procedure? kons)) (error "irregex-fold/chunked: not a procedure" kons))
3508  (let ((kons2 (lambda (s i m acc) (kons s i (irregex-copy-matches m) acc))))
3509    (apply irregex-fold/chunked/fast irx kons2 args)))
3510
3511(define (irregex-replace irx str . o)
3512  (if (not (string? str)) (error "irregex-replace: not a string" str))
3513  (let ((m (irregex-search irx str)))
3514    (and
3515     m
3516     (string-cat-reverse
3517      (cons (substring str (%irregex-match-end-index m 0) (string-length str))
3518            (append (irregex-apply-match m o)
3519                    (list (substring str 0 (%irregex-match-start-index m 0)))
3520                    ))))))
3521
3522(define (irregex-replace/all irx str . o)
3523  (if (not (string? str)) (error "irregex-replace/all: not a string" str))
3524  (irregex-fold/fast
3525   irx
3526   (lambda (i m acc)
3527     (let ((m-start (%irregex-match-start-index m 0)))
3528       (append (irregex-apply-match m o)
3529               (if (>= i m-start)
3530                   acc
3531                   (cons (substring str i m-start) acc)))))
3532   '()
3533   str
3534   (lambda (i acc)
3535     (let ((end (string-length str)))
3536       (string-cat-reverse (if (>= i end)
3537                               acc
3538                               (cons (substring str i end) acc)))))))
3539
3540(define (irregex-apply-match m ls)
3541  (let lp ((ls ls) (res '()))
3542    (if (null? ls)
3543        res
3544        (cond
3545         ((integer? (car ls))
3546          (lp (cdr ls)
3547              (cons (or (irregex-match-substring m (car ls)) "") res)))
3548         ((procedure? (car ls))
3549          (lp (cdr ls) (cons ((car ls) m) res)))
3550         ((symbol? (car ls))
3551          (case (car ls)
3552            ((pre)
3553             (lp (cdr ls)
3554                 (cons (substring (car (%irregex-match-start-chunk m 0))
3555                                  0
3556                                  (%irregex-match-start-index m 0))
3557                       res)))
3558            ((post)
3559             (let ((str (car (%irregex-match-start-chunk m 0))))
3560               (lp (cdr ls)
3561                   (cons (substring str
3562                                    (%irregex-match-end-index m 0)
3563                                    (string-length str))
3564                         res))))
3565            (else
3566             (cond
3567              ((assq (car ls) (irregex-match-names m))
3568               => (lambda (x) (lp (cons (cdr x) (cdr ls)) res)))
3569              (else
3570               (error "unknown match replacement" (car ls)))))))
3571         (else
3572          (lp (cdr ls) (cons (car ls) res)))))))
3573
3574(define (irregex-extract irx str . o)
3575  (if (not (string? str)) (error "irregex-extract: not a string" str))
3576  (apply irregex-fold/fast
3577         irx
3578         (lambda (i m a) (cons (irregex-match-substring m) a))
3579         '()
3580         str
3581         (lambda (i a) (reverse a))
3582         o))
3583
3584(define (irregex-split irx str . o)
3585  (if (not (string? str)) (error "irregex-split: not a string" str))
3586  (let ((start (if (pair? o) (car o) 0))
3587        (end (if (and (pair? o) (pair? (cdr o))) (cadr o) (string-length str))))
3588    (irregex-fold/fast
3589     irx
3590     (lambda (i m a)
3591       (if (= i (%irregex-match-start-index m 0))
3592           a
3593           (cons (substring str i (%irregex-match-start-index m 0)) a)))
3594     '()
3595     str
3596     (lambda (i a)
3597       (reverse (if (= i end) a (cons (substring str i end) a))))
3598     start
3599     end)))
3600)
3601