1;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*-
2;;; $Header: /usr/local/cvsrep/cl-ppcre/repetition-closures.lisp,v 1.34 2009/09/17 19:17:31 edi Exp $
3
4;;; This is actually a part of closures.lisp which we put into a
5;;; separate file because it is rather complex. We only deal with
6;;; REPETITIONs here. Note that this part of the code contains some
7;;; rather crazy micro-optimizations which were introduced to be as
8;;; competitive with Perl as possible in tight loops.
9
10;;; Copyright (c) 2002-2009, Dr. Edmund Weitz. All rights reserved.
11
12;;; Redistribution and use in source and binary forms, with or without
13;;; modification, are permitted provided that the following conditions
14;;; are met:
15
16;;;   * Redistributions of source code must retain the above copyright
17;;;     notice, this list of conditions and the following disclaimer.
18
19;;;   * Redistributions in binary form must reproduce the above
20;;;     copyright notice, this list of conditions and the following
21;;;     disclaimer in the documentation and/or other materials
22;;;     provided with the distribution.
23
24;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
25;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
26;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27;;; ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
28;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
30;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
31;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
32;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
33;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
34;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35
36(in-package :cl-ppcre)
37
38(defmacro incf-after (place &optional (delta 1) &environment env)
39  "Utility macro inspired by C's \"place++\", i.e. first return the
40value of PLACE and afterwards increment it by DELTA."
41  (with-unique-names (%temp)
42    (multiple-value-bind (vars vals store-vars writer-form reader-form)
43        (get-setf-expansion place env)
44      `(let* (,@(mapcar #'list vars vals)
45              (,%temp ,reader-form)
46              (,(car store-vars) (+ ,%temp ,delta)))
47        ,writer-form
48        ,%temp))))
49
50;; code for greedy repetitions with minimum zero
51
52(defmacro greedy-constant-length-closure (check-curr-pos)
53  "This is the template for simple greedy repetitions (where simple
54means that the minimum number of repetitions is zero, that the inner
55regex to be checked is of fixed length LEN, and that it doesn't
56contain registers, i.e. there's no need for backtracking).
57CHECK-CURR-POS is a form which checks whether the inner regex of the
58repetition matches at CURR-POS."
59  `(if maximum
60    (lambda (start-pos)
61      (declare (fixnum start-pos maximum))
62      ;; because we know LEN we know in advance where to stop at the
63      ;; latest; we also take into consideration MIN-REST, i.e. the
64      ;; minimal length of the part behind the repetition
65      (let ((target-end-pos (min (1+ (- *end-pos* len min-rest))
66                                 ;; don't go further than MAXIMUM
67                                 ;; repetitions, of course
68                                 (+ start-pos
69                                    (the fixnum (* len maximum)))))
70            (curr-pos start-pos))
71        (declare (fixnum target-end-pos curr-pos))
72        (block greedy-constant-length-matcher
73          ;; we use an ugly TAGBODY construct because this might be a
74          ;; tight loop and this version is a bit faster than our LOOP
75          ;; version (at least in CMUCL)
76          (tagbody
77            forward-loop
78            ;; first go forward as far as possible, i.e. while
79            ;; the inner regex matches
80            (when (>= curr-pos target-end-pos)
81              (go backward-loop))
82            (when ,check-curr-pos
83              (incf curr-pos len)
84              (go forward-loop))
85            backward-loop
86            ;; now go back LEN steps each until we're able to match
87            ;; the rest of the regex
88            (when (< curr-pos start-pos)
89              (return-from greedy-constant-length-matcher nil))
90            (let ((result (funcall next-fn curr-pos)))
91              (when result
92                (return-from greedy-constant-length-matcher result)))
93            (decf curr-pos len)
94            (go backward-loop)))))
95    ;; basically the same code; it's just a bit easier because we're
96    ;; not bounded by MAXIMUM
97    (lambda (start-pos)
98      (declare (fixnum start-pos))
99      (let ((target-end-pos (1+ (- *end-pos* len min-rest)))
100            (curr-pos start-pos))
101        (declare (fixnum target-end-pos curr-pos))
102        (block greedy-constant-length-matcher
103          (tagbody
104            forward-loop
105            (when (>= curr-pos target-end-pos)
106              (go backward-loop))
107            (when ,check-curr-pos
108              (incf curr-pos len)
109              (go forward-loop))
110            backward-loop
111            (when (< curr-pos start-pos)
112              (return-from greedy-constant-length-matcher nil))
113            (let ((result (funcall next-fn curr-pos)))
114              (when result
115                (return-from greedy-constant-length-matcher result)))
116            (decf curr-pos len)
117            (go backward-loop)))))))
118
119(defun create-greedy-everything-matcher (maximum min-rest next-fn)
120  "Creates a closure which just matches as far ahead as possible,
121i.e. a closure for a dot in single-line mode."
122  (declare #.*standard-optimize-settings*)
123  (declare (fixnum min-rest) (function next-fn))
124  (if maximum
125    (lambda (start-pos)
126      (declare (fixnum start-pos maximum))
127      ;; because we know LEN we know in advance where to stop at the
128      ;; latest; we also take into consideration MIN-REST, i.e. the
129      ;; minimal length of the part behind the repetition
130      (let ((target-end-pos (min (+ start-pos maximum)
131                                 (- *end-pos* min-rest))))
132        (declare (fixnum target-end-pos))
133        ;; start from the highest possible position and go backward
134        ;; until we're able to match the rest of the regex
135        (loop for curr-pos of-type fixnum from target-end-pos downto start-pos
136              thereis (funcall next-fn curr-pos))))
137    ;; basically the same code; it's just a bit easier because we're
138    ;; not bounded by MAXIMUM
139    (lambda (start-pos)
140      (declare (fixnum start-pos))
141      (let ((target-end-pos (- *end-pos* min-rest)))
142        (declare (fixnum target-end-pos))
143        (loop for curr-pos of-type fixnum from target-end-pos downto start-pos
144              thereis (funcall next-fn curr-pos))))))
145
146(defgeneric create-greedy-constant-length-matcher (repetition next-fn)
147  (declare #.*standard-optimize-settings*)
148  (:documentation "Creates a closure which tries to match REPETITION.
149It is assumed that REPETITION is greedy and the minimal number of
150repetitions is zero.  It is furthermore assumed that the inner regex
151of REPETITION is of fixed length and doesn't contain registers."))
152
153(defmethod create-greedy-constant-length-matcher ((repetition repetition)
154                                                  next-fn)
155  (declare #.*standard-optimize-settings*)
156  (let ((len (len repetition))
157        (maximum (maximum repetition))
158        (regex (regex repetition))
159        (min-rest (min-rest repetition)))
160    (declare (fixnum len min-rest)
161             (function next-fn))
162    (cond ((zerop len)
163            ;; inner regex has zero-length, so we can discard it
164            ;; completely
165            next-fn)
166          (t
167            ;; now first try to optimize for a couple of common cases
168            (typecase regex
169              (str
170                (let ((str (str regex)))
171                  (if (= 1 len)
172                    ;; a single character
173                    (let ((chr (schar str 0)))
174                      (if (case-insensitive-p regex)
175                        (greedy-constant-length-closure
176                         (char-equal chr (schar *string* curr-pos)))
177                        (greedy-constant-length-closure
178                         (char= chr (schar *string* curr-pos)))))
179                    ;; a string
180                    (if (case-insensitive-p regex)
181                      (greedy-constant-length-closure
182                       (*string*-equal str curr-pos (+ curr-pos len) 0 len))
183                      (greedy-constant-length-closure
184                       (*string*= str curr-pos (+ curr-pos len) 0 len))))))
185              (char-class
186                ;; a character class
187                (insert-char-class-tester (regex (schar *string* curr-pos))
188                  (greedy-constant-length-closure
189                   (char-class-test))))
190              (everything
191                ;; an EVERYTHING object, i.e. a dot
192                (if (single-line-p regex)
193                  (create-greedy-everything-matcher maximum min-rest next-fn)
194                  (greedy-constant-length-closure
195                   (char/= #\Newline (schar *string* curr-pos)))))
196              (t
197                ;; the general case - we build an inner matcher which
198                ;; just checks for immediate success, i.e. NEXT-FN is
199                ;; #'IDENTITY
200                (let ((inner-matcher (create-matcher-aux regex #'identity)))
201                  (declare (function inner-matcher))
202                  (greedy-constant-length-closure
203                   (funcall inner-matcher curr-pos)))))))))
204
205(defgeneric create-greedy-no-zero-matcher (repetition next-fn)
206  (declare #.*standard-optimize-settings*)
207  (:documentation "Creates a closure which tries to match REPETITION.
208It is assumed that REPETITION is greedy and the minimal number of
209repetitions is zero.  It is furthermore assumed that the inner regex
210of REPETITION can never match a zero-length string \(or instead the
211maximal number of repetitions is 1)."))
212
213(defmethod create-greedy-no-zero-matcher ((repetition repetition) next-fn)
214  (declare #.*standard-optimize-settings*)
215  (let ((maximum (maximum repetition))
216        ;; REPEAT-MATCHER is part of the closure's environment but it
217        ;; can only be defined after GREEDY-AUX is defined
218        repeat-matcher)
219    (declare (function next-fn))
220    (cond
221      ((eql maximum 1)
222        ;; this is essentially like the next case but with a known
223        ;; MAXIMUM of 1 we can get away without a counter; note that
224        ;; we always arrive here if CONVERT optimizes <regex>* to
225        ;; (?:<regex'>*<regex>)?
226        (setq repeat-matcher
227                (create-matcher-aux (regex repetition) next-fn))
228        (lambda (start-pos)
229          (declare (function repeat-matcher))
230          (or (funcall repeat-matcher start-pos)
231              (funcall next-fn start-pos))))
232      (maximum
233        ;; we make a reservation for our slot in *REPEAT-COUNTERS*
234        ;; because we need to keep track whether we've reached MAXIMUM
235        ;; repetitions
236        (let ((rep-num (incf-after *rep-num*)))
237          (flet ((greedy-aux (start-pos)
238                   (declare (fixnum start-pos maximum rep-num)
239                            (function repeat-matcher))
240                   ;; the actual matcher which first tries to match the
241                   ;; inner regex of REPETITION (if we haven't done so
242                   ;; too often) and on failure calls NEXT-FN
243                   (or (and (< (aref *repeat-counters* rep-num) maximum)
244                            (incf (aref *repeat-counters* rep-num))
245                            ;; note that REPEAT-MATCHER will call
246                            ;; GREEDY-AUX again recursively
247                            (prog1
248                              (funcall repeat-matcher start-pos)
249                              (decf (aref *repeat-counters* rep-num))))
250                       (funcall next-fn start-pos))))
251            ;; create a closure to match the inner regex and to
252            ;; implement backtracking via GREEDY-AUX
253            (setq repeat-matcher
254                    (create-matcher-aux (regex repetition) #'greedy-aux))
255            ;; the closure we return is just a thin wrapper around
256            ;; GREEDY-AUX to initialize the repetition counter
257            (lambda (start-pos)
258              (declare (fixnum start-pos))
259              (setf (aref *repeat-counters* rep-num) 0)
260              (greedy-aux start-pos)))))
261      (t
262        ;; easier code because we're not bounded by MAXIMUM, but
263        ;; basically the same
264        (flet ((greedy-aux (start-pos)
265                 (declare (fixnum start-pos)
266                          (function repeat-matcher))
267                 (or (funcall repeat-matcher start-pos)
268                     (funcall next-fn start-pos))))
269          (setq repeat-matcher
270                  (create-matcher-aux (regex repetition) #'greedy-aux))
271          #'greedy-aux)))))
272
273(defgeneric create-greedy-matcher (repetition next-fn)
274  (declare #.*standard-optimize-settings*)
275  (:documentation "Creates a closure which tries to match REPETITION.
276It is assumed that REPETITION is greedy and the minimal number of
277repetitions is zero."))
278
279(defmethod create-greedy-matcher ((repetition repetition) next-fn)
280  (declare #.*standard-optimize-settings*)
281  (let ((maximum (maximum repetition))
282        ;; we make a reservation for our slot in *LAST-POS-STORES* because
283        ;; we have to watch out for endless loops as the inner regex might
284        ;; match zero-length strings
285        (zero-length-num (incf-after *zero-length-num*))
286        ;; REPEAT-MATCHER is part of the closure's environment but it
287        ;; can only be defined after GREEDY-AUX is defined
288        repeat-matcher)
289    (declare (fixnum zero-length-num)
290             (function next-fn))
291    (cond
292      (maximum
293        ;; we make a reservation for our slot in *REPEAT-COUNTERS*
294        ;; because we need to keep track whether we've reached MAXIMUM
295        ;; repetitions
296        (let ((rep-num (incf-after *rep-num*)))
297          (flet ((greedy-aux (start-pos)
298                   ;; the actual matcher which first tries to match the
299                   ;; inner regex of REPETITION (if we haven't done so
300                   ;; too often) and on failure calls NEXT-FN
301                   (declare (fixnum start-pos maximum rep-num)
302                            (function repeat-matcher))
303                   (let ((old-last-pos
304                           (svref *last-pos-stores* zero-length-num)))
305                     (when (and old-last-pos
306                                (= (the fixnum old-last-pos) start-pos))
307                       ;; stop immediately if we've been here before,
308                       ;; i.e. if the last attempt matched a zero-length
309                       ;; string
310                       (return-from greedy-aux (funcall next-fn start-pos)))
311                     ;; otherwise remember this position for the next
312                     ;; repetition
313                     (setf (svref *last-pos-stores* zero-length-num) start-pos)
314                     (or (and (< (aref *repeat-counters* rep-num) maximum)
315                              (incf (aref *repeat-counters* rep-num))
316                              ;; note that REPEAT-MATCHER will call
317                              ;; GREEDY-AUX again recursively
318                              (prog1
319                                (funcall repeat-matcher start-pos)
320                                (decf (aref *repeat-counters* rep-num))
321                                (setf (svref *last-pos-stores* zero-length-num)
322                                        old-last-pos)))
323                         (funcall next-fn start-pos)))))
324            ;; create a closure to match the inner regex and to
325            ;; implement backtracking via GREEDY-AUX
326            (setq repeat-matcher
327                    (create-matcher-aux (regex repetition) #'greedy-aux))
328            ;; the closure we return is just a thin wrapper around
329            ;; GREEDY-AUX to initialize the repetition counter and our
330            ;; slot in *LAST-POS-STORES*
331            (lambda (start-pos)
332              (declare (fixnum start-pos))
333              (setf (aref *repeat-counters* rep-num) 0
334                    (svref *last-pos-stores* zero-length-num) nil)
335              (greedy-aux start-pos)))))
336      (t
337        ;; easier code because we're not bounded by MAXIMUM, but
338        ;; basically the same
339        (flet ((greedy-aux (start-pos)
340                 (declare (fixnum start-pos)
341                          (function repeat-matcher))
342                 (let ((old-last-pos
343                         (svref *last-pos-stores* zero-length-num)))
344                   (when (and old-last-pos
345                              (= (the fixnum old-last-pos) start-pos))
346                     (return-from greedy-aux (funcall next-fn start-pos)))
347                   (setf (svref *last-pos-stores* zero-length-num) start-pos)
348                   (or (prog1
349                         (funcall repeat-matcher start-pos)
350                         (setf (svref *last-pos-stores* zero-length-num) old-last-pos))
351                       (funcall next-fn start-pos)))))
352          (setq repeat-matcher
353                  (create-matcher-aux (regex repetition) #'greedy-aux))
354          (lambda (start-pos)
355            (declare (fixnum start-pos))
356            (setf (svref *last-pos-stores* zero-length-num) nil)
357            (greedy-aux start-pos)))))))
358
359;; code for non-greedy repetitions with minimum zero
360
361(defmacro non-greedy-constant-length-closure (check-curr-pos)
362  "This is the template for simple non-greedy repetitions \(where
363simple means that the minimum number of repetitions is zero, that the
364inner regex to be checked is of fixed length LEN, and that it doesn't
365contain registers, i.e. there's no need for backtracking).
366CHECK-CURR-POS is a form which checks whether the inner regex of the
367repetition matches at CURR-POS."
368  `(if maximum
369    (lambda (start-pos)
370      (declare (fixnum start-pos maximum))
371      ;; because we know LEN we know in advance where to stop at the
372      ;; latest; we also take into consideration MIN-REST, i.e. the
373      ;; minimal length of the part behind the repetition
374      (let ((target-end-pos (min (1+ (- *end-pos* len min-rest))
375                                 (+ start-pos
376                                    (the fixnum (* len maximum))))))
377        ;; move forward by LEN and always try NEXT-FN first, then
378        ;; CHECK-CUR-POS
379        (loop for curr-pos of-type fixnum from start-pos
380                                          below target-end-pos
381                                          by len
382              thereis (funcall next-fn curr-pos)
383              while ,check-curr-pos
384              finally (return (funcall next-fn curr-pos)))))
385  ;; basically the same code; it's just a bit easier because we're
386  ;; not bounded by MAXIMUM
387  (lambda (start-pos)
388    (declare (fixnum start-pos))
389    (let ((target-end-pos (1+ (- *end-pos* len min-rest))))
390      (loop for curr-pos of-type fixnum from start-pos
391                                        below target-end-pos
392                                        by len
393            thereis (funcall next-fn curr-pos)
394            while ,check-curr-pos
395            finally (return (funcall next-fn curr-pos)))))))
396
397(defgeneric create-non-greedy-constant-length-matcher (repetition next-fn)
398  (declare #.*standard-optimize-settings*)
399  (:documentation "Creates a closure which tries to match REPETITION.
400It is assumed that REPETITION is non-greedy and the minimal number of
401repetitions is zero.  It is furthermore assumed that the inner regex
402of REPETITION is of fixed length and doesn't contain registers."))
403
404(defmethod create-non-greedy-constant-length-matcher ((repetition repetition) next-fn)
405  (declare #.*standard-optimize-settings*)
406  (let ((len (len repetition))
407        (maximum (maximum repetition))
408        (regex (regex repetition))
409        (min-rest (min-rest repetition)))
410    (declare (fixnum len min-rest)
411             (function next-fn))
412    (cond ((zerop len)
413            ;; inner regex has zero-length, so we can discard it
414            ;; completely
415            next-fn)
416          (t
417            ;; now first try to optimize for a couple of common cases
418            (typecase regex
419              (str
420                (let ((str (str regex)))
421                  (if (= 1 len)
422                    ;; a single character
423                    (let ((chr (schar str 0)))
424                      (if (case-insensitive-p regex)
425                        (non-greedy-constant-length-closure
426                         (char-equal chr (schar *string* curr-pos)))
427                        (non-greedy-constant-length-closure
428                         (char= chr (schar *string* curr-pos)))))
429                    ;; a string
430                    (if (case-insensitive-p regex)
431                      (non-greedy-constant-length-closure
432                       (*string*-equal str curr-pos (+ curr-pos len) 0 len))
433                      (non-greedy-constant-length-closure
434                       (*string*= str curr-pos (+ curr-pos len) 0 len))))))
435              (char-class
436                ;; a character class
437                (insert-char-class-tester (regex (schar *string* curr-pos))
438                  (non-greedy-constant-length-closure
439                   (char-class-test))))
440              (everything
441                (if (single-line-p regex)
442                  ;; a dot which really can match everything; we rely
443                  ;; on the compiler to optimize this away
444                  (non-greedy-constant-length-closure
445                   t)
446                  ;; a dot which has to watch out for #\Newline
447                  (non-greedy-constant-length-closure
448                   (char/= #\Newline (schar *string* curr-pos)))))
449              (t
450                ;; the general case - we build an inner matcher which
451                ;; just checks for immediate success, i.e. NEXT-FN is
452                ;; #'IDENTITY
453                (let ((inner-matcher (create-matcher-aux regex #'identity)))
454                  (declare (function inner-matcher))
455                  (non-greedy-constant-length-closure
456                   (funcall inner-matcher curr-pos)))))))))
457
458(defgeneric create-non-greedy-no-zero-matcher (repetition next-fn)
459  (declare #.*standard-optimize-settings*)
460  (:documentation "Creates a closure which tries to match REPETITION.
461It is assumed that REPETITION is non-greedy and the minimal number of
462repetitions is zero.  It is furthermore assumed that the inner regex
463of REPETITION can never match a zero-length string \(or instead the
464maximal number of repetitions is 1)."))
465
466(defmethod create-non-greedy-no-zero-matcher ((repetition repetition) next-fn)
467  (declare #.*standard-optimize-settings*)
468  (let ((maximum (maximum repetition))
469        ;; REPEAT-MATCHER is part of the closure's environment but it
470        ;; can only be defined after NON-GREEDY-AUX is defined
471        repeat-matcher)
472    (declare (function next-fn))
473    (cond
474      ((eql maximum 1)
475        ;; this is essentially like the next case but with a known
476        ;; MAXIMUM of 1 we can get away without a counter
477        (setq repeat-matcher
478                (create-matcher-aux (regex repetition) next-fn))
479        (lambda (start-pos)
480          (declare (function repeat-matcher))
481          (or (funcall next-fn start-pos)
482              (funcall repeat-matcher start-pos))))
483      (maximum
484        ;; we make a reservation for our slot in *REPEAT-COUNTERS*
485        ;; because we need to keep track whether we've reached MAXIMUM
486        ;; repetitions
487        (let ((rep-num (incf-after *rep-num*)))
488          (flet ((non-greedy-aux (start-pos)
489                   ;; the actual matcher which first calls NEXT-FN and
490                   ;; on failure tries to match the inner regex of
491                   ;; REPETITION (if we haven't done so too often)
492                   (declare (fixnum start-pos maximum rep-num)
493                            (function repeat-matcher))
494                   (or (funcall next-fn start-pos)
495                       (and (< (aref *repeat-counters* rep-num) maximum)
496                            (incf (aref *repeat-counters* rep-num))
497                            ;; note that REPEAT-MATCHER will call
498                            ;; NON-GREEDY-AUX again recursively
499                            (prog1
500                              (funcall repeat-matcher start-pos)
501                              (decf (aref *repeat-counters* rep-num)))))))
502            ;; create a closure to match the inner regex and to
503            ;; implement backtracking via NON-GREEDY-AUX
504            (setq repeat-matcher
505                    (create-matcher-aux (regex repetition) #'non-greedy-aux))
506            ;; the closure we return is just a thin wrapper around
507            ;; NON-GREEDY-AUX to initialize the repetition counter
508            (lambda (start-pos)
509              (declare (fixnum start-pos))
510              (setf (aref *repeat-counters* rep-num) 0)
511              (non-greedy-aux start-pos)))))
512      (t
513        ;; easier code because we're not bounded by MAXIMUM, but
514        ;; basically the same
515        (flet ((non-greedy-aux (start-pos)
516                 (declare (fixnum start-pos)
517                          (function repeat-matcher))
518                 (or (funcall next-fn start-pos)
519                     (funcall repeat-matcher start-pos))))
520          (setq repeat-matcher
521                  (create-matcher-aux (regex repetition) #'non-greedy-aux))
522          #'non-greedy-aux)))))
523
524(defgeneric create-non-greedy-matcher (repetition next-fn)
525  (declare #.*standard-optimize-settings*)
526  (:documentation "Creates a closure which tries to match REPETITION.
527It is assumed that REPETITION is non-greedy and the minimal number of
528repetitions is zero."))
529
530(defmethod create-non-greedy-matcher ((repetition repetition) next-fn)
531  (declare #.*standard-optimize-settings*)
532  ;; we make a reservation for our slot in *LAST-POS-STORES* because
533  ;; we have to watch out for endless loops as the inner regex might
534  ;; match zero-length strings
535  (let ((zero-length-num (incf-after *zero-length-num*))
536        (maximum (maximum repetition))
537        ;; REPEAT-MATCHER is part of the closure's environment but it
538        ;; can only be defined after NON-GREEDY-AUX is defined
539        repeat-matcher)
540    (declare (fixnum zero-length-num)
541             (function next-fn))
542    (cond
543      (maximum
544        ;; we make a reservation for our slot in *REPEAT-COUNTERS*
545        ;; because we need to keep track whether we've reached MAXIMUM
546        ;; repetitions
547        (let ((rep-num (incf-after *rep-num*)))
548          (flet ((non-greedy-aux (start-pos)
549                   ;; the actual matcher which first calls NEXT-FN and
550                   ;; on failure tries to match the inner regex of
551                   ;; REPETITION (if we haven't done so too often)
552                   (declare (fixnum start-pos maximum rep-num)
553                            (function repeat-matcher))
554                   (let ((old-last-pos
555                           (svref *last-pos-stores* zero-length-num)))
556                     (when (and old-last-pos
557                                (= (the fixnum old-last-pos) start-pos))
558                       ;; stop immediately if we've been here before,
559                       ;; i.e. if the last attempt matched a zero-length
560                       ;; string
561                       (return-from non-greedy-aux (funcall next-fn start-pos)))
562                     ;; otherwise remember this position for the next
563                     ;; repetition
564                     (setf (svref *last-pos-stores* zero-length-num) start-pos)
565                     (or (funcall next-fn start-pos)
566                         (and (< (aref *repeat-counters* rep-num) maximum)
567                              (incf (aref *repeat-counters* rep-num))
568                              ;; note that REPEAT-MATCHER will call
569                              ;; NON-GREEDY-AUX again recursively
570                              (prog1
571                                (funcall repeat-matcher start-pos)
572                                (decf (aref *repeat-counters* rep-num))
573                                (setf (svref *last-pos-stores* zero-length-num)
574                                        old-last-pos)))))))
575            ;; create a closure to match the inner regex and to
576            ;; implement backtracking via NON-GREEDY-AUX
577            (setq repeat-matcher
578                    (create-matcher-aux (regex repetition) #'non-greedy-aux))
579            ;; the closure we return is just a thin wrapper around
580            ;; NON-GREEDY-AUX to initialize the repetition counter and our
581            ;; slot in *LAST-POS-STORES*
582            (lambda (start-pos)
583              (declare (fixnum start-pos))
584              (setf (aref *repeat-counters* rep-num) 0
585                    (svref *last-pos-stores* zero-length-num) nil)
586              (non-greedy-aux start-pos)))))
587      (t
588        ;; easier code because we're not bounded by MAXIMUM, but
589        ;; basically the same
590        (flet ((non-greedy-aux (start-pos)
591                 (declare (fixnum start-pos)
592                          (function repeat-matcher))
593                 (let ((old-last-pos
594                         (svref *last-pos-stores* zero-length-num)))
595                   (when (and old-last-pos
596                              (= (the fixnum old-last-pos) start-pos))
597                     (return-from non-greedy-aux (funcall next-fn start-pos)))
598                   (setf (svref *last-pos-stores* zero-length-num) start-pos)
599                   (or (funcall next-fn start-pos)
600                       (prog1
601                         (funcall repeat-matcher start-pos)
602                         (setf (svref *last-pos-stores* zero-length-num)
603                                 old-last-pos))))))
604          (setq repeat-matcher
605                  (create-matcher-aux (regex repetition) #'non-greedy-aux))
606          (lambda (start-pos)
607            (declare (fixnum start-pos))
608            (setf (svref *last-pos-stores* zero-length-num) nil)
609            (non-greedy-aux start-pos)))))))
610
611;; code for constant repetitions, i.e. those with a fixed number of repetitions
612
613(defmacro constant-repetition-constant-length-closure (check-curr-pos)
614  "This is the template for simple constant repetitions (where simple
615means that the inner regex to be checked is of fixed length LEN, and
616that it doesn't contain registers, i.e. there's no need for
617backtracking) and where constant means that MINIMUM is equal to
618MAXIMUM.  CHECK-CURR-POS is a form which checks whether the inner
619regex of the repetition matches at CURR-POS."
620  `(lambda (start-pos)
621    (declare (fixnum start-pos))
622      (let ((target-end-pos (+ start-pos
623                               (the fixnum (* len repetitions)))))
624        (declare (fixnum target-end-pos))
625        ;; first check if we won't go beyond the end of the string
626        (and (>= *end-pos* target-end-pos)
627             ;; then loop through all repetitions step by step
628             (loop for curr-pos of-type fixnum from start-pos
629                                               below target-end-pos
630                                               by len
631                   always ,check-curr-pos)
632             ;; finally call NEXT-FN if we made it that far
633             (funcall next-fn target-end-pos)))))
634
635(defgeneric create-constant-repetition-constant-length-matcher
636    (repetition next-fn)
637  (declare #.*standard-optimize-settings*)
638  (:documentation "Creates a closure which tries to match REPETITION.
639It is assumed that REPETITION has a constant number of repetitions.
640It is furthermore assumed that the inner regex of REPETITION is of
641fixed length and doesn't contain registers."))
642
643(defmethod create-constant-repetition-constant-length-matcher
644       ((repetition repetition) next-fn)
645  (declare #.*standard-optimize-settings*)
646  (let ((len (len repetition))
647        (repetitions (minimum repetition))
648        (regex (regex repetition)))
649    (declare (fixnum len repetitions)
650             (function next-fn))
651    (if (zerop len)
652      ;; if the length is zero it suffices to try once
653      (create-matcher-aux regex next-fn)
654      ;; otherwise try to optimize for a couple of common cases
655      (typecase regex
656        (str
657          (let ((str (str regex)))
658            (if (= 1 len)
659              ;; a single character
660              (let ((chr (schar str 0)))
661                (if (case-insensitive-p regex)
662                  (constant-repetition-constant-length-closure
663                   (and (char-equal chr (schar *string* curr-pos))
664                        (1+ curr-pos)))
665                  (constant-repetition-constant-length-closure
666                   (and (char= chr (schar *string* curr-pos))
667                        (1+ curr-pos)))))
668              ;; a string
669              (if (case-insensitive-p regex)
670                (constant-repetition-constant-length-closure
671                 (let ((next-pos (+ curr-pos len)))
672                   (declare (fixnum next-pos))
673                   (and (*string*-equal str curr-pos next-pos 0 len)
674                        next-pos)))
675                (constant-repetition-constant-length-closure
676                 (let ((next-pos (+ curr-pos len)))
677                   (declare (fixnum next-pos))
678                   (and (*string*= str curr-pos next-pos 0 len)
679                        next-pos)))))))
680        (char-class
681          ;; a character class
682          (insert-char-class-tester (regex (schar *string* curr-pos))
683            (constant-repetition-constant-length-closure
684             (and (char-class-test)
685                  (1+ curr-pos)))))
686        (everything
687          (if (single-line-p regex)
688            ;; a dot which really matches everything - we just have to
689            ;; advance the index into *STRING* accordingly and check
690            ;; if we didn't go past the end
691            (lambda (start-pos)
692              (declare (fixnum start-pos))
693              (let ((next-pos (+ start-pos repetitions)))
694                (declare (fixnum next-pos))
695                (and (<= next-pos *end-pos*)
696                     (funcall next-fn next-pos))))
697            ;; a dot which is not in single-line-mode - make sure we
698            ;; don't match #\Newline
699            (constant-repetition-constant-length-closure
700             (and (char/= #\Newline (schar *string* curr-pos))
701                  (1+ curr-pos)))))
702        (t
703          ;; the general case - we build an inner matcher which just
704          ;; checks for immediate success, i.e. NEXT-FN is #'IDENTITY
705          (let ((inner-matcher (create-matcher-aux regex #'identity)))
706            (declare (function inner-matcher))
707            (constant-repetition-constant-length-closure
708             (funcall inner-matcher curr-pos))))))))
709
710(defgeneric create-constant-repetition-matcher (repetition next-fn)
711  (declare #.*standard-optimize-settings*)
712  (:documentation "Creates a closure which tries to match REPETITION.
713It is assumed that REPETITION has a constant number of repetitions."))
714
715(defmethod create-constant-repetition-matcher ((repetition repetition) next-fn)
716  (declare #.*standard-optimize-settings*)
717  (let ((repetitions (minimum repetition))
718        ;; we make a reservation for our slot in *REPEAT-COUNTERS*
719        ;; because we need to keep track of the number of repetitions
720        (rep-num (incf-after *rep-num*))
721        ;; REPEAT-MATCHER is part of the closure's environment but it
722        ;; can only be defined after NON-GREEDY-AUX is defined
723        repeat-matcher)
724    (declare (fixnum repetitions rep-num)
725             (function next-fn))
726    (if (zerop (min-len repetition))
727      ;; we make a reservation for our slot in *LAST-POS-STORES*
728      ;; because we have to watch out for needless loops as the inner
729      ;; regex might match zero-length strings
730      (let ((zero-length-num (incf-after *zero-length-num*)))
731        (declare (fixnum zero-length-num))
732        (flet ((constant-aux (start-pos)
733                 ;; the actual matcher which first calls NEXT-FN and
734                 ;; on failure tries to match the inner regex of
735                 ;; REPETITION (if we haven't done so too often)
736                 (declare (fixnum start-pos)
737                          (function repeat-matcher))
738                 (let ((old-last-pos
739                         (svref *last-pos-stores* zero-length-num)))
740                   (when (and old-last-pos
741                              (= (the fixnum old-last-pos) start-pos))
742                     ;; if we've been here before we matched a
743                     ;; zero-length string the last time, so we can
744                     ;; just carry on because we will definitely be
745                     ;; able to do this again often enough
746                     (return-from constant-aux (funcall next-fn start-pos)))
747                   ;; otherwise remember this position for the next
748                   ;; repetition
749                   (setf (svref *last-pos-stores* zero-length-num) start-pos)
750                   (cond ((< (aref *repeat-counters* rep-num) repetitions)
751                           ;; not enough repetitions yet, try it again
752                           (incf (aref *repeat-counters* rep-num))
753                           ;; note that REPEAT-MATCHER will call
754                           ;; CONSTANT-AUX again recursively
755                           (prog1
756                             (funcall repeat-matcher start-pos)
757                             (decf (aref *repeat-counters* rep-num))
758                             (setf (svref *last-pos-stores* zero-length-num)
759                                     old-last-pos)))
760                         (t
761                           ;; we're done - call NEXT-FN
762                           (funcall next-fn start-pos))))))
763          ;; create a closure to match the inner regex and to
764          ;; implement backtracking via CONSTANT-AUX
765          (setq repeat-matcher
766                  (create-matcher-aux (regex repetition) #'constant-aux))
767          ;; the closure we return is just a thin wrapper around
768          ;; CONSTANT-AUX to initialize the repetition counter
769          (lambda (start-pos)
770            (declare (fixnum start-pos))
771            (setf (aref *repeat-counters* rep-num) 0
772                  (aref *last-pos-stores* zero-length-num) nil)
773            (constant-aux start-pos))))
774      ;; easier code because we don't have to care about zero-length
775      ;; matches but basically the same
776      (flet ((constant-aux (start-pos)
777               (declare (fixnum start-pos)
778                        (function repeat-matcher))
779               (cond ((< (aref *repeat-counters* rep-num) repetitions)
780                       (incf (aref *repeat-counters* rep-num))
781                       (prog1
782                         (funcall repeat-matcher start-pos)
783                         (decf (aref *repeat-counters* rep-num))))
784                     (t (funcall next-fn start-pos)))))
785        (setq repeat-matcher
786                (create-matcher-aux (regex repetition) #'constant-aux))
787        (lambda (start-pos)
788          (declare (fixnum start-pos))
789          (setf (aref *repeat-counters* rep-num) 0)
790          (constant-aux start-pos))))))
791
792;; the actual CREATE-MATCHER-AUX method for REPETITION objects which
793;; utilizes all the functions and macros defined above
794
795(defmethod create-matcher-aux ((repetition repetition) next-fn)
796  (declare #.*standard-optimize-settings*)
797  (with-slots (minimum maximum len min-len greedyp contains-register-p)
798      repetition
799    (cond ((and maximum
800                (zerop maximum))
801           ;; this should have been optimized away by CONVERT but just
802           ;; in case...
803           (error "Got REPETITION with MAXIMUM 0 \(should not happen)"))
804          ((and maximum
805                (= minimum maximum 1))
806           ;; this should have been optimized away by CONVERT but just
807           ;; in case...
808           (error "Got REPETITION with MAXIMUM 1 and MINIMUM 1 \(should not happen)"))
809          ((and (eql minimum maximum)
810                len
811                (not contains-register-p))
812           (create-constant-repetition-constant-length-matcher repetition next-fn))
813          ((eql minimum maximum)
814           (create-constant-repetition-matcher repetition next-fn))
815          ((and greedyp
816                len
817                (not contains-register-p))
818           (create-greedy-constant-length-matcher repetition next-fn))
819          ((and greedyp
820                (or (plusp min-len)
821                    (eql maximum 1)))
822           (create-greedy-no-zero-matcher repetition next-fn))
823          (greedyp
824           (create-greedy-matcher repetition next-fn))
825          ((and len
826                (plusp len)
827                (not contains-register-p))
828           (create-non-greedy-constant-length-matcher repetition next-fn))
829          ((or (plusp min-len)
830               (eql maximum 1))
831           (create-non-greedy-no-zero-matcher repetition next-fn))
832          (t
833           (create-non-greedy-matcher repetition next-fn)))))
834