1;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*- 2;;; $Header: /usr/local/cvsrep/cl-ppcre/repetition-closures.lisp,v 1.34 2009/09/17 19:17:31 edi Exp $ 3 4;;; This is actually a part of closures.lisp which we put into a 5;;; separate file because it is rather complex. We only deal with 6;;; REPETITIONs here. Note that this part of the code contains some 7;;; rather crazy micro-optimizations which were introduced to be as 8;;; competitive with Perl as possible in tight loops. 9 10;;; Copyright (c) 2002-2009, Dr. Edmund Weitz. All rights reserved. 11 12;;; Redistribution and use in source and binary forms, with or without 13;;; modification, are permitted provided that the following conditions 14;;; are met: 15 16;;; * Redistributions of source code must retain the above copyright 17;;; notice, this list of conditions and the following disclaimer. 18 19;;; * Redistributions in binary form must reproduce the above 20;;; copyright notice, this list of conditions and the following 21;;; disclaimer in the documentation and/or other materials 22;;; provided with the distribution. 23 24;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED 25;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 26;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY 28;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE 30;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 31;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 32;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 33;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 34;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 35 36(in-package :cl-ppcre) 37 38(defmacro incf-after (place &optional (delta 1) &environment env) 39 "Utility macro inspired by C's \"place++\", i.e. first return the 40value of PLACE and afterwards increment it by DELTA." 41 (with-unique-names (%temp) 42 (multiple-value-bind (vars vals store-vars writer-form reader-form) 43 (get-setf-expansion place env) 44 `(let* (,@(mapcar #'list vars vals) 45 (,%temp ,reader-form) 46 (,(car store-vars) (+ ,%temp ,delta))) 47 ,writer-form 48 ,%temp)))) 49 50;; code for greedy repetitions with minimum zero 51 52(defmacro greedy-constant-length-closure (check-curr-pos) 53 "This is the template for simple greedy repetitions (where simple 54means that the minimum number of repetitions is zero, that the inner 55regex to be checked is of fixed length LEN, and that it doesn't 56contain registers, i.e. there's no need for backtracking). 57CHECK-CURR-POS is a form which checks whether the inner regex of the 58repetition matches at CURR-POS." 59 `(if maximum 60 (lambda (start-pos) 61 (declare (fixnum start-pos maximum)) 62 ;; because we know LEN we know in advance where to stop at the 63 ;; latest; we also take into consideration MIN-REST, i.e. the 64 ;; minimal length of the part behind the repetition 65 (let ((target-end-pos (min (1+ (- *end-pos* len min-rest)) 66 ;; don't go further than MAXIMUM 67 ;; repetitions, of course 68 (+ start-pos 69 (the fixnum (* len maximum))))) 70 (curr-pos start-pos)) 71 (declare (fixnum target-end-pos curr-pos)) 72 (block greedy-constant-length-matcher 73 ;; we use an ugly TAGBODY construct because this might be a 74 ;; tight loop and this version is a bit faster than our LOOP 75 ;; version (at least in CMUCL) 76 (tagbody 77 forward-loop 78 ;; first go forward as far as possible, i.e. while 79 ;; the inner regex matches 80 (when (>= curr-pos target-end-pos) 81 (go backward-loop)) 82 (when ,check-curr-pos 83 (incf curr-pos len) 84 (go forward-loop)) 85 backward-loop 86 ;; now go back LEN steps each until we're able to match 87 ;; the rest of the regex 88 (when (< curr-pos start-pos) 89 (return-from greedy-constant-length-matcher nil)) 90 (let ((result (funcall next-fn curr-pos))) 91 (when result 92 (return-from greedy-constant-length-matcher result))) 93 (decf curr-pos len) 94 (go backward-loop))))) 95 ;; basically the same code; it's just a bit easier because we're 96 ;; not bounded by MAXIMUM 97 (lambda (start-pos) 98 (declare (fixnum start-pos)) 99 (let ((target-end-pos (1+ (- *end-pos* len min-rest))) 100 (curr-pos start-pos)) 101 (declare (fixnum target-end-pos curr-pos)) 102 (block greedy-constant-length-matcher 103 (tagbody 104 forward-loop 105 (when (>= curr-pos target-end-pos) 106 (go backward-loop)) 107 (when ,check-curr-pos 108 (incf curr-pos len) 109 (go forward-loop)) 110 backward-loop 111 (when (< curr-pos start-pos) 112 (return-from greedy-constant-length-matcher nil)) 113 (let ((result (funcall next-fn curr-pos))) 114 (when result 115 (return-from greedy-constant-length-matcher result))) 116 (decf curr-pos len) 117 (go backward-loop))))))) 118 119(defun create-greedy-everything-matcher (maximum min-rest next-fn) 120 "Creates a closure which just matches as far ahead as possible, 121i.e. a closure for a dot in single-line mode." 122 (declare #.*standard-optimize-settings*) 123 (declare (fixnum min-rest) (function next-fn)) 124 (if maximum 125 (lambda (start-pos) 126 (declare (fixnum start-pos maximum)) 127 ;; because we know LEN we know in advance where to stop at the 128 ;; latest; we also take into consideration MIN-REST, i.e. the 129 ;; minimal length of the part behind the repetition 130 (let ((target-end-pos (min (+ start-pos maximum) 131 (- *end-pos* min-rest)))) 132 (declare (fixnum target-end-pos)) 133 ;; start from the highest possible position and go backward 134 ;; until we're able to match the rest of the regex 135 (loop for curr-pos of-type fixnum from target-end-pos downto start-pos 136 thereis (funcall next-fn curr-pos)))) 137 ;; basically the same code; it's just a bit easier because we're 138 ;; not bounded by MAXIMUM 139 (lambda (start-pos) 140 (declare (fixnum start-pos)) 141 (let ((target-end-pos (- *end-pos* min-rest))) 142 (declare (fixnum target-end-pos)) 143 (loop for curr-pos of-type fixnum from target-end-pos downto start-pos 144 thereis (funcall next-fn curr-pos)))))) 145 146(defgeneric create-greedy-constant-length-matcher (repetition next-fn) 147 (declare #.*standard-optimize-settings*) 148 (:documentation "Creates a closure which tries to match REPETITION. 149It is assumed that REPETITION is greedy and the minimal number of 150repetitions is zero. It is furthermore assumed that the inner regex 151of REPETITION is of fixed length and doesn't contain registers.")) 152 153(defmethod create-greedy-constant-length-matcher ((repetition repetition) 154 next-fn) 155 (declare #.*standard-optimize-settings*) 156 (let ((len (len repetition)) 157 (maximum (maximum repetition)) 158 (regex (regex repetition)) 159 (min-rest (min-rest repetition))) 160 (declare (fixnum len min-rest) 161 (function next-fn)) 162 (cond ((zerop len) 163 ;; inner regex has zero-length, so we can discard it 164 ;; completely 165 next-fn) 166 (t 167 ;; now first try to optimize for a couple of common cases 168 (typecase regex 169 (str 170 (let ((str (str regex))) 171 (if (= 1 len) 172 ;; a single character 173 (let ((chr (schar str 0))) 174 (if (case-insensitive-p regex) 175 (greedy-constant-length-closure 176 (char-equal chr (schar *string* curr-pos))) 177 (greedy-constant-length-closure 178 (char= chr (schar *string* curr-pos))))) 179 ;; a string 180 (if (case-insensitive-p regex) 181 (greedy-constant-length-closure 182 (*string*-equal str curr-pos (+ curr-pos len) 0 len)) 183 (greedy-constant-length-closure 184 (*string*= str curr-pos (+ curr-pos len) 0 len)))))) 185 (char-class 186 ;; a character class 187 (insert-char-class-tester (regex (schar *string* curr-pos)) 188 (greedy-constant-length-closure 189 (char-class-test)))) 190 (everything 191 ;; an EVERYTHING object, i.e. a dot 192 (if (single-line-p regex) 193 (create-greedy-everything-matcher maximum min-rest next-fn) 194 (greedy-constant-length-closure 195 (char/= #\Newline (schar *string* curr-pos))))) 196 (t 197 ;; the general case - we build an inner matcher which 198 ;; just checks for immediate success, i.e. NEXT-FN is 199 ;; #'IDENTITY 200 (let ((inner-matcher (create-matcher-aux regex #'identity))) 201 (declare (function inner-matcher)) 202 (greedy-constant-length-closure 203 (funcall inner-matcher curr-pos))))))))) 204 205(defgeneric create-greedy-no-zero-matcher (repetition next-fn) 206 (declare #.*standard-optimize-settings*) 207 (:documentation "Creates a closure which tries to match REPETITION. 208It is assumed that REPETITION is greedy and the minimal number of 209repetitions is zero. It is furthermore assumed that the inner regex 210of REPETITION can never match a zero-length string \(or instead the 211maximal number of repetitions is 1).")) 212 213(defmethod create-greedy-no-zero-matcher ((repetition repetition) next-fn) 214 (declare #.*standard-optimize-settings*) 215 (let ((maximum (maximum repetition)) 216 ;; REPEAT-MATCHER is part of the closure's environment but it 217 ;; can only be defined after GREEDY-AUX is defined 218 repeat-matcher) 219 (declare (function next-fn)) 220 (cond 221 ((eql maximum 1) 222 ;; this is essentially like the next case but with a known 223 ;; MAXIMUM of 1 we can get away without a counter; note that 224 ;; we always arrive here if CONVERT optimizes <regex>* to 225 ;; (?:<regex'>*<regex>)? 226 (setq repeat-matcher 227 (create-matcher-aux (regex repetition) next-fn)) 228 (lambda (start-pos) 229 (declare (function repeat-matcher)) 230 (or (funcall repeat-matcher start-pos) 231 (funcall next-fn start-pos)))) 232 (maximum 233 ;; we make a reservation for our slot in *REPEAT-COUNTERS* 234 ;; because we need to keep track whether we've reached MAXIMUM 235 ;; repetitions 236 (let ((rep-num (incf-after *rep-num*))) 237 (flet ((greedy-aux (start-pos) 238 (declare (fixnum start-pos maximum rep-num) 239 (function repeat-matcher)) 240 ;; the actual matcher which first tries to match the 241 ;; inner regex of REPETITION (if we haven't done so 242 ;; too often) and on failure calls NEXT-FN 243 (or (and (< (aref *repeat-counters* rep-num) maximum) 244 (incf (aref *repeat-counters* rep-num)) 245 ;; note that REPEAT-MATCHER will call 246 ;; GREEDY-AUX again recursively 247 (prog1 248 (funcall repeat-matcher start-pos) 249 (decf (aref *repeat-counters* rep-num)))) 250 (funcall next-fn start-pos)))) 251 ;; create a closure to match the inner regex and to 252 ;; implement backtracking via GREEDY-AUX 253 (setq repeat-matcher 254 (create-matcher-aux (regex repetition) #'greedy-aux)) 255 ;; the closure we return is just a thin wrapper around 256 ;; GREEDY-AUX to initialize the repetition counter 257 (lambda (start-pos) 258 (declare (fixnum start-pos)) 259 (setf (aref *repeat-counters* rep-num) 0) 260 (greedy-aux start-pos))))) 261 (t 262 ;; easier code because we're not bounded by MAXIMUM, but 263 ;; basically the same 264 (flet ((greedy-aux (start-pos) 265 (declare (fixnum start-pos) 266 (function repeat-matcher)) 267 (or (funcall repeat-matcher start-pos) 268 (funcall next-fn start-pos)))) 269 (setq repeat-matcher 270 (create-matcher-aux (regex repetition) #'greedy-aux)) 271 #'greedy-aux))))) 272 273(defgeneric create-greedy-matcher (repetition next-fn) 274 (declare #.*standard-optimize-settings*) 275 (:documentation "Creates a closure which tries to match REPETITION. 276It is assumed that REPETITION is greedy and the minimal number of 277repetitions is zero.")) 278 279(defmethod create-greedy-matcher ((repetition repetition) next-fn) 280 (declare #.*standard-optimize-settings*) 281 (let ((maximum (maximum repetition)) 282 ;; we make a reservation for our slot in *LAST-POS-STORES* because 283 ;; we have to watch out for endless loops as the inner regex might 284 ;; match zero-length strings 285 (zero-length-num (incf-after *zero-length-num*)) 286 ;; REPEAT-MATCHER is part of the closure's environment but it 287 ;; can only be defined after GREEDY-AUX is defined 288 repeat-matcher) 289 (declare (fixnum zero-length-num) 290 (function next-fn)) 291 (cond 292 (maximum 293 ;; we make a reservation for our slot in *REPEAT-COUNTERS* 294 ;; because we need to keep track whether we've reached MAXIMUM 295 ;; repetitions 296 (let ((rep-num (incf-after *rep-num*))) 297 (flet ((greedy-aux (start-pos) 298 ;; the actual matcher which first tries to match the 299 ;; inner regex of REPETITION (if we haven't done so 300 ;; too often) and on failure calls NEXT-FN 301 (declare (fixnum start-pos maximum rep-num) 302 (function repeat-matcher)) 303 (let ((old-last-pos 304 (svref *last-pos-stores* zero-length-num))) 305 (when (and old-last-pos 306 (= (the fixnum old-last-pos) start-pos)) 307 ;; stop immediately if we've been here before, 308 ;; i.e. if the last attempt matched a zero-length 309 ;; string 310 (return-from greedy-aux (funcall next-fn start-pos))) 311 ;; otherwise remember this position for the next 312 ;; repetition 313 (setf (svref *last-pos-stores* zero-length-num) start-pos) 314 (or (and (< (aref *repeat-counters* rep-num) maximum) 315 (incf (aref *repeat-counters* rep-num)) 316 ;; note that REPEAT-MATCHER will call 317 ;; GREEDY-AUX again recursively 318 (prog1 319 (funcall repeat-matcher start-pos) 320 (decf (aref *repeat-counters* rep-num)) 321 (setf (svref *last-pos-stores* zero-length-num) 322 old-last-pos))) 323 (funcall next-fn start-pos))))) 324 ;; create a closure to match the inner regex and to 325 ;; implement backtracking via GREEDY-AUX 326 (setq repeat-matcher 327 (create-matcher-aux (regex repetition) #'greedy-aux)) 328 ;; the closure we return is just a thin wrapper around 329 ;; GREEDY-AUX to initialize the repetition counter and our 330 ;; slot in *LAST-POS-STORES* 331 (lambda (start-pos) 332 (declare (fixnum start-pos)) 333 (setf (aref *repeat-counters* rep-num) 0 334 (svref *last-pos-stores* zero-length-num) nil) 335 (greedy-aux start-pos))))) 336 (t 337 ;; easier code because we're not bounded by MAXIMUM, but 338 ;; basically the same 339 (flet ((greedy-aux (start-pos) 340 (declare (fixnum start-pos) 341 (function repeat-matcher)) 342 (let ((old-last-pos 343 (svref *last-pos-stores* zero-length-num))) 344 (when (and old-last-pos 345 (= (the fixnum old-last-pos) start-pos)) 346 (return-from greedy-aux (funcall next-fn start-pos))) 347 (setf (svref *last-pos-stores* zero-length-num) start-pos) 348 (or (prog1 349 (funcall repeat-matcher start-pos) 350 (setf (svref *last-pos-stores* zero-length-num) old-last-pos)) 351 (funcall next-fn start-pos))))) 352 (setq repeat-matcher 353 (create-matcher-aux (regex repetition) #'greedy-aux)) 354 (lambda (start-pos) 355 (declare (fixnum start-pos)) 356 (setf (svref *last-pos-stores* zero-length-num) nil) 357 (greedy-aux start-pos))))))) 358 359;; code for non-greedy repetitions with minimum zero 360 361(defmacro non-greedy-constant-length-closure (check-curr-pos) 362 "This is the template for simple non-greedy repetitions \(where 363simple means that the minimum number of repetitions is zero, that the 364inner regex to be checked is of fixed length LEN, and that it doesn't 365contain registers, i.e. there's no need for backtracking). 366CHECK-CURR-POS is a form which checks whether the inner regex of the 367repetition matches at CURR-POS." 368 `(if maximum 369 (lambda (start-pos) 370 (declare (fixnum start-pos maximum)) 371 ;; because we know LEN we know in advance where to stop at the 372 ;; latest; we also take into consideration MIN-REST, i.e. the 373 ;; minimal length of the part behind the repetition 374 (let ((target-end-pos (min (1+ (- *end-pos* len min-rest)) 375 (+ start-pos 376 (the fixnum (* len maximum)))))) 377 ;; move forward by LEN and always try NEXT-FN first, then 378 ;; CHECK-CUR-POS 379 (loop for curr-pos of-type fixnum from start-pos 380 below target-end-pos 381 by len 382 thereis (funcall next-fn curr-pos) 383 while ,check-curr-pos 384 finally (return (funcall next-fn curr-pos))))) 385 ;; basically the same code; it's just a bit easier because we're 386 ;; not bounded by MAXIMUM 387 (lambda (start-pos) 388 (declare (fixnum start-pos)) 389 (let ((target-end-pos (1+ (- *end-pos* len min-rest)))) 390 (loop for curr-pos of-type fixnum from start-pos 391 below target-end-pos 392 by len 393 thereis (funcall next-fn curr-pos) 394 while ,check-curr-pos 395 finally (return (funcall next-fn curr-pos))))))) 396 397(defgeneric create-non-greedy-constant-length-matcher (repetition next-fn) 398 (declare #.*standard-optimize-settings*) 399 (:documentation "Creates a closure which tries to match REPETITION. 400It is assumed that REPETITION is non-greedy and the minimal number of 401repetitions is zero. It is furthermore assumed that the inner regex 402of REPETITION is of fixed length and doesn't contain registers.")) 403 404(defmethod create-non-greedy-constant-length-matcher ((repetition repetition) next-fn) 405 (declare #.*standard-optimize-settings*) 406 (let ((len (len repetition)) 407 (maximum (maximum repetition)) 408 (regex (regex repetition)) 409 (min-rest (min-rest repetition))) 410 (declare (fixnum len min-rest) 411 (function next-fn)) 412 (cond ((zerop len) 413 ;; inner regex has zero-length, so we can discard it 414 ;; completely 415 next-fn) 416 (t 417 ;; now first try to optimize for a couple of common cases 418 (typecase regex 419 (str 420 (let ((str (str regex))) 421 (if (= 1 len) 422 ;; a single character 423 (let ((chr (schar str 0))) 424 (if (case-insensitive-p regex) 425 (non-greedy-constant-length-closure 426 (char-equal chr (schar *string* curr-pos))) 427 (non-greedy-constant-length-closure 428 (char= chr (schar *string* curr-pos))))) 429 ;; a string 430 (if (case-insensitive-p regex) 431 (non-greedy-constant-length-closure 432 (*string*-equal str curr-pos (+ curr-pos len) 0 len)) 433 (non-greedy-constant-length-closure 434 (*string*= str curr-pos (+ curr-pos len) 0 len)))))) 435 (char-class 436 ;; a character class 437 (insert-char-class-tester (regex (schar *string* curr-pos)) 438 (non-greedy-constant-length-closure 439 (char-class-test)))) 440 (everything 441 (if (single-line-p regex) 442 ;; a dot which really can match everything; we rely 443 ;; on the compiler to optimize this away 444 (non-greedy-constant-length-closure 445 t) 446 ;; a dot which has to watch out for #\Newline 447 (non-greedy-constant-length-closure 448 (char/= #\Newline (schar *string* curr-pos))))) 449 (t 450 ;; the general case - we build an inner matcher which 451 ;; just checks for immediate success, i.e. NEXT-FN is 452 ;; #'IDENTITY 453 (let ((inner-matcher (create-matcher-aux regex #'identity))) 454 (declare (function inner-matcher)) 455 (non-greedy-constant-length-closure 456 (funcall inner-matcher curr-pos))))))))) 457 458(defgeneric create-non-greedy-no-zero-matcher (repetition next-fn) 459 (declare #.*standard-optimize-settings*) 460 (:documentation "Creates a closure which tries to match REPETITION. 461It is assumed that REPETITION is non-greedy and the minimal number of 462repetitions is zero. It is furthermore assumed that the inner regex 463of REPETITION can never match a zero-length string \(or instead the 464maximal number of repetitions is 1).")) 465 466(defmethod create-non-greedy-no-zero-matcher ((repetition repetition) next-fn) 467 (declare #.*standard-optimize-settings*) 468 (let ((maximum (maximum repetition)) 469 ;; REPEAT-MATCHER is part of the closure's environment but it 470 ;; can only be defined after NON-GREEDY-AUX is defined 471 repeat-matcher) 472 (declare (function next-fn)) 473 (cond 474 ((eql maximum 1) 475 ;; this is essentially like the next case but with a known 476 ;; MAXIMUM of 1 we can get away without a counter 477 (setq repeat-matcher 478 (create-matcher-aux (regex repetition) next-fn)) 479 (lambda (start-pos) 480 (declare (function repeat-matcher)) 481 (or (funcall next-fn start-pos) 482 (funcall repeat-matcher start-pos)))) 483 (maximum 484 ;; we make a reservation for our slot in *REPEAT-COUNTERS* 485 ;; because we need to keep track whether we've reached MAXIMUM 486 ;; repetitions 487 (let ((rep-num (incf-after *rep-num*))) 488 (flet ((non-greedy-aux (start-pos) 489 ;; the actual matcher which first calls NEXT-FN and 490 ;; on failure tries to match the inner regex of 491 ;; REPETITION (if we haven't done so too often) 492 (declare (fixnum start-pos maximum rep-num) 493 (function repeat-matcher)) 494 (or (funcall next-fn start-pos) 495 (and (< (aref *repeat-counters* rep-num) maximum) 496 (incf (aref *repeat-counters* rep-num)) 497 ;; note that REPEAT-MATCHER will call 498 ;; NON-GREEDY-AUX again recursively 499 (prog1 500 (funcall repeat-matcher start-pos) 501 (decf (aref *repeat-counters* rep-num))))))) 502 ;; create a closure to match the inner regex and to 503 ;; implement backtracking via NON-GREEDY-AUX 504 (setq repeat-matcher 505 (create-matcher-aux (regex repetition) #'non-greedy-aux)) 506 ;; the closure we return is just a thin wrapper around 507 ;; NON-GREEDY-AUX to initialize the repetition counter 508 (lambda (start-pos) 509 (declare (fixnum start-pos)) 510 (setf (aref *repeat-counters* rep-num) 0) 511 (non-greedy-aux start-pos))))) 512 (t 513 ;; easier code because we're not bounded by MAXIMUM, but 514 ;; basically the same 515 (flet ((non-greedy-aux (start-pos) 516 (declare (fixnum start-pos) 517 (function repeat-matcher)) 518 (or (funcall next-fn start-pos) 519 (funcall repeat-matcher start-pos)))) 520 (setq repeat-matcher 521 (create-matcher-aux (regex repetition) #'non-greedy-aux)) 522 #'non-greedy-aux))))) 523 524(defgeneric create-non-greedy-matcher (repetition next-fn) 525 (declare #.*standard-optimize-settings*) 526 (:documentation "Creates a closure which tries to match REPETITION. 527It is assumed that REPETITION is non-greedy and the minimal number of 528repetitions is zero.")) 529 530(defmethod create-non-greedy-matcher ((repetition repetition) next-fn) 531 (declare #.*standard-optimize-settings*) 532 ;; we make a reservation for our slot in *LAST-POS-STORES* because 533 ;; we have to watch out for endless loops as the inner regex might 534 ;; match zero-length strings 535 (let ((zero-length-num (incf-after *zero-length-num*)) 536 (maximum (maximum repetition)) 537 ;; REPEAT-MATCHER is part of the closure's environment but it 538 ;; can only be defined after NON-GREEDY-AUX is defined 539 repeat-matcher) 540 (declare (fixnum zero-length-num) 541 (function next-fn)) 542 (cond 543 (maximum 544 ;; we make a reservation for our slot in *REPEAT-COUNTERS* 545 ;; because we need to keep track whether we've reached MAXIMUM 546 ;; repetitions 547 (let ((rep-num (incf-after *rep-num*))) 548 (flet ((non-greedy-aux (start-pos) 549 ;; the actual matcher which first calls NEXT-FN and 550 ;; on failure tries to match the inner regex of 551 ;; REPETITION (if we haven't done so too often) 552 (declare (fixnum start-pos maximum rep-num) 553 (function repeat-matcher)) 554 (let ((old-last-pos 555 (svref *last-pos-stores* zero-length-num))) 556 (when (and old-last-pos 557 (= (the fixnum old-last-pos) start-pos)) 558 ;; stop immediately if we've been here before, 559 ;; i.e. if the last attempt matched a zero-length 560 ;; string 561 (return-from non-greedy-aux (funcall next-fn start-pos))) 562 ;; otherwise remember this position for the next 563 ;; repetition 564 (setf (svref *last-pos-stores* zero-length-num) start-pos) 565 (or (funcall next-fn start-pos) 566 (and (< (aref *repeat-counters* rep-num) maximum) 567 (incf (aref *repeat-counters* rep-num)) 568 ;; note that REPEAT-MATCHER will call 569 ;; NON-GREEDY-AUX again recursively 570 (prog1 571 (funcall repeat-matcher start-pos) 572 (decf (aref *repeat-counters* rep-num)) 573 (setf (svref *last-pos-stores* zero-length-num) 574 old-last-pos))))))) 575 ;; create a closure to match the inner regex and to 576 ;; implement backtracking via NON-GREEDY-AUX 577 (setq repeat-matcher 578 (create-matcher-aux (regex repetition) #'non-greedy-aux)) 579 ;; the closure we return is just a thin wrapper around 580 ;; NON-GREEDY-AUX to initialize the repetition counter and our 581 ;; slot in *LAST-POS-STORES* 582 (lambda (start-pos) 583 (declare (fixnum start-pos)) 584 (setf (aref *repeat-counters* rep-num) 0 585 (svref *last-pos-stores* zero-length-num) nil) 586 (non-greedy-aux start-pos))))) 587 (t 588 ;; easier code because we're not bounded by MAXIMUM, but 589 ;; basically the same 590 (flet ((non-greedy-aux (start-pos) 591 (declare (fixnum start-pos) 592 (function repeat-matcher)) 593 (let ((old-last-pos 594 (svref *last-pos-stores* zero-length-num))) 595 (when (and old-last-pos 596 (= (the fixnum old-last-pos) start-pos)) 597 (return-from non-greedy-aux (funcall next-fn start-pos))) 598 (setf (svref *last-pos-stores* zero-length-num) start-pos) 599 (or (funcall next-fn start-pos) 600 (prog1 601 (funcall repeat-matcher start-pos) 602 (setf (svref *last-pos-stores* zero-length-num) 603 old-last-pos)))))) 604 (setq repeat-matcher 605 (create-matcher-aux (regex repetition) #'non-greedy-aux)) 606 (lambda (start-pos) 607 (declare (fixnum start-pos)) 608 (setf (svref *last-pos-stores* zero-length-num) nil) 609 (non-greedy-aux start-pos))))))) 610 611;; code for constant repetitions, i.e. those with a fixed number of repetitions 612 613(defmacro constant-repetition-constant-length-closure (check-curr-pos) 614 "This is the template for simple constant repetitions (where simple 615means that the inner regex to be checked is of fixed length LEN, and 616that it doesn't contain registers, i.e. there's no need for 617backtracking) and where constant means that MINIMUM is equal to 618MAXIMUM. CHECK-CURR-POS is a form which checks whether the inner 619regex of the repetition matches at CURR-POS." 620 `(lambda (start-pos) 621 (declare (fixnum start-pos)) 622 (let ((target-end-pos (+ start-pos 623 (the fixnum (* len repetitions))))) 624 (declare (fixnum target-end-pos)) 625 ;; first check if we won't go beyond the end of the string 626 (and (>= *end-pos* target-end-pos) 627 ;; then loop through all repetitions step by step 628 (loop for curr-pos of-type fixnum from start-pos 629 below target-end-pos 630 by len 631 always ,check-curr-pos) 632 ;; finally call NEXT-FN if we made it that far 633 (funcall next-fn target-end-pos))))) 634 635(defgeneric create-constant-repetition-constant-length-matcher 636 (repetition next-fn) 637 (declare #.*standard-optimize-settings*) 638 (:documentation "Creates a closure which tries to match REPETITION. 639It is assumed that REPETITION has a constant number of repetitions. 640It is furthermore assumed that the inner regex of REPETITION is of 641fixed length and doesn't contain registers.")) 642 643(defmethod create-constant-repetition-constant-length-matcher 644 ((repetition repetition) next-fn) 645 (declare #.*standard-optimize-settings*) 646 (let ((len (len repetition)) 647 (repetitions (minimum repetition)) 648 (regex (regex repetition))) 649 (declare (fixnum len repetitions) 650 (function next-fn)) 651 (if (zerop len) 652 ;; if the length is zero it suffices to try once 653 (create-matcher-aux regex next-fn) 654 ;; otherwise try to optimize for a couple of common cases 655 (typecase regex 656 (str 657 (let ((str (str regex))) 658 (if (= 1 len) 659 ;; a single character 660 (let ((chr (schar str 0))) 661 (if (case-insensitive-p regex) 662 (constant-repetition-constant-length-closure 663 (and (char-equal chr (schar *string* curr-pos)) 664 (1+ curr-pos))) 665 (constant-repetition-constant-length-closure 666 (and (char= chr (schar *string* curr-pos)) 667 (1+ curr-pos))))) 668 ;; a string 669 (if (case-insensitive-p regex) 670 (constant-repetition-constant-length-closure 671 (let ((next-pos (+ curr-pos len))) 672 (declare (fixnum next-pos)) 673 (and (*string*-equal str curr-pos next-pos 0 len) 674 next-pos))) 675 (constant-repetition-constant-length-closure 676 (let ((next-pos (+ curr-pos len))) 677 (declare (fixnum next-pos)) 678 (and (*string*= str curr-pos next-pos 0 len) 679 next-pos))))))) 680 (char-class 681 ;; a character class 682 (insert-char-class-tester (regex (schar *string* curr-pos)) 683 (constant-repetition-constant-length-closure 684 (and (char-class-test) 685 (1+ curr-pos))))) 686 (everything 687 (if (single-line-p regex) 688 ;; a dot which really matches everything - we just have to 689 ;; advance the index into *STRING* accordingly and check 690 ;; if we didn't go past the end 691 (lambda (start-pos) 692 (declare (fixnum start-pos)) 693 (let ((next-pos (+ start-pos repetitions))) 694 (declare (fixnum next-pos)) 695 (and (<= next-pos *end-pos*) 696 (funcall next-fn next-pos)))) 697 ;; a dot which is not in single-line-mode - make sure we 698 ;; don't match #\Newline 699 (constant-repetition-constant-length-closure 700 (and (char/= #\Newline (schar *string* curr-pos)) 701 (1+ curr-pos))))) 702 (t 703 ;; the general case - we build an inner matcher which just 704 ;; checks for immediate success, i.e. NEXT-FN is #'IDENTITY 705 (let ((inner-matcher (create-matcher-aux regex #'identity))) 706 (declare (function inner-matcher)) 707 (constant-repetition-constant-length-closure 708 (funcall inner-matcher curr-pos)))))))) 709 710(defgeneric create-constant-repetition-matcher (repetition next-fn) 711 (declare #.*standard-optimize-settings*) 712 (:documentation "Creates a closure which tries to match REPETITION. 713It is assumed that REPETITION has a constant number of repetitions.")) 714 715(defmethod create-constant-repetition-matcher ((repetition repetition) next-fn) 716 (declare #.*standard-optimize-settings*) 717 (let ((repetitions (minimum repetition)) 718 ;; we make a reservation for our slot in *REPEAT-COUNTERS* 719 ;; because we need to keep track of the number of repetitions 720 (rep-num (incf-after *rep-num*)) 721 ;; REPEAT-MATCHER is part of the closure's environment but it 722 ;; can only be defined after NON-GREEDY-AUX is defined 723 repeat-matcher) 724 (declare (fixnum repetitions rep-num) 725 (function next-fn)) 726 (if (zerop (min-len repetition)) 727 ;; we make a reservation for our slot in *LAST-POS-STORES* 728 ;; because we have to watch out for needless loops as the inner 729 ;; regex might match zero-length strings 730 (let ((zero-length-num (incf-after *zero-length-num*))) 731 (declare (fixnum zero-length-num)) 732 (flet ((constant-aux (start-pos) 733 ;; the actual matcher which first calls NEXT-FN and 734 ;; on failure tries to match the inner regex of 735 ;; REPETITION (if we haven't done so too often) 736 (declare (fixnum start-pos) 737 (function repeat-matcher)) 738 (let ((old-last-pos 739 (svref *last-pos-stores* zero-length-num))) 740 (when (and old-last-pos 741 (= (the fixnum old-last-pos) start-pos)) 742 ;; if we've been here before we matched a 743 ;; zero-length string the last time, so we can 744 ;; just carry on because we will definitely be 745 ;; able to do this again often enough 746 (return-from constant-aux (funcall next-fn start-pos))) 747 ;; otherwise remember this position for the next 748 ;; repetition 749 (setf (svref *last-pos-stores* zero-length-num) start-pos) 750 (cond ((< (aref *repeat-counters* rep-num) repetitions) 751 ;; not enough repetitions yet, try it again 752 (incf (aref *repeat-counters* rep-num)) 753 ;; note that REPEAT-MATCHER will call 754 ;; CONSTANT-AUX again recursively 755 (prog1 756 (funcall repeat-matcher start-pos) 757 (decf (aref *repeat-counters* rep-num)) 758 (setf (svref *last-pos-stores* zero-length-num) 759 old-last-pos))) 760 (t 761 ;; we're done - call NEXT-FN 762 (funcall next-fn start-pos)))))) 763 ;; create a closure to match the inner regex and to 764 ;; implement backtracking via CONSTANT-AUX 765 (setq repeat-matcher 766 (create-matcher-aux (regex repetition) #'constant-aux)) 767 ;; the closure we return is just a thin wrapper around 768 ;; CONSTANT-AUX to initialize the repetition counter 769 (lambda (start-pos) 770 (declare (fixnum start-pos)) 771 (setf (aref *repeat-counters* rep-num) 0 772 (aref *last-pos-stores* zero-length-num) nil) 773 (constant-aux start-pos)))) 774 ;; easier code because we don't have to care about zero-length 775 ;; matches but basically the same 776 (flet ((constant-aux (start-pos) 777 (declare (fixnum start-pos) 778 (function repeat-matcher)) 779 (cond ((< (aref *repeat-counters* rep-num) repetitions) 780 (incf (aref *repeat-counters* rep-num)) 781 (prog1 782 (funcall repeat-matcher start-pos) 783 (decf (aref *repeat-counters* rep-num)))) 784 (t (funcall next-fn start-pos))))) 785 (setq repeat-matcher 786 (create-matcher-aux (regex repetition) #'constant-aux)) 787 (lambda (start-pos) 788 (declare (fixnum start-pos)) 789 (setf (aref *repeat-counters* rep-num) 0) 790 (constant-aux start-pos)))))) 791 792;; the actual CREATE-MATCHER-AUX method for REPETITION objects which 793;; utilizes all the functions and macros defined above 794 795(defmethod create-matcher-aux ((repetition repetition) next-fn) 796 (declare #.*standard-optimize-settings*) 797 (with-slots (minimum maximum len min-len greedyp contains-register-p) 798 repetition 799 (cond ((and maximum 800 (zerop maximum)) 801 ;; this should have been optimized away by CONVERT but just 802 ;; in case... 803 (error "Got REPETITION with MAXIMUM 0 \(should not happen)")) 804 ((and maximum 805 (= minimum maximum 1)) 806 ;; this should have been optimized away by CONVERT but just 807 ;; in case... 808 (error "Got REPETITION with MAXIMUM 1 and MINIMUM 1 \(should not happen)")) 809 ((and (eql minimum maximum) 810 len 811 (not contains-register-p)) 812 (create-constant-repetition-constant-length-matcher repetition next-fn)) 813 ((eql minimum maximum) 814 (create-constant-repetition-matcher repetition next-fn)) 815 ((and greedyp 816 len 817 (not contains-register-p)) 818 (create-greedy-constant-length-matcher repetition next-fn)) 819 ((and greedyp 820 (or (plusp min-len) 821 (eql maximum 1))) 822 (create-greedy-no-zero-matcher repetition next-fn)) 823 (greedyp 824 (create-greedy-matcher repetition next-fn)) 825 ((and len 826 (plusp len) 827 (not contains-register-p)) 828 (create-non-greedy-constant-length-matcher repetition next-fn)) 829 ((or (plusp min-len) 830 (eql maximum 1)) 831 (create-non-greedy-no-zero-matcher repetition next-fn)) 832 (t 833 (create-non-greedy-matcher repetition next-fn))))) 834