1;;; -*- Mode: Lisp; Package:USER; Base:10 -*- 2;;; 3;;; This code was written by: 4;;; 5;;; Lawrence E. Freil <lef@nscf.org> 6;;; National Science Center Foundation 7;;; Augusta, Georgia 30909 8;;; 9;;; If you modify this code, please comment your modifications 10;;; clearly and inform the author of any improvements so they 11;;; can be incorporated in future releases. 12;;; 13;;; nregex.lisp - My 4/8/92 attempt at a Lisp based regular expression 14;;; parser. 15;;; 16;;; This regular expression parser operates by taking a 17;;; regular expression and breaking it down into a list 18;;; consisting of lisp expressions and flags. The list 19;;; of lisp expressions is then taken in turned into a 20;;; lambda expression that can be later applied to a 21;;; string argument for parsing. 22 23;;; 24;;; First we create a copy of macros to help debug the beast 25 26(eval-when #-gcl(:compile-toplevel :load-toplevel :execute) 27 #+gcl(load compile eval) 28 (defpackage :maxima-nregex 29 (:use :common-lisp) 30 (:export 31 ;; Vars 32 #:*regex-debug* #:*regex-groups* #:*regex-groupings* 33 ;; Functions 34 #:regex-compile 35 )) 36 ) 37 38(in-package :maxima-nregex) 39 40(eval-when (:compile-toplevel :load-toplevel :execute) 41 (defvar *regex-debug* nil) ; Set to nil for no debugging code 42 43 (defmacro info (message &rest args) 44 (if *regex-debug* 45 `(format *trace-output* ,message ,@args))) 46 47;;; 48;;; Declare the global variables for storing the paren index list. 49;;; 50 (defvar *regex-groups* (make-array 10)) 51 (defvar *regex-groupings* 0) 52 ) 53 54;;; 55;;; Declare a simple interface for testing. You probably wouldn't want 56;;; to use this interface unless you were just calling this once. 57;;; 58(defun regex (expression string) 59 "Usage: (regex <expression> <string) 60 This function will call regex-compile on the expression and then apply 61 the string to the returned lambda list." 62 (let ((findit (cond ((stringp expression) 63 (regex-compile expression)) 64 ((listp expression) 65 expression))) 66 (result nil)) 67 (if (not (funcall (if (functionp findit) 68 findit 69 (eval `(function ,findit))) string)) 70 (return-from regex nil)) 71 (if (= *regex-groupings* 0) 72 (return-from regex t)) 73 (dotimes (i *regex-groupings*) 74 (push (funcall 'subseq 75 string 76 (car (aref *regex-groups* i)) 77 (cadr (aref *regex-groups* i))) 78 result)) 79 (reverse result))) 80;;; 81;;; Declare some simple macros to make the code more readable. 82;;; 83(defvar *regex-special-chars* "?*+.()[]\\${}") 84 85(defmacro add-exp (list) 86 "Add an item to the end of expression" 87 `(setf expression (append expression ,list))) 88 89;;; 90;;; Now for the main regex compiler routine. 91;;; 92(defun regex-compile (source &key (anchored nil) (case-sensitive t)) 93 "Usage: (regex-compile <expression> [ :anchored (t/nil) ] [ :case-sensitive (t/nil) ]) 94 This function take a regular expression (supplied as source) and 95 compiles this into a lambda list that a string argument can then 96 be applied to. It is also possible to compile this lambda list 97 for better performance or to save it as a named function for later 98 use" 99 (info "Now entering regex-compile with \"~A\"~%" source) 100 ;; 101 ;; This routine works in two parts. 102 ;; The first pass take the regular expression and produces a list of 103 ;; operators and lisp expressions for the entire regular expression. 104 ;; The second pass takes this list and produces the lambda expression. 105 (let ((expression '()) ; holder for expressions 106 (group 1) ; Current group index 107 (group-stack nil) ; Stack of current group endings 108 (result nil) ; holder for built expression. 109 (fast-first nil)) ; holder for quick unanchored scan 110 ;; 111 ;; If the expression was an empty string then it alway 112 ;; matches (so lets leave early) 113 ;; 114 (if (= (length source) 0) 115 (return-from regex-compile 116 '(lambda (&rest args) 117 (declare (ignore args)) 118 t))) 119 ;; 120 ;; If the first character is a caret then set the anchored 121 ;; flags and remove if from the expression string. 122 ;; 123 (cond ((eql (char source 0) #\^) 124 (setf source (subseq source 1)) 125 (setf anchored t))) 126 ;; 127 ;; If the first sequence is .* then also set the anchored flags. 128 ;; (This is purely for optimization, it will work without this). 129 ;; 130 (if (>= (length source) 2) 131 (if (string= source ".*" :start1 0 :end1 2) 132 (setf anchored t))) 133 ;; 134 ;; Also, If this is not an anchored search and the first character is 135 ;; a literal, then do a quick scan to see if it is even in the string. 136 ;; If not then we can issue a quick nil, 137 ;; otherwise we can start the search at the matching character to skip 138 ;; the checks of the non-matching characters anyway. 139 ;; 140 ;; If I really wanted to speed up this section of code it would be 141 ;; easy to recognize the case of a fairly long multi-character literal 142 ;; and generate a Boyer-Moore search for the entire literal. 143 ;; 144 ;; I generate the code to do a loop because on CMU Lisp this is about 145 ;; twice as fast a calling position. 146 ;; 147 (if (and (not anchored) 148 (not (position (char source 0) *regex-special-chars*)) 149 (not (and (> (length source) 1) 150 (position (char source 1) *regex-special-chars*)))) 151 (setf fast-first `((if (not (do ((i start (+ i 1))) 152 ((>= i length)) 153 (if (,(if case-sensitive 'eql 'char-equal) 154 (char string i) 155 ,(char source 0)) 156 (return (setf start i))))) 157 (return-from final-return nil))))) 158 ;; 159 ;; Generate the very first expression to save the starting index 160 ;; so that group 0 will be the entire string matched always 161 ;; 162 (add-exp '((setf (aref *regex-groups* 0) 163 (list index nil)))) 164 ;; 165 ;; Loop over each character in the regular expression building the 166 ;; expression list as we go. 167 ;; 168 (do ((eindex 0 (1+ eindex))) 169 ((= eindex (length source))) 170 (let ((current (char source eindex))) 171 (info "Now processing character ~A index = ~A~%" current eindex) 172 (case current 173 ((#\.) 174 ;; 175 ;; Generate code for a single wild character 176 ;; 177 (add-exp '((if (>= index length) 178 (return-from compare nil) 179 (incf index))))) 180 ((#\$) 181 ;; 182 ;; If this is the last character of the expression then 183 ;; anchor the end of the expression, otherwise let it slide 184 ;; as a standard character (even though it should be quoted). 185 ;; 186 (if (= eindex (1- (length source))) 187 (add-exp '((if (not (= index length)) 188 (return-from compare nil)))) 189 (add-exp '((if (not (and (< index length) 190 (eql (char string index) #\$))) 191 (return-from compare nil) 192 (incf index)))))) 193 ((#\*) 194 (add-exp '(astrisk))) 195 196 ((#\+) 197 (add-exp '(plus))) 198 199 ((#\?) 200 (add-exp '(question))) 201 202 ((#\() 203 ;; 204 ;; Start a grouping. 205 ;; 206 (incf group) 207 (push group group-stack) 208 (add-exp `((setf (aref *regex-groups* ,(1- group)) 209 (list index nil)))) 210 (add-exp `(,group))) 211 ((#\)) 212 ;; 213 ;; End a grouping 214 ;; 215 (let ((group (pop group-stack))) 216 (add-exp `((setf (cadr (aref *regex-groups* ,(1- group))) 217 index))) 218 (add-exp `(,(- group))))) 219 ((#\[) 220 ;; 221 ;; Start of a range operation. 222 ;; Generate a bit-vector that has one bit per possible character 223 ;; and then on each character or range, set the possible bits. 224 ;; 225 ;; If the first character is carat then invert the set. 226 (let* ((invert (eql (char source (1+ eindex)) #\^)) 227 (bitstring (make-array 256 :element-type 'bit 228 :initial-element 229 (if invert 1 0))) 230 (set-char (if invert 0 1))) 231 (if invert (incf eindex)) 232 (do ((x (1+ eindex) (1+ x))) 233 ((eql (char source x) #\]) (setf eindex x)) 234 (info "Building range with character ~A~%" (char source x)) 235 (cond ((and (eql (char source (1+ x)) #\-) 236 (not (eql (char source (+ x 2)) #\]))) 237 (if (>= (char-code (char source x)) 238 (char-code (char source (+ 2 x)))) 239 (error (intl:gettext "regex: ranges must be in ascending order; found: \"~A-~A\"") 240 (char source x) (char source (+ 2 x)))) 241 (do ((j (char-code (char source x)) (1+ j))) 242 ((> j (char-code (char source (+ 2 x)))) 243 (incf x 2)) 244 (info "Setting bit for char ~A code ~A~%" (code-char j) j) 245 (setf (sbit bitstring j) set-char))) 246 (t 247 (cond ((not (eql (char source x) #\])) 248 (let ((char (char source x))) 249 ;; 250 ;; If the character is quoted then find out what 251 ;; it should have been 252 ;; 253 (if (eql (char source x) #\\ ) 254 (let ((length)) 255 (multiple-value-setq (char length) 256 (regex-quoted (subseq source x) invert)) 257 (incf x length))) 258 (info "Setting bit for char ~A code ~A~%" char (char-code char)) 259 (if (not (vectorp char)) 260 (setf (sbit bitstring (char-code (char source x))) set-char) 261 (bit-ior bitstring char t)))))))) 262 (add-exp `((let ((range ,bitstring)) 263 (if (>= index length) 264 (return-from compare nil)) 265 (if (= 1 (sbit range (char-code (char string index)))) 266 (incf index) 267 (return-from compare nil))))))) 268 ((#\\ ) 269 ;; 270 ;; Intreprete the next character as a special, range, octal, group or 271 ;; just the character itself. 272 ;; 273 (let ((length) 274 (value)) 275 (multiple-value-setq (value length) 276 (regex-quoted (subseq source (1+ eindex)) nil)) 277 (cond ((listp value) 278 (add-exp value)) 279 ((characterp value) 280 (add-exp `((if (not (and (< index length) 281 (eql (char string index) 282 ,value))) 283 (return-from compare nil) 284 (incf index))))) 285 ((vectorp value) 286 (add-exp `((let ((range ,value)) 287 (if (>= index length) 288 (return-from compare nil)) 289 (if (= 1 (sbit range (char-code (char string index)))) 290 (incf index) 291 (return-from compare nil))))))) 292 (incf eindex length))) 293 (t 294 ;; 295 ;; We have a literal character. 296 ;; Scan to see how many we have and if it is more than one 297 ;; generate a string= verses as single eql. 298 ;; 299 (let* ((lit "") 300 (term (dotimes (litindex (- (length source) eindex) nil) 301 (let ((litchar (char source (+ eindex litindex)))) 302 (if (position litchar *regex-special-chars*) 303 (return litchar) 304 (progn 305 (info "Now adding ~A index ~A to lit~%" litchar 306 litindex) 307 (setf lit (concatenate 'string lit 308 (string litchar))))))))) 309 (if (= (length lit) 1) 310 (add-exp `((if (not (and (< index length) 311 (,(if case-sensitive 'eql 'char-equal) 312 (char string index) ,current))) 313 (return-from compare nil) 314 (incf index)))) 315 ;; 316 ;; If we have a multi-character literal then we must 317 ;; check to see if the next character (if there is one) 318 ;; is an astrisk or a plus. If so then we must not use this 319 ;; character in the big literal. 320 (progn 321 (if (or (eql term #\*) (eql term #\+)) 322 (setf lit (subseq lit 0 (1- (length lit))))) 323 (add-exp `((if (< length (+ index ,(length lit))) 324 (return-from compare nil)) 325 (if (not (,(if case-sensitive 'string= 'string-equal) 326 string ,lit :start1 index 327 :end1 (+ index ,(length lit)))) 328 (return-from compare nil) 329 (incf index ,(length lit))))))) 330 (incf eindex (1- (length lit)))))))) 331 ;; 332 ;; Plug end of list to return t. If we made it this far then 333 ;; We have matched! 334 (add-exp '((setf (cadr (aref *regex-groups* 0)) 335 index))) 336 (add-exp '((return-from final-return t))) 337 ;; 338;;; (print expression) 339 ;; 340 ;; Now take the expression list and turn it into a lambda expression 341 ;; replacing the special flags with lisp code. 342 ;; For example: A BEGIN needs to be replace by an expression that 343 ;; saves the current index, then evaluates everything till it gets to 344 ;; the END then save the new index if it didn't fail. 345 ;; On an ASTRISK I need to take the previous expression and wrap 346 ;; it in a do that will evaluate the expression till an error 347 ;; occurs and then another do that encompases the remainder of the 348 ;; regular expression and iterates decrementing the index by one 349 ;; of the matched expression sizes and then returns nil. After 350 ;; the last expression insert a form that does a return t so that 351 ;; if the entire nested sub-expression succeeds then the loop 352 ;; is broken manually. 353 ;; 354 (setf result (copy-tree nil)) 355 ;; 356 ;; Reversing the current expression makes building up the 357 ;; lambda list easier due to the nexting of expressions when 358 ;; and astrisk has been encountered. 359 (setf expression (reverse expression)) 360 (do ((elt 0 (1+ elt))) 361 ((>= elt (length expression))) 362 (let ((piece (nth elt expression))) 363 ;; 364 ;; Now check for PLUS, if so then ditto the expression and then let the 365 ;; ASTRISK below handle the rest. 366 ;; 367 (cond ((eql piece 'plus) 368 (cond ((listp (nth (1+ elt) expression)) 369 (setf result (append (list (nth (1+ elt) expression)) 370 result))) 371 ;; 372 ;; duplicate the entire group 373 ;; NOTE: This hasn't been implemented yet!! 374 (t 375 (format *standard-output* "`group' repeat hasn't been implemented yet~%"))))) 376 (cond ((listp piece) ;Just append the list 377 (setf result (append (list piece) result))) 378 ((eql piece 'question) ; Wrap it in a block that won't fail 379 (cond ((listp (nth (1+ elt) expression)) 380 (setf result 381 (append `((progn (block compare 382 ,(nth (1+ elt) 383 expression)) 384 t)) 385 result)) 386 (incf elt)) 387 ;; 388 ;; This is a QUESTION on an entire group which 389 ;; hasn't been implemented yet!!! 390 ;; 391 (t 392 (format *standard-output* "Optional groups not implemented yet~%")))) 393 ((or (eql piece 'astrisk) ; Do the wild thing! 394 (eql piece 'plus)) 395 (cond ((listp (nth (1+ elt) expression)) 396 ;; 397 ;; This is a single character wild card so 398 ;; do the simple form. 399 ;; 400 (setf result 401 `((let ((oindex index)) 402 (declare (fixnum oindex)) 403 (block compare 404 (do () 405 (nil) 406 ,(nth (1+ elt) expression))) 407 (do ((start index (1- start))) 408 ((< start oindex) nil) 409 (declare (fixnum start)) 410 (let ((index start)) 411 (declare (fixnum index)) 412 (block compare 413 ,@result)))))) 414 (incf elt)) 415 (t 416 ;; 417 ;; This is a subgroup repeated so I must build 418 ;; the loop using several values. 419 ;; 420 )) 421 ) 422 (t t)))) ; Just ignore everything else. 423 ;; 424 ;; Now wrap the result in a lambda list that can then be 425 ;; invoked or compiled, however the user wishes. 426 ;; 427 (if anchored 428 (setf result 429 `(lambda (string &key (start 0) (end (length string))) 430 (declare (string string) 431 (fixnum start end) 432 (ignorable start) 433 (optimize (speed 0) (compilation-speed 3))) 434 (setf *regex-groupings* ,group) 435 (block final-return 436 (block compare 437 (let ((index start) 438 (length end)) 439 (declare (fixnum index length)) 440 ,@result))))) 441 (setf result 442 `(lambda (string &key (start 0) (end (length string))) 443 (declare (string string) 444 (fixnum start end) 445 (ignorable start) 446 (optimize (speed 0) (compilation-speed 3))) 447 (setf *regex-groupings* ,group) 448 (block final-return 449 (let ((length end)) 450 (declare (fixnum length)) 451 ,@fast-first 452 (do ((marker start (1+ marker))) 453 ((> marker end) nil) 454 (declare (fixnum marker)) 455 (let ((index marker)) 456 (declare (fixnum index)) 457 (if (block compare 458 ,@result) 459 (return t))))))))))) 460 461 462;;; 463;;; Define a function that will take a quoted character and return 464;;; what the real character should be plus how much of the source 465;;; string was used. If the result is a set of characters, return an 466;;; array of bits indicating which characters should be set. If the 467;;; expression is one of the sub-group matches return a 468;;; list-expression that will provide the match. 469;;; 470(defun regex-quoted (char-string &optional (invert nil)) 471 "Usage: (regex-quoted <char-string> &optional invert) 472 Returns either the quoted character or a simple bit vector of bits set for 473 the matching values" 474 (let ((first (char char-string 0)) 475 (result (char char-string 0)) 476 (used-length 1)) 477 (cond ((eql first #\n) 478 (setf result #\newline)) 479 ((eql first #\c) 480 (setf result #\return)) 481 ((eql first #\t) 482 (setf result #\tab)) 483 ((eql first #\d) 484 (setf result #*0000000000000000000000000000000000000000000000001111111111000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000)) 485 ((eql first #\D) 486 (setf result #*1111111111111111111111111111111111111111111111110000000000111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111)) 487 ((eql first #\w) 488 (setf result #*0000000000000000000000000000000000000000000000001111111111000000011111111111111111111111111000010111111111111111111111111110000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000)) 489 ((eql first #\W) 490 (setf result #*1111111111111111111111111111111111111111111111110000000000111111100000000000000000000000000111101000000000000000000000000001111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111)) 491 ((eql first #\b) 492 (setf result #*0000000001000000000000000000000011000000000010100000000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000)) 493 ((eql first #\B) 494 (setf result #*1111111110111111111111111111111100111111111101011111111111011111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111)) 495 ((eql first #\s) 496 (setf result #*0000000001100000000000000000000010000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000)) 497 ((eql first #\S) 498 (setf result #*1111111110011111111111111111111101111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111)) 499 ((and (>= (char-code first) (char-code #\0)) 500 (<= (char-code first) (char-code #\9))) 501 (if (and (> (length char-string) 2) 502 (and (>= (char-code (char char-string 1)) (char-code #\0)) 503 (<= (char-code (char char-string 1)) (char-code #\9)) 504 (>= (char-code (char char-string 2)) (char-code #\0)) 505 (<= (char-code (char char-string 2)) (char-code #\9)))) 506 ;; 507 ;; It is a single character specified in octal 508 ;; 509 (progn 510 (setf result (do ((x 0 (1+ x)) 511 (return 0)) 512 ((= x 2) return) 513 (setf return (+ (* return 8) 514 (- (char-code (char char-string x)) 515 (char-code #\0)))))) 516 (setf used-length 3)) 517 ;; 518 ;; We have a group number replacement. 519 ;; 520 (let ((group (- (char-code first) (char-code #\0)))) 521 (setf result `((let ((nstring (subseq string (car (aref *regex-groups* ,group)) 522 (cadr (aref *regex-groups* ,group))))) 523 (if (< length (+ index (length nstring))) 524 (return-from compare nil)) 525 (if (not (string= string nstring 526 :start1 index 527 :end1 (+ index (length nstring)))) 528 (return-from compare nil) 529 (incf index (length nstring))))))))) 530 (t 531 (setf result first))) 532 (if (and (vectorp result) invert) 533 (bit-xor result #*1111111110011111111111111111111101111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111 t)) 534 (values result used-length))) 535 536