1;;; -*- Mode: Lisp; Package:USER; Base:10 -*-
2;;;
3;;; This code was written by:
4;;;
5;;;    Lawrence E. Freil <lef@nscf.org>
6;;;    National Science Center Foundation
7;;;    Augusta, Georgia 30909
8;;;
9;;; If you modify this code, please comment your modifications
10;;; clearly and inform the author of any improvements so they
11;;; can be incorporated in future releases.
12;;;
13;;; nregex.lisp - My 4/8/92 attempt at a Lisp based regular expression
14;;;               parser.
15;;;
16;;;               This regular expression parser operates by taking a
17;;;               regular expression and breaking it down into a list
18;;;               consisting of lisp expressions and flags.  The list
19;;;               of lisp expressions is then taken in turned into a
20;;;               lambda expression that can be later applied to a
21;;;               string argument for parsing.
22
23;;;
24;;; First we create a copy of macros to help debug the beast
25
26(eval-when #-gcl(:compile-toplevel :load-toplevel :execute)
27	   #+gcl(load compile eval)
28	   (defpackage :maxima-nregex
29	     (:use :common-lisp)
30	     (:export
31	      ;; Vars
32	      #:*regex-debug* #:*regex-groups* #:*regex-groupings*
33	      ;; Functions
34	      #:regex-compile
35	      ))
36	   )
37
38(in-package :maxima-nregex)
39
40(eval-when (:compile-toplevel :load-toplevel :execute)
41  (defvar *regex-debug* nil)	    ; Set to nil for no debugging code
42
43  (defmacro info (message &rest args)
44    (if *regex-debug*
45	`(format *trace-output* ,message ,@args)))
46
47;;;
48;;; Declare the global variables for storing the paren index list.
49;;;
50  (defvar *regex-groups* (make-array 10))
51  (defvar *regex-groupings* 0)
52  )
53
54;;;
55;;; Declare a simple interface for testing.  You probably wouldn't want
56;;; to use this interface unless you were just calling this once.
57;;;
58(defun regex (expression string)
59  "Usage: (regex <expression> <string)
60   This function will call regex-compile on the expression and then apply
61   the string to the returned lambda list."
62  (let ((findit (cond ((stringp expression)
63		       (regex-compile expression))
64		      ((listp expression)
65		       expression)))
66	(result nil))
67    (if (not (funcall (if (functionp findit)
68			  findit
69			  (eval `(function ,findit))) string))
70	(return-from regex nil))
71    (if (= *regex-groupings* 0)
72	(return-from regex t))
73    (dotimes (i *regex-groupings*)
74      (push (funcall 'subseq
75		     string
76		     (car (aref *regex-groups* i))
77		     (cadr (aref *regex-groups* i)))
78	    result))
79    (reverse result)))
80;;;
81;;; Declare some simple macros to make the code more readable.
82;;;
83(defvar *regex-special-chars* "?*+.()[]\\${}")
84
85(defmacro add-exp (list)
86  "Add an item to the end of expression"
87  `(setf expression (append expression ,list)))
88
89;;;
90;;; Now for the main regex compiler routine.
91;;;
92(defun regex-compile (source &key (anchored nil) (case-sensitive t))
93  "Usage: (regex-compile <expression> [ :anchored (t/nil) ] [ :case-sensitive (t/nil) ])
94       This function take a regular expression (supplied as source) and
95       compiles this into a lambda list that a string argument can then
96       be applied to.  It is also possible to compile this lambda list
97       for better performance or to save it as a named function for later
98       use"
99  (info "Now entering regex-compile with \"~A\"~%" source)
100  ;;
101  ;; This routine works in two parts.
102  ;; The first pass take the regular expression and produces a list of
103  ;; operators and lisp expressions for the entire regular expression.
104  ;; The second pass takes this list and produces the lambda expression.
105  (let ((expression '())		; holder for expressions
106	(group 1)			; Current group index
107	(group-stack nil)	      ; Stack of current group endings
108	(result nil)			; holder for built expression.
109	(fast-first nil))	    ; holder for quick unanchored scan
110    ;;
111    ;; If the expression was an empty string then it alway
112    ;; matches (so lets leave early)
113    ;;
114    (if (= (length source) 0)
115	(return-from regex-compile
116	  '(lambda (&rest args)
117	    (declare (ignore args))
118	    t)))
119    ;;
120    ;; If the first character is a caret then set the anchored
121    ;; flags and remove if from the expression string.
122    ;;
123    (cond ((eql (char source 0) #\^)
124	   (setf source (subseq source 1))
125	   (setf anchored t)))
126    ;;
127    ;; If the first sequence is .* then also set the anchored flags.
128    ;; (This is purely for optimization, it will work without this).
129    ;;
130    (if (>= (length source) 2)
131	(if (string= source ".*" :start1 0 :end1 2)
132	    (setf anchored t)))
133    ;;
134    ;; Also, If this is not an anchored search and the first character is
135    ;; a literal, then do a quick scan to see if it is even in the string.
136    ;; If not then we can issue a quick nil,
137    ;; otherwise we can start the search at the matching character to skip
138    ;; the checks of the non-matching characters anyway.
139    ;;
140    ;; If I really wanted to speed up this section of code it would be
141    ;; easy to recognize the case of a fairly long multi-character literal
142    ;; and generate a Boyer-Moore search for the entire literal.
143    ;;
144    ;; I generate the code to do a loop because on CMU Lisp this is about
145    ;; twice as fast a calling position.
146    ;;
147    (if (and (not anchored)
148	     (not (position (char source 0) *regex-special-chars*))
149	     (not (and (> (length source) 1)
150		       (position (char source 1) *regex-special-chars*))))
151	(setf fast-first `((if (not (do ((i start (+ i 1)))
152					((>= i length))
153				      (if (,(if case-sensitive 'eql 'char-equal)
154					    (char string i)
155					    ,(char source 0))
156					  (return (setf start i)))))
157			       (return-from final-return nil)))))
158    ;;
159    ;; Generate the very first expression to save the starting index
160    ;; so that group 0 will be the entire string matched always
161    ;;
162    (add-exp '((setf (aref *regex-groups* 0)
163		(list index nil))))
164    ;;
165    ;; Loop over each character in the regular expression building the
166    ;; expression list as we go.
167    ;;
168    (do ((eindex 0 (1+ eindex)))
169	((= eindex (length source)))
170      (let ((current (char source eindex)))
171	(info "Now processing character ~A index = ~A~%" current eindex)
172	(case current
173	  ((#\.)
174	   ;;
175	   ;; Generate code for a single wild character
176	   ;;
177	   (add-exp '((if (>= index length)
178			  (return-from compare nil)
179			  (incf index)))))
180	  ((#\$)
181	   ;;
182	   ;; If this is the last character of the expression then
183	   ;; anchor the end of the expression, otherwise let it slide
184	   ;; as a standard character (even though it should be quoted).
185	   ;;
186	   (if (= eindex (1- (length source)))
187	       (add-exp '((if (not (= index length))
188			      (return-from compare nil))))
189	       (add-exp '((if (not (and (< index length)
190					(eql (char string index) #\$)))
191			      (return-from compare nil)
192			      (incf index))))))
193	  ((#\*)
194	   (add-exp '(astrisk)))
195
196	  ((#\+)
197	   (add-exp '(plus)))
198
199	  ((#\?)
200	   (add-exp '(question)))
201
202	  ((#\()
203	   ;;
204	   ;; Start a grouping.
205	   ;;
206	   (incf group)
207	   (push group group-stack)
208	   (add-exp `((setf (aref *regex-groups* ,(1- group))
209		       (list index nil))))
210	   (add-exp `(,group)))
211	  ((#\))
212	   ;;
213	   ;; End a grouping
214	   ;;
215	   (let ((group (pop group-stack)))
216	     (add-exp `((setf (cadr (aref *regex-groups* ,(1- group)))
217			 index)))
218	     (add-exp `(,(- group)))))
219	  ((#\[)
220	   ;;
221	   ;; Start of a range operation.
222	   ;; Generate a bit-vector that has one bit per possible character
223	   ;; and then on each character or range, set the possible bits.
224	   ;;
225	   ;; If the first character is carat then invert the set.
226	   (let* ((invert (eql (char source (1+ eindex)) #\^))
227		  (bitstring (make-array 256 :element-type 'bit
228					 :initial-element
229					 (if invert 1 0)))
230		  (set-char (if invert 0 1)))
231	     (if invert (incf eindex))
232	     (do ((x (1+ eindex) (1+ x)))
233		 ((eql (char source x) #\]) (setf eindex x))
234	       (info "Building range with character ~A~%" (char source x))
235	       (cond ((and (eql (char source (1+ x)) #\-)
236			   (not (eql (char source (+ x 2)) #\])))
237		      (if (>= (char-code (char source x))
238			      (char-code (char source (+ 2 x))))
239			  (error (intl:gettext "regex: ranges must be in ascending order; found: \"~A-~A\"")
240				 (char source x) (char source (+ 2 x))))
241		      (do ((j (char-code (char source x)) (1+ j)))
242			  ((> j (char-code (char source (+ 2 x))))
243			   (incf x 2))
244			(info "Setting bit for char ~A code ~A~%" (code-char j) j)
245			(setf (sbit bitstring j) set-char)))
246		     (t
247		      (cond ((not (eql (char source x) #\]))
248			     (let ((char (char source x)))
249			       ;;
250			       ;; If the character is quoted then find out what
251			       ;; it should have been
252			       ;;
253			       (if (eql (char source x) #\\ )
254				   (let ((length))
255				     (multiple-value-setq (char length)
256				       (regex-quoted (subseq source x) invert))
257				     (incf x length)))
258			       (info "Setting bit for char ~A code ~A~%" char (char-code char))
259			       (if (not (vectorp char))
260				   (setf (sbit bitstring (char-code (char source x))) set-char)
261				   (bit-ior bitstring char t))))))))
262	     (add-exp `((let ((range ,bitstring))
263			  (if (>= index length)
264			      (return-from compare nil))
265			  (if (= 1 (sbit range (char-code (char string index))))
266			      (incf index)
267			      (return-from compare nil)))))))
268	  ((#\\ )
269	   ;;
270	   ;; Intreprete the next character as a special, range, octal, group or
271           ;; just the character itself.
272	   ;;
273	   (let ((length)
274		 (value))
275	     (multiple-value-setq (value length)
276	       (regex-quoted (subseq source (1+ eindex)) nil))
277	     (cond ((listp value)
278		    (add-exp value))
279		   ((characterp value)
280		    (add-exp `((if (not (and (< index length)
281					     (eql (char string index)
282						  ,value)))
283				   (return-from compare nil)
284				   (incf index)))))
285		   ((vectorp value)
286		    (add-exp `((let ((range ,value))
287				 (if (>= index length)
288				     (return-from compare nil))
289				 (if (= 1 (sbit range (char-code (char string index))))
290				     (incf index)
291				     (return-from compare nil)))))))
292	     (incf eindex length)))
293	  (t
294	   ;;
295	   ;; We have a literal character.
296	   ;; Scan to see how many we have and if it is more than one
297	   ;; generate a string= verses as single eql.
298	   ;;
299	   (let* ((lit "")
300		  (term (dotimes (litindex (- (length source) eindex) nil)
301			  (let ((litchar (char source (+ eindex litindex))))
302			    (if (position litchar *regex-special-chars*)
303				(return litchar)
304				(progn
305				  (info "Now adding ~A index ~A to lit~%" litchar
306					litindex)
307				  (setf lit (concatenate 'string lit
308							 (string litchar)))))))))
309	     (if (= (length lit) 1)
310		 (add-exp `((if (not (and (< index length)
311					  (,(if case-sensitive 'eql 'char-equal)
312					    (char string index) ,current)))
313				(return-from compare nil)
314				(incf index))))
315		 ;;
316		 ;; If we have a multi-character literal then we must
317		 ;; check to see if the next character (if there is one)
318		 ;; is an astrisk or a plus.  If so then we must not use this
319		 ;; character in the big literal.
320		 (progn
321		   (if (or (eql term #\*) (eql term #\+))
322		       (setf lit (subseq lit 0 (1- (length lit)))))
323		   (add-exp `((if (< length (+ index ,(length lit)))
324				  (return-from compare nil))
325			      (if (not (,(if case-sensitive 'string= 'string-equal)
326					 string ,lit :start1 index
327					 :end1 (+ index ,(length lit))))
328				  (return-from compare nil)
329				  (incf index ,(length lit)))))))
330	     (incf eindex (1- (length lit))))))))
331    ;;
332    ;; Plug end of list to return t.  If we made it this far then
333    ;; We have matched!
334    (add-exp '((setf (cadr (aref *regex-groups* 0))
335		index)))
336    (add-exp '((return-from final-return t)))
337    ;;
338;;;    (print expression)
339    ;;
340    ;; Now take the expression list and turn it into a lambda expression
341    ;; replacing the special flags with lisp code.
342    ;; For example:  A BEGIN needs to be replace by an expression that
343    ;; saves the current index, then evaluates everything till it gets to
344    ;; the END then save the new index if it didn't fail.
345    ;; On an ASTRISK I need to take the previous expression and wrap
346    ;; it in a do that will evaluate the expression till an error
347    ;; occurs and then another do that encompases the remainder of the
348    ;; regular expression and iterates decrementing the index by one
349    ;; of the matched expression sizes and then returns nil.  After
350    ;; the last expression insert a form that does a return t so that
351    ;; if the entire nested sub-expression succeeds then the loop
352    ;; is broken manually.
353    ;;
354    (setf result (copy-tree nil))
355    ;;
356    ;; Reversing the current expression makes building up the
357    ;; lambda list easier due to the nexting of expressions when
358    ;; and astrisk has been encountered.
359    (setf expression (reverse expression))
360    (do ((elt 0 (1+ elt)))
361	((>= elt (length expression)))
362      (let ((piece (nth elt expression)))
363	;;
364	;; Now check for PLUS, if so then ditto the expression and then let the
365	;; ASTRISK below handle the rest.
366	;;
367	(cond ((eql piece 'plus)
368	       (cond ((listp (nth (1+ elt) expression))
369		      (setf result (append (list (nth (1+ elt) expression))
370					   result)))
371		     ;;
372		     ;; duplicate the entire group
373		     ;; NOTE: This hasn't been implemented yet!!
374		     (t
375		      (format *standard-output* "`group' repeat hasn't been implemented yet~%")))))
376	(cond ((listp piece)		;Just append the list
377	       (setf result (append (list piece) result)))
378	      ((eql piece 'question) ; Wrap it in a block that won't fail
379	       (cond ((listp (nth (1+ elt) expression))
380		      (setf result
381			    (append `((progn (block compare
382					       ,(nth (1+ elt)
383						     expression))
384					     t))
385				    result))
386		      (incf elt))
387		     ;;
388		     ;; This is a QUESTION on an entire group which
389		     ;; hasn't been implemented yet!!!
390		     ;;
391		     (t
392		      (format *standard-output* "Optional groups not implemented yet~%"))))
393	      ((or (eql piece 'astrisk) ; Do the wild thing!
394		   (eql piece 'plus))
395	       (cond ((listp (nth (1+ elt) expression))
396		      ;;
397		      ;; This is a single character wild card so
398		      ;; do the simple form.
399		      ;;
400		      (setf result
401			    `((let ((oindex index))
402				(declare (fixnum oindex))
403				(block compare
404				  (do ()
405				      (nil)
406				    ,(nth (1+ elt) expression)))
407				(do ((start index (1- start)))
408				    ((< start oindex) nil)
409				  (declare (fixnum start))
410				  (let ((index start))
411				    (declare (fixnum index))
412				    (block compare
413				      ,@result))))))
414		      (incf elt))
415		     (t
416		      ;;
417		      ;; This is a subgroup repeated so I must build
418		      ;; the loop using several values.
419		      ;;
420		      ))
421	       )
422	      (t t))))			; Just ignore everything else.
423    ;;
424    ;; Now wrap the result in a lambda list that can then be
425    ;; invoked or compiled, however the user wishes.
426    ;;
427    (if anchored
428	(setf result
429	      `(lambda (string &key (start 0) (end (length string)))
430		(declare (string string)
431		 (fixnum start end)
432		 (ignorable start)
433		 (optimize (speed 0) (compilation-speed 3)))
434		(setf *regex-groupings* ,group)
435		(block final-return
436		  (block compare
437		    (let ((index start)
438			  (length end))
439		      (declare (fixnum index length))
440		      ,@result)))))
441	(setf result
442	      `(lambda (string &key (start 0) (end (length string)))
443		(declare (string string)
444		 (fixnum start end)
445		 (ignorable start)
446		 (optimize (speed 0) (compilation-speed 3)))
447		(setf *regex-groupings* ,group)
448		(block final-return
449		  (let ((length end))
450		    (declare (fixnum length))
451		    ,@fast-first
452		    (do ((marker start (1+ marker)))
453			((> marker end) nil)
454		      (declare (fixnum marker))
455		      (let ((index marker))
456			(declare (fixnum index))
457			(if (block compare
458			      ,@result)
459			    (return t)))))))))))
460
461
462;;;
463;;; Define a function that will take a quoted character and return
464;;; what the real character should be plus how much of the source
465;;; string was used.  If the result is a set of characters, return an
466;;; array of bits indicating which characters should be set.  If the
467;;; expression is one of the sub-group matches return a
468;;; list-expression that will provide the match.
469;;;
470(defun regex-quoted (char-string &optional (invert nil))
471  "Usage: (regex-quoted <char-string> &optional invert)
472       Returns either the quoted character or a simple bit vector of bits set for
473       the matching values"
474  (let ((first (char char-string 0))
475	(result (char char-string 0))
476	(used-length 1))
477    (cond ((eql first #\n)
478	   (setf result #\newline))
479	  ((eql first #\c)
480	   (setf result #\return))
481	  ((eql first #\t)
482	   (setf result #\tab))
483	  ((eql first #\d)
484	   (setf result #*0000000000000000000000000000000000000000000000001111111111000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000))
485	  ((eql first #\D)
486	   (setf result #*1111111111111111111111111111111111111111111111110000000000111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111))
487	  ((eql first #\w)
488	   (setf result #*0000000000000000000000000000000000000000000000001111111111000000011111111111111111111111111000010111111111111111111111111110000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000))
489	  ((eql first #\W)
490	   (setf result #*1111111111111111111111111111111111111111111111110000000000111111100000000000000000000000000111101000000000000000000000000001111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111))
491	  ((eql first #\b)
492	   (setf result #*0000000001000000000000000000000011000000000010100000000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000))
493	  ((eql first #\B)
494	   (setf result #*1111111110111111111111111111111100111111111101011111111111011111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111))
495	  ((eql first #\s)
496	   (setf result #*0000000001100000000000000000000010000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000))
497	  ((eql first #\S)
498	   (setf result #*1111111110011111111111111111111101111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111))
499	  ((and (>= (char-code first) (char-code #\0))
500		(<= (char-code first) (char-code #\9)))
501	   (if (and (> (length char-string) 2)
502		    (and (>= (char-code (char char-string 1)) (char-code #\0))
503			 (<= (char-code (char char-string 1)) (char-code #\9))
504			 (>= (char-code (char char-string 2)) (char-code #\0))
505			 (<= (char-code (char char-string 2)) (char-code #\9))))
506	       ;;
507	       ;; It is a single character specified in octal
508	       ;;
509	       (progn
510		 (setf result (do ((x 0 (1+ x))
511				   (return 0))
512				  ((= x 2) return)
513				(setf return (+ (* return 8)
514						(- (char-code (char char-string x))
515						   (char-code #\0))))))
516		 (setf used-length 3))
517	       ;;
518	       ;; We have a group number replacement.
519	       ;;
520	       (let ((group (- (char-code first) (char-code #\0))))
521		 (setf result `((let ((nstring (subseq string (car (aref *regex-groups* ,group))
522						       (cadr (aref *regex-groups* ,group)))))
523				  (if (< length (+ index (length nstring)))
524				      (return-from compare nil))
525				  (if (not (string= string nstring
526						    :start1 index
527						    :end1 (+ index (length nstring))))
528				      (return-from compare nil)
529				      (incf index (length nstring)))))))))
530	  (t
531	   (setf result first)))
532    (if (and (vectorp result) invert)
533	(bit-xor result #*1111111110011111111111111111111101111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111 t))
534    (values result used-length)))
535
536