1 #include "sam.h"
2 
3 Rangeset	sel;
4 String		lastregexp;
5 /*
6  * Machine Information
7  */
8 typedef struct Inst Inst;
9 
10 struct Inst
11 {
12 	long	type;	/* < OPERATOR ==> literal, otherwise action */
13 	union {
14 		int rsid;
15 		int rsubid;
16 		int class;
17 		struct Inst *rother;
18 		struct Inst *rright;
19 	} r;
20 	union{
21 		struct Inst *lleft;
22 		struct Inst *lnext;
23 	} l;
24 };
25 #define	sid	r.rsid
26 #define	subid	r.rsubid
27 #define	rclass	r.class
28 #define	other	r.rother
29 #define	right	r.rright
30 #define	left	l.lleft
31 #define	next	l.lnext
32 
33 #define	NPROG	1024
34 Inst	program[NPROG];
35 Inst	*progp;
36 Inst	*startinst;	/* First inst. of program; might not be program[0] */
37 Inst	*bstartinst;	/* same for backwards machine */
38 
39 typedef struct Ilist Ilist;
40 struct Ilist
41 {
42 	Inst	*inst;		/* Instruction of the thread */
43 	Rangeset se;
44 	Posn	startp;		/* first char of match */
45 };
46 
47 #define	NLIST	127
48 
49 Ilist	*tl, *nl;	/* This list, next list */
50 Ilist	list[2][NLIST+1];	/* +1 for trailing null */
51 static	Rangeset sempty;
52 
53 /*
54  * Actions and Tokens
55  *
56  *	0x10000xx are operators, value == precedence
57  *	0x20000xx are tokens, i.e. operands for operators
58  */
59 #define	OPERATOR	0x1000000	/* Bit set in all operators */
60 #define	START		(OPERATOR+0)	/* Start, used for marker on stack */
61 #define	RBRA		(OPERATOR+1)	/* Right bracket, ) */
62 #define	LBRA		(OPERATOR+2)	/* Left bracket, ( */
63 #define	OR		(OPERATOR+3)	/* Alternation, | */
64 #define	CAT		(OPERATOR+4)	/* Concatentation, implicit operator */
65 #define	STAR		(OPERATOR+5)	/* Closure, * */
66 #define	PLUS		(OPERATOR+6)	/* a+ == aa* */
67 #define	QUEST		(OPERATOR+7)	/* a? == a|nothing, i.e. 0 or 1 a's */
68 #define	ANY		0x2000000	/* Any character but newline, . */
69 #define	NOP		(ANY+1)	/* No operation, internal use only */
70 #define	BOL		(ANY+2)	/* Beginning of line, ^ */
71 #define	EOL		(ANY+3)	/* End of line, $ */
72 #define	CCLASS		(ANY+4)	/* Character class, [] */
73 #define	NCCLASS		(ANY+5)	/* Negated character class, [^] */
74 #define	END		(ANY+0x77)	/* Terminate: match found */
75 
76 #define	ISATOR		OPERATOR
77 #define	ISAND		ANY
78 
79 #define	QUOTED	0x4000000	/* Bit set for \-ed lex characters */
80 
81 /*
82  * Parser Information
83  */
84 typedef struct Node Node;
85 struct Node
86 {
87 	Inst	*first;
88 	Inst	*last;
89 };
90 
91 #define	NSTACK	20
92 Node	andstack[NSTACK];
93 Node	*andp;
94 int	atorstack[NSTACK];
95 int	*atorp;
96 int	lastwasand;	/* Last token was operand */
97 int	cursubid;
98 int	subidstack[NSTACK];
99 int	*subidp;
100 int	backwards;
101 int	nbra;
102 Rune	*exprp;		/* pointer to next character in source expression */
103 #define	DCLASS	10	/* allocation increment */
104 int	nclass;		/* number active */
105 int	Nclass;		/* high water mark */
106 Rune	**class;
107 int	negateclass;
108 
109 int	addinst(Ilist *l, Inst *inst, Rangeset *sep);
110 void	newmatch(Rangeset*);
111 void	bnewmatch(Rangeset*);
112 void	pushand(Inst*, Inst*);
113 void	pushator(int);
114 Node	*popand(int);
115 int	popator(void);
116 void	startlex(Rune*);
117 int	lex(void);
118 void	operator(int);
119 void	operand(int);
120 void	evaluntil(int);
121 void	optimize(Inst*);
122 void	bldcclass(void);
123 
124 void
regerror(Err e)125 regerror(Err e)
126 {
127 	Strzero(&lastregexp);
128 	error(e);
129 }
130 
131 void
regerror_c(Err e,int c)132 regerror_c(Err e, int c)
133 {
134 	Strzero(&lastregexp);
135 	error_c(e, c);
136 }
137 
138 Inst *
newinst(int t)139 newinst(int t)
140 {
141 	if(progp >= &program[NPROG])
142 		regerror(Etoolong);
143 	progp->type = t;
144 	progp->left = 0;
145 	progp->right = 0;
146 	return progp++;
147 }
148 
149 Inst *
realcompile(Rune * s)150 realcompile(Rune *s)
151 {
152 	int token;
153 
154 	startlex(s);
155 	atorp = atorstack;
156 	andp = andstack;
157 	subidp = subidstack;
158 	cursubid = 0;
159 	lastwasand = FALSE;
160 	/* Start with a low priority operator to prime parser */
161 	pushator(START-1);
162 	while((token=lex()) != END){
163 		if((token&ISATOR) == OPERATOR)
164 			operator(token);
165 		else
166 			operand(token);
167 	}
168 	/* Close with a low priority operator */
169 	evaluntil(START);
170 	/* Force END */
171 	operand(END);
172 	evaluntil(START);
173 	if(nbra)
174 		regerror(Eleftpar);
175 	--andp;	/* points to first and only operand */
176 	return andp->first;
177 }
178 
179 void
compile(String * s)180 compile(String *s)
181 {
182 	int i;
183 	Inst *oprogp;
184 
185 	if(Strcmp(s, &lastregexp)==0)
186 		return;
187 	for(i=0; i<nclass; i++)
188 		free(class[i]);
189 	nclass = 0;
190 	progp = program;
191 	backwards = FALSE;
192 	startinst = realcompile(s->s);
193 	optimize(program);
194 	oprogp = progp;
195 	backwards = TRUE;
196 	bstartinst = realcompile(s->s);
197 	optimize(oprogp);
198 	Strduplstr(&lastregexp, s);
199 }
200 
201 void
operand(int t)202 operand(int t)
203 {
204 	Inst *i;
205 	if(lastwasand)
206 		operator(CAT);	/* catenate is implicit */
207 	i = newinst(t);
208 	if(t == CCLASS){
209 		if(negateclass)
210 			i->type = NCCLASS;	/* UGH */
211 		i->rclass = nclass-1;		/* UGH */
212 	}
213 	pushand(i, i);
214 	lastwasand = TRUE;
215 }
216 
217 void
operator(int t)218 operator(int t)
219 {
220 	if(t==RBRA && --nbra<0)
221 		regerror(Erightpar);
222 	if(t==LBRA){
223 /*
224  *		if(++cursubid >= NSUBEXP)
225  *			regerror(Esubexp);
226  */
227 		cursubid++;	/* silently ignored */
228 		nbra++;
229 		if(lastwasand)
230 			operator(CAT);
231 	}else
232 		evaluntil(t);
233 	if(t!=RBRA)
234 		pushator(t);
235 	lastwasand = FALSE;
236 	if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
237 		lastwasand = TRUE;	/* these look like operands */
238 }
239 
240 void
cant(char * s)241 cant(char *s)
242 {
243 	char buf[100];
244 
245 	sprint(buf, "regexp: can't happen: %s", s);
246 	panic(buf);
247 }
248 
249 void
pushand(Inst * f,Inst * l)250 pushand(Inst *f, Inst *l)
251 {
252 	if(andp >= &andstack[NSTACK])
253 		cant("operand stack overflow");
254 	andp->first = f;
255 	andp->last = l;
256 	andp++;
257 }
258 
259 void
pushator(int t)260 pushator(int t)
261 {
262 	if(atorp >= &atorstack[NSTACK])
263 		cant("operator stack overflow");
264 	*atorp++=t;
265 	if(cursubid >= NSUBEXP)
266 		*subidp++= -1;
267 	else
268 		*subidp++=cursubid;
269 }
270 
271 Node *
popand(int op)272 popand(int op)
273 {
274 	if(andp <= &andstack[0])
275 		if(op)
276 			regerror_c(Emissop, op);
277 		else
278 			regerror(Ebadregexp);
279 	return --andp;
280 }
281 
282 int
popator(void)283 popator(void)
284 {
285 	if(atorp <= &atorstack[0])
286 		cant("operator stack underflow");
287 	--subidp;
288 	return *--atorp;
289 }
290 
291 void
evaluntil(int pri)292 evaluntil(int pri)
293 {
294 	Node *op1, *op2, *t;
295 	Inst *inst1, *inst2;
296 
297 	while(pri==RBRA || atorp[-1]>=pri){
298 		switch(popator()){
299 		case LBRA:
300 			op1 = popand('(');
301 			inst2 = newinst(RBRA);
302 			inst2->subid = *subidp;
303 			op1->last->next = inst2;
304 			inst1 = newinst(LBRA);
305 			inst1->subid = *subidp;
306 			inst1->next = op1->first;
307 			pushand(inst1, inst2);
308 			return;		/* must have been RBRA */
309 		default:
310 			panic("unknown regexp operator");
311 			break;
312 		case OR:
313 			op2 = popand('|');
314 			op1 = popand('|');
315 			inst2 = newinst(NOP);
316 			op2->last->next = inst2;
317 			op1->last->next = inst2;
318 			inst1 = newinst(OR);
319 			inst1->right = op1->first;
320 			inst1->left = op2->first;
321 			pushand(inst1, inst2);
322 			break;
323 		case CAT:
324 			op2 = popand(0);
325 			op1 = popand(0);
326 			if(backwards && op2->first->type!=END)
327 				t = op1, op1 = op2, op2 = t;
328 			op1->last->next = op2->first;
329 			pushand(op1->first, op2->last);
330 			break;
331 		case STAR:
332 			op2 = popand('*');
333 			inst1 = newinst(OR);
334 			op2->last->next = inst1;
335 			inst1->right = op2->first;
336 			pushand(inst1, inst1);
337 			break;
338 		case PLUS:
339 			op2 = popand('+');
340 			inst1 = newinst(OR);
341 			op2->last->next = inst1;
342 			inst1->right = op2->first;
343 			pushand(op2->first, inst1);
344 			break;
345 		case QUEST:
346 			op2 = popand('?');
347 			inst1 = newinst(OR);
348 			inst2 = newinst(NOP);
349 			inst1->left = inst2;
350 			inst1->right = op2->first;
351 			op2->last->next = inst2;
352 			pushand(inst1, inst2);
353 			break;
354 		}
355 	}
356 }
357 
358 
359 void
optimize(Inst * start)360 optimize(Inst *start)
361 {
362 	Inst *inst, *target;
363 
364 	for(inst=start; inst->type!=END; inst++){
365 		target = inst->next;
366 		while(target->type == NOP)
367 			target = target->next;
368 		inst->next = target;
369 	}
370 }
371 
372 #ifdef	DEBUG
373 void
dumpstack(void)374 dumpstack(void){
375 	Node *stk;
376 	int *ip;
377 
378 	dprint("operators\n");
379 	for(ip = atorstack; ip<atorp; ip++)
380 		dprint("0%o\n", *ip);
381 	dprint("operands\n");
382 	for(stk = andstack; stk<andp; stk++)
383 		dprint("0%o\t0%o\n", stk->first->type, stk->last->type);
384 }
385 void
dump(void)386 dump(void){
387 	Inst *l;
388 
389 	l = program;
390 	do{
391 		dprint("%d:\t0%o\t%d\t%d\n", l-program, l->type,
392 			l->left-program, l->right-program);
393 	}while(l++->type);
394 }
395 #endif
396 
397 void
startlex(Rune * s)398 startlex(Rune *s)
399 {
400 	exprp = s;
401 	nbra = 0;
402 }
403 
404 
405 int
lex(void)406 lex(void){
407 	int c= *exprp++;
408 
409 	switch(c){
410 	case '\\':
411 		if(*exprp)
412 			if((c= *exprp++)=='n')
413 				c='\n';
414 		break;
415 	case 0:
416 		c = END;
417 		--exprp;	/* In case we come here again */
418 		break;
419 	case '*':
420 		c = STAR;
421 		break;
422 	case '?':
423 		c = QUEST;
424 		break;
425 	case '+':
426 		c = PLUS;
427 		break;
428 	case '|':
429 		c = OR;
430 		break;
431 	case '.':
432 		c = ANY;
433 		break;
434 	case '(':
435 		c = LBRA;
436 		break;
437 	case ')':
438 		c = RBRA;
439 		break;
440 	case '^':
441 		c = BOL;
442 		break;
443 	case '$':
444 		c = EOL;
445 		break;
446 	case '[':
447 		c = CCLASS;
448 		bldcclass();
449 		break;
450 	}
451 	return c;
452 }
453 
454 long
nextrec(void)455 nextrec(void){
456 	if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0))
457 		regerror(Ebadclass);
458 	if(exprp[0] == '\\'){
459 		exprp++;
460 		if(*exprp=='n'){
461 			exprp++;
462 			return '\n';
463 		}
464 		return *exprp++|QUOTED;
465 	}
466 	return *exprp++;
467 }
468 
469 void
bldcclass(void)470 bldcclass(void)
471 {
472 	long c1, c2, n, na;
473 	Rune *classp;
474 
475 	classp = emalloc(DCLASS*RUNESIZE);
476 	n = 0;
477 	na = DCLASS;
478 	/* we have already seen the '[' */
479 	if(*exprp == '^'){
480 		classp[n++] = '\n';	/* don't match newline in negate case */
481 		negateclass = TRUE;
482 		exprp++;
483 	}else
484 		negateclass = FALSE;
485 	while((c1 = nextrec()) != ']'){
486 		if(c1 == '-'){
487     Error:
488 			free(classp);
489 			regerror(Ebadclass);
490 		}
491 		if(n+4 >= na){		/* 3 runes plus NUL */
492 			na += DCLASS;
493 			classp = erealloc(classp, na*RUNESIZE);
494 		}
495 		if(*exprp == '-'){
496 			exprp++;	/* eat '-' */
497 			if((c2 = nextrec()) == ']')
498 				goto Error;
499 			classp[n+0] = Runemax;
500 			classp[n+1] = c1;
501 			classp[n+2] = c2;
502 			n += 3;
503 		}else
504 			classp[n++] = c1 & ~QUOTED;
505 	}
506 	classp[n] = 0;
507 	if(nclass == Nclass){
508 		Nclass += DCLASS;
509 		class = erealloc(class, Nclass*sizeof(Rune*));
510 	}
511 	class[nclass++] = classp;
512 }
513 
514 int
classmatch(int classno,int c,int negate)515 classmatch(int classno, int c, int negate)
516 {
517 	Rune *p;
518 
519 	p = class[classno];
520 	while(*p){
521 		if(*p == Runemax){
522 			if(p[1]<=c && c<=p[2])
523 				return !negate;
524 			p += 3;
525 		}else if(*p++ == c)
526 			return !negate;
527 	}
528 	return negate;
529 }
530 
531 /*
532  * Note optimization in addinst:
533  * 	*l must be pending when addinst called; if *l has been looked
534  *		at already, the optimization is a bug.
535  */
536 int
addinst(Ilist * l,Inst * inst,Rangeset * sep)537 addinst(Ilist *l, Inst *inst, Rangeset *sep)
538 {
539 	Ilist *p;
540 
541 	for(p = l; p->inst; p++){
542 		if(p->inst==inst){
543 			if((sep)->p[0].p1 < p->se.p[0].p1)
544 				p->se= *sep;	/* this would be bug */
545 			return 0;	/* It's already there */
546 		}
547 	}
548 	p->inst = inst;
549 	p->se= *sep;
550 	(p+1)->inst = 0;
551 	return 1;
552 }
553 
554 int
execute(File * f,Posn startp,Posn eof)555 execute(File *f, Posn startp, Posn eof)
556 {
557 	int flag = 0;
558 	Inst *inst;
559 	Ilist *tlp;
560 	Posn p = startp;
561 	int nnl = 0, ntl;
562 	int c;
563 	int wrapped = 0;
564 	int startchar = startinst->type<OPERATOR? startinst->type : 0;
565 
566 	list[0][0].inst = list[1][0].inst = 0;
567 	sel.p[0].p1 = -1;
568 	/* Execute machine once for each character */
569 	for(;;p++){
570 	doloop:
571 		c = filereadc(f, p);
572 		if(p>=eof || c<0){
573 			switch(wrapped++){
574 			case 0:		/* let loop run one more click */
575 			case 2:
576 				break;
577 			case 1:		/* expired; wrap to beginning */
578 				if(sel.p[0].p1>=0 || eof!=INFINITY)
579 					goto Return;
580 				list[0][0].inst = list[1][0].inst = 0;
581 				p = 0;
582 				goto doloop;
583 			default:
584 				goto Return;
585 			}
586 		}else if(((wrapped && p>=startp) || sel.p[0].p1>0) && nnl==0)
587 			break;
588 		/* fast check for first char */
589 		if(startchar && nnl==0 && c!=startchar)
590 			continue;
591 		tl = list[flag];
592 		nl = list[flag^=1];
593 		nl->inst = 0;
594 		ntl = nnl;
595 		nnl = 0;
596 		if(sel.p[0].p1<0 && (!wrapped || p<startp || startp==eof)){
597 			/* Add first instruction to this list */
598 			sempty.p[0].p1 = p;
599 			if(addinst(tl, startinst, &sempty))
600 			if(++ntl >= NLIST)
601 	Overflow:
602 				error(Eoverflow);
603 		}
604 		/* Execute machine until this list is empty */
605 		for(tlp = tl; inst = tlp->inst; tlp++){	/* assignment = */
606 	Switchstmt:
607 			switch(inst->type){
608 			default:	/* regular character */
609 				if(inst->type==c){
610 	Addinst:
611 					if(addinst(nl, inst->next, &tlp->se))
612 					if(++nnl >= NLIST)
613 						goto Overflow;
614 				}
615 				break;
616 			case LBRA:
617 				if(inst->subid>=0)
618 					tlp->se.p[inst->subid].p1 = p;
619 				inst = inst->next;
620 				goto Switchstmt;
621 			case RBRA:
622 				if(inst->subid>=0)
623 					tlp->se.p[inst->subid].p2 = p;
624 				inst = inst->next;
625 				goto Switchstmt;
626 			case ANY:
627 				if(c!='\n')
628 					goto Addinst;
629 				break;
630 			case BOL:
631 				if(p==0 || filereadc(f, p - 1)=='\n'){
632 	Step:
633 					inst = inst->next;
634 					goto Switchstmt;
635 				}
636 				break;
637 			case EOL:
638 				if(c == '\n')
639 					goto Step;
640 				break;
641 			case CCLASS:
642 				if(c>=0 && classmatch(inst->rclass, c, 0))
643 					goto Addinst;
644 				break;
645 			case NCCLASS:
646 				if(c>=0 && classmatch(inst->rclass, c, 1))
647 					goto Addinst;
648 				break;
649 			case OR:
650 				/* evaluate right choice later */
651 				if(addinst(tl, inst->right, &tlp->se))
652 				if(++ntl >= NLIST)
653 					goto Overflow;
654 				/* efficiency: advance and re-evaluate */
655 				inst = inst->left;
656 				goto Switchstmt;
657 			case END:	/* Match! */
658 				tlp->se.p[0].p2 = p;
659 				newmatch(&tlp->se);
660 				break;
661 			}
662 		}
663 	}
664     Return:
665 	return sel.p[0].p1>=0;
666 }
667 
668 void
newmatch(Rangeset * sp)669 newmatch(Rangeset *sp)
670 {
671 	int i;
672 
673 	if(sel.p[0].p1<0 || sp->p[0].p1<sel.p[0].p1 ||
674 	   (sp->p[0].p1==sel.p[0].p1 && sp->p[0].p2>sel.p[0].p2))
675 		for(i = 0; i<NSUBEXP; i++)
676 			sel.p[i] = sp->p[i];
677 }
678 
679 int
bexecute(File * f,Posn startp)680 bexecute(File *f, Posn startp)
681 {
682 	int flag = 0;
683 	Inst *inst;
684 	Ilist *tlp;
685 	Posn p = startp;
686 	int nnl = 0, ntl;
687 	int c;
688 	int wrapped = 0;
689 	int startchar = bstartinst->type<OPERATOR? bstartinst->type : 0;
690 
691 	list[0][0].inst = list[1][0].inst = 0;
692 	sel.p[0].p1= -1;
693 	/* Execute machine once for each character, including terminal NUL */
694 	for(;;--p){
695 	doloop:
696 		if((c = filereadc(f, p - 1))==-1){
697 			switch(wrapped++){
698 			case 0:		/* let loop run one more click */
699 			case 2:
700 				break;
701 			case 1:		/* expired; wrap to end */
702 				if(sel.p[0].p1>=0)
703 			case 3:
704 					goto Return;
705 				list[0][0].inst = list[1][0].inst = 0;
706 				p = f->b.nc;
707 				goto doloop;
708 			default:
709 				goto Return;
710 			}
711 		}else if(((wrapped && p<=startp) || sel.p[0].p1>0) && nnl==0)
712 			break;
713 		/* fast check for first char */
714 		if(startchar && nnl==0 && c!=startchar)
715 			continue;
716 		tl = list[flag];
717 		nl = list[flag^=1];
718 		nl->inst = 0;
719 		ntl = nnl;
720 		nnl = 0;
721 		if(sel.p[0].p1<0 && (!wrapped || p>startp)){
722 			/* Add first instruction to this list */
723 			/* the minus is so the optimizations in addinst work */
724 			sempty.p[0].p1 = -p;
725 			if(addinst(tl, bstartinst, &sempty))
726 			if(++ntl >= NLIST)
727 	Overflow:
728 				error(Eoverflow);
729 		}
730 		/* Execute machine until this list is empty */
731 		for(tlp = tl; inst = tlp->inst; tlp++){	/* assignment = */
732 	Switchstmt:
733 			switch(inst->type){
734 			default:	/* regular character */
735 				if(inst->type == c){
736 	Addinst:
737 					if(addinst(nl, inst->next, &tlp->se))
738 					if(++nnl >= NLIST)
739 						goto Overflow;
740 				}
741 				break;
742 			case LBRA:
743 				if(inst->subid>=0)
744 					tlp->se.p[inst->subid].p1 = p;
745 				inst = inst->next;
746 				goto Switchstmt;
747 			case RBRA:
748 				if(inst->subid >= 0)
749 					tlp->se.p[inst->subid].p2 = p;
750 				inst = inst->next;
751 				goto Switchstmt;
752 			case ANY:
753 				if(c != '\n')
754 					goto Addinst;
755 				break;
756 			case BOL:
757 				if(c=='\n' || p==0){
758 	Step:
759 					inst = inst->next;
760 					goto Switchstmt;
761 				}
762 				break;
763 			case EOL:
764 				if(p==f->b.nc || filereadc(f, p)=='\n')
765 					goto Step;
766 				break;
767 			case CCLASS:
768 				if(c>=0 && classmatch(inst->rclass, c, 0))
769 					goto Addinst;
770 				break;
771 			case NCCLASS:
772 				if(c>=0 && classmatch(inst->rclass, c, 1))
773 					goto Addinst;
774 				break;
775 			case OR:
776 				/* evaluate right choice later */
777 				if(addinst(tlp, inst->right, &tlp->se))
778 				if(++ntl >= NLIST)
779 					goto Overflow;
780 				/* efficiency: advance and re-evaluate */
781 				inst = inst->left;
782 				goto Switchstmt;
783 			case END:	/* Match! */
784 				tlp->se.p[0].p1 = -tlp->se.p[0].p1; /* minus sign */
785 				tlp->se.p[0].p2 = p;
786 				bnewmatch(&tlp->se);
787 				break;
788 			}
789 		}
790 	}
791     Return:
792 	return sel.p[0].p1>=0;
793 }
794 
795 void
bnewmatch(Rangeset * sp)796 bnewmatch(Rangeset *sp)
797 {
798         int  i;
799         if(sel.p[0].p1<0 || sp->p[0].p1>sel.p[0].p2 || (sp->p[0].p1==sel.p[0].p2 && sp->p[0].p2<sel.p[0].p1))
800                 for(i = 0; i<NSUBEXP; i++){       /* note the reversal; p1<=p2 */
801                         sel.p[i].p1 = sp->p[i].p2;
802                         sel.p[i].p2 = sp->p[i].p1;
803                 }
804 }
805