1 /*
2 	System: Structured text retrieval tool sgrep.
3 	Module: eval.c
4 	Author: Pekka Kilpel�inen & Jani Jaakkola
5 	Description: Handles the evaluation of sgrep expressions, thus
6 		     implementing the actual semantics of sgrep language.
7 		     used through eval() function
8 	Version history: Original version February 1995 by JJ & PK
9 	Copyright: University of Helsinki, Dept. of Computer Science
10 		   Distributed under GNU General Public Lisence
11 		   See file COPYING for details
12 */
13 #include <stdio.h>
14 #include <stdlib.h>
15 #include <limits.h>
16 #include "defines.h"
17 
18 #if defined(ASSERT) && defined(OPTIMIZE_SORTS)
19 /* Define this if you want always test that nest optimization works */
20 #define ASSERT_NESTS
21 #endif
22 
23 #ifdef PROGRESS_REPORTS
24  /* We have progress reports compiled in */
25 #define SHOW_PROG do { \
26 	prog++; \
27 	if (progress_output && !(prog&1023)) report_progress(oper_name,prog_start,prog); \
28 } while (0)
29 #else
30  #define SHOW_PROG do {} while (0)
31 #endif
32 
33 /*
34  * Startup size of nest stack
35  */
36 #define DEFAULT_NEST_DEPTH 1024
37 
38 /*
39  * Startup size of inner queue
40  */
41 #define DEFAULT_INNER_QUEUE 1024
42 
43 struct GC_LIST *real_eval(struct TREE_NODE *root);
44 struct GC_LIST *or(struct GC_LIST *,struct GC_LIST *);
45 struct GC_LIST *nest_order(struct GC_LIST *,struct GC_LIST *,int);
46 struct GC_LIST *quote(struct GC_LIST *,struct GC_LIST *,int);
47 struct GC_LIST *in(struct GC_LIST *,struct GC_LIST *,int);
48 struct GC_LIST *containing(struct GC_LIST *,struct GC_LIST *,int);
49 struct GC_LIST *extracting(struct GC_LIST *,struct GC_LIST *);
50 struct GC_LIST *outer(struct GC_LIST *);
51 struct GC_LIST *inner(struct GC_LIST *);
52 struct GC_LIST *concat(struct GC_LIST *);
53 struct GC_LIST *join(struct GC_LIST *,int number);
54 struct GC_LIST *equal(struct GC_LIST *,struct GC_LIST *,int);
55 int free_tree_node(struct TREE_NODE *node);
56 
57 /*
58  * Recursively evaluates parse tree using operation functions
59  * root points the root node of parse tree
60  */
eval(struct TREE_NODE * root)61 struct GC_LIST *eval(struct TREE_NODE *root)
62 {
63 	static int depth=0;
64 #ifdef DEBUG
65 	int i;
66 #endif
67 	struct GC_LIST *a;
68 
69 	a=root->GC_list;
70 	depth++;
71 #ifdef DEBUG
72 	for(i=0;i<depth;i++) fputc(' ',stderr);
73 	fprintf(stderr,"Evaluating oper %s l_label=%d r_label=%d\n",
74 		give_oper_name(root->oper),root->label_left,root->label_right);
75 #endif
76 #ifdef ASSERT
77 	assert(root->oper!=INVALID);
78 #endif
79 	/* If this is a leaf node, we just use leafs gc list */
80 	if ( a==NULL && root->oper==PHRASE )
81 	{
82 #ifdef ASSERT
83 		assert(root->leaf->GC_list!=NULL);
84 #endif
85 		a=root->leaf->GC_list;
86 		/* OBSOLETE
87 			root->leaf->GC_list=NULL;
88 		*/
89 		a->refcount=root->refcount;
90 #ifdef DEBUG
91 		for(i=0;i<depth;i++) fputc(' ',stderr);
92 		fprintf(stderr,"Using phrase list %s\n",root->leaf->phrase->s);
93 #endif
94 	}
95 	/* If gc_list is still NULL, it means that it hasn't been evaluated yet */
96 	if ( a==NULL )
97 	{
98 #ifdef DEBUG
99 		for(i=0;i<depth;i++) fputc(' ',stderr);
100 		fprintf(stderr,"Calling real eval oper %s\n",
101 			give_oper_name(root->oper));
102 #endif
103 		a=real_eval(root);
104 		a->refcount=root->refcount;
105 		/* We free subtrees unneeded gclists */
106 		if (free_tree_node(root->left))
107 		{
108 #ifdef DEBUG
109 			for(i=0;i<depth;i++) putc(' ',stderr);
110 			fprintf(stderr,"label %d freed (left)\n",root->label_left);
111 #endif
112 		}
113 		if (free_tree_node(root->right))
114 		{
115 #ifdef DEBUG
116 			for(i=0;i<depth;i++) putc(' ',stderr);
117 			fprintf(stderr,"label %d freed (right)\n",root->label_right);
118 #endif
119 		}
120 	}
121 #ifdef DEBUG
122 	else
123 	{
124 		for(i=0;i<depth;i++) fputc(' ',stderr);
125 		fprintf(stderr,"Using already known list\n");
126 	}
127 #endif
128 
129 	/* Keeps track of longest used gc list */
130 	if (LIST_SIZE(a)>stats.longest_list)
131 		stats.longest_list=LIST_SIZE(a);
132 #ifdef ASSERT_NESTS
133 	/* We check that if list isn't marked as nested, it really isn't */
134 	if (!a->nested)
135 	{
136 		struct REGION reg1,reg2;
137 		struct GC_POINTER p;
138 
139 		start_region_search(a,&p);
140 		get_region(&p,&reg1);
141 		get_region(&p,&reg2);
142 		while (reg2.start!=-1)
143 		{
144 			assert(reg1.end<reg2.end);
145 			reg1=reg2;
146 			get_region(&p,&reg2);
147 		}
148 	}
149 #endif
150 
151 #ifdef PROGRESS_REPORTS
152 	if (progress_output)
153 	{
154 		/* After every operation, we clean the line */
155 		fprintf(stderr,"                                    \r");
156 	}
157 #endif
158 	root->GC_list=a;
159 
160 #ifdef DEBUG
161 	for(i=0;i<depth;i++) fputc(' ',stderr);
162 	fprintf(stderr,"eval done\n");
163 #endif
164 	depth--;
165 	return a;
166 }
167 
168 /*
169  * Handles the actual evaluation of some operation
170  */
real_eval(struct TREE_NODE * root)171 struct GC_LIST *real_eval(struct TREE_NODE *root)
172 {
173 	struct GC_LIST *a,*l,*r;
174 
175 	a=NULL;
176 #ifdef ASSERT
177 	assert(root->left!=NULL);
178 #endif
179 
180 	/* Let's evaluate left and right subtrees first */
181 	l=eval(root->left);
182 	/* if root->right==NULL this is a function, and it won't have
183 	   right sub tree */
184 	if (root->right==NULL) r=NULL;
185 	else r=eval(root->right);
186 
187 	/* Let's call the operation function */
188 	switch (root->oper) {
189 	case OR:
190 		a=or(l,r);
191 		break;
192 	case ORDERED:
193 	case L_ORDERED:
194 	case R_ORDERED:
195 	case LR_ORDERED:
196 		a=nest_order(l,r,root->oper);
197 		break;
198 	case QUOTE:
199 	case L_QUOTE:
200 	case R_QUOTE:
201 	case LR_QUOTE:
202 		a=quote(l,r,root->oper);
203 		break;
204 	case IN:
205 		a=in(l,r,FALSE);
206 		break;
207 	case NOT_IN:
208 		a=in(l,r,TRUE);
209 		break;
210 	case CONTAINING:
211 		a=containing(l,r,FALSE);
212 		break;
213 	case NOT_CONTAINING:
214 		a=containing(l,r,TRUE);
215 		break;
216 /* Start PK Febr 95 */
217 	case EQUAL:
218 		a=equal(l,r,FALSE);
219 		break;
220 	case NOT_EQUAL:
221 		a=equal(l,r,TRUE);
222 		break;
223 /* End PK Febr 95 */
224 	case OUTER:
225 		a=outer(l);
226 		break;
227 	case INNER:
228 		a=inner(l);
229 		break;
230 	case EXTRACTING:
231 		a=extracting(l,r);
232 		break;
233 	case CONCAT:
234 		a=concat(l);
235 		break;
236 	case JOIN:
237 		a=join(l,root->number);
238 		break;
239 	default:
240 		fprintf(stderr,"Unsupported operation in parse tree (%d)\n",
241 			root->oper);
242 		exit(3);
243 		break;
244 	}
245 
246 	return a;
247 }
248 
249 /*
250  * Decrements tree nodes reference counter, and frees nodes gc list if
251  * counter comes down to 0. Returns 1 if something was freed, 0
252  * otherwise
253  */
free_tree_node(struct TREE_NODE * node)254 int free_tree_node(struct TREE_NODE *node)
255 {
256 	if (node==NULL) return 0; /* This was a leaf or function node */
257 	if (node->GC_list!=NULL)
258 	{
259 		if (node->GC_list->refcount!=-1)
260 			node->GC_list->refcount--;
261 #ifdef ASSERT
262 		assert(node->GC_list->refcount>=0
263 			|| node->GC_list->refcount==-1);
264 #endif
265 		if (node->GC_list->refcount==0)
266 		{
267 			free_gclist(node->GC_list);
268 			node->GC_list=NULL;
269 			return 1;
270 		}
271 	}
272 	return 0;
273 }
274 
275 #ifdef PROGRESS_REPORTS
276 /*
277  * Shows a progress report on stderr
278  */
report_progress(char * line,int size,int now)279 void report_progress(char *line,int size, int now)
280 {
281 	fprintf(stderr,"%s %d%% done%s\r",
282 		line,(100*now)/size,"                  ");
283 	fflush(stderr);
284 }
285 #endif
286 
287 /*
288  * Gives first region from two gc_lists, eliminating same regions.
289  */
first_of(struct GC_POINTER * lp,struct GC_POINTER * rp)290 struct REGION first_of(struct GC_POINTER *lp,struct GC_POINTER *rp)
291 {
292 	struct REGION l_reg,r_reg;
293 
294 	/* quite straightforward limiting of two gc lists.
295 	   same regions are concatanated */
296 	get_region(lp,&l_reg);
297 	get_region(rp,&r_reg);
298 	if (r_reg.start!=-1 && l_reg.start!=-1)
299 	{
300 		if (l_reg.start<r_reg.start)
301 		{
302 			prev_region(rp,&r_reg);
303 			return l_reg;
304 		} else if (l_reg.start>r_reg.start)
305 		{
306 			prev_region(lp,&l_reg);
307 			return r_reg;
308 		} else if (l_reg.end<r_reg.end)
309 		{
310 			prev_region(rp,&r_reg);
311 			return l_reg;
312 		} else if (l_reg.end>r_reg.end)
313 		{
314 			prev_region(lp,&l_reg);
315 			return r_reg;
316 		} else
317 		{
318 			return r_reg;
319 		}
320 	}
321 	if (r_reg.start!=-1) return r_reg;
322 	if (l_reg.start!=-1) return l_reg;
323 	/* Both lists were empty, we return (-1,-1) */
324 	return r_reg;
325 }
326 
327 /*
328  * Handles or operation
329  */
or(struct GC_LIST * l,struct GC_LIST * r)330 struct GC_LIST *or(struct GC_LIST *l,struct GC_LIST *r)
331 {
332 	struct GC_POINTER lp,rp;
333 	struct GC_LIST *a;
334 	struct REGION tmp;
335 #ifdef OPTIMIZE_SORTS
336 	struct REGION prev;
337 #endif
338 #ifdef PROGRESS_REPORTS
339 	char *oper_name;
340 	int prog;int prog_start;
341 #endif
342 
343 #ifdef DEBUG
344 	fprintf(stderr,"or called\n");
345 #endif
346 	stats.or++;
347 	a=new_gclist();
348 #ifdef OPTIMIZE_SORTS
349 	prev.start=-1;
350 	prev.end=-1;
351 #endif
352 	start_region_search(l,&lp);
353 	start_region_search(r,&rp);
354 
355 #ifdef PROGRESS_REPORTS
356 	prog_start=LIST_SIZE(r)+LIST_SIZE(l);
357 	prog=0;
358 	oper_name="or";
359 #endif
360 
361 	for(tmp=first_of(&lp,&rp);tmp.start!=-1;tmp=first_of(&lp,&rp))
362 	{
363 #ifdef OPTIMIZE_SORTS
364 		if ( tmp.end<=prev.end )
365 		{
366 			/* We had nesting */
367 			a->nested=TRUE;
368 		}
369 #endif
370 		SHOW_PROG;
371 		add_region(a,tmp.start,tmp.end);
372 #ifdef OPTIMIZE_SORTS
373 		prev=tmp;
374 #endif
375 	}
376 	return a;
377 }
378 
379 /*
380  * Handles ordering which produces possibly nesting gc-lists.
381  */
nest_order(struct GC_LIST * l,struct GC_LIST * r,int type)382 struct GC_LIST *nest_order(struct GC_LIST *l,struct GC_LIST *r,int type)
383 {
384 	struct GC_POINTER lp,rp;
385 	struct GC_LIST *a;
386 	struct REGION r_reg,l_reg;
387 	static struct REGION *nest_stack=NULL;
388 	int nest_depth=0;
389 	int nestings;
390 	int s,e;
391 #ifdef PROGRESS_REPORTS
392 	char *oper_name;
393 	int prog;int prog_start;
394 #endif
395 
396 #ifdef DEBUG
397 	fprintf(stderr,"nest_order called\n");
398 #endif
399 	start_region_search(r,&rp);
400 
401 	if (nest_stack==NULL)
402 	{
403 		stats.nest_stacksize=DEFAULT_NEST_DEPTH*sizeof(*nest_stack);
404 		nest_stack=(struct REGION *)e_malloc(stats.nest_stacksize);
405 	}
406 	stats.order++;
407 	a=new_gclist();
408 	a->sorted=FALSE;
409 	a->nested=l->nested || r->nested;
410 #ifdef DEBUG
411 	if (a->nested) fprintf(stderr,"inherited nesting\n");
412 #endif
413 
414 #ifdef OPTIMIZE_SORTS
415 	if (l->nested) sort_by_end(l);
416 	else stats.sorts_optimized++;
417 #else
418 	sort_by_end(l);
419 #endif
420 
421 #ifdef PROGRESS_REPORTS
422 	prog_start=LIST_SIZE(r);
423 	prog=0;
424 	switch (type) {
425 	case L_ORDERED:
426 		oper_name="_.";
427 		break;
428 	case R_ORDERED:
429 		oper_name="._";
430 		break;
431 	case LR_ORDERED:
432 		oper_name="__";
433 		break;
434 	default:
435 		oper_name="..";
436 	}
437 #endif
438 	nestings=0;
439 	start_region_search(l,&lp);
440 	get_region(&lp,&l_reg);
441 	get_region(&rp,&r_reg);
442 	SHOW_PROG;
443 	/* If left or right region list was empty, we can return empty list */
444 	if (l_reg.start==-1 || r_reg.start==-1) return a;
445 
446 	do
447 	{
448 		if (l_reg.end<r_reg.start && l_reg.start!=-1 )
449 		{
450 			/* left region is first. Add to nest_stack
451 			   and nest queue */
452 			if (nest_depth*sizeof(*nest_stack)==stats.nest_stacksize)
453 			{
454 				nest_stack=(struct REGION *)
455 					e_realloc(nest_stack,
456 						stats.nest_stacksize*2,
457 						stats.nest_stacksize);
458 				stats.nest_stacksize*=2;
459 
460 			}
461 			nest_stack[nest_depth++]=l_reg;
462 			nestings=0;
463 #ifdef DEBUG
464 			if (nest_depth==1)
465 				fprintf(stderr," New q");
466 			else fprintf(stderr," +");
467 			fprintf(stderr,"(%d:%d)",l_reg.start,l_reg.end);
468 #endif
469 			get_region(&lp,&l_reg);
470 		}
471 		else if (nest_depth>0)
472 		{
473 #ifdef DEBUG
474 			fprintf(stderr," %d",r_reg.end);
475 #endif
476 			if (type==L_ORDERED || type==LR_ORDERED)
477 				s=nest_stack[--nest_depth].end+1;
478 			else s=nest_stack[--nest_depth].start;
479 			if (type==R_ORDERED || type==LR_ORDERED)
480 				e=r_reg.start-1;
481 			else e=r_reg.end;
482 
483 			if (e>=s)
484 			{
485 				add_region(a,s,e);
486 #ifdef OPTIMIZE_SORTS
487 				/* If we have taken region from nest stack
488 				   twice in row, it probably means, that
489 				   we have a nested result list */
490 				nestings++;
491 				if (nestings==2)
492 				{
493 #ifdef DEBUG
494 					if (!a->nested)
495 					fprintf(stderr,"nesting order detecded\n");
496 #endif
497 					a->nested=TRUE;
498 				}
499 #endif
500 			}
501 			get_region(&rp,&r_reg);
502 			SHOW_PROG;
503 		} else
504 		{
505 			get_region(&rp,&r_reg);
506 			SHOW_PROG;
507 		}
508 	} while ( r_reg.start!=-1 );
509 #ifdef OPTIMIZE_SORTS
510 	if (!a->nested) stats.sorts_optimized++;
511 	a->sorted=!a->nested;
512 #endif
513 	if (!a->sorted) sort_by_start(a);
514 	return a;
515 }
516 
517 /*
518  * Handles in operation
519  */
in(struct GC_LIST * l,struct GC_LIST * r,int not)520 struct GC_LIST *in(struct GC_LIST *l,struct GC_LIST *r, int not)
521 /* Changed by PK in Febr 95 to capture the semantics of _proper_
522 containment */
523 {
524 	struct GC_POINTER lp,rp;
525 	struct GC_LIST *a,*r2;
526 	struct REGION r_reg,l_reg,r_reg2;
527 	char *oper_name;
528 #ifdef PROGRESS_REPORTS
529 	int prog;int prog_start;
530 #endif
531 
532 #ifdef DEBUG
533 	fprintf(stderr,"in called\n");
534 #endif
535 	if (not)
536 	{
537 		stats.not_in++;
538 		oper_name="not in";
539 	} else
540 	{
541 		stats.in++;
542 		oper_name="in";
543 	}
544 	a=new_gclist();
545 
546 #ifdef OPTIMIZE_SORTS
547 	a->nested=l->nested;
548 #endif
549 
550 	start_region_search(l,&lp);
551 	get_region(&lp,&l_reg);
552 
553 
554 	/*
555  	* To simplify things we do an outer function on right gc_list
556  	*/
557 #ifdef OPTIMIZE_SORTS
558 	if (r->nested)
559 	{
560 #endif
561 		r2=outer(r);
562 		r=r2;
563 #ifdef OPTIMIZE_SORTS
564 	} else r2=NULL;
565 #endif
566 	start_region_search(r,&rp);
567 
568 #ifdef PROGRESS_REPORTS
569 	prog_start=LIST_SIZE(l)+LIST_SIZE(r);
570 	prog=0;
571 #endif
572 
573 	get_region(&rp,&r_reg);
574 	SHOW_PROG;
575 	while (r_reg.start!=-1 && l_reg.start!=-1)
576 	{
577 #ifdef DEBUG
578 		fprintf(stderr,"in: left=(%d,%d) right=(%d,%d)\n",
579 			l_reg.start,l_reg.end,
580 			r_reg.start,r_reg.end);
581 #endif
582 		if (l_reg.start<r_reg.start)
583 		{
584 			/* Left region starts before right -> can't be
585 			   in right region or any right region that follows
586 			   current one */
587 			if (not) add_region(a,l_reg.start,l_reg.end);
588 			get_region(&lp,&l_reg);
589 			SHOW_PROG;
590 		} else /* l_reg.start>=r_reg.start */
591 		{
592 			if (l_reg.end<=r_reg.end)
593 			{
594 				/* left region is in right region */
595 /* Start PK Febr 95 */
596 				if (l_reg.start>r_reg.start ||
597 				    l_reg.end<r_reg.end)
598 				{	/* inclusion is proper */
599 					if (!not) add_region(a,l_reg.start,l_reg.end);
600 					get_region(&lp,&l_reg);
601 					SHOW_PROG;
602 				} else { /* l_reg == r_reg */
603 					if (not) add_region(a,l_reg.start,l_reg.end);
604 					get_region(&lp,&l_reg);
605 					SHOW_PROG;
606 				}
607 /* End PK Febr 95 */
608 			} else 	if (l_reg.start==r_reg.start)
609 			{
610 				/* Regions start from same place. Because
611 				no right region after current one can start
612 				from same place we can skip left region */
613 				if (not) add_region(a,l_reg.start,l_reg.end);
614 				get_region(&lp,&l_reg);
615 				SHOW_PROG;
616 			} else
617 			{
618 				/* left and right region are overlapping */
619 #ifdef DEBUG
620 				fprintf(stderr,"in overlap\n");
621 #endif
622 				get_region(&rp,&r_reg2);
623 				if (r_reg2.start==-1)
624 				{
625 					/* All right regions have been scanned */
626 					if ( l_reg.start > r_reg.end )
627 					{
628 						/* Left region end after last right region.
629 						   We can fall out of loop */
630 						SHOW_PROG;
631 						r_reg=r_reg2;
632 					} else
633 					{
634 						/* Next left region might still be in right
635 						   region */
636 						if (not) add_region(a,l_reg.start, l_reg.end);
637 						get_region(&lp,&l_reg);
638 						SHOW_PROG;
639 					}
640 				} else
641 				{
642 					/* There are still right regions */
643 					if ( l_reg.start >= r_reg2.start )
644 					{
645 						/* Since left region starts after new right region,
646 						 * We can safely skip previous right */
647 						r_reg=r_reg2;
648 						SHOW_PROG;
649 					} else
650 					{
651 						/* Left region is not in previous or next
652 						   right region */
653 						prev_region(&rp,&r_reg2);
654 						if (not) add_region(a,l_reg.start, l_reg.end);
655 						get_region(&lp,&l_reg);
656 						SHOW_PROG;
657 					}
658 				}
659 			}
660 		}
661 	}
662 
663 #ifdef DEBUG
664 	fprintf(stderr,"in fall out\n");
665 #endif
666 /* If we have "not in" and right gc_list is empty, we need to copy
667    rest of left list */
668 	if (not)
669 	{
670 		while (l_reg.start!=-1)
671 		{
672 			add_region(a,l_reg.start,l_reg.end);
673 			get_region(&lp,&l_reg);
674 			SHOW_PROG;
675 		}
676 	}
677 
678 /* because we created list r2 here, we free it here */
679 	if (r2!=NULL) free_gclist(r2);
680 	return a;
681 }
682 
683 /*
684  * Handles outer function
685  */
outer(struct GC_LIST * gcl)686 struct GC_LIST *outer(struct GC_LIST *gcl)
687 {
688 	struct GC_POINTER p;
689 	struct REGION reg1,reg2;
690 	struct GC_LIST *a;
691 #ifdef PROGRESS_REPORTS
692 	char *oper_name;
693 	int prog;int prog_start;
694 #endif
695 
696 #ifdef PROGRESS_REPORTS
697 	oper_name="outer";
698 	prog_start=LIST_SIZE(gcl);
699 	prog=0;
700 #endif
701 
702 	stats.outer++;
703 	reg2.start=0;
704 	a=new_gclist();
705 	start_region_search(gcl,&p);
706 	get_region(&p,&reg1);
707 
708 	/* if we had empty gc list */
709 	if (reg1.start==-1) return a;
710 	SHOW_PROG;
711 
712 	/* If there are many regions starting from same place, we choose the
713 	   longest */
714 	get_region(&p,&reg2);
715 	SHOW_PROG;
716 	while (reg2.start==reg1.start && reg2.end>reg1.end)
717 	{
718 		reg1=reg2;
719 		get_region(&p,&reg2);
720 		SHOW_PROG;
721 	}
722 
723 	while(reg1.start!=-1 && reg2.start!=-1)
724 	{
725 		if (reg2.end>reg1.end && reg2.start!=reg1.start)
726 		{
727 			/* reg2 ends after reg1 -> no nesting */
728 			add_region(a,reg1.start,reg1.end);
729 			reg1=reg2;
730 		}
731 		get_region(&p,&reg2);
732 		SHOW_PROG;
733 		/* If regions start from same place, nesting is guaranteed */
734 		if (reg2.start==reg1.start)
735 		{
736 			reg1=reg2;
737 			get_region(&p,&reg2);
738 			SHOW_PROG;
739 		}
740 	}
741 	add_region(a,reg1.start,reg1.end);
742 	return a;
743 }
744 
745 /*
746  * Handles inner function
747  */
inner(struct GC_LIST * gcl)748 struct GC_LIST *inner(struct GC_LIST *gcl)
749 {
750 	static struct REGION *inner_stack;
751 	struct GC_POINTER p;
752 	int inq_ind=0;
753 	struct GC_LIST *a=NULL;
754 	struct REGION n_reg,c_reg;
755 	int i;
756 #ifdef PROGRESS_REPORTS
757 	char *oper_name;
758 	int prog;int prog_start;
759 #endif
760 
761 #ifdef DEBUG
762 	fprintf(stderr,"inner called\n");
763 #endif
764 	stats.inner++;
765 	a=new_gclist();
766 	if (stats.inner_tablesize==0)
767 	{
768 		stats.inner_tablesize=
769 			DEFAULT_INNER_QUEUE*sizeof(*inner_stack);
770 		inner_stack=(struct REGION *) e_malloc(stats.inner_tablesize);
771 	}
772 
773 #ifdef PROGRESS_REPORTS
774 	prog_start=LIST_SIZE(gcl);
775 	prog=0;
776 	oper_name="inner";
777 #endif
778 
779 	start_region_search(gcl,&p);
780 	get_region(&p,&c_reg);
781 	SHOW_PROG;
782 	while (c_reg.start!=-1) {
783 		get_region(&p,&n_reg);
784 		SHOW_PROG;
785 #ifdef ASSERT
786 		assert(n_reg.start>=c_reg.start || n_reg.start==-1 );
787 #endif
788 		if ( n_reg.start>c_reg.end || n_reg.start==-1 )
789 		{
790 			/* n_reg and c_reg are separate. Therefore c_reg must
791 			   be innermost */
792 			/* Now we can empty inner_stack */
793 #ifdef DEBUG
794 			fprintf(stderr,"empty inner stack (%d regions)\n",inq_ind);
795 #endif
796 			for (i=0;i<inq_ind;i++)
797 			{
798 #ifdef ASSERT
799 				assert(inner_stack[i].start<=c_reg.start);
800 #endif
801 				if (inner_stack[i].end<c_reg.end)
802 				/* Region in inner_stack was innermost */
803 					add_region(a,inner_stack[i].start,
804 						inner_stack[i].end);
805 			}
806 			inq_ind=0;
807 			add_region(a,c_reg.start,c_reg.end);
808 		} else if ( n_reg.end>c_reg.end )
809 		{
810 			/* n_reg and c_reg are overlapping. Let's add c_reg
811 			   to inner_stack */
812 			if (inq_ind*sizeof(*inner_stack)==stats.inner_tablesize)
813 			{
814 				inner_stack=(struct REGION *) e_realloc(inner_stack,
815 					stats.inner_tablesize*2,stats.inner_tablesize);
816 				stats.inner_tablesize*=2;
817 			}
818 			inner_stack[inq_ind++]=c_reg;
819 		} else {
820 		/* if neither of the previous if's was taken,
821 		   c_reg contains n_reg. We remove regions containing n_reg from
822 		   inner_stack */
823 			while(inq_ind &&
824 				n_reg.start>=inner_stack[inq_ind-1].start &&
825 				n_reg.end<=inner_stack[inq_ind-1].end )
826 			{
827 				inq_ind--;
828 			}
829 		}
830 		c_reg=n_reg;
831 #ifdef ASSERT
832 		if (inq_ind)
833 			assert(c_reg.start<inner_stack[inq_ind-1].start ||
834 				c_reg.end>inner_stack[inq_ind-1].end);
835 #endif
836 	}
837 	return a;
838 }
839 
containing(struct GC_LIST * l,struct GC_LIST * r,int not)840 struct GC_LIST *containing(struct GC_LIST *l,struct GC_LIST *r,int not)
841 /* Changed by PK in Febr 95 to capture the semantics of _proper_
842 containment */
843 {
844 	struct GC_POINTER lp,rp;
845 	struct GC_LIST *a,*r2;
846 	struct REGION r_reg,l_reg;
847 #ifdef PROGRESS_REPORTS
848 	char *oper_name;
849 	int prog;int prog_start;
850 #endif
851 
852 #ifdef DEBUG
853 	fprintf(stderr,"containing called\n");
854 #endif
855 	if (not) stats.not_containing++; else stats.containing++;
856 	a=new_gclist();
857 #ifdef OPTIMIZE_SORTS
858 	a->nested=l->nested;
859 #endif
860 	start_region_search(l,&lp);
861 	get_region(&lp,&l_reg);
862 
863 /* To simplify things we do an inner function on right gc_list */
864 #ifdef OPTIMIZE_SORTS
865 	if (r->nested)
866 	{
867 #endif
868 		r2=inner(r);
869 		r=r2;
870 #ifdef OPTIMIZE_SORTS
871 	} else r2=NULL;
872 #endif
873 
874 #ifdef PROGRESS_REPORTS
875 	oper_name= (not) ? "not containing" : "containing";
876 	prog=0;
877 	prog_start=LIST_SIZE(l)+LIST_SIZE(r);
878 #endif
879 	start_region_search(r,&rp);
880 
881 	get_region(&rp,&r_reg);
882 	SHOW_PROG;
883 	while (r_reg.start!=-1 && l_reg.start!=-1)
884 	{
885 		if ( l_reg.start>r_reg.start )
886 		{
887 			/* right starts before left */
888 			get_region(&rp,&r_reg);
889 			SHOW_PROG;
890 		} else if ( l_reg.end>=r_reg.end )
891 		{
892 			/* left contains right */
893 /* Start PK Febr 95 */
894 			if (l_reg.start<r_reg.start ||
895 			    l_reg.end>r_reg.end)
896 			{	/* Containment is proper */
897 				if (!not) add_region(a,l_reg.start,l_reg.end);
898 				get_region(&lp,&l_reg);
899 				SHOW_PROG;
900 			} else { /* l_reg == r_reg */
901 				if (not) add_region(a,l_reg.start,l_reg.end);
902 				get_region(&lp,&l_reg);
903 				SHOW_PROG;
904 			}
905 /* End PK Febr 95 */
906 		} else {
907 			/* left comes after right */
908 			if (not) add_region(a,l_reg.start,l_reg.end);
909 			get_region(&lp,&l_reg);
910 			SHOW_PROG;
911 		}
912 	}
913 	/* When right list ended, there still might be something in left list */
914 	while (not && l_reg.start!=-1)
915 	{
916 		add_region(a,l_reg.start,l_reg.end);
917 		get_region(&lp,&l_reg);
918 		SHOW_PROG;
919 	}
920 /* because we created list r2 here, we free it here */
921 	if (r2!=NULL) free_gclist(r2);
922 	return a;
923 }
924 
equal(struct GC_LIST * l,struct GC_LIST * r,int not)925 struct GC_LIST *equal(struct GC_LIST *l,struct GC_LIST *r,int not)
926 /* Intersection of GC_LISTs *l and *r */
927 /* PK Febr '95 */
928 {
929 	struct GC_POINTER lp,rp;
930 	struct GC_LIST *a;
931 	struct REGION r_reg,l_reg;
932 #ifdef PROGRESS_REPORTS
933 	char *oper_name;
934 	int prog;int prog_start;
935 #endif
936 
937 #ifdef DEBUG
938 	fprintf(stderr,"equal called\n");
939 #endif
940 	if (not) stats.not_equal++; else stats.equal++;
941 	a=new_gclist();
942 #ifdef OPTIMIZE_SORTS
943 	a->nested=l->nested;
944 #endif
945 	start_region_search(l,&lp);
946 	get_region(&lp,&l_reg);
947 
948 
949 #ifdef PROGRESS_REPORTS
950 	oper_name= (not) ? "not equal" : "equal";
951 	prog=0;
952 	prog_start=LIST_SIZE(l)+LIST_SIZE(r);
953 #endif
954 	start_region_search(r,&rp);
955 	get_region(&rp,&r_reg);
956 
957 	SHOW_PROG;
958 
959 	while (r_reg.start!=-1 && l_reg.start!=-1)
960 	{
961 		if ( l_reg.start<r_reg.start )
962 		{
963 			if (not) add_region(a,l_reg.start,l_reg.end);
964 			get_region(&lp,&l_reg);
965 			SHOW_PROG;
966 		} else if ( r_reg.start<l_reg.start )
967 		{
968 			get_region(&rp,&r_reg);
969 			SHOW_PROG;
970 		} else  /*  r_reg.start=l_reg.start */
971 			if ( l_reg.end<r_reg.end )
972 			{
973 				if (not) add_region(a,l_reg.start,l_reg.end);
974 				get_region(&lp,&l_reg);
975 				SHOW_PROG;
976 			} else if ( r_reg.end<l_reg.end )
977 			{
978 				get_region(&rp,&r_reg);
979 				SHOW_PROG;
980 			} else /* l_reg = r_reg */
981 			{
982 				if (!not) add_region(a,l_reg.start,l_reg.end);
983 				get_region(&rp,&r_reg);
984 				get_region(&lp,&l_reg);
985 				SHOW_PROG;
986 			}
987 	}
988 	/* When right list ended, there still might be something in left list */
989 	while (not && l_reg.start!=-1)
990 	{
991 		add_region(a,l_reg.start,l_reg.end);
992 		get_region(&lp,&l_reg);
993 		SHOW_PROG;
994 	}
995 	return a;
996 
997 } /* END equal(struct GC_LIST *l,struct GC_LIST *r,int not) */
998 
999 /*
1000  * Here we implement concat operation, which concats all overlapping regions
1001  * into one region and removes all nestings
1002  */
concat(struct GC_LIST * l)1003 struct GC_LIST *concat(struct GC_LIST *l)
1004 {
1005 	struct GC_POINTER lp;
1006 	struct GC_LIST *a;
1007 	struct REGION reg1,reg2;
1008 #ifdef PROGRESS_REPORTS
1009 	char *oper_name;
1010 	int prog;int prog_start;
1011 #endif
1012 
1013 #ifdef DEBUG
1014 	fprintf(stderr,"concat called\n");
1015 #endif
1016 	stats.concat++;
1017 	a=new_gclist();
1018 	start_region_search(l,&lp);
1019 	get_region(&lp,&reg1);
1020 
1021 	/* We had empty list */
1022 	if (reg1.start==-1) return a;
1023 
1024 #ifdef PROGRESS_REPORTS
1025 	oper_name="concat";
1026 	prog=0;
1027 	prog_start=LIST_SIZE(l);
1028 #endif
1029 	SHOW_PROG;
1030 	get_region(&lp,&reg2);
1031 	SHOW_PROG;
1032 
1033 	while (reg2.start!=-1)
1034 	{
1035 		if (reg2.start>reg1.end+1)
1036 		{
1037 			/* separate regions, no concat */
1038 			add_region(a,reg1.start,reg1.end);
1039 			reg1=reg2;
1040 		} else if ( reg2.end>reg1.end )
1041 		{
1042 			/* We found overlapping */
1043 			reg1.end=reg2.end;
1044 		}
1045 		get_region(&lp,&reg2);
1046 		SHOW_PROG;
1047 	}
1048 	add_region(a,reg1.start,reg1.end);
1049 	return a;
1050 }
1051 
1052 /*
1053  * Here we implement extracting operation
1054  */
extracting(struct GC_LIST * l,struct GC_LIST * r)1055 struct GC_LIST *extracting(struct GC_LIST *l,struct GC_LIST *r)
1056 {
1057 	struct GC_POINTER lp,rp,tmpp;
1058 	struct GC_LIST *a,*r2,*tmp,*new_tmp;
1059 	struct REGION l_reg,r_reg;
1060 	int prev_s=-1;
1061 	int prev_e=-1;
1062 	int last_tmp;
1063 	int must_be_sorted;
1064 #ifdef PROGRESS_REPORTS
1065 	char *oper_name;
1066 	int prog;int prog_start;
1067 #endif
1068 
1069 #ifdef DEBUG
1070 	fprintf(stderr,"extracting called\n");
1071 #endif
1072 	stats.extracting++;
1073 	/* to simplify things we do concat on right gc_list. Result stays
1074 	   the same anyway */
1075 	r2=concat(r);
1076 	r=r2;
1077 
1078 	a=new_gclist();
1079 #ifdef OPTIMIZE_SORTS
1080 	a->nested=l->nested;
1081 #endif
1082 	tmp=new_gclist();
1083 	start_region_search(tmp,&tmpp);
1084 
1085 #ifdef PROGRESS_REPORTS
1086 	oper_name="extracting";
1087 	prog=0;
1088 	prog_start=LIST_SIZE(l);
1089 #endif
1090 
1091 	start_region_search(l,&lp);
1092 	get_region(&lp,&l_reg);
1093 	SHOW_PROG;
1094 	start_region_search(r,&rp);
1095 	get_region(&rp,&r_reg);
1096 
1097 	while ( l_reg.start!=-1 )
1098 	{
1099 		if ( l_reg.end<r_reg.start || r_reg.start==-1 )
1100 		{
1101 			/* Regions are separate, left starting first.
1102  			   no cutting */
1103  			if ( prev_s!=l_reg.start || prev_e!=l_reg.end )
1104  			{
1105 #ifdef DEBUG
1106 	fprintf(stderr,"extracting adding 1(%d,%d)\n",l_reg.start,l_reg.end);
1107 #endif
1108 				prev_s=l_reg.start;
1109 				prev_e=l_reg.end;
1110 				add_region(a,l_reg.start,l_reg.end);
1111 			}
1112 			l_reg=first_of(&lp,&tmpp);
1113 			SHOW_PROG;
1114 		} else if ( r_reg.end<l_reg.start )
1115 		{
1116 			/* Regions are separate right starting first.
1117 			   we skip right */
1118 #ifdef DEBUG
1119 			fprintf(stderr,"skipping right, left=(%d,%d) right=(%d,%d)\n",
1120 				l_reg.start,l_reg.end,r_reg.start,r_reg.end);
1121 #endif
1122 			get_region(&rp,&r_reg);
1123 		} else
1124 		{
1125 			/* We need to do clipping. */
1126 			new_tmp=new_gclist();
1127 			new_tmp->sorted=FALSE;
1128 			must_be_sorted=FALSE;
1129 			last_tmp=-1;
1130 #ifdef DEBUG
1131 			fprintf(stderr,"cutting loop, cutter (%d,%d)\n",r_reg.start,r_reg.end);
1132 #endif
1133 			while ( l_reg.start!=-1 && l_reg.start<=r_reg.end )
1134 			{
1135 				if (l_reg.start<r_reg.start
1136 			&& ( prev_s!=l_reg.start ||
1137 			prev_e!=r_reg.start-1 )
1138 					)
1139 				{
1140 					prev_s=l_reg.start;
1141 					prev_e=r_reg.start-1;
1142 
1143 #ifdef DEBUG
1144 	fprintf(stderr,"extracting adding 2(%d,%d)\n",l_reg.start,r_reg.start-1);
1145 #endif
1146 					add_region(a,l_reg.start,r_reg.start-1);
1147 				}
1148 				if (r_reg.end<l_reg.end)
1149 				{
1150 #ifdef DEBUG
1151 	fprintf(stderr,"(%d,%d)<-new_tmp\n",r_reg.end+1,l_reg.end);
1152 #endif
1153 					add_region(new_tmp,
1154 						r_reg.end+1,l_reg.end);
1155 					if (l_reg.end<last_tmp)
1156 					{
1157 						must_be_sorted=TRUE;
1158 					} else last_tmp=l_reg.end;
1159 				}
1160 				l_reg=first_of(&lp,&tmpp);
1161 				SHOW_PROG;
1162 			}
1163 			if (l_reg.start!=-1) prev_region(&lp,&l_reg);
1164 
1165 #ifdef ASSERT
1166 			assert (tmpp.ind==tmp->length &&
1167 				tmpp.node->next==NULL);
1168 #endif
1169 			free_gclist(tmp);
1170 			if (must_be_sorted)
1171 				sort_by_start(new_tmp);
1172 #ifdef PROGRESS_REPORTS
1173 			prog--;
1174 			prog_start+=LIST_SIZE(new_tmp);
1175 #endif
1176 			tmp=new_tmp;
1177 			start_region_search(tmp,&tmpp);
1178 			/* Left region is now handled -> skip to next */
1179 			l_reg=first_of(&lp,&tmpp);
1180 			SHOW_PROG;
1181 		}
1182 	}
1183 	free_gclist(r2);
1184 	free_gclist(tmp);
1185 	return a;
1186 }
1187 
1188 /*
1189  * Join operation
1190  */
join(struct GC_LIST * l,int number)1191 struct GC_LIST *join(struct GC_LIST *l,int number)
1192 {
1193 	struct GC_LIST *a;
1194 	struct GC_POINTER p1,p2;
1195 	struct REGION r1,r2,prev_r1,prev_r2;
1196 	int i;
1197 #ifdef PROGRESS_REPORTS
1198 	char *oper_name;
1199 	int prog;int prog_start;
1200 #endif
1201 
1202 #ifdef DEBUG
1203 	fprintf(stderr,"join called %d\n",number);
1204 #endif
1205 #ifdef ASSERT
1206 	assert(number>0);
1207 #endif
1208 	stats.join++;
1209 	a=new_gclist();
1210 #ifdef OPTIMIZE_SORTS
1211 	a->nested=l->nested;
1212 #endif
1213 	if ( l->first==NULL )
1214 	{
1215 		/* This is an optimized chars node */
1216 		to_chars(a,(l->chars+1)*number);
1217 		return a;
1218 	}
1219 
1220 	/* List is smaller than join number, so return list is empty */
1221 	if (LIST_SIZE(l)<number) return a;
1222 
1223 #ifdef PROGRESS_REPORTS
1224 	oper_name="join";
1225 	prog=0;
1226 	prog_start=LIST_SIZE(l);
1227 #endif
1228 
1229 	start_region_search(l,&p1);
1230 	start_region_search(l,&p2);
1231 
1232 	prev_r2.start=-1;
1233 	prev_r1.end=-1;
1234 	for (i=number;i>0;i--)
1235 	{
1236 		get_region(&p1,&r1);
1237 		SHOW_PROG;
1238 #ifdef ASSERT
1239 		assert(r1.start!=-1);
1240 #endif
1241 	}
1242 	while (r1.start!=-1)
1243 	{
1244 		get_region(&p2,&r2);
1245 		if (r2.start==prev_r2.start)
1246 		{
1247 /*PK			if (prev.end<r2.end) */
1248 
1249 			if (r1.end<=prev_r1.end)
1250 			{
1251 				a->sorted=FALSE;
1252 			}
1253 		}
1254 		add_region(a,r2.start,r1.end);
1255 		prev_r1=r1;
1256 		get_region(&p1,&r1);
1257 		SHOW_PROG;
1258 		prev_r2=r2;
1259 	}
1260 	if (!a->sorted)
1261 	{
1262 		sort_by_start(a); /* This may leave duplicates in a */
1263 	  	remove_duplicates(a); /* Here they are removed */
1264 	};
1265 	return a;
1266 }
1267 
1268 /*
1269  * Handles ordering which does _not_ produce nesting gc-lists.
1270  * For example '"--" quote "--"' to catch SGML comments.
1271  */
quote(struct GC_LIST * l,struct GC_LIST * r,int type)1272 struct GC_LIST *quote(struct GC_LIST *l,struct GC_LIST *r,int type)
1273 {
1274 	struct GC_POINTER lp,rp;
1275 	struct GC_LIST *a;
1276 	struct REGION r_reg,l_reg;
1277 #ifdef PROGRESS_REPORTS
1278 	char *oper_name;
1279 	int prog;int prog_start;
1280 #endif
1281 
1282 #ifdef DEBUG
1283 	fprintf(stderr,"quote called\n");
1284 #endif
1285 	stats.quote++;
1286 	a=new_gclist();
1287 	a->sorted=TRUE;
1288 	a->nested=FALSE;
1289 
1290 #ifdef PROGRESS_REPORTS
1291 	prog_start=LIST_SIZE(r);
1292 	prog=0;
1293 	switch (type) {
1294 	case L_QUOTE:
1295 		oper_name="_quote";
1296 		break;
1297 	case R_QUOTE:
1298 		oper_name="quote_";
1299 		break;
1300 	case LR_QUOTE:
1301 		oper_name="_quote_";
1302 		break;
1303 	default:
1304 		oper_name="quote";
1305 	}
1306 #endif
1307 	start_region_search(r,&rp);
1308 	start_region_search(l,&lp);
1309 	get_region(&lp,&l_reg);
1310 	get_region(&rp,&r_reg);
1311 	SHOW_PROG;
1312 
1313 	/* If left or right region list was empty, we can return empty list */
1314 	if (l_reg.start==-1 || r_reg.start==-1) return a;
1315 	do {
1316 		/* Skip until we find ending quote after start quote */
1317 		while (l_reg.end>=r_reg.start && r_reg.start!=-1) {
1318 			get_region(&rp,&r_reg);
1319 			SHOW_PROG;
1320 		}
1321 		if (r_reg.start>=0) {
1322 			/* Add region using operation type */
1323 			switch (type) {
1324 			case QUOTE:
1325 				add_region(a,l_reg.start, r_reg.end);
1326 				break;
1327 			case L_QUOTE:
1328 				add_region(a,l_reg.end+1, r_reg.end);
1329 				break;
1330 			case R_QUOTE:
1331 				add_region(a,l_reg.start, r_reg.start-1);
1332 				break;
1333 			case LR_QUOTE:
1334 				/* No empty regions */
1335 				if (l_reg.end+1<r_reg.start) {
1336 					add_region(a,l_reg.end+1,r_reg.start-1);
1337 				}
1338 				break;
1339 			default:
1340 				fprintf(stderr,"quote: invalid oper type\n");
1341 				exit(2);
1342 			/* Skip until starting quote is after last ending
1343 			   quote */
1344 			}
1345 			while (l_reg.start<=r_reg.end && l_reg.start!=-1) {
1346 				get_region(&lp,&l_reg);
1347 				SHOW_PROG;
1348 			}
1349 		}
1350 	} while ( l_reg.start!=-1 && r_reg.start!=-1);
1351 	return a;
1352 }
1353 
1354