1 /*
2 System: Structured text retrieval tool sgrep.
3 Module: eval.c
4 Author: Pekka Kilpel�inen & Jani Jaakkola
5 Description: Handles the evaluation of sgrep expressions, thus
6 implementing the actual semantics of sgrep language.
7 used through eval() function
8 Version history: Original version February 1995 by JJ & PK
9 Copyright: University of Helsinki, Dept. of Computer Science
10 Distributed under GNU General Public Lisence
11 See file COPYING for details
12 */
13 #include <stdio.h>
14 #include <stdlib.h>
15 #include <limits.h>
16 #include "defines.h"
17
18 #if defined(ASSERT) && defined(OPTIMIZE_SORTS)
19 /* Define this if you want always test that nest optimization works */
20 #define ASSERT_NESTS
21 #endif
22
23 #ifdef PROGRESS_REPORTS
24 /* We have progress reports compiled in */
25 #define SHOW_PROG do { \
26 prog++; \
27 if (progress_output && !(prog&1023)) report_progress(oper_name,prog_start,prog); \
28 } while (0)
29 #else
30 #define SHOW_PROG do {} while (0)
31 #endif
32
33 /*
34 * Startup size of nest stack
35 */
36 #define DEFAULT_NEST_DEPTH 1024
37
38 /*
39 * Startup size of inner queue
40 */
41 #define DEFAULT_INNER_QUEUE 1024
42
43 struct GC_LIST *real_eval(struct TREE_NODE *root);
44 struct GC_LIST *or(struct GC_LIST *,struct GC_LIST *);
45 struct GC_LIST *nest_order(struct GC_LIST *,struct GC_LIST *,int);
46 struct GC_LIST *quote(struct GC_LIST *,struct GC_LIST *,int);
47 struct GC_LIST *in(struct GC_LIST *,struct GC_LIST *,int);
48 struct GC_LIST *containing(struct GC_LIST *,struct GC_LIST *,int);
49 struct GC_LIST *extracting(struct GC_LIST *,struct GC_LIST *);
50 struct GC_LIST *outer(struct GC_LIST *);
51 struct GC_LIST *inner(struct GC_LIST *);
52 struct GC_LIST *concat(struct GC_LIST *);
53 struct GC_LIST *join(struct GC_LIST *,int number);
54 struct GC_LIST *equal(struct GC_LIST *,struct GC_LIST *,int);
55 int free_tree_node(struct TREE_NODE *node);
56
57 /*
58 * Recursively evaluates parse tree using operation functions
59 * root points the root node of parse tree
60 */
eval(struct TREE_NODE * root)61 struct GC_LIST *eval(struct TREE_NODE *root)
62 {
63 static int depth=0;
64 #ifdef DEBUG
65 int i;
66 #endif
67 struct GC_LIST *a;
68
69 a=root->GC_list;
70 depth++;
71 #ifdef DEBUG
72 for(i=0;i<depth;i++) fputc(' ',stderr);
73 fprintf(stderr,"Evaluating oper %s l_label=%d r_label=%d\n",
74 give_oper_name(root->oper),root->label_left,root->label_right);
75 #endif
76 #ifdef ASSERT
77 assert(root->oper!=INVALID);
78 #endif
79 /* If this is a leaf node, we just use leafs gc list */
80 if ( a==NULL && root->oper==PHRASE )
81 {
82 #ifdef ASSERT
83 assert(root->leaf->GC_list!=NULL);
84 #endif
85 a=root->leaf->GC_list;
86 /* OBSOLETE
87 root->leaf->GC_list=NULL;
88 */
89 a->refcount=root->refcount;
90 #ifdef DEBUG
91 for(i=0;i<depth;i++) fputc(' ',stderr);
92 fprintf(stderr,"Using phrase list %s\n",root->leaf->phrase->s);
93 #endif
94 }
95 /* If gc_list is still NULL, it means that it hasn't been evaluated yet */
96 if ( a==NULL )
97 {
98 #ifdef DEBUG
99 for(i=0;i<depth;i++) fputc(' ',stderr);
100 fprintf(stderr,"Calling real eval oper %s\n",
101 give_oper_name(root->oper));
102 #endif
103 a=real_eval(root);
104 a->refcount=root->refcount;
105 /* We free subtrees unneeded gclists */
106 if (free_tree_node(root->left))
107 {
108 #ifdef DEBUG
109 for(i=0;i<depth;i++) putc(' ',stderr);
110 fprintf(stderr,"label %d freed (left)\n",root->label_left);
111 #endif
112 }
113 if (free_tree_node(root->right))
114 {
115 #ifdef DEBUG
116 for(i=0;i<depth;i++) putc(' ',stderr);
117 fprintf(stderr,"label %d freed (right)\n",root->label_right);
118 #endif
119 }
120 }
121 #ifdef DEBUG
122 else
123 {
124 for(i=0;i<depth;i++) fputc(' ',stderr);
125 fprintf(stderr,"Using already known list\n");
126 }
127 #endif
128
129 /* Keeps track of longest used gc list */
130 if (LIST_SIZE(a)>stats.longest_list)
131 stats.longest_list=LIST_SIZE(a);
132 #ifdef ASSERT_NESTS
133 /* We check that if list isn't marked as nested, it really isn't */
134 if (!a->nested)
135 {
136 struct REGION reg1,reg2;
137 struct GC_POINTER p;
138
139 start_region_search(a,&p);
140 get_region(&p,®1);
141 get_region(&p,®2);
142 while (reg2.start!=-1)
143 {
144 assert(reg1.end<reg2.end);
145 reg1=reg2;
146 get_region(&p,®2);
147 }
148 }
149 #endif
150
151 #ifdef PROGRESS_REPORTS
152 if (progress_output)
153 {
154 /* After every operation, we clean the line */
155 fprintf(stderr," \r");
156 }
157 #endif
158 root->GC_list=a;
159
160 #ifdef DEBUG
161 for(i=0;i<depth;i++) fputc(' ',stderr);
162 fprintf(stderr,"eval done\n");
163 #endif
164 depth--;
165 return a;
166 }
167
168 /*
169 * Handles the actual evaluation of some operation
170 */
real_eval(struct TREE_NODE * root)171 struct GC_LIST *real_eval(struct TREE_NODE *root)
172 {
173 struct GC_LIST *a,*l,*r;
174
175 a=NULL;
176 #ifdef ASSERT
177 assert(root->left!=NULL);
178 #endif
179
180 /* Let's evaluate left and right subtrees first */
181 l=eval(root->left);
182 /* if root->right==NULL this is a function, and it won't have
183 right sub tree */
184 if (root->right==NULL) r=NULL;
185 else r=eval(root->right);
186
187 /* Let's call the operation function */
188 switch (root->oper) {
189 case OR:
190 a=or(l,r);
191 break;
192 case ORDERED:
193 case L_ORDERED:
194 case R_ORDERED:
195 case LR_ORDERED:
196 a=nest_order(l,r,root->oper);
197 break;
198 case QUOTE:
199 case L_QUOTE:
200 case R_QUOTE:
201 case LR_QUOTE:
202 a=quote(l,r,root->oper);
203 break;
204 case IN:
205 a=in(l,r,FALSE);
206 break;
207 case NOT_IN:
208 a=in(l,r,TRUE);
209 break;
210 case CONTAINING:
211 a=containing(l,r,FALSE);
212 break;
213 case NOT_CONTAINING:
214 a=containing(l,r,TRUE);
215 break;
216 /* Start PK Febr 95 */
217 case EQUAL:
218 a=equal(l,r,FALSE);
219 break;
220 case NOT_EQUAL:
221 a=equal(l,r,TRUE);
222 break;
223 /* End PK Febr 95 */
224 case OUTER:
225 a=outer(l);
226 break;
227 case INNER:
228 a=inner(l);
229 break;
230 case EXTRACTING:
231 a=extracting(l,r);
232 break;
233 case CONCAT:
234 a=concat(l);
235 break;
236 case JOIN:
237 a=join(l,root->number);
238 break;
239 default:
240 fprintf(stderr,"Unsupported operation in parse tree (%d)\n",
241 root->oper);
242 exit(3);
243 break;
244 }
245
246 return a;
247 }
248
249 /*
250 * Decrements tree nodes reference counter, and frees nodes gc list if
251 * counter comes down to 0. Returns 1 if something was freed, 0
252 * otherwise
253 */
free_tree_node(struct TREE_NODE * node)254 int free_tree_node(struct TREE_NODE *node)
255 {
256 if (node==NULL) return 0; /* This was a leaf or function node */
257 if (node->GC_list!=NULL)
258 {
259 if (node->GC_list->refcount!=-1)
260 node->GC_list->refcount--;
261 #ifdef ASSERT
262 assert(node->GC_list->refcount>=0
263 || node->GC_list->refcount==-1);
264 #endif
265 if (node->GC_list->refcount==0)
266 {
267 free_gclist(node->GC_list);
268 node->GC_list=NULL;
269 return 1;
270 }
271 }
272 return 0;
273 }
274
275 #ifdef PROGRESS_REPORTS
276 /*
277 * Shows a progress report on stderr
278 */
report_progress(char * line,int size,int now)279 void report_progress(char *line,int size, int now)
280 {
281 fprintf(stderr,"%s %d%% done%s\r",
282 line,(100*now)/size," ");
283 fflush(stderr);
284 }
285 #endif
286
287 /*
288 * Gives first region from two gc_lists, eliminating same regions.
289 */
first_of(struct GC_POINTER * lp,struct GC_POINTER * rp)290 struct REGION first_of(struct GC_POINTER *lp,struct GC_POINTER *rp)
291 {
292 struct REGION l_reg,r_reg;
293
294 /* quite straightforward limiting of two gc lists.
295 same regions are concatanated */
296 get_region(lp,&l_reg);
297 get_region(rp,&r_reg);
298 if (r_reg.start!=-1 && l_reg.start!=-1)
299 {
300 if (l_reg.start<r_reg.start)
301 {
302 prev_region(rp,&r_reg);
303 return l_reg;
304 } else if (l_reg.start>r_reg.start)
305 {
306 prev_region(lp,&l_reg);
307 return r_reg;
308 } else if (l_reg.end<r_reg.end)
309 {
310 prev_region(rp,&r_reg);
311 return l_reg;
312 } else if (l_reg.end>r_reg.end)
313 {
314 prev_region(lp,&l_reg);
315 return r_reg;
316 } else
317 {
318 return r_reg;
319 }
320 }
321 if (r_reg.start!=-1) return r_reg;
322 if (l_reg.start!=-1) return l_reg;
323 /* Both lists were empty, we return (-1,-1) */
324 return r_reg;
325 }
326
327 /*
328 * Handles or operation
329 */
or(struct GC_LIST * l,struct GC_LIST * r)330 struct GC_LIST *or(struct GC_LIST *l,struct GC_LIST *r)
331 {
332 struct GC_POINTER lp,rp;
333 struct GC_LIST *a;
334 struct REGION tmp;
335 #ifdef OPTIMIZE_SORTS
336 struct REGION prev;
337 #endif
338 #ifdef PROGRESS_REPORTS
339 char *oper_name;
340 int prog;int prog_start;
341 #endif
342
343 #ifdef DEBUG
344 fprintf(stderr,"or called\n");
345 #endif
346 stats.or++;
347 a=new_gclist();
348 #ifdef OPTIMIZE_SORTS
349 prev.start=-1;
350 prev.end=-1;
351 #endif
352 start_region_search(l,&lp);
353 start_region_search(r,&rp);
354
355 #ifdef PROGRESS_REPORTS
356 prog_start=LIST_SIZE(r)+LIST_SIZE(l);
357 prog=0;
358 oper_name="or";
359 #endif
360
361 for(tmp=first_of(&lp,&rp);tmp.start!=-1;tmp=first_of(&lp,&rp))
362 {
363 #ifdef OPTIMIZE_SORTS
364 if ( tmp.end<=prev.end )
365 {
366 /* We had nesting */
367 a->nested=TRUE;
368 }
369 #endif
370 SHOW_PROG;
371 add_region(a,tmp.start,tmp.end);
372 #ifdef OPTIMIZE_SORTS
373 prev=tmp;
374 #endif
375 }
376 return a;
377 }
378
379 /*
380 * Handles ordering which produces possibly nesting gc-lists.
381 */
nest_order(struct GC_LIST * l,struct GC_LIST * r,int type)382 struct GC_LIST *nest_order(struct GC_LIST *l,struct GC_LIST *r,int type)
383 {
384 struct GC_POINTER lp,rp;
385 struct GC_LIST *a;
386 struct REGION r_reg,l_reg;
387 static struct REGION *nest_stack=NULL;
388 int nest_depth=0;
389 int nestings;
390 int s,e;
391 #ifdef PROGRESS_REPORTS
392 char *oper_name;
393 int prog;int prog_start;
394 #endif
395
396 #ifdef DEBUG
397 fprintf(stderr,"nest_order called\n");
398 #endif
399 start_region_search(r,&rp);
400
401 if (nest_stack==NULL)
402 {
403 stats.nest_stacksize=DEFAULT_NEST_DEPTH*sizeof(*nest_stack);
404 nest_stack=(struct REGION *)e_malloc(stats.nest_stacksize);
405 }
406 stats.order++;
407 a=new_gclist();
408 a->sorted=FALSE;
409 a->nested=l->nested || r->nested;
410 #ifdef DEBUG
411 if (a->nested) fprintf(stderr,"inherited nesting\n");
412 #endif
413
414 #ifdef OPTIMIZE_SORTS
415 if (l->nested) sort_by_end(l);
416 else stats.sorts_optimized++;
417 #else
418 sort_by_end(l);
419 #endif
420
421 #ifdef PROGRESS_REPORTS
422 prog_start=LIST_SIZE(r);
423 prog=0;
424 switch (type) {
425 case L_ORDERED:
426 oper_name="_.";
427 break;
428 case R_ORDERED:
429 oper_name="._";
430 break;
431 case LR_ORDERED:
432 oper_name="__";
433 break;
434 default:
435 oper_name="..";
436 }
437 #endif
438 nestings=0;
439 start_region_search(l,&lp);
440 get_region(&lp,&l_reg);
441 get_region(&rp,&r_reg);
442 SHOW_PROG;
443 /* If left or right region list was empty, we can return empty list */
444 if (l_reg.start==-1 || r_reg.start==-1) return a;
445
446 do
447 {
448 if (l_reg.end<r_reg.start && l_reg.start!=-1 )
449 {
450 /* left region is first. Add to nest_stack
451 and nest queue */
452 if (nest_depth*sizeof(*nest_stack)==stats.nest_stacksize)
453 {
454 nest_stack=(struct REGION *)
455 e_realloc(nest_stack,
456 stats.nest_stacksize*2,
457 stats.nest_stacksize);
458 stats.nest_stacksize*=2;
459
460 }
461 nest_stack[nest_depth++]=l_reg;
462 nestings=0;
463 #ifdef DEBUG
464 if (nest_depth==1)
465 fprintf(stderr," New q");
466 else fprintf(stderr," +");
467 fprintf(stderr,"(%d:%d)",l_reg.start,l_reg.end);
468 #endif
469 get_region(&lp,&l_reg);
470 }
471 else if (nest_depth>0)
472 {
473 #ifdef DEBUG
474 fprintf(stderr," %d",r_reg.end);
475 #endif
476 if (type==L_ORDERED || type==LR_ORDERED)
477 s=nest_stack[--nest_depth].end+1;
478 else s=nest_stack[--nest_depth].start;
479 if (type==R_ORDERED || type==LR_ORDERED)
480 e=r_reg.start-1;
481 else e=r_reg.end;
482
483 if (e>=s)
484 {
485 add_region(a,s,e);
486 #ifdef OPTIMIZE_SORTS
487 /* If we have taken region from nest stack
488 twice in row, it probably means, that
489 we have a nested result list */
490 nestings++;
491 if (nestings==2)
492 {
493 #ifdef DEBUG
494 if (!a->nested)
495 fprintf(stderr,"nesting order detecded\n");
496 #endif
497 a->nested=TRUE;
498 }
499 #endif
500 }
501 get_region(&rp,&r_reg);
502 SHOW_PROG;
503 } else
504 {
505 get_region(&rp,&r_reg);
506 SHOW_PROG;
507 }
508 } while ( r_reg.start!=-1 );
509 #ifdef OPTIMIZE_SORTS
510 if (!a->nested) stats.sorts_optimized++;
511 a->sorted=!a->nested;
512 #endif
513 if (!a->sorted) sort_by_start(a);
514 return a;
515 }
516
517 /*
518 * Handles in operation
519 */
in(struct GC_LIST * l,struct GC_LIST * r,int not)520 struct GC_LIST *in(struct GC_LIST *l,struct GC_LIST *r, int not)
521 /* Changed by PK in Febr 95 to capture the semantics of _proper_
522 containment */
523 {
524 struct GC_POINTER lp,rp;
525 struct GC_LIST *a,*r2;
526 struct REGION r_reg,l_reg,r_reg2;
527 char *oper_name;
528 #ifdef PROGRESS_REPORTS
529 int prog;int prog_start;
530 #endif
531
532 #ifdef DEBUG
533 fprintf(stderr,"in called\n");
534 #endif
535 if (not)
536 {
537 stats.not_in++;
538 oper_name="not in";
539 } else
540 {
541 stats.in++;
542 oper_name="in";
543 }
544 a=new_gclist();
545
546 #ifdef OPTIMIZE_SORTS
547 a->nested=l->nested;
548 #endif
549
550 start_region_search(l,&lp);
551 get_region(&lp,&l_reg);
552
553
554 /*
555 * To simplify things we do an outer function on right gc_list
556 */
557 #ifdef OPTIMIZE_SORTS
558 if (r->nested)
559 {
560 #endif
561 r2=outer(r);
562 r=r2;
563 #ifdef OPTIMIZE_SORTS
564 } else r2=NULL;
565 #endif
566 start_region_search(r,&rp);
567
568 #ifdef PROGRESS_REPORTS
569 prog_start=LIST_SIZE(l)+LIST_SIZE(r);
570 prog=0;
571 #endif
572
573 get_region(&rp,&r_reg);
574 SHOW_PROG;
575 while (r_reg.start!=-1 && l_reg.start!=-1)
576 {
577 #ifdef DEBUG
578 fprintf(stderr,"in: left=(%d,%d) right=(%d,%d)\n",
579 l_reg.start,l_reg.end,
580 r_reg.start,r_reg.end);
581 #endif
582 if (l_reg.start<r_reg.start)
583 {
584 /* Left region starts before right -> can't be
585 in right region or any right region that follows
586 current one */
587 if (not) add_region(a,l_reg.start,l_reg.end);
588 get_region(&lp,&l_reg);
589 SHOW_PROG;
590 } else /* l_reg.start>=r_reg.start */
591 {
592 if (l_reg.end<=r_reg.end)
593 {
594 /* left region is in right region */
595 /* Start PK Febr 95 */
596 if (l_reg.start>r_reg.start ||
597 l_reg.end<r_reg.end)
598 { /* inclusion is proper */
599 if (!not) add_region(a,l_reg.start,l_reg.end);
600 get_region(&lp,&l_reg);
601 SHOW_PROG;
602 } else { /* l_reg == r_reg */
603 if (not) add_region(a,l_reg.start,l_reg.end);
604 get_region(&lp,&l_reg);
605 SHOW_PROG;
606 }
607 /* End PK Febr 95 */
608 } else if (l_reg.start==r_reg.start)
609 {
610 /* Regions start from same place. Because
611 no right region after current one can start
612 from same place we can skip left region */
613 if (not) add_region(a,l_reg.start,l_reg.end);
614 get_region(&lp,&l_reg);
615 SHOW_PROG;
616 } else
617 {
618 /* left and right region are overlapping */
619 #ifdef DEBUG
620 fprintf(stderr,"in overlap\n");
621 #endif
622 get_region(&rp,&r_reg2);
623 if (r_reg2.start==-1)
624 {
625 /* All right regions have been scanned */
626 if ( l_reg.start > r_reg.end )
627 {
628 /* Left region end after last right region.
629 We can fall out of loop */
630 SHOW_PROG;
631 r_reg=r_reg2;
632 } else
633 {
634 /* Next left region might still be in right
635 region */
636 if (not) add_region(a,l_reg.start, l_reg.end);
637 get_region(&lp,&l_reg);
638 SHOW_PROG;
639 }
640 } else
641 {
642 /* There are still right regions */
643 if ( l_reg.start >= r_reg2.start )
644 {
645 /* Since left region starts after new right region,
646 * We can safely skip previous right */
647 r_reg=r_reg2;
648 SHOW_PROG;
649 } else
650 {
651 /* Left region is not in previous or next
652 right region */
653 prev_region(&rp,&r_reg2);
654 if (not) add_region(a,l_reg.start, l_reg.end);
655 get_region(&lp,&l_reg);
656 SHOW_PROG;
657 }
658 }
659 }
660 }
661 }
662
663 #ifdef DEBUG
664 fprintf(stderr,"in fall out\n");
665 #endif
666 /* If we have "not in" and right gc_list is empty, we need to copy
667 rest of left list */
668 if (not)
669 {
670 while (l_reg.start!=-1)
671 {
672 add_region(a,l_reg.start,l_reg.end);
673 get_region(&lp,&l_reg);
674 SHOW_PROG;
675 }
676 }
677
678 /* because we created list r2 here, we free it here */
679 if (r2!=NULL) free_gclist(r2);
680 return a;
681 }
682
683 /*
684 * Handles outer function
685 */
outer(struct GC_LIST * gcl)686 struct GC_LIST *outer(struct GC_LIST *gcl)
687 {
688 struct GC_POINTER p;
689 struct REGION reg1,reg2;
690 struct GC_LIST *a;
691 #ifdef PROGRESS_REPORTS
692 char *oper_name;
693 int prog;int prog_start;
694 #endif
695
696 #ifdef PROGRESS_REPORTS
697 oper_name="outer";
698 prog_start=LIST_SIZE(gcl);
699 prog=0;
700 #endif
701
702 stats.outer++;
703 reg2.start=0;
704 a=new_gclist();
705 start_region_search(gcl,&p);
706 get_region(&p,®1);
707
708 /* if we had empty gc list */
709 if (reg1.start==-1) return a;
710 SHOW_PROG;
711
712 /* If there are many regions starting from same place, we choose the
713 longest */
714 get_region(&p,®2);
715 SHOW_PROG;
716 while (reg2.start==reg1.start && reg2.end>reg1.end)
717 {
718 reg1=reg2;
719 get_region(&p,®2);
720 SHOW_PROG;
721 }
722
723 while(reg1.start!=-1 && reg2.start!=-1)
724 {
725 if (reg2.end>reg1.end && reg2.start!=reg1.start)
726 {
727 /* reg2 ends after reg1 -> no nesting */
728 add_region(a,reg1.start,reg1.end);
729 reg1=reg2;
730 }
731 get_region(&p,®2);
732 SHOW_PROG;
733 /* If regions start from same place, nesting is guaranteed */
734 if (reg2.start==reg1.start)
735 {
736 reg1=reg2;
737 get_region(&p,®2);
738 SHOW_PROG;
739 }
740 }
741 add_region(a,reg1.start,reg1.end);
742 return a;
743 }
744
745 /*
746 * Handles inner function
747 */
inner(struct GC_LIST * gcl)748 struct GC_LIST *inner(struct GC_LIST *gcl)
749 {
750 static struct REGION *inner_stack;
751 struct GC_POINTER p;
752 int inq_ind=0;
753 struct GC_LIST *a=NULL;
754 struct REGION n_reg,c_reg;
755 int i;
756 #ifdef PROGRESS_REPORTS
757 char *oper_name;
758 int prog;int prog_start;
759 #endif
760
761 #ifdef DEBUG
762 fprintf(stderr,"inner called\n");
763 #endif
764 stats.inner++;
765 a=new_gclist();
766 if (stats.inner_tablesize==0)
767 {
768 stats.inner_tablesize=
769 DEFAULT_INNER_QUEUE*sizeof(*inner_stack);
770 inner_stack=(struct REGION *) e_malloc(stats.inner_tablesize);
771 }
772
773 #ifdef PROGRESS_REPORTS
774 prog_start=LIST_SIZE(gcl);
775 prog=0;
776 oper_name="inner";
777 #endif
778
779 start_region_search(gcl,&p);
780 get_region(&p,&c_reg);
781 SHOW_PROG;
782 while (c_reg.start!=-1) {
783 get_region(&p,&n_reg);
784 SHOW_PROG;
785 #ifdef ASSERT
786 assert(n_reg.start>=c_reg.start || n_reg.start==-1 );
787 #endif
788 if ( n_reg.start>c_reg.end || n_reg.start==-1 )
789 {
790 /* n_reg and c_reg are separate. Therefore c_reg must
791 be innermost */
792 /* Now we can empty inner_stack */
793 #ifdef DEBUG
794 fprintf(stderr,"empty inner stack (%d regions)\n",inq_ind);
795 #endif
796 for (i=0;i<inq_ind;i++)
797 {
798 #ifdef ASSERT
799 assert(inner_stack[i].start<=c_reg.start);
800 #endif
801 if (inner_stack[i].end<c_reg.end)
802 /* Region in inner_stack was innermost */
803 add_region(a,inner_stack[i].start,
804 inner_stack[i].end);
805 }
806 inq_ind=0;
807 add_region(a,c_reg.start,c_reg.end);
808 } else if ( n_reg.end>c_reg.end )
809 {
810 /* n_reg and c_reg are overlapping. Let's add c_reg
811 to inner_stack */
812 if (inq_ind*sizeof(*inner_stack)==stats.inner_tablesize)
813 {
814 inner_stack=(struct REGION *) e_realloc(inner_stack,
815 stats.inner_tablesize*2,stats.inner_tablesize);
816 stats.inner_tablesize*=2;
817 }
818 inner_stack[inq_ind++]=c_reg;
819 } else {
820 /* if neither of the previous if's was taken,
821 c_reg contains n_reg. We remove regions containing n_reg from
822 inner_stack */
823 while(inq_ind &&
824 n_reg.start>=inner_stack[inq_ind-1].start &&
825 n_reg.end<=inner_stack[inq_ind-1].end )
826 {
827 inq_ind--;
828 }
829 }
830 c_reg=n_reg;
831 #ifdef ASSERT
832 if (inq_ind)
833 assert(c_reg.start<inner_stack[inq_ind-1].start ||
834 c_reg.end>inner_stack[inq_ind-1].end);
835 #endif
836 }
837 return a;
838 }
839
containing(struct GC_LIST * l,struct GC_LIST * r,int not)840 struct GC_LIST *containing(struct GC_LIST *l,struct GC_LIST *r,int not)
841 /* Changed by PK in Febr 95 to capture the semantics of _proper_
842 containment */
843 {
844 struct GC_POINTER lp,rp;
845 struct GC_LIST *a,*r2;
846 struct REGION r_reg,l_reg;
847 #ifdef PROGRESS_REPORTS
848 char *oper_name;
849 int prog;int prog_start;
850 #endif
851
852 #ifdef DEBUG
853 fprintf(stderr,"containing called\n");
854 #endif
855 if (not) stats.not_containing++; else stats.containing++;
856 a=new_gclist();
857 #ifdef OPTIMIZE_SORTS
858 a->nested=l->nested;
859 #endif
860 start_region_search(l,&lp);
861 get_region(&lp,&l_reg);
862
863 /* To simplify things we do an inner function on right gc_list */
864 #ifdef OPTIMIZE_SORTS
865 if (r->nested)
866 {
867 #endif
868 r2=inner(r);
869 r=r2;
870 #ifdef OPTIMIZE_SORTS
871 } else r2=NULL;
872 #endif
873
874 #ifdef PROGRESS_REPORTS
875 oper_name= (not) ? "not containing" : "containing";
876 prog=0;
877 prog_start=LIST_SIZE(l)+LIST_SIZE(r);
878 #endif
879 start_region_search(r,&rp);
880
881 get_region(&rp,&r_reg);
882 SHOW_PROG;
883 while (r_reg.start!=-1 && l_reg.start!=-1)
884 {
885 if ( l_reg.start>r_reg.start )
886 {
887 /* right starts before left */
888 get_region(&rp,&r_reg);
889 SHOW_PROG;
890 } else if ( l_reg.end>=r_reg.end )
891 {
892 /* left contains right */
893 /* Start PK Febr 95 */
894 if (l_reg.start<r_reg.start ||
895 l_reg.end>r_reg.end)
896 { /* Containment is proper */
897 if (!not) add_region(a,l_reg.start,l_reg.end);
898 get_region(&lp,&l_reg);
899 SHOW_PROG;
900 } else { /* l_reg == r_reg */
901 if (not) add_region(a,l_reg.start,l_reg.end);
902 get_region(&lp,&l_reg);
903 SHOW_PROG;
904 }
905 /* End PK Febr 95 */
906 } else {
907 /* left comes after right */
908 if (not) add_region(a,l_reg.start,l_reg.end);
909 get_region(&lp,&l_reg);
910 SHOW_PROG;
911 }
912 }
913 /* When right list ended, there still might be something in left list */
914 while (not && l_reg.start!=-1)
915 {
916 add_region(a,l_reg.start,l_reg.end);
917 get_region(&lp,&l_reg);
918 SHOW_PROG;
919 }
920 /* because we created list r2 here, we free it here */
921 if (r2!=NULL) free_gclist(r2);
922 return a;
923 }
924
equal(struct GC_LIST * l,struct GC_LIST * r,int not)925 struct GC_LIST *equal(struct GC_LIST *l,struct GC_LIST *r,int not)
926 /* Intersection of GC_LISTs *l and *r */
927 /* PK Febr '95 */
928 {
929 struct GC_POINTER lp,rp;
930 struct GC_LIST *a;
931 struct REGION r_reg,l_reg;
932 #ifdef PROGRESS_REPORTS
933 char *oper_name;
934 int prog;int prog_start;
935 #endif
936
937 #ifdef DEBUG
938 fprintf(stderr,"equal called\n");
939 #endif
940 if (not) stats.not_equal++; else stats.equal++;
941 a=new_gclist();
942 #ifdef OPTIMIZE_SORTS
943 a->nested=l->nested;
944 #endif
945 start_region_search(l,&lp);
946 get_region(&lp,&l_reg);
947
948
949 #ifdef PROGRESS_REPORTS
950 oper_name= (not) ? "not equal" : "equal";
951 prog=0;
952 prog_start=LIST_SIZE(l)+LIST_SIZE(r);
953 #endif
954 start_region_search(r,&rp);
955 get_region(&rp,&r_reg);
956
957 SHOW_PROG;
958
959 while (r_reg.start!=-1 && l_reg.start!=-1)
960 {
961 if ( l_reg.start<r_reg.start )
962 {
963 if (not) add_region(a,l_reg.start,l_reg.end);
964 get_region(&lp,&l_reg);
965 SHOW_PROG;
966 } else if ( r_reg.start<l_reg.start )
967 {
968 get_region(&rp,&r_reg);
969 SHOW_PROG;
970 } else /* r_reg.start=l_reg.start */
971 if ( l_reg.end<r_reg.end )
972 {
973 if (not) add_region(a,l_reg.start,l_reg.end);
974 get_region(&lp,&l_reg);
975 SHOW_PROG;
976 } else if ( r_reg.end<l_reg.end )
977 {
978 get_region(&rp,&r_reg);
979 SHOW_PROG;
980 } else /* l_reg = r_reg */
981 {
982 if (!not) add_region(a,l_reg.start,l_reg.end);
983 get_region(&rp,&r_reg);
984 get_region(&lp,&l_reg);
985 SHOW_PROG;
986 }
987 }
988 /* When right list ended, there still might be something in left list */
989 while (not && l_reg.start!=-1)
990 {
991 add_region(a,l_reg.start,l_reg.end);
992 get_region(&lp,&l_reg);
993 SHOW_PROG;
994 }
995 return a;
996
997 } /* END equal(struct GC_LIST *l,struct GC_LIST *r,int not) */
998
999 /*
1000 * Here we implement concat operation, which concats all overlapping regions
1001 * into one region and removes all nestings
1002 */
concat(struct GC_LIST * l)1003 struct GC_LIST *concat(struct GC_LIST *l)
1004 {
1005 struct GC_POINTER lp;
1006 struct GC_LIST *a;
1007 struct REGION reg1,reg2;
1008 #ifdef PROGRESS_REPORTS
1009 char *oper_name;
1010 int prog;int prog_start;
1011 #endif
1012
1013 #ifdef DEBUG
1014 fprintf(stderr,"concat called\n");
1015 #endif
1016 stats.concat++;
1017 a=new_gclist();
1018 start_region_search(l,&lp);
1019 get_region(&lp,®1);
1020
1021 /* We had empty list */
1022 if (reg1.start==-1) return a;
1023
1024 #ifdef PROGRESS_REPORTS
1025 oper_name="concat";
1026 prog=0;
1027 prog_start=LIST_SIZE(l);
1028 #endif
1029 SHOW_PROG;
1030 get_region(&lp,®2);
1031 SHOW_PROG;
1032
1033 while (reg2.start!=-1)
1034 {
1035 if (reg2.start>reg1.end+1)
1036 {
1037 /* separate regions, no concat */
1038 add_region(a,reg1.start,reg1.end);
1039 reg1=reg2;
1040 } else if ( reg2.end>reg1.end )
1041 {
1042 /* We found overlapping */
1043 reg1.end=reg2.end;
1044 }
1045 get_region(&lp,®2);
1046 SHOW_PROG;
1047 }
1048 add_region(a,reg1.start,reg1.end);
1049 return a;
1050 }
1051
1052 /*
1053 * Here we implement extracting operation
1054 */
extracting(struct GC_LIST * l,struct GC_LIST * r)1055 struct GC_LIST *extracting(struct GC_LIST *l,struct GC_LIST *r)
1056 {
1057 struct GC_POINTER lp,rp,tmpp;
1058 struct GC_LIST *a,*r2,*tmp,*new_tmp;
1059 struct REGION l_reg,r_reg;
1060 int prev_s=-1;
1061 int prev_e=-1;
1062 int last_tmp;
1063 int must_be_sorted;
1064 #ifdef PROGRESS_REPORTS
1065 char *oper_name;
1066 int prog;int prog_start;
1067 #endif
1068
1069 #ifdef DEBUG
1070 fprintf(stderr,"extracting called\n");
1071 #endif
1072 stats.extracting++;
1073 /* to simplify things we do concat on right gc_list. Result stays
1074 the same anyway */
1075 r2=concat(r);
1076 r=r2;
1077
1078 a=new_gclist();
1079 #ifdef OPTIMIZE_SORTS
1080 a->nested=l->nested;
1081 #endif
1082 tmp=new_gclist();
1083 start_region_search(tmp,&tmpp);
1084
1085 #ifdef PROGRESS_REPORTS
1086 oper_name="extracting";
1087 prog=0;
1088 prog_start=LIST_SIZE(l);
1089 #endif
1090
1091 start_region_search(l,&lp);
1092 get_region(&lp,&l_reg);
1093 SHOW_PROG;
1094 start_region_search(r,&rp);
1095 get_region(&rp,&r_reg);
1096
1097 while ( l_reg.start!=-1 )
1098 {
1099 if ( l_reg.end<r_reg.start || r_reg.start==-1 )
1100 {
1101 /* Regions are separate, left starting first.
1102 no cutting */
1103 if ( prev_s!=l_reg.start || prev_e!=l_reg.end )
1104 {
1105 #ifdef DEBUG
1106 fprintf(stderr,"extracting adding 1(%d,%d)\n",l_reg.start,l_reg.end);
1107 #endif
1108 prev_s=l_reg.start;
1109 prev_e=l_reg.end;
1110 add_region(a,l_reg.start,l_reg.end);
1111 }
1112 l_reg=first_of(&lp,&tmpp);
1113 SHOW_PROG;
1114 } else if ( r_reg.end<l_reg.start )
1115 {
1116 /* Regions are separate right starting first.
1117 we skip right */
1118 #ifdef DEBUG
1119 fprintf(stderr,"skipping right, left=(%d,%d) right=(%d,%d)\n",
1120 l_reg.start,l_reg.end,r_reg.start,r_reg.end);
1121 #endif
1122 get_region(&rp,&r_reg);
1123 } else
1124 {
1125 /* We need to do clipping. */
1126 new_tmp=new_gclist();
1127 new_tmp->sorted=FALSE;
1128 must_be_sorted=FALSE;
1129 last_tmp=-1;
1130 #ifdef DEBUG
1131 fprintf(stderr,"cutting loop, cutter (%d,%d)\n",r_reg.start,r_reg.end);
1132 #endif
1133 while ( l_reg.start!=-1 && l_reg.start<=r_reg.end )
1134 {
1135 if (l_reg.start<r_reg.start
1136 && ( prev_s!=l_reg.start ||
1137 prev_e!=r_reg.start-1 )
1138 )
1139 {
1140 prev_s=l_reg.start;
1141 prev_e=r_reg.start-1;
1142
1143 #ifdef DEBUG
1144 fprintf(stderr,"extracting adding 2(%d,%d)\n",l_reg.start,r_reg.start-1);
1145 #endif
1146 add_region(a,l_reg.start,r_reg.start-1);
1147 }
1148 if (r_reg.end<l_reg.end)
1149 {
1150 #ifdef DEBUG
1151 fprintf(stderr,"(%d,%d)<-new_tmp\n",r_reg.end+1,l_reg.end);
1152 #endif
1153 add_region(new_tmp,
1154 r_reg.end+1,l_reg.end);
1155 if (l_reg.end<last_tmp)
1156 {
1157 must_be_sorted=TRUE;
1158 } else last_tmp=l_reg.end;
1159 }
1160 l_reg=first_of(&lp,&tmpp);
1161 SHOW_PROG;
1162 }
1163 if (l_reg.start!=-1) prev_region(&lp,&l_reg);
1164
1165 #ifdef ASSERT
1166 assert (tmpp.ind==tmp->length &&
1167 tmpp.node->next==NULL);
1168 #endif
1169 free_gclist(tmp);
1170 if (must_be_sorted)
1171 sort_by_start(new_tmp);
1172 #ifdef PROGRESS_REPORTS
1173 prog--;
1174 prog_start+=LIST_SIZE(new_tmp);
1175 #endif
1176 tmp=new_tmp;
1177 start_region_search(tmp,&tmpp);
1178 /* Left region is now handled -> skip to next */
1179 l_reg=first_of(&lp,&tmpp);
1180 SHOW_PROG;
1181 }
1182 }
1183 free_gclist(r2);
1184 free_gclist(tmp);
1185 return a;
1186 }
1187
1188 /*
1189 * Join operation
1190 */
join(struct GC_LIST * l,int number)1191 struct GC_LIST *join(struct GC_LIST *l,int number)
1192 {
1193 struct GC_LIST *a;
1194 struct GC_POINTER p1,p2;
1195 struct REGION r1,r2,prev_r1,prev_r2;
1196 int i;
1197 #ifdef PROGRESS_REPORTS
1198 char *oper_name;
1199 int prog;int prog_start;
1200 #endif
1201
1202 #ifdef DEBUG
1203 fprintf(stderr,"join called %d\n",number);
1204 #endif
1205 #ifdef ASSERT
1206 assert(number>0);
1207 #endif
1208 stats.join++;
1209 a=new_gclist();
1210 #ifdef OPTIMIZE_SORTS
1211 a->nested=l->nested;
1212 #endif
1213 if ( l->first==NULL )
1214 {
1215 /* This is an optimized chars node */
1216 to_chars(a,(l->chars+1)*number);
1217 return a;
1218 }
1219
1220 /* List is smaller than join number, so return list is empty */
1221 if (LIST_SIZE(l)<number) return a;
1222
1223 #ifdef PROGRESS_REPORTS
1224 oper_name="join";
1225 prog=0;
1226 prog_start=LIST_SIZE(l);
1227 #endif
1228
1229 start_region_search(l,&p1);
1230 start_region_search(l,&p2);
1231
1232 prev_r2.start=-1;
1233 prev_r1.end=-1;
1234 for (i=number;i>0;i--)
1235 {
1236 get_region(&p1,&r1);
1237 SHOW_PROG;
1238 #ifdef ASSERT
1239 assert(r1.start!=-1);
1240 #endif
1241 }
1242 while (r1.start!=-1)
1243 {
1244 get_region(&p2,&r2);
1245 if (r2.start==prev_r2.start)
1246 {
1247 /*PK if (prev.end<r2.end) */
1248
1249 if (r1.end<=prev_r1.end)
1250 {
1251 a->sorted=FALSE;
1252 }
1253 }
1254 add_region(a,r2.start,r1.end);
1255 prev_r1=r1;
1256 get_region(&p1,&r1);
1257 SHOW_PROG;
1258 prev_r2=r2;
1259 }
1260 if (!a->sorted)
1261 {
1262 sort_by_start(a); /* This may leave duplicates in a */
1263 remove_duplicates(a); /* Here they are removed */
1264 };
1265 return a;
1266 }
1267
1268 /*
1269 * Handles ordering which does _not_ produce nesting gc-lists.
1270 * For example '"--" quote "--"' to catch SGML comments.
1271 */
quote(struct GC_LIST * l,struct GC_LIST * r,int type)1272 struct GC_LIST *quote(struct GC_LIST *l,struct GC_LIST *r,int type)
1273 {
1274 struct GC_POINTER lp,rp;
1275 struct GC_LIST *a;
1276 struct REGION r_reg,l_reg;
1277 #ifdef PROGRESS_REPORTS
1278 char *oper_name;
1279 int prog;int prog_start;
1280 #endif
1281
1282 #ifdef DEBUG
1283 fprintf(stderr,"quote called\n");
1284 #endif
1285 stats.quote++;
1286 a=new_gclist();
1287 a->sorted=TRUE;
1288 a->nested=FALSE;
1289
1290 #ifdef PROGRESS_REPORTS
1291 prog_start=LIST_SIZE(r);
1292 prog=0;
1293 switch (type) {
1294 case L_QUOTE:
1295 oper_name="_quote";
1296 break;
1297 case R_QUOTE:
1298 oper_name="quote_";
1299 break;
1300 case LR_QUOTE:
1301 oper_name="_quote_";
1302 break;
1303 default:
1304 oper_name="quote";
1305 }
1306 #endif
1307 start_region_search(r,&rp);
1308 start_region_search(l,&lp);
1309 get_region(&lp,&l_reg);
1310 get_region(&rp,&r_reg);
1311 SHOW_PROG;
1312
1313 /* If left or right region list was empty, we can return empty list */
1314 if (l_reg.start==-1 || r_reg.start==-1) return a;
1315 do {
1316 /* Skip until we find ending quote after start quote */
1317 while (l_reg.end>=r_reg.start && r_reg.start!=-1) {
1318 get_region(&rp,&r_reg);
1319 SHOW_PROG;
1320 }
1321 if (r_reg.start>=0) {
1322 /* Add region using operation type */
1323 switch (type) {
1324 case QUOTE:
1325 add_region(a,l_reg.start, r_reg.end);
1326 break;
1327 case L_QUOTE:
1328 add_region(a,l_reg.end+1, r_reg.end);
1329 break;
1330 case R_QUOTE:
1331 add_region(a,l_reg.start, r_reg.start-1);
1332 break;
1333 case LR_QUOTE:
1334 /* No empty regions */
1335 if (l_reg.end+1<r_reg.start) {
1336 add_region(a,l_reg.end+1,r_reg.start-1);
1337 }
1338 break;
1339 default:
1340 fprintf(stderr,"quote: invalid oper type\n");
1341 exit(2);
1342 /* Skip until starting quote is after last ending
1343 quote */
1344 }
1345 while (l_reg.start<=r_reg.end && l_reg.start!=-1) {
1346 get_region(&lp,&l_reg);
1347 SHOW_PROG;
1348 }
1349 }
1350 } while ( l_reg.start!=-1 && r_reg.start!=-1);
1351 return a;
1352 }
1353
1354