1 /*
2   Copyright (c) 2003 by Stefan Kurtz and The Institute for
3   Genomic Research.  This is OSI Certified Open Source Software.
4   Please see the file LICENSE for licensing information and
5   the file ACKNOWLEDGEMENTS for names of contributors to the
6   code base.
7 */
8 
9 #include "debugdef.h"
10 #include "streedef.h"
11 #include "streeacc.h"
12 #include "protodef.h"
13 
lcp(SYMBOL * start1,SYMBOL * end1,SYMBOL * start2,SYMBOL * end2)14 static Uint lcp(SYMBOL *start1,SYMBOL *end1,SYMBOL *start2,SYMBOL *end2)
15 {
16   register SYMBOL *ptr1 = start1,
17                   *ptr2 = start2;
18 
19   while(ptr1 <= end1 &&
20         ptr2 <= end2 &&
21         *ptr1 == *ptr2)
22   {
23     ptr1++;
24     ptr2++;
25   }
26   return (Uint) (ptr1-start1);
27 }
28 
scanprefixfromnodestree(Suffixtree * stree,Location * loc,Bref btptr,SYMBOL * left,SYMBOL * right,Uint rescanlength)29 /*@null@*/ SYMBOL *scanprefixfromnodestree(Suffixtree *stree,Location *loc,
30                                            Bref btptr,SYMBOL *left,
31                                            SYMBOL *right,Uint rescanlength)
32 {
33   Uint *nodeptr = NULL, *largeptr = NULL, leafindex, nodedepth,
34        node, distance = 0, prefixlen, headposition, tmpnodedepth,
35        edgelen, remainingtoskip;
36   SYMBOL *lptr, *leftborder = (SYMBOL *) NULL, firstchar, edgechar = 0;
37 
38   DEBUG1(4,"scanprefixfromnodestree starts at node %lu\n",
39           (Showuint) BRADDR2NUM(stree,btptr));
40   lptr = left;
41   nodeptr = btptr;
42   if(nodeptr == stree->branchtab)
43   {
44     nodedepth = 0;
45     headposition = 0;
46   } else
47   {
48     GETBOTH(nodedepth,headposition,nodeptr);
49   }
50   loc->nextnode.toleaf = False;
51   loc->nextnode.address = nodeptr;
52   loc->locstring.start = headposition;
53   loc->locstring.length = nodedepth;
54   loc->remain = 0;
55   if(rescanlength <= nodedepth)
56   {
57     remainingtoskip = 0;
58   } else
59   {
60     remainingtoskip = rescanlength - nodedepth;
61   }
62   while(True)
63   {
64     if(lptr > right)   // check for empty word
65     {
66       return NULL;
67     }
68     firstchar = *lptr;
69     if(nodeptr == stree->branchtab)  // at the root
70     {
71       if((node = stree->rootchildren[(Uint) firstchar]) == UNDEFINEDREFERENCE)
72       {
73         return lptr;
74       }
75       if(ISLEAF(node))
76       {
77         leafindex = GETLEAFINDEX(node);
78         loc->firstptr = stree->text + leafindex;
79         if(remainingtoskip > 0)
80         {
81           prefixlen = remainingtoskip +
82                       lcp(lptr+remainingtoskip,right,
83                           loc->firstptr+remainingtoskip,stree->sentinel-1);
84         } else
85         {
86           prefixlen = 1 + lcp(lptr+1,right,
87                               loc->firstptr+1,stree->sentinel-1);
88         }
89         loc->previousnode = stree->branchtab;
90         loc->edgelen = stree->textlen - leafindex + 1;
91         loc->remain = loc->edgelen - prefixlen;
92         loc->nextnode.toleaf = True;
93         loc->nextnode.address = stree->leaftab + leafindex;
94         loc->locstring.start = leafindex;
95         loc->locstring.length = prefixlen;
96         if(prefixlen == (Uint) (right - lptr + 1))
97         {
98           return NULL;
99         }
100         return lptr + prefixlen;
101       }
102       nodeptr = stree->branchtab + GETBRANCHINDEX(node);
103       GETONLYHEADPOS(headposition,nodeptr);
104       leftborder = stree->text + headposition;
105     } else
106     {
107       node = GETCHILD(nodeptr);
108       while(True)
109       {
110         if(NILPTR(node))
111         {
112           return lptr;
113         }
114         if(ISLEAF(node))
115         {
116           leafindex = GETLEAFINDEX(node);
117           leftborder = stree->text + (nodedepth + leafindex);
118           if(leftborder == stree->sentinel)
119           {
120             return lptr;
121           }
122           edgechar = *leftborder;
123           if(edgechar > firstchar)
124           {
125             return lptr;
126           }
127           if(edgechar == firstchar)
128           {
129             if(remainingtoskip > 0)
130             {
131               prefixlen = remainingtoskip +
132                           lcp(lptr+remainingtoskip,right,
133                               leftborder+remainingtoskip,stree->sentinel-1);
134             } else
135             {
136               prefixlen = 1 + lcp(lptr+1,right,
137                                   leftborder+1,stree->sentinel-1);
138             }
139             loc->firstptr = leftborder;
140             loc->previousnode = loc->nextnode.address;
141             loc->edgelen = stree->textlen - (nodedepth + leafindex) + 1;
142             loc->remain = loc->edgelen - prefixlen;
143             loc->nextnode.toleaf = True;
144             loc->nextnode.address = stree->leaftab + leafindex;
145             loc->locstring.start = leafindex;
146             loc->locstring.length = nodedepth + prefixlen;
147             if(prefixlen == (Uint) (right - lptr + 1))
148             {
149               return NULL;
150             }
151             return lptr + prefixlen;
152           }
153           node = LEAFBROTHERVAL(stree->leaftab[leafindex]);
154         } else
155         {
156           nodeptr = stree->branchtab + GETBRANCHINDEX(node);
157           GETONLYHEADPOS(headposition,nodeptr);
158           leftborder = stree->text + (nodedepth + headposition);
159           edgechar = *leftborder;
160           if (edgechar > firstchar)
161           {
162             return lptr;
163           }
164           if(edgechar == firstchar)
165           {
166             /*@innerbreak@*/ break;
167           }
168           node = GETBROTHER(nodeptr);
169         }
170       }
171     }
172     GETONLYDEPTH(tmpnodedepth,nodeptr);
173     edgelen = tmpnodedepth - nodedepth;
174     if(remainingtoskip > 0)
175     {
176       if(remainingtoskip >= edgelen)
177       {
178         prefixlen = edgelen;
179         remainingtoskip -= prefixlen;
180       } else
181       {
182         NOTSUPPOSEDTOBENULL(leftborder);
183         prefixlen = remainingtoskip +
184                     lcp(lptr+remainingtoskip,right,
185                         leftborder+remainingtoskip,leftborder+edgelen-1);
186         remainingtoskip = 0;
187       }
188     } else
189     {
190       NOTSUPPOSEDTOBENULL(leftborder);
191       prefixlen = 1 + lcp(lptr+1,right,
192                           leftborder+1,leftborder+edgelen-1);
193     }
194     loc->nextnode.toleaf = False;
195     loc->locstring.start = headposition;
196     loc->locstring.length = nodedepth + prefixlen;
197     if(prefixlen == edgelen)
198     {
199       lptr += edgelen;
200       nodedepth += edgelen;
201       loc->nextnode.address = nodeptr;
202       loc->remain = 0;
203     } else
204     {
205       loc->firstptr = leftborder;
206       loc->previousnode = loc->nextnode.address;
207       loc->nextnode.address = nodeptr;
208       loc->edgelen = edgelen;
209       loc->remain = loc->edgelen - prefixlen;
210       if(prefixlen == (Uint) (right - lptr + 1))
211       {
212         return NULL;
213       }
214       return lptr + prefixlen;
215     }
216   }
217 }
218 
scanprefixstree(Suffixtree * stree,Location * outloc,Location * inloc,SYMBOL * left,SYMBOL * right,Uint rescanlength)219 /*@null@*/ SYMBOL *scanprefixstree(Suffixtree *stree,Location *outloc,
220                                    Location *inloc,SYMBOL *left,
221                                    SYMBOL *right,Uint rescanlength)
222 {
223   Uint prefixlen, remainingtoskip;
224 
225   DEBUG0(4,"scanprefixstree starts at location ");
226   DEBUGCODE(4,showlocation(stdout,stree,inloc));
227   DEBUG0(4,"\n");
228   if(inloc->remain == 0)
229   {
230     return scanprefixfromnodestree(stree,outloc,inloc->nextnode.address,
231                                    left,right,rescanlength);
232   }
233   if(rescanlength <= inloc->locstring.length)
234   {
235     remainingtoskip = 0;
236   } else
237   {
238     remainingtoskip = rescanlength - inloc->locstring.length;
239   }
240   if(inloc->nextnode.toleaf)
241   {
242 
243     if(remainingtoskip > 0)
244     {
245       prefixlen = remainingtoskip +
246                   lcp(left+remainingtoskip,right,
247                       inloc->firstptr+(inloc->edgelen-inloc->remain)
248                                      +remainingtoskip,
249                       stree->sentinel-1);
250     } else
251     {
252       prefixlen = lcp(left,right,
253                       inloc->firstptr+(inloc->edgelen-inloc->remain),
254                       stree->sentinel-1);
255     }
256     outloc->firstptr = inloc->firstptr;
257     outloc->edgelen = inloc->edgelen;
258     outloc->remain = inloc->remain - prefixlen;
259     outloc->previousnode = inloc->previousnode;
260     outloc->nextnode.toleaf = True;
261     outloc->nextnode.address = inloc->nextnode.address;
262     outloc->locstring.start = LEAFADDR2NUM(stree,inloc->nextnode.address);
263     outloc->locstring.length = inloc->locstring.length + prefixlen;
264     return left + prefixlen;
265   }
266   if(remainingtoskip > 0)
267   {
268     if(remainingtoskip >= inloc->remain)
269     {
270       prefixlen = inloc->remain;
271     } else
272     {
273       prefixlen = remainingtoskip +
274                   lcp(left+remainingtoskip,right,
275                       inloc->firstptr+(inloc->edgelen-inloc->remain)
276                                      +remainingtoskip,
277                       inloc->firstptr+inloc->edgelen-1);
278     }
279   } else
280   {
281     prefixlen = lcp(left,right,
282                     inloc->firstptr+(inloc->edgelen-inloc->remain),
283                     inloc->firstptr+inloc->edgelen-1);
284   }
285   if(prefixlen < inloc->remain)
286   {
287     outloc->firstptr = inloc->firstptr;
288     outloc->edgelen = inloc->edgelen;
289     outloc->remain = inloc->remain - prefixlen;
290     outloc->previousnode = inloc->previousnode;
291     outloc->nextnode.toleaf = False;
292     outloc->nextnode.address = inloc->nextnode.address;
293     outloc->locstring.start = inloc->locstring.start;
294     outloc->locstring.length = inloc->locstring.length + prefixlen;
295     return left + prefixlen;
296   }
297   return scanprefixfromnodestree(stree,outloc,inloc->nextnode.address,
298                                    left+prefixlen,right,rescanlength);
299 }
300 
findprefixpathfromnodestree(Suffixtree * stree,ArrayPathinfo * path,Location * loc,Bref btptr,SYMBOL * left,SYMBOL * right,Uint rescanlength)301 /*@null@*/SYMBOL *findprefixpathfromnodestree(Suffixtree *stree,
302                                               ArrayPathinfo *path,
303                                               Location *loc,
304                                               Bref btptr,
305                                               SYMBOL *left,
306                                               SYMBOL *right,
307                                               Uint rescanlength)
308 {
309   Uint *nodeptr = NULL, *largeptr = NULL, leafindex, nodedepth,
310        edgelen, node, distance = 0, prefixlen, headposition,
311        remainingtoskip, tmpnodedepth;
312   SYMBOL *leftborder = (SYMBOL *) NULL, *lptr, firstchar, edgechar = 0;
313 
314   lptr = left;
315   nodeptr = btptr;
316   if(nodeptr == stree->branchtab)
317   {
318     nodedepth = 0;
319     headposition = 0;
320   } else
321   {
322     GETBOTH(nodedepth,headposition,nodeptr);
323   }
324   loc->nextnode.toleaf = False;
325   loc->nextnode.address = nodeptr;
326   loc->locstring.start = headposition;
327   loc->locstring.length = nodedepth;
328   loc->remain = 0;
329   if(rescanlength <= nodedepth)
330   {
331     remainingtoskip = 0;
332   } else
333   {
334     remainingtoskip = rescanlength - nodedepth;
335   }
336   while(True)
337   {
338     if(lptr > right)   // check for empty word
339     {
340       return NULL;
341     }
342     firstchar = *lptr;
343     if(nodeptr == stree->branchtab)  // at the root
344     {
345       if((node = stree->rootchildren[(Uint) firstchar]) == UNDEFINEDREFERENCE)
346       {
347         return lptr;
348       }
349       if(ISLEAF(node))
350       {
351         leafindex = GETLEAFINDEX(node);
352         loc->firstptr = stree->text + leafindex;
353         if(remainingtoskip > 0)
354         {
355           prefixlen = remainingtoskip +
356                       lcp(lptr+remainingtoskip,right,
357                           loc->firstptr+remainingtoskip,stree->sentinel-1);
358         } else
359         {
360           prefixlen = 1 + lcp(lptr+1,right,
361                               loc->firstptr+1,stree->sentinel-1);
362         }
363         loc->previousnode = stree->branchtab;
364         loc->edgelen = stree->textlen - leafindex + 1;
365         loc->remain = loc->edgelen - prefixlen;
366         loc->nextnode.toleaf = True;
367         loc->nextnode.address = stree->leaftab + leafindex;
368         loc->locstring.start = leafindex;
369         loc->locstring.length = prefixlen;
370         if(prefixlen == (Uint) (right - lptr + 1))
371         {
372           return NULL;
373         }
374         return lptr + prefixlen;
375       }
376       nodeptr = stree->branchtab + GETBRANCHINDEX(node);
377       GETONLYHEADPOS(headposition,nodeptr);
378       leftborder = stree->text + headposition;
379     } else
380     {
381       node = GETCHILD(nodeptr);
382       while(True)
383       {
384         if(NILPTR(node))
385         {
386           return lptr;
387         }
388         if(ISLEAF(node))
389         {
390           leafindex = GETLEAFINDEX(node);
391           leftborder = stree->text + (nodedepth + leafindex);
392           if(leftborder == stree->sentinel)
393           {
394             return lptr;
395           }
396           edgechar = *leftborder;
397           if(edgechar > firstchar)
398           {
399             return lptr;
400           }
401           if(edgechar == firstchar)
402           {
403             if(remainingtoskip > 0)
404             {
405               prefixlen = remainingtoskip +
406                           lcp(lptr+remainingtoskip,right,
407                               leftborder+remainingtoskip,stree->sentinel-1);
408             } else
409             {
410               prefixlen = 1 + lcp(lptr+1,right,
411                                   leftborder+1,stree->sentinel-1);
412             }
413             loc->firstptr = leftborder;
414             loc->previousnode = loc->nextnode.address;
415             loc->edgelen = stree->textlen - (nodedepth + leafindex) + 1;
416             loc->remain = loc->edgelen - prefixlen;
417             loc->nextnode.toleaf = True;
418             loc->nextnode.address = stree->leaftab + leafindex;
419             loc->locstring.start = leafindex;
420             loc->locstring.length = nodedepth + prefixlen;
421             if(prefixlen == (Uint) (right - lptr + 1))
422             {
423               return NULL;
424             }
425             return lptr + prefixlen;
426           }
427           node = LEAFBROTHERVAL(stree->leaftab[leafindex]);
428         } else
429         {
430           nodeptr = stree->branchtab + GETBRANCHINDEX(node);
431           GETONLYHEADPOS(headposition,nodeptr);
432           leftborder = stree->text + (nodedepth + headposition);
433           edgechar = *leftborder;
434           if (edgechar > firstchar)
435           {
436             return lptr;
437           }
438           if(edgechar == firstchar)
439           {
440             /*@innerbreak@*/ break;
441           }
442           node = GETBROTHER(nodeptr);
443         }
444       }
445     }
446     GETONLYDEPTH(tmpnodedepth,nodeptr);
447     edgelen = tmpnodedepth - nodedepth;
448     if(remainingtoskip > 0)
449     {
450       if(remainingtoskip >= edgelen)
451       {
452         prefixlen = edgelen;
453         remainingtoskip -= prefixlen;
454       } else
455       {
456         NOTSUPPOSEDTOBENULL(leftborder);
457         prefixlen = remainingtoskip +
458                     lcp(lptr+remainingtoskip,right,
459                         leftborder+remainingtoskip,leftborder+edgelen-1);
460         remainingtoskip = 0;
461       }
462     } else
463     {
464       NOTSUPPOSEDTOBENULL(leftborder);
465       prefixlen = 1 + lcp(lptr+1,right,
466                           leftborder+1,leftborder+edgelen-1);
467     }
468     loc->nextnode.toleaf = False;
469     loc->locstring.start = headposition;
470     loc->locstring.length = nodedepth + prefixlen;
471     if(prefixlen == edgelen)
472     {
473       lptr += edgelen;
474       nodedepth += edgelen;
475       loc->nextnode.address = nodeptr;
476       loc->remain = 0;
477     } else
478     {
479       loc->firstptr = leftborder;
480       loc->previousnode = loc->nextnode.address;
481       loc->nextnode.address = nodeptr;
482       loc->edgelen = edgelen;
483       loc->remain = loc->edgelen - prefixlen;
484       if(prefixlen == (Uint) (right - lptr + 1))
485       {
486         return NULL;
487       }
488       return lptr + prefixlen;
489     }
490     CHECKARRAYSPACE(path,Pathinfo,128);
491     path->spacePathinfo[path->nextfreePathinfo].ref = nodeptr;
492     path->spacePathinfo[path->nextfreePathinfo].depth = tmpnodedepth;
493     path->spacePathinfo[path->nextfreePathinfo].headposition = headposition;
494     path->nextfreePathinfo++;
495   }
496 }
497 
findprefixpathstree(Suffixtree * stree,ArrayPathinfo * path,Location * outloc,Location * inloc,SYMBOL * left,SYMBOL * right,Uint rescanlength)498 /*@null@*/ SYMBOL *findprefixpathstree(Suffixtree *stree,
499                                        ArrayPathinfo *path,
500                                        Location *outloc,
501                                        Location *inloc,
502                                        SYMBOL *left,
503                                        SYMBOL *right,
504                                        Uint rescanlength)
505 {
506   Uint prefixlen, remainingtoskip;
507 
508   DEBUG0(4,"findprefixpathstree starts at location ");
509   DEBUGCODE(4,showlocation(stdout,stree,inloc));
510   DEBUG0(4,"\n");
511   if(inloc->remain == 0)
512   {
513     CHECKARRAYSPACE(path,Pathinfo,128);
514     path->spacePathinfo[path->nextfreePathinfo].ref
515       = inloc->nextnode.address;
516     path->spacePathinfo[path->nextfreePathinfo].depth
517       = inloc->locstring.length;
518     path->spacePathinfo[path->nextfreePathinfo].headposition
519       = inloc->locstring.start;
520     path->nextfreePathinfo++;
521     return findprefixpathfromnodestree(stree,path,outloc,
522                                        inloc->nextnode.address,
523                                        left,right,rescanlength);
524   }
525   if(rescanlength <= inloc->locstring.length)
526   {
527     remainingtoskip = 0;
528   } else
529   {
530     remainingtoskip = rescanlength - inloc->locstring.length;
531   }
532   if(inloc->nextnode.toleaf)
533   {
534     if(remainingtoskip > 0)
535     {
536       prefixlen = remainingtoskip +
537                   lcp(left+remainingtoskip,right,
538                       inloc->firstptr+(inloc->edgelen-inloc->remain)
539                                      +remainingtoskip,
540                       stree->sentinel-1);
541     } else
542     {
543       prefixlen = lcp(left,right,
544                       inloc->firstptr+(inloc->edgelen-inloc->remain),
545                       stree->sentinel-1);
546     }
547     outloc->firstptr = inloc->firstptr;
548     outloc->edgelen = inloc->edgelen;
549     outloc->remain = inloc->remain - prefixlen;
550     outloc->previousnode = inloc->previousnode;
551     outloc->nextnode.toleaf = True;
552     outloc->nextnode.address = inloc->nextnode.address;
553     outloc->locstring.start = LEAFADDR2NUM(stree,inloc->nextnode.address);
554     outloc->locstring.length = inloc->locstring.length + prefixlen;
555     return left + prefixlen;
556   }
557   if(remainingtoskip > 0)
558   {
559     if(remainingtoskip >= inloc->remain)
560     {
561       prefixlen = inloc->remain;
562     } else
563     {
564       prefixlen = remainingtoskip +
565                   lcp(left+remainingtoskip,right,
566                       inloc->firstptr+(inloc->edgelen-inloc->remain)
567                                      +remainingtoskip,
568                       inloc->firstptr+inloc->edgelen-1);
569     }
570   } else
571   {
572     prefixlen = lcp(left,right,
573                     inloc->firstptr+(inloc->edgelen-inloc->remain),
574                     inloc->firstptr+inloc->edgelen-1);
575   }
576   if(prefixlen < inloc->remain)
577   {
578     outloc->firstptr = inloc->firstptr;
579     outloc->edgelen = inloc->edgelen;
580     outloc->remain = inloc->remain - prefixlen;
581     outloc->previousnode = inloc->previousnode;
582     outloc->nextnode.toleaf = False;
583     outloc->nextnode.address = inloc->nextnode.address;
584     outloc->locstring.start = inloc->locstring.start;
585     outloc->locstring.length = inloc->locstring.length + prefixlen;
586     return left + prefixlen;
587   }
588   CHECKARRAYSPACE(path,Pathinfo,128);
589   path->spacePathinfo[path->nextfreePathinfo].ref = inloc->nextnode.address;
590   path->spacePathinfo[path->nextfreePathinfo].depth
591     = inloc->locstring.length + prefixlen;
592   path->spacePathinfo[path->nextfreePathinfo].headposition
593       = inloc->locstring.start;
594   path->nextfreePathinfo++;
595   return findprefixpathfromnodestree(stree,path,outloc,
596                                      inloc->nextnode.address,
597                                      left+prefixlen,right,rescanlength);
598 }
599