1 /*
2   Copyright (c) 2003 by Stefan Kurtz and The Institute for
3   Genomic Research.  This is OSI Certified Open Source Software.
4   Please see the file LICENSE for licensing information and
5   the file ACKNOWLEDGEMENTS for names of contributors to the
6   code base.
7 */
8 
9 //}
10 
11 //\FILEINFO{construct.c}{Stefan Kurtz}{November 1999}
12 
13 //\Ignore{
14 
15 #include "intbits.h"
16 #include "spacedef.h"
17 #include "megabytes.h"
18 #include "debugdef.h"
19 #include "streedef.h"
20 #include "streeacc.h"
21 #include "protodef.h"
22 
23 #define FUNCLEVEL 4
24 
25 #define DEBUGDEFAULT DEBUG1(FUNCLEVEL,">%s\n",__func__);\
26                      DEBUGCODE(5,showstree(stree))
27 
28 #define VALIDINIT     0
29 
30 #define CHECKTEXTLEN\
31         if(textlen > MAXTEXTLEN)\
32         {\
33           ERROR2("suffix tree construction failed: "\
34                  "textlen=%lu larger than maximal textlen=%lu",\
35                   (Showuint) textlen,(Showuint) MAXTEXTLEN);\
36           return -1;\
37         }
38 
39 #ifdef DEBUG
40 
41 
showvalues(void)42 static void showvalues(void)
43 {
44   SHOWVAL(SMALLINTS);
45   SHOWVAL(LARGEINTS);
46   SHOWVAL(MAXDISTANCE);
47 #if defined(STREELARGE) || defined(STREESMALL)
48   SHOWVAL(SMALLDEPTH);
49 #endif
50   SHOWVAL(MAXTEXTLEN);
51 }
52 
53 #endif
54 
55 //}
56 
57 /*
58  This file contains code for the improved linked list implementation,
59  as described in \cite{KUR:1998,KUR:BAL:1999}. It can be
60  compiled with two options:
61  \begin{itemize}
62  \item
63  for short strings of length \(\leq 2^{21}-1=2\) megabytes,
64  we recommend the option
65  \texttt{STREESMALL}. This results in a representation which requires
66  \(2\) integers for each small node, and three integers for each large node.
67  See also the header file \texttt{streesmall.h}.
68  \item
69  for long strings of length \(\leq 2^{27}-1=12\) megabytes, we
70  recommend the option
71  \texttt{STREELARGE}. This results in a representation which requires
72  \(2\) integers for each small node, and four integers for each large node.
73  See also the header file \texttt{streelarge.h}.
74  \end{itemize}
75 */
76 
77 //\subsection{Space Management}
78 
79 /*
80  For a string of length \(n\) we initially allocate space for
81  \(\texttt{STARTFACTOR}\cdot\texttt{SMALLINTS}\cdot n\) integers to store
82  the branching nodes. This usually suffices for most cases. In case we need
83  more integers, we allocate space for \(\texttt{ADDFACTOR}\cdot n\)
84  (at least 16) extra branching nodes.
85 */
86 
87 #ifndef STARTFACTOR
88 #define STARTFACTOR 0.5
89 #endif
90 
91 #define ADDFACTOR   0.05
92 #define MINEXTRA    16
93 
94 /*
95  Before a new node is stored, we check if there is enough space available.
96  If not, the space is enlarged by a small amount. Since some global pointers
97  directly refer into the table, these have to be adjusted after reallocation.
98 */
99 
spaceforbranchtab(Suffixtree * stree)100 static void spaceforbranchtab(Suffixtree *stree)
101 {
102   DEBUG1(FUNCLEVEL,">%s\n",__func__);
103 
104   if(stree->nextfreebranch >= stree->firstnotallocated)
105   {
106     Uint tmpheadnode,
107          tmpchainstart = 0,
108          extra = (Uint) (ADDFACTOR * (MULTBYSMALLINTS(stree->textlen+1)));
109     if(extra < MINEXTRA)
110     {
111       extra = MULTBYSMALLINTS(MINEXTRA);
112     }
113     DEBUG1(2,"#all suffixes up to suffix %lu have been scanned\n",
114               (Showuint) stree->nextfreeleafnum);
115     DEBUG1(2,"#current space peak %f\n",MEGABYTES(getspacepeak()));
116     DEBUG1(2,"#to get %lu extra space do ",(Showuint) extra);
117     stree->currentbranchtabsize += extra;
118     tmpheadnode = BRADDR2NUM(stree,stree->headnode);
119     if(stree->chainstart != NULL)
120     {
121       tmpchainstart = BRADDR2NUM(stree,stree->chainstart);
122     }
123 
124     stree->branchtab
125      = ALLOCSPACE(stree->branchtab,Uint,stree->currentbranchtabsize);
126     stree->nextfreebranch = stree->branchtab + stree->nextfreebranchnum;
127     stree->headnode = stree->branchtab + tmpheadnode;
128     if(stree->chainstart != NULL)
129     {
130       stree->chainstart = stree->branchtab + tmpchainstart;
131     }
132     stree->firstnotallocated
133        = stree->branchtab + stree->currentbranchtabsize - LARGEINTS;
134   }
135 }
136 
137 //\subsection{Initializing and Retrieving Headpositions, Depth, and Suffixlinks}
138 
139 /*
140  We have three functions to initialize and retrieve head positions, depth, and
141  suffix links. The implementation depends on the bit layout.
142  \begin{enumerate}
143  \item
144  The function \emph{setdepthnum} stores the \emph{depth} and the
145  \emph{head position} of a new large node.
146  \item
147  The function \emph{setsuffixlink} stores the \emph{suffixlink}
148  of a new large node.
149  \item
150  The function \emph{getlargelinkconstruction} retrieves the \emph{suffixlink}
151  of a large node, which is referenced by \emph{headposition}.
152  \end{enumerate}
153 */
154 
155 #ifdef STREESMALL
156 
setdepthheadposition(Suffixtree * stree,Uint depth,Uint headposition)157 static void setdepthheadposition(Suffixtree *stree,Uint depth,
158                                  Uint headposition)
159 {
160   DEBUG4(FUNCLEVEL,">%s(%lu)=(depth=%lu,headposition=%lu)\n",
161                    __func__,
162                    (Showuint) stree->nextfreebranchnum,
163                    (Showuint) depth,
164                    (Showuint) headposition);
165   DEBUGCODE(1,stree->maxset = stree->nextfreebranch + 2);
166   if(ISSMALLDEPTH(depth))
167   {
168     *(stree->nextfreebranch+1) |= SMALLDEPTHMARK;
169   } else
170   {
171     *(stree->nextfreebranch)
172      = (*(stree->nextfreebranch) & (MAXINDEX | LARGEBIT)) |
173        ((depth << 11) & (7 << 29));
174     *(stree->nextfreebranch+1)
175      = (*(stree->nextfreebranch+1) & (MAXINDEX | NILBIT)) |
176        ((depth << 14) & (127 << 25));
177   }
178   *(stree->nextfreebranch+2) = (depth << 21) | headposition;
179 }
180 
setsuffixlink(Suffixtree * stree,Uint slink)181 static void setsuffixlink(Suffixtree *stree,Uint slink)
182 {
183   DEBUG3(FUNCLEVEL,">%s(%lu)=%lu\n",
184                     __func__,
185                     (Showuint) stree->nextfreebranchnum,
186                     (Showuint) slink);
187   *(stree->setlink) = (*(stree->setlink) & (255 << 24)) | (slink | NILBIT);
188   if(ISSMALLDEPTH(stree->currentdepth))
189   {
190     *(stree->nextfreebranch+1) |= (slink << 25);
191     if(stree->nextfreebranchnum & (~((1 << 7) - 1)))
192     {
193       *(stree->nextfreebranch) |= ((slink << 17) & (255 << 24));
194       if(stree->nextfreebranchnum & (~((1 << 15) - 1)))
195       {
196         stree->leaftab[stree->nextfreeleafnum-1] |=
197           ((slink << 9) & (255 << 24));
198       }
199     }
200   }
201 }
202 
getlargelinkconstruction(Suffixtree * stree)203 static Uint getlargelinkconstruction(Suffixtree *stree)
204 {
205   SYMBOL secondchar;
206   Uint succ, slink, headnodenum;
207 
208   DEBUG2(FUNCLEVEL,">%s(%lu)\n",
209                    __func__,
210                    (Showuint) BRADDR2NUM(stree,stree->headnode));
211 
212   DEBUGCODE(1,stree->largelinks++);
213   if(stree->headnodedepth == 1)
214   {
215     return 0;               // link refers to the root
216   }
217   if(stree->headnodedepth == 2)   // determine second character of edge
218   {
219     if(stree->headend == NULL)
220     {
221       secondchar = *(stree->tailptr-1);
222     } else
223     {
224       secondchar = *(stree->tailptr - (stree->headend - stree->headstart + 2));
225     }
226     // this leads to the suffix link node
227     return GETBRANCHINDEX(stree->rootchildren[(Uint) secondchar]);
228   }
229   if(ISSMALLDEPTH(stree->headnodedepth))   // retrieve link in constant time
230   {
231     slink = *(stree->headnode+1) >> 25;
232     headnodenum = BRADDR2NUM(stree,stree->headnode);
233     if(headnodenum & (~((1 << 7) - 1)))
234     {
235       slink |= ((*(stree->headnode) & (255 << 24)) >> 17);
236       if(headnodenum & (~((1 << 15) - 1)))
237       {
238         slink |= ((stree->leaftab[GETHEADPOS(stree->headnode)]
239                    & (255 << 24)) >> 9);
240       }
241     }
242     return slink;
243   }
244   succ = stree->onsuccpath;   // start at node on successor path
245   DEBUGCODE(1,stree->largelinklinkwork++);
246   while(!NILPTR(succ))        // linear retrieval of suffix links
247   {
248     DEBUGCODE(1,stree->largelinkwork++);
249     if(ISLEAF(succ))
250     {
251       succ = LEAFBROTHERVAL(stree->leaftab[GETLEAFINDEX(succ)]);
252     } else
253     {
254       succ = GETBROTHER(stree->branchtab + GETBRANCHINDEX(succ));
255     }
256     DEBUGCODE(1,stree->largelinkwork++);
257   }
258   return succ & MAXINDEX;   // get only the index
259 }
260 #endif
261 
262 #ifdef STREELARGE
263 
setdepthheadposition(Suffixtree * stree,Uint depth,Uint headposition)264 static void setdepthheadposition(Suffixtree *stree,Uint depth,
265                                  Uint headposition)
266 {
267   DEBUG4(FUNCLEVEL,">%s(%lu)=(depth=%lu,headposition=%lu)\n",
268                    __func__,
269                    (Showuint) stree->nextfreebranchnum,
270                    (Showuint) depth,
271                    (Showuint) headposition);
272 
273   if(ISSMALLDEPTH(depth))
274   {
275     *(stree->nextfreebranch+2) = depth | SMALLDEPTHMARK;
276   } else
277   {
278     *(stree->nextfreebranch+2) = depth;
279   }
280   *(stree->nextfreebranch+3) = headposition;
281 }
282 
setsuffixlink(Suffixtree * stree,Uint slink)283 static void setsuffixlink(Suffixtree *stree,Uint slink)
284 {
285   Uint slinkhalf = slink >> 1;
286 
287   DEBUG3(FUNCLEVEL,">%s(%lu)=%lu\n",
288                     __func__,
289                     (Showuint) stree->nextfreebranchnum,
290                     (Showuint) slink);
291   *(stree->setlink) = (*(stree->setlink) & EXTRAPATT) | (slink | NILBIT);
292   if(ISSMALLDEPTH(stree->currentdepth))
293   {
294     *(stree->nextfreebranch+2)
295       |= ((slinkhalf << SMALLDEPTHBITS) & LOWERLINKPATT);
296     if(stree->nextfreebranchnum & (~LOWERLINKSIZE))
297     {
298       *(stree->nextfreebranch+3)
299         |= ((slinkhalf << SHIFTMIDDLE) & MIDDLELINKPATT);
300       if(stree->nextfreebranchnum & HIGHERSIZE)
301       {
302         stree->leaftab[stree->nextfreeleafnum-1] |=
303               ((slinkhalf << SHIFTHIGHER) & EXTRAPATT);
304       }
305     }
306   }
307 }
308 
getlargelinkconstruction(Suffixtree * stree)309 static Uint getlargelinkconstruction(Suffixtree *stree)
310 {
311   SYMBOL secondchar;
312   Uint succ, slinkhalf, headnodenum;
313 
314   DEBUG2(FUNCLEVEL,">%s(%lu)\n",
315                    __func__,
316                    (Showuint) BRADDR2NUM(stree,stree->headnode));
317   DEBUGCODE(1,stree->largelinks++);
318   if(stree->headnodedepth == 1)
319   {
320     return 0;        // link refers to root
321   }
322   if(stree->headnodedepth == 2)  // determine second char of egde
323   {
324     if(stree->headend == NULL)
325     {
326       secondchar = *(stree->tailptr-1);
327     } else
328     {
329       secondchar = *(stree->tailptr - (stree->headend - stree->headstart + 2));
330     }
331     return stree->rootchildren[(Uint) secondchar];
332   }
333   if(ISSMALLDEPTH(stree->headnodedepth))  // retrieve link in constant time
334   {
335     slinkhalf = (*(stree->headnode+2) & LOWERLINKPATT) >> SMALLDEPTHBITS;
336     headnodenum = BRADDR2NUM(stree,stree->headnode);
337     if(headnodenum & (~LOWERLINKSIZE))
338     {
339       slinkhalf |= ((*(stree->headnode+3) & MIDDLELINKPATT) >> SHIFTMIDDLE);
340       if(headnodenum & HIGHERSIZE)
341       {
342         slinkhalf
343           |= ((stree->leaftab[GETHEADPOS(stree->headnode)] & EXTRAPATT)
344              >> SHIFTHIGHER);
345       }
346     }
347     return slinkhalf << 1;
348   }
349   succ = stree->onsuccpath;
350   DEBUGCODE(1,stree->largelinklinkwork++);
351   while(!NILPTR(succ))   // linear retrieval of suffix link
352   {
353     DEBUGCODE(1,stree->largelinkwork++);
354     if(ISLEAF(succ))
355     {
356       succ = LEAFBROTHERVAL(stree->leaftab[GETLEAFINDEX(succ)]);
357     } else
358     {
359       succ = GETBROTHER(stree->branchtab + GETBRANCHINDEX(succ));
360     }
361     DEBUGCODE(1,stree->largelinkwork++);
362   }
363   return succ & MAXINDEX;   // get only the index
364 }
365 #endif
366 
367 #ifdef STREEHUGE
368 
getlargelinkconstruction(Suffixtree * stree)369 static Uint getlargelinkconstruction(Suffixtree *stree)
370 {
371   SYMBOL secondchar;
372 
373   DEBUG2(FUNCLEVEL,">%s(%lu)\n",
374                    __func__,
375                    (Showuint) BRADDR2NUM(stree,stree->headnode));
376   if(stree->headnodedepth == 1)
377   {
378     return 0;        // link refers to root
379   }
380   if(stree->headnodedepth == 2)  // determine second char of egde
381   {
382     if(stree->headend == NULL)
383     {
384       secondchar = *(stree->tailptr-1);
385     } else
386     {
387       secondchar = *(stree->tailptr - (stree->headend - stree->headstart + 2));
388     }
389     return stree->rootchildren[(Uint) secondchar];
390   }
391   return *(stree->headnode+4);
392 }
393 #endif
394 
395 //\subsection{Insertion of Nodes}
396 
397 /*
398   The function \emph{insertleaf} inserts a leaf and a corresponding leaf
399   edge outgoing from the current \emph{headnode}.
400   \emph{insertprev} refers to the node to the left of the leaf to be inserted.
401   If the leaf is the first child, then \emph{insertprev} is
402   \texttt{UNDEFINEDREFERENCE}.
403 */
404 
insertleaf(Suffixtree * stree)405 static void insertleaf(Suffixtree *stree)
406 {
407   Uint *ptr, newleaf;
408 
409   DEBUGDEFAULT;
410   newleaf = MAKELEAF(stree->nextfreeleafnum);
411   DEBUGCODE(1,stree->insertleafcalls++);
412   if(stree->headnodedepth == 0)                // head is the root
413   {
414     if(stree->tailptr != stree->sentinel)      // no \$-edge initially
415     {
416       stree->rootchildren[(Uint) *(stree->tailptr)] = newleaf;
417       *(stree->nextfreeleafptr) = VALIDINIT;
418       DEBUG2(4,"%c-edge from root points to leaf %lu\n",
419                *(stree->tailptr),(Showuint) stree->nextfreeleafnum);
420     }
421   } else
422   {
423     if (stree->insertprev == UNDEFINEDREFERENCE)  // newleaf = first child
424     {
425       *(stree->nextfreeleafptr) = GETCHILD(stree->headnode);
426       SETCHILD(stree->headnode,newleaf);
427     } else
428     {
429       if(ISLEAF(stree->insertprev))   // previous node is leaf
430       {
431         ptr = stree->leaftab + GETLEAFINDEX(stree->insertprev);
432         *(stree->nextfreeleafptr) = LEAFBROTHERVAL(*ptr);
433         SETLEAFBROTHER(ptr,newleaf);
434       } else   // previous node is branching node
435       {
436         ptr = stree->branchtab + GETBRANCHINDEX(stree->insertprev);
437         *(stree->nextfreeleafptr) = GETBROTHER(ptr);
438         SETBROTHER(ptr,newleaf);
439       }
440     }
441   }
442   RECALLSUCC(newleaf);     // recall node on successor path of \emph{headnode}
443   stree->nextfreeleafnum++;
444   stree->nextfreeleafptr++;
445 }
446 
447 /*
448   The function \emph{insertbranch} inserts a branching node and splits
449   the appropriate edges, according to the canonical location of the current
450   head. \emph{insertprev} refers to the node to the left of the branching
451   node to be inserted. If the branching node is the first child, then
452   \emph{insertprev} is \texttt{UNDEFINEDREFERENCE}. The edge to be split ends
453   in the node referred to by \emph{insertnode}.
454 */
455 
insertbranchnode(Suffixtree * stree)456 static void insertbranchnode(Suffixtree *stree)
457 {
458   Uint *ptr, *insertnodeptr, *insertleafptr, insertnodeptrbrother;
459 
460   DEBUGDEFAULT;
461   spaceforbranchtab(stree);
462   if(stree->headnodedepth == 0)      // head is the root
463   {
464     stree->rootchildren[(Uint) *(stree->headstart)]
465       = MAKEBRANCHADDR(stree->nextfreebranchnum);
466     *(stree->nextfreebranch+1) = VALIDINIT;
467     DEBUG2(4,"%c-edge from root points to branch node with address %lu\n",
468            *(stree->headstart),(Showuint) stree->nextfreebranchnum);
469   } else
470   {
471     if(stree->insertprev == UNDEFINEDREFERENCE)  // new branch = first child
472     {
473       SETCHILD(stree->headnode,MAKEBRANCHADDR(stree->nextfreebranchnum));
474     } else
475     {
476       if(ISLEAF(stree->insertprev))  // new branch = right brother of leaf
477       {
478         ptr = stree->leaftab + GETLEAFINDEX(stree->insertprev);
479         SETLEAFBROTHER(ptr,MAKEBRANCHADDR(stree->nextfreebranchnum));
480       } else                     // new branch = brother of branching node
481       {
482         SETBROTHER(stree->branchtab + GETBRANCHINDEX(stree->insertprev),
483                    MAKEBRANCHADDR(stree->nextfreebranchnum));
484       }
485     }
486   }
487   if(ISLEAF(stree->insertnode))   // split edge is leaf edge
488   {
489     DEBUGCODE(1,stree->splitleafedge++);
490     insertleafptr = stree->leaftab + GETLEAFINDEX(stree->insertnode);
491     if (stree->tailptr == stree->sentinel ||
492         *(stree->headend+1) < *(stree->tailptr))
493     {
494       SETNEWCHILDBROTHER(MAKELARGE(stree->insertnode),  // first child=oldleaf
495                          LEAFBROTHERVAL(*insertleafptr));  // inherit brother
496       RECALLNEWLEAFADDRESS(stree->nextfreeleafptr);
497       SETLEAFBROTHER(insertleafptr,                     // new leaf =
498                      MAKELEAF(stree->nextfreeleafnum)); // right brother of old leaf
499     } else
500     {
501       SETNEWCHILDBROTHER(MAKELARGELEAF(stree->nextfreeleafnum),  // first child=new leaf
502                          LEAFBROTHERVAL(*insertleafptr));  // inherit brother
503       *(stree->nextfreeleafptr) = stree->insertnode;  // old leaf = right brother of of new leaf
504       RECALLLEAFADDRESS(insertleafptr);
505     }
506   } else  // split edge leads to branching node
507   {
508     DEBUGCODE(1,stree->splitinternaledge++);
509     insertnodeptr = stree->branchtab + GETBRANCHINDEX(stree->insertnode);
510     insertnodeptrbrother = GETBROTHER(insertnodeptr);
511     if (stree->tailptr == stree->sentinel ||
512         *(stree->headend+1) < *(stree->tailptr))
513     {
514       SETNEWCHILDBROTHER(MAKELARGE(stree->insertnode), // first child new branch
515                          insertnodeptrbrother);        // inherit right brother
516       RECALLNEWLEAFADDRESS(stree->nextfreeleafptr);
517       SETBROTHER(insertnodeptr,MAKELEAF(stree->nextfreeleafnum)); // new leaf = brother of old branch
518     } else
519     {
520       SETNEWCHILDBROTHER(MAKELARGELEAF(stree->nextfreeleafnum), // first child is new leaf
521                          insertnodeptrbrother);        // inherit brother
522       *(stree->nextfreeleafptr) = stree->insertnode;   // new branch is brother of new leaf
523       RECALLBRANCHADDRESS(insertnodeptr);
524     }
525   }
526   SETNILBIT;
527   RECALLSUCC(MAKEBRANCHADDR(stree->nextfreebranchnum)); // node on succ. path
528   stree->currentdepth = stree->headnodedepth + (Uint) (stree->headend-stree->headstart+1);
529   SETDEPTHHEADPOS(stree->currentdepth,stree->nextfreeleafnum);
530   SETMAXBRANCHDEPTH(stree->currentdepth);
531   stree->nextfreeleafnum++;
532   stree->nextfreeleafptr++;
533   DEBUGCODE(1,stree->nodecount++);
534 }
535 
536 //\subsection{Finding the Head-Locations}
537 
538 /*
539   The function \emph{rescan} finds the location of the current head.
540   In order to scan down the tree, it suffices to look at the first
541   character of each edge.
542 */
543 
rescan(Suffixtree * stree)544 static void rescan (Suffixtree *stree)
545 {
546   Uint *nodeptr, *largeptr = NULL, distance = 0, node, prevnode,
547        nodedepth, edgelen, wlen, leafindex, headposition;
548   SYMBOL headchar, edgechar;
549 
550   DEBUGDEFAULT;
551   if(stree->headnodedepth == 0)   // head is the root
552   {
553     headchar = *(stree->headstart);  // headstart is assumed to be not empty
554     node = stree->rootchildren[(Uint) headchar];
555     if(ISLEAF(node))   // stop if successor is leaf
556     {
557       stree->insertnode = node;
558       return;
559     }
560     nodeptr = stree->branchtab + GETBRANCHINDEX(node);
561     GETONLYDEPTH(nodedepth,nodeptr);
562     wlen = (Uint) (stree->headend - stree->headstart + 1);
563     if(nodedepth > wlen)    // cannot reach the successor node
564     {
565       stree->insertnode = node;
566       return;
567     }
568     stree->headnode = nodeptr;        // go to successor node
569     stree->headnodedepth = nodedepth;
570     if(nodedepth == wlen)             // location has been scanned
571     {
572       stree->headend = NULL;
573       return;
574     }
575     (stree->headstart) += nodedepth;
576   }
577   while(True)   // \emph{headnode} is not the root
578   {
579     headchar = *(stree->headstart);  // \emph{headstart} is assumed to be nonempty
580     prevnode = UNDEFINEDREFERENCE;
581     node = GETCHILD(stree->headnode);
582     while(True)             // traverse the list of successors
583     {
584       if(ISLEAF(node))   // successor is leaf
585       {
586         leafindex = GETLEAFINDEX(node);
587         edgechar = stree->text[stree->headnodedepth + leafindex];
588         if(edgechar == headchar)    // correct edge found
589         {
590           stree->insertnode = node;
591           stree->insertprev = prevnode;
592           return;
593         }
594         prevnode = node;
595         node = LEAFBROTHERVAL(stree->leaftab[leafindex]);
596       } else   // successor is branch node
597       {
598         nodeptr = stree->branchtab + GETBRANCHINDEX(node);
599         GETONLYHEADPOS(headposition,nodeptr);
600         edgechar = stree->text[stree->headnodedepth + headposition];
601         if(edgechar == headchar) // correct edge found
602         {
603           break;
604         }
605         prevnode = node;
606         node = GETBROTHER(nodeptr);
607       }
608     }
609 
610     GETDEPTHAFTERHEADPOS(nodedepth,nodeptr);     // get info about succ node
611     edgelen = nodedepth - stree->headnodedepth;
612     wlen = (Uint) (stree->headend - stree->headstart + 1);
613     if(edgelen > wlen)     // cannot reach the succ node
614     {
615       stree->insertnode = node;
616       stree->insertprev = prevnode;
617       return;
618     }
619     stree->headnode = nodeptr;    // go to the successor node
620     stree->headnodedepth = nodedepth;
621     if(edgelen == wlen)                    // location is found
622     {
623       stree->headend = NULL;
624       return;
625     }
626     (stree->headstart) += edgelen;
627   }
628 }
629 
630 /*
631   The function \emph{taillcp} computes the length of the longest common prefix
632   of two strings. The first string is between pointers \emph{start1} and
633   \emph{end1}. The second string is the current tail, which is between  the
634   pointers \emph{tailptr} and \emph{sentinel}.
635 */
636 
taillcp(Suffixtree * stree,SYMBOL * start1,SYMBOL * end1)637 static Uint taillcp(Suffixtree *stree,SYMBOL *start1, SYMBOL *end1)
638 {
639   SYMBOL *ptr1 = start1, *ptr2 = stree->tailptr + 1;
640   while(ptr1 <= end1 && ptr2 < stree->sentinel && *ptr1 == *ptr2)
641   {
642     ptr1++;
643     ptr2++;
644   }
645   return (Uint) (ptr1-start1);
646 }
647 
648 /*
649  The function \emph{scanprefix} scans a prefix of the current tail
650  down from a given node.
651 */
652 
scanprefix(Suffixtree * stree)653 static void scanprefix(Suffixtree *stree)
654 {
655   Uint *nodeptr = NULL, *largeptr = NULL, leafindex, nodedepth, edgelen, node,
656        distance = 0, prevnode, prefixlen, headposition;
657   SYMBOL *leftborder = (SYMBOL *) NULL, tailchar, edgechar = 0;
658 
659   DEBUGDEFAULT;
660   if(stree->headnodedepth == 0)   // headnode is root
661   {
662     if(stree->tailptr == stree->sentinel)   // there is no \$-edge
663     {
664       stree->headend = NULL;
665       return;
666     }
667     tailchar = *(stree->tailptr);
668     if((node = stree->rootchildren[(Uint) tailchar]) == UNDEFINEDREFERENCE)
669     {
670       stree->headend = NULL;
671       return;
672     }
673     if(ISLEAF(node)) // successor edge is leaf, compare tail and leaf edge label
674     {
675       leftborder = stree->text + GETLEAFINDEX(node);
676       prefixlen = 1 + taillcp(stree,leftborder+1,stree->sentinel-1);
677       (stree->tailptr) += prefixlen;
678       stree->headstart = leftborder;
679       stree->headend = leftborder + (prefixlen-1);
680       stree->insertnode = node;
681       return;
682     }
683     nodeptr = stree->branchtab + GETBRANCHINDEX(node);
684     GETBOTH(nodedepth,headposition,nodeptr);  // get info for branch node
685     leftborder = stree->text + headposition;
686     prefixlen = 1 + taillcp(stree,leftborder+1,leftborder + nodedepth - 1);
687     (stree->tailptr)+= prefixlen;
688     if(nodedepth > prefixlen)   // cannot reach the successor, fall out of tree
689     {
690       stree->headstart = leftborder;
691       stree->headend = leftborder + (prefixlen-1);
692       stree->insertnode = node;
693       return;
694     }
695     stree->headnode = nodeptr;
696     stree->headnodedepth = nodedepth;
697   }
698   while(True)  // \emph{headnode} is not the root
699   {
700     prevnode = UNDEFINEDREFERENCE;
701     node = GETCHILD(stree->headnode);
702     if(stree->tailptr == stree->sentinel)  //  \$-edge
703     {
704       do // there is no \$-edge, so find last successor, of which it becomes right brother
705       {
706         prevnode = node;
707         if(ISLEAF(node))
708         {
709           node = LEAFBROTHERVAL(stree->leaftab[GETLEAFINDEX(node)]);
710         } else
711         {
712           node = GETBROTHER(stree->branchtab + GETBRANCHINDEX(node));
713         }
714       } while(!NILPTR(node));
715       stree->insertnode = NILBIT;
716       stree->insertprev = prevnode;
717       stree->headend = NULL;
718       return;
719     }
720     tailchar = *(stree->tailptr);
721 
722     do // find successor edge with firstchar = tailchar
723     {
724       if(ISLEAF(node))   // successor is leaf
725       {
726         leafindex = GETLEAFINDEX(node);
727         leftborder = stree->text + (stree->headnodedepth + leafindex);
728         if((edgechar = *leftborder) >= tailchar)   // edge will not come later
729         {
730           break;
731         }
732         prevnode = node;
733         node = LEAFBROTHERVAL(stree->leaftab[leafindex]);
734       } else  // successor is branch node
735       {
736         nodeptr = stree->branchtab + GETBRANCHINDEX(node);
737         GETONLYHEADPOS(headposition,nodeptr);
738         leftborder = stree->text + (stree->headnodedepth + headposition);
739         if((edgechar = *leftborder) >= tailchar)  // edge will not come later
740         {
741           break;
742         }
743         prevnode = node;
744         node = GETBROTHER(nodeptr);
745       }
746     } while(!NILPTR(node));
747     if(NILPTR(node) || edgechar > tailchar)  // edge not found
748     {
749       stree->insertprev = prevnode;   // new edge will become brother of this
750       stree->headend = NULL;
751       return;
752     }
753     if(ISLEAF(node))  // correct edge is leaf edge, compare its label with tail
754     {
755       prefixlen = 1 + taillcp(stree,leftborder+1,stree->sentinel-1);
756       (stree->tailptr) += prefixlen;
757       stree->headstart = leftborder;
758       stree->headend = leftborder + (prefixlen-1);
759       stree->insertnode = node;
760       stree->insertprev = prevnode;
761       return;
762     }
763     GETDEPTHAFTERHEADPOS(nodedepth,nodeptr); // we already know headposition
764     edgelen = nodedepth - stree->headnodedepth;
765     prefixlen = 1 + taillcp(stree,leftborder+1,leftborder + edgelen - 1);
766     (stree->tailptr) += prefixlen;
767     if(edgelen > prefixlen)  // cannot reach next node
768     {
769       stree->headstart = leftborder;
770       stree->headend = leftborder + (prefixlen-1);
771       stree->insertnode = node;
772       stree->insertprev = prevnode;
773       return;
774     }
775     stree->headnode = nodeptr;
776     stree->headnodedepth = nodedepth;
777   }
778 }
779 
780 //\subsection{Completion and Initialization}
781 
782 /*
783   The function \emph{completelarge} is called whenever a large node
784   is inserted. It basically sets the appropriate distance values of the small
785   nodes of the current chain.
786 */
787 
completelarge(Suffixtree * stree)788 static void completelarge(Suffixtree *stree)
789 {
790   Uint distance, *backwards;
791 
792   DEBUGDEFAULT;
793   if(stree->smallnotcompleted > 0)
794   {
795     backwards = stree->nextfreebranch;
796     for(distance = 1; distance <= stree->smallnotcompleted; distance++)
797     {
798       backwards -= SMALLINTS;
799       SETDISTANCE(backwards,distance);
800     }
801     stree->smallnotcompleted = 0;
802     stree->chainstart = NULL;
803   }
804   stree->nextfreebranch += LARGEINTS;
805   stree->nextfreebranchnum += LARGEINTS;
806   stree->largenode++;
807 }
808 
809 /*
810   The function \emph{linkrootchildren} constructs the successor chain
811   for the children of the root. This is done at the end of the algorithm
812   in one sweep over table \emph{rootchildren}.
813 */
814 
linkrootchildren(Suffixtree * stree)815 static void linkrootchildren(Suffixtree *stree)
816 {
817   Uint *rcptr, *prevnodeptr, prev = UNDEFINEDREFERENCE;
818 
819   DEBUGDEFAULT;
820   stree->alphasize = 0;
821   for(rcptr = stree->rootchildren;
822       rcptr <= stree->rootchildren + LARGESTCHARINDEX; rcptr++)
823   {
824     if(*rcptr != UNDEFINEDREFERENCE)
825     {
826       stree->alphasize++;
827       if(prev == UNDEFINEDREFERENCE)
828       {
829         SETCHILD(stree->branchtab,MAKELARGE(*rcptr));
830       } else
831       {
832         if(ISLEAF(prev))
833         {
834           stree->leaftab[GETLEAFINDEX(prev)] = *rcptr;
835         } else
836         {
837           prevnodeptr = stree->branchtab + GETBRANCHINDEX(prev);
838           SETBROTHER(prevnodeptr,*rcptr);
839         }
840       }
841       prev = *rcptr;
842     }
843   }
844   if(ISLEAF(prev))
845   {
846     stree->leaftab[GETLEAFINDEX(prev)] = MAKELEAF(stree->textlen);
847   } else
848   {
849     prevnodeptr = stree->branchtab + GETBRANCHINDEX(prev);
850     SETBROTHER(prevnodeptr,MAKELEAF(stree->textlen));
851   }
852   stree->leaftab[stree->textlen] = NILBIT;
853 }
854 
855 /*
856   \newpage
857   \emph{initSuffixtree} allocates and initializes the data structures for
858   McCreight's Algorithm.
859 */
860 
initSuffixtree(Suffixtree * stree,SYMBOL * text,Uint textlen)861 static void initSuffixtree(Suffixtree *stree,SYMBOL *text,Uint textlen)
862 {
863   Uint i, *ptr;
864 
865   DEBUG1(4,">%s\n",__func__);
866   DEBUG1(2,"# MULTBYSMALLINTS(textlen+1)=%lu\n",
867            (Showuint) MULTBYSMALLINTS(textlen+1));
868   DEBUG1(2,"# STARTFACTOR=%.2f\n",STARTFACTOR);
869   DEBUG1(2,"# MINEXTRA=%lu\n",(Showuint) MINEXTRA);
870   stree->currentbranchtabsize
871     = (Uint) (STARTFACTOR * MULTBYSMALLINTS(textlen+1));
872   if(stree->currentbranchtabsize < MINEXTRA)
873   {
874     stree->currentbranchtabsize = MULTBYSMALLINTS(MINEXTRA);
875   }
876   stree->leaftab = ALLOCSPACE(NULL,Uint,textlen+2);
877   stree->rootchildren = ALLOCSPACE(NULL,Uint,LARGESTCHARINDEX + 1);
878   stree->branchtab = ALLOCSPACE(NULL,Uint,stree->currentbranchtabsize);
879 
880   stree->text = stree->tailptr = text;
881   stree->textlen = textlen;
882   stree->sentinel = text + textlen;
883   stree->firstnotallocated
884     = stree->branchtab + stree->currentbranchtabsize - LARGEINTS;
885   stree->headnode = stree->nextfreebranch = stree->branchtab;
886   stree->headend = NULL;
887   stree->headnodedepth = stree->maxbranchdepth = 0;
888   for(ptr=stree->rootchildren; ptr<=stree->rootchildren+LARGESTCHARINDEX; ptr++)
889   {
890     *ptr = UNDEFINEDREFERENCE;
891   }
892   for(i=0; i<LARGEINTS; i++)
893   {
894     stree->branchtab[i] = 0;
895   }
896   stree->nextfreebranch = stree->branchtab;
897   stree->nextfreebranchnum = 0;
898   SETDEPTHHEADPOS(0,0);
899   SETNEWCHILDBROTHER(MAKELARGELEAF(0),0);
900   SETBRANCHNODEOFFSET;
901   stree->rootchildren[(Uint) *text] = MAKELEAF(0);
902   stree->leaftab[0] = VALIDINIT;
903   DEBUG2(4,"%c-edge from root points to leaf %lu\n",*text,(Showuint) 0);
904   stree->leafcounts = NULL;
905   stree->nextfreeleafnum = 1;
906   stree->nextfreeleafptr = stree->leaftab + 1;
907   stree->nextfreebranch = stree->branchtab + LARGEINTS;
908   stree->nextfreebranchnum = LARGEINTS;
909   stree->insertnode = stree->insertprev = UNDEFINEDREFERENCE;
910   stree->smallnotcompleted = 0;
911   stree->chainstart = NULL;
912   stree->largenode = stree->smallnode = 0;
913 
914 //\Ignore{
915 
916 #ifdef DEBUG
917   stree->showsymbolstree = NULL;
918   stree->alphabet = NULL;
919   stree->nodecount = 1;
920   stree->splitleafedge =
921   stree->splitinternaledge =
922   stree->artificial =
923   stree->multiplications = 0;
924   stree->insertleafcalls = 1;
925   stree->maxset = stree->branchtab + LARGEINTS - 1;
926   stree->largelinks = stree->largelinkwork = stree->largelinklinkwork = 0;
927 #endif
928 
929 //}
930 
931 }
932 
freestree(Suffixtree * stree)933 void freestree(Suffixtree *stree)
934 {
935   FREESPACE(stree->leaftab);
936   FREESPACE(stree->rootchildren);
937   FREESPACE(stree->branchtab);
938   if(stree->nonmaximal != NULL)
939   {
940     FREESPACE(stree->nonmaximal);
941   }
942   if(stree->leafcounts != NULL)
943   {
944     FREESPACE(stree->leafcounts);
945   }
946 }
947 
948 //\subsection{Computing the Suffix Tree}
949 
950 /*
951   \emph{constructstree} implements McCreight Algorithm to compute the suffix
952   tree for a \texttt{text} of length \texttt{textlen}. For explanations, see
953   \cite{KUR:1998}. The number \((i)\) refers to the cases of Section 6 in
954   \cite{KUR:1998}.
955 */
956 
957 #define CONSTRUCT Sint constructstree(Suffixtree *stree,SYMBOL *text,Uint textlen)
958 #define DECLAREEXTRA        stree->nonmaximal = NULL
959 #define COMPLETELARGEFIRST  completelarge(stree)
960 #define COMPLETELARGESECOND completelarge(stree)
961 #define PROCESSHEAD         /* Nothing */
962 #define CHECKSTEP           /* Nothing */
963 #define FINALPROGRESS       /* Nothing */
964 
965 #include "construct.gen"
966 
967 #undef CONSTRUCT
968 #undef DECLAREEXTRA
969 #undef COMPLETELARGEFIRST
970 #undef COMPLETELARGESECOND
971 #undef PROCESSHEAD
972 #undef CHECKSTEP
973 #undef FINALPROGRESS
974 
975 /* --------------------------------------------------- */
976 
977 #define CONSTRUCT Sint constructmarkmaxstree(Suffixtree *stree,SYMBOL *text,Uint textlen)
978 
979 #define DECLAREEXTRA  Uint distance, headposition = 0, *largeptr,\
980                            tabsize = 1 + DIVWORDSIZE(textlen+1), *tabptr;\
981                       stree->nonmaximal = ALLOCSPACE(NULL,Uint,tabsize);\
982                       for(tabptr = stree->nonmaximal;\
983                           tabptr < stree->nonmaximal + tabsize;\
984                           tabptr++)\
985                           {\
986                             *tabptr = 0;\
987                           }
988 
989 #define COMPLETELARGEFIRST\
990         GETONLYHEADPOS(headposition,stree->headnode);\
991         DEBUG2(3,"j=%lu: slink->%lu\n",(Showuint) stree->nextfreeleafnum,\
992                                        (Showuint) headposition);\
993         SETIBIT(stree->nonmaximal,headposition);\
994         completelarge(stree);
995 
996 #define COMPLETELARGESECOND\
997         DEBUG2(3,"j=%lu: slink->%lu\n",(Showuint) stree->nextfreeleafnum,\
998                                        (Showuint) stree->nextfreeleafnum);\
999         SETIBIT(stree->nonmaximal,stree->nextfreeleafnum);\
1000         completelarge(stree)
1001 
1002 #define PROCESSHEAD          /* Nothing */
1003 #define CHECKSTEP            /* Nothing */
1004 #define FINALPROGRESS        /* Nothing */
1005 
1006 #include "construct.gen"
1007 
1008 #undef CONSTRUCT
1009 #undef DECLAREEXTRA
1010 #undef COMPLETELARGEFIRST
1011 #undef COMPLETELARGESECOND
1012 #undef PROCESSHEAD
1013 #undef CHECKSTEP
1014 #undef FINALPROGRESS
1015 
1016 /* --------------------------------------------------- */
1017 
1018 #define CONSTRUCT Sint constructheadstree(Suffixtree *stree,SYMBOL *text,Uint textlen,void(*processhead)(Suffixtree *,Uint,void *),void *processheadinfo)
1019 
1020 #define DECLAREEXTRA         stree->nonmaximal = NULL
1021 #define COMPLETELARGEFIRST   completelarge(stree)
1022 #define COMPLETELARGESECOND  completelarge(stree)
1023 #define PROCESSHEAD          processhead(stree,stree->nextfreeleafnum,\
1024                                                processheadinfo)
1025 
1026 #define CHECKSTEP            /* Nothing */
1027 #define FINALPROGRESS        /* Nothing */
1028 
1029 #include "construct.gen"
1030 
1031 #undef CONSTRUCT
1032 #undef DECLAREEXTRA
1033 #undef COMPLETELARGEFIRST
1034 #undef COMPLETELARGESECOND
1035 #undef PROCESSHEAD
1036 #undef CHECKSTEP
1037 #undef FINALPROGRESS
1038 
1039 /* --------------------------------------------------- */
1040 
1041 #define LEASTSHOWPROGRESS 100000
1042 #define NUMOFCALLS 100
1043 
1044 #define CONSTRUCT Sint constructprogressstree(Suffixtree *stree,SYMBOL *text,Uint textlen,void (*progress)(Uint,void *),void (*finalprogress)(void *),void *info)
1045 #define DECLAREEXTRA\
1046         Uint j = 0, step, nextstep;\
1047         stree->nonmaximal = NULL;\
1048         step = textlen/NUMOFCALLS;\
1049         nextstep = (textlen >= LEASTSHOWPROGRESS) ? step : (textlen+1);\
1050         if(progress == NULL && textlen >= LEASTSHOWPROGRESS)\
1051         {\
1052           fprintf(stderr,"# process %lu characters per dot\n",\
1053                  (Showuint) textlen/NUMOFCALLS);\
1054         }
1055 
1056 #define COMPLETELARGEFIRST  completelarge(stree)
1057 #define COMPLETELARGESECOND completelarge(stree)
1058 #define PROCESSHEAD          /* Nothing */
1059 
1060 #define CHECKSTEP            if(j == nextstep)\
1061                              {\
1062                                if(progress == NULL)\
1063                                {\
1064                                  if(nextstep == step)\
1065                                  {\
1066                                    fputc('#',stderr);\
1067                                  }\
1068                                  fputc('.',stderr);\
1069                                  fflush(stdout);\
1070                                } else\
1071                                {\
1072                                  progress(nextstep,info);\
1073                                }\
1074                                nextstep += step;\
1075                              }\
1076                              j++
1077 
1078 #define FINALPROGRESS        if(textlen >= LEASTSHOWPROGRESS)\
1079                              {\
1080                                if(finalprogress == NULL)\
1081                                {\
1082                                  fputc('\n',stderr);\
1083                                } else\
1084                                {\
1085                                  finalprogress(info);\
1086                                }\
1087                              }
1088 
1089 #include "construct.gen"
1090 
1091 #undef CONSTRUCT
1092 #undef DECLAREEXTRA
1093 #undef COMPLETELARGEFIRST
1094 #undef COMPLETELARGESECOND
1095 #undef PROCESSHEAD
1096 #undef CHECKSTEP
1097 #undef FINALPROGRESS
1098