1 /*
2 Copyright (c) 2003 by Stefan Kurtz and The Institute for
3 Genomic Research. This is OSI Certified Open Source Software.
4 Please see the file LICENSE for licensing information and
5 the file ACKNOWLEDGEMENTS for names of contributors to the
6 code base.
7 */
8
9 //}
10
11 //\FILEINFO{construct.c}{Stefan Kurtz}{November 1999}
12
13 //\Ignore{
14
15 #include "intbits.h"
16 #include "spacedef.h"
17 #include "megabytes.h"
18 #include "debugdef.h"
19 #include "streedef.h"
20 #include "streeacc.h"
21 #include "protodef.h"
22
23 #define FUNCLEVEL 4
24
25 #define DEBUGDEFAULT DEBUG1(FUNCLEVEL,">%s\n",__func__);\
26 DEBUGCODE(5,showstree(stree))
27
28 #define VALIDINIT 0
29
30 #define CHECKTEXTLEN\
31 if(textlen > MAXTEXTLEN)\
32 {\
33 ERROR2("suffix tree construction failed: "\
34 "textlen=%lu larger than maximal textlen=%lu",\
35 (Showuint) textlen,(Showuint) MAXTEXTLEN);\
36 return -1;\
37 }
38
39 #ifdef DEBUG
40
41
showvalues(void)42 static void showvalues(void)
43 {
44 SHOWVAL(SMALLINTS);
45 SHOWVAL(LARGEINTS);
46 SHOWVAL(MAXDISTANCE);
47 #if defined(STREELARGE) || defined(STREESMALL)
48 SHOWVAL(SMALLDEPTH);
49 #endif
50 SHOWVAL(MAXTEXTLEN);
51 }
52
53 #endif
54
55 //}
56
57 /*
58 This file contains code for the improved linked list implementation,
59 as described in \cite{KUR:1998,KUR:BAL:1999}. It can be
60 compiled with two options:
61 \begin{itemize}
62 \item
63 for short strings of length \(\leq 2^{21}-1=2\) megabytes,
64 we recommend the option
65 \texttt{STREESMALL}. This results in a representation which requires
66 \(2\) integers for each small node, and three integers for each large node.
67 See also the header file \texttt{streesmall.h}.
68 \item
69 for long strings of length \(\leq 2^{27}-1=12\) megabytes, we
70 recommend the option
71 \texttt{STREELARGE}. This results in a representation which requires
72 \(2\) integers for each small node, and four integers for each large node.
73 See also the header file \texttt{streelarge.h}.
74 \end{itemize}
75 */
76
77 //\subsection{Space Management}
78
79 /*
80 For a string of length \(n\) we initially allocate space for
81 \(\texttt{STARTFACTOR}\cdot\texttt{SMALLINTS}\cdot n\) integers to store
82 the branching nodes. This usually suffices for most cases. In case we need
83 more integers, we allocate space for \(\texttt{ADDFACTOR}\cdot n\)
84 (at least 16) extra branching nodes.
85 */
86
87 #ifndef STARTFACTOR
88 #define STARTFACTOR 0.5
89 #endif
90
91 #define ADDFACTOR 0.05
92 #define MINEXTRA 16
93
94 /*
95 Before a new node is stored, we check if there is enough space available.
96 If not, the space is enlarged by a small amount. Since some global pointers
97 directly refer into the table, these have to be adjusted after reallocation.
98 */
99
spaceforbranchtab(Suffixtree * stree)100 static void spaceforbranchtab(Suffixtree *stree)
101 {
102 DEBUG1(FUNCLEVEL,">%s\n",__func__);
103
104 if(stree->nextfreebranch >= stree->firstnotallocated)
105 {
106 Uint tmpheadnode,
107 tmpchainstart = 0,
108 extra = (Uint) (ADDFACTOR * (MULTBYSMALLINTS(stree->textlen+1)));
109 if(extra < MINEXTRA)
110 {
111 extra = MULTBYSMALLINTS(MINEXTRA);
112 }
113 DEBUG1(2,"#all suffixes up to suffix %lu have been scanned\n",
114 (Showuint) stree->nextfreeleafnum);
115 DEBUG1(2,"#current space peak %f\n",MEGABYTES(getspacepeak()));
116 DEBUG1(2,"#to get %lu extra space do ",(Showuint) extra);
117 stree->currentbranchtabsize += extra;
118 tmpheadnode = BRADDR2NUM(stree,stree->headnode);
119 if(stree->chainstart != NULL)
120 {
121 tmpchainstart = BRADDR2NUM(stree,stree->chainstart);
122 }
123
124 stree->branchtab
125 = ALLOCSPACE(stree->branchtab,Uint,stree->currentbranchtabsize);
126 stree->nextfreebranch = stree->branchtab + stree->nextfreebranchnum;
127 stree->headnode = stree->branchtab + tmpheadnode;
128 if(stree->chainstart != NULL)
129 {
130 stree->chainstart = stree->branchtab + tmpchainstart;
131 }
132 stree->firstnotallocated
133 = stree->branchtab + stree->currentbranchtabsize - LARGEINTS;
134 }
135 }
136
137 //\subsection{Initializing and Retrieving Headpositions, Depth, and Suffixlinks}
138
139 /*
140 We have three functions to initialize and retrieve head positions, depth, and
141 suffix links. The implementation depends on the bit layout.
142 \begin{enumerate}
143 \item
144 The function \emph{setdepthnum} stores the \emph{depth} and the
145 \emph{head position} of a new large node.
146 \item
147 The function \emph{setsuffixlink} stores the \emph{suffixlink}
148 of a new large node.
149 \item
150 The function \emph{getlargelinkconstruction} retrieves the \emph{suffixlink}
151 of a large node, which is referenced by \emph{headposition}.
152 \end{enumerate}
153 */
154
155 #ifdef STREESMALL
156
setdepthheadposition(Suffixtree * stree,Uint depth,Uint headposition)157 static void setdepthheadposition(Suffixtree *stree,Uint depth,
158 Uint headposition)
159 {
160 DEBUG4(FUNCLEVEL,">%s(%lu)=(depth=%lu,headposition=%lu)\n",
161 __func__,
162 (Showuint) stree->nextfreebranchnum,
163 (Showuint) depth,
164 (Showuint) headposition);
165 DEBUGCODE(1,stree->maxset = stree->nextfreebranch + 2);
166 if(ISSMALLDEPTH(depth))
167 {
168 *(stree->nextfreebranch+1) |= SMALLDEPTHMARK;
169 } else
170 {
171 *(stree->nextfreebranch)
172 = (*(stree->nextfreebranch) & (MAXINDEX | LARGEBIT)) |
173 ((depth << 11) & (7 << 29));
174 *(stree->nextfreebranch+1)
175 = (*(stree->nextfreebranch+1) & (MAXINDEX | NILBIT)) |
176 ((depth << 14) & (127 << 25));
177 }
178 *(stree->nextfreebranch+2) = (depth << 21) | headposition;
179 }
180
setsuffixlink(Suffixtree * stree,Uint slink)181 static void setsuffixlink(Suffixtree *stree,Uint slink)
182 {
183 DEBUG3(FUNCLEVEL,">%s(%lu)=%lu\n",
184 __func__,
185 (Showuint) stree->nextfreebranchnum,
186 (Showuint) slink);
187 *(stree->setlink) = (*(stree->setlink) & (255 << 24)) | (slink | NILBIT);
188 if(ISSMALLDEPTH(stree->currentdepth))
189 {
190 *(stree->nextfreebranch+1) |= (slink << 25);
191 if(stree->nextfreebranchnum & (~((1 << 7) - 1)))
192 {
193 *(stree->nextfreebranch) |= ((slink << 17) & (255 << 24));
194 if(stree->nextfreebranchnum & (~((1 << 15) - 1)))
195 {
196 stree->leaftab[stree->nextfreeleafnum-1] |=
197 ((slink << 9) & (255 << 24));
198 }
199 }
200 }
201 }
202
getlargelinkconstruction(Suffixtree * stree)203 static Uint getlargelinkconstruction(Suffixtree *stree)
204 {
205 SYMBOL secondchar;
206 Uint succ, slink, headnodenum;
207
208 DEBUG2(FUNCLEVEL,">%s(%lu)\n",
209 __func__,
210 (Showuint) BRADDR2NUM(stree,stree->headnode));
211
212 DEBUGCODE(1,stree->largelinks++);
213 if(stree->headnodedepth == 1)
214 {
215 return 0; // link refers to the root
216 }
217 if(stree->headnodedepth == 2) // determine second character of edge
218 {
219 if(stree->headend == NULL)
220 {
221 secondchar = *(stree->tailptr-1);
222 } else
223 {
224 secondchar = *(stree->tailptr - (stree->headend - stree->headstart + 2));
225 }
226 // this leads to the suffix link node
227 return GETBRANCHINDEX(stree->rootchildren[(Uint) secondchar]);
228 }
229 if(ISSMALLDEPTH(stree->headnodedepth)) // retrieve link in constant time
230 {
231 slink = *(stree->headnode+1) >> 25;
232 headnodenum = BRADDR2NUM(stree,stree->headnode);
233 if(headnodenum & (~((1 << 7) - 1)))
234 {
235 slink |= ((*(stree->headnode) & (255 << 24)) >> 17);
236 if(headnodenum & (~((1 << 15) - 1)))
237 {
238 slink |= ((stree->leaftab[GETHEADPOS(stree->headnode)]
239 & (255 << 24)) >> 9);
240 }
241 }
242 return slink;
243 }
244 succ = stree->onsuccpath; // start at node on successor path
245 DEBUGCODE(1,stree->largelinklinkwork++);
246 while(!NILPTR(succ)) // linear retrieval of suffix links
247 {
248 DEBUGCODE(1,stree->largelinkwork++);
249 if(ISLEAF(succ))
250 {
251 succ = LEAFBROTHERVAL(stree->leaftab[GETLEAFINDEX(succ)]);
252 } else
253 {
254 succ = GETBROTHER(stree->branchtab + GETBRANCHINDEX(succ));
255 }
256 DEBUGCODE(1,stree->largelinkwork++);
257 }
258 return succ & MAXINDEX; // get only the index
259 }
260 #endif
261
262 #ifdef STREELARGE
263
setdepthheadposition(Suffixtree * stree,Uint depth,Uint headposition)264 static void setdepthheadposition(Suffixtree *stree,Uint depth,
265 Uint headposition)
266 {
267 DEBUG4(FUNCLEVEL,">%s(%lu)=(depth=%lu,headposition=%lu)\n",
268 __func__,
269 (Showuint) stree->nextfreebranchnum,
270 (Showuint) depth,
271 (Showuint) headposition);
272
273 if(ISSMALLDEPTH(depth))
274 {
275 *(stree->nextfreebranch+2) = depth | SMALLDEPTHMARK;
276 } else
277 {
278 *(stree->nextfreebranch+2) = depth;
279 }
280 *(stree->nextfreebranch+3) = headposition;
281 }
282
setsuffixlink(Suffixtree * stree,Uint slink)283 static void setsuffixlink(Suffixtree *stree,Uint slink)
284 {
285 Uint slinkhalf = slink >> 1;
286
287 DEBUG3(FUNCLEVEL,">%s(%lu)=%lu\n",
288 __func__,
289 (Showuint) stree->nextfreebranchnum,
290 (Showuint) slink);
291 *(stree->setlink) = (*(stree->setlink) & EXTRAPATT) | (slink | NILBIT);
292 if(ISSMALLDEPTH(stree->currentdepth))
293 {
294 *(stree->nextfreebranch+2)
295 |= ((slinkhalf << SMALLDEPTHBITS) & LOWERLINKPATT);
296 if(stree->nextfreebranchnum & (~LOWERLINKSIZE))
297 {
298 *(stree->nextfreebranch+3)
299 |= ((slinkhalf << SHIFTMIDDLE) & MIDDLELINKPATT);
300 if(stree->nextfreebranchnum & HIGHERSIZE)
301 {
302 stree->leaftab[stree->nextfreeleafnum-1] |=
303 ((slinkhalf << SHIFTHIGHER) & EXTRAPATT);
304 }
305 }
306 }
307 }
308
getlargelinkconstruction(Suffixtree * stree)309 static Uint getlargelinkconstruction(Suffixtree *stree)
310 {
311 SYMBOL secondchar;
312 Uint succ, slinkhalf, headnodenum;
313
314 DEBUG2(FUNCLEVEL,">%s(%lu)\n",
315 __func__,
316 (Showuint) BRADDR2NUM(stree,stree->headnode));
317 DEBUGCODE(1,stree->largelinks++);
318 if(stree->headnodedepth == 1)
319 {
320 return 0; // link refers to root
321 }
322 if(stree->headnodedepth == 2) // determine second char of egde
323 {
324 if(stree->headend == NULL)
325 {
326 secondchar = *(stree->tailptr-1);
327 } else
328 {
329 secondchar = *(stree->tailptr - (stree->headend - stree->headstart + 2));
330 }
331 return stree->rootchildren[(Uint) secondchar];
332 }
333 if(ISSMALLDEPTH(stree->headnodedepth)) // retrieve link in constant time
334 {
335 slinkhalf = (*(stree->headnode+2) & LOWERLINKPATT) >> SMALLDEPTHBITS;
336 headnodenum = BRADDR2NUM(stree,stree->headnode);
337 if(headnodenum & (~LOWERLINKSIZE))
338 {
339 slinkhalf |= ((*(stree->headnode+3) & MIDDLELINKPATT) >> SHIFTMIDDLE);
340 if(headnodenum & HIGHERSIZE)
341 {
342 slinkhalf
343 |= ((stree->leaftab[GETHEADPOS(stree->headnode)] & EXTRAPATT)
344 >> SHIFTHIGHER);
345 }
346 }
347 return slinkhalf << 1;
348 }
349 succ = stree->onsuccpath;
350 DEBUGCODE(1,stree->largelinklinkwork++);
351 while(!NILPTR(succ)) // linear retrieval of suffix link
352 {
353 DEBUGCODE(1,stree->largelinkwork++);
354 if(ISLEAF(succ))
355 {
356 succ = LEAFBROTHERVAL(stree->leaftab[GETLEAFINDEX(succ)]);
357 } else
358 {
359 succ = GETBROTHER(stree->branchtab + GETBRANCHINDEX(succ));
360 }
361 DEBUGCODE(1,stree->largelinkwork++);
362 }
363 return succ & MAXINDEX; // get only the index
364 }
365 #endif
366
367 #ifdef STREEHUGE
368
getlargelinkconstruction(Suffixtree * stree)369 static Uint getlargelinkconstruction(Suffixtree *stree)
370 {
371 SYMBOL secondchar;
372
373 DEBUG2(FUNCLEVEL,">%s(%lu)\n",
374 __func__,
375 (Showuint) BRADDR2NUM(stree,stree->headnode));
376 if(stree->headnodedepth == 1)
377 {
378 return 0; // link refers to root
379 }
380 if(stree->headnodedepth == 2) // determine second char of egde
381 {
382 if(stree->headend == NULL)
383 {
384 secondchar = *(stree->tailptr-1);
385 } else
386 {
387 secondchar = *(stree->tailptr - (stree->headend - stree->headstart + 2));
388 }
389 return stree->rootchildren[(Uint) secondchar];
390 }
391 return *(stree->headnode+4);
392 }
393 #endif
394
395 //\subsection{Insertion of Nodes}
396
397 /*
398 The function \emph{insertleaf} inserts a leaf and a corresponding leaf
399 edge outgoing from the current \emph{headnode}.
400 \emph{insertprev} refers to the node to the left of the leaf to be inserted.
401 If the leaf is the first child, then \emph{insertprev} is
402 \texttt{UNDEFINEDREFERENCE}.
403 */
404
insertleaf(Suffixtree * stree)405 static void insertleaf(Suffixtree *stree)
406 {
407 Uint *ptr, newleaf;
408
409 DEBUGDEFAULT;
410 newleaf = MAKELEAF(stree->nextfreeleafnum);
411 DEBUGCODE(1,stree->insertleafcalls++);
412 if(stree->headnodedepth == 0) // head is the root
413 {
414 if(stree->tailptr != stree->sentinel) // no \$-edge initially
415 {
416 stree->rootchildren[(Uint) *(stree->tailptr)] = newleaf;
417 *(stree->nextfreeleafptr) = VALIDINIT;
418 DEBUG2(4,"%c-edge from root points to leaf %lu\n",
419 *(stree->tailptr),(Showuint) stree->nextfreeleafnum);
420 }
421 } else
422 {
423 if (stree->insertprev == UNDEFINEDREFERENCE) // newleaf = first child
424 {
425 *(stree->nextfreeleafptr) = GETCHILD(stree->headnode);
426 SETCHILD(stree->headnode,newleaf);
427 } else
428 {
429 if(ISLEAF(stree->insertprev)) // previous node is leaf
430 {
431 ptr = stree->leaftab + GETLEAFINDEX(stree->insertprev);
432 *(stree->nextfreeleafptr) = LEAFBROTHERVAL(*ptr);
433 SETLEAFBROTHER(ptr,newleaf);
434 } else // previous node is branching node
435 {
436 ptr = stree->branchtab + GETBRANCHINDEX(stree->insertprev);
437 *(stree->nextfreeleafptr) = GETBROTHER(ptr);
438 SETBROTHER(ptr,newleaf);
439 }
440 }
441 }
442 RECALLSUCC(newleaf); // recall node on successor path of \emph{headnode}
443 stree->nextfreeleafnum++;
444 stree->nextfreeleafptr++;
445 }
446
447 /*
448 The function \emph{insertbranch} inserts a branching node and splits
449 the appropriate edges, according to the canonical location of the current
450 head. \emph{insertprev} refers to the node to the left of the branching
451 node to be inserted. If the branching node is the first child, then
452 \emph{insertprev} is \texttt{UNDEFINEDREFERENCE}. The edge to be split ends
453 in the node referred to by \emph{insertnode}.
454 */
455
insertbranchnode(Suffixtree * stree)456 static void insertbranchnode(Suffixtree *stree)
457 {
458 Uint *ptr, *insertnodeptr, *insertleafptr, insertnodeptrbrother;
459
460 DEBUGDEFAULT;
461 spaceforbranchtab(stree);
462 if(stree->headnodedepth == 0) // head is the root
463 {
464 stree->rootchildren[(Uint) *(stree->headstart)]
465 = MAKEBRANCHADDR(stree->nextfreebranchnum);
466 *(stree->nextfreebranch+1) = VALIDINIT;
467 DEBUG2(4,"%c-edge from root points to branch node with address %lu\n",
468 *(stree->headstart),(Showuint) stree->nextfreebranchnum);
469 } else
470 {
471 if(stree->insertprev == UNDEFINEDREFERENCE) // new branch = first child
472 {
473 SETCHILD(stree->headnode,MAKEBRANCHADDR(stree->nextfreebranchnum));
474 } else
475 {
476 if(ISLEAF(stree->insertprev)) // new branch = right brother of leaf
477 {
478 ptr = stree->leaftab + GETLEAFINDEX(stree->insertprev);
479 SETLEAFBROTHER(ptr,MAKEBRANCHADDR(stree->nextfreebranchnum));
480 } else // new branch = brother of branching node
481 {
482 SETBROTHER(stree->branchtab + GETBRANCHINDEX(stree->insertprev),
483 MAKEBRANCHADDR(stree->nextfreebranchnum));
484 }
485 }
486 }
487 if(ISLEAF(stree->insertnode)) // split edge is leaf edge
488 {
489 DEBUGCODE(1,stree->splitleafedge++);
490 insertleafptr = stree->leaftab + GETLEAFINDEX(stree->insertnode);
491 if (stree->tailptr == stree->sentinel ||
492 *(stree->headend+1) < *(stree->tailptr))
493 {
494 SETNEWCHILDBROTHER(MAKELARGE(stree->insertnode), // first child=oldleaf
495 LEAFBROTHERVAL(*insertleafptr)); // inherit brother
496 RECALLNEWLEAFADDRESS(stree->nextfreeleafptr);
497 SETLEAFBROTHER(insertleafptr, // new leaf =
498 MAKELEAF(stree->nextfreeleafnum)); // right brother of old leaf
499 } else
500 {
501 SETNEWCHILDBROTHER(MAKELARGELEAF(stree->nextfreeleafnum), // first child=new leaf
502 LEAFBROTHERVAL(*insertleafptr)); // inherit brother
503 *(stree->nextfreeleafptr) = stree->insertnode; // old leaf = right brother of of new leaf
504 RECALLLEAFADDRESS(insertleafptr);
505 }
506 } else // split edge leads to branching node
507 {
508 DEBUGCODE(1,stree->splitinternaledge++);
509 insertnodeptr = stree->branchtab + GETBRANCHINDEX(stree->insertnode);
510 insertnodeptrbrother = GETBROTHER(insertnodeptr);
511 if (stree->tailptr == stree->sentinel ||
512 *(stree->headend+1) < *(stree->tailptr))
513 {
514 SETNEWCHILDBROTHER(MAKELARGE(stree->insertnode), // first child new branch
515 insertnodeptrbrother); // inherit right brother
516 RECALLNEWLEAFADDRESS(stree->nextfreeleafptr);
517 SETBROTHER(insertnodeptr,MAKELEAF(stree->nextfreeleafnum)); // new leaf = brother of old branch
518 } else
519 {
520 SETNEWCHILDBROTHER(MAKELARGELEAF(stree->nextfreeleafnum), // first child is new leaf
521 insertnodeptrbrother); // inherit brother
522 *(stree->nextfreeleafptr) = stree->insertnode; // new branch is brother of new leaf
523 RECALLBRANCHADDRESS(insertnodeptr);
524 }
525 }
526 SETNILBIT;
527 RECALLSUCC(MAKEBRANCHADDR(stree->nextfreebranchnum)); // node on succ. path
528 stree->currentdepth = stree->headnodedepth + (Uint) (stree->headend-stree->headstart+1);
529 SETDEPTHHEADPOS(stree->currentdepth,stree->nextfreeleafnum);
530 SETMAXBRANCHDEPTH(stree->currentdepth);
531 stree->nextfreeleafnum++;
532 stree->nextfreeleafptr++;
533 DEBUGCODE(1,stree->nodecount++);
534 }
535
536 //\subsection{Finding the Head-Locations}
537
538 /*
539 The function \emph{rescan} finds the location of the current head.
540 In order to scan down the tree, it suffices to look at the first
541 character of each edge.
542 */
543
rescan(Suffixtree * stree)544 static void rescan (Suffixtree *stree)
545 {
546 Uint *nodeptr, *largeptr = NULL, distance = 0, node, prevnode,
547 nodedepth, edgelen, wlen, leafindex, headposition;
548 SYMBOL headchar, edgechar;
549
550 DEBUGDEFAULT;
551 if(stree->headnodedepth == 0) // head is the root
552 {
553 headchar = *(stree->headstart); // headstart is assumed to be not empty
554 node = stree->rootchildren[(Uint) headchar];
555 if(ISLEAF(node)) // stop if successor is leaf
556 {
557 stree->insertnode = node;
558 return;
559 }
560 nodeptr = stree->branchtab + GETBRANCHINDEX(node);
561 GETONLYDEPTH(nodedepth,nodeptr);
562 wlen = (Uint) (stree->headend - stree->headstart + 1);
563 if(nodedepth > wlen) // cannot reach the successor node
564 {
565 stree->insertnode = node;
566 return;
567 }
568 stree->headnode = nodeptr; // go to successor node
569 stree->headnodedepth = nodedepth;
570 if(nodedepth == wlen) // location has been scanned
571 {
572 stree->headend = NULL;
573 return;
574 }
575 (stree->headstart) += nodedepth;
576 }
577 while(True) // \emph{headnode} is not the root
578 {
579 headchar = *(stree->headstart); // \emph{headstart} is assumed to be nonempty
580 prevnode = UNDEFINEDREFERENCE;
581 node = GETCHILD(stree->headnode);
582 while(True) // traverse the list of successors
583 {
584 if(ISLEAF(node)) // successor is leaf
585 {
586 leafindex = GETLEAFINDEX(node);
587 edgechar = stree->text[stree->headnodedepth + leafindex];
588 if(edgechar == headchar) // correct edge found
589 {
590 stree->insertnode = node;
591 stree->insertprev = prevnode;
592 return;
593 }
594 prevnode = node;
595 node = LEAFBROTHERVAL(stree->leaftab[leafindex]);
596 } else // successor is branch node
597 {
598 nodeptr = stree->branchtab + GETBRANCHINDEX(node);
599 GETONLYHEADPOS(headposition,nodeptr);
600 edgechar = stree->text[stree->headnodedepth + headposition];
601 if(edgechar == headchar) // correct edge found
602 {
603 break;
604 }
605 prevnode = node;
606 node = GETBROTHER(nodeptr);
607 }
608 }
609
610 GETDEPTHAFTERHEADPOS(nodedepth,nodeptr); // get info about succ node
611 edgelen = nodedepth - stree->headnodedepth;
612 wlen = (Uint) (stree->headend - stree->headstart + 1);
613 if(edgelen > wlen) // cannot reach the succ node
614 {
615 stree->insertnode = node;
616 stree->insertprev = prevnode;
617 return;
618 }
619 stree->headnode = nodeptr; // go to the successor node
620 stree->headnodedepth = nodedepth;
621 if(edgelen == wlen) // location is found
622 {
623 stree->headend = NULL;
624 return;
625 }
626 (stree->headstart) += edgelen;
627 }
628 }
629
630 /*
631 The function \emph{taillcp} computes the length of the longest common prefix
632 of two strings. The first string is between pointers \emph{start1} and
633 \emph{end1}. The second string is the current tail, which is between the
634 pointers \emph{tailptr} and \emph{sentinel}.
635 */
636
taillcp(Suffixtree * stree,SYMBOL * start1,SYMBOL * end1)637 static Uint taillcp(Suffixtree *stree,SYMBOL *start1, SYMBOL *end1)
638 {
639 SYMBOL *ptr1 = start1, *ptr2 = stree->tailptr + 1;
640 while(ptr1 <= end1 && ptr2 < stree->sentinel && *ptr1 == *ptr2)
641 {
642 ptr1++;
643 ptr2++;
644 }
645 return (Uint) (ptr1-start1);
646 }
647
648 /*
649 The function \emph{scanprefix} scans a prefix of the current tail
650 down from a given node.
651 */
652
scanprefix(Suffixtree * stree)653 static void scanprefix(Suffixtree *stree)
654 {
655 Uint *nodeptr = NULL, *largeptr = NULL, leafindex, nodedepth, edgelen, node,
656 distance = 0, prevnode, prefixlen, headposition;
657 SYMBOL *leftborder = (SYMBOL *) NULL, tailchar, edgechar = 0;
658
659 DEBUGDEFAULT;
660 if(stree->headnodedepth == 0) // headnode is root
661 {
662 if(stree->tailptr == stree->sentinel) // there is no \$-edge
663 {
664 stree->headend = NULL;
665 return;
666 }
667 tailchar = *(stree->tailptr);
668 if((node = stree->rootchildren[(Uint) tailchar]) == UNDEFINEDREFERENCE)
669 {
670 stree->headend = NULL;
671 return;
672 }
673 if(ISLEAF(node)) // successor edge is leaf, compare tail and leaf edge label
674 {
675 leftborder = stree->text + GETLEAFINDEX(node);
676 prefixlen = 1 + taillcp(stree,leftborder+1,stree->sentinel-1);
677 (stree->tailptr) += prefixlen;
678 stree->headstart = leftborder;
679 stree->headend = leftborder + (prefixlen-1);
680 stree->insertnode = node;
681 return;
682 }
683 nodeptr = stree->branchtab + GETBRANCHINDEX(node);
684 GETBOTH(nodedepth,headposition,nodeptr); // get info for branch node
685 leftborder = stree->text + headposition;
686 prefixlen = 1 + taillcp(stree,leftborder+1,leftborder + nodedepth - 1);
687 (stree->tailptr)+= prefixlen;
688 if(nodedepth > prefixlen) // cannot reach the successor, fall out of tree
689 {
690 stree->headstart = leftborder;
691 stree->headend = leftborder + (prefixlen-1);
692 stree->insertnode = node;
693 return;
694 }
695 stree->headnode = nodeptr;
696 stree->headnodedepth = nodedepth;
697 }
698 while(True) // \emph{headnode} is not the root
699 {
700 prevnode = UNDEFINEDREFERENCE;
701 node = GETCHILD(stree->headnode);
702 if(stree->tailptr == stree->sentinel) // \$-edge
703 {
704 do // there is no \$-edge, so find last successor, of which it becomes right brother
705 {
706 prevnode = node;
707 if(ISLEAF(node))
708 {
709 node = LEAFBROTHERVAL(stree->leaftab[GETLEAFINDEX(node)]);
710 } else
711 {
712 node = GETBROTHER(stree->branchtab + GETBRANCHINDEX(node));
713 }
714 } while(!NILPTR(node));
715 stree->insertnode = NILBIT;
716 stree->insertprev = prevnode;
717 stree->headend = NULL;
718 return;
719 }
720 tailchar = *(stree->tailptr);
721
722 do // find successor edge with firstchar = tailchar
723 {
724 if(ISLEAF(node)) // successor is leaf
725 {
726 leafindex = GETLEAFINDEX(node);
727 leftborder = stree->text + (stree->headnodedepth + leafindex);
728 if((edgechar = *leftborder) >= tailchar) // edge will not come later
729 {
730 break;
731 }
732 prevnode = node;
733 node = LEAFBROTHERVAL(stree->leaftab[leafindex]);
734 } else // successor is branch node
735 {
736 nodeptr = stree->branchtab + GETBRANCHINDEX(node);
737 GETONLYHEADPOS(headposition,nodeptr);
738 leftborder = stree->text + (stree->headnodedepth + headposition);
739 if((edgechar = *leftborder) >= tailchar) // edge will not come later
740 {
741 break;
742 }
743 prevnode = node;
744 node = GETBROTHER(nodeptr);
745 }
746 } while(!NILPTR(node));
747 if(NILPTR(node) || edgechar > tailchar) // edge not found
748 {
749 stree->insertprev = prevnode; // new edge will become brother of this
750 stree->headend = NULL;
751 return;
752 }
753 if(ISLEAF(node)) // correct edge is leaf edge, compare its label with tail
754 {
755 prefixlen = 1 + taillcp(stree,leftborder+1,stree->sentinel-1);
756 (stree->tailptr) += prefixlen;
757 stree->headstart = leftborder;
758 stree->headend = leftborder + (prefixlen-1);
759 stree->insertnode = node;
760 stree->insertprev = prevnode;
761 return;
762 }
763 GETDEPTHAFTERHEADPOS(nodedepth,nodeptr); // we already know headposition
764 edgelen = nodedepth - stree->headnodedepth;
765 prefixlen = 1 + taillcp(stree,leftborder+1,leftborder + edgelen - 1);
766 (stree->tailptr) += prefixlen;
767 if(edgelen > prefixlen) // cannot reach next node
768 {
769 stree->headstart = leftborder;
770 stree->headend = leftborder + (prefixlen-1);
771 stree->insertnode = node;
772 stree->insertprev = prevnode;
773 return;
774 }
775 stree->headnode = nodeptr;
776 stree->headnodedepth = nodedepth;
777 }
778 }
779
780 //\subsection{Completion and Initialization}
781
782 /*
783 The function \emph{completelarge} is called whenever a large node
784 is inserted. It basically sets the appropriate distance values of the small
785 nodes of the current chain.
786 */
787
completelarge(Suffixtree * stree)788 static void completelarge(Suffixtree *stree)
789 {
790 Uint distance, *backwards;
791
792 DEBUGDEFAULT;
793 if(stree->smallnotcompleted > 0)
794 {
795 backwards = stree->nextfreebranch;
796 for(distance = 1; distance <= stree->smallnotcompleted; distance++)
797 {
798 backwards -= SMALLINTS;
799 SETDISTANCE(backwards,distance);
800 }
801 stree->smallnotcompleted = 0;
802 stree->chainstart = NULL;
803 }
804 stree->nextfreebranch += LARGEINTS;
805 stree->nextfreebranchnum += LARGEINTS;
806 stree->largenode++;
807 }
808
809 /*
810 The function \emph{linkrootchildren} constructs the successor chain
811 for the children of the root. This is done at the end of the algorithm
812 in one sweep over table \emph{rootchildren}.
813 */
814
linkrootchildren(Suffixtree * stree)815 static void linkrootchildren(Suffixtree *stree)
816 {
817 Uint *rcptr, *prevnodeptr, prev = UNDEFINEDREFERENCE;
818
819 DEBUGDEFAULT;
820 stree->alphasize = 0;
821 for(rcptr = stree->rootchildren;
822 rcptr <= stree->rootchildren + LARGESTCHARINDEX; rcptr++)
823 {
824 if(*rcptr != UNDEFINEDREFERENCE)
825 {
826 stree->alphasize++;
827 if(prev == UNDEFINEDREFERENCE)
828 {
829 SETCHILD(stree->branchtab,MAKELARGE(*rcptr));
830 } else
831 {
832 if(ISLEAF(prev))
833 {
834 stree->leaftab[GETLEAFINDEX(prev)] = *rcptr;
835 } else
836 {
837 prevnodeptr = stree->branchtab + GETBRANCHINDEX(prev);
838 SETBROTHER(prevnodeptr,*rcptr);
839 }
840 }
841 prev = *rcptr;
842 }
843 }
844 if(ISLEAF(prev))
845 {
846 stree->leaftab[GETLEAFINDEX(prev)] = MAKELEAF(stree->textlen);
847 } else
848 {
849 prevnodeptr = stree->branchtab + GETBRANCHINDEX(prev);
850 SETBROTHER(prevnodeptr,MAKELEAF(stree->textlen));
851 }
852 stree->leaftab[stree->textlen] = NILBIT;
853 }
854
855 /*
856 \newpage
857 \emph{initSuffixtree} allocates and initializes the data structures for
858 McCreight's Algorithm.
859 */
860
initSuffixtree(Suffixtree * stree,SYMBOL * text,Uint textlen)861 static void initSuffixtree(Suffixtree *stree,SYMBOL *text,Uint textlen)
862 {
863 Uint i, *ptr;
864
865 DEBUG1(4,">%s\n",__func__);
866 DEBUG1(2,"# MULTBYSMALLINTS(textlen+1)=%lu\n",
867 (Showuint) MULTBYSMALLINTS(textlen+1));
868 DEBUG1(2,"# STARTFACTOR=%.2f\n",STARTFACTOR);
869 DEBUG1(2,"# MINEXTRA=%lu\n",(Showuint) MINEXTRA);
870 stree->currentbranchtabsize
871 = (Uint) (STARTFACTOR * MULTBYSMALLINTS(textlen+1));
872 if(stree->currentbranchtabsize < MINEXTRA)
873 {
874 stree->currentbranchtabsize = MULTBYSMALLINTS(MINEXTRA);
875 }
876 stree->leaftab = ALLOCSPACE(NULL,Uint,textlen+2);
877 stree->rootchildren = ALLOCSPACE(NULL,Uint,LARGESTCHARINDEX + 1);
878 stree->branchtab = ALLOCSPACE(NULL,Uint,stree->currentbranchtabsize);
879
880 stree->text = stree->tailptr = text;
881 stree->textlen = textlen;
882 stree->sentinel = text + textlen;
883 stree->firstnotallocated
884 = stree->branchtab + stree->currentbranchtabsize - LARGEINTS;
885 stree->headnode = stree->nextfreebranch = stree->branchtab;
886 stree->headend = NULL;
887 stree->headnodedepth = stree->maxbranchdepth = 0;
888 for(ptr=stree->rootchildren; ptr<=stree->rootchildren+LARGESTCHARINDEX; ptr++)
889 {
890 *ptr = UNDEFINEDREFERENCE;
891 }
892 for(i=0; i<LARGEINTS; i++)
893 {
894 stree->branchtab[i] = 0;
895 }
896 stree->nextfreebranch = stree->branchtab;
897 stree->nextfreebranchnum = 0;
898 SETDEPTHHEADPOS(0,0);
899 SETNEWCHILDBROTHER(MAKELARGELEAF(0),0);
900 SETBRANCHNODEOFFSET;
901 stree->rootchildren[(Uint) *text] = MAKELEAF(0);
902 stree->leaftab[0] = VALIDINIT;
903 DEBUG2(4,"%c-edge from root points to leaf %lu\n",*text,(Showuint) 0);
904 stree->leafcounts = NULL;
905 stree->nextfreeleafnum = 1;
906 stree->nextfreeleafptr = stree->leaftab + 1;
907 stree->nextfreebranch = stree->branchtab + LARGEINTS;
908 stree->nextfreebranchnum = LARGEINTS;
909 stree->insertnode = stree->insertprev = UNDEFINEDREFERENCE;
910 stree->smallnotcompleted = 0;
911 stree->chainstart = NULL;
912 stree->largenode = stree->smallnode = 0;
913
914 //\Ignore{
915
916 #ifdef DEBUG
917 stree->showsymbolstree = NULL;
918 stree->alphabet = NULL;
919 stree->nodecount = 1;
920 stree->splitleafedge =
921 stree->splitinternaledge =
922 stree->artificial =
923 stree->multiplications = 0;
924 stree->insertleafcalls = 1;
925 stree->maxset = stree->branchtab + LARGEINTS - 1;
926 stree->largelinks = stree->largelinkwork = stree->largelinklinkwork = 0;
927 #endif
928
929 //}
930
931 }
932
freestree(Suffixtree * stree)933 void freestree(Suffixtree *stree)
934 {
935 FREESPACE(stree->leaftab);
936 FREESPACE(stree->rootchildren);
937 FREESPACE(stree->branchtab);
938 if(stree->nonmaximal != NULL)
939 {
940 FREESPACE(stree->nonmaximal);
941 }
942 if(stree->leafcounts != NULL)
943 {
944 FREESPACE(stree->leafcounts);
945 }
946 }
947
948 //\subsection{Computing the Suffix Tree}
949
950 /*
951 \emph{constructstree} implements McCreight Algorithm to compute the suffix
952 tree for a \texttt{text} of length \texttt{textlen}. For explanations, see
953 \cite{KUR:1998}. The number \((i)\) refers to the cases of Section 6 in
954 \cite{KUR:1998}.
955 */
956
957 #define CONSTRUCT Sint constructstree(Suffixtree *stree,SYMBOL *text,Uint textlen)
958 #define DECLAREEXTRA stree->nonmaximal = NULL
959 #define COMPLETELARGEFIRST completelarge(stree)
960 #define COMPLETELARGESECOND completelarge(stree)
961 #define PROCESSHEAD /* Nothing */
962 #define CHECKSTEP /* Nothing */
963 #define FINALPROGRESS /* Nothing */
964
965 #include "construct.gen"
966
967 #undef CONSTRUCT
968 #undef DECLAREEXTRA
969 #undef COMPLETELARGEFIRST
970 #undef COMPLETELARGESECOND
971 #undef PROCESSHEAD
972 #undef CHECKSTEP
973 #undef FINALPROGRESS
974
975 /* --------------------------------------------------- */
976
977 #define CONSTRUCT Sint constructmarkmaxstree(Suffixtree *stree,SYMBOL *text,Uint textlen)
978
979 #define DECLAREEXTRA Uint distance, headposition = 0, *largeptr,\
980 tabsize = 1 + DIVWORDSIZE(textlen+1), *tabptr;\
981 stree->nonmaximal = ALLOCSPACE(NULL,Uint,tabsize);\
982 for(tabptr = stree->nonmaximal;\
983 tabptr < stree->nonmaximal + tabsize;\
984 tabptr++)\
985 {\
986 *tabptr = 0;\
987 }
988
989 #define COMPLETELARGEFIRST\
990 GETONLYHEADPOS(headposition,stree->headnode);\
991 DEBUG2(3,"j=%lu: slink->%lu\n",(Showuint) stree->nextfreeleafnum,\
992 (Showuint) headposition);\
993 SETIBIT(stree->nonmaximal,headposition);\
994 completelarge(stree);
995
996 #define COMPLETELARGESECOND\
997 DEBUG2(3,"j=%lu: slink->%lu\n",(Showuint) stree->nextfreeleafnum,\
998 (Showuint) stree->nextfreeleafnum);\
999 SETIBIT(stree->nonmaximal,stree->nextfreeleafnum);\
1000 completelarge(stree)
1001
1002 #define PROCESSHEAD /* Nothing */
1003 #define CHECKSTEP /* Nothing */
1004 #define FINALPROGRESS /* Nothing */
1005
1006 #include "construct.gen"
1007
1008 #undef CONSTRUCT
1009 #undef DECLAREEXTRA
1010 #undef COMPLETELARGEFIRST
1011 #undef COMPLETELARGESECOND
1012 #undef PROCESSHEAD
1013 #undef CHECKSTEP
1014 #undef FINALPROGRESS
1015
1016 /* --------------------------------------------------- */
1017
1018 #define CONSTRUCT Sint constructheadstree(Suffixtree *stree,SYMBOL *text,Uint textlen,void(*processhead)(Suffixtree *,Uint,void *),void *processheadinfo)
1019
1020 #define DECLAREEXTRA stree->nonmaximal = NULL
1021 #define COMPLETELARGEFIRST completelarge(stree)
1022 #define COMPLETELARGESECOND completelarge(stree)
1023 #define PROCESSHEAD processhead(stree,stree->nextfreeleafnum,\
1024 processheadinfo)
1025
1026 #define CHECKSTEP /* Nothing */
1027 #define FINALPROGRESS /* Nothing */
1028
1029 #include "construct.gen"
1030
1031 #undef CONSTRUCT
1032 #undef DECLAREEXTRA
1033 #undef COMPLETELARGEFIRST
1034 #undef COMPLETELARGESECOND
1035 #undef PROCESSHEAD
1036 #undef CHECKSTEP
1037 #undef FINALPROGRESS
1038
1039 /* --------------------------------------------------- */
1040
1041 #define LEASTSHOWPROGRESS 100000
1042 #define NUMOFCALLS 100
1043
1044 #define CONSTRUCT Sint constructprogressstree(Suffixtree *stree,SYMBOL *text,Uint textlen,void (*progress)(Uint,void *),void (*finalprogress)(void *),void *info)
1045 #define DECLAREEXTRA\
1046 Uint j = 0, step, nextstep;\
1047 stree->nonmaximal = NULL;\
1048 step = textlen/NUMOFCALLS;\
1049 nextstep = (textlen >= LEASTSHOWPROGRESS) ? step : (textlen+1);\
1050 if(progress == NULL && textlen >= LEASTSHOWPROGRESS)\
1051 {\
1052 fprintf(stderr,"# process %lu characters per dot\n",\
1053 (Showuint) textlen/NUMOFCALLS);\
1054 }
1055
1056 #define COMPLETELARGEFIRST completelarge(stree)
1057 #define COMPLETELARGESECOND completelarge(stree)
1058 #define PROCESSHEAD /* Nothing */
1059
1060 #define CHECKSTEP if(j == nextstep)\
1061 {\
1062 if(progress == NULL)\
1063 {\
1064 if(nextstep == step)\
1065 {\
1066 fputc('#',stderr);\
1067 }\
1068 fputc('.',stderr);\
1069 fflush(stdout);\
1070 } else\
1071 {\
1072 progress(nextstep,info);\
1073 }\
1074 nextstep += step;\
1075 }\
1076 j++
1077
1078 #define FINALPROGRESS if(textlen >= LEASTSHOWPROGRESS)\
1079 {\
1080 if(finalprogress == NULL)\
1081 {\
1082 fputc('\n',stderr);\
1083 } else\
1084 {\
1085 finalprogress(info);\
1086 }\
1087 }
1088
1089 #include "construct.gen"
1090
1091 #undef CONSTRUCT
1092 #undef DECLAREEXTRA
1093 #undef COMPLETELARGEFIRST
1094 #undef COMPLETELARGESECOND
1095 #undef PROCESSHEAD
1096 #undef CHECKSTEP
1097 #undef FINALPROGRESS
1098