1/* 2 Copyright (c) 2003 by Stefan Kurtz and The Institute for 3 Genomic Research. This is OSI Certified Open Source Software. 4 Please see the file LICENSE for licensing information and 5 the file ACKNOWLEDGEMENTS for names of contributors to the 6 code base. 7*/ 8 9Sint constructstree(Suffixtree *stree,SYMBOL *text,Uint textlen) 10{ 11 12} 13 14/*EE 15 In some applications it is convenient to perform some extra computations 16 during the suffix tree construction. For this purpose, there are 17 two variations of \texttt{constructstree}: 18 19 The following function 20 additionally marks all branching nodes which are maximal. 21 A branching node \(\overline{v}\) is \emph{maximal} if 22 and only if there is 23 no suffix link to \(\overline{v}\). The maximal nodes can be 24 enumerated using the function \texttt{overmaximalstree}, see Section 25 \ref{Traverse}. 26*/ 27 28Sint constructmarkmaxstree(Suffixtree *stree,SYMBOL *text, 29 Uint textlen) 30{ 31 32} 33 34/*EE 35 The following variation 36 constructs the suffix tree and calls for each \(j\in[0,n]\) the 37 function \texttt{processhead} with the second argument being \(j\), 38 as soon as \(\HD{j}\) is computed. 39 The first argument of \texttt{processhead} is the current suffix tree, 40 and the third argument is the pointer \texttt{processheadinfo}. 41*/ 42 43Sint constructheadstree(Suffixtree *stree,SYMBOL *text, 44 Uint textlen, 45 void(*processhead)(Suffixtree *,Uint, 46 void *), 47 void *processheadinfo) 48{ 49} 50 51/*EE 52 The following variation 53 constructs the suffix tree for \texttt{text}. Additionally, whenever 54 \(textlen>\Numofcalls\), the function 55 \texttt{progress} is called after \(i\cdot textlen/\Numofcalls\) 56 construction 57 steps, for \(i\in[1,\Numofcalls]\). Moreover, the function 58 \texttt{finalprogress} is called at the end of the suffix tree 59 construction. When they are called, both functions are supplied 60 with the argument \texttt{info}. 61*/ 62 63Sint constructprogressstree(Suffixtree *stree,SYMBOL *text, 64 Uint textlen, 65 void (*progress)(Uint,void *), 66 void (*finalprogress)(void *), 67 void *info) 68{ 69 70} 71 72/*EE 73 For example, one could define \texttt{progress} and \texttt{finalprogress} 74 as follows, in order to write a line of 79 dots on the given file pointer, 75 to show the progress of the suffix tree construction: 76*/ 77 78void progresswithdot(Uint nextstep,void *info) 79{ 80 fputc('.',stderr); 81 fflush(stderr); 82} 83 84void finalprogress(void *info) 85{ 86 fputc('\n',stderr); 87} 88 89/*EE 90 The following function stores for each branching node 91 \(\overline{v}\) the \emph{lefcount}, 92 i.e.\ the number of leafs in the subtree below \(\overline{v}\). 93 The computation of the leafcounts is done in linear time. It can be 94 retrieved using the function \texttt{getleafcountstree}, described 95 below. 96*/ 97 98void addleafcountsstree(Suffixtree *stree) 99{ 100 101} 102 103/*EE 104 The following function frees the space allocated for \texttt{stree}. 105*/ 106 107void freestree(Suffixtree *stree) 108{ 109 110} 111 112/*EE 113 The following function 114 returns the maximal length of an input string allowed for the 115 suffix tree construction. 116*/ 117 118Uint getmaxtextlenstree(void) 119{ 120 121} 122 123/*EE 124 \subsection{Accessing the Stored Information} 125 126 The following function stores the information for leaf 127 \texttt{lref} in the structure \texttt{leafinfo}. 128*/ 129 130void getleafinfostree(Suffixtree *stree,Leafinfo *leafinfo, 131 Lref lptr) 132{ 133 134} 135 136/*EE 137 The following function 138 stores the information for branching node \texttt{bnode} in 139 \texttt{branchinfo}. For efficiency reason, the function does not always 140 compute all 5 components stored for a branching node. Instead, the 141 components are delivered as specified by \texttt{whichinfo}. This 142 has the following possible bits: 143 \begin{center} 144 \begin{tabular}{l} 145 \texttt{ACCESSDEPTH}\\ 146 \texttt{ACCESSHEADPOS}\\ 147 \texttt{ACCESSSUFFIXLINK}\\ 148 \texttt{ACCESSFIRSTCHILD}\\ 149 \texttt{ACCESSBRANCHBROTHER} 150 \end{tabular} 151 \end{center} 152*/ 153 154void getbranchinfostree(Suffixtree *stree,Uint whichinfo, 155 Branchinfo *branchinfo,Bref btptr) 156{ 157 158} 159 160/*EE 161 The following function 162 shows the path for the node \texttt{bnode}. A symbol \(a\) is shown 163 by applying the function \texttt{showchar} to it and the 164 additional argument \texttt{info}. 165*/ 166 167void showpathstree(Suffixtree *stree,Bref bnode, 168 void (*showchar)(SYMBOL,void *),void *info) 169{ 170 171} 172 173/*EE 174 The following function stores a start position and the length of 175 \(\HD{j}\), for the current \(j\) in \texttt{str}. The start 176 position is smaller than \(j\). 177*/ 178 179void getheadstringstree(Suffixtree *stree,Stringtype *str) 180{ 181 182} 183 184/*EE 185 The following function returns \texttt{True}, if and only if the 186 branching node \texttt{bnode} has exactly two leaves, say with leaf 187 numbers \(i\) and \(j\), \(i<j\). In this case, \(i\) is stored in 188 \texttt{twoleaves->uint0} and \(j\) is stored in 189 \texttt{twoleaves->uint1}. 190*/ 191 192BOOL exactlytwoleavesstree(Suffixtree *stree, 193 PairUint *twoleaves,Bref start) 194{ 195 196} 197 198/*EE 199 The following function 200 returns the leafcount for a node referenced by \texttt{nodeptr}. 201 The function \texttt{addleafcountstree} must have been called 202 exactly once before, in order to compute the leafcounts. 203*/ 204 205Uint getleafcountstree(Suffixtree *stree,Bref nodeptr) 206{ 207 208} 209 210/*EE 211 The following function 212 computes a table \texttt{depthtab}, such that 213 \texttt{depthtab->spaceUint[i]} 214 holds the number of branching nodes in the given 215 suffix tree whose depth is 216 exactly \(i\). The maximal depth of any branching node is 217 \texttt{depthtab->nextfreeUint-1}. 218*/ 219 220void makedepthtabstree(ArrayUint *depthtab,Suffixtree *stree) 221{ 222 223} 224 225/*EE 226 \subsection{Traversing the Suffix Tree}\label{Traverse} 227 228 The following function 229 implements the function \emph{scanprefix}, restricted to the case that 230 the starting location is a node location. \texttt{stree} is the suffix 231 tree, \texttt{outloc} is the resulting location, \texttt{startnode} is 232 the branching node, the scanning starts from, and \texttt{left} 233 and \texttt{right} delimit the string, say \(s\), to be scanned. 234 The function returns \texttt{NULL}, if \(s\) is scanned completely. 235 Otherwise, it points to a suffix of \(s\). 236 \texttt{rescanlength} is the length of a prefix of 237 \(s\) that already occurs in the suffix tree. It is always safe 238 to set \texttt{rescanlength} to 0. 239*/ 240 241SYMBOL *scanprefixfromnodestree(Suffixtree *stree,Location *loc, 242 Bref btptr,SYMBOL *left, 243 SYMBOL *right,Uint rescanlength) 244{ 245 246} 247 248/*EE 249 The following function 250 implements the function \emph{scanprefix}. \texttt{stree} is the suffix 251 tree, \texttt{outloc} is the resulting location, \texttt{inloc} is the 252 location, the scanning starts from, and \texttt{left} 253 and \texttt{right} delimit the string, say \(s\), to be scanned. 254 The function returns \texttt{NULL}, if \(s\) is scanned completely. 255 Otherwise, it points to a suffix of \(s\). 256 \texttt{rescanlength} is the length of a prefix of 257 \(s\) that already occurs in the suffix tree. It is always safe 258 to set \texttt{rescanlength} to 0. 259*/ 260 261SYMBOL *scanprefixstree(Suffixtree *stree,Location *outloc, 262 Location *inloc,SYMBOL *left, 263 SYMBOL *right,Uint rescanlength) 264{ 265 266} 267 268/*EE 269 The following function 270 is similar to the function \emph{scanprefixfromnodestree}. It additionally 271 stores in the array \texttt{path} the sequence of branching nodes 272 that has been visited during the scan to location \texttt{loc}. 273 The array \texttt{path} may not be empty, in which case the list of 274 visited branching nodes is appended to it. 275*/ 276 277SYMBOL *findprefixpathfromnodestree(Suffixtree *stree, 278 ArrayPathinfo *path, 279 Location *loc,Bref btptr, 280 SYMBOL *left,SYMBOL *right, 281 Uint rescanlength) 282{ 283 284} 285 286/*EE 287 The following function 288 is similar to the function \emph{scanprefixstree}. It additionally 289 stores in the array \texttt{path} the sequence of branching nodes 290 that has been visited during the scan to location \texttt{inloc}. 291 The array \texttt{path} may not be empty, in which case the list 292 of visited branching nodes is appended to it. 293*/ 294 295SYMBOL *findprefixpathstree(Suffixtree *stree, 296 ArrayPathinfo *path, 297 Location *outloc, 298 Location *inloc, 299 SYMBOL *left,SYMBOL *right, 300 Uint rescanlength); 301 302/*EE 303 The following function 304 implements the function \emph{rescan}. \texttt{stree} is the suffix 305 tree, \texttt{outloc} is the resulting location, \texttt{startnode} 306 is the branching node, the scanning starts from, and \texttt{left} 307 and \texttt{right} delimit the string, say \(s\), to be rescanned. 308*/ 309 310void rescanstree(Suffixtree *stree,Location *loc, 311 Bref btptr,SYMBOL *left,SYMBOL *right) 312{ 313 314} 315 316/*EE 317 The following function 318 implements the function \emph{linkloc}. \texttt{stree} is the 319 suffix tree, \texttt{outloc} is the resulting location, and 320 \texttt{inloc} is the input location. 321*/ 322 323void linklocstree(Suffixtree *stree,Location *outloc, 324 Location *inloc) 325{ 326 327} 328 329/*EE 330 The following function 331 applies the function \texttt{processnode} to all branching nodes 332 of the suffix tree \texttt{stree}. The \emph{root} is skipped, if 333 and only if \texttt{skiproot} is \texttt{True}. The arguments of 334 \texttt{processnode} are as follows: The suffix tree \texttt{stree}, 335 the branching node, its depth, its head position, and the 336 pointer \texttt{info}. 337*/ 338 339void overallstree(Suffixtree *stree,BOOL skiproot, 340 void(*processnode)(Suffixtree *,Bref, 341 Uint,Uint,void *), 342 void *info) 343{ 344 345} 346 347/*EE 348 The following function 349 applies the function \texttt{processnode} to all maximal branching 350 nodes of the suffix tree \texttt{stree}. The arguments of 351 \texttt{processnode} are as follows: The suffix tree \texttt{stree}, 352 the reference to the branching node, its depth, its head position, 353 and the pointer \texttt{info}. 354*/ 355 356void overmaximalstree(Suffixtree *stree, 357 void(*processnode)(Suffixtree *,Bref, 358 Uint,Uint,void *), 359 void *info) 360{ 361 362} 363 364/*EE 365 The following function 366 enumerates all immediate successors of the branching node \texttt{bnode}. 367 Suppose the leaf with leaf number \texttt{j} is a 368 successor of \texttt{bnode}. Then the function \texttt{processleaf} with 369 arguments \texttt{stree}, \texttt{j}, and \texttt{info} is called. 370 Suppose the branching node \texttt{bsucc} is a successor of 371 \texttt{bnode}. Then the function \texttt{processbranch} is called with 372 arguments \texttt{stree}, \texttt{bsucc}, and \texttt{info}. 373*/ 374 375void oversuccsstree(Suffixtree *stree,Bref bnode, 376 void(*processleaf)(Suffixtree *,Uint,void *), 377 void(*processbranch)(Suffixtree *,Bref,void *), 378 void *info) 379{ 380} 381 382/*EE 383 The following function 384 performs a (possibly) limited depth first traversal of the suffix 385 tree rooted by \texttt{startnode}. 386 \texttt{startnode} can be a reference to 387 a leaf or a reference to a branching node. The nodes of the subtree are 388 enumerated in depth first left to right order. The argument 389 \texttt{processleaf} 390 is not allowed to be \texttt{NULL}. \texttt{processbranch1} and 391 \texttt{processbranch2} can either be both \texttt{NULL} or 392 both different from \texttt{NULL}. 393 \begin{itemize} 394 \item 395 Each time a leaf, say \(\overline{v}\), with leaf number 396 \texttt{j} is encountered, the function \texttt{processleaf} is called 397 with arguments \texttt{j}, \texttt{lca}, and 398 \texttt{info}. \texttt{lca} is the longest common ancestor of 399 \(\overline{v}\) and the previous leaf encountered during the traversal. 400 \texttt{lca} is \texttt{NULL}, if \(\overline{v}\) is the first leaf 401 encountered in the traversal. If \texttt{processleaf} returns a value 402 smaller than 0, then \texttt{depthfirststree} terminates with a 403 return value \texttt{-1}. 404 \item 405 Each time a branching node \texttt{bsucc} is visited for the 406 first time, the function \texttt{process\-branch1} is called with 407 arguments \texttt{bsucc} and \texttt{info}. If 408 \texttt{processbranch1} returns \texttt{False}, then 409 the entire subtree below \texttt{bsucc} is discarded. Otherwise the 410 depth first traversal continues with the subtree rooted by \texttt{bsucc}. 411 \item 412 Each time a branching node \texttt{bsucc} 413 is visited for the second time (i.e.\ 414 the entire subtree below \texttt{bsucc} has been processed), the function 415 \texttt{processbranch2} is called with arguments \texttt{bsucc} and 416 \texttt{info}. If \texttt{processbranch2} returns a value smaller than 417 0, then \texttt{depthfirststree} returns with value \texttt{-1}. 418 \end{itemize} 419 Either \texttt{processleaf} or \texttt{processbranch1} and 420 \texttt{processbranch2} can be \texttt{NULL}, in which case there is no 421 function call. If \texttt{stoptraversal} is not \texttt{NULL}, then 422 after each call the function \texttt{stoptraversal} is applied 423 to \texttt{stopinfo}. If the return value of this call is \texttt{True}, 424 then the depth first traversal stops. 425 426 In case everything goes right, \texttt{depthfirststree} returns 0. 427*/ 428 429Sint depthfirststree(Suffixtree *stree,Reference *startnode, 430 Sint (*processleaf)(Uint,Bref,void *), 431 BOOL (*processbranch1)(Bref,void *), 432 Sint (*processbranch2)(Bref,void *), 433 BOOL (*stoptraversal)(void *), 434 void *stopinfo,void *info) 435{ 436 437} 438 439