1 /*
2   Copyright (c) 2003 by Stefan Kurtz and The Institute for
3   Genomic Research.  This is OSI Certified Open Source Software.
4   Please see the file LICENSE for licensing information and
5   the file ACKNOWLEDGEMENTS for names of contributors to the
6   code base.
7 */
8 
9 #ifndef STREETYP_H
10 #define STREETYP_H
11 
12 #include "types.h"
13 #include "arraydef.h"
14 
15 //}
16 
17 /*
18   A \texttt{Reference} consists of an \texttt{address} pointing a leaf,
19   or to a branching node. The boolean \texttt{toleaf} is \texttt{True} if
20   and only if \texttt{address} points to a leaf.
21 */
22 
23 typedef struct
24 {
25   BOOL toleaf;
26   Uint *address;
27 } Reference;            // \Typedef{Reference}
28 
29 /*
30   The following types are used for references to leaves and
31   branching nodes, respectively. We will always identify a leaf and
32   and branching node with their references.
33 */
34 
35 typedef Uint * Bref;    // \Typedef{Bref}
36 typedef Uint * Lref;    // \Typedef{Lref}
37 
38 /*
39   For each branching node we store five values, as described in Section
40   \ref{Representation}. These values comprise the following structure.
41 */
42 
43 typedef struct
44 {
45   Uint headposition,        // the head position of the branching node
46        depth;               // the depth of the branching node
47   Bref suffixlink;          // the suffix link is always to a branching node
48   Reference firstchild,     // the reference to the first child
49             branchbrother;  // the reference to the right brother;
50                             // if this doesn't exist then it's \texttt{NULL}
51 } Branchinfo;               // \Typedef{Branchinfo}
52 
53 /*
54   For each leaf, we store a reference to its right brother, which is
55   \texttt{NULL}, if the right brother does not exist. This is
56   expressed in the type synonym \texttt{Leafinfo}.
57 */
58 
59 typedef Reference Leafinfo;  // \Typedef{Leafinfo}
60 
61 /*
62   A suffix tree is implemented by the type \texttt{Suffixtree}.
63   This structure contains several components which are mostly only used
64   during the suffix tree construction. For applications, assume the
65   following definition. Note that the input sequence is represented
66   as an array of elements of type \texttt{SYMBOL}. The latter
67   is a synonym for \texttt{Uchar} by default.
68 
69 @typedef struct
70 @{
71 @  SYMBOL *text;     // points to the input string
72 @  Uint textlen;     // the length of the input string
73 @  Uint *branchtab;  // stores the infos for the branching nodes
74 @  Uint *leaftab;    // stores the brother-references of the leaves
75 @} Suffixtree;       // \Typedef{Suffixtree}
76 
77 */
78 
79 //\Ignore{
80 
81 struct Suffixtreetype
82 {
83   Uint textlen,               // the length of the input string
84        *leaftab,              // stores the brother-references of the leafs
85        *branchtab,            // table TBranch
86        *rootchildren;         // references to successors of root
87   SYMBOL *text,               // points to the input string
88          *sentinel;           // points to the position of the \(\$\)-symbol
89 
90   Uint nextfreeleafnum,       // the number of the next leaf
91        headnodedepth,         // the depth of the headnode
92        insertnode,            // the node the split edge leads to
93        insertprev,            // the edge preceeding the split edge
94        smallnotcompleted,     // the number of small nodes in the current chain
95        nextfreebranchnum,     // the number of the next free branch node
96        onsuccpath,            // refers to node on success path of headnode
97        currentdepth,          // depth of the new branch node
98        branchnodeoffset,      // number of leafs in tree
99        alphasize,             // the number of different characters in t
100        maxbranchdepth,        // maximal depth of branching node
101        largenode,             // number of large nodes
102        smallnode,             // number of small nodes
103        *setlink,              // address of a nil-reference
104        *nextfreeleafptr,      // points to next free entry in leaftab
105        *chainstart,           // address of the node, current chains starts at
106        *nextfreebranch,       // reference to next free base addr. in branchtab
107        *headnode,             // left component of head location
108        currentbranchtabsize,  // current number of cells in branchtab
109        *firstnotallocated,    // refers to the last address, such that at
110                               // least \texttt{LARGEINTS} integers are
111                               // available. So a large node can be stored in
112                               // the available amount of space.
113        *nonmaximal,           // bit table: if node with headposition \(i\) is
114                               // not maximal, then \(nonmaximal[i]\) is set.
115        *leafcounts;           // holds counts of the number of leafs in subtree
116                               // indexed by headposition
117   BOOL setatnewleaf;          // nil-reference is stored in new leaf
118   SYMBOL *headstart,          // these references represent the right component
119          *headend,            // of the head location \((\overline{u},v)\).
120                               // \emph{headstart} refers to the first character
121                               // of \(v\), and \emph{headend} to the last
122                               // character. In case, \(v=\varepsilon\),
123                               // \(\emph{headend}=\emph{NULL}\).
124          *tailptr;            // points to the tail
125 
126 #ifdef DEBUG
127   char * (*showsymbolstree)(SYMBOL,Uchar *);
128   Uchar *alphabet;
129   Uint splitleafedge,
130        splitinternaledge,
131        artificial,
132        insertleafcalls,
133        largelinks,
134        largelinkwork,
135        largelinklinkwork,
136        multiplications,
137        nodecount,
138        *maxset;
139   void *generalcounter;
140 #endif
141 #if (SYMBOLBYTES == 2) || (SYMBOLBYTES == 4)
142   Sint lastcharindex;
143 #endif
144 
145 };
146 
147 typedef struct Suffixtreetype Suffixtree;
148 
149 DECLAREARRAYSTRUCT(Bref);
150 
151 //}
152 
153 /*
154   A location is implemented by the type \texttt{Location}.
155 */
156 
157 typedef struct
158 {
159   Stringtype locstring; // string represented by location
160   Bref previousnode;    // reference to previous node (which is branching)
161   SYMBOL *firstptr;     // pointer to first character of edge label
162   Uint edgelen,         // length of edge
163        remain;          // number of remaining characters on edge
164   Reference nextnode;   // reference to node the edge points to
165 } Location;             // \Typedef{Location}
166 
167 /*
168   If a location is a node \(\overline{u}\), we set \texttt{remain} to 0, and
169   store a reference to \(\overline{u}\) in \texttt{nextnode}. Moreover, we
170   store a position where \(u\) starts and its length in \texttt{locstring}.
171   If the location is of the form \((\overline{u},v,w,\overline{uvw})\),
172   then the components of the location satisfies the following values:
173   \begin{enumerate}
174   \item
175   \texttt{previousnode} is a reference to \(\overline{u}\)
176   \item
177   \texttt{firstptr} points to the first symbol of the edge label \(vw\).
178   \item
179   \(\texttt{edgelen}=\Size{vw}\)
180   \item
181   \(\texttt{remain}=\Size{w}\)
182   \item
183   \texttt{nextnode} is a reference to \(\overline{uvw}\).
184   \end{enumerate}
185   Since \(w\) is not empty, a location is a node location if and only if
186   \texttt{remain} is 0.
187 */
188 
189 //\Ignore{
190 
191 /*
192   A simple location stores just a part of information stored in a suffix tree.
193 */
194 
195 typedef struct
196 {
197   Uint remain,
198        textpos;  // these last two items are redundant and can be computed
199   Reference nextnode;
200 } Simpleloc;     // \Typedef{Simpleloc}
201 
202 DECLAREARRAYSTRUCT(Simpleloc);
203 
204 /*
205   A path in the suffix tree is stored as an array of \texttt{Pathinfo}-records.
206 */
207 
208 typedef struct
209 {
210   Uint depth, headposition;
211   Bref ref;
212 } Pathinfo;      // \Typedef{Pathinfo}
213 
214 DECLAREARRAYSTRUCT(Pathinfo);
215 
216 typedef struct
217 {
218   BOOL secondtime;
219   ArrayBref stack;
220 } DFSstate;      // \Typedef{DFSstate}
221 
222 #endif
223 
224 //}
225