1 /*
2 Copyright (c) 2003 by Stefan Kurtz and The Institute for
3 Genomic Research. This is OSI Certified Open Source Software.
4 Please see the file LICENSE for licensing information and
5 the file ACKNOWLEDGEMENTS for names of contributors to the
6 code base.
7 */
8
9 //\IgnoreLatex{
10
11 #include <stdio.h>
12 #include <ctype.h>
13 #include "streedef.h"
14 #include "debugdef.h"
15 #include "spacedef.h"
16 #include "maxmatdef.h"
17
18 //}
19
20 /*EE
21 This module contains functions to compute MUM-candidates using
22 a linear time suffix tree traversal.
23 */
24
25 /*
26 The following function checks if a location \texttt{loc} (of length
27 larger than \texttt{minmatchlength}) in the suffix tree represents
28 a MUM-candiate. The parameters are as follows:
29
30 \begin{enumerate}
31 \item
32 \texttt{subjectseq} points to the subject sequence
33 \item
34 \texttt{querysuffix} points to the current suffix of the query sequence
35 \item
36 \texttt{query} points to the query sequence
37 \item
38 \texttt{seqnum} is the number of the query sequence currently considered
39 \item
40 \texttt{processmumcandidate} is the function to further process a
41 MUM-candidate.
42 \item
43 \texttt{processinfo} points to some values additionally required by
44 the function \texttt{processmumcandidate}.
45 \end{enumerate}
46 By construction, the location in the suffix tree represents a
47 substring of the subject sequence which maximaly matches a prefix of
48 \texttt{querysuffix}. Thus it is only necessary to verify that,
49 the substring of the subject sequence is long enough, that it
50 is unique in the subject sequence and that the match
51 is also left maximal. This is done as follows:
52
53 \begin{enumerate}
54 \item
55 does \texttt{loc} represent a substring of length at least
56 \texttt{minmatchlength}?
57 \item
58 does \texttt{loc} correspond to a leaf edge? Then then the string
59 represented by the location is unique in the subject sequence.
60 \item
61 is the substring left maximal? This is true if one of the following
62 conditions hold:
63 \begin{itemize}
64 \item
65 the suffix of the query currently considered is the first suffix, or
66 \item
67 the string represented by \texttt{loc} is a prefix of the subject string,
68 or
69 \item
70 the characters immediately to the left of the matching strings
71 in the subject sequence and the query sequence are different
72 \end{itemize}
73 \end{enumerate}
74 If all conditions 1-3 are true, then a function
75 \texttt{processmumcandidate} is called. It takes the necessary
76 information about the MUM-candidate as its arguments.
77 In case an error occurs, a negative number is returned. Otherwise,
78 0 is returned.
79 */
80
checkiflocationisMUMcand(Location * loc,Uchar * subjectseq,Uchar * querysuffix,Uchar * query,Uint seqnum,Processmatchfunction processmumcandidate,void * processinfo)81 static Sint checkiflocationisMUMcand (Location *loc,
82 Uchar *subjectseq,
83 Uchar *querysuffix,
84 Uchar *query,
85 Uint seqnum,
86 Processmatchfunction
87 processmumcandidate,
88 void *processinfo)
89 {
90 if (loc->remain > 0
91 && loc->nextnode.toleaf
92 && (querysuffix == query || loc->locstring.start == 0
93 || *(querysuffix - 1) !=
94 subjectseq[loc->locstring.start - 1]))
95 {
96 if(processmumcandidate(processinfo,
97 loc->locstring.length, // matchlength
98 loc->locstring.start, // subject start
99 seqnum, // queryseq
100 (Uint) (querysuffix -
101 query)) != 0) // querystart
102 {
103 return -1;
104 }
105 }
106 return 0;
107 }
108
109 /*EE
110 The following function traverses the suffix tree guided by
111 some query string. The parameters are as follows:
112 \begin{enumerate}
113 \item
114 \texttt{stree} is the suffix tree constructed from the subject
115 sequence.
116 \item
117 \texttt{minmatchlength} is the minimal length of the MUMs as specified
118 \item
119 \texttt{processmumcandidate} is the function to further process a
120 MUM-candidate.
121 \item
122 \texttt{processinfo} points to some values additionally required by
123 the function \texttt{processmumcandidate}.
124 \item
125 \texttt{query} points to the query which is of length
126 \texttt{querylen}
127 \item
128 \texttt{seqnum} is the number of the current query sequence
129 in the \texttt{Multiseq}-record.
130 \end{enumerate}
131 For each suffix, say \(s\), of the query sequence
132 the following function computes the location of the longest prefix of s
133 that is a substring of the subject sequence. This is done
134 by iteratively calling the function \texttt{scanprefixfromnodestree}.
135 In each step, the scan starts at a location which represents a prefix
136 of the maximaly matching substring. The locations are computed
137 using the function \texttt{linklocstree}.
138 In case an error occurs, a negative number is returned. Otherwise,
139 0 is returned.
140 */
141
findmumcandidates(Suffixtree * stree,Uint minmatchlength,Processmatchfunction processmumcandidate,void * processinfo,Uchar * query,Uint querylen,Uint seqnum)142 Sint findmumcandidates(Suffixtree *stree,
143 Uint minmatchlength,
144 Processmatchfunction processmumcandidate,
145 void *processinfo,
146 Uchar *query,
147 Uint querylen,
148 Uint seqnum)
149 {
150 Uchar *lptr,
151 *right = query + querylen - 1,
152 *querysuffix;
153 Location loc;
154
155 DEBUG1(2,"query of length %lu=",(Showuint) querylen);
156 DEBUGCODE(2,(void) fwrite(query,sizeof(Uchar),(size_t) querylen,stdout));
157 DEBUG0(2,"\n");
158 lptr = scanprefixfromnodestree (stree, &loc, ROOT (stree),
159 query, right, 0);
160 for (querysuffix = query; lptr != NULL; querysuffix++)
161 {
162 DEBUGCODE(2,showlocation(stdout,stree,&loc));
163 if(loc.locstring.length >= minmatchlength &&
164 checkiflocationisMUMcand(&loc,stree->text,
165 querysuffix,
166 query,
167 seqnum,
168 processmumcandidate,
169 processinfo) != 0)
170 {
171 return -1;
172 }
173 if (ROOTLOCATION (&loc))
174 {
175 lptr = scanprefixfromnodestree (stree, &loc, ROOT (stree),
176 lptr + 1, right, 0);
177 }
178 else
179 {
180 linklocstree (stree, &loc, &loc);
181 lptr = scanprefixstree (stree, &loc, &loc, lptr, right, 0);
182 }
183 }
184 DEBUGCODE(2,showlocation(stdout,stree,&loc));
185 while (!ROOTLOCATION (&loc) && loc.locstring.length >= minmatchlength)
186 {
187 if(checkiflocationisMUMcand (&loc,
188 stree->text,
189 querysuffix,
190 query,
191 seqnum,
192 processmumcandidate,
193 processinfo) != 0)
194 {
195 return -2;
196 }
197 linklocstree (stree, &loc, &loc);
198 querysuffix++;
199 DEBUGCODE(2,showlocation(stdout,stree,&loc));
200 }
201 return 0;
202 }
203