1 /*
2   Copyright (c) 2003 by Stefan Kurtz and The Institute for
3   Genomic Research.  This is OSI Certified Open Source Software.
4   Please see the file LICENSE for licensing information and
5   the file ACKNOWLEDGEMENTS for names of contributors to the
6   code base.
7 */
8 
9 //\IgnoreLatex{
10 
11 #include <stdio.h>
12 #include <ctype.h>
13 #include "streedef.h"
14 #include "debugdef.h"
15 #include "spacedef.h"
16 #include "maxmatdef.h"
17 
18 //}
19 
20 /*EE
21   This module contains functions to compute MUM-candidates using
22   a linear time suffix tree traversal.
23 */
24 
25 /*
26   The following function checks if a location \texttt{loc} (of length
27   larger than \texttt{minmatchlength}) in the suffix tree represents
28   a MUM-candiate. The parameters are as follows:
29 
30   \begin{enumerate}
31   \item
32   \texttt{subjectseq} points to the subject sequence
33   \item
34   \texttt{querysuffix} points to the current suffix of the query sequence
35   \item
36   \texttt{query} points to the query sequence
37   \item
38   \texttt{seqnum} is the number of the query sequence currently considered
39   \item
40   \texttt{processmumcandidate} is the function to further process a
41   MUM-candidate.
42   \item
43   \texttt{processinfo} points to some values additionally required by
44   the function \texttt{processmumcandidate}.
45   \end{enumerate}
46   By construction, the location in the suffix tree represents a
47   substring of the subject sequence which maximaly matches a prefix of
48   \texttt{querysuffix}. Thus it is only necessary to verify that,
49   the substring of the subject sequence is long enough, that it
50   is unique in the subject sequence and that the match
51   is also left maximal. This is done as follows:
52 
53   \begin{enumerate}
54   \item
55   does \texttt{loc} represent a substring of length at least
56   \texttt{minmatchlength}?
57   \item
58   does \texttt{loc} correspond to a leaf edge? Then then the string
59   represented by the location is unique in the subject sequence.
60   \item
61   is the substring left maximal? This is true if one of the following
62   conditions hold:
63   \begin{itemize}
64   \item
65   the suffix of the query currently considered is the first suffix, or
66   \item
67   the string represented by \texttt{loc} is a prefix of the subject string,
68   or
69   \item
70   the characters immediately to the left of the matching strings
71   in the subject sequence and the query sequence are different
72   \end{itemize}
73   \end{enumerate}
74   If all conditions 1-3 are true, then a function
75   \texttt{processmumcandidate} is called. It takes the necessary
76   information about the MUM-candidate as its arguments.
77   In case an error occurs, a negative number is returned. Otherwise,
78   0 is returned.
79 */
80 
checkiflocationisMUMcand(Location * loc,Uchar * subjectseq,Uchar * querysuffix,Uchar * query,Uint seqnum,Processmatchfunction processmumcandidate,void * processinfo)81 static Sint checkiflocationisMUMcand (Location *loc,
82                                       Uchar *subjectseq,
83 				      Uchar *querysuffix,
84 				      Uchar *query,
85                                       Uint seqnum,
86                                       Processmatchfunction
87                                         processmumcandidate,
88                                       void *processinfo)
89 {
90   if (loc->remain > 0
91       && loc->nextnode.toleaf
92       && (querysuffix == query || loc->locstring.start == 0
93 			       || *(querysuffix - 1) !=
94                                   subjectseq[loc->locstring.start - 1]))
95   {
96     if(processmumcandidate(processinfo,
97                            loc->locstring.length,   // matchlength
98 	                   loc->locstring.start,    // subject start
99                            seqnum,                  // queryseq
100                            (Uint) (querysuffix -
101                                    query)) != 0)    // querystart
102     {
103       return -1;
104     }
105   }
106   return 0;
107 }
108 
109 /*EE
110   The following function traverses the suffix tree guided by
111   some query string. The parameters are as follows:
112   \begin{enumerate}
113   \item
114   \texttt{stree} is the suffix tree constructed from the subject
115   sequence.
116   \item
117   \texttt{minmatchlength} is the minimal length of the MUMs as specified
118   \item
119   \texttt{processmumcandidate} is the function to further process a
120   MUM-candidate.
121   \item
122   \texttt{processinfo} points to some values additionally required by
123   the function \texttt{processmumcandidate}.
124   \item
125   \texttt{query} points to the query which is of length
126   \texttt{querylen}
127   \item
128   \texttt{seqnum} is the number of the current query sequence
129   in the \texttt{Multiseq}-record.
130   \end{enumerate}
131   For each suffix, say \(s\), of the query sequence
132   the following function computes the location of the longest prefix of s
133   that is a substring of the subject sequence. This is done
134   by iteratively calling the function \texttt{scanprefixfromnodestree}.
135   In each step, the scan starts at a location which represents a prefix
136   of the maximaly matching substring. The locations are computed
137   using the function \texttt{linklocstree}.
138   In case an error occurs, a negative number is returned. Otherwise,
139   0 is returned.
140 */
141 
findmumcandidates(Suffixtree * stree,Uint minmatchlength,Processmatchfunction processmumcandidate,void * processinfo,Uchar * query,Uint querylen,Uint seqnum)142 Sint findmumcandidates(Suffixtree *stree,
143                        Uint minmatchlength,
144                        Processmatchfunction processmumcandidate,
145                        void *processinfo,
146                        Uchar *query,
147                        Uint querylen,
148                        Uint seqnum)
149 {
150   Uchar *lptr,
151         *right = query + querylen - 1,
152         *querysuffix;
153   Location loc;
154 
155   DEBUG1(2,"query of length %lu=",(Showuint) querylen);
156   DEBUGCODE(2,(void) fwrite(query,sizeof(Uchar),(size_t) querylen,stdout));
157   DEBUG0(2,"\n");
158   lptr = scanprefixfromnodestree (stree, &loc, ROOT (stree),
159                                   query, right, 0);
160   for (querysuffix = query; lptr != NULL; querysuffix++)
161   {
162     DEBUGCODE(2,showlocation(stdout,stree,&loc));
163     if(loc.locstring.length >= minmatchlength &&
164        checkiflocationisMUMcand(&loc,stree->text,
165                                 querysuffix,
166                                 query,
167                                 seqnum,
168                                 processmumcandidate,
169                                 processinfo) != 0)
170     {
171       return -1;
172     }
173     if (ROOTLOCATION (&loc))
174     {
175       lptr = scanprefixfromnodestree (stree, &loc, ROOT (stree),
176                                       lptr + 1, right, 0);
177     }
178     else
179     {
180       linklocstree (stree, &loc, &loc);
181       lptr = scanprefixstree (stree, &loc, &loc, lptr, right, 0);
182     }
183   }
184   DEBUGCODE(2,showlocation(stdout,stree,&loc));
185   while (!ROOTLOCATION (&loc) && loc.locstring.length >= minmatchlength)
186   {
187     if(checkiflocationisMUMcand (&loc,
188                                  stree->text,
189                                  querysuffix,
190                                  query,
191                                  seqnum,
192                                  processmumcandidate,
193                                  processinfo) != 0)
194     {
195       return -2;
196     }
197     linklocstree (stree, &loc, &loc);
198     querysuffix++;
199     DEBUGCODE(2,showlocation(stdout,stree,&loc));
200   }
201   return 0;
202 }
203