1 /* @source supermatcher application
2 **
3 ** Local alignment of large sequences
4 **
5 ** @author Copyright (C) Ian Longden
6 ** @@
7 **
8 ** This program is free software; you can redistribute it and/or
9 ** modify it under the terms of the GNU General Public License
10 ** as published by the Free Software Foundation; either version 2
11 ** of the License, or (at your option) any later version.
12 **
13 ** This program is distributed in the hope that it will be useful,
14 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 ** GNU General Public License for more details.
17 **
18 ** You should have received a copy of the GNU General Public License
19 ** along with this program; if not, write to the Free Software
20 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
21 ******************************************************************************/
22 
23 /* supermatcher
24 ** Create a word table for the sequence set.
25 ** Then iterates over the sequence stream first checking any possible word
26 ** matches.
27 ** If word matches then check to see if the position lines up with the last
28 ** position if it does continue else stop.
29 ** This gives us the start (offset) for the smith-waterman match by finding
30 ** the biggest match and calculating start and ends for both sequences.
31 */
32 
33 
34 /*
35 ** Word matching function was re-implemented using Rabin-Karp multi-pattern
36 ** search algorithm. May 2010.
37 */
38 
39 #include "emboss.h"
40 
41 
42 
43 
44 static void supermatcher_findendpoints(const EmbPWordMatch max,
45 	const AjPSeq trgseq, const AjPSeq qryseq,
46 	ajint *trgstart, ajint *qrystart,
47 	ajint *trgend, ajint *qryend);
48 
49 
50 
51 
52 /* @prog supermatcher *********************************************************
53 **
54 ** Finds matches of a set of sequences against one or more sequences
55 **
56 ** Create a word table for the second sequence.
57 ** Then go down first sequence checking to see if the word matches.
58 ** If word matches then check to see if the position lines up with the last
59 ** position if it does continue else stop.
60 ** This gives us the start (offset) for the smith-waterman match by finding
61 ** the biggest match and calculating start and ends for both sequences.
62 **
63 ******************************************************************************/
64 
main(int argc,char ** argv)65 int main(int argc, char **argv)
66 {
67     AjPSeqall queryseqs;
68     AjPSeqset targetseqs;
69     AjPSeq queryseq;
70     const AjPSeq targetseq;
71     AjPStr queryaln = 0;
72     AjPStr targetaln = 0;
73 
74     AjPFile errorf;
75     AjBool show = ajFalse;
76 
77     const char   *queryseqc;
78     const char   *targetseqc;
79 
80     AjPMatrixf matrix;
81     AjPSeqCvt cvt = 0;
82     float **sub;
83     ajint *compass = NULL;
84     float *path = NULL;
85 
86     float gapopen;
87     float gapextend;
88     float score;
89     float minscore;
90 
91     ajuint j, k;
92     ajint querystart = 0;
93     ajint targetstart = 0;
94     ajint queryend   = 0;
95     ajint targetend   = 0;
96     ajint width  = 0;
97     AjPTable kmers = 0;
98     ajint wordlen = 6;
99     ajint oldmax = 0;
100     ajint newmax = 0;
101 
102     ajuint ntargetseqs;
103     ajuint nkmers;
104 
105     AjPAlign align = NULL;
106     EmbPWordMatch maxmatch; /* match with maximum score */
107 
108     /* Cursors for the current sequence being scanned,
109     ** i.e., until which location it was scanned.
110     ** Separate cursor/location entries for each sequence in the seqset.
111     */
112     ajuint* lastlocation;
113 
114     EmbPWordRK* wordsw = NULL;
115     AjPList* matchlist = NULL;
116 
117     embInit("supermatcher", argc, argv);
118 
119     matrix    = ajAcdGetMatrixf("datafile");
120     queryseqs = ajAcdGetSeqall("asequence");
121     targetseqs= ajAcdGetSeqset("bsequence");
122     gapopen   = ajAcdGetFloat("gapopen");
123     gapextend = ajAcdGetFloat("gapextend");
124     wordlen   = ajAcdGetInt("wordlen");
125     align     = ajAcdGetAlign("outfile");
126     errorf    = ajAcdGetOutfile("errfile");
127     width     = ajAcdGetInt("width");	/* width for banded Smith-Waterman */
128     minscore  = ajAcdGetFloat("minscore");
129 
130     gapopen   = ajRoundFloat(gapopen, 8);
131     gapextend = ajRoundFloat(gapextend, 8);
132 
133     sub = ajMatrixfGetMatrix(matrix);
134     cvt = ajMatrixfGetCvt(matrix);
135 
136     embWordLength(wordlen);
137 
138     /* seqset sequence is the reference sequence for SAM format */
139     ajAlignSetRefSeqIndx(align, 1);
140 
141     ajSeqsetTrim(targetseqs);
142 
143     ntargetseqs = ajSeqsetGetSize(targetseqs);
144 
145     AJCNEW0(matchlist, ntargetseqs);
146 
147     /* get tables of words */
148     for(k=0;k<ntargetseqs;k++)
149     {
150 	targetseq = ajSeqsetGetseqSeq(targetseqs, k);
151 	embWordGetTable(&kmers, targetseq);
152 	ajDebug("Number of distinct kmers found so far: %Lu\n",
153 		ajTableGetLength(kmers));
154     }
155     AJCNEW0(lastlocation, ntargetseqs);
156 
157     if(ajTableGetLength(kmers)<1)
158 	ajErr("no kmers found");
159 
160     nkmers = embWordRabinKarpInit(kmers, &wordsw, wordlen, targetseqs);
161 
162     while(ajSeqallNext(queryseqs,&queryseq))
163     {
164 	ajSeqTrim(queryseq);
165 
166 	queryaln = ajStrNewRes(1+ajSeqGetLen(queryseq));
167 
168 	ajDebug("Read '%S'\n", ajSeqGetNameS(queryseq));
169 
170 	for(k=0;k<ntargetseqs;k++)
171 	{
172 	    lastlocation[k]=0;
173 	    matchlist[k] = ajListstrNew();
174 	}
175 
176 	embWordRabinKarpSearch(ajSeqGetSeqS(queryseq), targetseqs,
177 		(EmbPWordRK const *)wordsw, wordlen, nkmers,
178 		matchlist, lastlocation, ajFalse);
179 
180 
181 	for(k=0;k<ajSeqsetGetSize(targetseqs);k++)
182 	{
183 	    targetseq      = ajSeqsetGetseqSeq(targetseqs, k);
184 
185 	    ajDebug("Processing '%S'\n", ajSeqGetNameS(targetseq));
186 
187 	    if(ajListGetLength(matchlist[k])==0)
188 	    {
189 		ajFmtPrintF(errorf,
190 		            "No wordmatch start points for "
191 		            "%s vs %s. No alignment\n",
192 		            ajSeqGetNameC(queryseq),ajSeqGetNameC(targetseq));
193 		embWordMatchListDelete(&matchlist[k]);
194 		continue;
195 	    }
196 
197 
198 	    /* only the maximum match is used as seed
199 	     * (if there is more than one location with the maximum match
200 	     * only the first one is used)
201 	     * TODO: we should add a new option to make above limit optional
202 	     */
203 	    maxmatch = embWordMatchFirstMax(matchlist[k]);
204 
205 	    supermatcher_findendpoints(maxmatch,targetseq, queryseq,
206 		    &targetstart, &querystart,
207 		    &targetend, &queryend);
208 
209 	    targetaln=ajStrNewRes(1+ajSeqGetLen(targetseq));
210 	    queryseqc = ajSeqGetSeqC(queryseq);
211 	    targetseqc = ajSeqGetSeqC(targetseq);
212 
213 	    ajStrAssignC(&queryaln,"");
214 	    ajStrAssignC(&targetaln,"");
215 
216 	    ajDebug("++ %S v %S start:%d %d end:%d %d\n",
217 		    ajSeqGetNameS(targetseq), ajSeqGetNameS(queryseq),
218 		    targetstart, querystart, targetend, queryend);
219 
220 	    newmax = (targetend-targetstart+2)*width;
221 
222 	    if(newmax > oldmax)
223 	    {
224 		AJFREE(path);
225 		AJCNEW0(path,newmax);
226 		AJFREE(compass);
227 		AJCNEW0(compass,newmax);
228 		oldmax=newmax;
229 		ajDebug("++ memory re/allocation for path/compass arrays"
230 			" to size: %d\n", newmax);
231 	    }
232 	    else
233 	    {
234 		AJCSET0(path,newmax);
235 		AJCSET0(compass,newmax);
236 	    }
237 
238 	    ajDebug("Calling embAlignPathCalcSWFast "
239 		    "%d..%d [%d/%d] %d..%d [%d/%d] width:%d\n",
240 		    querystart, queryend, (queryend - querystart + 1),
241 		    ajSeqGetLen(queryseq),
242 		    targetstart, targetend, (targetend - targetstart + 1),
243 		    ajSeqGetLen(targetseq),
244 		    width);
245 
246 	    score = embAlignPathCalcSWFast(&targetseqc[targetstart],
247 	                                   &queryseqc[querystart],
248 	                                   targetend-targetstart+1,
249 	                                   queryend-querystart+1,
250 	                                   0,width,
251 	                                   gapopen,gapextend,
252 	                                   path,sub,cvt,
253 	                                   compass,show);
254 	    if(score>minscore)
255 	    {
256 		embAlignWalkSWMatrixFast(path,compass,gapopen,gapextend,
257 		                         targetseq,queryseq,
258 		                         &targetaln,&queryaln,
259 		                         targetend-targetstart+1,
260 		                         queryend-querystart+1,
261 		                         0,width,
262 		                         &targetstart,&querystart);
263 
264 		if(!ajAlignFormatShowsSequences(align))
265 		{
266 		    ajAlignDefineCC(align, ajStrGetPtr(targetaln),
267 		                    ajStrGetPtr(queryaln),
268 		                    ajSeqGetNameC(targetseq),
269 		                    ajSeqGetNameC(queryseq));
270 		    ajAlignSetScoreR(align, score);
271 		}
272 		else
273 		{
274 		    ajDebug(" queryaln:%S \ntargetaln:%S\n",
275 		            queryaln,targetaln);
276 		    embAlignReportLocal(align,
277 			    queryseq, targetseq,
278 			    queryaln, targetaln,
279 			    querystart, targetstart,
280 			    gapopen, gapextend,
281 			    score, matrix,
282 			    1 + ajSeqGetOffset(queryseq),
283 			    1 + ajSeqGetOffset(targetseq)
284 		    );
285 		}
286 		ajAlignWrite(align);
287 		ajAlignReset(align);
288 	    }
289 	    ajStrDel(&targetaln);
290 
291 	    embWordMatchListDelete(&matchlist[k]);
292 	}
293 
294 	ajStrDel(&queryaln);
295     }
296 
297 
298     for(k=0;k<nkmers;k++)
299     {
300 	AJFREE(wordsw[k]->seqindxs);
301 	AJFREE(wordsw[k]->nSeqMatches);
302 
303 	for(j=0;j<wordsw[k]->nseqs;j++)
304 	    AJFREE(wordsw[k]->locs[j]);
305 
306 	AJFREE(wordsw[k]->nnseqlocs);
307 	AJFREE(wordsw[k]->locs);
308 	AJFREE(wordsw[k]);
309     }
310 
311     embWordFreeTable(&kmers);
312 
313     if(!ajAlignFormatShowsSequences(align))
314 	ajMatrixfDel(&matrix);
315 
316     AJFREE(path);
317     AJFREE(compass);
318     AJFREE(kmers);
319     AJFREE(wordsw);
320 
321     AJFREE(matchlist);
322     AJFREE(lastlocation);
323 
324     ajAlignClose(align);
325     ajAlignDel(&align);
326     ajSeqallDel(&queryseqs);
327     ajSeqDel(&queryseq);
328     ajSeqsetDel(&targetseqs);
329     ajFileClose(&errorf);
330 
331     embExit();
332 
333     return 0;
334 }
335 
336 
337 
338 
339 /* @funcstatic supermatcher_findendpoints ***********************************
340 **
341 ** Calculate end points for banded Smith-Waterman alignment.
342 **
343 ** @param [r] max [const EmbPWordMatch] match with maximum similarity
344 ** @param [r] trgseq [const AjPSeq] target sequence
345 ** @param [r] qryseq [const AjPSeq] query sequence
346 ** @param [w] trgstart [ajint*] start in target sequence
347 ** @param [w] qrystart [ajint*] start in query sequence
348 ** @param [w] trgend [ajint*] end in target sequence
349 ** @param [w] qryend [ajint*] end in query sequence
350 ** @return [void]
351 ** @@
352 ******************************************************************************/
353 
supermatcher_findendpoints(const EmbPWordMatch max,const AjPSeq trgseq,const AjPSeq qryseq,ajint * trgstart,ajint * qrystart,ajint * trgend,ajint * qryend)354 static void supermatcher_findendpoints(const EmbPWordMatch max,
355 	const AjPSeq trgseq, const AjPSeq qryseq,
356 	ajint *trgstart, ajint *qrystart,
357 	ajint *trgend, ajint *qryend)
358 {
359     ajint amax;
360     ajint bmax;
361     ajint offset;
362 
363     *trgstart = max->seq1start;
364     *qrystart = max->seq2start;
365 
366     offset = *trgstart - *qrystart;
367 
368     if(offset > 0)
369     {
370 	*trgstart = offset;
371 	*qrystart = 0;
372     }
373     else
374     {
375 	*qrystart = 0-offset;
376 	*trgstart = 0;
377     }
378 
379     amax = ajSeqGetLen(trgseq)-1;
380     bmax = ajSeqGetLen(qryseq)-1;
381 
382     *trgend = *trgstart;
383     *qryend = *qrystart;
384 
385     ajDebug("++ end1 %d -> %d end2 %d -> %d\n", *trgend, amax, *qryend, bmax);
386 
387     while(*trgend<amax && *qryend<bmax)
388     {
389 	(*trgend)++;
390 	(*qryend)++;
391     }
392 
393     ajDebug("++ end1 %d end2 %d\n", *trgend, *qryend);
394 
395     ajDebug("supermatcher_findendpoints: %d..%d [%d] %d..%d [%d]\n",
396 	    trgstart, *trgend, ajSeqGetLen(trgseq), qrystart, *qryend,
397 	    ajSeqGetLen(qryseq));
398 
399     return;
400 }
401