1 /* @source supermatcher application
2 **
3 ** Local alignment of large sequences
4 **
5 ** @author Copyright (C) Ian Longden
6 ** @@
7 **
8 ** This program is free software; you can redistribute it and/or
9 ** modify it under the terms of the GNU General Public License
10 ** as published by the Free Software Foundation; either version 2
11 ** of the License, or (at your option) any later version.
12 **
13 ** This program is distributed in the hope that it will be useful,
14 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 ** GNU General Public License for more details.
17 **
18 ** You should have received a copy of the GNU General Public License
19 ** along with this program; if not, write to the Free Software
20 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
21 ******************************************************************************/
22
23 /* supermatcher
24 ** Create a word table for the sequence set.
25 ** Then iterates over the sequence stream first checking any possible word
26 ** matches.
27 ** If word matches then check to see if the position lines up with the last
28 ** position if it does continue else stop.
29 ** This gives us the start (offset) for the smith-waterman match by finding
30 ** the biggest match and calculating start and ends for both sequences.
31 */
32
33
34 /*
35 ** Word matching function was re-implemented using Rabin-Karp multi-pattern
36 ** search algorithm. May 2010.
37 */
38
39 #include "emboss.h"
40
41
42
43
44 static void supermatcher_findendpoints(const EmbPWordMatch max,
45 const AjPSeq trgseq, const AjPSeq qryseq,
46 ajint *trgstart, ajint *qrystart,
47 ajint *trgend, ajint *qryend);
48
49
50
51
52 /* @prog supermatcher *********************************************************
53 **
54 ** Finds matches of a set of sequences against one or more sequences
55 **
56 ** Create a word table for the second sequence.
57 ** Then go down first sequence checking to see if the word matches.
58 ** If word matches then check to see if the position lines up with the last
59 ** position if it does continue else stop.
60 ** This gives us the start (offset) for the smith-waterman match by finding
61 ** the biggest match and calculating start and ends for both sequences.
62 **
63 ******************************************************************************/
64
main(int argc,char ** argv)65 int main(int argc, char **argv)
66 {
67 AjPSeqall queryseqs;
68 AjPSeqset targetseqs;
69 AjPSeq queryseq;
70 const AjPSeq targetseq;
71 AjPStr queryaln = 0;
72 AjPStr targetaln = 0;
73
74 AjPFile errorf;
75 AjBool show = ajFalse;
76
77 const char *queryseqc;
78 const char *targetseqc;
79
80 AjPMatrixf matrix;
81 AjPSeqCvt cvt = 0;
82 float **sub;
83 ajint *compass = NULL;
84 float *path = NULL;
85
86 float gapopen;
87 float gapextend;
88 float score;
89 float minscore;
90
91 ajuint j, k;
92 ajint querystart = 0;
93 ajint targetstart = 0;
94 ajint queryend = 0;
95 ajint targetend = 0;
96 ajint width = 0;
97 AjPTable kmers = 0;
98 ajint wordlen = 6;
99 ajint oldmax = 0;
100 ajint newmax = 0;
101
102 ajuint ntargetseqs;
103 ajuint nkmers;
104
105 AjPAlign align = NULL;
106 EmbPWordMatch maxmatch; /* match with maximum score */
107
108 /* Cursors for the current sequence being scanned,
109 ** i.e., until which location it was scanned.
110 ** Separate cursor/location entries for each sequence in the seqset.
111 */
112 ajuint* lastlocation;
113
114 EmbPWordRK* wordsw = NULL;
115 AjPList* matchlist = NULL;
116
117 embInit("supermatcher", argc, argv);
118
119 matrix = ajAcdGetMatrixf("datafile");
120 queryseqs = ajAcdGetSeqall("asequence");
121 targetseqs= ajAcdGetSeqset("bsequence");
122 gapopen = ajAcdGetFloat("gapopen");
123 gapextend = ajAcdGetFloat("gapextend");
124 wordlen = ajAcdGetInt("wordlen");
125 align = ajAcdGetAlign("outfile");
126 errorf = ajAcdGetOutfile("errfile");
127 width = ajAcdGetInt("width"); /* width for banded Smith-Waterman */
128 minscore = ajAcdGetFloat("minscore");
129
130 gapopen = ajRoundFloat(gapopen, 8);
131 gapextend = ajRoundFloat(gapextend, 8);
132
133 sub = ajMatrixfGetMatrix(matrix);
134 cvt = ajMatrixfGetCvt(matrix);
135
136 embWordLength(wordlen);
137
138 /* seqset sequence is the reference sequence for SAM format */
139 ajAlignSetRefSeqIndx(align, 1);
140
141 ajSeqsetTrim(targetseqs);
142
143 ntargetseqs = ajSeqsetGetSize(targetseqs);
144
145 AJCNEW0(matchlist, ntargetseqs);
146
147 /* get tables of words */
148 for(k=0;k<ntargetseqs;k++)
149 {
150 targetseq = ajSeqsetGetseqSeq(targetseqs, k);
151 embWordGetTable(&kmers, targetseq);
152 ajDebug("Number of distinct kmers found so far: %Lu\n",
153 ajTableGetLength(kmers));
154 }
155 AJCNEW0(lastlocation, ntargetseqs);
156
157 if(ajTableGetLength(kmers)<1)
158 ajErr("no kmers found");
159
160 nkmers = embWordRabinKarpInit(kmers, &wordsw, wordlen, targetseqs);
161
162 while(ajSeqallNext(queryseqs,&queryseq))
163 {
164 ajSeqTrim(queryseq);
165
166 queryaln = ajStrNewRes(1+ajSeqGetLen(queryseq));
167
168 ajDebug("Read '%S'\n", ajSeqGetNameS(queryseq));
169
170 for(k=0;k<ntargetseqs;k++)
171 {
172 lastlocation[k]=0;
173 matchlist[k] = ajListstrNew();
174 }
175
176 embWordRabinKarpSearch(ajSeqGetSeqS(queryseq), targetseqs,
177 (EmbPWordRK const *)wordsw, wordlen, nkmers,
178 matchlist, lastlocation, ajFalse);
179
180
181 for(k=0;k<ajSeqsetGetSize(targetseqs);k++)
182 {
183 targetseq = ajSeqsetGetseqSeq(targetseqs, k);
184
185 ajDebug("Processing '%S'\n", ajSeqGetNameS(targetseq));
186
187 if(ajListGetLength(matchlist[k])==0)
188 {
189 ajFmtPrintF(errorf,
190 "No wordmatch start points for "
191 "%s vs %s. No alignment\n",
192 ajSeqGetNameC(queryseq),ajSeqGetNameC(targetseq));
193 embWordMatchListDelete(&matchlist[k]);
194 continue;
195 }
196
197
198 /* only the maximum match is used as seed
199 * (if there is more than one location with the maximum match
200 * only the first one is used)
201 * TODO: we should add a new option to make above limit optional
202 */
203 maxmatch = embWordMatchFirstMax(matchlist[k]);
204
205 supermatcher_findendpoints(maxmatch,targetseq, queryseq,
206 &targetstart, &querystart,
207 &targetend, &queryend);
208
209 targetaln=ajStrNewRes(1+ajSeqGetLen(targetseq));
210 queryseqc = ajSeqGetSeqC(queryseq);
211 targetseqc = ajSeqGetSeqC(targetseq);
212
213 ajStrAssignC(&queryaln,"");
214 ajStrAssignC(&targetaln,"");
215
216 ajDebug("++ %S v %S start:%d %d end:%d %d\n",
217 ajSeqGetNameS(targetseq), ajSeqGetNameS(queryseq),
218 targetstart, querystart, targetend, queryend);
219
220 newmax = (targetend-targetstart+2)*width;
221
222 if(newmax > oldmax)
223 {
224 AJFREE(path);
225 AJCNEW0(path,newmax);
226 AJFREE(compass);
227 AJCNEW0(compass,newmax);
228 oldmax=newmax;
229 ajDebug("++ memory re/allocation for path/compass arrays"
230 " to size: %d\n", newmax);
231 }
232 else
233 {
234 AJCSET0(path,newmax);
235 AJCSET0(compass,newmax);
236 }
237
238 ajDebug("Calling embAlignPathCalcSWFast "
239 "%d..%d [%d/%d] %d..%d [%d/%d] width:%d\n",
240 querystart, queryend, (queryend - querystart + 1),
241 ajSeqGetLen(queryseq),
242 targetstart, targetend, (targetend - targetstart + 1),
243 ajSeqGetLen(targetseq),
244 width);
245
246 score = embAlignPathCalcSWFast(&targetseqc[targetstart],
247 &queryseqc[querystart],
248 targetend-targetstart+1,
249 queryend-querystart+1,
250 0,width,
251 gapopen,gapextend,
252 path,sub,cvt,
253 compass,show);
254 if(score>minscore)
255 {
256 embAlignWalkSWMatrixFast(path,compass,gapopen,gapextend,
257 targetseq,queryseq,
258 &targetaln,&queryaln,
259 targetend-targetstart+1,
260 queryend-querystart+1,
261 0,width,
262 &targetstart,&querystart);
263
264 if(!ajAlignFormatShowsSequences(align))
265 {
266 ajAlignDefineCC(align, ajStrGetPtr(targetaln),
267 ajStrGetPtr(queryaln),
268 ajSeqGetNameC(targetseq),
269 ajSeqGetNameC(queryseq));
270 ajAlignSetScoreR(align, score);
271 }
272 else
273 {
274 ajDebug(" queryaln:%S \ntargetaln:%S\n",
275 queryaln,targetaln);
276 embAlignReportLocal(align,
277 queryseq, targetseq,
278 queryaln, targetaln,
279 querystart, targetstart,
280 gapopen, gapextend,
281 score, matrix,
282 1 + ajSeqGetOffset(queryseq),
283 1 + ajSeqGetOffset(targetseq)
284 );
285 }
286 ajAlignWrite(align);
287 ajAlignReset(align);
288 }
289 ajStrDel(&targetaln);
290
291 embWordMatchListDelete(&matchlist[k]);
292 }
293
294 ajStrDel(&queryaln);
295 }
296
297
298 for(k=0;k<nkmers;k++)
299 {
300 AJFREE(wordsw[k]->seqindxs);
301 AJFREE(wordsw[k]->nSeqMatches);
302
303 for(j=0;j<wordsw[k]->nseqs;j++)
304 AJFREE(wordsw[k]->locs[j]);
305
306 AJFREE(wordsw[k]->nnseqlocs);
307 AJFREE(wordsw[k]->locs);
308 AJFREE(wordsw[k]);
309 }
310
311 embWordFreeTable(&kmers);
312
313 if(!ajAlignFormatShowsSequences(align))
314 ajMatrixfDel(&matrix);
315
316 AJFREE(path);
317 AJFREE(compass);
318 AJFREE(kmers);
319 AJFREE(wordsw);
320
321 AJFREE(matchlist);
322 AJFREE(lastlocation);
323
324 ajAlignClose(align);
325 ajAlignDel(&align);
326 ajSeqallDel(&queryseqs);
327 ajSeqDel(&queryseq);
328 ajSeqsetDel(&targetseqs);
329 ajFileClose(&errorf);
330
331 embExit();
332
333 return 0;
334 }
335
336
337
338
339 /* @funcstatic supermatcher_findendpoints ***********************************
340 **
341 ** Calculate end points for banded Smith-Waterman alignment.
342 **
343 ** @param [r] max [const EmbPWordMatch] match with maximum similarity
344 ** @param [r] trgseq [const AjPSeq] target sequence
345 ** @param [r] qryseq [const AjPSeq] query sequence
346 ** @param [w] trgstart [ajint*] start in target sequence
347 ** @param [w] qrystart [ajint*] start in query sequence
348 ** @param [w] trgend [ajint*] end in target sequence
349 ** @param [w] qryend [ajint*] end in query sequence
350 ** @return [void]
351 ** @@
352 ******************************************************************************/
353
supermatcher_findendpoints(const EmbPWordMatch max,const AjPSeq trgseq,const AjPSeq qryseq,ajint * trgstart,ajint * qrystart,ajint * trgend,ajint * qryend)354 static void supermatcher_findendpoints(const EmbPWordMatch max,
355 const AjPSeq trgseq, const AjPSeq qryseq,
356 ajint *trgstart, ajint *qrystart,
357 ajint *trgend, ajint *qryend)
358 {
359 ajint amax;
360 ajint bmax;
361 ajint offset;
362
363 *trgstart = max->seq1start;
364 *qrystart = max->seq2start;
365
366 offset = *trgstart - *qrystart;
367
368 if(offset > 0)
369 {
370 *trgstart = offset;
371 *qrystart = 0;
372 }
373 else
374 {
375 *qrystart = 0-offset;
376 *trgstart = 0;
377 }
378
379 amax = ajSeqGetLen(trgseq)-1;
380 bmax = ajSeqGetLen(qryseq)-1;
381
382 *trgend = *trgstart;
383 *qryend = *qrystart;
384
385 ajDebug("++ end1 %d -> %d end2 %d -> %d\n", *trgend, amax, *qryend, bmax);
386
387 while(*trgend<amax && *qryend<bmax)
388 {
389 (*trgend)++;
390 (*qryend)++;
391 }
392
393 ajDebug("++ end1 %d end2 %d\n", *trgend, *qryend);
394
395 ajDebug("supermatcher_findendpoints: %d..%d [%d] %d..%d [%d]\n",
396 trgstart, *trgend, ajSeqGetLen(trgseq), qrystart, *qryend,
397 ajSeqGetLen(qryseq));
398
399 return;
400 }
401