1 /* @source wordfinder application
2 **
3 ** Word-based rapid comparison of large sequences
4 **
5 ** @author Copyright (C) Peter Rice
6 ** @@
7 **
8 ** This program is free software; you can redistribute it and/or
9 ** modify it under the terms of the GNU General Public License
10 ** as published by the Free Software Foundation; either version 2
11 ** of the License, or (at your option) any later version.
12 **
13 ** This program is distributed in the hope that it will be useful,
14 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 ** GNU General Public License for more details.
17 **
18 ** You should have received a copy of the GNU General Public License
19 ** along with this program; if not, write to the Free Software
20 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
21 ******************************************************************************/
22
23 #include "emboss.h"
24 #include <limits.h>
25 #include <math.h>
26
27
28 /*
29 ** Notes:
30 ** Specific requests to search for word-based matches with 15-35 identical
31 ** residues ungapped, and then to check that these are the only significant
32 ** matches by checking for general homology, for example histones matching.
33 ** Intended for use in widely divergent species.
34 */
35
36
37
38
39 /* @datastatic concat *********************************************************
40 **
41 ** wordfinder internals
42 **
43 ** @alias concatS
44 **
45 ** @attr offset [ajint] Undocumented
46 ** @attr count [ajint] Undocumented
47 ** @attr list [AjPList] Undocumented
48 ** @attr total [ajint] Undocumented
49 ** @attr Padding [char[4]] Padding to alignment boundary
50 ******************************************************************************/
51
52 typedef struct concatS
53 {
54 ajint offset;
55 ajint count;
56 AjPList list;
57 ajint total;
58 char Padding[4];
59 } concat;
60
61
62
63
64 static void wordfinder_matchListOrder(void **x,void *cl);
65 static void wordfinder_orderandconcat(AjPList list,AjPList ordered);
66 static void wordfinder_removelists(void **x,void *cl);
67 static ajint wordfinder_findstartpoints(AjPTable seq1MatchTable,
68 const AjPSeq b, const AjPSeq a,
69 ajint *trgstart, ajint *qrystart,
70 ajint *trgend, ajint *qryend);
71 static void wordfinder_findmax(void **x,void *cl);
72
73
74
75
76 concat *conmax = NULL;
77 ajint maxgap = 0;
78
79
80
81
82 /* @prog wordfinder *********************************************************
83 **
84 ** Finds a match of a large sequence against one or more sequences
85 **
86 ** Create a word table for the first sequence.
87 ** Then go down second sequence checking to see if the word matches.
88 ** If word matches then check to see if the position lines up with the last
89 ** position if it does continue else stop.
90 ** This gives us the start (offset) for the smith-waterman match by finding
91 ** the biggest match and calculating start and ends for both sequences.
92 **
93 ******************************************************************************/
94
main(int argc,char ** argv)95 int main(int argc, char **argv)
96 {
97 AjPSeqset qryseqs;
98 AjPSeqall trgseqs;
99 AjPSeq trgseq;
100 const AjPSeq qryseq;
101 AjPStr mtrg = 0;
102 AjPStr nqry = 0;
103
104 AjPFile errorf;
105 AjBool show = ajFalse;
106
107 ajint trglen = 0;
108 ajint qrylen = 0;
109
110 const char *ptrg;
111 const char *qqry;
112
113 AjPMatrixf matrix;
114 AjPSeqCvt cvt = 0;
115 float **sub;
116 ajint *compass = 0;
117 float *path = 0;
118
119 float gapopen;
120 float gapextend;
121 float score = 0.0;
122 ajint matchscore = 0;
123 ajint limitmatch;
124 ajint limitalign;
125 ajint lowmatch;
126 ajint lowalign;
127
128 ajint trgbegin;
129 ajint i;
130 ajuint k;
131 ajint qrybegin;
132 ajint trgstart = 0;
133 ajint qrystart = 0;
134 ajint trgend = 0;
135 ajint qryend = 0;
136 ajint width = 0;
137 AjPTable seq1MatchTable = 0;
138 ajint wordlen = 6;
139 ajint oldmax = 0;
140 ajint newmax = 0;
141
142 AjPAlign align = NULL;
143 ajint imatches = 0;
144 ajint itrg = 0;
145 AjPStr tmpstr = NULL;
146
147 embInit("wordfinder", argc, argv);
148
149 matrix = ajAcdGetMatrixf("datafile");
150 qryseqs = ajAcdGetSeqset("asequence");
151 trgseqs = ajAcdGetSeqall("bsequence");
152 gapopen = ajAcdGetFloat("gapopen");
153 gapextend = ajAcdGetFloat("gapextend");
154 wordlen = ajAcdGetInt("wordlen");
155 limitmatch = ajAcdGetInt("limitmatch");
156 limitalign = ajAcdGetInt("limitalign");
157 lowmatch = ajAcdGetInt("lowmatch");
158 lowalign = ajAcdGetInt("lowalign");
159 align = ajAcdGetAlign("outfile");
160 errorf = ajAcdGetOutfile("errfile");
161 width = ajAcdGetInt("width"); /* not the same as awidth */
162
163 gapopen = ajRoundFloat(gapopen, 8);
164 gapextend = ajRoundFloat(gapextend, 8);
165
166
167 sub = ajMatrixfGetMatrix(matrix);
168 cvt = ajMatrixfGetCvt(matrix);
169
170 embWordLength(wordlen);
171
172 ajSeqsetTrim(qryseqs);
173
174 while(ajSeqallNext(trgseqs,&trgseq))
175 {
176 itrg++;
177 imatches = 0;
178 ajSeqTrim(trgseq);
179 trgbegin = 1 + ajSeqGetOffset(trgseq);
180
181 mtrg = ajStrNewRes(1+ajSeqGetLen(trgseq));
182
183 trglen = ajSeqGetLen(trgseq);
184
185 ajDebug("Read '%S'\n", ajSeqGetNameS(trgseq));
186 ajSeqTrace(trgseq);
187 if(embWordGetTable(&seq1MatchTable, trgseq)) /* get table of words */
188 {
189 for(k=0;k<ajSeqsetGetSize(qryseqs);k++)
190 {
191 qryseq = ajSeqsetGetseqSeq(qryseqs, k);
192 qrylen = ajSeqGetLen(qryseq);
193 qrybegin = 1 + ajSeqGetOffset(qryseq);
194
195 ajDebug("Processing '%S'\n", ajSeqGetNameS(qryseq));
196
197 matchscore = wordfinder_findstartpoints(seq1MatchTable,
198 qryseq, trgseq,
199 &qrystart, &trgstart,
200 &qryend, &trgend);
201 if(!matchscore ||
202 (limitmatch && (matchscore > limitmatch)) ||
203 (matchscore < lowmatch))
204 {
205 if(matchscore)
206 {
207 ajFmtPrintF(errorf,"Match limits (%d..%d) exclude '%S' '%S' %d\n",
208 lowmatch, limitmatch,
209 ajSeqGetNameS(qryseq),
210 ajSeqGetNameS(trgseq),
211 matchscore);
212 }
213 continue;
214 }
215
216 nqry=ajStrNewRes(1+ajSeqGetLen(qryseq));
217 ptrg = ajSeqGetSeqC(trgseq);
218 qqry = ajSeqGetSeqC(qryseq);
219
220 ajStrAssignC(&mtrg,"");
221 ajStrAssignC(&nqry,"");
222
223 ajDebug("++ %S v %S start:%d %d end:%d %d\n",
224 ajSeqGetNameS(qryseq), ajSeqGetNameS(trgseq),
225 qrystart, trgstart, qryend, trgend);
226 imatches++;
227
228 newmax = (trgend-trgstart+2)*width;
229 if(newmax > oldmax)
230 {
231 AJCRESIZE0(path,oldmax, newmax);
232 AJCRESIZE0(compass,oldmax,newmax);
233 oldmax = newmax;
234 ajDebug("++ resize to oldmax: %d\n", oldmax);
235 }
236
237 for(i=0;i<((trgend-trgstart)+1)*width;i++)
238 path[i] = 0.0;
239
240 ajDebug("Calling embAlignPathCalcSWFast "
241 "%d..%d [%d/%d] %d..%d [%d/%d] width:%d\n",
242 trgstart, trgend, (trgend - trgstart + 1), trglen,
243 qrystart, qryend, (qryend - qrystart + 1), qrylen,
244 width);
245
246 score = embAlignPathCalcSWFast(&qqry[qrystart],
247 &ptrg[trgstart],
248 qryend-qrystart+1,
249 trgend-trgstart+1,
250 0,width,
251 gapopen,gapextend,path,sub,cvt,
252 compass,show);
253
254 embAlignWalkSWMatrixFast(path,compass,gapopen,gapextend,
255 qryseq,trgseq,
256 &nqry,&mtrg,
257 qryend-qrystart+1,
258 trgend-trgstart+1,
259 0, width,
260 &qrystart,&trgstart);
261
262 if(!ajAlignFormatShowsSequences(align))
263 {
264 ajAlignDefineCC(align, ajStrGetPtr(mtrg),
265 ajStrGetPtr(nqry), ajSeqGetNameC(trgseq),
266 ajSeqGetNameC(qryseq));
267 ajAlignSetScoreR(align, score);
268 }
269 else
270 embAlignReportLocal(align,
271 qryseq, trgseq,
272 nqry,mtrg,
273 qrystart,trgstart,
274 gapopen, gapextend,
275 score,matrix,
276 qrybegin, trgbegin);
277
278
279 if((limitalign && (ajAlignGetLen(align) > limitalign)) ||
280 (ajAlignGetLen(align) < lowalign))
281 {
282 imatches--;
283 ajFmtPrintF(errorf,"Align limits (%d..%d) excludes '%S' '%S' %d\n",
284 lowalign, limitalign,
285 ajSeqGetNameS(qryseq),
286 ajSeqGetNameS(trgseq),
287 ajAlignGetLen(align));
288 }
289 else
290 {
291 if(ajAlignFormatShowsSequences(align))
292 {
293 ajFmtPrintS(&tmpstr, "\nWordscore:%d", matchscore);
294 ajAlignSetSubHeader(align, tmpstr);
295 ajFmtPrintS(&tmpstr, "Alignlength:%d",
296 ajAlignGetLen(align));
297 ajAlignSetSubHeaderApp(align, tmpstr);
298 ajStrDel(&tmpstr);
299 }
300 ajAlignWrite(align);
301 }
302 ajAlignReset(align);
303 ajStrDel(&nqry);
304 }
305 }
306 embWordFreeTable(&seq1MatchTable); /* free table of words */
307 seq1MatchTable=0;
308
309 ajStrDel(&mtrg);
310
311 ajDebug("... %d matches", imatches);
312 if(imatches)
313 ajFmtPrintF(errorf, "Target %d %S matches %d\n",
314 itrg, ajSeqGetNameS(trgseq), imatches);
315 }
316
317 if(!ajAlignFormatShowsSequences(align))
318 {
319 ajMatrixfDel(&matrix);
320 }
321
322 AJFREE(path);
323 AJFREE(compass);
324
325 ajAlignClose(align);
326 ajAlignDel(&align);
327 ajSeqallDel(&trgseqs);
328 ajSeqDel(&trgseq);
329 ajSeqsetDel(&qryseqs);
330 ajFileClose(&errorf);
331
332 embExit();
333
334 return 0;
335 }
336
337
338
339
340 /* @funcstatic wordfinder_matchListOrder ************************************
341 **
342 ** Undocumented.
343 **
344 ** @param [r] x [void**] Undocumented
345 ** @param [r] cl [void*] Undocumented
346 ** @return [void]
347 ** @@
348 ******************************************************************************/
349
wordfinder_matchListOrder(void ** x,void * cl)350 static void wordfinder_matchListOrder(void **x,void *cl)
351 {
352 EmbPWordMatch p;
353 AjPList ordered;
354 ajint offset;
355 AjIList listIter;
356 concat *con;
357 concat *c=NULL;
358
359 p = (EmbPWordMatch)*x;
360 ordered = (AjPList) cl;
361
362 offset = (*p).seq1start-(*p).seq2start;
363
364 /* iterate through ordered list to find if it exists already*/
365 listIter = ajListIterNewread(ordered);
366
367 while(!ajListIterDone( listIter))
368 {
369 con = ajListIterGet(listIter);
370 if(con->offset == offset)
371 {
372 /* found so add count and set offset to the new value */
373 con->offset = offset;
374 con->total+= (*p).length;
375 con->count++;
376 ajListPushAppend(con->list,p);
377 ajListIterDel(&listIter);
378 return;
379 }
380 }
381 ajListIterDel(&listIter);
382
383 /* not found so add it */
384 AJNEW(c);
385 c->offset = offset;
386 c->total = (*p).length;
387 c->count = 1;
388 c->list = ajListNew();
389 ajListPushAppend(c->list,p);
390 ajListPushAppend(ordered, c);
391
392 return;
393 }
394
395
396
397
398 /* @funcstatic wordfinder_orderandconcat ************************************
399 **
400 ** Undocumented.
401 **
402 ** @param [u] list [AjPList] unordered input list - elements added to the
403 ** ordered list, but apparently not deleted.
404 ** @param [w] ordered [AjPList] ordered output list
405 ** @return [void]
406 ** @@
407 ******************************************************************************/
408
wordfinder_orderandconcat(AjPList list,AjPList ordered)409 static void wordfinder_orderandconcat(AjPList list,AjPList ordered)
410 {
411 ajListMap(list, &wordfinder_matchListOrder, ordered);
412
413 return;
414 }
415
416
417
418
419 /* @funcstatic wordfinder_removelists ***************************************
420 **
421 ** Undocumented.
422 **
423 ** @param [r] x [void**] Undocumented
424 ** @param [r] cl [void*] Undocumented
425 ** @return [void]
426 ** @@
427 ******************************************************************************/
428
wordfinder_removelists(void ** x,void * cl)429 static void wordfinder_removelists(void **x,void *cl)
430 {
431 concat *p;
432
433 (void) cl; /* make it used */
434
435 p = (concat *)*x;
436
437 ajListFree(&(p)->list);
438 AJFREE(p);
439
440 return;
441 }
442
443
444
445
446 /* @funcstatic wordfinder_findmax *******************************************
447 **
448 ** Undocumented.
449 **
450 ** @param [r] x [void**] Undocumented
451 ** @param [r] cl [void*] Undocumented
452 ** @return [void]
453 ** @@
454 ******************************************************************************/
455
wordfinder_findmax(void ** x,void * cl)456 static void wordfinder_findmax(void **x,void *cl)
457 {
458 concat *p;
459 ajint *max;
460
461 p = (concat *)*x;
462 max = (ajint *) cl;
463
464 if(p->total > *max)
465 {
466 *max = p->total;
467 conmax = p;
468 }
469
470 return;
471 }
472
473
474
475
476 /* @funcstatic wordfinder_findstartpoints ***********************************
477 **
478 ** Undocumented.
479 **
480 ** @param [w] seq1MatchTable [AjPTable] match table
481 ** @param [r] qryseq [const AjPSeq] query sequence 1
482 ** @param [r] trgseq [const AjPSeq] target sequence 2
483 ** @param [w] qrystart [ajint*] start in sequence 2
484 ** @param [w] trgstart [ajint*] start in sequence 1
485 ** @param [w] qryend [ajint*] end in sequence 2
486 ** @param [w] trgend [ajint*] end in sequence 1
487 ** @return [ajint] Undocumented
488 ** @@
489 ******************************************************************************/
490
wordfinder_findstartpoints(AjPTable seq1MatchTable,const AjPSeq qryseq,const AjPSeq trgseq,ajint * qrystart,ajint * trgstart,ajint * qryend,ajint * trgend)491 static ajint wordfinder_findstartpoints(AjPTable seq1MatchTable,
492 const AjPSeq qryseq,
493 const AjPSeq trgseq,
494 ajint *qrystart, ajint *trgstart,
495 ajint *qryend, ajint *trgend)
496 {
497 ajint max = -10;
498 ajint offset = 0;
499 AjPList matchlist = NULL;
500 AjPList ordered = NULL;
501 ajint trgmax;
502 ajint qrymax;
503 ajint bega;
504 ajint begb;
505
506 trgmax = ajSeqGetLen(trgseq)-1;
507 qrymax = ajSeqGetLen(qryseq)-1;
508 bega = ajSeqGetOffset(trgseq);
509 begb = ajSeqGetOffset(qryseq);
510
511
512 ajDebug("wordfinder_findstartpoints len %d %d off %d %d\n",
513 trgmax, qrymax, bega, begb);
514 matchlist = embWordBuildMatchTable(seq1MatchTable, qryseq, ajTrue);
515
516 if(!matchlist)
517 return 0;
518 else if(!matchlist->Count)
519 {
520 embWordMatchListDelete(&matchlist);
521 return 0;
522 }
523
524
525 /* order and add if the gap is gapmax or less */
526
527 /* create list header bit*/
528 ordered = ajListNew();
529
530 wordfinder_orderandconcat(matchlist, ordered);
531
532 ajListMap(ordered, &wordfinder_findmax, &max);
533
534 ajDebug("findstart conmax off:%d count:%d total:%d list:%Lu\n",
535 conmax->offset, conmax->count, conmax->total,
536 ajListGetLength(conmax->list));
537 offset = conmax->offset;
538
539 ajListMap(ordered, &wordfinder_removelists, NULL);
540 ajListFree(&ordered);
541 embWordMatchListDelete(&matchlist); /* free the match structures */
542
543
544 if(offset > 0)
545 {
546 *trgstart = offset;
547 *qrystart = 0;
548 }
549 else
550 {
551 *qrystart = 0-offset;
552 *trgstart = 0;
553 }
554 *trgend = *trgstart;
555 *qryend = *qrystart;
556
557 ajDebug("++ trgend %d -> %d qryend %d -> %d\n",
558 *trgend, trgmax, *qryend, qrymax);
559 while(*trgend<trgmax && *qryend<qrymax)
560 {
561 (*trgend)++;
562 (*qryend)++;
563 }
564
565 ajDebug("++ trgend %d qryend %d\n", *trgend, *qryend);
566
567
568 ajDebug("wordfinder_findstartpoints has %d..%d [%d] %d..%d [%d]\n",
569 *qrystart, *qryend, ajSeqGetLen(qryseq),
570 *trgstart, *trgend, ajSeqGetLen(trgseq));
571
572 return max;
573 }
574