1 /* @source wordfinder application
2 **
3 ** Word-based rapid comparison of large sequences
4 **
5 ** @author Copyright (C) Peter Rice
6 ** @@
7 **
8 ** This program is free software; you can redistribute it and/or
9 ** modify it under the terms of the GNU General Public License
10 ** as published by the Free Software Foundation; either version 2
11 ** of the License, or (at your option) any later version.
12 **
13 ** This program is distributed in the hope that it will be useful,
14 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 ** GNU General Public License for more details.
17 **
18 ** You should have received a copy of the GNU General Public License
19 ** along with this program; if not, write to the Free Software
20 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
21 ******************************************************************************/
22 
23 #include "emboss.h"
24 #include <limits.h>
25 #include <math.h>
26 
27 
28 /*
29 ** Notes:
30 ** Specific requests to search for word-based matches with 15-35 identical
31 ** residues ungapped, and then to check that these are the only significant
32 ** matches by checking for general homology, for example histones matching.
33 ** Intended for use in widely divergent species.
34 */
35 
36 
37 
38 
39 /* @datastatic concat *********************************************************
40 **
41 ** wordfinder internals
42 **
43 ** @alias concatS
44 **
45 ** @attr offset [ajint] Undocumented
46 ** @attr count [ajint] Undocumented
47 ** @attr list [AjPList] Undocumented
48 ** @attr total [ajint] Undocumented
49 ** @attr Padding [char[4]] Padding to alignment boundary
50 ******************************************************************************/
51 
52 typedef struct concatS
53 {
54     ajint offset;
55     ajint count;
56     AjPList list;
57     ajint total;
58     char  Padding[4];
59 } concat;
60 
61 
62 
63 
64 static void wordfinder_matchListOrder(void **x,void *cl);
65 static void wordfinder_orderandconcat(AjPList list,AjPList ordered);
66 static void wordfinder_removelists(void **x,void *cl);
67 static ajint wordfinder_findstartpoints(AjPTable seq1MatchTable,
68 					const AjPSeq b, const AjPSeq a,
69 					ajint *trgstart, ajint *qrystart,
70 					ajint *trgend, ajint *qryend);
71 static void wordfinder_findmax(void **x,void *cl);
72 
73 
74 
75 
76 concat *conmax = NULL;
77 ajint maxgap   = 0;
78 
79 
80 
81 
82 /* @prog wordfinder *********************************************************
83 **
84 ** Finds a match of a large sequence against one or more sequences
85 **
86 ** Create a word table for the first sequence.
87 ** Then go down second sequence checking to see if the word matches.
88 ** If word matches then check to see if the position lines up with the last
89 ** position if it does continue else stop.
90 ** This gives us the start (offset) for the smith-waterman match by finding
91 ** the biggest match and calculating start and ends for both sequences.
92 **
93 ******************************************************************************/
94 
main(int argc,char ** argv)95 int main(int argc, char **argv)
96 {
97     AjPSeqset qryseqs;
98     AjPSeqall trgseqs;
99     AjPSeq trgseq;
100     const AjPSeq qryseq;
101     AjPStr mtrg = 0;
102     AjPStr nqry = 0;
103 
104     AjPFile errorf;
105     AjBool show = ajFalse;
106 
107     ajint    trglen = 0;
108     ajint    qrylen = 0;
109 
110     const char   *ptrg;
111     const char   *qqry;
112 
113     AjPMatrixf matrix;
114     AjPSeqCvt cvt = 0;
115     float **sub;
116     ajint *compass = 0;
117     float *path = 0;
118 
119     float gapopen;
120     float gapextend;
121     float score = 0.0;
122     ajint matchscore = 0;
123     ajint limitmatch;
124     ajint limitalign;
125     ajint lowmatch;
126     ajint lowalign;
127 
128     ajint trgbegin;
129     ajint i;
130     ajuint k;
131     ajint qrybegin;
132     ajint trgstart = 0;
133     ajint qrystart = 0;
134     ajint trgend   = 0;
135     ajint qryend   = 0;
136     ajint width  = 0;
137     AjPTable seq1MatchTable = 0;
138     ajint wordlen = 6;
139     ajint oldmax = 0;
140     ajint newmax = 0;
141 
142     AjPAlign align = NULL;
143     ajint imatches = 0;
144     ajint itrg = 0;
145     AjPStr tmpstr = NULL;
146 
147     embInit("wordfinder", argc, argv);
148 
149     matrix    = ajAcdGetMatrixf("datafile");
150     qryseqs   = ajAcdGetSeqset("asequence");
151     trgseqs   = ajAcdGetSeqall("bsequence");
152     gapopen   = ajAcdGetFloat("gapopen");
153     gapextend = ajAcdGetFloat("gapextend");
154     wordlen   = ajAcdGetInt("wordlen");
155     limitmatch = ajAcdGetInt("limitmatch");
156     limitalign = ajAcdGetInt("limitalign");
157     lowmatch = ajAcdGetInt("lowmatch");
158     lowalign = ajAcdGetInt("lowalign");
159     align     = ajAcdGetAlign("outfile");
160     errorf    = ajAcdGetOutfile("errfile");
161     width     = ajAcdGetInt("width");	/* not the same as awidth */
162 
163     gapopen   = ajRoundFloat(gapopen, 8);
164     gapextend = ajRoundFloat(gapextend, 8);
165 
166 
167     sub = ajMatrixfGetMatrix(matrix);
168     cvt = ajMatrixfGetCvt(matrix);
169 
170     embWordLength(wordlen);
171 
172     ajSeqsetTrim(qryseqs);
173 
174     while(ajSeqallNext(trgseqs,&trgseq))
175     {
176         itrg++;
177         imatches = 0;
178         ajSeqTrim(trgseq);
179 	trgbegin = 1 + ajSeqGetOffset(trgseq);
180 
181 	mtrg = ajStrNewRes(1+ajSeqGetLen(trgseq));
182 
183 	trglen = ajSeqGetLen(trgseq);
184 
185 	ajDebug("Read '%S'\n", ajSeqGetNameS(trgseq));
186 	ajSeqTrace(trgseq);
187 	if(embWordGetTable(&seq1MatchTable, trgseq)) /* get table of words */
188 	{
189 	    for(k=0;k<ajSeqsetGetSize(qryseqs);k++)
190 	    {
191 	        qryseq = ajSeqsetGetseqSeq(qryseqs, k);
192 		qrylen   = ajSeqGetLen(qryseq);
193 		qrybegin = 1 + ajSeqGetOffset(qryseq);
194 
195 		ajDebug("Processing '%S'\n", ajSeqGetNameS(qryseq));
196 
197 		matchscore = wordfinder_findstartpoints(seq1MatchTable,
198 							qryseq, trgseq,
199 							&qrystart, &trgstart,
200 							&qryend, &trgend);
201 		if(!matchscore ||
202 		   (limitmatch && (matchscore > limitmatch)) ||
203 		   (matchscore < lowmatch))
204 		{
205 		    if(matchscore)
206 		    {
207 			ajFmtPrintF(errorf,"Match limits (%d..%d) exclude '%S' '%S' %d\n",
208 			       lowmatch, limitmatch,
209 			       ajSeqGetNameS(qryseq),
210 			       ajSeqGetNameS(trgseq),
211 			       matchscore);
212 		    }
213 		    continue;
214 		}
215 
216 		nqry=ajStrNewRes(1+ajSeqGetLen(qryseq));
217 		ptrg = ajSeqGetSeqC(trgseq);
218 		qqry = ajSeqGetSeqC(qryseq);
219 
220 		ajStrAssignC(&mtrg,"");
221 		ajStrAssignC(&nqry,"");
222 
223 		ajDebug("++ %S v %S start:%d %d end:%d %d\n",
224 			ajSeqGetNameS(qryseq), ajSeqGetNameS(trgseq),
225 			qrystart, trgstart, qryend, trgend);
226 		imatches++;
227 
228 		newmax = (trgend-trgstart+2)*width;
229 		if(newmax > oldmax)
230 		{
231 		    AJCRESIZE0(path,oldmax, newmax);
232 		    AJCRESIZE0(compass,oldmax,newmax);
233 		    oldmax = newmax;
234 		    ajDebug("++ resize to oldmax: %d\n", oldmax);
235 		}
236 
237 		for(i=0;i<((trgend-trgstart)+1)*width;i++)
238 		  path[i] = 0.0;
239 
240 		ajDebug("Calling embAlignPathCalcSWFast "
241 			"%d..%d [%d/%d] %d..%d [%d/%d] width:%d\n",
242 			trgstart, trgend, (trgend - trgstart + 1), trglen,
243 			qrystart, qryend, (qryend - qrystart + 1), qrylen,
244                         width);
245 
246 		score = embAlignPathCalcSWFast(&qqry[qrystart],
247                                                &ptrg[trgstart],
248                                                qryend-qrystart+1,
249                                                trgend-trgstart+1,
250                                                0,width,
251                                                gapopen,gapextend,path,sub,cvt,
252                                                compass,show);
253 
254 		    embAlignWalkSWMatrixFast(path,compass,gapopen,gapextend,
255 					     qryseq,trgseq,
256 					     &nqry,&mtrg,
257 					     qryend-qrystart+1,
258 					     trgend-trgstart+1,
259 					     0, width,
260 					     &qrystart,&trgstart);
261 
262             if(!ajAlignFormatShowsSequences(align))
263             {
264                 ajAlignDefineCC(align, ajStrGetPtr(mtrg),
265                         ajStrGetPtr(nqry), ajSeqGetNameC(trgseq),
266                         ajSeqGetNameC(qryseq));
267                 ajAlignSetScoreR(align, score);
268             }
269             else
270                 embAlignReportLocal(align,
271                         qryseq, trgseq,
272                         nqry,mtrg,
273                         qrystart,trgstart,
274                         gapopen, gapextend,
275                         score,matrix,
276                         qrybegin, trgbegin);
277 
278 
279 		    if((limitalign && (ajAlignGetLen(align) > limitalign)) ||
280 		       (ajAlignGetLen(align) < lowalign))
281 		    {
282 			imatches--;
283 			ajFmtPrintF(errorf,"Align limits (%d..%d) excludes '%S' '%S' %d\n",
284 			       lowalign, limitalign,
285 			       ajSeqGetNameS(qryseq),
286 			       ajSeqGetNameS(trgseq),
287 			       ajAlignGetLen(align));
288 		    }
289 		    else
290 		    {
291 		        if(ajAlignFormatShowsSequences(align))
292 		          {
293 		              ajFmtPrintS(&tmpstr, "\nWordscore:%d", matchscore);
294 		              ajAlignSetSubHeader(align, tmpstr);
295 		              ajFmtPrintS(&tmpstr, "Alignlength:%d",
296 		                      ajAlignGetLen(align));
297 		              ajAlignSetSubHeaderApp(align, tmpstr);
298 		              ajStrDel(&tmpstr);
299 		          }
300 		        ajAlignWrite(align);
301 		    }
302             ajAlignReset(align);
303             ajStrDel(&nqry);
304 	    }
305 	}
306 	embWordFreeTable(&seq1MatchTable); /* free table of words */
307 	seq1MatchTable=0;
308 
309 	ajStrDel(&mtrg);
310 
311 	ajDebug("... %d matches", imatches);
312 	if(imatches)
313 	    ajFmtPrintF(errorf, "Target %d %S matches %d\n",
314 	         itrg, ajSeqGetNameS(trgseq), imatches);
315     }
316 
317     if(!ajAlignFormatShowsSequences(align))
318     {
319         ajMatrixfDel(&matrix);
320     }
321 
322     AJFREE(path);
323     AJFREE(compass);
324 
325     ajAlignClose(align);
326     ajAlignDel(&align);
327     ajSeqallDel(&trgseqs);
328     ajSeqDel(&trgseq);
329     ajSeqsetDel(&qryseqs);
330     ajFileClose(&errorf);
331 
332     embExit();
333 
334     return 0;
335 }
336 
337 
338 
339 
340 /* @funcstatic wordfinder_matchListOrder ************************************
341 **
342 ** Undocumented.
343 **
344 ** @param [r] x [void**] Undocumented
345 ** @param [r] cl [void*] Undocumented
346 ** @return [void]
347 ** @@
348 ******************************************************************************/
349 
wordfinder_matchListOrder(void ** x,void * cl)350 static void wordfinder_matchListOrder(void **x,void *cl)
351 {
352     EmbPWordMatch p;
353     AjPList ordered;
354     ajint offset;
355     AjIList listIter;
356     concat *con;
357     concat *c=NULL;
358 
359     p = (EmbPWordMatch)*x;
360     ordered = (AjPList) cl;
361 
362     offset = (*p).seq1start-(*p).seq2start;
363 
364     /* iterate through ordered list to find if it exists already*/
365     listIter = ajListIterNewread(ordered);
366 
367     while(!ajListIterDone( listIter))
368     {
369 	con = ajListIterGet(listIter);
370 	if(con->offset == offset)
371 	{
372 	    /* found so add count and set offset to the new value */
373 	    con->offset = offset;
374 	    con->total+= (*p).length;
375 	    con->count++;
376 	    ajListPushAppend(con->list,p);
377 	    ajListIterDel(&listIter);
378 	    return;
379 	}
380     }
381     ajListIterDel(&listIter);
382 
383     /* not found so add it */
384     AJNEW(c);
385     c->offset = offset;
386     c->total  = (*p).length;
387     c->count  = 1;
388     c->list   = ajListNew();
389     ajListPushAppend(c->list,p);
390     ajListPushAppend(ordered, c);
391 
392     return;
393 }
394 
395 
396 
397 
398 /* @funcstatic wordfinder_orderandconcat ************************************
399 **
400 ** Undocumented.
401 **
402 ** @param [u] list [AjPList] unordered input list - elements added to the
403 **                           ordered list, but apparently not deleted.
404 ** @param [w] ordered [AjPList] ordered output list
405 ** @return [void]
406 ** @@
407 ******************************************************************************/
408 
wordfinder_orderandconcat(AjPList list,AjPList ordered)409 static void wordfinder_orderandconcat(AjPList list,AjPList ordered)
410 {
411     ajListMap(list, &wordfinder_matchListOrder, ordered);
412 
413     return;
414 }
415 
416 
417 
418 
419 /* @funcstatic wordfinder_removelists ***************************************
420 **
421 ** Undocumented.
422 **
423 ** @param [r] x [void**] Undocumented
424 ** @param [r] cl [void*] Undocumented
425 ** @return [void]
426 ** @@
427 ******************************************************************************/
428 
wordfinder_removelists(void ** x,void * cl)429 static void wordfinder_removelists(void **x,void *cl)
430 {
431     concat *p;
432 
433     (void) cl;				/* make it used */
434 
435     p = (concat *)*x;
436 
437     ajListFree(&(p)->list);
438     AJFREE(p);
439 
440     return;
441 }
442 
443 
444 
445 
446 /* @funcstatic wordfinder_findmax *******************************************
447 **
448 ** Undocumented.
449 **
450 ** @param [r] x [void**] Undocumented
451 ** @param [r] cl [void*] Undocumented
452 ** @return [void]
453 ** @@
454 ******************************************************************************/
455 
wordfinder_findmax(void ** x,void * cl)456 static void wordfinder_findmax(void **x,void *cl)
457 {
458     concat *p;
459     ajint *max;
460 
461     p   = (concat *)*x;
462     max = (ajint *) cl;
463 
464     if(p->total > *max)
465     {
466 	*max = p->total;
467 	conmax = p;
468     }
469 
470     return;
471 }
472 
473 
474 
475 
476 /* @funcstatic wordfinder_findstartpoints ***********************************
477 **
478 ** Undocumented.
479 **
480 ** @param [w] seq1MatchTable [AjPTable] match table
481 ** @param [r] qryseq [const AjPSeq] query sequence 1
482 ** @param [r] trgseq [const AjPSeq] target sequence 2
483 ** @param [w] qrystart [ajint*] start in sequence 2
484 ** @param [w] trgstart [ajint*] start in sequence 1
485 ** @param [w] qryend [ajint*] end in sequence 2
486 ** @param [w] trgend [ajint*] end in sequence 1
487 ** @return [ajint] Undocumented
488 ** @@
489 ******************************************************************************/
490 
wordfinder_findstartpoints(AjPTable seq1MatchTable,const AjPSeq qryseq,const AjPSeq trgseq,ajint * qrystart,ajint * trgstart,ajint * qryend,ajint * trgend)491 static ajint wordfinder_findstartpoints(AjPTable seq1MatchTable,
492 					const AjPSeq qryseq,
493 					const AjPSeq trgseq,
494 					ajint *qrystart, ajint *trgstart,
495 					ajint *qryend,	ajint *trgend)
496 {
497     ajint max = -10;
498     ajint offset = 0;
499     AjPList matchlist = NULL;
500     AjPList ordered = NULL;
501     ajint trgmax;
502     ajint qrymax;
503     ajint bega;
504     ajint begb;
505 
506     trgmax = ajSeqGetLen(trgseq)-1;
507     qrymax = ajSeqGetLen(qryseq)-1;
508     bega = ajSeqGetOffset(trgseq);
509     begb = ajSeqGetOffset(qryseq);
510 
511 
512     ajDebug("wordfinder_findstartpoints len %d %d off %d %d\n",
513 	     trgmax, qrymax, bega, begb);
514     matchlist = embWordBuildMatchTable(seq1MatchTable, qryseq, ajTrue);
515 
516     if(!matchlist)
517 	return 0;
518     else if(!matchlist->Count)
519     {
520         embWordMatchListDelete(&matchlist);
521 	return 0;
522     }
523 
524 
525     /* order and add if the gap is gapmax or less */
526 
527     /* create list header bit*/
528     ordered = ajListNew();
529 
530     wordfinder_orderandconcat(matchlist, ordered);
531 
532     ajListMap(ordered, &wordfinder_findmax, &max);
533 
534     ajDebug("findstart conmax off:%d count:%d total:%d list:%Lu\n",
535 	    conmax->offset, conmax->count, conmax->total,
536 	    ajListGetLength(conmax->list));
537     offset = conmax->offset;
538 
539     ajListMap(ordered, &wordfinder_removelists, NULL);
540     ajListFree(&ordered);
541     embWordMatchListDelete(&matchlist);	/* free the match structures */
542 
543 
544     if(offset > 0)
545     {
546 	*trgstart = offset;
547 	*qrystart = 0;
548     }
549     else
550     {
551 	*qrystart = 0-offset;
552 	*trgstart = 0;
553     }
554     *trgend = *trgstart;
555     *qryend = *qrystart;
556 
557     ajDebug("++ trgend %d -> %d qryend %d -> %d\n",
558 	    *trgend, trgmax, *qryend, qrymax);
559     while(*trgend<trgmax && *qryend<qrymax)
560     {
561 	(*trgend)++;
562 	(*qryend)++;
563     }
564 
565     ajDebug("++ trgend %d qryend %d\n", *trgend, *qryend);
566 
567 
568     ajDebug("wordfinder_findstartpoints has %d..%d [%d] %d..%d [%d]\n",
569 	    *qrystart, *qryend, ajSeqGetLen(qryseq),
570 	    *trgstart, *trgend, ajSeqGetLen(trgseq));
571 
572     return max;
573 }
574