1 /* @source merger application
2 **
3 ** Merge two sequences after a global alignment
4 ** @author Copyright (C) Gary Williams (gwilliam@hgmp.mrc.ac.uk)
5 ** @@
6 **
7 ** Closely based on work by Alan Bleasby
8 **
9 ** This program is free software; you can redistribute it and/or
10 ** modify it under the terms of the GNU General Public License
11 ** as published by the Free Software Foundation; either version 2
12 ** of the License, or (at your option) any later version.
13 **
14 ** This program is distributed in the hope that it will be useful,
15 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17 ** GNU General Public License for more details.
18 **
19 ** You should have received a copy of the GNU General Public License
20 ** along with this program; if not, write to the Free Software
21 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
22 ******************************************************************************/
23 
24 #include "emboss.h"
25 
26 
27 
28 
29 static void merger_Merge(AjPAlign align, AjPStr *merged,
30 			 const char *a, const char *b,
31 			 const AjPStr m, const AjPStr n,
32 			 ajint start1,
33 			 ajint start2,
34 			 const char *namea, const char *nameb);
35 
36 static float merger_quality(const char * seq, ajuint pos, ajuint window);
37 
38 static AjBool merger_bestquality(const char * a, const char *b,
39 				 ajint apos, ajint bpos);
40 
41 
42 
43 
44 /* @prog merger ***************************************************************
45 **
46 ** Merge two overlapping nucleic acid sequences
47 **
48 ******************************************************************************/
49 
main(int argc,char ** argv)50 int main(int argc, char **argv)
51 {
52     AjPAlign align;
53     AjPSeq a;
54     AjPSeq b;
55     AjPSeqout seqout;
56 
57     AjPStr m;
58     AjPStr n;
59 
60     AjPStr merged = NULL;
61 
62     ajuint lena;
63     ajuint lenb;
64 
65     const char   *p;
66     const char   *q;
67 
68     ajint start1 = 0;
69     ajint start2 = 0;
70 
71     float *path;
72     ajint *compass;
73 
74     AjPMatrixf matrix;
75     AjPSeqCvt  cvt = 0;
76     float **sub;
77 
78     float gapopen;
79     float gapextend;
80     ajulong maxarr = 1000;
81     ajulong len;		  /* arbitrary. realloc'd if needed */
82     size_t  stlen;
83 
84     float score;
85     ajint begina;
86     ajint beginb;
87 
88     embInit("merger", argc, argv);
89 
90     a         = ajAcdGetSeq("asequence");
91     b         = ajAcdGetSeq("bsequence");
92     seqout    = ajAcdGetSeqout("outseq");
93     matrix    = ajAcdGetMatrixf("datafile");
94     gapopen   = ajAcdGetFloat("gapopen");
95     gapextend = ajAcdGetFloat("gapextend");
96     align     = ajAcdGetAlign("outfile");
97 
98     gapopen = ajRoundFloat(gapopen, 8);
99     gapextend = ajRoundFloat(gapextend, 8);
100 
101     AJCNEW(path, maxarr);
102     AJCNEW(compass, maxarr);
103 
104     /*
105     **  make the two sequences lowercase so we can show which one we are
106     **  using in the merge by uppercasing it
107     */
108 
109     ajSeqFmtLower(a);
110     ajSeqFmtLower(b);
111 
112     m = ajStrNew();
113     n = ajStrNew();
114 
115     sub = ajMatrixfGetMatrix(matrix);
116     cvt = ajMatrixfGetCvt(matrix);
117 
118     begina = ajSeqGetBegin(a);
119     beginb = ajSeqGetBegin(b);
120 
121     lena = ajSeqGetLen(a);
122     lenb = ajSeqGetLen(b);
123 
124     if(lenb > (ULONG_MAX/(ajulong)(lena+1)))
125 	ajFatal("Sequences too big. Try 'supermatcher'");
126 
127     len  = lena*lenb;
128 
129     if(len>maxarr)
130     {
131 
132 	ajDebug("merger: resize path, len to %d (%d * $d)\n",
133 		len, lena, lenb);
134 
135 	stlen = (size_t) len;
136         AJCRESIZE(path,stlen);
137         AJCRESIZE(compass,stlen);
138         maxarr=len;
139     }
140 
141 
142     p = ajSeqGetSeqC(a);
143     q = ajSeqGetSeqC(b);
144 
145     ajStrAssignC(&m,"");
146     ajStrAssignC(&n,"");
147 
148     score = embAlignPathCalc(p,q,lena,lenb,gapopen,gapextend,path,sub,cvt,
149 		     compass, ajFalse);
150 
151     /*score = embAlignScoreNWMatrix(path,compass,gapopen,gapextend,
152                                   a,b,lena,lenb,sub,cvt,
153 				  &start1,&start2);*/
154 
155     embAlignWalkNWMatrix(path,a,b,&m,&n,lena,lenb, &start1,&start2,gapopen,
156 			 gapextend,compass);
157 
158     /*
159     ** now construct the merged sequence, uppercase the bits of the two
160     ** input sequences which are used in the merger
161     */
162     merger_Merge(align, &merged,p,q,m,n,start1,start2,
163 		 ajSeqGetNameC(a),ajSeqGetNameC(b));
164 
165     embAlignReportGlobal(align, a, b, m, n,
166 			 start1, start2, gapopen, gapextend,
167 			 score, matrix, begina, beginb);
168 
169     ajAlignWrite(align);
170     ajAlignReset(align);
171 
172     /* write the merged sequence */
173     ajSeqAssignSeqS(a, merged);
174     ajSeqoutWriteSeq(seqout, a);
175     ajSeqoutClose(seqout);
176     ajSeqoutDel(&seqout);
177 
178     ajSeqDel(&a);
179     ajSeqDel(&b);
180 
181     ajAlignClose(align);
182     ajAlignDel(&align);
183     ajStrDel(&merged);
184 
185     AJFREE(compass);
186     AJFREE(path);
187 
188     ajStrDel(&n);
189     ajStrDel(&m);
190 
191     embExit();
192 
193     return 0;
194 }
195 
196 
197 
198 
199 /* @funcstatic merger_Merge ***************************************************
200 **
201 ** Print a global alignment
202 ** Nucleotides or proteins as needed.
203 **
204 ** @param [w] align [AjPAlign] Alignment object
205 ** @param [w] ms [AjPStr *] output merged sequence
206 ** @param [r] a [const char *] complete first sequence
207 ** @param [r] b [const char *] complete second sequence
208 ** @param [r] m [const AjPStr] Walk alignment for first sequence
209 ** @param [r] n [const AjPStr] Walk alignment for second sequence
210 ** @param [r] start1 [ajint] start of alignment in first sequence
211 ** @param [r] start2 [ajint] start of alignment in second sequence
212 ** @param [r] namea [const char *] name of first sequence
213 ** @param [r] nameb [const char *] name of second sequence
214 **
215 ** @return [void]
216 ******************************************************************************/
217 
merger_Merge(AjPAlign align,AjPStr * ms,const char * a,const char * b,const AjPStr m,const AjPStr n,ajint start1,ajint start2,const char * namea,const char * nameb)218 static void merger_Merge(AjPAlign align, AjPStr *ms,
219 			 const char *a, const char *b,
220 			 const AjPStr m, const AjPStr n, ajint start1,
221 			 ajint start2,
222 			 const char *namea, const char *nameb)
223 {
224     ajint apos;
225     ajint bpos;
226     ajint i;
227     AjPStr mm = NULL;
228     AjPStr nn = NULL;
229 
230     char *p;
231     char *q;
232 
233     ajint olen;				/* length of walk alignment */
234     size_t tt;
235 
236     /* lengths of the sequences after the aligned region */
237     ajint alen;
238     ajint blen;
239     AjPStr tmpstr = NULL;
240 
241     mm = ajStrNewS(m);
242     nn = ajStrNewS(n);
243 
244     p    = ajStrGetuniquePtr(&mm);
245     q    = ajStrGetuniquePtr(&nn);
246     olen = ajStrGetLen(mm);
247 
248     /* output the left hand side */
249     if(start1 > start2)
250     {
251 	for(i=0; i<start1; i++)
252 	    ajStrAppendK(ms, a[i]);
253 
254 	if(start2)
255 	{
256 	    ajFmtPrintAppS(&tmpstr, "WARNING: *************************"
257 			   "********\n");
258 	    ajFmtPrintAppS(&tmpstr, "The region of alignment only starts at "
259 			   "position %d of sequence %s\n", start2+1, nameb);
260 	    ajFmtPrintAppS(&tmpstr, "Only the sequence of %s is being used "
261 			   "before this point\n\n", namea);
262 	    ajAlignSetTailApp(align, tmpstr);
263 	    ajStrDel(&tmpstr);
264 	}
265     }
266     else if(start2 > start1)
267     {
268 	for(i=0; i<start2; i++)
269 	    ajStrAppendK(ms, b[i]);
270 
271 	if(start1)
272 	{
273 	    ajFmtPrintAppS(&tmpstr, "WARNING: **************************"
274 			   "*******\n");
275 	    ajFmtPrintAppS(&tmpstr, "The region of alignment only starts at "
276 			   "position %d of sequence %s\n", start1+1, namea);
277 	    ajFmtPrintAppS(&tmpstr, "Only the sequence of %s is being used "
278 			   "before this point\n\n", nameb);
279 	    ajAlignSetTailApp(align, tmpstr);
280 	    ajStrDel(&tmpstr);
281 	}
282     }
283     else if(start1 && start2)
284     {
285 	/* both the same length and > 1 before the aligned region */
286 	ajFmtPrintAppS(&tmpstr,
287 			      "WARNING: *********************************\n");
288 	ajFmtPrintAppS(&tmpstr, "There is an equal amount of unaligned "
289 		       "sequence (%d) at the start of the sequences.\n",
290 		       start1);
291 	ajFmtPrintAppS(&tmpstr, "Sequence %s is being arbitrarily chosen "
292 		       "for the merged sequence\n\n", namea);
293 	ajAlignSetTailApp(align, tmpstr);
294 	ajStrDel(&tmpstr);
295 
296 	for(i=0; i<start1; i++)
297 	    ajStrAppendK(ms, a[i]);
298     }
299 
300     /* header */
301     ajFmtPrintS(&tmpstr, "Conflicts: %15.15s %15.15s\n", namea, nameb);
302     ajFmtPrintAppS(&tmpstr,
303 		   "             position base   position base Using\n");
304     ajAlignSetTailApp(align, tmpstr);
305 
306     /* make the merged sequence
307     **
308     **  point to the start of the alignment in the complete unaligned
309     **  sequences
310     */
311     apos = start1;
312     bpos = start2;
313 
314     for(i=0; i<olen; i++)
315     {
316 	if(p[i]=='.' || p[i]==' ' || p[i]=='-' ||
317 	   q[i]=='.' || q[i]==' ' || q[i]=='-')
318 	{				/* gap! */
319 	    if(merger_bestquality(a, b, apos, bpos))
320 	    {
321 		p[i] = toupper((ajint)p[i]);
322 		if(p[i] != '.' && p[i] != ' ' && p[i] != '-')
323 		    ajStrAppendK(ms, p[i]);
324 		ajFmtPrintS(&tmpstr,
325 			    "             %8d  '%c'   %8d  '%c'   '%c'\n",
326 			    apos+1, p[i], bpos+1, q[i], p[i]);
327 		ajAlignSetTailApp(align, tmpstr);
328 	    }
329 	    else
330 	    {
331 		q[i] = toupper((ajint)q[i]);
332 		if(q[i] != '.' && q[i] != ' ' && q[i] != '-')
333 		    ajStrAppendK(ms, q[i]);
334 		ajFmtPrintS(&tmpstr,
335 			    "             %8d  '%c'   %8d  '%c'   '%c'\n",
336 			    apos+1, p[i], bpos+1, q[i], q[i]);
337 		ajAlignSetTailApp(align, tmpstr);
338 	    }
339 
340 	}
341 	else if(p[i]=='n' || p[i]=='N')
342 	{
343 	    q[i] = toupper((ajint)q[i]);
344 	    ajStrAppendK(ms, q[i]);
345 	}
346 	else if(q[i]=='n' || q[i]=='N')
347 	{
348 	    p[i] = toupper((ajint)p[i]);
349 	    ajStrAppendK(ms, p[i]);
350 	}
351 	else if(p[i] != q[i])
352 	{
353 	    /*
354 	    **  get the sequence with the best quality and use the base
355 	    **  from that one
356 	    */
357 	    if(merger_bestquality(a, b, apos, bpos))
358 	    {
359 		p[i] = toupper((ajint)p[i]);
360 		ajStrAppendK(ms, p[i]);
361 		ajFmtPrintS(&tmpstr,
362 			    "             %8d  '%c'   %8d  '%c'   '%c'\n",
363 			    apos+1, p[i], bpos+1, q[i], p[i]);
364 		ajAlignSetTailApp(align, tmpstr);
365 	    }
366 	    else
367 	    {
368 		q[i] = toupper((ajint)q[i]);
369 		ajStrAppendK(ms, q[i]);
370 		ajFmtPrintS(&tmpstr,
371 			    "             %8d  '%c'   %8d  '%c'   '%c'\n",
372 			    apos+1, p[i], bpos+1, q[i], q[i]);
373 		ajAlignSetTailApp(align, tmpstr);
374 	    }
375 
376 	}
377 	else
378 	    ajStrAppendK(ms, p[i]);
379 
380 	/* update the positions in the unaligned complete sequences */
381 	if(p[i] != '.' &&  p[i] != ' ' &&  p[i] != '-') apos++;
382 	if(q[i] != '.' &&  q[i] != ' ' &&  q[i] != '-') bpos++;
383     }
384 
385     /* output the right hand side */
386     tt = strlen(&a[apos]);
387     alen = (ajint) tt;
388 
389     tt = strlen(&b[bpos]);
390     blen = (ajint) tt;
391 
392     if(alen > blen)
393     {
394 	ajStrAppendC(ms, &a[apos]);
395 	if(blen)
396 	{
397 	    ajFmtPrintAppS(&tmpstr, "WARNING: ***************************"
398 			   "******\n");
399 	    ajFmtPrintAppS(&tmpstr, "The region of alignment ends at "
400 			   "position %d of sequence %s\n",
401 			   bpos+1, nameb);
402 	    ajFmtPrintAppS(&tmpstr, "Only the sequence of %s is being used "
403 			   "after this point\n\n", namea);
404 	    ajAlignSetTailApp(align, tmpstr);
405 	    ajStrDel(&tmpstr);
406 	}
407 
408     }
409 
410     if(blen > alen)
411     {
412 	ajStrAppendC(ms, &b[bpos]);
413 	if(alen)
414 	{
415 	    ajFmtPrintAppS(&tmpstr, "WARNING: ***************************"
416 			   "******\n");
417 	    ajFmtPrintAppS(&tmpstr, "The region of alignment ends at "
418 			   "position %d of sequence %s\n",
419 			   apos+1, namea);
420 	    ajFmtPrintAppS(&tmpstr, "Only the sequence of %s is being used "
421 			   "after this point\n\n", nameb);
422 	    ajAlignSetTailApp(align, tmpstr);
423 	    ajStrDel(&tmpstr);
424 	}
425     }
426     else if(alen && blen)
427     {	/* both the same length and > 1 */
428 	ajFmtPrintAppS(&tmpstr, "WARNING: ************************"
429 		       "*********\n");
430 	ajFmtPrintAppS(&tmpstr, "There is an equal amount of unaligned "
431 		       "sequence (%d) at the end of the sequences.\n",
432 		       alen);
433 	ajFmtPrintAppS(&tmpstr, "Sequence %s is being arbitrarily chosen "
434 		       "for the merged sequence\n\n", namea);
435 	ajStrAppendC(ms, &a[apos]);
436 	ajAlignSetTailApp(align, tmpstr);
437 	ajStrDel(&tmpstr);
438     }
439 
440     ajStrDel(&mm);
441     ajStrDel(&nn);
442     ajStrDel(&tmpstr);
443 
444     return;
445 }
446 
447 
448 
449 
450 /* @funcstatic merger_bestquality *********************************************
451 **
452 ** Return ajTrue if the first sequence has the best quality
453 ** If both sequences have the same quality, pick the first
454 **
455 ** @param [r] a [const char*] First sequence
456 ** @param [r] b [const char*] Second sequence
457 ** @param [r] apos [ajint] Position in first sequence
458 ** @param [r] bpos [ajint] Position in second sequence
459 ** @return [AjBool] ajTrue = first sequence is the best quality at this point
460 **
461 ******************************************************************************/
462 
merger_bestquality(const char * a,const char * b,ajint apos,ajint bpos)463 static AjBool merger_bestquality(const char * a, const char *b,
464 				 ajint apos, ajint bpos)
465 {
466     float qa;
467     float qb;
468 
469     qa = merger_quality(a, apos, 5);
470     qb = merger_quality(b, bpos, 5);
471 
472     if(qa == qb)
473     {
474 	/* both have the same quality, use a larger window */
475 	qa = merger_quality(a, apos, 10);
476 	qb = merger_quality(b, bpos, 10);
477     }
478 
479     if(qa == qb)
480     {
481 	/* both have the same quality, use a larger window */
482 	qa = merger_quality(a, apos, 20);
483 	qb = merger_quality(b, bpos, 20);
484     }
485 
486     ajDebug("merger_bestquality %d..%d = %.3f %.3f\n", apos, bpos, qa, qb);
487 
488     if(qa >= qb)
489 	/*  both have the same quality, use the first sequence */
490 	return ajTrue;
491 
492     return ajFalse;
493 }
494 
495 
496 
497 
498 /* @funcstatic merger_quality *************************************************
499 **
500 ** Calculate the quality of a window of a sequence
501 **
502 ** quality = sequence value/length under window either side of a position
503 **
504 ** sequence value = sum of points in that subsequence
505 **
506 ** good bases = 2 points
507 **
508 ** ambiguous bases = 1 point
509 **
510 ** N's = 0 points
511 **
512 ** off end of the sequence = 0 points
513 **
514 ** THIS HEAVILY DISCRIMINATES AGAINST THE IFFY BITS AT THE END OF
515 ** SEQUENCE READS
516 **
517 ** @param [r] seq [const char*] Sequence
518 ** @param [r] pos [ajuint] Position
519 ** @param [r] window [ajuint] Window size
520 ** @return [float] quality of the window
521 **
522 ******************************************************************************/
523 
merger_quality(const char * seq,ajuint pos,ajuint window)524 static float merger_quality(const char * seq, ajuint pos, ajuint window)
525 {
526     ajint value = 0;
527     ajuint i;
528     ajint j;
529     ajint jlast;
530     float tf;
531 
532     for(i=pos; i<pos+window && i < strlen(seq); i++)
533 	if(strchr("aAcCgGtTuU", seq[i]))
534 	    /* good bases count for two points */
535 	    value+=2;
536 	else if(strchr("mrwsykvhdbMRWSYKVHDB", seq[i]))
537 	    /* ambiguous bases count for only one point */
538 	    value++;
539     jlast = pos-window;
540     for(j=pos-1; j>jlast && j>=0; j--)
541 	if(strchr("aAcCgGtTuU", seq[j]))
542 	    /* good bases count for two points */
543 	    value+=2;
544 	else if(strchr("mrwsykvhdbMRWSYKVHDB", seq[j]))
545 	    /* ambiguous bases count for only one point */
546 	    value++;
547 
548     tf = (float) ((double)value/((double)window*2.+1.));
549 
550     return tf;
551 }
552 
553