1 /* @source merger application
2 **
3 ** Merge two sequences after a global alignment
4 ** @author Copyright (C) Gary Williams (gwilliam@hgmp.mrc.ac.uk)
5 ** @@
6 **
7 ** Closely based on work by Alan Bleasby
8 **
9 ** This program is free software; you can redistribute it and/or
10 ** modify it under the terms of the GNU General Public License
11 ** as published by the Free Software Foundation; either version 2
12 ** of the License, or (at your option) any later version.
13 **
14 ** This program is distributed in the hope that it will be useful,
15 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 ** GNU General Public License for more details.
18 **
19 ** You should have received a copy of the GNU General Public License
20 ** along with this program; if not, write to the Free Software
21 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
22 ******************************************************************************/
23
24 #include "emboss.h"
25
26
27
28
29 static void merger_Merge(AjPAlign align, AjPStr *merged,
30 const char *a, const char *b,
31 const AjPStr m, const AjPStr n,
32 ajint start1,
33 ajint start2,
34 const char *namea, const char *nameb);
35
36 static float merger_quality(const char * seq, ajuint pos, ajuint window);
37
38 static AjBool merger_bestquality(const char * a, const char *b,
39 ajint apos, ajint bpos);
40
41
42
43
44 /* @prog merger ***************************************************************
45 **
46 ** Merge two overlapping nucleic acid sequences
47 **
48 ******************************************************************************/
49
main(int argc,char ** argv)50 int main(int argc, char **argv)
51 {
52 AjPAlign align;
53 AjPSeq a;
54 AjPSeq b;
55 AjPSeqout seqout;
56
57 AjPStr m;
58 AjPStr n;
59
60 AjPStr merged = NULL;
61
62 ajuint lena;
63 ajuint lenb;
64
65 const char *p;
66 const char *q;
67
68 ajint start1 = 0;
69 ajint start2 = 0;
70
71 float *path;
72 ajint *compass;
73
74 AjPMatrixf matrix;
75 AjPSeqCvt cvt = 0;
76 float **sub;
77
78 float gapopen;
79 float gapextend;
80 ajulong maxarr = 1000;
81 ajulong len; /* arbitrary. realloc'd if needed */
82 size_t stlen;
83
84 float score;
85 ajint begina;
86 ajint beginb;
87
88 embInit("merger", argc, argv);
89
90 a = ajAcdGetSeq("asequence");
91 b = ajAcdGetSeq("bsequence");
92 seqout = ajAcdGetSeqout("outseq");
93 matrix = ajAcdGetMatrixf("datafile");
94 gapopen = ajAcdGetFloat("gapopen");
95 gapextend = ajAcdGetFloat("gapextend");
96 align = ajAcdGetAlign("outfile");
97
98 gapopen = ajRoundFloat(gapopen, 8);
99 gapextend = ajRoundFloat(gapextend, 8);
100
101 AJCNEW(path, maxarr);
102 AJCNEW(compass, maxarr);
103
104 /*
105 ** make the two sequences lowercase so we can show which one we are
106 ** using in the merge by uppercasing it
107 */
108
109 ajSeqFmtLower(a);
110 ajSeqFmtLower(b);
111
112 m = ajStrNew();
113 n = ajStrNew();
114
115 sub = ajMatrixfGetMatrix(matrix);
116 cvt = ajMatrixfGetCvt(matrix);
117
118 begina = ajSeqGetBegin(a);
119 beginb = ajSeqGetBegin(b);
120
121 lena = ajSeqGetLen(a);
122 lenb = ajSeqGetLen(b);
123
124 if(lenb > (ULONG_MAX/(ajulong)(lena+1)))
125 ajFatal("Sequences too big. Try 'supermatcher'");
126
127 len = lena*lenb;
128
129 if(len>maxarr)
130 {
131
132 ajDebug("merger: resize path, len to %d (%d * $d)\n",
133 len, lena, lenb);
134
135 stlen = (size_t) len;
136 AJCRESIZE(path,stlen);
137 AJCRESIZE(compass,stlen);
138 maxarr=len;
139 }
140
141
142 p = ajSeqGetSeqC(a);
143 q = ajSeqGetSeqC(b);
144
145 ajStrAssignC(&m,"");
146 ajStrAssignC(&n,"");
147
148 score = embAlignPathCalc(p,q,lena,lenb,gapopen,gapextend,path,sub,cvt,
149 compass, ajFalse);
150
151 /*score = embAlignScoreNWMatrix(path,compass,gapopen,gapextend,
152 a,b,lena,lenb,sub,cvt,
153 &start1,&start2);*/
154
155 embAlignWalkNWMatrix(path,a,b,&m,&n,lena,lenb, &start1,&start2,gapopen,
156 gapextend,compass);
157
158 /*
159 ** now construct the merged sequence, uppercase the bits of the two
160 ** input sequences which are used in the merger
161 */
162 merger_Merge(align, &merged,p,q,m,n,start1,start2,
163 ajSeqGetNameC(a),ajSeqGetNameC(b));
164
165 embAlignReportGlobal(align, a, b, m, n,
166 start1, start2, gapopen, gapextend,
167 score, matrix, begina, beginb);
168
169 ajAlignWrite(align);
170 ajAlignReset(align);
171
172 /* write the merged sequence */
173 ajSeqAssignSeqS(a, merged);
174 ajSeqoutWriteSeq(seqout, a);
175 ajSeqoutClose(seqout);
176 ajSeqoutDel(&seqout);
177
178 ajSeqDel(&a);
179 ajSeqDel(&b);
180
181 ajAlignClose(align);
182 ajAlignDel(&align);
183 ajStrDel(&merged);
184
185 AJFREE(compass);
186 AJFREE(path);
187
188 ajStrDel(&n);
189 ajStrDel(&m);
190
191 embExit();
192
193 return 0;
194 }
195
196
197
198
199 /* @funcstatic merger_Merge ***************************************************
200 **
201 ** Print a global alignment
202 ** Nucleotides or proteins as needed.
203 **
204 ** @param [w] align [AjPAlign] Alignment object
205 ** @param [w] ms [AjPStr *] output merged sequence
206 ** @param [r] a [const char *] complete first sequence
207 ** @param [r] b [const char *] complete second sequence
208 ** @param [r] m [const AjPStr] Walk alignment for first sequence
209 ** @param [r] n [const AjPStr] Walk alignment for second sequence
210 ** @param [r] start1 [ajint] start of alignment in first sequence
211 ** @param [r] start2 [ajint] start of alignment in second sequence
212 ** @param [r] namea [const char *] name of first sequence
213 ** @param [r] nameb [const char *] name of second sequence
214 **
215 ** @return [void]
216 ******************************************************************************/
217
merger_Merge(AjPAlign align,AjPStr * ms,const char * a,const char * b,const AjPStr m,const AjPStr n,ajint start1,ajint start2,const char * namea,const char * nameb)218 static void merger_Merge(AjPAlign align, AjPStr *ms,
219 const char *a, const char *b,
220 const AjPStr m, const AjPStr n, ajint start1,
221 ajint start2,
222 const char *namea, const char *nameb)
223 {
224 ajint apos;
225 ajint bpos;
226 ajint i;
227 AjPStr mm = NULL;
228 AjPStr nn = NULL;
229
230 char *p;
231 char *q;
232
233 ajint olen; /* length of walk alignment */
234 size_t tt;
235
236 /* lengths of the sequences after the aligned region */
237 ajint alen;
238 ajint blen;
239 AjPStr tmpstr = NULL;
240
241 mm = ajStrNewS(m);
242 nn = ajStrNewS(n);
243
244 p = ajStrGetuniquePtr(&mm);
245 q = ajStrGetuniquePtr(&nn);
246 olen = ajStrGetLen(mm);
247
248 /* output the left hand side */
249 if(start1 > start2)
250 {
251 for(i=0; i<start1; i++)
252 ajStrAppendK(ms, a[i]);
253
254 if(start2)
255 {
256 ajFmtPrintAppS(&tmpstr, "WARNING: *************************"
257 "********\n");
258 ajFmtPrintAppS(&tmpstr, "The region of alignment only starts at "
259 "position %d of sequence %s\n", start2+1, nameb);
260 ajFmtPrintAppS(&tmpstr, "Only the sequence of %s is being used "
261 "before this point\n\n", namea);
262 ajAlignSetTailApp(align, tmpstr);
263 ajStrDel(&tmpstr);
264 }
265 }
266 else if(start2 > start1)
267 {
268 for(i=0; i<start2; i++)
269 ajStrAppendK(ms, b[i]);
270
271 if(start1)
272 {
273 ajFmtPrintAppS(&tmpstr, "WARNING: **************************"
274 "*******\n");
275 ajFmtPrintAppS(&tmpstr, "The region of alignment only starts at "
276 "position %d of sequence %s\n", start1+1, namea);
277 ajFmtPrintAppS(&tmpstr, "Only the sequence of %s is being used "
278 "before this point\n\n", nameb);
279 ajAlignSetTailApp(align, tmpstr);
280 ajStrDel(&tmpstr);
281 }
282 }
283 else if(start1 && start2)
284 {
285 /* both the same length and > 1 before the aligned region */
286 ajFmtPrintAppS(&tmpstr,
287 "WARNING: *********************************\n");
288 ajFmtPrintAppS(&tmpstr, "There is an equal amount of unaligned "
289 "sequence (%d) at the start of the sequences.\n",
290 start1);
291 ajFmtPrintAppS(&tmpstr, "Sequence %s is being arbitrarily chosen "
292 "for the merged sequence\n\n", namea);
293 ajAlignSetTailApp(align, tmpstr);
294 ajStrDel(&tmpstr);
295
296 for(i=0; i<start1; i++)
297 ajStrAppendK(ms, a[i]);
298 }
299
300 /* header */
301 ajFmtPrintS(&tmpstr, "Conflicts: %15.15s %15.15s\n", namea, nameb);
302 ajFmtPrintAppS(&tmpstr,
303 " position base position base Using\n");
304 ajAlignSetTailApp(align, tmpstr);
305
306 /* make the merged sequence
307 **
308 ** point to the start of the alignment in the complete unaligned
309 ** sequences
310 */
311 apos = start1;
312 bpos = start2;
313
314 for(i=0; i<olen; i++)
315 {
316 if(p[i]=='.' || p[i]==' ' || p[i]=='-' ||
317 q[i]=='.' || q[i]==' ' || q[i]=='-')
318 { /* gap! */
319 if(merger_bestquality(a, b, apos, bpos))
320 {
321 p[i] = toupper((ajint)p[i]);
322 if(p[i] != '.' && p[i] != ' ' && p[i] != '-')
323 ajStrAppendK(ms, p[i]);
324 ajFmtPrintS(&tmpstr,
325 " %8d '%c' %8d '%c' '%c'\n",
326 apos+1, p[i], bpos+1, q[i], p[i]);
327 ajAlignSetTailApp(align, tmpstr);
328 }
329 else
330 {
331 q[i] = toupper((ajint)q[i]);
332 if(q[i] != '.' && q[i] != ' ' && q[i] != '-')
333 ajStrAppendK(ms, q[i]);
334 ajFmtPrintS(&tmpstr,
335 " %8d '%c' %8d '%c' '%c'\n",
336 apos+1, p[i], bpos+1, q[i], q[i]);
337 ajAlignSetTailApp(align, tmpstr);
338 }
339
340 }
341 else if(p[i]=='n' || p[i]=='N')
342 {
343 q[i] = toupper((ajint)q[i]);
344 ajStrAppendK(ms, q[i]);
345 }
346 else if(q[i]=='n' || q[i]=='N')
347 {
348 p[i] = toupper((ajint)p[i]);
349 ajStrAppendK(ms, p[i]);
350 }
351 else if(p[i] != q[i])
352 {
353 /*
354 ** get the sequence with the best quality and use the base
355 ** from that one
356 */
357 if(merger_bestquality(a, b, apos, bpos))
358 {
359 p[i] = toupper((ajint)p[i]);
360 ajStrAppendK(ms, p[i]);
361 ajFmtPrintS(&tmpstr,
362 " %8d '%c' %8d '%c' '%c'\n",
363 apos+1, p[i], bpos+1, q[i], p[i]);
364 ajAlignSetTailApp(align, tmpstr);
365 }
366 else
367 {
368 q[i] = toupper((ajint)q[i]);
369 ajStrAppendK(ms, q[i]);
370 ajFmtPrintS(&tmpstr,
371 " %8d '%c' %8d '%c' '%c'\n",
372 apos+1, p[i], bpos+1, q[i], q[i]);
373 ajAlignSetTailApp(align, tmpstr);
374 }
375
376 }
377 else
378 ajStrAppendK(ms, p[i]);
379
380 /* update the positions in the unaligned complete sequences */
381 if(p[i] != '.' && p[i] != ' ' && p[i] != '-') apos++;
382 if(q[i] != '.' && q[i] != ' ' && q[i] != '-') bpos++;
383 }
384
385 /* output the right hand side */
386 tt = strlen(&a[apos]);
387 alen = (ajint) tt;
388
389 tt = strlen(&b[bpos]);
390 blen = (ajint) tt;
391
392 if(alen > blen)
393 {
394 ajStrAppendC(ms, &a[apos]);
395 if(blen)
396 {
397 ajFmtPrintAppS(&tmpstr, "WARNING: ***************************"
398 "******\n");
399 ajFmtPrintAppS(&tmpstr, "The region of alignment ends at "
400 "position %d of sequence %s\n",
401 bpos+1, nameb);
402 ajFmtPrintAppS(&tmpstr, "Only the sequence of %s is being used "
403 "after this point\n\n", namea);
404 ajAlignSetTailApp(align, tmpstr);
405 ajStrDel(&tmpstr);
406 }
407
408 }
409
410 if(blen > alen)
411 {
412 ajStrAppendC(ms, &b[bpos]);
413 if(alen)
414 {
415 ajFmtPrintAppS(&tmpstr, "WARNING: ***************************"
416 "******\n");
417 ajFmtPrintAppS(&tmpstr, "The region of alignment ends at "
418 "position %d of sequence %s\n",
419 apos+1, namea);
420 ajFmtPrintAppS(&tmpstr, "Only the sequence of %s is being used "
421 "after this point\n\n", nameb);
422 ajAlignSetTailApp(align, tmpstr);
423 ajStrDel(&tmpstr);
424 }
425 }
426 else if(alen && blen)
427 { /* both the same length and > 1 */
428 ajFmtPrintAppS(&tmpstr, "WARNING: ************************"
429 "*********\n");
430 ajFmtPrintAppS(&tmpstr, "There is an equal amount of unaligned "
431 "sequence (%d) at the end of the sequences.\n",
432 alen);
433 ajFmtPrintAppS(&tmpstr, "Sequence %s is being arbitrarily chosen "
434 "for the merged sequence\n\n", namea);
435 ajStrAppendC(ms, &a[apos]);
436 ajAlignSetTailApp(align, tmpstr);
437 ajStrDel(&tmpstr);
438 }
439
440 ajStrDel(&mm);
441 ajStrDel(&nn);
442 ajStrDel(&tmpstr);
443
444 return;
445 }
446
447
448
449
450 /* @funcstatic merger_bestquality *********************************************
451 **
452 ** Return ajTrue if the first sequence has the best quality
453 ** If both sequences have the same quality, pick the first
454 **
455 ** @param [r] a [const char*] First sequence
456 ** @param [r] b [const char*] Second sequence
457 ** @param [r] apos [ajint] Position in first sequence
458 ** @param [r] bpos [ajint] Position in second sequence
459 ** @return [AjBool] ajTrue = first sequence is the best quality at this point
460 **
461 ******************************************************************************/
462
merger_bestquality(const char * a,const char * b,ajint apos,ajint bpos)463 static AjBool merger_bestquality(const char * a, const char *b,
464 ajint apos, ajint bpos)
465 {
466 float qa;
467 float qb;
468
469 qa = merger_quality(a, apos, 5);
470 qb = merger_quality(b, bpos, 5);
471
472 if(qa == qb)
473 {
474 /* both have the same quality, use a larger window */
475 qa = merger_quality(a, apos, 10);
476 qb = merger_quality(b, bpos, 10);
477 }
478
479 if(qa == qb)
480 {
481 /* both have the same quality, use a larger window */
482 qa = merger_quality(a, apos, 20);
483 qb = merger_quality(b, bpos, 20);
484 }
485
486 ajDebug("merger_bestquality %d..%d = %.3f %.3f\n", apos, bpos, qa, qb);
487
488 if(qa >= qb)
489 /* both have the same quality, use the first sequence */
490 return ajTrue;
491
492 return ajFalse;
493 }
494
495
496
497
498 /* @funcstatic merger_quality *************************************************
499 **
500 ** Calculate the quality of a window of a sequence
501 **
502 ** quality = sequence value/length under window either side of a position
503 **
504 ** sequence value = sum of points in that subsequence
505 **
506 ** good bases = 2 points
507 **
508 ** ambiguous bases = 1 point
509 **
510 ** N's = 0 points
511 **
512 ** off end of the sequence = 0 points
513 **
514 ** THIS HEAVILY DISCRIMINATES AGAINST THE IFFY BITS AT THE END OF
515 ** SEQUENCE READS
516 **
517 ** @param [r] seq [const char*] Sequence
518 ** @param [r] pos [ajuint] Position
519 ** @param [r] window [ajuint] Window size
520 ** @return [float] quality of the window
521 **
522 ******************************************************************************/
523
merger_quality(const char * seq,ajuint pos,ajuint window)524 static float merger_quality(const char * seq, ajuint pos, ajuint window)
525 {
526 ajint value = 0;
527 ajuint i;
528 ajint j;
529 ajint jlast;
530 float tf;
531
532 for(i=pos; i<pos+window && i < strlen(seq); i++)
533 if(strchr("aAcCgGtTuU", seq[i]))
534 /* good bases count for two points */
535 value+=2;
536 else if(strchr("mrwsykvhdbMRWSYKVHDB", seq[i]))
537 /* ambiguous bases count for only one point */
538 value++;
539 jlast = pos-window;
540 for(j=pos-1; j>jlast && j>=0; j--)
541 if(strchr("aAcCgGtTuU", seq[j]))
542 /* good bases count for two points */
543 value+=2;
544 else if(strchr("mrwsykvhdbMRWSYKVHDB", seq[j]))
545 /* ambiguous bases count for only one point */
546 value++;
547
548 tf = (float) ((double)value/((double)window*2.+1.));
549
550 return tf;
551 }
552
553