1 /* @source megamerger application
2 **
3 ** Merge two large overlapping nucleic acid sequences
4 **
5 ** @author Copyright (C) Gary Williams (gwilliam@hgmp.mrc.ac.uk)
6 ** @@
7 **
8 ** This program is free software; you can redistribute it and/or
9 ** modify it under the terms of the GNU General Public License
10 ** as published by the Free Software Foundation; either version 2
11 ** of the License, or (at your option) any later version.
12 **
13 ** This program is distributed in the hope that it will be useful,
14 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 ** GNU General Public License for more details.
17 **
18 ** You should have received a copy of the GNU General Public License
19 ** along with this program; if not, write to the Free Software
20 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
21 ******************************************************************************/
22 
23 #include "emboss.h"
24 
25 
26 
27 
28 static void megamerger_Merge(const AjPList matchlist,
29 			     const AjPSeq seq1, const AjPSeq seq2,
30 			     AjPSeqout seqout, AjPFile outfile, AjBool prefer);
31 
32 
33 
34 
35 /* @prog megamerger ***********************************************************
36 **
37 ** Merge two large overlapping nucleic acid sequences
38 **
39 ******************************************************************************/
40 
main(int argc,char ** argv)41 int main(int argc, char **argv)
42 {
43 
44     AjPSeq seq1;
45     AjPSeq seq2;
46     AjPSeqout seqout;
47     ajint wordlen;
48     AjPTable seq1MatchTable = NULL;
49     AjPList matchlist = NULL;
50     AjPFile outfile;
51     AjBool prefer;
52 
53     embInit("megamerger", argc, argv);
54 
55     wordlen = ajAcdGetInt("wordsize");
56     seq1    = ajAcdGetSeq("asequence");
57     seq2    = ajAcdGetSeq("bsequence");
58     prefer  = ajAcdGetBoolean("prefer");
59     outfile = ajAcdGetOutfile("outfile");
60     seqout  = ajAcdGetSeqout("outseq");
61 
62     /* trim sequences to -sbegin and -send */
63     ajSeqTrim(seq1);
64     ajSeqTrim(seq2);
65 
66     embWordLength(wordlen);
67     if(embWordGetTable(&seq1MatchTable, seq1))
68 	/* get table of words */
69 	matchlist = embWordBuildMatchTable(seq1MatchTable, seq2, ajTrue);
70     else
71 	ajFatal("No match found\n");
72 
73 
74     /* get the minimal set of overlapping matches */
75        embWordMatchMin(matchlist);
76 
77     if(ajListGetLength(matchlist))
78     {
79 	/* make the output file */
80 	megamerger_Merge(matchlist, seq1, seq2, seqout, outfile, prefer);
81 
82 	/* tidy up */
83     }
84 
85     embWordMatchListDelete(&matchlist); /* free the match structures */
86     embWordFreeTable(&seq1MatchTable);
87 
88     ajSeqoutClose(seqout);
89     ajFileClose(&outfile);
90 
91     ajSeqDel(&seq1);
92     ajSeqDel(&seq2);
93     ajSeqoutDel(&seqout);
94 
95     embExit();
96 
97     return 0;
98 }
99 
100 
101 
102 
103 /* @funcstatic megamerger_Merge ***********************************************
104 **
105 ** Marge and write a report on the merge of the two sequences.
106 **
107 ** @param [r] matchlist [const AjPList] List of minimal non-overlapping matches
108 ** @param [r] seq1 [const AjPSeq] Sequence to be merged.
109 ** @param [r] seq2 [const AjPSeq] Sequence to be merged.
110 ** @param [w] seqout [AjPSeqout] Output merged sequence
111 ** @param [u] outfile [AjPFile] Output file containing report.
112 ** @param [r] prefer [AjBool] If TRUE, use the first sequence when there
113 **                            is a mismatch
114 ** @return [void]
115 ** @@
116 ******************************************************************************/
117 
megamerger_Merge(const AjPList matchlist,const AjPSeq seq1,const AjPSeq seq2,AjPSeqout seqout,AjPFile outfile,AjBool prefer)118 static void megamerger_Merge(const AjPList matchlist,
119 			     const AjPSeq seq1,const  AjPSeq seq2,
120 			     AjPSeqout seqout, AjPFile outfile, AjBool prefer)
121 {
122     AjIList iter = NULL;       		/* match list iterator */
123     EmbPWordMatch p = NULL;  		/* match structure */
124     ajint count = 0;			/* count of matches */
125     AjPStr seqstr;			/* merged sequence string */
126     AjPStr s1;			/* string of seq1 */
127     AjPStr s2;			/* string of seq2 */
128     ajuint prev1end = 0;
129     ajuint prev2end = 0;       /* end positions (+1) of previous match */
130     ajuint mid1;
131     ajuint mid2;			/* middle of a mismatch region */
132     AjPStr tmp;			/* holds sequence string while uppercasing */
133     AjPSeq seq = NULL;
134 
135     tmp    = ajStrNew();
136     seqstr = ajStrNew();
137     s1     = ajSeqGetSeqCopyS(seq1);
138     s2     = ajSeqGetSeqCopyS(seq2);
139 
140     /* change the sequences to lowercase to highlight problem areas */
141     ajStrFmtLower(&s1);
142     ajStrFmtLower(&s2);
143 
144     /* title line */
145     ajFmtPrintF(outfile, "# Report of megamerger of: %s and %s\n\n",
146 		ajSeqGetNameC(seq1), ajSeqGetNameC(seq2));
147 
148     iter = ajListIterNewread(matchlist);
149     while(!ajListIterDone(iter))
150     {
151 	p = (EmbPWordMatch) ajListIterGet(iter);
152 	/* first match? */
153 	if(!count++)
154 	{
155 	    ajFmtPrintF(outfile, "%s overlap starts at %d\n",
156 			ajSeqGetNameC(seq1), p->seq1start+1);
157 	    ajFmtPrintF(outfile, "%s overlap starts at %d\n\n",
158 			ajSeqGetNameC(seq2), p->seq2start+1);
159 
160 	    /* get initial sequence before the overlapping region */
161 	    if(p->seq1start == 0)
162 	    {
163 		/*
164 		**  ignore the initial bit if both sequences match from
165 		**  the start
166 		*/
167 		if(p->seq2start > 0)
168 		{
169 		    ajFmtPrintF(outfile, "Using %s 1-%d as the "
170 				"initial sequence\n\n",
171 				ajSeqGetNameC(seq2), p->seq2start);
172 		    ajStrAssignSubS(&seqstr, s2, 0, p->seq2start-1);
173 		}
174 
175 	    }
176 	    else if(p->seq2start == 0)
177 	    {
178 		ajFmtPrintF(outfile, "Using %s 1-%d as the initial "
179 			    "sequence\n\n", ajSeqGetNameC(seq1),
180 			    p->seq1start);
181 		ajStrAssignSubS(&seqstr, s1, 0, p->seq1start-1);
182 
183 	    }
184 	    else
185 	    {
186 		ajFmtPrintF(outfile, "WARNING!\n");
187 		ajFmtPrintF(outfile, "Neither sequence's overlap is at "
188 			    "the start of the sequence\n");
189 		if(p->seq1start > p->seq2start)
190 		{
191 		    ajFmtPrintF(outfile, "Using %s 1-%d as the "
192 				"initial sequence\n\n",
193 				ajSeqGetNameC(seq1), p->seq1start);
194 		    ajStrAssignSubS(&tmp, s1, 0, p->seq1start-1);
195 		    ajStrFmtUpper(&tmp);
196 		    ajStrAssignS(&seqstr, tmp);
197 
198 		}
199 		else
200 		{
201 		    ajFmtPrintF(outfile, "Using %s 1-%d as the initial "
202 				"sequence\n\n", ajSeqGetNameC(seq2),
203 				p->seq2start);
204 		    ajStrAssignSubS(&tmp, s2, 0, p->seq2start-1);
205 		    ajStrFmtUpper(&tmp);
206 		    ajStrAssignS(&seqstr, tmp);
207 
208 		}
209 	    }
210 	}
211 	else
212 	{
213 	    /*
214 	    **  output the intervening mismatch between the previous
215 	    **  matching region
216 	    */
217 	    ajFmtPrintF(outfile, "\nWARNING!\nMismatch region found:\n");
218 	    if(prev1end<p->seq1start)
219 		ajFmtPrintF(outfile, "Mismatch %s %d-%d\n",
220 			    ajSeqGetNameC(seq1), prev1end+1,
221 			    p->seq1start);
222 	    else
223 		ajFmtPrintF(outfile, "Mismatch %s %d\n",
224 			    ajSeqGetNameC(seq1), prev1end);
225 
226 	    if(prev2end<p->seq2start)
227 		ajFmtPrintF(outfile, "Mismatch %s %d-%d\n",
228 			    ajSeqGetNameC(seq2), prev2end+1, p->seq2start);
229 	    else
230 		ajFmtPrintF(outfile, "Mismatch %s %d\n",
231 			    ajSeqGetNameC(seq2), prev2end);
232 
233             if(prefer)
234 	    {
235                 /* use sequence 1 as the 'correct' one */
236 	        ajStrAssignSubS(&tmp, s1, prev1end, p->seq1start-1);
237 	        ajStrFmtUpper(&tmp);
238 	        ajStrAppendS(&seqstr, tmp);
239 
240             }
241 	    else
242 	    {
243                 /*
244 		** use the sequence where the mismatch is furthest
245 		** from the end as the 'correct' one
246 		*/
247 	        mid1 = (prev1end + p->seq1start-1)/2;
248 	        mid2 = (prev2end + p->seq2start-1)/2;
249 	        /* is the mismatch closer to the ends of seq1 or seq2? */
250 	        if(AJMIN(mid1, ajSeqGetLen(seq1)-mid1-1) <
251 		    AJMIN(mid2, ajSeqGetLen(seq2)-mid2-1))
252 	        {
253 		    ajFmtPrintF(outfile, "Mismatch is closer to the ends "
254 				"of %s, so use %s in the merged "
255 				"sequence\n\n", ajSeqGetNameC(seq1),
256 				ajSeqGetNameC(seq2));
257 		    if(prev2end < p->seq2start)
258 		    {
259 		        ajStrAssignSubS(&tmp, s2, prev2end, p->seq2start-1);
260 		        ajStrFmtUpper(&tmp);
261 		        ajStrAppendS(&seqstr, tmp);
262 
263 		    }
264 	        }
265 	        else
266 	        {
267 		    ajFmtPrintF(outfile,
268 				"Mismatch is closer to the ends of %s, "
269 				"so use %s in the merged sequence\n\n",
270 				ajSeqGetNameC(seq2), ajSeqGetNameC(seq1));
271 		    if(prev1end < p->seq1start)
272 		    {
273 		        ajStrAssignSubS(&tmp, s1, prev1end, p->seq1start-1);
274 		        ajStrFmtUpper(&tmp);
275 		        ajStrAppendS(&seqstr, tmp);
276 		    }
277 		}
278 	    }
279 	}
280 
281 	/* output the match */
282 	ajFmtPrintF(outfile, "Matching region %s %d-%d : %s %d-%d\n",
283 		    ajSeqGetNameC(seq1), p->seq1start+1,
284 		    p->seq1start + p->length, ajSeqGetNameC(seq2),
285 		    p->seq2start+1, p->seq2start + p->length);
286 	ajFmtPrintF(outfile, "Length of match: %d\n", p->length);
287 	ajStrAppendSubS(&seqstr, s1, p->seq1start, p->seq1start + p->length-1);
288 
289 	/*
290 	** note the end positions (+1) to get the intervening region
291 	** between matches
292 	*/
293 	prev1end = p->seq1start + p->length;
294 	prev2end = p->seq2start + p->length;
295     }
296 
297     /* end of overlapping region */
298     ajFmtPrintF(outfile, "\n%s overlap ends at %d\n", ajSeqGetNameC(seq1),
299 		p->seq1start+p->length);
300     ajFmtPrintF(outfile, "%s overlap ends at %d\n\n", ajSeqGetNameC(seq2),
301 		p->seq2start+p->length);
302 
303     /* is seq1 only longer than the matched regions? */
304     if(prev2end >= ajSeqGetLen(seq2) &&
305        prev1end < ajSeqGetLen(seq1))
306     {
307 	ajFmtPrintF(outfile, "Using %s %d-%d as the final "
308 		    "sequence\n\n", ajSeqGetNameC(seq1), prev1end+1,
309 		    ajSeqGetLen(seq1));
310 	ajStrAppendSubS(&seqstr, s1, prev1end, ajSeqGetLen(seq1)-1);
311 
312 	/* is seq2 only longer than the matched regions? */
313     }
314     else if(prev1end >= ajSeqGetLen(seq1) && prev2end < ajSeqGetLen(seq2))
315     {
316 	ajFmtPrintF(outfile, "Using %s %d-%d as the final "
317 		    "sequence\n\n", ajSeqGetNameC(seq2), prev2end+1,
318 		    ajSeqGetLen(seq2));
319 	ajStrAppendSubS(&seqstr, s2, prev2end, ajSeqGetLen(seq2)-1);
320 
321 	/* both are longer! */
322     }
323     else if(prev1end < ajSeqGetLen(seq1) && prev2end < ajSeqGetLen(seq2))
324     {
325 	ajFmtPrintF(outfile, "WARNING!\n");
326 	ajFmtPrintF(outfile, "Neither sequence's overlap is at the "
327 		    "end of the sequence\n");
328 	if(ajSeqGetLen(seq1)-prev1end > ajSeqGetLen(seq2)-prev2end)
329 	{
330 	    ajFmtPrintF(outfile, "Using %s %d-%d as the final "
331 			"sequence\n\n", ajSeqGetNameC(seq1), prev1end+1,
332 			ajSeqGetLen(seq1));
333 	    ajStrAssignSubS(&tmp, s1, prev1end, ajSeqGetLen(seq1)-1);
334 	    ajStrFmtUpper(&tmp);
335 	    ajStrAppendS(&seqstr, tmp);
336 	}
337 	else
338 	{
339 	    ajFmtPrintF(outfile, "Using %s %d-%d as the final "
340 			       "sequence\n\n", ajSeqGetNameC(seq2), prev2end+1,
341 			       ajSeqGetLen(seq2));
342 	    ajStrAssignSubS(&tmp, s2, prev2end, ajSeqGetLen(seq2)-1);
343 	    ajStrFmtUpper(&tmp);
344 	    ajStrAppendS(&seqstr, tmp);
345 
346 	}
347     }
348 
349     /* write out sequence at end */
350     seq = ajSeqNewSeq(seq1);
351     ajSeqAssignSeqS(seq, seqstr);
352     ajSeqoutWriteSeq(seqout, seq);
353 
354     ajSeqDel(&seq);
355     ajStrDel(&s1);
356     ajStrDel(&s2);
357     ajStrDel(&tmp);
358     ajStrDel(&seqstr);
359     ajListIterDel(&iter);
360 
361     return;
362 }
363