1 /* @source megamerger application
2 **
3 ** Merge two large overlapping nucleic acid sequences
4 **
5 ** @author Copyright (C) Gary Williams (gwilliam@hgmp.mrc.ac.uk)
6 ** @@
7 **
8 ** This program is free software; you can redistribute it and/or
9 ** modify it under the terms of the GNU General Public License
10 ** as published by the Free Software Foundation; either version 2
11 ** of the License, or (at your option) any later version.
12 **
13 ** This program is distributed in the hope that it will be useful,
14 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 ** GNU General Public License for more details.
17 **
18 ** You should have received a copy of the GNU General Public License
19 ** along with this program; if not, write to the Free Software
20 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
21 ******************************************************************************/
22
23 #include "emboss.h"
24
25
26
27
28 static void megamerger_Merge(const AjPList matchlist,
29 const AjPSeq seq1, const AjPSeq seq2,
30 AjPSeqout seqout, AjPFile outfile, AjBool prefer);
31
32
33
34
35 /* @prog megamerger ***********************************************************
36 **
37 ** Merge two large overlapping nucleic acid sequences
38 **
39 ******************************************************************************/
40
main(int argc,char ** argv)41 int main(int argc, char **argv)
42 {
43
44 AjPSeq seq1;
45 AjPSeq seq2;
46 AjPSeqout seqout;
47 ajint wordlen;
48 AjPTable seq1MatchTable = NULL;
49 AjPList matchlist = NULL;
50 AjPFile outfile;
51 AjBool prefer;
52
53 embInit("megamerger", argc, argv);
54
55 wordlen = ajAcdGetInt("wordsize");
56 seq1 = ajAcdGetSeq("asequence");
57 seq2 = ajAcdGetSeq("bsequence");
58 prefer = ajAcdGetBoolean("prefer");
59 outfile = ajAcdGetOutfile("outfile");
60 seqout = ajAcdGetSeqout("outseq");
61
62 /* trim sequences to -sbegin and -send */
63 ajSeqTrim(seq1);
64 ajSeqTrim(seq2);
65
66 embWordLength(wordlen);
67 if(embWordGetTable(&seq1MatchTable, seq1))
68 /* get table of words */
69 matchlist = embWordBuildMatchTable(seq1MatchTable, seq2, ajTrue);
70 else
71 ajFatal("No match found\n");
72
73
74 /* get the minimal set of overlapping matches */
75 embWordMatchMin(matchlist);
76
77 if(ajListGetLength(matchlist))
78 {
79 /* make the output file */
80 megamerger_Merge(matchlist, seq1, seq2, seqout, outfile, prefer);
81
82 /* tidy up */
83 }
84
85 embWordMatchListDelete(&matchlist); /* free the match structures */
86 embWordFreeTable(&seq1MatchTable);
87
88 ajSeqoutClose(seqout);
89 ajFileClose(&outfile);
90
91 ajSeqDel(&seq1);
92 ajSeqDel(&seq2);
93 ajSeqoutDel(&seqout);
94
95 embExit();
96
97 return 0;
98 }
99
100
101
102
103 /* @funcstatic megamerger_Merge ***********************************************
104 **
105 ** Marge and write a report on the merge of the two sequences.
106 **
107 ** @param [r] matchlist [const AjPList] List of minimal non-overlapping matches
108 ** @param [r] seq1 [const AjPSeq] Sequence to be merged.
109 ** @param [r] seq2 [const AjPSeq] Sequence to be merged.
110 ** @param [w] seqout [AjPSeqout] Output merged sequence
111 ** @param [u] outfile [AjPFile] Output file containing report.
112 ** @param [r] prefer [AjBool] If TRUE, use the first sequence when there
113 ** is a mismatch
114 ** @return [void]
115 ** @@
116 ******************************************************************************/
117
megamerger_Merge(const AjPList matchlist,const AjPSeq seq1,const AjPSeq seq2,AjPSeqout seqout,AjPFile outfile,AjBool prefer)118 static void megamerger_Merge(const AjPList matchlist,
119 const AjPSeq seq1,const AjPSeq seq2,
120 AjPSeqout seqout, AjPFile outfile, AjBool prefer)
121 {
122 AjIList iter = NULL; /* match list iterator */
123 EmbPWordMatch p = NULL; /* match structure */
124 ajint count = 0; /* count of matches */
125 AjPStr seqstr; /* merged sequence string */
126 AjPStr s1; /* string of seq1 */
127 AjPStr s2; /* string of seq2 */
128 ajuint prev1end = 0;
129 ajuint prev2end = 0; /* end positions (+1) of previous match */
130 ajuint mid1;
131 ajuint mid2; /* middle of a mismatch region */
132 AjPStr tmp; /* holds sequence string while uppercasing */
133 AjPSeq seq = NULL;
134
135 tmp = ajStrNew();
136 seqstr = ajStrNew();
137 s1 = ajSeqGetSeqCopyS(seq1);
138 s2 = ajSeqGetSeqCopyS(seq2);
139
140 /* change the sequences to lowercase to highlight problem areas */
141 ajStrFmtLower(&s1);
142 ajStrFmtLower(&s2);
143
144 /* title line */
145 ajFmtPrintF(outfile, "# Report of megamerger of: %s and %s\n\n",
146 ajSeqGetNameC(seq1), ajSeqGetNameC(seq2));
147
148 iter = ajListIterNewread(matchlist);
149 while(!ajListIterDone(iter))
150 {
151 p = (EmbPWordMatch) ajListIterGet(iter);
152 /* first match? */
153 if(!count++)
154 {
155 ajFmtPrintF(outfile, "%s overlap starts at %d\n",
156 ajSeqGetNameC(seq1), p->seq1start+1);
157 ajFmtPrintF(outfile, "%s overlap starts at %d\n\n",
158 ajSeqGetNameC(seq2), p->seq2start+1);
159
160 /* get initial sequence before the overlapping region */
161 if(p->seq1start == 0)
162 {
163 /*
164 ** ignore the initial bit if both sequences match from
165 ** the start
166 */
167 if(p->seq2start > 0)
168 {
169 ajFmtPrintF(outfile, "Using %s 1-%d as the "
170 "initial sequence\n\n",
171 ajSeqGetNameC(seq2), p->seq2start);
172 ajStrAssignSubS(&seqstr, s2, 0, p->seq2start-1);
173 }
174
175 }
176 else if(p->seq2start == 0)
177 {
178 ajFmtPrintF(outfile, "Using %s 1-%d as the initial "
179 "sequence\n\n", ajSeqGetNameC(seq1),
180 p->seq1start);
181 ajStrAssignSubS(&seqstr, s1, 0, p->seq1start-1);
182
183 }
184 else
185 {
186 ajFmtPrintF(outfile, "WARNING!\n");
187 ajFmtPrintF(outfile, "Neither sequence's overlap is at "
188 "the start of the sequence\n");
189 if(p->seq1start > p->seq2start)
190 {
191 ajFmtPrintF(outfile, "Using %s 1-%d as the "
192 "initial sequence\n\n",
193 ajSeqGetNameC(seq1), p->seq1start);
194 ajStrAssignSubS(&tmp, s1, 0, p->seq1start-1);
195 ajStrFmtUpper(&tmp);
196 ajStrAssignS(&seqstr, tmp);
197
198 }
199 else
200 {
201 ajFmtPrintF(outfile, "Using %s 1-%d as the initial "
202 "sequence\n\n", ajSeqGetNameC(seq2),
203 p->seq2start);
204 ajStrAssignSubS(&tmp, s2, 0, p->seq2start-1);
205 ajStrFmtUpper(&tmp);
206 ajStrAssignS(&seqstr, tmp);
207
208 }
209 }
210 }
211 else
212 {
213 /*
214 ** output the intervening mismatch between the previous
215 ** matching region
216 */
217 ajFmtPrintF(outfile, "\nWARNING!\nMismatch region found:\n");
218 if(prev1end<p->seq1start)
219 ajFmtPrintF(outfile, "Mismatch %s %d-%d\n",
220 ajSeqGetNameC(seq1), prev1end+1,
221 p->seq1start);
222 else
223 ajFmtPrintF(outfile, "Mismatch %s %d\n",
224 ajSeqGetNameC(seq1), prev1end);
225
226 if(prev2end<p->seq2start)
227 ajFmtPrintF(outfile, "Mismatch %s %d-%d\n",
228 ajSeqGetNameC(seq2), prev2end+1, p->seq2start);
229 else
230 ajFmtPrintF(outfile, "Mismatch %s %d\n",
231 ajSeqGetNameC(seq2), prev2end);
232
233 if(prefer)
234 {
235 /* use sequence 1 as the 'correct' one */
236 ajStrAssignSubS(&tmp, s1, prev1end, p->seq1start-1);
237 ajStrFmtUpper(&tmp);
238 ajStrAppendS(&seqstr, tmp);
239
240 }
241 else
242 {
243 /*
244 ** use the sequence where the mismatch is furthest
245 ** from the end as the 'correct' one
246 */
247 mid1 = (prev1end + p->seq1start-1)/2;
248 mid2 = (prev2end + p->seq2start-1)/2;
249 /* is the mismatch closer to the ends of seq1 or seq2? */
250 if(AJMIN(mid1, ajSeqGetLen(seq1)-mid1-1) <
251 AJMIN(mid2, ajSeqGetLen(seq2)-mid2-1))
252 {
253 ajFmtPrintF(outfile, "Mismatch is closer to the ends "
254 "of %s, so use %s in the merged "
255 "sequence\n\n", ajSeqGetNameC(seq1),
256 ajSeqGetNameC(seq2));
257 if(prev2end < p->seq2start)
258 {
259 ajStrAssignSubS(&tmp, s2, prev2end, p->seq2start-1);
260 ajStrFmtUpper(&tmp);
261 ajStrAppendS(&seqstr, tmp);
262
263 }
264 }
265 else
266 {
267 ajFmtPrintF(outfile,
268 "Mismatch is closer to the ends of %s, "
269 "so use %s in the merged sequence\n\n",
270 ajSeqGetNameC(seq2), ajSeqGetNameC(seq1));
271 if(prev1end < p->seq1start)
272 {
273 ajStrAssignSubS(&tmp, s1, prev1end, p->seq1start-1);
274 ajStrFmtUpper(&tmp);
275 ajStrAppendS(&seqstr, tmp);
276 }
277 }
278 }
279 }
280
281 /* output the match */
282 ajFmtPrintF(outfile, "Matching region %s %d-%d : %s %d-%d\n",
283 ajSeqGetNameC(seq1), p->seq1start+1,
284 p->seq1start + p->length, ajSeqGetNameC(seq2),
285 p->seq2start+1, p->seq2start + p->length);
286 ajFmtPrintF(outfile, "Length of match: %d\n", p->length);
287 ajStrAppendSubS(&seqstr, s1, p->seq1start, p->seq1start + p->length-1);
288
289 /*
290 ** note the end positions (+1) to get the intervening region
291 ** between matches
292 */
293 prev1end = p->seq1start + p->length;
294 prev2end = p->seq2start + p->length;
295 }
296
297 /* end of overlapping region */
298 ajFmtPrintF(outfile, "\n%s overlap ends at %d\n", ajSeqGetNameC(seq1),
299 p->seq1start+p->length);
300 ajFmtPrintF(outfile, "%s overlap ends at %d\n\n", ajSeqGetNameC(seq2),
301 p->seq2start+p->length);
302
303 /* is seq1 only longer than the matched regions? */
304 if(prev2end >= ajSeqGetLen(seq2) &&
305 prev1end < ajSeqGetLen(seq1))
306 {
307 ajFmtPrintF(outfile, "Using %s %d-%d as the final "
308 "sequence\n\n", ajSeqGetNameC(seq1), prev1end+1,
309 ajSeqGetLen(seq1));
310 ajStrAppendSubS(&seqstr, s1, prev1end, ajSeqGetLen(seq1)-1);
311
312 /* is seq2 only longer than the matched regions? */
313 }
314 else if(prev1end >= ajSeqGetLen(seq1) && prev2end < ajSeqGetLen(seq2))
315 {
316 ajFmtPrintF(outfile, "Using %s %d-%d as the final "
317 "sequence\n\n", ajSeqGetNameC(seq2), prev2end+1,
318 ajSeqGetLen(seq2));
319 ajStrAppendSubS(&seqstr, s2, prev2end, ajSeqGetLen(seq2)-1);
320
321 /* both are longer! */
322 }
323 else if(prev1end < ajSeqGetLen(seq1) && prev2end < ajSeqGetLen(seq2))
324 {
325 ajFmtPrintF(outfile, "WARNING!\n");
326 ajFmtPrintF(outfile, "Neither sequence's overlap is at the "
327 "end of the sequence\n");
328 if(ajSeqGetLen(seq1)-prev1end > ajSeqGetLen(seq2)-prev2end)
329 {
330 ajFmtPrintF(outfile, "Using %s %d-%d as the final "
331 "sequence\n\n", ajSeqGetNameC(seq1), prev1end+1,
332 ajSeqGetLen(seq1));
333 ajStrAssignSubS(&tmp, s1, prev1end, ajSeqGetLen(seq1)-1);
334 ajStrFmtUpper(&tmp);
335 ajStrAppendS(&seqstr, tmp);
336 }
337 else
338 {
339 ajFmtPrintF(outfile, "Using %s %d-%d as the final "
340 "sequence\n\n", ajSeqGetNameC(seq2), prev2end+1,
341 ajSeqGetLen(seq2));
342 ajStrAssignSubS(&tmp, s2, prev2end, ajSeqGetLen(seq2)-1);
343 ajStrFmtUpper(&tmp);
344 ajStrAppendS(&seqstr, tmp);
345
346 }
347 }
348
349 /* write out sequence at end */
350 seq = ajSeqNewSeq(seq1);
351 ajSeqAssignSeqS(seq, seqstr);
352 ajSeqoutWriteSeq(seqout, seq);
353
354 ajSeqDel(&seq);
355 ajStrDel(&s1);
356 ajStrDel(&s2);
357 ajStrDel(&tmp);
358 ajStrDel(&seqstr);
359 ajListIterDel(&iter);
360
361 return;
362 }
363