1 /* @source est2genome application
2 **
3 ** Richard Mott's est_genome ported into EMBOSS.
4 ** See also nucleus/embest.c
5 **
6 ** @author Copyright (C) Peter Rice, Sanger Centre
7 ** @author Copyright (C) Richard Mott, Sanger Centre
8 ** @@
9 **
10 ** This program is free software; you can redistribute it and/or
11 ** modify it under the terms of the GNU General Public License
12 ** as published by the Free Software Foundation; either version 2
13 ** of the License, or (at your option) any later version.
14 **
15 ** This program is distributed in the hope that it will be useful,
16 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18 ** GNU General Public License for more details.
19 **
20 ** You should have received a copy of the GNU General Public License
21 ** along with this program; if not, write to the Free Software
22 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
23 ******************************************************************************/
24 
25 /*
26 ** Revision 1.7 1997/03/17   15:26:00  pmr
27 ** when EST is reversed, need to reverse the EST sequence positions
28 ** also cleaned up long code lines
29 **
30 ** Revision 1.6  1997/02/10  14:07:54  rmott
31 ** fixed bug so that splice sites in REVERSE direction
32 ** (ie ct/ac rather than gt/ag) are found correctly.
33 ** Output modified so that splice direction is written
34 **
35 ** Revision 1.4  1997/01/30  17:21:22  rmott
36 ** fixed bug, and now computes area limit better
37 **
38 ** Revision 1.3  1997/01/30  17:03:45  rmott
39 ** debugged version with debug statements still in.
40 ** Fixed problem of not initialising edge properly
41 **
42 ** Revision 1.2  1996/08/07  10:12:05  rmott
43 **
44 ** Linear-Space version
45 **
46 ** Revision 1.1  1996/08/01  13:55:42  rmott
47 ** Initial revision
48 **
49 */
50 
51 #include "emboss.h"
52 #include <math.h>
53 
54 
55 
56 
57 extern ajint lsimmat[256][256];
58 
59 #define BOTH 0
60 #define FORWARD_ONLY 1
61 #define REVERSE_ONLY 2
62 
63 ajint verbose;
64 ajint debug;
65 
66 
67 
68 
69 static void  est2genome_make_output(AjPFile ofile,
70 				    const AjPSeq genome, const AjPSeq est,
71 				    const EmbPEstAlign ge, ajint gap_penalty,
72 				    ajint intron_penalty,
73 				    ajint splice_penalty, ajint minscore,
74 				    ajint align, ajint width, ajint reverse);
75 
76 
77 
78 
79 /* @prog est2genome ***********************************************************
80 **
81 ** Align EST and genomic DNA sequences
82 **
83 ******************************************************************************/
84 
85 
main(int argc,char ** argv)86 int main(int argc, char **argv)
87 {
88     AjPSeq genome = NULL;
89     AjPSeq splice_sites = NULL;
90     AjPSeq reversed_splice_sites = NULL;
91     AjPSeq est = NULL;
92     AjPSeq reversed_est = NULL;
93     EmbPEstAlign fge = NULL;
94     EmbPEstAlign rge = NULL;
95     EmbPEstAlign bge = NULL;
96     ajint width = 50;
97     ajint match = 1;
98     ajint mismatch = 1;
99     ajint gap_penalty = 2;
100     ajint intron_penalty = 40;
101     ajint splice_penalty = 20;
102     ajint splice = 1;
103     ajint align = 0;
104     ajint reverse = 0;
105     ajint isreverse = 0;
106     ajint doreverse = 0; /* zero for first inclusion, set to 1 later */
107     float megabytes = 10.0;
108     ajint minscore = 30;
109     ajint shuffles = 0;
110     ajint max_score = 0;
111     ajint seed;
112     ajint best=1;
113     ajint search_mode;
114     AjPStr modestr   = NULL;
115     AjPFile outfile  = NULL;
116     AjPSeqall estset = NULL;
117 
118     /* the fasta input files */
119 
120     embInit("est2genome", argc, argv);
121 
122     estset  = ajAcdGetSeqall("estsequence");
123     genome  = ajAcdGetSeq("genomesequence");
124     outfile = ajAcdGetOutfile("outfile");
125 
126     /* the alignment penalties */
127 
128     match          = ajAcdGetInt("match");
129     mismatch       = ajAcdGetInt("mismatch");
130     gap_penalty    = ajAcdGetInt("gappenalty");
131     intron_penalty = ajAcdGetInt("intronpenalty");
132     splice_penalty = ajAcdGetInt("splicepenalty");
133     doreverse      = ajAcdGetBoolean("reverse");
134 
135     /* the min score for an alignment to be output */
136     minscore = ajAcdGetInt("minscore");
137 
138     if(doreverse)
139 	isreverse = 1;
140 
141     splice = ajAcdGetBoolean("usesplice");
142 
143     /* Print the alignment */
144     align = ajAcdGetBoolean("align");
145     width = ajAcdGetInt("width");
146 
147     /* mode: This is complicated.
148     **
149     ** "forward"   just search forward strands of both sequences
150     ** "reverse"   just search forward of genomic vs reverse of est
151     ** "both"      search forward strand of genomic against forward and
152     ** reverse
153     ** THEN: take the best of these two, and re-align assuming
154     ** a reversed gene so that the splice sites would be appear
155     ** as ct/ac. Only output the best alignment unless the
156     ** flag -nobest is set.
157     **
158     ** Thus THREE alignments are made.
159     **
160     ** The output cordinates are such that the genomic sequence
161     ** is always in the forward direction.
162     */
163 
164     modestr = ajAcdGetListSingle("mode");
165 
166     if(ajStrMatchC(modestr,"both"))
167 	search_mode = BOTH;
168     else if(ajStrMatchC(modestr,"forward"))
169 	search_mode = FORWARD_ONLY;
170     else if(ajStrMatchC(modestr,"reverse"))
171 	search_mode = REVERSE_ONLY;
172     else
173     {
174 	ajErr("search mode %S must be one of: "
175 	      "both, forward, reverse\n", modestr);
176 	exit(1);
177     }
178 
179     /* just print the best alignment ? */
180 
181     best = ajAcdGetBoolean("best");
182 
183     /* max space in megabytes */
184 
185     megabytes = ajAcdGetFloat("space");
186 
187     /* print debugging info */
188 
189     verbose = ajAcdGetBoolean("verbose");
190     debug = ajAcdGetBoolean("debug");
191 
192     if(verbose)
193 	ajDebug("debugging set to %d\n", debug);
194 
195     if(verbose)
196 	embEstSetVerbose();
197     if(debug)
198 	embEstSetDebug();
199 
200     /*
201     ** shuffle the sequences to test for statistical
202     ** significance this many times
203     */
204 
205     shuffles = ajAcdGetInt("shuffle");
206     seed = ajAcdGetInt("seed");
207 
208     if(!seed)
209 	seed = embEstGetSeed();
210     seed = -seed;
211 
212     if(mismatch < 0)
213 	mismatch = -mismatch;
214     if(gap_penalty < 0)
215 	gap_penalty = -gap_penalty;
216     if(intron_penalty < 0)
217 	intron_penalty = -intron_penalty;
218     if(splice_penalty < 0)
219 	splice_penalty = -splice_penalty;
220 
221     embEstMatInit(match, mismatch, gap_penalty, 0, '-');
222 
223     ajSeqTrim(genome);
224 
225     /* Make sure theres enough space to hold the genomic AjPSeq */
226 
227     if(megabytes < ajSeqGetLen(genome)*1.5e-6)
228     {
229 	ajWarn("increasing space from %.3f to %.3f Mb\n",
230 	       megabytes, 1.5e-6*ajSeqGetLen(genome));
231 	megabytes = (float)1.5e-6*ajSeqGetLen(genome);
232     }
233 
234     /* find the GT/AG splice sites */
235 
236     if(splice)
237 	splice_sites = embEstFindSpliceSites(genome, 1);
238     else
239 	splice_sites = NULL;
240 
241     if(search_mode == BOTH && splice)
242 	reversed_splice_sites = embEstFindSpliceSites( genome, 0 );
243     else
244 	reversed_splice_sites = NULL;
245 
246     /* process each est */
247 
248     while(ajSeqallNext(estset, &est))
249     {
250 	/*
251 	 ** if required, make shuffled comparisons
252 	 ** to get statistical significance
253 	 */
254 	ajSeqTrim(est);
255 
256 	ajDebug("shuffles: %d\n", shuffles);
257 	if(shuffles > 0)
258 	{
259 	    AjPSeq shuffled_est;
260 	    ajint n;
261 	    ajint score;
262 	    double mean = 0;
263 	    double std  = 0;
264 	    EmbPEstAlign sge;
265 
266 	    shuffled_est = ajSeqNewSeq(est);
267 
268 	    for(n=0;n<shuffles;n++)
269 	    {
270 		embEstShuffleSeq(shuffled_est, 1, &seed);
271 		sge = embEstAlignNonRecursive(shuffled_est,
272 					      genome,
273 					      gap_penalty,
274 					      intron_penalty,
275 					      splice_penalty,
276 					      splice_sites, 0, 0,
277 					      DIAGONAL);
278 		score = sge->score;
279 		ajDebug("%30.30S\n", ajSeqGetSeqS(shuffled_est));
280 		ajDebug("%5d score %d seed %d\n", n, score, seed);
281 		if(score > max_score)
282 		    max_score = score;
283 		mean += score;
284 		std += score*score;
285 		embEstFreeAlign(&sge);
286 	    }
287 
288 	    mean /= shuffles;
289 	    std = sqrt((std = shuffles*mean*mean)/(shuffles-1.0));
290 	    ajDebug("shuffles: %d max: %d mean: %.2f std dev: %.2f\n",
291 		    shuffles, max_score, mean, std);
292 	    minscore = max_score+1;
293 	    ajSeqDel(&shuffled_est);
294 	}
295 
296 	if(search_mode != REVERSE_ONLY)
297 	{
298 	    /* forward strand */
299 	    fge = embEstAlignLinearSpace(est, genome, match,
300 					 mismatch, gap_penalty,
301 					 intron_penalty, splice_penalty,
302 					 splice_sites, megabytes);
303 	    if(!fge)
304 		ajFatal("forward strand alignment failed");
305 	}
306 	else
307 	    fge = NULL;
308 
309 	if(search_mode != FORWARD_ONLY) /* reverse strand */
310 	{
311 	    reversed_est = ajSeqNewSeq(est);
312 	    ajSeqReverseForce(reversed_est);
313 
314 	    rge = embEstAlignLinearSpace(reversed_est, genome,
315 					 match, mismatch, gap_penalty,
316 					 intron_penalty, splice_penalty,
317 					 splice_sites, megabytes);
318 	    if(!rge)
319 		ajFatal("reverse strand alignment failed");
320 	}
321 	else
322 	    rge = NULL;
323 
324 	if(search_mode == BOTH)	/* search both strands */
325 	{
326 	    if(fge->score > rge->score)
327 	    {			/* redo forward search with
328 				   reversed splice sites */
329 		bge = embEstAlignLinearSpace(est, genome, match,
330 					     mismatch, gap_penalty,
331 					     intron_penalty,
332 					     splice_penalty,
333 					     reversed_splice_sites,
334 					     megabytes);
335 
336 		if(bge->score > fge->score) /* probably have a
337 					       reversed gene */
338 		{
339 		    ajFmtPrintF(outfile,
340 				"Note Best alignment is between forward "
341 				"est and forward genome, but splice "
342 				"sites imply REVERSED GENE\n");
343 		    est2genome_make_output(outfile, genome, est, bge,
344 					   gap_penalty,
345 					   intron_penalty, splice_penalty,
346 					   minscore, align, width,
347 					   reverse);
348 
349 		    if(best == 0)	/* print substandard alignment too */
350 			est2genome_make_output(outfile, genome, est, fge,
351 					       gap_penalty, intron_penalty,
352 					       splice_penalty, minscore,
353 					       align, width, reverse);
354 		}
355 		else
356 		{
357 		    ajFmtPrintF(outfile,
358 				"Note Best alignment is between forward "
359 				"est and forward genome, and splice "
360 				"sites imply forward gene\n");
361 		    est2genome_make_output(outfile, genome, est, fge,
362 					   gap_penalty, intron_penalty,
363 					   splice_penalty, minscore,
364 					   align, width, reverse);
365 		    if(best == 0)
366 			est2genome_make_output(outfile, genome, est, bge,
367 					       gap_penalty, intron_penalty,
368 					       splice_penalty, minscore,
369 					       align, width, reverse);
370 		}
371 	    }
372 	    else
373 	    {
374 		bge = embEstAlignLinearSpace(reversed_est,genome,
375 					     match, mismatch,
376 					     gap_penalty,
377 					     intron_penalty,
378 					     splice_penalty,
379 					     reversed_splice_sites,
380 					     megabytes);
381 
382 		if(bge->score > rge->score) /* probably have a
383 					       reversed gene */
384 		{
385 		    ajFmtPrintF(outfile,
386 				"Note Best alignment is between "
387 				"reversed est and forward genome, but "
388 				"splice sites imply REVERSED GENE\n");
389 		    est2genome_make_output(outfile, genome, reversed_est,
390 					   bge, gap_penalty, intron_penalty,
391 					   splice_penalty, minscore,
392 					   align, width, isreverse);
393 		    if(best == 0)	/* print substandard alignment too */
394 			est2genome_make_output(outfile, genome,
395 					       reversed_est, rge, gap_penalty,
396 					       intron_penalty,
397 					       splice_penalty, minscore,
398 					       align, width, isreverse);
399 		}
400 		else
401 		{
402 		    ajFmtPrintF(outfile,
403 				"Note Best alignment is between reversed "
404 				"est and forward genome, and splice "
405 				"sites imply forward gene\n");
406 		    est2genome_make_output(outfile, genome, reversed_est,
407 					   rge, gap_penalty, intron_penalty,
408 					   splice_penalty, minscore,
409 					   align, width, isreverse);
410 
411 		    if(best == 0)
412 			est2genome_make_output(outfile, genome,
413 					       reversed_est, bge,
414 					       gap_penalty, intron_penalty,
415 					       splice_penalty, minscore,
416 					       align, width, isreverse);
417 		}
418 	    }
419 	}
420 	else if(search_mode == FORWARD_ONLY)
421 	{
422 	    ajFmtPrintF(outfile,
423 			"Note requested forward est and forward genome\n");
424 	    est2genome_make_output(outfile, genome, est, fge,
425 				   gap_penalty, intron_penalty,
426 				   splice_penalty, minscore,
427 				   align, width, reverse);
428 		if(best == 0)
429 		    est2genome_make_output(outfile, genome, est, bge,
430 					   gap_penalty, intron_penalty,
431 					   splice_penalty, minscore,
432 					   align, width, reverse);
433 	    }
434 
435 	else if(search_mode == REVERSE_ONLY)
436 	{
437 	    ajFmtPrintF(outfile,"Note requested reversed est and "
438 			"forward genome\n");
439 	    est2genome_make_output(outfile, genome, reversed_est,
440 				   rge, gap_penalty, intron_penalty,
441 				   splice_penalty, minscore,
442 				   align, width, isreverse);
443 	    if( best == 0 )
444 		est2genome_make_output(outfile, genome,
445 				       reversed_est, bge, gap_penalty,
446 				       intron_penalty,
447 				       splice_penalty, minscore,
448 				       align, width, isreverse);
449 	}
450 
451 	embEstFreeAlign(&bge);
452 	embEstFreeAlign(&rge);
453 	embEstFreeAlign(&fge);
454 
455 	/* ajSeqDel(&est); */ /* Clone from seqall: Don't delete */
456 	ajSeqDel(&reversed_est);
457     }
458 
459     ajSeqDel(&splice_sites);
460 
461     ajSeqDel(&reversed_splice_sites);
462 
463     ajSeqDel(&genome);
464     ajSeqallDel(&estset);
465     ajSeqDel(&est);
466     ajFileClose(&outfile);
467     ajStrDel(&modestr);
468 
469     embExit();
470 
471     return 0;
472 }
473 
474 
475 
476 
477 /* @funcstatic est2genome_make_output *****************************************
478 **
479 ** Undocumented.
480 **
481 ** @param [u] ofile [AjPFile] Undocumented
482 ** @param [r] genome [const AjPSeq] Undocumented
483 ** @param [r] est [const AjPSeq] Undocumented
484 ** @param [r] ge [const EmbPEstAlign] Undocumented
485 ** @param [r] gap_penalty [ajint] Undocumented
486 ** @param [r] intron_penalty [ajint] Undocumented
487 ** @param [r] splice_penalty [ajint] Undocumented
488 ** @param [r] minscore [ajint] Undocumented
489 ** @param [r] align [ajint] Undocumented
490 ** @param [r] width [ajint] Undocumented
491 ** @param [r] reverse [ajint] Undocumented
492 ** @@
493 ******************************************************************************/
494 
est2genome_make_output(AjPFile ofile,const AjPSeq genome,const AjPSeq est,const EmbPEstAlign ge,ajint gap_penalty,ajint intron_penalty,ajint splice_penalty,ajint minscore,ajint align,ajint width,ajint reverse)495 static void est2genome_make_output(AjPFile ofile,
496 				   const AjPSeq genome, const AjPSeq est,
497 				   const EmbPEstAlign ge, ajint gap_penalty,
498 				   ajint intron_penalty, ajint splice_penalty,
499 				   ajint minscore, ajint align, ajint width,
500 				   ajint reverse)
501 {
502 
503     if(ge->score >= minscore)
504     {
505 	embEstOutBlastStyle(ofile, genome, est, ge,
506 			    gap_penalty,
507 			    intron_penalty, splice_penalty, 1, reverse );
508 	ajFmtPrintF( ofile, "\n");
509 	embEstOutBlastStyle(ofile, genome, est, ge,
510 			    gap_penalty,
511 			    intron_penalty, splice_penalty, 0, reverse);
512 
513 	if(align)
514 	{
515 	    ajFmtPrintF(ofile, "\n\n%s vs %s:\n",
516 			ajSeqGetNameC(genome), ajSeqGetNameC(est));
517 	    embEstPrintAlign(ofile, genome, est, ge, width);
518 	}
519     }
520 
521     return;
522 }
523