1 /* @source est2genome application
2 **
3 ** Richard Mott's est_genome ported into EMBOSS.
4 ** See also nucleus/embest.c
5 **
6 ** @author Copyright (C) Peter Rice, Sanger Centre
7 ** @author Copyright (C) Richard Mott, Sanger Centre
8 ** @@
9 **
10 ** This program is free software; you can redistribute it and/or
11 ** modify it under the terms of the GNU General Public License
12 ** as published by the Free Software Foundation; either version 2
13 ** of the License, or (at your option) any later version.
14 **
15 ** This program is distributed in the hope that it will be useful,
16 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ** GNU General Public License for more details.
19 **
20 ** You should have received a copy of the GNU General Public License
21 ** along with this program; if not, write to the Free Software
22 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
23 ******************************************************************************/
24
25 /*
26 ** Revision 1.7 1997/03/17 15:26:00 pmr
27 ** when EST is reversed, need to reverse the EST sequence positions
28 ** also cleaned up long code lines
29 **
30 ** Revision 1.6 1997/02/10 14:07:54 rmott
31 ** fixed bug so that splice sites in REVERSE direction
32 ** (ie ct/ac rather than gt/ag) are found correctly.
33 ** Output modified so that splice direction is written
34 **
35 ** Revision 1.4 1997/01/30 17:21:22 rmott
36 ** fixed bug, and now computes area limit better
37 **
38 ** Revision 1.3 1997/01/30 17:03:45 rmott
39 ** debugged version with debug statements still in.
40 ** Fixed problem of not initialising edge properly
41 **
42 ** Revision 1.2 1996/08/07 10:12:05 rmott
43 **
44 ** Linear-Space version
45 **
46 ** Revision 1.1 1996/08/01 13:55:42 rmott
47 ** Initial revision
48 **
49 */
50
51 #include "emboss.h"
52 #include <math.h>
53
54
55
56
57 extern ajint lsimmat[256][256];
58
59 #define BOTH 0
60 #define FORWARD_ONLY 1
61 #define REVERSE_ONLY 2
62
63 ajint verbose;
64 ajint debug;
65
66
67
68
69 static void est2genome_make_output(AjPFile ofile,
70 const AjPSeq genome, const AjPSeq est,
71 const EmbPEstAlign ge, ajint gap_penalty,
72 ajint intron_penalty,
73 ajint splice_penalty, ajint minscore,
74 ajint align, ajint width, ajint reverse);
75
76
77
78
79 /* @prog est2genome ***********************************************************
80 **
81 ** Align EST and genomic DNA sequences
82 **
83 ******************************************************************************/
84
85
main(int argc,char ** argv)86 int main(int argc, char **argv)
87 {
88 AjPSeq genome = NULL;
89 AjPSeq splice_sites = NULL;
90 AjPSeq reversed_splice_sites = NULL;
91 AjPSeq est = NULL;
92 AjPSeq reversed_est = NULL;
93 EmbPEstAlign fge = NULL;
94 EmbPEstAlign rge = NULL;
95 EmbPEstAlign bge = NULL;
96 ajint width = 50;
97 ajint match = 1;
98 ajint mismatch = 1;
99 ajint gap_penalty = 2;
100 ajint intron_penalty = 40;
101 ajint splice_penalty = 20;
102 ajint splice = 1;
103 ajint align = 0;
104 ajint reverse = 0;
105 ajint isreverse = 0;
106 ajint doreverse = 0; /* zero for first inclusion, set to 1 later */
107 float megabytes = 10.0;
108 ajint minscore = 30;
109 ajint shuffles = 0;
110 ajint max_score = 0;
111 ajint seed;
112 ajint best=1;
113 ajint search_mode;
114 AjPStr modestr = NULL;
115 AjPFile outfile = NULL;
116 AjPSeqall estset = NULL;
117
118 /* the fasta input files */
119
120 embInit("est2genome", argc, argv);
121
122 estset = ajAcdGetSeqall("estsequence");
123 genome = ajAcdGetSeq("genomesequence");
124 outfile = ajAcdGetOutfile("outfile");
125
126 /* the alignment penalties */
127
128 match = ajAcdGetInt("match");
129 mismatch = ajAcdGetInt("mismatch");
130 gap_penalty = ajAcdGetInt("gappenalty");
131 intron_penalty = ajAcdGetInt("intronpenalty");
132 splice_penalty = ajAcdGetInt("splicepenalty");
133 doreverse = ajAcdGetBoolean("reverse");
134
135 /* the min score for an alignment to be output */
136 minscore = ajAcdGetInt("minscore");
137
138 if(doreverse)
139 isreverse = 1;
140
141 splice = ajAcdGetBoolean("usesplice");
142
143 /* Print the alignment */
144 align = ajAcdGetBoolean("align");
145 width = ajAcdGetInt("width");
146
147 /* mode: This is complicated.
148 **
149 ** "forward" just search forward strands of both sequences
150 ** "reverse" just search forward of genomic vs reverse of est
151 ** "both" search forward strand of genomic against forward and
152 ** reverse
153 ** THEN: take the best of these two, and re-align assuming
154 ** a reversed gene so that the splice sites would be appear
155 ** as ct/ac. Only output the best alignment unless the
156 ** flag -nobest is set.
157 **
158 ** Thus THREE alignments are made.
159 **
160 ** The output cordinates are such that the genomic sequence
161 ** is always in the forward direction.
162 */
163
164 modestr = ajAcdGetListSingle("mode");
165
166 if(ajStrMatchC(modestr,"both"))
167 search_mode = BOTH;
168 else if(ajStrMatchC(modestr,"forward"))
169 search_mode = FORWARD_ONLY;
170 else if(ajStrMatchC(modestr,"reverse"))
171 search_mode = REVERSE_ONLY;
172 else
173 {
174 ajErr("search mode %S must be one of: "
175 "both, forward, reverse\n", modestr);
176 exit(1);
177 }
178
179 /* just print the best alignment ? */
180
181 best = ajAcdGetBoolean("best");
182
183 /* max space in megabytes */
184
185 megabytes = ajAcdGetFloat("space");
186
187 /* print debugging info */
188
189 verbose = ajAcdGetBoolean("verbose");
190 debug = ajAcdGetBoolean("debug");
191
192 if(verbose)
193 ajDebug("debugging set to %d\n", debug);
194
195 if(verbose)
196 embEstSetVerbose();
197 if(debug)
198 embEstSetDebug();
199
200 /*
201 ** shuffle the sequences to test for statistical
202 ** significance this many times
203 */
204
205 shuffles = ajAcdGetInt("shuffle");
206 seed = ajAcdGetInt("seed");
207
208 if(!seed)
209 seed = embEstGetSeed();
210 seed = -seed;
211
212 if(mismatch < 0)
213 mismatch = -mismatch;
214 if(gap_penalty < 0)
215 gap_penalty = -gap_penalty;
216 if(intron_penalty < 0)
217 intron_penalty = -intron_penalty;
218 if(splice_penalty < 0)
219 splice_penalty = -splice_penalty;
220
221 embEstMatInit(match, mismatch, gap_penalty, 0, '-');
222
223 ajSeqTrim(genome);
224
225 /* Make sure theres enough space to hold the genomic AjPSeq */
226
227 if(megabytes < ajSeqGetLen(genome)*1.5e-6)
228 {
229 ajWarn("increasing space from %.3f to %.3f Mb\n",
230 megabytes, 1.5e-6*ajSeqGetLen(genome));
231 megabytes = (float)1.5e-6*ajSeqGetLen(genome);
232 }
233
234 /* find the GT/AG splice sites */
235
236 if(splice)
237 splice_sites = embEstFindSpliceSites(genome, 1);
238 else
239 splice_sites = NULL;
240
241 if(search_mode == BOTH && splice)
242 reversed_splice_sites = embEstFindSpliceSites( genome, 0 );
243 else
244 reversed_splice_sites = NULL;
245
246 /* process each est */
247
248 while(ajSeqallNext(estset, &est))
249 {
250 /*
251 ** if required, make shuffled comparisons
252 ** to get statistical significance
253 */
254 ajSeqTrim(est);
255
256 ajDebug("shuffles: %d\n", shuffles);
257 if(shuffles > 0)
258 {
259 AjPSeq shuffled_est;
260 ajint n;
261 ajint score;
262 double mean = 0;
263 double std = 0;
264 EmbPEstAlign sge;
265
266 shuffled_est = ajSeqNewSeq(est);
267
268 for(n=0;n<shuffles;n++)
269 {
270 embEstShuffleSeq(shuffled_est, 1, &seed);
271 sge = embEstAlignNonRecursive(shuffled_est,
272 genome,
273 gap_penalty,
274 intron_penalty,
275 splice_penalty,
276 splice_sites, 0, 0,
277 DIAGONAL);
278 score = sge->score;
279 ajDebug("%30.30S\n", ajSeqGetSeqS(shuffled_est));
280 ajDebug("%5d score %d seed %d\n", n, score, seed);
281 if(score > max_score)
282 max_score = score;
283 mean += score;
284 std += score*score;
285 embEstFreeAlign(&sge);
286 }
287
288 mean /= shuffles;
289 std = sqrt((std = shuffles*mean*mean)/(shuffles-1.0));
290 ajDebug("shuffles: %d max: %d mean: %.2f std dev: %.2f\n",
291 shuffles, max_score, mean, std);
292 minscore = max_score+1;
293 ajSeqDel(&shuffled_est);
294 }
295
296 if(search_mode != REVERSE_ONLY)
297 {
298 /* forward strand */
299 fge = embEstAlignLinearSpace(est, genome, match,
300 mismatch, gap_penalty,
301 intron_penalty, splice_penalty,
302 splice_sites, megabytes);
303 if(!fge)
304 ajFatal("forward strand alignment failed");
305 }
306 else
307 fge = NULL;
308
309 if(search_mode != FORWARD_ONLY) /* reverse strand */
310 {
311 reversed_est = ajSeqNewSeq(est);
312 ajSeqReverseForce(reversed_est);
313
314 rge = embEstAlignLinearSpace(reversed_est, genome,
315 match, mismatch, gap_penalty,
316 intron_penalty, splice_penalty,
317 splice_sites, megabytes);
318 if(!rge)
319 ajFatal("reverse strand alignment failed");
320 }
321 else
322 rge = NULL;
323
324 if(search_mode == BOTH) /* search both strands */
325 {
326 if(fge->score > rge->score)
327 { /* redo forward search with
328 reversed splice sites */
329 bge = embEstAlignLinearSpace(est, genome, match,
330 mismatch, gap_penalty,
331 intron_penalty,
332 splice_penalty,
333 reversed_splice_sites,
334 megabytes);
335
336 if(bge->score > fge->score) /* probably have a
337 reversed gene */
338 {
339 ajFmtPrintF(outfile,
340 "Note Best alignment is between forward "
341 "est and forward genome, but splice "
342 "sites imply REVERSED GENE\n");
343 est2genome_make_output(outfile, genome, est, bge,
344 gap_penalty,
345 intron_penalty, splice_penalty,
346 minscore, align, width,
347 reverse);
348
349 if(best == 0) /* print substandard alignment too */
350 est2genome_make_output(outfile, genome, est, fge,
351 gap_penalty, intron_penalty,
352 splice_penalty, minscore,
353 align, width, reverse);
354 }
355 else
356 {
357 ajFmtPrintF(outfile,
358 "Note Best alignment is between forward "
359 "est and forward genome, and splice "
360 "sites imply forward gene\n");
361 est2genome_make_output(outfile, genome, est, fge,
362 gap_penalty, intron_penalty,
363 splice_penalty, minscore,
364 align, width, reverse);
365 if(best == 0)
366 est2genome_make_output(outfile, genome, est, bge,
367 gap_penalty, intron_penalty,
368 splice_penalty, minscore,
369 align, width, reverse);
370 }
371 }
372 else
373 {
374 bge = embEstAlignLinearSpace(reversed_est,genome,
375 match, mismatch,
376 gap_penalty,
377 intron_penalty,
378 splice_penalty,
379 reversed_splice_sites,
380 megabytes);
381
382 if(bge->score > rge->score) /* probably have a
383 reversed gene */
384 {
385 ajFmtPrintF(outfile,
386 "Note Best alignment is between "
387 "reversed est and forward genome, but "
388 "splice sites imply REVERSED GENE\n");
389 est2genome_make_output(outfile, genome, reversed_est,
390 bge, gap_penalty, intron_penalty,
391 splice_penalty, minscore,
392 align, width, isreverse);
393 if(best == 0) /* print substandard alignment too */
394 est2genome_make_output(outfile, genome,
395 reversed_est, rge, gap_penalty,
396 intron_penalty,
397 splice_penalty, minscore,
398 align, width, isreverse);
399 }
400 else
401 {
402 ajFmtPrintF(outfile,
403 "Note Best alignment is between reversed "
404 "est and forward genome, and splice "
405 "sites imply forward gene\n");
406 est2genome_make_output(outfile, genome, reversed_est,
407 rge, gap_penalty, intron_penalty,
408 splice_penalty, minscore,
409 align, width, isreverse);
410
411 if(best == 0)
412 est2genome_make_output(outfile, genome,
413 reversed_est, bge,
414 gap_penalty, intron_penalty,
415 splice_penalty, minscore,
416 align, width, isreverse);
417 }
418 }
419 }
420 else if(search_mode == FORWARD_ONLY)
421 {
422 ajFmtPrintF(outfile,
423 "Note requested forward est and forward genome\n");
424 est2genome_make_output(outfile, genome, est, fge,
425 gap_penalty, intron_penalty,
426 splice_penalty, minscore,
427 align, width, reverse);
428 if(best == 0)
429 est2genome_make_output(outfile, genome, est, bge,
430 gap_penalty, intron_penalty,
431 splice_penalty, minscore,
432 align, width, reverse);
433 }
434
435 else if(search_mode == REVERSE_ONLY)
436 {
437 ajFmtPrintF(outfile,"Note requested reversed est and "
438 "forward genome\n");
439 est2genome_make_output(outfile, genome, reversed_est,
440 rge, gap_penalty, intron_penalty,
441 splice_penalty, minscore,
442 align, width, isreverse);
443 if( best == 0 )
444 est2genome_make_output(outfile, genome,
445 reversed_est, bge, gap_penalty,
446 intron_penalty,
447 splice_penalty, minscore,
448 align, width, isreverse);
449 }
450
451 embEstFreeAlign(&bge);
452 embEstFreeAlign(&rge);
453 embEstFreeAlign(&fge);
454
455 /* ajSeqDel(&est); */ /* Clone from seqall: Don't delete */
456 ajSeqDel(&reversed_est);
457 }
458
459 ajSeqDel(&splice_sites);
460
461 ajSeqDel(&reversed_splice_sites);
462
463 ajSeqDel(&genome);
464 ajSeqallDel(&estset);
465 ajSeqDel(&est);
466 ajFileClose(&outfile);
467 ajStrDel(&modestr);
468
469 embExit();
470
471 return 0;
472 }
473
474
475
476
477 /* @funcstatic est2genome_make_output *****************************************
478 **
479 ** Undocumented.
480 **
481 ** @param [u] ofile [AjPFile] Undocumented
482 ** @param [r] genome [const AjPSeq] Undocumented
483 ** @param [r] est [const AjPSeq] Undocumented
484 ** @param [r] ge [const EmbPEstAlign] Undocumented
485 ** @param [r] gap_penalty [ajint] Undocumented
486 ** @param [r] intron_penalty [ajint] Undocumented
487 ** @param [r] splice_penalty [ajint] Undocumented
488 ** @param [r] minscore [ajint] Undocumented
489 ** @param [r] align [ajint] Undocumented
490 ** @param [r] width [ajint] Undocumented
491 ** @param [r] reverse [ajint] Undocumented
492 ** @@
493 ******************************************************************************/
494
est2genome_make_output(AjPFile ofile,const AjPSeq genome,const AjPSeq est,const EmbPEstAlign ge,ajint gap_penalty,ajint intron_penalty,ajint splice_penalty,ajint minscore,ajint align,ajint width,ajint reverse)495 static void est2genome_make_output(AjPFile ofile,
496 const AjPSeq genome, const AjPSeq est,
497 const EmbPEstAlign ge, ajint gap_penalty,
498 ajint intron_penalty, ajint splice_penalty,
499 ajint minscore, ajint align, ajint width,
500 ajint reverse)
501 {
502
503 if(ge->score >= minscore)
504 {
505 embEstOutBlastStyle(ofile, genome, est, ge,
506 gap_penalty,
507 intron_penalty, splice_penalty, 1, reverse );
508 ajFmtPrintF( ofile, "\n");
509 embEstOutBlastStyle(ofile, genome, est, ge,
510 gap_penalty,
511 intron_penalty, splice_penalty, 0, reverse);
512
513 if(align)
514 {
515 ajFmtPrintF(ofile, "\n\n%s vs %s:\n",
516 ajSeqGetNameC(genome), ajSeqGetNameC(est));
517 embEstPrintAlign(ofile, genome, est, ge, width);
518 }
519 }
520
521 return;
522 }
523