1 #include "est_evidence.h"
2 #include "genomewise9.h"
3 #include "geneutil.h"
4 #include "geneoutput.h"
5 #include "version.h"
6
7
8
9 char * program_name = "genomewise";
10
11
12 void debug_genomewise(AlnBlock * alb,GenomeEvidenceSet * ges,CodonTable * ct,Sequence * gen,FILE * ofp);
13
14 void show_utr_exon_genomewise(AlnBlock * alb,FILE * ofp);
15
show_version(FILE * ofp)16 void show_version(FILE * ofp)
17 {
18 fprintf(ofp,"%s\nVersion: %s\nReleased: %s\nCompiled: %s\n",program_name,VERSION_NUMBER,RELEASE_DAY,COMPILE_DATE);
19 fprintf(ofp,"\nThis program is freely distributed under a Gnu Public License\n");
20 fprintf(ofp,"The source code is copyright (c) EMBL and others\n");
21 fprintf(ofp,"There is no warranty, implied or otherwise on the performance of this program\n");
22 fprintf(ofp,"For more information read the GNULICENSE file in the distribution\n\n");
23 fprintf(ofp,"Credits: Ewan Birney <birney@ebi.ac.uk>\n");
24 exit(63);
25 }
26
show_help(FILE * ofp)27 void show_help(FILE * ofp)
28 {
29 fprintf(ofp,"%s genomic-fasta-file evidence-file\n",program_name);
30 fprintf(ofp," ** Genomewise is designed to work with the Ensembl EST build system\n");
31 fprintf(ofp," ** Although you can reuse it directly, alot of the magic occurs in \n");
32 fprintf(ofp," ** the Ensembl Runnable/RunnableDB system behind this. see www.ensembl.org \n\n");
33
34 fprintf(ofp," evidence file should have exon,cds and indel lines separated by //\n");
35 fprintf(ofp," between predictions (multiple predictions ok)\n");
36 fprintf(ofp," exon start end -- means exon prediction, no phase restriction\n");
37 fprintf(ofp," cds start end phase -- means exon prediction, only in that phase\n");
38 fprintf(ofp," indel start end -- allow frameshifting in this area\n");
39 fprintf(ofp,"eg - \n");
40 fprintf(ofp,"exon 120 340\n");
41 fprintf(ofp,"exon 560 591\n");
42 fprintf(ofp,"//\n");
43 fprintf(ofp,"cds 12 56 0\n");
44 fprintf(ofp,"cds 70 80 1\n");
45 fprintf(ofp,"\n\nOPTIONS (can occur anywhere on the command line\n");
46 fprintf(ofp,"Scoring\n");
47 fprintf(ofp," -start <number> no start codon penalty 30\n");
48 fprintf(ofp," -stop <number> no stop codon penalty 200\n");
49 fprintf(ofp," -gene <number> new gene cost 5000\n");
50 fprintf(ofp," -switch <number> evidence switch cost 100\n");
51 fprintf(ofp," -smell <number> smell space used for out-phase splice sites 8\n");
52
53 show_help_GeneOutputPara(ofp);
54
55 show_help_DPRunImpl(ofp);
56
57
58
59 show_standard_options(ofp);
60 }
61
62
63
64
main(int argc,char ** argv)65 int main(int argc,char ** argv)
66 {
67 Sequence * gen;
68 Genomic * genomic;
69 CodonTable * ct = NULL;
70 GenomeEvidenceSet * ges = NULL;
71 RandomCodonScore * rcs;
72 FILE * ifp = NULL;
73 ComplexSequence * cs = NULL;
74 ComplexSequenceEvalSet * cses = NULL;
75 AlnBlock * alb;
76 PackAln * pal;
77 GenomicRegion * gr;
78 int i;
79
80
81 int kbyte = 10000;
82 int stop_codon_pen = 200;
83 int start_codon_pen = 30;
84 int new_gene = 5000;
85 int switch_cost = 100;
86 int smell = 8;
87 DPRunImpl * dpri = NULL;
88 GeneOutputPara * geneout = NULL;
89 GeneOutputData data;
90
91 EstEvidence * est;
92
93 boolean show_debug = FALSE;
94
95 char * divide_string = "//";
96
97 strip_out_boolean_def_argument(&argc,argv,"debug",&show_debug);
98
99 strip_out_integer_argument(&argc,argv,"stop",&stop_codon_pen);
100 strip_out_integer_argument(&argc,argv,"start",&start_codon_pen);
101 strip_out_integer_argument(&argc,argv,"gene",&new_gene);
102 strip_out_integer_argument(&argc,argv,"switch",&switch_cost);
103 strip_out_integer_argument(&argc,argv,"smell",&smell);
104
105 dpri = new_DPRunImpl_from_argv(&argc,argv);
106
107 if( dpri == NULL ) {
108 fatal("Unable to build DPRun implementation. Bad arguments");
109 }
110
111 geneout = new_GeneOutputPara_from_argv(&argc,argv);
112 assert(geneout);
113
114
115 strip_out_standard_options(&argc,argv,show_help,show_version);
116 if( argc != 3 ) {
117 show_help(stdout);
118 exit(12);
119 }
120
121
122
123 ct = read_CodonTable_file("codon.table");
124
125 /* fprintf(stderr,"Codon table is %d\n",ct);*/
126
127 gen = read_fasta_file_Sequence(argv[1]);
128 ifp = openfile(argv[2],"r");
129 ges = read_est_evidence(ifp,ct);
130
131 for(i=0;i<ges->len;i++) {
132 est = (EstEvidence *) (ges->geu[i]->data);
133 est->in_smell = smell;
134 }
135
136
137 rcs= RandomCodonScore_alloc();
138 for(i=0;i<125;i++) {
139 if( is_stop_codon(i,ct) ) {
140 rcs->codon[i] = -1000000;
141 } else {
142 rcs->codon[i] = 0;
143 }
144 /* fprintf(stderr,"Got %d for %d\n",rcs->codon[i],i); */
145 }
146
147
148
149 cses = default_genomic_ComplexSequenceEvalSet();
150 cs = new_ComplexSequence(gen,cses);
151
152
153 pal = PackAln_bestmemory_GenomeWise9(ges,cs,-switch_cost,-new_gene,-start_codon_pen,-stop_codon_pen,rcs,NULL,dpri);
154 alb = convert_PackAln_to_AlnBlock_GenomeWise9(pal);
155
156
157 genomic = Genomic_from_Sequence(gen);
158 gr = new_GenomicRegion(genomic);
159
160 add_Genes_to_GenomicRegion_GeneWise(gr,1,gen->len,alb,gen->name,0,NULL);
161
162 data.pal = pal;
163 data.alb = alb;
164 data.gr = gr;
165 data.gen = genomic;
166 data.ct = ct;
167
168 show_GeneOutput(&data,geneout,stdout);
169
170 if( show_debug ) {
171 debug_genomewise(alb,ges,ct,gen,stdout);
172 fprintf(stdout,"%s\n",geneout->divide_string);
173 }
174
175 return 0;
176 }
177
178
debug_genomewise(AlnBlock * alb,GenomeEvidenceSet * ges,CodonTable * ct,Sequence * gen,FILE * ofp)179 void debug_genomewise(AlnBlock * alb,GenomeEvidenceSet * ges,CodonTable * ct,Sequence * gen,FILE * ofp)
180 {
181 AlnColumn *alc;
182 int cstart;
183
184
185 for(alc=alb->start;alc != NULL;alc = alc->next ) {
186 fprintf(ofp,"%4d %12s %12s [%3d][%5d %5d] ",alc->alu[1]->score[0],alc->alu[0]->text_label,alc->alu[1]->text_label,alc->alu[0]->start,alc->alu[1]->start+1,alc->alu[1]->end);
187 if( strstartcmp(alc->alu[1]->text_label,"CODON") == 0 ) {
188 cstart = alc->alu[1]->start+1;
189 fprintf(ofp,"%c%c%c %c\n",gen->seq[cstart],gen->seq[cstart+1],gen->seq[cstart+2],aminoacid_from_seq(ct,gen->seq+cstart));
190 } else {
191 for (cstart = alc->alu[1]->start+1; cstart <= alc->alu[1]->end; cstart++) {
192 fprintf(ofp,"%c",gen->seq[cstart]);
193 }
194 fprintf(ofp,"\n");
195 }
196 }
197
198 }
199
200
201
202
203
204
205
206
207