1 #include "est_evidence.h"
2 #include "genomewise9.h"
3 #include "geneutil.h"
4 #include "geneoutput.h"
5 #include "version.h"
6 
7 
8 
9 char * program_name = "genomewise";
10 
11 
12 void debug_genomewise(AlnBlock * alb,GenomeEvidenceSet * ges,CodonTable * ct,Sequence * gen,FILE * ofp);
13 
14 void show_utr_exon_genomewise(AlnBlock * alb,FILE * ofp);
15 
show_version(FILE * ofp)16 void show_version(FILE * ofp)
17 {
18   fprintf(ofp,"%s\nVersion: %s\nReleased: %s\nCompiled: %s\n",program_name,VERSION_NUMBER,RELEASE_DAY,COMPILE_DATE);
19   fprintf(ofp,"\nThis program is freely distributed under a Gnu Public License\n");
20   fprintf(ofp,"The source code is copyright (c) EMBL and others\n");
21   fprintf(ofp,"There is no warranty, implied or otherwise on the performance of this program\n");
22   fprintf(ofp,"For more information read the GNULICENSE file in the distribution\n\n");
23   fprintf(ofp,"Credits: Ewan Birney <birney@ebi.ac.uk>\n");
24   exit(63);
25 }
26 
show_help(FILE * ofp)27 void show_help(FILE * ofp)
28 {
29   fprintf(ofp,"%s genomic-fasta-file evidence-file\n",program_name);
30   fprintf(ofp,"   ** Genomewise is designed to work with the Ensembl EST build system\n");
31   fprintf(ofp,"   ** Although you can reuse it directly, alot of the magic occurs in \n");
32   fprintf(ofp,"   ** the Ensembl Runnable/RunnableDB system behind this. see www.ensembl.org \n\n");
33 
34   fprintf(ofp,"   evidence file should have exon,cds and indel lines separated by //\n");
35   fprintf(ofp,"   between predictions (multiple predictions ok)\n");
36   fprintf(ofp,"      exon start end -- means exon prediction, no phase restriction\n");
37   fprintf(ofp,"      cds  start end phase -- means exon prediction, only in that phase\n");
38   fprintf(ofp,"      indel start end -- allow frameshifting in this area\n");
39   fprintf(ofp,"eg - \n");
40   fprintf(ofp,"exon 120 340\n");
41   fprintf(ofp,"exon 560 591\n");
42   fprintf(ofp,"//\n");
43   fprintf(ofp,"cds  12  56 0\n");
44   fprintf(ofp,"cds  70  80 1\n");
45   fprintf(ofp,"\n\nOPTIONS (can occur anywhere on the command line\n");
46   fprintf(ofp,"Scoring\n");
47   fprintf(ofp,"   -start  <number> no start codon penalty 30\n");
48   fprintf(ofp,"   -stop   <number> no stop codon penalty 200\n");
49   fprintf(ofp,"   -gene   <number> new gene cost 5000\n");
50   fprintf(ofp,"   -switch <number> evidence switch cost 100\n");
51   fprintf(ofp,"   -smell  <number> smell space used for out-phase splice sites 8\n");
52 
53   show_help_GeneOutputPara(ofp);
54 
55   show_help_DPRunImpl(ofp);
56 
57 
58 
59   show_standard_options(ofp);
60 }
61 
62 
63 
64 
main(int argc,char ** argv)65 int main(int argc,char ** argv)
66 {
67   Sequence   * gen;
68   Genomic    * genomic;
69   CodonTable * ct = NULL;
70   GenomeEvidenceSet * ges = NULL;
71   RandomCodonScore * rcs;
72   FILE * ifp = NULL;
73   ComplexSequence * cs = NULL;
74   ComplexSequenceEvalSet * cses = NULL;
75   AlnBlock * alb;
76   PackAln * pal;
77   GenomicRegion * gr;
78   int i;
79 
80 
81   int kbyte                = 10000;
82   int stop_codon_pen  = 200;
83   int start_codon_pen = 30;
84   int new_gene        = 5000;
85   int switch_cost     = 100;
86   int smell           = 8;
87   DPRunImpl * dpri = NULL;
88   GeneOutputPara * geneout = NULL;
89   GeneOutputData data;
90 
91   EstEvidence * est;
92 
93   boolean show_debug = FALSE;
94 
95   char * divide_string = "//";
96 
97   strip_out_boolean_def_argument(&argc,argv,"debug",&show_debug);
98 
99   strip_out_integer_argument(&argc,argv,"stop",&stop_codon_pen);
100   strip_out_integer_argument(&argc,argv,"start",&start_codon_pen);
101   strip_out_integer_argument(&argc,argv,"gene",&new_gene);
102   strip_out_integer_argument(&argc,argv,"switch",&switch_cost);
103   strip_out_integer_argument(&argc,argv,"smell",&smell);
104 
105   dpri = new_DPRunImpl_from_argv(&argc,argv);
106 
107   if( dpri == NULL ) {
108     fatal("Unable to build DPRun implementation. Bad arguments");
109   }
110 
111   geneout = new_GeneOutputPara_from_argv(&argc,argv);
112   assert(geneout);
113 
114 
115   strip_out_standard_options(&argc,argv,show_help,show_version);
116   if( argc != 3 ) {
117     show_help(stdout);
118     exit(12);
119   }
120 
121 
122 
123   ct  = read_CodonTable_file("codon.table");
124 
125   /*  fprintf(stderr,"Codon table is %d\n",ct);*/
126 
127   gen = read_fasta_file_Sequence(argv[1]);
128   ifp = openfile(argv[2],"r");
129   ges = read_est_evidence(ifp,ct);
130 
131   for(i=0;i<ges->len;i++) {
132     est = (EstEvidence *) (ges->geu[i]->data);
133     est->in_smell = smell;
134   }
135 
136 
137   rcs= RandomCodonScore_alloc();
138   for(i=0;i<125;i++) {
139     if( is_stop_codon(i,ct) ) {
140       rcs->codon[i] = -1000000;
141     } else {
142       rcs->codon[i] = 0;
143     }
144     /*    fprintf(stderr,"Got %d for %d\n",rcs->codon[i],i); */
145   }
146 
147 
148 
149   cses = default_genomic_ComplexSequenceEvalSet();
150   cs   = new_ComplexSequence(gen,cses);
151 
152 
153   pal  = PackAln_bestmemory_GenomeWise9(ges,cs,-switch_cost,-new_gene,-start_codon_pen,-stop_codon_pen,rcs,NULL,dpri);
154   alb  = convert_PackAln_to_AlnBlock_GenomeWise9(pal);
155 
156 
157   genomic = Genomic_from_Sequence(gen);
158   gr = new_GenomicRegion(genomic);
159 
160   add_Genes_to_GenomicRegion_GeneWise(gr,1,gen->len,alb,gen->name,0,NULL);
161 
162   data.pal = pal;
163   data.alb = alb;
164   data.gr  = gr;
165   data.gen = genomic;
166   data.ct  = ct;
167 
168   show_GeneOutput(&data,geneout,stdout);
169 
170   if( show_debug ) {
171     debug_genomewise(alb,ges,ct,gen,stdout);
172     fprintf(stdout,"%s\n",geneout->divide_string);
173   }
174 
175   return 0;
176 }
177 
178 
debug_genomewise(AlnBlock * alb,GenomeEvidenceSet * ges,CodonTable * ct,Sequence * gen,FILE * ofp)179 void debug_genomewise(AlnBlock * alb,GenomeEvidenceSet * ges,CodonTable * ct,Sequence * gen,FILE * ofp)
180 {
181   AlnColumn *alc;
182   int cstart;
183 
184 
185   for(alc=alb->start;alc != NULL;alc = alc->next ) {
186     fprintf(ofp,"%4d %12s %12s [%3d][%5d %5d] ",alc->alu[1]->score[0],alc->alu[0]->text_label,alc->alu[1]->text_label,alc->alu[0]->start,alc->alu[1]->start+1,alc->alu[1]->end);
187     if( strstartcmp(alc->alu[1]->text_label,"CODON") == 0 ) {
188       cstart = alc->alu[1]->start+1;
189       fprintf(ofp,"%c%c%c  %c\n",gen->seq[cstart],gen->seq[cstart+1],gen->seq[cstart+2],aminoacid_from_seq(ct,gen->seq+cstart));
190     } else {
191       for (cstart = alc->alu[1]->start+1; cstart <= alc->alu[1]->end; cstart++) {
192         fprintf(ofp,"%c",gen->seq[cstart]);
193       }
194       fprintf(ofp,"\n");
195     }
196   }
197 
198 }
199 
200 
201 
202 
203 
204 
205 
206 
207