1 #include "version.h"
2 #include "genestats.h"
3 #include "geneutil.h"
4 #include "standardout.h"
5 #include "pseudowise7.h"
6 #include "genedisplay.h"
7 
8 #include "genewise6.h"
9 
10 char * program_name = "pseudowise";
11 
12 
show_version(FILE * ofp)13 void show_version(FILE * ofp)
14 {
15   fprintf(ofp,"%s\nVersion: %s\nReleased: %s\nCompiled: %s\n",program_name,VERSION_NUMBER,RELEASE_DAY,COMPILE_DATE);
16   fprintf(ofp,"\nThis program is freely distributed under a Gnu Public License\n");
17   fprintf(ofp,"The source code is copyright (c) EMBL and others\n");
18   fprintf(ofp,"There is no warranty, implied or otherwise on the performance of this program\n");
19   fprintf(ofp,"For more information read the GNULICENSE file in the distribution\n\n");
20   fprintf(ofp,"Credits: Ewan Birney <birney@ebi.ac.uk>\n");
21   exit(63);
22 }
23 
show_help(FILE * ofp)24 void show_help(FILE * ofp)
25 {
26   fprintf(ofp,"%s query-protein-as-fasta query-cdna-as-fasta target-genomic-region\n",program_name);
27   fprintf(ofp,"  -pretty   show pretty output\n");
28   show_help_GeneModelParam(ofp);
29 
30   show_help_StandardOutputOptions(ofp);
31 
32   show_help_DPRunImpl(ofp);
33 
34   show_standard_options(ofp);
35 }
36 
37 typedef struct {
38   double gene_model;
39   double pseudo_model;
40   int synonymous;
41   int nonsynonymous;
42   int unlikely;
43   int frameshift;
44   int identical;
45   int stop;
46   int codon;
47   int intron;
48 } PseudoGeneAssessment;
49 
pseudo_gene_assessment(AlnBlock * alb,Sequence * pep,Sequence * cdna,Sequence * gen,CompMat * protein,CodonTable * ct,double subs,double indel)50 PseudoGeneAssessment pseudo_gene_assessment(AlnBlock * alb,Sequence * pep,Sequence * cdna,Sequence *gen,CompMat * protein,CodonTable * ct,double subs,double indel)
51 {
52 
53   int aa;
54   int gen_pos;
55   int pep_pos;
56 
57   int i,j;
58   AlnColumn * alc;
59 
60 
61   PseudoGeneAssessment out;
62 
63   out.gene_model    = 1.0;
64   out.pseudo_model  = 1.0;
65   out.synonymous    = 0;
66   out.nonsynonymous = 0;
67   out.identical     = 0;
68   out.unlikely      = 0;
69   out.frameshift    = 0;
70   out.stop          = 0;
71   out.codon         = 0;
72   out.intron        = 0;
73 
74   for(alc=alb->start;alc!=NULL;alc = alc->next ) {
75     gen_pos = alc->alu[1]->start+1;
76     pep_pos = alc->alu[0]->start+1;
77 
78     if( strcmp(alc->alu[1]->text_label,"CODON") == 0 && strcmp(alc->alu[0]->text_label,"MATCH_STATE") == 0 ) {
79       out.codon++;
80       if( is_stop_codon(codon_from_seq(gen->seq+gen_pos),ct) ) {
81 	out.stop++;
82 	continue;
83       }
84 
85       aa = aminoacid_from_seq(ct,gen->seq+gen_pos);
86 
87       /* see if there has been a synomous codon change here */
88       if( aa == pep->seq[pep_pos] ) {
89 	if( cdna->seq[pep_pos*3] != gen->seq[gen_pos] ||
90 	    cdna->seq[pep_pos*3+1] != gen->seq[gen_pos+1] ||
91 	    cdna->seq[pep_pos*3+2] != gen->seq[gen_pos+2] ) {
92 	  out.synonymous++;
93 	} else {
94 	  /* identical codon */
95 	  out.identical++;
96 	}
97       } else {
98 	out.nonsynonymous++;
99 	if( protein->comp[aa][pep->seq[pep_pos]] < 0 ) {
100 	  out.unlikely++;
101 	}
102       }
103 
104     } else if ( strstr(alc->alu[1]->text_label,"SEQUENCE_DELETION") != NULL ) {
105       out.frameshift++;
106     } else if ( strstr(alc->alu[1]->text_label,"SEQUENCE_INSERTION") != NULL ) {
107       out.frameshift++;
108     } else if( strstr(alc->alu[1]->text_label,"5SS") != NULL ) {
109       out.intron++;
110     }
111   }
112 
113   return out;
114 }
115 
116 
check_pep_cdna_synchrony(Sequence * cdna,Sequence * pep,CodonTable * ct)117 int check_pep_cdna_synchrony(Sequence * cdna,Sequence * pep,CodonTable * ct)
118 {
119   int i;
120 
121   assert(cdna);
122   assert(pep);
123 
124   if( ! (is_dna_SequenceType(cdna->type)) ) {
125     fatal("cDNA sequnece is not DNA!");
126   }
127 
128   for(i=0;i<pep->len;i++) {
129     if( pep->seq[i] != aminoacid_from_seq(ct,cdna->seq+(i*3)) ) {
130       fatal("cDNA sequence does not match up with peptide sequence");
131     }
132   }
133 
134   return 0;
135 }
136 
137 
main(int argc,char ** argv)138 int main(int argc,char ** argv)
139 {
140   DPRunImpl * dpri     = NULL;
141   GeneModelParam * gmp = NULL;
142   GeneModel * gm       = NULL;
143   CodonTable * ct      = NULL;
144 
145   Sequence * prot      = NULL;
146   Protein  * protein   = NULL;
147   Sequence * cdna      = NULL;
148   Sequence * genomic   = NULL;
149   Genomic  * gen       = NULL;
150 
151   ComplexSequenceEvalSet * cses = NULL;
152   ComplexSequence * cseq = NULL;
153 
154   ThreeStateModel * tsm;
155   GeneWise * gw;
156 
157   GeneWiseScore * gws;
158   GeneParser4Score * gps;
159   GeneParser4 * gp;
160 
161   GenomicRegion * gr;
162 
163   CodonMapper  * cm = NULL;
164 
165   StandardOutputOptions * std_opt;
166 
167   boolean show_pretty = FALSE;
168 
169   PackAln * pal;
170   AlnBlock * alb;
171 
172   PseudoGeneAssessment pa;
173 
174   boolean use_genewise= TRUE;
175 
176   char * matrix = "blosum62.bla";
177   CompMat * mat;
178   double indel_rate = 0.01;
179   double subs_rate  = 0.01;
180   int offset;
181   RandomModel * rm;
182   RandomModelDNA * rmd;
183 
184 
185   dpri = new_DPRunImpl_from_argv(&argc,argv);
186   if( dpri == NULL ) {
187     fatal("Unable to build DPRun implementation. Bad arguments");
188   }
189 
190   gmp = new_GeneModelParam_from_argv(&argc,argv);
191 
192   strip_out_boolean_def_argument(&argc,argv,"pretty",&show_pretty);
193 
194   std_opt = new_StandardOutputOptions_from_argv(&argc,argv);
195   strip_out_boolean_def_argument(&argc,argv,"gw",&use_genewise);
196 
197   strip_out_standard_options(&argc,argv,show_help,show_version);
198   if( argc != 4 ) {
199     show_help(stdout);
200     exit(12);
201   }
202 
203 
204   if((gm=GeneModel_from_GeneModelParam(gmp)) == NULL ) {
205     fatal("Could not build gene model");
206   }
207 
208   ct= read_CodonTable_file("codon.table");
209 
210 
211   prot    = read_fasta_file_Sequence(argv[1]);
212   cdna    = read_fasta_file_Sequence(argv[2]);
213   genomic = read_fasta_file_Sequence(argv[3]);
214 
215   if( prot == NULL || cdna == NULL || genomic == NULL ) {
216     fatal("need to have 3 sequences- protein guide, cdna and genomic");
217   }
218 
219 
220   offset = check_pep_cdna_synchrony(cdna,prot,ct);
221 
222   rm = default_RandomModel();
223   rmd = RandomModelDNA_std();
224 
225   mat = read_Blast_file_CompMat(matrix);
226 
227   cm = flat_CodonMapper(ct);
228 
229   sprinkle_errors_over_CodonMapper(cm,subs_rate);
230 
231   protein = Protein_from_Sequence(prot);
232 
233   tsm = ThreeStateModel_from_half_bit_Sequence(protein,mat,rm,-11,-1);
234 
235   gw = GeneWise_from_ThreeStateModel(tsm,NULL,cm,0.0001,NULL);
236 
237   cses = new_ComplexSequenceEvalSet_from_GeneModel(gm);
238   cseq = new_ComplexSequence(genomic,cses);
239 
240   gp  = std_GeneParser4(indel_rate,indel_rate*0.1);
241 
242 
243   gps = GeneParser4Score_from_GeneParser4(gp);
244 
245   GeneWise_fold_in_RandomModelDNA(gw,rmd);
246 
247   gws = GeneWiseScore_from_GeneWise(gw);
248 
249   if( use_genewise ) {
250     pal = PackAln_bestmemory_GeneWise6(gws,cseq,gps,NULL,dpri);
251     alb = convert_PackAln_to_AlnBlock_GeneWise6(pal);
252   } else {
253     pal = PackAln_bestmemory_PseudoWise7(gws,cseq,gps,Probability2Score(indel_rate),NULL,dpri);
254     alb = convert_PackAln_to_AlnBlock_PseudoWise7(pal);
255   }
256 
257 
258   show_StandardOutputOptions(std_opt,alb,pal,"//",stdout);
259 
260   pa = pseudo_gene_assessment(alb,
261 			      prot,
262 			      cdna,
263 			      genomic,
264 			      mat,ct,subs_rate,indel_rate);
265 
266   printf("Synonymous     : %d\n",pa.synonymous);
267   printf("Nonsynonymous  : %d\n",pa.nonsynonymous);
268   printf("Ka/Ks          : %.2f\n",(double)pa.nonsynonymous/(double)pa.synonymous);
269   printf("Unlikely       : %d\n",pa.unlikely);
270   printf("Identical      : %d\n",pa.identical);
271   printf("Stop           : %d\n",pa.stop);
272   printf("Total codons   : %d\n",pa.codon);
273   printf("Frameshift     : %d\n",pa.frameshift);
274   printf("Intron         : %d\n",pa.intron);
275 
276   printf("//\n");
277 
278   gen = Genomic_from_Sequence(genomic);
279   gr  = new_GenomicRegion_from_GeneWise(gen,FALSE,alb);
280 
281   show_pretty_GenomicRegion(gr,0,stdout);
282 
283   printf("//\n");
284   if( show_pretty) {
285     protgene_ascii_display(alb,prot->seq,prot->name,prot->offset,gen,ct,15,50,FALSE, stdout);
286   }
287 
288 
289   return 0;
290 
291 }
292