1 #include "version.h"
2 #include "genestats.h"
3 #include "geneutil.h"
4 #include "standardout.h"
5 #include "pseudowise7.h"
6 #include "genedisplay.h"
7
8 #include "genewise6.h"
9
10 char * program_name = "pseudowise";
11
12
show_version(FILE * ofp)13 void show_version(FILE * ofp)
14 {
15 fprintf(ofp,"%s\nVersion: %s\nReleased: %s\nCompiled: %s\n",program_name,VERSION_NUMBER,RELEASE_DAY,COMPILE_DATE);
16 fprintf(ofp,"\nThis program is freely distributed under a Gnu Public License\n");
17 fprintf(ofp,"The source code is copyright (c) EMBL and others\n");
18 fprintf(ofp,"There is no warranty, implied or otherwise on the performance of this program\n");
19 fprintf(ofp,"For more information read the GNULICENSE file in the distribution\n\n");
20 fprintf(ofp,"Credits: Ewan Birney <birney@ebi.ac.uk>\n");
21 exit(63);
22 }
23
show_help(FILE * ofp)24 void show_help(FILE * ofp)
25 {
26 fprintf(ofp,"%s query-protein-as-fasta query-cdna-as-fasta target-genomic-region\n",program_name);
27 fprintf(ofp," -pretty show pretty output\n");
28 show_help_GeneModelParam(ofp);
29
30 show_help_StandardOutputOptions(ofp);
31
32 show_help_DPRunImpl(ofp);
33
34 show_standard_options(ofp);
35 }
36
37 typedef struct {
38 double gene_model;
39 double pseudo_model;
40 int synonymous;
41 int nonsynonymous;
42 int unlikely;
43 int frameshift;
44 int identical;
45 int stop;
46 int codon;
47 int intron;
48 } PseudoGeneAssessment;
49
pseudo_gene_assessment(AlnBlock * alb,Sequence * pep,Sequence * cdna,Sequence * gen,CompMat * protein,CodonTable * ct,double subs,double indel)50 PseudoGeneAssessment pseudo_gene_assessment(AlnBlock * alb,Sequence * pep,Sequence * cdna,Sequence *gen,CompMat * protein,CodonTable * ct,double subs,double indel)
51 {
52
53 int aa;
54 int gen_pos;
55 int pep_pos;
56
57 int i,j;
58 AlnColumn * alc;
59
60
61 PseudoGeneAssessment out;
62
63 out.gene_model = 1.0;
64 out.pseudo_model = 1.0;
65 out.synonymous = 0;
66 out.nonsynonymous = 0;
67 out.identical = 0;
68 out.unlikely = 0;
69 out.frameshift = 0;
70 out.stop = 0;
71 out.codon = 0;
72 out.intron = 0;
73
74 for(alc=alb->start;alc!=NULL;alc = alc->next ) {
75 gen_pos = alc->alu[1]->start+1;
76 pep_pos = alc->alu[0]->start+1;
77
78 if( strcmp(alc->alu[1]->text_label,"CODON") == 0 && strcmp(alc->alu[0]->text_label,"MATCH_STATE") == 0 ) {
79 out.codon++;
80 if( is_stop_codon(codon_from_seq(gen->seq+gen_pos),ct) ) {
81 out.stop++;
82 continue;
83 }
84
85 aa = aminoacid_from_seq(ct,gen->seq+gen_pos);
86
87 /* see if there has been a synomous codon change here */
88 if( aa == pep->seq[pep_pos] ) {
89 if( cdna->seq[pep_pos*3] != gen->seq[gen_pos] ||
90 cdna->seq[pep_pos*3+1] != gen->seq[gen_pos+1] ||
91 cdna->seq[pep_pos*3+2] != gen->seq[gen_pos+2] ) {
92 out.synonymous++;
93 } else {
94 /* identical codon */
95 out.identical++;
96 }
97 } else {
98 out.nonsynonymous++;
99 if( protein->comp[aa][pep->seq[pep_pos]] < 0 ) {
100 out.unlikely++;
101 }
102 }
103
104 } else if ( strstr(alc->alu[1]->text_label,"SEQUENCE_DELETION") != NULL ) {
105 out.frameshift++;
106 } else if ( strstr(alc->alu[1]->text_label,"SEQUENCE_INSERTION") != NULL ) {
107 out.frameshift++;
108 } else if( strstr(alc->alu[1]->text_label,"5SS") != NULL ) {
109 out.intron++;
110 }
111 }
112
113 return out;
114 }
115
116
check_pep_cdna_synchrony(Sequence * cdna,Sequence * pep,CodonTable * ct)117 int check_pep_cdna_synchrony(Sequence * cdna,Sequence * pep,CodonTable * ct)
118 {
119 int i;
120
121 assert(cdna);
122 assert(pep);
123
124 if( ! (is_dna_SequenceType(cdna->type)) ) {
125 fatal("cDNA sequnece is not DNA!");
126 }
127
128 for(i=0;i<pep->len;i++) {
129 if( pep->seq[i] != aminoacid_from_seq(ct,cdna->seq+(i*3)) ) {
130 fatal("cDNA sequence does not match up with peptide sequence");
131 }
132 }
133
134 return 0;
135 }
136
137
main(int argc,char ** argv)138 int main(int argc,char ** argv)
139 {
140 DPRunImpl * dpri = NULL;
141 GeneModelParam * gmp = NULL;
142 GeneModel * gm = NULL;
143 CodonTable * ct = NULL;
144
145 Sequence * prot = NULL;
146 Protein * protein = NULL;
147 Sequence * cdna = NULL;
148 Sequence * genomic = NULL;
149 Genomic * gen = NULL;
150
151 ComplexSequenceEvalSet * cses = NULL;
152 ComplexSequence * cseq = NULL;
153
154 ThreeStateModel * tsm;
155 GeneWise * gw;
156
157 GeneWiseScore * gws;
158 GeneParser4Score * gps;
159 GeneParser4 * gp;
160
161 GenomicRegion * gr;
162
163 CodonMapper * cm = NULL;
164
165 StandardOutputOptions * std_opt;
166
167 boolean show_pretty = FALSE;
168
169 PackAln * pal;
170 AlnBlock * alb;
171
172 PseudoGeneAssessment pa;
173
174 boolean use_genewise= TRUE;
175
176 char * matrix = "blosum62.bla";
177 CompMat * mat;
178 double indel_rate = 0.01;
179 double subs_rate = 0.01;
180 int offset;
181 RandomModel * rm;
182 RandomModelDNA * rmd;
183
184
185 dpri = new_DPRunImpl_from_argv(&argc,argv);
186 if( dpri == NULL ) {
187 fatal("Unable to build DPRun implementation. Bad arguments");
188 }
189
190 gmp = new_GeneModelParam_from_argv(&argc,argv);
191
192 strip_out_boolean_def_argument(&argc,argv,"pretty",&show_pretty);
193
194 std_opt = new_StandardOutputOptions_from_argv(&argc,argv);
195 strip_out_boolean_def_argument(&argc,argv,"gw",&use_genewise);
196
197 strip_out_standard_options(&argc,argv,show_help,show_version);
198 if( argc != 4 ) {
199 show_help(stdout);
200 exit(12);
201 }
202
203
204 if((gm=GeneModel_from_GeneModelParam(gmp)) == NULL ) {
205 fatal("Could not build gene model");
206 }
207
208 ct= read_CodonTable_file("codon.table");
209
210
211 prot = read_fasta_file_Sequence(argv[1]);
212 cdna = read_fasta_file_Sequence(argv[2]);
213 genomic = read_fasta_file_Sequence(argv[3]);
214
215 if( prot == NULL || cdna == NULL || genomic == NULL ) {
216 fatal("need to have 3 sequences- protein guide, cdna and genomic");
217 }
218
219
220 offset = check_pep_cdna_synchrony(cdna,prot,ct);
221
222 rm = default_RandomModel();
223 rmd = RandomModelDNA_std();
224
225 mat = read_Blast_file_CompMat(matrix);
226
227 cm = flat_CodonMapper(ct);
228
229 sprinkle_errors_over_CodonMapper(cm,subs_rate);
230
231 protein = Protein_from_Sequence(prot);
232
233 tsm = ThreeStateModel_from_half_bit_Sequence(protein,mat,rm,-11,-1);
234
235 gw = GeneWise_from_ThreeStateModel(tsm,NULL,cm,0.0001,NULL);
236
237 cses = new_ComplexSequenceEvalSet_from_GeneModel(gm);
238 cseq = new_ComplexSequence(genomic,cses);
239
240 gp = std_GeneParser4(indel_rate,indel_rate*0.1);
241
242
243 gps = GeneParser4Score_from_GeneParser4(gp);
244
245 GeneWise_fold_in_RandomModelDNA(gw,rmd);
246
247 gws = GeneWiseScore_from_GeneWise(gw);
248
249 if( use_genewise ) {
250 pal = PackAln_bestmemory_GeneWise6(gws,cseq,gps,NULL,dpri);
251 alb = convert_PackAln_to_AlnBlock_GeneWise6(pal);
252 } else {
253 pal = PackAln_bestmemory_PseudoWise7(gws,cseq,gps,Probability2Score(indel_rate),NULL,dpri);
254 alb = convert_PackAln_to_AlnBlock_PseudoWise7(pal);
255 }
256
257
258 show_StandardOutputOptions(std_opt,alb,pal,"//",stdout);
259
260 pa = pseudo_gene_assessment(alb,
261 prot,
262 cdna,
263 genomic,
264 mat,ct,subs_rate,indel_rate);
265
266 printf("Synonymous : %d\n",pa.synonymous);
267 printf("Nonsynonymous : %d\n",pa.nonsynonymous);
268 printf("Ka/Ks : %.2f\n",(double)pa.nonsynonymous/(double)pa.synonymous);
269 printf("Unlikely : %d\n",pa.unlikely);
270 printf("Identical : %d\n",pa.identical);
271 printf("Stop : %d\n",pa.stop);
272 printf("Total codons : %d\n",pa.codon);
273 printf("Frameshift : %d\n",pa.frameshift);
274 printf("Intron : %d\n",pa.intron);
275
276 printf("//\n");
277
278 gen = Genomic_from_Sequence(genomic);
279 gr = new_GenomicRegion_from_GeneWise(gen,FALSE,alb);
280
281 show_pretty_GenomicRegion(gr,0,stdout);
282
283 printf("//\n");
284 if( show_pretty) {
285 protgene_ascii_display(alb,prot->seq,prot->name,prot->offset,gen,ct,15,50,FALSE, stdout);
286 }
287
288
289 return 0;
290
291 }
292