1 /* has to be before the others due to nasty namespace clashes */
2 #define WISE2_CROSS_HMMER2
3 #include "wise2xhmmer2.h"
4
5
6 #include "dyna.h"
7 #include "version.h"
8
9 char * program_name = "genewise";
10
11 /*
12 * program specific includes
13 */
14
15
16 #include "gwrap.h"
17 #include "genedisplay.h"
18 #include "matchsum.h"
19
20 #include "phasemodel.h"
21 #include "genephase6.h"
22
23
24 /*
25 * program specific variables
26 */
27
28 char * dna_seq_file = NULL;
29 Genomic * gen = NULL;
30 boolean is_embl = FALSE;
31
32 char * tstart_str = NULL;
33 int tstart = -1;
34
35 char * tend_str = NULL;
36 int tend = -1;
37
38 boolean reverse = FALSE;
39 boolean target_abs = FALSE;
40 boolean do_both = FALSE;
41
42 boolean use_tsm = FALSE;
43
44
45 char * protein_file = NULL;
46 Protein * pro = NULL;
47
48 char * hmm_file = NULL;
49 ThreeStateModel * tsm = NULL;
50 char * hmm_name = NULL;
51
52 char * potential_file = NULL;
53
54 char * qstart_str = NULL;
55 int qstart = -1;
56
57 char * qend_str = NULL;
58 int qend = -1;
59
60 char * gene_file = NULL;
61 GeneFrequency21 * gf = NULL;
62
63 char * new_gene_file = NULL;
64 GeneStats * gs = NULL;
65
66 GeneModelParam * gmp = NULL;
67
68 PhasedProteinPara * ppp = NULL;
69 PhasedProtein * pp = NULL;
70
71 boolean use_new_stats = 1;
72
73 char * matrix_file = NULL;
74 CompMat * mat = NULL;
75
76 char * gap_str = "12";
77 int gap = 12;
78
79 char * ext_str = "2";
80 int ext = 2;
81
82 char * length_of_N_str = "0";
83 int length_of_N = 0;
84
85 boolean flat_insert = TRUE;
86
87 char * codon_file = NULL;
88 CodonTable * ct = NULL;
89
90 char * output_file = "-";
91 FILE * ofp = NULL;
92
93 RandomModelDNA * rmd = NULL;
94
95 char * allN_string = "1.0";
96 Probability allN = 1.0;
97
98 char * subs_string = "0.000001";
99 double subs_error = 0.000001;
100
101 char * indel_string = "0.000001";
102 double indel_error = 0.000001;
103
104 char * cfreq_string = "flat";
105 boolean model_codon = FALSE;
106
107 char * splice_string = "model";
108 boolean model_splice = TRUE;
109
110 char * startend_string = "default";
111 TSM_StartEndMode startend = TSM_default;
112
113 char * null_string = "syn";
114 boolean use_syn = FALSE;
115
116 char * intron_string = "tied";
117 boolean use_tied_model = FALSE;
118
119 int alg = GWWRAP_623;
120 char * alg_str = NULL;
121
122 char * kbyte_str = NULL;
123 int kbyte = 10000; /* will be reset in build_defaults */
124
125 char * pal_file = NULL;
126
127 boolean show_PackAln = FALSE;
128 boolean show_cumlative_PackAln = FALSE;
129 boolean show_AlnBlock = FALSE;
130 boolean show_ace = FALSE;
131 boolean show_gff = FALSE;
132 boolean show_trans = FALSE;
133 boolean show_pep = FALSE;
134 boolean show_cdna = FALSE;
135 boolean show_pretty = FALSE;
136 boolean show_pretty_gene = FALSE;
137 boolean show_supp_gene = FALSE;
138 boolean show_gene_plain = FALSE;
139 boolean show_match_sum = FALSE;
140 boolean show_para = FALSE;
141 boolean show_overlap = FALSE;
142 boolean show_embl = FALSE;
143 boolean show_diana = FALSE;
144
145 boolean pseudo = FALSE;
146
147 char * main_block_str = "50";
148 int main_block = 50;
149
150 char * divide_str = "//";
151
152 Probability rnd_loop = 0.99;
153 Probability cds_loop = 0.97;
154 Probability rnd_to_model = (1 - 0.99) / 3;
155 Probability link_loop = 0.98;
156 Probability link_to_model = (1- 0.98) / 3;
157
158 AlnBlock * alb;
159 PackAln * pal;
160
161 GenomicRegion * gr;
162 GenomicRegion * embl;
163
164 MatchSummarySet * mss;
165
166 RandomModel * rm;
167
168 DPRunImpl * dpri;
169
170 GeneWiseRunPara * gwrp;
171
show_parameters(void)172 void show_parameters(void)
173 {
174 fprintf(ofp,"%s %s (%s release)\n",program_name,VERSION_NUMBER,RELEASE_DAY);
175 fprintf(ofp,"This program is freely distributed under a GPL. See source directory\n");
176 fprintf(ofp,"Copyright (c) GRL limited: portions of the code are from separate copyright\n\n");
177 if( use_tsm == FALSE) {
178 fprintf(ofp,"Query protein: %s\n",pro->baseseq->name);
179 fprintf(ofp,"Comp Matrix: %s\n",matrix_file);
180 fprintf(ofp,"Gap open: %d\n",gap);
181 fprintf(ofp,"Gap extension: %d\n",ext);
182 }
183 else
184 fprintf(ofp,"Query model: %s\n",tsm->name);
185 fprintf(ofp,"Start/End %s\n",startend_string);
186 fprintf(ofp,"Target Sequence %s\n",gen->baseseq->name);
187 fprintf(ofp,"Strand: %s\n",do_both == TRUE ? "both" : reverse == TRUE ? "reverse" : "forward");
188 fprintf(ofp,"Start/End (protein) %s\n",startend_string);
189 show_info_GeneModelParam(gmp,ofp);
190
191 fprintf(ofp,"Codon Table: %s\n",codon_file);
192 fprintf(ofp,"Subs error: %2.2g\n",subs_error);
193 fprintf(ofp,"Indel error: %2.2g\n",indel_error);
194 fprintf(ofp,"Null model %s\n",null_string);
195 fprintf(ofp,"Algorithm %s\n",alg_str);
196 }
197
show_pretty_aln(void)198 boolean show_pretty_aln(void)
199 {
200 Protein * ps;
201
202 fprintf(ofp,"\n%s output\nScore %4.2f bits over entire alignment\n",program_name,Score2Bits(pal->score));
203
204
205 if( alg == GWWRAP_2193L || alg == GWWRAP_2193) {
206 fprintf(ofp,"Entrie alignment score contains unseen 'random' score segments\nYou should only use the per-alignments score printed below\nfor the bits score of the alignment\n\n");
207 }
208
209 if( use_syn == FALSE ) {
210 fprintf(ofp,"Scores as bits over a flat simple random model\n\n");
211 } else {
212 fprintf(ofp,"Scores as bits over a synchronous coding model\n\n");
213 }
214
215
216
217 if( use_tsm == FALSE ) {
218 fprintf(ofp,"Warning: The bits scores is not probablistically correct for single seqs\nSee WWW help for more info\n\n");
219
220 protgene_ascii_display(alb,pro->baseseq->seq,pro->baseseq->name,pro->baseseq->offset,gen,ct,15,main_block,(alg == GWWRAP_623L || alg == GWWRAP_2193L || alg == GWWRAP_2193) ? TRUE : FALSE, ofp);
221 } else {
222 ps = pseudo_Protein_from_ThreeStateModel(tsm);
223 protgene_ascii_display(alb,ps->baseseq->seq,ps->baseseq->name,ps->baseseq->offset,gen,ct,15,main_block,(alg == GWWRAP_623L || alg == GWWRAP_2193L || alg == GWWRAP_2193) ? TRUE : FALSE,ofp);
224 free_Protein(ps);
225 }
226 fprintf(ofp,"%s\n",divide_str);
227
228 return TRUE;
229 }
230
231
show_output(void)232 boolean show_output(void)
233 {
234 int i;
235
236 cDNA * cdna;
237 Protein * trans;
238 GenomicOverlapResults * gor;
239 AlnColumn * alt;
240
241
242
243
244 if( show_pretty == TRUE ) {
245 show_pretty_aln();
246 }
247
248 if( show_match_sum == TRUE ) {
249 show_MatchSummary_genewise_header(ofp);
250 show_MatchSummarySet_genewise(mss,ofp);
251 fprintf(ofp,"%s\n",divide_str);
252 }
253
254 if( show_pretty_gene == TRUE ) {
255 show_pretty_GenomicRegion(gr,0,ofp);
256 fprintf(ofp,"%s\n",divide_str);
257 }
258
259 if( show_supp_gene == TRUE ) {
260 show_pretty_GenomicRegion(gr,1,ofp);
261 fprintf(ofp,"%s\n",divide_str);
262 }
263
264 if( show_embl == TRUE ) {
265 write_Embl_FT_GenomicRegion(gr,ofp);
266 fprintf(ofp,"%s\n",divide_str);
267 }
268
269 if( show_diana == TRUE ) {
270 write_Diana_FT_GenomicRegion(gr,ofp);
271 fprintf(ofp,"%s\n",divide_str);
272 }
273
274 if( show_overlap == TRUE ) {
275 gor = Genomic_overlap(gr,embl);
276 show_GenomicOverlapResults(gor,ofp);
277 fprintf(ofp,"%s\n",divide_str);
278 }
279
280
281 if( show_trans == TRUE ) {
282 for(i=0;i<gr->len;i++) {
283 if( gr->gene[i]->ispseudo == TRUE ) {
284 fprintf(ofp,"#Gene %d is a pseudo gene - no translation possible\n",i);
285 } else {
286 trans = get_Protein_from_Translation(gr->gene[i]->transcript[0]->translation[0],ct);
287 write_fasta_Sequence(trans->baseseq,ofp);
288 }
289 }
290 fprintf(ofp,"%s\n",divide_str);
291 }
292
293 if( show_pep == TRUE ) {
294 alt = alb->start;
295 for(;alt != NULL;) {
296 trans = Protein_from_GeneWise_AlnColumn(gen->baseseq,alt,1,&alt,ct,is_random_AlnColumn_genewise);
297 if ( trans == NULL )
298 break;
299 write_fasta_Sequence(trans->baseseq,ofp);
300 free_Protein(trans);
301 }
302 fprintf(ofp,"%s\n",divide_str);
303 }
304
305 if( show_cdna == TRUE ) {
306 for(i=0;i<gr->len;i++) {
307 cdna = get_cDNA_from_Transcript(gr->gene[i]->transcript[0]);
308 write_fasta_Sequence(cdna->baseseq,ofp);
309 }
310 fprintf(ofp,"%s\n",divide_str);
311 }
312
313
314 if( show_ace == TRUE ) {
315 show_ace_GenomicRegion(gr,gen->baseseq->name,ofp);
316 fprintf(ofp,"%s\n",divide_str);
317 }
318
319 if( show_gff == TRUE ) {
320 show_GFF_GenomicRegion(gr,gen->baseseq->name,"GeneWise",ofp);
321 fprintf(ofp,"%s\n",divide_str);
322 }
323
324 if( show_gene_plain == TRUE ) {
325 show_GenomicRegion(gr,ofp);
326 fprintf(ofp,"%s\n",divide_str);
327 }
328
329 if( show_AlnBlock == TRUE ) {
330 mapped_ascii_AlnBlock(alb,Score2Bits,0,ofp);
331 fprintf(ofp,"%s\n",divide_str);
332 }
333
334 if( show_cumlative_PackAln == TRUE ) {
335 show_bits_and_cumlative_PackAln(pal,ofp);
336 fprintf(ofp,"%s\n",divide_str);
337 }
338
339 if( show_PackAln == TRUE ) {
340 show_simple_PackAln(pal,ofp);
341 fprintf(ofp,"%s\n",divide_str);
342 }
343
344
345
346 return TRUE;
347 }
348
build_alignment(void)349 boolean build_alignment(void)
350 {
351 GeneParameter21 * gpara;
352 GeneModel * gm;
353
354 if( pal_file != NULL ) {
355 pal = read_simple_PackAln_file(pal_file);
356 }
357
358
359 if( use_new_stats == 0 ) {
360 gpara = GeneParameter21_wrap(gf,subs_error,indel_error,rmd,model_codon,model_splice,use_tied_model,ct,rnd_loop,cds_loop,rnd_to_model,link_loop,link_to_model);
361 } else {
362 gm = GeneModel_from_GeneStats(gs,gmp);
363
364 gpara= GeneParameter21_from_GeneModel(gm,ct,rnd_loop,cds_loop,rnd_to_model,link_loop,link_to_model,subs_error,indel_error);
365 }
366
367
368 if( gpara == NULL ) {
369 warn("Sorry - could not build gene parameters. Must be a bug of some sort");
370 return FALSE;
371 }
372
373
374 /* phased based proteins use a different source */
375
376 if( alg == GWWRAP_623P ) {
377 alb = AlnBlock_from_phased_protein_wrap(pro,tsm,gen,gpara->cm,rm,mat,ppp,gpara,dpri,&pal);
378 } else {
379 if( use_tsm == FALSE) {
380 alb = AlnBlock_from_protein_genewise_wrap(pro,gen,mat,-gap,-ext,gpara,rmd,rmd,alg,use_syn,rm,allN,startend,dpri,&pal,gwrp);
381 } else {
382 alb = AlnBlock_from_TSM_genewise_wrap(tsm,gen,gpara,rmd,rmd,use_syn,alg,allN,flat_insert,dpri,&pal,NULL);
383 }
384 }
385
386 free_GeneParameter21(gpara);
387
388
389
390 if( alb == NULL )
391 return FALSE;
392
393
394 gr = new_GenomicRegion(gen);
395
396
397 add_Genes_to_GenomicRegion_GeneWise(gr,gen->baseseq->offset,gen->baseseq->end,alb,use_tsm == TRUE ? tsm->name : pro->baseseq->name,pseudo,NULL);
398
399 if( use_tsm == TRUE)
400 mss = MatchSummarySet_from_AlnBlock_genewise(alb,tsm->name,1,gen->baseseq);
401 else
402 mss = MatchSummarySet_from_AlnBlock_genewise(alb,pro->baseseq->name,pro->baseseq->offset,gen->baseseq);
403
404 return TRUE;
405 }
406
free_temporary_objects(void)407 boolean free_temporary_objects(void)
408 {
409 alb = free_AlnBlock(alb);
410 pal = free_PackAln(pal);
411 mss = free_MatchSummarySet(mss);
412 gr = free_GenomicRegion(gr);
413
414 return TRUE;
415 }
416
free_io_objects(void)417 boolean free_io_objects(void)
418 {
419 if( use_tsm == TRUE) {
420 free_ThreeStateModel(tsm);
421 } else {
422 free_Protein(pro);
423 }
424
425 free_CodonTable(ct);
426 if( gf != NULL ) {
427 free_GeneFrequency21(gf);
428 }
429 free_RandomModelDNA(rmd);
430 if( is_embl ) {
431 free_GenomicRegion(embl);
432 }
433 free_Genomic(gen);
434
435
436 return TRUE;
437 }
438
439
440
build_objects(void)441 boolean build_objects(void)
442 {
443 boolean ret = TRUE;
444 Protein * pro_temp;
445 Genomic * gen_temp;
446 FILE * ifp;
447
448
449
450
451
452 startend = threestatemodel_mode_from_string(startend_string);
453 if( startend == TSM_unknown ) {
454 warn("String %s was unable to converted into a start/end policy\n",startend_string);
455 ret = FALSE;
456 }
457
458
459 if( tstart_str != NULL ) {
460 if( is_integer_string(tstart_str,&tstart) == FALSE || tstart < 0) {
461 warn("Could not make %s out as target start",tstart);
462 ret = FALSE;
463 }
464 }
465
466 if( tend_str != NULL ) {
467 if( is_integer_string(tend_str,&tend) == FALSE || tend < 0) {
468 warn("Could not make %s out as target end",tend);
469 ret = FALSE;
470 }
471 }
472
473 if( is_integer_string(gap_str,&gap) == FALSE ) {
474 warn("Could not make %s out as gap penalty (must be integer at the moment)",gap_str);
475 ret = FALSE;
476 }
477
478
479 if( is_integer_string(ext_str,&ext) == FALSE ) {
480 warn("Could not make %s out as gap penalty (must be integer at the moment)",ext_str);
481 ret = FALSE;
482 }
483
484 if( is_embl == FALSE ) {
485 if( (gen = read_fasta_file_Genomic(dna_seq_file,length_of_N)) == NULL ) {
486 ret = FALSE;
487 warn("Could not read genomic sequence in %s",dna_seq_file);
488 gen = NULL;
489 }
490 } else {
491 embl = read_EMBL_GenomicRegion_file(dna_seq_file);
492 if( embl == NULL ) {
493 warn("Could not read genomic EMBL file in %s",dna_seq_file);
494 gen = NULL;
495 ret = FALSE;
496 } else {
497 gen = hard_link_Genomic(embl->genomic);
498
499 }
500 }
501
502 if( gen != NULL ) {
503
504 if( tstart != -1 || tend != -1 ) {
505 if( tstart == -1 )
506 tstart = 0;
507 if( tend == -1 )
508 tend = gen->baseseq->len;
509 gen_temp = truncate_Genomic(gen,tstart-1,tend);
510 if( gen_temp == NULL ){
511 ret = FALSE;
512 } else {
513 free_Genomic(gen);
514 gen = gen_temp;
515 }
516 } else {
517 /* no truncation required */
518 }
519
520
521 if( reverse == TRUE ) {
522 if( tstart > tend ) {
523 warn("You have already reversed the DNA by using %d - %d truncation. Re-reversing",tstart,tend);
524 }
525
526 gen_temp = reverse_complement_Genomic(gen);
527 free_Genomic(gen);
528 gen = gen_temp;
529 }
530 }
531
532 /*
533 * Can't truncate on GenomicRegion (for good reasons!).
534 * but we want only a section of the EMBL file to be used
535 *
536 * So... swap genomic now. Positions in EMBL are still valid,
537 * however - some genes will loose their sequence, which will be damaging. ;)
538 */
539
540
541 if( is_embl ) {
542 free_Genomic(embl->genomic);
543 embl->genomic = hard_link_Genomic(gen); /* pointer could be dead anyway ;) */
544 }
545
546
547 if( target_abs == TRUE ) {
548 if( is_embl == TRUE ) {
549 warn("Sorry you can't both use absolute positioning and EMBL files as I can't cope with all the coordinate remapping. You'll have to convert to fasta.");
550 ret = FALSE;
551 }
552
553 gen->baseseq->offset = 1;
554 gen->baseseq->end = strlen(gen->baseseq->seq);
555 }
556
557 if( alg_str != NULL ) {
558 alg = gwrap_alg_type_from_string(alg_str);
559 } else {
560 if( use_tsm == TRUE ) {
561 alg_str = "623L";
562 } else {
563 alg_str = "623";
564 }
565 alg = gwrap_alg_type_from_string(alg_str);
566 }
567
568
569 if( qstart_str != NULL ) {
570 if( is_integer_string(qstart_str,&qstart) == FALSE || qstart < 0) {
571 warn("Could not make %s out as query start",qstart);
572 ret = FALSE;
573 }
574 }
575
576 if( qend_str != NULL ) {
577 if( is_integer_string(qend_str,&qend) == FALSE || qend < 0) {
578 warn("Could not make %s out as query end",qend);
579 ret = FALSE;
580 }
581 }
582
583
584 if( use_tsm == FALSE ) {
585 if( startend != TSM_default && startend != TSM_global && startend != TSM_local && startend != TSM_endbiased) {
586 warn("Proteins can only have local/global/endbias startend policies set, not %s",startend_string);
587 ret = FALSE;
588 }
589 if( (pro = read_fasta_file_Protein(protein_file)) == NULL ) {
590 ret = FALSE;
591 warn("Could not read Protein sequence in %s",protein_file);
592 } else {
593
594 if( qstart != -1 || qend != -1 ) {
595 if( qstart == -1 )
596 qstart = 0;
597 if( qend == -1 )
598 qend = pro->baseseq->len;
599
600 pro_temp = truncate_Protein(pro,qstart-1,qend);
601 if( pro_temp == NULL ){
602 ret = FALSE;
603 } else {
604 free_Protein(pro);
605 pro = pro_temp;
606 }
607 }
608 }
609 } else {
610 /** using a HMM **/
611
612 /*tsm = read_HMMer_1_7_ascii_file(hmm_file);*/
613 /*tsm = Wise2_read_ThreeStateModel_from_hmmer1_file(hmm_file);*/
614 tsm = HMMer2_read_ThreeStateModel(hmm_file);
615
616
617 if( tsm == NULL ) {
618 warn("Could not read hmm from %s\n",hmm_file);
619 ret = FALSE;
620 } else {
621
622 display_char_in_ThreeStateModel(tsm);
623 if( hmm_name != NULL ) {
624 if( tsm->name != NULL )
625 ckfree(tsm->name);
626 tsm->name = stringalloc(hmm_name);
627 }
628
629 if( tsm == NULL ) {
630 warn("Could not read %s as a hmm",hmm_file);
631 }
632
633 /** have to set start/end **/
634 set_startend_policy_ThreeStateModel(tsm,startend,30,0.1);
635
636 }
637 } /* end of else tsm != NULL */
638
639
640
641 if( main_block_str != NULL ) {
642 if( is_integer_string(main_block_str,&main_block) == FALSE ) {
643 warn("Could not get maximum main_block number %s",main_block_str);
644 ret = FALSE;
645 }
646 }
647
648
649
650 if( is_double_string(subs_string,&subs_error) == FALSE ) {
651 warn("Could not convert %s to a double",subs_error);
652 ret = FALSE;
653 }
654
655 if( is_double_string(indel_string,&indel_error) == FALSE ) {
656 warn("Could not convert %s to a double",indel_error);
657 ret = FALSE;
658 }
659
660 if( is_double_string(allN_string,&allN) == FALSE ) {
661 warn("Could not convert %s to a double",allN_string);
662 ret = FALSE;
663 }
664
665
666 if( strcmp(cfreq_string,"model") == 0 ) {
667 model_codon = TRUE;
668 } else if ( strcmp(cfreq_string,"flat") == 0 ) {
669 model_codon = FALSE;
670 } else {
671 warn("Cannot interpret [%s] as a codon modelling parameter\n",cfreq_string);
672 ret = FALSE;
673 }
674
675
676 if( strcmp(splice_string,"model") == 0 ) {
677 model_splice = TRUE;
678 } else if ( strcmp(splice_string,"flat") == 0 ) {
679 model_splice = FALSE;
680 gmp->use_gtag_splice = TRUE;
681 } else {
682 warn("Cannot interpret [%s] as a splice modelling parameter\n",splice_string);
683 ret = FALSE;
684 }
685
686 if( strcmp(null_string,"syn") == 0 ) {
687 use_syn = TRUE;
688 } else if ( strcmp(null_string,"flat") == 0 ) {
689 use_syn = FALSE;
690 } else {
691 warn("Cannot interpret [%s] as a null model string\n",null_string);
692 ret = FALSE;
693 }
694
695 if( strcmp(intron_string,"model") == 0 ) {
696 use_tied_model = FALSE;
697 } else if ( strcmp(intron_string,"tied") == 0 ) {
698 use_tied_model = TRUE;
699 } else {
700 warn("Cannot interpret [%s] as a intron tieing switch\n",intron_string);
701 ret = FALSE;
702 }
703
704
705
706 if( (rm = default_RandomModel()) == NULL) {
707 warn("Could not make default random model\n");
708 ret = FALSE;
709 }
710
711 if( use_new_stats == 0 ) {
712 if( (gf = read_GeneFrequency21_file(gene_file)) == NULL) {
713 ret = FALSE;
714 warn("Could not read a GeneFrequency file in %s",gene_file);
715 }
716 } else {
717 if( (gs = GeneStats_from_GeneModelParam(gmp)) == NULL ){
718 ret=FALSE;
719 warn("Could not read gene statistics in %s",new_gene_file);
720 }
721 } /* end of else using new gene stats */
722
723
724 if( (mat = read_Blast_file_CompMat(matrix_file)) == NULL) {
725 if( use_tsm == TRUE ) {
726 info("I could not read the Comparison matrix file in %s; however, you are using a HMM so it is not needed. Please set the WISECONFIGDIR or WISEPERSONALDIR variable correctly to prevent this message.",matrix_file);
727 } else {
728 warn("Could not read Comparison matrix file in %s",matrix_file);
729 ret = FALSE;
730 }
731 }
732
733 if( (ct = read_CodonTable_file(codon_file)) == NULL) {
734 ret = FALSE;
735 warn("Could not read codon table file in %s",codon_file);
736 }
737
738 if( (ofp = openfile(output_file,"W")) == NULL) {
739 warn("Could not open %s as an output file",output_file);
740 ret = FALSE;
741 }
742
743 rmd = RandomModelDNA_std();
744 return ret;
745
746 }
747
build_defaults(void)748 void build_defaults(void)
749 {
750 gene_file = "human.gf";
751 new_gene_file = "gene.stat";
752 codon_file = "codon.table";
753 matrix_file = "BLOSUM62.bla";
754
755
756 }
757
reverse_target(void)758 void reverse_target(void)
759 {
760 Genomic * gen_temp;
761
762 gen_temp = reverse_complement_Genomic(gen);
763
764 free_temporary_objects();
765
766 free_Genomic(gen);
767
768 gen = gen_temp;
769
770 }
771
show_short_help(void)772 void show_short_help(void)
773 {
774 fprintf(stdout,"%s (%s)\n",program_name,VERSION_NUMBER);
775 fprintf(stdout,"This program is freely distributed under a GPL. See -version for more info\n");
776 fprintf(stdout,"Copyright (c) GRL limited: portions of the code are from separate copyrights\n\n");
777 fprintf(stdout,"genewise <protein-file> <dna-file> in fasta format\n");
778 fprintf(stdout," Options. In any order, '-' as filename (for any input) means stdin\n");
779 fprintf(stdout,"\nFor more help go %s -help.\n",program_name);
780 exit(63);
781 }
782
show_help(FILE * ofp)783 void show_help(FILE * ofp)
784 {
785 fprintf(ofp,"%s (%s)\n",program_name,VERSION_NUMBER);
786 fprintf(ofp,"genewise <protein-file> <dna-file> in fasta format\n");
787 /* program specific help */
788 fprintf(ofp,"Dna sequence options\n");
789 fprintf(ofp," -u start position in dna\n");
790 fprintf(ofp," -v end position in dna\n");
791 fprintf(ofp," -trev Compare on the reverse strand\n");
792 fprintf(ofp," -tfor [default] Compare on the forward strand\n");
793 fprintf(ofp," -both Both strands\n");
794 fprintf(ofp," -tabs Report positions as absolute to truncated/reverse sequence\n");
795 fprintf(ofp," -fembl File is an EMBL file native format\n");
796 fprintf(ofp,"Protein comparison options\n");
797 fprintf(ofp," -s start position in protein\n");
798 fprintf(ofp," -t end position in protein\n");
799 fprintf(ofp," -[g]ap [%3d] gap penalty\n",gap);
800 fprintf(ofp," -[e]xt [%3d] extension penalty\n",ext);
801 fprintf(ofp," -[m]atrix [%s] Comparison matrix\n",matrix_file);
802 fprintf(ofp,"HMM options\n");
803 fprintf(ofp," -hmmer Protein file is HMMer file (version 2 compatible)\n");
804 fprintf(ofp," -hname Use this as the name of the HMM, not filename\n");
805
806 show_help_GeneModelParam(ofp);
807 fprintf(ofp," -splice [model/flat] [LEGACY only for -splice flat. use -splice_gtag]\n");
808
809 show_help_PhasedProteinPara(ofp);
810
811 fprintf(ofp,"Other model options\n");
812 fprintf(ofp," -[no]newgene use new gene stats (default), no for reverting to old system\n");
813 fprintf(ofp," -init [%s] [default/global/local/wing/endbias] startend policy for the HMM/protein\n",startend_string);
814 fprintf(ofp," -codon [%s] Codon file\n",codon_file);
815 fprintf(ofp," -subs [%2.2g] Substitution error rate\n",subs_error);
816 fprintf(ofp," -indel [%2.2g] Insertion/deletion error rate\n",indel_error);
817 fprintf(ofp," -null [syn/flat] Random Model as synchronous or flat [default syn]\n");
818 fprintf(ofp," -alln [%s] Probability of matching a NNN codon\n",allN_string);
819 fprintf(ofp," -insert [model/flat] Use protein insert model [default flat]\n");
820 fprintf(ofp,"Algorithm options\n");
821 fprintf(ofp," -alg [623/623L/623S/623P/2193/2193L] Algorithm used [default 623/623L]\n");
822 fprintf(ofp," (normally use 623 for proteins, 623L for HMMs and 623P for Phased Proteins)\n");
823 fprintf(ofp,"Output options [default -pretty -para]\n");
824 fprintf(ofp," -pretty show pretty ascii output\n");
825 fprintf(ofp," -pseudo Mark genes with frameshifts as pseudogenes\n");
826 fprintf(ofp," -genes show gene structure\n");
827 fprintf(ofp," -genesf show gene structure with supporting evidence\n");
828 fprintf(ofp," -embl show EMBL feature format with CDS key\n");
829 fprintf(ofp," -diana show EMBL feature format with misc_feature key (for diana)\n");
830 fprintf(ofp," -para show parameters\n");
831 fprintf(ofp," -sum show summary output\n");
832
833 /* lets not bring trouble on ourselves ;) */
834 /* fprintf(ofp," -over show EMBL overlap (only with EMBL format)\n"); */
835
836 fprintf(ofp," -cdna show cDNA\n");
837 fprintf(ofp," -trans show protein translation, breaking at frameshifts\n");
838 fprintf(ofp," -pep show protein translation, splicing frameshifts\n");
839 fprintf(ofp," -ace ace file gene structure\n");
840 fprintf(ofp," -gff Gene Feature Format file\n");
841 fprintf(ofp," -gener raw gene structure\n");
842 fprintf(ofp," -alb show logical AlnBlock alignment\n");
843 fprintf(ofp," -pal show raw matrix alignment\n");
844 fprintf(ofp," -block [%s] Length of main block in pretty output\n",main_block_str);
845 fprintf(ofp," -divide [%s] divide string for multiple outputs\n",divide_str);
846
847 show_help_GeneWiseRunPara(ofp);
848
849
850 show_help_DPRunImpl(ofp);
851 show_standard_options(ofp);
852
853 exit(63);
854 }
855
show_version(FILE * ofp)856 void show_version(FILE * ofp)
857 {
858 fprintf(ofp,"%s\nVersion: %s\nReleased: %s\nCompiled: %s\n",program_name,VERSION_NUMBER,RELEASE_DAY,COMPILE_DATE);
859 fprintf(ofp,"\nThis program is freely distributed under a Gnu Public License\n");
860 fprintf(ofp,"The source code is copyright (c) GRL 1998 and others\n");
861 fprintf(ofp,"There is no warranty, implied or otherwise on the performance of this program\n");
862 fprintf(ofp,"For more information read the GNULICENSE file in the distribution\n\n");
863 fprintf(ofp,"Credits: Ewan Birney <birney@sanger.ac.uk> wrote the core code.\n");
864 fprintf(ofp," Portions of this code was from HMMer2, written by Sean Eddy\n");
865 exit(63);
866 }
867
868
main(int argc,char ** argv)869 int main(int argc,char ** argv)
870 {
871 int i;
872 char * temp;
873
874
875 build_defaults();
876
877 strip_out_standard_options(&argc,argv,show_help,show_version);
878
879 potential_file = strip_out_assigned_argument(&argc,argv,"pg");
880
881 pal_file = strip_out_assigned_argument(&argc,argv,"pal_file");
882
883 if( (temp = strip_out_assigned_argument(&argc,argv,"gap")) != NULL )
884 gap_str = temp;
885
886 if( (temp = strip_out_assigned_argument(&argc,argv,"g")) != NULL )
887 gap_str = temp;
888
889 if( (temp = strip_out_assigned_argument(&argc,argv,"ext")) != NULL )
890 ext_str = temp;
891
892 if( (temp = strip_out_assigned_argument(&argc,argv,"e")) != NULL )
893 ext_str = temp;
894
895 if( (temp = strip_out_assigned_argument(&argc,argv,"matrix")) != NULL )
896 matrix_file = temp;
897
898 if( (temp = strip_out_assigned_argument(&argc,argv,"m")) != NULL )
899 matrix_file = temp;
900
901 if( (temp = strip_out_assigned_argument(&argc,argv,"s")) != NULL )
902 qstart_str = temp;
903
904 if( (temp = strip_out_assigned_argument(&argc,argv,"t")) != NULL )
905 qend_str = temp;
906
907 if( (temp = strip_out_assigned_argument(&argc,argv,"u")) != NULL )
908 tstart_str = temp;
909
910 if( (temp = strip_out_assigned_argument(&argc,argv,"v")) != NULL )
911 tend_str = temp;
912
913 if( (strip_out_boolean_argument(&argc,argv,"trev")) == TRUE )
914 reverse = TRUE;
915
916 if( (strip_out_boolean_argument(&argc,argv,"[no]newgene")) == TRUE )
917 use_new_stats = TRUE;
918
919 if( (strip_out_boolean_argument(&argc,argv,"tfor")) == TRUE ){
920 if( reverse == TRUE ) {
921 warn("You have specified both trev and tfor. Treating as both");
922 do_both = TRUE;
923 reverse = FALSE;
924 } else {
925 reverse = FALSE;
926 }
927 }
928
929 if( (temp = strip_out_assigned_argument(&argc,argv,"insert")) != NULL ) {
930 if( strcmp(temp,"flat") == 0 ) {
931 flat_insert = TRUE;
932 } else {
933 flat_insert = FALSE;
934 }
935 }
936
937 if( (strip_out_boolean_argument(&argc,argv,"both")) == TRUE )
938 do_both = TRUE;
939
940
941 if( (strip_out_boolean_argument(&argc,argv,"fembl")) == TRUE )
942 is_embl = TRUE;
943
944 if( (strip_out_boolean_argument(&argc,argv,"tabs")) == TRUE )
945 target_abs = TRUE;
946
947 pseudo = strip_out_boolean_argument(&argc,argv,"pseudo");
948
949 if( (temp = strip_out_assigned_argument(&argc,argv,"codon")) != NULL )
950 codon_file = temp;
951
952 if( (temp = strip_out_assigned_argument(&argc,argv,"gene")) != NULL )
953 gene_file = temp;
954
955 if( (temp = strip_out_assigned_argument(&argc,argv,"alg")) != NULL )
956 alg_str = temp;
957
958 if( (temp = strip_out_assigned_argument(&argc,argv,"kbyte")) != NULL )
959 kbyte_str = temp;
960
961 if( (temp = strip_out_assigned_argument(&argc,argv,"subs")) != NULL )
962 subs_string = temp;
963
964 if( (temp = strip_out_assigned_argument(&argc,argv,"indel")) != NULL )
965 indel_string = temp;
966
967 if( (temp = strip_out_assigned_argument(&argc,argv,"cfreq")) != NULL )
968 cfreq_string = temp;
969
970 if( (temp = strip_out_assigned_argument(&argc,argv,"splice")) != NULL ) {
971 warn("deprecated command line option -splice. use -splice_gtag now");
972 splice_string = temp;
973 }
974
975 if( (temp = strip_out_assigned_argument(&argc,argv,"init")) != NULL )
976 startend_string = temp;
977
978 if( (temp = strip_out_assigned_argument(&argc,argv,"null")) != NULL )
979 null_string = temp;
980
981 if( (temp = strip_out_assigned_argument(&argc,argv,"intron")) != NULL )
982 intron_string = temp;
983
984 if( (temp = strip_out_assigned_argument(&argc,argv,"alln")) != NULL )
985 allN_string = temp;
986
987 if( (strip_out_boolean_argument(&argc,argv,"hmmer")) == TRUE )
988 use_tsm = TRUE;
989
990 if( (strip_out_boolean_argument(&argc,argv,"intie")) == TRUE )
991 use_tied_model = TRUE;
992
993 if( (temp = strip_out_assigned_argument(&argc,argv,"hname")) != NULL )
994 hmm_name = temp;
995
996
997 if( (strip_out_boolean_argument(&argc,argv,"pretty")) != FALSE )
998 show_pretty = TRUE;
999
1000 if( (strip_out_boolean_argument(&argc,argv,"gff")) != FALSE )
1001 show_gff = TRUE;
1002
1003 if( (strip_out_boolean_argument(&argc,argv,"diana")) != FALSE )
1004 show_diana = TRUE;
1005
1006 if( (strip_out_boolean_argument(&argc,argv,"embl")) != FALSE )
1007 show_embl = TRUE;
1008
1009
1010 if( (strip_out_boolean_argument(&argc,argv,"genes")) != FALSE )
1011 show_pretty_gene = TRUE;
1012
1013 if( (strip_out_boolean_argument(&argc,argv,"genesf")) != FALSE )
1014 show_supp_gene = TRUE;
1015
1016 if( (strip_out_boolean_argument(&argc,argv,"para")) != FALSE )
1017 show_para = TRUE;
1018
1019 if( (strip_out_boolean_argument(&argc,argv,"trans")) != FALSE )
1020 show_trans = TRUE;
1021
1022 if( (strip_out_boolean_argument(&argc,argv,"pep")) != FALSE )
1023 show_pep = TRUE;
1024
1025 if( (strip_out_boolean_argument(&argc,argv,"cdna")) != FALSE )
1026 show_cdna = TRUE;
1027
1028 if( (strip_out_boolean_argument(&argc,argv,"sum")) != FALSE )
1029 show_match_sum = TRUE;
1030
1031 if( (strip_out_boolean_argument(&argc,argv,"alb")) != FALSE )
1032 show_AlnBlock = TRUE;
1033
1034 if( (strip_out_boolean_argument(&argc,argv,"ace")) != FALSE )
1035 show_ace = TRUE;
1036
1037 if( (strip_out_boolean_argument(&argc,argv,"pal")) != FALSE )
1038 show_PackAln = TRUE;
1039
1040 if( (strip_out_boolean_argument(&argc,argv,"gener")) != FALSE )
1041 show_gene_plain = TRUE;
1042
1043 if( (strip_out_boolean_argument(&argc,argv,"over")) != FALSE )
1044 show_overlap = TRUE;
1045
1046 if( (temp = strip_out_assigned_argument(&argc,argv,"divide")) != NULL )
1047 divide_str = temp;
1048
1049 if( (temp = strip_out_assigned_argument(&argc,argv,"block")) != NULL )
1050 main_block_str = temp;
1051
1052 dpri = new_DPRunImpl_from_argv(&argc,argv);
1053
1054 gmp = new_GeneModelParam_from_argv(&argc,argv);
1055
1056 ppp = new_PhasedProteinPara_from_argv(&argc,argv);
1057
1058 gwrp = new_GeneWiseRunPara_from_argv(&argc,argv);
1059
1060 strip_out_remaining_options_with_warning(&argc,argv);
1061
1062
1063
1064 if( argc != 3 ) {
1065 warn("Wrong number of arguments (expect 2)!\n");
1066 if( argc > 1 ){
1067 warn("Arg line looked like (after option processing)");
1068 for(i=1;i<argc;i++) {
1069 fprintf(stderr," %s\n",argv[i]);
1070 }
1071 }
1072
1073 show_short_help();
1074 }
1075
1076 if( show_embl == FALSE && show_diana == FALSE && show_gff == FALSE && show_overlap == FALSE && show_pretty_gene == FALSE && show_match_sum == FALSE && show_ace == FALSE && show_gene_plain == FALSE && show_pretty == FALSE && show_AlnBlock == FALSE && show_PackAln == FALSE && show_pep == FALSE ) {
1077 show_pretty = TRUE;
1078 show_para = TRUE;
1079 }
1080
1081 dna_seq_file = argv[2];
1082 if( use_tsm == FALSE)
1083 protein_file = argv[1];
1084 else
1085 hmm_file = argv[1];
1086
1087
1088 if( build_objects() == FALSE)
1089 fatal("Could not build objects!");
1090
1091 if( show_para == TRUE) {
1092 show_parameters();
1093 }
1094
1095 if( build_alignment() == FALSE)
1096 fatal("Could not build alignment!");
1097
1098 if( show_output() == FALSE)
1099 fatal("Could not show alignment. Sorry!");
1100
1101 if( do_both == TRUE) {
1102 reverse_target();
1103
1104 if( build_alignment() == FALSE)
1105 fatal("Could not build alignment!");
1106
1107 if( show_output() == FALSE)
1108 fatal("Could not show alignment. Sorry!");
1109 }
1110
1111 free_temporary_objects();
1112 free_io_objects();
1113 return 0;
1114 }
1115
1116
1117
1118
1119