1 /* has to be before the others due to nasty namespace clashes */
2 #define WISE2_CROSS_HMMER2
3 #include "wise2xhmmer2.h"
4 
5 
6 #include "dyna.h"
7 #include "version.h"
8 
9 char * program_name = "genewise";
10 
11 /*
12  * program specific includes
13  */
14 
15 
16 #include "gwrap.h"
17 #include "genedisplay.h"
18 #include "matchsum.h"
19 
20 #include "phasemodel.h"
21 #include "genephase6.h"
22 
23 
24 /*
25  * program specific variables
26  */
27 
28 char * dna_seq_file  = NULL;
29 Genomic * gen        = NULL;
30 boolean is_embl      = FALSE;
31 
32 char * tstart_str    = NULL;
33 int tstart           = -1;
34 
35 char * tend_str      = NULL;
36 int tend             = -1;
37 
38 boolean reverse      = FALSE;
39 boolean target_abs   = FALSE;
40 boolean do_both      = FALSE;
41 
42 boolean use_tsm      = FALSE;
43 
44 
45 char * protein_file  = NULL;
46 Protein * pro        = NULL;
47 
48 char * hmm_file       = NULL;
49 ThreeStateModel * tsm = NULL;
50 char * hmm_name       = NULL;
51 
52 char * potential_file = NULL;
53 
54 char * qstart_str    = NULL;
55 int qstart           = -1;
56 
57 char * qend_str      = NULL;
58 int qend             = -1;
59 
60 char * gene_file        = NULL;
61 GeneFrequency21 * gf    = NULL;
62 
63 char * new_gene_file    = NULL;
64 GeneStats * gs          = NULL;
65 
66 GeneModelParam * gmp    = NULL;
67 
68 PhasedProteinPara * ppp  = NULL;
69 PhasedProtein * pp      = NULL;
70 
71 boolean use_new_stats   = 1;
72 
73 char * matrix_file      = NULL;
74 CompMat * mat           = NULL;
75 
76 char * gap_str          = "12";
77 int gap                 = 12;
78 
79 char * ext_str          = "2";
80 int ext                 = 2;
81 
82 char * length_of_N_str  = "0";
83 int length_of_N         = 0;
84 
85 boolean flat_insert     = TRUE;
86 
87 char * codon_file       = NULL;
88 CodonTable * ct         = NULL;
89 
90 char * output_file      = "-";
91 FILE * ofp              = NULL;
92 
93 RandomModelDNA * rmd    = NULL;
94 
95 char * allN_string      = "1.0";
96 Probability allN        = 1.0;
97 
98 char * subs_string      = "0.000001";
99 double subs_error       = 0.000001;
100 
101 char * indel_string      = "0.000001";
102 double indel_error       = 0.000001;
103 
104 char * cfreq_string      = "flat";
105 boolean model_codon      = FALSE;
106 
107 char * splice_string     = "model";
108 boolean model_splice     = TRUE;
109 
110 char * startend_string   = "default";
111 TSM_StartEndMode startend   = TSM_default;
112 
113 char * null_string       = "syn";
114 boolean use_syn          = FALSE;
115 
116 char * intron_string     = "tied";
117 boolean use_tied_model   = FALSE;
118 
119 int alg                  = GWWRAP_623;
120 char * alg_str           = NULL;
121 
122 char * kbyte_str         = NULL;
123 int kbyte                = 10000; /* will be reset in build_defaults */
124 
125 char * pal_file          = NULL;
126 
127 boolean show_PackAln     = FALSE;
128 boolean show_cumlative_PackAln = FALSE;
129 boolean show_AlnBlock    = FALSE;
130 boolean show_ace         = FALSE;
131 boolean show_gff         = FALSE;
132 boolean show_trans       = FALSE;
133 boolean show_pep         = FALSE;
134 boolean show_cdna        = FALSE;
135 boolean show_pretty      = FALSE;
136 boolean show_pretty_gene = FALSE;
137 boolean show_supp_gene   = FALSE;
138 boolean show_gene_plain  = FALSE;
139 boolean show_match_sum   = FALSE;
140 boolean show_para        = FALSE;
141 boolean show_overlap     = FALSE;
142 boolean show_embl        = FALSE;
143 boolean show_diana       = FALSE;
144 
145 boolean pseudo           = FALSE;
146 
147 char * main_block_str      = "50";
148 int main_block           = 50;
149 
150 char * divide_str        = "//";
151 
152 Probability rnd_loop      = 0.99;
153 Probability cds_loop      = 0.97;
154 Probability rnd_to_model  = (1 - 0.99) / 3;
155 Probability link_loop     = 0.98;
156 Probability link_to_model = (1- 0.98) / 3;
157 
158 AlnBlock * alb;
159 PackAln  * pal;
160 
161 GenomicRegion * gr;
162 GenomicRegion * embl;
163 
164 MatchSummarySet * mss;
165 
166 RandomModel * rm;
167 
168 DPRunImpl * dpri;
169 
170 GeneWiseRunPara * gwrp;
171 
show_parameters(void)172 void show_parameters(void)
173 {
174   fprintf(ofp,"%s %s (%s release)\n",program_name,VERSION_NUMBER,RELEASE_DAY);
175   fprintf(ofp,"This program is freely distributed under a GPL. See source directory\n");
176   fprintf(ofp,"Copyright (c) GRL limited: portions of the code are from separate copyright\n\n");
177   if( use_tsm == FALSE) {
178     fprintf(ofp,"Query protein:       %s\n",pro->baseseq->name);
179     fprintf(ofp,"Comp Matrix:         %s\n",matrix_file);
180     fprintf(ofp,"Gap open:            %d\n",gap);
181     fprintf(ofp,"Gap extension:       %d\n",ext);
182   }
183   else
184     fprintf(ofp,"Query model:         %s\n",tsm->name);
185   fprintf(ofp,"Start/End            %s\n",startend_string);
186   fprintf(ofp,"Target Sequence      %s\n",gen->baseseq->name);
187   fprintf(ofp,"Strand:              %s\n",do_both == TRUE ? "both" : reverse == TRUE ? "reverse" : "forward");
188   fprintf(ofp,"Start/End (protein)  %s\n",startend_string);
189   show_info_GeneModelParam(gmp,ofp);
190 
191   fprintf(ofp,"Codon Table:         %s\n",codon_file);
192   fprintf(ofp,"Subs error:          %2.2g\n",subs_error);
193   fprintf(ofp,"Indel error:         %2.2g\n",indel_error);
194   fprintf(ofp,"Null model           %s\n",null_string);
195   fprintf(ofp,"Algorithm            %s\n",alg_str);
196 }
197 
show_pretty_aln(void)198 boolean show_pretty_aln(void)
199 {
200   Protein * ps;
201 
202   fprintf(ofp,"\n%s output\nScore %4.2f bits over entire alignment\n",program_name,Score2Bits(pal->score));
203 
204 
205   if( alg == GWWRAP_2193L || alg == GWWRAP_2193) {
206     fprintf(ofp,"Entrie alignment score contains unseen 'random' score segments\nYou should only use the per-alignments score printed below\nfor the bits score of the alignment\n\n");
207   }
208 
209   if( use_syn == FALSE ) {
210     fprintf(ofp,"Scores as bits over a flat simple random model\n\n");
211   } else {
212     fprintf(ofp,"Scores as bits over a synchronous coding model\n\n");
213   }
214 
215 
216 
217   if( use_tsm == FALSE ) {
218     fprintf(ofp,"Warning: The bits scores is not probablistically correct for single seqs\nSee WWW help for more info\n\n");
219 
220     protgene_ascii_display(alb,pro->baseseq->seq,pro->baseseq->name,pro->baseseq->offset,gen,ct,15,main_block,(alg == GWWRAP_623L || alg == GWWRAP_2193L || alg == GWWRAP_2193) ? TRUE : FALSE, ofp);
221   } else {
222     ps = pseudo_Protein_from_ThreeStateModel(tsm);
223     protgene_ascii_display(alb,ps->baseseq->seq,ps->baseseq->name,ps->baseseq->offset,gen,ct,15,main_block,(alg == GWWRAP_623L || alg == GWWRAP_2193L || alg == GWWRAP_2193) ? TRUE : FALSE,ofp);
224     free_Protein(ps);
225   }
226   fprintf(ofp,"%s\n",divide_str);
227 
228   return TRUE;
229 }
230 
231 
show_output(void)232 boolean show_output(void)
233 {
234   int i;
235 
236   cDNA * cdna;
237   Protein * trans;
238   GenomicOverlapResults * gor;
239   AlnColumn * alt;
240 
241 
242 
243 
244   if( show_pretty == TRUE ) {
245     show_pretty_aln();
246   }
247 
248   if( show_match_sum == TRUE ) {
249     show_MatchSummary_genewise_header(ofp);
250     show_MatchSummarySet_genewise(mss,ofp);
251     fprintf(ofp,"%s\n",divide_str);
252   }
253 
254   if( show_pretty_gene == TRUE ) {
255     show_pretty_GenomicRegion(gr,0,ofp);
256     fprintf(ofp,"%s\n",divide_str);
257   }
258 
259   if( show_supp_gene == TRUE ) {
260     show_pretty_GenomicRegion(gr,1,ofp);
261     fprintf(ofp,"%s\n",divide_str);
262   }
263 
264   if( show_embl == TRUE ) {
265     write_Embl_FT_GenomicRegion(gr,ofp);
266     fprintf(ofp,"%s\n",divide_str);
267   }
268 
269   if( show_diana == TRUE ) {
270     write_Diana_FT_GenomicRegion(gr,ofp);
271     fprintf(ofp,"%s\n",divide_str);
272   }
273 
274   if( show_overlap == TRUE ) {
275     gor = Genomic_overlap(gr,embl);
276     show_GenomicOverlapResults(gor,ofp);
277     fprintf(ofp,"%s\n",divide_str);
278   }
279 
280 
281   if( show_trans == TRUE ) {
282     for(i=0;i<gr->len;i++) {
283       if( gr->gene[i]->ispseudo == TRUE ) {
284 	fprintf(ofp,"#Gene %d is a pseudo gene - no translation possible\n",i);
285       } else {
286 	trans = get_Protein_from_Translation(gr->gene[i]->transcript[0]->translation[0],ct);
287 	write_fasta_Sequence(trans->baseseq,ofp);
288       }
289     }
290     fprintf(ofp,"%s\n",divide_str);
291   }
292 
293   if( show_pep == TRUE ) {
294     alt = alb->start;
295     for(;alt != NULL;) {
296       trans = Protein_from_GeneWise_AlnColumn(gen->baseseq,alt,1,&alt,ct,is_random_AlnColumn_genewise);
297       if ( trans == NULL )
298 	break;
299       write_fasta_Sequence(trans->baseseq,ofp);
300       free_Protein(trans);
301     }
302     fprintf(ofp,"%s\n",divide_str);
303   }
304 
305   if( show_cdna == TRUE ) {
306     for(i=0;i<gr->len;i++) {
307       cdna = get_cDNA_from_Transcript(gr->gene[i]->transcript[0]);
308       write_fasta_Sequence(cdna->baseseq,ofp);
309     }
310     fprintf(ofp,"%s\n",divide_str);
311   }
312 
313 
314   if( show_ace == TRUE ) {
315     show_ace_GenomicRegion(gr,gen->baseseq->name,ofp);
316     fprintf(ofp,"%s\n",divide_str);
317   }
318 
319   if( show_gff == TRUE ) {
320     show_GFF_GenomicRegion(gr,gen->baseseq->name,"GeneWise",ofp);
321     fprintf(ofp,"%s\n",divide_str);
322   }
323 
324   if( show_gene_plain == TRUE ) {
325     show_GenomicRegion(gr,ofp);
326     fprintf(ofp,"%s\n",divide_str);
327   }
328 
329   if( show_AlnBlock == TRUE ) {
330     mapped_ascii_AlnBlock(alb,Score2Bits,0,ofp);
331     fprintf(ofp,"%s\n",divide_str);
332   }
333 
334   if( show_cumlative_PackAln == TRUE ) {
335     show_bits_and_cumlative_PackAln(pal,ofp);
336     fprintf(ofp,"%s\n",divide_str);
337   }
338 
339   if( show_PackAln == TRUE ) {
340     show_simple_PackAln(pal,ofp);
341     fprintf(ofp,"%s\n",divide_str);
342   }
343 
344 
345 
346   return TRUE;
347 }
348 
build_alignment(void)349 boolean build_alignment(void)
350 {
351   GeneParameter21 * gpara;
352   GeneModel * gm;
353 
354   if( pal_file != NULL ) {
355     pal = read_simple_PackAln_file(pal_file);
356   }
357 
358 
359   if( use_new_stats == 0 ) {
360     gpara = GeneParameter21_wrap(gf,subs_error,indel_error,rmd,model_codon,model_splice,use_tied_model,ct,rnd_loop,cds_loop,rnd_to_model,link_loop,link_to_model);
361   } else {
362     gm = GeneModel_from_GeneStats(gs,gmp);
363 
364     gpara= GeneParameter21_from_GeneModel(gm,ct,rnd_loop,cds_loop,rnd_to_model,link_loop,link_to_model,subs_error,indel_error);
365   }
366 
367 
368   if( gpara == NULL ) {
369     warn("Sorry - could not build gene parameters. Must be a bug of some sort");
370     return FALSE;
371   }
372 
373 
374   /* phased based proteins use a different source */
375 
376   if( alg == GWWRAP_623P ) {
377     alb = AlnBlock_from_phased_protein_wrap(pro,tsm,gen,gpara->cm,rm,mat,ppp,gpara,dpri,&pal);
378   } else {
379     if( use_tsm == FALSE) {
380       alb =  AlnBlock_from_protein_genewise_wrap(pro,gen,mat,-gap,-ext,gpara,rmd,rmd,alg,use_syn,rm,allN,startend,dpri,&pal,gwrp);
381     } else {
382       alb =  AlnBlock_from_TSM_genewise_wrap(tsm,gen,gpara,rmd,rmd,use_syn,alg,allN,flat_insert,dpri,&pal,NULL);
383     }
384   }
385 
386   free_GeneParameter21(gpara);
387 
388 
389 
390   if( alb == NULL )
391     return FALSE;
392 
393 
394   gr = new_GenomicRegion(gen);
395 
396 
397   add_Genes_to_GenomicRegion_GeneWise(gr,gen->baseseq->offset,gen->baseseq->end,alb,use_tsm == TRUE ? tsm->name : pro->baseseq->name,pseudo,NULL);
398 
399   if( use_tsm == TRUE)
400     mss = MatchSummarySet_from_AlnBlock_genewise(alb,tsm->name,1,gen->baseseq);
401   else
402     mss = MatchSummarySet_from_AlnBlock_genewise(alb,pro->baseseq->name,pro->baseseq->offset,gen->baseseq);
403 
404   return TRUE;
405 }
406 
free_temporary_objects(void)407 boolean free_temporary_objects(void)
408 {
409   alb = free_AlnBlock(alb);
410   pal = free_PackAln(pal);
411   mss = free_MatchSummarySet(mss);
412   gr = free_GenomicRegion(gr);
413 
414   return TRUE;
415 }
416 
free_io_objects(void)417 boolean free_io_objects(void)
418 {
419   if( use_tsm == TRUE) {
420     free_ThreeStateModel(tsm);
421   } else {
422     free_Protein(pro);
423   }
424 
425   free_CodonTable(ct);
426   if( gf != NULL ) {
427     free_GeneFrequency21(gf);
428   }
429   free_RandomModelDNA(rmd);
430   if( is_embl ) {
431     free_GenomicRegion(embl);
432   }
433   free_Genomic(gen);
434 
435 
436   return TRUE;
437 }
438 
439 
440 
build_objects(void)441 boolean build_objects(void)
442 {
443   boolean ret = TRUE;
444   Protein * pro_temp;
445   Genomic * gen_temp;
446   FILE * ifp;
447 
448 
449 
450 
451 
452   startend = threestatemodel_mode_from_string(startend_string);
453   if( startend == TSM_unknown ) {
454     warn("String %s was unable to converted into a start/end policy\n",startend_string);
455     ret = FALSE;
456   }
457 
458 
459   if( tstart_str != NULL ) {
460     if( is_integer_string(tstart_str,&tstart) == FALSE || tstart < 0) {
461       warn("Could not make %s out as target start",tstart);
462       ret = FALSE;
463     }
464   }
465 
466   if( tend_str != NULL ) {
467     if( is_integer_string(tend_str,&tend) == FALSE || tend < 0) {
468       warn("Could not make %s out as target end",tend);
469       ret = FALSE;
470     }
471   }
472 
473   if( is_integer_string(gap_str,&gap) == FALSE ) {
474       warn("Could not make %s out as gap penalty (must be integer at the moment)",gap_str);
475       ret = FALSE;
476   }
477 
478 
479   if( is_integer_string(ext_str,&ext) == FALSE ) {
480     warn("Could not make %s out as gap penalty (must be integer at the moment)",ext_str);
481     ret = FALSE;
482   }
483 
484   if( is_embl == FALSE ) {
485     if( (gen = read_fasta_file_Genomic(dna_seq_file,length_of_N)) == NULL ) {
486       ret = FALSE;
487       warn("Could not read genomic sequence in %s",dna_seq_file);
488       gen = NULL;
489     }
490   } else {
491     embl = read_EMBL_GenomicRegion_file(dna_seq_file);
492     if( embl == NULL ) {
493       warn("Could not read genomic EMBL file in %s",dna_seq_file);
494       gen = NULL;
495       ret = FALSE;
496     } else {
497       gen = hard_link_Genomic(embl->genomic);
498 
499     }
500   }
501 
502   if( gen != NULL ) {
503 
504     if( tstart != -1 || tend != -1 ) {
505       if( tstart == -1 )
506 	tstart = 0;
507       if( tend == -1 )
508 	tend = gen->baseseq->len;
509       gen_temp = truncate_Genomic(gen,tstart-1,tend);
510       if( gen_temp == NULL ){
511 	ret = FALSE;
512       } else {
513 	free_Genomic(gen);
514 	gen = gen_temp;
515       }
516     } else {
517       /* no truncation required */
518     }
519 
520 
521     if( reverse == TRUE ) {
522       if( tstart > tend ) {
523 	warn("You have already reversed the DNA by using %d - %d truncation. Re-reversing",tstart,tend);
524     }
525 
526       gen_temp = reverse_complement_Genomic(gen);
527       free_Genomic(gen);
528       gen = gen_temp;
529     }
530   }
531 
532   /*
533    * Can't truncate on GenomicRegion (for good reasons!).
534    * but we want only a section of the EMBL file to be used
535    *
536    * So... swap genomic now. Positions in EMBL are still valid,
537    * however - some genes will loose their sequence, which will be damaging. ;)
538    */
539 
540 
541   if( is_embl ) {
542     free_Genomic(embl->genomic);
543     embl->genomic = hard_link_Genomic(gen); /* pointer could be dead anyway ;) */
544   }
545 
546 
547   if( target_abs == TRUE ) {
548     if( is_embl == TRUE ) {
549       warn("Sorry you can't both use absolute positioning and EMBL files as I can't cope with all the coordinate remapping. You'll have to convert to fasta.");
550       ret =  FALSE;
551     }
552 
553     gen->baseseq->offset = 1;
554     gen->baseseq->end  = strlen(gen->baseseq->seq);
555   }
556 
557   if( alg_str != NULL ) {
558     alg = gwrap_alg_type_from_string(alg_str);
559   } else {
560     if( use_tsm == TRUE ) {
561       alg_str = "623L";
562     } else {
563       alg_str = "623";
564     }
565     alg = gwrap_alg_type_from_string(alg_str);
566   }
567 
568 
569   if( qstart_str != NULL ) {
570     if( is_integer_string(qstart_str,&qstart) == FALSE || qstart < 0) {
571       warn("Could not make %s out as query start",qstart);
572       ret = FALSE;
573     }
574   }
575 
576   if( qend_str != NULL ) {
577     if( is_integer_string(qend_str,&qend) == FALSE || qend < 0) {
578       warn("Could not make %s out as query end",qend);
579       ret = FALSE;
580     }
581   }
582 
583 
584   if( use_tsm == FALSE ) {
585     if( startend != TSM_default && startend != TSM_global && startend != TSM_local && startend != TSM_endbiased) {
586       warn("Proteins can only have local/global/endbias startend policies set, not %s",startend_string);
587       ret = FALSE;
588     }
589     if( (pro = read_fasta_file_Protein(protein_file)) == NULL ) {
590       ret = FALSE;
591       warn("Could not read Protein sequence in %s",protein_file);
592     } else {
593 
594       if( qstart != -1 || qend != -1 ) {
595 	if( qstart == -1 )
596 	  qstart = 0;
597 	if( qend == -1 )
598 	  qend = pro->baseseq->len;
599 
600 	pro_temp = truncate_Protein(pro,qstart-1,qend);
601 	if( pro_temp == NULL ){
602 	  ret = FALSE;
603 	} else {
604 	  free_Protein(pro);
605 	  pro = pro_temp;
606 	}
607       }
608     }
609   } else {
610     /** using a HMM **/
611 
612     /*tsm = read_HMMer_1_7_ascii_file(hmm_file);*/
613     /*tsm = Wise2_read_ThreeStateModel_from_hmmer1_file(hmm_file);*/
614     tsm = HMMer2_read_ThreeStateModel(hmm_file);
615 
616 
617       if( tsm == NULL ) {
618 	warn("Could not read hmm from %s\n",hmm_file);
619 	ret = FALSE;
620       }  else {
621 
622 	display_char_in_ThreeStateModel(tsm);
623 	if( hmm_name != NULL ) {
624 	  if( tsm->name != NULL )
625 	    ckfree(tsm->name);
626 	  tsm->name = stringalloc(hmm_name);
627 	}
628 
629 	if( tsm == NULL ) {
630 	  warn("Could not read %s as a hmm",hmm_file);
631 	}
632 
633 	/** have to set start/end **/
634 	set_startend_policy_ThreeStateModel(tsm,startend,30,0.1);
635 
636       }
637   } /* end of else tsm != NULL */
638 
639 
640 
641   if( main_block_str != NULL ) {
642     if( is_integer_string(main_block_str,&main_block) == FALSE ) {
643       warn("Could not get maximum main_block number %s",main_block_str);
644       ret = FALSE;
645     }
646   }
647 
648 
649 
650   if( is_double_string(subs_string,&subs_error) == FALSE ) {
651     warn("Could not convert %s to a double",subs_error);
652     ret = FALSE;
653   }
654 
655   if( is_double_string(indel_string,&indel_error) == FALSE ) {
656     warn("Could not convert %s to a double",indel_error);
657     ret = FALSE;
658   }
659 
660   if( is_double_string(allN_string,&allN) == FALSE ) {
661     warn("Could not convert %s to a double",allN_string);
662     ret = FALSE;
663   }
664 
665 
666   if( strcmp(cfreq_string,"model") == 0 ) {
667     model_codon = TRUE;
668   } else if ( strcmp(cfreq_string,"flat") == 0 ) {
669     model_codon = FALSE;
670   } else {
671     warn("Cannot interpret [%s] as a codon modelling parameter\n",cfreq_string);
672     ret = FALSE;
673   }
674 
675 
676   if( strcmp(splice_string,"model") == 0 ) {
677     model_splice = TRUE;
678   } else if ( strcmp(splice_string,"flat") == 0 ) {
679     model_splice = FALSE;
680     gmp->use_gtag_splice = TRUE;
681   } else {
682     warn("Cannot interpret [%s] as a splice modelling parameter\n",splice_string);
683     ret = FALSE;
684   }
685 
686   if( strcmp(null_string,"syn") == 0 ) {
687     use_syn = TRUE;
688   } else if ( strcmp(null_string,"flat") == 0 ) {
689     use_syn = FALSE;
690   } else {
691     warn("Cannot interpret [%s] as a null model string\n",null_string);
692     ret = FALSE;
693   }
694 
695   if( strcmp(intron_string,"model") == 0 ) {
696     use_tied_model = FALSE;
697   } else if ( strcmp(intron_string,"tied") == 0 ) {
698     use_tied_model = TRUE;
699   } else {
700     warn("Cannot interpret [%s] as a intron tieing switch\n",intron_string);
701     ret = FALSE;
702   }
703 
704 
705 
706   if( (rm = default_RandomModel()) == NULL) {
707     warn("Could not make default random model\n");
708     ret = FALSE;
709   }
710 
711   if( use_new_stats == 0 ) {
712     if( (gf = read_GeneFrequency21_file(gene_file)) == NULL) {
713       ret = FALSE;
714       warn("Could not read a GeneFrequency file in %s",gene_file);
715     }
716   } else {
717     if( (gs = GeneStats_from_GeneModelParam(gmp)) == NULL ){
718       ret=FALSE;
719       warn("Could not read gene statistics in %s",new_gene_file);
720     }
721   } /* end of else using new gene stats */
722 
723 
724   if( (mat = read_Blast_file_CompMat(matrix_file)) == NULL) {
725     if( use_tsm == TRUE ) {
726       info("I could not read the Comparison matrix file in %s; however, you are using a HMM so it is not needed. Please set the WISECONFIGDIR or WISEPERSONALDIR variable correctly to prevent this message.",matrix_file);
727     } else {
728       warn("Could not read Comparison matrix file in %s",matrix_file);
729       ret = FALSE;
730     }
731   }
732 
733   if( (ct = read_CodonTable_file(codon_file)) == NULL) {
734     ret = FALSE;
735     warn("Could not read codon table file in %s",codon_file);
736   }
737 
738   if( (ofp = openfile(output_file,"W")) ==  NULL) {
739     warn("Could not open %s as an output file",output_file);
740     ret = FALSE;
741   }
742 
743   rmd = RandomModelDNA_std();
744   return ret;
745 
746 }
747 
build_defaults(void)748 void build_defaults(void)
749 {
750   gene_file = "human.gf";
751   new_gene_file = "gene.stat";
752   codon_file = "codon.table";
753   matrix_file = "BLOSUM62.bla";
754 
755 
756 }
757 
reverse_target(void)758 void reverse_target(void)
759 {
760   Genomic * gen_temp;
761 
762   gen_temp = reverse_complement_Genomic(gen);
763 
764   free_temporary_objects();
765 
766   free_Genomic(gen);
767 
768   gen = gen_temp;
769 
770 }
771 
show_short_help(void)772 void show_short_help(void)
773 {
774   fprintf(stdout,"%s (%s)\n",program_name,VERSION_NUMBER);
775   fprintf(stdout,"This program is freely distributed under a GPL. See -version for more info\n");
776   fprintf(stdout,"Copyright (c) GRL limited: portions of the code are from separate copyrights\n\n");
777   fprintf(stdout,"genewise <protein-file> <dna-file> in fasta format\n");
778   fprintf(stdout," Options. In any order, '-' as filename (for any input) means stdin\n");
779   fprintf(stdout,"\nFor more help go %s -help.\n",program_name);
780   exit(63);
781 }
782 
show_help(FILE * ofp)783 void show_help(FILE * ofp)
784 {
785   fprintf(ofp,"%s (%s)\n",program_name,VERSION_NUMBER);
786   fprintf(ofp,"genewise <protein-file> <dna-file> in fasta format\n");
787   /* program specific help */
788   fprintf(ofp,"Dna sequence options\n");
789   fprintf(ofp,"  -u               start position in dna\n");
790   fprintf(ofp,"  -v               end position in dna\n");
791   fprintf(ofp,"  -trev            Compare on the reverse strand\n");
792   fprintf(ofp,"  -tfor [default]  Compare on the forward strand\n");
793   fprintf(ofp,"  -both            Both strands\n");
794   fprintf(ofp,"  -tabs            Report positions as absolute to truncated/reverse sequence\n");
795   fprintf(ofp,"  -fembl            File is an EMBL file native format\n");
796   fprintf(ofp,"Protein comparison options\n");
797   fprintf(ofp,"  -s               start position in protein\n");
798   fprintf(ofp,"  -t               end   position in protein\n");
799   fprintf(ofp,"  -[g]ap    [%3d]  gap penalty\n",gap);
800   fprintf(ofp,"  -[e]xt    [%3d]  extension penalty\n",ext);
801   fprintf(ofp,"  -[m]atrix [%s]  Comparison matrix\n",matrix_file);
802   fprintf(ofp,"HMM options\n");
803   fprintf(ofp,"  -hmmer           Protein file is HMMer file (version 2 compatible)\n");
804   fprintf(ofp,"  -hname           Use this as the name of the HMM, not filename\n");
805 
806   show_help_GeneModelParam(ofp);
807   fprintf(ofp,"  -splice [model/flat] [LEGACY only for -splice flat. use -splice_gtag]\n");
808 
809   show_help_PhasedProteinPara(ofp);
810 
811   fprintf(ofp,"Other model options\n");
812   fprintf(ofp,"  -[no]newgene  use new gene stats (default), no for reverting to old system\n");
813   fprintf(ofp,"  -init   [%s]  [default/global/local/wing/endbias] startend policy for the HMM/protein\n",startend_string);
814   fprintf(ofp,"  -codon  [%s]  Codon file\n",codon_file);
815   fprintf(ofp,"  -subs   [%2.2g] Substitution error rate\n",subs_error);
816   fprintf(ofp,"  -indel  [%2.2g] Insertion/deletion error rate\n",indel_error);
817   fprintf(ofp,"  -null   [syn/flat]   Random Model as synchronous or flat [default syn]\n");
818   fprintf(ofp,"  -alln   [%s]   Probability of matching a NNN codon\n",allN_string);
819   fprintf(ofp,"  -insert [model/flat] Use protein insert model     [default flat]\n");
820   fprintf(ofp,"Algorithm options\n");
821   fprintf(ofp,"  -alg    [623/623L/623S/623P/2193/2193L]  Algorithm used [default 623/623L]\n");
822   fprintf(ofp,"  (normally use 623 for proteins, 623L for HMMs and 623P for Phased Proteins)\n");
823   fprintf(ofp,"Output options [default -pretty -para]\n");
824   fprintf(ofp,"  -pretty          show pretty ascii output\n");
825   fprintf(ofp,"  -pseudo          Mark genes with frameshifts as pseudogenes\n");
826   fprintf(ofp,"  -genes           show gene structure\n");
827   fprintf(ofp,"  -genesf          show gene structure with supporting evidence\n");
828   fprintf(ofp,"  -embl            show EMBL feature format with CDS key\n");
829   fprintf(ofp,"  -diana           show EMBL feature format with misc_feature key (for diana)\n");
830   fprintf(ofp,"  -para            show parameters\n");
831   fprintf(ofp,"  -sum             show summary output\n");
832 
833   /* lets not bring trouble on ourselves ;) */
834   /* fprintf(ofp,"  -over            show EMBL overlap (only with EMBL format)\n"); */
835 
836   fprintf(ofp,"  -cdna            show cDNA\n");
837   fprintf(ofp,"  -trans           show protein translation, breaking at frameshifts\n");
838   fprintf(ofp,"  -pep             show protein translation, splicing frameshifts\n");
839   fprintf(ofp,"  -ace             ace file gene structure\n");
840   fprintf(ofp,"  -gff             Gene Feature Format file\n");
841   fprintf(ofp,"  -gener           raw gene structure\n");
842   fprintf(ofp,"  -alb             show logical AlnBlock alignment\n");
843   fprintf(ofp,"  -pal             show raw matrix alignment\n");
844   fprintf(ofp,"  -block  [%s]     Length of main block in pretty output\n",main_block_str);
845   fprintf(ofp,"  -divide [%s]     divide string for multiple outputs\n",divide_str);
846 
847   show_help_GeneWiseRunPara(ofp);
848 
849 
850   show_help_DPRunImpl(ofp);
851   show_standard_options(ofp);
852 
853   exit(63);
854 }
855 
show_version(FILE * ofp)856 void show_version(FILE * ofp)
857 {
858   fprintf(ofp,"%s\nVersion: %s\nReleased: %s\nCompiled: %s\n",program_name,VERSION_NUMBER,RELEASE_DAY,COMPILE_DATE);
859   fprintf(ofp,"\nThis program is freely distributed under a Gnu Public License\n");
860   fprintf(ofp,"The source code is copyright (c) GRL 1998 and others\n");
861   fprintf(ofp,"There is no warranty, implied or otherwise on the performance of this program\n");
862   fprintf(ofp,"For more information read the GNULICENSE file in the distribution\n\n");
863   fprintf(ofp,"Credits: Ewan Birney <birney@sanger.ac.uk> wrote the core code.\n");
864   fprintf(ofp,"         Portions of this code was from HMMer2, written by Sean Eddy\n");
865   exit(63);
866 }
867 
868 
main(int argc,char ** argv)869 int main(int argc,char ** argv)
870 {
871   int i;
872   char * temp;
873 
874 
875   build_defaults();
876 
877   strip_out_standard_options(&argc,argv,show_help,show_version);
878 
879   potential_file = strip_out_assigned_argument(&argc,argv,"pg");
880 
881   pal_file = strip_out_assigned_argument(&argc,argv,"pal_file");
882 
883   if( (temp = strip_out_assigned_argument(&argc,argv,"gap")) != NULL )
884     gap_str = temp;
885 
886   if( (temp = strip_out_assigned_argument(&argc,argv,"g")) != NULL )
887     gap_str = temp;
888 
889   if( (temp = strip_out_assigned_argument(&argc,argv,"ext")) != NULL )
890     ext_str = temp;
891 
892   if( (temp = strip_out_assigned_argument(&argc,argv,"e")) != NULL )
893     ext_str = temp;
894 
895   if( (temp = strip_out_assigned_argument(&argc,argv,"matrix")) != NULL )
896     matrix_file = temp;
897 
898   if( (temp = strip_out_assigned_argument(&argc,argv,"m")) != NULL )
899     matrix_file = temp;
900 
901   if( (temp = strip_out_assigned_argument(&argc,argv,"s")) != NULL )
902     qstart_str = temp;
903 
904   if( (temp = strip_out_assigned_argument(&argc,argv,"t")) != NULL )
905     qend_str = temp;
906 
907   if( (temp = strip_out_assigned_argument(&argc,argv,"u")) != NULL )
908     tstart_str = temp;
909 
910   if( (temp = strip_out_assigned_argument(&argc,argv,"v")) != NULL )
911     tend_str = temp;
912 
913   if( (strip_out_boolean_argument(&argc,argv,"trev")) == TRUE )
914     reverse = TRUE;
915 
916   if( (strip_out_boolean_argument(&argc,argv,"[no]newgene")) == TRUE )
917     use_new_stats = TRUE;
918 
919   if( (strip_out_boolean_argument(&argc,argv,"tfor")) == TRUE ){
920     if( reverse == TRUE ) {
921       warn("You have specified both trev and tfor. Treating as both");
922       do_both = TRUE;
923       reverse = FALSE;
924     } else {
925       reverse = FALSE;
926     }
927   }
928 
929   if( (temp = strip_out_assigned_argument(&argc,argv,"insert")) != NULL ) {
930     if( strcmp(temp,"flat") == 0 ) {
931       flat_insert = TRUE;
932     } else {
933       flat_insert = FALSE;
934     }
935   }
936 
937   if( (strip_out_boolean_argument(&argc,argv,"both")) == TRUE )
938     do_both = TRUE;
939 
940 
941   if( (strip_out_boolean_argument(&argc,argv,"fembl")) == TRUE )
942     is_embl = TRUE;
943 
944   if( (strip_out_boolean_argument(&argc,argv,"tabs")) == TRUE )
945     target_abs = TRUE;
946 
947   pseudo = strip_out_boolean_argument(&argc,argv,"pseudo");
948 
949   if( (temp = strip_out_assigned_argument(&argc,argv,"codon")) != NULL )
950     codon_file = temp;
951 
952   if( (temp = strip_out_assigned_argument(&argc,argv,"gene")) != NULL )
953     gene_file = temp;
954 
955   if( (temp = strip_out_assigned_argument(&argc,argv,"alg")) != NULL )
956     alg_str = temp;
957 
958   if( (temp = strip_out_assigned_argument(&argc,argv,"kbyte")) != NULL )
959     kbyte_str = temp;
960 
961   if( (temp = strip_out_assigned_argument(&argc,argv,"subs")) != NULL )
962     subs_string = temp;
963 
964   if( (temp = strip_out_assigned_argument(&argc,argv,"indel")) != NULL )
965     indel_string = temp;
966 
967   if( (temp = strip_out_assigned_argument(&argc,argv,"cfreq")) != NULL )
968     cfreq_string = temp;
969 
970   if( (temp = strip_out_assigned_argument(&argc,argv,"splice")) != NULL ) {
971     warn("deprecated command line option -splice. use -splice_gtag now");
972     splice_string = temp;
973   }
974 
975   if( (temp = strip_out_assigned_argument(&argc,argv,"init")) != NULL )
976     startend_string = temp;
977 
978   if( (temp = strip_out_assigned_argument(&argc,argv,"null")) != NULL )
979     null_string = temp;
980 
981   if( (temp = strip_out_assigned_argument(&argc,argv,"intron")) != NULL )
982     intron_string = temp;
983 
984   if( (temp = strip_out_assigned_argument(&argc,argv,"alln")) != NULL )
985     allN_string = temp;
986 
987   if( (strip_out_boolean_argument(&argc,argv,"hmmer")) == TRUE )
988     use_tsm = TRUE;
989 
990   if( (strip_out_boolean_argument(&argc,argv,"intie")) == TRUE )
991     use_tied_model = TRUE;
992 
993   if( (temp = strip_out_assigned_argument(&argc,argv,"hname")) != NULL )
994     hmm_name = temp;
995 
996 
997   if( (strip_out_boolean_argument(&argc,argv,"pretty")) != FALSE )
998     show_pretty = TRUE;
999 
1000   if( (strip_out_boolean_argument(&argc,argv,"gff")) != FALSE )
1001     show_gff = TRUE;
1002 
1003   if( (strip_out_boolean_argument(&argc,argv,"diana")) != FALSE )
1004     show_diana = TRUE;
1005 
1006   if( (strip_out_boolean_argument(&argc,argv,"embl")) != FALSE )
1007     show_embl = TRUE;
1008 
1009 
1010   if( (strip_out_boolean_argument(&argc,argv,"genes")) != FALSE )
1011     show_pretty_gene = TRUE;
1012 
1013   if( (strip_out_boolean_argument(&argc,argv,"genesf")) != FALSE )
1014     show_supp_gene = TRUE;
1015 
1016   if( (strip_out_boolean_argument(&argc,argv,"para")) != FALSE )
1017     show_para = TRUE;
1018 
1019   if( (strip_out_boolean_argument(&argc,argv,"trans")) != FALSE )
1020     show_trans = TRUE;
1021 
1022   if( (strip_out_boolean_argument(&argc,argv,"pep")) != FALSE )
1023     show_pep = TRUE;
1024 
1025   if( (strip_out_boolean_argument(&argc,argv,"cdna")) != FALSE )
1026     show_cdna = TRUE;
1027 
1028   if( (strip_out_boolean_argument(&argc,argv,"sum")) != FALSE )
1029     show_match_sum = TRUE;
1030 
1031   if( (strip_out_boolean_argument(&argc,argv,"alb")) != FALSE )
1032     show_AlnBlock = TRUE;
1033 
1034   if( (strip_out_boolean_argument(&argc,argv,"ace")) != FALSE )
1035     show_ace = TRUE;
1036 
1037   if( (strip_out_boolean_argument(&argc,argv,"pal")) != FALSE )
1038     show_PackAln = TRUE;
1039 
1040   if( (strip_out_boolean_argument(&argc,argv,"gener")) != FALSE )
1041     show_gene_plain = TRUE;
1042 
1043   if( (strip_out_boolean_argument(&argc,argv,"over")) != FALSE )
1044     show_overlap = TRUE;
1045 
1046   if( (temp = strip_out_assigned_argument(&argc,argv,"divide")) != NULL )
1047     divide_str = temp;
1048 
1049   if( (temp = strip_out_assigned_argument(&argc,argv,"block")) != NULL )
1050     main_block_str = temp;
1051 
1052   dpri = new_DPRunImpl_from_argv(&argc,argv);
1053 
1054   gmp  = new_GeneModelParam_from_argv(&argc,argv);
1055 
1056   ppp = new_PhasedProteinPara_from_argv(&argc,argv);
1057 
1058   gwrp = new_GeneWiseRunPara_from_argv(&argc,argv);
1059 
1060   strip_out_remaining_options_with_warning(&argc,argv);
1061 
1062 
1063 
1064   if( argc !=  3 ) {
1065     warn("Wrong number of arguments (expect 2)!\n");
1066     if( argc > 1 ){
1067       warn("Arg line looked like (after option processing)");
1068       for(i=1;i<argc;i++) {
1069 	fprintf(stderr,"   %s\n",argv[i]);
1070       }
1071     }
1072 
1073     show_short_help();
1074   }
1075 
1076   if( show_embl == FALSE && show_diana == FALSE && show_gff == FALSE && show_overlap == FALSE && show_pretty_gene == FALSE && show_match_sum == FALSE && show_ace == FALSE && show_gene_plain == FALSE && show_pretty == FALSE && show_AlnBlock == FALSE && show_PackAln == FALSE && show_pep == FALSE ) {
1077     show_pretty = TRUE;
1078     show_para = TRUE;
1079   }
1080 
1081   dna_seq_file = argv[2];
1082   if( use_tsm == FALSE)
1083     protein_file = argv[1];
1084   else
1085     hmm_file  = argv[1];
1086 
1087 
1088   if( build_objects() == FALSE)
1089     fatal("Could not build objects!");
1090 
1091   if( show_para == TRUE) {
1092     show_parameters();
1093   }
1094 
1095   if( build_alignment() == FALSE)
1096     fatal("Could not build alignment!");
1097 
1098   if( show_output() == FALSE)
1099     fatal("Could not show alignment. Sorry!");
1100 
1101   if( do_both == TRUE) {
1102     reverse_target();
1103 
1104     if( build_alignment() == FALSE)
1105       fatal("Could not build alignment!");
1106 
1107     if( show_output() == FALSE)
1108       fatal("Could not show alignment. Sorry!");
1109   }
1110 
1111   free_temporary_objects();
1112   free_io_objects();
1113   return 0;
1114 }
1115 
1116 
1117 
1118 
1119