1#!/usr/bin/env perl 2 3use strict; 4use warnings; 5use FindBin; 6use lib ("/usr/local/lib/perl5/site_perl/transdecoder"); 7use Gene_obj; 8use Fasta_reader; 9use GFF3_utils2; 10use Carp; 11use Nuc_translator; 12use Getopt::Long qw(:config posix_default no_ignore_case bundling pass_through); 13 14 15my $usage = <<__EOUSAGE__; 16 17#################################################### 18# 19# Required: 20# 21# --gff3 <string> gff3 file 22# 23# --fasta <string> fasta file corresponding to gff3 file 24# 25## 26# Optional: 27# 28# --seqType <string> prot|CDS|cDNA|gene, default=prot 29# 30# --genetic_code <string> universal (default) 31# Euplotes, Tetrahymena, Candida 32# Acetabularia, Mitochondrial-Canonical 33# Mitochondrial-Vertebrates, Mitochondrial-Arthropods 34# Mitochondrial-Echinoderms, Mitochondrial-Molluscs 35# Mitochondrial-Ascidians, Mitochondrial-Nematodes 36# Mitochondrial-Platyhelminths,Mitochondrial-Yeasts 37# Mitochondrial-Euascomycetes, Mitochondrial-Protozoans 38# 39################################################### 40 41 42__EOUSAGE__ 43 44 ; 45 46 47my $gff3_file; 48my $fasta_db; 49my $seq_type = 'prot'; 50my $genetic_code = ''; 51 52&GetOptions ( 'gff3=s' => \$gff3_file, 53 'fasta=s' => \$fasta_db, 54 'seqType=s' => \$seq_type, 55 'genetic_code=s' => \$genetic_code, 56 ); 57 58unless ($gff3_file && $fasta_db) { 59 die $usage; 60} 61 62unless ($seq_type =~ /^(prot|CDS|cDNA|gene)$/) { 63 die "Error, don't understand sequence type [$seq_type]\n\n$usage"; 64} 65 66if ($genetic_code) { 67 &Nuc_translator::use_specified_genetic_code($genetic_code); 68} 69 70## read genome 71my $fasta_reader = new Fasta_reader($fasta_db); 72my %genome = $fasta_reader->retrieve_all_seqs_hash(); 73 74 75my $gene_obj_indexer_href = {}; 76 77## associate gene identifiers with contig id's. 78my $contig_to_gene_list_href = &GFF3_utils2::index_GFF3_gene_objs($gff3_file, $gene_obj_indexer_href); 79 80foreach my $asmbl_id (sort keys %$contig_to_gene_list_href) { 81 82 my $genome_seq = $genome{$asmbl_id} or die "Error, no sequence for $asmbl_id"; 83 84 my @gene_ids = @{$contig_to_gene_list_href->{$asmbl_id}}; 85 86 foreach my $gene_id (@gene_ids) { 87 my $gene_obj_ref = $gene_obj_indexer_href->{$gene_id}; 88 89 my %params; 90 if ($seq_type eq "gene") { 91 $params{unspliced_transcript} = 1; 92 } 93 94 $gene_obj_ref->create_all_sequence_types(\$genome_seq, %params); 95 96 my $counter = 0; 97 foreach my $isoform ($gene_obj_ref, $gene_obj_ref->get_additional_isoforms()) { 98 99 $counter++; 100 101 my $orientation = $isoform->get_orientation(); 102 my ($model_lend, $model_rend) = sort {$a<=>$b} $isoform->get_model_span(); 103 my ($gene_lend, $gene_rend) = sort {$a<=>$b} $isoform->get_gene_span(); 104 105 my $isoform_id = $isoform->{Model_feat_name}; 106 107 my $seq = ""; 108 109 if ($seq_type eq "prot") { 110 $seq = $isoform->get_protein_sequence(); 111 } 112 elsif ($seq_type eq "CDS") { 113 $seq = $isoform->get_CDS_sequence(); 114 } 115 elsif ($seq_type eq "cDNA") { 116 $seq = $isoform->get_cDNA_sequence(); 117 } 118 elsif ($seq_type eq "gene" && $counter == 1) { 119 $seq = $isoform->get_gene_sequence(); 120 } 121 122 unless ($seq) { 123 print STDERR "-warning, no $seq_type sequence for $isoform_id\n"; 124 next; 125 } 126 127 $seq =~ s/(\S{60})/$1\n/g; # make fasta format 128 chomp $seq; 129 130 my $com_name = $isoform->{com_name} || ""; 131 132 if ($com_name eq $isoform_id) { $com_name = ""; } # no sense in repeating it 133 134 my $locus = $isoform->{pub_locus}; 135 my $model_locus = $isoform->{model_pub_locus}; 136 137 my $locus_string = ""; 138 if ($model_locus) { 139 $locus_string .= $model_locus; 140 } 141 if ($locus) { 142 $locus_string .= " $locus"; 143 } 144 if ($locus_string) { 145 $locus_string .= " "; # add spacer 146 } 147 148 #if ($seq_type eq 'prot' || $seq_type eq 'CDS') { # this was a bad idea, just use the original id. 149 # $isoform_id = "cds.$isoform_id"; 150 #} 151 152 print ">$isoform_id $gene_id $locus_string $com_name $asmbl_id:$model_lend-$model_rend($orientation)\n$seq\n"; 153 } 154 } 155} 156 157 158exit(0); 159 160 161