1#!/usr/bin/env perl
2
3use strict;
4use warnings;
5use FindBin;
6use lib ("/usr/local/lib/perl5/site_perl/transdecoder");
7use Gene_obj;
8use Fasta_reader;
9use GFF3_utils2;
10use Carp;
11use Nuc_translator;
12use Getopt::Long qw(:config posix_default no_ignore_case bundling pass_through);
13
14
15my $usage = <<__EOUSAGE__;
16
17####################################################
18#
19# Required:
20#
21#  --gff3 <string>          gff3 file
22#
23#  --fasta <string>         fasta file corresponding to gff3 file
24#
25##
26#  Optional:
27#
28#  --seqType <string>        prot|CDS|cDNA|gene,  default=prot
29#
30#  --genetic_code  <string>   universal (default)
31#                             Euplotes, Tetrahymena, Candida
32#                             Acetabularia, Mitochondrial-Canonical
33#                             Mitochondrial-Vertebrates, Mitochondrial-Arthropods
34#                             Mitochondrial-Echinoderms, Mitochondrial-Molluscs
35#                             Mitochondrial-Ascidians, Mitochondrial-Nematodes
36#                             Mitochondrial-Platyhelminths,Mitochondrial-Yeasts
37#                             Mitochondrial-Euascomycetes, Mitochondrial-Protozoans
38#
39###################################################
40
41
42__EOUSAGE__
43
44    ;
45
46
47my $gff3_file;
48my $fasta_db;
49my $seq_type = 'prot';
50my $genetic_code = '';
51
52&GetOptions ( 'gff3=s' => \$gff3_file,
53              'fasta=s' => \$fasta_db,
54              'seqType=s' => \$seq_type,
55              'genetic_code=s' => \$genetic_code,
56    );
57
58unless ($gff3_file && $fasta_db) {
59    die $usage;
60}
61
62unless ($seq_type =~ /^(prot|CDS|cDNA|gene)$/) {
63    die "Error, don't understand sequence type [$seq_type]\n\n$usage";
64}
65
66if ($genetic_code) {
67    &Nuc_translator::use_specified_genetic_code($genetic_code);
68}
69
70## read genome
71my $fasta_reader = new Fasta_reader($fasta_db);
72my %genome = $fasta_reader->retrieve_all_seqs_hash();
73
74
75my $gene_obj_indexer_href = {};
76
77## associate gene identifiers with contig id's.
78my $contig_to_gene_list_href = &GFF3_utils2::index_GFF3_gene_objs($gff3_file, $gene_obj_indexer_href);
79
80foreach my $asmbl_id (sort keys %$contig_to_gene_list_href) {
81
82    my $genome_seq = $genome{$asmbl_id} or die "Error, no sequence for $asmbl_id";
83
84    my @gene_ids = @{$contig_to_gene_list_href->{$asmbl_id}};
85
86    foreach my $gene_id (@gene_ids) {
87        my $gene_obj_ref = $gene_obj_indexer_href->{$gene_id};
88
89        my %params;
90        if ($seq_type eq "gene") {
91            $params{unspliced_transcript} = 1;
92        }
93
94        $gene_obj_ref->create_all_sequence_types(\$genome_seq, %params);
95
96		my $counter = 0;
97        foreach my $isoform ($gene_obj_ref, $gene_obj_ref->get_additional_isoforms()) {
98
99			$counter++;
100
101			my $orientation = $isoform->get_orientation();
102			my ($model_lend, $model_rend) = sort {$a<=>$b} $isoform->get_model_span();
103			my ($gene_lend, $gene_rend) = sort {$a<=>$b} $isoform->get_gene_span();
104
105            my $isoform_id = $isoform->{Model_feat_name};
106
107            my $seq = "";
108
109            if ($seq_type eq "prot") {
110                $seq = $isoform->get_protein_sequence();
111            }
112            elsif ($seq_type eq "CDS") {
113                $seq = $isoform->get_CDS_sequence();
114            }
115            elsif ($seq_type eq "cDNA") {
116                $seq = $isoform->get_cDNA_sequence();
117            }
118            elsif ($seq_type eq "gene" && $counter == 1) {
119                $seq = $isoform->get_gene_sequence();
120            }
121
122            unless ($seq) {
123                print STDERR "-warning, no $seq_type sequence for $isoform_id\n";
124                next;
125            }
126
127            $seq =~ s/(\S{60})/$1\n/g; # make fasta format
128            chomp $seq;
129
130            my $com_name = $isoform->{com_name} || "";
131
132			if ($com_name eq $isoform_id) { $com_name = ""; } # no sense in repeating it
133
134			my $locus = $isoform->{pub_locus};
135			my $model_locus = $isoform->{model_pub_locus};
136
137			my $locus_string = "";
138			if ($model_locus) {
139                $locus_string .= $model_locus;
140			}
141			if ($locus) {
142				$locus_string .= " $locus";
143			}
144			if ($locus_string) {
145				$locus_string .= " "; # add spacer
146			}
147
148            #if ($seq_type eq 'prot' || $seq_type eq 'CDS') {  # this was a bad idea, just use the original id.
149            #    $isoform_id = "cds.$isoform_id";
150            #}
151
152            print ">$isoform_id $gene_id $locus_string $com_name $asmbl_id:$model_lend-$model_rend($orientation)\n$seq\n";
153        }
154    }
155}
156
157
158exit(0);
159
160
161