1#!/usr/bin/env perl
2
3use strict;
4use warnings;
5use FindBin;
6use lib ("/usr/local/lib/perl5/site_perl/transdecoder");
7use Gene_obj;
8use GFF3_utils2;
9use Fasta_reader;
10use Nuc_translator;
11use Carp;
12use Data::Dumper;
13use List::Util qw (max);
14use Getopt::Long qw(:config posix_default no_ignore_case bundling pass_through);
15use PWM;
16use File::Basename;
17
18
19my $atg_pwm_pos = 20;
20my $adj_dist = 30;
21my $adj_pct = 15;
22
23my $usage = <<__EOUSAGE__;
24
25#####################################################################################
26#
27#  --transcripts <string>   transcripts.fasta file targeted by transdecoder
28#
29#  --gff3_file <string>     target gff3 file
30#
31#  optional:
32#
33#  --adj_dist <int>         distance allowed for start adjustment (default: $adj_dist)
34#
35#  --adj_pct <int>          pecentage of orf length for examining start adjustment (default: $adj_pct)
36#
37#  --atg_pos <int>          atg index position in pwm (default: $atg_pwm_pos)
38#
39#  --debug                  verbose
40#
41#######################################################################################
42
43
44
45__EOUSAGE__
46
47    ;
48
49
50my $transcripts_file;
51my $gff3_file;
52my $DEBUG = 0;
53
54&GetOptions('transcripts=s' => \$transcripts_file,
55            'gff3_file=s' => \$gff3_file,
56            'adj_dist=i' => \$adj_dist,
57            'atg_pos=i'=> \$atg_pwm_pos,
58            'atg_pct=i' => \$adj_pct,
59            'debug' => \$DEBUG,
60    );
61
62
63unless ($transcripts_file && $gff3_file) {
64    die $usage;
65}
66
67if ($adj_pct > 30 || $adj_pct < 0) {
68    die "Error, --adj_pct is out of range...  must be between 0 and 30 ";
69}
70
71main: {
72
73    # should be in the output directory
74
75    # get transdecoder start train files
76    my $transdecoder_dir = basename($transcripts_file) . ".transdecoder_dir";
77    unless (-d $transdecoder_dir) {
78        die "Error, cannot locate transdecoder working directory as: $transdecoder_dir";
79    }
80
81
82    my $pwm_plus = "${transdecoder_dir}/start_refinement.enhanced.+.pwm";
83    my $pwm_minus = "${transdecoder_dir}/start_refinement.-.pwm";
84
85    my $pwm_plus_obj = new PWM();
86    $pwm_plus_obj->load_pwm_from_file($pwm_plus);
87
88    my $pwm_minus_obj = new PWM();
89    $pwm_minus_obj->load_pwm_from_file($pwm_minus);
90
91    my $roc_file = "${transdecoder_dir}/start_refinement.enhanced.feature.scores.roc";
92    my $auc_file = "${transdecoder_dir}/start_refinement.enhanced.feature.scores.roc.auc";
93
94    my ($pwm_range, $min_threshold) = &parse_range_and_thresholds($auc_file, $roc_file);
95    print STDERR "-best pwm: $pwm_range, with score thresh: $min_threshold\n" if $DEBUG;
96
97    my ($pwm_range_left, $pwm_range_right) = split(",", $pwm_range); # extent around the atg
98    my $pwm_range_aref = [$atg_pwm_pos - $pwm_range_left -1, $atg_pwm_pos + 2 + $pwm_range_right -1]; # zero based
99
100    my $start_scores_log_file = "${transdecoder_dir}/start_refinement.alt_start_scores";
101    open(my $ofh_start_scores, ">$start_scores_log_file") or die "Error, cannot write to $start_scores_log_file";
102
103    print STDERR "-reading transcripts: $transcripts_file\n" if $DEBUG;
104    my $fasta_reader = new Fasta_reader($transcripts_file);
105
106    my %seqs = $fasta_reader->retrieve_all_seqs_hash();
107
108    print STDERR "-parsing orf annotations: $gff3_file\n" if $DEBUG;
109    my $gene_obj_indexer_href = {};
110
111    my $asmbl_id_to_gene_list_href = &GFF3_utils2::index_GFF3_gene_objs($gff3_file, $gene_obj_indexer_href);
112
113    my $num_starts_revised = 0;
114
115    foreach my $transcript_acc (sort keys %$asmbl_id_to_gene_list_href) {
116
117        print STDERR "-processing: $transcript_acc\n" if $DEBUG;
118
119        my @gene_ids = @{$asmbl_id_to_gene_list_href->{$transcript_acc}};
120
121
122        my $transcript_seq = uc $seqs{$transcript_acc};
123
124        foreach my $gene_id (@gene_ids) {
125
126            my $gene_obj = $gene_obj_indexer_href->{$gene_id};
127
128            my $revised_start_flag = &refine_start_codon_position($transcript_acc, $gene_id,
129                                                                  $gene_obj, $pwm_plus_obj, $pwm_minus_obj,
130                                                                  $pwm_range_aref, $min_threshold, $transcript_seq,
131                                                                  $ofh_start_scores);
132
133            if ($revised_start_flag) {
134                $num_starts_revised++;
135
136                ## update naming convention based on now having an updated start codon.
137                if ($gene_obj->{com_name} =~ /internal/) {
138                    $gene_obj->{com_name} =~ s/internal/3prime_partial/;
139                }
140                elsif ($gene_obj->{com_name} =~ /5prime_partial/) {
141                    $gene_obj->{com_name} =~ s/5prime_partial/complete/;
142                }
143            }
144            print $gene_obj->to_GFF3_format(source => "transdecoder") . "\n";
145
146
147        }
148    }
149
150    close $ofh_start_scores;
151
152    print STDERR "-number of revised start positions: $num_starts_revised\n";
153
154    exit(0);
155
156}
157
158####
159sub parse_range_and_thresholds {
160    my ($auc_file, $roc_file) = @_;
161
162    ## get the most accurate search range:
163    my $best_range = "";
164    my $best_range_score = 0;
165    {
166        open(my $fh, $auc_file) or die "Error, cannot open file: $auc_file";
167        while (<$fh>) {
168            chomp;
169            my ($range, $score) = split(/\t/);
170            if ($score > $best_range_score) {
171                $best_range_score = $score;
172                $best_range = $range;
173            }
174        }
175        close $fh;
176    }
177
178    # get most accurate threshold
179    my $best_min_threshold = undef;
180    my $best_F1 = 0;
181
182    {
183        open(my $fh, $roc_file) or die "Error, cannot open file $roc_file";
184        my $header = <$fh>;
185        while (<$fh>) {
186            chomp;
187            my @x = split(/\t/);
188            my $cat = $x[0];
189
190            unless ($cat eq $best_range) { next; }
191
192            my $thresh = $x[1];
193            my $F1 = $x[8];
194            if ($F1 > $best_F1) {
195                $best_F1 = $F1;
196                $best_min_threshold = $thresh;
197            }
198        }
199        close $fh;
200    }
201
202    return($best_range, $best_min_threshold);
203
204}
205
206
207####
208sub refine_start_codon_position {
209    my ($transcript_acc, $gene_id,
210        $gene_obj, $pwm_plus_obj, $pwm_minus_obj,
211        $pwm_range_aref, $min_threshold, $transcript_seq, $ofh_start_scores) = @_;
212
213    my $revised_start_flag = 0;
214
215    my $orient = $gene_obj->get_orientation();
216
217    my ($lend, $rend) = sort {$a<=>$b} $gene_obj->get_model_span();
218    my $orf_len = $rend - $lend + 1;
219    if ($orf_len % 3 != 0) {
220        die "Error, $orf_len is not mod 3 " . $gene_obj->toString();
221    }
222
223    my $orig_start_pos = $lend;
224
225    my $start_pos = $lend;
226    if ($orient eq '-') {
227        $transcript_seq = &reverse_complement($transcript_seq);
228        $start_pos = length($transcript_seq) - $rend + 1;
229    }
230
231
232    # only work on 5' partials
233    my $start_index = $start_pos - 1; # zero based
234
235    if (substr($transcript_seq, $start_index, 3) eq "ATG") {
236        return(0);
237    }
238
239    my @alt_starts;
240
241
242
243    my $max_search_pos = max($start_index + $adj_dist, $start_index + int($adj_pct * $orf_len / 100));
244
245
246    while ($transcript_seq =~ /(ATG)/g) {
247        my $pos = $-[0];
248        if ($pos > $max_search_pos) { last; } # too far
249        if ($pos > $start_index
250            &&
251            ($pos - $start_index) % 3  == 0) { # in frame start
252
253            push (@alt_starts, $pos);
254        }
255    }
256
257    unless (@alt_starts) {
258        return($revised_start_flag);
259    }
260
261    my $pwm_len = $pwm_plus_obj->get_pwm_length();
262
263
264    my $best_alt_start = undef;
265    my $best_alt_start_score = undef;
266
267    my @alt_start_scores;
268
269    foreach my $alt_start (@alt_starts) {
270        my $feature_seq_start = $alt_start - $atg_pwm_pos;
271        if ($feature_seq_start > 0) {
272            my $feature_seq = substr($transcript_seq, $feature_seq_start, $pwm_len);
273            unless (length($feature_seq) == $pwm_len) { next; }
274
275            my $alt_start_score = $pwm_plus_obj->score_plus_minus_pwm($feature_seq, $pwm_minus_obj,
276                                                                      pwm_range => $pwm_range_aref);
277
278            if ($alt_start_score eq "NA") { next; }
279
280            $alt_start_score = sprintf("%.3f", $alt_start_score);
281
282            my $short_feature_seq = &translate_sequence(substr($transcript_seq, $alt_start, 15), 1);
283
284            push (@alt_start_scores, "${alt_start}_${short_feature_seq}_${alt_start_score}");
285
286            if ($alt_start_score >= $min_threshold
287
288                &&
289                (  (! defined $best_alt_start_score) || $alt_start_score > $best_alt_start_score) ) {
290
291                $best_alt_start = $alt_start;
292                $best_alt_start_score = $alt_start_score;
293            }
294        }
295    }
296
297
298    if ($best_alt_start) {
299        $best_alt_start++; # make 1-based coord
300        my $new_start = $best_alt_start;
301        if ($orient eq '-') {
302            $new_start = length($transcript_seq) - $best_alt_start + 1;
303        }
304
305        my ($exon_obj) = $gene_obj->get_exons();
306        my $cds_obj = $exon_obj->get_CDS_obj();
307        $cds_obj->{end5} = $new_start;
308
309        $gene_obj->refine_gene_object();
310
311        $revised_start_flag = 1;
312
313        print STDERR "# refined start codon: $orig_start_pos -> $new_start\n" if $DEBUG;
314        print "# refined start codon: $orig_start_pos -> $new_start (score: $best_alt_start_score)\n";
315    }
316
317
318    if (@alt_start_scores) {
319        unshift(@alt_start_scores, $transcript_acc, $gene_id);
320        print $ofh_start_scores join("\t", @alt_start_scores) . "\n";
321    }
322
323    return($revised_start_flag);
324
325}
326
327