1#!/usr/bin/env perl 2 3use strict; 4use warnings; 5use FindBin; 6use lib ("/usr/local/lib/perl5/site_perl/transdecoder"); 7use Gene_obj; 8use GFF3_utils2; 9use Fasta_reader; 10use Nuc_translator; 11use Carp; 12use Data::Dumper; 13use List::Util qw (max); 14use Getopt::Long qw(:config posix_default no_ignore_case bundling pass_through); 15use PWM; 16use File::Basename; 17 18 19my $atg_pwm_pos = 20; 20my $adj_dist = 30; 21my $adj_pct = 15; 22 23my $usage = <<__EOUSAGE__; 24 25##################################################################################### 26# 27# --transcripts <string> transcripts.fasta file targeted by transdecoder 28# 29# --gff3_file <string> target gff3 file 30# 31# optional: 32# 33# --adj_dist <int> distance allowed for start adjustment (default: $adj_dist) 34# 35# --adj_pct <int> pecentage of orf length for examining start adjustment (default: $adj_pct) 36# 37# --atg_pos <int> atg index position in pwm (default: $atg_pwm_pos) 38# 39# --debug verbose 40# 41####################################################################################### 42 43 44 45__EOUSAGE__ 46 47 ; 48 49 50my $transcripts_file; 51my $gff3_file; 52my $DEBUG = 0; 53 54&GetOptions('transcripts=s' => \$transcripts_file, 55 'gff3_file=s' => \$gff3_file, 56 'adj_dist=i' => \$adj_dist, 57 'atg_pos=i'=> \$atg_pwm_pos, 58 'atg_pct=i' => \$adj_pct, 59 'debug' => \$DEBUG, 60 ); 61 62 63unless ($transcripts_file && $gff3_file) { 64 die $usage; 65} 66 67if ($adj_pct > 30 || $adj_pct < 0) { 68 die "Error, --adj_pct is out of range... must be between 0 and 30 "; 69} 70 71main: { 72 73 # should be in the output directory 74 75 # get transdecoder start train files 76 my $transdecoder_dir = basename($transcripts_file) . ".transdecoder_dir"; 77 unless (-d $transdecoder_dir) { 78 die "Error, cannot locate transdecoder working directory as: $transdecoder_dir"; 79 } 80 81 82 my $pwm_plus = "${transdecoder_dir}/start_refinement.enhanced.+.pwm"; 83 my $pwm_minus = "${transdecoder_dir}/start_refinement.-.pwm"; 84 85 my $pwm_plus_obj = new PWM(); 86 $pwm_plus_obj->load_pwm_from_file($pwm_plus); 87 88 my $pwm_minus_obj = new PWM(); 89 $pwm_minus_obj->load_pwm_from_file($pwm_minus); 90 91 my $roc_file = "${transdecoder_dir}/start_refinement.enhanced.feature.scores.roc"; 92 my $auc_file = "${transdecoder_dir}/start_refinement.enhanced.feature.scores.roc.auc"; 93 94 my ($pwm_range, $min_threshold) = &parse_range_and_thresholds($auc_file, $roc_file); 95 print STDERR "-best pwm: $pwm_range, with score thresh: $min_threshold\n" if $DEBUG; 96 97 my ($pwm_range_left, $pwm_range_right) = split(",", $pwm_range); # extent around the atg 98 my $pwm_range_aref = [$atg_pwm_pos - $pwm_range_left -1, $atg_pwm_pos + 2 + $pwm_range_right -1]; # zero based 99 100 my $start_scores_log_file = "${transdecoder_dir}/start_refinement.alt_start_scores"; 101 open(my $ofh_start_scores, ">$start_scores_log_file") or die "Error, cannot write to $start_scores_log_file"; 102 103 print STDERR "-reading transcripts: $transcripts_file\n" if $DEBUG; 104 my $fasta_reader = new Fasta_reader($transcripts_file); 105 106 my %seqs = $fasta_reader->retrieve_all_seqs_hash(); 107 108 print STDERR "-parsing orf annotations: $gff3_file\n" if $DEBUG; 109 my $gene_obj_indexer_href = {}; 110 111 my $asmbl_id_to_gene_list_href = &GFF3_utils2::index_GFF3_gene_objs($gff3_file, $gene_obj_indexer_href); 112 113 my $num_starts_revised = 0; 114 115 foreach my $transcript_acc (sort keys %$asmbl_id_to_gene_list_href) { 116 117 print STDERR "-processing: $transcript_acc\n" if $DEBUG; 118 119 my @gene_ids = @{$asmbl_id_to_gene_list_href->{$transcript_acc}}; 120 121 122 my $transcript_seq = uc $seqs{$transcript_acc}; 123 124 foreach my $gene_id (@gene_ids) { 125 126 my $gene_obj = $gene_obj_indexer_href->{$gene_id}; 127 128 my $revised_start_flag = &refine_start_codon_position($transcript_acc, $gene_id, 129 $gene_obj, $pwm_plus_obj, $pwm_minus_obj, 130 $pwm_range_aref, $min_threshold, $transcript_seq, 131 $ofh_start_scores); 132 133 if ($revised_start_flag) { 134 $num_starts_revised++; 135 136 ## update naming convention based on now having an updated start codon. 137 if ($gene_obj->{com_name} =~ /internal/) { 138 $gene_obj->{com_name} =~ s/internal/3prime_partial/; 139 } 140 elsif ($gene_obj->{com_name} =~ /5prime_partial/) { 141 $gene_obj->{com_name} =~ s/5prime_partial/complete/; 142 } 143 } 144 print $gene_obj->to_GFF3_format(source => "transdecoder") . "\n"; 145 146 147 } 148 } 149 150 close $ofh_start_scores; 151 152 print STDERR "-number of revised start positions: $num_starts_revised\n"; 153 154 exit(0); 155 156} 157 158#### 159sub parse_range_and_thresholds { 160 my ($auc_file, $roc_file) = @_; 161 162 ## get the most accurate search range: 163 my $best_range = ""; 164 my $best_range_score = 0; 165 { 166 open(my $fh, $auc_file) or die "Error, cannot open file: $auc_file"; 167 while (<$fh>) { 168 chomp; 169 my ($range, $score) = split(/\t/); 170 if ($score > $best_range_score) { 171 $best_range_score = $score; 172 $best_range = $range; 173 } 174 } 175 close $fh; 176 } 177 178 # get most accurate threshold 179 my $best_min_threshold = undef; 180 my $best_F1 = 0; 181 182 { 183 open(my $fh, $roc_file) or die "Error, cannot open file $roc_file"; 184 my $header = <$fh>; 185 while (<$fh>) { 186 chomp; 187 my @x = split(/\t/); 188 my $cat = $x[0]; 189 190 unless ($cat eq $best_range) { next; } 191 192 my $thresh = $x[1]; 193 my $F1 = $x[8]; 194 if ($F1 > $best_F1) { 195 $best_F1 = $F1; 196 $best_min_threshold = $thresh; 197 } 198 } 199 close $fh; 200 } 201 202 return($best_range, $best_min_threshold); 203 204} 205 206 207#### 208sub refine_start_codon_position { 209 my ($transcript_acc, $gene_id, 210 $gene_obj, $pwm_plus_obj, $pwm_minus_obj, 211 $pwm_range_aref, $min_threshold, $transcript_seq, $ofh_start_scores) = @_; 212 213 my $revised_start_flag = 0; 214 215 my $orient = $gene_obj->get_orientation(); 216 217 my ($lend, $rend) = sort {$a<=>$b} $gene_obj->get_model_span(); 218 my $orf_len = $rend - $lend + 1; 219 if ($orf_len % 3 != 0) { 220 die "Error, $orf_len is not mod 3 " . $gene_obj->toString(); 221 } 222 223 my $orig_start_pos = $lend; 224 225 my $start_pos = $lend; 226 if ($orient eq '-') { 227 $transcript_seq = &reverse_complement($transcript_seq); 228 $start_pos = length($transcript_seq) - $rend + 1; 229 } 230 231 232 # only work on 5' partials 233 my $start_index = $start_pos - 1; # zero based 234 235 if (substr($transcript_seq, $start_index, 3) eq "ATG") { 236 return(0); 237 } 238 239 my @alt_starts; 240 241 242 243 my $max_search_pos = max($start_index + $adj_dist, $start_index + int($adj_pct * $orf_len / 100)); 244 245 246 while ($transcript_seq =~ /(ATG)/g) { 247 my $pos = $-[0]; 248 if ($pos > $max_search_pos) { last; } # too far 249 if ($pos > $start_index 250 && 251 ($pos - $start_index) % 3 == 0) { # in frame start 252 253 push (@alt_starts, $pos); 254 } 255 } 256 257 unless (@alt_starts) { 258 return($revised_start_flag); 259 } 260 261 my $pwm_len = $pwm_plus_obj->get_pwm_length(); 262 263 264 my $best_alt_start = undef; 265 my $best_alt_start_score = undef; 266 267 my @alt_start_scores; 268 269 foreach my $alt_start (@alt_starts) { 270 my $feature_seq_start = $alt_start - $atg_pwm_pos; 271 if ($feature_seq_start > 0) { 272 my $feature_seq = substr($transcript_seq, $feature_seq_start, $pwm_len); 273 unless (length($feature_seq) == $pwm_len) { next; } 274 275 my $alt_start_score = $pwm_plus_obj->score_plus_minus_pwm($feature_seq, $pwm_minus_obj, 276 pwm_range => $pwm_range_aref); 277 278 if ($alt_start_score eq "NA") { next; } 279 280 $alt_start_score = sprintf("%.3f", $alt_start_score); 281 282 my $short_feature_seq = &translate_sequence(substr($transcript_seq, $alt_start, 15), 1); 283 284 push (@alt_start_scores, "${alt_start}_${short_feature_seq}_${alt_start_score}"); 285 286 if ($alt_start_score >= $min_threshold 287 288 && 289 ( (! defined $best_alt_start_score) || $alt_start_score > $best_alt_start_score) ) { 290 291 $best_alt_start = $alt_start; 292 $best_alt_start_score = $alt_start_score; 293 } 294 } 295 } 296 297 298 if ($best_alt_start) { 299 $best_alt_start++; # make 1-based coord 300 my $new_start = $best_alt_start; 301 if ($orient eq '-') { 302 $new_start = length($transcript_seq) - $best_alt_start + 1; 303 } 304 305 my ($exon_obj) = $gene_obj->get_exons(); 306 my $cds_obj = $exon_obj->get_CDS_obj(); 307 $cds_obj->{end5} = $new_start; 308 309 $gene_obj->refine_gene_object(); 310 311 $revised_start_flag = 1; 312 313 print STDERR "# refined start codon: $orig_start_pos -> $new_start\n" if $DEBUG; 314 print "# refined start codon: $orig_start_pos -> $new_start (score: $best_alt_start_score)\n"; 315 } 316 317 318 if (@alt_start_scores) { 319 unshift(@alt_start_scores, $transcript_acc, $gene_id); 320 print $ofh_start_scores join("\t", @alt_start_scores) . "\n"; 321 } 322 323 return($revised_start_flag); 324 325} 326 327