1#!/usr/local/bin/perl -w 2# 3# Given a positive file (.pos) and an output file of an rmark benchmark, 4# first remove all overlapping hits from the file, as defined below. 5# Then given the set of non-overlapping hits, create a positive-annotated 6# output file that lists the non-overlapping hits in the benchmark output file, 7# along with two extra fields indicating if each hit overlaps a positive or not 8# and whether the overlap is on the correct strand. 9# 10# The <rmark outfile> MUST be sorted properly by score (E-value or bit score), with 11# better scores preceding worse scores. 12# 13# Usage: perl identify-positives.pl <posfile> <sorted rmark output> 14# Example: ./identify-positives.pl rmark3.pos sorted-cmsearch.out 15# 16# 17use strict; 18my $usage = "Usage: perl identify-positives <posfile> <rmark outfile>\n"; 19 20if(scalar(@ARGV) != 2) { 21 print "Incorrect number of command line arguments.\n"; 22 print $usage; 23 24 exit(1); 25} 26 27my($posfile, $outfile) = @ARGV; 28my $overlap_thr = 0.5; 29 30if (! -e $posfile) { die "$posfile doesn't exist"; } 31if (! -e $outfile) { die "$outfile doesn't exist"; } 32 33# Step 1: read the positives file and store information on where each positive is 34my @fields = (); 35my ($target_name, $family, $target_from, $target_to, $target_ori, $family_and_idx, $idx, $tmp, $line); 36my ($matching_fam, $matching_strand, $matching_idx); 37my %pos_fam_HH = (); 38my %pos_to_HH = (); 39my %pos_ori_HH = (); 40my %pos_idx_HH = (); 41my %pos_order_HA = (); 42my @pout_lines; 43open(POS, "$posfile") || die "FAILED: to open $posfile"; 44while ($line = <POS>) 45{ 46 chomp $line; 47 if ($line =~ m/^\#/) { next; } 48 @fields = split(' ', $line, 5); 49 $family_and_idx = $fields[0]; 50 $target_name = $fields[1]; 51 $target_from = $fields[3]; 52 $target_to = $fields[4]; 53 if($family_and_idx !~ m/^\S+\/\d+$/) { die "ERROR invalid family and idx field $family_and_idx"; } 54 $idx = $family_and_idx; 55 $idx =~ s/^.+\///; 56 $family = $family_and_idx; 57 $family =~ s/\/\d+$//; 58 if($family =~ m/\//) { die "ERROR family $family from $family_and_idx, contains two '/', it should only have 1!"; } 59 if($target_name eq "decoy") { die "ERROR a family named \"decoy\" exists in the benchmark dataset, this is not allowed."; } 60 if($target_from > $target_to) { # on negative strand (Crick) 61 $tmp = $target_to; 62 $target_to = $target_from; 63 $target_from = $tmp; 64 $target_ori = "-"; 65 } 66 else { 67 $target_ori = "+"; 68 } 69 $pos_fam_HH{$target_name}{$target_from} = $family; 70 $pos_to_HH{$target_name}{$target_from} = $target_to; 71 $pos_ori_HH{$target_name}{$target_from} = $target_ori; 72 $pos_idx_HH{$target_name}{$target_from} = $idx; 73} 74close(POS); 75 76# Create sorted arrays of the start points in each target 77%pos_order_HA = (); 78my $pos_to; 79foreach $target_name (keys (%pos_to_HH)) { 80 @{$pos_order_HA{$target_name}} = sort {$a <=> $b} (keys (%{$pos_to_HH{$target_name}})); 81} 82 83# Step 2: Read the output file of hits and make sure the hits are sorted 84# by score. Either increasing or decreasing. 85open(OUT, "$outfile") || die "FAILED: to open $outfile"; 86my $seen_sc = 0; 87my $sc_should_increase = 0; 88my $sc_should_decrease = 0; 89my ($sc, $prv_sc, $overlap); 90while ($line = <OUT>) 91{ 92 chomp $line; 93 if ($line =~ m/^\#/) { next; } 94 @fields = split(' ', $line, 6); 95 $sc = $fields[0]; 96 if($seen_sc) { 97 if($sc > $prv_sc) { 98 if($sc_should_decrease) { die "ERROR, results don't appear to be sorted by score"; } 99 $sc_should_increase = 1; 100 } 101 elsif($sc < $prv_sc) { 102 if($sc_should_increase) { die "ERROR, results don't appear to be sorted by score"; } 103 $sc_should_decrease = 1; 104 } 105 } 106 $prv_sc = $sc; 107 $seen_sc = 1; 108} 109close(OUT); 110 111# Step 3: If we get here, the scores are sorted. Reread the output 112# file of hits and remove overlaps greedily. Since they're 113# sorted by score, we can just remove any hit as we read it IF 114# it overlaps with any hit we've already read and stored. 115# We don't actually remove hits, but we flag them, and then 116# ignore them in step 4 below. 117my $nhits_total = 0; 118my $nhits_kept = 0; 119my $nhits_removed = 0; 120my @hit_usemeA = (); 121my @khit_targetA = (); 122my @khit_fromA = (); 123my @khit_toA = (); 124my @khit_oriA = (); 125my @khit_queryA = (); 126my ($h, $query ); 127 128open(OUT, "$outfile") || die "FAILED: to open $outfile"; 129while ($line = <OUT>) 130{ 131 chomp $line; 132 if ($line =~ m/^\#/) { next; } 133 @fields = split(' ', $line, 6); 134 $target_from = $fields[2]; 135 $target_to = $fields[3]; 136 $target_name = $fields[4]; 137 $query = $fields[5]; 138 if($target_from > $target_to) { # on negative strand (Crick) 139 $tmp = $target_to; 140 $target_to = $target_from; 141 $target_from = $tmp; 142 $target_ori = "-"; 143 } 144 else { 145 $target_ori = "+"; 146 } 147 148 # check if this hit overlaps with all previously seen hits: 149 $hit_usemeA[$nhits_total] = 1; # we'll use it, unless we find an overlapping hit with better score already stored 150 for($h = 0; $h < $nhits_kept; $h++) { 151 if(($query eq $khit_queryA[$h]) && ($khit_targetA[$h] eq $target_name) && ($khit_oriA[$h] eq $target_ori)) { 152 # hit is from same query in same target and in same orientation, check if it overlaps 153 $overlap = GetOverlap($target_from, $target_to, $khit_fromA[$h], $khit_toA[$h]); 154 if($overlap > $overlap_thr) { 155 # this hit overlaps another one with a better score 156 $hit_usemeA[$nhits_total] = 0; 157 $h = $nhits_kept; #breaks for loop above 158 } 159 } 160 } 161 if($hit_usemeA[$nhits_total] == 1) { 162 push(@khit_targetA, $target_name); 163 push(@khit_fromA, $target_from); 164 push(@khit_toA, $target_to); 165 push(@khit_oriA, $target_ori); 166 push(@khit_queryA, $query); 167 $nhits_kept++; 168 #printf("KEPT HIT %-20s %-20s %10d %10d %s\n", $query, $target_name, $target_from, $target_to, $target_ori); 169 } 170 else { 171 $nhits_removed++; 172 #printf("REMOVED HIT %-20s %-20s %10d %10d %s\n", $query, $target_name, $target_from, $target_to, $target_ori); 173 } 174 $nhits_total++; 175} 176close(OUT); 177#printf("nkept: %d\n", $nhits_kept); 178#printf("nremoved: %d\n", $nhits_removed); 179 180# Step 4: Reead the output file for a final time determine if each hit overlaps a positive. 181# Create the ".pout" file, which is identical to the ".out' file but with 182# two additional fields telling if each hit overlaps with a positive or not. 183# We actually don't have to reread the file, because we have all we 184# need stored in the khit arrays, but I already had it implemented this way, 185# and it works, so I left it. 186# 187open(OUT, "$outfile") || die "FAILED: to open $outfile"; 188$h = 0; 189while ($line = <OUT>) 190{ 191 chomp $line; 192 if ($line =~ m/^\#/) { next; } 193 # make sure we want to keep this hit 194 if($hit_usemeA[$h] == 1) { 195 @fields = split(' ', $line, 6); 196 $target_from = $fields[2]; 197 $target_to = $fields[3]; 198 $target_name = $fields[4]; 199 if($target_from > $target_to) { # on negative strand (Crick) 200 $tmp = $target_to; 201 $target_to = $target_from; 202 $target_from = $tmp; 203 $target_ori = "-"; 204 } 205 else { 206 $target_ori = "+"; 207 } 208 209 if(! (exists ($pos_fam_HH{$target_name}))) { 210 $matching_fam = "decoy"; 211 $matching_strand = "decoy"; 212 } 213 else { 214 # target_name has at least 1 positive embedded within it, 215 # check if this hit overlaps any positives 216 #printf("calling CheckIfPositive: targetname: $target_name\n"); 217 CheckIfPositive($target_from, $target_to, $target_ori, $overlap_thr, 218 \%{$pos_fam_HH{$target_name}}, 219 \%{$pos_to_HH{$target_name}}, 220 \%{$pos_ori_HH{$target_name}}, 221 \%{$pos_idx_HH{$target_name}}, 222 \@{$pos_order_HA{$target_name}}, 223 \$matching_fam, \$matching_strand, \$matching_idx); 224 } 225 # note we don't print pout_lines until the end, so if there's an error that causes 226 # us to exit early, we'll know it b/c the pout file not exist 227 push(@pout_lines, sprintf("%s %s/%d %s\n", $line, $matching_fam, $matching_idx, $matching_strand)); 228 #printf("%s %s %s\n", $line, $matching_fam, $matching_idx, $matching_strand); 229 } 230 $h++; 231} 232close(OUT); 233 234if($h != $nhits_total) { die "ERROR, second pass read $h hits, first pass read $nhits_total hits"; } 235 236my $pout_line; 237foreach $pout_line (@pout_lines) { 238 print $pout_line; 239} 240 241################################################################# 242# Subroutine : CheckIfPositive() 243# Incept: EPN, Mon Jun 28 08:08:02 2010 244# 245# Purpose: Check if a given hit overlaps with a positive, on 246# either strand. Return the name of the family 247# of the overlapping positive in $ret_fam, 248# and whether it is the same strand ("same") or 249# opposite strand ("opposite") in $ret_strand. 250# 251# Arguments: 252# $target_name: name of the target sequence of the hit 253# $target_from: end point of hit in target, if > <target_to>, hit 254# was on opposite strand 255# $target_to: start point of hit in target 256# $overlap_thr: fractional threshold for overlap; if length of overlap is > <overlapF> 257# the length of the shorter of the two sequences, it counts 258# $pos_fam_HR: reference to a hash, key: target start point (always < target end) 259# of positive, value is family the positive belongs to. 260# $pos_to_HR: reference to a hash, key: target start point (always < target end) 261# of positive, value is end point of the positive (always > target start) 262# $pos_ori_HR: reference to a hash, key: target start point (always < target end) 263# value is target orientation, if "+", positive is on the positive strand, 264# if "-", positive is on the negative strand 265# $pos_idx_HR: reference to a hash, key: target start point (always < target end) 266# value is the index of this target for its family, e.g. this is the 267# nth tRNA, this is eventually useful for closer inspection of benchmark results 268# $pos_order_AR: reference to an array, the sorted start points of all positives 269# in the target, this allows us to look through the three pos_*_HR 270# in order of positives as they appear in the target. 271# $ret_fam: RETURN: the family the hit overlaps with a positive from, 272# if none, "-". 273# $ret_strand: RETURN: "same" if the hit overlaps with a positive on the same strand, 274# "opposite" if the hit overlaps with a positive on the opposite strand. 275# if $ret_fam is "-", we return "decoy". 276# $ret_idx: RETURN: index of the hit in the family, from %pos_idx_HR 277################################################################# 278sub CheckIfPositive { 279 my $narg_expected = 12; 280 if(scalar(@_) != $narg_expected) { printf STDERR ("ERROR, CheckIfPositive() entered with %d != %d input arguments.\n", scalar(@_), $narg_expected); exit(1); } 281 my ($target_from, $target_to, $target_ori, $overlap_thr, $pos_fam_HR, $pos_to_HR, $pos_ori_HR, $pos_idx_HR, $pos_order_AR, $ret_fam, $ret_strand, $ret_idx) = @_; 282 283 my $fam = "decoy"; 284 my $strand = "decoy"; 285 my $idx = 0; 286 my $already_found_match = 0; 287 my ($pos_from, $pos_to, $pos_fam, $pos_ori, $pos_idx, $overlap); 288 289 # Exhaustively search for all positives that overlap this hit If 290 # more than one positive overlaps this hit, it's an error that we 291 # don't know how to deal with, so we die with an ERROR message. 292 # If speed of this script becomes an issue, this is the chunk of 293 # code to rewrite, probably with a binary search for overlaps 294 # (though you'd have to take care to deal with the possibility 295 # that one hit overlaps 2 positives). 296 foreach $pos_from (@{$pos_order_AR}) { 297 if(! exists($pos_to_HR->{$pos_from})) { die "ERROR, CheckIfPositive(), pos_to_HR->{$pos_from} does not exist"; } 298 if(! exists($pos_fam_HR->{$pos_from})) { die "ERROR, CheckIfPositive(), pos_fam_HR->{$pos_from} does not exist"; } 299 if(! exists($pos_ori_HR->{$pos_from})) { die "ERROR, CheckIfPositive(), pos_ori_HR->{$pos_from} does not exist"; } 300 $pos_to = $pos_to_HR->{$pos_from}; 301 $pos_fam = $pos_fam_HR->{$pos_from}; 302 $pos_ori = $pos_ori_HR->{$pos_from}; 303 $pos_idx = $pos_idx_HR->{$pos_from}; 304 #printf("\tcalling GetOverlap: target: %d..%d pos: %d..%d $pos_fam $pos_ori $pos_idx\n", $target_from, $target_to, $pos_from, $pos_to); 305 $overlap = GetOverlap($target_from, $target_to, $pos_from, $pos_to); 306 if($overlap > $overlap_thr) { 307 #printf("\t\toverlap match!\n"); 308 if($already_found_match == 1) { die "ERROR, CheckIfPositive(), two positives overlap with $target_name $target_from..$target_to; not allowed"; } 309 $fam = $pos_fam; 310 $strand = ($pos_ori eq $target_ori) ? "same" : "opposite"; 311 $idx = $pos_idx; 312 $already_found_match = 1; 313 } 314 } 315 $$ret_fam = $fam; 316 $$ret_strand = $strand; 317 $$ret_idx = $idx; 318 return; 319} 320 321 322################################################################# 323# Subroutine : GetOverlap() 324# Incept: EPN, Mon Jun 28 09:09:57 2010 325# 326# Purpose: Return the fractional overlap of two hits. 327# Fractional overlap is the number of overlapping positions 328# divided by the shorter of the lengths of the two hits. 329# 330# Arguments: 331# $from1: start position of hit 1 332# $to1: end position of hit 1 333# $from2: start position of hit 2 334# $to2: end position of hit 2 335################################################################# 336sub GetOverlap { 337 my $narg_expected = 4; 338 if(scalar(@_) != $narg_expected) { printf STDERR ("ERROR, GetOverlap() entered with %d != %d input arguments.\n", scalar(@_), $narg_expected); exit(1); } 339 my ($from1, $to1, $from2, $to2) = @_; 340 341 my($tmp, $minlen); 342 343 if($from1 > $to1) { die "ERROR, GetOverlap(), from1 > to1\n"; } 344 if($from2 > $to2) { die "ERROR, GetOverlap(), from2 > to2\n"; } 345 346 $minlen = $to1 - $from1 + 1; 347 if($minlen > ($to2 - $from2 + 1)) { $minlen = ($to2 - $from2 + 1); } 348 349 # Given: $from1 <= $to1 and $from2 <= $to2. 350 351 # Swap if nec so that $from1 <= $from2. 352 if($from1 > $from2) { 353 $tmp = $from1; $from1 = $from2; $from2 = $tmp; 354 $tmp = $to1; $to1 = $to2; $to2 = $tmp; 355 } 356 357 # 3 possible cases: 358 # Case 1. $from1 <= $to1 < $from2 <= $to2 Overlap is 0 359 # Case 2. $from1 <= $from2 <= $to1 < $to2 360 # Case 3. $from1 <= $from2 <= $to2 <= $to1 361 if($to1 < $from2) { return 0.; } # case 1 362 if($to1 < $to2) { return ($to1 - $from2 + 1) / $minlen; } # case 2 363 if($to2 <= $to1) { return ($to2 - $from2 + 1) / $minlen; } # case 3 364 die "ERROR, unforeseen case in GetOverlap $from1..$to1 and $from2..$to2"; 365} 366