1#!/usr/local/bin/perl -w
2#
3# Given a positive file (.pos) and an output file of an rmark benchmark,
4# first remove all overlapping hits from the file, as defined below.
5# Then given the set of non-overlapping hits, create a positive-annotated
6# output file that lists the non-overlapping hits in the benchmark output file,
7# along with two extra fields indicating if each hit overlaps a positive or not
8# and whether the overlap is on the correct strand.
9#
10# The <rmark outfile> MUST be sorted properly by score (E-value or bit score), with
11# better scores preceding worse scores.
12#
13# Usage:    perl identify-positives.pl <posfile> <sorted rmark output>
14# Example:  ./identify-positives.pl rmark3.pos sorted-cmsearch.out
15#
16#
17use strict;
18my $usage          = "Usage: perl identify-positives <posfile> <rmark outfile>\n";
19
20if(scalar(@ARGV) != 2) {
21    print "Incorrect number of command line arguments.\n";
22    print $usage;
23
24    exit(1);
25}
26
27my($posfile, $outfile) = @ARGV;
28my $overlap_thr = 0.5;
29
30if (! -e $posfile) { die "$posfile doesn't exist"; }
31if (! -e $outfile) { die "$outfile doesn't exist"; }
32
33# Step 1: read the positives file and store information on where each positive is
34my @fields = ();
35my ($target_name, $family, $target_from, $target_to, $target_ori, $family_and_idx, $idx, $tmp, $line);
36my ($matching_fam, $matching_strand, $matching_idx);
37my %pos_fam_HH = ();
38my %pos_to_HH  = ();
39my %pos_ori_HH = ();
40my %pos_idx_HH = ();
41my %pos_order_HA = ();
42my @pout_lines;
43open(POS, "$posfile") || die "FAILED: to open $posfile";
44while ($line = <POS>)
45{
46    chomp $line;
47    if ($line =~ m/^\#/) { next; }
48    @fields   = split(' ', $line, 5);
49    $family_and_idx = $fields[0];
50    $target_name    = $fields[1];
51    $target_from    = $fields[3];
52    $target_to      = $fields[4];
53    if($family_and_idx !~ m/^\S+\/\d+$/) { die "ERROR invalid family and idx field $family_and_idx"; }
54    $idx = $family_and_idx;
55    $idx =~ s/^.+\///;
56    $family = $family_and_idx;
57    $family =~ s/\/\d+$//;
58    if($family =~ m/\//) { die "ERROR family $family from $family_and_idx, contains two '/', it should only have 1!"; }
59    if($target_name eq "decoy") { die "ERROR a family named \"decoy\" exists in the benchmark dataset, this is not allowed."; }
60    if($target_from > $target_to) { # on negative strand (Crick)
61	$tmp         = $target_to;
62	$target_to   = $target_from;
63	$target_from = $tmp;
64	$target_ori  = "-";
65    }
66    else {
67	$target_ori = "+";
68    }
69    $pos_fam_HH{$target_name}{$target_from} = $family;
70    $pos_to_HH{$target_name}{$target_from}  = $target_to;
71    $pos_ori_HH{$target_name}{$target_from} = $target_ori;
72    $pos_idx_HH{$target_name}{$target_from} = $idx;
73}
74close(POS);
75
76# Create sorted arrays of the start points in each target
77%pos_order_HA = ();
78my $pos_to;
79foreach $target_name (keys (%pos_to_HH)) {
80    @{$pos_order_HA{$target_name}} = sort {$a <=> $b} (keys (%{$pos_to_HH{$target_name}}));
81}
82
83# Step 2: Read the output file of hits and make sure the hits are sorted
84#         by score. Either increasing or decreasing.
85open(OUT,  "$outfile")   || die "FAILED: to open $outfile";
86my $seen_sc = 0;
87my $sc_should_increase = 0;
88my $sc_should_decrease = 0;
89my ($sc, $prv_sc, $overlap);
90while ($line = <OUT>)
91{
92    chomp $line;
93    if ($line =~ m/^\#/) { next; }
94    @fields   = split(' ', $line, 6);
95    $sc          = $fields[0];
96    if($seen_sc) {
97	if($sc > $prv_sc) {
98	    if($sc_should_decrease) { die "ERROR, results don't appear to be sorted by score"; }
99	    $sc_should_increase = 1;
100	}
101	elsif($sc < $prv_sc) {
102	    if($sc_should_increase) { die "ERROR, results don't appear to be sorted by score"; }
103	    $sc_should_decrease = 1;
104	}
105    }
106    $prv_sc = $sc;
107    $seen_sc = 1;
108}
109close(OUT);
110
111# Step 3: If we get here, the scores are sorted.  Reread the output
112#         file of hits and remove overlaps greedily.  Since they're
113#         sorted by score, we can just remove any hit as we read it IF
114#         it overlaps with any hit we've already read and stored.
115#         We don't actually remove hits, but we flag them, and then
116#         ignore them in step 4 below.
117my $nhits_total   = 0;
118my $nhits_kept    = 0;
119my $nhits_removed = 0;
120my @hit_usemeA = ();
121my @khit_targetA = ();
122my @khit_fromA = ();
123my @khit_toA = ();
124my @khit_oriA = ();
125my @khit_queryA = ();
126my ($h, $query );
127
128open(OUT,  "$outfile")   || die "FAILED: to open $outfile";
129while ($line = <OUT>)
130{
131    chomp $line;
132    if ($line =~ m/^\#/) { next; }
133    @fields   = split(' ', $line, 6);
134    $target_from = $fields[2];
135    $target_to   = $fields[3];
136    $target_name = $fields[4];
137    $query       = $fields[5];
138    if($target_from > $target_to) { # on negative strand (Crick)
139	$tmp         = $target_to;
140	$target_to   = $target_from;
141	$target_from = $tmp;
142	$target_ori  = "-";
143    }
144    else {
145	$target_ori = "+";
146    }
147
148    # check if this hit overlaps with all previously seen hits:
149    $hit_usemeA[$nhits_total] = 1; # we'll use it, unless we find an overlapping hit with better score already stored
150    for($h = 0; $h < $nhits_kept; $h++) {
151	if(($query eq $khit_queryA[$h]) && ($khit_targetA[$h] eq $target_name) && ($khit_oriA[$h] eq $target_ori)) {
152	    # hit is from same query in same target and in same orientation, check if it overlaps
153	    $overlap = GetOverlap($target_from, $target_to, $khit_fromA[$h], $khit_toA[$h]);
154	    if($overlap > $overlap_thr) {
155		# this hit overlaps another one with a better score
156		$hit_usemeA[$nhits_total] = 0;
157		$h = $nhits_kept; #breaks for loop above
158	    }
159	}
160    }
161    if($hit_usemeA[$nhits_total] == 1) {
162	push(@khit_targetA, $target_name);
163	push(@khit_fromA,   $target_from);
164	push(@khit_toA,     $target_to);
165	push(@khit_oriA,    $target_ori);
166	push(@khit_queryA,  $query);
167	$nhits_kept++;
168	#printf("KEPT HIT    %-20s  %-20s  %10d  %10d  %s\n", $query, $target_name, $target_from, $target_to, $target_ori);
169    }
170    else {
171	$nhits_removed++;
172	#printf("REMOVED HIT %-20s  %-20s  %10d  %10d  %s\n", $query, $target_name, $target_from, $target_to, $target_ori);
173    }
174    $nhits_total++;
175}
176close(OUT);
177#printf("nkept:    %d\n", $nhits_kept);
178#printf("nremoved: %d\n", $nhits_removed);
179
180# Step 4: Reead the output file for a final time determine if each hit overlaps a positive.
181#         Create the ".pout" file, which is identical to the ".out' file but with
182#         two additional fields telling if each hit overlaps with a positive or not.
183#         We actually don't have to reread the file, because we have all we
184#         need stored in the khit arrays, but I already had it implemented this way,
185#         and it works, so I left it.
186#
187open(OUT,  "$outfile")   || die "FAILED: to open $outfile";
188$h = 0;
189while ($line = <OUT>)
190{
191    chomp $line;
192    if ($line =~ m/^\#/) { next; }
193    # make sure we want to keep this hit
194    if($hit_usemeA[$h] == 1) {
195	@fields   = split(' ', $line, 6);
196	$target_from = $fields[2];
197	$target_to   = $fields[3];
198	$target_name = $fields[4];
199	if($target_from > $target_to) { # on negative strand (Crick)
200	    $tmp         = $target_to;
201	    $target_to   = $target_from;
202	    $target_from = $tmp;
203	    $target_ori  = "-";
204	}
205	else {
206	    $target_ori = "+";
207	}
208
209	if(! (exists ($pos_fam_HH{$target_name}))) {
210	    $matching_fam    = "decoy";
211	    $matching_strand = "decoy";
212	}
213	else {
214	    # target_name has at least 1 positive embedded within it,
215	    # check if this hit overlaps any positives
216	    #printf("calling CheckIfPositive: targetname: $target_name\n");
217	    CheckIfPositive($target_from, $target_to, $target_ori, $overlap_thr,
218			    \%{$pos_fam_HH{$target_name}},
219			    \%{$pos_to_HH{$target_name}},
220			    \%{$pos_ori_HH{$target_name}},
221			    \%{$pos_idx_HH{$target_name}},
222			    \@{$pos_order_HA{$target_name}},
223			    \$matching_fam, \$matching_strand, \$matching_idx);
224	}
225	# note we don't print pout_lines until the end, so if there's an error that causes
226	# us to exit early, we'll know it b/c the pout file not exist
227	push(@pout_lines, sprintf("%s %s/%d %s\n", $line, $matching_fam, $matching_idx, $matching_strand));
228	#printf("%s %s %s\n", $line, $matching_fam, $matching_idx, $matching_strand);
229    }
230    $h++;
231}
232close(OUT);
233
234if($h != $nhits_total) { die "ERROR, second pass read $h hits, first pass read $nhits_total hits"; }
235
236my $pout_line;
237foreach $pout_line (@pout_lines) {
238    print $pout_line;
239}
240
241#################################################################
242# Subroutine : CheckIfPositive()
243# Incept:      EPN, Mon Jun 28 08:08:02 2010
244#
245# Purpose:     Check if a given hit overlaps with a positive, on
246#              either strand. Return the name of the family
247#              of the overlapping positive in $ret_fam,
248#              and whether it is the same strand ("same") or
249#              opposite strand ("opposite") in $ret_strand.
250#
251# Arguments:
252# $target_name:  name of the target sequence of the hit
253# $target_from:  end point of hit in target, if > <target_to>, hit
254#                was on opposite strand
255# $target_to:    start point of hit in target
256# $overlap_thr:  fractional threshold for overlap; if length of overlap is > <overlapF>
257#                the length of the shorter of the two sequences, it counts
258# $pos_fam_HR:   reference to a hash, key: target start point (always < target end)
259#                of positive, value is family the positive belongs to.
260# $pos_to_HR:    reference to a hash, key: target start point (always < target end)
261#                of positive, value is end point of the positive (always > target start)
262# $pos_ori_HR:   reference to a hash, key: target start point (always < target end)
263#                value is target orientation, if "+", positive is on the positive strand,
264#                if "-", positive is on the negative strand
265# $pos_idx_HR:   reference to a hash, key: target start point (always < target end)
266#                value is the index of this target for its family, e.g. this is the
267#                nth tRNA, this is eventually useful for closer inspection of benchmark results
268# $pos_order_AR: reference to an array, the sorted start points of all positives
269#                in the target, this allows us to look through the three pos_*_HR
270#                in order of positives as they appear in the target.
271# $ret_fam:      RETURN: the family the hit overlaps with a positive from,
272#                if none, "-".
273# $ret_strand:   RETURN: "same" if the hit overlaps with a positive on the same strand,
274#                "opposite" if the hit overlaps with a positive on the opposite strand.
275#                if $ret_fam is "-", we return "decoy".
276# $ret_idx:      RETURN: index of the hit in the family, from %pos_idx_HR
277#################################################################
278sub CheckIfPositive {
279    my $narg_expected = 12;
280    if(scalar(@_) != $narg_expected) { printf STDERR ("ERROR, CheckIfPositive() entered with %d != %d input arguments.\n", scalar(@_), $narg_expected); exit(1); }
281    my ($target_from, $target_to, $target_ori, $overlap_thr, $pos_fam_HR, $pos_to_HR, $pos_ori_HR, $pos_idx_HR, $pos_order_AR, $ret_fam, $ret_strand, $ret_idx) = @_;
282
283    my $fam = "decoy";
284    my $strand = "decoy";
285    my $idx = 0;
286    my $already_found_match = 0;
287    my ($pos_from, $pos_to, $pos_fam, $pos_ori, $pos_idx, $overlap);
288
289    # Exhaustively search for all positives that overlap this hit If
290    # more than one positive overlaps this hit, it's an error that we
291    # don't know how to deal with, so we die with an ERROR message.
292    # If speed of this script becomes an issue, this is the chunk of
293    # code to rewrite, probably with a binary search for overlaps
294    # (though you'd have to take care to deal with the possibility
295    # that one hit overlaps 2 positives).
296    foreach $pos_from (@{$pos_order_AR}) {
297	if(! exists($pos_to_HR->{$pos_from}))  { die "ERROR, CheckIfPositive(), pos_to_HR->{$pos_from} does not exist"; }
298	if(! exists($pos_fam_HR->{$pos_from})) { die "ERROR, CheckIfPositive(), pos_fam_HR->{$pos_from} does not exist"; }
299	if(! exists($pos_ori_HR->{$pos_from})) { die "ERROR, CheckIfPositive(), pos_ori_HR->{$pos_from} does not exist"; }
300	$pos_to  = $pos_to_HR->{$pos_from};
301	$pos_fam = $pos_fam_HR->{$pos_from};
302	$pos_ori = $pos_ori_HR->{$pos_from};
303	$pos_idx = $pos_idx_HR->{$pos_from};
304	#printf("\tcalling GetOverlap: target: %d..%d pos: %d..%d $pos_fam $pos_ori $pos_idx\n", $target_from, $target_to, $pos_from, $pos_to);
305	$overlap = GetOverlap($target_from, $target_to, $pos_from, $pos_to);
306	if($overlap > $overlap_thr) {
307	    #printf("\t\toverlap match!\n");
308	    if($already_found_match == 1) { die "ERROR, CheckIfPositive(), two positives overlap with $target_name $target_from..$target_to; not allowed"; }
309	    $fam = $pos_fam;
310	    $strand = ($pos_ori eq $target_ori) ? "same" : "opposite";
311	    $idx = $pos_idx;
312	    $already_found_match = 1;
313	}
314    }
315    $$ret_fam    = $fam;
316    $$ret_strand = $strand;
317    $$ret_idx    = $idx;
318    return;
319}
320
321
322#################################################################
323# Subroutine : GetOverlap()
324# Incept:      EPN, Mon Jun 28 09:09:57 2010
325#
326# Purpose:     Return the fractional overlap of two hits.
327#              Fractional overlap is the number of overlapping positions
328#              divided by the shorter of the lengths of the two hits.
329#
330# Arguments:
331# $from1:        start position of hit 1
332# $to1:          end position of hit 1
333# $from2:        start position of hit 2
334# $to2:          end position of hit 2
335#################################################################
336sub GetOverlap {
337    my $narg_expected = 4;
338    if(scalar(@_) != $narg_expected) { printf STDERR ("ERROR, GetOverlap() entered with %d != %d input arguments.\n", scalar(@_), $narg_expected); exit(1); }
339    my ($from1, $to1, $from2, $to2) = @_;
340
341    my($tmp, $minlen);
342
343    if($from1 > $to1) { die "ERROR, GetOverlap(), from1 > to1\n"; }
344    if($from2 > $to2) { die "ERROR, GetOverlap(), from2 > to2\n"; }
345
346    $minlen = $to1 - $from1 + 1;
347    if($minlen > ($to2 - $from2 + 1)) { $minlen = ($to2 - $from2 + 1); }
348
349    # Given: $from1 <= $to1 and $from2 <= $to2.
350
351    # Swap if nec so that $from1 <= $from2.
352    if($from1 > $from2) {
353	$tmp   = $from1; $from1 = $from2; $from2 = $tmp;
354	$tmp   =   $to1;   $to1 =   $to2;   $to2 = $tmp;
355    }
356
357    # 3 possible cases:
358    # Case 1. $from1 <=   $to1 <  $from2 <=   $to2  Overlap is 0
359    # Case 2. $from1 <= $from2 <=   $to1 <    $to2
360    # Case 3. $from1 <= $from2 <=   $to2 <=   $to1
361    if($to1 < $from2) { return 0.; }                             # case 1
362    if($to1 <   $to2) { return ($to1 - $from2 + 1) / $minlen; }  # case 2
363    if($to2 <=  $to1) { return ($to2 - $from2 + 1) / $minlen; }  # case 3
364    die "ERROR, unforeseen case in GetOverlap $from1..$to1 and $from2..$to2";
365}
366