1#!/usr/local/bin/perl -w 2 3# 4# Copyright 2011, Ben Langmead <langmea@cs.jhu.edu> 5# 6# This file is part of Bowtie 2. 7# 8# Bowtie 2 is free software: you can redistribute it and/or modify 9# it under the terms of the GNU General Public License as published by 10# the Free Software Foundation, either version 3 of the License, or 11# (at your option) any later version. 12# 13# Bowtie 2 is distributed in the hope that it will be useful, 14# but WITHOUT ANY WARRANTY; without even the implied warranty of 15# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16# GNU General Public License for more details. 17# 18# You should have received a copy of the GNU General Public License 19# along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>. 20# 21 22## 23# infer_fraglen.pl 24# 25# Infer fragment length by looking for unique alignments for mates 26# (separately), then piecing those together and building a distribution. 27# 28 29use strict; 30use warnings; 31use Getopt::Long; 32use FindBin qw($Bin); 33 34my $m1 = ""; 35my $m2 = ""; 36my $index = ""; 37my $bowtie_args = ""; 38my $bowtie2 = "$Bin/../bowtie2"; 39my $debug = 0; 40my $binsz = 10; 41my $mapq_cutoff = 30; 42my $upto = undef; 43 44sub dieusage { 45 my $msg = shift; 46 my $exitlevel = shift; 47 $exitlevel = $exitlevel || 1; 48 print STDERR "$msg\n"; 49 exit $exitlevel; 50} 51 52## 53# Given a basename, return true iff all index files exist. 54# 55sub checkIndex($) { 56 my $idx = shift; 57 my $ext = "bt2"; 58 return -f "$idx.1.$ext" && 59 -f "$idx.2.$ext" && 60 -f "$idx.3.$ext" && 61 -f "$idx.4.$ext" && 62 -f "$idx.rev.1.$ext" && 63 -f "$idx.rev.2.$ext"; 64} 65 66GetOptions ( 67 "bowtie2=s" => \$bowtie2, 68 "index=s" => \$index, 69 "m1=s" => \$m1, 70 "m2=s" => \$m2, 71 "upto=i" => \$upto, 72 "mapq_cutoff=i" => \$mapq_cutoff, 73 "debug" => \$debug, 74 "bowtie-args=s" => \$bowtie_args) || dieusage("Bad option", 1); 75 76die "Must specify --m1" if $m1 eq ""; 77die "Must specify --m2" if $m2 eq ""; 78die "Must specify --index" if $index eq ""; 79$m1 =~ s/^~/$ENV{HOME}/; 80$m2 =~ s/^~/$ENV{HOME}/; 81$index =~ s/^~/$ENV{HOME}/; 82die "Bad bowtie path: $bowtie2" if system("$bowtie2 --version >/dev/null 2>/dev/null") != 0; 83die "Bad index: $index" if !checkIndex($index); 84 85# Hash holding all the observed fragment orientations and lengths 86my %fragments = (); 87my $m1cmd = ($m1 =~ /\.gz$/) ? "gzip -dc $m1" : "cat $m1"; 88my $m2cmd = ($m2 =~ /\.gz$/) ? "gzip -dc $m2" : "cat $m2"; 89my $cmd1 = "$m1cmd | $bowtie2 $bowtie_args --sam-nohead -x $index - > .infer_fraglen.tmp"; 90my $cmd2 = "$m2cmd | $bowtie2 $bowtie_args --sam-nohead -x $index - |"; 91my $tot = 0; 92system($cmd1) == 0 || die "Error running '$cmd1'"; 93open (M1, ".infer_fraglen.tmp") || die "Could not open '.infer_fraglen.tmp'"; 94open (M2, $cmd2) || die "Could not open '$cmd2'"; 95while(<M1>) { 96 my $lm1 = $_; 97 my $lm2 = <M2>; 98 chomp($lm1); chomp($lm2); 99 my @lms1 = split(/\t/, $lm1); 100 my @lms2 = split(/\t/, $lm2); 101 my ($name1, $flags1, $chr1, $off1, $mapq1, $slen1) = ($lms1[0], $lms1[1], $lms1[2], $lms1[3], $lms1[4], length($lms1[9])); 102 my ($name2, $flags2, $chr2, $off2, $mapq2, $slen2) = ($lms2[0], $lms2[1], $lms2[2], $lms2[3], $lms2[4], length($lms2[9])); 103 # One or both mates didn't align uniquely? 104 next if $chr1 eq "*" || $chr2 eq "*"; 105 # Mates aligned to different chromosomes? 106 next if $chr1 ne $chr2; 107 # MAPQs too low? 108 next if $mapq1 < $mapq_cutoff || $mapq2 < $mapq_cutoff; 109 # This pairing can be used as an observation of fragment orientation and length 110 my $fw1 = (($flags1 & 16) == 0) ? "F" : "R"; 111 my $fw2 = (($flags2 & 16) == 0) ? "F" : "R"; 112 my $frag = $off2 - $off1; 113 # This can overestimate if one mate is entirely subsumed in the other 114 if($frag > 0) { $frag += $slen2; } 115 else { $frag -= $slen1; } 116 # Install into bin 117 $frag = int(($frag + ($binsz/2))/$binsz); # Round to nearest bin 118 $fragments{"$fw1$fw2"}{$frag}++; 119 $tot++; 120} 121close(M1); 122close(M2); 123unlink(".infer_fraglen.tmp"); # ditch temporary file 124 125# Print out the bins 126for my $k (keys %fragments) { 127 for my $k2 (sort {$a <=> $b} keys %{$fragments{$k}}) { 128 print "$k, ".($k2*$binsz).", ".$fragments{$k}{$k2}."\n"; 129 } 130} 131 132print STDERR "DONE\n"; 133