1#!/usr/local/bin/perl 2# 3# Copyright (C) 2009, 2010 Genome Research Ltd. 4# 5# Author: Petr Danecek <pd3@sanger.ac.uk> 6# 7# Permission is hereby granted, free of charge, to any person obtaining a copy 8# of this software and associated documentation files (the "Software"), to deal 9# in the Software without restriction, including without limitation the rights 10# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11# copies of the Software, and to permit persons to whom the Software is 12# furnished to do so, subject to the following conditions: 13# 14# The above copyright notice and this permission notice shall be included in 15# all copies or substantial portions of the Software. 16# 17# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 20# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 22# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 23# DEALINGS IN THE SOFTWARE. 24 25# VCF specs: http://www.1000genomes.org/wiki/doku.php?id=1000_genomes:analysis:vcf3.3 26 27use strict; 28use warnings; 29use Carp; 30 31my $opts = parse_params(); 32do_pileup_to_vcf($opts); 33 34exit; 35 36#--------------- 37 38sub error 39{ 40 my (@msg) = @_; 41 if ( scalar @msg ) { croak(@msg); } 42 die 43 "Usage: sam2vcf.pl [OPTIONS] < in.pileup > out.vcf\n", 44 "Options:\n", 45 " -h, -?, --help This help message.\n", 46 " -i, --indels-only Ignore SNPs.\n", 47 " -r, --refseq <file.fa> The reference sequence, required when indels are present.\n", 48 " -R, --keep-ref Print reference alleles as well.\n", 49 " -s, --snps-only Ignore indels.\n", 50 " -t, --column-title <string> The column title.\n", 51 "\n"; 52} 53 54 55sub parse_params 56{ 57 my %opts = (); 58 59 $opts{fh_in} = *STDIN; 60 $opts{fh_out} = *STDOUT; 61 62 while (my $arg=shift(@ARGV)) 63 { 64 if ( $arg eq '-R' || $arg eq '--keep-ref' ) { $opts{keep_ref}=1; next; } 65 if ( $arg eq '-r' || $arg eq '--refseq' ) { $opts{refseq}=shift(@ARGV); next; } 66 if ( $arg eq '-t' || $arg eq '--column-title' ) { $opts{title}=shift(@ARGV); next; } 67 if ( $arg eq '-s' || $arg eq '--snps-only' ) { $opts{snps_only}=1; next; } 68 if ( $arg eq '-i' || $arg eq '--indels-only' ) { $opts{indels_only}=1; next; } 69 if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } 70 71 error("Unknown parameter \"$arg\". Run -h for help.\n"); 72 } 73 return \%opts; 74} 75 76sub iupac_to_gtype 77{ 78 my ($ref,$base) = @_; 79 my %iupac = ( 80 'K' => ['G','T'], 81 'M' => ['A','C'], 82 'S' => ['C','G'], 83 'R' => ['A','G'], 84 'W' => ['A','T'], 85 'Y' => ['C','T'], 86 ); 87 if ( !exists($iupac{$base}) ) 88 { 89 if ( $base ne 'A' && $base ne 'C' && $base ne 'G' && $base ne 'T' ) { error("FIXME: what is this [$base]?\n"); } 90 if ( $ref eq $base ) { return ('.','0/0'); } 91 return ($base,'1/1'); 92 } 93 my $gt = $iupac{$base}; 94 if ( $$gt[0] eq $ref ) { return ($$gt[1],'0/1'); } 95 elsif ( $$gt[1] eq $ref ) { return ($$gt[0],'0/1'); } 96 return ("$$gt[0],$$gt[1]",'1/2'); 97} 98 99 100sub parse_indel 101{ 102 my ($cons) = @_; 103 if ( $cons=~/^-/ ) 104 { 105 my $len = length($'); 106 return "D$len"; 107 } 108 elsif ( $cons=~/^\+/ ) { return "I$'"; } 109 elsif ( $cons eq '*' ) { return undef; } 110 error("FIXME: could not parse [$cons]\n"); 111} 112 113 114# An example of the pileup format: 115# 1 3000011 C C 32 0 98 1 ^~, A 116# 1 3002155 * +T/+T 53 119 52 5 +T * 4 1 0 117# 1 3003094 * -TT/-TT 31 164 60 11 -TT * 5 6 0 118# 1 3073986 * */-AAAAAAAAAAAAAA 3 3 45 9 * -AAAAAAAAAAAAAA 7 2 0 119# 120sub do_pileup_to_vcf 121{ 122 my ($opts) = @_; 123 124 my $fh_in = $$opts{fh_in}; 125 my $fh_out = $$opts{fh_out}; 126 my ($prev_chr,$prev_pos,$prev_ref); 127 my $refseq; 128 my $ignore_indels = $$opts{snps_only} ? 1 : 0; 129 my $ignore_snps = $$opts{indels_only} ? 1 : 0; 130 my $keep_ref = $$opts{keep_ref} ? 1 : 0; 131 my $title = exists($$opts{title}) ? $$opts{title} : 'data'; 132 133 print $fh_out 134 qq[##fileformat=VCFv3.3\n], 135 qq[##INFO=DP,1,Integer,"Total Depth"\n], 136 qq[##FORMAT=GT,1,String,"Genotype"\n], 137 qq[##FORMAT=GQ,1,Integer,"Genotype Quality"\n], 138 qq[##FORMAT=DP,1,Integer,"Read Depth"\n], 139 qq[#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t$title\n] 140 ; 141 142 while (my $line=<$fh_in>) 143 { 144 chomp($line); 145 my (@items) = split(/\t/,$line); 146 if ( scalar @items<8 ) 147 { 148 error("\nToo few columns, does not look like output of 'samtools pileup -c': $line\n"); 149 } 150 my ($chr,$pos,$ref,$cons,$cons_qual,$snp_qual,$rms_qual,$depth,$a1,$a2) = @items; 151 $ref = uc($ref); 152 $cons = uc($cons); 153 154 my ($alt,$gt); 155 if ( $ref eq '*' ) 156 { 157 # An indel is involved. 158 if ( $ignore_indels ) 159 { 160 $prev_ref = $ref; 161 $prev_pos = $pos; 162 $prev_chr = $chr; 163 next; 164 } 165 166 if (!defined $prev_chr || $chr ne $prev_chr || $pos ne $prev_pos) 167 { 168 if ( !$$opts{refseq} ) { error("Cannot do indels without the reference.\n"); } 169 if ( !$refseq ) { $refseq = Fasta->new(file=>$$opts{refseq}); } 170 $ref = $refseq->get_base($chr,$pos); 171 $ref = uc($ref); 172 } 173 else { $ref = $prev_ref; } 174 175 # One of the alleles can be a reference and it can come in arbitrary order. In some 176 # cases */* can be encountered. In such a case, look in the additional columns. 177 my ($al1,$al2) = split(m{/},$cons); 178 if ( $al1 eq $al2 && $al1 eq '*' ) { $al1=$a1; $al2=$a2; } 179 my $alt1 = parse_indel($al1); 180 my $alt2 = parse_indel($al2); 181 if ( !$alt1 && !$alt2 ) { error("FIXME: could not parse indel:\n", $line); } 182 if ( !$alt1 ) 183 { 184 $alt=$alt2; 185 $gt='0/1'; 186 } 187 elsif ( !$alt2 ) 188 { 189 $alt=$alt1; 190 $gt='0/1'; 191 } 192 elsif ( $alt1 eq $alt2 ) 193 { 194 $alt="$alt1"; 195 $gt='1/1'; 196 } 197 else 198 { 199 $alt="$alt1,$alt2"; 200 $gt='1/2'; 201 } 202 } 203 else 204 { 205 if ( $ignore_snps || (!$keep_ref && $ref eq $cons) ) 206 { 207 $prev_ref = $ref; 208 $prev_pos = $pos; 209 $prev_chr = $chr; 210 next; 211 } 212 213 # SNP 214 ($alt,$gt) = iupac_to_gtype($ref,$cons); 215 } 216 217 print $fh_out "$chr\t$pos\t.\t$ref\t$alt\t$snp_qual\t0\tDP=$depth\tGT:GQ:DP\t$gt:$cons_qual:$depth\n"; 218 219 $prev_ref = $ref; 220 $prev_pos = $pos; 221 $prev_chr = $chr; 222 } 223} 224 225 226#------------- Fasta -------------------- 227# 228# Uses samtools to get a requested base from a fasta file. For efficiency, preloads 229# a chunk to memory. The size of the cached sequence can be controlled by the 'size' 230# parameter. 231# 232package Fasta; 233 234use strict; 235use warnings; 236use Carp; 237 238sub Fasta::new 239{ 240 my ($class,@args) = @_; 241 my $self = {@args}; 242 bless $self, ref($class) || $class; 243 if ( !$$self{file} ) { $self->throw(qq[Missing the parameter "file"\n]); } 244 $$self{chr} = undef; 245 $$self{from} = undef; 246 $$self{to} = undef; 247 if ( !$$self{size} ) { $$self{size}=10_000_000; } 248 bless $self, ref($class) || $class; 249 return $self; 250} 251 252sub read_chunk 253{ 254 my ($self,$chr,$pos) = @_; 255 my $to = $pos + $$self{size}; 256 my $cmd = "samtools faidx $$self{file} $chr:$pos-$to"; 257 my @out = `$cmd`; 258 if ( $? ) { $self->throw("$cmd: $!"); } 259 my $line = shift(@out); 260 if ( !($line=~/^>$chr:(\d+)-(\d+)/) ) { $self->throw("Could not parse: $line"); } 261 $$self{chr} = $chr; 262 $$self{from} = $1; 263 $$self{to} = $2; 264 my $chunk = ''; 265 while ($line=shift(@out)) 266 { 267 chomp($line); 268 $chunk .= $line; 269 } 270 $$self{chunk} = $chunk; 271 return; 272} 273 274sub get_base 275{ 276 my ($self,$chr,$pos) = @_; 277 if ( !$$self{chr} || $chr ne $$self{chr} || $pos<$$self{from} || $pos>$$self{to} ) 278 { 279 $self->read_chunk($chr,$pos); 280 } 281 my $idx = $pos - $$self{from}; 282 return substr($$self{chunk},$idx,1); 283} 284 285sub throw 286{ 287 my ($self,@msg) = @_; 288 croak(@msg); 289} 290