1#!/usr/local/bin/perl
2#
3#    Copyright (C) 2009, 2010 Genome Research Ltd.
4#
5#    Author: Petr Danecek <pd3@sanger.ac.uk>
6#
7# Permission is hereby granted, free of charge, to any person obtaining a copy
8# of this software and associated documentation files (the "Software"), to deal
9# in the Software without restriction, including without limitation the rights
10# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11# copies of the Software, and to permit persons to whom the Software is
12# furnished to do so, subject to the following conditions:
13#
14# The above copyright notice and this permission notice shall be included in
15# all copies or substantial portions of the Software.
16#
17# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23# DEALINGS IN THE SOFTWARE.
24
25# VCF specs: http://www.1000genomes.org/wiki/doku.php?id=1000_genomes:analysis:vcf3.3
26
27use strict;
28use warnings;
29use Carp;
30
31my $opts = parse_params();
32do_pileup_to_vcf($opts);
33
34exit;
35
36#---------------
37
38sub error
39{
40    my (@msg) = @_;
41    if ( scalar @msg ) { croak(@msg); }
42    die
43        "Usage: sam2vcf.pl [OPTIONS] < in.pileup > out.vcf\n",
44        "Options:\n",
45        "   -h, -?, --help                  This help message.\n",
46        "   -i, --indels-only               Ignore SNPs.\n",
47        "   -r, --refseq <file.fa>          The reference sequence, required when indels are present.\n",
48        "   -R, --keep-ref                  Print reference alleles as well.\n",
49        "   -s, --snps-only                 Ignore indels.\n",
50        "   -t, --column-title <string>     The column title.\n",
51        "\n";
52}
53
54
55sub parse_params
56{
57    my %opts = ();
58
59    $opts{fh_in}  = *STDIN;
60    $opts{fh_out} = *STDOUT;
61
62    while (my $arg=shift(@ARGV))
63    {
64        if ( $arg eq '-R' || $arg eq '--keep-ref' ) { $opts{keep_ref}=1; next; }
65        if ( $arg eq '-r' || $arg eq '--refseq' ) { $opts{refseq}=shift(@ARGV); next; }
66        if ( $arg eq '-t' || $arg eq '--column-title' ) { $opts{title}=shift(@ARGV); next; }
67        if ( $arg eq '-s' || $arg eq '--snps-only' ) { $opts{snps_only}=1; next; }
68        if ( $arg eq '-i' || $arg eq '--indels-only' ) { $opts{indels_only}=1; next; }
69        if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); }
70
71        error("Unknown parameter \"$arg\". Run -h for help.\n");
72    }
73    return \%opts;
74}
75
76sub iupac_to_gtype
77{
78    my ($ref,$base) = @_;
79    my %iupac = (
80            'K' => ['G','T'],
81            'M' => ['A','C'],
82            'S' => ['C','G'],
83            'R' => ['A','G'],
84            'W' => ['A','T'],
85            'Y' => ['C','T'],
86            );
87    if ( !exists($iupac{$base}) )
88    {
89        if ( $base ne 'A' && $base ne 'C' && $base ne 'G' && $base ne 'T' ) { error("FIXME: what is this [$base]?\n"); }
90        if ( $ref eq $base ) { return ('.','0/0'); }
91        return ($base,'1/1');
92    }
93    my $gt = $iupac{$base};
94    if ( $$gt[0] eq $ref  ) { return ($$gt[1],'0/1'); }
95    elsif ( $$gt[1] eq $ref ) { return ($$gt[0],'0/1'); }
96    return ("$$gt[0],$$gt[1]",'1/2');
97}
98
99
100sub parse_indel
101{
102    my ($cons) = @_;
103    if ( $cons=~/^-/ )
104    {
105        my $len = length($');
106        return "D$len";
107    }
108    elsif ( $cons=~/^\+/ ) { return "I$'"; }
109    elsif ( $cons eq '*' ) { return undef; }
110    error("FIXME: could not parse [$cons]\n");
111}
112
113
114# An example of the pileup format:
115#   1       3000011 C       C       32      0       98      1       ^~,     A
116#   1       3002155 *       +T/+T   53      119     52      5       +T      *       4       1       0
117#   1       3003094 *       -TT/-TT 31      164     60      11      -TT     *       5       6       0
118#   1       3073986 *       */-AAAAAAAAAAAAAA       3       3       45      9       *       -AAAAAAAAAAAAAA 7       2       0
119#
120sub do_pileup_to_vcf
121{
122    my ($opts) = @_;
123
124    my $fh_in  = $$opts{fh_in};
125    my $fh_out = $$opts{fh_out};
126    my ($prev_chr,$prev_pos,$prev_ref);
127    my $refseq;
128    my $ignore_indels = $$opts{snps_only} ? 1 : 0;
129    my $ignore_snps   = $$opts{indels_only} ? 1 : 0;
130    my $keep_ref      = $$opts{keep_ref} ? 1 : 0;
131    my $title = exists($$opts{title}) ? $$opts{title} : 'data';
132
133    print $fh_out
134        qq[##fileformat=VCFv3.3\n],
135        qq[##INFO=DP,1,Integer,"Total Depth"\n],
136        qq[##FORMAT=GT,1,String,"Genotype"\n],
137        qq[##FORMAT=GQ,1,Integer,"Genotype Quality"\n],
138        qq[##FORMAT=DP,1,Integer,"Read Depth"\n],
139        qq[#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t$title\n]
140        ;
141
142    while (my $line=<$fh_in>)
143    {
144        chomp($line);
145        my (@items) = split(/\t/,$line);
146        if ( scalar @items<8 )
147        {
148            error("\nToo few columns, does not look like output of 'samtools pileup -c': $line\n");
149        }
150        my ($chr,$pos,$ref,$cons,$cons_qual,$snp_qual,$rms_qual,$depth,$a1,$a2) = @items;
151        $ref  = uc($ref);
152        $cons = uc($cons);
153
154        my ($alt,$gt);
155        if ( $ref eq '*' )
156        {
157            # An indel is involved.
158            if ( $ignore_indels )
159            {
160                $prev_ref = $ref;
161                $prev_pos = $pos;
162                $prev_chr = $chr;
163                next;
164            }
165
166            if (!defined $prev_chr || $chr ne $prev_chr || $pos ne $prev_pos)
167            {
168                if ( !$$opts{refseq} ) { error("Cannot do indels without the reference.\n"); }
169                if ( !$refseq ) { $refseq = Fasta->new(file=>$$opts{refseq}); }
170                $ref = $refseq->get_base($chr,$pos);
171                $ref = uc($ref);
172            }
173            else { $ref = $prev_ref; }
174
175            # One of the alleles can be a reference and it can come in arbitrary order. In some
176            #   cases */* can be encountered. In such a case, look in the additional columns.
177            my ($al1,$al2) = split(m{/},$cons);
178            if ( $al1 eq $al2 && $al1 eq '*' ) { $al1=$a1; $al2=$a2; }
179            my $alt1 = parse_indel($al1);
180            my $alt2 = parse_indel($al2);
181            if ( !$alt1 && !$alt2 ) { error("FIXME: could not parse indel:\n", $line); }
182            if ( !$alt1 )
183            {
184                $alt=$alt2;
185                $gt='0/1';
186            }
187            elsif ( !$alt2 )
188            {
189                $alt=$alt1;
190                $gt='0/1';
191            }
192            elsif ( $alt1 eq $alt2 )
193            {
194                $alt="$alt1";
195                $gt='1/1';
196            }
197            else
198            {
199                $alt="$alt1,$alt2";
200                $gt='1/2';
201            }
202        }
203        else
204        {
205            if ( $ignore_snps || (!$keep_ref && $ref eq $cons) )
206            {
207                $prev_ref = $ref;
208                $prev_pos = $pos;
209                $prev_chr = $chr;
210                next;
211            }
212
213            # SNP
214            ($alt,$gt) = iupac_to_gtype($ref,$cons);
215        }
216
217        print $fh_out "$chr\t$pos\t.\t$ref\t$alt\t$snp_qual\t0\tDP=$depth\tGT:GQ:DP\t$gt:$cons_qual:$depth\n";
218
219        $prev_ref = $ref;
220        $prev_pos = $pos;
221        $prev_chr = $chr;
222    }
223}
224
225
226#------------- Fasta --------------------
227#
228# Uses samtools to get a requested base from a fasta file. For efficiency, preloads
229#   a chunk to memory. The size of the cached sequence can be controlled by the 'size'
230#   parameter.
231#
232package Fasta;
233
234use strict;
235use warnings;
236use Carp;
237
238sub Fasta::new
239{
240    my ($class,@args) = @_;
241    my $self = {@args};
242    bless $self, ref($class) || $class;
243    if ( !$$self{file} ) { $self->throw(qq[Missing the parameter "file"\n]); }
244    $$self{chr}  = undef;
245    $$self{from} = undef;
246    $$self{to}   = undef;
247    if ( !$$self{size} ) { $$self{size}=10_000_000; }
248    bless $self, ref($class) || $class;
249    return $self;
250}
251
252sub read_chunk
253{
254    my ($self,$chr,$pos) = @_;
255    my $to = $pos + $$self{size};
256    my $cmd = "samtools faidx $$self{file} $chr:$pos-$to";
257    my @out = `$cmd`;
258    if ( $? ) { $self->throw("$cmd: $!"); }
259    my $line = shift(@out);
260    if ( !($line=~/^>$chr:(\d+)-(\d+)/) ) { $self->throw("Could not parse: $line"); }
261    $$self{chr}  = $chr;
262    $$self{from} = $1;
263    $$self{to}   = $2;
264    my $chunk = '';
265    while ($line=shift(@out))
266    {
267        chomp($line);
268        $chunk .= $line;
269    }
270    $$self{chunk} = $chunk;
271    return;
272}
273
274sub get_base
275{
276    my ($self,$chr,$pos) = @_;
277    if ( !$$self{chr} || $chr ne $$self{chr} || $pos<$$self{from} || $pos>$$self{to} )
278    {
279        $self->read_chunk($chr,$pos);
280    }
281    my $idx = $pos - $$self{from};
282    return substr($$self{chunk},$idx,1);
283}
284
285sub throw
286{
287    my ($self,@msg) = @_;
288    croak(@msg);
289}
290