1#!/usr/local/bin/perl
2#
3#  The MIT License
4#
5#  Copyright (c) 2017-2018 Genome Research Ltd.
6#
7#  Author: Petr Danecek <pd3@sanger.ac.uk>
8#
9#  Permission is hereby granted, free of charge, to any person obtaining a copy
10#  of this software and associated documentation files (the "Software"), to deal
11#  in the Software without restriction, including without limitation the rights
12#  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13#  copies of the Software, and to permit persons to whom the Software is
14#  furnished to do so, subject to the following conditions:
15#
16#  The above copyright notice and this permission notice shall be included in
17#  all copies or substantial portions of the Software.
18#
19#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20#  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21#  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22#  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23#  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24#  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25#  THE SOFTWARE.
26
27use strict;
28use warnings;
29use Carp;
30
31my $opts = parse_params();
32fix($opts);
33
34exit;
35
36#--------------------------------
37
38sub error
39{
40    my (@msg) = @_;
41    if ( scalar @msg ) { confess @msg; }
42    print
43        "About:\n",
44        "   Some parts of GATK throw errors like\n",
45        "       \"java.lang.Double cannot be cast to java.lang.Integer\"\n",
46        "   This is caused by a bug in GATK which incorrectly parses valid float number expressions, for example\n",
47        "   it would not accept \"0\", only \"0.0\". Such unnecessary strictness violates the VCF specification\n",
48        "   (and common sense). This script reformats floating point fields to work around this.\n",
49        "\n",
50        "Usage: fix-broken-GATK-Double-vs-Integer [OPTIONS]\n",
51        "Options:\n",
52        "   -c, --check-only    check for problems, do not output VCF\n",
53        "   -h, --help          this help message\n",
54        "\n",
55        "Example:\n",
56        "   gunzip -c ori.vcf.gz | fix-broken-GATK-Double-vs-Integer | bgzip -c > new.vcf.gz\n",
57        "\n";
58    exit -1;
59}
60sub parse_params
61{
62    my $opts =
63    {
64        nflt => 0,
65        nint => 0,
66    };
67    while (defined(my $arg=shift(@ARGV)))
68    {
69        if ( $arg eq '-c' || $arg eq '--check-only' ) { $$opts{check_only} = 1; next; }
70        if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); }
71        error("Unknown parameter \"$arg\". Run -h for help.\n");
72    }
73    return $opts;
74}
75
76sub fix
77{
78    my ($opts) = @_;
79    while (my $line=<STDIN>)
80    {
81        if ( $line=~/^#/ )
82        {
83            $line = parse_header($opts,$line);
84        }
85        else
86        {
87            $line = fix_line($opts,$line);
88        }
89        if ( !$$opts{check_only} ) { print $line; }
90    }
91    print STDERR "Modified $$opts{nflt} float values, $$opts{nint} integer values\n";
92}
93
94sub parse_header
95{
96    my ($opts,$line) = @_;
97
98    my $coltype = undef;
99    if ( $line=~/^##INFO/ ) { $coltype = 'info'; }
100    elsif ( $line=~/^##FORMAT/ ) { $coltype = 'fmt'; }
101    else { return $line; }
102
103    if ( !($line=~/ID=([^,>]+)/) ) { return $line; }
104    my $id = $1;
105
106    my $type = undef;
107    if ( $line=~/Type=Float/ ) { $type = 'float'; }
108    elsif ( $line=~/Type=Integer/ ) { $type = 'int'; }
109    else { return $line; }
110
111    $$opts{$coltype}{$id} = $type;
112
113    return $line;
114}
115
116sub fix_line
117{
118    my ($opts,$line) = @_;
119    my $info = $$opts{info};
120    my $fmt  = $$opts{fmt};
121    my @cols = split(/\t/,$line);
122    my @info = split(/;/,$cols[7]);
123    my @fmt  = split(/:/,$cols[8]);
124    my $pos  = "$cols[0]:$cols[1]";
125    chomp($cols[-1]);
126    for (my $i=0; $i<@info; $i++)
127    {
128        my ($key,$val) = split(/=/,$info[$i]);
129        if ( !defined $val or !exists($$info{$key}) ) { next; }
130        $val = $$info{$key} eq 'float' ? fix_float($opts,$pos,$key,$val) : fix_int($opts,$pos,$key,$val);
131        $info[$i] = "$key=$val";
132    }
133    $cols[7] = join(';',@info);
134    for (my $j=9; $j<@cols; $j++)
135    {
136        my @vals = split(/:/,$cols[$j]);
137        for (my $i=0; $i<@fmt; $i++)
138        {
139            my $key = $fmt[$i];
140            if ( !exists($$fmt{$key}) ) { next; }
141            if ( !exists($vals[$i]) ) { last; }
142            $vals[$i] = $$fmt{$key} eq 'float' ? fix_float($opts,$pos,$key,$vals[$i]) : fix_int($opts,$pos,$key,$vals[$i]);
143        }
144        $cols[$j] = join(':',@vals);
145    }
146    return join("\t",@cols)."\n";
147}
148
149sub fix_int
150{
151    my ($opts,$pos,$key,$vals) = @_;
152    my @vals = split(/,/,$vals);
153    for (my $i=0; $i<@vals; $i++)
154    {
155        if ( $vals[$i] eq '.' ) { next; }
156        if ( $vals[$i] =~ /^-?[0-9]+$/ ) { next; }
157        if ( $$opts{check_only} ) { print "$pos\t$key\tInteger\t$vals[$i]\n"; }
158        error("todo: $pos\t$key\tInteger\t$vals[$i]\n");
159        $$opts{nint}++;
160    }
161    return $vals;
162}
163sub fix_float
164{
165    my ($opts,$pos,$key,$vals) = @_;
166    my @vals = split(/,/,$vals);
167    for (my $i=0; $i<@vals; $i++)
168    {
169        if ( $vals[$i] eq '.' ) { next; }
170        if ( $vals[$i] =~ /\./ ) { next; }
171        if ( $vals[$i] =~ /[eE]/ ) { next; }
172        if ( $$opts{check_only} ) { print "$pos\t$key\tFloat\t$vals[$i]\n"; }
173        $vals[$i] .= '.';
174        $$opts{nflt}++;
175    }
176    return join(',',@vals);
177}
178
179