1#!/usr/local/bin/perl 2# 3# The MIT License 4# 5# Copyright (c) 2017-2018 Genome Research Ltd. 6# 7# Author: Petr Danecek <pd3@sanger.ac.uk> 8# 9# Permission is hereby granted, free of charge, to any person obtaining a copy 10# of this software and associated documentation files (the "Software"), to deal 11# in the Software without restriction, including without limitation the rights 12# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 13# copies of the Software, and to permit persons to whom the Software is 14# furnished to do so, subject to the following conditions: 15# 16# The above copyright notice and this permission notice shall be included in 17# all copies or substantial portions of the Software. 18# 19# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 22# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 25# THE SOFTWARE. 26 27use strict; 28use warnings; 29use Carp; 30 31my $opts = parse_params(); 32fix($opts); 33 34exit; 35 36#-------------------------------- 37 38sub error 39{ 40 my (@msg) = @_; 41 if ( scalar @msg ) { confess @msg; } 42 print 43 "About:\n", 44 " Some parts of GATK throw errors like\n", 45 " \"java.lang.Double cannot be cast to java.lang.Integer\"\n", 46 " This is caused by a bug in GATK which incorrectly parses valid float number expressions, for example\n", 47 " it would not accept \"0\", only \"0.0\". Such unnecessary strictness violates the VCF specification\n", 48 " (and common sense). This script reformats floating point fields to work around this.\n", 49 "\n", 50 "Usage: fix-broken-GATK-Double-vs-Integer [OPTIONS]\n", 51 "Options:\n", 52 " -c, --check-only check for problems, do not output VCF\n", 53 " -h, --help this help message\n", 54 "\n", 55 "Example:\n", 56 " gunzip -c ori.vcf.gz | fix-broken-GATK-Double-vs-Integer | bgzip -c > new.vcf.gz\n", 57 "\n"; 58 exit -1; 59} 60sub parse_params 61{ 62 my $opts = 63 { 64 nflt => 0, 65 nint => 0, 66 }; 67 while (defined(my $arg=shift(@ARGV))) 68 { 69 if ( $arg eq '-c' || $arg eq '--check-only' ) { $$opts{check_only} = 1; next; } 70 if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } 71 error("Unknown parameter \"$arg\". Run -h for help.\n"); 72 } 73 return $opts; 74} 75 76sub fix 77{ 78 my ($opts) = @_; 79 while (my $line=<STDIN>) 80 { 81 if ( $line=~/^#/ ) 82 { 83 $line = parse_header($opts,$line); 84 } 85 else 86 { 87 $line = fix_line($opts,$line); 88 } 89 if ( !$$opts{check_only} ) { print $line; } 90 } 91 print STDERR "Modified $$opts{nflt} float values, $$opts{nint} integer values\n"; 92} 93 94sub parse_header 95{ 96 my ($opts,$line) = @_; 97 98 my $coltype = undef; 99 if ( $line=~/^##INFO/ ) { $coltype = 'info'; } 100 elsif ( $line=~/^##FORMAT/ ) { $coltype = 'fmt'; } 101 else { return $line; } 102 103 if ( !($line=~/ID=([^,>]+)/) ) { return $line; } 104 my $id = $1; 105 106 my $type = undef; 107 if ( $line=~/Type=Float/ ) { $type = 'float'; } 108 elsif ( $line=~/Type=Integer/ ) { $type = 'int'; } 109 else { return $line; } 110 111 $$opts{$coltype}{$id} = $type; 112 113 return $line; 114} 115 116sub fix_line 117{ 118 my ($opts,$line) = @_; 119 my $info = $$opts{info}; 120 my $fmt = $$opts{fmt}; 121 my @cols = split(/\t/,$line); 122 my @info = split(/;/,$cols[7]); 123 my @fmt = split(/:/,$cols[8]); 124 my $pos = "$cols[0]:$cols[1]"; 125 chomp($cols[-1]); 126 for (my $i=0; $i<@info; $i++) 127 { 128 my ($key,$val) = split(/=/,$info[$i]); 129 if ( !defined $val or !exists($$info{$key}) ) { next; } 130 $val = $$info{$key} eq 'float' ? fix_float($opts,$pos,$key,$val) : fix_int($opts,$pos,$key,$val); 131 $info[$i] = "$key=$val"; 132 } 133 $cols[7] = join(';',@info); 134 for (my $j=9; $j<@cols; $j++) 135 { 136 my @vals = split(/:/,$cols[$j]); 137 for (my $i=0; $i<@fmt; $i++) 138 { 139 my $key = $fmt[$i]; 140 if ( !exists($$fmt{$key}) ) { next; } 141 if ( !exists($vals[$i]) ) { last; } 142 $vals[$i] = $$fmt{$key} eq 'float' ? fix_float($opts,$pos,$key,$vals[$i]) : fix_int($opts,$pos,$key,$vals[$i]); 143 } 144 $cols[$j] = join(':',@vals); 145 } 146 return join("\t",@cols)."\n"; 147} 148 149sub fix_int 150{ 151 my ($opts,$pos,$key,$vals) = @_; 152 my @vals = split(/,/,$vals); 153 for (my $i=0; $i<@vals; $i++) 154 { 155 if ( $vals[$i] eq '.' ) { next; } 156 if ( $vals[$i] =~ /^-?[0-9]+$/ ) { next; } 157 if ( $$opts{check_only} ) { print "$pos\t$key\tInteger\t$vals[$i]\n"; } 158 error("todo: $pos\t$key\tInteger\t$vals[$i]\n"); 159 $$opts{nint}++; 160 } 161 return $vals; 162} 163sub fix_float 164{ 165 my ($opts,$pos,$key,$vals) = @_; 166 my @vals = split(/,/,$vals); 167 for (my $i=0; $i<@vals; $i++) 168 { 169 if ( $vals[$i] eq '.' ) { next; } 170 if ( $vals[$i] =~ /\./ ) { next; } 171 if ( $vals[$i] =~ /[eE]/ ) { next; } 172 if ( $$opts{check_only} ) { print "$pos\t$key\tFloat\t$vals[$i]\n"; } 173 $vals[$i] .= '.'; 174 $$opts{nflt}++; 175 } 176 return join(',',@vals); 177} 178 179