1#!/usr/bin/env perl 2# 3# Author: petr.danecek@sanger 4# 5 6use strict; 7use warnings; 8use Carp; 9use Vcf; 10 11my $opts = parse_params(); 12read_data($opts); 13 14exit; 15 16#-------------------------------- 17 18sub error 19{ 20 my (@msg) = @_; 21 if ( scalar @msg ) { confess @msg; } 22 die 23 "Usage: vcf-query [OPTIONS] file.vcf.gz\n", 24 "Options:\n", 25 " -c, --columns <file|list> List of comma-separated column names or one column name per line in a file.\n", 26 " -f, --format <string> The default is '%CHROM:%POS\\t%REF[\\t%SAMPLE=%GT]\\n'\n", 27 " -l, --list-columns List columns.\n", 28 " -r, --region chr:from-to Retrieve the region. (Runs tabix.)\n", 29 " --use-old-method Use old version of API, which is slow but more robust.\n", 30 " -h, -?, --help This help message.\n", 31 "Expressions:\n", 32 " %CHROM The CHROM column (similarly also other columns)\n", 33 " %GT Translated genotype (e.g. C/A)\n", 34 " %GTR Raw genotype (e.g. 0/1)\n", 35 " %INFO/TAG Any tag in the INFO column\n", 36 " %LINE Prints the whole line\n", 37 " %SAMPLE Sample name\n", 38 " [] The brackets loop over all samples\n", 39 " %*<A><B> All format fields printed as KEY<A>VALUE<B>\n", 40 "Examples:\n", 41 " vcf-query file.vcf.gz 1:1000-2000 -c NA001,NA002,NA003\n", 42 " vcf-query file.vcf.gz -r 1:1000-2000 -f '%CHROM:%POS\\t%REF\\t%ALT[\\t%SAMPLE:%*=,]\\n'\n", 43 " vcf-query file.vcf.gz -f '[%GT\\t]%LINE\\n'\n", 44 " vcf-query file.vcf.gz -f '[%GT\\ ]%LINE\\n'\n", 45 " vcf-query file.vcf.gz -f '%CHROM\\_%POS\\t%INFO/DP\\t%FILTER\\n'\n", 46 "Notes:\n", 47 " Please use `bcftools query` instead, this script will not be supported in future.\n", 48 "\n"; 49} 50 51 52sub parse_params 53{ 54 my $opts = { columns=>'', format_string=>"%CHROM:%POS\t%REF[\t%SAMPLE=%GT]\n" }; 55 while (defined(my $arg=shift(@ARGV))) 56 { 57 if ( $arg eq '--use-old-method' ) { $$opts{use_old_method}=1; next } 58 if ( $arg eq '-f' || $arg eq '--format' ) { $$opts{format_string}=shift(@ARGV); next } 59 if ( $arg eq '-c' || $arg eq '--columns' ) { $$opts{columns}=shift(@ARGV); next } 60 if ( $arg eq '-l' || $arg eq '--list-columns' ) { $$opts{list_columns}=1; next } 61 if ( $arg eq '-r' || $arg eq '--region' ) { $$opts{region}=shift(@ARGV); next } 62 if ( -e $arg or $arg=~m{^(?:ftp|http)://} ) { $$opts{file}=$arg; next; } 63 if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } 64 if ( !exists($$opts{region}) && exists($$opts{file}) && ($arg=~/^[^:]+:[0-9,]+-[0-9,]+$/ or $arg=~/^[^\:]+$/) ) { $$opts{region}=$arg; next; } 65 error("Unknown parameter or non-existent file \"$arg\". Run -h for help.\n"); 66 } 67 if ( !exists($$opts{file}) && exists($$opts{region}) ) { error("The region cannot be used when streaming the file.\n"); } 68 if ( exists($$opts{columns}) && -e $$opts{columns} ) 69 { 70 my @cols; 71 open(my $fh,'<',$$opts{columns}) or error("$$opts{columns}: $!"); 72 while (my $line=<$fh>) 73 { 74 if ( $line=~/^\s*$/ ) { next; } 75 $line =~ s/^\s*//; 76 $line =~ s/\s*$//; 77 push @cols, $line; 78 } 79 close($fh); 80 $$opts{columns} = join(',', @cols); 81 } 82 return $opts; 83} 84 85sub parse_format_string 86{ 87 my ($str,$hash) = @_; 88 my (@arr,%idx,$join1,$join2); 89 $str =~ s/\\n/\n/g; 90 $str =~ s/\\t/\t/g; 91 while ($str) 92 { 93 if ( !($str=~/%/) ) 94 { 95 push @arr,$str; 96 last; 97 } 98 99 my $before = $`; 100 $str = $'; 101 102 my $match; 103 if ( $str=~/^[*](.)(.)/ ) 104 { 105 $match = '*'; $join1=$1; $join2=$2; 106 } 107 elsif ( $str=~m{([A-Za-z0-9/_]+)} ) 108 { 109 $match = $1; 110 } 111 else { error("FIXME: $str"); } 112 113 if ( defined $before && $before ne '' ) { push @arr,$before; } 114 push @arr,'.'; # If the tag is not present in the VCF, a missing value ('.') will be printed instead. 115 if ( exists($idx{$match}) ) 116 { 117 warn("The tag \"$match\" given multiple times, only the last occurance will be used\n"); 118 } 119 $idx{$match} = $#arr; 120 $str = $'; 121 } 122 for (my $i=0; $i<@arr; $i++) 123 { 124 $arr[$i] =~ s/\\{1}//g; 125 } 126 $$hash{format} = \@arr; 127 $$hash{idx} = \%idx; 128 $$hash{join1} = $join1; 129 $$hash{join2} = $join2; 130} 131 132sub parse_format 133{ 134 my ($opts,$cols) = @_; 135 136 $$opts{before} = {}; 137 $$opts{repeat} = {}; 138 $$opts{after} = {}; 139 140 my ($before,$repeat,$after); 141 142 my $str = $$opts{format_string}; 143 $before = $str; 144 145 if ( $str=~/\[([^\]]+)\]/ ) 146 { 147 $before = $`; 148 $repeat = $1; 149 $after = $'; 150 } 151 if ( $before ) { parse_format_string($before,$$opts{before}); } 152 if ( $repeat ) { parse_format_string($repeat,$$opts{repeat}); } 153 if ( $after ) { parse_format_string($after,$$opts{after}); } 154} 155 156sub copy_array 157{ 158 my ($arr) = @_; 159 my @out; 160 for my $item (@$arr) { push @out,$item; } 161 return @out; 162} 163 164sub get_columns 165{ 166 my ($vcf) = @_; 167 my @cols = (); 168 my $ncols = @{$$vcf{columns}}; 169 for (my $i=9; $i<$ncols; $i++) 170 { 171 push @cols, $$vcf{columns}[$i]; 172 } 173 return \@cols; 174} 175 176sub get_sample_idxs 177{ 178 my ($vcf,@samples) = @_; 179 my @idxs; 180 for my $sample (@samples) 181 { 182 if ( !exists($$vcf{has_column}{$sample}) ) { error("No such sample: [$sample]\n"); } 183 push @idxs, $$vcf{has_column}{$sample} - 1; 184 } 185 return @idxs; 186} 187 188sub list_columns 189{ 190 my ($opts) = @_; 191 my $cols = get_columns($$opts{vcf}); 192 for my $col (@$cols) { print "$col\n"; } 193} 194 195sub read_data 196{ 197 my ($opts) = @_; 198 199 if ( exists($$opts{use_old_method}) ) 200 { 201 read_data_slow_hash($opts); 202 return; 203 } 204 205 my %args = ( print_header=>1 ); 206 if ( $$opts{region} ) { $args{region} = $$opts{region}; } 207 if ( exists($$opts{file}) ) { $args{file} = $$opts{file}; } 208 else { $args{fh} = \*STDIN; } 209 210 my $vcf = Vcf->new(%args); 211 $$opts{vcf} = $vcf; 212 $vcf->parse_header(); 213 214 if ( $$opts{list_columns} ) { list_columns($opts); exit; } 215 216 my @cols = split(/,/,$$opts{columns}); 217 if ( !@cols ) { @cols = @{get_columns($$opts{vcf})}; } 218 my @sample_idxs = get_sample_idxs($$opts{vcf},@cols); 219 220 # The hash opts will be filled with the keys 'before','repeat','after' with formatting information 221 parse_format($opts); 222 while (my $line=$vcf->next_line()) 223 { 224 my $x = $vcf->next_data_array($line); 225 226 # Fill everything what comes before the repeat [] 227 if ( $$opts{before} ) 228 { 229 my (@out) = copy_array($$opts{before}{format}); 230 while (my ($fieldname,$idx) = each %{$$opts{before}{idx}}) 231 { 232 if ( $fieldname eq 'LINE' ) { chomp($line); $out[$idx] = $line; } 233 elsif ( exists($$vcf{has_column}{$fieldname}) ) { $out[$idx] = $$x[$$vcf{has_column}{$fieldname}-1]; } 234 elsif ( substr($fieldname,0,5) eq 'INFO/' ) 235 { 236 $out[$idx] = $vcf->get_info_field($$x[7],substr($fieldname,5)); 237 } 238 } 239 for (my $i=0; $i<@out; $i++) { if (!defined($out[$i])) { $out[$i]='.'; } } 240 print join('',@out); 241 } 242 243 # Fill the repeaty stuff (the sample columns) 244 if ( $$opts{repeat} ) 245 { 246 my @repeats; 247 for my $sample_idx (@sample_idxs) { push @repeats, [ copy_array($$opts{repeat}{format}) ]; } 248 249 my @alt; 250 if ( exists($$opts{repeat}{idx}{GT}) ) 251 { 252 @alt = split(/,/,$$x[4]); 253 } 254 while (my ($fieldname,$idx) = each %{$$opts{repeat}{idx}}) 255 { 256 if ( $fieldname eq '*' ) 257 { 258 my $sep1 = $$opts{repeat}{join1}; 259 my $sep2 = $$opts{repeat}{join2}; 260 my @fmt = split(/:/,$$x[8]); 261 for (my $i=0; $i<@sample_idxs; $i++) 262 { 263 my $sample_idx = $sample_idxs[$i]; 264 my @tmp; 265 my $j = 0; 266 for my $value (split(/:/,$$x[$sample_idx])) 267 { 268 push @tmp, $fmt[$j++].$sep1.$value; 269 } 270 $repeats[$i][$idx] = join($sep2,@tmp); 271 } 272 next; 273 } 274 275 my $fmt_idx = $vcf->get_tag_index($$x[8],$fieldname eq 'GTR' ? 'GT' : $fieldname,':'); 276 for (my $i=0; $i<@sample_idxs; $i++) 277 { 278 my $sample_idx = $sample_idxs[$i]; 279 if ( $fmt_idx!=-1 ) 280 { 281 my $value = $vcf->get_field($$x[$sample_idx],$fmt_idx); 282 if ( $fieldname eq 'GT' ) 283 { 284 $value = $vcf->decode_genotype($$x[3],\@alt,$value); 285 } 286 $repeats[$i][$idx] = $value; 287 } 288 } 289 } 290 if ( exists($$opts{repeat}{idx}{SAMPLE}) ) 291 { 292 my $idx = $$opts{repeat}{idx}{SAMPLE}; 293 for (my $i=0; $i<@cols; $i++) { $repeats[$i][$idx] = $cols[$i] } 294 } 295 296 for my $repeat (@repeats) 297 { 298 for (my $i=0; $i<@$repeat; $i++) { if (!defined($$repeat[$i])) { $$repeat[$i]='.'; } } 299 print join('',@$repeat); 300 } 301 } 302 303 # Fill everything what comes after the repeat ([]) 304 if ( $$opts{after} ) 305 { 306 my (@out) = copy_array($$opts{after}{format}); 307 while (my ($fieldname,$idx) = each %{$$opts{after}{idx}}) 308 { 309 if ( $fieldname eq 'LINE' ) { chomp($line); $out[$idx] = $line; } 310 elsif ( exists($$vcf{has_column}{$fieldname}) ) { $out[$idx] = $$x[$$vcf{has_column}{$fieldname}-1]; } 311 elsif ( substr($fieldname,0,5) eq 'INFO/' ) 312 { 313 $out[$idx] = $vcf->get_info_field($$x[7],substr($fieldname,5)); 314 } 315 } 316 for (my $i=0; $i<@out; $i++) { if (!defined($out[$i])) { $out[$i]='.'; } } 317 print join('',@out); 318 } 319 } 320} 321 322 323sub read_data_slow_hash 324{ 325 my ($opts) = @_; 326 327 my %args = ( print_header=>1 ); 328 if ( $$opts{region} ) { $args{region} = $$opts{region}; } 329 if ( exists($$opts{file}) ) { $args{file} = $$opts{file}; } 330 else { $args{fh} = \*STDIN; } 331 332 my $vcf = Vcf->new(%args); 333 $$opts{vcf} = $vcf; 334 $vcf->parse_header(); 335 336 if ( $$opts{list_columns} ) { list_columns($opts); exit; } 337 338 my @cols = split(/,/,$$opts{columns}); 339 if ( !@cols ) { @cols = @{get_columns($$opts{vcf})}; } 340 341 # The hash opts will be filled with the keys 'before','repeat','after' with formatting information 342 parse_format($opts); 343 344 while (my $line=$vcf->next_line()) 345 { 346 my $x=$vcf->next_data_hash($line); 347 348 # Fill everything what comes before the repeat [] 349 # Code repetition and not very nice, should be changed at some point... 350 if ( $$opts{before} ) 351 { 352 my (@out) = copy_array($$opts{before}{format}); 353 while (my ($colname,$idx) = each %{$$opts{before}{idx}}) 354 { 355 if ( $colname eq 'LINE' ) { chomp($line); $out[$idx] = $line; next; } 356 if ( $colname eq 'ALT' ) { $out[$idx] = join(',',@{$$x{ALT}}); next; } 357 if ( $colname eq 'FILTER' ) { $out[$idx] = join(';',@{$$x{FILTER}}); next; } 358 if ( $colname=~m{INFO/(.+)} ) 359 { 360 if ( exists($$x{INFO}{$1}) && !defined($$x{INFO}{$1}) ) 361 { 362 # It is a flag 363 $out[$idx] = 'True'; 364 } 365 else 366 { 367 $out[$idx] = $$x{INFO}{$1}; 368 } 369 next; 370 } 371 if ( exists($$x{$colname}) ) { $out[$idx] = $$x{$colname}; } 372 } 373 for (my $i=0; $i<@out; $i++) { if (!defined($out[$i])) { $out[$i]='.'; } } 374 print join('',@out); 375 } 376 377 # Fill the repeaty stuff (the sample columns) 378 if ( $$opts{repeat} ) 379 { 380 for my $col (@cols) 381 { 382 my ($alleles,$seps,$is_phased,$is_empty) = $vcf->parse_haplotype($x,$col); 383 my (@out) = copy_array($$opts{repeat}{format}); 384 while (my ($colname,$idx) = each %{$$opts{repeat}{idx}}) 385 { 386 if ( exists($$x{gtypes}{$col}{$colname}) ) { $out[$idx] = $$x{gtypes}{$col}{$colname}; } 387 elsif ( exists($$x{$colname}) ) { $out[$idx] = $$x{$colname}; } 388 } 389 if ( exists($$opts{repeat}{idx}{SAMPLE}) ) { $out[$$opts{repeat}{idx}{SAMPLE}] = $col; } 390 if ( exists($$opts{repeat}{idx}{GTR}) ) 391 { 392 $out[$$opts{repeat}{idx}{GTR}] = $$x{gtypes}{$col}{GT}; 393 } 394 if ( exists($$opts{repeat}{idx}{GT}) ) 395 { 396 my $tmp = $$alleles[0]; 397 for (my $i=0; $i<@$seps; $i++) { $tmp .= $$seps[$i].$$alleles[$i+1]; } 398 $out[$$opts{repeat}{idx}{GT}] = $tmp; 399 } 400 if ( exists($$opts{repeat}{idx}{'*'}) ) 401 { 402 my $sep1 = $$opts{repeat}{join1}; 403 my $sep2 = $$opts{repeat}{join2}; 404 my @tmp; 405 while (my ($key,$value)=each(%{$$x{gtypes}{$col}})) 406 { 407 if ( $key eq 'GT' ) 408 { 409 $value = $$alleles[0]; 410 for (my $i=0; $i<@$seps; $i++) { $value .= $$seps[$i].$$alleles[$i+1]; } 411 } 412 push @tmp, $key.$sep1.$value; 413 } 414 my $idx = $$opts{repeat}{idx}{'*'}; 415 $out[$idx] = join($sep2,@tmp); 416 } 417 for (my $i=0; $i<@out; $i++) { if (!defined($out[$i])) { $out[$i]='.'; } } 418 print join('',@out); 419 } 420 } 421 422 # Fill everything what comes after the repeat ([]) 423 if ( $$opts{after} ) 424 { 425 my (@out) = copy_array($$opts{after}{format}); 426 while (my ($colname,$idx) = each %{$$opts{after}{idx}}) 427 { 428 if ( $colname eq 'LINE' ) { chomp($line); $out[$idx] = $line; next; } 429 if ( $colname eq 'ALT' ) { $out[$idx] = join(',',@{$$x{ALT}}); next; } 430 if ( $colname eq 'FILTER' ) { $out[$idx] = join(';',@{$$x{FILTER}}); next; } 431 if ( $colname=~m{INFO/(.+)} ) 432 { 433 if ( exists($$x{INFO}{$1}) && !defined($$x{INFO}{$1}) ) 434 { 435 # It is a flag 436 $out[$idx] = 'True'; 437 } 438 else 439 { 440 $out[$idx] = $$x{INFO}{$1}; 441 } 442 next; 443 } 444 if ( exists($$x{$colname}) ) { $out[$idx] = $$x{$colname}; } 445 } 446 for (my $i=0; $i<@out; $i++) { if (!defined($out[$i])) { $out[$i]='.'; } } 447 print join('',@out); 448 } 449 } 450} 451 452 453