1#!/usr/bin/env perl
2#
3# Author: petr.danecek@sanger
4#
5
6use strict;
7use warnings;
8use Carp;
9use Vcf;
10
11my $opts = parse_params();
12read_data($opts);
13
14exit;
15
16#--------------------------------
17
18sub error
19{
20    my (@msg) = @_;
21    if ( scalar @msg ) { confess @msg; }
22    die
23        "Usage: vcf-query [OPTIONS] file.vcf.gz\n",
24        "Options:\n",
25        "   -c, --columns <file|list>           List of comma-separated column names or one column name per line in a file.\n",
26        "   -f, --format <string>               The default is '%CHROM:%POS\\t%REF[\\t%SAMPLE=%GT]\\n'\n",
27        "   -l, --list-columns                  List columns.\n",
28        "   -r, --region chr:from-to            Retrieve the region. (Runs tabix.)\n",
29        "       --use-old-method                Use old version of API, which is slow but more robust.\n",
30        "   -h, -?, --help                      This help message.\n",
31        "Expressions:\n",
32        "   %CHROM          The CHROM column (similarly also other columns)\n",
33        "   %GT             Translated genotype (e.g. C/A)\n",
34        "   %GTR            Raw genotype (e.g. 0/1)\n",
35        "   %INFO/TAG       Any tag in the INFO column\n",
36        "   %LINE           Prints the whole line\n",
37        "   %SAMPLE         Sample name\n",
38        "   []              The brackets loop over all samples\n",
39        "   %*<A><B>        All format fields printed as KEY<A>VALUE<B>\n",
40        "Examples:\n",
41        "   vcf-query file.vcf.gz 1:1000-2000 -c NA001,NA002,NA003\n",
42        "   vcf-query file.vcf.gz -r 1:1000-2000 -f '%CHROM:%POS\\t%REF\\t%ALT[\\t%SAMPLE:%*=,]\\n'\n",
43        "   vcf-query file.vcf.gz -f '[%GT\\t]%LINE\\n'\n",
44        "   vcf-query file.vcf.gz -f '[%GT\\ ]%LINE\\n'\n",
45        "   vcf-query file.vcf.gz -f '%CHROM\\_%POS\\t%INFO/DP\\t%FILTER\\n'\n",
46        "Notes:\n",
47        "   Please use `bcftools query` instead, this script will not be supported in future.\n",
48        "\n";
49}
50
51
52sub parse_params
53{
54    my $opts = { columns=>'', format_string=>"%CHROM:%POS\t%REF[\t%SAMPLE=%GT]\n" };
55    while (defined(my $arg=shift(@ARGV)))
56    {
57        if (                 $arg eq '--use-old-method' ) { $$opts{use_old_method}=1; next }
58        if ( $arg eq '-f' || $arg eq '--format' ) { $$opts{format_string}=shift(@ARGV); next }
59        if ( $arg eq '-c' || $arg eq '--columns' ) { $$opts{columns}=shift(@ARGV); next }
60        if ( $arg eq '-l' || $arg eq '--list-columns' ) { $$opts{list_columns}=1; next }
61        if ( $arg eq '-r' || $arg eq '--region' ) { $$opts{region}=shift(@ARGV); next }
62        if ( -e $arg or $arg=~m{^(?:ftp|http)://} ) { $$opts{file}=$arg; next; }
63        if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); }
64        if ( !exists($$opts{region}) && exists($$opts{file}) && ($arg=~/^[^:]+:[0-9,]+-[0-9,]+$/ or $arg=~/^[^\:]+$/) ) { $$opts{region}=$arg; next; }
65        error("Unknown parameter or non-existent file \"$arg\". Run -h for help.\n");
66    }
67    if ( !exists($$opts{file}) && exists($$opts{region}) ) { error("The region cannot be used when streaming the file.\n"); }
68    if ( exists($$opts{columns}) && -e $$opts{columns} )
69    {
70        my @cols;
71        open(my $fh,'<',$$opts{columns}) or error("$$opts{columns}: $!");
72        while (my $line=<$fh>)
73        {
74            if ( $line=~/^\s*$/ ) { next; }
75            $line =~ s/^\s*//;
76            $line =~ s/\s*$//;
77            push @cols, $line;
78        }
79        close($fh);
80        $$opts{columns} = join(',', @cols);
81    }
82    return $opts;
83}
84
85sub parse_format_string
86{
87    my ($str,$hash) = @_;
88    my (@arr,%idx,$join1,$join2);
89    $str =~ s/\\n/\n/g;
90    $str =~ s/\\t/\t/g;
91    while ($str)
92    {
93        if ( !($str=~/%/) )
94        {
95            push @arr,$str;
96            last;
97        }
98
99        my $before = $`;
100        $str = $';
101
102        my $match;
103        if ( $str=~/^[*](.)(.)/ )
104        {
105            $match = '*'; $join1=$1; $join2=$2;
106        }
107        elsif ( $str=~m{([A-Za-z0-9/_]+)} )
108        {
109            $match = $1;
110        }
111        else { error("FIXME: $str"); }
112
113        if ( defined $before && $before ne '' ) { push @arr,$before; }
114        push @arr,'.';      # If the tag is not present in the VCF, a missing value ('.') will be printed instead.
115        if ( exists($idx{$match}) )
116        {
117            warn("The tag \"$match\" given multiple times, only the last occurance will be used\n");
118        }
119        $idx{$match} = $#arr;
120        $str = $';
121    }
122    for (my $i=0; $i<@arr; $i++)
123    {
124        $arr[$i] =~ s/\\{1}//g;
125    }
126    $$hash{format} = \@arr;
127    $$hash{idx}    = \%idx;
128    $$hash{join1}  = $join1;
129    $$hash{join2}  = $join2;
130}
131
132sub parse_format
133{
134    my ($opts,$cols) = @_;
135
136    $$opts{before} = {};
137    $$opts{repeat} = {};
138    $$opts{after}  = {};
139
140    my ($before,$repeat,$after);
141
142    my $str = $$opts{format_string};
143    $before = $str;
144
145    if ( $str=~/\[([^\]]+)\]/ )
146    {
147        $before = $`;
148        $repeat = $1;
149        $after  = $';
150    }
151    if ( $before ) { parse_format_string($before,$$opts{before}); }
152    if ( $repeat ) { parse_format_string($repeat,$$opts{repeat}); }
153    if ( $after ) { parse_format_string($after,$$opts{after}); }
154}
155
156sub copy_array
157{
158    my ($arr) = @_;
159    my @out;
160    for my $item (@$arr) { push @out,$item; }
161    return @out;
162}
163
164sub get_columns
165{
166    my ($vcf) = @_;
167    my @cols = ();
168    my $ncols = @{$$vcf{columns}};
169    for (my $i=9; $i<$ncols; $i++)
170    {
171        push @cols, $$vcf{columns}[$i];
172    }
173    return \@cols;
174}
175
176sub get_sample_idxs
177{
178    my ($vcf,@samples) = @_;
179    my @idxs;
180    for my $sample (@samples)
181    {
182        if ( !exists($$vcf{has_column}{$sample}) ) { error("No such sample: [$sample]\n"); }
183        push @idxs, $$vcf{has_column}{$sample} - 1;
184    }
185    return @idxs;
186}
187
188sub list_columns
189{
190    my ($opts) = @_;
191    my $cols = get_columns($$opts{vcf});
192    for my $col (@$cols) { print "$col\n"; }
193}
194
195sub read_data
196{
197    my ($opts) = @_;
198
199    if ( exists($$opts{use_old_method}) )
200    {
201        read_data_slow_hash($opts);
202        return;
203    }
204
205    my %args = ( print_header=>1 );
206    if ( $$opts{region} ) { $args{region} = $$opts{region}; }
207    if ( exists($$opts{file}) ) { $args{file} = $$opts{file}; }
208    else { $args{fh} = \*STDIN; }
209
210    my $vcf = Vcf->new(%args);
211    $$opts{vcf} = $vcf;
212    $vcf->parse_header();
213
214    if ( $$opts{list_columns} ) { list_columns($opts); exit; }
215
216    my @cols = split(/,/,$$opts{columns});
217    if ( !@cols ) { @cols = @{get_columns($$opts{vcf})}; }
218    my @sample_idxs = get_sample_idxs($$opts{vcf},@cols);
219
220    # The hash opts will be filled with the keys 'before','repeat','after' with formatting information
221    parse_format($opts);
222    while (my $line=$vcf->next_line())
223    {
224        my $x = $vcf->next_data_array($line);
225
226        # Fill everything what comes before the repeat []
227        if ( $$opts{before} )
228        {
229            my (@out) = copy_array($$opts{before}{format});
230            while (my ($fieldname,$idx) = each %{$$opts{before}{idx}})
231            {
232                if ( $fieldname eq 'LINE' ) { chomp($line); $out[$idx] = $line; }
233                elsif ( exists($$vcf{has_column}{$fieldname}) ) { $out[$idx] = $$x[$$vcf{has_column}{$fieldname}-1]; }
234                elsif ( substr($fieldname,0,5) eq 'INFO/' )
235                {
236                    $out[$idx] = $vcf->get_info_field($$x[7],substr($fieldname,5));
237                }
238            }
239            for (my $i=0; $i<@out; $i++) { if (!defined($out[$i])) { $out[$i]='.'; } }
240            print join('',@out);
241        }
242
243        # Fill the repeaty stuff (the sample columns)
244        if ( $$opts{repeat} )
245        {
246            my @repeats;
247            for my $sample_idx (@sample_idxs) { push @repeats, [ copy_array($$opts{repeat}{format}) ]; }
248
249            my @alt;
250            if ( exists($$opts{repeat}{idx}{GT}) )
251            {
252                @alt = split(/,/,$$x[4]);
253            }
254            while (my ($fieldname,$idx) = each %{$$opts{repeat}{idx}})
255            {
256                if ( $fieldname eq '*' )
257                {
258                    my $sep1 = $$opts{repeat}{join1};
259                    my $sep2 = $$opts{repeat}{join2};
260                    my @fmt = split(/:/,$$x[8]);
261                    for (my $i=0; $i<@sample_idxs; $i++)
262                    {
263                        my $sample_idx = $sample_idxs[$i];
264                        my @tmp;
265                        my $j = 0;
266                        for my $value (split(/:/,$$x[$sample_idx]))
267                        {
268                            push @tmp, $fmt[$j++].$sep1.$value;
269                        }
270                        $repeats[$i][$idx] = join($sep2,@tmp);
271                    }
272                    next;
273                }
274
275                my $fmt_idx = $vcf->get_tag_index($$x[8],$fieldname eq 'GTR' ? 'GT' : $fieldname,':');
276                for (my $i=0; $i<@sample_idxs; $i++)
277                {
278                    my $sample_idx = $sample_idxs[$i];
279                    if ( $fmt_idx!=-1 )
280                    {
281                        my $value = $vcf->get_field($$x[$sample_idx],$fmt_idx);
282                        if ( $fieldname eq 'GT' )
283                        {
284                            $value = $vcf->decode_genotype($$x[3],\@alt,$value);
285                        }
286                        $repeats[$i][$idx] = $value;
287                    }
288                }
289            }
290            if ( exists($$opts{repeat}{idx}{SAMPLE}) )
291            {
292                my $idx = $$opts{repeat}{idx}{SAMPLE};
293                for (my $i=0; $i<@cols; $i++) { $repeats[$i][$idx] = $cols[$i] }
294            }
295
296            for my $repeat (@repeats)
297            {
298                for (my $i=0; $i<@$repeat; $i++) { if (!defined($$repeat[$i])) { $$repeat[$i]='.'; } }
299                print join('',@$repeat);
300            }
301        }
302
303        # Fill everything what comes after the repeat ([])
304        if ( $$opts{after} )
305        {
306            my (@out) = copy_array($$opts{after}{format});
307            while (my ($fieldname,$idx) = each %{$$opts{after}{idx}})
308            {
309                if ( $fieldname eq 'LINE' ) { chomp($line); $out[$idx] = $line; }
310                elsif ( exists($$vcf{has_column}{$fieldname}) ) { $out[$idx] = $$x[$$vcf{has_column}{$fieldname}-1]; }
311                elsif ( substr($fieldname,0,5) eq 'INFO/' )
312                {
313                    $out[$idx] = $vcf->get_info_field($$x[7],substr($fieldname,5));
314                }
315            }
316            for (my $i=0; $i<@out; $i++) { if (!defined($out[$i])) { $out[$i]='.'; } }
317            print join('',@out);
318        }
319    }
320}
321
322
323sub read_data_slow_hash
324{
325    my ($opts) = @_;
326
327    my %args = ( print_header=>1 );
328    if ( $$opts{region} ) { $args{region} = $$opts{region}; }
329    if ( exists($$opts{file}) ) { $args{file} = $$opts{file}; }
330    else { $args{fh} = \*STDIN; }
331
332    my $vcf = Vcf->new(%args);
333    $$opts{vcf} = $vcf;
334    $vcf->parse_header();
335
336    if ( $$opts{list_columns} ) { list_columns($opts); exit; }
337
338    my @cols = split(/,/,$$opts{columns});
339    if ( !@cols ) { @cols = @{get_columns($$opts{vcf})}; }
340
341    # The hash opts will be filled with the keys 'before','repeat','after' with formatting information
342    parse_format($opts);
343
344    while (my $line=$vcf->next_line())
345    {
346        my $x=$vcf->next_data_hash($line);
347
348        # Fill everything what comes before the repeat []
349        # Code repetition and not very nice, should be changed at some point...
350        if ( $$opts{before} )
351        {
352            my (@out) = copy_array($$opts{before}{format});
353            while (my ($colname,$idx) = each %{$$opts{before}{idx}})
354            {
355                if ( $colname eq 'LINE' ) { chomp($line); $out[$idx] = $line; next; }
356                if ( $colname eq 'ALT' ) { $out[$idx] = join(',',@{$$x{ALT}}); next; }
357                if ( $colname eq 'FILTER' ) { $out[$idx] = join(';',@{$$x{FILTER}}); next; }
358                if ( $colname=~m{INFO/(.+)} )
359                {
360                    if ( exists($$x{INFO}{$1}) && !defined($$x{INFO}{$1}) )
361                    {
362                        # It is a flag
363                        $out[$idx] = 'True';
364                    }
365                    else
366                    {
367                        $out[$idx] = $$x{INFO}{$1};
368                    }
369                    next;
370                }
371                if ( exists($$x{$colname}) ) { $out[$idx] = $$x{$colname}; }
372            }
373            for (my $i=0; $i<@out; $i++) { if (!defined($out[$i])) { $out[$i]='.'; } }
374            print join('',@out);
375        }
376
377        # Fill the repeaty stuff (the sample columns)
378        if ( $$opts{repeat} )
379        {
380            for my $col (@cols)
381            {
382                my ($alleles,$seps,$is_phased,$is_empty) = $vcf->parse_haplotype($x,$col);
383                my (@out) = copy_array($$opts{repeat}{format});
384                while (my ($colname,$idx) = each %{$$opts{repeat}{idx}})
385                {
386                    if ( exists($$x{gtypes}{$col}{$colname}) ) { $out[$idx] = $$x{gtypes}{$col}{$colname}; }
387                    elsif ( exists($$x{$colname}) ) { $out[$idx] = $$x{$colname}; }
388                }
389                if ( exists($$opts{repeat}{idx}{SAMPLE}) ) { $out[$$opts{repeat}{idx}{SAMPLE}] = $col; }
390                if ( exists($$opts{repeat}{idx}{GTR}) )
391                {
392                    $out[$$opts{repeat}{idx}{GTR}] = $$x{gtypes}{$col}{GT};
393                }
394                if ( exists($$opts{repeat}{idx}{GT}) )
395                {
396                    my $tmp = $$alleles[0];
397                    for (my $i=0; $i<@$seps; $i++) { $tmp .= $$seps[$i].$$alleles[$i+1]; }
398                    $out[$$opts{repeat}{idx}{GT}] = $tmp;
399                }
400                if ( exists($$opts{repeat}{idx}{'*'}) )
401                {
402                    my $sep1 = $$opts{repeat}{join1};
403                    my $sep2 = $$opts{repeat}{join2};
404                    my @tmp;
405                    while (my ($key,$value)=each(%{$$x{gtypes}{$col}}))
406                    {
407                        if ( $key eq 'GT' )
408                        {
409                            $value = $$alleles[0];
410                            for (my $i=0; $i<@$seps; $i++) { $value .= $$seps[$i].$$alleles[$i+1]; }
411                        }
412                        push @tmp, $key.$sep1.$value;
413                    }
414                    my $idx = $$opts{repeat}{idx}{'*'};
415                    $out[$idx] = join($sep2,@tmp);
416                }
417                for (my $i=0; $i<@out; $i++) { if (!defined($out[$i])) { $out[$i]='.'; } }
418                print join('',@out);
419            }
420        }
421
422        # Fill everything what comes after the repeat ([])
423        if ( $$opts{after} )
424        {
425            my (@out) = copy_array($$opts{after}{format});
426            while (my ($colname,$idx) = each %{$$opts{after}{idx}})
427            {
428                if ( $colname eq 'LINE' ) { chomp($line); $out[$idx] = $line; next; }
429                if ( $colname eq 'ALT' ) { $out[$idx] = join(',',@{$$x{ALT}}); next; }
430                if ( $colname eq 'FILTER' ) { $out[$idx] = join(';',@{$$x{FILTER}}); next; }
431                if ( $colname=~m{INFO/(.+)} )
432                {
433                    if ( exists($$x{INFO}{$1}) && !defined($$x{INFO}{$1}) )
434                    {
435                        # It is a flag
436                        $out[$idx] = 'True';
437                    }
438                    else
439                    {
440                        $out[$idx] = $$x{INFO}{$1};
441                    }
442                    next;
443                }
444                if ( exists($$x{$colname}) ) { $out[$idx] = $$x{$colname}; }
445            }
446            for (my $i=0; $i<@out; $i++) { if (!defined($out[$i])) { $out[$i]='.'; } }
447            print join('',@out);
448        }
449    }
450}
451
452
453