1#!/usr/bin/env perl
2#
3# Author: petr.danecek@sanger
4#
5
6use strict;
7use warnings;
8use Carp;
9use Vcf;
10use FaSlice;
11
12my $opts = parse_params();
13flanking_sequence($opts);
14
15exit;
16
17#--------------------------------
18
19sub error
20{
21    my (@msg) = @_;
22    if ( scalar @msg ) { confess @msg; }
23    die
24        "About: Annotate VCF with flanking sequence (INFO/FS tag)\n",
25        "Usage: fill-fs [OPTIONS] file.vcf\n",
26        "Options:\n",
27        "   -b, --bed-mask <file>           Regions to mask (tabix indexed), multiple files can be given\n",
28        "   -c, --cluster <int>             Do self-masking of clustered variants within this range.\n",
29        "   -l, --length <int>              Flanking sequence length [100]\n",
30        "   -m, --mask-char <char|lc>       The character to use or \"lc\" for lowercase. This option must preceed\n",
31        "                                       -b, -v or -c in order to take effect. With multiple files works\n",
32        "                                        as a switch on the command line, see the example below [N]\n",
33        "   -r, --refseq <file>             The reference sequence.\n",
34        "   -v, --vcf-mask <file>           Mask known variants in the flanking sequence, multiple files can be given (tabix indexed)\n",
35        "   -h, -?, --help                  This help message.\n",
36        "Example:\n",
37        "   # Mask variants from the VCF file with N's and use lowercase for the bed file regions\n",
38        "   fill-fs file.vcf -v mask.vcf -m lc -b mask.bed\n",
39        "\n";
40}
41
42
43sub parse_params
44{
45    my $opts = { length=>100, mask=>[], cluster=>0 };
46    my $mask = $$opts{mask_char}{default} = 'N';
47    my $mask_changed = 0;
48    while (defined(my $arg=shift(@ARGV)))
49    {
50        if ( $arg eq '-c' || $arg eq '--cluster' ) { $$opts{cluster}=shift(@ARGV); $$opts{mask_char}{default}=$mask; $mask_changed=0; next; }
51        if ( $arg eq '-r' || $arg eq '--refseq' ) { $$opts{refseq}=shift(@ARGV); next; }
52        if ( $arg eq '-l' || $arg eq '--length' ) { $$opts{length}=shift(@ARGV); next; }
53        if ( $arg eq '-m' || $arg eq '--mask' ) { $mask=shift(@ARGV); check_mask_char($mask); $mask_changed=1; next; }
54        if ( $arg eq '-b' || $arg eq '--bed-mask' ) { $arg=shift(@ARGV); push @{$$opts{bed_mask}},$arg; $$opts{mask_char}{$arg}=$mask; $mask_changed=0; next; }
55        if ( $arg eq '-v' || $arg eq '--vcf-mask' ) { $arg=shift(@ARGV); push @{$$opts{vcf_mask}},$arg; $$opts{mask_char}{$arg}=$mask; $mask_changed=0; next; }
56        if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); }
57        if ( -e $arg && !exists($$opts{file}) ) { $$opts{file}=$arg; next; }
58        error("Unknown parameter \"$arg\". Run -h for help.\n");
59    }
60    if ( !($$opts{length}=~/^\d+$/) ) { error("Expected integer after -l, got $$opts{length}\n"); }
61    if ( !exists($$opts{refseq}) ) { error("Missing the -r option.\n"); }
62    if ( $mask_changed ) { error("The -m parameter must preceed -b, -v, or the file in order to take effect.\n"); }
63    return $opts;
64}
65
66sub check_mask_char
67{
68    my ($mask) = @_;
69    if ( $mask eq 'lc' ) { return; }
70    if ( length($mask) eq 1 ) { return; }
71    error("Currently only \"lc\" or one-character mask is supported, got \"$mask\".\n");
72}
73
74sub flanking_sequence
75{
76    my ($opts) = @_;
77
78    $$opts{faref} = FaSlice->new(file=>$$opts{refseq},size=>1_024,oob=>'N');
79
80    my $vcf = $$opts{vcf} = exists($$opts{file}) ? Vcf->new(file=>$$opts{file}) : Vcf->new(fh=>\*STDIN);
81    $vcf->parse_header;
82    $vcf->add_header_line({key=>'INFO',ID=>'FS',Number=>1,Type=>'String',Description=>'Flanking sequence'});
83    print $vcf->format_header;
84
85    my (@lines,@mask);
86    while (my $line=$vcf->next_data_array)
87    {
88        my $chr = $$line[0];
89        my $pos = $$line[1];
90        my $ref = $$line[3];
91        my $alt = $$line[4];
92
93        my $off;
94        $alt =~ s/,.+$//;   # first allele is used at multiallelic sites
95        ($off,$ref,$alt) = $vcf->normalize_alleles_pos($ref,$alt);
96        $pos += $off;
97
98        push @lines, { chr=>$chr, pos=>$pos, ref=>$ref, alt=>$alt, line=>$line };
99        push @mask, { chr=>$chr, pos=>$pos, ref=>$ref };
100        flush_buffers($opts,\@lines,\@mask);
101    }
102
103    flush_buffers($opts,\@lines,\@mask,1);
104}
105
106sub flush_buffers
107{
108    my ($opts,$lines,$mask,$force) = @_;
109
110    if ( !@$lines ) { return; }
111
112    if ( !$$opts{cluster} )
113    {
114        shift(@$mask);
115        output_line($opts,shift(@$lines),$mask);
116        return;
117    }
118
119    while ( @$lines && ($force or $$mask[0]{chr} ne $$lines[-1]{chr} or $$mask[0]{pos}+2*$$opts{cluster}<=$$lines[-1]{pos}) )
120    {
121        output_line($opts,$$lines[0],$mask);
122        shift(@$lines);
123        while ( @$mask && @$lines && ($$mask[0]{chr} ne $$lines[0]{chr} or $$mask[0]{pos}+$$opts{cluster}<=$$lines[0]{pos}) )
124        {
125            shift(@$mask);
126        }
127    }
128}
129
130sub output_line
131{
132    my ($opts,$hline,$mask) = @_;
133
134    my $chr  = $$hline{chr};
135    my $pos  = $$hline{pos};
136    my $ref  = $$hline{ref};
137    my $alt  = $$hline{alt};
138    my $line = $$hline{line};
139
140    my $seq_pos = $$opts{length};
141    my $reflen  = length($ref);
142    my $from = $pos-$$opts{length};
143    my $to   = $pos+($reflen-1)+$$opts{length};
144    my $seq  = $$opts{faref}->get_slice($chr,$from,$to);
145    $seq = mask_sequence($opts,$seq,$chr,$from,$to,$mask);
146
147    my $reflen_ori = $reflen;
148    my ($len,$indel,$off) = $$opts{vcf}->is_indel($ref,$alt);
149    if ( $len<0 )
150    {
151        $seq_pos += $off;
152        $ref = $indel;
153        $reflen = abs($len);
154        $alt = '-';
155    }
156    elsif ( $len>0 )
157    {
158        $seq_pos += $off;
159        $ref = '-';
160        $alt = $indel;
161        $reflen = $off-1;
162    }
163
164    substr($seq,$seq_pos,$reflen,"[$ref/$alt]");
165    if ( $reflen_ori - $reflen > 0 )
166    {
167        # for redundant pad bases which cannot be removed without changing the position, e.g. ACGT AC
168        $seq = substr($seq,$reflen_ori-$reflen);
169    }
170    if ( $$line[7] eq '.' or !defined $$line[7] )
171    {
172        $$line[7] = '';
173    }
174    else
175    {
176        $$line[7] .= ';';
177    }
178    $$line[7] .= "FS=$seq";
179
180    print join("\t",@$line),"\n";
181}
182
183sub mask_sequence
184{
185    my ($opts,$seq,$chr,$from,$to,$mask) = @_;
186    for my $m (@$mask)
187    {
188        my $reflen = length($$m{ref});
189        if ( $$m{chr} ne $chr or $$m{pos}+$reflen<$from or $$m{pos}>$to ) { next; }
190        apply_mask($opts,\$seq,$$m{pos}-$from,$$m{ref},$$opts{mask_char}{default});
191    }
192    for my $file (@{$$opts{vcf_mask}})
193    {
194        my @tabix = `tabix $file $chr:$from-$to`;
195        for my $ret (@tabix)
196        {
197            my $items = $$opts{vcf}->split_mandatory($ret);
198            # In different situations one may want to treat indels differently. For
199            #   now, mask the whole REF string as for primer design it is safer to
200            #   mask the whole thing; for example, a 2bp deletion can be reported by
201            #   samtools as REF=GACACACA ALT=GACACA, the script will mask it all.
202            apply_mask($opts,\$seq,$$items[1]-$from,$$items[3],$$opts{mask_char}{$file});
203        }
204    }
205    for my $file (@{$$opts{bed_mask}})
206    {
207        my @tabix = `tabix $file $chr:$from-$to`;
208        for my $ret (@tabix)
209        {
210            my @items = split(/\t/,$ret);
211            apply_mask($opts,\$seq,$items[1]-$from+1,$items[2]-$from,$$opts{mask_char}{$file});
212        }
213    }
214    return $seq;
215}
216
217sub apply_mask
218{
219    my ($opts,$seq,$from,$ref,$mask_char) = @_;
220    if ( $from<0 ) { $from=0; }
221    my $ref_len = $ref=~/^\d+$/ ? $ref-$from+1 : length($ref);
222    my $seq_len = length($$seq);
223    if ( $from+$ref_len>=$seq_len ) { $ref_len = $seq_len - $from; }
224    if ( $ref_len<0 ) { return; }
225    if ( $ref_len==1 )
226    {
227        my $rpl = substr($$seq,$from,1);
228        $rpl = $mask_char eq 'lc' ? lc(substr($$seq,$from,1)) : $mask_char;
229        substr($$seq,$from,1,$rpl);
230        return;
231    }
232    my $rpl = substr($$seq,$from,$ref_len);
233    $rpl = $mask_char eq 'lc' ? lc(substr($$seq,$from,$ref_len)) : ($mask_char x $ref_len);
234    substr($$seq,$from,$ref_len,$rpl);
235}
236
237
238