1#!/usr/bin/env perl 2# 3# Author: petr.danecek@sanger 4# 5 6use strict; 7use warnings; 8use Carp; 9use Vcf; 10use FaSlice; 11 12my $opts = parse_params(); 13flanking_sequence($opts); 14 15exit; 16 17#-------------------------------- 18 19sub error 20{ 21 my (@msg) = @_; 22 if ( scalar @msg ) { confess @msg; } 23 die 24 "About: Annotate VCF with flanking sequence (INFO/FS tag)\n", 25 "Usage: fill-fs [OPTIONS] file.vcf\n", 26 "Options:\n", 27 " -b, --bed-mask <file> Regions to mask (tabix indexed), multiple files can be given\n", 28 " -c, --cluster <int> Do self-masking of clustered variants within this range.\n", 29 " -l, --length <int> Flanking sequence length [100]\n", 30 " -m, --mask-char <char|lc> The character to use or \"lc\" for lowercase. This option must preceed\n", 31 " -b, -v or -c in order to take effect. With multiple files works\n", 32 " as a switch on the command line, see the example below [N]\n", 33 " -r, --refseq <file> The reference sequence.\n", 34 " -v, --vcf-mask <file> Mask known variants in the flanking sequence, multiple files can be given (tabix indexed)\n", 35 " -h, -?, --help This help message.\n", 36 "Example:\n", 37 " # Mask variants from the VCF file with N's and use lowercase for the bed file regions\n", 38 " fill-fs file.vcf -v mask.vcf -m lc -b mask.bed\n", 39 "\n"; 40} 41 42 43sub parse_params 44{ 45 my $opts = { length=>100, mask=>[], cluster=>0 }; 46 my $mask = $$opts{mask_char}{default} = 'N'; 47 my $mask_changed = 0; 48 while (defined(my $arg=shift(@ARGV))) 49 { 50 if ( $arg eq '-c' || $arg eq '--cluster' ) { $$opts{cluster}=shift(@ARGV); $$opts{mask_char}{default}=$mask; $mask_changed=0; next; } 51 if ( $arg eq '-r' || $arg eq '--refseq' ) { $$opts{refseq}=shift(@ARGV); next; } 52 if ( $arg eq '-l' || $arg eq '--length' ) { $$opts{length}=shift(@ARGV); next; } 53 if ( $arg eq '-m' || $arg eq '--mask' ) { $mask=shift(@ARGV); check_mask_char($mask); $mask_changed=1; next; } 54 if ( $arg eq '-b' || $arg eq '--bed-mask' ) { $arg=shift(@ARGV); push @{$$opts{bed_mask}},$arg; $$opts{mask_char}{$arg}=$mask; $mask_changed=0; next; } 55 if ( $arg eq '-v' || $arg eq '--vcf-mask' ) { $arg=shift(@ARGV); push @{$$opts{vcf_mask}},$arg; $$opts{mask_char}{$arg}=$mask; $mask_changed=0; next; } 56 if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } 57 if ( -e $arg && !exists($$opts{file}) ) { $$opts{file}=$arg; next; } 58 error("Unknown parameter \"$arg\". Run -h for help.\n"); 59 } 60 if ( !($$opts{length}=~/^\d+$/) ) { error("Expected integer after -l, got $$opts{length}\n"); } 61 if ( !exists($$opts{refseq}) ) { error("Missing the -r option.\n"); } 62 if ( $mask_changed ) { error("The -m parameter must preceed -b, -v, or the file in order to take effect.\n"); } 63 return $opts; 64} 65 66sub check_mask_char 67{ 68 my ($mask) = @_; 69 if ( $mask eq 'lc' ) { return; } 70 if ( length($mask) eq 1 ) { return; } 71 error("Currently only \"lc\" or one-character mask is supported, got \"$mask\".\n"); 72} 73 74sub flanking_sequence 75{ 76 my ($opts) = @_; 77 78 $$opts{faref} = FaSlice->new(file=>$$opts{refseq},size=>1_024,oob=>'N'); 79 80 my $vcf = $$opts{vcf} = exists($$opts{file}) ? Vcf->new(file=>$$opts{file}) : Vcf->new(fh=>\*STDIN); 81 $vcf->parse_header; 82 $vcf->add_header_line({key=>'INFO',ID=>'FS',Number=>1,Type=>'String',Description=>'Flanking sequence'}); 83 print $vcf->format_header; 84 85 my (@lines,@mask); 86 while (my $line=$vcf->next_data_array) 87 { 88 my $chr = $$line[0]; 89 my $pos = $$line[1]; 90 my $ref = $$line[3]; 91 my $alt = $$line[4]; 92 93 my $off; 94 $alt =~ s/,.+$//; # first allele is used at multiallelic sites 95 ($off,$ref,$alt) = $vcf->normalize_alleles_pos($ref,$alt); 96 $pos += $off; 97 98 push @lines, { chr=>$chr, pos=>$pos, ref=>$ref, alt=>$alt, line=>$line }; 99 push @mask, { chr=>$chr, pos=>$pos, ref=>$ref }; 100 flush_buffers($opts,\@lines,\@mask); 101 } 102 103 flush_buffers($opts,\@lines,\@mask,1); 104} 105 106sub flush_buffers 107{ 108 my ($opts,$lines,$mask,$force) = @_; 109 110 if ( !@$lines ) { return; } 111 112 if ( !$$opts{cluster} ) 113 { 114 shift(@$mask); 115 output_line($opts,shift(@$lines),$mask); 116 return; 117 } 118 119 while ( @$lines && ($force or $$mask[0]{chr} ne $$lines[-1]{chr} or $$mask[0]{pos}+2*$$opts{cluster}<=$$lines[-1]{pos}) ) 120 { 121 output_line($opts,$$lines[0],$mask); 122 shift(@$lines); 123 while ( @$mask && @$lines && ($$mask[0]{chr} ne $$lines[0]{chr} or $$mask[0]{pos}+$$opts{cluster}<=$$lines[0]{pos}) ) 124 { 125 shift(@$mask); 126 } 127 } 128} 129 130sub output_line 131{ 132 my ($opts,$hline,$mask) = @_; 133 134 my $chr = $$hline{chr}; 135 my $pos = $$hline{pos}; 136 my $ref = $$hline{ref}; 137 my $alt = $$hline{alt}; 138 my $line = $$hline{line}; 139 140 my $seq_pos = $$opts{length}; 141 my $reflen = length($ref); 142 my $from = $pos-$$opts{length}; 143 my $to = $pos+($reflen-1)+$$opts{length}; 144 my $seq = $$opts{faref}->get_slice($chr,$from,$to); 145 $seq = mask_sequence($opts,$seq,$chr,$from,$to,$mask); 146 147 my $reflen_ori = $reflen; 148 my ($len,$indel,$off) = $$opts{vcf}->is_indel($ref,$alt); 149 if ( $len<0 ) 150 { 151 $seq_pos += $off; 152 $ref = $indel; 153 $reflen = abs($len); 154 $alt = '-'; 155 } 156 elsif ( $len>0 ) 157 { 158 $seq_pos += $off; 159 $ref = '-'; 160 $alt = $indel; 161 $reflen = $off-1; 162 } 163 164 substr($seq,$seq_pos,$reflen,"[$ref/$alt]"); 165 if ( $reflen_ori - $reflen > 0 ) 166 { 167 # for redundant pad bases which cannot be removed without changing the position, e.g. ACGT AC 168 $seq = substr($seq,$reflen_ori-$reflen); 169 } 170 if ( $$line[7] eq '.' or !defined $$line[7] ) 171 { 172 $$line[7] = ''; 173 } 174 else 175 { 176 $$line[7] .= ';'; 177 } 178 $$line[7] .= "FS=$seq"; 179 180 print join("\t",@$line),"\n"; 181} 182 183sub mask_sequence 184{ 185 my ($opts,$seq,$chr,$from,$to,$mask) = @_; 186 for my $m (@$mask) 187 { 188 my $reflen = length($$m{ref}); 189 if ( $$m{chr} ne $chr or $$m{pos}+$reflen<$from or $$m{pos}>$to ) { next; } 190 apply_mask($opts,\$seq,$$m{pos}-$from,$$m{ref},$$opts{mask_char}{default}); 191 } 192 for my $file (@{$$opts{vcf_mask}}) 193 { 194 my @tabix = `tabix $file $chr:$from-$to`; 195 for my $ret (@tabix) 196 { 197 my $items = $$opts{vcf}->split_mandatory($ret); 198 # In different situations one may want to treat indels differently. For 199 # now, mask the whole REF string as for primer design it is safer to 200 # mask the whole thing; for example, a 2bp deletion can be reported by 201 # samtools as REF=GACACACA ALT=GACACA, the script will mask it all. 202 apply_mask($opts,\$seq,$$items[1]-$from,$$items[3],$$opts{mask_char}{$file}); 203 } 204 } 205 for my $file (@{$$opts{bed_mask}}) 206 { 207 my @tabix = `tabix $file $chr:$from-$to`; 208 for my $ret (@tabix) 209 { 210 my @items = split(/\t/,$ret); 211 apply_mask($opts,\$seq,$items[1]-$from+1,$items[2]-$from,$$opts{mask_char}{$file}); 212 } 213 } 214 return $seq; 215} 216 217sub apply_mask 218{ 219 my ($opts,$seq,$from,$ref,$mask_char) = @_; 220 if ( $from<0 ) { $from=0; } 221 my $ref_len = $ref=~/^\d+$/ ? $ref-$from+1 : length($ref); 222 my $seq_len = length($$seq); 223 if ( $from+$ref_len>=$seq_len ) { $ref_len = $seq_len - $from; } 224 if ( $ref_len<0 ) { return; } 225 if ( $ref_len==1 ) 226 { 227 my $rpl = substr($$seq,$from,1); 228 $rpl = $mask_char eq 'lc' ? lc(substr($$seq,$from,1)) : $mask_char; 229 substr($$seq,$from,1,$rpl); 230 return; 231 } 232 my $rpl = substr($$seq,$from,$ref_len); 233 $rpl = $mask_char eq 'lc' ? lc(substr($$seq,$from,$ref_len)) : ($mask_char x $ref_len); 234 substr($$seq,$from,$ref_len,$rpl); 235} 236 237 238