1#!/usr/local/bin/perl 2# 3# The MIT License 4# 5# Copyright (c) 2017-2018, 2020 Genome Research Ltd. 6# 7# Author: Petr Danecek <pd3@sanger.ac.uk> 8# 9# Permission is hereby granted, free of charge, to any person obtaining a copy 10# of this software and associated documentation files (the "Software"), to deal 11# in the Software without restriction, including without limitation the rights 12# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 13# copies of the Software, and to permit persons to whom the Software is 14# furnished to do so, subject to the following conditions: 15# 16# The above copyright notice and this permission notice shall be included in 17# all copies or substantial portions of the Software. 18# 19# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 22# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 25# THE SOFTWARE. 26 27use strict; 28use warnings; 29use Carp; 30use Data::Dumper; 31 32my $opts = parse_params(); 33run_roh($opts); 34eval_roh($opts); 35 36exit; 37 38#-------------------------------- 39 40sub error 41{ 42 my (@msg) = @_; 43 if ( scalar @msg ) { confess @msg,"\n"; } 44 print 45 "About: This is a convenience wrapper for \"bcftools roh\" which takes multiple VCF/BCF files\n", 46 " and locates regions private to a sample or shared across multiple samples. On input it\n", 47 " expects a directory with .vcf, .vcf.gz or .bcf files, a file with allele frequencies\n", 48 " and optionally a genetic map. See http://samtools.github.io/bcftools/howtos/roh-calling.html\n", 49 " for details\n", 50 "Usage: run-roh.pl [OPTIONS]\n", 51 "Options:\n", 52 " -a, --af-annots <file> Allele frequency annotations [1000GP-AFs/AFs.tab.gz]\n", 53 " -i, --indir <dir> Input directory with VCF files\n", 54 " --include <expr> Select sites for which the expression is true\n", 55 " --exclude <expr> Exclude sites for which the epxression is true\n", 56 " -l, --min-length <num> Filter input regions shorter than this [1e6]\n", 57 " -m, --genmap <dir> Directory with genetic map in IMPUTE2 format (optional)\n", 58 " -M, --rec-rate <float> constant recombination rate per bp (optional)\n", 59 " -n, --min-markers <num> Filter input regions with fewer marker than this [100]\n", 60 " -o, --outdir <dir> Output directory\n", 61 " -q, --min-qual <num> Filter input regions with quality smaller than this [10]\n", 62 " --roh-args <string> Extra arguments to pass to bcftools roh\n", 63 " -s, --silent Quiet output, do not print commands\n", 64 " -h, -?, --help This help message\n", 65 "\n"; 66 exit -1; 67} 68sub parse_params 69{ 70 my $opts = 71 { 72 af_annots => '1000GP-AFs/AFs.tab.gz', 73 verbose => 1, 74 min_length => 1e6, 75 min_markers => 100, 76 min_qual => 10, 77 roh_args => '', 78 }; 79 while (defined(my $arg=shift(@ARGV))) 80 { 81 if ( $arg eq '--roh-args' ) { $$opts{roh_args}=shift(@ARGV); next } 82 if ( $arg eq '--include' ) { $$opts{include_expr}=shift(@ARGV); next } 83 if ( $arg eq '--exclude' ) { $$opts{exclude_expr}=shift(@ARGV); next } 84 if ( $arg eq '-q' || $arg eq '--min-qual' ) { $$opts{min_qual}=shift(@ARGV); next } 85 if ( $arg eq '-l' || $arg eq '--min-length' ) { $$opts{min_length}=shift(@ARGV); next } 86 if ( $arg eq '-n' || $arg eq '--min-markers' ) { $$opts{min_markers}=shift(@ARGV); next } 87 if ( $arg eq '-s' || $arg eq '--silent' ) { $$opts{verbose}=0; next } 88 if ( $arg eq '-a' || $arg eq '--af-annots' ) { $$opts{af_annots}=shift(@ARGV); next } 89 if ( $arg eq '-m' || $arg eq '--genmap' ) { $$opts{genmap}=shift(@ARGV); next } 90 if ( $arg eq '-M' || $arg eq '--rec-rate' ) { $$opts{rec_rate}=shift(@ARGV); next } 91 if ( $arg eq '-o' || $arg eq '--outdir' ) { $$opts{outdir}=shift(@ARGV); next } 92 if ( $arg eq '-i' || $arg eq '--indir' ) { $$opts{indir}=shift(@ARGV); next } 93 if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } 94 error("Unknown parameter \"$arg\". Run -h for help.\n"); 95 } 96 if ( !exists($$opts{outdir}) ) { error("Missing the -o, --outdir option.\n") } 97 if ( !exists($$opts{indir}) ) { error("Missing the -i, --indir option.\n") } 98 if ( ! -e $$opts{af_annots} ) { error("The annotation file does not exist: $$opts{af_annots}\n"); } 99 if ( ! -e "$$opts{af_annots}.tbi" ) { error("The annotation file is not indexed: $$opts{af_annots}.tbi\n"); } 100 if ( ! -e "$$opts{af_annots}.hdr" ) { error("The annotation file has no header: $$opts{af_annots}.hdr\n"); } 101 if ( exists($$opts{genmap}) && ! -d "$$opts{genmap}" ) { error("The directory with genetic maps does not exist: $$opts{genmap}\n"); } 102 if ( exists($$opts{include_expr}) ) { $$opts{include_expr} =~ s/\'/\'\\\'\'/g; $$opts{inc_exc} .= qq[ -i '$$opts{include_expr}']; } 103 if ( exists($$opts{exclude_expr}) ) { $$opts{exclude_expr} =~ s/\'/\'\\\'\'/g; $$opts{inc_exc} .= qq[ -e '$$opts{exclude_expr}']; } 104 return $opts; 105} 106 107sub cmd 108{ 109 my ($cmd,%args) = @_; 110 111 if ( $args{verbose} ) { print STDERR $cmd,"\n"; } 112 113 # Why not to use backticks? Perl calls /bin/sh, which is often bash. To get the correct 114 # status of failing pipes, it must be called with the pipefail option. 115 116 my $kid_io; 117 my $pid = open($kid_io, "-|"); 118 if ( !defined $pid ) { error("Cannot fork: $!"); } 119 120 my @out; 121 if ($pid) 122 { 123 # parent 124 @out = <$kid_io>; 125 close($kid_io); 126 } 127 else 128 { 129 # child 130 exec('bash', '-o','pipefail','-c', $cmd) or error("Failed to run the command [bash -o pipefail -c $cmd]: $!"); 131 } 132 133 if ( exists($args{exit_on_error}) && !$args{exit_on_error} ) { return @out; } 134 135 my $exit_status = $?; 136 my $status = exists($args{require_status}) ? $args{require_status} : 0; 137 if ( $status ne $exit_status ) 138 { 139 my $msg; 140 if ( $? & 0xff ) 141 { 142 $msg = "The command died with signal ".($? & 0xff); 143 } 144 else 145 { 146 $msg = "The command exited with status ".($? >> 8)." (expected $status)"; 147 } 148 $msg .= ":\n\t$cmd\n\n"; 149 if ( @out ) { $msg .= join('',@out,"\n\n"); } 150 error($msg); 151 } 152 return @out; 153} 154 155# determine the common prefix/suffix in file names like: 156# genetic_map_chr12_combined_b37.txt 157# genetic_map_chr13_combined_b37.txt 158sub parse_genmap_path 159{ 160 my ($opts) = @_; 161 if ( !exists($$opts{genmap}) or !-d $$opts{genmap} ) { return ''; } 162 my @files = glob("$$opts{genmap}/*"); 163 my $prefix = $files[0]; 164 my $suffix = $files[0]; 165 for my $file (@files) 166 { 167 while ( length($prefix) && index($file,$prefix)==-1 ) 168 { 169 substr($prefix,length($prefix)-1,1,''); 170 } 171 if ( !length($prefix) ) { last; } 172 } 173 for my $file (@files) 174 { 175 while ( length($suffix) && rindex($file,$suffix)==-1 ) 176 { 177 substr($suffix,0,1,''); 178 } 179 if ( !length($suffix) ) { last; } 180 } 181 my @test = glob("$prefix*$suffix"); 182 if ( @test != @files ) 183 { 184 error( 185 "Error: Could not determine the genetic map files in \"$$opts{genmap}\". The directory must contain only\n" . 186 " the genetic map files (and nothing else) and the expected file names are\n" . 187 " <prefix><chromosome-name><suffix>\n" . 188 " for example:\n" . 189 " genetic_map_chr10_combined_b37.txt\n" . 190 " genetic_map_chr11_combined_b37.txt\n" . 191 " ...\n" . 192 " The heuristically determined wildcard \"$prefix*$suffix\" matches ",scalar @test," files, in total there are ",scalar @files," files.\n"); 193 } 194 return "-m $prefix\{CHROM}$suffix"; 195} 196 197sub run_roh 198{ 199 my ($opts) = @_; 200 cmd("mkdir -p $$opts{outdir}",%$opts) unless -d $$opts{outdir}; 201 202 my $chr_fname = "$$opts{outdir}/chr-names.txt"; 203 open(my $fh,'>',$chr_fname) or error("$chr_fname: $!"); 204 for my $chr (1..22,'X') { print $fh "chr$chr\t$chr\n"; } 205 close($fh) or error("close failed: $chr_fname"); 206 207 my @files = (); 208 opendir(my $dh,$$opts{indir}) or error("$$opts{indir}: $!"); 209 while (my $file = readdir($dh)) 210 { 211 if ( !($file=~/\.vcf$/i) && !($file=~/\.vcf\.gz$/i) && !($file=~/\.bcf$/i) ) { next; } 212 my $outfile = "$$opts{outdir}/$`.bcf"; 213 push @files,$outfile; 214 if ( -e $outfile ) { next; } 215 216 my $cmd = 217 "bcftools annotate --rename-chrs $chr_fname '$$opts{indir}/$file' -Ou | " . 218 "bcftools annotate -c CHROM,POS,REF,ALT,AF1KG -h $$opts{af_annots}.hdr -a $$opts{af_annots} "; 219 220 if ( exists($$opts{inc_exc}) ) 221 { 222 $cmd .= " -Ou | bcftools view $$opts{inc_exc} "; 223 } 224 $cmd .= "-Ob -o $outfile.part && "; 225 $cmd .= "mv $outfile.part $outfile"; 226 227 cmd($cmd, %$opts); 228 } 229 closedir($dh) or error("close failed: $$opts{indir}"); 230 231 my $genmap = parse_genmap_path($opts); 232 if ( exists($$opts{rec_rate}) ) { $genmap .= " -M $$opts{rec_rate}"; } 233 234 for my $file (@files) 235 { 236 if ( -e "$file.txt.gz" ) { next; } 237 my @out = cmd("bcftools roh $$opts{roh_args} --AF-tag AF1KG $genmap $file -Orz -o $file.txt.gz.part 2>&1 | tee -a $file.log",%$opts); 238 for my $line (@out) 239 { 240 if ( !($line=~m{total/processed:\s+(\d+)/(\d+)}) ) { next; } 241 my $total = $1; 242 my $used = $2; 243 if ( !$total or $used/$total < 0.3 ) 244 { 245 print STDERR @out; 246 print STDERR "WARNING: Less than 30% of sites was used!\n\n"; 247 } 248 } 249 cmd(qq[bcftools query -f'GT\\t%CHROM\\t%POS[\\t%SAMPLE\\t%GT]\\n' $file | gzip -c >> $file.txt.gz.part && mv $file.txt.gz.part $file.txt.gz],%$opts); 250 } 251 $$opts{files} = \@files; 252} 253 254sub next_region 255{ 256 my ($regions) = @_; 257 my $chr = undef; 258 for my $chrom (sort keys %$regions) 259 { 260 if ( %{$$regions{$chrom}} ) { $chr = $chrom; last; } 261 delete($$regions{$chrom}); 262 } 263 if ( !defined $chr ) { return undef; } 264 265 my %min = (); 266 for my $smpl (keys %{$$regions{$chr}}) 267 { 268 if ( !exists($$regions{$chr}{$smpl}) ) { next; } 269 my $reg = $$regions{$chr}{$smpl}[0]; 270 if ( !%min ) 271 { 272 $min{chr} = $chr; 273 $min{beg} = $$reg{beg}; 274 $min{end} = $$reg{end}; 275 next; 276 } 277 if ( $min{beg} > $$reg{beg} ) { $min{beg} = $$reg{beg}; } 278 } 279 if ( !%min ) { return undef; } 280 for my $smpl (keys %{$$regions{$chr}}) 281 { 282 if ( !exists($$regions{$chr}{$smpl}) ) { next; } 283 my $reg = $$regions{$chr}{$smpl}[0]; 284 if ( $min{end} > $$reg{end} ) { $min{end} = $$reg{end}; } 285 if ( $min{end} > $$reg{beg} - 1 && $min{beg} != $$reg{beg} ) { $min{end} = $$reg{beg} - 1; } 286 } 287 return \%min; 288} 289 290sub eval_roh 291{ 292 my ($opts) = @_; 293 my %regions = (); 294 my %samples = (); 295 my %lengths = (); 296 for my $file (@{$$opts{files}}) 297 { 298 open(my $fh,"gunzip -c $file.txt.gz |") or error("gunzip -c $file.txt.gz: $!"); 299 while (my $line=<$fh>) 300 { 301 if ( !($line=~/^RG/) ) { next; } 302 my (%vals); 303 @vals{qw(type smpl chr beg end len num qual)} = split(/\s+/,$line); 304 if ( $vals{type} ne 'RG' ) { next; } 305 chomp($vals{qual}); 306 if ( $vals{len} < $$opts{min_length} ) { next; } 307 if ( $vals{num} < $$opts{min_markers} ) { next; } 308 if ( $vals{qual} < $$opts{min_qual} ) { next; } 309 push @{$regions{$vals{chr}}{$vals{smpl}}},\%vals; 310 $samples{$vals{smpl}} = 1; 311 $lengths{$vals{smpl}} += $vals{end} - $vals{beg} + 1; 312 } 313 close($fh) or error("close failed: gunzip -c $file.txt.gz"); 314 } 315 open(my $fh,'>',"$$opts{outdir}/merged.txt") or error("$$opts{outdir}/merged.txt: $!"); 316 my @samples = sort keys %samples; 317 print $fh "# [1]chrom\t[2]beg\t[3]end\t[4]length (Mb)"; 318 my $i = 5; 319 for my $smpl (@samples) { print $fh "\t[$i]$smpl"; $i++; } 320 print $fh "\n"; 321 while (my $min = next_region(\%regions)) 322 { 323 my $chr = $$min{chr}; 324 my $beg = $$min{beg}; 325 my $end = $$min{end}; 326 printf $fh "$chr\t$beg\t$end\t%.2f",($end-$beg+1)/1e6; 327 for my $smpl (@samples) 328 { 329 if ( !exists($regions{$chr}{$smpl}) ) { goto not_present; } 330 my $reg = $regions{$chr}{$smpl}[0]; 331 if ( $$reg{beg} > $end ) { goto not_present; } 332 if ( $$reg{end} > $end ) { $regions{$chr}{$smpl}[0]{beg} = $end + 1; } 333 else { shift @{$regions{$chr}{$smpl}}; } 334 if ( !@{$regions{$chr}{$smpl}} ) { delete($regions{$chr}{$smpl}); } 335 $lengths{$smpl} -= $end - $beg + 1; 336 print $fh "\t1"; 337 next; 338not_present: 339 print $fh "\t0"; 340 } 341 print $fh "\n"; 342 } 343 close($fh) or error("close failed: $$opts{outdir}/merged.txt"); 344 for my $smpl (@samples) 345 { 346 if ( $lengths{$smpl}!=0 ) 347 { 348 print STDERR "ERROR: a bug detected, sanity check failed, expected zero length : $smpl .. $lengths{$smpl}\n"; 349 } 350 } 351 print STDERR "The merged regions are in $$opts{outdir}/merged.txt\n"; 352} 353 354 355