1#!/usr/local/bin/perl
2#
3#  The MIT License
4#
5#  Copyright (c) 2017-2018, 2020 Genome Research Ltd.
6#
7#  Author: Petr Danecek <pd3@sanger.ac.uk>
8#
9#  Permission is hereby granted, free of charge, to any person obtaining a copy
10#  of this software and associated documentation files (the "Software"), to deal
11#  in the Software without restriction, including without limitation the rights
12#  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13#  copies of the Software, and to permit persons to whom the Software is
14#  furnished to do so, subject to the following conditions:
15#
16#  The above copyright notice and this permission notice shall be included in
17#  all copies or substantial portions of the Software.
18#
19#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20#  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21#  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22#  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23#  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24#  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25#  THE SOFTWARE.
26
27use strict;
28use warnings;
29use Carp;
30use Data::Dumper;
31
32my $opts = parse_params();
33run_roh($opts);
34eval_roh($opts);
35
36exit;
37
38#--------------------------------
39
40sub error
41{
42    my (@msg) = @_;
43    if ( scalar @msg ) { confess @msg,"\n"; }
44    print
45        "About: This is a convenience wrapper for \"bcftools roh\" which takes multiple VCF/BCF files\n",
46        "       and locates regions private to a sample or shared across multiple samples. On input it\n",
47        "       expects a directory with .vcf, .vcf.gz or .bcf files, a file with allele frequencies\n",
48        "       and optionally a genetic map. See http://samtools.github.io/bcftools/howtos/roh-calling.html\n",
49        "       for details\n",
50        "Usage: run-roh.pl [OPTIONS]\n",
51        "Options:\n",
52        "   -a, --af-annots <file>      Allele frequency annotations [1000GP-AFs/AFs.tab.gz]\n",
53        "   -i, --indir <dir>           Input directory with VCF files\n",
54        "       --include <expr>        Select sites for which the expression is true\n",
55        "       --exclude <expr>        Exclude sites for which the epxression is true\n",
56        "   -l, --min-length <num>      Filter input regions shorter than this [1e6]\n",
57        "   -m, --genmap <dir>          Directory with genetic map in IMPUTE2 format (optional)\n",
58        "   -M, --rec-rate <float>      constant recombination rate per bp (optional)\n",
59        "   -n, --min-markers <num>     Filter input regions with fewer marker than this [100]\n",
60        "   -o, --outdir <dir>          Output directory\n",
61        "   -q, --min-qual <num>        Filter input regions with quality smaller than this [10]\n",
62        "       --roh-args <string>     Extra arguments to pass to bcftools roh\n",
63        "   -s, --silent                Quiet output, do not print commands\n",
64        "   -h, -?, --help              This help message\n",
65        "\n";
66    exit -1;
67}
68sub parse_params
69{
70    my $opts =
71    {
72        af_annots   => '1000GP-AFs/AFs.tab.gz',
73        verbose     => 1,
74        min_length  => 1e6,
75        min_markers => 100,
76        min_qual    => 10,
77        roh_args    => '',
78    };
79    while (defined(my $arg=shift(@ARGV)))
80    {
81        if (                 $arg eq '--roh-args' ) { $$opts{roh_args}=shift(@ARGV); next }
82        if (                 $arg eq '--include' ) { $$opts{include_expr}=shift(@ARGV); next }
83        if (                 $arg eq '--exclude' ) { $$opts{exclude_expr}=shift(@ARGV); next }
84        if ( $arg eq '-q' || $arg eq '--min-qual' ) { $$opts{min_qual}=shift(@ARGV); next }
85        if ( $arg eq '-l' || $arg eq '--min-length' ) { $$opts{min_length}=shift(@ARGV); next }
86        if ( $arg eq '-n' || $arg eq '--min-markers' ) { $$opts{min_markers}=shift(@ARGV); next }
87        if ( $arg eq '-s' || $arg eq '--silent' ) { $$opts{verbose}=0; next }
88        if ( $arg eq '-a' || $arg eq '--af-annots' ) { $$opts{af_annots}=shift(@ARGV); next }
89        if ( $arg eq '-m' || $arg eq '--genmap' ) { $$opts{genmap}=shift(@ARGV); next }
90        if ( $arg eq '-M' || $arg eq '--rec-rate' ) { $$opts{rec_rate}=shift(@ARGV); next }
91        if ( $arg eq '-o' || $arg eq '--outdir' ) { $$opts{outdir}=shift(@ARGV); next }
92        if ( $arg eq '-i' || $arg eq '--indir' ) { $$opts{indir}=shift(@ARGV); next }
93        if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); }
94        error("Unknown parameter \"$arg\". Run -h for help.\n");
95    }
96    if ( !exists($$opts{outdir}) ) { error("Missing the -o, --outdir option.\n") }
97    if ( !exists($$opts{indir}) ) { error("Missing the -i, --indir option.\n") }
98    if ( ! -e $$opts{af_annots} ) { error("The annotation file does not exist: $$opts{af_annots}\n"); }
99    if ( ! -e "$$opts{af_annots}.tbi" ) { error("The annotation file is not indexed: $$opts{af_annots}.tbi\n"); }
100    if ( ! -e "$$opts{af_annots}.hdr" ) { error("The annotation file has no header: $$opts{af_annots}.hdr\n"); }
101    if ( exists($$opts{genmap}) && ! -d "$$opts{genmap}" ) { error("The directory with genetic maps does not exist: $$opts{genmap}\n"); }
102    if ( exists($$opts{include_expr}) ) { $$opts{include_expr} =~ s/\'/\'\\\'\'/g; $$opts{inc_exc} .= qq[ -i '$$opts{include_expr}']; }
103    if ( exists($$opts{exclude_expr}) ) { $$opts{exclude_expr} =~ s/\'/\'\\\'\'/g; $$opts{inc_exc} .= qq[ -e '$$opts{exclude_expr}']; }
104    return $opts;
105}
106
107sub cmd
108{
109    my ($cmd,%args) = @_;
110
111    if ( $args{verbose} ) { print STDERR $cmd,"\n"; }
112
113    # Why not to use backticks? Perl calls /bin/sh, which is often bash. To get the correct
114    #   status of failing pipes, it must be called with the pipefail option.
115
116    my $kid_io;
117    my $pid = open($kid_io, "-|");
118    if ( !defined $pid ) { error("Cannot fork: $!"); }
119
120    my @out;
121    if ($pid)
122    {
123        # parent
124        @out = <$kid_io>;
125        close($kid_io);
126    }
127    else
128    {
129        # child
130        exec('bash', '-o','pipefail','-c', $cmd) or error("Failed to run the command [bash -o pipefail -c $cmd]: $!");
131    }
132
133    if ( exists($args{exit_on_error}) && !$args{exit_on_error} ) { return @out; }
134
135    my $exit_status = $?;
136    my $status = exists($args{require_status}) ? $args{require_status} : 0;
137    if ( $status ne $exit_status )
138    {
139        my $msg;
140        if ( $? & 0xff )
141        {
142            $msg = "The command died with signal ".($? & 0xff);
143        }
144        else
145        {
146            $msg = "The command exited with status ".($? >> 8)." (expected $status)";
147        }
148        $msg .= ":\n\t$cmd\n\n";
149        if ( @out ) {  $msg .= join('',@out,"\n\n"); }
150        error($msg);
151    }
152    return @out;
153}
154
155# determine the common prefix/suffix in file names like:
156#   genetic_map_chr12_combined_b37.txt
157#   genetic_map_chr13_combined_b37.txt
158sub parse_genmap_path
159{
160    my ($opts) = @_;
161    if ( !exists($$opts{genmap}) or !-d $$opts{genmap} ) { return ''; }
162    my @files  = glob("$$opts{genmap}/*");
163    my $prefix = $files[0];
164    my $suffix = $files[0];
165    for my $file (@files)
166    {
167        while ( length($prefix) && index($file,$prefix)==-1 )
168        {
169            substr($prefix,length($prefix)-1,1,'');
170        }
171        if ( !length($prefix) ) { last; }
172    }
173    for my $file (@files)
174    {
175        while ( length($suffix) && rindex($file,$suffix)==-1 )
176        {
177            substr($suffix,0,1,'');
178        }
179        if ( !length($suffix) ) { last; }
180    }
181    my @test = glob("$prefix*$suffix");
182    if ( @test != @files )
183    {
184        error(
185            "Error: Could not determine the genetic map files in \"$$opts{genmap}\". The directory must contain only\n" .
186            "       the genetic map files (and nothing else) and the expected file names are\n" .
187            "           <prefix><chromosome-name><suffix>\n" .
188            "       for example:\n" .
189            "           genetic_map_chr10_combined_b37.txt\n" .
190            "           genetic_map_chr11_combined_b37.txt\n" .
191            "           ...\n" .
192            "       The heuristically determined wildcard \"$prefix*$suffix\" matches ",scalar @test," files, in total there are ",scalar @files," files.\n");
193    }
194    return "-m $prefix\{CHROM}$suffix";
195}
196
197sub run_roh
198{
199    my ($opts) = @_;
200    cmd("mkdir -p $$opts{outdir}",%$opts) unless -d $$opts{outdir};
201
202    my $chr_fname = "$$opts{outdir}/chr-names.txt";
203    open(my $fh,'>',$chr_fname) or error("$chr_fname: $!");
204    for my $chr (1..22,'X') { print $fh "chr$chr\t$chr\n"; }
205    close($fh) or error("close failed: $chr_fname");
206
207    my @files = ();
208    opendir(my $dh,$$opts{indir}) or error("$$opts{indir}: $!");
209    while (my $file = readdir($dh))
210    {
211        if ( !($file=~/\.vcf$/i) && !($file=~/\.vcf\.gz$/i) && !($file=~/\.bcf$/i) ) { next; }
212        my $outfile = "$$opts{outdir}/$`.bcf";
213        push @files,$outfile;
214        if ( -e $outfile ) { next; }
215
216        my $cmd =
217            "bcftools annotate --rename-chrs $chr_fname '$$opts{indir}/$file' -Ou | " .
218            "bcftools annotate -c CHROM,POS,REF,ALT,AF1KG -h $$opts{af_annots}.hdr -a $$opts{af_annots} ";
219
220        if ( exists($$opts{inc_exc}) )
221        {
222            $cmd .= " -Ou | bcftools view $$opts{inc_exc} ";
223        }
224        $cmd .= "-Ob -o $outfile.part && ";
225        $cmd .= "mv $outfile.part $outfile";
226
227        cmd($cmd, %$opts);
228    }
229    closedir($dh) or error("close failed: $$opts{indir}");
230
231    my $genmap = parse_genmap_path($opts);
232    if ( exists($$opts{rec_rate}) ) { $genmap .= " -M $$opts{rec_rate}"; }
233
234    for my $file (@files)
235    {
236        if ( -e "$file.txt.gz" ) { next; }
237        my @out = cmd("bcftools roh $$opts{roh_args} --AF-tag AF1KG $genmap $file -Orz -o $file.txt.gz.part 2>&1 | tee -a $file.log",%$opts);
238        for my $line (@out)
239        {
240            if ( !($line=~m{total/processed:\s+(\d+)/(\d+)}) ) { next; }
241            my $total = $1;
242            my $used  = $2;
243            if ( !$total or $used/$total < 0.3 )
244            {
245                print STDERR @out;
246                print STDERR "WARNING: Less than 30% of sites was used!\n\n";
247            }
248        }
249        cmd(qq[bcftools query -f'GT\\t%CHROM\\t%POS[\\t%SAMPLE\\t%GT]\\n' $file | gzip -c >> $file.txt.gz.part && mv $file.txt.gz.part $file.txt.gz],%$opts);
250    }
251    $$opts{files} = \@files;
252}
253
254sub next_region
255{
256    my ($regions) = @_;
257    my $chr = undef;
258    for my $chrom (sort keys %$regions)
259    {
260        if ( %{$$regions{$chrom}} ) { $chr = $chrom; last; }
261        delete($$regions{$chrom});
262    }
263    if ( !defined $chr ) { return undef; }
264
265    my %min = ();
266    for my $smpl (keys %{$$regions{$chr}})
267    {
268        if ( !exists($$regions{$chr}{$smpl}) ) { next; }
269        my $reg = $$regions{$chr}{$smpl}[0];
270        if ( !%min )
271        {
272            $min{chr} = $chr;
273            $min{beg} = $$reg{beg};
274            $min{end} = $$reg{end};
275            next;
276        }
277        if ( $min{beg} > $$reg{beg} ) { $min{beg} = $$reg{beg}; }
278    }
279    if ( !%min ) { return undef; }
280    for my $smpl (keys %{$$regions{$chr}})
281    {
282        if ( !exists($$regions{$chr}{$smpl}) ) { next; }
283        my $reg = $$regions{$chr}{$smpl}[0];
284        if ( $min{end} > $$reg{end} ) { $min{end} = $$reg{end}; }
285        if ( $min{end} > $$reg{beg} - 1 && $min{beg} != $$reg{beg} ) { $min{end} = $$reg{beg} - 1; }
286    }
287    return \%min;
288}
289
290sub eval_roh
291{
292    my ($opts) = @_;
293    my %regions = ();
294    my %samples = ();
295    my %lengths = ();
296    for my $file (@{$$opts{files}})
297    {
298        open(my $fh,"gunzip -c $file.txt.gz |") or error("gunzip -c $file.txt.gz: $!");
299        while (my $line=<$fh>)
300        {
301            if ( !($line=~/^RG/) ) { next; }
302            my (%vals);
303            @vals{qw(type smpl chr beg end len num qual)} = split(/\s+/,$line);
304            if ( $vals{type} ne 'RG' ) { next; }
305            chomp($vals{qual});
306            if ( $vals{len} < $$opts{min_length} ) { next; }
307            if ( $vals{num} < $$opts{min_markers} ) { next; }
308            if ( $vals{qual} < $$opts{min_qual} ) { next; }
309            push @{$regions{$vals{chr}}{$vals{smpl}}},\%vals;
310            $samples{$vals{smpl}} = 1;
311            $lengths{$vals{smpl}} += $vals{end} - $vals{beg} + 1;
312        }
313        close($fh) or error("close failed: gunzip -c $file.txt.gz");
314    }
315    open(my $fh,'>',"$$opts{outdir}/merged.txt") or error("$$opts{outdir}/merged.txt: $!");
316    my @samples = sort keys %samples;
317    print $fh "# [1]chrom\t[2]beg\t[3]end\t[4]length (Mb)";
318    my $i = 5;
319    for my $smpl (@samples) { print $fh "\t[$i]$smpl"; $i++; }
320    print $fh "\n";
321    while (my $min = next_region(\%regions))
322    {
323        my $chr = $$min{chr};
324        my $beg = $$min{beg};
325        my $end = $$min{end};
326        printf $fh "$chr\t$beg\t$end\t%.2f",($end-$beg+1)/1e6;
327        for my $smpl (@samples)
328        {
329            if ( !exists($regions{$chr}{$smpl}) ) { goto not_present; }
330            my $reg = $regions{$chr}{$smpl}[0];
331            if ( $$reg{beg} > $end ) { goto not_present; }
332            if ( $$reg{end} > $end ) { $regions{$chr}{$smpl}[0]{beg} = $end + 1; }
333            else { shift @{$regions{$chr}{$smpl}}; }
334            if ( !@{$regions{$chr}{$smpl}} ) { delete($regions{$chr}{$smpl}); }
335            $lengths{$smpl} -= $end - $beg + 1;
336            print $fh "\t1";
337            next;
338not_present:
339            print $fh "\t0";
340        }
341        print $fh "\n";
342    }
343    close($fh) or error("close failed: $$opts{outdir}/merged.txt");
344    for my $smpl (@samples)
345    {
346        if ( $lengths{$smpl}!=0 )
347        {
348            print STDERR "ERROR: a bug detected, sanity check failed, expected zero length : $smpl .. $lengths{$smpl}\n";
349        }
350    }
351    print STDERR "The merged regions are in $$opts{outdir}/merged.txt\n";
352}
353
354
355