1#!/usr/bin/env perl
2#
3# Author: petr.danecek@sanger
4#
5# Test bcf synced reader's allele pairing
6#
7
8use strict;
9use warnings;
10use Carp;
11use Data::Dumper;
12use List::Util 'shuffle';
13use File::Temp qw/ tempfile tempdir /;
14use FindBin;
15use lib "$FindBin::Bin";
16
17my $opts = parse_params();
18run_test($opts);
19
20exit;
21
22#--------------------------------
23
24sub error
25{
26    my (@msg) = @_;
27    if ( scalar @msg ) { confess @msg; }
28    print
29        "Usage: test-bcf-sr.pl [OPTIONS]\n",
30        "Options:\n",
31        "   -s, --seed <int>        Random seed\n",
32        "   -t, --temp-dir <dir>    When given, temporary files will not be removed\n",
33        "   -v, --verbose           \n",
34        "   -h, -?, --help          This help message\n",
35        "\n";
36    exit -1;
37}
38sub parse_params
39{
40    my $opts = {};
41    while (defined(my $arg=shift(@ARGV)))
42    {
43        if ( $arg eq '-t' || $arg eq '--temp-dir' ) { $$opts{keep_files}=shift(@ARGV); next }
44        if ( $arg eq '-v' || $arg eq '--verbose' ) { $$opts{verbose}=1; next }
45        if ( $arg eq '-s' || $arg eq '--seed' ) { $$opts{seed}=shift(@ARGV); next }
46        if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); }
47        error("Unknown parameter \"$arg\". Run -h for help.\n");
48    }
49    $$opts{tmp} = exists($$opts{keep_files}) ? $$opts{keep_files} : tempdir(CLEANUP=>1);
50    if ( $$opts{keep_files} ) { cmd("mkdir -p $$opts{keep_files}"); }
51    if ( !exists($$opts{seed}) )
52    {
53        $$opts{seed} = time();
54        print STDERR "Random seed is $$opts{seed}\n";
55    }
56    srand($$opts{seed});
57    return $opts;
58}
59
60sub _cmd
61{
62    my ($cmd) = @_;
63    my $kid_io;
64    my @out;
65    my $pid = open($kid_io, "-|");
66    if ( !defined $pid ) { error("Cannot fork: $!"); }
67    if ($pid)
68    {
69        # parent
70        @out = <$kid_io>;
71        close($kid_io);
72    }
73    else
74    {
75        # child
76        exec('bash', '-o','pipefail','-c', $cmd) or error("Cannot execute the command [/bin/sh -o pipefail -c $cmd]: $!");
77    }
78    return ($? >> 8, join('',@out));
79}
80sub cmd
81{
82    my ($cmd) = @_;
83    my ($ret,$out) = _cmd($cmd);
84    if ( $ret ) { error("The command failed [$ret]: $cmd\n", $out); }
85    return $out;
86}
87
88sub save_vcf
89{
90    my ($opts,$vars,$fname) = @_;
91    open(my $fh,"| $FindBin::Bin/../bgzip -c > $fname") or error("$FindBin::Bin/../bgzip -c > $fname: !");
92    print $fh qq[##fileformat=VCFv4.3\n];
93    print $fh qq[##FILTER=<ID=PASS,Description="All filters passed">\n];
94    print $fh qq[##contig=<ID=1>\n];
95    print $fh qq[##contig=<ID=2>\n];
96    print $fh '#'. join("\t", qw(CHROM POS ID  REF ALT QUAL    FILTER  INFO))."\n";
97    for my $var (@$vars)
98    {
99        my @als = split(/,/,$var);
100        my @alts = ();
101        my $ref;
102        for my $al (@als)
103        {
104            my ($xref,$alt) = split(/>/,$al);
105            $ref = $xref;
106            push @alts,$alt;
107        }
108        print $fh join("\t", (1,100,'.',$ref,join(',',@alts),'.','.','.'))."\n";
109    }
110    for my $var (@$vars)
111    {
112        my @als = split(/,/,$var);
113        my @alts = ();
114        my $ref;
115        for my $al (@als)
116        {
117            my ($xref,$alt) = split(/>/,$al);
118            $ref = $xref;
119            push @alts,$alt;
120        }
121        print $fh join("\t", (1,300,'.',$ref,join(',',@alts),'.','.','.'))."\n";
122    }
123    for my $var (@$vars)
124    {
125        my @als = split(/,/,$var);
126        my @alts = ();
127        my $ref;
128        for my $al (@als)
129        {
130            my ($xref,$alt) = split(/>/,$al);
131            $ref = $xref;
132            push @alts,$alt;
133        }
134        print $fh join("\t", (2,100,'.',$ref,join(',',@alts),'.','.','.'))."\n";
135    }
136    close($fh) or error("close failed: bgzip -c > $fname");
137    cmd("$FindBin::Bin/../tabix -f $fname");
138}
139
140sub random_alt
141{
142    my ($ref,$is_snp) = @_;
143    my @acgt = qw(A C G T);
144    my $alt = $acgt[rand @acgt];
145    if ( $ref eq $alt ) { return '.'; }   # ref
146    if ( !$is_snp ) { $alt = $ref.$alt; }
147    return $alt;
148}
149
150sub check_outputs
151{
152    my ($fname_bin,$fname_perl) = @_;
153    my %out = ();
154    open(my $fh,'<',$fname_bin) or error("$fname_bin: $!");
155    while (my $line=<$fh>)
156    {
157        my ($pos,@vals) = split(/\t/,$line);
158        chomp($vals[-1]);
159        push @{$out{$pos}},join("\t",@vals);
160    }
161    close($fh) or error("close failed: $fname_bin");
162    if ( keys %out != 3 ) { error("Expected 3 positions, found ",scalar keys %out,": $fname_bin\n"); }
163    my $n;
164    for my $pos (keys %out)
165    {
166        if ( !defined $n ) { $n = scalar @{$out{$pos}}; }
167        if ( @{$out{$pos}} != $n ) { error("Expected $n positions, found ",scalar keys %{$out{$pos}},"\n"); }
168    }
169    my @blines = @{$out{(keys %out)[0]}};
170
171    my @plines = ();
172    open($fh,'<',$fname_perl) or error("$fname_perl: $!");
173    while (my $line=<$fh>)
174    {
175        chomp($line);
176        push @plines,$line;
177    }
178    close($fh) or error("close failed: $fname_perl");
179    if ( @blines != @plines ) { error("Different number of lines: ",scalar @blines," vs ",scalar @plines," in $fname_bin vs $fname_perl\n"); }
180    @blines = sort @blines;
181    @plines = sort @plines;
182    for (my $i=0; $i<@plines; $i++)
183    {
184        if ( $blines[$i] ne $plines[$i] )
185        {
186            #error("Different lines in $fname_bin vs $fname_perl:\n\t$blines[$i].\nvs\n\t$plines[$i].\n");
187            error("Different lines in $fname_bin vs $fname_perl:\n\t".join("\n\t",@blines)."\nvs\n\t".join("\n\t",@plines)."\n");
188        }
189    }
190}
191
192sub run_test
193{
194    my ($opts) = @_;
195    my @acgt = qw(A C G T);
196    my $ref  = $acgt[rand @acgt];
197    my @vcfs = ();
198    my $nvcf = 1 + int(rand(10));
199    for (my $i=0; $i<$nvcf; $i++)
200    {
201        my %vars  = ();
202        my $nvars = 1 + int(rand(6));
203        for (my $j=0; $j<$nvars; $j++)
204        {
205            my $snp = int(rand(2));
206            my $alt = random_alt($ref,$snp);
207            my $var = "$ref>$alt";
208            if ( $alt ne '.' && !int(rand(5)) )    # create multiallelic site
209            {
210                my $alt2 = random_alt($ref,$snp);
211                if ( $alt2 ne '.' && $alt ne $alt2 )
212                {
213                    $var .= ",$ref>$alt2";
214                }
215            }
216            $vars{$var} = 1;
217        }
218        my $ndup = 1 + int(rand(4));
219        for (my $j=0; $j<$ndup; $j++)
220        {
221            my @keys = shuffle keys %vars;
222            push @vcfs, \@keys;
223        }
224    }
225    @vcfs = shuffle @vcfs;
226    open(my $fh,'>',"$$opts{tmp}/list.txt") or error("$$opts{tmp}/list.txt: $!");
227    my %groups = ();
228    my @group_list = ();
229    for (my $i=0; $i<@vcfs; $i++)
230    {
231        my $vcf = $vcfs[$i];
232        my $key = join(';',sort @$vcf);
233        if ( !exists($groups{$key}) )
234        {
235            push @group_list,$key;
236            $groups{$key}{vars} = [@$vcf];
237            $groups{$key}{key} = $key;
238        }
239        push @{$groups{$key}{vcfs}},$i;
240        save_vcf($opts,$vcf,"$$opts{tmp}/$i.vcf.gz");
241        print $fh "$$opts{tmp}/$i.vcf.gz\n";
242    }
243    close($fh);
244
245    my @groups = ();
246    for my $group (@group_list) { push @groups, $groups{$group}; }
247    for my $logic (qw(snps indels both snps+ref indels+ref both+ref exact some all))
248    #for my $logic (qw(snps))
249    {
250        print STDERR "$FindBin::Bin/test-bcf-sr $$opts{tmp}/list.txt -p $logic > $$opts{tmp}/rmme.bin.out\n" unless !$$opts{verbose};
251        cmd("$FindBin::Bin/test-bcf-sr $$opts{tmp}/list.txt -p $logic > $$opts{tmp}/rmme.bin.out");
252
253        open(my $fh,'>',"$$opts{tmp}/rmme.perl.out") or error("$$opts{tmp}/rmme.perl.out: $!");
254        $$opts{fh} = $fh;
255        $$opts{logic} = $logic;
256        pair_lines($opts,\@groups);
257        close($fh) or error("close failed: $$opts{tmp}/rmme.perl.out");
258
259        check_outputs("$$opts{tmp}/rmme.bin.out","$$opts{tmp}/rmme.perl.out");
260    }
261}
262
263sub pair_lines
264{
265    my ($opts,$groups) = @_;
266
267    #print 'groups: '.Dumper($groups);
268
269    # get a list of all unique variants and their groups
270    my %vars = ();
271    my @var_list = ();
272    for (my $igrp=0; $igrp<@$groups; $igrp++)
273    {
274        my $grp = $$groups[$igrp];
275        for (my $ivar=0; $ivar<@{$$grp{vars}}; $ivar++)
276        {
277            my $var = $$grp{vars}[$ivar];
278            if ( !exists($vars{$var}) ) { push @var_list,$var; }  # just to keep the order
279            push @{$vars{$var}}, { igrp=>$igrp, ivar=>$ivar, cnt=>scalar @{$$grp{vcfs}} };
280        }
281    }
282
283    # each variant has a list of groups that it is present in
284    my @vars = ();
285    for my $var (@var_list) { push @vars, $vars{$var}; }
286
287    #print STDERR 'unique variants: '.Dumper(\@var_list);
288    # for (my $i=0; $i<@vars; $i++)
289    # {
290    #     my $igrp = $vars[$i][0]{igrp};
291    #     my $jvar = $vars[$i][0]{ivar};
292    #     my $var  = $$groups[$igrp]{vars}[$jvar];
293    #     print STDERR "$i: $var\n";
294    # }
295
296    # initialize variant sets - combinations of compatible variants across multiple reader groups
297    my @var_sets = ();
298    for (my $i=0; $i<@vars; $i++) { push @var_sets,[$i]; }
299
300    my @bitmask = ();
301    my @pmatrix = ();
302    for (my $iset=0; $iset<@var_sets; $iset++)
303    {
304        $pmatrix[$iset] = [(0) x (scalar @$groups)];
305        $bitmask[$iset] = 0;
306    }
307    my @max;
308    for (my $iset=0; $iset<@var_sets; $iset++)
309    {
310        my $tmp_max = 0;
311        for my $ivar (@{$var_sets[$iset]})
312        {
313            my $var = $vars[$ivar];
314            for my $grp (@$var)
315            {
316                my $igrp = $$grp{igrp};
317                $pmatrix[$iset][$igrp] += $$grp{cnt};
318                if ( $bitmask[$iset] & (1<<$igrp) ) { error("Uh!"); }
319                $bitmask[$iset] |= 1<<$igrp;
320                $tmp_max += $$grp{cnt};
321            }
322        }
323        push @max, $tmp_max;
324    }
325
326    # pair the lines
327    while ( @var_sets )
328    {
329        my $imax = 0;
330        for (my $iset=1; $iset<@var_sets; $iset++)
331        {
332            if ( $max[$iset] > $max[$imax] ) { $imax = $iset; }
333        }
334        # if ( @var_sets == @vars ) { dump_pmatrix($groups,\@vars,\@var_sets,\@pmatrix,\@bitmask); }
335
336        my $ipair = undef;
337        my $max_score = 0;
338        for (my $iset=0; $iset<@var_sets; $iset++)
339        {
340            if ( $bitmask[$imax] & $bitmask[$iset] ) { next; }     # cannot merge
341            my $score = pairing_score($opts,$groups,\@vars,$var_sets[$imax],$var_sets[$iset]);
342            if ( $max_score < $score ) { $max_score = $score; $ipair = $iset; }
343        }
344
345        # merge rows thus creating a new variant set
346        if ( defined $ipair && $ipair != $imax )
347        {
348            $imax = merge_rows($groups,\@vars,\@var_sets,\@pmatrix,\@bitmask,\@max,$imax,$ipair);
349            next;
350        }
351
352        output_row($opts,$groups,\@vars,\@var_sets,\@pmatrix,\@bitmask,\@max,$imax);
353        # dump_pmatrix($groups,\@vars,\@var_sets,\@pmatrix,\@bitmask);
354    }
355}
356
357sub merge_rows
358{
359    my ($grps,$vars,$var_sets,$pmat,$bitmask,$max,$ivset,$jvset) = @_;
360    if ( $ivset > $jvset ) { my $tmp = $ivset; $ivset = $jvset; $jvset = $tmp; }
361    push @{$$var_sets[$ivset]}, @{$$var_sets[$jvset]};
362    for (my $igrp=0; $igrp<@{$$pmat[$ivset]}; $igrp++)
363    {
364        $$pmat[$ivset][$igrp] += $$pmat[$jvset][$igrp];
365    }
366    $$max[$ivset] += $$max[$jvset];
367    $$bitmask[$ivset] |= $$bitmask[$jvset];
368    splice(@$var_sets,$jvset,1);
369    splice(@$pmat,$jvset,1);
370    splice(@$bitmask,$jvset,1);
371    splice(@$max,$jvset,1);
372    return $ivset;
373}
374
375sub output_row
376{
377    my ($opts,$grps,$vars,$var_sets,$pmat,$bitmask,$max,$ivset) = @_;
378    my $varset = $$var_sets[$ivset];
379    my @tmp = ();
380    for my $grp (@$grps)
381    {
382        for my $vcf (@{$$grp{vcfs}}) { push @tmp, '-'; }
383    }
384    for my $idx (@$varset)
385    {
386        for my $var (@{$$vars[$idx]})
387        {
388            my $igrp = $$var{igrp};
389            my $jvar = $$var{ivar};
390            my $str  = $$grps[$igrp]{vars}[$jvar];
391            $str =~ s/[^>]>//g;
392            for my $ivcf (@{$$grps[$igrp]{vcfs}}) { $tmp[$ivcf] = $str; }
393        }
394    }
395    print {$$opts{fh}} join("\t",@tmp)."\n";
396    splice(@$var_sets,$ivset,1);
397    splice(@$pmat,$ivset,1);
398    splice(@$bitmask,$ivset,1);
399    splice(@$max,$ivset,1);
400}
401
402sub dump_pmatrix
403{
404    my ($grps,$vars,$var_sets,$pmat,$bitmask) = @_;
405    for (my $ivset=0; $ivset<@$var_sets; $ivset++)
406    {
407        my $varset = $$var_sets[$ivset];
408        my @tmp = ();
409        for my $ivar (@$varset)
410        {
411            my $igrp = $$vars[$ivar][0]{igrp};
412            my $jvar = $$vars[$ivar][0]{ivar};
413            push @tmp, $$grps[$igrp]{vars}[$jvar];
414        }
415        printf STDERR "%-10s",join(',',@tmp);
416        for (my $igrp=0; $igrp<@{$$pmat[0]}; $igrp++)
417        {
418            print STDERR "\t$$pmat[$ivset][$igrp]";
419        }
420        print STDERR "\n";
421    }
422    print STDERR "\n";
423}
424
425sub var_type
426{
427    my ($vars) = @_;
428    my %type = ();
429    for my $var (split(/,/,$vars))
430    {
431        my ($ref,$alt) = split(/>/,$var);
432        if ( $ref eq $alt or $alt eq '.' ) { $type{ref} = 1; }
433        elsif ( length($ref)==length($alt) && length($ref)==1 ) { $type{snp} = 1; }
434        else { $type{indel} = 1; }
435    }
436    return keys %type;
437}
438sub multi_is_subset
439{
440    my ($avar,$bvar) = @_;
441    my %avars = ();
442    my %bvars = ();
443    for my $var (split(/,/,$avar)) { $avars{$var} = 1; }
444    for my $var (split(/,/,$bvar)) { $bvars{$var} = 1; }
445    for my $var (keys %avars)
446    {
447        if ( exists($bvars{$var}) ) { return 1; }
448    }
449    for my $var (keys %bvars)
450    {
451        if ( exists($avars{$var}) ) { return 1; }
452    }
453    return 0;
454}
455sub multi_is_exact
456{
457    my ($avar,$bvar) = @_;
458    my %avars = ();
459    my %bvars = ();
460    for my $var (split(/,/,$avar)) { $avars{$var} = 1; }
461    for my $var (split(/,/,$bvar)) { $bvars{$var} = 1; }
462    for my $var (keys %avars)
463    {
464        if ( !exists($bvars{$var}) ) { return 0; }
465    }
466    for my $var (keys %bvars)
467    {
468        if ( !exists($avars{$var}) ) { return 0; }
469    }
470    return 1;
471}
472sub pairing_score
473{
474    my ($opts,$grps,$vars,$avset,$bvset) = @_;
475
476    my $score = {};
477    if ( $$opts{logic}=~/both/ or $$opts{logic}=~/snps/ or $$opts{logic}=~/all/ )
478    {
479        $$score{snp}{snp} = 3;
480        if ( $$opts{logic}=~/ref/ or $$opts{logic}=~/all/ ) { $$score{snp}{ref} = 2; }
481    }
482    if ( $$opts{logic}=~/both/ or $$opts{logic}=~/indels/ or $$opts{logic}=~/all/ )
483    {
484        $$score{indel}{indel} = 3;
485        if ( $$opts{logic}=~/ref/ or $$opts{logic}=~/all/ ) { $$score{indel}{ref} = 2; }
486    }
487    if ( $$opts{logic}=~/all/ )
488    {
489        $$score{snp}{indel} = 1;
490        $$score{indel}{snp} = 1;
491    }
492    for my $a (keys %$score)
493    {
494        for my $b (keys %{$$score{$a}})
495        {
496            $$score{$b}{$a} = $$score{$a}{$b};
497        }
498    }
499
500    my $max_int = 0xFFFFFFFF;
501    my $min = $max_int;
502    for my $ia (@$avset)
503    {
504        for my $ib (@$bvset)
505        {
506            my $avar = $$grps[ $$vars[$ia][0]{igrp} ]{vars}[ $$vars[$ia][0]{ivar} ];
507            my $bvar = $$grps[ $$vars[$ib][0]{igrp} ]{vars}[ $$vars[$ib][0]{ivar} ];
508
509            if ( $avar eq $bvar ) { return $max_int; }
510            if ( $$opts{logic} eq 'exact' )
511            {
512                if ( multi_is_exact($avar,$bvar) ) { return $max_int; }
513                next;
514            }
515            elsif ( multi_is_subset($avar,$bvar) ) { return $max_int; }
516
517            my @atype = var_type($avar);
518            my @btype = var_type($bvar);
519            my $max = 0;
520            for my $a (@atype)
521            {
522                for my $b (@btype)
523                {
524                    if ( !exists($$score{$a}{$b}) ) { next; }
525                    if ( $max < $$score{$a}{$b} ) { $max = $$score{$a}{$b}; }
526                }
527            }
528            if ( !$max ) { return 0; }      # some of the variants in the two groups are not compatible
529            if ( $min > $max ) { $min = $max; }
530        }
531    }
532    if ( $$opts{logic} eq 'exact' ) { return 0; }
533
534    my $cnt = 0;
535    for my $ivar (@$avset,@$bvset)
536    {
537        my $var = $$vars[$ivar];
538        for my $grp (@$var)
539        {
540            $cnt += $$grp{cnt};
541        }
542    }
543    return (1<<(28+$min)) + $cnt;
544}
545
546
547