1#!/usr/bin/env perl 2# 3# Author: petr.danecek@sanger 4# 5# Test bcf synced reader's allele pairing 6# 7 8use strict; 9use warnings; 10use Carp; 11use Data::Dumper; 12use List::Util 'shuffle'; 13use File::Temp qw/ tempfile tempdir /; 14use FindBin; 15use lib "$FindBin::Bin"; 16 17my $opts = parse_params(); 18run_test($opts); 19 20exit; 21 22#-------------------------------- 23 24sub error 25{ 26 my (@msg) = @_; 27 if ( scalar @msg ) { confess @msg; } 28 print 29 "Usage: test-bcf-sr.pl [OPTIONS]\n", 30 "Options:\n", 31 " -s, --seed <int> Random seed\n", 32 " -t, --temp-dir <dir> When given, temporary files will not be removed\n", 33 " -v, --verbose \n", 34 " -h, -?, --help This help message\n", 35 "\n"; 36 exit -1; 37} 38sub parse_params 39{ 40 my $opts = {}; 41 while (defined(my $arg=shift(@ARGV))) 42 { 43 if ( $arg eq '-t' || $arg eq '--temp-dir' ) { $$opts{keep_files}=shift(@ARGV); next } 44 if ( $arg eq '-v' || $arg eq '--verbose' ) { $$opts{verbose}=1; next } 45 if ( $arg eq '-s' || $arg eq '--seed' ) { $$opts{seed}=shift(@ARGV); next } 46 if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } 47 error("Unknown parameter \"$arg\". Run -h for help.\n"); 48 } 49 $$opts{tmp} = exists($$opts{keep_files}) ? $$opts{keep_files} : tempdir(CLEANUP=>1); 50 if ( $$opts{keep_files} ) { cmd("mkdir -p $$opts{keep_files}"); } 51 if ( !exists($$opts{seed}) ) 52 { 53 $$opts{seed} = time(); 54 print STDERR "Random seed is $$opts{seed}\n"; 55 } 56 srand($$opts{seed}); 57 return $opts; 58} 59 60sub _cmd 61{ 62 my ($cmd) = @_; 63 my $kid_io; 64 my @out; 65 my $pid = open($kid_io, "-|"); 66 if ( !defined $pid ) { error("Cannot fork: $!"); } 67 if ($pid) 68 { 69 # parent 70 @out = <$kid_io>; 71 close($kid_io); 72 } 73 else 74 { 75 # child 76 exec('bash', '-o','pipefail','-c', $cmd) or error("Cannot execute the command [/bin/sh -o pipefail -c $cmd]: $!"); 77 } 78 return ($? >> 8, join('',@out)); 79} 80sub cmd 81{ 82 my ($cmd) = @_; 83 my ($ret,$out) = _cmd($cmd); 84 if ( $ret ) { error("The command failed [$ret]: $cmd\n", $out); } 85 return $out; 86} 87 88sub save_vcf 89{ 90 my ($opts,$vars,$fname) = @_; 91 open(my $fh,"| $FindBin::Bin/../bgzip -c > $fname") or error("$FindBin::Bin/../bgzip -c > $fname: !"); 92 print $fh qq[##fileformat=VCFv4.3\n]; 93 print $fh qq[##FILTER=<ID=PASS,Description="All filters passed">\n]; 94 print $fh qq[##contig=<ID=1>\n]; 95 print $fh qq[##contig=<ID=2>\n]; 96 print $fh '#'. join("\t", qw(CHROM POS ID REF ALT QUAL FILTER INFO))."\n"; 97 for my $var (@$vars) 98 { 99 my @als = split(/,/,$var); 100 my @alts = (); 101 my $ref; 102 for my $al (@als) 103 { 104 my ($xref,$alt) = split(/>/,$al); 105 $ref = $xref; 106 push @alts,$alt; 107 } 108 print $fh join("\t", (1,100,'.',$ref,join(',',@alts),'.','.','.'))."\n"; 109 } 110 for my $var (@$vars) 111 { 112 my @als = split(/,/,$var); 113 my @alts = (); 114 my $ref; 115 for my $al (@als) 116 { 117 my ($xref,$alt) = split(/>/,$al); 118 $ref = $xref; 119 push @alts,$alt; 120 } 121 print $fh join("\t", (1,300,'.',$ref,join(',',@alts),'.','.','.'))."\n"; 122 } 123 for my $var (@$vars) 124 { 125 my @als = split(/,/,$var); 126 my @alts = (); 127 my $ref; 128 for my $al (@als) 129 { 130 my ($xref,$alt) = split(/>/,$al); 131 $ref = $xref; 132 push @alts,$alt; 133 } 134 print $fh join("\t", (2,100,'.',$ref,join(',',@alts),'.','.','.'))."\n"; 135 } 136 close($fh) or error("close failed: bgzip -c > $fname"); 137 cmd("$FindBin::Bin/../tabix -f $fname"); 138} 139 140sub random_alt 141{ 142 my ($ref,$is_snp) = @_; 143 my @acgt = qw(A C G T); 144 my $alt = $acgt[rand @acgt]; 145 if ( $ref eq $alt ) { return '.'; } # ref 146 if ( !$is_snp ) { $alt = $ref.$alt; } 147 return $alt; 148} 149 150sub check_outputs 151{ 152 my ($fname_bin,$fname_perl) = @_; 153 my %out = (); 154 open(my $fh,'<',$fname_bin) or error("$fname_bin: $!"); 155 while (my $line=<$fh>) 156 { 157 my ($pos,@vals) = split(/\t/,$line); 158 chomp($vals[-1]); 159 push @{$out{$pos}},join("\t",@vals); 160 } 161 close($fh) or error("close failed: $fname_bin"); 162 if ( keys %out != 3 ) { error("Expected 3 positions, found ",scalar keys %out,": $fname_bin\n"); } 163 my $n; 164 for my $pos (keys %out) 165 { 166 if ( !defined $n ) { $n = scalar @{$out{$pos}}; } 167 if ( @{$out{$pos}} != $n ) { error("Expected $n positions, found ",scalar keys %{$out{$pos}},"\n"); } 168 } 169 my @blines = @{$out{(keys %out)[0]}}; 170 171 my @plines = (); 172 open($fh,'<',$fname_perl) or error("$fname_perl: $!"); 173 while (my $line=<$fh>) 174 { 175 chomp($line); 176 push @plines,$line; 177 } 178 close($fh) or error("close failed: $fname_perl"); 179 if ( @blines != @plines ) { error("Different number of lines: ",scalar @blines," vs ",scalar @plines," in $fname_bin vs $fname_perl\n"); } 180 @blines = sort @blines; 181 @plines = sort @plines; 182 for (my $i=0; $i<@plines; $i++) 183 { 184 if ( $blines[$i] ne $plines[$i] ) 185 { 186 #error("Different lines in $fname_bin vs $fname_perl:\n\t$blines[$i].\nvs\n\t$plines[$i].\n"); 187 error("Different lines in $fname_bin vs $fname_perl:\n\t".join("\n\t",@blines)."\nvs\n\t".join("\n\t",@plines)."\n"); 188 } 189 } 190} 191 192sub run_test 193{ 194 my ($opts) = @_; 195 my @acgt = qw(A C G T); 196 my $ref = $acgt[rand @acgt]; 197 my @vcfs = (); 198 my $nvcf = 1 + int(rand(10)); 199 for (my $i=0; $i<$nvcf; $i++) 200 { 201 my %vars = (); 202 my $nvars = 1 + int(rand(6)); 203 for (my $j=0; $j<$nvars; $j++) 204 { 205 my $snp = int(rand(2)); 206 my $alt = random_alt($ref,$snp); 207 my $var = "$ref>$alt"; 208 if ( $alt ne '.' && !int(rand(5)) ) # create multiallelic site 209 { 210 my $alt2 = random_alt($ref,$snp); 211 if ( $alt2 ne '.' && $alt ne $alt2 ) 212 { 213 $var .= ",$ref>$alt2"; 214 } 215 } 216 $vars{$var} = 1; 217 } 218 my $ndup = 1 + int(rand(4)); 219 for (my $j=0; $j<$ndup; $j++) 220 { 221 my @keys = shuffle keys %vars; 222 push @vcfs, \@keys; 223 } 224 } 225 @vcfs = shuffle @vcfs; 226 open(my $fh,'>',"$$opts{tmp}/list.txt") or error("$$opts{tmp}/list.txt: $!"); 227 my %groups = (); 228 my @group_list = (); 229 for (my $i=0; $i<@vcfs; $i++) 230 { 231 my $vcf = $vcfs[$i]; 232 my $key = join(';',sort @$vcf); 233 if ( !exists($groups{$key}) ) 234 { 235 push @group_list,$key; 236 $groups{$key}{vars} = [@$vcf]; 237 $groups{$key}{key} = $key; 238 } 239 push @{$groups{$key}{vcfs}},$i; 240 save_vcf($opts,$vcf,"$$opts{tmp}/$i.vcf.gz"); 241 print $fh "$$opts{tmp}/$i.vcf.gz\n"; 242 } 243 close($fh); 244 245 my @groups = (); 246 for my $group (@group_list) { push @groups, $groups{$group}; } 247 for my $logic (qw(snps indels both snps+ref indels+ref both+ref exact some all)) 248 #for my $logic (qw(snps)) 249 { 250 print STDERR "$FindBin::Bin/test-bcf-sr $$opts{tmp}/list.txt -p $logic > $$opts{tmp}/rmme.bin.out\n" unless !$$opts{verbose}; 251 cmd("$FindBin::Bin/test-bcf-sr $$opts{tmp}/list.txt -p $logic > $$opts{tmp}/rmme.bin.out"); 252 253 open(my $fh,'>',"$$opts{tmp}/rmme.perl.out") or error("$$opts{tmp}/rmme.perl.out: $!"); 254 $$opts{fh} = $fh; 255 $$opts{logic} = $logic; 256 pair_lines($opts,\@groups); 257 close($fh) or error("close failed: $$opts{tmp}/rmme.perl.out"); 258 259 check_outputs("$$opts{tmp}/rmme.bin.out","$$opts{tmp}/rmme.perl.out"); 260 } 261} 262 263sub pair_lines 264{ 265 my ($opts,$groups) = @_; 266 267 #print 'groups: '.Dumper($groups); 268 269 # get a list of all unique variants and their groups 270 my %vars = (); 271 my @var_list = (); 272 for (my $igrp=0; $igrp<@$groups; $igrp++) 273 { 274 my $grp = $$groups[$igrp]; 275 for (my $ivar=0; $ivar<@{$$grp{vars}}; $ivar++) 276 { 277 my $var = $$grp{vars}[$ivar]; 278 if ( !exists($vars{$var}) ) { push @var_list,$var; } # just to keep the order 279 push @{$vars{$var}}, { igrp=>$igrp, ivar=>$ivar, cnt=>scalar @{$$grp{vcfs}} }; 280 } 281 } 282 283 # each variant has a list of groups that it is present in 284 my @vars = (); 285 for my $var (@var_list) { push @vars, $vars{$var}; } 286 287 #print STDERR 'unique variants: '.Dumper(\@var_list); 288 # for (my $i=0; $i<@vars; $i++) 289 # { 290 # my $igrp = $vars[$i][0]{igrp}; 291 # my $jvar = $vars[$i][0]{ivar}; 292 # my $var = $$groups[$igrp]{vars}[$jvar]; 293 # print STDERR "$i: $var\n"; 294 # } 295 296 # initialize variant sets - combinations of compatible variants across multiple reader groups 297 my @var_sets = (); 298 for (my $i=0; $i<@vars; $i++) { push @var_sets,[$i]; } 299 300 my @bitmask = (); 301 my @pmatrix = (); 302 for (my $iset=0; $iset<@var_sets; $iset++) 303 { 304 $pmatrix[$iset] = [(0) x (scalar @$groups)]; 305 $bitmask[$iset] = 0; 306 } 307 my @max; 308 for (my $iset=0; $iset<@var_sets; $iset++) 309 { 310 my $tmp_max = 0; 311 for my $ivar (@{$var_sets[$iset]}) 312 { 313 my $var = $vars[$ivar]; 314 for my $grp (@$var) 315 { 316 my $igrp = $$grp{igrp}; 317 $pmatrix[$iset][$igrp] += $$grp{cnt}; 318 if ( $bitmask[$iset] & (1<<$igrp) ) { error("Uh!"); } 319 $bitmask[$iset] |= 1<<$igrp; 320 $tmp_max += $$grp{cnt}; 321 } 322 } 323 push @max, $tmp_max; 324 } 325 326 # pair the lines 327 while ( @var_sets ) 328 { 329 my $imax = 0; 330 for (my $iset=1; $iset<@var_sets; $iset++) 331 { 332 if ( $max[$iset] > $max[$imax] ) { $imax = $iset; } 333 } 334 # if ( @var_sets == @vars ) { dump_pmatrix($groups,\@vars,\@var_sets,\@pmatrix,\@bitmask); } 335 336 my $ipair = undef; 337 my $max_score = 0; 338 for (my $iset=0; $iset<@var_sets; $iset++) 339 { 340 if ( $bitmask[$imax] & $bitmask[$iset] ) { next; } # cannot merge 341 my $score = pairing_score($opts,$groups,\@vars,$var_sets[$imax],$var_sets[$iset]); 342 if ( $max_score < $score ) { $max_score = $score; $ipair = $iset; } 343 } 344 345 # merge rows thus creating a new variant set 346 if ( defined $ipair && $ipair != $imax ) 347 { 348 $imax = merge_rows($groups,\@vars,\@var_sets,\@pmatrix,\@bitmask,\@max,$imax,$ipair); 349 next; 350 } 351 352 output_row($opts,$groups,\@vars,\@var_sets,\@pmatrix,\@bitmask,\@max,$imax); 353 # dump_pmatrix($groups,\@vars,\@var_sets,\@pmatrix,\@bitmask); 354 } 355} 356 357sub merge_rows 358{ 359 my ($grps,$vars,$var_sets,$pmat,$bitmask,$max,$ivset,$jvset) = @_; 360 if ( $ivset > $jvset ) { my $tmp = $ivset; $ivset = $jvset; $jvset = $tmp; } 361 push @{$$var_sets[$ivset]}, @{$$var_sets[$jvset]}; 362 for (my $igrp=0; $igrp<@{$$pmat[$ivset]}; $igrp++) 363 { 364 $$pmat[$ivset][$igrp] += $$pmat[$jvset][$igrp]; 365 } 366 $$max[$ivset] += $$max[$jvset]; 367 $$bitmask[$ivset] |= $$bitmask[$jvset]; 368 splice(@$var_sets,$jvset,1); 369 splice(@$pmat,$jvset,1); 370 splice(@$bitmask,$jvset,1); 371 splice(@$max,$jvset,1); 372 return $ivset; 373} 374 375sub output_row 376{ 377 my ($opts,$grps,$vars,$var_sets,$pmat,$bitmask,$max,$ivset) = @_; 378 my $varset = $$var_sets[$ivset]; 379 my @tmp = (); 380 for my $grp (@$grps) 381 { 382 for my $vcf (@{$$grp{vcfs}}) { push @tmp, '-'; } 383 } 384 for my $idx (@$varset) 385 { 386 for my $var (@{$$vars[$idx]}) 387 { 388 my $igrp = $$var{igrp}; 389 my $jvar = $$var{ivar}; 390 my $str = $$grps[$igrp]{vars}[$jvar]; 391 $str =~ s/[^>]>//g; 392 for my $ivcf (@{$$grps[$igrp]{vcfs}}) { $tmp[$ivcf] = $str; } 393 } 394 } 395 print {$$opts{fh}} join("\t",@tmp)."\n"; 396 splice(@$var_sets,$ivset,1); 397 splice(@$pmat,$ivset,1); 398 splice(@$bitmask,$ivset,1); 399 splice(@$max,$ivset,1); 400} 401 402sub dump_pmatrix 403{ 404 my ($grps,$vars,$var_sets,$pmat,$bitmask) = @_; 405 for (my $ivset=0; $ivset<@$var_sets; $ivset++) 406 { 407 my $varset = $$var_sets[$ivset]; 408 my @tmp = (); 409 for my $ivar (@$varset) 410 { 411 my $igrp = $$vars[$ivar][0]{igrp}; 412 my $jvar = $$vars[$ivar][0]{ivar}; 413 push @tmp, $$grps[$igrp]{vars}[$jvar]; 414 } 415 printf STDERR "%-10s",join(',',@tmp); 416 for (my $igrp=0; $igrp<@{$$pmat[0]}; $igrp++) 417 { 418 print STDERR "\t$$pmat[$ivset][$igrp]"; 419 } 420 print STDERR "\n"; 421 } 422 print STDERR "\n"; 423} 424 425sub var_type 426{ 427 my ($vars) = @_; 428 my %type = (); 429 for my $var (split(/,/,$vars)) 430 { 431 my ($ref,$alt) = split(/>/,$var); 432 if ( $ref eq $alt or $alt eq '.' ) { $type{ref} = 1; } 433 elsif ( length($ref)==length($alt) && length($ref)==1 ) { $type{snp} = 1; } 434 else { $type{indel} = 1; } 435 } 436 return keys %type; 437} 438sub multi_is_subset 439{ 440 my ($avar,$bvar) = @_; 441 my %avars = (); 442 my %bvars = (); 443 for my $var (split(/,/,$avar)) { $avars{$var} = 1; } 444 for my $var (split(/,/,$bvar)) { $bvars{$var} = 1; } 445 for my $var (keys %avars) 446 { 447 if ( exists($bvars{$var}) ) { return 1; } 448 } 449 for my $var (keys %bvars) 450 { 451 if ( exists($avars{$var}) ) { return 1; } 452 } 453 return 0; 454} 455sub multi_is_exact 456{ 457 my ($avar,$bvar) = @_; 458 my %avars = (); 459 my %bvars = (); 460 for my $var (split(/,/,$avar)) { $avars{$var} = 1; } 461 for my $var (split(/,/,$bvar)) { $bvars{$var} = 1; } 462 for my $var (keys %avars) 463 { 464 if ( !exists($bvars{$var}) ) { return 0; } 465 } 466 for my $var (keys %bvars) 467 { 468 if ( !exists($avars{$var}) ) { return 0; } 469 } 470 return 1; 471} 472sub pairing_score 473{ 474 my ($opts,$grps,$vars,$avset,$bvset) = @_; 475 476 my $score = {}; 477 if ( $$opts{logic}=~/both/ or $$opts{logic}=~/snps/ or $$opts{logic}=~/all/ ) 478 { 479 $$score{snp}{snp} = 3; 480 if ( $$opts{logic}=~/ref/ or $$opts{logic}=~/all/ ) { $$score{snp}{ref} = 2; } 481 } 482 if ( $$opts{logic}=~/both/ or $$opts{logic}=~/indels/ or $$opts{logic}=~/all/ ) 483 { 484 $$score{indel}{indel} = 3; 485 if ( $$opts{logic}=~/ref/ or $$opts{logic}=~/all/ ) { $$score{indel}{ref} = 2; } 486 } 487 if ( $$opts{logic}=~/all/ ) 488 { 489 $$score{snp}{indel} = 1; 490 $$score{indel}{snp} = 1; 491 } 492 for my $a (keys %$score) 493 { 494 for my $b (keys %{$$score{$a}}) 495 { 496 $$score{$b}{$a} = $$score{$a}{$b}; 497 } 498 } 499 500 my $max_int = 0xFFFFFFFF; 501 my $min = $max_int; 502 for my $ia (@$avset) 503 { 504 for my $ib (@$bvset) 505 { 506 my $avar = $$grps[ $$vars[$ia][0]{igrp} ]{vars}[ $$vars[$ia][0]{ivar} ]; 507 my $bvar = $$grps[ $$vars[$ib][0]{igrp} ]{vars}[ $$vars[$ib][0]{ivar} ]; 508 509 if ( $avar eq $bvar ) { return $max_int; } 510 if ( $$opts{logic} eq 'exact' ) 511 { 512 if ( multi_is_exact($avar,$bvar) ) { return $max_int; } 513 next; 514 } 515 elsif ( multi_is_subset($avar,$bvar) ) { return $max_int; } 516 517 my @atype = var_type($avar); 518 my @btype = var_type($bvar); 519 my $max = 0; 520 for my $a (@atype) 521 { 522 for my $b (@btype) 523 { 524 if ( !exists($$score{$a}{$b}) ) { next; } 525 if ( $max < $$score{$a}{$b} ) { $max = $$score{$a}{$b}; } 526 } 527 } 528 if ( !$max ) { return 0; } # some of the variants in the two groups are not compatible 529 if ( $min > $max ) { $min = $max; } 530 } 531 } 532 if ( $$opts{logic} eq 'exact' ) { return 0; } 533 534 my $cnt = 0; 535 for my $ivar (@$avset,@$bvset) 536 { 537 my $var = $$vars[$ivar]; 538 for my $grp (@$var) 539 { 540 $cnt += $$grp{cnt}; 541 } 542 } 543 return (1<<(28+$min)) + $cnt; 544} 545 546 547