1#!/usr/local/bin/perl 2# 3# Copyright 2010-2019, Julian Catchen <jcatchen@illinois.edu> 4# 5# This file is part of Stacks. 6# 7# Stacks is free software: you can redistribute it and/or modify 8# it under the terms of the GNU General Public License as published by 9# the Free Software Foundation, either version 3 of the License, or 10# (at your option) any later version. 11# 12# Stacks is distributed in the hope that it will be useful, 13# but WITHOUT ANY WARRANTY; without even the implied warranty of 14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15# GNU General Public License for more details. 16# 17# You should have received a copy of the GNU General Public License 18# along with Stacks. If not, see <http://www.gnu.org/licenses/>. 19# 20 21use strict; 22use POSIX; 23use File::Temp qw/ mktemp /; 24use File::Spec; 25use constant stacks_version => "_VERSION_"; 26 27use constant true => 1; 28use constant false => 0; 29 30my $dry_run = false; 31my $exe_path = "_BINDIR_"; 32my $out_path = ""; 33my $popmap_path = ""; 34my $sample_path = ""; 35my $db = ""; 36my $sample_id = 1; 37my $gzip = false; 38my $time = ""; 39 40my @parents; 41my @progeny; 42my @samples; 43 44my (@_gstacks, @_populations); 45 46my $cmd_str = $0 . " " . join(" ", @ARGV); 47 48parse_command_line(); 49 50# 51# Check for the existence of the necessary pipeline programs 52# 53foreach my $prog ("gstacks", "populations") { 54 die "Unable to find '" . $exe_path . $prog . "'.\n" if (!-e $exe_path . $prog); 55} 56 57my ($log, $log_fh, $sample); 58 59my (@sample_list, %pop_ids, %pops, %grp_ids, %grps, %sample_ids); 60 61parse_population_map(\@sample_list, \%pop_ids, \%pops, \%grp_ids, \%grps); 62 63initialize_samples(\@parents, \@progeny, \@samples, \@sample_list, \%pop_ids, \%grp_ids); 64 65# 66# Open the log file 67# 68$log = "$out_path/ref_map.log"; 69open($log_fh, ">$log") or die("Unable to open log file '$log'; $!\n"); 70 71print $log_fh 72 "ref_map.pl version ", stacks_version, " started at ", strftime("%Y-%m-%d %H:%M:%S", (localtime(time))), "\n", 73 $cmd_str, "\n"; 74 75execute_stacks($log_fh, $sample_id, \@parents, \@progeny, \@samples, \%sample_ids); 76 77print $log_fh "\nref_map.pl completed at ", strftime("%Y-%m-%d %H:%M:%S", (localtime(time))), "\n"; 78close($log_fh); 79 80sub check_return_value { 81 # $? is a 16 bit int. Exit code is given by `$? & 255` if the process was 82 # terminated by a signal, and by `$? >> 8` if it exited normally. 83 my ($rv, $log_fh) = @_; 84 if ($rv != 0) { 85 my $code = ($rv >> 8) & 127; 86 if ($rv & 255 || ($rv >> 8) > 127) { 87 $code += 128; 88 } 89 my $msg = "\nref_map.pl: Aborted because the last command failed ($code"; 90 if ($code == 129 || $code == 130 || $code == 131) { 91 $msg .= "/interrupted"; 92 } elsif ($code == 137 || $code == 143) { 93 $msg .= "/killed"; 94 } elsif ($code == 134) { 95 $msg .= "/SIGABRT"; 96 } elsif ($code == 139) { 97 $msg .= "/segmentation fault"; 98 } 99 $msg .= ")"; 100 print $log_fh ($msg . ".\n"); 101 print STDERR ($msg . "; see log file.\n"); 102 exit 1; 103 } 104} 105 106sub execute_stacks { 107 my ($log_fh, $sample_id, $parents, $progeny, $samples, $sample_ids) = @_; 108 109 my (@results, @depths_of_cov); 110 my ($pop_cnt, $sample, $num_files, $i, $cmd, $pipe_fh, $path, $cat_file); 111 112 # 113 # Call genotypes. 114 # 115 print STDERR "Calling variants, genotypes and haplotypes...\n"; 116 print $log_fh "\ngstacks\n==========\n"; 117 118 $cmd = $exe_path . "gstacks -I $sample_path -M $popmap_path -O $out_path"; 119 foreach (@_gstacks) { 120 $cmd .= " " . $_; 121 } 122 print STDERR " $cmd\n\n"; 123 print $log_fh "$cmd\n\n"; 124 if (!$dry_run) { 125 open($pipe_fh, "$time $cmd 2>&1 |"); 126 while (<$pipe_fh>) { 127 print $log_fh $_; 128 } 129 close($pipe_fh); 130 check_return_value($?, $log_fh); 131 } 132 133 printf(STDERR "Calculating population-level summary statistics\n"); 134 print $log_fh "\npopulations\n==========\n"; 135 136 $cmd = $exe_path . "populations" . " -P $out_path " . join(" ", @_populations); 137 print STDERR " $cmd\n\n"; 138 print $log_fh "$cmd\n\n"; 139 140 if (!$dry_run) { 141 open($pipe_fh, "$time $cmd 2>&1 |"); 142 while (<$pipe_fh>) { 143 print $log_fh $_; 144 } 145 close($pipe_fh); 146 check_return_value($?, $log_fh); 147 } 148 149 print STDERR "ref_map.pl is done.\n"; 150 print $log_fh "ref_map.pl is done.\n"; 151} 152 153sub parse_population_map { 154 my ($sample_list, $pop_ids, $pops, $grp_ids, $grps) = @_; 155 156 my ($fh, @parts, $line, $sample); 157 158 return if (length($popmap_path) == 0); 159 160 open($fh, "<$popmap_path") or die("Unable to open population map, '$popmap_path', $!\n"); 161 162 while ($line = <$fh>) { 163 chomp $line; 164 165 next if ($line =~ /^\s*#/); 166 167 @parts = split(/\t/, $line); 168 if (scalar(@parts) != 2 and scalar(@parts) != 3) { 169 die("Unable to parse population map, '$popmap_path' (expected 2 or 3 columns, found " . scalar(@parts) . "); at line:\n$line\n"); 170 } 171 172 foreach my $part (@parts) { 173 $part =~ s/^\s*|\s*$//g; 174 } 175 176 push(@{$sample_list}, $parts[0]); 177 178 $pop_ids->{$parts[0]} = $parts[1]; 179 $pops->{$parts[1]}++; 180 181 if (scalar(@parts) > 2) { 182 $grp_ids->{$parts[0]} = $parts[2]; 183 $grps->{$parts[2]}++; 184 } 185 } 186 187 if (scalar(keys %{$grps}) == 0) { 188 $grps->{"1"}++; 189 190 foreach $sample (@{$sample_list}) { 191 $grp_ids->{$sample} = "1"; 192 } 193 } 194 195 print STDERR "Parsed population map: ", scalar(@{$sample_list}), " files in ", scalar(keys %{$pops}); 196 scalar(keys %{$pops}) == 1 ? print STDERR " population" : print STDERR " populations"; 197 print STDERR " and ", scalar(keys %{$grps}); 198 scalar(keys %{$grps}) == 1 ? print STDERR " group.\n" : print STDERR " groups.\n"; 199 200 close($fh); 201} 202 203sub initialize_samples { 204 my ($parents, $progeny, $samples, $sample_list, $pop_ids, $grp_ids) = @_; 205 206 my ($local_gzip, $file, $prefix, $suffix, $path, $found, $i); 207 208 if (scalar(@{$sample_list}) > 0 && scalar(@{$samples}) == 0) { 209 my @suffixes = ("bam"); 210 my @fmts = ("bam"); 211 212 # 213 # If a population map was specified and no samples were provided on the command line. 214 # 215 foreach $sample (@{$sample_list}) { 216 $found = false; 217 218 for ($i = 0; $i < scalar(@suffixes); $i++) { 219 $path = $sample_path . $sample . "." . $suffixes[$i]; 220 if (-e $path) { 221 222 $gzip = true if ($i == 1); 223 224 push(@{$samples}, {'path' => $sample_path, 225 'file' => $sample, 226 'suffix' => $suffixes[$i], 227 'type' => "sample", 228 'fmt' => $fmts[$i]}); 229 $found = true; 230 last; 231 } 232 } 233 234 if ($found == false) { 235 die("Error: Failed to open '$sample_path$sample.bam'.\n"); 236 } 237 } 238 } 239 240 # 241 # If a population map was specified, make sure all samples in the list were found (and vice versa) and assign popualtion IDs. 242 # 243 if (scalar(@{$sample_list}) > 0) { 244 245 my %sample_hash; 246 247 foreach $sample (@{$samples}) { 248 $sample_hash{$sample->{'file'}}++; 249 250 if (!defined($pop_ids->{$sample->{'file'}})) { 251 die("Unable to find an entry for '" . $sample->{'file'} . "' in the population map, '$popmap_path'.\n"); 252 } else { 253 $sample->{'pop_id'} = $pop_ids->{$sample->{'file'}}; 254 } 255 if (!defined($grp_ids->{$sample->{'file'}})) { 256 die("Unable to find an entry for '" . $sample->{'file'} . "' in the population map, '$popmap_path'.\n"); 257 } else { 258 $sample->{'grp_id'} = $grp_ids->{$sample->{'file'}}; 259 } 260 } 261 262 foreach $sample (@{$sample_list}) { 263 if (!defined($sample_hash{$sample})) { 264 die("Unable to find a file corresponding to the population map entry '" . $sample . "' in the population map, '$popmap_path'.\n"); 265 } 266 } 267 268 } else { 269 foreach $sample (@{$parents}, @{$progeny}, @{$samples}) { 270 $sample->{'pop_id'} = "1"; 271 $sample->{'grp_id'} = "1"; 272 $pop_ids->{$sample->{'file'}} = $sample->{'pop_id'}; 273 $grp_ids->{$sample->{'file'}} = $sample->{'grp_id'}; 274 } 275 } 276 277 # 278 # Check that no duplicate files were specified. 279 # 280 my (%files, $file); 281 foreach $file (@{$parents}, @{$progeny}, @{$samples}) { 282 $files{$file}++; 283 } 284 foreach $file (keys %files) { 285 if ($files{$file} > 1) { 286 die("A duplicate file was specified which may create undefined results, '$file'\n"); 287 } 288 } 289 290 print STDERR "Found ", scalar(@{$parents}), " parental file(s).\n\n" if (scalar(@{$parents}) > 0); 291 print STDERR "Found ", scalar(@{$progeny}), " progeny file(s).\n\n" if (scalar(@{$progeny}) > 0); 292 print STDERR "Found ", scalar(@{$samples}), " sample file(s).\n\n" if (scalar(@{$samples}) > 0); 293 294 if ( scalar(@{$samples}) > 0 && (scalar(@{$parents}) > 0 || scalar(@{$progeny}) > 0) ) { 295 die("Both samples and parents/progeny were specified either on the command line (-s/-r/-p) or within the population map. Only one of the other may be specified.\n"); 296 } 297} 298 299sub write_results { 300 my ($results, $log_fh) = @_; 301 302 my $line; 303 304 foreach $line (@{$results}) { 305 if ($line =~ /\r/) { 306 $line =~ s/^.+\r(.*\n)$/\1/; 307 } 308 print $log_fh $line; 309 } 310} 311 312sub write_depths_of_cov { 313 my ($depths, $log_fh) = @_; 314 315 print STDERR "\nDepths of Coverage for Processed Samples:\n"; 316 print $log_fh "\nDepths of Coverage for Processed Samples:\n"; 317 318 foreach $a (@{$depths}) { 319 print STDERR $a->[0], ": ", $a->[1], "x\n"; 320 print $log_fh $a->[0], ": ", $a->[1], "x\n"; 321 } 322} 323 324sub parse_command_line { 325 my ($arg); 326 327 while (@ARGV) { 328 $_ = shift @ARGV; 329 if ($_ =~ /^-v$/ || $_ =~ /^--version$/) { version(); exit 1; } 330 elsif ($_ =~ /^-h$/) { usage(); } 331 elsif ($_ =~ /^-d$/ || $_ =~ /^--dry-run$/) { $dry_run = true; } 332 elsif ($_ =~ /^-o$/) { $out_path = shift @ARGV; } 333 elsif ($_ =~ /^-e$/) { $exe_path = shift @ARGV; } 334 elsif ($_ =~ /^--samples$/) { 335 $sample_path = shift @ARGV; 336 } elsif ($_ =~ /^-O$/ || $_ =~ /^--popmap$/) { 337 $popmap_path = shift @ARGV; 338 push(@_populations, "-M " . $popmap_path); 339 340 } elsif ($_ =~ /^--unpaired$/) { 341 push(@_gstacks, "--unpaired"); 342 343 } elsif ($_ =~ /^--ignore-pe-reads$/) { 344 push(@_gstacks, "--ignore-pe-reads"); 345 346 } elsif ($_ =~ /^-T$/) { 347 $arg = shift @ARGV; 348 push(@_gstacks, "-t " . $arg); 349 push(@_populations, "-t " . $arg); 350 351 } elsif ($_ =~ /^--rm-pcr-duplicates$/) { 352 push(@_gstacks, "--rm-pcr-duplicates"); 353 354 } elsif ($_ =~ /^--var-alpha$/) { 355 $arg = shift @ARGV; 356 push(@_gstacks, "--var-alpha " . $arg); 357 358 } elsif ($_ =~ /^--gt-alpha$/) { 359 $arg = shift @ARGV; 360 push(@_gstacks, "--gt-alpha " . $arg); 361 362 } elsif ($_ =~ /^-X$/) { 363 # 364 # Pass an arbitrary command-line option to a pipeline program. 365 # 366 # Command line option must be of the form '-X "program:option"' 367 # 368 $arg = shift @ARGV; 369 my ($prog, $opt) = ($arg =~ /^(\w+):(.+)$/); 370 371 if ($prog eq "gstacks") { 372 push(@_gstacks, $opt); 373 374 } elsif ($prog eq "populations") { 375 push(@_populations, $opt); 376 } else { 377 print STDERR "Unknown pipeline program, '$arg'\n"; 378 usage(); 379 } 380 } elsif ($_ =~ /^--time-components$/) { 381 $time = '/usr/bin/time'; 382 if (! -e $time) { 383 die "Error: '$time': No such file or directory.\n"; 384 } 385 } else { 386 print STDERR "Unknown command line option: '$_'\n"; 387 usage(); 388 } 389 } 390 391 $exe_path = $exe_path . "/" if (substr($exe_path, -1) ne "/"); 392 $out_path = substr($out_path, 0, -1) if (substr($out_path, -1) eq "/"); 393 394 if (length($popmap_path) == 0) { 395 print STDERR "You must specify a population map that lists your sample names (--popmap).\n"; 396 usage(); 397 } 398 399 if (length($sample_path) == 0) { 400 print STDERR "You must specify the path to the directory containing the samples (--samples).\n"; 401 usage(); 402 } 403 404 if (length($sample_path) > 0) { 405 $sample_path .= "/" if (substr($sample_path, -1) ne "/"); 406 } 407} 408 409sub version { 410 print STDERR "ref_map.pl ", stacks_version, "\n"; 411} 412 413sub usage { 414 version(); 415 416 print STDERR <<EOQ; 417ref_map.pl --samples path --popmap path [-s spacer] -o path [--rm-pcr-duplicates] [-X prog:"opts" ...] 418 419 Input/Output files: 420 --samples: path to the directory containing the samples BAM (or SAM) alignment files. 421 --popmap: path to a population map file (format is "<name> TAB <pop>", one sample per line). 422 s: spacer for file names: by default this is empty and the program looks for files 423 named 'SAMPLE_NAME.bam'; if this option is given the program looks for files 424 named 'SAMPLE_NAME.SPACER.bam'. 425 o: path to an output directory. 426 --unpaired: ignore read pairing (for paired-end GBS; treat READ2's as if they were READ1's) 427 --ignore-pe-reads: ignore paired-end reads even if present in the input 428 429 General options: 430 X: additional options for specific pipeline components, e.g. -X "populations: -p 3 -r 0.50" 431 T: the number of threads/CPUs to use (default: 1). 432 d: Dry run. Do not actually execute anything, just print the individual pipeline commands 433 that would be executed. 434 435 Paired-end options: 436 --rm-pcr-duplicates: remove all but one copy of read pairs of the same sample that have 437 the same insert length. 438 439 SNP model options: 440 --var-alpha: significance level at which to call variant sites (for gstacks; default: 0.05). 441 --gt-alpha: significance level at which to call genotypes (for gstacks; default: 0.05). 442 443 Miscellaneous: 444 --time-components (for benchmarking) 445EOQ 446 exit 1; 447} 448