1#!/usr/local/bin/perl
2#
3# Copyright 2010-2019, Julian Catchen <jcatchen@illinois.edu>
4#
5# This file is part of Stacks.
6#
7# Stacks is free software: you can redistribute it and/or modify
8# it under the terms of the GNU General Public License as published by
9# the Free Software Foundation, either version 3 of the License, or
10# (at your option) any later version.
11#
12# Stacks is distributed in the hope that it will be useful,
13# but WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15# GNU General Public License for more details.
16#
17# You should have received a copy of the GNU General Public License
18# along with Stacks.  If not, see <http://www.gnu.org/licenses/>.
19#
20
21use strict;
22use POSIX;
23use File::Temp qw/ mktemp /;
24use File::Spec;
25use constant stacks_version => "_VERSION_";
26
27use constant true  => 1;
28use constant false => 0;
29
30my $dry_run      = false;
31my $exe_path     = "_BINDIR_";
32my $out_path     = "";
33my $popmap_path  = "";
34my $sample_path  = "";
35my $db           = "";
36my $sample_id    = 1;
37my $gzip         = false;
38my $time         = "";
39
40my @parents;
41my @progeny;
42my @samples;
43
44my (@_gstacks, @_populations);
45
46my $cmd_str = $0 . " " . join(" ", @ARGV);
47
48parse_command_line();
49
50#
51# Check for the existence of the necessary pipeline programs
52#
53foreach my $prog ("gstacks", "populations") {
54    die "Unable to find '" . $exe_path . $prog . "'.\n" if (!-e $exe_path . $prog);
55}
56
57my ($log, $log_fh, $sample);
58
59my (@sample_list, %pop_ids, %pops, %grp_ids, %grps, %sample_ids);
60
61parse_population_map(\@sample_list, \%pop_ids, \%pops, \%grp_ids, \%grps);
62
63initialize_samples(\@parents, \@progeny, \@samples, \@sample_list, \%pop_ids, \%grp_ids);
64
65#
66# Open the log file
67#
68$log = "$out_path/ref_map.log";
69open($log_fh, ">$log") or die("Unable to open log file '$log'; $!\n");
70
71print $log_fh
72    "ref_map.pl version ", stacks_version, " started at ", strftime("%Y-%m-%d %H:%M:%S", (localtime(time))), "\n",
73    $cmd_str, "\n";
74
75execute_stacks($log_fh, $sample_id, \@parents, \@progeny, \@samples, \%sample_ids);
76
77print $log_fh "\nref_map.pl completed at ", strftime("%Y-%m-%d %H:%M:%S", (localtime(time))), "\n";
78close($log_fh);
79
80sub check_return_value {
81    # $? is a 16 bit int. Exit code is given by `$? & 255` if the process was
82    # terminated by a signal, and by `$? >> 8` if it exited normally.
83    my ($rv, $log_fh) = @_;
84    if ($rv != 0) {
85        my $code = ($rv >> 8) & 127;
86        if ($rv & 255 || ($rv >> 8) > 127) {
87            $code += 128;
88        }
89        my $msg = "\nref_map.pl: Aborted because the last command failed ($code";
90        if ($code == 129 || $code == 130 || $code == 131) {
91            $msg .= "/interrupted";
92        } elsif ($code == 137 || $code == 143) {
93            $msg .= "/killed";
94        } elsif ($code == 134) {
95            $msg .= "/SIGABRT";
96        } elsif ($code == 139) {
97            $msg .= "/segmentation fault";
98        }
99        $msg .= ")";
100        print $log_fh ($msg . ".\n");
101        print STDERR ($msg . "; see log file.\n");
102        exit 1;
103    }
104}
105
106sub execute_stacks {
107    my ($log_fh, $sample_id, $parents, $progeny, $samples, $sample_ids) = @_;
108
109    my (@results, @depths_of_cov);
110    my ($pop_cnt, $sample, $num_files, $i, $cmd, $pipe_fh, $path, $cat_file);
111
112    #
113    # Call genotypes.
114    #
115    print STDERR "Calling variants, genotypes and haplotypes...\n";
116    print $log_fh "\ngstacks\n==========\n";
117
118    $cmd = $exe_path . "gstacks -I $sample_path -M $popmap_path -O $out_path";
119    foreach (@_gstacks) {
120        $cmd .= " " . $_;
121    }
122    print STDERR  "  $cmd\n\n";
123    print $log_fh "$cmd\n\n";
124    if (!$dry_run) {
125        open($pipe_fh, "$time $cmd 2>&1 |");
126        while (<$pipe_fh>) {
127            print $log_fh $_;
128        }
129        close($pipe_fh);
130        check_return_value($?, $log_fh);
131    }
132
133    printf(STDERR "Calculating population-level summary statistics\n");
134    print $log_fh "\npopulations\n==========\n";
135
136    $cmd = $exe_path . "populations" . " -P $out_path " . join(" ", @_populations);
137    print STDERR  "  $cmd\n\n";
138    print $log_fh "$cmd\n\n";
139
140    if (!$dry_run) {
141        open($pipe_fh, "$time $cmd 2>&1 |");
142        while (<$pipe_fh>) {
143            print $log_fh $_;
144        }
145        close($pipe_fh);
146        check_return_value($?, $log_fh);
147    }
148
149    print STDERR  "ref_map.pl is done.\n";
150    print $log_fh "ref_map.pl is done.\n";
151}
152
153sub parse_population_map {
154    my ($sample_list, $pop_ids, $pops, $grp_ids, $grps) = @_;
155
156    my ($fh, @parts, $line, $sample);
157
158    return if (length($popmap_path) == 0);
159
160    open($fh, "<$popmap_path") or die("Unable to open population map, '$popmap_path', $!\n");
161
162    while ($line = <$fh>) {
163        chomp $line;
164
165        next if ($line =~ /^\s*#/);
166
167        @parts = split(/\t/, $line);
168        if (scalar(@parts) != 2 and scalar(@parts) != 3) {
169            die("Unable to parse population map, '$popmap_path' (expected 2 or 3 columns, found " . scalar(@parts) . "); at line:\n$line\n");
170        }
171
172        foreach my $part (@parts) {
173            $part =~ s/^\s*|\s*$//g;
174        }
175
176        push(@{$sample_list}, $parts[0]);
177
178        $pop_ids->{$parts[0]} = $parts[1];
179        $pops->{$parts[1]}++;
180
181        if (scalar(@parts) > 2) {
182            $grp_ids->{$parts[0]} = $parts[2];
183            $grps->{$parts[2]}++;
184        }
185    }
186
187    if (scalar(keys %{$grps}) == 0) {
188        $grps->{"1"}++;
189
190        foreach $sample (@{$sample_list}) {
191            $grp_ids->{$sample} = "1";
192        }
193    }
194
195    print STDERR "Parsed population map: ", scalar(@{$sample_list}), " files in ", scalar(keys %{$pops});
196    scalar(keys %{$pops}) == 1 ?  print STDERR " population" : print STDERR " populations";
197    print STDERR " and ", scalar(keys %{$grps});
198    scalar(keys %{$grps}) == 1 ? print STDERR " group.\n" : print STDERR " groups.\n";
199
200    close($fh);
201}
202
203sub initialize_samples {
204    my ($parents, $progeny, $samples, $sample_list, $pop_ids, $grp_ids) = @_;
205
206    my ($local_gzip, $file, $prefix, $suffix, $path, $found, $i);
207
208    if (scalar(@{$sample_list}) > 0 && scalar(@{$samples}) == 0) {
209        my @suffixes = ("bam");
210        my @fmts     = ("bam");
211
212        #
213        # If a population map was specified and no samples were provided on the command line.
214        #
215        foreach $sample (@{$sample_list}) {
216            $found = false;
217
218            for ($i = 0; $i < scalar(@suffixes); $i++) {
219                $path = $sample_path . $sample . "." . $suffixes[$i];
220                if (-e $path) {
221
222                    $gzip = true if ($i == 1);
223
224                    push(@{$samples}, {'path'   => $sample_path,
225                                       'file'   => $sample,
226                                       'suffix' => $suffixes[$i],
227                                       'type'   => "sample",
228                                       'fmt'    => $fmts[$i]});
229                    $found = true;
230                    last;
231                }
232            }
233
234            if ($found == false) {
235                die("Error: Failed to open '$sample_path$sample.bam'.\n");
236            }
237        }
238    }
239
240    #
241    # If a population map was specified, make sure all samples in the list were found (and vice versa) and assign popualtion IDs.
242    #
243    if (scalar(@{$sample_list}) > 0) {
244
245        my %sample_hash;
246
247        foreach $sample (@{$samples}) {
248            $sample_hash{$sample->{'file'}}++;
249
250            if (!defined($pop_ids->{$sample->{'file'}})) {
251                die("Unable to find an entry for '" . $sample->{'file'} . "' in the population map, '$popmap_path'.\n");
252            } else {
253                $sample->{'pop_id'} = $pop_ids->{$sample->{'file'}};
254            }
255            if (!defined($grp_ids->{$sample->{'file'}})) {
256                die("Unable to find an entry for '" . $sample->{'file'} . "' in the population map, '$popmap_path'.\n");
257            } else {
258                $sample->{'grp_id'} = $grp_ids->{$sample->{'file'}};
259            }
260        }
261
262        foreach $sample (@{$sample_list}) {
263            if (!defined($sample_hash{$sample})) {
264                die("Unable to find a file corresponding to the population map entry '" . $sample . "' in the population map, '$popmap_path'.\n");
265            }
266        }
267
268    } else {
269        foreach $sample (@{$parents}, @{$progeny}, @{$samples}) {
270            $sample->{'pop_id'} = "1";
271            $sample->{'grp_id'} = "1";
272            $pop_ids->{$sample->{'file'}} = $sample->{'pop_id'};
273            $grp_ids->{$sample->{'file'}} = $sample->{'grp_id'};
274        }
275    }
276
277    #
278    # Check that no duplicate files were specified.
279    #
280    my (%files, $file);
281    foreach $file (@{$parents}, @{$progeny}, @{$samples}) {
282        $files{$file}++;
283    }
284    foreach $file (keys %files) {
285        if ($files{$file} > 1) {
286            die("A duplicate file was specified which may create undefined results, '$file'\n");
287        }
288    }
289
290    print STDERR "Found ", scalar(@{$parents}), " parental file(s).\n\n" if (scalar(@{$parents}) > 0);
291    print STDERR "Found ", scalar(@{$progeny}), " progeny file(s).\n\n" if (scalar(@{$progeny}) > 0);
292    print STDERR "Found ", scalar(@{$samples}), " sample file(s).\n\n" if (scalar(@{$samples}) > 0);
293
294    if ( scalar(@{$samples}) > 0 && (scalar(@{$parents}) > 0 || scalar(@{$progeny}) > 0) ) {
295	die("Both samples and parents/progeny were specified either on the command line (-s/-r/-p) or within the population map. Only one of the other may be specified.\n");
296    }
297}
298
299sub write_results {
300    my ($results, $log_fh) = @_;
301
302    my $line;
303
304    foreach $line (@{$results}) {
305        if ($line =~ /\r/) {
306            $line =~ s/^.+\r(.*\n)$/\1/;
307        }
308        print $log_fh $line;
309    }
310}
311
312sub write_depths_of_cov {
313    my ($depths, $log_fh) = @_;
314
315    print STDERR "\nDepths of Coverage for Processed Samples:\n";
316    print $log_fh "\nDepths of Coverage for Processed Samples:\n";
317
318    foreach $a (@{$depths}) {
319        print STDERR  $a->[0], ": ", $a->[1], "x\n";
320        print $log_fh $a->[0], ": ", $a->[1], "x\n";
321    }
322}
323
324sub parse_command_line {
325    my ($arg);
326
327    while (@ARGV) {
328        $_ = shift @ARGV;
329        if    ($_ =~ /^-v$/ || $_ =~ /^--version$/) { version(); exit 1; }
330        elsif ($_ =~ /^-h$/) { usage(); }
331        elsif ($_ =~ /^-d$/ || $_ =~ /^--dry-run$/) { $dry_run = true; }
332        elsif ($_ =~ /^-o$/) { $out_path  = shift @ARGV; }
333        elsif ($_ =~ /^-e$/) { $exe_path  = shift @ARGV; }
334        elsif ($_ =~ /^--samples$/) {
335            $sample_path = shift @ARGV;
336        } elsif ($_ =~ /^-O$/ || $_ =~ /^--popmap$/) {
337            $popmap_path = shift @ARGV;
338            push(@_populations, "-M " . $popmap_path);
339
340        } elsif ($_ =~ /^--unpaired$/) {
341            push(@_gstacks, "--unpaired");
342
343        } elsif ($_ =~ /^--ignore-pe-reads$/) {
344            push(@_gstacks, "--ignore-pe-reads");
345
346        } elsif ($_ =~ /^-T$/) {
347            $arg = shift @ARGV;
348            push(@_gstacks,     "-t " . $arg);
349            push(@_populations, "-t " . $arg);
350
351        } elsif ($_ =~ /^--rm-pcr-duplicates$/) {
352            push(@_gstacks, "--rm-pcr-duplicates");
353
354        } elsif ($_ =~ /^--var-alpha$/) {
355            $arg = shift @ARGV;
356            push(@_gstacks, "--var-alpha " . $arg);
357
358        } elsif ($_ =~ /^--gt-alpha$/) {
359            $arg = shift @ARGV;
360            push(@_gstacks, "--gt-alpha " . $arg);
361
362        } elsif ($_ =~ /^-X$/) {
363            #
364            # Pass an arbitrary command-line option to a pipeline program.
365            #
366            # Command line option must be of the form '-X "program:option"'
367            #
368            $arg = shift @ARGV;
369            my ($prog, $opt) = ($arg =~ /^(\w+):(.+)$/);
370
371            if ($prog eq "gstacks") {
372                push(@_gstacks, $opt);
373
374            } elsif ($prog eq "populations") {
375                push(@_populations, $opt);
376            } else {
377                print STDERR "Unknown pipeline program, '$arg'\n";
378                usage();
379            }
380        } elsif ($_ =~ /^--time-components$/) {
381            $time = '/usr/bin/time';
382            if (! -e $time) {
383                die "Error: '$time': No such file or directory.\n";
384            }
385        } else {
386            print STDERR "Unknown command line option: '$_'\n";
387            usage();
388        }
389    }
390
391    $exe_path = $exe_path . "/"          if (substr($exe_path, -1) ne "/");
392    $out_path = substr($out_path, 0, -1) if (substr($out_path, -1) eq "/");
393
394    if (length($popmap_path) == 0) {
395        print STDERR "You must specify a population map that lists your sample names (--popmap).\n";
396        usage();
397    }
398
399    if (length($sample_path) == 0) {
400        print STDERR "You must specify the path to the directory containing the samples (--samples).\n";
401        usage();
402    }
403
404    if (length($sample_path) > 0) {
405        $sample_path .= "/" if (substr($sample_path, -1) ne "/");
406    }
407}
408
409sub version {
410    print STDERR "ref_map.pl ", stacks_version, "\n";
411}
412
413sub usage {
414    version();
415
416    print STDERR <<EOQ;
417ref_map.pl --samples path --popmap path [-s spacer] -o path [--rm-pcr-duplicates] [-X prog:"opts" ...]
418
419  Input/Output files:
420    --samples: path to the directory containing the samples BAM (or SAM) alignment files.
421    --popmap: path to a population map file (format is "<name> TAB <pop>", one sample per line).
422    s: spacer for file names: by default this is empty and the program looks for files
423       named 'SAMPLE_NAME.bam'; if this option is given the program looks for files
424       named 'SAMPLE_NAME.SPACER.bam'.
425    o: path to an output directory.
426    --unpaired: ignore read pairing (for paired-end GBS; treat READ2's as if they were READ1's)
427    --ignore-pe-reads: ignore paired-end reads even if present in the input
428
429  General options:
430    X: additional options for specific pipeline components, e.g. -X "populations: -p 3 -r 0.50"
431    T: the number of threads/CPUs to use (default: 1).
432    d: Dry run. Do not actually execute anything, just print the individual pipeline commands
433       that would be executed.
434
435  Paired-end options:
436    --rm-pcr-duplicates: remove all but one copy of read pairs of the same sample that have
437                         the same insert length.
438
439  SNP model options:
440    --var-alpha: significance level at which to call variant sites (for gstacks; default: 0.05).
441    --gt-alpha: significance level at which to call genotypes (for gstacks; default: 0.05).
442
443  Miscellaneous:
444    --time-components (for benchmarking)
445EOQ
446    exit 1;
447}
448