1#!/usr/bin/env perl
2
3use strict;
4use warnings;
5use Getopt::Std;
6
7my %opts = (t=>1);
8getopts("MPSadskHo:R:x:t:", \%opts);
9
10die('
11Usage:   run-bwamem [options] <idxbase> <file1> [file2]
12
13Options: -o STR    prefix for output files                       [inferred from input]
14         -R STR    read group header line such as \'@RG\tID:foo\tSM:bar\'         [null]
15         -x STR    read type: pacbio, ont2d or intractg                      [default]
16                   intractg: intra-species contig (kb query, highly similar)
17                   pacbio:   pacbio subreads (~10kb query, high error rate)
18                   ont2d:    Oxford Nanopore reads (~10kb query, higher error rate)
19         -t INT    number of threads                                               [1]
20
21         -H        apply HLA typing
22         -a        trim HiSeq2000/2500 PE resequencing adapters (via trimadap)
23         -d        mark duplicate (via samblaster)
24         -S        for BAM input, don\'t shuffle
25         -s        sort the output alignment (via samtools; requring more RAM)
26         -k        keep temporary files generated by typeHLA
27         -M        mark shorter split hits as secondary
28
29Examples:
30
31 * Map paired-end reads to GRCh38+ALT+decoy+HLA and perform HLA typing:
32
33     run-bwamem -o prefix -t8 -HR"@RG\tID:foo\tSM:bar" hs38DH.fa read1.fq.gz read2.fq.gz
34
35   Note: HLA typing is only effective for high-coverage data. The typing accuracy varies
36   with the quality of input. It is only intended for research purpose, not for diagnostic.
37
38 * Remap coordinate-sorted BAM, transfer read groups tags, trim Illumina PE adapters and
39   sort the output. The BAM may contain single-end or paired-end reads, or a mixture of
40   the two types. Specifying -R stops read group transfer.
41
42     run-bwamem -sao prefix hs38DH.fa old-srt.bam
43
44   Note: the adaptor trimmer included in bwa.kit is chosen because it fits the current
45   mapping pipeline better. It is conservative and suboptimal. A more sophisticated
46   trimmer is recommended if this becomes a concern.
47
48 * Remap name-grouped BAM and mark duplicates:
49
50     run-bwamem -Sdo prefix hs38DH.fa old-unsrt.bam
51
52   Note: streamed duplicate marking requires all reads from a single paired-end library
53   to be aligned at the same time.
54
55Output files:
56
57  {-o}.aln.bam - final alignment
58  {-o}.hla.top - best genotypes for the 6 classical HLA genes (if there are HLA-* contigs)
59  {-o}.hla.all - additional HLA genotypes consistent with data
60  {-o}.log.*   - log files
61
62') if @ARGV < 2;
63
64my $idx = $ARGV[0];
65
66my $exepath = $0 =~/^\S+\/[^\/\s]+/? $0 : &which($0);
67my $root = $0 =~/^(\S+)\/[^\/\s]+/? $1 : undef;
68$root = $exepath =~/^(\S+)\/[^\/\s]+/? $1 : undef if !defined($root);
69die "ERROR: failed to locate the 'bwa.kit' directory\n" if !defined($root);
70
71die("ERROR: failed to locate the BWA index. Please run '$root/bwa index -p $idx ref.fa'.\n")
72  unless (-f "$idx.bwt" && -f "$idx.pac" && -f "$idx.sa" && -f "$idx.ann" && -f "$idx.amb");
73
74if (@ARGV >= 3 && $ARGV[1] =~ /\.(bam|sam|sam\.gz)$/) {
75	warn("WARNING: for SAM/BAM input, only the first sequence file is used.\n");
76	@ARGV = 2;
77}
78
79if (defined($opts{p}) && @ARGV >= 3) {
80	warn("WARNING: option -P is ignored as there are two input sequence files.\n");
81	delete $opts{p};
82}
83
84my $prefix;
85if (defined $opts{o}) {
86	$prefix = $opts{o};
87} elsif (@ARGV >= 3) {
88	my $len = length($ARGV[1]) < length($ARGV[2])? length($ARGV[1]) : length($ARGV[2]);
89	my $i;
90	for ($i = 0; $i < $len; ++$i) {
91		last if substr($ARGV[1], $i, 1) ne substr($ARGV[2], $i, 1)
92	}
93	$prefix = substr($ARGV[1], 0, $i) if $i > 0;
94} elsif ($ARGV[1] =~ /^(\S+)\.(fastq|fq|fasta|fa|mag|mag\.gz|fasta\.gz|fa\.gz|fastq\.gz|fq\.gz|bam)$/) {
95	$prefix = $1;
96}
97die("ERROR: failed to identify the prefix for output. Please specify -o.\n") unless defined($prefix);
98
99my $size = 0;
100my $comp_ratio = 3.;
101for my $f (@ARGV[1..$#ARGV]) {
102	my @a = stat($f);
103	my $s = $a[7];
104	die("ERROR: failed to read file $f\n") if !defined($s);
105	$s *= $comp_ratio if $f =~ /\.(gz|bam)$/;
106	$size += int($s) + 1;
107}
108
109my $is_pe = (defined($opts{p}) || @ARGV >= 3)? 1 : 0;
110my $is_bam = $ARGV[1] =~ /\.bam$/? 1 : 0;
111
112if (defined($opts{x})) {
113	delete($opts{d}); delete($opts{a}); delete $opts{p};
114}
115
116# for BAM input, find @RG header lines
117my @RG_lines = ();
118if ($is_bam && !defined($opts{R})) {
119	my $fh;
120	open($fh, "$root/samtools view -H $ARGV[1] |") || die;
121	while (<$fh>) {
122		chomp;
123		if (/^\@RG\t/) {
124			s/\t/\\t/g;
125			push(@RG_lines, "-H'$_'");
126		}
127	}
128	close($fh);
129}
130
131warn("WARNING: many programs require read groups. Please specify with -R if you can.\n") if !defined($opts{R}) && @RG_lines == 0;
132
133my $cmd = '';
134if ($is_bam) {
135	my $cmd_sam2bam = "cat $ARGV[1] \\\n";
136	my $ntmps = int($size / 4e9) + 1;
137	my $cmd_shuf = !defined($opts{S})? "  | $root/htsbox bamshuf -uOn$ntmps - $prefix.shuf \\\n" : "";
138	my $bam2fq_opt = @RG_lines > 0? " -t" : "";
139	my $cmd_bam2fq = "  | $root/htsbox bam2fq -O$bam2fq_opt - \\\n";
140	$cmd = $cmd_sam2bam . $cmd_shuf . $cmd_bam2fq;
141} elsif (@ARGV >= 3) {
142	$cmd = "$root/seqtk mergepe $ARGV[1] $ARGV[2] \\\n";
143} else {
144	$cmd = "cat $ARGV[1] \\\n";
145}
146
147my $bwa_opts = "-p " . ($opts{t} > 1? "-t$opts{t} " : "") . (defined($opts{x})? "-x $opts{x} " : "") . (defined($opts{R})? "-R'$opts{R}' " : "") . (defined($opts{M})? "-M " : "");
148$bwa_opts .= join(" ", @RG_lines) . " -C " if @RG_lines > 0;
149
150$cmd .= "  | $root/trimadap 2> $prefix.log.trim \\\n" if defined($opts{a});
151$cmd .= "  | $root/bwa mem $bwa_opts$ARGV[0] - 2> $prefix.log.bwamem \\\n";
152$cmd .= "  | $root/samblaster 2> $prefix.log.dedup \\\n" if defined($opts{d});
153
154my $has_hla = 0;
155if (-f "$ARGV[0].alt" && !defined($opts{P})) {
156	my $fh;
157	open($fh, "$ARGV[0].alt") || die;
158	while (<$fh>) {
159		$has_hla = 1 if /^HLA-[^\s\*]+\*\d+/;
160	}
161	close($fh);
162	my $hla_pre = $has_hla? "-p $prefix.hla " : "";
163	$cmd .= "  | $root/k8 $root/bwa-postalt.js $hla_pre$ARGV[0].alt \\\n";
164}
165
166my $t_sort = $opts{t} < 4? $opts{t} : 4;
167$cmd .= defined($opts{s})? "  | $root/samtools sort -@ $t_sort -m1G - -o $prefix.aln.bam;\n" : "  | $root/samtools view -1 - > $prefix.aln.bam;\n";
168
169if ($has_hla && defined($opts{H}) && (!defined($opts{x}) || $opts{x} eq 'intractg')) {
170	$cmd .= "$root/run-HLA ". (defined($opts{x}) && $opts{x} eq 'intractg'? "-A " : "") . "$prefix.hla > $prefix.hla.top 2> $prefix.log.hla;\n";
171	$cmd .= "touch $prefix.hla.HLA-dummy.gt; cat $prefix.hla.HLA*.gt | grep ^GT | cut -f2- > $prefix.hla.all;\n";
172	$cmd .= "rm -f $prefix.hla.HLA*;\n" unless defined($opts{k});
173}
174
175print $cmd;
176
177sub which
178{
179	my $file = shift;
180	my $path = (@_)? shift : $ENV{PATH};
181	return if (!defined($path));
182	foreach my $x (split(":", $path)) {
183		$x =~ s/\/$//;
184		return "$x/$file" if (-x "$x/$file");
185	}
186	return;
187}
188