1#!/usr/bin/env perl 2 3use strict; 4use warnings; 5use Getopt::Std; 6 7my %opts = (t=>1); 8getopts("MPSadskHo:R:x:t:", \%opts); 9 10die(' 11Usage: run-bwamem [options] <idxbase> <file1> [file2] 12 13Options: -o STR prefix for output files [inferred from input] 14 -R STR read group header line such as \'@RG\tID:foo\tSM:bar\' [null] 15 -x STR read type: pacbio, ont2d or intractg [default] 16 intractg: intra-species contig (kb query, highly similar) 17 pacbio: pacbio subreads (~10kb query, high error rate) 18 ont2d: Oxford Nanopore reads (~10kb query, higher error rate) 19 -t INT number of threads [1] 20 21 -H apply HLA typing 22 -a trim HiSeq2000/2500 PE resequencing adapters (via trimadap) 23 -d mark duplicate (via samblaster) 24 -S for BAM input, don\'t shuffle 25 -s sort the output alignment (via samtools; requring more RAM) 26 -k keep temporary files generated by typeHLA 27 -M mark shorter split hits as secondary 28 29Examples: 30 31 * Map paired-end reads to GRCh38+ALT+decoy+HLA and perform HLA typing: 32 33 run-bwamem -o prefix -t8 -HR"@RG\tID:foo\tSM:bar" hs38DH.fa read1.fq.gz read2.fq.gz 34 35 Note: HLA typing is only effective for high-coverage data. The typing accuracy varies 36 with the quality of input. It is only intended for research purpose, not for diagnostic. 37 38 * Remap coordinate-sorted BAM, transfer read groups tags, trim Illumina PE adapters and 39 sort the output. The BAM may contain single-end or paired-end reads, or a mixture of 40 the two types. Specifying -R stops read group transfer. 41 42 run-bwamem -sao prefix hs38DH.fa old-srt.bam 43 44 Note: the adaptor trimmer included in bwa.kit is chosen because it fits the current 45 mapping pipeline better. It is conservative and suboptimal. A more sophisticated 46 trimmer is recommended if this becomes a concern. 47 48 * Remap name-grouped BAM and mark duplicates: 49 50 run-bwamem -Sdo prefix hs38DH.fa old-unsrt.bam 51 52 Note: streamed duplicate marking requires all reads from a single paired-end library 53 to be aligned at the same time. 54 55Output files: 56 57 {-o}.aln.bam - final alignment 58 {-o}.hla.top - best genotypes for the 6 classical HLA genes (if there are HLA-* contigs) 59 {-o}.hla.all - additional HLA genotypes consistent with data 60 {-o}.log.* - log files 61 62') if @ARGV < 2; 63 64my $idx = $ARGV[0]; 65 66my $exepath = $0 =~/^\S+\/[^\/\s]+/? $0 : &which($0); 67my $root = $0 =~/^(\S+)\/[^\/\s]+/? $1 : undef; 68$root = $exepath =~/^(\S+)\/[^\/\s]+/? $1 : undef if !defined($root); 69die "ERROR: failed to locate the 'bwa.kit' directory\n" if !defined($root); 70 71die("ERROR: failed to locate the BWA index. Please run '$root/bwa index -p $idx ref.fa'.\n") 72 unless (-f "$idx.bwt" && -f "$idx.pac" && -f "$idx.sa" && -f "$idx.ann" && -f "$idx.amb"); 73 74if (@ARGV >= 3 && $ARGV[1] =~ /\.(bam|sam|sam\.gz)$/) { 75 warn("WARNING: for SAM/BAM input, only the first sequence file is used.\n"); 76 @ARGV = 2; 77} 78 79if (defined($opts{p}) && @ARGV >= 3) { 80 warn("WARNING: option -P is ignored as there are two input sequence files.\n"); 81 delete $opts{p}; 82} 83 84my $prefix; 85if (defined $opts{o}) { 86 $prefix = $opts{o}; 87} elsif (@ARGV >= 3) { 88 my $len = length($ARGV[1]) < length($ARGV[2])? length($ARGV[1]) : length($ARGV[2]); 89 my $i; 90 for ($i = 0; $i < $len; ++$i) { 91 last if substr($ARGV[1], $i, 1) ne substr($ARGV[2], $i, 1) 92 } 93 $prefix = substr($ARGV[1], 0, $i) if $i > 0; 94} elsif ($ARGV[1] =~ /^(\S+)\.(fastq|fq|fasta|fa|mag|mag\.gz|fasta\.gz|fa\.gz|fastq\.gz|fq\.gz|bam)$/) { 95 $prefix = $1; 96} 97die("ERROR: failed to identify the prefix for output. Please specify -o.\n") unless defined($prefix); 98 99my $size = 0; 100my $comp_ratio = 3.; 101for my $f (@ARGV[1..$#ARGV]) { 102 my @a = stat($f); 103 my $s = $a[7]; 104 die("ERROR: failed to read file $f\n") if !defined($s); 105 $s *= $comp_ratio if $f =~ /\.(gz|bam)$/; 106 $size += int($s) + 1; 107} 108 109my $is_pe = (defined($opts{p}) || @ARGV >= 3)? 1 : 0; 110my $is_bam = $ARGV[1] =~ /\.bam$/? 1 : 0; 111 112if (defined($opts{x})) { 113 delete($opts{d}); delete($opts{a}); delete $opts{p}; 114} 115 116# for BAM input, find @RG header lines 117my @RG_lines = (); 118if ($is_bam && !defined($opts{R})) { 119 my $fh; 120 open($fh, "$root/samtools view -H $ARGV[1] |") || die; 121 while (<$fh>) { 122 chomp; 123 if (/^\@RG\t/) { 124 s/\t/\\t/g; 125 push(@RG_lines, "-H'$_'"); 126 } 127 } 128 close($fh); 129} 130 131warn("WARNING: many programs require read groups. Please specify with -R if you can.\n") if !defined($opts{R}) && @RG_lines == 0; 132 133my $cmd = ''; 134if ($is_bam) { 135 my $cmd_sam2bam = "cat $ARGV[1] \\\n"; 136 my $ntmps = int($size / 4e9) + 1; 137 my $cmd_shuf = !defined($opts{S})? " | $root/htsbox bamshuf -uOn$ntmps - $prefix.shuf \\\n" : ""; 138 my $bam2fq_opt = @RG_lines > 0? " -t" : ""; 139 my $cmd_bam2fq = " | $root/htsbox bam2fq -O$bam2fq_opt - \\\n"; 140 $cmd = $cmd_sam2bam . $cmd_shuf . $cmd_bam2fq; 141} elsif (@ARGV >= 3) { 142 $cmd = "$root/seqtk mergepe $ARGV[1] $ARGV[2] \\\n"; 143} else { 144 $cmd = "cat $ARGV[1] \\\n"; 145} 146 147my $bwa_opts = "-p " . ($opts{t} > 1? "-t$opts{t} " : "") . (defined($opts{x})? "-x $opts{x} " : "") . (defined($opts{R})? "-R'$opts{R}' " : "") . (defined($opts{M})? "-M " : ""); 148$bwa_opts .= join(" ", @RG_lines) . " -C " if @RG_lines > 0; 149 150$cmd .= " | $root/trimadap 2> $prefix.log.trim \\\n" if defined($opts{a}); 151$cmd .= " | $root/bwa mem $bwa_opts$ARGV[0] - 2> $prefix.log.bwamem \\\n"; 152$cmd .= " | $root/samblaster 2> $prefix.log.dedup \\\n" if defined($opts{d}); 153 154my $has_hla = 0; 155if (-f "$ARGV[0].alt" && !defined($opts{P})) { 156 my $fh; 157 open($fh, "$ARGV[0].alt") || die; 158 while (<$fh>) { 159 $has_hla = 1 if /^HLA-[^\s\*]+\*\d+/; 160 } 161 close($fh); 162 my $hla_pre = $has_hla? "-p $prefix.hla " : ""; 163 $cmd .= " | $root/k8 $root/bwa-postalt.js $hla_pre$ARGV[0].alt \\\n"; 164} 165 166my $t_sort = $opts{t} < 4? $opts{t} : 4; 167$cmd .= defined($opts{s})? " | $root/samtools sort -@ $t_sort -m1G - -o $prefix.aln.bam;\n" : " | $root/samtools view -1 - > $prefix.aln.bam;\n"; 168 169if ($has_hla && defined($opts{H}) && (!defined($opts{x}) || $opts{x} eq 'intractg')) { 170 $cmd .= "$root/run-HLA ". (defined($opts{x}) && $opts{x} eq 'intractg'? "-A " : "") . "$prefix.hla > $prefix.hla.top 2> $prefix.log.hla;\n"; 171 $cmd .= "touch $prefix.hla.HLA-dummy.gt; cat $prefix.hla.HLA*.gt | grep ^GT | cut -f2- > $prefix.hla.all;\n"; 172 $cmd .= "rm -f $prefix.hla.HLA*;\n" unless defined($opts{k}); 173} 174 175print $cmd; 176 177sub which 178{ 179 my $file = shift; 180 my $path = (@_)? shift : $ENV{PATH}; 181 return if (!defined($path)); 182 foreach my $x (split(":", $path)) { 183 $x =~ s/\/$//; 184 return "$x/$file" if (-x "$x/$file"); 185 } 186 return; 187} 188