1#!/usr/local/bin/bash
2
3usage(){
4echo "
5Written by Brian Bushnell
6Last modified April 30, 2020
7
8Description:  Calculates per-scaffold or per-base coverage information from an unsorted sam or bam file.
9Supports SAM/BAM format for reads and FASTA for reference.
10Sorting is not needed, so output may be streamed directly from a mapping program.
11Requires a minimum of 1 bit per reference base plus 100 bytes per scaffold (even if no reference is specified).
12If per-base coverage is needed (including for stdev and median), at least 4 bytes per base is needed.
13
14Usage:        pileup.sh in=<input> out=<output>
15
16Input Parameters:
17in=<file>           The input sam file; this is the only required parameter.
18ref=<file>          Scans a reference fasta for per-scaffold GC counts, not otherwise needed.
19fastaorf=<file>     An optional fasta file with ORF header information in PRODIGAL's output format.  Must also specify 'outorf'.
20unpigz=t            Decompress with pigz for faster decompression.
21addfromref=t        Allow ref scaffolds not present in sam header to be added from the reference.
22addfromreads=f      Allow ref scaffolds not present in sam header to be added from the reads.
23                    Note that in this case the ref scaffold lengths will be inaccurate.
24
25Output Parameters:
26out=<file>          (covstats) Per-scaffold coverage info.
27rpkm=<file>         Per-scaffold RPKM/FPKM counts.
28twocolumn=f         Change to true to print only ID and Avg_fold instead of all 6 columns.
29countgc=t           Enable/disable counting of read GC content.
30outorf=<file>       Per-orf coverage info to this file (only if 'fastaorf' is specified).
31outsam=<file>       Print the input sam stream to this file (or stdout).  Useful for piping data.
32hist=<file>         Histogram of # occurrences of each depth level.
33basecov=<file>      Coverage per base location.
34bincov=<file>       Binned coverage per location (one line per X bases).
35binsize=1000        Binsize for binned coverage output.
36keepshortbins=t     (ksb) Keep residual bins shorter than binsize.
37normcov=<file>      Normalized coverage by normalized location (X lines per scaffold).
38normcovo=<file>     Overall normalized coverage by normalized location (X lines for the entire assembly).
39normb=-1            If positive, use a fixed number of bins per scaffold; affects 'normcov' and 'normcovo'.
40normc=f             Normalize coverage to fraction of max per scaffold; affects 'normcov' and 'normcovo'.
41delta=f             Only print base coverage lines when the coverage differs from the previous base.
42nzo=f               Only print scaffolds with nonzero coverage.
43concise=f           Write 'basecov' in a more concise format.
44header=t            (hdr) Include headers in output files.
45headerpound=t       (#) Prepend header lines with '#' symbol.
46stdev=t             Calculate coverage standard deviation.
47covminscaf=0        (minscaf) Don't print coverage for scaffolds shorter than this.
48covwindow=0         Calculate how many bases are in windows of this size with
49                    low average coverage.  Produces an extra stats column.
50covwindowavg=5      Average coverage below this will be classified as low.
51k=0                 If positive, calculate kmer coverage statstics for this kmer length.
52keyvalue=f          Output statistics to screen as key=value pairs.
53mincov=1            When calculating percent covered, ignore bases under this depth.
54
55Processing Parameters:
56strandedcov=f       Track coverage for plus and minus strand independently.
57startcov=f          Only track start positions of reads.
58stopcov=f           Only track stop positions of reads.
59secondary=t         Use secondary alignments, if present.
60softclip=f          Include soft-clipped bases in coverage.
61minmapq=0           (minq) Ignore alignments with mapq below this.
62physical=f          (physcov) Calculate physical coverage for paired reads.  This includes the unsequenced bases.
63tlen=t              Track physical coverage from the tlen field rather than recalculating it.
64arrays=auto         Set to t/f to manually force the use of coverage arrays.  Arrays and bitsets are mutually exclusive.
65bitsets=auto        Set to t/f to manually force the use of coverage bitsets.
6632bit=f             Set to true if you need per-base coverage over 64k; does not affect per-scaffold coverage precision.
67                    This option will double RAM usage (when calculating per-base coverage).
68delcoverage=t       (delcov) Count bases covered by deletions or introns as covered.
69                    True is faster than false.
70dupecoverage=t      (dupes) Include reads flagged as duplicates in coverage.
71samstreamer=t       (ss) Load reads multithreaded to increase speed.
72
73Trimming Parameters:
74** NOTE: These are applied before adding coverage, to allow mimicking **
75** tools like CallVariants, which uses 'qtrim=r trimq=10 border=5'    **
76qtrim=f             Quality-trim.  May be set to:
77                       f (false): Don't quality-trim.
78                       r (right): Trim right (3') end only.
79                       l (left): Trim right (5') end only.
80                       rl (both): Trim both ends.
81trimq=-1            If positive, quality-trim to this threshold.
82border=0            Ignore this many bases on the left and right end.
83
84Output format (tab-delimited):
85ID, Avg_fold, Length, Ref_GC, Covered_percent, Covered_bases, Plus_reads, Minus_reads, Read_GC, Median_fold, Std_Dev
86
87ID:                Scaffold ID
88Length:            Scaffold length
89Ref_GC:            GC ratio of reference
90Avg_fold:          Average fold coverage of this scaffold
91Covered_percent:   Percent of scaffold with any coverage (only if arrays or bitsets are used)
92Covered_bases:     Number of bases with any coverage (only if arrays or bitsets are used)
93Plus_reads:        Number of reads mapped to plus strand
94Minus_reads:       Number of reads mapped to minus strand
95Read_GC:           Average GC ratio of reads mapped to this scaffold
96Median_fold:       Median fold coverage of this scaffold (only if arrays are used)
97Std_Dev:           Standard deviation of coverage (only if arrays are used)
98
99Java Parameters:
100
101-Xmx               This will set Java's memory usage, overriding
102                   autodetection. -Xmx20g will
103                   specify 20 gigs of RAM, and -Xmx200m will specify 200 megs.
104                   The max is typically 85% of physical memory.
105-eoom              This flag will cause the process to exit if an out-of-memory
106                   exception occurs.  Requires Java 8u92+.
107-da                Disable assertions.
108
109Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
110"
111}
112
113#This block allows symlinked shellscripts to correctly set classpath.
114pushd . > /dev/null
115DIR="${BASH_SOURCE[0]}"
116while [ -h "$DIR" ]; do
117  cd "$(dirname "$DIR")"
118  DIR="$(readlink "$(basename "$DIR")")"
119done
120cd "$(dirname "$DIR")"
121DIR="$(pwd)/"
122popd > /dev/null
123
124#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
125CP="$DIR""current/"
126
127z="-Xmx1g"
128z2="-Xms1g"
129set=0
130
131if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
132	usage
133	exit
134fi
135
136calcXmx () {
137	source "$DIR""/calcmem.sh"
138	setEnvironment
139	parseXmx "$@"
140	if [[ $set == 1 ]]; then
141		return
142	fi
143	freeRam 3200m 84
144	z="-Xmx${RAM}m"
145	z2="-Xms${RAM}m"
146}
147calcXmx "$@"
148
149pileup() {
150	local CMD="java $EA $EOOM $z -cp $CP jgi.CoveragePileup $@"
151	echo $CMD >&2
152	eval $CMD
153}
154
155pileup "$@"
156