1#!/usr/local/bin/bash
2
3usage(){
4echo "
5Written by Brian Bushnell
6Last modified January 28, 2020
7
8Description:  Creates one or more sketches from a fasta file,
9optionally annotated with taxonomic information.
10
11Please read bbmap/docs/guides/BBSketchGuide.txt for more information.
12
13Usage:  sketch.sh in=<fasta file> out=<sketch file>
14
15Standard parameters:
16in=<file>           A fasta file containing one or more sequences.
17out=<file>          Output filename.  If multiple files are desired it must
18                    contain the # symbol.
19blacklist=<file>    Ignore keys in this sketch file.  Additionaly, there are
20                    built-in blacklists that can be specified:
21                       nt:      Blacklist for nt
22                       refseq:  Blacklist for Refseq
23                       silva:   Blacklist for Silva
24                       img:     Blacklist for IMG
25files=1             Number of output sketch files to produce, for parallel
26                    loading.  Independent of the number of sketches produced;
27                    sketches will be randomly distributed between files.
28k=32,24             Kmer length, 1-32.  To maximize sensitivity and
29                    specificity, dual kmer lengths may be used, e.g. k=32,24
30                    Query and reference k must match.
31rcomp=t             Look at reverse-complement kmers also.
32amino=f             Use amino acid mode.  Input should be amino acids.
33translate=f         Call genes and translate to proteins.  Input should be
34                    nucleotides.  Designed for prokaryotes.
35mode=single         Possible modes:
36                       single: Write one sketch.
37                       sequence: Write one sketch per sequence.
38                       taxa: Write one sketch per taxonomic unit.
39                          Requires more memory, and taxonomic annotation.
40                       img: Write one sketch per IMG id.
41delta=t             Delta-compress sketches.
42a48=t               Encode sketches as ASCII-48 rather than hex.
43depth=f             Track the number of times kmers appear.
44                    Required for the depth2 field in comparisons.
45entropy=0.66        Ignore sequence with entropy below this value.
46ssu=t               Scan for and retain full-length SSU sequence.
47
48Size parameters:
49size=10000          Desired size of sketches (if not using autosize).
50maxfraction=0.01    (mgf) Max fraction of genomic kmers to use.
51minsize=100         Do not generate sketches for genomes smaller than this.
52autosize=t          Use flexible sizing instead of fixed-length.  This is
53                    nonlinear; a human sketch is only ~6x a bacterial sketch.
54sizemult=1          Multiply the autosized size of sketches by this factor.
55                    Normally a bacterial-size genome will get a sketch size
56                    of around 10000; if autosizefactor=2, it would be ~20000.
57density=            If this flag is set (to a number between 0 and 1),
58                    autosize and sizemult are ignored, and this fraction of
59                    genomic kmers are used.  For example, at density=0.001,
60                    a 4.5Mbp bacteria will get a 4500-kmer sketch.
61
62Metadata flags (optional; intended for single-sketch mode):
63taxid=-1            Set the NCBI taxid.
64imgid=-1            Set the IMG id.
65spid=-1             Set the JGI sequencing project id.
66name=               Set the name (taxname).
67name0=              Set name0 (normally the first sequence header).
68fname=              Set fname (normally the file name).
69meta_=              Set an arbitrary metadata field.
70                    For example, meta_Month=March.
71
72Taxonomy-specific flags:
73tree=               Specify a taxtree file.  On Genepool, use 'auto'.
74gi=                 Specify a gitable file.  On Genepool, use 'auto'.
75accession=          Specify one or more comma-delimited NCBI accession to
76                    taxid files.  On Genepool, use 'auto'.
77imgdump=            Specify an IMG dump file containing NCBI taxIDs,
78                    for IMG mode.
79taxlevel=subspecies Taxa hits below this rank will be promoted and merged
80                    with others.
81prefilter=f         For huge datasets full of junk like nt, this flag
82                    will save memory by ignoring taxa smaller than minsize.
83                    Requires taxonomic information (tree and gi).
84tossjunk=t          For taxa mode, discard taxonomically uninformative
85                    sequences.  This includes sequences with no taxid,
86                    with a tax level NO_RANK, of parent taxid of LIFE.
87silva=f             Parse headers using Silva or semicolon-delimited syntax.
88
89Ribosomal flags, which allow SSU sequences to be attached to sketches:
90processSSU=t        Run gene-calling to detect ribosomal SSU sequences.
9116Sfile=<file>      Optional file of 16S sequences, annotated with TaxIDs.
9218Sfile=<file>      Optional file of 18S sequences, annotated with TaxIDs.
93preferSSUMap=f      Prefer file SSUs over called SSUs.
94preferSSUMapEuks=t  Prefer file SSUs over called SSUs for Eukaryotes.
95SSUMapOnly=f        Only use file SSUs.
96SSUMapOnlyEuks=f    Only use file SSUs for Eukaryotes.  This prevents
97                    associating an organism with its mitochondrial or
98                    chloroplast 16S/18S, which is otherwise a problem.
99
100
101Java Parameters:
102-Xmx                This will set Java's memory usage, overriding autodetection.
103                    -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs.
104                    The max is typically 85% of physical memory.
105-eoom               This flag will cause the process to exit if an
106                    out-of-memory exception occurs.  Requires Java 8u92+.
107-da                 Disable assertions.
108
109For more detailed information, please read /bbmap/docs/guides/BBSketchGuide.txt.
110Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
111"
112}
113
114#This block allows symlinked shellscripts to correctly set classpath.
115pushd . > /dev/null
116DIR="${BASH_SOURCE[0]}"
117while [ -h "$DIR" ]; do
118  cd "$(dirname "$DIR")"
119  DIR="$(readlink "$(basename "$DIR")")"
120done
121cd "$(dirname "$DIR")"
122DIR="$(pwd)/"
123popd > /dev/null
124
125#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
126CP="$DIR""current/"
127
128z="-Xmx4g"
129z2="-Xms4g"
130set=0
131
132if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
133	usage
134	exit
135fi
136
137calcXmx () {
138	source "$DIR""/calcmem.sh"
139	setEnvironment
140	parseXmx "$@"
141	if [[ $set == 1 ]]; then
142		return
143	fi
144	freeRam 4000m 84
145	z="-Xmx${RAM}m"
146	z2="-Xms${RAM}m"
147}
148calcXmx "$@"
149
150sketch() {
151	local CMD="java $EA $EOOM $z $z2 -cp $CP sketch.SketchMaker $@"
152	echo $CMD >&2
153	eval $CMD
154}
155
156sketch "$@"
157