1#!/usr/local/bin/bash
2
# Print the help text for this script to stdout.
# Arguments: none.
# Outputs:   the usage message, preceded and followed by one blank line.
# The text lives in a quoted here-document so it is emitted literally; no
# character in it is subject to shell expansion.
usage(){
cat <<'USAGE_TEXT'

Written by Brian Bushnell
Last modified November 7, 2019

Description:  Creates a blacklist sketch from common kmers,
which occur in at least X different sequences or taxa.
Please read bbmap/docs/guides/BBSketchGuide.txt for more information.

Usage:  sketchblacklist.sh in=<fasta file> out=<sketch file>

Standard parameters:
in=<file>           A fasta file containing one or more sequences.
out=<file>          Output filename.
mintaxcount=100     Sketch kmers occurring in at least this many taxa.
k=31                Kmer length, 1-32.  To maximize sensitivity and
                    specificity, dual kmer lengths may be used:  k=31,24
mode=sequence       Possible modes:
                       sequence: Count kmers once per sequence.
                       taxa: Count kmers once per taxonomic unit.
name=               Set the blacklist sketch name.
delta=t             Delta-compress sketches.
a48=t               Encode sketches as ASCII-48 rather than hex.
amino=f             Amino-acid mode.
entropy=0.66        Ignore sequence with entropy below this value.
keyfraction=0.16    Smaller values reduce blacklist size by ignoring
                    a fraction of the key space.  Range: 0.0001-0.5.

Taxonomy-specific flags:
tree=               Specify a taxtree file.  On Genepool, use 'auto'.
gi=                 Specify a gitable file.  On Genepool, use 'auto'.
accession=          Specify one or more comma-delimited NCBI accession to
                    taxid files.  On Genepool, use 'auto'.
taxlevel=subspecies Taxa hits below this rank will be promoted and merged
                    with others.
prefilter=t         Use a bloom filter to ignore low-count kmers.
prepasses=2         Number of prefilter passes.
prehashes=2         Number of prefilter hashes.
prebits=-1          Manually override number of prefilter cell bits.
tossjunk=t          For taxa mode, discard taxonomically uninformative
                    sequences.  This includes sequences with no taxid,
                    with a tax level NO_RANK, or parent taxid of LIFE.
silva=f             Parse headers using Silva or semicolon-delimited syntax.

Java Parameters:
-Xmx                This will set Java's memory usage, overriding autodetection.
                    -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs.
                    The max is typically 85% of physical memory.
-eoom               This flag will cause the process to exit if an
                    out-of-memory exception occurs.  Requires Java 8u92+.
-da                 Disable assertions.

For more detailed information, please read bbmap/docs/guides/BBSketchGuide.txt.
Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.

USAGE_TEXT
}
59
#This block allows symlinked shellscripts to correctly set classpath.
# It follows the chain of symlinks from the invoked path to the real script,
# records that script's directory (with a trailing slash) in DIR, and then
# restores the caller's working directory.  Every directory change is checked:
# continuing after a failed cd would silently yield a wrong DIR and classpath.
pushd . > /dev/null || { echo "Cannot save working directory." >&2; exit 1; }
DIR="${BASH_SOURCE[0]}"
while [ -h "$DIR" ]; do
  cd "$(dirname "$DIR")" || { echo "Cannot enter directory of $DIR" >&2; exit 1; }
  # readlink may return a path relative to the link's directory, which is why
  # we cd there first and resolve one hop at a time.
  DIR="$(readlink "$(basename "$DIR")")"
done
cd "$(dirname "$DIR")" || { echo "Cannot enter directory of $DIR" >&2; exit 1; }
DIR="$(pwd)/"
popd > /dev/null || { echo "Cannot restore working directory." >&2; exit 1; }

#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
# Java classpath: the "current/" directory next to the resolved script.
CP="${DIR}current/"
73
# Fallback JVM heap flags (maximum and initial heap); calcXmx replaces them
# unless the "set" flag has been switched to 1.
z="-Xmx4g"
z2="-Xms4g"
# Read by calcXmx: when it equals 1 the heap flags above are left untouched.
set=0

# With no first argument, or an explicit help flag, show the help text and
# stop; a bare "exit" reports the status of the usage call.
case "$1" in
	""|-h|--help)
		usage
		exit
		;;
esac
82
# Decide the JVM heap flags for this run.
# Globals:   DIR (read) - directory containing calcmem.sh
#            set (read) - when 1, the current z/z2 are kept as they are
#            RAM (read) - presumably filled in by freeRam; verify in calcmem.sh
#            z, z2 (written) - -Xmx / -Xms flags built from RAM
# Arguments: the script's command-line arguments, handed to parseXmx.
calcXmx () {
	. "${DIR}/calcmem.sh"
	setEnvironment
	parseXmx "$@"
	# Only fall back to autodetection when no explicit setting was recorded.
	if [[ $set != 1 ]]; then
		freeRam 4000m 84
		z="-Xmx${RAM}m"
		z2="-Xms${RAM}m"
	fi
}
94calcXmx "$@"
95
# Launch sketch.BlacklistMaker in a JVM, forwarding every argument unchanged.
# Globals:   EA, EOOM (read) - extra JVM flags; not set in this file,
#                              presumably provided by calcmem.sh (verify)
#            z, z2 (read)    - heap flags
#            CP (read)       - Java classpath
# Arguments: all arguments are passed through to the Java program.
# Outputs:   writes the command line to stderr before running it.
# Returns:   the exit status of java.
sketchblacklist() {
	# The command is kept as an argv array and executed directly instead of
	# being flattened into a string and eval'd.  That way arguments containing
	# spaces or shell metacharacters (e.g. name=E. coli (K12)) reach Java intact
	# rather than being re-split or interpreted as shell syntax.
	# EA, EOOM, z and z2 are deliberately unquoted: each is a flag string that
	# may be empty, and an empty one must vanish rather than become "".
	# shellcheck disable=SC2206
	local -a CMD=(java $EA $EOOM $z $z2 -cp "$CP" sketch.BlacklistMaker "$@")
	echo "${CMD[*]}" >&2
	"${CMD[@]}"
}
101
102sketchblacklist "$@"
103