#!/usr/local/bin/bash

# Launcher for driver.FilterReadsByName.
# Resolves the real location of this script (following symlinks), derives the
# Java classpath from it, picks a JVM heap size, and runs the Java class with
# the caller's arguments passed through unchanged.

# Print the help text to stdout.
usage(){
echo "
Written by Brian Bushnell
Last modified September 1, 2016

Description: Filters reads by name.

Usage: filterbyname.sh in=<file> in2=<file2> out=<outfile> out2=<outfile2> names=<string,string,string> include=<t/f>

in2 and out2 are for paired reads and are optional.
If input is paired and there is only one output file, it will be written interleaved.
Important! Leading > and @ symbols are NOT part of sequence names; they are part of
the fasta, fastq, and sam specifications. Therefore, this is correct:
names=e.coli_K12
And these are incorrect:
names=>e.coli_K12
names=@e.coli_K12

Parameters:
include=f       Set to 'true' to include the filtered names rather than excluding them.
substring=f     Allow one name to be a substring of the other, rather than a full match.
                   f: No substring matching.
                   t: Bidirectional substring matching.
                   header: Allow input read headers to be substrings of names in list.
                   name: Allow names in list to be substrings of input read headers.
prefix=f        Allow names to match read header prefixes.
case=t          (casesensitive) Match case also.
ow=t            (overwrite) Overwrites files that already exist.
app=f           (append) Append to files that already exist.
zl=4            (ziplevel) Set compression level, 1 (low) to 9 (max).
int=f           (interleaved) Determines whether INPUT file is considered interleaved.
names=          A list of strings or files. The files can have one name per line, or
                be a standard read file (fasta, fastq, or sam).
minlen=0        Do not output reads shorter than this.
ths=f           (truncateheadersymbol) Ignore a leading @ or > symbol in the names file.
tws=f           (truncatewhitespace) Ignore leading or trailing whitespace in the names file.
truncate=f      Set both ths and tws at the same time.

Positional parameters:
These optionally allow you to output only a portion of a sequence. Zero-based, inclusive.
Intended for a single sequence and include=t mode.
from=-1         Only print bases starting at this position.
to=-1           Only print bases up to this position.
range=          Set from and to with a single flag.


Java Parameters:
-Xmx            This will set Java's memory usage, overriding autodetection.
                -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs.
                The max is typically 85% of physical memory.
-eoom           This flag will cause the process to exit if an out-of-memory
                exception occurs. Requires Java 8u92+.
-da             Disable assertions.

To read from stdin, set 'in=stdin'. The format should be specified with an extension, like 'in=stdin.fq.gz'
To write to stdout, set 'out=stdout'. The format should be specified with an extension, like 'out=stdout.fasta'

Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems.
"
}

#This block allows symlinked shellscripts to correctly set classpath.
# Walk the symlink chain starting from the invoked path; each hop is resolved
# relative to the directory containing the link. The caller's working
# directory is saved with pushd and restored with popd.
pushd . > /dev/null
DIR="${BASH_SOURCE[0]}"
while [ -h "$DIR" ]; do
    # Abort rather than continue with a wrong directory (and wrong classpath).
    cd "$(dirname "$DIR")" || exit 1
    DIR="$(readlink "$(basename "$DIR")")"
done
cd "$(dirname "$DIR")" || exit 1
DIR="$(pwd)/"
popd > /dev/null

#DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/"
CP="$DIR""current/"

# Default heap flag; 'set' is compared against 1 in calcXmx to decide whether
# the heap size was already chosen.
z="-Xmx800m"
set=0

# No arguments, -h, or --help: show usage and stop.
if [ -z "$1" ] || [[ $1 == -h ]] || [[ $1 == --help ]]; then
    usage
    exit
fi

# Choose the JVM heap flags.
# Arguments: the script's command-line arguments (forwarded to parseXmx).
# Globals:   reads DIR, set, RAM; writes z and z2.
# setEnvironment, parseXmx and freeRam are not defined in this file; they are
# presumably provided by calcmem.sh, and parseXmx presumably sets 'set' (and z)
# when the caller supplies an explicit -Xmx flag -- verify against calcmem.sh.
calcXmx () {
    source "$DIR""/calcmem.sh"
    setEnvironment
    parseXmx "$@"
    if [[ $set == 1 ]]; then
        return
    fi
    freeRam 800m 84
    z="-Xmx${RAM}m"
    z2="-Xms${RAM}m"
}
calcXmx "$@"

# Run driver.FilterReadsByName with the chosen JVM flags.
# Arguments: the script's command-line arguments, passed to Java verbatim.
# Outputs:   the command line is logged to stderr before execution.
# Returns:   the exit status of the java process.
function filterbyname() {
    # Build argv as an array and execute it directly instead of eval-ing a
    # flattened string: arguments and install paths containing spaces, globs
    # or shell metacharacters reach Java intact and are never re-interpreted
    # by the shell.
    # EA, EOOM and z are intentionally unquoted so that empty values vanish
    # and multi-flag values split into separate words (they are assumed to
    # hold only JVM flags -- confirm against calcmem.sh).
    # shellcheck disable=SC2086
    local -a cmd=(java $EA $EOOM $z -cp "$CP" driver.FilterReadsByName "$@")
    printf '%s\n' "${cmd[*]}" >&2
    "${cmd[@]}"
}

filterbyname "$@"