1 package jgi;
2 
3 import java.io.File;
4 import java.io.PrintStream;
5 import java.util.ArrayList;
6 
7 import dna.AminoAcid;
8 import fileIO.ByteFile;
9 import fileIO.FileFormat;
10 import fileIO.ReadWrite;
11 import shared.Parse;
12 import shared.Parser;
13 import shared.PreParser;
14 import shared.ReadStats;
15 import shared.Shared;
16 import shared.Timer;
17 import shared.Tools;
18 import stream.ConcurrentReadInputStream;
19 import stream.ConcurrentReadOutputStream;
20 import stream.FASTQ;
21 import stream.FastaReadInputStream;
22 import stream.Read;
23 import structures.ByteBuilder;
24 import structures.ListNum;
25 import structures.LongListSet;
26 import structures.LongListSet.LongListSetIterator;
27 
28 /**
29  * Generates mutants of kmers.
30  *
31  * @author Brian Bushnell
32  * @date January 8, 2021
33  *
34  */
35 public class KExpand {
36 
37 	/*--------------------------------------------------------------*/
38 	/*----------------        Initialization        ----------------*/
39 	/*--------------------------------------------------------------*/
40 
41 	/**
42 	 * Code entrance from the command line.
43 	 * @param args Command line arguments
44 	 */
main(String[] args)45 	public static void main(String[] args){
46 		//Start a timer immediately upon code entrance.
47 		Timer t=new Timer();
48 
49 		//Create an instance of this class
50 		KExpand x=new KExpand(args);
51 
52 		//Run the object
53 		x.process(t);
54 
55 		//Close the print stream if it was redirected
56 		Shared.closeStream(x.outstream);
57 	}
58 
59 	/**
60 	 * Constructor.
61 	 * @param args Command line arguments
62 	 */
KExpand(String[] args)63 	public KExpand(String[] args){
64 
65 		{//Preparse block for help, config files, and outstream
66 			PreParser pp=new PreParser(args, getClass(), false);
67 			args=pp.args;
68 			outstream=pp.outstream;
69 		}
70 
71 		//Set shared static variables prior to parsing
72 		ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
73 		ReadWrite.MAX_ZIP_THREADS=Shared.threads();
74 		Shared.capBuffers(4); //Only for singlethreaded programs
75 
76 		{//Parse the arguments
77 			final Parser parser=parse(args);
78 			Parser.processQuality();
79 
80 			maxReads=parser.maxReads;
81 			overwrite=ReadStats.overwrite=parser.overwrite;
82 			append=ReadStats.append=parser.append;
83 			setInterleaved=parser.setInterleaved;
84 
85 			in1=parser.in1;
86 			in2=parser.in2;
87 			qfin1=parser.qfin1;
88 			qfin2=parser.qfin2;
89 			extin=parser.extin;
90 
91 			out1=parser.out1;
92 			extout=parser.extout;
93 			amino=Shared.AMINO_IN;
94 		}
95 
96 		doPoundReplacement(); //Replace # with 1 and 2
97 		adjustInterleaving(); //Make sure interleaving agrees with number of input and output files
98 		fixExtensions(); //Add or remove .gz or .bz2 as needed
99 		checkFileExistence(); //Ensure files can be read and written
100 		checkStatics(); //Adjust file-related static fields as needed for this program
101 
102 //		k2=k-1;
103 //		shift=bitsPerBase*k;
104 //		shift2=shift-bitsPerBase;
105 //		mask=(shift>63 ? -1L : ~((-1L)<<shift));
106 
107 		{//set some constants
108 			k2=k-1;
109 			bitsPerBase=(amino ? 5 : 2);
110 			maxSymbol=(amino ? 20 : 3);
111 			symbols=maxSymbol+1;
112 			symbolArrayLen=(64+bitsPerBase-1)/bitsPerBase;
113 			symbolSpace=(1<<bitsPerBase);
114 			symbolMask=symbolSpace-1;
115 
116 			symbolToNumber=AminoAcid.symbolToNumber(amino);
117 			symbolToNumber0=AminoAcid.symbolToNumber0(amino);
118 			symbolToComplementNumber0=AminoAcid.symbolToComplementNumber0(amino);
119 
120 			clearMasks=new long[symbolArrayLen];
121 			leftMasks=new long[symbolArrayLen];
122 			rightMasks=new long[symbolArrayLen];
123 //			lengthMasks=new long[symbolArrayLen];
124 			setMasks=new long[symbols][symbolArrayLen];
125 			for(int i=0; i<symbolArrayLen; i++){
126 				clearMasks[i]=~(symbolMask<<(bitsPerBase*i));
127 				leftMasks[i]=((-1L)<<(bitsPerBase*i));
128 				rightMasks[i]=~((-1L)<<(bitsPerBase*i));
129 //				lengthMasks[i]=((1L)<<(bitsPerBase*i));
130 				for(long j=0; j<symbols; j++){
131 					setMasks[(int)j][i]=(j<<(bitsPerBase*i));
132 				}
133 			}
134 
135 			minlen=k-1;
136 			minlen2=k;
137 			shift=bitsPerBase*k;
138 			shift2=shift-bitsPerBase;
139 			mask=(shift>63 ? -1L : ~((-1L)<<shift));
140 //			kmask=lengthMasks[k];
141 		}
142 
143 		//Create output FileFormat objects
144 		ffout1=FileFormat.testOutput(out1, FileFormat.FASTQ, extout, true, overwrite, append, ordered);
145 
146 		//Create input FileFormat objects
147 		ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true);
148 		ffin2=FileFormat.testInput(in2, FileFormat.FASTQ, extin, true, true);
149 	}
150 
151 	/*--------------------------------------------------------------*/
152 	/*----------------    Initialization Helpers    ----------------*/
153 	/*--------------------------------------------------------------*/
154 
155 	/** Parse arguments from the command line */
parse(String[] args)156 	private Parser parse(String[] args){
157 
158 		//Create a parser object
159 		Parser parser=new Parser();
160 		parser.overwrite=true;
161 
162 		//Set any necessary Parser defaults here
163 		//parser.foo=bar;
164 
165 		//Parse each argument
166 		for(int i=0; i<args.length; i++){
167 			String arg=args[i];
168 
169 			//Break arguments into their constituent parts, in the form of "a=b"
170 			String[] split=arg.split("=");
171 			String a=split[0].toLowerCase();
172 			String b=split.length>1 ? split[1] : null;
173 			if(b!=null && b.equalsIgnoreCase("null")){b=null;}
174 
175 			if(a.equals("verbose")){
176 				verbose=Parse.parseBoolean(b);
177 			}else if(a.equals("k")){
178 				k=Parse.parseIntKMG(b);
179 				assert(k>0 && k<=31);
180 			}else if(a.equals("rcomp")){
181 				rcomp=Parse.parseBoolean(b);
182 			}else if(a.equals("subdist") || a.equals("sdist") || a.equals("hdist") || a.equals("hammingdistance")){
183 				subDist=Parse.parseIntKMG(b);
184 				assert(subDist>=0);
185 			}else if(a.equals("deldist") || a.equals("ddist") || a.equals("deletiondistance")){
186 				delDist=Parse.parseIntKMG(b);
187 				assert(delDist>=0 && delDist<=3);
188 			}else if(a.equals("insdist") || a.equals("idist") || a.equals("insertiondistance")){
189 				insDist=Parse.parseIntKMG(b);
190 				assert(insDist>=0);
191 			}else if(a.equals("editdist") || a.equals("edist") || a.equals("edits") || a.equals("editdistance")){
192 				int x=Parse.parseIntKMG(b);
193 				assert(x>=0 && x<=3);
194 				editDist=x;
195 				subDist=Tools.max(subDist, x);
196 			}else if(a.equals("maxedits") || a.equals("emax")){
197 				maxEdits=Parse.parseIntKMG(b);
198 				assert(maxEdits>=0);
199 			}else if(a.equals("maxsubs") || a.equals("smax")){
200 				maxSubs=Parse.parseIntKMG(b);
201 				assert(maxSubs>=0);
202 			}else if(a.equals("maxdels") || a.equals("dmax")){
203 				maxDels=Parse.parseIntKMG(b);
204 				assert(maxDels>=0);
205 			}else if(a.equals("maxinss") || a.equals("imax")){
206 				maxInss=Parse.parseIntKMG(b);
207 				assert(maxInss>=0);
208 			}else if(a.equals("speed")){
209 				int x=Parse.parseIntKMG(b);
210 				assert(x>=0 && x<=16) : "Speed must be 0-16";
211 				speed=x;
212 			}else if(a.equals("skip")){
213 				int x=Parse.parseIntKMG(b);
214 				assert(x>=0) : "Skip must be >=0";
215 				skip=x;
216 			}else if(a.equals("parse_flag_goes_here")){
217 				long fake_variable=Parse.parseKMG(b);
218 				//Set a variable here
219 			}else if(parser.parse(arg, a, b)){//Parse standard flags in the parser
220 				//do nothing
221 			}else{
222 				outstream.println("Unknown parameter "+args[i]);
223 				assert(false) : "Unknown parameter "+args[i];
224 			}
225 		}
226 
227 		return parser;
228 	}
229 
230 	/** Replace # with 1 and 2 in headers */
doPoundReplacement()231 	private void doPoundReplacement(){
232 		//Do input file # replacement
233 		if(in1!=null && in2==null && in1.indexOf('#')>-1 && !new File(in1).exists()){
234 			in2=in1.replace("#", "2");
235 			in1=in1.replace("#", "1");
236 		}
237 
238 		//Ensure there is an input file
239 		if(in1==null){throw new RuntimeException("Error - at least one input file is required.");}
240 	}
241 
242 	/** Add or remove .gz or .bz2 as needed */
fixExtensions()243 	private void fixExtensions(){
244 		in1=Tools.fixExtension(in1);
245 		in2=Tools.fixExtension(in2);
246 		qfin1=Tools.fixExtension(qfin1);
247 		qfin2=Tools.fixExtension(qfin2);
248 	}
249 
250 	/** Ensure files can be read and written */
checkFileExistence()251 	private void checkFileExistence(){
252 		//Ensure output files can be written
253 		if(!Tools.testOutputFiles(overwrite, append, false, out1)){
254 			outstream.println((out1==null)+", "+out1);
255 			throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output file "+out1+"\n");
256 		}
257 
258 		//Ensure input files can be read
259 		if(!Tools.testInputFiles(false, true, in1, in2)){
260 			throw new RuntimeException("\nCan't read some input files.\n");
261 		}
262 
263 		//Ensure that no file was specified multiple times
264 		if(!Tools.testForDuplicateFiles(true, in1, in2, out1)){
265 			throw new RuntimeException("\nSome file names were specified multiple times.\n");
266 		}
267 	}
268 
269 	/** Make sure interleaving agrees with number of input and output files */
adjustInterleaving()270 	private void adjustInterleaving(){
271 		//Adjust interleaved detection based on the number of input files
272 		if(in2!=null){
273 			if(FASTQ.FORCE_INTERLEAVED){outstream.println("Reset INTERLEAVED to false because paired input files were specified.");}
274 			FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false;
275 		}
276 	}
277 
278 	/** Adjust file-related static fields as needed for this program */
checkStatics()279 	private static void checkStatics(){
280 		//Adjust the number of threads for input file reading
281 		if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
282 			ByteFile.FORCE_MODE_BF2=true;
283 		}
284 
285 		assert(FastaReadInputStream.settingsOK());
286 	}
287 
288 	/*--------------------------------------------------------------*/
289 	/*----------------         Outer Methods        ----------------*/
290 	/*--------------------------------------------------------------*/
291 
292 	/** Create read streams and process all data */
process(Timer t)293 	void process(Timer t){
294 
295 		//Create a read input stream
296 		final ConcurrentReadInputStream cris=makeCris();
297 
298 		//Reset counters
299 		readsProcessed=readsOut=0;
300 		basesProcessed=basesOut=0;
301 
302 		//Process the read stream
303 		processInner(cris);
304 		errorState|=ReadWrite.closeStreams(cris);
305 
306 		if(verbose){outstream.println("Finished reading input.");}
307 
308 		//Optionally create a read output stream
309 		final ConcurrentReadOutputStream ros=makeCros();
310 
311 		if(ros!=null){
312 			dumpKmers(ros);
313 			if(verbose){outstream.println("Finished processing output.");}
314 
315 			errorState|=ReadWrite.closeStream(ros);
316 		}
317 
318 		//Write anything that was accumulated by ReadStats
319 		errorState|=ReadStats.writeAll();
320 
321 		//Report timing and results
322 		t.stop();
323 		outstream.println(Tools.timeReadsBasesProcessed(t, readsProcessed, basesProcessed, 8));
324 		outstream.println(Tools.things("Kmers Processed", kmersProcessed, 8));
325 		outstream.println(Tools.things("Kmers Added", kmersAdded, 8));
326 
327 		outstream.println();
328 		outstream.println(Tools.readsBasesOut(readsProcessed, basesProcessed, readsOut, basesOut, 8, false));
329 		outstream.println(Tools.things("Kmers Out", kmersOut, 8));
330 
331 		//Throw an exception of there was an error in a thread
332 		if(errorState){
333 			throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
334 		}
335 	}
336 
337 	/*--------------------------------------------------------------*/
338 	/*----------------            Dumping           ----------------*/
339 	/*--------------------------------------------------------------*/
340 
dumpKmers(ConcurrentReadOutputStream ros)341 	private void dumpKmers(ConcurrentReadOutputStream ros) {
342 
343 		set.sort();
344 		set.shrinkToUnique();
345 		LongListSetIterator it=set.iterator();
346 
347 		long id=1;
348 		ArrayList<Read> list=new ArrayList<Read>(200);
349 		ByteBuilder bb=new ByteBuilder();
350 		while(it.hasMore()){
351 			bb.clear();
352 			final long kmer=it.next();
353 			bb.appendKmer(kmer, k);
354 			Read r=new Read(bb.toBytes(), null, id);
355 			id++;
356 			kmersOut++;
357 			readsOut++;
358 			basesOut+=r.length();
359 			list.add(r);
360 			if(list.size()>=200){
361 				ros.add(list, 0);
362 				list=new ArrayList<Read>(200);
363 			}
364 		}
365 		if(list.size()>0){ros.add(list, 0);}
366 	}
367 
368 	/*--------------------------------------------------------------*/
369 	/*----------------         Inner Methods        ----------------*/
370 	/*--------------------------------------------------------------*/
371 
makeCris()372 	private ConcurrentReadInputStream makeCris(){
373 		ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, ffin2, qfin1, qfin2);
374 		cris.start(); //Start the stream
375 		if(verbose){outstream.println("Started cris");}
376 		boolean paired=cris.paired();
377 		if(!ffin1.samOrBam()){outstream.println("Input is being processed as "+(paired ? "paired" : "unpaired"));}
378 		return cris;
379 	}
380 
makeCros()381 	private ConcurrentReadOutputStream makeCros(){
382 		if(ffout1==null){return null;}
383 
384 		//Select output buffer size based on whether it needs to be ordered
385 		final int buff=4;
386 
387 		final ConcurrentReadOutputStream ros=ConcurrentReadOutputStream.getStream(ffout1, null, buff, null, false);
388 		ros.start(); //Start the stream
389 		return ros;
390 	}
391 
392 	/** Iterate through the reads */
processInner(final ConcurrentReadInputStream cris)393 	void processInner(final ConcurrentReadInputStream cris){
394 
395 		//Do anything necessary prior to processing
396 
397 		{
398 			//Grab the first ListNum of reads
399 			ListNum<Read> ln=cris.nextList();
400 
401 			//Check to ensure pairing is as expected
402 			if(ln!=null && !ln.isEmpty()){
403 				Read r=ln.get(0);
404 				assert(ffin1.samOrBam() || (r.mate!=null)==cris.paired());
405 			}
406 
407 			//As long as there is a nonempty read list...
408 			while(ln!=null && ln.size()>0){
409 //				if(verbose){outstream.println("Fetched "+reads.size()+" reads.");} //Disabled due to non-static access
410 
411 				processList(ln, cris);
412 
413 				//Fetch a new list
414 				ln=cris.nextList();
415 			}
416 
417 			//Notify the input stream that the final list was used
418 			if(ln!=null){
419 				cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
420 			}
421 		}
422 
423 		//Do anything necessary after processing
424 
425 	}
426 
427 	/**
428 	 * Process a list of Reads.
429 	 * @param ln The list.
430 	 * @param cris Read Input Stream
431 	 * @param ros Read Output Stream for reads that will be retained
432 	 */
processList(ListNum<Read> ln, final ConcurrentReadInputStream cris)433 	void processList(ListNum<Read> ln, final ConcurrentReadInputStream cris){
434 
435 		//Grab the actual read list from the ListNum
436 		final ArrayList<Read> reads=ln.list;
437 
438 		long added=0;
439 
440 		//Loop through each read in the list
441 		for(int idx=0; idx<reads.size(); idx++){
442 			final Read r1=reads.get(idx);
443 			final Read r2=r1.mate;
444 
445 			//Validate reads in worker threads
446 			if(!r1.validated()){r1.validate(true);}
447 			if(r2!=null && !r2.validated()){r2.validate(true);}
448 
449 			//Track the initial length for statistics
450 			final int initialLength1=r1.length();
451 			final int initialLength2=r1.mateLength();
452 
453 			//Increment counters
454 			readsProcessed+=r1.pairCount();
455 			basesProcessed+=initialLength1+initialLength2;
456 			kmersAdded+=processReadPair(r1, r2);
457 		}
458 
459 		//Notify the input stream that the list was used
460 		cris.returnList(ln);
461 //		if(verbose){outstream.println("Returned a list.");} //Disabled due to non-static access
462 	}
463 
464 
465 	/**
466 	 * Process a single read pair.
467 	 * @param r1 Read 1
468 	 * @param r2 Read 2 (may be null)
469 	 * @return Number of kmers added.
470 	 */
processReadPair(final Read r1, final Read r2)471 	long processReadPair(final Read r1, final Read r2){
472 		long x=addToMap(r1, skip);
473 		if(r2!=null){
474 			x+=addToMap(r2, skip);
475 		}
476 		return x;
477 	}
478 
addToMap(Read r, int skip)479 	private long addToMap(Read r, int skip){
480 		final byte[] bases=r.bases;
481 		long kmer=0;
482 		long rkmer=0;
483 		long added=0;
484 		int len=0;
485 
486 //		if(bases!=null){
487 //			readsProcessed++;
488 //			basesProcessed+=bases.length;
489 //		}
490 		if(bases==null || bases.length<k){return 0;}
491 
492 		if(skip>1){ //Process while skipping some kmers
493 			for(int i=0; i<bases.length; i++){
494 				byte b=bases[i];
495 				long x=symbolToNumber0[b];
496 				long x2=symbolToComplementNumber0[b];
497 				kmer=((kmer<<bitsPerBase)|x)&mask;
498 				rkmer=((rkmer>>>bitsPerBase)|(x2<<shift2))&mask;
499 				if(isFullyDefined(b)){len++;}else{len=0; rkmer=0;}
500 				if(verbose){outstream.println("Scanning1 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
501 				if(len>=k){
502 					kmersProcessed++;
503 					if(len%skip==0){
504 						final long extraBase=(i>=bases.length-1 ? -1 : symbolToNumber[bases[i+1]]);
505 						final long extraBase2=(i>=bases.length-2 ? -1 : symbolToNumber[bases[i+2]]);
506 						final long extraBase3=(i>=bases.length-3 ? -1 : symbolToNumber[bases[i+3]]);
507 						added+=addAndMutate(kmer, rkmer, k, extraBase, extraBase2, extraBase3);
508 					}
509 				}
510 			}
511 		}else{ //Process all kmers
512 			for(int i=0; i<bases.length; i++){
513 				final byte b=bases[i];
514 				final long x=symbolToNumber0[b];
515 				final long x2=symbolToComplementNumber0[b];
516 //				assert(x!=x2) : x+", "+x2+", "+Character.toString((char)b)+"\n"+Arrays.toString(symbolToNumber0)+"\n"+Arrays.toString(symbolToComplementNumber);
517 				kmer=((kmer<<bitsPerBase)|x)&mask;
518 				//10000, 1111111111, 16, 16, 2, 10, 8
519 				rkmer=((rkmer>>>bitsPerBase)|(x2<<shift2))&mask;
520 				if(isFullyDefined(b)){len++;}else{len=0; rkmer=0;}
521 				if(verbose){
522 //					if(verbose){
523 //						String fwd=new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k));
524 //						String rev=AminoAcid.reverseComplementBases(fwd);
525 //						String fwd2=kmerToString(kmer, Tools.min(len, k));
526 //						outstream.println("fwd="+fwd+", fwd2="+fwd2+", rev="+rev+", kmer="+kmer+", rkmer="+rkmer);
527 //						outstream.println("b="+(char)b+", x="+x+", x2="+x2+", bitsPerBase="+bitsPerBase+", shift2="+shift2);
528 //						if(!amino){
529 //							assert(AminoAcid.stringToKmer(fwd)==kmer) : fwd+", "+AminoAcid.stringToKmer(fwd)+", "+kmer+", "+len;
530 //							if(len>=k){
531 //								assert(rcomp(kmer, Tools.min(len, k))==rkmer);
532 //								assert(rcomp(rkmer, Tools.min(len, k))==kmer);
533 //								assert(AminoAcid.kmerToString(kmer, Tools.min(len, k)).equals(fwd));
534 //								assert(AminoAcid.kmerToString(rkmer, Tools.min(len, k)).equals(rev)) : AminoAcid.kmerToString(rkmer, Tools.min(len, k))+" != "+rev+" (rkmer)";
535 //							}
536 //							assert(fwd.equalsIgnoreCase(fwd2)) : fwd+", "+fwd2; //may be unsafe
537 //						}
538 //						outstream.println("Scanning6 i="+i+", len="+len+", kmer="+kmer+", rkmer="+rkmer+", bases="+fwd+", rbases="+rev);
539 //					}
540 				}
541 				if(len>=k){
542 //					assert(kmer==rcomp(rkmer, k)) : Long.toBinaryString(kmer)+", "+Long.toBinaryString(rkmer)+", "+Long.toBinaryString(mask)+", x="+x+", x2="+x2+", bits="+bitsPerBase+", s="+shift+", s2="+shift2+", b="+Character.toString((char)b);
543 					kmersProcessed++;
544 					final long extraBase=(i>=bases.length-1 ? -1 : symbolToNumber[bases[i+1]]);
545 					final long extraBase2=(i>=bases.length-2 ? -1 : symbolToNumber[bases[i+2]]);
546 					final long extraBase3=(i>=bases.length-3 ? -1 : symbolToNumber[bases[i+3]]);
547 					final long atm=addAndMutate(kmer, rkmer, k, extraBase, extraBase2, extraBase3);
548 					added+=atm;
549 				}
550 			}
551 		}
552 		return added;
553 	}
554 
555 
556 
557 	/**
558 	 * Adds this kmer to the table, including any mutations implied by subDist, etc.
559 	 * @param kmer Forward kmer
560 	 * @param rkmer Reverse kmer
561 	 * @param len Kmer length
562 	 * @param subDist Number of substitutions to allow
563 	 * @param delDist Number of deletions to allow
564 	 * @param insDist Number of insertions to allow
565 	 * @param extraBase Base added to end in case of deletions
566 	 * @param extraBase2 Base added to end in case of deletions
567 	 * @param extraBase3 Base added to end in case of deletions
568 	 * @return Number of kmers stored
569 	 */
addAndMutate(final long kmer, final long rkmer, final int len, final long extraBase, final long extraBase2, final long extraBase3)570 	private long addAndMutate(final long kmer, final long rkmer, final int len, final long extraBase, final long extraBase2, final long extraBase3){
571 		if(editDist>0){//Edit mode
572 			return mutateE(kmer, rkmer, len, editDist, maxSubs, maxDels, maxInss, extraBase, extraBase2, extraBase3);
573 		}else if(subDist>0 || delDist>0 || insDist>0){//Sub Del Ins mode
574 			return mutateE(kmer, rkmer, len, maxEdits, subDist, delDist, insDist, extraBase, extraBase2, extraBase3);
575 		}else{
576 			addKey(kmer, rkmer, len);
577 		}
578 		return 1;
579 	}
580 
581 
582 	/**
583 	 * Adds this kmer to the set.
584 	 * @param kmer Forward kmer
585 	 * @param rkmer Reverse kmer
586 	 * @param len Kmer length
587 	 * @return Number of kmers added
588 	 */
addKey(final long kmer, final long rkmer, final int len)589 	private long addKey(final long kmer, final long rkmer, final int len){
590 		if(verbose){outstream.println("addToMap_A; len="+len);}
591 
592 		final long key=toValue(kmer, rkmer);
593 		if(verbose){outstream.println("toValue ("+kmerToString(kmer, len)+", "+kmerToString(rkmer, len)+") = "+kmerToString(key, len)+" = "+key);}
594 		if(failsSpeed(key)){return 0;}
595 
596 		if(verbose){outstream.println("addToMap_B: "+kmerToString(key, len)+" ("+key+")");}
597 		long added=0;
598 		set.add(key);
599 		added++;
600 		if(verbose){outstream.println("addToMap added "+added+" keys.");}
601 		return 1;
602 	}
603 
604 	//*** Note!  This is functionally identical to mutateE,
605 	//*** only the default params are different
606 	@Deprecated
607 	/**
608 	 * Mutate and store this kmer through 'dist' recursions.
609 	 * @param kmer Forward kmer
610 	 * @param rkmer Reverse kmer
611 	 * @param len Kmer length
612 	 * @param subDist Number of substitutions to allow
613 	 * @param delDist Number of deletions to allow
614 	 * @param insDist Number of insertions to allow
615 	 * @param extraBase Base added to end in case of deletions
616 	 * @param extraBase2 Base added to end in case of deletions
617 	 * @param extraBase3 Base added to end in case of deletions
618 	 * @return Number of kmers stored
619 	 */
mutateSDI(final long kmer, final long rkmer, final int len, final int maxEdits, final int subDist, final int delDist, final int insDist, final long extraBase, final long extraBase2, final long extraBase3)620 	private long mutateSDI(final long kmer, final long rkmer, final int len, final int maxEdits,
621 			final int subDist, final int delDist, final int insDist,
622 			final long extraBase, final long extraBase2, final long extraBase3){
623 		long added=1;
624 		final int maxEdits2=maxEdits-1;
625 
626 		addKey(kmer, rkmer, len);
627 		if(maxEdits<1){return 1;}
628 
629 		if(verbose){outstream.println("mutateSDI; len="+len+"; kmer="+kmer+"; rkmer="+rkmer);}
630 
631 		if(subDist>0){
632 			//Sub
633 			for(int j=0; j<symbols; j++){
634 				for(int i=0; i<len; i++){
635 					final long temp=(kmer&clearMasks[i])|setMasks[j][i];
636 					if(temp!=kmer){
637 						long rtemp=rcomp(temp, len);
638 						added+=mutateSDI(temp, rtemp, len, maxEdits2,
639 								subDist-1, delDist, insDist, extraBase, extraBase2, extraBase3);
640 					}
641 				}
642 			}
643 		}
644 
645 		if(delDist>0 && extraBase>=0 && extraBase<=maxSymbol){
646 			//Del
647 			for(int i=1; i<len; i++){
648 				final long temp=(kmer&leftMasks[i])|((kmer<<bitsPerBase)&rightMasks[i])|extraBase;
649 				if(temp!=kmer){
650 					long rtemp=rcomp(temp, len);
651 					added+=mutateSDI(temp, rtemp, len, maxEdits2,
652 							subDist, delDist-1, insDist, extraBase2, extraBase3, -1);
653 				}
654 			}
655 		}
656 
657 		if(insDist>0){
658 			//Ins
659 			final long eb0=kmer&symbolMask;
660 			for(int i=0; i<len; i++){
661 				final long temp0=(kmer&leftMasks[i])|((kmer&rightMasks[i])>>bitsPerBase);
662 				for(int j=0; j<symbols; j++){
663 					final long temp=temp0|setMasks[j][i];
664 					if(temp!=kmer){
665 						long rtemp=rcomp(temp, len);
666 						added+=mutateSDI(temp, rtemp, len, maxEdits2,
667 								subDist, delDist, insDist-1, eb0, extraBase, extraBase2);
668 					}
669 				}
670 			}
671 		}
672 
673 		return added;
674 	}
675 
676 	/**
677 	 * Mutate and store this kmer through 'dist' recursions.
678 	 * @param kmer Forward kmer
679 	 * @param rkmer Reverse kmer
680 	 * @param len Kmer length
681 	 * @param editDist Number of edits to allow
682 	 * @param extraBase Base added to end in case of deletions
683 	 * @param extraBase2 Base added to end in case of deletions
684 	 * @param extraBase3 Base added to end in case of deletions
685 	 * @return Number of kmers stored
686 	 */
mutateE(final long kmer, final long rkmer, final int len, final int editDist, final int maxSubs, final int maxDels, final int maxInss, final long extraBase, final long extraBase2, final long extraBase3)687 	private long mutateE(final long kmer, final long rkmer, final int len, final int editDist,
688 			final int maxSubs, final int maxDels, final int maxInss,
689 			final long extraBase, final long extraBase2, final long extraBase3){
690 		long added=0;
691 
692 		addKey(kmer, rkmer, len);
693 		if(editDist<1){return 1;}
694 
695 		if(verbose){outstream.println("mutateE; len="+len+"; kmer="+kmer+"; rkmer="+rkmer);}
696 
697 		final int edist2=editDist-1;
698 		if(maxSubs>0){
699 			//Sub
700 			for(int j=0; j<symbols; j++){
701 				for(int i=0; i<len; i++){
702 					final long temp=(kmer&clearMasks[i])|setMasks[j][i];
703 					if(temp!=kmer){
704 						long rtemp=rcomp(temp, len);
705 						added+=mutateE(temp, rtemp, len, edist2,
706 								maxSubs-1, maxDels, maxInss, extraBase, extraBase2, extraBase3);
707 					}
708 				}
709 			}
710 		}
711 
712 		if(maxDels>0 && extraBase>=0 && extraBase<=maxSymbol){
713 			//Del
714 			for(int i=1; i<len; i++){
715 				final long temp=(kmer&leftMasks[i])|((kmer<<bitsPerBase)&rightMasks[i])|extraBase;
716 				if(temp!=kmer){
717 					long rtemp=rcomp(temp, len);
718 					added+=mutateE(temp, rtemp, len, edist2,
719 							maxSubs, maxDels-1, maxInss, extraBase2, extraBase3, -1);
720 				}
721 			}
722 		}
723 
724 		if(maxInss>0){
725 			//Ins
726 			final long eb0=kmer&symbolMask;
727 			for(int i=0; i<len; i++){
728 				final long temp0=(kmer&leftMasks[i])|((kmer&rightMasks[i])>>bitsPerBase);
729 				for(int j=0; j<symbols; j++){
730 					final long temp=temp0|setMasks[j][i];
731 					if(temp!=kmer){
732 						long rtemp=rcomp(temp, len);
733 						added+=mutateE(temp, rtemp, len, edist2,
734 								maxSubs, maxDels, maxInss-1, eb0, extraBase, extraBase2);
735 					}
736 				}
737 			}
738 		}
739 
740 		return added;
741 	}
742 
743 	/**
744 	 * Transforms a kmer into a canonical value.  Expected to be inlined.
745 	 * @param kmer Forward kmer
746 	 * @param rkmer Reverse kmer
747 	 * @return Canonical value
748 	 */
toValue(long kmer, long rkmer)749 	final long toValue(long kmer, long rkmer){
750 		if(verbose){outstream.println("toValue("+AminoAcid.kmerToString(kmer, k)+", "+AminoAcid.kmerToString(rkmer, k)+")");}
751 		final long value=(rcomp ? Tools.max(kmer, rkmer) : kmer);
752 		if(verbose){outstream.println("value="+AminoAcid.kmerToString(value, k)+" = "+value);}
753 		return value;
754 	}
755 
rcomp(long kmer, int len)756 	final long rcomp(long kmer, int len){
757 		return amino ? kmer : AminoAcid.reverseComplementBinaryFast(kmer, len);
758 	}
759 
passesSpeed(long key)760 	final boolean passesSpeed(long key){
761 		return speed<1 || ((key&Long.MAX_VALUE)%17)>=speed;
762 	}
763 
failsSpeed(long key)764 	final boolean failsSpeed(long key){
765 		return speed>0 && ((key&Long.MAX_VALUE)%17)<speed;
766 	}
767 
768 	/** Returns true if the symbol is not degenerate (e.g., 'N') for the alphabet in use. */
isFullyDefined(byte symbol)769 	final boolean isFullyDefined(byte symbol){
770 		return symbol>=0 && symbolToNumber[symbol]>=0;
771 	}
772 
773 	/** For verbose / debugging output */
kmerToString(long kmer, int k)774 	final String kmerToString(long kmer, int k){
775 		return amino ? AminoAcid.kmerToStringAA(kmer, k) : AminoAcid.kmerToString(kmer, k);
776 	}
777 
778 	/*--------------------------------------------------------------*/
779 	/*----------------            Fields            ----------------*/
780 	/*--------------------------------------------------------------*/
781 
782 	/** Primary input file path */
783 	private String in1=null;
784 	/** Secondary input file path */
785 	private String in2=null;
786 
787 	private String qfin1=null;
788 	private String qfin2=null;
789 
790 	/** Primary output file path */
791 	private String out1=null;
792 
793 	/** Override input file extension */
794 	private String extin=null;
795 	/** Override output file extension */
796 	private String extout=null;
797 
798 	/** Whether interleaved was explicitly set. */
799 	private boolean setInterleaved=false;
800 
801 	private boolean rcomp=true;
802 	private int k=31;
803 	/** k-1; used in some expressions */
804 	final int k2;
805 
806 	private int speed=0;
807 	private int skip=0;
808 
809 	private int editDist=0;
810 	private int maxSubs=99;
811 	private int maxDels=99;
812 	private int maxInss=99;
813 
814 	private int maxEdits=99;
815 	private int subDist=0;
816 	private int delDist=0;
817 	private int insDist=0;
818 
819 	private LongListSet set=new LongListSet();
820 
821 	/*--------------------------------------------------------------*/
822 	/*-----------        Symbol-Specific Constants        ----------*/
823 	/*--------------------------------------------------------------*/
824 
825 	/** True for amino acid data, false for nucleotide data */
826 	final boolean amino;
827 //	final int maxSupportedK;
828 	final int bitsPerBase;
829 	final int maxSymbol;
830 	final int symbols;
831 	final int symbolArrayLen;
832 	final int symbolSpace;
833 	final long symbolMask;
834 
835 	final int minlen;
836 	final int minlen2;
837 	final int shift;
838 	final int shift2;
839 	final long mask;
840 
841 	/** x&clearMasks[i] will clear base i */
842 	final long[] clearMasks;
843 	/** x|setMasks[i][j] will set base i to j */
844 	final long[][] setMasks;
845 	/** x&leftMasks[i] will clear all bases to the right of i (exclusive) */
846 	final long[] leftMasks;
847 	/** x&rightMasks[i] will clear all bases to the left of i (inclusive) */
848 	final long[] rightMasks;
849 //	/** x|kMasks[i] will set the bit to the left of the leftmost base */
850 //	final long[] lengthMasks;
851 
852 	private final byte[] symbolToNumber0;
853 	private final byte[] symbolToComplementNumber0;
854 	private final byte[] symbolToNumber;
855 
856 	/*--------------------------------------------------------------*/
857 
858 	/** Number of reads processed */
859 	protected long readsProcessed=0;
860 	/** Number of bases processed */
861 	protected long basesProcessed=0;
862 	/** Number of kmers processed */
863 	protected long kmersProcessed=0;
864 	/** Number of kmers added (includes mutants) */
865 	protected long kmersAdded=0;
866 
867 	/** Number of reads output */
868 	protected long readsOut=0;
869 	/** Number of bases output */
870 	protected long basesOut=0;
871 	/** Number of kmers output */
872 	protected long kmersOut=0;
873 
874 	/** Quit after processing this many input reads; -1 means no limit */
875 	private long maxReads=-1;
876 
877 	/*--------------------------------------------------------------*/
878 	/*----------------         Final Fields         ----------------*/
879 	/*--------------------------------------------------------------*/
880 
881 	/** Primary input file */
882 	private final FileFormat ffin1;
883 	/** Secondary input file */
884 	private final FileFormat ffin2;
885 
886 	/** Primary output file */
887 	private final FileFormat ffout1;
888 
889 	/*--------------------------------------------------------------*/
890 	/*----------------        Common Fields         ----------------*/
891 	/*--------------------------------------------------------------*/
892 
893 	/** Print status messages to this output stream */
894 	private PrintStream outstream=System.err;
895 	/** Print verbose messages */
896 	public static boolean verbose=false;
897 	/** True if an error was encountered */
898 	public boolean errorState=false;
899 	/** Overwrite existing output files */
900 	private boolean overwrite=false;
901 	/** Append to existing output files */
902 	private boolean append=false;
903 	/** This flag has no effect on singlethreaded programs */
904 	private final boolean ordered=false;
905 
906 }
907