1 package sketch;
2 
3 import java.io.File;
4 import java.io.PrintStream;
5 import java.util.ArrayList;
6 import java.util.Arrays;
7 
8 import dna.AminoAcid;
9 import fileIO.FileFormat;
10 import fileIO.ReadWrite;
11 import jgi.BBMerge;
12 import jgi.TranslateSixFrames;
13 import prok.CallGenes;
14 import prok.GeneCaller;
15 import prok.Orf;
16 import prok.ProkObject;
17 import shared.Parse;
18 import shared.Tools;
19 import stream.ConcurrentReadInputStream;
20 import stream.Read;
21 import structures.EntropyTracker;
22 import structures.ListNum;
23 import tax.TaxNode;
24 import tax.TaxTree;
25 
26 /**
27  * Creates MinHashSketches rapidly.
28  *
29  * @author Brian Bushnell
30  * @date July 6, 2016
31  *
32  */
33 public class SketchMakerMini extends SketchObject {
34 
35 	/*--------------------------------------------------------------*/
36 	/*----------------        Initialization        ----------------*/
37 	/*--------------------------------------------------------------*/
38 
SketchMakerMini(SketchTool tool_, int mode_, DisplayParams params)39 	public SketchMakerMini(SketchTool tool_, int mode_, DisplayParams params){
40 		this(tool_, mode_, params.minEntropy, params.minProb, params.minQual);
41 	}
42 
43 	/**
44 	 * Constructor.
45 	 */
SketchMakerMini(SketchTool tool_, int mode_, float minEntropy_, float minProb_, byte minQual_)46 	public SketchMakerMini(SketchTool tool_, int mode_, float minEntropy_, float minProb_, byte minQual_){
47 
48 		tool=tool_;
49 		mode=mode_;
50 		minProb=minProb_;
51 		minQual=minQual_;
52 
53 		aminoShift=AminoAcid.AMINO_SHIFT;
54 		if(!aminoOrTranslate()){
55 			shift=2*k;
56 			shift2=shift-2;
57 			mask=(shift>63 ? -1L : ~((-1L)<<shift)); //Conditional allows K=32
58 		}else{
59 			shift=aminoShift*k;
60 			shift2=shift-aminoShift;
61 			mask=(shift>63 ? -1L : ~((-1L)<<shift));
62 		}
63 
64 		if(AUTOSIZE && (mode==ONE_SKETCH || mode==PER_FILE)){
65 			heap=new SketchHeap(Tools.max(tool.stTargetSketchSize, (int)(80000*Tools.mid(1, AUTOSIZE_FACTOR, 32))), tool.minKeyOccuranceCount, tool.trackCounts);
66 		}else if(AUTOSIZE_LINEAR && (mode==ONE_SKETCH || mode==PER_FILE)){
67 			heap=new SketchHeap(Tools.max(tool.stTargetSketchSize, (int)(10000000*Tools.mid(0.1, 2*AUTOSIZE_LINEAR_DENSITY, 0.00001))),
68 					tool.minKeyOccuranceCount, tool.trackCounts);
69 		}else{
70 			heap=new SketchHeap(tool.stTargetSketchSize, tool.minKeyOccuranceCount, tool.trackCounts);
71 		}
72 
73 		if(minEntropy_>0){
74 			eTracker=new EntropyTracker(entropyK, k, (amino || translate), minEntropy_, true);
75 		}else{
76 			eTracker=null;
77 		}
78 
79 		if(translate || processSSU){
80 			gCaller=CallGenes.makeGeneCaller(pgm);
81 		}else{
82 			gCaller=null;
83 		}
84 	}
85 
86 	/*--------------------------------------------------------------*/
87 	/*----------------         Outer Methods        ----------------*/
88 	/*--------------------------------------------------------------*/
89 
90 	/** Create read streams and process all data */
toSketches(final String fname, float samplerate, long reads)91 	public ArrayList<Sketch> toSketches(final String fname, float samplerate, long reads){
92 		heap.clear(false); //123
93 		final String simpleName;
94 
95 		final FileFormat ffin1, ffin2;
96 		if(fname.indexOf('#')>=0 && FileFormat.isFastq(ReadWrite.rawExtension(fname)) && !new File(fname).exists()){
97 			ffin1=FileFormat.testInput(fname.replaceFirst("#", "1"), FileFormat.FASTQ, null, true, true);
98 			ffin2=FileFormat.testInput(fname.replaceFirst("#", "2"), FileFormat.FASTQ, null, true, true);
99 		}else{
100 			ffin1=FileFormat.testInput(fname, FileFormat.FASTA, null, true, true);
101 			ffin2=null;
102 		}
103 
104 		//Create a read input stream
105 		final ConcurrentReadInputStream cris;
106 		{
107 			simpleName=ffin1.simpleName();
108 			heap.setFname(simpleName);
109 			cris=ConcurrentReadInputStream.getReadInputStream(reads, true, ffin1, ffin2, null, null);
110 			if(samplerate!=1){cris.setSampleRate(samplerate, sampleseed);}
111 			cris.start(); //Start the stream
112 			if(verbose){outstream.println("Started cris");}
113 		}
114 		if(mode==ONE_SKETCH || mode==PER_FILE){
115 			 if(heap.name0()==null){heap.setName0(simpleName);}
116 		}
117 		ArrayList<Sketch> sketches=processInner(cris);
118 
119 		errorState|=ReadWrite.closeStream(cris);
120 		sketchesMade++;
121 		return sketches;
122 	}
123 
124 	/*--------------------------------------------------------------*/
125 	/*----------------         Inner Methods        ----------------*/
126 	/*--------------------------------------------------------------*/
127 
128 	/** Iterate through the reads */
processInner(ConcurrentReadInputStream cris)129 	ArrayList<Sketch> processInner(ConcurrentReadInputStream cris){
130 		assert(heap.size()==0);
131 		ArrayList<Sketch> sketches=new ArrayList<Sketch>(mode==ONE_SKETCH || mode==PER_FILE ? 1 : 8);
132 
133 		//Grab the first ListNum of reads
134 		ListNum<Read> ln=cris.nextList();
135 		//Grab the actual read list from the ListNum
136 		ArrayList<Read> reads=(ln!=null ? ln.list : null);
137 
138 		//As long as there is a nonempty read list...
139 		while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning
140 			//				if(verbose){outstream.println("Fetched "+reads.size()+" reads.");} //Disabled due to non-static access
141 
142 			//Loop through each read in the list
143 			for(int idx=0; idx<reads.size(); idx++){
144 				final Read r1=reads.get(idx);
145 				final Read r2=r1.mate;
146 
147 				processReadPair(r1, r2);
148 				if(mode!=ONE_SKETCH && mode!=PER_FILE){
149 					if(heap!=null && heap.size()>0 && heap.maxLen()>=Tools.max(1, minSketchSize)){
150 						int size=heap.size();
151 						Sketch sketch=new Sketch(heap, false, tool.trackCounts, null);
152 						assert(sketch.keys.length>0) : sketch.keys.length+", "+size;
153 						sketch.loadSSU();
154 						sketches.add(sketch);
155 						sketchesMade++;
156 					}
157 					if(heap!=null){heap.clear(false);}
158 				}
159 			}
160 
161 			//Notify the input stream that the list was used
162 			cris.returnList(ln);
163 			//				if(verbose){outstream.println("Returned a list.");} //Disabled due to non-static access
164 
165 			//Fetch a new list
166 			ln=cris.nextList();
167 			reads=(ln!=null ? ln.list : null);
168 		}
169 
170 		//Notify the input stream that the final list was used
171 		if(ln!=null){
172 			cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
173 		}
174 
175 		if(mode==ONE_SKETCH || mode==PER_FILE){
176 			Sketch sketch=new Sketch(heap, false, tool.trackCounts, null);
177 			sketch.loadSSU();
178 			sketches.add(sketch);
179 			sketchesMade++;
180 		}
181 		heap.clear(true);
182 		return sketches;
183 	}
184 
processReadPair(Read r1, Read r2)185 	void processReadPair(Read r1, Read r2){
186 		//Track the initial length for statistics
187 		final int initialLength1=r1.length();
188 		final int initialLength2=r1.mateLength();
189 
190 		//Increment counters
191 		readsProcessed+=r1.pairCount();
192 		basesProcessed+=initialLength1+initialLength2;
193 
194 		if(mode!=ONE_SKETCH && mode!=PER_FILE){
195 			int expectedSize=toSketchSize(initialLength1+initialLength2, -1, -1, targetSketchSize);
196 			if(heap==null || heap.capacity()<expectedSize){heap=new SketchHeap(expectedSize, tool.minKeyOccuranceCount, tool.trackCounts);}
197 		}
198 
199 		if(tool.mergePairs && r2!=null){
200 			final int insert=BBMerge.findOverlapStrict(r1, r2, false);
201 			if(insert>0){
202 				heap.genomeSequences++;
203 				r2.reverseComplement();
204 				r1=r1.joinRead(insert);
205 				r2=null;
206 			}
207 		}
208 
209 		processRead(r1);
210 		if(r2!=null){processRead(r2);}
211 
212 		if(heap.name0()==null){
213 			heap.setName0(r1.id);
214 		}
215 
216 		TaxNode tn=null;
217 		if(heap.taxID<0 && r1.length()>800){
218 			if(taxtree!=null){
219 				try {
220 					tn=taxtree.parseNodeFromHeader(r1.id, true);
221 				} catch (Throwable e) {}
222 				if(tn!=null){
223 					heap.taxID=tn.id;
224 					if(heap.taxName()==null){
225 						heap.setTaxName(tn.name);
226 					}
227 				}
228 //				System.err.println("A) "+heap.taxID+r1.id);
229 			}else{
230 				heap.taxID=TaxTree.parseHeaderStatic(r1.id);
231 //				System.err.println("B) "+heap.taxID+r1.id);
232 			}
233 		}
234 		assert(heap.taxID<0 || heap.taxName()!=null || taxtree==null) : heap.taxID+", "+heap.taxName()+", "+heap.name()+", "+tn;
235 	}
236 
237 	public void processRead(final Read r){
238 		if(amino){
239 			processReadAmino(r);
240 		}else if(translate){
241 			processReadTranslated(r);
242 		}else{
243 			processReadNucleotide(r);
244 		}
245 	}
246 
247 	public void processReadTranslated(final Read r){
248 		assert(!r.aminoacid());
249 		final ArrayList<Read> prots;
250 		if(sixframes){
251 			if(processSSU && heap.r16S()==null && r.length()>=min_SSU_len && !useSSUMapOnly && !heap.isEukaryote()){
252 				Orf orf=gCaller.makeRna(r.id, r.bases, ProkObject.r16S);//TODO: allow 18S also
253 				if(orf!=null && orf.length()>=min_SSU_len){
254 					assert(orf.is16S());
255 					if(orf.is16S() && orf.length()>=heap.r16SLen()){heap.set16S(CallGenes.fetch(orf, r).bases);}
256 				}
257 				//TODO: Add 18S.
258 			}
259 			prots=TranslateSixFrames.toFrames(r, true, false, 6);
260 		}else{
261 			ArrayList<Orf> list;
262 			list=gCaller.callGenes(r);
263 			prots=CallGenes.translate(r, list);
264 			if(processSSU && heap.r16S()==null && r.length()>=min_SSU_len && !useSSUMapOnly && !heap.isEukaryote()){
265 				for(Orf orf : list){
266 					if(orf.is16S() && orf.length()>=min_SSU_len && orf.length()>=heap.r16SLen()){
267 						heap.set16S(CallGenes.fetch(orf, r).bases);
268 						break;
269 					}
270 				}
271 			}
272 		}
273 		if(prots!=null){
274 			for(Read p : prots){
275 				processReadAmino(p);
276 			}
277 		}
278 	}
279 
processReadNucleotide(final Read r)280 	void processReadNucleotide(final Read r){
281 		if(processSSU && heap.r16S()==null && r.length()>=min_SSU_len && !useSSUMapOnly && !heap.isEukaryote()){
282 			Orf orf=gCaller.makeRna(r.id, r.bases, ProkObject.r16S);//TODO: 18S
283 			if(orf!=null && orf.length()>=min_SSU_len){
284 				assert(orf.start>=0 && orf.stop<r.length()) : r.length()+"\n"+orf;
285 				assert(orf.is16S());
286 				if(orf.is16S() && orf.length()>=heap.r16SLen()){heap.set16S(CallGenes.fetch(orf, r).bases);}
287 			}
288 			//TODO: Add 18S.
289 		}
290 
291 		final byte[] bases=r.bases;
292 		final byte[] quals=r.quality;
293 		final long[] baseCounts=heap.baseCounts(true);
294 		long kmer=0;
295 		long rkmer=0;
296 		int len=0;
297 		assert(!r.aminoacid());
298 
299 		final boolean noBlacklist=!(Blacklist.exists() || Whitelist.exists());
300 		final long min=minHashValue;
301 		heap.genomeSizeBases+=r.length();
302 		heap.genomeSequences++;
303 		if(eTracker!=null){eTracker.clear();}
304 
305 //		assert(false) : minProb+", "+minQual+", "+(quals==null);
306 
307 		if(quals==null || (minProb<=0 && minQual<2)){
308 //			System.err.println("A");
309 			for(int i=0; i<bases.length; i++){
310 //				System.err.println("B: len="+len);
311 				byte b=bases[i];
312 				long x=AminoAcid.baseToNumber[b];
313 				long x2=AminoAcid.baseToComplementNumber[b];
314 
315 				kmer=((kmer<<2)|x)&mask;
316 				rkmer=((rkmer>>>2)|(x2<<shift2))&mask;
317 				if(eTracker!=null){eTracker.add(b);}
318 				if(x<0){
319 					len=0;
320 					rkmer=0;
321 				}else{
322 					len++;
323 					baseCounts[(int)x]++;
324 				}
325 
326 //				System.err.println("\n"+AminoAcid.kmerToString(kmer, k)+"\n"+AminoAcid.kmerToString(rkmer, k)+"\n"
327 //				+AminoAcid.kmerToString(AminoAcid.reverseComplementBinaryFast(rkmer, k), k)+"\n"
328 //				+len+", "+(char)b+", "+x+", "+x2+"\n");
329 
330 				if(len>=k){
331 					kmersProcessed++;
332 					heap.genomeSizeKmers++;
333 //					heap.probSum++; //Note really necessary for fasta data
334 					if(eTracker==null || eTracker.passes()){
335 //						System.err.println("Pass.\t"+eTracker.calcEntropy()+"\t"+eTracker.basesToString());
336 
337 //						assert(kmer==AminoAcid.reverseComplementBinaryFast(rkmer, k)) :
338 //							"\n"+AminoAcid.kmerToString(kmer, k)+"\n"+AminoAcid.kmerToString(rkmer, k)+"\n"
339 //							+AminoAcid.kmerToString(AminoAcid.reverseComplementBinaryFast(rkmer, k), k)+"\n"
340 //							+len+", "+(char)b+", "+x+", "+x2;
341 
342 						final long hashcode=hash(kmer, rkmer);
343 						//				System.err.println(kmer+"\t"+rkmer+"\t"+z+"\t"+hash);
344 						if(hashcode>min){
345 							if(noBlacklist){
346 								heap.add(hashcode);
347 							}else{
348 								heap.checkAndAdd(hashcode);
349 							}
350 						}
351 					}else{
352 //						System.err.println("Fail.\t"+eTracker.calcEntropy()+"\t"+eTracker.basesToString()+"\n"+r.toFastq()+"\n"+eTracker);
353 //						assert(false);
354 					}
355 				}
356 			}
357 		}else{
358 			int zeroQualityKmers=0;
359 			int positiveQualityKmers=0;
360 
361 			float prob=1;
362 			for(int i=0; i<bases.length; i++){
363 				final byte b=bases[i];
364 				final long x=AminoAcid.baseToNumber[b];
365 				final long x2=AminoAcid.baseToComplementNumber[b];
366 
367 				kmer=((kmer<<2)|x)&mask;
368 				rkmer=((rkmer>>>2)|(x2<<shift2))&mask;
369 				if(eTracker!=null){eTracker.add(b);}
370 
371 				final byte q=quals[i];
372 				{//Quality-related stuff
373 					assert(q>=0) : Arrays.toString(quals)+"\n"+minProb+", "+minQual;
374 //					if(x>=0){
375 //						if(q>0){
376 //							positiveQualityBases++;
377 //						}else{
378 //							zeroQualityBases++;
379 //						}
380 //					}
381 					prob=prob*align2.QualityTools.PROB_CORRECT[q];
382 					if(len>k){
383 						byte oldq=quals[i-k];
384 						prob=prob*align2.QualityTools.PROB_CORRECT_INVERSE[oldq];
385 					}
386 					if(x<0 || q<minQual){
387 						len=0;
388 						kmer=rkmer=0;
389 						prob=1;
390 					}else{
391 						len++;
392 						baseCounts[(int)x]++;
393 					}
394 				}
395 
396 				if(len>=k){
397 					kmersProcessed++;
398 					if(prob>=minProb){
399 						heap.genomeSizeKmers++;
400 						heap.probSum+=prob;
401 						if(eTracker==null || eTracker.passes()){
402 							final long hashcode=hash(kmer, rkmer);
403 //							System.err.println(kmer+"\t"+rkmer+"\t"+z+"\t"+hash);
404 							if(hashcode>min){
405 								if(noBlacklist){
406 									heap.add(hashcode);
407 								}else{
408 									heap.checkAndAdd(hashcode);
409 								}
410 							}
411 						}else{
412 //							System.err.println("Fail.\t"+eTracker.calcEntropy()+"\t"+eTracker.basesToString());
413 						}
414 						positiveQualityKmers++;
415 					}else if(q<=2){
416 						zeroQualityKmers++;
417 					}
418 				}
419 
420 				//This version is slow but calculates depth better.
421 //				if(len>=k){
422 //					kmersProcessed++;
423 //					heap.genomeSizeKmers++;
424 //					final long hash=hash(kmer, rkmer);
425 //					//				System.err.println(kmer+"\t"+rkmer+"\t"+z+"\t"+hash);
426 //					if(hash>min){
427 //						if(prob>=minProb || (!heap.setMode && heap.contains(hash))){
428 //							if(noBlacklist){
429 //								heap.add(hash);
430 //							}else{
431 //								heap.checkAndAdd(hash);
432 //							}
433 //						}
434 //					}
435 //				}
436 			}
437 			if(minProb>0 && zeroQualityKmers>100 && positiveQualityKmers==0){
438 				if(looksLikePacBio(r)){
439 					synchronized(this){
440 						minProb=0;
441 					}
442 					processReadNucleotide(r);
443 				}
444 			}
445 		}
446 //		assert(false);
447 	}
448 
looksLikePacBio(Read r)449 	boolean looksLikePacBio(Read r){
450 		if(r.length()<302 || r.mate!=null){return false;}
451 		if(r.quality==null){
452 			int x=Parse.parseZmw(r.id);
453 			return x>=0;
454 		}
455 		int positive=0;
456 		int zero=0;
457 		int ns=0;
458 		for(int i=0; i<r.bases.length; i++){
459 			byte b=r.bases[i];
460 			byte q=r.quality[i];
461 			if(b=='N'){ns++;}
462 			else if(q==0 || q==2){
463 				zero++;
464 			}else{
465 				positive++;
466 			}
467 		}
468 		return zero>=r.length()/2 && positive==0;
469 	}
470 
processReadAmino(final Read r)471 	void processReadAmino(final Read r){
472 		final byte[] bases=r.bases;
473 		long kmer=0;
474 		int len=0;
475 		assert(r.aminoacid());
476 
477 		final boolean noBlacklist=!(Blacklist.exists() || Whitelist.exists());
478 		final long min=minHashValue;
479 		heap.genomeSizeBases+=r.length();
480 		heap.genomeSequences++;
481 
482 		for(int i=0; i<bases.length; i++){
483 			byte b=bases[i];
484 			long x=AminoAcid.acidToNumberNoStops[b];
485 			kmer=((kmer<<aminoShift)|x)&mask;
486 //			if(eTracker!=null){eTracker.add(b);}
487 
488 			if(x<0){len=0;}else{len++;}
489 			if(len>=k){
490 				kmersProcessed++;
491 				heap.genomeSizeKmers++;
492 //				if(eTracker==null || eTracker.passes()){
493 //					assert(false) : (eTracker==null)+", "+eTracker.cutoff()+", "+eTracker.calcEntropy()+", "+r;
494 					long hashcode=hash(kmer, kmer);
495 					if(hashcode>min){
496 						if(noBlacklist){
497 							heap.add(hashcode);
498 						}else{
499 							heap.checkAndAdd(hashcode);
500 						}
501 					}
502 //				}
503 			}
504 		}
505 	}
506 
processReadAmino_old_no_entropy(final Read r)507 	void processReadAmino_old_no_entropy(final Read r){
508 		final byte[] bases=r.bases;
509 		long kmer=0;
510 		int len=0;
511 		assert(r.aminoacid());
512 
513 		final boolean noBlacklist=!(Blacklist.exists() || Whitelist.exists());
514 		final long min=minHashValue;
515 		heap.genomeSizeBases+=r.length();
516 		heap.genomeSequences++;
517 
518 		for(int i=0; i<bases.length; i++){
519 			byte b=bases[i];
520 			long x=AminoAcid.acidToNumberNoStops[b];
521 			kmer=((kmer<<aminoShift)|x)&mask;
522 			if(x<0){len=0;}else{len++;}
523 			if(len>=k){
524 				kmersProcessed++;
525 				heap.genomeSizeKmers++;
526 				long hashcode=hash(kmer, kmer);
527 				if(hashcode>min){
528 					if(noBlacklist){
529 						heap.add(hashcode);
530 					}else{
531 						heap.checkAndAdd(hashcode);
532 					}
533 				}
534 			}
535 		}
536 	}
537 
toSketch(int minCount)538 	public Sketch toSketch(int minCount){
539 		Sketch sketch=null;
540 		if(heap!=null && heap.size()>0){
541 			try {
542 				sketch=new Sketch(heap, false, tool.trackCounts, null, minCount);
543 			} catch (Throwable e) {
544 				// TODO Auto-generated catch block
545 				e.printStackTrace();
546 			}
547 			heap.clear(false);
548 		}
549 		return sketch;
550 	}
551 
add(SketchMakerMini smm)552 	public void add(SketchMakerMini smm){
553 		heap.add(smm.heap);
554 		readsProcessed+=smm.readsProcessed;
555 		basesProcessed+=smm.basesProcessed;
556 		kmersProcessed+=smm.kmersProcessed;
557 		sketchesMade+=smm.sketchesMade;
558 		pacBioDetected|=smm.pacBioDetected;
559 	}
560 
561 	/** True only if this thread has completed successfully */
562 	boolean success=false;
563 
564 	SketchHeap heap;
565 
566 	final int aminoShift;
567 	final int shift;
568 	final int shift2;
569 	final long mask;
570 	final EntropyTracker eTracker;
571 	final GeneCaller gCaller;
572 
minEntropy()573 	public float minEntropy() {
574 		// TODO Auto-generated method stub
575 		return eTracker==null ? -1 : eTracker.cutoff();
576 	}
577 
578 	/*--------------------------------------------------------------*/
579 	/*----------------            Fields            ----------------*/
580 	/*--------------------------------------------------------------*/
581 
582 	/** Number of reads processed */
583 	protected long readsProcessed=0;
584 	/** Number of bases processed */
585 	protected long basesProcessed=0;
586 	/** Number of bases processed */
587 	protected long kmersProcessed=0;
588 	/** Number of sketches started */
589 	protected long sketchesMade=0;
590 
minProb()591 	float minProb() {return minProb;}
minQual()592 	byte minQual() {return minQual;}
593 	public boolean pacBioDetected=false;
594 	private float minProb;
595 	private byte minQual;
596 
597 	/*--------------------------------------------------------------*/
598 	/*----------------         Final Fields         ----------------*/
599 	/*--------------------------------------------------------------*/
600 
601 	private final SketchTool tool;
602 	final int mode;
603 
604 	/*--------------------------------------------------------------*/
605 	/*----------------        Common Fields         ----------------*/
606 	/*--------------------------------------------------------------*/
607 
608 	/** Print status messages to this output stream */
609 	private PrintStream outstream=System.err;
610 	/** Print verbose messages */
611 	public static boolean verbose=false;
612 	/** True if an error was encountered */
613 	public boolean errorState=false;
614 
615 }
616