1 package bloom;
2 
3 import java.util.ArrayList;
4 import java.util.Arrays;
5 import java.util.BitSet;
6 
7 import dna.AminoAcid;
8 import dna.Data;
9 import fileIO.FileFormat;
10 import fileIO.ReadWrite;
11 import shared.KillSwitch;
12 import shared.Parse;
13 import shared.Parser;
14 import shared.PreParser;
15 import shared.ReadStats;
16 import shared.Timer;
17 import shared.Tools;
18 import stream.ConcurrentReadInputStream;
19 import stream.ConcurrentReadOutputStream;
20 import stream.Read;
21 import structures.ListNum;
22 
23 /**
24  * @author Brian Bushnell
25  * @date Aug 20, 2012
26  *
27  */
28 public class ErrorCorrect extends Thread{
29 
main(String[] args)30 	public static void main(String[] args){
31 
32 		{//Preparse block for help, config files, and outstream
33 			PreParser pp=new PreParser(args, new Object() { }.getClass().getEnclosingClass(), false);
34 			args=pp.args;
35 			//outstream=pp.outstream;
36 		}
37 
38 		String reads1=args[0];
39 		String reads2=(args.length>1 ? args[1] : null);
40 
41 		int k=23;
42 		int cbits=4;
43 		int gap=0;
44 		int hashes=1;
45 		int thresh1=1;
46 		int thresh2=2;
47 		int matrixbits=34;
48 		long maxReads=-1;
49 		int buildpasses=1;
50 		long tablereads=-1; //How many reads to process when building the hashtable
51 		int buildStepsize=4;
52 		String output=null;
53 		boolean ordered=true;
54 		boolean overwrite=false;
55 		boolean append=false;
56 
57 		for(int i=2; i<args.length; i++){
58 			final String arg=args[i];
59 			final String[] split=arg.split("=");
60 			String a=split[0].toLowerCase();
61 			String b=split.length>1 ? split[1] : null;
62 
63 			if(Parser.parseZip(arg, a, b)){
64 				//do nothing
65 			}else if(Parser.parseQuality(arg, a, b)){
66 				//do nothing
67 			}else if(a.equals("k") || a.equals("kmer")){
68 				k=Integer.parseInt(b);
69 			}else if(a.startsWith("cbits") || a.startsWith("cellbits")){
70 				cbits=Integer.parseInt(b);
71 			}else if(a.equals("initialthresh") || a.equals("thresh1")){
72 				thresh1=Integer.parseInt(b);
73 			}else if(a.equals("thresh") || a.equals("thresh2")){
74 				thresh2=Integer.parseInt(b);
75 			}else if(a.startsWith("gap")){
76 				gap=Integer.parseInt(b);
77 			}else if(a.startsWith("matrixbits")){
78 				matrixbits=Integer.parseInt(b);
79 			}else if(a.startsWith("hashes") || a.startsWith("multihash")){
80 				hashes=Integer.parseInt(b);
81 			}else if(a.startsWith("maxerrors")){
82 				ERROR_CORRECTION_LIMIT=Integer.parseInt(b);
83 			}else if(a.startsWith("passes")){
84 				buildpasses=Integer.parseInt(b);
85 			}else if(a.startsWith("stepsize") || a.startsWith("buildstepsize")){
86 				buildStepsize=Integer.parseInt(b);
87 			}else if(a.equals("threads") || a.equals("t")){
88 				System.err.println("Can't change threadcount for this class."); //THREADS=Integer.parseInt(b);
89 			}else if(a.startsWith("reads") || a.startsWith("maxreads")){
90 				maxReads=Parse.parseKMG(b);
91 			}else if(a.startsWith("tablereads")){
92 				tablereads=Parse.parseKMG(b);
93 			}else if(a.startsWith("build") || a.startsWith("genome")){
94 				Data.setGenome(Integer.parseInt(b));
95 				Data.sysout.println("Set genome to "+Data.GENOME_BUILD);
96 			}else if(a.equals("outputinfo") || a.startsWith("info")){
97 				OUTPUT_INFO=Parse.parseBoolean(b);
98 			}else if(a.startsWith("out")){
99 				output=b;
100 			}else if(a.startsWith("verbose")){
101 				KCountArray.verbose=Parse.parseBoolean(b);
102 //				verbose=KCountArray.verbose=Parse.parseBoolean(b);
103 			}else if(a.equals("ordered") || a.equals("ord")){
104 				ordered=Parse.parseBoolean(b);
105 			}else if(a.equals("append") || a.equals("app")){
106 				append=ReadStats.append=Parse.parseBoolean(b);
107 			}else if(a.equals("overwrite") || a.equals("ow")){
108 				overwrite=Parse.parseBoolean(b);
109 			}else{
110 				throw new RuntimeException("Unknown parameter "+args[i]);
111 			}
112 		}
113 
114 		{//Process parser fields
115 			Parser.processQuality();
116 		}
117 
118 		KCountArray kca=makeTable(reads1, reads2, k, cbits, gap, hashes, buildpasses, matrixbits, tablereads, buildStepsize, thresh1, thresh2);
119 
120 		detect(reads1, reads2, kca, k, thresh2, maxReads, output, ordered, append, overwrite);
121 	}
122 
makeTable(String reads1, String reads2, int k, int cbits, int gap, int hashes, int buildpasses, int matrixbits, long maxreads, int stepsize, int thresh1, int thresh2)123 	public static KCountArray makeTable(String reads1, String reads2, int k, int cbits, int gap, int hashes, int buildpasses, int matrixbits,
124 			long maxreads, int stepsize, int thresh1, int thresh2){
125 
126 		Timer thash=new Timer();
127 
128 		KmerCount6.maxReads=maxreads;
129 		int kbits=2*k;
130 		matrixbits=Tools.min(kbits, matrixbits);
131 
132 		thash.start();
133 //		Data.sysout.println("kbits="+(kbits)+" -> "+(1L<<kbits)+", matrixbits="+(matrixbits)+" -> "+(1L<<matrixbits)+", cbits="+cbits+", gap="+gap+", hashes="+hashes);
134 		KCountArray kca=KCountArray.makeNew(1L<<kbits, 1L<<matrixbits, cbits, gap, hashes);
135 
136 		assert(gap==0) : "TODO";
137 		if(buildpasses==1){
138 
139 			KmerCount6.count(reads1, reads2, k, cbits, gap, true, kca);
140 			kca.shutdown();
141 
142 		}else{
143 			assert(buildpasses>1);
144 			KCountArray trusted=null;
145 			for(int i=1; i<buildpasses; i++){
146 				boolean conservative=i>2;// /*or, alternately, (trusted==null || trusted.capacity()>0.3)
147 				int step=(stepsize==1 ? 1 : stepsize+i%2);
148 //				if(!conservative){step=(step+3)/4;}
149 				if(!conservative){step=Tools.min(3, (step+3)/4);}
150 
151 				KmerCount6.count(reads1, reads2, k, cbits, true, kca, trusted, maxreads, thresh1, step, conservative);
152 
153 				kca.shutdown();
154 				Data.sysout.println("Trusted:   \t"+kca.toShortString());
155 				trusted=kca;
156 				kca=KCountArray.makeNew(1L<<kbits, 1L<<matrixbits, cbits, gap, hashes);
157 
158 			}
159 
160 			KmerCount6.count(reads1, reads2, k, cbits, true, kca, trusted, maxreads, thresh2, stepsize, true);
161 
162 			kca.shutdown();
163 		}
164 
165 
166 		thash.stop();
167 //		Data.sysout.println(kca.toString());
168 		Data.sysout.println("Table :    \t"+kca.toShortString());
169 		Data.sysout.println("Hash time:      \t"+thash);
170 		return kca;
171 	}
172 
detect(String reads1, String reads2, KCountArray kca, int k, int thresh, long maxReads, String output, boolean ordered, boolean append, boolean overwrite)173 	public static void detect(String reads1, String reads2, KCountArray kca, int k, int thresh, long maxReads, String output, boolean ordered, boolean append, boolean overwrite) {
174 
175 
176 		final ConcurrentReadInputStream cris;
177 		{
178 			FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true);
179 			FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true);
180 			cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2);
181 		}
182 		assert(cris!=null) : reads1;
183 
184 		cris.start();
185 		if(verbose){System.err.println("Started cris");}
186 		boolean paired=cris.paired();
187 		if(verbose){System.err.println("Paired: "+paired);}
188 
189 		ConcurrentReadOutputStream ros=null;
190 		if(output!=null){
191 			String out1=output.replaceFirst("#", "1"), out2=null;
192 
193 			if(cris.paired()){
194 				if(output.contains("#")){
195 					out2=output.replaceFirst("#", "2");
196 				}else{
197 					System.err.println("Writing interleaved.");
198 				}
199 			}
200 
201 			final int buff=(!ordered ? 8 : Tools.max(16, 2*THREADS));
202 
203 			FileFormat ff1=FileFormat.testOutput(out1, FileFormat.FASTQ, OUTPUT_INFO ? ".info" : null, true, overwrite, append, ordered);
204 			FileFormat ff2=FileFormat.testOutput(out2, FileFormat.FASTQ, OUTPUT_INFO ? ".info" : null, true, overwrite, append, ordered);
205 			ros=ConcurrentReadOutputStream.getStream(ff1, ff2, buff, null, true);
206 
207 			assert(!ff1.sam()) : "Sam files need reference info for the header.";
208 		}
209 
210 
211 		if(ros!=null){
212 			ros.start();
213 			Data.sysout.println("Started output threads.");
214 		}
215 
216 		detect(cris, kca, k, thresh, maxReads, ros);
217 
218 		ReadWrite.closeStreams(cris, ros);
219 		if(verbose){System.err.println("Closed stream");}
220 	}
221 
detect(ConcurrentReadInputStream cris, KCountArray kca, int k, int thresh, long maxReads, ConcurrentReadOutputStream ros)222 	public static void detect(ConcurrentReadInputStream cris, KCountArray kca, int k, int thresh, long maxReads, ConcurrentReadOutputStream ros) {
223 		Timer tdetect=new Timer();
224 		tdetect.start();
225 
226 		ListNum<Read> ln=cris.nextList();
227 		ArrayList<Read> reads=(ln!=null ? ln.list : null);
228 
229 		long covered=0;
230 		long uncovered=0;
231 
232 		long coveredFinal=0;
233 		long uncoveredFinal=0;
234 
235 		long fullyCorrected=0;
236 		long failed=0;
237 
238 		long totalBases=0;
239 		long totalReads=0;
240 
241 
242 		while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning
243 			for(Read r : reads){
244 				Read r2=r.mate;
245 				{
246 
247 //					if(r.numericID==23){verbose=true;}
248 
249 					totalReads++;
250 					if(verbose){System.err.println();}
251 					totalBases+=r.length();
252 //					BitSet bs=detectErrors(r, kca, k, thresh);
253 					BitSet bs=detectErrorsBulk(r, kca, k, thresh, 1);
254 					if(verbose){System.err.println(toString(bs, r.length()));}
255 //					Data.sysout.println(toString(detectErrorsTips(r, kca, k, thresh), r.length()));
256 					if(verbose){System.err.println(toString(detectErrors(r, kca, k, thresh), r.length()-k+1));}
257 					if(bs==null){//No errors, or can't detect errors
258 						assert(false);
259 					}else{
260 						int x=bs.cardinality();
261 						covered+=x;
262 						uncovered+=(r.length()-x);
263 						if(x<r.length()){
264 							bs=correctErrors(r, kca, k, thresh, bs, ERROR_CORRECTION_LIMIT, MAX_ERROR_BURST);
265 						}
266 						int y=bs.cardinality();
267 						coveredFinal+=y;
268 						uncoveredFinal+=(r.length()-y);
269 						if(x<r.length()){
270 							if(y==r.length()){
271 								fullyCorrected++;
272 							}else{
273 								failed++;
274 							}
275 						}
276 					}
277 				}
278 				if(r2!=null){
279 					totalReads++;
280 					totalBases+=r2.length();
281 //					BitSet bs=detectErrors(r2, kca, k, thresh);
282 					BitSet bs=detectErrorsBulk(r2, kca, k, thresh, 1);
283 					if(verbose){System.err.println(toString(bs, r2.length()));}
284 //					Data.sysout.println(toString(detectErrorsTips(r2, kca, k, thresh), r2.length()));
285 					if(verbose){System.err.println(toString(detectErrors(r2, kca, k, thresh), r2.length()-k+1));}
286 					if(bs==null){//No errors, or can't detect errors
287 					}else{
288 						int x=bs.cardinality();
289 						covered+=x;
290 						uncovered+=(r2.length()-x);
291 						if(x<r2.length()){
292 							bs=correctErrors(r2, kca, k, thresh, bs, ERROR_CORRECTION_LIMIT, MAX_ERROR_BURST);
293 						}
294 						int y=bs.cardinality();
295 						coveredFinal+=y;
296 						uncoveredFinal+=(r2.length()-y);
297 						if(x<r2.length()){
298 							if(y==r2.length()){
299 								fullyCorrected++;
300 							}else{
301 								failed++;
302 							}
303 						}
304 					}
305 				}
306 			}
307 
308 			if(ros!=null){ //Important to send all lists to output, even empty ones, to keep list IDs straight.
309 				if(DONT_OUTPUT_BAD_READS){removeBad(reads);}
310 				for(Read r : reads){
311 					if(r!=null){
312 						r.samline=null;
313 						assert(r.bases!=null);
314 						if(r.sites!=null && r.sites.isEmpty()){r.sites=null;}
315 					}
316 				}
317 //				System.err.println("Adding list of length "+readlist.size());
318 				ros.add(reads, ln.id);
319 			}
320 
321 			cris.returnList(ln);
322 			//System.err.println("fetching list");
323 			ln=cris.nextList();
324 			reads=(ln!=null ? ln.list : null);
325 		}
326 		if(verbose){System.err.println("Finished reading");}
327 		cris.returnList(ln);
328 		if(verbose){System.err.println("Returned list");}
329 
330 		tdetect.stop();
331 		Data.sysout.println("Detect time:    \t"+tdetect);
332 		Data.sysout.println("Total reads:    \t"+totalReads);
333 		Data.sysout.println("Total bases:    \t"+totalBases);
334 		Data.sysout.println("Reads Corrected:\t"+fullyCorrected);
335 		Data.sysout.println("Reads Failed:   \t"+failed);
336 
337 		Data.sysout.println("\n - before correction - ");
338 		Data.sysout.println("Covered:        \t"+covered);
339 		Data.sysout.println("Uncovered:      \t"+uncovered);
340 
341 		Data.sysout.println("\n -  after correction - ");
342 		Data.sysout.println("Covered:        \t"+coveredFinal);
343 		Data.sysout.println("Uncovered:      \t"+uncoveredFinal);
344 	}
345 
346 	/** Sets a 1 bit at start of each kmer with count at least thresh */
detectErrors(final Read r, final KCountArray kca, final int k, final int thresh)347 	public static BitSet detectErrors(final Read r, final KCountArray kca, final int k, final int thresh){
348 
349 		final int kbits=2*k;
350 		final long mask=(kbits>63 ? -1L : ~((-1L)<<kbits));
351 		final int gap=kca.gap;
352 		final byte[] bases=r.bases;
353 		assert(kca.gap==0);
354 
355 		int bslen=r.length()-k-gap+1;
356 		if(bslen<1){return null;} //Read is too short to detect errors
357 		BitSet bs=new BitSet(bslen);
358 
359 		int len=0;
360 		long kmer=0;
361 		for(int i=0; i<bases.length; i++){
362 			byte b=bases[i];
363 			int x=AminoAcid.baseToNumber[b];
364 			if(x<0){
365 				len=0;
366 				kmer=0;
367 			}else{
368 				kmer=((kmer<<2)|x)&mask;
369 				len++;
370 
371 				if(len>=k){
372 					int count=kca.read(kmer);
373 					if(count>=thresh){
374 						bs.set(i+1-k);
375 					}
376 				}
377 			}
378 		}
379 
380 		return bs;
381 	}
382 
383 	/** Sets a 1 bit for every base covered by a kmer with count at least thresh */
detectErrorsBulk(final Read r, final KCountArray kca, final int k, final int thresh, final int stepsize)384 	public static BitSet detectErrorsBulk(final Read r, final KCountArray kca, final int k, final int thresh, final int stepsize){
385 
386 		final int kbits=2*k;
387 		final long mask=(kbits>63 ? -1L : ~((-1L)<<kbits));
388 		final int gap=kca.gap;
389 		final byte[] bases=r.bases;
390 		assert(gap==0);
391 
392 		if(r.bases==null || r.length()<k+gap){return null;} //Read is too short to detect errors
393 		BitSet bs=new BitSet(r.length());
394 		final int setlen=k+gap;
395 
396 		int len=0;
397 		long kmer=0;
398 		for(int i=0; i<bases.length; i++){
399 			byte b=bases[i];
400 			int x=AminoAcid.baseToNumber[b];
401 			if(x<0){
402 				len=0;
403 				kmer=0;
404 			}else{
405 				kmer=((kmer<<2)|x)&mask;
406 				len++;
407 
408 				if(len>=k && ((len-k)%stepsize==0 || i==bases.length-1)){
409 					int count=kca.read(kmer);
410 					if(count>=thresh){
411 						bs.set(i+1-setlen, i+1);
412 					}
413 				}
414 			}
415 		}
416 		r.errors=bs.cardinality()-r.length();
417 
418 		return bs;
419 	}
420 
421 	/** Sets 1 for all bases.
422 	 * Then clears all bits covered by incorrect kmers. */
detectTrusted(final Read r, final KCountArray kca, final int k, final int thresh, final int detectStepsize)423 	public static BitSet detectTrusted(final Read r, final KCountArray kca, final int k, final int thresh, final int detectStepsize){
424 
425 		final int kbits=2*k;
426 		final long mask=(kbits>63 ? -1L : ~((-1L)<<kbits));
427 		final int gap=kca.gap;
428 		final byte[] bases=r.bases;
429 		assert(gap==0);
430 
431 		if(r.bases==null || r.length()<k+gap){return null;} //Read is too short to detect errors
432 		BitSet bs=new BitSet(r.length());
433 		bs.set(0, r.length());
434 		final int setlen=k+gap;
435 
436 		int len=0;
437 		long kmer=0;
438 		for(int i=0; i<bases.length; i++){
439 			byte b=bases[i];
440 			int x=AminoAcid.baseToNumber[b];
441 			if(x<0){
442 				len=0;
443 				kmer=0;
444 			}else{
445 				kmer=((kmer<<2)|x)&mask;
446 				len++;
447 
448 				if(len>=k && (i%detectStepsize==0 || i==bases.length-1)){
449 					int count=kca.read(kmer);
450 					if(count<thresh){
451 						bs.clear(i+1-setlen, i+1);
452 //						bs.clear(i+1-setlen+detectStepsize, i+1-detectStepsize);
453 //						bs.clear(i+k/2-detectStepsize, i+k/2+detectStepsize);
454 //						bs.clear(i+k/2);
455 					}
456 				}
457 			}
458 		}
459 //		assert(bases.length==r.length());
460 		return bs;
461 	}
462 
detectErrorsTips(final Read r, final KCountArray kca, final int k, final int thresh)463 	public static BitSet detectErrorsTips(final Read r, final KCountArray kca, final int k, final int thresh){
464 //		if(kca.gap>0){return detectErrorsSplit(r, kca, k, thresh);}
465 
466 		final int kbits=2*k;
467 		final long mask=(kbits>63 ? -1L : ~((-1L)<<kbits));
468 		final int gap=kca.gap;
469 		final byte[] bases=r.bases;
470 		assert(gap==0);
471 
472 		if(r.bases==null || r.length()<k+gap){return null;} //Read is too short to detect errors
473 		BitSet bs=new BitSet(r.length());
474 		final int setlen=k+gap;
475 
476 		int len=0;
477 		long kmer=0;
478 		for(int i=0; i<bases.length; i++){
479 			byte b=bases[i];
480 			int x=AminoAcid.baseToNumber[b];
481 			if(x<0){
482 				len=0;
483 				kmer=0;
484 			}else{
485 				kmer=((kmer<<2)|x)&mask;
486 				len++;
487 
488 				if(len>=k){
489 					int count=kca.read(kmer);
490 					if(count>=thresh){
491 						bs.set(i+1-setlen);
492 						bs.set(i);
493 					}
494 				}
495 			}
496 		}
497 		return bs;
498 	}
499 
500 
501 	/** Assumes bulk mode was used; e.g., any '0' bit is covered by no correct kmers */
correctErrors(final Read r, final KCountArray kca, final int k, final int thresh, BitSet bs, final int maxCorrections, final int maxBurst)502 	public static BitSet correctErrors(final Read r, final KCountArray kca, final int k, final int thresh, BitSet bs, final int maxCorrections, final int maxBurst){
503 		if(kca.gap>0){assert(false) : "TODO";}
504 
505 		assert(!OUTPUT_INFO) : "TODO: Outputting correction data is not yet supported.";
506 
507 		int corrections=0; //Alternately, corrections=r.errorsCorrected
508 		r.errors=0;
509 
510 		if(bs.cardinality()==0){//Cannot be corrected
511 			r.errors=r.length();
512 			return bs;
513 		}
514 
515 //		verbose=!bs.get(0);
516 		if(verbose){
517 			Data.sysout.println();
518 			Data.sysout.println(toString(bs, r.length()));
519 			Data.sysout.println(toString(detectErrorsTips(r, kca, k, thresh), r.length()));
520 			Data.sysout.println(toString(detectErrors(r, kca, k, thresh), r.length()-k+1));
521 		}
522 
523 
524 		int lastloc=-99;
525 		int burst=1;
526 		while(!bs.get(0) && corrections<maxCorrections){//While the read starts with a '0', correct from the right.
527 //			Data.sysout.println("Could not correct.");
528 //			return bs;
529 			int errorLoc=bs.nextSetBit(0)-1;//Location to left of first '1'
530 			if(Tools.absdif(errorLoc,lastloc)<=BURST_THRESH){burst++;}
531 			else{burst=1;}
532 			lastloc=errorLoc;
533 			boolean success=(burst<=MAX_ERROR_BURST) && correctFromRight(r, kca, k, thresh, bs, errorLoc);
534 			if(success){
535 				corrections++;
536 				bs=detectErrorsBulk(r, kca, k, thresh, 1);
537 				if(verbose){System.err.println(">\n"+toString(bs, r.length()));}
538 			}else{
539 				r.errors=r.length()-bs.cardinality();
540 //				r.errorsCorrected+=corrections;
541 				if(verbose){System.err.println("Could not correct.");}
542 				r.bases[errorLoc]='N';
543 				r.quality[errorLoc]=0;
544 				return bs;
545 			}
546 		}
547 
548 		burst=1;
549 		while(bs.cardinality()<r.length() && corrections<maxCorrections){
550 			if(bs.get(0)){//First bit is a "1", can correct from the left
551 				int errorLoc=bs.nextClearBit(0);//Location to left of first '0'
552 				if(Tools.absdif(errorLoc,lastloc)<=BURST_THRESH){burst++;}
553 				else{burst=1;}
554 				lastloc=errorLoc;
555 				boolean success=(burst<=MAX_ERROR_BURST) && correctFromLeft(r, kca, k, thresh, bs, errorLoc);
556 				if(success){
557 					corrections++;
558 					bs=detectErrorsBulk(r, kca, k, thresh, 1);
559 					if(verbose){System.err.println(">\n"+toString(bs, r.length()));}
560 				}else{
561 					r.errors=r.length()-bs.cardinality();
562 //					r.errorsCorrected+=corrections;
563 					r.bases[errorLoc]='N';
564 					r.quality[errorLoc]=0;
565 					if(verbose){System.err.println("Could not correct.");}
566 					return bs;
567 				}
568 			}
569 		}
570 
571 		r.errors=r.length()-bs.cardinality();
572 //		r.errorsCorrected+=corrections;
573 		assert(corrections<=maxCorrections);
574 		return bs;
575 	}
576 
577 
578 	/**
579 	 * @param r
580 	 * @param kca
581 	 * @param k
582 	 * @param thresh
583 	 * @param bs
584 	 * @param errorLoc
585 	 * @return
586 	 */
correctFromLeft(Read r, KCountArray kca, int k, int thresh, BitSet bs, int error)587 	private static boolean correctFromLeft(Read r, KCountArray kca, int k, int thresh, BitSet bs, int error) {
588 		final int kbits=2*k;
589 		final long mask=(kbits>63 ? -1L : ~((-1L)<<kbits));
590 		final int gap=kca.gap;
591 		final int setlen=k+gap;
592 		final int startLoc=error-(setlen)+1;
593 		final byte oldBase=r.bases[error];
594 		final byte[] bases=r.bases;
595 
596 		final int minAdvance=Tools.min(MIN_ADVANCE, bases.length-error);
597 
598 		long kmer=0;
599 		int len=0;
600 		for(int i=startLoc; i<error; i++){
601 			byte b=bases[i];
602 			int x=AminoAcid.baseToNumber[b];
603 			if(x<0){
604 				len=0;
605 				kmer=0;
606 				throw new RuntimeException("Can't correct from left!\nerror="+error+"\n"+toString(bs, bases.length)+"\n"+new String(bases)+"\nreadID: "+r.numericID);
607 			}else{
608 				kmer=((kmer<<2)|x)&mask;
609 				len++;
610 			}
611 		}
612 		assert(len==setlen-1) : setlen+", "+len+", "+error+", "+startLoc;
613 
614 		int[] counts=KillSwitch.allocInt1D(4);
615 		int[] dists=KillSwitch.allocInt1D(4);
616 		int maxLoc=Tools.min(bases.length-1, error+setlen-1);
617 		if(!bs.get(error+1)){maxLoc=Tools.min(maxLoc, error+9);}
618 		else{
619 			for(int i=error+2; i<=maxLoc; i++){
620 				if(!bs.get(i)){
621 					maxLoc=i-1;
622 					break;
623 				}
624 			}
625 		}
626 
627 		if(verbose){System.err.println("correctFromLeft.  Error = "+error+", maxloc="+maxLoc);}
628 		for(int bnum=0; bnum<4; bnum++){
629 			byte c=AminoAcid.numberToBase[bnum];
630 			bases[error]=c;
631 			if(verbose){System.err.println("Considering "+(char)c);}
632 			long key=kmer;
633 			for(int loc=error; loc<=maxLoc; loc++){
634 				c=bases[loc];
635 				int x=AminoAcid.baseToNumber[c];
636 				if(x<0){
637 					if(verbose){System.err.println("break: N");}
638 					break;
639 				}
640 				key=((key<<2)|x)&mask;
641 				int count=kca.read(key);
642 				if(count<thresh){
643 					if(verbose){System.err.println("break: count="+count);}
644 					break;
645 				}
646 				dists[bnum]++;
647 				counts[bnum]+=count;
648 			}
649 		}
650 		bases[error]=oldBase;
651 
652 		//Note:  I could require both to be the same, to decrease false-positives
653 
654 		final int muid=maxUniqueIndex(dists);
655 		Arrays.sort(dists);
656 		final int advance=dists[3];
657 		final int delta=dists[3]-dists[2];
658 //		if(advance<minAdvance){return false;}
659 		if(delta<minAdvance){return false;}
660 
661 		int best=(muid<0 ? maxUniqueIndex(counts) : muid);
662 
663 		if(verbose){System.err.println("Best="+best+": "+Arrays.toString(dists)+"  \t"+Arrays.toString(counts));}
664 		if(best<0){return false;}
665 		byte bestC=AminoAcid.numberToBase[best];
666 		if(bestC==oldBase){return false;}
667 		bases[error]=bestC;
668 
669 		r.quality[error]=(byte)Tools.min(10, 3+delta);
670 
671 		return true;
672 	}
673 
674 
675 
676 	/**
677 	 * @param r
678 	 * @param kca
679 	 * @param k
680 	 * @param thresh
681 	 * @param bs
682 	 * @param errorLoc
683 	 * @return
684 	 */
correctFromRight(Read r, KCountArray kca, int k, int thresh, BitSet bs, int error)685 	private static boolean correctFromRight(Read r, KCountArray kca, int k, int thresh, BitSet bs, int error) {
686 		final int kbits=2*k;
687 		final int shift=kbits-2;
688 		final long mask=(kbits>63 ? -1L : ~((-1L)<<kbits));
689 		final int gap=kca.gap;
690 		final int setlen=k+gap;
691 		final int stopLoc=error+(setlen)-1;
692 		final byte oldBase=r.bases[error];
693 		final byte[] bases=r.bases;
694 
695 		final int minAdvance=Tools.min(MIN_ADVANCE, error+1);
696 
697 		long kmer=0;
698 		int len=0;
699 
700 		for(int i=error+1; i<=stopLoc; i++){
701 			byte b=bases[i];
702 			int x=AminoAcid.baseToNumber[b];
703 			if(x<0){
704 				len=0;
705 				kmer=0;
706 				throw new RuntimeException("Can't correct from right!\nerror="+error+"\n"+toString(bs, bases.length)+"\n"+new String(bases));
707 			}else{
708 				kmer=((kmer<<2)|x)&mask;
709 				len++;
710 			}
711 //			Data.sysout.print((char)b);
712 		}
713 		kmer<<=2;
714 
715 		if(verbose){
716 			Data.sysout.println();
717 			String s=Long.toBinaryString(kmer);
718 			while(s.length()<kbits){s="0"+s;}
719 			Data.sysout.println("kmer = \t"+s);
720 		}
721 		assert(len==setlen-1) : setlen+", "+len+", "+error+", "+stopLoc;
722 
723 		int[] counts=KillSwitch.allocInt1D(4);
724 		int[] dists=KillSwitch.allocInt1D(4);
725 		int minLoc=Tools.max(0, error-setlen+1);
726 		if(error==0 || !bs.get(error-1)){minLoc=Tools.max(minLoc, error-9);}
727 		else{
728 			for(int i=error-2; i>=minLoc; i--){
729 				if(!bs.get(i)){
730 					minLoc=i+1;
731 					break;
732 				}
733 			}
734 		}
735 
736 		if(verbose){
737 			Data.sysout.println("correctFromRight.  Error = "+error+", minloc="+minLoc);
738 			Data.sysout.println(new String(r.bases));
739 		}
740 		for(int bnum=0; bnum<4; bnum++){
741 			byte c=AminoAcid.numberToBase[bnum];
742 			bases[error]=c;
743 			if(verbose){System.err.println("Considering "+(char)c);}
744 			long key=kmer;
745 			for(int loc=error; loc>=minLoc; loc--){
746 				c=bases[loc];
747 				int x=AminoAcid.baseToNumber[c];
748 				if(x<0){
749 					if(verbose){System.err.println("break: N");}
750 					break;
751 				}
752 				key=((key>>2)|(((long)x)<<shift))&mask;
753 //				{
754 //					String s=Long.toBinaryString(key);
755 //					while(s.length()<kbits){s="0"+s;}
756 //					Data.sysout.println("mask="+Long.toBinaryString(mask)+", shift="+shift+", c="+c+", x="+x+", key  = \t"+s);
757 //				}
758 				int count=kca.read(key);
759 				if(count<thresh){
760 					if(verbose){System.err.println("break: count="+count);}
761 					break;
762 				}
763 				dists[bnum]++;
764 				counts[bnum]+=count;
765 			}
766 		}
767 		bases[error]=oldBase;
768 
769 		//Note:  I could require both to be the same, to decrease false-positives
770 
771 		final int muid=maxUniqueIndex(dists);
772 		Arrays.sort(dists);
773 		final int advance=dists[3];
774 		final int delta=dists[3]-dists[2];
775 //		if(advance<minAdvance){return false;}
776 		if(delta<minAdvance){return false;}
777 
778 		int best=(muid<0 ? maxUniqueIndex(counts) : muid);
779 
780 		if(verbose){System.err.println("Best="+best+": "+Arrays.toString(dists)+"  \t"+Arrays.toString(counts));}
781 		if(best<0){return false;}
782 		byte bestC=AminoAcid.numberToBase[best];
783 		if(bestC==oldBase){return false;}
784 		bases[error]=bestC;
785 
786 		r.quality[error]=(byte)Tools.min(10, 3+delta);
787 
788 		return true;
789 	}
790 
791 
792 	/** returns index of highest value, if unique; else a negative number */
maxUniqueIndex(int[] array)793 	private static int maxUniqueIndex(int[] array){
794 		int max=array[0];
795 		int maxIndex=0;
796 		for(int i=1; i<array.length; i++){
797 			if(array[i]>max){
798 				max=array[i];
799 				maxIndex=i;
800 			}else if(max==array[i]){
801 				maxIndex=-1;
802 			}
803 		}
804 		return maxIndex;
805 	}
806 
toString(BitSet bs, int len)807 	public static final String toString(BitSet bs, int len){
808 //		assert(verbose);
809 		StringBuilder sb=new StringBuilder(len);
810 		for(int i=0; i<len; i++){sb.append(bs.get(i) ? '1' : '0');}
811 		return sb.toString();
812 	}
813 
removeBad(ArrayList<Read> list)814 	private static void removeBad(ArrayList<Read> list){
815 
816 		for(int i=0; i<list.size(); i++){
817 			Read r=list.get(i);
818 			if(r.errors>0){
819 				if(r.mate==null || r.mate.errors>0){
820 					list.set(i, null);
821 				}
822 			}
823 		}
824 
825 	}
826 
827 	public static boolean verbose=false;
828 	/** Bails out if a read still has errors after correcting this many. */
829 	public static int ERROR_CORRECTION_LIMIT=6;
830 	/** Max allowed number of nearby corrections.
831 	 * A long error burst indicates the read simply has low coverage, and is not being corrected correctly. */
832 	public static int MAX_ERROR_BURST=3;
833 	/** Bursts have at most this distance between errors. E.G. '1' means errors are adjacent. */
834 	public static int BURST_THRESH=2;
835 	/** Withhold uncorrectable reads from output. */
836 	public static boolean DONT_OUTPUT_BAD_READS=false;
837 	/** Do not correct an error if it is at most this far from the next error.  Instead, bail out. */
838 	public static int MIN_ADVANCE=1;
839 
840 	/** Number of threads used for error correction.  Does not control number of threads for creating the hash table.
841 	 * Additionally, up to 2 threads are used for reading and up to 2 for writing.  For this (singlethreaded) class, the number does nothing. */
842 	public static final int THREADS=1;
843 
844 	/** Output correction data instead of the corrected read */
845 	public static boolean OUTPUT_INFO=false;
846 
847 
848 }
849