1 package jgi;
2 
3 import java.util.Arrays;
4 import java.util.Locale;
5 
6 import assemble.Tadpole;
7 import assemble.Tadpole1;
8 import assemble.Tadpole2;
9 import dna.AminoAcid;
10 import shared.KillSwitch;
11 import shared.Shared;
12 import shared.Tools;
13 import stream.Read;
14 import ukmer.Kmer;
15 
16 /**
17  * @author Brian Bushnell
18  * @date Apr 15, 2014
19  *
20  */
21 public final class BBMergeOverlapper {
22 
23 	static{
24 		if(false && Shared.USE_JNI){//TODO: Disabled!
Shared.loadJNI()25 			Shared.loadJNI();
26 		}
27 	}
28 
mateByOverlapJNI(byte[] a_bases, byte[] b_bases, byte[] a_quality, byte[] b_quality, float[] aprob, float[] bprob, int[] rvector, int minOverlap0, int minOverlap, int minInsert0, int margin, int maxMismatches0, int maxMismatches, int minq)29 	private static native final int mateByOverlapJNI(byte[] a_bases, byte[] b_bases, byte[] a_quality, byte[] b_quality,
30 			float[] aprob, float[] bprob, int[] rvector, int minOverlap0, int minOverlap, int minInsert0, int margin,
31 			int maxMismatches0, int maxMismatches, int minq);
32 
mateByOverlapRatioJNI_WithQualities(byte[] a_bases, byte[] b_bases, byte[] a_quality, byte[] b_quality, float[] aprob, float[] bprob, int[] rvector, int minOverlap0, int minOverlap, int minInsert0, int minInsert, float maxRatio, float margin, float offset)33 	private static native final int mateByOverlapRatioJNI_WithQualities(byte[] a_bases, byte[] b_bases, byte[] a_quality, byte[] b_quality,
34 			float[] aprob, float[] bprob, int[] rvector, int minOverlap0, int minOverlap, int minInsert0, int minInsert, float maxRatio,
35 			float margin, float offset);
36 
mateByOverlapRatioJNI(byte[] a_bases, byte[] b_bases, int[] rvector, int minOverlap0, int minOverlap, int minInsert0, int minInsert, float maxRatio, float margin, float offset, float gIncr, float bIncr)37 	private static native final int mateByOverlapRatioJNI(byte[] a_bases, byte[] b_bases,
38 			int[] rvector, int minOverlap0, int minOverlap, int minInsert0, int minInsert, float maxRatio,
39 			float margin, float offset, float gIncr, float bIncr);
40 
mateByOverlap(Read a, Read b, float[] aprob, float[] bprob, int[] rvector, int minOverlap0, final int minOverlap, final int minInsert0, int margin, final int maxMismatches0, final int maxMismatches, final int minq)41 	protected static final int mateByOverlap(Read a, Read b, float[] aprob, float[] bprob, int[] rvector,
42 			int minOverlap0, final int minOverlap, final int minInsert0, int margin, final int maxMismatches0, final int maxMismatches, final int minq) {
43 		if(rvector==null){rvector=KillSwitch.allocInt1D(5);}
44 		final int x;
45 		if(false && Shared.USE_JNI){//TODO: Disabled!
46 			x=mateByOverlapJNI(a.bases, b.bases, a.quality, b.quality, aprob, bprob, rvector, minOverlap0, minOverlap, minInsert0, margin, maxMismatches0, maxMismatches, minq);
47 		}else{
48 			x=mateByOverlapJava_unrolled(a, b, aprob, bprob, rvector, minOverlap0, minOverlap, minInsert0, margin, maxMismatches0, maxMismatches, minq);
49 		}
50 		return x;
51 	}
52 
mateByOverlapRatio(Read a, Read b, float[] aprob, float[] bprob, int[] rvector, int minOverlap0, int minOverlap, int minInsert0, int minInsert, final float maxRatio, final float minSecondRatio, final float margin, final float offset, final float gIncr, final float bIncr, boolean useQuality)53 	protected static final int mateByOverlapRatio(Read a, Read b, float[] aprob, float[] bprob, int[] rvector,
54 			int minOverlap0, int minOverlap, int minInsert0, int minInsert, final float maxRatio, final float minSecondRatio,
55 			final float margin, final float offset, final float gIncr, final float bIncr, boolean useQuality) {
56 		if(rvector==null){rvector=KillSwitch.allocInt1D(5);}
57 //		final boolean swapped;
58 //		if(a.length()>b.length()){
59 //			swapped=true;
60 //			a.swapBasesWithMate();
61 //			a.reverseComplement();
62 //			b.reverseComplement();
63 //		}else{
64 //			swapped=false;
65 //		}
66 
67 		final int x;
68 		if(false && Shared.USE_JNI/* && !useQuality*/){
69 			if(useQuality && a.quality!=null && b.quality!=null){
70 				x=mateByOverlapRatioJNI_WithQualities(a.bases, b.bases, a.quality, b.quality, aprob, bprob, rvector, minOverlap0, minOverlap, minInsert0, minInsert, maxRatio, margin, offset);
71 			}else{
72 				x=mateByOverlapRatioJNI(a.bases, b.bases, rvector, minOverlap0, minOverlap, minInsert0, minInsert, maxRatio, margin, offset, gIncr, bIncr);
73 			}
74 		}else{
75 			if(useQuality && a.quality!=null && b.quality!=null){
76 				x=mateByOverlapRatioJava_WithQualities(a, b, aprob, bprob, rvector, minOverlap0, minOverlap, minInsert0, minInsert,
77 						maxRatio, minSecondRatio, margin, offset);
78 			}else{
79 				x=mateByOverlapRatioJava(a, b, rvector, minOverlap0, minOverlap, minInsert0, minInsert, maxRatio, minSecondRatio, margin, offset, gIncr, bIncr);
80 			}
81 		}
82 //		if(swapped){
83 //			a.swapBasesWithMate();
84 //			a.reverseComplement();
85 //			b.reverseComplement();
86 //		}
87 		return x;
88 	}
89 
mateByOverlapRatioJava_WithQualities(Read a, Read b, float[] aprob, float[] bprob, int[] rvector, int minOverlap0, int minOverlap, int minInsert0, int minInsert, float maxRatio, final float minSecondRatio, final float margin, final float offset)90 	protected static final int mateByOverlapRatioJava_WithQualities(Read a, Read b, float[] aprob, float[] bprob, int[] rvector,
91 			int minOverlap0, int minOverlap, int minInsert0, int minInsert, float maxRatio, final float minSecondRatio,
92 			final float margin, final float offset) {
93 		assert(rvector!=null);
94 		assert(margin>=1);
95 		minOverlap=Tools.max(4, minOverlap0, minOverlap);
96 		minOverlap0=Tools.mid(4, minOverlap0, minOverlap);
97 		if(rvector==null){rvector=KillSwitch.allocInt1D(5);}
98 
99 		final byte[] abases=a.bases, bbases=b.bases, aqual=a.quality, bqual=b.quality;
100 		final int alen=abases.length, blen=bbases.length;
101 		final int minLength=Tools.min(alen, blen);
102 
103 		assert(aqual!=null && bqual!=null);
104 		{
105 			for(int i=0; i<aqual.length; i++){aprob[i]=probCorrect3[aqual[i]];}
106 			for(int i=0; i<bqual.length; i++){bprob[i]=probCorrect3[bqual[i]];}
107 		}
108 
109 		{
110 			float x=findBestRatio_WithQualities(a, b, aprob, bprob, minOverlap0, minOverlap, minInsert, maxRatio, offset);
111 			if(!TAG_CUSTOM && x>maxRatio){
112 				rvector[2]=minLength;
113 				rvector[4]=0;
114 				return -1;
115 			}
116 			maxRatio=Tools.min(maxRatio, x);
117 		}
118 
119 //		final float altBadlimit=Tools.max(maxRatio, 0.07f)*2f*alen+2;
120 		final float margin2=(margin+offset)/minLength;
121 
122 		int bestInsert=-1;
123 		int bestOverlap=-1;
124 		int bestBadInt=-1;
125 		float bestBad=minLength;
126 		float bestRatio=1;
127 		boolean ambig=false;
128 
129 		float secondBestBad=0;
130 		float secondBestRatio=1;
131 		int secondBestInsert=0;
132 		int secondBestOverlap=0;
133 		int secondBestBadInt=-1;
134 
135 		final float extraMult=(TAG_CUSTOM ? 80 : 1.2f);
136 
137 		final int largestInsertToTest=(alen+blen-minOverlap0);
138 		final int smallestInsertToTest=minInsert0;
139 		for(int insert=largestInsertToTest; insert>=smallestInsertToTest; insert--){
140 			if(verbose){System.err.println("d\nTesting read "+a.numericID+", overlap "+insert+", insert "+(alen+blen-insert));}
141 
142 			float good=0, bad=0;
143 			int badInt=0;
144 
145 			final int istart=(insert<=blen ? 0 : insert-blen);
146 			final int jstart=(insert>=blen ? 0 : blen-insert);
147 
148 			final int overlapLength=Tools.min(alen-istart, blen-jstart, insert);
149 //			final float badlimit=extraMult*Tools.max(minSecondRatio, Tools.min(altBadlimit, (Tools.min(bestRatio, maxRatio)))*margin*overlapLength)+1f;
150 //			final float badlimit=extraMult*Tools.max(minSecondRatio, Tools.min(bestRatio, maxRatio)*margin*overlapLength)+1f;
151 			final float badlimit=extraMult*(Tools.min(bestRatio, maxRatio)*margin*overlapLength)+1f;
152 //			final float badlimit=extraMult*Tools.max(minSecondRatio, Tools.min(bestRatio, maxRatio))*margin*overlapLength+1f;
153 
154 			final int imax=istart+overlapLength;
155 			for(int i=istart, j=jstart; i<imax && bad<=badlimit; i++, j++){
156 				assert(i>=0 && i<alen && j>=0 && j<blen) : "\njstart="+jstart+", j="+j+
157 				", istart="+istart+", i="+i+" \n"+"insert="+insert+", overlap="+overlapLength+", a.length="+a.length()+
158 				", b.length="+b.length()+", bad="+bad+", badlimit="+badlimit+", good="+good;
159 				final byte ca=abases[i], cb=bbases[j];
160 
161 				final float x=aprob[i]*bprob[j];
162 
163 				if(ca==cb){good+=x;}
164 				else{
165 					bad+=x;
166 					badInt++;
167 				}
168 			}
169 
170 //			if(verbose || true){
171 //				System.err.println("istart="+istart+", jstart="+jstart+", overlapLength="+overlapLength+", overlap="+overlap+", bestOverlap="+bestOverlap);
172 //				System.err.println("overlap="+overlap+", bad="+bad+", good="+good);
173 //				System.err.println("bestGood="+bestGood+", bestBad="+bestBad);
174 //				System.err.println();
175 //			}
176 
177 			if(bad<=badlimit){
178 				if(bad==0 && good>minOverlap0 && good<minOverlap){
179 					rvector[2]=bestBadInt;//=(int)(bestBad+0.95f);
180 					rvector[4]=1;
181 					return -1;
182 				}
183 
184 				float ratio=(bad+offset)/overlapLength;
185 //				System.err.println("*** ratio="+ratio+", bestRatio="+bestRatio);
186 
187 				if(ratio<bestRatio*margin){
188 
189 					ambig=(ratio*margin>=bestRatio || good<minOverlap);
190 					if(ratio<bestRatio){
191 						secondBestInsert=bestInsert;
192 						secondBestOverlap=bestOverlap;
193 						secondBestBad=bestBad;
194 						secondBestRatio=bestRatio;
195 						secondBestBadInt=bestBadInt;
196 
197 						bestInsert=insert;
198 						bestOverlap=overlapLength;
199 						bestBad=bad;
200 						bestRatio=ratio;
201 						bestBadInt=badInt;
202 					}
203 					else if(ratio<secondBestRatio){
204 						secondBestInsert=insert;
205 						secondBestOverlap=overlapLength;
206 						secondBestBad=bad;
207 						secondBestRatio=ratio;
208 						secondBestBadInt=badInt;
209 					}
210 
211 					if(!TAG_CUSTOM && ((ambig && bestRatio<margin2) || secondBestRatio<minSecondRatio)){
212 						rvector[2]=bestBadInt;//=(int)(bestBad+0.95f);
213 						rvector[4]=1;
214 						return -1;
215 					}
216 				}
217 			}
218 		}
219 		if(secondBestRatio<minSecondRatio){ambig=true;}
220 
221 		if(TAG_CUSTOM){
222 			if(secondBestRatio<minSecondRatio){ambig=true;}
223 			StringBuilder sb=new StringBuilder(a.id);
224 
225 			sb.append("_bi=").append(bestInsert);
226 			sb.append("_bo=").append(bestOverlap);
227 			sb.append("_bb=").append(String.format(Locale.ROOT, "%.4f", bestBad));
228 			sb.append("_br=").append(String.format(Locale.ROOT, "%.4f", bestRatio));
229 			sb.append("_bbi=").append(bestBadInt);
230 
231 			sb.append("_sbi=").append(secondBestInsert);
232 			sb.append("_sbo=").append(secondBestOverlap);
233 			sb.append("_sbb=").append(String.format(Locale.ROOT, "%.4f", secondBestBad));
234 			sb.append("_sbr=").append(String.format(Locale.ROOT, "%.4f", secondBestRatio));
235 			sb.append("_sbbi=").append(secondBestBadInt);
236 
237 			a.id=sb.toString();
238 
239 			rvector[2]=bestBadInt;//=(int)(bestBad+0.95f);
240 			rvector[4]=(ambig ? 1 : 0);
241 
242 			return (bestInsert<0 ? -1 : bestInsert);
243 		}
244 
245 		if(!ambig && bestRatio>maxRatio){bestInsert=-1;}
246 
247 		rvector[2]=bestBadInt;//=(int)(bestBad+0.95f);
248 		rvector[4]=(ambig ? 1 : 0);
249 
250 //		System.err.println("***C : "+bestOverlap+", "+ambig+", "+bestBad+", "+(bestOverlap<0 ? -1 : alen+blen-bestOverlap)+", "+
251 //				(bestOverlap<0 ? -1 : (bestOverlap<alen && alen>=blen) ? bestOverlap+alen-blen : bestOverlap)+", "+alen+", "+blen);
252 		return (bestInsert<0 ? -1 : bestInsert);
253 	}
254 
mateByOverlapRatioJava(Read a, Read b, int[] rvector, int minOverlap0, int minOverlap, int minInsert0, int minInsert, float maxRatio, final float minSecondRatio, final float margin, final float offset, final float gIncr, final float bIncr)255 	protected static final int mateByOverlapRatioJava(Read a, Read b, int[] rvector,
256 			int minOverlap0, int minOverlap, int minInsert0, int minInsert, float maxRatio, final float minSecondRatio,
257 			final float margin, final float offset, final float gIncr, final float bIncr) {
258 		assert(rvector!=null);
259 		assert(margin>=1);
260 		minOverlap=Tools.max(4, minOverlap0, minOverlap);
261 		minOverlap0=Tools.mid(4, minOverlap0, minOverlap);
262 		if(rvector==null){rvector=KillSwitch.allocInt1D(5);}
263 
264 		final byte[] abases=a.bases, bbases=b.bases;
265 		final int alen=abases.length, blen=bbases.length;
266 		final int minLength=Tools.min(alen, blen);
267 
268 		{
269 			float x=findBestRatio(a, b, minOverlap0, minOverlap, minInsert, maxRatio, offset, gIncr, bIncr);
270 			if(verbose){
271 				System.err.println(x+", "+maxRatio+", "+Arrays.toString(rvector));
272 			}
273 			if(x>maxRatio){
274 				rvector[2]=minLength;
275 				rvector[4]=0;
276 				return -1;
277 			}
278 			maxRatio=Tools.min(maxRatio, x);
279 		}
280 
281 //		final float altBadlimit=Tools.max(maxRatio, 0.07f)*2f*alen+1;
282 		final float margin2=(margin+offset)/minLength;
283 		final byte N='N';
284 
285 		int bestInsert=-1;
286 		int bestOverlap=-1;
287 		int bestBadInt=-1;
288 		float bestBad=minLength;
289 		float bestRatio=1;
290 		boolean ambig=false;
291 
292 		float secondBestBad=0;
293 		float secondBestRatio=1;
294 		int secondBestInsert=0;
295 		int secondBestOverlap=0;
296 		int secondBestBadInt=-1;
297 
298 		final float extraMult=(TAG_CUSTOM ? 80 : 1.2f);
299 
300 		final int largestInsertToTest=(alen+blen-minOverlap0);
301 		final int smallestInsertToTest=minInsert0;
302 		for(int insert=largestInsertToTest; insert>=smallestInsertToTest; insert--){
303 //			if(verbose){System.err.println("Testing read "+a.numericID+", overlap "+insert+", insert "+(alen+blen-insert));}
304 
305 			final int istart=(insert<=blen ? 0 : insert-blen);
306 			final int jstart=(insert>=blen ? 0 : blen-insert);
307 			final int overlapLength=Tools.min(alen-istart, blen-jstart, insert);
308 
309 //			final float badlimit=Tools.min(altBadlimit, Tools.min(bestRatio, maxRatio)*margin*overlapLength);
310 			final float badlimit=extraMult*(Tools.min(bestRatio, maxRatio)*margin*overlapLength)+1f;
311 			float good=0, bad=0;
312 			int badInt=0;
313 
314 			final int imax=istart+overlapLength;
315 			for(int i=istart, j=jstart; i<imax && bad<=badlimit; i++, j++){
316 				assert(i>=0 && i<alen && j>=0 && j<blen) : "\njstart="+jstart+", j="+j+
317 				", istart="+istart+", i="+i+" \n"+"insert="+insert+", overlap="+overlapLength+", a.length="+a.length()+
318 				", b.length="+b.length()+", bad="+bad+", badlimit="+badlimit+", good="+good;
319 				final byte ca=abases[i], cb=bbases[j];
320 
321 				if(ca==cb){
322 					if(ca!=N){good+=gIncr;}
323 				}else{
324 					bad+=bIncr;
325 					badInt++;
326 				}
327 			}
328 
329 //			if(verbose || true){
330 //				System.err.println("istart="+istart+", jstart="+jstart+", overlapLength="+overlapLength+", overlap="+overlap+", bestOverlap="+bestOverlap);
331 //				System.err.println("overlap="+overlap+", bad="+bad+", good="+good);
332 //				System.err.println("bestGood="+bestGood+", bestBad="+bestBad);
333 //				System.err.println();
334 //			}
335 
336 			if(bad<=badlimit){
337 				if(bad==0 && good>minOverlap0 && good<minOverlap){
338 					rvector[2]=bestBadInt;//=(int)(bestBad+0.95f);
339 					rvector[4]=1;
340 					return -1;
341 				}
342 
343 				float ratio=(bad+offset)/overlapLength;
344 //				System.err.println("*** ratio="+ratio+", bestRatio="+bestRatio);
345 
346 
347 				if(ratio<bestRatio*margin){
348 
349 					ambig=(ratio*margin>=bestRatio || good<minOverlap);
350 
351 					if(ratio<bestRatio){
352 
353 						if(verbose){
354 							System.err.println("A: Set ambig="+ambig+": "+ratio+"*"+margin+"="+(ratio*margin)+">="+bestRatio+" || "+good+"<"+minOverlap);
355 						}
356 						secondBestInsert=bestInsert;
357 						secondBestOverlap=bestOverlap;
358 						secondBestBad=bestBad;
359 						secondBestRatio=bestRatio;
360 						secondBestBadInt=bestBadInt;
361 
362 						bestInsert=insert;
363 						bestOverlap=overlapLength;
364 						bestBad=bad;
365 						bestRatio=ratio;
366 						bestBadInt=badInt;
367 					}
368 					else if(ratio<secondBestRatio){
369 
370 						if(verbose){
371 							System.err.println("B: Set ambig="+ambig+": "+ratio+"*"+margin+"="+(ratio*margin)+">="+bestRatio+" || "+good+"<"+minOverlap);
372 						}
373 						secondBestInsert=insert;
374 						secondBestOverlap=overlapLength;
375 						secondBestBad=bad;
376 						secondBestRatio=ratio;
377 						secondBestBadInt=badInt;
378 					}
379 
380 					if(!TAG_CUSTOM && ((ambig && bestRatio<margin2) || secondBestRatio<minSecondRatio)){
381 						rvector[2]=bestBadInt;//=(int)(bestBad+0.95f);
382 						rvector[4]=1;
383 						return -1;
384 					}
385 				}
386 			}
387 		}
388 
389 		if(verbose){
390 			System.err.println("minSecondRatio="+minSecondRatio+", secondBestRatio="+secondBestRatio+", margin="+margin+", bestRatio="+bestRatio+", bestBad="+bestBad);
391 		}
392 
393 		if(TAG_CUSTOM){
394 			if(secondBestRatio<minSecondRatio){ambig=true;}
395 			StringBuilder sb=new StringBuilder(a.id);
396 
397 			sb.append("_bi=").append(bestInsert);
398 			sb.append("_bo=").append(bestOverlap);
399 			sb.append("_bb=").append(String.format(Locale.ROOT, "%.4f", bestBad));
400 			sb.append("_br=").append(String.format(Locale.ROOT, "%.4f", bestRatio));
401 			sb.append("_bbi=").append(bestBadInt);
402 
403 			sb.append("_sbi=").append(secondBestInsert);
404 			sb.append("_sbo=").append(secondBestOverlap);
405 			sb.append("_sbb=").append(String.format(Locale.ROOT, "%.4f", secondBestBad));
406 			sb.append("_sbr=").append(String.format(Locale.ROOT, "%.4f", secondBestRatio));
407 			sb.append("_sbbi=").append(secondBestBadInt);
408 
409 			a.id=sb.toString();
410 
411 			rvector[2]=bestBadInt;//=(int)(bestBad+0.95f);
412 			rvector[4]=(ambig ? 1 : 0);
413 
414 			return (bestInsert<0 ? -1 : bestInsert);
415 		}
416 
417 		if(!ambig && bestRatio>maxRatio){bestInsert=-1;}
418 
419 		rvector[2]=bestBadInt;//=(int)(bestBad+0.95f);
420 		rvector[4]=(ambig ? 1 : 0);
421 
422 //		System.err.println("***C : "+bestOverlap+", "+ambig+", "+bestBad+", "+(bestOverlap<0 ? -1 : alen+blen-bestOverlap)+", "+
423 //				(bestOverlap<0 ? -1 : (bestOverlap<alen && alen>=blen) ? bestOverlap+alen-blen : bestOverlap)+", "+alen+", "+blen);
424 
425 		return (bestInsert<0 ? -1 : bestInsert);
426 	}
427 
findBestRatio_WithQualities(Read a, Read b, final float[] aprob, final float[] bprob, final int minOverlap0, final int minOverlap, final int minInsert, final float maxRatio, final float offset)428 	protected static final float findBestRatio_WithQualities(Read a, Read b, final float[] aprob, final float[] bprob,
429 			final int minOverlap0, final int minOverlap, final int minInsert, final float maxRatio, final float offset) {
430 		final byte[] abases=a.bases, bbases=b.bases;
431 		final int alen=abases.length, blen=bbases.length;
432 
433 		float bestRatio=maxRatio+0.0001f;
434 //		final float altBadlimit=Tools.max(maxRatio, 0.07f)*2f*alen+1;
435 		final float halfmax=maxRatio*0.5f;
436 
437 
438 		final int largestInsertToTest=(alen+blen-minOverlap); //TODO: test speed with minOverlap0
439 		final int smallestInsertToTest=minInsert;
440 		for(int insert=largestInsertToTest; insert>=smallestInsertToTest; insert--){
441 			if(verbose){System.err.println("a\nTesting read "+a.numericID+", overlap "+insert+", insert "+(alen+blen-insert));}
442 
443 			final int istart=(insert<=blen ? 0 : insert-blen);
444 			final int jstart=(insert>=blen ? 0 : blen-insert);
445 			final int overlapLength=Tools.min(alen-istart, blen-jstart, insert);
446 
447 //			final float badlimit=(Tools.min(altBadlimit, bestRatio*overlapLength));
448 			final float badlimit=bestRatio*overlapLength;
449 			float good=0, bad=0;
450 
451 			final int imax=istart+overlapLength;
452 			for(int i=istart, j=jstart; i<imax && bad<=badlimit; i++, j++){
453 				assert(i>=0 && i<alen && j>=0 && j<blen) : "\njstart="+jstart+", j="+j+
454 				", istart="+istart+", i="+i+" \n"+"insert="+insert+", overlap="+overlapLength+", a.length="+a.length()+
455 				", b.length="+b.length()+", bad="+bad+", badlimit="+badlimit+", good="+good;
456 				final byte ca=abases[i], cb=bbases[j];
457 
458 				final float x=aprob[i]*bprob[j];
459 
460 				if(ca==cb){good+=x;}
461 				else{bad+=x;}
462 			}
463 
464 			if(bad<=badlimit){
465 				if(bad==0 && good>minOverlap0 && good<minOverlap){
466 					return 100f;
467 				}
468 
469 				float ratio=(bad+offset)/overlapLength;
470 
471 				if(ratio<bestRatio){
472 					bestRatio=ratio;
473 					if(good>=minOverlap && ratio<halfmax){return bestRatio;}
474 				}
475 			}
476 		}
477 
478 		return bestRatio;
479 	}
480 
findBestRatio(Read a, Read b, final int minOverlap0, final int minOverlap, final int minInsert, final float maxRatio, final float offset, final float gIncr, final float bIncr)481 	protected static final float findBestRatio(Read a, Read b,
482 			final int minOverlap0, final int minOverlap, final int minInsert, final float maxRatio, final float offset, final float gIncr, final float bIncr) {
483 		final byte[] abases=a.bases, bbases=b.bases;
484 		final int alen=abases.length, blen=bbases.length;
485 
486 		float bestRatio=maxRatio+0.0001f;
487 //		final float altBadlimit=Tools.max(maxRatio, 0.07f)*2f*alen+1;
488 		final float halfmax=maxRatio*0.5f;
489 		final byte N='N';
490 
491 
492 		final int largestInsertToTest=(alen+blen-minOverlap); //TODO: test speed with minOverlap0
493 		final int smallestInsertToTest=minInsert;
494 		for(int insert=largestInsertToTest; insert>=smallestInsertToTest; insert--){
495 //			if(verbose){System.err.println("Testing read "+a.numericID+", overlap "+insert+", insert "+(alen+blen-insert));}
496 
497 			final int istart=(insert<=blen ? 0 : insert-blen);
498 			final int jstart=(insert>=blen ? 0 : blen-insert);
499 			final int overlapLength=Tools.min(alen-istart, blen-jstart, insert);
500 
501 //			final float badlimit=(Tools.min(altBadlimit, bestRatio*overlapLength));
502 			final float badlimit=bestRatio*overlapLength;
503 			float good=0, bad=0;
504 
505 			final int imax=istart+overlapLength;
506 			for(int i=istart, j=jstart; i<imax && bad<=badlimit; i++, j++){
507 				assert(i>=0 && i<alen && j>=0 && j<blen) : "\njstart="+jstart+", j="+j+
508 				", istart="+istart+", i="+i+" \n"+"insert="+insert+", overlap="+overlapLength+", a.length="+a.length()+
509 				", b.length="+b.length()+", bad="+bad+", badlimit="+badlimit+", good="+good;
510 				final byte ca=abases[i], cb=bbases[j];
511 
512 				if(ca==cb){
513 					if(ca!=N){good+=gIncr;}
514 				}else{bad+=bIncr;}
515 			}
516 
517 			if(bad<=badlimit){
518 				if(bad==0 && good>minOverlap0 && good<minOverlap){
519 					return 100f;
520 				}
521 
522 				float ratio=(bad+offset)/overlapLength;
523 
524 				if(ratio<bestRatio){
525 					bestRatio=ratio;
526 					if(good>=minOverlap && ratio<halfmax){return bestRatio;}
527 				}
528 			}
529 		}
530 
531 		return bestRatio;
532 	}
533 
mateByOverlapJava_unrolled(Read a, Read b, float[] aprob, float[] bprob, int[] rvector, int minOverlap0, final int minOverlap, final int minInsert0, int margin, final int maxMismatches0, final int maxMismatches, final int minq)534 	protected static final int mateByOverlapJava_unrolled(Read a, Read b, float[] aprob, float[] bprob, int[] rvector,
535 			int minOverlap0, final int minOverlap, final int minInsert0, int margin, final int maxMismatches0, final int maxMismatches, final int minq) {
536 		assert(rvector!=null);
537 		minOverlap0=Tools.min(Tools.max(1, minOverlap0), minOverlap);
538 		assert(maxMismatches<=maxMismatches0);
539 		margin=Tools.max(margin, 0);
540 		assert(maxMismatches>=margin);
541 
542 		final byte[] abases=a.bases, bbases=b.bases;
543 		final byte[] aqual=a.quality, bqual=b.quality;
544 		final int alen=abases.length, blen=bbases.length;
545 
546 		int bestOverlap=-1;
547 		int bestGood=-1;
548 		int bestBad=maxMismatches0;
549 
550 		boolean ambig=false;
551 		final int maxOverlap=alen+blen-Tools.max(minOverlap, minInsert0);
552 //		assert(false) : minOverlap+", "+maxOverlap;
553 
554 		if(aqual!=null && bqual!=null){
555 			for(int i=0; i<aqual.length; i++){aprob[i]=probCorrect3[aqual[i]];}
556 			for(int i=0; i<bqual.length; i++){bprob[i]=probCorrect3[bqual[i]];}
557 		}else{
558 			for(int i=0; i<alen; i++){aprob[i]=0.98f;}
559 			for(int i=0; i<blen; i++){bprob[i]=0.98f;}
560 		}
561 
562 		final float minprob=probCorrect3[Tools.mid(1, minq, 41)];
563 
564 		for(int overlap=Tools.max(minOverlap0, 0); overlap<maxOverlap; overlap++){
565 			if(verbose){System.err.println("c\nTesting read "+a.numericID+", overlap "+overlap+", insert "+(alen+blen-overlap));}
566 
567 			int good=0, bad=0;
568 
569 			int istart=(overlap<=alen ? 0 : overlap-alen);
570 			int jstart=(overlap<=alen ? alen-overlap : 0);
571 
572 			{
573 				final int iters=Tools.min(overlap-istart, blen-istart, alen-jstart);
574 				final int imax=istart+iters;
575 				final int badlim=bestBad+margin;
576 
577 				for(int i=istart, j=jstart; i<imax && bad<=badlim; i++, j++){
578 					assert(j>=0 && j<=alen && i>=0 && i<=blen) : "\njstart="+jstart+", j="+j+
579 					", istart="+istart+", i="+i+" \n"+"overlap="+overlap+", a.length="+alen+
580 					", b.length="+blen+", bad="+bad+", badlim="+badlim+", good="+good;
581 					final byte ca1=abases[j], cb1=bbases[i];
582 					final float pc=aprob[j]*bprob[i];
583 
584 					if(pc<=minprob){//do nothing
585 					}else if(ca1==cb1){good++;}
586 					else{bad++;}
587 				}
588 
589 				if(verbose){
590 					final int overlapLen=(imax-istart);
591 					System.err.println("overlapLen="+overlapLen+"; coordinates ("+jstart+"-"+(jstart+overlapLen)+"), ("+istart+"-"+imax+")");
592 					System.err.println(new String(abases, jstart, overlapLen));
593 					System.err.println(new String(bbases, istart, overlapLen));
594 				}
595 
596 				if(verbose){System.err.println("overlap="+overlap+", bad="+bad+", good="+good+", badlim="+badlim+", bestOverlap="+
597 						bestOverlap+", bestGood="+bestGood+", bestBad="+bestBad+", ambig="+ambig+", mino="+minOverlap+", mino0="+minOverlap0+
598 						", margin="+margin+", maxMismatches="+maxMismatches);}
599 			}
600 
601 			if(bad*2<good){
602 				if(verbose){System.err.print("a");}
603 				if(good>minOverlap){//Candidate
604 					if(verbose){System.err.print("b");}
605 					if(bad<=bestBad){
606 
607 						if(verbose){System.err.print("c");}
608 						if(bad<bestBad || (bad==bestBad && good>bestGood)){//Current winner
609 							if(verbose){System.err.print("d");}
610 							if(bestBad-bad<margin){ambig=true;}
611 							bestOverlap=overlap;
612 							bestBad=bad;
613 							bestGood=good;
614 						}else if(bad==bestBad){
615 							if(verbose){System.err.print("e");}
616 							ambig=true;
617 						}
618 
619 						if(ambig && bestBad<margin){
620 							if(verbose){System.err.print("f");}
621 							rvector[2]=bestBad;
622 							rvector[4]=(ambig ? 1 : 0);
623 							return -1;
624 						}
625 					}
626 				}else if(bad<margin){
627 					if(verbose){System.err.print("g");}
628 					ambig=true;
629 					rvector[2]=bestBad;
630 					rvector[4]=(ambig ? 1 : 0);
631 					return -1;
632 				}else{
633 					if(verbose){System.err.print("h");}
634 				}
635 			}
636 		}
637 
638 		if(!ambig && bestBad>maxMismatches-margin){bestOverlap=-1;}
639 
640 		rvector[2]=bestBad;
641 		rvector[4]=(ambig ? 1 : 0);
642 
643 		if(verbose){System.err.println("bestOverlap="+
644 				bestOverlap+", bestGood="+bestGood+", bestBad="+bestBad+", ambig="+ambig+", mino="+minOverlap+", mino0="+minOverlap0+
645 				", margin="+margin+", maxMismatches="+maxMismatches);}
646 
647 		return (bestOverlap<0 ? -1 : alen+blen-bestOverlap);
648 	}
649 
650 
651 	/**
652 	 * TODO Use this
653 	 * @param a
654 	 * @param b
655 	 * @param overlap
656 	 * @return
657 	 */
expectedMismatches(Read a, Read b, int overlap)658 	protected static final float expectedMismatches(Read a, Read b, int overlap) {
659 
660 		final byte[] abases=a.bases, bbases=b.bases, aqual=a.quality, bqual=b.quality;
661 		final int alen=abases.length, blen=bbases.length;
662 		final int istart=(overlap<=blen ? 0 : overlap-blen);
663 		final int jstart=(overlap<=alen ? alen-overlap : 0);
664 
665 		float expected=0;
666 		float actual=0;
667 
668 		if(aqual==null || bqual==null){return (overlap+0)/16;}
669 
670 //		System.err.println(istart);
671 //		System.err.println(jstart);
672 //		System.err.println();
673 //
674 //		System.err.println(a.id);
675 //		System.err.println(overlap);
676 //		System.err.println(new String(a.bases));
677 //		System.err.println(new String(b.bases));
678 //		System.err.println();
679 //		for(int i=istart, j=jstart; i<overlap && i<alen && j<blen; i++, j++){
680 //			final byte ca=abases[i];
681 //			System.err.print((char)ca);
682 //		}
683 //		System.err.println();
684 //		for(int i=istart, j=jstart; i<overlap && i<alen && j<blen; i++, j++){
685 //			final byte cb=bbases[j];
686 //			System.err.print((char)cb);
687 //		}
688 //		System.err.println();
689 
690 		for(int i=istart, j=jstart; i<overlap && i<alen && j<blen; i++, j++){
691 			final byte ca=abases[i], cb=bbases[j];
692 			final byte qa=aqual[i], qb=bqual[j];
693 
694 			if(ca=='N' || cb=='N'){
695 				//do nothing
696 			}else{
697 				assert(AminoAcid.isFullyDefined(ca) && AminoAcid.isFullyDefined(cb)) :
698 					"A non-ACGTN base was detected.  Please rerun with the flag 'itn'.\n"+(char)ca+", "+(char)cb+"\n";
699 				float probC=probCorrect4[qa]*probCorrect4[qb];
700 				float probE=1-probC;
701 //				expected+=Tools.max(0.0005f, probE);
702 				expected+=probE;
703 				actual+=(ca==cb ? 0 : probC);
704 //				assert((probE==1) == (ca=='N' || cb=='N')) : ((char)ca)+", "+((char)cb)+", "+qa+", "+qb+", "+probC+", "+probE;
705 			}
706 		}
707 
708 //		System.err.println("*expected:   \t"+expected);
709 //		System.err.println("*Actual:     \t"+actual);
710 //		System.err.println();
711 //
712 //		assert(a.id.equals("insert=142 /1") || a.id.equals("insert=263 /1")) : a.id;
713 
714 		return expected;
715 	}
716 
717 	/** Attempt at quantifying probability of an event like this.
718 	 * TODO: This returns an incorrect answer if reads are unequal lengths. */
probability(Read a, Read b, int insert)719 	protected static final float probability(Read a, Read b, int insert) {
720 		final byte[] abases=a.bases, bbases=b.bases, aqual=a.quality, bqual=b.quality;
721 		final int alen=abases.length, blen=bbases.length;
722 		final int istart=(insert<=blen ? 0 : insert-blen);
723 		final int jstart=(insert>=blen ? 0 : blen-insert);
724 
725 		if(aqual==null || bqual==null){return 1;}
726 
727 		float probActual=1;
728 		float probCommon=1;
729 
730 //		float expected=0;
731 //		float actual=0;
732 //		int measuredOverlap=0;
733 
734 //		assert(false) : "\n"+a.toFastq()+"\n"+b.toFastq()+"\n"+"istart="+istart+", jstart="+jstart+", insert="+insert+", alen="+alen+", blen="+blen;
735 
736 		for(int i=istart, j=jstart; i<insert && i<alen && j<blen; i++, j++){
737 			final byte ca=abases[i], cb=bbases[j];
738 			final byte qa=aqual[i], qb=bqual[j];
739 
740 			if(ca=='N' || cb=='N'){
741 				//do nothing
742 			}else{
743 
744 //				System.err.println(((char)ca)+", "+((char)cb)+", "+i+", "+j);
745 
746 				assert(AminoAcid.isFullyDefined(ca) && AminoAcid.isFullyDefined(cb)) :
747 					"A non-ACGTN base was detected.  Please rerun with the flag 'itn'.\n"+(char)ca+", "+(char)cb+"\n";
748 				float probC=probCorrect4[qa]*probCorrect4[qb];
749 				float probM=probC+(1-probC)*0.25f; //probability of matching
750 				float probE=1-probM;
751 
752 				assert(probM>0) : qa+", "+qb+", "+probC+", "+probM+", "+probE;
753 				assert(probE>0) : qa+", "+qb+", "+probC+", "+probM+", "+probE;
754 
755 				probCommon*=Tools.max(probM, probE);
756 				probActual*=(ca==cb ? probM : probE);
757 
758 //				expected+=probE;
759 //				actual+=(ca==cb ? 0 : probM);
760 //				measuredOverlap++;
761 			}
762 		}
763 
764 //		if(probActual>probCommon){
765 //			System.err.println("expected:   \t"+expected);
766 //			System.err.println("Actual:     \t"+actual);
767 //			System.err.println("probCommon: \t"+probCommon);
768 //			System.err.println("probActual: \t"+probActual);
769 //			System.err.println();
770 //			assert(false) : "\n"+a.toFastq()+"\n"+b.toFastq()+"\n";
771 //		}
772 
773 		assert(probActual<=probCommon);
774 
775 		return (float)Math.sqrt(probActual/probCommon); //sqrt is just so people don't need to type so many zeros.
776 	}
777 
minCoverage(final Read r, final Tadpole tadpole, final int k, int cutoff)778 	protected static int minCoverage(final Read r, final Tadpole tadpole, final int k, int cutoff){
779 		if(k<32){
780 			return minCoverage(r, (Tadpole1)tadpole, k, cutoff);
781 		}else{
782 			return minCoverage(r, (Tadpole2)tadpole, k, cutoff);
783 		}
784 	}
785 
minCoverage(final Read r, final Tadpole1 tadpole, final int k, int cutoff)786 	protected static int minCoverage(final Read r, final Tadpole1 tadpole, final int k, int cutoff){
787 		final byte[] bases=r.bases;
788 		if(bases==null || bases.length<k){return cutoff;}
789 
790 		final int shift=2*k;
791 		final int shift2=shift-2;
792 		final long mask=(shift>63 ? -1L : ~((-1L)<<shift));
793 		long kmer=0, rkmer=0;
794 		int len=0;
795 		int min=cutoff;
796 
797 		for(int i=0; i<bases.length; i++){
798 			final byte b=bases[i];
799 			final long x=AminoAcid.baseToNumber[b];
800 			final long x2=AminoAcid.baseToComplementNumber[b];
801 
802 			//Update kmers
803 			kmer=((kmer<<2)|x)&mask;
804 			rkmer=((rkmer>>>2)|(x2<<shift2))&mask;
805 
806 			//Handle Ns
807 			if(x<0){
808 				len=0;
809 				kmer=rkmer=0;
810 			}else{len++;}
811 
812 			if(len>=k){
813 				int cov=tadpole.getCount(kmer, rkmer);
814 				min=Tools.min(min, cov);
815 				if(min<cutoff){return min;}
816 			}
817 		}
818 
819 		return min;
820 	}
821 
minCoverage(final Read r, final Tadpole2 tadpole, final int k, int cutoff)822 	protected static int minCoverage(final Read r, final Tadpole2 tadpole, final int k, int cutoff){
823 		final byte[] bases=r.bases;
824 		if(bases==null || bases.length<k){return cutoff;}
825 
826 		Kmer kmer=new Kmer(k);
827 		assert(kmer!=null);
828 		int min=cutoff;
829 
830 		for(int i=0; i<bases.length; i++){
831 			final byte b=bases[i];
832 
833 			//Update kmers
834 			kmer.addRight(b);
835 
836 			if(kmer.len>=k){
837 				int cov=tadpole.getCount(kmer);
838 				min=Tools.min(min, cov);
839 				if(min<cutoff){return min;}
840 			}
841 		}
842 
843 		return min;
844 	}
845 
calcMinOverlapByEntropy(byte[] bases, int k, short[] counts, int minscore)846 	protected static int calcMinOverlapByEntropy(byte[] bases, int k, short[] counts, int minscore){
847 		return Tools.max(calcMinOverlapByEntropyTail(bases, k, counts, minscore), calcMinOverlapByEntropyHead(bases, k, counts, minscore));
848 //		return calcMinOverlapByEntropyTail(bases, k, counts, minscore);
849 	}
850 
calcMinOverlapByEntropyTail(byte[] bases, int k, short[] counts, int minscore)851 	protected static int calcMinOverlapByEntropyTail(byte[] bases, int k, short[] counts, int minscore){
852 		final int bits=2*k;
853 		final int mask=~((-1)<<(bits));
854 		int kmer=0, len=0, ones=0, twos=0;
855 
856 		if(counts==null){
857 			counts=localKmerCounts.get();
858 			if(counts==null){
859 				counts=new short[1<<(bits)];
860 				localKmerCounts.set(counts);
861 			}
862 		}
863 
864 		Arrays.fill(counts, (short)0);
865 
866 		for(int i=0, j=bases.length-1; i<bases.length; i++, j--){
867 			if(i<bases.length){
868 				final byte b=bases[j];
869 				if(!AminoAcid.isFullyDefined(b)){
870 					len=0;
871 					kmer=0;
872 				}else{
873 					len++;
874 					final int n=Dedupe.baseToNumber[b];
875 					kmer=((kmer<<2)|n)&mask;
876 
877 					if(len>=k){
878 						counts[kmer]++;
879 						if(counts[kmer]==1){ones++;}
880 						else if(counts[kmer]==2){twos++;}
881 						if(ones*4+twos>=minscore){return i;}
882 					}
883 				}
884 			}
885 		}
886 		return bases.length+1;
887 	}
888 
calcMinOverlapByEntropyHead(byte[] bases, int k, short[] counts, int minscore)889 	protected static int calcMinOverlapByEntropyHead(byte[] bases, int k, short[] counts, int minscore){
890 		final int bits=2*k;
891 		final int mask=~((-1)<<(bits));
892 		int kmer=0, len=0, ones=0, twos=0;
893 
894 		if(counts==null){
895 			counts=localKmerCounts.get();
896 			if(counts==null){
897 				counts=new short[1<<(bits)];
898 				localKmerCounts.set(counts);
899 			}
900 		}
901 
902 		Arrays.fill(counts, (short)0);
903 
904 		for(int i=0; i<bases.length; i++){
905 			if(i<bases.length){
906 				final byte b=bases[i];
907 				if(!AminoAcid.isFullyDefined(b)){
908 					len=0;
909 					kmer=0;
910 				}else{
911 					len++;
912 					final int n=Dedupe.baseToNumber[b];
913 					kmer=((kmer<<2)|n)&mask;
914 
915 					if(len>=k){
916 						counts[kmer]++;
917 						if(counts[kmer]==1){ones++;}
918 						else if(counts[kmer]==2){twos++;}
919 						if(ones*4+twos>=minscore){return i;}
920 					}
921 				}
922 			}
923 		}
924 		return bases.length+1;
925 	}
926 
927 	private static ThreadLocal<short[]> localKmerCounts=new ThreadLocal<short[]>();
928 
929 	private static final int BAD_MULT=6;
930 	private static final int GOOD_MULT_1=8;
931 	private static final int GOOD_MULT_2=400;
932 
933 	private static final boolean TAG_CUSTOM=BBMerge.TAG_CUSTOM;
934 
935 	protected static final boolean verbose=false;
936 
937 	private static final float[] probCorrect3=
938 		{0.000f, 0.251f, 0.369f, 0.499f, 0.602f, 0.684f, 0.749f, 0.800f, 0.842f, 0.874f,
939 		 0.900f, 0.921f, 0.937f, 0.950f, 0.960f, 0.968f, 0.975f, 0.980f, 0.984f, 0.987f,
940 		 0.990f, 0.992f, 0.994f, 0.995f, 0.996f, 0.997f, 0.997f, 0.998f, 0.998f, 0.999f,
941 		 0.999f, 0.999f, 0.999f, 0.999f, 1, 1, 1, 1, 1, 1,
942 		 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
943 		 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
944 		 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
945 
946 	private static final float[] probCorrect4=
947 		{0.0000f, 0.2501f, 0.3690f, 0.4988f, 0.6019f, 0.6838f, 0.7488f, 0.8005f, 0.8415f, 0.8741f,
948 		 0.9000f, 0.9206f, 0.9369f, 0.9499f, 0.9602f, 0.9684f, 0.9749f, 0.9800f, 0.9842f, 0.9874f,
949 		 0.9900f, 0.9921f, 0.9937f, 0.9950f, 0.9960f, 0.9968f, 0.9975f, 0.9980f, 0.9984f, 0.9987f,
950 		 0.9990f, 0.9992f, 0.9994f, 0.9995f, 0.9996f, 0.9997f, 0.9997f, 0.9998f, 0.9998f, 0.9999f,
951 		 0.9999f, 0.9999f, 0.9999f, 0.9999f, 0.9999f, 0.9999f, 0.9999f, 0.9999f, 0.9999f, 0.9999f,
952 		 0.9999f, 0.9999f, 0.9999f, 0.9999f, 0.9999f, 0.9999f, 0.9999f, 0.9999f, 0.9999f, 0.9999f};
953 
954 	private static final float[] probCorrect5=
955 		{0.20000f, 0.20567f, 0.36904f, 0.49881f, 0.60189f, 0.68377f, 0.74881f, 0.80047f, 0.84151f, 0.87411f,
956 		 0.90000f, 0.92057f, 0.93690f, 0.94988f, 0.96019f, 0.96838f, 0.97488f, 0.98005f, 0.98415f, 0.98741f,
957 		 0.99000f, 0.99206f, 0.99369f, 0.99499f, 0.99602f, 0.99684f, 0.99749f, 0.99800f, 0.99842f, 0.99874f,
958 		 0.99900f, 0.99921f, 0.99937f, 0.99950f, 0.99960f, 0.99968f, 0.99975f, 0.99980f, 0.99984f, 0.99987f,
959 		 0.99990f, 0.99992f, 0.99994f, 0.99995f, 0.99996f, 0.99997f, 0.99997f, 0.99998f, 0.99998f, 0.99999f,
960 		 0.99999f, 0.99999f, 0.99999f, 0.99999f, 0.99999f, 0.99999f, 0.99999f, 0.99999f, 0.99999f, 0.99999f};
961 
962 }
963