1 package var2;
2 
3 import java.io.Serializable;
4 import java.util.ArrayList;
5 import java.util.Arrays;
6 import java.util.Locale;
7 import java.util.Random;
8 
9 import align2.QualityTools;
10 import dna.AminoAcid;
11 import fileIO.FileFormat;
12 import shared.KillSwitch;
13 import shared.Parse;
14 import shared.Parser;
15 import shared.Shared;
16 import shared.Timer;
17 import shared.Tools;
18 import stream.Read;
19 import stream.SamLine;
20 import structures.ByteBuilder;
21 
22 /**
23  * Tracks data for a variation.
24  * Uses half-open coordinates.
25  * @author Brian Bushnell
26  * @date November 4, 2016
27  *
28  */
29 public class Var implements Comparable<Var>, Serializable, Cloneable {
30 
31 	/**
32 	 *
33 	 */
34 	private static final long serialVersionUID = 3328626403863586829L;
35 
36 
main(String[] args)37 	public static void main(String[] args){
38 		if(args.length>1){
39 			for(int i=1; i<args.length; i++){
40 				String arg=args[i];
41 				String[] split=arg.split("=");
42 				String a=split[0].toLowerCase();
43 				String b=split.length>1 ? split[1] : null;
44 				Parser.parseCommonStatic(arg, a, b);
45 			}
46 		}
47 		Timer t=new Timer();
48 		VarMap vmap;
49 		FileFormat ff=FileFormat.testInput(args[0], FileFormat.VAR, null, true, true);
50 		if(ff.vcf()){
51 			ScafMap smap=ScafMap.loadVcfHeader(ff);
52 			vmap=VcfLoader.loadVcfFile(args[0], smap, false, false);
53 		}else{
54 			vmap=VcfLoader.loadVarFile(args[0], null);
55 		}
56 		t.stop("Loaded "+vmap.size()+" variants.\nTime: \t");
57 	}
58 
59 	/*--------------------------------------------------------------*/
60 	/*----------------         Constructors         ----------------*/
61 	/*--------------------------------------------------------------*/
62 
63 	@Override
clone()64 	public Var clone(){
65 		try {
66 			return (Var)super.clone();
67 		} catch (CloneNotSupportedException e) {
68 			// TODO Auto-generated catch block
69 			e.printStackTrace();
70 			throw new RuntimeException();
71 		}
72 	}
73 
Var(int scafnum_, int start_, int stop_, int allele_, int type_)74 	public Var(int scafnum_, int start_, int stop_, int allele_, int type_){
75 		this(scafnum_, start_, stop_, AL_MAP[allele_], type_);
76 	}
77 
Var(Var v)78 	public Var(Var v) {
79 		this(v.scafnum, v.start, v.stop, v.allele, v.type);
80 	}
81 
Var(int scafnum_, int start_, int stop_, byte[] allele_, int type_)82 	public Var(int scafnum_, int start_, int stop_, byte[] allele_, int type_){
83 		scafnum=scafnum_;
84 		start=start_;
85 		stop=stop_;
86 		allele=allele_;
87 		type=type_;
88 		hashcode=hash();
89 //		stamp=stamper.getAndIncrement();
90 		assert(allele.length>1 || allele==AL_0 ||
91 				allele==AL_A || allele==AL_C || allele==AL_G || allele==AL_T || allele==AL_N) : new String(new String(allele_)+", "+allele_.length);
92 		assert(start<=stop) : "\n"+Var.toBasicHeader()+"\n"+this+"\n";
93 //		if(type()==SUB && allele.length==1){ //TODO: 123 - mainly for testing
94 //			final byte call=allele[0];
95 //			final Scaffold scaf=ScafMap.defaultScafMap.getScaffold(scafnum);
96 //			final byte ref=scaf.bases[start];
97 ////			System.err.println((char)call+"="+(char)ref+" at scaf "+scafnum+" pos "+start);
98 //			assert(ref!=call) : (char)call+"="+(char)ref+" at scaf "+scafnum+" pos "+start;
99 //		}
100 //		assert(false) : this.alleleCount();
101 	}
102 
Var(final byte[] line, final byte delimiter)103 	public Var(final byte[] line, final byte delimiter){
104 		int a=0, b=0;
105 
106 		while(b<line.length && line[b]!=delimiter){b++;}
107 		assert(b>a) : "Missing field 0: "+new String(line);
108 		scafnum=Parse.parseInt(line, a, b);
109 		b++;
110 		a=b;
111 
112 		while(b<line.length && line[b]!=delimiter){b++;}
113 		assert(b>a) : "Missing field 1: "+new String(line);
114 		start=Parse.parseInt(line, a, b);
115 		b++;
116 		a=b;
117 
118 		while(b<line.length && line[b]!=delimiter){b++;}
119 		assert(b>a) : "Missing field 2: "+new String(line);
120 		stop=Parse.parseInt(line, a, b);
121 		b++;
122 		a=b;
123 
124 		while(b<line.length && line[b]!=delimiter){b++;}
125 		assert(b>a) : "Missing field 3: "+new String(line);
126 		type=typeInitialArray[line[a]];
127 		b++;
128 		a=b;
129 
130 		while(b<line.length && line[b]!=delimiter){b++;}
131 		assert(b>=a) : "Missing field 4: "+new String(line);
132 		if(b==a){allele=AL_0;}
133 		else if(b==a+1){allele=AL_MAP[line[a]];}
134 		else{allele=Arrays.copyOfRange(line, a, b);}
135 		b++;
136 		a=b;
137 
138 		while(b<line.length && line[b]!=delimiter){b++;}
139 		assert(b>a) : "Missing field 5: "+new String(line);
140 		r1plus=Parse.parseInt(line, a, b);
141 		b++;
142 		a=b;
143 
144 		while(b<line.length && line[b]!=delimiter){b++;}
145 		assert(b>a) : "Missing field 6: "+new String(line);
146 		r1minus=Parse.parseInt(line, a, b);
147 		b++;
148 		a=b;
149 
150 		while(b<line.length && line[b]!=delimiter){b++;}
151 		assert(b>a) : "Missing field 7: "+new String(line);
152 		r2plus=Parse.parseInt(line, a, b);
153 		b++;
154 		a=b;
155 
156 		while(b<line.length && line[b]!=delimiter){b++;}
157 		assert(b>a) : "Missing field 8: "+new String(line);
158 		r2minus=Parse.parseInt(line, a, b);
159 		b++;
160 		a=b;
161 
162 		while(b<line.length && line[b]!=delimiter){b++;}
163 		assert(b>a) : "Missing field 9: "+new String(line);
164 		properPairCount=Parse.parseInt(line, a, b);
165 		b++;
166 		a=b;
167 
168 		while(b<line.length && line[b]!=delimiter){b++;}
169 		assert(b>a) : "Missing field 10: "+new String(line);
170 		lengthSum=Parse.parseLong(line, a, b);
171 		b++;
172 		a=b;
173 
174 		while(b<line.length && line[b]!=delimiter){b++;}
175 		assert(b>a) : "Missing field 11: "+new String(line);
176 		mapQSum=Parse.parseLong(line, a, b);
177 		b++;
178 		a=b;
179 
180 		while(b<line.length && line[b]!=delimiter){b++;}
181 		assert(b>a) : "Missing field 12: "+new String(line);
182 		mapQMax=Parse.parseInt(line, a, b);
183 		b++;
184 		a=b;
185 
186 		while(b<line.length && line[b]!=delimiter){b++;}
187 		assert(b>a) : "Missing field 13: "+new String(line);
188 		baseQSum=Parse.parseLong(line, a, b);
189 		b++;
190 		a=b;
191 
192 		while(b<line.length && line[b]!=delimiter){b++;}
193 		assert(b>a) : "Missing field 14: "+new String(line);
194 		baseQMax=Parse.parseInt(line, a, b);
195 		b++;
196 		a=b;
197 
198 		while(b<line.length && line[b]!=delimiter){b++;}
199 		assert(b>a) : "Missing field 15: "+new String(line);
200 		endDistSum=Parse.parseLong(line, a, b);
201 		b++;
202 		a=b;
203 
204 		while(b<line.length && line[b]!=delimiter){b++;}
205 		assert(b>a) : "Missing field 16: "+new String(line);
206 		endDistMax=Parse.parseInt(line, a, b);
207 		b++;
208 		a=b;
209 
210 		while(b<line.length && line[b]!=delimiter){b++;}
211 		assert(b>a) : "Missing field 17: "+new String(line);
212 		idSum=Parse.parseLong(line, a, b);
213 		b++;
214 		a=b;
215 
216 		while(b<line.length && line[b]!=delimiter){b++;}
217 		assert(b>a) : "Missing field 18: "+new String(line);
218 		idMax=Parse.parseInt(line, a, b);
219 		b++;
220 		a=b;
221 
222 		while(b<line.length && line[b]!=delimiter){b++;}
223 		assert(b>a) : "Missing field 19: "+new String(line);
224 		coverage=Parse.parseInt(line, a, b);
225 		b++;
226 		a=b;
227 
228 		while(b<line.length && line[b]!=delimiter){b++;}
229 		assert(b>a) : "Missing field 20: "+new String(line);
230 		minusCoverage=Parse.parseInt(line, a, b);
231 		b++;
232 		a=b;
233 
234 		while(b<line.length && line[b]!=delimiter){b++;}
235 		assert(b>a) : "Missing field 21: "+new String(line);
236 		nearbyVarCount=Parse.parseInt(line, a, b);
237 		b++;
238 		a=b;
239 
240 		while(b<line.length && line[b]!=delimiter){b++;}
241 		assert(b>a) : "Missing field 22: "+new String(line);
242 		flagged=Parse.parseInt(line, a, b)>0;
243 		b++;
244 		a=b;
245 
246 		while(b<line.length && line[b]!=delimiter){b++;}
247 //		assert(b>a) : "Missing field 21: "+new String(line);
248 //		int contigEndDist=Parse.parseInt(line, a, b); //Unused
249 		b++;
250 		a=b;
251 
252 		while(b<line.length && line[b]!=delimiter){b++;}
253 		if(b>a){
254 //			phredScore=Parse.parseFloat(line, a, b);
255 			b++;
256 			a=b;
257 		}
258 
259 		hashcode=hash();
260 		assert(allele.length>1 || allele==AL_0 ||
261 				allele==AL_A || allele==AL_C || allele==AL_G || allele==AL_T || allele==AL_N);
262 		assert(start<=stop) : this.toString();
263 		assert(type>=LJUNCT || type==type_old()) : type+", "+type_old()+", "+this.toString();
264 	}
265 
266 	//#CHROM POS    ID        REF  ALT     QUAL
fromVCF(byte[] line, ScafMap scafMap, boolean parseCoverage, boolean parseExtended)267 	public static Var fromVCF(byte[] line, ScafMap scafMap, boolean parseCoverage, boolean parseExtended) {
268 		int a=0, b=0;
269 
270 		//CHROM
271 		while(b<line.length && line[b]!='\t'){b++;}
272 		assert(b>a) : "Missing field 0: "+new String(line);
273 		String scaf=new String(line, a, b-a);
274 		b++;
275 		a=b;
276 
277 		//POS
278 		while(b<line.length && line[b]!='\t'){b++;}
279 		assert(b>a) : "Missing field 1: "+new String(line);
280 		int pos=Parse.parseInt(line, a, b);
281 		b++;
282 		a=b;
283 
284 		//ID
285 		while(b<line.length && line[b]!='\t'){b++;}
286 		assert(b>a) : "Missing field 2: "+new String(line);
287 		//String id=new String(line, a, b-a);
288 		b++;
289 		a=b;
290 
291 		//REF
292 		while(b<line.length && line[b]!='\t'){b++;}
293 		assert(b>a) : "Missing field 3: "+new String(line);
294 		//byte[] ref=Arrays.copyOf(line, a, b);
295 		int reflen=line[a]=='.' ? 0 : b-a;
296 		b++;
297 		a=b;
298 
299 		//ALT
300 		while(b<line.length && line[b]!='\t'){b++;}
301 		assert(b>a) : "Missing field 4: "+new String(line);
302 		byte[] alt;
303 		if(b<=a+1){alt=AL_MAP[line[a]];}
304 		else{alt=Arrays.copyOfRange(line, a, b);}
305 		b++;
306 		a=b;
307 
308 		//QUAL, FILTER, INFO
309 
310 		int infoStart=b;
311 
312 //		while(b<line.length && line[b]!='\t'){b++;}
313 //		assert(b>a) : "Missing field 5: "+new String(line);
314 //		int qual=Parse.parseInt(line, a, b);
315 //		b++;
316 //		a=b;
317 
318 		final int start;
319 		if(alt.length!=reflen && alt.length>0){//For indels, which have an extra base
320 			alt=Arrays.copyOfRange(alt, 1, alt.length);
321 			start=pos;
322 			if(alt.length==0){alt=AL_0;}
323 			else if(alt.length==1 && AL_MAP[alt[0]]!=null){alt=AL_MAP[alt[0]];}
324 			if(reflen==1){reflen--;}//insertion
325 		}else{
326 			start=pos-1;
327 		}
328 		final int readlen=(alt==null || alt.length==0 || alt[0]=='.' ? 0 : alt.length);
329 		final int stop=start+reflen;
330 		assert(scaf!=null);
331 		assert(scafMap!=null);
332 		final int scafNum=scafMap.getNumber(scaf);
333 		assert(scafNum>=0) : scaf+"\n"+scafMap.keySet()+"\n"+scafMap.altKeySet()+"\n";
334 //		final Scaffold scaffold=scafMap.getScaffold(scafNum);
335 
336 		int type=-1;
337 		if(parseExtended){
338 			type=Tools.max(type, parseVcfIntDelimited(line, "TYP=", infoStart));
339 			if(type<0){type=typeStartStop(start, stop, alt);}
340 		}else{
341 			type=typeStartStop(start, stop, alt);
342 		}
343 		Var v=new Var(scafNum, start, stop, alt, type);
344 //		System.err.println(new String(line)+"\n"+v.toString()+"\nstart="+start+", stop="+stop+", type="+type+"\n");
345 //		assert(false);
346 
347 		if(parseCoverage){
348 			infoStart=Tools.indexOfDelimited(line, "R1P=", infoStart, (byte)';');
349 
350 			//R1P=2;R1M=0;R2P=0;R2M=0;
351 			int r1p=Tools.max(0, parseVcfIntDelimited(line, "R1P=", infoStart));
352 			int r1m=Tools.max(0, parseVcfIntDelimited(line, "R1M=", infoStart));
353 			int r2p=Tools.max(0, parseVcfIntDelimited(line, "R2P=", infoStart));
354 			int r2m=Tools.max(0, parseVcfIntDelimited(line, "R2M=", infoStart));
355 
356 			//AD=2;DP=24;MCOV=0;PPC=0;
357 			int cov=parseVcfIntDelimited(line, "DP=", infoStart);
358 			assert(cov>0) : new String(line, infoStart, line.length-infoStart);
359 			int mcov=parseVcfIntDelimited(line, "MCOV=", infoStart);
360 
361 			v.coverage=cov;
362 			v.minusCoverage=mcov;
363 			v.r1plus=r1p;
364 			v.r1minus=r1m;
365 			v.r2plus=r2p;
366 			v.r2minus=r2m;
367 		}
368 
369 		if(parseExtended){
370 			infoStart=Tools.indexOfDelimited(line, "PPC=", infoStart, (byte)';');
371 
372 			//Some extended fields
373 			//AD=2;DP=24;MCOV=0;PPC=0;
374 			int pc=Tools.max(0, parseVcfIntDelimited(line, "PPC=", infoStart));
375 
376 			//AF=0.0833;RAF=0.0833;LS=280;
377 			double raf=parseVcfDoubleDelimited(line, "RAF=", infoStart);
378 			long ls=Tools.max(0, parseVcfLongDelimited(line, "LS=", infoStart));
379 
380 			//MQS=86;MQM=43;BQS=64;BQM=32;
381 			long mqs=Tools.max(0, parseVcfLongDelimited(line, "MQS=", infoStart));
382 			int mqm=Tools.max(0, parseVcfIntDelimited(line, "MQM=", infoStart));
383 			long bqs=Tools.max(0, parseVcfLongDelimited(line, "BQS=", infoStart));
384 			int bqm=Tools.max(0, parseVcfIntDelimited(line, "BQM=", infoStart));
385 
386 			//EDS=18;EDM=9;IDS=1984;IDM=992;
387 			long eds=Tools.max(0, parseVcfLongDelimited(line, "EDS=", infoStart));
388 			int edm=Tools.max(0, parseVcfIntDelimited(line, "EDM=", infoStart));
389 			long ids=Tools.max(0, parseVcfLongDelimited(line, "IDS=", infoStart));
390 			int idm=Tools.max(0, parseVcfIntDelimited(line, "IDM=", infoStart));
391 
392 			//NVC=0;FLG=0;CED=601;HMP=0;SB=0.1809;DP4=37,28,15,97
393 			int nvc=Tools.max(0, parseVcfIntDelimited(line, "NVC=", infoStart));
394 			int flg=Tools.max(0, parseVcfIntDelimited(line, "FLG=", infoStart));
395 //			int ced=Tools.max(0, parseVcfIntDelimited(line, "CED=", infoStart));
396 
397 			v.properPairCount=pc;
398 			v.revisedAlleleFraction=raf;
399 			v.lengthSum=ls;
400 			v.mapQSum=mqs;
401 			v.mapQMax=mqm;
402 			v.baseQSum=bqs;
403 			v.baseQMax=bqm;
404 			v.endDistSum=eds;
405 			v.endDistMax=edm;
406 			v.idSum=ids;
407 			v.idMax=idm;
408 			v.nearbyVarCount=nvc;
409 			v.flagged=(flg>0);
410 		}
411 
412 		return v;
413 	}
414 
415 	//Parses INFO field
parseVcfIntDelimited(byte[] line, String query, int start)416 	private static int parseVcfIntDelimited(byte[] line, String query, int start){
417 		return (int)Tools.min(Integer.MAX_VALUE, parseVcfLongDelimited(line, query, start));
418 	}
419 
420 	//Parses INFO field
parseVcfLongDelimited(byte[] line, String query, final int start)421 	private static long parseVcfLongDelimited(byte[] line, String query, final int start){
422 		int loc=Tools.indexOfDelimited(line, query, start, (byte)';');
423 //		System.err.println(loc+", "+(line.length)+", "+(line.length-loc));
424 //		System.err.println(loc+", "+new String(line, loc, line.length-loc));
425 //		if(true){KillSwitch.kill();}
426 		if(loc<0){return -1;}
427 		long current=0;
428 		long mult=1;
429 		if(loc>0){
430 			if(line[loc+query.length()]=='-'){mult=-1; loc++;}
431 			for(int i=loc+query.length(); i<line.length; i++){
432 				final byte x=line[i];
433 				if(Tools.isDigit(x)){
434 					current=current*10+(x-'0');
435 				}else{
436 					assert(x==tab || x==colon) : x+", "+query+", "+new String(line, loc, i-loc+1);
437 					break;
438 				}
439 			}
440 		}
441 		return mult*current;
442 	}
443 
444 	//Parses INFO field
parseVcfDoubleDelimited(byte[] line, String query, int start)445 	private static double parseVcfDoubleDelimited(byte[] line, String query, int start){
446 		int loc=Tools.indexOfDelimited(line, query, start, (byte)';');
447 		if(loc<0){return -1;}
448 		if(loc>0){
449 			loc=loc+query.length();
450 			int loc2=loc+1;
451 			while(loc2<line.length){
452 				byte b=line[loc2];
453 				if(b==tab || b==colon){break;}
454 				loc2++;
455 			}
456 			return Parse.parseDouble(line, loc, loc2);
457 		}
458 		return -1;
459 	}
460 
461 	/*--------------------------------------------------------------*/
462 	/*----------------           Mutators           ----------------*/
463 	/*--------------------------------------------------------------*/
464 
clear()465 	public Var clear() {
466 		coverage=-1;
467 		minusCoverage=-1;
468 
469 		r1plus=0;
470 		r1minus=0;
471 		r2plus=0;
472 		r2minus=0;
473 		properPairCount=0;
474 
475 		mapQSum=0;
476 		mapQMax=0;
477 
478 		baseQSum=0;
479 		baseQMax=0;
480 
481 		endDistSum=0;
482 		endDistMax=0;
483 
484 		idSum=0;
485 		idMax=0;
486 
487 		lengthSum=0;
488 
489 		revisedAlleleFraction=-1;
490 		nearbyVarCount=-1;
491 		forced=false;
492 		flagged=false;
493 		return this;
494 	}
495 
496 	//Only called by MergeSamples from VCFLine arrays.
addCoverage(Var b)497 	public void addCoverage(Var b){
498 		assert(this.equals(b));
499 		coverage+=b.coverage;
500 		minusCoverage+=b.minusCoverage;
501 	}
502 
503 	//Called in various places such as when processing each read
add(Var b)504 	public void add(Var b){
505 		final int oldReads=alleleCount();
506 
507 //		assert(oldReads>0) : this;
508 		assert(oldReads==0 || baseQSum/oldReads<=60) : this;
509 
510 		assert(this.equals(b));
511 		r1plus+=b.r1plus;
512 		r1minus+=b.r1minus;
513 		r2plus+=b.r2plus;
514 		r2minus+=b.r2minus;
515 		properPairCount+=b.properPairCount;
516 		lengthSum+=b.lengthSum;
517 
518 		mapQSum+=b.mapQSum;
519 		mapQMax=Tools.max(mapQMax, b.mapQMax);
520 		baseQSum+=b.baseQSum;
521 		baseQMax=Tools.max(baseQMax, b.baseQMax);
522 
523 		endDistSum+=b.endDistSum;
524 		endDistMax=Tools.max(endDistMax, b.endDistMax);
525 
526 		idSum+=b.idSum;
527 		idMax=Tools.max(idMax, b.idMax);
528 
529 //		assert(count()>0 && count()>oldReads) : "\n"+this+"\n"+b;
530 		assert(alleleCount()>=oldReads) : "\n"+this+"\n"+b;
531 		assert(alleleCount()==oldReads+b.alleleCount()) : "\n"+this+"\n"+b;
532 		assert(alleleCount()==0 || baseQSum/alleleCount()<=60) : "\n"+this+"\n"+b;
533 	}
534 
add(Read r)535 	public void add(Read r){
536 		final SamLine sl=r.samline;
537 		final int bstart=calcBstart(r, sl);
538 		final int bstop=calcBstop(bstart, r);
539 		add(r, bstart, bstop);
540 	}
541 
add(Read r, final int bstart, final int bstop)542 	public void add(Read r, final int bstart, final int bstop){
543 
544 		final int oldReads=alleleCount();
545 		final SamLine sl=r.samline;
546 
547 		if(sl.strand()==0){
548 			if(sl.pairnum()==0){
549 				r1plus++;
550 			}else{
551 				r2plus++;
552 			}
553 		}else{
554 			if(sl.pairnum()==0){
555 				r1minus++;
556 			}else{
557 				r2minus++;
558 			}
559 		}
560 
561 		lengthSum+=r.length();
562 		properPairCount+=(sl.properPair() ? 1 : 0);
563 		mapQSum+=sl.mapq;
564 		mapQMax=Tools.max(mapQMax, sl.mapq);
565 
566 		int baseQ=calcBaseQ(bstart, bstop, r, sl);
567 		baseQSum+=baseQ;
568 		baseQMax=Tools.max(baseQMax, baseQ);
569 
570 		int endDist=calcEndDist(bstart, bstop, r);
571 		endDistSum+=endDist;
572 		endDistMax=Tools.max(endDistMax, endDist);
573 
574 		int id=(int)(1000*Read.identitySkewed(r.match, false, false, false, true));
575 		idSum+=id;
576 		idMax=Tools.max(idMax, id);
577 
578 		assert(alleleCount()>0) : this;
579 		assert(alleleCount()==oldReads+1) : this;
580 		assert(baseQSum/alleleCount()<=60) : this;
581 	}
582 
583 	/*--------------------------------------------------------------*/
584 	/*----------------        Helper Methods        ----------------*/
585 	/*--------------------------------------------------------------*/
586 
toVars(Read r, SamLine sl, boolean callNs, ScafMap scafMap)587 	public static ArrayList<Var> toVars(Read r, SamLine sl, boolean callNs, ScafMap scafMap){
588 //		if(!r.containsVariants()){return null;}
589 		final int scafnum=scafMap.getNumber(sl.rnameS());
590 		return toVars(r, sl, callNs, scafnum);
591 	}
592 
593 	/**
594 	 * @TODO This crashes on indels in the last position in the match string.
595 	 * @param r
596 	 * @param sl
597 	 * @param callNs
598 	 * @param scafnum
599 	 * @return A list of variants
600 	 */
toVars(Read r, SamLine sl, boolean callNs, final int scafnum)601 	public static ArrayList<Var> toVars(Read r, SamLine sl, boolean callNs, final int scafnum){
602 		final boolean hasV=r.containsVariants();
603 		final boolean callSID=(CALL_DEL || CALL_INS || CALL_SUB) && hasV;
604 		final boolean callJ=(CALL_JUNCTION) && (hasV || r.containsClipping());
605 		if(!callSID && !callJ){return null;}
606 
607 		r.toLongMatchString(false);
608 		if(sl.strand()==1 && !r.swapped()){
609 			r.reverseComplement();
610 			r.setSwapped(true);
611 		}
612 
613 		ArrayList<Var> sidList=null, jList=null;
614 		if(callSID){sidList=toSubsAndIndels(r, sl, callNs, scafnum);}
615 		if(callJ){jList=toJunctions(r, sl, scafnum, hasV, 8);}
616 
617 		if(sidList!=null){
618 			if(jList!=null){sidList.addAll(jList);}
619 			return sidList;
620 		}else{
621 			return jList;
622 		}
623 
624 
625 		//Note: Did not un-rcomp the read
626 	}
627 
628 	/**
629 	 * @TODO This crashes on indels in the last position in the match string.
630 	 * @param r
631 	 * @param sl
632 	 * @param callNs
633 	 * @param scafnum
634 	 * @return A list of variants
635 	 */
toSubsAndIndels(Read r, SamLine sl, boolean callNs, final int scafnum)636 	private static ArrayList<Var> toSubsAndIndels(Read r, SamLine sl, boolean callNs, final int scafnum){
637 		final byte[] match=r.match;
638 		final byte[] bases=r.bases;
639 		final int rpos0=sl.pos-1;
640 		ArrayList<Var> list=new ArrayList<Var>();
641 
642 		int bstart=-1, bstop=-1;
643 		int rstart=-1, rstop=-1;
644 		int mode=-1;
645 
646 		int mpos=0, bpos=0, rpos=rpos0;
647 		for(; mpos<match.length; mpos++){
648 			byte m=match[mpos];
649 
650 			if(m!=mode){
651 				if(mode=='D'){
652 					bstop=bpos;
653 					rstop=rpos;
654 //					assert(false) : (char)m+", "+(char)mode+", "+rstart+", "+bstart;
655 					if(CALL_DEL){
656 						Var v=new Var(scafnum, rstart, rstop, 0, DEL);
657 						v.add(r, bstart, bstop);
658 						list.add(v);
659 					}
660 					bstart=bstop=rstart=rstop=-1;
661 				}else if(mode=='I'){
662 					bstop=bpos;
663 					rstop=rpos;
664 					int blen=bstop-bstart;
665 					if(CALL_INS){
666 						Var v;
667 						if(blen==1){
668 							v=new Var(scafnum, rstart, rstop, bases[bstart], INS);
669 						}else{
670 							v=new Var(scafnum, rstart, rstop, Arrays.copyOfRange(bases, bstart, bstop), INS);
671 						}
672 						v.add(r, bstart, bstop);
673 						list.add(v);
674 					}
675 					bstart=bstop=rstart=rstop=-1;
676 				}
677 			}
678 
679 			if(m=='C'){
680 				bpos++;
681 			}else if(m=='m' || m=='S' || m=='N'){
682 				if(m=='S' || (m=='N' && callNs)){
683 //					System.err.println(sl.toString()+"\n"+ScafMap.defaultScafMap.getScaffold(scafnum).getSequence(sl)); //123
684 //					System.err.println(ScafMap.defaultScafMap.getScaffold(scafnum));
685 					if(CALL_SUB){
686 						Var v=new Var(scafnum, rpos, rpos+1, bases[bpos], SUB);
687 						//					System.err.println(v.toString());
688 						v.add(r, bpos, bpos+1);
689 						list.add(v);
690 
691 						if(TEST_REF_VARIANTS && v.type()==SUB && v.allele.length==1){ //TODO: 123 - mainly for testing
692 							final byte call=v.allele[0];
693 							final Scaffold scaf=ScafMap.defaultScafMap().getScaffold(scafnum);
694 							final byte ref=scaf.bases[v.start];
695 							//						System.err.println((char)call+"="+(char)ref+" at scaf "+scafnum+" pos "+start);
696 							assert(ref!=call) : (char)call+"="+(char)ref+" at scaf "+scafnum+" pos "+v.start+"\n"
697 							+sl+"\n"+ScafMap.defaultScafMap().getScaffold(scafnum).getSequence(sl)+"\n";
698 						}
699 					}
700 
701 				}
702 				bpos++;
703 				rpos++;
704 			}else if(m=='D'){
705 				if(mode!=m){
706 					rstart=rpos;
707 					bstart=bpos;
708 				}
709 				rpos++;
710 			}else if(m=='I'){
711 				if(mode!=m){
712 					rstart=rpos;
713 					bstart=bpos;
714 				}
715 				bpos++;
716 			}else{
717 				assert(false) : "Unhandled symbol "+(char)m;
718 			}
719 			mode=m;
720 		}
721 
722 		if(mode=='D'){
723 			bstop=bpos;
724 			rstop=rpos;
725 			if(CALL_DEL){
726 				Var v=new Var(scafnum, rstart, rstop, 0, DEL);
727 				v.add(r, bstart, bstop);
728 				list.add(v);
729 			}
730 			bstart=bstop=rstart=rstop=-1;
731 		}else if(mode=='I'){
732 			bstop=bpos;
733 			rstop=rpos-1;
734 			int blen=bstop-bstart;
735 			if(CALL_INS){
736 				Var v;
737 				assert(rstart<=rstop) : "\n"+rstart+", "+rstop+", "+rpos+
738 				"\n"+bstart+", "+bstop+", "+bpos+
739 				"\n"+r+"\n"+sl;
740 				if(blen==1){
741 					v=new Var(scafnum, rstart, rstop, bases[bstart], INS);
742 				}else{
743 					v=new Var(scafnum, rstart, rstop, Arrays.copyOfRange(bases, bstart, bstop), INS);
744 				}
745 				v.add(r, bstart, bstop);
746 				list.add(v);
747 			}
748 			bstart=bstop=rstart=rstop=-1;
749 		}
750 
751 		return list;
752 	}
753 
754 	/** Short or long match */
countLeftClip(byte[] match)755 	private static int countLeftClip(byte[] match){
756 
757 		byte mode=match[0];
758 		if(mode!='C'){return 0;}
759 		int current=0;
760 		boolean hasDigit=false;
761 
762 		for(int mpos=0; mpos<match.length; mpos++){
763 			byte c=match[mpos];
764 
765 			if(mode==c){
766 				current++;
767 			}else if(Tools.isDigit(c)){
768 				current=(hasDigit ? current : 0)*10+c-'0';
769 				hasDigit=true;
770 			}else{
771 				break;
772 			}
773 		}
774 		return current;
775 	}
776 
777 	/** Short or long match */
countRightClip(byte[] match)778 	private static int countRightClip(byte[] match){
779 		int mpos=match.length-1;
780 		boolean hasDigit=false;
781 		while(mpos>=0 && Tools.isDigit(match[mpos])){
782 			hasDigit=true;
783 			mpos--;
784 		}
785 		if(!hasDigit){return countRightClipLong(match);}//Necessary line
786 
787 		byte mode=match[mpos];
788 		if(mode!='C'){return 0;}
789 		int current=0;
790 
791 		for(; mpos<match.length; mpos++){
792 			byte c=match[mpos];
793 
794 			if(mode==c){
795 				current++;
796 			}else if(Tools.isDigit(c)){
797 				current=(hasDigit ? current : 0)*10+c-'0';
798 				hasDigit=true;
799 			}else{
800 				break;
801 			}
802 		}
803 		return current;
804 	}
805 
806 	/** Long-match */
countLeftClipLong(byte[] longmatch)807 	private static int countLeftClipLong(byte[] longmatch){
808 		for(int i=0; i<longmatch.length; i++){
809 			if(longmatch[i]!='C'){return i;}
810 		}
811 		return longmatch.length;
812 	}
813 
814 	/** Long-match */
countRightClipLong(byte[] longmatch)815 	private static int countRightClipLong(byte[] longmatch){
816 		for(int i=0, j=longmatch.length-1; i<longmatch.length; i++, j--){
817 			if(longmatch[j]!='C'){return i;}
818 		}
819 		return longmatch.length;
820 	}
821 
toJunctions(Read r, SamLine sl, final int scafnum, boolean containsVars, final int minClip)822 	private static ArrayList<Var> toJunctions(Read r, SamLine sl, final int scafnum,
823 			boolean containsVars, final int minClip){
824 		final byte[] match0=r.match, match;
825 		final byte[] bases=r.bases;
826 //		final int rpos0=sl.pos-1;
827 		final int start, stop;
828 		int leftClip=countLeftClip(match0), rightClip=countRightClip(match0);
829 		if(leftClip==0 && rightClip==0){//try soft-clipping
830 			int[] rvec=KillSwitch.allocInt1D(2);
831 			byte[] smatch=SoftClipper.softClipMatch(match0, minClip, false, r.start, r.stop, rvec);
832 			if(smatch==null){
833 				return null;
834 			}else{
835 				start=rvec[0];
836 				stop=rvec[1];
837 				match=smatch;
838 				leftClip=countLeftClip(match);
839 				rightClip=countRightClip(match);
840 			}
841 		}else{
842 			if(leftClip<minClip && rightClip<minClip){return null;}
843 			start=r.start;
844 			stop=r.stop;
845 			match=match0;
846 		}
847 
848 		ArrayList<Var> list=new ArrayList<Var>();
849 		if(leftClip>=minClip){//LJUNCT
850 			int bpos=leftClip-1;
851 			byte jcall=bases[bpos];
852 			int jpos=start+leftClip;
853 			Var v=new Var(scafnum, jpos, jpos+1, jcall, LJUNCT);
854 			v.add(r, bpos, bpos+1);
855 			list.add(v);
856 		}
857 		if(rightClip>=minClip){//RJUNCT
858 			int bpos=bases.length-rightClip;
859 			byte jcall=bases[bpos];
860 			int jpos=stop-rightClip+1;
861 			Var v=new Var(scafnum, jpos, jpos+1, jcall, RJUNCT);
862 			v.add(r, bpos, bpos+1);
863 			list.add(v);
864 		}
865 		return list;
866 	}
867 
calcBstart(Read r, SamLine sl)868 	public int calcBstart(Read r, SamLine sl){
869 		r.toLongMatchString(false);
870 		byte[] match=r.match;
871 		final int rstart=sl.pos-1;
872 		final int type=type();
873 
874 		int bstart=-1;
875 
876 		for(int mpos=0, rpos=rstart, bpos=0; mpos<match.length; mpos++){
877 			byte m=match[mpos];
878 			if(m=='C'){
879 				bpos++;
880 			}else if(m=='m' || m=='S' || m=='N'){
881 				if(rpos==rstart){
882 					assert(type==SUB || type==NOCALL) : type+", "+bpos+", "+rpos+"\n"+new String(match);
883 					bstart=bpos;
884 					break;
885 				}
886 				bpos++;
887 				rpos++;
888 			}else if(m=='D'){
889 				if(rpos==rstart){
890 					assert(type==DEL) : type+", "+rpos+"\n"+new String(match);
891 					bstart=bpos;
892 					break;
893 				}
894 				rpos++;
895 			}else if(m=='I'){
896 				if(rpos==rstart && type==INS){
897 					bstart=bpos;
898 					break;
899 				}
900 				bpos++;
901 			}else{
902 				assert(false) : "Unhandled symbol "+(char)m;
903 			}
904 		}
905 		assert(bstart>=0);
906 		return bstart;
907 	}
908 
calcBstop(int bstart, Read r)909 	public int calcBstop(int bstart, Read r){
910 		assert(bstart>=0);
911 		int bstop=bstart+readlen();
912 		assert(bstop<=r.length());
913 		return bstop;
914 	}
915 
calcEndDist(int bstart, int bstop, Read r)916 	public int calcEndDist(int bstart, int bstop, Read r){
917 		int dist=Tools.min(bstart, r.length()-bstop);
918 		assert(dist>=0 && dist<=r.length()/2) : dist+", "+r.length()+", "+bstart+", "+bstop+"\n"+this+"\n"+
919 			"\n"+new String(r.match)+"\n"+r.samline+"\n";
920 		assert(dist<=(r.length()-readlen())/2) : "\ndist="+dist+", r.len="+r.length()+", readlen="+readlen()+", allele='"+new String(allele)+
921 			"', allele.length="+allele.length+"\n"+"allele2="+toString(allele)+"\n(r.length()-readlen())/2="+((r.length()-readlen())/2)+"bstart="+bstart+", bstop="+bstop+
922 			"\nthis="+this+"\nmatch="+new String(r.match)+"\nsamline="+r.samline+"\n";
923 		return dist;
924 	}
925 
toString(byte[] array)926 	String toString(byte[] array) {
927 		StringBuilder sb=new StringBuilder();
928 		for(byte b : array){
929 			sb.append((int)b).append(',');
930 		}
931 		return sb.toString();
932 	}
933 
calcBaseQ(final int bstart0, final int bstop0, Read r, SamLine sl)934 	public int calcBaseQ(final int bstart0, final int bstop0, Read r, SamLine sl){
935 		final byte[] quals=r.quality;
936 		if(quals==null){return Shared.FAKE_QUAL;}
937 		final int type=type();
938 		final int bstart, bstop;
939 		final int len=r.length();
940 
941 		if(sl.strand()==0 || (sl.strand()==1 && r.swapped())){
942 			bstart=bstart0;
943 			bstop=bstop0;
944 		}else{
945 			bstart=len-bstop0-1;
946 			bstop=len-bstart0-1;
947 			assert(bstop-bstart==bstop0-bstart0);
948 		}
949 
950 		int sum=0, avg=0;
951 		if(type==DEL){
952 			if(bstart==0){
953 				sum=avg=quals[0];
954 			}else if(bstop>=len-1){
955 				sum=avg=quals[len-1];
956 			}else{
957 				assert(bstop==bstart) : bstart0+", "+bstop0+", "+bstart+", "+bstop+"\n"+
958 						r.length()+", "+r.swapped()+", "+type()+", "+readlen()+", "+reflen()+
959 						"\n"+this+"\n"+new String(r.match)+"\n"+r.samline+"\n";
960 
961 //				-1, 73, -1, 73
962 //				151, true, 2, 0, 1
963 
964 				sum=quals[bstart]+quals[bstop+1];
965 				avg=sum/2;
966 			}
967 		}else{
968 			for(int i=bstart; i<bstop; i++){
969 				sum+=quals[i];
970 			}
971 			avg=Math.round(sum/(bstop-bstart));
972 		}
973 		return avg;
974 	}
975 
reflen()976 	public int reflen(){
977 		return stop-start;
978 	}
979 
readlen()980 	int readlen(){
981 		return (allele==null || allele.length==0 || allele[0]=='.' ? 0 : allele.length);
982 	}
983 
type()984 	public int type(){return type;}
985 
type_old()986 	int type_old(){
987 		int reflen=reflen(), readlen=readlen();
988 		return typeReadlenReflen(readlen, reflen, allele);
989 	}
990 
typeStartStop(int start, int stop, byte[] allele)991 	static int typeStartStop(int start, int stop, byte[] allele){
992 		final int readlen=(allele.length==0 || allele[0]=='.' ? 0 : allele.length);
993 		final int reflen=stop-start;
994 		return typeReadlenReflen(readlen, reflen, allele);
995 	}
996 
typeReadlenReflen(int readlen, int reflen, byte[] allele)997 	static int typeReadlenReflen(int readlen, int reflen, byte[] allele){
998 		if(reflen<readlen){return INS;}
999 		if(reflen>readlen){return DEL;}
1000 //		assert(readlen>0) : start+", "+stop+", "+reflen+", "+readlen+", "+new String(allele);
1001 //		if(reflen==0){return INS;}
1002 //		if(readlen==0){return DEL;}
1003 //		assert(start<=stop) : start+", "+stop;
1004 //		assert(reflen==readlen) : reflen+", "+readlen+", "+new String(allele)+", "+start+", "+stop;
1005 		for(byte b : allele){
1006 			if(b!='N'){return SUB;}
1007 		}
1008 		return NOCALL;
1009 	}
1010 
typeString()1011 	String typeString(){
1012 		return typeArray[type()];
1013 	}
1014 
1015 	/*--------------------------------------------------------------*/
1016 	/*----------------       Contract Methods       ----------------*/
1017 	/*--------------------------------------------------------------*/
1018 
1019 	@Override
equals(Object b)1020 	public boolean equals(Object b){
1021 		return equals((Var)b);
1022 	}
1023 
equals(Var b)1024 	public boolean equals(Var b){
1025 		return hashcode==b.hashcode && compareTo(b)==0;
1026 	}
1027 
1028 	@Override
hashCode()1029 	public int hashCode(){
1030 		return hashcode;
1031 	}
1032 
toKey()1033 	public long toKey() {
1034 		final int len=(type==DEL ? reflen() : readlen());
1035 		final long key=type^((hash(allele)&0x3F)>>alleleShift)^(len>>lenShift)^(Long.rotateRight(start, startShift));
1036 		return key&0x7FFFFFFFFFFFFFFFL;
1037 
1038 //		long key=Long.rotateLeft(start, 30)^Long.rotateLeft(scafnum, 10)^hash(allele);
1039 //		return key&0x3FFFFFFFFFFFFFFFL;
1040 	}
1041 
1042 	@Override
compareTo(Var v)1043 	public int compareTo(Var v){
1044 		if(scafnum!=v.scafnum){return scafnum-v.scafnum;}
1045 		final int typeA=type(), typeB=v.type();
1046 		int stA=start+(typeA==DEL ? -1 : 0);
1047 		int stB=v.start+(typeB==DEL ? -1 : 0);
1048 		if(stA!=stB){return stA-stB;}
1049 		if(typeA!=typeB){return typeA-typeB;}
1050 		if(stop!=v.stop){return stop-v.stop;}
1051 		return compare(allele, v.allele);
1052 	}
1053 
compare(byte[] a, byte[] b)1054 	public int compare(byte[] a, byte[] b){
1055 		if(a==b){return 0;}
1056 		if(a.length!=b.length){return b.length-a.length;}
1057 		for(int i=0; i<a.length; i++){
1058 			byte ca=a[i], cb=b[i];
1059 			if(ca!=cb){return ca-cb;}
1060 		}
1061 		return 0;
1062 	}
1063 
1064 	@Override
toString()1065 	public String toString(){
1066 		return toTextQuick(new ByteBuilder()).toString();
1067 	}
1068 
toTextQuick(ByteBuilder bb)1069 	public ByteBuilder toTextQuick(ByteBuilder bb){
1070 		return toText(bb, 0.99f, 30, 30, 150, 1, 2, null);
1071 	}
1072 
toVarHeader(double properPairRate, double totalQualityAvg, double mapqAvg, double rarity, double minAlleleFraction, int ploidy, long reads, long pairs, long properPairs, long bases, String ref)1073 	public static String toVarHeader(double properPairRate, double totalQualityAvg, double mapqAvg, double rarity, double minAlleleFraction, int ploidy,
1074 			long reads, long pairs, long properPairs, long bases, String ref){
1075 		StringBuilder sb=new StringBuilder();
1076 
1077 		final double readLengthAvg=bases/Tools.max(1.0, reads);
1078 		sb.append("#fileformat\tVar_"+varFormat+"\n");
1079 		sb.append("#BBMapVersion\t"+Shared.BBMAP_VERSION_STRING+"\n");
1080 		sb.append("#ploidy\t"+ploidy+"\n");
1081 		sb.append(String.format(Locale.ROOT, "#rarity\t%.5f\n", rarity));
1082 		sb.append(String.format(Locale.ROOT, "#minAlleleFraction\t%.4f\n", minAlleleFraction));
1083 		sb.append("#reads\t"+reads+"\n");
1084 		sb.append("#pairedReads\t"+pairs+"\n");
1085 		sb.append("#properlyPairedReads\t"+properPairs+"\n");
1086 		sb.append(String.format(Locale.ROOT, "#readLengthAvg\t%.2f\n", readLengthAvg));
1087 		sb.append(String.format(Locale.ROOT, "#properPairRate\t%.4f\n", properPairRate));
1088 		sb.append(String.format(Locale.ROOT, "#totalQualityAvg\t%.4f\n", totalQualityAvg));
1089 		sb.append(String.format(Locale.ROOT, "#mapqAvg\t%.2f\n", mapqAvg));
1090 		if(ref!=null){sb.append("#reference\t"+ref+"\n");}
1091 
1092 		sb.append("#scaf\tstart\tstop\ttype\tcall\tr1p\tr1m\tr2p\tr2m\tpaired\tlengthSum");
1093 		sb.append("\tmapq\tmapqMax\tbaseq\tbaseqMax\tedist\tedistMax\tid\tidMax");
1094 		sb.append("\tcov\tminusCov");
1095 		sb.append("\tnearbyVarCount");
1096 		sb.append("\tflagged");
1097 		sb.append("\tcontigEndDist");
1098 		sb.append("\tphredScore");
1099 		if(extendedText){
1100 			sb.append("\treadCount\talleleFraction\trevisedAF\tstrandRatio\tbaseqAvg\tmapqAvg\tedistAvg\tidentityAvg");
1101 			sb.append("\tedistScore\tidentityScore\tqualityScore\tpairedScore\tbiasScore\tcoverageScore\thomopolymerScore\tscore");
1102 //			sb.append("\tforced");
1103 		}
1104 		return sb.toString();
1105 	}
1106 
toBasicHeader()1107 	public static String toBasicHeader(){
1108 		StringBuilder sb=new StringBuilder();
1109 		sb.append("#scaf\tstart\tstop\ttype\tcall\tr1p\tr1m\tr2p\tr2m\tpaired\tlengthSum");
1110 		sb.append("\tmapq\tmapqMax\tbaseq\tbaseqMax\tedist\tedistMax\tid\tidMax");
1111 		sb.append("\tcov\tminusCov");
1112 		sb.append("\tnearVarCount");
1113 		sb.append("\tcontigEndDist");
1114 		sb.append("\tphredScore");
1115 		return sb.toString();
1116 	}
1117 
toText(ByteBuilder bb, double properPairRate, double totalQualityAvg, double totalMapqAvg, double readLengthAvg, double rarity, int ploidy, ScafMap map)1118 	public ByteBuilder toText(ByteBuilder bb, double properPairRate, double totalQualityAvg, double totalMapqAvg, double readLengthAvg, double rarity, int ploidy, ScafMap map){
1119 		useIdentity=true;
1120 		bb.append(scafnum).tab();
1121 		bb.append(start).tab();
1122 		bb.append(stop).tab();
1123 		bb.append(typeArray[type()]).tab();
1124 		for(byte b : allele){bb.append(b);}
1125 		bb.tab();
1126 
1127 		bb.append(r1plus).tab();
1128 		bb.append(r1minus).tab();
1129 		bb.append(r2plus).tab();
1130 		bb.append(r2minus).tab();
1131 		bb.append(properPairCount).tab();
1132 		bb.append(lengthSum).tab();
1133 
1134 		bb.append(mapQSum).tab();
1135 		bb.append(mapQMax).tab();
1136 		bb.append(baseQSum).tab();
1137 		bb.append(baseQMax).tab();
1138 		bb.append(endDistSum).tab();
1139 		bb.append(endDistMax).tab();
1140 		bb.append(idSum).tab();
1141 		bb.append(idMax).tab();
1142 
1143 		bb.append(coverage).tab();
1144 		bb.append(minusCoverage).tab();
1145 		bb.append(nearbyVarCount).tab();
1146 		bb.append(flagged ? 1 : 0).tab();
1147 
1148 		final int scafEndDist=!doNscan ? nScan : (map==null ? start : contigEndDist(map));
1149 		bb.append(scafEndDist).tab();
1150 
1151 //		bb.append(prevBase<0 ? 'N' : (char)prevBase).tab();
1152 
1153 		final double score=score(properPairRate, totalQualityAvg, totalMapqAvg, readLengthAvg, rarity, ploidy, map);
1154 //		bb.append(String.format(Locale.ROOT, "%.2f", toPhredScore(score))).tab();
1155 		bb.append(toPhredScore(score), 2).tab();
1156 
1157 		if(extendedText){
1158 
1159 			bb.append(alleleCount()).tab();
1160 			final double af=alleleFraction();
1161 			bb.append(af, 4).tab();
1162 			bb.append(revisedAlleleFraction(af, readLengthAvg), 4).tab();
1163 			bb.append(strandRatio(), 4).tab();
1164 			bb.append(baseQAvg(), 2).tab();
1165 			bb.append(mapQAvg(), 2).tab();
1166 			bb.append(edistAvg(), 2).tab();
1167 			bb.append(identityAvg(), 2).tab();
1168 
1169 			bb.append(edistScore(), 4).tab();
1170 			bb.append(identityScore(), 4).tab();
1171 			bb.append(qualityScore(totalQualityAvg, totalMapqAvg), 4).tab();
1172 			bb.append(pairedScore(properPairRate, scafEndDist), 4).tab();
1173 			bb.append(biasScore(properPairRate, scafEndDist), 4).tab();
1174 			bb.append(coverageScore(ploidy, rarity, readLengthAvg), 4).tab();
1175 			bb.append(homopolymerScore(map), 4).tab();
1176 			bb.append(score, 4).tab();
1177 //			bb.append((forced() ? 1 : 0), 4).tab();
1178 		}
1179 
1180 		bb.length--;
1181 
1182 		return bb;
1183 	}
1184 
toVcfHeader(double properPairRate, double totalQualityAvg, double mapqAvg, double rarity, double minAlleleFraction, int ploidy, long reads, long pairs, long properPairs, long bases, String ref, ScafMap map, String sampleName, boolean trimWhitespace)1185 	public static String toVcfHeader(double properPairRate, double totalQualityAvg, double mapqAvg,
1186 			double rarity, double minAlleleFraction, int ploidy, long reads, long pairs,
1187 			long properPairs, long bases, String ref, ScafMap map, String sampleName,
1188 			boolean trimWhitespace) {
1189 		StringBuilder sb=new StringBuilder();
1190 
1191 		sb.append("##fileformat=VCFv4.2\n");
1192 		sb.append("##BBMapVersion="+Shared.BBMAP_VERSION_STRING+"\n");
1193 		sb.append("##ploidy="+ploidy+"\n");
1194 		sb.append(String.format(Locale.ROOT, "##rarity=%.5f\n", rarity));
1195 		sb.append(String.format(Locale.ROOT, "##minallelefraction=%.5f\n", minAlleleFraction));
1196 		sb.append("##reads="+reads+"\n");
1197 		sb.append("##pairedReads="+pairs+"\n");
1198 		sb.append("##properlyPairedReads="+properPairs+"\n");
1199 		sb.append(String.format(Locale.ROOT, "##readLengthAvg=%.3f\n", (bases/Tools.max(reads, 1.0))));
1200 		sb.append(String.format(Locale.ROOT, "##properPairRate=%.5f\n", properPairRate));
1201 		sb.append(String.format(Locale.ROOT, "##totalQualityAvg=%.3f\n", totalQualityAvg));
1202 		sb.append(String.format(Locale.ROOT, "##mapqAvg=%.3f\n", mapqAvg));
1203 		if(ref!=null){sb.append("##reference="+ref+"\n");}
1204 
1205 		for(Scaffold scaf : map.list){
1206 			String name=scaf.name;
1207 			if(trimWhitespace){name=Tools.trimWhitespace(name);}
1208 			sb.append("##contig=<ID="+name+",length="+scaf.length+">\n");
1209 		}
1210 
1211 		{
1212 			sb.append("##FILTER=<ID=FAIL,Description=\"Fail\">\n");
1213 			sb.append("##FILTER=<ID=PASS,Description=\"Pass\">\n");
1214 
1215 			sb.append("##INFO=<ID=SN,Number=1,Type=Integer,Description=\"Scaffold Number\">\n");
1216 			sb.append("##INFO=<ID=STA,Number=1,Type=Integer,Description=\"Start\">\n");
1217 			sb.append("##INFO=<ID=STO,Number=1,Type=Integer,Description=\"Stop\">\n");
1218 			sb.append("##INFO=<ID=TYP,Number=1,Type=String,Description=\"Type\">\n");
1219 
1220 			sb.append("##INFO=<ID=R1P,Number=1,Type=Integer,Description=\"Read1 Plus Count\">\n");
1221 			sb.append("##INFO=<ID=R1M,Number=1,Type=Integer,Description=\"Read1 Minus Count\">\n");
1222 			sb.append("##INFO=<ID=R2P,Number=1,Type=Integer,Description=\"Read2 Plus Count\">\n");
1223 			sb.append("##INFO=<ID=R2M,Number=1,Type=Integer,Description=\"Read2 Minus Count\">\n");
1224 
1225 			sb.append("##INFO=<ID=AD,Number=1,Type=Integer,Description=\"Allele Depth\">\n");
1226 			sb.append("##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">\n");
1227 //			sb.append("##INFO=<ID=COV,Number=1,Type=Integer,Description=\"Coverage\">\n");
1228 			sb.append("##INFO=<ID=MCOV,Number=1,Type=Integer,Description=\"Minus Coverage\">\n");
1229 			sb.append("##INFO=<ID=PPC,Number=1,Type=Integer,Description=\"Paired Count\">\n");
1230 
1231 			sb.append("##INFO=<ID=AF,Number=1,Type=Float,Description=\"Allele Fraction\">\n");
1232 			sb.append("##INFO=<ID=RAF,Number=1,Type=Float,Description=\"Revised Allele Fraction\">\n");
1233 			sb.append("##INFO=<ID=LS,Number=1,Type=Integer,Description=\"Length Sum\">\n");
1234 
1235 			sb.append("##INFO=<ID=MQS,Number=1,Type=Integer,Description=\"MAPQ Sum\">\n");
1236 			sb.append("##INFO=<ID=MQM,Number=1,Type=Integer,Description=\"MAPQ Max\">\n");
1237 			sb.append("##INFO=<ID=BQS,Number=1,Type=Integer,Description=\"Base Quality Sum\">\n");
1238 			sb.append("##INFO=<ID=BQM,Number=1,Type=Integer,Description=\"Base Quality Max\">\n");
1239 
1240 			sb.append("##INFO=<ID=EDS,Number=1,Type=Integer,Description=\"End Distance Sum\">\n");
1241 			sb.append("##INFO=<ID=EDM,Number=1,Type=Integer,Description=\"End Distance Max\">\n");
1242 			sb.append("##INFO=<ID=IDS,Number=1,Type=Integer,Description=\"Identity Sum\">\n");
1243 			sb.append("##INFO=<ID=IDM,Number=1,Type=Integer,Description=\"Identity Max\">\n");
1244 
1245 			sb.append("##INFO=<ID=NVC,Number=1,Type=Integer,Description=\"Nearby Variation Count\">\n");
1246 			sb.append("##INFO=<ID=FLG,Number=1,Type=Integer,Description=\"Flagged\">\n");
1247 			sb.append("##INFO=<ID=CED,Number=1,Type=Integer,Description=\"Contig End Distance\">\n");
1248 			sb.append("##INFO=<ID=HMP,Number=1,Type=Integer,Description=\"Homopolymer Count\">\n");
1249 			sb.append("##INFO=<ID=SB,Number=1,Type=Float,Description=\"Strand Bias\">\n");
1250 			sb.append("##INFO=<ID=DP4,Number=4,Type=Integer,Description=\"Ref+, Ref-, Alt+, Alt-\">\n");
1251 
1252 			sb.append("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n");
1253 			sb.append("##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read Depth\">\n");
1254 			sb.append("##FORMAT=<ID=AD,Number=1,Type=Integer,Description=\"Allele Depth\">\n");
1255 			sb.append("##FORMAT=<ID=AF,Number=1,Type=Float,Description=\"Allele Fraction\">\n");
1256 			sb.append("##FORMAT=<ID=RAF,Number=1,Type=Float,Description=\"Revised Allele Fraction\">\n");
1257 			sb.append("##FORMAT=<ID=NVC,Number=1,Type=Integer,Description=\"Nearby Variation Count\">\n");
1258 			sb.append("##FORMAT=<ID=FLG,Number=1,Type=Integer,Description=\"Flagged\">\n");
1259 			sb.append("##FORMAT=<ID=SB,Number=1,Type=Float,Description=\"Strand Bias\">\n");
1260 
1261 			sb.append("##FORMAT=<ID=SC,Number=1,Type=Float,Description=\"Score\">\n");
1262 			sb.append("##FORMAT=<ID=PF,Number=1,Type=String,Description=\"Pass Filter\">\n");
1263 
1264 		}
1265 
1266 		sb.append("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT");
1267 		if(sampleName!=null){sb.append('\t').append(sampleName);}
1268 		return sb.toString();
1269 	}
1270 
1271 	//#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	1323_1066121
1272 	//ChrIII_A_nidulans_FGSC_A4	133	.	T	C	204	PASS
1273 	//	DP=27;VDB=1.614640e-01;RPB=9.947863e-01;AF1=0.5;AC1=1;DP4=5,8,6,8;MQ=49;FQ=180;PV4=1,1,0.055,1
1274 	//	GT:PL:DP:SP:GQ	0/1:234,0,207:27:0:99
toVCF(ByteBuilder bb, double properPairRate, double totalQualityAvg, double mapqAvg, double readLengthAvg, int ploidy, ScafMap map, VarFilter filter, boolean trimWhitespace)1275 	public ByteBuilder toVCF(ByteBuilder bb, double properPairRate, double totalQualityAvg, double mapqAvg, double readLengthAvg,
1276 			int ploidy, ScafMap map, VarFilter filter, boolean trimWhitespace){
1277 
1278 		final Scaffold scaf=map.getScaffold(scafnum);
1279 		final byte[] bases=scaf.bases;
1280 		final int reflen=reflen(), readlen=readlen(), type=type();
1281 		final double score=phredScore(properPairRate, totalQualityAvg, mapqAvg, readLengthAvg, filter.rarity, ploidy, map);
1282 		final boolean pass=(filter==null ? true :
1283 			filter.passesFilter(this, properPairRate, totalQualityAvg, mapqAvg, readLengthAvg, ploidy, map, true));
1284 		bb.append(trimWhitespace ? Tools.trimWhitespace(scaf.name) : scaf.name).tab();
1285 		boolean indel=(type==INS || type==DEL);
1286 		boolean addPrevBase=true;
1287 		final int vcfStart=start+(indel && addPrevBase ? 0 : 1);
1288 		bb.append(vcfStart).tab();
1289 		bb.append('.').tab();
1290 
1291 		byte prevBase=(bases==null ? (byte)'N' : bases[Tools.mid(start-1, 0, bases.length-1)]);
1292 		if(UPPER_CASE_ALLELES){prevBase=(byte) Tools.toUpperCase(prevBase);}
1293 
1294 		if(addPrevBase){
1295 			if(reflen==0 || allele.length<1){bb.append(prevBase);}
1296 			for(int i=0, rpos=start; i<reflen; i++, rpos++){
1297 				bb.append(bases==null || rpos<0 || rpos>=bases.length ? (char)'N' : (char)bases[rpos]);
1298 			}
1299 			bb.tab();
1300 
1301 			if(reflen==0 || allele.length<1){bb.append(prevBase);}
1302 			bb.append(allele).tab();
1303 		}else{
1304 			if(reflen==0){
1305 				bb.append('.');
1306 			}else{
1307 				for(int i=0, rpos=start; i<reflen; i++, rpos++){
1308 					char refBase=bases==null || rpos<0 || rpos>=bases.length ? (char)'N' : (char)bases[rpos];
1309 					if(UPPER_CASE_ALLELES){refBase=Tools.toUpperCase(refBase);}
1310 					bb.append(refBase);
1311 				}
1312 			}
1313 			bb.tab();
1314 
1315 			if(allele.length<1){
1316 				bb.append('.').tab();
1317 			}else{
1318 				bb.append(allele).tab();
1319 			}
1320 		}
1321 
1322 //		bb.append(String.format(Locale.ROOT, "%.2f\t", score));
1323 		bb.append(score, 2).tab();
1324 		bb.append(pass ? "PASS\t" : "FAIL\t");
1325 
1326 		final int scafEndDist=!doNscan ? nScan : (map==null ? start : contigEndDist(map));
1327 		final int count=alleleCount();
1328 		final double af=alleleFraction();
1329 		final double raf=revisedAlleleFraction(af, readLengthAvg);
1330 		final double strandBias=strandBiasScore(scafEndDist);
1331 
1332 		{//INFO
1333 			assert(Scaffold.trackStrand()==(minusCoverage>=0)) : Scaffold.trackStrand()+", "+minusCoverage;
1334 			final int covMinus=(Scaffold.trackStrand() ? minusCoverage : coverage/2);
1335 			final int covPlus=Tools.max(0, coverage-covMinus);
1336 			final int refMinus=Tools.max(0, covMinus-alleleMinusCount());
1337 			final int refPlus=Tools.max(0, covPlus-allelePlusCount());
1338 
1339 			bb.append("SN=").append(scafnum).append(';');
1340 			bb.append("STA=").append(start).append(';');
1341 			bb.append("STO=").append(stop).append(';');
1342 			bb.append("TYP=").append(typeArray[type()]).append(';');
1343 
1344 			bb.append("R1P=").append(r1plus).append(';');
1345 			bb.append("R1M=").append(r1minus).append(';');
1346 			bb.append("R2P=").append(r2plus).append(';');
1347 			bb.append("R2M=").append(r2minus).append(';');
1348 
1349 			bb.append("AD=").append(count).append(';');
1350 			bb.append("DP=").append(Tools.max(coverage, count)).append(';');
1351 			bb.append("MCOV=").append(minusCoverage).append(';');
1352 			bb.append("PPC=").append(properPairCount).append(';');
1353 
1354 			bb.append("AF=").append(af,4).append(';');
1355 			bb.append("RAF=").append(raf,4).append(';');
1356 			bb.append("LS=").append(lengthSum).append(';');
1357 
1358 			bb.append("MQS=").append(mapQSum).append(';');
1359 			bb.append("MQM=").append(mapQMax).append(';');
1360 			bb.append("BQS=").append(baseQSum).append(';');
1361 			bb.append("BQM=").append(baseQMax).append(';');
1362 
1363 			bb.append("EDS=").append(endDistSum).append(';');
1364 			bb.append("EDM=").append(endDistMax).append(';');
1365 			bb.append("IDS=").append(idSum).append(';');
1366 			bb.append("IDM=").append(idMax).append(';');
1367 
1368 			bb.append("NVC=").append(nearbyVarCount).append(';');
1369 			bb.append("FLG=").append(flagged ? 1 : 0).append(';');
1370 			bb.append("CED=").append(scafEndDist).append(';');
1371 			bb.append("HMP=").append(homopolymerCount(map)).append(';');
1372 			bb.append("SB=").append(strandBias,4).append(';');
1373 
1374 			if(Scaffold.trackStrand()){
1375 				bb.append("DP4=").append(refPlus).append(',').append(refMinus).append(',');
1376 				bb.append(allelePlusCount()).append(',').append(alleleMinusCount()).append(';');
1377 			}
1378 
1379 			bb.length--;
1380 		}
1381 		{
1382 			bb.tab();
1383 			bb.append("GT:DP:AD:AF:RAF:NVC:FLG:SB:SC:PF");
1384 			bb.tab();
1385 
1386 			bb.append(genotype(ploidy, pass));
1387 			bb.append(':');
1388 			bb.append(Tools.max(coverage, count));
1389 			bb.append(':');
1390 			bb.append(count);
1391 			bb.append(':');
1392 			bb.append(af,4);
1393 			bb.append(':');
1394 
1395 			bb.append(raf,4);
1396 			bb.append(':');
1397 			bb.append(nearbyVarCount);
1398 			bb.append(':');
1399 			bb.append(flagged ? 1 : 0);
1400 			bb.append(':');
1401 			bb.append(strandBias,4);
1402 			bb.append(':');
1403 
1404 			bb.append(score,2);
1405 			bb.append(':');
1406 			bb.append(pass ? "PASS" : "FAIL");
1407 		}
1408 
1409 		return bb;
1410 	}
1411 
calcCopies(int ploidy)1412 	public int calcCopies(int ploidy){
1413 		final double af=alleleFraction();
1414 //		final int count=count();
1415 		if(ploidy==1){
1416 			return af<0.4 ? 0 : 1;
1417 		}else if(ploidy==2){
1418 			if(af<0.2){return 0;}
1419 			if(af<0.8){return 1;}
1420 			return 2;
1421 		}
1422 
1423 		int copies=(int)Math.round(ploidy*af);
1424 		if(af>=0.5){copies=Tools.max(copies, 1);}
1425 		copies=Tools.mid(MIN_VAR_COPIES, copies, ploidy);
1426 		return copies;
1427 	}
1428 
1429 	//TODO: Actually, I should also track the ref coverage.
genotype(int ploidy, boolean pass)1430 	private String genotype(int ploidy, boolean pass) {
1431 		if(!pass && noPassDotGenotype){
1432 			if(ploidy==1){return ".";}
1433 			else if(ploidy==2){return "./.";}
1434 			StringBuilder sb=new StringBuilder(ploidy*2-1);
1435 			sb.append('.');
1436 			for(int i=1; i<ploidy; i++){
1437 				sb.append('/').append('.');
1438 			}
1439 			return sb.toString();
1440 		}
1441 		int copies=calcCopies(ploidy);
1442 		if(ploidy==1){return copies==0 ? "0" : "1";}
1443 		if(ploidy==2){
1444 			if(copies==0){return "0/0";}
1445 			if(copies==1){return "0/1";}
1446 			return "1/1";
1447 		}
1448 		StringBuilder sb=new StringBuilder(ploidy*2);
1449 		int refCopies=ploidy-copies;
1450 
1451 		for(int i=0; i<refCopies; i++){
1452 			sb.append(0).append('/');
1453 		}
1454 		for(int i=0; i<copies; i++){
1455 			sb.append(1).append('/');
1456 		}
1457 		sb.setLength(sb.length()-1);
1458 		return sb.toString();
1459 	}
1460 
hash()1461 	private int hash(){
1462 		return scafnum^Integer.rotateLeft(start, 9)^Integer.rotateRight(stop, 9)^hash(allele);
1463 	}
1464 
hash(byte[] a)1465 	public static final int hash(byte[] a){
1466 		int code=123456789;
1467 		for(byte b : a){
1468 			code=Integer.rotateLeft(code, 3)^codes[b];
1469 		}
1470 		return code&Integer.MAX_VALUE;
1471 	}
1472 
calcCoverage(ScafMap map)1473 	public int calcCoverage(ScafMap map){
1474 		if(coverage>=0){return coverage;}
1475 
1476 		Scaffold scaf=map.getScaffold(scafnum);
1477 		coverage=scaf.calcCoverage(this);
1478 		if(Scaffold.trackStrand()){minusCoverage=scaf.minusCoverage(this);}
1479 		return coverage;
1480 	}
1481 
1482 	/*--------------------------------------------------------------*/
1483 	/*----------------        Scoring Methods       ----------------*/
1484 	/*--------------------------------------------------------------*/
1485 
toPhredScore(double score)1486 	public static double toPhredScore(double score){
1487 		if(score==0){return 0;}
1488 		score=score*0.998;
1489 		return 2.5*QualityTools.probErrorToPhredDouble(1-score);
1490 	}
1491 
phredScore(double properPairRate, double totalQualityAvg, double totalMapqAvg, double readLengthAvg, double rarity, int ploidy, ScafMap map)1492 	public double phredScore(double properPairRate, double totalQualityAvg, double totalMapqAvg, double readLengthAvg, double rarity, int ploidy, ScafMap map){
1493 		double score=score(properPairRate, totalQualityAvg, totalMapqAvg, readLengthAvg, rarity, ploidy, map);
1494 		return toPhredScore(score);
1495 	}
1496 
score(double properPairRate, double totalQualityAvg, double totalMapqAvg, double readLengthAvg, double rarity, int ploidy, ScafMap map)1497 	public double score(double properPairRate, double totalQualityAvg, double totalMapqAvg, double readLengthAvg, double rarity, int ploidy, ScafMap map){
1498 		int scafEndDist=(map==null ? start : contigEndDist(map));
1499 
1500 		double cs=coverageScore(ploidy, rarity, readLengthAvg);
1501 		if(cs==0){return 0;}
1502 		double es=(useEdist ? edistScore() : 1);
1503 		double qs=qualityScore(totalQualityAvg, totalMapqAvg);
1504 		double ps=(usePairing ? pairedScore(properPairRate, scafEndDist) : 1);
1505 		double bs=(useBias ? biasScore(properPairRate, scafEndDist) : 1);
1506 		double is=(useIdentity ? identityScore() : 1);
1507 		double hs=(useHomopolymer ? homopolymerScore(map) : 1);
1508 		return Math.pow(es*qs*ps*bs*cs*is*hs, 0.2);
1509 	}
1510 
edistScore()1511 	public double edistScore(){
1512 		double lengthAvg=lengthAvg();
1513 		double edistAvg=((edistAvg()*2+endDistMax))*0.333333333333;
1514 		double constant=5+Tools.min(20, lengthAvg*0.1)+lengthAvg*0.01;
1515 		double weighted=Tools.max(0.05, edistAvg-Tools.min(constant, edistAvg*0.95));
1516 		weighted=weighted*weighted;
1517 		return weighted/(weighted+4);
1518 	}
1519 
identityScore()1520 	public double identityScore(){
1521 		double lengthAvg=lengthAvg();
1522 		double idAvg=0.001f*(((identityAvg()+idMax))*0.5f);
1523 		double weighted=Tools.min(1, (idAvg*lengthAvg+(0.65f*Tools.max(1, readlen())))/lengthAvg); //Diminish impact of this var on itself
1524 		weighted=0.75f+0.25f*weighted; //Compress the range
1525 		return weighted;
1526 	}
1527 
qualityScore(double totalBaseqAvg, double totalMapqAvg)1528 	public double qualityScore(double totalBaseqAvg, double totalMapqAvg){
1529 		return baseQualityScore(totalBaseqAvg)*mapQualityScore(totalMapqAvg);
1530 	}
1531 
1532 //	public double baseQualityScore(double totalBaseqAvg){
1533 //		double bqAvg=baseQAvg();
1534 //		final double delta=Tools.max(0, totalBaseqAvg-bqAvg);
1535 //	}
1536 
baseQualityScore(double totalBaseqAvg)1537 	public double baseQualityScore(double totalBaseqAvg){
1538 		double bqAvg=baseQAvg();
1539 
1540 		if(totalBaseqAvg<32 && bqAvg<32){//Fudge factor for recalibrated quality scores, since this was calibrated for raw Illumina scores.
1541 			//This section is not well-tested, though.
1542 			double fudgeFactor1=0.75*(32-totalBaseqAvg);
1543 			double fudgeFactor2=0.75*(32-bqAvg);
1544 			totalBaseqAvg+=fudgeFactor1;
1545 			bqAvg+=Tools.min(fudgeFactor1, fudgeFactor2);
1546 		}
1547 
1548 		//Difference between this variation's quality and average quality.
1549 		//Positive delta means that this is lower quality
1550 		final double delta=totalBaseqAvg-bqAvg;
1551 		if(delta>0){//This is kind of mysterious, but appears to normally reduce bqAvg for low-quality vars
1552 			bqAvg=Tools.max(bqAvg*0.5, bqAvg-0.5*delta);
1553 		}
1554 
1555 		double mult=0.25;
1556 		double thresh=12;
1557 		if(bqAvg>thresh){
1558 			bqAvg=bqAvg-thresh+(thresh*mult);
1559 		}else{
1560 			bqAvg=bqAvg*mult;
1561 		}
1562 
1563 		double baseProbAvg=1-Math.pow(10, 0-.1*bqAvg);
1564 		double d=baseProbAvg*baseProbAvg;
1565 		return d;
1566 	}
1567 
1568 //	public double mapQualityScore(double totalMapqAvg){
1569 //		double mqAvg=mapQAvg();
1570 //		double mqAvg2=(0.25f*(3*mqAvg+mapQMax));
1571 //		final double delta=totalMapqAvg-mqAvg;
1572 //		double score=(1-Math.pow(10, 0-.1*(mqAvg2+4)));
1573 //		if(delta>0){
1574 //
1575 //		}else{
1576 //
1577 //		}
1578 //	}
1579 
mapQualityScore(double totalMapqAvg)1580 	public double mapQualityScore(double totalMapqAvg){
1581 		double mqAvg=0.5f*(mapQAvg()+mapQMax);
1582 
1583 		double mapProbAvg=1-Math.pow(10, 0-.1*(mqAvg+2));
1584 		double d=mapProbAvg;
1585 		return d;
1586 	}
1587 
pairedScore(double properPairRate, int scafEndDist)1588 	public double pairedScore(double properPairRate, int scafEndDist){
1589 		if(properPairRate<0.5){return 0.98;}
1590 		final double count=alleleCount();
1591 		if(count==0){return 0;}
1592 		double rate=properPairCount/count;
1593 		rate=rate*(count/(0.1+count));
1594 		if(rate*1.05>=properPairRate){return Tools.max(rate, 1-0.001*properPairRate);}
1595 		double score=((rate*1.05)/properPairRate)*0.5+0.5;
1596 		score=Tools.max(0.1, score);
1597 		return modifyByEndDist(score, scafEndDist);
1598 	}
1599 
modifyByEndDist(double x, int scafEndDist)1600 	public double modifyByEndDist(double x, int scafEndDist){
1601 		if(x>=0.99 || !doNscan || scafEndDist>=nScan){return x;}
1602 		if(scafEndDist<minEndDistForBias){return Tools.max(x, 0.98+0.02*x);}
1603 		double delta=1-x;
1604 		delta=delta*(scafEndDist*scafEndDist)/(nScan*nScan);
1605 		return 1-delta;
1606 	}
1607 
coverageScore(int ploidy, double rarity, double readLengthAvg)1608 	public double coverageScore(int ploidy, double rarity, double readLengthAvg){
1609 		int count=alleleCount();
1610 		if(count==0){return 0;}
1611 		double rawScore=count/(lowCoveragePenalty+count); //This may be too severe...
1612 
1613 //		double ratio=alleleFraction();
1614 
1615 		double ratio=0.98;
1616 		if(coverage>0){
1617 			double dif=coverage-count;
1618 			if(dif>0){
1619 				dif=dif-coverage*.01f-Tools.min(0.5f, coverage*.1f);
1620 				dif=Tools.max(0.1f, dif);
1621 			}
1622 			ratio=(coverage-dif)/coverage;
1623 			if(type()==SUB && revisedAlleleFraction!=-1 && revisedAlleleFraction<ratio){ratio=revisedAlleleFraction;}
1624 			else{
1625 				ratio=adjustForInsertionLength(ratio, readLengthAvg);
1626 			}
1627 			if(rarity<1 && ratio>rarity){
1628 				double minExpected=1f/ploidy;
1629 				if(ratio<minExpected){
1630 					ratio=minExpected-((minExpected-ratio)*0.1);
1631 				}
1632 			}
1633 		}
1634 
1635 		double ratio2=Tools.min(1, ploidy*ratio);
1636 		return rawScore*ratio2;
1637 	}
1638 
reviseAlleleFraction(double readLengthAvg, Scaffold scaffold, VarMap map)1639 	public void reviseAlleleFraction(double readLengthAvg, Scaffold scaffold, VarMap map){
1640 		assert(type()==INS);
1641 		final int ilen=readlen();
1642 		if(ilen<3 || start<1 || start>=scaffold.length-2){return;}
1643 		final byte[] bases=scaffold.bases;
1644 
1645 		final double afIns=alleleFraction();
1646 		final double rafIns=revisedAlleleFraction(afIns, readLengthAvg);
1647 		final double revisedDif=0.55*(rafIns-afIns); //Half on the left and half on the right, on average
1648 		final double mult=revisedDif/allele.length;
1649 
1650 		for(int i=0, j=start; i<allele.length && j<scaffold.bases.length; i++, j++){
1651 			final byte b=allele[i];
1652 			if(b!=bases[j]){
1653 				Var key=new Var(scaffold.number, j, j+1, b, SUB);
1654 				Var affectedSub=map.get(key);
1655 //				System.err.println("At pos "+j+": ref="+(char)bases[j]+", alt="+(char)b);
1656 				if(affectedSub!=null){
1657 					assert(key.type()==SUB);
1658 //					System.err.println("Found "+value);
1659 					final double subModifier=revisedDif-mult*i;
1660 					synchronized(affectedSub){
1661 						double afSub=affectedSub.alleleFraction();
1662 						double rafSub=affectedSub.revisedAlleleFraction;
1663 						double modified=afSub-subModifier;
1664 						if(rafSub==-1){
1665 //							System.err.println("sub="+sub+", old="+old);
1666 							affectedSub.revisedAlleleFraction=Tools.max(afSub*0.05, modified);
1667 						}else{
1668 							affectedSub.revisedAlleleFraction=Tools.min(rafSub, Tools.max(afSub*0.05, modified));
1669 						}
1670 					}
1671 				}
1672 			}
1673 		}
1674 
1675 		for(int i=0, j=start-1; i<allele.length && j>=0; i++, j--){
1676 			final byte b=allele[allele.length-1-i];
1677 			if(b!=bases[j]){
1678 				Var key=new Var(scaffold.number, j, j+1, b, SUB);
1679 				Var affectedSub=map.get(key);
1680 				if(affectedSub!=null){
1681 					assert(key.type()==SUB);
1682 //					System.err.println("Found "+value);
1683 					final double subModifier=revisedDif-mult*i;
1684 					synchronized(affectedSub){
1685 						double afSub=affectedSub.alleleFraction();
1686 						double rafSub=affectedSub.revisedAlleleFraction;
1687 						double modified=afSub-subModifier;
1688 						if(rafSub==-1){
1689 //							System.err.println("sub="+sub+", old="+old);
1690 							affectedSub.revisedAlleleFraction=Tools.max(afSub*0.05, modified);
1691 						}else{
1692 							affectedSub.revisedAlleleFraction=Tools.min(rafSub, Tools.max(afSub*0.05, modified));
1693 						}
1694 					}
1695 				}
1696 			}
1697 		}
1698 	}
1699 
revisedAlleleFraction(double af, double readLengthAvg)1700 	public double revisedAlleleFraction(double af, double readLengthAvg){
1701 		if(revisedAlleleFraction!=-1){
1702 			return revisedAlleleFraction;
1703 		}else if(type()==INS){
1704 			return revisedAlleleFraction=adjustForInsertionLength(af, readLengthAvg);
1705 		}
1706 		return af;
1707 	}
1708 
adjustForInsertionLength(final double ratio, final double rlen0)1709 	public double adjustForInsertionLength(final double ratio, final double rlen0){
1710 		if(type()!=INS){return ratio;}
1711 		final int ilen=readlen();
1712 		if(ilen<2){return ratio;}
1713 
1714 		final double rlen=Tools.max(ilen*1.2+6, rlen0);
1715 		final double sites=rlen+ilen-1;
1716 		final double goodSites=rlen-ilen*1.1-6;
1717 
1718 		final double expectedFraction=goodSites/sites;
1719 		final double revisedRatio=Tools.min(ratio/expectedFraction, 1-(1-ratio)*0.1);
1720 		return revisedRatio;
1721 	}
1722 
homopolymerScore(ScafMap map)1723 	public double homopolymerScore(ScafMap map){
1724 		if(map==null){return 1;}
1725 
1726 		int count=homopolymerCount(map);
1727 //		assert(false) : count;
1728 		if(count<2){return 1;}
1729 		return 1f-(count*0.1f/9);
1730 	}
1731 
homopolymerCount(ScafMap map)1732 	public int homopolymerCount(ScafMap map){
1733 		if(map==null){return 0;}
1734 		final byte[] bases=map.getScaffold(scafnum).bases;
1735 		if(bases==null){return 0;}
1736 
1737 		final int type=type();
1738 		if(type==SUB){
1739 			assert(start==stop-1) : start+", "+stop;
1740 			final byte base=allele[0];
1741 			int x=homopolymerCountSub(bases, start, base);
1742 //			assert(false) : (char)base+", "+x;
1743 			return x;
1744 		}else if(type==INS){
1745 			final byte base1=allele[0], base2=allele[allele.length-1];
1746 			int i=0;
1747 			while(i<allele.length && allele[i]==base1){i++;}
1748 			while(i<allele.length && allele[i]==base2){i++;}
1749 			if(i<bases.length){return 0;}
1750 			int left=homopolymerCountLeft(bases, start, base1);
1751 			int right=homopolymerCountRight(bases, stop+1, base2);
1752 //			assert(false) : "INS "+(left+right+1)+" "+new String(allele)+" "+new String(bases, start-4, 8);
1753 			return left+right+1;
1754 		}else if(type==DEL){
1755 			if(start<0 || start+1>=bases.length || stop<=0 || stop>=bases.length){return 0;}
1756 			final byte base1=bases[start+1], base2=bases[stop-1];
1757 			int pos=start+1;
1758 			while(pos<=stop && bases[pos]==base1){pos++;}
1759 			while(pos<=stop && bases[pos]==base2){pos++;}
1760 			if(pos<=stop){return 0;}
1761 			int left=homopolymerCountLeft(bases, start, base1);
1762 			int right=homopolymerCountRight(bases, stop, base2);
1763 //			assert(false || reflen()>10) : "DEL "+(left+right+1)+" "+new String(allele)+" "+new String(bases, start-4, stop-start+8);
1764 			return left+right+1;
1765 		}else{
1766 //			assert(false) : type();
1767 			return 0;
1768 		}
1769 	}
1770 
homopolymerCountSub(final byte[] bases, final int pos, final byte base)1771 	public static int homopolymerCountSub(final byte[] bases, final int pos, final byte base){
1772 //		System.err.println("A:"+pos+", "+bases.length+", "+(char)base);
1773 		if(pos<0 || pos>=bases.length){return 0;}
1774 		if(!AminoAcid.isFullyDefined(base)){return 0;}
1775 //		System.err.println("B:"+pos+", "+bases.length);
1776 
1777 		int count1=0;
1778 		for(int i=pos-1, lim=Tools.max(0, pos-4); i>=lim; i--){
1779 			if(bases[i]==base){count1++;}
1780 			else{break;}
1781 		}
1782 //		System.err.println("C:"+pos+", "+bases.length+", "+count1);
1783 
1784 		int count2=0;
1785 		for(int i=pos+1, lim=Tools.min(bases.length, pos+5); i<lim; i++){
1786 			if(bases[i]==base){count2++;}
1787 			else{break;}
1788 		}
1789 //		System.err.println("D:"+pos+", "+bases.length+", "+count2);
1790 //		System.err.println("E:"+new String(bases, pos-4, 9));
1791 		assert(count1+count2<=8) : count1+", "+count2;
1792 
1793 		return count1+count2+(count1>0 && count2>0 ? 1 : 0);
1794 	}
1795 
homopolymerCountLeft(final byte[] bases, final int pos, final byte base)1796 	public static int homopolymerCountLeft(final byte[] bases, final int pos, final byte base){
1797 		if(pos<0 || bases[pos]!=base){return 0;}
1798 		if(!AminoAcid.isFullyDefined(base)){return 0;}
1799 
1800 		int count=0;
1801 		for(int i=pos, lim=Tools.max(0, pos-3); i>=lim; i--){
1802 			if(bases[i]==base){count++;}
1803 			else{break;}
1804 		}
1805 		return count;
1806 	}
1807 
homopolymerCountRight(final byte[] bases, final int pos, final byte base)1808 	public static int homopolymerCountRight(final byte[] bases, final int pos, final byte base){
1809 		if(pos<0 || bases[pos]!=base){return 0;}
1810 		if(!AminoAcid.isFullyDefined(base)){return 0;}
1811 
1812 		int count=0;
1813 		for(int i=pos, lim=Tools.min(bases.length, pos+4); i<lim; i++){
1814 			if(bases[i]==base){count++;}
1815 			else{break;}
1816 		}
1817 		return count;
1818 	}
1819 
biasScore(double properPairRate, int scafEndDist)1820 	public double biasScore(double properPairRate, int scafEndDist){
1821 		double strandBias=strandBiasScore(scafEndDist);
1822 		double readBias=readBiasScore(properPairRate);
1823 		return Math.sqrt(strandBias*readBias);
1824 	}
1825 
strandBiasScore(int scafEndDist)1826 	public double strandBiasScore(int scafEndDist){
1827 		int plus=allelePlusCount();
1828 		int minus=alleleMinusCount();
1829 		final double x=eventProb(plus, minus);
1830 		final double x2=modifyByEndDist(x, scafEndDist);
1831 
1832 		//TODO:
1833 		//This should correct strand bias based on whether the overall read distribution is biased,
1834 		//for homozygous variations when minus coverage is being tracked
1835 
1836 		double result=x2;
1837 		if(plus+minus>=20 && x2<0.9){//This block added based on analyzing NIST GIAB diff
1838 			int min=Tools.min(plus, minus);
1839 			int max=Tools.max(plus, minus);
1840 			if(min>1 && min>0.06f*max){//Seen on both strands; relax stringency
1841 				double y=0.15+(0.2*min)/max; //Higher constants (maximum of 1.0 combined) push the end result closer to 1
1842 				result=y+(1-y)*x2;
1843 			}
1844 		}
1845 //		if(start==15699277){System.err.println(plus+", "+minus+", "+x+", "+x2+", "+result);}
1846 		return result;
1847 	}
1848 
1849 	//This seems to cause trouble with some NIST GIAB Illumina data...  not clear why
readBiasScore(double properPairRate)1850 	public double readBiasScore(double properPairRate){
1851 		if(properPairRate<0.5){return 0.95f;}
1852 		final int r1=r1AlleleCount(), r2=r2AlleleCount();
1853 		final double x=eventProb(r1, r2);
1854 
1855 		//This block added based on analyzing NIST GIAB diff
1856 		final double x2=0.10+0.90*x;
1857 		double result=x2;
1858 		if(r1+r2>=20 && x2<0.9){
1859 			int min=Tools.min(r1, r2);
1860 			int max=Tools.max(r1, r2);
1861 			if(min>1 && min>0.07f*max){//Seen on both reads; relax stringency
1862 				double y=0.15+(0.2*min)/max; //Higher constants (maximum of 1.0 combined) push the end result closer to 1
1863 				result=y+(1-y)*x2;
1864 			}
1865 		}
1866 		return result;
1867 	}
1868 
1869 	/** Adjusted probability of a binomial event being at least this lopsided. */
eventProb(int a, int b)1870 	public static double eventProb(int a, int b){
1871 
1872 		double allowedBias=0.75;
1873 		double slopMult=0.95;
1874 
1875 		double n=a+b;
1876 		double k=Tools.min(a, b);
1877 
1878 		double slop=n*(allowedBias*0.5);
1879 		double dif=n-k*2;
1880 		dif=dif-(Tools.min(slop, dif)*slopMult);
1881 		n=k*2+dif;
1882 //		k=n*0.5-dif;
1883 		assert(k<=n*0.5) : a+", "+b+", "+n+", "+k+", "+slop+", "+dif;
1884 
1885 		if(n>PROBLEN){
1886 			double mult=PROBLEN/n;
1887 			n=PROBLEN;
1888 			k=(int)(k*mult);
1889 		}
1890 
1891 		int n2=(int)Math.round(n);
1892 		int k2=Tools.min(n2/2, (int)(k+1));
1893 
1894 
1895 //		if(a+b>3){
1896 //			System.err.println(n+", "+k+", "+n2+", "+k2);
1897 //		}
1898 
1899 		double result=prob[n2][k2];
1900 		if(result<1 || a==b || a+1==b || a==b+1){return result;}
1901 
1902 		double slope=Tools.min(a, b)/(double)Tools.max(a, b);
1903 		return (0.998+slope*0.002);
1904 	}
1905 
1906 	/*--------------------------------------------------------------*/
1907 	/*----------------           Getters            ----------------*/
1908 	/*--------------------------------------------------------------*/
1909 
allelePlusCount()1910 	public int allelePlusCount(){return r1plus+r2plus;}
alleleMinusCount()1911 	public int alleleMinusCount(){return r1minus+r2minus;}
r1AlleleCount()1912 	public int r1AlleleCount(){return r1plus+r1minus;}
r2AlleleCount()1913 	public int r2AlleleCount(){return r2plus+r2minus;}
alleleCount()1914 	public int alleleCount(){return r1plus+r1minus+r2plus+r2minus;}
1915 
alleleFraction()1916 	public double alleleFraction(){
1917 		int count=alleleCount();
1918 		int cov=Tools.max(count, coverage, 1);
1919 		return count/(double)cov;
1920 	}
1921 
strandRatio()1922 	public double strandRatio(){
1923 		int plus=allelePlusCount();
1924 		int minus=alleleMinusCount();
1925 		if(plus==minus){return 1;}
1926 		return (Tools.min(plus,  minus)+1)/(double)Tools.max(plus, minus);
1927 	}
baseQAvg()1928 	public double baseQAvg(){return baseQSum/(double)alleleCount();}
mapQAvg()1929 	public double mapQAvg(){return mapQSum/(double)alleleCount();}
edistAvg()1930 	public double edistAvg(){return endDistSum/(double)alleleCount();}
identityAvg()1931 	public double identityAvg(){return idSum/(double)alleleCount();}
lengthAvg()1932 	public double lengthAvg(){return lengthSum/(double)alleleCount();}
properPairRate()1933 	public double properPairRate(){return properPairCount/(double)alleleCount();}
1934 
1935 
setCoverage(int coverage_, int minusCoverage_)1936 	public void setCoverage(int coverage_, int minusCoverage_){
1937 		coverage=coverage_;
1938 		minusCoverage=minusCoverage_;
1939 	}
1940 
coverage()1941 	public int coverage(){
1942 		assert(coverage>-1) : coverage+", "+this;
1943 		return coverage;
1944 	}
1945 
hasCoverage()1946 	public boolean hasCoverage(){
1947 		return coverage>-1;
1948 	}
1949 
contigEndDist(ScafMap map)1950 	public int contigEndDist(ScafMap map){
1951 		Scaffold scaf=map.getScaffold(scafnum);
1952 		int len=scaf.length;
1953 		byte[] bases=scaf.bases;
1954 
1955 		int scafEndDist=Tools.max(0, Tools.min(start, len-stop));
1956 		if(bases==null || nScan<1){return scafEndDist;}
1957 		int limit=Tools.min(nScan, scafEndDist);
1958 		int contigEndDist=leftContigEndDist(bases, limit);
1959 		limit=Tools.min(limit, contigEndDist);
1960 		contigEndDist=rightContigEndDist(bases, limit);
1961 		return Tools.min(scafEndDist, contigEndDist);
1962 	}
1963 
leftContigEndDist(byte[] bases, int maxDist)1964 	public int leftContigEndDist(byte[] bases, int maxDist){
1965 		if(start>=bases.length){return Tools.min(bases.length, maxDist+1);}
1966 		int ns=0;
1967 		for(int i=start, lim=Tools.max(0, start-maxDist); i>=lim; i--){
1968 			if(AminoAcid.isFullyDefined(bases[i])){
1969 				ns=0;
1970 			}else{
1971 				ns++;
1972 				if(ns>=10){
1973 					int x=start-i-ns+1;
1974 					assert(x>=0);
1975 					return x;
1976 				}
1977 			}
1978 		}
1979 		return maxDist+1;
1980 	}
1981 
rightContigEndDist(byte[] bases, int maxDist)1982 	public int rightContigEndDist(byte[] bases, int maxDist){
1983 		if(stop<0){return Tools.min(bases.length, maxDist+1);}
1984 		int ns=0;
1985 		for(int i=stop, lim=Tools.min(bases.length-1, stop+maxDist); i<=lim; i++){
1986 			if(AminoAcid.isFullyDefined(bases[i])){
1987 				ns=0;
1988 			}else{
1989 				ns++;
1990 				if(ns>=10){
1991 					int x=i-stop-ns+1;
1992 					assert(x>=0);
1993 					return x;
1994 				}
1995 			}
1996 		}
1997 		return maxDist+1;
1998 	}
1999 
scafName()2000 	public String scafName(){
2001 		return scafName(ScafMap.defaultScafMap());
2002 	}
2003 
scafName(ScafMap map)2004 	public String scafName(ScafMap map){
2005 		return map.getScaffold(scafnum).name;
2006 	}
2007 
setForced(boolean b)2008 	public Var setForced(boolean b){forced=b; return this;}
forced()2009 	public boolean forced(){return forced;}
2010 
setFlagged(boolean b)2011 	public Var setFlagged(boolean b){flagged=b; return this;}
flagged()2012 	public boolean flagged(){return flagged;}
2013 
2014 
ins()2015 	public final boolean ins(){return type==INS;}
del()2016 	public final boolean del(){return type==DEL;}
indel()2017 	public final boolean indel(){return type==INS || type==DEL;}
sub()2018 	public final boolean sub(){return type==SUB;}
complex()2019 	public final boolean complex(){return type==COMPLEX;}
frameshift()2020 	public final boolean frameshift(){
2021 		int delta=Tools.absdif(reflen(), readlen());
2022 		return delta%3!=0;
2023 	}
2024 
2025 	/*--------------------------------------------------------------*/
2026 	/*----------------         Final Fields         ----------------*/
2027 	/*--------------------------------------------------------------*/
2028 
2029 	public final int scafnum;
2030 	public final int start;
2031 	public final int stop; //Half-open, so stop is always after start except for insertions
2032 	public final byte[] allele;
2033 	public final int hashcode;
2034 	public final int type;
2035 
2036 	/*--------------------------------------------------------------*/
2037 	/*----------------        Mutable Fields        ----------------*/
2038 	/*--------------------------------------------------------------*/
2039 
2040 	private int coverage=-1;
2041 	private int minusCoverage=-1;
2042 
2043 	int r1plus;
2044 	int r1minus;
2045 	int r2plus;
2046 	int r2minus;
2047 	int properPairCount;
2048 
2049 	long mapQSum;
2050 	public int mapQMax;
2051 
2052 	long baseQSum;
2053 	public int baseQMax;
2054 
2055 	long endDistSum;
2056 	public int endDistMax;
2057 
2058 	long idSum;
2059 	int idMax;
2060 
2061 	long lengthSum;
2062 
2063 	int nearbyVarCount=-1;
2064 
2065 	double revisedAlleleFraction=-1;
2066 	private boolean forced=false;
2067 	private boolean flagged=false;
2068 
2069 	/*--------------------------------------------------------------*/
2070 	/*----------------        Static Fields         ----------------*/
2071 	/*--------------------------------------------------------------*/
2072 
2073 	public static boolean CALL_INS=true;
2074 	public static boolean CALL_DEL=true;
2075 	public static boolean CALL_SUB=true;
2076 	public static boolean CALL_NOCALL=false;
2077 	public static boolean CALL_JUNCTION=false;
2078 
2079 	public static boolean extendedText=true;
2080 
2081 	public static boolean noPassDotGenotype=false;
2082 
2083 	public static boolean useHomopolymer=true;
2084 	public static boolean useIdentity=true;
2085 	public static boolean usePairing=true;
2086 	public static boolean useBias=true;
2087 	public static boolean useEdist=true;
2088 	public static boolean doNscan=true;
2089 	public static double lowCoveragePenalty=0.8;
2090 	public static int nScan=600;
2091 	public static int minEndDistForBias=200;
2092 
2093 	public static int MIN_VAR_COPIES=0;
2094 
2095 	final static int PROBLEN=100;
2096 	public static final boolean UPPER_CASE_ALLELES=true;
2097 	/** Verify that there are no variants with alt equal to ref */
2098 	private static final boolean TEST_REF_VARIANTS=false;
2099 
2100 //	static final String[] typeArray=new String[] {"NOCALL","SUB","DEL","INS"};
2101 //	static final int NOCALL=0, SUB=1, DEL=2, INS=3;
2102 
2103 	private static final byte colon=';';
2104 	private static final byte tab='\t';
2105 	public static final String[] typeArray=new String[] {"INS","NOCALL","SUB","DEL","LJUNCT","RJUNCT","BJUNCT","MULTI","COMPLEX"};
2106 	/** Insertion */
2107 	public static final int INS=0;
2108 	/** No-call (length-neutral, ref N) */
2109 	public static final int NOCALL=1;
2110 	/** Substitution (length-neutral) */
2111 	public static final int SUB=2;
2112 	/** Deletion */
2113 	public static final int DEL=3;
2114 	/** Left-junction (left side clipped, right side normal) */
2115 	public static final int LJUNCT=4;
2116 	/** Right-junction (right side clipped, left side normal) */
2117 	public static final int RJUNCT=5;
2118 	/** Bidirectional junction (both sides clipped) */
2119 	public static final int BJUNCT=6;
2120 	/** Multiallelic (dominates all other types) */
2121 	public static final int MULTI=7;
2122 	/** Left-junction (left-side clipped, right side normal) */
2123 	public static final int COMPLEX=8;
2124 
2125 	/** Number of variant types; same as typeArray.length */
2126 	public static final int VAR_TYPES=COMPLEX+1;
2127 
2128 	/** Initial letter for abbreviating type names */
2129 	static final byte[] typeInitialArray=new byte[128];
2130 
2131 	static final byte[] AL_0=new byte[0];
2132 	static final byte[] AL_A=new byte[] {(byte)'A'};
2133 	static final byte[] AL_C=new byte[] {(byte)'C'};
2134 	static final byte[] AL_G=new byte[] {(byte)'G'};
2135 	static final byte[] AL_T=new byte[] {(byte)'T'};
2136 	static final byte[] AL_N=new byte[] {(byte)'N'};
2137 	static final byte[][] AL_MAP=makeMap();
2138 	static final int[] codes=makeCodes();
2139 
2140 	//For 64-bit hash keys
2141 	private static final int typeBits=2;
2142 	private static final int alleleBits=6;
2143 	private static final int scafBits=16;
2144 	private static final int lenBits=8;
2145 
2146 	private static final int alleleShift=typeBits;
2147 	private static final int scafShift=alleleShift+alleleBits;
2148 	private static final int lenShift=scafShift+scafBits;
2149 	private static final int startShift=lenShift+lenBits;
2150 
2151 
2152 //	public static ScafMap scafMap;
2153 
makeCodes()2154 	static final int[] makeCodes(){
2155 		Random randy=new Random(1);
2156 		int[] array=new int[256];
2157 		for(int i=0; i<array.length; i++){
2158 			array[i]=randy.nextInt();
2159 		}
2160 		return array;
2161 	}
2162 
makeMap()2163 	static final byte[][] makeMap(){
2164 		byte[][] map=new byte[128][];
2165 		map[0]=map['.']=map['\t']=AL_0;
2166 		map['A']=map['a']=AL_A;
2167 		map['C']=map['c']=AL_C;
2168 		map['G']=map['g']=AL_G;
2169 		map['T']=map['t']=AL_T;
2170 		map['N']=map['n']=AL_N;
2171 		for(int i=0; i<map.length; i++){
2172 			if(map[i]==null){map[i]=new byte[(byte)i];}
2173 		}
2174 		return map;
2175 	}
2176 
2177 	/** factorial[n]=n! */
2178 	private static final double[] factorial=makeFactorialArray(PROBLEN+1);
2179 	/** binomial[n][k] = combinations in n pick k (no replacement, order-insensitive) */
2180 	private static final double[][] binomial=makeBinomialMatrix(PROBLEN+1);
2181 	/** prob[n][k] = probability of an event this lopsided or worse. */
2182 	private static final double[][] prob=makeProbMatrix(PROBLEN+1);
2183 
makeFactorialArray(int len)2184 	private static double[] makeFactorialArray(int len) {
2185 		double[] x=new double[len];
2186 		x[0]=1;
2187 		for(int i=1; i<len; i++){
2188 			x[i]=x[i-1]*i;
2189 		}
2190 		return x;
2191 	}
2192 
makeBinomialMatrix(int len)2193 	private static double[][] makeBinomialMatrix(int len) {
2194 		double[][] matrix=new double[len][];
2195 		for(int n=0; n<len; n++){
2196 			final int kmax=n/2;
2197 			final double nf=factorial[n];
2198 			matrix[n]=new double[kmax+1];
2199 			for(int k=0; k<=kmax; k++){
2200 				final double kf=factorial[k];
2201 				final double nmkf=factorial[n-k];
2202 				double combinations=nf/kf;
2203 				combinations=combinations/nmkf;
2204 				matrix[n][k]=combinations;
2205 			}
2206 		}
2207 		return matrix;
2208 	}
2209 
2210 	/** Combinations in n pick k, allowing somewhat higher max values... perhaps? */
bigBinomial(int n, int k)2211 	private static double bigBinomial(int n, int k){
2212 
2213 		double combinations=1;
2214 		for(int a=-1, b=-1, c=-1; a<=n || b<=k || c<=n-k; ) {
2215 			double mult=1;
2216 			if(combinations<1000000000 && a<=n){//Increase combinations
2217 				a++;
2218 				mult=Tools.max(1, a);
2219 			}else if(b<=k){//Decrease combinations
2220 				b++;
2221 				mult=1.0/Tools.max(1, a);
2222 			}else if(c<=n-k){//Decrease combinations
2223 				c++;
2224 				mult=1.0/Tools.max(1, a);
2225 			}else{
2226 				assert(false) : a+", "+b+", "+c+", "+n+", "+k+", "+(n-k)+", "+combinations;
2227 			}
2228 			combinations*=mult;
2229 			assert(combinations<Double.MAX_VALUE) : a+", "+b+", "+c+", "+n+", "+k+", "+(n-k)+", "+combinations;
2230 		}
2231 
2232 		return combinations;
2233 	}
2234 
2235 	private static double[][] makeProbMatrix(int len) {
2236 		double[][] matrix=new double[len][];
2237 		double mult=2;
2238 		for(int n=0; n<len; n++){
2239 			final int kmax=n/2;
2240 			final double[] array=matrix[n]=new double[kmax+1];
2241 			for(int k=0; k<=kmax; k++){
2242 				final double combinations=binomial[n][k];
2243 				array[k]=combinations*mult;
2244 			}
2245 //			if(n<=12){System.err.println(Arrays.toString(array));}
2246 			for(int k=0; k<=kmax; k++){
2247 				array[k]=Tools.min(1, (k==0 ? 0 : array[k-1])+array[k]);
2248 			}
2249 //			if(n<=12){System.err.println(Arrays.toString(array));}
2250 //			assert(array[kmax]==1) : Arrays.toString(array);
2251 			mult*=0.5;
2252 //			if(n<=12){System.err.println();}
2253 		}
2254 		return matrix;
2255 	}
2256 
2257 	static {
2258 		Arrays.fill(typeInitialArray, (byte)-1);
2259 		typeInitialArray['I']=INS;
2260 		typeInitialArray['N']=NOCALL;
2261 		typeInitialArray['S']=SUB;
2262 		typeInitialArray['D']=DEL;
2263 		typeInitialArray['L']=LJUNCT;
2264 		typeInitialArray['R']=RJUNCT;
2265 		typeInitialArray['B']=BJUNCT;
2266 		typeInitialArray['M']=MULTI;
2267 		typeInitialArray['C']=COMPLEX;
2268 	}
2269 
2270 	public static final String varFormat="1.3";
2271 }
2272 
2273