1 package var2; 2 3 import java.io.Serializable; 4 import java.util.ArrayList; 5 import java.util.Arrays; 6 import java.util.Locale; 7 import java.util.Random; 8 9 import align2.QualityTools; 10 import dna.AminoAcid; 11 import fileIO.FileFormat; 12 import shared.KillSwitch; 13 import shared.Parse; 14 import shared.Parser; 15 import shared.Shared; 16 import shared.Timer; 17 import shared.Tools; 18 import stream.Read; 19 import stream.SamLine; 20 import structures.ByteBuilder; 21 22 /** 23 * Tracks data for a variation. 24 * Uses half-open coordinates. 25 * @author Brian Bushnell 26 * @date November 4, 2016 27 * 28 */ 29 public class Var implements Comparable<Var>, Serializable, Cloneable { 30 31 /** 32 * 33 */ 34 private static final long serialVersionUID = 3328626403863586829L; 35 36 main(String[] args)37 public static void main(String[] args){ 38 if(args.length>1){ 39 for(int i=1; i<args.length; i++){ 40 String arg=args[i]; 41 String[] split=arg.split("="); 42 String a=split[0].toLowerCase(); 43 String b=split.length>1 ? split[1] : null; 44 Parser.parseCommonStatic(arg, a, b); 45 } 46 } 47 Timer t=new Timer(); 48 VarMap vmap; 49 FileFormat ff=FileFormat.testInput(args[0], FileFormat.VAR, null, true, true); 50 if(ff.vcf()){ 51 ScafMap smap=ScafMap.loadVcfHeader(ff); 52 vmap=VcfLoader.loadVcfFile(args[0], smap, false, false); 53 }else{ 54 vmap=VcfLoader.loadVarFile(args[0], null); 55 } 56 t.stop("Loaded "+vmap.size()+" variants.\nTime: \t"); 57 } 58 59 /*--------------------------------------------------------------*/ 60 /*---------------- Constructors ----------------*/ 61 /*--------------------------------------------------------------*/ 62 63 @Override clone()64 public Var clone(){ 65 try { 66 return (Var)super.clone(); 67 } catch (CloneNotSupportedException e) { 68 // TODO Auto-generated catch block 69 e.printStackTrace(); 70 throw new RuntimeException(); 71 } 72 } 73 Var(int scafnum_, int start_, int stop_, int allele_, int type_)74 public Var(int scafnum_, int start_, int stop_, int allele_, int type_){ 75 this(scafnum_, start_, stop_, AL_MAP[allele_], type_); 76 } 77 Var(Var v)78 public Var(Var v) { 79 this(v.scafnum, v.start, v.stop, v.allele, v.type); 80 } 81 Var(int scafnum_, int start_, int stop_, byte[] allele_, int type_)82 public Var(int scafnum_, int start_, int stop_, byte[] allele_, int type_){ 83 scafnum=scafnum_; 84 start=start_; 85 stop=stop_; 86 allele=allele_; 87 type=type_; 88 hashcode=hash(); 89 // stamp=stamper.getAndIncrement(); 90 assert(allele.length>1 || allele==AL_0 || 91 allele==AL_A || allele==AL_C || allele==AL_G || allele==AL_T || allele==AL_N) : new String(new String(allele_)+", "+allele_.length); 92 assert(start<=stop) : "\n"+Var.toBasicHeader()+"\n"+this+"\n"; 93 // if(type()==SUB && allele.length==1){ //TODO: 123 - mainly for testing 94 // final byte call=allele[0]; 95 // final Scaffold scaf=ScafMap.defaultScafMap.getScaffold(scafnum); 96 // final byte ref=scaf.bases[start]; 97 //// System.err.println((char)call+"="+(char)ref+" at scaf "+scafnum+" pos "+start); 98 // assert(ref!=call) : (char)call+"="+(char)ref+" at scaf "+scafnum+" pos "+start; 99 // } 100 // assert(false) : this.alleleCount(); 101 } 102 Var(final byte[] line, final byte delimiter)103 public Var(final byte[] line, final byte delimiter){ 104 int a=0, b=0; 105 106 while(b<line.length && line[b]!=delimiter){b++;} 107 assert(b>a) : "Missing field 0: "+new String(line); 108 scafnum=Parse.parseInt(line, a, b); 109 b++; 110 a=b; 111 112 while(b<line.length && line[b]!=delimiter){b++;} 113 assert(b>a) : "Missing field 1: "+new String(line); 114 start=Parse.parseInt(line, a, b); 115 b++; 116 a=b; 117 118 while(b<line.length && line[b]!=delimiter){b++;} 119 assert(b>a) : "Missing field 2: "+new String(line); 120 stop=Parse.parseInt(line, a, b); 121 b++; 122 a=b; 123 124 while(b<line.length && line[b]!=delimiter){b++;} 125 assert(b>a) : "Missing field 3: "+new String(line); 126 type=typeInitialArray[line[a]]; 127 b++; 128 a=b; 129 130 while(b<line.length && line[b]!=delimiter){b++;} 131 assert(b>=a) : "Missing field 4: "+new String(line); 132 if(b==a){allele=AL_0;} 133 else if(b==a+1){allele=AL_MAP[line[a]];} 134 else{allele=Arrays.copyOfRange(line, a, b);} 135 b++; 136 a=b; 137 138 while(b<line.length && line[b]!=delimiter){b++;} 139 assert(b>a) : "Missing field 5: "+new String(line); 140 r1plus=Parse.parseInt(line, a, b); 141 b++; 142 a=b; 143 144 while(b<line.length && line[b]!=delimiter){b++;} 145 assert(b>a) : "Missing field 6: "+new String(line); 146 r1minus=Parse.parseInt(line, a, b); 147 b++; 148 a=b; 149 150 while(b<line.length && line[b]!=delimiter){b++;} 151 assert(b>a) : "Missing field 7: "+new String(line); 152 r2plus=Parse.parseInt(line, a, b); 153 b++; 154 a=b; 155 156 while(b<line.length && line[b]!=delimiter){b++;} 157 assert(b>a) : "Missing field 8: "+new String(line); 158 r2minus=Parse.parseInt(line, a, b); 159 b++; 160 a=b; 161 162 while(b<line.length && line[b]!=delimiter){b++;} 163 assert(b>a) : "Missing field 9: "+new String(line); 164 properPairCount=Parse.parseInt(line, a, b); 165 b++; 166 a=b; 167 168 while(b<line.length && line[b]!=delimiter){b++;} 169 assert(b>a) : "Missing field 10: "+new String(line); 170 lengthSum=Parse.parseLong(line, a, b); 171 b++; 172 a=b; 173 174 while(b<line.length && line[b]!=delimiter){b++;} 175 assert(b>a) : "Missing field 11: "+new String(line); 176 mapQSum=Parse.parseLong(line, a, b); 177 b++; 178 a=b; 179 180 while(b<line.length && line[b]!=delimiter){b++;} 181 assert(b>a) : "Missing field 12: "+new String(line); 182 mapQMax=Parse.parseInt(line, a, b); 183 b++; 184 a=b; 185 186 while(b<line.length && line[b]!=delimiter){b++;} 187 assert(b>a) : "Missing field 13: "+new String(line); 188 baseQSum=Parse.parseLong(line, a, b); 189 b++; 190 a=b; 191 192 while(b<line.length && line[b]!=delimiter){b++;} 193 assert(b>a) : "Missing field 14: "+new String(line); 194 baseQMax=Parse.parseInt(line, a, b); 195 b++; 196 a=b; 197 198 while(b<line.length && line[b]!=delimiter){b++;} 199 assert(b>a) : "Missing field 15: "+new String(line); 200 endDistSum=Parse.parseLong(line, a, b); 201 b++; 202 a=b; 203 204 while(b<line.length && line[b]!=delimiter){b++;} 205 assert(b>a) : "Missing field 16: "+new String(line); 206 endDistMax=Parse.parseInt(line, a, b); 207 b++; 208 a=b; 209 210 while(b<line.length && line[b]!=delimiter){b++;} 211 assert(b>a) : "Missing field 17: "+new String(line); 212 idSum=Parse.parseLong(line, a, b); 213 b++; 214 a=b; 215 216 while(b<line.length && line[b]!=delimiter){b++;} 217 assert(b>a) : "Missing field 18: "+new String(line); 218 idMax=Parse.parseInt(line, a, b); 219 b++; 220 a=b; 221 222 while(b<line.length && line[b]!=delimiter){b++;} 223 assert(b>a) : "Missing field 19: "+new String(line); 224 coverage=Parse.parseInt(line, a, b); 225 b++; 226 a=b; 227 228 while(b<line.length && line[b]!=delimiter){b++;} 229 assert(b>a) : "Missing field 20: "+new String(line); 230 minusCoverage=Parse.parseInt(line, a, b); 231 b++; 232 a=b; 233 234 while(b<line.length && line[b]!=delimiter){b++;} 235 assert(b>a) : "Missing field 21: "+new String(line); 236 nearbyVarCount=Parse.parseInt(line, a, b); 237 b++; 238 a=b; 239 240 while(b<line.length && line[b]!=delimiter){b++;} 241 assert(b>a) : "Missing field 22: "+new String(line); 242 flagged=Parse.parseInt(line, a, b)>0; 243 b++; 244 a=b; 245 246 while(b<line.length && line[b]!=delimiter){b++;} 247 // assert(b>a) : "Missing field 21: "+new String(line); 248 // int contigEndDist=Parse.parseInt(line, a, b); //Unused 249 b++; 250 a=b; 251 252 while(b<line.length && line[b]!=delimiter){b++;} 253 if(b>a){ 254 // phredScore=Parse.parseFloat(line, a, b); 255 b++; 256 a=b; 257 } 258 259 hashcode=hash(); 260 assert(allele.length>1 || allele==AL_0 || 261 allele==AL_A || allele==AL_C || allele==AL_G || allele==AL_T || allele==AL_N); 262 assert(start<=stop) : this.toString(); 263 assert(type>=LJUNCT || type==type_old()) : type+", "+type_old()+", "+this.toString(); 264 } 265 266 //#CHROM POS ID REF ALT QUAL fromVCF(byte[] line, ScafMap scafMap, boolean parseCoverage, boolean parseExtended)267 public static Var fromVCF(byte[] line, ScafMap scafMap, boolean parseCoverage, boolean parseExtended) { 268 int a=0, b=0; 269 270 //CHROM 271 while(b<line.length && line[b]!='\t'){b++;} 272 assert(b>a) : "Missing field 0: "+new String(line); 273 String scaf=new String(line, a, b-a); 274 b++; 275 a=b; 276 277 //POS 278 while(b<line.length && line[b]!='\t'){b++;} 279 assert(b>a) : "Missing field 1: "+new String(line); 280 int pos=Parse.parseInt(line, a, b); 281 b++; 282 a=b; 283 284 //ID 285 while(b<line.length && line[b]!='\t'){b++;} 286 assert(b>a) : "Missing field 2: "+new String(line); 287 //String id=new String(line, a, b-a); 288 b++; 289 a=b; 290 291 //REF 292 while(b<line.length && line[b]!='\t'){b++;} 293 assert(b>a) : "Missing field 3: "+new String(line); 294 //byte[] ref=Arrays.copyOf(line, a, b); 295 int reflen=line[a]=='.' ? 0 : b-a; 296 b++; 297 a=b; 298 299 //ALT 300 while(b<line.length && line[b]!='\t'){b++;} 301 assert(b>a) : "Missing field 4: "+new String(line); 302 byte[] alt; 303 if(b<=a+1){alt=AL_MAP[line[a]];} 304 else{alt=Arrays.copyOfRange(line, a, b);} 305 b++; 306 a=b; 307 308 //QUAL, FILTER, INFO 309 310 int infoStart=b; 311 312 // while(b<line.length && line[b]!='\t'){b++;} 313 // assert(b>a) : "Missing field 5: "+new String(line); 314 // int qual=Parse.parseInt(line, a, b); 315 // b++; 316 // a=b; 317 318 final int start; 319 if(alt.length!=reflen && alt.length>0){//For indels, which have an extra base 320 alt=Arrays.copyOfRange(alt, 1, alt.length); 321 start=pos; 322 if(alt.length==0){alt=AL_0;} 323 else if(alt.length==1 && AL_MAP[alt[0]]!=null){alt=AL_MAP[alt[0]];} 324 if(reflen==1){reflen--;}//insertion 325 }else{ 326 start=pos-1; 327 } 328 final int readlen=(alt==null || alt.length==0 || alt[0]=='.' ? 0 : alt.length); 329 final int stop=start+reflen; 330 assert(scaf!=null); 331 assert(scafMap!=null); 332 final int scafNum=scafMap.getNumber(scaf); 333 assert(scafNum>=0) : scaf+"\n"+scafMap.keySet()+"\n"+scafMap.altKeySet()+"\n"; 334 // final Scaffold scaffold=scafMap.getScaffold(scafNum); 335 336 int type=-1; 337 if(parseExtended){ 338 type=Tools.max(type, parseVcfIntDelimited(line, "TYP=", infoStart)); 339 if(type<0){type=typeStartStop(start, stop, alt);} 340 }else{ 341 type=typeStartStop(start, stop, alt); 342 } 343 Var v=new Var(scafNum, start, stop, alt, type); 344 // System.err.println(new String(line)+"\n"+v.toString()+"\nstart="+start+", stop="+stop+", type="+type+"\n"); 345 // assert(false); 346 347 if(parseCoverage){ 348 infoStart=Tools.indexOfDelimited(line, "R1P=", infoStart, (byte)';'); 349 350 //R1P=2;R1M=0;R2P=0;R2M=0; 351 int r1p=Tools.max(0, parseVcfIntDelimited(line, "R1P=", infoStart)); 352 int r1m=Tools.max(0, parseVcfIntDelimited(line, "R1M=", infoStart)); 353 int r2p=Tools.max(0, parseVcfIntDelimited(line, "R2P=", infoStart)); 354 int r2m=Tools.max(0, parseVcfIntDelimited(line, "R2M=", infoStart)); 355 356 //AD=2;DP=24;MCOV=0;PPC=0; 357 int cov=parseVcfIntDelimited(line, "DP=", infoStart); 358 assert(cov>0) : new String(line, infoStart, line.length-infoStart); 359 int mcov=parseVcfIntDelimited(line, "MCOV=", infoStart); 360 361 v.coverage=cov; 362 v.minusCoverage=mcov; 363 v.r1plus=r1p; 364 v.r1minus=r1m; 365 v.r2plus=r2p; 366 v.r2minus=r2m; 367 } 368 369 if(parseExtended){ 370 infoStart=Tools.indexOfDelimited(line, "PPC=", infoStart, (byte)';'); 371 372 //Some extended fields 373 //AD=2;DP=24;MCOV=0;PPC=0; 374 int pc=Tools.max(0, parseVcfIntDelimited(line, "PPC=", infoStart)); 375 376 //AF=0.0833;RAF=0.0833;LS=280; 377 double raf=parseVcfDoubleDelimited(line, "RAF=", infoStart); 378 long ls=Tools.max(0, parseVcfLongDelimited(line, "LS=", infoStart)); 379 380 //MQS=86;MQM=43;BQS=64;BQM=32; 381 long mqs=Tools.max(0, parseVcfLongDelimited(line, "MQS=", infoStart)); 382 int mqm=Tools.max(0, parseVcfIntDelimited(line, "MQM=", infoStart)); 383 long bqs=Tools.max(0, parseVcfLongDelimited(line, "BQS=", infoStart)); 384 int bqm=Tools.max(0, parseVcfIntDelimited(line, "BQM=", infoStart)); 385 386 //EDS=18;EDM=9;IDS=1984;IDM=992; 387 long eds=Tools.max(0, parseVcfLongDelimited(line, "EDS=", infoStart)); 388 int edm=Tools.max(0, parseVcfIntDelimited(line, "EDM=", infoStart)); 389 long ids=Tools.max(0, parseVcfLongDelimited(line, "IDS=", infoStart)); 390 int idm=Tools.max(0, parseVcfIntDelimited(line, "IDM=", infoStart)); 391 392 //NVC=0;FLG=0;CED=601;HMP=0;SB=0.1809;DP4=37,28,15,97 393 int nvc=Tools.max(0, parseVcfIntDelimited(line, "NVC=", infoStart)); 394 int flg=Tools.max(0, parseVcfIntDelimited(line, "FLG=", infoStart)); 395 // int ced=Tools.max(0, parseVcfIntDelimited(line, "CED=", infoStart)); 396 397 v.properPairCount=pc; 398 v.revisedAlleleFraction=raf; 399 v.lengthSum=ls; 400 v.mapQSum=mqs; 401 v.mapQMax=mqm; 402 v.baseQSum=bqs; 403 v.baseQMax=bqm; 404 v.endDistSum=eds; 405 v.endDistMax=edm; 406 v.idSum=ids; 407 v.idMax=idm; 408 v.nearbyVarCount=nvc; 409 v.flagged=(flg>0); 410 } 411 412 return v; 413 } 414 415 //Parses INFO field parseVcfIntDelimited(byte[] line, String query, int start)416 private static int parseVcfIntDelimited(byte[] line, String query, int start){ 417 return (int)Tools.min(Integer.MAX_VALUE, parseVcfLongDelimited(line, query, start)); 418 } 419 420 //Parses INFO field parseVcfLongDelimited(byte[] line, String query, final int start)421 private static long parseVcfLongDelimited(byte[] line, String query, final int start){ 422 int loc=Tools.indexOfDelimited(line, query, start, (byte)';'); 423 // System.err.println(loc+", "+(line.length)+", "+(line.length-loc)); 424 // System.err.println(loc+", "+new String(line, loc, line.length-loc)); 425 // if(true){KillSwitch.kill();} 426 if(loc<0){return -1;} 427 long current=0; 428 long mult=1; 429 if(loc>0){ 430 if(line[loc+query.length()]=='-'){mult=-1; loc++;} 431 for(int i=loc+query.length(); i<line.length; i++){ 432 final byte x=line[i]; 433 if(Tools.isDigit(x)){ 434 current=current*10+(x-'0'); 435 }else{ 436 assert(x==tab || x==colon) : x+", "+query+", "+new String(line, loc, i-loc+1); 437 break; 438 } 439 } 440 } 441 return mult*current; 442 } 443 444 //Parses INFO field parseVcfDoubleDelimited(byte[] line, String query, int start)445 private static double parseVcfDoubleDelimited(byte[] line, String query, int start){ 446 int loc=Tools.indexOfDelimited(line, query, start, (byte)';'); 447 if(loc<0){return -1;} 448 if(loc>0){ 449 loc=loc+query.length(); 450 int loc2=loc+1; 451 while(loc2<line.length){ 452 byte b=line[loc2]; 453 if(b==tab || b==colon){break;} 454 loc2++; 455 } 456 return Parse.parseDouble(line, loc, loc2); 457 } 458 return -1; 459 } 460 461 /*--------------------------------------------------------------*/ 462 /*---------------- Mutators ----------------*/ 463 /*--------------------------------------------------------------*/ 464 clear()465 public Var clear() { 466 coverage=-1; 467 minusCoverage=-1; 468 469 r1plus=0; 470 r1minus=0; 471 r2plus=0; 472 r2minus=0; 473 properPairCount=0; 474 475 mapQSum=0; 476 mapQMax=0; 477 478 baseQSum=0; 479 baseQMax=0; 480 481 endDistSum=0; 482 endDistMax=0; 483 484 idSum=0; 485 idMax=0; 486 487 lengthSum=0; 488 489 revisedAlleleFraction=-1; 490 nearbyVarCount=-1; 491 forced=false; 492 flagged=false; 493 return this; 494 } 495 496 //Only called by MergeSamples from VCFLine arrays. addCoverage(Var b)497 public void addCoverage(Var b){ 498 assert(this.equals(b)); 499 coverage+=b.coverage; 500 minusCoverage+=b.minusCoverage; 501 } 502 503 //Called in various places such as when processing each read add(Var b)504 public void add(Var b){ 505 final int oldReads=alleleCount(); 506 507 // assert(oldReads>0) : this; 508 assert(oldReads==0 || baseQSum/oldReads<=60) : this; 509 510 assert(this.equals(b)); 511 r1plus+=b.r1plus; 512 r1minus+=b.r1minus; 513 r2plus+=b.r2plus; 514 r2minus+=b.r2minus; 515 properPairCount+=b.properPairCount; 516 lengthSum+=b.lengthSum; 517 518 mapQSum+=b.mapQSum; 519 mapQMax=Tools.max(mapQMax, b.mapQMax); 520 baseQSum+=b.baseQSum; 521 baseQMax=Tools.max(baseQMax, b.baseQMax); 522 523 endDistSum+=b.endDistSum; 524 endDistMax=Tools.max(endDistMax, b.endDistMax); 525 526 idSum+=b.idSum; 527 idMax=Tools.max(idMax, b.idMax); 528 529 // assert(count()>0 && count()>oldReads) : "\n"+this+"\n"+b; 530 assert(alleleCount()>=oldReads) : "\n"+this+"\n"+b; 531 assert(alleleCount()==oldReads+b.alleleCount()) : "\n"+this+"\n"+b; 532 assert(alleleCount()==0 || baseQSum/alleleCount()<=60) : "\n"+this+"\n"+b; 533 } 534 add(Read r)535 public void add(Read r){ 536 final SamLine sl=r.samline; 537 final int bstart=calcBstart(r, sl); 538 final int bstop=calcBstop(bstart, r); 539 add(r, bstart, bstop); 540 } 541 add(Read r, final int bstart, final int bstop)542 public void add(Read r, final int bstart, final int bstop){ 543 544 final int oldReads=alleleCount(); 545 final SamLine sl=r.samline; 546 547 if(sl.strand()==0){ 548 if(sl.pairnum()==0){ 549 r1plus++; 550 }else{ 551 r2plus++; 552 } 553 }else{ 554 if(sl.pairnum()==0){ 555 r1minus++; 556 }else{ 557 r2minus++; 558 } 559 } 560 561 lengthSum+=r.length(); 562 properPairCount+=(sl.properPair() ? 1 : 0); 563 mapQSum+=sl.mapq; 564 mapQMax=Tools.max(mapQMax, sl.mapq); 565 566 int baseQ=calcBaseQ(bstart, bstop, r, sl); 567 baseQSum+=baseQ; 568 baseQMax=Tools.max(baseQMax, baseQ); 569 570 int endDist=calcEndDist(bstart, bstop, r); 571 endDistSum+=endDist; 572 endDistMax=Tools.max(endDistMax, endDist); 573 574 int id=(int)(1000*Read.identitySkewed(r.match, false, false, false, true)); 575 idSum+=id; 576 idMax=Tools.max(idMax, id); 577 578 assert(alleleCount()>0) : this; 579 assert(alleleCount()==oldReads+1) : this; 580 assert(baseQSum/alleleCount()<=60) : this; 581 } 582 583 /*--------------------------------------------------------------*/ 584 /*---------------- Helper Methods ----------------*/ 585 /*--------------------------------------------------------------*/ 586 toVars(Read r, SamLine sl, boolean callNs, ScafMap scafMap)587 public static ArrayList<Var> toVars(Read r, SamLine sl, boolean callNs, ScafMap scafMap){ 588 // if(!r.containsVariants()){return null;} 589 final int scafnum=scafMap.getNumber(sl.rnameS()); 590 return toVars(r, sl, callNs, scafnum); 591 } 592 593 /** 594 * @TODO This crashes on indels in the last position in the match string. 595 * @param r 596 * @param sl 597 * @param callNs 598 * @param scafnum 599 * @return A list of variants 600 */ toVars(Read r, SamLine sl, boolean callNs, final int scafnum)601 public static ArrayList<Var> toVars(Read r, SamLine sl, boolean callNs, final int scafnum){ 602 final boolean hasV=r.containsVariants(); 603 final boolean callSID=(CALL_DEL || CALL_INS || CALL_SUB) && hasV; 604 final boolean callJ=(CALL_JUNCTION) && (hasV || r.containsClipping()); 605 if(!callSID && !callJ){return null;} 606 607 r.toLongMatchString(false); 608 if(sl.strand()==1 && !r.swapped()){ 609 r.reverseComplement(); 610 r.setSwapped(true); 611 } 612 613 ArrayList<Var> sidList=null, jList=null; 614 if(callSID){sidList=toSubsAndIndels(r, sl, callNs, scafnum);} 615 if(callJ){jList=toJunctions(r, sl, scafnum, hasV, 8);} 616 617 if(sidList!=null){ 618 if(jList!=null){sidList.addAll(jList);} 619 return sidList; 620 }else{ 621 return jList; 622 } 623 624 625 //Note: Did not un-rcomp the read 626 } 627 628 /** 629 * @TODO This crashes on indels in the last position in the match string. 630 * @param r 631 * @param sl 632 * @param callNs 633 * @param scafnum 634 * @return A list of variants 635 */ toSubsAndIndels(Read r, SamLine sl, boolean callNs, final int scafnum)636 private static ArrayList<Var> toSubsAndIndels(Read r, SamLine sl, boolean callNs, final int scafnum){ 637 final byte[] match=r.match; 638 final byte[] bases=r.bases; 639 final int rpos0=sl.pos-1; 640 ArrayList<Var> list=new ArrayList<Var>(); 641 642 int bstart=-1, bstop=-1; 643 int rstart=-1, rstop=-1; 644 int mode=-1; 645 646 int mpos=0, bpos=0, rpos=rpos0; 647 for(; mpos<match.length; mpos++){ 648 byte m=match[mpos]; 649 650 if(m!=mode){ 651 if(mode=='D'){ 652 bstop=bpos; 653 rstop=rpos; 654 // assert(false) : (char)m+", "+(char)mode+", "+rstart+", "+bstart; 655 if(CALL_DEL){ 656 Var v=new Var(scafnum, rstart, rstop, 0, DEL); 657 v.add(r, bstart, bstop); 658 list.add(v); 659 } 660 bstart=bstop=rstart=rstop=-1; 661 }else if(mode=='I'){ 662 bstop=bpos; 663 rstop=rpos; 664 int blen=bstop-bstart; 665 if(CALL_INS){ 666 Var v; 667 if(blen==1){ 668 v=new Var(scafnum, rstart, rstop, bases[bstart], INS); 669 }else{ 670 v=new Var(scafnum, rstart, rstop, Arrays.copyOfRange(bases, bstart, bstop), INS); 671 } 672 v.add(r, bstart, bstop); 673 list.add(v); 674 } 675 bstart=bstop=rstart=rstop=-1; 676 } 677 } 678 679 if(m=='C'){ 680 bpos++; 681 }else if(m=='m' || m=='S' || m=='N'){ 682 if(m=='S' || (m=='N' && callNs)){ 683 // System.err.println(sl.toString()+"\n"+ScafMap.defaultScafMap.getScaffold(scafnum).getSequence(sl)); //123 684 // System.err.println(ScafMap.defaultScafMap.getScaffold(scafnum)); 685 if(CALL_SUB){ 686 Var v=new Var(scafnum, rpos, rpos+1, bases[bpos], SUB); 687 // System.err.println(v.toString()); 688 v.add(r, bpos, bpos+1); 689 list.add(v); 690 691 if(TEST_REF_VARIANTS && v.type()==SUB && v.allele.length==1){ //TODO: 123 - mainly for testing 692 final byte call=v.allele[0]; 693 final Scaffold scaf=ScafMap.defaultScafMap().getScaffold(scafnum); 694 final byte ref=scaf.bases[v.start]; 695 // System.err.println((char)call+"="+(char)ref+" at scaf "+scafnum+" pos "+start); 696 assert(ref!=call) : (char)call+"="+(char)ref+" at scaf "+scafnum+" pos "+v.start+"\n" 697 +sl+"\n"+ScafMap.defaultScafMap().getScaffold(scafnum).getSequence(sl)+"\n"; 698 } 699 } 700 701 } 702 bpos++; 703 rpos++; 704 }else if(m=='D'){ 705 if(mode!=m){ 706 rstart=rpos; 707 bstart=bpos; 708 } 709 rpos++; 710 }else if(m=='I'){ 711 if(mode!=m){ 712 rstart=rpos; 713 bstart=bpos; 714 } 715 bpos++; 716 }else{ 717 assert(false) : "Unhandled symbol "+(char)m; 718 } 719 mode=m; 720 } 721 722 if(mode=='D'){ 723 bstop=bpos; 724 rstop=rpos; 725 if(CALL_DEL){ 726 Var v=new Var(scafnum, rstart, rstop, 0, DEL); 727 v.add(r, bstart, bstop); 728 list.add(v); 729 } 730 bstart=bstop=rstart=rstop=-1; 731 }else if(mode=='I'){ 732 bstop=bpos; 733 rstop=rpos-1; 734 int blen=bstop-bstart; 735 if(CALL_INS){ 736 Var v; 737 assert(rstart<=rstop) : "\n"+rstart+", "+rstop+", "+rpos+ 738 "\n"+bstart+", "+bstop+", "+bpos+ 739 "\n"+r+"\n"+sl; 740 if(blen==1){ 741 v=new Var(scafnum, rstart, rstop, bases[bstart], INS); 742 }else{ 743 v=new Var(scafnum, rstart, rstop, Arrays.copyOfRange(bases, bstart, bstop), INS); 744 } 745 v.add(r, bstart, bstop); 746 list.add(v); 747 } 748 bstart=bstop=rstart=rstop=-1; 749 } 750 751 return list; 752 } 753 754 /** Short or long match */ countLeftClip(byte[] match)755 private static int countLeftClip(byte[] match){ 756 757 byte mode=match[0]; 758 if(mode!='C'){return 0;} 759 int current=0; 760 boolean hasDigit=false; 761 762 for(int mpos=0; mpos<match.length; mpos++){ 763 byte c=match[mpos]; 764 765 if(mode==c){ 766 current++; 767 }else if(Tools.isDigit(c)){ 768 current=(hasDigit ? current : 0)*10+c-'0'; 769 hasDigit=true; 770 }else{ 771 break; 772 } 773 } 774 return current; 775 } 776 777 /** Short or long match */ countRightClip(byte[] match)778 private static int countRightClip(byte[] match){ 779 int mpos=match.length-1; 780 boolean hasDigit=false; 781 while(mpos>=0 && Tools.isDigit(match[mpos])){ 782 hasDigit=true; 783 mpos--; 784 } 785 if(!hasDigit){return countRightClipLong(match);}//Necessary line 786 787 byte mode=match[mpos]; 788 if(mode!='C'){return 0;} 789 int current=0; 790 791 for(; mpos<match.length; mpos++){ 792 byte c=match[mpos]; 793 794 if(mode==c){ 795 current++; 796 }else if(Tools.isDigit(c)){ 797 current=(hasDigit ? current : 0)*10+c-'0'; 798 hasDigit=true; 799 }else{ 800 break; 801 } 802 } 803 return current; 804 } 805 806 /** Long-match */ countLeftClipLong(byte[] longmatch)807 private static int countLeftClipLong(byte[] longmatch){ 808 for(int i=0; i<longmatch.length; i++){ 809 if(longmatch[i]!='C'){return i;} 810 } 811 return longmatch.length; 812 } 813 814 /** Long-match */ countRightClipLong(byte[] longmatch)815 private static int countRightClipLong(byte[] longmatch){ 816 for(int i=0, j=longmatch.length-1; i<longmatch.length; i++, j--){ 817 if(longmatch[j]!='C'){return i;} 818 } 819 return longmatch.length; 820 } 821 toJunctions(Read r, SamLine sl, final int scafnum, boolean containsVars, final int minClip)822 private static ArrayList<Var> toJunctions(Read r, SamLine sl, final int scafnum, 823 boolean containsVars, final int minClip){ 824 final byte[] match0=r.match, match; 825 final byte[] bases=r.bases; 826 // final int rpos0=sl.pos-1; 827 final int start, stop; 828 int leftClip=countLeftClip(match0), rightClip=countRightClip(match0); 829 if(leftClip==0 && rightClip==0){//try soft-clipping 830 int[] rvec=KillSwitch.allocInt1D(2); 831 byte[] smatch=SoftClipper.softClipMatch(match0, minClip, false, r.start, r.stop, rvec); 832 if(smatch==null){ 833 return null; 834 }else{ 835 start=rvec[0]; 836 stop=rvec[1]; 837 match=smatch; 838 leftClip=countLeftClip(match); 839 rightClip=countRightClip(match); 840 } 841 }else{ 842 if(leftClip<minClip && rightClip<minClip){return null;} 843 start=r.start; 844 stop=r.stop; 845 match=match0; 846 } 847 848 ArrayList<Var> list=new ArrayList<Var>(); 849 if(leftClip>=minClip){//LJUNCT 850 int bpos=leftClip-1; 851 byte jcall=bases[bpos]; 852 int jpos=start+leftClip; 853 Var v=new Var(scafnum, jpos, jpos+1, jcall, LJUNCT); 854 v.add(r, bpos, bpos+1); 855 list.add(v); 856 } 857 if(rightClip>=minClip){//RJUNCT 858 int bpos=bases.length-rightClip; 859 byte jcall=bases[bpos]; 860 int jpos=stop-rightClip+1; 861 Var v=new Var(scafnum, jpos, jpos+1, jcall, RJUNCT); 862 v.add(r, bpos, bpos+1); 863 list.add(v); 864 } 865 return list; 866 } 867 calcBstart(Read r, SamLine sl)868 public int calcBstart(Read r, SamLine sl){ 869 r.toLongMatchString(false); 870 byte[] match=r.match; 871 final int rstart=sl.pos-1; 872 final int type=type(); 873 874 int bstart=-1; 875 876 for(int mpos=0, rpos=rstart, bpos=0; mpos<match.length; mpos++){ 877 byte m=match[mpos]; 878 if(m=='C'){ 879 bpos++; 880 }else if(m=='m' || m=='S' || m=='N'){ 881 if(rpos==rstart){ 882 assert(type==SUB || type==NOCALL) : type+", "+bpos+", "+rpos+"\n"+new String(match); 883 bstart=bpos; 884 break; 885 } 886 bpos++; 887 rpos++; 888 }else if(m=='D'){ 889 if(rpos==rstart){ 890 assert(type==DEL) : type+", "+rpos+"\n"+new String(match); 891 bstart=bpos; 892 break; 893 } 894 rpos++; 895 }else if(m=='I'){ 896 if(rpos==rstart && type==INS){ 897 bstart=bpos; 898 break; 899 } 900 bpos++; 901 }else{ 902 assert(false) : "Unhandled symbol "+(char)m; 903 } 904 } 905 assert(bstart>=0); 906 return bstart; 907 } 908 calcBstop(int bstart, Read r)909 public int calcBstop(int bstart, Read r){ 910 assert(bstart>=0); 911 int bstop=bstart+readlen(); 912 assert(bstop<=r.length()); 913 return bstop; 914 } 915 calcEndDist(int bstart, int bstop, Read r)916 public int calcEndDist(int bstart, int bstop, Read r){ 917 int dist=Tools.min(bstart, r.length()-bstop); 918 assert(dist>=0 && dist<=r.length()/2) : dist+", "+r.length()+", "+bstart+", "+bstop+"\n"+this+"\n"+ 919 "\n"+new String(r.match)+"\n"+r.samline+"\n"; 920 assert(dist<=(r.length()-readlen())/2) : "\ndist="+dist+", r.len="+r.length()+", readlen="+readlen()+", allele='"+new String(allele)+ 921 "', allele.length="+allele.length+"\n"+"allele2="+toString(allele)+"\n(r.length()-readlen())/2="+((r.length()-readlen())/2)+"bstart="+bstart+", bstop="+bstop+ 922 "\nthis="+this+"\nmatch="+new String(r.match)+"\nsamline="+r.samline+"\n"; 923 return dist; 924 } 925 toString(byte[] array)926 String toString(byte[] array) { 927 StringBuilder sb=new StringBuilder(); 928 for(byte b : array){ 929 sb.append((int)b).append(','); 930 } 931 return sb.toString(); 932 } 933 calcBaseQ(final int bstart0, final int bstop0, Read r, SamLine sl)934 public int calcBaseQ(final int bstart0, final int bstop0, Read r, SamLine sl){ 935 final byte[] quals=r.quality; 936 if(quals==null){return Shared.FAKE_QUAL;} 937 final int type=type(); 938 final int bstart, bstop; 939 final int len=r.length(); 940 941 if(sl.strand()==0 || (sl.strand()==1 && r.swapped())){ 942 bstart=bstart0; 943 bstop=bstop0; 944 }else{ 945 bstart=len-bstop0-1; 946 bstop=len-bstart0-1; 947 assert(bstop-bstart==bstop0-bstart0); 948 } 949 950 int sum=0, avg=0; 951 if(type==DEL){ 952 if(bstart==0){ 953 sum=avg=quals[0]; 954 }else if(bstop>=len-1){ 955 sum=avg=quals[len-1]; 956 }else{ 957 assert(bstop==bstart) : bstart0+", "+bstop0+", "+bstart+", "+bstop+"\n"+ 958 r.length()+", "+r.swapped()+", "+type()+", "+readlen()+", "+reflen()+ 959 "\n"+this+"\n"+new String(r.match)+"\n"+r.samline+"\n"; 960 961 // -1, 73, -1, 73 962 // 151, true, 2, 0, 1 963 964 sum=quals[bstart]+quals[bstop+1]; 965 avg=sum/2; 966 } 967 }else{ 968 for(int i=bstart; i<bstop; i++){ 969 sum+=quals[i]; 970 } 971 avg=Math.round(sum/(bstop-bstart)); 972 } 973 return avg; 974 } 975 reflen()976 public int reflen(){ 977 return stop-start; 978 } 979 readlen()980 int readlen(){ 981 return (allele==null || allele.length==0 || allele[0]=='.' ? 0 : allele.length); 982 } 983 type()984 public int type(){return type;} 985 type_old()986 int type_old(){ 987 int reflen=reflen(), readlen=readlen(); 988 return typeReadlenReflen(readlen, reflen, allele); 989 } 990 typeStartStop(int start, int stop, byte[] allele)991 static int typeStartStop(int start, int stop, byte[] allele){ 992 final int readlen=(allele.length==0 || allele[0]=='.' ? 0 : allele.length); 993 final int reflen=stop-start; 994 return typeReadlenReflen(readlen, reflen, allele); 995 } 996 typeReadlenReflen(int readlen, int reflen, byte[] allele)997 static int typeReadlenReflen(int readlen, int reflen, byte[] allele){ 998 if(reflen<readlen){return INS;} 999 if(reflen>readlen){return DEL;} 1000 // assert(readlen>0) : start+", "+stop+", "+reflen+", "+readlen+", "+new String(allele); 1001 // if(reflen==0){return INS;} 1002 // if(readlen==0){return DEL;} 1003 // assert(start<=stop) : start+", "+stop; 1004 // assert(reflen==readlen) : reflen+", "+readlen+", "+new String(allele)+", "+start+", "+stop; 1005 for(byte b : allele){ 1006 if(b!='N'){return SUB;} 1007 } 1008 return NOCALL; 1009 } 1010 typeString()1011 String typeString(){ 1012 return typeArray[type()]; 1013 } 1014 1015 /*--------------------------------------------------------------*/ 1016 /*---------------- Contract Methods ----------------*/ 1017 /*--------------------------------------------------------------*/ 1018 1019 @Override equals(Object b)1020 public boolean equals(Object b){ 1021 return equals((Var)b); 1022 } 1023 equals(Var b)1024 public boolean equals(Var b){ 1025 return hashcode==b.hashcode && compareTo(b)==0; 1026 } 1027 1028 @Override hashCode()1029 public int hashCode(){ 1030 return hashcode; 1031 } 1032 toKey()1033 public long toKey() { 1034 final int len=(type==DEL ? reflen() : readlen()); 1035 final long key=type^((hash(allele)&0x3F)>>alleleShift)^(len>>lenShift)^(Long.rotateRight(start, startShift)); 1036 return key&0x7FFFFFFFFFFFFFFFL; 1037 1038 // long key=Long.rotateLeft(start, 30)^Long.rotateLeft(scafnum, 10)^hash(allele); 1039 // return key&0x3FFFFFFFFFFFFFFFL; 1040 } 1041 1042 @Override compareTo(Var v)1043 public int compareTo(Var v){ 1044 if(scafnum!=v.scafnum){return scafnum-v.scafnum;} 1045 final int typeA=type(), typeB=v.type(); 1046 int stA=start+(typeA==DEL ? -1 : 0); 1047 int stB=v.start+(typeB==DEL ? -1 : 0); 1048 if(stA!=stB){return stA-stB;} 1049 if(typeA!=typeB){return typeA-typeB;} 1050 if(stop!=v.stop){return stop-v.stop;} 1051 return compare(allele, v.allele); 1052 } 1053 compare(byte[] a, byte[] b)1054 public int compare(byte[] a, byte[] b){ 1055 if(a==b){return 0;} 1056 if(a.length!=b.length){return b.length-a.length;} 1057 for(int i=0; i<a.length; i++){ 1058 byte ca=a[i], cb=b[i]; 1059 if(ca!=cb){return ca-cb;} 1060 } 1061 return 0; 1062 } 1063 1064 @Override toString()1065 public String toString(){ 1066 return toTextQuick(new ByteBuilder()).toString(); 1067 } 1068 toTextQuick(ByteBuilder bb)1069 public ByteBuilder toTextQuick(ByteBuilder bb){ 1070 return toText(bb, 0.99f, 30, 30, 150, 1, 2, null); 1071 } 1072 toVarHeader(double properPairRate, double totalQualityAvg, double mapqAvg, double rarity, double minAlleleFraction, int ploidy, long reads, long pairs, long properPairs, long bases, String ref)1073 public static String toVarHeader(double properPairRate, double totalQualityAvg, double mapqAvg, double rarity, double minAlleleFraction, int ploidy, 1074 long reads, long pairs, long properPairs, long bases, String ref){ 1075 StringBuilder sb=new StringBuilder(); 1076 1077 final double readLengthAvg=bases/Tools.max(1.0, reads); 1078 sb.append("#fileformat\tVar_"+varFormat+"\n"); 1079 sb.append("#BBMapVersion\t"+Shared.BBMAP_VERSION_STRING+"\n"); 1080 sb.append("#ploidy\t"+ploidy+"\n"); 1081 sb.append(String.format(Locale.ROOT, "#rarity\t%.5f\n", rarity)); 1082 sb.append(String.format(Locale.ROOT, "#minAlleleFraction\t%.4f\n", minAlleleFraction)); 1083 sb.append("#reads\t"+reads+"\n"); 1084 sb.append("#pairedReads\t"+pairs+"\n"); 1085 sb.append("#properlyPairedReads\t"+properPairs+"\n"); 1086 sb.append(String.format(Locale.ROOT, "#readLengthAvg\t%.2f\n", readLengthAvg)); 1087 sb.append(String.format(Locale.ROOT, "#properPairRate\t%.4f\n", properPairRate)); 1088 sb.append(String.format(Locale.ROOT, "#totalQualityAvg\t%.4f\n", totalQualityAvg)); 1089 sb.append(String.format(Locale.ROOT, "#mapqAvg\t%.2f\n", mapqAvg)); 1090 if(ref!=null){sb.append("#reference\t"+ref+"\n");} 1091 1092 sb.append("#scaf\tstart\tstop\ttype\tcall\tr1p\tr1m\tr2p\tr2m\tpaired\tlengthSum"); 1093 sb.append("\tmapq\tmapqMax\tbaseq\tbaseqMax\tedist\tedistMax\tid\tidMax"); 1094 sb.append("\tcov\tminusCov"); 1095 sb.append("\tnearbyVarCount"); 1096 sb.append("\tflagged"); 1097 sb.append("\tcontigEndDist"); 1098 sb.append("\tphredScore"); 1099 if(extendedText){ 1100 sb.append("\treadCount\talleleFraction\trevisedAF\tstrandRatio\tbaseqAvg\tmapqAvg\tedistAvg\tidentityAvg"); 1101 sb.append("\tedistScore\tidentityScore\tqualityScore\tpairedScore\tbiasScore\tcoverageScore\thomopolymerScore\tscore"); 1102 // sb.append("\tforced"); 1103 } 1104 return sb.toString(); 1105 } 1106 toBasicHeader()1107 public static String toBasicHeader(){ 1108 StringBuilder sb=new StringBuilder(); 1109 sb.append("#scaf\tstart\tstop\ttype\tcall\tr1p\tr1m\tr2p\tr2m\tpaired\tlengthSum"); 1110 sb.append("\tmapq\tmapqMax\tbaseq\tbaseqMax\tedist\tedistMax\tid\tidMax"); 1111 sb.append("\tcov\tminusCov"); 1112 sb.append("\tnearVarCount"); 1113 sb.append("\tcontigEndDist"); 1114 sb.append("\tphredScore"); 1115 return sb.toString(); 1116 } 1117 toText(ByteBuilder bb, double properPairRate, double totalQualityAvg, double totalMapqAvg, double readLengthAvg, double rarity, int ploidy, ScafMap map)1118 public ByteBuilder toText(ByteBuilder bb, double properPairRate, double totalQualityAvg, double totalMapqAvg, double readLengthAvg, double rarity, int ploidy, ScafMap map){ 1119 useIdentity=true; 1120 bb.append(scafnum).tab(); 1121 bb.append(start).tab(); 1122 bb.append(stop).tab(); 1123 bb.append(typeArray[type()]).tab(); 1124 for(byte b : allele){bb.append(b);} 1125 bb.tab(); 1126 1127 bb.append(r1plus).tab(); 1128 bb.append(r1minus).tab(); 1129 bb.append(r2plus).tab(); 1130 bb.append(r2minus).tab(); 1131 bb.append(properPairCount).tab(); 1132 bb.append(lengthSum).tab(); 1133 1134 bb.append(mapQSum).tab(); 1135 bb.append(mapQMax).tab(); 1136 bb.append(baseQSum).tab(); 1137 bb.append(baseQMax).tab(); 1138 bb.append(endDistSum).tab(); 1139 bb.append(endDistMax).tab(); 1140 bb.append(idSum).tab(); 1141 bb.append(idMax).tab(); 1142 1143 bb.append(coverage).tab(); 1144 bb.append(minusCoverage).tab(); 1145 bb.append(nearbyVarCount).tab(); 1146 bb.append(flagged ? 1 : 0).tab(); 1147 1148 final int scafEndDist=!doNscan ? nScan : (map==null ? start : contigEndDist(map)); 1149 bb.append(scafEndDist).tab(); 1150 1151 // bb.append(prevBase<0 ? 'N' : (char)prevBase).tab(); 1152 1153 final double score=score(properPairRate, totalQualityAvg, totalMapqAvg, readLengthAvg, rarity, ploidy, map); 1154 // bb.append(String.format(Locale.ROOT, "%.2f", toPhredScore(score))).tab(); 1155 bb.append(toPhredScore(score), 2).tab(); 1156 1157 if(extendedText){ 1158 1159 bb.append(alleleCount()).tab(); 1160 final double af=alleleFraction(); 1161 bb.append(af, 4).tab(); 1162 bb.append(revisedAlleleFraction(af, readLengthAvg), 4).tab(); 1163 bb.append(strandRatio(), 4).tab(); 1164 bb.append(baseQAvg(), 2).tab(); 1165 bb.append(mapQAvg(), 2).tab(); 1166 bb.append(edistAvg(), 2).tab(); 1167 bb.append(identityAvg(), 2).tab(); 1168 1169 bb.append(edistScore(), 4).tab(); 1170 bb.append(identityScore(), 4).tab(); 1171 bb.append(qualityScore(totalQualityAvg, totalMapqAvg), 4).tab(); 1172 bb.append(pairedScore(properPairRate, scafEndDist), 4).tab(); 1173 bb.append(biasScore(properPairRate, scafEndDist), 4).tab(); 1174 bb.append(coverageScore(ploidy, rarity, readLengthAvg), 4).tab(); 1175 bb.append(homopolymerScore(map), 4).tab(); 1176 bb.append(score, 4).tab(); 1177 // bb.append((forced() ? 1 : 0), 4).tab(); 1178 } 1179 1180 bb.length--; 1181 1182 return bb; 1183 } 1184 toVcfHeader(double properPairRate, double totalQualityAvg, double mapqAvg, double rarity, double minAlleleFraction, int ploidy, long reads, long pairs, long properPairs, long bases, String ref, ScafMap map, String sampleName, boolean trimWhitespace)1185 public static String toVcfHeader(double properPairRate, double totalQualityAvg, double mapqAvg, 1186 double rarity, double minAlleleFraction, int ploidy, long reads, long pairs, 1187 long properPairs, long bases, String ref, ScafMap map, String sampleName, 1188 boolean trimWhitespace) { 1189 StringBuilder sb=new StringBuilder(); 1190 1191 sb.append("##fileformat=VCFv4.2\n"); 1192 sb.append("##BBMapVersion="+Shared.BBMAP_VERSION_STRING+"\n"); 1193 sb.append("##ploidy="+ploidy+"\n"); 1194 sb.append(String.format(Locale.ROOT, "##rarity=%.5f\n", rarity)); 1195 sb.append(String.format(Locale.ROOT, "##minallelefraction=%.5f\n", minAlleleFraction)); 1196 sb.append("##reads="+reads+"\n"); 1197 sb.append("##pairedReads="+pairs+"\n"); 1198 sb.append("##properlyPairedReads="+properPairs+"\n"); 1199 sb.append(String.format(Locale.ROOT, "##readLengthAvg=%.3f\n", (bases/Tools.max(reads, 1.0)))); 1200 sb.append(String.format(Locale.ROOT, "##properPairRate=%.5f\n", properPairRate)); 1201 sb.append(String.format(Locale.ROOT, "##totalQualityAvg=%.3f\n", totalQualityAvg)); 1202 sb.append(String.format(Locale.ROOT, "##mapqAvg=%.3f\n", mapqAvg)); 1203 if(ref!=null){sb.append("##reference="+ref+"\n");} 1204 1205 for(Scaffold scaf : map.list){ 1206 String name=scaf.name; 1207 if(trimWhitespace){name=Tools.trimWhitespace(name);} 1208 sb.append("##contig=<ID="+name+",length="+scaf.length+">\n"); 1209 } 1210 1211 { 1212 sb.append("##FILTER=<ID=FAIL,Description=\"Fail\">\n"); 1213 sb.append("##FILTER=<ID=PASS,Description=\"Pass\">\n"); 1214 1215 sb.append("##INFO=<ID=SN,Number=1,Type=Integer,Description=\"Scaffold Number\">\n"); 1216 sb.append("##INFO=<ID=STA,Number=1,Type=Integer,Description=\"Start\">\n"); 1217 sb.append("##INFO=<ID=STO,Number=1,Type=Integer,Description=\"Stop\">\n"); 1218 sb.append("##INFO=<ID=TYP,Number=1,Type=String,Description=\"Type\">\n"); 1219 1220 sb.append("##INFO=<ID=R1P,Number=1,Type=Integer,Description=\"Read1 Plus Count\">\n"); 1221 sb.append("##INFO=<ID=R1M,Number=1,Type=Integer,Description=\"Read1 Minus Count\">\n"); 1222 sb.append("##INFO=<ID=R2P,Number=1,Type=Integer,Description=\"Read2 Plus Count\">\n"); 1223 sb.append("##INFO=<ID=R2M,Number=1,Type=Integer,Description=\"Read2 Minus Count\">\n"); 1224 1225 sb.append("##INFO=<ID=AD,Number=1,Type=Integer,Description=\"Allele Depth\">\n"); 1226 sb.append("##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">\n"); 1227 // sb.append("##INFO=<ID=COV,Number=1,Type=Integer,Description=\"Coverage\">\n"); 1228 sb.append("##INFO=<ID=MCOV,Number=1,Type=Integer,Description=\"Minus Coverage\">\n"); 1229 sb.append("##INFO=<ID=PPC,Number=1,Type=Integer,Description=\"Paired Count\">\n"); 1230 1231 sb.append("##INFO=<ID=AF,Number=1,Type=Float,Description=\"Allele Fraction\">\n"); 1232 sb.append("##INFO=<ID=RAF,Number=1,Type=Float,Description=\"Revised Allele Fraction\">\n"); 1233 sb.append("##INFO=<ID=LS,Number=1,Type=Integer,Description=\"Length Sum\">\n"); 1234 1235 sb.append("##INFO=<ID=MQS,Number=1,Type=Integer,Description=\"MAPQ Sum\">\n"); 1236 sb.append("##INFO=<ID=MQM,Number=1,Type=Integer,Description=\"MAPQ Max\">\n"); 1237 sb.append("##INFO=<ID=BQS,Number=1,Type=Integer,Description=\"Base Quality Sum\">\n"); 1238 sb.append("##INFO=<ID=BQM,Number=1,Type=Integer,Description=\"Base Quality Max\">\n"); 1239 1240 sb.append("##INFO=<ID=EDS,Number=1,Type=Integer,Description=\"End Distance Sum\">\n"); 1241 sb.append("##INFO=<ID=EDM,Number=1,Type=Integer,Description=\"End Distance Max\">\n"); 1242 sb.append("##INFO=<ID=IDS,Number=1,Type=Integer,Description=\"Identity Sum\">\n"); 1243 sb.append("##INFO=<ID=IDM,Number=1,Type=Integer,Description=\"Identity Max\">\n"); 1244 1245 sb.append("##INFO=<ID=NVC,Number=1,Type=Integer,Description=\"Nearby Variation Count\">\n"); 1246 sb.append("##INFO=<ID=FLG,Number=1,Type=Integer,Description=\"Flagged\">\n"); 1247 sb.append("##INFO=<ID=CED,Number=1,Type=Integer,Description=\"Contig End Distance\">\n"); 1248 sb.append("##INFO=<ID=HMP,Number=1,Type=Integer,Description=\"Homopolymer Count\">\n"); 1249 sb.append("##INFO=<ID=SB,Number=1,Type=Float,Description=\"Strand Bias\">\n"); 1250 sb.append("##INFO=<ID=DP4,Number=4,Type=Integer,Description=\"Ref+, Ref-, Alt+, Alt-\">\n"); 1251 1252 sb.append("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n"); 1253 sb.append("##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read Depth\">\n"); 1254 sb.append("##FORMAT=<ID=AD,Number=1,Type=Integer,Description=\"Allele Depth\">\n"); 1255 sb.append("##FORMAT=<ID=AF,Number=1,Type=Float,Description=\"Allele Fraction\">\n"); 1256 sb.append("##FORMAT=<ID=RAF,Number=1,Type=Float,Description=\"Revised Allele Fraction\">\n"); 1257 sb.append("##FORMAT=<ID=NVC,Number=1,Type=Integer,Description=\"Nearby Variation Count\">\n"); 1258 sb.append("##FORMAT=<ID=FLG,Number=1,Type=Integer,Description=\"Flagged\">\n"); 1259 sb.append("##FORMAT=<ID=SB,Number=1,Type=Float,Description=\"Strand Bias\">\n"); 1260 1261 sb.append("##FORMAT=<ID=SC,Number=1,Type=Float,Description=\"Score\">\n"); 1262 sb.append("##FORMAT=<ID=PF,Number=1,Type=String,Description=\"Pass Filter\">\n"); 1263 1264 } 1265 1266 sb.append("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"); 1267 if(sampleName!=null){sb.append('\t').append(sampleName);} 1268 return sb.toString(); 1269 } 1270 1271 //#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 1323_1066121 1272 //ChrIII_A_nidulans_FGSC_A4 133 . T C 204 PASS 1273 // DP=27;VDB=1.614640e-01;RPB=9.947863e-01;AF1=0.5;AC1=1;DP4=5,8,6,8;MQ=49;FQ=180;PV4=1,1,0.055,1 1274 // GT:PL:DP:SP:GQ 0/1:234,0,207:27:0:99 toVCF(ByteBuilder bb, double properPairRate, double totalQualityAvg, double mapqAvg, double readLengthAvg, int ploidy, ScafMap map, VarFilter filter, boolean trimWhitespace)1275 public ByteBuilder toVCF(ByteBuilder bb, double properPairRate, double totalQualityAvg, double mapqAvg, double readLengthAvg, 1276 int ploidy, ScafMap map, VarFilter filter, boolean trimWhitespace){ 1277 1278 final Scaffold scaf=map.getScaffold(scafnum); 1279 final byte[] bases=scaf.bases; 1280 final int reflen=reflen(), readlen=readlen(), type=type(); 1281 final double score=phredScore(properPairRate, totalQualityAvg, mapqAvg, readLengthAvg, filter.rarity, ploidy, map); 1282 final boolean pass=(filter==null ? true : 1283 filter.passesFilter(this, properPairRate, totalQualityAvg, mapqAvg, readLengthAvg, ploidy, map, true)); 1284 bb.append(trimWhitespace ? Tools.trimWhitespace(scaf.name) : scaf.name).tab(); 1285 boolean indel=(type==INS || type==DEL); 1286 boolean addPrevBase=true; 1287 final int vcfStart=start+(indel && addPrevBase ? 0 : 1); 1288 bb.append(vcfStart).tab(); 1289 bb.append('.').tab(); 1290 1291 byte prevBase=(bases==null ? (byte)'N' : bases[Tools.mid(start-1, 0, bases.length-1)]); 1292 if(UPPER_CASE_ALLELES){prevBase=(byte) Tools.toUpperCase(prevBase);} 1293 1294 if(addPrevBase){ 1295 if(reflen==0 || allele.length<1){bb.append(prevBase);} 1296 for(int i=0, rpos=start; i<reflen; i++, rpos++){ 1297 bb.append(bases==null || rpos<0 || rpos>=bases.length ? (char)'N' : (char)bases[rpos]); 1298 } 1299 bb.tab(); 1300 1301 if(reflen==0 || allele.length<1){bb.append(prevBase);} 1302 bb.append(allele).tab(); 1303 }else{ 1304 if(reflen==0){ 1305 bb.append('.'); 1306 }else{ 1307 for(int i=0, rpos=start; i<reflen; i++, rpos++){ 1308 char refBase=bases==null || rpos<0 || rpos>=bases.length ? (char)'N' : (char)bases[rpos]; 1309 if(UPPER_CASE_ALLELES){refBase=Tools.toUpperCase(refBase);} 1310 bb.append(refBase); 1311 } 1312 } 1313 bb.tab(); 1314 1315 if(allele.length<1){ 1316 bb.append('.').tab(); 1317 }else{ 1318 bb.append(allele).tab(); 1319 } 1320 } 1321 1322 // bb.append(String.format(Locale.ROOT, "%.2f\t", score)); 1323 bb.append(score, 2).tab(); 1324 bb.append(pass ? "PASS\t" : "FAIL\t"); 1325 1326 final int scafEndDist=!doNscan ? nScan : (map==null ? start : contigEndDist(map)); 1327 final int count=alleleCount(); 1328 final double af=alleleFraction(); 1329 final double raf=revisedAlleleFraction(af, readLengthAvg); 1330 final double strandBias=strandBiasScore(scafEndDist); 1331 1332 {//INFO 1333 assert(Scaffold.trackStrand()==(minusCoverage>=0)) : Scaffold.trackStrand()+", "+minusCoverage; 1334 final int covMinus=(Scaffold.trackStrand() ? minusCoverage : coverage/2); 1335 final int covPlus=Tools.max(0, coverage-covMinus); 1336 final int refMinus=Tools.max(0, covMinus-alleleMinusCount()); 1337 final int refPlus=Tools.max(0, covPlus-allelePlusCount()); 1338 1339 bb.append("SN=").append(scafnum).append(';'); 1340 bb.append("STA=").append(start).append(';'); 1341 bb.append("STO=").append(stop).append(';'); 1342 bb.append("TYP=").append(typeArray[type()]).append(';'); 1343 1344 bb.append("R1P=").append(r1plus).append(';'); 1345 bb.append("R1M=").append(r1minus).append(';'); 1346 bb.append("R2P=").append(r2plus).append(';'); 1347 bb.append("R2M=").append(r2minus).append(';'); 1348 1349 bb.append("AD=").append(count).append(';'); 1350 bb.append("DP=").append(Tools.max(coverage, count)).append(';'); 1351 bb.append("MCOV=").append(minusCoverage).append(';'); 1352 bb.append("PPC=").append(properPairCount).append(';'); 1353 1354 bb.append("AF=").append(af,4).append(';'); 1355 bb.append("RAF=").append(raf,4).append(';'); 1356 bb.append("LS=").append(lengthSum).append(';'); 1357 1358 bb.append("MQS=").append(mapQSum).append(';'); 1359 bb.append("MQM=").append(mapQMax).append(';'); 1360 bb.append("BQS=").append(baseQSum).append(';'); 1361 bb.append("BQM=").append(baseQMax).append(';'); 1362 1363 bb.append("EDS=").append(endDistSum).append(';'); 1364 bb.append("EDM=").append(endDistMax).append(';'); 1365 bb.append("IDS=").append(idSum).append(';'); 1366 bb.append("IDM=").append(idMax).append(';'); 1367 1368 bb.append("NVC=").append(nearbyVarCount).append(';'); 1369 bb.append("FLG=").append(flagged ? 1 : 0).append(';'); 1370 bb.append("CED=").append(scafEndDist).append(';'); 1371 bb.append("HMP=").append(homopolymerCount(map)).append(';'); 1372 bb.append("SB=").append(strandBias,4).append(';'); 1373 1374 if(Scaffold.trackStrand()){ 1375 bb.append("DP4=").append(refPlus).append(',').append(refMinus).append(','); 1376 bb.append(allelePlusCount()).append(',').append(alleleMinusCount()).append(';'); 1377 } 1378 1379 bb.length--; 1380 } 1381 { 1382 bb.tab(); 1383 bb.append("GT:DP:AD:AF:RAF:NVC:FLG:SB:SC:PF"); 1384 bb.tab(); 1385 1386 bb.append(genotype(ploidy, pass)); 1387 bb.append(':'); 1388 bb.append(Tools.max(coverage, count)); 1389 bb.append(':'); 1390 bb.append(count); 1391 bb.append(':'); 1392 bb.append(af,4); 1393 bb.append(':'); 1394 1395 bb.append(raf,4); 1396 bb.append(':'); 1397 bb.append(nearbyVarCount); 1398 bb.append(':'); 1399 bb.append(flagged ? 1 : 0); 1400 bb.append(':'); 1401 bb.append(strandBias,4); 1402 bb.append(':'); 1403 1404 bb.append(score,2); 1405 bb.append(':'); 1406 bb.append(pass ? "PASS" : "FAIL"); 1407 } 1408 1409 return bb; 1410 } 1411 calcCopies(int ploidy)1412 public int calcCopies(int ploidy){ 1413 final double af=alleleFraction(); 1414 // final int count=count(); 1415 if(ploidy==1){ 1416 return af<0.4 ? 0 : 1; 1417 }else if(ploidy==2){ 1418 if(af<0.2){return 0;} 1419 if(af<0.8){return 1;} 1420 return 2; 1421 } 1422 1423 int copies=(int)Math.round(ploidy*af); 1424 if(af>=0.5){copies=Tools.max(copies, 1);} 1425 copies=Tools.mid(MIN_VAR_COPIES, copies, ploidy); 1426 return copies; 1427 } 1428 1429 //TODO: Actually, I should also track the ref coverage. genotype(int ploidy, boolean pass)1430 private String genotype(int ploidy, boolean pass) { 1431 if(!pass && noPassDotGenotype){ 1432 if(ploidy==1){return ".";} 1433 else if(ploidy==2){return "./.";} 1434 StringBuilder sb=new StringBuilder(ploidy*2-1); 1435 sb.append('.'); 1436 for(int i=1; i<ploidy; i++){ 1437 sb.append('/').append('.'); 1438 } 1439 return sb.toString(); 1440 } 1441 int copies=calcCopies(ploidy); 1442 if(ploidy==1){return copies==0 ? "0" : "1";} 1443 if(ploidy==2){ 1444 if(copies==0){return "0/0";} 1445 if(copies==1){return "0/1";} 1446 return "1/1"; 1447 } 1448 StringBuilder sb=new StringBuilder(ploidy*2); 1449 int refCopies=ploidy-copies; 1450 1451 for(int i=0; i<refCopies; i++){ 1452 sb.append(0).append('/'); 1453 } 1454 for(int i=0; i<copies; i++){ 1455 sb.append(1).append('/'); 1456 } 1457 sb.setLength(sb.length()-1); 1458 return sb.toString(); 1459 } 1460 hash()1461 private int hash(){ 1462 return scafnum^Integer.rotateLeft(start, 9)^Integer.rotateRight(stop, 9)^hash(allele); 1463 } 1464 hash(byte[] a)1465 public static final int hash(byte[] a){ 1466 int code=123456789; 1467 for(byte b : a){ 1468 code=Integer.rotateLeft(code, 3)^codes[b]; 1469 } 1470 return code&Integer.MAX_VALUE; 1471 } 1472 calcCoverage(ScafMap map)1473 public int calcCoverage(ScafMap map){ 1474 if(coverage>=0){return coverage;} 1475 1476 Scaffold scaf=map.getScaffold(scafnum); 1477 coverage=scaf.calcCoverage(this); 1478 if(Scaffold.trackStrand()){minusCoverage=scaf.minusCoverage(this);} 1479 return coverage; 1480 } 1481 1482 /*--------------------------------------------------------------*/ 1483 /*---------------- Scoring Methods ----------------*/ 1484 /*--------------------------------------------------------------*/ 1485 toPhredScore(double score)1486 public static double toPhredScore(double score){ 1487 if(score==0){return 0;} 1488 score=score*0.998; 1489 return 2.5*QualityTools.probErrorToPhredDouble(1-score); 1490 } 1491 phredScore(double properPairRate, double totalQualityAvg, double totalMapqAvg, double readLengthAvg, double rarity, int ploidy, ScafMap map)1492 public double phredScore(double properPairRate, double totalQualityAvg, double totalMapqAvg, double readLengthAvg, double rarity, int ploidy, ScafMap map){ 1493 double score=score(properPairRate, totalQualityAvg, totalMapqAvg, readLengthAvg, rarity, ploidy, map); 1494 return toPhredScore(score); 1495 } 1496 score(double properPairRate, double totalQualityAvg, double totalMapqAvg, double readLengthAvg, double rarity, int ploidy, ScafMap map)1497 public double score(double properPairRate, double totalQualityAvg, double totalMapqAvg, double readLengthAvg, double rarity, int ploidy, ScafMap map){ 1498 int scafEndDist=(map==null ? start : contigEndDist(map)); 1499 1500 double cs=coverageScore(ploidy, rarity, readLengthAvg); 1501 if(cs==0){return 0;} 1502 double es=(useEdist ? edistScore() : 1); 1503 double qs=qualityScore(totalQualityAvg, totalMapqAvg); 1504 double ps=(usePairing ? pairedScore(properPairRate, scafEndDist) : 1); 1505 double bs=(useBias ? biasScore(properPairRate, scafEndDist) : 1); 1506 double is=(useIdentity ? identityScore() : 1); 1507 double hs=(useHomopolymer ? homopolymerScore(map) : 1); 1508 return Math.pow(es*qs*ps*bs*cs*is*hs, 0.2); 1509 } 1510 edistScore()1511 public double edistScore(){ 1512 double lengthAvg=lengthAvg(); 1513 double edistAvg=((edistAvg()*2+endDistMax))*0.333333333333; 1514 double constant=5+Tools.min(20, lengthAvg*0.1)+lengthAvg*0.01; 1515 double weighted=Tools.max(0.05, edistAvg-Tools.min(constant, edistAvg*0.95)); 1516 weighted=weighted*weighted; 1517 return weighted/(weighted+4); 1518 } 1519 identityScore()1520 public double identityScore(){ 1521 double lengthAvg=lengthAvg(); 1522 double idAvg=0.001f*(((identityAvg()+idMax))*0.5f); 1523 double weighted=Tools.min(1, (idAvg*lengthAvg+(0.65f*Tools.max(1, readlen())))/lengthAvg); //Diminish impact of this var on itself 1524 weighted=0.75f+0.25f*weighted; //Compress the range 1525 return weighted; 1526 } 1527 qualityScore(double totalBaseqAvg, double totalMapqAvg)1528 public double qualityScore(double totalBaseqAvg, double totalMapqAvg){ 1529 return baseQualityScore(totalBaseqAvg)*mapQualityScore(totalMapqAvg); 1530 } 1531 1532 // public double baseQualityScore(double totalBaseqAvg){ 1533 // double bqAvg=baseQAvg(); 1534 // final double delta=Tools.max(0, totalBaseqAvg-bqAvg); 1535 // } 1536 baseQualityScore(double totalBaseqAvg)1537 public double baseQualityScore(double totalBaseqAvg){ 1538 double bqAvg=baseQAvg(); 1539 1540 if(totalBaseqAvg<32 && bqAvg<32){//Fudge factor for recalibrated quality scores, since this was calibrated for raw Illumina scores. 1541 //This section is not well-tested, though. 1542 double fudgeFactor1=0.75*(32-totalBaseqAvg); 1543 double fudgeFactor2=0.75*(32-bqAvg); 1544 totalBaseqAvg+=fudgeFactor1; 1545 bqAvg+=Tools.min(fudgeFactor1, fudgeFactor2); 1546 } 1547 1548 //Difference between this variation's quality and average quality. 1549 //Positive delta means that this is lower quality 1550 final double delta=totalBaseqAvg-bqAvg; 1551 if(delta>0){//This is kind of mysterious, but appears to normally reduce bqAvg for low-quality vars 1552 bqAvg=Tools.max(bqAvg*0.5, bqAvg-0.5*delta); 1553 } 1554 1555 double mult=0.25; 1556 double thresh=12; 1557 if(bqAvg>thresh){ 1558 bqAvg=bqAvg-thresh+(thresh*mult); 1559 }else{ 1560 bqAvg=bqAvg*mult; 1561 } 1562 1563 double baseProbAvg=1-Math.pow(10, 0-.1*bqAvg); 1564 double d=baseProbAvg*baseProbAvg; 1565 return d; 1566 } 1567 1568 // public double mapQualityScore(double totalMapqAvg){ 1569 // double mqAvg=mapQAvg(); 1570 // double mqAvg2=(0.25f*(3*mqAvg+mapQMax)); 1571 // final double delta=totalMapqAvg-mqAvg; 1572 // double score=(1-Math.pow(10, 0-.1*(mqAvg2+4))); 1573 // if(delta>0){ 1574 // 1575 // }else{ 1576 // 1577 // } 1578 // } 1579 mapQualityScore(double totalMapqAvg)1580 public double mapQualityScore(double totalMapqAvg){ 1581 double mqAvg=0.5f*(mapQAvg()+mapQMax); 1582 1583 double mapProbAvg=1-Math.pow(10, 0-.1*(mqAvg+2)); 1584 double d=mapProbAvg; 1585 return d; 1586 } 1587 pairedScore(double properPairRate, int scafEndDist)1588 public double pairedScore(double properPairRate, int scafEndDist){ 1589 if(properPairRate<0.5){return 0.98;} 1590 final double count=alleleCount(); 1591 if(count==0){return 0;} 1592 double rate=properPairCount/count; 1593 rate=rate*(count/(0.1+count)); 1594 if(rate*1.05>=properPairRate){return Tools.max(rate, 1-0.001*properPairRate);} 1595 double score=((rate*1.05)/properPairRate)*0.5+0.5; 1596 score=Tools.max(0.1, score); 1597 return modifyByEndDist(score, scafEndDist); 1598 } 1599 modifyByEndDist(double x, int scafEndDist)1600 public double modifyByEndDist(double x, int scafEndDist){ 1601 if(x>=0.99 || !doNscan || scafEndDist>=nScan){return x;} 1602 if(scafEndDist<minEndDistForBias){return Tools.max(x, 0.98+0.02*x);} 1603 double delta=1-x; 1604 delta=delta*(scafEndDist*scafEndDist)/(nScan*nScan); 1605 return 1-delta; 1606 } 1607 coverageScore(int ploidy, double rarity, double readLengthAvg)1608 public double coverageScore(int ploidy, double rarity, double readLengthAvg){ 1609 int count=alleleCount(); 1610 if(count==0){return 0;} 1611 double rawScore=count/(lowCoveragePenalty+count); //This may be too severe... 1612 1613 // double ratio=alleleFraction(); 1614 1615 double ratio=0.98; 1616 if(coverage>0){ 1617 double dif=coverage-count; 1618 if(dif>0){ 1619 dif=dif-coverage*.01f-Tools.min(0.5f, coverage*.1f); 1620 dif=Tools.max(0.1f, dif); 1621 } 1622 ratio=(coverage-dif)/coverage; 1623 if(type()==SUB && revisedAlleleFraction!=-1 && revisedAlleleFraction<ratio){ratio=revisedAlleleFraction;} 1624 else{ 1625 ratio=adjustForInsertionLength(ratio, readLengthAvg); 1626 } 1627 if(rarity<1 && ratio>rarity){ 1628 double minExpected=1f/ploidy; 1629 if(ratio<minExpected){ 1630 ratio=minExpected-((minExpected-ratio)*0.1); 1631 } 1632 } 1633 } 1634 1635 double ratio2=Tools.min(1, ploidy*ratio); 1636 return rawScore*ratio2; 1637 } 1638 reviseAlleleFraction(double readLengthAvg, Scaffold scaffold, VarMap map)1639 public void reviseAlleleFraction(double readLengthAvg, Scaffold scaffold, VarMap map){ 1640 assert(type()==INS); 1641 final int ilen=readlen(); 1642 if(ilen<3 || start<1 || start>=scaffold.length-2){return;} 1643 final byte[] bases=scaffold.bases; 1644 1645 final double afIns=alleleFraction(); 1646 final double rafIns=revisedAlleleFraction(afIns, readLengthAvg); 1647 final double revisedDif=0.55*(rafIns-afIns); //Half on the left and half on the right, on average 1648 final double mult=revisedDif/allele.length; 1649 1650 for(int i=0, j=start; i<allele.length && j<scaffold.bases.length; i++, j++){ 1651 final byte b=allele[i]; 1652 if(b!=bases[j]){ 1653 Var key=new Var(scaffold.number, j, j+1, b, SUB); 1654 Var affectedSub=map.get(key); 1655 // System.err.println("At pos "+j+": ref="+(char)bases[j]+", alt="+(char)b); 1656 if(affectedSub!=null){ 1657 assert(key.type()==SUB); 1658 // System.err.println("Found "+value); 1659 final double subModifier=revisedDif-mult*i; 1660 synchronized(affectedSub){ 1661 double afSub=affectedSub.alleleFraction(); 1662 double rafSub=affectedSub.revisedAlleleFraction; 1663 double modified=afSub-subModifier; 1664 if(rafSub==-1){ 1665 // System.err.println("sub="+sub+", old="+old); 1666 affectedSub.revisedAlleleFraction=Tools.max(afSub*0.05, modified); 1667 }else{ 1668 affectedSub.revisedAlleleFraction=Tools.min(rafSub, Tools.max(afSub*0.05, modified)); 1669 } 1670 } 1671 } 1672 } 1673 } 1674 1675 for(int i=0, j=start-1; i<allele.length && j>=0; i++, j--){ 1676 final byte b=allele[allele.length-1-i]; 1677 if(b!=bases[j]){ 1678 Var key=new Var(scaffold.number, j, j+1, b, SUB); 1679 Var affectedSub=map.get(key); 1680 if(affectedSub!=null){ 1681 assert(key.type()==SUB); 1682 // System.err.println("Found "+value); 1683 final double subModifier=revisedDif-mult*i; 1684 synchronized(affectedSub){ 1685 double afSub=affectedSub.alleleFraction(); 1686 double rafSub=affectedSub.revisedAlleleFraction; 1687 double modified=afSub-subModifier; 1688 if(rafSub==-1){ 1689 // System.err.println("sub="+sub+", old="+old); 1690 affectedSub.revisedAlleleFraction=Tools.max(afSub*0.05, modified); 1691 }else{ 1692 affectedSub.revisedAlleleFraction=Tools.min(rafSub, Tools.max(afSub*0.05, modified)); 1693 } 1694 } 1695 } 1696 } 1697 } 1698 } 1699 revisedAlleleFraction(double af, double readLengthAvg)1700 public double revisedAlleleFraction(double af, double readLengthAvg){ 1701 if(revisedAlleleFraction!=-1){ 1702 return revisedAlleleFraction; 1703 }else if(type()==INS){ 1704 return revisedAlleleFraction=adjustForInsertionLength(af, readLengthAvg); 1705 } 1706 return af; 1707 } 1708 adjustForInsertionLength(final double ratio, final double rlen0)1709 public double adjustForInsertionLength(final double ratio, final double rlen0){ 1710 if(type()!=INS){return ratio;} 1711 final int ilen=readlen(); 1712 if(ilen<2){return ratio;} 1713 1714 final double rlen=Tools.max(ilen*1.2+6, rlen0); 1715 final double sites=rlen+ilen-1; 1716 final double goodSites=rlen-ilen*1.1-6; 1717 1718 final double expectedFraction=goodSites/sites; 1719 final double revisedRatio=Tools.min(ratio/expectedFraction, 1-(1-ratio)*0.1); 1720 return revisedRatio; 1721 } 1722 homopolymerScore(ScafMap map)1723 public double homopolymerScore(ScafMap map){ 1724 if(map==null){return 1;} 1725 1726 int count=homopolymerCount(map); 1727 // assert(false) : count; 1728 if(count<2){return 1;} 1729 return 1f-(count*0.1f/9); 1730 } 1731 homopolymerCount(ScafMap map)1732 public int homopolymerCount(ScafMap map){ 1733 if(map==null){return 0;} 1734 final byte[] bases=map.getScaffold(scafnum).bases; 1735 if(bases==null){return 0;} 1736 1737 final int type=type(); 1738 if(type==SUB){ 1739 assert(start==stop-1) : start+", "+stop; 1740 final byte base=allele[0]; 1741 int x=homopolymerCountSub(bases, start, base); 1742 // assert(false) : (char)base+", "+x; 1743 return x; 1744 }else if(type==INS){ 1745 final byte base1=allele[0], base2=allele[allele.length-1]; 1746 int i=0; 1747 while(i<allele.length && allele[i]==base1){i++;} 1748 while(i<allele.length && allele[i]==base2){i++;} 1749 if(i<bases.length){return 0;} 1750 int left=homopolymerCountLeft(bases, start, base1); 1751 int right=homopolymerCountRight(bases, stop+1, base2); 1752 // assert(false) : "INS "+(left+right+1)+" "+new String(allele)+" "+new String(bases, start-4, 8); 1753 return left+right+1; 1754 }else if(type==DEL){ 1755 if(start<0 || start+1>=bases.length || stop<=0 || stop>=bases.length){return 0;} 1756 final byte base1=bases[start+1], base2=bases[stop-1]; 1757 int pos=start+1; 1758 while(pos<=stop && bases[pos]==base1){pos++;} 1759 while(pos<=stop && bases[pos]==base2){pos++;} 1760 if(pos<=stop){return 0;} 1761 int left=homopolymerCountLeft(bases, start, base1); 1762 int right=homopolymerCountRight(bases, stop, base2); 1763 // assert(false || reflen()>10) : "DEL "+(left+right+1)+" "+new String(allele)+" "+new String(bases, start-4, stop-start+8); 1764 return left+right+1; 1765 }else{ 1766 // assert(false) : type(); 1767 return 0; 1768 } 1769 } 1770 homopolymerCountSub(final byte[] bases, final int pos, final byte base)1771 public static int homopolymerCountSub(final byte[] bases, final int pos, final byte base){ 1772 // System.err.println("A:"+pos+", "+bases.length+", "+(char)base); 1773 if(pos<0 || pos>=bases.length){return 0;} 1774 if(!AminoAcid.isFullyDefined(base)){return 0;} 1775 // System.err.println("B:"+pos+", "+bases.length); 1776 1777 int count1=0; 1778 for(int i=pos-1, lim=Tools.max(0, pos-4); i>=lim; i--){ 1779 if(bases[i]==base){count1++;} 1780 else{break;} 1781 } 1782 // System.err.println("C:"+pos+", "+bases.length+", "+count1); 1783 1784 int count2=0; 1785 for(int i=pos+1, lim=Tools.min(bases.length, pos+5); i<lim; i++){ 1786 if(bases[i]==base){count2++;} 1787 else{break;} 1788 } 1789 // System.err.println("D:"+pos+", "+bases.length+", "+count2); 1790 // System.err.println("E:"+new String(bases, pos-4, 9)); 1791 assert(count1+count2<=8) : count1+", "+count2; 1792 1793 return count1+count2+(count1>0 && count2>0 ? 1 : 0); 1794 } 1795 homopolymerCountLeft(final byte[] bases, final int pos, final byte base)1796 public static int homopolymerCountLeft(final byte[] bases, final int pos, final byte base){ 1797 if(pos<0 || bases[pos]!=base){return 0;} 1798 if(!AminoAcid.isFullyDefined(base)){return 0;} 1799 1800 int count=0; 1801 for(int i=pos, lim=Tools.max(0, pos-3); i>=lim; i--){ 1802 if(bases[i]==base){count++;} 1803 else{break;} 1804 } 1805 return count; 1806 } 1807 homopolymerCountRight(final byte[] bases, final int pos, final byte base)1808 public static int homopolymerCountRight(final byte[] bases, final int pos, final byte base){ 1809 if(pos<0 || bases[pos]!=base){return 0;} 1810 if(!AminoAcid.isFullyDefined(base)){return 0;} 1811 1812 int count=0; 1813 for(int i=pos, lim=Tools.min(bases.length, pos+4); i<lim; i++){ 1814 if(bases[i]==base){count++;} 1815 else{break;} 1816 } 1817 return count; 1818 } 1819 biasScore(double properPairRate, int scafEndDist)1820 public double biasScore(double properPairRate, int scafEndDist){ 1821 double strandBias=strandBiasScore(scafEndDist); 1822 double readBias=readBiasScore(properPairRate); 1823 return Math.sqrt(strandBias*readBias); 1824 } 1825 strandBiasScore(int scafEndDist)1826 public double strandBiasScore(int scafEndDist){ 1827 int plus=allelePlusCount(); 1828 int minus=alleleMinusCount(); 1829 final double x=eventProb(plus, minus); 1830 final double x2=modifyByEndDist(x, scafEndDist); 1831 1832 //TODO: 1833 //This should correct strand bias based on whether the overall read distribution is biased, 1834 //for homozygous variations when minus coverage is being tracked 1835 1836 double result=x2; 1837 if(plus+minus>=20 && x2<0.9){//This block added based on analyzing NIST GIAB diff 1838 int min=Tools.min(plus, minus); 1839 int max=Tools.max(plus, minus); 1840 if(min>1 && min>0.06f*max){//Seen on both strands; relax stringency 1841 double y=0.15+(0.2*min)/max; //Higher constants (maximum of 1.0 combined) push the end result closer to 1 1842 result=y+(1-y)*x2; 1843 } 1844 } 1845 // if(start==15699277){System.err.println(plus+", "+minus+", "+x+", "+x2+", "+result);} 1846 return result; 1847 } 1848 1849 //This seems to cause trouble with some NIST GIAB Illumina data... not clear why readBiasScore(double properPairRate)1850 public double readBiasScore(double properPairRate){ 1851 if(properPairRate<0.5){return 0.95f;} 1852 final int r1=r1AlleleCount(), r2=r2AlleleCount(); 1853 final double x=eventProb(r1, r2); 1854 1855 //This block added based on analyzing NIST GIAB diff 1856 final double x2=0.10+0.90*x; 1857 double result=x2; 1858 if(r1+r2>=20 && x2<0.9){ 1859 int min=Tools.min(r1, r2); 1860 int max=Tools.max(r1, r2); 1861 if(min>1 && min>0.07f*max){//Seen on both reads; relax stringency 1862 double y=0.15+(0.2*min)/max; //Higher constants (maximum of 1.0 combined) push the end result closer to 1 1863 result=y+(1-y)*x2; 1864 } 1865 } 1866 return result; 1867 } 1868 1869 /** Adjusted probability of a binomial event being at least this lopsided. */ eventProb(int a, int b)1870 public static double eventProb(int a, int b){ 1871 1872 double allowedBias=0.75; 1873 double slopMult=0.95; 1874 1875 double n=a+b; 1876 double k=Tools.min(a, b); 1877 1878 double slop=n*(allowedBias*0.5); 1879 double dif=n-k*2; 1880 dif=dif-(Tools.min(slop, dif)*slopMult); 1881 n=k*2+dif; 1882 // k=n*0.5-dif; 1883 assert(k<=n*0.5) : a+", "+b+", "+n+", "+k+", "+slop+", "+dif; 1884 1885 if(n>PROBLEN){ 1886 double mult=PROBLEN/n; 1887 n=PROBLEN; 1888 k=(int)(k*mult); 1889 } 1890 1891 int n2=(int)Math.round(n); 1892 int k2=Tools.min(n2/2, (int)(k+1)); 1893 1894 1895 // if(a+b>3){ 1896 // System.err.println(n+", "+k+", "+n2+", "+k2); 1897 // } 1898 1899 double result=prob[n2][k2]; 1900 if(result<1 || a==b || a+1==b || a==b+1){return result;} 1901 1902 double slope=Tools.min(a, b)/(double)Tools.max(a, b); 1903 return (0.998+slope*0.002); 1904 } 1905 1906 /*--------------------------------------------------------------*/ 1907 /*---------------- Getters ----------------*/ 1908 /*--------------------------------------------------------------*/ 1909 allelePlusCount()1910 public int allelePlusCount(){return r1plus+r2plus;} alleleMinusCount()1911 public int alleleMinusCount(){return r1minus+r2minus;} r1AlleleCount()1912 public int r1AlleleCount(){return r1plus+r1minus;} r2AlleleCount()1913 public int r2AlleleCount(){return r2plus+r2minus;} alleleCount()1914 public int alleleCount(){return r1plus+r1minus+r2plus+r2minus;} 1915 alleleFraction()1916 public double alleleFraction(){ 1917 int count=alleleCount(); 1918 int cov=Tools.max(count, coverage, 1); 1919 return count/(double)cov; 1920 } 1921 strandRatio()1922 public double strandRatio(){ 1923 int plus=allelePlusCount(); 1924 int minus=alleleMinusCount(); 1925 if(plus==minus){return 1;} 1926 return (Tools.min(plus, minus)+1)/(double)Tools.max(plus, minus); 1927 } baseQAvg()1928 public double baseQAvg(){return baseQSum/(double)alleleCount();} mapQAvg()1929 public double mapQAvg(){return mapQSum/(double)alleleCount();} edistAvg()1930 public double edistAvg(){return endDistSum/(double)alleleCount();} identityAvg()1931 public double identityAvg(){return idSum/(double)alleleCount();} lengthAvg()1932 public double lengthAvg(){return lengthSum/(double)alleleCount();} properPairRate()1933 public double properPairRate(){return properPairCount/(double)alleleCount();} 1934 1935 setCoverage(int coverage_, int minusCoverage_)1936 public void setCoverage(int coverage_, int minusCoverage_){ 1937 coverage=coverage_; 1938 minusCoverage=minusCoverage_; 1939 } 1940 coverage()1941 public int coverage(){ 1942 assert(coverage>-1) : coverage+", "+this; 1943 return coverage; 1944 } 1945 hasCoverage()1946 public boolean hasCoverage(){ 1947 return coverage>-1; 1948 } 1949 contigEndDist(ScafMap map)1950 public int contigEndDist(ScafMap map){ 1951 Scaffold scaf=map.getScaffold(scafnum); 1952 int len=scaf.length; 1953 byte[] bases=scaf.bases; 1954 1955 int scafEndDist=Tools.max(0, Tools.min(start, len-stop)); 1956 if(bases==null || nScan<1){return scafEndDist;} 1957 int limit=Tools.min(nScan, scafEndDist); 1958 int contigEndDist=leftContigEndDist(bases, limit); 1959 limit=Tools.min(limit, contigEndDist); 1960 contigEndDist=rightContigEndDist(bases, limit); 1961 return Tools.min(scafEndDist, contigEndDist); 1962 } 1963 leftContigEndDist(byte[] bases, int maxDist)1964 public int leftContigEndDist(byte[] bases, int maxDist){ 1965 if(start>=bases.length){return Tools.min(bases.length, maxDist+1);} 1966 int ns=0; 1967 for(int i=start, lim=Tools.max(0, start-maxDist); i>=lim; i--){ 1968 if(AminoAcid.isFullyDefined(bases[i])){ 1969 ns=0; 1970 }else{ 1971 ns++; 1972 if(ns>=10){ 1973 int x=start-i-ns+1; 1974 assert(x>=0); 1975 return x; 1976 } 1977 } 1978 } 1979 return maxDist+1; 1980 } 1981 rightContigEndDist(byte[] bases, int maxDist)1982 public int rightContigEndDist(byte[] bases, int maxDist){ 1983 if(stop<0){return Tools.min(bases.length, maxDist+1);} 1984 int ns=0; 1985 for(int i=stop, lim=Tools.min(bases.length-1, stop+maxDist); i<=lim; i++){ 1986 if(AminoAcid.isFullyDefined(bases[i])){ 1987 ns=0; 1988 }else{ 1989 ns++; 1990 if(ns>=10){ 1991 int x=i-stop-ns+1; 1992 assert(x>=0); 1993 return x; 1994 } 1995 } 1996 } 1997 return maxDist+1; 1998 } 1999 scafName()2000 public String scafName(){ 2001 return scafName(ScafMap.defaultScafMap()); 2002 } 2003 scafName(ScafMap map)2004 public String scafName(ScafMap map){ 2005 return map.getScaffold(scafnum).name; 2006 } 2007 setForced(boolean b)2008 public Var setForced(boolean b){forced=b; return this;} forced()2009 public boolean forced(){return forced;} 2010 setFlagged(boolean b)2011 public Var setFlagged(boolean b){flagged=b; return this;} flagged()2012 public boolean flagged(){return flagged;} 2013 2014 ins()2015 public final boolean ins(){return type==INS;} del()2016 public final boolean del(){return type==DEL;} indel()2017 public final boolean indel(){return type==INS || type==DEL;} sub()2018 public final boolean sub(){return type==SUB;} complex()2019 public final boolean complex(){return type==COMPLEX;} frameshift()2020 public final boolean frameshift(){ 2021 int delta=Tools.absdif(reflen(), readlen()); 2022 return delta%3!=0; 2023 } 2024 2025 /*--------------------------------------------------------------*/ 2026 /*---------------- Final Fields ----------------*/ 2027 /*--------------------------------------------------------------*/ 2028 2029 public final int scafnum; 2030 public final int start; 2031 public final int stop; //Half-open, so stop is always after start except for insertions 2032 public final byte[] allele; 2033 public final int hashcode; 2034 public final int type; 2035 2036 /*--------------------------------------------------------------*/ 2037 /*---------------- Mutable Fields ----------------*/ 2038 /*--------------------------------------------------------------*/ 2039 2040 private int coverage=-1; 2041 private int minusCoverage=-1; 2042 2043 int r1plus; 2044 int r1minus; 2045 int r2plus; 2046 int r2minus; 2047 int properPairCount; 2048 2049 long mapQSum; 2050 public int mapQMax; 2051 2052 long baseQSum; 2053 public int baseQMax; 2054 2055 long endDistSum; 2056 public int endDistMax; 2057 2058 long idSum; 2059 int idMax; 2060 2061 long lengthSum; 2062 2063 int nearbyVarCount=-1; 2064 2065 double revisedAlleleFraction=-1; 2066 private boolean forced=false; 2067 private boolean flagged=false; 2068 2069 /*--------------------------------------------------------------*/ 2070 /*---------------- Static Fields ----------------*/ 2071 /*--------------------------------------------------------------*/ 2072 2073 public static boolean CALL_INS=true; 2074 public static boolean CALL_DEL=true; 2075 public static boolean CALL_SUB=true; 2076 public static boolean CALL_NOCALL=false; 2077 public static boolean CALL_JUNCTION=false; 2078 2079 public static boolean extendedText=true; 2080 2081 public static boolean noPassDotGenotype=false; 2082 2083 public static boolean useHomopolymer=true; 2084 public static boolean useIdentity=true; 2085 public static boolean usePairing=true; 2086 public static boolean useBias=true; 2087 public static boolean useEdist=true; 2088 public static boolean doNscan=true; 2089 public static double lowCoveragePenalty=0.8; 2090 public static int nScan=600; 2091 public static int minEndDistForBias=200; 2092 2093 public static int MIN_VAR_COPIES=0; 2094 2095 final static int PROBLEN=100; 2096 public static final boolean UPPER_CASE_ALLELES=true; 2097 /** Verify that there are no variants with alt equal to ref */ 2098 private static final boolean TEST_REF_VARIANTS=false; 2099 2100 // static final String[] typeArray=new String[] {"NOCALL","SUB","DEL","INS"}; 2101 // static final int NOCALL=0, SUB=1, DEL=2, INS=3; 2102 2103 private static final byte colon=';'; 2104 private static final byte tab='\t'; 2105 public static final String[] typeArray=new String[] {"INS","NOCALL","SUB","DEL","LJUNCT","RJUNCT","BJUNCT","MULTI","COMPLEX"}; 2106 /** Insertion */ 2107 public static final int INS=0; 2108 /** No-call (length-neutral, ref N) */ 2109 public static final int NOCALL=1; 2110 /** Substitution (length-neutral) */ 2111 public static final int SUB=2; 2112 /** Deletion */ 2113 public static final int DEL=3; 2114 /** Left-junction (left side clipped, right side normal) */ 2115 public static final int LJUNCT=4; 2116 /** Right-junction (right side clipped, left side normal) */ 2117 public static final int RJUNCT=5; 2118 /** Bidirectional junction (both sides clipped) */ 2119 public static final int BJUNCT=6; 2120 /** Multiallelic (dominates all other types) */ 2121 public static final int MULTI=7; 2122 /** Left-junction (left-side clipped, right side normal) */ 2123 public static final int COMPLEX=8; 2124 2125 /** Number of variant types; same as typeArray.length */ 2126 public static final int VAR_TYPES=COMPLEX+1; 2127 2128 /** Initial letter for abbreviating type names */ 2129 static final byte[] typeInitialArray=new byte[128]; 2130 2131 static final byte[] AL_0=new byte[0]; 2132 static final byte[] AL_A=new byte[] {(byte)'A'}; 2133 static final byte[] AL_C=new byte[] {(byte)'C'}; 2134 static final byte[] AL_G=new byte[] {(byte)'G'}; 2135 static final byte[] AL_T=new byte[] {(byte)'T'}; 2136 static final byte[] AL_N=new byte[] {(byte)'N'}; 2137 static final byte[][] AL_MAP=makeMap(); 2138 static final int[] codes=makeCodes(); 2139 2140 //For 64-bit hash keys 2141 private static final int typeBits=2; 2142 private static final int alleleBits=6; 2143 private static final int scafBits=16; 2144 private static final int lenBits=8; 2145 2146 private static final int alleleShift=typeBits; 2147 private static final int scafShift=alleleShift+alleleBits; 2148 private static final int lenShift=scafShift+scafBits; 2149 private static final int startShift=lenShift+lenBits; 2150 2151 2152 // public static ScafMap scafMap; 2153 makeCodes()2154 static final int[] makeCodes(){ 2155 Random randy=new Random(1); 2156 int[] array=new int[256]; 2157 for(int i=0; i<array.length; i++){ 2158 array[i]=randy.nextInt(); 2159 } 2160 return array; 2161 } 2162 makeMap()2163 static final byte[][] makeMap(){ 2164 byte[][] map=new byte[128][]; 2165 map[0]=map['.']=map['\t']=AL_0; 2166 map['A']=map['a']=AL_A; 2167 map['C']=map['c']=AL_C; 2168 map['G']=map['g']=AL_G; 2169 map['T']=map['t']=AL_T; 2170 map['N']=map['n']=AL_N; 2171 for(int i=0; i<map.length; i++){ 2172 if(map[i]==null){map[i]=new byte[(byte)i];} 2173 } 2174 return map; 2175 } 2176 2177 /** factorial[n]=n! */ 2178 private static final double[] factorial=makeFactorialArray(PROBLEN+1); 2179 /** binomial[n][k] = combinations in n pick k (no replacement, order-insensitive) */ 2180 private static final double[][] binomial=makeBinomialMatrix(PROBLEN+1); 2181 /** prob[n][k] = probability of an event this lopsided or worse. */ 2182 private static final double[][] prob=makeProbMatrix(PROBLEN+1); 2183 makeFactorialArray(int len)2184 private static double[] makeFactorialArray(int len) { 2185 double[] x=new double[len]; 2186 x[0]=1; 2187 for(int i=1; i<len; i++){ 2188 x[i]=x[i-1]*i; 2189 } 2190 return x; 2191 } 2192 makeBinomialMatrix(int len)2193 private static double[][] makeBinomialMatrix(int len) { 2194 double[][] matrix=new double[len][]; 2195 for(int n=0; n<len; n++){ 2196 final int kmax=n/2; 2197 final double nf=factorial[n]; 2198 matrix[n]=new double[kmax+1]; 2199 for(int k=0; k<=kmax; k++){ 2200 final double kf=factorial[k]; 2201 final double nmkf=factorial[n-k]; 2202 double combinations=nf/kf; 2203 combinations=combinations/nmkf; 2204 matrix[n][k]=combinations; 2205 } 2206 } 2207 return matrix; 2208 } 2209 2210 /** Combinations in n pick k, allowing somewhat higher max values... perhaps? */ bigBinomial(int n, int k)2211 private static double bigBinomial(int n, int k){ 2212 2213 double combinations=1; 2214 for(int a=-1, b=-1, c=-1; a<=n || b<=k || c<=n-k; ) { 2215 double mult=1; 2216 if(combinations<1000000000 && a<=n){//Increase combinations 2217 a++; 2218 mult=Tools.max(1, a); 2219 }else if(b<=k){//Decrease combinations 2220 b++; 2221 mult=1.0/Tools.max(1, a); 2222 }else if(c<=n-k){//Decrease combinations 2223 c++; 2224 mult=1.0/Tools.max(1, a); 2225 }else{ 2226 assert(false) : a+", "+b+", "+c+", "+n+", "+k+", "+(n-k)+", "+combinations; 2227 } 2228 combinations*=mult; 2229 assert(combinations<Double.MAX_VALUE) : a+", "+b+", "+c+", "+n+", "+k+", "+(n-k)+", "+combinations; 2230 } 2231 2232 return combinations; 2233 } 2234 2235 private static double[][] makeProbMatrix(int len) { 2236 double[][] matrix=new double[len][]; 2237 double mult=2; 2238 for(int n=0; n<len; n++){ 2239 final int kmax=n/2; 2240 final double[] array=matrix[n]=new double[kmax+1]; 2241 for(int k=0; k<=kmax; k++){ 2242 final double combinations=binomial[n][k]; 2243 array[k]=combinations*mult; 2244 } 2245 // if(n<=12){System.err.println(Arrays.toString(array));} 2246 for(int k=0; k<=kmax; k++){ 2247 array[k]=Tools.min(1, (k==0 ? 0 : array[k-1])+array[k]); 2248 } 2249 // if(n<=12){System.err.println(Arrays.toString(array));} 2250 // assert(array[kmax]==1) : Arrays.toString(array); 2251 mult*=0.5; 2252 // if(n<=12){System.err.println();} 2253 } 2254 return matrix; 2255 } 2256 2257 static { 2258 Arrays.fill(typeInitialArray, (byte)-1); 2259 typeInitialArray['I']=INS; 2260 typeInitialArray['N']=NOCALL; 2261 typeInitialArray['S']=SUB; 2262 typeInitialArray['D']=DEL; 2263 typeInitialArray['L']=LJUNCT; 2264 typeInitialArray['R']=RJUNCT; 2265 typeInitialArray['B']=BJUNCT; 2266 typeInitialArray['M']=MULTI; 2267 typeInitialArray['C']=COMPLEX; 2268 } 2269 2270 public static final String varFormat="1.3"; 2271 } 2272 2273