1 package kmer; 2 3 import java.io.PrintStream; 4 import java.util.ArrayList; 5 import java.util.Arrays; 6 import java.util.concurrent.ArrayBlockingQueue; 7 8 import dna.AminoAcid; 9 import fileIO.FileFormat; 10 import fileIO.ReadWrite; 11 import fileIO.TextStreamWriter; 12 import jgi.BBMerge; 13 import shared.PreParser; 14 import shared.Shared; 15 import shared.Timer; 16 import shared.Tools; 17 import stream.ConcurrentReadInputStream; 18 import stream.Read; 19 import structures.IntList; 20 import structures.ListNum; 21 22 /** 23 * @author Brian Bushnell 24 * @date Mar 4, 2015 25 * 26 */ 27 public class TableLoaderLockFree { 28 29 /*--------------------------------------------------------------*/ 30 /*---------------- Initialization ----------------*/ 31 /*--------------------------------------------------------------*/ 32 33 /** 34 * Code entrance from the command line. 35 * @param args Command line arguments 36 */ main(String[] args)37 public static void main(String[] args){ 38 39 {//Preparse block for help, config files, and outstream 40 PreParser pp=new PreParser(args, null, false); 41 args=pp.args; 42 outstream=pp.outstream; 43 } 44 45 Timer t=new Timer(); 46 47 AbstractKmerTable[] tables=makeTables(AbstractKmerTable.ARRAY1D, 12, -1L, false, 1.0); 48 49 int k=31; 50 int mink=0; 51 int speed=0; 52 int hdist=0; 53 int edist=0; 54 boolean rcomp=true; 55 boolean maskMiddle=false; 56 57 //Create a new Loader instance 58 TableLoaderLockFree loader=new TableLoaderLockFree(tables, k, mink, speed, hdist, edist, rcomp, maskMiddle); 59 loader.setRefSkip(0); 60 loader.hammingDistance2=0; 61 loader.editDistance2=0; 62 loader.storeMode(SET_IF_NOT_PRESENT); 63 64 ///And run it 65 String[] refs=args; 66 String[] literals=null; 67 boolean keepNames=false; 68 boolean useRefNames=false; 69 long kmers=loader.processData(refs, literals, keepNames, useRefNames, false); 70 t.stop(); 71 72 outstream.println("Time: \t"+t); 73 outstream.println("Return: \t"+kmers); 74 outstream.println("refKmers: \t"+loader.refKmers); 75 outstream.println("refBases: \t"+loader.refBases); 76 outstream.println("refReads: \t"+loader.refReads); 77 78 //Close the print stream if it was redirected 79 Shared.closeStream(outstream); 80 } 81 TableLoaderLockFree(AbstractKmerTable[] tables_, int k_)82 public TableLoaderLockFree(AbstractKmerTable[] tables_, int k_){ 83 this(tables_, k_, 0, 0, 0, 0, true, false); 84 } 85 TableLoaderLockFree(AbstractKmerTable[] tables_, int k_, int mink_, int speed_, int hdist_, int edist_, boolean rcomp_, boolean maskMiddle_)86 public TableLoaderLockFree(AbstractKmerTable[] tables_, int k_, int mink_, int speed_, int hdist_, int edist_, boolean rcomp_, boolean maskMiddle_){ 87 tables=tables_; 88 k=k_; 89 k2=k-1; 90 mink=mink_; 91 rcomp=rcomp_; 92 useShortKmers=(mink>0 && mink<k); 93 speed=speed_; 94 hammingDistance=hdist_; 95 editDistance=edist_; 96 middleMask=maskMiddle ? ~(3L<<(2*(k/2))) : -1L; 97 } 98 99 100 /*--------------------------------------------------------------*/ 101 /*---------------- Outer Methods ----------------*/ 102 /*--------------------------------------------------------------*/ 103 104 105 // public static AbstractKmerTable[] makeTables(int tableType, int initialSize, long coreMask, boolean growable){ 106 // return AbstractKmerTable.preallocate(WAYS, tableType, initialSize, coreMask, growable); 107 // } 108 109 // public static AbstractKmerTable[] makeTables(int tableType, int[] schedule, long coreMask){ 110 // return AbstractKmerTable.preallocate(WAYS, tableType, schedule, coreMask); 111 // } 112 makeTables(int tableType, int bytesPerKmer, long coreMask, boolean prealloc, double memRatio)113 public static AbstractKmerTable[] makeTables(int tableType, int bytesPerKmer, long coreMask, 114 boolean prealloc, double memRatio){ 115 ScheduleMaker scheduleMaker=new ScheduleMaker(WAYS, bytesPerKmer, prealloc, memRatio); 116 int[] schedule=scheduleMaker.makeSchedule(); 117 return AbstractKmerTable.preallocate(WAYS, tableType, schedule, coreMask); 118 } 119 120 ScheduleMaker scheduleMaker=new ScheduleMaker(WAYS, 12, false, 0.8); 121 int[] schedule=scheduleMaker.makeSchedule(); 122 processData(String[] ref, String[] literal, boolean keepNames, boolean useRefNames, boolean ecc_)123 public long processData(String[] ref, String[] literal, boolean keepNames, boolean useRefNames, boolean ecc_){ 124 125 scaffoldNames=null; 126 refNames=null; 127 refScafCounts=null; 128 scaffoldLengths=null; 129 ecc=ecc_; 130 131 if(keepNames){ 132 scaffoldNames=new ArrayList<String>(); 133 refNames=new ArrayList<String>(); 134 scaffoldLengths=new IntList(); 135 136 if(ref!=null){ 137 for(String s : ref){refNames.add(s);} 138 } 139 if(literal!=null){refNames.add("literal");} 140 refScafCounts=new int[refNames.size()]; 141 142 if(useRefNames){toRefNames();} 143 } 144 145 return spawnLoadThreads(ref, literal); 146 } 147 setRefSkip(int x)148 public void setRefSkip(int x){setRefSkip(x, x);} 149 setRefSkip(int min, int max)150 public void setRefSkip(int min, int max){ 151 max=Tools.max(min, max); 152 if(min==max){ 153 minRefSkip=maxRefSkip=min; 154 }else{ 155 minRefSkip=min; 156 maxRefSkip=max; 157 } 158 variableRefSkip=(minRefSkip!=maxRefSkip); 159 } 160 storeMode(final int x)161 public void storeMode(final int x){ 162 assert(x==SET_IF_NOT_PRESENT || x==SET_ALWAYS || x==INCREMENT); 163 storeMode=x; 164 } 165 166 /*--------------------------------------------------------------*/ 167 /*---------------- Inner Methods ----------------*/ 168 /*--------------------------------------------------------------*/ 169 170 171 /** 172 * Fills tables with kmers from references, using multiple LoadThread. 173 * @return Number of kmers stored. 174 */ spawnLoadThreads(String[] ref, String[] literal)175 private long spawnLoadThreads(String[] ref, String[] literal){ 176 Timer t=new Timer(); 177 if((ref==null || ref.length<1) && (literal==null || literal.length<1)){return 0;} 178 long added=0; 179 180 /* Create load threads */ 181 LoadThread[] loaders=new LoadThread[WAYS]; 182 for(int i=0; i<loaders.length; i++){ 183 loaders[i]=new LoadThread(i); 184 loaders[i].start(); 185 } 186 187 /* For each reference file... */ 188 int refNum=0; 189 if(ref!=null){ 190 for(String refname : ref){ 191 192 /* Start an input stream */ 193 FileFormat ff=FileFormat.testInput(refname, FileFormat.FASTA, null, false, true); 194 ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(-1L, false, ff, null, null, null, Shared.USE_MPI, true); 195 cris.start(); //4567 196 ListNum<Read> ln=cris.nextList(); 197 ArrayList<Read> reads=(ln!=null ? ln.list : null); 198 199 /* Iterate through read lists from the input stream */ 200 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning 201 { 202 /* Assign a unique ID number to each scaffold */ 203 ArrayList<Read> reads2=new ArrayList<Read>(reads); 204 if(scaffoldNames!=null){ 205 for(Read r1 : reads2){ 206 final Read r2=r1.mate; 207 final Integer id=scaffoldNames.size(); 208 if(ecc && r1!=null && r2!=null){BBMerge.findOverlapStrict(r1, r2, true);} 209 refScafCounts[refNum]++; 210 scaffoldNames.add(r1.id==null ? id.toString() : r1.id); 211 int len=r1.length(); 212 r1.obj=id; 213 if(r2!=null){ 214 r2.obj=id; 215 len+=r2.length(); 216 } 217 scaffoldLengths.add(len); 218 } 219 } 220 221 if(REPLICATE_AMBIGUOUS){ 222 reads2=Tools.replicateAmbiguous(reads2, Tools.min(k, mink)); 223 } 224 225 /* Send a pointer to the read list to each LoadThread */ 226 for(LoadThread lt : loaders){ 227 boolean b=true; 228 while(b){ 229 try { 230 lt.queue.put(reads2); 231 b=false; 232 } catch (InterruptedException e) { 233 //TODO: This will hang due to still-running threads. 234 throw new RuntimeException(e); 235 } 236 } 237 } 238 } 239 240 /* Dispose of the old list and fetch a new one */ 241 cris.returnList(ln); 242 ln=cris.nextList(); 243 reads=(ln!=null ? ln.list : null); 244 } 245 /* Cleanup */ 246 cris.returnList(ln); 247 errorState|=ReadWrite.closeStream(cris); 248 refNum++; 249 } 250 } 251 252 /* If there are literal sequences to use as references */ 253 if(literal!=null){ 254 ArrayList<Read> list=new ArrayList<Read>(literal.length); 255 if(verbose){outstream.println("Adding literals "+Arrays.toString(literal));} 256 257 /* Assign a unique ID number to each literal sequence */ 258 for(int i=0; i<literal.length; i++){ 259 if(scaffoldNames!=null){ 260 final Integer id=scaffoldNames.size(); 261 final Read r=new Read(literal[i].getBytes(), null, id); 262 refScafCounts[refNum]++; 263 scaffoldNames.add(id.toString()); 264 scaffoldLengths.add(r.length()); 265 r.obj=id; 266 }else{ 267 final Read r=new Read(literal[i].getBytes(), null, i); 268 list.add(r); 269 } 270 } 271 272 if(REPLICATE_AMBIGUOUS){ 273 list=Tools.replicateAmbiguous(list, Tools.min(k, mink)); 274 } 275 276 /* Send a pointer to the read list to each LoadThread */ 277 for(LoadThread lt : loaders){ 278 boolean b=true; 279 while(b){ 280 try { 281 lt.queue.put(list); 282 b=false; 283 } catch (InterruptedException e) { 284 //TODO: This will hang due to still-running threads. 285 throw new RuntimeException(e); 286 } 287 } 288 } 289 } 290 291 /* Signal loaders to terminate */ 292 for(LoadThread lt : loaders){ 293 boolean b=true; 294 while(b){ 295 try { 296 lt.queue.put(POISON); 297 b=false; 298 } catch (InterruptedException e) { 299 //TODO: This will hang due to still-running threads. 300 throw new RuntimeException(e); 301 } 302 } 303 } 304 305 /* Wait for loaders to die, and gather statistics */ 306 for(LoadThread lt : loaders){ 307 while(lt.getState()!=Thread.State.TERMINATED){ 308 try { 309 lt.join(); 310 } catch (InterruptedException e) { 311 // TODO Auto-generated catch block 312 e.printStackTrace(); 313 } 314 } 315 added+=lt.addedT; 316 refKmers+=lt.refKmersT; 317 refBases+=lt.refBasesT; 318 refReads+=lt.refReadsT; 319 } 320 //Correct statistics for number of threads, since each thread processes all reference data 321 refKmers/=WAYS; 322 refBases/=WAYS; 323 refReads/=WAYS; 324 325 t.stop(); 326 if(DISPLAY_PROGRESS){ 327 outstream.println("Added "+added+" kmers; time: \t"+t); 328 Shared.printMemory(); 329 outstream.println(); 330 } 331 332 if(verbose){ 333 TextStreamWriter tsw=new TextStreamWriter("stdout", false, false, false, FileFormat.TEXT); 334 tsw.start(); 335 for(AbstractKmerTable table : tables){ 336 table.dumpKmersAsText(tsw, k, 1, Integer.MAX_VALUE); 337 } 338 tsw.poisonAndWait(); 339 } 340 341 return added; 342 } 343 344 345 346 /** 347 * Fills the scaffold names array with reference names. 348 */ toRefNames()349 public void toRefNames(){ 350 final int numRefs=refNames.size(); 351 for(int r=0, s=1; r<numRefs; r++){ 352 final int scafs=refScafCounts[r]; 353 final int lim=s+scafs; 354 final String name=ReadWrite.stripToCore(refNames.get(r)); 355 // outstream.println("r="+r+", s="+s+", scafs="+scafs+", lim="+lim+", name="+name); 356 while(s<lim){ 357 // outstream.println(r+", "+s+". Setting "+scaffoldNames.get(s)+" -> "+name); 358 scaffoldNames.set(s, name); 359 s++; 360 } 361 } 362 } 363 364 /*--------------------------------------------------------------*/ 365 /*---------------- Inner Classes ----------------*/ 366 /*--------------------------------------------------------------*/ 367 368 /** 369 * Loads kmers into a table. Each thread handles all kmers X such that X%WAYS==tnum. 370 */ 371 private class LoadThread extends Thread{ 372 LoadThread(final int tnum_)373 public LoadThread(final int tnum_){ 374 tnum=tnum_; 375 map=tables[tnum]; 376 } 377 378 /** 379 * Get the next list of reads (or scaffolds) from the queue. 380 * @return List of reads 381 */ fetch()382 private ArrayList<Read> fetch(){ 383 ArrayList<Read> list=null; 384 while(list==null){ 385 try { 386 list=queue.take(); 387 } catch (InterruptedException e) { 388 // TODO Auto-generated catch block 389 e.printStackTrace(); 390 } 391 } 392 return list; 393 } 394 395 @Override run()396 public void run(){ 397 ArrayList<Read> reads=fetch(); 398 while(reads!=POISON){ 399 for(Read r1 : reads){ 400 assert(r1.pairnum()==0); 401 final Read r2=r1.mate; 402 403 addedT+=addToMap(r1, minRefSkip); 404 if(r2!=null){addedT+=addToMap(r2, minRefSkip);} 405 } 406 reads=fetch(); 407 } 408 409 if(map.canRebalance() && map.size()>2L*map.arrayLength()){ 410 map.rebalance(); 411 } 412 } 413 414 /** 415 * @param r The current read to process 416 * @param skip Number of bases to skip between kmers 417 * @return Number of kmers stored 418 */ addToMap(Read r, int skip)419 private long addToMap(Read r, int skip){ 420 if(variableRefSkip){ 421 int rblen=r.length(); 422 skip=(rblen>20000000 ? k : rblen>5000000 ? 11 : rblen>500000 ? 2 : 0); 423 skip=Tools.mid(minRefSkip, maxRefSkip, skip); 424 } 425 final byte[] bases=r.bases; 426 final int shift=2*k; 427 final int shift2=shift-2; 428 final long mask=(shift>63 ? -1L : ~((-1L)<<shift)); 429 final long kmask=kMasks[k]; 430 long kmer=0; 431 long rkmer=0; 432 long added=0; 433 int len=0; 434 435 if(bases!=null){ 436 refReadsT++; 437 refBasesT+=bases.length; 438 } 439 if(bases==null || bases.length<k){return 0;} 440 441 final int id=(r.obj==null ? 1 : ((Integer)r.obj).intValue()); 442 443 if(skip>1){ //Process while skipping some kmers 444 for(int i=0; i<bases.length; i++){ 445 byte b=bases[i]; 446 long x=AminoAcid.baseToNumber[b]; 447 long x2=AminoAcid.baseToComplementNumber[b]; 448 kmer=((kmer<<2)|x)&mask; 449 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; 450 if(x<0){len=0; rkmer=0;}else{len++;} 451 if(verbose){outstream.println("Scanning1 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} 452 if(len>=k){ 453 refKmersT++; 454 if(len%skip==0){ 455 final long extraBase=(i>=bases.length-1 ? -1 : AminoAcid.baseToNumber[bases[i+1]]); 456 added+=addToMap(kmer, rkmer, k, extraBase, id, kmask, hammingDistance, editDistance); 457 if(useShortKmers){ 458 if(i==k2){added+=addToMapRightShift(kmer, rkmer, id);} 459 if(i==bases.length-1){added+=addToMapLeftShift(kmer, rkmer, extraBase, id);} 460 } 461 } 462 } 463 } 464 }else{ //Process all kmers 465 for(int i=0; i<bases.length; i++){ 466 byte b=bases[i]; 467 long x=AminoAcid.baseToNumber[b]; 468 long x2=AminoAcid.baseToComplementNumber[b]; 469 kmer=((kmer<<2)|x)&mask; 470 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; 471 if(x<0){len=0; rkmer=0;}else{len++;} 472 if(verbose){outstream.println("Scanning2 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} 473 if(len>=k){ 474 refKmersT++; 475 final long extraBase=(i>=bases.length-1 ? -1 : AminoAcid.baseToNumber[bases[i+1]]); 476 final long atm=addToMap(kmer, rkmer, k, extraBase, id, kmask, hammingDistance, editDistance); 477 added+=atm; 478 // assert(false) : atm+", "+map.contains(toValue(kmer, rkmer, kmask)); 479 if(useShortKmers){ 480 if(i==k2){added+=addToMapRightShift(kmer, rkmer, id);} 481 if(i==bases.length-1){added+=addToMapLeftShift(kmer, rkmer, extraBase, id);} 482 } 483 } 484 } 485 } 486 return added; 487 } 488 489 490 /** 491 * Adds short kmers on the left end of the read. 492 * @param kmer Forward kmer 493 * @param rkmer Reverse kmer 494 * @param extraBase Base added to end in case of deletions 495 * @param id Scaffold number 496 * @return Number of kmers stored 497 */ addToMapLeftShift(long kmer, long rkmer, final long extraBase, final int id)498 private long addToMapLeftShift(long kmer, long rkmer, final long extraBase, final int id){ 499 if(verbose){outstream.println("addToMapLeftShift");} 500 long added=0; 501 for(int i=k-1; i>=mink; i--){ 502 kmer=kmer&rightMasks[i]; 503 rkmer=rkmer>>>2; 504 long x=addToMap(kmer, rkmer, i, extraBase, id, kMasks[i], hammingDistance2, editDistance2); 505 added+=x; 506 if(verbose){ 507 if((toValue(kmer, rkmer, kMasks[i]))%WAYS==tnum){ 508 outstream.println("added="+x+"; i="+i+"; tnum="+tnum+"; Added left-shift kmer "+AminoAcid.kmerToString(kmer&~kMasks[i], i)+"; value="+(toValue(kmer, rkmer, kMasks[i]))+"; kmer="+kmer+"; rkmer="+rkmer+"; kmask="+kMasks[i]+"; rightMasks[i+1]="+rightMasks[i+1]); 509 outstream.println("i="+i+"; tnum="+tnum+"; Looking for left-shift kmer "+AminoAcid.kmerToString(kmer&~kMasks[i], i)); 510 final long value=toValue(kmer, rkmer, kMasks[i]); 511 if(map.contains(value)){outstream.println("Found "+value);} 512 } 513 } 514 } 515 return added; 516 } 517 518 519 /** 520 * Adds short kmers on the right end of the read. 521 * @param kmer Forward kmer 522 * @param rkmer Reverse kmer 523 * @param id Scaffold number 524 * @return Number of kmers stored 525 */ addToMapRightShift(long kmer, long rkmer, final int id)526 private long addToMapRightShift(long kmer, long rkmer, final int id){ 527 if(verbose){outstream.println("addToMapRightShift");} 528 long added=0; 529 for(int i=k-1; i>=mink; i--){ 530 long extraBase=kmer&3L; 531 kmer=kmer>>>2; 532 rkmer=rkmer&rightMasks[i]; 533 // assert(Long.numberOfLeadingZeros(kmer)>=2*(32-i)) : Long.numberOfLeadingZeros(kmer)+", "+i+", "+kmer+", "+kMasks[i]; 534 // assert(Long.numberOfLeadingZeros(rkmer)>=2*(32-i)) : Long.numberOfLeadingZeros(rkmer)+", "+i+", "+rkmer+", "+kMasks[i]; 535 long x=addToMap(kmer, rkmer, i, extraBase, id, kMasks[i], hammingDistance2, editDistance2); 536 added+=x; 537 if(verbose){ 538 if((toValue(kmer, rkmer, kMasks[i]))%WAYS==tnum){ 539 outstream.println("added="+x+"; i="+i+"; tnum="+tnum+"; Added right-shift kmer "+AminoAcid.kmerToString(kmer&~kMasks[i], i)+"; value="+(toValue(kmer, rkmer, kMasks[i]))+"; kmer="+kmer+"; rkmer="+rkmer+"; kmask="+kMasks[i]+"; rightMasks[i+1]="+rightMasks[i+1]); 540 outstream.println("i="+i+"; tnum="+tnum+"; Looking for right-shift kmer "+AminoAcid.kmerToString(kmer&~kMasks[i], i)); 541 final long value=toValue(kmer, rkmer, kMasks[i]); 542 if(map.contains(value)){outstream.println("Found "+value);} 543 } 544 } 545 } 546 return added; 547 } 548 549 550 /** 551 * Adds this kmer to the table, including any mutations implied by editDistance or hammingDistance. 552 * @param kmer Forward kmer 553 * @param rkmer Reverse kmer 554 * @param len Kmer length 555 * @param extraBase Base added to end in case of deletions 556 * @param id Scaffold number 557 * @param kmask0 558 * @return Number of kmers stored 559 */ addToMap(final long kmer, final long rkmer, final int len, final long extraBase, final int id, final long kmask0, final int hdist, final int edist)560 private long addToMap(final long kmer, final long rkmer, final int len, final long extraBase, final int id, final long kmask0, final int hdist, final int edist){ 561 562 assert(kmask0==kMasks[len]) : kmask0+", "+len+", "+kMasks[len]+", "+Long.numberOfTrailingZeros(kmask0)+", "+Long.numberOfTrailingZeros(kMasks[len]); 563 564 if(verbose){outstream.println("addToMap_A; len="+len+"; kMasks[len]="+kMasks[len]);} 565 assert((kmer&kmask0)==0); 566 final long added; 567 if(hdist==0){ 568 final long key=toValue(kmer, rkmer, kmask0); 569 if(failsSpeed(key)){return 0;} 570 if(key%WAYS!=tnum){return 0;} 571 if(verbose){outstream.println("addToMap_B: "+AminoAcid.kmerToString(kmer&~kMasks[len], len)+" = "+key);} 572 if(storeMode==SET_IF_NOT_PRESENT){ 573 added=map.setIfNotPresent(key, id); 574 }else if(storeMode==SET_ALWAYS){ 575 added=map.set(key, id); 576 }else{ 577 assert(storeMode==INCREMENT); 578 added=map.increment(key, 1); 579 } 580 }else if(edist>0){ 581 // long extraBase=(i>=bases.length-1 ? -1 : AminoAcid.baseToNumber[bases[i+1]]); 582 added=mutate(kmer, rkmer, len, id, edist, extraBase); 583 }else{ 584 added=mutate(kmer, rkmer, len, id, hdist, -1); 585 } 586 if(verbose){outstream.println("addToMap added "+added+" keys.");} 587 return added; 588 } 589 590 /** 591 * Mutate and store this kmer through 'dist' recursions. 592 * @param kmer Forward kmer 593 * @param rkmer Reverse kmer 594 * @param id Scaffold number 595 * @param dist Number of mutations 596 * @param extraBase Base added to end in case of deletions 597 * @return Number of kmers stored 598 */ mutate(final long kmer, final long rkmer, final int len, final int id, final int dist, final long extraBase)599 private long mutate(final long kmer, final long rkmer, final int len, final int id, final int dist, final long extraBase){ 600 long added=0; 601 602 final long key=toValue(kmer, rkmer, kMasks[len]); 603 604 if(verbose){outstream.println("mutate_A; len="+len+"; kmer="+kmer+"; rkmer="+rkmer+"; kMasks[len]="+kMasks[len]);} 605 if(key%WAYS==tnum){ 606 if(verbose){outstream.println("mutate_B: "+AminoAcid.kmerToString(kmer&~kMasks[len], len)+" = "+key);} 607 int x; 608 if(storeMode==SET_IF_NOT_PRESENT){ 609 x=map.setIfNotPresent(key, id); 610 }else if(storeMode==SET_ALWAYS){ 611 x=map.set(key, id); 612 }else{ 613 assert(storeMode==INCREMENT); 614 x=map.increment(key, 1); 615 x=(x==1 ? 1 : 0); 616 } 617 if(verbose){outstream.println("mutate_B added "+x+" keys.");} 618 added+=x; 619 assert(map.contains(key)); 620 } 621 622 if(dist>0){ 623 final int dist2=dist-1; 624 625 //Sub 626 for(int j=0; j<4; j++){ 627 for(int i=0; i<len; i++){ 628 final long temp=(kmer&clearMasks[i])|setMasks[j][i]; 629 if(temp!=kmer){ 630 long rtemp=AminoAcid.reverseComplementBinaryFast(temp, len); 631 added+=mutate(temp, rtemp, len, id, dist2, extraBase); 632 } 633 } 634 } 635 636 if(editDistance>0){ 637 //Del 638 if(extraBase>=0 && extraBase<=3){ 639 for(int i=1; i<len; i++){ 640 final long temp=(kmer&leftMasks[i])|((kmer<<2)&rightMasks[i])|extraBase; 641 if(temp!=kmer){ 642 long rtemp=AminoAcid.reverseComplementBinaryFast(temp, len); 643 added+=mutate(temp, rtemp, len, id, dist2, -1); 644 } 645 } 646 } 647 648 //Ins 649 final long eb2=kmer&3; 650 for(int i=1; i<len; i++){ 651 final long temp0=(kmer&leftMasks[i])|((kmer&rightMasks[i])>>2); 652 for(int j=0; j<4; j++){ 653 final long temp=temp0|setMasks[j][i-1]; 654 if(temp!=kmer){ 655 long rtemp=AminoAcid.reverseComplementBinaryFast(temp, len); 656 added+=mutate(temp, rtemp, len, id, dist2, eb2); 657 } 658 } 659 } 660 } 661 662 } 663 664 return added; 665 } 666 667 /*--------------------------------------------------------------*/ 668 669 /** Number of kmers stored by this thread */ 670 public long addedT=0; 671 /** Number of items encountered by this thread */ 672 public long refKmersT=0, refReadsT=0, refBasesT=0; 673 /** Thread number; used to determine which kmers to store */ 674 public final int tnum; 675 /** Buffer of input read lists */ 676 public final ArrayBlockingQueue<ArrayList<Read>> queue=new ArrayBlockingQueue<ArrayList<Read>>(32); 677 678 /** Destination for storing kmers */ 679 private final AbstractKmerTable map; 680 681 } 682 683 /*--------------------------------------------------------------*/ 684 /*---------------- Static Methods ----------------*/ 685 /*--------------------------------------------------------------*/ 686 687 /** 688 * Transforms a kmer into a canonical value stored in the table. Expected to be inlined. 689 * @param kmer Forward kmer 690 * @param rkmer Reverse kmer 691 * @param lengthMask Bitmask with single '1' set to left of kmer 692 * @return Canonical value 693 */ toValue(long kmer, long rkmer, long lengthMask)694 private final long toValue(long kmer, long rkmer, long lengthMask){ 695 assert(lengthMask==0 || (kmer<lengthMask && rkmer<lengthMask)) : lengthMask+", "+kmer+", "+rkmer; 696 long value=(rcomp ? Tools.max(kmer, rkmer) : kmer); 697 return (value&middleMask)|lengthMask; 698 } 699 passesSpeed(long key)700 final boolean passesSpeed(long key){ 701 return speed<1 || ((key&Long.MAX_VALUE)%17)>=speed; 702 } 703 failsSpeed(long key)704 final boolean failsSpeed(long key){ 705 return speed>0 && ((key&Long.MAX_VALUE)%17)<speed; 706 } 707 708 /*--------------------------------------------------------------*/ 709 /*---------------- Fields ----------------*/ 710 /*--------------------------------------------------------------*/ 711 712 /** Has this class encountered errors while processing? */ 713 public boolean errorState=false; 714 715 /** How to associate values with kmers */ 716 private int storeMode=SET_IF_NOT_PRESENT; 717 718 /** Hold kmers. A kmer X such that X%WAYS=Y will be stored in keySets[Y] */ 719 public AbstractKmerTable[] tables; 720 /** A scaffold's name is stored at scaffoldNames.get(id). 721 * scaffoldNames[0] is reserved, so the first id is 1. */ 722 public ArrayList<String> scaffoldNames; 723 /** Names of reference files (refNames[0] is valid). */ 724 public ArrayList<String> refNames; 725 /** Number of scaffolds per reference. */ 726 public int[] refScafCounts; 727 /** scaffoldLengths[id] stores the length of that scaffold */ 728 public IntList scaffoldLengths=new IntList(); 729 730 /** Make the middle base in a kmer a wildcard to improve sensitivity */ 731 public final boolean maskMiddle=false; 732 /** Correct errors via read overlap */ 733 public boolean ecc=false; 734 735 /** Store reference kmers with up to this many substitutions */ 736 public final int hammingDistance; 737 /** Store reference kmers with up to this many edits (including indels) */ 738 public final int editDistance; 739 /** Store short reference kmers with up to this many substitutions */ 740 public int hammingDistance2=-1; 741 /** Store short reference kmers with up to this many edits (including indels) */ 742 public int editDistance2=-1; 743 /** Always skip at least this many consecutive kmers when hashing reference. 744 * 1 means every kmer is used, 2 means every other, etc. */ 745 private int minRefSkip=0; 746 /** Never skip more than this many consecutive kmers when hashing reference. */ 747 private int maxRefSkip=0; 748 749 private boolean variableRefSkip=false; 750 751 /*--------------------------------------------------------------*/ 752 /*---------------- Statistics ----------------*/ 753 /*--------------------------------------------------------------*/ 754 755 long refReads=0; 756 long refBases=0; 757 long refKmers=0; 758 759 long storedKmers=0; 760 761 /*--------------------------------------------------------------*/ 762 /*---------------- Final Primitives ----------------*/ 763 /*--------------------------------------------------------------*/ 764 765 /** Look for reverse-complements as well as forward kmers. Default: true */ 766 private final boolean rcomp; 767 /** AND bitmask with 0's at the middle base */ 768 private final long middleMask; 769 770 /** Normal kmer length */ 771 private final int k; 772 /** k-1; used in some expressions */ 773 private final int k2; 774 /** Shortest kmer to use for trimming */ 775 private final int mink; 776 /** Attempt to match kmers shorter than normal k on read ends when doing kTrimming. */ 777 private final boolean useShortKmers; 778 779 /** Fraction of kmers to skip, 0 to 16 out of 17 */ 780 private final int speed; 781 782 /*--------------------------------------------------------------*/ 783 /*---------------- Static Fields ----------------*/ 784 /*--------------------------------------------------------------*/ 785 786 /** Default initial size of data structures */ 787 private static final int initialSizeDefault=128000; 788 789 /** Number of tables (and threads, during loading) */ 790 private static final int WAYS=7; //123 791 /** Verbose messages */ 792 public static final boolean verbose=false; //123 793 794 /** Print messages to this stream */ 795 private static PrintStream outstream=System.err; 796 /** Display progress messages such as memory usage */ 797 public static boolean DISPLAY_PROGRESS=true; 798 /** Indicates end of input stream */ 799 private static final ArrayList<Read> POISON=new ArrayList<Read>(0); 800 /** Make unambiguous copies of ref sequences with ambiguous bases */ 801 public static boolean REPLICATE_AMBIGUOUS=false; 802 803 /** x&clearMasks[i] will clear base i */ 804 private static final long[] clearMasks; 805 /** x|setMasks[i][j] will set base i to j */ 806 private static final long[][] setMasks; 807 /** x&leftMasks[i] will clear all bases to the right of i (exclusive) */ 808 private static final long[] leftMasks; 809 /** x&rightMasks[i] will clear all bases to the left of i (inclusive) */ 810 private static final long[] rightMasks; 811 /** x|kMasks[i] will set the bit to the left of the leftmost base */ 812 private static final long[] kMasks; 813 814 public static final int SET_IF_NOT_PRESENT=1, SET_ALWAYS=2, INCREMENT=3; 815 816 /*--------------------------------------------------------------*/ 817 /*---------------- Static Initializers ----------------*/ 818 /*--------------------------------------------------------------*/ 819 820 static{ 821 clearMasks=new long[32]; 822 leftMasks=new long[32]; 823 rightMasks=new long[32]; 824 kMasks=new long[32]; 825 setMasks=new long[4][32]; 826 for(int i=0; i<32; i++){ 827 clearMasks[i]=~(3L<<(2*i)); 828 } 829 for(int i=0; i<32; i++){ 830 leftMasks[i]=((-1L)<<(2*i)); 831 } 832 for(int i=0; i<32; i++){ 833 rightMasks[i]=~((-1L)<<(2*i)); 834 } 835 for(int i=0; i<32; i++){ 836 kMasks[i]=((1L)<<(2*i)); 837 } 838 for(int i=0; i<32; i++){ 839 for(long j=0; j<4; j++){ 840 setMasks[(int)j][i]=(j<<(2*i)); 841 } 842 } 843 } 844 845 } 846