1 package jgi; 2 3 import java.io.File; 4 import java.io.PrintStream; 5 import java.util.ArrayList; 6 7 import dna.AminoAcid; 8 import fileIO.ByteFile; 9 import fileIO.FileFormat; 10 import fileIO.ReadWrite; 11 import shared.Parse; 12 import shared.Parser; 13 import shared.PreParser; 14 import shared.ReadStats; 15 import shared.Shared; 16 import shared.Timer; 17 import shared.Tools; 18 import stream.ConcurrentReadInputStream; 19 import stream.ConcurrentReadOutputStream; 20 import stream.FASTQ; 21 import stream.FastaReadInputStream; 22 import stream.Read; 23 import structures.ByteBuilder; 24 import structures.ListNum; 25 import structures.LongListSet; 26 import structures.LongListSet.LongListSetIterator; 27 28 /** 29 * Generates mutants of kmers. 30 * 31 * @author Brian Bushnell 32 * @date January 8, 2021 33 * 34 */ 35 public class KExpand { 36 37 /*--------------------------------------------------------------*/ 38 /*---------------- Initialization ----------------*/ 39 /*--------------------------------------------------------------*/ 40 41 /** 42 * Code entrance from the command line. 43 * @param args Command line arguments 44 */ main(String[] args)45 public static void main(String[] args){ 46 //Start a timer immediately upon code entrance. 47 Timer t=new Timer(); 48 49 //Create an instance of this class 50 KExpand x=new KExpand(args); 51 52 //Run the object 53 x.process(t); 54 55 //Close the print stream if it was redirected 56 Shared.closeStream(x.outstream); 57 } 58 59 /** 60 * Constructor. 61 * @param args Command line arguments 62 */ KExpand(String[] args)63 public KExpand(String[] args){ 64 65 {//Preparse block for help, config files, and outstream 66 PreParser pp=new PreParser(args, getClass(), false); 67 args=pp.args; 68 outstream=pp.outstream; 69 } 70 71 //Set shared static variables prior to parsing 72 ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true; 73 ReadWrite.MAX_ZIP_THREADS=Shared.threads(); 74 Shared.capBuffers(4); //Only for singlethreaded programs 75 76 {//Parse the arguments 77 final Parser parser=parse(args); 78 Parser.processQuality(); 79 80 maxReads=parser.maxReads; 81 overwrite=ReadStats.overwrite=parser.overwrite; 82 append=ReadStats.append=parser.append; 83 setInterleaved=parser.setInterleaved; 84 85 in1=parser.in1; 86 in2=parser.in2; 87 qfin1=parser.qfin1; 88 qfin2=parser.qfin2; 89 extin=parser.extin; 90 91 out1=parser.out1; 92 extout=parser.extout; 93 amino=Shared.AMINO_IN; 94 } 95 96 doPoundReplacement(); //Replace # with 1 and 2 97 adjustInterleaving(); //Make sure interleaving agrees with number of input and output files 98 fixExtensions(); //Add or remove .gz or .bz2 as needed 99 checkFileExistence(); //Ensure files can be read and written 100 checkStatics(); //Adjust file-related static fields as needed for this program 101 102 // k2=k-1; 103 // shift=bitsPerBase*k; 104 // shift2=shift-bitsPerBase; 105 // mask=(shift>63 ? -1L : ~((-1L)<<shift)); 106 107 {//set some constants 108 k2=k-1; 109 bitsPerBase=(amino ? 5 : 2); 110 maxSymbol=(amino ? 20 : 3); 111 symbols=maxSymbol+1; 112 symbolArrayLen=(64+bitsPerBase-1)/bitsPerBase; 113 symbolSpace=(1<<bitsPerBase); 114 symbolMask=symbolSpace-1; 115 116 symbolToNumber=AminoAcid.symbolToNumber(amino); 117 symbolToNumber0=AminoAcid.symbolToNumber0(amino); 118 symbolToComplementNumber0=AminoAcid.symbolToComplementNumber0(amino); 119 120 clearMasks=new long[symbolArrayLen]; 121 leftMasks=new long[symbolArrayLen]; 122 rightMasks=new long[symbolArrayLen]; 123 // lengthMasks=new long[symbolArrayLen]; 124 setMasks=new long[symbols][symbolArrayLen]; 125 for(int i=0; i<symbolArrayLen; i++){ 126 clearMasks[i]=~(symbolMask<<(bitsPerBase*i)); 127 leftMasks[i]=((-1L)<<(bitsPerBase*i)); 128 rightMasks[i]=~((-1L)<<(bitsPerBase*i)); 129 // lengthMasks[i]=((1L)<<(bitsPerBase*i)); 130 for(long j=0; j<symbols; j++){ 131 setMasks[(int)j][i]=(j<<(bitsPerBase*i)); 132 } 133 } 134 135 minlen=k-1; 136 minlen2=k; 137 shift=bitsPerBase*k; 138 shift2=shift-bitsPerBase; 139 mask=(shift>63 ? -1L : ~((-1L)<<shift)); 140 // kmask=lengthMasks[k]; 141 } 142 143 //Create output FileFormat objects 144 ffout1=FileFormat.testOutput(out1, FileFormat.FASTQ, extout, true, overwrite, append, ordered); 145 146 //Create input FileFormat objects 147 ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true); 148 ffin2=FileFormat.testInput(in2, FileFormat.FASTQ, extin, true, true); 149 } 150 151 /*--------------------------------------------------------------*/ 152 /*---------------- Initialization Helpers ----------------*/ 153 /*--------------------------------------------------------------*/ 154 155 /** Parse arguments from the command line */ parse(String[] args)156 private Parser parse(String[] args){ 157 158 //Create a parser object 159 Parser parser=new Parser(); 160 parser.overwrite=true; 161 162 //Set any necessary Parser defaults here 163 //parser.foo=bar; 164 165 //Parse each argument 166 for(int i=0; i<args.length; i++){ 167 String arg=args[i]; 168 169 //Break arguments into their constituent parts, in the form of "a=b" 170 String[] split=arg.split("="); 171 String a=split[0].toLowerCase(); 172 String b=split.length>1 ? split[1] : null; 173 if(b!=null && b.equalsIgnoreCase("null")){b=null;} 174 175 if(a.equals("verbose")){ 176 verbose=Parse.parseBoolean(b); 177 }else if(a.equals("k")){ 178 k=Parse.parseIntKMG(b); 179 assert(k>0 && k<=31); 180 }else if(a.equals("rcomp")){ 181 rcomp=Parse.parseBoolean(b); 182 }else if(a.equals("subdist") || a.equals("sdist") || a.equals("hdist") || a.equals("hammingdistance")){ 183 subDist=Parse.parseIntKMG(b); 184 assert(subDist>=0); 185 }else if(a.equals("deldist") || a.equals("ddist") || a.equals("deletiondistance")){ 186 delDist=Parse.parseIntKMG(b); 187 assert(delDist>=0 && delDist<=3); 188 }else if(a.equals("insdist") || a.equals("idist") || a.equals("insertiondistance")){ 189 insDist=Parse.parseIntKMG(b); 190 assert(insDist>=0); 191 }else if(a.equals("editdist") || a.equals("edist") || a.equals("edits") || a.equals("editdistance")){ 192 int x=Parse.parseIntKMG(b); 193 assert(x>=0 && x<=3); 194 editDist=x; 195 subDist=Tools.max(subDist, x); 196 }else if(a.equals("maxedits") || a.equals("emax")){ 197 maxEdits=Parse.parseIntKMG(b); 198 assert(maxEdits>=0); 199 }else if(a.equals("maxsubs") || a.equals("smax")){ 200 maxSubs=Parse.parseIntKMG(b); 201 assert(maxSubs>=0); 202 }else if(a.equals("maxdels") || a.equals("dmax")){ 203 maxDels=Parse.parseIntKMG(b); 204 assert(maxDels>=0); 205 }else if(a.equals("maxinss") || a.equals("imax")){ 206 maxInss=Parse.parseIntKMG(b); 207 assert(maxInss>=0); 208 }else if(a.equals("speed")){ 209 int x=Parse.parseIntKMG(b); 210 assert(x>=0 && x<=16) : "Speed must be 0-16"; 211 speed=x; 212 }else if(a.equals("skip")){ 213 int x=Parse.parseIntKMG(b); 214 assert(x>=0) : "Skip must be >=0"; 215 skip=x; 216 }else if(a.equals("parse_flag_goes_here")){ 217 long fake_variable=Parse.parseKMG(b); 218 //Set a variable here 219 }else if(parser.parse(arg, a, b)){//Parse standard flags in the parser 220 //do nothing 221 }else{ 222 outstream.println("Unknown parameter "+args[i]); 223 assert(false) : "Unknown parameter "+args[i]; 224 } 225 } 226 227 return parser; 228 } 229 230 /** Replace # with 1 and 2 in headers */ doPoundReplacement()231 private void doPoundReplacement(){ 232 //Do input file # replacement 233 if(in1!=null && in2==null && in1.indexOf('#')>-1 && !new File(in1).exists()){ 234 in2=in1.replace("#", "2"); 235 in1=in1.replace("#", "1"); 236 } 237 238 //Ensure there is an input file 239 if(in1==null){throw new RuntimeException("Error - at least one input file is required.");} 240 } 241 242 /** Add or remove .gz or .bz2 as needed */ fixExtensions()243 private void fixExtensions(){ 244 in1=Tools.fixExtension(in1); 245 in2=Tools.fixExtension(in2); 246 qfin1=Tools.fixExtension(qfin1); 247 qfin2=Tools.fixExtension(qfin2); 248 } 249 250 /** Ensure files can be read and written */ checkFileExistence()251 private void checkFileExistence(){ 252 //Ensure output files can be written 253 if(!Tools.testOutputFiles(overwrite, append, false, out1)){ 254 outstream.println((out1==null)+", "+out1); 255 throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output file "+out1+"\n"); 256 } 257 258 //Ensure input files can be read 259 if(!Tools.testInputFiles(false, true, in1, in2)){ 260 throw new RuntimeException("\nCan't read some input files.\n"); 261 } 262 263 //Ensure that no file was specified multiple times 264 if(!Tools.testForDuplicateFiles(true, in1, in2, out1)){ 265 throw new RuntimeException("\nSome file names were specified multiple times.\n"); 266 } 267 } 268 269 /** Make sure interleaving agrees with number of input and output files */ adjustInterleaving()270 private void adjustInterleaving(){ 271 //Adjust interleaved detection based on the number of input files 272 if(in2!=null){ 273 if(FASTQ.FORCE_INTERLEAVED){outstream.println("Reset INTERLEAVED to false because paired input files were specified.");} 274 FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false; 275 } 276 } 277 278 /** Adjust file-related static fields as needed for this program */ checkStatics()279 private static void checkStatics(){ 280 //Adjust the number of threads for input file reading 281 if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){ 282 ByteFile.FORCE_MODE_BF2=true; 283 } 284 285 assert(FastaReadInputStream.settingsOK()); 286 } 287 288 /*--------------------------------------------------------------*/ 289 /*---------------- Outer Methods ----------------*/ 290 /*--------------------------------------------------------------*/ 291 292 /** Create read streams and process all data */ process(Timer t)293 void process(Timer t){ 294 295 //Create a read input stream 296 final ConcurrentReadInputStream cris=makeCris(); 297 298 //Reset counters 299 readsProcessed=readsOut=0; 300 basesProcessed=basesOut=0; 301 302 //Process the read stream 303 processInner(cris); 304 errorState|=ReadWrite.closeStreams(cris); 305 306 if(verbose){outstream.println("Finished reading input.");} 307 308 //Optionally create a read output stream 309 final ConcurrentReadOutputStream ros=makeCros(); 310 311 if(ros!=null){ 312 dumpKmers(ros); 313 if(verbose){outstream.println("Finished processing output.");} 314 315 errorState|=ReadWrite.closeStream(ros); 316 } 317 318 //Write anything that was accumulated by ReadStats 319 errorState|=ReadStats.writeAll(); 320 321 //Report timing and results 322 t.stop(); 323 outstream.println(Tools.timeReadsBasesProcessed(t, readsProcessed, basesProcessed, 8)); 324 outstream.println(Tools.things("Kmers Processed", kmersProcessed, 8)); 325 outstream.println(Tools.things("Kmers Added", kmersAdded, 8)); 326 327 outstream.println(); 328 outstream.println(Tools.readsBasesOut(readsProcessed, basesProcessed, readsOut, basesOut, 8, false)); 329 outstream.println(Tools.things("Kmers Out", kmersOut, 8)); 330 331 //Throw an exception of there was an error in a thread 332 if(errorState){ 333 throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt."); 334 } 335 } 336 337 /*--------------------------------------------------------------*/ 338 /*---------------- Dumping ----------------*/ 339 /*--------------------------------------------------------------*/ 340 dumpKmers(ConcurrentReadOutputStream ros)341 private void dumpKmers(ConcurrentReadOutputStream ros) { 342 343 set.sort(); 344 set.shrinkToUnique(); 345 LongListSetIterator it=set.iterator(); 346 347 long id=1; 348 ArrayList<Read> list=new ArrayList<Read>(200); 349 ByteBuilder bb=new ByteBuilder(); 350 while(it.hasMore()){ 351 bb.clear(); 352 final long kmer=it.next(); 353 bb.appendKmer(kmer, k); 354 Read r=new Read(bb.toBytes(), null, id); 355 id++; 356 kmersOut++; 357 readsOut++; 358 basesOut+=r.length(); 359 list.add(r); 360 if(list.size()>=200){ 361 ros.add(list, 0); 362 list=new ArrayList<Read>(200); 363 } 364 } 365 if(list.size()>0){ros.add(list, 0);} 366 } 367 368 /*--------------------------------------------------------------*/ 369 /*---------------- Inner Methods ----------------*/ 370 /*--------------------------------------------------------------*/ 371 makeCris()372 private ConcurrentReadInputStream makeCris(){ 373 ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, ffin2, qfin1, qfin2); 374 cris.start(); //Start the stream 375 if(verbose){outstream.println("Started cris");} 376 boolean paired=cris.paired(); 377 if(!ffin1.samOrBam()){outstream.println("Input is being processed as "+(paired ? "paired" : "unpaired"));} 378 return cris; 379 } 380 makeCros()381 private ConcurrentReadOutputStream makeCros(){ 382 if(ffout1==null){return null;} 383 384 //Select output buffer size based on whether it needs to be ordered 385 final int buff=4; 386 387 final ConcurrentReadOutputStream ros=ConcurrentReadOutputStream.getStream(ffout1, null, buff, null, false); 388 ros.start(); //Start the stream 389 return ros; 390 } 391 392 /** Iterate through the reads */ processInner(final ConcurrentReadInputStream cris)393 void processInner(final ConcurrentReadInputStream cris){ 394 395 //Do anything necessary prior to processing 396 397 { 398 //Grab the first ListNum of reads 399 ListNum<Read> ln=cris.nextList(); 400 401 //Check to ensure pairing is as expected 402 if(ln!=null && !ln.isEmpty()){ 403 Read r=ln.get(0); 404 assert(ffin1.samOrBam() || (r.mate!=null)==cris.paired()); 405 } 406 407 //As long as there is a nonempty read list... 408 while(ln!=null && ln.size()>0){ 409 // if(verbose){outstream.println("Fetched "+reads.size()+" reads.");} //Disabled due to non-static access 410 411 processList(ln, cris); 412 413 //Fetch a new list 414 ln=cris.nextList(); 415 } 416 417 //Notify the input stream that the final list was used 418 if(ln!=null){ 419 cris.returnList(ln.id, ln.list==null || ln.list.isEmpty()); 420 } 421 } 422 423 //Do anything necessary after processing 424 425 } 426 427 /** 428 * Process a list of Reads. 429 * @param ln The list. 430 * @param cris Read Input Stream 431 * @param ros Read Output Stream for reads that will be retained 432 */ processList(ListNum<Read> ln, final ConcurrentReadInputStream cris)433 void processList(ListNum<Read> ln, final ConcurrentReadInputStream cris){ 434 435 //Grab the actual read list from the ListNum 436 final ArrayList<Read> reads=ln.list; 437 438 long added=0; 439 440 //Loop through each read in the list 441 for(int idx=0; idx<reads.size(); idx++){ 442 final Read r1=reads.get(idx); 443 final Read r2=r1.mate; 444 445 //Validate reads in worker threads 446 if(!r1.validated()){r1.validate(true);} 447 if(r2!=null && !r2.validated()){r2.validate(true);} 448 449 //Track the initial length for statistics 450 final int initialLength1=r1.length(); 451 final int initialLength2=r1.mateLength(); 452 453 //Increment counters 454 readsProcessed+=r1.pairCount(); 455 basesProcessed+=initialLength1+initialLength2; 456 kmersAdded+=processReadPair(r1, r2); 457 } 458 459 //Notify the input stream that the list was used 460 cris.returnList(ln); 461 // if(verbose){outstream.println("Returned a list.");} //Disabled due to non-static access 462 } 463 464 465 /** 466 * Process a single read pair. 467 * @param r1 Read 1 468 * @param r2 Read 2 (may be null) 469 * @return Number of kmers added. 470 */ processReadPair(final Read r1, final Read r2)471 long processReadPair(final Read r1, final Read r2){ 472 long x=addToMap(r1, skip); 473 if(r2!=null){ 474 x+=addToMap(r2, skip); 475 } 476 return x; 477 } 478 addToMap(Read r, int skip)479 private long addToMap(Read r, int skip){ 480 final byte[] bases=r.bases; 481 long kmer=0; 482 long rkmer=0; 483 long added=0; 484 int len=0; 485 486 // if(bases!=null){ 487 // readsProcessed++; 488 // basesProcessed+=bases.length; 489 // } 490 if(bases==null || bases.length<k){return 0;} 491 492 if(skip>1){ //Process while skipping some kmers 493 for(int i=0; i<bases.length; i++){ 494 byte b=bases[i]; 495 long x=symbolToNumber0[b]; 496 long x2=symbolToComplementNumber0[b]; 497 kmer=((kmer<<bitsPerBase)|x)&mask; 498 rkmer=((rkmer>>>bitsPerBase)|(x2<<shift2))&mask; 499 if(isFullyDefined(b)){len++;}else{len=0; rkmer=0;} 500 if(verbose){outstream.println("Scanning1 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} 501 if(len>=k){ 502 kmersProcessed++; 503 if(len%skip==0){ 504 final long extraBase=(i>=bases.length-1 ? -1 : symbolToNumber[bases[i+1]]); 505 final long extraBase2=(i>=bases.length-2 ? -1 : symbolToNumber[bases[i+2]]); 506 final long extraBase3=(i>=bases.length-3 ? -1 : symbolToNumber[bases[i+3]]); 507 added+=addAndMutate(kmer, rkmer, k, extraBase, extraBase2, extraBase3); 508 } 509 } 510 } 511 }else{ //Process all kmers 512 for(int i=0; i<bases.length; i++){ 513 final byte b=bases[i]; 514 final long x=symbolToNumber0[b]; 515 final long x2=symbolToComplementNumber0[b]; 516 // assert(x!=x2) : x+", "+x2+", "+Character.toString((char)b)+"\n"+Arrays.toString(symbolToNumber0)+"\n"+Arrays.toString(symbolToComplementNumber); 517 kmer=((kmer<<bitsPerBase)|x)&mask; 518 //10000, 1111111111, 16, 16, 2, 10, 8 519 rkmer=((rkmer>>>bitsPerBase)|(x2<<shift2))&mask; 520 if(isFullyDefined(b)){len++;}else{len=0; rkmer=0;} 521 if(verbose){ 522 // if(verbose){ 523 // String fwd=new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)); 524 // String rev=AminoAcid.reverseComplementBases(fwd); 525 // String fwd2=kmerToString(kmer, Tools.min(len, k)); 526 // outstream.println("fwd="+fwd+", fwd2="+fwd2+", rev="+rev+", kmer="+kmer+", rkmer="+rkmer); 527 // outstream.println("b="+(char)b+", x="+x+", x2="+x2+", bitsPerBase="+bitsPerBase+", shift2="+shift2); 528 // if(!amino){ 529 // assert(AminoAcid.stringToKmer(fwd)==kmer) : fwd+", "+AminoAcid.stringToKmer(fwd)+", "+kmer+", "+len; 530 // if(len>=k){ 531 // assert(rcomp(kmer, Tools.min(len, k))==rkmer); 532 // assert(rcomp(rkmer, Tools.min(len, k))==kmer); 533 // assert(AminoAcid.kmerToString(kmer, Tools.min(len, k)).equals(fwd)); 534 // assert(AminoAcid.kmerToString(rkmer, Tools.min(len, k)).equals(rev)) : AminoAcid.kmerToString(rkmer, Tools.min(len, k))+" != "+rev+" (rkmer)"; 535 // } 536 // assert(fwd.equalsIgnoreCase(fwd2)) : fwd+", "+fwd2; //may be unsafe 537 // } 538 // outstream.println("Scanning6 i="+i+", len="+len+", kmer="+kmer+", rkmer="+rkmer+", bases="+fwd+", rbases="+rev); 539 // } 540 } 541 if(len>=k){ 542 // assert(kmer==rcomp(rkmer, k)) : Long.toBinaryString(kmer)+", "+Long.toBinaryString(rkmer)+", "+Long.toBinaryString(mask)+", x="+x+", x2="+x2+", bits="+bitsPerBase+", s="+shift+", s2="+shift2+", b="+Character.toString((char)b); 543 kmersProcessed++; 544 final long extraBase=(i>=bases.length-1 ? -1 : symbolToNumber[bases[i+1]]); 545 final long extraBase2=(i>=bases.length-2 ? -1 : symbolToNumber[bases[i+2]]); 546 final long extraBase3=(i>=bases.length-3 ? -1 : symbolToNumber[bases[i+3]]); 547 final long atm=addAndMutate(kmer, rkmer, k, extraBase, extraBase2, extraBase3); 548 added+=atm; 549 } 550 } 551 } 552 return added; 553 } 554 555 556 557 /** 558 * Adds this kmer to the table, including any mutations implied by subDist, etc. 559 * @param kmer Forward kmer 560 * @param rkmer Reverse kmer 561 * @param len Kmer length 562 * @param subDist Number of substitutions to allow 563 * @param delDist Number of deletions to allow 564 * @param insDist Number of insertions to allow 565 * @param extraBase Base added to end in case of deletions 566 * @param extraBase2 Base added to end in case of deletions 567 * @param extraBase3 Base added to end in case of deletions 568 * @return Number of kmers stored 569 */ addAndMutate(final long kmer, final long rkmer, final int len, final long extraBase, final long extraBase2, final long extraBase3)570 private long addAndMutate(final long kmer, final long rkmer, final int len, final long extraBase, final long extraBase2, final long extraBase3){ 571 if(editDist>0){//Edit mode 572 return mutateE(kmer, rkmer, len, editDist, maxSubs, maxDels, maxInss, extraBase, extraBase2, extraBase3); 573 }else if(subDist>0 || delDist>0 || insDist>0){//Sub Del Ins mode 574 return mutateE(kmer, rkmer, len, maxEdits, subDist, delDist, insDist, extraBase, extraBase2, extraBase3); 575 }else{ 576 addKey(kmer, rkmer, len); 577 } 578 return 1; 579 } 580 581 582 /** 583 * Adds this kmer to the set. 584 * @param kmer Forward kmer 585 * @param rkmer Reverse kmer 586 * @param len Kmer length 587 * @return Number of kmers added 588 */ addKey(final long kmer, final long rkmer, final int len)589 private long addKey(final long kmer, final long rkmer, final int len){ 590 if(verbose){outstream.println("addToMap_A; len="+len);} 591 592 final long key=toValue(kmer, rkmer); 593 if(verbose){outstream.println("toValue ("+kmerToString(kmer, len)+", "+kmerToString(rkmer, len)+") = "+kmerToString(key, len)+" = "+key);} 594 if(failsSpeed(key)){return 0;} 595 596 if(verbose){outstream.println("addToMap_B: "+kmerToString(key, len)+" ("+key+")");} 597 long added=0; 598 set.add(key); 599 added++; 600 if(verbose){outstream.println("addToMap added "+added+" keys.");} 601 return 1; 602 } 603 604 //*** Note! This is functionally identical to mutateE, 605 //*** only the default params are different 606 @Deprecated 607 /** 608 * Mutate and store this kmer through 'dist' recursions. 609 * @param kmer Forward kmer 610 * @param rkmer Reverse kmer 611 * @param len Kmer length 612 * @param subDist Number of substitutions to allow 613 * @param delDist Number of deletions to allow 614 * @param insDist Number of insertions to allow 615 * @param extraBase Base added to end in case of deletions 616 * @param extraBase2 Base added to end in case of deletions 617 * @param extraBase3 Base added to end in case of deletions 618 * @return Number of kmers stored 619 */ mutateSDI(final long kmer, final long rkmer, final int len, final int maxEdits, final int subDist, final int delDist, final int insDist, final long extraBase, final long extraBase2, final long extraBase3)620 private long mutateSDI(final long kmer, final long rkmer, final int len, final int maxEdits, 621 final int subDist, final int delDist, final int insDist, 622 final long extraBase, final long extraBase2, final long extraBase3){ 623 long added=1; 624 final int maxEdits2=maxEdits-1; 625 626 addKey(kmer, rkmer, len); 627 if(maxEdits<1){return 1;} 628 629 if(verbose){outstream.println("mutateSDI; len="+len+"; kmer="+kmer+"; rkmer="+rkmer);} 630 631 if(subDist>0){ 632 //Sub 633 for(int j=0; j<symbols; j++){ 634 for(int i=0; i<len; i++){ 635 final long temp=(kmer&clearMasks[i])|setMasks[j][i]; 636 if(temp!=kmer){ 637 long rtemp=rcomp(temp, len); 638 added+=mutateSDI(temp, rtemp, len, maxEdits2, 639 subDist-1, delDist, insDist, extraBase, extraBase2, extraBase3); 640 } 641 } 642 } 643 } 644 645 if(delDist>0 && extraBase>=0 && extraBase<=maxSymbol){ 646 //Del 647 for(int i=1; i<len; i++){ 648 final long temp=(kmer&leftMasks[i])|((kmer<<bitsPerBase)&rightMasks[i])|extraBase; 649 if(temp!=kmer){ 650 long rtemp=rcomp(temp, len); 651 added+=mutateSDI(temp, rtemp, len, maxEdits2, 652 subDist, delDist-1, insDist, extraBase2, extraBase3, -1); 653 } 654 } 655 } 656 657 if(insDist>0){ 658 //Ins 659 final long eb0=kmer&symbolMask; 660 for(int i=0; i<len; i++){ 661 final long temp0=(kmer&leftMasks[i])|((kmer&rightMasks[i])>>bitsPerBase); 662 for(int j=0; j<symbols; j++){ 663 final long temp=temp0|setMasks[j][i]; 664 if(temp!=kmer){ 665 long rtemp=rcomp(temp, len); 666 added+=mutateSDI(temp, rtemp, len, maxEdits2, 667 subDist, delDist, insDist-1, eb0, extraBase, extraBase2); 668 } 669 } 670 } 671 } 672 673 return added; 674 } 675 676 /** 677 * Mutate and store this kmer through 'dist' recursions. 678 * @param kmer Forward kmer 679 * @param rkmer Reverse kmer 680 * @param len Kmer length 681 * @param editDist Number of edits to allow 682 * @param extraBase Base added to end in case of deletions 683 * @param extraBase2 Base added to end in case of deletions 684 * @param extraBase3 Base added to end in case of deletions 685 * @return Number of kmers stored 686 */ mutateE(final long kmer, final long rkmer, final int len, final int editDist, final int maxSubs, final int maxDels, final int maxInss, final long extraBase, final long extraBase2, final long extraBase3)687 private long mutateE(final long kmer, final long rkmer, final int len, final int editDist, 688 final int maxSubs, final int maxDels, final int maxInss, 689 final long extraBase, final long extraBase2, final long extraBase3){ 690 long added=0; 691 692 addKey(kmer, rkmer, len); 693 if(editDist<1){return 1;} 694 695 if(verbose){outstream.println("mutateE; len="+len+"; kmer="+kmer+"; rkmer="+rkmer);} 696 697 final int edist2=editDist-1; 698 if(maxSubs>0){ 699 //Sub 700 for(int j=0; j<symbols; j++){ 701 for(int i=0; i<len; i++){ 702 final long temp=(kmer&clearMasks[i])|setMasks[j][i]; 703 if(temp!=kmer){ 704 long rtemp=rcomp(temp, len); 705 added+=mutateE(temp, rtemp, len, edist2, 706 maxSubs-1, maxDels, maxInss, extraBase, extraBase2, extraBase3); 707 } 708 } 709 } 710 } 711 712 if(maxDels>0 && extraBase>=0 && extraBase<=maxSymbol){ 713 //Del 714 for(int i=1; i<len; i++){ 715 final long temp=(kmer&leftMasks[i])|((kmer<<bitsPerBase)&rightMasks[i])|extraBase; 716 if(temp!=kmer){ 717 long rtemp=rcomp(temp, len); 718 added+=mutateE(temp, rtemp, len, edist2, 719 maxSubs, maxDels-1, maxInss, extraBase2, extraBase3, -1); 720 } 721 } 722 } 723 724 if(maxInss>0){ 725 //Ins 726 final long eb0=kmer&symbolMask; 727 for(int i=0; i<len; i++){ 728 final long temp0=(kmer&leftMasks[i])|((kmer&rightMasks[i])>>bitsPerBase); 729 for(int j=0; j<symbols; j++){ 730 final long temp=temp0|setMasks[j][i]; 731 if(temp!=kmer){ 732 long rtemp=rcomp(temp, len); 733 added+=mutateE(temp, rtemp, len, edist2, 734 maxSubs, maxDels, maxInss-1, eb0, extraBase, extraBase2); 735 } 736 } 737 } 738 } 739 740 return added; 741 } 742 743 /** 744 * Transforms a kmer into a canonical value. Expected to be inlined. 745 * @param kmer Forward kmer 746 * @param rkmer Reverse kmer 747 * @return Canonical value 748 */ toValue(long kmer, long rkmer)749 final long toValue(long kmer, long rkmer){ 750 if(verbose){outstream.println("toValue("+AminoAcid.kmerToString(kmer, k)+", "+AminoAcid.kmerToString(rkmer, k)+")");} 751 final long value=(rcomp ? Tools.max(kmer, rkmer) : kmer); 752 if(verbose){outstream.println("value="+AminoAcid.kmerToString(value, k)+" = "+value);} 753 return value; 754 } 755 rcomp(long kmer, int len)756 final long rcomp(long kmer, int len){ 757 return amino ? kmer : AminoAcid.reverseComplementBinaryFast(kmer, len); 758 } 759 passesSpeed(long key)760 final boolean passesSpeed(long key){ 761 return speed<1 || ((key&Long.MAX_VALUE)%17)>=speed; 762 } 763 failsSpeed(long key)764 final boolean failsSpeed(long key){ 765 return speed>0 && ((key&Long.MAX_VALUE)%17)<speed; 766 } 767 768 /** Returns true if the symbol is not degenerate (e.g., 'N') for the alphabet in use. */ isFullyDefined(byte symbol)769 final boolean isFullyDefined(byte symbol){ 770 return symbol>=0 && symbolToNumber[symbol]>=0; 771 } 772 773 /** For verbose / debugging output */ kmerToString(long kmer, int k)774 final String kmerToString(long kmer, int k){ 775 return amino ? AminoAcid.kmerToStringAA(kmer, k) : AminoAcid.kmerToString(kmer, k); 776 } 777 778 /*--------------------------------------------------------------*/ 779 /*---------------- Fields ----------------*/ 780 /*--------------------------------------------------------------*/ 781 782 /** Primary input file path */ 783 private String in1=null; 784 /** Secondary input file path */ 785 private String in2=null; 786 787 private String qfin1=null; 788 private String qfin2=null; 789 790 /** Primary output file path */ 791 private String out1=null; 792 793 /** Override input file extension */ 794 private String extin=null; 795 /** Override output file extension */ 796 private String extout=null; 797 798 /** Whether interleaved was explicitly set. */ 799 private boolean setInterleaved=false; 800 801 private boolean rcomp=true; 802 private int k=31; 803 /** k-1; used in some expressions */ 804 final int k2; 805 806 private int speed=0; 807 private int skip=0; 808 809 private int editDist=0; 810 private int maxSubs=99; 811 private int maxDels=99; 812 private int maxInss=99; 813 814 private int maxEdits=99; 815 private int subDist=0; 816 private int delDist=0; 817 private int insDist=0; 818 819 private LongListSet set=new LongListSet(); 820 821 /*--------------------------------------------------------------*/ 822 /*----------- Symbol-Specific Constants ----------*/ 823 /*--------------------------------------------------------------*/ 824 825 /** True for amino acid data, false for nucleotide data */ 826 final boolean amino; 827 // final int maxSupportedK; 828 final int bitsPerBase; 829 final int maxSymbol; 830 final int symbols; 831 final int symbolArrayLen; 832 final int symbolSpace; 833 final long symbolMask; 834 835 final int minlen; 836 final int minlen2; 837 final int shift; 838 final int shift2; 839 final long mask; 840 841 /** x&clearMasks[i] will clear base i */ 842 final long[] clearMasks; 843 /** x|setMasks[i][j] will set base i to j */ 844 final long[][] setMasks; 845 /** x&leftMasks[i] will clear all bases to the right of i (exclusive) */ 846 final long[] leftMasks; 847 /** x&rightMasks[i] will clear all bases to the left of i (inclusive) */ 848 final long[] rightMasks; 849 // /** x|kMasks[i] will set the bit to the left of the leftmost base */ 850 // final long[] lengthMasks; 851 852 private final byte[] symbolToNumber0; 853 private final byte[] symbolToComplementNumber0; 854 private final byte[] symbolToNumber; 855 856 /*--------------------------------------------------------------*/ 857 858 /** Number of reads processed */ 859 protected long readsProcessed=0; 860 /** Number of bases processed */ 861 protected long basesProcessed=0; 862 /** Number of kmers processed */ 863 protected long kmersProcessed=0; 864 /** Number of kmers added (includes mutants) */ 865 protected long kmersAdded=0; 866 867 /** Number of reads output */ 868 protected long readsOut=0; 869 /** Number of bases output */ 870 protected long basesOut=0; 871 /** Number of kmers output */ 872 protected long kmersOut=0; 873 874 /** Quit after processing this many input reads; -1 means no limit */ 875 private long maxReads=-1; 876 877 /*--------------------------------------------------------------*/ 878 /*---------------- Final Fields ----------------*/ 879 /*--------------------------------------------------------------*/ 880 881 /** Primary input file */ 882 private final FileFormat ffin1; 883 /** Secondary input file */ 884 private final FileFormat ffin2; 885 886 /** Primary output file */ 887 private final FileFormat ffout1; 888 889 /*--------------------------------------------------------------*/ 890 /*---------------- Common Fields ----------------*/ 891 /*--------------------------------------------------------------*/ 892 893 /** Print status messages to this output stream */ 894 private PrintStream outstream=System.err; 895 /** Print verbose messages */ 896 public static boolean verbose=false; 897 /** True if an error was encountered */ 898 public boolean errorState=false; 899 /** Overwrite existing output files */ 900 private boolean overwrite=false; 901 /** Append to existing output files */ 902 private boolean append=false; 903 /** This flag has no effect on singlethreaded programs */ 904 private final boolean ordered=false; 905 906 } 907