1 package jgi; 2 3 import java.io.File; 4 import java.io.PrintStream; 5 import java.util.ArrayList; 6 import java.util.Arrays; 7 import java.util.BitSet; 8 import java.util.HashMap; 9 import java.util.HashSet; 10 import java.util.Locale; 11 import java.util.concurrent.ArrayBlockingQueue; 12 import java.util.concurrent.atomic.AtomicLongArray; 13 14 import cardinality.CardinalityTracker; 15 import dna.AminoAcid; 16 import dna.Data; 17 import fileIO.ByteFile; 18 import fileIO.ByteStreamWriter; 19 import fileIO.FileFormat; 20 import fileIO.ReadWrite; 21 import fileIO.TextStreamWriter; 22 import kmer.AbstractKmerTable; 23 import kmer.ScheduleMaker; 24 import shared.KillSwitch; 25 import shared.Parse; 26 import shared.Parser; 27 import shared.PreParser; 28 import shared.ReadStats; 29 import shared.Shared; 30 import shared.Timer; 31 import shared.Tools; 32 import shared.TrimRead; 33 import stream.ConcurrentReadInputStream; 34 import stream.ConcurrentReadOutputStream; 35 import stream.FASTQ; 36 import stream.FastaReadInputStream; 37 import stream.Read; 38 import stream.SamLine; 39 import structures.IntList; 40 import structures.ListNum; 41 42 /** 43 * Separates or trims reads based on matching kmers in a reference. 44 * Supports arbitrary K and inexact matches. 45 * Supercedes BBDukF by allowing simultaneous filtering, left- and right-trimming with different references. 46 * @author Brian Bushnell 47 * @date Aug 30, 2013 48 * 49 */ 50 public class BBDuk2 { 51 52 /*--------------------------------------------------------------*/ 53 /*---------------- Initialization ----------------*/ 54 /*--------------------------------------------------------------*/ 55 56 /** 57 * Code entrance from the command line. 58 * @param args Command line arguments 59 */ main(String[] args)60 public static void main(String[] args){ 61 //Create a new BBDuk2 instance 62 BBDuk2 x=new BBDuk2(args); 63 64 ///And run it 65 x.process(); 66 67 //Close the print stream if it was redirected 68 Shared.closeStream(outstream); 69 } 70 71 /** 72 * Constructor. 73 * @param args Command line arguments 74 */ BBDuk2(String[] args)75 public BBDuk2(String[] args){ 76 77 {//Preparse block for help, config files, and outstream 78 PreParser pp=new PreParser(args, getClass(), true); 79 args=pp.args; 80 outstream=pp.outstream; 81 } 82 83 /* Set global defaults */ 84 ReadWrite.ZIPLEVEL=2; 85 ReadWrite.USE_UNPIGZ=true; 86 ReadWrite.USE_PIGZ=true; 87 88 if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){ 89 ByteFile.FORCE_MODE_BF2=true; 90 } 91 92 /* Initialize local variables with defaults */ 93 boolean setOut=false, setOutb=false; 94 boolean ktrimRight_=false, ktrimLeft_=false, ktrimN_=false, ktrimExclusive_=false, kfilter_=false; 95 boolean findBestMatch_=false; 96 boolean addTrimmedToBad_=true; 97 boolean rcomp_=true; 98 boolean forbidNs_=false; 99 boolean useForest_=false, useTable_=false, useArray_=true, prealloc_=false; 100 int k_=27, kbig_=-1; 101 int mink_=-1; 102 int ways_=-1; //Currently disabled 103 int maxBadKmers_=0; 104 long skipreads_=0; 105 byte TRIM_SYMBOL_='N'; 106 boolean kmaskLowercase_=false; 107 108 Parser parser=new Parser(); 109 parser.trimq=6; 110 parser.minAvgQuality=0; 111 parser.minReadLength=10; 112 parser.maxReadLength=Integer.MAX_VALUE; 113 parser.minLenFraction=0f; 114 parser.requireBothBad=false; 115 parser.maxNs=-1; 116 boolean trimByOverlap_=false, useQualityForOverlap_=false, strictOverlap_=true; 117 boolean trimPairsEvenly_=false; 118 boolean ordered_=false; 119 int minoverlap_=-1, mininsert_=-1; 120 int restrictLeft_=0, restrictRight_=0, speed_=0, qSkip_=1; 121 boolean printNonZeroOnly_=true; 122 boolean rename_=false, useRefNames_=false; 123 boolean skipr1_=false, skipr2_=false; 124 boolean ecc_=false; 125 float minBaseFrequency_=0; 126 127 128 String[] literal=null; 129 String[] ref=null; 130 131 scaffoldNames.add(""); //Necessary so that the first real scaffold gets an id of 1, not zero 132 scaffoldLengths.add(0); 133 134 /* Parse arguments */ 135 for(int i=0; i<args.length; i++){ 136 final String arg=args[i]; 137 String[] split=arg.split("="); 138 String a=split[0].toLowerCase(); 139 String b=split.length>1 ? split[1] : null; 140 141 if(Parser.parseZip(arg, a, b)){ 142 //do nothing 143 }else if(Parser.parseHist(arg, a, b)){ 144 //do nothing 145 }else if(Parser.parseCommonStatic(arg, a, b)){ 146 //do nothing 147 }else if(Parser.parseQualityAdjust(arg, a, b)){ 148 //do nothing 149 }else if(Parser.parseQuality(arg, a, b)){ 150 //do nothing 151 }else if(Parser.parseFasta(arg, a, b)){ 152 //do nothing 153 }else if(parser.parseInterleaved(arg, a, b)){ 154 //do nothing 155 }else if(parser.parseTrim(arg, a, b)){ 156 //do nothing 157 }else if(parser.parseCommon(arg, a, b)){ 158 //do nothing 159 }else if(parser.parseCardinality(arg, a, b)){ 160 //do nothing 161 }else if(a.equals("in") || a.equals("in1")){ 162 in1=b; 163 }else if(a.equals("in2")){ 164 in2=b; 165 }else if(a.equals("qfin") || a.equals("qfin1")){ 166 qfin1=b; 167 }else if(a.equals("qfin2")){ 168 qfin2=b; 169 }else if(a.equals("out") || a.equals("out1") || a.equals("outu") || a.equals("outu1") || a.equals("outnonmatch") || 170 a.equals("outnonmatch1") || a.equals("outunnmatch") || a.equals("outunmatch1") || a.equals("outunnmatched") || a.equals("outunmatched1")){ 171 out1=b; 172 setOut=true; 173 }else if(a.equals("out2") || a.equals("outu2") || a.equals("outnonmatch2") || a.equals("outunmatch2") || 174 a.equals("outnonmatched2") || a.equals("outunmatched2")){ 175 out2=b; 176 }else if(a.equals("outb") || a.equals("outm") || a.equals("outb1") || a.equals("outm1") || a.equals("outbad") || 177 a.equals("outbad1") || a.equals("outmatch") || a.equals("outmatch1")){ 178 outb1=b; 179 setOut=true; 180 }else if(a.equals("outb2") || a.equals("outm2") || a.equals("outbad2") || a.equals("outmatch2")){ 181 outb2=b; 182 }else if(a.equals("outs") || a.equals("outsingle")){ 183 outsingle=b; 184 }else if(a.equals("stats") || a.equals("scafstats")){ 185 outstats=b; 186 }else if(a.equals("refstats")){ 187 outrefstats=b; 188 }else if(a.equals("rpkm") || a.equals("fpkm") || a.equals("cov") || a.equals("coverage")){ 189 outrpkm=b; 190 }else if(a.equals("sam") || a.equals("bam")){ 191 samFile=b; 192 }else if(a.equals("duk") || a.equals("outduk")){ 193 outduk=b; 194 }else if(a.equals("rqc")){ 195 outrqc=b; 196 }else if(a.equals("ref")){ 197 ref=(b==null) ? null : (new File(b).exists() ? new String[] {b} : b.split(",")); 198 }else if(a.equals("filterref") || a.equals("fref")){ 199 refFilter=(b==null) ? null : (new File(b).exists() ? new String[] {b} : b.split(",")); 200 }else if(a.equals("maskref") || a.equals("mref") || a.equals("nref")){ 201 refMask=(b==null) ? null : (new File(b).exists() ? new String[] {b} : b.split(",")); 202 }else if(a.equals("rightref") || a.equals("trref") || a.equals("rref")){ 203 refRight=(b==null) ? null : (new File(b).exists() ? new String[] {b} : b.split(",")); 204 }else if(a.equals("leftref") || a.equals("tlref") || a.equals("lref")){ 205 refLeft=(b==null) ? null : (new File(b).exists() ? new String[] {b} : b.split(",")); 206 }else if(a.equals("literal")){ 207 literal=(b==null) ? null : b.split(","); 208 // assert(false) : b+", "+Arrays.toString(literal); 209 }else if(a.equals("filterliteral") || a.equals("fliteral")){ 210 literalFilter=(b==null) ? null : (new File(b).exists() ? new String[] {b} : b.split(",")); 211 }else if(a.equals("maskliteral") || a.equals("mliteral") || a.equals("nliteral")){ 212 literalMask=(b==null) ? null : (new File(b).exists() ? new String[] {b} : b.split(",")); 213 }else if(a.equals("rightliteral") || a.equals("trliteral") || a.equals("rliteral")){ 214 literalRight=(b==null) ? null : (new File(b).exists() ? new String[] {b} : b.split(",")); 215 }else if(a.equals("leftliteral") || a.equals("tlliteral") || a.equals("lliteral")){ 216 literalLeft=(b==null) ? null : (new File(b).exists() ? new String[] {b} : b.split(",")); 217 }else if(a.equals("forest")){ 218 useForest_=Parse.parseBoolean(b); 219 if(useForest_){useTable_=useArray_=false;} 220 }else if(a.equals("table")){ 221 useTable_=Parse.parseBoolean(b); 222 if(useTable_){useForest_=useArray_=false;} 223 }else if(a.equals("array")){ 224 useArray_=Parse.parseBoolean(b); 225 if(useArray_){useTable_=useForest_=false;} 226 }else if(a.equals("ways")){ 227 ways_=Integer.parseInt(b); 228 }else if(a.equals("ordered") || a.equals("ord")){ 229 ordered_=Parse.parseBoolean(b); 230 System.err.println("Set ORDERED to "+ordered_); 231 }else if(a.equals("skipr1")){ 232 skipr1_=Parse.parseBoolean(b); 233 }else if(a.equals("skipr2")){ 234 skipr2_=Parse.parseBoolean(b); 235 }else if(a.equals("k")){ 236 assert(b!=null) : "\nThe k key needs an integer value greater than 0, such as k=27\n"; 237 k_=Integer.parseInt(b); 238 if(k_>31){ 239 kbig_=k_; 240 k_=31; 241 }else{ 242 kbig_=-1; 243 } 244 assert(k_>0 && k_<32) : "k must be at least 1; default is 27."; 245 }else if(a.equals("mink") || a.equals("kmin")){ 246 mink_=Integer.parseInt(b); 247 assert(mink_<0 || (mink_>0 && mink_<32)) : "kmin must be between 1 and 31; default is 4, negative numbers disable it."; 248 }else if(a.equals("useshortkmers") || a.equals("shortkmers") || a.equals("usk")){ 249 useShortKmers=Parse.parseBoolean(b); 250 }else if(a.equals("trimextra") || a.equals("trimpad") || a.equals("tp")){ 251 trimPad=Integer.parseInt(b); 252 }else if(a.equals("hdist") || a.equals("hammingdistance")){ 253 hammingDistance=Integer.parseInt(b); 254 assert(hammingDistance>=0 && hammingDistance<=4) : "hamming distance must be between 0 and 4; default is 0."; 255 }else if(a.equals("qhdist") || a.equals("queryhammingdistance")){ 256 qHammingDistance=Integer.parseInt(b); 257 assert(qHammingDistance>=0 && qHammingDistance<=4) : "hamming distance must be between 0 and 4; default is 0."; 258 }else if(a.equals("edits") || a.equals("edist") || a.equals("editdistance")){ 259 editDistance=Integer.parseInt(b); 260 assert(editDistance>=0 && editDistance<=3) : "edit distance must be between 0 and 3; default is 0."; 261 }else if(a.equals("hdist2") || a.equals("hammingdistance2")){ 262 hammingDistance2=Integer.parseInt(b); 263 assert(hammingDistance2>=0 && hammingDistance2<=4) : "hamming distance must be between 0 and 3; default is 0."; 264 }else if(a.equals("qhdist2") || a.equals("queryhammingdistance2")){ 265 qHammingDistance2=Integer.parseInt(b); 266 assert(qHammingDistance2>=0 && qHammingDistance2<=4) : "hamming distance must be between 0 and 3; default is 0."; 267 }else if(a.equals("edits2") || a.equals("edist2") || a.equals("editdistance2")){ 268 editDistance2=Integer.parseInt(b); 269 assert(editDistance2>=0 && editDistance2<=3) : "edit distance must be between 0 and 2; default is 0."; 270 }else if(a.equals("maxskip") || a.equals("maxrskip") || a.equals("mxs")){ 271 maxSkip=Integer.parseInt(b); 272 }else if(a.equals("minskip") || a.equals("minrskip") || a.equals("mns")){ 273 minSkip=Integer.parseInt(b); 274 }else if(a.equals("skip") || a.equals("refskip") || a.equals("rskip")){ 275 minSkip=maxSkip=Integer.parseInt(b); 276 }else if(a.equals("qskip")){ 277 qSkip_=Integer.parseInt(b); 278 }else if(a.equals("speed")){ 279 speed_=Integer.parseInt(b); 280 assert(speed_>=0 && speed_<=15) : "Speed range is 0 to 15. Value: "+speed_; 281 }else if(a.equals("skipreads")){ 282 skipreads_=Parse.parseKMG(b); 283 }else if(a.equals("maxbadkmers") || a.equals("mbk")){ 284 maxBadKmers_=Integer.parseInt(b); 285 }else if(a.equals("minhits") || a.equals("minkmerhits") || a.equals("mkh")){ 286 maxBadKmers_=Integer.parseInt(b)-1; 287 }else if(a.equals("showspeed") || a.equals("ss")){ 288 showSpeed=Parse.parseBoolean(b); 289 }else if(a.equals("verbose")){ 290 assert(false) : "Verbose flag is currently static final; must be recompiled to change."; 291 assert(WAYS>1) : "WAYS=1 is for debug mode."; 292 // verbose=Parse.parseBoolean(b); //123 293 if(verbose){outstream=System.err;} //For some reason System.out does not print in verbose mode. 294 }else if(a.equals("mm") || a.equals("maskmiddle")){ 295 maskMiddle=Parse.parseBoolean(b); 296 }else if(a.equals("rcomp")){ 297 rcomp_=Parse.parseBoolean(b); 298 }else if(a.equals("forbidns") || a.equals("forbidn") || a.equals("fn")){ 299 forbidNs_=Parse.parseBoolean(b); 300 }else if(a.equals("findbestmatch") || a.equals("fbm")){ 301 findBestMatch_=Parse.parseBoolean(b); 302 }else if(a.equals("kfilter")){ 303 kfilter_=Parse.parseBoolean(b); 304 }else if(a.equals("kmask") || a.equals("mask")){ 305 if("lc".equalsIgnoreCase(b) || "lowercase".equalsIgnoreCase(b)){ 306 kmaskLowercase_=true; 307 ktrimN_=true; 308 }else{ 309 if(b==null){b="";} 310 if(b.length()==1 && !b.equalsIgnoreCase("t") && !b.equalsIgnoreCase("f")){ 311 ktrimN_=true; 312 TRIM_SYMBOL_=(byte)b.charAt(0); 313 }else{ 314 ktrimN_=Parse.parseBoolean(b); 315 } 316 } 317 }else if(a.equals("ktrim")){ 318 throw new RuntimeException("BBDuk2 does not need the ktrim flag. " + 319 "It will trim according to which references are specified - for example, if lref is set, " + 320 "it will do left-trimming. To specify masking modes, use the kmask flag; " + 321 "e.g. kmask=lowercase or kmask=X."); 322 // if(b==null){b="";} 323 // if(b.equalsIgnoreCase("left") || b.equalsIgnoreCase("l")){ktrimLeft_=true;} 324 // else if(b.equalsIgnoreCase("right") || b.equalsIgnoreCase("r")){ktrimRight_=true;} 325 // else if(b.equalsIgnoreCase("n") || b.equalsIgnoreCase("mask")){ktrimN_=true;} 326 // else if(b.length()==1 && !b.equalsIgnoreCase("t") && !b.equalsIgnoreCase("f")){ 327 // ktrimN_=true; 328 // TRIM_SYMBOL_=(byte)b.charAt(0); 329 // }else{ 330 // boolean x=Parse.parseBoolean(b); 331 // if(!x){ 332 // ktrimRight_=ktrimLeft_=false; 333 // }else{ 334 // throw new RuntimeException("\nInvalid setting for ktrim: "+b+"\nvalues must be f (false), l (left), r (right), or n"); 335 // } 336 // } 337 }else if(a.equals("ktrimright")){ 338 ktrimRight_=Parse.parseBoolean(b); 339 ktrimLeft_=ktrimN_=!(ktrimRight_); 340 }else if(a.equals("ktrimleft")){ 341 ktrimLeft_=Parse.parseBoolean(b); 342 ktrimRight_=ktrimN_=!(ktrimLeft_); 343 }else if(a.equals("ktrimn")){ 344 ktrimN_=Parse.parseBoolean(b); 345 ktrimLeft_=ktrimRight_=!(ktrimN_); 346 }else if(a.equals("ktrimexclusive")){ 347 ktrimExclusive_=Parse.parseBoolean(b); 348 }else if(a.equals("tbo") || a.equals("trimbyoverlap")){ 349 trimByOverlap_=Parse.parseBoolean(b); 350 }else if(a.equals("strictoverlap")){ 351 strictOverlap_=Parse.parseBoolean(b); 352 }else if(a.equals("usequality")){ 353 useQualityForOverlap_=Parse.parseBoolean(b); 354 }else if(a.equals("tpe") || a.equals("tbe") || a.equals("trimpairsevenly")){ 355 trimPairsEvenly_=Parse.parseBoolean(b); 356 }else if(a.equals("ottm") || a.equals("outputtrimmedtomatch")){ 357 addTrimmedToBad_=Parse.parseBoolean(b); 358 }else if(a.equals("minoverlap")){ 359 minoverlap_=Integer.parseInt(b); 360 }else if(a.equals("mininsert")){ 361 mininsert_=Integer.parseInt(b); 362 }else if(a.equals("prealloc") || a.equals("preallocate")){ 363 if(b==null || b.length()<1 || Character.isLetter(b.charAt(0))){ 364 prealloc_=Parse.parseBoolean(b); 365 }else{ 366 preallocFraction=Tools.max(0, Double.parseDouble(b)); 367 prealloc_=(preallocFraction>0); 368 } 369 }else if(a.equals("restrictleft")){ 370 restrictLeft_=Integer.parseInt(b); 371 }else if(a.equals("restrictright")){ 372 restrictRight_=Integer.parseInt(b); 373 }else if(a.equals("statscolumns") || a.equals("columns") || a.equals("cols")){ 374 STATS_COLUMNS=Integer.parseInt(b); 375 assert(STATS_COLUMNS==3 || STATS_COLUMNS==5) : "statscolumns bust be either 3 or 5. Invalid value: "+STATS_COLUMNS; 376 }else if(a.equals("nzo") || a.equals("nonzeroonly")){ 377 printNonZeroOnly_=Parse.parseBoolean(b); 378 }else if(a.equals("rename")){ 379 rename_=Parse.parseBoolean(b); 380 }else if(a.equals("refnames") || a.equals("userefnames")){ 381 useRefNames_=Parse.parseBoolean(b); 382 }else if(a.equals("initialsize")){ 383 initialSize=Parse.parseIntKMG(b); 384 }else if(a.equals("dump")){ 385 dump=b; 386 }else if(a.equals("entropyk") || a.equals("ek")){ 387 entropyK=Integer.parseInt(b); 388 }else if(a.equals("entropywindow") || a.equals("ew")){ 389 entropyWindow=Integer.parseInt(b); 390 }else if(a.equals("minentropy") || a.equals("entropy") || a.equals("entropyfilter")){ 391 entropyCutoff=Float.parseFloat(b); 392 }else if(a.equals("verifyentropy")){ 393 verifyEntropy=Parse.parseBoolean(b); 394 }else if(a.equals("minbasefrequency")){ 395 minBaseFrequency_=Float.parseFloat(b); 396 }else if(a.equals("ecco") || a.equals("ecc")){ 397 ecc_=Parse.parseBoolean(b); 398 }else if(a.equals("copyundefined") || a.equals("cu")){ 399 REPLICATE_AMBIGUOUS=Parse.parseBoolean(b); 400 }else if(a.equals("path")){ 401 Data.setPath(b); 402 }else if(i==0 && in1==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){ 403 in1=args[i]; 404 }else if(i==1 && out1==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){ 405 out1=args[i]; 406 setOut=true; 407 }else if(i==2 && ref==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){ 408 ref=(new File(args[i]).exists() ? new String[] {args[i]} : args[i].split(",")); 409 }else{ 410 throw new RuntimeException("Unknown parameter "+args[i]); 411 } 412 } 413 414 if(hammingDistance2==-1){hammingDistance2=hammingDistance;} 415 if(qHammingDistance2==-1){qHammingDistance2=qHammingDistance;} 416 if(editDistance2==-1){editDistance2=editDistance;} 417 minBaseFrequency=minBaseFrequency_; 418 419 {//Process parser fields 420 Parser.processQuality(); 421 422 maxReads=parser.maxReads; 423 samplerate=parser.samplerate; 424 sampleseed=parser.sampleseed; 425 recalibrateQuality=parser.recalibrateQuality; 426 427 overwrite=ReadStats.overwrite=parser.overwrite; 428 append=ReadStats.append=parser.append; 429 // testsize=parser.testsize; 430 // trimBadSequence=parser.trimBadSequence; 431 // breakLength=parser.breakLength; 432 433 forceTrimModulo=parser.forceTrimModulo; 434 forceTrimLeft=parser.forceTrimLeft; 435 forceTrimRight=parser.forceTrimRight; 436 forceTrimRight2=parser.forceTrimRight2; 437 qtrimLeft=parser.qtrimLeft; 438 qtrimRight=parser.qtrimRight; 439 trimq=parser.trimq; 440 trimE=parser.trimE(); 441 minLenFraction=parser.minLenFraction; 442 minAvgQuality=parser.minAvgQuality; 443 minAvgQualityBases=parser.minAvgQualityBases; 444 chastityFilter=parser.chastityFilter; 445 failBadBarcodes=parser.failBadBarcodes; 446 removeBadBarcodes=parser.removeBadBarcodes; 447 failIfNoBarcode=parser.failIfNoBarcode; 448 barcodes=parser.barcodes; 449 minReadLength=parser.minReadLength; 450 maxReadLength=parser.maxReadLength; 451 maxNs=parser.maxNs; 452 minConsecutiveBases=parser.minConsecutiveBases; 453 // untrim=parser.untrim; 454 // minTrimLength=(parser.minTrimLength>=0 ? parser.minTrimLength : minTrimLength); 455 // requireBothBad=parser.requireBothBad; 456 removePairsIfEitherBad=!parser.requireBothBad; 457 458 minGC=parser.minGC; 459 maxGC=parser.maxGC; 460 filterGC=(minGC>0 || maxGC<1); 461 usePairGC=parser.usePairGC; 462 463 loglog=(parser.loglog ? CardinalityTracker.makeTracker(parser) : null); 464 465 THREADS=Shared.threads(); 466 } 467 468 if(prealloc_){ 469 System.err.println("Note - if this program runs out of memory, please disable the prealloc flag."); 470 } 471 472 if(minoverlap_>=0){ 473 minOverlap=Tools.max(minoverlap_, 1); 474 minOverlap0=Tools.min(minOverlap0, minOverlap); 475 } 476 477 if(mininsert_>=0){ 478 minInsert=Tools.max(mininsert_, 1); 479 minInsert0=Tools.min(minInsert0, minInsert); 480 } 481 482 if(refFilter!=null || literalFilter!=null){kfilter_=true;} 483 if(refMask!=null || literalMask!=null){ktrimN_=true;} 484 if(refRight!=null || literalRight!=null){ktrimRight_=true;} 485 if(refLeft!=null || literalLeft!=null){ktrimLeft_=true;} 486 487 if(ref!=null){ 488 if(!ktrimN_ && !ktrimRight_ && !ktrimLeft_){kfilter_=true;} 489 if(kfilter_ && refFilter==null){ 490 refFilter=ref; 491 }else if(ktrimN_ && refMask==null){ 492 refMask=ref; 493 }else if(ktrimRight_ && refRight==null){ 494 refRight=ref; 495 }else if(ktrimLeft_ && refLeft==null){ 496 refLeft=ref; 497 } 498 } 499 if(literal!=null){ 500 if(!ktrimN_ && !ktrimRight_ && !ktrimLeft_){kfilter_=true;} 501 if(kfilter_ && literalFilter==null){ 502 literalFilter=literal; 503 }else if(ktrimN_ && literalMask==null){ 504 literalMask=literal; 505 }else if(ktrimRight_ && literalRight==null){ 506 literalRight=literal; 507 }else if(ktrimLeft_ && literalLeft==null){ 508 literalLeft=literal; 509 } 510 } 511 512 kfilter_=(refFilter!=null || literalFilter!=null); 513 ktrimN_=(refMask!=null || literalMask!=null); 514 ktrimRight_=(refRight!=null || literalRight!=null); 515 ktrimLeft_=(refLeft!=null || literalLeft!=null); 516 517 kfilter=kfilter_; 518 ktrimN=ktrimN_; 519 ktrimRight=ktrimRight_; 520 ktrimLeft=ktrimLeft_; 521 ktrimExclusive=ktrimExclusive_; 522 findBestMatch=findBestMatch_; 523 524 if(refFilter!=null){ 525 for(String s : refFilter){refNames.add(s);} 526 } 527 if(literalFilter!=null){refNames.add("literal");} 528 refScafCounts=new int[refNames.size()]; 529 530 /* Set final variables; post-process and validate argument combinations */ 531 532 useForest=useForest_; 533 useTable=useTable_; 534 useArray=useArray_; 535 hammingDistance=Tools.max(editDistance, hammingDistance); 536 hammingDistance2=Tools.max(editDistance2, hammingDistance2); 537 minSkip=Tools.max(1, Tools.min(minSkip, maxSkip)); 538 maxSkip=Tools.max(minSkip, maxSkip); 539 addTrimmedToBad=addTrimmedToBad_; 540 rcomp=rcomp_; 541 forbidNs=(forbidNs_ || hammingDistance<1); 542 trimSymbol=TRIM_SYMBOL_; 543 kmaskLowercase=kmaskLowercase_; 544 skipreads=skipreads_; 545 trimByOverlap=trimByOverlap_; 546 useQualityForOverlap=useQualityForOverlap_; 547 strictOverlap=strictOverlap_; 548 trimPairsEvenly=trimPairsEvenly_; 549 ordered=ordered_; 550 restrictLeft=Tools.max(restrictLeft_, 0); 551 restrictRight=Tools.max(restrictRight_, 0); 552 printNonZeroOnly=printNonZeroOnly_; 553 rename=rename_; 554 useRefNames=useRefNames_; 555 speed=speed_; 556 qSkip=qSkip_; 557 noAccel=(speed<1 && qSkip<2); 558 skipR1=skipr1_; 559 skipR2=skipr2_; 560 ecc=ecc_; 561 562 if(strictOverlap){ 563 maxRatio=0.05f; 564 ratioMargin=9f; 565 ratioOffset=0.5f; 566 efilterRatio=3.5f; 567 efilterOffset=0.05f; 568 pfilterRatio=0.001f; 569 meeFilter=15f; 570 }else{ 571 maxRatio=0.10f; 572 ratioMargin=5f; 573 ratioOffset=0.4f; 574 efilterRatio=6f; 575 efilterOffset=0.05f; 576 pfilterRatio=0.00005f; 577 meeFilter=999999999; 578 } 579 580 MAKE_QUALITY_HISTOGRAM=ReadStats.COLLECT_QUALITY_STATS; 581 MAKE_QUALITY_ACCURACY=ReadStats.COLLECT_QUALITY_ACCURACY; 582 MAKE_MATCH_HISTOGRAM=ReadStats.COLLECT_MATCH_STATS; 583 MAKE_BASE_HISTOGRAM=ReadStats.COLLECT_BASE_STATS; 584 MAKE_EHIST=ReadStats.COLLECT_ERROR_STATS; 585 MAKE_INDELHIST=ReadStats.COLLECT_INDEL_STATS; 586 MAKE_LHIST=ReadStats.COLLECT_LENGTH_STATS; 587 MAKE_GCHIST=ReadStats.COLLECT_GC_STATS; 588 MAKE_IDHIST=ReadStats.COLLECT_IDENTITY_STATS; 589 590 { 591 long usableMemory; 592 long tableMemory; 593 long numTables=Tools.max(1, (kfilter ? 1 : 0)+(ktrimLeft ? 1 : 0)+(ktrimRight ? 1 : 0)+(ktrimN ? 1 : 0)); 594 595 { 596 long memory=Runtime.getRuntime().maxMemory(); 597 double xmsRatio=Shared.xmsRatio(); 598 usableMemory=(long)Tools.max(((memory-96000000-(20*400000 /* for atomic arrays */))*(xmsRatio>0.97 ? 0.82 : 0.72)), memory*0.45); 599 tableMemory=(long)(usableMemory*.95); 600 } 601 602 if(initialSize<1){ 603 final long memOverWays=tableMemory/(12*WAYS*numTables); 604 final double mem2=(prealloc_ ? preallocFraction : 1)*tableMemory; 605 initialSize=(prealloc_ || memOverWays<initialSizeDefault ? (int)Tools.min(2142000000, (long)(mem2/(12*WAYS*numTables))) : initialSizeDefault); 606 if(initialSize!=initialSizeDefault){ 607 System.err.println("Initial size set to "+initialSize); 608 } 609 } 610 } 611 612 if(ktrimLeft_ || ktrimRight_ || ktrimN_){ 613 if(kbig_>k_){ 614 System.err.println("*********************** WARNING ***********************"); 615 System.err.println("WARNING: When kmer-trimming, the maximum value of K is "+k_+"."); 616 System.err.println("K has been reduced from "+kbig_+" to "+k_+"."); 617 System.err.println("***********************************************************"); 618 kbig_=k_; 619 } 620 } 621 622 if((speed>0 || qSkip>1) && kbig_>k_){ 623 System.err.println("*********************** WARNING ***********************"); 624 System.err.println("WARNING: When speed>0 or qskip>1, the maximum value of K is "+k_+"."); 625 System.err.println("K has been reduced from "+kbig_+" to "+k_+"."); 626 System.err.println("***********************************************************"); 627 kbig_=k_; 628 } 629 630 if((speed>0 && qSkip>1) || (qSkip>1 && maxSkip>1) || (speed>0 && maxSkip>1)){ 631 System.err.println("WARNING: It is not recommended to use more than one of 'qskip', 'speed', and 'rskip/maxskip' together."); 632 System.err.println("qskip="+qSkip+", speed="+speed+", maxskip="+maxSkip); 633 } 634 635 k=k_; 636 k2=k-1; 637 kbig=kbig_; 638 if(kbig>k){ 639 minSkip=maxSkip=0; 640 if(maskMiddle){ 641 System.err.println("maskMiddle was disabled because kbig>k"); 642 maskMiddle=false; 643 } 644 } 645 mink=Tools.min((mink_<1 ? 6 : mink_), k); 646 maxBadKmers=maxBadKmers_; 647 if(mink_>0 && mink_<k){useShortKmers=true;} 648 if(useShortKmers){ 649 if(maskMiddle){ 650 System.err.println("maskMiddle was disabled because useShortKmers=true"); 651 maskMiddle=false; 652 } 653 } 654 assert(findBestMatch==false || kfilter==false || kbig<=k) : "K must be less than 32 in 'findBestMatch' mode"; 655 656 assert(!useShortKmers || ktrimRight || ktrimLeft || ktrimN) : "\nSetting mink or useShortKmers also requires setting a ktrim mode, such as 'r', 'l', or 'n'\n"; 657 658 middleMask=maskMiddle ? ~(3L<<(2*(k/2))) : -1L; 659 660 if(kfilter || ktrimN || ktrimRight || ktrimLeft){ 661 System.err.println("k="+k); 662 if(kbig>k){System.err.println("kbig="+kbig);} 663 if(mink<k && (ktrimN || ktrimRight || ktrimLeft)){System.err.println("mink="+mink);} 664 if(maskMiddle){System.err.println("maskMiddle="+maskMiddle);} 665 if(hammingDistance>0){System.err.println("hamming distance="+hammingDistance);} 666 if(editDistance>0){System.err.println("edit distance="+editDistance);} 667 } 668 if(kfilter){printFilterPlan("kfiltering", refFilter, literalFilter);} 669 if(ktrimN){printFilterPlan(((char)trimSymbol)+"-masking", refMask, literalMask);} 670 if(ktrimRight){printFilterPlan("right-ktrimming", refRight, literalRight);} 671 if(ktrimLeft){printFilterPlan("left-ktrimming", refLeft, literalLeft);} 672 if(qtrimRight || qtrimLeft){ 673 System.err.println("quality-trimming "+((qtrimRight && qtrimLeft) ? "both ends" : qtrimRight ? "right end" : " left end")+" to Q"+trimq); 674 } 675 System.err.println(); 676 677 hitCounts=(outduk==null ? null : new long[HITCOUNT_LEN+1]); 678 679 680 /* Adjust I/O settings and filenames */ 681 682 assert(FastaReadInputStream.settingsOK()); 683 684 if(in1==null){throw new RuntimeException("Error - at least one input file is required.");} 685 686 if(in1!=null && in1.contains("#") && !new File(in1).exists()){ 687 int pound=in1.lastIndexOf('#'); 688 String a=in1.substring(0, pound); 689 String b=in1.substring(pound+1); 690 in1=a+1+b; 691 in2=a+2+b; 692 } 693 if(in2!=null && (in2.contains("=") || in2.equalsIgnoreCase("null"))){in2=null;} 694 if(in2!=null){ 695 if(FASTQ.FORCE_INTERLEAVED){System.err.println("Reset INTERLEAVED to false because paired input files were specified.");} 696 FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false; 697 } 698 699 if(qfin1!=null && qfin1.contains("#") && in2!=null && !new File(qfin1).exists()){ 700 int pound=qfin1.lastIndexOf('#'); 701 String a=qfin1.substring(0, pound); 702 String b=qfin1.substring(pound+1); 703 qfin1=a+1+b; 704 qfin2=a+2+b; 705 } 706 707 if(out1!=null && out1.contains("#")){ 708 int pound=out1.lastIndexOf('#'); 709 String a=out1.substring(0, pound); 710 String b=out1.substring(pound+1); 711 out1=a+1+b; 712 out2=a+2+b; 713 } 714 715 if(outb1!=null && outb1.contains("#")){ 716 int pound=outb1.lastIndexOf('#'); 717 String a=outb1.substring(0, pound); 718 String b=outb1.substring(pound+1); 719 outb1=a+1+b; 720 outb2=a+2+b; 721 } 722 723 if((out2!=null || outb2!=null) && (in1!=null && in2==null)){ 724 if(!FASTQ.FORCE_INTERLEAVED){System.err.println("Forcing interleaved input because paired output was specified for a single input file.");} 725 FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=true; 726 } 727 728 if(!setOut){ 729 System.err.println("No output stream specified. To write to stdout, please specify 'out=stdout.fq' or similar."); 730 // out1="stdout"; 731 // outstream=System.err; 732 // out2=null; 733 out1=out2=null; 734 }else if("stdout".equalsIgnoreCase(out1) || "standarddout".equalsIgnoreCase(out1)){ 735 out1="stdout.fq"; 736 outstream=System.err; 737 out2=null; 738 } 739 740 if(!Tools.testOutputFiles(overwrite, append, false, out1, out2, outb1, outb2, outsingle, outstats, outrpkm, outduk, outrqc, outrefstats)){ 741 throw new RuntimeException("\nCan't write to some output files; overwrite="+overwrite+"\n"); 742 } 743 if(!Tools.testInputFiles(false, true, in1, in2, qfin1, qfin2)){ 744 throw new RuntimeException("\nCan't read some input files.\n"); 745 } 746 if(!Tools.testInputFiles(true, true, refFilter, refMask, refRight, refLeft)){ 747 throw new RuntimeException("\nCan't read from some reference files.\n"); 748 } 749 if(!Tools.testForDuplicateFiles(true, in1, in2, qfin1, qfin2, out1, out2, outb1, outb2, outsingle, outstats, outrpkm, outduk, outrqc, outrefstats)){ 750 throw new RuntimeException("\nSome file names were specified multiple times.\n"); 751 } 752 753 assert(THREADS>0) : "THREADS must be greater than 0."; 754 755 assert(in1==null || in1.toLowerCase().startsWith("stdin") || in1.toLowerCase().startsWith("standardin") || new File(in1).exists()) : "Can't find "+in1; 756 assert(in2==null || in2.toLowerCase().startsWith("stdin") || in2.toLowerCase().startsWith("standardin") || new File(in2).exists()) : "Can't find "+in2; 757 758 if(!(kfilter || ktrimN || ktrimRight || ktrimLeft || qtrimLeft || qtrimRight || minAvgQuality>0 || maxNs>=0 || trimByOverlap || 759 MAKE_QUALITY_HISTOGRAM || MAKE_MATCH_HISTOGRAM || MAKE_BASE_HISTOGRAM || MAKE_QUALITY_ACCURACY || 760 MAKE_EHIST || MAKE_INDELHIST || MAKE_LHIST || MAKE_GCHIST || MAKE_IDHIST || 761 forceTrimLeft>0 || forceTrimRight>0 || forceTrimModulo>0 || minBaseFrequency>0 || recalibrateQuality)){ 762 System.err.println("NOTE: No reference files specified, no trimming mode, no min avg quality, no histograms - read sequences will not be changed."); 763 } 764 765 if(recalibrateQuality){ 766 SamLine.SET_FROM_OK=true;//TODO: Should ban other operations 767 } 768 769 if(ref!=null){ 770 for(String s0 : ref){ 771 assert(s0!=null) : "Specified a null reference."; 772 String s=s0.toLowerCase(); 773 assert(s==null || s.startsWith("stdin") || s.startsWith("standardin") || new File(s0).exists()) : "Can't find "+s0; 774 } 775 } 776 777 //Initialize tables 778 final int tableType=(useForest ? AbstractKmerTable.FOREST1D : useTable ? AbstractKmerTable.TABLE : useArray ? AbstractKmerTable.ARRAY1D : 0); 779 ScheduleMaker scheduleMaker=new ScheduleMaker(WAYS, 12, prealloc_, (prealloc_ ? preallocFraction : 0.7)); 780 int[] schedule=scheduleMaker.makeSchedule(); 781 if(kfilter){ 782 filterMaps=AbstractKmerTable.preallocate(WAYS, tableType, schedule, -1L); 783 }else{filterMaps=null;} 784 if(ktrimN){ 785 maskMaps=AbstractKmerTable.preallocate(WAYS, tableType, schedule, -1L); 786 }else{maskMaps=null;} 787 if(ktrimRight){ 788 trimRightMaps=AbstractKmerTable.preallocate(WAYS, tableType, schedule, -1L); 789 }else{trimRightMaps=null;} 790 if(ktrimLeft){ 791 trimLeftMaps=AbstractKmerTable.preallocate(WAYS, tableType, schedule, -1L); 792 }else{trimLeftMaps=null;} 793 794 //Initialize entropy 795 calcEntropy=(entropyCutoff>0); 796 if(calcEntropy){ 797 assert(entropyWindow>0 && entropyCutoff>=0 && entropyCutoff<=1); 798 entropy=new double[entropyWindow+2]; 799 final double mult=1d/entropyWindow; 800 for(int i=0; i<entropy.length; i++){ 801 double pk=i*mult; 802 entropy[i]=pk*Math.log(pk); 803 } 804 entropyMult=-1/Math.log(entropyWindow); 805 entropyKmerspace=(1<<(2*entropyK)); 806 }else{ 807 entropy=null; 808 entropyMult=0; 809 entropyKmerspace=1; 810 } 811 } 812 813 printFilterPlan(String action, String[] refs, String[] lits)814 private static void printFilterPlan(String action, String[] refs, String[] lits){ 815 String and=(refs!=null && lits!=null ? " and " : ""); 816 String s1=(lits==null || lits.length!=1 ? "s" : ""); 817 String s2=(refs==null || refs.length!=1 ? "s" : ""); 818 System.err.println(action+" using "+(lits!=null ? lits.length+" literal"+s1 : "")+and+(refs!=null ? refs.length+" reference"+s2 : "")+"."); 819 } 820 821 822 /*--------------------------------------------------------------*/ 823 /*---------------- Outer Methods ----------------*/ 824 /*--------------------------------------------------------------*/ 825 826 process()827 public void process(){ 828 829 if(recalibrateQuality){ 830 if(samFile!=null){ 831 CalcTrueQuality.main2(new String[] {"in="+samFile, "showstats=f"}); 832 } 833 CalcTrueQuality.initializeMatrices(); 834 } 835 836 /* Check for output file collisions */ 837 if(!Tools.testOutputFiles(overwrite, append, false, out1, out2, outb1, outb2, outstats, outrpkm, outduk, outrqc, outrefstats)){ 838 throw new RuntimeException("One or more output files were duplicate or could not be written to. Check the names or set the 'overwrite=true' flag."); 839 } 840 841 /* Start overall timer */ 842 Timer t=new Timer(); 843 844 // boolean dq0=FASTQ.DETECT_QUALITY; 845 // boolean ti0=FASTQ.TEST_INTERLEAVED; 846 // int rbl0=Shared.bufferLen();; 847 // FASTQ.DETECT_QUALITY=false; 848 // FASTQ.TEST_INTERLEAVED=false; 849 // Shared.setBufferLen(16; 850 851 process2(t.time1); 852 853 // FASTQ.DETECT_QUALITY=dq0; 854 // FASTQ.TEST_INTERLEAVED=ti0; 855 // Shared.setBufferLen(rbl0; 856 857 /* Stop timer and calculate speed statistics */ 858 t.stop(); 859 860 861 if(showSpeed){ 862 outstream.println(); 863 outstream.println(Tools.timeReadsBasesProcessed(t, readsIn, basesIn, 8)); 864 } 865 866 /* Throw an exception if errors were detected */ 867 if(errorState){ 868 throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt."); 869 } 870 } 871 872 process2(long startTime)873 public void process2(long startTime){ 874 875 /* Start phase timer */ 876 Timer t=new Timer(); 877 878 if(DISPLAY_PROGRESS){ 879 outstream.println("Initial:"); 880 Shared.printMemory(); 881 outstream.println(); 882 } 883 884 /* Fill tables with reference kmers */ 885 { 886 final boolean oldTI=FASTQ.TEST_INTERLEAVED; //TODO: This needs to be changed to a non-static field, or somehow 'read mode' and 'ref mode' need to be distinguished. 887 final boolean oldFI=FASTQ.FORCE_INTERLEAVED; 888 final boolean oldSplit=FastaReadInputStream.SPLIT_READS; 889 final int oldML=FastaReadInputStream.MIN_READ_LEN; 890 891 FASTQ.TEST_INTERLEAVED=false; 892 FASTQ.FORCE_INTERLEAVED=false; 893 FastaReadInputStream.SPLIT_READS=false; 894 FastaReadInputStream.MIN_READ_LEN=1; 895 896 if(kfilter){ 897 storedKmersFilter=spawnLoadThreads(refFilter, literalFilter, filterMaps, true); 898 } 899 if(ktrimN){ 900 storedKmersMask=spawnLoadThreads(refMask, literalMask, maskMaps, false); 901 } 902 if(ktrimRight){ 903 storedKmersRight=spawnLoadThreads(refRight, literalRight, trimRightMaps, false); 904 } 905 if(ktrimLeft){ 906 storedKmersLeft=spawnLoadThreads(refLeft, literalLeft, trimLeftMaps, false); 907 } 908 909 FASTQ.TEST_INTERLEAVED=oldTI; 910 FASTQ.FORCE_INTERLEAVED=oldFI; 911 FastaReadInputStream.SPLIT_READS=oldSplit; 912 FastaReadInputStream.MIN_READ_LEN=oldML; 913 914 if(useRefNames){toRefNames();} 915 t.stop(); 916 } 917 918 { 919 long ram=freeMemory(); 920 ALLOW_LOCAL_ARRAYS=(scaffoldNames!=null && Tools.max(THREADS, 1)*3*8*scaffoldNames.size()<ram*5); 921 } 922 923 /* Dump kmers to text */ 924 if(dump!=null && filterMaps!=null){ 925 ByteStreamWriter bsw=new ByteStreamWriter(dump, overwrite, false, true); 926 bsw.start(); 927 for(AbstractKmerTable set : filterMaps){ 928 set.dumpKmersAsBytes(bsw, k, 0, Integer.MAX_VALUE, null); 929 } 930 bsw.poisonAndWait(); 931 } 932 933 final boolean vic=Read.VALIDATE_IN_CONSTRUCTOR; 934 Read.VALIDATE_IN_CONSTRUCTOR=THREADS<4; 935 936 /* Do kmer matching of input reads */ 937 spawnProcessThreads(t); 938 939 Read.VALIDATE_IN_CONSTRUCTOR=vic; 940 941 /* Write legacy duk statistics (which requires tables) */ 942 writeDuk(System.nanoTime()-startTime); 943 944 /* Unload kmers to save memory */ 945 if(RELEASE_TABLES){unloadKmers();} 946 947 /* Write statistics to files */ 948 writeStats(); 949 writeRPKM(); 950 writeRefStats(); 951 writeRqc(); 952 953 /* Unload sequence data to save memory */ 954 if(RELEASE_TABLES){unloadScaffolds();} 955 956 outstream.println("\nInput: \t"+readsIn+" reads \t\t"+basesIn+" bases."); 957 958 if(kfilter){ 959 outstream.println("Contaminants: \t"+readsKFiltered+" reads ("+toPercent(readsKFiltered, readsIn)+") \t"+ 960 basesKFiltered+" bases ("+toPercent(basesKFiltered, basesIn)+")"); 961 outstream.flush(); 962 } 963 if(qtrimLeft || qtrimRight){ 964 outstream.println("QTrimmed: \t"+readsQTrimmed+" reads ("+toPercent(readsQTrimmed, readsIn)+") \t"+ 965 basesQTrimmed+" bases ("+toPercent(basesQTrimmed, basesIn)+")"); 966 } 967 if(forceTrimLeft>0 || forceTrimRight>0 || forceTrimRight2>0 || forceTrimModulo>0){ 968 outstream.println("FTrimmed: \t"+readsFTrimmed+" reads ("+toPercent(readsFTrimmed, readsIn)+") \t"+ 969 basesFTrimmed+" bases ("+toPercent(basesFTrimmed, basesIn)+")"); 970 } 971 if(ktrimLeft || ktrimRight || ktrimN){ 972 outstream.println("KTrimmed: \t"+readsKTrimmed+" reads ("+toPercent(readsKTrimmed, readsIn)+") \t"+ 973 basesKTrimmed+" bases ("+toPercent(basesKTrimmed, basesIn)+")"); 974 } 975 if(trimByOverlap){ 976 outstream.println("Trimmed by overlap: \t"+readsTrimmedByOverlap+" reads ("+toPercent(readsTrimmedByOverlap, readsIn)+") \t"+ 977 basesTrimmedByOverlap+" bases ("+toPercent(basesTrimmedByOverlap, basesIn)+")"); 978 } 979 if(filterGC){ 980 outstream.println("Filtered by GC: \t"+badGcReads+" reads ("+toPercent(badGcReads, readsIn)+") \t"+ 981 badGcBases+" bases ("+toPercent(badGcBases, basesIn)+")"); 982 } 983 if(minAvgQuality>0 || maxNs>=0 || minBaseFrequency>0 || chastityFilter || removeBadBarcodes){ 984 outstream.println("Low quality discards: \t"+readsQFiltered+" reads ("+toPercent(readsQFiltered, readsIn)+") \t"+ 985 basesQFiltered+" bases ("+toPercent(basesQFiltered, basesIn)+")"); 986 } 987 if(calcEntropy){ 988 outstream.println("Low entropy discards: \t"+readsEFiltered+" reads ("+toPercent(readsEFiltered, readsIn)+") \t"+ 989 basesEFiltered+" bases ("+toPercent(basesEFiltered, basesIn)+")"); 990 } 991 992 final long readsRemoved=readsIn-readsOut; 993 final long basesRemoved=basesIn-basesOut; 994 995 outstream.println("Total Removed: \t"+readsRemoved+" reads ("+toPercent(readsRemoved, readsIn)+") \t"+ 996 basesRemoved+" bases ("+toPercent(basesRemoved, basesIn)+")"); 997 998 outstream.println("Result: \t"+readsOut+" reads ("+toPercent(readsOut, readsIn)+") \t"+ 999 basesOut+" bases ("+toPercent(basesOut, basesIn)+")"); 1000 1001 if(loglog!=null){ 1002 outstream.println("Unique "+loglog.k+"-mers: \t"+loglog.cardinality()); 1003 } 1004 } 1005 toPercent(long numerator, long denominator)1006 private static String toPercent(long numerator, long denominator){ 1007 if(denominator<1){return "0.00%";} 1008 return String.format(Locale.ROOT, "%.2f%%",numerator*100.0/denominator); 1009 } 1010 1011 /** 1012 * Clear stored kmers. 1013 */ unloadKmers()1014 public void unloadKmers(){ 1015 for(AbstractKmerTable[] akta : new AbstractKmerTable[][] {filterMaps, maskMaps, trimRightMaps, trimLeftMaps}){ 1016 if(akta!=null){ 1017 for(int i=0; i<akta.length; i++){akta[i]=null;} 1018 } 1019 } 1020 } 1021 1022 /** 1023 * Clear stored sequence data. 1024 */ unloadScaffolds()1025 public void unloadScaffolds(){ 1026 if(scaffoldNames!=null && !scaffoldNames.isEmpty()){ 1027 scaffoldNames.clear(); 1028 scaffoldNames.trimToSize(); 1029 } 1030 scaffoldReadCounts=null; 1031 scaffoldBaseCounts=null; 1032 hitCounts=null; 1033 scaffoldLengths=null; 1034 } 1035 1036 /** 1037 * Write statistics about how many reads matched each reference scaffold. 1038 */ writeStats()1039 private void writeStats(){ 1040 if(outstats==null){return;} 1041 final TextStreamWriter tsw=new TextStreamWriter(outstats, overwrite, false, false); 1042 tsw.start(); 1043 1044 long rsum=0, bsum=0; 1045 1046 /* Create StringCount list of scaffold names and hitcounts */ 1047 ArrayList<StringCount> list=new ArrayList<StringCount>(); 1048 for(int i=1; i<scaffoldNames.size(); i++){ 1049 final long num1=scaffoldReadCounts.get(i), num2=scaffoldBaseCounts.get(i); 1050 if(num1>0 || !printNonZeroOnly){ 1051 rsum+=num1; 1052 bsum+=num2; 1053 final String s=scaffoldNames.get(i); 1054 final int len=scaffoldLengths.get(i); 1055 final StringCount sn=new StringCount(s, len, num1, num2); 1056 list.add(sn); 1057 } 1058 } 1059 Shared.sort(list); 1060 final double rmult=100.0/(readsIn>0 ? readsIn : 1); 1061 final double bmult=100.0/(basesIn>0 ? basesIn : 1); 1062 1063 tsw.print("#File\t"+in1+(in2==null ? "" : "\t"+in2)+"\n"); 1064 1065 if(STATS_COLUMNS==3){ 1066 tsw.print(String.format(Locale.ROOT, "#Total\t%d\n",readsIn)); 1067 tsw.print(String.format(Locale.ROOT, "#Matched\t%d\t%.5f%%\n",rsum,rmult*rsum)); 1068 tsw.print("#Name\tReads\tReadsPct\n"); 1069 for(int i=0; i<list.size(); i++){ 1070 StringCount sn=list.get(i); 1071 tsw.print(String.format(Locale.ROOT, "%s\t%d\t%.5f%%\n",sn.name,sn.reads,(sn.reads*rmult))); 1072 } 1073 }else{ 1074 tsw.print(String.format(Locale.ROOT, "#Total\t%d\t%d\n",readsIn,basesIn)); 1075 tsw.print(String.format(Locale.ROOT, "#Matched\t%d\t%.5f%%\n",rsum,rmult*rsum,bsum,bsum*bmult)); 1076 tsw.print("#Name\tReads\tReadsPct\tBases\tBasesPct\n"); 1077 for(int i=0; i<list.size(); i++){ 1078 StringCount sn=list.get(i); 1079 tsw.print(String.format(Locale.ROOT, "%s\t%d\t%.5f%%\t%d\t%.5f%%\n",sn.name,sn.reads,(sn.reads*rmult),sn.bases,(sn.bases*bmult))); 1080 } 1081 } 1082 tsw.poisonAndWait(); 1083 } 1084 1085 /** 1086 * Write RPKM statistics. 1087 */ writeRPKM()1088 private void writeRPKM(){ 1089 if(outrpkm==null){return;} 1090 final TextStreamWriter tsw=new TextStreamWriter(outrpkm, overwrite, false, false); 1091 tsw.start(); 1092 1093 /* Count mapped reads */ 1094 long mapped=0; 1095 for(int i=0; i<scaffoldReadCounts.length(); i++){ 1096 mapped+=scaffoldReadCounts.get(i); 1097 } 1098 1099 /* Print header */ 1100 tsw.print("#File\t"+in1+(in2==null ? "" : "\t"+in2)+"\n"); 1101 tsw.print(String.format(Locale.ROOT, "#Reads\t%d\n",readsIn)); 1102 tsw.print(String.format(Locale.ROOT, "#Mapped\t%d\n",mapped)); 1103 tsw.print(String.format(Locale.ROOT, "#RefSequences\t%d\n",Tools.max(0, scaffoldNames.size()-1))); 1104 tsw.print("#Name\tLength\tBases\tCoverage\tReads\tRPKM\n"); 1105 1106 final float mult=1000000000f/Tools.max(1, mapped); 1107 1108 /* Print data */ 1109 for(int i=1; i<scaffoldNames.size(); i++){ 1110 final long reads=scaffoldReadCounts.get(i); 1111 final long bases=scaffoldBaseCounts.get(i); 1112 final String s=scaffoldNames.get(i); 1113 final int len=scaffoldLengths.get(i); 1114 final double invlen=1.0/Tools.max(1, len); 1115 final double mult2=mult*invlen; 1116 if(reads>0 || !printNonZeroOnly){ 1117 tsw.print(String.format(Locale.ROOT, "%s\t%d\t%d\t%.4f\t%d\t%.4f\n",s,len,bases,bases*invlen,reads,reads*mult2)); 1118 } 1119 } 1120 tsw.poisonAndWait(); 1121 } 1122 1123 /** 1124 * Write statistics on a per-reference basis. 1125 */ writeRefStats()1126 private void writeRefStats(){ 1127 if(outrefstats==null){return;} 1128 final TextStreamWriter tsw=new TextStreamWriter(outrefstats, overwrite, false, false); 1129 tsw.start(); 1130 1131 /* Count mapped reads */ 1132 long mapped=0; 1133 for(int i=0; i<scaffoldReadCounts.length(); i++){ 1134 mapped+=scaffoldReadCounts.get(i); 1135 } 1136 1137 final int numRefs=refNames.size(); 1138 long[] refReadCounts=new long[numRefs]; 1139 long[] refBaseCounts=new long[numRefs]; 1140 long[] refLengths=new long[numRefs]; 1141 1142 for(int r=0, s=1; r<numRefs; r++){ 1143 final int lim=s+refScafCounts[r]; 1144 while(s<lim){ 1145 refReadCounts[r]+=scaffoldReadCounts.get(s); 1146 refBaseCounts[r]+=scaffoldBaseCounts.get(s); 1147 refLengths[r]+=scaffoldLengths.get(s); 1148 s++; 1149 } 1150 } 1151 1152 /* Print header */ 1153 tsw.print("#File\t"+in1+(in2==null ? "" : "\t"+in2)+"\n"); 1154 tsw.print(String.format(Locale.ROOT, "#Reads\t%d\n",readsIn)); 1155 tsw.print(String.format(Locale.ROOT, "#Mapped\t%d\n",mapped)); 1156 tsw.print(String.format(Locale.ROOT, "#References\t%d\n",Tools.max(0, refNames.size()))); 1157 tsw.print("#Name\tLength\tScaffolds\tBases\tCoverage\tReads\tRPKM\n"); 1158 1159 final float mult=1000000000f/Tools.max(1, mapped); 1160 1161 /* Print data */ 1162 for(int i=0; i<refNames.size(); i++){ 1163 final long reads=refReadCounts[i]; 1164 final long bases=refBaseCounts[i]; 1165 final long len=refLengths[i]; 1166 final int scafs=refScafCounts[i]; 1167 final String name=ReadWrite.stripToCore(refNames.get(i)); 1168 final double invlen=1.0/Tools.max(1, len); 1169 final double mult2=mult*invlen; 1170 if(reads>0 || !printNonZeroOnly){ 1171 tsw.print(String.format(Locale.ROOT, "%s\t%d\t%d\t%d\t%.4f\t%d\t%.4f\n",name,len,scafs,bases,bases*invlen,reads,reads*mult2)); 1172 } 1173 } 1174 tsw.poisonAndWait(); 1175 } 1176 1177 /** 1178 * Write processing statistics in DUK's format. 1179 * @param time Elapsed time, nanoseconds 1180 */ writeDuk(long time)1181 private void writeDuk(long time){ 1182 if(outduk==null){return;} 1183 final TextStreamWriter tsw=new TextStreamWriter(outduk, overwrite, false, false); 1184 tsw.start(); 1185 tsw.println(dukString(time, refFilter)); 1186 tsw.poisonAndWait(); 1187 } 1188 1189 /** 1190 * Write RQCFilter stats. 1191 * @param time Elapsed time, nanoseconds 1192 */ writeRqc()1193 private void writeRqc(){ 1194 if(outrqc==null){return;} 1195 addToRqcMap(); 1196 if(outrqc.endsWith("hashmap")){return;} 1197 final TextStreamWriter tsw=new TextStreamWriter(outrqc, overwrite, false, false); 1198 tsw.start(); 1199 tsw.println(rqcString()); 1200 tsw.poisonAndWait(); 1201 } 1202 rqcString()1203 public static String rqcString(){ 1204 if(RQC_MAP==null){return null;} 1205 StringBuilder sb=new StringBuilder(); 1206 1207 String[] keys=new String[] {"inputReads", "inputBases", "qtrimmedReads", "qtrimmedBases", "qfilteredReads", "qfilteredBases", 1208 "ktrimmedReads", "ktrimmedBases", "kfilteredReads", "kfilteredBases", "outputReads", "outputBases"}; 1209 1210 for(String key : keys){ 1211 String value=RQC_MAP.get(key); 1212 if(value!=null){ 1213 sb.append(key+"="+value+"\n"); 1214 } 1215 } 1216 1217 return sb.toString(); 1218 } 1219 addToRqcMap()1220 private void addToRqcMap(){ 1221 putRqc("inputReads", readsIn, false); 1222 putRqc("inputBases", basesIn, false); 1223 if(qtrimLeft || qtrimRight){ 1224 putRqc("qtrimmedReads", readsQTrimmed, false); 1225 putRqc("qtrimmedBases", basesQTrimmed, false); 1226 } 1227 putRqc("qfilteredReads", readsQFiltered, false); 1228 putRqc("qfilteredBases", basesQFiltered, false); 1229 1230 if(ktrimLeft || ktrimRight || ktrimN){ 1231 putRqc("ktrimmedReads", readsKTrimmed, true); 1232 putRqc("ktrimmedBases", basesKTrimmed, true); 1233 }else{ 1234 putRqc("kfilteredReads", readsKFiltered, false); 1235 putRqc("kfilteredBases", basesKFiltered, false); 1236 } 1237 putRqc("outputReads", readsOut, true); 1238 putRqc("outputBases", basesOut, true); 1239 } 1240 putRqc(String key, long value, boolean evict)1241 private static void putRqc(String key, long value, boolean evict){putRqc(key, value+"", evict);} 1242 putRqc(String key, String value, boolean evict)1243 private static void putRqc(String key, String value, boolean evict){ 1244 if(RQC_MAP==null){RQC_MAP=new HashMap<String,String>();} 1245 if(evict || !RQC_MAP.containsKey(key)){RQC_MAP.put(key, value);} 1246 } 1247 1248 /** 1249 * Helper method; formats statistics to be duk-compatible 1250 * @param time Elapsed time, nanoseconds 1251 * @return duk output string 1252 */ dukString(long time, String[] ref)1253 private String dukString(long time, String[] ref){ 1254 StringBuilder sb=new StringBuilder(); 1255 sb.append("##INPUT PARAMETERS##\n"); 1256 sb.append("#Reference file: "+(ref==null || ref.length<1 ? null : ref.length==1 ? ref[0] : Arrays.toString(ref))+"\n"); 1257 sb.append("#Query file: "+in1+(in2==null ? "" : ","+in2)+"\n"); 1258 sb.append("#Not matched reads file: "+out1+(out2==null ? "" : ","+out2)+"\n"); 1259 sb.append("#Matched reads file: "+outb1+(outb2==null ? "" : ","+outb2)+"\n"); 1260 sb.append("#Output file (duk): "+outduk+"\n"); 1261 sb.append("#Output file (stats): "+outstats+"\n"); 1262 sb.append("#Mer size: "+k+"\n"); 1263 long size=0; 1264 if(kfilter){ 1265 for(AbstractKmerTable x : filterMaps){size+=x.size();} 1266 } 1267 if(ktrimN){ 1268 for(AbstractKmerTable x : maskMaps){size+=x.size();} 1269 } 1270 if(ktrimRight){ 1271 for(AbstractKmerTable x : trimRightMaps){size+=x.size();} 1272 } 1273 if(ktrimLeft){ 1274 for(AbstractKmerTable x : trimLeftMaps){size+=x.size();} 1275 } 1276 sb.append("#Avg step size: "+String.format(Locale.ROOT, "%.1f", refKmers/(double)(Tools.max(1, size)))+"\n"); 1277 sb.append("#Cut off: "+maxBadKmers+"\n"); 1278 sb.append("#Mask middle: "+maskMiddle+"\n"); 1279 sb.append("#Quality trim: "+((qtrimLeft || qtrimRight) ? trimq : "false")+"\n"); 1280 sb.append("\n"); 1281 1282 sb.append("##REFERENCE STAT##\n"); 1283 sb.append("#Total Reads: "+refReads+"\n"); 1284 sb.append("#Total Bases: "+refBases+"\n"); 1285 sb.append("#Total kmers: "+refKmers+"\n"); 1286 sb.append("#Total stored kmers: "+size+"\n"); 1287 sb.append("\n"); 1288 1289 sb.append("## ELAPSED TIME##\n"); 1290 sb.append("# Time: "+String.format(Locale.ROOT, "%.2f", time/1000000000.0)+" seconds\n"); 1291 sb.append("\n"); 1292 1293 sb.append("##QUERY FILE STAT##\n"); 1294 sb.append("# Total number of reads: "+readsIn+"\n"); 1295 sb.append("# Total number of matched reads: "+readsKFiltered+"\n"); 1296 sb.append("# Match ratio: "+String.format(Locale.ROOT, "%.6f", readsKFiltered*1.0/readsIn)+"\n"); 1297 sb.append("\n"); 1298 1299 sb.append("##P-VALUE##\n"); 1300 sb.append("#Avg number of Kmer for each read: "+((basesIn/(Tools.max(readsIn, 1)))-k)+"\n"); 1301 // sb.append("# P value for the given threshold 1 is 4.05231e-14\n"); //duk prints a P value; not sure what it means 1302 sb.append("\n"); 1303 1304 sb.append("## Histogram of kmer occurance for reads with at least one occurance ##\n"); 1305 sb.append("#NumOcc\tNumReads\tPercentage\n"); 1306 1307 long sum=Tools.sum(hitCounts); 1308 double mult=100.0/(sum<1 ? 1 : sum); 1309 for(int i=0; i<hitCounts.length; i++){ 1310 long x=hitCounts[i]; 1311 if(x>0){ 1312 sb.append(i).append('\t').append(x).append('\t').append(String.format(Locale.ROOT, "%.4f",(x*mult))).append('\n'); 1313 } 1314 } 1315 1316 return sb.toString(); 1317 } 1318 1319 /** 1320 * Fills the scaffold names array with reference names. 1321 */ toRefNames()1322 private void toRefNames(){ 1323 final int numRefs=refNames.size(); 1324 for(int r=0, s=1; r<numRefs; r++){ 1325 final int scafs=refScafCounts[r]; 1326 final int lim=s+scafs; 1327 final String name=ReadWrite.stripToCore(refNames.get(r)); 1328 // System.err.println("r="+r+", s="+s+", scafs="+scafs+", lim="+lim+", name="+name); 1329 while(s<lim){ 1330 // System.err.println(r+", "+s+". Setting "+scaffoldNames.get(s)+" -> "+name); 1331 scaffoldNames.set(s, name); 1332 s++; 1333 } 1334 } 1335 } 1336 1337 1338 /*--------------------------------------------------------------*/ 1339 /*---------------- Inner Methods ----------------*/ 1340 /*--------------------------------------------------------------*/ 1341 1342 1343 /** 1344 * Fills tables with kmers from references, using multiple LoadThread. 1345 * @return Number of kmers stored. 1346 */ spawnLoadThreads(String[] ref, String[] literal, AbstractKmerTable[] maps, boolean countRef)1347 private long spawnLoadThreads(String[] ref, String[] literal, AbstractKmerTable[] maps, boolean countRef){ 1348 Timer t=new Timer(); 1349 if((ref==null || ref.length<1) && (literal==null || literal.length<1)){return 0;} 1350 long added=0; 1351 1352 /* Create load threads */ 1353 LoadThread[] loaders=new LoadThread[WAYS]; 1354 for(int i=0; i<loaders.length; i++){ 1355 loaders[i]=new LoadThread(i, maps[i]); 1356 loaders[i].start(); 1357 } 1358 1359 /* For each reference file... */ 1360 int refNum=0; 1361 if(ref!=null){ 1362 for(String refname : ref){ 1363 1364 /* Start an input stream */ 1365 FileFormat ff=FileFormat.testInput(refname, FileFormat.FASTA, null, false, true); 1366 ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(-1L, false, ff, null, null, null, Shared.USE_MPI, true); 1367 cris.start(); //4567 1368 ListNum<Read> ln=cris.nextList(); 1369 ArrayList<Read> reads=(ln!=null ? ln.list : null); 1370 1371 /* Iterate through read lists from the input stream */ 1372 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning 1373 { 1374 /* Assign a unique ID number to each scaffold */ 1375 ArrayList<Read> reads2=new ArrayList<Read>(reads); 1376 for(Read r1 : reads2){ 1377 final Read r2=r1.mate; 1378 final Integer id=scaffoldNames.size(); 1379 if(countRef){refScafCounts[refNum]++;} 1380 scaffoldNames.add(r1.id==null ? id.toString() : r1.id); 1381 int len=r1.length(); 1382 r1.obj=id; 1383 if(r2!=null){ 1384 r2.obj=id; 1385 len+=r2.length(); 1386 } 1387 scaffoldLengths.add(len); 1388 } 1389 1390 if(REPLICATE_AMBIGUOUS){ 1391 reads2=Tools.replicateAmbiguous(reads2, Tools.min(k, mink)); 1392 } 1393 1394 /* Send a pointer to the read list to each LoadThread */ 1395 for(LoadThread lt : loaders){ 1396 boolean b=true; 1397 while(b){ 1398 try { 1399 lt.queue.put(reads2); 1400 b=false; 1401 } catch (InterruptedException e) { 1402 //TODO: This will hang due to still-running threads. 1403 throw new RuntimeException(e); 1404 } 1405 } 1406 } 1407 } 1408 1409 /* Dispose of the old list and fetch a new one */ 1410 cris.returnList(ln); 1411 ln=cris.nextList(); 1412 reads=(ln!=null ? ln.list : null); 1413 } 1414 /* Cleanup */ 1415 cris.returnList(ln); 1416 errorState|=ReadWrite.closeStream(cris); 1417 refNum++; 1418 } 1419 } 1420 1421 /* If there are literal sequences to use as references */ 1422 if(literal!=null){ 1423 ArrayList<Read> list=new ArrayList<Read>(literal.length); 1424 if(verbose){System.err.println("Adding literals "+Arrays.toString(literal));} 1425 1426 /* Assign a unique ID number to each literal sequence */ 1427 for(int i=0; i<literal.length; i++){ 1428 final Integer id=scaffoldNames.size(); 1429 final Read r=new Read(literal[i].getBytes(), null, id); 1430 if(countRef){refScafCounts[refNum]++;} 1431 scaffoldNames.add(id.toString()); 1432 scaffoldLengths.add(r.length()); 1433 r.obj=id; 1434 list.add(r); 1435 } 1436 1437 if(REPLICATE_AMBIGUOUS){ 1438 list=Tools.replicateAmbiguous(list, Tools.min(k, mink)); 1439 } 1440 1441 /* Send a pointer to the read list to each LoadThread */ 1442 for(LoadThread lt : loaders){ 1443 boolean b=true; 1444 while(b){ 1445 try { 1446 lt.queue.put(list); 1447 b=false; 1448 } catch (InterruptedException e) { 1449 //TODO: This will hang due to still-running threads. 1450 throw new RuntimeException(e); 1451 } 1452 } 1453 } 1454 } 1455 1456 /* Signal loaders to terminate */ 1457 for(LoadThread lt : loaders){ 1458 boolean b=true; 1459 while(b){ 1460 try { 1461 lt.queue.put(POISON); 1462 b=false; 1463 } catch (InterruptedException e) { 1464 //TODO: This will hang due to still-running threads. 1465 throw new RuntimeException(e); 1466 } 1467 } 1468 } 1469 1470 /* Wait for loaders to die, and gather statistics */ 1471 boolean success=true; 1472 for(LoadThread lt : loaders){ 1473 while(lt.getState()!=Thread.State.TERMINATED){ 1474 try { 1475 lt.join(); 1476 } catch (InterruptedException e) { 1477 // TODO Auto-generated catch block 1478 e.printStackTrace(); 1479 } 1480 } 1481 added+=lt.added; 1482 refKmers+=lt.refKmersT; 1483 refBases+=lt.refBasesT; 1484 refReads+=lt.refReadsT; 1485 success&=lt.success; 1486 } 1487 if(!success){KillSwitch.kill("Failed loading ref kmers; aborting.");} 1488 1489 //Correct statistics for number of threads, since each thread processes all reference data 1490 refKmers/=WAYS; 1491 refBases/=WAYS; 1492 refReads/=WAYS; 1493 1494 scaffoldReadCounts=new AtomicLongArray(scaffoldNames.size()); 1495 scaffoldBaseCounts=new AtomicLongArray(scaffoldNames.size()); 1496 1497 t.stop(); 1498 if(DISPLAY_PROGRESS){ 1499 outstream.println("Added "+added+" kmers; time: \t"+t); 1500 Shared.printMemory(); 1501 outstream.println(); 1502 } 1503 1504 if(verbose){ 1505 TextStreamWriter tsw=new TextStreamWriter("stdout", false, false, false, FileFormat.TEXT); 1506 tsw.start(); 1507 1508 if(kfilter){ 1509 tsw.println("kfilter tables:"); 1510 for(AbstractKmerTable x : filterMaps){x.dumpKmersAsText(tsw, k, 1, Integer.MAX_VALUE);} 1511 } 1512 if(ktrimN){ 1513 tsw.println("ktrimN tables:"); 1514 for(AbstractKmerTable x : maskMaps){x.dumpKmersAsText(tsw, k, 1, Integer.MAX_VALUE);} 1515 } 1516 if(ktrimRight){ 1517 tsw.println("ktrimRight tables:"); 1518 for(AbstractKmerTable x : trimRightMaps){x.dumpKmersAsText(tsw, k, 1, Integer.MAX_VALUE);} 1519 } 1520 if(ktrimLeft){ 1521 tsw.println("ktrimLeft tables:"); 1522 for(AbstractKmerTable x : trimLeftMaps){x.dumpKmersAsText(tsw, k, 1, Integer.MAX_VALUE);} 1523 } 1524 1525 tsw.poisonAndWait(); 1526 } 1527 1528 return added; 1529 } 1530 1531 /** 1532 * Match reads against reference kmers, using multiple ProcessThread. 1533 * @param t 1534 */ spawnProcessThreads(Timer t)1535 private void spawnProcessThreads(Timer t){ 1536 t.start(); 1537 1538 /* Create read input stream */ 1539 final ConcurrentReadInputStream cris; 1540 final boolean paired; 1541 { 1542 FileFormat ff1=FileFormat.testInput(in1, FileFormat.FASTQ, null, true, true); 1543 FileFormat ff2=FileFormat.testInput(in2, FileFormat.FASTQ, null, true, true); 1544 cris=ConcurrentReadInputStream.getReadInputStream(maxReads, ff1.samOrBam(), ff1, ff2, qfin1, qfin2); 1545 cris.setSampleRate(samplerate, sampleseed); 1546 cris.start(); //4567 1547 paired=cris.paired(); 1548 if(!ff1.samOrBam()){outstream.println("Input is being processed as "+(paired ? "paired" : "unpaired"));} 1549 } 1550 1551 /* Create read output streams */ 1552 final ConcurrentReadOutputStream ros, rosb, ross; 1553 if(out1!=null){ 1554 final int buff=(!ordered ? 12 : Tools.max(32, 2*Shared.threads())); 1555 FileFormat ff1=FileFormat.testOutput(out1, FileFormat.FASTQ, null, true, overwrite, append, ordered); 1556 FileFormat ff2=FileFormat.testOutput(out2, FileFormat.FASTQ, null, true, overwrite, append, ordered); 1557 ros=ConcurrentReadOutputStream.getStream(ff1, ff2, null, null, buff, null, true); 1558 ros.start(); 1559 }else{ros=null;} 1560 if(outb1!=null){ 1561 final int buff=(!ordered ? 12 : Tools.max(32, 2*Shared.threads())); 1562 FileFormat ff1=FileFormat.testOutput(outb1, FileFormat.FASTQ, null, true, overwrite, append, ordered); 1563 FileFormat ff2=FileFormat.testOutput(outb2, FileFormat.FASTQ, null, true, overwrite, append, ordered); 1564 rosb=ConcurrentReadOutputStream.getStream(ff1, ff2, null, null, buff, null, true); 1565 rosb.start(); 1566 }else{rosb=null;} 1567 if(outsingle!=null){ 1568 final int buff=(!ordered ? 12 : Tools.max(32, 2*Shared.threads())); 1569 FileFormat ff=FileFormat.testOutput(outsingle, FileFormat.FASTQ, null, true, overwrite, append, ordered); 1570 ross=ConcurrentReadOutputStream.getStream(ff, null, null, null, buff, null, true); 1571 ross.start(); 1572 }else{ross=null;} 1573 if(ros!=null || rosb!=null || ross!=null){ 1574 t.stop(); 1575 outstream.println("Started output streams:\t"+t); 1576 t.start(); 1577 } 1578 1579 /* Optionally skip the first reads, since initial reads may have lower quality */ 1580 if(skipreads>0){ 1581 long skipped=0; 1582 1583 ListNum<Read> ln=cris.nextList(); 1584 ArrayList<Read> reads=(ln!=null ? ln.list : null); 1585 1586 while(skipped<skipreads && reads!=null && reads.size()>0){ 1587 skipped+=reads.size(); 1588 1589 if(rosb!=null){rosb.add(new ArrayList<Read>(1), ln.id);} 1590 if(ros!=null){ros.add(new ArrayList<Read>(1), ln.id);} 1591 if(ross!=null){ross.add(new ArrayList<Read>(1), ln.id);} 1592 1593 cris.returnList(ln); 1594 ln=cris.nextList(); 1595 reads=(ln!=null ? ln.list : null); 1596 } 1597 cris.returnList(ln); 1598 if(reads==null || reads.isEmpty()){ 1599 ReadWrite.closeStreams(cris, ros, rosb, ross); 1600 System.err.println("Skipped all of the reads."); 1601 System.exit(0); 1602 } 1603 } 1604 1605 /* Create ProcessThreads */ 1606 ArrayList<ProcessThread> alpt=new ArrayList<ProcessThread>(THREADS); 1607 for(int i=0; i<THREADS; i++){alpt.add(new ProcessThread(cris, ros, rosb, ross, ALLOW_LOCAL_ARRAYS));} 1608 for(ProcessThread pt : alpt){pt.start();} 1609 1610 /* Wait for threads to die, and gather statistics */ 1611 for(ProcessThread pt : alpt){ 1612 while(pt.getState()!=Thread.State.TERMINATED){ 1613 try { 1614 pt.join(); 1615 } catch (InterruptedException e) { 1616 // TODO Auto-generated catch block 1617 e.printStackTrace(); 1618 } 1619 } 1620 readsIn+=pt.readsInT; 1621 basesIn+=pt.basesInT; 1622 readsOut+=pt.readsOutT; 1623 basesOut+=pt.basesOutT; 1624 readsKFiltered+=pt.readsKFilteredT; 1625 basesKFiltered+=pt.basesKFilteredT; 1626 readsQTrimmed+=pt.readsQTrimmedT; 1627 basesQTrimmed+=pt.basesQTrimmedT; 1628 readsFTrimmed+=pt.readsFTrimmedT; 1629 basesFTrimmed+=pt.basesFTrimmedT; 1630 readsKTrimmed+=pt.readsKTrimmedT; 1631 basesKTrimmed+=pt.basesKTrimmedT; 1632 readsTrimmedByOverlap+=pt.readsTrimmedByOverlapT; 1633 basesTrimmedByOverlap+=pt.basesTrimmedByOverlapT; 1634 badGcReads+=pt.badGcReadsT; 1635 badGcBases+=pt.badGcBasesT; 1636 readsQFiltered+=pt.readsQFilteredT; 1637 basesQFiltered+=pt.basesQFilteredT; 1638 readsEFiltered+=pt.readsEFilteredT; 1639 basesEFiltered+=pt.basesEFilteredT; 1640 1641 if(hitCounts!=null){ 1642 for(int i=0; i<hitCounts.length; i++){hitCounts[i]+=pt.hitCountsT[i];} 1643 pt.hitCountsT=null; 1644 } 1645 if(pt.scaffoldReadCountsT!=null && scaffoldReadCounts!=null){ 1646 for(int i=0; i<pt.scaffoldReadCountsT.length; i++){scaffoldReadCounts.addAndGet(i, pt.scaffoldReadCountsT[i]);} 1647 pt.scaffoldReadCountsT=null; 1648 } 1649 if(pt.scaffoldBaseCountsT!=null && scaffoldBaseCounts!=null){ 1650 for(int i=0; i<pt.scaffoldBaseCountsT.length; i++){scaffoldBaseCounts.addAndGet(i, pt.scaffoldBaseCountsT[i]);} 1651 pt.scaffoldBaseCountsT=null; 1652 } 1653 } 1654 1655 /* Shut down I/O streams; capture error status */ 1656 errorState|=ReadWrite.closeStreams(cris, ros, rosb, ross); 1657 errorState|=ReadStats.writeAll(); 1658 1659 t.stop(); 1660 if(showSpeed){ 1661 outstream.println("Processing time: \t\t"+t); 1662 } 1663 } 1664 1665 /*--------------------------------------------------------------*/ 1666 /*---------------- Inner Classes ----------------*/ 1667 /*--------------------------------------------------------------*/ 1668 1669 1670 /** 1671 * Loads kmers into a table. Each thread handles all kmers X such that X%WAYS==tnum. 1672 */ 1673 private class LoadThread extends Thread{ 1674 LoadThread(final int tnum_, final AbstractKmerTable map_)1675 public LoadThread(final int tnum_, final AbstractKmerTable map_){ 1676 tnum=tnum_; 1677 map=map_; 1678 } 1679 1680 /** 1681 * Get the next list of reads (or scaffolds) from the queue. 1682 * @return List of reads 1683 */ fetch()1684 private ArrayList<Read> fetch(){ 1685 ArrayList<Read> list=null; 1686 while(list==null){ 1687 try { 1688 list=queue.take(); 1689 } catch (InterruptedException e) { 1690 // TODO Auto-generated catch block 1691 e.printStackTrace(); 1692 } 1693 } 1694 return list; 1695 } 1696 1697 @Override run()1698 public void run(){ 1699 ArrayList<Read> reads=fetch(); 1700 while(reads!=POISON){ 1701 for(Read r1 : reads){ 1702 assert(r1.pairnum()==0); 1703 final Read r2=r1.mate; 1704 1705 final int rblen=(r1==null ? 0 : r1.length()); 1706 final int rblen2=r1.mateLength(); 1707 1708 added+=addToMap(r1, rblen>20000000 ? k : rblen>5000000 ? 11 : rblen>500000 ? 2 : 0); 1709 if(r2!=null){ 1710 added+=addToMap(r2, rblen2>20000000 ? k : rblen2>5000000 ? 11 : rblen2>500000 ? 2 : 0); 1711 } 1712 } 1713 reads=fetch(); 1714 } 1715 1716 if(map.canRebalance() && map.size()>2L*map.arrayLength()){ 1717 map.rebalance(); 1718 } 1719 success=true; 1720 } 1721 1722 /** 1723 * @param r The current read to process 1724 * @param skip Number of bases to skip between kmers 1725 * @return Number of kmers stored 1726 */ addToMap(Read r, int skip)1727 private long addToMap(Read r, int skip){ 1728 skip=Tools.max(minSkip, Tools.min(maxSkip, skip)); 1729 final byte[] bases=r.bases; 1730 final int shift=2*k; 1731 final int shift2=shift-2; 1732 final long mask=(shift>63 ? -1L : ~((-1L)<<shift)); 1733 final long kmask=lengthMasks[k]; 1734 long kmer=0; 1735 long rkmer=0; 1736 long added=0; 1737 int len=0; 1738 1739 if(bases!=null){ 1740 refReadsT++; 1741 refBasesT+=bases.length; 1742 } 1743 if(bases==null || bases.length<k){return 0;} 1744 1745 final int id=(Integer)r.obj; 1746 1747 if(skip>1){ //Process while skipping some kmers 1748 for(int i=0; i<bases.length; i++){ 1749 byte b=bases[i]; 1750 long x=AminoAcid.baseToNumber[b]; 1751 long x2=AminoAcid.baseToComplementNumber[b]; 1752 kmer=((kmer<<2)|x)&mask; 1753 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; 1754 if(x<0){len=0; rkmer=0;}else{len++;} 1755 if(verbose){System.err.println("Scanning1 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} 1756 if(len>=k){ 1757 refKmersT++; 1758 if(len%skip==0){ 1759 final long extraBase=(i>=bases.length-1 ? -1 : AminoAcid.baseToNumber[bases[i+1]]); 1760 added+=addToMap(kmer, rkmer, k, extraBase, id, kmask, hammingDistance, editDistance); 1761 if(useShortKmers){ 1762 if(i==k2){added+=addToMapRightShift(kmer, rkmer, id);} 1763 if(i==bases.length-1){added+=addToMapLeftShift(kmer, rkmer, extraBase, id);} 1764 } 1765 } 1766 } 1767 } 1768 }else{ //Process all kmers 1769 for(int i=0; i<bases.length; i++){ 1770 byte b=bases[i]; 1771 long x=AminoAcid.baseToNumber[b]; 1772 long x2=AminoAcid.baseToComplementNumber[b]; 1773 kmer=((kmer<<2)|x)&mask; 1774 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; 1775 if(x<0){len=0; rkmer=0;}else{len++;} 1776 if(verbose){System.err.println("Scanning2 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} 1777 if(len>=k){ 1778 refKmersT++; 1779 final long extraBase=(i>=bases.length-1 ? -1 : AminoAcid.baseToNumber[bases[i+1]]); 1780 final long atm=addToMap(kmer, rkmer, k, extraBase, id, kmask, hammingDistance, editDistance); 1781 added+=atm; 1782 // assert(false) : atm+", "+map.contains(toValue(kmer, rkmer, kmask)); 1783 if(useShortKmers){ 1784 if(i==k2){added+=addToMapRightShift(kmer, rkmer, id);} 1785 if(i==bases.length-1){added+=addToMapLeftShift(kmer, rkmer, extraBase, id);} 1786 } 1787 } 1788 } 1789 } 1790 return added; 1791 } 1792 1793 1794 /** 1795 * Adds short kmers on the left end of the read. 1796 * @param kmer Forward kmer 1797 * @param rkmer Reverse kmer 1798 * @param extraBase Base added to end in case of deletions 1799 * @param id Scaffold number 1800 * @return Number of kmers stored 1801 */ addToMapLeftShift(long kmer, long rkmer, final long extraBase, final int id)1802 private long addToMapLeftShift(long kmer, long rkmer, final long extraBase, final int id){ 1803 if(verbose){System.err.println("addToMapLeftShift");} 1804 long added=0; 1805 for(int i=k-1; i>=mink; i--){ 1806 kmer=kmer&rightMasks[i]; 1807 rkmer=rkmer>>>2; 1808 long x=addToMap(kmer, rkmer, i, extraBase, id, lengthMasks[i], hammingDistance2, editDistance2); 1809 added+=x; 1810 if(verbose){ 1811 if((toValue(kmer, rkmer, lengthMasks[i]))%WAYS==tnum){ 1812 System.err.println("added="+x+"; i="+i+"; tnum="+tnum+"; Added left-shift kmer "+AminoAcid.kmerToString(kmer&~lengthMasks[i], i)+"; value="+(toValue(kmer, rkmer, lengthMasks[i]))+"; kmer="+kmer+"; rkmer="+rkmer+"; kmask="+lengthMasks[i]+"; rightMasks[i+1]="+rightMasks[i+1]); 1813 System.err.println("i="+i+"; tnum="+tnum+"; Looking for left-shift kmer "+AminoAcid.kmerToString(kmer&~lengthMasks[i], i)); 1814 final long value=toValue(kmer, rkmer, lengthMasks[i]); 1815 if(map.contains(value)){System.err.println("Found "+value);} 1816 } 1817 } 1818 } 1819 return added; 1820 } 1821 1822 1823 /** 1824 * Adds short kmers on the right end of the read. 1825 * @param kmer Forward kmer 1826 * @param rkmer Reverse kmer 1827 * @param id Scaffold number 1828 * @return Number of kmers stored 1829 */ addToMapRightShift(long kmer, long rkmer, final int id)1830 private long addToMapRightShift(long kmer, long rkmer, final int id){ 1831 if(verbose){System.err.println("addToMapRightShift");} 1832 long added=0; 1833 for(int i=k-1; i>=mink; i--){ 1834 long extraBase=kmer&3L; 1835 kmer=kmer>>>2; 1836 rkmer=rkmer&rightMasks[i]; 1837 // assert(Long.numberOfLeadingZeros(kmer)>=2*(32-i)) : Long.numberOfLeadingZeros(kmer)+", "+i+", "+kmer+", "+kMasks[i]; 1838 // assert(Long.numberOfLeadingZeros(rkmer)>=2*(32-i)) : Long.numberOfLeadingZeros(rkmer)+", "+i+", "+rkmer+", "+kMasks[i]; 1839 long x=addToMap(kmer, rkmer, i, extraBase, id, lengthMasks[i], hammingDistance2, editDistance2); 1840 added+=x; 1841 if(verbose){ 1842 if((toValue(kmer, rkmer, lengthMasks[i]))%WAYS==tnum){ 1843 System.err.println("added="+x+"; i="+i+"; tnum="+tnum+"; Added right-shift kmer "+AminoAcid.kmerToString(kmer&~lengthMasks[i], i)+"; value="+(toValue(kmer, rkmer, lengthMasks[i]))+"; kmer="+kmer+"; rkmer="+rkmer+"; kmask="+lengthMasks[i]+"; rightMasks[i+1]="+rightMasks[i+1]); 1844 System.err.println("i="+i+"; tnum="+tnum+"; Looking for right-shift kmer "+AminoAcid.kmerToString(kmer&~lengthMasks[i], i)); 1845 final long value=toValue(kmer, rkmer, lengthMasks[i]); 1846 if(map.contains(value)){System.err.println("Found "+value);} 1847 } 1848 } 1849 } 1850 return added; 1851 } 1852 1853 1854 /** 1855 * Adds this kmer to the table, including any mutations implied by editDistance or hammingDistance. 1856 * @param kmer Forward kmer 1857 * @param rkmer Reverse kmer 1858 * @param len Kmer length 1859 * @param extraBase Base added to end in case of deletions 1860 * @param id Scaffold number 1861 * @param kmask0 1862 * @return Number of kmers stored 1863 */ addToMap(final long kmer, final long rkmer, final int len, final long extraBase, final int id, final long kmask0, final int hdist, final int edist)1864 private long addToMap(final long kmer, final long rkmer, final int len, final long extraBase, final int id, final long kmask0, final int hdist, final int edist){ 1865 1866 assert(kmask0==lengthMasks[len]) : kmask0+", "+len+", "+lengthMasks[len]+", "+Long.numberOfTrailingZeros(kmask0)+", "+Long.numberOfTrailingZeros(lengthMasks[len]); 1867 1868 if(verbose){System.err.println("addToMap_A; len="+len+"; kMasks[len]="+lengthMasks[len]);} 1869 assert((kmer&kmask0)==0); 1870 final long added; 1871 if(hdist==0){ 1872 final long key=toValue(kmer, rkmer, kmask0); 1873 if(speed>0 && ((key/WAYS)&15)<speed){return 0;} 1874 if(key%WAYS!=tnum){return 0;} 1875 if(verbose){System.err.println("addToMap_B: "+AminoAcid.kmerToString(kmer&~lengthMasks[len], len)+" = "+key);} 1876 added=map.setIfNotPresent(key, id); 1877 }else if(edist>0){ 1878 // long extraBase=(i>=bases.length-1 ? -1 : AminoAcid.baseToNumber[bases[i+1]]); 1879 added=mutate(kmer, rkmer, len, id, edist, extraBase); 1880 }else{ 1881 added=mutate(kmer, rkmer, len, id, hdist, -1); 1882 } 1883 if(verbose){System.err.println("addToMap added "+added+" keys.");} 1884 return added; 1885 } 1886 1887 /** 1888 * Mutate and store this kmer through 'dist' recursions. 1889 * @param kmer Forward kmer 1890 * @param rkmer Reverse kmer 1891 * @param id Scaffold number 1892 * @param dist Number of mutations 1893 * @param extraBase Base added to end in case of deletions 1894 * @return Number of kmers stored 1895 */ mutate(final long kmer, final long rkmer, final int len, final int id, final int dist, final long extraBase)1896 private long mutate(final long kmer, final long rkmer, final int len, final int id, final int dist, final long extraBase){ 1897 long added=0; 1898 1899 final long key=toValue(kmer, rkmer, lengthMasks[len]); 1900 1901 if(verbose){System.err.println("mutate_A; len="+len+"; kmer="+kmer+"; rkmer="+rkmer+"; kMasks[len]="+lengthMasks[len]);} 1902 if(key%WAYS==tnum){ 1903 if(verbose){System.err.println("mutate_B: "+AminoAcid.kmerToString(kmer&~lengthMasks[len], len)+" = "+key);} 1904 int x=map.setIfNotPresent(key, id); 1905 if(verbose){System.err.println("mutate_B added "+x+" keys.");} 1906 added+=x; 1907 assert(map.contains(key)); 1908 } 1909 1910 if(dist>0){ 1911 final int dist2=dist-1; 1912 1913 //Sub 1914 for(int j=0; j<4; j++){ 1915 for(int i=0; i<len; i++){ 1916 final long temp=(kmer&clearMasks[i])|setMasks[j][i]; 1917 if(temp!=kmer){ 1918 long rtemp=AminoAcid.reverseComplementBinaryFast(temp, len); 1919 added+=mutate(temp, rtemp, len, id, dist2, extraBase); 1920 } 1921 } 1922 } 1923 1924 if(editDistance>0){ 1925 //Del 1926 if(extraBase>=0 && extraBase<=3){ 1927 for(int i=1; i<len; i++){ 1928 final long temp=(kmer&leftMasks[i])|((kmer<<2)&rightMasks[i])|extraBase; 1929 if(temp!=kmer){ 1930 long rtemp=AminoAcid.reverseComplementBinaryFast(temp, len); 1931 added+=mutate(temp, rtemp, len, id, dist2, -1); 1932 } 1933 } 1934 } 1935 1936 //Ins 1937 final long eb2=kmer&3; 1938 for(int i=1; i<len; i++){ 1939 final long temp0=(kmer&leftMasks[i])|((kmer&rightMasks[i])>>2); 1940 for(int j=0; j<4; j++){ 1941 final long temp=temp0|setMasks[j][i-1]; 1942 if(temp!=kmer){ 1943 long rtemp=AminoAcid.reverseComplementBinaryFast(temp, len); 1944 added+=mutate(temp, rtemp, len, id, dist2, eb2); 1945 } 1946 } 1947 } 1948 } 1949 1950 } 1951 1952 return added; 1953 } 1954 1955 /*--------------------------------------------------------------*/ 1956 1957 /** Number of kmers stored by this thread */ 1958 public long added=0; 1959 /** Number of items encountered by this thread */ 1960 public long refKmersT=0, refReadsT=0, refBasesT=0; 1961 /** Thread number; used to determine which kmers to store */ 1962 public final int tnum; 1963 /** Buffer of input read lists */ 1964 public final ArrayBlockingQueue<ArrayList<Read>> queue=new ArrayBlockingQueue<ArrayList<Read>>(32); 1965 1966 /** Destination for storing kmers */ 1967 private final AbstractKmerTable map; 1968 1969 /** Completed successfully */ 1970 boolean success=false; 1971 1972 } 1973 1974 /*--------------------------------------------------------------*/ 1975 1976 /** 1977 * Matches read kmers against reference kmers, performs binning and/or trimming, and writes output. 1978 */ 1979 private class ProcessThread extends Thread{ 1980 1981 /** 1982 * Constructor 1983 * @param cris_ Read input stream 1984 * @param ros_ Unmatched read output stream (optional) 1985 * @param rosb_ Matched read output stream (optional) 1986 */ ProcessThread(ConcurrentReadInputStream cris_, ConcurrentReadOutputStream ros_, ConcurrentReadOutputStream rosb_, ConcurrentReadOutputStream ross_, boolean localArrays)1987 public ProcessThread(ConcurrentReadInputStream cris_, ConcurrentReadOutputStream ros_, ConcurrentReadOutputStream rosb_, ConcurrentReadOutputStream ross_, boolean localArrays){ 1988 cris=cris_; 1989 ros=ros_; 1990 rosb=rosb_; 1991 ross=ross_; 1992 1993 readstats=(MAKE_QUALITY_HISTOGRAM || MAKE_MATCH_HISTOGRAM || MAKE_BASE_HISTOGRAM || MAKE_QUALITY_ACCURACY || 1994 MAKE_EHIST || MAKE_INDELHIST || MAKE_LHIST || MAKE_GCHIST || MAKE_IDHIST) ? 1995 new ReadStats() : null; 1996 1997 final int alen=(scaffoldNames==null ? 0 : scaffoldNames.size()); 1998 1999 if(findBestMatch){ 2000 countArray=new int[alen]; 2001 idList=new IntList(); 2002 countList=new IntList(); 2003 }else{ 2004 countArray=null; 2005 idList=countList=null; 2006 } 2007 2008 overlapVector=(trimByOverlap ? new int[5] : null); 2009 2010 hitCountsT=(hitCounts==null ? null : new long[hitCounts.length]); 2011 2012 if(localArrays && alen>0 && alen<10000){ 2013 scaffoldReadCountsT=new long[alen]; 2014 scaffoldBaseCountsT=new long[alen]; 2015 }else{ 2016 scaffoldReadCountsT=scaffoldBaseCountsT=null; 2017 } 2018 2019 if(calcEntropy){ 2020 entropyCounts=new short[entropyKmerspace]; 2021 entropyCountCounts=new short[entropyWindow+2]; 2022 entropyCountCounts[0]=(short)entropyWindow; 2023 }else{ 2024 entropyCounts=entropyCountCounts=null; 2025 } 2026 } 2027 2028 @Override run()2029 public void run(){ 2030 ListNum<Read> ln=cris.nextList(); 2031 ArrayList<Read> reads=(ln!=null ? ln.list : null); 2032 ArrayList<Read> bad=(rosb==null ? null : new ArrayList<Read>(Shared.bufferLen())); 2033 ArrayList<Read> single=new ArrayList<Read>(Shared.bufferLen()); 2034 2035 //While there are more reads lists... 2036 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning 2037 2038 int removed=0; 2039 2040 //For each read (or pair) in the list... 2041 for(int i=0; i<reads.size(); i++){ 2042 final Read r1=reads.get(i); 2043 final Read r2=r1.mate; 2044 2045 if(!r1.validated()){r1.validate(true);} 2046 if(r2!=null && !r2.validated()){r2.validate(true);} 2047 2048 if(readstats!=null){ 2049 if(MAKE_QUALITY_HISTOGRAM){readstats.addToQualityHistogram(r1);} 2050 if(MAKE_BASE_HISTOGRAM){readstats.addToBaseHistogram(r1);} 2051 if(MAKE_MATCH_HISTOGRAM){readstats.addToMatchHistogram(r1);} 2052 if(MAKE_QUALITY_ACCURACY){readstats.addToQualityAccuracy(r1);} 2053 2054 if(MAKE_EHIST){readstats.addToErrorHistogram(r1);} 2055 if(MAKE_INDELHIST){readstats.addToIndelHistogram(r1);} 2056 if(MAKE_LHIST){readstats.addToLengthHistogram(r1);} 2057 if(MAKE_GCHIST){readstats.addToGCHistogram(r1);} 2058 if(MAKE_IDHIST){readstats.addToIdentityHistogram(r1);} 2059 } 2060 2061 if(loglog!=null){loglog.hash(r1);} 2062 2063 final int initialLength1=r1.length(); 2064 final int initialLength2=r1.mateLength(); 2065 2066 final int minlen1=(int)Tools.max(initialLength1*minLenFraction, minReadLength); 2067 final int minlen2=(int)Tools.max(initialLength2*minLenFraction, minReadLength); 2068 2069 if(verbose){System.err.println("Considering read "+r1.id+" "+new String(r1.bases));} 2070 2071 readsInT++; 2072 basesInT+=initialLength1; 2073 if(r2!=null){ 2074 readsInT++; 2075 basesInT+=initialLength2; 2076 } 2077 2078 if(chastityFilter){ 2079 if(r1!=null && r1.failsChastity()){ 2080 r1.setDiscarded(true); 2081 if(r2!=null){r2.setDiscarded(true);} 2082 } 2083 } 2084 2085 if(removeBadBarcodes){ 2086 if(r1!=null && !r1.discarded() && r1.failsBarcode(barcodes, failIfNoBarcode)){ 2087 if(failBadBarcodes){KillSwitch.kill("Invalid barcode detected: "+r1.id+"\nThis can be disabled with the flag barcodefilter=f");} 2088 r1.setDiscarded(true); 2089 if(r2!=null){r2.setDiscarded(true);} 2090 } 2091 } 2092 2093 if(recalibrateQuality){ 2094 if(r1!=null && !r1.discarded()){ 2095 CalcTrueQuality.recalibrate(r1); 2096 } 2097 if(r2!=null && !r2.discarded()){ 2098 CalcTrueQuality.recalibrate(r2); 2099 } 2100 } 2101 2102 if(filterGC && (initialLength1>0 || initialLength2>0)){ 2103 float gc1=(initialLength1>0 ? r1.gc() : -1); 2104 float gc2=(initialLength2>0 ? r2.gc() : gc1); 2105 if(gc1==-1){gc1=gc2;} 2106 if(usePairGC){ 2107 final float gc; 2108 if(r2==null){ 2109 gc=gc1; 2110 }else{ 2111 gc=(gc1*initialLength1+gc2*initialLength2)/(initialLength1+initialLength2); 2112 } 2113 gc1=gc2=gc; 2114 } 2115 if(r1!=null && !r1.discarded() && (gc1<minGC || gc1>maxGC)){ 2116 r1.setDiscarded(true); 2117 badGcBasesT+=initialLength1; 2118 badGcReadsT++; 2119 } 2120 if(r2!=null && !r2.discarded() && (gc2<minGC || gc2>maxGC)){ 2121 r2.setDiscarded(true); 2122 badGcBasesT+=initialLength2; 2123 badGcReadsT++; 2124 } 2125 } 2126 2127 if(forceTrimLeft>0 || forceTrimRight>0 || forceTrimRight2>0 || forceTrimModulo>0){ 2128 if(r1!=null && !r1.discarded()){ 2129 final int len=r1.length(); 2130 final int a=forceTrimLeft>0 ? forceTrimLeft : 0; 2131 final int b0=forceTrimModulo>0 ? len-1-len%forceTrimModulo : len; 2132 final int b1=forceTrimRight>0 ? forceTrimRight : len; 2133 final int b2=forceTrimRight2>0 ? len-1-forceTrimRight2 : len; 2134 final int b=Tools.min(b0, b1, b2); 2135 final int x=TrimRead.trimToPosition(r1, a, b, 1); 2136 basesFTrimmedT+=x; 2137 readsFTrimmedT+=(x>0 ? 1 : 0); 2138 if(r1.length()<minlen1){r1.setDiscarded(true);} 2139 } 2140 if(r2!=null && !r2.discarded()){ 2141 final int len=r2.length(); 2142 final int a=forceTrimLeft>0 ? forceTrimLeft : 0; 2143 final int b0=forceTrimModulo>0 ? len-1-len%forceTrimModulo : len; 2144 final int b1=forceTrimRight>0 ? forceTrimRight : len; 2145 final int b2=forceTrimRight2>0 ? len-1-forceTrimRight2 : len; 2146 final int b=Tools.min(b0, b1, b2); 2147 final int x=TrimRead.trimToPosition(r2, a, b, 1); 2148 basesFTrimmedT+=x; 2149 readsFTrimmedT+=(x>0 ? 1 : 0); 2150 if(r2.length()<minlen2){r2.setDiscarded(true);} 2151 } 2152 } 2153 2154 boolean remove; 2155 if(removePairsIfEitherBad){remove=r1.discarded() || (r2!=null && r2.discarded());} 2156 else{remove=r1.discarded() && (r2==null || r2.discarded());} 2157 2158 if(remove){ 2159 if(r1!=null){ 2160 basesQFilteredT+=r1.length(); 2161 readsQFilteredT++; 2162 } 2163 if(r2!=null){ 2164 basesQFilteredT+=r2.length(); 2165 readsQFilteredT++; 2166 } 2167 if(bad!=null){bad.add(r1);} 2168 }else{ 2169 2170 if(ecc && r1!=null && r2!=null){BBMerge.findOverlapStrict(r1, r2, true);} 2171 2172 //Process kmers 2173 if(kfilter && storedKmersFilter>0){ 2174 //Do kmer matching 2175 2176 if(!findBestMatch){ 2177 final int a=(kbig<=k ? countSetKmers(r1, filterMaps) : countSetKmersBig(r1, filterMaps)); 2178 final int b=(kbig<=k ? countSetKmers(r2, filterMaps) : countSetKmersBig(r2, filterMaps)); 2179 2180 if(r1!=null && a>maxBadKmers){r1.setDiscarded(true);} 2181 if(r2!=null && b>maxBadKmers){r2.setDiscarded(true);} 2182 }else{ 2183 final int a=findBestMatch(r1, filterMaps); 2184 final int b=findBestMatch(r2, filterMaps); 2185 2186 if(r1!=null && a>0){r1.setDiscarded(true);} 2187 if(r2!=null && b>0){r2.setDiscarded(true);} 2188 } 2189 2190 if((removePairsIfEitherBad && (r1.discarded() || (r2!=null && r2.discarded()))) || 2191 (r1.discarded() && (r2==null || r2.discarded()))){ 2192 remove=true; 2193 if(r1!=null){ 2194 readsKFilteredT++; 2195 basesKFilteredT+=r1.length(); 2196 } 2197 if(r2!=null){ 2198 readsKFilteredT++; 2199 basesKFilteredT+=r2.length(); 2200 } 2201 if(bad!=null){bad.add(r1);} 2202 } 2203 2204 } 2205 2206 if(ktrimN && storedKmersMask>0){ 2207 remove=remove || ktrim0(r1, r2, bad, maskMaps, NMODE, minlen1, minlen2); 2208 } 2209 2210 if(ktrimRight && storedKmersRight>0){ 2211 remove=remove || ktrim0(r1, r2, bad, trimRightMaps, RIGHTMODE, minlen1, minlen2); 2212 2213 if(trimPairsEvenly && xsum>0 && r2!=null && r1.length()!=r2.length()){ 2214 int x; 2215 if(r1.length()>r2.length()){ 2216 x=TrimRead.trimToPosition(r1, 0, r2.length()-1, 1); 2217 }else{ 2218 x=TrimRead.trimToPosition(r2, 0, r1.length()-1, 1); 2219 } 2220 if(rktsum<2){readsKTrimmedT++;} 2221 basesKTrimmedT+=x; 2222 2223 assert(r1.length()==r2.length()) : r1.length()+", "+r2.length(); 2224 } 2225 } 2226 2227 if(!remove && trimByOverlap && r2!=null && expectedErrors(r1, r2)<meeFilter){ 2228 2229 if(aprob==null || aprob.length<r1.length()){aprob=new float[r1.length()];} 2230 if(bprob==null || bprob.length<r2.length()){bprob=new float[r2.length()];} 2231 2232 //Do overlap trimming 2233 r2.reverseComplement(); 2234 // int bestInsert=BBMergeOverlapper.mateByOverlap(r1, r2, aprob, bprob, overlapVector, minOverlap0, minOverlap, 2235 // overlapMargin, overlapMaxMismatches0, overlapMaxMismatches, overlapMinq); 2236 int bestInsert=BBMergeOverlapper.mateByOverlapRatio(r1, r2, aprob, bprob, overlapVector, minOverlap0, minOverlap, 2237 minInsert0, minInsert, maxRatio, 0.12f, ratioMargin, ratioOffset, 0.95f, 0.95f, useQualityForOverlap); 2238 2239 if(bestInsert<minInsert){bestInsert=-1;} 2240 boolean ambig=(overlapVector[4]==1); 2241 final int bestBad=overlapVector[2]; 2242 2243 if(bestInsert>0 && !ambig && r1.quality!=null && r2.quality!=null && useQualityForOverlap){ 2244 if(efilterRatio>0 && bestInsert>0 && !ambig){ 2245 float bestExpected=BBMergeOverlapper.expectedMismatches(r1, r2, bestInsert); 2246 if((bestExpected+efilterOffset)*efilterRatio<bestBad){ambig=true;} 2247 } 2248 if(pfilterRatio>0 && bestInsert>0 && !ambig){ 2249 float probability=BBMergeOverlapper.probability(r1, r2, bestInsert); 2250 if(probability<pfilterRatio){bestInsert=-1;} 2251 } 2252 } 2253 2254 r2.reverseComplement(); 2255 2256 if(bestInsert>0 && !ambig){ 2257 if(bestInsert<r1.length()){ 2258 if(verbose){System.err.println("Overlap right trimming r1 to "+0+", "+(bestInsert-1));} 2259 int x=TrimRead.trimToPosition(r1, 0, bestInsert-1, 1); 2260 if(verbose){System.err.println("Trimmed "+x+" bases: "+new String(r1.bases));} 2261 readsTrimmedByOverlapT++; 2262 basesTrimmedByOverlapT+=x; 2263 } 2264 if(bestInsert<r2.length()){ 2265 if(verbose){System.err.println("Overlap right trimming r2 to "+0+", "+(bestInsert-1));} 2266 int x=TrimRead.trimToPosition(r2, 0, bestInsert-1, 1); 2267 if(verbose){System.err.println("Trimmed "+x+" bases: "+new String(r2.bases));} 2268 readsTrimmedByOverlapT++; 2269 basesTrimmedByOverlapT+=x; 2270 } 2271 } 2272 } 2273 2274 if(ktrimLeft && storedKmersLeft>0){ 2275 remove=remove || ktrim0(r1, r2, bad, trimLeftMaps, LEFTMODE, minlen1, minlen2); 2276 } 2277 2278 } 2279 2280 if(!remove){ 2281 //Do quality trimming 2282 2283 int rlen1=0, rlen2=0; 2284 if(r1!=null){ 2285 if(qtrimLeft || qtrimRight){ 2286 int x=TrimRead.trimFast(r1, qtrimLeft, qtrimRight, trimq, trimE, 1); 2287 basesQTrimmedT+=x; 2288 readsQTrimmedT+=(x>0 ? 1 : 0); 2289 } 2290 rlen1=r1.length(); 2291 if(rlen1<minlen1 || rlen1>maxReadLength){r1.setDiscarded(true);} 2292 } 2293 if(r2!=null){ 2294 if(qtrimLeft || qtrimRight){ 2295 int x=TrimRead.trimFast(r2, qtrimLeft, qtrimRight, trimq, trimE, 1); 2296 basesQTrimmedT+=x; 2297 readsQTrimmedT+=(x>0 ? 1 : 0); 2298 } 2299 rlen2=r2.length(); 2300 if(rlen2<minlen2 || rlen2>maxReadLength){r2.setDiscarded(true);} 2301 } 2302 2303 //Discard reads if too short 2304 if((removePairsIfEitherBad && (r1.discarded() || (r2!=null && r2.discarded()))) || 2305 (r1.discarded() && (r2==null || r2.discarded()))){ 2306 basesQTrimmedT+=r1.pairLength(); 2307 remove=true; 2308 if(addTrimmedToBad && bad!=null){bad.add(r1);} 2309 } 2310 2311 } 2312 2313 if(!remove){ 2314 //Do quality filtering 2315 2316 //Determine whether to discard the reads based on average quality 2317 if(minAvgQuality>0){ 2318 if(r1!=null && r1.quality!=null && r1.avgQuality(false, minAvgQualityBases)<minAvgQuality){r1.setDiscarded(true);} 2319 if(r2!=null && r2.quality!=null && r2.avgQuality(false, minAvgQualityBases)<minAvgQuality){r2.setDiscarded(true);} 2320 } 2321 //Determine whether to discard the reads based on the presence of Ns 2322 if(maxNs>=0){ 2323 if(r1!=null && r1.countUndefined()>maxNs){r1.setDiscarded(true);} 2324 if(r2!=null && r2.countUndefined()>maxNs){r2.setDiscarded(true);} 2325 } 2326 //Determine whether to discard the reads based on a lack of useful kmers 2327 if(minConsecutiveBases>0){ 2328 if(r1!=null && !r1.discarded() && !r1.hasMinConsecutiveBases(minConsecutiveBases)){r1.setDiscarded(true);} 2329 if(r2!=null && !r2.discarded() && !r2.hasMinConsecutiveBases(minConsecutiveBases)){r2.setDiscarded(true);} 2330 } 2331 //Determine whether to discard the reads based on minimum base frequency 2332 if(minBaseFrequency>0){ 2333 if(r1!=null && r1.minBaseCount()<minBaseFrequency*r1.length()){r1.setDiscarded(true);} 2334 if(r2!=null && r2.minBaseCount()<minBaseFrequency*r2.length()){r2.setDiscarded(true);} 2335 } 2336 2337 //Discard reads if too short 2338 if((removePairsIfEitherBad && (r1.discarded() || (r2!=null && r2.discarded()))) || 2339 (r1.discarded() && (r2==null || r2.discarded()))){ 2340 basesQFilteredT+=r1.pairLength(); 2341 readsQFilteredT+=r1.pairCount(); 2342 remove=true; 2343 if(addTrimmedToBad && bad!=null){bad.add(r1);} 2344 } 2345 } 2346 2347 if(!remove && calcEntropy){ 2348 //Test entropy 2349 2350 if(r1!=null && !r1.discarded() && entropyCutoff>averageEntropy(r1.bases, entropyK, entropyWindow, 2351 entropyCounts, entropyCountCounts, entropyKmerspace, verifyEntropy)){r1.setDiscarded(true);} 2352 if(r2!=null && !r2.discarded() && entropyCutoff>averageEntropy(r2.bases, entropyK, entropyWindow, 2353 entropyCounts, entropyCountCounts, entropyKmerspace, verifyEntropy)){r2.setDiscarded(true);} 2354 2355 if((removePairsIfEitherBad && (r1.discarded() || (r2!=null && r2.discarded()))) || 2356 (r1.discarded() && (r2==null || r2.discarded()))){ 2357 basesEFilteredT+=r1.pairLength(); 2358 readsEFilteredT+=r1.pairCount(); 2359 remove=true; 2360 if(bad!=null){bad.add(r1);} 2361 } 2362 } 2363 2364 if(ross!=null){ 2365 if(!r1.discarded() && (r2==null || r2.discarded())){ 2366 Read clone=r1.clone(); 2367 clone.mate=null; 2368 single.add(clone); 2369 }else if(r2!=null && r1.discarded() && !r2.discarded()){ 2370 Read clone=r2.clone(); 2371 clone.mate=null; 2372 single.add(clone); 2373 } 2374 } 2375 2376 if(remove){ 2377 //Evict read 2378 removed++; 2379 if(r2!=null){removed++;} 2380 reads.set(i, null); 2381 // System.err.println("X1\t"+removed); 2382 }else{ 2383 //Track statistics 2384 2385 if(r1!=null){ 2386 readsOutT++; 2387 basesOutT+=r1.length(); 2388 } 2389 if(r2!=null){ 2390 readsOutT++; 2391 basesOutT+=r2.length(); 2392 } 2393 // System.err.println("X2\t"+readsOutT); 2394 } 2395 } 2396 2397 //Send matched list to matched output stream 2398 if(rosb!=null){ 2399 rosb.add(bad, ln.id); 2400 bad.clear(); 2401 } 2402 2403 //Send unmatched list to unmatched output stream 2404 if(ros!=null){ 2405 ros.add((removed>0 ? Tools.condenseNew(reads) : reads), ln.id); //Creates a new list if old one became empty, to prevent shutting down the cris. 2406 } 2407 2408 if(ross!=null){ 2409 ross.add(single, ln.id); 2410 single.clear(); 2411 } 2412 2413 //Fetch a new read list 2414 cris.returnList(ln); 2415 ln=cris.nextList(); 2416 reads=(ln!=null ? ln.list : null); 2417 } 2418 cris.returnList(ln); 2419 } 2420 2421 /*--------------------------------------------------------------*/ 2422 /*---------------- Helper Methods ----------------*/ 2423 /*--------------------------------------------------------------*/ 2424 ktrim0(final Read r1, final Read r2, ArrayList<Read> bad, AbstractKmerTable[] maps, int mode, int minlen1, int minlen2)2425 private boolean ktrim0(final Read r1, final Read r2, ArrayList<Read> bad, AbstractKmerTable[] maps, int mode, int minlen1, int minlen2){ 2426 boolean remove=false; 2427 int rlen1=0, rlen2=0; 2428 xsum=0; 2429 rktsum=0; 2430 if(r1!=null){ 2431 final int x=(mode==NMODE ? kmask(r1, maps) : ktrim(r1, maps, mode)); 2432 xsum+=x; 2433 rktsum+=(x>0 ? 1 : 0); 2434 rlen1=r1.length(); 2435 if(rlen1<minlen1){r1.setDiscarded(true);} 2436 } 2437 if(r2!=null){ 2438 final int x=(mode==NMODE ? kmask(r2, maps) : ktrim(r2, maps, mode)); 2439 xsum+=x; 2440 rktsum+=(x>0 ? 1 : 0); 2441 rlen2=r2.length(); 2442 if(rlen2<minlen2){r2.setDiscarded(true);} 2443 } 2444 2445 if((removePairsIfEitherBad && (r1.discarded() || (r2!=null && r2.discarded()))) || 2446 (r1.discarded() && (r2==null || r2.discarded()))){ 2447 if(!ktrimN){ 2448 xsum+=(rlen1+rlen2); 2449 rktsum=r1.pairCount(); 2450 } 2451 remove=true; 2452 if(addTrimmedToBad && bad!=null){bad.add(r1);} 2453 } 2454 basesKTrimmedT+=xsum; 2455 readsKTrimmedT+=rktsum; 2456 2457 return remove; 2458 } 2459 2460 /** 2461 * Transforms a kmer into all canonical values for a given Hamming distance. 2462 * Returns the related id stored in the tables. 2463 * @param kmer Forward kmer 2464 * @param rkmer Reverse kmer 2465 * @param lengthMask Bitmask with single '1' set to left of kmer 2466 * @param qPos Position of kmer in query 2467 * @param len kmer length 2468 * @param qHDist Hamming distance 2469 * @param sets Kmer hash tables 2470 * @return Value stored in table, or -1 2471 */ getValue(final long kmer, final long rkmer, final long lengthMask, final int qPos, final int len, final int qHDist, final AbstractKmerTable[] sets)2472 private final int getValue(final long kmer, final long rkmer, final long lengthMask, final int qPos, final int len, final int qHDist, final AbstractKmerTable[] sets){ 2473 int id=getValue(kmer, rkmer, lengthMask, qPos, sets); 2474 if(id<1 && qHDist>0){ 2475 final int qHDist2=qHDist-1; 2476 2477 //Sub 2478 for(int j=0; j<4 && id<1; j++){ 2479 for(int i=0; i<len && id<1; i++){ 2480 final long temp=(kmer&clearMasks[i])|setMasks[j][i]; 2481 if(temp!=kmer){ 2482 long rtemp=AminoAcid.reverseComplementBinaryFast(temp, len); 2483 id=getValue(temp, rtemp, lengthMask, qPos, len, qHDist2, sets); 2484 } 2485 } 2486 } 2487 } 2488 return id; 2489 } 2490 2491 /** 2492 * Transforms a kmer into a canonical value stored in the table and search. 2493 * @param kmer Forward kmer 2494 * @param rkmer Reverse kmer 2495 * @param lengthMask Bitmask with single '1' set to left of kmer 2496 * @param qPos Position of kmer in query 2497 * @param sets Kmer hash tables 2498 * @return Value stored in table 2499 */ getValue(final long kmer, final long rkmer, final long lengthMask, final int qPos, final AbstractKmerTable[] sets)2500 private final int getValue(final long kmer, final long rkmer, final long lengthMask, final int qPos, final AbstractKmerTable[] sets){ 2501 assert(lengthMask==0 || (kmer<lengthMask && rkmer<lengthMask)) : lengthMask+", "+kmer+", "+rkmer; 2502 if(qSkip>1 && (qPos%qSkip!=0)){return -1;} 2503 2504 final long max=(rcomp ? Tools.max(kmer, rkmer) : kmer); 2505 final long key=(max&middleMask)|lengthMask; 2506 if(noAccel || ((key/WAYS)&15)>=speed){ 2507 if(verbose){System.err.println("Testing key "+key);} 2508 AbstractKmerTable set=sets[(int)(key%WAYS)]; 2509 final int id=set.getValue(key); 2510 return id; 2511 } 2512 return -1; 2513 } 2514 2515 /*--------------------------------------------------------------*/ 2516 2517 /** 2518 * Counts the number of kmer hits for a read. 2519 * @param r Read to process 2520 * @param sets Kmer tables 2521 * @return Number of hits 2522 */ countSetKmers(final Read r, final AbstractKmerTable sets[])2523 private final int countSetKmers(final Read r, final AbstractKmerTable sets[]){ 2524 if(r==null || r.length()<k){return 0;} 2525 if((skipR1 && r.pairnum()==0) || (skipR2 && r.pairnum()==1)){return 0;} 2526 final byte[] bases=r.bases; 2527 final int minlen=k-1; 2528 final int minlen2=(maskMiddle ? k/2 : k); 2529 final int shift=2*k; 2530 final int shift2=shift-2; 2531 final long mask=(shift>63 ? -1L : ~((-1L)<<shift)); 2532 final long kmask=lengthMasks[k]; 2533 long kmer=0; 2534 long rkmer=0; 2535 int found=0; 2536 int len=0; 2537 2538 final int start=(restrictRight<1 ? 0 : Tools.max(0, bases.length-restrictRight)); 2539 final int stop=(restrictLeft<1 ? bases.length : Tools.min(bases.length, restrictLeft)); 2540 2541 /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */ 2542 for(int i=start; i<stop; i++){ 2543 byte b=bases[i]; 2544 long x=Dedupe.baseToNumber[b]; 2545 long x2=Dedupe.baseToComplementNumber[b]; 2546 kmer=((kmer<<2)|x)&mask; 2547 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; 2548 if(b=='N' && forbidNs){len=0; rkmer=0;}else{len++;} 2549 if(verbose){System.err.println("Scanning6 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} 2550 if(len>=minlen2 && i>=minlen){ 2551 final int id=getValue(kmer, rkmer, kmask, i, k, qHammingDistance, sets); 2552 if(verbose){System.err.println("Testing kmer "+kmer+"; id="+id);} 2553 if(id>0){ 2554 if(verbose){System.err.println("Found = "+(found+1)+"/"+maxBadKmers);} 2555 if(found==maxBadKmers){ 2556 if(scaffoldReadCountsT!=null){ 2557 scaffoldReadCountsT[id]++; 2558 scaffoldBaseCountsT[id]+=bases.length; 2559 }else{ 2560 scaffoldReadCounts.addAndGet(id, 1); 2561 scaffoldBaseCounts.addAndGet(id, bases.length); 2562 } 2563 if(hitCounts==null){ 2564 return (found=found+1); 2565 }//Early exit, but prevents generation of histogram that goes over maxBadKmers+1. 2566 } 2567 found++; 2568 } 2569 } 2570 } 2571 2572 if(hitCountsT!=null){hitCountsT[Tools.min(found, HITCOUNT_LEN)]++;} 2573 return found; 2574 } 2575 2576 /** 2577 * Returns the id of the sequence with the most kmer matches to this read, or -1 if none are over maxBadKmers. 2578 * @param r Read to process 2579 * @param sets Kmer tables 2580 * @return id of best match 2581 */ findBestMatch(final Read r, final AbstractKmerTable sets[])2582 private final int findBestMatch(final Read r, final AbstractKmerTable sets[]){ 2583 idList.size=0; 2584 if(r==null || r.length()<k){return -1;} 2585 if((skipR1 && r.pairnum()==0) || (skipR2 && r.pairnum()==1)){return -1;} 2586 final byte[] bases=r.bases; 2587 final int minlen=k-1; 2588 final int minlen2=(maskMiddle ? k/2 : k); 2589 final int shift=2*k; 2590 final int shift2=shift-2; 2591 final long mask=(shift>63 ? -1L : ~((-1L)<<shift)); 2592 final long kmask=lengthMasks[k]; 2593 long kmer=0; 2594 long rkmer=0; 2595 int len=0; 2596 int found=0; 2597 2598 final int start=(restrictRight<1 ? 0 : Tools.max(0, bases.length-restrictRight)); 2599 final int stop=(restrictLeft<1 ? bases.length : Tools.min(bases.length, restrictLeft)); 2600 2601 /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */ 2602 for(int i=start; i<stop; i++){ 2603 byte b=bases[i]; 2604 long x=Dedupe.baseToNumber[b]; 2605 long x2=Dedupe.baseToComplementNumber[b]; 2606 kmer=((kmer<<2)|x)&mask; 2607 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; 2608 if(b=='N' && forbidNs){len=0; rkmer=0;}else{len++;} 2609 if(verbose){System.err.println("Scanning6 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} 2610 if(len>=minlen2 && i>=minlen){ 2611 final int id=getValue(kmer, rkmer, kmask, i, k, qHammingDistance, sets); 2612 if(verbose){System.err.println("Testing kmer "+kmer+"; id="+id);} 2613 if(id>0){ 2614 countArray[id]++; 2615 if(countArray[id]==1){idList.add(id);} 2616 found++; 2617 if(verbose){System.err.println("Found = "+found+"/"+maxBadKmers);} 2618 } 2619 } 2620 } 2621 2622 final int id, max; 2623 if(found>maxBadKmers){ 2624 max=condenseLoose(countArray, idList, countList); 2625 int id0=-1; 2626 for(int i=0; i<countList.size; i++){ 2627 if(countList.get(i)==max){ 2628 id0=idList.get(i); break; 2629 } 2630 } 2631 if(rename){rename(r, idList, countList);} 2632 id=id0; 2633 }else{ 2634 max=0; 2635 id=-1; 2636 } 2637 2638 if(found>maxBadKmers){ 2639 if(scaffoldReadCountsT!=null){ 2640 scaffoldReadCountsT[id]++; 2641 scaffoldBaseCountsT[id]+=bases.length; 2642 }else{ 2643 scaffoldReadCounts.addAndGet(id, 1); 2644 scaffoldBaseCounts.addAndGet(id, bases.length); 2645 } 2646 } 2647 2648 if(hitCountsT!=null){hitCountsT[Tools.min(found, HITCOUNT_LEN)]++;} 2649 return id; 2650 } 2651 2652 /** Estimates kmer hit counts for kmers longer than k using consecutive matches 2653 * @param r 2654 * @param sets 2655 * @return Number of sets of consecutive hits of exactly length kbig 2656 */ countSetKmersBig(final Read r, final AbstractKmerTable sets[])2657 private final int countSetKmersBig(final Read r, final AbstractKmerTable sets[]){ 2658 if(r==null || r.length()<kbig){return 0;} 2659 if((skipR1 && r.pairnum()==0) || (skipR2 && r.pairnum()==1)){return 0;} 2660 assert(kbig>k); 2661 final int sub=kbig-k-1; 2662 assert(sub>=0) : kbig+", "+sub; 2663 final byte[] bases=r.bases; 2664 final int minlen=k-1; 2665 final int minlen2=(maskMiddle ? k/2 : k); 2666 final int shift=2*k; 2667 final int shift2=shift-2; 2668 final long mask=(shift>63 ? -1L : ~((-1L)<<shift)); 2669 final long kmask=lengthMasks[k]; 2670 long kmer=0; 2671 long rkmer=0; 2672 int found=0; 2673 int len=0; 2674 2675 int bkStart=-1; 2676 int bkStop=-1; 2677 int id=-1, lastId=-1; 2678 2679 final int start=(restrictRight<1 ? 0 : Tools.max(0, bases.length-restrictRight)); 2680 final int stop=(restrictLeft<1 ? bases.length : Tools.min(bases.length, restrictLeft)); 2681 2682 /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */ 2683 for(int i=start; i<stop; i++){ 2684 byte b=bases[i]; 2685 long x=Dedupe.baseToNumber[b]; 2686 long x2=Dedupe.baseToComplementNumber[b]; 2687 kmer=((kmer<<2)|x)&mask; 2688 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; 2689 if(b=='N' && forbidNs){len=0; rkmer=0;}else{len++;} 2690 if(verbose){System.err.println("Scanning7 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} 2691 if(len>=minlen2 && i>=minlen){ 2692 id=getValue(kmer, rkmer, kmask, i, k, qHammingDistance, sets); 2693 if(verbose){System.err.println("Testing kmer "+kmer+"; id="+id);} 2694 if(id>0){ 2695 lastId=id; 2696 if(bkStart==-1){bkStart=i;} 2697 bkStop=i; 2698 }else{ 2699 if(bkStart>-1){ 2700 int dif=bkStop-bkStart-sub; 2701 bkStop=bkStart=-1; 2702 if(dif>0){ 2703 int old=found; 2704 found+=dif; 2705 if(found>maxBadKmers && old<=maxBadKmers){ 2706 if(scaffoldReadCountsT!=null){ 2707 scaffoldReadCountsT[lastId]++; 2708 scaffoldBaseCountsT[lastId]+=bases.length; 2709 }else{ 2710 scaffoldReadCounts.addAndGet(lastId, 1); 2711 scaffoldBaseCounts.addAndGet(lastId, bases.length); 2712 } 2713 if(hitCounts==null){ 2714 return found; 2715 }//Early exit, but prevents generation of histogram that goes over maxBadKmers+1. 2716 } 2717 } 2718 } 2719 } 2720 } 2721 } 2722 2723 // This catches the case where valid kmers extend to the end of the read 2724 if(bkStart>-1){ 2725 int dif=bkStop-bkStart-sub; 2726 bkStop=bkStart=-1; 2727 if(dif>0){ 2728 int old=found; 2729 found+=dif; 2730 if(found>maxBadKmers && old<=maxBadKmers){ 2731 if(scaffoldReadCountsT!=null){ 2732 scaffoldReadCountsT[lastId]++; 2733 scaffoldBaseCountsT[lastId]+=bases.length; 2734 }else{ 2735 scaffoldReadCounts.addAndGet(lastId, 1); 2736 scaffoldBaseCounts.addAndGet(lastId, bases.length); 2737 } 2738 } 2739 } 2740 } 2741 2742 if(hitCountsT!=null){hitCountsT[Tools.min(found, HITCOUNT_LEN)]++;} 2743 return found; 2744 } 2745 2746 /** 2747 * Trim a read to remove matching kmers and everything to their left or right. 2748 * @param r Read to process 2749 * @param sets Kmer tables 2750 * @return Number of bases trimmed 2751 */ ktrim(final Read r, final AbstractKmerTable[] sets, int mode)2752 private final int ktrim(final Read r, final AbstractKmerTable[] sets, int mode){ 2753 assert(mode==RIGHTMODE || mode==LEFTMODE); 2754 if(r==null || r.length()<Tools.max(1, (useShortKmers ? Tools.min(k, mink) : k))){return 0;} 2755 if((skipR1 && r.pairnum()==0) || (skipR2 && r.pairnum()==1)){return 0;} 2756 if(verbose){System.err.println("KTrimming read "+r.id);} 2757 final byte[] bases=r.bases, quals=r.quality; 2758 final int minlen=k-1; 2759 final int minlen2=(maskMiddle ? k/2 : k); 2760 final int shift=2*k; 2761 final int shift2=shift-2; 2762 final long mask=(shift>63 ? -1L : ~((-1L)<<shift)); 2763 final long kmask=lengthMasks[k]; 2764 long kmer=0; 2765 long rkmer=0; 2766 int found=0; 2767 int len=0; 2768 int id0=-1; //ID of first kmer found. 2769 2770 int minLoc=999999999, minLocExclusive=999999999; 2771 int maxLoc=-1, maxLocExclusive=-1; 2772 final int initialLength=r.length(); 2773 2774 final int start=(restrictRight<1 ? 0 : Tools.max(0, bases.length-restrictRight)); 2775 final int stop=(restrictLeft<1 ? bases.length : Tools.min(bases.length, restrictLeft)); 2776 2777 //Scan for normal kmers 2778 for(int i=start; i<stop; i++){ 2779 byte b=bases[i]; 2780 long x=Dedupe.baseToNumber[b]; 2781 long x2=Dedupe.baseToComplementNumber[b]; 2782 kmer=((kmer<<2)|x)&mask; 2783 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; 2784 if(b=='N' && forbidNs){len=0; rkmer=0;}else{len++;} 2785 if(verbose){System.err.println("Scanning3 i="+i+", kmer="+kmer+", rkmer="+rkmer+", len="+len+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} 2786 if(len>=minlen2 && i>=minlen){ 2787 final int id=getValue(kmer, rkmer, kmask, i, k, qHammingDistance, sets); 2788 if(verbose){System.err.println("Testing kmer "+kmer+"; id="+id);} 2789 if(id>0){ 2790 if(id0<0){id0=id;} 2791 minLoc=Tools.min(minLoc, i-k+1); 2792 assert(minLoc>=0); 2793 maxLoc=i; 2794 found++; 2795 } 2796 } 2797 } 2798 2799 if(minLoc!=minLocExclusive){minLocExclusive=minLoc+k;} 2800 if(maxLoc!=maxLocExclusive){maxLocExclusive=maxLoc-k;} 2801 2802 //If nothing was found, scan for short kmers. Only used for trimming. 2803 if(useShortKmers && found==0){ 2804 assert(!maskMiddle && middleMask==-1) : maskMiddle+", "+middleMask+", k="+", mink="+mink; 2805 2806 //Look for short kmers on left side 2807 if(mode==LEFTMODE || mode==NMODE){ 2808 kmer=0; 2809 rkmer=0; 2810 len=0; 2811 final int lim=Tools.min(k, stop); 2812 for(int i=start; i<lim; i++){ 2813 byte b=bases[i]; 2814 long x=Dedupe.baseToNumber[b]; 2815 long x2=Dedupe.baseToComplementNumber[b]; 2816 kmer=((kmer<<2)|x)&mask; 2817 rkmer=rkmer|(x2<<(2*len)); 2818 len++; 2819 if(verbose){System.err.println("Scanning4 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} 2820 if(len>=mink){ 2821 2822 if(verbose){ 2823 System.err.println("Looking for left kmer "+AminoAcid.kmerToString(kmer, len)); 2824 System.err.println("Looking for left rkmer "+AminoAcid.kmerToString(rkmer, len)); 2825 } 2826 final int id=getValue(kmer, rkmer, lengthMasks[len], i, len, qHammingDistance2, sets); 2827 if(id>0){ 2828 if(id0<0){id0=id;} 2829 if(verbose){System.err.println("Found "+kmer);} 2830 minLoc=0; 2831 minLocExclusive=Tools.min(minLocExclusive, i+1); 2832 maxLoc=Tools.max(maxLoc, i); 2833 maxLocExclusive=Tools.max(maxLocExclusive, 0); 2834 found++; 2835 } 2836 } 2837 } 2838 } 2839 2840 //Look for short kmers on right side 2841 if(mode==RIGHTMODE || mode==NMODE){ 2842 kmer=0; 2843 rkmer=0; 2844 len=0; 2845 final int lim=Tools.max(-1, stop-k); 2846 for(int i=stop-1; i>lim; i--){ 2847 byte b=bases[i]; 2848 long x=Dedupe.baseToNumber[b]; 2849 long x2=Dedupe.baseToComplementNumber[b]; 2850 kmer=kmer|(x<<(2*len)); 2851 rkmer=((rkmer<<2)|x2)&mask; 2852 len++; 2853 if(verbose){System.err.println("Scanning5 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} 2854 if(len>=mink){ 2855 2856 final int id=getValue(kmer, rkmer, lengthMasks[len], i, len, qHammingDistance2, sets); 2857 if(verbose){System.err.println("Looking for right kmer "+AminoAcid.kmerToString(kmer&~lengthMasks[len], len)+"; id="+id+"; kmask="+lengthMasks[len]);} 2858 if(id>0){ 2859 if(id0<0){id0=id;} 2860 if(verbose){System.err.println("Found "+kmer);} 2861 minLoc=i; 2862 minLocExclusive=Tools.min(minLocExclusive, bases.length); 2863 maxLoc=bases.length-1; 2864 maxLocExclusive=Tools.max(maxLocExclusive, i-1); 2865 found++; 2866 } 2867 } 2868 } 2869 } 2870 } 2871 2872 2873 if(verbose){System.err.println("found="+found+", minLoc="+minLoc+", maxLoc="+maxLoc+", minLocExclusive="+minLocExclusive+", maxLocExclusive="+maxLocExclusive);} 2874 2875 if(found==0){return 0;} 2876 assert(found>0) : "Overflow in 'found' variable."; 2877 2878 {//Increment counter for the scaffold whose kmer was first detected 2879 if(scaffoldReadCountsT!=null){ 2880 scaffoldReadCountsT[id0]++; 2881 scaffoldBaseCountsT[id0]+=bases.length; 2882 }else{ 2883 scaffoldReadCounts.addAndGet(id0, 1); 2884 scaffoldBaseCounts.addAndGet(id0, bases.length); 2885 } 2886 } 2887 2888 if(trimPad!=0){ 2889 maxLoc=Tools.mid(0, maxLoc+trimPad, bases.length); 2890 minLoc=Tools.mid(0, minLoc-trimPad, bases.length); 2891 maxLocExclusive=Tools.mid(0, maxLocExclusive+trimPad, bases.length); 2892 minLocExclusive=Tools.mid(0, minLocExclusive-trimPad, bases.length); 2893 } 2894 2895 //Old version. No longer needed. 2896 // if(mode==NMODE){ //Replace kmer hit zone with the trim symbol 2897 // Arrays.fill(bases, minLoc, maxLoc+1, trimSymbol); 2898 // if(quals!=null){Arrays.fill(quals, minLoc, maxLoc+1, (byte)0);} 2899 // return maxLoc-minLoc+1; 2900 // } 2901 2902 if(mode==LEFTMODE){ //Trim from the read start to the rightmost kmer base 2903 if(verbose){System.err.println("Left trimming to "+(ktrimExclusive ? maxLocExclusive+1 : maxLoc+1)+", "+0);} 2904 int x=TrimRead.trimToPosition(r, ktrimExclusive ? maxLocExclusive+1 : maxLoc+1, bases.length-1, 1); 2905 if(verbose){System.err.println("Trimmed "+x+" bases: "+new String(r.bases));} 2906 return x; 2907 }else{ //Trim from the leftmost kmer base to the read stop 2908 assert(mode==RIGHTMODE); 2909 if(verbose){System.err.println("Right trimming to "+0+", "+(ktrimExclusive ? minLocExclusive-1 : minLoc-1));} 2910 int x=TrimRead.trimToPosition(r, 0, ktrimExclusive ? minLocExclusive-1 : minLoc-1, 1); 2911 if(verbose){System.err.println("Trimmed "+x+" bases: "+new String(r.bases));} 2912 return x; 2913 } 2914 } 2915 2916 2917 /** 2918 * Mask a read to cover matching kmers. 2919 * @param r Read to process 2920 * @param sets Kmer tables 2921 * @return Number of bases masked 2922 */ kmask(final Read r, final AbstractKmerTable[] sets)2923 private final int kmask(final Read r, final AbstractKmerTable[] sets){ 2924 assert(ktrimN); 2925 if(r==null || r.length()<Tools.max(1, (useShortKmers ? Tools.min(k, mink) : k))){return 0;} 2926 if((skipR1 && r.pairnum()==0) || (skipR2 && r.pairnum()==1)){return 0;} 2927 if(verbose){System.err.println("KMasking read "+r.id);} 2928 final byte[] bases=r.bases, quals=r.quality; 2929 if(bases==null || bases.length<k){return 0;} 2930 final int minlen=k-1; 2931 final int minlen2=(maskMiddle ? k/2 : k); 2932 final int shift=2*k; 2933 final int shift2=shift-2; 2934 final long mask=(shift>63 ? -1L : ~((-1L)<<shift)); 2935 final long kmask=lengthMasks[k]; 2936 long kmer=0; 2937 long rkmer=0; 2938 int found=0; 2939 int len=0; 2940 int id0=-1; //ID of first kmer found. 2941 2942 BitSet bs=new BitSet(bases.length+trimPad+1); 2943 2944 final int minus=k-1-trimPad; 2945 final int plus=trimPad+1; 2946 2947 final int start=(restrictRight<1 ? 0 : Tools.max(0, bases.length-restrictRight)); 2948 final int stop=(restrictLeft<1 ? bases.length : Tools.min(bases.length, restrictLeft)); 2949 2950 //Scan for normal kmers 2951 for(int i=start; i<stop; i++){ 2952 byte b=bases[i]; 2953 long x=Dedupe.baseToNumber[b]; 2954 long x2=Dedupe.baseToComplementNumber[b]; 2955 kmer=((kmer<<2)|x)&mask; 2956 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; 2957 if(b=='N' && forbidNs){len=0; rkmer=0;}else{len++;} 2958 if(verbose){System.err.println("Scanning3 i="+i+", kmer="+kmer+", rkmer="+rkmer+", len="+len+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} 2959 if(len>=minlen2 && i>=minlen){ 2960 final int id=getValue(kmer, rkmer, kmask, i, k, qHammingDistance, sets); 2961 if(verbose){System.err.println("Testing kmer "+kmer+"; id="+id);} 2962 if(id>0){ 2963 if(id0<0){id0=id;} 2964 if(verbose){ 2965 System.err.println("a: Found "+kmer); 2966 System.err.println("Setting "+Tools.max(0, i-minus)+", "+(i+plus)); 2967 System.err.println("i="+i+", minus="+minus+", plus="+plus+", trimpad="+trimPad+", k="+k); 2968 } 2969 bs.set(Tools.max(0, i-minus), i+plus); 2970 found++; 2971 } 2972 } 2973 } 2974 2975 //If nothing was found, scan for short kmers. 2976 if(useShortKmers){ 2977 assert(!maskMiddle && middleMask==-1) : maskMiddle+", "+middleMask+", k="+", mink="+mink; 2978 2979 //Look for short kmers on left side 2980 { 2981 kmer=0; 2982 rkmer=0; 2983 len=0; 2984 final int lim=Tools.min(k, stop); 2985 for(int i=start; i<lim; i++){ 2986 byte b=bases[i]; 2987 long x=Dedupe.baseToNumber[b]; 2988 long x2=Dedupe.baseToComplementNumber[b]; 2989 kmer=((kmer<<2)|x)&mask; 2990 rkmer=rkmer|(x2<<(2*len)); 2991 len++; 2992 if(verbose){System.err.println("Scanning4 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} 2993 if(len>=mink){ 2994 2995 if(verbose){ 2996 System.err.println("Looking for left kmer "+AminoAcid.kmerToString(kmer, len)); 2997 System.err.println("Looking for left rkmer "+AminoAcid.kmerToString(rkmer, len)); 2998 } 2999 final int id=getValue(kmer, rkmer, lengthMasks[len], i, len, qHammingDistance2, sets); 3000 if(id>0){ 3001 if(id0<0){id0=id;} 3002 if(verbose){ 3003 System.err.println("b: Found "+kmer); 3004 System.err.println("Setting "+0+", "+(i+plus)); 3005 } 3006 bs.set(0, i+plus); 3007 found++; 3008 } 3009 } 3010 } 3011 } 3012 3013 //Look for short kmers on right side 3014 { 3015 kmer=0; 3016 rkmer=0; 3017 len=0; 3018 final int lim=Tools.max(-1, stop-k); 3019 for(int i=stop-1; i>lim; i--){ 3020 byte b=bases[i]; 3021 long x=Dedupe.baseToNumber[b]; 3022 long x2=Dedupe.baseToComplementNumber[b]; 3023 kmer=kmer|(x<<(2*len)); 3024 rkmer=((rkmer<<2)|x2)&mask; 3025 len++; 3026 if(verbose){System.err.println("Scanning5 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} 3027 if(len>=mink){ 3028 3029 final int id=getValue(kmer, rkmer, lengthMasks[len], i, len, qHammingDistance2, sets); 3030 if(verbose){System.err.println("Looking for right kmer "+AminoAcid.kmerToString(kmer&~lengthMasks[len], len)+"; id="+id+"; kmask="+lengthMasks[len]);} 3031 if(id>0){ 3032 if(id0<0){id0=id;} 3033 if(verbose){ 3034 System.err.println("c: Found "+kmer); 3035 System.err.println("Setting "+Tools.max(0, i-trimPad)+", "+bases.length); 3036 } 3037 bs.set(Tools.max(0, i-trimPad), bases.length); 3038 found++; 3039 } 3040 } 3041 } 3042 } 3043 } 3044 3045 3046 if(verbose){System.err.println("found="+found+", bitset="+bs);} 3047 3048 if(found==0){return 0;} 3049 assert(found>0) : "Overflow in 'found' variable."; 3050 3051 {//Increment counter for the scaffold whose kmer was first detected 3052 if(scaffoldReadCountsT!=null){ 3053 scaffoldReadCountsT[id0]++; 3054 scaffoldBaseCountsT[id0]+=bases.length; 3055 }else{ 3056 scaffoldReadCounts.addAndGet(id0, 1); 3057 scaffoldBaseCounts.addAndGet(id0, bases.length); 3058 } 3059 } 3060 3061 int cardinality=bs.cardinality(); 3062 assert(cardinality>0); 3063 3064 //Replace kmer hit zone with the trim symbol 3065 for(int i=0; i<bases.length; i++){ 3066 if(bs.get(i)){ 3067 if(kmaskLowercase){ 3068 bases[i]=(byte)Tools.toLowerCase(bases[i]); 3069 }else{ 3070 bases[i]=trimSymbol; 3071 if(quals!=null && trimSymbol=='N'){quals[i]=0;} 3072 } 3073 } 3074 } 3075 return cardinality; 3076 } 3077 3078 /** 3079 * @param r 3080 * @param idList 3081 * @param countList 3082 */ rename(Read r, IntList idList, IntList countList)3083 private void rename(Read r, IntList idList, IntList countList) { 3084 if(r==null || idList.size<1){return;} 3085 StringBuilder sb=new StringBuilder(); 3086 if(r.id==null){sb.append(r.numericID);} 3087 else{sb.append(r.id);} 3088 for(int i=0; i<idList.size; i++){ 3089 int id=idList.get(i); 3090 int count=countList.get(i); 3091 sb.append('\t'); 3092 sb.append(scaffoldNames.get(id)); 3093 sb.append('='); 3094 sb.append(count); 3095 } 3096 r.id=sb.toString(); 3097 } 3098 3099 /** 3100 * Pack a list of counts from an array to an IntList. 3101 * @param loose Counter array 3102 * @param packed Unique values 3103 * @param counts Counts of values 3104 * @return 3105 */ condenseLoose(int[] loose, IntList packed, IntList counts)3106 private int condenseLoose(int[] loose, IntList packed, IntList counts){ 3107 counts.size=0; 3108 if(packed.size<1){return 0;} 3109 3110 int max=0; 3111 for(int i=0; i<packed.size; i++){ 3112 final int p=packed.get(i); 3113 final int c=loose[p]; 3114 counts.add(c); 3115 loose[p]=0; 3116 max=Tools.max(max, c); 3117 } 3118 return max; 3119 } 3120 expectedErrors(Read r1, Read r2)3121 private float expectedErrors(Read r1, Read r2){ 3122 float a=(r1==null ? 0 : r1.expectedErrors(false, -1)); 3123 float b=(r2==null ? 0 : r2.expectedErrors(false, -1)); 3124 return Tools.max(a, b); 3125 } 3126 3127 /*--------------------------------------------------------------*/ 3128 /*---------------- Entropy Methods ----------------*/ 3129 /*--------------------------------------------------------------*/ 3130 averageEntropy(final byte[] bases, final int k, final int window, final short[] counts, final short[] countCounts, final int kmerspace, boolean verify)3131 private float averageEntropy(final byte[] bases, final int k, 3132 final int window, final short[] counts, final short[] countCounts, final int kmerspace, boolean verify){ 3133 assert(k>0) : "k must be greater than 0"; 3134 // Arrays.fill(counts, 0); 3135 3136 assert(countCounts[0]==window); 3137 if(verify){ 3138 for(int c : counts){assert(c==0);} 3139 for(int i=1; i<countCounts.length; i++){assert(countCounts[i]==0);} 3140 } 3141 3142 final int mask=(k>15 ? -1 : ~((-1)<<(2*k))); 3143 int current=0; 3144 //int ns=0; 3145 int kmer=0, kmer2=0; 3146 3147 double entropySum=0; 3148 int entropyMeasurements=0; 3149 3150 for(int i=0, i2=-window; i2<bases.length; i++, i2++){ 3151 3152 // System.err.println("\nStart: i="+i+", current="+current+", ns="+ns+"\n"+Arrays.toString(counts)+"\n"+Arrays.toString(countCounts)); 3153 3154 if(i<bases.length){ 3155 byte b=bases[i]; 3156 if(!AminoAcid.isFullyDefined(b)){ 3157 // ns++; 3158 b='A'; 3159 } 3160 final int n=Dedupe.baseToNumber[b]; 3161 kmer=((kmer<<2)|n)&mask; 3162 3163 if(counts[kmer]<1){ 3164 assert(counts[kmer]==0); 3165 current++; 3166 } 3167 countCounts[counts[kmer]]--; 3168 assert(countCounts[counts[kmer]]>=-1): i+", "+current+", "+Tools.cardinality(counts)+"\n"+Arrays.toString(counts)+"\n"+Arrays.toString(countCounts); 3169 counts[kmer]++; 3170 assert(counts[kmer]<=window+1) : Arrays.toString(counts)+"\n"+Arrays.toString(countCounts); 3171 countCounts[counts[kmer]]++; 3172 if(verify){ 3173 assert(current==Tools.cardinality(counts)) : current+", "+Tools.cardinality(counts)+"\n"+Arrays.toString(counts); 3174 assert(Tools.sum(countCounts)>0 && (Tools.sum(countCounts)<=window+1)): current+", "+Tools.cardinality(counts)+"\n"+Arrays.toString(countCounts); 3175 } 3176 3177 // System.err.println("Added "+kmer+"; counts["+kmer+"]="+counts[kmer]); 3178 } 3179 3180 if(i2>=0){ 3181 byte b2=bases[i2]; 3182 if(!AminoAcid.isFullyDefined(b2)){ 3183 // ns--; 3184 b2='A'; 3185 } 3186 final int n2=Dedupe.baseToNumber[b2]; 3187 kmer2=((kmer2<<2)|n2)&mask; 3188 3189 countCounts[counts[kmer2]]--; 3190 assert(countCounts[counts[kmer2]]>=0); 3191 counts[kmer2]--; 3192 countCounts[counts[kmer2]]++; 3193 if(counts[kmer2]<1){ 3194 assert(counts[kmer2]==0) : Arrays.toString(counts); 3195 current--; 3196 } 3197 if(verify){ 3198 assert(current==Tools.cardinality(counts)) : current+", "+Tools.cardinality(counts)+"\n"+Arrays.toString(counts); 3199 assert(Tools.sum(countCounts)>=0 && (Tools.sum(countCounts)<=window)): current+", "+Tools.cardinality(counts)+"\n"+Arrays.toString(countCounts); 3200 } 3201 3202 // System.err.println("Removed "+kmer2+"; count="+counts[kmer2]); 3203 } 3204 3205 if(verify && i2>-1 && i<bases.length){ 3206 assert(Tools.sum(counts)==window); 3207 assert(Tools.sum(countCounts)==window): current+", "+Tools.cardinality(counts)+"\n"+Arrays.toString(countCounts); 3208 } 3209 3210 if(i2>=-1 && i<bases.length){ 3211 float e=calcEntropy(countCounts, window, kmerspace); 3212 entropySum+=e; 3213 entropyMeasurements++; 3214 } 3215 } 3216 3217 // System.err.println(" *** "); 3218 // System.err.println(entropySum+", "+entropyMeasurements+", "+(entropySum/(Tools.max(1, entropyMeasurements)))); 3219 // System.err.println(window+", "+k+", "+kmerspace+", "+counts.length+", "+countCounts.length); 3220 // System.err.println(" *** "); 3221 3222 return (float)(entropySum/(Tools.max(1, entropyMeasurements))); 3223 } 3224 calcEntropy(short[] countCounts, int window, int kmerspace)3225 private float calcEntropy(short[] countCounts, int window, int kmerspace){ 3226 double sum=0; 3227 for(int i=1; i<countCounts.length; i++){ 3228 int cc=countCounts[i]; 3229 double pklogpk=entropy[i]; 3230 sum+=(cc*pklogpk); 3231 } 3232 // System.err.println("sum = "+sum); 3233 // System.err.println("entropy = "+(sum*entropyMult)); 3234 return (float)(sum*entropyMult); 3235 } 3236 3237 /*--------------------------------------------------------------*/ 3238 3239 /** Input read stream */ 3240 private final ConcurrentReadInputStream cris; 3241 /** Output read streams */ 3242 private final ConcurrentReadOutputStream ros, rosb, ross; 3243 3244 private final ReadStats readstats; 3245 private final int[] overlapVector; 3246 private final int[] countArray; 3247 3248 private final IntList idList; 3249 private final IntList countList; 3250 3251 long[] hitCountsT; 3252 long[] scaffoldReadCountsT; 3253 long[] scaffoldBaseCountsT; 3254 3255 final short[] entropyCounts; 3256 final short[] entropyCountCounts; 3257 3258 private float[] aprob, bprob; 3259 3260 private long readsInT=0; 3261 private long basesInT=0; 3262 private long readsOutT=0; 3263 private long basesOutT=0; 3264 3265 private long readsQTrimmedT=0; 3266 private long basesQTrimmedT=0; 3267 private long readsFTrimmedT=0; 3268 private long basesFTrimmedT=0; 3269 private long readsQFilteredT=0; 3270 private long basesQFilteredT=0; 3271 private long readsEFilteredT=0; 3272 private long basesEFilteredT=0; 3273 3274 private long readsKTrimmedT=0; 3275 private long basesKTrimmedT=0; 3276 private long readsKFilteredT=0; 3277 private long basesKFilteredT=0; 3278 3279 private long readsTrimmedByOverlapT=0; 3280 private long basesTrimmedByOverlapT=0; 3281 3282 private long badGcBasesT=0; 3283 private long badGcReadsT=0; 3284 3285 private int xsum=0, rktsum=0; 3286 } 3287 3288 /*--------------------------------------------------------------*/ 3289 /*---------------- Static Methods ----------------*/ 3290 /*--------------------------------------------------------------*/ 3291 3292 /** Current available memory */ freeMemory()3293 private static final long freeMemory(){ 3294 Runtime rt=Runtime.getRuntime(); 3295 return rt.freeMemory(); 3296 } 3297 3298 /** 3299 * Transforms a kmer into a canonical value stored in the table. Expected to be inlined. 3300 * @param kmer Forward kmer 3301 * @param rkmer Reverse kmer 3302 * @param lengthMask Bitmask with single '1' set to left of kmer 3303 * @return Canonical value 3304 */ toValue(long kmer, long rkmer, long lengthMask)3305 private final long toValue(long kmer, long rkmer, long lengthMask){ 3306 assert(lengthMask==0 || (kmer<lengthMask && rkmer<lengthMask)) : lengthMask+", "+kmer+", "+rkmer; 3307 long value=(rcomp ? Tools.max(kmer, rkmer) : kmer); 3308 return (value&middleMask)|lengthMask; 3309 } 3310 3311 /*--------------------------------------------------------------*/ 3312 /*---------------- Fields ----------------*/ 3313 /*--------------------------------------------------------------*/ 3314 3315 /** For calculating kmer cardinality */ 3316 private final CardinalityTracker loglog; 3317 3318 /** Has this class encountered errors while processing? */ 3319 public boolean errorState=false; 3320 3321 /** Fraction of available memory preallocated to arrays */ 3322 private double preallocFraction=1.0; 3323 /** Initial size of data structures */ 3324 private int initialSize=-1; 3325 3326 /** Hold kmers for filtering. A kmer X such that X%WAYS=Y will be stored in keySets[Y] */ 3327 private final AbstractKmerTable[] filterMaps; 3328 /** Hold kmers for masking */ 3329 private final AbstractKmerTable[] maskMaps; 3330 /** Hold kmers for trimming right (3') */ 3331 private final AbstractKmerTable[] trimRightMaps; 3332 /** Hold kmers for trimming left (5') */ 3333 private final AbstractKmerTable[] trimLeftMaps; 3334 3335 /** A scaffold's name is stored at scaffoldNames.get(id). 3336 * scaffoldNames[0] is reserved, so the first id is 1. */ 3337 private final ArrayList<String> scaffoldNames=new ArrayList<String>(); 3338 /** Names of reference files (refNames[0] is valid). */ 3339 private final ArrayList<String> refNames=new ArrayList<String>(); 3340 /** Number of scaffolds per reference. */ 3341 private final int[] refScafCounts; 3342 /** scaffoldCounts[id] stores the number of reads with kmer matches to that scaffold */ 3343 private AtomicLongArray scaffoldReadCounts; 3344 /** scaffoldBaseCounts[id] stores the number of bases with kmer matches to that scaffold */ 3345 private AtomicLongArray scaffoldBaseCounts; 3346 /** Set to false to force threads to share atomic counter arrays. */ 3347 private boolean ALLOW_LOCAL_ARRAYS=true; 3348 /** scaffoldLengths[id] stores the length of that scaffold */ 3349 private IntList scaffoldLengths=new IntList(); 3350 /** hitCounts[x] stores the number of reads with exactly x kmer matches */ 3351 private long[] hitCounts; 3352 /** Array of reference files from which to load kmers */ 3353 private String[] refFilter=null; 3354 /** Array of reference files from which to load kmers */ 3355 private String[] refMask=null; 3356 /** Array of reference files from which to load kmers */ 3357 private String[] refRight=null; 3358 /** Array of reference files from which to load kmers */ 3359 private String[] refLeft=null; 3360 /** Array of literal strings from which to load kmers */ 3361 private String[] literalFilter=null; 3362 /** Array of literal strings from which to load kmers */ 3363 private String[] literalMask=null; 3364 /** Array of literal strings from which to load kmers */ 3365 private String[] literalRight=null; 3366 /** Array of literal strings from which to load kmers */ 3367 private String[] literalLeft=null; 3368 3369 /** Input reads */ 3370 private String in1=null, in2=null; 3371 /** Input qual files */ 3372 private String qfin1=null, qfin2=null; 3373 /** Output reads (unmatched and at least minlen) */ 3374 private String out1=null, out2=null; 3375 /** Output reads (matched or shorter than minlen) */ 3376 private String outb1=null, outb2=null; 3377 /** Output reads whose mate was discarded */ 3378 private String outsingle=null; 3379 /** Statistics output files */ 3380 private String outstats=null, outduk=null, outrqc=null, outrpkm=null, outrefstats=null; 3381 3382 /** Optional file for quality score recalibration */ 3383 private String samFile=null; 3384 3385 /** Dump kmers here. */ 3386 private String dump=null; 3387 3388 /** Maximum input reads (or pairs) to process. Does not apply to references. -1 means unlimited. */ 3389 private long maxReads=-1; 3390 /** Process this fraction of input reads. */ 3391 private float samplerate=1f; 3392 /** Set samplerate seed to this value. */ 3393 private long sampleseed=-1; 3394 3395 /** Output reads in input order. May reduce speed. */ 3396 private final boolean ordered; 3397 /** Attempt to match kmers shorter than normal k on read ends when doing kTrimming. */ 3398 private boolean useShortKmers=false; 3399 /** Make the middle base in a kmer a wildcard to improve sensitivity */ 3400 private boolean maskMiddle=true; 3401 3402 /** Store reference kmers with up to this many substitutions */ 3403 private int hammingDistance=0; 3404 /** Search for query kmers with up to this many substitutions */ 3405 private int qHammingDistance=0; 3406 /** Store reference kmers with up to this many edits (including indels) */ 3407 private int editDistance=0; 3408 /** Store short reference kmers with up to this many substitutions */ 3409 private int hammingDistance2=-1; 3410 /** Search for short query kmers with up to this many substitutions */ 3411 private int qHammingDistance2=-1; 3412 /** Store short reference kmers with up to this many edits (including indels) */ 3413 private int editDistance2=-1; 3414 /** Never skip more than this many consecutive kmers when hashing reference. */ 3415 private int maxSkip=1; 3416 /** Always skip at least this many consecutive kmers when hashing reference. 3417 * 1 means every kmer is used, 2 means every other, etc. */ 3418 private int minSkip=1; 3419 3420 /** Trim this much extra around matched kmers */ 3421 private int trimPad; 3422 3423 /*--------------------------------------------------------------*/ 3424 /*---------------- Entropy Fields ----------------*/ 3425 /*--------------------------------------------------------------*/ 3426 3427 private int entropyK=5; 3428 private int entropyWindow=50; 3429 private float entropyCutoff=-1; 3430 private boolean verifyEntropy=false; 3431 3432 private final boolean calcEntropy; 3433 private final int entropyKmerspace; 3434 private final double entropyMult; 3435 private final double[] entropy; 3436 3437 /*--------------------------------------------------------------*/ 3438 /*---------------- Statistics ----------------*/ 3439 /*--------------------------------------------------------------*/ 3440 3441 long readsIn=0; 3442 long basesIn=0; 3443 long readsOut=0; 3444 long basesOut=0; 3445 3446 long readsQTrimmed=0; 3447 long basesQTrimmed=0; 3448 long readsFTrimmed=0; 3449 long basesFTrimmed=0; 3450 long readsQFiltered=0; 3451 long basesQFiltered=0; 3452 long readsEFiltered=0; 3453 long basesEFiltered=0; 3454 3455 long readsKTrimmed=0; 3456 long basesKTrimmed=0; 3457 long readsKFiltered=0; 3458 long basesKFiltered=0; 3459 3460 long badGcReads; 3461 long badGcBases; 3462 3463 long readsTrimmedByOverlap; 3464 long basesTrimmedByOverlap; 3465 3466 long refReads=0; 3467 long refBases=0; 3468 long refKmers=0; 3469 3470 long storedKmersFilter=0; 3471 long storedKmersMask=0; 3472 long storedKmersRight=0; 3473 long storedKmersLeft=0; 3474 3475 /*--------------------------------------------------------------*/ 3476 /*---------------- Final Primitives ----------------*/ 3477 /*--------------------------------------------------------------*/ 3478 3479 /** Don't look for kmers in read 1 */ 3480 private final boolean skipR1; 3481 /** Don't look for kmers in read 2 */ 3482 private final boolean skipR2; 3483 /** Correct errors via read overlap */ 3484 private final boolean ecc; 3485 3486 /** Look for reverse-complements as well as forward kmers. Default: true */ 3487 private final boolean rcomp; 3488 /** Don't allow a read 'N' to match a reference 'A'. 3489 * Reduces sensitivity when hdist>0 or edist>0. Default: false. */ 3490 private final boolean forbidNs; 3491 /** AND bitmask with 0's at the middle base */ 3492 private final long middleMask; 3493 /** Use HashForest data structure */ 3494 private final boolean useForest; 3495 /** Use KmerTable data structure */ 3496 private final boolean useTable; 3497 /** Use HashArray data structure (default) */ 3498 private final boolean useArray; 3499 3500 /** Normal kmer length */ 3501 final int k; 3502 /** k-1; used in some expressions */ 3503 final int k2; 3504 /** Emulated kmer greater than k */ 3505 final int kbig; 3506 /** Shortest kmer to use for trimming */ 3507 final int mink; 3508 /** A read may contain up to this many kmers before being considered a match. Default: 0 */ 3509 final int maxBadKmers; 3510 3511 /** Recalibrate quality scores using matrices */ 3512 final boolean recalibrateQuality; 3513 /** Quality-trim the left side */ 3514 final boolean qtrimLeft; 3515 /** Quality-trim the right side */ 3516 final boolean qtrimRight; 3517 /** Trim bases at this quality or below. Default: 4 */ 3518 final float trimq; 3519 /** Error rate for trimming (derived from trimq) */ 3520 private final float trimE; 3521 /** Throw away reads below this average quality after trimming. Default: 0 */ 3522 final float minAvgQuality; 3523 /** If positive, calculate average quality from the first X bases only. Default: 0 */ 3524 final int minAvgQualityBases; 3525 /** Throw away reads failing chastity filter (:Y: in read header) */ 3526 final boolean chastityFilter; 3527 /** Crash if a barcode is encountered that contains Ns or is not in the table */ 3528 final boolean failBadBarcodes; 3529 /** Remove reads with Ns in barcodes or that are not in the table */ 3530 final boolean removeBadBarcodes; 3531 /** Fail reads missing a barcode */ 3532 final boolean failIfNoBarcode; 3533 /** A set of valid barcodes; null if unused */ 3534 final HashSet<String> barcodes; 3535 /** Throw away reads containing more than this many Ns. Default: -1 (disabled) */ 3536 final int maxNs; 3537 /** Throw away reads containing without at least this many consecutive called bases. */ 3538 int minConsecutiveBases=0; 3539 /** Throw away reads containing fewer than this fraction of any particular base. */ 3540 final float minBaseFrequency; 3541 /** Throw away reads shorter than this after trimming. Default: 10 */ 3542 final int minReadLength; 3543 /** Throw away reads longer than this after trimming. Default: Integer.MAX_VALUE */ 3544 final int maxReadLength; 3545 /** Toss reads shorter than this fraction of initial length, after trimming */ 3546 final float minLenFraction; 3547 /** Filter reads by whether or not they have matching kmers */ 3548 final boolean kfilter; 3549 /** Trim matching kmers and all bases to the left */ 3550 final boolean ktrimLeft; 3551 /** Trim matching kmers and all bases to the right */ 3552 final boolean ktrimRight; 3553 /** Don't trim, but replace matching kmers with a symbol (default N) */ 3554 final boolean ktrimN; 3555 /** Exclude kmer itself when ktrimming */ 3556 final boolean ktrimExclusive; 3557 /** Replace bases covered by matched kmers with this symbol */ 3558 final byte trimSymbol; 3559 /** Convert masked bases to lowercase */ 3560 final boolean kmaskLowercase; 3561 /** Output over-trimmed reads to outbad (outmatch). If false, they are discarded. */ 3562 final boolean addTrimmedToBad; 3563 /** Find the sequence that shares the most kmer matches when filtering. */ 3564 final boolean findBestMatch; 3565 /** Trim pairs to the same length, when adapter-trimming */ 3566 final boolean trimPairsEvenly; 3567 /** Trim left bases of the read to this position (exclusive, 0-based) */ 3568 final int forceTrimLeft; 3569 /** Trim right bases of the read after this position (exclusive, 0-based) */ 3570 final int forceTrimRight; 3571 /** Trim this many rightmost bases of the read */ 3572 final int forceTrimRight2; 3573 /** Trim right bases of the read modulo this value. 3574 * e.g. forceTrimModulo=50 would trim the last 3bp from a 153bp read. */ 3575 final int forceTrimModulo; 3576 3577 /** Discard reads with GC below this. */ 3578 final float minGC; 3579 /** Discard reads with GC above this. */ 3580 final float maxGC; 3581 /** Discard reads outside of GC bounds. */ 3582 final boolean filterGC; 3583 /** Average GC for paired reads. */ 3584 final boolean usePairGC; 3585 3586 /** If positive, only look for kmer matches in the leftmost X bases */ 3587 int restrictLeft; 3588 /** If positive, only look for kmer matches the rightmost X bases */ 3589 int restrictRight; 3590 3591 /** Trim implied adapters based on overlap, for reads with insert size shorter than read length */ 3592 final boolean trimByOverlap; 3593 final boolean useQualityForOverlap; 3594 final boolean strictOverlap; 3595 3596 // int minOverlap0=11; 3597 // int minOverlap=24; 3598 // final int overlapMargin=2; 3599 // final int overlapMaxMismatches0=4; 3600 // final int overlapMaxMismatches=4; 3601 // final int overlapMinq=13; 3602 3603 int minOverlap0=7; 3604 int minOverlap=14; 3605 int minInsert0=16; 3606 int minInsert=50; 3607 3608 final float maxRatio; 3609 final float ratioMargin; 3610 final float ratioOffset; 3611 final float efilterRatio; 3612 final float efilterOffset; 3613 final float pfilterRatio; 3614 final float meeFilter; 3615 3616 /** Skip this many initial input reads */ 3617 private final long skipreads; 3618 3619 /** Pairs go to outbad if either of them is bad, as opposed to requiring both to be bad. 3620 * Default: true. */ 3621 private final boolean removePairsIfEitherBad; 3622 3623 /** Print only statistics for scaffolds that matched at least one read 3624 * Default: true. */ 3625 private final boolean printNonZeroOnly; 3626 3627 /** Rename reads to indicate what they matched. 3628 * Default: false. */ 3629 private final boolean rename; 3630 /** Use names of reference files instead of scaffolds. 3631 * Default: false. */ 3632 private final boolean useRefNames; 3633 3634 /** Fraction of kmers to skip, 0 to 15 out of 16 */ 3635 private final int speed; 3636 3637 /** Skip this many kmers when examining the read. Default 1. 3638 * 1 means every kmer is used, 2 means every other, etc. */ 3639 private final int qSkip; 3640 3641 /** True if speed and qSkip are disabled. */ 3642 private final boolean noAccel; 3643 3644 private final boolean MAKE_QUALITY_ACCURACY; 3645 private final boolean MAKE_QUALITY_HISTOGRAM; 3646 private final boolean MAKE_MATCH_HISTOGRAM; 3647 private final boolean MAKE_BASE_HISTOGRAM; 3648 3649 private final boolean MAKE_EHIST; 3650 private final boolean MAKE_INDELHIST; 3651 private final boolean MAKE_LHIST; 3652 private final boolean MAKE_GCHIST; 3653 private final boolean MAKE_IDHIST; 3654 3655 /*--------------------------------------------------------------*/ 3656 /*---------------- Static Fields ----------------*/ 3657 /*--------------------------------------------------------------*/ 3658 3659 /** Number of tables (and threads, during loading) */ 3660 private static final int WAYS=7; //123 3661 /** Default initial size of data structures */ 3662 private static final int initialSizeDefault=128000; 3663 /** Verbose messages */ 3664 public static final boolean verbose=false; //123 3665 3666 /** Print messages to this stream */ 3667 private static PrintStream outstream=System.err; 3668 /** Permission to overwrite existing files */ 3669 public static boolean overwrite=true; 3670 /** Permission to append to existing files */ 3671 public static boolean append=false; 3672 /** Print speed statistics upon completion */ 3673 public static boolean showSpeed=true; 3674 /** Display progress messages such as memory usage */ 3675 public static boolean DISPLAY_PROGRESS=true; 3676 /** Number of ProcessThreads */ 3677 public static int THREADS=Shared.threads(); 3678 /** Indicates end of input stream */ 3679 private static final ArrayList<Read> POISON=new ArrayList<Read>(0); 3680 /** Number of columns for statistics output, 3 or 5 */ 3681 public static int STATS_COLUMNS=3; 3682 /** Release memory used by kmer storage after processing reads */ 3683 public static boolean RELEASE_TABLES=true; 3684 /** Max value of hitCount array */ 3685 public static final int HITCOUNT_LEN=1000; 3686 /** Make unambiguous copies of ref sequences with ambiguous bases */ 3687 public static boolean REPLICATE_AMBIGUOUS=false; 3688 3689 /** x&clearMasks[i] will clear base i */ 3690 private static final long[] clearMasks; 3691 /** x|setMasks[i][j] will set base i to j */ 3692 private static final long[][] setMasks; 3693 /** x&leftMasks[i] will clear all bases to the right of i (exclusive) */ 3694 private static final long[] leftMasks; 3695 /** x&rightMasks[i] will clear all bases to the left of i (inclusive) */ 3696 private static final long[] rightMasks; 3697 /** x|kMasks[i] will set the bit to the left of the leftmost base */ 3698 private static final long[] lengthMasks; 3699 3700 public static HashMap<String,String> RQC_MAP=null; 3701 3702 public static final int FILTERMODE=0, RIGHTMODE=1, LEFTMODE=2, NMODE=3; 3703 3704 /*--------------------------------------------------------------*/ 3705 /*---------------- Static Initializers ----------------*/ 3706 /*--------------------------------------------------------------*/ 3707 3708 static{ 3709 clearMasks=new long[32]; 3710 leftMasks=new long[32]; 3711 rightMasks=new long[32]; 3712 lengthMasks=new long[32]; 3713 setMasks=new long[4][32]; 3714 for(int i=0; i<32; i++){ 3715 clearMasks[i]=~(3L<<(2*i)); 3716 } 3717 for(int i=0; i<32; i++){ 3718 leftMasks[i]=((-1L)<<(2*i)); 3719 } 3720 for(int i=0; i<32; i++){ 3721 rightMasks[i]=~((-1L)<<(2*i)); 3722 } 3723 for(int i=0; i<32; i++){ 3724 lengthMasks[i]=((1L)<<(2*i)); 3725 } 3726 for(int i=0; i<32; i++){ 3727 for(long j=0; j<4; j++){ 3728 setMasks[(int)j][i]=(j<<(2*i)); 3729 } 3730 } 3731 } 3732 3733 } 3734