1 package jgi; 2 3 import java.io.File; 4 import java.io.PrintStream; 5 import java.util.ArrayList; 6 import java.util.Arrays; 7 import java.util.BitSet; 8 import java.util.HashMap; 9 import java.util.HashSet; 10 import java.util.Locale; 11 import java.util.concurrent.ArrayBlockingQueue; 12 import java.util.concurrent.atomic.AtomicLongArray; 13 14 import cardinality.CardinalityTracker; 15 import dna.AminoAcid; 16 import dna.Data; 17 import fileIO.ByteFile; 18 import fileIO.ByteStreamWriter; 19 import fileIO.FileFormat; 20 import fileIO.ReadWrite; 21 import fileIO.TextStreamWriter; 22 import hiseq.FlowcellCoordinate; 23 import json.JsonObject; 24 import kmer.AbstractKmerTable; 25 import kmer.AbstractKmerTableSet; 26 import kmer.ScheduleMaker; 27 import shared.KillSwitch; 28 import shared.Parse; 29 import shared.Parser; 30 import shared.PreParser; 31 import shared.ReadStats; 32 import shared.Shared; 33 import shared.Timer; 34 import shared.Tools; 35 import shared.TrimRead; 36 import stream.ConcurrentReadInputStream; 37 import stream.ConcurrentReadOutputStream; 38 import stream.FASTQ; 39 import stream.FastaReadInputStream; 40 import stream.Read; 41 import stream.SamLine; 42 import structures.EntropyTracker; 43 import structures.IntList; 44 import structures.ListNum; 45 import structures.PolymerTracker; 46 import var2.CallVariants; 47 import var2.ScafMap; 48 import var2.Var; 49 import var2.VarMap; 50 import var2.VcfLoader; 51 52 /** 53 * Separates, trims, or masks sequences based on matching kmers in a reference. 54 * Supports Hamming and and edit distance. 55 * Supports K 1-31 and emulated K>31. 56 * @author Brian Bushnell 57 * @date Aug 30, 2013 58 * 59 */ 60 public class BBDuk { 61 62 /*--------------------------------------------------------------*/ 63 /*---------------- Initialization ----------------*/ 64 /*--------------------------------------------------------------*/ 65 66 /** 67 * Code entrance from the command line. 68 * @param args Command line arguments 69 */ main(String[] args)70 public static void main(String[] args){ 71 72 // if(PreParser.isAmino(args)){ 73 // BBDukAA.main(args);//No longer needed since this supports AAs 74 // return; 75 // } 76 77 //Create a new BBDuk instance 78 BBDuk x=new BBDuk(args); 79 80 //And run it 81 x.process(); 82 83 //Close the print stream if it was redirected 84 Shared.closeStream(outstream); 85 assert(WAYS==7 && !verbose) : "Undo WAYS and verbose"; 86 } 87 88 /** 89 * Constructor. 90 * @param args Command line arguments 91 */ BBDuk(String[] args)92 public BBDuk(String[] args){ 93 {//Preparse block for help, config files, and outstream 94 PreParser pp=new PreParser(args, getClass(), true); 95 args=pp.args; 96 outstream=pp.outstream; 97 jsonStats=pp.jsonObject; 98 json=pp.json; 99 } 100 101 /* Set global defaults */ 102 ReadWrite.ZIPLEVEL=2; 103 ReadWrite.USE_UNPIGZ=true; 104 ReadWrite.USE_PIGZ=true; 105 106 if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){ 107 ByteFile.FORCE_MODE_BF2=true; 108 } 109 110 /* Initialize local variables with defaults */ 111 boolean setOut=false, setOutb=false; 112 boolean ktrimRight_=false, ktrimLeft_=false, ktrimN_=false, ktrimExclusive_=false, ksplit_=false; 113 boolean findBestMatch_=false; 114 boolean addTrimmedToBad_=true; 115 boolean rcomp_=true; 116 boolean forbidNs_=false; 117 boolean useForest_=false, useTable_=false, useArray_=true, prealloc_=false; 118 int k_=27, kbig_=-1; 119 int mink_=-1; 120 int ways_=-1; //Currently disabled for speed 121 int maxBadKmers_=0; 122 long skipreads_=0; 123 byte TRIM_SYMBOL_='N'; 124 boolean kmaskLowercase_=false; 125 boolean kmaskFullyCovered_=false; 126 boolean histogramsBeforeProcessing_=true; 127 boolean trimFailuresTo1bp_=false; 128 129 Parser parser=new Parser(); 130 parser.trimq=6; 131 parser.minAvgQuality=0; 132 parser.minReadLength=10; 133 parser.maxReadLength=Integer.MAX_VALUE; 134 parser.minLenFraction=0f; 135 parser.requireBothBad=false; 136 parser.maxNs=-1; 137 parser.overwrite=overwrite; 138 boolean trimByOverlap_=false, useQualityForOverlap_=false, strictOverlap_=true; 139 boolean trimPairsEvenly_=false; 140 boolean ordered_=false; 141 int minoverlap_=-1, mininsert_=-1; 142 int restrictLeft_=0, restrictRight_=0, speed_=0, qSkip_=1; 143 boolean printNonZeroOnly_=true; 144 boolean rename_=false, useRefNames_=false; 145 boolean skipr1_=false, skipr2_=false; 146 boolean ecc_=false; 147 float minBaseFrequency_=0; 148 float minKmerFraction_=0; 149 float minCoveredFraction_=0; 150 boolean setk=false; 151 152 scaffoldNames.add(""); //Necessary so that the first real scaffold gets an id of 1, not zero 153 scaffoldLengths.add(0); 154 155 /* Parse arguments */ 156 for(int i=0; i<args.length; i++){ 157 158 final String arg=args[i]; 159 String[] split=arg.split("="); 160 String a=split[0].toLowerCase(); 161 String b=split.length>1 ? split[1] : null; 162 163 if(Parser.parseZip(arg, a, b)){ 164 //do nothing 165 }else if(Parser.parseHist(arg, a, b)){ 166 //do nothing 167 }else if(Parser.parseCommonStatic(arg, a, b)){ 168 //do nothing 169 }else if(Parser.parseQualityAdjust(arg, a, b)){ 170 //do nothing 171 }else if(Parser.parseQuality(arg, a, b)){ 172 //do nothing 173 }else if(Parser.parseFasta(arg, a, b)){ 174 //do nothing 175 }else if(parser.parseInterleaved(arg, a, b)){ 176 //do nothing 177 }else if(parser.parseTrim(arg, a, b)){ 178 //do nothing 179 }else if(parser.parseCommon(arg, a, b)){ 180 //do nothing 181 }else if(parser.parseCardinality(arg, a, b)){ 182 //do nothing 183 }else if(a.equals("in") || a.equals("in1")){ 184 in1=b; 185 }else if(a.equals("in2")){ 186 in2=b; 187 }else if(a.equals("qfin") || a.equals("qfin1")){ 188 qfin1=b; 189 }else if(a.equals("qfin2")){ 190 qfin2=b; 191 }else if(a.equals("qfout") || a.equals("qfout1")){ 192 qfout1=b; 193 }else if(a.equals("qfin2")){ 194 qfout2=b; 195 }else if(a.equals("out") || a.equals("out1") || a.equals("outu") || a.equals("outu1") || a.equals("outnonmatch") || 196 a.equals("outnonmatch1") || a.equals("outunnmatch") || a.equals("outunmatch1") || a.equals("outunnmatched") || a.equals("outunmatched1")){ 197 out1=b; 198 setOut=true; 199 }else if(a.equals("out2") || a.equals("outu2") || a.equals("outnonmatch2") || a.equals("outunmatch2") || 200 a.equals("outnonmatched2") || a.equals("outunmatched2")){ 201 out2=b; 202 }else if(a.equals("outb") || a.equals("outm") || a.equals("outb1") || a.equals("outm1") || a.equals("outbad") || 203 a.equals("outbad1") || a.equals("outmatch") || a.equals("outmatch1")){ 204 outb1=b; 205 setOut=true; 206 }else if(a.equals("outb2") || a.equals("outm2") || a.equals("outbad2") || a.equals("outmatch2")){ 207 outb2=b; 208 }else if(a.equals("outs") || a.equals("outsingle")){ 209 outsingle=b; 210 }else if(a.equals("stats") || a.equals("scafstats")){ 211 outstats=b; 212 }else if(a.equals("polymerstats") || a.equals("polymerstatsfile") || a.equals("pstats") || a.equals("phist")){ 213 polymerStatsFile=b; 214 }else if(a.equals("refstats")){ 215 outrefstats=b; 216 }else if(a.equals("rpkm") || a.equals("fpkm") || a.equals("cov") || a.equals("coverage")){ 217 outrpkm=b; 218 }else if(a.equals("sam") || a.equals("bam")){ 219 samFile=b; 220 }else if(a.equals("duk") || a.equals("outduk")){ 221 outduk=b; 222 }else if(a.equals("rqc")){ 223 outrqc=b; 224 }else if(a.equals("ref") || a.equals("adapters")){ 225 ref=(b==null) ? null : (new File(b).exists() ? new String[] {b} : b.split(",")); 226 }else if(a.equals("altref")){ 227 altref=(b==null) ? null : (new File(b).exists() ? new String[] {b} : b.split(",")); 228 }else if(a.equals("samref") || a.equals("bamref")){ 229 samref=b; 230 }else if(a.equals("literal")){ 231 literal=(b==null) ? null : b.split(","); 232 // assert(false) : b+", "+Arrays.toString(literal); 233 }else if(a.equals("forest")){ 234 useForest_=Parse.parseBoolean(b); 235 if(useForest_){useTable_=useArray_=false;} 236 }else if(a.equals("table")){ 237 useTable_=Parse.parseBoolean(b); 238 if(useTable_){useForest_=useArray_=false;} 239 }else if(a.equals("array")){ 240 useArray_=Parse.parseBoolean(b); 241 if(useArray_){useTable_=useForest_=false;} 242 }else if(a.equals("ways")){ 243 ways_=Integer.parseInt(b); 244 }else if(a.equals("ordered") || a.equals("ord")){ 245 ordered_=Parse.parseBoolean(b); 246 }else if(a.equals("skipr1")){ 247 skipr1_=Parse.parseBoolean(b); 248 }else if(a.equals("skipr2")){ 249 skipr2_=Parse.parseBoolean(b); 250 }else if(a.equals("k")){ 251 assert(b!=null) : "\nThe k key needs an integer value greater than 0, such as k=27\n"; 252 k_=Integer.parseInt(b); 253 setk=true; 254 }else if(a.equals("mink") || a.equals("kmin")){ 255 mink_=Integer.parseInt(b); 256 assert(mink_<0 || (mink_>0 && mink_<32)) : "kmin must be between 1 and 31; default is 4, negative numbers disable it."; 257 }else if(a.equals("useshortkmers") || a.equals("shortkmers") || a.equals("usk")){ 258 useShortKmers=Parse.parseBoolean(b); 259 }else if(a.equals("trimextra") || a.equals("trimpad") || a.equals("tp")){ 260 trimPad=Integer.parseInt(b); 261 }else if(a.equals("hdist") || a.equals("hammingdistance")){ 262 hammingDistance=Integer.parseInt(b); 263 assert(hammingDistance>=0 && hammingDistance<4) : "hamming distance must be between 0 and 3; default is 0."; 264 }else if(a.equals("qhdist") || a.equals("queryhammingdistance")){ 265 qHammingDistance=Integer.parseInt(b); 266 assert(qHammingDistance>=0 && qHammingDistance<4) : "hamming distance must be between 0 and 3; default is 0."; 267 }else if(a.equals("edits") || a.equals("edist") || a.equals("editdistance")){ 268 editDistance=Integer.parseInt(b); 269 assert(editDistance>=0 && editDistance<3) : "edit distance must be between 0 and 2; default is 0.\n" + 270 "You can bypass this error message with the -da flag, but edist=3 at K=31 " + 271 "requires 15,000,000x the time and memory for indexing compared to edist=0."; 272 }else if(a.equals("hdist2") || a.equals("hammingdistance2")){ 273 hammingDistance2=Integer.parseInt(b); 274 assert(hammingDistance2>=0 && hammingDistance2<4) : "hamming distance must be between 0 and 3; default is 0."; 275 }else if(a.equals("qhdist2") || a.equals("queryhammingdistance2")){ 276 qHammingDistance2=Integer.parseInt(b); 277 assert(qHammingDistance2>=0 && qHammingDistance2<4) : "hamming distance must be between 0 and 3; default is 0."; 278 }else if(a.equals("edits2") || a.equals("edist2") || a.equals("editdistance2")){ 279 editDistance2=Integer.parseInt(b); 280 assert(editDistance2>=0 && editDistance2<3) : "edit distance must be between 0 and 2; default is 0."; 281 }else if(a.equals("maxskip") || a.equals("maxrskip") || a.equals("mxs")){ 282 maxSkip=Integer.parseInt(b); 283 }else if(a.equals("minskip") || a.equals("minrskip") || a.equals("mns")){ 284 minSkip=Integer.parseInt(b); 285 }else if(a.equals("skip") || a.equals("refskip") || a.equals("rskip")){ 286 minSkip=maxSkip=Integer.parseInt(b); 287 }else if(a.equals("qskip")){ 288 qSkip_=Integer.parseInt(b); 289 }else if(a.equals("speed")){ 290 speed_=Integer.parseInt(b); 291 assert(speed_>=0 && speed_<=16) : "Speed range is 0 to 16. Value: "+speed_; 292 }else if(a.equals("skipreads")){ 293 skipreads_=Parse.parseKMG(b); 294 }else if(a.equals("maxbadkmers") || a.equals("mbk")){ 295 maxBadKmers_=Integer.parseInt(b); 296 }else if(a.equals("minhits") || a.equals("minkmerhits") || a.equals("mkh")){ 297 maxBadKmers_=Integer.parseInt(b)-1; 298 }else if(a.equals("minkmerfraction") || a.equals("minfraction") || a.equals("mkf")){ 299 minKmerFraction_=Float.parseFloat(b); 300 }else if(a.equals("mincoveredfraction") || a.equals("mincovfraction") || a.equals("mcf")){ 301 minCoveredFraction_=Float.parseFloat(b); 302 }else if(a.equals("showspeed") || a.equals("ss")){ 303 showSpeed=Parse.parseBoolean(b); 304 }else if(a.equals("verbose")){ 305 assert(false) : "Verbose flag is currently static final; must be recompiled to change."; 306 assert(WAYS>1) : "WAYS=1 is for debug mode."; 307 }else if(a.equals("mm") || a.equals("maskmiddle")){ 308 maskMiddle=Parse.parseBoolean(b); 309 }else if(a.equals("rcomp")){ 310 rcomp_=Parse.parseBoolean(b); 311 }else if(a.equals("forbidns") || a.equals("forbidn") || a.equals("fn")){ 312 forbidNs_=Parse.parseBoolean(b); 313 }else if(a.equals("findbestmatch") || a.equals("fbm")){ 314 findBestMatch_=Parse.parseBoolean(b); 315 }else if(a.equals("kfilter")){ 316 boolean x=Parse.parseBoolean(b); 317 if(x){ktrimLeft_=ktrimRight_=ktrimN_=ksplit_=false;} 318 }else if(a.equals("ksplit")){ 319 boolean x=Parse.parseBoolean(b); 320 if(x){ksplit_=true; ktrimLeft_=ktrimRight_=ktrimN_=false;} 321 else{ksplit_=false;} 322 }else if(a.equals("ktrim")){ 323 if(b==null){b="";} 324 if(b.equalsIgnoreCase("left") || b.equalsIgnoreCase("l")){ktrimLeft_=true;ktrimRight_=false;ktrimN_=false;} 325 else if(b.equalsIgnoreCase("right") || b.equalsIgnoreCase("r")){ktrimLeft_=false;ktrimRight_=true;ktrimN_=false;} 326 else if(b.equalsIgnoreCase("n")){ 327 ktrimLeft_=ktrimRight_=ksplit_=false; 328 ktrimN_=true; 329 }else if(b.length()==1 && !b.equalsIgnoreCase("t") && !b.equalsIgnoreCase("f")){ 330 ktrimLeft_=ktrimRight_=ksplit_=false; 331 ktrimN_=true; 332 TRIM_SYMBOL_=(byte)b.charAt(0); 333 }else{ 334 assert(b!=null && (b.equalsIgnoreCase("f") || b.equalsIgnoreCase("false"))) : 335 "\nInvalid setting for ktrim - values must be f (false), l (left), r (right), or n."; 336 ktrimRight_=ktrimLeft_=false; 337 } 338 }else if(a.equals("kmask") || a.equals("mask")){ 339 if("lc".equalsIgnoreCase(b) || "lowercase".equalsIgnoreCase(b)){ 340 kmaskLowercase_=true; 341 ktrimN_=true; 342 ktrimLeft_=ktrimRight_=ksplit_=false; 343 }else{ 344 if(Parse.parseBoolean(b)){b="N";} 345 if(b!=null && b.length()==1){ 346 ktrimLeft_=false;ktrimRight_=false;ktrimN_=true; 347 TRIM_SYMBOL_=(byte)b.charAt(0); 348 }else{ 349 boolean x=Parse.parseBoolean(b); 350 // assert(!x) : "\nInvalid setting for kmask - values must be f (false), t (true), or a single character for replacement."; 351 ktrimN_=x; 352 } 353 } 354 }else if(a.equals("kmaskfullycovered") || a.equals("maskfullycovered") || a.equals("mfc")){ 355 kmaskFullyCovered_=Parse.parseBoolean(b); 356 }else if(a.equals("ktrimright")){ 357 ktrimRight_=Parse.parseBoolean(b); 358 ktrimLeft_=ktrimN_=!(ktrimRight_); 359 }else if(a.equals("ktrimleft")){ 360 ktrimLeft_=Parse.parseBoolean(b); 361 ktrimRight_=ktrimN_=!(ktrimLeft_); 362 }else if(a.equals("ktrimn")){ 363 ktrimN_=Parse.parseBoolean(b); 364 ktrimLeft_=ktrimRight_=!(ktrimN_); 365 }else if(a.equals("ktrimexclusive")){ 366 ktrimExclusive_=Parse.parseBoolean(b); 367 }else if(a.equals("tbo") || a.equals("trimbyoverlap")){ 368 trimByOverlap_=Parse.parseBoolean(b); 369 }else if(a.equals("strictoverlap")){ 370 strictOverlap_=Parse.parseBoolean(b); 371 }else if(a.equals("usequality")){ 372 useQualityForOverlap_=Parse.parseBoolean(b); 373 }else if(a.equals("tpe") || a.equals("tbe") || a.equals("trimpairsevenly")){ 374 trimPairsEvenly_=Parse.parseBoolean(b); 375 }else if(a.equals("ottm") || a.equals("outputtrimmedtomatch")){ 376 addTrimmedToBad_=Parse.parseBoolean(b); 377 }else if(a.equals("minoverlap")){ 378 minoverlap_=Integer.parseInt(b); 379 }else if(a.equals("mininsert")){ 380 mininsert_=Integer.parseInt(b); 381 }else if(a.equals("prealloc") || a.equals("preallocate")){ 382 if(b==null || b.length()<1 || Character.isLetter(b.charAt(0))){ 383 prealloc_=Parse.parseBoolean(b); 384 }else{ 385 preallocFraction=Tools.max(0, Double.parseDouble(b)); 386 prealloc_=(preallocFraction>0); 387 } 388 }else if(a.equals("restrictleft")){ 389 restrictLeft_=Integer.parseInt(b); 390 }else if(a.equals("restrictright")){ 391 restrictRight_=Integer.parseInt(b); 392 }else if(a.equals("statscolumns") || a.equals("columns") || a.equals("cols")){ 393 STATS_COLUMNS=Integer.parseInt(b); 394 assert(STATS_COLUMNS==3 || STATS_COLUMNS==5) : "statscolumns bust be either 3 or 5. Invalid value: "+STATS_COLUMNS; 395 }else if(a.equals("nzo") || a.equals("nonzeroonly")){ 396 printNonZeroOnly_=Parse.parseBoolean(b); 397 }else if(a.equals("rename")){ 398 rename_=Parse.parseBoolean(b); 399 }else if(a.equals("refnames") || a.equals("userefnames")){ 400 useRefNames_=Parse.parseBoolean(b); 401 }else if(a.equals("initialsize")){ 402 initialSize=Parse.parseIntKMG(b); 403 }else if(a.equals("dump")){ 404 dump=b; 405 }else if(a.equals("minentropy") || a.equals("entropy") || a.equals("entropyfilter")){ 406 entropyCutoff=Float.parseFloat(b); 407 }else if(a.equals("verifyentropy")){ 408 EntropyTracker.verify=Parse.parseBoolean(b); 409 // }else if(a.equals("entropytracker") || a.equals("usetracker")){ 410 // useEntropyTracker=Parse.parseBoolean(b); 411 }else if(a.equals("entropymask") || a.equals("maskentropy")){ 412 if(b==null){ 413 entropyMask=true; 414 }else if(b.equalsIgnoreCase("lc") || b.equalsIgnoreCase("lowercase")){ 415 entropyMask=true; 416 entropyMaskLowercase=true; 417 }else if(b.equalsIgnoreCase("filter")){ 418 entropyMask=false; 419 }else{ 420 entropyMask=Parse.parseBoolean(b); 421 } 422 }else if(a.equals("entropytrim") || a.equals("trimentropy")){ 423 entropyTrim=parseEnd(b); 424 }else if(a.equals("entropymark") || a.equals("markentropy")){ 425 entropyMark=Parse.parseBoolean(b); 426 } 427 428 else if(a.equals("countpolymers")){ 429 countPolymers=Parse.parseBoolean(b); 430 }else if(a.equals("polybase1")){ 431 polymerChar1=(byte)b.charAt(0); 432 }else if(a.equals("polybase2")){ 433 polymerChar2=(byte)b.charAt(0); 434 }else if(a.equals("polymerratio") || a.equals("pratio")){ 435 assert(b!=null); 436 b=b.toUpperCase(); 437 if(b.length()==2){ 438 polymerChar1=(byte)b.charAt(0); 439 polymerChar2=(byte)b.charAt(1); 440 }else if(b.length()==3){ 441 assert(b.charAt(1)==','); 442 polymerChar1=(byte)b.charAt(0); 443 polymerChar2=(byte)b.charAt(2); 444 }else{ 445 assert(false) : "Format should be pratio=G,C"; 446 } 447 assert(polymerChar1>=0 && polymerChar2>=0); 448 assert(AminoAcid.baseToNumberACGTN[polymerChar1]>=0 && AminoAcid.baseToNumberACGTN[polymerChar2]>=0) : "Only ACGTN polymer tracking is possible: "+arg; 449 }else if(a.equals("polymerlength") || a.equals("plen")){ 450 polymerLength=Integer.parseInt(b); 451 assert(polymerLength>=1); 452 } 453 454 else if(a.equals("minbasefrequency")){ 455 minBaseFrequency_=Float.parseFloat(b); 456 }else if(a.equals("ecco") || a.equals("ecc")){ 457 ecc_=Parse.parseBoolean(b); 458 }else if(a.equals("copyundefined") || a.equals("cu")){ 459 REPLICATE_AMBIGUOUS=Parse.parseBoolean(b); 460 }else if(a.equals("path")){ 461 Data.setPath(b); 462 }else if(a.equals("maxbasesoutm")){ 463 maxBasesOutm=Parse.parseKMG(b); 464 }else if(a.equals("maxbasesoutu") || a.equals("maxbasesout")){ 465 maxBasesOutu=Parse.parseKMG(b); 466 }else if(a.equals("vars") || a.equals("variants") || a.equals("varfile") || a.equals("inv")){ 467 varFile=b; 468 }else if(a.equals("vcf") || a.equals("vcffile")){ 469 vcfFile=b; 470 }else if(a.equals("unfixvars") || a.equals("unfixvariants")){ 471 unfixVariants=Parse.parseBoolean(b); 472 }else if(a.equals("histogramsbefore") || a.equals("histbefore")){ 473 histogramsBeforeProcessing_=Parse.parseBoolean(b); 474 }else if(a.equals("histogramsafter") || a.equals("histafter")){ 475 histogramsBeforeProcessing_=!Parse.parseBoolean(b); 476 } 477 478 else if(a.equals("trimfailures") || a.equals("trimfailuresto1bp")){ 479 trimFailuresTo1bp_=Parse.parseBoolean(b); 480 } 481 482 else if(a.equals("minx") || a.equals("xmin")){ 483 xMinLoc=Parse.parseIntKMG(b); 484 }else if(a.equals("miny") || a.equals("ymin")){ 485 yMinLoc=Parse.parseIntKMG(b); 486 }else if(a.equals("maxx") || a.equals("xmax")){ 487 xMaxLoc=Parse.parseIntKMG(b); 488 }else if(a.equals("maxy") || a.equals("ymax")){ 489 yMaxLoc=Parse.parseIntKMG(b); 490 } 491 492 else if(a.equalsIgnoreCase("pairedToSingle")){ 493 pairedToSingle=Parse.parseBoolean(b); 494 } 495 496 else if(a.equals("filtersubs") || a.equals("filtervars")){ 497 filterVars=Parse.parseBoolean(b); 498 }else if(a.equals("maxbadsubs") || a.equals("maxbadbars")){ 499 maxBadSubs=Integer.parseInt(b); 500 }else if(a.equals("maxbadsubdepth") || a.equals("maxbadvardepth") || a.equals("maxbadsuballeledepth") || a.equals("maxbadvaralleledepth") || a.equals("mbsad")){ 501 maxBadSubAlleleDepth=Integer.parseInt(b); 502 }else if(a.equals("minbadsubreaddepth") || a.equals("minbadvarreaddepth") || a.equals("mbsrd")){ 503 minBadSubReadDepth=Integer.parseInt(b); 504 } 505 506 else if(a.equals("khist") || a.equals("khistin")){ 507 khistIn=b; 508 }else if(a.equals("khistout")){ 509 khistOut=b; 510 } 511 512 else if(a.equals("json")){ 513 json=Parse.parseBoolean(b); 514 } 515 516 else if(a.equals("swift")){ 517 swift=Parse.parseBoolean(b); 518 } 519 520 else if(i==0 && in1==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){ 521 in1=args[i]; 522 }else if(i==1 && out1==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){ 523 out1=args[i]; 524 setOut=true; 525 }else if(i==2 && ref==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){ 526 ref=(new File(args[i]).exists() ? new String[] {args[i]} : args[i].split(",")); 527 }else{ 528 throw new RuntimeException("Unknown parameter "+args[i]); 529 } 530 } 531 if(WAYS!=7){ 532 System.err.println("WARNING! WAYS="+WAYS+". Probably debug mode."); 533 } 534 535 if(khistIn!=null || khistOut!=null){ 536 if(khistIn!=null){parser.loglog=true;} 537 if(khistOut!=null){parser.loglogOut=true;} 538 parser.loglogbuckets=Tools.max(parser.loglogbuckets, 128000); 539 CardinalityTracker.trackCounts=true; 540 } 541 542 if(ordered_ && !silent){ 543 outstream.println("Set ORDERED to "+ordered_); 544 if(jsonStats!=null){jsonStats.add("ordered", ordered_);} 545 } 546 if(silent || json){ 547 AbstractKmerTableSet.DISPLAY_PROGRESS=false; //TODO: Test to make sure this occurs for silent mode 548 } 549 550 if(hammingDistance2==-1){hammingDistance2=hammingDistance;} 551 if(qHammingDistance2==-1){qHammingDistance2=qHammingDistance;} 552 if(editDistance2==-1){editDistance2=editDistance;} 553 minBaseFrequency=minBaseFrequency_; 554 555 {//Process parser fields 556 Parser.processQuality(); 557 558 maxReads=parser.maxReads; 559 samplerate=parser.samplerate; 560 sampleseed=parser.sampleseed; 561 recalibrateQuality=parser.recalibrateQuality; 562 563 overwrite=ReadStats.overwrite=parser.overwrite; 564 append=ReadStats.append=parser.append; 565 // testsize=parser.testsize; 566 // trimBadSequence=parser.trimBadSequence; 567 // breakLength=parser.breakLength; 568 569 forceTrimModulo=parser.forceTrimModulo; 570 forceTrimLeft=parser.forceTrimLeft; 571 forceTrimRight=parser.forceTrimRight; 572 forceTrimRight2=parser.forceTrimRight2; 573 trimq=parser.trimq; 574 trimE=parser.trimE(); 575 qtrimLeft=parser.qtrimLeft && trimE<1; 576 qtrimRight=parser.qtrimRight && trimE<1; 577 trimClip=parser.trimClip; 578 trimPolyA=parser.trimPolyA; 579 trimPolyGLeft=parser.trimPolyGLeft; 580 trimPolyGRight=parser.trimPolyGRight; 581 filterPolyG=parser.filterPolyG; 582 trimPolyCLeft=parser.trimPolyCLeft; 583 trimPolyCRight=parser.trimPolyCRight; 584 filterPolyC=parser.filterPolyC; 585 minLenFraction=parser.minLenFraction; 586 minAvgQuality=parser.minAvgQuality; 587 minAvgQualityBases=parser.minAvgQualityBases; 588 minBaseQuality=parser.minBaseQuality; 589 chastityFilter=parser.chastityFilter; 590 failBadBarcodes=parser.failBadBarcodes; 591 removeBadBarcodes=parser.removeBadBarcodes; 592 failIfNoBarcode=parser.failIfNoBarcode; 593 barcodes=parser.barcodes; 594 minReadLength=parser.minReadLength; 595 maxReadLength=parser.maxReadLength; 596 maxNs=parser.maxNs; 597 minConsecutiveBases=parser.minConsecutiveBases; 598 // untrim=parser.untrim; 599 // minTrimLength=(parser.minTrimLength>=0 ? parser.minTrimLength : minTrimLength); 600 // requireBothBad=parser.requireBothBad; 601 removePairsIfEitherBad=(!parser.requireBothBad) && (!trimFailuresTo1bp_); 602 tossJunk=parser.tossJunk; 603 604 minGC=parser.minGC; 605 maxGC=parser.maxGC; 606 filterGC=(minGC>0 || maxGC<1); 607 usePairGC=parser.usePairGC; 608 609 loglogIn=(parser.loglog ? CardinalityTracker.makeTracker(parser) : null); 610 loglogOut=(parser.loglogOut ? CardinalityTracker.makeTracker(parser) : null); 611 612 THREADS=Shared.threads(); 613 silent=Parser.silent; 614 if(silent){ 615 DISPLAY_PROGRESS=false; 616 showSpeed=false; 617 } 618 } 619 620 ref=modifyRefPath(ref, refNames); 621 altref=modifyRefPath(altref, altRefNames); 622 if(literal!=null){ 623 refNames.add("literal"); 624 if(!altRefNames.isEmpty()){altRefNames.add("literal");} 625 } 626 refScafCounts=new int[refNames.size()]; 627 628 if(minoverlap_>=0){ 629 minOverlap=Tools.max(minoverlap_, 1); 630 minOverlap0=Tools.min(minOverlap0, minOverlap); 631 } 632 633 if(mininsert_>=0){ 634 minInsert=Tools.max(mininsert_, 1); 635 minInsert0=Tools.min(minInsert0, minInsert); 636 } 637 638 /* Set final variables; post-process and validate argument combinations */ 639 640 useForest=useForest_; 641 useTable=useTable_; 642 useArray=useArray_; 643 hammingDistance=Tools.max(editDistance, hammingDistance); 644 hammingDistance2=Tools.max(editDistance2, hammingDistance2); 645 minSkip=Tools.max(1, Tools.min(minSkip, maxSkip)); 646 maxSkip=Tools.max(minSkip, maxSkip); 647 addTrimmedToBad=addTrimmedToBad_; 648 forbidNs=(forbidNs_ || hammingDistance<1); 649 trimSymbol=TRIM_SYMBOL_; 650 kmaskLowercase=kmaskLowercase_; 651 kmaskFullyCovered=kmaskFullyCovered_; 652 skipreads=skipreads_; 653 trimByOverlap=trimByOverlap_; 654 useQualityForOverlap=useQualityForOverlap_; 655 strictOverlap=strictOverlap_; 656 trimPairsEvenly=trimPairsEvenly_; 657 ordered=ordered_; 658 restrictLeft=Tools.max(restrictLeft_, 0); 659 restrictRight=Tools.max(restrictRight_, 0); 660 printNonZeroOnly=printNonZeroOnly_; 661 rename=rename_; 662 findBestMatch=(rename || findBestMatch_); 663 useRefNames=useRefNames_; 664 speed=speed_; 665 qSkip=qSkip_; 666 noAccel=(speed<1 && qSkip<2); 667 accel=!noAccel; 668 skipR1=skipr1_; 669 skipR2=skipr2_; 670 ecc=ecc_; 671 locationFilter=(xMinLoc>0 || yMinLoc>0 || xMaxLoc>-1 || yMaxLoc>-1); 672 trimFailuresTo1bp=trimFailuresTo1bp_; 673 674 amino=Shared.AMINO_IN; 675 rcomp=rcomp_ && !amino; 676 677 //Set K 678 maxSupportedK=(amino ? 12 : 31); 679 if(!setk){k_=(amino ? 11 : 27);} 680 kbig_=(k_>maxSupportedK ? k_ : -1); 681 k_=Tools.min(k_, maxSupportedK); 682 683 if(strictOverlap){ 684 maxRatio=0.05f; 685 ratioMargin=9f; 686 ratioOffset=0.5f; 687 efilterRatio=3.5f; 688 efilterOffset=0.05f; 689 pfilterRatio=0.001f; 690 meeFilter=15f; 691 }else{ 692 maxRatio=0.10f; 693 ratioMargin=5f; 694 ratioOffset=0.4f; 695 efilterRatio=6f; 696 efilterOffset=0.05f; 697 pfilterRatio=0.00005f; 698 meeFilter=999999999; 699 } 700 701 histogramsBeforeProcessing=histogramsBeforeProcessing_; 702 MAKE_QUALITY_HISTOGRAM=ReadStats.COLLECT_QUALITY_STATS; 703 MAKE_QUALITY_ACCURACY=ReadStats.COLLECT_QUALITY_ACCURACY; 704 MAKE_MATCH_HISTOGRAM=ReadStats.COLLECT_MATCH_STATS; 705 MAKE_BASE_HISTOGRAM=ReadStats.COLLECT_BASE_STATS; 706 MAKE_EHIST=ReadStats.COLLECT_ERROR_STATS; 707 MAKE_INDELHIST=ReadStats.COLLECT_INDEL_STATS; 708 MAKE_LHIST=ReadStats.COLLECT_LENGTH_STATS; 709 MAKE_GCHIST=ReadStats.COLLECT_GC_STATS; 710 MAKE_ENTROPYHIST=ReadStats.COLLECT_ENTROPY_STATS; 711 MAKE_IDHIST=ReadStats.COLLECT_IDENTITY_STATS; 712 MAKE_IHIST=ReadStats.COLLECT_INSERT_STATS; 713 714 { 715 long usableMemory; 716 long tableMemory; 717 718 { 719 long memory=Runtime.getRuntime().maxMemory(); 720 double xmsRatio=Shared.xmsRatio(); 721 usableMemory=(long)Tools.max(((memory-96000000-(20*400000 /* for atomic arrays */))*(xmsRatio>0.97 ? 0.82 : 0.72)), memory*0.45); 722 tableMemory=(long)(usableMemory*.95); 723 } 724 725 if(initialSize<1){ 726 final long memOverWays=tableMemory/(12*WAYS); 727 final double mem2=(prealloc_ ? preallocFraction : 1)*tableMemory; 728 initialSize=(prealloc_ || memOverWays<initialSizeDefault ? (int)Tools.min(2142000000, (long)(mem2/(12*WAYS))) : initialSizeDefault); 729 if(initialSize!=initialSizeDefault && !silent){ 730 outstream.println("Initial size set to "+initialSize); 731 } 732 } 733 } 734 735 if(ktrimLeft_ || ktrimRight_ || ktrimN_ || ksplit_){ 736 if(kbig_>k_){ 737 outstream.println("*********************** WARNING ***********************"); 738 outstream.println("WARNING: When kmer-trimming or masking, the maximum value of K is "+k_+"."); 739 outstream.println("K has been reduced from "+kbig_+" to "+k_+"."); 740 outstream.println("***********************************************************"); 741 kbig_=k_; 742 } 743 } 744 745 if((speed>0 || qSkip>1) && kbig_>k_){ 746 outstream.println("*********************** WARNING ***********************"); 747 outstream.println("WARNING: When speed>0 or qskip>1, the maximum value of K is "+k_+"."); 748 outstream.println("K has been reduced from "+kbig_+" to "+k_+"."); 749 outstream.println("***********************************************************"); 750 kbig_=k_; 751 } 752 753 if((speed>0 && qSkip>1) || (qSkip>1 && maxSkip>1) || (speed>0 && maxSkip>1)){ 754 outstream.println("WARNING: It is not recommended to use more than one of 'qskip', 'speed', and 'rskip/maxskip' together."); 755 outstream.println("qskip="+qSkip+", speed="+speed+", maxskip="+maxSkip); 756 } 757 758 k=k_; 759 k2=k-1; 760 kbig=kbig_; 761 keff=Tools.max(k, kbig); 762 if(kbig>k){ 763 minSkip=maxSkip=0; 764 if(maskMiddle){ 765 outstream.println("maskMiddle was disabled because kbig>k"); 766 maskMiddle=false; 767 } 768 } 769 mink=Tools.min((mink_<1 ? 6 : mink_), k); 770 maxBadKmers0=maxBadKmers_; 771 772 {//set some constants 773 bitsPerBase=(amino ? 5 : 2); 774 maxSymbol=(amino ? 20 : 3); 775 symbols=maxSymbol+1; 776 symbolArrayLen=(64+bitsPerBase-1)/bitsPerBase; 777 symbolSpace=(1<<bitsPerBase); 778 symbolMask=symbolSpace-1; 779 780 symbolToNumber=AminoAcid.symbolToNumber(amino); 781 symbolToNumber0=AminoAcid.symbolToNumber0(amino); 782 symbolToComplementNumber0=AminoAcid.symbolToComplementNumber0(amino); 783 784 clearMasks=new long[symbolArrayLen]; 785 leftMasks=new long[symbolArrayLen]; 786 rightMasks=new long[symbolArrayLen]; 787 lengthMasks=new long[symbolArrayLen]; 788 setMasks=new long[symbols][symbolArrayLen]; 789 for(int i=0; i<symbolArrayLen; i++){ 790 clearMasks[i]=~(symbolMask<<(bitsPerBase*i)); 791 leftMasks[i]=((-1L)<<(bitsPerBase*i)); 792 rightMasks[i]=~((-1L)<<(bitsPerBase*i)); 793 lengthMasks[i]=((1L)<<(bitsPerBase*i)); 794 for(long j=0; j<symbols; j++){ 795 setMasks[(int)j][i]=(j<<(bitsPerBase*i)); 796 } 797 } 798 799 minlen=k-1; 800 minminlen=mink-1; 801 minlen2=(maskMiddle ? k/2 : k); 802 shift=bitsPerBase*k; 803 shift2=shift-bitsPerBase; 804 mask=(shift>63 ? -1L : ~((-1L)<<shift)); 805 kmask=lengthMasks[k]; 806 } 807 808 minKmerFraction=Tools.max(minKmerFraction_, 0); 809 assert(minKmerFraction<=1) : "minKmerFraction must range from 0 to 1; value="+minKmerFraction; 810 811 minCoveredFraction=Tools.max(minCoveredFraction_, 0); 812 assert(minCoveredFraction<=1) : "minCoveredFraction must range from 0 to 1; value="+minCoveredFraction; 813 814 if(mink_>0 && mink_<k){useShortKmers=true;} 815 if(useShortKmers){ 816 if(maskMiddle){ 817 outstream.println("maskMiddle was disabled because useShortKmers=true"); 818 maskMiddle=false; 819 } 820 } 821 822 ktrimRight=ktrimRight_; 823 ktrimLeft=ktrimLeft_; 824 ktrimN=ktrimN_; 825 ksplit=ksplit_; 826 ktrimExclusive=ktrimExclusive_; 827 kfilter=(ref!=null || literal!=null) && !(ktrimRight || ktrimLeft || ktrimN || ksplit); 828 assert(findBestMatch==false || kfilter==false || kbig<=k) : "K must be less than 32 in 'findBestMatch' mode"; 829 830 assert(!useShortKmers || ktrimRight || ktrimLeft || ktrimN || ksplit) : "\nSetting mink or useShortKmers also requires setting a ktrim mode, such as 'r', 'l', or 'n'\n"; 831 832 middleMask=maskMiddle ? ~(symbolMask<<(bitsPerBase*(k/2))) : -1L; 833 834 hitCounts=(outduk==null ? null : new long[HITCOUNT_LEN+1]); 835 836 837 /* Adjust I/O settings and filenames */ 838 839 assert(FastaReadInputStream.settingsOK()); 840 841 if(in1==null){throw new RuntimeException("Error - at least one input file is required.");} 842 843 if(in1!=null && in1.contains("#") && !new File(in1).exists()){ 844 int pound=in1.lastIndexOf('#'); 845 String a=in1.substring(0, pound); 846 String b=in1.substring(pound+1); 847 in1=a+1+b; 848 in2=a+2+b; 849 } 850 if(in2!=null && (in2.contains("=") || in2.equalsIgnoreCase("null"))){in2=null;} 851 if(in2!=null){ 852 if(FASTQ.FORCE_INTERLEAVED && !silent){outstream.println("Reset INTERLEAVED to false because paired input files were specified.");} 853 FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false; 854 } 855 856 if(qfin1!=null && qfin1.contains("#") && in2!=null && !new File(qfin1).exists()){ 857 int pound=qfin1.lastIndexOf('#'); 858 String a=qfin1.substring(0, pound); 859 String b=qfin1.substring(pound+1); 860 qfin1=a+1+b; 861 qfin2=a+2+b; 862 } 863 864 if(out1!=null && out1.contains("#")){ 865 int pound=out1.lastIndexOf('#'); 866 String a=out1.substring(0, pound); 867 String b=out1.substring(pound+1); 868 out1=a+1+b; 869 out2=a+2+b; 870 } 871 872 if(qfout1!=null && qfout1.contains("#") && in2!=null && !new File(qfout1).exists()){ 873 int pound=qfout1.lastIndexOf('#'); 874 String a=qfout1.substring(0, pound); 875 String b=qfout1.substring(pound+1); 876 qfout1=a+1+b; 877 qfout2=a+2+b; 878 } 879 880 if(outb1!=null && outb1.contains("#")){ 881 int pound=outb1.lastIndexOf('#'); 882 String a=outb1.substring(0, pound); 883 String b=outb1.substring(pound+1); 884 outb1=a+1+b; 885 outb2=a+2+b; 886 } 887 888 if((out2!=null || outb2!=null) && (in1!=null && in2==null)){ 889 if(!FASTQ.FORCE_INTERLEAVED){outstream.println("Forcing interleaved input because paired output was specified for a single input file.");} 890 FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=true; 891 } 892 893 if(!setOut){ 894 if(!silent && !json){ 895 outstream.println("No output stream specified. To write to stdout, please specify 'out=stdout.fq' or similar."); 896 } 897 out1=out2=null; 898 }else if("stdout".equalsIgnoreCase(out1) || "standarddout".equalsIgnoreCase(out1)){ 899 out1="stdout.fq"; 900 out2=null; 901 } 902 903 in1=Tools.fixExtension(in1); 904 in2=Tools.fixExtension(in2); 905 qfin1=Tools.fixExtension(qfin1); 906 qfin2=Tools.fixExtension(qfin2); 907 ref=Tools.fixExtension(ref); 908 909 if(!Tools.testOutputFiles(overwrite, append, false, out1, out2, qfout1, qfout2, outb1, outb2, outsingle, outstats, outrpkm, outduk, outrqc, outrefstats, polymerStatsFile, khistIn, khistOut)){ 910 throw new RuntimeException("\nCan't write to some output files; overwrite="+overwrite+"\n"); 911 } 912 if(!Tools.testInputFiles(false, true, in1, in2, qfin1, qfin2)){ 913 throw new RuntimeException("\nCan't read some input files.\n"); 914 } 915 if(!Tools.testInputFiles(true, true, ref)){ 916 throw new RuntimeException("\nCan't read to some reference files.\n"); 917 } 918 if(!Tools.testForDuplicateFiles(true, in1, in2, qfin1, qfin2, qfout1, qfout2, 919 out1, out2, outb1, outb2, outsingle, outstats, outrpkm, outduk, outrqc, outrefstats, polymerStatsFile, khistIn, khistOut)){ 920 throw new RuntimeException("\nSome file names were specified multiple times.\n"); 921 } 922 923 assert(THREADS>0) : "THREADS must be greater than 0."; 924 925 assert(in1==null || in1.toLowerCase().startsWith("stdin") || in1.toLowerCase().startsWith("standardin") || new File(in1).exists()) : "Can't find "+in1; 926 assert(in2==null || in2.toLowerCase().startsWith("stdin") || in2.toLowerCase().startsWith("standardin") || new File(in2).exists()) : "Can't find "+in2; 927 928 ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, null, true, true); 929 ffin2=FileFormat.testInput(in2, FileFormat.FASTQ, null, true, true); 930 931 final int defaultFormat=(ffin1==null ? FileFormat.FASTQ : ffin1.format()); 932 ffout1=FileFormat.testOutput(out1, defaultFormat, null, true, overwrite, append, ordered); 933 ffout2=FileFormat.testOutput(out2, defaultFormat, null, true, overwrite, append, ordered); 934 ffoutb1=FileFormat.testOutput(outb1, defaultFormat, null, true, overwrite, append, ordered); 935 ffoutb2=FileFormat.testOutput(outb2, defaultFormat, null, true, overwrite, append, ordered); 936 ffouts=FileFormat.testOutput(outsingle, defaultFormat, null, true, overwrite, append, ordered); 937 938 parser.validateStdio(ffin1, ffout1, ffoutb1, ffouts); 939 940 makeReadStats=ReadStats.collectingStats(); 941 942 //This block just causes problems when new features are added, so it's disabled. 943 if(!((ref!=null || literal!=null) || qtrimLeft || qtrimRight || minAvgQuality>0 || minBaseQuality>0 || maxNs>=0 || trimByOverlap || 944 makeReadStats || entropyMask || entropyTrim>0 || entropyMark || entropyCutoff>=0 || filterVars || 945 forceTrimLeft>0 || forceTrimRight>0 || forceTrimRight2>0 || forceTrimModulo>0 || minBaseFrequency>0 || recalibrateQuality || 946 trimPolyA>0 || trimPolyGLeft>0 || trimPolyGRight>0 || filterPolyG>0 || trimPolyCLeft>0 || trimPolyCRight>0 || filterPolyC>0)){ 947 // outstream.println("NOTE: No reference files specified, no trimming mode, no min avg quality, no histograms - read sequences will not be changed."); 948 } 949 950 if(recalibrateQuality || true){ 951 SamLine.SET_FROM_OK=true;//TODO: Should ban other operations 952 } 953 954 if(ref!=null){ 955 for(String s0 : ref){ 956 assert(s0!=null) : "Specified a null reference."; 957 String s=s0.toLowerCase(); 958 assert(s==null || s.startsWith("stdin") || s.startsWith("standardin") || new File(s0).exists()) : "Can't find "+s0; 959 } 960 } 961 962 //Initialize tables 963 final int tableType=(useForest ? AbstractKmerTable.FOREST1D : useTable ? AbstractKmerTable.TABLE : useArray ? AbstractKmerTable.ARRAY1D : 0); 964 ScheduleMaker scheduleMaker=new ScheduleMaker(WAYS, 12, prealloc_, (prealloc_ ? preallocFraction : 0.9)); 965 int[] schedule=scheduleMaker.makeSchedule(); 966 keySets=AbstractKmerTable.preallocate(WAYS, tableType, schedule, -1L); 967 968 //Initialize entropy 969 calcEntropy=(entropyCutoff>=0 || entropyMark); 970 if(calcEntropy){ 971 assert(EntropyTracker.defaultWindowBases>0 && (entropyMark || (entropyCutoff>=0 && entropyCutoff<=1))); 972 } 973 assert(calcEntropy || (!entropyMask && !entropyMark && entropyTrim==0)) : "Entropy masking/trimming operations require the entropy flag to be set."; 974 975 if(polymerStatsFile!=null || (polymerChar1>=0 && polymerChar2>=0)){ 976 countPolymers=true; 977 } 978 979 //Initialize polymer-tracking 980 if(countPolymers){ 981 // assert(polymerChar1>=0 && AminoAcid.baseToNumberACGTN[polymerChar1]>=0); 982 // assert(polymerChar2>=0 && AminoAcid.baseToNumberACGTN[polymerChar2]>=0); 983 pTracker=new PolymerTracker(); 984 }else{ 985 pTracker=null; 986 } 987 } 988 modifyRefPath(String[] array, ArrayList<String> list)989 String[] modifyRefPath(String[] array, ArrayList<String> list){ 990 if(array==null){return array;} 991 for(String s : array){ 992 String s2=modifyRefPath(s); 993 list.add(s2); 994 } 995 return list.toArray(new String[0]); 996 } 997 modifyRefPath(String s)998 String modifyRefPath(String s){ 999 if(s==null || new File(s).exists()){ 1000 //do nothing 1001 }else{ 1002 if("phix".equalsIgnoreCase(s)){ 1003 s=Data.findPath("?phix174_ill.ref.fa.gz"); 1004 }else if("lambda".equalsIgnoreCase(s)){ 1005 s=Data.findPath("?lambda.fa.gz"); 1006 }else if("kapa".equalsIgnoreCase(s)){ 1007 s=Data.findPath("?kapatags.L40.fa"); 1008 }else if("pjet".equalsIgnoreCase(s)){ 1009 s=Data.findPath("?pJET1.2.fa"); 1010 }else if("mtst".equalsIgnoreCase(s)){ 1011 s=Data.findPath("?mtst.fa"); 1012 }else if("adapters".equalsIgnoreCase(s)){ 1013 s=Data.findPath("?adapters.fa"); 1014 }else if("truseq".equalsIgnoreCase(s)){ 1015 s=Data.findPath("?truseq.fa.gz"); 1016 }else if("nextera".equalsIgnoreCase(s)){ 1017 s=Data.findPath("?nextera.fa.gz"); 1018 }else if("artifacts".equalsIgnoreCase(s)){ 1019 s=Data.findPath("?sequencing_artifacts.fa.gz"); 1020 }else{ 1021 assert(false) : "Can't find reference file "+s; 1022 } 1023 } 1024 return s; 1025 } 1026 1027 1028 /*--------------------------------------------------------------*/ 1029 /*---------------- Outer Methods ----------------*/ 1030 /*--------------------------------------------------------------*/ 1031 1032 process()1033 public void process(){ 1034 1035 if(samref!=null){ 1036 scafMap=ScafMap.loadReference(samref, true); 1037 } 1038 1039 if(varFile!=null || vcfFile!=null || filterVars){ 1040 if(scafMap==null){scafMap=ScafMap.loadSamHeader(in1);} 1041 assert(scafMap!=null && scafMap.size()>0) : "No scaffold names were loaded."; 1042 if(varFile!=null){ 1043 outstream.println("Loading variants."); 1044 varMap=VcfLoader.loadVarFile(varFile, scafMap); 1045 }else if(vcfFile!=null){ 1046 outstream.println("Loading variants."); 1047 varMap=VcfLoader.loadVcfFile(vcfFile, scafMap, false, false); 1048 } 1049 // if(varMap!=null && varMap.size()>0){ 1050 // varKeySet=new HashSet<VarKey>(); 1051 // for(Var v : varMap.toArray(false)){ 1052 // if(v.type==Var.INS || v.type==Var.DEL){ 1053 // varKeySet.add(VarKey.toVarKey(v)); 1054 // } 1055 // } 1056 // } 1057 fixVariants=(makeReadStats && varMap!=null && varMap.size()>0 && scafMap!=null && scafMap.size()>0); 1058 } 1059 1060 if(recalibrateQuality){ 1061 if(samFile!=null){ 1062 CalcTrueQuality.main2(new String[] {"in="+samFile, "showstats=f"}); 1063 } 1064 CalcTrueQuality.initializeMatrices(); 1065 } 1066 1067 /* Check for output file collisions */ 1068 if(!Tools.testOutputFiles(overwrite, append, false, out1, out2, outb1, outb2, outstats, outrpkm, outduk, outrqc, outrefstats)){ 1069 throw new RuntimeException("One or more output files were duplicate or could not be written to. Check the names or set the 'overwrite=true' flag."); 1070 } 1071 1072 /* Start overall timer */ 1073 Timer t=new Timer(); 1074 1075 // boolean dq0=FASTQ.DETECT_QUALITY; 1076 // boolean ti0=FASTQ.TEST_INTERLEAVED; 1077 // int rbl0=Shared.bufferLen();; 1078 // FASTQ.DETECT_QUALITY=false; 1079 // FASTQ.TEST_INTERLEAVED=false; 1080 // Shared.setBufferLen(16; 1081 1082 process2(t.time1); 1083 1084 // FASTQ.DETECT_QUALITY=dq0; 1085 // FASTQ.TEST_INTERLEAVED=ti0; 1086 // Shared.setBufferLen(rbl0; 1087 1088 /* Stop timer and calculate speed statistics */ 1089 t.stop(); 1090 lastReadsOut=readsOut; 1091 1092 if(showSpeed && !json){ 1093 outstream.println(); 1094 outstream.println(Tools.timeReadsBasesProcessed(t, readsIn, basesIn, 8)); 1095 } 1096 1097 if(outstream!=System.err && outstream!=System.out){outstream.close();} 1098 1099 /* Throw an exception if errors were detected */ 1100 if(errorState){ 1101 throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt."); 1102 } 1103 } 1104 1105 process2(long startTime)1106 public void process2(long startTime){ 1107 1108 /* Start phase timer */ 1109 Timer t=new Timer(); 1110 1111 if(DISPLAY_PROGRESS && !json){ 1112 outstream.println("Initial:"); 1113 Shared.printMemory(); 1114 outstream.println(); 1115 } 1116 1117 /* Fill tables with reference kmers */ 1118 if((ref!=null && ref.length>0) || (literal!=null && literal.length>0)){ 1119 final boolean oldTI=FASTQ.TEST_INTERLEAVED; //TODO: This needs to be changed to a non-static field, or somehow 'read mode' and 'ref mode' need to be distinguished. 1120 final boolean oldFI=FASTQ.FORCE_INTERLEAVED; 1121 final boolean oldPC=FASTQ.PARSE_CUSTOM; 1122 final boolean oldSplit=FastaReadInputStream.SPLIT_READS; 1123 final int oldML=FastaReadInputStream.MIN_READ_LEN; 1124 1125 FASTQ.TEST_INTERLEAVED=false; 1126 FASTQ.FORCE_INTERLEAVED=false; 1127 FASTQ.PARSE_CUSTOM=false; 1128 FastaReadInputStream.SPLIT_READS=false; 1129 FastaReadInputStream.MIN_READ_LEN=1; 1130 1131 1132 storedKmers=spawnLoadThreads(); 1133 if(storedKmers<1 && altRefNames.size()>0){ 1134 outstream.println("Ref had no kmers; using alt ref."); 1135 ref=altref; 1136 refNames=altRefNames; 1137 refScafCounts=new int[refNames.size()]; 1138 scaffoldNames.clear(); 1139 scaffoldLengths.clear(); 1140 storedKmers=spawnLoadThreads(); 1141 } 1142 1143 FASTQ.TEST_INTERLEAVED=oldTI; 1144 FASTQ.FORCE_INTERLEAVED=oldFI; 1145 FASTQ.PARSE_CUSTOM=oldPC; 1146 FastaReadInputStream.SPLIT_READS=oldSplit; 1147 FastaReadInputStream.MIN_READ_LEN=oldML; 1148 1149 if(useRefNames){toRefNames();} 1150 t.stop(); 1151 } 1152 1153 { 1154 long ram=freeMemory(); 1155 ALLOW_LOCAL_ARRAYS=(scaffoldNames!=null && Tools.max(THREADS, 1)*3*8*scaffoldNames.size()<ram*5); 1156 } 1157 1158 /* Dump kmers to text */ 1159 if(dump!=null){ 1160 ByteStreamWriter bsw=new ByteStreamWriter(dump, overwrite, false, true); 1161 bsw.start(); 1162 for(AbstractKmerTable set : keySets){ 1163 set.dumpKmersAsBytes(bsw, k, 0, Integer.MAX_VALUE, null); 1164 } 1165 bsw.poisonAndWait(); 1166 } 1167 1168 if(storedKmers<1 && (ktrimRight || ktrimLeft || ktrimN || ksplit)){ 1169 outstream.println("****** WARNING! A KMER OPERATION WAS CHOSEN BUT NO KMERS WERE LOADED. ******"); 1170 if(ref==null && literal==null){ 1171 outstream.println("****** YOU NEED TO SPECIFY A REFERENCE FILE OR LITERAL SEQUENCE. ******\n"); 1172 }else{ 1173 outstream.println("****** PLEASE ENSURE K IS LESS THAN OR EQUAL TO REF SEQUENCE LENGTHS. ******\n"); 1174 } 1175 if(ktrimRight && trimByOverlap){ 1176 ktrimRight=false; 1177 }else{ 1178 assert(false) : "You can bypass this assertion with the -da flag."; 1179 } 1180 } 1181 1182 final boolean vic=Read.VALIDATE_IN_CONSTRUCTOR; 1183 Read.VALIDATE_IN_CONSTRUCTOR=THREADS<4; 1184 1185 /* Do kmer matching of input reads */ 1186 spawnProcessThreads(t); 1187 1188 Read.VALIDATE_IN_CONSTRUCTOR=vic; 1189 1190 /* Write legacy duk statistics (which requires tables) */ 1191 writeDuk(System.nanoTime()-startTime); 1192 1193 /* Unload kmers to save memory */ 1194 if(RELEASE_TABLES){unloadKmers();} 1195 1196 /* Write statistics to files */ 1197 writeStats(); 1198 writeRPKM(); 1199 writeRefStats(); 1200 writeRqc(); 1201 if(pTracker!=null && polymerStatsFile!=null){ 1202 ReadWrite.writeString(pTracker.toHistogramCumulative(), polymerStatsFile); 1203 } 1204 1205 /* Unload sequence data to save memory */ 1206 if(RELEASE_TABLES){unloadScaffolds();} 1207 1208 if(silent){return;} 1209 if(json){ 1210 outstream.println(toJson(startTime)); 1211 return; 1212 } 1213 1214 outstream.println("\nInput: \t"+readsIn+" reads \t\t"+basesIn+" bases."); 1215 1216 if((ref!=null || literal!=null) && !(ktrimLeft || ktrimRight || ktrimN)){ 1217 outstream.println("Contaminants: \t"+readsKFiltered+" reads ("+toPercent(readsKFiltered, readsIn)+") \t"+ 1218 basesKFiltered+" bases ("+toPercent(basesKFiltered, basesIn)+")"); 1219 outstream.flush(); 1220 } 1221 if(qtrimLeft || qtrimRight){ 1222 outstream.println("QTrimmed: \t"+readsQTrimmed+" reads ("+toPercent(readsQTrimmed, readsIn)+") \t"+ 1223 basesQTrimmed+" bases ("+toPercent(basesQTrimmed, basesIn)+")"); 1224 } 1225 if(trimPolyA>0 || trimPolyGLeft>0 || trimPolyGRight>0 || filterPolyG>0 || trimPolyCLeft>0 || trimPolyCRight>0 || filterPolyC>0){ 1226 outstream.println("Polymer-trimmed: \t"+readsPolyTrimmed+" reads ("+toPercent(readsPolyTrimmed, readsIn)+") \t"+ 1227 basesPolyTrimmed+" bases ("+toPercent(basesPolyTrimmed, basesIn)+")"); 1228 } 1229 if(forceTrimLeft>0 || forceTrimRight>0 || forceTrimRight2>0 || forceTrimModulo>0){ 1230 outstream.println("FTrimmed: \t"+readsFTrimmed+" reads ("+toPercent(readsFTrimmed, readsIn)+") \t"+ 1231 basesFTrimmed+" bases ("+toPercent(basesFTrimmed, basesIn)+")"); 1232 } 1233 if(ktrimLeft || ktrimRight || ktrimN){ 1234 String x=(ktrimN ? "KMasked: " : "KTrimmed:"); 1235 outstream.println(x+" \t"+readsKTrimmed+" reads ("+toPercent(readsKTrimmed, readsIn)+") \t"+ 1236 basesKTrimmed+" bases ("+toPercent(basesKTrimmed, basesIn)+")"); 1237 } 1238 if(swift){ 1239 outstream.println("Trimmed by Swift: \t"+readsTrimmedBySwift+" reads ("+toPercent(readsTrimmedBySwift, readsIn)+") \t"+ 1240 basesTrimmedBySwift+" bases ("+toPercent(basesTrimmedBySwift, basesIn)+")"); 1241 } 1242 if(trimByOverlap){ 1243 outstream.println("Trimmed by overlap: \t"+readsTrimmedByOverlap+" reads ("+toPercent(readsTrimmedByOverlap, readsIn)+") \t"+ 1244 basesTrimmedByOverlap+" bases ("+toPercent(basesTrimmedByOverlap, basesIn)+")"); 1245 } 1246 if(filterGC){ 1247 outstream.println("Filtered by GC: \t"+badGcReads+" reads ("+toPercent(badGcReads, readsIn)+") \t"+ 1248 badGcBases+" bases ("+toPercent(badGcBases, basesIn)+")"); 1249 } 1250 if(locationFilter || chastityFilter || removeBadBarcodes){ 1251 outstream.println("Filtered by header: \t"+badHeaderReads+" reads ("+toPercent(badHeaderReads, readsIn)+") \t"+ 1252 badHeaderBases+" bases ("+toPercent(badHeaderBases, basesIn)+")"); 1253 } 1254 if(minAvgQuality>0 || minBaseQuality>0 || maxNs>=0 || minBaseFrequency>0 || chastityFilter || removeBadBarcodes){ 1255 outstream.println("Low quality discards: \t"+readsQFiltered+" reads ("+toPercent(readsQFiltered, readsIn)+") \t"+ 1256 basesQFiltered+" bases ("+toPercent(basesQFiltered, basesIn)+")"); 1257 } 1258 if(polymerChar1>=0 && polymerChar2>=0){ 1259 outstream.println("Polymer Counts: \t"+padRight(pTracker.getCountCumulative(polymerChar1, polymerLength)+" "+Character.toString((char)polymerChar1), 18)+"\t"+ 1260 padRight(pTracker.getCountCumulative(polymerChar2, polymerLength)+" "+Character.toString((char)polymerChar2), 18)+"\t"+ 1261 "("+String.format(Locale.ROOT, "%.4f", pTracker.calcRatioCumulative(polymerChar1, polymerChar2, polymerLength))+" ratio)"); 1262 } 1263 if(calcEntropy){ 1264 String prefix; 1265 if(entropyTrim>0){ 1266 prefix=("Entropy-trimmed: \t"); 1267 }else if(entropyMask){ 1268 prefix=("Entropy-masked: \t"); 1269 }else{ 1270 prefix=("Low entropy discards: \t"); 1271 } 1272 outstream.println(prefix+readsEFiltered+" reads ("+toPercent(readsEFiltered, readsIn)+") \t"+ 1273 basesEFiltered+" bases ("+toPercent(basesEFiltered, basesIn)+")"); 1274 } 1275 1276 final long readsRemoved=readsIn-readsOut; 1277 final long basesRemoved=basesIn-basesOut; 1278 1279 outstream.println("Total Removed: \t"+readsRemoved+" reads ("+toPercent(readsRemoved, readsIn)+") \t"+ 1280 basesRemoved+" bases ("+toPercent(basesRemoved, basesIn)+")"); 1281 1282 outstream.println("Result: \t"+readsOut+" reads ("+toPercent(readsOut, readsIn)+") \t"+ 1283 basesOut+" bases ("+toPercent(basesOut, basesIn)+")"); 1284 1285 if(loglogIn!=null){ 1286 outstream.println("Unique "+loglogIn.k+"-mers: \t"+loglogIn.cardinality()); 1287 if(khistIn!=null){ 1288 loglogIn.printKhist(khistIn, overwrite, append, true, 2); 1289 } 1290 } 1291 if(loglogOut!=null){ 1292 outstream.println("Unique "+loglogOut.k+"-mers out: \t"+loglogOut.cardinality()); 1293 if(khistOut!=null){ 1294 loglogOut.printKhist(khistOut, overwrite, append, true, 2); 1295 } 1296 } 1297 } 1298 toJson(long startTime)1299 private String toJson(long startTime){ 1300 1301 jsonStats.add("k", k); 1302 jsonStats.add("mode", ktrimLeft ? "ktrimLeft" : ktrimRight ? "ktrimRight" : ktrimN ? "ktrimN" : "kFilter"); 1303 jsonStats.add("readsIn", readsIn); 1304 jsonStats.add("basesIn", basesIn); 1305 1306 if((ref!=null || literal!=null) && !(ktrimLeft || ktrimRight || ktrimN)){ 1307 jsonStats.add("readsKFiltered", readsKFiltered); 1308 jsonStats.add("basesKFiltered", basesKFiltered); 1309 } 1310 if(qtrimLeft || qtrimRight){ 1311 jsonStats.add("readsQTrimmed", readsQTrimmed); 1312 jsonStats.add("basesQTrimmed", basesQTrimmed); 1313 } 1314 if(trimPolyA>0 || trimPolyGLeft>0 || trimPolyGRight>0 || filterPolyG>0 || trimPolyCLeft>0 || trimPolyCRight>0 || filterPolyC>0){ 1315 jsonStats.add("readsPolyTrimmed", readsPolyTrimmed); 1316 jsonStats.add("basesPolyTrimmed", basesPolyTrimmed); 1317 } 1318 if(forceTrimLeft>0 || forceTrimRight>0 || forceTrimRight2>0 || forceTrimModulo>0){ 1319 jsonStats.add("readsFTrimmed", readsFTrimmed); 1320 jsonStats.add("basesFTrimmed", basesFTrimmed); 1321 } 1322 if(ktrimLeft || ktrimRight || ktrimN){ 1323 String x=(ktrimN ? "KMasked: " : "KTrimmed:"); 1324 jsonStats.add("reads"+x, readsKTrimmed); 1325 jsonStats.add("bases+x", basesKTrimmed); 1326 } 1327 if(swift){ 1328 jsonStats.add("readsTrimmedBySwift", readsTrimmedBySwift); 1329 jsonStats.add("basesTrimmedBySwift", basesTrimmedBySwift); 1330 } 1331 if(trimByOverlap){ 1332 jsonStats.add("readsTrimmedByOverlap", readsTrimmedByOverlap); 1333 jsonStats.add("basesTrimmedByOverlap", basesTrimmedByOverlap); 1334 } 1335 if(filterGC){ 1336 jsonStats.add("badGcReads", badGcReads); 1337 jsonStats.add("badGcBases", badGcBases); 1338 } 1339 if(locationFilter || chastityFilter || removeBadBarcodes){ 1340 jsonStats.add("badHeaderReads", badHeaderReads); 1341 jsonStats.add("badHeaderBases", badHeaderBases); 1342 } 1343 if(minAvgQuality>0 || minBaseQuality>0 || maxNs>=0 || minBaseFrequency>0 || chastityFilter || removeBadBarcodes){ 1344 jsonStats.add("readsQFiltered", readsQFiltered); 1345 jsonStats.add("basesQFiltered", basesQFiltered); 1346 } 1347 if(polymerChar1>=0 && polymerChar2>=0){ 1348 jsonStats.add("poly"+Character.toString((char)polymerChar1), pTracker.getCountCumulative(polymerChar1, polymerLength)); 1349 jsonStats.add("poly"+Character.toString((char)polymerChar2), pTracker.getCountCumulative(polymerChar2, polymerLength)); 1350 jsonStats.add("polyRatio", pTracker.calcRatioCumulative(polymerChar1, polymerChar2, polymerLength)); 1351 } 1352 if(calcEntropy){ 1353 String suffix; 1354 if(entropyTrim>0){ 1355 suffix=("EntropyTrimmed"); 1356 }else if(entropyMask){ 1357 suffix=("EntropyMasked"); 1358 }else{ 1359 suffix=("EntropyFiltered"); 1360 } 1361 jsonStats.add("reads"+suffix, readsEFiltered); 1362 jsonStats.add("bases"+suffix, basesEFiltered); 1363 } 1364 1365 final long readsRemoved=readsIn-readsOut; 1366 final long basesRemoved=basesIn-basesOut; 1367 1368 jsonStats.add("readsRemoved", readsRemoved); 1369 jsonStats.add("basesRemoved", basesRemoved); 1370 jsonStats.add("readsOut", readsOut); 1371 jsonStats.add("basesOut", basesOut); 1372 1373 if(loglogIn!=null){ 1374 jsonStats.add("uniqueKmersIn", loglogIn.cardinality()); 1375 } 1376 if(loglogOut!=null){ 1377 jsonStats.add("uniqueKmersOut", loglogOut.cardinality()); 1378 } 1379 jsonStats.add("time", (System.nanoTime()-startTime)/1000000000.0); 1380 return jsonStats.toString(); 1381 } 1382 toPercent(long numerator, long denominator)1383 private static String toPercent(long numerator, long denominator){ 1384 if(denominator<1){return "0.00%";} 1385 return String.format(Locale.ROOT, "%.2f%%",numerator*100.0/denominator); 1386 } 1387 padRight(String s, int minLen)1388 private static String padRight(String s, int minLen){ 1389 while(s.length()<minLen){s=s+" ";} 1390 return s; 1391 } 1392 1393 /** 1394 * Clear stored kmers. 1395 */ unloadKmers()1396 public void unloadKmers(){ 1397 if(keySets!=null){ 1398 for(int i=0; i<keySets.length; i++){keySets[i]=null;} 1399 } 1400 } 1401 1402 /** 1403 * Clear stored sequence data. 1404 */ unloadScaffolds()1405 public void unloadScaffolds(){ 1406 if(scaffoldNames!=null && !scaffoldNames.isEmpty()){ 1407 scaffoldNames.clear(); 1408 scaffoldNames.trimToSize(); 1409 } 1410 scaffoldReadCounts=null; 1411 scaffoldBaseCounts=null; 1412 hitCounts=null; 1413 scaffoldLengths=null; 1414 } 1415 1416 /** 1417 * Write statistics about how many reads matched each reference scaffold. 1418 */ writeStats()1419 private void writeStats(){ 1420 if(outstats==null){return;} 1421 final TextStreamWriter tsw=new TextStreamWriter(outstats, overwrite, false, false); 1422 tsw.start(); 1423 1424 long rsum=0, bsum=0; 1425 1426 /* Create StringCount list of scaffold names and hitcounts */ 1427 ArrayList<StringCount> list=new ArrayList<StringCount>(); 1428 for(int i=1; i<scaffoldNames.size(); i++){ 1429 final long num1=scaffoldReadCounts.get(i), num2=scaffoldBaseCounts.get(i); 1430 if(num1>0 || !printNonZeroOnly){ 1431 rsum+=num1; 1432 bsum+=num2; 1433 final String s=scaffoldNames.get(i); 1434 final int len=scaffoldLengths.get(i); 1435 final StringCount sn=new StringCount(s, len, num1, num2); 1436 list.add(sn); 1437 } 1438 } 1439 Shared.sort(list); 1440 final double rmult=100.0/(readsIn>0 ? readsIn : 1); 1441 final double bmult=100.0/(basesIn>0 ? basesIn : 1); 1442 1443 tsw.print("#File\t"+in1+(in2==null ? "" : "\t"+in2)+"\n"); 1444 1445 if(STATS_COLUMNS==3){ 1446 tsw.print(String.format(Locale.ROOT, "#Total\t%d\n",readsIn)); 1447 tsw.print(String.format(Locale.ROOT, "#Matched\t%d\t%.5f%%\n",rsum,rmult*rsum)); 1448 tsw.print("#Name\tReads\tReadsPct\n"); 1449 for(int i=0; i<list.size(); i++){ 1450 StringCount sn=list.get(i); 1451 tsw.print(String.format(Locale.ROOT, "%s\t%d\t%.5f%%\n",sn.name,sn.reads,(sn.reads*rmult))); 1452 } 1453 }else{ 1454 tsw.print(String.format(Locale.ROOT, "#Total\t%d\t%d\n",readsIn,basesIn)); 1455 tsw.print(String.format(Locale.ROOT, "#Matched\t%d\t%.5f%%\n",rsum,rmult*rsum,bsum,bsum*bmult)); 1456 tsw.print("#Name\tReads\tReadsPct\tBases\tBasesPct\n"); 1457 for(int i=0; i<list.size(); i++){ 1458 StringCount sn=list.get(i); 1459 tsw.print(String.format(Locale.ROOT, "%s\t%d\t%.5f%%\t%d\t%.5f%%\n",sn.name,sn.reads,(sn.reads*rmult),sn.bases,(sn.bases*bmult))); 1460 } 1461 } 1462 1463 tsw.poisonAndWait(); 1464 } 1465 1466 /** 1467 * Write RPKM statistics. 1468 */ writeRPKM()1469 private void writeRPKM(){ 1470 if(outrpkm==null){return;} 1471 final TextStreamWriter tsw=new TextStreamWriter(outrpkm, overwrite, false, false); 1472 tsw.start(); 1473 1474 /* Count mapped reads */ 1475 long mapped=0; 1476 for(int i=0; i<scaffoldReadCounts.length(); i++){ 1477 mapped+=scaffoldReadCounts.get(i); 1478 } 1479 1480 /* Print header */ 1481 tsw.print("#File\t"+in1+(in2==null ? "" : "\t"+in2)+"\n"); 1482 tsw.print(String.format(Locale.ROOT, "#Reads\t%d\n",readsIn)); 1483 tsw.print(String.format(Locale.ROOT, "#Mapped\t%d\n",mapped)); 1484 tsw.print(String.format(Locale.ROOT, "#RefSequences\t%d\n",Tools.max(0, scaffoldNames.size()-1))); 1485 tsw.print("#Name\tLength\tBases\tCoverage\tReads\tRPKM\n"); 1486 1487 final float mult=1000000000f/Tools.max(1, mapped); 1488 1489 /* Print data */ 1490 for(int i=1; i<scaffoldNames.size(); i++){ 1491 final long reads=scaffoldReadCounts.get(i); 1492 final long bases=scaffoldBaseCounts.get(i); 1493 final String s=scaffoldNames.get(i); 1494 final int len=scaffoldLengths.get(i); 1495 final double invlen=1.0/Tools.max(1, len); 1496 final double mult2=mult*invlen; 1497 if(reads>0 || !printNonZeroOnly){ 1498 tsw.print(String.format(Locale.ROOT, "%s\t%d\t%d\t%.4f\t%d\t%.4f\n",s,len,bases,bases*invlen,reads,reads*mult2)); 1499 } 1500 } 1501 tsw.poisonAndWait(); 1502 } 1503 1504 /** 1505 * Write statistics on a per-reference basis. 1506 */ writeRefStats()1507 private void writeRefStats(){ 1508 if(outrefstats==null){return;} 1509 final TextStreamWriter tsw=new TextStreamWriter(outrefstats, overwrite, false, false); 1510 tsw.start(); 1511 1512 /* Count mapped reads */ 1513 long mapped=0; 1514 for(int i=0; i<scaffoldReadCounts.length(); i++){ 1515 mapped+=scaffoldReadCounts.get(i); 1516 } 1517 1518 final int numRefs=refNames.size(); 1519 long[] refReadCounts=new long[numRefs]; 1520 long[] refBaseCounts=new long[numRefs]; 1521 long[] refLengths=new long[numRefs]; 1522 1523 for(int r=0, s=1; r<numRefs; r++){ 1524 final int lim=s+refScafCounts[r]; 1525 while(s<lim){ 1526 refReadCounts[r]+=scaffoldReadCounts.get(s); 1527 refBaseCounts[r]+=scaffoldBaseCounts.get(s); 1528 refLengths[r]+=scaffoldLengths.get(s); 1529 s++; 1530 } 1531 } 1532 1533 /* Print header */ 1534 tsw.print("#File\t"+in1+(in2==null ? "" : "\t"+in2)+"\n"); 1535 tsw.print(String.format(Locale.ROOT, "#Reads\t%d\n",readsIn)); 1536 tsw.print(String.format(Locale.ROOT, "#Mapped\t%d\n",mapped)); 1537 tsw.print(String.format(Locale.ROOT, "#References\t%d\n",Tools.max(0, refNames.size()))); 1538 tsw.print("#Name\tLength\tScaffolds\tBases\tCoverage\tReads\tRPKM\n"); 1539 1540 final float mult=1000000000f/Tools.max(1, mapped); 1541 1542 /* Print data */ 1543 for(int i=0; i<refNames.size(); i++){ 1544 final long reads=refReadCounts[i]; 1545 final long bases=refBaseCounts[i]; 1546 final long len=refLengths[i]; 1547 final int scafs=refScafCounts[i]; 1548 final String name=ReadWrite.stripToCore(refNames.get(i)); 1549 final double invlen=1.0/Tools.max(1, len); 1550 final double mult2=mult*invlen; 1551 if(reads>0 || !printNonZeroOnly){ 1552 tsw.print(String.format(Locale.ROOT, "%s\t%d\t%d\t%d\t%.4f\t%d\t%.4f\n",name,len,scafs,bases,bases*invlen,reads,reads*mult2)); 1553 } 1554 } 1555 tsw.poisonAndWait(); 1556 } 1557 1558 /** 1559 * Write processing statistics in DUK's format. 1560 * @param time Elapsed time, nanoseconds 1561 */ writeDuk(long time)1562 private void writeDuk(long time){ 1563 if(outduk==null){return;} 1564 final TextStreamWriter tsw=new TextStreamWriter(outduk, overwrite, false, false); 1565 tsw.start(); 1566 tsw.println(dukString(time)); 1567 tsw.poisonAndWait(); 1568 } 1569 1570 /** 1571 * Write RQCFilter stats. 1572 * @param time Elapsed time, nanoseconds 1573 */ writeRqc()1574 private void writeRqc(){ 1575 if(outrqc==null){return;} 1576 addToRqcMap(); 1577 if(outrqc.endsWith("hashmap")){return;} 1578 final TextStreamWriter tsw=new TextStreamWriter(outrqc, overwrite, false, false); 1579 tsw.start(); 1580 tsw.println(rqcString()); 1581 tsw.poisonAndWait(); 1582 } 1583 rqcString()1584 public static String rqcString(){ 1585 if(RQC_MAP==null){return null;} 1586 StringBuilder sb=new StringBuilder(); 1587 1588 String[] keys=new String[] {"inputReads", "inputBases", "qtrimmedReads", "qtrimmedBases", "qfilteredReads", "qfilteredBases", 1589 "ktrimmedReads", "ktrimmedBases", "kfilteredReads", "kfilteredBases", "outputReads", "outputBases"}; 1590 1591 for(String key : keys){ 1592 Object value=RQC_MAP.get(key); 1593 if(value!=null){ 1594 sb.append(key+"="+value+"\n"); 1595 } 1596 } 1597 1598 return sb.toString(); 1599 } 1600 addToRqcMap()1601 private void addToRqcMap(){ 1602 putRqc("inputReads", readsIn, false, false); 1603 putRqc("inputBases", basesIn, false, false); 1604 if(qtrimLeft || qtrimRight){ 1605 putRqc("qtrimmedReads", readsQTrimmed, false, true); 1606 putRqc("qtrimmedBases", basesQTrimmed, false, true); 1607 } 1608 putRqc("qfilteredReads", readsQFiltered, false, true); 1609 putRqc("qfilteredBases", basesQFiltered, false, true); 1610 1611 if(ktrimLeft || ktrimRight || ktrimN){ 1612 putRqc("ktrimmedReads", readsKTrimmed, true, true); 1613 putRqc("ktrimmedBases", basesKTrimmed, true, true); 1614 }else{ 1615 putRqc("kfilteredReads", readsKFiltered, false, true); 1616 putRqc("kfilteredBases", basesKFiltered, false, true); 1617 } 1618 putRqc("outputReads", readsOut, true, false); 1619 putRqc("outputBases", basesOut, true, false); 1620 } 1621 putRqc(String key, Long value, boolean evict, boolean add)1622 public static void putRqc(String key, Long value, boolean evict, boolean add){ 1623 if(RQC_MAP==null){RQC_MAP=new HashMap<String,Long>();} 1624 Long old=RQC_MAP.get(key); 1625 if(evict || old==null){RQC_MAP.put(key, value);} 1626 else if(add){RQC_MAP.put(key, value+old);} 1627 } 1628 1629 /** 1630 * Helper method; formats statistics to be duk-compatible 1631 * @param time Elapsed time, nanoseconds 1632 * @return duk output string 1633 */ dukString(long time)1634 private String dukString(long time){ 1635 StringBuilder sb=new StringBuilder(); 1636 sb.append("##INPUT PARAMETERS##\n"); 1637 sb.append("#Reference file: "+(ref==null || ref.length<1 ? null : ref.length==1 ? ref[0] : Arrays.toString(ref))+"\n"); 1638 sb.append("#Query file: "+in1+(in2==null ? "" : ","+in2)+"\n"); 1639 sb.append("#Not matched reads file: "+out1+(out2==null ? "" : ","+out2)+"\n"); 1640 sb.append("#Matched reads file: "+outb1+(outb2==null ? "" : ","+outb2)+"\n"); 1641 sb.append("#Output file (duk): "+outduk+"\n"); 1642 sb.append("#Output file (stats): "+outstats+"\n"); 1643 sb.append("#Mer size: "+k+"\n"); 1644 long size=0; 1645 for(AbstractKmerTable x : keySets){size+=x.size();} 1646 sb.append("#Avg step size: "+String.format(Locale.ROOT, "%.1f", refKmers/(double)(Tools.max(1, size)))+"\n"); 1647 sb.append("#Cut off: "+maxBadKmers0+"\n"); 1648 sb.append("#Mask middle: "+maskMiddle+"\n"); 1649 sb.append("#Quality trim: "+((qtrimLeft || qtrimRight) ? trimq : "false")+"\n"); 1650 sb.append("\n"); 1651 1652 sb.append("##REFERENCE STAT##\n"); 1653 sb.append("#Total Reads: "+refReads+"\n"); 1654 sb.append("#Total Bases: "+refBases+"\n"); 1655 sb.append("#Total kmers: "+refKmers+"\n"); 1656 sb.append("#Total stored kmers: "+size+"\n"); 1657 sb.append("\n"); 1658 1659 sb.append("## ELAPSED TIME##\n"); 1660 sb.append("# Time: "+String.format(Locale.ROOT, "%.2f", time/1000000000.0)+" seconds\n"); 1661 sb.append("\n"); 1662 1663 sb.append("##QUERY FILE STAT##\n"); 1664 sb.append("# Total number of reads: "+readsIn+"\n"); 1665 sb.append("# Total number of matched reads: "+readsKFiltered+"\n"); 1666 sb.append("# Match ratio: "+String.format(Locale.ROOT, "%.6f", readsKFiltered*1.0/readsIn)+"\n"); 1667 sb.append("\n"); 1668 1669 sb.append("##P-VALUE##\n"); 1670 sb.append("#Avg number of Kmer for each read: "+((basesIn/(Tools.max(readsIn, 1)))-k)+"\n"); 1671 // sb.append("# P value for the given threshold 1 is 4.05231e-14\n"); //duk prints a P value; not sure what it means 1672 sb.append("\n"); 1673 1674 sb.append("## Histogram of kmer occurance for reads with at least one occurance ##\n"); 1675 sb.append("#NumOcc\tNumReads\tPercentage\n"); 1676 1677 long sum=Tools.sum(hitCounts); 1678 double mult=100.0/(sum<1 ? 1 : sum); 1679 for(int i=0; i<hitCounts.length; i++){ 1680 long x=hitCounts[i]; 1681 if(x>0){ 1682 sb.append(i).append('\t').append(x).append('\t').append(String.format(Locale.ROOT, "%.4f",(x*mult))).append('\n'); 1683 } 1684 } 1685 1686 return sb.toString(); 1687 } 1688 1689 /** 1690 * Fills the scaffold names array with reference names. 1691 */ toRefNames()1692 private void toRefNames(){ 1693 final int numRefs=refNames.size(); 1694 for(int r=0, s=1; r<numRefs; r++){ 1695 final int scafs=refScafCounts[r]; 1696 final int lim=s+scafs; 1697 final String name=ReadWrite.stripToCore(refNames.get(r)); 1698 // outstream.println("r="+r+", s="+s+", scafs="+scafs+", lim="+lim+", name="+name); 1699 while(s<lim){ 1700 // outstream.println(r+", "+s+". Setting "+scaffoldNames.get(s)+" -> "+name); 1701 scaffoldNames.set(s, name); 1702 s++; 1703 } 1704 } 1705 } 1706 getPolymerRatio()1707 public double getPolymerRatio(){ 1708 return pTracker.calcRatioCumulative(polymerChar1, polymerChar2, polymerLength); 1709 } 1710 1711 1712 /*--------------------------------------------------------------*/ 1713 /*---------------- Inner Methods ----------------*/ 1714 /*--------------------------------------------------------------*/ 1715 1716 1717 /** 1718 * Fills tables with kmers from references, using multiple LoadThread. 1719 * @return Number of kmers stored. 1720 */ spawnLoadThreads()1721 private long spawnLoadThreads(){ 1722 Timer t=new Timer(); 1723 if((ref==null || ref.length<1) && (literal==null || literal.length<1)){return 0;} 1724 long added=0; 1725 1726 /* Create load threads */ 1727 LoadThread[] loaders=new LoadThread[WAYS]; 1728 for(int i=0; i<loaders.length; i++){ 1729 loaders[i]=new LoadThread(i); 1730 loaders[i].start(); 1731 } 1732 1733 /* For each reference file... */ 1734 int refNum=0; 1735 if(ref!=null){ 1736 for(String refname : ref){ 1737 1738 /* Start an input stream */ 1739 FileFormat ff=FileFormat.testInput(refname, FileFormat.FASTA, null, false, true); 1740 ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(-1L, false, ff, null, null, null, Shared.USE_MPI, true); 1741 cris.start(); //4567 1742 ListNum<Read> ln=cris.nextList(); 1743 ArrayList<Read> reads=(ln!=null ? ln.list : null); 1744 1745 /* Iterate through read lists from the input stream */ 1746 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning 1747 { 1748 /* Assign a unique ID number to each scaffold */ 1749 ArrayList<Read> reads2=new ArrayList<Read>(reads); 1750 for(Read r1 : reads2){ 1751 final Read r2=r1.mate; 1752 final Integer id=scaffoldNames.size(); 1753 refScafCounts[refNum]++; 1754 scaffoldNames.add(r1.id==null ? id.toString() : r1.id); 1755 int len=r1.length(); 1756 r1.obj=id; 1757 if(r2!=null){ 1758 r2.obj=id; 1759 len+=r2.length(); 1760 } 1761 scaffoldLengths.add(len); 1762 } 1763 1764 if(REPLICATE_AMBIGUOUS){ 1765 reads2=Tools.replicateAmbiguous(reads2, Tools.min(k, mink)); 1766 } 1767 1768 /* Send a pointer to the read list to each LoadThread */ 1769 for(LoadThread lt : loaders){ 1770 boolean b=true; 1771 while(b){ 1772 try { 1773 lt.queue.put(reads2); 1774 b=false; 1775 } catch (InterruptedException e) { 1776 //TODO: This will hang due to still-running threads. 1777 throw new RuntimeException(e); 1778 } 1779 } 1780 } 1781 } 1782 1783 /* Dispose of the old list and fetch a new one */ 1784 cris.returnList(ln); 1785 ln=cris.nextList(); 1786 reads=(ln!=null ? ln.list : null); 1787 } 1788 /* Cleanup */ 1789 cris.returnList(ln); 1790 errorState|=ReadWrite.closeStream(cris); 1791 refNum++; 1792 } 1793 } 1794 1795 /* If there are literal sequences to use as references */ 1796 if(literal!=null){ 1797 ArrayList<Read> list=new ArrayList<Read>(literal.length); 1798 if(verbose){outstream.println("Adding literals "+Arrays.toString(literal));} 1799 1800 /* Assign a unique ID number to each literal sequence */ 1801 for(int i=0; i<literal.length; i++){ 1802 final Integer id=scaffoldNames.size(); 1803 final Read r=new Read(literal[i].getBytes(), null, id); 1804 refScafCounts[refNum]++; 1805 scaffoldNames.add(id.toString()); 1806 scaffoldLengths.add(r.length()); 1807 r.obj=id; 1808 list.add(r); 1809 } 1810 1811 if(REPLICATE_AMBIGUOUS){ 1812 list=Tools.replicateAmbiguous(list, Tools.min(k, mink)); 1813 } 1814 1815 /* Send a pointer to the read list to each LoadThread */ 1816 for(LoadThread lt : loaders){ 1817 boolean b=true; 1818 while(b){ 1819 try { 1820 lt.queue.put(list); 1821 b=false; 1822 } catch (InterruptedException e) { 1823 //TODO: This will hang due to still-running threads. 1824 throw new RuntimeException(e); 1825 } 1826 } 1827 } 1828 } 1829 1830 /* Signal loaders to terminate */ 1831 for(LoadThread lt : loaders){ 1832 boolean b=true; 1833 while(b){ 1834 try { 1835 lt.queue.put(POISON); 1836 b=false; 1837 } catch (InterruptedException e) { 1838 //TODO: This will hang due to still-running threads. 1839 throw new RuntimeException(e); 1840 } 1841 } 1842 } 1843 1844 /* Wait for loaders to die, and gather statistics */ 1845 boolean success=true; 1846 for(LoadThread lt : loaders){ 1847 while(lt.getState()!=Thread.State.TERMINATED){ 1848 try { 1849 lt.join(); 1850 } catch (InterruptedException e) { 1851 // TODO Auto-generated catch block 1852 e.printStackTrace(); 1853 } 1854 } 1855 added+=lt.addedT; 1856 refKmers+=lt.refKmersT; 1857 refBases+=lt.refBasesT; 1858 refReads+=lt.refReadsT; 1859 // modsum+=lt.modsumT; 1860 success&=lt.success; 1861 } 1862 if(!success){KillSwitch.kill("Failed loading ref kmers; aborting.");} 1863 1864 //Correct statistics for number of threads, since each thread processes all reference data 1865 refKmers/=WAYS; 1866 refBases/=WAYS; 1867 refReads/=WAYS; 1868 1869 scaffoldReadCounts=new AtomicLongArray(scaffoldNames.size()); 1870 scaffoldBaseCounts=new AtomicLongArray(scaffoldNames.size()); 1871 1872 t.stop(); 1873 if(DISPLAY_PROGRESS && !json){ 1874 outstream.println("Added "+added+" kmers; time: \t"+t); 1875 Shared.printMemory(); 1876 outstream.println(); 1877 } 1878 1879 if(verbose){ 1880 TextStreamWriter tsw=new TextStreamWriter("stdout", false, false, false, FileFormat.TEXT); 1881 tsw.start(); 1882 for(AbstractKmerTable table : keySets){ 1883 table.dumpKmersAsText(tsw, k, 1, Integer.MAX_VALUE); 1884 } 1885 tsw.poisonAndWait(); 1886 } 1887 1888 return added; 1889 } 1890 1891 /** 1892 * Match reads against reference kmers, using multiple ProcessThread. 1893 * @param t 1894 */ spawnProcessThreads(Timer t)1895 private void spawnProcessThreads(Timer t){ 1896 t.start(); 1897 1898 /* Create read input stream */ 1899 final ConcurrentReadInputStream cris; 1900 final boolean paired; 1901 { 1902 cris=ConcurrentReadInputStream.getReadInputStream(maxReads, ffin1.samOrBam(), ffin1, ffin2, qfin1, qfin2); 1903 cris.setSampleRate(samplerate, sampleseed); 1904 cris.start(); //4567 1905 paired=cris.paired(); 1906 if(!ffin1.samOrBam() && !silent){ 1907 if(json){ 1908 jsonStats.add("paired", paired); 1909 }else{ 1910 outstream.println("Input is being processed as "+(paired ? "paired" : "unpaired")); 1911 } 1912 } 1913 } 1914 1915 /* Create read output streams */ 1916 final ConcurrentReadOutputStream ros, rosb, ross; 1917 { 1918 final int buff=(!ordered ? 12 : Tools.max(32, 2*Shared.threads())); 1919 if(out1!=null){ 1920 ros=ConcurrentReadOutputStream.getStream(ffout1, ffout2, qfout1, qfout2, buff, null, true); 1921 ros.start(); 1922 }else{ros=null;} 1923 if(outb1!=null){ 1924 rosb=ConcurrentReadOutputStream.getStream(ffoutb1, ffoutb2, null, null, buff, null, true); 1925 rosb.start(); 1926 }else{rosb=null;} 1927 if(outsingle!=null){ 1928 ross=ConcurrentReadOutputStream.getStream(ffouts, null, null, null, buff, null, true); 1929 ross.start(); 1930 }else{ross=null;} 1931 if(ros!=null || rosb!=null || ross!=null){ 1932 t.stop(); 1933 if(!silent && !json){outstream.println("Started output streams:\t"+t);} 1934 t.start(); 1935 } 1936 } 1937 1938 /* Optionally skip the first reads, since initial reads may have lower quality */ 1939 if(skipreads>0){ 1940 long skipped=0; 1941 1942 ListNum<Read> ln=cris.nextList(); 1943 ArrayList<Read> reads=(ln!=null ? ln.list : null); 1944 1945 while(skipped<skipreads && reads!=null && reads.size()>0){ 1946 skipped+=reads.size(); 1947 1948 if(rosb!=null){rosb.add(new ArrayList<Read>(1), ln.id);} 1949 if(ros!=null){ros.add(new ArrayList<Read>(1), ln.id);} 1950 if(ross!=null){ross.add(new ArrayList<Read>(1), ln.id);} 1951 1952 cris.returnList(ln); 1953 ln=cris.nextList(); 1954 reads=(ln!=null ? ln.list : null); 1955 } 1956 cris.returnList(ln);//if added for compiler benefit 1957 if(reads==null || reads.isEmpty()){ 1958 ReadWrite.closeStreams(cris, ros, rosb, ross); 1959 outstream.println("Skipped all of the reads."); 1960 System.exit(0); 1961 } 1962 } 1963 1964 /* Create ProcessThreads */ 1965 ArrayList<ProcessThread> alpt=new ArrayList<ProcessThread>(THREADS); 1966 for(int i=0; i<THREADS; i++){alpt.add(new ProcessThread(cris, ros, rosb, ross, ALLOW_LOCAL_ARRAYS));} 1967 for(ProcessThread pt : alpt){pt.start();} 1968 1969 /* Wait for threads to die, and gather statistics */ 1970 for(ProcessThread pt : alpt){ 1971 1972 /* Wait for a thread to die */ 1973 while(pt.getState()!=Thread.State.TERMINATED){ 1974 try { 1975 pt.join(); 1976 } catch (InterruptedException e) { 1977 e.printStackTrace(); 1978 } 1979 } 1980 1981 /* Accumulate data from per-thread counters */ 1982 readsIn+=pt.readsInT; 1983 basesIn+=pt.basesInT; 1984 readsOut+=pt.readsOutuT; 1985 basesOut+=pt.basesOutuT; 1986 readsKFiltered+=pt.readsKFilteredT; 1987 basesKFiltered+=pt.basesKFilteredT; 1988 readsQTrimmed+=pt.readsQTrimmedT; 1989 basesQTrimmed+=pt.basesQTrimmedT; 1990 readsFTrimmed+=pt.readsFTrimmedT; 1991 basesFTrimmed+=pt.basesFTrimmedT; 1992 readsKTrimmed+=pt.readsKTrimmedT; 1993 basesKTrimmed+=pt.basesKTrimmedT; 1994 readsTrimmedBySwift+=pt.readsTrimmedBySwiftT; 1995 basesTrimmedBySwift+=pt.basesTrimmedBySwiftT; 1996 readsTrimmedByOverlap+=pt.readsTrimmedByOverlapT; 1997 basesTrimmedByOverlap+=pt.basesTrimmedByOverlapT; 1998 badGcReads+=pt.badGcReadsT; 1999 badGcBases+=pt.badGcBasesT; 2000 badHeaderReads+=pt.badHeaderReadsT; 2001 badHeaderBases+=pt.badHeaderBasesT; 2002 readsQFiltered+=pt.readsQFilteredT; 2003 basesQFiltered+=pt.basesQFilteredT; 2004 readsNFiltered+=pt.readsNFilteredT; 2005 basesNFiltered+=pt.basesNFilteredT; 2006 readsEFiltered+=pt.readsEFilteredT; 2007 basesEFiltered+=pt.basesEFilteredT; 2008 readsPolyTrimmed+=pt.readsPolyTrimmedT; 2009 basesPolyTrimmed+=pt.basesPolyTrimmedT; 2010 2011 if(pTracker!=null){ 2012 pTracker.add(pt.pTrackerT); 2013 } 2014 2015 if(hitCounts!=null){ 2016 for(int i=0; i<hitCounts.length; i++){hitCounts[i]+=pt.hitCountsT[i];} 2017 pt.hitCountsT=null; 2018 } 2019 if(pt.scaffoldReadCountsT!=null && scaffoldReadCounts!=null){ 2020 for(int i=0; i<pt.scaffoldReadCountsT.length; i++){scaffoldReadCounts.addAndGet(i, pt.scaffoldReadCountsT[i]);} 2021 pt.scaffoldReadCountsT=null; 2022 } 2023 if(pt.scaffoldBaseCountsT!=null && scaffoldBaseCounts!=null){ 2024 for(int i=0; i<pt.scaffoldBaseCountsT.length; i++){scaffoldBaseCounts.addAndGet(i, pt.scaffoldBaseCountsT[i]);} 2025 pt.scaffoldBaseCountsT=null; 2026 } 2027 errorState|=(!pt.finishedSuccessfully); 2028 } 2029 2030 /* Shut down I/O streams; capture error status */ 2031 { 2032 //Prevent a spurious error message in the event of a race condition when maxReads is set. 2033 boolean b=ReadWrite.closeStream(cris); 2034 if(maxReads<1 || maxReads==Long.MAX_VALUE || (maxReads!=readsIn && maxReads*2!=readsIn && samplerate<1)){errorState|=b;} 2035 } 2036 errorState|=ReadWrite.closeOutputStreams(ros, rosb, ross); 2037 errorState|=ReadStats.writeAll(); 2038 2039 t.stop(); 2040 if(showSpeed && !json){ 2041 outstream.println("Processing time: \t\t"+t); 2042 } 2043 } 2044 2045 /*--------------------------------------------------------------*/ 2046 /*---------------- Inner Classes ----------------*/ 2047 /*--------------------------------------------------------------*/ 2048 2049 /** 2050 * Loads kmers into a table. Each thread handles all kmers X such that X%WAYS==tnum. 2051 */ 2052 private class LoadThread extends Thread{ 2053 LoadThread(final int tnum_)2054 public LoadThread(final int tnum_){ 2055 tnum=tnum_; 2056 map=keySets[tnum]; 2057 } 2058 2059 /** 2060 * Get the next list of reads (or scaffolds) from the queue. 2061 * @return List of reads 2062 */ fetch()2063 private ArrayList<Read> fetch(){ 2064 ArrayList<Read> list=null; 2065 while(list==null){ 2066 try { 2067 list=queue.take(); 2068 } catch (InterruptedException e) { 2069 // TODO Auto-generated catch block 2070 e.printStackTrace(); 2071 } 2072 } 2073 return list; 2074 } 2075 2076 @Override run()2077 public void run(){ 2078 ArrayList<Read> reads=fetch(); 2079 while(reads!=POISON){ 2080 for(Read r1 : reads){ 2081 assert(r1.pairnum()==0); 2082 final Read r2=r1.mate; 2083 2084 final int rblen=(r1==null ? 0 : r1.length()); 2085 final int rblen2=r1.mateLength(); 2086 2087 addedT+=addToMap(r1, rblen>20000000 ? k : rblen>5000000 ? 11 : rblen>500000 ? 2 : 0); 2088 if(r2!=null){ 2089 addedT+=addToMap(r2, rblen2>20000000 ? k : rblen2>5000000 ? 11 : rblen2>500000 ? 2 : 0); 2090 } 2091 } 2092 reads=fetch(); 2093 } 2094 2095 if(map.canRebalance() && map.size()>2L*map.arrayLength()){ 2096 map.rebalance(); 2097 } 2098 success=true; 2099 } 2100 2101 /** 2102 * Store the read's kmers in a table. 2103 * @param r The current read to process 2104 * @param skip Number of bases to skip between kmers 2105 * @return Number of kmers stored 2106 */ addToMap(Read r, int skip)2107 private long addToMap(Read r, int skip){ 2108 skip=Tools.max(minSkip, Tools.min(maxSkip, skip)); 2109 final byte[] bases=r.bases; 2110 long kmer=0; 2111 long rkmer=0; 2112 long added=0; 2113 int len=0; 2114 2115 if(bases!=null){ 2116 refReadsT++; 2117 refBasesT+=bases.length; 2118 } 2119 if(bases==null || bases.length<k){return 0;} 2120 2121 final int id=(Integer)r.obj; 2122 2123 if(skip>1){ //Process while skipping some kmers 2124 for(int i=0; i<bases.length; i++){ 2125 byte b=bases[i]; 2126 long x=symbolToNumber0[b]; 2127 long x2=symbolToComplementNumber0[b]; 2128 kmer=((kmer<<bitsPerBase)|x)&mask; 2129 rkmer=((rkmer>>>bitsPerBase)|(x2<<shift2))&mask; 2130 if(isFullyDefined(b)){len++;}else{len=0; rkmer=0;} 2131 if(verbose){outstream.println("Scanning1 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} 2132 if(len>=k){ 2133 refKmersT++; 2134 if(len%skip==0){ 2135 final long extraBase=(i>=bases.length-1 ? -1 : symbolToNumber[bases[i+1]]); 2136 added+=addToMap(kmer, rkmer, k, extraBase, id, kmask, hammingDistance, editDistance); 2137 if(useShortKmers){ 2138 if(i==k2){added+=addToMapRightShift(kmer, rkmer, id);} 2139 if(i==bases.length-1){added+=addToMapLeftShift(kmer, rkmer, extraBase, id);} 2140 } 2141 } 2142 } 2143 } 2144 }else{ //Process all kmers 2145 for(int i=0; i<bases.length; i++){ 2146 final byte b=bases[i]; 2147 final long x=symbolToNumber0[b]; 2148 final long x2=symbolToComplementNumber0[b]; 2149 // assert(x!=x2) : x+", "+x2+", "+Character.toString((char)b)+"\n"+Arrays.toString(symbolToNumber0)+"\n"+Arrays.toString(symbolToComplementNumber); 2150 kmer=((kmer<<bitsPerBase)|x)&mask; 2151 //10000, 1111111111, 16, 16, 2, 10, 8 2152 rkmer=((rkmer>>>bitsPerBase)|(x2<<shift2))&mask; 2153 if(isFullyDefined(b)){len++;}else{len=0; rkmer=0;} 2154 if(verbose){ 2155 if(verbose){ 2156 String fwd=new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)); 2157 String rev=AminoAcid.reverseComplementBases(fwd); 2158 String fwd2=kmerToString(kmer, Tools.min(len, k)); 2159 outstream.println("fwd="+fwd+", fwd2="+fwd2+", rev="+rev+", kmer="+kmer+", rkmer="+rkmer); 2160 outstream.println("b="+(char)b+", x="+x+", x2="+x2+", bitsPerBase="+bitsPerBase+", shift2="+shift2); 2161 if(!amino){ 2162 assert(AminoAcid.stringToKmer(fwd)==kmer) : fwd+", "+AminoAcid.stringToKmer(fwd)+", "+kmer+", "+len; 2163 if(len>=k){ 2164 assert(rcomp(kmer, Tools.min(len, k))==rkmer); 2165 assert(rcomp(rkmer, Tools.min(len, k))==kmer); 2166 assert(AminoAcid.kmerToString(kmer, Tools.min(len, k)).equals(fwd)); 2167 assert(AminoAcid.kmerToString(rkmer, Tools.min(len, k)).equals(rev)) : AminoAcid.kmerToString(rkmer, Tools.min(len, k))+" != "+rev+" (rkmer)"; 2168 } 2169 assert(fwd.equalsIgnoreCase(fwd2)) : fwd+", "+fwd2; //may be unsafe 2170 } 2171 outstream.println("Scanning6 i="+i+", len="+len+", kmer="+kmer+", rkmer="+rkmer+", bases="+fwd+", rbases="+rev); 2172 } 2173 } 2174 if(len>=k){ 2175 // assert(kmer==rcomp(rkmer, k)) : Long.toBinaryString(kmer)+", "+Long.toBinaryString(rkmer)+", "+Long.toBinaryString(mask)+", x="+x+", x2="+x2+", bits="+bitsPerBase+", s="+shift+", s2="+shift2+", b="+Character.toString((char)b); 2176 refKmersT++; 2177 final long extraBase=(i>=bases.length-1 ? -1 : symbolToNumber[bases[i+1]]); 2178 final long atm=addToMap(kmer, rkmer, k, extraBase, id, kmask, hammingDistance, editDistance); 2179 added+=atm; 2180 // assert(false) : atm+", "+map.contains(toValue(kmer, rkmer, kmask)); 2181 if(useShortKmers){ 2182 if(i==k2){added+=addToMapRightShift(kmer, rkmer, id);} 2183 if(i==bases.length-1){added+=addToMapLeftShift(kmer, rkmer, extraBase, id);} 2184 } 2185 } 2186 } 2187 } 2188 return added; 2189 } 2190 2191 2192 /** 2193 * Adds short kmers on the left end of the read. 2194 * @param kmer Forward kmer 2195 * @param rkmer Reverse kmer 2196 * @param extraBase Base added to end in case of deletions 2197 * @param id Scaffold number 2198 * @return Number of kmers stored 2199 */ addToMapLeftShift(long kmer, long rkmer, final long extraBase, final int id)2200 private long addToMapLeftShift(long kmer, long rkmer, final long extraBase, final int id){ 2201 if(verbose){outstream.println("addToMapLeftShift");} 2202 long added=0; 2203 for(int i=k-1; i>=mink; i--){ 2204 kmer=kmer&rightMasks[i]; 2205 rkmer=rkmer>>>bitsPerBase; 2206 long x=addToMap(kmer, rkmer, i, extraBase, id, lengthMasks[i], hammingDistance2, editDistance2); 2207 added+=x; 2208 if(verbose){ 2209 if((toValue(kmer, rkmer, lengthMasks[i]))%WAYS==tnum){ 2210 outstream.println("added="+x+"; i="+i+"; tnum="+tnum+"; Added left-shift kmer "+kmerToString(kmer&~lengthMasks[i], i)+"; value="+(toValue(kmer, rkmer, lengthMasks[i]))+"; kmer="+kmer+"; rkmer="+rkmer+"; kmask="+lengthMasks[i]+"; rightMasks[i+1]="+rightMasks[i+1]); 2211 outstream.println("i="+i+"; tnum="+tnum+"; Looking for left-shift kmer "+kmerToString(kmer&~lengthMasks[i], i)); 2212 final long value=toValue(kmer, rkmer, lengthMasks[i]); 2213 if(map.contains(value)){outstream.println("Found "+value);} 2214 } 2215 } 2216 } 2217 return added; 2218 } 2219 2220 2221 /** 2222 * Adds short kmers on the right end of the read. 2223 * @param kmer Forward kmer 2224 * @param rkmer Reverse kmer 2225 * @param id Scaffold number 2226 * @return Number of kmers stored 2227 */ addToMapRightShift(long kmer, long rkmer, final int id)2228 private long addToMapRightShift(long kmer, long rkmer, final int id){ 2229 if(verbose){outstream.println("addToMapRightShift");} 2230 long added=0; 2231 for(int i=k-1; i>=mink; i--){ 2232 long extraBase=kmer&symbolMask; 2233 kmer=kmer>>>bitsPerBase; 2234 rkmer=rkmer&rightMasks[i]; 2235 // assert(Long.numberOfLeadingZeros(kmer)>=2*(32-i)) : Long.numberOfLeadingZeros(kmer)+", "+i+", "+kmer+", "+kMasks[i]; 2236 // assert(Long.numberOfLeadingZeros(rkmer)>=2*(32-i)) : Long.numberOfLeadingZeros(rkmer)+", "+i+", "+rkmer+", "+kMasks[i]; 2237 long x=addToMap(kmer, rkmer, i, extraBase, id, lengthMasks[i], hammingDistance2, editDistance2); 2238 added+=x; 2239 if(verbose){ 2240 if((toValue(kmer, rkmer, lengthMasks[i]))%WAYS==tnum){ 2241 outstream.println("added="+x+"; i="+i+"; tnum="+tnum+"; Added right-shift kmer "+kmerToString(kmer&~lengthMasks[i], i)+"; value="+(toValue(kmer, rkmer, lengthMasks[i]))+"; kmer="+kmer+"; rkmer="+rkmer+"; kmask="+lengthMasks[i]+"; rightMasks[i+1]="+rightMasks[i+1]); 2242 outstream.println("i="+i+"; tnum="+tnum+"; Looking for right-shift kmer "+kmerToString(kmer&~lengthMasks[i], i)); 2243 final long value=toValue(kmer, rkmer, lengthMasks[i]); 2244 if(map.contains(value)){outstream.println("Found "+value);} 2245 } 2246 } 2247 } 2248 return added; 2249 } 2250 2251 2252 /** 2253 * Adds this kmer to the table, including any mutations implied by editDistance or hammingDistance. 2254 * @param kmer Forward kmer 2255 * @param rkmer Reverse kmer 2256 * @param len Kmer length 2257 * @param extraBase Base added to end in case of deletions 2258 * @param id Scaffold number 2259 * @param kmask0 2260 * @return Number of kmers stored 2261 */ addToMap(final long kmer, final long rkmer, final int len, final long extraBase, final int id, final long kmask0, final int hdist, final int edist)2262 private long addToMap(final long kmer, final long rkmer, final int len, final long extraBase, final int id, final long kmask0, final int hdist, final int edist){ 2263 2264 assert(kmask0==lengthMasks[len]) : kmask0+", "+len+", "+lengthMasks[len]+", "+Long.numberOfTrailingZeros(kmask0)+", "+Long.numberOfTrailingZeros(lengthMasks[len]); 2265 2266 if(verbose){outstream.println("addToMap_A; len="+len+"; kMasks[len]="+lengthMasks[len]);} 2267 assert((kmer&kmask0)==0); 2268 final long added; 2269 if(hdist==0){ 2270 final long key=toValue(kmer, rkmer, kmask0); 2271 if(verbose){outstream.println("toValue ("+kmerToString(kmer, len)+", "+kmerToString(rkmer, len)+") = "+kmerToString(key, len)+" = "+key);} 2272 if(failsSpeed(key)){return 0;} 2273 if(key%WAYS!=tnum){return 0;} 2274 if(verbose){outstream.println("addToMap_B: "+kmerToString(key, len)+" ("+key+")");} 2275 added=map.setIfNotPresent(key, id); 2276 }else if(edist>0){ 2277 // long extraBase=(i>=bases.length-1 ? -1 : symbolToNumber2bases[i+1]]); 2278 added=mutate(kmer, rkmer, len, id, edist, extraBase); 2279 }else{ 2280 added=mutate(kmer, rkmer, len, id, hdist, -1); 2281 } 2282 if(verbose){outstream.println("addToMap added "+added+" keys.");} 2283 return added; 2284 } 2285 2286 /** 2287 * Mutate and store this kmer through 'dist' recursions. 2288 * @param kmer Forward kmer 2289 * @param rkmer Reverse kmer 2290 * @param id Scaffold number 2291 * @param dist Number of mutations 2292 * @param extraBase Base added to end in case of deletions 2293 * @return Number of kmers stored 2294 */ mutate(final long kmer, final long rkmer, final int len, final int id, final int dist, final long extraBase)2295 private long mutate(final long kmer, final long rkmer, final int len, final int id, final int dist, final long extraBase){ 2296 long added=0; 2297 2298 final long key=toValue(kmer, rkmer, lengthMasks[len]); 2299 2300 // if(dist==1){System.err.println(".\t.\t"+kmerToString(kmer, k)+" initial.");}//123 2301 2302 if(verbose){outstream.println("mutate_A; len="+len+"; kmer="+kmer+"; rkmer="+rkmer+"; kMasks[len]="+lengthMasks[len]);} 2303 if(key%WAYS==tnum){ 2304 if(verbose){outstream.println("mutate_B: "+kmerToString(kmer&~lengthMasks[len], len)+" = "+key);} 2305 int x=map.setIfNotPresent(key, id); 2306 // if(x>0){System.err.println(".\t.\t"+kmerToString(kmer, k)+" Added!");}//123 2307 if(verbose){outstream.println("mutate_B added "+x+" keys.");} 2308 added+=x; 2309 assert(map.contains(key)); 2310 } 2311 2312 if(dist>0){ 2313 final int dist2=dist-1; 2314 2315 //Sub 2316 for(int j=0; j<symbols; j++){ 2317 for(int i=0; i<len; i++){ 2318 final long temp=(kmer&clearMasks[i])|setMasks[j][i]; 2319 // System.err.println("cm:"+kmerToString(clearMasks[i], k));//123 2320 // System.err.println("sm:"+kmerToString(setMasks[j][i], k));//123 2321 // assert(Long.bitCount((temp^kmer))<=bitsPerBase) : //Warning: Slow assertion //123 2322 // "\n"+kmerToString(kmer, k)+"\n"+kmerToString(temp, k)+"\n"+i+","+j+","+k; 2323 // System.err.println(j+"\t"+i+"\t"+kmerToString(temp, k));//123 2324 if(temp!=kmer){ 2325 long rtemp=rcomp(temp, len); 2326 added+=mutate(temp, rtemp, len, id, dist2, extraBase); 2327 } 2328 } 2329 } 2330 2331 if(editDistance>0){ 2332 //Del 2333 if(extraBase>=0 && extraBase<=maxSymbol){ 2334 for(int i=1; i<len; i++){ 2335 final long temp=(kmer&leftMasks[i])|((kmer<<bitsPerBase)&rightMasks[i])|extraBase; 2336 if(temp!=kmer){ 2337 long rtemp=rcomp(temp, len); 2338 added+=mutate(temp, rtemp, len, id, dist2, -1); 2339 } 2340 } 2341 } 2342 2343 //Ins 2344 final long eb2=kmer&symbolMask; 2345 for(int i=1; i<len; i++){ 2346 final long temp0=(kmer&leftMasks[i])|((kmer&rightMasks[i])>>bitsPerBase); 2347 for(int j=0; j<symbols; j++){ 2348 final long temp=temp0|setMasks[j][i-1]; 2349 if(temp!=kmer){ 2350 long rtemp=rcomp(temp, len); 2351 added+=mutate(temp, rtemp, len, id, dist2, eb2); 2352 } 2353 } 2354 } 2355 } 2356 2357 } 2358 2359 // if(dist==1){//123 2360 // System.err.println("Added "+added); 2361 // assert(false); 2362 // } 2363 2364 return added; 2365 } 2366 2367 /*--------------------------------------------------------------*/ 2368 2369 /** Number of kmers stored by this thread */ 2370 public long addedT=0; 2371 /** Number of items encountered by this thread */ 2372 public long refKmersT=0, refReadsT=0, refBasesT=0; 2373 /** Thread number; used to determine which kmers to store */ 2374 public final int tnum; 2375 /** Buffer of input read lists */ 2376 public final ArrayBlockingQueue<ArrayList<Read>> queue=new ArrayBlockingQueue<ArrayList<Read>>(32); 2377 // /** Used to trick compiler */ 2378 // public long modsumT=0; //123 2379 2380 /** Destination for storing kmers */ 2381 private final AbstractKmerTable map; 2382 2383 /** Completed successfully */ 2384 boolean success=false; 2385 2386 } 2387 2388 /*--------------------------------------------------------------*/ 2389 2390 /** 2391 * Matches read kmers against reference kmers, performs binning and/or trimming, and writes output. 2392 */ 2393 private class ProcessThread extends Thread{ 2394 2395 /** 2396 * Constructor 2397 * @param cris_ Read input stream 2398 * @param ros_ Unmatched read output stream (optional) 2399 * @param rosb_ Matched read output stream (optional) 2400 * @param ross_ Singleton read output stream (optional) 2401 */ ProcessThread(ConcurrentReadInputStream cris_, ConcurrentReadOutputStream ros_, ConcurrentReadOutputStream rosb_, ConcurrentReadOutputStream ross_, boolean localArrays)2402 public ProcessThread(ConcurrentReadInputStream cris_, ConcurrentReadOutputStream ros_, ConcurrentReadOutputStream rosb_, ConcurrentReadOutputStream ross_, boolean localArrays){ 2403 cris=cris_; 2404 ros=ros_; 2405 rosb=rosb_; 2406 ross=ross_; 2407 2408 readstats=makeReadStats ? new ReadStats() : null; 2409 2410 final int alen=(scaffoldNames==null ? 0 : scaffoldNames.size()); 2411 2412 if(findBestMatch){ 2413 countArray=new int[alen]; 2414 idList=new IntList(); 2415 countList=new IntList(); 2416 }else{ 2417 countArray=null; 2418 idList=countList=null; 2419 } 2420 2421 overlapVector=(trimByOverlap ? new int[5] : null); 2422 2423 hitCountsT=(hitCounts==null ? null : new long[hitCounts.length]); 2424 2425 if(localArrays && alen>0 && alen<10000 && scaffoldReadCounts!=null && scaffoldBaseCounts!=null){ 2426 scaffoldReadCountsT=new long[alen]; 2427 scaffoldBaseCountsT=new long[alen]; 2428 }else{ 2429 scaffoldReadCountsT=scaffoldBaseCountsT=null; 2430 } 2431 2432 if(calcEntropy){ 2433 eTrackerT=new EntropyTracker(amino, Tools.max(0, entropyCutoff), entropyHighpass); 2434 }else{ 2435 eTrackerT=null; 2436 } 2437 2438 if(countPolymers){ 2439 pTrackerT=new PolymerTracker(); 2440 }else{ 2441 pTrackerT=null; 2442 } 2443 2444 maxBasesOutmT=(maxBasesOutm>0 ? Tools.max(1, maxBasesOutm/THREADS) : -1); 2445 maxBasesOutuT=(maxBasesOutu>0 ? Tools.max(1, maxBasesOutu/THREADS) : -1); 2446 2447 flowCoords=(locationFilter ? new FlowcellCoordinate() : null); 2448 } 2449 2450 @Override run()2451 public void run(){ 2452 ListNum<Read> ln=cris.nextList(); 2453 ArrayList<Read> reads=(ln!=null ? ln.list : null); 2454 ArrayList<Read> bad=(rosb==null ? null : new ArrayList<Read>(Shared.bufferLen())); 2455 ArrayList<Read> single=new ArrayList<Read>(Shared.bufferLen()); 2456 2457 final boolean ktrimrl=ktrimLeft || ktrimRight; 2458 final boolean doKmerTrimming=storedKmers>0 && (ktrimLeft || ktrimRight || ktrimN || ksplit); 2459 final boolean doKmerFiltering=storedKmers>0 && !doKmerTrimming; 2460 2461 //While there are more reads lists... 2462 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning 2463 2464 int removed=0; 2465 2466 //For each read (or pair) in the list... 2467 for(int i=0; i<reads.size(); i++){ 2468 final Read r1=reads.get(i); 2469 if(pairedToSingle && r1.mate!=null){ 2470 r1.mate=null; 2471 } 2472 final Read r2=r1.mate; 2473 2474 if(!r1.validated()){r1.validate(true);} 2475 if(r2!=null && !r2.validated()){r2.validate(true);} 2476 2477 boolean remove=false; 2478 if(tossJunk){ 2479 if(r1!=null && r1.junk()){ 2480 setDiscarded(r1); 2481 remove=true; 2482 } 2483 if(r2!=null && r2.junk()){ 2484 setDiscarded(r2); 2485 remove=true; 2486 } 2487 } 2488 2489 if(isNotDiscarded(r1)){ 2490 2491 if(histogramsBeforeProcessing){addToHistograms(r1, r2);} 2492 2493 if(loglogIn!=null){loglogIn.hash(r1);} 2494 2495 } 2496 2497 final int initialLength1=r1.length(); 2498 final int initialLength2=r1.mateLength(); 2499 final int initialPairLength=initialLength1+initialLength2; 2500 final int pairCount=r1.pairCount(); 2501 2502 final int minlen1=(int)Tools.max(initialLength1*minLenFraction, minReadLength); 2503 final int minlen2=(int)Tools.max(initialLength2*minLenFraction, minReadLength); 2504 2505 if(verbose){outstream.println("Considering read "+r1.id+" "+new String(r1.bases));} 2506 2507 readsInT+=pairCount; 2508 basesInT+=initialPairLength; 2509 2510 if(!remove){//due to being junk 2511 if(chastityFilter){ 2512 if(r1!=null && r1.failsChastity()){ 2513 setDiscarded(r1); 2514 if(r2!=null){setDiscarded(r2);} 2515 } 2516 } 2517 2518 if(locationFilter && isNotDiscarded(r1)){ 2519 flowCoords.setFrom(r1.id); 2520 boolean discard=false; 2521 if(xMinLoc>-1 && flowCoords.x<xMinLoc){discard=true;} 2522 if(xMaxLoc>-1 && flowCoords.x>xMaxLoc){discard=true;} 2523 if(yMinLoc>-1 && flowCoords.y<yMinLoc){discard=true;} 2524 if(yMaxLoc>-1 && flowCoords.y>yMaxLoc){discard=true;} 2525 if(discard){ 2526 setDiscarded(r1); 2527 if(r2!=null){setDiscarded(r2);} 2528 } 2529 } 2530 2531 if(removeBadBarcodes){ 2532 if(isNotDiscarded(r1) && r1.failsBarcode(barcodes, failIfNoBarcode)){ 2533 if(failBadBarcodes){KillSwitch.kill("Invalid barcode detected: "+r1.id+"\nThis can be disabled with the flag barcodefilter=f");} 2534 setDiscarded(r1); 2535 if(r2!=null){setDiscarded(r2);} 2536 } 2537 } 2538 2539 if(isDiscarded(r1)){ 2540 badHeaderBasesT+=initialPairLength; 2541 badHeaderReadsT+=pairCount; 2542 } 2543 2544 if(recalibrateQuality){ 2545 if(isNotDiscarded(r1)){ 2546 CalcTrueQuality.recalibrate(r1); 2547 } 2548 if(isNotDiscarded(r2)){ 2549 CalcTrueQuality.recalibrate(r2); 2550 } 2551 } 2552 2553 if(filterGC && (initialLength1>0 || initialLength2>0)){ 2554 float gc1=(initialLength1>0 ? r1.gc() : -1); 2555 float gc2=(initialLength2>0 ? r2.gc() : gc1); 2556 if(gc1==-1){gc1=gc2;} 2557 if(usePairGC){ 2558 final float gc; 2559 if(r2==null){ 2560 gc=gc1; 2561 }else{ 2562 gc=(gc1*initialLength1+gc2*initialLength2)/(initialPairLength); 2563 } 2564 gc1=gc2=gc; 2565 } 2566 if(isNotDiscarded(r1) && (gc1<minGC || gc1>maxGC)){ 2567 setDiscarded(r1); 2568 badGcBasesT+=initialLength1; 2569 badGcReadsT++; 2570 } 2571 if(isNotDiscarded(r2) && (gc2<minGC || gc2>maxGC)){ 2572 setDiscarded(r2); 2573 badGcBasesT+=initialLength2; 2574 badGcReadsT++; 2575 } 2576 } 2577 2578 if(forceTrimLeft>0 || forceTrimRight>0 || forceTrimRight2>0 || forceTrimModulo>0){ 2579 if(isNotDiscarded(r1)){ 2580 final int len=r1.length(); 2581 final int a=forceTrimLeft>0 ? forceTrimLeft : 0; 2582 final int b0=forceTrimModulo>0 ? len-1-len%forceTrimModulo : len; 2583 final int b1=forceTrimRight>0 ? forceTrimRight : len; 2584 final int b2=forceTrimRight2>0 ? len-1-forceTrimRight2 : len; 2585 final int b=Tools.min(b0, b1, b2); 2586 final int x=TrimRead.trimToPosition(r1, a, b, 1); 2587 basesFTrimmedT+=x; 2588 readsFTrimmedT+=(x>0 ? 1 : 0); 2589 if(r1.length()<minlen1){setDiscarded(r1);} 2590 } 2591 if(isNotDiscarded(r2)){ 2592 final int len=r2.length(); 2593 final int a=forceTrimLeft>0 ? forceTrimLeft : 0; 2594 final int b0=forceTrimModulo>0 ? len-1-len%forceTrimModulo : len; 2595 final int b1=forceTrimRight>0 ? forceTrimRight : len; 2596 final int b2=forceTrimRight2>0 ? len-1-forceTrimRight2 : len; 2597 final int b=Tools.min(b0, b1, b2); 2598 final int x=TrimRead.trimToPosition(r2, a, b, 1); 2599 basesFTrimmedT+=x; 2600 readsFTrimmedT+=(x>0 ? 1 : 0); 2601 if(r2.length()<minlen2){setDiscarded(r2);} 2602 } 2603 } 2604 2605 if(filterVars){ 2606 if(isNotDiscarded(r1)){ 2607 boolean b=passesVariantFilter(r1); 2608 if(!b){setDiscarded(r1);} 2609 } 2610 if(isNotDiscarded(r2)){ 2611 boolean b=passesVariantFilter(r2); 2612 if(!b){setDiscarded(r2);} 2613 } 2614 } 2615 2616 if(isNotDiscarded(r1) && r1.length()<minlen1){setDiscarded(r1);} 2617 if(isNotDiscarded(r2) && r2.length()<minlen2){setDiscarded(r2);} 2618 2619 if(removePairsIfEitherBad){remove=isDiscarded(r1) || isDiscarded(r2);} 2620 else{remove=isDiscarded(r1) && isNullOrDiscarded(r2);} 2621 } 2622 2623 if(remove){ 2624 if(r1!=null){ 2625 basesQFilteredT+=initialLength1; 2626 readsQFilteredT++; 2627 } 2628 if(r2!=null){ 2629 basesQFilteredT+=initialLength2; 2630 readsQFilteredT++; 2631 } 2632 if(bad!=null){bad.add(r1);} 2633 }else{ 2634 2635 if(ecc && r1!=null && r2!=null){BBMerge.findOverlapStrict(r1, r2, true);} 2636 2637 //Process kmers 2638 if(doKmerTrimming){ 2639 2640 int rlen1=0, rlen2=0; 2641 int xsum=0; 2642 int rktsum=0; 2643 2644 if(ktrimrl){ 2645 if(r1!=null){ 2646 int x=ktrim(r1, keySets); 2647 xsum+=x; 2648 rktsum+=(x>0 ? 1 : 0); 2649 rlen1=r1.length(); 2650 if(rlen1<minlen1){setDiscarded(r1);} 2651 } 2652 if(r2!=null){ 2653 int x=ktrim(r2, keySets); 2654 xsum+=x; 2655 rktsum+=(x>0 ? 1 : 0); 2656 rlen2=r2.length(); 2657 if(rlen2<minlen2){setDiscarded(r2);} 2658 } 2659 }else if(ktrimN){ 2660 if(r1!=null){ 2661 int x=kmask(r1, keySets); 2662 xsum+=x; 2663 rktsum+=(x>0 ? 1 : 0); 2664 rlen1=r1.length(); 2665 if(rlen1<minlen1){setDiscarded(r1);} 2666 } 2667 if(r2!=null){ 2668 int x=kmask(r2, keySets); 2669 xsum+=x; 2670 rktsum+=(x>0 ? 1 : 0); 2671 rlen2=r2.length(); 2672 if(rlen2<minlen2){setDiscarded(r2);} 2673 } 2674 }else if(ksplit){ 2675 assert(r2==null); 2676 if(r1!=null){ 2677 int oldLen=r1.pairLength(); 2678 boolean b=ksplit(r1, keySets); 2679 int trimmed=oldLen-r1.pairLength(); 2680 xsum+=trimmed; 2681 rktsum+=(trimmed>0 ? 1 : 0); 2682 rlen1=r1.length(); 2683 // if(rlen1<minlen1){setDiscarded(r1);} 2684 // if(b && r1.mateLength()<minlen1){setDiscarded(r1);} 2685 } 2686 } 2687 2688 if(ksplit){ 2689 remove=(r1.mate!=null); 2690 if(remove && addTrimmedToBad && bad!=null){bad.add(r1);} 2691 }else if(shouldRemove(r1, r2)){ 2692 if(!ktrimN){ 2693 xsum+=(rlen1+rlen2); 2694 rktsum=pairCount; 2695 } 2696 remove=true; 2697 if(addTrimmedToBad && bad!=null){bad.add(r1);} 2698 }else if(ktrimRight && trimPairsEvenly && xsum>0 && r2!=null && r1.length()!=r2.length()){ 2699 int x; 2700 if(r1.length()>r2.length()){ 2701 x=TrimRead.trimToPosition(r1, 0, r2.length()-1, 1); 2702 }else{ 2703 x=TrimRead.trimToPosition(r2, 0, r1.length()-1, 1); 2704 } 2705 if(rktsum<2){rktsum++;} 2706 xsum+=x; 2707 assert(r1.length()==r2.length()) : r1.length()+", "+r2.length(); 2708 } 2709 basesKTrimmedT+=xsum; 2710 readsKTrimmedT+=rktsum; 2711 2712 }else if(doKmerFiltering){ 2713 //Do kmer matching 2714 2715 if(minCoveredFraction>0){ 2716 if(isNotDiscarded(r1)){ 2717 final int minCoveredBases=(int)Math.ceil(minCoveredFraction*r1.length()); 2718 final int covered=countCoveredBases(r1, keySets, minCoveredBases); 2719 if(covered>=minCoveredBases){setDiscarded(r1);} 2720 } 2721 if(isNotDiscarded(r2)){ 2722 final int minCoveredBases=(int)Math.ceil(minCoveredFraction*r2.length()); 2723 final int covered=countCoveredBases(r2, keySets, minCoveredBases); 2724 if(covered>=minCoveredBases){setDiscarded(r2);} 2725 } 2726 }else{ 2727 2728 final int maxBadKmersR1, maxBadKmersR2; 2729 if(minKmerFraction==0){ 2730 maxBadKmersR1=maxBadKmersR2=maxBadKmers0; 2731 }else{ 2732 final int vk1=r1.numValidKmers(keff), vk2=(r2==null ? 0 : r2.numValidKmers(keff)); 2733 maxBadKmersR1=Tools.max(maxBadKmers0, (int)((vk1-1)*minKmerFraction)); 2734 maxBadKmersR2=Tools.max(maxBadKmers0, (int)((vk2-1)*minKmerFraction)); 2735 } 2736 2737 if(!findBestMatch){ 2738 final int a=(kbig<=k ? countSetKmers(r1, keySets, maxBadKmersR1) : countSetKmersBig(r1, keySets, maxBadKmersR1)); 2739 final int b=(kbig<=k ? countSetKmers(r2, keySets, maxBadKmersR2) : countSetKmersBig(r2, keySets, maxBadKmersR2)); 2740 2741 if(r1!=null && a>maxBadKmersR1){setDiscarded(r1);} 2742 if(r2!=null && b>maxBadKmersR2){setDiscarded(r2);} 2743 2744 }else{ 2745 final int a=findBestMatch(r1, keySets, maxBadKmersR1); 2746 final int b=findBestMatch(r2, keySets, maxBadKmersR2); 2747 2748 if(r1!=null && a>0){setDiscarded(r1);} 2749 if(r2!=null && b>0){setDiscarded(r2);} 2750 } 2751 } 2752 2753 if(shouldRemove(r1, r2)){ 2754 remove=true; 2755 if(r1!=null){ 2756 readsKFilteredT++; 2757 basesKFilteredT+=initialLength1; 2758 } 2759 if(r2!=null){ 2760 readsKFilteredT++; 2761 basesKFilteredT+=initialLength2; 2762 } 2763 if(bad!=null){bad.add(r1);} 2764 } 2765 2766 } 2767 } 2768 2769 // assert(false) : remove+", "+trimByOverlap+", "+(r2!=null); 2770 2771 if(!remove && trimByOverlap && r2!=null && expectedErrors(r1, r2)<meeFilter){ 2772 2773 if(aprob==null || aprob.length<r1.length()){aprob=new float[r1.length()];} 2774 if(bprob==null || bprob.length<r2.length()){bprob=new float[r2.length()];} 2775 2776 //Do overlap trimming 2777 r2.reverseComplement(); 2778 // int bestInsert=BBMergeOverlapper.mateByOverlap(r1, r2, aprob, bprob, overlapVector, minOverlap0, minOverlap, 2779 // overlapMargin, overlapMaxMismatches0, overlapMaxMismatches, overlapMinq); 2780 int bestInsert=BBMergeOverlapper.mateByOverlapRatio(r1, r2, aprob, bprob, overlapVector, minOverlap0, minOverlap, 2781 minInsert0, minInsert, maxRatio, 0.12f, ratioMargin, ratioOffset, 0.95f, 0.95f, useQualityForOverlap); 2782 2783 if(bestInsert<minInsert){bestInsert=-1;} 2784 boolean ambig=(overlapVector[4]==1); 2785 final int bestBad=overlapVector[2]; 2786 2787 if(bestInsert>0 && !ambig && r1.quality!=null && r2.quality!=null && useQualityForOverlap){ 2788 if(efilterRatio>0 && bestInsert>0 && !ambig){ 2789 float bestExpected=BBMergeOverlapper.expectedMismatches(r1, r2, bestInsert); 2790 if((bestExpected+efilterOffset)*efilterRatio<bestBad){ambig=true;} 2791 } 2792 if(pfilterRatio>0 && bestInsert>0 && !ambig){ 2793 float probability=BBMergeOverlapper.probability(r1, r2, bestInsert); 2794 if(probability<pfilterRatio){bestInsert=-1;} 2795 } 2796 if(meeFilter>=0 && bestInsert>0 && !ambig){ 2797 float expected=BBMergeOverlapper.expectedMismatches(r1, r2, bestInsert); 2798 if(expected>meeFilter){bestInsert=-1;} 2799 } 2800 } 2801 2802 r2.reverseComplement(); 2803 2804 if(bestInsert>0 && !ambig){ 2805 if(bestInsert<r1.length()){ 2806 if(verbose){outstream.println("Overlap right trimming r1 to "+0+", "+(bestInsert-1));} 2807 int x=TrimRead.trimToPosition(r1, 0, bestInsert-1, 1); 2808 if(verbose){outstream.println("Trimmed "+x+" bases: "+new String(r1.bases));} 2809 readsTrimmedByOverlapT++; 2810 basesTrimmedByOverlapT+=x; 2811 } 2812 if(bestInsert<r2.length()){ 2813 if(verbose){outstream.println("Overlap right trimming r2 to "+0+", "+(bestInsert-1));} 2814 int x=TrimRead.trimToPosition(r2, 0, bestInsert-1, 1); 2815 if(verbose){outstream.println("Trimmed "+x+" bases: "+new String(r2.bases));} 2816 readsTrimmedByOverlapT++; 2817 basesTrimmedByOverlapT+=x; 2818 } 2819 } 2820 } 2821 2822 if(!remove && swift){ 2823 //Do Swift trimming 2824 2825 int rlen1=0, rlen2=0; 2826 if(r1!=null){ 2827 int x=trimSwift(r1); 2828 basesTrimmedBySwiftT+=x; 2829 readsTrimmedBySwiftT+=(x>0 ? 1 : 0); 2830 rlen1=r1.length(); 2831 if(rlen1<minlen1){setDiscarded(r1);} 2832 } 2833 if(r2!=null){ 2834 int x=trimSwift(r2); 2835 basesTrimmedBySwiftT+=x; 2836 readsTrimmedBySwiftT+=(x>0 ? 1 : 0); 2837 rlen2=r2.length(); 2838 if(rlen2<minlen2){setDiscarded(r2);} 2839 } 2840 2841 //Discard reads if too short 2842 if(shouldRemove(r1, r2)){ 2843 basesTrimmedBySwiftT+=r1.pairLength(); 2844 remove=true; 2845 if(addTrimmedToBad && bad!=null){bad.add(r1);} 2846 } 2847 } 2848 2849 if(!remove && trimPolyA>0){ 2850 //Do poly-A trimming 2851 2852 int rlen1=0, rlen2=0; 2853 if(r1!=null){ 2854 int x=trimPolyA(r1, trimPolyA); 2855 basesPolyTrimmedT+=x; 2856 readsPolyTrimmedT+=(x>0 ? 1 : 0); 2857 rlen1=r1.length(); 2858 if(rlen1<minlen1){setDiscarded(r1);} 2859 } 2860 if(r2!=null){ 2861 int x=trimPolyA(r2, trimPolyA); 2862 basesPolyTrimmedT+=x; 2863 readsPolyTrimmedT+=(x>0 ? 1 : 0); 2864 rlen2=r2.length(); 2865 if(rlen2<minlen2){setDiscarded(r2);} 2866 } 2867 2868 //Discard reads if too short 2869 if(shouldRemove(r1, r2)){ 2870 basesPolyTrimmedT+=r1.pairLength(); 2871 remove=true; 2872 if(addTrimmedToBad && bad!=null){bad.add(r1);} 2873 } 2874 } 2875 2876 if(!remove && (trimPolyGLeft>0 || trimPolyGRight>0 || filterPolyG>0)){ 2877 //Do poly-G trimming 2878 2879 int rlen1=0, rlen2=0; 2880 if(r1!=null){ 2881 if(filterPolyG>0 && r1.countLeft('G')>=filterPolyG){ 2882 setDiscarded(r1); 2883 readsPolyTrimmedT++; 2884 }else if(trimPolyGLeft>0 || trimPolyGRight>0){ 2885 int x=trimPoly(r1, trimPolyGLeft, trimPolyGRight, (byte)'G'); 2886 basesPolyTrimmedT+=x; 2887 readsPolyTrimmedT+=(x>0 ? 1 : 0); 2888 rlen1=r1.length(); 2889 if(rlen1<minlen1){setDiscarded(r1);} 2890 } 2891 } 2892 if(r2!=null){ 2893 if(filterPolyG>0 && r2.countLeft('G')>=filterPolyG){ 2894 setDiscarded(r1); 2895 readsPolyTrimmedT++; 2896 }else if(trimPolyGLeft>0 || trimPolyGRight>0){ 2897 int x=trimPoly(r2, trimPolyGLeft, trimPolyGRight, (byte)'G'); 2898 basesPolyTrimmedT+=x; 2899 readsPolyTrimmedT+=(x>0 ? 1 : 0); 2900 rlen2=r2.length(); 2901 if(rlen2<minlen2){setDiscarded(r2);} 2902 } 2903 } 2904 2905 //Discard reads if too short 2906 if(shouldRemove(r1, r2)){ 2907 basesPolyTrimmedT+=r1.pairLength(); 2908 remove=true; 2909 if(addTrimmedToBad && bad!=null){bad.add(r1);} 2910 } 2911 } 2912 2913 if(!remove && (trimPolyCLeft>0 || trimPolyCRight>0 || filterPolyC>0)){ 2914 //Do poly-C trimming 2915 2916 int rlen1=0, rlen2=0; 2917 if(r1!=null){ 2918 if(filterPolyC>0 && r1.countLeft('G')>=filterPolyC){ 2919 setDiscarded(r1); 2920 readsPolyTrimmedT++; 2921 }else if(trimPolyCLeft>0 || trimPolyCRight>0){ 2922 int x=trimPoly(r1, trimPolyCLeft, trimPolyCRight, (byte)'C'); 2923 basesPolyTrimmedT+=x; 2924 readsPolyTrimmedT+=(x>0 ? 1 : 0); 2925 rlen1=r1.length(); 2926 if(rlen1<minlen1){setDiscarded(r1);} 2927 } 2928 } 2929 if(r2!=null){ 2930 if(filterPolyC>0 && r2.countLeft('G')>=filterPolyC){ 2931 setDiscarded(r1); 2932 readsPolyTrimmedT++; 2933 }else if(trimPolyCLeft>0 || trimPolyCRight>0){ 2934 int x=trimPoly(r2, trimPolyCLeft, trimPolyCRight, (byte)'C'); 2935 basesPolyTrimmedT+=x; 2936 readsPolyTrimmedT+=(x>0 ? 1 : 0); 2937 rlen2=r2.length(); 2938 if(rlen2<minlen2){setDiscarded(r2);} 2939 } 2940 } 2941 2942 //Discard reads if too short 2943 if(shouldRemove(r1, r2)){ 2944 basesPolyTrimmedT+=r1.pairLength(); 2945 remove=true; 2946 if(addTrimmedToBad && bad!=null){bad.add(r1);} 2947 } 2948 } 2949 2950 if(!remove && (entropyMask || entropyTrim>0)){ 2951 //Mask/trim entropy 2952 if(isNotDiscarded(r1)){ 2953 int masked=(entropyTrim>0 ? trimLowEntropy(r1, null, eTrackerT) : maskLowEntropy(r1, null, eTrackerT)); 2954 basesEFilteredT+=masked; 2955 readsEFilteredT+=(masked>0 ? 1 : 0); 2956 } 2957 if(isNotDiscarded(r2)){ 2958 int masked=(entropyTrim>0 ? trimLowEntropy(r2, null, eTrackerT) : maskLowEntropy(r2, null, eTrackerT)); 2959 basesEFilteredT+=masked; 2960 readsEFilteredT+=(masked>0 ? 1 : 0); 2961 } 2962 } 2963 2964 if(entropyMark){ 2965 markLowEntropy(r1, eTrackerT); 2966 markLowEntropy(r2, eTrackerT); 2967 } 2968 2969 if(!remove){ 2970 //Do quality trimming 2971 2972 if(qtrimLeft || qtrimRight || trimClip){ 2973 if(r1!=null){ 2974 int x=TrimRead.trimFast(r1, qtrimLeft, qtrimRight, trimq, trimE, 1, trimClip); 2975 basesQTrimmedT+=x; 2976 readsQTrimmedT+=(x>0 ? 1 : 0); 2977 2978 // assert(false) : trimClip+", "+x; 2979 } 2980 if(r2!=null){ 2981 int x=TrimRead.trimFast(r2, qtrimLeft, qtrimRight, trimq, trimE, 1, trimClip); 2982 basesQTrimmedT+=x; 2983 readsQTrimmedT+=(x>0 ? 1 : 0); 2984 } 2985 } 2986 2987 if(isNotDiscarded(r1)){ 2988 int len=r1.length(); 2989 if(len<minlen1 || len>maxReadLength){setDiscarded(r1);} 2990 } 2991 if(isNotDiscarded(r2)){ 2992 int len=r2.length(); 2993 if(len<minlen2 || len>maxReadLength){setDiscarded(r2);} 2994 } 2995 2996 //Discard reads if too short 2997 if(shouldRemove(r1, r2)){ 2998 basesQTrimmedT+=r1.pairLength(); 2999 remove=true; 3000 if(addTrimmedToBad && bad!=null){bad.add(r1);} 3001 } 3002 3003 } 3004 3005 if(!remove){ 3006 //Do quality filtering 3007 3008 //Determine whether to discard the reads based on average quality 3009 if(minAvgQuality>0){ 3010 if(r1!=null && r1.quality!=null && r1.avgQuality(false, minAvgQualityBases)<minAvgQuality){setDiscarded(r1);} 3011 if(r2!=null && r2.quality!=null && r2.avgQuality(false, minAvgQualityBases)<minAvgQuality){setDiscarded(r2);} 3012 } 3013 //Determine whether to discard the reads based on lowest quality base 3014 if(minBaseQuality>0){ 3015 if(r1!=null && r1.quality!=null && r1.minQuality()<minBaseQuality){setDiscarded(r1);} 3016 if(r2!=null && r2.quality!=null && r2.minQuality()<minBaseQuality){setDiscarded(r2);} 3017 } 3018 //Determine whether to discard the reads based on the presence of Ns 3019 if(maxNs>=0){ 3020 if(r1!=null && r1.countUndefined()>maxNs){ 3021 readsNFilteredT++; 3022 basesNFilteredT+=r1.length(); 3023 setDiscarded(r1); 3024 } 3025 if(r2!=null && r2.countUndefined()>maxNs){ 3026 readsNFilteredT++; 3027 basesNFilteredT+=r2.length(); 3028 setDiscarded(r2); 3029 } 3030 } 3031 //Determine whether to discard the reads based on a lack of useful kmers 3032 if(minConsecutiveBases>0){ 3033 if(isNotDiscarded(r1) && !r1.hasMinConsecutiveBases(minConsecutiveBases)){setDiscarded(r1);} 3034 if(isNotDiscarded(r2) && !r2.hasMinConsecutiveBases(minConsecutiveBases)){setDiscarded(r2);} 3035 } 3036 //Determine whether to discard the reads based on minimum base frequency 3037 if(minBaseFrequency>0){ 3038 if(r1!=null && r1.minBaseCount()<minBaseFrequency*r1.length()){setDiscarded(r1);} 3039 if(r2!=null && r2.minBaseCount()<minBaseFrequency*r2.length()){setDiscarded(r2);} 3040 } 3041 3042 //Discard reads if too short 3043 if(shouldRemove(r1, r2)){ 3044 basesQFilteredT+=r1.pairLength(); 3045 readsQFilteredT+=pairCount; 3046 remove=true; 3047 if(addTrimmedToBad && bad!=null){bad.add(r1);} 3048 } 3049 } 3050 3051 if(!remove && calcEntropy && entropyCutoff>=0 && !entropyMask && entropyTrim<1){ 3052 //Test entropy 3053 if(isNotDiscarded(r1) && !eTrackerT.passes(r1.bases, true)){setDiscarded(r1);} 3054 if(isNotDiscarded(r2) && !eTrackerT.passes(r2.bases, true)){setDiscarded(r2);} 3055 3056 if(shouldRemove(r1, r2)){ 3057 basesEFilteredT+=r1.pairLength(); 3058 readsEFilteredT+=pairCount; 3059 remove=true; 3060 if(bad!=null){bad.add(r1);} 3061 } 3062 } 3063 3064 if(!remove && !histogramsBeforeProcessing){ 3065 addToHistograms(r1, r2); 3066 } 3067 3068 if(ross!=null){ 3069 if(isNotDiscarded(r1) && isNullOrDiscarded(r2)){ 3070 Read clone=r1.clone(); 3071 clone.mate=null; 3072 single.add(clone); 3073 }else if(r2!=null && isDiscarded(r1) && isNotDiscarded(r2)){ 3074 Read clone=r2.clone(); 3075 clone.mate=null; 3076 single.add(clone); 3077 } 3078 } 3079 3080 if(remove && !trimFailuresTo1bp){ 3081 //Evict read 3082 removed++; 3083 if(r2!=null){removed++;} 3084 reads.set(i, null); 3085 3086 readsOutmT+=pairCount; 3087 basesOutmT+=r1.pairLength(); 3088 }else{ 3089 if(loglogOut!=null){loglogOut.hash(r1);} 3090 readsOutuT+=pairCount; 3091 basesOutuT+=r1.pairLength(); 3092 } 3093 } 3094 3095 //Send matched list to matched output stream 3096 if(rosb!=null){ 3097 rosb.add(bad, ln.id); 3098 bad.clear(); 3099 } 3100 3101 //Send unmatched list to unmatched output stream 3102 if(ros!=null){ 3103 ros.add((removed>0 ? Tools.condenseNew(reads) : reads), ln.id); //Creates a new list if old one became empty, to prevent shutting down the cris. 3104 } 3105 3106 if(ross!=null){ 3107 ross.add(single, ln.id); 3108 single.clear(); 3109 } 3110 3111 if(maxBasesOutmT>=0 && basesOutmT>=maxBasesOutmT){break;} 3112 if(maxBasesOutuT>=0 && basesOutuT>=maxBasesOutuT){break;} 3113 3114 //Fetch a new read list 3115 cris.returnList(ln); 3116 ln=cris.nextList(); 3117 reads=(ln!=null ? ln.list : null); 3118 } 3119 cris.returnList(ln); 3120 finishedSuccessfully=true; 3121 } 3122 setDiscarded(Read r)3123 private void setDiscarded(Read r){ 3124 if(trimFailuresTo1bp){ 3125 if(r.length()>1){TrimRead.trimByAmount(r, 0, r.length()-1, 1, false);} 3126 }else{ 3127 r.setDiscarded(true); 3128 } 3129 } 3130 isDiscarded(Read r)3131 private boolean isDiscarded(Read r){ 3132 if(r==null){return false;} 3133 if(r.discarded()){return true;} 3134 return trimFailuresTo1bp && r.length()==1; 3135 } 3136 isNullOrDiscarded(Read r)3137 private boolean isNullOrDiscarded(Read r){ 3138 if(r==null){return true;} 3139 if(r.discarded()){return true;} 3140 return trimFailuresTo1bp && r.length()==1; 3141 } 3142 isNotDiscarded(Read r)3143 private boolean isNotDiscarded(Read r){ 3144 if(r==null){return false;} 3145 if(r.discarded()){return false;} 3146 return !(trimFailuresTo1bp && r.length()==1); 3147 } 3148 shouldRemove(Read r1, Read r2)3149 private boolean shouldRemove(Read r1, Read r2){ 3150 return (removePairsIfEitherBad && (isDiscarded(r1) || isDiscarded(r2))) || 3151 (isDiscarded(r1) && isNullOrDiscarded(r2)); 3152 } 3153 3154 /*--------------------------------------------------------------*/ 3155 /*---------------- Helper Methods ----------------*/ 3156 /*--------------------------------------------------------------*/ 3157 addToHistograms(Read r1, Read r2)3158 private void addToHistograms(Read r1, Read r2) { 3159 3160 if(pTrackerT!=null){ 3161 pTrackerT.addPair(r1); 3162 } 3163 3164 if(fixVariants){ 3165 CallVariants.fixVars(r1, varMap, scafMap); 3166 CallVariants.fixVars(r2, varMap, scafMap); 3167 } 3168 3169 if(readstats!=null){ 3170 if(MAKE_QUALITY_HISTOGRAM){readstats.addToQualityHistogram(r1);} 3171 if(MAKE_BASE_HISTOGRAM){readstats.addToBaseHistogram(r1);} 3172 if(MAKE_MATCH_HISTOGRAM){readstats.addToMatchHistogram(r1);} 3173 if(MAKE_QUALITY_ACCURACY){readstats.addToQualityAccuracy(r1);} 3174 3175 if(MAKE_EHIST){readstats.addToErrorHistogram(r1);} 3176 if(MAKE_INDELHIST){readstats.addToIndelHistogram(r1);} 3177 if(MAKE_LHIST){readstats.addToLengthHistogram(r1);} 3178 if(MAKE_GCHIST){readstats.addToGCHistogram(r1);} 3179 if(MAKE_ENTROPYHIST){readstats.addToEntropyHistogram(r1);} 3180 if(MAKE_IDHIST){readstats.addToIdentityHistogram(r1);} 3181 3182 if(MAKE_IHIST){ 3183 SamLine sl1=r1.samline; 3184 if(sl1!=null && !r1.secondary() && sl1.pairnum()==0){ 3185 readstats.addToInsertHistogram(sl1); 3186 } 3187 } 3188 } 3189 3190 if(fixVariants && unfixVariants){ 3191 CallVariants.unfixVars(r1); 3192 CallVariants.unfixVars(r2); 3193 } 3194 } 3195 3196 /** 3197 * Transforms a kmer into all canonical values for a given Hamming distance. 3198 * Returns the related id stored in the tables. 3199 * @param kmer Forward kmer 3200 * @param rkmer Reverse kmer 3201 * @param lengthMask Bitmask with single '1' set to left of kmer 3202 * @param qPos Position of kmer in query 3203 * @param len kmer length 3204 * @param qHDist Hamming distance 3205 * @param sets Kmer hash tables 3206 * @return Value stored in table, or -1 3207 */ getValue(final long kmer, final long rkmer, final long lengthMask, final int qPos, final int len, final int qHDist, final AbstractKmerTable[] sets)3208 private final int getValue(final long kmer, final long rkmer, final long lengthMask, final int qPos, final int len, final int qHDist, final AbstractKmerTable[] sets){ 3209 assert(lengthMask==0 || (kmer<lengthMask && rkmer<lengthMask)) : lengthMask+", "+kmer+", "+rkmer; 3210 if(verbose){outstream.println("getValue()");} 3211 int id=getValueInner(kmer, rkmer, lengthMask, len, qPos, sets); 3212 if(id<1 && qHDist>0){ 3213 final int qHDist2=qHDist-1; 3214 3215 //Sub 3216 for(int j=0; j<symbols && id<1; j++){ 3217 for(int i=0; i<len && id<1; i++){ 3218 final long temp=(kmer&clearMasks[i])|setMasks[j][i]; 3219 // outstream.println(i+", "+j+", "+setMasks[j][i]+", "+qHDist); 3220 if(temp!=kmer){ 3221 long rtemp=rcomp(temp, len); 3222 // assert(lengthMask==0 || (temp<lengthMask && rtemp<lengthMask)) : lengthMask+", "+temp+", "+rtemp+", "+kmer+", "+rkmer+ 3223 // "\n"+len+", "+Long.numberOfTrailingZeros(lengthMask)+"\n"+ 3224 // Long.toBinaryString(lengthMask|0x8000000000000000L)+"\n"+ 3225 // Long.toBinaryString(temp|0x8000000000000000L)+"\n"+ 3226 // Long.toBinaryString(rtemp|0x8000000000000000L); 3227 id=getValue(temp, rtemp, lengthMask, qPos, len, qHDist2, sets); 3228 } 3229 } 3230 } 3231 } 3232 return id; 3233 } 3234 3235 /** 3236 * Transforms a kmer into a canonical value stored in the table and search. 3237 * @param kmer Forward kmer 3238 * @param rkmer Reverse kmer 3239 * @param lengthMask Bitmask with single '1' set to left of kmer 3240 * @param qPos Position of kmer in query 3241 * @param sets Kmer hash tables 3242 * @return Value stored in table 3243 */ getValueInner(final long kmer, final long rkmer, final long lengthMask, final int len, final int qPos, final AbstractKmerTable[] sets)3244 private final int getValueInner(final long kmer, final long rkmer, final long lengthMask, final int len, final int qPos, final AbstractKmerTable[] sets){ 3245 assert(lengthMask==0 || (kmer<lengthMask && rkmer<lengthMask)) : lengthMask+", "+kmer+", "+rkmer; 3246 if(qSkip>1 && (qPos%qSkip!=0)){return -1;} 3247 3248 if(verbose){ 3249 outstream.println("getValueInner(kmer="+AminoAcid.kmerToString(kmer, len)+", rkmer="+AminoAcid.kmerToString(rkmer, len)+", len="+len+", mask="+lengthMask+")"); 3250 outstream.println("getValueInner(kmer="+kmer+", rkmer="+rkmer+", len="+len+", mask="+lengthMask+")"); 3251 } 3252 final long max=(rcomp ? Tools.max(kmer, rkmer) : kmer); 3253 if(verbose){outstream.println("max="+AminoAcid.kmerToString(max, len)+" ("+max+")");} 3254 final long key=(max&middleMask)|lengthMask; 3255 if(verbose){outstream.println("key="+AminoAcid.kmerToString(key, len)+" ("+key+")");} 3256 if(passesSpeed(key)){ 3257 if(verbose){outstream.println("Testing key "+kmerToString(key, len)+" ("+key+")");} 3258 AbstractKmerTable set=sets[(int)(key%WAYS)]; 3259 final int id=set.getValue(key); 3260 if(verbose){outstream.println("getValueInner("+kmerToString(kmer, len)+", "+kmerToString(rkmer, len)+") > "+kmerToString(key, len)+" ("+key+") = "+id);} 3261 return id; 3262 } 3263 if(verbose){outstream.println("Invalid key.");} 3264 return -1; 3265 } 3266 3267 3268 /** 3269 * Counts the number of kmer hits for a read. 3270 * @param r Read to process 3271 * @param sets Kmer tables 3272 * @return Number of hits 3273 */ countSetKmers(final Read r, final AbstractKmerTable[] sets, final int maxBadKmers)3274 private final int countSetKmers(final Read r, final AbstractKmerTable[] sets, final int maxBadKmers){ 3275 if(r==null || r.length()<k || storedKmers<1){return 0;} 3276 if((skipR1 && r.pairnum()==0) || (skipR2 && r.pairnum()==1)){return 0;} 3277 final byte[] bases=r.bases; 3278 long kmer=0; 3279 long rkmer=0; 3280 int found=0; 3281 int len=0; 3282 3283 final int start=(restrictRight<1 ? 0 : Tools.max(0, bases.length-restrictRight)); 3284 final int stop=(restrictLeft<1 ? bases.length : Tools.min(bases.length, restrictLeft)); 3285 3286 /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */ 3287 for(int i=start; i<stop; i++){ 3288 byte b=bases[i]; 3289 long x=symbolToNumber0[b]; 3290 long x2=symbolToComplementNumber0[b]; 3291 kmer=((kmer<<bitsPerBase)|x)&mask; 3292 rkmer=((rkmer>>>bitsPerBase)|(x2<<shift2))&mask; 3293 if(forbidNs && !isFullyDefined(b)){len=0; rkmer=0;}else{len++;} 3294 if(verbose){ 3295 String fwd=new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)); 3296 String rev=AminoAcid.reverseComplementBases(fwd); 3297 String fwd2=kmerToString(kmer, Tools.min(len, k)); 3298 outstream.println("fwd="+fwd+", fwd2="+fwd2+", rev="+rev+", kmer="+kmer+", rkmer="+rkmer); 3299 outstream.println("b="+(char)b+", x="+x+", x2="+x2+", bitsPerBase="+bitsPerBase+", shift2="+shift2); 3300 if(!amino){ 3301 assert(AminoAcid.stringToKmer(fwd)==kmer) : fwd+", "+AminoAcid.stringToKmer(fwd)+", "+kmer+", "+len; 3302 if(len>=k){ 3303 assert(rcomp(kmer, Tools.min(len, k))==rkmer); 3304 assert(rcomp(rkmer, Tools.min(len, k))==kmer); 3305 assert(AminoAcid.kmerToString(kmer, Tools.min(len, k)).equals(fwd)); 3306 assert(AminoAcid.kmerToString(rkmer, Tools.min(len, k)).equals(rev)) : AminoAcid.kmerToString(rkmer, Tools.min(len, k))+" != "+rev+" (rkmer)"; 3307 } 3308 assert(fwd.equalsIgnoreCase(fwd2)) : fwd+", "+fwd2; //may be unsafe 3309 } 3310 outstream.println("Scanning6 i="+i+", len="+len+", kmer="+kmer+", rkmer="+rkmer+", bases="+fwd+", rbases="+rev); 3311 } 3312 if(len>=minlen2 && i>=minlen){ 3313 final int id=getValue(kmer, rkmer, kmask, i, k, qHammingDistance, sets); 3314 if(verbose){outstream.println("Testing kmer "+kmer+"; id="+id);} 3315 if(id>0){ 3316 if(verbose){outstream.println("Found = "+(found+1)+"/"+maxBadKmers);} 3317 if(found==maxBadKmers){ 3318 if(scaffoldReadCountsT!=null){ 3319 scaffoldReadCountsT[id]++; 3320 scaffoldBaseCountsT[id]+=bases.length; 3321 }else{ 3322 scaffoldReadCounts.addAndGet(id, 1); 3323 scaffoldBaseCounts.addAndGet(id, bases.length); 3324 } 3325 if(hitCounts==null){ 3326 return (found=found+1); 3327 }//Early exit, but prevents generation of histogram that goes over maxBadKmers+1. 3328 } 3329 found++; 3330 } 3331 } 3332 } 3333 3334 if(hitCountsT!=null){hitCountsT[Tools.min(found, HITCOUNT_LEN)]++;} 3335 return found; 3336 } 3337 3338 3339 /** 3340 * Counts the number of kmer hits for a read. 3341 * @param r Read to process 3342 * @param sets Kmer tables 3343 * @return Number of hits 3344 */ countCoveredBases(final Read r, final AbstractKmerTable[] sets, final int minCoveredBases)3345 private final int countCoveredBases(final Read r, final AbstractKmerTable[] sets, final int minCoveredBases){ 3346 if(r==null || r.length()<k || storedKmers<1){return 0;} 3347 if((skipR1 && r.pairnum()==0) || (skipR2 && r.pairnum()==1)){return 0;} 3348 final byte[] bases=r.bases; 3349 long kmer=0; 3350 long rkmer=0; 3351 int found=0; 3352 int len=0; 3353 int lastFound=-1; 3354 boolean recorded=false; 3355 3356 final int start=(restrictRight<1 ? 0 : Tools.max(0, bases.length-restrictRight)); 3357 final int stop=(restrictLeft<1 ? bases.length : Tools.min(bases.length, restrictLeft)); 3358 3359 /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */ 3360 for(int i=start; i<stop; i++){ 3361 byte b=bases[i]; 3362 long x=symbolToNumber0[b]; 3363 long x2=symbolToComplementNumber0[b]; 3364 kmer=((kmer<<bitsPerBase)|x)&mask; 3365 rkmer=((rkmer>>>bitsPerBase)|(x2<<shift2))&mask; 3366 if(forbidNs && !isFullyDefined(b)){len=0; rkmer=0;}else{len++;} 3367 if(verbose){outstream.println("Scanning6b i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} 3368 if(len>=minlen2 && i>=minlen){ 3369 final int id=getValue(kmer, rkmer, kmask, i, k, qHammingDistance, sets); 3370 if(verbose){outstream.println("Testing kmer "+kmer+"; id="+id);} 3371 if(id>0){ 3372 3373 int extra=Tools.min(k, i-lastFound); 3374 found+=extra; 3375 lastFound=i; 3376 3377 if(verbose){outstream.println("Found = "+found+"/"+minCoveredBases);} 3378 if(found>=minCoveredBases){ 3379 if(!recorded){ 3380 if(scaffoldReadCountsT!=null){ 3381 scaffoldReadCountsT[id]++; 3382 scaffoldBaseCountsT[id]+=bases.length; 3383 }else{ 3384 scaffoldReadCounts.addAndGet(id, 1); 3385 scaffoldBaseCounts.addAndGet(id, bases.length); 3386 } 3387 } 3388 if(hitCounts==null){ 3389 return found; 3390 } 3391 } 3392 } 3393 } 3394 } 3395 3396 if(hitCountsT!=null){hitCountsT[Tools.min(found, HITCOUNT_LEN)]++;} 3397 return found; 3398 } 3399 3400 /** 3401 * Returns the id of the sequence with the most kmer matches to this read, or -1 if none are over maxBadKmers. 3402 * @param r Read to process 3403 * @param sets Kmer tables 3404 * @return id of best match 3405 */ findBestMatch(final Read r, final AbstractKmerTable[] sets, final int maxBadKmers)3406 private final int findBestMatch(final Read r, final AbstractKmerTable[] sets, final int maxBadKmers){ 3407 idList.size=0; 3408 if(r==null || r.length()<k || storedKmers<1){return -1;} 3409 if((skipR1 && r.pairnum()==0) || (skipR2 && r.pairnum()==1)){return -1;} 3410 final byte[] bases=r.bases; 3411 long kmer=0; 3412 long rkmer=0; 3413 int len=0; 3414 int found=0; 3415 3416 final int start=(restrictRight<1 ? 0 : Tools.max(0, bases.length-restrictRight)); 3417 final int stop=(restrictLeft<1 ? bases.length : Tools.min(bases.length, restrictLeft)); 3418 3419 /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */ 3420 for(int i=start; i<stop; i++){ 3421 byte b=bases[i]; 3422 long x=symbolToNumber0[b]; 3423 long x2=symbolToComplementNumber0[b]; 3424 kmer=((kmer<<bitsPerBase)|x)&mask; 3425 rkmer=((rkmer>>>bitsPerBase)|(x2<<shift2))&mask; 3426 if(forbidNs && !isFullyDefined(b)){len=0; rkmer=0;}else{len++;} 3427 if(verbose){outstream.println("Scanning6 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} 3428 if(len>=minlen2 && i>=minlen){ 3429 if(verbose){outstream.println("Lookup kmer="+AminoAcid.kmerToString(kmer, k)+", rkmer="+AminoAcid.kmerToString(rkmer, k));} 3430 final int id=getValue(kmer, rkmer, kmask, i, k, qHammingDistance, sets); 3431 if(id>0){ 3432 countArray[id]++; 3433 if(countArray[id]==1){idList.add(id);} 3434 found++; 3435 if(verbose){outstream.println("Found = "+found+"/"+maxBadKmers);} 3436 } 3437 } 3438 } 3439 3440 final int id, max; 3441 if(found>maxBadKmers){ 3442 max=condenseLoose(countArray, idList, countList); 3443 int id0=-1; 3444 for(int i=0; i<countList.size; i++){ 3445 if(countList.get(i)==max){ 3446 id0=idList.get(i); break; 3447 } 3448 } 3449 if(rename){rename(r, idList, countList);} 3450 id=id0; 3451 }else{ 3452 max=0; 3453 id=-1; 3454 } 3455 3456 if(found>maxBadKmers){ 3457 if(scaffoldReadCountsT!=null){ 3458 scaffoldReadCountsT[id]++; 3459 scaffoldBaseCountsT[id]+=bases.length; 3460 }else{ 3461 scaffoldReadCounts.addAndGet(id, 1); 3462 scaffoldBaseCounts.addAndGet(id, bases.length); 3463 } 3464 } 3465 3466 if(hitCountsT!=null){hitCountsT[Tools.min(found, HITCOUNT_LEN)]++;} 3467 return id; 3468 } 3469 3470 /** Estimates kmer hit counts for kmers longer than k using consecutive matches 3471 * @param r 3472 * @param sets 3473 * @return Number of sets of consecutive hits of exactly length kbig 3474 */ countSetKmersBig(final Read r, final AbstractKmerTable[] sets, final int maxBadKmers)3475 private final int countSetKmersBig(final Read r, final AbstractKmerTable[] sets, final int maxBadKmers){ 3476 if(r==null || r.length()<kbig || storedKmers<1){return 0;} 3477 if((skipR1 && r.pairnum()==0) || (skipR2 && r.pairnum()==1)){return 0;} 3478 assert(kbig>k); 3479 final int sub=kbig-k-1; 3480 assert(sub>=0) : kbig+", "+sub; 3481 final byte[] bases=r.bases; 3482 long kmer=0; 3483 long rkmer=0; 3484 int found=0; 3485 int len=0; 3486 3487 int bkStart=-1; 3488 int bkStop=-1; 3489 int id=-1, lastId=-1; 3490 3491 final int start=(restrictRight<1 ? 0 : Tools.max(0, bases.length-restrictRight)); 3492 final int stop=(restrictLeft<1 ? bases.length : Tools.min(bases.length, restrictLeft)); 3493 3494 /* Loop through the bases, maintaining a forward and reverse kmer via bitshifts */ 3495 for(int i=start; i<stop; i++){ 3496 byte b=bases[i]; 3497 long x=symbolToNumber0[b]; 3498 long x2=symbolToComplementNumber0[b]; 3499 kmer=((kmer<<bitsPerBase)|x)&mask; 3500 rkmer=((rkmer>>>bitsPerBase)|(x2<<shift2))&mask; 3501 if(forbidNs && !isFullyDefined(b)){len=0; rkmer=0;}else{len++;} 3502 if(verbose){outstream.println("Scanning7 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} 3503 if(len>=minlen2 && i>=minlen){ 3504 id=getValue(kmer, rkmer, kmask, i, k, qHammingDistance, sets); 3505 if(verbose){outstream.println("Testing kmer "+kmer+"; id="+id);} 3506 if(id>0){ 3507 lastId=id; 3508 if(bkStart==-1){bkStart=i;} 3509 bkStop=i; 3510 }else{ 3511 if(bkStart>-1){ 3512 int dif=bkStop-bkStart-sub; 3513 bkStop=bkStart=-1; 3514 if(dif>0){ 3515 int old=found; 3516 found+=dif; 3517 if(found>maxBadKmers && old<=maxBadKmers){ 3518 if(scaffoldReadCountsT!=null){ 3519 scaffoldReadCountsT[lastId]++; 3520 scaffoldBaseCountsT[lastId]+=bases.length; 3521 }else{ 3522 scaffoldReadCounts.addAndGet(lastId, 1); 3523 scaffoldBaseCounts.addAndGet(lastId, bases.length); 3524 } 3525 if(hitCounts==null){ 3526 return found; 3527 }//Early exit, but prevents generation of histogram that goes over maxBadKmers+1. 3528 } 3529 } 3530 } 3531 } 3532 } 3533 } 3534 3535 // This catches the case where valid kmers extend to the end of the read 3536 if(bkStart>-1){ 3537 int dif=bkStop-bkStart-sub; 3538 bkStop=bkStart=-1; 3539 if(dif>0){ 3540 int old=found; 3541 found+=dif; 3542 if(found>maxBadKmers && old<=maxBadKmers){ 3543 if(scaffoldReadCountsT!=null){ 3544 scaffoldReadCountsT[lastId]++; 3545 scaffoldBaseCountsT[lastId]+=bases.length; 3546 }else{ 3547 scaffoldReadCounts.addAndGet(lastId, 1); 3548 scaffoldBaseCounts.addAndGet(lastId, bases.length); 3549 } 3550 } 3551 } 3552 } 3553 3554 if(hitCountsT!=null){hitCountsT[Tools.min(found, HITCOUNT_LEN)]++;} 3555 return found; 3556 } 3557 3558 /** 3559 * Trim a read to remove matching kmers and everything to their left or right. 3560 * @param r Read to process 3561 * @param sets Kmer tables 3562 * @return Number of bases trimmed 3563 */ ktrim(final Read r, final AbstractKmerTable[] sets)3564 private final int ktrim(final Read r, final AbstractKmerTable[] sets){ 3565 assert(ktrimLeft || ktrimRight); 3566 if(r==null || r.length()<Tools.max(1, (useShortKmers ? Tools.min(k, mink) : k)) || storedKmers<1){return 0;} 3567 if((skipR1 && r.pairnum()==0) || (skipR2 && r.pairnum()==1)){return 0;} 3568 if(verbose){outstream.println("KTrimming read "+r.id);} 3569 final byte[] bases=r.bases, quals=r.quality; 3570 long kmer=0; 3571 long rkmer=0; 3572 int found=0; 3573 int len=0; 3574 int id0=-1; //ID of first kmer found. 3575 3576 int minLoc=999999999, minLocExclusive=999999999; 3577 int maxLoc=-1, maxLocExclusive=-1; 3578 final int initialLength=r.length(); 3579 3580 final int start=(restrictRight<1 ? 0 : Tools.max(0, bases.length-restrictRight)); 3581 final int stop=(restrictLeft<1 ? bases.length : Tools.min(bases.length, restrictLeft)); 3582 3583 //Scan for normal kmers 3584 for(int i=start; i<stop; i++){ 3585 byte b=bases[i]; 3586 long x=symbolToNumber0[b]; 3587 long x2=symbolToComplementNumber0[b]; 3588 kmer=((kmer<<bitsPerBase)|x)&mask; 3589 rkmer=((rkmer>>>bitsPerBase)|(x2<<shift2))&mask; 3590 if(forbidNs && !isFullyDefined(b)){len=0; rkmer=0;}else{len++;} 3591 if(verbose){outstream.println("Scanning3 i="+i+", kmer="+kmer+", rkmer="+rkmer+", len="+len+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} 3592 if(len>=minlen2 && i>=minlen){ 3593 final int id=getValue(kmer, rkmer, kmask, i, k, qHammingDistance, sets); 3594 if(id>0){ 3595 if(id0<0){id0=id;} 3596 minLoc=Tools.min(minLoc, i-k+1); 3597 assert(minLoc>=0); 3598 maxLoc=i; 3599 found++; 3600 } 3601 } 3602 } 3603 3604 if(minLoc!=minLocExclusive){minLocExclusive=minLoc+k;} 3605 if(maxLoc!=maxLocExclusive){maxLocExclusive=maxLoc-k;} 3606 3607 //If nothing was found, scan for short kmers. Only used for trimming. 3608 if(useShortKmers && found==0){ 3609 assert(!maskMiddle && middleMask==-1) : maskMiddle+", "+middleMask+", k="+", mink="+mink; 3610 3611 //Look for short kmers on left side 3612 if(ktrimLeft){ 3613 kmer=0; 3614 rkmer=0; 3615 len=0; 3616 final int lim=Tools.min(k, stop); 3617 for(int i=start; i<lim; i++){ 3618 byte b=bases[i]; 3619 long x=symbolToNumber0[b]; 3620 long x2=symbolToComplementNumber0[b]; 3621 kmer=((kmer<<bitsPerBase)|x)&mask; 3622 rkmer=rkmer|(x2<<(bitsPerBase*len)); 3623 len++; 3624 if(verbose){outstream.println("Scanning4 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} 3625 if(len>=mink){ 3626 3627 if(verbose){ 3628 outstream.println("Looking for left kmer "+kmerToString(kmer, len)); 3629 outstream.println("Looking for left rkmer "+kmerToString(rkmer, len)); 3630 } 3631 3632 final int id=getValue(kmer, rkmer, lengthMasks[len], i, len, qHammingDistance2, sets); 3633 if(id>0){ 3634 if(id0<0){id0=id;} 3635 if(verbose){outstream.println("Found "+kmer);} 3636 minLoc=0; 3637 minLocExclusive=Tools.min(minLocExclusive, i+1); 3638 maxLoc=Tools.max(maxLoc, i); 3639 maxLocExclusive=Tools.max(maxLocExclusive, 0); 3640 found++; 3641 } 3642 } 3643 } 3644 } 3645 3646 //Look for short kmers on right side 3647 if(ktrimRight){ 3648 kmer=0; 3649 rkmer=0; 3650 len=0; 3651 final int lim=Tools.max(-1, stop-k); 3652 for(int i=stop-1; i>lim; i--){ 3653 byte b=bases[i]; 3654 long x=symbolToNumber0[b]; 3655 long x2=symbolToComplementNumber0[b]; 3656 kmer=kmer|(x<<(bitsPerBase*len)); 3657 rkmer=((rkmer<<bitsPerBase)|x2)&mask; 3658 len++; 3659 if(verbose){outstream.println("Scanning5 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} 3660 if(len>=mink){ 3661 if(verbose){ 3662 outstream.println("Looking for right kmer "+ 3663 AminoAcid.kmerToString(kmer&~lengthMasks[len], len)+"; value="+toValue(kmer, rkmer, lengthMasks[len])+"; kmask="+lengthMasks[len]); 3664 } 3665 final int id=getValue(kmer, rkmer, lengthMasks[len], i, len, qHammingDistance2, sets); 3666 if(id>0){ 3667 if(id0<0){id0=id;} 3668 if(verbose){outstream.println("Found "+kmer);} 3669 minLoc=i; 3670 minLocExclusive=Tools.min(minLocExclusive, bases.length); 3671 maxLoc=bases.length-1; 3672 maxLocExclusive=Tools.max(maxLocExclusive, i-1); 3673 found++; 3674 } 3675 } 3676 } 3677 } 3678 } 3679 3680 3681 if(verbose){outstream.println("found="+found+", minLoc="+minLoc+", maxLoc="+maxLoc+", minLocExclusive="+minLocExclusive+", maxLocExclusive="+maxLocExclusive);} 3682 3683 if(found==0){return 0;} 3684 assert(found>0) : "Overflow in 'found' variable."; 3685 3686 {//Increment counter for the scaffold whose kmer was first detected 3687 if(scaffoldReadCountsT!=null){ 3688 scaffoldReadCountsT[id0]++; 3689 scaffoldBaseCountsT[id0]+=bases.length; 3690 }else{ 3691 scaffoldReadCounts.addAndGet(id0, 1); 3692 scaffoldBaseCounts.addAndGet(id0, bases.length); 3693 } 3694 } 3695 3696 if(trimPad!=0){ 3697 maxLoc=Tools.mid(0, maxLoc+trimPad, bases.length); 3698 minLoc=Tools.mid(0, minLoc-trimPad, bases.length); 3699 maxLocExclusive=Tools.mid(0, maxLocExclusive+trimPad, bases.length); 3700 minLocExclusive=Tools.mid(0, minLocExclusive-trimPad, bases.length); 3701 } 3702 3703 if(ktrimLeft){ //Trim from the read start to the rightmost kmer base 3704 if(verbose){outstream.println("Left trimming to "+(ktrimExclusive ? maxLocExclusive+1 : maxLoc+1)+", "+0);} 3705 int x=TrimRead.trimToPosition(r, ktrimExclusive ? maxLocExclusive+1 : maxLoc+1, bases.length-1, 1); 3706 if(verbose){outstream.println("Trimmed "+x+" bases: "+new String(r.bases));} 3707 return x; 3708 }else{ //Trim from the leftmost kmer base to the read stop 3709 assert(ktrimRight); 3710 if(verbose){outstream.println("Right trimming to "+0+", "+(ktrimExclusive ? minLocExclusive-1 : minLoc-1));} 3711 int x=TrimRead.trimToPosition(r, 0, ktrimExclusive ? minLocExclusive-1 : minLoc-1, 1); 3712 if(verbose){outstream.println("Trimmed "+x+" bases: "+new String(r.bases));} 3713 return x; 3714 } 3715 } 3716 3717 3718 /** 3719 * Mask a read to cover matching kmers. 3720 * @param r Read to process 3721 * @param sets Kmer tables 3722 * @return Number of bases masked 3723 */ kmask(final Read r, final AbstractKmerTable[] sets)3724 private final int kmask(final Read r, final AbstractKmerTable[] sets){ 3725 assert(ktrimN); 3726 if(r==null || r.length()<Tools.max(1, (useShortKmers ? Tools.min(k, mink) : k)) || storedKmers<1){return 0;} 3727 if((skipR1 && r.pairnum()==0) || (skipR2 && r.pairnum()==1)){return 0;} 3728 if(verbose){outstream.println("KMasking read "+r.id);} 3729 final byte[] bases=r.bases, quals=r.quality; 3730 if(bases==null || bases.length<k){return 0;} 3731 long kmer=0; 3732 long rkmer=0; 3733 int found=0; 3734 int len=0; 3735 int id0=-1; //ID of first kmer found. 3736 3737 final BitSet bs=new BitSet(bases.length+trimPad+1); 3738 if(kmaskFullyCovered){bs.set(0, bases.length);} 3739 3740 final int minus=k-1-trimPad; 3741 final int plus=trimPad+1; 3742 3743 final int start=(restrictRight<1 ? 0 : Tools.max(0, bases.length-restrictRight)); 3744 final int stop=(restrictLeft<1 ? bases.length : Tools.min(bases.length, restrictLeft)); 3745 3746 //Scan for normal kmers 3747 for(int i=start; i<stop; i++){ 3748 byte b=bases[i]; 3749 long x=symbolToNumber0[b]; 3750 long x2=symbolToComplementNumber0[b]; 3751 kmer=((kmer<<bitsPerBase)|x)&mask; 3752 rkmer=((rkmer>>>bitsPerBase)|(x2<<shift2))&mask; 3753 if(forbidNs && !isFullyDefined(b)){len=0; rkmer=0;}else{len++;} 3754 if(verbose){outstream.println("Scanning3 i="+i+", kmer="+kmer+", rkmer="+rkmer+", len="+len+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} 3755 3756 if(i>=minlen){ 3757 final int id; 3758 if(len>=minlen2){ 3759 id=getValue(kmer, rkmer, kmask, i, k, qHammingDistance, sets); 3760 }else{ 3761 id=-1; 3762 } 3763 if(id>0){ 3764 if(id0<0){id0=id;} 3765 if(verbose){ 3766 outstream.println("a: Found "+kmer); 3767 outstream.println("Setting "+Tools.max(0, i-minus)+", "+(i+plus)); 3768 outstream.println("i="+i+", minus="+minus+", plus="+plus+", trimpad="+trimPad+", k="+k); 3769 } 3770 if(!kmaskFullyCovered){bs.set(Tools.max(0, i-minus), i+plus);} 3771 found++; 3772 }else if(kmaskFullyCovered){ 3773 bs.clear(Tools.max(0, i-minus), i+plus); 3774 } 3775 } 3776 } 3777 3778 //If nothing was found, scan for short kmers. 3779 if(useShortKmers){ 3780 assert(!maskMiddle && middleMask==-1) : maskMiddle+", "+middleMask+", k="+", mink="+mink; 3781 3782 //Look for short kmers on left side 3783 { 3784 kmer=0; 3785 rkmer=0; 3786 len=0; 3787 int len2=0; 3788 final int lim=Tools.min(k, stop); 3789 for(int i=start; i<lim; i++){ 3790 byte b=bases[i]; 3791 long x=symbolToNumber0[b]; 3792 long x2=symbolToComplementNumber0[b]; 3793 kmer=((kmer<<bitsPerBase)|x)&mask; 3794 rkmer=rkmer|(x2<<(bitsPerBase*len)); 3795 len++; 3796 len2++; 3797 if(verbose){outstream.println("Scanning4 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} 3798 3799 if(len2>=minminlen){ 3800 if(verbose){ 3801 outstream.println("Looking for left kmer "+kmerToString(kmer, len)); 3802 outstream.println("Looking for left rkmer "+kmerToString(rkmer, len)); 3803 } 3804 final int id; 3805 if(len>=mink){ 3806 id=getValue(kmer, rkmer, lengthMasks[len], i, len, qHammingDistance2, sets); 3807 }else{ 3808 id=-1; 3809 } 3810 if(id>0){ 3811 if(id0<0){id0=id;} 3812 if(verbose){ 3813 outstream.println("b: Found "+kmer); 3814 outstream.println("Setting "+0+", "+Tools.min(bases.length, i+trimPad+1)); 3815 } 3816 if(!kmaskFullyCovered){bs.set(0, Tools.min(bases.length, i+trimPad+1));} 3817 found++; 3818 }else if(kmaskFullyCovered){ 3819 bs.clear(0, Tools.min(bases.length, i+trimPad+1)); 3820 } 3821 } 3822 } 3823 } 3824 3825 //Look for short kmers on right side 3826 { 3827 kmer=0; 3828 rkmer=0; 3829 len=0; 3830 int len2=0; 3831 final int lim=Tools.max(-1, stop-k); 3832 for(int i=stop-1; i>lim; i--){ 3833 byte b=bases[i]; 3834 long x=symbolToNumber0[b]; 3835 long x2=symbolToComplementNumber0[b]; 3836 kmer=kmer|(x<<(bitsPerBase*len)); 3837 rkmer=((rkmer<<bitsPerBase)|x2)&mask; 3838 len++; 3839 len2++; 3840 if(verbose){outstream.println("Scanning5 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} 3841 3842 if(len2>=minminlen){ 3843 if(verbose){ 3844 outstream.println("Looking for right kmer "+ 3845 AminoAcid.kmerToString(kmer&~lengthMasks[len], len)+"; value="+toValue(kmer, rkmer, lengthMasks[len])+"; kmask="+lengthMasks[len]); 3846 } 3847 final int id; 3848 if(len>=mink){ 3849 id=getValue(kmer, rkmer, lengthMasks[len], i, len, qHammingDistance2, sets); 3850 }else{ 3851 id=-1; 3852 } 3853 if(id>0){ 3854 if(id0<0){id0=id;} 3855 if(verbose){ 3856 outstream.println("c: Found "+kmer); 3857 outstream.println("Setting "+Tools.max(0, i-trimPad)+", "+bases.length); 3858 } 3859 if(!kmaskFullyCovered){bs.set(Tools.max(0, i-trimPad), bases.length);} 3860 found++; 3861 }else if(kmaskFullyCovered){ 3862 bs.clear(Tools.max(0, i-trimPad), bases.length); 3863 } 3864 } 3865 } 3866 } 3867 } 3868 3869 3870 if(verbose){outstream.println("found="+found+", bitset="+bs);} 3871 3872 if(found==0){return 0;} 3873 assert(found>0) : "Overflow in 'found' variable."; 3874 3875 {//Increment counter for the scaffold whose kmer was first detected 3876 if(scaffoldReadCountsT!=null){ 3877 scaffoldReadCountsT[id0]++; 3878 scaffoldBaseCountsT[id0]+=bases.length; 3879 }else{ 3880 scaffoldReadCounts.addAndGet(id0, 1); 3881 scaffoldBaseCounts.addAndGet(id0, bases.length); 3882 } 3883 } 3884 // int y=r.countNocalls(); 3885 int cardinality=bs.cardinality(); 3886 // assert(cardinality>0); 3887 3888 //Replace kmer hit zone with the trim symbol 3889 for(int i=0; i<bases.length; i++){ 3890 if(bs.get(i)){ 3891 if(kmaskLowercase){ 3892 bases[i]=(byte)Tools.toLowerCase(bases[i]); 3893 }else{ 3894 bases[i]=trimSymbol; 3895 if(quals!=null && trimSymbol=='N'){quals[i]=0;} 3896 } 3897 } 3898 } 3899 // assert(cardinality==r.countNocalls() || y>0) : cardinality+", "+r.countNocalls()+"\n"+r.length()+"\n"+bs+"\n"+r;//123 3900 return cardinality; 3901 } 3902 3903 3904 /** 3905 * Mask a read to cover matching kmers. 3906 * @param r Read to process 3907 * @param sets Kmer tables 3908 * @return Number of bases masked 3909 */ ksplit(final Read r, final AbstractKmerTable[] sets)3910 private final boolean ksplit(final Read r, final AbstractKmerTable[] sets){ 3911 if(r==null || r.length()<Tools.max(1, (useShortKmers ? Tools.min(k, mink) : k)) || storedKmers<1){return false;} 3912 assert(r.mate==null) : "Kmer splitting should only be performed on unpaired reads."; 3913 assert(ksplit); 3914 if(verbose){outstream.println("KSplitting read "+r.id);} 3915 final byte[] bases=r.bases, quals=r.quality; 3916 if(bases==null || bases.length<k){return false;} 3917 long kmer=0; 3918 long rkmer=0; 3919 long found=0; 3920 int len=0; 3921 int id0=-1; //ID of first kmer found. 3922 int leftmost=Integer.MAX_VALUE, rightmost=-1; 3923 3924 final int minus=k-1-trimPad; 3925 final int plus=trimPad; 3926 3927 final int start=(restrictRight<1 ? 0 : Tools.max(0, bases.length-restrictRight)); 3928 final int stop=(restrictLeft<1 ? bases.length : Tools.min(bases.length, restrictLeft)); 3929 3930 //Scan for normal kmers 3931 for(int i=start; i<stop; i++){ 3932 byte b=bases[i]; 3933 long x=symbolToNumber0[b]; 3934 long x2=symbolToComplementNumber0[b]; 3935 kmer=((kmer<<bitsPerBase)|x)&mask; 3936 rkmer=((rkmer>>>bitsPerBase)|(x2<<shift2))&mask; 3937 if(forbidNs && !isFullyDefined(b)){len=0; rkmer=0;}else{len++;} 3938 if(verbose){outstream.println("Scanning3 i="+i+", kmer="+kmer+", rkmer="+rkmer+", len="+len+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} 3939 3940 if(i>=minlen){ 3941 final int id; 3942 if(len>=minlen2){ 3943 id=getValue(kmer, rkmer, kmask, i, k, qHammingDistance, sets); 3944 }else{ 3945 id=-1; 3946 } 3947 if(id>0){ 3948 if(id0<0){id0=id;} 3949 if(verbose){ 3950 outstream.println("a: Found "+kmer); 3951 outstream.println("Setting "+Tools.max(0, i-minus)+", "+(i+plus)); 3952 outstream.println("i="+i+", minus="+minus+", plus="+plus+", trimpad="+trimPad+", k="+k); 3953 } 3954 leftmost=Tools.min(leftmost, Tools.max(0, i-minus)); 3955 rightmost=Tools.max(rightmost, i+plus); 3956 found++; 3957 } 3958 } 3959 } 3960 3961 //If nothing was found, scan for short kmers. 3962 if(useShortKmers && id0==-1){ 3963 assert(!maskMiddle && middleMask==-1) : maskMiddle+", "+middleMask+", k="+", mink="+mink; 3964 3965 //Look for short kmers on right side 3966 { 3967 kmer=0; 3968 rkmer=0; 3969 len=0; 3970 int len2=0; 3971 final int lim=Tools.max(-1, stop-k); 3972 for(int i=stop-1; i>lim; i--){ 3973 byte b=bases[i]; 3974 long x=symbolToNumber0[b]; 3975 long x2=symbolToComplementNumber0[b]; 3976 kmer=kmer|(x<<(bitsPerBase*len)); 3977 rkmer=((rkmer<<bitsPerBase)|x2)&mask; 3978 len++; 3979 len2++; 3980 if(verbose){outstream.println("Scanning5 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} 3981 3982 if(len2>=minminlen){ 3983 if(verbose){ 3984 outstream.println("Looking for right kmer "+ 3985 AminoAcid.kmerToString(kmer&~lengthMasks[len], len)+"; value="+toValue(kmer, rkmer, lengthMasks[len])+"; kmask="+lengthMasks[len]); 3986 } 3987 final int id; 3988 if(len>=mink){ 3989 id=getValue(kmer, rkmer, lengthMasks[len], i, len, qHammingDistance2, sets); 3990 }else{ 3991 id=-1; 3992 } 3993 if(id>0){ 3994 if(id0<0){id0=id;} 3995 if(verbose){ 3996 outstream.println("b: Found "+kmer); 3997 outstream.println("Setting "+Tools.max(0, i-trimPad)+", "+bases.length); 3998 } 3999 leftmost=Tools.min(leftmost, Tools.max(0, i-trimPad)); 4000 rightmost=bases.length-1; 4001 found++; 4002 } 4003 } 4004 } 4005 } 4006 4007 //Look for short kmers on left side 4008 if(id0==-1){ 4009 kmer=0; 4010 rkmer=0; 4011 len=0; 4012 int len2=0; 4013 final int lim=Tools.min(k, stop); 4014 for(int i=start; i<lim; i++){ 4015 byte b=bases[i]; 4016 long x=symbolToNumber0[b]; 4017 long x2=symbolToComplementNumber0[b]; 4018 kmer=((kmer<<bitsPerBase)|x)&mask; 4019 rkmer=rkmer|(x2<<(bitsPerBase*len)); 4020 len++; 4021 len2++; 4022 if(verbose){outstream.println("Scanning4 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} 4023 4024 if(len2>=minminlen){ 4025 if(verbose){ 4026 outstream.println("Looking for left kmer "+kmerToString(kmer, len)); 4027 outstream.println("Looking for left rkmer "+kmerToString(rkmer, len)); 4028 } 4029 final int id; 4030 if(len>=mink){ 4031 id=getValue(kmer, rkmer, lengthMasks[len], i, len, qHammingDistance2, sets); 4032 }else{ 4033 id=-1; 4034 } 4035 if(id>0){ 4036 if(id0<0){id0=id;} 4037 if(verbose){ 4038 outstream.println("c: Found "+kmer); 4039 outstream.println("Setting "+0+", "+(i+trimPad)); 4040 } 4041 leftmost=0; 4042 rightmost=Tools.max(rightmost, i+trimPad); 4043 found++; 4044 } 4045 } 4046 } 4047 } 4048 } 4049 4050 4051 if(verbose){outstream.println("found="+found);} 4052 4053 if(found==0){return false;} 4054 4055 {//Increment counter for the scaffold whose kmer was first detected 4056 if(scaffoldReadCountsT!=null){ 4057 scaffoldReadCountsT[id0]++; 4058 scaffoldBaseCountsT[id0]+=bases.length; 4059 }else{ 4060 scaffoldReadCounts.addAndGet(id0, 1); 4061 scaffoldBaseCounts.addAndGet(id0, bases.length); 4062 } 4063 } 4064 4065 if(leftmost==0){ 4066 TrimRead.trimToPosition(r, rightmost+1, bases.length-1, 1); 4067 return false; 4068 }else if(rightmost==bases.length-1){ 4069 TrimRead.trimToPosition(r, 0, leftmost-1, 1); 4070 return false; 4071 }else{ 4072 Read r2=r.subRead(rightmost+1, bases.length-1); 4073 TrimRead.trimToPosition(r, 0, leftmost-1, 1); 4074 r.mate=r2; 4075 r2.mate=r; 4076 r2.setPairnum(1); 4077 return true; 4078 } 4079 } 4080 4081 /** 4082 * @param r 4083 * @param idList 4084 * @param countList 4085 */ rename(Read r, IntList idList, IntList countList)4086 private void rename(Read r, IntList idList, IntList countList) { 4087 if(r==null || idList.size<1){return;} 4088 StringBuilder sb=new StringBuilder(); 4089 if(r.id==null){sb.append(r.numericID);} 4090 else{sb.append(r.id);} 4091 for(int i=0; i<idList.size; i++){ 4092 int id=idList.get(i); 4093 int count=countList.get(i); 4094 sb.append('\t'); 4095 sb.append(scaffoldNames.get(id)); 4096 sb.append('='); 4097 sb.append(count); 4098 } 4099 r.id=sb.toString(); 4100 } 4101 4102 /** 4103 * Pack a list of counts from an array to an IntList. 4104 * @param loose Counter array 4105 * @param packed Unique values 4106 * @param counts Counts of values 4107 * @return 4108 */ condenseLoose(int[] loose, IntList packed, IntList counts)4109 private int condenseLoose(int[] loose, IntList packed, IntList counts){ 4110 counts.size=0; 4111 if(packed.size<1){return 0;} 4112 4113 int max=0; 4114 for(int i=0; i<packed.size; i++){ 4115 final int p=packed.get(i); 4116 final int c=loose[p]; 4117 counts.add(c); 4118 loose[p]=0; 4119 max=Tools.max(max, c); 4120 } 4121 return max; 4122 } 4123 expectedErrors(Read r1, Read r2)4124 private float expectedErrors(Read r1, Read r2){ 4125 float a=(r1==null ? 0 : r1.expectedErrors(false, -1)); 4126 float b=(r2==null ? 0 : r2.expectedErrors(false, -1)); 4127 return Tools.max(a, b); 4128 } 4129 4130 /*--------------------------------------------------------------*/ 4131 /*---------------- Entropy Methods ----------------*/ 4132 /*--------------------------------------------------------------*/ 4133 maskLowEntropy(final Read r, BitSet bs, EntropyTracker et)4134 private int maskLowEntropy(final Read r, BitSet bs, EntropyTracker et){ 4135 final int window=et.windowBases(); 4136 if(r==null || r.length()<window){return 0;} 4137 final byte[] bases=r.bases; 4138 if(bs==null){bs=new BitSet(r.length());} 4139 else{bs.clear();} 4140 4141 et.clear(); 4142 for(int i=0, min=window-1; i<bases.length; i++){ 4143 et.add(bases[i]); 4144 if(i>=min && et.ns()<1 && !et.passes()){bs.set(et.leftPos(), et.rightPos()+1);} 4145 } 4146 4147 return maskFromBitset(r, bs, entropyMaskLowercase); 4148 } 4149 trimLowEntropy(final Read r, BitSet bs, EntropyTracker et)4150 private int trimLowEntropy(final Read r, BitSet bs, EntropyTracker et){ 4151 final int window=et.windowBases(); 4152 System.err.println("Trimming "+r.id+", len "+r.length()+", window "+window); 4153 if(r==null || r.length()<window){return 0;} 4154 final byte[] bases=r.bases; 4155 if(bs==null){bs=new BitSet(r.length());} 4156 else{bs.clear();} 4157 4158 et.clear(); 4159 for(int i=0, min=window-1; i<bases.length; i++){ 4160 et.add(bases[i]); 4161 if(i>=min && et.ns()<1 && !et.passes()){bs.set(et.leftPos(), et.rightPos()+1);} 4162 } 4163 4164 //Now, trim from bitset - which could be spun out into a function 4165 int left=0, right=0; 4166 final int len=bases.length; 4167 for(int i=0; i<len; i++){ 4168 if(bs.get(i)){left++;} 4169 else {break;} 4170 } 4171 for(int i=len-1; i>=0; i--){ 4172 if(bs.get(i)){right++;} 4173 else {break;} 4174 } 4175 // System.err.println(new String(bases)+"\nleft="+left+", right="+right); 4176 if(left==0 && right==0){return 0;} 4177 return TrimRead.trimByAmount(r, left, right, 1); 4178 } 4179 markLowEntropy(final Read r, EntropyTracker et)4180 private void markLowEntropy(final Read r, EntropyTracker et){ 4181 final int window=et.windowBases(); 4182 if(r==null || r.length()<window){return;} 4183 final byte[] bases=r.bases; 4184 4185 float[] values=new float[r.length()]; 4186 Arrays.fill(values, 1); 4187 4188 et.clear(); 4189 for(int i=0, min=window-1; i<bases.length; i++){ 4190 et.add(bases[i]); 4191 if(i>=min && et.ns()<1){ 4192 float e=et.calcEntropy(); 4193 for(int j=et.leftPos(), max=et.rightPos(); j<=max; j++){ 4194 values[j]=Tools.min(e, values[j]); 4195 } 4196 } 4197 } 4198 4199 if(r.quality==null){ 4200 r.quality=new byte[r.length()]; 4201 } 4202 for(int i=0; i<values.length; i++){ 4203 byte q=(byte)(values[i]*41); 4204 r.quality[i]=q; 4205 } 4206 } 4207 maskFromBitset(final Read r, final BitSet bs, final boolean lowercase)4208 private int maskFromBitset(final Read r, final BitSet bs, final boolean lowercase){ 4209 final byte[] bases=r.bases; 4210 final byte[] quals=r.quality; 4211 int sum=0; 4212 if(!lowercase){ 4213 for(int i=bs.nextSetBit(0); i>=0; i=bs.nextSetBit(i+1)){ 4214 if(bases[i]!='N'){ 4215 sum++; 4216 bases[i]='N'; 4217 if(quals!=null){quals[i]=0;} 4218 } 4219 } 4220 }else{ 4221 for(int i=bs.nextSetBit(0); i>=0; i=bs.nextSetBit(i+1)){ 4222 if(!Tools.isLowerCase(bases[i])){ 4223 if(bases[i]!='N'){sum++;} 4224 bases[i]=(byte)Tools.toLowerCase(bases[i]); 4225 //Don't change quality 4226 } 4227 } 4228 } 4229 return sum; 4230 } 4231 passesVariantFilter(Read r)4232 public final boolean passesVariantFilter(Read r){ 4233 if(!r.mapped() || r.bases==null || r.samline==null || r.match==null){return true;} 4234 //TODO: Add Vars as well, like in FilterSam 4235 if(Read.countSubs(r.match)<=maxBadSubs){return true;} 4236 ArrayList<Var> list=CallVariants.findUniqueSubs(r, r.samline, varMap, scafMap, maxBadSubAlleleDepth, maxBadAlleleFraction, minBadSubReadDepth, minBadSubEDist); 4237 return list==null || list.size()<=maxBadSubs; 4238 } 4239 4240 /*--------------------------------------------------------------*/ 4241 4242 /** Input read stream */ 4243 private final ConcurrentReadInputStream cris; 4244 /** Output read streams */ 4245 private final ConcurrentReadOutputStream ros, rosb, ross; 4246 4247 private final FlowcellCoordinate flowCoords; 4248 4249 private final ReadStats readstats; 4250 private final int[] overlapVector; 4251 private final int[] countArray; 4252 4253 private final IntList idList; 4254 private final IntList countList; 4255 4256 //These "*T" fields are used to store counts on a per-thread basis. 4257 4258 long[] hitCountsT; 4259 long[] scaffoldReadCountsT; 4260 long[] scaffoldBaseCountsT; 4261 4262 final EntropyTracker eTrackerT; 4263 final PolymerTracker pTrackerT; 4264 4265 private float[] aprob, bprob; 4266 4267 long readsInT=0; 4268 long basesInT=0; 4269 long readsOutuT=0; 4270 long basesOutuT=0; 4271 4272 long readsOutmT=0; 4273 long basesOutmT=0; 4274 4275 final long maxBasesOutmT; 4276 final long maxBasesOutuT; 4277 4278 long readsQTrimmedT=0; 4279 long basesQTrimmedT=0; 4280 long readsFTrimmedT=0; 4281 long basesFTrimmedT=0; 4282 long readsQFilteredT=0; 4283 long basesQFilteredT=0; 4284 long readsNFilteredT=0; 4285 long basesNFilteredT=0; 4286 long readsEFilteredT=0; 4287 long basesEFilteredT=0; 4288 long readsPolyTrimmedT=0; 4289 long basesPolyTrimmedT=0; 4290 4291 long readsKTrimmedT=0; 4292 long basesKTrimmedT=0; 4293 long readsKFilteredT=0; 4294 long basesKFilteredT=0; 4295 4296 long readsTrimmedBySwiftT=0; 4297 long basesTrimmedBySwiftT=0; 4298 4299 long readsTrimmedByOverlapT=0; 4300 long basesTrimmedByOverlapT=0; 4301 4302 long badGcBasesT=0; 4303 long badGcReadsT=0; 4304 4305 long badHeaderBasesT=0; 4306 long badHeaderReadsT=0; 4307 4308 boolean finishedSuccessfully=false; 4309 } 4310 4311 /*--------------------------------------------------------------*/ 4312 /*---------------- Static Methods ----------------*/ 4313 /*--------------------------------------------------------------*/ 4314 4315 /** Current available memory */ freeMemory()4316 private static final long freeMemory(){ 4317 Runtime rt=Runtime.getRuntime(); 4318 return rt.freeMemory(); 4319 } 4320 4321 /** 4322 * Transforms a kmer into a canonical value stored in the table. Expected to be inlined. 4323 * @param kmer Forward kmer 4324 * @param rkmer Reverse kmer 4325 * @param lengthMask Bitmask with single '1' set to left of kmer 4326 * @return Canonical value 4327 */ toValue(long kmer, long rkmer, long lengthMask)4328 final long toValue(long kmer, long rkmer, long lengthMask){ 4329 assert(lengthMask==0 || (kmer<lengthMask && rkmer<lengthMask)) : 4330 "\n"+Long.toBinaryString(lengthMask)+ 4331 "\n"+Long.toBinaryString(kmer)+ 4332 "\n"+Long.toBinaryString(rkmer)+ 4333 "\n"+Long.toBinaryString(rcomp(kmer, k)); 4334 if(verbose){outstream.println("toValue("+AminoAcid.kmerToString(kmer, k)+", "+AminoAcid.kmerToString(rkmer, k)+", "+lengthMask+")");} 4335 final long value=(rcomp ? Tools.max(kmer, rkmer) : kmer); 4336 if(verbose){outstream.println("value="+AminoAcid.kmerToString(value, k)+" = "+value);} 4337 final long ret=(value&middleMask)|lengthMask; 4338 if(verbose){outstream.println("ret="+AminoAcid.kmerToString(ret, k)+" = "+ret);} 4339 return ret; 4340 } 4341 rcomp(long kmer, int len)4342 final long rcomp(long kmer, int len){ 4343 return amino ? kmer : AminoAcid.reverseComplementBinaryFast(kmer, len); 4344 } 4345 passesSpeed(long key)4346 final boolean passesSpeed(long key){ 4347 return speed<1 || ((key&Long.MAX_VALUE)%17)>=speed; 4348 } 4349 failsSpeed(long key)4350 final boolean failsSpeed(long key){ 4351 return speed>0 && ((key&Long.MAX_VALUE)%17)<speed; 4352 } 4353 trimPolyA(final Read r, final int minPoly)4354 public static int trimPolyA(final Read r, final int minPoly){ 4355 assert(minPoly>0); 4356 if(r==null || r.length()<minPoly){return 0;} 4357 4358 int left=Tools.max(r.countLeft((byte)'A'), r.countLeft((byte)'T')); 4359 int right=Tools.max(r.countRight((byte)'A'), r.countRight((byte)'T')); 4360 4361 if(left<minPoly){left=0;} 4362 if(right<minPoly){right=0;} 4363 int trimmed=0; 4364 if(left>0 || right>0){ 4365 trimmed=TrimRead.trimByAmount(r, left, right, 1); 4366 } 4367 return trimmed; 4368 } 4369 trimPoly(final Read r, final int minPolyLeft, final int minPolyRight, final byte c)4370 public static int trimPoly(final Read r, final int minPolyLeft, final int minPolyRight, final byte c){ 4371 assert(minPolyLeft>0 || minPolyRight>0); 4372 if(r==null){return 0;} 4373 4374 int left=minPolyLeft>0 ? r.countLeft(c) : 0; 4375 int right=minPolyRight>0 ? r.countRight(c) : 0; 4376 4377 if(left<minPolyLeft){left=0;} 4378 if(right<minPolyRight){right=0;} 4379 int trimmed=0; 4380 if(left>0 || right>0){ 4381 trimmed=TrimRead.trimByAmount(r, left, right, 1); 4382 } 4383 return trimmed; 4384 } 4385 trimSwift(Read r)4386 private static int trimSwift(Read r){ 4387 int left=0, right=0, trimmed=0; 4388 if(r.pairnum()==0){ 4389 for(int i=r.length()-1; i>=0; i--){ 4390 byte b=r.bases[i]; 4391 if(b=='C' || b=='T' || b=='N'){right++;} 4392 else{break;} 4393 } 4394 4395 }else{ 4396 for(int i=0; i<r.length(); i++){ 4397 byte b=r.bases[i]; 4398 if(b=='G' || b=='A' || b=='N'){left++;} 4399 else{break;} 4400 } 4401 } 4402 if(left>0 || right>0){ 4403 trimmed=TrimRead.trimByAmount(r, left, right, 1); 4404 } 4405 return trimmed; 4406 } 4407 parseEnd(String s)4408 private static int parseEnd(String s){ 4409 if(s==null){return RIGHTLEFT;} 4410 if(s.equalsIgnoreCase("right") || s.equalsIgnoreCase("r")){return RIGHT;} 4411 if(s.equalsIgnoreCase("left") || s.equalsIgnoreCase("l")){return LEFT;} 4412 if(s.equalsIgnoreCase("rl") || s.equalsIgnoreCase("lr") || s.equalsIgnoreCase("both") || s.equalsIgnoreCase("b")){return RIGHTLEFT;} 4413 return Parse.parseBoolean(s) ? RIGHTLEFT : 0; 4414 } 4415 4416 /*--------------------------------------------------------------*/ 4417 /*---------------- Fields ----------------*/ 4418 /*--------------------------------------------------------------*/ 4419 4420 //TODO: Document 4421 boolean silent=false; 4422 boolean json=false; 4423 4424 boolean swift=false; //https://issues.jgi-psf.org/browse/AUTOQC-2193 4425 4426 /** For calculating kmer cardinality in input */ 4427 final CardinalityTracker loglogIn; 4428 /** For calculating kmer cardinality in output */ 4429 final CardinalityTracker loglogOut; 4430 /** Requires (and sets) cardinality tracking. This is for input kmers. */ 4431 String khistIn=null; 4432 /** Requires (and sets) cardinality tracking. This is for output kmers. */ 4433 String khistOut=null; 4434 4435 /** Has this class encountered errors while processing? */ 4436 public boolean errorState=false; 4437 4438 /** Fraction of available memory preallocated to arrays */ 4439 private double preallocFraction=1.0; 4440 /** Initial size of data structures */ 4441 private int initialSize=-1; 4442 4443 /** Hold kmers. A kmer X such that X%WAYS=Y will be stored in keySets[Y] */ 4444 final AbstractKmerTable[] keySets; 4445 /** A scaffold's name is stored at scaffoldNames.get(id). 4446 * scaffoldNames[0] is reserved, so the first id is 1. */ 4447 final ArrayList<String> scaffoldNames=new ArrayList<String>(); 4448 /** Names of reference files (refNames[0] is valid). */ 4449 private ArrayList<String> refNames=new ArrayList<String>(); 4450 private final ArrayList<String> altRefNames=new ArrayList<String>(); 4451 /** Number of scaffolds per reference. */ 4452 private int[] refScafCounts; 4453 /** scaffoldCounts[id] stores the number of reads with kmer matches to that scaffold */ 4454 AtomicLongArray scaffoldReadCounts; 4455 /** scaffoldBaseCounts[id] stores the number of bases with kmer matches to that scaffold */ 4456 AtomicLongArray scaffoldBaseCounts; 4457 /** Set to false to force threads to share atomic counter arrays. */ 4458 private boolean ALLOW_LOCAL_ARRAYS=true; 4459 /** scaffoldLengths[id] stores the length of that scaffold */ 4460 private IntList scaffoldLengths=new IntList(); 4461 /** hitCounts[x] stores the number of reads with exactly x kmer matches */ 4462 long[] hitCounts; 4463 /** Array of reference files from which to load kmers */ 4464 private String[] ref=null; 4465 /** Alternate reference to be used if main reference has no kmers */ 4466 private String[] altref=null; 4467 /** Array of literal strings from which to load kmers */ 4468 private String[] literal=null; 4469 /** Optional reference for sam file */ 4470 private String samref=null; 4471 4472 /** Input reads */ 4473 private String in1=null, in2=null; 4474 /** Input FileFormats */ 4475 private final FileFormat ffin1, ffin2; 4476 /** Input qual files */ 4477 private String qfin1=null, qfin2=null; 4478 /** Output qual files */ 4479 private String qfout1=null, qfout2=null; 4480 /** Output reads (unmatched and at least minlen) */ 4481 private String out1=null, out2=null; 4482 /** Output reads (matched or shorter than minlen) */ 4483 private String outb1=null, outb2=null; 4484 /** Output FileFormats */ 4485 private final FileFormat ffout1, ffout2, ffoutb1, ffoutb2, ffouts; 4486 /** Output reads whose mate was discarded */ 4487 private String outsingle=null; 4488 /** Statistics output files */ 4489 private String outstats=null, outrqc=null, outrpkm=null, outrefstats=null, polymerStatsFile=null; 4490 @Deprecated 4491 /** duk-style statistics */ 4492 private String outduk=null; 4493 4494 final boolean tossJunk; 4495 4496 /** Dump kmers here. */ 4497 private String dump=null; 4498 4499 /** Quit after this many bases written to outm */ 4500 long maxBasesOutm=-1; 4501 /** Quit after this many bases written to outu */ 4502 long maxBasesOutu=-1; 4503 4504 /** Maximum input reads (or pairs) to process. Does not apply to references. -1 means unlimited. */ 4505 private long maxReads=-1; 4506 /** Process this fraction of input reads. */ 4507 private float samplerate=1f; 4508 /** Set samplerate seed to this value. */ 4509 private long sampleseed=-1; 4510 4511 /** Output reads in input order. May reduce speed. */ 4512 private final boolean ordered; 4513 /** Attempt to match kmers shorter than normal k on read ends when doing kTrimming. */ 4514 boolean useShortKmers=false; 4515 /** Make the middle base in a kmer a wildcard to improve sensitivity */ 4516 boolean maskMiddle=true; 4517 4518 /** Store reference kmers with up to this many substitutions */ 4519 int hammingDistance=0; 4520 /** Search for query kmers with up to this many substitutions */ 4521 int qHammingDistance=0; 4522 /** Store reference kmers with up to this many edits (including indels) */ 4523 int editDistance=0; 4524 /** Store short reference kmers with up to this many substitutions */ 4525 int hammingDistance2=-1; 4526 /** Search for short query kmers with up to this many substitutions */ 4527 int qHammingDistance2=-1; 4528 /** Store short reference kmers with up to this many edits (including indels) */ 4529 int editDistance2=-1; 4530 /** Never skip more than this many consecutive kmers when hashing reference. */ 4531 int maxSkip=1; 4532 /** Always skip at least this many consecutive kmers when hashing reference. 4533 * 1 means every kmer is used, 2 means every other, etc. */ 4534 int minSkip=1; 4535 4536 /** Trim this much extra around matched kmers */ 4537 int trimPad; 4538 4539 /*--------------------------------------------------------------*/ 4540 /*---------------- Flowcell Filtering ----------------*/ 4541 /*--------------------------------------------------------------*/ 4542 4543 //TODO: Document 4544 int xMinLoc=-1; 4545 int yMinLoc=-1; 4546 int xMaxLoc=-1; 4547 int yMaxLoc=-1; 4548 final boolean locationFilter; 4549 4550 /*--------------------------------------------------------------*/ 4551 /*---------------- Variant-Related ----------------*/ 4552 /*--------------------------------------------------------------*/ 4553 4554 //TODO: Document 4555 private String varFile=null; 4556 private String vcfFile=null; 4557 private VarMap varMap=null; 4558 // private HashSet<VarKey> varKeySet=null; 4559 private ScafMap scafMap=null; 4560 private boolean fixVariants=false; 4561 private boolean unfixVariants=true; 4562 4563 /** Optional file for quality score recalibration */ 4564 private String samFile=null; 4565 4566 /** Filter reads with unsupported substitutions */ 4567 private boolean filterVars=false; 4568 /** Maximum allowed unsupported substitutions in a read */ 4569 private int maxBadSubs=2; 4570 /** Maximum variant depth for a variant to be considered unsupported */ 4571 private int maxBadSubAlleleDepth=1; 4572 /** Minimum read depth for a variant to be considered unsupported */ 4573 private int minBadSubReadDepth=2; 4574 //TODO 4575 private int minBadSubEDist=0; 4576 //TODO 4577 private float maxBadAlleleFraction=0; 4578 4579 /*--------------------------------------------------------------*/ 4580 /*---------------- Entropy Fields ----------------*/ 4581 /*--------------------------------------------------------------*/ 4582 4583 /** Minimum entropy to be considered "complex", on a scale of 0-1 */ 4584 float entropyCutoff=-1; 4585 /** Mask entropy with a highpass filter */ 4586 boolean entropyHighpass=true; 4587 4588 /** Change the quality scores to be proportional to the entropy */ 4589 boolean entropyMark=false; 4590 /** Mask low-entropy areas (e.g., with N) */ 4591 boolean entropyMask=false; 4592 /** Trim only trailing or leading low-entropy areas, ignoring middle areas */ 4593 int entropyTrim=0; 4594 /** Convert low-entropy areas to lower case */ 4595 boolean entropyMaskLowercase=false; 4596 4597 /** Perform entropy calculation */ 4598 final boolean calcEntropy; 4599 4600 4601 /*--------------------------------------------------------------*/ 4602 /*---------------- Statistics ----------------*/ 4603 /*--------------------------------------------------------------*/ 4604 4605 /** Stores JSON output */ 4606 JsonObject jsonStats; 4607 4608 long readsIn=0; 4609 long basesIn=0; 4610 long readsOut=0; 4611 long basesOut=0; 4612 4613 long readsQTrimmed=0; 4614 long basesQTrimmed=0; 4615 long readsFTrimmed=0; 4616 long basesFTrimmed=0; 4617 long readsQFiltered=0; 4618 long basesQFiltered=0; 4619 long readsEFiltered=0; 4620 long basesEFiltered=0; 4621 long readsNFiltered=0; 4622 long basesNFiltered=0; 4623 4624 long readsPolyTrimmed=0; 4625 long basesPolyTrimmed=0; 4626 4627 long readsKTrimmed=0; 4628 long basesKTrimmed=0; 4629 long readsKFiltered=0; 4630 long basesKFiltered=0; 4631 4632 long badGcReads; 4633 long badGcBases; 4634 4635 long badHeaderReads=0; 4636 long badHeaderBases=0; 4637 4638 long readsTrimmedByOverlap; 4639 long basesTrimmedByOverlap; 4640 4641 long readsTrimmedBySwift; 4642 long basesTrimmedBySwift; 4643 4644 long refReads=0; 4645 long refBases=0; 4646 long refKmers=0; 4647 4648 // public long modsum=0; //123 4649 4650 long storedKmers=0; 4651 4652 /*--------------------------------------------------------------*/ 4653 /*---------------- Homopolymers ----------------*/ 4654 /*--------------------------------------------------------------*/ 4655 4656 boolean countPolymers=false; 4657 byte polymerChar1=-1; 4658 byte polymerChar2=-1; 4659 /** Minimum length to consider a homopolymer, for the purpose of statistics */ 4660 int polymerLength=20; 4661 4662 /** Tracks homopolymer statistics */ 4663 PolymerTracker pTracker; 4664 4665 /*--------------------------------------------------------------*/ 4666 /*---------------- Final Primitives ----------------*/ 4667 /*--------------------------------------------------------------*/ 4668 4669 /** Don't look for kmers in read 1 */ 4670 final boolean skipR1; 4671 /** Don't look for kmers in read 2 */ 4672 final boolean skipR2; 4673 /** Correct errors via read overlap */ 4674 final boolean ecc; 4675 /** True if a ReadStats object is being used for collecting data */ 4676 final boolean makeReadStats; 4677 4678 /** Look for reverse-complements as well as forward kmers. Default: true */ 4679 final boolean rcomp; 4680 /** Don't allow a read 'N' to match a reference 'A'. 4681 * Reduces sensitivity when hdist>0 or edist>0. Default: false. */ 4682 final boolean forbidNs; 4683 /** AND bitmask with 0's at the middle base */ 4684 final long middleMask; 4685 /** Use HashForest data structure */ 4686 private final boolean useForest; 4687 /** Use KmerTable data structure */ 4688 private final boolean useTable; 4689 /** Use HashArray data structure (default) */ 4690 private final boolean useArray; 4691 4692 /** Normal kmer length */ 4693 final int k; 4694 /** k-1; used in some expressions */ 4695 final int k2; 4696 /** Emulated kmer greater than k */ 4697 final int kbig; 4698 /** Effective kmer size */ 4699 final int keff; 4700 /** Shortest kmer to use for trimming */ 4701 final int mink; 4702 /** A read may contain up to this many kmers before being considered a match. Default: 0 */ 4703 final int maxBadKmers0; 4704 /** A read must share at least this fraction of its kmers to be considered a match. Default: 0 */ 4705 final float minKmerFraction; 4706 /** Reference kmers must cover at least this fraction of read bases to be considered a match. Default: 0 */ 4707 final float minCoveredFraction; 4708 4709 /** Recalibrate quality scores using matrices */ 4710 final boolean recalibrateQuality; 4711 /** Quality-trim the left side */ 4712 final boolean qtrimLeft; 4713 /** Quality-trim the right side */ 4714 final boolean qtrimRight; 4715 /** Trim soft-clipped bases */ 4716 final boolean trimClip; 4717 /** Trim poly-A tails of at least this length */ 4718 final int trimPolyA; 4719 4720 /** Trim poly-G prefixes of at least this length */ 4721 final int trimPolyGLeft; 4722 /** Trim poly-G tails of at least this length */ 4723 final int trimPolyGRight; 4724 /** Remove reads with poly-G prefixes of at least this length */ 4725 final int filterPolyG; 4726 4727 /** Trim poly-C prefixes of at least this length */ 4728 final int trimPolyCLeft; 4729 /** Trim poly-C tails of at least this length */ 4730 final int trimPolyCRight; 4731 /** Remove reads with poly-C prefixes of at least this length */ 4732 final int filterPolyC; 4733 4734 /** Trim bases at this quality or below. Default: 4 */ 4735 final float trimq; 4736 /** Error rate for trimming (derived from trimq) */ 4737 private final float trimE; 4738 /** Throw away reads below this average quality after trimming. Default: 0 */ 4739 final float minAvgQuality; 4740 /** Throw away reads with any base below this quality after trimming. Default: 0 */ 4741 final byte minBaseQuality; 4742 /** If positive, calculate average quality from the first X bases only. Default: 0 */ 4743 final int minAvgQualityBases; 4744 /** Throw away reads failing chastity filter (:Y: in read header) */ 4745 final boolean chastityFilter; 4746 /** Crash if a barcode is encountered that contains Ns or is not in the table */ 4747 final boolean failBadBarcodes; 4748 /** Remove reads with Ns in barcodes or that are not in the table */ 4749 final boolean removeBadBarcodes; 4750 /** Fail reads missing a barcode */ 4751 final boolean failIfNoBarcode; 4752 /** A set of valid barcodes; null if unused */ 4753 final HashSet<String> barcodes; 4754 /** Throw away reads containing more than this many Ns. Default: -1 (disabled) */ 4755 final int maxNs; 4756 /** Throw away reads containing without at least this many consecutive called bases. */ 4757 final int minConsecutiveBases; 4758 /** Throw away reads containing fewer than this fraction of any particular base. */ 4759 final float minBaseFrequency; 4760 /** Throw away reads shorter than this after trimming. Default: 10 */ 4761 final int minReadLength; 4762 /** Throw away reads longer than this after trimming. Default: Integer.MAX_VALUE */ 4763 final int maxReadLength; 4764 /** Toss reads shorter than this fraction of initial length, after trimming */ 4765 final float minLenFraction; 4766 /** Filter reads by whether or not they have matching kmers */ 4767 private final boolean kfilter; 4768 /** Trim matching kmers and all bases to the left */ 4769 final boolean ktrimLeft; 4770 /** Trim matching kmers and all bases to the right */ 4771 boolean ktrimRight; 4772 /** Don't trim, but replace matching kmers with a symbol (default N) */ 4773 final boolean ktrimN; 4774 /** Exclude kmer itself when ktrimming */ 4775 final boolean ktrimExclusive; 4776 /** Split into two reads around the kmer */ 4777 final boolean ksplit; 4778 /** Replace bases covered by matched kmers with this symbol */ 4779 final byte trimSymbol; 4780 /** Convert kmer-masked bases to lowercase */ 4781 final boolean kmaskLowercase; 4782 /** Only mask fully-covered bases **/ 4783 final boolean kmaskFullyCovered; 4784 /** Output over-trimmed reads to outbad (outmatch). If false, they are discarded. */ 4785 final boolean addTrimmedToBad; 4786 /** Find the sequence that shares the most kmer matches when filtering. */ 4787 final boolean findBestMatch; 4788 /** Trim pairs to the same length, when adapter-trimming */ 4789 final boolean trimPairsEvenly; 4790 /** Trim left bases of the read to this position (exclusive, 0-based) */ 4791 final int forceTrimLeft; 4792 /** Trim right bases of the read after this position (exclusive, 0-based) */ 4793 final int forceTrimRight; 4794 /** Trim this many rightmost bases of the read */ 4795 final int forceTrimRight2; 4796 /** Trim right bases of the read modulo this value. 4797 * e.g. forceTrimModulo=50 would trim the last 3bp from a 153bp read. */ 4798 final int forceTrimModulo; 4799 4800 /** Discard reads with GC below this. */ 4801 final float minGC; 4802 /** Discard reads with GC above this. */ 4803 final float maxGC; 4804 /** Discard reads outside of GC bounds. */ 4805 final boolean filterGC; 4806 /** Average GC for paired reads. */ 4807 final boolean usePairGC; 4808 4809 /** If positive, only look for kmer matches in the leftmost X bases */ 4810 int restrictLeft; 4811 /** If positive, only look for kmer matches the rightmost X bases */ 4812 int restrictRight; 4813 4814 /** Skip this many initial input reads */ 4815 private final long skipreads; 4816 4817 /** Pairs go to outbad if either of them is bad, as opposed to requiring both to be bad. 4818 * Default: true. */ 4819 final boolean removePairsIfEitherBad; 4820 4821 /** Rather than discarding, trim failures to 1bp. 4822 * Default: false. */ 4823 final boolean trimFailuresTo1bp; 4824 4825 /** Print only statistics for scaffolds that matched at least one read 4826 * Default: true. */ 4827 private final boolean printNonZeroOnly; 4828 4829 /** Rename reads to indicate what they matched. 4830 * Default: false. */ 4831 final boolean rename; 4832 /** Use names of reference files instead of scaffolds. 4833 * Default: false. */ 4834 private final boolean useRefNames; 4835 4836 /** Fraction of kmers to skip, 0 to 16 out of 17 */ 4837 final int speed; 4838 4839 /** Skip this many kmers when examining the read. Default 1. 4840 * 1 means every kmer is used, 2 means every other, etc. */ 4841 final int qSkip; 4842 4843 /** noAccel is true if speed and qSkip are disabled, accel is the opposite. */ 4844 final boolean noAccel; 4845 private final boolean accel; 4846 4847 private boolean pairedToSingle=false; 4848 4849 /*--------------------------------------------------------------*/ 4850 /*----------- Symbol-Specific Constants ----------*/ 4851 /*--------------------------------------------------------------*/ 4852 4853 /** True for amino acid data, false for nucleotide data */ 4854 final boolean amino; 4855 final int maxSupportedK; 4856 final int bitsPerBase; 4857 final int maxSymbol; 4858 final int symbols; 4859 final int symbolArrayLen; 4860 final int symbolSpace; 4861 final long symbolMask; 4862 4863 final int minlen; 4864 final int minminlen; 4865 final int minlen2; 4866 final int shift; 4867 final int shift2; 4868 final long mask; 4869 final long kmask; 4870 4871 /** x&clearMasks[i] will clear base i */ 4872 final long[] clearMasks; 4873 /** x|setMasks[i][j] will set base i to j */ 4874 final long[][] setMasks; 4875 /** x&leftMasks[i] will clear all bases to the right of i (exclusive) */ 4876 final long[] leftMasks; 4877 /** x&rightMasks[i] will clear all bases to the left of i (inclusive) */ 4878 final long[] rightMasks; 4879 /** x|kMasks[i] will set the bit to the left of the leftmost base */ 4880 final long[] lengthMasks; 4881 4882 /** Symbol code; -1 for undefined */ 4883 final byte[] symbolToNumber; 4884 /** Symbol code; 0 for undefined */ 4885 final byte[] symbolToNumber0; 4886 /** Complementary symbol code; 0 for undefined */ 4887 final byte[] symbolToComplementNumber0; 4888 4889 /** For verbose / debugging output */ kmerToString(long kmer, int k)4890 final String kmerToString(long kmer, int k){ 4891 return amino ? AminoAcid.kmerToStringAA(kmer, k) : AminoAcid.kmerToString(kmer, k); 4892 } 4893 4894 /** Returns true if the symbol is not degenerate (e.g., 'N') for the alphabet in use. */ isFullyDefined(byte symbol)4895 final boolean isFullyDefined(byte symbol){ 4896 return symbol>=0 && symbolToNumber[symbol]>=0; 4897 } 4898 4899 /*--------------------------------------------------------------*/ 4900 /*---------------- BBMerge Flags ----------------*/ 4901 /*--------------------------------------------------------------*/ 4902 4903 /** Trim implied adapters based on overlap, for reads with insert size shorter than read length */ 4904 final boolean trimByOverlap; 4905 final boolean useQualityForOverlap; 4906 final boolean strictOverlap; 4907 4908 int minOverlap0=7; 4909 int minOverlap=14; 4910 int minInsert0=16; 4911 int minInsert=40; 4912 4913 final float maxRatio; 4914 final float ratioMargin; 4915 final float ratioOffset; 4916 final float efilterRatio; 4917 final float efilterOffset; 4918 final float pfilterRatio; 4919 final float meeFilter; 4920 4921 /*--------------------------------------------------------------*/ 4922 /*---------------- Histogram Flags ----------------*/ 4923 /*--------------------------------------------------------------*/ 4924 4925 /** 4926 * Generate histograms from the reads before rather than after processing; 4927 * default is true. Khist is handled independently. 4928 */ 4929 final boolean histogramsBeforeProcessing; 4930 4931 final boolean MAKE_QUALITY_ACCURACY; 4932 final boolean MAKE_QUALITY_HISTOGRAM; 4933 final boolean MAKE_MATCH_HISTOGRAM; 4934 final boolean MAKE_BASE_HISTOGRAM; 4935 4936 final boolean MAKE_EHIST; 4937 final boolean MAKE_INDELHIST; 4938 final boolean MAKE_LHIST; 4939 final boolean MAKE_GCHIST; 4940 final boolean MAKE_ENTROPYHIST; 4941 final boolean MAKE_IDHIST; 4942 4943 final boolean MAKE_IHIST; 4944 4945 /*--------------------------------------------------------------*/ 4946 /*---------------- Static Fields ----------------*/ 4947 /*--------------------------------------------------------------*/ 4948 4949 /** Number of tables (and threads, during loading) */ 4950 private static final int WAYS=7; //123 4951 /** Default initial size of data structures */ 4952 private static final int initialSizeDefault=128000; 4953 /** Verbose messages */ 4954 public static final boolean verbose=false; //123 4955 4956 /** Ends for some operations like entropytrim; could be migrated over to other operations */ 4957 private static final int RIGHT=1, LEFT=2, RIGHTLEFT=3; 4958 4959 /** Number of reads output in the last run */ 4960 public static long lastReadsOut; 4961 /** Print messages to this stream */ 4962 private static PrintStream outstream=System.err; 4963 /** Permission to overwrite existing files */ 4964 public static boolean overwrite=true; 4965 /** Permission to append to existing files */ 4966 public static boolean append=false; 4967 /** Print speed statistics upon completion */ 4968 public static boolean showSpeed=true; 4969 /** Display progress messages such as memory usage */ 4970 public static boolean DISPLAY_PROGRESS=true; 4971 /** Number of ProcessThreads */ 4972 public static int THREADS=Shared.threads(); 4973 /** Indicates end of input stream */ 4974 static final ArrayList<Read> POISON=new ArrayList<Read>(0); 4975 /** Number of columns for statistics output, 3 or 5 */ 4976 public static int STATS_COLUMNS=3; 4977 /** Release memory used by kmer storage after processing reads */ 4978 public static boolean RELEASE_TABLES=true; 4979 /** Max value of hitCount array */ 4980 public static final int HITCOUNT_LEN=1000; 4981 /** Make unambiguous copies of ref sequences with ambiguous bases */ 4982 public static boolean REPLICATE_AMBIGUOUS=false; 4983 4984 /** Stores some data for statistics when running RQCFilter; not used otherwise. */ 4985 public static HashMap<String, Long> RQC_MAP=null; 4986 4987 } 4988