1 package jgi; 2 3 import java.io.File; 4 import java.io.PrintStream; 5 import java.util.ArrayList; 6 import java.util.Arrays; 7 import java.util.BitSet; 8 import java.util.HashMap; 9 import java.util.Locale; 10 11 import align2.QualityTools; 12 import dna.ChromosomeArray; 13 import dna.Data; 14 import dna.Scaffold; 15 import fileIO.ByteFile; 16 import fileIO.ByteStreamWriter; 17 import fileIO.FileFormat; 18 import fileIO.ReadWrite; 19 import fileIO.TextFile; 20 import fileIO.TextStreamWriter; 21 import shared.KillSwitch; 22 import shared.Parse; 23 import shared.Parser; 24 import shared.PreParser; 25 import shared.ReadStats; 26 import shared.Shared; 27 import shared.Timer; 28 import shared.Tools; 29 import shared.TrimRead; 30 import stream.Read; 31 import stream.SamLine; 32 import stream.SamLineStreamer; 33 import stream.ScaffoldCoordinates; 34 import stream.SiteScore; 35 import structures.ByteBuilder; 36 import structures.CoverageArray; 37 import structures.CoverageArray2; 38 import structures.CoverageArray3; 39 import structures.ListNum; 40 import structures.LongList; 41 42 /** 43 * @author Brian Bushnell 44 * @date Jan 4, 2013 45 * 46 */ 47 public class CoveragePileup { 48 49 /*--------------------------------------------------------------*/ 50 /*---------------- Main ----------------*/ 51 /*--------------------------------------------------------------*/ 52 main(String[] args)53 public static void main(String[] args){ 54 Timer t=new Timer(); 55 56 CoveragePileup x=new CoveragePileup(args); 57 58 x.process(); 59 60 t.stop(); 61 if(!KEY_VALUE){ 62 x.outstream.println(); 63 x.outstream.println("Time: \t"+t); 64 } 65 66 //Close the print stream if it was redirected 67 Shared.closeStream(x.outstream); 68 } 69 70 /*--------------------------------------------------------------*/ 71 /*---------------- Initialization ----------------*/ 72 /*--------------------------------------------------------------*/ 73 CoveragePileup(String[] args)74 public CoveragePileup(String[] args){ 75 76 {//Preparse block for help, config files, and outstream 77 PreParser pp=new PreParser(args, printCommand ? getClass() : null, false); 78 args=pp.args; 79 outstream=pp.outstream; 80 } 81 82 int vectorMode=-1; 83 boolean outset=false; 84 ReadWrite.USE_UNPIGZ=true; 85 // SamLine.RNAME_AS_BYTES=false; 86 87 for(int i=0; i<args.length; i++){ 88 final String arg=args[i]; 89 final String[] split=arg.split("="); 90 String a=split[0].toLowerCase(); 91 String b=split.length>1 ? split[1] : null; 92 93 if(a.equals("ref") || a.equals("reference") || a.equals("fasta")){ 94 reference=b; 95 }else if(a.equals("addfromref")){ 96 ADD_FROM_REF=Parse.parseBoolean(b); 97 }else if(a.equals("addfromreads")){ 98 ADD_FROM_READS=Parse.parseBoolean(b); 99 }else if(a.equals("in") || a.equals("in1")){ 100 in1=b; 101 }else if(a.equals("in2")){ 102 in2=b; 103 }else if(a.equals("out") || a.equals("coveragestats") || a.equals("covstats") || a.equals("stats")){ 104 covstats=b; 105 outset=true; 106 }else if(a.equals("minscaf") || a.equals("covminscaf")){ 107 minscaf=Integer.parseInt(b); 108 }else if(a.equals("mindepth") || a.equals("mincov")){ 109 minDepthToBeCovered=Integer.parseInt(b); 110 }else if(a.equals("border")){ 111 border=Integer.parseInt(b); 112 }else if(a.equals("qtrim")/* || a.equals("trim")*/){ 113 if(b==null || b.length()==0){qtrimRight=qtrimLeft=true;} 114 else if(b.equalsIgnoreCase("left") || b.equalsIgnoreCase("l")){qtrimLeft=true;qtrimRight=false;} 115 else if(b.equalsIgnoreCase("right") || b.equalsIgnoreCase("r")){qtrimLeft=false;qtrimRight=true;} 116 else if(b.equalsIgnoreCase("both") || b.equalsIgnoreCase("rl") || b.equalsIgnoreCase("lr")){qtrimLeft=qtrimRight=true;} 117 else if(Tools.isDigit(b.charAt(0))){ 118 trimq=Float.parseFloat(b); 119 qtrimRight=trimq>0; 120 }else{qtrimRight=qtrimLeft=Parse.parseBoolean(b);} 121 }else if(a.equals("trimq") || a.equals("trimquality")){ 122 trimq=Float.parseFloat(b); 123 }else if(a.equals("ss") || a.equals("samstreamer")){ 124 if(b!=null && Tools.isDigit(b.charAt(0))){ 125 useStreamer=true; 126 streamerThreads=Tools.max(1, Integer.parseInt(b)); 127 }else{ 128 useStreamer=Parse.parseBoolean(b); 129 } 130 }else if(a.equals("minq") || a.equals("minmapq")){ 131 minMapq=Integer.parseInt(b); 132 }else if(a.equals("outsam")){ 133 outsam=b; 134 }else if(a.equals("rpkm") || a.equals("fpkm") || a.equals("outrpkm")){ 135 outrpkm=b; 136 }else if(a.equals("outorf")){ 137 outorf=b; 138 }else if(a.equals("orffasta") || a.equals("fastaorf")){ 139 orffasta=b; 140 }else if(a.equals("basecov") || a.equals("outcov")){ 141 basecov=b; 142 }else if(a.equals("bincov") || a.equals("outbinned")){ 143 bincov=b; 144 }else if(a.equals("normcov") || a.equals("outnormalized")){ 145 normcov=b; 146 }else if(a.equals("normcovo") || a.equals("outnormalizedoverall")){ 147 normcovOverall=b; 148 }else if(a.equals("delta")){ 149 DELTA_ONLY=Parse.parseBoolean(b); 150 }else if(a.equals("physical") || a.equals("physicalcoverage") || a.equals("physcov")){ 151 PHYSICAL_COVERAGE=Parse.parseBoolean(b); 152 }else if(a.equals("tlen")){ 153 USE_TLEN=Parse.parseBoolean(b); 154 }else if(a.equals("hist") || a.equals("histogram") || a.equals("covhist")){ 155 histogram=b; 156 }else if(a.equals("reads")){ 157 maxReads=Parse.parseKMG(b); 158 }else if(a.equals("scafs") || a.equals("scaffolds")){ 159 initialScaffolds=Tools.max(128, (int)(Tools.min(Long.parseLong(b),2000000000))); 160 }else if(a.equals("binsize")){ 161 binsize=Integer.parseInt(b); 162 }else if(a.equals("32bit")){ 163 bits32=Parse.parseBoolean(b); 164 }else if(a.equals("bitset") || a.equals("usebitset") || a.equals("bitsets") || a.equals("usebitsets")){ 165 // if(Parse.parseBoolean(b)){arrayMode=BITSET_MODE;} 166 vectorMode=Parse.parseBoolean(b) ? BITSET_MODE : NOTHING_MODE; 167 }else if(a.equals("array") || a.equals("arrays") || a.equals("usearrays")){ 168 vectorMode=Parse.parseBoolean(b) ? ARRAY_MODE : NOTHING_MODE; 169 }else if(a.equals("median") || a.equals("calcmedian")){ 170 if(Parse.parseBoolean(b)){ 171 vectorMode=ARRAY_MODE; 172 } 173 }else if(a.startsWith("nonzero") || a.equals("nzo")){ 174 NONZERO_ONLY=Parse.parseBoolean(b); 175 if(verbose){outstream.println("Set NONZERO_ONLY to "+NONZERO_ONLY);} 176 }else if(a.equals("append") || a.equals("app")){ 177 append=ReadStats.append=Parse.parseBoolean(b); 178 }else if(a.equals("overwrite") || a.equals("ow")){ 179 overwrite=Parse.parseBoolean(b); 180 if(verbose){outstream.println("Set overwrite to "+overwrite);} 181 }else if(a.equalsIgnoreCase("twocolumn")){ 182 TWOCOLUMN=Parse.parseBoolean(b); 183 if(verbose){outstream.println("Set TWOCOLUMN to "+TWOCOLUMN);} 184 }else if(a.equalsIgnoreCase("keyvalue") || a.equalsIgnoreCase("machineout")){ 185 KEY_VALUE=Parse.parseBoolean(b); 186 }else if(a.equalsIgnoreCase("countgc")){ 187 COUNT_GC=Parse.parseBoolean(b); 188 if(verbose){outstream.println("Set COUNT_GC to "+COUNT_GC);} 189 }else if(a.equals("secondary") || a.equals("usesecondary")){ 190 USE_SECONDARY=Parse.parseBoolean(b); 191 if(verbose){outstream.println("Set USE_SECONDARY_ALIGNMENTS to "+USE_SECONDARY);} 192 }else if(a.equals("softclip") || a.equals("includesoftclip")){ 193 INCLUDE_SOFT_CLIP=Parse.parseBoolean(b); 194 if(verbose){outstream.println("Set INCLUDE_SOFT_CLIP to "+INCLUDE_SOFT_CLIP);} 195 }else if(a.equals("keepshortbins") || a.equals("ksb")){ 196 KEEP_SHORT_BINS=Parse.parseBoolean(b); 197 if(verbose){outstream.println("Set KEEP_SHORT_BINS to "+KEEP_SHORT_BINS);} 198 }else if(a.equals("strandedcoverage") || a.equals("strandedcov") || a.equals("covstranded") || a.equals("stranded")){ 199 STRANDED=Parse.parseBoolean(b); 200 }else if(a.equals("startcov") || a.equals("covstart") || a.equals("startonly")){ 201 START_ONLY=Parse.parseBoolean(b); 202 }else if(a.equals("stopcov") || a.equals("covstop") || a.equals("stoponly")){ 203 STOP_ONLY=Parse.parseBoolean(b); 204 }else if(a.equals("concise")){ 205 CONCISE=Parse.parseBoolean(b); 206 }else if(a.equals("verbose")){ 207 verbose=Parse.parseBoolean(b); 208 }else if(a.equals("normc") || a.equals("normalizecoverage")){ 209 NORMALIZE_COVERAGE=Parse.parseBoolean(b); 210 }else if(a.equals("header") || a.equals("hdr")){ 211 printHeader=Parse.parseBoolean(b); 212 }else if(a.equals("headerpound") || a.equals("#")){ 213 headerPound=Parse.parseBoolean(b); 214 }else if(a.equals("stdev")){ 215 calcCovStdev=Parse.parseBoolean(b); 216 }else if(a.equals("delcov") || a.equals("dels") || a.equals("includedels") || a.equals("includedeletions") || a.equals("delcoverage")){ 217 INCLUDE_DELETIONS=Parse.parseBoolean(b); 218 }else if(a.equals("dupecoverage") || a.equals("dupecov") || a.equals("dupes") || a.equals("duplicates") || a.equals("includeduplicates")){ 219 INCLUDE_DUPLICATES=Parse.parseBoolean(b); 220 }else if(a.equals("ignoredupes") || a.equals("ignoreduplicates")){ 221 INCLUDE_DUPLICATES=!Parse.parseBoolean(b); 222 }else if(a.equals("normb") || a.equals("normalizebins")){ 223 try { 224 NORMALIZE_LENGTH_BINS=Integer.parseInt(b); 225 } catch (NumberFormatException e) { 226 boolean x=Parse.parseBoolean(b); 227 NORMALIZE_LENGTH_BINS=x ? 100 : -1; 228 } 229 }else if(a.equals("covwindow")){ 230 if(b==null || b.length()<1 || Character.isLetter(b.charAt(0))){ 231 USE_WINDOW=Parse.parseBoolean(b); 232 }else{ 233 LOW_COV_WINDOW=Integer.parseInt(b); 234 USE_WINDOW=(LOW_COV_WINDOW>0); 235 } 236 }else if(a.equals("covwindowavg")){ 237 LOW_COV_DEPTH=Double.parseDouble(b); 238 }else if(a.equals("k")){ 239 k=Integer.parseInt(b); 240 }else if(Parser.parseCommonStatic(arg, a, b)){ 241 //do nothing 242 }else if(Parser.parseZip(arg, a, b)){ 243 //do nothing 244 }else if(in1==null && arg.indexOf('=')<0 && new File(arg).exists()){ 245 in1=arg; 246 }else{ 247 throw new RuntimeException("Unknown parameter: "+args[i]); 248 } 249 250 } 251 trimE=(float)QualityTools.phredToProbError(trimq); 252 // assert(false) : qtrimLeft+", "+qtrimRight+", "+trimq+", "+trimE; 253 254 ReadWrite.SAMTOOLS_IGNORE_FLAG|=ReadWrite.SAM_UNMAPPED; 255 if(!USE_SECONDARY){ReadWrite.SAMTOOLS_IGNORE_FLAG|=ReadWrite.SAM_SECONDARY;} 256 if(!INCLUDE_DUPLICATES){ReadWrite.SAMTOOLS_IGNORE_FLAG|=ReadWrite.SAM_DUPLICATE;} 257 258 if(outsam==null){ 259 SamLine.PARSE_0=false; 260 SamLine.PARSE_6=false; 261 SamLine.PARSE_7=false; 262 SamLine.PARSE_8=false; 263 if(k<1 && trimq<1){SamLine.PARSE_10=false;} 264 SamLine.PARSE_OPTIONAL=false; 265 } 266 267 if(vectorMode>-1){ 268 USE_BITSETS=(vectorMode==BITSET_MODE); 269 USE_COVERAGE_ARRAYS=(vectorMode==ARRAY_MODE); 270 }else{ 271 if(histogram==null && basecov==null && bincov==null && normcov==null && 272 normcovOverall==null && outorf==null && !calcCovStdev){//No need for coverage array! 273 USE_COVERAGE_ARRAYS=false; 274 if(TWOCOLUMN){//No need for bitset, either! 275 USE_BITSETS=false; 276 }else{ 277 USE_BITSETS=true; 278 } 279 } 280 } 281 282 if(verbose){ 283 outstream.println("Set USE_COVERAGE_ARRAYS to "+USE_COVERAGE_ARRAYS); 284 outstream.println("Set USE_BITSETS to "+USE_BITSETS); 285 } 286 287 if(maxReads<0){maxReads=Long.MAX_VALUE;} 288 { 289 final String a=(args.length>0 ? args[0] : null); 290 // final String b=(args.length>1 ? args[1] : null); 291 if(in1==null && a!=null && a.indexOf('=')<0 && (a.startsWith("stdin") || new File(a).exists())){in1=a;} 292 // if(covstats==null && b!=null && b.indexOf('=')<0){covstats=b;} 293 if(in1==null){in1="stdin";} 294 // if(covstats==null && !outset){ 295 //// out="stdout"; 296 //// outstream.println("Warning: output destination not set; producing no output. To print to standard out, set 'out=stdout'"); 297 // outstream=System.err; 298 // } 299 } 300 assert(in1!=null); 301 // assert(out!=null || outset) : "Output file was not set."; 302 303 if(STRANDED){ 304 assert(basecov==null || basecov.indexOf('#')>=0) : "Output filenames must contain '#' symbol for strand-specific output."; 305 assert(bincov==null || bincov.indexOf('#')>=0) : "Output filenames must contain '#' symbol for strand-specific output."; 306 assert(normcov==null || normcov.indexOf('#')>=0) : "Output filenames must contain '#' symbol for strand-specific output."; 307 assert(normcovOverall==null || normcovOverall.indexOf('#')>=0) : "Output filenames must contain '#' symbol for strand-specific output."; 308 assert(histogram==null || histogram.indexOf('#')>=0) : "Output filenames must contain '#' symbol for strand-specific output."; 309 assert(covstats==null || covstats.indexOf('#')>=0) : "Output filenames must contain '#' symbol for strand-specific output."; 310 } 311 312 if(!Tools.testInputFiles(false, true, in1, in2, reference)){ 313 throw new RuntimeException("\nCan't read some input files.\n"); 314 } 315 316 if(!Tools.testOutputFiles(overwrite, append, false, basecov, bincov, normcov, normcovOverall, histogram, covstats, outrpkm)){ 317 throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+ 318 basecov+", "+bincov+", "+normcov+", "+normcovOverall+", "+histogram+", "+covstats+", "+outrpkm+"\n"); 319 } 320 } 321 322 323 /*--------------------------------------------------------------*/ 324 /*---------------- Data Structures ----------------*/ 325 /*--------------------------------------------------------------*/ 326 327 328 /** The goal if this is to garbage-collect unnecessary objects, not really for reusing the object */ clear()329 public void clear(){ 330 list=null; 331 table=null; 332 pairTable=null; 333 334 program=null; 335 version=null; 336 337 in1=null; 338 covstats=null; 339 outsam=null; 340 outorf=null; 341 outrpkm=null; 342 reference=null; 343 histogram=null; 344 basecov=null; 345 bincov=null; 346 normcov=null; 347 normcovOverall=null; 348 orffasta=null; 349 350 error=false; 351 352 refBases=0; 353 mappedBases=0; 354 mappedNonClippedBases=0; 355 mappedBasesWithDels=0; 356 mappedReads=0; 357 properPairs=0; 358 readsProcessed=0; 359 basesProcessed=0; 360 kmersProcessed=0; 361 mappedKmers=0; 362 totalCoveredBases1=0; 363 totalCoveredBases2=0; 364 scaffoldsWithCoverage1=0; 365 scaffoldsWithCoverage2=0; 366 totalScaffolds=0; 367 } 368 createDataStructures()369 public void createDataStructures(){ 370 refBases=0; 371 mappedBases=0; 372 mappedNonClippedBases=0; 373 mappedBasesWithDels=0; 374 mappedReads=0; 375 properPairs=0; 376 readsProcessed=0; 377 basesProcessed=0; 378 kmersProcessed=0; 379 mappedKmers=0; 380 totalCoveredBases1=0; 381 totalCoveredBases2=0; 382 scaffoldsWithCoverage1=0; 383 scaffoldsWithCoverage2=0; 384 totalScaffolds=0; 385 error=false; 386 list=new ArrayList<Scaffold>(initialScaffolds); 387 table=new HashMap<String, Scaffold>(initialScaffolds); 388 389 if(PHYSICAL_COVERAGE){ 390 pairTable=new HashMap<String, SamLine>(); 391 if(COUNT_GC){ 392 COUNT_GC=false; 393 outstream.println("COUNT_GC disabled for physical coverage mode."); 394 } 395 if(USE_SECONDARY){ 396 USE_SECONDARY=false; 397 outstream.println("USE_SECONDARY disabled for physical coverage mode."); 398 } 399 400 SamLine.PARSE_0=true; 401 SamLine.PARSE_6=true; 402 SamLine.PARSE_7=true; 403 SamLine.PARSE_8=true; 404 SamLine.PARSE_10=false; 405 SamLine.PARSE_OPTIONAL=false; 406 } 407 } 408 409 410 /*--------------------------------------------------------------*/ 411 /*---------------- Outer Methods ----------------*/ 412 /*--------------------------------------------------------------*/ 413 414 /** Read and process all input data. */ process()415 public void process(){ 416 createDataStructures(); 417 418 // System.err.println("A"); 419 if(in1!=null && FileFormat.isBamFile(in1)){ 420 if(Data.SAMTOOLS() && useStreamer){ReadWrite.USE_SAMBAMBA=false;} //Disable because it takes forever to read the header 421 } 422 ByteFile tf=ByteFile.makeByteFile(in1, false); 423 424 // System.err.println("B"); 425 426 final ByteStreamWriter tsw=(outsam==null ? null : new ByteStreamWriter(outsam, overwrite, false, true)); 427 if(tsw!=null){tsw.start();} 428 ReadWrite.USE_SAMBAMBA=true; 429 430 // System.err.println("C"); 431 processHeader(tf, tsw); 432 433 // System.err.println("D"); 434 processReference(); 435 // System.err.println("E"); 436 if(maxReads<0){maxReads=Long.MAX_VALUE;} 437 if(useStreamer && !FileFormat.isStdio(in1)){ 438 tf.close(); 439 // System.err.println("F"); 440 processViaStreamer(tsw); 441 // System.err.println("G"); 442 }else{ 443 // processViaByteFile(tf, line, tsw); 444 processViaByteFile(tf, tsw); 445 // System.err.println("H"); 446 } 447 // System.err.println("I"); 448 449 printOutput(); 450 451 if(orffasta!=null){ 452 processOrfsFasta(orffasta, outorf, table); 453 } 454 455 if(tsw!=null){tsw.waitForFinish();} 456 } 457 processViaStreamer(ByteStreamWriter tsw)458 private void processViaStreamer(ByteStreamWriter tsw){ 459 SamLineStreamer ss=new SamLineStreamer(in1, streamerThreads, false, maxReads); 460 ss.start(); 461 ByteBuilder bb=new ByteBuilder(33000); 462 for(ListNum<SamLine> ln=ss.nextLines(); ln!=null && ln.size()>0; ln=ss.nextLines()){ 463 ArrayList<SamLine> list=(ln==null ? null : ln.list); 464 for(SamLine sl : list){ 465 if(tsw!=null){ 466 sl.toBytes(bb); 467 bb.nl(); 468 if(bb.length>=16384){ 469 tsw.print(bb); 470 bb.clear(); 471 } 472 } 473 processSamLine(sl); 474 } 475 } 476 if(tsw!=null){ 477 if(bb.length>0){tsw.print(bb);} 478 tsw.poison(); 479 } 480 } 481 processViaByteFile(ByteFile tf, ByteStreamWriter tsw)482 private void processViaByteFile(ByteFile tf, ByteStreamWriter tsw){ 483 for(byte[] line=tf.nextLine(); line!=null && readsProcessed<maxReads; line=tf.nextLine()){ 484 if(tsw!=null){tsw.println(line);} 485 processSamLine(line); 486 } 487 488 tf.close(); 489 if(tsw!=null){tsw.poison();} 490 } 491 492 493 /*--------------------------------------------------------------*/ 494 /*---------------- Setup ----------------*/ 495 /*--------------------------------------------------------------*/ 496 497 498 /** Process all sam header lines from the tf. 499 * Once a non-header line is encountered, return it. 500 * If non-null, print all lines to the tsw. */ processHeader(ByteFile tf, ByteStreamWriter tsw)501 public void processHeader(ByteFile tf, ByteStreamWriter tsw){ 502 byte[] line=null; 503 for(line=tf.nextLine(); line!=null && (line.length==0 || line[0]=='@'); line=tf.nextLine()){ 504 // System.err.println(new String(line)); 505 if(tsw!=null){tsw.println(line);} 506 507 if(line.length>2){ 508 final byte a=line[1], b=line[2]; 509 510 if(a=='S' && b=='Q'){ 511 Scaffold scaf=new Scaffold(line); 512 if(COUNT_GC){scaf.basecount=KillSwitch.allocLong1D(8);} 513 assert(!table.containsKey(scaf.name)) : "\nDuplicate scaffold name!\n"+scaf+"\n\n"+table.get(scaf.name); 514 table.put(scaf.name, scaf); 515 list.add(scaf); 516 refBases+=scaf.length; 517 // sc.obj=new CoverageArray2(table.size(), sc.length+1); 518 // outstream.println("Made scaffold "+sc.name+" of length "+sc.length); 519 }else if(a=='P' && b=='G'){ 520 String[] split=new String(line).split("\t"); 521 for(String s : split){ 522 if(s.startsWith("PN:")){ 523 if(program==null){program=Data.forceIntern(s.substring(3));} 524 }else if(s.startsWith("VN:")){ 525 if(version==null){version=Data.forceIntern(s.substring(3));} 526 } 527 } 528 }else if(a=='R' && b=='G'){ 529 //Do nothing 530 }else if(a=='H' && b=='D'){ 531 //Do nothing 532 }else if(a=='C' && b=='O'){ 533 //Do nothing 534 }else{ 535 // assert(false) : line; 536 } 537 } 538 } 539 if(line!=null){tf.pushBack(line);} 540 // return line; 541 } 542 543 loadScaffoldsFromIndex(int minChrom, int maxChrom)544 public void loadScaffoldsFromIndex(int minChrom, int maxChrom){ 545 546 final int[][] lengths=Data.scaffoldLengths; 547 final int[][] locs=Data.scaffoldLocs; 548 final byte[][][] names=Data.scaffoldNames; 549 final int[] counts=new int[8]; 550 for(int chrom=minChrom; chrom<=maxChrom; chrom++){ 551 final ChromosomeArray ca=Data.getChromosome(chrom); 552 // assert(false) : lengths[chrom]+", "+lengths.length+", "+names[chrom]+", "+locs[chrom]+", "+(ca==null); 553 if(lengths[chrom]!=null){ 554 final int[] clengths=lengths[chrom]; 555 final int[] clocs=locs[chrom]; 556 final byte[][] cnames=names[chrom]; 557 for(int idx=0; idx<clengths.length; idx++){ 558 final int length=clengths[idx]; 559 final int loc=clocs[idx]; 560 final String name=new String(cnames[idx]); 561 final Scaffold scaf=new Scaffold(name, length); 562 if(ca!=null){ 563 scaf.gc=ca.calcGC(loc, length, counts); 564 } 565 if(COUNT_GC){scaf.basecount=KillSwitch.allocLong1D(8);} 566 assert(!table.containsKey(scaf.name)) : "\nDuplicate scaffold name!\n"+scaf+"\n\n"+table.get(scaf.name); 567 table.put(scaf.name, scaf); 568 list.add(scaf); 569 refBases+=scaf.length; 570 } 571 } 572 } 573 } 574 575 processReference()576 public void processReference(){ 577 if(reference==null){return;} 578 579 ByteFile bf=ByteFile.makeByteFile(reference, false); 580 Scaffold scaf=null; 581 int len=0; 582 final long[] acgtn=KillSwitch.allocLong1D(8); 583 boolean addLen=false; 584 for(byte[] s=bf.nextLine(); s!=null; s=bf.nextLine()){ 585 if(s.length>0 && s[0]=='>'){ 586 if(scaf!=null){ 587 if(scaf.length>0 && scaf.length!=len){ 588 outstream.println("ERROR: Scaffold "+scaf.name+" has contradictory lengths of "+scaf.length+" and "+len+"\n" 589 + "This probably indicates a corrupt or incorrect reference."); 590 errorState=true; 591 if(Shared.EA()){KillSwitch.kill();} 592 } 593 // else{outstream.println(scaf.name+", "+scaf.length+", "+len);} 594 scaf.length=len; 595 if(addLen){ 596 refBases+=scaf.length; 597 addLen=false; 598 } 599 scaf.gc=(float)((acgtn[1]+acgtn[2])*1d/Data.max(1, acgtn[0]+acgtn[1]+acgtn[2]+acgtn[3])); 600 scaf=null; 601 len=0; 602 Arrays.fill(acgtn, 0); 603 } 604 605 String name=new String(s, 1, s.length-1); 606 scaf=table.get(name); 607 if(ADD_FROM_REF && scaf==null){ 608 scaf=new Scaffold(name, 0); 609 if(!warned){ 610 outstream.println("Warning - SAM header did not include "+name+"\nAbsent scaffolds will be added; further warnings will be suppressed."); 611 warned=true; 612 } 613 if(COUNT_GC){scaf.basecount=KillSwitch.allocLong1D(8);} 614 table.put(name, scaf); 615 list.add(scaf); 616 addLen=true; 617 } 618 }else{ 619 len+=s.length; 620 for(int i=0; i<s.length; i++){ 621 acgtn[charToNum[s[i]]]++; 622 } 623 } 624 } 625 if(scaf!=null){ 626 if(scaf.length>0 && scaf.length!=len){ 627 outstream.println("ERROR: Scaffold "+scaf.name+" has contradictory lengths of "+scaf.length+" and "+len+"\n" 628 + "This probably indicates a corrupt or incorrect reference."); 629 errorState=true; 630 if(Shared.EA()){KillSwitch.kill();} 631 } 632 // else{outstream.println(scaf.name+", "+scaf.length+", "+len);} 633 scaf.length=len; 634 if(addLen){ 635 refBases+=scaf.length; 636 addLen=false; 637 } 638 scaf.gc=(float)((acgtn[1]+acgtn[2])*1d/Data.max(1, acgtn[0]+acgtn[1]+acgtn[2]+acgtn[3])); 639 scaf=null; 640 len=0; 641 Arrays.fill(acgtn, 0); 642 } 643 } 644 645 processOrfsFasta(String fname_in, String fname_out, HashMap<String, Scaffold> map)646 public void processOrfsFasta(String fname_in, String fname_out, HashMap<String, Scaffold> map){ 647 TextFile tf=new TextFile(fname_in, false); 648 assert(!fname_in.equalsIgnoreCase(fname_out)); 649 TextStreamWriter tsw=new TextStreamWriter(fname_out, overwrite, false, true); 650 tsw.start(); 651 652 if(printHeader){ 653 String pound=(headerPound ? "#" : ""); 654 tsw.print(pound+"mappedBases="+mappedBases+"\n"); 655 tsw.print(pound+"mappedNonClippedBases="+mappedNonClippedBases+"\n"); 656 tsw.print(pound+"mappedBasesWithDels="+mappedBasesWithDels+"\n"); 657 tsw.print(pound+"mappedReads="+mappedReads+"\n"); 658 tsw.print(pound+"name\tlength\tdepthSum\tavgDepth\tavgDepth/mappedBases\tminDepth\tmaxDepth\tmedianDepth\tstdDevDepth\tfractionCovered\n"); 659 } 660 661 String line; 662 final StringBuilder sb=new StringBuilder(500); 663 // Formatter formatter=new Formatter(sb); 664 665 while((line=tf.nextLine())!=null){ 666 if(line.length()>1 && line.charAt(0)=='>'){ 667 668 String[] split=line.split(" # "); //' # ' used as delimiters 669 670 String orfname=split[0].substring(1).trim(); //In case there are spaces around the ' # ' delimiters 671 String scafname=orfname; 672 if(scafname.contains("_")){//PRODIGAL pads _1 to the name of the first orf of a scaffold, and etc 673 int last=scafname.lastIndexOf('_'); 674 boolean numeric=false; 675 for(int i=last+1; i<scafname.length(); i++){ 676 if(Tools.isDigit(scafname.charAt(i))){numeric=true;} 677 else{numeric=false; break;} 678 } 679 if(numeric){scafname=scafname.substring(0, last);} 680 } 681 682 int start=Integer.parseInt(split[1].trim()); 683 int stop=Integer.parseInt(split[2].trim()); 684 int strand=Integer.parseInt(split[3].trim()); 685 if(strand==1){strand=Shared.PLUS;}else{strand=Shared.MINUS;} 686 Orf orf=new Orf(orfname, start, stop, (byte)strand); 687 688 Scaffold scaf=map.get(scafname); 689 // if(scaf==null){scaf=map.get(orfname);} 690 691 // assert(scaf!=null) : "\nCan't find scaffold for ("+orf+")\nfrom line\n"+line+"\n"; 692 // assert(orf.start>=0 && orf.stop<scaf.length) : "\norf goes out of scaffold bounds.\n"+orf+"\n"+scaf+"\n"; 693 694 if(scaf==null){ 695 outstream.println("Can't find scaffold for ("+orf+")\nfrom line\n"+line+"\nscafname='"+scafname+"'\norfname='"+orfname+"'"); 696 if(ABORT_ON_ERROR){ 697 tsw.poison(); 698 throw new RuntimeException("Aborting."); 699 } 700 }else{ 701 if(orf.start<0 && orf.stop>=scaf.length){ 702 outstream.println("orf goes out of scaffold bounds.\n"+orf+"\n"+scaf); 703 if(ABORT_ON_ERROR){ 704 tsw.poison(); 705 throw new RuntimeException("Aborting."); 706 } 707 } 708 709 CoverageArray ca=(CoverageArray)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1); //TODO: Strand logic here depends on stranding protocol. 710 orf.readCoverageArray(ca); 711 } 712 713 // { 714 // tsw.print(String.format(Locale.ROOT, "%s\t%d\t", args)); 715 // } 716 717 sb.append(orf.name).append('\t'); 718 sb.append(orf.length()).append('\t'); 719 sb.append(orf.baseDepth).append('\t'); 720 sb.append(String.format(Locale.ROOT, "%.4f", orf.avgCoverage())).append('\t'); 721 sb.append(orf.avgCoverage()/mappedNonClippedBases); 722 723 sb.append('\t'); 724 sb.append(orf.minDepth).append('\t'); 725 sb.append(orf.maxDepth).append('\t'); 726 sb.append(orf.medianDepth).append('\t'); 727 sb.append(String.format(Locale.ROOT, "%.4f",orf.stdevDepth)).append('\t'); 728 sb.append(String.format(Locale.ROOT, "%.4f",orf.fractionCovered())); 729 730 sb.append('\n'); 731 tsw.print(sb.toString()); 732 sb.setLength(0); 733 } 734 } 735 736 tsw.poison(); 737 tsw.waitForFinish(); 738 } 739 740 741 /*--------------------------------------------------------------*/ 742 /*---------------- Inner Methods ----------------*/ 743 /*--------------------------------------------------------------*/ 744 addCoverage(final String scafName, final byte[] seq, byte[] match, final int start0, final int stop0, final int readlen, final int nonClippedBases, final int strand, int incrementFrags, boolean properPair, SamLine sl)745 public boolean addCoverage(final String scafName, final byte[] seq, byte[] match, final int start0, final int stop0, final int readlen, 746 final int nonClippedBases, final int strand, int incrementFrags, boolean properPair, SamLine sl){//sl is optional 747 Scaffold scaf=table.get(scafName); 748 if(scaf==null){ 749 if(ADD_FROM_READS){ 750 if(!warned){ 751 outstream.println("Warning: A read was mapped to unknown reference sequence "+scafName+ 752 "\nFurther warnings will be suppressed. These scaffolds will be included, but \n" 753 + "some coverage statistics will be inaccurate as scaffold lengths are not known."); 754 warned=true; 755 } 756 scaf=new Scaffold(scafName, 0); 757 if(COUNT_GC){scaf.basecount=KillSwitch.allocLong1D(8);} 758 table.put(scafName, scaf); 759 list.add(scaf); 760 return addCoverage(scaf, seq, match, start0, stop0, readlen, nonClippedBases, strand, incrementFrags, properPair, sl); 761 }else if(EA){ 762 KillSwitch.kill("ERROR: A read was mapped to unknown reference sequence "+scafName); 763 }else if(!warned){ 764 outstream.println("Warning: Can't find "+scafName+"\nThis scaffold will not be included. Further warnings will be suppressed."); 765 warned=true; 766 error=true; 767 } 768 return false; 769 } 770 return addCoverage(scaf, seq, match, start0, stop0, readlen, nonClippedBases, strand, incrementFrags, properPair, sl); 771 } 772 addCoverage(final Scaffold scaf, final byte[] seq, byte match[], final int start0, final int stop0, final int readlen, final int nonClippedBases, final int strand, int incrementFrags, boolean properPair, SamLine sl)773 public boolean addCoverage(final Scaffold scaf, final byte[] seq, byte match[], final int start0, final int stop0, final int readlen, final int nonClippedBases, 774 final int strand, int incrementFrags, boolean properPair, SamLine sl){//sl is optional 775 if(scaf==null){ 776 assert(false) : "Adding coverage to a null Scaffold."; 777 return false; 778 } 779 780 if(ADD_FROM_READS){ 781 int x=scaf.length; 782 scaf.length=Tools.max(scaf.length, stop0+1); 783 if(scaf.length!=x){ 784 refBases=refBases+scaf.length-x; 785 } 786 } 787 788 final int start=Tools.max(start0, 0); 789 final int stop=Tools.min(stop0, scaf.length-1); 790 791 assert(start>=0 && stop>=0) : "\nAn error was encountered when processing a read. Output will not be valid.\n"+ 792 "\nscafName="+scaf.name+"\nseq="+new String(seq)+"\nstart="+start+ 793 "\nstop="+stop+"\nreadlen="+readlen+"\nstrand="+strand+"\nscaf.length="+scaf.length+"\nscaf="+scaf; 794 795 mappedBases+=readlen; 796 mappedNonClippedBases+=nonClippedBases; 797 mappedBasesWithDels+=(stop-start+1); 798 mappedReads++; 799 if(properPair){properPairs++;} 800 801 scaf.readhits++; 802 scaf.fraghits+=incrementFrags; 803 if(strand==1){scaf.readhitsMinus++;} 804 805 if(seq!=null && scaf.basecount!=null){ 806 final long[] counts=scaf.basecount; 807 for(int i=0; i<seq.length; i++){ 808 counts[charToNum[seq[i]]]++; 809 } 810 } 811 812 if(!INCLUDE_DELETIONS && !START_ONLY && !STOP_ONLY){ 813 assert(match!=null) : "Coverage excluding deletions cannot be calculated without a match string."; 814 return addCoverageIgnoringDeletions(scaf, seq, match, start, stop, readlen, strand, incrementFrags); 815 } 816 817 final int basehits=stop-start+1; 818 scaf.basehits+=basehits; 819 820 if(USE_COVERAGE_ARRAYS){ 821 if(scaf.obj1==null){ 822 scaf.obj1=(bits32 ? new CoverageArray3(table.size(), scaf.length+1) : new CoverageArray2(table.size(), scaf.length+1)); 823 if(STRANDED){ 824 scaf.obj2=(bits32 ? new CoverageArray3(table.size(), scaf.length+1) : new CoverageArray2(table.size(), scaf.length+1)); 825 } 826 } 827 CoverageArray ca=(CoverageArray)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1); 828 if(START_ONLY){ 829 ca.increment(start); 830 }else if(STOP_ONLY){ 831 ca.increment(stop); 832 }else{ 833 ca.incrementRange(start, stop); 834 // mappedBases+=(stop-start+1); 835 // mappedBases+=(readlen); 836 // assert(readlen==(stop-start+1)) : "readlen="+readlen+", (stop-start+1)="+(stop-start+1)+", start="+start+", stop="+stop+", start0="+start0+", stop0="+stop0+ 837 // "\n"+sl+"\n"; 838 //123 839 } 840 }else if(USE_BITSETS){ 841 if(scaf.obj1==null){ 842 scaf.obj1=new BitSet(scaf.length+1); 843 if(STRANDED){ 844 scaf.obj2=new BitSet(scaf.length+1); 845 } 846 } 847 BitSet bs=(BitSet)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1); 848 if(START_ONLY){ 849 bs.set(start); 850 }else if(STOP_ONLY){ 851 bs.set(stop); 852 }else{ 853 bs.set(start, stop+1); 854 } 855 } 856 857 return true; 858 } 859 addCoverageIgnoringDeletions(final Scaffold scaf, final byte[] seq, byte match[], final int start, final int stop, final int readlen, final int strand, int incrementFrags)860 private boolean addCoverageIgnoringDeletions(final Scaffold scaf, final byte[] seq, byte match[], final int start, final int stop, final int readlen, final int strand, int incrementFrags){ 861 assert(!INCLUDE_DELETIONS && !START_ONLY && !STOP_ONLY); 862 assert(match!=null) : "Coverage excluding deletions cannot be calculated without a match string."; 863 864 if(Read.isShortMatchString(match)){ 865 match=Read.toLongMatchString(match); 866 } 867 868 int basehits=0; 869 if(USE_COVERAGE_ARRAYS){ 870 if(scaf.obj1==null){ 871 scaf.obj1=(bits32 ? new CoverageArray3(table.size(), scaf.length+1) : new CoverageArray2(table.size(), scaf.length+1)); 872 if(STRANDED){ 873 scaf.obj2=(bits32 ? new CoverageArray3(table.size(), scaf.length+1) : new CoverageArray2(table.size(), scaf.length+1)); 874 } 875 } 876 CoverageArray ca=(CoverageArray)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1); 877 for(int rpos=start, mpos=0; mpos<match.length && rpos<=stop; mpos++){ 878 byte m=match[mpos]; 879 if(m=='m' || m=='S' || m=='N'){ 880 ca.increment(rpos, 1); 881 basehits++; 882 rpos++; 883 }else if(m=='X' || m=='Y' || m=='C' || m=='I'){ 884 //do nothing 885 }else if(m=='D'){ 886 rpos++; 887 }else{ 888 assert(false) : "Unhandled symbol "+m; 889 } 890 } 891 }else if(USE_BITSETS){ 892 if(scaf.obj1==null){ 893 scaf.obj1=new BitSet(scaf.length+1); 894 if(STRANDED){ 895 scaf.obj2=new BitSet(scaf.length+1); 896 } 897 } 898 BitSet bs=(BitSet)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1); 899 for(int rpos=start, mpos=0; mpos<match.length && rpos<=stop; mpos++){ 900 byte m=match[mpos]; 901 if(m=='m' || m=='S' || m=='N'){ 902 bs.set(rpos); 903 basehits++; 904 rpos++; 905 }else if(m=='X' || m=='Y' || m=='C' || m=='I'){ 906 //do nothing 907 }else if(m=='D'){ 908 rpos++; 909 }else{ 910 assert(false) : "Unhandled symbol "+m; 911 } 912 } 913 } 914 scaf.basehits+=basehits; 915 916 return true; 917 } 918 919 processSamLine(byte[] line)920 public boolean processSamLine(byte[] line){ 921 if(line==null || line.length<3){ 922 return false; 923 }else if(line[0]=='@'){ 924 if(!error){ 925 outstream.println("Unexpected header line: "+line); 926 outstream.println("This should not cause problems, and is probably due to concatenated sam files.\n" + 927 "Supressing future unexpected header warnings."); 928 error=true; 929 } 930 931 if(line[1]=='S' && line[2]=='Q'){ 932 Scaffold scaf=new Scaffold(line); 933 if(!table.containsKey(scaf.name)){ 934 if(COUNT_GC){scaf.basecount=KillSwitch.allocLong1D(8);} 935 table.put(scaf.name, scaf); 936 list.add(scaf); 937 refBases+=scaf.length; 938 } 939 } 940 }else{ 941 SamLine sl=new SamLine(line); 942 return processSamLine(sl); 943 } 944 return false; 945 } 946 trim(SamLine sl)947 private int trim(SamLine sl){ 948 assert(border>0 || (trimq>=0 && (qtrimLeft || qtrimRight))); 949 Read r=null; 950 Scaffold sc=null; 951 int leftTrimAmount=border, rightTrimAmount=border; 952 if(border>0){ 953 r=sl.toRead(false); 954 sc=table.get(sl.rnameS()); 955 assert(sc!=null) : sl+"\n\n"+sl.rnameS()+"\n\n"+table; 956 int skipTrimRange=Tools.max(10, border+5); 957 if(r.start<skipTrimRange){ 958 if(r.strand()==Shared.PLUS){leftTrimAmount=0;} 959 else{rightTrimAmount=0;} 960 } 961 if(r.stop>sc.length-skipTrimRange){ 962 if(r.strand()==Shared.PLUS){rightTrimAmount=0;} 963 else{leftTrimAmount=0;} 964 } 965 } 966 final int len0=sl.length(); 967 if(qtrimLeft || qtrimRight){ 968 long packed=TrimRead.testOptimal(sl.seq, sl.qual, trimE); 969 if(qtrimLeft){leftTrimAmount=Tools.max(leftTrimAmount, (int)((packed>>32)&0xFFFFFFFFL));} 970 if(qtrimRight){rightTrimAmount=Tools.max(rightTrimAmount, (int)((packed)&0xFFFFFFFFL));} 971 // assert(false) : qtrimLeft+", "+qtrimRight+", "+trimq+", "+trimE+", "+rightTrimAmount+"\n"+sl+"\n"; 972 } 973 final int trimmed; 974 if(leftTrimAmount<1 && rightTrimAmount<1){trimmed=0;} 975 else{ 976 if(r==null){r=sl.toRead(false);} 977 if(sc==null){sc=table.get(sl.rnameS());} 978 int scaflen=(sc==null ? 1999999999 : sc.length); 979 trimmed=TrimRead.trimReadWithMatch(r, sl, leftTrimAmount, rightTrimAmount, 0, scaflen, false); 980 } 981 // assert(trimmed==len0-sl.length()) : trimmed+", "+len0+", "+sl.length(); 982 // assert(rightTrimAmount>0) : qtrimLeft+", "+qtrimRight+", "+trimq+", "+trimE+", "+rightTrimAmount+"\n"+sl+"\n"; 983 return trimmed; 984 } 985 processSamLine(SamLine sl)986 public boolean processSamLine(SamLine sl){ 987 readsProcessed++; 988 basesProcessed+=sl.length(); 989 if(sl.duplicate() && !INCLUDE_DUPLICATES){return false;} 990 991 if(border>0 || (trimq>=0 && (qtrimLeft || qtrimRight))){ 992 if(sl.mapped() && sl.seq!=null){ 993 int trimmed=trim(sl); 994 if(sl.length()<1 || trimmed<0){return false;}//trimmed<0 implies everything was trimmed 995 } 996 } 997 998 final int kmers=Tools.countKmers(sl.seq, k); 999 final double cKmers=sl.qual==null ? kmers : Tools.countCorrectKmers(sl.qual, k); 1000 kmersProcessed+=kmers; 1001 correctKmers+=cKmers; 1002 final boolean properPair=(sl.hasMate() && sl.mapped() && sl.primary() && sl.properPair()); 1003 if(PHYSICAL_COVERAGE && properPair){ 1004 SamLine mate=pairTable.remove(sl.qname); 1005 if(mate==null){pairTable.put(sl.qname, sl);} 1006 else{ 1007 final int start1=sl.start(INCLUDE_SOFT_CLIP, false); 1008 final int stop1=sl.stop(start1, INCLUDE_SOFT_CLIP, false); 1009 final int start2=mate.start(INCLUDE_SOFT_CLIP, false); 1010 final int stop2=mate.stop(start2, INCLUDE_SOFT_CLIP, false); 1011 final int strand=(sl.pairnum()==0 ? sl.strand() : mate.strand()); 1012 final int length=USE_TLEN ? sl.tlen : Tools.max(stop1, stop2)-Tools.min(start1, start2)+1; 1013 mappedKmers+=kmers; 1014 addCoverage(sl.rnameS(), null, null, Tools.min(start1, start2), Tools.max(stop1, stop2), length, sl.mappedNonClippedBases(), strand, 2, sl.properPair(), sl); 1015 } 1016 }else if(sl.mapped() && (USE_SECONDARY || sl.primary()) && sl.mapq>=minMapq){ 1017 assert(sl.seq!=null || sl.cigar!=null) : "This program requires bases or a cigar string for every sam line. Problem line:\n"+sl+"\n"; 1018 // assert(sl.seq!=null) : sl.toString(); 1019 final int length=sl.length(); 1020 final int start=sl.start(INCLUDE_SOFT_CLIP, false); 1021 final int stop=sl.stop(start, INCLUDE_SOFT_CLIP, false); 1022 // assert(false && length==stop-start+1) : length+", "+start+", "+stop+", "+(stop-start+1); 1023 // assert(false) : "'"+new String(sl.rname())+"', '"+sl.rnameS()+"'"; 1024 // assert(false) : "'"+sl.rnameS()+"'"; 1025 final byte[] match=(INCLUDE_DELETIONS ? null : sl.toShortMatch(true)); 1026 mappedKmers+=kmers; 1027 return addCoverage(sl.rnameS(), sl.seq, match, start, stop, length, sl.mappedNonClippedBases(), sl.strand(), sl.hasMate() ? 1 : 2, sl.properPair(), sl); 1028 } 1029 return false; 1030 } 1031 1032 processRead(Read r)1033 public boolean processRead(Read r){ 1034 readsProcessed++; 1035 final int kmers=Tools.countKmers(r.bases, k); 1036 final double cKmers=r.quality==null ? kmers : Tools.countCorrectKmers(r.quality, k); 1037 kmersProcessed+=kmers; 1038 correctKmers+=cKmers; 1039 if(r.mapped() && r.bases!=null){ 1040 if(USE_SECONDARY && r.sites!=null && r.sites.size()>0){ 1041 boolean b=false; 1042 for(SiteScore ss : r.sites){ 1043 b=processRead(r, ss) || b; 1044 } 1045 return b; 1046 }else{ 1047 final Read mate=r.mate; 1048 final boolean set1=coords.set(r); 1049 final boolean set2=(PHYSICAL_COVERAGE && r.paired() && r.pairnum()==0 && coords2.set(mate)); 1050 if(set1 && set2 && Tools.equals(coords.name, coords2.name)){ 1051 final int start1=coords.start; 1052 final int stop1=coords.stop; 1053 final int start2=coords2.start; 1054 final int stop2=coords2.stop; 1055 final int strand=r.strand(); 1056 final int length=Tools.max(stop1, stop2)-Tools.min(start1, start2)+1; 1057 mappedKmers+=kmers; 1058 addCoverage(new String(coords.name), null, null, Tools.min(start1, start2), Tools.max(stop1, stop2), length, 1059 r.mappedNonClippedBases(), strand, 2-r.mateCount(), r.paired(), null); 1060 }else{ 1061 if(set1){ 1062 mappedKmers+=kmers; 1063 return addCoverage(new String(coords.name), r.bases, r.match, coords.start, coords.stop, r.length(), 1064 r.mappedNonClippedBases(), coords.strand, 2-r.mateCount(), r.paired(), null); 1065 } 1066 } 1067 } 1068 } 1069 return false; 1070 } 1071 1072 processRead(Read r, SiteScore ss)1073 public boolean processRead(Read r, SiteScore ss){ 1074 if(ss!=null && r.bases!=null){ 1075 if(coords.set(ss)){ 1076 return addCoverage(new String(coords.name), r.bases, ss.match, coords.start, coords.stop, r.length(), 1077 r.mappedNonClippedBases(), coords.strand, 1, r.paired(), null); 1078 } 1079 } 1080 return false; 1081 } 1082 1083 1084 /*--------------------------------------------------------------*/ 1085 /*---------------- Output Methods ----------------*/ 1086 /*--------------------------------------------------------------*/ 1087 1088 printOutput()1089 public void printOutput(){ 1090 1091 totalScaffolds=list.size(); 1092 1093 long refKmers=0; 1094 long refBases2=0; 1095 for(Scaffold sc : list){ 1096 refBases2+=sc.length; 1097 if(sc.length>=k){refKmers+=sc.length-k+1;} 1098 } 1099 1100 String basecov1=(basecov==null ? null : (STRANDED ? basecov.replaceFirst("#", "1") : basecov)); 1101 String bincov1=(bincov==null ? null : (STRANDED ? bincov.replaceFirst("#", "1") : bincov)); 1102 String normcov1=(normcov==null ? null : (STRANDED ? normcov.replaceFirst("#", "1") : normcov)); 1103 String normcovOverall1=(normcovOverall==null ? null : (STRANDED ? normcovOverall.replaceFirst("#", "1") : normcovOverall)); 1104 String histogram1=(histogram==null ? null : (STRANDED ? histogram.replaceFirst("#", "1") : histogram)); 1105 String stats1=(covstats==null ? null : (STRANDED ? covstats.replaceFirst("#", "1") : covstats)); 1106 1107 String basecov2=(basecov==null || !STRANDED ? null : basecov.replaceFirst("#", "2")); 1108 String bincov2=(bincov==null || !STRANDED ? null : bincov.replaceFirst("#", "2")); 1109 String normcov2=(normcov==null || !STRANDED ? null : normcov.replaceFirst("#", "2")); 1110 String normcovOverall2=(normcovOverall==null ? null : (STRANDED ? normcovOverall.replaceFirst("#", "2") : normcovOverall)); 1111 String histogram2=(histogram==null || !STRANDED ? null : histogram.replaceFirst("#", "2")); 1112 String stats2=(covstats==null || !STRANDED ? null : covstats.replaceFirst("#", "2")); 1113 1114 if(CONCISE){ 1115 writeCoveragePerBaseConcise(basecov1, list, 0, minscaf); 1116 writeCoveragePerBaseConcise(basecov2, list, 1, minscaf); 1117 }else{ 1118 writeCoveragePerBase(basecov1, list, DELTA_ONLY, 0, minscaf); 1119 writeCoveragePerBase(basecov2, list, DELTA_ONLY, 1, minscaf); 1120 } 1121 if(KEEP_SHORT_BINS){ 1122 writeCoveragePerBaseBinned2(bincov1, list, binsize, 0, minscaf); 1123 writeCoveragePerBaseBinned2(bincov2, list, binsize, 1, minscaf); 1124 }else{ 1125 writeCoveragePerBaseBinned(bincov1, list, binsize, 0, minscaf); 1126 writeCoveragePerBaseBinned(bincov2, list, binsize, 1, minscaf); 1127 } 1128 if(normcov!=null){ 1129 writeCoveragePerBaseNormalized(normcov1, list, binsize, 0, minscaf); 1130 writeCoveragePerBaseNormalized(normcov2, list, binsize, 1, minscaf); 1131 } 1132 if(normcovOverall!=null){ 1133 writeCoveragePerBaseNormalizedOverall(normcovOverall1, list, binsize, 0, minscaf); 1134 writeCoveragePerBaseNormalizedOverall(normcovOverall2, list, binsize, 1, minscaf); 1135 } 1136 if(outrpkm!=null){ 1137 writeRPKM(outrpkm, in1, null, readsProcessed, NONZERO_ONLY,list); 1138 } 1139 1140 { 1141 long[] hist=writeStats(stats1, 0); 1142 if(hist!=null){writeHist(histogram1, hist);} 1143 1144 if(STRANDED){ 1145 hist=writeStats(stats2, 1); 1146 if(hist!=null){writeHist(histogram2, hist);} 1147 } 1148 } 1149 1150 final double mult=1.0/refBases; 1151 double depthCovered=mappedBases*mult; 1152 double depthCovered2=mappedNonClippedBases*mult; 1153 double depthCovered3=mappedBasesWithDels*mult; 1154 double pctScaffoldsWithCoverage=scaffoldsWithCoverage1*100.0/totalScaffolds; 1155 double pctCovered=totalCoveredBases1*100*mult; 1156 1157 if(!KEY_VALUE){ 1158 outstream.println("Reads: \t"+readsProcessed); 1159 outstream.println("Mapped reads: \t"+mappedReads); 1160 // outstream.println("Mapped bases: \t"+mappedBases); 1161 // outstream.println("Mapped non-clipped bases: \t"+mappedNonClippedBases); 1162 outstream.println("Mapped bases: \t"+mappedNonClippedBases); 1163 outstream.println("Ref scaffolds: \t"+totalScaffolds); 1164 outstream.println("Ref bases: \t"+refBases); 1165 1166 if(k>0){ 1167 String kcovS=k+"-mer coverage:"; 1168 String kcorrectS="Percent correct "+k+"-mers:"; 1169 while(kcovS.length()<26){kcovS=kcovS+" ";} 1170 while(kcovS.length()<26){kcorrectS=kcorrectS+" ";} 1171 outstream.println(String.format(Locale.ROOT, "\n"+kcovS+" \t%.3f", mappedKmers*1.0/refKmers)); 1172 outstream.println(String.format(Locale.ROOT, kcorrectS+" \t%.3f", 100*correctKmers/kmersProcessed)); 1173 // outstream.println(kmersProcessed+", "+correctKmers); 1174 } 1175 1176 outstream.println(String.format(Locale.ROOT, "\nPercent mapped: \t%.3f", mappedReads*100f/readsProcessed)); 1177 outstream.println(String.format(Locale.ROOT, "Percent proper pairs: \t%.3f", properPairs*100f/readsProcessed)); 1178 // outstream.println(depthCovered); 1179 outstream.println(String.format(Locale.ROOT, "Average coverage: \t%.3f", depthCovered2)); 1180 outstream.println(String.format(Locale.ROOT, "Average coverage with deletions: \t%.3f", depthCovered3)); 1181 if(USE_COVERAGE_ARRAYS && calcCovStdev){ 1182 double[] stdev=standardDeviation(list, 0, minscaf); 1183 outstream.println(String.format(Locale.ROOT, "Standard deviation: \t%.3f", stdev[1])); 1184 } 1185 outstream.println(String.format(Locale.ROOT, "Percent scaffolds with any coverage: \t%.2f", pctScaffoldsWithCoverage)); 1186 if(USE_COVERAGE_ARRAYS || USE_BITSETS){ 1187 outstream.println(String.format(Locale.ROOT, "Percent of reference bases covered: \t%.2f", pctCovered)); 1188 } 1189 }else{ 1190 outstream.println("reads="+readsProcessed); 1191 outstream.println("mappedReads="+mappedReads); 1192 outstream.println("mappedBases="+mappedNonClippedBases); 1193 outstream.println("mappedBasesWithDels="+mappedBasesWithDels); 1194 outstream.println("refScaffolds="+totalScaffolds); 1195 outstream.println("refBases="+refBases); 1196 outstream.println(String.format(Locale.ROOT, "percentMapped=%.3f", mappedReads*100f/readsProcessed)); 1197 outstream.println(String.format(Locale.ROOT, "percentPaired=%.3f", properPairs*100f/readsProcessed)); 1198 outstream.println(String.format(Locale.ROOT, "averageCoverage=%.3f", depthCovered2)); 1199 if(USE_COVERAGE_ARRAYS && calcCovStdev){ 1200 double[] stdev=standardDeviation(list, 0, minscaf); 1201 outstream.println(String.format(Locale.ROOT, "standardDeviation=%.3f", stdev[1])); 1202 } 1203 outstream.println(String.format(Locale.ROOT, "percentCoveredScaffolds=%.2f", pctScaffoldsWithCoverage)); 1204 if(USE_COVERAGE_ARRAYS || USE_BITSETS){ 1205 outstream.println(String.format(Locale.ROOT, "percentCoveredBases=%.2f", pctCovered)); 1206 } 1207 } 1208 } 1209 basesUnderAverageCoverage(final int[] array, final double avg, final int window)1210 public int basesUnderAverageCoverage(final int[] array, final double avg, final int window){ 1211 if(array.length<window){return 0;} 1212 final long limit=(long)Math.ceil(window*avg); 1213 long covSum=0; 1214 int baseCount=0; 1215 for(int i=0; i<window; i++){ 1216 covSum+=array[i]; 1217 } 1218 1219 boolean below=false; 1220 int lastStop=-1, lastStart=0; 1221 for(int a=0, b=window; b<array.length; a++, b++){ 1222 if(covSum>=limit){ 1223 if(below){//end range 1224 baseCount=b-Tools.max(lastStop+1, lastStart); 1225 lastStop=b-1; 1226 below=false; 1227 } 1228 }else{ 1229 if(!below){//start range 1230 lastStart=a; 1231 below=true; 1232 } 1233 } 1234 covSum-=array[a]; 1235 assert(covSum>=0); 1236 covSum+=array[b]; 1237 } 1238 1239 if(below){//end range 1240 baseCount=array.length-Tools.max(lastStop, lastStart); 1241 } 1242 1243 assert(baseCount>=0); 1244 return baseCount; 1245 } 1246 basesUnderAverageCoverage(final char[] array, final double avg, final int window)1247 public int basesUnderAverageCoverage(final char[] array, final double avg, final int window){ 1248 if(array.length<window){return 0;} 1249 final long limit=(long)Math.ceil(window*avg); 1250 long covSum=0; 1251 int baseCount=0; 1252 for(int i=0; i<window; i++){ 1253 covSum+=array[i]; 1254 } 1255 1256 // outstream.println("limit: "+limit); 1257 1258 boolean below=false; 1259 int lastStop=-1, lastStart=0; 1260 for(int a=0, b=window; b<array.length; a++, b++){ 1261 if(covSum>=limit){ 1262 if(below){//end range 1263 baseCount+=b-Tools.max(lastStop+1, lastStart); 1264 1265 // outstream.println("\nprev: "+lastStop+", "+lastStart); 1266 // outstream.println("end range at "+a+", "+b); 1267 // outstream.println("baseCount: "+baseCount+", covSum="+covSum); 1268 1269 lastStop=b-1; 1270 below=false; 1271 } 1272 }else{ 1273 if(!below){//start range 1274 1275 // outstream.println("\nprev: "+lastStop+", "+lastStart); 1276 // outstream.println("start range at "+a+", "+b); 1277 // outstream.println("baseCount: "+baseCount+", covSum="+covSum); 1278 1279 lastStart=a; 1280 below=true; 1281 } 1282 } 1283 covSum-=array[a]; 1284 assert(covSum>=0); 1285 covSum+=array[b]; 1286 } 1287 1288 if(below){//end range 1289 baseCount+=array.length-Tools.max(lastStop+1, lastStart); 1290 1291 // outstream.println("\nprev: "+lastStop+", "+lastStart); 1292 // outstream.println("end range at "+array.length); 1293 // outstream.println("baseCount: "+baseCount+", covSum="+covSum); 1294 } 1295 1296 assert(baseCount>=0); 1297 return baseCount; 1298 } 1299 writeStats(String fname, int strand)1300 public long[] writeStats(String fname, int strand){ 1301 // outstream.println("Writing stats for "+fname+", "+strand); 1302 final TextStreamWriter tsw=(fname==null ? null : new TextStreamWriter(fname, overwrite, false, true)); 1303 1304 if(tsw!=null){ 1305 tsw.start(); 1306 if(printHeader){ 1307 String pound=(headerPound ? "#" : ""); 1308 if(TWOCOLUMN){ 1309 tsw.println(pound+"ID\tAvg_fold"); 1310 }else{ 1311 tsw.println(pound+"ID\tAvg_fold\tLength\tRef_GC\tCovered_percent\tCovered_bases\tPlus_reads\tMinus_reads"+ 1312 (COUNT_GC ? "\tRead_GC" : "")+ 1313 (USE_COVERAGE_ARRAYS ? ("\tMedian_fold\tStd_Dev") : "")+ 1314 (USE_WINDOW ? "\tUnder_"+String.format(Locale.ROOT, "%.0f",LOW_COV_DEPTH)+"/"+LOW_COV_WINDOW : "")); 1315 } 1316 } 1317 //Maximally: 1318 //"ID\tAvg_fold\tLength\tRef_GC\tCovered_percent\tCovered_bases\tPlus_reads\tMinus_reads\tRead_GC\tMedian_fold\tStd_Dev\tUnder_X" 1319 } 1320 1321 final int histmax=(bits32 ? 1000000 : Character.MAX_VALUE); 1322 final LongList hist=new LongList(Character.MAX_VALUE); 1323 1324 long coveredScafTemp=0; 1325 long coveredBaseTemp=0; 1326 for(Scaffold scaf : list){ 1327 final long sum=scaf.basehits; 1328 int covered=0; 1329 int median=0; 1330 int underWindowAverage=0; 1331 final double stdev; 1332 if(USE_COVERAGE_ARRAYS){ 1333 CoverageArray ca=(CoverageArray)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1); 1334 if(ca!=null){ 1335 for(int i=0; i<scaf.length; i++){ 1336 int x=ca.get(i); 1337 hist.increment(Tools.min(x, histmax)); 1338 // sum+=x; 1339 if(x>=minDepthToBeCovered){covered++;} 1340 } 1341 if(bits32){ 1342 int[] array=((CoverageArray3)ca).array; 1343 stdev=Tools.standardDeviation(array); 1344 underWindowAverage=basesUnderAverageCoverage(array, LOW_COV_DEPTH, LOW_COV_WINDOW); 1345 Shared.sort(array); 1346 Tools.reverseInPlace(array); 1347 median=ca.get(scaf.length/2); 1348 }else{ 1349 char[] array=((CoverageArray2)ca).array; 1350 stdev=Tools.standardDeviation(array); 1351 underWindowAverage=basesUnderAverageCoverage(array, LOW_COV_DEPTH, LOW_COV_WINDOW); 1352 Arrays.sort(array); 1353 Tools.reverseInPlace(array); 1354 median=ca.get(scaf.length/2); 1355 } 1356 }else{ 1357 hist.increment(0, scaf.length); 1358 stdev=0; 1359 } 1360 }else if(USE_BITSETS){ 1361 // sum+=scaf.basehits; 1362 BitSet bs=(BitSet)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1); 1363 covered=(bs==null ? 0 : bs.cardinality()); 1364 stdev=-1; 1365 }else{ 1366 stdev=-1; 1367 } 1368 1369 if(sum>0){ 1370 coveredScafTemp++; 1371 } 1372 // pw.print(scaf.name); 1373 if(tsw!=null && (sum>0 || !NONZERO_ONLY) && scaf.length>=minscaf){ 1374 if(TWOCOLUMN){ 1375 tsw.print(String.format(Locale.ROOT, "%s\t%.4f\n", scaf.name, sum/(double)scaf.length)); 1376 }else if(COUNT_GC){ 1377 long[] bc=scaf.basecount; 1378 double gc=(bc[1]+bc[2])*1d/Data.max(1, bc[0]+bc[1]+bc[2]+bc[3]); 1379 if(USE_COVERAGE_ARRAYS){ 1380 if(USE_WINDOW){ 1381 1382 //Variable portion: 1383 //"\tRead_GC\tMedian_fold\tStd_Dev\tUnder_X\n" 1384 //\t%.4f\t%d\t%.2f\t%d\n 1385 1386 tsw.print(String.format(Locale.ROOT, "%s\t%.4f\t%d\t%.4f\t%.4f\t%d\t%d\t%d\t%.4f\t%d\t%.2f\t%d\n", scaf.name, sum/(double)scaf.length, scaf.length, 1387 scaf.gc, covered*100d/scaf.length, covered, (scaf.readhits-scaf.readhitsMinus), scaf.readhitsMinus, 1388 gc, median, stdev, underWindowAverage)); 1389 }else{ 1390 tsw.print(String.format(Locale.ROOT, "%s\t%.4f\t%d\t%.4f\t%.4f\t%d\t%d\t%d\t%.4f\t%d\t%.2f\n", scaf.name, sum/(double)scaf.length, scaf.length, 1391 scaf.gc, covered*100d/scaf.length, covered, (scaf.readhits-scaf.readhitsMinus), scaf.readhitsMinus, 1392 gc, median, stdev)); 1393 } 1394 }else{ 1395 tsw.print(String.format(Locale.ROOT, "%s\t%.4f\t%d\t%.4f\t%.4f\t%d\t%d\t%d\t%.4f\n", scaf.name, sum/(double)scaf.length, scaf.length, 1396 scaf.gc, covered*100d/scaf.length, covered, (scaf.readhits-scaf.readhitsMinus), scaf.readhitsMinus, 1397 gc)); 1398 } 1399 }else{ 1400 if(USE_COVERAGE_ARRAYS){ 1401 if(USE_WINDOW){ 1402 tsw.print(String.format(Locale.ROOT, "%s\t%.4f\t%d\t%.4f\t%.4f\t%d\t%d\t%d\t%d\t%.2f\t%d\n", scaf.name, sum/(double)scaf.length, scaf.length, 1403 scaf.gc, covered*100d/scaf.length, covered, (scaf.readhits-scaf.readhitsMinus), scaf.readhitsMinus, 1404 median, stdev, underWindowAverage)); 1405 }else{ 1406 tsw.print(String.format(Locale.ROOT, "%s\t%.4f\t%d\t%.4f\t%.4f\t%d\t%d\t%d\t%d\t%.2f\n", scaf.name, sum/(double)scaf.length, scaf.length, 1407 scaf.gc, covered*100d/scaf.length, covered, (scaf.readhits-scaf.readhitsMinus), scaf.readhitsMinus, 1408 median, stdev)); 1409 } 1410 }else{ 1411 tsw.print(String.format(Locale.ROOT, "%s\t%.4f\t%d\t%.4f\t%.4f\t%d\t%d\t%d\n", scaf.name, sum/(double)scaf.length, scaf.length, 1412 scaf.gc, covered*100d/scaf.length, covered, (scaf.readhits-scaf.readhitsMinus), scaf.readhitsMinus)); 1413 } 1414 } 1415 } 1416 coveredBaseTemp+=covered; 1417 } 1418 1419 if(strand==0){ 1420 scaffoldsWithCoverage1+=coveredScafTemp; 1421 totalCoveredBases1+=coveredBaseTemp; 1422 }else{ 1423 scaffoldsWithCoverage2+=coveredScafTemp; 1424 totalCoveredBases2+=coveredBaseTemp; 1425 } 1426 1427 if(tsw!=null){tsw.poisonAndWait();} 1428 return hist==null ? null : hist.array; 1429 } 1430 1431 /** 1432 * Write a histogram of number of bases covered to each depth 1433 * @param fname Output filename 1434 * @param counts counts[X] stores the number of bases with coverage X 1435 */ writeHist(String fname, long[] counts)1436 public static void writeHist(String fname, long[] counts){ 1437 if(fname==null){return;} 1438 assert(counts!=null) : "Can't write a histogram with null counts."; 1439 ByteStreamWriter tsw=new ByteStreamWriter(fname, overwrite, false, false); 1440 tsw.start(); 1441 if(printHeader){ 1442 if(headerPound){tsw.print('#');} 1443 tsw.println("Coverage\tnumBases"); 1444 } 1445 int max=0; 1446 for(max=counts.length-1; max>0 && counts[max]==0; max--){} 1447 for(int i=0; i<=max; i++){ 1448 long x=counts[i]; 1449 tsw.print(i); 1450 tsw.print('\t'); 1451 tsw.println(x); 1452 } 1453 1454 tsw.poisonAndWait(); 1455 } 1456 1457 /** 1458 * Prints coverage in this format: 1459 * scafname TAB position TAB coverage 1460 * scafname TAB position TAB coverage 1461 * @param fname Output filename 1462 * @param list List of reference scaffolds 1463 * @param deltaOnly Only write lines when coverage changes 1464 * @param strand Only use coverage from reads mapped to this strand (0=plus, 1=minus) 1465 */ writeCoveragePerBase(String fname, ArrayList<Scaffold> list, boolean deltaOnly, int strand, int minscaf)1466 public void writeCoveragePerBase(String fname, ArrayList<Scaffold> list, boolean deltaOnly, int strand, int minscaf){ 1467 if(fname==null || (!STRANDED && strand>0)){return;} 1468 1469 if(verbose){outstream.println("Starting tsw "+fname);} 1470 ByteStreamWriter tsw=new ByteStreamWriter(fname, overwrite, false, true); 1471 if(verbose){outstream.println("Created tsw "+fname);} 1472 tsw.start(); 1473 // if(verbose){outstream.println("Started tsw "+fname);} 1474 if(printHeader){ 1475 if(headerPound){tsw.print('#');} 1476 tsw.println("RefName\tPos\tCoverage"); 1477 } 1478 1479 for(Scaffold scaf : list){ 1480 int last=-1; 1481 CoverageArray ca=(CoverageArray)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1); 1482 if(scaf.length>=minscaf){ 1483 for(int i=0, len=scaf.length; i<len; i++){ 1484 int x=(ca==null ? 0 : ca.get(i)); 1485 if(!deltaOnly || x!=last){ 1486 if(x>0 || !NONZERO_ONLY){ 1487 tsw.print(scaf.name); 1488 tsw.print('\t'); 1489 tsw.print(i); 1490 tsw.print('\t'); 1491 tsw.println(x); 1492 } 1493 last=x; 1494 } 1495 } 1496 } 1497 } 1498 1499 if(verbose){outstream.println("Closing tsw "+fname);} 1500 tsw.poisonAndWait(); 1501 if(verbose){outstream.println("Closed tsw "+fname);} 1502 } 1503 1504 /** 1505 * Prints coverage in this format, skipping zero-coverage positions: 1506 * #scafname 1507 * position TAB coverage 1508 * position TAB coverage 1509 * @param fname Output filename 1510 * @param list List of reference scaffolds 1511 * @param strand Only use coverage from reads mapped to this strand (0=plus, 1=minus) 1512 */ writeCoveragePerBaseConcise(String fname, ArrayList<Scaffold> list, int strand, int minscaf)1513 public void writeCoveragePerBaseConcise(String fname, ArrayList<Scaffold> list, int strand, int minscaf){ 1514 if(fname==null || (!STRANDED && strand>0)){return;} 1515 1516 if(verbose){outstream.println("Starting tsw "+fname);} 1517 ByteStreamWriter tsw=new ByteStreamWriter(fname, overwrite, false, true); 1518 tsw.start(); 1519 if(verbose){outstream.println("Started tsw "+fname);} 1520 // tsw.print(pound+"RefName\tPos\tCoverage\n"); 1521 1522 for(Scaffold scaf : list){ 1523 tsw.print('#'); 1524 tsw.println(scaf.name); 1525 CoverageArray ca=(CoverageArray)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1); 1526 if(scaf.length>=minscaf){ 1527 for(int i=0; i<scaf.length; i++){ 1528 int x=(ca==null ? 0 : ca.get(i)); 1529 if(x>0){ 1530 tsw.print(i); 1531 tsw.print('\t'); 1532 tsw.println(x); 1533 } 1534 } 1535 } 1536 } 1537 1538 if(verbose){outstream.println("Closing tsw "+fname);} 1539 tsw.poisonAndWait(); 1540 if(verbose){outstream.println("Closed tsw "+fname);} 1541 // assert(false); 1542 } 1543 1544 /** 1545 * Note. As written, this will truncate all trailing bases of each scaffold's length modulo binsize. 1546 * For example, with binsize 1000, the last 500 bases of a 1500 base scaffold will be ignored. 1547 * @param fname Output filename 1548 * @param list List of reference scaffolds 1549 * @param binsize Width of coverage bins in bp 1550 * @param strand Only use coverage from reads mapped to this strand (0=plus, 1=minus) 1551 */ writeCoveragePerBaseBinned(String fname, ArrayList<Scaffold> list, int binsize, int strand, int minscaf)1552 public static void writeCoveragePerBaseBinned(String fname, ArrayList<Scaffold> list, int binsize, int strand, int minscaf){ 1553 if(fname==null || (!STRANDED && strand>0)){return;} 1554 TextStreamWriter tsw=new TextStreamWriter(fname, overwrite, false, false); 1555 tsw.start(); 1556 if(printHeader){ 1557 String pound=(headerPound ? "#" : ""); 1558 if(calcCovStdev){ 1559 double[] stdev=standardDeviation(list, strand, minscaf); 1560 if(stdev!=null){ 1561 tsw.print(pound+"Mean\t"+String.format(Locale.ROOT, "%.3f", stdev[0])+"\n"); 1562 tsw.print(pound+"STDev\t"+String.format(Locale.ROOT, "%.3f", stdev[1])+"\n"); 1563 } 1564 } 1565 tsw.print(pound+"RefName\tCov\tPos\tRunningPos\n"); 1566 } 1567 1568 long running=0; 1569 final float invbin=1f/binsize; 1570 for(Scaffold scaf : list){ 1571 if(scaf.length>=binsize && scaf.length>=minscaf){ 1572 CoverageArray ca=(CoverageArray)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1); 1573 int lastPos=-1, nextPos=binsize-1; 1574 long sum=0; 1575 for(int i=0; i<scaf.length; i++){ 1576 int x=(ca==null ? 0 : ca.get(i)); 1577 sum+=x; 1578 if(i>=nextPos){ 1579 if(sum>0 || !NONZERO_ONLY){ 1580 tsw.print(String.format(Locale.ROOT, "%s\t%.2f\t%d\t%d\n", scaf.name, sum*invbin, (i+1), running)); 1581 } 1582 lastPos=i; 1583 running+=binsize; 1584 nextPos+=binsize; 1585 sum=0; 1586 } 1587 } 1588 } 1589 } 1590 1591 tsw.poisonAndWait(); 1592 } 1593 1594 /** 1595 * This version will NOT truncate all trailing bases of each scaffold's length modulo binsize. 1596 * @param fname Output filename 1597 * @param list List of reference scaffolds 1598 * @param binsize Width of coverage bins in bp 1599 * @param strand Only use coverage from reads mapped to this strand (0=plus, 1=minus) 1600 */ writeCoveragePerBaseBinned2(String fname, ArrayList<Scaffold> list, int binsize, int strand, int minscaf)1601 public static void writeCoveragePerBaseBinned2(String fname, ArrayList<Scaffold> list, int binsize, int strand, int minscaf){ 1602 if(fname==null || (!STRANDED && strand>0)){return;} 1603 TextStreamWriter tsw=new TextStreamWriter(fname, overwrite, false, false); 1604 tsw.start(); 1605 if(printHeader){ 1606 String pound=(headerPound ? "#" : ""); 1607 if(calcCovStdev){ 1608 double[] stdev=standardDeviationBinned(list, binsize, strand, minscaf); 1609 if(stdev!=null){ 1610 tsw.print(pound+"Mean\t"+String.format(Locale.ROOT, "%.3f", stdev[0])+"\n"); 1611 tsw.print(pound+"STDev\t"+String.format(Locale.ROOT, "%.3f", stdev[1])+"\n"); 1612 } 1613 } 1614 tsw.print(pound+"RefName\tCov\tPos\tRunningPos\n"); 1615 } 1616 1617 long running=0; 1618 for(Scaffold scaf : list){ 1619 CoverageArray ca=(CoverageArray)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1); 1620 int lastPos=-1, nextPos=binsize-1; 1621 long sum=0; 1622 final int lim=scaf.length-1; 1623 for(int i=0; i<scaf.length; i++){ 1624 int x=(ca==null ? 0 : ca.get(i)); 1625 sum+=x; 1626 if(i>=nextPos || i==lim){ 1627 int bin=(i-lastPos); 1628 if(scaf.length>=minscaf){ 1629 if(sum>0 || !NONZERO_ONLY){ 1630 tsw.print(String.format(Locale.ROOT, "%s\t%.2f\t%d\t%d\n", scaf.name, sum/(float)bin, (i+1), running)); 1631 } 1632 } 1633 running+=bin; 1634 nextPos+=binsize; 1635 lastPos=i; 1636 sum=0; 1637 } 1638 } 1639 } 1640 1641 tsw.poisonAndWait(); 1642 } 1643 1644 //Unsafe because it will fail if there are over 2 billion bins standardDeviationBinnedUnsafe(String fname, ArrayList<Scaffold> scaffolds, int binsize, int strand, int minscaf)1645 public static double[] standardDeviationBinnedUnsafe(String fname, ArrayList<Scaffold> scaffolds, int binsize, int strand, int minscaf){ 1646 1647 LongList list=new LongList(); 1648 for(Scaffold scaf : scaffolds){ 1649 CoverageArray ca=(CoverageArray)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1); 1650 int lastPos=-1, nextPos=binsize-1; 1651 long sum=0; 1652 final int lim=scaf.length-1; 1653 for(int i=0; i<scaf.length; i++){ 1654 int x=(ca==null ? 0 : ca.get(i)); 1655 sum+=x; 1656 if(i>=nextPos || i==lim){ 1657 int bin=(i-lastPos); 1658 if(scaf.length>=minscaf){ 1659 list.add((int)(10*(sum/(double)bin))); 1660 } 1661 nextPos+=binsize; 1662 lastPos=i; 1663 sum=0; 1664 } 1665 } 1666 } 1667 list.sort(); 1668 1669 double mean=0.1*list.mean(); 1670 double median=0.1*list.median(); 1671 double mode=0.1*list.mode(); 1672 double stdev=0.1*list.stdev(); 1673 return new double[] {mean, median, mode, stdev}; 1674 } 1675 standardDeviationBinned(ArrayList<Scaffold> scaffolds, int binsize, int strand, int minscaf)1676 public static double[] standardDeviationBinned(ArrayList<Scaffold> scaffolds, int binsize, int strand, int minscaf){ 1677 double totalSum=0; 1678 long bins=0; 1679 1680 for(Scaffold scaf : scaffolds){ 1681 if(scaf.length>=minscaf){ 1682 CoverageArray ca=(CoverageArray)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1); 1683 int lastPos=-1, nextPos=binsize-1; 1684 long tempSum=0; 1685 final int lim=scaf.length-1; 1686 for(int i=0; i<scaf.length; i++){ 1687 int x=(ca==null ? 0 : ca.get(i)); 1688 tempSum+=x; 1689 if(i>=nextPos || i==lim){ 1690 int bin=(i-lastPos); 1691 double depth=(tempSum/(double)bin); 1692 totalSum+=depth; 1693 bins++; 1694 nextPos+=binsize; 1695 lastPos=i; 1696 tempSum=0; 1697 } 1698 } 1699 } 1700 } 1701 1702 if(bins<1){return new double[] {0, 0};} 1703 final double mean=totalSum/(double)bins; 1704 double sumdev2=0; 1705 1706 for(Scaffold scaf : scaffolds){ 1707 if(scaf.length>=minscaf){ 1708 CoverageArray ca=(CoverageArray)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1); 1709 int lastPos=-1, nextPos=binsize-1; 1710 long tempSum=0; 1711 final int lim=scaf.length-1; 1712 for(int i=0; i<scaf.length; i++){ 1713 int x=(ca==null ? 0 : ca.get(i)); 1714 tempSum+=x; 1715 if(i>=nextPos || i==lim){ 1716 int bin=(i-lastPos); 1717 double depth=(tempSum/(double)bin); 1718 double dev=mean-depth; 1719 sumdev2+=(dev*dev); 1720 nextPos+=binsize; 1721 lastPos=i; 1722 tempSum=0; 1723 } 1724 } 1725 } 1726 } 1727 1728 final double stdev=Math.sqrt(sumdev2/bins); 1729 return new double[] {mean, stdev}; 1730 } 1731 standardDeviation(ArrayList<Scaffold> scaffolds, int strand, int minscaf)1732 public static double[] standardDeviation(ArrayList<Scaffold> scaffolds, int strand, int minscaf){ 1733 long totalSum=0, bins=0; 1734 for(Scaffold scaf : scaffolds){ 1735 // System.err.println(scaf.name+", "+scaf.length+"/"+minscaf); 1736 if(scaf.length>=minscaf){ 1737 final CoverageArray ca=(CoverageArray)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1); 1738 bins+=scaf.length; 1739 for(int i=0; i<scaf.length; i++){ 1740 int depth=(ca==null ? 0 : ca.get(i)); 1741 totalSum+=depth; 1742 } 1743 } 1744 } 1745 1746 if(bins<1){return new double[] {0, 0};} 1747 final double mean=totalSum/(double)bins; 1748 double sumdev2=0; 1749 // System.err.println(totalSum+", "+bins+", "+mean); 1750 for(Scaffold scaf : scaffolds){ 1751 if(scaf.length>=minscaf){ 1752 final CoverageArray ca=(CoverageArray)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1); 1753 double sumTemp=0; 1754 for(int i=0; i<scaf.length; i++){ 1755 int depth=(ca==null ? 0 : ca.get(i)); 1756 double dev=mean-depth; 1757 sumTemp+=(dev*dev); 1758 } 1759 sumdev2+=sumTemp; 1760 } 1761 } 1762 1763 final double stdev=Math.sqrt(sumdev2/bins); 1764 // System.err.println(totalSum+", "+bins+", "+mean+", "+stdev); 1765 return new double[] {mean, stdev}; 1766 } 1767 1768 1769 /** 1770 * @param fname Output filename 1771 * @param list List of reference scaffolds 1772 * @param binsize Width of coverage bins in bp 1773 * @param strand Only use coverage from reads mapped to this strand (0=plus, 1=minus) 1774 */ writeCoveragePerBaseNormalized(String fname, ArrayList<Scaffold> list, double binsize, int strand, int minscaf)1775 public static void writeCoveragePerBaseNormalized(String fname, ArrayList<Scaffold> list, double binsize, int strand, int minscaf){ 1776 if(fname==null || (!STRANDED && strand>0)){return;} 1777 TextStreamWriter tsw=new TextStreamWriter(fname, overwrite, false, false); 1778 tsw.start(); 1779 if(printHeader){ 1780 String pound=(headerPound ? "#" : ""); 1781 tsw.print(pound+"RefName\tBin\tCov\tPos\tRunningPos\n"); 1782 } 1783 1784 double running=0; 1785 double invbin=1.0/binsize; 1786 final double invbincount=1.0/NORMALIZE_LENGTH_BINS; 1787 for(Scaffold scaf : list){ 1788 if(NORMALIZE_LENGTH_BINS>0){ 1789 binsize=scaf.length*invbincount; 1790 invbin=1.0/binsize; 1791 } 1792 1793 if(scaf.length>=binsize && scaf.length>=minscaf){ 1794 long max=-1; 1795 1796 final CoverageArray ca=(CoverageArray)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1); 1797 double lastPos=-1, nextPos=binsize-1; 1798 long sum=0; 1799 1800 if(NORMALIZE_COVERAGE){ 1801 for(int i=0; i<scaf.length; i++){ 1802 int x=(ca==null ? 0 : ca.get(i)); 1803 sum+=x; 1804 if(i>=nextPos){ 1805 max=Tools.max(sum, max); 1806 running+=binsize; 1807 nextPos+=binsize; 1808 sum=0; 1809 } 1810 } 1811 lastPos=-1; 1812 nextPos=binsize-1; 1813 sum=0; 1814 assert(max>-1) : max; 1815 } 1816 max=Tools.max(max, 1); 1817 final double binmult=(NORMALIZE_COVERAGE ? 1d/max : invbin); 1818 1819 // assert(false) : NORMALIZE_COVERAGE+", "+binmult+", "+invbin+", "+max+", "+binsize; 1820 1821 final String formatString=NORMALIZE_COVERAGE ? "%s\t%d\t%.5f\t%d\t%d\n" : "%s\t%d\t%.2f\t%d\t%d\n"; 1822 int bin=1; 1823 for(int i=0; i<scaf.length; i++){ 1824 int x=(ca==null ? 0 : ca.get(i)); 1825 sum+=x; 1826 if(i>=nextPos){ 1827 // outstream.println(x+", "+i+", "+nextPos+", "+sum+", "+(sum*binmult)); 1828 tsw.print(String.format(formatString, scaf.name, bin, sum*binmult, (i+1), (long)running)); 1829 bin++; 1830 lastPos=i; 1831 running+=binsize; 1832 nextPos+=binsize; 1833 sum=0; 1834 } 1835 } 1836 } 1837 } 1838 1839 tsw.poisonAndWait(); 1840 } 1841 1842 1843 1844 /** 1845 * @param fname Output filename 1846 * @param list List of reference scaffolds 1847 * @param binsize Width of coverage bins in bp 1848 * @param strand Only use coverage from reads mapped to this strand (0=plus, 1=minus) 1849 */ writeCoveragePerBaseNormalizedOverall(String fname, ArrayList<Scaffold> list, double binsize, int strand, int minscaf)1850 public static void writeCoveragePerBaseNormalizedOverall(String fname, ArrayList<Scaffold> list, double binsize, int strand, int minscaf){ 1851 if(fname==null || (!STRANDED && strand>0)){return;} 1852 1853 assert(NORMALIZE_LENGTH_BINS>0) : "Must set 'normalizebins' flag to a positive integer."; 1854 double running=0; 1855 double invbin=1.0/binsize; 1856 long usedScafs=0; 1857 final double invbincount=1.0/NORMALIZE_LENGTH_BINS; 1858 1859 double[] normalized=new double[NORMALIZE_LENGTH_BINS+1]; 1860 double[] absolute=new double[NORMALIZE_LENGTH_BINS+1]; 1861 1862 for(Scaffold scaf : list){ 1863 if(NORMALIZE_LENGTH_BINS>0){ 1864 binsize=scaf.length*invbincount; 1865 invbin=1.0/binsize; 1866 } 1867 1868 if(scaf.length>=binsize && scaf.length>=minscaf){ 1869 usedScafs++; 1870 1871 if(scaf.readhits>0){ 1872 long max=-1; 1873 final CoverageArray ca=(CoverageArray)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1); 1874 double lastPos=-1, nextPos=binsize-1; 1875 long sum=0; 1876 1877 { 1878 for(int i=0; i<scaf.length; i++){ 1879 int x=(ca==null ? 0 : ca.get(i)); 1880 sum+=x; 1881 if(i>=nextPos){ 1882 max=Tools.max(sum, max); 1883 running+=binsize; 1884 nextPos+=binsize; 1885 sum=0; 1886 } 1887 } 1888 lastPos=-1; 1889 nextPos=binsize-1; 1890 sum=0; 1891 assert(max>-1) : max; 1892 } 1893 max=Tools.max(max, 1); 1894 final double binmult=1d/max; 1895 1896 // assert(false) : NORMALIZE_COVERAGE+", "+binmult+", "+invbin+", "+max+", "+binsize; 1897 1898 int bin=1; 1899 for(int i=0; i<scaf.length; i++){ 1900 int x=(ca==null ? 0 : ca.get(i)); 1901 sum+=x; 1902 if(i>=nextPos){ 1903 normalized[bin]+=(sum*binmult); 1904 absolute[bin]+=(sum*invbin); 1905 bin++; 1906 lastPos=i; 1907 running+=binsize; 1908 nextPos+=binsize; 1909 sum=0; 1910 } 1911 } 1912 } 1913 } 1914 } 1915 1916 TextStreamWriter tsw=new TextStreamWriter(fname, overwrite, false, false); 1917 tsw.start(); 1918 if(printHeader){ 1919 String pound=(headerPound ? "#" : ""); 1920 tsw.print(pound+"RefName\tBin\tAbsCov\tNormCov\n"); 1921 } 1922 double invScafs=1d/Tools.max(1, usedScafs); 1923 1924 final double maxNorm=Tools.max(normalized); 1925 final double normMult=1/maxNorm; 1926 1927 for(int bin=1; bin<normalized.length; bin++){ 1928 // assert((absolute[bin]*invScafs)!=Double.NaN && (normalized[bin]*invScafs)!=Double.NaN) : invScafs+", "+absolute[bin]+", "+normalized[bin]; 1929 // assert(false) : invScafs+", "+absolute[bin]+", "+normalized[bin]+", "+(absolute[bin]*invScafs)+", "+(normalized[bin]*invScafs); 1930 tsw.print(String.format(Locale.ROOT, "%s\t%d\t%.5f\t%.5f\n", "all", bin, absolute[bin]*invScafs, normalized[bin]*normMult)); 1931 } 1932 1933 tsw.poisonAndWait(); 1934 } 1935 1936 1937 1938 /** 1939 * Write RPKM statistics. 1940 */ writeRPKM(String out, String in1, String in2, long readsIn, boolean printNonZeroOnly, ArrayList<Scaffold> list)1941 public static void writeRPKM(String out, String in1, String in2, long readsIn, boolean printNonZeroOnly, ArrayList<Scaffold> list){ 1942 if(out==null){return;} 1943 final TextStreamWriter tsw=new TextStreamWriter(out, overwrite, false, false); 1944 tsw.start(); 1945 1946 /* Count mapped reads */ 1947 long mappedReads=0; 1948 long mappedFrags=0; 1949 for(Scaffold scaf : list){ 1950 mappedReads+=scaf.readhits; 1951 mappedFrags+=scaf.fraghits; 1952 } 1953 mappedFrags/=2; 1954 1955 /* Print header */ 1956 tsw.print("#File\t"+(in1==null ? "" : in1)+(in2==null ? "" : "\t"+in2)+"\n"); 1957 tsw.print(String.format(Locale.ROOT, "#Reads\t%d\n",readsIn)); 1958 tsw.print(String.format(Locale.ROOT, "#Mapped\t%d\n",mappedReads)); 1959 tsw.print(String.format(Locale.ROOT, "#RefSequences\t%d\n",list.size())); 1960 tsw.print("#Name\tLength\tBases\tCoverage\tReads\tRPKM\tFrags\tFPKM\n"); 1961 1962 final float readMult=1000000000f/Tools.max(1, mappedReads); 1963 final float fragMult=1000000000f/Tools.max(1, mappedFrags); 1964 1965 /* Print data */ 1966 for(final Scaffold scaf : list){ 1967 final long reads=scaf.readhits; 1968 final long frags=scaf.fraghits/2; 1969 final long bases=scaf.basehits; 1970 final String s=scaf.name; 1971 final int len=scaf.length; 1972 final double invlen=1.0/Tools.max(1, len); 1973 final double readMult2=readMult*invlen; 1974 final double fragMult2=fragMult*invlen; 1975 if(reads>0 || !printNonZeroOnly){ 1976 tsw.print(String.format(Locale.ROOT, "%s\t%d\t%d\t%.4f\t%d\t%.4f\t%d\t%.4f\n",s,len,bases,bases*invlen,reads,reads*readMult2,frags,frags*fragMult2)); 1977 } 1978 } 1979 tsw.poisonAndWait(); 1980 } 1981 1982 1983 /*--------------------------------------------------------------*/ 1984 /*---------------- Fields ----------------*/ 1985 /*--------------------------------------------------------------*/ 1986 1987 /** The list of all scaffolds */ 1988 private ArrayList<Scaffold> list; 1989 /** Maps names to scaffolds */ 1990 private HashMap<String, Scaffold> table; 1991 /** Converts BBMap index coordinates to scaffold coordinates */ 1992 private final ScaffoldCoordinates coords=new ScaffoldCoordinates(), coords2=new ScaffoldCoordinates(); 1993 1994 /** Mapping program name */ 1995 private String program=null; 1996 /** Mapping program version */ 1997 private String version=null; 1998 1999 //Inputs 2000 /** Primary input file (typically sam) */ 2001 public String in1=null; 2002 /** Secondary input file (typically for coverage directly from BBMap) */ 2003 public String in2=null; 2004 /** Optional, for calculating GC */ 2005 public String reference=null; 2006 public String orffasta=null; 2007 2008 //Outputs 2009 /** Stream unaltered sam input to this output */ 2010 public String outsam=null; 2011 /** Coverage statistics, one line per scaffold */ 2012 public String covstats=null; 2013 public String outorf=null; 2014 /** Coverage histogram, one line per depth and one point per base */ 2015 public String histogram=null; 2016 /** Coverage with one line per base */ 2017 public String basecov=null; 2018 /** Coverage with one file per scaffold */ 2019 public String basecov_ps=null; 2020 /** Coverage with one line per bin */ 2021 public String bincov=null; 2022 /** Coverage with one line per bin, normalized by length and/or height */ 2023 public String normcov=null; 2024 /** Coverage with one line per bin, normalized by length and/or height, for combined reference */ 2025 public String normcovOverall=null; 2026 /** rpkm/fpkm output, similar to Seal */ 2027 public String outrpkm=null; 2028 2029 /** Typically indicates that a header line was encountered in an unexpected place, e.g. with concatenated sam files. */ 2030 private boolean error=false; 2031 2032 private boolean warned=false; 2033 private final boolean EA=Shared.EA(); 2034 2035 /** Total length of reference */ 2036 public long refBases=0; 2037 public long mappedBases=0; 2038 public long mappedNonClippedBases=0; 2039 public long mappedBasesWithDels=0; 2040 public long mappedReads=0; 2041 public long properPairs=0; 2042 public long readsProcessed=0; 2043 public long basesProcessed=0; 2044 public long kmersProcessed=0; 2045 public long mappedKmers=0; 2046 public double correctKmers=0; 2047 public long totalCoveredBases1=0; 2048 public long totalCoveredBases2=0; 2049 public long scaffoldsWithCoverage1=0; 2050 public long scaffoldsWithCoverage2=0; 2051 public long totalScaffolds=0; 2052 2053 public int k=0; 2054 2055 //Don't reset these variables when clearing. 2056 public long maxReads=-1; 2057 public int initialScaffolds=4096; 2058 public int binsize=1000; 2059 public boolean bits32=false; 2060 public int minMapq=0; 2061 public boolean useStreamer=true; 2062 public int streamerThreads=2; 2063 public int minDepthToBeCovered=1; 2064 2065 private boolean qtrimLeft=false; 2066 private boolean qtrimRight=false; 2067 private float trimq=-1; 2068 private final float trimE; 2069 private int border=0; 2070 2071 /** Don't print coverage info for scaffolds shorter than this */ 2072 public int minscaf=0; 2073 2074 public HashMap<String, SamLine> pairTable=new HashMap<String, SamLine>(); 2075 2076 public PrintStream outstream=System.err; 2077 2078 private boolean errorState=false; 2079 2080 /*--------------------------------------------------------------*/ 2081 /*---------------- Static Fields ----------------*/ 2082 /*--------------------------------------------------------------*/ 2083 2084 2085 /** Permission to overwrite existing files */ 2086 public static boolean overwrite=false; 2087 /** Permission to append to existing files */ 2088 public static boolean append=false; 2089 /** Print verbose log messages */ 2090 public static boolean verbose=false; 2091 /** Print the arguments to main */ 2092 public static boolean printCommand=true; 2093 2094 /** Print headers in output files */ 2095 public static boolean printHeader=true; 2096 /** Prepend '#' symbol to header lines */ 2097 public static boolean headerPound=true; 2098 /** Calculate standard deviation of coverage */ 2099 public static boolean calcCovStdev=true; 2100 2101 /** Window size to use when calculating average coverage, 2102 * for detecting contiguous low-coverage areas */ 2103 public static int LOW_COV_WINDOW=500; 2104 /** Min average coverage to not be classified as low-depth */ 2105 public static double LOW_COV_DEPTH=5; 2106 /** Print number of bases below a certain average coverage in a window */ 2107 public static boolean USE_WINDOW=false; 2108 2109 /** Track base composition of reads covering each scaffold */ 2110 public static boolean COUNT_GC=true; 2111 /** Output in 2-column format ("#ID\tAvg_fold\n") */ 2112 public static boolean TWOCOLUMN=false; 2113 /** Track coverage for strands independently */ 2114 public static boolean STRANDED=false; 2115 /** Add scaffold information from the reference (in addition to sam header) */ 2116 public static boolean ADD_FROM_REF=true; 2117 /** Add scaffold information from reads mapped to unknown scaffolds (in addition to sam header) */ 2118 public static boolean ADD_FROM_READS=false; 2119 /** Only print scaffolds with nonzero coverage */ 2120 public static boolean NONZERO_ONLY=false; 2121 /** Store coverage info in numeric arrays */ 2122 public static boolean USE_COVERAGE_ARRAYS=true; 2123 /** Store coverage info in bitsets */ 2124 public static boolean USE_BITSETS=false; 2125 /** Only print lines when coverage changes (for compatibility with Jigsaw) */ 2126 public static boolean DELTA_ONLY=false; 2127 /** Process secondary alignments */ 2128 public static boolean USE_SECONDARY=true; 2129 /** Include coverage of unsequenced middle portion of pairs */ 2130 public static boolean PHYSICAL_COVERAGE=false; 2131 /** Use 'tlen' field when calculating physical coverage */ 2132 public static boolean USE_TLEN=true; 2133 /** Abort on error; otherwise, errors may be ignored */ 2134 public static boolean ABORT_ON_ERROR=true; 2135 /** Print coverage for the last bin of a scaffold, even if it is shorter than binsize */ 2136 public static boolean KEEP_SHORT_BINS=true; 2137 /** Only track coverage for start location */ 2138 public static boolean START_ONLY=false; 2139 /** Only track coverage for stop location */ 2140 public static boolean STOP_ONLY=false; 2141 /** This appears to be the same as nonzeroonly... */ 2142 public static boolean CONCISE=false; 2143 /** Normalize coverage by expression contig coverage as a fraction of its max coverage */ 2144 public static boolean NORMALIZE_COVERAGE=false; 2145 /** Normalize contig length by binning into this many bins per contig */ 2146 public static int NORMALIZE_LENGTH_BINS=100; 2147 /** Include soft-clipped bases in coverage */ 2148 public static boolean INCLUDE_SOFT_CLIP=false; 2149 /** Include deletions/introns in coverage */ 2150 public static boolean INCLUDE_DELETIONS=true; 2151 /** Include reads flagged as duplicates in coverage */ 2152 public static boolean INCLUDE_DUPLICATES=true; 2153 2154 public static boolean KEY_VALUE; 2155 2156 /** Translation array for tracking base counts */ 2157 private static final byte[] charToNum=AssemblyStats2.makeCharToNum(); 2158 2159 private static final int NOTHING_MODE=0, BITSET_MODE=1, ARRAY_MODE=2; 2160 2161 } 2162