1 package jgi;
2 
3 import java.io.File;
4 import java.io.PrintStream;
5 import java.util.ArrayList;
6 import java.util.Arrays;
7 import java.util.BitSet;
8 import java.util.HashMap;
9 import java.util.Locale;
10 
11 import align2.QualityTools;
12 import dna.ChromosomeArray;
13 import dna.Data;
14 import dna.Scaffold;
15 import fileIO.ByteFile;
16 import fileIO.ByteStreamWriter;
17 import fileIO.FileFormat;
18 import fileIO.ReadWrite;
19 import fileIO.TextFile;
20 import fileIO.TextStreamWriter;
21 import shared.KillSwitch;
22 import shared.Parse;
23 import shared.Parser;
24 import shared.PreParser;
25 import shared.ReadStats;
26 import shared.Shared;
27 import shared.Timer;
28 import shared.Tools;
29 import shared.TrimRead;
30 import stream.Read;
31 import stream.SamLine;
32 import stream.SamLineStreamer;
33 import stream.ScaffoldCoordinates;
34 import stream.SiteScore;
35 import structures.ByteBuilder;
36 import structures.CoverageArray;
37 import structures.CoverageArray2;
38 import structures.CoverageArray3;
39 import structures.ListNum;
40 import structures.LongList;
41 
42 /**
43  * @author Brian Bushnell
44  * @date Jan 4, 2013
45  *
46  */
47 public class CoveragePileup {
48 
49 	/*--------------------------------------------------------------*/
50 	/*----------------             Main             ----------------*/
51 	/*--------------------------------------------------------------*/
52 
main(String[] args)53 	public static void main(String[] args){
54 		Timer t=new Timer();
55 
56 		CoveragePileup x=new CoveragePileup(args);
57 
58 		x.process();
59 
60 		t.stop();
61 		if(!KEY_VALUE){
62 			x.outstream.println();
63 			x.outstream.println("Time: \t"+t);
64 		}
65 
66 		//Close the print stream if it was redirected
67 		Shared.closeStream(x.outstream);
68 	}
69 
70 	/*--------------------------------------------------------------*/
71 	/*----------------        Initialization        ----------------*/
72 	/*--------------------------------------------------------------*/
73 
CoveragePileup(String[] args)74 	public CoveragePileup(String[] args){
75 
76 		{//Preparse block for help, config files, and outstream
77 			PreParser pp=new PreParser(args, printCommand ? getClass() : null, false);
78 			args=pp.args;
79 			outstream=pp.outstream;
80 		}
81 
82 		int vectorMode=-1;
83 		boolean outset=false;
84 		ReadWrite.USE_UNPIGZ=true;
85 //		SamLine.RNAME_AS_BYTES=false;
86 
87 		for(int i=0; i<args.length; i++){
88 			final String arg=args[i];
89 			final String[] split=arg.split("=");
90 			String a=split[0].toLowerCase();
91 			String b=split.length>1 ? split[1] : null;
92 
93 			if(a.equals("ref") || a.equals("reference") || a.equals("fasta")){
94 				reference=b;
95 			}else if(a.equals("addfromref")){
96 				ADD_FROM_REF=Parse.parseBoolean(b);
97 			}else if(a.equals("addfromreads")){
98 				ADD_FROM_READS=Parse.parseBoolean(b);
99 			}else if(a.equals("in") || a.equals("in1")){
100 				in1=b;
101 			}else if(a.equals("in2")){
102 				in2=b;
103 			}else if(a.equals("out") || a.equals("coveragestats") || a.equals("covstats") || a.equals("stats")){
104 				covstats=b;
105 				outset=true;
106 			}else if(a.equals("minscaf") || a.equals("covminscaf")){
107 				minscaf=Integer.parseInt(b);
108 			}else if(a.equals("mindepth") || a.equals("mincov")){
109 				minDepthToBeCovered=Integer.parseInt(b);
110 			}else if(a.equals("border")){
111 				border=Integer.parseInt(b);
112 			}else if(a.equals("qtrim")/* || a.equals("trim")*/){
113 				if(b==null || b.length()==0){qtrimRight=qtrimLeft=true;}
114 				else if(b.equalsIgnoreCase("left") || b.equalsIgnoreCase("l")){qtrimLeft=true;qtrimRight=false;}
115 				else if(b.equalsIgnoreCase("right") || b.equalsIgnoreCase("r")){qtrimLeft=false;qtrimRight=true;}
116 				else if(b.equalsIgnoreCase("both") || b.equalsIgnoreCase("rl") || b.equalsIgnoreCase("lr")){qtrimLeft=qtrimRight=true;}
117 				else if(Tools.isDigit(b.charAt(0))){
118 					trimq=Float.parseFloat(b);
119 					qtrimRight=trimq>0;
120 				}else{qtrimRight=qtrimLeft=Parse.parseBoolean(b);}
121 			}else if(a.equals("trimq") || a.equals("trimquality")){
122 				trimq=Float.parseFloat(b);
123 			}else if(a.equals("ss") || a.equals("samstreamer")){
124 				if(b!=null && Tools.isDigit(b.charAt(0))){
125 					useStreamer=true;
126 					streamerThreads=Tools.max(1, Integer.parseInt(b));
127 				}else{
128 					useStreamer=Parse.parseBoolean(b);
129 				}
130 			}else if(a.equals("minq") || a.equals("minmapq")){
131 				minMapq=Integer.parseInt(b);
132 			}else if(a.equals("outsam")){
133 				outsam=b;
134 			}else if(a.equals("rpkm") || a.equals("fpkm") || a.equals("outrpkm")){
135 				outrpkm=b;
136 			}else if(a.equals("outorf")){
137 				outorf=b;
138 			}else if(a.equals("orffasta") || a.equals("fastaorf")){
139 				orffasta=b;
140 			}else if(a.equals("basecov") || a.equals("outcov")){
141 				basecov=b;
142 			}else if(a.equals("bincov") || a.equals("outbinned")){
143 				bincov=b;
144 			}else if(a.equals("normcov") || a.equals("outnormalized")){
145 				normcov=b;
146 			}else if(a.equals("normcovo") || a.equals("outnormalizedoverall")){
147 				normcovOverall=b;
148 			}else if(a.equals("delta")){
149 				DELTA_ONLY=Parse.parseBoolean(b);
150 			}else if(a.equals("physical") || a.equals("physicalcoverage") || a.equals("physcov")){
151 				PHYSICAL_COVERAGE=Parse.parseBoolean(b);
152 			}else if(a.equals("tlen")){
153 				USE_TLEN=Parse.parseBoolean(b);
154 			}else if(a.equals("hist") || a.equals("histogram") || a.equals("covhist")){
155 				histogram=b;
156 			}else if(a.equals("reads")){
157 				maxReads=Parse.parseKMG(b);
158 			}else if(a.equals("scafs") || a.equals("scaffolds")){
159 				initialScaffolds=Tools.max(128, (int)(Tools.min(Long.parseLong(b),2000000000)));
160 			}else if(a.equals("binsize")){
161 				binsize=Integer.parseInt(b);
162 			}else if(a.equals("32bit")){
163 				bits32=Parse.parseBoolean(b);
164 			}else if(a.equals("bitset") || a.equals("usebitset") || a.equals("bitsets") || a.equals("usebitsets")){
165 //				if(Parse.parseBoolean(b)){arrayMode=BITSET_MODE;}
166 				vectorMode=Parse.parseBoolean(b) ? BITSET_MODE : NOTHING_MODE;
167 			}else if(a.equals("array") || a.equals("arrays") || a.equals("usearrays")){
168 				vectorMode=Parse.parseBoolean(b) ? ARRAY_MODE : NOTHING_MODE;
169 			}else if(a.equals("median") || a.equals("calcmedian")){
170 				if(Parse.parseBoolean(b)){
171 					vectorMode=ARRAY_MODE;
172 				}
173 			}else if(a.startsWith("nonzero") || a.equals("nzo")){
174 				NONZERO_ONLY=Parse.parseBoolean(b);
175 				if(verbose){outstream.println("Set NONZERO_ONLY to "+NONZERO_ONLY);}
176 			}else if(a.equals("append") || a.equals("app")){
177 				append=ReadStats.append=Parse.parseBoolean(b);
178 			}else if(a.equals("overwrite") || a.equals("ow")){
179 				overwrite=Parse.parseBoolean(b);
180 				if(verbose){outstream.println("Set overwrite to "+overwrite);}
181 			}else if(a.equalsIgnoreCase("twocolumn")){
182 				TWOCOLUMN=Parse.parseBoolean(b);
183 				if(verbose){outstream.println("Set TWOCOLUMN to "+TWOCOLUMN);}
184 			}else if(a.equalsIgnoreCase("keyvalue") || a.equalsIgnoreCase("machineout")){
185 				KEY_VALUE=Parse.parseBoolean(b);
186 			}else if(a.equalsIgnoreCase("countgc")){
187 				COUNT_GC=Parse.parseBoolean(b);
188 				if(verbose){outstream.println("Set COUNT_GC to "+COUNT_GC);}
189 			}else if(a.equals("secondary") || a.equals("usesecondary")){
190 				USE_SECONDARY=Parse.parseBoolean(b);
191 				if(verbose){outstream.println("Set USE_SECONDARY_ALIGNMENTS to "+USE_SECONDARY);}
192 			}else if(a.equals("softclip") || a.equals("includesoftclip")){
193 				INCLUDE_SOFT_CLIP=Parse.parseBoolean(b);
194 				if(verbose){outstream.println("Set INCLUDE_SOFT_CLIP to "+INCLUDE_SOFT_CLIP);}
195 			}else if(a.equals("keepshortbins") || a.equals("ksb")){
196 				KEEP_SHORT_BINS=Parse.parseBoolean(b);
197 				if(verbose){outstream.println("Set KEEP_SHORT_BINS to "+KEEP_SHORT_BINS);}
198 			}else if(a.equals("strandedcoverage") || a.equals("strandedcov") || a.equals("covstranded") || a.equals("stranded")){
199 				STRANDED=Parse.parseBoolean(b);
200 			}else if(a.equals("startcov") || a.equals("covstart") || a.equals("startonly")){
201 				START_ONLY=Parse.parseBoolean(b);
202 			}else if(a.equals("stopcov") || a.equals("covstop") || a.equals("stoponly")){
203 				STOP_ONLY=Parse.parseBoolean(b);
204 			}else if(a.equals("concise")){
205 				CONCISE=Parse.parseBoolean(b);
206 			}else if(a.equals("verbose")){
207 				verbose=Parse.parseBoolean(b);
208 			}else if(a.equals("normc") || a.equals("normalizecoverage")){
209 				NORMALIZE_COVERAGE=Parse.parseBoolean(b);
210 			}else if(a.equals("header") || a.equals("hdr")){
211 				printHeader=Parse.parseBoolean(b);
212 			}else if(a.equals("headerpound") || a.equals("#")){
213 				headerPound=Parse.parseBoolean(b);
214 			}else if(a.equals("stdev")){
215 				calcCovStdev=Parse.parseBoolean(b);
216 			}else if(a.equals("delcov") || a.equals("dels") || a.equals("includedels") || a.equals("includedeletions") || a.equals("delcoverage")){
217 				INCLUDE_DELETIONS=Parse.parseBoolean(b);
218 			}else if(a.equals("dupecoverage") || a.equals("dupecov") || a.equals("dupes") || a.equals("duplicates") || a.equals("includeduplicates")){
219 				INCLUDE_DUPLICATES=Parse.parseBoolean(b);
220 			}else if(a.equals("ignoredupes") || a.equals("ignoreduplicates")){
221 				INCLUDE_DUPLICATES=!Parse.parseBoolean(b);
222 			}else if(a.equals("normb") || a.equals("normalizebins")){
223 				try {
224 					NORMALIZE_LENGTH_BINS=Integer.parseInt(b);
225 				} catch (NumberFormatException e) {
226 					boolean x=Parse.parseBoolean(b);
227 					NORMALIZE_LENGTH_BINS=x ? 100 : -1;
228 				}
229 			}else if(a.equals("covwindow")){
230 				if(b==null || b.length()<1 || Character.isLetter(b.charAt(0))){
231 					USE_WINDOW=Parse.parseBoolean(b);
232 				}else{
233 					LOW_COV_WINDOW=Integer.parseInt(b);
234 					USE_WINDOW=(LOW_COV_WINDOW>0);
235 				}
236 			}else if(a.equals("covwindowavg")){
237 				LOW_COV_DEPTH=Double.parseDouble(b);
238 			}else if(a.equals("k")){
239 				k=Integer.parseInt(b);
240 			}else if(Parser.parseCommonStatic(arg, a, b)){
241 				//do nothing
242 			}else if(Parser.parseZip(arg, a, b)){
243 				//do nothing
244 			}else if(in1==null && arg.indexOf('=')<0 && new File(arg).exists()){
245 				in1=arg;
246 			}else{
247 				throw new RuntimeException("Unknown parameter: "+args[i]);
248 			}
249 
250 		}
251 		trimE=(float)QualityTools.phredToProbError(trimq);
252 //		assert(false) : qtrimLeft+", "+qtrimRight+", "+trimq+", "+trimE;
253 
254 		ReadWrite.SAMTOOLS_IGNORE_FLAG|=ReadWrite.SAM_UNMAPPED;
255 		if(!USE_SECONDARY){ReadWrite.SAMTOOLS_IGNORE_FLAG|=ReadWrite.SAM_SECONDARY;}
256 		if(!INCLUDE_DUPLICATES){ReadWrite.SAMTOOLS_IGNORE_FLAG|=ReadWrite.SAM_DUPLICATE;}
257 
258 		if(outsam==null){
259 			SamLine.PARSE_0=false;
260 			SamLine.PARSE_6=false;
261 			SamLine.PARSE_7=false;
262 			SamLine.PARSE_8=false;
263 			if(k<1 && trimq<1){SamLine.PARSE_10=false;}
264 			SamLine.PARSE_OPTIONAL=false;
265 		}
266 
267 		if(vectorMode>-1){
268 			USE_BITSETS=(vectorMode==BITSET_MODE);
269 			USE_COVERAGE_ARRAYS=(vectorMode==ARRAY_MODE);
270 		}else{
271 			if(histogram==null && basecov==null && bincov==null && normcov==null &&
272 					normcovOverall==null && outorf==null && !calcCovStdev){//No need for coverage array!
273 				USE_COVERAGE_ARRAYS=false;
274 				if(TWOCOLUMN){//No need for bitset, either!
275 					USE_BITSETS=false;
276 				}else{
277 					USE_BITSETS=true;
278 				}
279 			}
280 		}
281 
282 		if(verbose){
283 			outstream.println("Set USE_COVERAGE_ARRAYS to "+USE_COVERAGE_ARRAYS);
284 			outstream.println("Set USE_BITSETS to "+USE_BITSETS);
285 		}
286 
287 		if(maxReads<0){maxReads=Long.MAX_VALUE;}
288 		{
289 			final String a=(args.length>0 ? args[0] : null);
290 //			final String b=(args.length>1 ? args[1] : null);
291 			if(in1==null && a!=null && a.indexOf('=')<0 && (a.startsWith("stdin") || new File(a).exists())){in1=a;}
292 //			if(covstats==null && b!=null && b.indexOf('=')<0){covstats=b;}
293 			if(in1==null){in1="stdin";}
294 //			if(covstats==null && !outset){
295 ////				out="stdout";
296 ////				outstream.println("Warning: output destination not set; producing no output.  To print to standard out, set 'out=stdout'");
297 //				outstream=System.err;
298 //			}
299 		}
300 		assert(in1!=null);
301 //		assert(out!=null || outset) : "Output file was not set.";
302 
303 		if(STRANDED){
304 			assert(basecov==null || basecov.indexOf('#')>=0) : "Output filenames must contain '#' symbol for strand-specific output.";
305 			assert(bincov==null || bincov.indexOf('#')>=0) : "Output filenames must contain '#' symbol for strand-specific output.";
306 			assert(normcov==null || normcov.indexOf('#')>=0) : "Output filenames must contain '#' symbol for strand-specific output.";
307 			assert(normcovOverall==null || normcovOverall.indexOf('#')>=0) : "Output filenames must contain '#' symbol for strand-specific output.";
308 			assert(histogram==null || histogram.indexOf('#')>=0) : "Output filenames must contain '#' symbol for strand-specific output.";
309 			assert(covstats==null || covstats.indexOf('#')>=0) : "Output filenames must contain '#' symbol for strand-specific output.";
310 		}
311 
312 		if(!Tools.testInputFiles(false, true, in1, in2, reference)){
313 			throw new RuntimeException("\nCan't read some input files.\n");
314 		}
315 
316 		if(!Tools.testOutputFiles(overwrite, append, false, basecov, bincov, normcov, normcovOverall, histogram, covstats, outrpkm)){
317 			throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+
318 					basecov+", "+bincov+", "+normcov+", "+normcovOverall+", "+histogram+", "+covstats+", "+outrpkm+"\n");
319 		}
320 	}
321 
322 
323 	/*--------------------------------------------------------------*/
324 	/*----------------       Data Structures        ----------------*/
325 	/*--------------------------------------------------------------*/
326 
327 
328 	/** The goal if this is to garbage-collect unnecessary objects, not really for reusing the object */
clear()329 	public void clear(){
330 		list=null;
331 		table=null;
332 		pairTable=null;
333 
334 		program=null;
335 		version=null;
336 
337 		in1=null;
338 		covstats=null;
339 		outsam=null;
340 		outorf=null;
341 		outrpkm=null;
342 		reference=null;
343 		histogram=null;
344 		basecov=null;
345 		bincov=null;
346 		normcov=null;
347 		normcovOverall=null;
348 		orffasta=null;
349 
350 		error=false;
351 
352 		refBases=0;
353 		mappedBases=0;
354 		mappedNonClippedBases=0;
355 		mappedBasesWithDels=0;
356 		mappedReads=0;
357 		properPairs=0;
358 		readsProcessed=0;
359 		basesProcessed=0;
360 		kmersProcessed=0;
361 		mappedKmers=0;
362 		totalCoveredBases1=0;
363 		totalCoveredBases2=0;
364 		scaffoldsWithCoverage1=0;
365 		scaffoldsWithCoverage2=0;
366 		totalScaffolds=0;
367 	}
368 
createDataStructures()369 	public void createDataStructures(){
370 		refBases=0;
371 		mappedBases=0;
372 		mappedNonClippedBases=0;
373 		mappedBasesWithDels=0;
374 		mappedReads=0;
375 		properPairs=0;
376 		readsProcessed=0;
377 		basesProcessed=0;
378 		kmersProcessed=0;
379 		mappedKmers=0;
380 		totalCoveredBases1=0;
381 		totalCoveredBases2=0;
382 		scaffoldsWithCoverage1=0;
383 		scaffoldsWithCoverage2=0;
384 		totalScaffolds=0;
385 		error=false;
386 		list=new ArrayList<Scaffold>(initialScaffolds);
387 		table=new HashMap<String, Scaffold>(initialScaffolds);
388 
389 		if(PHYSICAL_COVERAGE){
390 			pairTable=new HashMap<String, SamLine>();
391 			if(COUNT_GC){
392 				COUNT_GC=false;
393 				outstream.println("COUNT_GC disabled for physical coverage mode.");
394 			}
395 			if(USE_SECONDARY){
396 				USE_SECONDARY=false;
397 				outstream.println("USE_SECONDARY disabled for physical coverage mode.");
398 			}
399 
400 			SamLine.PARSE_0=true;
401 			SamLine.PARSE_6=true;
402 			SamLine.PARSE_7=true;
403 			SamLine.PARSE_8=true;
404 			SamLine.PARSE_10=false;
405 			SamLine.PARSE_OPTIONAL=false;
406 		}
407 	}
408 
409 
410 	/*--------------------------------------------------------------*/
411 	/*----------------         Outer Methods        ----------------*/
412 	/*--------------------------------------------------------------*/
413 
414 	/** Read and process all input data. */
process()415 	public void process(){
416 		createDataStructures();
417 
418 //		System.err.println("A");
419 		if(in1!=null && FileFormat.isBamFile(in1)){
420 			if(Data.SAMTOOLS() && useStreamer){ReadWrite.USE_SAMBAMBA=false;} //Disable because it takes forever to read the header
421 		}
422 		ByteFile tf=ByteFile.makeByteFile(in1, false);
423 
424 //		System.err.println("B");
425 
426 		final ByteStreamWriter tsw=(outsam==null ? null : new ByteStreamWriter(outsam, overwrite, false, true));
427 		if(tsw!=null){tsw.start();}
428 		ReadWrite.USE_SAMBAMBA=true;
429 
430 //		System.err.println("C");
431 		processHeader(tf, tsw);
432 
433 //		System.err.println("D");
434 		processReference();
435 //		System.err.println("E");
436 		if(maxReads<0){maxReads=Long.MAX_VALUE;}
437 		if(useStreamer && !FileFormat.isStdio(in1)){
438 			tf.close();
439 //			System.err.println("F");
440 			processViaStreamer(tsw);
441 //			System.err.println("G");
442 		}else{
443 //			processViaByteFile(tf, line, tsw);
444 			processViaByteFile(tf, tsw);
445 //			System.err.println("H");
446 		}
447 //		System.err.println("I");
448 
449 		printOutput();
450 
451 		if(orffasta!=null){
452 			processOrfsFasta(orffasta, outorf, table);
453 		}
454 
455 		if(tsw!=null){tsw.waitForFinish();}
456 	}
457 
processViaStreamer(ByteStreamWriter tsw)458 	private void processViaStreamer(ByteStreamWriter tsw){
459 		SamLineStreamer ss=new SamLineStreamer(in1, streamerThreads, false, maxReads);
460 		ss.start();
461 		ByteBuilder bb=new ByteBuilder(33000);
462 		for(ListNum<SamLine> ln=ss.nextLines(); ln!=null && ln.size()>0; ln=ss.nextLines()){
463 			ArrayList<SamLine> list=(ln==null ? null : ln.list);
464 			for(SamLine sl : list){
465 				if(tsw!=null){
466 					sl.toBytes(bb);
467 					bb.nl();
468 					if(bb.length>=16384){
469 						tsw.print(bb);
470 						bb.clear();
471 					}
472 				}
473 				processSamLine(sl);
474 			}
475 		}
476 		if(tsw!=null){
477 			if(bb.length>0){tsw.print(bb);}
478 			tsw.poison();
479 		}
480 	}
481 
processViaByteFile(ByteFile tf, ByteStreamWriter tsw)482 	private void processViaByteFile(ByteFile tf, ByteStreamWriter tsw){
483 		for(byte[] line=tf.nextLine(); line!=null && readsProcessed<maxReads; line=tf.nextLine()){
484 			if(tsw!=null){tsw.println(line);}
485 			processSamLine(line);
486 		}
487 
488 		tf.close();
489 		if(tsw!=null){tsw.poison();}
490 	}
491 
492 
493 	/*--------------------------------------------------------------*/
494 	/*----------------             Setup            ----------------*/
495 	/*--------------------------------------------------------------*/
496 
497 
498 	/** Process all sam header lines from the tf.
499 	 * Once a non-header line is encountered, return it.
500 	 * If non-null, print all lines to the tsw. */
processHeader(ByteFile tf, ByteStreamWriter tsw)501 	public void processHeader(ByteFile tf, ByteStreamWriter tsw){
502 		byte[] line=null;
503 		for(line=tf.nextLine(); line!=null && (line.length==0 || line[0]=='@'); line=tf.nextLine()){
504 //			System.err.println(new String(line));
505 			if(tsw!=null){tsw.println(line);}
506 
507 			if(line.length>2){
508 				final byte a=line[1], b=line[2];
509 
510 				if(a=='S' && b=='Q'){
511 					Scaffold scaf=new Scaffold(line);
512 					if(COUNT_GC){scaf.basecount=KillSwitch.allocLong1D(8);}
513 					assert(!table.containsKey(scaf.name)) : "\nDuplicate scaffold name!\n"+scaf+"\n\n"+table.get(scaf.name);
514 					table.put(scaf.name, scaf);
515 					list.add(scaf);
516 					refBases+=scaf.length;
517 //					sc.obj=new CoverageArray2(table.size(), sc.length+1);
518 //					outstream.println("Made scaffold "+sc.name+" of length "+sc.length);
519 				}else if(a=='P' && b=='G'){
520 					String[] split=new String(line).split("\t");
521 					for(String s : split){
522 						if(s.startsWith("PN:")){
523 							if(program==null){program=Data.forceIntern(s.substring(3));}
524 						}else if(s.startsWith("VN:")){
525 							if(version==null){version=Data.forceIntern(s.substring(3));}
526 						}
527 					}
528 				}else if(a=='R' && b=='G'){
529 					//Do nothing
530 				}else if(a=='H' && b=='D'){
531 					//Do nothing
532 				}else if(a=='C' && b=='O'){
533 					//Do nothing
534 				}else{
535 					//				assert(false) : line;
536 				}
537 			}
538 		}
539 		if(line!=null){tf.pushBack(line);}
540 //		return line;
541 	}
542 
543 
loadScaffoldsFromIndex(int minChrom, int maxChrom)544 	public void loadScaffoldsFromIndex(int minChrom, int maxChrom){
545 
546 		final int[][] lengths=Data.scaffoldLengths;
547 		final int[][] locs=Data.scaffoldLocs;
548 		final byte[][][] names=Data.scaffoldNames;
549 		final int[] counts=new int[8];
550 		for(int chrom=minChrom; chrom<=maxChrom; chrom++){
551 			final ChromosomeArray ca=Data.getChromosome(chrom);
552 //			assert(false) : lengths[chrom]+", "+lengths.length+", "+names[chrom]+", "+locs[chrom]+", "+(ca==null);
553 			if(lengths[chrom]!=null){
554 				final int[] clengths=lengths[chrom];
555 				final int[] clocs=locs[chrom];
556 				final byte[][] cnames=names[chrom];
557 				for(int idx=0; idx<clengths.length; idx++){
558 					final int length=clengths[idx];
559 					final int loc=clocs[idx];
560 					final String name=new String(cnames[idx]);
561 					final Scaffold scaf=new Scaffold(name, length);
562 					if(ca!=null){
563 						scaf.gc=ca.calcGC(loc, length, counts);
564 					}
565 					if(COUNT_GC){scaf.basecount=KillSwitch.allocLong1D(8);}
566 					assert(!table.containsKey(scaf.name)) : "\nDuplicate scaffold name!\n"+scaf+"\n\n"+table.get(scaf.name);
567 					table.put(scaf.name, scaf);
568 					list.add(scaf);
569 					refBases+=scaf.length;
570 				}
571 			}
572 		}
573 	}
574 
575 
processReference()576 	public void processReference(){
577 		if(reference==null){return;}
578 
579 		ByteFile bf=ByteFile.makeByteFile(reference, false);
580 		Scaffold scaf=null;
581 		int len=0;
582 		final long[] acgtn=KillSwitch.allocLong1D(8);
583 		boolean addLen=false;
584 		for(byte[] s=bf.nextLine(); s!=null; s=bf.nextLine()){
585 			if(s.length>0 && s[0]=='>'){
586 				if(scaf!=null){
587 					if(scaf.length>0 && scaf.length!=len){
588 						outstream.println("ERROR: Scaffold "+scaf.name+" has contradictory lengths of "+scaf.length+" and "+len+"\n"
589 								+ "This probably indicates a corrupt or incorrect reference.");
590 						errorState=true;
591 						if(Shared.EA()){KillSwitch.kill();}
592 					}
593 //					else{outstream.println(scaf.name+", "+scaf.length+", "+len);}
594 					scaf.length=len;
595 					if(addLen){
596 						refBases+=scaf.length;
597 						addLen=false;
598 					}
599 					scaf.gc=(float)((acgtn[1]+acgtn[2])*1d/Data.max(1, acgtn[0]+acgtn[1]+acgtn[2]+acgtn[3]));
600 					scaf=null;
601 					len=0;
602 					Arrays.fill(acgtn, 0);
603 				}
604 
605 				String name=new String(s, 1, s.length-1);
606 				scaf=table.get(name);
607 				if(ADD_FROM_REF && scaf==null){
608 					scaf=new Scaffold(name, 0);
609 					if(!warned){
610 						outstream.println("Warning - SAM header did not include "+name+"\nAbsent scaffolds will be added; further warnings will be suppressed.");
611 						warned=true;
612 					}
613 					if(COUNT_GC){scaf.basecount=KillSwitch.allocLong1D(8);}
614 					table.put(name, scaf);
615 					list.add(scaf);
616 					addLen=true;
617 				}
618 			}else{
619 				len+=s.length;
620 				for(int i=0; i<s.length; i++){
621 					acgtn[charToNum[s[i]]]++;
622 				}
623 			}
624 		}
625 		if(scaf!=null){
626 			if(scaf.length>0 && scaf.length!=len){
627 				outstream.println("ERROR: Scaffold "+scaf.name+" has contradictory lengths of "+scaf.length+" and "+len+"\n"
628 						+ "This probably indicates a corrupt or incorrect reference.");
629 				errorState=true;
630 				if(Shared.EA()){KillSwitch.kill();}
631 			}
632 //			else{outstream.println(scaf.name+", "+scaf.length+", "+len);}
633 			scaf.length=len;
634 			if(addLen){
635 				refBases+=scaf.length;
636 				addLen=false;
637 			}
638 			scaf.gc=(float)((acgtn[1]+acgtn[2])*1d/Data.max(1, acgtn[0]+acgtn[1]+acgtn[2]+acgtn[3]));
639 			scaf=null;
640 			len=0;
641 			Arrays.fill(acgtn, 0);
642 		}
643 	}
644 
645 
processOrfsFasta(String fname_in, String fname_out, HashMap<String, Scaffold> map)646 	public void processOrfsFasta(String fname_in, String fname_out, HashMap<String, Scaffold> map){
647 		TextFile tf=new TextFile(fname_in, false);
648 		assert(!fname_in.equalsIgnoreCase(fname_out));
649 		TextStreamWriter tsw=new TextStreamWriter(fname_out, overwrite, false, true);
650 		tsw.start();
651 
652 		if(printHeader){
653 			String pound=(headerPound ? "#" : "");
654 			tsw.print(pound+"mappedBases="+mappedBases+"\n");
655 			tsw.print(pound+"mappedNonClippedBases="+mappedNonClippedBases+"\n");
656 			tsw.print(pound+"mappedBasesWithDels="+mappedBasesWithDels+"\n");
657 			tsw.print(pound+"mappedReads="+mappedReads+"\n");
658 			tsw.print(pound+"name\tlength\tdepthSum\tavgDepth\tavgDepth/mappedBases\tminDepth\tmaxDepth\tmedianDepth\tstdDevDepth\tfractionCovered\n");
659 		}
660 
661 		String line;
662 		final StringBuilder sb=new StringBuilder(500);
663 //		Formatter formatter=new Formatter(sb);
664 
665 		while((line=tf.nextLine())!=null){
666 			if(line.length()>1 && line.charAt(0)=='>'){
667 
668 				String[] split=line.split(" # "); //' # ' used as delimiters
669 
670 				String orfname=split[0].substring(1).trim(); //In case there are spaces around the ' # ' delimiters
671 				String scafname=orfname;
672 				if(scafname.contains("_")){//PRODIGAL pads _1 to the name of the first orf of a scaffold, and etc
673 					int last=scafname.lastIndexOf('_');
674 					boolean numeric=false;
675 					for(int i=last+1; i<scafname.length(); i++){
676 						if(Tools.isDigit(scafname.charAt(i))){numeric=true;}
677 						else{numeric=false; break;}
678 					}
679 					if(numeric){scafname=scafname.substring(0, last);}
680 				}
681 
682 				int start=Integer.parseInt(split[1].trim());
683 				int stop=Integer.parseInt(split[2].trim());
684 				int strand=Integer.parseInt(split[3].trim());
685 				if(strand==1){strand=Shared.PLUS;}else{strand=Shared.MINUS;}
686 				Orf orf=new Orf(orfname, start, stop, (byte)strand);
687 
688 				Scaffold scaf=map.get(scafname);
689 //				if(scaf==null){scaf=map.get(orfname);}
690 
691 //				assert(scaf!=null) : "\nCan't find scaffold for ("+orf+")\nfrom line\n"+line+"\n";
692 //				assert(orf.start>=0 && orf.stop<scaf.length) : "\norf goes out of scaffold bounds.\n"+orf+"\n"+scaf+"\n";
693 
694 				if(scaf==null){
695 					outstream.println("Can't find scaffold for ("+orf+")\nfrom line\n"+line+"\nscafname='"+scafname+"'\norfname='"+orfname+"'");
696 					if(ABORT_ON_ERROR){
697 						tsw.poison();
698 						throw new RuntimeException("Aborting.");
699 					}
700 				}else{
701 					if(orf.start<0 && orf.stop>=scaf.length){
702 						outstream.println("orf goes out of scaffold bounds.\n"+orf+"\n"+scaf);
703 						if(ABORT_ON_ERROR){
704 							tsw.poison();
705 							throw new RuntimeException("Aborting.");
706 						}
707 					}
708 
709 					CoverageArray ca=(CoverageArray)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1); //TODO:  Strand logic here depends on stranding protocol.
710 					orf.readCoverageArray(ca);
711 				}
712 
713 //				{
714 //					tsw.print(String.format(Locale.ROOT, "%s\t%d\t", args));
715 //				}
716 
717 				sb.append(orf.name).append('\t');
718 				sb.append(orf.length()).append('\t');
719 				sb.append(orf.baseDepth).append('\t');
720 				sb.append(String.format(Locale.ROOT, "%.4f", orf.avgCoverage())).append('\t');
721 				sb.append(orf.avgCoverage()/mappedNonClippedBases);
722 
723 				sb.append('\t');
724 				sb.append(orf.minDepth).append('\t');
725 				sb.append(orf.maxDepth).append('\t');
726 				sb.append(orf.medianDepth).append('\t');
727 				sb.append(String.format(Locale.ROOT, "%.4f",orf.stdevDepth)).append('\t');
728 				sb.append(String.format(Locale.ROOT, "%.4f",orf.fractionCovered()));
729 
730 				sb.append('\n');
731 				tsw.print(sb.toString());
732 				sb.setLength(0);
733 			}
734 		}
735 
736 		tsw.poison();
737 		tsw.waitForFinish();
738 	}
739 
740 
741 	/*--------------------------------------------------------------*/
742 	/*----------------         Inner Methods        ----------------*/
743 	/*--------------------------------------------------------------*/
744 
addCoverage(final String scafName, final byte[] seq, byte[] match, final int start0, final int stop0, final int readlen, final int nonClippedBases, final int strand, int incrementFrags, boolean properPair, SamLine sl)745 	public boolean addCoverage(final String scafName, final byte[] seq, byte[] match, final int start0, final int stop0, final int readlen,
746 			final int nonClippedBases, final int strand, int incrementFrags, boolean properPair, SamLine sl){//sl is optional
747 		Scaffold scaf=table.get(scafName);
748 		if(scaf==null){
749 			if(ADD_FROM_READS){
750 				if(!warned){
751 					outstream.println("Warning: A read was mapped to unknown reference sequence "+scafName+
752 							"\nFurther warnings will be suppressed.  These scaffolds will be included, but \n"
753 							+ "some coverage statistics will be inaccurate as scaffold lengths are not known.");
754 					warned=true;
755 				}
756 				scaf=new Scaffold(scafName, 0);
757 				if(COUNT_GC){scaf.basecount=KillSwitch.allocLong1D(8);}
758 				table.put(scafName, scaf);
759 				list.add(scaf);
760 				return addCoverage(scaf, seq, match, start0, stop0, readlen, nonClippedBases, strand, incrementFrags, properPair, sl);
761 			}else if(EA){
762 				KillSwitch.kill("ERROR: A read was mapped to unknown reference sequence "+scafName);
763 			}else if(!warned){
764 				outstream.println("Warning: Can't find "+scafName+"\nThis scaffold will not be included.  Further warnings will be suppressed.");
765 				warned=true;
766 				error=true;
767 			}
768 			return false;
769 		}
770 		return addCoverage(scaf, seq, match, start0, stop0, readlen, nonClippedBases, strand, incrementFrags, properPair, sl);
771 	}
772 
addCoverage(final Scaffold scaf, final byte[] seq, byte match[], final int start0, final int stop0, final int readlen, final int nonClippedBases, final int strand, int incrementFrags, boolean properPair, SamLine sl)773 	public boolean addCoverage(final Scaffold scaf, final byte[] seq, byte match[], final int start0, final int stop0, final int readlen, final int nonClippedBases,
774 			final int strand, int incrementFrags, boolean properPair, SamLine sl){//sl is optional
775 		if(scaf==null){
776 			assert(false) : "Adding coverage to a null Scaffold.";
777 			return false;
778 		}
779 
780 		if(ADD_FROM_READS){
781 			int x=scaf.length;
782 			scaf.length=Tools.max(scaf.length, stop0+1);
783 			if(scaf.length!=x){
784 				refBases=refBases+scaf.length-x;
785 			}
786 		}
787 
788 		final int start=Tools.max(start0, 0);
789 		final int stop=Tools.min(stop0, scaf.length-1);
790 
791 		assert(start>=0 && stop>=0) : "\nAn error was encountered when processing a read. Output will not be valid.\n"+
792 			"\nscafName="+scaf.name+"\nseq="+new String(seq)+"\nstart="+start+
793 			"\nstop="+stop+"\nreadlen="+readlen+"\nstrand="+strand+"\nscaf.length="+scaf.length+"\nscaf="+scaf;
794 
795 		mappedBases+=readlen;
796 		mappedNonClippedBases+=nonClippedBases;
797 		mappedBasesWithDels+=(stop-start+1);
798 		mappedReads++;
799 		if(properPair){properPairs++;}
800 
801 		scaf.readhits++;
802 		scaf.fraghits+=incrementFrags;
803 		if(strand==1){scaf.readhitsMinus++;}
804 
805 		if(seq!=null && scaf.basecount!=null){
806 			final long[] counts=scaf.basecount;
807 			for(int i=0; i<seq.length; i++){
808 				counts[charToNum[seq[i]]]++;
809 			}
810 		}
811 
812 		if(!INCLUDE_DELETIONS && !START_ONLY && !STOP_ONLY){
813 			assert(match!=null) : "Coverage excluding deletions cannot be calculated without a match string.";
814 			return addCoverageIgnoringDeletions(scaf, seq, match, start, stop, readlen, strand, incrementFrags);
815 		}
816 
817 		final int basehits=stop-start+1;
818 		scaf.basehits+=basehits;
819 
820 		if(USE_COVERAGE_ARRAYS){
821 			if(scaf.obj1==null){
822 				scaf.obj1=(bits32 ? new CoverageArray3(table.size(), scaf.length+1) : new CoverageArray2(table.size(), scaf.length+1));
823 				if(STRANDED){
824 					scaf.obj2=(bits32 ? new CoverageArray3(table.size(), scaf.length+1) : new CoverageArray2(table.size(), scaf.length+1));
825 				}
826 			}
827 			CoverageArray ca=(CoverageArray)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1);
828 			if(START_ONLY){
829 				ca.increment(start);
830 			}else if(STOP_ONLY){
831 				ca.increment(stop);
832 			}else{
833 				ca.incrementRange(start, stop);
834 //				mappedBases+=(stop-start+1);
835 //				mappedBases+=(readlen);
836 //				assert(readlen==(stop-start+1)) : "readlen="+readlen+", (stop-start+1)="+(stop-start+1)+", start="+start+", stop="+stop+", start0="+start0+", stop0="+stop0+
837 //				"\n"+sl+"\n";
838 				//123
839 			}
840 		}else if(USE_BITSETS){
841 			if(scaf.obj1==null){
842 				scaf.obj1=new BitSet(scaf.length+1);
843 				if(STRANDED){
844 					scaf.obj2=new BitSet(scaf.length+1);
845 				}
846 			}
847 			BitSet bs=(BitSet)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1);
848 			if(START_ONLY){
849 				bs.set(start);
850 			}else if(STOP_ONLY){
851 				bs.set(stop);
852 			}else{
853 				bs.set(start, stop+1);
854 			}
855 		}
856 
857 		return true;
858 	}
859 
addCoverageIgnoringDeletions(final Scaffold scaf, final byte[] seq, byte match[], final int start, final int stop, final int readlen, final int strand, int incrementFrags)860 	private boolean addCoverageIgnoringDeletions(final Scaffold scaf, final byte[] seq, byte match[], final int start, final int stop, final int readlen, final int strand, int incrementFrags){
861 		assert(!INCLUDE_DELETIONS && !START_ONLY && !STOP_ONLY);
862 		assert(match!=null) : "Coverage excluding deletions cannot be calculated without a match string.";
863 
864 		if(Read.isShortMatchString(match)){
865 			match=Read.toLongMatchString(match);
866 		}
867 
868 		int basehits=0;
869 		if(USE_COVERAGE_ARRAYS){
870 			if(scaf.obj1==null){
871 				scaf.obj1=(bits32 ? new CoverageArray3(table.size(), scaf.length+1) : new CoverageArray2(table.size(), scaf.length+1));
872 				if(STRANDED){
873 					scaf.obj2=(bits32 ? new CoverageArray3(table.size(), scaf.length+1) : new CoverageArray2(table.size(), scaf.length+1));
874 				}
875 			}
876 			CoverageArray ca=(CoverageArray)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1);
877 			for(int rpos=start, mpos=0; mpos<match.length && rpos<=stop; mpos++){
878 				byte m=match[mpos];
879 				if(m=='m' || m=='S' || m=='N'){
880 					ca.increment(rpos, 1);
881 					basehits++;
882 					rpos++;
883 				}else if(m=='X' || m=='Y' || m=='C' || m=='I'){
884 					//do nothing
885 				}else if(m=='D'){
886 					rpos++;
887 				}else{
888 					assert(false) : "Unhandled symbol "+m;
889 				}
890 			}
891 		}else if(USE_BITSETS){
892 			if(scaf.obj1==null){
893 				scaf.obj1=new BitSet(scaf.length+1);
894 				if(STRANDED){
895 					scaf.obj2=new BitSet(scaf.length+1);
896 				}
897 			}
898 			BitSet bs=(BitSet)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1);
899 			for(int rpos=start, mpos=0; mpos<match.length && rpos<=stop; mpos++){
900 				byte m=match[mpos];
901 				if(m=='m' || m=='S' || m=='N'){
902 					bs.set(rpos);
903 					basehits++;
904 					rpos++;
905 				}else if(m=='X' || m=='Y' || m=='C' || m=='I'){
906 					//do nothing
907 				}else if(m=='D'){
908 					rpos++;
909 				}else{
910 					assert(false) : "Unhandled symbol "+m;
911 				}
912 			}
913 		}
914 		scaf.basehits+=basehits;
915 
916 		return true;
917 	}
918 
919 
processSamLine(byte[] line)920 	public boolean processSamLine(byte[] line){
921 		if(line==null || line.length<3){
922 			return false;
923 		}else if(line[0]=='@'){
924 			if(!error){
925 				outstream.println("Unexpected header line: "+line);
926 				outstream.println("This should not cause problems, and is probably due to concatenated sam files.\n" +
927 						"Supressing future unexpected header warnings.");
928 				error=true;
929 			}
930 
931 			if(line[1]=='S' && line[2]=='Q'){
932 				Scaffold scaf=new Scaffold(line);
933 				if(!table.containsKey(scaf.name)){
934 					if(COUNT_GC){scaf.basecount=KillSwitch.allocLong1D(8);}
935 					table.put(scaf.name, scaf);
936 					list.add(scaf);
937 					refBases+=scaf.length;
938 				}
939 			}
940 		}else{
941 			SamLine sl=new SamLine(line);
942 			return processSamLine(sl);
943 		}
944 		return false;
945 	}
946 
trim(SamLine sl)947 	private int trim(SamLine sl){
948 		assert(border>0 || (trimq>=0 && (qtrimLeft || qtrimRight)));
949 		Read r=null;
950 		Scaffold sc=null;
951 		int leftTrimAmount=border, rightTrimAmount=border;
952 		if(border>0){
953 			r=sl.toRead(false);
954 			sc=table.get(sl.rnameS());
955 			assert(sc!=null) : sl+"\n\n"+sl.rnameS()+"\n\n"+table;
956 			int skipTrimRange=Tools.max(10, border+5);
957 			if(r.start<skipTrimRange){
958 				if(r.strand()==Shared.PLUS){leftTrimAmount=0;}
959 				else{rightTrimAmount=0;}
960 			}
961 			if(r.stop>sc.length-skipTrimRange){
962 				if(r.strand()==Shared.PLUS){rightTrimAmount=0;}
963 				else{leftTrimAmount=0;}
964 			}
965 		}
966 		final int len0=sl.length();
967 		if(qtrimLeft || qtrimRight){
968 			long packed=TrimRead.testOptimal(sl.seq, sl.qual, trimE);
969 			if(qtrimLeft){leftTrimAmount=Tools.max(leftTrimAmount, (int)((packed>>32)&0xFFFFFFFFL));}
970 			if(qtrimRight){rightTrimAmount=Tools.max(rightTrimAmount, (int)((packed)&0xFFFFFFFFL));}
971 //			assert(false) : qtrimLeft+", "+qtrimRight+", "+trimq+", "+trimE+", "+rightTrimAmount+"\n"+sl+"\n";
972 		}
973 		final int trimmed;
974 		if(leftTrimAmount<1 && rightTrimAmount<1){trimmed=0;}
975 		else{
976 			if(r==null){r=sl.toRead(false);}
977 			if(sc==null){sc=table.get(sl.rnameS());}
978 			int scaflen=(sc==null ? 1999999999 : sc.length);
979 			trimmed=TrimRead.trimReadWithMatch(r, sl, leftTrimAmount, rightTrimAmount, 0, scaflen, false);
980 		}
981 //		assert(trimmed==len0-sl.length()) : trimmed+", "+len0+", "+sl.length();
982 //		assert(rightTrimAmount>0) : qtrimLeft+", "+qtrimRight+", "+trimq+", "+trimE+", "+rightTrimAmount+"\n"+sl+"\n";
983 		return trimmed;
984 	}
985 
processSamLine(SamLine sl)986 	public boolean processSamLine(SamLine sl){
987 		readsProcessed++;
988 		basesProcessed+=sl.length();
989 		if(sl.duplicate() && !INCLUDE_DUPLICATES){return false;}
990 
991 		if(border>0 || (trimq>=0 && (qtrimLeft || qtrimRight))){
992 			if(sl.mapped() && sl.seq!=null){
993 				int trimmed=trim(sl);
994 				if(sl.length()<1 || trimmed<0){return false;}//trimmed<0 implies everything was trimmed
995 			}
996 		}
997 
998 		final int kmers=Tools.countKmers(sl.seq, k);
999 		final double cKmers=sl.qual==null ? kmers : Tools.countCorrectKmers(sl.qual, k);
1000 		kmersProcessed+=kmers;
1001 		correctKmers+=cKmers;
1002 		final boolean properPair=(sl.hasMate() && sl.mapped() && sl.primary() && sl.properPair());
1003 		if(PHYSICAL_COVERAGE && properPair){
1004 			SamLine mate=pairTable.remove(sl.qname);
1005 			if(mate==null){pairTable.put(sl.qname, sl);}
1006 			else{
1007 				final int start1=sl.start(INCLUDE_SOFT_CLIP, false);
1008 				final int stop1=sl.stop(start1, INCLUDE_SOFT_CLIP, false);
1009 				final int start2=mate.start(INCLUDE_SOFT_CLIP, false);
1010 				final int stop2=mate.stop(start2, INCLUDE_SOFT_CLIP, false);
1011 				final int strand=(sl.pairnum()==0 ? sl.strand() : mate.strand());
1012 				final int length=USE_TLEN ? sl.tlen : Tools.max(stop1, stop2)-Tools.min(start1, start2)+1;
1013 				mappedKmers+=kmers;
1014 				addCoverage(sl.rnameS(), null, null, Tools.min(start1, start2), Tools.max(stop1, stop2), length, sl.mappedNonClippedBases(), strand, 2, sl.properPair(), sl);
1015 			}
1016 		}else if(sl.mapped() && (USE_SECONDARY || sl.primary()) && sl.mapq>=minMapq){
1017 			assert(sl.seq!=null || sl.cigar!=null) : "This program requires bases or a cigar string for every sam line.  Problem line:\n"+sl+"\n";
1018 //			assert(sl.seq!=null) : sl.toString();
1019 			final int length=sl.length();
1020 			final int start=sl.start(INCLUDE_SOFT_CLIP, false);
1021 			final int stop=sl.stop(start, INCLUDE_SOFT_CLIP, false);
1022 //			assert(false && length==stop-start+1) : length+", "+start+", "+stop+", "+(stop-start+1);
1023 //			assert(false) : "'"+new String(sl.rname())+"', '"+sl.rnameS()+"'";
1024 //			assert(false) : "'"+sl.rnameS()+"'";
1025 			final byte[] match=(INCLUDE_DELETIONS ? null : sl.toShortMatch(true));
1026 			mappedKmers+=kmers;
1027 			return addCoverage(sl.rnameS(), sl.seq, match, start, stop, length, sl.mappedNonClippedBases(), sl.strand(), sl.hasMate() ? 1 : 2, sl.properPair(), sl);
1028 		}
1029 		return false;
1030 	}
1031 
1032 
processRead(Read r)1033 	public boolean processRead(Read r){
1034 		readsProcessed++;
1035 		final int kmers=Tools.countKmers(r.bases, k);
1036 		final double cKmers=r.quality==null ? kmers : Tools.countCorrectKmers(r.quality, k);
1037 		kmersProcessed+=kmers;
1038 		correctKmers+=cKmers;
1039 		if(r.mapped() && r.bases!=null){
1040 			if(USE_SECONDARY && r.sites!=null && r.sites.size()>0){
1041 				boolean b=false;
1042 				for(SiteScore ss : r.sites){
1043 					b=processRead(r, ss) || b;
1044 				}
1045 				return b;
1046 			}else{
1047 				final Read mate=r.mate;
1048 				final boolean set1=coords.set(r);
1049 				final boolean set2=(PHYSICAL_COVERAGE && r.paired() && r.pairnum()==0 && coords2.set(mate));
1050 				if(set1 && set2 && Tools.equals(coords.name, coords2.name)){
1051 					final int start1=coords.start;
1052 					final int stop1=coords.stop;
1053 					final int start2=coords2.start;
1054 					final int stop2=coords2.stop;
1055 					final int strand=r.strand();
1056 					final int length=Tools.max(stop1, stop2)-Tools.min(start1, start2)+1;
1057 					mappedKmers+=kmers;
1058 					addCoverage(new String(coords.name), null, null, Tools.min(start1, start2), Tools.max(stop1, stop2), length,
1059 							r.mappedNonClippedBases(), strand, 2-r.mateCount(), r.paired(), null);
1060 				}else{
1061 					if(set1){
1062 						mappedKmers+=kmers;
1063 						return addCoverage(new String(coords.name), r.bases, r.match, coords.start, coords.stop, r.length(),
1064 								r.mappedNonClippedBases(), coords.strand, 2-r.mateCount(), r.paired(), null);
1065 					}
1066 				}
1067 			}
1068 		}
1069 		return false;
1070 	}
1071 
1072 
processRead(Read r, SiteScore ss)1073 	public boolean processRead(Read r, SiteScore ss){
1074 		if(ss!=null && r.bases!=null){
1075 			if(coords.set(ss)){
1076 				return addCoverage(new String(coords.name), r.bases, ss.match, coords.start, coords.stop, r.length(),
1077 						r.mappedNonClippedBases(), coords.strand, 1, r.paired(), null);
1078 			}
1079 		}
1080 		return false;
1081 	}
1082 
1083 
1084 	/*--------------------------------------------------------------*/
1085 	/*----------------        Output Methods        ----------------*/
1086 	/*--------------------------------------------------------------*/
1087 
1088 
printOutput()1089 	public void printOutput(){
1090 
1091 		totalScaffolds=list.size();
1092 
1093 		long refKmers=0;
1094 		long refBases2=0;
1095 		for(Scaffold sc : list){
1096 			refBases2+=sc.length;
1097 			if(sc.length>=k){refKmers+=sc.length-k+1;}
1098 		}
1099 
1100 		String basecov1=(basecov==null ? null : (STRANDED ? basecov.replaceFirst("#", "1") : basecov));
1101 		String bincov1=(bincov==null ? null : (STRANDED ? bincov.replaceFirst("#", "1") : bincov));
1102 		String normcov1=(normcov==null ? null : (STRANDED ? normcov.replaceFirst("#", "1") : normcov));
1103 		String normcovOverall1=(normcovOverall==null ? null : (STRANDED ? normcovOverall.replaceFirst("#", "1") : normcovOverall));
1104 		String histogram1=(histogram==null ? null : (STRANDED ? histogram.replaceFirst("#", "1") : histogram));
1105 		String stats1=(covstats==null ? null : (STRANDED ? covstats.replaceFirst("#", "1") : covstats));
1106 
1107 		String basecov2=(basecov==null || !STRANDED ? null : basecov.replaceFirst("#", "2"));
1108 		String bincov2=(bincov==null || !STRANDED ? null : bincov.replaceFirst("#", "2"));
1109 		String normcov2=(normcov==null || !STRANDED ? null : normcov.replaceFirst("#", "2"));
1110 		String normcovOverall2=(normcovOverall==null ? null : (STRANDED ? normcovOverall.replaceFirst("#", "2") : normcovOverall));
1111 		String histogram2=(histogram==null || !STRANDED ? null : histogram.replaceFirst("#", "2"));
1112 		String stats2=(covstats==null || !STRANDED ? null : covstats.replaceFirst("#", "2"));
1113 
1114 		if(CONCISE){
1115 			writeCoveragePerBaseConcise(basecov1, list, 0, minscaf);
1116 			writeCoveragePerBaseConcise(basecov2, list, 1, minscaf);
1117 		}else{
1118 			writeCoveragePerBase(basecov1, list, DELTA_ONLY, 0, minscaf);
1119 			writeCoveragePerBase(basecov2, list, DELTA_ONLY, 1, minscaf);
1120 		}
1121 		if(KEEP_SHORT_BINS){
1122 			writeCoveragePerBaseBinned2(bincov1, list, binsize, 0, minscaf);
1123 			writeCoveragePerBaseBinned2(bincov2, list, binsize, 1, minscaf);
1124 		}else{
1125 			writeCoveragePerBaseBinned(bincov1, list, binsize, 0, minscaf);
1126 			writeCoveragePerBaseBinned(bincov2, list, binsize, 1, minscaf);
1127 		}
1128 		if(normcov!=null){
1129 			writeCoveragePerBaseNormalized(normcov1, list, binsize, 0, minscaf);
1130 			writeCoveragePerBaseNormalized(normcov2, list, binsize, 1, minscaf);
1131 		}
1132 		if(normcovOverall!=null){
1133 			writeCoveragePerBaseNormalizedOverall(normcovOverall1, list, binsize, 0, minscaf);
1134 			writeCoveragePerBaseNormalizedOverall(normcovOverall2, list, binsize, 1, minscaf);
1135 		}
1136 		if(outrpkm!=null){
1137 			writeRPKM(outrpkm, in1, null, readsProcessed, NONZERO_ONLY,list);
1138 		}
1139 
1140 		{
1141 			long[] hist=writeStats(stats1, 0);
1142 			if(hist!=null){writeHist(histogram1, hist);}
1143 
1144 			if(STRANDED){
1145 				hist=writeStats(stats2, 1);
1146 				if(hist!=null){writeHist(histogram2, hist);}
1147 			}
1148 		}
1149 
1150 		final double mult=1.0/refBases;
1151 		double depthCovered=mappedBases*mult;
1152 		double depthCovered2=mappedNonClippedBases*mult;
1153 		double depthCovered3=mappedBasesWithDels*mult;
1154 		double pctScaffoldsWithCoverage=scaffoldsWithCoverage1*100.0/totalScaffolds;
1155 		double pctCovered=totalCoveredBases1*100*mult;
1156 
1157 		if(!KEY_VALUE){
1158 			outstream.println("Reads:                               \t"+readsProcessed);
1159 			outstream.println("Mapped reads:                        \t"+mappedReads);
1160 			//		outstream.println("Mapped bases:                        \t"+mappedBases);
1161 			//		outstream.println("Mapped non-clipped bases:            \t"+mappedNonClippedBases);
1162 			outstream.println("Mapped bases:                        \t"+mappedNonClippedBases);
1163 			outstream.println("Ref scaffolds:                       \t"+totalScaffolds);
1164 			outstream.println("Ref bases:                           \t"+refBases);
1165 
1166 			if(k>0){
1167 				String kcovS=k+"-mer coverage:";
1168 				String kcorrectS="Percent correct "+k+"-mers:";
1169 				while(kcovS.length()<26){kcovS=kcovS+" ";}
1170 				while(kcovS.length()<26){kcorrectS=kcorrectS+" ";}
1171 				outstream.println(String.format(Locale.ROOT, "\n"+kcovS+"           \t%.3f", mappedKmers*1.0/refKmers));
1172 				outstream.println(String.format(Locale.ROOT, kcorrectS+"           \t%.3f", 100*correctKmers/kmersProcessed));
1173 				//			outstream.println(kmersProcessed+", "+correctKmers);
1174 			}
1175 
1176 			outstream.println(String.format(Locale.ROOT, "\nPercent mapped:                      \t%.3f", mappedReads*100f/readsProcessed));
1177 			outstream.println(String.format(Locale.ROOT, "Percent proper pairs:                \t%.3f", properPairs*100f/readsProcessed));
1178 //			outstream.println(depthCovered);
1179 			outstream.println(String.format(Locale.ROOT, "Average coverage:                    \t%.3f", depthCovered2));
1180 			outstream.println(String.format(Locale.ROOT, "Average coverage with deletions:     \t%.3f", depthCovered3));
1181 			if(USE_COVERAGE_ARRAYS && calcCovStdev){
1182 				double[] stdev=standardDeviation(list, 0, minscaf);
1183 				outstream.println(String.format(Locale.ROOT, "Standard deviation:                    \t%.3f", stdev[1]));
1184 			}
1185 			outstream.println(String.format(Locale.ROOT, "Percent scaffolds with any coverage: \t%.2f", pctScaffoldsWithCoverage));
1186 			if(USE_COVERAGE_ARRAYS || USE_BITSETS){
1187 				outstream.println(String.format(Locale.ROOT, "Percent of reference bases covered:  \t%.2f", pctCovered));
1188 			}
1189 		}else{
1190 			outstream.println("reads="+readsProcessed);
1191 			outstream.println("mappedReads="+mappedReads);
1192 			outstream.println("mappedBases="+mappedNonClippedBases);
1193 			outstream.println("mappedBasesWithDels="+mappedBasesWithDels);
1194 			outstream.println("refScaffolds="+totalScaffolds);
1195 			outstream.println("refBases="+refBases);
1196 			outstream.println(String.format(Locale.ROOT, "percentMapped=%.3f", mappedReads*100f/readsProcessed));
1197 			outstream.println(String.format(Locale.ROOT, "percentPaired=%.3f", properPairs*100f/readsProcessed));
1198 			outstream.println(String.format(Locale.ROOT, "averageCoverage=%.3f", depthCovered2));
1199 			if(USE_COVERAGE_ARRAYS && calcCovStdev){
1200 				double[] stdev=standardDeviation(list, 0, minscaf);
1201 				outstream.println(String.format(Locale.ROOT, "standardDeviation=%.3f", stdev[1]));
1202 			}
1203 			outstream.println(String.format(Locale.ROOT, "percentCoveredScaffolds=%.2f", pctScaffoldsWithCoverage));
1204 			if(USE_COVERAGE_ARRAYS || USE_BITSETS){
1205 				outstream.println(String.format(Locale.ROOT, "percentCoveredBases=%.2f", pctCovered));
1206 			}
1207 		}
1208 	}
1209 
basesUnderAverageCoverage(final int[] array, final double avg, final int window)1210 	public int basesUnderAverageCoverage(final int[] array, final double avg, final int window){
1211 		if(array.length<window){return 0;}
1212 		final long limit=(long)Math.ceil(window*avg);
1213 		long covSum=0;
1214 		int baseCount=0;
1215 		for(int i=0; i<window; i++){
1216 			covSum+=array[i];
1217 		}
1218 
1219 		boolean below=false;
1220 		int lastStop=-1, lastStart=0;
1221 		for(int a=0, b=window; b<array.length; a++, b++){
1222 			if(covSum>=limit){
1223 				if(below){//end range
1224 					baseCount=b-Tools.max(lastStop+1, lastStart);
1225 					lastStop=b-1;
1226 					below=false;
1227 				}
1228 			}else{
1229 				if(!below){//start range
1230 					lastStart=a;
1231 					below=true;
1232 				}
1233 			}
1234 			covSum-=array[a];
1235 			assert(covSum>=0);
1236 			covSum+=array[b];
1237 		}
1238 
1239 		if(below){//end range
1240 			baseCount=array.length-Tools.max(lastStop, lastStart);
1241 		}
1242 
1243 		assert(baseCount>=0);
1244 		return baseCount;
1245 	}
1246 
basesUnderAverageCoverage(final char[] array, final double avg, final int window)1247 	public int basesUnderAverageCoverage(final char[] array, final double avg, final int window){
1248 		if(array.length<window){return 0;}
1249 		final long limit=(long)Math.ceil(window*avg);
1250 		long covSum=0;
1251 		int baseCount=0;
1252 		for(int i=0; i<window; i++){
1253 			covSum+=array[i];
1254 		}
1255 
1256 //		outstream.println("limit: "+limit);
1257 
1258 		boolean below=false;
1259 		int lastStop=-1, lastStart=0;
1260 		for(int a=0, b=window; b<array.length; a++, b++){
1261 			if(covSum>=limit){
1262 				if(below){//end range
1263 					baseCount+=b-Tools.max(lastStop+1, lastStart);
1264 
1265 //					outstream.println("\nprev: "+lastStop+", "+lastStart);
1266 //					outstream.println("end range at "+a+", "+b);
1267 //					outstream.println("baseCount: "+baseCount+", covSum="+covSum);
1268 
1269 					lastStop=b-1;
1270 					below=false;
1271 				}
1272 			}else{
1273 				if(!below){//start range
1274 
1275 //					outstream.println("\nprev: "+lastStop+", "+lastStart);
1276 //					outstream.println("start range at "+a+", "+b);
1277 //					outstream.println("baseCount: "+baseCount+", covSum="+covSum);
1278 
1279 					lastStart=a;
1280 					below=true;
1281 				}
1282 			}
1283 			covSum-=array[a];
1284 			assert(covSum>=0);
1285 			covSum+=array[b];
1286 		}
1287 
1288 		if(below){//end range
1289 			baseCount+=array.length-Tools.max(lastStop+1, lastStart);
1290 
1291 //			outstream.println("\nprev: "+lastStop+", "+lastStart);
1292 //			outstream.println("end range at "+array.length);
1293 //			outstream.println("baseCount: "+baseCount+", covSum="+covSum);
1294 		}
1295 
1296 		assert(baseCount>=0);
1297 		return baseCount;
1298 	}
1299 
writeStats(String fname, int strand)1300 	public long[] writeStats(String fname, int strand){
1301 //		outstream.println("Writing stats for "+fname+", "+strand);
1302 		final TextStreamWriter tsw=(fname==null ? null : new TextStreamWriter(fname, overwrite, false, true));
1303 
1304 		if(tsw!=null){
1305 			tsw.start();
1306 			if(printHeader){
1307 				String pound=(headerPound ? "#" : "");
1308 				if(TWOCOLUMN){
1309 					tsw.println(pound+"ID\tAvg_fold");
1310 				}else{
1311 					tsw.println(pound+"ID\tAvg_fold\tLength\tRef_GC\tCovered_percent\tCovered_bases\tPlus_reads\tMinus_reads"+
1312 							(COUNT_GC ? "\tRead_GC" : "")+
1313 							(USE_COVERAGE_ARRAYS ? ("\tMedian_fold\tStd_Dev") : "")+
1314 							(USE_WINDOW ? "\tUnder_"+String.format(Locale.ROOT, "%.0f",LOW_COV_DEPTH)+"/"+LOW_COV_WINDOW : ""));
1315 				}
1316 			}
1317 			//Maximally:
1318 			//"ID\tAvg_fold\tLength\tRef_GC\tCovered_percent\tCovered_bases\tPlus_reads\tMinus_reads\tRead_GC\tMedian_fold\tStd_Dev\tUnder_X"
1319 		}
1320 
1321 		final int histmax=(bits32 ? 1000000 : Character.MAX_VALUE);
1322 		final LongList hist=new LongList(Character.MAX_VALUE);
1323 
1324 		long coveredScafTemp=0;
1325 		long coveredBaseTemp=0;
1326 		for(Scaffold scaf : list){
1327 			final long sum=scaf.basehits;
1328 			int covered=0;
1329 			int median=0;
1330 			int underWindowAverage=0;
1331 			final double stdev;
1332 			if(USE_COVERAGE_ARRAYS){
1333 				CoverageArray ca=(CoverageArray)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1);
1334 				if(ca!=null){
1335 					for(int i=0; i<scaf.length; i++){
1336 						int x=ca.get(i);
1337 						hist.increment(Tools.min(x, histmax));
1338 //						sum+=x;
1339 						if(x>=minDepthToBeCovered){covered++;}
1340 					}
1341 					if(bits32){
1342 						int[] array=((CoverageArray3)ca).array;
1343 						stdev=Tools.standardDeviation(array);
1344 						underWindowAverage=basesUnderAverageCoverage(array, LOW_COV_DEPTH, LOW_COV_WINDOW);
1345 						Shared.sort(array);
1346 						Tools.reverseInPlace(array);
1347 						median=ca.get(scaf.length/2);
1348 					}else{
1349 						char[] array=((CoverageArray2)ca).array;
1350 						stdev=Tools.standardDeviation(array);
1351 						underWindowAverage=basesUnderAverageCoverage(array, LOW_COV_DEPTH, LOW_COV_WINDOW);
1352 						Arrays.sort(array);
1353 						Tools.reverseInPlace(array);
1354 						median=ca.get(scaf.length/2);
1355 					}
1356 				}else{
1357 					hist.increment(0, scaf.length);
1358 					stdev=0;
1359 				}
1360 			}else if(USE_BITSETS){
1361 //				sum+=scaf.basehits;
1362 				BitSet bs=(BitSet)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1);
1363 				covered=(bs==null ? 0 : bs.cardinality());
1364 				stdev=-1;
1365 			}else{
1366 				stdev=-1;
1367 			}
1368 
1369 			if(sum>0){
1370 				coveredScafTemp++;
1371 			}
1372 			//			pw.print(scaf.name);
1373 			if(tsw!=null && (sum>0 || !NONZERO_ONLY) && scaf.length>=minscaf){
1374 				if(TWOCOLUMN){
1375 					tsw.print(String.format(Locale.ROOT, "%s\t%.4f\n", scaf.name, sum/(double)scaf.length));
1376 				}else if(COUNT_GC){
1377 					long[] bc=scaf.basecount;
1378 					double gc=(bc[1]+bc[2])*1d/Data.max(1, bc[0]+bc[1]+bc[2]+bc[3]);
1379 					if(USE_COVERAGE_ARRAYS){
1380 						if(USE_WINDOW){
1381 
1382 							//Variable portion:
1383 							//"\tRead_GC\tMedian_fold\tStd_Dev\tUnder_X\n"
1384 							//\t%.4f\t%d\t%.2f\t%d\n
1385 
1386 							tsw.print(String.format(Locale.ROOT, "%s\t%.4f\t%d\t%.4f\t%.4f\t%d\t%d\t%d\t%.4f\t%d\t%.2f\t%d\n", scaf.name, sum/(double)scaf.length, scaf.length,
1387 									scaf.gc, covered*100d/scaf.length, covered, (scaf.readhits-scaf.readhitsMinus), scaf.readhitsMinus,
1388 									gc, median, stdev, underWindowAverage));
1389 						}else{
1390 							tsw.print(String.format(Locale.ROOT, "%s\t%.4f\t%d\t%.4f\t%.4f\t%d\t%d\t%d\t%.4f\t%d\t%.2f\n", scaf.name, sum/(double)scaf.length, scaf.length,
1391 									scaf.gc, covered*100d/scaf.length, covered, (scaf.readhits-scaf.readhitsMinus), scaf.readhitsMinus,
1392 									gc, median, stdev));
1393 						}
1394 					}else{
1395 						tsw.print(String.format(Locale.ROOT, "%s\t%.4f\t%d\t%.4f\t%.4f\t%d\t%d\t%d\t%.4f\n", scaf.name, sum/(double)scaf.length, scaf.length,
1396 								scaf.gc, covered*100d/scaf.length, covered, (scaf.readhits-scaf.readhitsMinus), scaf.readhitsMinus,
1397 								gc));
1398 					}
1399 				}else{
1400 					if(USE_COVERAGE_ARRAYS){
1401 						if(USE_WINDOW){
1402 							tsw.print(String.format(Locale.ROOT, "%s\t%.4f\t%d\t%.4f\t%.4f\t%d\t%d\t%d\t%d\t%.2f\t%d\n", scaf.name, sum/(double)scaf.length, scaf.length,
1403 									scaf.gc, covered*100d/scaf.length, covered, (scaf.readhits-scaf.readhitsMinus), scaf.readhitsMinus,
1404 									median, stdev, underWindowAverage));
1405 						}else{
1406 							tsw.print(String.format(Locale.ROOT, "%s\t%.4f\t%d\t%.4f\t%.4f\t%d\t%d\t%d\t%d\t%.2f\n", scaf.name, sum/(double)scaf.length, scaf.length,
1407 									scaf.gc, covered*100d/scaf.length, covered, (scaf.readhits-scaf.readhitsMinus), scaf.readhitsMinus,
1408 									median, stdev));
1409 						}
1410 					}else{
1411 						tsw.print(String.format(Locale.ROOT, "%s\t%.4f\t%d\t%.4f\t%.4f\t%d\t%d\t%d\n", scaf.name, sum/(double)scaf.length, scaf.length,
1412 								scaf.gc, covered*100d/scaf.length, covered, (scaf.readhits-scaf.readhitsMinus), scaf.readhitsMinus));
1413 					}
1414 				}
1415 			}
1416 			coveredBaseTemp+=covered;
1417 		}
1418 
1419 		if(strand==0){
1420 			scaffoldsWithCoverage1+=coveredScafTemp;
1421 			totalCoveredBases1+=coveredBaseTemp;
1422 		}else{
1423 			scaffoldsWithCoverage2+=coveredScafTemp;
1424 			totalCoveredBases2+=coveredBaseTemp;
1425 		}
1426 
1427 		if(tsw!=null){tsw.poisonAndWait();}
1428 		return hist==null ? null : hist.array;
1429 	}
1430 
1431 	/**
1432 	 * Write a histogram of number of bases covered to each depth
1433 	 * @param fname Output filename
1434 	 * @param counts counts[X] stores the number of bases with coverage X
1435 	 */
writeHist(String fname, long[] counts)1436 	public static void writeHist(String fname, long[] counts){
1437 		if(fname==null){return;}
1438 		assert(counts!=null) : "Can't write a histogram with null counts.";
1439 		ByteStreamWriter tsw=new ByteStreamWriter(fname, overwrite, false, false);
1440 		tsw.start();
1441 		if(printHeader){
1442 			if(headerPound){tsw.print('#');}
1443 			tsw.println("Coverage\tnumBases");
1444 		}
1445 		int max=0;
1446 		for(max=counts.length-1; max>0 && counts[max]==0; max--){}
1447 		for(int i=0; i<=max; i++){
1448 			long x=counts[i];
1449 			tsw.print(i);
1450 			tsw.print('\t');
1451 			tsw.println(x);
1452 		}
1453 
1454 		tsw.poisonAndWait();
1455 	}
1456 
1457 	/**
1458 	 * Prints coverage in this format:
1459 	 * scafname TAB position TAB coverage
1460 	 * scafname TAB position TAB coverage
1461 	 * @param fname Output filename
1462 	 * @param list List of reference scaffolds
1463 	 * @param deltaOnly Only write lines when coverage changes
1464 	 * @param strand Only use coverage from reads mapped to this strand (0=plus, 1=minus)
1465 	 */
writeCoveragePerBase(String fname, ArrayList<Scaffold> list, boolean deltaOnly, int strand, int minscaf)1466 	public void writeCoveragePerBase(String fname, ArrayList<Scaffold> list, boolean deltaOnly, int strand, int minscaf){
1467 		if(fname==null || (!STRANDED && strand>0)){return;}
1468 
1469 		if(verbose){outstream.println("Starting tsw "+fname);}
1470 		ByteStreamWriter tsw=new ByteStreamWriter(fname, overwrite, false, true);
1471 		if(verbose){outstream.println("Created tsw "+fname);}
1472 		tsw.start();
1473 //		if(verbose){outstream.println("Started tsw "+fname);}
1474 		if(printHeader){
1475 			if(headerPound){tsw.print('#');}
1476 			tsw.println("RefName\tPos\tCoverage");
1477 		}
1478 
1479 		for(Scaffold scaf : list){
1480 			int last=-1;
1481 			CoverageArray ca=(CoverageArray)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1);
1482 			if(scaf.length>=minscaf){
1483 				for(int i=0, len=scaf.length; i<len; i++){
1484 					int x=(ca==null ? 0 : ca.get(i));
1485 					if(!deltaOnly || x!=last){
1486 						if(x>0 || !NONZERO_ONLY){
1487 							tsw.print(scaf.name);
1488 							tsw.print('\t');
1489 							tsw.print(i);
1490 							tsw.print('\t');
1491 							tsw.println(x);
1492 						}
1493 						last=x;
1494 					}
1495 				}
1496 			}
1497 		}
1498 
1499 		if(verbose){outstream.println("Closing tsw "+fname);}
1500 		tsw.poisonAndWait();
1501 		if(verbose){outstream.println("Closed tsw "+fname);}
1502 	}
1503 
1504 	/**
1505 	 * Prints coverage in this format, skipping zero-coverage positions:
1506 	 * #scafname
1507 	 * position TAB coverage
1508 	 * position TAB coverage
1509 	 * @param fname Output filename
1510 	 * @param list List of reference scaffolds
1511 	 * @param strand Only use coverage from reads mapped to this strand (0=plus, 1=minus)
1512 	 */
writeCoveragePerBaseConcise(String fname, ArrayList<Scaffold> list, int strand, int minscaf)1513 	public void writeCoveragePerBaseConcise(String fname, ArrayList<Scaffold> list, int strand, int minscaf){
1514 		if(fname==null || (!STRANDED && strand>0)){return;}
1515 
1516 		if(verbose){outstream.println("Starting tsw "+fname);}
1517 		ByteStreamWriter tsw=new ByteStreamWriter(fname, overwrite, false, true);
1518 		tsw.start();
1519 		if(verbose){outstream.println("Started tsw "+fname);}
1520 //		tsw.print(pound+"RefName\tPos\tCoverage\n");
1521 
1522 		for(Scaffold scaf : list){
1523 			tsw.print('#');
1524 			tsw.println(scaf.name);
1525 			CoverageArray ca=(CoverageArray)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1);
1526 			if(scaf.length>=minscaf){
1527 				for(int i=0; i<scaf.length; i++){
1528 					int x=(ca==null ? 0 : ca.get(i));
1529 					if(x>0){
1530 						tsw.print(i);
1531 						tsw.print('\t');
1532 						tsw.println(x);
1533 					}
1534 				}
1535 			}
1536 		}
1537 
1538 		if(verbose){outstream.println("Closing tsw "+fname);}
1539 		tsw.poisonAndWait();
1540 		if(verbose){outstream.println("Closed tsw "+fname);}
1541 //		assert(false);
1542 	}
1543 
1544 	/**
1545 	 * Note.  As written, this will truncate all trailing bases of each scaffold's length modulo binsize.
1546 	 * For example, with binsize 1000, the last 500 bases of a 1500 base scaffold will be ignored.
1547 	 * @param fname Output filename
1548 	 * @param list List of reference scaffolds
1549 	 * @param binsize Width of coverage bins in bp
1550 	 * @param strand Only use coverage from reads mapped to this strand (0=plus, 1=minus)
1551 	 */
writeCoveragePerBaseBinned(String fname, ArrayList<Scaffold> list, int binsize, int strand, int minscaf)1552 	public static void writeCoveragePerBaseBinned(String fname, ArrayList<Scaffold> list, int binsize, int strand, int minscaf){
1553 		if(fname==null || (!STRANDED && strand>0)){return;}
1554 		TextStreamWriter tsw=new TextStreamWriter(fname, overwrite, false, false);
1555 		tsw.start();
1556 		if(printHeader){
1557 			String pound=(headerPound ? "#" : "");
1558 			if(calcCovStdev){
1559 				double[] stdev=standardDeviation(list, strand, minscaf);
1560 				if(stdev!=null){
1561 					tsw.print(pound+"Mean\t"+String.format(Locale.ROOT, "%.3f", stdev[0])+"\n");
1562 					tsw.print(pound+"STDev\t"+String.format(Locale.ROOT, "%.3f", stdev[1])+"\n");
1563 				}
1564 			}
1565 			tsw.print(pound+"RefName\tCov\tPos\tRunningPos\n");
1566 		}
1567 
1568 		long running=0;
1569 		final float invbin=1f/binsize;
1570 		for(Scaffold scaf : list){
1571 			if(scaf.length>=binsize && scaf.length>=minscaf){
1572 				CoverageArray ca=(CoverageArray)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1);
1573 				int lastPos=-1, nextPos=binsize-1;
1574 				long sum=0;
1575 				for(int i=0; i<scaf.length; i++){
1576 					int x=(ca==null ? 0 : ca.get(i));
1577 					sum+=x;
1578 					if(i>=nextPos){
1579 						if(sum>0 || !NONZERO_ONLY){
1580 							tsw.print(String.format(Locale.ROOT, "%s\t%.2f\t%d\t%d\n", scaf.name, sum*invbin, (i+1), running));
1581 						}
1582 						lastPos=i;
1583 						running+=binsize;
1584 						nextPos+=binsize;
1585 						sum=0;
1586 					}
1587 				}
1588 			}
1589 		}
1590 
1591 		tsw.poisonAndWait();
1592 	}
1593 
1594 	/**
1595 	 * This version will NOT truncate all trailing bases of each scaffold's length modulo binsize.
1596 	 * @param fname Output filename
1597 	 * @param list List of reference scaffolds
1598 	 * @param binsize Width of coverage bins in bp
1599 	 * @param strand Only use coverage from reads mapped to this strand (0=plus, 1=minus)
1600 	 */
writeCoveragePerBaseBinned2(String fname, ArrayList<Scaffold> list, int binsize, int strand, int minscaf)1601 	public static void writeCoveragePerBaseBinned2(String fname, ArrayList<Scaffold> list, int binsize, int strand, int minscaf){
1602 		if(fname==null || (!STRANDED && strand>0)){return;}
1603 		TextStreamWriter tsw=new TextStreamWriter(fname, overwrite, false, false);
1604 		tsw.start();
1605 		if(printHeader){
1606 			String pound=(headerPound ? "#" : "");
1607 			if(calcCovStdev){
1608 				double[] stdev=standardDeviationBinned(list, binsize, strand, minscaf);
1609 				if(stdev!=null){
1610 					tsw.print(pound+"Mean\t"+String.format(Locale.ROOT, "%.3f", stdev[0])+"\n");
1611 					tsw.print(pound+"STDev\t"+String.format(Locale.ROOT, "%.3f", stdev[1])+"\n");
1612 				}
1613 			}
1614 			tsw.print(pound+"RefName\tCov\tPos\tRunningPos\n");
1615 		}
1616 
1617 		long running=0;
1618 		for(Scaffold scaf : list){
1619 			CoverageArray ca=(CoverageArray)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1);
1620 			int lastPos=-1, nextPos=binsize-1;
1621 			long sum=0;
1622 			final int lim=scaf.length-1;
1623 			for(int i=0; i<scaf.length; i++){
1624 				int x=(ca==null ? 0 : ca.get(i));
1625 				sum+=x;
1626 				if(i>=nextPos || i==lim){
1627 					int bin=(i-lastPos);
1628 					if(scaf.length>=minscaf){
1629 						if(sum>0 || !NONZERO_ONLY){
1630 							tsw.print(String.format(Locale.ROOT, "%s\t%.2f\t%d\t%d\n", scaf.name, sum/(float)bin, (i+1), running));
1631 						}
1632 					}
1633 					running+=bin;
1634 					nextPos+=binsize;
1635 					lastPos=i;
1636 					sum=0;
1637 				}
1638 			}
1639 		}
1640 
1641 		tsw.poisonAndWait();
1642 	}
1643 
1644 	//Unsafe because it will fail if there are over 2 billion bins
standardDeviationBinnedUnsafe(String fname, ArrayList<Scaffold> scaffolds, int binsize, int strand, int minscaf)1645 	public static double[] standardDeviationBinnedUnsafe(String fname, ArrayList<Scaffold> scaffolds, int binsize, int strand, int minscaf){
1646 
1647 		LongList list=new LongList();
1648 		for(Scaffold scaf : scaffolds){
1649 			CoverageArray ca=(CoverageArray)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1);
1650 			int lastPos=-1, nextPos=binsize-1;
1651 			long sum=0;
1652 			final int lim=scaf.length-1;
1653 			for(int i=0; i<scaf.length; i++){
1654 				int x=(ca==null ? 0 : ca.get(i));
1655 				sum+=x;
1656 				if(i>=nextPos || i==lim){
1657 					int bin=(i-lastPos);
1658 					if(scaf.length>=minscaf){
1659 						list.add((int)(10*(sum/(double)bin)));
1660 					}
1661 					nextPos+=binsize;
1662 					lastPos=i;
1663 					sum=0;
1664 				}
1665 			}
1666 		}
1667 		list.sort();
1668 
1669 		double mean=0.1*list.mean();
1670 		double median=0.1*list.median();
1671 		double mode=0.1*list.mode();
1672 		double stdev=0.1*list.stdev();
1673 		return new double[] {mean, median, mode, stdev};
1674 	}
1675 
standardDeviationBinned(ArrayList<Scaffold> scaffolds, int binsize, int strand, int minscaf)1676 	public static double[] standardDeviationBinned(ArrayList<Scaffold> scaffolds, int binsize, int strand, int minscaf){
1677 		double totalSum=0;
1678 		long bins=0;
1679 
1680 		for(Scaffold scaf : scaffolds){
1681 			if(scaf.length>=minscaf){
1682 				CoverageArray ca=(CoverageArray)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1);
1683 				int lastPos=-1, nextPos=binsize-1;
1684 				long tempSum=0;
1685 				final int lim=scaf.length-1;
1686 				for(int i=0; i<scaf.length; i++){
1687 					int x=(ca==null ? 0 : ca.get(i));
1688 					tempSum+=x;
1689 					if(i>=nextPos || i==lim){
1690 						int bin=(i-lastPos);
1691 						double depth=(tempSum/(double)bin);
1692 						totalSum+=depth;
1693 						bins++;
1694 						nextPos+=binsize;
1695 						lastPos=i;
1696 						tempSum=0;
1697 					}
1698 				}
1699 			}
1700 		}
1701 
1702 		if(bins<1){return new double[] {0, 0};}
1703 		final double mean=totalSum/(double)bins;
1704 		double sumdev2=0;
1705 
1706 		for(Scaffold scaf : scaffolds){
1707 			if(scaf.length>=minscaf){
1708 				CoverageArray ca=(CoverageArray)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1);
1709 				int lastPos=-1, nextPos=binsize-1;
1710 				long tempSum=0;
1711 				final int lim=scaf.length-1;
1712 				for(int i=0; i<scaf.length; i++){
1713 					int x=(ca==null ? 0 : ca.get(i));
1714 					tempSum+=x;
1715 					if(i>=nextPos || i==lim){
1716 						int bin=(i-lastPos);
1717 						double depth=(tempSum/(double)bin);
1718 						double dev=mean-depth;
1719 						sumdev2+=(dev*dev);
1720 						nextPos+=binsize;
1721 						lastPos=i;
1722 						tempSum=0;
1723 					}
1724 				}
1725 			}
1726 		}
1727 
1728 		final double stdev=Math.sqrt(sumdev2/bins);
1729 		return new double[] {mean, stdev};
1730 	}
1731 
standardDeviation(ArrayList<Scaffold> scaffolds, int strand, int minscaf)1732 	public static double[] standardDeviation(ArrayList<Scaffold> scaffolds, int strand, int minscaf){
1733 		long totalSum=0, bins=0;
1734 		for(Scaffold scaf : scaffolds){
1735 //			System.err.println(scaf.name+", "+scaf.length+"/"+minscaf);
1736 			if(scaf.length>=minscaf){
1737 				final CoverageArray ca=(CoverageArray)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1);
1738 				bins+=scaf.length;
1739 				for(int i=0; i<scaf.length; i++){
1740 					int depth=(ca==null ? 0 : ca.get(i));
1741 					totalSum+=depth;
1742 				}
1743 			}
1744 		}
1745 
1746 		if(bins<1){return new double[] {0, 0};}
1747 		final double mean=totalSum/(double)bins;
1748 		double sumdev2=0;
1749 //		System.err.println(totalSum+", "+bins+", "+mean);
1750 		for(Scaffold scaf : scaffolds){
1751 			if(scaf.length>=minscaf){
1752 				final CoverageArray ca=(CoverageArray)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1);
1753 				double sumTemp=0;
1754 				for(int i=0; i<scaf.length; i++){
1755 					int depth=(ca==null ? 0 : ca.get(i));
1756 					double dev=mean-depth;
1757 					sumTemp+=(dev*dev);
1758 				}
1759 				sumdev2+=sumTemp;
1760 			}
1761 		}
1762 
1763 		final double stdev=Math.sqrt(sumdev2/bins);
1764 //		System.err.println(totalSum+", "+bins+", "+mean+", "+stdev);
1765 		return new double[] {mean, stdev};
1766 	}
1767 
1768 
1769 	/**
1770 	 * @param fname Output filename
1771 	 * @param list List of reference scaffolds
1772 	 * @param binsize Width of coverage bins in bp
1773 	 * @param strand Only use coverage from reads mapped to this strand (0=plus, 1=minus)
1774 	 */
writeCoveragePerBaseNormalized(String fname, ArrayList<Scaffold> list, double binsize, int strand, int minscaf)1775 	public static void writeCoveragePerBaseNormalized(String fname, ArrayList<Scaffold> list, double binsize, int strand, int minscaf){
1776 		if(fname==null || (!STRANDED && strand>0)){return;}
1777 		TextStreamWriter tsw=new TextStreamWriter(fname, overwrite, false, false);
1778 		tsw.start();
1779 		if(printHeader){
1780 			String pound=(headerPound ? "#" : "");
1781 			tsw.print(pound+"RefName\tBin\tCov\tPos\tRunningPos\n");
1782 		}
1783 
1784 		double running=0;
1785 		double invbin=1.0/binsize;
1786 		final double invbincount=1.0/NORMALIZE_LENGTH_BINS;
1787 		for(Scaffold scaf : list){
1788 			if(NORMALIZE_LENGTH_BINS>0){
1789 				binsize=scaf.length*invbincount;
1790 				invbin=1.0/binsize;
1791 			}
1792 
1793 			if(scaf.length>=binsize && scaf.length>=minscaf){
1794 				long max=-1;
1795 
1796 				final CoverageArray ca=(CoverageArray)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1);
1797 				double lastPos=-1, nextPos=binsize-1;
1798 				long sum=0;
1799 
1800 				if(NORMALIZE_COVERAGE){
1801 					for(int i=0; i<scaf.length; i++){
1802 						int x=(ca==null ? 0 : ca.get(i));
1803 						sum+=x;
1804 						if(i>=nextPos){
1805 							max=Tools.max(sum, max);
1806 							running+=binsize;
1807 							nextPos+=binsize;
1808 							sum=0;
1809 						}
1810 					}
1811 					lastPos=-1;
1812 					nextPos=binsize-1;
1813 					sum=0;
1814 					assert(max>-1) : max;
1815 				}
1816 				max=Tools.max(max, 1);
1817 				final double binmult=(NORMALIZE_COVERAGE ? 1d/max : invbin);
1818 
1819 //				assert(false) : NORMALIZE_COVERAGE+", "+binmult+", "+invbin+", "+max+", "+binsize;
1820 
1821 				final String formatString=NORMALIZE_COVERAGE ? "%s\t%d\t%.5f\t%d\t%d\n" : "%s\t%d\t%.2f\t%d\t%d\n";
1822 				int bin=1;
1823 				for(int i=0; i<scaf.length; i++){
1824 					int x=(ca==null ? 0 : ca.get(i));
1825 					sum+=x;
1826 					if(i>=nextPos){
1827 //						outstream.println(x+", "+i+", "+nextPos+", "+sum+", "+(sum*binmult));
1828 						tsw.print(String.format(formatString, scaf.name, bin, sum*binmult, (i+1), (long)running));
1829 						bin++;
1830 						lastPos=i;
1831 						running+=binsize;
1832 						nextPos+=binsize;
1833 						sum=0;
1834 					}
1835 				}
1836 			}
1837 		}
1838 
1839 		tsw.poisonAndWait();
1840 	}
1841 
1842 
1843 
1844 	/**
1845 	 * @param fname Output filename
1846 	 * @param list List of reference scaffolds
1847 	 * @param binsize Width of coverage bins in bp
1848 	 * @param strand Only use coverage from reads mapped to this strand (0=plus, 1=minus)
1849 	 */
writeCoveragePerBaseNormalizedOverall(String fname, ArrayList<Scaffold> list, double binsize, int strand, int minscaf)1850 	public static void writeCoveragePerBaseNormalizedOverall(String fname, ArrayList<Scaffold> list, double binsize, int strand, int minscaf){
1851 		if(fname==null || (!STRANDED && strand>0)){return;}
1852 
1853 		assert(NORMALIZE_LENGTH_BINS>0) : "Must set 'normalizebins' flag to a positive integer.";
1854 		double running=0;
1855 		double invbin=1.0/binsize;
1856 		long usedScafs=0;
1857 		final double invbincount=1.0/NORMALIZE_LENGTH_BINS;
1858 
1859 		double[] normalized=new double[NORMALIZE_LENGTH_BINS+1];
1860 		double[] absolute=new double[NORMALIZE_LENGTH_BINS+1];
1861 
1862 		for(Scaffold scaf : list){
1863 			if(NORMALIZE_LENGTH_BINS>0){
1864 				binsize=scaf.length*invbincount;
1865 				invbin=1.0/binsize;
1866 			}
1867 
1868 			if(scaf.length>=binsize && scaf.length>=minscaf){
1869 				usedScafs++;
1870 
1871 				if(scaf.readhits>0){
1872 					long max=-1;
1873 					final CoverageArray ca=(CoverageArray)(STRANDED && strand==1 ? scaf.obj2 : scaf.obj1);
1874 					double lastPos=-1, nextPos=binsize-1;
1875 					long sum=0;
1876 
1877 					{
1878 						for(int i=0; i<scaf.length; i++){
1879 							int x=(ca==null ? 0 : ca.get(i));
1880 							sum+=x;
1881 							if(i>=nextPos){
1882 								max=Tools.max(sum, max);
1883 								running+=binsize;
1884 								nextPos+=binsize;
1885 								sum=0;
1886 							}
1887 						}
1888 						lastPos=-1;
1889 						nextPos=binsize-1;
1890 						sum=0;
1891 						assert(max>-1) : max;
1892 					}
1893 					max=Tools.max(max, 1);
1894 					final double binmult=1d/max;
1895 
1896 					//				assert(false) : NORMALIZE_COVERAGE+", "+binmult+", "+invbin+", "+max+", "+binsize;
1897 
1898 					int bin=1;
1899 					for(int i=0; i<scaf.length; i++){
1900 						int x=(ca==null ? 0 : ca.get(i));
1901 						sum+=x;
1902 						if(i>=nextPos){
1903 							normalized[bin]+=(sum*binmult);
1904 							absolute[bin]+=(sum*invbin);
1905 							bin++;
1906 							lastPos=i;
1907 							running+=binsize;
1908 							nextPos+=binsize;
1909 							sum=0;
1910 						}
1911 					}
1912 				}
1913 			}
1914 		}
1915 
1916 		TextStreamWriter tsw=new TextStreamWriter(fname, overwrite, false, false);
1917 		tsw.start();
1918 		if(printHeader){
1919 			String pound=(headerPound ? "#" : "");
1920 			tsw.print(pound+"RefName\tBin\tAbsCov\tNormCov\n");
1921 		}
1922 		double invScafs=1d/Tools.max(1, usedScafs);
1923 
1924 		final double maxNorm=Tools.max(normalized);
1925 		final double normMult=1/maxNorm;
1926 
1927 		for(int bin=1; bin<normalized.length; bin++){
1928 //			assert((absolute[bin]*invScafs)!=Double.NaN && (normalized[bin]*invScafs)!=Double.NaN) : invScafs+", "+absolute[bin]+", "+normalized[bin];
1929 //			assert(false) : invScafs+", "+absolute[bin]+", "+normalized[bin]+", "+(absolute[bin]*invScafs)+", "+(normalized[bin]*invScafs);
1930 			tsw.print(String.format(Locale.ROOT, "%s\t%d\t%.5f\t%.5f\n", "all", bin, absolute[bin]*invScafs, normalized[bin]*normMult));
1931 		}
1932 
1933 		tsw.poisonAndWait();
1934 	}
1935 
1936 
1937 
1938 	/**
1939 	 * Write RPKM statistics.
1940 	 */
writeRPKM(String out, String in1, String in2, long readsIn, boolean printNonZeroOnly, ArrayList<Scaffold> list)1941 	public static void writeRPKM(String out, String in1, String in2, long readsIn, boolean printNonZeroOnly, ArrayList<Scaffold> list){
1942 		if(out==null){return;}
1943 		final TextStreamWriter tsw=new TextStreamWriter(out, overwrite, false, false);
1944 		tsw.start();
1945 
1946 		/* Count mapped reads */
1947 		long mappedReads=0;
1948 		long mappedFrags=0;
1949 		for(Scaffold scaf : list){
1950 			mappedReads+=scaf.readhits;
1951 			mappedFrags+=scaf.fraghits;
1952 		}
1953 		mappedFrags/=2;
1954 
1955 		/* Print header */
1956 		tsw.print("#File\t"+(in1==null ? "" : in1)+(in2==null ? "" : "\t"+in2)+"\n");
1957 		tsw.print(String.format(Locale.ROOT, "#Reads\t%d\n",readsIn));
1958 		tsw.print(String.format(Locale.ROOT, "#Mapped\t%d\n",mappedReads));
1959 		tsw.print(String.format(Locale.ROOT, "#RefSequences\t%d\n",list.size()));
1960 		tsw.print("#Name\tLength\tBases\tCoverage\tReads\tRPKM\tFrags\tFPKM\n");
1961 
1962 		final float readMult=1000000000f/Tools.max(1, mappedReads);
1963 		final float fragMult=1000000000f/Tools.max(1, mappedFrags);
1964 
1965 		/* Print data */
1966 		for(final Scaffold scaf : list){
1967 			final long reads=scaf.readhits;
1968 			final long frags=scaf.fraghits/2;
1969 			final long bases=scaf.basehits;
1970 			final String s=scaf.name;
1971 			final int len=scaf.length;
1972 			final double invlen=1.0/Tools.max(1, len);
1973 			final double readMult2=readMult*invlen;
1974 			final double fragMult2=fragMult*invlen;
1975 			if(reads>0 || !printNonZeroOnly){
1976 				tsw.print(String.format(Locale.ROOT, "%s\t%d\t%d\t%.4f\t%d\t%.4f\t%d\t%.4f\n",s,len,bases,bases*invlen,reads,reads*readMult2,frags,frags*fragMult2));
1977 			}
1978 		}
1979 		tsw.poisonAndWait();
1980 	}
1981 
1982 
1983 	/*--------------------------------------------------------------*/
1984 	/*----------------            Fields            ----------------*/
1985 	/*--------------------------------------------------------------*/
1986 
1987 	/** The list of all scaffolds */
1988 	private ArrayList<Scaffold> list;
1989 	/** Maps names to scaffolds */
1990 	private HashMap<String, Scaffold> table;
1991 	/** Converts BBMap index coordinates to scaffold coordinates */
1992 	private final ScaffoldCoordinates coords=new ScaffoldCoordinates(), coords2=new ScaffoldCoordinates();
1993 
1994 	/** Mapping program name */
1995 	private String program=null;
1996 	/** Mapping program version */
1997 	private String version=null;
1998 
1999 	//Inputs
2000 	/** Primary input file (typically sam) */
2001 	public String in1=null;
2002 	/** Secondary input file (typically for coverage directly from BBMap) */
2003 	public String in2=null;
2004 	/** Optional, for calculating GC */
2005 	public String reference=null;
2006 	public String orffasta=null;
2007 
2008 	//Outputs
2009 	/** Stream unaltered sam input to this output */
2010 	public String outsam=null;
2011 	/** Coverage statistics, one line per scaffold */
2012 	public String covstats=null;
2013 	public String outorf=null;
2014 	/** Coverage histogram, one line per depth and one point per base */
2015 	public String histogram=null;
2016 	/** Coverage with one line per base */
2017 	public String basecov=null;
2018 	/** Coverage with one file per scaffold */
2019 	public String basecov_ps=null;
2020 	/** Coverage with one line per bin */
2021 	public String bincov=null;
2022 	/** Coverage with one line per bin, normalized by length and/or height */
2023 	public String normcov=null;
2024 	/** Coverage with one line per bin, normalized by length and/or height, for combined reference */
2025 	public String normcovOverall=null;
2026 	/** rpkm/fpkm output, similar to Seal */
2027 	public String outrpkm=null;
2028 
2029 	/** Typically indicates that a header line was encountered in an unexpected place, e.g. with concatenated sam files. */
2030 	private boolean error=false;
2031 
2032 	private boolean warned=false;
2033 	private final boolean EA=Shared.EA();
2034 
2035 	/** Total length of reference */
2036 	public long refBases=0;
2037 	public long mappedBases=0;
2038 	public long mappedNonClippedBases=0;
2039 	public long mappedBasesWithDels=0;
2040 	public long mappedReads=0;
2041 	public long properPairs=0;
2042 	public long readsProcessed=0;
2043 	public long basesProcessed=0;
2044 	public long kmersProcessed=0;
2045 	public long mappedKmers=0;
2046 	public double correctKmers=0;
2047 	public long totalCoveredBases1=0;
2048 	public long totalCoveredBases2=0;
2049 	public long scaffoldsWithCoverage1=0;
2050 	public long scaffoldsWithCoverage2=0;
2051 	public long totalScaffolds=0;
2052 
2053 	public int k=0;
2054 
2055 	//Don't reset these variables when clearing.
2056 	public long maxReads=-1;
2057 	public int initialScaffolds=4096;
2058 	public int binsize=1000;
2059 	public boolean bits32=false;
2060 	public int minMapq=0;
2061 	public boolean useStreamer=true;
2062 	public int streamerThreads=2;
2063 	public int minDepthToBeCovered=1;
2064 
2065 	private boolean qtrimLeft=false;
2066 	private boolean qtrimRight=false;
2067 	private float trimq=-1;
2068 	private final float trimE;
2069 	private int border=0;
2070 
2071 	/** Don't print coverage info for scaffolds shorter than this */
2072 	public int minscaf=0;
2073 
2074 	public HashMap<String, SamLine> pairTable=new HashMap<String, SamLine>();
2075 
2076 	public PrintStream outstream=System.err;
2077 
2078 	private boolean errorState=false;
2079 
2080 	/*--------------------------------------------------------------*/
2081 	/*----------------        Static Fields         ----------------*/
2082 	/*--------------------------------------------------------------*/
2083 
2084 
2085 	/** Permission to overwrite existing files */
2086 	public static boolean overwrite=false;
2087 	/** Permission to append to existing files */
2088 	public static boolean append=false;
2089 	/** Print verbose log messages */
2090 	public static boolean verbose=false;
2091 	/** Print the arguments to main */
2092 	public static boolean printCommand=true;
2093 
2094 	/** Print headers in output files */
2095 	public static boolean printHeader=true;
2096 	/** Prepend '#' symbol to header lines */
2097 	public static boolean headerPound=true;
2098 	/** Calculate standard deviation of coverage */
2099 	public static boolean calcCovStdev=true;
2100 
2101 	/** Window size to use when calculating average coverage,
2102 	 * for detecting contiguous low-coverage areas */
2103 	public static int LOW_COV_WINDOW=500;
2104 	/** Min average coverage to not be classified as low-depth */
2105 	public static double LOW_COV_DEPTH=5;
2106 	/** Print number of bases below a certain average coverage in a window */
2107 	public static boolean USE_WINDOW=false;
2108 
2109 	/** Track base composition of reads covering each scaffold */
2110 	public static boolean COUNT_GC=true;
2111 	/** Output in 2-column format ("#ID\tAvg_fold\n") */
2112 	public static boolean TWOCOLUMN=false;
2113 	/** Track coverage for strands independently */
2114 	public static boolean STRANDED=false;
2115 	/** Add scaffold information from the reference (in addition to sam header) */
2116 	public static boolean ADD_FROM_REF=true;
2117 	/** Add scaffold information from reads mapped to unknown scaffolds (in addition to sam header) */
2118 	public static boolean ADD_FROM_READS=false;
2119 	/** Only print scaffolds with nonzero coverage */
2120 	public static boolean NONZERO_ONLY=false;
2121 	/** Store coverage info in numeric arrays */
2122 	public static boolean USE_COVERAGE_ARRAYS=true;
2123 	/** Store coverage info in bitsets */
2124 	public static boolean USE_BITSETS=false;
2125 	/** Only print lines when coverage changes (for compatibility with Jigsaw) */
2126 	public static boolean DELTA_ONLY=false;
2127 	/** Process secondary alignments */
2128 	public static boolean USE_SECONDARY=true;
2129 	/** Include coverage of unsequenced middle portion of pairs */
2130 	public static boolean PHYSICAL_COVERAGE=false;
2131 	/** Use 'tlen' field when calculating physical coverage */
2132 	public static boolean USE_TLEN=true;
2133 	/** Abort on error; otherwise, errors may be ignored */
2134 	public static boolean ABORT_ON_ERROR=true;
2135 	/** Print coverage for the last bin of a scaffold, even if it is shorter than binsize */
2136 	public static boolean KEEP_SHORT_BINS=true;
2137 	/** Only track coverage for start location */
2138 	public static boolean START_ONLY=false;
2139 	/** Only track coverage for stop location */
2140 	public static boolean STOP_ONLY=false;
2141 	/** This appears to be the same as nonzeroonly... */
2142 	public static boolean CONCISE=false;
2143 	/** Normalize coverage by expression contig coverage as a fraction of its max coverage */
2144 	public static boolean NORMALIZE_COVERAGE=false;
2145 	/** Normalize contig length by binning into this many bins per contig */
2146 	public static int NORMALIZE_LENGTH_BINS=100;
2147 	/** Include soft-clipped bases in coverage */
2148 	public static boolean INCLUDE_SOFT_CLIP=false;
2149 	/** Include deletions/introns in coverage */
2150 	public static boolean INCLUDE_DELETIONS=true;
2151 	/** Include reads flagged as duplicates in coverage */
2152 	public static boolean INCLUDE_DUPLICATES=true;
2153 
2154 	public static boolean KEY_VALUE;
2155 
2156 	/** Translation array for tracking base counts */
2157 	private static final byte[] charToNum=AssemblyStats2.makeCharToNum();
2158 
2159 	private static final int NOTHING_MODE=0, BITSET_MODE=1, ARRAY_MODE=2;
2160 
2161 }
2162