1 package jgi;
2 
3 import java.io.File;
4 import java.io.PrintStream;
5 import java.util.ArrayList;
6 import java.util.concurrent.atomic.AtomicLong;
7 
8 import assemble.Tadpole;
9 import fileIO.ByteFile;
10 import fileIO.ReadWrite;
11 import kmer.AbstractKmerTableSet;
12 import kmer.KmerTableSet;
13 import shared.Parse;
14 import shared.Parser;
15 import shared.PreParser;
16 import shared.ReadStats;
17 import shared.Shared;
18 import shared.Timer;
19 import shared.Tools;
20 import stream.FastaReadInputStream;
21 import ukmer.KmerTableSetU;
22 
23 /**
24  * @author Brian Bushnell
25  * @date Nov 22, 2013
26  *
27  */
28 public class KmerFilterSetMaker {
29 
30 	/**
31 	 * Code entrance from the command line.
32 	 * @param args Command line arguments
33 	 */
main(String[] args)34 	public static void main(String[] args){
35 		Timer t=new Timer(), t2=new Timer();
36 		t.start();
37 		t2.start();
38 
39 		//Create a new CountKmersExact instance
40 		KmerFilterSetMaker x=new KmerFilterSetMaker(args);
41 		t2.stop();
42 //		outstream.println("Initialization Time:      \t"+t2);
43 
44 		///And run it
45 		x.process(t);
46 
47 		//Close the print stream if it was redirected
48 		Shared.closeStream(outstream);
49 	}
50 
51 	/**
52 	 * Constructor.
53 	 * @param args Command line arguments
54 	 */
KmerFilterSetMaker(String[] args)55 	public KmerFilterSetMaker(String[] args){
56 
57 		{//Preparse block for help, config files, and outstream
58 			PreParser pp=new PreParser(args, getClass(), false);
59 			args=pp.args;
60 			outstream=pp.outstream;
61 		}
62 		PreParser.silent=true;
63 
64 		/* Set global defaults */
65 		ReadWrite.ZIPLEVEL=2;
66 		ReadWrite.USE_UNPIGZ=true;
67 
68 		if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
69 			ByteFile.FORCE_MODE_BF2=true;
70 		}
71 
72 		/* Initialize local variables with defaults */
73 		boolean useForest_=false, useTable_=false, useArray_=true;
74 		Parser parser=new Parser();
75 
76 		/* Parse arguments */
77 		for(int i=0; i<args.length; i++){
78 
79 			final String arg=args[i];
80 			String[] split=arg.split("=");
81 			String a=split[0].toLowerCase();
82 			String b=split.length>1 ? split[1] : null;
83 
84 			if(Parser.parseCommonStatic(arg, a, b)){
85 				//do nothing
86 			}else if(Parser.parseZip(arg, a, b)){
87 				//do nothing
88 			}else if(Parser.parseQuality(arg, a, b)){
89 				//do nothing
90 			}else if(Parser.parseFasta(arg, a, b)){
91 				//do nothing
92 			}else if(parser.parseInterleaved(arg, a, b)){
93 				//do nothing
94 			}else if(parser.parseTrim(arg, a, b)){
95 				//do nothing
96 			}else if(a.equals("out") || a.equals("out1") || a.equals("outkmers") || a.equals("outk") || a.equals("dump")){
97 				kmerOutFile=b;
98 			}else if(a.equals("in") || a.equals("in1")){
99 				inFile=b;
100 			}else if(a.equals("initial") || a.equals("starting") || a.equals("initialset") || a.equals("startingset")){
101 				initialKmerFile=b;
102 			}else if(a.equals("temp") || a.equals("pattern")){
103 				outTemp=b;
104 				assert(outTemp.contains("#"));
105 			}else if(a.equals("mincount") || a.equals("min")){
106 				minCount=Integer.parseInt(b);
107 			}else if(a.equals("maxns")){
108 				maxNs=Integer.parseInt(b);
109 				AbstractKmerTableSet.maxNs=maxNs;
110 			}else if(a.equals("minlen")){
111 				minLen=Integer.parseInt(b);
112 				AbstractKmerTableSet.minLen=minLen;
113 			}else if(a.equals("passes") || a.equals("maxpasses")){
114 				maxPasses=Integer.parseInt(b);
115 			}else if(a.equals("minkmers") || a.equals("minkmersperpass") || a.equals("minkpp")){
116 				minKmersPerIteration=Integer.parseInt(b);
117 			}else if(a.equals("maxkmers") || a.equals("maxkmersperpass") || a.equals("maxkpp")){
118 				maxKmersPerIteration=Integer.parseInt(b);
119 				if(maxKmersPerIteration<1){
120 					maxKmersPerIteration=Integer.MAX_VALUE;
121 				}
122 			}
123 //			else if(a.equals("maxcounttodump") || a.equals("maxdump") || a.equals("maxcount")){
124 //				maxToDump=Integer.parseInt(b);
125 //			}
126 
127 			else if(a.equals("append") || a.equals("app")){
128 				append=ReadStats.append=Parse.parseBoolean(b);
129 			}else if(a.equals("overwrite") || a.equals("ow")){
130 				overwrite=Parse.parseBoolean(b);
131 			}else if(a.equals("threads") || a.equals("t")){
132 				THREADS=(b==null || b.equalsIgnoreCase("auto") ? Shared.threads() : Integer.parseInt(b));
133 			}else if(a.equals("verbose")){
134 				assert(false) : "Verbose flag is currently static final; must be recompiled to change.";
135 //				verbose=Parse.parseBoolean(b);
136 			}
137 
138 			else if(KmerTableSet.isValidArgument(a)){
139 				tableArgs.add(arg);
140 			}
141 
142 			else{
143 				throw new RuntimeException("Unknown parameter "+args[i]);
144 			}
145 		}
146 
147 		{//Process parser fields
148 			Parser.processQuality();
149 		}
150 
151 		if(outTemp==null){
152 			outTemp=makeTempFile("rtemp_#", ReadWrite.getExtension(inFile));
153 		}
154 
155 		/* Adjust I/O settings and filenames */
156 
157 		assert(FastaReadInputStream.settingsOK());
158 
159 		assert(kmerOutFile!=null) : "Kmer output file is required.";
160 		if(kmerOutFile!=null && !Tools.canWrite(kmerOutFile, overwrite)){throw new RuntimeException("Output file "+kmerOutFile+" already exists, and overwrite="+overwrite);}
161 
162 		assert(THREADS>0);
163 
164 		k=Tadpole.preparseK(args);
165 	}
166 
167 
168 	/*--------------------------------------------------------------*/
169 	/*----------------         Outer Methods        ----------------*/
170 	/*--------------------------------------------------------------*/
171 
172 
process(Timer t)173 	public void process(Timer t){
174 
175 		/* Check for output file collisions */
176 		Tools.testOutputFiles(overwrite, append, false, kmerOutFile);
177 
178 		/* Count kmers */
179 		process2();
180 
181 		/* Stop timer and calculate speed statistics */
182 		t.stop();
183 
184 		/* Throw an exception if errors were detected */
185 		if(errorState){
186 			throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
187 		}
188 	}
189 
190 
process2()191 	public void process2(){
192 
193 		/* Start phase timer */
194 		Timer t=new Timer();
195 
196 		AbstractKmerTableSet.DISPLAY_STATS=false;
197 
198 		runAllPasses(inFile, outTemp);
199 
200 		t.stop();
201 		outstream.println();
202 		outstream.println("Input:                      \t"+readsIn+" reads \t\t"+basesIn+" bases.");
203 		outstream.println("Output:                     \t"+kmersOut+" kmers.");
204 		outstream.println("Passes:                     \t"+numPasses);
205 
206 //		if(shave || rinse){
207 //			kmersRemoved=shave(shave, rinse, shaveDepth);
208 //		}
209 //
210 //		outstream.println("\nFor K="+tables.kbig());
211 //		outstream.println("Unique Kmers:               \t"+tables.kmersLoaded);
212 //		if(shave || rinse){
213 //			outstream.println("After Shaving:              \t"+(tables.kmersLoaded-kmersRemoved));
214 //		}
215 
216 		outstream.println();
217 
218 		outstream.println("Time:                       \t"+t);
219 	}
220 
runAllPasses(String initialInputFile, String tempPattern)221 	int runAllPasses(String initialInputFile, String tempPattern){
222 		//Handle initial input set
223 		if(initialKmerFile==null){
224 			AbstractKmerTableSet.overwrite=true;
225 			AbstractKmerTableSet.append=false;
226 		}else{
227 			AbstractKmerTableSet.overwrite=false;
228 			AbstractKmerTableSet.append=true;
229 
230 			boolean same=initialKmerFile.equals(kmerOutFile) || new File(initialKmerFile).equals(new File(kmerOutFile));
231 			ReformatReads x=new ReformatReads(new String[] {
232 					"in="+initialKmerFile, "out="+(same ? null : kmerOutFile), "ow", "silent"});
233 			x.process(new Timer());
234 			initialSetSize=x.readsProcessed;
235 		}
236 
237 		//Process primary input
238 		int maxCount=20000000; //Initial array size
239 		String in=initialInputFile;
240 		String lastOut=null;
241 		for(int pass=0; pass<maxPasses && maxCount>=minCount; pass++){
242 			String out=tempPattern.replace("#", ""+pass);
243 			maxCount=runOnePass(in, out, kmerOutFile, maxCount, pass);
244 			if(pass>0){new File(in).delete();}
245 			in=lastOut=out;
246 			AbstractKmerTableSet.overwrite=false;
247 			AbstractKmerTableSet.append=true;
248 		}
249 		if(lastOut!=null){new File(lastOut).delete();}//This does not actually need to be created most of the time
250 		return maxCount;
251 	}
252 
runOnePass(String inFile, String outFile, String kmerFile, int lastMaxSeen, int pass)253 	int runOnePass(String inFile, String outFile, String kmerFile, int lastMaxSeen, int pass){
254 		Timer t=new Timer();
255 
256 		@SuppressWarnings("unchecked")
257 		ArrayList<String> tableArgs2=(ArrayList<String>) tableArgs.clone();
258 		tableArgs2.add("k="+k);
259 		tableArgs2.add("in="+inFile);
260 //		tableArgs2.add("showstats=f");
261 //		tableArgs2.add("showprogress=f");
262 
263 		AbstractKmerTableSet.DISPLAY_STATS=AbstractKmerTableSet.DISPLAY_PROGRESS=false;
264 
265 		System.err.print("Pass "+pass+"  \t");
266 
267 		//Read input file
268 		AbstractKmerTableSet tables;
269 		if(k<=31){//TODO: 123 add "false" to the clause to force KmerTableSetU usage.
270 			tables=new KmerTableSet(tableArgs2.toArray(new String[0]), 12);
271 		}else{
272 			tables=new KmerTableSetU(tableArgs2.toArray(new String[0]), 0);
273 		}
274 		tables.process(t);
275 		System.err.print(tables.readsIn+" reads \t"+tables.kmersIn+" kmers \t");
276 		if(pass==0){
277 			readsIn+=tables.readsIn;
278 			basesIn+=tables.basesIn;
279 			kmersIn+=tables.kmersIn;
280 		}
281 		numPasses++;
282 
283 		//Summarize counts
284 		long[] counts=tables.fillHistogram(lastMaxSeen);
285 		int max=0;
286 		for(int i=counts.length-1; i>=1; i--){
287 			if(counts[i]>0){
288 				max=i;
289 				break;
290 			}
291 		}
292 		System.err.print(max+" max depth \t");
293 
294 		//Determine minimum count to retain
295 		int numGood=0;
296 		int minCountToKeep=-1;
297 		for(int i=max; i>=1 && numGood<minKmersPerIteration; i--){
298 			if(i==1 && numGood>0){break;}
299 			numGood+=counts[i];
300 			minCountToKeep=i;
301 		}
302 		System.err.print(numGood+" high kmers \t");
303 
304 		if(numGood<1){
305 			System.err.println(0+" retained");
306 			return -1;
307 		}
308 
309 		final int maxToKeep=(int)Tools.min(maxKmersPerIteration, tables.readsIn);
310 		final int numKept=Tools.min(maxToKeep, numGood);
311 //		final long outSize=initialSetSize+kmersOut+numKept;
312 //		System.err.println("\nmaxToKeep="+maxToKeep+", numKept="+numKept+", numGood="+numGood+
313 //				", maxKmersPerIteration="+maxKmersPerIteration+", tables.readsIn="+tables.readsIn);
314 
315 		//Append retained kmers to a file
316 		if(numKept>0){
317 			tables.dumpKmersAsBytes_MT(kmerFile, minCountToKeep, Integer.MAX_VALUE, false, new AtomicLong(numKept));
318 //			tables.dumpKmersAsBytes(kmerFile, minCountToKeep, Integer.MAX_VALUE, false, new AtomicLong(numKept));
319 		}
320 
321 		System.err.println(numKept+" retained");
322 
323 		kmersOut+=numKept;
324 
325 		//Filter to retain only unmatched reads
326 		if(minCountToKeep>=1 && tables.readsIn>=1){
327 			ArrayList<String> bbdukArgs=new ArrayList<String>();
328 			bbdukArgs.add("in="+inFile);
329 			bbdukArgs.add("outu="+outFile);
330 			bbdukArgs.add("ref="+kmerFile); //Technically this can only be the latest kmers, in case this file gets big.
331 			bbdukArgs.add("k="+k);
332 			bbdukArgs.add("mm=f");
333 			bbdukArgs.add("rcomp="+tables.rcomp());
334 			bbdukArgs.add("silent=t");
335 			if(maxNs>=0 && maxNs<Integer.MAX_VALUE){bbdukArgs.add("maxns="+maxNs);}
336 			if(minLen>0){bbdukArgs.add("minlen="+minLen);}
337 			bbdukArgs.add("ordered");
338 			BBDuk.main(bbdukArgs.toArray(new String[0]));
339 		}
340 
341 		return minCountToKeep;
342 	}
343 
344 	/*--------------------------------------------------------------*/
345 	/*----------------        Helper Methods        ----------------*/
346 	/*--------------------------------------------------------------*/
347 
348 	/*--------------------------------------------------------------*/
349 	/*----------------        Helper Classes        ----------------*/
350 	/*--------------------------------------------------------------*/
351 
352 	/*--------------------------------------------------------------*/
353 	/*----------------            Fields            ----------------*/
354 	/*--------------------------------------------------------------*/
355 
356 //	/** Hold kmers. */
357 //	private final AbstractKmerTableSet tables;
358 
359 //	private boolean shave=false;
360 //	private boolean rinse=false;
361 //	private int shaveDepth=1;
362 
363 	private ArrayList<String> tableArgs=new ArrayList<String>();
364 
365 	private long basesIn=0;
366 	private long readsIn=0;
367 
368 	private long kmersIn=0;
369 	private long kmersOut=0;
370 	private long numPasses;
371 	private long initialSetSize=0;
372 
373 	private int maxPasses=3000;
374 	private int minCount=1;
375 	private int minKmersPerIteration=1;
376 	private int maxKmersPerIteration=2;
377 	private int maxNs=Integer.MAX_VALUE;
378 	private int minLen=1;
379 
380 	/** Kmer count output file */
381 	private String kmerOutFile=null;
382 
383 	private String inFile=null;
384 	private String initialKmerFile=null;
385 	private String outTemp=null;
386 	private String tempKmerFile=makeTempFile("ktemp",".fa");
387 
388 	private boolean errorState=false;
389 
makeTempFile(String prefix, String ext)390 	private String makeTempFile(String prefix, String ext){
391 		if(!ext.startsWith(".")){ext="."+ext;}
392 //		try {
393 			return prefix+"_"+(System.nanoTime()&0xFFFFF)+"_"+((int)(10000000*(2+Math.random())))+ext;
394 //			return File.createTempFile("temp#", ".fq").getAbsolutePath();
395 //		} catch (IOException e) {
396 //			// TODO Auto-generated catch block
397 //			e.printStackTrace();
398 //		}
399 //		return null;
400 	}
401 
402 	/*--------------------------------------------------------------*/
403 	/*----------------       Final Primitives       ----------------*/
404 	/*--------------------------------------------------------------*/
405 
406 	final int k;
407 
408 	/*--------------------------------------------------------------*/
409 	/*----------------         Static Fields        ----------------*/
410 	/*--------------------------------------------------------------*/
411 
412 	/** Print messages to this stream */
413 	private static PrintStream outstream=System.err;
414 	/** Permission to overwrite existing files */
415 	public static boolean overwrite=false;
416 	/** Permission to append to existing files */
417 	public static boolean append=false;
418 	/** Display progress messages such as memory usage */
419 	public static boolean DISPLAY_PROGRESS=false;
420 	/** Verbose messages */
421 	public static final boolean verbose=false;
422 	/** Number of ProcessThreads */
423 	public static int THREADS=Shared.threads();
424 
425 }
426