1 package jgi; 2 3 import java.io.File; 4 import java.io.PrintStream; 5 import java.util.ArrayList; 6 import java.util.concurrent.atomic.AtomicLong; 7 8 import assemble.Tadpole; 9 import fileIO.ByteFile; 10 import fileIO.ReadWrite; 11 import kmer.AbstractKmerTableSet; 12 import kmer.KmerTableSet; 13 import shared.Parse; 14 import shared.Parser; 15 import shared.PreParser; 16 import shared.ReadStats; 17 import shared.Shared; 18 import shared.Timer; 19 import shared.Tools; 20 import stream.FastaReadInputStream; 21 import ukmer.KmerTableSetU; 22 23 /** 24 * @author Brian Bushnell 25 * @date Nov 22, 2013 26 * 27 */ 28 public class KmerFilterSetMaker { 29 30 /** 31 * Code entrance from the command line. 32 * @param args Command line arguments 33 */ main(String[] args)34 public static void main(String[] args){ 35 Timer t=new Timer(), t2=new Timer(); 36 t.start(); 37 t2.start(); 38 39 //Create a new CountKmersExact instance 40 KmerFilterSetMaker x=new KmerFilterSetMaker(args); 41 t2.stop(); 42 // outstream.println("Initialization Time: \t"+t2); 43 44 ///And run it 45 x.process(t); 46 47 //Close the print stream if it was redirected 48 Shared.closeStream(outstream); 49 } 50 51 /** 52 * Constructor. 53 * @param args Command line arguments 54 */ KmerFilterSetMaker(String[] args)55 public KmerFilterSetMaker(String[] args){ 56 57 {//Preparse block for help, config files, and outstream 58 PreParser pp=new PreParser(args, getClass(), false); 59 args=pp.args; 60 outstream=pp.outstream; 61 } 62 PreParser.silent=true; 63 64 /* Set global defaults */ 65 ReadWrite.ZIPLEVEL=2; 66 ReadWrite.USE_UNPIGZ=true; 67 68 if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){ 69 ByteFile.FORCE_MODE_BF2=true; 70 } 71 72 /* Initialize local variables with defaults */ 73 boolean useForest_=false, useTable_=false, useArray_=true; 74 Parser parser=new Parser(); 75 76 /* Parse arguments */ 77 for(int i=0; i<args.length; i++){ 78 79 final String arg=args[i]; 80 String[] split=arg.split("="); 81 String a=split[0].toLowerCase(); 82 String b=split.length>1 ? split[1] : null; 83 84 if(Parser.parseCommonStatic(arg, a, b)){ 85 //do nothing 86 }else if(Parser.parseZip(arg, a, b)){ 87 //do nothing 88 }else if(Parser.parseQuality(arg, a, b)){ 89 //do nothing 90 }else if(Parser.parseFasta(arg, a, b)){ 91 //do nothing 92 }else if(parser.parseInterleaved(arg, a, b)){ 93 //do nothing 94 }else if(parser.parseTrim(arg, a, b)){ 95 //do nothing 96 }else if(a.equals("out") || a.equals("out1") || a.equals("outkmers") || a.equals("outk") || a.equals("dump")){ 97 kmerOutFile=b; 98 }else if(a.equals("in") || a.equals("in1")){ 99 inFile=b; 100 }else if(a.equals("initial") || a.equals("starting") || a.equals("initialset") || a.equals("startingset")){ 101 initialKmerFile=b; 102 }else if(a.equals("temp") || a.equals("pattern")){ 103 outTemp=b; 104 assert(outTemp.contains("#")); 105 }else if(a.equals("mincount") || a.equals("min")){ 106 minCount=Integer.parseInt(b); 107 }else if(a.equals("maxns")){ 108 maxNs=Integer.parseInt(b); 109 AbstractKmerTableSet.maxNs=maxNs; 110 }else if(a.equals("minlen")){ 111 minLen=Integer.parseInt(b); 112 AbstractKmerTableSet.minLen=minLen; 113 }else if(a.equals("passes") || a.equals("maxpasses")){ 114 maxPasses=Integer.parseInt(b); 115 }else if(a.equals("minkmers") || a.equals("minkmersperpass") || a.equals("minkpp")){ 116 minKmersPerIteration=Integer.parseInt(b); 117 }else if(a.equals("maxkmers") || a.equals("maxkmersperpass") || a.equals("maxkpp")){ 118 maxKmersPerIteration=Integer.parseInt(b); 119 if(maxKmersPerIteration<1){ 120 maxKmersPerIteration=Integer.MAX_VALUE; 121 } 122 } 123 // else if(a.equals("maxcounttodump") || a.equals("maxdump") || a.equals("maxcount")){ 124 // maxToDump=Integer.parseInt(b); 125 // } 126 127 else if(a.equals("append") || a.equals("app")){ 128 append=ReadStats.append=Parse.parseBoolean(b); 129 }else if(a.equals("overwrite") || a.equals("ow")){ 130 overwrite=Parse.parseBoolean(b); 131 }else if(a.equals("threads") || a.equals("t")){ 132 THREADS=(b==null || b.equalsIgnoreCase("auto") ? Shared.threads() : Integer.parseInt(b)); 133 }else if(a.equals("verbose")){ 134 assert(false) : "Verbose flag is currently static final; must be recompiled to change."; 135 // verbose=Parse.parseBoolean(b); 136 } 137 138 else if(KmerTableSet.isValidArgument(a)){ 139 tableArgs.add(arg); 140 } 141 142 else{ 143 throw new RuntimeException("Unknown parameter "+args[i]); 144 } 145 } 146 147 {//Process parser fields 148 Parser.processQuality(); 149 } 150 151 if(outTemp==null){ 152 outTemp=makeTempFile("rtemp_#", ReadWrite.getExtension(inFile)); 153 } 154 155 /* Adjust I/O settings and filenames */ 156 157 assert(FastaReadInputStream.settingsOK()); 158 159 assert(kmerOutFile!=null) : "Kmer output file is required."; 160 if(kmerOutFile!=null && !Tools.canWrite(kmerOutFile, overwrite)){throw new RuntimeException("Output file "+kmerOutFile+" already exists, and overwrite="+overwrite);} 161 162 assert(THREADS>0); 163 164 k=Tadpole.preparseK(args); 165 } 166 167 168 /*--------------------------------------------------------------*/ 169 /*---------------- Outer Methods ----------------*/ 170 /*--------------------------------------------------------------*/ 171 172 process(Timer t)173 public void process(Timer t){ 174 175 /* Check for output file collisions */ 176 Tools.testOutputFiles(overwrite, append, false, kmerOutFile); 177 178 /* Count kmers */ 179 process2(); 180 181 /* Stop timer and calculate speed statistics */ 182 t.stop(); 183 184 /* Throw an exception if errors were detected */ 185 if(errorState){ 186 throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt."); 187 } 188 } 189 190 process2()191 public void process2(){ 192 193 /* Start phase timer */ 194 Timer t=new Timer(); 195 196 AbstractKmerTableSet.DISPLAY_STATS=false; 197 198 runAllPasses(inFile, outTemp); 199 200 t.stop(); 201 outstream.println(); 202 outstream.println("Input: \t"+readsIn+" reads \t\t"+basesIn+" bases."); 203 outstream.println("Output: \t"+kmersOut+" kmers."); 204 outstream.println("Passes: \t"+numPasses); 205 206 // if(shave || rinse){ 207 // kmersRemoved=shave(shave, rinse, shaveDepth); 208 // } 209 // 210 // outstream.println("\nFor K="+tables.kbig()); 211 // outstream.println("Unique Kmers: \t"+tables.kmersLoaded); 212 // if(shave || rinse){ 213 // outstream.println("After Shaving: \t"+(tables.kmersLoaded-kmersRemoved)); 214 // } 215 216 outstream.println(); 217 218 outstream.println("Time: \t"+t); 219 } 220 runAllPasses(String initialInputFile, String tempPattern)221 int runAllPasses(String initialInputFile, String tempPattern){ 222 //Handle initial input set 223 if(initialKmerFile==null){ 224 AbstractKmerTableSet.overwrite=true; 225 AbstractKmerTableSet.append=false; 226 }else{ 227 AbstractKmerTableSet.overwrite=false; 228 AbstractKmerTableSet.append=true; 229 230 boolean same=initialKmerFile.equals(kmerOutFile) || new File(initialKmerFile).equals(new File(kmerOutFile)); 231 ReformatReads x=new ReformatReads(new String[] { 232 "in="+initialKmerFile, "out="+(same ? null : kmerOutFile), "ow", "silent"}); 233 x.process(new Timer()); 234 initialSetSize=x.readsProcessed; 235 } 236 237 //Process primary input 238 int maxCount=20000000; //Initial array size 239 String in=initialInputFile; 240 String lastOut=null; 241 for(int pass=0; pass<maxPasses && maxCount>=minCount; pass++){ 242 String out=tempPattern.replace("#", ""+pass); 243 maxCount=runOnePass(in, out, kmerOutFile, maxCount, pass); 244 if(pass>0){new File(in).delete();} 245 in=lastOut=out; 246 AbstractKmerTableSet.overwrite=false; 247 AbstractKmerTableSet.append=true; 248 } 249 if(lastOut!=null){new File(lastOut).delete();}//This does not actually need to be created most of the time 250 return maxCount; 251 } 252 runOnePass(String inFile, String outFile, String kmerFile, int lastMaxSeen, int pass)253 int runOnePass(String inFile, String outFile, String kmerFile, int lastMaxSeen, int pass){ 254 Timer t=new Timer(); 255 256 @SuppressWarnings("unchecked") 257 ArrayList<String> tableArgs2=(ArrayList<String>) tableArgs.clone(); 258 tableArgs2.add("k="+k); 259 tableArgs2.add("in="+inFile); 260 // tableArgs2.add("showstats=f"); 261 // tableArgs2.add("showprogress=f"); 262 263 AbstractKmerTableSet.DISPLAY_STATS=AbstractKmerTableSet.DISPLAY_PROGRESS=false; 264 265 System.err.print("Pass "+pass+" \t"); 266 267 //Read input file 268 AbstractKmerTableSet tables; 269 if(k<=31){//TODO: 123 add "false" to the clause to force KmerTableSetU usage. 270 tables=new KmerTableSet(tableArgs2.toArray(new String[0]), 12); 271 }else{ 272 tables=new KmerTableSetU(tableArgs2.toArray(new String[0]), 0); 273 } 274 tables.process(t); 275 System.err.print(tables.readsIn+" reads \t"+tables.kmersIn+" kmers \t"); 276 if(pass==0){ 277 readsIn+=tables.readsIn; 278 basesIn+=tables.basesIn; 279 kmersIn+=tables.kmersIn; 280 } 281 numPasses++; 282 283 //Summarize counts 284 long[] counts=tables.fillHistogram(lastMaxSeen); 285 int max=0; 286 for(int i=counts.length-1; i>=1; i--){ 287 if(counts[i]>0){ 288 max=i; 289 break; 290 } 291 } 292 System.err.print(max+" max depth \t"); 293 294 //Determine minimum count to retain 295 int numGood=0; 296 int minCountToKeep=-1; 297 for(int i=max; i>=1 && numGood<minKmersPerIteration; i--){ 298 if(i==1 && numGood>0){break;} 299 numGood+=counts[i]; 300 minCountToKeep=i; 301 } 302 System.err.print(numGood+" high kmers \t"); 303 304 if(numGood<1){ 305 System.err.println(0+" retained"); 306 return -1; 307 } 308 309 final int maxToKeep=(int)Tools.min(maxKmersPerIteration, tables.readsIn); 310 final int numKept=Tools.min(maxToKeep, numGood); 311 // final long outSize=initialSetSize+kmersOut+numKept; 312 // System.err.println("\nmaxToKeep="+maxToKeep+", numKept="+numKept+", numGood="+numGood+ 313 // ", maxKmersPerIteration="+maxKmersPerIteration+", tables.readsIn="+tables.readsIn); 314 315 //Append retained kmers to a file 316 if(numKept>0){ 317 tables.dumpKmersAsBytes_MT(kmerFile, minCountToKeep, Integer.MAX_VALUE, false, new AtomicLong(numKept)); 318 // tables.dumpKmersAsBytes(kmerFile, minCountToKeep, Integer.MAX_VALUE, false, new AtomicLong(numKept)); 319 } 320 321 System.err.println(numKept+" retained"); 322 323 kmersOut+=numKept; 324 325 //Filter to retain only unmatched reads 326 if(minCountToKeep>=1 && tables.readsIn>=1){ 327 ArrayList<String> bbdukArgs=new ArrayList<String>(); 328 bbdukArgs.add("in="+inFile); 329 bbdukArgs.add("outu="+outFile); 330 bbdukArgs.add("ref="+kmerFile); //Technically this can only be the latest kmers, in case this file gets big. 331 bbdukArgs.add("k="+k); 332 bbdukArgs.add("mm=f"); 333 bbdukArgs.add("rcomp="+tables.rcomp()); 334 bbdukArgs.add("silent=t"); 335 if(maxNs>=0 && maxNs<Integer.MAX_VALUE){bbdukArgs.add("maxns="+maxNs);} 336 if(minLen>0){bbdukArgs.add("minlen="+minLen);} 337 bbdukArgs.add("ordered"); 338 BBDuk.main(bbdukArgs.toArray(new String[0])); 339 } 340 341 return minCountToKeep; 342 } 343 344 /*--------------------------------------------------------------*/ 345 /*---------------- Helper Methods ----------------*/ 346 /*--------------------------------------------------------------*/ 347 348 /*--------------------------------------------------------------*/ 349 /*---------------- Helper Classes ----------------*/ 350 /*--------------------------------------------------------------*/ 351 352 /*--------------------------------------------------------------*/ 353 /*---------------- Fields ----------------*/ 354 /*--------------------------------------------------------------*/ 355 356 // /** Hold kmers. */ 357 // private final AbstractKmerTableSet tables; 358 359 // private boolean shave=false; 360 // private boolean rinse=false; 361 // private int shaveDepth=1; 362 363 private ArrayList<String> tableArgs=new ArrayList<String>(); 364 365 private long basesIn=0; 366 private long readsIn=0; 367 368 private long kmersIn=0; 369 private long kmersOut=0; 370 private long numPasses; 371 private long initialSetSize=0; 372 373 private int maxPasses=3000; 374 private int minCount=1; 375 private int minKmersPerIteration=1; 376 private int maxKmersPerIteration=2; 377 private int maxNs=Integer.MAX_VALUE; 378 private int minLen=1; 379 380 /** Kmer count output file */ 381 private String kmerOutFile=null; 382 383 private String inFile=null; 384 private String initialKmerFile=null; 385 private String outTemp=null; 386 private String tempKmerFile=makeTempFile("ktemp",".fa"); 387 388 private boolean errorState=false; 389 makeTempFile(String prefix, String ext)390 private String makeTempFile(String prefix, String ext){ 391 if(!ext.startsWith(".")){ext="."+ext;} 392 // try { 393 return prefix+"_"+(System.nanoTime()&0xFFFFF)+"_"+((int)(10000000*(2+Math.random())))+ext; 394 // return File.createTempFile("temp#", ".fq").getAbsolutePath(); 395 // } catch (IOException e) { 396 // // TODO Auto-generated catch block 397 // e.printStackTrace(); 398 // } 399 // return null; 400 } 401 402 /*--------------------------------------------------------------*/ 403 /*---------------- Final Primitives ----------------*/ 404 /*--------------------------------------------------------------*/ 405 406 final int k; 407 408 /*--------------------------------------------------------------*/ 409 /*---------------- Static Fields ----------------*/ 410 /*--------------------------------------------------------------*/ 411 412 /** Print messages to this stream */ 413 private static PrintStream outstream=System.err; 414 /** Permission to overwrite existing files */ 415 public static boolean overwrite=false; 416 /** Permission to append to existing files */ 417 public static boolean append=false; 418 /** Display progress messages such as memory usage */ 419 public static boolean DISPLAY_PROGRESS=false; 420 /** Verbose messages */ 421 public static final boolean verbose=false; 422 /** Number of ProcessThreads */ 423 public static int THREADS=Shared.threads(); 424 425 } 426