package tax;

import java.io.File;
import java.io.PrintStream;
import java.util.LinkedHashMap;
import java.util.Locale;
import java.util.Map;

import fileIO.ByteFile;
import fileIO.ByteStreamWriter;
import fileIO.FileFormat;
import fileIO.ReadWrite;
import fileIO.TextStreamWriter;
import shared.Parse;
import shared.Parser;
import shared.PreParser;
import shared.Shared;
import shared.Timer;
import shared.Tools;
import stream.FastaReadInputStream;

/**
 * Constructs a directory and file tree of sequences
 * corresponding to a taxonomic tree.
 *
 * <p>Each header line (starting with '>') of the input is resolved to a
 * {@link TaxNode} via the loaded {@link TaxTree}; the record is then written
 * to {@code <dir><prefix><node id>.fa.gz}, where {@code <dir>} is whatever
 * {@code TaxTree.toDir(node, outPath)} returns for that node.
 *
 * @author Brian Bushnell
 * @date December 12, 2017
 *
 */
public class ExplodeTree {

    /*--------------------------------------------------------------*/
    /*----------------        Initialization        ----------------*/
    /*--------------------------------------------------------------*/

    /**
     * Code entrance from the command line.
     * @param args Command line arguments
     */
    public static void main(String[] args){
        Timer t=new Timer();
        ExplodeTree x=new ExplodeTree(args);
        x.process(t);

        //Close the print stream if it was redirected
        Shared.closeStream(x.outstream);
    }

    /**
     * Constructor.  Parses the arguments, validates the input and output
     * files, and loads the taxonomic tree.
     * @param args Command line arguments
     * @throws RuntimeException if no input file was given, an input file is
     * unreadable, the results file is not writable, or a file name was
     * specified more than once
     */
    public ExplodeTree(String[] args){

        {//Preparse block for help, config files, and outstream
            PreParser pp=new PreParser(args, getClass(), false);
            args=pp.args;
            outstream=pp.outstream;
        }

        //Set shared static variables
        ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
        ReadWrite.MAX_ZIP_THREADS=Shared.threads();

        //Create a parser object
        Parser parser=new Parser();

        //Parse each argument
        for(int i=0; i<args.length; i++){
            String arg=args[i];

            //Break arguments into their constituent parts, in the form of "a=b"
            String[] split=arg.split("=");
            //Locale.ROOT keeps key matching independent of the user's locale
            //(e.g. the Turkish dotless i would otherwise break "mkdir").
            String a=split[0].toLowerCase(Locale.ROOT);
            String b=split.length>1 ? split[1] : null;

            if(a.equals("verbose")){
                verbose=Parse.parseBoolean(b);
            }else if(a.equals("out") || a.equals("path") || a.equals("outpath")){
                outPath=b;
            }else if(a.equals("prefix")){
                prefix=b;
            }else if(a.equals("results") || a.equals("result")){
                resultsFile=b;
            }else if(a.equals("makedirectories") || a.equals("mkdirs") || a.equals("mkdir")){
                makeDirectories=Parse.parseBoolean(b);
            }else if(a.equals("tree") || a.equals("taxtree")){
                taxTreeFile=b;
            }else if(parser.parse(arg, a, b)){//Parse standard flags in the parser
                //do nothing
            }else{
                outstream.println("Unknown parameter "+args[i]);
                assert(false) : "Unknown parameter "+args[i];
            }
        }
        if(prefix==null){prefix="";}
        if("auto".equalsIgnoreCase(taxTreeFile)){taxTreeFile=TaxTree.defaultTreeFile();}

        {//Process parser fields
            Parser.processQuality();

            maxReads=parser.maxReads;

            overwrite=parser.overwrite;

            in1=parser.in1;

            extin=parser.extin;
        }

        //Normalize the output root: forward slashes only, no repeated
        //slashes, and a trailing slash so file names can be appended directly.
        if(outPath==null || outPath.trim().length()==0){
            outPath="";
        }else{
            outPath=outPath.trim().replace('\\', '/').replaceAll("/+", "/");
            if(!outPath.endsWith("/")){outPath=outPath+"/";}
        }

        assert(FastaReadInputStream.settingsOK());

        //Ensure there is an input file
        if(in1==null){throw new RuntimeException("Error - at least one input file is required.");}

        //Adjust the number of threads for input file reading
        if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
            ByteFile.FORCE_MODE_BF2=true;
        }

        //Ensure output files can be written
        if(!Tools.testOutputFiles(overwrite, false, false, resultsFile)){
            outstream.println(resultsFile);
            throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+resultsFile+"\n");
        }

        //Ensure input files can be read
        if(!Tools.testInputFiles(false, true, in1)){
            throw new RuntimeException("\nCan't read some input files.\n");
        }

        //Ensure that no file was specified multiple times
        if(!Tools.testForDuplicateFiles(true, in1, resultsFile)){
            throw new RuntimeException("\nSome file names were specified multiple times.\n");
        }

        //Create input FileFormat objects
        ffin1=FileFormat.testInput(in1, FileFormat.FASTA, extin, true, true);

        tree=TaxTree.loadTaxTree(taxTreeFile, outstream, true, false);
    }

    /*--------------------------------------------------------------*/
    /*----------------         Outer Methods        ----------------*/
    /*--------------------------------------------------------------*/

    /**
     * Creates one directory per non-null node of the tree, using the path
     * returned by {@code tree.toDir(node, root)}.
     * @param root Root path passed to {@code TaxTree.toDir}
     * @param writeNames If true, also write a {@code <simpleName>.name} file
     * containing the node's name into each directory, unless that file
     * already exists
     */
    public void makeDirectoryTree(String root, boolean writeNames){
        for(TaxNode node : tree.nodes){
            if(node==null){continue;}

            final String dir=tree.toDir(node, root);
            final File df=new File(dir);
            if(!df.exists()){df.mkdirs();}

            if(writeNames){
                try {
                    final String fname=node.simpleName()+".name";
                    //Check the same path that is written below; a bare fname
                    //would be resolved against the working directory instead.
                    final File nf=new File(dir+fname);
                    if(!nf.exists()){
                        ReadWrite.writeString(node.name, dir+fname);
                    }
                } catch (Exception e) {
                    //Name files are written best-effort; a failure here is
                    //reported but does not abort directory creation.
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Optionally builds the directory tree, splits the input into per-node
     * files, writes the results file (if requested), and prints a summary.
     * @param t Timer used for the overall timing report; it is stopped here
     * @throws RuntimeException if any writer reported an error
     */
    public void process(Timer t){

        Timer t2=new Timer();
        if(makeDirectories){
            makeDirectoryTree(outPath, true);
            t2.stop("Finished making directories. ");
            t2.start();
        }
        processInner();
        t2.stop("Finished writing data. ");

        //Do anything necessary after processing
        if(resultsFile!=null){writeResults();}

        //Report timing and results
        printSummary(t);

        //Throw an exception if there was an error in a thread
        if(errorState){
            throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
        }
    }

    /**
     * Writes one tab-delimited line per encountered node to resultsFile:
     * node id, accumulated size, extended level string, and name.
     */
    private void writeResults(){
        TextStreamWriter tsw=new TextStreamWriter(resultsFile, overwrite, false, false);
        tsw.start();
        for(Map.Entry<TaxNode, Long> entry : nodes.entrySet()){
            final TaxNode tn=entry.getKey();
            Long data=entry.getValue();
            if(data==null){data=0L;}
            tsw.println(tn.id+"\t"+data+"\t"+tn.levelStringExtended(false)+"\t"+tn.name);
        }
        errorState|=tsw.poisonAndWait();
    }

    /**
     * Stops the timer and prints input/output counts and throughput
     * to outstream.
     * @param t Timer covering the whole run
     */
    private void printSummary(Timer t){
        t.stop();

        //Calculate units per nanosecond
        double rpnano=readsProcessed/(double)(t.elapsed);
        double lpnano=linesProcessed/(double)(t.elapsed);
        double bpnano=basesProcessed/(double)(t.elapsed);

        //Add "k" and "m" for large numbers
        String rpstring=Tools.padKM(readsProcessed, 8);
        String lpstring=Tools.padKM(linesProcessed, 8);
        String bpstring=Tools.padKM(basesProcessed, 8);

        //Pad the "out" strings so they are at least as long as the "in" strings
        String li="Lines In: \t"+linesProcessed+" lines";
        String lo="Lines Out: \t"+linesOut+" lines";
        while(lo.length()<li.length()){lo=lo+" ";}

        String ri="Reads In: \t"+readsProcessed+" reads";
        String ro="Reads Out: \t"+readsOut+" reads";
        while(ro.length()<ri.length()){ro=ro+" ";}

        outstream.println(ri+"\t"+basesProcessed+" bases");
        outstream.println(ro+"\t"+basesOut+" bases");
        outstream.println(li);
        outstream.println(lo);
        outstream.println();

        outstream.println("Time: \t"+t);
        outstream.println("Reads Processed: "+rpstring+" \t"+String.format(Locale.ROOT, "%.2fk reads/sec", rpnano*1000000));
        outstream.println("Lines Processed: "+lpstring+" \t"+String.format(Locale.ROOT, "%.2fk lines/sec", lpnano*1000000));
        outstream.println("Bases Processed: "+bpstring+" \t"+String.format(Locale.ROOT, "%.2fm bases/sec", bpnano*1000));
    }

    /*--------------------------------------------------------------*/
    /*----------------         Inner Methods        ----------------*/
    /*--------------------------------------------------------------*/

    /**
     * Iterates through the lines of the input file, routing each record to
     * the output file of the node parsed from its header.  Records whose
     * header does not resolve to a node are counted as input but not written.
     * Empty lines are counted in linesProcessed and otherwise ignored.
     */
    void processInner(){
        ByteFile bf=ByteFile.makeByteFile(ffin1);
        TaxNode currentNode=null;//Node of the record currently being read
        long currentSize=0;//Sum of non-header line lengths of the current record
        ByteStreamWriter bsw=null;//Open writer for currentNode; null when currentNode is null
        for(byte[] line=bf.nextLine(); line!=null; line=bf.nextLine()){
            linesProcessed++;
            if(line.length>0){
                final boolean header=(line[0]=='>');
                if(header){
                    if(maxReads>0 && readsProcessed>=maxReads){break;}
                    readsProcessed++;

                    //Credit the record that just ended to its node
                    if(currentNode!=null){nodes.put(currentNode, nodes.get(currentNode)+currentSize);}

                    final TaxNode tn=tree.parseNodeFromHeader(new String(line, 1, line.length-1), false);

                    //Close the current writer unless this record goes to the same node
                    if(tn==null || tn!=currentNode){
                        if(bsw!=null){errorState=bsw.poisonAndWait()|errorState; bsw=null;}
                    }
                    if(tn!=null && tn!=currentNode){
                        String dir=tree.toDir(tn, outPath);
                        final boolean found=nodes.containsKey(tn);
                        if(!found){nodes.put(tn, 0L);}
                        //NOTE(review): the 5th and 6th arguments look like overwrite and
                        //append - i.e. a node already seen in this run has its file
                        //reopened in append mode - confirm against FileFormat.testOutput.
                        FileFormat ff=FileFormat.testOutput(dir+prefix+tn.id+".fa.gz", FileFormat.FASTA, null, true, overwrite && !found, found, false);
                        bsw=new ByteStreamWriter(ff);
                        bsw.start();
                    }

                    currentNode=tn;
                    currentSize=0;
                    if(bsw!=null){readsOut++;}
                }else{
                    basesProcessed+=line.length;
                    currentSize+=line.length;
                }
                if(bsw!=null){
                    linesOut++;
                    if(!header){basesOut+=line.length;}
                    bsw.println(line);
                }
            }
        }

        //Close the last writer and credit the final record
        if(bsw!=null){
            errorState=bsw.poisonAndWait()|errorState; bsw=null;
            if(currentNode!=null){nodes.put(currentNode, nodes.get(currentNode)+currentSize);}
        }
        bf.close();
    }

    /*--------------------------------------------------------------*/
    /*----------------            Fields            ----------------*/
    /*--------------------------------------------------------------*/

    /** Primary input file path */
    private String in1=null;

    /**
     * Root output path.  After construction this is either empty or
     * uses '/' separators and ends with '/'.
     */
    private String outPath=null;

    /** Prepended to each output file name; never null after construction */
    private String prefix;

    /** Override input file extension */
    private String extin=null;

    /** For listing what is present in the output */
    public String resultsFile=null;

    /** Path of the taxonomic tree file; "auto" selects TaxTree.defaultTreeFile() */
    public String taxTreeFile=null;

    /** If true, create the full directory tree before writing data */
    public boolean makeDirectories=true;

    /**
     * Nodes encountered in the input, in order of first appearance, mapped
     * to the summed length of their non-header lines.
     */
    public LinkedHashMap<TaxNode, Long> nodes=new LinkedHashMap<TaxNode, Long>();

    /*--------------------------------------------------------------*/

    /** Number of reads processed */
    protected long readsProcessed=0;
    /** Number of lines processed */
    protected long linesProcessed=0;
    /** Number of bases processed */
    protected long basesProcessed=0;

    /** Number of reads out */
    public long readsOut=0;
    /** Number of lines out */
    public long linesOut=0;
    /** Number of bases out */
    public long basesOut=0;

    /** Quit after processing this many input reads; -1 means no limit */
    private long maxReads=-1;

    /*--------------------------------------------------------------*/
    /*----------------         Final Fields         ----------------*/
    /*--------------------------------------------------------------*/

    /** Primary input file */
    private final FileFormat ffin1;

    /** Taxonomic tree used to resolve headers and directory paths */
    private final TaxTree tree;

    /*--------------------------------------------------------------*/
    /*----------------        Common Fields         ----------------*/
    /*--------------------------------------------------------------*/

    /** Print status messages to this output stream */
    private PrintStream outstream=System.err;
    /** Print verbose messages */
    public static boolean verbose=false;
    /** True if an error was encountered */
    public boolean errorState=false;
    /** Overwrite existing output files */
    private boolean overwrite=true;

}