1 package sketch;
2 
3 import java.io.PrintStream;
4 import java.util.ArrayList;
5 import java.util.Arrays;
6 
7 import fileIO.ByteFile;
8 import fileIO.ByteStreamWriter;
9 import fileIO.FileFormat;
10 import fileIO.ReadWrite;
11 import shared.Parse;
12 import shared.Parser;
13 import shared.PreParser;
14 import shared.Shared;
15 import shared.Timer;
16 import shared.Tools;
17 import structures.ByteBuilder;
18 import tax.TaxTree;
19 
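/**
 * Reads a sketch file and rewrites it with 16S and/or 18S SSU sequences
 * attached to the sketch headers.  Sequences are looked up by TaxID in
 * SSUMap; flags control whether map sequences replace ones already present
 * in the input, and whether 16S/18S entries are cleared (optionally only
 * for eukaryotes or prokaryotes, which requires a TaxTree).
 *
 * @author Brian Bushnell
 * @date May 9, 2016
 *
 */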
25 public class AddSSU {
26 
27 	/*--------------------------------------------------------------*/
28 	/*----------------        Initialization        ----------------*/
29 	/*--------------------------------------------------------------*/
30 
31 	/**
32 	 * Code entrance from the command line.
33 	 * @param args Command line arguments
34 	 */
main(String[] args)35 	public static void main(String[] args){
36 		//Start a timer immediately upon code entrance.
37 		Timer t=new Timer();
38 
39 		//Create an instance of this class
40 		AddSSU x=new AddSSU(args);
41 
42 		//Run the object
43 		x.process(t);
44 
45 		//Close the print stream if it was redirected
46 		Shared.closeStream(x.outstream);
47 	}
48 
49 	/**
50 	 * Constructor.
51 	 * @param args Command line arguments
52 	 */
AddSSU(String[] args)53 	public AddSSU(String[] args){
54 
55 		{//Preparse block for help, config files, and outstream
56 			PreParser pp=new PreParser(args, /*getClass()*/null, false);
57 			args=pp.args;
58 			outstream=pp.outstream;
59 		}
60 
61 		//Set shared static variables prior to parsing
62 		ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
63 		ReadWrite.MAX_ZIP_THREADS=Shared.threads();
64 
65 		{//Parse the arguments
66 			final Parser parser=parse(args);
67 			overwrite=parser.overwrite;
68 			append=parser.append;
69 
70 			in1=parser.in1;
71 
72 			out1=parser.out1;
73 		}
74 
75 		fixExtensions(); //Add or remove .gz or .bz2 as needed
76 		checkFileExistence(); //Ensure files can be read and written
77 		checkStatics(); //Adjust file-related static fields as needed for this program
78 
79 		ffout1=FileFormat.testOutput(out1, FileFormat.SKETCH, null, true, overwrite, append, false);
80 		ffin1=FileFormat.testInput(in1, FileFormat.SKETCH, null, true, false);
81 
82 		if(verbose){
83 			System.err.println("Set r16SFile="+r16SFile);
84 			System.err.println("Set r18SFile="+r18SFile);
85 		}
86 
87 		tree=(treeFile!=null && (preferSSUMapEuks || preferSSUMapProks || clear16SEuks || clear18SEuks ||
88 				clear16SProks || clear18SProks || useSSUMapOnlyEuks || useSSUMapOnlyProks) ? TaxTree.loadTaxTree(treeFile, outstream, false, false) : null);
89 
90 		if(preferSSUMapEuks || preferSSUMapProks || clear16SEuks || clear18SEuks || clear16SProks || clear18SProks || useSSUMapOnlyEuks || useSSUMapOnlyProks){
91 			assert(tree!=null) : "preferSSUMapForEuks, clear16SEuks, and clear18SEuks require a TaxTree.";
92 		}
93 	}
94 
95 	/*--------------------------------------------------------------*/
96 	/*----------------    Initialization Helpers    ----------------*/
97 	/*--------------------------------------------------------------*/
98 
99 	/** Parse arguments from the command line */
parse(String[] args)100 	private Parser parse(String[] args){
101 
102 		Parser parser=new Parser();
103 		for(int i=0; i<args.length; i++){
104 			String arg=args[i];
105 			String[] split=arg.split("=");
106 			String a=split[0].toLowerCase();
107 			String b=split.length>1 ? split[1] : null;
108 			if(b!=null && b.equalsIgnoreCase("null")){b=null;}
109 
110 			if(a.equalsIgnoreCase("16S") || a.equalsIgnoreCase("16Sfile")){
111 				r16SFile=b;
112 			}else if(a.equalsIgnoreCase("18S") || a.equalsIgnoreCase("18Sfile")){
113 				r18SFile=b;
114 			}else if(a.equalsIgnoreCase("tree") || a.equalsIgnoreCase("treefile")){
115 				treeFile=b;
116 			}else if(a.equals("lines")){
117 				maxLines=Long.parseLong(b);
118 				if(maxLines<0){maxLines=Long.MAX_VALUE;}
119 			}else if(a.equals("verbose")){
120 				verbose=Parse.parseBoolean(b);
121 //				ByteFile1.verbose=verbose;
122 //				ByteFile2.verbose=verbose;
123 //				ReadWrite.verbose=verbose;
124 			}
125 
126 			else if(a.equalsIgnoreCase("preferSSUMap")){
127 				preferSSUMap=Parse.parseBoolean(b);
128 			}else if(a.equalsIgnoreCase("preferSSUMapForEuks") || a.equalsIgnoreCase("preferSSUMapEuks")){
129 				preferSSUMapEuks=Parse.parseBoolean(b);
130 			}else if(a.equalsIgnoreCase("useSSUMapOnly")){
131 				useSSUMapOnly=Parse.parseBoolean(b);
132 			}else if(a.equalsIgnoreCase("useSSUMapOnlyEuks") || a.equalsIgnoreCase("SSUMapOnlyEuks")){
133 				useSSUMapOnlyEuks=Parse.parseBoolean(b);
134 			}else if(a.equalsIgnoreCase("useSSUMapOnlyProks") || a.equalsIgnoreCase("SSUMapOnlyProks")){
135 				useSSUMapOnlyProks=Parse.parseBoolean(b);
136 			}else if(a.equalsIgnoreCase("preferSSUMapForProks") || a.equalsIgnoreCase("preferSSUMapProks")){
137 				preferSSUMapProks=Parse.parseBoolean(b);
138 			}
139 
140 			else if(a.equalsIgnoreCase("clearAll")){
141 				clear16S=clear18S=Parse.parseBoolean(b);
142 			}else if(a.equalsIgnoreCase("clear16S")){
143 				clear16S=Parse.parseBoolean(b);
144 			}else if(a.equalsIgnoreCase("clear18S")){
145 				clear18S=Parse.parseBoolean(b);
146 			}else if(a.equalsIgnoreCase("clear16SEuks")){
147 				clear16SEuks=Parse.parseBoolean(b);
148 			}else if(a.equalsIgnoreCase("clear18SEuks")){
149 				clear18SEuks=Parse.parseBoolean(b);
150 			}else if(a.equalsIgnoreCase("clear16SProks")){
151 				clear16SProks=Parse.parseBoolean(b);
152 			}else if(a.equalsIgnoreCase("clear18SProks")){
153 				clear18SProks=Parse.parseBoolean(b);
154 			}
155 
156 			else if(parser.parse(arg, a, b)){
157 				//do nothing
158 			}else{
159 				outstream.println("Unknown parameter "+args[i]);
160 				assert(false) : "Unknown parameter "+args[i];
161 				//				throw new RuntimeException("Unknown parameter "+args[i]);
162 			}
163 		}
164 		if("auto".equalsIgnoreCase(r16SFile)){r16SFile=TaxTree.default16SFile();}
165 		if("auto".equalsIgnoreCase(r18SFile)){r18SFile=TaxTree.default18SFile();}
166 		SSUMap.r16SFile=r16SFile;
167 		SSUMap.r18SFile=r18SFile;
168 
169 		return parser;
170 	}
171 
172 	/** Add or remove .gz or .bz2 as needed */
fixExtensions()173 	private void fixExtensions(){
174 		in1=Tools.fixExtension(in1);
175 		if(in1==null){throw new RuntimeException("Error - at least one input file is required.");}
176 	}
177 
178 	/** Ensure files can be read and written */
checkFileExistence()179 	private void checkFileExistence(){
180 		//Ensure output files can be written
181 		if(!Tools.testOutputFiles(overwrite, append, false, out1)){
182 			outstream.println((out1==null)+", "+out1);
183 			throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output file "+out1+"\n");
184 		}
185 
186 		//Ensure input files can be read
187 		if(!Tools.testInputFiles(false, true, in1, r16SFile, r18SFile)){
188 			throw new RuntimeException("\nCan't read some input files.\n");
189 		}
190 		assert(in1!=null) : "Input sketch file is required";
191 		assert(r16SFile!=null || r18SFile!=null) : "Input SSU file is required";
192 
193 		//Ensure that no file was specified multiple times
194 		if(!Tools.testForDuplicateFiles(true, in1, out1, r16SFile, r18SFile)){
195 			throw new RuntimeException("\nSome file names were specified multiple times.\n");
196 		}
197 	}
198 
199 	/** Adjust file-related static fields as needed for this program */
checkStatics()200 	private static void checkStatics(){
201 		//Adjust the number of threads for input file reading
202 		if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
203 			ByteFile.FORCE_MODE_BF2=true;
204 		}
205 
206 //		if(!ByteFile.FORCE_MODE_BF2){
207 //			ByteFile.FORCE_MODE_BF2=false;
208 //			ByteFile.FORCE_MODE_BF1=true;
209 //		}
210 	}
211 
212 	/*--------------------------------------------------------------*/
213 	/*----------------         Outer Methods        ----------------*/
214 	/*--------------------------------------------------------------*/
215 
process(Timer t)216 	void process(Timer t){
217 
218 		ByteFile bf=ByteFile.makeByteFile(ffin1);
219 		ByteStreamWriter bsw=makeBSW(ffout1);
220 
221 		processInner(bf, bsw);
222 
223 		errorState|=bf.close();
224 		if(bsw!=null){errorState|=bsw.poisonAndWait();}
225 
226 		t.stop();
227 
228 		outstream.println(Tools.timeLinesBytesProcessed(t, linesProcessed, bytesProcessed, 8));
229 		outstream.println(Tools.linesBytesOut(linesProcessed, bytesProcessed, linesOut, bytesOut, 8, true));
230 
231 		outstream.println();
232 		outstream.println(Tools.number("Sketches:", sketchCount, 8));
233 		outstream.println(Tools.number("16S In:", r16Sin, 8));
234 		outstream.println(Tools.number("18S In:", r18Sin, 8));
235 		outstream.println(Tools.number("16S Added:", r16SfromMap, 8));
236 		outstream.println(Tools.number("18S Added:", r18SfromMap, 8));
237 		outstream.println(Tools.numberPercent("16S Out:", r16Sout, r16Sout*100.0/sketchCount, 2, 8));
238 		outstream.println(Tools.numberPercent("18S Out:", r18Sout, r18Sout*100.0/sketchCount, 2, 8));
239 
240 		if(errorState){
241 			throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
242 		}
243 	}
244 
245 	/*--------------------------------------------------------------*/
246 	/*----------------         Inner Methods        ----------------*/
247 	/*--------------------------------------------------------------*/
248 
makeBSW(FileFormat ff)249 	private static ByteStreamWriter makeBSW(FileFormat ff){
250 		if(ff==null){return null;}
251 		ByteStreamWriter bsw=new ByteStreamWriter(ff);
252 		bsw.start();
253 		return bsw;
254 	}
255 
256 //	private void processInner_old(ByteFile bf, ByteStreamWriter bsw){
257 //		SSUMap.load(outstream);
258 //
259 //		if(verbose){
260 //			System.err.println("Loaded SSUMap; |16S|="+SSUMap.r16SCount()+", |18S|="+SSUMap.r18SCount());
261 //		}
262 //
263 //		byte[] line=bf.nextLine();
264 ////		ByteBuilder bb=new ByteBuilder();
265 //
266 //		final byte[] ssuBytes="SSU:".getBytes();
267 //		final byte[] r16SBytes="16S:".getBytes();
268 //		final byte[] r18SBytes="18S:".getBytes();
269 //
270 //		while(line!=null){
271 //			if(line.length>0){
272 //				if(maxLines>0 && linesProcessed>=maxLines){break;}
273 //				linesProcessed++;
274 //				bytesProcessed+=(line.length+1);
275 //
276 //				final boolean header=(line[0]=='#');
277 //
278 //				linesOut++;
279 //				bytesOut+=(line.length+1);
280 //
281 //				if(header){
282 //					if(Tools.startsWith(line, "#SZ:")){
283 //						sketchCount++;
284 //
285 //						bsw.print(line);
286 //
287 //						final int tid=parseTaxID(line);
288 //						final boolean has16S=Tools.contains(line, ssuBytes, 0) || Tools.contains(line, r16SBytes, 0);
289 //						final boolean has18S=Tools.contains(line, r18SBytes, 0);
290 //
291 //						if(verbose){
292 //							System.err.println("For line "+new String(line)+":");
293 //							System.err.println("tid="+tid+", has16S="+has16S+", has18S="+has18S);
294 //						}
295 //
296 //						if(tid>0){
297 //							final byte[] r16S=has16S ? null : SSUMap.r16SMap.get(tid);
298 //							final byte[] r18S=has18S ? null : SSUMap.r18SMap.get(tid);
299 //							if(r16S!=null){bsw.print("\t16S:").print(r16S.length); ssuOut++;}
300 //							if(r18S!=null){bsw.print("\t18S:").print(r18S.length); ssuOut++;}
301 //							if(r16S!=null){bsw.print("\n#16S:").print(r16S);}
302 //							if(r18S!=null){bsw.print("\n#18S:").print(r18S);}
303 //
304 //							if(verbose){System.err.println("Found 16S: "+(r16S!=null)+"; found 18S: "+(r18S!=null));}
305 //						}
306 //						bsw.println();
307 //					}else if(Tools.startsWith(line, "#16S:") || Tools.startsWith(line, "#18S:") || Tools.startsWith(line, "#SSU:")){
308 //						bsw.println(line);
309 //						ssuIn++;
310 //						ssuOut++;
311 //					}else{
312 //						assert(Tools.startsWith(line, "##")) : new String(line);
313 //						bsw.println(line);
314 //					}
315 //				}else{
316 //					bsw.println(line);
317 //				}
318 //			}
319 //			line=bf.nextLine();
320 //		}
321 //	}
322 
processInner(ByteFile bf, ByteStreamWriter bsw)323 	private void processInner(ByteFile bf, ByteStreamWriter bsw){
324 		SSUMap.load(outstream);
325 
326 		if(verbose){
327 			System.err.println("Loaded SSUMap; |16S|="+SSUMap.r16SCount()+", |18S|="+SSUMap.r18SCount());
328 		}
329 
330 		byte[] line=bf.nextLine();
331 //		ByteBuilder bb=new ByteBuilder();
332 
333 //		final byte[] ssuBytes="SSU:".getBytes();
334 //		final byte[] r16SBytes="16S:".getBytes();
335 //		final byte[] r18SBytes="18S:".getBytes();
336 
337 		SketchHeader header=null;
338 		while(line!=null){
339 			if(line.length>0){
340 				if(maxLines>0 && linesProcessed>=maxLines){break;}
341 				linesProcessed++;
342 				bytesProcessed+=(line.length+1);
343 
344 				final boolean isHeader=(line[0]=='#');
345 
346 				if(isHeader){
347 					if(Tools.startsWith(line, "#SZ:")){
348 						assert(header==null) : "\nReplacing this:\n"+header.toBytes()+"\nWith this:\n"+new String(line)+"\n";
349 						header=new SketchHeader(line);
350 						sketchCount++;
351 					}else if(Tools.startsWith(line, "##")){
352 						bsw.println(line);
353 
354 						linesOut++;
355 						bytesOut+=(line.length+1);
356 					}else{
357 						header.addLine(line);
358 					}
359 				}else{
360 					if(header!=null){
361 						try {
362 							processHeader(header);
363 						} catch (Throwable e) {
364 							e.printStackTrace();
365 							assert(false) : header.toBytes();
366 						}
367 						r16Sout+=(header.r16S==null ? 0 : 1);
368 						r18Sout+=(header.r18S==null ? 0 : 1);
369 						linesOut+=1+(header.r16S==null ? 0 : 1)+(header.r18S==null ? 0 : 1);
370 						ByteBuilder bb=header.toBytes();
371 						bytesOut+=(bb.length+1);
372 						bsw.println(bb);
373 						header=null;
374 					}
375 					bsw.println(line);
376 
377 					linesOut++;
378 					bytesOut+=(line.length+1);
379 				}
380 			}
381 			line=bf.nextLine();
382 		}
383 	}
384 
processHeader(SketchHeader header)385 	void processHeader(SketchHeader header){
386 
387 		if(verbose){System.err.println("Processing tid "+header.tid+":\n"+header.toBytes()+"\n");}
388 
389 		final boolean euk=(tree!=null && header.tid>0 && header.tid<SketchObject.minFakeID) ? tree.isEukaryote(header.tid) : false;
390 		final boolean prok=(tree!=null && header.tid>0 && header.tid<SketchObject.minFakeID) ? tree.isProkaryote(header.tid) : false;
391 		if(useSSUMapOnly || (useSSUMapOnlyEuks && euk) || (useSSUMapOnlyProks && prok)){header.r16S=header.r18S=null;}
392 		if(header.tid>0){
393 			final boolean preferMap=(preferSSUMap || (preferSSUMapEuks && euk) || (preferSSUMapProks && prok));
394 			byte[] r16S=(SSUMap.r16SMap==null ? null : SSUMap.r16SMap.get(header.tid));
395 			byte[] r18S=(SSUMap.r18SMap==null ? null : SSUMap.r18SMap.get(header.tid));
396 			if(r16S!=null && (preferMap || header.r16S==null)){
397 				header.r16S=r16S;
398 				r16SfromMap++;
399 			}
400 			if(r18S!=null && (preferMap || header.r18S==null)){
401 				header.r18S=r18S;
402 				r18SfromMap++;
403 			}
404 		}
405 		if(clear16S || (clear16SEuks && euk) || (clear16SProks && prok)){header.r16S=null;}
406 		if(clear18S || (clear18SEuks && euk) || (clear18SProks && prok)){header.r18S=null;}
407 	}
408 
parseTaxID(byte[] line)409 	int parseTaxID(byte[] line){
410 		String[] split=Tools.tabPattern.split(new String(line));
411 		for(String s : split){
412 			if(s.startsWith("ID:") || s.startsWith("TAXID:")){
413 				final int colon=s.indexOf(':');
414 				final String sub=s.substring(colon+1);
415 				return Integer.parseInt(sub);
416 			}
417 		}
418 		return -1;
419 	}
420 
421 	/*--------------------------------------------------------------*/
422 
423 	//A very limited parser
424 	private class SketchHeader {
425 
SketchHeader(byte[] line)426 		SketchHeader(byte[] line){
427 			this(new String(line, 1, line.length-1));
428 		}
429 
SketchHeader(String line)430 		SketchHeader(String line){
431 			if(line.charAt(0)=='#'){line=line.substring(1);}
432 			assert(line.startsWith("SZ:"));
433 			String[] split=Tools.tabPattern.split(line);
434 			fields=new ArrayList<String>(line.length()+2);
435 			int tid_=-1;
436 			for(String s : split){
437 				if(s.startsWith("16S:") || s.startsWith("18S:") || s.startsWith("SSU:")){
438 					//do nothing
439 				}else{
440 					if(s.startsWith("ID:") || s.startsWith("TAXID:")){
441 						final int colon=s.indexOf(':');
442 						final String sub=s.substring(colon+1);
443 						tid_=Integer.parseInt(sub);
444 					}
445 					fields.add(s);
446 				}
447 			}
448 			tid=tid_;
449 		}
450 
addLine(byte[] line)451 		void addLine(byte[] line){
452 			assert(line[0]=='#');
453 			assert(line[1]=='1' || line[1]=='S') : new String(line);
454 			if(Tools.startsWith(line, "#16S:") || Tools.startsWith(line, "#SSU:")){
455 				assert(r16S==null);
456 				r16S=Arrays.copyOfRange(line, 5, line.length);
457 				r16Sin++;
458 			}else if(Tools.startsWith(line, "#18S:")){
459 				assert(r18S==null);
460 				r18S=Arrays.copyOfRange(line, 5, line.length);
461 				r18Sin++;
462 			}else{
463 				assert(false) : new String(line);
464 			}
465 		}
466 
toBytes()467 		ByteBuilder toBytes(){
468 			ByteBuilder bb=new ByteBuilder(1000);
469 			bb.append('#');
470 			for(int i=0; i<fields.size(); i++){
471 				if(i>0){bb.tab();}
472 				bb.append(fields.get(i));
473 			}
474 			if(r16S!=null){bb.tab().append("16S:").append(r16S.length);}
475 			if(r18S!=null){bb.tab().append("18S:").append(r18S.length);}
476 
477 			if(r16S!=null){bb.nl().append("#16S:").append(r16S);}
478 			if(r18S!=null){bb.nl().append("#18S:").append(r18S);}
479 			return bb;
480 		}
481 
482 		final int tid;
483 		ArrayList<String> fields;
484 		byte[] r16S;
485 		byte[] r18S;
486 	}
487 
488 	/*--------------------------------------------------------------*/
489 	/*----------------            Fields            ----------------*/
490 	/*--------------------------------------------------------------*/
491 
	/** Input sketch file path; required */
	private String in1=null;
	/** Output sketch file path */
	private String out1=null;
	/** 16S file path; "auto" is replaced by TaxTree.default16SFile() during parsing */
	private String r16SFile="auto";
	/** 18S file path; "auto" is replaced by TaxTree.default18SFile() during parsing */
	private String r18SFile="auto";
	/** Path passed to TaxTree.loadTaxTree; only loaded if a euk/prok-specific flag is set */
	private String treeFile="auto";

	/** Replace SSUs from the input with SSUMap entries whenever the map has one */
	boolean preferSSUMap=false;
	/** Like preferSSUMap, but only for TaxIDs the tree reports as eukaryotes */
	boolean preferSSUMapEuks=false;
	/** Like preferSSUMap, but only for TaxIDs the tree reports as prokaryotes */
	boolean preferSSUMapProks=false;
	/** Discard SSUs from the input before consulting SSUMap */
	boolean useSSUMapOnly=false;
	/** Like useSSUMapOnly, but only for eukaryotes */
	boolean useSSUMapOnlyEuks=false;
	/** Like useSSUMapOnly, but only for prokaryotes */
	boolean useSSUMapOnlyProks=false;
	/** Remove 16S from every header after map lookup */
	boolean clear16S=false;
	/** Remove 18S from every header after map lookup */
	boolean clear18S=false;
	/** Remove 16S from eukaryote headers */
	boolean clear16SEuks=false;
	/** Remove 18S from eukaryote headers */
	boolean clear18SEuks=false;
	/** Remove 16S from prokaryote headers */
	boolean clear16SProks=false;
	/** Remove 18S from prokaryote headers */
	boolean clear18SProks=false;

	/*--------------------------------------------------------------*/

	/** Non-empty input lines read */
	private long linesProcessed=0;
	/** Lines counted as output */
	private long linesOut=0;
	/** Input bytes read, counting one extra byte per line */
	private long bytesProcessed=0;
	/** Bytes counted as output, counting one extra byte per line */
	private long bytesOut=0;

	/** Number of "#SZ:" header lines encountered */
	private long sketchCount=0;

	/** "#16S:" or "#SSU:" lines found in the input */
	private long r16Sin=0;
	/** Headers written with a 16S sequence attached */
	private long r16Sout=0;
	/** Times a 16S sequence from SSUMap was assigned to a header */
	private long r16SfromMap=0;
	/** "#18S:" lines found in the input */
	private long r18Sin=0;
	/** Headers written with an 18S sequence attached */
	private long r18Sout=0;
	/** Times an 18S sequence from SSUMap was assigned to a header */
	private long r18SfromMap=0;

	/** Stop reading after this many lines; a negative "lines" argument sets Long.MAX_VALUE */
	private long maxLines=Long.MAX_VALUE;

	/*--------------------------------------------------------------*/
	/*----------------         Final Fields         ----------------*/
	/*--------------------------------------------------------------*/

	/** Input format, created from in1 as FileFormat.SKETCH */
	private final FileFormat ffin1;
	/** Output format, created from out1 as FileFormat.SKETCH */
	private final FileFormat ffout1;

	/** Taxonomy used for the euk/prok tests; null when not loaded */
	private final TaxTree tree;

	/*--------------------------------------------------------------*/
	/*----------------        Common Fields         ----------------*/
	/*--------------------------------------------------------------*/

	/** Destination for status messages; replaced by the PreParser's stream in the constructor */
	private PrintStream outstream=System.err;
	/** Print extra diagnostic messages to System.err */
	public static boolean verbose=false;
	/** Set when closing a stream reports an error; causes process() to throw at the end */
	public boolean errorState=false;
	/** Taken from the Parser; passed to the output-file checks */
	private boolean overwrite=false;
	/** Taken from the Parser; passed to the output-file checks */
	private boolean append=false;
547 
548 }
549