package sketch;

import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Arrays;

import fileIO.ByteFile;
import fileIO.ByteStreamWriter;
import fileIO.FileFormat;
import fileIO.ReadWrite;
import shared.Parse;
import shared.Parser;
import shared.PreParser;
import shared.Shared;
import shared.Timer;
import shared.Tools;
import structures.ByteBuilder;
import tax.TaxTree;

/**
 * Copies a sketch file to an output file while rewriting each sketch header
 * (the line starting with "#SZ:") so that it carries 16S/18S SSU sequences.
 * Sequences already attached to a header ("#16S:", "#SSU:", "#18S:" lines) are
 * kept, replaced, or cleared according to the command-line flags; missing ones
 * are looked up by taxonomic ID in {@link SSUMap}.
 *
 * @author Brian Bushnell
 * @date May 9, 2016
 *
 */
public class AddSSU {

	/*--------------------------------------------------------------*/
	/*----------------        Initialization        ----------------*/
	/*--------------------------------------------------------------*/

	/**
	 * Code entrance from the command line.
	 * @param args Command line arguments
	 */
	public static void main(String[] args){
		//Start a timer immediately upon code entrance.
		Timer t=new Timer();

		//Create an instance of this class
		AddSSU x=new AddSSU(args);

		//Run the object
		x.process(t);

		//Close the print stream if it was redirected
		Shared.closeStream(x.outstream);
	}

	/**
	 * Constructor.  Parses the arguments, validates the file names, creates the
	 * input/output FileFormats, and loads a TaxTree if any of the
	 * eukaryote/prokaryote-specific flags was set.
	 * @param args Command line arguments
	 */
	public AddSSU(String[] args){

		{//Preparse block for help, config files, and outstream
			PreParser pp=new PreParser(args, /*getClass()*/null, false);
			args=pp.args;
			outstream=pp.outstream;
		}

		//Set shared static variables prior to parsing
		ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
		ReadWrite.MAX_ZIP_THREADS=Shared.threads();

		{//Parse the arguments
			final Parser parser=parse(args);
			overwrite=parser.overwrite;
			append=parser.append;

			in1=parser.in1;

			out1=parser.out1;
		}

		fixExtensions(); //Add or remove .gz or .bz2 as needed
		checkFileExistence(); //Ensure files can be read and written
		checkStatics(); //Adjust file-related static fields as needed for this program

		ffout1=FileFormat.testOutput(out1, FileFormat.SKETCH, null, true, overwrite, append, false);
		ffin1=FileFormat.testInput(in1, FileFormat.SKETCH, null, true, false);

		if(verbose){
			System.err.println("Set r16SFile="+r16SFile);
			System.err.println("Set r18SFile="+r18SFile);
		}

		//These flags all depend on knowing whether a tid is a eukaryote or prokaryote,
		//which processHeader only determines when a tree is present.
		final boolean needsTree=(preferSSUMapEuks || preferSSUMapProks || clear16SEuks || clear18SEuks ||
				clear16SProks || clear18SProks || useSSUMapOnlyEuks || useSSUMapOnlyProks);

		tree=(treeFile!=null && needsTree ? TaxTree.loadTaxTree(treeFile, outstream, false, false) : null);

		if(needsTree){
			assert(tree!=null) : "preferSSUMapForEuks, clear16SEuks, and clear18SEuks require a TaxTree.";
		}
	}

	/*--------------------------------------------------------------*/
	/*----------------    Initialization Helpers    ----------------*/
	/*--------------------------------------------------------------*/

	/**
	 * Parse arguments from the command line.
	 * Arguments are of the form key=value; a value of "null" (any case) is
	 * treated as absent.  Keys not handled here are offered to the shared Parser.
	 * Also resolves "auto" SSU file names and publishes them to SSUMap.
	 * @param args Command line arguments
	 * @return The Parser holding the standard fields (in1, out1, overwrite, append)
	 */
	private Parser parse(String[] args){

		Parser parser=new Parser();
		for(int i=0; i<args.length; i++){
			String arg=args[i];
			String[] split=arg.split("=");
			String a=split[0].toLowerCase();
			String b=split.length>1 ? split[1] : null;
			if(b!=null && b.equalsIgnoreCase("null")){b=null;}

			if(a.equalsIgnoreCase("16S") || a.equalsIgnoreCase("16Sfile")){
				r16SFile=b;
			}else if(a.equalsIgnoreCase("18S") || a.equalsIgnoreCase("18Sfile")){
				r18SFile=b;
			}else if(a.equalsIgnoreCase("tree") || a.equalsIgnoreCase("treefile")){
				treeFile=b;
			}else if(a.equals("lines")){
				maxLines=Long.parseLong(b);
				if(maxLines<0){maxLines=Long.MAX_VALUE;}
			}else if(a.equals("verbose")){
				verbose=Parse.parseBoolean(b);
			}

			else if(a.equalsIgnoreCase("preferSSUMap")){
				preferSSUMap=Parse.parseBoolean(b);
			}else if(a.equalsIgnoreCase("preferSSUMapForEuks") || a.equalsIgnoreCase("preferSSUMapEuks")){
				preferSSUMapEuks=Parse.parseBoolean(b);
			}else if(a.equalsIgnoreCase("useSSUMapOnly")){
				useSSUMapOnly=Parse.parseBoolean(b);
			}else if(a.equalsIgnoreCase("useSSUMapOnlyEuks") || a.equalsIgnoreCase("SSUMapOnlyEuks")){
				useSSUMapOnlyEuks=Parse.parseBoolean(b);
			}else if(a.equalsIgnoreCase("useSSUMapOnlyProks") || a.equalsIgnoreCase("SSUMapOnlyProks")){
				useSSUMapOnlyProks=Parse.parseBoolean(b);
			}else if(a.equalsIgnoreCase("preferSSUMapForProks") || a.equalsIgnoreCase("preferSSUMapProks")){
				preferSSUMapProks=Parse.parseBoolean(b);
			}

			else if(a.equalsIgnoreCase("clearAll")){
				clear16S=clear18S=Parse.parseBoolean(b);
			}else if(a.equalsIgnoreCase("clear16S")){
				clear16S=Parse.parseBoolean(b);
			}else if(a.equalsIgnoreCase("clear18S")){
				clear18S=Parse.parseBoolean(b);
			}else if(a.equalsIgnoreCase("clear16SEuks")){
				clear16SEuks=Parse.parseBoolean(b);
			}else if(a.equalsIgnoreCase("clear18SEuks")){
				clear18SEuks=Parse.parseBoolean(b);
			}else if(a.equalsIgnoreCase("clear16SProks")){
				clear16SProks=Parse.parseBoolean(b);
			}else if(a.equalsIgnoreCase("clear18SProks")){
				clear18SProks=Parse.parseBoolean(b);
			}

			else if(parser.parse(arg, a, b)){
				//do nothing
			}else{
				outstream.println("Unknown parameter "+args[i]);
				assert(false) : "Unknown parameter "+args[i];
			}
		}
		if("auto".equalsIgnoreCase(r16SFile)){r16SFile=TaxTree.default16SFile();}
		if("auto".equalsIgnoreCase(r18SFile)){r18SFile=TaxTree.default18SFile();}
		SSUMap.r16SFile=r16SFile;
		SSUMap.r18SFile=r18SFile;

		return parser;
	}

	/** Add or remove .gz or .bz2 as needed */
	private void fixExtensions(){
		in1=Tools.fixExtension(in1);
		if(in1==null){throw new RuntimeException("Error - at least one input file is required.");}
	}

	/** Ensure files can be read and written */
	private void checkFileExistence(){
		//Ensure output files can be written
		if(!Tools.testOutputFiles(overwrite, append, false, out1)){
			outstream.println((out1==null)+", "+out1);
			throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output file "+out1+"\n");
		}

		//Ensure input files can be read
		if(!Tools.testInputFiles(false, true, in1, r16SFile, r18SFile)){
			throw new RuntimeException("\nCan't read some input files.\n");
		}
		assert(in1!=null) : "Input sketch file is required";
		assert(r16SFile!=null || r18SFile!=null) : "Input SSU file is required";

		//Ensure that no file was specified multiple times
		if(!Tools.testForDuplicateFiles(true, in1, out1, r16SFile, r18SFile)){
			throw new RuntimeException("\nSome file names were specified multiple times.\n");
		}
	}

	/** Adjust file-related static fields as needed for this program */
	private static void checkStatics(){
		//Adjust the number of threads for input file reading
		if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.threads()>2){
			ByteFile.FORCE_MODE_BF2=true;
		}
	}

	/*--------------------------------------------------------------*/
	/*----------------         Outer Methods        ----------------*/
	/*--------------------------------------------------------------*/

	/**
	 * Streams the input sketch file to the output, then prints statistics.
	 * @param t Timer started by the caller; stopped here before reporting
	 * @throws RuntimeException if any stage set errorState
	 */
	void process(Timer t){

		ByteFile bf=ByteFile.makeByteFile(ffin1);
		ByteStreamWriter bsw=makeBSW(ffout1);

		try{
			processInner(bf, bsw);
		}finally{
			//Release the reader and writer even when processing fails
			errorState|=bf.close();
			if(bsw!=null){errorState|=bsw.poisonAndWait();}
		}

		t.stop();

		outstream.println(Tools.timeLinesBytesProcessed(t, linesProcessed, bytesProcessed, 8));
		outstream.println(Tools.linesBytesOut(linesProcessed, bytesProcessed, linesOut, bytesOut, 8, true));

		//Avoid 0/0 (NaN) in the percentages when the input held no sketches
		final double denominator=Math.max(1, sketchCount);

		outstream.println();
		outstream.println(Tools.number("Sketches:", sketchCount, 8));
		outstream.println(Tools.number("16S In:", r16Sin, 8));
		outstream.println(Tools.number("18S In:", r18Sin, 8));
		outstream.println(Tools.number("16S Added:", r16SfromMap, 8));
		outstream.println(Tools.number("18S Added:", r18SfromMap, 8));
		outstream.println(Tools.numberPercent("16S Out:", r16Sout, r16Sout*100.0/denominator, 2, 8));
		outstream.println(Tools.numberPercent("18S Out:", r18Sout, r18Sout*100.0/denominator, 2, 8));

		if(errorState){
			throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
		}
	}

	/*--------------------------------------------------------------*/
	/*----------------         Inner Methods        ----------------*/
	/*--------------------------------------------------------------*/

	/**
	 * Creates and starts a writer for the given format.
	 * @param ff Output format, or null for no output
	 * @return A started ByteStreamWriter, or null if ff is null
	 */
	private static ByteStreamWriter makeBSW(FileFormat ff){
		if(ff==null){return null;}
		ByteStreamWriter bsw=new ByteStreamWriter(ff);
		bsw.start();
		return bsw;
	}

	/**
	 * Reads the sketch file line by line.
	 * A "#SZ:" line opens a new header; following single-'#' lines are attached to
	 * it; "##" lines and data lines are copied through.  The pending header is
	 * rewritten and emitted just before the first data line that follows it, or
	 * when the next "#SZ:" line or the end of input arrives first.
	 * Empty lines are skipped.  Stops early once maxLines lines were processed.
	 * @param bf Input file
	 * @param bsw Output writer; may be null, in which case nothing is written
	 */
	private void processInner(ByteFile bf, ByteStreamWriter bsw){
		SSUMap.load(outstream);

		if(verbose){
			System.err.println("Loaded SSUMap; |16S|="+SSUMap.r16SCount()+", |18S|="+SSUMap.r18SCount());
		}

		SketchHeader header=null;
		for(byte[] line=bf.nextLine(); line!=null; line=bf.nextLine()){
			if(line.length<1){continue;}
			if(maxLines>0 && linesProcessed>=maxLines){break;}
			linesProcessed++;
			bytesProcessed+=(line.length+1);

			if(line[0]!='#'){//Sketch data
				if(header!=null){
					writeHeader(header, bsw);
					header=null;
				}
				writeLine(line, bsw);
			}else if(Tools.startsWith(line, "#SZ:")){
				//A pending header here means the previous sketch had no data lines;
				//emit it rather than silently discarding it.
				if(header!=null){writeHeader(header, bsw);}
				header=new SketchHeader(line);
				sketchCount++;
			}else if(Tools.startsWith(line, "##")){
				writeLine(line, bsw);
			}else{
				if(header==null){
					throw new RuntimeException("Header line found outside of a sketch header: "+new String(line));
				}
				header.addLine(line);
			}
		}

		//Emit a trailing header that was never followed by data
		if(header!=null){writeHeader(header, bsw);}
	}

	/** Copies one input line to the output and updates the output counters. */
	private void writeLine(byte[] line, ByteStreamWriter bsw){
		if(bsw!=null){bsw.println(line);}
		linesOut++;
		bytesOut+=(line.length+1);
	}

	/** Applies processHeader to a completed header, writes it, and updates the output counters. */
	private void writeHeader(SketchHeader header, ByteStreamWriter bsw){
		try {
			processHeader(header);
		} catch (Throwable e) {
			e.printStackTrace();
			errorState=true; //Ensure the failure is reported even when assertions are disabled
			assert(false) : header.toBytes();
		}
		r16Sout+=(header.r16S==null ? 0 : 1);
		r18Sout+=(header.r18S==null ? 0 : 1);
		linesOut+=1+(header.r16S==null ? 0 : 1)+(header.r18S==null ? 0 : 1);
		ByteBuilder bb=header.toBytes();
		bytesOut+=(bb.length+1);
		if(bsw!=null){bsw.println(bb);}
	}

	/**
	 * Decides which 16S/18S sequences a header ends up with.
	 * Order of operations: optionally discard the sequences read from the file
	 * (useSSUMapOnly*), then fill in or overwrite from SSUMap (preferSSUMap*),
	 * then optionally clear (clear16S*, clear18S*).
	 * The Euks/Proks variants only take effect when a tree is loaded and the tid
	 * is positive and below SketchObject.minFakeID.
	 * @param header Header to modify in place
	 */
	void processHeader(SketchHeader header){

		if(verbose){System.err.println("Processing tid "+header.tid+":\n"+header.toBytes()+"\n");}

		final boolean classifiable=(tree!=null && header.tid>0 && header.tid<SketchObject.minFakeID);
		final boolean euk=(classifiable ? tree.isEukaryote(header.tid) : false);
		final boolean prok=(classifiable ? tree.isProkaryote(header.tid) : false);

		if(useSSUMapOnly || (useSSUMapOnlyEuks && euk) || (useSSUMapOnlyProks && prok)){header.r16S=header.r18S=null;}

		if(header.tid>0){
			final boolean preferMap=(preferSSUMap || (preferSSUMapEuks && euk) || (preferSSUMapProks && prok));
			byte[] r16S=(SSUMap.r16SMap==null ? null : SSUMap.r16SMap.get(header.tid));
			byte[] r18S=(SSUMap.r18SMap==null ? null : SSUMap.r18SMap.get(header.tid));
			if(r16S!=null && (preferMap || header.r16S==null)){
				header.r16S=r16S;
				r16SfromMap++;
			}
			if(r18S!=null && (preferMap || header.r18S==null)){
				header.r18S=r18S;
				r18SfromMap++;
			}
		}

		if(clear16S || (clear16SEuks && euk) || (clear16SProks && prok)){header.r16S=null;}
		if(clear18S || (clear18SEuks && euk) || (clear18SProks && prok)){header.r18S=null;}
	}

	/**
	 * Extracts the taxonomic ID from a tab-delimited header line.
	 * @param line Header line
	 * @return The integer following the first "ID:" or "TAXID:" field, or -1 if there is none
	 */
	int parseTaxID(byte[] line){
		String[] split=Tools.tabPattern.split(new String(line));
		for(String s : split){
			if(s.startsWith("ID:") || s.startsWith("TAXID:")){
				final int colon=s.indexOf(':');
				final String sub=s.substring(colon+1);
				return Integer.parseInt(sub);
			}
		}
		return -1;
	}

	/*--------------------------------------------------------------*/

	/**
	 * A very limited parser for a sketch header.
	 * Holds the tab-delimited fields of the "#SZ:" line (minus any 16S:/18S:/SSU:
	 * length fields, which are regenerated by toBytes) plus the attached SSU sequences.
	 * Not static because addLine updates the enclosing instance's input counters.
	 */
	private class SketchHeader {

		/** @param line A header line beginning with '#', which is stripped */
		SketchHeader(byte[] line){
			this(new String(line, 1, line.length-1));
		}

		/** @param line A header line beginning with "SZ:", optionally preceded by '#' */
		SketchHeader(String line){
			if(line.charAt(0)=='#'){line=line.substring(1);}
			assert(line.startsWith("SZ:"));
			String[] split=Tools.tabPattern.split(line);
			//Room for every field plus the two SSU length fields
			fields=new ArrayList<String>(split.length+2);
			int tid_=-1;
			for(String s : split){
				if(s.startsWith("16S:") || s.startsWith("18S:") || s.startsWith("SSU:")){
					//do nothing
				}else{
					if(s.startsWith("ID:") || s.startsWith("TAXID:")){
						final int colon=s.indexOf(':');
						final String sub=s.substring(colon+1);
						tid_=Integer.parseInt(sub);
					}
					fields.add(s);
				}
			}
			tid=tid_;
		}

		/**
		 * Attaches an SSU sequence line to this header.
		 * "#16S:" and "#SSU:" lines populate r16S; "#18S:" lines populate r18S.
		 * The 5-byte prefix is removed.
		 * @param line A line starting with "#16S:", "#SSU:", or "#18S:"
		 */
		void addLine(byte[] line){
			assert(line[0]=='#');
			assert(line.length>1 && (line[1]=='1' || line[1]=='S')) : new String(line);
			if(Tools.startsWith(line, "#16S:") || Tools.startsWith(line, "#SSU:")){
				assert(r16S==null);
				r16S=Arrays.copyOfRange(line, 5, line.length);
				r16Sin++;
			}else if(Tools.startsWith(line, "#18S:")){
				assert(r18S==null);
				r18S=Arrays.copyOfRange(line, 5, line.length);
				r18Sin++;
			}else{
				assert(false) : new String(line);
			}
		}

		/**
		 * Renders the header: '#' plus the tab-delimited fields, followed by
		 * "16S:"/"18S:" length fields, then one "#16S:"/"#18S:" line per sequence present.
		 * The result has no trailing newline.
		 * @return A new ByteBuilder containing the header text
		 */
		ByteBuilder toBytes(){
			ByteBuilder bb=new ByteBuilder(1000);
			bb.append('#');
			for(int i=0; i<fields.size(); i++){
				if(i>0){bb.tab();}
				bb.append(fields.get(i));
			}
			if(r16S!=null){bb.tab().append("16S:").append(r16S.length);}
			if(r18S!=null){bb.tab().append("18S:").append(r18S.length);}

			if(r16S!=null){bb.nl().append("#16S:").append(r16S);}
			if(r18S!=null){bb.nl().append("#18S:").append(r18S);}
			return bb;
		}

		/** Taxonomic ID parsed from the ID:/TAXID: field, or -1 if absent */
		final int tid;
		/** Header fields in input order, excluding 16S:/18S:/SSU: fields */
		ArrayList<String> fields;
		/** 16S sequence, or null */
		byte[] r16S;
		/** 18S sequence, or null */
		byte[] r18S;
	}

	/*--------------------------------------------------------------*/
	/*----------------            Fields            ----------------*/
	/*--------------------------------------------------------------*/

	private String in1=null;
	private String out1=null;
	private String r16SFile="auto";
	private String r18SFile="auto";
	private String treeFile="auto";

	boolean preferSSUMap=false;
	boolean preferSSUMapEuks=false;
	boolean preferSSUMapProks=false;
	boolean useSSUMapOnly=false;
	boolean useSSUMapOnlyEuks=false;
	boolean useSSUMapOnlyProks=false;
	boolean clear16S=false;
	boolean clear18S=false;
	boolean clear16SEuks=false;
	boolean clear18SEuks=false;
	boolean clear16SProks=false;
	boolean clear18SProks=false;

	/*--------------------------------------------------------------*/

	private long linesProcessed=0;
	private long linesOut=0;
	private long bytesProcessed=0;
	private long bytesOut=0;

	private long sketchCount=0;

	private long r16Sin=0;
	private long r16Sout=0;
	private long r16SfromMap=0;
	private long r18Sin=0;
	private long r18Sout=0;
	private long r18SfromMap=0;

	private long maxLines=Long.MAX_VALUE;

	/*--------------------------------------------------------------*/
	/*----------------         Final Fields         ----------------*/
	/*--------------------------------------------------------------*/

	private final FileFormat ffin1;
	private final FileFormat ffout1;

	private final TaxTree tree;

	/*--------------------------------------------------------------*/
	/*----------------        Common Fields         ----------------*/
	/*--------------------------------------------------------------*/

	private PrintStream outstream=System.err;
	public static boolean verbose=false;
	public boolean errorState=false;
	private boolean overwrite=false;
	private boolean append=false;

}