1 package fileIO; 2 3 import java.io.BufferedReader; 4 import java.io.File; 5 import java.io.IOException; 6 import java.io.InputStream; 7 import java.io.InputStreamReader; 8 import java.util.ArrayList; 9 import java.util.Arrays; 10 import java.util.List; 11 12 import jgi.TestFormat; 13 import shared.Parse; 14 import shared.PreParser; 15 import shared.Tools; 16 17 /** 18 * This class contains metadata about a file 19 * @author Brian Bushnell 20 * @date Dec 19, 2012 21 * 22 */ 23 public final class FileFormat { 24 main(String[] args)25 public static void main(String[] args){ 26 27 {//Preparse block for help, config files, and outstream 28 PreParser pp=new PreParser(args, null /*new Object() { }.getClass().getEnclosingClass()*/, false); 29 args=pp.args; 30 //outstream=pp.outstream; 31 } 32 33 stream.FASTQ.warnQualityChange=false; 34 PRINT_WARNING=false; 35 boolean full=false; 36 ArrayList<String> files=new ArrayList<String>(); 37 for(int i=0; i<args.length; i++){ 38 39 final String arg=args[i]; 40 String[] split=arg.split("="); 41 String a=split[0].toLowerCase(); 42 String b=split.length>1 ? 
split[1] : null; 43 44 if(a.equals("verbose")){ 45 verbose=Parse.parseBoolean(b); 46 }else if(a.equals("full")){ 47 full=Parse.parseBoolean(b); 48 }else if(b!=null){ 49 // assert(a.startsWith("in")) : "Unknown parameter "+arg; 50 if(a.startsWith("in")){files.add(b);} 51 }else{ 52 files.add(arg); 53 } 54 } 55 56 if(full){ 57 TestFormat.main(args); 58 }else{ 59 for(String fname : files){ 60 test(fname, true); 61 } 62 } 63 64 } 65 test(String fname, boolean forceFileRead)66 private static void test(String fname, boolean forceFileRead){ 67 FileFormat ffName=testInput(fname, FASTQ, null, false, false, false); 68 FileFormat ffContent=testInput(fname, ffName.format(), null, false, true, true); 69 FileFormat ff=ffContent; 70 // assert(false) : ffName+"\n"+ffContent; 71 if(ff==null){ 72 System.out.println("null"); 73 }else{ 74 int q=33; 75 int len=-1; 76 boolean i=false; 77 if(ff.fastq()){ 78 byte qold=stream.FASTQ.ASCII_OFFSET; 79 stream.FASTQ.ASCII_OFFSET=33; 80 int[] qi=testInterleavedAndQuality(fname, false); 81 q=qi[0]; 82 i=(qi[1]==INTERLEAVED); 83 len=qi[2]; 84 stream.FASTQ.ASCII_OFFSET=qold; 85 }else if(ff.fasta()){ 86 i=stream.FASTQ.testInterleavedFasta(fname, false); 87 } 88 if(ff.isSequence()){ 89 String qs=(q==33 ? "sanger" : q==64 ? "illumina" : ""+q); 90 System.out.print(qs+"\t"+FORMAT_ARRAY[ff.format()]+"\t"+COMPRESSION_ARRAY[ff.compression()]); 91 System.out.print("\t"+(i ? 
"interleaved" : "single-ended")); 92 if(len>0){System.out.print("\t"+len+"bp");} 93 }else{ 94 System.out.print(FORMAT_ARRAY[ff.format()]+"\t"+COMPRESSION_ARRAY[ff.compression()]); 95 } 96 if(ffName.format()!=ff.format()){System.out.print("\t"+FORMAT_ARRAY[ffName.format()]+"\t(File extension differs from contents)");} 97 System.out.println(); 98 } 99 } 100 101 /*--------------------------------------------------------------*/ 102 /*---------------- Initialization ----------------*/ 103 /*--------------------------------------------------------------*/ 104 testInput(String fname, String overrideExtension, boolean allowSubprocess)105 public static FileFormat testInput(String fname, String overrideExtension, boolean allowSubprocess){ 106 if(verbose){System.err.println("testInputA("+fname+", "+overrideExtension+", "+allowSubprocess+")");} 107 return testInput(fname, FASTQ, overrideExtension, allowSubprocess, true); 108 } 109 testInputList(List<String> fname, int defaultFormat, String overrideExtension, boolean allowSubprocess, boolean allowFileRead)110 public static FileFormat[] testInputList(List<String> fname, int defaultFormat, String overrideExtension, boolean allowSubprocess, boolean allowFileRead){ 111 if(verbose){System.err.println("testInputList("+fname+", "+defaultFormat+", "+overrideExtension+", "+allowSubprocess+", "+allowFileRead+")");} 112 FileFormat[] ffa=new FileFormat[fname.size()]; 113 for(int i=0; i<fname.size(); i++){ 114 ffa[i]=testInput(fname.get(i), defaultFormat, overrideExtension, allowSubprocess, allowFileRead, false); 115 } 116 return ffa; 117 } 118 testInput(String fnames[], int defaultFormat, String overrideExtension, boolean allowSubprocess, boolean allowFileRead)119 public static FileFormat[] testInput(String fnames[], int defaultFormat, String overrideExtension, boolean allowSubprocess, boolean allowFileRead){ 120 FileFormat[] array=new FileFormat[fnames.length]; 121 for(int i=0; i<fnames.length; i++){ 122 array[i]=testInput(fnames[i], 
defaultFormat, overrideExtension, allowSubprocess, allowFileRead); 123 } 124 return array; 125 } 126 testInput(String fname, int defaultFormat, String overrideExtension, boolean allowSubprocess, boolean allowFileRead)127 public static FileFormat testInput(String fname, int defaultFormat, String overrideExtension, boolean allowSubprocess, boolean allowFileRead){ 128 if(verbose){System.err.println("testInputB("+fname+", "+defaultFormat+", "+overrideExtension+", "+allowSubprocess+", "+allowFileRead+")");} 129 return testInput(fname, defaultFormat, overrideExtension, allowSubprocess, allowFileRead, false); 130 } 131 testInput(String fname, int defaultFormat, String overrideExtension, boolean allowSubprocess, boolean allowFileRead, boolean forceFileRead)132 public static FileFormat testInput(String fname, int defaultFormat, String overrideExtension, boolean allowSubprocess, boolean allowFileRead, boolean forceFileRead){ 133 if(verbose){System.err.println("testInputC("+fname+", "+defaultFormat+", "+overrideExtension+", "+allowSubprocess+", "+allowFileRead+", "+forceFileRead+")");} 134 if(fname==null){return null;} 135 int overrideFormat=0; 136 int overrideCompression=0; 137 if(overrideExtension!=null && overrideExtension.length()>0){ 138 int[] a=testFormat(overrideExtension, false, false); 139 if(a!=null){ 140 overrideFormat=a[0]; 141 if(a[1]!=RAW){overrideCompression=a[1];} 142 } 143 } 144 return testInput(fname, defaultFormat, overrideFormat, overrideCompression, allowSubprocess, allowFileRead, forceFileRead); 145 } 146 147 /** 148 * Create an input FileFormat object for this filename. 149 * @param fname Filename (path). 150 * @param defaultFormat Use this format if the name is unclear and the format is not autodetected. 151 * @param overrideFormat If specified, ignore the file extension and autodetection and input using this format. 152 * @param overrideCompression If specified, ignore the file extension and input using this compression protocol. 
153 * @param allowSubprocess Permission to spawn a subprocess like bgzip. 154 * @param allowFileRead Permission to read the file while constructing this FileFormat, for the purpose of format detection. 155 * @param forceFileRead Force reading the file while constructing this FileFormat, for the purpose of format detection. 156 * @return A FileFormat, or null if the filename is null. 157 */ testInput(String fname, int defaultFormat, int overrideFormat, int overrideCompression, boolean allowSubprocess, boolean allowFileRead, boolean forceFileRead)158 public static FileFormat testInput(String fname, int defaultFormat, int overrideFormat, 159 int overrideCompression, boolean allowSubprocess, boolean allowFileRead, boolean forceFileRead){ 160 if(verbose){System.err.println("testInputD("+fname+", "+defaultFormat+", "+overrideFormat+", "+overrideCompression+", "+allowSubprocess+", "+allowFileRead+", "+forceFileRead+")");} 161 if(fname==null){return null;} 162 return new FileFormat(fname, READ, defaultFormat, overrideFormat, overrideCompression, allowSubprocess, allowFileRead, forceFileRead, false, false, false, true); 163 } 164 165 /** 166 * Create an output FileFormat object for this filename. 167 * @param fname Filename (path). 168 * @param defaultFormat Use this format if the name is unclear. 169 * @param overrideExtension If specified, ignore the file extension and output in this format. 170 * @param allowSubprocess Permission to spawn a subprocess like bgzip. 171 * @param overwrite Permission to overwrite existing files. 172 * @param append Permission to append to existing files. 173 * @param ordered True if the input order should be maintained (for multithreaded read processing). 174 * @return A FileFormat, or null if the filename is null. 
175 */ testOutput(String fname, int defaultFormat, String overrideExtension, boolean allowSubprocess, boolean overwrite, boolean append, boolean ordered)176 public static FileFormat testOutput(String fname, int defaultFormat, String overrideExtension, boolean allowSubprocess, boolean overwrite, boolean append, boolean ordered){ 177 if(fname==null){return null;} 178 int overrideFormat=0; 179 int overrideCompression=0; 180 if(overrideExtension!=null && overrideExtension.length()>0){ 181 int[] a=testFormat(overrideExtension, false, false); 182 if(a!=null){ 183 overrideFormat=a[0]; 184 if(a[1]!=RAW){overrideCompression=a[1];} 185 } 186 } 187 return testOutput(fname, defaultFormat, overrideFormat, overrideCompression, allowSubprocess, overwrite, append, ordered); 188 } 189 testOutput(String fname, int defaultFormat, int overrideFormat, int overrideCompression, boolean allowSubprocess, boolean overwrite, boolean append, boolean ordered)190 public static FileFormat testOutput(String fname, int defaultFormat, int overrideFormat, int overrideCompression, boolean allowSubprocess, boolean overwrite, boolean append, boolean ordered){ 191 if(fname==null){return null;} 192 return new FileFormat(fname, WRITE, defaultFormat, overrideFormat, overrideCompression, allowSubprocess, false, false, overwrite, append, ordered, false); 193 } 194 195 /*--------------------------------------------------------------*/ 196 /*---------------- Constructor ----------------*/ 197 /*--------------------------------------------------------------*/ 198 FileFormat(String fname, int mode_, int defaultFormat, int overrideFormat, int overrideCompression, boolean allowSubprocess_, boolean allowFileRead, boolean forceFileRead, boolean overwrite_, boolean append_, boolean ordered_, boolean input_)199 private FileFormat(String fname, int mode_, int defaultFormat, int overrideFormat, int overrideCompression, boolean allowSubprocess_, 200 boolean allowFileRead, boolean forceFileRead, boolean overwrite_, 
boolean append_, boolean ordered_, boolean input_){ 201 // , boolean interleaved_, long maxReads_){ 202 203 if(verbose){ 204 // new Exception().printStackTrace(System.err); 205 System.err.println("FileFormat(fname="+fname+", mode="+mode_+", dFormat="+defaultFormat+", oFormat="+overrideFormat+", oCompression="+overrideCompression+ 206 ", allowSub="+allowSubprocess_+", allowRead="+allowFileRead+", forceFileRead="+forceFileRead+ 207 ", ow="+overwrite_+", append="+append_+", ordered="+ordered_+")"); 208 } 209 assert(!forceFileRead || allowFileRead); 210 211 // assert(!overwrite_ || !append_) : "Both overwrite and append may not be set to true."; 212 if(overwrite_ && append_){overwrite_=false;} 213 214 assert(fname!=null); 215 fname=fname.trim().replace('\\', '/'); 216 assert(fname.trim().length()>0) : fname; 217 218 if(defaultFormat<1 && !forceFileRead){defaultFormat=FQ;} 219 allowFileRead&=(mode_==READ); 220 int[] a=testFormat(fname, allowFileRead, forceFileRead); 221 222 if(verbose){System.err.println(Arrays.toString(a));} 223 224 if(a[0]==UNKNOWN && overrideFormat<1){ 225 a[0]=defaultFormat; 226 if(defaultFormat!=TEXT && PRINT_WARNING){ 227 System.err.println("Unspecified format for "+(mode_==READ ? "input" : "output")+" "+(fname==null ? 
"stream" : fname)+"; defaulting to "+FORMAT_ARRAY[a[0]]+"."); 228 } 229 } 230 if(verbose){System.err.println(Arrays.toString(a));} 231 232 if(overrideFormat>0){a[0]=overrideFormat;} 233 if(overrideCompression>0){a[1]=overrideCompression;} 234 235 if(verbose){System.err.println(Arrays.toString(a));} 236 237 238 // {format, compression, type, interleaved, quality, length} 239 name=fname; 240 simpleName=new File(name).getName(); 241 format=a[0]; 242 compression=a[1]; 243 type=a[2]; 244 interleaving=a[3]; 245 asciiOffset=a[4]; 246 length=a[5]; 247 mode=mode_; 248 input=input_; 249 250 overwrite=overwrite_; 251 append=append_; 252 allowSubprocess=allowSubprocess_; 253 ordered=ordered_; 254 amino="faa".equals(rawExtension()); 255 256 // interleaved=interleaved_; 257 // maxReads=write() ? -1 : maxReads_; 258 259 assert(forceFileRead || !unknownFormat()) : "Unknown file format for "+fname+"\n"+ 260 mode_+", "+defaultFormat+", "+overrideFormat+", "+overrideCompression+", "+allowSubprocess_+", "+allowFileRead+", "+overwrite_; 261 assert(!unknownCompression()) : "Unknown compression for "+fname+"\n"+ 262 mode_+", "+defaultFormat+", "+overrideFormat+", "+overrideCompression+", "+allowSubprocess_+", "+allowFileRead+", "+overwrite_; 263 assert(!unknownType()) : "Unknown stream type for "+fname+"\n"+ 264 mode_+", "+defaultFormat+", "+overrideFormat+", "+overrideCompression+", "+allowSubprocess_+", "+allowFileRead+", "+overwrite_; 265 assert(!unknownMode()) : "Unknown I/O mode for "+fname+"\n"+ 266 mode_+", "+defaultFormat+", "+overrideFormat+", "+overrideCompression+", "+allowSubprocess_+", "+allowFileRead+", "+overwrite_; 267 } 268 269 /*--------------------------------------------------------------*/ 270 /*---------------- Methods ----------------*/ 271 /*--------------------------------------------------------------*/ 272 273 @Override toString()274 public String toString(){ 275 StringBuilder sb=new StringBuilder(); 276 sb.append(name).append(','); 277 
sb.append(format+"("+FORMAT_ARRAY[format]+")").append(','); 278 sb.append(compression+"("+COMPRESSION_ARRAY[compression]+")").append(','); 279 sb.append(type+"("+TYPE_ARRAY[type]+")").append(','); 280 sb.append(interleaving+"("+INTERLEAVING_ARRAY[interleaving]+")").append(','); 281 // sb.append("ascii"+asciiOffset).append(','); 282 sb.append(mode+"("+MODE_ARRAY[mode]+")").append(','); 283 sb.append("ow="+(overwrite ? "t" : "f")).append(','); 284 sb.append("app="+(append ? "t" : "f")).append(','); 285 sb.append("sub="+(allowSubprocess ? "t" : "f")).append(','); 286 sb.append("ordered="+(ordered ? "t" : "f")); 287 return sb.toString(); 288 } 289 toString(int[] vector)290 public static String toString(int[] vector){ 291 int format=vector[0], compression=vector[1], type=vector[2], interleaving=vector[3]; 292 StringBuilder sb=new StringBuilder(); 293 sb.append(format+"("+FORMAT_ARRAY[format]+")").append(','); 294 sb.append(compression+"("+COMPRESSION_ARRAY[compression]+")").append(','); 295 sb.append(type+"("+TYPE_ARRAY[type]+")").append(','); 296 sb.append(interleaving+"("+INTERLEAVING_ARRAY[interleaving]+")"); 297 return sb.toString(); 298 } 299 300 301 /*--------------------------------------------------------------*/ 302 /*---------------- Static Methods ----------------*/ 303 /*--------------------------------------------------------------*/ 304 305 /** Returns an int array: {format, compression, type, interleaved, quality, length} */ testFormat(String fname, boolean allowFileRead, boolean forceFileRead)306 public static final int[] testFormat(String fname, boolean allowFileRead, boolean forceFileRead){ 307 if(verbose){System.err.println("testFormat("+fname+", "+allowFileRead+", "+forceFileRead+")");} 308 final int[] r=new int[] {UNKNOWN, RAW, FILE, UNKNOWN, -1, -1}; 309 if(fname==null || fname.length()<1){ 310 r[2]=STDIO; 311 return r; 312 } 313 String slc=fname.trim().toLowerCase(); 314 if(slc.indexOf('/')<0){slc=slc.substring(slc.lastIndexOf('/')+1);} 315 String 
comp=ReadWrite.compressionType(slc); 316 String ext=ReadWrite.rawExtension(slc); 317 318 if(ext==null){} 319 else if(ext.equals("fq") || ext.equals("fastq") || (comp!=null && comp.equals("fqz"))){r[0]=FASTQ;} 320 else if(isFasta(ext)){r[0]=FASTA;} 321 else if(/*ext.equals("txt") || */ext.equals("bread")){r[0]=BREAD;} 322 else if(ext.equals("sam")){r[0]=SAM;} 323 else if(ext.equals("csfasta")){r[0]=CSFASTA;} 324 else if(ext.equals("qual")){r[0]=QUAL;} 325 else if(ext.equals("bam")){r[0]=BAM;} 326 else if(ext.equals("sites") || ext.equals("sitesonly")){r[0]=SITES;} 327 else if(ext.equals("info") || ext.equals("attachment")){r[0]=ATTACHMENT;} 328 else if(ext.equals("scarf")){r[0]=SCARF;} 329 else if(ext.equals("phylip")){r[0]=PHYLIP;} 330 else if(ext.equals("header") || ext.equals("headers")){r[0]=HEADER;} 331 else if(ext.equals("int1d")){r[0]=INT1D;} 332 else if(ext.equals("long1d")){r[0]=LONG1D;} 333 else if(ext.equals("bitset")){r[0]=BITSET;} 334 else if(ext.equals("sketch")){r[0]=SKETCH;} 335 else if(ext.equals("oneline") || ext.equals("flat")){r[0]=ONELINE;} 336 else if(ext.equals("fastr") || ext.equals("fr")){r[0]=FASTR;} 337 else if(ext.equals("vcf")){r[0]=VCF;} 338 else if(ext.equals("var")){r[0]=VAR;} 339 else if(ext.equals("gff") || ext.equals("gff3")){r[0]=GFF;} 340 else if(ext.equals("bed")){r[0]=BED;} 341 else if(ext.equals("pgm") || ext.equals("pkm")){r[0]=PGM;} 342 else if(ext.equals("embl")){r[0]=EMBL;} 343 else if(ext.equals("gbk")){r[0]=GBK;} 344 else if(ext.equals("gbff")){r[0]=GBFF;} 345 else if(ext.equals("alm")){r[0]=ALM;} 346 347 if(comp!=null){ 348 r[1]=Tools.find(comp, COMPRESSION_ARRAY); 349 assert(r[1]>0) : "Unhandled compression type: "+comp; 350 } 351 352 // if(r[1]==GZIP && allowFileRead){ 353 // //Check magic number, perhaps 354 // } 355 356 if(slc.length()>2 && slc.charAt(0)=='s' && slc.charAt(1)=='t'){ 357 if(slc.equals("stdin") || slc.startsWith("stdin.") || slc.equals("standardin")){r[2]=STDIO;} 358 else if(slc.equals("stdout") || 
slc.startsWith("stdout.") || slc.equals("standardout")){r[2]=STDIO;} 359 }else if("/dev/null".equalsIgnoreCase(slc)){ 360 r[2]=DEVNULL; 361 } 362 363 if(verbose){System.err.println("Before reading: \t"+r[0]+", "+toString(r)+", "+forceFileRead+", "+(r[0]!=BAM));} 364 if(r[0]==UNKNOWN || (r[0]!=BAM && forceFileRead) || 365 ((r[0]==FASTQ || r[0]==FASTA) && r[3]==UNKNOWN && allowFileRead && !stream.FASTQ.FORCE_INTERLEAVED && stream.FASTQ.TEST_INTERLEAVED)){ 366 File f=(allowFileRead && r[2]==FILE ? new File(fname) : null); 367 if(f!=null && f.exists() && !f.isDirectory()){ 368 // //a: {quality, interleaved, length, format} 369 // //r: {format, compression, type, interleaved, quality, length} 370 try { 371 int[] a=testInterleavedAndQuality(fname, false); 372 if(a!=null){ 373 final int aq=a[0], ai=a[1], al=a[2], af=a[3]; 374 if(aq>-1){r[4]=aq;} 375 if(ai!=UNKNOWN){r[3]=ai;} 376 if(af!=UNKNOWN && (af!=BREAD || (r[0]!=HEADER && r[0]!=TEXT))){r[0]=af;} 377 if(al>1 && r[5]==-1){r[5]=al;} 378 } 379 } catch (Exception e) { 380 // TODO Auto-generated catch block 381 e.printStackTrace(); 382 } 383 384 if(verbose){System.err.println("After reading: \t"+r[0]+", "+toString(r)+", "+forceFileRead+", "+(r[0]!=BAM));} 385 }else if(r[0]==UNKNOWN){ 386 if(fname.equals("sequential")){r[0]=SEQUENTIAL;} 387 else if(fname.equals("random")){r[0]=RANDOM;} 388 else if(fname.equals("sitesonly")){r[0]=SITES;} 389 } 390 } 391 392 if(r[3]==UNKNOWN && (r[0]==FASTQ || r[0]==FASTA)){ 393 if(stream.FASTQ.FORCE_INTERLEAVED){r[3]=2;} 394 else{r[3]=1;} 395 } 396 // assert(false) : Arrays.toString(r); 397 398 if(r[2]==STDIO && allowFileRead){ 399 File f=new File(fname); 400 if(f.exists() && !f.isDirectory()){r[2]=FILE;} 401 } 402 if(verbose){System.err.println("testFormat return:\t"+r[0]+", "+toString(r)+", "+forceFileRead+", "+(r[0]!=BAM)+", "+r[4]);} 403 return r; 404 } 405 hasFastaExtension(String fname)406 public static boolean hasFastaExtension(String fname){ 407 int[] r=testFormat(fname, false, 
false); 408 return r[0]==FA; 409 } 410 hasFastqExtension(String fname)411 public static boolean hasFastqExtension(String fname){ 412 int[] r=testFormat(fname, false, false); 413 return r[0]==FQ; 414 } 415 hasFastqOrFastqExtension(String fname)416 public static boolean hasFastqOrFastqExtension(String fname){ 417 int[] r=testFormat(fname, false, false); 418 return r[0]==FQ || r[0]==FA; 419 } 420 hasSamOrBamExtension(String fname)421 public static boolean hasSamOrBamExtension(String fname){ 422 int[] r=testFormat(fname, false, false); 423 return r[0]==SAM || r[0]==BAM; 424 } 425 426 /*--------------------------------------------------------------*/ 427 /*---------------- ??????? ----------------*/ 428 /*--------------------------------------------------------------*/ 429 430 /** 431 * @param fname File to read 432 * @return {quality, interleaved, length, format} 433 */ testInterleavedAndQuality(String fname, boolean forceFastq)434 public static int[] testInterleavedAndQuality(String fname, boolean forceFastq){ 435 final ArrayList<String> oct=getFirstOctet(fname); 436 return testInterleavedAndQuality(oct, fname, forceFastq); 437 } 438 getFirstOctet(String fname)439 public static ArrayList<String> getFirstOctet(String fname){ 440 if(fname==null){return null;} 441 if(fname.equalsIgnoreCase("stdin") || fname.toLowerCase().startsWith("stdin.")){return null;} 442 443 ArrayList<String> oct=new ArrayList<String>(8); 444 445 { 446 InputStream is=ReadWrite.getInputStream(fname, false, fname.toLowerCase().endsWith(".bz2")); 447 BufferedReader br=new BufferedReader(new InputStreamReader(is)); 448 try { 449 int cntr=0; 450 for(String s=br.readLine(); s!=null && cntr<8; s=br.readLine()){ 451 oct.add(s); 452 cntr++; 453 } 454 } catch (IOException e) { 455 // TODO Auto-generated catch block 456 e.printStackTrace(); 457 } 458 ReadWrite.finishReading(is, fname, true, br); 459 } 460 return oct; 461 } 462 463 /** 464 * @param oct First 8 lines of file 465 * @param fname File to read 466 
* @return {quality, interleaved, length, format} 467 */ testInterleavedAndQuality(final ArrayList<String> oct, String fname, boolean forceFastq)468 public static int[] testInterleavedAndQuality(final ArrayList<String> oct, String fname, boolean forceFastq){ 469 int len=-1, format=UNKNOWN; 470 byte q=-1, i=UNKNOWN; 471 if(oct==null || oct.size()<1){ 472 return new int[] {q, i, len, format}; 473 } 474 { 475 String s1=oct.size()>0 ? oct.get(0) : ""; 476 String s2=oct.size()>1 ? oct.get(1) : ""; 477 String s3=oct.size()>2 ? oct.get(2) : ""; 478 int b1=(s1.length()>0 ? s1.charAt(0) : -1); 479 int b2=(s2.length()>0 ? s2.charAt(0) : -1); 480 int b3=(s3.length()>0 ? s3.charAt(0) : -1); 481 482 if(b1=='>'){format=FA;} 483 else if(b1=='@'){ 484 if(b3=='+'){format=FQ;} 485 else if(b2<0 || b2=='@'){format=SAM;} 486 else{format=UNKNOWN;} //probably a truncated fastq file? 487 }else if(b1=='#'){ 488 if(s1.startsWith("#SZ:") || s1.startsWith("#SIZE:")){ 489 format=SKETCH; 490 int x1=s1.indexOf(':'); 491 int x2=s1.indexOf('\t'); 492 if(x2>x1){ 493 try { 494 len=Integer.parseInt(s1.substring(x1+1, x2)); 495 } catch (NumberFormatException e) {} 496 } 497 }else if(s1.startsWith("#FASTR") || s1.startsWith("#FR")){ 498 format=FASTR; 499 if(s1.endsWith("\tINT")){i=INTERLEAVED;} 500 else{i=SINGLE;} 501 }else if(s1.startsWith("##fileformat=VCF")){ 502 format=VCF; 503 }else if(s1.startsWith("#fileformat\tVar_")){ 504 format=VAR; 505 }else if(s1.startsWith("##gff-version")){ 506 format=GFF; 507 }else if(s1.startsWith("LOCUS ")){ 508 format=GBFF; 509 }else{format=TEXT;} 510 } 511 // else{format=BREAD;} //or possibly scarf 512 513 if(format!=FQ){len=-1;} 514 } 515 516 if(format==FQ || forceFastq){ 517 boolean oldDQ=stream.FASTQ.DETECT_QUALITY; 518 byte oldQin=stream.FASTQ.ASCII_OFFSET; 519 byte oldQout=stream.FASTQ.ASCII_OFFSET_OUT; 520 stream.FASTQ.DETECT_QUALITY=true; 521 q=stream.FASTQ.testQuality(oct); 522 i=(byte)(stream.FASTQ.testInterleaved(oct, fname, false) ? 
INTERLEAVED : SINGLE); 523 // stream.FASTQ.DETECT_QUALITY=old; 524 { 525 String a=oct.size()>1 ? oct.get(1) : null; 526 String b=oct.size()>5 ? oct.get(5) : null; 527 if(a!=null){len=Tools.max(a.length(), len);} 528 if(b!=null){len=Tools.max(b.length(), len);} 529 if(len<2){len=-1;} 530 } 531 stream.FASTQ.DETECT_QUALITY=oldDQ; 532 stream.FASTQ.ASCII_OFFSET=oldQin; 533 stream.FASTQ.ASCII_OFFSET_OUT=oldQout; 534 } 535 int[] r=new int[] {q, i, len, format}; 536 if(verbose){System.err.println(Arrays.toString(r));} 537 return r; 538 } 539 isFasta(String ext)540 public static boolean isFasta(String ext){ 541 if(ext==null){return false;} 542 return (ext.equals("fa") || ext.equals("fasta") || ext.equals("fas") || ext.equals("fna") || ext.equals("ffn") 543 || ext.equals("frn") || ext.equals("seq") || ext.equals("fsa") || ext.equals("faa")); 544 } 545 isFastaFile(String fname)546 public static boolean isFastaFile(String fname){ 547 if(fname==null){return false;} 548 String ext=ReadWrite.rawExtension(fname); 549 return isFasta(ext); 550 } 551 isPgmFile(String fname)552 public static boolean isPgmFile(String fname){ 553 if(fname==null){return false;} 554 String ext=ReadWrite.rawExtension(fname); 555 return isPgm(ext); 556 } 557 isAmino(String ext)558 public static boolean isAmino(String ext){ 559 if(ext==null){return false;} 560 return ext.equals("faa"); //TODO: Investigate whether other extensions imply AA. 
561 } 562 isStdio(String s)563 public static boolean isStdio(String s){ 564 if(s==null){return false;} 565 if(new File(s).exists()){return false;} 566 if(s.contains(".")){s=s.substring(0, s.indexOf('.')); 567 } 568 return (s.equalsIgnoreCase("stdin") || s.equalsIgnoreCase("stdout") || s.equalsIgnoreCase("stderr")); 569 } 570 isFastq(String ext)571 public static boolean isFastq(String ext){ 572 if(ext==null){return false;} 573 return (ext.equals("fq") || ext.equals("fastq")); 574 } 575 isPgm(String ext)576 public static boolean isPgm(String ext){ 577 if(ext==null){return false;} 578 return (ext.equals("pgm") || ext.equals("pkm")); 579 } 580 isFastqFile(String fname)581 public static boolean isFastqFile(String fname){ 582 if(fname==null){return false;} 583 String ext=ReadWrite.rawExtension(fname); 584 return isFastq(ext); 585 } 586 isSamOrBam(String ext)587 public static boolean isSamOrBam(String ext){ 588 if(ext==null){return false;} 589 return (ext.equals("sam") || ext.equals("bam")); 590 } 591 isSamOrBamFile(String fname)592 public static boolean isSamOrBamFile(String fname){ 593 if(fname==null){return false;} 594 String ext=ReadWrite.rawExtension(fname); 595 return isSamOrBam(ext); 596 } 597 isBam(String ext)598 public static boolean isBam(String ext){ 599 if(ext==null){return false;} 600 return ext.equals("bam"); 601 } 602 isBamFile(String fname)603 public static boolean isBamFile(String fname){ 604 if(fname==null){return false;} 605 String ext=ReadWrite.rawExtension(fname); 606 return isBam(ext); 607 } 608 deleteIfPresent()609 public void deleteIfPresent() { 610 File f=new File(name); 611 if(f.exists()){f.delete();} 612 } 613 614 /*--------------------------------------------------------------*/ 615 /*---------------- Getters ----------------*/ 616 /*--------------------------------------------------------------*/ 617 rawExtension()618 public String rawExtension() { 619 return ReadWrite.rawExtension(name); 620 } rawExtensionCode()621 public int 
rawExtensionCode() { 622 String ext=ReadWrite.rawExtension(name); 623 String comp=ReadWrite.compressionType(name); 624 return rawExtensionCode(ext, comp); 625 } rawExtensionCode(String ext, String comp)626 private int rawExtensionCode(String ext, String comp) { 627 if(ext==null){return UNKNOWN;} 628 else if(ext.equals("fq") || ext.equals("fastq") || (comp!=null && comp.equals("fqz"))){return FASTQ;} 629 else if(isFasta(ext)){return FASTA;} 630 else if(ext.equals("bread")){return BREAD;} 631 else if(ext.equals("sam")){return SAM;} 632 else if(ext.equals("csfasta")){return CSFASTA;} 633 else if(ext.equals("qual")){return QUAL;} 634 else if(ext.equals("bam")){return BAM;} 635 else if(ext.equals("sites") || ext.equals("sitesonly")){return SITES;} 636 else if(ext.equals("info") || ext.equals("attachment")){return ATTACHMENT;} 637 else if(ext.equals("scarf")){return SCARF;} 638 else if(ext.equals("phylip")){return PHYLIP;} 639 else if(ext.equals("header") || ext.equals("headers")){return HEADER;} 640 else if(ext.equals("int1d")){return INT1D;} 641 else if(ext.equals("long1d")){return LONG1D;} 642 else if(ext.equals("bitset")){return BITSET;} 643 else if(ext.equals("sketch")){return SKETCH;} 644 else if(ext.equals("oneline") || ext.equals("flat")){return ONELINE;} 645 else if(ext.equals("fastr") || ext.equals("fr")){return FASTR;} 646 else if(ext.equals("vcf")){return VCF;} 647 else if(ext.equals("var")){return VAR;} 648 else if(ext.equals("gff") || ext.equals("gff3")){return GFF;} 649 else if(ext.equals("bed")){return BED;} 650 else if(ext.equals("pgm") || ext.equals("pkm")){return PGM;} 651 else if(ext.equals("embl")){return EMBL;} 652 else if(ext.equals("gbk")){return GBK;} 653 else if(ext.equals("gbff")){return GBFF;} 654 else if(ext.equals("txt") || ext.equals("text") || ext.equals("tsv") || ext.equals("csv")){return TXT;} 655 return UNKNOWN; 656 } 657 name()658 public final String name(){return name;} simpleName()659 public final String simpleName(){return 
simpleName;} format()660 public final int format(){return format;} compression()661 public final int compression(){return compression;} type()662 public final int type(){return type;} mode()663 public final int mode(){return mode;} amino()664 public final boolean amino(){return amino;} hasName()665 public final boolean hasName(){return name!=null;} asciiOffset()666 public final int asciiOffset(){return asciiOffset;} length()667 public final int length(){return length;} 668 canWrite()669 public final boolean canWrite(){ 670 assert(write()); 671 if(stdio() || devnull()){return true;} 672 assert(hasName()); 673 File f=new File(name); 674 if(!f.exists()){return true;} 675 if(!f.canWrite()){return false;} 676 return overwrite() || append(); 677 } 678 canRead()679 public final boolean canRead(){ 680 assert(read()); 681 if(stdio()){return true;} 682 assert(hasName()); 683 File f=new File(name); 684 return f.canRead(); 685 } 686 unknownField()687 public final boolean unknownField(){return unknownFormat() || unknownCompression() || unknownType() || unknownMode();} 688 unknownFormat()689 public final boolean unknownFormat(){return format<=UNKNOWN;} fasta()690 public final boolean fasta(){return format==FASTA;} fastq()691 public final boolean fastq(){return format==FASTQ;} fastr()692 public final boolean fastr(){return format==FASTR;} bread()693 public final boolean bread(){return format==BREAD;} sam()694 public final boolean sam(){return format==SAM;} samOrBam()695 public final boolean samOrBam(){return format==SAM || format==BAM;} csfasta()696 public final boolean csfasta(){return format==CSFASTA;} qual()697 public final boolean qual(){return format==QUAL;} sequential()698 public final boolean sequential(){return format==SEQUENTIAL;} random()699 public final boolean random(){return format==RANDOM;} sites()700 public final boolean sites(){return format==SITES;} attachment()701 public final boolean attachment(){return format==ATTACHMENT;} header()702 public final boolean 
header(){return format==HEADER;} bam()703 public final boolean bam(){return format==BAM;} scarf()704 public final boolean scarf(){return format==SCARF;} text()705 public final boolean text(){return format==TEXT;} int1d()706 public final boolean int1d(){return format==INT1D;} long1d()707 public final boolean long1d(){return format==LONG1D;} bitset()708 public final boolean bitset(){return format==BITSET;} sketch()709 public final boolean sketch(){return format==SKETCH;} oneline()710 public final boolean oneline(){return format==ONELINE;} var()711 public final boolean var(){return format==VAR;} vcf()712 public final boolean vcf(){return format==VCF;} gff()713 public final boolean gff(){return format==GFF;} bed()714 public final boolean bed(){return format==BED;} pgm()715 public final boolean pgm(){return format==PGM;} embl()716 public final boolean embl(){return format==EMBL;} gbk()717 public final boolean gbk(){return format==GBK;} gbff()718 public final boolean gbff(){return format==GBFF;} alm()719 public final boolean alm(){return format==ALM;} 720 preferShreds()721 public final boolean preferShreds(){ 722 return preferShreds; 723 } 724 isSequence()725 public boolean isSequence() {return fasta() || fastq() || fastr() || bread() || samOrBam() || csfasta() || scarf() || header() || oneline() || gbk() || embl();} 726 unknownCompression()727 public final boolean unknownCompression(){return compression<=UNKNOWN;} raw()728 public final boolean raw(){return compression==RAW;} gzip()729 public final boolean gzip(){return compression==GZIP;} zip()730 public final boolean zip(){return compression==ZIP;} bz2()731 public final boolean bz2(){return compression==BZ2;} fqz()732 public final boolean fqz(){return compression==FQZ;} lz()733 public final boolean lz(){return compression==LZ;} xz()734 public final boolean xz(){return compression==XZ;} sevenz()735 public final boolean sevenz(){return compression==SEVENZ;} dsrc()736 public final boolean dsrc(){return compression==DSRC;} 
compressed()737 public final boolean compressed(){return compression!=RAW || format==BAM;} 738 unknownType()739 public final boolean unknownType(){return type<=UNKNOWN;} file()740 public final boolean file(){return type==FILE;} stdio()741 public final boolean stdio(){return type==STDIO;} stdin()742 public final boolean stdin(){return type==STDIO && input;} stdout()743 public final boolean stdout(){return type==STDIO && !input;} devnull()744 public final boolean devnull(){return type==DEVNULL;} 745 unknownMode()746 public final boolean unknownMode(){return mode<=UNKNOWN;} read()747 public final boolean read(){return mode==READ;} write()748 public final boolean write(){return mode==WRITE;} 749 overwrite()750 public final boolean overwrite(){return overwrite;} append()751 public final boolean append(){return append;} allowSubprocess()752 public final boolean allowSubprocess(){return allowSubprocess;} ordered()753 public final boolean ordered(){return ordered;} 754 interleaved()755 public boolean interleaved(){return interleaving==INTERLEAVED;} 756 exists()757 public final boolean exists(){ 758 if(!file()){return read();} 759 File f=new File(name); 760 if(!f.exists() && !gzip()){return false;} 761 long size=f.length(); 762 return size>10; 763 } 764 765 // public final boolean interleaved(){return interleaved;} 766 // public final long maxReads(){return maxReads;} 767 768 /*--------------------------------------------------------------*/ 769 /*---------------- Fields ----------------*/ 770 /*--------------------------------------------------------------*/ 771 772 private final String name; 773 private final String simpleName; 774 private final int format; 775 private final int asciiOffset; 776 private final int compression; 777 private final int type; 778 private final int mode; 779 private final int interleaving; 780 private final int length; 781 private final boolean input; 782 private final boolean amino; 783 784 private final boolean overwrite; 785 private final 
boolean append; 786 private final boolean allowSubprocess; 787 private final boolean ordered; 788 789 // private final int magicNumber; 790 791 public boolean preferShreds=false; 792 // private final long maxReads; 793 794 /*--------------------------------------------------------------*/ 795 /*---------------- Statics ----------------*/ 796 /*--------------------------------------------------------------*/ 797 798 public static boolean verbose=false; 799 public static boolean PRINT_WARNING=true; 800 801 /*--------------------------------------------------------------*/ 802 /*---------------- Constants ----------------*/ 803 /*--------------------------------------------------------------*/ 804 805 public static final int UNKNOWN=0; 806 807 /* Format */ 808 809 public static final int FA=1, FASTA=1; 810 public static final int FQ=2, FASTQ=2; 811 public static final int BREAD=3; 812 public static final int SAM=4; 813 public static final int CSFASTA=5; 814 public static final int QUAL=6; 815 public static final int SEQUENTIAL=7; 816 public static final int RANDOM=8; 817 public static final int SITES=9; 818 public static final int ATTACHMENT=10; 819 public static final int BAM=11; 820 public static final int SCARF=12; 821 public static final int TEXT=13, TXT=13; 822 public static final int PHYLIP=14; 823 public static final int HEADER=15; 824 public static final int INT1D=16; 825 public static final int LONG1D=17; 826 public static final int BITSET=18; 827 public static final int SKETCH=19; 828 public static final int ONELINE=20; 829 public static final int FR=21, FASTR=21; 830 public static final int VCF=22; 831 public static final int VAR=23; 832 public static final int GFF=24; 833 public static final int BED=25; 834 public static final int PGM=26, PKM=26; 835 public static final int EMBL=27; 836 public static final int GBK=28; 837 public static final int GBFF=29;//TODO: this may be the same as GBK... 
838 //Alignment Model, from Consensus package 839 public static final int ALM=30; 840 841 public static final String[] FORMAT_ARRAY=new String[] { 842 "unknown", "fasta", "fastq", "bread", "sam", "csfasta", 843 "qual", "sequential", "random", "sites", "attachment", 844 "bam", "scarf", "text", "phylip", "header", "int1d", 845 "long1d", "bitset", "sketch", "oneline", "fastr", 846 "vcf", "var", "gff", "bed", "pgm", "embl", "gbk", "gbff", "alm" 847 }; 848 849 public static final String[] EXTENSION_LIST=new String[] { 850 "fq", "fastq", "fa", "fasta", "fas", "fna", 851 "ffn", "frn", "seq", "fsa", "faa", 852 "bread", "sam", "csfasta", "qual", "bam", 853 "scarf", "phylip", "txt", 854 "gz", "gzip", "bz2", "zip", "xz", "dsrc", "header", "headers", 855 "int1d", "long1d", "bitset", "sketch", "oneline", "flat", "fqz", 856 "gff", "gff3", "var", "vcf", "bed", "pgm", "embl", "gbk", "gbff", "alm" 857 }; 858 859 /* Compression */ 860 861 public static final int RAW=1; 862 public static final int GZ=2, GZIP=2; 863 public static final int ZIP=3; 864 public static final int BZ2=4; 865 public static final int XZ=5; 866 public static final int c4=6; 867 public static final int SEVENZ=7; 868 public static final int DSRC=8; 869 public static final int FQZ=9; 870 public static final int LZ=10; 871 public static final int AC=11; 872 873 public static final String[] COMPRESSION_ARRAY=new String[] { 874 "unknown", "raw", "gz", "zip", "bz2", "xz", 875 "c4", "7z", "dsrc", "fqz", "lz", "ac" 876 }; 877 878 /* Type */ 879 880 public static final int FILE=1; 881 public static final int STDIO=2, STDIN=2, STDOUT=2; 882 public static final int DEVNULL=3; 883 // public static final int NULL=4; 884 885 private static final String[] TYPE_ARRAY=new String[] { 886 "unknown", "file", "stdio", "devnull" 887 }; 888 889 /* Mode */ 890 891 public static final int READ=1, WRITE=2; 892 893 private static final String[] MODE_ARRAY=new String[] { 894 "unknown", "read", "write" 895 }; 896 897 /* Interleaving */ 898 
/** Interleaving code for single-ended data. */
	public static final int SINGLE=1;
	/** Interleaving code for interleaved data; tested by interleaved(). */
	public static final int INTERLEAVED=2;

	/** Display names of the interleaving codes; the array index is the interleaving code (0 is "unknown"). */
	private static final String[] INTERLEAVING_ARRAY={
		"unknown",
		"single-ended", "interleaved"
	};

}