1 package fileIO;
2 
3 import java.io.BufferedReader;
4 import java.io.File;
5 import java.io.IOException;
6 import java.io.InputStream;
7 import java.io.InputStreamReader;
8 import java.util.ArrayList;
9 import java.util.Arrays;
10 import java.util.List;
11 
12 import jgi.TestFormat;
13 import shared.Parse;
14 import shared.PreParser;
15 import shared.Tools;
16 
17 /**
18  * This class contains metadata about a file
19  * @author Brian Bushnell
20  * @date Dec 19, 2012
21  *
22  */
23 public final class FileFormat {
24 
main(String[] args)25 	public static void main(String[] args){
26 
27 		{//Preparse block for help, config files, and outstream
28 			PreParser pp=new PreParser(args, null /*new Object() { }.getClass().getEnclosingClass()*/, false);
29 			args=pp.args;
30 			//outstream=pp.outstream;
31 		}
32 
33 		stream.FASTQ.warnQualityChange=false;
34 		PRINT_WARNING=false;
35 		boolean full=false;
36 		ArrayList<String> files=new ArrayList<String>();
37 		for(int i=0; i<args.length; i++){
38 
39 			final String arg=args[i];
40 			String[] split=arg.split("=");
41 			String a=split[0].toLowerCase();
42 			String b=split.length>1 ? split[1] : null;
43 
44 			if(a.equals("verbose")){
45 				verbose=Parse.parseBoolean(b);
46 			}else if(a.equals("full")){
47 				full=Parse.parseBoolean(b);
48 			}else if(b!=null){
49 //				assert(a.startsWith("in")) : "Unknown parameter "+arg;
50 				if(a.startsWith("in")){files.add(b);}
51 			}else{
52 				files.add(arg);
53 			}
54 		}
55 
56 		if(full){
57 			TestFormat.main(args);
58 		}else{
59 			for(String fname : files){
60 				test(fname, true);
61 			}
62 		}
63 
64 	}
65 
test(String fname, boolean forceFileRead)66 	private static void test(String fname, boolean forceFileRead){
67 		FileFormat ffName=testInput(fname, FASTQ, null, false, false, false);
68 		FileFormat ffContent=testInput(fname, ffName.format(), null, false, true, true);
69 		FileFormat ff=ffContent;
70 //		assert(false) : ffName+"\n"+ffContent;
71 		if(ff==null){
72 			System.out.println("null");
73 		}else{
74 			int q=33;
75 			int len=-1;
76 			boolean i=false;
77 			if(ff.fastq()){
78 				byte qold=stream.FASTQ.ASCII_OFFSET;
79 				stream.FASTQ.ASCII_OFFSET=33;
80 				int[] qi=testInterleavedAndQuality(fname, false);
81 				q=qi[0];
82 				i=(qi[1]==INTERLEAVED);
83 				len=qi[2];
84 				stream.FASTQ.ASCII_OFFSET=qold;
85 			}else if(ff.fasta()){
86 				i=stream.FASTQ.testInterleavedFasta(fname, false);
87 			}
88 			if(ff.isSequence()){
89 				String qs=(q==33 ? "sanger" : q==64 ? "illumina" : ""+q);
90 				System.out.print(qs+"\t"+FORMAT_ARRAY[ff.format()]+"\t"+COMPRESSION_ARRAY[ff.compression()]);
91 				System.out.print("\t"+(i ? "interleaved" : "single-ended"));
92 				if(len>0){System.out.print("\t"+len+"bp");}
93 			}else{
94 				System.out.print(FORMAT_ARRAY[ff.format()]+"\t"+COMPRESSION_ARRAY[ff.compression()]);
95 			}
96 			if(ffName.format()!=ff.format()){System.out.print("\t"+FORMAT_ARRAY[ffName.format()]+"\t(File extension differs from contents)");}
97 			System.out.println();
98 		}
99 	}
100 
101 	/*--------------------------------------------------------------*/
102 	/*----------------        Initialization        ----------------*/
103 	/*--------------------------------------------------------------*/
104 
testInput(String fname, String overrideExtension, boolean allowSubprocess)105 	public static FileFormat testInput(String fname, String overrideExtension, boolean allowSubprocess){
106 		if(verbose){System.err.println("testInputA("+fname+", "+overrideExtension+", "+allowSubprocess+")");}
107 		return testInput(fname, FASTQ, overrideExtension, allowSubprocess, true);
108 	}
109 
testInputList(List<String> fname, int defaultFormat, String overrideExtension, boolean allowSubprocess, boolean allowFileRead)110 	public static FileFormat[] testInputList(List<String> fname, int defaultFormat, String overrideExtension, boolean allowSubprocess, boolean allowFileRead){
111 		if(verbose){System.err.println("testInputList("+fname+", "+defaultFormat+", "+overrideExtension+", "+allowSubprocess+", "+allowFileRead+")");}
112 		FileFormat[] ffa=new FileFormat[fname.size()];
113 		for(int i=0; i<fname.size(); i++){
114 			ffa[i]=testInput(fname.get(i), defaultFormat, overrideExtension, allowSubprocess, allowFileRead, false);
115 		}
116 		return ffa;
117 	}
118 
testInput(String fnames[], int defaultFormat, String overrideExtension, boolean allowSubprocess, boolean allowFileRead)119 	public static FileFormat[] testInput(String fnames[], int defaultFormat, String overrideExtension, boolean allowSubprocess, boolean allowFileRead){
120 		FileFormat[] array=new FileFormat[fnames.length];
121 		for(int i=0; i<fnames.length; i++){
122 			array[i]=testInput(fnames[i], defaultFormat, overrideExtension, allowSubprocess, allowFileRead);
123 		}
124 		return array;
125 	}
126 
testInput(String fname, int defaultFormat, String overrideExtension, boolean allowSubprocess, boolean allowFileRead)127 	public static FileFormat testInput(String fname, int defaultFormat, String overrideExtension, boolean allowSubprocess, boolean allowFileRead){
128 		if(verbose){System.err.println("testInputB("+fname+", "+defaultFormat+", "+overrideExtension+", "+allowSubprocess+", "+allowFileRead+")");}
129 		return testInput(fname, defaultFormat, overrideExtension, allowSubprocess, allowFileRead, false);
130 	}
131 
testInput(String fname, int defaultFormat, String overrideExtension, boolean allowSubprocess, boolean allowFileRead, boolean forceFileRead)132 	public static FileFormat testInput(String fname, int defaultFormat, String overrideExtension, boolean allowSubprocess, boolean allowFileRead, boolean forceFileRead){
133 		if(verbose){System.err.println("testInputC("+fname+", "+defaultFormat+", "+overrideExtension+", "+allowSubprocess+", "+allowFileRead+", "+forceFileRead+")");}
134 		if(fname==null){return null;}
135 		int overrideFormat=0;
136 		int overrideCompression=0;
137 		if(overrideExtension!=null && overrideExtension.length()>0){
138 			int[] a=testFormat(overrideExtension, false, false);
139 			if(a!=null){
140 				overrideFormat=a[0];
141 				if(a[1]!=RAW){overrideCompression=a[1];}
142 			}
143 		}
144 		return testInput(fname, defaultFormat, overrideFormat, overrideCompression, allowSubprocess, allowFileRead, forceFileRead);
145 	}
146 
147 	/**
148 	 * Create an input FileFormat object for this filename.
149 	 * @param fname Filename (path).
150 	 * @param defaultFormat Use this format if the name is unclear and the format is not autodetected.
151 	 * @param overrideFormat If specified, ignore the file extension and autodetection and input using this format.
152 	 * @param overrideCompression If specified, ignore the file extension and input using this compression protocol.
153 	 * @param allowSubprocess Permission to spawn a subprocess like bgzip.
154 	 * @param allowFileRead Permission to read the file while constructing this FileFormat, for the purpose of format detection.
155 	 * @param forceFileRead Force reading the file while constructing this FileFormat, for the purpose of format detection.
156 	 * @return A FileFormat, or null if the filename is null.
157 	 */
testInput(String fname, int defaultFormat, int overrideFormat, int overrideCompression, boolean allowSubprocess, boolean allowFileRead, boolean forceFileRead)158 	public static FileFormat testInput(String fname, int defaultFormat, int overrideFormat,
159 			int overrideCompression, boolean allowSubprocess, boolean allowFileRead, boolean forceFileRead){
160 		if(verbose){System.err.println("testInputD("+fname+", "+defaultFormat+", "+overrideFormat+", "+overrideCompression+", "+allowSubprocess+", "+allowFileRead+", "+forceFileRead+")");}
161 		if(fname==null){return null;}
162 		return new FileFormat(fname, READ, defaultFormat, overrideFormat, overrideCompression, allowSubprocess, allowFileRead, forceFileRead, false, false, false, true);
163 	}
164 
165 	/**
166 	 * Create an output FileFormat object for this filename.
167 	 * @param fname Filename (path).
168 	 * @param defaultFormat Use this format if the name is unclear.
169 	 * @param overrideExtension If specified, ignore the file extension and output in this format.
170 	 * @param allowSubprocess Permission to spawn a subprocess like bgzip.
171 	 * @param overwrite Permission to overwrite existing files.
172 	 * @param append Permission to append to existing files.
173 	 * @param ordered True if the input order should be maintained (for multithreaded read processing).
174 	 * @return A FileFormat, or null if the filename is null.
175 	 */
testOutput(String fname, int defaultFormat, String overrideExtension, boolean allowSubprocess, boolean overwrite, boolean append, boolean ordered)176 	public static FileFormat testOutput(String fname, int defaultFormat, String overrideExtension, boolean allowSubprocess, boolean overwrite, boolean append, boolean ordered){
177 		if(fname==null){return null;}
178 		int overrideFormat=0;
179 		int overrideCompression=0;
180 		if(overrideExtension!=null && overrideExtension.length()>0){
181 			int[] a=testFormat(overrideExtension, false, false);
182 			if(a!=null){
183 				overrideFormat=a[0];
184 				if(a[1]!=RAW){overrideCompression=a[1];}
185 			}
186 		}
187 		return testOutput(fname, defaultFormat, overrideFormat, overrideCompression, allowSubprocess, overwrite, append, ordered);
188 	}
189 
testOutput(String fname, int defaultFormat, int overrideFormat, int overrideCompression, boolean allowSubprocess, boolean overwrite, boolean append, boolean ordered)190 	public static FileFormat testOutput(String fname, int defaultFormat, int overrideFormat, int overrideCompression, boolean allowSubprocess, boolean overwrite, boolean append, boolean ordered){
191 		if(fname==null){return null;}
192 		return new FileFormat(fname, WRITE, defaultFormat, overrideFormat, overrideCompression, allowSubprocess, false, false, overwrite, append, ordered, false);
193 	}
194 
195 	/*--------------------------------------------------------------*/
196 	/*----------------          Constructor         ----------------*/
197 	/*--------------------------------------------------------------*/
198 
FileFormat(String fname, int mode_, int defaultFormat, int overrideFormat, int overrideCompression, boolean allowSubprocess_, boolean allowFileRead, boolean forceFileRead, boolean overwrite_, boolean append_, boolean ordered_, boolean input_)199 	private FileFormat(String fname, int mode_, int defaultFormat, int overrideFormat, int overrideCompression, boolean allowSubprocess_,
200 			boolean allowFileRead, boolean forceFileRead, boolean overwrite_, boolean append_, boolean ordered_, boolean input_){
201 //			, boolean interleaved_, long maxReads_){
202 
203 		if(verbose){
204 //			new Exception().printStackTrace(System.err);
205 			System.err.println("FileFormat(fname="+fname+", mode="+mode_+", dFormat="+defaultFormat+", oFormat="+overrideFormat+", oCompression="+overrideCompression+
206 					", allowSub="+allowSubprocess_+", allowRead="+allowFileRead+", forceFileRead="+forceFileRead+
207 					", ow="+overwrite_+", append="+append_+", ordered="+ordered_+")");
208 		}
209 		assert(!forceFileRead || allowFileRead);
210 
211 //		assert(!overwrite_ || !append_) : "Both overwrite and append may not be set to true.";
212 		if(overwrite_ && append_){overwrite_=false;}
213 
214 		assert(fname!=null);
215 		fname=fname.trim().replace('\\', '/');
216 		assert(fname.trim().length()>0) : fname;
217 
218 		if(defaultFormat<1 && !forceFileRead){defaultFormat=FQ;}
219 		allowFileRead&=(mode_==READ);
220 		int[] a=testFormat(fname, allowFileRead, forceFileRead);
221 
222 		if(verbose){System.err.println(Arrays.toString(a));}
223 
224 		if(a[0]==UNKNOWN && overrideFormat<1){
225 			a[0]=defaultFormat;
226 			if(defaultFormat!=TEXT && PRINT_WARNING){
227 				System.err.println("Unspecified format for "+(mode_==READ ? "input" : "output")+" "+(fname==null ? "stream" : fname)+"; defaulting to "+FORMAT_ARRAY[a[0]]+".");
228 			}
229 		}
230 		if(verbose){System.err.println(Arrays.toString(a));}
231 
232 		if(overrideFormat>0){a[0]=overrideFormat;}
233 		if(overrideCompression>0){a[1]=overrideCompression;}
234 
235 		if(verbose){System.err.println(Arrays.toString(a));}
236 
237 
238 //		{format, compression, type, interleaved, quality, length}
239 		name=fname;
240 		simpleName=new File(name).getName();
241 		format=a[0];
242 		compression=a[1];
243 		type=a[2];
244 		interleaving=a[3];
245 		asciiOffset=a[4];
246 		length=a[5];
247 		mode=mode_;
248 		input=input_;
249 
250 		overwrite=overwrite_;
251 		append=append_;
252 		allowSubprocess=allowSubprocess_;
253 		ordered=ordered_;
254 		amino="faa".equals(rawExtension());
255 
256 //		interleaved=interleaved_;
257 //		maxReads=write() ? -1 : maxReads_;
258 
259 		assert(forceFileRead || !unknownFormat()) : "Unknown file format for "+fname+"\n"+
260 			mode_+", "+defaultFormat+", "+overrideFormat+", "+overrideCompression+", "+allowSubprocess_+", "+allowFileRead+", "+overwrite_;
261 		assert(!unknownCompression()) : "Unknown compression for "+fname+"\n"+
262 			mode_+", "+defaultFormat+", "+overrideFormat+", "+overrideCompression+", "+allowSubprocess_+", "+allowFileRead+", "+overwrite_;
263 		assert(!unknownType()) : "Unknown stream type for "+fname+"\n"+
264 			mode_+", "+defaultFormat+", "+overrideFormat+", "+overrideCompression+", "+allowSubprocess_+", "+allowFileRead+", "+overwrite_;
265 		assert(!unknownMode()) : "Unknown I/O mode for "+fname+"\n"+
266 			mode_+", "+defaultFormat+", "+overrideFormat+", "+overrideCompression+", "+allowSubprocess_+", "+allowFileRead+", "+overwrite_;
267 	}
268 
269 	/*--------------------------------------------------------------*/
270 	/*----------------            Methods           ----------------*/
271 	/*--------------------------------------------------------------*/
272 
273 	@Override
toString()274 	public String toString(){
275 		StringBuilder sb=new StringBuilder();
276 		sb.append(name).append(',');
277 		sb.append(format+"("+FORMAT_ARRAY[format]+")").append(',');
278 		sb.append(compression+"("+COMPRESSION_ARRAY[compression]+")").append(',');
279 		sb.append(type+"("+TYPE_ARRAY[type]+")").append(',');
280 		sb.append(interleaving+"("+INTERLEAVING_ARRAY[interleaving]+")").append(',');
281 //		sb.append("ascii"+asciiOffset).append(',');
282 		sb.append(mode+"("+MODE_ARRAY[mode]+")").append(',');
283 		sb.append("ow="+(overwrite ? "t" : "f")).append(',');
284 		sb.append("app="+(append ? "t" : "f")).append(',');
285 		sb.append("sub="+(allowSubprocess ? "t" : "f")).append(',');
286 		sb.append("ordered="+(ordered ? "t" : "f"));
287 		return sb.toString();
288 	}
289 
toString(int[] vector)290 	public static String toString(int[] vector){
291 		int format=vector[0], compression=vector[1], type=vector[2], interleaving=vector[3];
292 		StringBuilder sb=new StringBuilder();
293 		sb.append(format+"("+FORMAT_ARRAY[format]+")").append(',');
294 		sb.append(compression+"("+COMPRESSION_ARRAY[compression]+")").append(',');
295 		sb.append(type+"("+TYPE_ARRAY[type]+")").append(',');
296 		sb.append(interleaving+"("+INTERLEAVING_ARRAY[interleaving]+")");
297 		return sb.toString();
298 	}
299 
300 
301 	/*--------------------------------------------------------------*/
302 	/*----------------        Static Methods        ----------------*/
303 	/*--------------------------------------------------------------*/
304 
305 	/** Returns an int array: {format, compression, type, interleaved, quality, length} */
testFormat(String fname, boolean allowFileRead, boolean forceFileRead)306 	public static final int[] testFormat(String fname, boolean allowFileRead, boolean forceFileRead){
307 		if(verbose){System.err.println("testFormat("+fname+", "+allowFileRead+", "+forceFileRead+")");}
308 		final int[] r=new int[] {UNKNOWN, RAW, FILE, UNKNOWN, -1, -1};
309 		if(fname==null || fname.length()<1){
310 			r[2]=STDIO;
311 			return r;
312 		}
313 		String slc=fname.trim().toLowerCase();
314 		if(slc.indexOf('/')<0){slc=slc.substring(slc.lastIndexOf('/')+1);}
315 		String comp=ReadWrite.compressionType(slc);
316 		String ext=ReadWrite.rawExtension(slc);
317 
318 		if(ext==null){}
319 		else if(ext.equals("fq") || ext.equals("fastq") || (comp!=null && comp.equals("fqz"))){r[0]=FASTQ;}
320 		else if(isFasta(ext)){r[0]=FASTA;}
321 		else if(/*ext.equals("txt") || */ext.equals("bread")){r[0]=BREAD;}
322 		else if(ext.equals("sam")){r[0]=SAM;}
323 		else if(ext.equals("csfasta")){r[0]=CSFASTA;}
324 		else if(ext.equals("qual")){r[0]=QUAL;}
325 		else if(ext.equals("bam")){r[0]=BAM;}
326 		else if(ext.equals("sites") || ext.equals("sitesonly")){r[0]=SITES;}
327 		else if(ext.equals("info") || ext.equals("attachment")){r[0]=ATTACHMENT;}
328 		else if(ext.equals("scarf")){r[0]=SCARF;}
329 		else if(ext.equals("phylip")){r[0]=PHYLIP;}
330 		else if(ext.equals("header") || ext.equals("headers")){r[0]=HEADER;}
331 		else if(ext.equals("int1d")){r[0]=INT1D;}
332 		else if(ext.equals("long1d")){r[0]=LONG1D;}
333 		else if(ext.equals("bitset")){r[0]=BITSET;}
334 		else if(ext.equals("sketch")){r[0]=SKETCH;}
335 		else if(ext.equals("oneline") || ext.equals("flat")){r[0]=ONELINE;}
336 		else if(ext.equals("fastr") || ext.equals("fr")){r[0]=FASTR;}
337 		else if(ext.equals("vcf")){r[0]=VCF;}
338 		else if(ext.equals("var")){r[0]=VAR;}
339 		else if(ext.equals("gff") || ext.equals("gff3")){r[0]=GFF;}
340 		else if(ext.equals("bed")){r[0]=BED;}
341 		else if(ext.equals("pgm") || ext.equals("pkm")){r[0]=PGM;}
342 		else if(ext.equals("embl")){r[0]=EMBL;}
343 		else if(ext.equals("gbk")){r[0]=GBK;}
344 		else if(ext.equals("gbff")){r[0]=GBFF;}
345 		else if(ext.equals("alm")){r[0]=ALM;}
346 
347 		if(comp!=null){
348 			r[1]=Tools.find(comp, COMPRESSION_ARRAY);
349 			assert(r[1]>0) : "Unhandled compression type: "+comp;
350 		}
351 
352 //		if(r[1]==GZIP && allowFileRead){
353 //			//Check magic number, perhaps
354 //		}
355 
356 		if(slc.length()>2 && slc.charAt(0)=='s' && slc.charAt(1)=='t'){
357 			if(slc.equals("stdin") || slc.startsWith("stdin.") || slc.equals("standardin")){r[2]=STDIO;}
358 			else if(slc.equals("stdout") || slc.startsWith("stdout.") || slc.equals("standardout")){r[2]=STDIO;}
359 		}else if("/dev/null".equalsIgnoreCase(slc)){
360 			r[2]=DEVNULL;
361 		}
362 
363 		if(verbose){System.err.println("Before reading: \t"+r[0]+", "+toString(r)+", "+forceFileRead+", "+(r[0]!=BAM));}
364 		if(r[0]==UNKNOWN || (r[0]!=BAM && forceFileRead) ||
365 				((r[0]==FASTQ || r[0]==FASTA) && r[3]==UNKNOWN && allowFileRead && !stream.FASTQ.FORCE_INTERLEAVED && stream.FASTQ.TEST_INTERLEAVED)){
366 			File f=(allowFileRead && r[2]==FILE ? new File(fname) : null);
367 			if(f!=null && f.exists() && !f.isDirectory()){
368 //				//a: {quality, interleaved, length, format}
369 //				//r: {format, compression, type, interleaved, quality, length}
370 				try {
371 					int[] a=testInterleavedAndQuality(fname, false);
372 					if(a!=null){
373 						final int aq=a[0], ai=a[1], al=a[2], af=a[3];
374 						if(aq>-1){r[4]=aq;}
375 						if(ai!=UNKNOWN){r[3]=ai;}
376 						if(af!=UNKNOWN && (af!=BREAD || (r[0]!=HEADER && r[0]!=TEXT))){r[0]=af;}
377 						if(al>1 && r[5]==-1){r[5]=al;}
378 					}
379 				} catch (Exception e) {
380 					// TODO Auto-generated catch block
381 					e.printStackTrace();
382 				}
383 
384 				if(verbose){System.err.println("After reading:   \t"+r[0]+", "+toString(r)+", "+forceFileRead+", "+(r[0]!=BAM));}
385 			}else if(r[0]==UNKNOWN){
386 				if(fname.equals("sequential")){r[0]=SEQUENTIAL;}
387 				else if(fname.equals("random")){r[0]=RANDOM;}
388 				else if(fname.equals("sitesonly")){r[0]=SITES;}
389 			}
390 		}
391 
392 		if(r[3]==UNKNOWN && (r[0]==FASTQ || r[0]==FASTA)){
393 			if(stream.FASTQ.FORCE_INTERLEAVED){r[3]=2;}
394 			else{r[3]=1;}
395 		}
396 //		assert(false) : Arrays.toString(r);
397 
398 		if(r[2]==STDIO && allowFileRead){
399 			File f=new File(fname);
400 			if(f.exists() && !f.isDirectory()){r[2]=FILE;}
401 		}
402 		if(verbose){System.err.println("testFormat return:\t"+r[0]+", "+toString(r)+", "+forceFileRead+", "+(r[0]!=BAM)+", "+r[4]);}
403 		return r;
404 	}
405 
hasFastaExtension(String fname)406 	public static boolean hasFastaExtension(String fname){
407 		int[] r=testFormat(fname, false, false);
408 		return r[0]==FA;
409 	}
410 
hasFastqExtension(String fname)411 	public static boolean hasFastqExtension(String fname){
412 		int[] r=testFormat(fname, false, false);
413 		return r[0]==FQ;
414 	}
415 
hasFastqOrFastqExtension(String fname)416 	public static boolean hasFastqOrFastqExtension(String fname){
417 		int[] r=testFormat(fname, false, false);
418 		return r[0]==FQ || r[0]==FA;
419 	}
420 
hasSamOrBamExtension(String fname)421 	public static boolean hasSamOrBamExtension(String fname){
422 		int[] r=testFormat(fname, false, false);
423 		return r[0]==SAM || r[0]==BAM;
424 	}
425 
426 	/*--------------------------------------------------------------*/
	/*----------------       Format Detection       ----------------*/
428 	/*--------------------------------------------------------------*/
429 
430 	/**
431 	 * @param fname File to read
432 	 * @return {quality, interleaved, length, format}
433 	 */
testInterleavedAndQuality(String fname, boolean forceFastq)434 	public static int[] testInterleavedAndQuality(String fname, boolean forceFastq){
435 		final ArrayList<String> oct=getFirstOctet(fname);
436 		return testInterleavedAndQuality(oct, fname, forceFastq);
437 	}
438 
getFirstOctet(String fname)439 	public static ArrayList<String> getFirstOctet(String fname){
440 		if(fname==null){return null;}
441 		if(fname.equalsIgnoreCase("stdin") || fname.toLowerCase().startsWith("stdin.")){return null;}
442 
443 		ArrayList<String> oct=new ArrayList<String>(8);
444 
445 		{
446 			InputStream is=ReadWrite.getInputStream(fname, false, fname.toLowerCase().endsWith(".bz2"));
447 			BufferedReader br=new BufferedReader(new InputStreamReader(is));
448 			try {
449 				int cntr=0;
450 				for(String s=br.readLine(); s!=null && cntr<8; s=br.readLine()){
451 					oct.add(s);
452 					cntr++;
453 				}
454 			} catch (IOException e) {
455 				// TODO Auto-generated catch block
456 				e.printStackTrace();
457 			}
458 			ReadWrite.finishReading(is, fname, true, br);
459 		}
460 		return oct;
461 	}
462 
463 	/**
464 	 * @param oct First 8 lines of file
465 	 * @param fname File to read
466 	 * @return {quality, interleaved, length, format}
467 	 */
testInterleavedAndQuality(final ArrayList<String> oct, String fname, boolean forceFastq)468 	public static int[] testInterleavedAndQuality(final ArrayList<String> oct, String fname, boolean forceFastq){
469 		int len=-1, format=UNKNOWN;
470 		byte q=-1, i=UNKNOWN;
471 		if(oct==null || oct.size()<1){
472 			return new int[] {q, i, len, format};
473 		}
474 		{
475 			String s1=oct.size()>0 ? oct.get(0) : "";
476 			String s2=oct.size()>1 ? oct.get(1) : "";
477 			String s3=oct.size()>2 ? oct.get(2) : "";
478 			int b1=(s1.length()>0 ? s1.charAt(0) : -1);
479 			int b2=(s2.length()>0 ? s2.charAt(0) : -1);
480 			int b3=(s3.length()>0 ? s3.charAt(0) : -1);
481 
482 			if(b1=='>'){format=FA;}
483 			else if(b1=='@'){
484 				if(b3=='+'){format=FQ;}
485 				else if(b2<0 || b2=='@'){format=SAM;}
486 				else{format=UNKNOWN;} //probably a truncated fastq file?
487 			}else if(b1=='#'){
488 				if(s1.startsWith("#SZ:") || s1.startsWith("#SIZE:")){
489 					format=SKETCH;
490 					int x1=s1.indexOf(':');
491 					int x2=s1.indexOf('\t');
492 					if(x2>x1){
493 						try {
494 							len=Integer.parseInt(s1.substring(x1+1, x2));
495 						} catch (NumberFormatException e) {}
496 					}
497 				}else if(s1.startsWith("#FASTR") || s1.startsWith("#FR")){
498 					format=FASTR;
499 					if(s1.endsWith("\tINT")){i=INTERLEAVED;}
500 					else{i=SINGLE;}
501 				}else if(s1.startsWith("##fileformat=VCF")){
502 					format=VCF;
503 				}else if(s1.startsWith("#fileformat\tVar_")){
504 					format=VAR;
505 				}else if(s1.startsWith("##gff-version")){
506 					format=GFF;
507 				}else if(s1.startsWith("LOCUS ")){
508 					format=GBFF;
509 				}else{format=TEXT;}
510 			}
511 //			else{format=BREAD;} //or possibly scarf
512 
513 			if(format!=FQ){len=-1;}
514 		}
515 
516 		if(format==FQ || forceFastq){
517 			boolean oldDQ=stream.FASTQ.DETECT_QUALITY;
518 			byte oldQin=stream.FASTQ.ASCII_OFFSET;
519 			byte oldQout=stream.FASTQ.ASCII_OFFSET_OUT;
520 			stream.FASTQ.DETECT_QUALITY=true;
521 			q=stream.FASTQ.testQuality(oct);
522 			i=(byte)(stream.FASTQ.testInterleaved(oct, fname, false) ? INTERLEAVED : SINGLE);
523 			//		stream.FASTQ.DETECT_QUALITY=old;
524 			{
525 				String a=oct.size()>1 ? oct.get(1) : null;
526 				String b=oct.size()>5 ? oct.get(5) : null;
527 				if(a!=null){len=Tools.max(a.length(), len);}
528 				if(b!=null){len=Tools.max(b.length(), len);}
529 				if(len<2){len=-1;}
530 			}
531 			stream.FASTQ.DETECT_QUALITY=oldDQ;
532 			stream.FASTQ.ASCII_OFFSET=oldQin;
533 			stream.FASTQ.ASCII_OFFSET_OUT=oldQout;
534 		}
535 		int[] r=new int[] {q, i, len, format};
536 		if(verbose){System.err.println(Arrays.toString(r));}
537 		return r;
538 	}
539 
isFasta(String ext)540 	public static boolean isFasta(String ext){
541 		if(ext==null){return false;}
542 		return (ext.equals("fa") || ext.equals("fasta") || ext.equals("fas") || ext.equals("fna") || ext.equals("ffn")
543 			|| ext.equals("frn") || ext.equals("seq") || ext.equals("fsa") || ext.equals("faa"));
544 	}
545 
isFastaFile(String fname)546 	public static boolean isFastaFile(String fname){
547 		if(fname==null){return false;}
548 		String ext=ReadWrite.rawExtension(fname);
549 		return isFasta(ext);
550 	}
551 
isPgmFile(String fname)552 	public static boolean isPgmFile(String fname){
553 		if(fname==null){return false;}
554 		String ext=ReadWrite.rawExtension(fname);
555 		return isPgm(ext);
556 	}
557 
isAmino(String ext)558 	public static boolean isAmino(String ext){
559 		if(ext==null){return false;}
560 		return ext.equals("faa"); //TODO: Investigate whether other extensions imply AA.
561 	}
562 
isStdio(String s)563 	public static boolean isStdio(String s){
564 		if(s==null){return false;}
565 		if(new File(s).exists()){return false;}
566 		if(s.contains(".")){s=s.substring(0, s.indexOf('.'));
567 		}
568 		return (s.equalsIgnoreCase("stdin") || s.equalsIgnoreCase("stdout") || s.equalsIgnoreCase("stderr"));
569 	}
570 
isFastq(String ext)571 	public static boolean isFastq(String ext){
572 		if(ext==null){return false;}
573 		return (ext.equals("fq") || ext.equals("fastq"));
574 	}
575 
isPgm(String ext)576 	public static boolean isPgm(String ext){
577 		if(ext==null){return false;}
578 		return (ext.equals("pgm") || ext.equals("pkm"));
579 	}
580 
isFastqFile(String fname)581 	public static boolean isFastqFile(String fname){
582 		if(fname==null){return false;}
583 		String ext=ReadWrite.rawExtension(fname);
584 		return isFastq(ext);
585 	}
586 
isSamOrBam(String ext)587 	public static boolean isSamOrBam(String ext){
588 		if(ext==null){return false;}
589 		return (ext.equals("sam") || ext.equals("bam"));
590 	}
591 
isSamOrBamFile(String fname)592 	public static boolean isSamOrBamFile(String fname){
593 		if(fname==null){return false;}
594 		String ext=ReadWrite.rawExtension(fname);
595 		return isSamOrBam(ext);
596 	}
597 
isBam(String ext)598 	public static boolean isBam(String ext){
599 		if(ext==null){return false;}
600 		return ext.equals("bam");
601 	}
602 
isBamFile(String fname)603 	public static boolean isBamFile(String fname){
604 		if(fname==null){return false;}
605 		String ext=ReadWrite.rawExtension(fname);
606 		return isBam(ext);
607 	}
608 
deleteIfPresent()609 	public void deleteIfPresent() {
610 		File f=new File(name);
611 		if(f.exists()){f.delete();}
612 	}
613 
614 	/*--------------------------------------------------------------*/
615 	/*----------------            Getters           ----------------*/
616 	/*--------------------------------------------------------------*/
617 
rawExtension()618 	public String rawExtension() {
619 		return ReadWrite.rawExtension(name);
620 	}
rawExtensionCode()621 	public int rawExtensionCode() {
622 		String ext=ReadWrite.rawExtension(name);
623 		String comp=ReadWrite.compressionType(name);
624 		return rawExtensionCode(ext, comp);
625 	}
rawExtensionCode(String ext, String comp)626 	private int rawExtensionCode(String ext, String comp) {
627 		if(ext==null){return UNKNOWN;}
628 		else if(ext.equals("fq") || ext.equals("fastq") || (comp!=null && comp.equals("fqz"))){return FASTQ;}
629 		else if(isFasta(ext)){return FASTA;}
630 		else if(ext.equals("bread")){return BREAD;}
631 		else if(ext.equals("sam")){return SAM;}
632 		else if(ext.equals("csfasta")){return CSFASTA;}
633 		else if(ext.equals("qual")){return QUAL;}
634 		else if(ext.equals("bam")){return BAM;}
635 		else if(ext.equals("sites") || ext.equals("sitesonly")){return SITES;}
636 		else if(ext.equals("info") || ext.equals("attachment")){return ATTACHMENT;}
637 		else if(ext.equals("scarf")){return SCARF;}
638 		else if(ext.equals("phylip")){return PHYLIP;}
639 		else if(ext.equals("header") || ext.equals("headers")){return HEADER;}
640 		else if(ext.equals("int1d")){return INT1D;}
641 		else if(ext.equals("long1d")){return LONG1D;}
642 		else if(ext.equals("bitset")){return BITSET;}
643 		else if(ext.equals("sketch")){return SKETCH;}
644 		else if(ext.equals("oneline") || ext.equals("flat")){return ONELINE;}
645 		else if(ext.equals("fastr") || ext.equals("fr")){return FASTR;}
646 		else if(ext.equals("vcf")){return VCF;}
647 		else if(ext.equals("var")){return VAR;}
648 		else if(ext.equals("gff") || ext.equals("gff3")){return GFF;}
649 		else if(ext.equals("bed")){return BED;}
650 		else if(ext.equals("pgm") || ext.equals("pkm")){return PGM;}
651 		else if(ext.equals("embl")){return EMBL;}
652 		else if(ext.equals("gbk")){return GBK;}
653 		else if(ext.equals("gbff")){return GBFF;}
654 		else if(ext.equals("txt") || ext.equals("text") || ext.equals("tsv") || ext.equals("csv")){return TXT;}
655 		return UNKNOWN;
656 	}
657 
name()658 	public final String name(){return name;}
simpleName()659 	public final String simpleName(){return simpleName;}
format()660 	public final int format(){return format;}
compression()661 	public final int compression(){return compression;}
type()662 	public final int type(){return type;}
mode()663 	public final int mode(){return mode;}
amino()664 	public final boolean amino(){return amino;}
hasName()665 	public final boolean hasName(){return name!=null;}
asciiOffset()666 	public final int asciiOffset(){return asciiOffset;}
length()667 	public final int length(){return length;}
668 
canWrite()669 	public final boolean canWrite(){
670 		assert(write());
671 		if(stdio() || devnull()){return true;}
672 		assert(hasName());
673 		File f=new File(name);
674 		if(!f.exists()){return true;}
675 		if(!f.canWrite()){return false;}
676 		return overwrite() || append();
677 	}
678 
canRead()679 	public final boolean canRead(){
680 		assert(read());
681 		if(stdio()){return true;}
682 		assert(hasName());
683 		File f=new File(name);
684 		return f.canRead();
685 	}
686 
unknownField()687 	public final boolean unknownField(){return unknownFormat() || unknownCompression() || unknownType() || unknownMode();}
688 
unknownFormat()689 	public final boolean unknownFormat(){return format<=UNKNOWN;}
fasta()690 	public final boolean fasta(){return format==FASTA;}
fastq()691 	public final boolean fastq(){return format==FASTQ;}
fastr()692 	public final boolean fastr(){return format==FASTR;}
bread()693 	public final boolean bread(){return format==BREAD;}
sam()694 	public final boolean sam(){return format==SAM;}
samOrBam()695 	public final boolean samOrBam(){return format==SAM || format==BAM;}
csfasta()696 	public final boolean csfasta(){return format==CSFASTA;}
qual()697 	public final boolean qual(){return format==QUAL;}
sequential()698 	public final boolean sequential(){return format==SEQUENTIAL;}
random()699 	public final boolean random(){return format==RANDOM;}
sites()700 	public final boolean sites(){return format==SITES;}
attachment()701 	public final boolean attachment(){return format==ATTACHMENT;}
header()702 	public final boolean header(){return format==HEADER;}
bam()703 	public final boolean bam(){return format==BAM;}
scarf()704 	public final boolean scarf(){return format==SCARF;}
text()705 	public final boolean text(){return format==TEXT;}
int1d()706 	public final boolean int1d(){return format==INT1D;}
long1d()707 	public final boolean long1d(){return format==LONG1D;}
bitset()708 	public final boolean bitset(){return format==BITSET;}
sketch()709 	public final boolean sketch(){return format==SKETCH;}
oneline()710 	public final boolean oneline(){return format==ONELINE;}
var()711 	public final boolean var(){return format==VAR;}
vcf()712 	public final boolean vcf(){return format==VCF;}
gff()713 	public final boolean gff(){return format==GFF;}
bed()714 	public final boolean bed(){return format==BED;}
pgm()715 	public final boolean pgm(){return format==PGM;}
embl()716 	public final boolean embl(){return format==EMBL;}
gbk()717 	public final boolean gbk(){return format==GBK;}
gbff()718 	public final boolean gbff(){return format==GBFF;}
alm()719 	public final boolean alm(){return format==ALM;}
720 
preferShreds()721 	public final boolean preferShreds(){
722 		return preferShreds;
723 	}
724 
isSequence()725 	public boolean isSequence() {return fasta() || fastq() || fastr() || bread() || samOrBam() || csfasta() || scarf() || header() || oneline() || gbk() || embl();}
726 
unknownCompression()727 	public final boolean unknownCompression(){return compression<=UNKNOWN;}
raw()728 	public final boolean raw(){return compression==RAW;}
gzip()729 	public final boolean gzip(){return compression==GZIP;}
zip()730 	public final boolean zip(){return compression==ZIP;}
bz2()731 	public final boolean bz2(){return compression==BZ2;}
fqz()732 	public final boolean fqz(){return compression==FQZ;}
lz()733 	public final boolean lz(){return compression==LZ;}
xz()734 	public final boolean xz(){return compression==XZ;}
sevenz()735 	public final boolean sevenz(){return compression==SEVENZ;}
dsrc()736 	public final boolean dsrc(){return compression==DSRC;}
compressed()737 	public final boolean compressed(){return compression!=RAW || format==BAM;}
738 
unknownType()739 	public final boolean unknownType(){return type<=UNKNOWN;}
file()740 	public final boolean file(){return type==FILE;}
stdio()741 	public final boolean stdio(){return type==STDIO;}
stdin()742 	public final boolean stdin(){return type==STDIO && input;}
stdout()743 	public final boolean stdout(){return type==STDIO && !input;}
devnull()744 	public final boolean devnull(){return type==DEVNULL;}
745 
unknownMode()746 	public final boolean unknownMode(){return mode<=UNKNOWN;}
read()747 	public final boolean read(){return mode==READ;}
write()748 	public final boolean write(){return mode==WRITE;}
749 
overwrite()750 	public final boolean overwrite(){return overwrite;}
append()751 	public final boolean append(){return append;}
allowSubprocess()752 	public final boolean allowSubprocess(){return allowSubprocess;}
ordered()753 	public final boolean ordered(){return ordered;}
754 
interleaved()755 	public boolean interleaved(){return interleaving==INTERLEAVED;}
756 
exists()757 	public final boolean exists(){
758 		if(!file()){return read();}
759 		File f=new File(name);
760 		if(!f.exists() && !gzip()){return false;}
761 		long size=f.length();
762 		return size>10;
763 	}
764 
765 //	public final boolean interleaved(){return interleaved;}
766 //	public final long maxReads(){return maxReads;}
767 
768 	/*--------------------------------------------------------------*/
769 	/*----------------            Fields            ----------------*/
770 	/*--------------------------------------------------------------*/
771 
772 	private final String name;
773 	private final String simpleName;
774 	private final int format;
775 	private final int asciiOffset;
776 	private final int compression;
777 	private final int type;
778 	private final int mode;
779 	private final int interleaving;
780 	private final int length;
781 	private final boolean input;
782 	private final boolean amino;
783 
784 	private final boolean overwrite;
785 	private final boolean append;
786 	private final boolean allowSubprocess;
787 	private final boolean ordered;
788 
789 //	private final int magicNumber;
790 
791 	public boolean preferShreds=false;
792 //	private final long maxReads;
793 
794 	/*--------------------------------------------------------------*/
795 	/*----------------           Statics            ----------------*/
796 	/*--------------------------------------------------------------*/
797 
798 	public static boolean verbose=false;
799 	public static boolean PRINT_WARNING=true;
800 
801 	/*--------------------------------------------------------------*/
802 	/*----------------          Constants           ----------------*/
803 	/*--------------------------------------------------------------*/
804 
805 	public static final int UNKNOWN=0;
806 
807 	/* Format */
808 
809 	public static final int FA=1, FASTA=1;
810 	public static final int FQ=2, FASTQ=2;
811 	public static final int BREAD=3;
812 	public static final int SAM=4;
813 	public static final int CSFASTA=5;
814 	public static final int QUAL=6;
815 	public static final int SEQUENTIAL=7;
816 	public static final int RANDOM=8;
817 	public static final int SITES=9;
818 	public static final int ATTACHMENT=10;
819 	public static final int BAM=11;
820 	public static final int SCARF=12;
821 	public static final int TEXT=13, TXT=13;
822 	public static final int PHYLIP=14;
823 	public static final int HEADER=15;
824 	public static final int INT1D=16;
825 	public static final int LONG1D=17;
826 	public static final int BITSET=18;
827 	public static final int SKETCH=19;
828 	public static final int ONELINE=20;
829 	public static final int FR=21, FASTR=21;
830 	public static final int VCF=22;
831 	public static final int VAR=23;
832 	public static final int GFF=24;
833 	public static final int BED=25;
834 	public static final int PGM=26, PKM=26;
835 	public static final int EMBL=27;
836 	public static final int GBK=28;
837 	public static final int GBFF=29;//TODO: this may be the same as GBK...
838 	//Alignment Model, from Consensus package
839 	public static final int ALM=30;
840 
841 	public static final String[] FORMAT_ARRAY=new String[] {
842 		"unknown", "fasta", "fastq", "bread", "sam", "csfasta",
843 		"qual", "sequential", "random", "sites", "attachment",
844 		"bam", "scarf", "text", "phylip", "header", "int1d",
845 		"long1d", "bitset", "sketch", "oneline", "fastr",
846 		"vcf", "var", "gff", "bed", "pgm", "embl", "gbk", "gbff", "alm"
847 	};
848 
849 	public static final String[] EXTENSION_LIST=new String[] {
850 		"fq", "fastq", "fa", "fasta", "fas", "fna",
851 		"ffn", "frn", "seq", "fsa", "faa",
852 		"bread", "sam", "csfasta", "qual", "bam",
853 		"scarf", "phylip", "txt",
854 		"gz", "gzip", "bz2", "zip", "xz", "dsrc", "header", "headers",
855 		"int1d", "long1d", "bitset", "sketch", "oneline", "flat", "fqz",
856 		"gff", "gff3", "var", "vcf", "bed", "pgm", "embl", "gbk", "gbff", "alm"
857 	};
858 
859 	/* Compression */
860 
861 	public static final int RAW=1;
862 	public static final int GZ=2, GZIP=2;
863 	public static final int ZIP=3;
864 	public static final int BZ2=4;
865 	public static final int XZ=5;
866 	public static final int c4=6;
867 	public static final int SEVENZ=7;
868 	public static final int DSRC=8;
869 	public static final int FQZ=9;
870 	public static final int LZ=10;
871 	public static final int AC=11;
872 
873 	public static final String[] COMPRESSION_ARRAY=new String[] {
874 		"unknown", "raw", "gz", "zip", "bz2", "xz",
875 		"c4", "7z", "dsrc", "fqz", "lz", "ac"
876 	};
877 
878 	/* Type */
879 
880 	public static final int FILE=1;
881 	public static final int STDIO=2, STDIN=2, STDOUT=2;
882 	public static final int DEVNULL=3;
883 //	public static final int NULL=4;
884 
885 	private static final String[] TYPE_ARRAY=new String[] {
886 		"unknown", "file", "stdio", "devnull"
887 	};
888 
889 	/* Mode */
890 
891 	public static final int READ=1, WRITE=2;
892 
893 	private static final String[] MODE_ARRAY=new String[] {
894 		"unknown", "read", "write"
895 	};
896 
897 	/* Interleaving */
898 
899 	public static final int SINGLE=1, INTERLEAVED=2;
900 
901 	private static final String[] INTERLEAVING_ARRAY=new String[] {
902 		"unknown", "single-ended", "interleaved"
903 	};
904 
905 }
906