1 package tax;
2 
3 import java.io.File;
4 import java.io.PrintStream;
5 import java.util.ArrayList;
6 import java.util.Arrays;
7 import java.util.Collections;
8 import java.util.HashMap;
9 import java.util.Locale;
10 
11 import fileIO.ByteFile;
12 import fileIO.ByteFile1;
13 import fileIO.ByteFile2;
14 import fileIO.ByteStreamWriter;
15 import fileIO.FileFormat;
16 import fileIO.ReadWrite;
17 import fileIO.TextFile;
18 import shared.Parse;
19 import shared.Parser;
20 import shared.PreParser;
21 import shared.Shared;
22 import shared.Timer;
23 import shared.Tools;
24 import stream.ConcurrentGenericReadInputStream;
25 import stream.FastaReadInputStream;
26 import structures.ByteBuilder;
27 import structures.StringNum;
28 
29 /**
30  * Counts patterns in Accessions.
31  * Handles hashing for Accession to TaxID lookups.
32  * @author Brian Bushnell
33  * @date May 9, 2018
34  *
35  */
36 public class AnalyzeAccession_ST {
37 
main(String[] args)38 	public static void main(String[] args){
39 		//Start a timer immediately upon code entrance.
40 		Timer t=new Timer();
41 
42 		//Create an instance of this class
43 		AnalyzeAccession_ST x=new AnalyzeAccession_ST(args);
44 
45 		//Run the object
46 		x.process(t);
47 
48 		//Close the print stream if it was redirected
49 		Shared.closeStream(x.outstream);
50 	}
51 
AnalyzeAccession_ST(String[] args)52 	public AnalyzeAccession_ST(String[] args){
53 
54 		{//Preparse block for help, config files, and outstream
55 			PreParser pp=new PreParser(args, getClass(), false);
56 			args=pp.args;
57 			outstream=pp.outstream;
58 		}
59 
60 		ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true;
61 		ReadWrite.MAX_ZIP_THREADS=Shared.threads();
62 
63 		Parser parser=new Parser();
64 		for(int i=0; i<args.length; i++){
65 			String arg=args[i];
66 			String[] split=arg.split("=");
67 			String a=split[0].toLowerCase();
68 			String b=split.length>1 ? split[1] : null;
69 
70 			if(a.equals("lines")){
71 				maxLines=Long.parseLong(b);
72 				if(maxLines<0){maxLines=Long.MAX_VALUE;}
73 			}else if(a.equals("verbose")){
74 				verbose=Parse.parseBoolean(b);
75 				ByteFile1.verbose=verbose;
76 				ByteFile2.verbose=verbose;
77 				stream.FastaReadInputStream.verbose=verbose;
78 				ConcurrentGenericReadInputStream.verbose=verbose;
79 				stream.FastqReadInputStream.verbose=verbose;
80 				ReadWrite.verbose=verbose;
81 			}else if(a.equals("in")){
82 				if(b==null){in.clear();}
83 				else{
84 					String[] split2=b.split(",");
85 					for(String s2 : split2){
86 						in.add(s2);
87 					}
88 				}
89 			}else if(b==null && new File(arg).exists()){
90 				in.add(arg);
91 			}else if(parser.parse(arg, a, b)){
92 				//do nothing
93 			}else{
94 				outstream.println("Unknown parameter "+args[i]);
95 				assert(false) : "Unknown parameter "+args[i];
96 				//				throw new RuntimeException("Unknown parameter "+args[i]);
97 			}
98 		}
99 
100 		{//Process parser fields
101 			overwrite=parser.overwrite;
102 			append=parser.append;
103 
104 			out=parser.out1;
105 		}
106 
107 		assert(FastaReadInputStream.settingsOK());
108 
109 		if(in==null){throw new RuntimeException("Error - at least one input file is required.");}
110 
111 		if(!ByteFile.FORCE_MODE_BF2){
112 			ByteFile.FORCE_MODE_BF2=false;
113 			ByteFile.FORCE_MODE_BF1=true;
114 		}
115 
116 		if(out!=null && out.equalsIgnoreCase("null")){out=null;}
117 
118 		if(!Tools.testOutputFiles(overwrite, append, false, out)){
119 			outstream.println((out==null)+", "+out);
120 			throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out+"\n");
121 		}
122 
123 		ffout=FileFormat.testOutput(out, FileFormat.TXT, null, true, overwrite, append, false);
124 		ffina=new FileFormat[in.size()];
125 		for(int i=0; i<in.size(); i++){
126 			ffina[i]=FileFormat.testInput(in.get(i), FileFormat.TXT, null, true, false);
127 		}
128 	}
129 
process(Timer t)130 	void process(Timer t){
131 
132 		for(FileFormat ffin : ffina){
133 			process_inner(ffin);
134 		}
135 
136 		if(ffout!=null){
137 			ByteStreamWriter bsw=new ByteStreamWriter(ffout);
138 			bsw.println("#Pattern\tCount\tCombos\tBits");
139 			ArrayList<StringNum> list=new ArrayList<StringNum>();
140 			list.addAll(countMap.values());
141 			Collections.sort(list);
142 			Collections.reverse(list);
143 			for(StringNum sn : list){
144 				double combos=1;
145 				for(int i=0; i<sn.s.length(); i++){
146 					char c=sn.s.charAt(i);
147 					if(c=='D'){combos*=10;}
148 					else if(c=='L'){combos*=26;}
149 				}
150 				bsw.print(sn.toString().getBytes());
151 				bsw.println("\t"+(long)combos+"\t"+String.format(Locale.ROOT, "%.2f", Tools.log2(combos)));
152 			}
153 			bsw.start();
154 			errorState|=bsw.poisonAndWait();
155 		}
156 
157 		t.stop();
158 
159 		outstream.println(Tools.timeLinesBytesProcessed(t, linesProcessed, bytesProcessed, 8));
160 
161 		outstream.println();
162 		outstream.println("Valid Lines:       \t"+linesOut);
163 		outstream.println("Invalid Lines:     \t"+(linesProcessed-linesOut));
164 
165 		if(errorState){
166 			throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt.");
167 		}
168 	}
169 
process_inner(FileFormat ffin)170 	void process_inner(FileFormat ffin){
171 
172 		ByteFile bf=ByteFile.makeByteFile(ffin);
173 
174 		byte[] line=bf.nextLine();
175 		StringBuilder buffer=new StringBuilder(32);
176 
177 		for(int lineNum=0; line!=null; lineNum++){
178 			if(line.length>0){
179 				if(maxLines>0 && linesProcessed>=maxLines){break;}
180 				linesProcessed++;
181 				bytesProcessed+=(line.length+1);
182 
183 				assert((lineNum==0)==(Tools.startsWith(line, "accession"))) : "Line "+lineNum+": "+new String(line);
184 //				final boolean valid=(line[0]!='#');
185 
186 				if(true){
187 					linesOut++;
188 					bytesOut+=(line.length+1);
189 					increment(line, buffer);
190 				}
191 			}
192 			line=bf.nextLine();
193 		}
194 
195 		errorState|=bf.close();
196 	}
197 
increment(byte[] line, StringBuilder buffer)198 	void increment(byte[] line, StringBuilder buffer){
199 		buffer.setLength(0);
200 		for(int i=0; i<line.length; i++){
201 			final byte b=line[i];
202 			if(b==' ' || b=='\t' || b=='.'){break;}
203 			buffer.append((char)remap[b]);
204 		}
205 		String key=buffer.toString();
206 		StringNum value=countMap.get(key);
207 		if(value!=null){value.increment();}
208 		else{countMap.put(key, new StringNum(key, 1));}
209 	}
210 
combos(String s)211 	public static long combos(String s){
212 		double combos=1;
213 		for(int i=0; i<s.length(); i++){
214 			char c=s.charAt(i);
215 			if(c=='D'){combos*=10;}
216 			else if(c=='L'){combos*=26;}
217 		}
218 		return (combos>=Long.MAX_VALUE ? Long.MAX_VALUE : (long)Math.ceil(combos));
219 	}
220 
combos(byte[] s)221 	public static long combos(byte[] s){
222 		double combos=1;
223 		for(int i=0; i<s.length; i++){
224 			byte c=s[i];
225 			if(c=='D'){combos*=10;}
226 			else if(c=='L'){combos*=26;}
227 		}
228 		return (combos>=Long.MAX_VALUE ? -1 : (long)Math.ceil(combos));
229 	}
230 
231 	/*--------------------------------------------------------------*/
232 
loadCodeMap(String fname)233 	public static HashMap<String, Integer> loadCodeMap(String fname){
234 		assert(codeMap==null);
235 		TextFile tf=new TextFile(fname);
236 		ArrayList<String> list=new ArrayList<String>();
237 		for(String line=tf.nextLine(); line!=null; line=tf.nextLine()){
238 			if(!line.startsWith("#")){
239 				String[] split=line.split("\t");
240 				list.add(split[0]);
241 			}
242 		}
243 		HashMap<String, Integer> map=new HashMap<String, Integer>(list.size()*3);
244 		codeBits=(int)Math.ceil(Tools.log2(list.size()));
245 		final int patternBits=63-codeBits;
246 		final long maxCombos=((1L<<(patternBits-1))-1);
247 		for(int i=0; i<list.size(); i++){
248 			String s=list.get(i);
249 			longestPattern=Tools.max(longestPattern, s.length());
250 			long combos=combos(s);
251 			if(combos<0 || combos>=maxCombos){map.put(s, -1);}
252 			else{map.put(s, i);}
253 		}
254 		codeMap=map;
255 		return map;
256 	}
257 
digitize(String s)258 	public static long digitize(String s){
259 		String pattern=remap(s);
260 		Integer code=codeMap.get(pattern);
261 		if(code==null){return -2;}
262 		if(code.intValue()<0){return -1;}
263 
264 		long number=0;
265 		for(int i=0; i<pattern.length(); i++){
266 			char c=s.charAt(i);
267 			char p=pattern.charAt(i);
268 			if(p=='-'){
269 				//do nothing
270 			}else if(p=='D'){
271 				number=(number*10)+(c-'0');
272 			}else if(p=='L'){
273 				number=(number*26)+(Tools.toUpperCase(c)-'A');
274 			}else{
275 				assert(false) : s;
276 			}
277 		}
278 		number=(number<<codeBits)+code;
279 		return number;
280 	}
281 
digitize(byte[] s)282 	public static long digitize(byte[] s){
283 		String pattern=remap(s);
284 		Integer code=codeMap.get(pattern);
285 		if(code==null){return -2;}
286 		if(code.intValue()<0){return -1;}
287 
288 		long number=0;
289 		for(int i=0; i<pattern.length(); i++){
290 			byte c=s[i];
291 			char p=pattern.charAt(i);
292 			if(p=='-'){
293 				//do nothing
294 			}else if(p=='D'){
295 				number=(number*10)+(c-'0');
296 			}else if(p=='L'){
297 				number=(number*26)+(Tools.toUpperCase(c)-'A');
298 			}else{
299 				assert(false) : s;
300 			}
301 		}
302 		number=(number<<codeBits)+code;
303 		return number;
304 	}
305 
remap(String s)306 	public static String remap(String s){
307 		ByteBuilder buffer=new ByteBuilder(s.length());
308 		for(int i=0; i<s.length(); i++){
309 			final char b=s.charAt(i);
310 			if(b==' ' || b=='\t' || b=='.'){break;}
311 			buffer.append((char)remap[b]);
312 		}
313 		return buffer.toString();
314 	}
315 
remap(byte[] s)316 	public static String remap(byte[] s){
317 		ByteBuilder buffer=new ByteBuilder(s.length);
318 		for(int i=0; i<s.length; i++){
319 			final byte b=s[i];
320 			if(b==' ' || b=='\t' || b=='.'){break;}
321 			buffer.append((char)remap[b]);
322 		}
323 		return buffer.toString();
324 	}
325 
326 	/*--------------------------------------------------------------*/
327 
	/** Input file names, gathered from "in=" arguments and from bare arguments that name existing files */
	private ArrayList<String> in=new ArrayList<String>();
	/** Output file name, taken from the parser's out1 field; the constructor converts the string "null" to null */
	private String out=null;

	/*--------------------------------------------------------------*/

	/** Maps each pattern seen in the input to a StringNum holding the pattern and its count */
	private HashMap<String, StringNum> countMap=new HashMap<String, StringNum>();
	/** Maps a pattern to its code, or to -1 if it has too many combinations; assigned by loadCodeMap */
	public static HashMap<String, Integer> codeMap;
	/** Number of bits digitize shifts by to make room for the pattern code; assigned by loadCodeMap */
	private static int codeBits=-1;
	/** Length of the longest pattern seen by loadCodeMap */
	private static int longestPattern=-1;

	/** Number of non-empty lines read */
	private long linesProcessed=0;
	/** Number of lines counted as valid */
	private long linesOut=0;
	/** Bytes read from non-empty lines, counting one extra byte per line */
	private long bytesProcessed=0;
	/** Bytes in lines counted as valid, counting one extra byte per line */
	private long bytesOut=0;

	/** Reading stops once this many non-empty lines have been processed */
	private long maxLines=Long.MAX_VALUE;

	/*--------------------------------------------------------------*/

	/** Formats of the input files, one per entry of in */
	private final FileFormat[] ffina;
	/** Format of the output file; process skips writing when this is null */
	private final FileFormat ffout;

	/** Lookup table, indexed by ASCII value, that translates an accession character to its pattern symbol */
	private static final byte[] remap=makeRemap();
351 
makeRemap()352 	private static byte[] makeRemap(){
353 		byte[] array=new byte[128];
354 		Arrays.fill(array, (byte)'?');
355 		for(int i='A'; i<='Z'; i++){array[i]='L';}
356 		for(int i='a'; i<='z'; i++){array[i]='L';}
357 		for(int i='0'; i<='9'; i++){array[i]='D';}
358 		array['_']=array['-']='-';
359 		return array;
360 	}
361 
362 	/*--------------------------------------------------------------*/
363 
	/** Destination for status messages; replaced in the constructor by the PreParser's stream */
	private PrintStream outstream=System.err;
	/** Verbose flag, set by the "verbose" argument and copied to several IO classes */
	public static boolean verbose=false;
	/** Set when closing an input file or finishing the output writer reports an error */
	public boolean errorState=false;
	/** Copied from the parser; passed to the output-file checks */
	private boolean overwrite=false;
	/** Copied from the parser; passed to the output-file checks */
	private boolean append=false;
369 
370 }
371