1 /*
2  * To change this license header, choose License Headers in Project Properties.
3  * To change this template file, choose Tools | Templates
4  * and open the template in the editor.
5  */
6 package jgi;
7 
8 import static jgi.Dedupe.maxNmer;
9 import static jgi.Dedupe.nmerIndex;
10 
11 import java.util.ArrayList;
12 import java.util.Arrays;
13 import java.util.List;
14 import java.util.Locale;
15 import java.util.concurrent.ArrayBlockingQueue;
16 
17 import dna.AminoAcid;
18 import fileIO.ByteStreamWriter;
19 import fileIO.FileFormat;
20 import fileIO.ReadWrite;
21 import shared.Parse;
22 import shared.Parser;
23 import shared.PreParser;
24 import shared.Shared;
25 import shared.Timer;
26 import shared.Tools;
27 import stream.ConcurrentReadInputStream;
28 import stream.Read;
29 import structures.ByteBuilder;
30 import structures.ListNum;
31 
32 /**
33  *
34  * @author syao
35  * Last updated : 01102018
36  */
37 public class TetramerFrequencies {
main(String[] args)38 	public static void main(String[] args){
39 
40 		System.out.println("Start Tetramer Frequencies analysis ...");
41 
42 		final int oldThreads=Shared.threads();
43 		Shared.capThreads(16);
44 
45 		Timer t=new Timer();
46 
47 		//Create an instance of this class
48 		TetramerFrequencies x=new TetramerFrequencies(args);
49 
50 		//Run the object
51 		x.process(t);
52 
53 		//Close the print stream if it was redirected
54 		Shared.closeStream(x.outstream);
55 
56 		Shared.setThreads(oldThreads);
57 	}
58 
TetramerFrequencies(String[] args)59 	public TetramerFrequencies(String[] args){
60 
61 		{//Preparse block for help, config files, and outstream
62 			PreParser pp=new PreParser(args, getClass(), false);
63 			args=pp.args;
64 			outstream=pp.outstream;
65 		}
66 
67 		int k_=4;
68 		Parser parser=new Parser();
69 		for (String arg : args) {
70 			String[] split=arg.split("=");
71 			String a=split[0].toLowerCase();
72 			String b=split.length>1 ? split[1] : null;
73 
74 			if (a.equals("s") || a.equals("step")){
75 				step = Integer.parseInt(b);
76 			} else if (a.equals("w") || a.equals("window")){
77 				winSize = Integer.parseInt(b);
78 			} else if (a.equals("out") || a.equals("freq")){
79 				out1 = b;
80 			} else if (a.equals("dropshort")){
81 				keepShort=!Parse.parseBoolean(b);
82 			} else if (a.equals("keepshort") || a.equals("short")){
83 				keepShort=Parse.parseBoolean(b);
84 			} else if (a.equals("k")){
85 				k_ = Integer.parseInt(b);
86 			} else if(parser.parse(arg, a, b)){
87 				//do nothing
88 			} else {
89 				throw new RuntimeException("Unknown argument "+arg);
90 			}
91 		}
92 
93 		{//Process parser fields
94 			Parser.processQuality();
95 
96 			maxReads=parser.maxReads;
97 			in1=parser.in1;
98 		}
99 
100 		k=k_;
101 		ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, null, true, true);
102 		assert(ffin1!=null) : "No input file.";
103 		assert(ffin1.exists() && ffin1.canRead()) : "Cannot read input file "+in1+".";
104 
105 
106 		// initialize the class member textstream writer here, so no need of keep results in memory
107 		if (out1==null || out1.equals("")){
108 			out1 = "stdout";
109 		}
110 
111 		threads=Tools.max(1, Shared.threads());
112 		inq=new ArrayBlockingQueue<Line>(threads+1);
113 
114 		FileFormat ff=FileFormat.testOutput(out1, FileFormat.TEXT, null, true, true, false, true);
115 		bsw = new ByteStreamWriter(ff);
116 		bsw.start();
117 
118 		//create and write the output header
119 		List<String> head = new ArrayList<String>();
120 		head.add("scaffold");
121 		head.add("range");
122 
123 		// the unit tetramer strings in header
124 		tetramerGen2(head);
125 		bsw.add(new ByteBuilder(String.join("\t", head)).append('\n'), nextID);
126 		nextID++;
127 	}
128 
process(Timer t)129 	void process(Timer t){
130 
131 		ArrayList<PrintThread> alpt=spawnThreads();
132 
133 		final ConcurrentReadInputStream cris;
134 		{
135 			cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, null);
136 			cris.start();
137 		}
138 
139 		long readsProcessed=0;
140 		{
141 			ListNum<Read> ln=cris.nextList();
142 			ArrayList<Read> reads=(ln!=null ? ln.list : null);
143 
144 			if(reads!=null && !reads.isEmpty()){
145 				Read r=reads.get(0);
146 				assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired());
147 			}
148 
149 			while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning
150 				if(verbose){outstream.println("Fetched "+reads.size()+" reads.");}
151 
152 				for (Read r1 : reads){
153 					windowedTetramerProfile(r1.bases, r1.id);
154 					readsProcessed++;
155 				}
156 
157 				cris.returnList(ln);
158 				if(verbose){
159 					outstream.println("Returned a list.");
160 				}
161 				ln=cris.nextList();
162 				reads=(ln!=null ? ln.list : null);
163 			}
164 
165 			if(ln!=null){
166 				cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
167 			}
168 		}
169 
170 		putLine(POISON_LINE);
171 		waitForFinish(alpt);
172 
173 		// close the output file
174 		bsw.poisonAndWait();
175 
176 		ReadWrite.closeStream(cris);
177 		if(verbose){outstream.println("Finished.");}
178 
179 		t.stop();
180 		outstream.println("Time:                         \t"+t);
181 		outstream.println("Reads Processed:    "+readsProcessed+" \t"+String.format(Locale.ROOT, "%.2fk reads/sec", (readsProcessed/(double)(t.elapsed))*1000000));
182 	}
183 
184 	// TODO: optimize the algorithm ...
windowedTetramerProfileOpt(byte[] bases, String header)185 	private void windowedTetramerProfileOpt(byte[] bases, String header){
186 		int sidx = 0;
187 		int eidx = winSize;
188 
189 		final int[] counts = new int[maxNmer+1];
190 
191 		int leadlen = 0;
192 		int leadtmer = -1;
193 		int endlen = 0;
194 		int endtmer = -1;
195 
196 		int tmer = 0;
197 		int len = 0;
198 
199 		for (int i=0; i<= eidx; i++){
200 			int x = AminoAcid.baseToNumber[bases[i]];
201 			if (x < 0){
202 				tmer = 0;
203 				len = 0;
204 			} else {
205 				tmer = (leadtmer<<2)|x ;   // this can produce -1 vaue if any base in tetramer is N!
206 				len++;
207 				if (len >= k){
208 					int idx = tmerIndex(tmer);
209 					counts[idx]++;
210 
211 					if( leadtmer==-1){
212 						leadtmer = tmer;
213 					} else if(i == eidx){
214 
215 					}
216 
217 
218 					len--;
219 				}
220 			}
221 		}
222 
223 
224 
225 	}
226 
windowedTetramerProfile(byte[] bases, String header)227 	private void windowedTetramerProfile(byte[] bases, String header){
228 		int sidx = 0;
229 		int eidx = winSize<1 ? bases.length : Tools.min(bases.length, winSize);
230 
231 		while (eidx <= bases.length){
232 			Line line=new Line(header, bases, sidx, eidx, nextID);
233 			putLine(line);
234 			sidx+=windowsPerLine*step;
235 			eidx+=windowsPerLine*step;
236 			nextID++;
237 		}
238 	}
239 
240 	//Slow version
241 	void append(Line line, int[] counts, StringBuilder sb){
242 		sb.append(line.header);
243 		sb.append('\t');
244 		sb.append(line.sidx+1);
245 		sb.append('-');
246 		sb.append(line.eidx);
247 		float mult=1f/(line.length()-k+1);
248 		for (int cnt: counts){
249 			sb.append('\t');
250 			sb.append(String.format(Locale.ROOT, "%.4f", cnt*mult));
251 		}
252 		sb.append('\n');
253 	}
254 
255 	void append(Line line, int[] counts, ByteBuilder bb){
256 		bb.append(line.header);
257 		bb.tab();
258 		bb.append(line.sidx+1);
259 		bb.append('-');
260 		bb.append(line.eidx);
261 		for (int cnt: counts){
262 			bb.tab();
263 			bb.append(cnt);
264 		}
265 		bb.nl();
266 	}
267 
268 	// factor this out so we can work on reads
269 	public int[] tetramerCounter(byte[] bases, int startidx, int endidx, int[] counts){
270 		if (counts == null){
271 			counts = new int[maxNmer+1];
272 		}
273 
274 		for (int i=startidx; i<=endidx-k; i++){
275 			int kmer = 0;   // the binary representation of the tetramer in the sliding window
276 			for (int j=0; j<k; j++){
277 				int x = AminoAcid.baseToNumber[bases[i+j]];
278 				kmer= (kmer<<2)|x ;   // this can produce -1 vaue if any base in tetramer is N!
279 			}
280 
281 			// ignore tetramers containing "N" base
282 			if (kmer > -1){
283 				int idx = tmerIndex(kmer);
284 				counts[idx]++;
285 			}
286 		}
287 
288 		return counts;
289 	}
290 
291 	private int tmerIndex(int tmer){
292 		int rtmer = AminoAcid.reverseComplementBinaryFast(tmer, k);
293 		int mtmer = Tools.min(tmer, rtmer);
294 		int idx = -1;
295 		try{
296 			idx = nmerIndex[rtmer];
297 		} catch (ArrayIndexOutOfBoundsException e){
298 			System.out.printf("ArrayIndexOutOfBoundsException tmer=%d: rtmer=%d; \n", tmer, rtmer);
299 			System.exit(-1);
300 		}
301 		return idx;
302 	}
303 
304 
305 	public List<String> tetramerGen(List<String> tlist){
306 		if (tlist == null){
307 			tlist = new ArrayList<String>();
308 		}
309 
310 		int[] bases = {0, 1, 2, 3}; // A C G T
311 
312 		for (int b1 : bases){
313 			for (int b2 : bases){
314 				for (int b3 : bases){
315 					for (int b4 : bases){
316 						int tcode = b1;
317 						tcode = (tcode<<2)|b2;
318 						tcode = (tcode<<2)|b3;
319 						tcode = (tcode<<2)|b4;
320 						String tmer = AminoAcid.kmerToString(tcode, k).toLowerCase();
321 						String rtmer = AminoAcid.kmerToString(AminoAcid.reverseComplementBinaryFast(tcode, k), k).toLowerCase();
322 
323 						if (!tlist.contains(rtmer)){
324 							tlist.add(tmer);
325 						}
326 					}
327 				}
328 			}
329 		}
330 		// total of 136 unique tetramers
331 		return tlist;
332 	}
333 
334 	public List<String> tetramerGen2(List<String> tlist){
335 		if (tlist == null){
336 			tlist = new ArrayList<String>();
337 		}
338 
339 		final int max=(1<<(2*k))-1;
340 
341 		for(int i=0; i<=max; i++){
342 			final int a=i, b=AminoAcid.reverseComplementBinaryFast(i, k);
343 			int min=Tools.min(a, b);
344 			if(min==a){
345 				tlist.add(AminoAcid.kmerToString(a, k).toLowerCase());
346 			}
347 		}
348 		return tlist;
349 	}
350 
351 	public void printTetramerFromCode(long code){
352 		System.out.println(AminoAcid.kmerToString(code, k));
353 	}
354 
355 //	public void printTetramers(byte[] bases, String header){
356 //		int sidx = 0;
357 //		int eidx = winSize;
358 //
359 //		while (true){
360 //			if (eidx > bases.length){
361 //				break;
362 //			}
363 //			System.out.printf("%s %d-%d\n", header, sidx+1, eidx);
364 //
365 //			// count tetramer here
366 //			for (int i=sidx; i<eidx-k; i++){
367 //				char[] tetramer = new char[k];
368 //				for (int j=0; j<k; j++){
369 //					tetramer[j] = (char)bases[i+j];
370 //				}
371 //				System.out.println(new String(tetramer));
372 //			}
373 //
374 //			sidx += step;
375 //			eidx += step;
376 //		}
377 //	}
378 
379 	public static void printHelp(){
380 		List<String> helplist = new ArrayList<String>();
381 		helplist.add("Program Name : TetramerFrequencies v1.1");
382 		helplist.add("Usage : ");
383 		helplist.add(" -h : this page");
384 		helplist.add(" -s : step [500]");
385 		helplist.add(" -w : window size [2000]. If set to 0 the whole sequence is processed");
386 		System.out.println(String.join("\n", helplist));
387 	}
388 
389 	/*--------------------------------------------------------------*/
390 
391 	final Line takeLine(){
392 		Line line=null;
393 		while(line==null){
394 			try {
395 				line=inq.take();
396 			} catch (InterruptedException e) {
397 				// TODO Auto-generated catch block
398 				e.printStackTrace();
399 			}
400 		}
401 //		System.err.println("takeLine("+line.id+")");
402 		return line;
403 	}
404 
405 	final void putLine(Line line){
406 //		System.err.println("putLine("+line.id+")");
407 		while(line!=null){
408 			try {
409 				inq.put(line);
410 				line=null;
411 			} catch (InterruptedException e) {
412 				// TODO Auto-generated catch block
413 				e.printStackTrace();
414 			}
415 		}
416 	}
417 
418 	/** Spawn process threads */
419 	private ArrayList<PrintThread> spawnThreads(){
420 
421 		//Do anything necessary prior to processing
422 
423 		//Fill a list with PrintThreads
424 		ArrayList<PrintThread> alpt=new ArrayList<PrintThread>(threads);
425 		for(int i=0; i<threads; i++){
426 			alpt.add(new PrintThread());
427 		}
428 		if(verbose){outstream.println("Spawned threads.");}
429 
430 		//Start the threads
431 		for(PrintThread pt : alpt){
432 			pt.start();
433 		}
434 		if(verbose){outstream.println("Started threads.");}
435 
436 		//Do anything necessary after processing
437 		return alpt;
438 	}
439 
440 	private void waitForFinish(ArrayList<PrintThread> alpt){
441 		//Wait for completion of all threads
442 		boolean allSuccess=true;
443 		for(PrintThread pt : alpt){
444 			while(pt.getState()!=Thread.State.TERMINATED){
445 				try {
446 					//Attempt a join operation
447 					pt.join();
448 				} catch (InterruptedException e) {
449 					//Potentially handle this, if it is expected to occur
450 					e.printStackTrace();
451 				}
452 			}
453 		}
454 	}
455 
456 	private class PrintThread extends Thread {
457 
458 		PrintThread(){}
459 
460 		@Override
461 		public void run(){
462 			Line line=takeLine();
463 			while(line!=null && line!=POISON_LINE){
464 				processLine(line);
465 				line=takeLine();
466 			}
467 			putLine(POISON_LINE);
468 		}
469 
470 		private void processLine(Line line){
471 			ByteBuilder bb=new ByteBuilder(512);
472 			for(int i=0; i<windowsPerLine && line.eidx<=line.bases.length; i++){
473 				Arrays.fill(counts, 0);
474 				tetramerCounter(line.bases, line.sidx, line.eidx, counts);
475 				if(keepShort || line.length()>=winSize){
476 					append(line, counts, bb);
477 				}
478 				line.sidx+=step;
479 				line.eidx+=step;
480 			}
481 			bsw.add(bb, line.id);
482 		}
483 
484 		private final int[] counts=new int[maxNmer+1];
485 	}
486 
487 	private class Line {
488 
489 		Line(String header_, byte[] bases_, int sidx_, int eidx_, long id_){
490 			header=header_;
491 			bases=bases_;
492 			sidx=sidx_;
493 			eidx=eidx_;
494 			id=id_;
495 		}
496 
497 		public int length() {
498 			return eidx-sidx+1;
499 		}
500 
501 		final String header;
502 		final byte[] bases;
503 		int sidx;
504 		int eidx;
505 		final long id;
506 
507 	}
508 
509 	/*--------------------------------------------------------------*/
510 
511 	final Line POISON_LINE=new Line(null, null, -1, -1, -1);
512 	private final ArrayBlockingQueue<Line> inq;
513 	private final int threads;
514 	private long nextID=0;
515 
516 	/*--------------------------------------------------------------*/
517 
518 	private String in1 = null;
519 	private String out1 = null;
520 	private ByteStreamWriter bsw = null; // for output
521 
522 	private final FileFormat ffin1;
523 
524 	private java.io.PrintStream outstream=System.err;
525 
526 	/*--------------------------------------------------------------*/
527 
528 	private long maxReads=-1;
529 	int step = 500;
530 	private int winSize = 2000;
531 	private final int k;
532 	private boolean keepShort=false;
533 
534 	/*--------------------------------------------------------------*/
535 
536 	private final static int windowsPerLine=8;
537 	public static boolean verbose=false;
538 }
539