1 package bloom;
2 
3 import java.util.ArrayList;
4 import java.util.Locale;
5 
6 import dna.AminoAcid;
7 import fileIO.FileFormat;
8 import shared.Timer;
9 import stream.ConcurrentReadInputStream;
10 import stream.FastaReadInputStream;
11 import stream.Read;
12 import structures.ListNum;
13 
14 /**
15  * @author Brian Bushnell
16  * @date Jul 5, 2012
17  *
18  */
19 public class KmerCount4 extends KmerCountAbstract {
20 
main(String[] args)21 	public static void main(String[] args){
22 
23 		Timer t=new Timer();
24 
25 		String fname1=args[0];
26 		String fname2=(args.length>3 || args[1].contains(".") ? args[1] : null);
27 		int k=14;
28 		int cbits=16;
29 		int gap=0;
30 
31 		for(int i=(fname2==null ? 1 : 2); i<args.length; i++){
32 			final String arg=args[i];
33 			final String[] split=arg.split("=");
34 			String a=split[0].toLowerCase();
35 			String b=split.length>1 ? split[1] : null;
36 
37 			if(a.equals("k") || a.equals("kmer")){
38 				k=Integer.parseInt(b);
39 			}else if(a.startsWith("cbits") || a.startsWith("cellbits")){
40 				cbits=Integer.parseInt(b);
41 			}else if(a.startsWith("gap")){
42 				gap=Integer.parseInt(b);
43 			}else{
44 				throw new RuntimeException("Unknown parameter "+args[i]);
45 			}
46 		}
47 
48 		KCountArray2 count=null;
49 
50 		if(fileIO.FileFormat.hasFastaExtension(fname1)){
51 			assert(!FastaReadInputStream.SPLIT_READS);
52 			FastaReadInputStream.MIN_READ_LEN=k;
53 		}
54 
55 		if(gap==0){
56 			count=count(fname1, fname2, k, cbits, true);
57 		}else{
58 			count=countFastqSplit(fname1, fname2, (k+1)/2, k/2, gap, cbits, true, null);
59 		}
60 
61 
62 		t.stop();
63 		System.out.println("Finished counting; time = "+t);
64 
65 		printStatistics(count);
66 
67 	}
68 
printStatistics(KCountArray2 count)69 	public static void printStatistics(KCountArray2 count){
70 		long[] freq=count.transformToFrequency();
71 
72 //		System.out.println(count+"\n");
73 //		System.out.println(Arrays.toString(freq)+"\n");
74 
75 		long sum=sum(freq);
76 		System.out.println("Kmer fraction:");
77 		int lim1=8, lim2=16;
78 		for(int i=0; i<lim1; i++){
79 			String prefix=i+"";
80 			while(prefix.length()<8){prefix=prefix+" ";}
81 			System.out.println(prefix+"\t"+String.format(Locale.ROOT, "%.3f%%   ",(100l*freq[i]/(double)sum))+"\t"+freq[i]);
82 		}
83 		while(lim1<=freq.length){
84 			int x=0;
85 			for(int i=lim1; i<lim2; i++){
86 				x+=freq[i];
87 			}
88 			String prefix=lim1+"-"+(lim2-1);
89 			if(lim2>=freq.length){prefix=lim1+"+";}
90 			while(prefix.length()<8){prefix=prefix+" ";}
91 			System.out.println(prefix+"\t"+String.format(Locale.ROOT, "%.3f%%   ",(100l*x/(double)sum))+"\t"+x);
92 			lim1*=2;
93 			lim2=min(lim2*2, freq.length);
94 		}
95 
96 		long sum2=sum-freq[0];
97 		long x=freq[1];
98 		System.out.println();
99 		System.out.println("Keys Counted:  \t         \t"+keysCounted);
100 		System.out.println("Unique:        \t         \t"+sum2);
101 		System.out.println("Avg Sites/Key: \t         \t"+String.format(Locale.ROOT, "%.3f    ",(keysCounted*1d/sum2)));
102 		System.out.println();
103 		System.out.println("Singleton:     \t"+String.format(Locale.ROOT, "%.3f%%   ",(100l*x/(double)sum2))+"\t"+x);
104 		x=sum2-x;
105 		System.out.println("Useful:        \t"+String.format(Locale.ROOT, "%.3f%%   ",(100l*x/(double)sum2))+"\t"+x);
106 	}
107 
count(String reads1, String reads2, int k, int cbits, boolean rcomp)108 	public static KCountArray2 count(String reads1, String reads2, int k, int cbits, boolean rcomp){
109 		return count(reads1, reads2, k, cbits, rcomp, null);
110 	}
111 
count(String reads1, String reads2, int k, int cbits, boolean rcomp, KCountArray2 count)112 	public static KCountArray2 count(String reads1, String reads2, int k, int cbits, boolean rcomp, KCountArray2 count){
113 		assert(k>=1 && k<20);
114 		final int kbits=2*k;
115 		final long mask=(kbits>63 ? -1L : ~((-1L)<<kbits));
116 
117 		if(count==null){
118 			final long cells=1L<<kbits;
119 			if(verbose){System.err.println("k="+k+", kbits="+kbits+", cells="+cells+", mask="+Long.toHexString(mask));}
120 			count=new KCountArray2(cells, cbits);
121 		}
122 
123 		final ConcurrentReadInputStream cris;
124 		{
125 			FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true);
126 			FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true);
127 			cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2);
128 			cris.start(); //4567
129 		}
130 
131 		assert(cris!=null) : reads1;
132 		System.err.println("Started cris");
133 		boolean paired=cris.paired();
134 		if(verbose){System.err.println("Paired: "+paired);}
135 
136 		ListNum<Read> ln=cris.nextList();
137 		ArrayList<Read> reads=(ln!=null ? ln.list : null);
138 
139 		if(reads!=null && !reads.isEmpty()){
140 			Read r=reads.get(0);
141 			assert(paired==(r.mate!=null));
142 		}
143 
144 		while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning
145 			//System.err.println("reads.size()="+reads.size());
146 			for(Read r : reads){
147 				readsProcessed++;
148 
149 				addRead(r, count, k, mask, rcomp);
150 				if(r.mate!=null){
151 					addRead(r.mate, count, k, mask, rcomp);
152 				}
153 
154 			}
155 			//System.err.println("returning list");
156 			cris.returnList(ln);
157 			//System.err.println("fetching list");
158 			ln=cris.nextList();
159 			reads=(ln!=null ? ln.list : null);
160 		}
161 		if(verbose){System.err.println("Finished reading");}
162 		cris.returnList(ln);
163 		if(verbose){System.err.println("Returned list");}
164 		cris.close();
165 		if(verbose){System.err.println("Closed stream");}
166 		if(verbose){System.err.println("Processed "+readsProcessed+" reads.");}
167 
168 
169 		return count;
170 	}
171 
countFastqSplit(String reads1, String reads2, int k1, int k2, int gap, int cbits, boolean rcomp, KCountArray2 count)172 	public static KCountArray2 countFastqSplit(String reads1, String reads2, int k1, int k2, int gap, int cbits, boolean rcomp, KCountArray2 count){
173 		assert(k1+k2>=1 && k1+k2<20);
174 		assert(gap>=0);
175 		final int kbits1=2*k1;
176 		final int kbits2=2*k2;
177 		final long mask1=~((-1L)<<(kbits1));
178 		final long mask2=~((-1L)<<(kbits2));
179 
180 		if(count==null){
181 			final long cells=1L<<(kbits1+kbits2);
182 			if(verbose){System.err.println("k1="+k1+", k2="+k2+", kbits1="+kbits1+", kbits2="+kbits2+", cells="+cells+
183 					", mask1="+Long.toHexString(mask1)+", mask2="+Long.toHexString(mask2));}
184 			count=new KCountArray2(cells, cbits, gap);
185 		}
186 		assert(count.gap==gap);
187 
188 		final ConcurrentReadInputStream cris;
189 		{
190 			FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true);
191 			FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true);
192 			cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2);
193 			cris.start(); //4567
194 		}
195 
196 		assert(cris!=null) : reads1;
197 		System.err.println("Started cris");
198 		boolean paired=cris.paired();
199 		if(verbose){System.err.println("Paired: "+paired);}
200 
201 		ListNum<Read> ln=cris.nextList();
202 		ArrayList<Read> reads=(ln!=null ? ln.list : null);
203 
204 		if(reads!=null && !reads.isEmpty()){
205 			Read r=reads.get(0);
206 			assert(paired==(r.mate!=null));
207 		}
208 
209 		while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning
210 			//System.err.println("reads.size()="+reads.size());
211 			for(Read r : reads){
212 				readsProcessed++;
213 
214 				addReadSplit(r, count, k1, k2, mask1, mask2, gap, rcomp);
215 				if(r.mate!=null){
216 					addReadSplit(r.mate, count, k1, k2, mask1, mask2, gap, rcomp);
217 				}
218 
219 			}
220 			//System.err.println("returning list");
221 			cris.returnList(ln);
222 			//System.err.println("fetching list");
223 			ln=cris.nextList();
224 			reads=(ln!=null ? ln.list : null);
225 		}
226 		if(verbose){System.err.println("Finished reading");}
227 		cris.returnList(ln);
228 		if(verbose){System.err.println("Returned list");}
229 		cris.close();
230 		if(verbose){System.err.println("Closed stream");}
231 		if(verbose){System.err.println("Processed "+readsProcessed+" reads.");}
232 
233 
234 		return count;
235 	}
236 
addRead(final Read r, final KCountArray2 count, final int k, final long mask, boolean rcomp)237 	public static void addRead(final Read r, final KCountArray2 count, final int k, final long mask, boolean rcomp){
238 		int len=0;
239 		long kmer=0;
240 		byte[] bases=r.bases;
241 		byte[] quals=r.quality;
242 		for(int i=0; i<bases.length; i++){
243 			byte b=bases[i];
244 			int x=AminoAcid.baseToNumber[b];
245 			if(x<0 || (quals!=null && quals[i]<minQuality)){
246 				len=0;
247 				kmer=0;
248 			}else{
249 				kmer=((kmer<<2)|x)&mask;
250 				len++;
251 				if(len>=k){
252 					keysCounted++;
253 //					System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer));
254 					count.increment(kmer, 1);
255 //					System.out.println(" -> "+count.read(kmer));
256 //					System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]);
257 //					array[(int)kmer]++;
258 //					System.out.println(" -> "+array[(int)kmer]+"\n");
259 //					assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3);
260 				}
261 			}
262 		}
263 		if(rcomp){
264 			r.reverseComplement();
265 			addRead(r, count, k, mask, false);
266 		}
267 	}
268 
addReadSplit(final Read r, final KCountArray2 count, final int k1, final int k2, final long mask1, final long mask2, final int gap, boolean rcomp)269 	public static void addReadSplit(final Read r, final KCountArray2 count, final int k1, final int k2, final long mask1, final long mask2, final int gap, boolean rcomp){
270 		int len=0;
271 		int shift=k2*2;
272 		long kmer1=0;
273 		long kmer2=0;
274 		byte[] bases=r.bases;
275 		byte[] quals=r.quality;
276 
277 		assert(kmer1>=kmer2);
278 
279 //		assert(false) : k1+", "+k2+", "+mask1+", "+mask2+", "+gap;
280 
281 		for(int i=0, j=i+k1+gap; j<bases.length; i++, j++){
282 			int x1=AminoAcid.baseToNumber[bases[i]];
283 			int x2=AminoAcid.baseToNumber[bases[j]];
284 			if(x1<0 || x2<0 || (quals!=null && (quals[i]<minQuality || quals[j]<minQuality))){
285 				len=0;
286 				kmer1=0;
287 				kmer2=0;
288 			}else{
289 				kmer1=((kmer1<<2)|x1)&mask1;
290 				kmer2=((kmer2<<2)|x2)&mask2;
291 				len++;
292 				if(len>=k1){
293 					keysCounted++;
294 //					System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer));
295 
296 					long key=(kmer1<<shift)|kmer2;
297 //					System.err.println(Long.toHexString(key));
298 					count.increment(key, 1);
299 //					System.out.println(" -> "+count.read(kmer));
300 //					System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]);
301 //					array[(int)kmer]++;
302 //					System.out.println(" -> "+array[(int)kmer]+"\n");
303 //					assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3);
304 				}
305 			}
306 		}
307 		if(rcomp){
308 			r.reverseComplement();
309 			addReadSplit(r, count, k1, k2, mask1, mask2, gap, false);
310 		}
311 	}
312 
addReadSplit(final byte[] bases, final KCountArray2 count, final int k1, final int k2, final long mask1, final long mask2, final int gap, boolean rcomp)313 	public static void addReadSplit(final byte[] bases, final KCountArray2 count, final int k1, final int k2, final long mask1, final long mask2, final int gap, boolean rcomp){
314 		int len=0;
315 		int shift=k2*2;
316 		long kmer1=0;
317 		long kmer2=0;
318 		byte[] quals=null;
319 
320 		assert(kmer1>=kmer2);
321 
322 //		assert(false) : k1+", "+k2+", "+mask1+", "+mask2+", "+gap;
323 
324 		for(int i=0, j=i+k1+gap; j<bases.length; i++, j++){
325 			int x1=AminoAcid.baseToNumber[bases[i]];
326 			int x2=AminoAcid.baseToNumber[bases[j]];
327 			if(x1<0 || x2<0 || (quals!=null && (quals[i]<minQuality || quals[j]<minQuality))){
328 				len=0;
329 				kmer1=0;
330 				kmer2=0;
331 			}else{
332 				kmer1=((kmer1<<2)|x1)&mask1;
333 				kmer2=((kmer2<<2)|x2)&mask2;
334 				len++;
335 				if(len>=k1){
336 					keysCounted++;
337 //					System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer));
338 
339 					long key=(kmer1<<shift)|kmer2;
340 					System.out.println(Long.toHexString(kmer1));
341 					System.out.println(Long.toHexString(kmer2));
342 					System.out.println(Long.toHexString(key));
343 					count.increment(key, 1);
344 //					System.out.println(" -> "+count.read(kmer));
345 //					System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]);
346 //					array[(int)kmer]++;
347 //					System.out.println(" -> "+array[(int)kmer]+"\n");
348 //					assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3);
349 				}
350 			}
351 		}
352 		if(rcomp){
353 			AminoAcid.reverseComplementBasesInPlace(bases);
354 			addReadSplit(bases, count, k1, k2, mask1, mask2, gap, false);
355 		}
356 	}
357 
358 }
359