1 package bloom; 2 3 import java.util.ArrayList; 4 import java.util.Locale; 5 6 import dna.AminoAcid; 7 import fileIO.FileFormat; 8 import shared.Timer; 9 import stream.ConcurrentReadInputStream; 10 import stream.FastaReadInputStream; 11 import stream.Read; 12 import structures.ListNum; 13 14 /** 15 * @author Brian Bushnell 16 * @date Jul 5, 2012 17 * 18 */ 19 public class KmerCount4 extends KmerCountAbstract { 20 main(String[] args)21 public static void main(String[] args){ 22 23 Timer t=new Timer(); 24 25 String fname1=args[0]; 26 String fname2=(args.length>3 || args[1].contains(".") ? args[1] : null); 27 int k=14; 28 int cbits=16; 29 int gap=0; 30 31 for(int i=(fname2==null ? 1 : 2); i<args.length; i++){ 32 final String arg=args[i]; 33 final String[] split=arg.split("="); 34 String a=split[0].toLowerCase(); 35 String b=split.length>1 ? split[1] : null; 36 37 if(a.equals("k") || a.equals("kmer")){ 38 k=Integer.parseInt(b); 39 }else if(a.startsWith("cbits") || a.startsWith("cellbits")){ 40 cbits=Integer.parseInt(b); 41 }else if(a.startsWith("gap")){ 42 gap=Integer.parseInt(b); 43 }else{ 44 throw new RuntimeException("Unknown parameter "+args[i]); 45 } 46 } 47 48 KCountArray2 count=null; 49 50 if(fileIO.FileFormat.hasFastaExtension(fname1)){ 51 assert(!FastaReadInputStream.SPLIT_READS); 52 FastaReadInputStream.MIN_READ_LEN=k; 53 } 54 55 if(gap==0){ 56 count=count(fname1, fname2, k, cbits, true); 57 }else{ 58 count=countFastqSplit(fname1, fname2, (k+1)/2, k/2, gap, cbits, true, null); 59 } 60 61 62 t.stop(); 63 System.out.println("Finished counting; time = "+t); 64 65 printStatistics(count); 66 67 } 68 printStatistics(KCountArray2 count)69 public static void printStatistics(KCountArray2 count){ 70 long[] freq=count.transformToFrequency(); 71 72 // System.out.println(count+"\n"); 73 // System.out.println(Arrays.toString(freq)+"\n"); 74 75 long sum=sum(freq); 76 System.out.println("Kmer fraction:"); 77 int lim1=8, lim2=16; 78 for(int i=0; i<lim1; i++){ 79 String prefix=i+""; 80 while(prefix.length()<8){prefix=prefix+" ";} 81 System.out.println(prefix+"\t"+String.format(Locale.ROOT, "%.3f%% ",(100l*freq[i]/(double)sum))+"\t"+freq[i]); 82 } 83 while(lim1<=freq.length){ 84 int x=0; 85 for(int i=lim1; i<lim2; i++){ 86 x+=freq[i]; 87 } 88 String prefix=lim1+"-"+(lim2-1); 89 if(lim2>=freq.length){prefix=lim1+"+";} 90 while(prefix.length()<8){prefix=prefix+" ";} 91 System.out.println(prefix+"\t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum))+"\t"+x); 92 lim1*=2; 93 lim2=min(lim2*2, freq.length); 94 } 95 96 long sum2=sum-freq[0]; 97 long x=freq[1]; 98 System.out.println(); 99 System.out.println("Keys Counted: \t \t"+keysCounted); 100 System.out.println("Unique: \t \t"+sum2); 101 System.out.println("Avg Sites/Key: \t \t"+String.format(Locale.ROOT, "%.3f ",(keysCounted*1d/sum2))); 102 System.out.println(); 103 System.out.println("Singleton: \t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum2))+"\t"+x); 104 x=sum2-x; 105 System.out.println("Useful: \t"+String.format(Locale.ROOT, "%.3f%% ",(100l*x/(double)sum2))+"\t"+x); 106 } 107 count(String reads1, String reads2, int k, int cbits, boolean rcomp)108 public static KCountArray2 count(String reads1, String reads2, int k, int cbits, boolean rcomp){ 109 return count(reads1, reads2, k, cbits, rcomp, null); 110 } 111 count(String reads1, String reads2, int k, int cbits, boolean rcomp, KCountArray2 count)112 public static KCountArray2 count(String reads1, String reads2, int k, int cbits, boolean rcomp, KCountArray2 count){ 113 assert(k>=1 && k<20); 114 final int kbits=2*k; 115 final long mask=(kbits>63 ? -1L : ~((-1L)<<kbits)); 116 117 if(count==null){ 118 final long cells=1L<<kbits; 119 if(verbose){System.err.println("k="+k+", kbits="+kbits+", cells="+cells+", mask="+Long.toHexString(mask));} 120 count=new KCountArray2(cells, cbits); 121 } 122 123 final ConcurrentReadInputStream cris; 124 { 125 FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true); 126 FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true); 127 cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2); 128 cris.start(); //4567 129 } 130 131 assert(cris!=null) : reads1; 132 System.err.println("Started cris"); 133 boolean paired=cris.paired(); 134 if(verbose){System.err.println("Paired: "+paired);} 135 136 ListNum<Read> ln=cris.nextList(); 137 ArrayList<Read> reads=(ln!=null ? ln.list : null); 138 139 if(reads!=null && !reads.isEmpty()){ 140 Read r=reads.get(0); 141 assert(paired==(r.mate!=null)); 142 } 143 144 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning 145 //System.err.println("reads.size()="+reads.size()); 146 for(Read r : reads){ 147 readsProcessed++; 148 149 addRead(r, count, k, mask, rcomp); 150 if(r.mate!=null){ 151 addRead(r.mate, count, k, mask, rcomp); 152 } 153 154 } 155 //System.err.println("returning list"); 156 cris.returnList(ln); 157 //System.err.println("fetching list"); 158 ln=cris.nextList(); 159 reads=(ln!=null ? ln.list : null); 160 } 161 if(verbose){System.err.println("Finished reading");} 162 cris.returnList(ln); 163 if(verbose){System.err.println("Returned list");} 164 cris.close(); 165 if(verbose){System.err.println("Closed stream");} 166 if(verbose){System.err.println("Processed "+readsProcessed+" reads.");} 167 168 169 return count; 170 } 171 countFastqSplit(String reads1, String reads2, int k1, int k2, int gap, int cbits, boolean rcomp, KCountArray2 count)172 public static KCountArray2 countFastqSplit(String reads1, String reads2, int k1, int k2, int gap, int cbits, boolean rcomp, KCountArray2 count){ 173 assert(k1+k2>=1 && k1+k2<20); 174 assert(gap>=0); 175 final int kbits1=2*k1; 176 final int kbits2=2*k2; 177 final long mask1=~((-1L)<<(kbits1)); 178 final long mask2=~((-1L)<<(kbits2)); 179 180 if(count==null){ 181 final long cells=1L<<(kbits1+kbits2); 182 if(verbose){System.err.println("k1="+k1+", k2="+k2+", kbits1="+kbits1+", kbits2="+kbits2+", cells="+cells+ 183 ", mask1="+Long.toHexString(mask1)+", mask2="+Long.toHexString(mask2));} 184 count=new KCountArray2(cells, cbits, gap); 185 } 186 assert(count.gap==gap); 187 188 final ConcurrentReadInputStream cris; 189 { 190 FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true); 191 FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true); 192 cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ff1, ff2); 193 cris.start(); //4567 194 } 195 196 assert(cris!=null) : reads1; 197 System.err.println("Started cris"); 198 boolean paired=cris.paired(); 199 if(verbose){System.err.println("Paired: "+paired);} 200 201 ListNum<Read> ln=cris.nextList(); 202 ArrayList<Read> reads=(ln!=null ? ln.list : null); 203 204 if(reads!=null && !reads.isEmpty()){ 205 Read r=reads.get(0); 206 assert(paired==(r.mate!=null)); 207 } 208 209 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning 210 //System.err.println("reads.size()="+reads.size()); 211 for(Read r : reads){ 212 readsProcessed++; 213 214 addReadSplit(r, count, k1, k2, mask1, mask2, gap, rcomp); 215 if(r.mate!=null){ 216 addReadSplit(r.mate, count, k1, k2, mask1, mask2, gap, rcomp); 217 } 218 219 } 220 //System.err.println("returning list"); 221 cris.returnList(ln); 222 //System.err.println("fetching list"); 223 ln=cris.nextList(); 224 reads=(ln!=null ? ln.list : null); 225 } 226 if(verbose){System.err.println("Finished reading");} 227 cris.returnList(ln); 228 if(verbose){System.err.println("Returned list");} 229 cris.close(); 230 if(verbose){System.err.println("Closed stream");} 231 if(verbose){System.err.println("Processed "+readsProcessed+" reads.");} 232 233 234 return count; 235 } 236 addRead(final Read r, final KCountArray2 count, final int k, final long mask, boolean rcomp)237 public static void addRead(final Read r, final KCountArray2 count, final int k, final long mask, boolean rcomp){ 238 int len=0; 239 long kmer=0; 240 byte[] bases=r.bases; 241 byte[] quals=r.quality; 242 for(int i=0; i<bases.length; i++){ 243 byte b=bases[i]; 244 int x=AminoAcid.baseToNumber[b]; 245 if(x<0 || (quals!=null && quals[i]<minQuality)){ 246 len=0; 247 kmer=0; 248 }else{ 249 kmer=((kmer<<2)|x)&mask; 250 len++; 251 if(len>=k){ 252 keysCounted++; 253 // System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer)); 254 count.increment(kmer, 1); 255 // System.out.println(" -> "+count.read(kmer)); 256 // System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]); 257 // array[(int)kmer]++; 258 // System.out.println(" -> "+array[(int)kmer]+"\n"); 259 // assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3); 260 } 261 } 262 } 263 if(rcomp){ 264 r.reverseComplement(); 265 addRead(r, count, k, mask, false); 266 } 267 } 268 addReadSplit(final Read r, final KCountArray2 count, final int k1, final int k2, final long mask1, final long mask2, final int gap, boolean rcomp)269 public static void addReadSplit(final Read r, final KCountArray2 count, final int k1, final int k2, final long mask1, final long mask2, final int gap, boolean rcomp){ 270 int len=0; 271 int shift=k2*2; 272 long kmer1=0; 273 long kmer2=0; 274 byte[] bases=r.bases; 275 byte[] quals=r.quality; 276 277 assert(kmer1>=kmer2); 278 279 // assert(false) : k1+", "+k2+", "+mask1+", "+mask2+", "+gap; 280 281 for(int i=0, j=i+k1+gap; j<bases.length; i++, j++){ 282 int x1=AminoAcid.baseToNumber[bases[i]]; 283 int x2=AminoAcid.baseToNumber[bases[j]]; 284 if(x1<0 || x2<0 || (quals!=null && (quals[i]<minQuality || quals[j]<minQuality))){ 285 len=0; 286 kmer1=0; 287 kmer2=0; 288 }else{ 289 kmer1=((kmer1<<2)|x1)&mask1; 290 kmer2=((kmer2<<2)|x2)&mask2; 291 len++; 292 if(len>=k1){ 293 keysCounted++; 294 // System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer)); 295 296 long key=(kmer1<<shift)|kmer2; 297 // System.err.println(Long.toHexString(key)); 298 count.increment(key, 1); 299 // System.out.println(" -> "+count.read(kmer)); 300 // System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]); 301 // array[(int)kmer]++; 302 // System.out.println(" -> "+array[(int)kmer]+"\n"); 303 // assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3); 304 } 305 } 306 } 307 if(rcomp){ 308 r.reverseComplement(); 309 addReadSplit(r, count, k1, k2, mask1, mask2, gap, false); 310 } 311 } 312 addReadSplit(final byte[] bases, final KCountArray2 count, final int k1, final int k2, final long mask1, final long mask2, final int gap, boolean rcomp)313 public static void addReadSplit(final byte[] bases, final KCountArray2 count, final int k1, final int k2, final long mask1, final long mask2, final int gap, boolean rcomp){ 314 int len=0; 315 int shift=k2*2; 316 long kmer1=0; 317 long kmer2=0; 318 byte[] quals=null; 319 320 assert(kmer1>=kmer2); 321 322 // assert(false) : k1+", "+k2+", "+mask1+", "+mask2+", "+gap; 323 324 for(int i=0, j=i+k1+gap; j<bases.length; i++, j++){ 325 int x1=AminoAcid.baseToNumber[bases[i]]; 326 int x2=AminoAcid.baseToNumber[bases[j]]; 327 if(x1<0 || x2<0 || (quals!=null && (quals[i]<minQuality || quals[j]<minQuality))){ 328 len=0; 329 kmer1=0; 330 kmer2=0; 331 }else{ 332 kmer1=((kmer1<<2)|x1)&mask1; 333 kmer2=((kmer2<<2)|x2)&mask2; 334 len++; 335 if(len>=k1){ 336 keysCounted++; 337 // System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer)); 338 339 long key=(kmer1<<shift)|kmer2; 340 System.out.println(Long.toHexString(kmer1)); 341 System.out.println(Long.toHexString(kmer2)); 342 System.out.println(Long.toHexString(key)); 343 count.increment(key, 1); 344 // System.out.println(" -> "+count.read(kmer)); 345 // System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]); 346 // array[(int)kmer]++; 347 // System.out.println(" -> "+array[(int)kmer]+"\n"); 348 // assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3); 349 } 350 } 351 } 352 if(rcomp){ 353 AminoAcid.reverseComplementBasesInPlace(bases); 354 addReadSplit(bases, count, k1, k2, mask1, mask2, gap, false); 355 } 356 } 357 358 } 359