1 /* 2 * To change this license header, choose License Headers in Project Properties. 3 * To change this template file, choose Tools | Templates 4 * and open the template in the editor. 5 */ 6 package jgi; 7 8 import static jgi.Dedupe.maxNmer; 9 import static jgi.Dedupe.nmerIndex; 10 11 import java.util.ArrayList; 12 import java.util.Arrays; 13 import java.util.List; 14 import java.util.Locale; 15 import java.util.concurrent.ArrayBlockingQueue; 16 17 import dna.AminoAcid; 18 import fileIO.ByteStreamWriter; 19 import fileIO.FileFormat; 20 import fileIO.ReadWrite; 21 import shared.Parse; 22 import shared.Parser; 23 import shared.PreParser; 24 import shared.Shared; 25 import shared.Timer; 26 import shared.Tools; 27 import stream.ConcurrentReadInputStream; 28 import stream.Read; 29 import structures.ByteBuilder; 30 import structures.ListNum; 31 32 /** 33 * 34 * @author syao 35 * Last updated : 01102018 36 */ 37 public class TetramerFrequencies { main(String[] args)38 public static void main(String[] args){ 39 40 System.out.println("Start Tetramer Frequencies analysis ..."); 41 42 final int oldThreads=Shared.threads(); 43 Shared.capThreads(16); 44 45 Timer t=new Timer(); 46 47 //Create an instance of this class 48 TetramerFrequencies x=new TetramerFrequencies(args); 49 50 //Run the object 51 x.process(t); 52 53 //Close the print stream if it was redirected 54 Shared.closeStream(x.outstream); 55 56 Shared.setThreads(oldThreads); 57 } 58 TetramerFrequencies(String[] args)59 public TetramerFrequencies(String[] args){ 60 61 {//Preparse block for help, config files, and outstream 62 PreParser pp=new PreParser(args, getClass(), false); 63 args=pp.args; 64 outstream=pp.outstream; 65 } 66 67 int k_=4; 68 Parser parser=new Parser(); 69 for (String arg : args) { 70 String[] split=arg.split("="); 71 String a=split[0].toLowerCase(); 72 String b=split.length>1 ? split[1] : null; 73 74 if (a.equals("s") || a.equals("step")){ 75 step = Integer.parseInt(b); 76 } else if (a.equals("w") || a.equals("window")){ 77 winSize = Integer.parseInt(b); 78 } else if (a.equals("out") || a.equals("freq")){ 79 out1 = b; 80 } else if (a.equals("dropshort")){ 81 keepShort=!Parse.parseBoolean(b); 82 } else if (a.equals("keepshort") || a.equals("short")){ 83 keepShort=Parse.parseBoolean(b); 84 } else if (a.equals("k")){ 85 k_ = Integer.parseInt(b); 86 } else if(parser.parse(arg, a, b)){ 87 //do nothing 88 } else { 89 throw new RuntimeException("Unknown argument "+arg); 90 } 91 } 92 93 {//Process parser fields 94 Parser.processQuality(); 95 96 maxReads=parser.maxReads; 97 in1=parser.in1; 98 } 99 100 k=k_; 101 ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, null, true, true); 102 assert(ffin1!=null) : "No input file."; 103 assert(ffin1.exists() && ffin1.canRead()) : "Cannot read input file "+in1+"."; 104 105 106 // initialize the class member textstream writer here, so no need of keep results in memory 107 if (out1==null || out1.equals("")){ 108 out1 = "stdout"; 109 } 110 111 threads=Tools.max(1, Shared.threads()); 112 inq=new ArrayBlockingQueue<Line>(threads+1); 113 114 FileFormat ff=FileFormat.testOutput(out1, FileFormat.TEXT, null, true, true, false, true); 115 bsw = new ByteStreamWriter(ff); 116 bsw.start(); 117 118 //create and write the output header 119 List<String> head = new ArrayList<String>(); 120 head.add("scaffold"); 121 head.add("range"); 122 123 // the unit tetramer strings in header 124 tetramerGen2(head); 125 bsw.add(new ByteBuilder(String.join("\t", head)).append('\n'), nextID); 126 nextID++; 127 } 128 process(Timer t)129 void process(Timer t){ 130 131 ArrayList<PrintThread> alpt=spawnThreads(); 132 133 final ConcurrentReadInputStream cris; 134 { 135 cris=ConcurrentReadInputStream.getReadInputStream(maxReads, true, ffin1, null); 136 cris.start(); 137 } 138 139 long readsProcessed=0; 140 { 141 ListNum<Read> ln=cris.nextList(); 142 ArrayList<Read> reads=(ln!=null ? ln.list : null); 143 144 if(reads!=null && !reads.isEmpty()){ 145 Read r=reads.get(0); 146 assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired()); 147 } 148 149 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning 150 if(verbose){outstream.println("Fetched "+reads.size()+" reads.");} 151 152 for (Read r1 : reads){ 153 windowedTetramerProfile(r1.bases, r1.id); 154 readsProcessed++; 155 } 156 157 cris.returnList(ln); 158 if(verbose){ 159 outstream.println("Returned a list."); 160 } 161 ln=cris.nextList(); 162 reads=(ln!=null ? ln.list : null); 163 } 164 165 if(ln!=null){ 166 cris.returnList(ln.id, ln.list==null || ln.list.isEmpty()); 167 } 168 } 169 170 putLine(POISON_LINE); 171 waitForFinish(alpt); 172 173 // close the output file 174 bsw.poisonAndWait(); 175 176 ReadWrite.closeStream(cris); 177 if(verbose){outstream.println("Finished.");} 178 179 t.stop(); 180 outstream.println("Time: \t"+t); 181 outstream.println("Reads Processed: "+readsProcessed+" \t"+String.format(Locale.ROOT, "%.2fk reads/sec", (readsProcessed/(double)(t.elapsed))*1000000)); 182 } 183 184 // TODO: optimize the algorithm ... windowedTetramerProfileOpt(byte[] bases, String header)185 private void windowedTetramerProfileOpt(byte[] bases, String header){ 186 int sidx = 0; 187 int eidx = winSize; 188 189 final int[] counts = new int[maxNmer+1]; 190 191 int leadlen = 0; 192 int leadtmer = -1; 193 int endlen = 0; 194 int endtmer = -1; 195 196 int tmer = 0; 197 int len = 0; 198 199 for (int i=0; i<= eidx; i++){ 200 int x = AminoAcid.baseToNumber[bases[i]]; 201 if (x < 0){ 202 tmer = 0; 203 len = 0; 204 } else { 205 tmer = (leadtmer<<2)|x ; // this can produce -1 vaue if any base in tetramer is N! 206 len++; 207 if (len >= k){ 208 int idx = tmerIndex(tmer); 209 counts[idx]++; 210 211 if( leadtmer==-1){ 212 leadtmer = tmer; 213 } else if(i == eidx){ 214 215 } 216 217 218 len--; 219 } 220 } 221 } 222 223 224 225 } 226 windowedTetramerProfile(byte[] bases, String header)227 private void windowedTetramerProfile(byte[] bases, String header){ 228 int sidx = 0; 229 int eidx = winSize<1 ? bases.length : Tools.min(bases.length, winSize); 230 231 while (eidx <= bases.length){ 232 Line line=new Line(header, bases, sidx, eidx, nextID); 233 putLine(line); 234 sidx+=windowsPerLine*step; 235 eidx+=windowsPerLine*step; 236 nextID++; 237 } 238 } 239 240 //Slow version 241 void append(Line line, int[] counts, StringBuilder sb){ 242 sb.append(line.header); 243 sb.append('\t'); 244 sb.append(line.sidx+1); 245 sb.append('-'); 246 sb.append(line.eidx); 247 float mult=1f/(line.length()-k+1); 248 for (int cnt: counts){ 249 sb.append('\t'); 250 sb.append(String.format(Locale.ROOT, "%.4f", cnt*mult)); 251 } 252 sb.append('\n'); 253 } 254 255 void append(Line line, int[] counts, ByteBuilder bb){ 256 bb.append(line.header); 257 bb.tab(); 258 bb.append(line.sidx+1); 259 bb.append('-'); 260 bb.append(line.eidx); 261 for (int cnt: counts){ 262 bb.tab(); 263 bb.append(cnt); 264 } 265 bb.nl(); 266 } 267 268 // factor this out so we can work on reads 269 public int[] tetramerCounter(byte[] bases, int startidx, int endidx, int[] counts){ 270 if (counts == null){ 271 counts = new int[maxNmer+1]; 272 } 273 274 for (int i=startidx; i<=endidx-k; i++){ 275 int kmer = 0; // the binary representation of the tetramer in the sliding window 276 for (int j=0; j<k; j++){ 277 int x = AminoAcid.baseToNumber[bases[i+j]]; 278 kmer= (kmer<<2)|x ; // this can produce -1 vaue if any base in tetramer is N! 279 } 280 281 // ignore tetramers containing "N" base 282 if (kmer > -1){ 283 int idx = tmerIndex(kmer); 284 counts[idx]++; 285 } 286 } 287 288 return counts; 289 } 290 291 private int tmerIndex(int tmer){ 292 int rtmer = AminoAcid.reverseComplementBinaryFast(tmer, k); 293 int mtmer = Tools.min(tmer, rtmer); 294 int idx = -1; 295 try{ 296 idx = nmerIndex[rtmer]; 297 } catch (ArrayIndexOutOfBoundsException e){ 298 System.out.printf("ArrayIndexOutOfBoundsException tmer=%d: rtmer=%d; \n", tmer, rtmer); 299 System.exit(-1); 300 } 301 return idx; 302 } 303 304 305 public List<String> tetramerGen(List<String> tlist){ 306 if (tlist == null){ 307 tlist = new ArrayList<String>(); 308 } 309 310 int[] bases = {0, 1, 2, 3}; // A C G T 311 312 for (int b1 : bases){ 313 for (int b2 : bases){ 314 for (int b3 : bases){ 315 for (int b4 : bases){ 316 int tcode = b1; 317 tcode = (tcode<<2)|b2; 318 tcode = (tcode<<2)|b3; 319 tcode = (tcode<<2)|b4; 320 String tmer = AminoAcid.kmerToString(tcode, k).toLowerCase(); 321 String rtmer = AminoAcid.kmerToString(AminoAcid.reverseComplementBinaryFast(tcode, k), k).toLowerCase(); 322 323 if (!tlist.contains(rtmer)){ 324 tlist.add(tmer); 325 } 326 } 327 } 328 } 329 } 330 // total of 136 unique tetramers 331 return tlist; 332 } 333 334 public List<String> tetramerGen2(List<String> tlist){ 335 if (tlist == null){ 336 tlist = new ArrayList<String>(); 337 } 338 339 final int max=(1<<(2*k))-1; 340 341 for(int i=0; i<=max; i++){ 342 final int a=i, b=AminoAcid.reverseComplementBinaryFast(i, k); 343 int min=Tools.min(a, b); 344 if(min==a){ 345 tlist.add(AminoAcid.kmerToString(a, k).toLowerCase()); 346 } 347 } 348 return tlist; 349 } 350 351 public void printTetramerFromCode(long code){ 352 System.out.println(AminoAcid.kmerToString(code, k)); 353 } 354 355 // public void printTetramers(byte[] bases, String header){ 356 // int sidx = 0; 357 // int eidx = winSize; 358 // 359 // while (true){ 360 // if (eidx > bases.length){ 361 // break; 362 // } 363 // System.out.printf("%s %d-%d\n", header, sidx+1, eidx); 364 // 365 // // count tetramer here 366 // for (int i=sidx; i<eidx-k; i++){ 367 // char[] tetramer = new char[k]; 368 // for (int j=0; j<k; j++){ 369 // tetramer[j] = (char)bases[i+j]; 370 // } 371 // System.out.println(new String(tetramer)); 372 // } 373 // 374 // sidx += step; 375 // eidx += step; 376 // } 377 // } 378 379 public static void printHelp(){ 380 List<String> helplist = new ArrayList<String>(); 381 helplist.add("Program Name : TetramerFrequencies v1.1"); 382 helplist.add("Usage : "); 383 helplist.add(" -h : this page"); 384 helplist.add(" -s : step [500]"); 385 helplist.add(" -w : window size [2000]. If set to 0 the whole sequence is processed"); 386 System.out.println(String.join("\n", helplist)); 387 } 388 389 /*--------------------------------------------------------------*/ 390 391 final Line takeLine(){ 392 Line line=null; 393 while(line==null){ 394 try { 395 line=inq.take(); 396 } catch (InterruptedException e) { 397 // TODO Auto-generated catch block 398 e.printStackTrace(); 399 } 400 } 401 // System.err.println("takeLine("+line.id+")"); 402 return line; 403 } 404 405 final void putLine(Line line){ 406 // System.err.println("putLine("+line.id+")"); 407 while(line!=null){ 408 try { 409 inq.put(line); 410 line=null; 411 } catch (InterruptedException e) { 412 // TODO Auto-generated catch block 413 e.printStackTrace(); 414 } 415 } 416 } 417 418 /** Spawn process threads */ 419 private ArrayList<PrintThread> spawnThreads(){ 420 421 //Do anything necessary prior to processing 422 423 //Fill a list with PrintThreads 424 ArrayList<PrintThread> alpt=new ArrayList<PrintThread>(threads); 425 for(int i=0; i<threads; i++){ 426 alpt.add(new PrintThread()); 427 } 428 if(verbose){outstream.println("Spawned threads.");} 429 430 //Start the threads 431 for(PrintThread pt : alpt){ 432 pt.start(); 433 } 434 if(verbose){outstream.println("Started threads.");} 435 436 //Do anything necessary after processing 437 return alpt; 438 } 439 440 private void waitForFinish(ArrayList<PrintThread> alpt){ 441 //Wait for completion of all threads 442 boolean allSuccess=true; 443 for(PrintThread pt : alpt){ 444 while(pt.getState()!=Thread.State.TERMINATED){ 445 try { 446 //Attempt a join operation 447 pt.join(); 448 } catch (InterruptedException e) { 449 //Potentially handle this, if it is expected to occur 450 e.printStackTrace(); 451 } 452 } 453 } 454 } 455 456 private class PrintThread extends Thread { 457 458 PrintThread(){} 459 460 @Override 461 public void run(){ 462 Line line=takeLine(); 463 while(line!=null && line!=POISON_LINE){ 464 processLine(line); 465 line=takeLine(); 466 } 467 putLine(POISON_LINE); 468 } 469 470 private void processLine(Line line){ 471 ByteBuilder bb=new ByteBuilder(512); 472 for(int i=0; i<windowsPerLine && line.eidx<=line.bases.length; i++){ 473 Arrays.fill(counts, 0); 474 tetramerCounter(line.bases, line.sidx, line.eidx, counts); 475 if(keepShort || line.length()>=winSize){ 476 append(line, counts, bb); 477 } 478 line.sidx+=step; 479 line.eidx+=step; 480 } 481 bsw.add(bb, line.id); 482 } 483 484 private final int[] counts=new int[maxNmer+1]; 485 } 486 487 private class Line { 488 489 Line(String header_, byte[] bases_, int sidx_, int eidx_, long id_){ 490 header=header_; 491 bases=bases_; 492 sidx=sidx_; 493 eidx=eidx_; 494 id=id_; 495 } 496 497 public int length() { 498 return eidx-sidx+1; 499 } 500 501 final String header; 502 final byte[] bases; 503 int sidx; 504 int eidx; 505 final long id; 506 507 } 508 509 /*--------------------------------------------------------------*/ 510 511 final Line POISON_LINE=new Line(null, null, -1, -1, -1); 512 private final ArrayBlockingQueue<Line> inq; 513 private final int threads; 514 private long nextID=0; 515 516 /*--------------------------------------------------------------*/ 517 518 private String in1 = null; 519 private String out1 = null; 520 private ByteStreamWriter bsw = null; // for output 521 522 private final FileFormat ffin1; 523 524 private java.io.PrintStream outstream=System.err; 525 526 /*--------------------------------------------------------------*/ 527 528 private long maxReads=-1; 529 int step = 500; 530 private int winSize = 2000; 531 private final int k; 532 private boolean keepShort=false; 533 534 /*--------------------------------------------------------------*/ 535 536 private final static int windowsPerLine=8; 537 public static boolean verbose=false; 538 } 539