1 package jgi; 2 3 import java.util.ArrayList; 4 import java.util.HashMap; 5 import java.util.LinkedHashMap; 6 import java.util.PriorityQueue; 7 8 import shared.Parse; 9 import shared.Shared; 10 import shared.Timer; 11 import shared.Tools; 12 import stream.ConcurrentReadInputStream; 13 import stream.ConcurrentReadOutputStream; 14 import stream.Read; 15 import stream.ReadStreamWriter; 16 import stream.SamLine; 17 import structures.ListNum; 18 import template.BBTool_ST; 19 20 /** 21 * @author Brian Bushnell 22 * @date Jan 30, 2015 23 * 24 */ 25 public class DedupeByMapping extends BBTool_ST{ 26 main(String[] args)27 public static void main(String[] args){ 28 Timer t=new Timer(); 29 DedupeByMapping bbt=new DedupeByMapping(args); 30 bbt.process(t); 31 } 32 33 /** 34 * @param args 35 */ DedupeByMapping(String[] args)36 public DedupeByMapping(String[] args) { 37 super(args); 38 reparse(args); 39 SamLine.SET_FROM_OK=true; 40 ReadStreamWriter.USE_ATTACHED_SAMLINE=true; 41 if(sorted){queue=new PriorityQueue<Quad>(initialSize);} 42 } 43 44 /* (non-Javadoc) 45 * @see jgi.BBTool_ST#setDefaults() 46 */ 47 @Override setDefaults()48 protected void setDefaults() { 49 keepUnmapped=true; 50 keepSingletons=true; 51 sorted=false; 52 usePairOrder=true; 53 } 54 55 @Override parseArgument(String arg, String a, String b)56 public boolean parseArgument(String arg, String a, String b) { 57 if(a.equals("keepunmapped") | a.equals("ku")){ 58 keepUnmapped=Parse.parseBoolean(b); 59 return true; 60 }else if(a.equals("keepsingletons") | a.equals("ks")){ 61 keepSingletons=Parse.parseBoolean(b); 62 return true; 63 }else if(a.equals("ignorepairorder") | a.equals("ipo")){ 64 usePairOrder=!Parse.parseBoolean(b); 65 return true; 66 }else if(a.equals("sorted")){ 67 sorted=Parse.parseBoolean(b); 68 return true; 69 } 70 return false; 71 } 72 73 @Override useSharedHeader()74 protected final boolean useSharedHeader(){return true;} 75 76 @Override processReadPair(Read r1, Read r2)77 protected boolean processReadPair(Read r1, Read r2) { 78 assert(r2==null); 79 return (sorted ? processReadPair_sorted(r1) : processReadPair_unsorted(r1)); 80 } 81 processReadPair_unsorted(Read r1)82 boolean processReadPair_unsorted(Read r1) { 83 SamLine sl=r1.samline; 84 if(!sl.primary()){return false;} 85 if(sl.mapped()){ 86 String rname=new String(sl.rname()); 87 Integer x=contigToNumber.get(rname); 88 if(x==null){ 89 x=contigToNumber.size(); 90 contigToNumber.put(rname, x); 91 } 92 r1.chrom=x; 93 r1.start=sl.start(true, false); 94 r1.stop=sl.stop(r1.start, true, false); 95 r1.setStrand(sl.strand()); 96 }else{ 97 r1.chrom=-1; 98 r1.start=-1; 99 } 100 101 Read old=nameToRead.get(r1.id); 102 if(old==null){ 103 nameToRead.put(r1.id, r1); 104 }else{ 105 //assert(old.mate==null);//This triggers... maybe, after filtering and leaving some unpaired reads 106 old.mate=r1; 107 r1.mate=old; 108 SamLine sl2=old.samline; 109 if(sl2.pairnum()==1){ 110 nameToRead.put(r1.id, r1); 111 } 112 } 113 return true; 114 } 115 116 processReadPair_sorted(Read r1)117 boolean processReadPair_sorted(Read r1) { 118 assert(false) : "TODO"; 119 SamLine sl=r1.samline; 120 if(!sl.primary()){return false;} 121 if(sl.mapped()){ 122 String rname=new String(sl.rname()); 123 Integer x=contigToNumber.get(rname); 124 if(x==null){ 125 x=contigToNumber.size(); 126 contigToNumber.put(rname, x); 127 } 128 r1.chrom=x; 129 r1.start=sl.start(true, false); 130 r1.stop=sl.stop(r1.start, true, false); 131 r1.setStrand(sl.strand()); 132 }else{ 133 r1.chrom=-1; 134 r1.start=-1; 135 } 136 137 Read old=nameToRead.get(r1.id); 138 if(old==null){ 139 nameToRead.put(r1.id, r1); 140 }else{ 141 assert(old.mate==null); 142 old.mate=r1; 143 r1.mate=old; 144 SamLine sl2=old.samline; 145 if(sl2.pairnum()==1){ 146 nameToRead.put(r1.id, r1); 147 } 148 } 149 return true; 150 } 151 152 @Override processInner(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream ros)153 protected void processInner(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream ros){ 154 if(sorted){processInner_sorted(cris, ros);} 155 else{processInner_unsorted(cris, ros);} 156 } 157 158 processInner_unsorted(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream ros)159 void processInner_unsorted(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream ros){ 160 161 readsProcessed=0; 162 basesProcessed=0; 163 164 { 165 166 ListNum<Read> ln=cris.nextList(); 167 ArrayList<Read> reads=(ln!=null ? ln.list : null); 168 169 if(reads!=null && !reads.isEmpty()){ 170 Read r=reads.get(0); 171 assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired()); 172 } 173 174 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning 175 if(verbose){outstream.println("Fetched "+reads.size()+" reads.");} 176 177 for(int idx=0; idx<reads.size(); idx++){ 178 final Read r1=reads.get(idx); 179 assert(r1.mate==null); 180 assert(r1.samline!=null); 181 182 final int initialLength1=r1.length(); 183 184 { 185 readsProcessed++; 186 basesProcessed+=initialLength1; 187 } 188 189 processReadPair(r1, null); 190 } 191 192 cris.returnList(ln); 193 if(verbose){outstream.println("Returned a list.");} 194 ln=cris.nextList(); 195 reads=(ln!=null ? ln.list : null); 196 } 197 if(ln!=null){ 198 cris.returnList(ln.id, ln.list==null || ln.list.isEmpty()); 199 } 200 } 201 202 { 203 contigToNumber=null; 204 ArrayList<Read> list=new ArrayList<Read>(nameToRead.size()); 205 for(String key : nameToRead.keySet()){ 206 list.add(nameToRead.get(key)); 207 } 208 nameToRead=null; 209 for(int i=0; i<list.size(); i++){ 210 Read r1=list.set(i, null); 211 212 Read r2=r1.mate; 213 if(!r1.mapped() && !r1.mateMapped()){ 214 unmappedReads+=r1.pairCount(); 215 unmappedBases+=r1.pairLength(); 216 if(keepUnmapped){ 217 retainedReads+=r1.pairCount(); 218 retainedBases+=r1.pairLength(); 219 unmapped.add(r1); 220 } 221 }else if(keepSingletons && r2!=null && (r1.mapped()!=r1.mateMapped())){ 222 retainedReads+=r1.pairCount(); 223 retainedBases+=r1.pairLength(); 224 unmapped.add(r1); 225 }else{ 226 // System.err.println(r1.strandChar()+", "+r1.start+", "+r1.stop+", "+r2.strandChar()+", "+r2.start+", "+r2.stop); 227 Quad q=toQuad(r1, r2); 228 Read old1=quadToRead.get(q); 229 if(old1==null){quadToRead.put(q, r1);} 230 else{ 231 Read old2=old1.mate; 232 float a=(r1.expectedErrors(true, 0)+(r2==null ? 0 : r2.expectedErrors(true, 0)))/r1.pairLength(); 233 float b=old1.expectedErrors(true, 0)+(old2==null ? 0 : old2.expectedErrors(true, 0))/(old1.length()+old1.mateLength()); 234 if(a<b){ 235 quadToRead.put(q, r1); 236 duplicateReads+=1+old1.mateCount(); 237 duplicateBases+=old1.length()+old1.mateLength(); 238 }else{ 239 duplicateReads+=r1.pairCount(); 240 duplicateBases+=r1.pairLength(); 241 } 242 } 243 } 244 } 245 list=null; 246 nameToRead=null; 247 } 248 249 { 250 ArrayList<Read> list=new ArrayList<Read>(Shared.bufferLen()); 251 int num=0; 252 for(Quad q : quadToRead.keySet()){ 253 Read r=quadToRead.get(q); 254 if(keepUnmapped || r.mapped() || (r.mate!=null && r.mate.mapped())){ 255 retainedReads+=1+r.mateCount(); 256 retainedBases+=r.length()+r.mateLength(); 257 list.add(r); 258 if(list.size()>=Shared.bufferLen()){ 259 if(ros!=null){ 260 ros.add(list, num); 261 num++; 262 } 263 list=new ArrayList<Read>(Shared.bufferLen()); 264 } 265 } 266 } 267 if(list.size()>0){ 268 if(ros!=null){ 269 ros.add(list, num); 270 num++; 271 } 272 list=null; 273 } 274 if(ros!=null && unmapped.size()>0){ 275 ros.add(unmapped, num); 276 num++; 277 } 278 } 279 outstream.println("Duplicate reads: "+duplicateReads+" \t("+duplicateBases+" bases)"); 280 outstream.println("Unconsidered reads: "+unmappedReads+" \t("+unmappedBases+" bases)"); 281 outstream.println("Retained reads: "+retainedReads+" \t("+retainedBases+" bases)"); 282 } 283 284 285 processInner_sorted(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream ros)286 void processInner_sorted(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream ros){ 287 assert(false) : "TODO"; 288 readsProcessed=0; 289 basesProcessed=0; 290 291 { 292 293 ListNum<Read> ln=cris.nextList(); 294 ArrayList<Read> reads=(ln!=null ? ln.list : null); 295 296 if(reads!=null && !reads.isEmpty()){ 297 Read r=reads.get(0); 298 assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired()); 299 } 300 301 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning 302 if(verbose){outstream.println("Fetched "+reads.size()+" reads.");} 303 304 for(int idx=0; idx<reads.size(); idx++){ 305 final Read r1=reads.get(idx); 306 assert(r1.mate==null); 307 assert(r1.samline!=null); 308 309 final int initialLength1=r1.length(); 310 311 { 312 readsProcessed++; 313 basesProcessed+=initialLength1; 314 } 315 316 processReadPair(r1, null); 317 } 318 319 cris.returnList(ln); 320 if(verbose){outstream.println("Returned a list.");} 321 ln=cris.nextList(); 322 reads=(ln!=null ? ln.list : null); 323 } 324 if(ln!=null){ 325 cris.returnList(ln.id, ln.list==null || ln.list.isEmpty()); 326 } 327 } 328 329 { 330 contigToNumber=null; 331 ArrayList<Read> list=new ArrayList<Read>(nameToRead.size()); 332 for(String key : nameToRead.keySet()){ 333 list.add(nameToRead.get(key)); 334 } 335 nameToRead=null; 336 for(int i=0; i<list.size(); i++){ 337 Read r1=list.set(i, null); 338 339 Read r2=r1.mate; 340 if(!r1.mapped() && !r1.mateMapped()){ 341 unmappedReads+=r1.pairCount(); 342 unmappedBases+=r1.pairLength(); 343 if(keepUnmapped){unmapped.add(r1);} 344 }else{ 345 Quad q=toQuad(r1, r2); 346 Read old1=quadToRead.get(q); 347 if(old1==null){quadToRead.put(q, r1);} 348 else{ 349 Read old2=old1.mate; 350 float a=(r1.expectedErrors(true, 0)+(r2==null ? 0 : r2.expectedErrors(true, 0)))/r1.pairLength(); 351 float b=old1.expectedErrors(true, 0)+(old2==null ? 0 : old2.expectedErrors(true, 0))/(old1.length()+old1.mateLength()); 352 if(a<b){ 353 quadToRead.put(q, r1); 354 duplicateReads+=1+old1.mateCount(); 355 duplicateBases+=old1.length()+old1.mateLength(); 356 }else{ 357 duplicateReads+=r1.pairCount(); 358 duplicateBases+=r1.pairLength(); 359 } 360 } 361 } 362 } 363 list=null; 364 nameToRead=null; 365 } 366 367 { 368 ArrayList<Read> list=new ArrayList<Read>(Shared.bufferLen()); 369 int num=0; 370 for(Quad q : quadToRead.keySet()){ 371 Read r=quadToRead.get(q); 372 if(keepUnmapped || r.mapped() || (r.mate!=null && r.mate.mapped())){ 373 list.add(r); 374 if(list.size()>=Shared.bufferLen()){ 375 if(ros!=null){ 376 ros.add(list, num); 377 num++; 378 } 379 list=new ArrayList<Read>(Shared.bufferLen()); 380 } 381 } 382 } 383 if(list.size()>0){ 384 if(ros!=null){ 385 ros.add(list, num); 386 num++; 387 } 388 list=null; 389 } 390 if(ros!=null && unmapped.size()>0){ 391 ros.add(unmapped, num); 392 num++; 393 } 394 } 395 outstream.println("Duplicate reads: "+duplicateReads+" \t("+duplicateBases+" bases)"); 396 outstream.println("Unmapped reads: "+unmappedReads+" \t("+unmappedBases+" bases)"); 397 } 398 399 @Override startupSubclass()400 protected void startupSubclass() {} 401 402 @Override shutdownSubclass()403 protected void shutdownSubclass() {} 404 405 @Override showStatsSubclass(Timer t, long readsIn, long basesIn)406 protected void showStatsSubclass(Timer t, long readsIn, long basesIn) {} 407 toQuad(Read r1, Read r2)408 private Quad toQuad(Read r1, Read r2){ 409 410 // if(usePairOrder){ 411 // start1=start1_; 412 // start2=start2_; 413 // chr1=chr1_; 414 // chr2=chr2_; 415 // }else{ 416 // start1=Tools.max(start1_,start2_); 417 // start2=Tools.min(start1_,start2_); 418 // chr1=Tools.max(chr1_,chr2_); 419 // chr2=Tools.min(chr1_,chr2_); 420 // } 421 // 422 // int pos1, pos2, chrom1, chrom2; 423 // 424 // if() 425 426 final int s1=r1.strand(), a1=r1.start, b1=r1.stop, c1=r1.chrom; 427 final int s2=(r2==null ? 0 : r2.strand()), a2=(r2==null ? 0 : r2.start), b2=(r2==null ? 0 : r2.stop), c2=(r2==null ? 0 : r2.chrom); 428 final Quad q; 429 if(usePairOrder){ 430 q=new Quad(s1==0 ? a1 : b1, c1, s2==0 ? a2 : b2, c2); 431 }else{ 432 if(s1==0){ 433 q=new Quad(s1==0 ? a1 : b1, c1, s2==0 ? a2 : b2, c2); 434 }else{ 435 q=new Quad(s2==0 ? a2 : b2, c2, s1==0 ? a1 : b1, c1); 436 } 437 } 438 439 //q=new Quad((r1.strand()==0 ? r1.start : r1.stop), r1.chrom, r2==null ? -2 : (r2.strand()==0 ? r2.start : r2.stop), r2==null ? -2 : r2.chrom); 440 return q; 441 } 442 443 private static class Quad implements Comparable<Quad>{ 444 Quad(int start1_, int start2_, int chr1_, int chr2_)445 Quad(int start1_, int start2_, int chr1_, int chr2_){ 446 start1=start1_; 447 start2=start2_; 448 chr1=chr1_; 449 chr2=chr2_; 450 451 // System.err.println(usePairOrder+", "+this); 452 } 453 454 @Override toString()455 public String toString(){ 456 return "("+start1+","+start2+","+chr1+","+chr2+")"; 457 } 458 459 @Override hashCode()460 public int hashCode(){ 461 return start1^(Integer.rotateLeft(start2, 8))^(Integer.rotateLeft(chr1, 16))^(Integer.rotateLeft(chr2, 24)); 462 } 463 464 @Override equals(Object o)465 public boolean equals(Object o){ 466 return equals((Quad)o); 467 } 468 equals(Quad o)469 public boolean equals(Quad o){ 470 return start1==o.start1 && start2==o.start2 && chr1==o.chr1 && chr2==o.chr2; 471 } 472 473 @Override compareTo(Quad b)474 public int compareTo(Quad b) { 475 int x; 476 x=chr1-b.chr1; 477 if(x!=0){return x;} 478 x=start1-b.start1; 479 if(x!=0){return x;} 480 x=chr2-b.chr2; 481 if(x!=0){return x;} 482 x=start2-b.start2; 483 return x; 484 } 485 486 final int start1, start2, chr1, chr2; 487 } 488 489 private boolean keepUnmapped; 490 private boolean keepSingletons; 491 private boolean sorted; 492 private static boolean usePairOrder; 493 494 private long duplicateReads=0; 495 private long duplicateBases=0; 496 private long unmappedReads=0; 497 private long unmappedBases=0; 498 private long retainedReads=0; 499 private long retainedBases=0; 500 501 private int initialSize=(int)Tools.min(2000000, Tools.max(80000, Shared.memAvailable(1)/4000)); 502 503 private HashMap<String, Integer> contigToNumber=new HashMap<String, Integer>(initialSize); 504 private LinkedHashMap<Quad, Read> quadToRead=new LinkedHashMap<Quad, Read>(initialSize); 505 private LinkedHashMap<String, Read> nameToRead=new LinkedHashMap<String, Read>(initialSize); 506 private ArrayList<Read> unmapped=new ArrayList<Read>(initialSize/2); 507 508 private PriorityQueue<Quad> queue; 509 510 } 511