1 package jgi;
2 
3 import java.util.ArrayList;
4 import java.util.HashMap;
5 import java.util.LinkedHashMap;
6 import java.util.PriorityQueue;
7 
8 import shared.Parse;
9 import shared.Shared;
10 import shared.Timer;
11 import shared.Tools;
12 import stream.ConcurrentReadInputStream;
13 import stream.ConcurrentReadOutputStream;
14 import stream.Read;
15 import stream.ReadStreamWriter;
16 import stream.SamLine;
17 import structures.ListNum;
18 import template.BBTool_ST;
19 
20 /**
21  * @author Brian Bushnell
22  * @date Jan 30, 2015
23  *
24  */
25 public class DedupeByMapping extends BBTool_ST{
26 
main(String[] args)27 	public static void main(String[] args){
28 		Timer t=new Timer();
29 		DedupeByMapping bbt=new DedupeByMapping(args);
30 		bbt.process(t);
31 	}
32 
33 	/**
34 	 * @param args
35 	 */
DedupeByMapping(String[] args)36 	public DedupeByMapping(String[] args) {
37 		super(args);
38 		reparse(args);
39 		SamLine.SET_FROM_OK=true;
40 		ReadStreamWriter.USE_ATTACHED_SAMLINE=true;
41 		if(sorted){queue=new PriorityQueue<Quad>(initialSize);}
42 	}
43 
44 	/* (non-Javadoc)
45 	 * @see jgi.BBTool_ST#setDefaults()
46 	 */
47 	@Override
setDefaults()48 	protected void setDefaults() {
49 		keepUnmapped=true;
50 		keepSingletons=true;
51 		sorted=false;
52 		usePairOrder=true;
53 	}
54 
55 	@Override
parseArgument(String arg, String a, String b)56 	public boolean parseArgument(String arg, String a, String b) {
57 		if(a.equals("keepunmapped") | a.equals("ku")){
58 			keepUnmapped=Parse.parseBoolean(b);
59 			return true;
60 		}else if(a.equals("keepsingletons") | a.equals("ks")){
61 			keepSingletons=Parse.parseBoolean(b);
62 			return true;
63 		}else if(a.equals("ignorepairorder") | a.equals("ipo")){
64 			usePairOrder=!Parse.parseBoolean(b);
65 			return true;
66 		}else if(a.equals("sorted")){
67 			sorted=Parse.parseBoolean(b);
68 			return true;
69 		}
70 		return false;
71 	}
72 
73 	@Override
useSharedHeader()74 	protected final boolean useSharedHeader(){return true;}
75 
76 	@Override
processReadPair(Read r1, Read r2)77 	protected boolean processReadPair(Read r1, Read r2) {
78 		assert(r2==null);
79 		return (sorted ? processReadPair_sorted(r1) : processReadPair_unsorted(r1));
80 	}
81 
processReadPair_unsorted(Read r1)82 	boolean processReadPair_unsorted(Read r1) {
83 		SamLine sl=r1.samline;
84 		if(!sl.primary()){return false;}
85 		if(sl.mapped()){
86 			String rname=new String(sl.rname());
87 			Integer x=contigToNumber.get(rname);
88 			if(x==null){
89 				x=contigToNumber.size();
90 				contigToNumber.put(rname, x);
91 			}
92 			r1.chrom=x;
93 			r1.start=sl.start(true, false);
94 			r1.stop=sl.stop(r1.start, true, false);
95 			r1.setStrand(sl.strand());
96 		}else{
97 			r1.chrom=-1;
98 			r1.start=-1;
99 		}
100 
101 		Read old=nameToRead.get(r1.id);
102 		if(old==null){
103 			nameToRead.put(r1.id, r1);
104 		}else{
105 			//assert(old.mate==null);//This triggers...  maybe, after filtering and leaving some unpaired reads
106 			old.mate=r1;
107 			r1.mate=old;
108 			SamLine sl2=old.samline;
109 			if(sl2.pairnum()==1){
110 				nameToRead.put(r1.id, r1);
111 			}
112 		}
113 		return true;
114 	}
115 
116 
processReadPair_sorted(Read r1)117 	boolean processReadPair_sorted(Read r1) {
118 		assert(false) : "TODO";
119 		SamLine sl=r1.samline;
120 		if(!sl.primary()){return false;}
121 		if(sl.mapped()){
122 			String rname=new String(sl.rname());
123 			Integer x=contigToNumber.get(rname);
124 			if(x==null){
125 				x=contigToNumber.size();
126 				contigToNumber.put(rname, x);
127 			}
128 			r1.chrom=x;
129 			r1.start=sl.start(true, false);
130 			r1.stop=sl.stop(r1.start, true, false);
131 			r1.setStrand(sl.strand());
132 		}else{
133 			r1.chrom=-1;
134 			r1.start=-1;
135 		}
136 
137 		Read old=nameToRead.get(r1.id);
138 		if(old==null){
139 			nameToRead.put(r1.id, r1);
140 		}else{
141 			assert(old.mate==null);
142 			old.mate=r1;
143 			r1.mate=old;
144 			SamLine sl2=old.samline;
145 			if(sl2.pairnum()==1){
146 				nameToRead.put(r1.id, r1);
147 			}
148 		}
149 		return true;
150 	}
151 
152 	@Override
processInner(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream ros)153 	protected void processInner(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream ros){
154 		if(sorted){processInner_sorted(cris, ros);}
155 		else{processInner_unsorted(cris, ros);}
156 	}
157 
158 
processInner_unsorted(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream ros)159 	void processInner_unsorted(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream ros){
160 
161 		readsProcessed=0;
162 		basesProcessed=0;
163 
164 		{
165 
166 			ListNum<Read> ln=cris.nextList();
167 			ArrayList<Read> reads=(ln!=null ? ln.list : null);
168 
169 			if(reads!=null && !reads.isEmpty()){
170 				Read r=reads.get(0);
171 				assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired());
172 			}
173 
174 			while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning
175 				if(verbose){outstream.println("Fetched "+reads.size()+" reads.");}
176 
177 				for(int idx=0; idx<reads.size(); idx++){
178 					final Read r1=reads.get(idx);
179 					assert(r1.mate==null);
180 					assert(r1.samline!=null);
181 
182 					final int initialLength1=r1.length();
183 
184 					{
185 						readsProcessed++;
186 						basesProcessed+=initialLength1;
187 					}
188 
189 					processReadPair(r1, null);
190 				}
191 
192 				cris.returnList(ln);
193 				if(verbose){outstream.println("Returned a list.");}
194 				ln=cris.nextList();
195 				reads=(ln!=null ? ln.list : null);
196 			}
197 			if(ln!=null){
198 				cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
199 			}
200 		}
201 
202 		{
203 			contigToNumber=null;
204 			ArrayList<Read> list=new ArrayList<Read>(nameToRead.size());
205 			for(String key : nameToRead.keySet()){
206 				list.add(nameToRead.get(key));
207 			}
208 			nameToRead=null;
209 			for(int i=0; i<list.size(); i++){
210 				Read r1=list.set(i, null);
211 
212 				Read r2=r1.mate;
213 				if(!r1.mapped() && !r1.mateMapped()){
214 					unmappedReads+=r1.pairCount();
215 					unmappedBases+=r1.pairLength();
216 					if(keepUnmapped){
217 						retainedReads+=r1.pairCount();
218 						retainedBases+=r1.pairLength();
219 						unmapped.add(r1);
220 					}
221 				}else if(keepSingletons && r2!=null && (r1.mapped()!=r1.mateMapped())){
222 					retainedReads+=r1.pairCount();
223 					retainedBases+=r1.pairLength();
224 					unmapped.add(r1);
225 				}else{
226 //					System.err.println(r1.strandChar()+", "+r1.start+", "+r1.stop+", "+r2.strandChar()+", "+r2.start+", "+r2.stop);
227 					Quad q=toQuad(r1, r2);
228 					Read old1=quadToRead.get(q);
229 					if(old1==null){quadToRead.put(q, r1);}
230 					else{
231 						Read old2=old1.mate;
232 						float a=(r1.expectedErrors(true, 0)+(r2==null ? 0 : r2.expectedErrors(true, 0)))/r1.pairLength();
233 						float b=old1.expectedErrors(true, 0)+(old2==null ? 0 : old2.expectedErrors(true, 0))/(old1.length()+old1.mateLength());
234 						if(a<b){
235 							quadToRead.put(q, r1);
236 							duplicateReads+=1+old1.mateCount();
237 							duplicateBases+=old1.length()+old1.mateLength();
238 						}else{
239 							duplicateReads+=r1.pairCount();
240 							duplicateBases+=r1.pairLength();
241 						}
242 					}
243 				}
244 			}
245 			list=null;
246 			nameToRead=null;
247 		}
248 
249 		{
250 			ArrayList<Read> list=new ArrayList<Read>(Shared.bufferLen());
251 			int num=0;
252 			for(Quad q : quadToRead.keySet()){
253 				Read r=quadToRead.get(q);
254 				if(keepUnmapped || r.mapped() || (r.mate!=null && r.mate.mapped())){
255 					retainedReads+=1+r.mateCount();
256 					retainedBases+=r.length()+r.mateLength();
257 					list.add(r);
258 					if(list.size()>=Shared.bufferLen()){
259 						if(ros!=null){
260 							ros.add(list, num);
261 							num++;
262 						}
263 						list=new ArrayList<Read>(Shared.bufferLen());
264 					}
265 				}
266 			}
267 			if(list.size()>0){
268 				if(ros!=null){
269 					ros.add(list, num);
270 					num++;
271 				}
272 				list=null;
273 			}
274 			if(ros!=null && unmapped.size()>0){
275 				ros.add(unmapped, num);
276 				num++;
277 			}
278 		}
279 		outstream.println("Duplicate reads:     "+duplicateReads+" \t("+duplicateBases+" bases)");
280 		outstream.println("Unconsidered reads:  "+unmappedReads+" \t("+unmappedBases+" bases)");
281 		outstream.println("Retained reads:      "+retainedReads+" \t("+retainedBases+" bases)");
282 	}
283 
284 
285 
processInner_sorted(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream ros)286 	void processInner_sorted(final ConcurrentReadInputStream cris, final ConcurrentReadOutputStream ros){
287 		assert(false) : "TODO";
288 		readsProcessed=0;
289 		basesProcessed=0;
290 
291 		{
292 
293 			ListNum<Read> ln=cris.nextList();
294 			ArrayList<Read> reads=(ln!=null ? ln.list : null);
295 
296 			if(reads!=null && !reads.isEmpty()){
297 				Read r=reads.get(0);
298 				assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired());
299 			}
300 
301 			while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning
302 				if(verbose){outstream.println("Fetched "+reads.size()+" reads.");}
303 
304 				for(int idx=0; idx<reads.size(); idx++){
305 					final Read r1=reads.get(idx);
306 					assert(r1.mate==null);
307 					assert(r1.samline!=null);
308 
309 					final int initialLength1=r1.length();
310 
311 					{
312 						readsProcessed++;
313 						basesProcessed+=initialLength1;
314 					}
315 
316 					processReadPair(r1, null);
317 				}
318 
319 				cris.returnList(ln);
320 				if(verbose){outstream.println("Returned a list.");}
321 				ln=cris.nextList();
322 				reads=(ln!=null ? ln.list : null);
323 			}
324 			if(ln!=null){
325 				cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());
326 			}
327 		}
328 
329 		{
330 			contigToNumber=null;
331 			ArrayList<Read> list=new ArrayList<Read>(nameToRead.size());
332 			for(String key : nameToRead.keySet()){
333 				list.add(nameToRead.get(key));
334 			}
335 			nameToRead=null;
336 			for(int i=0; i<list.size(); i++){
337 				Read r1=list.set(i, null);
338 
339 				Read r2=r1.mate;
340 				if(!r1.mapped() && !r1.mateMapped()){
341 					unmappedReads+=r1.pairCount();
342 					unmappedBases+=r1.pairLength();
343 					if(keepUnmapped){unmapped.add(r1);}
344 				}else{
345 					Quad q=toQuad(r1, r2);
346 					Read old1=quadToRead.get(q);
347 					if(old1==null){quadToRead.put(q, r1);}
348 					else{
349 						Read old2=old1.mate;
350 						float a=(r1.expectedErrors(true, 0)+(r2==null ? 0 : r2.expectedErrors(true, 0)))/r1.pairLength();
351 						float b=old1.expectedErrors(true, 0)+(old2==null ? 0 : old2.expectedErrors(true, 0))/(old1.length()+old1.mateLength());
352 						if(a<b){
353 							quadToRead.put(q, r1);
354 							duplicateReads+=1+old1.mateCount();
355 							duplicateBases+=old1.length()+old1.mateLength();
356 						}else{
357 							duplicateReads+=r1.pairCount();
358 							duplicateBases+=r1.pairLength();
359 						}
360 					}
361 				}
362 			}
363 			list=null;
364 			nameToRead=null;
365 		}
366 
367 		{
368 			ArrayList<Read> list=new ArrayList<Read>(Shared.bufferLen());
369 			int num=0;
370 			for(Quad q : quadToRead.keySet()){
371 				Read r=quadToRead.get(q);
372 				if(keepUnmapped || r.mapped() || (r.mate!=null && r.mate.mapped())){
373 					list.add(r);
374 					if(list.size()>=Shared.bufferLen()){
375 						if(ros!=null){
376 							ros.add(list, num);
377 							num++;
378 						}
379 						list=new ArrayList<Read>(Shared.bufferLen());
380 					}
381 				}
382 			}
383 			if(list.size()>0){
384 				if(ros!=null){
385 					ros.add(list, num);
386 					num++;
387 				}
388 				list=null;
389 			}
390 			if(ros!=null && unmapped.size()>0){
391 				ros.add(unmapped, num);
392 				num++;
393 			}
394 		}
395 		outstream.println("Duplicate reads:    "+duplicateReads+" \t("+duplicateBases+" bases)");
396 		outstream.println("Unmapped reads:     "+unmappedReads+" \t("+unmappedBases+" bases)");
397 	}
398 
399 	@Override
startupSubclass()400 	protected void startupSubclass() {}
401 
402 	@Override
shutdownSubclass()403 	protected void shutdownSubclass() {}
404 
405 	@Override
showStatsSubclass(Timer t, long readsIn, long basesIn)406 	protected void showStatsSubclass(Timer t, long readsIn, long basesIn) {}
407 
toQuad(Read r1, Read r2)408 	private Quad toQuad(Read r1, Read r2){
409 
410 //		if(usePairOrder){
411 //			start1=start1_;
412 //			start2=start2_;
413 //			chr1=chr1_;
414 //			chr2=chr2_;
415 //		}else{
416 //			start1=Tools.max(start1_,start2_);
417 //			start2=Tools.min(start1_,start2_);
418 //			chr1=Tools.max(chr1_,chr2_);
419 //			chr2=Tools.min(chr1_,chr2_);
420 //		}
421 //
422 //		int pos1, pos2, chrom1, chrom2;
423 //
424 //		if()
425 
426 		final int s1=r1.strand(), a1=r1.start, b1=r1.stop, c1=r1.chrom;
427 		final int s2=(r2==null ? 0 : r2.strand()), a2=(r2==null ? 0 : r2.start), b2=(r2==null ? 0 : r2.stop), c2=(r2==null ? 0 : r2.chrom);
428 		final Quad q;
429 		if(usePairOrder){
430 			q=new Quad(s1==0 ? a1 : b1, c1, s2==0 ? a2 : b2, c2);
431 		}else{
432 			if(s1==0){
433 				q=new Quad(s1==0 ? a1 : b1, c1, s2==0 ? a2 : b2, c2);
434 			}else{
435 				q=new Quad(s2==0 ? a2 : b2, c2, s1==0 ? a1 : b1, c1);
436 			}
437 		}
438 
439 		//q=new Quad((r1.strand()==0 ? r1.start : r1.stop), r1.chrom, r2==null ? -2 : (r2.strand()==0 ? r2.start : r2.stop), r2==null ? -2 : r2.chrom);
440 		return q;
441 	}
442 
443 	private static class Quad implements Comparable<Quad>{
444 
Quad(int start1_, int start2_, int chr1_, int chr2_)445 		Quad(int start1_, int start2_, int chr1_, int chr2_){
446 			start1=start1_;
447 			start2=start2_;
448 			chr1=chr1_;
449 			chr2=chr2_;
450 
451 //			System.err.println(usePairOrder+", "+this);
452 		}
453 
454 		@Override
toString()455 		public String toString(){
456 			return "("+start1+","+start2+","+chr1+","+chr2+")";
457 		}
458 
459 		@Override
hashCode()460 		public int hashCode(){
461 			return start1^(Integer.rotateLeft(start2, 8))^(Integer.rotateLeft(chr1, 16))^(Integer.rotateLeft(chr2, 24));
462 		}
463 
464 		@Override
equals(Object o)465 		public boolean equals(Object o){
466 			return equals((Quad)o);
467 		}
468 
equals(Quad o)469 		public boolean equals(Quad o){
470 			return start1==o.start1 && start2==o.start2 && chr1==o.chr1 && chr2==o.chr2;
471 		}
472 
473 		@Override
compareTo(Quad b)474 		public int compareTo(Quad b) {
475 			int x;
476 			x=chr1-b.chr1;
477 			if(x!=0){return x;}
478 			x=start1-b.start1;
479 			if(x!=0){return x;}
480 			x=chr2-b.chr2;
481 			if(x!=0){return x;}
482 			x=start2-b.start2;
483 			return x;
484 		}
485 
486 		final int start1, start2, chr1, chr2;
487 	}
488 
489 	private boolean keepUnmapped;
490 	private boolean keepSingletons;
491 	private boolean sorted;
492 	private static boolean usePairOrder;
493 
494 	private long duplicateReads=0;
495 	private long duplicateBases=0;
496 	private long unmappedReads=0;
497 	private long unmappedBases=0;
498 	private long retainedReads=0;
499 	private long retainedBases=0;
500 
501 	private int initialSize=(int)Tools.min(2000000, Tools.max(80000, Shared.memAvailable(1)/4000));
502 
503 	private HashMap<String, Integer> contigToNumber=new HashMap<String, Integer>(initialSize);
504 	private LinkedHashMap<Quad, Read> quadToRead=new LinkedHashMap<Quad, Read>(initialSize);
505 	private LinkedHashMap<String, Read> nameToRead=new LinkedHashMap<String, Read>(initialSize);
506 	private ArrayList<Read> unmapped=new ArrayList<Read>(initialSize/2);
507 
508 	private PriorityQueue<Quad> queue;
509 
510 }
511