1 package jgi;
2 
3 import java.io.File;
4 import java.io.PrintStream;
5 import java.io.Serializable;
6 import java.util.ArrayDeque;
7 import java.util.ArrayList;
8 import java.util.Arrays;
9 import java.util.Comparator;
10 import java.util.HashMap;
11 import java.util.HashSet;
12 import java.util.LinkedHashMap;
13 import java.util.Locale;
14 import java.util.PriorityQueue;
15 import java.util.Random;
16 import java.util.concurrent.atomic.AtomicIntegerArray;
17 
18 import align2.BandedAligner;
19 import dna.AminoAcid;
20 import fileIO.ByteFile1;
21 import fileIO.ByteFile2;
22 import fileIO.ByteStreamWriter;
23 import fileIO.FileFormat;
24 import fileIO.ReadWrite;
25 import fileIO.TextStreamWriter;
26 import shared.Parse;
27 import shared.Parser;
28 import shared.PreParser;
29 import shared.ReadStats;
30 import shared.Shared;
31 import shared.Timer;
32 import shared.Tools;
33 import shared.TrimRead;
34 import sort.ReadComparator;
35 import sort.ReadComparatorID;
36 import sort.ReadComparatorName;
37 import sort.ReadLengthComparator;
38 import sort.ReadQualityComparator;
39 import stream.ConcurrentCollectionReadInputStream;
40 import stream.ConcurrentGenericReadInputStream;
41 import stream.ConcurrentReadInputStream;
42 import stream.ConcurrentReadOutputStream;
43 import stream.FASTQ;
44 import stream.FastaReadInputStream;
45 import stream.FastqReadInputStream;
46 import stream.Read;
47 import structures.ListNum;
48 import structures.LongM;
49 
50 /**
51  * @author Brian Bushnell
52  * @date Nov 30, 2017
53  *
54  */
55 public final class DedupeProtein {
56 
main(String[] args)57 	public static void main(String[] args){
58 		int rbl0=Shared.bufferLen();;
59 		Shared.setBufferLen(16);
60 
61 		DedupeProtein dd=new DedupeProtein(args);
62 		dd.process();
63 
64 		Shared.setBufferLen(rbl0);
65 
66 		//Close the print stream if it was redirected
67 		Shared.closeStream(outstream);
68 	}
69 
DedupeProtein(String[] args)70 	public DedupeProtein(String[] args){
71 
72 		{//Preparse block for help, config files, and outstream
73 			PreParser pp=new PreParser(args, getClass(), true);
74 			args=pp.args;
75 			outstream=pp.outstream;
76 		}
77 
78 		ReadWrite.ZIPLEVEL=2;
79 		ReadWrite.USE_UNPIGZ=ReadWrite.USE_PIGZ=true;
80 		ReadWrite.MAX_ZIP_THREADS=Shared.threads();
81 		Read.TO_UPPER_CASE=true;
82 
83 		boolean setMcsfs=false;
84 		int bandwidth_=-1;
85 		int k_=31;
86 		int subset_=0, subsetCount_=1;
87 		boolean ascending=false;
88 		Parser parser=new Parser();
89 
90 		for(int i=0; i<args.length; i++){
91 
92 			final String arg=args[i];
93 			String[] split=arg.split("=");
94 			String a=split[0].toLowerCase();
95 			String b=split.length>1 ? split[1] : null;
96 
97 			if(Parser.parseZip(arg, a, b)){
98 				//do nothing
99 			}else if(Parser.parseCommonStatic(arg, a, b)){
100 				//do nothing
101 			}else if(Parser.parseQuality(arg, a, b)){
102 				//do nothing
103 			}else if(Parser.parseFasta(arg, a, b)){
104 				//do nothing
105 			}else if(parser.parseQTrim(arg, a, b)){
106 				//do nothing
107 			}else if(parser.parseInterleaved(arg, a, b)){
108 				//do nothing
109 			}else if(a.equals("in") || a.equals("in1")){
110 				assert(b!=null) : "Bad parameter: "+arg;
111 				if(b.indexOf(',')>=0 && !new File(b).exists()){
112 					in1=b.split(",");
113 				}else{
114 					in1=new String[] {b};
115 				}
116 			}else if(a.equals("in2")){
117 				assert(b!=null) : "Bad parameter: "+arg;
118 				if(b.indexOf(',')>=0 && !new File(b).exists()){
119 					in2=b.split(",");
120 				}else{
121 					in2=new String[] {b};
122 				}
123 			}else if(a.equals("out") || a.equals("out")){
124 				out=b;
125 			}else if(a.equals("out2")){
126 				throw new RuntimeException("Dedupe does not allow 'out2'; for paired reads, output is interleaved.");
127 			}else if(a.equals("clusterfilepattern") || a.equals("pattern")){
128 				clusterFilePattern=b;
129 				assert(clusterFilePattern==null || clusterFilePattern.contains("%")) : "pattern must contain the % symbol.";
130 			}else if(a.equals("outbest")){
131 				outbest=b;
132 			}else if(a.equals("outd") || a.equals("outduplicate")){
133 				outdupe=b;
134 			}else if(a.equals("csf") || a.equals("clusterstatsfile")){
135 				outcsf=b;
136 			}else if(a.equals("dot") || a.equals("graph") || a.equals("outdot") || a.equals("outgraph")){
137 				outgraph=b;
138 			}else if(a.equals("mcsfs") || a.equals("minclustersizeforstats")){
139 				minClusterSizeForStats=Integer.parseInt(b);
140 			}else if(a.equals("mcs") || a.equals("minclustersize")){
141 				minClusterSize=Integer.parseInt(b);
142 				if(!setMcsfs){
143 					minClusterSizeForStats=minClusterSize;
144 				}
145 			}else if(a.equals("append") || a.equals("app")){
146 				append=ReadStats.append=Parse.parseBoolean(b);
147 			}else if(a.equals("overwrite") || a.equals("ow")){
148 				overwrite=Parse.parseBoolean(b);
149 			}else if(a.equals("sort")){
150 				if(b==null){sort=true;}
151 				else if(b.equalsIgnoreCase("a") || b.equalsIgnoreCase("ascending")){
152 					sort=true;
153 					ascending=true;
154 				}else if(b.equalsIgnoreCase("d") || b.equalsIgnoreCase("descending")){
155 					sort=true;
156 					ascending=false;
157 				}else if(b.equalsIgnoreCase("length")){
158 					sort=true;
159 					comparator=ReadLengthComparator.comparator;
160 				}else if(b.equalsIgnoreCase("quality")){
161 					sort=true;
162 					comparator=ReadQualityComparator.comparator;
163 				}else if(b.equalsIgnoreCase("name")){
164 					sort=true;
165 					comparator=ReadComparatorName.comparator;
166 				}else if(b.equalsIgnoreCase("id")){
167 					sort=true;
168 					comparator=ReadComparatorID.comparator;
169 				}else{
170 					sort=Parse.parseBoolean(b);
171 				}
172 			}else if(a.equals("ascending")){
173 				ascending=Parse.parseBoolean(b);
174 			}else if(a.equals("ordered")){
175 				boolean x=Parse.parseBoolean(b);
176 				if(x){
177 					ascending=true;
178 					sort=true;
179 					comparator=ReadComparatorID.comparator;
180 				}else{
181 					//do nothing
182 				}
183 			}else if(a.equals("ac") || a.equals("absorbcontainment") || a.equals("absorbcontainments") || a.equals("tc") || a.equals("testcontainment") || a.equals("containment")){
184 				absorbContainment=Parse.parseBoolean(b);
185 			}else if(a.equals("am") || a.equals("absorbmatch") || a.equals("absorbmatches") || a.equals("tm") || a.equals("testmatch")){
186 				absorbMatch=Parse.parseBoolean(b);
187 			}else if(a.equals("ao") || a.equals("absorboverlap") || a.equals("absorboverlaps") || a.equals("to") || a.equals("testoverlap")){
188 				absorbOverlap=Parse.parseBoolean(b);
189 			}else if(a.equals("fo") || a.equals("findoverlap") || a.equals("findoverlaps")){
190 				findOverlaps=Parse.parseBoolean(b);
191 			}else if(a.equals("c") || a.equals("cluster") || a.equals("clusters")){
192 				makeClusters=Parse.parseBoolean(b);
193 			}else if(a.equals("fmj") || a.equals("fixmultijoin") || a.equals("fixmultijoins")){
194 				fixMultiJoins=Parse.parseBoolean(b);
195 			}else if(a.equals("foc") || a.equals("fixoffsetcontradiction") || a.equals("fixoffsetcontradictions")){
196 				fixOffsetContradictions=Parse.parseBoolean(b);
197 			}else if(a.equals("pto") || a.equals("preventtransitiveoverlap") || a.equals("preventtransitiveoverlaps")){
198 				preventTransitiveOverlaps=Parse.parseBoolean(b);
199 			}else if(a.equals("pbr") || a.equals("pickbestrepresentative")){
200 				pickBestRepresentativePerCluster=Parse.parseBoolean(b);
201 			}else if(a.equals("mst") || a.equals("maxspanningtree")){
202 				maxSpanningTree=Parse.parseBoolean(b);
203 			}else if(a.equals("pc") || a.equals("processcluster") || a.equals("processclusters")){
204 				processClusters=Parse.parseBoolean(b);
205 			}else if(a.equals("rnc") || a.equals("renamecluster") || a.equals("renameclusters")){
206 				renameClusters=Parse.parseBoolean(b);
207 				if(renameClusters){storeName=false;}
208 			}else if(a.equals("rc") || a.equals("removecycles") || a.equals("removecycle")){
209 				removeCycles=Parse.parseBoolean(b);
210 			}else if(a.equals("uo") || a.equals("uniqueonly")){
211 				UNIQUE_ONLY=Parse.parseBoolean(b);
212 			}else if(a.equals("rmn") || a.equals("requirematchingnames")){
213 				REQUIRE_MATCHING_NAMES=Parse.parseBoolean(b);
214 			}else if(a.equals("ngn") || a.equals("numbergraphnodes")){
215 				NUMBER_GRAPH_NODES=Parse.parseBoolean(b);
216 			}else if(a.equals("addpairnum")){
217 				ADD_PAIRNUM_TO_NAME=Parse.parseBoolean(b);
218 			}else if(a.equals("hashns")){
219 				HASH_NS=Parse.parseBoolean(b);
220 			}else if(a.equals("pn") || a.equals("prefixname")){
221 //				PREFIX_NAME=Parse.parseBoolean(b);
222 			}else if(a.equals("k")){
223 				k_=Integer.parseInt(b);
224 				assert(k_>0 && k_<11) : "k must be between 1 and 10; default is 10, and lower values are slower.";
225 			}else if(a.equals("minscaf") || a.equals("ms")){
226 				MINSCAF=FastaReadInputStream.MIN_READ_LEN=Integer.parseInt(b);
227 			}else if(a.equals("mlp") || a.equals("minlengthpercent")){
228 				minLengthPercent=Float.parseFloat(b);
229 			}else if(a.equals("mop") || a.equals("minoverlappercent")){
230 				minOverlapPercentCluster=minOverlapPercentMerge=Float.parseFloat(b);
231 			}else if(a.equals("mopc") || a.equals("minoverlappercentcluster")){
232 				minOverlapPercentCluster=Float.parseFloat(b);
233 			}else if(a.equals("mopm") || a.equals("minoverlappercentmerge")){
234 				minOverlapPercentMerge=Float.parseFloat(b);
235 			}else if(a.equals("mo") || a.equals("minoverlap")){
236 				minOverlapCluster=minOverlapMerge=Integer.parseInt(b);
237 			}else if(a.equals("moc") || a.equals("minoverlapcluster")){
238 				minOverlapCluster=Integer.parseInt(b);
239 			}else if(a.equals("mom") || a.equals("minoverlapmerge")){
240 				minOverlapMerge=Integer.parseInt(b);
241 			}else if(a.equals("rt") || a.equals("rigoroustransitive")){
242 				rigorousTransitive=Parse.parseBoolean(b);
243 			}else if(a.equals("e") || a.equals("maxedits") || a.equals("edits") || a.equals("edist")){
244 				maxEdits=Integer.parseInt(b);
245 			}else if(a.equals("s") || a.equals("maxsubs") || a.equals("maxsubstitutions") || a.equals("hdist")){
246 				maxSubs=Integer.parseInt(b);
247 			}else if(a.equals("bw") || a.equals("bandwidth")){
248 				bandwidth_=Integer.parseInt(b);
249 			}else if(a.equals("mid") || a.equals("minidentity")){
250 				minIdentity=Float.parseFloat(b);
251 				minIdentityMult=(minIdentity==100f ? 0 : (100f-minIdentity)/100f);
252 			}else if(a.equals("threads") || a.equals("t")){
253 				THREADS=(b==null || b.equalsIgnoreCase("auto") ? Shared.threads() : Integer.parseInt(b));
254 			}else if(a.equals("showspeed") || a.equals("ss")){
255 				showSpeed=Parse.parseBoolean(b);
256 			}else if(a.equals("verbose")){
257 				verbose=Parse.parseBoolean(b);
258 //				BandedAligner.verbose=verbose;
259 			}else if(a.equals("contigbreak") || (arg.contains("=") && (a.equals("n") || a.equals("-n")))){
260 				maxNs=Integer.parseInt(b);
261 			}else if(a.equals("reads") || a.startsWith("maxreads")){
262 				maxReads=Parse.parseKMG(b);
263 			}else if(a.equals("sn") || a.equals("storename") || a.equals("storenames") || a.equals("keepnames")){
264 				storeName=Parse.parseBoolean(b);
265 			}else if(a.equals("ssx") || a.equals("storesuffix") || a.equals("storesuffixes")){
266 				storeSuffix=Parse.parseBoolean(b);
267 			}else if(a.equals("numaffixmaps") || a.equals("nam")){
268 				numAffixMaps=Integer.parseInt(b);
269 			}else if(a.equals("amino") || a.equals("protein")){
270 				//do nothing
271 			}else if(a.equals("mac") || a.equals("maxaffixcopies")){
272 				maxAffixCopies=Integer.parseInt(b);
273 			}else if(a.equals("me") || a.equals("maxedges")){
274 				maxEdges=Integer.parseInt(b);
275 				maxEdges2=maxEdges*2;
276 				if(maxEdges2<1){maxEdges2=Integer.MAX_VALUE-1;}
277 			}else if(a.equals("ignoreaffix1") || a.equals("ia1")){
278 				ignoreAffix1=Parse.parseBoolean(b);
279 			}else if(a.equals("parsedepth") || a.equals("pd")){
280 				parseDepth=Parse.parseBoolean(b);
281 			}else if(a.equals("printlengthinedges") || a.equals("ple")){
282 				printLengthInEdges=Parse.parseBoolean(b);
283 			}else if(a.equals("depthmult") || a.equals("depthratio") || a.equals("dr")){
284 				depthRatio=Float.parseFloat(b);
285 				if(depthRatio<=0){
286 					parseDepth=false;
287 				}else{
288 					parseDepth=true;
289 					assert(depthRatio>0);
290 					if(depthRatio<1){depthRatio=1/depthRatio;}
291 				}
292 			}else if(a.equals("storequality") || a.equals("sq")){
293 				storeQuality=Parse.parseBoolean(b);
294 			}else if(a.equals("exact") || a.equals("ex")){
295 				exact=Parse.parseBoolean(b);
296 			}else if(a.equals("uniquenames") || a.equals("un")){
297 				uniqueNames=Parse.parseBoolean(b);
298 			}else if(a.equals("ftl") || a.equals("forcetrimleft")){
299 				forceTrimLeft=Integer.parseInt(b);
300 			}else if(a.equals("ftr") || a.equals("forcetrimright")){
301 				forceTrimRight=Integer.parseInt(b);
302 			}else if(a.equals("subset") || a.equals("sst")){
303 				subset_=Integer.parseInt(b);
304 			}else if(a.equals("subsets") || a.equals("subsetcount") || a.equals("sstc")){
305 				subsetCount_=Integer.parseInt(b);
306 			}else if(i==0 && in1==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){
307 				String c=args[i];
308 				if(c.indexOf(',')>=0 && !new File(c).exists()){
309 					in1=c.split(",");
310 				}else{
311 					in1=new String[] {c};
312 				}
313 			}else if(i==1 && out==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){
314 				out=args[i];
315 			}else{
316 				throw new RuntimeException("Unknown parameter "+args[i]);
317 			}
318 		}
319 
320 		comparator.setAscending(ascending);
321 
322 		{//Process parser fields
323 			Parser.processQuality();
324 			qTrimLeft=parser.qtrimLeft;
325 			qTrimRight=parser.qtrimRight;
326 			trimQ=parser.trimq;
327 			trimE=parser.trimE();
328 		}
329 
330 		if(verbose){
331 			ReadWrite.verbose=ConcurrentGenericReadInputStream.verbose=ConcurrentReadOutputStream.verbose=ByteFile1.verbose=ByteFile2.verbose=FastqReadInputStream.verbose=true;
332 		}
333 
334 		k=k_;
335 		k2=k-1;
336 		subset=subset_;
337 		subsetCount=subsetCount_;
338 		subsetMode=subsetCount>1;
339 		assert(subset>=0 && subset<subsetCount) : "subset="+subset+", subsetCount="+subsetCount;
340 
341 		BandedAligner.penalizeOffCenter=true;
342 
343 		if(maxSpanningTree){removeCycles=fixMultiJoins=false;}
344 		if(absorbOverlap){processClusters=true;}
345 		if(processClusters || renameClusters || maxSpanningTree){makeClusters=true;}
346 		if(makeClusters){findOverlaps=true;}
347 		if(renameClusters){uniqueNames=/*storeName=*/false;}
348 
349 		if(bandwidth_>-1){
350 			bandwidth=Tools.min(bandwidth_, 2*maxEdits+1);
351 			customBandwidth=(bandwidth<2*maxEdits+1);
352 		}else{
353 			bandwidth=2*maxEdits+1;
354 			customBandwidth=false;
355 		}
356 		maxSubs=Tools.max(maxSubs, maxEdits);
357 		if(maxSubs>0 || minIdentity<100 || findOverlaps){storeSuffix=true;}
358 
359 		assert(FastaReadInputStream.settingsOK());
360 
361 		if(in1==null){throw new RuntimeException("Error - at least one input file is required.");}
362 
363 		for(int i=0; i<in1.length; i++){
364 			if(in1[i].equalsIgnoreCase("stdin") && !new File(in1[i]).exists()){in1[i]="stdin.fa";}
365 		}
366 
367 //		assert(false) : Arrays.toString(in);
368 
369 //		if(!setOut && clusterFilePattern==null){out="stdout.fa";}
370 //		else
371 //			if("stdout".equalsIgnoreCase(out) || "standarddout".equalsIgnoreCase(out)){
372 //			out="stdout.fa";
373 //			outstream=System.err;
374 //		}
375 		if(!Tools.canWrite(out, overwrite)){throw new RuntimeException("Output file "+out+" already exists, and overwrite="+overwrite);}
376 
377 		for(int i=0; i<in1.length; i++){
378 			assert(!in1[i].equalsIgnoreCase(out));
379 		}
380 //		assert(false) : "\nabsorbContainment="+absorbContainment+", findOverlaps="+findOverlaps+", absorbOverlap="+absorbOverlap+"\n"+
381 //			"processClusters="+processClusters+", renameClusters="+renameClusters+", makeClusters="+makeClusters+", uniqueNames="+uniqueNames+"\n"+
382 //			"storeName="+storeName+", DISPLAY_PROGRESS="+DISPLAY_PROGRESS+", removeCycles="+removeCycles;
383 		if(absorbContainment || findOverlaps){
384 //			assert(false);
385 			affixMaps=new HashMap[numAffixMaps];
386 			for(int i=0; i<numAffixMaps; i++){
387 				affixMaps[i]=new HashMap<LongM, ArrayList<Unit>>(4000000);
388 			}
389 			if(affixMaps.length>0){affixMap1=affixMaps[0];}
390 			if(affixMaps.length>1){affixMap2=affixMaps[1];}
391 		}
392 //		assert(false) : absorbContainment+", "+(affixMap==null);
393 
394 		if(outdupe==null){
395 			dupeWriter=null;
396 		}else{
397 			FileFormat ff=FileFormat.testOutput(outdupe, FileFormat.FASTA, null, true, overwrite, append, false);
398 			dupeWriter=new ByteStreamWriter(ff);
399 		}
400 	}
401 
process()402 	public void process(){
403 
404 		Timer t=new Timer();
405 
406 		boolean dq0=FASTQ.DETECT_QUALITY;
407 		boolean ti0=FASTQ.TEST_INTERLEAVED;
408 
409 		process2();
410 
411 		FASTQ.DETECT_QUALITY=dq0;
412 		FASTQ.TEST_INTERLEAVED=ti0;
413 
414 		t.stop();
415 
416 		if(showSpeed){
417 			outstream.println(Tools.timeReadsBasesProcessed(t, readsProcessed, basesProcessed, 8));
418 		}
419 
420 		if(errorState){
421 			throw new RuntimeException("Dedupe terminated in an error state; the output may be corrupt.");
422 		}
423 	}
424 
process2()425 	public void process2(){
426 		if(dupeWriter!=null){dupeWriter.start();}
427 //		assert(false) : out;
428 		Timer t=new Timer();
429 
430 		if(DISPLAY_PROGRESS){
431 			outstream.println("Initial:");
432 			Shared.printMemory();
433 			outstream.println();
434 		}
435 
436 		processMatches(t);
437 
438 		forceTrimLeft=forceTrimRight=-1;
439 		qTrimLeft=qTrimRight=false;
440 
441 
442 		if(absorbContainment){
443 			processContainments(t);
444 		}
445 
446 		if(dupeWriter!=null){dupeWriter.poisonAndWait();}
447 
448 		if(findOverlaps){
449 			findOverlaps(t);
450 
451 			killAffixMaps();
452 
453 			if(processClusters || renameClusters || maxSpanningTree){codeMap=null;}
454 
455 			if(maxSpanningTree){
456 				processMst(t);
457 			}
458 
459 			if(processClusters){
460 				processClusters(t, absorbOverlap);
461 			}
462 //			if(renameClusters){
463 //				renameClusters(t);
464 //			}
465 //			assert(false) : (codeMap==null)+", "+(affixMap1==null)+", "+processedClusters;
466 		}
467 
468 		outstream.println("Input:                  \t"+readsProcessed+" reads \t\t"+basesProcessed+" bases.");
469 
470 		if(absorbMatch){
471 			outstream.println("Duplicates:             \t"+matches+" reads ("+String.format(Locale.ROOT, "%.2f",matches*100.0/readsProcessed)+"%) \t"+
472 					baseMatches+" bases ("+String.format(Locale.ROOT, "%.2f",baseMatches*100.0/basesProcessed)+"%)     \t"+collisions+" collisions.");
473 		}
474 		if(absorbContainment){
475 			outstream.println("Containments:           \t"+containments+" reads ("+String.format(Locale.ROOT, "%.2f",containments*100.0/readsProcessed)+"%) \t"+
476 					baseContainments+" bases ("+String.format(Locale.ROOT, "%.2f",baseContainments*100.0/basesProcessed)+"%)    \t"+containmentCollisions+" collisions.");
477 		}
478 		if(findOverlaps){
479 			outstream.println("Overlaps:               \t"+overlaps+" reads ("+String.format(Locale.ROOT, "%.2f",overlaps*100.0/readsProcessed)+"%) \t"+
480 					baseOverlaps+" bases ("+String.format(Locale.ROOT, "%.2f",baseOverlaps*100.0/basesProcessed)+"%)    \t"+overlapCollisions+" collisions.");
481 		}
482 //		outstream.println("Result:                 \t"+(addedToMain-containments)+" reads \t\t"+(basesProcessed-baseMatches-baseContainments)+" bases.");
483 
484 		long outReads=(addedToMain-containments);
485 		if(UNIQUE_ONLY){outReads=readsProcessed-matches-containments;}
486 		long outBases=(basesProcessed-baseMatches-baseContainments);
487 		outstream.println("Result:                 \t"+outReads+" reads ("+String.format(Locale.ROOT, "%.2f",outReads*100.0/readsProcessed)+"%) \t"+
488 				outBases+" bases ("+String.format(Locale.ROOT, "%.2f",outBases*100.0/basesProcessed)+"%)");
489 
490 		outstream.println("");
491 
492 		if(out!=null || clusterFilePattern!=null || outbest!=null || outgraph!=null || outcsf!=null){
493 			writeOutput(outcsf, t);
494 		}
495 
496 	}
497 
killAffixMaps()498 	private void killAffixMaps(){
499 		if(affixMaps==null){return;}
500 		for(int i=0; i<numAffixMaps; i++){
501 			if(affixMaps[i]!=null){affixMaps[i].clear();}
502 			affixMaps[i]=null;
503 		}
504 		affixMap1=null;
505 		affixMap2=null;
506 		affixMaps=null;
507 	}
508 
makeCrisArray(ArrayList<Read> list)509 	private ConcurrentReadInputStream[] makeCrisArray(ArrayList<Read> list){
510 		final ConcurrentReadInputStream[] array;
511 
512 		if(list!=null){
513 			array=new ConcurrentReadInputStream[] {new ConcurrentCollectionReadInputStream(list, null, -1)};
514 			array[0].start(); //This deadlocks if ConcurrentReadInputStream extends Thread rather than spawning a new thread.
515 		}else{
516 			array=new ConcurrentReadInputStream[in1.length];
517 			multipleInputFiles=array.length>1;
518 			for(int i=0; i<in1.length; i++){
519 				if(verbose){System.err.println("Creating cris for "+in1[i]);}
520 
521 				final ConcurrentReadInputStream cris;
522 				{
523 					FileFormat ff1=FileFormat.testInput(in1[i], FileFormat.FASTA, null, !multipleInputFiles || ReadWrite.USE_UNPIGZ, true);
524 					FileFormat ff2=(in2==null || in2.length<=i ? null : FileFormat.testInput(in2[i], FileFormat.FASTA, null, !multipleInputFiles || ReadWrite.USE_UNPIGZ, true));
525 					cris=ConcurrentReadInputStream.getReadInputStream(maxReads, ff1.samOrBam(), ff1, ff2);
526 					cris.start();
527 					if(cris.paired()){
528 						THREADS=1;//Temp fix for losing reads when multithreaded and paired
529 						if(absorbContainment){
530 							System.err.println("Set absorbContainment to false because it is not currently supported for paired reads.");
531 							absorbContainment=false;
532 						}
533 					}
534 				}
535 				array[i]=cris;
536 			}
537 		}
538 		return array;
539 	}
540 
processMatches(Timer t)541 	private void processMatches(Timer t){
542 		crisa=makeCrisArray(null);
543 
544 		ArrayList<HashThread> alht=new ArrayList<HashThread>(THREADS);
545 		for(int i=0; i<THREADS; i++){alht.add(new HashThread(true, (absorbContainment|findOverlaps), absorbMatch, false, false));}
546 		for(HashThread ht : alht){ht.start();}
547 		for(HashThread ht : alht){
548 			while(ht.getState()!=Thread.State.TERMINATED){
549 				try {
550 					ht.join();
551 				} catch (InterruptedException e) {
552 					// TODO Auto-generated catch block
553 					e.printStackTrace();
554 				}
555 			}
556 			matches+=ht.matchesT;
557 			collisions+=ht.collisionsT;
558 			containments+=ht.containmentsT;
559 			containmentCollisions+=ht.containmentCollisionsT;
560 			baseContainments+=ht.baseContainmentsT;
561 			baseMatches+=ht.baseMatchesT;
562 			addedToMain+=ht.addedToMainT;
563 			readsProcessed+=ht.readsProcessedT;
564 			basesProcessed+=ht.basesProcessedT;
565 		}
566 		alht.clear();
567 
568 		if(verbose){System.err.println("Attempting to close input streams (1).");}
569 		for(ConcurrentReadInputStream cris : crisa){
570 			errorState|=ReadWrite.closeStream(cris);
571 		}
572 		crisa=null;
573 
574 		if(DISPLAY_PROGRESS){
575 			t.stop();
576 			outstream.println("Found "+matches+" duplicates.");
577 			outstream.println("Finished exact matches.    Time: "+t);
578 			Shared.printMemory();
579 			if(verbose){outstream.println(affixMap1);}
580 			outstream.println();
581 			t.start();
582 		}
583 	}
584 
processContainments(Timer t)585 	private void processContainments(Timer t){
586 		ArrayList<Read> list=new ArrayList<Read>((int)addedToMain);
587 		for(ArrayList<Unit> alu : codeMap.values()){
588 			for(Unit u : alu){
589 				assert(u.r.mate==null) : "Containments are not currently supported with paired reads.";
590 				if(u.valid() && u.r.pairnum()==0){list.add(u.r);}
591 			}
592 		}
593 
594 		//	if(minLengthPercent>0){
595 		//		if(verbose){System.err.println("Sorting.");}
596 		//		Shared.sort(list, comparator);
597 		//		Collections.reverse(list);
598 		//		assert(list.isEmpty() || list.get(0).length()<=list.get(list.size()-1).length()) :
599 		//			list.get(0).length()+", "+list.get(list.size()-1).length();
600 		//	}
601 
602 		crisa=makeCrisArray(subsetMode ? null : list);
603 
604 		ArrayList<HashThread> alht=new ArrayList<HashThread>(THREADS);
605 		for(int i=0; i<THREADS; i++){alht.add(new HashThread(false, false, false, true, false));}
606 
607 		for(HashThread ht : alht){ht.start();}
608 		for(HashThread ht : alht){
609 			while(ht.getState()!=Thread.State.TERMINATED){
610 				try {
611 					ht.join();
612 				} catch (InterruptedException e) {
613 					// TODO Auto-generated catch block
614 					e.printStackTrace();
615 				}
616 			}
617 			assert(ht.matchesT==0);
618 			assert(ht.collisionsT==0);
619 			assert(ht.baseMatchesT==0);
620 			assert(ht.addedToMainT==0);
621 //			assert(ht.readsProcessedT==0);
622 //			assert(ht.basesProcessedT==0);
623 			//		matches+=ht.matchesT;
624 			//		collisions+=ht.collisionsT;
625 			containments+=ht.containmentsT;
626 			containmentCollisions+=ht.containmentCollisionsT;
627 			baseContainments+=ht.baseContainmentsT;
628 			//		baseMatches+=ht.baseMatchesT;
629 			//		addedToMain+=ht.addedToMainT;
630 			//		readsProcessed+=ht.readsProcessedT;
631 			//		basesProcessed+=ht.basesProcessedT;
632 		}
633 		alht.clear();
634 		if(verbose){System.err.println("Attempting to close input streams (2).");}
635 		for(ConcurrentReadInputStream cris : crisa){
636 			errorState|=ReadWrite.closeStream(cris);
637 		}
638 
639 		if(DISPLAY_PROGRESS){
640 			t.stop();
641 			outstream.println("Found "+containments+" contained sequences.");
642 			outstream.println("Finished containment.      Time: "+t);
643 			Shared.printMemory();
644 			outstream.println();
645 			t.start();
646 		}
647 		crisa=null;
648 		if(!findOverlaps){
649 			killAffixMaps();
650 		}
651 
652 		long x=removeInvalid(list);
653 		list.clear();
654 
655 		if(DISPLAY_PROGRESS){
656 			t.stop();
657 			outstream.println("Removed "+x+" invalid entries.");
658 			outstream.println("Finished invalid removal.  Time: "+t);
659 			Shared.printMemory();
660 			outstream.println();
661 			t.start();
662 		}
663 	}
664 
findOverlaps(Timer t)665 	private void findOverlaps(Timer t){
666 
667 		ArrayList<Read> list=new ArrayList<Read>((int)addedToMain);
668 		for(ArrayList<Unit> alu : codeMap.values()){
669 			for(Unit u : alu){
670 				if(u.valid() && u.r.pairnum()==0){
671 					u.unitID=list.size();
672 					list.add(u.r);
673 					if(u.r.mate!=null){
674 						Unit u2=(Unit)u.r.mate.obj;
675 						u2.unitID=u.unitID;
676 					}
677 				}else{
678 					u.unitID=Integer.MAX_VALUE;
679 				}
680 			}
681 		}
682 
683 		if(preventTransitiveOverlaps){
684 			clusterNumbers=new AtomicIntegerArray(list.size());
685 			for(int i=0; i<clusterNumbers.length(); i++){
686 				clusterNumbers.set(i, i);
687 			}
688 		}
689 
690 		crisa=makeCrisArray(subsetMode ? null : list);
691 
692 		ArrayList<HashThread> alht=new ArrayList<HashThread>(THREADS);
693 		for(int i=0; i<THREADS; i++){alht.add(new HashThread(false, false, false, false, true));}
694 
695 		for(HashThread ht : alht){ht.start();}
696 		for(HashThread ht : alht){
697 			while(ht.getState()!=Thread.State.TERMINATED){
698 				try {
699 					ht.join();
700 				} catch (InterruptedException e) {
701 					// TODO Auto-generated catch block
702 					e.printStackTrace();
703 				}
704 			}
705 			assert(ht.matchesT==0);
706 			assert(ht.collisionsT==0);
707 			assert(ht.baseMatchesT==0);
708 			assert(ht.addedToMainT==0);
709 
710 			overlaps+=ht.overlapsT;
711 			baseOverlaps+=ht.baseOverlapsT;
712 			overlapCollisions+=ht.overlapCollisionsT;
713 		}
714 		alht.clear();
715 		if(verbose){System.err.println("Attempting to close input streams (3).");}
716 		for(ConcurrentReadInputStream cris : crisa){
717 			errorState|=ReadWrite.closeStream(cris);
718 		}
719 
720 		if(DISPLAY_PROGRESS){
721 			t.stop();
722 			outstream.println("Found "+overlaps+" overlaps.");
723 			outstream.println("Finished finding overlaps. Time: "+t);
724 			Shared.printMemory();
725 			outstream.println();
726 			t.start();
727 		}
728 
729 		crisa=null;
730 
731 		if(makeClusters){
732 			int intransitive=0, redundant=0;
733 			assert((intransitive=countIntransitive(t, list, rigorousTransitive))==0);
734 			assert((redundant=countRedundant(t, list))==0);
735 			long overlaps=countOverlaps(t, list);
736 			assert(intransitive==0);
737 			assert(redundant==0);
738 //			makeTransitive(t, list, rigorousTransitive);
739 			if(clusterQueue==null){
740 				clusterQueue=new ArrayDeque<ArrayList<Unit>>(list.size()/4+1);
741 				processedClusters=new ArrayList<ArrayList<Unit>>();
742 			}else{
743 				assert(clusterQueue.isEmpty());
744 			}
745 			makeClusters(t, list);
746 		}
747 
748 		list.clear();
749 	}
750 
makeTransitive(Timer t, ArrayList<Read> list, boolean rigorous)751 	private long makeTransitive(Timer t, ArrayList<Read> list, boolean rigorous){
752 		assert(false) : "No longer needed.";
753 		long added=0;
754 		for(Read r : list){
755 			assert(r!=null);
756 			Unit u=(Unit) r.obj;
757 			assert(u!=null);
758 			assert(u.valid());
759 //			outstream.println("Considering "+r.id+"; valid="+u.valid()+", overlaps="+(u.overlapList==null ? "null" : u.overlapList.size()));
760 			if(u.valid()){
761 
762 				if(u.overlapList!=null){
763 					for(Overlap o : u.overlapList){
764 						Unit u2=(o.u1==u ? o.u2 : o.u1);
765 						assert(u2!=u);
766 						if(u2.overlapList==null){
767 							u2.overlapList=new ArrayList<Overlap>(2);
768 							u2.overlapList.add(o);
769 						}else{
770 							boolean found=false;
771 							if(rigorous){
772 								found=u2.overlapList.contains(o);
773 							}else{
774 								for(Overlap o2 : u2.overlapList){
775 									if(o2.u1==u || o2.u2==u){found=true; break;}
776 								}
777 							}
778 							if(!found){
779 								added++;
780 								u2.overlapList.add(o);
781 							}
782 						}
783 					}
784 				}
785 			}
786 		}
787 
788 		for(Read r : list){
789 			Unit u=(Unit) r.obj;
790 			if(u.valid()){
791 				assert(u.isTransitive());
792 			}
793 		}
794 
795 		if(DISPLAY_PROGRESS){
796 			t.stop();
797 			outstream.println("Added overlaps: "+added);
798 			outstream.println("Made overlaps transitive.  Time: "+t);
799 			Shared.printMemory();
800 			outstream.println();
801 			t.start();
802 		}
803 		return added;
804 	}
805 
countIntransitive(Timer t, ArrayList<Read> list, boolean rigorous)806 	private int countIntransitive(Timer t, ArrayList<Read> list, boolean rigorous){
807 		if(!countTransitive){return 0;}
808 		int transitive=0, intransitive=0;
809 		for(Read r : list){
810 			assert(r!=null);
811 			Unit u=(Unit) r.obj;
812 			assert(u!=null);
813 			assert(u.valid());
814 //			outstream.println("Considering "+r.id+"; valid="+u.valid()+", overlaps="+(u.overlapList==null ? "null" : u.overlapList.size()));
815 			if(u.valid()){
816 				if(rigorous ? u.isPerfectlyTransitive() : u.isTransitive()){
817 					transitive++;
818 				}else{
819 					intransitive++;
820 				}
821 			}
822 		}
823 
824 		if(DISPLAY_PROGRESS){
825 			t.stop();
826 			outstream.println("Intransitive:   "+intransitive+", \ttransitive: "+transitive);
827 			outstream.println("Checked transitivity.      Time: "+t);
828 			Shared.printMemory();
829 			outstream.println();
830 			t.start();
831 		}
832 
833 		return intransitive;
834 	}
835 
countRedundant(Timer t, ArrayList<Read> list)836 	private int countRedundant(Timer t, ArrayList<Read> list){
837 		if(!countRedundant){return 0;}
838 		int redundant=0, nonredundant=0;
839 		for(Read r : list){
840 			assert(r!=null);
841 			Unit u=(Unit) r.obj;
842 			assert(u!=null);
843 			assert(u.valid());
844 //			outstream.println("Considering "+r.id+"; valid="+u.valid()+", overlaps="+(u.overlapList==null ? "null" : u.overlapList.size()));
845 			if(u.valid()){
846 				if(u.isNonRedundant()){
847 					nonredundant++;
848 				}else{
849 					redundant++;
850 				}
851 			}
852 		}
853 
854 		if(DISPLAY_PROGRESS){
855 			t.stop();
856 			outstream.println("Redundant:      "+redundant+",  \tnonredundant: "+nonredundant);
857 			outstream.println("Checked redundancy.        Time: "+t);
858 			Shared.printMemory();
859 			outstream.println();
860 			t.start();
861 		}
862 		return redundant;
863 	}
864 
countOverlaps(Timer t, ArrayList<Read> list)865 	private long countOverlaps(Timer t, ArrayList<Read> list){
866 
867 		long overlaps=0, length=0;
868 		for(Read r : list){
869 			assert(r!=null);
870 			Unit u=(Unit) r.obj;
871 			assert(u!=null);
872 			assert(u.valid());
873 //			outstream.println("Considering "+r.id+"; valid="+u.valid()+", overlaps="+(u.overlapList==null ? "null" : u.overlapList.size()));
874 			if(u.valid() && u.overlapList!=null){
875 				for(Overlap o : u.overlapList){
876 					overlaps++;
877 					length+=o.overlapLen;
878 				}
879 			}
880 		}
881 
882 		if(DISPLAY_PROGRESS){
883 			t.stop();
884 			outstream.println("Overlaps:       "+overlaps+",  \tlength: "+length);
885 			outstream.println("Counted overlaps.          Time: "+t);
886 			Shared.printMemory();
887 			outstream.println();
888 			t.start();
889 		}
890 		return overlaps;
891 	}
892 
fillClusterSizeMatrix(ArrayList<ArrayList<Unit>> clusters, long[][] clusterSize)893 	private long fillClusterSizeMatrix(ArrayList<ArrayList<Unit>> clusters, long[][] clusterSize){
894 		int max=0;
895 		for(ArrayList<Unit> cluster : clusters){
896 			final int cs=Tools.min(clusterSize.length-1, cluster.size());
897 			{
898 				long reads=0, bases=0;
899 				for(Unit u2 : cluster){
900 					reads++;
901 					bases+=u2.length();
902 				}
903 				clusterSize[0][cs]++;
904 				clusterSize[1][cs]+=reads;
905 				clusterSize[2][cs]+=bases;
906 			}
907 			max=Tools.max(max, cluster.size());
908 		}
909 		return max;
910 	}
911 
makeClusters(Timer t, ArrayList<Read> list)912 	private void makeClusters(Timer t, ArrayList<Read> list){
913 
914 		final int clusterlen=70000;
915 		long[][] clusterSize=new long[3][clusterlen];
916 		int max=0;
917 		for(Read r : list){
918 			Unit u=(Unit) r.obj;
919 
920 			if(!u.clustered()){
921 				ArrayList<Unit> cluster=u.makeCluster();
922 				if(cluster.size()>2){cluster.trimToSize();}
923 				if(cluster.size()==1 || (!processClusters && !maxSpanningTree)){processedClusters.add(cluster);}
924 				else{clusterQueue.add(cluster);}
925 				final int cs=Tools.min(clusterlen-1, cluster.size());
926 				{
927 					long reads=0, bases=0;
928 					for(Unit u2 : cluster){
929 						reads++;
930 						bases+=u2.length();
931 					}
932 					clusterSize[0][cs]++;
933 					clusterSize[1][cs]+=reads;
934 					clusterSize[2][cs]+=bases;
935 				}
936 				max=Tools.max(max, cluster.size());
937 			}
938 		}
939 
940 		if(DISPLAY_PROGRESS){
941 			t.stop();
942 			outstream.println(toClusterSizeString(clusterSize));
943 			outstream.println("\nLargest:          "+max);
944 			outstream.println("Finished making clusters.  Time: "+t);
945 			Shared.printMemory();
946 			outstream.println();
947 			t.start();
948 		}
949 
950 
951 		long x=removeInvalid(list);
952 
953 		if(DISPLAY_PROGRESS){
954 			t.stop();
955 			outstream.println("Removed "+x+" invalid entries.");
956 			outstream.println("Finished invalid removal.  Time: "+t);
957 			Shared.printMemory();
958 			outstream.println();
959 			t.start();
960 		}
961 	}
962 
toClusterSizeString(long[][] clusterSizeMatrix)963 	private String toClusterSizeString(long[][] clusterSizeMatrix){
964 
965 		long[] clusterSize=clusterSizeMatrix[0];
966 		long[] clusterReads=clusterSizeMatrix[1];
967 		long[] clusterBases=clusterSizeMatrix[2];
968 
969 		long totalClusters=Tools.sum(clusterSize);
970 
971 		long bigClusters=0;
972 		for(int i=minClusterSize; i<clusterSize.length; i++){
973 			bigClusters+=clusterSize[i];
974 		}
975 
976 		final int spaces=18;
977 		final int spaces2=spaces*2, spaces3=spaces*3;
978 
979 		final StringBuilder sb=new StringBuilder(100), sb2=new StringBuilder(1000);
980 		sb2.append("Clusters:");
981 		while(sb2.length()<spaces){sb2.append(' ');}
982 		sb2.append(totalClusters+(minClusterSize<2 ? "" : " ("+bigClusters+" of at least size "+minClusterSize+")")+"\n");
983 
984 		sb.append("Size Range");
985 		while(sb.length()<spaces){sb.append(' ');}
986 		sb.append("Clusters");
987 		while(sb.length()<spaces2){sb.append(' ');}
988 		sb.append("Reads");
989 		while(sb.length()<spaces3){sb.append(' ');}
990 		sb.append("Bases");
991 
992 		sb2.append('\n');
993 		sb2.append(sb);
994 		sb.setLength(0);
995 
996 		for(int i=0; i<clusterSize.length-1; i=Tools.max(i+1, i*2)){
997 			int a=i+1, b=i*2;
998 			if(i<2){
999 				sb.append(a);
1000 				while(sb.length()<spaces){sb.append(' ');}
1001 				sb.append(clusterSize[a]);
1002 				while(sb.length()<spaces2){sb.append(' ');}
1003 				sb.append(clusterReads[a]);
1004 				while(sb.length()<spaces3){sb.append(' ');}
1005 				sb.append(clusterBases[a]);
1006 			}else if(b>=clusterSize.length){
1007 				long x=Tools.sum(clusterSize, a, clusterSize.length-1);
1008 				long y=Tools.sum(clusterReads, a, clusterSize.length-1);
1009 				long z=Tools.sum(clusterBases, a, clusterSize.length-1);
1010 				if(x>0){
1011 					sb.append(a+"+");
1012 					while(sb.length()<spaces){sb.append(' ');}
1013 					sb.append(x);
1014 					while(sb.length()<spaces2){sb.append(' ');}
1015 					sb.append(y);
1016 					while(sb.length()<spaces3){sb.append(' ');}
1017 					sb.append(z);
1018 				}
1019 			}else{
1020 				long x=Tools.sum(clusterSize, a, b);
1021 				long y=Tools.sum(clusterReads, a, b);
1022 				long z=Tools.sum(clusterBases, a, b);
1023 				if(x>0){
1024 					sb.append(a+"-"+b);
1025 					while(sb.length()<spaces){sb.append(' ');}
1026 					sb.append(x);
1027 					while(sb.length()<spaces2){sb.append(' ');}
1028 					sb.append(y);
1029 					while(sb.length()<spaces3){sb.append(' ');}
1030 					sb.append(z);
1031 				}
1032 			}
1033 			if(sb.length()>0){
1034 				sb2.append('\n');
1035 				sb2.append(sb);
1036 				sb.setLength(0);
1037 			}
1038 		}
1039 		return sb2.toString();
1040 	}
1041 
renameClusters(Timer t)1042 	private void renameClusters(Timer t){
1043 		assert(false) : "This is now unused; renaming is done at output time.";
1044 		int cnum=0;
1045 		final StringBuilder sb=new StringBuilder(64);
1046 		for(ArrayList<Unit> alu : processedClusters){
1047 			for(int i=0; i<alu.size(); i++){
1048 				Unit u=alu.get(i);
1049 				Read r=u.r;
1050 				sb.append("Cluster ");
1051 				sb.append(cnum);
1052 				sb.append(",contig ");
1053 				sb.append(i);
1054 				if(u.offsetValid()){
1055 					sb.append(",pos ");
1056 					sb.append(u.offset());
1057 				}
1058 				r.id=sb.toString();
1059 				sb.setLength(0);
1060 			}
1061 		}
1062 
1063 		if(DISPLAY_PROGRESS){
1064 			t.stop();
1065 			outstream.println("Finished cluster renaming. Time: "+t);
1066 			Shared.printMemory();
1067 			outstream.println();
1068 			t.start();
1069 		}
1070 	}
1071 
processMst(Timer t)1072 	private void processMst(Timer t){
1073 
1074 		if(DISPLAY_PROGRESS){outstream.println("Converting to Maximum Spanning Tree.");}
1075 
1076 		ArrayList<MstThread> alct=new ArrayList<MstThread>(THREADS);
1077 		for(int i=0; i<THREADS; i++){
1078 			alct.add(new MstThread());
1079 		}
1080 
1081 		long overlapsRemoved=0;
1082 		long overlapBasesRemoved=0;
1083 		long overlapsRetained=0;
1084 		long overlapBasesRetained=0;
1085 
1086 		for(MstThread ct : alct){ct.start();}
1087 		for(MstThread ct : alct){
1088 			while(ct.getState()!=Thread.State.TERMINATED){
1089 				try {
1090 					ct.join();
1091 				} catch (InterruptedException e) {
1092 					e.printStackTrace();
1093 				}
1094 			}
1095 
1096 			overlapsRemoved+=ct.overlapsRemovedT;
1097 			overlapBasesRemoved+=ct.overlapBasesRemovedT;
1098 			overlapsRetained+=ct.overlapsRetainedT;
1099 			overlapBasesRetained+=ct.overlapBasesRetainedT;
1100 		}
1101 		assert(clusterQueue.isEmpty());
1102 		if(processClusters){
1103 			for(MstThread ct : alct){
1104 				clusterQueue.addAll(ct.processedT);
1105 				ct.processedT.clear();
1106 				ct.processedT=null;
1107 			}
1108 		}else{
1109 			for(MstThread ct : alct){
1110 				processedClusters.addAll(ct.processedT);
1111 				ct.processedT.clear();
1112 				ct.processedT=null;
1113 			}
1114 			clusterQueue=null;
1115 		}
1116 		alct.clear();
1117 
1118 		assert(affixMaps==null);
1119 		killAffixMaps();
1120 
1121 
1122 		if(DISPLAY_PROGRESS){
1123 			t.stop();
1124 			outstream.println("Removed "+(overlapsRemoved)+" edges ("+overlapBasesRemoved+" bases).");
1125 			outstream.println("Retained "+(overlapsRetained)+" edges ("+overlapBasesRetained+" bases).");
1126 
1127 //			outstream.println("\nAfter conversion to Maximum Spanning Tree:");
1128 //			final int[] clusterSize=new int[8200];
1129 //			int max=0;
1130 //			for(ArrayList<Unit> cluster : processedClusters){
1131 //				clusterSize[Tools.min(clusterSize.length-1, cluster.size())]++;
1132 //				max=Tools.max(max, cluster.size());
1133 //			}
1134 //			outstream.println(toClusterSizeString(clusterSize));
1135 //			outstream.println("Largest:          "+max);
1136 
1137 			outstream.println("Finished MST conversion.   Time: "+t);
1138 			Shared.printMemory();
1139 			outstream.println();
1140 			t.start();
1141 		}
1142 	}
1143 
processClusters(Timer t, boolean mergeClusters)1144 	private void processClusters(Timer t, boolean mergeClusters){
1145 
1146 		ArrayList<ClusterThread> alct=new ArrayList<ClusterThread>(THREADS);
1147 		for(int i=0; i<THREADS; i++){
1148 			alct.add(new ClusterThread(fixMultiJoins, removeCycles, fixOffsetContradictions,
1149 					mergeClusters, mergeClusters, mergeClusters));
1150 		}
1151 
1152 		long leafMerges=0;
1153 		long innerMerges=0;
1154 		long leafBaseMerges=0;
1155 		long innerBaseMerges=0;
1156 
1157 		long multiJoinFailures=0;
1158 		long multiJoinsFound=0;
1159 		long multiJoinBasesFound=0;
1160 		long unitsFlipped=0;
1161 		long overlapsFlipped=0;
1162 		long offsetContradictoryOverlaps=0;
1163 		long offsetContradictoryClusters=0;
1164 		long cycleOverlaps=0;
1165 		long cycleClusters=0;
1166 
1167 		for(ClusterThread ct : alct){ct.start();}
1168 		for(ClusterThread ct : alct){
1169 			while(ct.getState()!=Thread.State.TERMINATED){
1170 				try {
1171 					ct.join();
1172 				} catch (InterruptedException e) {
1173 					e.printStackTrace();
1174 				}
1175 			}
1176 
1177 			leafMerges+=ct.leafMergesT;
1178 			innerMerges+=ct.innerMergesT;
1179 			leafBaseMerges+=ct.leafBaseMergesT;
1180 			innerBaseMerges+=ct.innerBaseMergesT;
1181 
1182 			multiJoinFailures+=ct.multiJoinFailuresT;
1183 			multiJoinsFound+=ct.multiJoinsFoundT;
1184 			multiJoinBasesFound+=ct.multiJoinBasesFoundT;
1185 			unitsFlipped+=ct.unitsFlippedT;
1186 			overlapsFlipped+=ct.overlapsFlippedT;
1187 			offsetContradictoryOverlaps+=ct.offsetContradictoryOverlapsT;
1188 			offsetContradictoryClusters+=ct.offsetContradictoryClustersT;
1189 			cycleOverlaps+=ct.cycleOverlapsT;
1190 			cycleClusters+=ct.cycleClustersT;
1191 		}
1192 		alct.clear();
1193 
1194 		assert(affixMaps==null || affixMap1==null);
1195 		killAffixMaps();
1196 
1197 		assert(clusterQueue.isEmpty());
1198 		clusterQueue=null;
1199 
1200 		if(DISPLAY_PROGRESS){
1201 			t.stop();
1202 			if(fixMultiJoins){
1203 				outstream.println("Found "+(multiJoinsFound)+" multijoins ("+multiJoinBasesFound+" bases).");
1204 				outstream.println("Experienced "+(multiJoinFailures)+" multijoin removal failures.");
1205 			}
1206 			if(fixOffsetContradictions){
1207 				outstream.println("Found "+(offsetContradictoryClusters)+" clusters ("+offsetContradictoryOverlaps+" overlaps) with contradictory offset cycles.");
1208 			}
1209 			outstream.println("Found "+(cycleClusters)+" clusters ("+cycleOverlaps+" overlaps) with remaining cycles.");
1210 			if(absorbOverlap){
1211 				outstream.println("Merged "+(leafMerges)+" leaves ("+leafBaseMerges+" bases).");
1212 				outstream.println("Merged "+(innerMerges)+" nonleaves ("+innerBaseMerges+" bases).");
1213 			}
1214 
1215 			outstream.println("\nAfter processing clusters:");
1216 			final long[][] clusterSize=new long[3][70000];
1217 			final long max=fillClusterSizeMatrix(processedClusters, clusterSize);
1218 			outstream.println(toClusterSizeString(clusterSize));
1219 			outstream.println("\nLargest:          "+max);
1220 
1221 			outstream.println("Finished processing.       Time: "+t);
1222 			Shared.printMemory();
1223 			outstream.println();
1224 			t.start();
1225 		}
1226 	}
1227 
removeInvalid(ArrayList<Read> list)1228 	private long removeInvalid(ArrayList<Read> list){
1229 		final LongM keym=new LongM();
1230 		long removedC=0, removedP=0, removedS=0, invalid=0;
1231 
1232 		for(int j=0, lim=list.size(); j<lim; j++){
1233 			final Read r=list.get(j);
1234 			final Unit u=(Unit)r.obj;
1235 
1236 			if(!u.valid()){
1237 
1238 				invalid++;
1239 
1240 				if(codeMap!=null && !codeMap.isEmpty()){
1241 					Long key=u.code1;
1242 					ArrayList<Unit> alu=codeMap.get(key);
1243 					if(alu!=null){
1244 						int valid=0;
1245 						for(int i=alu.size()-1; i>=0; i--){
1246 							Unit u2=alu.get(i);
1247 							if(u2==null || !u2.valid()){
1248 								alu.remove(i);
1249 								removedC++;
1250 							}
1251 							else{valid++;}
1252 						}
1253 						if(valid==0){codeMap.remove(key);}
1254 					}
1255 				}
1256 
1257 				if(affixMap1!=null && !affixMap1.isEmpty()){
1258 					{
1259 						keym.set(u.prefix1);
1260 						ArrayList<Unit> alu=affixMap1.get(keym);
1261 						if(alu!=null){
1262 							int valid=0;
1263 							for(int i=alu.size()-1; i>=0; i--){
1264 								Unit u2=alu.get(i);
1265 								if(u2==null || !u2.valid()){
1266 									alu.remove(i);
1267 									removedP++;
1268 								}
1269 								else{valid++;}
1270 							}
1271 							if(valid==0){affixMap1.remove(keym);}
1272 						}
1273 					}
1274 					if(storeSuffix){
1275 						keym.set(u.suffix1);
1276 						ArrayList<Unit> alu=affixMap1.get(keym);
1277 						if(alu!=null){
1278 							int valid=0;
1279 							for(int i=alu.size()-1; i>=0; i--){
1280 								Unit u2=alu.get(i);
1281 								if(u2==null || !u2.valid()){
1282 									alu.remove(i);
1283 									removedS++;
1284 								}
1285 								else{valid++;}
1286 							}
1287 							if(valid==0){affixMap1.remove(keym);}
1288 						}
1289 					}
1290 				}
1291 				if(affixMap2!=null && !affixMap2.isEmpty()){
1292 					if(u.prefix2!=-1){
1293 						keym.set(u.prefix2);
1294 						ArrayList<Unit> alu=affixMap2.get(keym);
1295 						if(alu!=null){
1296 							int valid=0;
1297 							for(int i=alu.size()-1; i>=0; i--){
1298 								Unit u2=alu.get(i);
1299 								if(u2==null || !u2.valid()){
1300 									alu.remove(i);
1301 									removedP++;
1302 								}
1303 								else{valid++;}
1304 							}
1305 							if(valid==0){affixMap2.remove(keym);}
1306 						}
1307 					}
1308 					if(storeSuffix && u.suffix2!=-1){
1309 						keym.set(u.suffix2);
1310 						ArrayList<Unit> alu=affixMap2.get(keym);
1311 						if(alu!=null){
1312 							int valid=0;
1313 							for(int i=alu.size()-1; i>=0; i--){
1314 								Unit u2=alu.get(i);
1315 								if(u2==null || !u2.valid()){
1316 									alu.remove(i);
1317 									removedS++;
1318 								}
1319 								else{valid++;}
1320 							}
1321 							if(valid==0){affixMap2.remove(keym);}
1322 						}
1323 					}
1324 
1325 				}
1326 
1327 				list.set(j, null);
1328 			}
1329 		}
1330 
1331 		if(invalid>0){
1332 			Tools.condenseStrict(list);
1333 		}
1334 		if(verbose){
1335 			outstream.println("Removed invalids: "+removedC+", "+removedP+", "+removedS);
1336 		}
1337 		return invalid;
1338 	}
1339 
1340 
addToArray(HashMap<Long, ArrayList<Unit>> codeMap, boolean sort, boolean clear, long outNum)1341 	private static ArrayList<Read> addToArray(HashMap<Long, ArrayList<Unit>> codeMap, boolean sort, boolean clear, long outNum){
1342 		assert(outNum<=Integer.MAX_VALUE);
1343 		if(verbose){System.err.println("Making list.");}
1344 		ArrayList<Read> list=new ArrayList<Read>((int)outNum);
1345 		if(verbose){System.err.println("Adding.");}
1346 		for(ArrayList<Unit> alu : codeMap.values()){
1347 			for(Unit u : alu){
1348 				if(u.valid() && u.r.pairnum()==0){list.add(u.r);}
1349 			}
1350 			if(clear){alu.clear();}
1351 		}
1352 		if(clear){codeMap.clear();}
1353 
1354 		if(sort){
1355 			if(verbose){System.err.println("Sorting.");}
1356 			Shared.sort(list, comparator);
1357 //			if(ascending){
1358 //				Collections.reverse(list);
1359 //				assert(list.isEmpty() || list.get(0).length()<=list.get(list.size()-1).length()) :
1360 //					list.get(0).length()+", "+list.get(list.size()-1).length();
1361 //			}else{
1362 //				assert(list.isEmpty() || list.get(0).length()>=list.get(list.size()-1).length()) :
1363 //					list.get(0).length()+", "+list.get(list.size()-1).length();
1364 //			}
1365 		}
1366 		assert(list.size()==outNum || list.size()*2L==outNum || UNIQUE_ONLY) : list.size()+", "+outNum;
1367 		return list;
1368 	}
1369 
writeOutput(String clusterStatsFile, Timer t)1370 	private void writeOutput(String clusterStatsFile, Timer t){
1371 //		verbose=true;
1372 //		assert(false) : (processedClusters==null)+", "+(processedClusters.isEmpty())+", "+outgraph+", "+out+", "+clusterFilePattern;
1373 		if(processedClusters==null || processedClusters.isEmpty()){
1374 
1375 			if(out!=null || clusterFilePattern!=null){
1376 
1377 				ArrayList<Read> list=addToArray(codeMap, sort, true, addedToMain-containments);
1378 				codeMap=null;
1379 
1380 				if(sort){
1381 					if(DISPLAY_PROGRESS){
1382 						t.stop();
1383 						outstream.println("Sorted output.             Time: "+t);
1384 						Shared.printMemory();
1385 						outstream.println();
1386 						t.start();
1387 					}
1388 				}
1389 
1390 				writeOutput(list);
1391 			}
1392 		}else{
1393 			if(outgraph!=null){
1394 				writeGraph(outgraph, processedClusters);
1395 			}
1396 			if(out!=null || clusterFilePattern!=null || clusterStatsFile!=null || outbest!=null){
1397 				writeOutputClusters(clusterStatsFile, processedClusters);
1398 			}
1399 		}
1400 
1401 		if(DISPLAY_PROGRESS){
1402 			t.stop();
1403 			outstream.println("Printed output.            Time: "+t);
1404 			Shared.printMemory();
1405 			outstream.println();
1406 			t.start();
1407 		}
1408 	}
1409 
1410 
1411 
writeOutput(ArrayList<Read> list)1412 	private void writeOutput(ArrayList<Read> list){
1413 
1414 		final ByteStreamWriter tsw=(out==null ? null : new ByteStreamWriter(out, overwrite, append, true));
1415 
1416 		if(verbose){System.err.println("Writing from array.");}
1417 		tsw.start();
1418 
1419 		HashSet<String> names=((uniqueNames && storeName) ?
1420 				new HashSet<String>(Tools.min(Integer.MAX_VALUE, Tools.max((int)addedToMain, (int)(addedToMain*1.35)))) : null);
1421 		long rid=0;
1422 		for(int x=0; x<list.size(); x++){
1423 			Read r=list.get(x);
1424 			list.set(x, null);
1425 
1426 			if(r.mate!=null && r.pairnum()!=0){r=r.mate;}
1427 
1428 			assert(r.mate==null || r.mate.discarded()==r.discarded());
1429 
1430 			if(!r.discarded()){
1431 				rid++;
1432 
1433 				for(int i=0; r!=null && i<2; i++){
1434 					if(multipleInputFiles){r.numericID=rid;}
1435 					if(names!=null){
1436 						String name=(r.id==null ? ""+r.numericID : r.id);
1437 						if(names.contains(name)){
1438 							for(long j=0; j<Integer.MAX_VALUE; j++){
1439 								String name2=name+"_dd"+j;
1440 								if(r.mate!=null){name2+=(" /"+(i+1));}
1441 								if(!names.contains(name2)){
1442 									r.id=name2;
1443 									names.add(name2);
1444 									break;
1445 								}
1446 							}
1447 						}else{
1448 							names.add(name);
1449 						}
1450 					}
1451 					tsw.println(r);
1452 					r.setDiscarded(true);
1453 					r=r.mate;
1454 				}
1455 			}
1456 		}
1457 		if(verbose){System.err.println("Shutting down tsw "+tsw.fname);}
1458 		tsw.poisonAndWait();
1459 	}
1460 
1461 
writeOutputClusters(String clusterStatsFile, ArrayList<ArrayList<Unit>> clist)1462 	private void writeOutputClusters(String clusterStatsFile, ArrayList<ArrayList<Unit>> clist){
1463 
1464 //		Shared.sort(clist, CLUSTER_LENGTH_COMPARATOR); //Causes a crash: java.lang.System.arraycopy(Native Method)
1465 		clist.sort(CLUSTER_LENGTH_COMPARATOR);
1466 
1467 		if(verbose){System.err.println("Writing clusters.");}
1468 
1469 		final ByteStreamWriter tswAll=(out==null ? null : new ByteStreamWriter(out, overwrite, append, true));
1470 		if(tswAll!=null){tswAll.start();}
1471 		ByteStreamWriter tswCluster=null;
1472 		ByteStreamWriter tswBest=null;
1473 
1474 		if(outbest!=null){
1475 			tswBest=new ByteStreamWriter(outbest, overwrite, append, true);
1476 			tswBest.start();
1477 		}
1478 
1479 		TextStreamWriter csf=null;
1480 		if(clusterStatsFile!=null){
1481 			csf=new TextStreamWriter(clusterStatsFile, overwrite, false, false);
1482 			csf.start();
1483 			csf.print("#Name\tsize\t"+nmerLength+"-mer frequencies\n");
1484 		}
1485 
1486 		HashSet<String> names=((uniqueNames && storeName) ?
1487 				new HashSet<String>(Tools.min(Integer.MAX_VALUE, Tools.max((int)addedToMain, (int)(addedToMain*1.35)))) : null);
1488 		long rid=0;
1489 		final long[] nmerCounts=new long[maxNmer+1];
1490 
1491 		final StringBuilder sb=new StringBuilder(64);
1492 
1493 		for(int cnum=0; cnum<clist.size(); cnum++){
1494 			final ArrayList<Unit> alu=clist.get(cnum);
1495 //			clist.set(cnum, null); //This breaks subsequent output processing
1496 
1497 			if(alu.size()<minClusterSize){
1498 				if(verbose){System.err.println("Ignoring small cluster "+cnum+", size "+alu.size());}
1499 
1500 				if(csf!=null && alu.size()>=minClusterSizeForStats){
1501 					float[] profile=makeNmerProfile(alu, nmerCounts);
1502 					sb.append("Cluster_");
1503 					sb.append(cnum);
1504 					sb.append('\t');
1505 					sb.append(alu.size());
1506 					sb.append('\t');
1507 					for(float f : profile){
1508 						sb.append(String.format(Locale.ROOT, "%.5f ", f));
1509 					}
1510 					sb.setCharAt(sb.length()-1, '\n');
1511 					csf.print(sb.toString());
1512 					sb.setLength(0);
1513 				}
1514 			}else{
1515 				if(verbose){System.err.println("Writing cluster "+cnum+", size "+alu.size());}
1516 
1517 				if(clusterFilePattern!=null){
1518 					if(tswCluster!=null){
1519 						if(verbose){System.err.println("Shutting down tswCluster "+tswCluster.fname);}
1520 						tswCluster.poisonAndWait();
1521 						tswCluster=null;
1522 					}
1523 					tswCluster=new ByteStreamWriter(clusterFilePattern.replaceFirst("%", ""+cnum), overwrite, append, true);
1524 					if(verbose){System.err.println("Starting tswCluster "+tswCluster.fname);}
1525 					tswCluster.start();
1526 				}
1527 
1528 				if(csf!=null && alu.size()>=minClusterSizeForStats){
1529 					float[] profile=makeNmerProfile(alu, nmerCounts);
1530 					sb.append("Cluster_");
1531 					sb.append(cnum);
1532 					sb.append('\t');
1533 					sb.append(alu.size());
1534 					sb.append('\t');
1535 					for(float f : profile){
1536 						sb.append(String.format(Locale.ROOT, "%.5f ", f));
1537 					}
1538 					sb.setCharAt(sb.length()-1, '\n');
1539 					csf.print(sb.toString());
1540 					sb.setLength(0);
1541 				}
1542 
1543 				if(pickBestRepresentativePerCluster){
1544 					pickBestRepresenative(alu, true);
1545 				}
1546 
1547 				if(outbest!=null){
1548 					Unit u=pickBestRepresenative((ArrayList<Unit>)alu.clone(), false);
1549 					tswBest.println(u.r);
1550 					if(u.r.mate!=null){tswBest.println(u.r.mate);}
1551 				}
1552 
1553 				for(int contig=0; contig<alu.size(); contig++){
1554 					final Unit u0=alu.get(contig);
1555 					alu.set(contig, null);
1556 					Read r=u0.r;
1557 					if(r.mate!=null && r.pairnum()!=0){r=r.mate;}
1558 
1559 					if(!r.discarded()){
1560 						rid++;
1561 
1562 						for(int i=0; r!=null && i<2; i++){
1563 							assert(r.pairnum()==i) : i+", "+r.pairnum()+", "+(r.mate==null ? 9 : r.mate.pairnum());
1564 							Unit u=(Unit)r.obj;
1565 							if(verbose){System.err.println("Writing read "+r.id);}
1566 							r.numericID=rid;
1567 							if(renameClusters){
1568 								sb.append("Cluster_");
1569 								sb.append(cnum);
1570 								sb.append(",contig_");
1571 								sb.append(contig);
1572 								if(u.offsetValid()){
1573 									sb.append(",pos_");
1574 									sb.append(u.offset());
1575 								}
1576 								if(r.mate!=null){sb.append(" /"+(i+1));}
1577 								r.id=(r.id==null ? sb.toString() : r.id+"\t"+sb);
1578 								sb.setLength(0);
1579 							}else if(names!=null){
1580 								String name=(r.id==null ? ""+r.numericID : r.id);
1581 								if(names.contains(name)){
1582 									for(long j=0; j<Integer.MAX_VALUE; j++){
1583 										String name2=name+"_dd"+j;
1584 										if(!names.contains(name2)){
1585 											r.id=name2;
1586 											names.add(name2);
1587 											break;
1588 										}
1589 									}
1590 								}else{
1591 									names.add(name);
1592 								}
1593 							}
1594 							if(tswAll!=null){tswAll.println(r);}
1595 							if(tswCluster!=null){tswCluster.println(r);}
1596 							r.setDiscarded(true);
1597 							r=r.mate;
1598 						}
1599 					}
1600 				}
1601 			}
1602 		}
1603 		if(csf!=null){
1604 			if(verbose){System.err.println("Shutting down csf "+csf.fname);}
1605 			csf.poisonAndWait();
1606 		}
1607 		if(tswBest!=null){
1608 			if(verbose){System.err.println("Shutting down tswBest "+tswBest.fname);}
1609 			tswBest.poisonAndWait();
1610 		}
1611 		if(tswAll!=null){
1612 			if(verbose){System.err.println("Shutting down tswAll "+tswAll.fname);}
1613 			tswAll.poisonAndWait();
1614 		}
1615 		if(tswCluster!=null){
1616 			if(verbose){System.err.println("Shutting down tswCluster "+tswCluster.fname);}
1617 			tswCluster.poisonAndWait();
1618 		}
1619 	}
1620 
1621 
writeGraph(String graphFile, ArrayList<ArrayList<Unit>> clist)1622 	private void writeGraph(String graphFile, ArrayList<ArrayList<Unit>> clist){
1623 //		Shared.sort(clist, CLUSTER_LENGTH_COMPARATOR);
1624 		clist.sort(CLUSTER_LENGTH_COMPARATOR);
1625 
1626 		if(verbose){System.err.println("Writing overlap graph.");}
1627 
1628 		final TextStreamWriter tsw=(graphFile==null ? null : new TextStreamWriter(graphFile, overwrite, append, true));
1629 		if(tsw!=null){
1630 			tsw.start();
1631 			tsw.print("digraph G {\n");
1632 		}
1633 
1634 		for(int cnum=0; cnum<clist.size(); cnum++){
1635 			final ArrayList<Unit> alu=clist.get(cnum);
1636 //			clist.set(cnum, null); //This breaks subsequent output processing
1637 //			Shared.sort(alu); //TODO: Remove
1638 
1639 			if(alu.size()<minClusterSize){
1640 				if(verbose){System.err.println("Ignoring small cluster "+cnum+", size "+alu.size());}
1641 			}else{
1642 				if(verbose){System.err.println("Processing cluster "+cnum+", size "+alu.size());}
1643 
1644 				for(int contig=0; contig<alu.size(); contig++){
1645 					final Unit u0=alu.get(contig);
1646 //					alu.set(contig, null); //This breaks subsequent output processing
1647 					Read r=u0.r;
1648 					if(r.mate!=null && r.pairnum()!=0){r=r.mate;}
1649 
1650 					{
1651 						for(int i=0; r!=null && i<2; i++){
1652 							assert(r.pairnum()==i) : i+", "+r.pairnum()+", "+(r.mate==null ? 9 : r.mate.pairnum());
1653 							Unit u=(Unit)r.obj;
1654 							if(verbose){System.err.println("Writing read "+r.id);}
1655 
1656 							if(tsw!=null){
1657 								tsw.print("\t"+toGraphName(r)+"\n");
1658 								if(r.mate!=null && r.pairnum()==0){
1659 									Read r2=r.mate;
1660 									tsw.print(toGraphName(r)+" -> "+toGraphName(r2)+" [label=mate]");
1661 								}
1662 								if(u.overlapList!=null){
1663 									for(Overlap o : u.overlapList){
1664 										if(u==o.u1){
1665 											Read r2=o.u2.r;
1666 											tsw.print("\t"+toGraphName(r)+" -> "+toGraphName(r2)+" [label=\""+o.toLabel()+"\"]\n");
1667 										}
1668 									}
1669 								}
1670 							}
1671 							r=r.mate;
1672 						}
1673 					}
1674 				}
1675 			}
1676 		}
1677 
1678 		if(tsw!=null){
1679 			tsw.print("}\n");
1680 			if(verbose){System.err.println("Shutting down tswAll "+tsw.fname);}
1681 			tsw.poisonAndWait();
1682 		}
1683 	}
1684 
toGraphName(Read r)1685 	private static String toGraphName(Read r){
1686 		if(NUMBER_GRAPH_NODES || r.id==null){
1687 			return r.numericID+((ADD_PAIRNUM_TO_NAME || r.mate!=null) ? "."+(r.pairnum()+1) : "");
1688 		}else{
1689 			return r.id.replace(' ','_').replace('\t','_');
1690 		}
1691 	}
1692 
pickBestRepresenative(ArrayList<Unit> alu, boolean clearList)1693 	private Unit pickBestRepresenative(ArrayList<Unit> alu, boolean clearList){
1694 		if(alu==null || alu.isEmpty()){return null;}
1695 		float[] quality=new float[alu.size()];
1696 		int[] lengths=new int[alu.size()];
1697 		for(int i=0; i<alu.size(); i++){
1698 			Unit u=alu.get(i);
1699 			int len=u.r.length();
1700 			quality[i]=u.r.expectedErrors(true, 0)/len;
1701 			lengths[i]=len;
1702 		}
1703 		Arrays.sort(quality);
1704 		Arrays.sort(lengths);
1705 		int medianLength=lengths[lengths.length/2];
1706 		float bestQuality=quality[0];
1707 
1708 		float currentBestQuality=9999999;
1709 		Unit best=null;
1710 		for(int i=0; i<alu.size(); i++){
1711 			Unit u=alu.get(i);
1712 			int len=u.r.length();
1713 			float deviation=Tools.absdif(len, medianLength)*1f/(medianLength+1);
1714 			if(deviation<0.05){
1715 				float qual=u.r.expectedErrors(true, 0)/len;
1716 				qual=(qual+.001f)*(1+10*deviation);
1717 				if(qual<currentBestQuality || best==null){
1718 					currentBestQuality=qual;
1719 					best=u;
1720 				}
1721 			}
1722 		}
1723 		if(clearList){
1724 			alu.clear();
1725 			alu.add(best);
1726 		}
1727 		return best;
1728 	}
1729 
hash(byte[] bases)1730 	public static long hash(byte[] bases){
1731 		long code=bases.length;
1732 		for(int i=0; i<bases.length; i++){
1733 			byte b=bases[i];
1734 			int mode=(int)(code&31);
1735 			assert(hashcodes[b]!=null) : "Invalid sequence character: '"+(char)b+"'";
1736 			code=code^hashcodes[b][mode];
1737 			code=Long.rotateLeft(code, 1);
1738 		}
1739 		return code;
1740 	}
1741 
1742 
hashReversed(byte[] bases)1743 	public static long hashReversed(byte[] bases){
1744 		long code=bases.length;
1745 		for(int i=bases.length-1; i>=0; i--){
1746 			byte b=bases[i];
1747 			assert(hashcodesR[b]!=null) : "Invalid sequence character: '"+(char)b+"'";
1748 			int mode=(int)(code&31);
1749 			code=code^hashcodesR[b][mode];
1750 			code=Long.rotateLeft(code, 1);
1751 		}
1752 		return code;
1753 	}
1754 
makeCodes(int symbols, int modes, long seed)1755 	private static synchronized long[][] makeCodes(int symbols, int modes, long seed){
1756 		Random randy=Shared.threadLocalRandom(seed);
1757 		long[][] r=new long[symbols][modes];
1758 		for(int i=0; i<symbols; i++){
1759 			for(int j=0; j<modes; j++){
1760 				r[i][j]=randy.nextLong();
1761 			}
1762 		}
1763 		return r;
1764 	}
1765 
1766 	/** Handles IUPAC codes and invalid symbols */
makeCodes2(int modes, long seed)1767 	private static synchronized long[][] makeCodes2(int modes, long seed){
1768 		long[][] r=makeCodes(128, modes, seed);
1769 
1770 		for(int i=0; i<26; i++){
1771 			char c=(char)('A'+i);
1772 			r[Tools.toLowerCase(c)]=r[c];
1773 		}
1774 		return r;
1775 	}
1776 
addDupe(Read r)1777 	private void addDupe(Read r){
1778 		if(dupeWriter==null){return;}
1779 		if(r.mate==null || r.pairnum()==0){
1780 			synchronized(dupeWriter){
1781 				dupeWriter.println(r);
1782 				if(r.mate!=null){
1783 					dupeWriter.println(r.mate);
1784 				}
1785 			}
1786 		}
1787 	}
1788 
1789 
1790 	private final class MstThread extends Thread{
1791 
MstThread()1792 		public MstThread(){}
1793 
1794 		@Override
run()1795 		public void run(){
1796 
1797 			ArrayList<Unit> cluster=null;
1798 			while((cluster=nextCluster())!=null){
1799 				makeMst(cluster);
1800 				processedT.add(cluster);
1801 			}
1802 
1803 		}
1804 
makeMst(ArrayList<Unit> cluster)1805 		public void makeMst(ArrayList<Unit> cluster){
1806 			assert(heap.isEmpty());
1807 			unvisit(cluster);
1808 			for(Unit u : cluster){
1809 				u.flags&=~Unit.VISIT_MASK;
1810 				Shared.sort(u.overlapList);
1811 			}
1812 			{
1813 				Unit u=cluster.get(0);
1814 				u.setVisited(true);
1815 				heap.addAll(u.overlapList);
1816 			}
1817 //			assert(false) : cluster.size();
1818 			while(!heap.isEmpty()){
1819 				Overlap o=heap.poll();
1820 				assert(!o.mst());
1821 				if(!o.invalid()){
1822 //					assert(o.u1.overlapList.contains(o)); //slow
1823 //					assert(o.u2.overlapList.contains(o)); //slow
1824 					assert(o.u1.visited() || o.u2.visited());
1825 					final Unit u=(!o.u1.visited() ? o.u1 : !o.u2.visited()? o.u2 : null);
1826 					if(u!=null){
1827 						o.setMst(true);
1828 						u.setVisited(true);
1829 						overlapsRetainedT++;
1830 						overlapBasesRetainedT+=o.overlapLen;
1831 						for(Overlap o2 : u.overlapList){
1832 							if(o2.mst()){
1833 								//do nothing
1834 							}else if(!o2.u1.visited() || !o2.u2.visited()){
1835 								if(heap.size()>=Integer.MAX_VALUE){
1836 									removeInvalid(heap);
1837 								}
1838 								heap.add(o2);
1839 							}else if(!o2.invalid()){
1840 								o2.setInvalid(true);
1841 								overlapsRemovedT++;
1842 								overlapBasesRemovedT+=o2.overlapLen;
1843 							}
1844 						}
1845 					}
1846 				}
1847 			}
1848 			for(Unit u : cluster){
1849 				ArrayList<Overlap> alo=u.overlapList;
1850 				int removed=0;
1851 				for(int i=0; i<alo.size(); i++){
1852 					Overlap o=alo.get(i);
1853 					if(o.invalid()){
1854 						assert(!o.mst());
1855 						alo.set(i, null);
1856 						removed++;
1857 					}else{
1858 						assert(o.mst());
1859 					}
1860 				}
1861 				if(removed>0){
1862 					Tools.condenseStrict(alo);
1863 					alo.trimToSize();
1864 				}
1865 			}
1866 		}
1867 
removeInvalid(PriorityQueue<Overlap> heap)1868 		private void removeInvalid(PriorityQueue<Overlap> heap){
1869 			ArrayList<Overlap> valid=new ArrayList<Overlap>(heap.size());
1870 			for(Overlap o : heap){
1871 				if(!o.invalid()){
1872 					assert(!o.u1.visited() || !o.u2.visited());
1873 					valid.add(o);
1874 				}
1875 			}
1876 			heap.clear();
1877 			heap.addAll(valid);
1878 		}
1879 
1880 
1881 		public long overlapsRemovedT=0;
1882 		public long overlapBasesRemovedT=0;
1883 		public long overlapsRetainedT=0;
1884 		public long overlapBasesRetainedT=0;
1885 
1886 		private final PriorityQueue<Overlap> heap=new PriorityQueue<Overlap>((1<<16)-1);
1887 		private ArrayList<ArrayList<Unit>> processedT=new ArrayList<ArrayList<Unit>>();
1888 	}
1889 
1890 
1891 	/**
1892 	 * Processes clustered sets of reads.
1893 	 * @author Brian Bushnell
1894 	 * @date Aug 9, 2013
1895 	 *
1896 	 */
1897 	private final class ClusterThread extends Thread{
1898 
ClusterThread(boolean fixMultiJoins_, boolean removeCycles_, boolean fixOffsetContradictions_, boolean mergeClusters_, boolean mergeLeaves_, boolean mergeInner_)1899 		public ClusterThread(boolean fixMultiJoins_, boolean removeCycles_,
1900 				boolean fixOffsetContradictions_, boolean mergeClusters_, boolean mergeLeaves_, boolean mergeInner_){
1901 			fixMultiJoinsT=fixMultiJoins_;
1902 			fixOffsetContradictionsT=fixOffsetContradictions_;
1903 			mergeClustersT=mergeClusters_;
1904 			mergeLeavesT=mergeLeaves_;
1905 			mergeInnerT=mergeInner_;
1906 
1907 //			assert(false) : fixMultiJoinsT+", "+canonicizeT+", "+fixCanonContradictionsT+", "+mergeLeavesT+", "+mergeInnerT;
1908 			bandy=(maxEdits>0 ? BandedAligner.makeBandedAligner(bandwidth) : null);
1909 //			assert(false) : fixMultiJoinsT+", "+canonicizeT+", "+fixCanonContradictionsT+", "+fixOffsetContradictionsT+", "+mergeClustersT+", "+removeCycles_;
1910 		}
1911 
1912 		@Override
run()1913 		public void run(){
1914 
1915 			final ArrayList<Unit> temp=new ArrayList<Unit>(1000);
1916 
1917 			ArrayList<Unit> cluster=null;
1918 			while((cluster=nextCluster())!=null){
1919 
1920 				if(EA){
1921 					for(Unit u : cluster){assert(u.r.mate==null) : "Cluster processing/merging is not supported for paired reads, only cluster generation.";}
1922 				}
1923 
1924 //				for(Unit u : cluster){assert(!u.visited());}
1925 				unvisit(cluster);
1926 
1927 				reorderClusterBreadthFirst(cluster);
1928 				int multiJoinCount=findMultiJoinsInCluster(cluster, fixMultiJoinsT);
1929 
1930 				if(EA){
1931 					for(Unit u : cluster){assert(!u.visited());}
1932 				}
1933 
1934 				boolean ok=true;
1935 				if(multiJoinCount!=0){
1936 					assert(multiJoinCount>0);
1937 					multiJoinsFoundT+=multiJoinCount;
1938 					if(!fixMultiJoinsT){
1939 						multiJoinFailuresT++;
1940 						ok=false;
1941 					}
1942 				}
1943 
1944 				int cycleOverlaps=0;
1945 				if(ok){
1946 					cycleOverlaps=findCycles(cluster, removeCycles);
1947 					for(Unit u : cluster){assert(!u.visited());}
1948 					if(cycleOverlaps>0){
1949 						cycleOverlapsT+=cycleOverlaps;
1950 						cycleClustersT++;
1951 					}
1952 				}
1953 
1954 				int offsetContradictions=0;
1955 				if(ok && fixOffsetContradictionsT){
1956 					if(EA){
1957 						for(Unit u : cluster){
1958 							assert(!u.visited());
1959 							assert(!u.offsetContradiction());
1960 							assert(!u.offsetValid());
1961 							if(u.overlapList!=null){
1962 								for(Overlap o : u.overlapList){
1963 									assert(!o.invalid());
1964 									assert(!o.offsetContradiction());
1965 									assert(o.type==FORWARD) : o;
1966 								}
1967 							}
1968 						}
1969 					}
1970 					offsetContradictions=generateOffsetsBreadthFirst(cluster, temp);
1971 //					System.err.println("Made offsets for cluster of size "+cluster.size()+"; contradictions = "+offsetContradictions+"; set = "+temp.size());
1972 					temp.clear();
1973 					for(Unit u : cluster){assert(!u.visited());}
1974 					if(offsetContradictions>0){
1975 						offsetContradictoryOverlapsT+=offsetContradictions;
1976 						offsetContradictoryClustersT++;
1977 						if(fixOffsetContradictionsT){
1978 							if(verbose){System.err.println("Pruning cluster to remove offset contradictions.");}
1979 							fullyPruneCluster(cluster, temp);
1980 							if(verbose){System.err.println("Resulting size: "+cluster.size());}
1981 							if(EA){
1982 								for(Unit u : cluster){
1983 									assert(!u.visited());
1984 									assert(!u.offsetContradiction());
1985 									assert(u.offsetValid());
1986 									if(u.overlapList!=null){
1987 										for(Overlap o : u.overlapList){
1988 											assert(!o.invalid());
1989 											assert(!o.offsetContradiction());
1990 											assert(o.type==FORWARD) : o;
1991 										}
1992 									}
1993 								}
1994 							}
1995 						}else{
1996 							ok=false;
1997 						}
1998 					}
1999 					if(ok){Shared.sort(cluster, UNIT_OFFSET_COMPARATOR);}
2000 				}
2001 
2002 				if(ok && absorbOverlap){
2003 					mergeCluster(cluster);
2004 				}
2005 
2006 				processedClustersT.add(cluster);
2007 				if(processedClustersT.size()>=threadMaxReadsToBuffer){
2008 					synchronized(processedClusters){
2009 						processedClusters.addAll(processedClustersT);
2010 						processedClustersT.clear();
2011 					}
2012 				}
2013 			}
2014 			synchronized(processedClusters){
2015 				processedClusters.addAll(processedClustersT);
2016 				processedClustersT.clear();
2017 			}
2018 		}
2019 
fullyPruneCluster(ArrayList<Unit> cluster, ArrayList<Unit> temp)2020 		private void fullyPruneCluster(ArrayList<Unit> cluster, ArrayList<Unit> temp){
2021 			assert(cluster.size()>1) : cluster.size();
2022 			ArrayList<Unit> pruned=pruneCluster(cluster, true, true, temp);
2023 			assert(temp.isEmpty());
2024 			assert(pruned==null || pruned.size()>0);
2025 			while(pruned!=null){
2026 				ArrayList<Unit> subcluster=pruned;
2027 				for(Unit u : subcluster){
2028 					u.clearVolatileFlags();
2029 					if(u.overlapList!=null){
2030 						for(Overlap o : u.overlapList){
2031 							o.clearVolatileFlags();
2032 						}
2033 					}
2034 				}
2035 				assert(subcluster.size()>0);
2036 				pruned=pruneCluster(subcluster, false, false, temp);
2037 				assert(temp.isEmpty());
2038 				assert(pruned==null || pruned.size()>0);
2039 				assert(subcluster.size()>0);
2040 				if(subcluster.size()==1){
2041 					processedClustersT.add(subcluster);
2042 				}else{
2043 					assert(subcluster.size()>1);
2044 					synchronized(clusterQueue){
2045 						clusterQueue.add(subcluster);
2046 					}
2047 				}
2048 			}
2049 		}
2050 
2051 		/**
2052 		 * @param cluster
2053 		 */
mergeCluster(ArrayList<Unit> cluster)2054 		private void mergeCluster(ArrayList<Unit> cluster) {
2055 			if(cluster.size()==1){return;}
2056 			if(mergeLeavesT){
2057 				mergeLeaves(cluster);
2058 			}
2059 			if(mergeInnerT){
2060 				mergeInner(cluster);
2061 			}
2062 		}
2063 
2064 		/**
2065 		 * Finds places in the cluster where two Units are joined by multiple different Overlaps.
2066 		 * Returns number of multijoins found.
2067 		 * @param cluster
2068 		 */
findMultiJoinsInCluster(ArrayList<Unit> cluster, boolean resolveProblems)2069 		private int findMultiJoinsInCluster(ArrayList<Unit> cluster, boolean resolveProblems) {
2070 			if(cluster.size()<2){return 0;}
2071 			int totalMultiJoins=0;
2072 			for(Unit ua : cluster){
2073 				ArrayList<Overlap> list=ua.overlapList;
2074 				assert(list!=null);
2075 				if(list.size()>1){
2076 					Shared.sort(list);
2077 
2078 					int multiJoins=0;
2079 					for(int i=0; i<list.size(); i++){
2080 						Overlap o=list.get(i);
2081 						Unit ub=(o.u1==ua ? o.u2 : o.u1);
2082 						assert(ua!=ub);
2083 						assert(ua==o.u1 || ua==o.u2);
2084 						if(ub.visited()){
2085 							multiJoins++;
2086 							multiJoinBasesFoundT+=o.overlapLen;
2087 							if(!o.multiJoin()){o.setMultiJoin(true);}
2088 							if(resolveProblems){list.set(i, null);}
2089 						}else{
2090 							ub.setVisited(true);
2091 						}
2092 					}
2093 
2094 					if(multiJoins>0){
2095 						totalMultiJoins+=multiJoins;
2096 						if(resolveProblems){Tools.condenseStrict(list);}
2097 					}
2098 
2099 					for(int i=0; i<list.size(); i++){
2100 						Overlap o=list.get(i);
2101 						Unit ub=(o.u1==ua ? o.u2 : o.u1);
2102 						assert(ua!=ub);
2103 						assert(ua==o.u1 || ua==o.u2);
2104 						assert(ub.visited());
2105 						ub.setVisited(false);
2106 					}
2107 				}
2108 
2109 			}
2110 
2111 			return totalMultiJoins;
2112 		}
2113 
pruneCluster(ArrayList<Unit> cluster, boolean pruneContradictoryNodes, boolean pruneContradictoryOverlaps, ArrayList<Unit> visited)2114 		private ArrayList<Unit> pruneCluster(ArrayList<Unit> cluster, boolean pruneContradictoryNodes, boolean pruneContradictoryOverlaps, ArrayList<Unit> visited){
2115 			if(verbose){System.err.println("pruneCluster(size="+cluster.size()+", "+pruneContradictoryNodes+", "+pruneContradictoryOverlaps+")");}
2116 
2117 			//pruneContradictoryOverlaps is less strict than pruneContradictoryNodes
2118 			assert(pruneContradictoryOverlaps || !pruneContradictoryNodes);
2119 
2120 			for(Unit ua : cluster){
2121 				assert(!ua.visited());
2122 				assert(ua.isPerfectlyTransitive()) : ua;
2123 				if(ua.visited()){ua.setVisited(false);}
2124 			}
2125 
2126 			int prunedOverlaps=0;
2127 			int visits=1;
2128 
2129 			{
2130 				final Unit root=cluster.get(0);
2131 				assert(!root.contradiction());
2132 				root.setVisited(true);
2133 				visited.add(root);
2134 			}
2135 
2136 			for(int i=0; i<visited.size(); i++){
2137 				Unit ua=visited.get(i);
2138 
2139 				if(ua.visited() && (!ua.contradiction() || !pruneContradictoryNodes)){
2140 					ArrayList<Overlap> list=ua.overlapList;
2141 					if(list!=null){
2142 						int removed=0;
2143 						for(int j=0; j<list.size(); j++){
2144 							Overlap o=list.get(j);
2145 							Unit ub=(o.u1==ua ? o.u2 : o.u1);
2146 							assert(o.u1==ua || o.u2==ua);
2147 							assert(ua!=ub);
2148 							assert(ub.valid());
2149 
2150 							assert(!o.offsetContradiction() || (ua.offsetContradiction() || ub.offsetContradiction())) :
2151 								"\n"+o.offsetContradiction()+", "+ua.offsetContradiction()+", "+ub.offsetContradiction();
2152 
2153 //							assert(o.contradiction()==(ua.contradiction() && ub.contradiction())) :
2154 //								"\n"+o.canonContradiction()+", "+o.offsetContradiction()+
2155 //								"\n"+ua.canonContradiction()+", "+ua.offsetContradiction()+
2156 //								"\n"+ub.canonContradiction()+", "+ub.offsetContradiction();
2157 
2158 							final boolean remove=(pruneContradictoryNodes && ub.contradiction() || (pruneContradictoryOverlaps && o.contradiction()));
2159 							if(!remove && !ub.visited()){
2160 								ub.setVisited(true);
2161 								visited.add(ub);
2162 								visits++;
2163 							}
2164 
2165 							if(remove){
2166 								if(!o.invalid()){o.setInvalid(true);}
2167 								list.set(j, null);
2168 								removed++;
2169 								prunedOverlaps++;
2170 							}else{
2171 								assert(!o.invalid());
2172 							}
2173 						}
2174 						if(removed>0){Tools.condenseStrict(list);}
2175 					}
2176 				}
2177 			}
2178 
2179 			if(verbose){System.err.println("cluster.size()="+cluster.size()+", visits="+visits+", visited.size()="+visited.size());}
2180 
2181 			final int numUnvisited=cluster.size()-visits;
2182 			ArrayList<Unit> pruned=(numUnvisited==0 ? null : new ArrayList<Unit>(numUnvisited));
2183 			assert(visits==visited.size());
2184 			assert(visits>=1 && visits<=cluster.size());
2185 
2186 			if(visits<cluster.size()){
2187 				pruned=new ArrayList<Unit>(cluster.size()-visits);
2188 				for(Unit ua : cluster){
2189 					if(!ua.visited()){
2190 						pruned.add(ua);
2191 						ArrayList<Overlap> list=ua.overlapList;
2192 						if(list!=null){
2193 							int removed=0;
2194 							for(int j=0; j<list.size(); j++){
2195 								Overlap o=list.get(j);
2196 								Unit ub=(o.u1==ua ? o.u2 : o.u1);
2197 								assert(o.u1==ua || o.u2==ua);
2198 								assert(ua!=ub);
2199 								assert(ub.valid());
2200 
2201 								if(ub.visited() || o.invalid()){
2202 									assert(ub.visited()==o.invalid()) : "\n"+o+"\n"+ub;
2203 									list.set(j, null);
2204 									removed++;
2205 								}
2206 							}
2207 							if(removed>0){Tools.condenseStrict(list);}
2208 						}
2209 					}
2210 				}
2211 				assert(pruned.size()==numUnvisited);
2212 			}else{
2213 				assert(prunedOverlaps==0) : "If this fails then I may need to mark overlaps to remove.";
2214 			}
2215 			for(Unit u : cluster){
2216 				assert(u.isPerfectlyTransitive()) : u;
2217 				if(EA){
2218 					if(u.overlapList!=null){
2219 						for(Overlap o : u.overlapList){assert(!o.invalid());}
2220 					}
2221 				}
2222 				if(u.visited()){u.setVisited(false);}
2223 			}
2224 			cluster.clear();
2225 			cluster.addAll(visited);
2226 			cluster.trimToSize();
2227 
2228 //			for(Unit u : cluster){
2229 ////				assert(u.canonicized());
2230 //				for(Overlap o : u.overlapList){
2231 //					assert(pruned==null || !pruned.contains(o.u1));
2232 //					assert(pruned==null || !pruned.contains(o.u2));
2233 //					assert(cluster.contains(o.u1));
2234 //					assert(cluster.contains(o.u2));
2235 //				}
2236 //			}
2237 //			if(pruned!=null){
2238 //				for(Unit u : pruned){
2239 //					for(Overlap o : u.overlapList){
2240 //						assert(pruned.contains(o.u1));
2241 //						assert(pruned.contains(o.u2));
2242 //						assert(!cluster.contains(o.u1));
2243 //						assert(!cluster.contains(o.u2));
2244 //					}
2245 //				}
2246 //			}
2247 
2248 			visited.clear();
2249 			return pruned;
2250 		}
2251 
2252 		/**
2253 		 * Cluster should already be ordered breadth-first
2254 		 * This may fail because removing cycles could change breadth-first traversal, but if it fails, an assertion will be thrown
2255 		 * @param cluster
2256 		 */
findCycles(ArrayList<Unit> cluster, boolean remove)2257 		private int findCycles(ArrayList<Unit> cluster, boolean remove){
2258 
2259 			{
2260 				final Unit root=cluster.get(0);
2261 				assert(root.length()>=cluster.get(cluster.size()-1).length());
2262 				root.setVisited(true);
2263 			}
2264 			int cycles=0;
2265 
2266 			for(Unit ua : cluster){
2267 				assert(ua.visited());
2268 				ArrayList<Overlap> list=ua.overlapList;
2269 				if(list!=null){
2270 					int removed=0;
2271 					for(int i=0; i<list.size(); i++){
2272 						Overlap o=list.get(i);
2273 						Unit ub=(o.u1==ua ? o.u2 : o.u1);
2274 						assert(o.u1==ua || o.u2==ua);
2275 						assert(ua!=ub);
2276 						assert(ub.valid());
2277 
2278 						if(!o.visited()){
2279 							o.setVisited(true);
2280 							if(ub.visited()){
2281 								if(!o.cyclic()){
2282 									o.setCyclic(true);
2283 									cycles++;
2284 								}
2285 							}else{
2286 								ub.setVisited(true);
2287 							}
2288 						}
2289 						if(remove && o.cyclic()){
2290 							list.set(i, null);
2291 							removed++;
2292 						}
2293 					}
2294 					if(removed>0){Tools.condenseStrict(list);}
2295 				}
2296 			}
2297 
2298 			for(Unit u : cluster){
2299 				if(u.visited()){u.setVisited(false);}
2300 				if(u.overlapList!=null){
2301 					for(Overlap o : u.overlapList){
2302 						if(o.visited()){o.setVisited(false);}
2303 					}
2304 				}
2305 			}
2306 
2307 			return cycles;
2308 		}
2309 
2310 		/**
2311 		 * Cluster should already be ordered breadth-first
2312 		 * @param cluster
2313 		 */
generateOffsetsBreadthFirst(ArrayList<Unit> cluster, ArrayList<Unit> temp)2314 		private int generateOffsetsBreadthFirst(ArrayList<Unit> cluster, ArrayList<Unit> temp){
2315 
2316 
2317 			assert(temp!=null);
2318 			assert(temp.isEmpty());
2319 			{
2320 				final Unit root=cluster.get(0);
2321 				assert(root.length()>=cluster.get(cluster.size()-1).length());
2322 				root.setOffset(0);
2323 				temp.add(root);
2324 			}
2325 
2326 			int contradictions=0;
2327 			for(int i=0; i<temp.size(); i++){
2328 				Unit u=temp.get(i);
2329 				assert(!u.visited()) : i;
2330 				assert(u.offsetValid() || contradictions>0) : i+", "+temp.size()+", "+contradictions+"\n"+toString(temp);
2331 				if(u.offsetValid() && !u.offsetContradiction()){
2332 					contradictions+=setOffsetsNeighbors(u, temp);
2333 					assert(contradictions==0 || (i>0 && temp.size()>2));
2334 				}
2335 			}
2336 
2337 			int min=0;
2338 			for(Unit u : temp){
2339 				if(u.visited()){u.setVisited(false);}
2340 				if(u.overlapList!=null){
2341 					for(Overlap o : u.overlapList){
2342 						if(o.visited()){o.setVisited(false);}
2343 					}
2344 				}
2345 				if(u.offsetValid() && !u.offsetContradiction()){
2346 					min=Tools.min(min, u.offset());
2347 				}
2348 			}
2349 
2350 			if(verbose){
2351 				System.err.println("min offset = "+min);
2352 			}
2353 
2354 			for(Unit u : temp){
2355 				if(u.offsetValid()){
2356 					if(verbose){System.err.println("Set "+u.name()+" offset from "+u.offset+" to "+(u.offset-min));}
2357 					u.offset=u.offset-min;
2358 				}
2359 			}
2360 
2361 
2362 			return contradictions;
2363 		}
2364 
2365 		/**
2366 		 * @param root
2367 		 */
setOffsetsNeighbors(final Unit root, final ArrayList<Unit> temp)2368 		private int setOffsetsNeighbors(final Unit root, final ArrayList<Unit> temp) {
2369 			if(verbose){System.err.println("\nsetOffsetsNeighbors("+root.name()+")\nroot.code1="+root.code1+"\n");}
2370 			assert(root.valid());
2371 			assert(!root.visited());
2372 			assert(root.offsetValid());
2373 			assert(!root.offsetContradiction());
2374 			root.setVisited(true);
2375 			if(root.overlapList==null){return 0;}
2376 			final int contradictions=countOffsetContradictions(root, false);
2377 			if(verbose){System.err.println("\ncontradictions="+contradictions);}
2378 			for(Overlap o : root.overlapList){
2379 				Unit u=(o.u1==root ? o.u2 : o.u1);
2380 				assert(o.u1==root || o.u2==root);
2381 				assert(root!=u);
2382 				assert(u.valid());
2383 
2384 				if(verbose){System.err.println("\nProcessing Overlap "+o);}
2385 				if(!o.visited() && !o.offsetContradiction()){
2386 					o.setVisited(true);
2387 					if(!u.offsetContradiction()){
2388 						if(verbose){System.err.println("Calling setOffset:  "+o);}
2389 						if(!u.offsetValid()){temp.add(u);}
2390 						boolean b=setOffset(root, u, o);
2391 						if(verbose){System.err.println("Finished setOffset: "+o);}
2392 
2393 //						if(x>0){
2394 //							if(verbose){System.err.println("\n*********************************************");}
2395 //							if(verbose){System.err.println("Problem detected with contig "+u.name());}
2396 //							if(verbose){System.err.println("*********************************************\n");}
2397 //							verbose=true;
2398 //							int y2=countOffsetContradictions(root, false);
2399 //							assert(contradictions==y2);
2400 //						}
2401 
2402 						assert(b) : "\n"+contradictions+", "+o.offsetContradiction()+", "+root.offsetContradiction()+", "+u.offsetContradiction()+"\n"
2403 							+root.offsetValid()+", "+u.offsetValid()+", "+OVERLAP_TYPE_NAMES[o.type]+"\n"+b
2404 							+fixMultiJoins; //This assertion can fail if a multijoin is present
2405 						assert(u.offsetValid());
2406 					}
2407 				}
2408 			}
2409 			return contradictions;
2410 		}
2411 
countOffsetContradictions(Unit root, boolean includeKnown)2412 		private int countOffsetContradictions(Unit root, boolean includeKnown){
2413 			if(verbose){System.err.println("\ncountContradictions("+root.name()+", "+includeKnown+")\nroot.code1="+root.code1+"\n");}
2414 			assert(root.valid());
2415 			assert(root.visited());
2416 			assert(root.offsetValid());
2417 //			assert(!root.offsetContradiction());
2418 			if(root.overlapList==null){return 0;}
2419 			int contradictions=0;
2420 			for(Overlap o : root.overlapList){
2421 				Unit u=(o.u1==root ? o.u2 : o.u1);
2422 				assert(o.u1==root || o.u2==root);
2423 				assert(root!=u);
2424 				assert(u.valid());
2425 
2426 				if(verbose){System.err.println("\nOverlap "+o+"\nu="+u.name()+", offsetValid="+u.offsetValid());}
2427 
2428 				boolean contradictory=(u.offsetValid() && u.offset()!=calcOffset(root, u, o));
2429 				if(verbose){System.err.println("contradictory=            \t"+contradictory);}
2430 				if(contradictory){
2431 					if(includeKnown || !u.offsetContradiction()){
2432 						contradictions++;
2433 						if(!root.offsetContradiction()){root.setOffsetContradiction(true);}
2434 					}
2435 					if(!o.offsetContradiction()){o.setOffsetContradiction(true);}
2436 					if(!u.offsetContradiction()){u.setOffsetContradiction(true);}
2437 				}
2438 				assert(contradictory==o.offsetContradiction()) : contradictory+", "+o.offsetContradiction();
2439 				if(verbose){
2440 					System.err.println("root.offsetContradiction()=\t"+root.offsetContradiction());
2441 					System.err.println("u.offsetContradiction()=   \t"+u.offsetContradiction());
2442 					System.err.println("o.offsetContradiction()=   \t"+o.offsetContradiction());
2443 					System.err.println("contradictions=           \t"+contradictions);
2444 				}
2445 			}
2446 			if(verbose){System.err.println("Final contradictions="+contradictions+"\n");}
2447 			return contradictions;
2448 		}
2449 
toString(ArrayList<Unit> cluster)2450 		private String toString(ArrayList<Unit> cluster){
2451 			for(int i=0; i<cluster.size(); i++){
2452 				Unit u=cluster.get(i);
2453 				u.r.id=""+i;
2454 			}
2455 			StringBuilder sb=new StringBuilder(1000);
2456 			for(Unit u : cluster){
2457 				sb.append(">"+u.name()+"\n");
2458 				sb.append(new String(u.bases()));
2459 				sb.append("\n");
2460 			}
2461 			sb.append("\n*****\n");
2462 			for(Unit u : cluster){
2463 				sb.append("\n"+u.name()+":");
2464 				if(u.overlapList!=null){
2465 					for(Overlap o : u.overlapList){
2466 						Unit ub=(o.u1==u ? o.u2 : o.u1);
2467 						sb.append(" "+ub.name());
2468 					}
2469 				}
2470 			}
2471 			sb.append("\n");
2472 			return sb.toString();
2473 		}
2474 
toShortString(ArrayList<Unit> cluster)2475 		private String toShortString(ArrayList<Unit> cluster){
2476 			for(int i=0; i<cluster.size(); i++){
2477 				Unit u=cluster.get(i);
2478 				u.r.id=""+i;
2479 			}
2480 			StringBuilder sb=new StringBuilder(1000);
2481 			for(Unit u : cluster){
2482 				sb.append("\n"+u.name()+":");
2483 				if(u.overlapList!=null){
2484 					for(Overlap o : u.overlapList){
2485 						Unit ub=(o.u1==u ? o.u2 : o.u1);
2486 						sb.append(" "+ub.name());
2487 					}
2488 				}
2489 			}
2490 			sb.append("\n");
2491 			return sb.toString();
2492 		}
2493 
2494 
2495 		/**
2496 		 * @param root
2497 		 * @param u2
2498 		 * @param o
2499 		 * @return Number of contradictions
2500 		 */
canonicize(final Unit root, final Unit u2, final Overlap o)2501 		private int canonicize(final Unit root, final Unit u2, final Overlap o){
2502 			if(o.type==FORWARD){return 0;}
2503 			if(o.type==REVERSE){o.reverseDirection();}
2504 			assert(o.type==FORWARD);
2505 			assert(o.test(bandy, o.edits+maxEdits));
2506 			return 0;
2507 		}
2508 
2509 
2510 		/**
2511 		 * @param root
2512 		 * @param u2
2513 		 * @param o
2514 		 * @return true if no contradictions
2515 		 */
setOffset(final Unit root, final Unit u2, final Overlap o)2516 		private boolean setOffset(final Unit root, final Unit u2, final Overlap o){
2517 			assert(root.offsetValid());
2518 			assert(!root.offsetContradiction());
2519 			int offset=calcOffset(root, u2, o);
2520 
2521 			if(u2.offsetValid()){return u2.offset()==offset;}
2522 			u2.setOffset(offset);
2523 
2524 			if(verbose){
2525 				System.err.println("\nroot = "+(root.name()==null ? root.r.numericID+"" : root.name())+", u2 = "+(u2.name()==null ? u2.r.numericID+"" : u2.name())
2526 						+"\no = "+o
2527 						+"\nroot.offset = "+root.offset()
2528 						+"\nu2.offset = "+u2.offset());
2529 			}
2530 
2531 			return true;
2532 		}
2533 
2534 
calcOffset(final Unit root, final Unit ub, final Overlap o)2535 		private int calcOffset(final Unit root, final Unit ub, final Overlap o){
2536 			assert(root.offsetValid());
2537 			if(o.type==FORWARD){
2538 				if(root==o.u1){
2539 					int dif=o.start1-o.start2;
2540 					if(verbose){System.err.println("root==o.u1=="+root.name()+", start1="+o.start1+"; u2==o.u2=="+ub.name()+", start2="+o.start2+", dif="+dif);}
2541 					return root.offset+dif;
2542 				}else{
2543 					int dif=o.start2-o.start1;
2544 					if(verbose){System.err.println("root==o.u2=="+root.name()+", start2="+o.start2+"; u2==o.u1=="+ub.name()+", start1="+o.start1+", dif="+dif);}
2545 					return root.offset+dif;
2546 				}
2547 			}else{
2548 				assert(false) : o;
2549 				throw new RuntimeException("TODO");
2550 			}
2551 		}
2552 
2553 
2554 		/**
2555 		 * @param cluster
2556 		 */
mergeLeaves(ArrayList<Unit> cluster)2557 		private void mergeLeaves(ArrayList<Unit> cluster) {
2558 			assert(false) : "TODO";
2559 			for(Unit u : cluster){
2560 
2561 			}
2562 		}
2563 
2564 		/**
2565 		 * @param cluster
2566 		 */
mergeInner(ArrayList<Unit> cluster)2567 		private void mergeInner(ArrayList<Unit> cluster) {
2568 			assert(false) : "TODO";
2569 			for(Unit u : cluster){
2570 
2571 			}
2572 		}
2573 
2574 		private ArrayList<ArrayList<Unit>> processedClustersT=new ArrayList<ArrayList<Unit>>(threadMaxReadsToBuffer);
2575 
2576 		long leafMergesT=0;
2577 		long innerMergesT=0;
2578 		long leafBaseMergesT=0;
2579 		long innerBaseMergesT=0;
2580 
2581 		long multiJoinFailuresT=0;
2582 		long multiJoinsFoundT=0;
2583 		long multiJoinBasesFoundT=0;
2584 		long unitsFlippedT=0;
2585 		long overlapsFlippedT=0;
2586 		long offsetContradictoryOverlapsT=0;
2587 		long offsetContradictoryClustersT=0;
2588 		long cycleOverlapsT=0;
2589 		long cycleClustersT=0;
2590 
2591 		private final boolean fixMultiJoinsT;
2592 		private final boolean fixOffsetContradictionsT;
2593 		private final boolean mergeClustersT;
2594 		private final boolean mergeLeavesT;
2595 		private final boolean mergeInnerT;
2596 		private final BandedAligner bandy;
2597 	}
2598 
2599 
2600 	/**
2601 	 * @param cluster
2602 	 */
unvisit(ArrayList<Unit> cluster)2603 	private void unvisit(ArrayList<Unit> cluster) {
2604 		for(Unit u : cluster){
2605 			if(u.visited()){u.setVisited(false);}
2606 		}
2607 	}
2608 
2609 	/**
2610 	 * @param cluster
2611 	 */
reorderClusterBreadthFirst(ArrayList<Unit> cluster)2612 	private void reorderClusterBreadthFirst(ArrayList<Unit> cluster) {
2613 		if(verbose){System.err.println("reorderClusterBreadthFirst");}
2614 
2615 		final int size=cluster.size();
2616 		Shared.sort(cluster); //Now it is in descending length
2617 		final Unit root=cluster.get(0);
2618 		assert(root.length()>=cluster.get(size-1).length()) : root.length()+", "+cluster.get(size-1).length()+", "+root.compareTo(cluster.get(size-1));
2619 
2620 		ArrayList<Unit> breadthFirst=new ArrayList<Unit>(cluster.size());
2621 		root.setVisited(true);
2622 //		System.err.println("root="+root.name());
2623 		breadthFirst.add(root);
2624 		for(int i=0; i<breadthFirst.size(); i++){
2625 			Unit u=breadthFirst.get(i);
2626 			Shared.sort(u.overlapList); //Sorted in descending overlap length
2627 			if(u.overlapList!=null){
2628 				for(Overlap o : u.overlapList){
2629 					if(!o.u1.visited()){
2630 						//					System.err.println("Visiting "+o.u1.name());
2631 						o.u1.setVisited(true);
2632 						breadthFirst.add(o.u1);
2633 					}
2634 					if(!o.u2.visited()){
2635 						//					System.err.println("Visiting "+o.u2.name());
2636 						o.u2.setVisited(true);
2637 						breadthFirst.add(o.u2);
2638 					}
2639 					//				System.err.println("***");
2640 					//				System.err.println(toShortString(breadthFirst));
2641 				}
2642 			}
2643 		}
2644 		for(Unit u : cluster){
2645 			assert(u.visited());
2646 			if(u.visited()){u.setVisited(false);}
2647 			if(EA){
2648 				if(u.overlapList!=null){
2649 					for(Overlap o : u.overlapList){assert(!o.visited());}
2650 				}
2651 			}
2652 		}
2653 //		System.err.println("***");
2654 //		System.err.println("Final:");
2655 //		System.err.println(toShortString(breadthFirst));
2656 		assert(cluster.size()==breadthFirst.size());
2657 		cluster.clear();
2658 		cluster.addAll(breadthFirst);
2659 	}
2660 
2661 
2662 
2663 	/** Returns next cluster larger than 1 element.
2664 	 * Singleton clusters are added directly to 'processed'. */
nextCluster()2665 	private ArrayList<Unit> nextCluster(){
2666 		synchronized(clusterQueue){
2667 			ArrayList<Unit> cluster=clusterQueue.poll();
2668 			assert(cluster==null || cluster.size()>1);
2669 //			while(cluster!=null && cluster.size()<2){
2670 ////				unmark(cluster);
2671 //				processedClustersT.add(cluster);
2672 //				cluster=clusterQueue.poll();
2673 //			}
2674 			return cluster;
2675 		}
2676 	}
2677 
2678 
2679 	/**
2680 	 * Creates Unit objects or uses ones already attached to reads.
2681 	 * Places them in local storage and percolates them to shared storage (codeMap), removing exact duplicates.
2682 	 * Also hashes tips and places these in shared affixMap.
2683 	 * Looks for containments in the affix map.
2684 	 * @author Brian Bushnell
2685 	 * @date Jul 24, 2013
2686 	 *
2687 	 */
2688 	private final class HashThread extends Thread{
2689 
HashThread(boolean addToCodeMap_, boolean addToAffixMap_, boolean findMatches_, boolean findContainments_, boolean findOverlaps_)2690 		public HashThread(boolean addToCodeMap_, boolean addToAffixMap_, boolean findMatches_, boolean findContainments_, boolean findOverlaps_){
2691 			addToCodeMapT=addToCodeMap_;
2692 			addToAffixMapT=addToAffixMap_;
2693 			findContainmentsT=findContainments_;
2694 			findOverlapsT=findOverlaps_;
2695 			findMatchesT=findMatches_;
2696 			tid=getTid();
2697 			crisq=new ArrayDeque<ConcurrentReadInputStream>(crisa.length);
2698 			for(int i=0; i<crisa.length; i++){
2699 //				if(verbose){System.err.println("Adding to crisq.");}
2700 				crisq.add(crisa[(i+tid)%crisa.length]);
2701 			}
2702 			bandy=(maxEdits>0 && (findOverlapsT || findContainmentsT) ? BandedAligner.makeBandedAligner(bandwidth) : null);
2703 
2704 //			assert(addToCodeMapT) : "addToCodeMapT="+addToCodeMapT+", addToAffixMapT="+addToAffixMapT+", findContainmentsT="+findContainmentsT+
2705 //			", findOverlapsT="+findOverlapsT+", findMatchesT="+findMatchesT+", convertToUpperCaseT="+convertToUpperCaseT+", numAffixMaps="+numAffixMaps;
2706 		}
2707 
2708 		@Override
run()2709 		public void run(){
2710 
2711 			ConcurrentReadInputStream cris=crisq.poll();
2712 
2713 //			String lastName=null;
2714 
2715 			while(cris!=null){
2716 				ListNum<Read> ln=cris.nextList();
2717 				ArrayList<Read> reads=(ln!=null ? ln.list : null);
2718 
2719 				while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning
2720 
2721 					for(Read r : reads){
2722 //						lastName=r.id;
2723 						processReadOuter(r);
2724 					}
2725 
2726 					if(codeMapT!=null && (codeMapT.size()>threadMaxReadsToBuffer || basesStoredT>threadMaxBasesToBuffer)){
2727 						assert(addToCodeMapT);
2728 						long added=mergeMaps();
2729 						addedToMainT+=added;
2730 					}
2731 
2732 					cris.returnList(ln);
2733 					ln=cris.nextList();
2734 					reads=(ln!=null ? ln.list : null);
2735 				}
2736 				cris.returnList(ln);
2737 				if(codeMapT!=null && !codeMapT.isEmpty()){
2738 					long added=mergeMaps();
2739 					addedToMainT+=added;
2740 				}
2741 				cris=crisq.poll();
2742 			}
2743 
2744 //			System.err.println("Thread finished; last name = "+lastName);
2745 
2746 			codeMapT=null;
2747 			localConflictList=null;
2748 			sharedConflictList=null;
2749 		}
2750 
2751 		/** Return true if this read was a member of this subset. */
processReadOuter(Read r1)2752 		private boolean processReadOuter(Read r1){
2753 			if(r1.length()<MINSCAF){return false;}
2754 			Read r2=r1.mate;
2755 
2756 			assert(r1.pairnum()==0);
2757 			assert(r2==null || r2.pairnum()==1);
2758 
2759 			if(!addToCodeMapT && r1.obj==null){
2760 				if(r1.bases!=null && r1.length()>=MINSCAF){
2761 					final Unit u=(r1.obj!=null ? (Unit)r1.obj : new Unit(r1));
2762 					assert(u.r==r1 && (r1.obj==u || r1.obj==null));
2763 					final long code=u.code1;
2764 					r1.obj=u;
2765 					assert(u.r==r1 && r1.obj==u);
2766 					if(r2!=null && r2.obj==null){r2.obj=new Unit(r2);}
2767 
2768 					//Check for subset membership
2769 					final boolean inSet=u.inSet();
2770 					if(inSet){
2771 						final Long codeL=code;
2772 						ArrayList<Unit> list=codeMap.get(codeL);
2773 						boolean found=false;
2774 						for(Unit u0 : list){
2775 							//Replace with existing read
2776 							if(u0.equals(u) && u0.r.numericID==r1.numericID){
2777 								r1=u0.r;
2778 								r2=r1.mate;
2779 								found=true;
2780 								break;
2781 							}
2782 						}
2783 						assert(list!=null);
2784 						if(!found){
2785 							return false;
2786 						}
2787 					}
2788 				}
2789 			}
2790 			boolean b=processRead(r1);
2791 			if(r2!=null){processRead(r2);}
2792 			return b;
2793 		}
2794 
2795 		/** Return true if this read was a member of this subset. */
processRead(Read r)2796 		private boolean processRead(Read r){
2797 			if(r.length()<MINSCAF){return false;}
2798 
2799 			final boolean inSet;
2800 			if(!storeName){r.id=null;}
2801 			if(!storeQuality){r.quality=null;}
2802 
2803 			if(forceTrimLeft>0 || forceTrimRight>0){//Added at request of RQC team
2804 				if(r!=null && r.length()>0){
2805 					TrimRead.trimToPosition(r, forceTrimLeft>0 ? forceTrimLeft : 0, forceTrimRight>0 ? forceTrimRight : r.length(), 1);
2806 				}
2807 			}
2808 			if(qTrimLeft || qTrimRight){
2809 				TrimRead.trimFast(r, qTrimLeft, qTrimRight, trimQ, trimE, 0);
2810 			}
2811 			if(r.length()<MINSCAF){return false;}
2812 
2813 			readsProcessedT++;
2814 			basesProcessedT+=r.length();
2815 
2816 			final Unit u=(r.obj!=null ? (Unit)r.obj : new Unit(r));
2817 			assert(u.r==r && (r.obj==u || r.obj==null));
2818 			final long code=u.code1;
2819 
2820 			//Check for subset membership
2821 			inSet=u.inSet();
2822 
2823 			r.obj=u;
2824 			assert(u.r==r && r.obj==u);
2825 			if(r.mate!=null && r.mate.obj==null){r.mate.obj=new Unit(r.mate);}
2826 
2827 			if(verbose){System.err.println("Generated "+code+" for sequence "+u.name()+"\t"+new String(r.bases, 0, Tools.min(40, r.length())));}
2828 
2829 			if(addToCodeMapT && inSet){
2830 				final Long codeL=code;
2831 				ArrayList<Unit> list=codeMapT.get(codeL);
2832 				if(list==null){
2833 					if(verbose){System.err.println("Unique.");}
2834 					list=new ArrayList<Unit>(1);
2835 					list.add(u);
2836 					basesStoredT+=r.length();
2837 					codeMapT.put(codeL, list);
2838 				}else{
2839 					if(verbose){System.err.println("Exists.");}
2840 					boolean match=false;
2841 					if(findMatchesT){
2842 						for(Unit u2 : list){
2843 							if(pairedEqualsRC(u, u2)){
2844 //								if(u.r.mate!=null){
2845 //									verbose=true;
2846 //
2847 //									Unit um=(Unit)u.r.mate.obj;
2848 //									Unit u2m=(Unit)u2.r.mate.obj;
2849 //
2850 //									if(verbose){
2851 //										System.err.println("********");
2852 //										System.err.println(u.r.toFastq());
2853 //										System.err.println(u.r.mate.toFastq());
2854 //										System.err.println("********");
2855 //										System.err.println(u2.r.toFastq());
2856 //										System.err.println(u2.r.mate.toFastq());
2857 //										System.err.println("********");
2858 //										System.err.println(u);
2859 //										System.err.println(u2);
2860 //										System.err.println(um);
2861 //										System.err.println(u2m);
2862 //										System.err.println("********");
2863 //										System.err.println(u.equals(u2));
2864 //										System.err.println(u.compareTo(u2));
2865 //										System.err.println("********");
2866 //										System.err.println(um.equals(u2m));
2867 //										System.err.println(um.compareTo(u2m));
2868 //										System.err.println("********");
2869 //									}
2870 //
2871 //									verbose=false;
2872 //								}
2873 								assert(u.r.mate==null || pairedEqualsRC((Unit)u.r.mate.obj, (Unit)u2.r.mate.obj)) :
2874 									u.r.toFastq()+"\n"+u2.r.toFastq()+"\n"+u.r.mate.toFastq()+"\n"+u2.r.mate.toFastq()+
2875 									"\n"+u+"\n"+u2+"\n"+u.r.mate.obj+"\n"+u2.r.mate.obj;
2876 //								if(verbose){System.err.println("Matches "+new String(r2.bases, 0, Tools.min(40, r2.length())));}
2877 								match=true;
2878 								u2.absorbMatch(u);
2879 								if(UNIQUE_ONLY){
2880 									synchronized(u2){
2881 										if(u2.valid()){
2882 											matchesT++;
2883 											baseMatchesT+=u2.length();
2884 											u2.setValid(false);
2885 											addDupe(u2.r);
2886 										}
2887 									}
2888 								}
2889 								break;
2890 							}
2891 						}
2892 					}
2893 					if(match){
2894 						addDupe(r);
2895 						matchesT++;
2896 						baseMatchesT+=r.length();
2897 						//							if(verbose){System.err.println("matchesT="+matchesT+", baseMatchesT="+baseMatchesT);}
2898 					}else{
2899 						collisionsT++;
2900 						if(verbose){System.err.println("False collision; count = "+collisionsT);}
2901 						list.add(u);
2902 						basesStoredT+=r.length();
2903 					}
2904 				}
2905 			}
2906 
2907 			if(findContainmentsT){
2908 				int x=findContainments(u);
2909 			}
2910 
2911 			if(findOverlapsT){
2912 				int x=findOverlaps(u);
2913 			}
2914 
2915 			return inSet;
2916 		}
2917 
findContainments(final Unit u)2918 		private int findContainments(final Unit u){
2919 			if(minLengthPercent<=0 && maxSubs<=0 && minIdentity>=100 && !u.valid()){return 0;}
2920 			final byte[] bases=u.bases();
2921 			final int shift=bitsPerBase*k;
2922 			final int shift2=shift-bitsPerBase;
2923 			final long mask=(shift>63 ? -1L : ~((-1L)<<shift));
2924 			long kmer=0;
2925 			int hits=0;
2926 			int currentContainments=0;
2927 			int len=0;
2928 
2929 			if(bases==null || bases.length<k){return -1;}
2930 			final LongM key=new LongM();
2931 
2932 			for(int i=0; i<bases.length; i++){
2933 				byte b=bases[i];
2934 				long x=acidToNumber[b];
2935 				kmer=((kmer<<bitsPerBase)|x)&mask;
2936 				if(HASH_NS || isFullyDefined(b)){len++;}
2937 				else{len=0;}
2938 //				if(verbose){System.err.println("Scanning i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
2939 				if(len>=k){
2940 					key.set(kmer); //Canonical
2941 					for(int am=0; am<affixMaps.length; am++){
2942 						ArrayList<Unit> list=affixMaps[am].get(key);
2943 						if(list!=null){
2944 							for(Unit u2 : list){
2945 								if(u!=u2 && !u.equals(u2)){
2946 									if(u2.valid()){
2947 										hits++;
2948 										if(verbose){
2949 											System.err.println("\nFound potential containment at am="+am+", i="+i+", key="+key.value()+
2950 													", pre1="+u2.prefix1+", pre2="+u2.prefix2+
2951 													", suf1="+u2.suffix1+", suf2="+u2.suffix2+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i, k)));
2952 										}
2953 										if(u.contains(u2, i, key, bandy, am)){
2954 											synchronized(u2){
2955 												if(u2.valid()){
2956 													currentContainments++;
2957 													baseContainmentsT+=u2.length();
2958 													u2.setValid(false);
2959 													addDupe(u2.r);
2960 												}
2961 											}
2962 											if(UNIQUE_ONLY){
2963 												synchronized(u){
2964 													if(u.valid()){
2965 														currentContainments++;
2966 														baseContainmentsT+=u.length();
2967 														u.setValid(false);
2968 														addDupe(u.r);
2969 													}
2970 												}
2971 											}
2972 
2973 											if(verbose){System.err.println("Added containment "+u2);}
2974 										}
2975 									}
2976 								}
2977 							}
2978 						}
2979 					}
2980 				}
2981 			}
2982 //			assert(false) : hits+", "+currentContainments+", "+baseContainments+"\n"+containmentMapT+"\n";
2983 
2984 			containmentCollisionsT+=(hits-currentContainments);
2985 //			outstream.println("hits="+hits+", currentContainments="+currentContainments);
2986 			containmentsT+=currentContainments;
2987 			return hits;
2988 		}
2989 
findOverlaps(final Unit u)2990 		private int findOverlaps(final Unit u){
2991 //			if(minLengthPercent<=0 && maxSubs<=0 && minIdentity>=100 && !u.valid()){return 0;}
2992 //			if(u.overlapList!=null){u.overlapList.clear();}
2993 			final byte[] bases=u.bases();
2994 			final int shift=bitsPerBase*k;
2995 			final int shift2=shift-bitsPerBase;
2996 			final long mask=(shift>63 ? -1L : ~((-1L)<<shift));
2997 			long kmer=0;
2998 			int hits=0;
2999 			int currentOverlaps=0;
3000 			int len=0;
3001 
3002 			if(bases==null || bases.length<k){return -1;}
3003 			final LongM key=new LongM();
3004 
3005 			boolean quit=false;
3006 
3007 			for(int i=0; i<bases.length && !quit; i++){
3008 				byte b=bases[i];
3009 				long x=acidToNumber[b];
3010 				kmer=((kmer<<bitsPerBase)|x)&mask;
3011 				if(HASH_NS || isFullyDefined(b)){len++;}
3012 				else{len=0;}
3013 //				if(verbose){System.err.println("Scanning i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));}
3014 				if(len>=k){
3015 					key.set(kmer); //Canonical key
3016 					for(int am=0; am<affixMaps.length; am++){
3017 						ArrayList<Unit> list=affixMaps[am].get(key);
3018 						if(list!=null){//found a key collision
3019 							for(Unit u2 : list){
3020 								if(quit){break;}//too many edges
3021 								int u1cluster=-1, u2cluster=-2;
3022 								if(preventTransitiveOverlaps && u!=u2){
3023 									u1cluster=u.determineCluster();
3024 									u2cluster=u2.determineCluster();
3025 								}
3026 								if(u1cluster!=u2cluster && u!=u2 && !u.equals(u2) && u2.r!=u.r.mate){//TODO:  Not sure why identical things are banned...  possibly relates to avoiding inter-pair edges?
3027 									if(u2.valid()){
3028 										hits++;
3029 
3030 //										boolean flag=(u.code1==-3676200394282040623L && u2.code1==-7034423913727372751L) ||
3031 //												(u2.code1==-3676200394282040623L && u.code1==-7034423913727372751L);
3032 										final boolean flag=false;
3033 										if(verbose || flag){
3034 											System.err.println("\nFound potential overlap at am="+am+", i="+i+", key="+key.value()+
3035 													", pre1="+u2.prefix1+", pre2="+u2.prefix2+
3036 													", suf1="+u2.suffix1+", suf2="+u2.suffix2+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i, k)));
3037 										}
3038 
3039 										final Overlap o;
3040 										if(maxEdges>1000000000 || u.overlapList==null || u2.overlapList==null ||
3041 												(u.overlapList.size()<maxEdges && u2.overlapList.size()<maxEdges2)){
3042 											 o=u.makeOverlap(u2, i, key, bandy, am);
3043 
3044 										}else{
3045 											o=null;
3046 											if(u.overlapList.size()>maxEdges){quit=true;}
3047 										}
3048 										if(o!=null){
3049 
3050 											if(preventTransitiveOverlaps){
3051 												 mergeClusterIds(u1cluster, u2cluster);
3052 											}
3053 
3054 											assert(o.test(bandy, o.edits+maxEdits)) : o;
3055 											if(verbose || flag){System.err.println("Created overlap "+o);}
3056 
3057 											long comp=u.length()-u2.length();
3058 											if(comp==0){comp=u.code1-u2.code1;}
3059 											if(comp==0){comp=u.code2-u2.code2;}
3060 											if(comp==0){comp=u.prefix1-u2.prefix1;}
3061 											if(comp==0){comp=u.suffix1-u2.suffix1;}
3062 											if(comp==0){comp=(u.r.numericID-u2.r.numericID);}
3063 											assert(comp!=0) : u+", "+u2;
3064 											Unit ua=(comp<0 ? u : u2);
3065 											Unit ub=(comp<0 ? u2 : u);
3066 											assert(ua!=ub);
3067 											if(verbose || flag){
3068 												System.err.println("ua="+ua.code1);
3069 												System.err.println("ub="+ub.code1);
3070 												System.err.println("u ="+u.code1);
3071 												System.err.println("u2="+u2.code1);
3072 												System.err.println("u.r ="+u.r.numericID);
3073 												System.err.println("u2.r="+u2.r.numericID);
3074 												System.err.println("ua contains o? "+ua.alreadyHas(o));
3075 												System.err.println("ub contains o? "+ub.alreadyHas(o));
3076 												System.err.println("ua.list="+ua.overlapList);
3077 												System.err.println("ub.list="+ub.overlapList);
3078 											}
3079 
3080 //											assert(ua.alreadyHas(o)==ub.alreadyHas(o));
3081 
3082 											final boolean uaContainedOverlap;
3083 
3084 											synchronized(ua){
3085 												if(ua.overlapList==null){ua.overlapList=new ArrayList<Overlap>(2);}
3086 												if(!ua.overlapList.contains(o)){
3087 													if(EA){
3088 														synchronized(ub){
3089 															assert(ub.overlapList==null || !ub.overlapList.contains(o)) :
3090 																ua.alreadyHas(o)+", "+ub.alreadyHas(o)+"\n"+o+"\n"+ub.overlapList.get(ub.overlapList.indexOf(o))+
3091 																"\nua.list="+ua.overlapList+"\nub.list="+ub.overlapList+"\nu.code1="+u.code1+"\nu2.code1="+u2.code1;
3092 														}
3093 													}
3094 													currentOverlaps++;
3095 													baseOverlapsT+=o.overlapLen;
3096 													ua.overlapList.add(o);
3097 													if(verbose || flag){System.err.println("Added overlap "+o);}
3098 													uaContainedOverlap=false;
3099 												}else{
3100 													if(verbose || flag){System.err.println("Already contained overlap "+o);}
3101 													hits--;
3102 													uaContainedOverlap=true;
3103 												}
3104 											}
3105 
3106 											if(!uaContainedOverlap){
3107 												synchronized(ub){
3108 													if(ub.overlapList==null){ub.overlapList=new ArrayList<Overlap>(2);}
3109 													assert(!ub.overlapList.contains(o));
3110 													ub.overlapList.add(o);
3111 													if(verbose || flag){System.err.println("Added overlap "+o);}
3112 												}
3113 											}else{
3114 												if(verbose || flag){System.err.println("Already contained overlap "+o);}
3115 											}
3116 
3117 
3118 //											assert(ua.alreadyHas(o));
3119 //											assert(ub.alreadyHas(o));
3120 //											assert(ua.overlapList.contains(o));
3121 //											assert(ub.overlapList.contains(o));
3122 											if(verbose || flag){
3123 												System.err.println("ua contains o? "+ua.alreadyHas(o));
3124 												System.err.println("ub contains o? "+ub.alreadyHas(o));
3125 												System.err.println("ua.list="+ua.overlapList);
3126 												System.err.println("ub.list="+ub.overlapList);
3127 											}
3128 										}
3129 									}
3130 								}
3131 							}
3132 						}
3133 					}
3134 				}
3135 			}
3136 			if(EA){
3137 				synchronized(u){
3138 					if(u.overlapList!=null && u.overlapList.isEmpty()){
3139 						assert(false) : "Why would this happen?";
3140 						u.overlapList=null;
3141 					}
3142 				}
3143 			}
3144 //			assert(false) : hits+", "+currentOverlaps+", "+baseOverlaps+"\n"+overlapMapT+"\n";
3145 
3146 //			assert(hits==currentOverlaps) : hits+", "+currentOverlaps;
3147 
3148 			overlapCollisionsT+=(hits-currentOverlaps);
3149 //			outstream.println("hits="+hits+", currentOverlaps="+currentOverlaps);
3150 			overlapsT+=currentOverlaps;
3151 			return hits;
3152 		}
3153 
3154 		/** Insert reads processed by a thread into the shared code and affix maps.
3155 		 * If operating in subset mode, only store reads with code equal to subset mod subsetCount. */
mergeMaps()3156 		private long mergeMaps(){
3157 			if(verbose){System.err.println("Merging maps.");}
3158 			long novelReads=0, novelKeys=0;
3159 			long collisionReads=0;
3160 			long mergedReads=0;
3161 
3162 			assert(localConflictList.isEmpty());
3163 			assert(sharedConflictList.isEmpty());
3164 
3165 			synchronized(codeMap){
3166 				for(Long key : codeMapT.keySet()){
3167 					if(codeMap.containsKey(key)){
3168 						localConflictList.add(codeMapT.get(key));
3169 						sharedConflictList.add(codeMap.get(key));
3170 					}else{
3171 						ArrayList<Unit> list=codeMapT.get(key);
3172 						codeMap.put(key, list);
3173 						addedList.addAll(list);
3174 						novelReads+=list.size();
3175 						novelKeys++;
3176 					}
3177 				}
3178 			}
3179 
3180 			if(verbose){System.err.println("Novel reads = "+novelReads+", conflicts = "+localConflictList.size());}
3181 
3182 			for(int i=0; i<localConflictList.size(); i++){
3183 				ArrayList<Unit> listT=localConflictList.get(i);
3184 				ArrayList<Unit> list=sharedConflictList.get(i);
3185 				synchronized(list){
3186 					for(Unit u : listT){
3187 						if(verbose){System.err.println("Processing novel unit "+u.name());}
3188 						boolean match=false;
3189 						if(findMatchesT){
3190 							for(Unit u2 : list){
3191 								if(pairedEqualsRC(u, u2)){
3192 									//								if(verbose){System.err.println("Matches "+new String(r2.bases, 0, Tools.min(40, r2.length())));}
3193 									u2.absorbMatch(u);
3194 									if(UNIQUE_ONLY){
3195 										synchronized(u2){
3196 											if(u2.valid()){
3197 												mergedReads++;
3198 												baseMatchesT+=u2.length();
3199 												u2.setValid(false);
3200 												addDupe(u2.r);
3201 											}
3202 										}
3203 									}
3204 									match=true;
3205 									break;
3206 								}
3207 							}
3208 						}
3209 						if(match){
3210 							addDupe(u.r);
3211 							mergedReads++;
3212 							baseMatchesT+=u.length();
3213 							if(verbose){System.err.println("matchesT="+matchesT+", baseMatchesT="+baseMatchesT);}
3214 						}else{
3215 							collisionReads++;
3216 							if(verbose){System.err.println("False collision; count = "+collisionReads);}
3217 							list.add(u);
3218 							addedList.add(u);
3219 						}
3220 					}
3221 				}
3222 			}
3223 			matchesT+=mergedReads;
3224 			collisionsT+=collisionReads;
3225 			if(verbose){System.err.println("Done Merging.");}
3226 			if(verbose){System.err.println("mapT.size="+codeMapT.size()+", basesStoredT="+basesStoredT);}
3227 
3228 			codeMapT.clear();
3229 			localConflictList.clear();
3230 			sharedConflictList.clear();
3231 
3232 			if(!addedList.isEmpty()){
3233 				if(addToAffixMapT){
3234 					final LongM p=new LongM(-1, true);
3235 					assert(affixMap1!=null || affixMap2!=null);
3236 					if(affixMap1!=null && !ignoreAffix1){//Allows you to not use am1
3237 						synchronized(affixMap1){
3238 							for(Unit u : addedList){
3239 								if(verbose){System.err.println("Processing affixes for "+u.name());}
3240 								if(u.prefix1!=-1 || u.prefix1!=u.suffix1){
3241 									if(verbose){System.err.println("Using prefix "+u.prefix1);}
3242 									p.set(u.prefix1);
3243 									ArrayList<Unit> alu=affixMap1.get(p);
3244 									if(alu==null){
3245 										if(verbose){System.err.println("Made new alu for "+p);}
3246 										alu=new ArrayList<Unit>(2);
3247 										affixMap1.put(p.iCopy(), alu);
3248 									}
3249 									if(alu.size()<maxAffixCopies){
3250 										if(verbose){System.err.println("Added "+u.name());}
3251 										alu.add(u);
3252 									}
3253 									if(verbose){System.err.println(affixMap1.get(p));}
3254 								}
3255 								if(storeSuffix && u.prefix1!=u.suffix1){
3256 									if(verbose){System.err.println("Using suffix "+u.suffix1);}
3257 									p.set(u.suffix1);
3258 									ArrayList<Unit> alu=affixMap1.get(p);
3259 									if(alu==null){
3260 										if(verbose){System.err.println("Made new alu for "+p);}
3261 										alu=new ArrayList<Unit>(2);
3262 										affixMap1.put(p.iCopy(), alu);
3263 									}
3264 									if(alu.size()<maxAffixCopies){
3265 										if(verbose){System.err.println("Added "+u.name());}
3266 										alu.add(u);
3267 									}
3268 									if(verbose){System.err.println(affixMap1.get(p));}
3269 								}
3270 							}
3271 						}
3272 					}
3273 					if(affixMap2!=null){
3274 						synchronized(affixMap2){
3275 							for(Unit u : addedList){
3276 								if(u.prefix2!=-1 || u.prefix2!=u.suffix2){
3277 									p.set(u.prefix2);
3278 									ArrayList<Unit> alu=affixMap2.get(p);
3279 									if(alu==null){
3280 										alu=new ArrayList<Unit>(2);
3281 										affixMap2.put(p.iCopy(), alu);
3282 									}
3283 									if(alu.size()<maxAffixCopies){alu.add(u);}
3284 								}
3285 								if(storeSuffix && u.prefix2!=u.suffix2){
3286 									p.set(u.suffix2);
3287 									ArrayList<Unit> alu=affixMap2.get(p);
3288 									if(alu==null){
3289 										alu=new ArrayList<Unit>(2);
3290 										affixMap2.put(p.iCopy(), alu);
3291 									}
3292 									if(alu.size()<maxAffixCopies){alu.add(u);}
3293 								}
3294 							}
3295 						}
3296 					}
3297 				}
3298 			}
3299 
3300 			addedList.clear();
3301 			basesStoredT=0;
3302 			return collisionReads+novelReads;
3303 		}
3304 
getTid()3305 		private int getTid(){
3306 			synchronized(HashThread.class){
3307 				int x=tcount;
3308 				tcount++;
3309 				return x;
3310 			}
3311 		}
3312 
3313 		private LinkedHashMap<Long, ArrayList<Unit>> codeMapT=new LinkedHashMap<Long, ArrayList<Unit>>(threadMaxReadsToBuffer*8);
3314 		private ArrayList<Unit> addedList=new ArrayList<Unit>(threadMaxReadsToBuffer);
3315 		private ArrayList<ArrayList<Unit>> localConflictList=new ArrayList<ArrayList<Unit>>(threadMaxReadsToBuffer);
3316 		private ArrayList<ArrayList<Unit>> sharedConflictList=new ArrayList<ArrayList<Unit>>(threadMaxReadsToBuffer);
3317 
3318 		long matchesT=0;
3319 		long baseMatchesT=0;
3320 		long baseContainmentsT=0;
3321 		long collisionsT=0;
3322 		long containmentsT=0;
3323 		long containmentCollisionsT=0;
3324 		long basesStoredT=0;
3325 		long addedToMainT=0;
3326 		long readsProcessedT=0;
3327 		long basesProcessedT=0;
3328 		long overlapsT=0;
3329 		long baseOverlapsT=0;
3330 		long overlapCollisionsT=0;
3331 
3332 		private final boolean addToCodeMapT;
3333 		private final boolean addToAffixMapT;
3334 		private final boolean findContainmentsT;
3335 		private final boolean findOverlapsT;
3336 		private final boolean findMatchesT;
3337 //		private final boolean convertToUpperCaseT;
3338 		private final int tid;
3339 		private final ArrayDeque<ConcurrentReadInputStream> crisq;
3340 		private final BandedAligner bandy;
3341 	}
3342 
equalsRC(byte[] a, byte[] b)3343 	public static boolean equalsRC(byte[] a, byte[] b){
3344 		if(a==b){return true;}
3345 		if(a==null || b==null){return false;}
3346 		if(a.length!=b.length){return false;}
3347 
3348 		for(int i=0; i<a.length; i++){
3349 			final byte aa=a[i], bb=b[i];
3350 			if(aa!=bb){return false;}
3351 		}
3352 		return true;
3353 	}
3354 
pairedEqualsRC(Unit ua, Unit ub)3355 	public static boolean pairedEqualsRC(Unit ua, Unit ub){
3356 		if(verbose){System.err.println("pairedEqualsRC("+ua.name()+", "+ub.name()+")");}
3357 		if(verbose){System.err.println("ea");}
3358 		boolean b=equalsRC(ua, ub);
3359 		if(verbose){System.err.println("eb");}
3360 		if(!b){return false;}
3361 		if(verbose){System.err.println("ec");}
3362 
3363 		if(ua.r!=null && ub.r!=null){
3364 			if(verbose){System.err.println("ed");}
3365 			assert((ua.r.mate==null)==(ub.r.mate==null));
3366 			if(verbose){System.err.println("ee");}
3367 			if(ua.r.mate!=null && ub.r.mate!=null){
3368 				if(verbose){System.err.println("ef");}
3369 				return ua.r.pairnum()==ub.r.pairnum() && Tools.compare(ua.r.mate.bases, ub.r.mate.bases)==0;
3370 			}
3371 			if(verbose){System.err.println("eg");}
3372 		}
3373 		if(verbose){System.err.println("eh");}
3374 		return true;
3375 	}
3376 
equalsRC(Unit ua, Unit ub)3377 	private static boolean equalsRC(Unit ua, Unit ub){
3378 		if(verbose){System.err.println("equalsRC("+ua.name()+", "+ub.name()+")");}
3379 		return ua.code1==ub.code1 && ua.code2==ub.code2 && (ua.prefix1==ub.prefix1 && ua.suffix1==ub.suffix1) && compareRC(ua, ub)==0;
3380 	}
3381 
comparePairedRC(Unit ua, Unit ub)3382 	public static int comparePairedRC(Unit ua, Unit ub){
3383 		if(verbose){System.err.println("comparePairedRC("+ua.name()+", "+ub.name()+")");}
3384 		int x=compareRC(ua, ub);
3385 		if(x!=0){return x;}
3386 
3387 		if(ua.r!=null && ub.r!=null && ua.r.mate!=null && ub.r.mate!=null){
3388 			if(ua.r.pairnum()!=ub.r.pairnum()){return ua.r.pairnum()-ub.r.pairnum();}
3389 			return compareRC((Unit)ua.r.mate.obj, (Unit)ub.r.mate.obj);
3390 		}
3391 		return 0;
3392 	}
3393 
3394 	//TODO
3395 	//This is really for sorting by length.
compareRC(Unit ua, Unit ub)3396 	private static int compareRC(Unit ua, Unit ub){
3397 		if(verbose){System.err.println("compareRC("+ua.name()+", "+ub.name()+")");}
3398 		if(ua==ub){return 0;}
3399 		if(verbose){System.err.println("a");}
3400 		if(verbose){System.err.println("a1");}
3401 		if(ua.length()!=ub.length()){return ub.length()-ua.length();}
3402 		if(verbose){System.err.println("a2");}
3403 
3404 		if(REQUIRE_MATCHING_NAMES){
3405 			if(ua.name()!=null && ub.name()!=null){
3406 				int x=ua.name().compareTo(ub.name());
3407 				if(x!=0){return x;}
3408 			}
3409 		}
3410 		if(verbose){System.err.println("a3");}
3411 
3412 		if(ua.r==null || ub.r==null){
3413 			if(verbose){System.err.println("b");}
3414 			if(verbose){System.err.println("b1");}
3415 			if(verbose){System.err.println("c");}
3416 			if(ua.prefix1!=ub.prefix1){return ua.prefix1>ub.prefix1 ? 1 : -1;}
3417 			if(ua.suffix1!=ub.suffix1){return ua.suffix1>ub.suffix1 ? 1 : -1;}
3418 			if(verbose){System.err.println("e");}
3419 			if(ua.code1!=ub.code1){return ua.code1>ub.code1 ? 1 : -1;}
3420 			if(ua.code2!=ub.code2){return ua.code2>ub.code2 ? 1 : -1;}
3421 
3422 			return ua.pairnum()-ub.pairnum();
3423 		}
3424 		if(verbose){System.err.println("f");}
3425 		final byte[] a=ua.r.bases, b=ub.r.bases;
3426 		if(a==b){return 0;}
3427 		if(a==null || b==null){return a==null ? -1 : 1;}
3428 		if(verbose){System.err.println("g");}
3429 
3430 		if(verbose){System.err.println("h");}
3431 		for(int i=0; i<a.length; i++){
3432 			final byte aa=a[i], bb=b[i];
3433 			if(aa!=bb){return aa-bb;}
3434 		}
3435 
3436 		if(verbose){System.err.println("j");}
3437 		return ua.pairnum()-ub.pairnum();
3438 	}
3439 
hashTip(byte[] bases, boolean prefix, int k, int skipInitialBases)3440 	private long hashTip(byte[] bases, boolean prefix, int k, int skipInitialBases){
3441 		if(bases==null || bases.length<k){return -1;}
3442 
3443 		final int shift=bitsPerBase*k;
3444 		final int shift2=shift-bitsPerBase;
3445 		final long mask=(shift>63 ? -1L : ~((-1L)<<shift));
3446 		long kmer=0;
3447 		int len=0;
3448 
3449 		final int start=(prefix ? 0+skipInitialBases : bases.length-k-skipInitialBases);
3450 		final int stop=start+k;
3451 
3452 //		if(verbose){
3453 //			System.err.println("\n"+new String(bases));
3454 //			System.err.println("prefix="+prefix+", start="+start+", stop="+stop);
3455 ////			System.err.print(new String(bases));
3456 //		}
3457 		for(int i=start; i<stop; i++){
3458 			byte b=bases[i];
3459 //			if(verbose){System.err.print((char)b);}
3460 			long x=acidToNumber[b];
3461 			kmer=((kmer<<bitsPerBase)|x)&mask;
3462 			len++;
3463 		}
3464 		if(verbose){System.err.println(new String(bases, start, k)+" = "+kmer);}
3465 		assert(len==k) : len+","+k;
3466 		return kmer;
3467 	}
3468 
calcMaxEdits(int maxEdits, float minIdentityMult, int len)3469 	private static final int calcMaxEdits(int maxEdits, float minIdentityMult, int len){
3470 		return minIdentityMult==0 ? maxEdits : Tools.max(maxEdits, (int)Math.round(len*minIdentityMult));
3471 	}
3472 
3473 
3474 	private class Overlap implements Comparable<Overlap>{
3475 
Overlap(Unit u1_, Unit u2_, int type_, int start1_, int start2_, int stop1_, int stop2_, int len_, int mismatches_, int edits_, BandedAligner bandy)3476 		public Overlap(Unit u1_, Unit u2_, int type_, int start1_, int start2_, int stop1_, int stop2_, int len_, int mismatches_, int edits_, BandedAligner bandy){
3477 			assert(u1_!=u2_);
3478 			if(verbose){System.err.println("\nCreating an overlap.");}
3479 			u1=u1_;
3480 			u2=u2_;
3481 			type=type_;
3482 			start1=start1_;
3483 			start2=start2_;
3484 			stop1=stop1_;
3485 			stop2=stop2_;
3486 			overlapLen=len_;
3487 			mismatches=mismatches_;
3488 			edits=edits_;
3489 
3490 			assert(Tools.absdif(Tools.absdif(start1, stop1), Tools.absdif(start2, stop2))<=maxEdits) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases)
3491 				+"\n>1a\n"+new String(u1.r.bases, Tools.min(start1, stop1), Tools.max(start1, stop1)-Tools.min(start1, stop1)+1)
3492 				+"\n>2a\n"+new String(u2.r.bases, Tools.min(start2, stop2), Tools.max(start2, stop2)-Tools.min(start2, stop2)+1);
3493 
3494 			assert(start1>=0 && start1<=u1.length()) : "type "+type+": "+start1+", "+stop1+", "+u1.length()+", "+start2+", "+stop2+", "+u2.length();
3495 			assert(stop1>=0 && stop1<=u1.length()) :   "type "+type+": "+start1+", "+stop1+", "+u1.length()+", "+start2+", "+stop2+", "+u2.length();
3496 			assert(start2>=0 && start2<=u2.length()) : "type "+type+": "+start1+", "+stop1+", "+u1.length()+", "+start2+", "+stop2+", "+u2.length();
3497 			assert(stop2>=0 && stop2<=u2.length()) :   "type "+type+": "+start1+", "+stop1+", "+u1.length()+", "+start2+", "+stop2+", "+u2.length();
3498 
3499 			assert(type==FORWARD || type==REVERSE);
3500 
3501 			if(verbose){System.err.println(this);}
3502 
3503 			assert(Tools.absdif(Tools.absdif(start1, stop1), Tools.absdif(start1, stop1))<=maxEdits);
3504 
3505 			assert(test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases)
3506 				+"\n>1a\n"+new String(u1.r.bases, Tools.min(start1, stop1), Tools.max(start1, stop1)-Tools.min(start1, stop1)+1)
3507 				+"\n>2a\n"+new String(u2.r.bases, Tools.min(start2, stop2), Tools.max(start2, stop2)-Tools.min(start2, stop2)+1);
3508 			if(verbose){System.err.println("Passed test 1.");}
3509 
3510 //			bandy.verbose=true;
3511 //			test(bandy);
3512 //			assert(false);
3513 
3514 			assert(u1!=u2);
3515 			u1.firstInOverlap(u2);
3516 			u2.firstInOverlap(u1);
3517 			assert(u1.length()!=u2.length() || u1.code1!=u2.code1 || u1.code2!=u2.code2 || (u1.r!=null && u1.r.mate!=null)) : "Collision? \n"+this+"\n"+u1+"\n"+u2;
3518 			assert(u1.firstInOverlap(u2)!=u2.firstInOverlap(u1)) :
3519 				"\nu1.firstInOverlap(u2)="+u1.firstInOverlap(u2)+"\nu2.firstInOverlap(u1)="+u2.firstInOverlap(u1)+"\nu1="+u1+"\nu2="+u2;
3520 
3521 			if(!u1.firstInOverlap(u2)){
3522 				if(verbose){System.err.println("\nSwapping.");}
3523 				swap();
3524 				if(verbose){System.err.println(this);}
3525 
3526 				if(EA && !customBandwidth && !test(bandy, edits+maxEdits)){
3527 					System.err.println("\n"+this);
3528 					swap();
3529 					System.err.println("\n"+this);
3530 					assert(test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases)+"\n";
3531 					System.err.println("Passed test 2a, "+bandy.lastEdits+" edits.\n");
3532 					swap();
3533 					System.err.println("\n"+this);
3534 					assert(test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases)
3535 						+"\n>1a\n"+new String(u1.r.bases, Tools.min(start1, stop1), Tools.max(start1, stop1)-Tools.min(start1, stop1)+1)
3536 						+"\n>2a\n"+new String(u2.r.bases, Tools.min(start2, stop2), Tools.max(start2, stop2)-Tools.min(start2, stop2)+1);
3537 					System.err.println("Passed test 2b, "+bandy.lastEdits+" edits.\n");
3538 				}
3539 
3540 				assert(customBandwidth || test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases)
3541 					+"\n>1a\n"+new String(u1.r.bases, Tools.min(start1, stop1), Tools.max(start1, stop1)-Tools.min(start1, stop1)+1)
3542 					+"\n>2a\n"+new String(u2.r.bases, Tools.min(start2, stop2), Tools.max(start2, stop2)-Tools.min(start2, stop2)+1);
3543 				if(verbose){System.err.println("Passed test 2.");}
3544 			}
3545 
3546 			if(type==REVERSE){
3547 				if(verbose){System.err.println("\nReversing.");}
3548 				reverseDirection();
3549 				if(verbose){System.err.println(this);}
3550 
3551 				if(EA && !Shared.anomaly && !customBandwidth && bandy!=null && !test(bandy, edits+maxEdits)){
3552 					Shared.anomaly=true;
3553 					BandedAligner.verbose=true;
3554 					System.err.println("\n********** Failed test 3, "+bandy.lastEdits+" edits. **********\n");
3555 					reverseDirection();
3556 					System.err.println(this);
3557 					assert(test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases)+"\n";
3558 					System.err.println("Passed test 3a, "+bandy.lastEdits+" edits.\n");
3559 					reverseDirection();
3560 					System.err.println(this);
3561 					assert(test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases)
3562 						+"\n>1a\n"+new String(u1.r.bases, Tools.min(start1, stop1), Tools.max(start1, stop1)-Tools.min(start1, stop1)+1)
3563 						+"\n>2a\n"+new String(u2.r.bases, Tools.min(start2, stop2), Tools.max(start2, stop2)-Tools.min(start2, stop2)+1);
3564 					System.err.println("Passed test 3b, "+bandy.lastEdits+" edits.\n");
3565 					BandedAligner.verbose=false;
3566 					assert(false);
3567 				}
3568 
3569 				assert(customBandwidth || test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases)+"\n";
3570 				if(verbose){System.err.println("Passed test 3.");}
3571 			}
3572 			//Now all overlaps should be FORWARD or FORWARDRC and u1 should be at least as big as u2
3573 			assert(type==FORWARD);
3574 			assert(u1.length()>=u2.length());
3575 			assert(u1.firstInOverlap(u2));
3576 			assert(!u2.firstInOverlap(u1));
3577 			if(verbose){System.err.println("Finished overlap initialization.");}
3578 		}
3579 
test(BandedAligner bandy, int editLimit)3580 		public boolean test(BandedAligner bandy, int editLimit){
3581 			final int last1=u1.length()-1, last2=u2.length()-1;
3582 			if(verbose){System.err.println("Testing "+OVERLAP_TYPE_NAMES[type]+", "+start1+", "+start2);}
3583 			if(type==FORWARD){
3584 				assert(start1==0 || start2==0) : "start1="+start1+", stop1="+stop1+", last1="+last1+", start2="+start2+", stop2="+stop2+", last2="+last2;
3585 				if(start2==0){
3586 					if(verbose){System.err.println("A");}
3587 					return u1.overlapsForward(u2, start1, start2, bandy, false, editLimit);}
3588 				else{
3589 					if(verbose){System.err.println("B");}
3590 					return u2.overlapsForward(u1, start2, start1, bandy, false, editLimit);}
3591 			}
3592 			if(type==REVERSE){
3593 				assert(start1==last1 || start2==last2) : "start1="+start1+", stop1="+stop1+", last1="+last1+", start2="+start2+", stop2="+stop2+", last2="+last2;
3594 				if(start2==last2){return u1.overlapsReverse(u2, start1, start2, bandy, false, editLimit);}
3595 				else{return u2.overlapsReverse(u1, start2, start1, bandy, false, editLimit);}
3596 			}
3597 			throw new RuntimeException();
3598 		}
3599 
3600 		@Override
equals(Object o)3601 		public boolean equals(Object o){
3602 			return equals((Overlap)o);
3603 		}
3604 
equals(Overlap o)3605 		public boolean equals(Overlap o){
3606 			if(this==o){return true;}
3607 			assert(o!=null) : "*A*\n"+this+"\n"+o+"\n"+u1+"\n"+u2;
3608 			assert(u1!=null && u2!=null) : "*B*\n"+this+"\n"+o+"\n"+u1+"\n"+u2;
3609 			assert(u1!=o.u2 || u2!=o.u1) : "*C*\n"+this+"\n"+o+"\n"+u1.firstInOverlap(u2)+"\n"+o.u1.firstInOverlap(o.u2)+"\n"+u1+"\n"+u2;
3610 			return (u1==o.u1 && u2==o.u2 && type==o.type && start1==o.start1 && start2==o.start2 && stop1==o.stop1 && stop2==o.stop2)
3611 					;//|| (u1==o.u2 && u2==o.u1 && type==reverseType(o.type) && start1==o.start2 && start2==o.start1);
3612 		}
3613 
3614 //		public int compareTo(Overlap o){
3615 //			int a=compareTo2(o);
3616 //			int b=o.compareTo2(this);
3617 //			assert(a==-b) : "\n"+this+"\n"+o+"\na="+a+", b="+b+", equals="+this.equals(o)
3618 //				+"\nu1.compareTo(o.u1)="+u1.compareTo(o.u1)+"\no.u1.compareTo(u1)="+o.u1.compareTo(u1)
3619 //				+"\nu2.compareTo(o.u2)="+u2.compareTo(o.u2)+"\no.u2.compareTo(u2)="+o.u2.compareTo(u2);
3620 //			return a;
3621 //		}
3622 
3623 		@Override
compareTo(Overlap o)3624 		public int compareTo(Overlap o){
3625 			int score1=overlapLen-50*(mismatches+edits);
3626 			int score2=o.overlapLen-50*(o.mismatches+o.edits);
3627 			if(score1!=score2){return score2-score1;}
3628 			if(overlapLen!=o.overlapLen){return o.overlapLen-overlapLen;}
3629 			int x=u1.compareTo(o.u1);
3630 			if(x!=0){return -x;}
3631 			x=u2.compareTo(o.u2);
3632 			if(x!=0){return -x;}
3633 			if(type!=o.type){return type-o.type;}
3634 			if((u1!=o.u1 || u2!=o.u2) && absorbMatch && !subsetMode){
3635 				boolean oldv=verbose;
3636 				verbose=true;
3637 				System.err.println(this);
3638 				System.err.println(o);
3639 				System.err.println("********");
3640 				System.err.println(u1);
3641 				System.err.println(u2);
3642 				System.err.println(o.u1);
3643 				System.err.println(o.u2);
3644 				System.err.println("********");
3645 				System.err.println(u1.equals(o.u1));
3646 				System.err.println("********");
3647 				System.err.println(u2.equals(o.u2));
3648 				System.err.println("********");
3649 				System.err.println(u1.compareTo(o.u1));
3650 				System.err.println("********");
3651 				System.err.println(u2.compareTo(o.u2));
3652 				System.err.println("********");
3653 				verbose=oldv;
3654 			}
3655 			assert(!absorbMatch || (u1==o.u1 && u2==o.u2) || subsetMode) : "\n"+u1.r+"\n"+u2.r+"\n"+o.u1.r+"\n"+o.u2.r
3656 				+"\n\n"+u1.r.mate+"\n"+u2.r.mate+"\n"+o.u1.r.mate+"\n"+o.u2.r.mate;
3657 //			assert(false) : "\n"+this+"\n"+o+"\n>"+u1.name()+"\n"+new String(u1.bases())+"\n>"+u2.name()+"\n"+new String(u2.bases())+"\n";
3658 			if(start1!=o.start1){return start1-o.start1;}
3659 			if(stop1!=o.stop1){return stop1-o.stop1;}
3660 			if(start2!=o.start2){return start2-o.start2;}
3661 			if(stop2!=o.stop2){return stop2-o.stop2;}
3662 			if(this.equals(o)){
3663 				return 0;
3664 			}else{
3665 				//TODO: ensure this assumption is valid.
3666 				assert(!absorbContainment || !absorbMatch || subsetMode) : "\n"+this+"\n"+o+"\n>"+u1.name()+"\n"+new String(u1.bases())+"\n>"+u2.name()+"\n"+new String(u2.bases())+"\n";
3667 
3668 				if(u1.unitID!=o.u1.unitID){return u1.unitID-o.u1.unitID;}
3669 				if(u2.unitID!=o.u2.unitID){return u2.unitID-o.u2.unitID;}
3670 			}
3671 			return 0;
3672 		}
3673 
3674 		@Override
hashCode()3675 		public int hashCode(){
3676 			return u1.hashCode()^u2.hashCode()^overlapLen;
3677 		}
3678 
swap()3679 		public void swap(){
3680 			Unit tempu=u1;
3681 			u1=u2;
3682 			u2=tempu;
3683 			int temp=start1;
3684 			start1=start2;
3685 			start2=temp;
3686 			temp=stop1;
3687 			stop1=stop2;
3688 			stop2=temp;
3689 //			if(type==FORWARDRC){type=REVERSERC;}
3690 //			else if(type==REVERSERC){type=FORWARDRC;}
3691 			//What about forward and reverse?
3692 		}
3693 
reverseDirection()3694 		public void reverseDirection(){
3695 			type=reverseType(type);
3696 			int temp=start1;
3697 			start1=stop1;
3698 			stop1=temp;
3699 			temp=start2;
3700 			start2=stop2;
3701 			stop2=temp;
3702 		}
3703 
3704 		@Override
toString()3705 		public String toString(){
3706 			StringBuilder sb=new StringBuilder(80);
3707 			sb.append("type=");
3708 			sb.append(OVERLAP_TYPE_NAMES[type]);
3709 			sb.append(", len=");
3710 			sb.append(overlapLen);
3711 			sb.append(", subs=");
3712 			sb.append(mismatches);
3713 			sb.append(", edits=");
3714 			sb.append(edits);
3715 
3716 			sb.append(" (");
3717 			sb.append(u1.name()==null ? u1.r.numericID+"" : u1.name());
3718 			if(printLengthInEdges){
3719 				sb.append(", length=");
3720 				sb.append(u1.length());
3721 			}
3722 			sb.append(", start1=");
3723 			sb.append(start1);
3724 			sb.append(", stop1=");
3725 			sb.append(stop1);
3726 
3727 			sb.append(") (");
3728 			sb.append(u2.name()==null ? u2.r.numericID+"" : u2.name());
3729 			if(printLengthInEdges){
3730 				sb.append(", length=");
3731 				sb.append(u2.length());
3732 			}
3733 			sb.append(", start2=");
3734 			sb.append(start2);
3735 			sb.append(", stop2=");
3736 			sb.append(stop2);
3737 			sb.append(")");
3738 			return sb.toString();
3739 		}
3740 
toLabel()3741 		public String toLabel(){
3742 			StringBuilder sb=new StringBuilder(80);
3743 			sb.append(OVERLAP_TYPE_ABBREVIATIONS[type]);
3744 			sb.append(',');
3745 			sb.append(overlapLen);
3746 			sb.append(',');
3747 			sb.append(mismatches);
3748 			sb.append(',');
3749 			sb.append(edits);
3750 
3751 			if(printLengthInEdges){
3752 				sb.append(',');
3753 				sb.append(u1.length());
3754 			}
3755 			sb.append(',');
3756 			sb.append(start1);
3757 			sb.append(',');
3758 			sb.append(stop1);
3759 
3760 			if(printLengthInEdges){
3761 				sb.append(',');
3762 				sb.append(u2.length());
3763 			}
3764 			sb.append(',');
3765 			sb.append(start2);
3766 			sb.append(',');
3767 			sb.append(stop2);
3768 
3769 			return sb.toString();
3770 		}
3771 
setOffsetContradiction(boolean b)3772 		private void setOffsetContradiction(boolean b){
3773 			assert(b!=offsetContradiction()) : b+", "+offsetContradiction();
3774 			if(b){flags|=OFFSET_CONTRADICTION_MASK;}
3775 			else{flags&=~OFFSET_CONTRADICTION_MASK;}
3776 			assert(b==offsetContradiction()) : b+", "+offsetContradiction();
3777 		}
3778 
setMultiJoin(boolean b)3779 		private void setMultiJoin(boolean b){
3780 			assert(b!=multiJoin()) : b+", "+multiJoin();
3781 			if(b){flags|=MULTIJOIN_MASK;}
3782 			else{flags&=~MULTIJOIN_MASK;}
3783 			assert(b==multiJoin()) : b+", "+multiJoin();
3784 		}
3785 
setVisited(boolean b)3786 		private void setVisited(boolean b){
3787 			assert(b!=visited()) : b+", "+visited();
3788 			if(b){flags|=VISITED_MASK;}
3789 			else{flags&=~VISITED_MASK;}
3790 			assert(b==visited()) : b+", "+visited();
3791 		}
3792 
setCyclic(boolean b)3793 		private void setCyclic(boolean b){
3794 			assert(b!=cyclic()) : b+", "+cyclic();
3795 			if(b){flags|=CYCLIC_MASK;}
3796 			else{flags&=~CYCLIC_MASK;}
3797 			assert(b==cyclic()) : b+", "+cyclic();
3798 		}
3799 
setInvalid(boolean b)3800 		private void setInvalid(boolean b){
3801 			assert(b!=invalid()) : b+", "+invalid();
3802 			assert(b!=mst()) : b+", "+mst()+", "+invalid();
3803 			if(b){flags|=INVALID_MASK;}
3804 			else{flags&=~INVALID_MASK;}
3805 			assert(b==invalid()) : b+", "+invalid();
3806 		}
3807 
setMst(boolean b)3808 		private void setMst(boolean b){
3809 			assert(b!=mst()) : b+", "+mst();
3810 			assert(b!=invalid()) : b+", "+mst()+", "+invalid();
3811 			if(b){flags|=MST_MASK;}
3812 			else{flags&=~MST_MASK;}
3813 			assert(b==mst()) : b+", "+mst();
3814 		}
3815 
clearVolatileFlags()3816 		public void clearVolatileFlags(){
3817 			flags=0;
3818 //			flags=flags&~(MULTIJOIN_MASK|VISITED_MASK|CANON_CONTRADICTION_MASK|CYCLIC_MASK|OFFSET_CONTRADICTION_MASK|INVALID_MASK);
3819 //			assert(!canonContradiction());
3820 //			assert(!offsetContradiction());
3821 //			assert(!multiJoin());
3822 //			assert(!visited());
3823 //			assert(!cyclic());
3824 //			assert(!invalid());
3825 		}
3826 
offsetContradiction()3827 		public boolean offsetContradiction(){return (OFFSET_CONTRADICTION_MASK&flags)==OFFSET_CONTRADICTION_MASK;}
multiJoin()3828 		public boolean multiJoin(){return (MULTIJOIN_MASK&flags)==MULTIJOIN_MASK;}
visited()3829 		public boolean visited(){return (VISITED_MASK&flags)==VISITED_MASK;}
cyclic()3830 		public boolean cyclic(){return (CYCLIC_MASK&flags)==CYCLIC_MASK;}
invalid()3831 		public boolean invalid(){return (INVALID_MASK&flags)==INVALID_MASK;}
mst()3832 		public boolean mst(){return (MST_MASK&flags)==MST_MASK;}
contradiction()3833 		public boolean contradiction(){return offsetContradiction();}
3834 
3835 		private static final long VISITED_MASK=(1L<<0);
3836 		private static final long MULTIJOIN_MASK=(1L<<1);
3837 		private static final long CYCLIC_MASK=(1L<<2);
3838 		private static final long CANON_CONTRADICTION_MASK=(1L<<3);
3839 		private static final long OFFSET_CONTRADICTION_MASK=(1L<<4);
3840 		private static final long INVALID_MASK=(1L<<5);
3841 		private static final long MST_MASK=(1L<<6);
3842 
3843 		Unit u1;
3844 		Unit u2;
3845 		int type;
3846 		int start1;
3847 		int start2;
3848 		int stop1;
3849 		int stop2;
3850 
3851 		long flags=0;
3852 
3853 		final int overlapLen;
3854 		final int mismatches;
3855 		final int edits;
3856 	}
3857 
3858 	/**
3859 	 * @return
3860 	 */
determineCluster2(final int uid)3861 	private int determineCluster2(final int uid) {
3862 		assert(clusterNumbers!=null);
3863 		boolean stable=false;
3864 		int cluster=uid;
3865 		while(!stable){
3866 			cluster=clusterNumbers.get(uid);
3867 			if(cluster==0 || cluster==uid){return cluster;}
3868 			assert(cluster<=uid);
3869 			final int next=determineCluster2(cluster);
3870 			if(next>=cluster){return cluster;}
3871 			stable=clusterNumbers.compareAndSet(uid, cluster, next);
3872 		}
3873 		return cluster;
3874 	}
3875 
3876 
mergeClusterIds(int cluster1, int cluster2)3877 	private int mergeClusterIds(int cluster1, int cluster2) {
3878 		assert(clusterNumbers!=null);
3879 
3880 //		System.err.println("Merging clusters "+cluster1+" and "+cluster2);
3881 
3882 		while(cluster1!=cluster2){
3883 			int min=Tools.min(cluster1, cluster2);
3884 			if(cluster1!=min){
3885 				assert(cluster1>min);
3886 				boolean b=clusterNumbers.compareAndSet(cluster1, cluster1, min);
3887 				if(!b){
3888 					cluster1=determineCluster2(cluster1);
3889 					min=Tools.min(cluster1, cluster2);
3890 				}
3891 			}
3892 			if(cluster2!=min){
3893 				assert(cluster2>min);
3894 				boolean b=clusterNumbers.compareAndSet(cluster2, cluster2, min);
3895 				if(!b){
3896 					cluster2=determineCluster2(cluster2);
3897 					min=Tools.min(cluster1, cluster2);
3898 				}
3899 			}
3900 		}
3901 //		System.err.println("Returning "+cluster1);
3902 		return cluster1;
3903 	}
3904 
3905 	private class Unit implements Comparable<Unit>, Serializable {
3906 
3907 		/**
3908 		 *
3909 		 */
3910 		private static final long serialVersionUID = 5232407003873807738L;
3911 
Unit(Read r_)3912 		public Unit(Read r_){
3913 //			this(r_, canonical_, canonical_ ? hash(r_.bases) : hashReversed(r_.bases));
3914 			this(r_, hash(r_.bases), hashReversed(r_.bases));
3915 		}
3916 
Unit(Read r_, long codeF_, long codeR_)3917 		public Unit(Read r_, long codeF_, long codeR_){
3918 			r=r_;
3919 			code1=Tools.min(codeF_, codeR_);
3920 			code2=Tools.max(codeF_, codeR_);
3921 			long f=r.length();
3922 			prefix1=hashTip(r.bases, true, k, 0);
3923 			suffix1=hashTip(r.bases, false, k, 0);
3924 			if(r.length()>bitsPerBase*k){
3925 				prefix2=hashTip(r.bases, true, k, k);
3926 				suffix2=hashTip(r.bases, false, k, k);
3927 			}
3928 //			System.err.println(r.id+", "+prefix1+", "+suffix1+", "+code1+", "+code2);
3929 			if(r.pairnum()==1){f|=PAIRNUM_MASK;}
3930 			flags=f;
3931 			assert(length()==r.length());
3932 			assert(pairnum()==r.pairnum());
3933 			if(parseDepth){
3934 				int[] quad=KmerNormalize.parseDepth(r.id, null);
3935 				if(quad!=null){depth=quad[r.pairnum()];}
3936 			}
3937 		}
3938 
determineCluster()3939 		int determineCluster() {
3940 			return determineCluster2(unitID);
3941 		}
3942 
absorbMatch(Unit u)3943 		public void absorbMatch(Unit u){
3944 
3945 			assert(code1==u.code1 && code2==u.code2 && length()==u.length());
3946 			if(r==null || u.r==null){return;}
3947 			u.r.setDiscarded(true);
3948 			final byte[] bases1=r.bases, bases2=u.r.bases;
3949 			final byte[] quals1=r.quality, quals2=u.r.quality;
3950 
3951 			assert((r.mate==null) == (u.r.mate==null));
3952 
3953 			if(r.mate!=null && !u.r.mate.discarded()){
3954 				((Unit)r.mate.obj).absorbMatch((Unit)u.r.mate.obj);
3955 			}
3956 			if(quals1==null || quals2==null){return;}
3957 
3958 			for(int i=0; i<bases1.length; i++){
3959 				byte b1=bases1[i], b2=bases2[i];
3960 				if(!isFullyDefined(b1) && isFullyDefined(b2)){bases1[i]=b2;}
3961 				else{assert(b1==b2);}
3962 				if(quals1!=null && quals2!=null){
3963 					quals1[i]=Tools.max(quals1[i], quals2[i]);
3964 				}
3965 			}
3966 		}
3967 
alreadyHas(Overlap o)3968 		public boolean alreadyHas(Overlap o){
3969 			if(overlapList==null){return false;}
3970 			for(int i=0; i<overlapList.size(); i++){
3971 				Overlap o2=overlapList.get(i);
3972 				if(o.equals(o2)){
3973 					assert(overlapList.contains(o));
3974 					assert(o2.equals(o));
3975 					return true;
3976 				}
3977 			}
3978 			assert(!overlapList.contains(o));
3979 			return false;
3980 		}
3981 
makeCluster()3982 		public ArrayList<Unit> makeCluster() {
3983 			assert(!visited());
3984 			assert(!clustered());
3985 			assert(valid());
3986 //			assert(set.isEmpty());
3987 			ArrayList<Unit> cluster=new ArrayList<Unit>(overlapList==null ? 1 : overlapList.size()+1);
3988 			cluster.add(this);
3989 			setClustered(true);
3990 
3991 			int added=1;
3992 			for(int i=0; i<cluster.size(); i++){
3993 				Unit u=cluster.get(i);
3994 				added+=u.visit(cluster);
3995 			}
3996 
3997 			assert(added==cluster.size());
3998 			return cluster;
3999 		}
4000 
visit(ArrayList<Unit> cluster)4001 		public int visit(ArrayList<Unit> cluster) {
4002 			assert(!visited());
4003 			assert(clustered());
4004 			assert(valid());
4005 //			assert(cluster.contains(this));
4006 			setVisited(true);
4007 			int added=0;
4008 
4009 			if(r!=null && r.mate!=null){
4010 				Unit u2=(Unit)r.mate.obj;
4011 				assert(u2!=this);
4012 				assert(u2.valid());
4013 				if(!u2.clustered()){
4014 					u2.setClustered(true);
4015 					cluster.add(u2);
4016 					added++;
4017 				}
4018 			}
4019 
4020 			if(overlapList!=null){
4021 				for(Overlap o : overlapList){
4022 					Unit u2=(o.u1==this ? o.u2 : o.u1);
4023 					assert(o.u1==this || o.u2==this);
4024 					assert(u2!=this);
4025 					assert(u2.valid());
4026 					if(!u2.clustered()){
4027 						u2.setClustered(true);
4028 						cluster.add(u2);
4029 						added++;
4030 					}
4031 				}
4032 			}
4033 			return added;
4034 		}
4035 
isTransitive()4036 		public boolean isTransitive(){
4037 			assert(valid());
4038 			if(overlapList==null || overlapList.size()==0){return true;}
4039 			for(Overlap o : overlapList){
4040 				assert(o.u1==this || o.u2==this);
4041 				Unit u2=(o.u1==this ? o.u2 : o.u1);
4042 				assert(u2!=this);
4043 				if(u2.overlapList==null){
4044 					return false;
4045 				}else{
4046 					boolean found=false;
4047 					for(Overlap o2 : u2.overlapList){
4048 						if(o2.u1==this || o2.u2==this){
4049 							found=true; break;
4050 						}
4051 					}
4052 					if(!found){return false;}
4053 				}
4054 			}
4055 			return true;
4056 		}
4057 
isPerfectlyTransitive()4058 		public boolean isPerfectlyTransitive(){
4059 			assert(valid());
4060 			if(overlapList==null || overlapList.size()==0){return true;}
4061 			for(Overlap o : overlapList){
4062 				assert(o.u1==this || o.u2==this);
4063 				Unit u2=(o.u1==this ? o.u2 : o.u1);
4064 				assert(u2!=this);
4065 				if(u2.overlapList==null){
4066 					return false;
4067 				}else{
4068 					boolean found=false;
4069 					for(Overlap o2 : u2.overlapList){
4070 						if(o2==o){
4071 							found=true; break;
4072 						}
4073 					}
4074 					if(!found){return false;}
4075 				}
4076 			}
4077 			return true;
4078 		}
4079 
isNonRedundant()4080 		public boolean isNonRedundant(){
4081 			assert(valid());
4082 			if(overlapList==null || overlapList.size()==0){return true;}
4083 			for(int i=0; i<overlapList.size(); i++){
4084 				Overlap a=overlapList.get(i);
4085 				for(int j=0; j<overlapList.size(); j++){
4086 					Overlap b=overlapList.get(j);
4087 					if((i==j)!=(a.equals(b))){
4088 						return false;
4089 					}
4090 				}
4091 			}
4092 			return true;
4093 		}
4094 
contains(Unit u2, int loc, LongM key, BandedAligner bandy, int tableNum)4095 		public boolean contains(Unit u2, int loc, LongM key, BandedAligner bandy, int tableNum) {
4096 			if(verbose){System.err.println("contains: Considering key "+key+", unit "+u2);}
4097 			if(u2.length()>this.length()){return false;} //Smaller things cannot contain larger things
4098 			if(u2.length()==this.length()){//For equal size, only one can contain the other the other
4099 				boolean pass=false;
4100 				if(!pass && u2.unitID>this.unitID){return false;}else if(u2.unitID<this.unitID){pass=true;}
4101 				if(!pass && u2.r.numericID>this.r.numericID){return false;}else if(u2.r.numericID<this.r.numericID){pass=true;}
4102 				if(!pass && u2.code1>this.code1){return false;}else if(u2.code1<this.code1){pass=true;}
4103 				if(!pass && u2.r.bases.hashCode()>this.r.bases.hashCode()){return false;}else if(u2.r.bases.hashCode()<this.r.bases.hashCode()){pass=true;}
4104 				if(!pass){return false;}
4105 			}
4106 			if(minLengthPercent>0 && (u2.length()*100f/length())<minLengthPercent){return false;}
4107 			if(verbose){System.err.println("passed tests for "+u2.length()+", "+this.length()+", "+u2.unitID+", "+this.unitID);}
4108 
4109 			assert(u2.code1!=code1 || u2.code2!=code2 || u2.length()!=length() || (r!=null && r.mate!=null) || //REQUIRE_MATCHING_NAMES ||
4110 					(u2.prefix1!=prefix1 && u2.suffix1!=suffix1)) :
4111 						"Collision? \n"+this+"\n"+u2+"\n"+r+"\n"+u2.r;
4112 
4113 			if(tableNum==0){
4114 				if(key.value()==u2.prefix1){
4115 					if(verbose){System.err.println("Containment A1");}
4116 					if(containsForward(u2, loc-k2, bandy, tableNum==0)){return true;}
4117 				}
4118 				if(key.value()==u2.suffix1){
4119 					if(verbose){System.err.println("Containment B1");}
4120 					if(containsReverse(u2, loc, bandy, tableNum==0)){return true;}
4121 				}
4122 			}else{
4123 				if(key.value()==u2.prefix2){
4124 					if(verbose){System.err.println("Containment A2");}
4125 					if(containsForward(u2, loc-k2-k, bandy, tableNum==0)){return true;}
4126 				}
4127 				if(key.value()==u2.suffix2){
4128 					if(verbose){System.err.println("Containment B2");}
4129 					if(containsReverse(u2, loc+k, bandy, tableNum==0)){return true;}
4130 				}
4131 			}
4132 			return false;
4133 		}
4134 
containsForward(Unit u2, int start, BandedAligner bandy, boolean earlyExit)4135 		private boolean containsForward(Unit u2, int start, BandedAligner bandy, boolean earlyExit) {
4136 			if(start+u2.length()>length() || start<0 || start>=length()){return false;}
4137 //			if(true){return false;}
4138 			if(u2.r!=null){
4139 				final byte[] a=bases(), b=u2.bases();
4140 				int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length);
4141 
4142 				for(int i=start, j=0; j<b.length; i++, j++){
4143 					byte aa=a[i];
4144 					byte bb=b[j];
4145 					if(aa!=bb){
4146 						if(exact || (isFullyDefined(aa) && isFullyDefined(bb))){
4147 							if(earlyExit && j<k2){return false;}
4148 							if((mismatches=mismatches+1)>maxMismatches){
4149 								if(bandy==null || maxEdits<1){return false;}
4150 								int edits=bandy.alignForward(b, a, 0, start, maxEdits, exact);
4151 								assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits;
4152 								return edits<=maxEdits && bandy.score()>4*edits;
4153 							}
4154 						}
4155 					}
4156 				}
4157 				return true;
4158 			}else{
4159 				assert(false) : "TODO: Verify by hashing and checking both tips";
4160 				return false;
4161 			}
4162 		}
4163 
containsReverse(Unit u2, int start, BandedAligner bandy, boolean earlyExit)4164 		private boolean containsReverse(Unit u2, int start, BandedAligner bandy, boolean earlyExit) {
4165 			if(start+1<u2.length() || start<0 || start>=length()){return false;}
4166 //			if(true){return false;}
4167 			if(u2.r!=null){
4168 				final byte[] a=bases(), b=u2.bases();
4169 				int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length);
4170 
4171 				for(int i=start, j=b.length-1, iprefix=start-k2; j>=0; i--, j--){
4172 					byte aa=a[i];
4173 					byte bb=b[j];
4174 					if(aa!=bb){
4175 						if(exact || (isFullyDefined(aa) && isFullyDefined(bb))){
4176 							if(earlyExit && i>iprefix){return false;}
4177 							if((mismatches=mismatches+1)>maxMismatches){
4178 								if(bandy==null || maxEdits<1){return false;}
4179 								int edits=bandy.alignReverse(b, a, b.length-1, start, maxEdits, exact);
4180 								assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits;
4181 								return edits<=maxEdits && bandy.score()>4*edits;
4182 							}
4183 						}
4184 					}
4185 				}
4186 				return true;
4187 			}else{
4188 				assert(false) : "TODO: Verify by hashing and checking both tips";
4189 				return false;
4190 			}
4191 		}
4192 
4193 
depthCongruent(int aa, int bb)4194 		public boolean depthCongruent(int aa, int bb){
4195 			if(aa<5 && bb<5){return true;}
4196 			final int a=Tools.max(1, Tools.min(aa, bb));
4197 			final int b=Tools.max(aa, bb);
4198 			return a*depthRatio>=b;
4199 		}
4200 
overlaps(Unit u2, int loc, LongM key, BandedAligner bandy, int tableNum, int editLimit)4201 		public boolean overlaps(Unit u2, int loc, LongM key, BandedAligner bandy, int tableNum, int editLimit) {
4202 //			return makeOverlap(u2, loc, key, bandy, earlyExit)!=null;
4203 
4204 //			assert(false) : "TODO";
4205 			if(verbose){System.err.println("overlaps: Considering key "+key+", unit "+u2);}
4206 			if(parseDepth && !depthCongruent(depth, u2.depth)){return false;}
4207 			if(minLengthPercent>0){
4208 				final int len1=length(), len2=u2.length();
4209 				if(Tools.min(len1, len2)*100f/Tools.max(len1, len2)<minLengthPercent){return false;}
4210 			}
4211 			assert(u2.code1!=code1 || u2.code2!=code2 || u2.length()!=length() ||
4212 					(u2.prefix1!=prefix1 && u2.suffix1!=suffix1)) :
4213 						"Collision? \n"+this+"\n"+u2+"\n"+r+"\n"+u2.r;
4214 
4215 
4216 			if(tableNum==0){
4217 				if(key.value()==u2.prefix1){
4218 					if(verbose){System.err.println("Testing overlaps A1");}
4219 					if(overlapsForward(u2, loc-k2, 0, bandy, tableNum==0, editLimit)){
4220 						if(verbose){System.err.println("Found Overlap A1F");}
4221 						return true;
4222 					}
4223 					if(verbose){System.err.println("No Overlap.");}
4224 				}
4225 				if(key.value()==u2.suffix1){
4226 					if(verbose){System.err.println("Testing overlaps B1");}
4227 					if(verbose){System.err.println("Testing overlaps B1R");}
4228 					if(overlapsReverse(u2, loc, u2.length()-1, bandy, tableNum==0, editLimit)){
4229 						if(verbose){System.err.println("Found Overlap B1R");}
4230 						return true;
4231 					}
4232 					if(verbose){System.err.println("No Overlap.");}
4233 				}
4234 			}else{
4235 				if(key.value()==u2.prefix2){
4236 					if(verbose){System.err.println("Testing overlaps A2");}
4237 					if(overlapsForward(u2, loc-k2-k, 0, bandy, tableNum==0, editLimit)){
4238 						if(verbose){System.err.println("Found Overlap A2F");}
4239 						return true;
4240 					}
4241 					if(verbose){System.err.println("No Overlap.");}
4242 				}
4243 				if(key.value()==u2.suffix2){
4244 					if(verbose){System.err.println("Testing overlaps B2");}
4245 					if(overlapsReverse(u2, loc+k, u2.length()-1, bandy, tableNum==0, editLimit)){
4246 						if(verbose){System.err.println("Found Overlap B2R");}
4247 						return true;
4248 					}
4249 					if(verbose){System.err.println("No Overlap.");}
4250 				}
4251 			}
4252 
4253 			return false;
4254 		}
4255 
4256 		/**
4257 		 * @param u2
4258 		 * @param loc
4259 		 * @param key
4260 		 * @return
4261 		 */
makeOverlap(Unit u2, int loc, LongM key, BandedAligner bandy, int tableNum)4262 		protected Overlap makeOverlap(Unit u2, int loc, LongM key, BandedAligner bandy, int tableNum) {
4263 			if(verbose){System.err.println("makeOverlap: Considering key "+key+", unit "+u2);}
4264 			if(parseDepth && !depthCongruent(depth, u2.depth)){return null;}
4265 			if(minLengthPercent>0){
4266 				final int len1=length(), len2=u2.length();
4267 				if(Tools.min(len1, len2)*100f/Tools.max(len1, len2)<minLengthPercent){return null;}
4268 			}
4269 			assert(u2.code1!=code1 || u2.code2!=code2 || u2.length()!=length() || (r!=null && r.mate!=null) ||
4270 					(u2.prefix1!=prefix1 && u2.suffix1!=suffix1)) :
4271 						"Collision? \n"+this+"\n"+u2+"\n"+r+"\n"+u2.r;
4272 
4273 
4274 			Overlap o=null;
4275 			if(tableNum==0){
4276 				if(key.value()==u2.prefix1){
4277 					if(verbose){System.err.println("\nTesting makeOverlap A1F");}
4278 					if((o=makeOverlapForward(u2, loc-k2, bandy, tableNum==0))!=null){
4279 						if(verbose){System.err.println("Made Overlap A1F");}
4280 						return o;
4281 					}
4282 					if(verbose){System.err.println("No Overlap.");}
4283 				}
4284 				if(key.value()==u2.suffix1){
4285 					if(verbose){System.err.println("\nTesting makeOverlap B1R");}
4286 					if((o=makeOverlapReverse(u2, loc, bandy, tableNum==0))!=null){
4287 						if(verbose){System.err.println("Made Overlap B1R");}
4288 						return o;
4289 					}
4290 					if(verbose){System.err.println("No Overlap.");}
4291 				}
4292 			}else{
4293 				if(key.value()==u2.prefix2){
4294 					if(verbose){System.err.println("\nTesting makeOverlap A2F");}
4295 					if((o=makeOverlapForward(u2, loc-k2-k, bandy, tableNum==0))!=null){
4296 						if(verbose){System.err.println("Made Overlap A2F");}
4297 						return o;
4298 					}
4299 					if(verbose){System.err.println("No Overlap.");}
4300 				}
4301 				if(key.value()==u2.suffix2){
4302 					if(verbose){System.err.println("\nTesting makeOverlap B2R");}
4303 					if((o=makeOverlapReverse(u2, loc+k, bandy, tableNum==0))!=null){
4304 						if(verbose){System.err.println("Made Overlap B2R");}
4305 						return o;
4306 					}
4307 					if(verbose){System.err.println("No Overlap.");}
4308 				}
4309 			}
4310 			return o;
4311 		}
4312 
overlapsForward(Unit u2, int start1, int start2, BandedAligner bandy, boolean earlyExit, int maxEdits)4313 		private boolean overlapsForward(Unit u2, int start1, int start2, BandedAligner bandy, boolean earlyExit, int maxEdits) {
4314 			if(verbose){System.err.println("overlapsForward(u1="+this.name()+", u2="+u2.name()+", start1="+start1+", start2="+start2+", earlyExit="+earlyExit+")");}
4315 
4316 			final int len1=length(), len2=u2.length();
4317 			if(start1<0){
4318 				start2-=start1;
4319 				start1=0;
4320 				if(verbose){System.err.println("Modified: start1="+start1+", start2="+start2);}
4321 			}
4322 			int overlapLength=Tools.min(len1-start1, len2-start2);
4323 			int overlapLength2=Tools.max(len1-start1, len2-start2);
4324 			int stop1=start1+overlapLength-1, stop2=start2+overlapLength-1;
4325 			if(verbose){System.err.println("Calculated stop1="+stop1+", stop2="+stop2+", overlapLength="+overlapLength);}
4326 
4327 			if(!allowAllContainedOverlaps || overlapLength>Tools.min(len1, len2)){
4328 				if(verbose){
4329 					System.err.println("Side block. allowAllContainedOverlaps="+allowAllContainedOverlaps+", minOverlapCluster="+minOverlapCluster);
4330 					System.err.println("start1="+start1+", stop1="+stop1+", len1="+len1+", start2="+start2+", stop2="+stop2+", len2="+len2+
4331 							", overlapLen="+overlapLength+", maxEdits="+maxEdits);
4332 				}
4333 				if(overlapLength2<minOverlapCluster){return false;}
4334 				if(minOverlapPercentCluster>0f && (overlapLength2*100f/Tools.min(len1, len2))<minOverlapPercentCluster){return false;}
4335 			}
4336 
4337 			final byte[] a=bases(), b=u2.bases();
4338 			assert(a!=null && b!=null) : "Null bases for "+code1+" or "+u2.code1;
4339 			int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, overlapLength);
4340 
4341 			if(verbose){
4342 				System.err.println("start1="+start1+", stop1="+stop1+", len1="+len1+", start2="+start2+", stop2="+stop2+", len2="+len2+
4343 						", overlapLen="+overlapLength+", maxMismatches="+maxMismatches+", maxEdits="+maxEdits);
4344 			}
4345 
4346 			for(int i=start1, j=start2; j<=stop2; i++, j++){
4347 				byte aa=a[i];
4348 				byte bb=b[j];
4349 				if(aa!=bb){
4350 					if(exact || (isFullyDefined(aa) && isFullyDefined(bb))){
4351 						if(earlyExit && j<k2){return false;}
4352 						mismatches++;
4353 						if(verbose){System.err.println("i="+i+", j="+j+", "+(char)aa+"!="+(char)bb+", mismatches="+mismatches+"/"+maxMismatches);}
4354 						if(mismatches>maxMismatches){
4355 							if(bandy==null || maxEdits<1){return false;}
4356 							if(verbose){System.err.println("Mismatches exceeded maximum, attempting banded alignment.");}
4357 							int edits=bandy.alignForward(b, a, 0, start1, maxEdits, exact);
4358 							assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits;
4359 							stop2=bandy.lastQueryLoc;
4360 							stop1=bandy.lastRefLoc;
4361 							return edits<=maxEdits && bandy.score()>2*edits; //Set at 2*edits instead of 4*edits to prevent assertion errors when reversing alignment
4362 						}
4363 					}
4364 				}
4365 			}
4366 			return true;
4367 		}
4368 
overlapsReverse(Unit u2, int start1, int start2, BandedAligner bandy, boolean earlyExit, int maxEdits)4369 		private boolean overlapsReverse(Unit u2, int start1, int start2, BandedAligner bandy, boolean earlyExit, int maxEdits) {
4370 			if(verbose){System.err.println("overlapsReverse(u1="+this.name()+", u2="+u2.name()+", start1="+start1+", start2="+start2+", earlyExit="+earlyExit+")");}
4371 
4372 			final int len1=length(), len2=u2.length();
4373 			if(start1>=len1){
4374 				start2-=(start1-len1+1);
4375 				start1=len1-1;
4376 				if(verbose){System.err.println("Modified: start1="+start1+", start2="+start2);}
4377 			}
4378 			final int overlapLength=Tools.min(start1+1, start2+1);
4379 			final int overlapLength2=Tools.max(start1+1, start2+1);
4380 			int stop1=start1-overlapLength+1, stop2=start2-overlapLength+1;
4381 			if(verbose){System.err.println("Calculated stop1="+stop1+", stop2="+stop2+", overlapLength="+overlapLength);}
4382 
4383 			if(!allowAllContainedOverlaps || overlapLength>Tools.min(len1, len2)){
4384 				if(overlapLength2<minOverlapCluster){return false;}
4385 				if(minOverlapPercentCluster>0f && (overlapLength2*100f/Tools.min(len1, len2))<minOverlapPercentCluster){return false;}
4386 			}
4387 
4388 			final byte[] a=bases(), b=u2.bases();
4389 			assert(a!=null && b!=null) : "Null bases for "+code1+" or "+u2.code1;
4390 			int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length);
4391 
4392 			if(verbose){
4393 				System.err.println("start1="+start1+", stop1="+stop1+", len1="+len1+", start2="+start2+", stop2="+stop2+", len2="+len2+
4394 						", overlapLen="+overlapLength+", maxMismatches="+maxMismatches+", maxEdits="+maxEdits);
4395 			}
4396 
4397 			for(int i=start1, j=start2, iprefix=start1-k2; i>=stop1; i--, j--){
4398 				byte aa=a[i];
4399 				byte bb=b[j];
4400 				if(aa!=bb){
4401 					if(exact || (isFullyDefined(aa) && isFullyDefined(bb))){
4402 						if(earlyExit && i>iprefix){return false;}
4403 						mismatches++;
4404 						if(verbose){System.err.println("i="+i+", j="+j+", "+(char)aa+"!="+(char)bb+", mismatches="+mismatches+"/"+maxMismatches);}
4405 						if(mismatches>maxMismatches){
4406 							if(bandy==null || maxEdits<1){return false;}
4407 							if(verbose){System.err.println("Mismatches exceeded maximum, attempting banded alignment.");}
4408 							int edits=bandy.alignReverse(b, a, b.length-1, start1, maxEdits, exact);
4409 							assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits;
4410 							stop2=bandy.lastQueryLoc;
4411 							stop1=bandy.lastRefLoc;
4412 							return edits<=maxEdits && bandy.score()>2*edits; //Set at 2*edits instead of 4*edits to prevent assertion errors when reversing alignment
4413 						}
4414 					}
4415 				}
4416 			}
4417 			return true;
4418 		}
4419 
makeOverlapForward(Unit u2, int start1, BandedAligner bandy, boolean earlyExit)4420 		private Overlap makeOverlapForward(Unit u2, int start1, BandedAligner bandy, boolean earlyExit) {
4421 			if(verbose){System.err.println("makeOverlapForward(u1="+this.name()+", u2="+u2.name()+", start="+start1+", earlyExit="+earlyExit+")");}
4422 			final int len1=length(), len2=u2.length();
4423 			int start2=0;
4424 			if(start1<0){
4425 				start2-=start1;
4426 				start1=0;
4427 			}
4428 			final int overlapLength=Tools.min(len1-start1, len2-start2);
4429 			final int overlapLength2=Tools.max(len1-start1, len2-start2);
4430 			int stop1=start1+overlapLength-1, stop2=start2+overlapLength-1;
4431 			if(verbose){System.err.println("Calculated stop1="+stop1+", stop2="+stop2+", overlapLength="+overlapLength);}
4432 
4433 			if(!allowAllContainedOverlaps || overlapLength>Tools.min(len1, len2)){
4434 				if(overlapLength<minOverlapCluster){return null;}
4435 				if(minOverlapPercentCluster>0f && (overlapLength*100f/Tools.min(len1, len2))<minOverlapPercentCluster){return null;}
4436 			}
4437 
4438 			final byte[] a=bases(), b=u2.bases();
4439 			assert(a!=null && b!=null) : "Null bases for "+code1+" or "+u2.code1;
4440 			int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, overlapLength);
4441 
4442 			if(verbose){
4443 				System.err.println("start1="+start1+", stop1="+stop1+", len1="+len1+", start2="+start2+", stop2="+stop2+", len2="+len2+
4444 						", overlapLen="+overlapLength+", maxMismatches="+maxMismatches+", maxEdits="+maxEdits);
4445 			}
4446 
4447 			for(int i=start1, j=start2; j<=stop2; i++, j++){
4448 				byte aa=a[i];
4449 				byte bb=b[j];
4450 				if(aa!=bb){
4451 					if(exact || (isFullyDefined(aa) && isFullyDefined(bb))){
4452 						if(earlyExit && j<k2){return null;}
4453 						mismatches++;
4454 						if(verbose){System.err.println("i="+i+", j="+j+", "+(char)aa+"!="+(char)bb+", mismatches="+mismatches+"/"+maxMismatches);}
4455 						if(mismatches>1 && bandy!=null){
4456 							if(maxEdits<1){return null;}
4457 							if(verbose){System.err.println("mismatches exceeded 1, attempting banded alignment.");}
4458 							int edits=bandy.alignForward(b, a, start2, start1, maxEdits, exact);
4459 							if(edits>maxEdits || bandy.score()<=4*edits){
4460 								if(verbose){System.err.println((edits>maxEdits ? "Too many edits" : "Alignment score too low")+"; returning null.");}
4461 								return null;
4462 							}
4463 							assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits;
4464 							stop2=bandy.lastQueryLoc;
4465 							stop1=bandy.lastRefLoc;
4466 //							if(bandy.lastOffset>0){//Ref longer than query
4467 //								for(int k=0; k<bandy.lastOffset; k++){
4468 //									if(stop1+1<=len1){stop1++;}
4469 //									else{stop2--;}//I don't think this can happen
4470 //								}
4471 //							}else if(bandy.lastOffset<0){//Query longer than ref
4472 //								for(int k=0; k>bandy.lastOffset; k--){
4473 //									if(stop2+1<=len2){stop2++;}
4474 //									else{stop1--;}
4475 //								}
4476 //							}
4477 							return new Overlap(this, u2, FORWARD, start1, start2, stop1, stop2, overlapLength, edits, edits, bandy);
4478 						}else if(mismatches>maxMismatches){return null;}
4479 					}
4480 				}
4481 			}
4482 			return new Overlap(this, u2, FORWARD, start1, start2, stop1, stop2, overlapLength, mismatches, 0, bandy);
4483 		}
4484 
makeOverlapReverse(Unit u2, int start1, BandedAligner bandy, boolean earlyExit)4485 		private Overlap makeOverlapReverse(Unit u2, int start1, BandedAligner bandy, boolean earlyExit) {
4486 			if(verbose){System.err.println("makeOverlapReverse(u1="+this.name()+", u2="+u2.name()+", start="+start1+", earlyExit="+earlyExit+")");}
4487 
4488 			final int len1=length(), len2=u2.length();
4489 			int start2=len2-1;
4490 			if(start1>=len1){
4491 				start2-=(start1-len1+1);
4492 				start1=len1-1;
4493 			}
4494 			final int overlapLength=Tools.min(start1+1, start2+1);
4495 			final int overlapLength2=Tools.max(start1+1, start2+1);
4496 			int stop1=start1-overlapLength+1, stop2=start2-overlapLength+1;
4497 			if(verbose){System.err.println("Calculated stop1="+stop1+", stop2="+stop2+", overlapLength="+overlapLength);}
4498 
4499 			if(!allowAllContainedOverlaps || overlapLength>Tools.min(len1, len2)){
4500 				if(overlapLength<minOverlapCluster){return null;}
4501 				if(minOverlapPercentCluster>0f && (overlapLength*100f/Tools.min(len1, len2))<minOverlapPercentCluster){return null;}
4502 			}
4503 
4504 			final byte[] a=bases(), b=u2.bases();
4505 			assert(a!=null && b!=null) : "Null bases for "+code1+" or "+u2.code1;
4506 			int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length);
4507 
4508 			if(verbose){
4509 				System.err.println("start1="+start1+", stop1="+stop1+", len1="+len1+", start2="+start2+", stop2="+stop2+", len2="+len2+
4510 						", overlapLen="+overlapLength+", maxMismatches="+maxMismatches+", maxEdits="+maxEdits);
4511 			}
4512 
4513 			for(int i=start1, j=start2, iprefix=start1-k2; i>=stop1; i--, j--){
4514 				byte aa=a[i];
4515 				byte bb=b[j];
4516 				if(aa!=bb){
4517 					if(exact || (isFullyDefined(aa) && isFullyDefined(bb))){
4518 						if(earlyExit && i>iprefix){return null;}
4519 						mismatches++;
4520 						if(verbose){System.err.println("i="+i+", j="+j+", "+(char)aa+"!="+(char)bb+", mismatches="+mismatches+"/"+maxMismatches);}
4521 						if(mismatches>1 && bandy!=null){
4522 							if(maxEdits<1){return null;}
4523 							if(verbose){System.err.println("mismatches exceeded 1, attempting banded alignment.");}
4524 							int edits=bandy.alignReverse(b, a, start2, start1, maxEdits, exact);
4525 							if(edits>maxEdits || bandy.score()<=4*edits){
4526 								if(verbose){System.err.println((edits>maxEdits ? "Too many edits" : "Alignment score too low")+"; returning null.");}
4527 								return null;
4528 							}
4529 							assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits;
4530 							stop2=bandy.lastQueryLoc;
4531 							stop1=bandy.lastRefLoc;
4532 //							if(bandy.lastOffset>0){//Ref longer than query
4533 //								for(int k=0; k<bandy.lastOffset; k++){
4534 //									if(stop1>0){stop1--;}
4535 //									else{stop2++;}//I don't think this can happen
4536 //								}
4537 //							}else if(bandy.lastOffset<0){//Query longer than ref
4538 //								for(int k=0; k>bandy.lastOffset; k--){
4539 //									if(stop2>0){stop2--;}
4540 //									else{stop1++;}
4541 //								}
4542 //							}
4543 							return new Overlap(this, u2, REVERSE, start1, start2, stop1, stop2, overlapLength, edits, edits, bandy);
4544 						}else if(mismatches>maxMismatches){return null;}
4545 					}
4546 				}
4547 			}
4548 			return new Overlap(this, u2, REVERSE, start1, start2, stop1, stop2, overlapLength, mismatches, 0, bandy);
4549 		}
4550 
4551 		@Override
compareTo(Unit b)4552 		public int compareTo(Unit b) {
4553 			int x=comparePairedRC(this, b);
4554 			return x;
4555 		}
4556 
4557 		@Override
equals(Object b)4558 		public boolean equals(Object b){return equals((Unit)b);}
equals(Unit b)4559 		public boolean equals(Unit b){
4560 			boolean x=pairedEqualsRC(this, b);
4561 			return x;
4562 		}
4563 
4564 		@Override
hashCode()4565 		public int hashCode(){
4566 			return (int)((code1^(code1>>>32))&0xFFFFFFFFL);
4567 		}
4568 
setValid(boolean b)4569 		private synchronized void setValid(boolean b){
4570 			assert(b!=valid());
4571 //			if(!b){System.err.println("Setting invalid "+name());}
4572 			if(b){flags&=~INVALID_MASK;}
4573 			else{flags|=INVALID_MASK;}
4574 			assert(b==valid());
4575 		}
4576 
setClustered(boolean b)4577 		private synchronized void setClustered(boolean b){
4578 			assert(b!=clustered());
4579 			if(b){flags|=CLUSTER_MASK;}
4580 			else{flags&=~CLUSTER_MASK;}
4581 			assert(b==clustered());
4582 		}
4583 
setVisited(boolean b)4584 		private void setVisited(boolean b){
4585 			assert(b!=visited());
4586 			if(b){flags|=VISIT_MASK;}
4587 			else{flags&=~VISIT_MASK;}
4588 			assert(b==visited());
4589 		}
4590 
setOffset(int x)4591 		private synchronized void setOffset(int x){
4592 			offset=x;
4593 			setOffsetValid(true);
4594 		}
4595 
setOffsetValid(boolean b)4596 		private synchronized void setOffsetValid(boolean b){
4597 			assert(!offsetValid());
4598 			if(b){flags|=OFFSET_VALID_MASK;}
4599 			else{flags&=~OFFSET_VALID_MASK;}
4600 			assert(b==offsetValid());
4601 		}
4602 
setOffsetContradiction(boolean b)4603 		private synchronized void setOffsetContradiction(boolean b){
4604 //			assert(b!=offsetContradiction());
4605 			assert(offsetValid());
4606 			if(b){flags|=OFFSET_CONTRADICTION_MASK;}
4607 			else{flags&=~OFFSET_CONTRADICTION_MASK;}
4608 			assert(b==offsetContradiction());
4609 		}
4610 
4611 		/** Return true if 'this' should be the first Unit in the overlap object */
firstInOverlap(Unit u2)4612 		public boolean firstInOverlap(Unit u2){
4613 			assert(this!=u2) : "\n"+this.r+"\n"+u2.r;
4614 			if(u2.length()!=length()){return u2.length()<length();}
4615 			if(u2.code1!=code1){return u2.code1<code1;}
4616 			if(u2.code2!=code2){return u2.code2<code2;}
4617 			int x=compareTo(u2);
4618 			assert(x!=0 || (r!=null && r.mate!=null));
4619 			if(x!=0){return x>=0;}
4620 			return r.numericID>=u2.r.numericID;
4621 		}
4622 
inSet()4623 		public final boolean inSet(){
4624 			if(subsetCount<2){return true;}
4625 			if(r.pairnum()>0){return ((Unit)r.mate.obj).inSet();}
4626 			return ((code1&Long.MAX_VALUE)%subsetCount)==subset;
4627 		}
4628 
bases()4629 		public byte[] bases(){return r==null ? null : r.bases;}
4630 
name()4631 		public String name(){return r!=null ? r.id : null /*code+""*/;}
4632 		@Override
toString()4633 		public String toString(){return "("+name()+","+code1+","+code2+","+length()+","+prefix1+","+suffix1+","+"c"+",d="+depth+")";}
4634 
4635 
4636 		public final Read r;
4637 		public final long code1;
4638 		public final long code2;
4639 		public long prefix1=-1;
4640 		public long suffix1=-1;
4641 		public long prefix2=-1;
4642 		public long suffix2=-1;
4643 		/** Distance of leftmost side of this read relative to leftmost side of root.
4644 		 * Assumes everything is in 'forward' orientation. */
4645 		public int offset=-999999999;
4646 		public int depth=1;
4647 //		private boolean valid=true;
4648 
4649 		public int unitID;
4650 
4651 		public ArrayList<Overlap> overlapList;
4652 
4653 		private long flags;
4654 		/** True if this contig should be output, false if not */
valid()4655 		public final boolean valid(){return (INVALID_MASK&flags)==0;}
4656 		/** Length of this contig */
length()4657 		public final int length(){return (int)(LEN_MASK&flags);}
4658 		/** Position of this contig relative to root */
offset()4659 		public final int offset(){
4660 			assert(offsetValid());
4661 			return offset;
4662 		}
pairnum()4663 		public int pairnum(){return (PAIRNUM_MASK&flags)==PAIRNUM_MASK ? 1 : 0;}
4664 
clearVolatileFlags()4665 		public void clearVolatileFlags(){
4666 			flags=flags&~(VISIT_MASK|OFFSET_VALID_MASK|OFFSET_CONTRADICTION_MASK);
4667 			assert(!visited());
4668 			assert(!offsetValid());
4669 			assert(!offsetContradiction());
4670 		}
4671 
visited()4672 		public boolean visited(){return (VISIT_MASK&flags)==VISIT_MASK;}
clustered()4673 		public boolean clustered(){return (CLUSTER_MASK&flags)==CLUSTER_MASK;}
offsetValid()4674 		public boolean offsetValid(){return (OFFSET_VALID_MASK&flags)==OFFSET_VALID_MASK;}
offsetContradiction()4675 		public boolean offsetContradiction(){return (OFFSET_CONTRADICTION_MASK&flags)==OFFSET_CONTRADICTION_MASK;}
contradiction()4676 		public boolean contradiction(){return offsetContradiction();}
4677 
4678 		private static final long LEN_MASK=0x7FFFFFFFL;
4679 //		private static final long CANON_MASK=(1L<<33);
4680 		private static final long INVALID_MASK=(1L<<34);
4681 		private static final long VISIT_MASK=(1L<<35);
4682 		private static final long CLUSTER_MASK=(1L<<36);
4683 //		private static final long CANONICIZED_MASK=(1L<<37);
4684 //		private static final long CANON_CONTRADICTION_MASK=(1L<<38);
4685 		private static final long OFFSET_VALID_MASK=(1L<<39);
4686 		private static final long OFFSET_CONTRADICTION_MASK=(1L<<40);
4687 		private static final long PAIRNUM_MASK=(1L<<41);
4688 	}
4689 
4690 	private static final class UnitOffsetComparator implements Comparator<Unit> {
4691 
UnitOffsetComparator()4692 		UnitOffsetComparator(){}
4693 
4694 		/* (non-Javadoc)
4695 		 * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object)
4696 		 */
4697 		@Override
compare(Unit a, Unit b)4698 		public int compare(Unit a, Unit b) {
4699 			if(a.offsetValid() && b.offsetValid()){
4700 				int x=a.offset()-b.offset();
4701 				if(x!=0){return x;}
4702 			}else{
4703 				if(a.offsetValid()){return -1;}
4704 				if(b.offsetValid()){return 1;}
4705 			}
4706 			return a.compareTo(b);
4707 		}
4708 
4709 	}
4710 
4711 	private static final class ClusterLengthComparator implements Comparator<ArrayList<Unit>> {
4712 
ClusterLengthComparator()4713 		ClusterLengthComparator(){}
4714 
4715 		/* (non-Javadoc)
4716 		 * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object)
4717 		 */
4718 		@Override
compare(ArrayList<Unit> a, ArrayList<Unit> b)4719 		public int compare(ArrayList<Unit> a, ArrayList<Unit> b) {
4720 			if(a.size()!=b.size()){return b.size()-a.size();}
4721 			if(a.isEmpty() && b.isEmpty()){return 0;}
4722 			return a.get(0).compareTo(b.get(0));
4723 		}
4724 
4725 	}
4726 
4727 	/** Makes a nmer (e.g., tetramer) profile of a cluster */
makeNmerProfile(ArrayList<Unit> alu, long[] array_)4728 	private final float[] makeNmerProfile(ArrayList<Unit> alu, long[] array_){
4729 		final int nbits=bitsPerBase*nmerLength;
4730 		final long[] array=(array_==null ? new long[maxNmer+1] : array_);
4731 		final int mask=~((-1)<<(nbits));
4732 
4733 		long keysCounted=0;
4734 
4735 		for(Unit u : alu){
4736 			byte[] bases=u.r.bases;
4737 			int len=0;
4738 			int kmer=0;
4739 			for(byte b : bases){
4740 				int x=AminoAcid.baseToNumber[b];
4741 				if(x<0){
4742 					len=0;
4743 					kmer=0;
4744 				}else{
4745 					kmer=((kmer<<bitsPerBase)|x)&mask;
4746 					len++;
4747 					if(len>=nmerLength){
4748 						keysCounted++;
4749 						array[kmer]++;
4750 					}
4751 				}
4752 			}
4753 		}
4754 
4755 		if(keysCounted==0){keysCounted=1;}
4756 		final float mult=1f/keysCounted;
4757 
4758 		float[] r=new float[array.length];
4759 		for(int i=0; i<array.length; i++){
4760 			r[i]=array[i]*mult;
4761 			array[i]=0;
4762 		}
4763 		return r;
4764 	}
4765 
isFullyDefined(byte b)4766 	private static boolean isFullyDefined(byte b){
4767 		return AminoAcid.acidToNumber[b]>=0;
4768 	}
4769 
4770 	private ConcurrentReadInputStream crisa[];
4771 
4772 	private final ByteStreamWriter dupeWriter;
4773 
4774 	private String[] in1=null;
4775 	private String[] in2=null;
4776 	private String out=null;
4777 	private String clusterFilePattern=null;
4778 	private String outbest=null;
4779 	private String outdupe=null;
4780 	private String outcsf=null;
4781 	private String outgraph=null;
4782 	private int maxNs=-1;
4783 	private long maxReads=-1;
4784 	public boolean errorState=false;
4785 	boolean sort=false;
4786 //	boolean ascending=true;
4787 	boolean absorbContainment=true;
4788 	boolean absorbMatch=true;
4789 	boolean findOverlaps=false;
4790 	boolean makeClusters=false;
4791 	boolean processClusters=false;
4792 	boolean renameClusters=false;
4793 	boolean absorbOverlap=false;
4794 	boolean storeSuffix=false;
4795 	boolean storeName=true;
4796 	boolean storeQuality=true;
4797 	boolean exact=true;
4798 	boolean uniqueNames=true;
4799 	boolean maxSpanningTree=false;
4800 
4801 	boolean removeCycles=true;
4802 	boolean fixMultiJoins=true;
4803 	boolean fixOffsetContradictions=false;
4804 	boolean countTransitive=false;
4805 	boolean countRedundant=false;
4806 
4807 	private boolean multipleInputFiles=false;
4808 	private boolean rigorousTransitive=false;
4809 	private int numAffixMaps=1;
4810 	private int maxAffixCopies=2000000000;
4811 	private int maxEdges=2000000000;
4812 	private int maxEdges2=2000000000;
4813 	private boolean allowAllContainedOverlaps=false;
4814 //	private boolean toUpperCase=false;
4815 
4816 	/** Trim left bases of the read to this position (exclusive, 0-based) */
4817 	private int forceTrimLeft=-1;
4818 	/** Trim right bases of the read after this position (exclusive, 0-based) */
4819 	private int forceTrimRight=-1;
4820 
4821 	private boolean qTrimLeft=false;
4822 	private boolean qTrimRight=false;
4823 	private float trimQ=6;
4824 	/** Error rate for trimming (derived from trimq) */
4825 	private final float trimE;
4826 
4827 	int maxEdits=0;
4828 	int maxSubs=0;
4829 	int bandwidth=9;
4830 	final boolean customBandwidth;
4831 	float minIdentity=100;
4832 	float minIdentityMult=0;
4833 	float minLengthPercent=0;
4834 	int minOverlapCluster=100;
4835 	int minOverlapMerge=1;
4836 	float minOverlapPercentCluster=0;
4837 	float minOverlapPercentMerge=0;
4838 
4839 	private int minClusterSize=1;
4840 	private int minClusterSizeForStats=1;
4841 	private boolean pickBestRepresentativePerCluster=false;
4842 
4843 	long readsProcessed=0;
4844 	long basesProcessed=0;
4845 	long collisions=0;
4846 	long containments=0;
4847 	long baseContainments=0;
4848 	long containmentCollisions=0;
4849 	long matches=0;
4850 	long baseMatches=0;
4851 	long overlaps=0;
4852 	long baseOverlaps=0;
4853 	long overlapCollisions=0;
4854 	long addedToMain=0;
4855 
4856 	private final int subset;
4857 	private final int subsetCount;
4858 	private final boolean subsetMode;
4859 
4860 	private final int k;
4861 	private final int k2;
4862 	private final boolean EA=Shared.EA();
4863 
4864 //	static final boolean amino=true;
4865 	static final int bitsPerBase=5; //amino
4866 //	static final boolean ignoreReverseComplement=true;
4867 
4868 	/*--------------------------------------------------------------*/
4869 	/*----------------         Static Fields        ----------------*/
4870 	/*--------------------------------------------------------------*/
4871 
4872 	private static ReadComparator comparator=ReadLengthComparator.comparator;
4873 
4874 	private static int tcount=0;
4875 
4876 	private LinkedHashMap<Long, ArrayList<Unit>> codeMap=new LinkedHashMap<Long, ArrayList<Unit>>(4000000);
4877 	private HashMap<LongM, ArrayList<Unit>> affixMap1=null;
4878 	private HashMap<LongM, ArrayList<Unit>> affixMap2=null;
4879 	private HashMap<LongM, ArrayList<Unit>>[] affixMaps=null;
4880 	private ArrayDeque<ArrayList<Unit>> clusterQueue=null;
4881 	private ArrayList<ArrayList<Unit>> processedClusters=null;
4882 	private AtomicIntegerArray clusterNumbers=null;
4883 
4884 	private static final UnitOffsetComparator UNIT_OFFSET_COMPARATOR=new UnitOffsetComparator();
4885 	private static final ClusterLengthComparator CLUSTER_LENGTH_COMPARATOR=new ClusterLengthComparator();
4886 	private static final long[][] hashcodes=makeCodes2(32, 1);
4887 	private static final long[][] hashcodesR=makeCodes2(32, 2);
4888 
4889 	public static final byte[] acidToNumber=new byte[128];
4890 
4891 	public static final int nmerLength=4;
4892 	public static final int maxNmer=((int) Math.pow(20, nmerLength))-1;
4893 	private static PrintStream outstream=System.err;
4894 	/** Permission to overwrite existing files */
4895 	public static boolean overwrite=false;
4896 	/** Permission to append to existing files */
4897 	public static boolean append=false;
4898 	public static boolean showSpeed=true;
4899 	public static boolean verbose=false;
4900 	public static boolean preventTransitiveOverlaps=false;
4901 	public static boolean ignoreAffix1=false;
4902 	public static boolean parseDepth=false;
4903 	public static boolean printLengthInEdges=false;
4904 	public static float depthRatio=2;
4905 	public static int MINSCAF=0;
4906 	public static int THREADS=Shared.threads();
4907 	public static int threadMaxReadsToBuffer=4000;
4908 	public static int threadMaxBasesToBuffer=32000000;
4909 	public static boolean DISPLAY_PROGRESS=true;
4910 	public static boolean UNIQUE_ONLY=false;
4911 	public static boolean REQUIRE_MATCHING_NAMES=false;
4912 	public static boolean NUMBER_GRAPH_NODES=true;
4913 	public static boolean ADD_PAIRNUM_TO_NAME=true;
4914 	public static boolean HASH_NS=false;
4915 
reverseType(int type)4916 	private static int reverseType(int type){return (type+2)%4;}
4917 	public static final int FORWARD=0;
4918 	public static final int REVERSE=2;
4919 	public static final String[] OVERLAP_TYPE_NAMES=new String[] {"FORWARD", "FORWARDRC", "REVERSE", "REVERSERC"};
4920 	public static final String[] OVERLAP_TYPE_ABBREVIATIONS=new String[] {"F", "FRC", "R", "RRC"};
4921 
4922 	static{//All others are 0
4923 		for(int i=0; i<acidToNumber.length; i++){
4924 			byte b=AminoAcid.acidToNumber[i];
4925 			acidToNumber[i]=(b<0 ? 21 : b);
4926 		}
4927 	}
4928 
4929 }
4930