1 package jgi; 2 3 import java.io.File; 4 import java.io.PrintStream; 5 import java.io.Serializable; 6 import java.util.ArrayDeque; 7 import java.util.ArrayList; 8 import java.util.Arrays; 9 import java.util.Comparator; 10 import java.util.HashMap; 11 import java.util.HashSet; 12 import java.util.LinkedHashMap; 13 import java.util.Locale; 14 import java.util.PriorityQueue; 15 import java.util.Random; 16 import java.util.concurrent.atomic.AtomicIntegerArray; 17 18 import align2.BandedAligner; 19 import dna.AminoAcid; 20 import fileIO.ByteFile1; 21 import fileIO.ByteFile2; 22 import fileIO.ByteStreamWriter; 23 import fileIO.FileFormat; 24 import fileIO.ReadWrite; 25 import fileIO.TextStreamWriter; 26 import shared.Parse; 27 import shared.Parser; 28 import shared.PreParser; 29 import shared.ReadStats; 30 import shared.Shared; 31 import shared.Timer; 32 import shared.Tools; 33 import shared.TrimRead; 34 import sort.ReadComparator; 35 import sort.ReadComparatorID; 36 import sort.ReadComparatorName; 37 import sort.ReadLengthComparator; 38 import sort.ReadQualityComparator; 39 import stream.ConcurrentCollectionReadInputStream; 40 import stream.ConcurrentGenericReadInputStream; 41 import stream.ConcurrentReadInputStream; 42 import stream.ConcurrentReadOutputStream; 43 import stream.FASTQ; 44 import stream.FastaReadInputStream; 45 import stream.FastqReadInputStream; 46 import stream.Read; 47 import structures.ListNum; 48 import structures.LongM; 49 50 /** 51 * @author Brian Bushnell 52 * @date Nov 30, 2017 53 * 54 */ 55 public final class DedupeProtein { 56 main(String[] args)57 public static void main(String[] args){ 58 int rbl0=Shared.bufferLen();; 59 Shared.setBufferLen(16); 60 61 DedupeProtein dd=new DedupeProtein(args); 62 dd.process(); 63 64 Shared.setBufferLen(rbl0); 65 66 //Close the print stream if it was redirected 67 Shared.closeStream(outstream); 68 } 69 DedupeProtein(String[] args)70 public DedupeProtein(String[] args){ 71 72 {//Preparse block for help, config files, and outstream 73 PreParser pp=new PreParser(args, getClass(), true); 74 args=pp.args; 75 outstream=pp.outstream; 76 } 77 78 ReadWrite.ZIPLEVEL=2; 79 ReadWrite.USE_UNPIGZ=ReadWrite.USE_PIGZ=true; 80 ReadWrite.MAX_ZIP_THREADS=Shared.threads(); 81 Read.TO_UPPER_CASE=true; 82 83 boolean setMcsfs=false; 84 int bandwidth_=-1; 85 int k_=31; 86 int subset_=0, subsetCount_=1; 87 boolean ascending=false; 88 Parser parser=new Parser(); 89 90 for(int i=0; i<args.length; i++){ 91 92 final String arg=args[i]; 93 String[] split=arg.split("="); 94 String a=split[0].toLowerCase(); 95 String b=split.length>1 ? split[1] : null; 96 97 if(Parser.parseZip(arg, a, b)){ 98 //do nothing 99 }else if(Parser.parseCommonStatic(arg, a, b)){ 100 //do nothing 101 }else if(Parser.parseQuality(arg, a, b)){ 102 //do nothing 103 }else if(Parser.parseFasta(arg, a, b)){ 104 //do nothing 105 }else if(parser.parseQTrim(arg, a, b)){ 106 //do nothing 107 }else if(parser.parseInterleaved(arg, a, b)){ 108 //do nothing 109 }else if(a.equals("in") || a.equals("in1")){ 110 assert(b!=null) : "Bad parameter: "+arg; 111 if(b.indexOf(',')>=0 && !new File(b).exists()){ 112 in1=b.split(","); 113 }else{ 114 in1=new String[] {b}; 115 } 116 }else if(a.equals("in2")){ 117 assert(b!=null) : "Bad parameter: "+arg; 118 if(b.indexOf(',')>=0 && !new File(b).exists()){ 119 in2=b.split(","); 120 }else{ 121 in2=new String[] {b}; 122 } 123 }else if(a.equals("out") || a.equals("out")){ 124 out=b; 125 }else if(a.equals("out2")){ 126 throw new RuntimeException("Dedupe does not allow 'out2'; for paired reads, output is interleaved."); 127 }else if(a.equals("clusterfilepattern") || a.equals("pattern")){ 128 clusterFilePattern=b; 129 assert(clusterFilePattern==null || clusterFilePattern.contains("%")) : "pattern must contain the % symbol."; 130 }else if(a.equals("outbest")){ 131 outbest=b; 132 }else if(a.equals("outd") || a.equals("outduplicate")){ 133 outdupe=b; 134 }else if(a.equals("csf") || a.equals("clusterstatsfile")){ 135 outcsf=b; 136 }else if(a.equals("dot") || a.equals("graph") || a.equals("outdot") || a.equals("outgraph")){ 137 outgraph=b; 138 }else if(a.equals("mcsfs") || a.equals("minclustersizeforstats")){ 139 minClusterSizeForStats=Integer.parseInt(b); 140 }else if(a.equals("mcs") || a.equals("minclustersize")){ 141 minClusterSize=Integer.parseInt(b); 142 if(!setMcsfs){ 143 minClusterSizeForStats=minClusterSize; 144 } 145 }else if(a.equals("append") || a.equals("app")){ 146 append=ReadStats.append=Parse.parseBoolean(b); 147 }else if(a.equals("overwrite") || a.equals("ow")){ 148 overwrite=Parse.parseBoolean(b); 149 }else if(a.equals("sort")){ 150 if(b==null){sort=true;} 151 else if(b.equalsIgnoreCase("a") || b.equalsIgnoreCase("ascending")){ 152 sort=true; 153 ascending=true; 154 }else if(b.equalsIgnoreCase("d") || b.equalsIgnoreCase("descending")){ 155 sort=true; 156 ascending=false; 157 }else if(b.equalsIgnoreCase("length")){ 158 sort=true; 159 comparator=ReadLengthComparator.comparator; 160 }else if(b.equalsIgnoreCase("quality")){ 161 sort=true; 162 comparator=ReadQualityComparator.comparator; 163 }else if(b.equalsIgnoreCase("name")){ 164 sort=true; 165 comparator=ReadComparatorName.comparator; 166 }else if(b.equalsIgnoreCase("id")){ 167 sort=true; 168 comparator=ReadComparatorID.comparator; 169 }else{ 170 sort=Parse.parseBoolean(b); 171 } 172 }else if(a.equals("ascending")){ 173 ascending=Parse.parseBoolean(b); 174 }else if(a.equals("ordered")){ 175 boolean x=Parse.parseBoolean(b); 176 if(x){ 177 ascending=true; 178 sort=true; 179 comparator=ReadComparatorID.comparator; 180 }else{ 181 //do nothing 182 } 183 }else if(a.equals("ac") || a.equals("absorbcontainment") || a.equals("absorbcontainments") || a.equals("tc") || a.equals("testcontainment") || a.equals("containment")){ 184 absorbContainment=Parse.parseBoolean(b); 185 }else if(a.equals("am") || a.equals("absorbmatch") || a.equals("absorbmatches") || a.equals("tm") || a.equals("testmatch")){ 186 absorbMatch=Parse.parseBoolean(b); 187 }else if(a.equals("ao") || a.equals("absorboverlap") || a.equals("absorboverlaps") || a.equals("to") || a.equals("testoverlap")){ 188 absorbOverlap=Parse.parseBoolean(b); 189 }else if(a.equals("fo") || a.equals("findoverlap") || a.equals("findoverlaps")){ 190 findOverlaps=Parse.parseBoolean(b); 191 }else if(a.equals("c") || a.equals("cluster") || a.equals("clusters")){ 192 makeClusters=Parse.parseBoolean(b); 193 }else if(a.equals("fmj") || a.equals("fixmultijoin") || a.equals("fixmultijoins")){ 194 fixMultiJoins=Parse.parseBoolean(b); 195 }else if(a.equals("foc") || a.equals("fixoffsetcontradiction") || a.equals("fixoffsetcontradictions")){ 196 fixOffsetContradictions=Parse.parseBoolean(b); 197 }else if(a.equals("pto") || a.equals("preventtransitiveoverlap") || a.equals("preventtransitiveoverlaps")){ 198 preventTransitiveOverlaps=Parse.parseBoolean(b); 199 }else if(a.equals("pbr") || a.equals("pickbestrepresentative")){ 200 pickBestRepresentativePerCluster=Parse.parseBoolean(b); 201 }else if(a.equals("mst") || a.equals("maxspanningtree")){ 202 maxSpanningTree=Parse.parseBoolean(b); 203 }else if(a.equals("pc") || a.equals("processcluster") || a.equals("processclusters")){ 204 processClusters=Parse.parseBoolean(b); 205 }else if(a.equals("rnc") || a.equals("renamecluster") || a.equals("renameclusters")){ 206 renameClusters=Parse.parseBoolean(b); 207 if(renameClusters){storeName=false;} 208 }else if(a.equals("rc") || a.equals("removecycles") || a.equals("removecycle")){ 209 removeCycles=Parse.parseBoolean(b); 210 }else if(a.equals("uo") || a.equals("uniqueonly")){ 211 UNIQUE_ONLY=Parse.parseBoolean(b); 212 }else if(a.equals("rmn") || a.equals("requirematchingnames")){ 213 REQUIRE_MATCHING_NAMES=Parse.parseBoolean(b); 214 }else if(a.equals("ngn") || a.equals("numbergraphnodes")){ 215 NUMBER_GRAPH_NODES=Parse.parseBoolean(b); 216 }else if(a.equals("addpairnum")){ 217 ADD_PAIRNUM_TO_NAME=Parse.parseBoolean(b); 218 }else if(a.equals("hashns")){ 219 HASH_NS=Parse.parseBoolean(b); 220 }else if(a.equals("pn") || a.equals("prefixname")){ 221 // PREFIX_NAME=Parse.parseBoolean(b); 222 }else if(a.equals("k")){ 223 k_=Integer.parseInt(b); 224 assert(k_>0 && k_<11) : "k must be between 1 and 10; default is 10, and lower values are slower."; 225 }else if(a.equals("minscaf") || a.equals("ms")){ 226 MINSCAF=FastaReadInputStream.MIN_READ_LEN=Integer.parseInt(b); 227 }else if(a.equals("mlp") || a.equals("minlengthpercent")){ 228 minLengthPercent=Float.parseFloat(b); 229 }else if(a.equals("mop") || a.equals("minoverlappercent")){ 230 minOverlapPercentCluster=minOverlapPercentMerge=Float.parseFloat(b); 231 }else if(a.equals("mopc") || a.equals("minoverlappercentcluster")){ 232 minOverlapPercentCluster=Float.parseFloat(b); 233 }else if(a.equals("mopm") || a.equals("minoverlappercentmerge")){ 234 minOverlapPercentMerge=Float.parseFloat(b); 235 }else if(a.equals("mo") || a.equals("minoverlap")){ 236 minOverlapCluster=minOverlapMerge=Integer.parseInt(b); 237 }else if(a.equals("moc") || a.equals("minoverlapcluster")){ 238 minOverlapCluster=Integer.parseInt(b); 239 }else if(a.equals("mom") || a.equals("minoverlapmerge")){ 240 minOverlapMerge=Integer.parseInt(b); 241 }else if(a.equals("rt") || a.equals("rigoroustransitive")){ 242 rigorousTransitive=Parse.parseBoolean(b); 243 }else if(a.equals("e") || a.equals("maxedits") || a.equals("edits") || a.equals("edist")){ 244 maxEdits=Integer.parseInt(b); 245 }else if(a.equals("s") || a.equals("maxsubs") || a.equals("maxsubstitutions") || a.equals("hdist")){ 246 maxSubs=Integer.parseInt(b); 247 }else if(a.equals("bw") || a.equals("bandwidth")){ 248 bandwidth_=Integer.parseInt(b); 249 }else if(a.equals("mid") || a.equals("minidentity")){ 250 minIdentity=Float.parseFloat(b); 251 minIdentityMult=(minIdentity==100f ? 0 : (100f-minIdentity)/100f); 252 }else if(a.equals("threads") || a.equals("t")){ 253 THREADS=(b==null || b.equalsIgnoreCase("auto") ? Shared.threads() : Integer.parseInt(b)); 254 }else if(a.equals("showspeed") || a.equals("ss")){ 255 showSpeed=Parse.parseBoolean(b); 256 }else if(a.equals("verbose")){ 257 verbose=Parse.parseBoolean(b); 258 // BandedAligner.verbose=verbose; 259 }else if(a.equals("contigbreak") || (arg.contains("=") && (a.equals("n") || a.equals("-n")))){ 260 maxNs=Integer.parseInt(b); 261 }else if(a.equals("reads") || a.startsWith("maxreads")){ 262 maxReads=Parse.parseKMG(b); 263 }else if(a.equals("sn") || a.equals("storename") || a.equals("storenames") || a.equals("keepnames")){ 264 storeName=Parse.parseBoolean(b); 265 }else if(a.equals("ssx") || a.equals("storesuffix") || a.equals("storesuffixes")){ 266 storeSuffix=Parse.parseBoolean(b); 267 }else if(a.equals("numaffixmaps") || a.equals("nam")){ 268 numAffixMaps=Integer.parseInt(b); 269 }else if(a.equals("amino") || a.equals("protein")){ 270 //do nothing 271 }else if(a.equals("mac") || a.equals("maxaffixcopies")){ 272 maxAffixCopies=Integer.parseInt(b); 273 }else if(a.equals("me") || a.equals("maxedges")){ 274 maxEdges=Integer.parseInt(b); 275 maxEdges2=maxEdges*2; 276 if(maxEdges2<1){maxEdges2=Integer.MAX_VALUE-1;} 277 }else if(a.equals("ignoreaffix1") || a.equals("ia1")){ 278 ignoreAffix1=Parse.parseBoolean(b); 279 }else if(a.equals("parsedepth") || a.equals("pd")){ 280 parseDepth=Parse.parseBoolean(b); 281 }else if(a.equals("printlengthinedges") || a.equals("ple")){ 282 printLengthInEdges=Parse.parseBoolean(b); 283 }else if(a.equals("depthmult") || a.equals("depthratio") || a.equals("dr")){ 284 depthRatio=Float.parseFloat(b); 285 if(depthRatio<=0){ 286 parseDepth=false; 287 }else{ 288 parseDepth=true; 289 assert(depthRatio>0); 290 if(depthRatio<1){depthRatio=1/depthRatio;} 291 } 292 }else if(a.equals("storequality") || a.equals("sq")){ 293 storeQuality=Parse.parseBoolean(b); 294 }else if(a.equals("exact") || a.equals("ex")){ 295 exact=Parse.parseBoolean(b); 296 }else if(a.equals("uniquenames") || a.equals("un")){ 297 uniqueNames=Parse.parseBoolean(b); 298 }else if(a.equals("ftl") || a.equals("forcetrimleft")){ 299 forceTrimLeft=Integer.parseInt(b); 300 }else if(a.equals("ftr") || a.equals("forcetrimright")){ 301 forceTrimRight=Integer.parseInt(b); 302 }else if(a.equals("subset") || a.equals("sst")){ 303 subset_=Integer.parseInt(b); 304 }else if(a.equals("subsets") || a.equals("subsetcount") || a.equals("sstc")){ 305 subsetCount_=Integer.parseInt(b); 306 }else if(i==0 && in1==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){ 307 String c=args[i]; 308 if(c.indexOf(',')>=0 && !new File(c).exists()){ 309 in1=c.split(","); 310 }else{ 311 in1=new String[] {c}; 312 } 313 }else if(i==1 && out==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){ 314 out=args[i]; 315 }else{ 316 throw new RuntimeException("Unknown parameter "+args[i]); 317 } 318 } 319 320 comparator.setAscending(ascending); 321 322 {//Process parser fields 323 Parser.processQuality(); 324 qTrimLeft=parser.qtrimLeft; 325 qTrimRight=parser.qtrimRight; 326 trimQ=parser.trimq; 327 trimE=parser.trimE(); 328 } 329 330 if(verbose){ 331 ReadWrite.verbose=ConcurrentGenericReadInputStream.verbose=ConcurrentReadOutputStream.verbose=ByteFile1.verbose=ByteFile2.verbose=FastqReadInputStream.verbose=true; 332 } 333 334 k=k_; 335 k2=k-1; 336 subset=subset_; 337 subsetCount=subsetCount_; 338 subsetMode=subsetCount>1; 339 assert(subset>=0 && subset<subsetCount) : "subset="+subset+", subsetCount="+subsetCount; 340 341 BandedAligner.penalizeOffCenter=true; 342 343 if(maxSpanningTree){removeCycles=fixMultiJoins=false;} 344 if(absorbOverlap){processClusters=true;} 345 if(processClusters || renameClusters || maxSpanningTree){makeClusters=true;} 346 if(makeClusters){findOverlaps=true;} 347 if(renameClusters){uniqueNames=/*storeName=*/false;} 348 349 if(bandwidth_>-1){ 350 bandwidth=Tools.min(bandwidth_, 2*maxEdits+1); 351 customBandwidth=(bandwidth<2*maxEdits+1); 352 }else{ 353 bandwidth=2*maxEdits+1; 354 customBandwidth=false; 355 } 356 maxSubs=Tools.max(maxSubs, maxEdits); 357 if(maxSubs>0 || minIdentity<100 || findOverlaps){storeSuffix=true;} 358 359 assert(FastaReadInputStream.settingsOK()); 360 361 if(in1==null){throw new RuntimeException("Error - at least one input file is required.");} 362 363 for(int i=0; i<in1.length; i++){ 364 if(in1[i].equalsIgnoreCase("stdin") && !new File(in1[i]).exists()){in1[i]="stdin.fa";} 365 } 366 367 // assert(false) : Arrays.toString(in); 368 369 // if(!setOut && clusterFilePattern==null){out="stdout.fa";} 370 // else 371 // if("stdout".equalsIgnoreCase(out) || "standarddout".equalsIgnoreCase(out)){ 372 // out="stdout.fa"; 373 // outstream=System.err; 374 // } 375 if(!Tools.canWrite(out, overwrite)){throw new RuntimeException("Output file "+out+" already exists, and overwrite="+overwrite);} 376 377 for(int i=0; i<in1.length; i++){ 378 assert(!in1[i].equalsIgnoreCase(out)); 379 } 380 // assert(false) : "\nabsorbContainment="+absorbContainment+", findOverlaps="+findOverlaps+", absorbOverlap="+absorbOverlap+"\n"+ 381 // "processClusters="+processClusters+", renameClusters="+renameClusters+", makeClusters="+makeClusters+", uniqueNames="+uniqueNames+"\n"+ 382 // "storeName="+storeName+", DISPLAY_PROGRESS="+DISPLAY_PROGRESS+", removeCycles="+removeCycles; 383 if(absorbContainment || findOverlaps){ 384 // assert(false); 385 affixMaps=new HashMap[numAffixMaps]; 386 for(int i=0; i<numAffixMaps; i++){ 387 affixMaps[i]=new HashMap<LongM, ArrayList<Unit>>(4000000); 388 } 389 if(affixMaps.length>0){affixMap1=affixMaps[0];} 390 if(affixMaps.length>1){affixMap2=affixMaps[1];} 391 } 392 // assert(false) : absorbContainment+", "+(affixMap==null); 393 394 if(outdupe==null){ 395 dupeWriter=null; 396 }else{ 397 FileFormat ff=FileFormat.testOutput(outdupe, FileFormat.FASTA, null, true, overwrite, append, false); 398 dupeWriter=new ByteStreamWriter(ff); 399 } 400 } 401 process()402 public void process(){ 403 404 Timer t=new Timer(); 405 406 boolean dq0=FASTQ.DETECT_QUALITY; 407 boolean ti0=FASTQ.TEST_INTERLEAVED; 408 409 process2(); 410 411 FASTQ.DETECT_QUALITY=dq0; 412 FASTQ.TEST_INTERLEAVED=ti0; 413 414 t.stop(); 415 416 if(showSpeed){ 417 outstream.println(Tools.timeReadsBasesProcessed(t, readsProcessed, basesProcessed, 8)); 418 } 419 420 if(errorState){ 421 throw new RuntimeException("Dedupe terminated in an error state; the output may be corrupt."); 422 } 423 } 424 process2()425 public void process2(){ 426 if(dupeWriter!=null){dupeWriter.start();} 427 // assert(false) : out; 428 Timer t=new Timer(); 429 430 if(DISPLAY_PROGRESS){ 431 outstream.println("Initial:"); 432 Shared.printMemory(); 433 outstream.println(); 434 } 435 436 processMatches(t); 437 438 forceTrimLeft=forceTrimRight=-1; 439 qTrimLeft=qTrimRight=false; 440 441 442 if(absorbContainment){ 443 processContainments(t); 444 } 445 446 if(dupeWriter!=null){dupeWriter.poisonAndWait();} 447 448 if(findOverlaps){ 449 findOverlaps(t); 450 451 killAffixMaps(); 452 453 if(processClusters || renameClusters || maxSpanningTree){codeMap=null;} 454 455 if(maxSpanningTree){ 456 processMst(t); 457 } 458 459 if(processClusters){ 460 processClusters(t, absorbOverlap); 461 } 462 // if(renameClusters){ 463 // renameClusters(t); 464 // } 465 // assert(false) : (codeMap==null)+", "+(affixMap1==null)+", "+processedClusters; 466 } 467 468 outstream.println("Input: \t"+readsProcessed+" reads \t\t"+basesProcessed+" bases."); 469 470 if(absorbMatch){ 471 outstream.println("Duplicates: \t"+matches+" reads ("+String.format(Locale.ROOT, "%.2f",matches*100.0/readsProcessed)+"%) \t"+ 472 baseMatches+" bases ("+String.format(Locale.ROOT, "%.2f",baseMatches*100.0/basesProcessed)+"%) \t"+collisions+" collisions."); 473 } 474 if(absorbContainment){ 475 outstream.println("Containments: \t"+containments+" reads ("+String.format(Locale.ROOT, "%.2f",containments*100.0/readsProcessed)+"%) \t"+ 476 baseContainments+" bases ("+String.format(Locale.ROOT, "%.2f",baseContainments*100.0/basesProcessed)+"%) \t"+containmentCollisions+" collisions."); 477 } 478 if(findOverlaps){ 479 outstream.println("Overlaps: \t"+overlaps+" reads ("+String.format(Locale.ROOT, "%.2f",overlaps*100.0/readsProcessed)+"%) \t"+ 480 baseOverlaps+" bases ("+String.format(Locale.ROOT, "%.2f",baseOverlaps*100.0/basesProcessed)+"%) \t"+overlapCollisions+" collisions."); 481 } 482 // outstream.println("Result: \t"+(addedToMain-containments)+" reads \t\t"+(basesProcessed-baseMatches-baseContainments)+" bases."); 483 484 long outReads=(addedToMain-containments); 485 if(UNIQUE_ONLY){outReads=readsProcessed-matches-containments;} 486 long outBases=(basesProcessed-baseMatches-baseContainments); 487 outstream.println("Result: \t"+outReads+" reads ("+String.format(Locale.ROOT, "%.2f",outReads*100.0/readsProcessed)+"%) \t"+ 488 outBases+" bases ("+String.format(Locale.ROOT, "%.2f",outBases*100.0/basesProcessed)+"%)"); 489 490 outstream.println(""); 491 492 if(out!=null || clusterFilePattern!=null || outbest!=null || outgraph!=null || outcsf!=null){ 493 writeOutput(outcsf, t); 494 } 495 496 } 497 killAffixMaps()498 private void killAffixMaps(){ 499 if(affixMaps==null){return;} 500 for(int i=0; i<numAffixMaps; i++){ 501 if(affixMaps[i]!=null){affixMaps[i].clear();} 502 affixMaps[i]=null; 503 } 504 affixMap1=null; 505 affixMap2=null; 506 affixMaps=null; 507 } 508 makeCrisArray(ArrayList<Read> list)509 private ConcurrentReadInputStream[] makeCrisArray(ArrayList<Read> list){ 510 final ConcurrentReadInputStream[] array; 511 512 if(list!=null){ 513 array=new ConcurrentReadInputStream[] {new ConcurrentCollectionReadInputStream(list, null, -1)}; 514 array[0].start(); //This deadlocks if ConcurrentReadInputStream extends Thread rather than spawning a new thread. 515 }else{ 516 array=new ConcurrentReadInputStream[in1.length]; 517 multipleInputFiles=array.length>1; 518 for(int i=0; i<in1.length; i++){ 519 if(verbose){System.err.println("Creating cris for "+in1[i]);} 520 521 final ConcurrentReadInputStream cris; 522 { 523 FileFormat ff1=FileFormat.testInput(in1[i], FileFormat.FASTA, null, !multipleInputFiles || ReadWrite.USE_UNPIGZ, true); 524 FileFormat ff2=(in2==null || in2.length<=i ? null : FileFormat.testInput(in2[i], FileFormat.FASTA, null, !multipleInputFiles || ReadWrite.USE_UNPIGZ, true)); 525 cris=ConcurrentReadInputStream.getReadInputStream(maxReads, ff1.samOrBam(), ff1, ff2); 526 cris.start(); 527 if(cris.paired()){ 528 THREADS=1;//Temp fix for losing reads when multithreaded and paired 529 if(absorbContainment){ 530 System.err.println("Set absorbContainment to false because it is not currently supported for paired reads."); 531 absorbContainment=false; 532 } 533 } 534 } 535 array[i]=cris; 536 } 537 } 538 return array; 539 } 540 processMatches(Timer t)541 private void processMatches(Timer t){ 542 crisa=makeCrisArray(null); 543 544 ArrayList<HashThread> alht=new ArrayList<HashThread>(THREADS); 545 for(int i=0; i<THREADS; i++){alht.add(new HashThread(true, (absorbContainment|findOverlaps), absorbMatch, false, false));} 546 for(HashThread ht : alht){ht.start();} 547 for(HashThread ht : alht){ 548 while(ht.getState()!=Thread.State.TERMINATED){ 549 try { 550 ht.join(); 551 } catch (InterruptedException e) { 552 // TODO Auto-generated catch block 553 e.printStackTrace(); 554 } 555 } 556 matches+=ht.matchesT; 557 collisions+=ht.collisionsT; 558 containments+=ht.containmentsT; 559 containmentCollisions+=ht.containmentCollisionsT; 560 baseContainments+=ht.baseContainmentsT; 561 baseMatches+=ht.baseMatchesT; 562 addedToMain+=ht.addedToMainT; 563 readsProcessed+=ht.readsProcessedT; 564 basesProcessed+=ht.basesProcessedT; 565 } 566 alht.clear(); 567 568 if(verbose){System.err.println("Attempting to close input streams (1).");} 569 for(ConcurrentReadInputStream cris : crisa){ 570 errorState|=ReadWrite.closeStream(cris); 571 } 572 crisa=null; 573 574 if(DISPLAY_PROGRESS){ 575 t.stop(); 576 outstream.println("Found "+matches+" duplicates."); 577 outstream.println("Finished exact matches. Time: "+t); 578 Shared.printMemory(); 579 if(verbose){outstream.println(affixMap1);} 580 outstream.println(); 581 t.start(); 582 } 583 } 584 processContainments(Timer t)585 private void processContainments(Timer t){ 586 ArrayList<Read> list=new ArrayList<Read>((int)addedToMain); 587 for(ArrayList<Unit> alu : codeMap.values()){ 588 for(Unit u : alu){ 589 assert(u.r.mate==null) : "Containments are not currently supported with paired reads."; 590 if(u.valid() && u.r.pairnum()==0){list.add(u.r);} 591 } 592 } 593 594 // if(minLengthPercent>0){ 595 // if(verbose){System.err.println("Sorting.");} 596 // Shared.sort(list, comparator); 597 // Collections.reverse(list); 598 // assert(list.isEmpty() || list.get(0).length()<=list.get(list.size()-1).length()) : 599 // list.get(0).length()+", "+list.get(list.size()-1).length(); 600 // } 601 602 crisa=makeCrisArray(subsetMode ? null : list); 603 604 ArrayList<HashThread> alht=new ArrayList<HashThread>(THREADS); 605 for(int i=0; i<THREADS; i++){alht.add(new HashThread(false, false, false, true, false));} 606 607 for(HashThread ht : alht){ht.start();} 608 for(HashThread ht : alht){ 609 while(ht.getState()!=Thread.State.TERMINATED){ 610 try { 611 ht.join(); 612 } catch (InterruptedException e) { 613 // TODO Auto-generated catch block 614 e.printStackTrace(); 615 } 616 } 617 assert(ht.matchesT==0); 618 assert(ht.collisionsT==0); 619 assert(ht.baseMatchesT==0); 620 assert(ht.addedToMainT==0); 621 // assert(ht.readsProcessedT==0); 622 // assert(ht.basesProcessedT==0); 623 // matches+=ht.matchesT; 624 // collisions+=ht.collisionsT; 625 containments+=ht.containmentsT; 626 containmentCollisions+=ht.containmentCollisionsT; 627 baseContainments+=ht.baseContainmentsT; 628 // baseMatches+=ht.baseMatchesT; 629 // addedToMain+=ht.addedToMainT; 630 // readsProcessed+=ht.readsProcessedT; 631 // basesProcessed+=ht.basesProcessedT; 632 } 633 alht.clear(); 634 if(verbose){System.err.println("Attempting to close input streams (2).");} 635 for(ConcurrentReadInputStream cris : crisa){ 636 errorState|=ReadWrite.closeStream(cris); 637 } 638 639 if(DISPLAY_PROGRESS){ 640 t.stop(); 641 outstream.println("Found "+containments+" contained sequences."); 642 outstream.println("Finished containment. Time: "+t); 643 Shared.printMemory(); 644 outstream.println(); 645 t.start(); 646 } 647 crisa=null; 648 if(!findOverlaps){ 649 killAffixMaps(); 650 } 651 652 long x=removeInvalid(list); 653 list.clear(); 654 655 if(DISPLAY_PROGRESS){ 656 t.stop(); 657 outstream.println("Removed "+x+" invalid entries."); 658 outstream.println("Finished invalid removal. Time: "+t); 659 Shared.printMemory(); 660 outstream.println(); 661 t.start(); 662 } 663 } 664 findOverlaps(Timer t)665 private void findOverlaps(Timer t){ 666 667 ArrayList<Read> list=new ArrayList<Read>((int)addedToMain); 668 for(ArrayList<Unit> alu : codeMap.values()){ 669 for(Unit u : alu){ 670 if(u.valid() && u.r.pairnum()==0){ 671 u.unitID=list.size(); 672 list.add(u.r); 673 if(u.r.mate!=null){ 674 Unit u2=(Unit)u.r.mate.obj; 675 u2.unitID=u.unitID; 676 } 677 }else{ 678 u.unitID=Integer.MAX_VALUE; 679 } 680 } 681 } 682 683 if(preventTransitiveOverlaps){ 684 clusterNumbers=new AtomicIntegerArray(list.size()); 685 for(int i=0; i<clusterNumbers.length(); i++){ 686 clusterNumbers.set(i, i); 687 } 688 } 689 690 crisa=makeCrisArray(subsetMode ? null : list); 691 692 ArrayList<HashThread> alht=new ArrayList<HashThread>(THREADS); 693 for(int i=0; i<THREADS; i++){alht.add(new HashThread(false, false, false, false, true));} 694 695 for(HashThread ht : alht){ht.start();} 696 for(HashThread ht : alht){ 697 while(ht.getState()!=Thread.State.TERMINATED){ 698 try { 699 ht.join(); 700 } catch (InterruptedException e) { 701 // TODO Auto-generated catch block 702 e.printStackTrace(); 703 } 704 } 705 assert(ht.matchesT==0); 706 assert(ht.collisionsT==0); 707 assert(ht.baseMatchesT==0); 708 assert(ht.addedToMainT==0); 709 710 overlaps+=ht.overlapsT; 711 baseOverlaps+=ht.baseOverlapsT; 712 overlapCollisions+=ht.overlapCollisionsT; 713 } 714 alht.clear(); 715 if(verbose){System.err.println("Attempting to close input streams (3).");} 716 for(ConcurrentReadInputStream cris : crisa){ 717 errorState|=ReadWrite.closeStream(cris); 718 } 719 720 if(DISPLAY_PROGRESS){ 721 t.stop(); 722 outstream.println("Found "+overlaps+" overlaps."); 723 outstream.println("Finished finding overlaps. Time: "+t); 724 Shared.printMemory(); 725 outstream.println(); 726 t.start(); 727 } 728 729 crisa=null; 730 731 if(makeClusters){ 732 int intransitive=0, redundant=0; 733 assert((intransitive=countIntransitive(t, list, rigorousTransitive))==0); 734 assert((redundant=countRedundant(t, list))==0); 735 long overlaps=countOverlaps(t, list); 736 assert(intransitive==0); 737 assert(redundant==0); 738 // makeTransitive(t, list, rigorousTransitive); 739 if(clusterQueue==null){ 740 clusterQueue=new ArrayDeque<ArrayList<Unit>>(list.size()/4+1); 741 processedClusters=new ArrayList<ArrayList<Unit>>(); 742 }else{ 743 assert(clusterQueue.isEmpty()); 744 } 745 makeClusters(t, list); 746 } 747 748 list.clear(); 749 } 750 makeTransitive(Timer t, ArrayList<Read> list, boolean rigorous)751 private long makeTransitive(Timer t, ArrayList<Read> list, boolean rigorous){ 752 assert(false) : "No longer needed."; 753 long added=0; 754 for(Read r : list){ 755 assert(r!=null); 756 Unit u=(Unit) r.obj; 757 assert(u!=null); 758 assert(u.valid()); 759 // outstream.println("Considering "+r.id+"; valid="+u.valid()+", overlaps="+(u.overlapList==null ? "null" : u.overlapList.size())); 760 if(u.valid()){ 761 762 if(u.overlapList!=null){ 763 for(Overlap o : u.overlapList){ 764 Unit u2=(o.u1==u ? o.u2 : o.u1); 765 assert(u2!=u); 766 if(u2.overlapList==null){ 767 u2.overlapList=new ArrayList<Overlap>(2); 768 u2.overlapList.add(o); 769 }else{ 770 boolean found=false; 771 if(rigorous){ 772 found=u2.overlapList.contains(o); 773 }else{ 774 for(Overlap o2 : u2.overlapList){ 775 if(o2.u1==u || o2.u2==u){found=true; break;} 776 } 777 } 778 if(!found){ 779 added++; 780 u2.overlapList.add(o); 781 } 782 } 783 } 784 } 785 } 786 } 787 788 for(Read r : list){ 789 Unit u=(Unit) r.obj; 790 if(u.valid()){ 791 assert(u.isTransitive()); 792 } 793 } 794 795 if(DISPLAY_PROGRESS){ 796 t.stop(); 797 outstream.println("Added overlaps: "+added); 798 outstream.println("Made overlaps transitive. Time: "+t); 799 Shared.printMemory(); 800 outstream.println(); 801 t.start(); 802 } 803 return added; 804 } 805 countIntransitive(Timer t, ArrayList<Read> list, boolean rigorous)806 private int countIntransitive(Timer t, ArrayList<Read> list, boolean rigorous){ 807 if(!countTransitive){return 0;} 808 int transitive=0, intransitive=0; 809 for(Read r : list){ 810 assert(r!=null); 811 Unit u=(Unit) r.obj; 812 assert(u!=null); 813 assert(u.valid()); 814 // outstream.println("Considering "+r.id+"; valid="+u.valid()+", overlaps="+(u.overlapList==null ? "null" : u.overlapList.size())); 815 if(u.valid()){ 816 if(rigorous ? u.isPerfectlyTransitive() : u.isTransitive()){ 817 transitive++; 818 }else{ 819 intransitive++; 820 } 821 } 822 } 823 824 if(DISPLAY_PROGRESS){ 825 t.stop(); 826 outstream.println("Intransitive: "+intransitive+", \ttransitive: "+transitive); 827 outstream.println("Checked transitivity. Time: "+t); 828 Shared.printMemory(); 829 outstream.println(); 830 t.start(); 831 } 832 833 return intransitive; 834 } 835 countRedundant(Timer t, ArrayList<Read> list)836 private int countRedundant(Timer t, ArrayList<Read> list){ 837 if(!countRedundant){return 0;} 838 int redundant=0, nonredundant=0; 839 for(Read r : list){ 840 assert(r!=null); 841 Unit u=(Unit) r.obj; 842 assert(u!=null); 843 assert(u.valid()); 844 // outstream.println("Considering "+r.id+"; valid="+u.valid()+", overlaps="+(u.overlapList==null ? "null" : u.overlapList.size())); 845 if(u.valid()){ 846 if(u.isNonRedundant()){ 847 nonredundant++; 848 }else{ 849 redundant++; 850 } 851 } 852 } 853 854 if(DISPLAY_PROGRESS){ 855 t.stop(); 856 outstream.println("Redundant: "+redundant+", \tnonredundant: "+nonredundant); 857 outstream.println("Checked redundancy. Time: "+t); 858 Shared.printMemory(); 859 outstream.println(); 860 t.start(); 861 } 862 return redundant; 863 } 864 countOverlaps(Timer t, ArrayList<Read> list)865 private long countOverlaps(Timer t, ArrayList<Read> list){ 866 867 long overlaps=0, length=0; 868 for(Read r : list){ 869 assert(r!=null); 870 Unit u=(Unit) r.obj; 871 assert(u!=null); 872 assert(u.valid()); 873 // outstream.println("Considering "+r.id+"; valid="+u.valid()+", overlaps="+(u.overlapList==null ? "null" : u.overlapList.size())); 874 if(u.valid() && u.overlapList!=null){ 875 for(Overlap o : u.overlapList){ 876 overlaps++; 877 length+=o.overlapLen; 878 } 879 } 880 } 881 882 if(DISPLAY_PROGRESS){ 883 t.stop(); 884 outstream.println("Overlaps: "+overlaps+", \tlength: "+length); 885 outstream.println("Counted overlaps. Time: "+t); 886 Shared.printMemory(); 887 outstream.println(); 888 t.start(); 889 } 890 return overlaps; 891 } 892 fillClusterSizeMatrix(ArrayList<ArrayList<Unit>> clusters, long[][] clusterSize)893 private long fillClusterSizeMatrix(ArrayList<ArrayList<Unit>> clusters, long[][] clusterSize){ 894 int max=0; 895 for(ArrayList<Unit> cluster : clusters){ 896 final int cs=Tools.min(clusterSize.length-1, cluster.size()); 897 { 898 long reads=0, bases=0; 899 for(Unit u2 : cluster){ 900 reads++; 901 bases+=u2.length(); 902 } 903 clusterSize[0][cs]++; 904 clusterSize[1][cs]+=reads; 905 clusterSize[2][cs]+=bases; 906 } 907 max=Tools.max(max, cluster.size()); 908 } 909 return max; 910 } 911 makeClusters(Timer t, ArrayList<Read> list)912 private void makeClusters(Timer t, ArrayList<Read> list){ 913 914 final int clusterlen=70000; 915 long[][] clusterSize=new long[3][clusterlen]; 916 int max=0; 917 for(Read r : list){ 918 Unit u=(Unit) r.obj; 919 920 if(!u.clustered()){ 921 ArrayList<Unit> cluster=u.makeCluster(); 922 if(cluster.size()>2){cluster.trimToSize();} 923 if(cluster.size()==1 || (!processClusters && !maxSpanningTree)){processedClusters.add(cluster);} 924 else{clusterQueue.add(cluster);} 925 final int cs=Tools.min(clusterlen-1, cluster.size()); 926 { 927 long reads=0, bases=0; 928 for(Unit u2 : cluster){ 929 reads++; 930 bases+=u2.length(); 931 } 932 clusterSize[0][cs]++; 933 clusterSize[1][cs]+=reads; 934 clusterSize[2][cs]+=bases; 935 } 936 max=Tools.max(max, cluster.size()); 937 } 938 } 939 940 if(DISPLAY_PROGRESS){ 941 t.stop(); 942 outstream.println(toClusterSizeString(clusterSize)); 943 outstream.println("\nLargest: "+max); 944 outstream.println("Finished making clusters. Time: "+t); 945 Shared.printMemory(); 946 outstream.println(); 947 t.start(); 948 } 949 950 951 long x=removeInvalid(list); 952 953 if(DISPLAY_PROGRESS){ 954 t.stop(); 955 outstream.println("Removed "+x+" invalid entries."); 956 outstream.println("Finished invalid removal. Time: "+t); 957 Shared.printMemory(); 958 outstream.println(); 959 t.start(); 960 } 961 } 962 toClusterSizeString(long[][] clusterSizeMatrix)963 private String toClusterSizeString(long[][] clusterSizeMatrix){ 964 965 long[] clusterSize=clusterSizeMatrix[0]; 966 long[] clusterReads=clusterSizeMatrix[1]; 967 long[] clusterBases=clusterSizeMatrix[2]; 968 969 long totalClusters=Tools.sum(clusterSize); 970 971 long bigClusters=0; 972 for(int i=minClusterSize; i<clusterSize.length; i++){ 973 bigClusters+=clusterSize[i]; 974 } 975 976 final int spaces=18; 977 final int spaces2=spaces*2, spaces3=spaces*3; 978 979 final StringBuilder sb=new StringBuilder(100), sb2=new StringBuilder(1000); 980 sb2.append("Clusters:"); 981 while(sb2.length()<spaces){sb2.append(' ');} 982 sb2.append(totalClusters+(minClusterSize<2 ? "" : " ("+bigClusters+" of at least size "+minClusterSize+")")+"\n"); 983 984 sb.append("Size Range"); 985 while(sb.length()<spaces){sb.append(' ');} 986 sb.append("Clusters"); 987 while(sb.length()<spaces2){sb.append(' ');} 988 sb.append("Reads"); 989 while(sb.length()<spaces3){sb.append(' ');} 990 sb.append("Bases"); 991 992 sb2.append('\n'); 993 sb2.append(sb); 994 sb.setLength(0); 995 996 for(int i=0; i<clusterSize.length-1; i=Tools.max(i+1, i*2)){ 997 int a=i+1, b=i*2; 998 if(i<2){ 999 sb.append(a); 1000 while(sb.length()<spaces){sb.append(' ');} 1001 sb.append(clusterSize[a]); 1002 while(sb.length()<spaces2){sb.append(' ');} 1003 sb.append(clusterReads[a]); 1004 while(sb.length()<spaces3){sb.append(' ');} 1005 sb.append(clusterBases[a]); 1006 }else if(b>=clusterSize.length){ 1007 long x=Tools.sum(clusterSize, a, clusterSize.length-1); 1008 long y=Tools.sum(clusterReads, a, clusterSize.length-1); 1009 long z=Tools.sum(clusterBases, a, clusterSize.length-1); 1010 if(x>0){ 1011 sb.append(a+"+"); 1012 while(sb.length()<spaces){sb.append(' ');} 1013 sb.append(x); 1014 while(sb.length()<spaces2){sb.append(' ');} 1015 sb.append(y); 1016 while(sb.length()<spaces3){sb.append(' ');} 1017 sb.append(z); 1018 } 1019 }else{ 1020 long x=Tools.sum(clusterSize, a, b); 1021 long y=Tools.sum(clusterReads, a, b); 1022 long z=Tools.sum(clusterBases, a, b); 1023 if(x>0){ 1024 sb.append(a+"-"+b); 1025 while(sb.length()<spaces){sb.append(' ');} 1026 sb.append(x); 1027 while(sb.length()<spaces2){sb.append(' ');} 1028 sb.append(y); 1029 while(sb.length()<spaces3){sb.append(' ');} 1030 sb.append(z); 1031 } 1032 } 1033 if(sb.length()>0){ 1034 sb2.append('\n'); 1035 sb2.append(sb); 1036 sb.setLength(0); 1037 } 1038 } 1039 return sb2.toString(); 1040 } 1041 renameClusters(Timer t)1042 private void renameClusters(Timer t){ 1043 assert(false) : "This is now unused; renaming is done at output time."; 1044 int cnum=0; 1045 final StringBuilder sb=new StringBuilder(64); 1046 for(ArrayList<Unit> alu : processedClusters){ 1047 for(int i=0; i<alu.size(); i++){ 1048 Unit u=alu.get(i); 1049 Read r=u.r; 1050 sb.append("Cluster "); 1051 sb.append(cnum); 1052 sb.append(",contig "); 1053 sb.append(i); 1054 if(u.offsetValid()){ 1055 sb.append(",pos "); 1056 sb.append(u.offset()); 1057 } 1058 r.id=sb.toString(); 1059 sb.setLength(0); 1060 } 1061 } 1062 1063 if(DISPLAY_PROGRESS){ 1064 t.stop(); 1065 outstream.println("Finished cluster renaming. Time: "+t); 1066 Shared.printMemory(); 1067 outstream.println(); 1068 t.start(); 1069 } 1070 } 1071 processMst(Timer t)1072 private void processMst(Timer t){ 1073 1074 if(DISPLAY_PROGRESS){outstream.println("Converting to Maximum Spanning Tree.");} 1075 1076 ArrayList<MstThread> alct=new ArrayList<MstThread>(THREADS); 1077 for(int i=0; i<THREADS; i++){ 1078 alct.add(new MstThread()); 1079 } 1080 1081 long overlapsRemoved=0; 1082 long overlapBasesRemoved=0; 1083 long overlapsRetained=0; 1084 long overlapBasesRetained=0; 1085 1086 for(MstThread ct : alct){ct.start();} 1087 for(MstThread ct : alct){ 1088 while(ct.getState()!=Thread.State.TERMINATED){ 1089 try { 1090 ct.join(); 1091 } catch (InterruptedException e) { 1092 e.printStackTrace(); 1093 } 1094 } 1095 1096 overlapsRemoved+=ct.overlapsRemovedT; 1097 overlapBasesRemoved+=ct.overlapBasesRemovedT; 1098 overlapsRetained+=ct.overlapsRetainedT; 1099 overlapBasesRetained+=ct.overlapBasesRetainedT; 1100 } 1101 assert(clusterQueue.isEmpty()); 1102 if(processClusters){ 1103 for(MstThread ct : alct){ 1104 clusterQueue.addAll(ct.processedT); 1105 ct.processedT.clear(); 1106 ct.processedT=null; 1107 } 1108 }else{ 1109 for(MstThread ct : alct){ 1110 processedClusters.addAll(ct.processedT); 1111 ct.processedT.clear(); 1112 ct.processedT=null; 1113 } 1114 clusterQueue=null; 1115 } 1116 alct.clear(); 1117 1118 assert(affixMaps==null); 1119 killAffixMaps(); 1120 1121 1122 if(DISPLAY_PROGRESS){ 1123 t.stop(); 1124 outstream.println("Removed "+(overlapsRemoved)+" edges ("+overlapBasesRemoved+" bases)."); 1125 outstream.println("Retained "+(overlapsRetained)+" edges ("+overlapBasesRetained+" bases)."); 1126 1127 // outstream.println("\nAfter conversion to Maximum Spanning Tree:"); 1128 // final int[] clusterSize=new int[8200]; 1129 // int max=0; 1130 // for(ArrayList<Unit> cluster : processedClusters){ 1131 // clusterSize[Tools.min(clusterSize.length-1, cluster.size())]++; 1132 // max=Tools.max(max, cluster.size()); 1133 // } 1134 // outstream.println(toClusterSizeString(clusterSize)); 1135 // outstream.println("Largest: "+max); 1136 1137 outstream.println("Finished MST conversion. Time: "+t); 1138 Shared.printMemory(); 1139 outstream.println(); 1140 t.start(); 1141 } 1142 } 1143 processClusters(Timer t, boolean mergeClusters)1144 private void processClusters(Timer t, boolean mergeClusters){ 1145 1146 ArrayList<ClusterThread> alct=new ArrayList<ClusterThread>(THREADS); 1147 for(int i=0; i<THREADS; i++){ 1148 alct.add(new ClusterThread(fixMultiJoins, removeCycles, fixOffsetContradictions, 1149 mergeClusters, mergeClusters, mergeClusters)); 1150 } 1151 1152 long leafMerges=0; 1153 long innerMerges=0; 1154 long leafBaseMerges=0; 1155 long innerBaseMerges=0; 1156 1157 long multiJoinFailures=0; 1158 long multiJoinsFound=0; 1159 long multiJoinBasesFound=0; 1160 long unitsFlipped=0; 1161 long overlapsFlipped=0; 1162 long offsetContradictoryOverlaps=0; 1163 long offsetContradictoryClusters=0; 1164 long cycleOverlaps=0; 1165 long cycleClusters=0; 1166 1167 for(ClusterThread ct : alct){ct.start();} 1168 for(ClusterThread ct : alct){ 1169 while(ct.getState()!=Thread.State.TERMINATED){ 1170 try { 1171 ct.join(); 1172 } catch (InterruptedException e) { 1173 e.printStackTrace(); 1174 } 1175 } 1176 1177 leafMerges+=ct.leafMergesT; 1178 innerMerges+=ct.innerMergesT; 1179 leafBaseMerges+=ct.leafBaseMergesT; 1180 innerBaseMerges+=ct.innerBaseMergesT; 1181 1182 multiJoinFailures+=ct.multiJoinFailuresT; 1183 multiJoinsFound+=ct.multiJoinsFoundT; 1184 multiJoinBasesFound+=ct.multiJoinBasesFoundT; 1185 unitsFlipped+=ct.unitsFlippedT; 1186 overlapsFlipped+=ct.overlapsFlippedT; 1187 offsetContradictoryOverlaps+=ct.offsetContradictoryOverlapsT; 1188 offsetContradictoryClusters+=ct.offsetContradictoryClustersT; 1189 cycleOverlaps+=ct.cycleOverlapsT; 1190 cycleClusters+=ct.cycleClustersT; 1191 } 1192 alct.clear(); 1193 1194 assert(affixMaps==null || affixMap1==null); 1195 killAffixMaps(); 1196 1197 assert(clusterQueue.isEmpty()); 1198 clusterQueue=null; 1199 1200 if(DISPLAY_PROGRESS){ 1201 t.stop(); 1202 if(fixMultiJoins){ 1203 outstream.println("Found "+(multiJoinsFound)+" multijoins ("+multiJoinBasesFound+" bases)."); 1204 outstream.println("Experienced "+(multiJoinFailures)+" multijoin removal failures."); 1205 } 1206 if(fixOffsetContradictions){ 1207 outstream.println("Found "+(offsetContradictoryClusters)+" clusters ("+offsetContradictoryOverlaps+" overlaps) with contradictory offset cycles."); 1208 } 1209 outstream.println("Found "+(cycleClusters)+" clusters ("+cycleOverlaps+" overlaps) with remaining cycles."); 1210 if(absorbOverlap){ 1211 outstream.println("Merged "+(leafMerges)+" leaves ("+leafBaseMerges+" bases)."); 1212 outstream.println("Merged "+(innerMerges)+" nonleaves ("+innerBaseMerges+" bases)."); 1213 } 1214 1215 outstream.println("\nAfter processing clusters:"); 1216 final long[][] clusterSize=new long[3][70000]; 1217 final long max=fillClusterSizeMatrix(processedClusters, clusterSize); 1218 outstream.println(toClusterSizeString(clusterSize)); 1219 outstream.println("\nLargest: "+max); 1220 1221 outstream.println("Finished processing. Time: "+t); 1222 Shared.printMemory(); 1223 outstream.println(); 1224 t.start(); 1225 } 1226 } 1227 removeInvalid(ArrayList<Read> list)1228 private long removeInvalid(ArrayList<Read> list){ 1229 final LongM keym=new LongM(); 1230 long removedC=0, removedP=0, removedS=0, invalid=0; 1231 1232 for(int j=0, lim=list.size(); j<lim; j++){ 1233 final Read r=list.get(j); 1234 final Unit u=(Unit)r.obj; 1235 1236 if(!u.valid()){ 1237 1238 invalid++; 1239 1240 if(codeMap!=null && !codeMap.isEmpty()){ 1241 Long key=u.code1; 1242 ArrayList<Unit> alu=codeMap.get(key); 1243 if(alu!=null){ 1244 int valid=0; 1245 for(int i=alu.size()-1; i>=0; i--){ 1246 Unit u2=alu.get(i); 1247 if(u2==null || !u2.valid()){ 1248 alu.remove(i); 1249 removedC++; 1250 } 1251 else{valid++;} 1252 } 1253 if(valid==0){codeMap.remove(key);} 1254 } 1255 } 1256 1257 if(affixMap1!=null && !affixMap1.isEmpty()){ 1258 { 1259 keym.set(u.prefix1); 1260 ArrayList<Unit> alu=affixMap1.get(keym); 1261 if(alu!=null){ 1262 int valid=0; 1263 for(int i=alu.size()-1; i>=0; i--){ 1264 Unit u2=alu.get(i); 1265 if(u2==null || !u2.valid()){ 1266 alu.remove(i); 1267 removedP++; 1268 } 1269 else{valid++;} 1270 } 1271 if(valid==0){affixMap1.remove(keym);} 1272 } 1273 } 1274 if(storeSuffix){ 1275 keym.set(u.suffix1); 1276 ArrayList<Unit> alu=affixMap1.get(keym); 1277 if(alu!=null){ 1278 int valid=0; 1279 for(int i=alu.size()-1; i>=0; i--){ 1280 Unit u2=alu.get(i); 1281 if(u2==null || !u2.valid()){ 1282 alu.remove(i); 1283 removedS++; 1284 } 1285 else{valid++;} 1286 } 1287 if(valid==0){affixMap1.remove(keym);} 1288 } 1289 } 1290 } 1291 if(affixMap2!=null && !affixMap2.isEmpty()){ 1292 if(u.prefix2!=-1){ 1293 keym.set(u.prefix2); 1294 ArrayList<Unit> alu=affixMap2.get(keym); 1295 if(alu!=null){ 1296 int valid=0; 1297 for(int i=alu.size()-1; i>=0; i--){ 1298 Unit u2=alu.get(i); 1299 if(u2==null || !u2.valid()){ 1300 alu.remove(i); 1301 removedP++; 1302 } 1303 else{valid++;} 1304 } 1305 if(valid==0){affixMap2.remove(keym);} 1306 } 1307 } 1308 if(storeSuffix && u.suffix2!=-1){ 1309 keym.set(u.suffix2); 1310 ArrayList<Unit> alu=affixMap2.get(keym); 1311 if(alu!=null){ 1312 int valid=0; 1313 for(int i=alu.size()-1; i>=0; i--){ 1314 Unit u2=alu.get(i); 1315 if(u2==null || !u2.valid()){ 1316 alu.remove(i); 1317 removedS++; 1318 } 1319 else{valid++;} 1320 } 1321 if(valid==0){affixMap2.remove(keym);} 1322 } 1323 } 1324 1325 } 1326 1327 list.set(j, null); 1328 } 1329 } 1330 1331 if(invalid>0){ 1332 Tools.condenseStrict(list); 1333 } 1334 if(verbose){ 1335 outstream.println("Removed invalids: "+removedC+", "+removedP+", "+removedS); 1336 } 1337 return invalid; 1338 } 1339 1340 addToArray(HashMap<Long, ArrayList<Unit>> codeMap, boolean sort, boolean clear, long outNum)1341 private static ArrayList<Read> addToArray(HashMap<Long, ArrayList<Unit>> codeMap, boolean sort, boolean clear, long outNum){ 1342 assert(outNum<=Integer.MAX_VALUE); 1343 if(verbose){System.err.println("Making list.");} 1344 ArrayList<Read> list=new ArrayList<Read>((int)outNum); 1345 if(verbose){System.err.println("Adding.");} 1346 for(ArrayList<Unit> alu : codeMap.values()){ 1347 for(Unit u : alu){ 1348 if(u.valid() && u.r.pairnum()==0){list.add(u.r);} 1349 } 1350 if(clear){alu.clear();} 1351 } 1352 if(clear){codeMap.clear();} 1353 1354 if(sort){ 1355 if(verbose){System.err.println("Sorting.");} 1356 Shared.sort(list, comparator); 1357 // if(ascending){ 1358 // Collections.reverse(list); 1359 // assert(list.isEmpty() || list.get(0).length()<=list.get(list.size()-1).length()) : 1360 // list.get(0).length()+", "+list.get(list.size()-1).length(); 1361 // }else{ 1362 // assert(list.isEmpty() || list.get(0).length()>=list.get(list.size()-1).length()) : 1363 // list.get(0).length()+", "+list.get(list.size()-1).length(); 1364 // } 1365 } 1366 assert(list.size()==outNum || list.size()*2L==outNum || UNIQUE_ONLY) : list.size()+", "+outNum; 1367 return list; 1368 } 1369 writeOutput(String clusterStatsFile, Timer t)1370 private void writeOutput(String clusterStatsFile, Timer t){ 1371 // verbose=true; 1372 // assert(false) : (processedClusters==null)+", "+(processedClusters.isEmpty())+", "+outgraph+", "+out+", "+clusterFilePattern; 1373 if(processedClusters==null || processedClusters.isEmpty()){ 1374 1375 if(out!=null || clusterFilePattern!=null){ 1376 1377 ArrayList<Read> list=addToArray(codeMap, sort, true, addedToMain-containments); 1378 codeMap=null; 1379 1380 if(sort){ 1381 if(DISPLAY_PROGRESS){ 1382 t.stop(); 1383 outstream.println("Sorted output. Time: "+t); 1384 Shared.printMemory(); 1385 outstream.println(); 1386 t.start(); 1387 } 1388 } 1389 1390 writeOutput(list); 1391 } 1392 }else{ 1393 if(outgraph!=null){ 1394 writeGraph(outgraph, processedClusters); 1395 } 1396 if(out!=null || clusterFilePattern!=null || clusterStatsFile!=null || outbest!=null){ 1397 writeOutputClusters(clusterStatsFile, processedClusters); 1398 } 1399 } 1400 1401 if(DISPLAY_PROGRESS){ 1402 t.stop(); 1403 outstream.println("Printed output. Time: "+t); 1404 Shared.printMemory(); 1405 outstream.println(); 1406 t.start(); 1407 } 1408 } 1409 1410 1411 writeOutput(ArrayList<Read> list)1412 private void writeOutput(ArrayList<Read> list){ 1413 1414 final ByteStreamWriter tsw=(out==null ? null : new ByteStreamWriter(out, overwrite, append, true)); 1415 1416 if(verbose){System.err.println("Writing from array.");} 1417 tsw.start(); 1418 1419 HashSet<String> names=((uniqueNames && storeName) ? 1420 new HashSet<String>(Tools.min(Integer.MAX_VALUE, Tools.max((int)addedToMain, (int)(addedToMain*1.35)))) : null); 1421 long rid=0; 1422 for(int x=0; x<list.size(); x++){ 1423 Read r=list.get(x); 1424 list.set(x, null); 1425 1426 if(r.mate!=null && r.pairnum()!=0){r=r.mate;} 1427 1428 assert(r.mate==null || r.mate.discarded()==r.discarded()); 1429 1430 if(!r.discarded()){ 1431 rid++; 1432 1433 for(int i=0; r!=null && i<2; i++){ 1434 if(multipleInputFiles){r.numericID=rid;} 1435 if(names!=null){ 1436 String name=(r.id==null ? ""+r.numericID : r.id); 1437 if(names.contains(name)){ 1438 for(long j=0; j<Integer.MAX_VALUE; j++){ 1439 String name2=name+"_dd"+j; 1440 if(r.mate!=null){name2+=(" /"+(i+1));} 1441 if(!names.contains(name2)){ 1442 r.id=name2; 1443 names.add(name2); 1444 break; 1445 } 1446 } 1447 }else{ 1448 names.add(name); 1449 } 1450 } 1451 tsw.println(r); 1452 r.setDiscarded(true); 1453 r=r.mate; 1454 } 1455 } 1456 } 1457 if(verbose){System.err.println("Shutting down tsw "+tsw.fname);} 1458 tsw.poisonAndWait(); 1459 } 1460 1461 writeOutputClusters(String clusterStatsFile, ArrayList<ArrayList<Unit>> clist)1462 private void writeOutputClusters(String clusterStatsFile, ArrayList<ArrayList<Unit>> clist){ 1463 1464 // Shared.sort(clist, CLUSTER_LENGTH_COMPARATOR); //Causes a crash: java.lang.System.arraycopy(Native Method) 1465 clist.sort(CLUSTER_LENGTH_COMPARATOR); 1466 1467 if(verbose){System.err.println("Writing clusters.");} 1468 1469 final ByteStreamWriter tswAll=(out==null ? null : new ByteStreamWriter(out, overwrite, append, true)); 1470 if(tswAll!=null){tswAll.start();} 1471 ByteStreamWriter tswCluster=null; 1472 ByteStreamWriter tswBest=null; 1473 1474 if(outbest!=null){ 1475 tswBest=new ByteStreamWriter(outbest, overwrite, append, true); 1476 tswBest.start(); 1477 } 1478 1479 TextStreamWriter csf=null; 1480 if(clusterStatsFile!=null){ 1481 csf=new TextStreamWriter(clusterStatsFile, overwrite, false, false); 1482 csf.start(); 1483 csf.print("#Name\tsize\t"+nmerLength+"-mer frequencies\n"); 1484 } 1485 1486 HashSet<String> names=((uniqueNames && storeName) ? 1487 new HashSet<String>(Tools.min(Integer.MAX_VALUE, Tools.max((int)addedToMain, (int)(addedToMain*1.35)))) : null); 1488 long rid=0; 1489 final long[] nmerCounts=new long[maxNmer+1]; 1490 1491 final StringBuilder sb=new StringBuilder(64); 1492 1493 for(int cnum=0; cnum<clist.size(); cnum++){ 1494 final ArrayList<Unit> alu=clist.get(cnum); 1495 // clist.set(cnum, null); //This breaks subsequent output processing 1496 1497 if(alu.size()<minClusterSize){ 1498 if(verbose){System.err.println("Ignoring small cluster "+cnum+", size "+alu.size());} 1499 1500 if(csf!=null && alu.size()>=minClusterSizeForStats){ 1501 float[] profile=makeNmerProfile(alu, nmerCounts); 1502 sb.append("Cluster_"); 1503 sb.append(cnum); 1504 sb.append('\t'); 1505 sb.append(alu.size()); 1506 sb.append('\t'); 1507 for(float f : profile){ 1508 sb.append(String.format(Locale.ROOT, "%.5f ", f)); 1509 } 1510 sb.setCharAt(sb.length()-1, '\n'); 1511 csf.print(sb.toString()); 1512 sb.setLength(0); 1513 } 1514 }else{ 1515 if(verbose){System.err.println("Writing cluster "+cnum+", size "+alu.size());} 1516 1517 if(clusterFilePattern!=null){ 1518 if(tswCluster!=null){ 1519 if(verbose){System.err.println("Shutting down tswCluster "+tswCluster.fname);} 1520 tswCluster.poisonAndWait(); 1521 tswCluster=null; 1522 } 1523 tswCluster=new ByteStreamWriter(clusterFilePattern.replaceFirst("%", ""+cnum), overwrite, append, true); 1524 if(verbose){System.err.println("Starting tswCluster "+tswCluster.fname);} 1525 tswCluster.start(); 1526 } 1527 1528 if(csf!=null && alu.size()>=minClusterSizeForStats){ 1529 float[] profile=makeNmerProfile(alu, nmerCounts); 1530 sb.append("Cluster_"); 1531 sb.append(cnum); 1532 sb.append('\t'); 1533 sb.append(alu.size()); 1534 sb.append('\t'); 1535 for(float f : profile){ 1536 sb.append(String.format(Locale.ROOT, "%.5f ", f)); 1537 } 1538 sb.setCharAt(sb.length()-1, '\n'); 1539 csf.print(sb.toString()); 1540 sb.setLength(0); 1541 } 1542 1543 if(pickBestRepresentativePerCluster){ 1544 pickBestRepresenative(alu, true); 1545 } 1546 1547 if(outbest!=null){ 1548 Unit u=pickBestRepresenative((ArrayList<Unit>)alu.clone(), false); 1549 tswBest.println(u.r); 1550 if(u.r.mate!=null){tswBest.println(u.r.mate);} 1551 } 1552 1553 for(int contig=0; contig<alu.size(); contig++){ 1554 final Unit u0=alu.get(contig); 1555 alu.set(contig, null); 1556 Read r=u0.r; 1557 if(r.mate!=null && r.pairnum()!=0){r=r.mate;} 1558 1559 if(!r.discarded()){ 1560 rid++; 1561 1562 for(int i=0; r!=null && i<2; i++){ 1563 assert(r.pairnum()==i) : i+", "+r.pairnum()+", "+(r.mate==null ? 9 : r.mate.pairnum()); 1564 Unit u=(Unit)r.obj; 1565 if(verbose){System.err.println("Writing read "+r.id);} 1566 r.numericID=rid; 1567 if(renameClusters){ 1568 sb.append("Cluster_"); 1569 sb.append(cnum); 1570 sb.append(",contig_"); 1571 sb.append(contig); 1572 if(u.offsetValid()){ 1573 sb.append(",pos_"); 1574 sb.append(u.offset()); 1575 } 1576 if(r.mate!=null){sb.append(" /"+(i+1));} 1577 r.id=(r.id==null ? sb.toString() : r.id+"\t"+sb); 1578 sb.setLength(0); 1579 }else if(names!=null){ 1580 String name=(r.id==null ? ""+r.numericID : r.id); 1581 if(names.contains(name)){ 1582 for(long j=0; j<Integer.MAX_VALUE; j++){ 1583 String name2=name+"_dd"+j; 1584 if(!names.contains(name2)){ 1585 r.id=name2; 1586 names.add(name2); 1587 break; 1588 } 1589 } 1590 }else{ 1591 names.add(name); 1592 } 1593 } 1594 if(tswAll!=null){tswAll.println(r);} 1595 if(tswCluster!=null){tswCluster.println(r);} 1596 r.setDiscarded(true); 1597 r=r.mate; 1598 } 1599 } 1600 } 1601 } 1602 } 1603 if(csf!=null){ 1604 if(verbose){System.err.println("Shutting down csf "+csf.fname);} 1605 csf.poisonAndWait(); 1606 } 1607 if(tswBest!=null){ 1608 if(verbose){System.err.println("Shutting down tswBest "+tswBest.fname);} 1609 tswBest.poisonAndWait(); 1610 } 1611 if(tswAll!=null){ 1612 if(verbose){System.err.println("Shutting down tswAll "+tswAll.fname);} 1613 tswAll.poisonAndWait(); 1614 } 1615 if(tswCluster!=null){ 1616 if(verbose){System.err.println("Shutting down tswCluster "+tswCluster.fname);} 1617 tswCluster.poisonAndWait(); 1618 } 1619 } 1620 1621 writeGraph(String graphFile, ArrayList<ArrayList<Unit>> clist)1622 private void writeGraph(String graphFile, ArrayList<ArrayList<Unit>> clist){ 1623 // Shared.sort(clist, CLUSTER_LENGTH_COMPARATOR); 1624 clist.sort(CLUSTER_LENGTH_COMPARATOR); 1625 1626 if(verbose){System.err.println("Writing overlap graph.");} 1627 1628 final TextStreamWriter tsw=(graphFile==null ? null : new TextStreamWriter(graphFile, overwrite, append, true)); 1629 if(tsw!=null){ 1630 tsw.start(); 1631 tsw.print("digraph G {\n"); 1632 } 1633 1634 for(int cnum=0; cnum<clist.size(); cnum++){ 1635 final ArrayList<Unit> alu=clist.get(cnum); 1636 // clist.set(cnum, null); //This breaks subsequent output processing 1637 // Shared.sort(alu); //TODO: Remove 1638 1639 if(alu.size()<minClusterSize){ 1640 if(verbose){System.err.println("Ignoring small cluster "+cnum+", size "+alu.size());} 1641 }else{ 1642 if(verbose){System.err.println("Processing cluster "+cnum+", size "+alu.size());} 1643 1644 for(int contig=0; contig<alu.size(); contig++){ 1645 final Unit u0=alu.get(contig); 1646 // alu.set(contig, null); //This breaks subsequent output processing 1647 Read r=u0.r; 1648 if(r.mate!=null && r.pairnum()!=0){r=r.mate;} 1649 1650 { 1651 for(int i=0; r!=null && i<2; i++){ 1652 assert(r.pairnum()==i) : i+", "+r.pairnum()+", "+(r.mate==null ? 9 : r.mate.pairnum()); 1653 Unit u=(Unit)r.obj; 1654 if(verbose){System.err.println("Writing read "+r.id);} 1655 1656 if(tsw!=null){ 1657 tsw.print("\t"+toGraphName(r)+"\n"); 1658 if(r.mate!=null && r.pairnum()==0){ 1659 Read r2=r.mate; 1660 tsw.print(toGraphName(r)+" -> "+toGraphName(r2)+" [label=mate]"); 1661 } 1662 if(u.overlapList!=null){ 1663 for(Overlap o : u.overlapList){ 1664 if(u==o.u1){ 1665 Read r2=o.u2.r; 1666 tsw.print("\t"+toGraphName(r)+" -> "+toGraphName(r2)+" [label=\""+o.toLabel()+"\"]\n"); 1667 } 1668 } 1669 } 1670 } 1671 r=r.mate; 1672 } 1673 } 1674 } 1675 } 1676 } 1677 1678 if(tsw!=null){ 1679 tsw.print("}\n"); 1680 if(verbose){System.err.println("Shutting down tswAll "+tsw.fname);} 1681 tsw.poisonAndWait(); 1682 } 1683 } 1684 toGraphName(Read r)1685 private static String toGraphName(Read r){ 1686 if(NUMBER_GRAPH_NODES || r.id==null){ 1687 return r.numericID+((ADD_PAIRNUM_TO_NAME || r.mate!=null) ? "."+(r.pairnum()+1) : ""); 1688 }else{ 1689 return r.id.replace(' ','_').replace('\t','_'); 1690 } 1691 } 1692 pickBestRepresenative(ArrayList<Unit> alu, boolean clearList)1693 private Unit pickBestRepresenative(ArrayList<Unit> alu, boolean clearList){ 1694 if(alu==null || alu.isEmpty()){return null;} 1695 float[] quality=new float[alu.size()]; 1696 int[] lengths=new int[alu.size()]; 1697 for(int i=0; i<alu.size(); i++){ 1698 Unit u=alu.get(i); 1699 int len=u.r.length(); 1700 quality[i]=u.r.expectedErrors(true, 0)/len; 1701 lengths[i]=len; 1702 } 1703 Arrays.sort(quality); 1704 Arrays.sort(lengths); 1705 int medianLength=lengths[lengths.length/2]; 1706 float bestQuality=quality[0]; 1707 1708 float currentBestQuality=9999999; 1709 Unit best=null; 1710 for(int i=0; i<alu.size(); i++){ 1711 Unit u=alu.get(i); 1712 int len=u.r.length(); 1713 float deviation=Tools.absdif(len, medianLength)*1f/(medianLength+1); 1714 if(deviation<0.05){ 1715 float qual=u.r.expectedErrors(true, 0)/len; 1716 qual=(qual+.001f)*(1+10*deviation); 1717 if(qual<currentBestQuality || best==null){ 1718 currentBestQuality=qual; 1719 best=u; 1720 } 1721 } 1722 } 1723 if(clearList){ 1724 alu.clear(); 1725 alu.add(best); 1726 } 1727 return best; 1728 } 1729 hash(byte[] bases)1730 public static long hash(byte[] bases){ 1731 long code=bases.length; 1732 for(int i=0; i<bases.length; i++){ 1733 byte b=bases[i]; 1734 int mode=(int)(code&31); 1735 assert(hashcodes[b]!=null) : "Invalid sequence character: '"+(char)b+"'"; 1736 code=code^hashcodes[b][mode]; 1737 code=Long.rotateLeft(code, 1); 1738 } 1739 return code; 1740 } 1741 1742 hashReversed(byte[] bases)1743 public static long hashReversed(byte[] bases){ 1744 long code=bases.length; 1745 for(int i=bases.length-1; i>=0; i--){ 1746 byte b=bases[i]; 1747 assert(hashcodesR[b]!=null) : "Invalid sequence character: '"+(char)b+"'"; 1748 int mode=(int)(code&31); 1749 code=code^hashcodesR[b][mode]; 1750 code=Long.rotateLeft(code, 1); 1751 } 1752 return code; 1753 } 1754 makeCodes(int symbols, int modes, long seed)1755 private static synchronized long[][] makeCodes(int symbols, int modes, long seed){ 1756 Random randy=Shared.threadLocalRandom(seed); 1757 long[][] r=new long[symbols][modes]; 1758 for(int i=0; i<symbols; i++){ 1759 for(int j=0; j<modes; j++){ 1760 r[i][j]=randy.nextLong(); 1761 } 1762 } 1763 return r; 1764 } 1765 1766 /** Handles IUPAC codes and invalid symbols */ makeCodes2(int modes, long seed)1767 private static synchronized long[][] makeCodes2(int modes, long seed){ 1768 long[][] r=makeCodes(128, modes, seed); 1769 1770 for(int i=0; i<26; i++){ 1771 char c=(char)('A'+i); 1772 r[Tools.toLowerCase(c)]=r[c]; 1773 } 1774 return r; 1775 } 1776 addDupe(Read r)1777 private void addDupe(Read r){ 1778 if(dupeWriter==null){return;} 1779 if(r.mate==null || r.pairnum()==0){ 1780 synchronized(dupeWriter){ 1781 dupeWriter.println(r); 1782 if(r.mate!=null){ 1783 dupeWriter.println(r.mate); 1784 } 1785 } 1786 } 1787 } 1788 1789 1790 private final class MstThread extends Thread{ 1791 MstThread()1792 public MstThread(){} 1793 1794 @Override run()1795 public void run(){ 1796 1797 ArrayList<Unit> cluster=null; 1798 while((cluster=nextCluster())!=null){ 1799 makeMst(cluster); 1800 processedT.add(cluster); 1801 } 1802 1803 } 1804 makeMst(ArrayList<Unit> cluster)1805 public void makeMst(ArrayList<Unit> cluster){ 1806 assert(heap.isEmpty()); 1807 unvisit(cluster); 1808 for(Unit u : cluster){ 1809 u.flags&=~Unit.VISIT_MASK; 1810 Shared.sort(u.overlapList); 1811 } 1812 { 1813 Unit u=cluster.get(0); 1814 u.setVisited(true); 1815 heap.addAll(u.overlapList); 1816 } 1817 // assert(false) : cluster.size(); 1818 while(!heap.isEmpty()){ 1819 Overlap o=heap.poll(); 1820 assert(!o.mst()); 1821 if(!o.invalid()){ 1822 // assert(o.u1.overlapList.contains(o)); //slow 1823 // assert(o.u2.overlapList.contains(o)); //slow 1824 assert(o.u1.visited() || o.u2.visited()); 1825 final Unit u=(!o.u1.visited() ? o.u1 : !o.u2.visited()? o.u2 : null); 1826 if(u!=null){ 1827 o.setMst(true); 1828 u.setVisited(true); 1829 overlapsRetainedT++; 1830 overlapBasesRetainedT+=o.overlapLen; 1831 for(Overlap o2 : u.overlapList){ 1832 if(o2.mst()){ 1833 //do nothing 1834 }else if(!o2.u1.visited() || !o2.u2.visited()){ 1835 if(heap.size()>=Integer.MAX_VALUE){ 1836 removeInvalid(heap); 1837 } 1838 heap.add(o2); 1839 }else if(!o2.invalid()){ 1840 o2.setInvalid(true); 1841 overlapsRemovedT++; 1842 overlapBasesRemovedT+=o2.overlapLen; 1843 } 1844 } 1845 } 1846 } 1847 } 1848 for(Unit u : cluster){ 1849 ArrayList<Overlap> alo=u.overlapList; 1850 int removed=0; 1851 for(int i=0; i<alo.size(); i++){ 1852 Overlap o=alo.get(i); 1853 if(o.invalid()){ 1854 assert(!o.mst()); 1855 alo.set(i, null); 1856 removed++; 1857 }else{ 1858 assert(o.mst()); 1859 } 1860 } 1861 if(removed>0){ 1862 Tools.condenseStrict(alo); 1863 alo.trimToSize(); 1864 } 1865 } 1866 } 1867 removeInvalid(PriorityQueue<Overlap> heap)1868 private void removeInvalid(PriorityQueue<Overlap> heap){ 1869 ArrayList<Overlap> valid=new ArrayList<Overlap>(heap.size()); 1870 for(Overlap o : heap){ 1871 if(!o.invalid()){ 1872 assert(!o.u1.visited() || !o.u2.visited()); 1873 valid.add(o); 1874 } 1875 } 1876 heap.clear(); 1877 heap.addAll(valid); 1878 } 1879 1880 1881 public long overlapsRemovedT=0; 1882 public long overlapBasesRemovedT=0; 1883 public long overlapsRetainedT=0; 1884 public long overlapBasesRetainedT=0; 1885 1886 private final PriorityQueue<Overlap> heap=new PriorityQueue<Overlap>((1<<16)-1); 1887 private ArrayList<ArrayList<Unit>> processedT=new ArrayList<ArrayList<Unit>>(); 1888 } 1889 1890 1891 /** 1892 * Processes clustered sets of reads. 1893 * @author Brian Bushnell 1894 * @date Aug 9, 2013 1895 * 1896 */ 1897 private final class ClusterThread extends Thread{ 1898 ClusterThread(boolean fixMultiJoins_, boolean removeCycles_, boolean fixOffsetContradictions_, boolean mergeClusters_, boolean mergeLeaves_, boolean mergeInner_)1899 public ClusterThread(boolean fixMultiJoins_, boolean removeCycles_, 1900 boolean fixOffsetContradictions_, boolean mergeClusters_, boolean mergeLeaves_, boolean mergeInner_){ 1901 fixMultiJoinsT=fixMultiJoins_; 1902 fixOffsetContradictionsT=fixOffsetContradictions_; 1903 mergeClustersT=mergeClusters_; 1904 mergeLeavesT=mergeLeaves_; 1905 mergeInnerT=mergeInner_; 1906 1907 // assert(false) : fixMultiJoinsT+", "+canonicizeT+", "+fixCanonContradictionsT+", "+mergeLeavesT+", "+mergeInnerT; 1908 bandy=(maxEdits>0 ? BandedAligner.makeBandedAligner(bandwidth) : null); 1909 // assert(false) : fixMultiJoinsT+", "+canonicizeT+", "+fixCanonContradictionsT+", "+fixOffsetContradictionsT+", "+mergeClustersT+", "+removeCycles_; 1910 } 1911 1912 @Override run()1913 public void run(){ 1914 1915 final ArrayList<Unit> temp=new ArrayList<Unit>(1000); 1916 1917 ArrayList<Unit> cluster=null; 1918 while((cluster=nextCluster())!=null){ 1919 1920 if(EA){ 1921 for(Unit u : cluster){assert(u.r.mate==null) : "Cluster processing/merging is not supported for paired reads, only cluster generation.";} 1922 } 1923 1924 // for(Unit u : cluster){assert(!u.visited());} 1925 unvisit(cluster); 1926 1927 reorderClusterBreadthFirst(cluster); 1928 int multiJoinCount=findMultiJoinsInCluster(cluster, fixMultiJoinsT); 1929 1930 if(EA){ 1931 for(Unit u : cluster){assert(!u.visited());} 1932 } 1933 1934 boolean ok=true; 1935 if(multiJoinCount!=0){ 1936 assert(multiJoinCount>0); 1937 multiJoinsFoundT+=multiJoinCount; 1938 if(!fixMultiJoinsT){ 1939 multiJoinFailuresT++; 1940 ok=false; 1941 } 1942 } 1943 1944 int cycleOverlaps=0; 1945 if(ok){ 1946 cycleOverlaps=findCycles(cluster, removeCycles); 1947 for(Unit u : cluster){assert(!u.visited());} 1948 if(cycleOverlaps>0){ 1949 cycleOverlapsT+=cycleOverlaps; 1950 cycleClustersT++; 1951 } 1952 } 1953 1954 int offsetContradictions=0; 1955 if(ok && fixOffsetContradictionsT){ 1956 if(EA){ 1957 for(Unit u : cluster){ 1958 assert(!u.visited()); 1959 assert(!u.offsetContradiction()); 1960 assert(!u.offsetValid()); 1961 if(u.overlapList!=null){ 1962 for(Overlap o : u.overlapList){ 1963 assert(!o.invalid()); 1964 assert(!o.offsetContradiction()); 1965 assert(o.type==FORWARD) : o; 1966 } 1967 } 1968 } 1969 } 1970 offsetContradictions=generateOffsetsBreadthFirst(cluster, temp); 1971 // System.err.println("Made offsets for cluster of size "+cluster.size()+"; contradictions = "+offsetContradictions+"; set = "+temp.size()); 1972 temp.clear(); 1973 for(Unit u : cluster){assert(!u.visited());} 1974 if(offsetContradictions>0){ 1975 offsetContradictoryOverlapsT+=offsetContradictions; 1976 offsetContradictoryClustersT++; 1977 if(fixOffsetContradictionsT){ 1978 if(verbose){System.err.println("Pruning cluster to remove offset contradictions.");} 1979 fullyPruneCluster(cluster, temp); 1980 if(verbose){System.err.println("Resulting size: "+cluster.size());} 1981 if(EA){ 1982 for(Unit u : cluster){ 1983 assert(!u.visited()); 1984 assert(!u.offsetContradiction()); 1985 assert(u.offsetValid()); 1986 if(u.overlapList!=null){ 1987 for(Overlap o : u.overlapList){ 1988 assert(!o.invalid()); 1989 assert(!o.offsetContradiction()); 1990 assert(o.type==FORWARD) : o; 1991 } 1992 } 1993 } 1994 } 1995 }else{ 1996 ok=false; 1997 } 1998 } 1999 if(ok){Shared.sort(cluster, UNIT_OFFSET_COMPARATOR);} 2000 } 2001 2002 if(ok && absorbOverlap){ 2003 mergeCluster(cluster); 2004 } 2005 2006 processedClustersT.add(cluster); 2007 if(processedClustersT.size()>=threadMaxReadsToBuffer){ 2008 synchronized(processedClusters){ 2009 processedClusters.addAll(processedClustersT); 2010 processedClustersT.clear(); 2011 } 2012 } 2013 } 2014 synchronized(processedClusters){ 2015 processedClusters.addAll(processedClustersT); 2016 processedClustersT.clear(); 2017 } 2018 } 2019 fullyPruneCluster(ArrayList<Unit> cluster, ArrayList<Unit> temp)2020 private void fullyPruneCluster(ArrayList<Unit> cluster, ArrayList<Unit> temp){ 2021 assert(cluster.size()>1) : cluster.size(); 2022 ArrayList<Unit> pruned=pruneCluster(cluster, true, true, temp); 2023 assert(temp.isEmpty()); 2024 assert(pruned==null || pruned.size()>0); 2025 while(pruned!=null){ 2026 ArrayList<Unit> subcluster=pruned; 2027 for(Unit u : subcluster){ 2028 u.clearVolatileFlags(); 2029 if(u.overlapList!=null){ 2030 for(Overlap o : u.overlapList){ 2031 o.clearVolatileFlags(); 2032 } 2033 } 2034 } 2035 assert(subcluster.size()>0); 2036 pruned=pruneCluster(subcluster, false, false, temp); 2037 assert(temp.isEmpty()); 2038 assert(pruned==null || pruned.size()>0); 2039 assert(subcluster.size()>0); 2040 if(subcluster.size()==1){ 2041 processedClustersT.add(subcluster); 2042 }else{ 2043 assert(subcluster.size()>1); 2044 synchronized(clusterQueue){ 2045 clusterQueue.add(subcluster); 2046 } 2047 } 2048 } 2049 } 2050 2051 /** 2052 * @param cluster 2053 */ mergeCluster(ArrayList<Unit> cluster)2054 private void mergeCluster(ArrayList<Unit> cluster) { 2055 if(cluster.size()==1){return;} 2056 if(mergeLeavesT){ 2057 mergeLeaves(cluster); 2058 } 2059 if(mergeInnerT){ 2060 mergeInner(cluster); 2061 } 2062 } 2063 2064 /** 2065 * Finds places in the cluster where two Units are joined by multiple different Overlaps. 2066 * Returns number of multijoins found. 2067 * @param cluster 2068 */ findMultiJoinsInCluster(ArrayList<Unit> cluster, boolean resolveProblems)2069 private int findMultiJoinsInCluster(ArrayList<Unit> cluster, boolean resolveProblems) { 2070 if(cluster.size()<2){return 0;} 2071 int totalMultiJoins=0; 2072 for(Unit ua : cluster){ 2073 ArrayList<Overlap> list=ua.overlapList; 2074 assert(list!=null); 2075 if(list.size()>1){ 2076 Shared.sort(list); 2077 2078 int multiJoins=0; 2079 for(int i=0; i<list.size(); i++){ 2080 Overlap o=list.get(i); 2081 Unit ub=(o.u1==ua ? o.u2 : o.u1); 2082 assert(ua!=ub); 2083 assert(ua==o.u1 || ua==o.u2); 2084 if(ub.visited()){ 2085 multiJoins++; 2086 multiJoinBasesFoundT+=o.overlapLen; 2087 if(!o.multiJoin()){o.setMultiJoin(true);} 2088 if(resolveProblems){list.set(i, null);} 2089 }else{ 2090 ub.setVisited(true); 2091 } 2092 } 2093 2094 if(multiJoins>0){ 2095 totalMultiJoins+=multiJoins; 2096 if(resolveProblems){Tools.condenseStrict(list);} 2097 } 2098 2099 for(int i=0; i<list.size(); i++){ 2100 Overlap o=list.get(i); 2101 Unit ub=(o.u1==ua ? o.u2 : o.u1); 2102 assert(ua!=ub); 2103 assert(ua==o.u1 || ua==o.u2); 2104 assert(ub.visited()); 2105 ub.setVisited(false); 2106 } 2107 } 2108 2109 } 2110 2111 return totalMultiJoins; 2112 } 2113 pruneCluster(ArrayList<Unit> cluster, boolean pruneContradictoryNodes, boolean pruneContradictoryOverlaps, ArrayList<Unit> visited)2114 private ArrayList<Unit> pruneCluster(ArrayList<Unit> cluster, boolean pruneContradictoryNodes, boolean pruneContradictoryOverlaps, ArrayList<Unit> visited){ 2115 if(verbose){System.err.println("pruneCluster(size="+cluster.size()+", "+pruneContradictoryNodes+", "+pruneContradictoryOverlaps+")");} 2116 2117 //pruneContradictoryOverlaps is less strict than pruneContradictoryNodes 2118 assert(pruneContradictoryOverlaps || !pruneContradictoryNodes); 2119 2120 for(Unit ua : cluster){ 2121 assert(!ua.visited()); 2122 assert(ua.isPerfectlyTransitive()) : ua; 2123 if(ua.visited()){ua.setVisited(false);} 2124 } 2125 2126 int prunedOverlaps=0; 2127 int visits=1; 2128 2129 { 2130 final Unit root=cluster.get(0); 2131 assert(!root.contradiction()); 2132 root.setVisited(true); 2133 visited.add(root); 2134 } 2135 2136 for(int i=0; i<visited.size(); i++){ 2137 Unit ua=visited.get(i); 2138 2139 if(ua.visited() && (!ua.contradiction() || !pruneContradictoryNodes)){ 2140 ArrayList<Overlap> list=ua.overlapList; 2141 if(list!=null){ 2142 int removed=0; 2143 for(int j=0; j<list.size(); j++){ 2144 Overlap o=list.get(j); 2145 Unit ub=(o.u1==ua ? o.u2 : o.u1); 2146 assert(o.u1==ua || o.u2==ua); 2147 assert(ua!=ub); 2148 assert(ub.valid()); 2149 2150 assert(!o.offsetContradiction() || (ua.offsetContradiction() || ub.offsetContradiction())) : 2151 "\n"+o.offsetContradiction()+", "+ua.offsetContradiction()+", "+ub.offsetContradiction(); 2152 2153 // assert(o.contradiction()==(ua.contradiction() && ub.contradiction())) : 2154 // "\n"+o.canonContradiction()+", "+o.offsetContradiction()+ 2155 // "\n"+ua.canonContradiction()+", "+ua.offsetContradiction()+ 2156 // "\n"+ub.canonContradiction()+", "+ub.offsetContradiction(); 2157 2158 final boolean remove=(pruneContradictoryNodes && ub.contradiction() || (pruneContradictoryOverlaps && o.contradiction())); 2159 if(!remove && !ub.visited()){ 2160 ub.setVisited(true); 2161 visited.add(ub); 2162 visits++; 2163 } 2164 2165 if(remove){ 2166 if(!o.invalid()){o.setInvalid(true);} 2167 list.set(j, null); 2168 removed++; 2169 prunedOverlaps++; 2170 }else{ 2171 assert(!o.invalid()); 2172 } 2173 } 2174 if(removed>0){Tools.condenseStrict(list);} 2175 } 2176 } 2177 } 2178 2179 if(verbose){System.err.println("cluster.size()="+cluster.size()+", visits="+visits+", visited.size()="+visited.size());} 2180 2181 final int numUnvisited=cluster.size()-visits; 2182 ArrayList<Unit> pruned=(numUnvisited==0 ? null : new ArrayList<Unit>(numUnvisited)); 2183 assert(visits==visited.size()); 2184 assert(visits>=1 && visits<=cluster.size()); 2185 2186 if(visits<cluster.size()){ 2187 pruned=new ArrayList<Unit>(cluster.size()-visits); 2188 for(Unit ua : cluster){ 2189 if(!ua.visited()){ 2190 pruned.add(ua); 2191 ArrayList<Overlap> list=ua.overlapList; 2192 if(list!=null){ 2193 int removed=0; 2194 for(int j=0; j<list.size(); j++){ 2195 Overlap o=list.get(j); 2196 Unit ub=(o.u1==ua ? o.u2 : o.u1); 2197 assert(o.u1==ua || o.u2==ua); 2198 assert(ua!=ub); 2199 assert(ub.valid()); 2200 2201 if(ub.visited() || o.invalid()){ 2202 assert(ub.visited()==o.invalid()) : "\n"+o+"\n"+ub; 2203 list.set(j, null); 2204 removed++; 2205 } 2206 } 2207 if(removed>0){Tools.condenseStrict(list);} 2208 } 2209 } 2210 } 2211 assert(pruned.size()==numUnvisited); 2212 }else{ 2213 assert(prunedOverlaps==0) : "If this fails then I may need to mark overlaps to remove."; 2214 } 2215 for(Unit u : cluster){ 2216 assert(u.isPerfectlyTransitive()) : u; 2217 if(EA){ 2218 if(u.overlapList!=null){ 2219 for(Overlap o : u.overlapList){assert(!o.invalid());} 2220 } 2221 } 2222 if(u.visited()){u.setVisited(false);} 2223 } 2224 cluster.clear(); 2225 cluster.addAll(visited); 2226 cluster.trimToSize(); 2227 2228 // for(Unit u : cluster){ 2229 //// assert(u.canonicized()); 2230 // for(Overlap o : u.overlapList){ 2231 // assert(pruned==null || !pruned.contains(o.u1)); 2232 // assert(pruned==null || !pruned.contains(o.u2)); 2233 // assert(cluster.contains(o.u1)); 2234 // assert(cluster.contains(o.u2)); 2235 // } 2236 // } 2237 // if(pruned!=null){ 2238 // for(Unit u : pruned){ 2239 // for(Overlap o : u.overlapList){ 2240 // assert(pruned.contains(o.u1)); 2241 // assert(pruned.contains(o.u2)); 2242 // assert(!cluster.contains(o.u1)); 2243 // assert(!cluster.contains(o.u2)); 2244 // } 2245 // } 2246 // } 2247 2248 visited.clear(); 2249 return pruned; 2250 } 2251 2252 /** 2253 * Cluster should already be ordered breadth-first 2254 * This may fail because removing cycles could change breadth-first traversal, but if it fails, an assertion will be thrown 2255 * @param cluster 2256 */ findCycles(ArrayList<Unit> cluster, boolean remove)2257 private int findCycles(ArrayList<Unit> cluster, boolean remove){ 2258 2259 { 2260 final Unit root=cluster.get(0); 2261 assert(root.length()>=cluster.get(cluster.size()-1).length()); 2262 root.setVisited(true); 2263 } 2264 int cycles=0; 2265 2266 for(Unit ua : cluster){ 2267 assert(ua.visited()); 2268 ArrayList<Overlap> list=ua.overlapList; 2269 if(list!=null){ 2270 int removed=0; 2271 for(int i=0; i<list.size(); i++){ 2272 Overlap o=list.get(i); 2273 Unit ub=(o.u1==ua ? o.u2 : o.u1); 2274 assert(o.u1==ua || o.u2==ua); 2275 assert(ua!=ub); 2276 assert(ub.valid()); 2277 2278 if(!o.visited()){ 2279 o.setVisited(true); 2280 if(ub.visited()){ 2281 if(!o.cyclic()){ 2282 o.setCyclic(true); 2283 cycles++; 2284 } 2285 }else{ 2286 ub.setVisited(true); 2287 } 2288 } 2289 if(remove && o.cyclic()){ 2290 list.set(i, null); 2291 removed++; 2292 } 2293 } 2294 if(removed>0){Tools.condenseStrict(list);} 2295 } 2296 } 2297 2298 for(Unit u : cluster){ 2299 if(u.visited()){u.setVisited(false);} 2300 if(u.overlapList!=null){ 2301 for(Overlap o : u.overlapList){ 2302 if(o.visited()){o.setVisited(false);} 2303 } 2304 } 2305 } 2306 2307 return cycles; 2308 } 2309 2310 /** 2311 * Cluster should already be ordered breadth-first 2312 * @param cluster 2313 */ generateOffsetsBreadthFirst(ArrayList<Unit> cluster, ArrayList<Unit> temp)2314 private int generateOffsetsBreadthFirst(ArrayList<Unit> cluster, ArrayList<Unit> temp){ 2315 2316 2317 assert(temp!=null); 2318 assert(temp.isEmpty()); 2319 { 2320 final Unit root=cluster.get(0); 2321 assert(root.length()>=cluster.get(cluster.size()-1).length()); 2322 root.setOffset(0); 2323 temp.add(root); 2324 } 2325 2326 int contradictions=0; 2327 for(int i=0; i<temp.size(); i++){ 2328 Unit u=temp.get(i); 2329 assert(!u.visited()) : i; 2330 assert(u.offsetValid() || contradictions>0) : i+", "+temp.size()+", "+contradictions+"\n"+toString(temp); 2331 if(u.offsetValid() && !u.offsetContradiction()){ 2332 contradictions+=setOffsetsNeighbors(u, temp); 2333 assert(contradictions==0 || (i>0 && temp.size()>2)); 2334 } 2335 } 2336 2337 int min=0; 2338 for(Unit u : temp){ 2339 if(u.visited()){u.setVisited(false);} 2340 if(u.overlapList!=null){ 2341 for(Overlap o : u.overlapList){ 2342 if(o.visited()){o.setVisited(false);} 2343 } 2344 } 2345 if(u.offsetValid() && !u.offsetContradiction()){ 2346 min=Tools.min(min, u.offset()); 2347 } 2348 } 2349 2350 if(verbose){ 2351 System.err.println("min offset = "+min); 2352 } 2353 2354 for(Unit u : temp){ 2355 if(u.offsetValid()){ 2356 if(verbose){System.err.println("Set "+u.name()+" offset from "+u.offset+" to "+(u.offset-min));} 2357 u.offset=u.offset-min; 2358 } 2359 } 2360 2361 2362 return contradictions; 2363 } 2364 2365 /** 2366 * @param root 2367 */ setOffsetsNeighbors(final Unit root, final ArrayList<Unit> temp)2368 private int setOffsetsNeighbors(final Unit root, final ArrayList<Unit> temp) { 2369 if(verbose){System.err.println("\nsetOffsetsNeighbors("+root.name()+")\nroot.code1="+root.code1+"\n");} 2370 assert(root.valid()); 2371 assert(!root.visited()); 2372 assert(root.offsetValid()); 2373 assert(!root.offsetContradiction()); 2374 root.setVisited(true); 2375 if(root.overlapList==null){return 0;} 2376 final int contradictions=countOffsetContradictions(root, false); 2377 if(verbose){System.err.println("\ncontradictions="+contradictions);} 2378 for(Overlap o : root.overlapList){ 2379 Unit u=(o.u1==root ? o.u2 : o.u1); 2380 assert(o.u1==root || o.u2==root); 2381 assert(root!=u); 2382 assert(u.valid()); 2383 2384 if(verbose){System.err.println("\nProcessing Overlap "+o);} 2385 if(!o.visited() && !o.offsetContradiction()){ 2386 o.setVisited(true); 2387 if(!u.offsetContradiction()){ 2388 if(verbose){System.err.println("Calling setOffset: "+o);} 2389 if(!u.offsetValid()){temp.add(u);} 2390 boolean b=setOffset(root, u, o); 2391 if(verbose){System.err.println("Finished setOffset: "+o);} 2392 2393 // if(x>0){ 2394 // if(verbose){System.err.println("\n*********************************************");} 2395 // if(verbose){System.err.println("Problem detected with contig "+u.name());} 2396 // if(verbose){System.err.println("*********************************************\n");} 2397 // verbose=true; 2398 // int y2=countOffsetContradictions(root, false); 2399 // assert(contradictions==y2); 2400 // } 2401 2402 assert(b) : "\n"+contradictions+", "+o.offsetContradiction()+", "+root.offsetContradiction()+", "+u.offsetContradiction()+"\n" 2403 +root.offsetValid()+", "+u.offsetValid()+", "+OVERLAP_TYPE_NAMES[o.type]+"\n"+b 2404 +fixMultiJoins; //This assertion can fail if a multijoin is present 2405 assert(u.offsetValid()); 2406 } 2407 } 2408 } 2409 return contradictions; 2410 } 2411 countOffsetContradictions(Unit root, boolean includeKnown)2412 private int countOffsetContradictions(Unit root, boolean includeKnown){ 2413 if(verbose){System.err.println("\ncountContradictions("+root.name()+", "+includeKnown+")\nroot.code1="+root.code1+"\n");} 2414 assert(root.valid()); 2415 assert(root.visited()); 2416 assert(root.offsetValid()); 2417 // assert(!root.offsetContradiction()); 2418 if(root.overlapList==null){return 0;} 2419 int contradictions=0; 2420 for(Overlap o : root.overlapList){ 2421 Unit u=(o.u1==root ? o.u2 : o.u1); 2422 assert(o.u1==root || o.u2==root); 2423 assert(root!=u); 2424 assert(u.valid()); 2425 2426 if(verbose){System.err.println("\nOverlap "+o+"\nu="+u.name()+", offsetValid="+u.offsetValid());} 2427 2428 boolean contradictory=(u.offsetValid() && u.offset()!=calcOffset(root, u, o)); 2429 if(verbose){System.err.println("contradictory= \t"+contradictory);} 2430 if(contradictory){ 2431 if(includeKnown || !u.offsetContradiction()){ 2432 contradictions++; 2433 if(!root.offsetContradiction()){root.setOffsetContradiction(true);} 2434 } 2435 if(!o.offsetContradiction()){o.setOffsetContradiction(true);} 2436 if(!u.offsetContradiction()){u.setOffsetContradiction(true);} 2437 } 2438 assert(contradictory==o.offsetContradiction()) : contradictory+", "+o.offsetContradiction(); 2439 if(verbose){ 2440 System.err.println("root.offsetContradiction()=\t"+root.offsetContradiction()); 2441 System.err.println("u.offsetContradiction()= \t"+u.offsetContradiction()); 2442 System.err.println("o.offsetContradiction()= \t"+o.offsetContradiction()); 2443 System.err.println("contradictions= \t"+contradictions); 2444 } 2445 } 2446 if(verbose){System.err.println("Final contradictions="+contradictions+"\n");} 2447 return contradictions; 2448 } 2449 toString(ArrayList<Unit> cluster)2450 private String toString(ArrayList<Unit> cluster){ 2451 for(int i=0; i<cluster.size(); i++){ 2452 Unit u=cluster.get(i); 2453 u.r.id=""+i; 2454 } 2455 StringBuilder sb=new StringBuilder(1000); 2456 for(Unit u : cluster){ 2457 sb.append(">"+u.name()+"\n"); 2458 sb.append(new String(u.bases())); 2459 sb.append("\n"); 2460 } 2461 sb.append("\n*****\n"); 2462 for(Unit u : cluster){ 2463 sb.append("\n"+u.name()+":"); 2464 if(u.overlapList!=null){ 2465 for(Overlap o : u.overlapList){ 2466 Unit ub=(o.u1==u ? o.u2 : o.u1); 2467 sb.append(" "+ub.name()); 2468 } 2469 } 2470 } 2471 sb.append("\n"); 2472 return sb.toString(); 2473 } 2474 toShortString(ArrayList<Unit> cluster)2475 private String toShortString(ArrayList<Unit> cluster){ 2476 for(int i=0; i<cluster.size(); i++){ 2477 Unit u=cluster.get(i); 2478 u.r.id=""+i; 2479 } 2480 StringBuilder sb=new StringBuilder(1000); 2481 for(Unit u : cluster){ 2482 sb.append("\n"+u.name()+":"); 2483 if(u.overlapList!=null){ 2484 for(Overlap o : u.overlapList){ 2485 Unit ub=(o.u1==u ? o.u2 : o.u1); 2486 sb.append(" "+ub.name()); 2487 } 2488 } 2489 } 2490 sb.append("\n"); 2491 return sb.toString(); 2492 } 2493 2494 2495 /** 2496 * @param root 2497 * @param u2 2498 * @param o 2499 * @return Number of contradictions 2500 */ canonicize(final Unit root, final Unit u2, final Overlap o)2501 private int canonicize(final Unit root, final Unit u2, final Overlap o){ 2502 if(o.type==FORWARD){return 0;} 2503 if(o.type==REVERSE){o.reverseDirection();} 2504 assert(o.type==FORWARD); 2505 assert(o.test(bandy, o.edits+maxEdits)); 2506 return 0; 2507 } 2508 2509 2510 /** 2511 * @param root 2512 * @param u2 2513 * @param o 2514 * @return true if no contradictions 2515 */ setOffset(final Unit root, final Unit u2, final Overlap o)2516 private boolean setOffset(final Unit root, final Unit u2, final Overlap o){ 2517 assert(root.offsetValid()); 2518 assert(!root.offsetContradiction()); 2519 int offset=calcOffset(root, u2, o); 2520 2521 if(u2.offsetValid()){return u2.offset()==offset;} 2522 u2.setOffset(offset); 2523 2524 if(verbose){ 2525 System.err.println("\nroot = "+(root.name()==null ? root.r.numericID+"" : root.name())+", u2 = "+(u2.name()==null ? u2.r.numericID+"" : u2.name()) 2526 +"\no = "+o 2527 +"\nroot.offset = "+root.offset() 2528 +"\nu2.offset = "+u2.offset()); 2529 } 2530 2531 return true; 2532 } 2533 2534 calcOffset(final Unit root, final Unit ub, final Overlap o)2535 private int calcOffset(final Unit root, final Unit ub, final Overlap o){ 2536 assert(root.offsetValid()); 2537 if(o.type==FORWARD){ 2538 if(root==o.u1){ 2539 int dif=o.start1-o.start2; 2540 if(verbose){System.err.println("root==o.u1=="+root.name()+", start1="+o.start1+"; u2==o.u2=="+ub.name()+", start2="+o.start2+", dif="+dif);} 2541 return root.offset+dif; 2542 }else{ 2543 int dif=o.start2-o.start1; 2544 if(verbose){System.err.println("root==o.u2=="+root.name()+", start2="+o.start2+"; u2==o.u1=="+ub.name()+", start1="+o.start1+", dif="+dif);} 2545 return root.offset+dif; 2546 } 2547 }else{ 2548 assert(false) : o; 2549 throw new RuntimeException("TODO"); 2550 } 2551 } 2552 2553 2554 /** 2555 * @param cluster 2556 */ mergeLeaves(ArrayList<Unit> cluster)2557 private void mergeLeaves(ArrayList<Unit> cluster) { 2558 assert(false) : "TODO"; 2559 for(Unit u : cluster){ 2560 2561 } 2562 } 2563 2564 /** 2565 * @param cluster 2566 */ mergeInner(ArrayList<Unit> cluster)2567 private void mergeInner(ArrayList<Unit> cluster) { 2568 assert(false) : "TODO"; 2569 for(Unit u : cluster){ 2570 2571 } 2572 } 2573 2574 private ArrayList<ArrayList<Unit>> processedClustersT=new ArrayList<ArrayList<Unit>>(threadMaxReadsToBuffer); 2575 2576 long leafMergesT=0; 2577 long innerMergesT=0; 2578 long leafBaseMergesT=0; 2579 long innerBaseMergesT=0; 2580 2581 long multiJoinFailuresT=0; 2582 long multiJoinsFoundT=0; 2583 long multiJoinBasesFoundT=0; 2584 long unitsFlippedT=0; 2585 long overlapsFlippedT=0; 2586 long offsetContradictoryOverlapsT=0; 2587 long offsetContradictoryClustersT=0; 2588 long cycleOverlapsT=0; 2589 long cycleClustersT=0; 2590 2591 private final boolean fixMultiJoinsT; 2592 private final boolean fixOffsetContradictionsT; 2593 private final boolean mergeClustersT; 2594 private final boolean mergeLeavesT; 2595 private final boolean mergeInnerT; 2596 private final BandedAligner bandy; 2597 } 2598 2599 2600 /** 2601 * @param cluster 2602 */ unvisit(ArrayList<Unit> cluster)2603 private void unvisit(ArrayList<Unit> cluster) { 2604 for(Unit u : cluster){ 2605 if(u.visited()){u.setVisited(false);} 2606 } 2607 } 2608 2609 /** 2610 * @param cluster 2611 */ reorderClusterBreadthFirst(ArrayList<Unit> cluster)2612 private void reorderClusterBreadthFirst(ArrayList<Unit> cluster) { 2613 if(verbose){System.err.println("reorderClusterBreadthFirst");} 2614 2615 final int size=cluster.size(); 2616 Shared.sort(cluster); //Now it is in descending length 2617 final Unit root=cluster.get(0); 2618 assert(root.length()>=cluster.get(size-1).length()) : root.length()+", "+cluster.get(size-1).length()+", "+root.compareTo(cluster.get(size-1)); 2619 2620 ArrayList<Unit> breadthFirst=new ArrayList<Unit>(cluster.size()); 2621 root.setVisited(true); 2622 // System.err.println("root="+root.name()); 2623 breadthFirst.add(root); 2624 for(int i=0; i<breadthFirst.size(); i++){ 2625 Unit u=breadthFirst.get(i); 2626 Shared.sort(u.overlapList); //Sorted in descending overlap length 2627 if(u.overlapList!=null){ 2628 for(Overlap o : u.overlapList){ 2629 if(!o.u1.visited()){ 2630 // System.err.println("Visiting "+o.u1.name()); 2631 o.u1.setVisited(true); 2632 breadthFirst.add(o.u1); 2633 } 2634 if(!o.u2.visited()){ 2635 // System.err.println("Visiting "+o.u2.name()); 2636 o.u2.setVisited(true); 2637 breadthFirst.add(o.u2); 2638 } 2639 // System.err.println("***"); 2640 // System.err.println(toShortString(breadthFirst)); 2641 } 2642 } 2643 } 2644 for(Unit u : cluster){ 2645 assert(u.visited()); 2646 if(u.visited()){u.setVisited(false);} 2647 if(EA){ 2648 if(u.overlapList!=null){ 2649 for(Overlap o : u.overlapList){assert(!o.visited());} 2650 } 2651 } 2652 } 2653 // System.err.println("***"); 2654 // System.err.println("Final:"); 2655 // System.err.println(toShortString(breadthFirst)); 2656 assert(cluster.size()==breadthFirst.size()); 2657 cluster.clear(); 2658 cluster.addAll(breadthFirst); 2659 } 2660 2661 2662 2663 /** Returns next cluster larger than 1 element. 2664 * Singleton clusters are added directly to 'processed'. */ nextCluster()2665 private ArrayList<Unit> nextCluster(){ 2666 synchronized(clusterQueue){ 2667 ArrayList<Unit> cluster=clusterQueue.poll(); 2668 assert(cluster==null || cluster.size()>1); 2669 // while(cluster!=null && cluster.size()<2){ 2670 //// unmark(cluster); 2671 // processedClustersT.add(cluster); 2672 // cluster=clusterQueue.poll(); 2673 // } 2674 return cluster; 2675 } 2676 } 2677 2678 2679 /** 2680 * Creates Unit objects or uses ones already attached to reads. 2681 * Places them in local storage and percolates them to shared storage (codeMap), removing exact duplicates. 2682 * Also hashes tips and places these in shared affixMap. 2683 * Looks for containments in the affix map. 2684 * @author Brian Bushnell 2685 * @date Jul 24, 2013 2686 * 2687 */ 2688 private final class HashThread extends Thread{ 2689 HashThread(boolean addToCodeMap_, boolean addToAffixMap_, boolean findMatches_, boolean findContainments_, boolean findOverlaps_)2690 public HashThread(boolean addToCodeMap_, boolean addToAffixMap_, boolean findMatches_, boolean findContainments_, boolean findOverlaps_){ 2691 addToCodeMapT=addToCodeMap_; 2692 addToAffixMapT=addToAffixMap_; 2693 findContainmentsT=findContainments_; 2694 findOverlapsT=findOverlaps_; 2695 findMatchesT=findMatches_; 2696 tid=getTid(); 2697 crisq=new ArrayDeque<ConcurrentReadInputStream>(crisa.length); 2698 for(int i=0; i<crisa.length; i++){ 2699 // if(verbose){System.err.println("Adding to crisq.");} 2700 crisq.add(crisa[(i+tid)%crisa.length]); 2701 } 2702 bandy=(maxEdits>0 && (findOverlapsT || findContainmentsT) ? BandedAligner.makeBandedAligner(bandwidth) : null); 2703 2704 // assert(addToCodeMapT) : "addToCodeMapT="+addToCodeMapT+", addToAffixMapT="+addToAffixMapT+", findContainmentsT="+findContainmentsT+ 2705 // ", findOverlapsT="+findOverlapsT+", findMatchesT="+findMatchesT+", convertToUpperCaseT="+convertToUpperCaseT+", numAffixMaps="+numAffixMaps; 2706 } 2707 2708 @Override run()2709 public void run(){ 2710 2711 ConcurrentReadInputStream cris=crisq.poll(); 2712 2713 // String lastName=null; 2714 2715 while(cris!=null){ 2716 ListNum<Read> ln=cris.nextList(); 2717 ArrayList<Read> reads=(ln!=null ? ln.list : null); 2718 2719 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning 2720 2721 for(Read r : reads){ 2722 // lastName=r.id; 2723 processReadOuter(r); 2724 } 2725 2726 if(codeMapT!=null && (codeMapT.size()>threadMaxReadsToBuffer || basesStoredT>threadMaxBasesToBuffer)){ 2727 assert(addToCodeMapT); 2728 long added=mergeMaps(); 2729 addedToMainT+=added; 2730 } 2731 2732 cris.returnList(ln); 2733 ln=cris.nextList(); 2734 reads=(ln!=null ? ln.list : null); 2735 } 2736 cris.returnList(ln); 2737 if(codeMapT!=null && !codeMapT.isEmpty()){ 2738 long added=mergeMaps(); 2739 addedToMainT+=added; 2740 } 2741 cris=crisq.poll(); 2742 } 2743 2744 // System.err.println("Thread finished; last name = "+lastName); 2745 2746 codeMapT=null; 2747 localConflictList=null; 2748 sharedConflictList=null; 2749 } 2750 2751 /** Return true if this read was a member of this subset. */ processReadOuter(Read r1)2752 private boolean processReadOuter(Read r1){ 2753 if(r1.length()<MINSCAF){return false;} 2754 Read r2=r1.mate; 2755 2756 assert(r1.pairnum()==0); 2757 assert(r2==null || r2.pairnum()==1); 2758 2759 if(!addToCodeMapT && r1.obj==null){ 2760 if(r1.bases!=null && r1.length()>=MINSCAF){ 2761 final Unit u=(r1.obj!=null ? (Unit)r1.obj : new Unit(r1)); 2762 assert(u.r==r1 && (r1.obj==u || r1.obj==null)); 2763 final long code=u.code1; 2764 r1.obj=u; 2765 assert(u.r==r1 && r1.obj==u); 2766 if(r2!=null && r2.obj==null){r2.obj=new Unit(r2);} 2767 2768 //Check for subset membership 2769 final boolean inSet=u.inSet(); 2770 if(inSet){ 2771 final Long codeL=code; 2772 ArrayList<Unit> list=codeMap.get(codeL); 2773 boolean found=false; 2774 for(Unit u0 : list){ 2775 //Replace with existing read 2776 if(u0.equals(u) && u0.r.numericID==r1.numericID){ 2777 r1=u0.r; 2778 r2=r1.mate; 2779 found=true; 2780 break; 2781 } 2782 } 2783 assert(list!=null); 2784 if(!found){ 2785 return false; 2786 } 2787 } 2788 } 2789 } 2790 boolean b=processRead(r1); 2791 if(r2!=null){processRead(r2);} 2792 return b; 2793 } 2794 2795 /** Return true if this read was a member of this subset. */ processRead(Read r)2796 private boolean processRead(Read r){ 2797 if(r.length()<MINSCAF){return false;} 2798 2799 final boolean inSet; 2800 if(!storeName){r.id=null;} 2801 if(!storeQuality){r.quality=null;} 2802 2803 if(forceTrimLeft>0 || forceTrimRight>0){//Added at request of RQC team 2804 if(r!=null && r.length()>0){ 2805 TrimRead.trimToPosition(r, forceTrimLeft>0 ? forceTrimLeft : 0, forceTrimRight>0 ? forceTrimRight : r.length(), 1); 2806 } 2807 } 2808 if(qTrimLeft || qTrimRight){ 2809 TrimRead.trimFast(r, qTrimLeft, qTrimRight, trimQ, trimE, 0); 2810 } 2811 if(r.length()<MINSCAF){return false;} 2812 2813 readsProcessedT++; 2814 basesProcessedT+=r.length(); 2815 2816 final Unit u=(r.obj!=null ? (Unit)r.obj : new Unit(r)); 2817 assert(u.r==r && (r.obj==u || r.obj==null)); 2818 final long code=u.code1; 2819 2820 //Check for subset membership 2821 inSet=u.inSet(); 2822 2823 r.obj=u; 2824 assert(u.r==r && r.obj==u); 2825 if(r.mate!=null && r.mate.obj==null){r.mate.obj=new Unit(r.mate);} 2826 2827 if(verbose){System.err.println("Generated "+code+" for sequence "+u.name()+"\t"+new String(r.bases, 0, Tools.min(40, r.length())));} 2828 2829 if(addToCodeMapT && inSet){ 2830 final Long codeL=code; 2831 ArrayList<Unit> list=codeMapT.get(codeL); 2832 if(list==null){ 2833 if(verbose){System.err.println("Unique.");} 2834 list=new ArrayList<Unit>(1); 2835 list.add(u); 2836 basesStoredT+=r.length(); 2837 codeMapT.put(codeL, list); 2838 }else{ 2839 if(verbose){System.err.println("Exists.");} 2840 boolean match=false; 2841 if(findMatchesT){ 2842 for(Unit u2 : list){ 2843 if(pairedEqualsRC(u, u2)){ 2844 // if(u.r.mate!=null){ 2845 // verbose=true; 2846 // 2847 // Unit um=(Unit)u.r.mate.obj; 2848 // Unit u2m=(Unit)u2.r.mate.obj; 2849 // 2850 // if(verbose){ 2851 // System.err.println("********"); 2852 // System.err.println(u.r.toFastq()); 2853 // System.err.println(u.r.mate.toFastq()); 2854 // System.err.println("********"); 2855 // System.err.println(u2.r.toFastq()); 2856 // System.err.println(u2.r.mate.toFastq()); 2857 // System.err.println("********"); 2858 // System.err.println(u); 2859 // System.err.println(u2); 2860 // System.err.println(um); 2861 // System.err.println(u2m); 2862 // System.err.println("********"); 2863 // System.err.println(u.equals(u2)); 2864 // System.err.println(u.compareTo(u2)); 2865 // System.err.println("********"); 2866 // System.err.println(um.equals(u2m)); 2867 // System.err.println(um.compareTo(u2m)); 2868 // System.err.println("********"); 2869 // } 2870 // 2871 // verbose=false; 2872 // } 2873 assert(u.r.mate==null || pairedEqualsRC((Unit)u.r.mate.obj, (Unit)u2.r.mate.obj)) : 2874 u.r.toFastq()+"\n"+u2.r.toFastq()+"\n"+u.r.mate.toFastq()+"\n"+u2.r.mate.toFastq()+ 2875 "\n"+u+"\n"+u2+"\n"+u.r.mate.obj+"\n"+u2.r.mate.obj; 2876 // if(verbose){System.err.println("Matches "+new String(r2.bases, 0, Tools.min(40, r2.length())));} 2877 match=true; 2878 u2.absorbMatch(u); 2879 if(UNIQUE_ONLY){ 2880 synchronized(u2){ 2881 if(u2.valid()){ 2882 matchesT++; 2883 baseMatchesT+=u2.length(); 2884 u2.setValid(false); 2885 addDupe(u2.r); 2886 } 2887 } 2888 } 2889 break; 2890 } 2891 } 2892 } 2893 if(match){ 2894 addDupe(r); 2895 matchesT++; 2896 baseMatchesT+=r.length(); 2897 // if(verbose){System.err.println("matchesT="+matchesT+", baseMatchesT="+baseMatchesT);} 2898 }else{ 2899 collisionsT++; 2900 if(verbose){System.err.println("False collision; count = "+collisionsT);} 2901 list.add(u); 2902 basesStoredT+=r.length(); 2903 } 2904 } 2905 } 2906 2907 if(findContainmentsT){ 2908 int x=findContainments(u); 2909 } 2910 2911 if(findOverlapsT){ 2912 int x=findOverlaps(u); 2913 } 2914 2915 return inSet; 2916 } 2917 findContainments(final Unit u)2918 private int findContainments(final Unit u){ 2919 if(minLengthPercent<=0 && maxSubs<=0 && minIdentity>=100 && !u.valid()){return 0;} 2920 final byte[] bases=u.bases(); 2921 final int shift=bitsPerBase*k; 2922 final int shift2=shift-bitsPerBase; 2923 final long mask=(shift>63 ? -1L : ~((-1L)<<shift)); 2924 long kmer=0; 2925 int hits=0; 2926 int currentContainments=0; 2927 int len=0; 2928 2929 if(bases==null || bases.length<k){return -1;} 2930 final LongM key=new LongM(); 2931 2932 for(int i=0; i<bases.length; i++){ 2933 byte b=bases[i]; 2934 long x=acidToNumber[b]; 2935 kmer=((kmer<<bitsPerBase)|x)&mask; 2936 if(HASH_NS || isFullyDefined(b)){len++;} 2937 else{len=0;} 2938 // if(verbose){System.err.println("Scanning i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} 2939 if(len>=k){ 2940 key.set(kmer); //Canonical 2941 for(int am=0; am<affixMaps.length; am++){ 2942 ArrayList<Unit> list=affixMaps[am].get(key); 2943 if(list!=null){ 2944 for(Unit u2 : list){ 2945 if(u!=u2 && !u.equals(u2)){ 2946 if(u2.valid()){ 2947 hits++; 2948 if(verbose){ 2949 System.err.println("\nFound potential containment at am="+am+", i="+i+", key="+key.value()+ 2950 ", pre1="+u2.prefix1+", pre2="+u2.prefix2+ 2951 ", suf1="+u2.suffix1+", suf2="+u2.suffix2+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i, k))); 2952 } 2953 if(u.contains(u2, i, key, bandy, am)){ 2954 synchronized(u2){ 2955 if(u2.valid()){ 2956 currentContainments++; 2957 baseContainmentsT+=u2.length(); 2958 u2.setValid(false); 2959 addDupe(u2.r); 2960 } 2961 } 2962 if(UNIQUE_ONLY){ 2963 synchronized(u){ 2964 if(u.valid()){ 2965 currentContainments++; 2966 baseContainmentsT+=u.length(); 2967 u.setValid(false); 2968 addDupe(u.r); 2969 } 2970 } 2971 } 2972 2973 if(verbose){System.err.println("Added containment "+u2);} 2974 } 2975 } 2976 } 2977 } 2978 } 2979 } 2980 } 2981 } 2982 // assert(false) : hits+", "+currentContainments+", "+baseContainments+"\n"+containmentMapT+"\n"; 2983 2984 containmentCollisionsT+=(hits-currentContainments); 2985 // outstream.println("hits="+hits+", currentContainments="+currentContainments); 2986 containmentsT+=currentContainments; 2987 return hits; 2988 } 2989 findOverlaps(final Unit u)2990 private int findOverlaps(final Unit u){ 2991 // if(minLengthPercent<=0 && maxSubs<=0 && minIdentity>=100 && !u.valid()){return 0;} 2992 // if(u.overlapList!=null){u.overlapList.clear();} 2993 final byte[] bases=u.bases(); 2994 final int shift=bitsPerBase*k; 2995 final int shift2=shift-bitsPerBase; 2996 final long mask=(shift>63 ? -1L : ~((-1L)<<shift)); 2997 long kmer=0; 2998 int hits=0; 2999 int currentOverlaps=0; 3000 int len=0; 3001 3002 if(bases==null || bases.length<k){return -1;} 3003 final LongM key=new LongM(); 3004 3005 boolean quit=false; 3006 3007 for(int i=0; i<bases.length && !quit; i++){ 3008 byte b=bases[i]; 3009 long x=acidToNumber[b]; 3010 kmer=((kmer<<bitsPerBase)|x)&mask; 3011 if(HASH_NS || isFullyDefined(b)){len++;} 3012 else{len=0;} 3013 // if(verbose){System.err.println("Scanning i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} 3014 if(len>=k){ 3015 key.set(kmer); //Canonical key 3016 for(int am=0; am<affixMaps.length; am++){ 3017 ArrayList<Unit> list=affixMaps[am].get(key); 3018 if(list!=null){//found a key collision 3019 for(Unit u2 : list){ 3020 if(quit){break;}//too many edges 3021 int u1cluster=-1, u2cluster=-2; 3022 if(preventTransitiveOverlaps && u!=u2){ 3023 u1cluster=u.determineCluster(); 3024 u2cluster=u2.determineCluster(); 3025 } 3026 if(u1cluster!=u2cluster && u!=u2 && !u.equals(u2) && u2.r!=u.r.mate){//TODO: Not sure why identical things are banned... possibly relates to avoiding inter-pair edges? 3027 if(u2.valid()){ 3028 hits++; 3029 3030 // boolean flag=(u.code1==-3676200394282040623L && u2.code1==-7034423913727372751L) || 3031 // (u2.code1==-3676200394282040623L && u.code1==-7034423913727372751L); 3032 final boolean flag=false; 3033 if(verbose || flag){ 3034 System.err.println("\nFound potential overlap at am="+am+", i="+i+", key="+key.value()+ 3035 ", pre1="+u2.prefix1+", pre2="+u2.prefix2+ 3036 ", suf1="+u2.suffix1+", suf2="+u2.suffix2+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i, k))); 3037 } 3038 3039 final Overlap o; 3040 if(maxEdges>1000000000 || u.overlapList==null || u2.overlapList==null || 3041 (u.overlapList.size()<maxEdges && u2.overlapList.size()<maxEdges2)){ 3042 o=u.makeOverlap(u2, i, key, bandy, am); 3043 3044 }else{ 3045 o=null; 3046 if(u.overlapList.size()>maxEdges){quit=true;} 3047 } 3048 if(o!=null){ 3049 3050 if(preventTransitiveOverlaps){ 3051 mergeClusterIds(u1cluster, u2cluster); 3052 } 3053 3054 assert(o.test(bandy, o.edits+maxEdits)) : o; 3055 if(verbose || flag){System.err.println("Created overlap "+o);} 3056 3057 long comp=u.length()-u2.length(); 3058 if(comp==0){comp=u.code1-u2.code1;} 3059 if(comp==0){comp=u.code2-u2.code2;} 3060 if(comp==0){comp=u.prefix1-u2.prefix1;} 3061 if(comp==0){comp=u.suffix1-u2.suffix1;} 3062 if(comp==0){comp=(u.r.numericID-u2.r.numericID);} 3063 assert(comp!=0) : u+", "+u2; 3064 Unit ua=(comp<0 ? u : u2); 3065 Unit ub=(comp<0 ? u2 : u); 3066 assert(ua!=ub); 3067 if(verbose || flag){ 3068 System.err.println("ua="+ua.code1); 3069 System.err.println("ub="+ub.code1); 3070 System.err.println("u ="+u.code1); 3071 System.err.println("u2="+u2.code1); 3072 System.err.println("u.r ="+u.r.numericID); 3073 System.err.println("u2.r="+u2.r.numericID); 3074 System.err.println("ua contains o? "+ua.alreadyHas(o)); 3075 System.err.println("ub contains o? "+ub.alreadyHas(o)); 3076 System.err.println("ua.list="+ua.overlapList); 3077 System.err.println("ub.list="+ub.overlapList); 3078 } 3079 3080 // assert(ua.alreadyHas(o)==ub.alreadyHas(o)); 3081 3082 final boolean uaContainedOverlap; 3083 3084 synchronized(ua){ 3085 if(ua.overlapList==null){ua.overlapList=new ArrayList<Overlap>(2);} 3086 if(!ua.overlapList.contains(o)){ 3087 if(EA){ 3088 synchronized(ub){ 3089 assert(ub.overlapList==null || !ub.overlapList.contains(o)) : 3090 ua.alreadyHas(o)+", "+ub.alreadyHas(o)+"\n"+o+"\n"+ub.overlapList.get(ub.overlapList.indexOf(o))+ 3091 "\nua.list="+ua.overlapList+"\nub.list="+ub.overlapList+"\nu.code1="+u.code1+"\nu2.code1="+u2.code1; 3092 } 3093 } 3094 currentOverlaps++; 3095 baseOverlapsT+=o.overlapLen; 3096 ua.overlapList.add(o); 3097 if(verbose || flag){System.err.println("Added overlap "+o);} 3098 uaContainedOverlap=false; 3099 }else{ 3100 if(verbose || flag){System.err.println("Already contained overlap "+o);} 3101 hits--; 3102 uaContainedOverlap=true; 3103 } 3104 } 3105 3106 if(!uaContainedOverlap){ 3107 synchronized(ub){ 3108 if(ub.overlapList==null){ub.overlapList=new ArrayList<Overlap>(2);} 3109 assert(!ub.overlapList.contains(o)); 3110 ub.overlapList.add(o); 3111 if(verbose || flag){System.err.println("Added overlap "+o);} 3112 } 3113 }else{ 3114 if(verbose || flag){System.err.println("Already contained overlap "+o);} 3115 } 3116 3117 3118 // assert(ua.alreadyHas(o)); 3119 // assert(ub.alreadyHas(o)); 3120 // assert(ua.overlapList.contains(o)); 3121 // assert(ub.overlapList.contains(o)); 3122 if(verbose || flag){ 3123 System.err.println("ua contains o? "+ua.alreadyHas(o)); 3124 System.err.println("ub contains o? "+ub.alreadyHas(o)); 3125 System.err.println("ua.list="+ua.overlapList); 3126 System.err.println("ub.list="+ub.overlapList); 3127 } 3128 } 3129 } 3130 } 3131 } 3132 } 3133 } 3134 } 3135 } 3136 if(EA){ 3137 synchronized(u){ 3138 if(u.overlapList!=null && u.overlapList.isEmpty()){ 3139 assert(false) : "Why would this happen?"; 3140 u.overlapList=null; 3141 } 3142 } 3143 } 3144 // assert(false) : hits+", "+currentOverlaps+", "+baseOverlaps+"\n"+overlapMapT+"\n"; 3145 3146 // assert(hits==currentOverlaps) : hits+", "+currentOverlaps; 3147 3148 overlapCollisionsT+=(hits-currentOverlaps); 3149 // outstream.println("hits="+hits+", currentOverlaps="+currentOverlaps); 3150 overlapsT+=currentOverlaps; 3151 return hits; 3152 } 3153 3154 /** Insert reads processed by a thread into the shared code and affix maps. 3155 * If operating in subset mode, only store reads with code equal to subset mod subsetCount. */ mergeMaps()3156 private long mergeMaps(){ 3157 if(verbose){System.err.println("Merging maps.");} 3158 long novelReads=0, novelKeys=0; 3159 long collisionReads=0; 3160 long mergedReads=0; 3161 3162 assert(localConflictList.isEmpty()); 3163 assert(sharedConflictList.isEmpty()); 3164 3165 synchronized(codeMap){ 3166 for(Long key : codeMapT.keySet()){ 3167 if(codeMap.containsKey(key)){ 3168 localConflictList.add(codeMapT.get(key)); 3169 sharedConflictList.add(codeMap.get(key)); 3170 }else{ 3171 ArrayList<Unit> list=codeMapT.get(key); 3172 codeMap.put(key, list); 3173 addedList.addAll(list); 3174 novelReads+=list.size(); 3175 novelKeys++; 3176 } 3177 } 3178 } 3179 3180 if(verbose){System.err.println("Novel reads = "+novelReads+", conflicts = "+localConflictList.size());} 3181 3182 for(int i=0; i<localConflictList.size(); i++){ 3183 ArrayList<Unit> listT=localConflictList.get(i); 3184 ArrayList<Unit> list=sharedConflictList.get(i); 3185 synchronized(list){ 3186 for(Unit u : listT){ 3187 if(verbose){System.err.println("Processing novel unit "+u.name());} 3188 boolean match=false; 3189 if(findMatchesT){ 3190 for(Unit u2 : list){ 3191 if(pairedEqualsRC(u, u2)){ 3192 // if(verbose){System.err.println("Matches "+new String(r2.bases, 0, Tools.min(40, r2.length())));} 3193 u2.absorbMatch(u); 3194 if(UNIQUE_ONLY){ 3195 synchronized(u2){ 3196 if(u2.valid()){ 3197 mergedReads++; 3198 baseMatchesT+=u2.length(); 3199 u2.setValid(false); 3200 addDupe(u2.r); 3201 } 3202 } 3203 } 3204 match=true; 3205 break; 3206 } 3207 } 3208 } 3209 if(match){ 3210 addDupe(u.r); 3211 mergedReads++; 3212 baseMatchesT+=u.length(); 3213 if(verbose){System.err.println("matchesT="+matchesT+", baseMatchesT="+baseMatchesT);} 3214 }else{ 3215 collisionReads++; 3216 if(verbose){System.err.println("False collision; count = "+collisionReads);} 3217 list.add(u); 3218 addedList.add(u); 3219 } 3220 } 3221 } 3222 } 3223 matchesT+=mergedReads; 3224 collisionsT+=collisionReads; 3225 if(verbose){System.err.println("Done Merging.");} 3226 if(verbose){System.err.println("mapT.size="+codeMapT.size()+", basesStoredT="+basesStoredT);} 3227 3228 codeMapT.clear(); 3229 localConflictList.clear(); 3230 sharedConflictList.clear(); 3231 3232 if(!addedList.isEmpty()){ 3233 if(addToAffixMapT){ 3234 final LongM p=new LongM(-1, true); 3235 assert(affixMap1!=null || affixMap2!=null); 3236 if(affixMap1!=null && !ignoreAffix1){//Allows you to not use am1 3237 synchronized(affixMap1){ 3238 for(Unit u : addedList){ 3239 if(verbose){System.err.println("Processing affixes for "+u.name());} 3240 if(u.prefix1!=-1 || u.prefix1!=u.suffix1){ 3241 if(verbose){System.err.println("Using prefix "+u.prefix1);} 3242 p.set(u.prefix1); 3243 ArrayList<Unit> alu=affixMap1.get(p); 3244 if(alu==null){ 3245 if(verbose){System.err.println("Made new alu for "+p);} 3246 alu=new ArrayList<Unit>(2); 3247 affixMap1.put(p.iCopy(), alu); 3248 } 3249 if(alu.size()<maxAffixCopies){ 3250 if(verbose){System.err.println("Added "+u.name());} 3251 alu.add(u); 3252 } 3253 if(verbose){System.err.println(affixMap1.get(p));} 3254 } 3255 if(storeSuffix && u.prefix1!=u.suffix1){ 3256 if(verbose){System.err.println("Using suffix "+u.suffix1);} 3257 p.set(u.suffix1); 3258 ArrayList<Unit> alu=affixMap1.get(p); 3259 if(alu==null){ 3260 if(verbose){System.err.println("Made new alu for "+p);} 3261 alu=new ArrayList<Unit>(2); 3262 affixMap1.put(p.iCopy(), alu); 3263 } 3264 if(alu.size()<maxAffixCopies){ 3265 if(verbose){System.err.println("Added "+u.name());} 3266 alu.add(u); 3267 } 3268 if(verbose){System.err.println(affixMap1.get(p));} 3269 } 3270 } 3271 } 3272 } 3273 if(affixMap2!=null){ 3274 synchronized(affixMap2){ 3275 for(Unit u : addedList){ 3276 if(u.prefix2!=-1 || u.prefix2!=u.suffix2){ 3277 p.set(u.prefix2); 3278 ArrayList<Unit> alu=affixMap2.get(p); 3279 if(alu==null){ 3280 alu=new ArrayList<Unit>(2); 3281 affixMap2.put(p.iCopy(), alu); 3282 } 3283 if(alu.size()<maxAffixCopies){alu.add(u);} 3284 } 3285 if(storeSuffix && u.prefix2!=u.suffix2){ 3286 p.set(u.suffix2); 3287 ArrayList<Unit> alu=affixMap2.get(p); 3288 if(alu==null){ 3289 alu=new ArrayList<Unit>(2); 3290 affixMap2.put(p.iCopy(), alu); 3291 } 3292 if(alu.size()<maxAffixCopies){alu.add(u);} 3293 } 3294 } 3295 } 3296 } 3297 } 3298 } 3299 3300 addedList.clear(); 3301 basesStoredT=0; 3302 return collisionReads+novelReads; 3303 } 3304 getTid()3305 private int getTid(){ 3306 synchronized(HashThread.class){ 3307 int x=tcount; 3308 tcount++; 3309 return x; 3310 } 3311 } 3312 3313 private LinkedHashMap<Long, ArrayList<Unit>> codeMapT=new LinkedHashMap<Long, ArrayList<Unit>>(threadMaxReadsToBuffer*8); 3314 private ArrayList<Unit> addedList=new ArrayList<Unit>(threadMaxReadsToBuffer); 3315 private ArrayList<ArrayList<Unit>> localConflictList=new ArrayList<ArrayList<Unit>>(threadMaxReadsToBuffer); 3316 private ArrayList<ArrayList<Unit>> sharedConflictList=new ArrayList<ArrayList<Unit>>(threadMaxReadsToBuffer); 3317 3318 long matchesT=0; 3319 long baseMatchesT=0; 3320 long baseContainmentsT=0; 3321 long collisionsT=0; 3322 long containmentsT=0; 3323 long containmentCollisionsT=0; 3324 long basesStoredT=0; 3325 long addedToMainT=0; 3326 long readsProcessedT=0; 3327 long basesProcessedT=0; 3328 long overlapsT=0; 3329 long baseOverlapsT=0; 3330 long overlapCollisionsT=0; 3331 3332 private final boolean addToCodeMapT; 3333 private final boolean addToAffixMapT; 3334 private final boolean findContainmentsT; 3335 private final boolean findOverlapsT; 3336 private final boolean findMatchesT; 3337 // private final boolean convertToUpperCaseT; 3338 private final int tid; 3339 private final ArrayDeque<ConcurrentReadInputStream> crisq; 3340 private final BandedAligner bandy; 3341 } 3342 equalsRC(byte[] a, byte[] b)3343 public static boolean equalsRC(byte[] a, byte[] b){ 3344 if(a==b){return true;} 3345 if(a==null || b==null){return false;} 3346 if(a.length!=b.length){return false;} 3347 3348 for(int i=0; i<a.length; i++){ 3349 final byte aa=a[i], bb=b[i]; 3350 if(aa!=bb){return false;} 3351 } 3352 return true; 3353 } 3354 pairedEqualsRC(Unit ua, Unit ub)3355 public static boolean pairedEqualsRC(Unit ua, Unit ub){ 3356 if(verbose){System.err.println("pairedEqualsRC("+ua.name()+", "+ub.name()+")");} 3357 if(verbose){System.err.println("ea");} 3358 boolean b=equalsRC(ua, ub); 3359 if(verbose){System.err.println("eb");} 3360 if(!b){return false;} 3361 if(verbose){System.err.println("ec");} 3362 3363 if(ua.r!=null && ub.r!=null){ 3364 if(verbose){System.err.println("ed");} 3365 assert((ua.r.mate==null)==(ub.r.mate==null)); 3366 if(verbose){System.err.println("ee");} 3367 if(ua.r.mate!=null && ub.r.mate!=null){ 3368 if(verbose){System.err.println("ef");} 3369 return ua.r.pairnum()==ub.r.pairnum() && Tools.compare(ua.r.mate.bases, ub.r.mate.bases)==0; 3370 } 3371 if(verbose){System.err.println("eg");} 3372 } 3373 if(verbose){System.err.println("eh");} 3374 return true; 3375 } 3376 equalsRC(Unit ua, Unit ub)3377 private static boolean equalsRC(Unit ua, Unit ub){ 3378 if(verbose){System.err.println("equalsRC("+ua.name()+", "+ub.name()+")");} 3379 return ua.code1==ub.code1 && ua.code2==ub.code2 && (ua.prefix1==ub.prefix1 && ua.suffix1==ub.suffix1) && compareRC(ua, ub)==0; 3380 } 3381 comparePairedRC(Unit ua, Unit ub)3382 public static int comparePairedRC(Unit ua, Unit ub){ 3383 if(verbose){System.err.println("comparePairedRC("+ua.name()+", "+ub.name()+")");} 3384 int x=compareRC(ua, ub); 3385 if(x!=0){return x;} 3386 3387 if(ua.r!=null && ub.r!=null && ua.r.mate!=null && ub.r.mate!=null){ 3388 if(ua.r.pairnum()!=ub.r.pairnum()){return ua.r.pairnum()-ub.r.pairnum();} 3389 return compareRC((Unit)ua.r.mate.obj, (Unit)ub.r.mate.obj); 3390 } 3391 return 0; 3392 } 3393 3394 //TODO 3395 //This is really for sorting by length. compareRC(Unit ua, Unit ub)3396 private static int compareRC(Unit ua, Unit ub){ 3397 if(verbose){System.err.println("compareRC("+ua.name()+", "+ub.name()+")");} 3398 if(ua==ub){return 0;} 3399 if(verbose){System.err.println("a");} 3400 if(verbose){System.err.println("a1");} 3401 if(ua.length()!=ub.length()){return ub.length()-ua.length();} 3402 if(verbose){System.err.println("a2");} 3403 3404 if(REQUIRE_MATCHING_NAMES){ 3405 if(ua.name()!=null && ub.name()!=null){ 3406 int x=ua.name().compareTo(ub.name()); 3407 if(x!=0){return x;} 3408 } 3409 } 3410 if(verbose){System.err.println("a3");} 3411 3412 if(ua.r==null || ub.r==null){ 3413 if(verbose){System.err.println("b");} 3414 if(verbose){System.err.println("b1");} 3415 if(verbose){System.err.println("c");} 3416 if(ua.prefix1!=ub.prefix1){return ua.prefix1>ub.prefix1 ? 1 : -1;} 3417 if(ua.suffix1!=ub.suffix1){return ua.suffix1>ub.suffix1 ? 1 : -1;} 3418 if(verbose){System.err.println("e");} 3419 if(ua.code1!=ub.code1){return ua.code1>ub.code1 ? 1 : -1;} 3420 if(ua.code2!=ub.code2){return ua.code2>ub.code2 ? 1 : -1;} 3421 3422 return ua.pairnum()-ub.pairnum(); 3423 } 3424 if(verbose){System.err.println("f");} 3425 final byte[] a=ua.r.bases, b=ub.r.bases; 3426 if(a==b){return 0;} 3427 if(a==null || b==null){return a==null ? -1 : 1;} 3428 if(verbose){System.err.println("g");} 3429 3430 if(verbose){System.err.println("h");} 3431 for(int i=0; i<a.length; i++){ 3432 final byte aa=a[i], bb=b[i]; 3433 if(aa!=bb){return aa-bb;} 3434 } 3435 3436 if(verbose){System.err.println("j");} 3437 return ua.pairnum()-ub.pairnum(); 3438 } 3439 hashTip(byte[] bases, boolean prefix, int k, int skipInitialBases)3440 private long hashTip(byte[] bases, boolean prefix, int k, int skipInitialBases){ 3441 if(bases==null || bases.length<k){return -1;} 3442 3443 final int shift=bitsPerBase*k; 3444 final int shift2=shift-bitsPerBase; 3445 final long mask=(shift>63 ? -1L : ~((-1L)<<shift)); 3446 long kmer=0; 3447 int len=0; 3448 3449 final int start=(prefix ? 0+skipInitialBases : bases.length-k-skipInitialBases); 3450 final int stop=start+k; 3451 3452 // if(verbose){ 3453 // System.err.println("\n"+new String(bases)); 3454 // System.err.println("prefix="+prefix+", start="+start+", stop="+stop); 3455 //// System.err.print(new String(bases)); 3456 // } 3457 for(int i=start; i<stop; i++){ 3458 byte b=bases[i]; 3459 // if(verbose){System.err.print((char)b);} 3460 long x=acidToNumber[b]; 3461 kmer=((kmer<<bitsPerBase)|x)&mask; 3462 len++; 3463 } 3464 if(verbose){System.err.println(new String(bases, start, k)+" = "+kmer);} 3465 assert(len==k) : len+","+k; 3466 return kmer; 3467 } 3468 calcMaxEdits(int maxEdits, float minIdentityMult, int len)3469 private static final int calcMaxEdits(int maxEdits, float minIdentityMult, int len){ 3470 return minIdentityMult==0 ? maxEdits : Tools.max(maxEdits, (int)Math.round(len*minIdentityMult)); 3471 } 3472 3473 3474 private class Overlap implements Comparable<Overlap>{ 3475 Overlap(Unit u1_, Unit u2_, int type_, int start1_, int start2_, int stop1_, int stop2_, int len_, int mismatches_, int edits_, BandedAligner bandy)3476 public Overlap(Unit u1_, Unit u2_, int type_, int start1_, int start2_, int stop1_, int stop2_, int len_, int mismatches_, int edits_, BandedAligner bandy){ 3477 assert(u1_!=u2_); 3478 if(verbose){System.err.println("\nCreating an overlap.");} 3479 u1=u1_; 3480 u2=u2_; 3481 type=type_; 3482 start1=start1_; 3483 start2=start2_; 3484 stop1=stop1_; 3485 stop2=stop2_; 3486 overlapLen=len_; 3487 mismatches=mismatches_; 3488 edits=edits_; 3489 3490 assert(Tools.absdif(Tools.absdif(start1, stop1), Tools.absdif(start2, stop2))<=maxEdits) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases) 3491 +"\n>1a\n"+new String(u1.r.bases, Tools.min(start1, stop1), Tools.max(start1, stop1)-Tools.min(start1, stop1)+1) 3492 +"\n>2a\n"+new String(u2.r.bases, Tools.min(start2, stop2), Tools.max(start2, stop2)-Tools.min(start2, stop2)+1); 3493 3494 assert(start1>=0 && start1<=u1.length()) : "type "+type+": "+start1+", "+stop1+", "+u1.length()+", "+start2+", "+stop2+", "+u2.length(); 3495 assert(stop1>=0 && stop1<=u1.length()) : "type "+type+": "+start1+", "+stop1+", "+u1.length()+", "+start2+", "+stop2+", "+u2.length(); 3496 assert(start2>=0 && start2<=u2.length()) : "type "+type+": "+start1+", "+stop1+", "+u1.length()+", "+start2+", "+stop2+", "+u2.length(); 3497 assert(stop2>=0 && stop2<=u2.length()) : "type "+type+": "+start1+", "+stop1+", "+u1.length()+", "+start2+", "+stop2+", "+u2.length(); 3498 3499 assert(type==FORWARD || type==REVERSE); 3500 3501 if(verbose){System.err.println(this);} 3502 3503 assert(Tools.absdif(Tools.absdif(start1, stop1), Tools.absdif(start1, stop1))<=maxEdits); 3504 3505 assert(test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases) 3506 +"\n>1a\n"+new String(u1.r.bases, Tools.min(start1, stop1), Tools.max(start1, stop1)-Tools.min(start1, stop1)+1) 3507 +"\n>2a\n"+new String(u2.r.bases, Tools.min(start2, stop2), Tools.max(start2, stop2)-Tools.min(start2, stop2)+1); 3508 if(verbose){System.err.println("Passed test 1.");} 3509 3510 // bandy.verbose=true; 3511 // test(bandy); 3512 // assert(false); 3513 3514 assert(u1!=u2); 3515 u1.firstInOverlap(u2); 3516 u2.firstInOverlap(u1); 3517 assert(u1.length()!=u2.length() || u1.code1!=u2.code1 || u1.code2!=u2.code2 || (u1.r!=null && u1.r.mate!=null)) : "Collision? \n"+this+"\n"+u1+"\n"+u2; 3518 assert(u1.firstInOverlap(u2)!=u2.firstInOverlap(u1)) : 3519 "\nu1.firstInOverlap(u2)="+u1.firstInOverlap(u2)+"\nu2.firstInOverlap(u1)="+u2.firstInOverlap(u1)+"\nu1="+u1+"\nu2="+u2; 3520 3521 if(!u1.firstInOverlap(u2)){ 3522 if(verbose){System.err.println("\nSwapping.");} 3523 swap(); 3524 if(verbose){System.err.println(this);} 3525 3526 if(EA && !customBandwidth && !test(bandy, edits+maxEdits)){ 3527 System.err.println("\n"+this); 3528 swap(); 3529 System.err.println("\n"+this); 3530 assert(test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases)+"\n"; 3531 System.err.println("Passed test 2a, "+bandy.lastEdits+" edits.\n"); 3532 swap(); 3533 System.err.println("\n"+this); 3534 assert(test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases) 3535 +"\n>1a\n"+new String(u1.r.bases, Tools.min(start1, stop1), Tools.max(start1, stop1)-Tools.min(start1, stop1)+1) 3536 +"\n>2a\n"+new String(u2.r.bases, Tools.min(start2, stop2), Tools.max(start2, stop2)-Tools.min(start2, stop2)+1); 3537 System.err.println("Passed test 2b, "+bandy.lastEdits+" edits.\n"); 3538 } 3539 3540 assert(customBandwidth || test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases) 3541 +"\n>1a\n"+new String(u1.r.bases, Tools.min(start1, stop1), Tools.max(start1, stop1)-Tools.min(start1, stop1)+1) 3542 +"\n>2a\n"+new String(u2.r.bases, Tools.min(start2, stop2), Tools.max(start2, stop2)-Tools.min(start2, stop2)+1); 3543 if(verbose){System.err.println("Passed test 2.");} 3544 } 3545 3546 if(type==REVERSE){ 3547 if(verbose){System.err.println("\nReversing.");} 3548 reverseDirection(); 3549 if(verbose){System.err.println(this);} 3550 3551 if(EA && !Shared.anomaly && !customBandwidth && bandy!=null && !test(bandy, edits+maxEdits)){ 3552 Shared.anomaly=true; 3553 BandedAligner.verbose=true; 3554 System.err.println("\n********** Failed test 3, "+bandy.lastEdits+" edits. **********\n"); 3555 reverseDirection(); 3556 System.err.println(this); 3557 assert(test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases)+"\n"; 3558 System.err.println("Passed test 3a, "+bandy.lastEdits+" edits.\n"); 3559 reverseDirection(); 3560 System.err.println(this); 3561 assert(test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases) 3562 +"\n>1a\n"+new String(u1.r.bases, Tools.min(start1, stop1), Tools.max(start1, stop1)-Tools.min(start1, stop1)+1) 3563 +"\n>2a\n"+new String(u2.r.bases, Tools.min(start2, stop2), Tools.max(start2, stop2)-Tools.min(start2, stop2)+1); 3564 System.err.println("Passed test 3b, "+bandy.lastEdits+" edits.\n"); 3565 BandedAligner.verbose=false; 3566 assert(false); 3567 } 3568 3569 assert(customBandwidth || test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases)+"\n"; 3570 if(verbose){System.err.println("Passed test 3.");} 3571 } 3572 //Now all overlaps should be FORWARD or FORWARDRC and u1 should be at least as big as u2 3573 assert(type==FORWARD); 3574 assert(u1.length()>=u2.length()); 3575 assert(u1.firstInOverlap(u2)); 3576 assert(!u2.firstInOverlap(u1)); 3577 if(verbose){System.err.println("Finished overlap initialization.");} 3578 } 3579 test(BandedAligner bandy, int editLimit)3580 public boolean test(BandedAligner bandy, int editLimit){ 3581 final int last1=u1.length()-1, last2=u2.length()-1; 3582 if(verbose){System.err.println("Testing "+OVERLAP_TYPE_NAMES[type]+", "+start1+", "+start2);} 3583 if(type==FORWARD){ 3584 assert(start1==0 || start2==0) : "start1="+start1+", stop1="+stop1+", last1="+last1+", start2="+start2+", stop2="+stop2+", last2="+last2; 3585 if(start2==0){ 3586 if(verbose){System.err.println("A");} 3587 return u1.overlapsForward(u2, start1, start2, bandy, false, editLimit);} 3588 else{ 3589 if(verbose){System.err.println("B");} 3590 return u2.overlapsForward(u1, start2, start1, bandy, false, editLimit);} 3591 } 3592 if(type==REVERSE){ 3593 assert(start1==last1 || start2==last2) : "start1="+start1+", stop1="+stop1+", last1="+last1+", start2="+start2+", stop2="+stop2+", last2="+last2; 3594 if(start2==last2){return u1.overlapsReverse(u2, start1, start2, bandy, false, editLimit);} 3595 else{return u2.overlapsReverse(u1, start2, start1, bandy, false, editLimit);} 3596 } 3597 throw new RuntimeException(); 3598 } 3599 3600 @Override equals(Object o)3601 public boolean equals(Object o){ 3602 return equals((Overlap)o); 3603 } 3604 equals(Overlap o)3605 public boolean equals(Overlap o){ 3606 if(this==o){return true;} 3607 assert(o!=null) : "*A*\n"+this+"\n"+o+"\n"+u1+"\n"+u2; 3608 assert(u1!=null && u2!=null) : "*B*\n"+this+"\n"+o+"\n"+u1+"\n"+u2; 3609 assert(u1!=o.u2 || u2!=o.u1) : "*C*\n"+this+"\n"+o+"\n"+u1.firstInOverlap(u2)+"\n"+o.u1.firstInOverlap(o.u2)+"\n"+u1+"\n"+u2; 3610 return (u1==o.u1 && u2==o.u2 && type==o.type && start1==o.start1 && start2==o.start2 && stop1==o.stop1 && stop2==o.stop2) 3611 ;//|| (u1==o.u2 && u2==o.u1 && type==reverseType(o.type) && start1==o.start2 && start2==o.start1); 3612 } 3613 3614 // public int compareTo(Overlap o){ 3615 // int a=compareTo2(o); 3616 // int b=o.compareTo2(this); 3617 // assert(a==-b) : "\n"+this+"\n"+o+"\na="+a+", b="+b+", equals="+this.equals(o) 3618 // +"\nu1.compareTo(o.u1)="+u1.compareTo(o.u1)+"\no.u1.compareTo(u1)="+o.u1.compareTo(u1) 3619 // +"\nu2.compareTo(o.u2)="+u2.compareTo(o.u2)+"\no.u2.compareTo(u2)="+o.u2.compareTo(u2); 3620 // return a; 3621 // } 3622 3623 @Override compareTo(Overlap o)3624 public int compareTo(Overlap o){ 3625 int score1=overlapLen-50*(mismatches+edits); 3626 int score2=o.overlapLen-50*(o.mismatches+o.edits); 3627 if(score1!=score2){return score2-score1;} 3628 if(overlapLen!=o.overlapLen){return o.overlapLen-overlapLen;} 3629 int x=u1.compareTo(o.u1); 3630 if(x!=0){return -x;} 3631 x=u2.compareTo(o.u2); 3632 if(x!=0){return -x;} 3633 if(type!=o.type){return type-o.type;} 3634 if((u1!=o.u1 || u2!=o.u2) && absorbMatch && !subsetMode){ 3635 boolean oldv=verbose; 3636 verbose=true; 3637 System.err.println(this); 3638 System.err.println(o); 3639 System.err.println("********"); 3640 System.err.println(u1); 3641 System.err.println(u2); 3642 System.err.println(o.u1); 3643 System.err.println(o.u2); 3644 System.err.println("********"); 3645 System.err.println(u1.equals(o.u1)); 3646 System.err.println("********"); 3647 System.err.println(u2.equals(o.u2)); 3648 System.err.println("********"); 3649 System.err.println(u1.compareTo(o.u1)); 3650 System.err.println("********"); 3651 System.err.println(u2.compareTo(o.u2)); 3652 System.err.println("********"); 3653 verbose=oldv; 3654 } 3655 assert(!absorbMatch || (u1==o.u1 && u2==o.u2) || subsetMode) : "\n"+u1.r+"\n"+u2.r+"\n"+o.u1.r+"\n"+o.u2.r 3656 +"\n\n"+u1.r.mate+"\n"+u2.r.mate+"\n"+o.u1.r.mate+"\n"+o.u2.r.mate; 3657 // assert(false) : "\n"+this+"\n"+o+"\n>"+u1.name()+"\n"+new String(u1.bases())+"\n>"+u2.name()+"\n"+new String(u2.bases())+"\n"; 3658 if(start1!=o.start1){return start1-o.start1;} 3659 if(stop1!=o.stop1){return stop1-o.stop1;} 3660 if(start2!=o.start2){return start2-o.start2;} 3661 if(stop2!=o.stop2){return stop2-o.stop2;} 3662 if(this.equals(o)){ 3663 return 0; 3664 }else{ 3665 //TODO: ensure this assumption is valid. 3666 assert(!absorbContainment || !absorbMatch || subsetMode) : "\n"+this+"\n"+o+"\n>"+u1.name()+"\n"+new String(u1.bases())+"\n>"+u2.name()+"\n"+new String(u2.bases())+"\n"; 3667 3668 if(u1.unitID!=o.u1.unitID){return u1.unitID-o.u1.unitID;} 3669 if(u2.unitID!=o.u2.unitID){return u2.unitID-o.u2.unitID;} 3670 } 3671 return 0; 3672 } 3673 3674 @Override hashCode()3675 public int hashCode(){ 3676 return u1.hashCode()^u2.hashCode()^overlapLen; 3677 } 3678 swap()3679 public void swap(){ 3680 Unit tempu=u1; 3681 u1=u2; 3682 u2=tempu; 3683 int temp=start1; 3684 start1=start2; 3685 start2=temp; 3686 temp=stop1; 3687 stop1=stop2; 3688 stop2=temp; 3689 // if(type==FORWARDRC){type=REVERSERC;} 3690 // else if(type==REVERSERC){type=FORWARDRC;} 3691 //What about forward and reverse? 3692 } 3693 reverseDirection()3694 public void reverseDirection(){ 3695 type=reverseType(type); 3696 int temp=start1; 3697 start1=stop1; 3698 stop1=temp; 3699 temp=start2; 3700 start2=stop2; 3701 stop2=temp; 3702 } 3703 3704 @Override toString()3705 public String toString(){ 3706 StringBuilder sb=new StringBuilder(80); 3707 sb.append("type="); 3708 sb.append(OVERLAP_TYPE_NAMES[type]); 3709 sb.append(", len="); 3710 sb.append(overlapLen); 3711 sb.append(", subs="); 3712 sb.append(mismatches); 3713 sb.append(", edits="); 3714 sb.append(edits); 3715 3716 sb.append(" ("); 3717 sb.append(u1.name()==null ? u1.r.numericID+"" : u1.name()); 3718 if(printLengthInEdges){ 3719 sb.append(", length="); 3720 sb.append(u1.length()); 3721 } 3722 sb.append(", start1="); 3723 sb.append(start1); 3724 sb.append(", stop1="); 3725 sb.append(stop1); 3726 3727 sb.append(") ("); 3728 sb.append(u2.name()==null ? u2.r.numericID+"" : u2.name()); 3729 if(printLengthInEdges){ 3730 sb.append(", length="); 3731 sb.append(u2.length()); 3732 } 3733 sb.append(", start2="); 3734 sb.append(start2); 3735 sb.append(", stop2="); 3736 sb.append(stop2); 3737 sb.append(")"); 3738 return sb.toString(); 3739 } 3740 toLabel()3741 public String toLabel(){ 3742 StringBuilder sb=new StringBuilder(80); 3743 sb.append(OVERLAP_TYPE_ABBREVIATIONS[type]); 3744 sb.append(','); 3745 sb.append(overlapLen); 3746 sb.append(','); 3747 sb.append(mismatches); 3748 sb.append(','); 3749 sb.append(edits); 3750 3751 if(printLengthInEdges){ 3752 sb.append(','); 3753 sb.append(u1.length()); 3754 } 3755 sb.append(','); 3756 sb.append(start1); 3757 sb.append(','); 3758 sb.append(stop1); 3759 3760 if(printLengthInEdges){ 3761 sb.append(','); 3762 sb.append(u2.length()); 3763 } 3764 sb.append(','); 3765 sb.append(start2); 3766 sb.append(','); 3767 sb.append(stop2); 3768 3769 return sb.toString(); 3770 } 3771 setOffsetContradiction(boolean b)3772 private void setOffsetContradiction(boolean b){ 3773 assert(b!=offsetContradiction()) : b+", "+offsetContradiction(); 3774 if(b){flags|=OFFSET_CONTRADICTION_MASK;} 3775 else{flags&=~OFFSET_CONTRADICTION_MASK;} 3776 assert(b==offsetContradiction()) : b+", "+offsetContradiction(); 3777 } 3778 setMultiJoin(boolean b)3779 private void setMultiJoin(boolean b){ 3780 assert(b!=multiJoin()) : b+", "+multiJoin(); 3781 if(b){flags|=MULTIJOIN_MASK;} 3782 else{flags&=~MULTIJOIN_MASK;} 3783 assert(b==multiJoin()) : b+", "+multiJoin(); 3784 } 3785 setVisited(boolean b)3786 private void setVisited(boolean b){ 3787 assert(b!=visited()) : b+", "+visited(); 3788 if(b){flags|=VISITED_MASK;} 3789 else{flags&=~VISITED_MASK;} 3790 assert(b==visited()) : b+", "+visited(); 3791 } 3792 setCyclic(boolean b)3793 private void setCyclic(boolean b){ 3794 assert(b!=cyclic()) : b+", "+cyclic(); 3795 if(b){flags|=CYCLIC_MASK;} 3796 else{flags&=~CYCLIC_MASK;} 3797 assert(b==cyclic()) : b+", "+cyclic(); 3798 } 3799 setInvalid(boolean b)3800 private void setInvalid(boolean b){ 3801 assert(b!=invalid()) : b+", "+invalid(); 3802 assert(b!=mst()) : b+", "+mst()+", "+invalid(); 3803 if(b){flags|=INVALID_MASK;} 3804 else{flags&=~INVALID_MASK;} 3805 assert(b==invalid()) : b+", "+invalid(); 3806 } 3807 setMst(boolean b)3808 private void setMst(boolean b){ 3809 assert(b!=mst()) : b+", "+mst(); 3810 assert(b!=invalid()) : b+", "+mst()+", "+invalid(); 3811 if(b){flags|=MST_MASK;} 3812 else{flags&=~MST_MASK;} 3813 assert(b==mst()) : b+", "+mst(); 3814 } 3815 clearVolatileFlags()3816 public void clearVolatileFlags(){ 3817 flags=0; 3818 // flags=flags&~(MULTIJOIN_MASK|VISITED_MASK|CANON_CONTRADICTION_MASK|CYCLIC_MASK|OFFSET_CONTRADICTION_MASK|INVALID_MASK); 3819 // assert(!canonContradiction()); 3820 // assert(!offsetContradiction()); 3821 // assert(!multiJoin()); 3822 // assert(!visited()); 3823 // assert(!cyclic()); 3824 // assert(!invalid()); 3825 } 3826 offsetContradiction()3827 public boolean offsetContradiction(){return (OFFSET_CONTRADICTION_MASK&flags)==OFFSET_CONTRADICTION_MASK;} multiJoin()3828 public boolean multiJoin(){return (MULTIJOIN_MASK&flags)==MULTIJOIN_MASK;} visited()3829 public boolean visited(){return (VISITED_MASK&flags)==VISITED_MASK;} cyclic()3830 public boolean cyclic(){return (CYCLIC_MASK&flags)==CYCLIC_MASK;} invalid()3831 public boolean invalid(){return (INVALID_MASK&flags)==INVALID_MASK;} mst()3832 public boolean mst(){return (MST_MASK&flags)==MST_MASK;} contradiction()3833 public boolean contradiction(){return offsetContradiction();} 3834 3835 private static final long VISITED_MASK=(1L<<0); 3836 private static final long MULTIJOIN_MASK=(1L<<1); 3837 private static final long CYCLIC_MASK=(1L<<2); 3838 private static final long CANON_CONTRADICTION_MASK=(1L<<3); 3839 private static final long OFFSET_CONTRADICTION_MASK=(1L<<4); 3840 private static final long INVALID_MASK=(1L<<5); 3841 private static final long MST_MASK=(1L<<6); 3842 3843 Unit u1; 3844 Unit u2; 3845 int type; 3846 int start1; 3847 int start2; 3848 int stop1; 3849 int stop2; 3850 3851 long flags=0; 3852 3853 final int overlapLen; 3854 final int mismatches; 3855 final int edits; 3856 } 3857 3858 /** 3859 * @return 3860 */ determineCluster2(final int uid)3861 private int determineCluster2(final int uid) { 3862 assert(clusterNumbers!=null); 3863 boolean stable=false; 3864 int cluster=uid; 3865 while(!stable){ 3866 cluster=clusterNumbers.get(uid); 3867 if(cluster==0 || cluster==uid){return cluster;} 3868 assert(cluster<=uid); 3869 final int next=determineCluster2(cluster); 3870 if(next>=cluster){return cluster;} 3871 stable=clusterNumbers.compareAndSet(uid, cluster, next); 3872 } 3873 return cluster; 3874 } 3875 3876 mergeClusterIds(int cluster1, int cluster2)3877 private int mergeClusterIds(int cluster1, int cluster2) { 3878 assert(clusterNumbers!=null); 3879 3880 // System.err.println("Merging clusters "+cluster1+" and "+cluster2); 3881 3882 while(cluster1!=cluster2){ 3883 int min=Tools.min(cluster1, cluster2); 3884 if(cluster1!=min){ 3885 assert(cluster1>min); 3886 boolean b=clusterNumbers.compareAndSet(cluster1, cluster1, min); 3887 if(!b){ 3888 cluster1=determineCluster2(cluster1); 3889 min=Tools.min(cluster1, cluster2); 3890 } 3891 } 3892 if(cluster2!=min){ 3893 assert(cluster2>min); 3894 boolean b=clusterNumbers.compareAndSet(cluster2, cluster2, min); 3895 if(!b){ 3896 cluster2=determineCluster2(cluster2); 3897 min=Tools.min(cluster1, cluster2); 3898 } 3899 } 3900 } 3901 // System.err.println("Returning "+cluster1); 3902 return cluster1; 3903 } 3904 3905 private class Unit implements Comparable<Unit>, Serializable { 3906 3907 /** 3908 * 3909 */ 3910 private static final long serialVersionUID = 5232407003873807738L; 3911 Unit(Read r_)3912 public Unit(Read r_){ 3913 // this(r_, canonical_, canonical_ ? hash(r_.bases) : hashReversed(r_.bases)); 3914 this(r_, hash(r_.bases), hashReversed(r_.bases)); 3915 } 3916 Unit(Read r_, long codeF_, long codeR_)3917 public Unit(Read r_, long codeF_, long codeR_){ 3918 r=r_; 3919 code1=Tools.min(codeF_, codeR_); 3920 code2=Tools.max(codeF_, codeR_); 3921 long f=r.length(); 3922 prefix1=hashTip(r.bases, true, k, 0); 3923 suffix1=hashTip(r.bases, false, k, 0); 3924 if(r.length()>bitsPerBase*k){ 3925 prefix2=hashTip(r.bases, true, k, k); 3926 suffix2=hashTip(r.bases, false, k, k); 3927 } 3928 // System.err.println(r.id+", "+prefix1+", "+suffix1+", "+code1+", "+code2); 3929 if(r.pairnum()==1){f|=PAIRNUM_MASK;} 3930 flags=f; 3931 assert(length()==r.length()); 3932 assert(pairnum()==r.pairnum()); 3933 if(parseDepth){ 3934 int[] quad=KmerNormalize.parseDepth(r.id, null); 3935 if(quad!=null){depth=quad[r.pairnum()];} 3936 } 3937 } 3938 determineCluster()3939 int determineCluster() { 3940 return determineCluster2(unitID); 3941 } 3942 absorbMatch(Unit u)3943 public void absorbMatch(Unit u){ 3944 3945 assert(code1==u.code1 && code2==u.code2 && length()==u.length()); 3946 if(r==null || u.r==null){return;} 3947 u.r.setDiscarded(true); 3948 final byte[] bases1=r.bases, bases2=u.r.bases; 3949 final byte[] quals1=r.quality, quals2=u.r.quality; 3950 3951 assert((r.mate==null) == (u.r.mate==null)); 3952 3953 if(r.mate!=null && !u.r.mate.discarded()){ 3954 ((Unit)r.mate.obj).absorbMatch((Unit)u.r.mate.obj); 3955 } 3956 if(quals1==null || quals2==null){return;} 3957 3958 for(int i=0; i<bases1.length; i++){ 3959 byte b1=bases1[i], b2=bases2[i]; 3960 if(!isFullyDefined(b1) && isFullyDefined(b2)){bases1[i]=b2;} 3961 else{assert(b1==b2);} 3962 if(quals1!=null && quals2!=null){ 3963 quals1[i]=Tools.max(quals1[i], quals2[i]); 3964 } 3965 } 3966 } 3967 alreadyHas(Overlap o)3968 public boolean alreadyHas(Overlap o){ 3969 if(overlapList==null){return false;} 3970 for(int i=0; i<overlapList.size(); i++){ 3971 Overlap o2=overlapList.get(i); 3972 if(o.equals(o2)){ 3973 assert(overlapList.contains(o)); 3974 assert(o2.equals(o)); 3975 return true; 3976 } 3977 } 3978 assert(!overlapList.contains(o)); 3979 return false; 3980 } 3981 makeCluster()3982 public ArrayList<Unit> makeCluster() { 3983 assert(!visited()); 3984 assert(!clustered()); 3985 assert(valid()); 3986 // assert(set.isEmpty()); 3987 ArrayList<Unit> cluster=new ArrayList<Unit>(overlapList==null ? 1 : overlapList.size()+1); 3988 cluster.add(this); 3989 setClustered(true); 3990 3991 int added=1; 3992 for(int i=0; i<cluster.size(); i++){ 3993 Unit u=cluster.get(i); 3994 added+=u.visit(cluster); 3995 } 3996 3997 assert(added==cluster.size()); 3998 return cluster; 3999 } 4000 visit(ArrayList<Unit> cluster)4001 public int visit(ArrayList<Unit> cluster) { 4002 assert(!visited()); 4003 assert(clustered()); 4004 assert(valid()); 4005 // assert(cluster.contains(this)); 4006 setVisited(true); 4007 int added=0; 4008 4009 if(r!=null && r.mate!=null){ 4010 Unit u2=(Unit)r.mate.obj; 4011 assert(u2!=this); 4012 assert(u2.valid()); 4013 if(!u2.clustered()){ 4014 u2.setClustered(true); 4015 cluster.add(u2); 4016 added++; 4017 } 4018 } 4019 4020 if(overlapList!=null){ 4021 for(Overlap o : overlapList){ 4022 Unit u2=(o.u1==this ? o.u2 : o.u1); 4023 assert(o.u1==this || o.u2==this); 4024 assert(u2!=this); 4025 assert(u2.valid()); 4026 if(!u2.clustered()){ 4027 u2.setClustered(true); 4028 cluster.add(u2); 4029 added++; 4030 } 4031 } 4032 } 4033 return added; 4034 } 4035 isTransitive()4036 public boolean isTransitive(){ 4037 assert(valid()); 4038 if(overlapList==null || overlapList.size()==0){return true;} 4039 for(Overlap o : overlapList){ 4040 assert(o.u1==this || o.u2==this); 4041 Unit u2=(o.u1==this ? o.u2 : o.u1); 4042 assert(u2!=this); 4043 if(u2.overlapList==null){ 4044 return false; 4045 }else{ 4046 boolean found=false; 4047 for(Overlap o2 : u2.overlapList){ 4048 if(o2.u1==this || o2.u2==this){ 4049 found=true; break; 4050 } 4051 } 4052 if(!found){return false;} 4053 } 4054 } 4055 return true; 4056 } 4057 isPerfectlyTransitive()4058 public boolean isPerfectlyTransitive(){ 4059 assert(valid()); 4060 if(overlapList==null || overlapList.size()==0){return true;} 4061 for(Overlap o : overlapList){ 4062 assert(o.u1==this || o.u2==this); 4063 Unit u2=(o.u1==this ? o.u2 : o.u1); 4064 assert(u2!=this); 4065 if(u2.overlapList==null){ 4066 return false; 4067 }else{ 4068 boolean found=false; 4069 for(Overlap o2 : u2.overlapList){ 4070 if(o2==o){ 4071 found=true; break; 4072 } 4073 } 4074 if(!found){return false;} 4075 } 4076 } 4077 return true; 4078 } 4079 isNonRedundant()4080 public boolean isNonRedundant(){ 4081 assert(valid()); 4082 if(overlapList==null || overlapList.size()==0){return true;} 4083 for(int i=0; i<overlapList.size(); i++){ 4084 Overlap a=overlapList.get(i); 4085 for(int j=0; j<overlapList.size(); j++){ 4086 Overlap b=overlapList.get(j); 4087 if((i==j)!=(a.equals(b))){ 4088 return false; 4089 } 4090 } 4091 } 4092 return true; 4093 } 4094 contains(Unit u2, int loc, LongM key, BandedAligner bandy, int tableNum)4095 public boolean contains(Unit u2, int loc, LongM key, BandedAligner bandy, int tableNum) { 4096 if(verbose){System.err.println("contains: Considering key "+key+", unit "+u2);} 4097 if(u2.length()>this.length()){return false;} //Smaller things cannot contain larger things 4098 if(u2.length()==this.length()){//For equal size, only one can contain the other the other 4099 boolean pass=false; 4100 if(!pass && u2.unitID>this.unitID){return false;}else if(u2.unitID<this.unitID){pass=true;} 4101 if(!pass && u2.r.numericID>this.r.numericID){return false;}else if(u2.r.numericID<this.r.numericID){pass=true;} 4102 if(!pass && u2.code1>this.code1){return false;}else if(u2.code1<this.code1){pass=true;} 4103 if(!pass && u2.r.bases.hashCode()>this.r.bases.hashCode()){return false;}else if(u2.r.bases.hashCode()<this.r.bases.hashCode()){pass=true;} 4104 if(!pass){return false;} 4105 } 4106 if(minLengthPercent>0 && (u2.length()*100f/length())<minLengthPercent){return false;} 4107 if(verbose){System.err.println("passed tests for "+u2.length()+", "+this.length()+", "+u2.unitID+", "+this.unitID);} 4108 4109 assert(u2.code1!=code1 || u2.code2!=code2 || u2.length()!=length() || (r!=null && r.mate!=null) || //REQUIRE_MATCHING_NAMES || 4110 (u2.prefix1!=prefix1 && u2.suffix1!=suffix1)) : 4111 "Collision? \n"+this+"\n"+u2+"\n"+r+"\n"+u2.r; 4112 4113 if(tableNum==0){ 4114 if(key.value()==u2.prefix1){ 4115 if(verbose){System.err.println("Containment A1");} 4116 if(containsForward(u2, loc-k2, bandy, tableNum==0)){return true;} 4117 } 4118 if(key.value()==u2.suffix1){ 4119 if(verbose){System.err.println("Containment B1");} 4120 if(containsReverse(u2, loc, bandy, tableNum==0)){return true;} 4121 } 4122 }else{ 4123 if(key.value()==u2.prefix2){ 4124 if(verbose){System.err.println("Containment A2");} 4125 if(containsForward(u2, loc-k2-k, bandy, tableNum==0)){return true;} 4126 } 4127 if(key.value()==u2.suffix2){ 4128 if(verbose){System.err.println("Containment B2");} 4129 if(containsReverse(u2, loc+k, bandy, tableNum==0)){return true;} 4130 } 4131 } 4132 return false; 4133 } 4134 containsForward(Unit u2, int start, BandedAligner bandy, boolean earlyExit)4135 private boolean containsForward(Unit u2, int start, BandedAligner bandy, boolean earlyExit) { 4136 if(start+u2.length()>length() || start<0 || start>=length()){return false;} 4137 // if(true){return false;} 4138 if(u2.r!=null){ 4139 final byte[] a=bases(), b=u2.bases(); 4140 int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length); 4141 4142 for(int i=start, j=0; j<b.length; i++, j++){ 4143 byte aa=a[i]; 4144 byte bb=b[j]; 4145 if(aa!=bb){ 4146 if(exact || (isFullyDefined(aa) && isFullyDefined(bb))){ 4147 if(earlyExit && j<k2){return false;} 4148 if((mismatches=mismatches+1)>maxMismatches){ 4149 if(bandy==null || maxEdits<1){return false;} 4150 int edits=bandy.alignForward(b, a, 0, start, maxEdits, exact); 4151 assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits; 4152 return edits<=maxEdits && bandy.score()>4*edits; 4153 } 4154 } 4155 } 4156 } 4157 return true; 4158 }else{ 4159 assert(false) : "TODO: Verify by hashing and checking both tips"; 4160 return false; 4161 } 4162 } 4163 containsReverse(Unit u2, int start, BandedAligner bandy, boolean earlyExit)4164 private boolean containsReverse(Unit u2, int start, BandedAligner bandy, boolean earlyExit) { 4165 if(start+1<u2.length() || start<0 || start>=length()){return false;} 4166 // if(true){return false;} 4167 if(u2.r!=null){ 4168 final byte[] a=bases(), b=u2.bases(); 4169 int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length); 4170 4171 for(int i=start, j=b.length-1, iprefix=start-k2; j>=0; i--, j--){ 4172 byte aa=a[i]; 4173 byte bb=b[j]; 4174 if(aa!=bb){ 4175 if(exact || (isFullyDefined(aa) && isFullyDefined(bb))){ 4176 if(earlyExit && i>iprefix){return false;} 4177 if((mismatches=mismatches+1)>maxMismatches){ 4178 if(bandy==null || maxEdits<1){return false;} 4179 int edits=bandy.alignReverse(b, a, b.length-1, start, maxEdits, exact); 4180 assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits; 4181 return edits<=maxEdits && bandy.score()>4*edits; 4182 } 4183 } 4184 } 4185 } 4186 return true; 4187 }else{ 4188 assert(false) : "TODO: Verify by hashing and checking both tips"; 4189 return false; 4190 } 4191 } 4192 4193 depthCongruent(int aa, int bb)4194 public boolean depthCongruent(int aa, int bb){ 4195 if(aa<5 && bb<5){return true;} 4196 final int a=Tools.max(1, Tools.min(aa, bb)); 4197 final int b=Tools.max(aa, bb); 4198 return a*depthRatio>=b; 4199 } 4200 overlaps(Unit u2, int loc, LongM key, BandedAligner bandy, int tableNum, int editLimit)4201 public boolean overlaps(Unit u2, int loc, LongM key, BandedAligner bandy, int tableNum, int editLimit) { 4202 // return makeOverlap(u2, loc, key, bandy, earlyExit)!=null; 4203 4204 // assert(false) : "TODO"; 4205 if(verbose){System.err.println("overlaps: Considering key "+key+", unit "+u2);} 4206 if(parseDepth && !depthCongruent(depth, u2.depth)){return false;} 4207 if(minLengthPercent>0){ 4208 final int len1=length(), len2=u2.length(); 4209 if(Tools.min(len1, len2)*100f/Tools.max(len1, len2)<minLengthPercent){return false;} 4210 } 4211 assert(u2.code1!=code1 || u2.code2!=code2 || u2.length()!=length() || 4212 (u2.prefix1!=prefix1 && u2.suffix1!=suffix1)) : 4213 "Collision? \n"+this+"\n"+u2+"\n"+r+"\n"+u2.r; 4214 4215 4216 if(tableNum==0){ 4217 if(key.value()==u2.prefix1){ 4218 if(verbose){System.err.println("Testing overlaps A1");} 4219 if(overlapsForward(u2, loc-k2, 0, bandy, tableNum==0, editLimit)){ 4220 if(verbose){System.err.println("Found Overlap A1F");} 4221 return true; 4222 } 4223 if(verbose){System.err.println("No Overlap.");} 4224 } 4225 if(key.value()==u2.suffix1){ 4226 if(verbose){System.err.println("Testing overlaps B1");} 4227 if(verbose){System.err.println("Testing overlaps B1R");} 4228 if(overlapsReverse(u2, loc, u2.length()-1, bandy, tableNum==0, editLimit)){ 4229 if(verbose){System.err.println("Found Overlap B1R");} 4230 return true; 4231 } 4232 if(verbose){System.err.println("No Overlap.");} 4233 } 4234 }else{ 4235 if(key.value()==u2.prefix2){ 4236 if(verbose){System.err.println("Testing overlaps A2");} 4237 if(overlapsForward(u2, loc-k2-k, 0, bandy, tableNum==0, editLimit)){ 4238 if(verbose){System.err.println("Found Overlap A2F");} 4239 return true; 4240 } 4241 if(verbose){System.err.println("No Overlap.");} 4242 } 4243 if(key.value()==u2.suffix2){ 4244 if(verbose){System.err.println("Testing overlaps B2");} 4245 if(overlapsReverse(u2, loc+k, u2.length()-1, bandy, tableNum==0, editLimit)){ 4246 if(verbose){System.err.println("Found Overlap B2R");} 4247 return true; 4248 } 4249 if(verbose){System.err.println("No Overlap.");} 4250 } 4251 } 4252 4253 return false; 4254 } 4255 4256 /** 4257 * @param u2 4258 * @param loc 4259 * @param key 4260 * @return 4261 */ makeOverlap(Unit u2, int loc, LongM key, BandedAligner bandy, int tableNum)4262 protected Overlap makeOverlap(Unit u2, int loc, LongM key, BandedAligner bandy, int tableNum) { 4263 if(verbose){System.err.println("makeOverlap: Considering key "+key+", unit "+u2);} 4264 if(parseDepth && !depthCongruent(depth, u2.depth)){return null;} 4265 if(minLengthPercent>0){ 4266 final int len1=length(), len2=u2.length(); 4267 if(Tools.min(len1, len2)*100f/Tools.max(len1, len2)<minLengthPercent){return null;} 4268 } 4269 assert(u2.code1!=code1 || u2.code2!=code2 || u2.length()!=length() || (r!=null && r.mate!=null) || 4270 (u2.prefix1!=prefix1 && u2.suffix1!=suffix1)) : 4271 "Collision? \n"+this+"\n"+u2+"\n"+r+"\n"+u2.r; 4272 4273 4274 Overlap o=null; 4275 if(tableNum==0){ 4276 if(key.value()==u2.prefix1){ 4277 if(verbose){System.err.println("\nTesting makeOverlap A1F");} 4278 if((o=makeOverlapForward(u2, loc-k2, bandy, tableNum==0))!=null){ 4279 if(verbose){System.err.println("Made Overlap A1F");} 4280 return o; 4281 } 4282 if(verbose){System.err.println("No Overlap.");} 4283 } 4284 if(key.value()==u2.suffix1){ 4285 if(verbose){System.err.println("\nTesting makeOverlap B1R");} 4286 if((o=makeOverlapReverse(u2, loc, bandy, tableNum==0))!=null){ 4287 if(verbose){System.err.println("Made Overlap B1R");} 4288 return o; 4289 } 4290 if(verbose){System.err.println("No Overlap.");} 4291 } 4292 }else{ 4293 if(key.value()==u2.prefix2){ 4294 if(verbose){System.err.println("\nTesting makeOverlap A2F");} 4295 if((o=makeOverlapForward(u2, loc-k2-k, bandy, tableNum==0))!=null){ 4296 if(verbose){System.err.println("Made Overlap A2F");} 4297 return o; 4298 } 4299 if(verbose){System.err.println("No Overlap.");} 4300 } 4301 if(key.value()==u2.suffix2){ 4302 if(verbose){System.err.println("\nTesting makeOverlap B2R");} 4303 if((o=makeOverlapReverse(u2, loc+k, bandy, tableNum==0))!=null){ 4304 if(verbose){System.err.println("Made Overlap B2R");} 4305 return o; 4306 } 4307 if(verbose){System.err.println("No Overlap.");} 4308 } 4309 } 4310 return o; 4311 } 4312 overlapsForward(Unit u2, int start1, int start2, BandedAligner bandy, boolean earlyExit, int maxEdits)4313 private boolean overlapsForward(Unit u2, int start1, int start2, BandedAligner bandy, boolean earlyExit, int maxEdits) { 4314 if(verbose){System.err.println("overlapsForward(u1="+this.name()+", u2="+u2.name()+", start1="+start1+", start2="+start2+", earlyExit="+earlyExit+")");} 4315 4316 final int len1=length(), len2=u2.length(); 4317 if(start1<0){ 4318 start2-=start1; 4319 start1=0; 4320 if(verbose){System.err.println("Modified: start1="+start1+", start2="+start2);} 4321 } 4322 int overlapLength=Tools.min(len1-start1, len2-start2); 4323 int overlapLength2=Tools.max(len1-start1, len2-start2); 4324 int stop1=start1+overlapLength-1, stop2=start2+overlapLength-1; 4325 if(verbose){System.err.println("Calculated stop1="+stop1+", stop2="+stop2+", overlapLength="+overlapLength);} 4326 4327 if(!allowAllContainedOverlaps || overlapLength>Tools.min(len1, len2)){ 4328 if(verbose){ 4329 System.err.println("Side block. allowAllContainedOverlaps="+allowAllContainedOverlaps+", minOverlapCluster="+minOverlapCluster); 4330 System.err.println("start1="+start1+", stop1="+stop1+", len1="+len1+", start2="+start2+", stop2="+stop2+", len2="+len2+ 4331 ", overlapLen="+overlapLength+", maxEdits="+maxEdits); 4332 } 4333 if(overlapLength2<minOverlapCluster){return false;} 4334 if(minOverlapPercentCluster>0f && (overlapLength2*100f/Tools.min(len1, len2))<minOverlapPercentCluster){return false;} 4335 } 4336 4337 final byte[] a=bases(), b=u2.bases(); 4338 assert(a!=null && b!=null) : "Null bases for "+code1+" or "+u2.code1; 4339 int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, overlapLength); 4340 4341 if(verbose){ 4342 System.err.println("start1="+start1+", stop1="+stop1+", len1="+len1+", start2="+start2+", stop2="+stop2+", len2="+len2+ 4343 ", overlapLen="+overlapLength+", maxMismatches="+maxMismatches+", maxEdits="+maxEdits); 4344 } 4345 4346 for(int i=start1, j=start2; j<=stop2; i++, j++){ 4347 byte aa=a[i]; 4348 byte bb=b[j]; 4349 if(aa!=bb){ 4350 if(exact || (isFullyDefined(aa) && isFullyDefined(bb))){ 4351 if(earlyExit && j<k2){return false;} 4352 mismatches++; 4353 if(verbose){System.err.println("i="+i+", j="+j+", "+(char)aa+"!="+(char)bb+", mismatches="+mismatches+"/"+maxMismatches);} 4354 if(mismatches>maxMismatches){ 4355 if(bandy==null || maxEdits<1){return false;} 4356 if(verbose){System.err.println("Mismatches exceeded maximum, attempting banded alignment.");} 4357 int edits=bandy.alignForward(b, a, 0, start1, maxEdits, exact); 4358 assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits; 4359 stop2=bandy.lastQueryLoc; 4360 stop1=bandy.lastRefLoc; 4361 return edits<=maxEdits && bandy.score()>2*edits; //Set at 2*edits instead of 4*edits to prevent assertion errors when reversing alignment 4362 } 4363 } 4364 } 4365 } 4366 return true; 4367 } 4368 overlapsReverse(Unit u2, int start1, int start2, BandedAligner bandy, boolean earlyExit, int maxEdits)4369 private boolean overlapsReverse(Unit u2, int start1, int start2, BandedAligner bandy, boolean earlyExit, int maxEdits) { 4370 if(verbose){System.err.println("overlapsReverse(u1="+this.name()+", u2="+u2.name()+", start1="+start1+", start2="+start2+", earlyExit="+earlyExit+")");} 4371 4372 final int len1=length(), len2=u2.length(); 4373 if(start1>=len1){ 4374 start2-=(start1-len1+1); 4375 start1=len1-1; 4376 if(verbose){System.err.println("Modified: start1="+start1+", start2="+start2);} 4377 } 4378 final int overlapLength=Tools.min(start1+1, start2+1); 4379 final int overlapLength2=Tools.max(start1+1, start2+1); 4380 int stop1=start1-overlapLength+1, stop2=start2-overlapLength+1; 4381 if(verbose){System.err.println("Calculated stop1="+stop1+", stop2="+stop2+", overlapLength="+overlapLength);} 4382 4383 if(!allowAllContainedOverlaps || overlapLength>Tools.min(len1, len2)){ 4384 if(overlapLength2<minOverlapCluster){return false;} 4385 if(minOverlapPercentCluster>0f && (overlapLength2*100f/Tools.min(len1, len2))<minOverlapPercentCluster){return false;} 4386 } 4387 4388 final byte[] a=bases(), b=u2.bases(); 4389 assert(a!=null && b!=null) : "Null bases for "+code1+" or "+u2.code1; 4390 int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length); 4391 4392 if(verbose){ 4393 System.err.println("start1="+start1+", stop1="+stop1+", len1="+len1+", start2="+start2+", stop2="+stop2+", len2="+len2+ 4394 ", overlapLen="+overlapLength+", maxMismatches="+maxMismatches+", maxEdits="+maxEdits); 4395 } 4396 4397 for(int i=start1, j=start2, iprefix=start1-k2; i>=stop1; i--, j--){ 4398 byte aa=a[i]; 4399 byte bb=b[j]; 4400 if(aa!=bb){ 4401 if(exact || (isFullyDefined(aa) && isFullyDefined(bb))){ 4402 if(earlyExit && i>iprefix){return false;} 4403 mismatches++; 4404 if(verbose){System.err.println("i="+i+", j="+j+", "+(char)aa+"!="+(char)bb+", mismatches="+mismatches+"/"+maxMismatches);} 4405 if(mismatches>maxMismatches){ 4406 if(bandy==null || maxEdits<1){return false;} 4407 if(verbose){System.err.println("Mismatches exceeded maximum, attempting banded alignment.");} 4408 int edits=bandy.alignReverse(b, a, b.length-1, start1, maxEdits, exact); 4409 assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits; 4410 stop2=bandy.lastQueryLoc; 4411 stop1=bandy.lastRefLoc; 4412 return edits<=maxEdits && bandy.score()>2*edits; //Set at 2*edits instead of 4*edits to prevent assertion errors when reversing alignment 4413 } 4414 } 4415 } 4416 } 4417 return true; 4418 } 4419 makeOverlapForward(Unit u2, int start1, BandedAligner bandy, boolean earlyExit)4420 private Overlap makeOverlapForward(Unit u2, int start1, BandedAligner bandy, boolean earlyExit) { 4421 if(verbose){System.err.println("makeOverlapForward(u1="+this.name()+", u2="+u2.name()+", start="+start1+", earlyExit="+earlyExit+")");} 4422 final int len1=length(), len2=u2.length(); 4423 int start2=0; 4424 if(start1<0){ 4425 start2-=start1; 4426 start1=0; 4427 } 4428 final int overlapLength=Tools.min(len1-start1, len2-start2); 4429 final int overlapLength2=Tools.max(len1-start1, len2-start2); 4430 int stop1=start1+overlapLength-1, stop2=start2+overlapLength-1; 4431 if(verbose){System.err.println("Calculated stop1="+stop1+", stop2="+stop2+", overlapLength="+overlapLength);} 4432 4433 if(!allowAllContainedOverlaps || overlapLength>Tools.min(len1, len2)){ 4434 if(overlapLength<minOverlapCluster){return null;} 4435 if(minOverlapPercentCluster>0f && (overlapLength*100f/Tools.min(len1, len2))<minOverlapPercentCluster){return null;} 4436 } 4437 4438 final byte[] a=bases(), b=u2.bases(); 4439 assert(a!=null && b!=null) : "Null bases for "+code1+" or "+u2.code1; 4440 int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, overlapLength); 4441 4442 if(verbose){ 4443 System.err.println("start1="+start1+", stop1="+stop1+", len1="+len1+", start2="+start2+", stop2="+stop2+", len2="+len2+ 4444 ", overlapLen="+overlapLength+", maxMismatches="+maxMismatches+", maxEdits="+maxEdits); 4445 } 4446 4447 for(int i=start1, j=start2; j<=stop2; i++, j++){ 4448 byte aa=a[i]; 4449 byte bb=b[j]; 4450 if(aa!=bb){ 4451 if(exact || (isFullyDefined(aa) && isFullyDefined(bb))){ 4452 if(earlyExit && j<k2){return null;} 4453 mismatches++; 4454 if(verbose){System.err.println("i="+i+", j="+j+", "+(char)aa+"!="+(char)bb+", mismatches="+mismatches+"/"+maxMismatches);} 4455 if(mismatches>1 && bandy!=null){ 4456 if(maxEdits<1){return null;} 4457 if(verbose){System.err.println("mismatches exceeded 1, attempting banded alignment.");} 4458 int edits=bandy.alignForward(b, a, start2, start1, maxEdits, exact); 4459 if(edits>maxEdits || bandy.score()<=4*edits){ 4460 if(verbose){System.err.println((edits>maxEdits ? "Too many edits" : "Alignment score too low")+"; returning null.");} 4461 return null; 4462 } 4463 assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits; 4464 stop2=bandy.lastQueryLoc; 4465 stop1=bandy.lastRefLoc; 4466 // if(bandy.lastOffset>0){//Ref longer than query 4467 // for(int k=0; k<bandy.lastOffset; k++){ 4468 // if(stop1+1<=len1){stop1++;} 4469 // else{stop2--;}//I don't think this can happen 4470 // } 4471 // }else if(bandy.lastOffset<0){//Query longer than ref 4472 // for(int k=0; k>bandy.lastOffset; k--){ 4473 // if(stop2+1<=len2){stop2++;} 4474 // else{stop1--;} 4475 // } 4476 // } 4477 return new Overlap(this, u2, FORWARD, start1, start2, stop1, stop2, overlapLength, edits, edits, bandy); 4478 }else if(mismatches>maxMismatches){return null;} 4479 } 4480 } 4481 } 4482 return new Overlap(this, u2, FORWARD, start1, start2, stop1, stop2, overlapLength, mismatches, 0, bandy); 4483 } 4484 makeOverlapReverse(Unit u2, int start1, BandedAligner bandy, boolean earlyExit)4485 private Overlap makeOverlapReverse(Unit u2, int start1, BandedAligner bandy, boolean earlyExit) { 4486 if(verbose){System.err.println("makeOverlapReverse(u1="+this.name()+", u2="+u2.name()+", start="+start1+", earlyExit="+earlyExit+")");} 4487 4488 final int len1=length(), len2=u2.length(); 4489 int start2=len2-1; 4490 if(start1>=len1){ 4491 start2-=(start1-len1+1); 4492 start1=len1-1; 4493 } 4494 final int overlapLength=Tools.min(start1+1, start2+1); 4495 final int overlapLength2=Tools.max(start1+1, start2+1); 4496 int stop1=start1-overlapLength+1, stop2=start2-overlapLength+1; 4497 if(verbose){System.err.println("Calculated stop1="+stop1+", stop2="+stop2+", overlapLength="+overlapLength);} 4498 4499 if(!allowAllContainedOverlaps || overlapLength>Tools.min(len1, len2)){ 4500 if(overlapLength<minOverlapCluster){return null;} 4501 if(minOverlapPercentCluster>0f && (overlapLength*100f/Tools.min(len1, len2))<minOverlapPercentCluster){return null;} 4502 } 4503 4504 final byte[] a=bases(), b=u2.bases(); 4505 assert(a!=null && b!=null) : "Null bases for "+code1+" or "+u2.code1; 4506 int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length); 4507 4508 if(verbose){ 4509 System.err.println("start1="+start1+", stop1="+stop1+", len1="+len1+", start2="+start2+", stop2="+stop2+", len2="+len2+ 4510 ", overlapLen="+overlapLength+", maxMismatches="+maxMismatches+", maxEdits="+maxEdits); 4511 } 4512 4513 for(int i=start1, j=start2, iprefix=start1-k2; i>=stop1; i--, j--){ 4514 byte aa=a[i]; 4515 byte bb=b[j]; 4516 if(aa!=bb){ 4517 if(exact || (isFullyDefined(aa) && isFullyDefined(bb))){ 4518 if(earlyExit && i>iprefix){return null;} 4519 mismatches++; 4520 if(verbose){System.err.println("i="+i+", j="+j+", "+(char)aa+"!="+(char)bb+", mismatches="+mismatches+"/"+maxMismatches);} 4521 if(mismatches>1 && bandy!=null){ 4522 if(maxEdits<1){return null;} 4523 if(verbose){System.err.println("mismatches exceeded 1, attempting banded alignment.");} 4524 int edits=bandy.alignReverse(b, a, start2, start1, maxEdits, exact); 4525 if(edits>maxEdits || bandy.score()<=4*edits){ 4526 if(verbose){System.err.println((edits>maxEdits ? "Too many edits" : "Alignment score too low")+"; returning null.");} 4527 return null; 4528 } 4529 assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits; 4530 stop2=bandy.lastQueryLoc; 4531 stop1=bandy.lastRefLoc; 4532 // if(bandy.lastOffset>0){//Ref longer than query 4533 // for(int k=0; k<bandy.lastOffset; k++){ 4534 // if(stop1>0){stop1--;} 4535 // else{stop2++;}//I don't think this can happen 4536 // } 4537 // }else if(bandy.lastOffset<0){//Query longer than ref 4538 // for(int k=0; k>bandy.lastOffset; k--){ 4539 // if(stop2>0){stop2--;} 4540 // else{stop1++;} 4541 // } 4542 // } 4543 return new Overlap(this, u2, REVERSE, start1, start2, stop1, stop2, overlapLength, edits, edits, bandy); 4544 }else if(mismatches>maxMismatches){return null;} 4545 } 4546 } 4547 } 4548 return new Overlap(this, u2, REVERSE, start1, start2, stop1, stop2, overlapLength, mismatches, 0, bandy); 4549 } 4550 4551 @Override compareTo(Unit b)4552 public int compareTo(Unit b) { 4553 int x=comparePairedRC(this, b); 4554 return x; 4555 } 4556 4557 @Override equals(Object b)4558 public boolean equals(Object b){return equals((Unit)b);} equals(Unit b)4559 public boolean equals(Unit b){ 4560 boolean x=pairedEqualsRC(this, b); 4561 return x; 4562 } 4563 4564 @Override hashCode()4565 public int hashCode(){ 4566 return (int)((code1^(code1>>>32))&0xFFFFFFFFL); 4567 } 4568 setValid(boolean b)4569 private synchronized void setValid(boolean b){ 4570 assert(b!=valid()); 4571 // if(!b){System.err.println("Setting invalid "+name());} 4572 if(b){flags&=~INVALID_MASK;} 4573 else{flags|=INVALID_MASK;} 4574 assert(b==valid()); 4575 } 4576 setClustered(boolean b)4577 private synchronized void setClustered(boolean b){ 4578 assert(b!=clustered()); 4579 if(b){flags|=CLUSTER_MASK;} 4580 else{flags&=~CLUSTER_MASK;} 4581 assert(b==clustered()); 4582 } 4583 setVisited(boolean b)4584 private void setVisited(boolean b){ 4585 assert(b!=visited()); 4586 if(b){flags|=VISIT_MASK;} 4587 else{flags&=~VISIT_MASK;} 4588 assert(b==visited()); 4589 } 4590 setOffset(int x)4591 private synchronized void setOffset(int x){ 4592 offset=x; 4593 setOffsetValid(true); 4594 } 4595 setOffsetValid(boolean b)4596 private synchronized void setOffsetValid(boolean b){ 4597 assert(!offsetValid()); 4598 if(b){flags|=OFFSET_VALID_MASK;} 4599 else{flags&=~OFFSET_VALID_MASK;} 4600 assert(b==offsetValid()); 4601 } 4602 setOffsetContradiction(boolean b)4603 private synchronized void setOffsetContradiction(boolean b){ 4604 // assert(b!=offsetContradiction()); 4605 assert(offsetValid()); 4606 if(b){flags|=OFFSET_CONTRADICTION_MASK;} 4607 else{flags&=~OFFSET_CONTRADICTION_MASK;} 4608 assert(b==offsetContradiction()); 4609 } 4610 4611 /** Return true if 'this' should be the first Unit in the overlap object */ firstInOverlap(Unit u2)4612 public boolean firstInOverlap(Unit u2){ 4613 assert(this!=u2) : "\n"+this.r+"\n"+u2.r; 4614 if(u2.length()!=length()){return u2.length()<length();} 4615 if(u2.code1!=code1){return u2.code1<code1;} 4616 if(u2.code2!=code2){return u2.code2<code2;} 4617 int x=compareTo(u2); 4618 assert(x!=0 || (r!=null && r.mate!=null)); 4619 if(x!=0){return x>=0;} 4620 return r.numericID>=u2.r.numericID; 4621 } 4622 inSet()4623 public final boolean inSet(){ 4624 if(subsetCount<2){return true;} 4625 if(r.pairnum()>0){return ((Unit)r.mate.obj).inSet();} 4626 return ((code1&Long.MAX_VALUE)%subsetCount)==subset; 4627 } 4628 bases()4629 public byte[] bases(){return r==null ? null : r.bases;} 4630 name()4631 public String name(){return r!=null ? r.id : null /*code+""*/;} 4632 @Override toString()4633 public String toString(){return "("+name()+","+code1+","+code2+","+length()+","+prefix1+","+suffix1+","+"c"+",d="+depth+")";} 4634 4635 4636 public final Read r; 4637 public final long code1; 4638 public final long code2; 4639 public long prefix1=-1; 4640 public long suffix1=-1; 4641 public long prefix2=-1; 4642 public long suffix2=-1; 4643 /** Distance of leftmost side of this read relative to leftmost side of root. 4644 * Assumes everything is in 'forward' orientation. */ 4645 public int offset=-999999999; 4646 public int depth=1; 4647 // private boolean valid=true; 4648 4649 public int unitID; 4650 4651 public ArrayList<Overlap> overlapList; 4652 4653 private long flags; 4654 /** True if this contig should be output, false if not */ valid()4655 public final boolean valid(){return (INVALID_MASK&flags)==0;} 4656 /** Length of this contig */ length()4657 public final int length(){return (int)(LEN_MASK&flags);} 4658 /** Position of this contig relative to root */ offset()4659 public final int offset(){ 4660 assert(offsetValid()); 4661 return offset; 4662 } pairnum()4663 public int pairnum(){return (PAIRNUM_MASK&flags)==PAIRNUM_MASK ? 1 : 0;} 4664 clearVolatileFlags()4665 public void clearVolatileFlags(){ 4666 flags=flags&~(VISIT_MASK|OFFSET_VALID_MASK|OFFSET_CONTRADICTION_MASK); 4667 assert(!visited()); 4668 assert(!offsetValid()); 4669 assert(!offsetContradiction()); 4670 } 4671 visited()4672 public boolean visited(){return (VISIT_MASK&flags)==VISIT_MASK;} clustered()4673 public boolean clustered(){return (CLUSTER_MASK&flags)==CLUSTER_MASK;} offsetValid()4674 public boolean offsetValid(){return (OFFSET_VALID_MASK&flags)==OFFSET_VALID_MASK;} offsetContradiction()4675 public boolean offsetContradiction(){return (OFFSET_CONTRADICTION_MASK&flags)==OFFSET_CONTRADICTION_MASK;} contradiction()4676 public boolean contradiction(){return offsetContradiction();} 4677 4678 private static final long LEN_MASK=0x7FFFFFFFL; 4679 // private static final long CANON_MASK=(1L<<33); 4680 private static final long INVALID_MASK=(1L<<34); 4681 private static final long VISIT_MASK=(1L<<35); 4682 private static final long CLUSTER_MASK=(1L<<36); 4683 // private static final long CANONICIZED_MASK=(1L<<37); 4684 // private static final long CANON_CONTRADICTION_MASK=(1L<<38); 4685 private static final long OFFSET_VALID_MASK=(1L<<39); 4686 private static final long OFFSET_CONTRADICTION_MASK=(1L<<40); 4687 private static final long PAIRNUM_MASK=(1L<<41); 4688 } 4689 4690 private static final class UnitOffsetComparator implements Comparator<Unit> { 4691 UnitOffsetComparator()4692 UnitOffsetComparator(){} 4693 4694 /* (non-Javadoc) 4695 * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object) 4696 */ 4697 @Override compare(Unit a, Unit b)4698 public int compare(Unit a, Unit b) { 4699 if(a.offsetValid() && b.offsetValid()){ 4700 int x=a.offset()-b.offset(); 4701 if(x!=0){return x;} 4702 }else{ 4703 if(a.offsetValid()){return -1;} 4704 if(b.offsetValid()){return 1;} 4705 } 4706 return a.compareTo(b); 4707 } 4708 4709 } 4710 4711 private static final class ClusterLengthComparator implements Comparator<ArrayList<Unit>> { 4712 ClusterLengthComparator()4713 ClusterLengthComparator(){} 4714 4715 /* (non-Javadoc) 4716 * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object) 4717 */ 4718 @Override compare(ArrayList<Unit> a, ArrayList<Unit> b)4719 public int compare(ArrayList<Unit> a, ArrayList<Unit> b) { 4720 if(a.size()!=b.size()){return b.size()-a.size();} 4721 if(a.isEmpty() && b.isEmpty()){return 0;} 4722 return a.get(0).compareTo(b.get(0)); 4723 } 4724 4725 } 4726 4727 /** Makes a nmer (e.g., tetramer) profile of a cluster */ makeNmerProfile(ArrayList<Unit> alu, long[] array_)4728 private final float[] makeNmerProfile(ArrayList<Unit> alu, long[] array_){ 4729 final int nbits=bitsPerBase*nmerLength; 4730 final long[] array=(array_==null ? new long[maxNmer+1] : array_); 4731 final int mask=~((-1)<<(nbits)); 4732 4733 long keysCounted=0; 4734 4735 for(Unit u : alu){ 4736 byte[] bases=u.r.bases; 4737 int len=0; 4738 int kmer=0; 4739 for(byte b : bases){ 4740 int x=AminoAcid.baseToNumber[b]; 4741 if(x<0){ 4742 len=0; 4743 kmer=0; 4744 }else{ 4745 kmer=((kmer<<bitsPerBase)|x)&mask; 4746 len++; 4747 if(len>=nmerLength){ 4748 keysCounted++; 4749 array[kmer]++; 4750 } 4751 } 4752 } 4753 } 4754 4755 if(keysCounted==0){keysCounted=1;} 4756 final float mult=1f/keysCounted; 4757 4758 float[] r=new float[array.length]; 4759 for(int i=0; i<array.length; i++){ 4760 r[i]=array[i]*mult; 4761 array[i]=0; 4762 } 4763 return r; 4764 } 4765 isFullyDefined(byte b)4766 private static boolean isFullyDefined(byte b){ 4767 return AminoAcid.acidToNumber[b]>=0; 4768 } 4769 4770 private ConcurrentReadInputStream crisa[]; 4771 4772 private final ByteStreamWriter dupeWriter; 4773 4774 private String[] in1=null; 4775 private String[] in2=null; 4776 private String out=null; 4777 private String clusterFilePattern=null; 4778 private String outbest=null; 4779 private String outdupe=null; 4780 private String outcsf=null; 4781 private String outgraph=null; 4782 private int maxNs=-1; 4783 private long maxReads=-1; 4784 public boolean errorState=false; 4785 boolean sort=false; 4786 // boolean ascending=true; 4787 boolean absorbContainment=true; 4788 boolean absorbMatch=true; 4789 boolean findOverlaps=false; 4790 boolean makeClusters=false; 4791 boolean processClusters=false; 4792 boolean renameClusters=false; 4793 boolean absorbOverlap=false; 4794 boolean storeSuffix=false; 4795 boolean storeName=true; 4796 boolean storeQuality=true; 4797 boolean exact=true; 4798 boolean uniqueNames=true; 4799 boolean maxSpanningTree=false; 4800 4801 boolean removeCycles=true; 4802 boolean fixMultiJoins=true; 4803 boolean fixOffsetContradictions=false; 4804 boolean countTransitive=false; 4805 boolean countRedundant=false; 4806 4807 private boolean multipleInputFiles=false; 4808 private boolean rigorousTransitive=false; 4809 private int numAffixMaps=1; 4810 private int maxAffixCopies=2000000000; 4811 private int maxEdges=2000000000; 4812 private int maxEdges2=2000000000; 4813 private boolean allowAllContainedOverlaps=false; 4814 // private boolean toUpperCase=false; 4815 4816 /** Trim left bases of the read to this position (exclusive, 0-based) */ 4817 private int forceTrimLeft=-1; 4818 /** Trim right bases of the read after this position (exclusive, 0-based) */ 4819 private int forceTrimRight=-1; 4820 4821 private boolean qTrimLeft=false; 4822 private boolean qTrimRight=false; 4823 private float trimQ=6; 4824 /** Error rate for trimming (derived from trimq) */ 4825 private final float trimE; 4826 4827 int maxEdits=0; 4828 int maxSubs=0; 4829 int bandwidth=9; 4830 final boolean customBandwidth; 4831 float minIdentity=100; 4832 float minIdentityMult=0; 4833 float minLengthPercent=0; 4834 int minOverlapCluster=100; 4835 int minOverlapMerge=1; 4836 float minOverlapPercentCluster=0; 4837 float minOverlapPercentMerge=0; 4838 4839 private int minClusterSize=1; 4840 private int minClusterSizeForStats=1; 4841 private boolean pickBestRepresentativePerCluster=false; 4842 4843 long readsProcessed=0; 4844 long basesProcessed=0; 4845 long collisions=0; 4846 long containments=0; 4847 long baseContainments=0; 4848 long containmentCollisions=0; 4849 long matches=0; 4850 long baseMatches=0; 4851 long overlaps=0; 4852 long baseOverlaps=0; 4853 long overlapCollisions=0; 4854 long addedToMain=0; 4855 4856 private final int subset; 4857 private final int subsetCount; 4858 private final boolean subsetMode; 4859 4860 private final int k; 4861 private final int k2; 4862 private final boolean EA=Shared.EA(); 4863 4864 // static final boolean amino=true; 4865 static final int bitsPerBase=5; //amino 4866 // static final boolean ignoreReverseComplement=true; 4867 4868 /*--------------------------------------------------------------*/ 4869 /*---------------- Static Fields ----------------*/ 4870 /*--------------------------------------------------------------*/ 4871 4872 private static ReadComparator comparator=ReadLengthComparator.comparator; 4873 4874 private static int tcount=0; 4875 4876 private LinkedHashMap<Long, ArrayList<Unit>> codeMap=new LinkedHashMap<Long, ArrayList<Unit>>(4000000); 4877 private HashMap<LongM, ArrayList<Unit>> affixMap1=null; 4878 private HashMap<LongM, ArrayList<Unit>> affixMap2=null; 4879 private HashMap<LongM, ArrayList<Unit>>[] affixMaps=null; 4880 private ArrayDeque<ArrayList<Unit>> clusterQueue=null; 4881 private ArrayList<ArrayList<Unit>> processedClusters=null; 4882 private AtomicIntegerArray clusterNumbers=null; 4883 4884 private static final UnitOffsetComparator UNIT_OFFSET_COMPARATOR=new UnitOffsetComparator(); 4885 private static final ClusterLengthComparator CLUSTER_LENGTH_COMPARATOR=new ClusterLengthComparator(); 4886 private static final long[][] hashcodes=makeCodes2(32, 1); 4887 private static final long[][] hashcodesR=makeCodes2(32, 2); 4888 4889 public static final byte[] acidToNumber=new byte[128]; 4890 4891 public static final int nmerLength=4; 4892 public static final int maxNmer=((int) Math.pow(20, nmerLength))-1; 4893 private static PrintStream outstream=System.err; 4894 /** Permission to overwrite existing files */ 4895 public static boolean overwrite=false; 4896 /** Permission to append to existing files */ 4897 public static boolean append=false; 4898 public static boolean showSpeed=true; 4899 public static boolean verbose=false; 4900 public static boolean preventTransitiveOverlaps=false; 4901 public static boolean ignoreAffix1=false; 4902 public static boolean parseDepth=false; 4903 public static boolean printLengthInEdges=false; 4904 public static float depthRatio=2; 4905 public static int MINSCAF=0; 4906 public static int THREADS=Shared.threads(); 4907 public static int threadMaxReadsToBuffer=4000; 4908 public static int threadMaxBasesToBuffer=32000000; 4909 public static boolean DISPLAY_PROGRESS=true; 4910 public static boolean UNIQUE_ONLY=false; 4911 public static boolean REQUIRE_MATCHING_NAMES=false; 4912 public static boolean NUMBER_GRAPH_NODES=true; 4913 public static boolean ADD_PAIRNUM_TO_NAME=true; 4914 public static boolean HASH_NS=false; 4915 reverseType(int type)4916 private static int reverseType(int type){return (type+2)%4;} 4917 public static final int FORWARD=0; 4918 public static final int REVERSE=2; 4919 public static final String[] OVERLAP_TYPE_NAMES=new String[] {"FORWARD", "FORWARDRC", "REVERSE", "REVERSERC"}; 4920 public static final String[] OVERLAP_TYPE_ABBREVIATIONS=new String[] {"F", "FRC", "R", "RRC"}; 4921 4922 static{//All others are 0 4923 for(int i=0; i<acidToNumber.length; i++){ 4924 byte b=AminoAcid.acidToNumber[i]; 4925 acidToNumber[i]=(b<0 ? 21 : b); 4926 } 4927 } 4928 4929 } 4930