1 package jgi; 2 3 import java.io.File; 4 import java.io.PrintStream; 5 import java.io.Serializable; 6 import java.util.ArrayDeque; 7 import java.util.ArrayList; 8 import java.util.Arrays; 9 import java.util.Comparator; 10 import java.util.HashMap; 11 import java.util.HashSet; 12 import java.util.LinkedHashMap; 13 import java.util.Locale; 14 import java.util.PriorityQueue; 15 import java.util.Random; 16 import java.util.concurrent.atomic.AtomicIntegerArray; 17 18 import align2.BandedAligner; 19 import dna.AminoAcid; 20 import fileIO.ByteFile1; 21 import fileIO.ByteFile2; 22 import fileIO.ByteStreamWriter; 23 import fileIO.FileFormat; 24 import fileIO.ReadWrite; 25 import fileIO.TextStreamWriter; 26 import shared.Parse; 27 import shared.Parser; 28 import shared.PreParser; 29 import shared.ReadStats; 30 import shared.Shared; 31 import shared.Timer; 32 import shared.Tools; 33 import shared.TrimRead; 34 import sort.ReadComparator; 35 import sort.ReadComparatorID; 36 import sort.ReadComparatorName; 37 import sort.ReadLengthComparator; 38 import sort.ReadQualityComparator; 39 import stream.ConcurrentCollectionReadInputStream; 40 import stream.ConcurrentGenericReadInputStream; 41 import stream.ConcurrentReadInputStream; 42 import stream.ConcurrentReadOutputStream; 43 import stream.FASTQ; 44 import stream.FastaReadInputStream; 45 import stream.FastqReadInputStream; 46 import stream.Read; 47 import structures.ListNum; 48 import structures.LongM; 49 50 /** 51 * @author Brian Bushnell 52 * @date Jul 18, 2013 53 * 54 */ 55 public final class Dedupe2 { 56 main(String[] args)57 public static void main(String[] args){ 58 int rbl0=Shared.bufferLen(); 59 Shared.setBufferLen(16); 60 61 Dedupe2 dd=new Dedupe2(args); 62 dd.process(); 63 64 Shared.setBufferLen(rbl0); 65 66 //Close the print stream if it was redirected 67 Shared.closeStream(outstream); 68 } 69 Dedupe2(String[] args)70 public Dedupe2(String[] args){ 71 72 {//Preparse block for help, config files, and outstream 73 PreParser pp=new PreParser(args, getClass(), true); 74 args=pp.args; 75 outstream=pp.outstream; 76 } 77 78 ReadWrite.ZIPLEVEL=2; 79 ReadWrite.USE_UNPIGZ=ReadWrite.USE_PIGZ=true; 80 ReadWrite.MAX_ZIP_THREADS=Shared.threads(); 81 Read.TO_UPPER_CASE=true; 82 83 boolean setMcsfs=false; 84 int bandwidth_=-1; 85 int k_=31; 86 int subset_=0, subsetCount_=1; 87 boolean ascending=false; 88 Parser parser=new Parser(); 89 90 for(int i=0; i<args.length; i++){ 91 92 final String arg=args[i]; 93 String[] split=arg.split("="); 94 String a=split[0].toLowerCase(); 95 String b=split.length>1 ? split[1] : null; 96 97 if(Parser.parseZip(arg, a, b)){ 98 //do nothing 99 }else if(Parser.parseCommonStatic(arg, a, b)){ 100 //do nothing 101 }else if(Parser.parseQuality(arg, a, b)){ 102 //do nothing 103 }else if(Parser.parseFasta(arg, a, b)){ 104 //do nothing 105 }else if(parser.parseQTrim(arg, a, b)){ 106 //do nothing 107 }else if(parser.parseInterleaved(arg, a, b)){ 108 //do nothing 109 }else if(a.equals("in") || a.equals("in1")){ 110 assert(b!=null) : "Bad parameter: "+arg; 111 if(b.indexOf(',')>=0 && !new File(b).exists()){ 112 in1=b.split(","); 113 }else{ 114 in1=new String[] {b}; 115 } 116 }else if(a.equals("in2")){ 117 assert(b!=null) : "Bad parameter: "+arg; 118 if(b.indexOf(',')>=0 && !new File(b).exists()){ 119 in2=b.split(","); 120 }else{ 121 in2=new String[] {b}; 122 } 123 }else if(a.equals("out")){ 124 out=b; 125 }else if(a.equals("clusterfilepattern") || a.equals("pattern")){ 126 clusterFilePattern=b; 127 assert(clusterFilePattern==null || clusterFilePattern.contains("%")) : "pattern must contain the % symbol."; 128 }else if(a.equals("outbest")){ 129 outbest=b; 130 }else if(a.equals("outd") || a.equals("outduplicate")){ 131 outdupe=b; 132 }else if(a.equals("csf") || a.equals("clusterstatsfile")){ 133 outcsf=b; 134 }else if(a.equals("dot") || a.equals("graph") || a.equals("outdot") || a.equals("outgraph")){ 135 outgraph=b; 136 }else if(a.equals("mcsfs") || a.equals("minclustersizeforstats")){ 137 minClusterSizeForStats=Integer.parseInt(b); 138 }else if(a.equals("mcs") || a.equals("minclustersize")){ 139 minClusterSize=Integer.parseInt(b); 140 if(!setMcsfs){ 141 minClusterSizeForStats=minClusterSize; 142 } 143 }else if(a.equals("append") || a.equals("app")){ 144 append=ReadStats.append=Parse.parseBoolean(b); 145 }else if(a.equals("overwrite") || a.equals("ow")){ 146 overwrite=Parse.parseBoolean(b); 147 }else if(a.equals("sort")){ 148 if(b==null){sort=true;} 149 else if(b.equalsIgnoreCase("a") || b.equalsIgnoreCase("ascending")){ 150 sort=true; 151 ascending=true; 152 }else if(b.equalsIgnoreCase("d") || b.equalsIgnoreCase("descending")){ 153 sort=true; 154 ascending=false; 155 }else if(b.equalsIgnoreCase("length")){ 156 sort=true; 157 comparator=ReadLengthComparator.comparator; 158 }else if(b.equalsIgnoreCase("quality")){ 159 sort=true; 160 comparator=ReadQualityComparator.comparator; 161 }else if(b.equalsIgnoreCase("name")){ 162 sort=true; 163 comparator=ReadComparatorName.comparator; 164 }else if(b.equalsIgnoreCase("id")){ 165 sort=true; 166 comparator=ReadComparatorID.comparator; 167 }else{ 168 sort=Parse.parseBoolean(b); 169 } 170 }else if(a.equals("ascending")){ 171 ascending=Parse.parseBoolean(b); 172 }else if(a.equals("ordered")){ 173 boolean x=Parse.parseBoolean(b); 174 if(x){ 175 ascending=true; 176 sort=true; 177 comparator=ReadComparatorID.comparator; 178 }else{ 179 //do nothing 180 } 181 }else if(a.equals("arc") || a.equals("absorbrc") || a.equals("trc") || a.equals("testrc")){ 182 ignoreReverseComplement=!Parse.parseBoolean(b); 183 }else if(a.equals("ac") || a.equals("absorbcontainment") || a.equals("absorbcontainments") || a.equals("tc") || a.equals("testcontainment") || a.equals("containment")){ 184 absorbContainment=Parse.parseBoolean(b); 185 }else if(a.equals("am") || a.equals("absorbmatch") || a.equals("absorbmatches") || a.equals("tm") || a.equals("testmatch")){ 186 absorbMatch=Parse.parseBoolean(b); 187 }else if(a.equals("ao") || a.equals("absorboverlap") || a.equals("absorboverlaps") || a.equals("to") || a.equals("testoverlap")){ 188 absorbOverlap=Parse.parseBoolean(b); 189 }else if(a.equals("fo") || a.equals("findoverlap") || a.equals("findoverlaps")){ 190 findOverlaps=Parse.parseBoolean(b); 191 }else if(a.equals("c") || a.equals("cluster") || a.equals("clusters")){ 192 makeClusters=Parse.parseBoolean(b); 193 }else if(a.equals("fmj") || a.equals("fixmultijoin") || a.equals("fixmultijoins")){ 194 fixMultiJoins=Parse.parseBoolean(b); 195 }else if(a.equals("fcc") || a.equals("fixcanoncontradiction") || a.equals("fixcanoncontradictions")){ 196 fixCanonContradictions=Parse.parseBoolean(b); 197 }else if(a.equals("foc") || a.equals("fixoffsetcontradiction") || a.equals("fixoffsetcontradictions")){ 198 fixOffsetContradictions=Parse.parseBoolean(b); 199 }else if(a.equals("pto") || a.equals("preventtransitiveoverlap") || a.equals("preventtransitiveoverlaps")){ 200 preventTransitiveOverlaps=Parse.parseBoolean(b); 201 }else if(a.equals("pbr") || a.equals("pickbestrepresentative")){ 202 pickBestRepresentativePerCluster=Parse.parseBoolean(b); 203 }else if(a.equals("mst") || a.equals("maxspanningtree")){ 204 maxSpanningTree=Parse.parseBoolean(b); 205 }else if(a.equals("cc") || a.equals("canonicizecluster") || a.equals("canonicizeclusters")){ 206 canonicizeClusters=Parse.parseBoolean(b); 207 }else if(a.equals("pc") || a.equals("processcluster") || a.equals("processclusters")){ 208 processClusters=Parse.parseBoolean(b); 209 }else if(a.equals("rnc") || a.equals("renamecluster") || a.equals("renameclusters")){ 210 renameClusters=Parse.parseBoolean(b); 211 if(renameClusters){storeName=false;} 212 }else if(a.equals("rc") || a.equals("removecycles") || a.equals("removecycle")){ 213 removeCycles=Parse.parseBoolean(b); 214 }else if(a.equals("uo") || a.equals("uniqueonly")){ 215 UNIQUE_ONLY=Parse.parseBoolean(b); 216 }else if(a.equals("rmn") || a.equals("requirematchingnames")){ 217 REQUIRE_MATCHING_NAMES=Parse.parseBoolean(b); 218 }else if(a.equals("ngn") || a.equals("numbergraphnodes")){ 219 NUMBER_GRAPH_NODES=Parse.parseBoolean(b); 220 }else if(a.equals("addpairnum")){ 221 ADD_PAIRNUM_TO_NAME=Parse.parseBoolean(b); 222 }else if(a.equals("hashns")){ 223 HASH_NS=Parse.parseBoolean(b); 224 }else if(a.equals("pn") || a.equals("prefixname")){ 225 // PREFIX_NAME=Parse.parseBoolean(b); 226 }else if(a.equals("k")){ 227 k_=Integer.parseInt(b); 228 assert(k_>0 && k_<32) : "k must be between 1 and 31; default is 31, and lower values are slower."; 229 }else if(a.equals("minscaf") || a.equals("ms")){ 230 MINSCAF=FastaReadInputStream.MIN_READ_LEN=Integer.parseInt(b); 231 }else if(a.equals("mlp") || a.equals("minlengthpercent")){ 232 minLengthPercent=Float.parseFloat(b); 233 }else if(a.equals("mop") || a.equals("minoverlappercent")){ 234 minOverlapPercentCluster=minOverlapPercentMerge=Float.parseFloat(b); 235 }else if(a.equals("mopc") || a.equals("minoverlappercentcluster")){ 236 minOverlapPercentCluster=Float.parseFloat(b); 237 }else if(a.equals("mopm") || a.equals("minoverlappercentmerge")){ 238 minOverlapPercentMerge=Float.parseFloat(b); 239 }else if(a.equals("mo") || a.equals("minoverlap")){ 240 minOverlapCluster=minOverlapMerge=Integer.parseInt(b); 241 }else if(a.equals("moc") || a.equals("minoverlapcluster")){ 242 minOverlapCluster=Integer.parseInt(b); 243 }else if(a.equals("mom") || a.equals("minoverlapmerge")){ 244 minOverlapMerge=Integer.parseInt(b); 245 }else if(a.equals("rt") || a.equals("rigoroustransitive")){ 246 rigorousTransitive=Parse.parseBoolean(b); 247 }else if(a.equals("e") || a.equals("maxedits") || a.equals("edits") || a.equals("edist")){ 248 maxEdits=Integer.parseInt(b); 249 }else if(a.equals("s") || a.equals("maxsubs") || a.equals("maxsubstitutions") || a.equals("hdist")){ 250 maxSubs=Integer.parseInt(b); 251 }else if(a.equals("bw") || a.equals("bandwidth")){ 252 bandwidth_=Integer.parseInt(b); 253 }else if(a.equals("mid") || a.equals("minidentity")){ 254 minIdentity=Float.parseFloat(b); 255 minIdentityMult=(minIdentity==100f ? 0 : (100f-minIdentity)/100f); 256 }else if(a.equals("threads") || a.equals("t")){ 257 THREADS=(b==null || b.equalsIgnoreCase("auto") ? Shared.threads() : Integer.parseInt(b)); 258 }else if(a.equals("showspeed") || a.equals("ss")){ 259 showSpeed=Parse.parseBoolean(b); 260 }else if(a.equals("verbose")){ 261 verbose=Parse.parseBoolean(b); 262 // BandedAligner.verbose=verbose; 263 }else if(a.equals("contigbreak") || (arg.contains("=") && (a.equals("n") || a.equals("-n")))){ 264 maxNs=Integer.parseInt(b); 265 }else if(a.equals("reads") || a.startsWith("maxreads")){ 266 maxReads=Parse.parseKMG(b); 267 }else if(a.equals("sn") || a.equals("storename") || a.equals("storenames") || a.equals("keepnames")){ 268 storeName=Parse.parseBoolean(b); 269 }else if(a.equals("ssx") || a.equals("storesuffix") || a.equals("storesuffixes")){ 270 storeSuffix=Parse.parseBoolean(b); 271 }else if(a.equals("numaffixmaps") || a.equals("nam")){ 272 numAffixMaps=Integer.parseInt(b); 273 }else if(a.equals("mac") || a.equals("maxaffixcopies")){ 274 maxAffixCopies=Integer.parseInt(b); 275 }else if(a.equals("me") || a.equals("maxedges")){ 276 maxEdges=Integer.parseInt(b); 277 maxEdges2=maxEdges*2; 278 if(maxEdges2<1){maxEdges2=Integer.MAX_VALUE-1;} 279 }else if(a.equals("ignoreaffix1") || a.equals("ia1")){ 280 ignoreAffix1=Parse.parseBoolean(b); 281 }else if(a.equals("parsedepth") || a.equals("pd")){ 282 parseDepth=Parse.parseBoolean(b); 283 }else if(a.equals("printlengthinedges") || a.equals("ple")){ 284 printLengthInEdges=Parse.parseBoolean(b); 285 }else if(a.equals("depthmult") || a.equals("depthratio") || a.equals("dr")){ 286 depthRatio=Float.parseFloat(b); 287 if(depthRatio<=0){ 288 parseDepth=false; 289 }else{ 290 parseDepth=true; 291 assert(depthRatio>0); 292 if(depthRatio<1){depthRatio=1/depthRatio;} 293 } 294 }else if(a.equals("storequality") || a.equals("sq")){ 295 storeQuality=Parse.parseBoolean(b); 296 }else if(a.equals("exact") || a.equals("ex")){ 297 exact=Parse.parseBoolean(b); 298 }else if(a.equals("uniquenames") || a.equals("un")){ 299 uniqueNames=Parse.parseBoolean(b); 300 }else if(a.equals("ftl") || a.equals("forcetrimleft")){ 301 forceTrimLeft=Integer.parseInt(b); 302 }else if(a.equals("ftr") || a.equals("forcetrimright")){ 303 forceTrimRight=Integer.parseInt(b); 304 }else if(a.equals("subset") || a.equals("sst")){ 305 subset_=Integer.parseInt(b); 306 }else if(a.equals("subsets") || a.equals("subsetcount") || a.equals("sstc")){ 307 subsetCount_=Integer.parseInt(b); 308 }else if(i==0 && in1==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){ 309 String c=args[i]; 310 if(c.indexOf(',')>=0 && !new File(c).exists()){ 311 in1=c.split(","); 312 }else{ 313 in1=new String[] {c}; 314 } 315 }else if(i==1 && out==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){ 316 out=args[i]; 317 }else{ 318 throw new RuntimeException("Unknown parameter "+args[i]); 319 } 320 } 321 322 comparator.setAscending(ascending); 323 324 {//Process parser fields 325 Parser.processQuality(); 326 qTrimLeft=parser.qtrimLeft; 327 qTrimRight=parser.qtrimRight; 328 trimQ=parser.trimq; 329 trimE=parser.trimE(); 330 } 331 332 if(verbose){ 333 ReadWrite.verbose=ConcurrentGenericReadInputStream.verbose=ConcurrentReadOutputStream.verbose=ByteFile1.verbose=ByteFile2.verbose=FastqReadInputStream.verbose=true; 334 } 335 // verbose=false; 336 337 k=k_; 338 k2=k-1; 339 subset=subset_; 340 subsetCount=subsetCount_; 341 subsetMode=subsetCount>1; 342 assert(subset>=0 && subset<subsetCount) : "subset="+subset+", subsetCount="+subsetCount; 343 344 BandedAligner.penalizeOffCenter=true; 345 346 if(maxSpanningTree){removeCycles=fixMultiJoins=false;} 347 if(absorbOverlap){processClusters=true;} 348 if(processClusters || renameClusters || maxSpanningTree){makeClusters=true;} 349 if(makeClusters){findOverlaps=true;} 350 if(renameClusters){uniqueNames=/*storeName=*/false;} 351 352 if(bandwidth_>-1){ 353 bandwidth=Tools.min(bandwidth_, 2*maxEdits+1); 354 customBandwidth=(bandwidth<2*maxEdits+1); 355 }else{ 356 bandwidth=2*maxEdits+1; 357 customBandwidth=false; 358 } 359 maxSubs=Tools.max(maxSubs, maxEdits); 360 if(maxSubs>0 || minIdentity<100 || findOverlaps){storeSuffix=true;} 361 362 assert(FastaReadInputStream.settingsOK()); 363 364 if(in1==null){throw new RuntimeException("Error - at least one input file is required.");} 365 366 for(int i=0; i<in1.length; i++){ 367 if(in1[i].equalsIgnoreCase("stdin") && !new File(in1[i]).exists()){in1[i]="stdin.fa";} 368 } 369 370 // assert(false) : Arrays.toString(in); 371 372 // if(!setOut && clusterFilePattern==null){out="stdout.fa";} 373 // else 374 // if("stdout".equalsIgnoreCase(out) || "standarddout".equalsIgnoreCase(out)){ 375 // out="stdout.fa"; 376 // outstream=System.err; 377 // } 378 if(!Tools.canWrite(out, overwrite)){throw new RuntimeException("Output file "+out+" already exists, and overwrite="+overwrite);} 379 380 for(int i=0; i<in1.length; i++){ 381 assert(!in1[i].equalsIgnoreCase(out)); 382 } 383 // assert(false) : "\nabsorbContainment="+absorbContainment+", findOverlaps="+findOverlaps+", absorbOverlap="+absorbOverlap+"\n"+ 384 // "processClusters="+processClusters+", renameClusters="+renameClusters+", makeClusters="+makeClusters+", uniqueNames="+uniqueNames+", storeName="+storeName; 385 if(absorbContainment || findOverlaps){ 386 // assert(false); 387 affixMaps=new HashMap[numAffixMaps]; 388 for(int i=0; i<numAffixMaps; i++){ 389 affixMaps[i]=new HashMap<LongM, ArrayList<Unit>>(4000000); 390 } 391 } 392 // assert(false) : absorbContainment+", "+(affixMap==null); 393 394 if(outdupe==null){ 395 dupeWriter=null; 396 }else{ 397 FileFormat ff=FileFormat.testOutput(outdupe, FileFormat.FASTA, null, true, overwrite, append, false); 398 dupeWriter=new ByteStreamWriter(ff); 399 } 400 } 401 process()402 public void process(){ 403 404 Timer t=new Timer(); 405 406 boolean dq0=FASTQ.DETECT_QUALITY; 407 boolean ti0=FASTQ.TEST_INTERLEAVED; 408 409 process2(); 410 411 FASTQ.DETECT_QUALITY=dq0; 412 FASTQ.TEST_INTERLEAVED=ti0; 413 414 t.stop(); 415 416 if(showSpeed){ 417 outstream.println(Tools.timeReadsBasesProcessed(t, readsProcessed, basesProcessed, 8)); 418 } 419 420 if(errorState){ 421 throw new RuntimeException("Dedupe2 terminated in an error state; the output may be corrupt."); 422 } 423 } 424 process2()425 public void process2(){ 426 if(dupeWriter!=null){dupeWriter.start();} 427 // assert(false) : out; 428 Timer t=new Timer(); 429 430 if(DISPLAY_PROGRESS){ 431 outstream.println("Initial:"); 432 Shared.printMemory(); 433 outstream.println(); 434 } 435 436 processMatches(t); 437 438 forceTrimLeft=forceTrimRight=-1; 439 qTrimLeft=qTrimRight=false; 440 441 if(absorbContainment){ 442 processContainments(t); 443 } 444 445 if(dupeWriter!=null){dupeWriter.poisonAndWait();} 446 447 if(findOverlaps){ 448 findOverlaps(t); 449 450 killAffixMaps(); 451 452 if(processClusters || renameClusters || maxSpanningTree){codeMap=null;} 453 454 if(maxSpanningTree){ 455 processMst(t); 456 } 457 458 if(processClusters){ 459 processClusters(t, absorbOverlap); 460 } 461 // if(renameClusters){ 462 // renameClusters(t); 463 // } 464 // assert(false) : (codeMap==null)+", "+(affixMap1==null)+", "+processedClusters; 465 } 466 467 outstream.println("Input: \t"+readsProcessed+" reads \t\t"+basesProcessed+" bases."); 468 469 if(absorbMatch){ 470 outstream.println("Duplicates: \t"+matches+" reads ("+String.format(Locale.ROOT, "%.2f",matches*100.0/readsProcessed)+"%) \t"+ 471 baseMatches+" bases ("+String.format(Locale.ROOT, "%.2f",baseMatches*100.0/basesProcessed)+"%) \t"+collisions+" collisions."); 472 } 473 if(absorbContainment){ 474 outstream.println("Containments: \t"+containments+" reads ("+String.format(Locale.ROOT, "%.2f",containments*100.0/readsProcessed)+"%) \t"+ 475 baseContainments+" bases ("+String.format(Locale.ROOT, "%.2f",baseContainments*100.0/basesProcessed)+"%) \t"+containmentCollisions+" collisions."); 476 } 477 if(findOverlaps){ 478 outstream.println("Overlaps: \t"+overlaps+" reads ("+String.format(Locale.ROOT, "%.2f",overlaps*100.0/readsProcessed)+"%) \t"+ 479 baseOverlaps+" bases ("+String.format(Locale.ROOT, "%.2f",baseOverlaps*100.0/basesProcessed)+"%) \t"+overlapCollisions+" collisions."); 480 } 481 // outstream.println("Result: \t"+(addedToMain-containments)+" reads \t\t"+(basesProcessed-baseMatches-baseContainments)+" bases."); 482 483 long outReads=(addedToMain-containments); 484 if(UNIQUE_ONLY){outReads=readsProcessed-matches-containments;} 485 long outBases=(basesProcessed-baseMatches-baseContainments); 486 outstream.println("Result: \t"+outReads+" reads ("+String.format(Locale.ROOT, "%.2f",outReads*100.0/readsProcessed)+"%) \t"+ 487 outBases+" bases ("+String.format(Locale.ROOT, "%.2f",outBases*100.0/basesProcessed)+"%)"); 488 489 outstream.println(""); 490 491 if(out!=null || clusterFilePattern!=null || outbest!=null || outgraph!=null || outcsf!=null){ 492 writeOutput(outcsf, t); 493 } 494 495 } 496 killAffixMaps()497 private void killAffixMaps(){ 498 if(affixMaps==null){return;} 499 for(int i=0; i<numAffixMaps; i++){ 500 if(affixMaps[i]!=null){affixMaps[i].clear();} 501 affixMaps[i]=null; 502 } 503 affixMaps=null; 504 } 505 makeCrisArray(ArrayList<Read> list)506 private ConcurrentReadInputStream[] makeCrisArray(ArrayList<Read> list){ 507 final ConcurrentReadInputStream[] array; 508 509 if(list!=null){ 510 array=new ConcurrentReadInputStream[] {new ConcurrentCollectionReadInputStream(list, null, -1)}; 511 array[0].start(); //This deadlocks if ConcurrentReadInputStream extends Thread rather than spawning a new thread. 512 }else{ 513 array=new ConcurrentReadInputStream[in1.length]; 514 multipleInputFiles=array.length>1; 515 for(int i=0; i<in1.length; i++){ 516 if(verbose){System.err.println("Creating cris for "+in1[i]);} 517 518 final ConcurrentReadInputStream cris; 519 { 520 FileFormat ff1=FileFormat.testInput(in1[i], FileFormat.FASTA, null, !multipleInputFiles || ReadWrite.USE_UNPIGZ, true); 521 FileFormat ff2=(in2==null || in2.length<=i ? null : FileFormat.testInput(in2[i], FileFormat.FASTA, null, !multipleInputFiles || ReadWrite.USE_UNPIGZ, true)); 522 cris=ConcurrentReadInputStream.getReadInputStream(maxReads, ff1.samOrBam(), ff1, ff2); 523 cris.start(); 524 if(cris.paired()){ 525 THREADS=1;//Temp fix for losing reads when multithreaded and paired 526 if(absorbContainment){ 527 System.err.println("Set absorbContainment to false because it is not currently supported for paired reads."); 528 absorbContainment=false; 529 } 530 } 531 } 532 array[i]=cris; 533 } 534 } 535 return array; 536 } 537 processMatches(Timer t)538 private void processMatches(Timer t){ 539 crisa=makeCrisArray(null); 540 541 ArrayList<HashThread> alht=new ArrayList<HashThread>(THREADS); 542 for(int i=0; i<THREADS; i++){alht.add(new HashThread(true, (absorbContainment|findOverlaps), absorbMatch, false, false));} 543 for(HashThread ht : alht){ht.start();} 544 for(HashThread ht : alht){ 545 while(ht.getState()!=Thread.State.TERMINATED){ 546 try { 547 ht.join(); 548 } catch (InterruptedException e) { 549 // TODO Auto-generated catch block 550 e.printStackTrace(); 551 } 552 } 553 matches+=ht.matchesT; 554 collisions+=ht.collisionsT; 555 containments+=ht.containmentsT; 556 containmentCollisions+=ht.containmentCollisionsT; 557 baseContainments+=ht.baseContainmentsT; 558 baseMatches+=ht.baseMatchesT; 559 addedToMain+=ht.addedToMainT; 560 readsProcessed+=ht.readsProcessedT; 561 basesProcessed+=ht.basesProcessedT; 562 } 563 alht.clear(); 564 565 if(verbose){System.err.println("Attempting to close input streams (1).");} 566 for(ConcurrentReadInputStream cris : crisa){ 567 errorState|=ReadWrite.closeStream(cris); 568 } 569 crisa=null; 570 571 if(DISPLAY_PROGRESS){ 572 t.stop(); 573 outstream.println("Found "+matches+" duplicates."); 574 outstream.println("Finished exact matches. Time: "+t); 575 Shared.printMemory(); 576 if(verbose){outstream.println(affixMaps[0]);} 577 outstream.println(); 578 t.start(); 579 } 580 } 581 processContainments(Timer t)582 private void processContainments(Timer t){ 583 ArrayList<Read> list=new ArrayList<Read>((int)addedToMain); 584 for(ArrayList<Unit> alu : codeMap.values()){ 585 for(Unit u : alu){ 586 assert(u.r.mate==null) : "Containments are not currently supported with paired reads."; 587 if(u.valid() && u.r.pairnum()==0){list.add(u.r);} 588 } 589 } 590 591 // if(minLengthPercent>0){ 592 // if(verbose){System.err.println("Sorting.");} 593 // Shared.sort(list, ReadLengthComparator.comparator); 594 // Collections.reverse(list); 595 // assert(list.isEmpty() || list.get(0).length()<=list.get(list.size()-1).length()) : 596 // list.get(0).length()+", "+list.get(list.size()-1).length(); 597 // } 598 599 crisa=makeCrisArray(subsetMode ? null : list); 600 601 ArrayList<HashThread> alht=new ArrayList<HashThread>(THREADS); 602 for(int i=0; i<THREADS; i++){alht.add(new HashThread(false, false, false, true, false));} 603 604 for(HashThread ht : alht){ht.start();} 605 for(HashThread ht : alht){ 606 while(ht.getState()!=Thread.State.TERMINATED){ 607 try { 608 ht.join(); 609 } catch (InterruptedException e) { 610 // TODO Auto-generated catch block 611 e.printStackTrace(); 612 } 613 } 614 assert(ht.matchesT==0); 615 assert(ht.collisionsT==0); 616 assert(ht.baseMatchesT==0); 617 assert(ht.addedToMainT==0); 618 // assert(ht.readsProcessedT==0); 619 // assert(ht.basesProcessedT==0); 620 // matches+=ht.matchesT; 621 // collisions+=ht.collisionsT; 622 containments+=ht.containmentsT; 623 containmentCollisions+=ht.containmentCollisionsT; 624 baseContainments+=ht.baseContainmentsT; 625 // baseMatches+=ht.baseMatchesT; 626 // addedToMain+=ht.addedToMainT; 627 // readsProcessed+=ht.readsProcessedT; 628 // basesProcessed+=ht.basesProcessedT; 629 } 630 alht.clear(); 631 if(verbose){System.err.println("Attempting to close input streams (2).");} 632 for(ConcurrentReadInputStream cris : crisa){ 633 errorState|=ReadWrite.closeStream(cris); 634 } 635 636 if(DISPLAY_PROGRESS){ 637 t.stop(); 638 outstream.println("Found "+containments+" contained sequences."); 639 outstream.println("Finished containment. Time: "+t); 640 Shared.printMemory(); 641 outstream.println(); 642 t.start(); 643 } 644 crisa=null; 645 if(!findOverlaps){ 646 killAffixMaps(); 647 } 648 649 long x=removeInvalid(list); 650 list.clear(); 651 652 if(DISPLAY_PROGRESS){ 653 t.stop(); 654 outstream.println("Removed "+x+" invalid entries."); 655 outstream.println("Finished invalid removal. Time: "+t); 656 Shared.printMemory(); 657 outstream.println(); 658 t.start(); 659 } 660 } 661 findOverlaps(Timer t)662 private void findOverlaps(Timer t){ 663 664 ArrayList<Read> list=new ArrayList<Read>((int)addedToMain); 665 for(ArrayList<Unit> alu : codeMap.values()){ 666 for(Unit u : alu){ 667 if(u.valid() && u.r.pairnum()==0){ 668 u.unitID=list.size(); 669 list.add(u.r); 670 if(u.r.mate!=null){ 671 Unit u2=(Unit)u.r.mate.obj; 672 u2.unitID=u.unitID; 673 } 674 }else{ 675 u.unitID=Integer.MAX_VALUE; 676 } 677 } 678 } 679 680 if(preventTransitiveOverlaps){ 681 clusterNumbers=new AtomicIntegerArray(list.size()); 682 for(int i=0; i<clusterNumbers.length(); i++){ 683 clusterNumbers.set(i, i); 684 } 685 } 686 687 crisa=makeCrisArray(subsetMode ? null : list); 688 689 ArrayList<HashThread> alht=new ArrayList<HashThread>(THREADS); 690 for(int i=0; i<THREADS; i++){alht.add(new HashThread(false, false, false, false, true));} 691 692 for(HashThread ht : alht){ht.start();} 693 for(HashThread ht : alht){ 694 while(ht.getState()!=Thread.State.TERMINATED){ 695 try { 696 ht.join(); 697 } catch (InterruptedException e) { 698 // TODO Auto-generated catch block 699 e.printStackTrace(); 700 } 701 } 702 assert(ht.matchesT==0); 703 assert(ht.collisionsT==0); 704 assert(ht.baseMatchesT==0); 705 assert(ht.addedToMainT==0); 706 707 overlaps+=ht.overlapsT; 708 baseOverlaps+=ht.baseOverlapsT; 709 overlapCollisions+=ht.overlapCollisionsT; 710 } 711 alht.clear(); 712 if(verbose){System.err.println("Attempting to close input streams (3).");} 713 for(ConcurrentReadInputStream cris : crisa){ 714 errorState|=ReadWrite.closeStream(cris); 715 } 716 717 if(DISPLAY_PROGRESS){ 718 t.stop(); 719 outstream.println("Found "+overlaps+" overlaps."); 720 outstream.println("Finished finding overlaps. Time: "+t); 721 Shared.printMemory(); 722 outstream.println(); 723 t.start(); 724 } 725 726 crisa=null; 727 728 if(makeClusters){ 729 int intransitive=0, redundant=0; 730 assert((intransitive=countIntransitive(t, list, rigorousTransitive))==0); 731 assert((redundant=countRedundant(t, list))==0); 732 long overlaps=countOverlaps(t, list); 733 assert(intransitive==0); 734 assert(redundant==0); 735 // makeTransitive(t, list, rigorousTransitive); 736 if(clusterQueue==null){ 737 clusterQueue=new ArrayDeque<ArrayList<Unit>>(list.size()/4+1); 738 processedClusters=new ArrayList<ArrayList<Unit>>(); 739 }else{ 740 assert(clusterQueue.isEmpty()); 741 } 742 makeClusters(t, list); 743 } 744 745 list.clear(); 746 } 747 makeTransitive(Timer t, ArrayList<Read> list, boolean rigorous)748 private long makeTransitive(Timer t, ArrayList<Read> list, boolean rigorous){ 749 assert(false) : "No longer needed."; 750 long added=0; 751 for(Read r : list){ 752 assert(r!=null); 753 Unit u=(Unit) r.obj; 754 assert(u!=null); 755 assert(u.valid()); 756 // outstream.println("Considering "+r.id+"; valid="+u.valid()+", overlaps="+(u.overlapList==null ? "null" : u.overlapList.size())); 757 if(u.valid()){ 758 759 if(u.overlapList!=null){ 760 for(Overlap o : u.overlapList){ 761 Unit u2=(o.u1==u ? o.u2 : o.u1); 762 assert(u2!=u); 763 if(u2.overlapList==null){ 764 u2.overlapList=new ArrayList<Overlap>(2); 765 u2.overlapList.add(o); 766 }else{ 767 boolean found=false; 768 if(rigorous){ 769 found=u2.overlapList.contains(o); 770 }else{ 771 for(Overlap o2 : u2.overlapList){ 772 if(o2.u1==u || o2.u2==u){found=true; break;} 773 } 774 } 775 if(!found){ 776 added++; 777 u2.overlapList.add(o); 778 } 779 } 780 } 781 } 782 } 783 } 784 785 for(Read r : list){ 786 Unit u=(Unit) r.obj; 787 if(u.valid()){ 788 assert(u.isTransitive()); 789 } 790 } 791 792 if(DISPLAY_PROGRESS){ 793 t.stop(); 794 outstream.println("Added overlaps: "+added); 795 outstream.println("Made overlaps transitive. Time: "+t); 796 Shared.printMemory(); 797 outstream.println(); 798 t.start(); 799 } 800 return added; 801 } 802 countIntransitive(Timer t, ArrayList<Read> list, boolean rigorous)803 private int countIntransitive(Timer t, ArrayList<Read> list, boolean rigorous){ 804 if(!countTransitive){return 0;} 805 int transitive=0, intransitive=0; 806 for(Read r : list){ 807 assert(r!=null); 808 Unit u=(Unit) r.obj; 809 assert(u!=null); 810 assert(u.valid()); 811 // outstream.println("Considering "+r.id+"; valid="+u.valid()+", overlaps="+(u.overlapList==null ? "null" : u.overlapList.size())); 812 if(u.valid()){ 813 if(rigorous ? u.isPerfectlyTransitive() : u.isTransitive()){ 814 transitive++; 815 }else{ 816 intransitive++; 817 } 818 } 819 } 820 821 if(DISPLAY_PROGRESS){ 822 t.stop(); 823 outstream.println("Intransitive: "+intransitive+", \ttransitive: "+transitive); 824 outstream.println("Checked transitivity. Time: "+t); 825 Shared.printMemory(); 826 outstream.println(); 827 t.start(); 828 } 829 830 return intransitive; 831 } 832 countRedundant(Timer t, ArrayList<Read> list)833 private int countRedundant(Timer t, ArrayList<Read> list){ 834 if(!countRedundant){return 0;} 835 int redundant=0, nonredundant=0; 836 for(Read r : list){ 837 assert(r!=null); 838 Unit u=(Unit) r.obj; 839 assert(u!=null); 840 assert(u.valid()); 841 // outstream.println("Considering "+r.id+"; valid="+u.valid()+", overlaps="+(u.overlapList==null ? "null" : u.overlapList.size())); 842 if(u.valid()){ 843 if(u.isNonRedundant()){ 844 nonredundant++; 845 }else{ 846 redundant++; 847 } 848 } 849 } 850 851 if(DISPLAY_PROGRESS){ 852 t.stop(); 853 outstream.println("Redundant: "+redundant+", \tnonredundant: "+nonredundant); 854 outstream.println("Checked redundancy. Time: "+t); 855 Shared.printMemory(); 856 outstream.println(); 857 t.start(); 858 } 859 return redundant; 860 } 861 countOverlaps(Timer t, ArrayList<Read> list)862 private long countOverlaps(Timer t, ArrayList<Read> list){ 863 864 long overlaps=0, length=0; 865 for(Read r : list){ 866 assert(r!=null); 867 Unit u=(Unit) r.obj; 868 assert(u!=null); 869 assert(u.valid()); 870 // outstream.println("Considering "+r.id+"; valid="+u.valid()+", overlaps="+(u.overlapList==null ? "null" : u.overlapList.size())); 871 if(u.valid() && u.overlapList!=null){ 872 for(Overlap o : u.overlapList){ 873 overlaps++; 874 length+=o.overlapLen; 875 } 876 } 877 } 878 879 if(DISPLAY_PROGRESS){ 880 t.stop(); 881 outstream.println("Overlaps: "+overlaps+", \tlength: "+length); 882 outstream.println("Counted overlaps. Time: "+t); 883 Shared.printMemory(); 884 outstream.println(); 885 t.start(); 886 } 887 return overlaps; 888 } 889 fillClusterSizeMatrix(ArrayList<ArrayList<Unit>> clusters, long[][] clusterSize)890 private long fillClusterSizeMatrix(ArrayList<ArrayList<Unit>> clusters, long[][] clusterSize){ 891 int max=0; 892 for(ArrayList<Unit> cluster : clusters){ 893 final int cs=Tools.min(clusterSize.length-1, cluster.size()); 894 { 895 long reads=0, bases=0; 896 for(Unit u2 : cluster){ 897 reads++; 898 bases+=u2.length(); 899 } 900 clusterSize[0][cs]++; 901 clusterSize[1][cs]+=reads; 902 clusterSize[2][cs]+=bases; 903 } 904 max=Tools.max(max, cluster.size()); 905 } 906 return max; 907 } 908 makeClusters(Timer t, ArrayList<Read> list)909 private void makeClusters(Timer t, ArrayList<Read> list){ 910 911 final int clusterlen=70000; 912 long[][] clusterSize=new long[3][clusterlen]; 913 int max=0; 914 for(Read r : list){ 915 Unit u=(Unit) r.obj; 916 917 if(!u.clustered()){ 918 ArrayList<Unit> cluster=u.makeCluster(); 919 if(cluster.size()>2){cluster.trimToSize();} 920 if(cluster.size()==1 || (!processClusters && !maxSpanningTree)){processedClusters.add(cluster);} 921 else{clusterQueue.add(cluster);} 922 final int cs=Tools.min(clusterlen-1, cluster.size()); 923 { 924 long reads=0, bases=0; 925 for(Unit u2 : cluster){ 926 reads++; 927 bases+=u2.length(); 928 } 929 clusterSize[0][cs]++; 930 clusterSize[1][cs]+=reads; 931 clusterSize[2][cs]+=bases; 932 } 933 max=Tools.max(max, cluster.size()); 934 } 935 } 936 937 if(DISPLAY_PROGRESS){ 938 t.stop(); 939 outstream.println(toClusterSizeString(clusterSize)); 940 outstream.println("\nLargest: "+max); 941 outstream.println("Finished making clusters. Time: "+t); 942 Shared.printMemory(); 943 outstream.println(); 944 t.start(); 945 } 946 947 948 long x=removeInvalid(list); 949 950 if(DISPLAY_PROGRESS){ 951 t.stop(); 952 outstream.println("Removed "+x+" invalid entries."); 953 outstream.println("Finished invalid removal. Time: "+t); 954 Shared.printMemory(); 955 outstream.println(); 956 t.start(); 957 } 958 } 959 toClusterSizeString(long[][] clusterSizeMatrix)960 private String toClusterSizeString(long[][] clusterSizeMatrix){ 961 962 long[] clusterSize=clusterSizeMatrix[0]; 963 long[] clusterReads=clusterSizeMatrix[1]; 964 long[] clusterBases=clusterSizeMatrix[2]; 965 966 long totalClusters=Tools.sum(clusterSize); 967 968 long bigClusters=0; 969 for(int i=minClusterSize; i<clusterSize.length; i++){ 970 bigClusters+=clusterSize[i]; 971 } 972 973 final int spaces=18; 974 final int spaces2=spaces*2, spaces3=spaces*3; 975 976 final StringBuilder sb=new StringBuilder(100), sb2=new StringBuilder(1000); 977 sb2.append("Clusters:"); 978 while(sb2.length()<spaces){sb2.append(' ');} 979 sb2.append(totalClusters+(minClusterSize<2 ? "" : " ("+bigClusters+" of at least size "+minClusterSize+")")+"\n"); 980 981 sb.append("Size Range"); 982 while(sb.length()<spaces){sb.append(' ');} 983 sb.append("Clusters"); 984 while(sb.length()<spaces2){sb.append(' ');} 985 sb.append("Reads"); 986 while(sb.length()<spaces3){sb.append(' ');} 987 sb.append("Bases"); 988 989 sb2.append('\n'); 990 sb2.append(sb); 991 sb.setLength(0); 992 993 for(int i=0; i<clusterSize.length-1; i=Tools.max(i+1, i*2)){ 994 int a=i+1, b=i*2; 995 if(i<2){ 996 sb.append(a); 997 while(sb.length()<spaces){sb.append(' ');} 998 sb.append(clusterSize[a]); 999 while(sb.length()<spaces2){sb.append(' ');} 1000 sb.append(clusterReads[a]); 1001 while(sb.length()<spaces3){sb.append(' ');} 1002 sb.append(clusterBases[a]); 1003 }else if(b>=clusterSize.length){ 1004 long x=Tools.sum(clusterSize, a, clusterSize.length-1); 1005 long y=Tools.sum(clusterReads, a, clusterSize.length-1); 1006 long z=Tools.sum(clusterBases, a, clusterSize.length-1); 1007 if(x>0){ 1008 sb.append(a+"+"); 1009 while(sb.length()<spaces){sb.append(' ');} 1010 sb.append(x); 1011 while(sb.length()<spaces2){sb.append(' ');} 1012 sb.append(y); 1013 while(sb.length()<spaces3){sb.append(' ');} 1014 sb.append(z); 1015 } 1016 }else{ 1017 long x=Tools.sum(clusterSize, a, b); 1018 long y=Tools.sum(clusterReads, a, b); 1019 long z=Tools.sum(clusterBases, a, b); 1020 if(x>0){ 1021 sb.append(a+"-"+b); 1022 while(sb.length()<spaces){sb.append(' ');} 1023 sb.append(x); 1024 while(sb.length()<spaces2){sb.append(' ');} 1025 sb.append(y); 1026 while(sb.length()<spaces3){sb.append(' ');} 1027 sb.append(z); 1028 } 1029 } 1030 if(sb.length()>0){ 1031 sb2.append('\n'); 1032 sb2.append(sb); 1033 sb.setLength(0); 1034 } 1035 } 1036 return sb2.toString(); 1037 } 1038 renameClusters(Timer t)1039 private void renameClusters(Timer t){ 1040 assert(false) : "This is now unused; renaming is done at output time."; 1041 int cnum=0; 1042 final StringBuilder sb=new StringBuilder(64); 1043 for(ArrayList<Unit> alu : processedClusters){ 1044 for(int i=0; i<alu.size(); i++){ 1045 Unit u=alu.get(i); 1046 Read r=u.r; 1047 sb.append("Cluster "); 1048 sb.append(cnum); 1049 sb.append(",contig "); 1050 sb.append(i); 1051 if(u.offsetValid()){ 1052 sb.append(",pos "); 1053 sb.append(u.offset()); 1054 } 1055 r.id=sb.toString(); 1056 sb.setLength(0); 1057 } 1058 } 1059 1060 if(DISPLAY_PROGRESS){ 1061 t.stop(); 1062 outstream.println("Finished cluster renaming. Time: "+t); 1063 Shared.printMemory(); 1064 outstream.println(); 1065 t.start(); 1066 } 1067 } 1068 processMst(Timer t)1069 private void processMst(Timer t){ 1070 1071 if(DISPLAY_PROGRESS){outstream.println("Converting to Maximum Spanning Tree.");} 1072 1073 ArrayList<MstThread> alct=new ArrayList<MstThread>(THREADS); 1074 for(int i=0; i<THREADS; i++){ 1075 alct.add(new MstThread()); 1076 } 1077 1078 long overlapsRemoved=0; 1079 long overlapBasesRemoved=0; 1080 long overlapsRetained=0; 1081 long overlapBasesRetained=0; 1082 1083 for(MstThread ct : alct){ct.start();} 1084 for(MstThread ct : alct){ 1085 while(ct.getState()!=Thread.State.TERMINATED){ 1086 try { 1087 ct.join(); 1088 } catch (InterruptedException e) { 1089 e.printStackTrace(); 1090 } 1091 } 1092 1093 overlapsRemoved+=ct.overlapsRemovedT; 1094 overlapBasesRemoved+=ct.overlapBasesRemovedT; 1095 overlapsRetained+=ct.overlapsRetainedT; 1096 overlapBasesRetained+=ct.overlapBasesRetainedT; 1097 } 1098 assert(clusterQueue.isEmpty()); 1099 if(processClusters){ 1100 for(MstThread ct : alct){ 1101 clusterQueue.addAll(ct.processedT); 1102 ct.processedT.clear(); 1103 ct.processedT=null; 1104 } 1105 }else{ 1106 for(MstThread ct : alct){ 1107 processedClusters.addAll(ct.processedT); 1108 ct.processedT.clear(); 1109 ct.processedT=null; 1110 } 1111 clusterQueue=null; 1112 } 1113 alct.clear(); 1114 1115 assert(affixMaps==null); 1116 killAffixMaps(); 1117 1118 1119 if(DISPLAY_PROGRESS){ 1120 t.stop(); 1121 outstream.println("Removed "+(overlapsRemoved)+" edges ("+overlapBasesRemoved+" bases)."); 1122 outstream.println("Retained "+(overlapsRetained)+" edges ("+overlapBasesRetained+" bases)."); 1123 1124 // outstream.println("\nAfter conversion to Maximum Spanning Tree:"); 1125 // final int[] clusterSize=new int[8200]; 1126 // int max=0; 1127 // for(ArrayList<Unit> cluster : processedClusters){ 1128 // clusterSize[Tools.min(clusterSize.length-1, cluster.size())]++; 1129 // max=Tools.max(max, cluster.size()); 1130 // } 1131 // outstream.println(toClusterSizeString(clusterSize)); 1132 // outstream.println("Largest: "+max); 1133 1134 outstream.println("Finished MST conversion. Time: "+t); 1135 Shared.printMemory(); 1136 outstream.println(); 1137 t.start(); 1138 } 1139 } 1140 processClusters(Timer t, boolean mergeClusters)1141 private void processClusters(Timer t, boolean mergeClusters){ 1142 1143 ArrayList<ClusterThread> alct=new ArrayList<ClusterThread>(THREADS); 1144 for(int i=0; i<THREADS; i++){ 1145 alct.add(new ClusterThread(fixMultiJoins, canonicizeClusters, removeCycles, fixCanonContradictions, fixOffsetContradictions, 1146 mergeClusters, mergeClusters, mergeClusters)); 1147 } 1148 1149 long leafMerges=0; 1150 long innerMerges=0; 1151 long leafBaseMerges=0; 1152 long innerBaseMerges=0; 1153 1154 long multiJoinFailures=0; 1155 long multiJoinsFound=0; 1156 long multiJoinBasesFound=0; 1157 long unitsFlipped=0; 1158 long overlapsFlipped=0; 1159 long canonContradictoryOverlaps=0; 1160 long canonContradictoryClusters=0; 1161 long offsetContradictoryOverlaps=0; 1162 long offsetContradictoryClusters=0; 1163 long cycleOverlaps=0; 1164 long cycleClusters=0; 1165 1166 for(ClusterThread ct : alct){ct.start();} 1167 for(ClusterThread ct : alct){ 1168 while(ct.getState()!=Thread.State.TERMINATED){ 1169 try { 1170 ct.join(); 1171 } catch (InterruptedException e) { 1172 e.printStackTrace(); 1173 } 1174 } 1175 1176 leafMerges+=ct.leafMergesT; 1177 innerMerges+=ct.innerMergesT; 1178 leafBaseMerges+=ct.leafBaseMergesT; 1179 innerBaseMerges+=ct.innerBaseMergesT; 1180 1181 multiJoinFailures+=ct.multiJoinFailuresT; 1182 multiJoinsFound+=ct.multiJoinsFoundT; 1183 multiJoinBasesFound+=ct.multiJoinBasesFoundT; 1184 unitsFlipped+=ct.unitsFlippedT; 1185 overlapsFlipped+=ct.overlapsFlippedT; 1186 canonContradictoryOverlaps+=ct.canonContradictoryOverlapsT; 1187 canonContradictoryClusters+=ct.canonContradictoryClustersT; 1188 offsetContradictoryOverlaps+=ct.offsetContradictoryOverlapsT; 1189 offsetContradictoryClusters+=ct.offsetContradictoryClustersT; 1190 cycleOverlaps+=ct.cycleOverlapsT; 1191 cycleClusters+=ct.cycleClustersT; 1192 } 1193 alct.clear(); 1194 1195 assert(affixMaps==null); 1196 killAffixMaps(); 1197 1198 assert(clusterQueue.isEmpty()); 1199 clusterQueue=null; 1200 1201 if(DISPLAY_PROGRESS){ 1202 t.stop(); 1203 if(fixMultiJoins){ 1204 outstream.println("Found "+(multiJoinsFound)+" multijoins ("+multiJoinBasesFound+" bases)."); 1205 outstream.println("Experienced "+(multiJoinFailures)+" multijoin removal failures."); 1206 } 1207 if(canonicizeClusters){ 1208 outstream.println("Flipped "+(unitsFlipped)+" reads and "+overlapsFlipped+" overlaps."); 1209 outstream.println("Found "+(canonContradictoryClusters)+" clusters ("+canonContradictoryOverlaps+" overlaps) with contradictory orientation cycles."); 1210 } 1211 if(fixOffsetContradictions){ 1212 outstream.println("Found "+(offsetContradictoryClusters)+" clusters ("+offsetContradictoryOverlaps+" overlaps) with contradictory offset cycles."); 1213 } 1214 outstream.println("Found "+(cycleClusters)+" clusters ("+cycleOverlaps+" overlaps) with remaining cycles."); 1215 if(absorbOverlap){ 1216 outstream.println("Merged "+(leafMerges)+" leaves ("+leafBaseMerges+" bases)."); 1217 outstream.println("Merged "+(innerMerges)+" nonleaves ("+innerBaseMerges+" bases)."); 1218 } 1219 1220 outstream.println("\nAfter processing clusters:"); 1221 final long[][] clusterSize=new long[3][70000]; 1222 final long max=fillClusterSizeMatrix(processedClusters, clusterSize); 1223 outstream.println(toClusterSizeString(clusterSize)); 1224 outstream.println("\nLargest: "+max); 1225 1226 outstream.println("Finished processing. Time: "+t); 1227 Shared.printMemory(); 1228 outstream.println(); 1229 t.start(); 1230 } 1231 } 1232 removeInvalid(ArrayList<Read> list)1233 private long removeInvalid(ArrayList<Read> list){ 1234 final LongM keym=new LongM(); 1235 long removedC=0, removedP=0, removedS=0, invalid=0; 1236 1237 for(int j=0, lim=list.size(); j<lim; j++){ 1238 final Read r=list.get(j); 1239 final Unit u=(Unit)r.obj; 1240 1241 if(!u.valid()){ 1242 1243 invalid++; 1244 1245 if(codeMap!=null && !codeMap.isEmpty()){ 1246 Long key=u.code1; 1247 ArrayList<Unit> alu=codeMap.get(key); 1248 if(alu!=null){ 1249 int valid=0; 1250 for(int i=alu.size()-1; i>=0; i--){ 1251 Unit u2=alu.get(i); 1252 if(u2==null || !u2.valid()){ 1253 alu.remove(i); 1254 removedC++; 1255 } 1256 else{valid++;} 1257 } 1258 if(valid==0){codeMap.remove(key);} 1259 } 1260 } 1261 1262 for(int num=0; num<numAffixMaps && affixMaps!=null; num++){ 1263 HashMap<LongM, ArrayList<Unit>> map=affixMaps[num]; 1264 if(map!=null && !map.isEmpty()){ 1265 if(u.prefixes[num]!=-1){ 1266 keym.set(u.prefixes[num]); 1267 ArrayList<Unit> alu=map.get(keym); 1268 if(alu!=null){ 1269 int valid=0; 1270 for(int i=alu.size()-1; i>=0; i--){ 1271 Unit u2=alu.get(i); 1272 if(u2==null || !u2.valid()){ 1273 alu.remove(i); 1274 removedP++; 1275 } 1276 else{valid++;} 1277 } 1278 if(valid==0){map.remove(keym);} 1279 } 1280 } 1281 if(storeSuffix && u.suffixes[num]!=-1){ 1282 keym.set(u.suffixes[num]); 1283 ArrayList<Unit> alu=map.get(keym); 1284 if(alu!=null){ 1285 int valid=0; 1286 for(int i=alu.size()-1; i>=0; i--){ 1287 Unit u2=alu.get(i); 1288 if(u2==null || !u2.valid()){ 1289 alu.remove(i); 1290 removedS++; 1291 } 1292 else{valid++;} 1293 } 1294 if(valid==0){map.remove(keym);} 1295 } 1296 } 1297 } 1298 } 1299 list.set(j, null); 1300 } 1301 } 1302 1303 if(invalid>0){ 1304 Tools.condenseStrict(list); 1305 } 1306 if(verbose){ 1307 outstream.println("Removed invalids: "+removedC+", "+removedP+", "+removedS); 1308 } 1309 return invalid; 1310 } 1311 1312 addToArray(HashMap<Long, ArrayList<Unit>> codeMap, boolean sort, boolean clear, long outNum)1313 private static ArrayList<Read> addToArray(HashMap<Long, ArrayList<Unit>> codeMap, boolean sort, boolean clear, long outNum){ 1314 assert(outNum<=Integer.MAX_VALUE); 1315 if(verbose){System.err.println("Making list.");} 1316 ArrayList<Read> list=new ArrayList<Read>((int)outNum); 1317 if(verbose){System.err.println("Adding.");} 1318 for(ArrayList<Unit> alu : codeMap.values()){ 1319 for(Unit u : alu){ 1320 if(u.valid() && u.r.pairnum()==0){list.add(u.r);} 1321 } 1322 if(clear){alu.clear();} 1323 } 1324 if(clear){codeMap.clear();} 1325 1326 if(sort){ 1327 if(verbose){System.err.println("Sorting.");} 1328 Shared.sort(list, comparator); 1329 // if(ascending){ 1330 // Collections.reverse(list); 1331 // assert(list.isEmpty() || list.get(0).length()<=list.get(list.size()-1).length()) : 1332 // list.get(0).length()+", "+list.get(list.size()-1).length(); 1333 // }else{ 1334 // assert(list.isEmpty() || list.get(0).length()>=list.get(list.size()-1).length()) : 1335 // list.get(0).length()+", "+list.get(list.size()-1).length(); 1336 // } 1337 } 1338 assert(list.size()==outNum || list.size()*2L==outNum || UNIQUE_ONLY) : list.size()+", "+outNum; 1339 return list; 1340 } 1341 writeOutput(String clusterStatsFile, Timer t)1342 private void writeOutput(String clusterStatsFile, Timer t){ 1343 // verbose=true; 1344 // assert(false) : (processedClusters==null)+", "+(processedClusters.isEmpty())+", "+outgraph+", "+out+", "+clusterFilePattern; 1345 if(processedClusters==null || processedClusters.isEmpty()){ 1346 1347 if(out!=null || clusterFilePattern!=null){ 1348 1349 ArrayList<Read> list=addToArray(codeMap, sort, true, addedToMain-containments); 1350 codeMap=null; 1351 1352 if(sort){ 1353 if(DISPLAY_PROGRESS){ 1354 t.stop(); 1355 outstream.println("Sorted output. Time: "+t); 1356 Shared.printMemory(); 1357 outstream.println(); 1358 t.start(); 1359 } 1360 } 1361 1362 writeOutput(list); 1363 } 1364 }else{ 1365 if(outgraph!=null){ 1366 writeGraph(outgraph, processedClusters); 1367 } 1368 if(out!=null || clusterFilePattern!=null || clusterStatsFile!=null || outbest!=null){ 1369 writeOutputClusters(clusterStatsFile, processedClusters); 1370 } 1371 } 1372 1373 if(DISPLAY_PROGRESS){ 1374 t.stop(); 1375 outstream.println("Printed output. Time: "+t); 1376 Shared.printMemory(); 1377 outstream.println(); 1378 t.start(); 1379 } 1380 } 1381 1382 1383 writeOutput(ArrayList<Read> list)1384 private void writeOutput(ArrayList<Read> list){ 1385 1386 final ByteStreamWriter tsw=(out==null ? null : new ByteStreamWriter(out, overwrite, append, true)); 1387 1388 if(verbose){System.err.println("Writing from array.");} 1389 tsw.start(); 1390 1391 HashSet<String> names=((uniqueNames && storeName) ? 1392 new HashSet<String>(Tools.min(Integer.MAX_VALUE, Tools.max((int)addedToMain, (int)(addedToMain*1.35)))) : null); 1393 long rid=0; 1394 for(int x=0; x<list.size(); x++){ 1395 Read r=list.get(x); 1396 list.set(x, null); 1397 1398 if(r.mate!=null && r.pairnum()!=0){r=r.mate;} 1399 1400 if(!r.discarded()){ 1401 rid++; 1402 1403 for(int i=0; r!=null && i<2; i++){ 1404 if(multipleInputFiles){r.numericID=rid;} 1405 if(names!=null){ 1406 String name=(r.id==null ? ""+r.numericID : r.id); 1407 if(names.contains(name)){ 1408 for(long j=0; j<Integer.MAX_VALUE; j++){ 1409 String name2=name+"_dd"+j; 1410 if(r.mate!=null){name2+=(" /"+(i+1));} 1411 if(!names.contains(name2)){ 1412 r.id=name2; 1413 names.add(name2); 1414 break; 1415 } 1416 } 1417 }else{ 1418 names.add(name); 1419 } 1420 } 1421 tsw.println(r); 1422 r.setDiscarded(true); 1423 r=r.mate; 1424 } 1425 } 1426 } 1427 if(verbose){System.err.println("Shutting down tsw "+tsw.fname);} 1428 tsw.poisonAndWait(); 1429 } 1430 1431 writeOutputClusters(String clusterStatsFile, ArrayList<ArrayList<Unit>> clist)1432 private void writeOutputClusters(String clusterStatsFile, ArrayList<ArrayList<Unit>> clist){ 1433 1434 // Shared.sort(clist, CLUSTER_LENGTH_COMPARATOR); 1435 clist.sort(CLUSTER_LENGTH_COMPARATOR); 1436 1437 if(verbose){System.err.println("Writing clusters.");} 1438 1439 final ByteStreamWriter tswAll=(out==null ? null : new ByteStreamWriter(out, overwrite, append, true)); 1440 if(tswAll!=null){tswAll.start();} 1441 ByteStreamWriter tswCluster=null; 1442 ByteStreamWriter tswBest=null; 1443 1444 if(outbest!=null){ 1445 tswBest=new ByteStreamWriter(outbest, overwrite, append, true); 1446 tswBest.start(); 1447 } 1448 1449 TextStreamWriter csf=null; 1450 if(clusterStatsFile!=null){ 1451 csf=new TextStreamWriter(clusterStatsFile, overwrite, false, false); 1452 csf.start(); 1453 csf.print("#Name\tsize\t"+nmerLength+"-mer frequencies\n"); 1454 } 1455 1456 HashSet<String> names=((uniqueNames && storeName) ? 1457 new HashSet<String>(Tools.min(Integer.MAX_VALUE, Tools.max((int)addedToMain, (int)(addedToMain*1.35)))) : null); 1458 long rid=0; 1459 final long[] nmerCounts=new long[maxNmer+1]; 1460 1461 final StringBuilder sb=new StringBuilder(64); 1462 1463 for(int cnum=0; cnum<clist.size(); cnum++){ 1464 final ArrayList<Unit> alu=clist.get(cnum); 1465 // clist.set(cnum, null); //This breaks subsequent output processing 1466 1467 if(alu.size()<minClusterSize){ 1468 if(verbose){System.err.println("Ignoring small cluster "+cnum+", size "+alu.size());} 1469 1470 if(csf!=null && alu.size()>=minClusterSizeForStats){ 1471 float[] profile=makeNmerProfile(alu, nmerCounts); 1472 sb.append("Cluster_"); 1473 sb.append(cnum); 1474 sb.append('\t'); 1475 sb.append(alu.size()); 1476 sb.append('\t'); 1477 for(float f : profile){ 1478 sb.append(String.format(Locale.ROOT, "%.5f ", f)); 1479 } 1480 sb.setCharAt(sb.length()-1, '\n'); 1481 csf.print(sb.toString()); 1482 sb.setLength(0); 1483 } 1484 }else{ 1485 if(verbose){System.err.println("Writing cluster "+cnum+", size "+alu.size());} 1486 1487 if(clusterFilePattern!=null){ 1488 if(tswCluster!=null){ 1489 if(verbose){System.err.println("Shutting down tswCluster "+tswCluster.fname);} 1490 tswCluster.poisonAndWait(); 1491 tswCluster=null; 1492 } 1493 tswCluster=new ByteStreamWriter(clusterFilePattern.replaceFirst("%", ""+cnum), overwrite, append, true); 1494 if(verbose){System.err.println("Starting tswCluster "+tswCluster.fname);} 1495 tswCluster.start(); 1496 } 1497 1498 if(csf!=null && alu.size()>=minClusterSizeForStats){ 1499 float[] profile=makeNmerProfile(alu, nmerCounts); 1500 sb.append("Cluster_"); 1501 sb.append(cnum); 1502 sb.append('\t'); 1503 sb.append(alu.size()); 1504 sb.append('\t'); 1505 for(float f : profile){ 1506 sb.append(String.format(Locale.ROOT, "%.5f ", f)); 1507 } 1508 sb.setCharAt(sb.length()-1, '\n'); 1509 csf.print(sb.toString()); 1510 sb.setLength(0); 1511 } 1512 1513 if(pickBestRepresentativePerCluster){ 1514 pickBestRepresenative(alu, true); 1515 } 1516 1517 if(outbest!=null){ 1518 Unit u=pickBestRepresenative((ArrayList<Unit>)alu.clone(), false); 1519 tswBest.println(u.r); 1520 if(u.r.mate!=null){tswBest.println(u.r.mate);} 1521 } 1522 1523 for(int contig=0; contig<alu.size(); contig++){ 1524 final Unit u0=alu.get(contig); 1525 alu.set(contig, null); 1526 Read r=u0.r; 1527 if(r.mate!=null && r.pairnum()!=0){r=r.mate;} 1528 1529 if(!r.discarded()){ 1530 rid++; 1531 1532 for(int i=0; r!=null && i<2; i++){ 1533 assert(r.pairnum()==i) : i+", "+r.pairnum()+", "+(r.mate==null ? 9 : r.mate.pairnum()); 1534 Unit u=(Unit)r.obj; 1535 if(verbose){System.err.println("Writing read "+r.id);} 1536 r.numericID=rid; 1537 if(renameClusters){ 1538 sb.append("Cluster_"); 1539 sb.append(cnum); 1540 sb.append(",contig_"); 1541 sb.append(contig); 1542 if(u.offsetValid()){ 1543 sb.append(",pos_"); 1544 sb.append(u.offset()); 1545 } 1546 if(r.mate!=null){sb.append(" /"+(i+1));} 1547 r.id=(r.id==null ? sb.toString() : r.id+"\t"+sb); 1548 sb.setLength(0); 1549 }else if(names!=null){ 1550 String name=(r.id==null ? ""+r.numericID : r.id); 1551 if(names.contains(name)){ 1552 for(long j=0; j<Integer.MAX_VALUE; j++){ 1553 String name2=name+"_dd"+j; 1554 if(!names.contains(name2)){ 1555 r.id=name2; 1556 names.add(name2); 1557 break; 1558 } 1559 } 1560 }else{ 1561 names.add(name); 1562 } 1563 } 1564 if(tswAll!=null){tswAll.println(r);} 1565 if(tswCluster!=null){tswCluster.println(r);} 1566 r.setDiscarded(true); 1567 r=r.mate; 1568 } 1569 } 1570 } 1571 } 1572 } 1573 if(csf!=null){ 1574 if(verbose){System.err.println("Shutting down csf "+csf.fname);} 1575 csf.poisonAndWait(); 1576 } 1577 if(tswBest!=null){ 1578 if(verbose){System.err.println("Shutting down tswBest "+tswBest.fname);} 1579 tswBest.poisonAndWait(); 1580 } 1581 if(tswAll!=null){ 1582 if(verbose){System.err.println("Shutting down tswAll "+tswAll.fname);} 1583 tswAll.poisonAndWait(); 1584 } 1585 if(tswCluster!=null){ 1586 if(verbose){System.err.println("Shutting down tswCluster "+tswCluster.fname);} 1587 tswCluster.poisonAndWait(); 1588 } 1589 } 1590 1591 writeGraph(String graphFile, ArrayList<ArrayList<Unit>> clist)1592 private void writeGraph(String graphFile, ArrayList<ArrayList<Unit>> clist){ 1593 // Shared.sort(clist, CLUSTER_LENGTH_COMPARATOR); 1594 clist.sort(CLUSTER_LENGTH_COMPARATOR); 1595 1596 if(verbose){System.err.println("Writing overlap graph.");} 1597 1598 final TextStreamWriter tsw=(graphFile==null ? null : new TextStreamWriter(graphFile, overwrite, append, true)); 1599 if(tsw!=null){ 1600 tsw.start(); 1601 tsw.print("digraph G {\n"); 1602 } 1603 1604 for(int cnum=0; cnum<clist.size(); cnum++){ 1605 final ArrayList<Unit> alu=clist.get(cnum); 1606 // clist.set(cnum, null); //This breaks subsequent output processing 1607 // Shared.sort(alu); //TODO: Remove 1608 1609 if(alu.size()<minClusterSize){ 1610 if(verbose){System.err.println("Ignoring small cluster "+cnum+", size "+alu.size());} 1611 }else{ 1612 if(verbose){System.err.println("Processing cluster "+cnum+", size "+alu.size());} 1613 1614 for(int contig=0; contig<alu.size(); contig++){ 1615 final Unit u0=alu.get(contig); 1616 // alu.set(contig, null); //This breaks subsequent output processing 1617 Read r=u0.r; 1618 if(r.mate!=null && r.pairnum()!=0){r=r.mate;} 1619 1620 { 1621 for(int i=0; r!=null && i<2; i++){ 1622 assert(r.pairnum()==i) : i+", "+r.pairnum()+", "+(r.mate==null ? 9 : r.mate.pairnum()); 1623 Unit u=(Unit)r.obj; 1624 if(verbose){System.err.println("Writing read "+r.id);} 1625 1626 if(tsw!=null){ 1627 tsw.print("\t"+toGraphName(r)+"\n"); 1628 if(r.mate!=null && r.pairnum()==0){ 1629 Read r2=r.mate; 1630 tsw.print(toGraphName(r)+" -> "+toGraphName(r2)+" [label=mate]"); 1631 } 1632 if(u.overlapList!=null){ 1633 for(Overlap o : u.overlapList){ 1634 if(u==o.u1){ 1635 Read r2=o.u2.r; 1636 tsw.print("\t"+toGraphName(r)+" -> "+toGraphName(r2)+" [label=\""+o.toLabel()+"\"]\n"); 1637 } 1638 } 1639 } 1640 } 1641 r=r.mate; 1642 } 1643 } 1644 } 1645 } 1646 } 1647 1648 if(tsw!=null){ 1649 tsw.print("}\n"); 1650 if(verbose){System.err.println("Shutting down tswAll "+tsw.fname);} 1651 tsw.poisonAndWait(); 1652 } 1653 } 1654 toGraphName(Read r)1655 private static String toGraphName(Read r){ 1656 if(NUMBER_GRAPH_NODES || r.id==null){ 1657 return r.numericID+((ADD_PAIRNUM_TO_NAME || r.mate!=null) ? "."+(r.pairnum()+1) : ""); 1658 }else{ 1659 return r.id.replace(' ','_').replace('\t','_'); 1660 } 1661 } 1662 pickBestRepresenative(ArrayList<Unit> alu, boolean clearList)1663 private Unit pickBestRepresenative(ArrayList<Unit> alu, boolean clearList){ 1664 if(alu==null || alu.isEmpty()){return null;} 1665 float[] quality=new float[alu.size()]; 1666 int[] lengths=new int[alu.size()]; 1667 for(int i=0; i<alu.size(); i++){ 1668 Unit u=alu.get(i); 1669 int len=u.r.length(); 1670 quality[i]=u.r.expectedErrors(true, 0)/len; 1671 lengths[i]=len; 1672 } 1673 Arrays.sort(quality); 1674 Arrays.sort(lengths); 1675 int medianLength=lengths[lengths.length/2]; 1676 float bestQuality=quality[0]; 1677 1678 float currentBestQuality=9999999; 1679 Unit best=null; 1680 for(int i=0; i<alu.size(); i++){ 1681 Unit u=alu.get(i); 1682 int len=u.r.length(); 1683 float deviation=Tools.absdif(len, medianLength)*1f/(medianLength+1); 1684 if(deviation<0.05){ 1685 float qual=u.r.expectedErrors(true, 0)/len; 1686 qual=(qual+.001f)*(1+10*deviation); 1687 if(qual<currentBestQuality || best==null){ 1688 currentBestQuality=qual; 1689 best=u; 1690 } 1691 } 1692 } 1693 if(clearList){ 1694 alu.clear(); 1695 alu.add(best); 1696 } 1697 return best; 1698 } 1699 hash(byte[] bases)1700 public static long hash(byte[] bases){ 1701 long code=bases.length; 1702 for(int i=0; i<bases.length; i++){ 1703 byte b=bases[i]; 1704 int mode=(int)(code&31); 1705 assert(hashcodes[b]!=null) : "Invalid sequence character: '"+(char)b+"'"; 1706 code=code^hashcodes[b][mode]; 1707 code=Long.rotateLeft(code, 1); 1708 } 1709 return code; 1710 } 1711 1712 hashReversed(byte[] bases)1713 public static long hashReversed(byte[] bases){ 1714 long code=bases.length; 1715 for(int i=bases.length-1; i>=0; i--){ 1716 byte b=bases[i]; 1717 assert(hashcodes[b]!=null) : "Invalid sequence character: '"+(char)b+"'"; 1718 b=baseToComplementExtended[b]; 1719 int mode=(int)(code&31); 1720 code=code^hashcodes[b][mode]; 1721 code=Long.rotateLeft(code, 1); 1722 } 1723 return code; 1724 } 1725 1726 isCanonical(byte[] bases)1727 public static boolean isCanonical(byte[] bases){ 1728 if(ignoreReverseComplement || bases==null || bases.length==0){return true;} 1729 final int lim=(bases.length+1)/2; 1730 for(int i=0, j=bases.length-1; i<lim; i++, j--){ 1731 byte a=bases[i], b=baseToComplementExtended[bases[j]]; 1732 if(a<b){return true;} 1733 if(b<a){return false;} 1734 } 1735 assert((bases.length&1)==0 || bases[lim-1]==baseToComplementExtended[bases[lim-1]]) : 1736 bases.length+", "+lim+", "+bases[lim-1]+", "+(char)bases[lim-1]+(bases.length<1000 ? "\n'"+new String(bases)+"'\n" : ""); //palindrome absorb 1737 return true; //palindrome 1738 } 1739 1740 1741 private static synchronized long[][] makeCodes(int symbols, int modes){ 1742 Random randy=new Random(1); 1743 long[][] r=new long[symbols][modes]; 1744 for(int i=0; i<symbols; i++){ 1745 for(int j=0; j<modes; j++){ 1746 r[i][j]=randy.nextLong(); 1747 } 1748 } 1749 return r; 1750 } 1751 1752 // /** Handles IUPAC codes */ 1753 // private static synchronized long[][] makeCodes2(int modes){ 1754 // long[][] r0=makeCodes(26, modes); 1755 // long[][] r=new long[Tools.max('Z','z')+1][]; 1756 // for(int i=0; i<26; i++){ 1757 // char c=(char)('A'+i); 1758 // r[c]=r[Tools.toLowerCase(c)]=r0[i]; 1759 // } 1760 // return r; 1761 // } 1762 1763 /** Handles IUPAC codes and invalid symbols */ 1764 private static synchronized long[][] makeCodes2(int modes){ 1765 long[][] r=makeCodes(128, modes); 1766 1767 for(int i=0; i<26; i++){ 1768 char c=(char)('A'+i); 1769 r[Tools.toLowerCase(c)]=r[c]; 1770 } 1771 return r; 1772 } 1773 1774 private void addDupe(Read r){ 1775 if(dupeWriter==null){return;} 1776 if(r.mate==null || r.pairnum()==0){ 1777 synchronized(dupeWriter){ 1778 dupeWriter.println(r); 1779 if(r.mate!=null){ 1780 dupeWriter.println(r.mate); 1781 } 1782 } 1783 } 1784 } 1785 1786 1787 private final class MstThread extends Thread{ 1788 1789 public MstThread(){} 1790 1791 @Override 1792 public void run(){ 1793 1794 ArrayList<Unit> cluster=null; 1795 while((cluster=nextCluster())!=null){ 1796 makeMst(cluster); 1797 processedT.add(cluster); 1798 } 1799 1800 } 1801 1802 public void makeMst(ArrayList<Unit> cluster){ 1803 assert(heap.isEmpty()); 1804 unvisit(cluster); 1805 for(Unit u : cluster){ 1806 u.flags&=~Unit.VISIT_MASK; 1807 Shared.sort(u.overlapList); 1808 } 1809 { 1810 Unit u=cluster.get(0); 1811 u.setVisited(true); 1812 heap.addAll(u.overlapList); 1813 } 1814 // assert(false) : cluster.size(); 1815 while(!heap.isEmpty()){ 1816 Overlap o=heap.poll(); 1817 assert(!o.mst()); 1818 if(!o.invalid()){ 1819 // assert(o.u1.overlapList.contains(o)); //slow 1820 // assert(o.u2.overlapList.contains(o)); //slow 1821 assert(o.u1.visited() || o.u2.visited()); 1822 final Unit u=(!o.u1.visited() ? o.u1 : !o.u2.visited()? o.u2 : null); 1823 if(u!=null){ 1824 o.setMst(true); 1825 u.setVisited(true); 1826 overlapsRetainedT++; 1827 overlapBasesRetainedT+=o.overlapLen; 1828 for(Overlap o2 : u.overlapList){ 1829 if(o2.mst()){ 1830 //do nothing 1831 }else if(!o2.u1.visited() || !o2.u2.visited()){ 1832 if(heap.size()>=Integer.MAX_VALUE){ 1833 removeInvalid(heap); 1834 } 1835 heap.add(o2); 1836 }else if(!o2.invalid()){ 1837 o2.setInvalid(true); 1838 overlapsRemovedT++; 1839 overlapBasesRemovedT+=o2.overlapLen; 1840 } 1841 } 1842 } 1843 } 1844 } 1845 for(Unit u : cluster){ 1846 ArrayList<Overlap> alo=u.overlapList; 1847 int removed=0; 1848 for(int i=0; i<alo.size(); i++){ 1849 Overlap o=alo.get(i); 1850 if(o.invalid()){ 1851 assert(!o.mst()); 1852 alo.set(i, null); 1853 removed++; 1854 }else{ 1855 assert(o.mst()); 1856 } 1857 } 1858 if(removed>0){ 1859 Tools.condenseStrict(alo); 1860 alo.trimToSize(); 1861 } 1862 } 1863 } 1864 1865 private void removeInvalid(PriorityQueue<Overlap> heap){ 1866 ArrayList<Overlap> valid=new ArrayList<Overlap>(heap.size()); 1867 for(Overlap o : heap){ 1868 if(!o.invalid()){ 1869 assert(!o.u1.visited() || !o.u2.visited()); 1870 valid.add(o); 1871 } 1872 } 1873 heap.clear(); 1874 heap.addAll(valid); 1875 } 1876 1877 1878 public long overlapsRemovedT=0; 1879 public long overlapBasesRemovedT=0; 1880 public long overlapsRetainedT=0; 1881 public long overlapBasesRetainedT=0; 1882 1883 private final PriorityQueue<Overlap> heap=new PriorityQueue<Overlap>((1<<16)-1); 1884 private ArrayList<ArrayList<Unit>> processedT=new ArrayList<ArrayList<Unit>>(); 1885 } 1886 1887 1888 /** 1889 * Processes clustered sets of reads. 1890 * @author Brian Bushnell 1891 * @date Aug 9, 2013 1892 * 1893 */ 1894 private final class ClusterThread extends Thread{ 1895 1896 public ClusterThread(boolean fixMultiJoins_, boolean canonicize_, boolean removeCycles_, 1897 boolean fixCanonContradictions_, boolean fixOffsetContradictions_, boolean mergeClusters_, boolean mergeLeaves_, boolean mergeInner_){ 1898 fixMultiJoinsT=fixMultiJoins_; 1899 canonicizeT=canonicize_; 1900 fixCanonContradictionsT=fixCanonContradictions_; 1901 fixOffsetContradictionsT=fixOffsetContradictions_; 1902 mergeClustersT=mergeClusters_; 1903 mergeLeavesT=mergeLeaves_; 1904 mergeInnerT=mergeInner_; 1905 1906 // assert(false) : fixMultiJoinsT+", "+canonicizeT+", "+fixCanonContradictionsT+", "+mergeLeavesT+", "+mergeInnerT; 1907 bandy=(maxEdits>0 ? BandedAligner.makeBandedAligner(bandwidth) : null); 1908 // assert(false) : fixMultiJoinsT+", "+canonicizeT+", "+fixCanonContradictionsT+", "+fixOffsetContradictionsT+", "+mergeClustersT+", "+removeCycles_; 1909 } 1910 1911 @Override 1912 public void run(){ 1913 1914 final ArrayList<Unit> temp=new ArrayList<Unit>(1000); 1915 1916 ArrayList<Unit> cluster=null; 1917 while((cluster=nextCluster())!=null){ 1918 1919 if(EA){ 1920 for(Unit u : cluster){assert(u.r.mate==null) : "Cluster processing/merging is not supported for paired reads, only cluster generation.";} 1921 } 1922 1923 // for(Unit u : cluster){assert(!u.visited());} 1924 unvisit(cluster); 1925 1926 reorderClusterBreadthFirst(cluster); 1927 int multiJoinCount=findMultiJoinsInCluster(cluster, fixMultiJoinsT); 1928 1929 if(EA){ 1930 for(Unit u : cluster){assert(!u.visited());} 1931 } 1932 1933 boolean ok=true; 1934 if(multiJoinCount!=0){ 1935 assert(multiJoinCount>0); 1936 multiJoinsFoundT+=multiJoinCount; 1937 if(!fixMultiJoinsT){ 1938 multiJoinFailuresT++; 1939 ok=false; 1940 } 1941 } 1942 1943 int canonContradictions=0; 1944 if(ok && canonicizeT){ 1945 if(EA){ 1946 for(Unit u : cluster){ 1947 assert(!u.visited()); 1948 assert(!u.canonContradiction()); 1949 assert(!u.canonicized()); 1950 if(u.overlapList!=null){ 1951 for(Overlap o : u.overlapList){ 1952 assert(!o.invalid()); 1953 assert(!o.canonContradiction()) : 1954 o.u1.canonContradiction()+", "+o.u2.canonContradiction()+", "+cluster.contains(o.u1)+", "+cluster.contains(o.u2); 1955 } 1956 } 1957 } 1958 } 1959 canonContradictions=canonicizeClusterBreadthFirst(cluster, temp); 1960 // System.err.println("Canonicized cluster of size "+cluster.size()+"; contradictions = "+canonContradictions+"; canonicized = "+temp.size()); 1961 temp.clear(); 1962 for(Unit u : cluster){assert(!u.visited());} 1963 if(canonContradictions>0){ 1964 canonContradictoryOverlapsT+=canonContradictions; 1965 canonContradictoryClustersT++; 1966 if(fixCanonContradictionsT){ 1967 if(verbose){System.err.println("Pruning cluster to remove canonization contradictions.");} 1968 fullyPruneCluster(cluster, temp); 1969 if(verbose){System.err.println("Resulting size: "+cluster.size());} 1970 if(EA){ 1971 for(Unit u : cluster){ 1972 assert(!u.visited()); 1973 assert(!u.canonContradiction()); 1974 assert(u.canonicized()); 1975 if(u.overlapList!=null){ 1976 for(Overlap o : u.overlapList){ 1977 assert(!o.invalid()); 1978 assert(!o.canonContradiction()); 1979 assert(o.type==FORWARD) : "\n"+o+"\n"+ 1980 o.u1.canonContradiction()+", "+o.u2.canonContradiction()+", "+o.u1.canonicized()+", "+o.u2.canonicized()+ 1981 "\n"+cluster.contains(o.u1)+", "+cluster.contains(o.u2)+", "+cluster.size(); 1982 } 1983 } 1984 } 1985 } 1986 }else{ 1987 ok=false; 1988 } 1989 } 1990 } 1991 1992 int cycleOverlaps=0; 1993 if(ok){ 1994 cycleOverlaps=findCycles(cluster, removeCycles); 1995 for(Unit u : cluster){assert(!u.visited());} 1996 if(cycleOverlaps>0){ 1997 cycleOverlapsT+=cycleOverlaps; 1998 cycleClustersT++; 1999 } 2000 } 2001 2002 int offsetContradictions=0; 2003 if(ok && fixOffsetContradictionsT){ 2004 if(EA){ 2005 for(Unit u : cluster){ 2006 assert(!u.visited()); 2007 assert(!u.offsetContradiction()); 2008 assert(!u.offsetValid()); 2009 assert(u.canonicized()); 2010 if(u.overlapList!=null){ 2011 for(Overlap o : u.overlapList){ 2012 assert(!o.invalid()); 2013 assert(!o.offsetContradiction()); 2014 assert(o.type==FORWARD) : o; 2015 } 2016 } 2017 } 2018 } 2019 offsetContradictions=generateOffsetsBreadthFirst(cluster, temp); 2020 // System.err.println("Made offsets for cluster of size "+cluster.size()+"; contradictions = "+offsetContradictions+"; set = "+temp.size()); 2021 temp.clear(); 2022 for(Unit u : cluster){assert(!u.visited());} 2023 if(offsetContradictions>0){ 2024 offsetContradictoryOverlapsT+=offsetContradictions; 2025 offsetContradictoryClustersT++; 2026 if(fixOffsetContradictionsT){ 2027 if(verbose){System.err.println("Pruning cluster to remove offset contradictions.");} 2028 fullyPruneCluster(cluster, temp); 2029 if(verbose){System.err.println("Resulting size: "+cluster.size());} 2030 if(EA){ 2031 for(Unit u : cluster){ 2032 assert(!u.visited()); 2033 assert(!u.offsetContradiction()); 2034 assert(u.offsetValid()); 2035 if(u.overlapList!=null){ 2036 for(Overlap o : u.overlapList){ 2037 assert(!o.invalid()); 2038 assert(!o.offsetContradiction()); 2039 assert(o.type==FORWARD) : o; 2040 } 2041 } 2042 } 2043 } 2044 }else{ 2045 ok=false; 2046 } 2047 } 2048 if(ok){Shared.sort(cluster, UNIT_OFFSET_COMPARATOR);} 2049 } 2050 2051 if(ok && absorbOverlap){ 2052 mergeCluster(cluster); 2053 } 2054 processedClustersT.add(cluster)2055 processedClustersT.add(cluster); 2056 if(processedClustersT.size()>=threadMaxReadsToBuffer){ 2057 synchronized(processedClusters){ 2058 processedClusters.addAll(processedClustersT); processedClustersT.clear()2059 processedClustersT.clear(); 2060 } 2061 } 2062 } 2063 synchronized(processedClusters){ 2064 processedClusters.addAll(processedClustersT); processedClustersT.clear()2065 processedClustersT.clear(); 2066 } 2067 } 2068 2069 private void fullyPruneCluster(ArrayList<Unit> cluster, ArrayList<Unit> temp){ 2070 assert(cluster.size()>1) : cluster.size(); 2071 ArrayList<Unit> pruned=pruneCluster(cluster, true, true, temp); 2072 assert(temp.isEmpty()); 2073 assert(pruned==null || pruned.size()>0); 2074 while(pruned!=null){ 2075 ArrayList<Unit> subcluster=pruned; 2076 for(Unit u : subcluster){ 2077 u.clearVolatileFlags(); 2078 if(u.overlapList!=null){ 2079 for(Overlap o : u.overlapList){ 2080 o.clearVolatileFlags(); 2081 } 2082 } 2083 } 2084 assert(subcluster.size()>0); 2085 pruned=pruneCluster(subcluster, false, false, temp); 2086 assert(temp.isEmpty()); 2087 assert(pruned==null || pruned.size()>0); 2088 assert(subcluster.size()>0); 2089 if(subcluster.size()==1){ 2090 processedClustersT.add(subcluster); 2091 }else{ 2092 assert(subcluster.size()>1); 2093 synchronized(clusterQueue){ 2094 clusterQueue.add(subcluster); 2095 } 2096 } 2097 } 2098 } 2099 2100 /** 2101 * @param cluster 2102 */ 2103 private void mergeCluster(ArrayList<Unit> cluster) { 2104 if(cluster.size()==1){return;} 2105 if(mergeLeavesT){ 2106 mergeLeaves(cluster); 2107 } 2108 if(mergeInnerT){ 2109 mergeInner(cluster); 2110 } 2111 } 2112 2113 /** 2114 * Finds places in the cluster where two Units are joined by multiple different Overlaps. 2115 * Returns number of multijoins found. 2116 * @param cluster 2117 */ 2118 private int findMultiJoinsInCluster(ArrayList<Unit> cluster, boolean resolveProblems) { 2119 if(cluster.size()<2){return 0;} 2120 int totalMultiJoins=0; 2121 for(Unit ua : cluster){ 2122 ArrayList<Overlap> list=ua.overlapList; 2123 assert(list!=null); 2124 if(list.size()>1){ 2125 Shared.sort(list); 2126 2127 int multiJoins=0; 2128 for(int i=0; i<list.size(); i++){ 2129 Overlap o=list.get(i); 2130 Unit ub=(o.u1==ua ? o.u2 : o.u1); 2131 assert(ua!=ub); 2132 assert(ua==o.u1 || ua==o.u2); 2133 if(ub.visited()){ 2134 multiJoins++; 2135 multiJoinBasesFoundT+=o.overlapLen; 2136 if(!o.multiJoin()){o.setMultiJoin(true);} 2137 if(resolveProblems){list.set(i, null);} 2138 }else{ 2139 ub.setVisited(true); 2140 } 2141 } 2142 2143 if(multiJoins>0){ 2144 totalMultiJoins+=multiJoins; 2145 if(resolveProblems){Tools.condenseStrict(list);} 2146 } 2147 2148 for(int i=0; i<list.size(); i++){ 2149 Overlap o=list.get(i); 2150 Unit ub=(o.u1==ua ? o.u2 : o.u1); 2151 assert(ua!=ub); 2152 assert(ua==o.u1 || ua==o.u2); 2153 assert(ub.visited()); 2154 ub.setVisited(false); 2155 } 2156 } 2157 2158 } 2159 2160 return totalMultiJoins; 2161 } 2162 2163 private ArrayList<Unit> pruneCluster(ArrayList<Unit> cluster, boolean pruneContradictoryNodes, boolean pruneContradictoryOverlaps, ArrayList<Unit> visited){ 2164 if(verbose){System.err.println("pruneCluster(size="+cluster.size()+", "+pruneContradictoryNodes+", "+pruneContradictoryOverlaps+")");} 2165 2166 //pruneContradictoryOverlaps is less strict than pruneContradictoryNodes 2167 assert(pruneContradictoryOverlaps || !pruneContradictoryNodes); 2168 2169 for(Unit ua : cluster){ 2170 assert(!ua.visited()); 2171 assert(ua.isPerfectlyTransitive()) : ua; 2172 if(ua.visited()){ua.setVisited(false);} 2173 } 2174 2175 int prunedOverlaps=0; 2176 int visits=1; 2177 2178 { 2179 final Unit root=cluster.get(0); 2180 assert(!root.contradiction()); 2181 root.setVisited(true); 2182 visited.add(root); 2183 } 2184 2185 for(int i=0; i<visited.size(); i++){ 2186 Unit ua=visited.get(i); 2187 2188 if(ua.visited() && (!ua.contradiction() || !pruneContradictoryNodes)){ 2189 ArrayList<Overlap> list=ua.overlapList; 2190 if(list!=null){ 2191 int removed=0; 2192 for(int j=0; j<list.size(); j++){ 2193 Overlap o=list.get(j); 2194 Unit ub=(o.u1==ua ? o.u2 : o.u1); 2195 assert(o.u1==ua || o.u2==ua); 2196 assert(ua!=ub); 2197 assert(ub.valid()); 2198 2199 assert(!o.canonContradiction() || (ua.canonContradiction() || ub.canonContradiction())) : 2200 "\n"+o.canonContradiction()+", "+ua.canonContradiction()+", "+ub.canonContradiction(); 2201 2202 assert(!o.offsetContradiction() || (ua.offsetContradiction() || ub.offsetContradiction())) : 2203 "\n"+o.offsetContradiction()+", "+ua.offsetContradiction()+", "+ub.offsetContradiction(); 2204 2205 // assert(o.contradiction()==(ua.contradiction() && ub.contradiction())) : 2206 // "\n"+o.canonContradiction()+", "+o.offsetContradiction()+ 2207 // "\n"+ua.canonContradiction()+", "+ua.offsetContradiction()+ 2208 // "\n"+ub.canonContradiction()+", "+ub.offsetContradiction(); 2209 2210 final boolean remove=(pruneContradictoryNodes && ub.contradiction() || (pruneContradictoryOverlaps && o.contradiction())); 2211 if(!remove && !ub.visited()){ 2212 ub.setVisited(true); 2213 visited.add(ub); 2214 visits++; 2215 } 2216 2217 if(remove){ 2218 if(!o.invalid()){o.setInvalid(true);} 2219 list.set(j, null); 2220 removed++; 2221 prunedOverlaps++; 2222 }else{ 2223 assert(!o.invalid()); 2224 } 2225 } 2226 if(removed>0){Tools.condenseStrict(list);} 2227 } 2228 } 2229 } 2230 2231 if(verbose){System.err.println("cluster.size()="+cluster.size()+", visits="+visits+", visited.size()="+visited.size());} 2232 2233 // if(visited.size()==11486){ //TODO: For testing. Remove. 2234 // for(int i=0; i<visited.size(); i++){ 2235 // Unit u=visited.get(i); 2236 // assert(u.visited()); 2237 // assert(!u.canonContradiction()); 2238 // assert(u.canonicized()); 2239 // for(Overlap o : u.overlapList){ 2240 // assert(!o.canonContradiction()); 2241 // assert(o.type==FORWARD) : "\n\no="+o+"\ni="+i+", u.overlapList.size="+u.overlapList.size()+"\n"+ 2242 // o.u1.canonContradiction()+", "+o.u2.canonContradiction()+", "+o.u1.canonicized()+", "+o.u2.canonicized()+ 2243 // "\n"+visited.contains(o.u1)+", "+visited.contains(o.u2)+", "+visited.size()+ 2244 // "\n"+u.overlapList; 2245 // } 2246 // } 2247 // } 2248 2249 final int numUnvisited=cluster.size()-visits; 2250 ArrayList<Unit> pruned=(numUnvisited==0 ? null : new ArrayList<Unit>(numUnvisited)); 2251 assert(visits==visited.size()); 2252 assert(visits>=1 && visits<=cluster.size()); 2253 2254 if(visits<cluster.size()){ 2255 pruned=new ArrayList<Unit>(cluster.size()-visits); 2256 for(Unit ua : cluster){ 2257 if(!ua.visited()){ 2258 pruned.add(ua); 2259 ArrayList<Overlap> list=ua.overlapList; 2260 if(list!=null){ 2261 int removed=0; 2262 for(int j=0; j<list.size(); j++){ 2263 Overlap o=list.get(j); 2264 Unit ub=(o.u1==ua ? o.u2 : o.u1); 2265 assert(o.u1==ua || o.u2==ua); 2266 assert(ua!=ub); 2267 assert(ub.valid()); 2268 2269 if(ub.visited() || o.invalid()){ 2270 assert(ub.visited()==o.invalid()) : "\n"+o+"\n"+ub; 2271 list.set(j, null); 2272 removed++; 2273 } 2274 } 2275 if(removed>0){Tools.condenseStrict(list);} 2276 } 2277 } 2278 } 2279 assert(pruned.size()==numUnvisited); 2280 }else{ 2281 assert(prunedOverlaps==0) : "If this fails then I may need to mark overlaps to remove."; 2282 } 2283 for(Unit u : cluster){ 2284 assert(u.isPerfectlyTransitive()) : u; 2285 if(EA){ 2286 if(u.overlapList!=null){ 2287 for(Overlap o : u.overlapList){assert(!o.invalid());} 2288 } 2289 } 2290 if(u.visited()){u.setVisited(false);} 2291 } 2292 cluster.clear(); 2293 cluster.addAll(visited); 2294 cluster.trimToSize(); 2295 2296 // for(Unit u : cluster){ 2297 //// assert(u.canonicized()); 2298 // for(Overlap o : u.overlapList){ 2299 // assert(pruned==null || !pruned.contains(o.u1)); 2300 // assert(pruned==null || !pruned.contains(o.u2)); 2301 // assert(cluster.contains(o.u1)); 2302 // assert(cluster.contains(o.u2)); 2303 // } 2304 // } 2305 // if(pruned!=null){ 2306 // for(Unit u : pruned){ 2307 // for(Overlap o : u.overlapList){ 2308 // assert(pruned.contains(o.u1)); 2309 // assert(pruned.contains(o.u2)); 2310 // assert(!cluster.contains(o.u1)); 2311 // assert(!cluster.contains(o.u2)); 2312 // } 2313 // } 2314 // } 2315 2316 visited.clear(); 2317 return pruned; 2318 } 2319 2320 /** 2321 * Cluster should already be ordered breadth-first 2322 * This may fail because removing cycles could change breadth-first traversal, but if it fails, an assertion will be thrown 2323 * @param cluster 2324 */ 2325 private int findCycles(ArrayList<Unit> cluster, boolean remove){ 2326 2327 { 2328 final Unit root=cluster.get(0); 2329 assert(root.length()>=cluster.get(cluster.size()-1).length()); 2330 root.setVisited(true); 2331 } 2332 int cycles=0; 2333 2334 for(Unit ua : cluster){ 2335 assert(ua.visited()); 2336 ArrayList<Overlap> list=ua.overlapList; 2337 if(list!=null){ 2338 int removed=0; 2339 for(int i=0; i<list.size(); i++){ 2340 Overlap o=list.get(i); 2341 Unit ub=(o.u1==ua ? o.u2 : o.u1); 2342 assert(o.u1==ua || o.u2==ua); 2343 assert(ua!=ub); 2344 assert(ub.valid()); 2345 2346 if(!o.visited()){ 2347 o.setVisited(true); 2348 if(ub.visited()){ 2349 if(!o.cyclic()){ 2350 o.setCyclic(true); 2351 cycles++; 2352 } 2353 }else{ 2354 ub.setVisited(true); 2355 } 2356 } 2357 if(remove && o.cyclic()){ 2358 list.set(i, null); 2359 removed++; 2360 } 2361 } 2362 if(removed>0){Tools.condenseStrict(list);} 2363 } 2364 } 2365 2366 for(Unit u : cluster){ 2367 if(u.visited()){u.setVisited(false);} 2368 if(u.overlapList!=null){ 2369 for(Overlap o : u.overlapList){ 2370 if(o.visited()){o.setVisited(false);} 2371 } 2372 } 2373 } 2374 2375 return cycles; 2376 } 2377 2378 /** 2379 * Cluster should already be ordered breadth-first 2380 * @param cluster 2381 */ 2382 private int generateOffsetsBreadthFirst(ArrayList<Unit> cluster, ArrayList<Unit> temp){ 2383 2384 2385 assert(temp!=null); 2386 assert(temp.isEmpty()); 2387 { 2388 final Unit root=cluster.get(0); 2389 assert(root.length()>=cluster.get(cluster.size()-1).length()); 2390 root.setOffset(0); 2391 temp.add(root); 2392 } 2393 2394 int contradictions=0; 2395 for(int i=0; i<temp.size(); i++){ 2396 Unit u=temp.get(i); 2397 assert(!u.visited()) : i; 2398 assert(u.offsetValid() || contradictions>0) : i+", "+temp.size()+", "+contradictions+"\n"+toString(temp); 2399 if(u.offsetValid() && !u.offsetContradiction()){ 2400 contradictions+=setOffsetsNeighbors(u, temp); 2401 assert(contradictions==0 || (i>0 && temp.size()>2)); 2402 } 2403 } 2404 2405 int min=0; 2406 for(Unit u : temp){ 2407 if(u.visited()){u.setVisited(false);} 2408 if(u.overlapList!=null){ 2409 for(Overlap o : u.overlapList){ 2410 if(o.visited()){o.setVisited(false);} 2411 } 2412 } 2413 if(u.offsetValid() && !u.offsetContradiction()){ 2414 min=Tools.min(min, u.offset()); 2415 } 2416 } 2417 2418 if(verbose){ 2419 System.err.println("min offset = "+min); 2420 } 2421 2422 for(Unit u : temp){ 2423 if(u.offsetValid()){ 2424 if(verbose){System.err.println("Set "+u.name()+" offset from "+u.offset+" to "+(u.offset-min));} 2425 u.offset=u.offset-min; 2426 } 2427 } 2428 2429 2430 return contradictions; 2431 } 2432 2433 /** 2434 * @param root 2435 */ 2436 private int setOffsetsNeighbors(final Unit root, final ArrayList<Unit> temp) { 2437 if(verbose){System.err.println("\nsetOffsetsNeighbors("+root.name()+")\nroot.code1="+root.code1+"\n");} 2438 assert(root.valid()); 2439 assert(!root.visited()); 2440 assert(root.offsetValid()); 2441 assert(!root.offsetContradiction()); 2442 root.setVisited(true); 2443 if(root.overlapList==null){return 0;} 2444 final int contradictions=countOffsetContradictions(root, false); 2445 if(verbose){System.err.println("\ncontradictions="+contradictions);} 2446 for(Overlap o : root.overlapList){ 2447 Unit u=(o.u1==root ? o.u2 : o.u1); 2448 assert(o.u1==root || o.u2==root); 2449 assert(root!=u); 2450 assert(u.valid()); 2451 2452 if(verbose){System.err.println("\nProcessing Overlap "+o);} 2453 if(!o.visited() && !o.offsetContradiction()){ 2454 o.setVisited(true); 2455 if(!u.offsetContradiction()){ 2456 if(verbose){System.err.println("Calling setOffset: "+o);} 2457 if(!u.offsetValid()){temp.add(u);} 2458 boolean b=setOffset(root, u, o); 2459 if(verbose){System.err.println("Finished setOffset: "+o);} 2460 2461 // if(x>0){ 2462 // if(verbose){System.err.println("\n*********************************************");} 2463 // if(verbose){System.err.println("Problem detected with contig "+u.name());} 2464 // if(verbose){System.err.println("*********************************************\n");} 2465 // verbose=true; 2466 // int y2=countOffsetContradictions(root, false); 2467 // assert(contradictions==y2); 2468 // } 2469 2470 assert(b) : "\n"+contradictions+", "+o.offsetContradiction()+", "+root.offsetContradiction()+", "+u.offsetContradiction()+"\n" 2471 +root.offsetValid()+", "+u.offsetValid()+", "+OVERLAP_TYPE_NAMES[o.type]+"\n"+b 2472 +fixMultiJoins; //This assertion can fail if a multijoin is present 2473 assert(u.offsetValid()); 2474 } 2475 } 2476 } 2477 return contradictions; 2478 } 2479 2480 private int countOffsetContradictions(Unit root, boolean includeKnown){ 2481 if(verbose){System.err.println("\ncountContradictions("+root.name()+", "+includeKnown+")\nroot.code1="+root.code1+"\n");} 2482 assert(root.valid()); 2483 assert(root.visited()); 2484 assert(root.offsetValid()); 2485 // assert(!root.offsetContradiction()); 2486 if(root.overlapList==null){return 0;} 2487 int contradictions=0; 2488 for(Overlap o : root.overlapList){ 2489 Unit u=(o.u1==root ? o.u2 : o.u1); 2490 assert(o.u1==root || o.u2==root); 2491 assert(root!=u); 2492 assert(u.valid()); 2493 2494 if(verbose){System.err.println("\nOverlap "+o+"\nu="+u.name()+", offsetValid="+u.offsetValid());} 2495 2496 boolean contradictory=(u.offsetValid() && u.offset()!=calcOffset(root, u, o)); 2497 if(verbose){System.err.println("contradictory= \t"+contradictory);} 2498 if(contradictory){ 2499 if(includeKnown || !u.offsetContradiction()){ 2500 contradictions++; 2501 if(!root.offsetContradiction()){root.setOffsetContradiction(true);} 2502 } 2503 if(!o.offsetContradiction()){o.setOffsetContradiction(true);} 2504 if(!u.offsetContradiction()){u.setOffsetContradiction(true);} 2505 } 2506 assert(contradictory==o.offsetContradiction()) : contradictory+", "+o.offsetContradiction(); 2507 if(verbose){ 2508 System.err.println("root.offsetContradiction()=\t"+root.offsetContradiction()); 2509 System.err.println("u.offsetContradiction()= \t"+u.offsetContradiction()); 2510 System.err.println("o.offsetContradiction()= \t"+o.offsetContradiction()); 2511 System.err.println("contradictions= \t"+contradictions); 2512 } 2513 } 2514 if(verbose){System.err.println("Final contradictions="+contradictions+"\n");} 2515 return contradictions; 2516 } 2517 2518 /** 2519 * Cluster should already be ordered breadth-first 2520 * @param cluster 2521 */ 2522 private int canonicizeClusterBreadthFirst(ArrayList<Unit> cluster, ArrayList<Unit> temp) { 2523 2524 assert(temp!=null); 2525 assert(temp.isEmpty()); 2526 { 2527 final Unit root=cluster.get(0); 2528 assert(root.length()>=cluster.get(cluster.size()-1).length()); 2529 root.setCanonicized(true); 2530 temp.add(root); 2531 } 2532 2533 int contradictions=0; 2534 for(int i=0; i<temp.size(); i++){ 2535 final Unit u=temp.get(i); 2536 assert(!u.visited()) : i; 2537 assert(u.canonicized() || contradictions>0) : i+", "+temp.size()+", "+contradictions+"\n"+toString(temp); 2538 if(u.canonicized() && !u.canonContradiction()){ 2539 contradictions+=canonicizeNeighbors(u, temp); 2540 assert(contradictions==0 || (i>0 && temp.size()>2)); 2541 2542 if(u.overlapList!=null){ 2543 for(Overlap o : u.overlapList){ 2544 assert(o.type==FORWARD || o.canonContradiction() || o.u1.canonContradiction() || o.u2.canonContradiction()) : 2545 o+"\n"+contradictions+", "+o.canonContradiction()+", "+o.u1.canonContradiction()+", "+o.u2.canonContradiction()+ 2546 "\n"+o.u1.canonicized()+", "+o.u2.canonicized()+", "+o.u1.visited()+", "+o.u2.visited(); 2547 } 2548 } 2549 } 2550 2551 // if(u.r.numericID==59462 || u.r.numericID==56439){ //TODO: remove 2552 // System.err.println("\nid="+u.r.numericID+", canonicized="+u.canonicized()+", contradiction="+u.canonContradiction()+", visited="+u.visited()); 2553 // for(Overlap o : u.overlapList){ 2554 // Unit u2=(o.u1==u ? o.u2 : o.u1); 2555 // assert(o.u1==u || o.u2==u); 2556 // assert(u2!=u); 2557 // assert(u2.valid()); 2558 // System.err.println("o = "+o); 2559 // System.err.println("o.contradiction="+o.canonContradiction()); 2560 // System.err.println("u2.id="+u2.r.numericID+", canonicized="+u2.canonicized()+", contradiction="+u2.canonContradiction()+", visited="+u.visited()); 2561 // } 2562 // } 2563 } 2564 2565 for(Unit u : temp){ 2566 if(u.visited()){u.setVisited(false);} 2567 if(EA){ 2568 if(u.overlapList!=null){ 2569 for(Overlap o : u.overlapList){assert(!o.visited());} 2570 } 2571 } 2572 } 2573 2574 return contradictions; 2575 } 2576 2577 /** 2578 * @param root 2579 */ 2580 private int canonicizeNeighbors(Unit root, ArrayList<Unit> canonicized) { 2581 if(verbose){System.err.println("\ncanonicizeNeighbors("+root.name()+")\nroot.code1="+root.code1+"\n");} 2582 assert(root.valid()); 2583 assert(!root.visited()); 2584 assert(root.canonicized()); 2585 assert(!root.canonContradiction()); 2586 root.setVisited(true); 2587 if(root.overlapList==null){return 0;} 2588 final int contradictions=countCanonContradictions(root, false); 2589 if(verbose){System.err.println("\ncontradictions="+contradictions);} 2590 for(Overlap o : root.overlapList){ 2591 Unit u=(o.u1==root ? o.u2 : o.u1); 2592 assert(o.u1==root || o.u2==root); 2593 assert(root!=u); 2594 assert(u.valid()); 2595 2596 if(verbose){System.err.println("\nProcessing Overlap "+o);} 2597 if(!o.canonContradiction()){ 2598 if(!u.canonContradiction()){ 2599 boolean b=u.canonicized(); 2600 int dir=o.type; 2601 if(verbose){System.err.println("Calling canonicize: "+o);} 2602 int x=canonicize(root, u, o); 2603 if(verbose){System.err.println("Finished canonicize: "+o);} 2604 2605 // if(x>0){ 2606 // if(verbose){System.err.println("\n*********************************************");} 2607 // if(verbose){System.err.println("Problem detected with contig "+u.name());} 2608 // if(verbose){System.err.println("*********************************************\n");} 2609 // verbose=true; 2610 // int y2=countCanonContradictions(root, false); 2611 // assert(contradictions==y2); 2612 // } 2613 2614 assert(x==0 || (u.canonicized() && (o.type==FORWARDRC || o.type==REVERSERC))); 2615 assert(x==0) : "\n"+x+", "+contradictions+", "+o.canonContradiction()+", "+root.canonContradiction()+", "+u.canonContradiction()+"\n" 2616 +root.canonicized()+", "+u.canonicized()+", "+OVERLAP_TYPE_NAMES[o.type]+"\n"+b+", "+dir 2617 +fixMultiJoins; //This assertion can fail if a multijoin is present 2618 if(!u.canonicized()){ 2619 u.setCanonicized(true); 2620 canonicized.add(u); 2621 } 2622 assert(u.canonicized()); 2623 } 2624 } 2625 } 2626 if(EA){ 2627 for(Overlap o : root.overlapList){ 2628 assert(o.type==FORWARD || o.canonContradiction() || o.u1.canonContradiction() || o.u2.canonContradiction()) : 2629 o+"\n"+contradictions+", "+o.canonContradiction()+", "+o.u1.canonContradiction()+", "+o.u2.canonContradiction()+", "+root.canonContradiction()+ 2630 "\n"+o.u1.canonicized()+", "+o.u2.canonicized()+", "+o.u1.visited()+", "+o.u2.visited(); 2631 } 2632 } 2633 return contradictions; 2634 } 2635 2636 private int countCanonContradictions(Unit root, boolean includeKnown){ 2637 if(verbose){System.err.println("\ncountContradictions("+root.name()+", "+includeKnown+")\nroot.code1="+root.code1+"\n");} 2638 assert(root.valid()); 2639 assert(root.visited()); 2640 assert(root.canonicized()); 2641 // assert(!root.canonContradiction()); 2642 if(root.overlapList==null){return 0;} 2643 int contradictions=0; 2644 for(Overlap o : root.overlapList){ 2645 Unit ub=(o.u1==root ? o.u2 : o.u1); 2646 assert(o.u1==root || o.u2==root); 2647 assert(root!=ub); 2648 assert(ub.valid()); 2649 2650 if(verbose){System.err.println("\nOverlap "+o+"\nu="+ub.name()+", canonicized="+ub.canonicized());} 2651 2652 boolean contradictory=(ub.canonicized() && (o.type==FORWARDRC || o.type==REVERSERC)); 2653 if(verbose){System.err.println("contradictory= \t"+contradictory);} 2654 if(contradictory){ 2655 if(!o.canonContradiction()){o.setCanonContradiction(true);} 2656 if(includeKnown || !ub.canonContradiction()){ 2657 contradictions++; 2658 if(!root.canonContradiction()){root.setCanonContradiction(true);} 2659 if(!ub.canonContradiction()){ub.setCanonContradiction(true);} 2660 } 2661 } 2662 2663 assert(!o.canonContradiction() || (root.canonContradiction() || ub.canonContradiction())) : 2664 "\n"+contradictory+", "+o.canonContradiction()+", "+root.canonContradiction()+", "+ub.canonContradiction(); 2665 2666 assert(contradictory==o.canonContradiction()) : contradictory+", "+o.canonContradiction(); 2667 if(verbose){ 2668 System.err.println("root.canonContradiction()=\t"+root.canonContradiction()); 2669 System.err.println("u.canonContradiction()= \t"+ub.canonContradiction()); 2670 System.err.println("o.canonContradiction()= \t"+o.canonContradiction()); 2671 System.err.println("contradictions= \t"+contradictions); 2672 } 2673 } 2674 if(verbose){System.err.println("Final contradictions="+contradictions+"\n");} 2675 return contradictions; 2676 } 2677 2678 private String toString(ArrayList<Unit> cluster){ 2679 for(int i=0; i<cluster.size(); i++){ 2680 Unit u=cluster.get(i); 2681 u.r.id=""+i; 2682 } 2683 StringBuilder sb=new StringBuilder(1000); 2684 for(Unit u : cluster){ 2685 sb.append(">"+u.name()+"\n"); 2686 sb.append(new String(u.bases())); 2687 sb.append("\n"); 2688 } 2689 sb.append("\n*****\n"); 2690 for(Unit u : cluster){ 2691 sb.append("\n"+u.name()+":"); 2692 if(u.overlapList!=null){ 2693 for(Overlap o : u.overlapList){ 2694 Unit ub=(o.u1==u ? o.u2 : o.u1); 2695 sb.append(" "+ub.name()); 2696 } 2697 } 2698 } 2699 sb.append("\n"); 2700 return sb.toString(); 2701 } 2702 2703 private String toShortString(ArrayList<Unit> cluster){ 2704 for(int i=0; i<cluster.size(); i++){ 2705 Unit u=cluster.get(i); 2706 u.r.id=""+i; 2707 } 2708 StringBuilder sb=new StringBuilder(1000); 2709 for(Unit u : cluster){ 2710 sb.append("\n"+u.name()+":"); 2711 if(u.overlapList!=null){ 2712 for(Overlap o : u.overlapList){ 2713 Unit ub=(o.u1==u ? o.u2 : o.u1); 2714 sb.append(" "+ub.name()); 2715 } 2716 } 2717 } 2718 sb.append("\n"); 2719 return sb.toString(); 2720 } 2721 2722 2723 /** 2724 * @param root 2725 * @param u2 2726 * @param o 2727 * @return Number of contradictions 2728 */ 2729 private int canonicize(final Unit root, final Unit u2, final Overlap o){ 2730 if(o.type==FORWARD){return 0;} 2731 if(o.type==FORWARDRC || o.type==REVERSERC){ 2732 if(u2.canonicized()){return 1;} 2733 u2.reverseComplement(); 2734 unitsFlippedT++; 2735 for(Overlap o2 : u2.overlapList){ 2736 overlapsFlippedT++; 2737 o2.flip(u2, bandy); 2738 } 2739 assert(o.type==FORWARD || o.type==REVERSE) : OVERLAP_TYPE_NAMES[o.type]; 2740 } 2741 if(o.type==REVERSE){o.reverseDirection();} 2742 assert(o.type==FORWARD); 2743 assert(o.test(bandy, o.edits+maxEdits)); 2744 return 0; 2745 } 2746 2747 2748 /** 2749 * @param root 2750 * @param u2 2751 * @param o 2752 * @return true if no contradictions 2753 */ 2754 private boolean setOffset(final Unit root, final Unit u2, final Overlap o){ 2755 assert(root.offsetValid()); 2756 assert(!root.offsetContradiction()); 2757 int offset=calcOffset(root, u2, o); 2758 2759 if(u2.offsetValid()){return u2.offset()==offset;} 2760 u2.setOffset(offset); 2761 2762 if(verbose){ 2763 System.err.println("\nroot = "+(root.name()==null ? root.r.numericID+"" : root.name())+", u2 = "+(u2.name()==null ? u2.r.numericID+"" : u2.name()) 2764 +"\no = "+o 2765 +"\nroot.offset = "+root.offset() 2766 +"\nu2.offset = "+u2.offset()); 2767 } 2768 2769 return true; 2770 } 2771 2772 2773 private int calcOffset(final Unit root, final Unit ub, final Overlap o){ 2774 assert(root.offsetValid()); 2775 if(o.type==FORWARD){ 2776 if(root==o.u1){ 2777 int dif=o.start1-o.start2; 2778 if(verbose){System.err.println("root==o.u1=="+root.name()+", start1="+o.start1+"; u2==o.u2=="+ub.name()+", start2="+o.start2+", dif="+dif);} 2779 return root.offset+dif; 2780 }else{ 2781 int dif=o.start2-o.start1; 2782 if(verbose){System.err.println("root==o.u2=="+root.name()+", start2="+o.start2+"; u2==o.u1=="+ub.name()+", start1="+o.start1+", dif="+dif);} 2783 return root.offset+dif; 2784 } 2785 }else{ 2786 assert(false) : o; 2787 throw new RuntimeException("TODO"); 2788 } 2789 } 2790 2791 2792 /** 2793 * @param cluster 2794 */ 2795 private void mergeLeaves(ArrayList<Unit> cluster) { 2796 assert(false) : "TODO"; 2797 for(Unit u : cluster){ 2798 2799 } 2800 } 2801 2802 /** 2803 * @param cluster 2804 */ 2805 private void mergeInner(ArrayList<Unit> cluster) { 2806 assert(false) : "TODO"; 2807 for(Unit u : cluster){ 2808 2809 } 2810 } 2811 2812 private ArrayList<ArrayList<Unit>> processedClustersT=new ArrayList<ArrayList<Unit>>(threadMaxReadsToBuffer); 2813 2814 long leafMergesT=0; 2815 long innerMergesT=0; 2816 long leafBaseMergesT=0; 2817 long innerBaseMergesT=0; 2818 2819 long multiJoinFailuresT=0; 2820 long multiJoinsFoundT=0; 2821 long multiJoinBasesFoundT=0; 2822 long unitsFlippedT=0; 2823 long overlapsFlippedT=0; 2824 long canonContradictoryOverlapsT=0; 2825 long canonContradictoryClustersT=0; 2826 long offsetContradictoryOverlapsT=0; 2827 long offsetContradictoryClustersT=0; 2828 long cycleOverlapsT=0; 2829 long cycleClustersT=0; 2830 2831 private final boolean fixMultiJoinsT; 2832 private final boolean canonicizeT; 2833 private final boolean fixCanonContradictionsT; 2834 private final boolean fixOffsetContradictionsT; 2835 private final boolean mergeClustersT; 2836 private final boolean mergeLeavesT; 2837 private final boolean mergeInnerT; 2838 private final BandedAligner bandy; 2839 } 2840 2841 2842 /** 2843 * @param cluster 2844 */ 2845 private void unvisit(ArrayList<Unit> cluster) { 2846 for(Unit u : cluster){ 2847 if(u.visited()){u.setVisited(false);} 2848 } 2849 } 2850 2851 /** 2852 * @param cluster 2853 */ 2854 private void reorderClusterBreadthFirst(ArrayList<Unit> cluster) { 2855 if(verbose){System.err.println("reorderClusterBreadthFirst");} 2856 2857 final int size=cluster.size(); 2858 Shared.sort(cluster); //Now it is in descending length 2859 final Unit root=cluster.get(0); 2860 assert(root.length()>=cluster.get(size-1).length()) : root.length()+", "+cluster.get(size-1).length()+", "+root.compareTo(cluster.get(size-1)); 2861 2862 ArrayList<Unit> breadthFirst=new ArrayList<Unit>(cluster.size()); 2863 root.setVisited(true); 2864 // System.err.println("root="+root.name()); 2865 breadthFirst.add(root); 2866 for(int i=0; i<breadthFirst.size(); i++){ 2867 Unit u=breadthFirst.get(i); 2868 Shared.sort(u.overlapList); //Sorted in descending overlap length 2869 if(u.overlapList!=null){ 2870 for(Overlap o : u.overlapList){ 2871 if(!o.u1.visited()){ 2872 // System.err.println("Visiting "+o.u1.name()); 2873 o.u1.setVisited(true); 2874 breadthFirst.add(o.u1); 2875 } 2876 if(!o.u2.visited()){ 2877 // System.err.println("Visiting "+o.u2.name()); 2878 o.u2.setVisited(true); 2879 breadthFirst.add(o.u2); 2880 } 2881 // System.err.println("***"); 2882 // System.err.println(toShortString(breadthFirst)); 2883 } 2884 } 2885 } 2886 for(Unit u : cluster){ 2887 assert(u.visited()); 2888 if(u.visited()){u.setVisited(false);} 2889 if(EA){ 2890 if(u.overlapList!=null){ 2891 for(Overlap o : u.overlapList){assert(!o.visited());} 2892 } 2893 } 2894 } 2895 // System.err.println("***"); 2896 // System.err.println("Final:"); 2897 // System.err.println(toShortString(breadthFirst)); 2898 assert(cluster.size()==breadthFirst.size()); 2899 cluster.clear(); 2900 cluster.addAll(breadthFirst); 2901 } 2902 2903 2904 2905 /** Returns next cluster larger than 1 element. 2906 * Singleton clusters are added directly to 'processed'. */ 2907 private ArrayList<Unit> nextCluster(){ 2908 synchronized(clusterQueue){ 2909 ArrayList<Unit> cluster=clusterQueue.poll(); 2910 assert(cluster==null || cluster.size()>1); 2911 // while(cluster!=null && cluster.size()<2){ 2912 //// unmark(cluster); 2913 // processedClustersT.add(cluster); 2914 // cluster=clusterQueue.poll(); 2915 // } 2916 return cluster; 2917 } 2918 } 2919 2920 2921 /** 2922 * Creates Unit objects or uses ones already attached to reads. 2923 * Places them in local storage and percolates them to shared storage (codeMap), removing exact duplicates. 2924 * Also hashes tips and places these in shared affixMap. 2925 * Looks for containments in the affix map. 2926 * @author Brian Bushnell 2927 * @date Jul 24, 2013 2928 * 2929 */ 2930 private final class HashThread extends Thread{ 2931 2932 public HashThread(boolean addToCodeMap_, boolean addToAffixMap_, boolean findMatches_, boolean findContainments_, boolean findOverlaps_){ 2933 addToCodeMapT=addToCodeMap_; 2934 addToAffixMapT=addToAffixMap_; 2935 findContainmentsT=findContainments_; 2936 findOverlapsT=findOverlaps_; 2937 findMatchesT=findMatches_; 2938 tid=getTid(); 2939 crisq=new ArrayDeque<ConcurrentReadInputStream>(crisa.length); 2940 for(int i=0; i<crisa.length; i++){ 2941 // if(verbose){System.err.println("Adding to crisq.");} 2942 crisq.add(crisa[(i+tid)%crisa.length]); 2943 } 2944 bandy=(maxEdits>0 && (findOverlapsT || findContainmentsT) ? BandedAligner.makeBandedAligner(bandwidth) : null); 2945 2946 // assert(addToCodeMapT) : "addToCodeMapT="+addToCodeMapT+", addToAffixMapT="+addToAffixMapT+", findContainmentsT="+findContainmentsT+ 2947 // ", findOverlapsT="+findOverlapsT+", findMatchesT="+findMatchesT+", convertToUpperCaseT="+convertToUpperCaseT+", numAffixMaps="+numAffixMaps; 2948 } 2949 2950 @Override 2951 public void run(){ 2952 2953 ConcurrentReadInputStream cris=crisq.poll(); 2954 2955 while(cris!=null){ 2956 ListNum<Read> ln=cris.nextList(); 2957 ArrayList<Read> reads=(ln!=null ? ln.list : null); 2958 // long xx=0; 2959 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning 2960 2961 for(Read r : reads){ 2962 processReadOuter(r); 2963 } 2964 2965 if(codeMapT!=null && (codeMapT.size()>threadMaxReadsToBuffer || basesStoredT>threadMaxBasesToBuffer)){ 2966 assert(addToCodeMapT); 2967 long added=mergeMaps(); 2968 addedToMainT+=added; 2969 } 2970 2971 cris.returnList(ln); 2972 ln=cris.nextList(); 2973 reads=(ln!=null ? ln.list : null); 2974 } 2975 cris.returnList(ln); 2976 if(codeMapT!=null && !codeMapT.isEmpty()){ 2977 long added=mergeMaps(); 2978 addedToMainT+=added; 2979 } 2980 cris=crisq.poll(); 2981 } 2982 2983 codeMapT=null; 2984 localConflictList=null; 2985 sharedConflictList=null; 2986 } 2987 2988 /** Return true if this read was a member of this subset. */ 2989 private boolean processReadOuter(Read r1){ 2990 if(r1.length()<MINSCAF){return false;} 2991 Read r2=r1.mate; 2992 2993 assert(r1.pairnum()==0); 2994 assert(r2==null || r2.pairnum()==1); 2995 2996 if(!addToCodeMapT && r1.obj==null){ 2997 if(r1.bases!=null && r1.length()>=MINSCAF){ 2998 final Unit u=(r1.obj!=null ? (Unit)r1.obj : new Unit(r1)); 2999 assert(u.r==r1 && (r1.obj==u || r1.obj==null)); 3000 final long code=u.code1; 3001 r1.obj=u; 3002 assert(u.r==r1 && r1.obj==u); 3003 if(r2!=null && r2.obj==null){r2.obj=new Unit(r2);} 3004 3005 //Check for subset membership 3006 final boolean inSet=u.inSet(); 3007 if(inSet){ 3008 final Long codeL=code; 3009 ArrayList<Unit> list=codeMap.get(codeL); 3010 boolean found=false; 3011 for(Unit u0 : list){ 3012 //Replace with existing read 3013 if(u0.equals(u) && u0.r.numericID==r1.numericID){ 3014 r1=u0.r; 3015 r2=r1.mate; 3016 found=true; 3017 break; 3018 } 3019 } 3020 assert(list!=null); 3021 if(!found){ 3022 return false; 3023 } 3024 } 3025 } 3026 } 3027 boolean b=processRead(r1); 3028 if(r2!=null){processRead(r2);} 3029 return b; 3030 } 3031 3032 /** Return true if this read was a member of this subset. */ 3033 private boolean processRead(Read r){ 3034 if(r.length()<MINSCAF){return false;} 3035 3036 final boolean inSet; 3037 if(!storeName){r.id=null;} 3038 if(!storeQuality){r.quality=null;} 3039 3040 if(forceTrimLeft>0 || forceTrimRight>0){//Added at request of RQC team 3041 if(r!=null && r.length()>0){ 3042 TrimRead.trimToPosition(r, forceTrimLeft>0 ? forceTrimLeft : 0, forceTrimRight>0 ? forceTrimRight : r.length(), 1); 3043 } 3044 } 3045 if(qTrimLeft || qTrimRight){ 3046 TrimRead.trimFast(r, qTrimLeft, qTrimRight, trimQ, trimE, 0); 3047 } 3048 if(r.length()<MINSCAF){return false;} 3049 3050 readsProcessedT++; 3051 basesProcessedT+=r.length(); 3052 3053 final Unit u=(r.obj!=null ? (Unit)r.obj : new Unit(r)); 3054 assert(u.r==r && (r.obj==u || r.obj==null)); 3055 final long code=u.code1; 3056 3057 //Check for subset membership 3058 inSet=u.inSet(); 3059 3060 r.obj=u; 3061 assert(u.r==r && r.obj==u); 3062 if(r.mate!=null && r.mate.obj==null){r.mate.obj=new Unit(r.mate);} 3063 3064 if(verbose){System.err.println("Generated "+code+" for sequence "+u.name()+"\t"+new String(r.bases, 0, Tools.min(40, r.length())));} 3065 3066 if(addToCodeMapT && inSet){ 3067 final Long codeL=code; 3068 ArrayList<Unit> list=codeMapT.get(codeL); 3069 if(list==null){ 3070 if(verbose){System.err.println("Unique.");} 3071 list=new ArrayList<Unit>(1); 3072 list.add(u); 3073 basesStoredT+=r.length(); 3074 codeMapT.put(codeL, list); 3075 }else{ 3076 if(verbose){System.err.println("Exists.");} 3077 boolean match=false; 3078 if(findMatchesT){ 3079 for(Unit u2 : list){ 3080 if(pairedEqualsRC(u, u2)){ 3081 // if(u.r.mate!=null){ 3082 // verbose=true; 3083 // 3084 // Unit um=(Unit)u.r.mate.obj; 3085 // Unit u2m=(Unit)u2.r.mate.obj; 3086 // 3087 // if(verbose){ 3088 // System.err.println("********"); 3089 // System.err.println(u.r.toFastq()); 3090 // System.err.println(u.r.mate.toFastq()); 3091 // System.err.println("********"); 3092 // System.err.println(u2.r.toFastq()); 3093 // System.err.println(u2.r.mate.toFastq()); 3094 // System.err.println("********"); 3095 // System.err.println(u); 3096 // System.err.println(u2); 3097 // System.err.println(um); 3098 // System.err.println(u2m); 3099 // System.err.println("********"); 3100 // System.err.println(u.equals(u2)); 3101 // System.err.println(u.compareTo(u2)); 3102 // System.err.println("********"); 3103 // System.err.println(um.equals(u2m)); 3104 // System.err.println(um.compareTo(u2m)); 3105 // System.err.println("********"); 3106 // } 3107 // 3108 // verbose=false; 3109 // } 3110 assert(u.r.mate==null || pairedEqualsRC((Unit)u.r.mate.obj, (Unit)u2.r.mate.obj)) : 3111 u.r.toFastq()+"\n"+u2.r.toFastq()+"\n"+u.r.mate.toFastq()+"\n"+u2.r.mate.toFastq()+ 3112 "\n"+u+"\n"+u2+"\n"+u.r.mate.obj+"\n"+u2.r.mate.obj; 3113 // if(verbose){System.err.println("Matches "+new String(r2.bases, 0, Tools.min(40, r2.length())));} 3114 match=true; 3115 u2.absorbMatch(u); 3116 if(UNIQUE_ONLY){ 3117 synchronized(u2){ 3118 if(u2.valid()){ 3119 matchesT++; 3120 baseMatchesT+=u2.length(); 3121 u2.setValid(false); 3122 addDupe(u2.r); 3123 } 3124 } 3125 } 3126 break; 3127 } 3128 } 3129 } 3130 if(match){ 3131 addDupe(r); 3132 matchesT++; 3133 baseMatchesT+=r.length(); 3134 // if(verbose){System.err.println("matchesT="+matchesT+", baseMatchesT="+baseMatchesT);} 3135 }else{ 3136 collisionsT++; 3137 if(verbose){System.err.println("False collision; count = "+collisionsT);} 3138 list.add(u); 3139 basesStoredT+=r.length(); 3140 } 3141 } 3142 } 3143 3144 if(findContainmentsT){ 3145 int x=findContainments(u); 3146 } 3147 3148 if(findOverlapsT){ 3149 int x=findOverlaps(u); 3150 } 3151 3152 return inSet; 3153 } 3154 3155 private int findContainments(final Unit u){ 3156 if(minLengthPercent<=0 && maxSubs<=0 && minIdentity>=100 && !u.valid()){return 0;} 3157 final byte[] bases=u.bases(); 3158 final int shift=2*k; 3159 final int shift2=shift-2; 3160 final long mask=(shift>63 ? -1L : ~((-1L)<<shift)); 3161 long kmer=0; 3162 long rkmer=0; 3163 int hits=0; 3164 int currentContainments=0; 3165 int len=0; 3166 3167 if(bases==null || bases.length<k){return -1;} 3168 final LongM key=new LongM(); 3169 3170 for(int i=0; i<bases.length; i++){ 3171 byte b=bases[i]; 3172 long x=baseToNumber[b]; 3173 long x2=baseToComplementNumber[b]; 3174 kmer=((kmer<<2)|x)&mask; 3175 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; 3176 if(HASH_NS || AminoAcid.isFullyDefined(b)){len++;} 3177 else{len=0; rkmer=0;} 3178 // if(verbose){System.err.println("Scanning i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} 3179 if(len>=k){ 3180 key.set(Tools.max(kmer, rkmer)); //Canonical 3181 for(int am=0; am<affixMaps.length; am++){ 3182 ArrayList<Unit> list=affixMaps[am].get(key); 3183 if(list!=null){ 3184 for(Unit u2 : list){ 3185 if(u!=u2 && !u.equals(u2)){ 3186 if(u2.valid()){ 3187 hits++; 3188 if(verbose){ 3189 System.err.println("\nFound potential containment at am="+am+", i="+i+", key="+key.value()+ 3190 ", pre="+Arrays.toString(u2.prefixes)+", suf="+Arrays.toString(u2.suffixes)+ 3191 ", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i, k))); 3192 } 3193 if(u.contains(u2, i, key, bandy, am)){ 3194 synchronized(u2){ 3195 if(u2.valid()){ 3196 currentContainments++; 3197 baseContainmentsT+=u2.length(); 3198 u2.setValid(false); 3199 addDupe(u2.r); 3200 } 3201 } 3202 if(UNIQUE_ONLY){ 3203 synchronized(u){ 3204 if(u.valid()){ 3205 currentContainments++; 3206 baseContainmentsT+=u.length(); 3207 u.setValid(false); 3208 addDupe(u.r); 3209 } 3210 } 3211 } 3212 3213 if(verbose){System.err.println("Added containment "+u2);} 3214 } 3215 } 3216 } 3217 } 3218 } 3219 } 3220 } 3221 } 3222 // assert(false) : hits+", "+currentContainments+", "+baseContainments+"\n"+containmentMapT+"\n"; 3223 3224 containmentCollisionsT+=(hits-currentContainments); 3225 // outstream.println("hits="+hits+", currentContainments="+currentContainments); 3226 containmentsT+=currentContainments; 3227 return hits; 3228 } 3229 3230 private int findOverlaps(final Unit u){ 3231 // if(minLengthPercent<=0 && maxSubs<=0 && minIdentity>=100 && !u.valid()){return 0;} 3232 // if(u.overlapList!=null){u.overlapList.clear();} 3233 final byte[] bases=u.bases(); 3234 final int shift=2*k; 3235 final int shift2=shift-2; 3236 final long mask=(shift>63 ? -1L : ~((-1L)<<shift)); 3237 long kmer=0; 3238 long rkmer=0; 3239 int hits=0; 3240 int currentOverlaps=0; 3241 int len=0; 3242 3243 if(bases==null || bases.length<k){return -1;} 3244 final LongM key=new LongM(); 3245 3246 boolean quit=false; 3247 3248 for(int i=0; i<bases.length && !quit; i++){ 3249 byte b=bases[i]; 3250 long x=baseToNumber[b]; 3251 long x2=baseToComplementNumber[b]; 3252 kmer=((kmer<<2)|x)&mask; 3253 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; 3254 if(HASH_NS || AminoAcid.isFullyDefined(b)){len++;} 3255 else{len=0; rkmer=0;} 3256 // if(verbose){System.err.println("Scanning i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} 3257 if(len>=k){//valid key 3258 key.set(Tools.max(kmer, rkmer)); //Canonical key 3259 for(int am=0; am<affixMaps.length; am++){ 3260 ArrayList<Unit> list=affixMaps[am].get(key); 3261 if(list!=null){//found a key collision 3262 for(Unit u2 : list){ 3263 if(quit){break;}//too many edges 3264 int u1cluster=-1, u2cluster=-2; 3265 if(preventTransitiveOverlaps && u!=u2){ 3266 u1cluster=u.determineCluster(); 3267 u2cluster=u2.determineCluster(); 3268 } 3269 if(u1cluster!=u2cluster && u!=u2 && !u.equals(u2) && u2.r!=u.r.mate){//TODO: Not sure why identical things are banned... possibly relates to avoiding inter-pair edges? 3270 if(u2.valid()){ 3271 hits++; 3272 3273 // boolean flag=(u.code1==-3676200394282040623L && u2.code1==-7034423913727372751L) || 3274 // (u2.code1==-3676200394282040623L && u.code1==-7034423913727372751L); 3275 final boolean flag=false; 3276 if(verbose || flag){ 3277 System.err.println("\nFound potential overlap at am="+am+", i="+i+", key="+key.value()+ 3278 ", pre="+Arrays.toString(u2.prefixes)+", suf="+Arrays.toString(u2.suffixes)+ 3279 ", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i, k))); 3280 } 3281 3282 final Overlap o; 3283 if(maxEdges>1000000000 || u.overlapList==null || u2.overlapList==null || 3284 (u.overlapList.size()<maxEdges && u2.overlapList.size()<maxEdges2)){ 3285 o=u.makeOverlap(u2, i, key, bandy, am); 3286 3287 }else{ 3288 o=null; 3289 if(u.overlapList.size()>maxEdges){quit=true;} 3290 } 3291 if(o!=null){ 3292 3293 if(preventTransitiveOverlaps){ 3294 mergeClusterIds(u1cluster, u2cluster); 3295 } 3296 3297 assert(o.test(bandy, o.edits+maxEdits)) : o; 3298 if(verbose || flag){System.err.println("Created overlap "+o);} 3299 3300 long comp=u.length()-u2.length(); 3301 if(comp==0){comp=u.code1-u2.code1;} 3302 if(comp==0){comp=u.code2-u2.code2;} 3303 if(comp==0){comp=u.prefixes[0]-u2.prefixes[0];} 3304 if(comp==0){comp=u.suffixes[0]-u2.suffixes[0];} 3305 if(comp==0){comp=(u.r.numericID-u2.r.numericID);} 3306 assert(comp!=0) : u+", "+u2; 3307 Unit ua=(comp<0 ? u : u2); 3308 Unit ub=(comp<0 ? u2 : u); 3309 assert(ua!=ub); 3310 if(verbose || flag){ 3311 System.err.println("ua="+ua.code1); 3312 System.err.println("ub="+ub.code1); 3313 System.err.println("u ="+u.code1); 3314 System.err.println("u2="+u2.code1); 3315 System.err.println("u.r ="+u.r.numericID); 3316 System.err.println("u2.r="+u2.r.numericID); 3317 System.err.println("ua contains o? "+ua.alreadyHas(o)); 3318 System.err.println("ub contains o? "+ub.alreadyHas(o)); 3319 System.err.println("ua.list="+ua.overlapList); 3320 System.err.println("ub.list="+ub.overlapList); 3321 } 3322 3323 // assert(ua.alreadyHas(o)==ub.alreadyHas(o)); 3324 3325 final boolean uaContainedOverlap; 3326 3327 synchronized(ua){ 3328 if(ua.overlapList==null){ua.overlapList=new ArrayList<Overlap>(2);} 3329 if(!ua.overlapList.contains(o)){ 3330 if(EA){ 3331 synchronized(ub){ 3332 assert(ub.overlapList==null || !ub.overlapList.contains(o)) : 3333 ua.alreadyHas(o)+", "+ub.alreadyHas(o)+"\n"+o+"\n"+ub.overlapList.get(ub.overlapList.indexOf(o))+ 3334 "\nua.list="+ua.overlapList+"\nub.list="+ub.overlapList+"\nu.code1="+u.code1+"\nu2.code1="+u2.code1; 3335 } 3336 } 3337 currentOverlaps++; 3338 baseOverlapsT+=o.overlapLen; 3339 ua.overlapList.add(o); 3340 if(verbose || flag){System.err.println("Added overlap "+o);} 3341 uaContainedOverlap=false; 3342 }else{ 3343 if(verbose || flag){System.err.println("Already contained overlap "+o);} 3344 hits--; 3345 uaContainedOverlap=true; 3346 } 3347 } 3348 3349 if(!uaContainedOverlap){ 3350 synchronized(ub){ 3351 if(ub.overlapList==null){ub.overlapList=new ArrayList<Overlap>(2);} 3352 assert(!ub.overlapList.contains(o)); 3353 ub.overlapList.add(o); 3354 if(verbose || flag){System.err.println("Added overlap "+o);} 3355 } 3356 }else{ 3357 if(verbose || flag){System.err.println("Already contained overlap "+o);} 3358 } 3359 3360 3361 // assert(ua.alreadyHas(o)); 3362 // assert(ub.alreadyHas(o)); 3363 // assert(ua.overlapList.contains(o)); 3364 // assert(ub.overlapList.contains(o)); 3365 if(verbose || flag){ 3366 System.err.println("ua contains o? "+ua.alreadyHas(o)); 3367 System.err.println("ub contains o? "+ub.alreadyHas(o)); 3368 System.err.println("ua.list="+ua.overlapList); 3369 System.err.println("ub.list="+ub.overlapList); 3370 } 3371 } 3372 } 3373 } 3374 } 3375 } 3376 } 3377 } 3378 } 3379 if(EA){ 3380 synchronized(u){ 3381 if(u.overlapList!=null && u.overlapList.isEmpty()){ 3382 assert(false) : "Why would this happen?"; 3383 u.overlapList=null; 3384 } 3385 } 3386 } 3387 // assert(false) : hits+", "+currentOverlaps+", "+baseOverlaps+"\n"+overlapMapT+"\n"; 3388 3389 // assert(hits==currentOverlaps) : hits+", "+currentOverlaps; 3390 3391 overlapCollisionsT+=(hits-currentOverlaps); 3392 // outstream.println("hits="+hits+", currentOverlaps="+currentOverlaps); 3393 overlapsT+=currentOverlaps; 3394 return hits; 3395 } 3396 3397 /** Insert reads processed by a thread into the shared code and affix maps. 3398 * If operating in subset mode, only store reads with code equal to subset mod subsetCount. */ 3399 private long mergeMaps(){ 3400 if(verbose){System.err.println("Merging maps.");} 3401 long novelReads=0, novelKeys=0; 3402 long collisionReads=0; 3403 long mergedReads=0; 3404 3405 assert(localConflictList.isEmpty()); 3406 assert(sharedConflictList.isEmpty()); 3407 3408 synchronized(codeMap){ 3409 for(Long key : codeMapT.keySet()){ 3410 if(codeMap.containsKey(key)){ 3411 localConflictList.add(codeMapT.get(key)); 3412 sharedConflictList.add(codeMap.get(key)); 3413 }else{ 3414 ArrayList<Unit> list=codeMapT.get(key); 3415 codeMap.put(key, list); 3416 addedList.addAll(list); 3417 novelReads+=list.size(); 3418 novelKeys++; 3419 } 3420 } 3421 } 3422 3423 if(verbose){System.err.println("Novel reads = "+novelReads+", conflicts = "+localConflictList.size());} 3424 3425 for(int i=0; i<localConflictList.size(); i++){ 3426 ArrayList<Unit> listT=localConflictList.get(i); 3427 ArrayList<Unit> list=sharedConflictList.get(i); 3428 synchronized(list){ 3429 for(Unit u : listT){ 3430 if(verbose){System.err.println("Processing novel unit "+u.name());} 3431 boolean match=false; 3432 if(findMatchesT){ 3433 for(Unit u2 : list){ 3434 if(pairedEqualsRC(u, u2)){ 3435 // if(verbose){System.err.println("Matches "+new String(r2.bases, 0, Tools.min(40, r2.length())));} 3436 u2.absorbMatch(u); 3437 if(UNIQUE_ONLY){ 3438 synchronized(u2){ 3439 if(u2.valid()){ 3440 mergedReads++; 3441 baseMatchesT+=u2.length(); 3442 u2.setValid(false); 3443 addDupe(u2.r); 3444 } 3445 } 3446 } 3447 match=true; 3448 break; 3449 } 3450 } 3451 } 3452 if(match){ 3453 addDupe(u.r); 3454 mergedReads++; 3455 baseMatchesT+=u.length(); 3456 if(verbose){System.err.println("matchesT="+matchesT+", baseMatchesT="+baseMatchesT);} 3457 }else{ 3458 collisionReads++; 3459 if(verbose){System.err.println("False collision; count = "+collisionReads);} 3460 list.add(u); 3461 addedList.add(u); 3462 } 3463 } 3464 } 3465 } 3466 matchesT+=mergedReads; 3467 collisionsT+=collisionReads; 3468 if(verbose){System.err.println("Done Merging.");} 3469 if(verbose){System.err.println("mapT.size="+codeMapT.size()+", basesStoredT="+basesStoredT);} 3470 3471 codeMapT.clear(); 3472 localConflictList.clear(); 3473 sharedConflictList.clear(); 3474 3475 if(!addedList.isEmpty()){ 3476 if(addToAffixMapT){ 3477 final LongM p=new LongM(-1, true); 3478 assert(affixMaps!=null); 3479 assert(affixMaps[0]!=null || (affixMaps.length>1 && affixMaps[1]!=null)); 3480 3481 for(int i=0; i<numAffixMaps; i++){ 3482 HashMap<LongM, ArrayList<Unit>> map=affixMaps[i]; 3483 if(map!=null && (i>0 || !ignoreAffix1)){ 3484 synchronized(map){ 3485 for(Unit u : addedList){ 3486 final long prefix=u.prefixes[i], suffix=u.suffixes[i]; 3487 if(verbose){System.err.println("Processing affixes for "+u.name());} 3488 if(prefix!=-1 || prefix!=suffix){ 3489 if(verbose){System.err.println("Using prefix "+prefix);} 3490 p.set(prefix); 3491 ArrayList<Unit> alu=map.get(p); 3492 if(alu==null){ 3493 if(verbose){System.err.println("Made new alu for "+p);} 3494 alu=new ArrayList<Unit>(2); 3495 map.put(p.iCopy(), alu); 3496 } 3497 if(alu.size()<maxAffixCopies){ 3498 if(verbose){System.err.println("Added "+u.name());} 3499 alu.add(u); 3500 } 3501 if(verbose){System.err.println(map.get(p));} 3502 } 3503 if(storeSuffix && prefix!=suffix){ 3504 if(verbose){System.err.println("Using suffix "+suffix);} 3505 p.set(suffix); 3506 ArrayList<Unit> alu=map.get(p); 3507 if(alu==null){ 3508 if(verbose){System.err.println("Made new alu for "+p);} 3509 alu=new ArrayList<Unit>(2); 3510 map.put(p.iCopy(), alu); 3511 } 3512 if(alu.size()<maxAffixCopies){ 3513 if(verbose){System.err.println("Added "+u.name());} 3514 alu.add(u); 3515 } 3516 if(verbose){System.err.println(map.get(p));} 3517 } 3518 } 3519 } 3520 } 3521 } 3522 } 3523 } 3524 3525 addedList.clear(); 3526 basesStoredT=0; 3527 return collisionReads+novelReads; 3528 } 3529 3530 private int getTid(){ 3531 synchronized(HashThread.class){ 3532 int x=tcount; 3533 tcount++; 3534 return x; 3535 } 3536 } 3537 3538 private LinkedHashMap<Long, ArrayList<Unit>> codeMapT=new LinkedHashMap<Long, ArrayList<Unit>>(threadMaxReadsToBuffer*8); 3539 private ArrayList<Unit> addedList=new ArrayList<Unit>(threadMaxReadsToBuffer); 3540 private ArrayList<ArrayList<Unit>> localConflictList=new ArrayList<ArrayList<Unit>>(threadMaxReadsToBuffer); 3541 private ArrayList<ArrayList<Unit>> sharedConflictList=new ArrayList<ArrayList<Unit>>(threadMaxReadsToBuffer); 3542 3543 long matchesT=0; 3544 long baseMatchesT=0; 3545 long baseContainmentsT=0; 3546 long collisionsT=0; 3547 long containmentsT=0; 3548 long containmentCollisionsT=0; 3549 long basesStoredT=0; 3550 long addedToMainT=0; 3551 long readsProcessedT=0; 3552 long basesProcessedT=0; 3553 long overlapsT=0; 3554 long baseOverlapsT=0; 3555 long overlapCollisionsT=0; 3556 3557 private final boolean addToCodeMapT; 3558 private final boolean addToAffixMapT; 3559 private final boolean findContainmentsT; 3560 private final boolean findOverlapsT; 3561 private final boolean findMatchesT; 3562 // private final boolean convertToUpperCaseT; 3563 private final int tid; 3564 private final ArrayDeque<ConcurrentReadInputStream> crisq; 3565 private final BandedAligner bandy; 3566 } 3567 3568 public static boolean equalsRC(byte[] a, byte[] b){ 3569 if(a==b){return true;} 3570 if(a==null || b==null){return false;} 3571 if(a.length!=b.length){return false;} 3572 3573 boolean ca=isCanonical(a); 3574 boolean cb=isCanonical(b); 3575 3576 if(ca==cb){ 3577 for(int i=0; i<a.length; i++){ 3578 final byte aa=a[i], bb=b[i]; 3579 if(aa!=bb){return false;} 3580 } 3581 }else{ 3582 for(int i=0, j=b.length-1; i<a.length; i++, j--){ 3583 final byte aa=a[i], bb=baseToComplementExtended[b[j]]; 3584 if(aa!=bb){return false;} 3585 } 3586 } 3587 return true; 3588 } 3589 3590 public static boolean pairedEqualsRC(Unit ua, Unit ub){ 3591 if(verbose){System.err.println("pairedEqualsRC("+ua.name()+", "+ub.name()+")");} 3592 if(verbose){System.err.println("ea");} 3593 boolean b=equalsRC(ua, ub); 3594 if(verbose){System.err.println("eb");} 3595 if(!b){return false;} 3596 if(verbose){System.err.println("ec");} 3597 3598 if(ua.r!=null && ub.r!=null){ 3599 if(verbose){System.err.println("ed");} 3600 assert((ua.r.mate==null)==(ub.r.mate==null)); 3601 if(verbose){System.err.println("ee");} 3602 if(ua.r.mate!=null && ub.r.mate!=null){ 3603 if(verbose){System.err.println("ef");} 3604 return ua.canonical()==ub.canonical() && ua.r.pairnum()==ub.r.pairnum() && Tools.compare(ua.r.mate.bases, ub.r.mate.bases)==0; 3605 } 3606 if(verbose){System.err.println("eg");} 3607 } 3608 if(verbose){System.err.println("eh");} 3609 return true; 3610 } 3611 3612 private static boolean equalsRC(Unit ua, Unit ub){ 3613 if(verbose){System.err.println("equalsRC("+ua.name()+", "+ub.name()+")");} 3614 return ua.code1==ub.code1 && ua.code2==ub.code2 && (ua.canonical()==ub.canonical() ? (ua.prefixes[0]==ub.prefixes[0] && ua.suffixes[0]==ub.suffixes[0]) : 3615 (ua.prefixes[0]==ub.suffixes[0] && ua.suffixes[0]==ub.prefixes[0])) && compareRC(ua, ub)==0; 3616 } 3617 3618 public static int comparePairedRC(Unit ua, Unit ub){ 3619 if(verbose){System.err.println("comparePairedRC("+ua.name()+", "+ub.name()+")");} 3620 int x=compareRC(ua, ub); 3621 if(x!=0){return x;} 3622 3623 if(ua.r!=null && ub.r!=null && ua.r.mate!=null && ub.r.mate!=null){ 3624 if(ua.r.pairnum()!=ub.r.pairnum()){return ua.r.pairnum()-ub.r.pairnum();} 3625 return compareRC((Unit)ua.r.mate.obj, (Unit)ub.r.mate.obj); 3626 } 3627 return 0; 3628 } 3629 3630 //TODO 3631 //This is really for sorting by length. 3632 private static int compareRC(Unit ua, Unit ub){ 3633 if(verbose){System.err.println("compareRC("+ua.name()+", "+ub.name()+")");} 3634 if(ua==ub){return 0;} 3635 if(verbose){System.err.println("a");} 3636 if(verbose){System.err.println("a1");} 3637 if(ua.length()!=ub.length()){return ub.length()-ua.length();} 3638 if(verbose){System.err.println("a2");} 3639 3640 if(REQUIRE_MATCHING_NAMES){ 3641 if(ua.name()!=null && ub.name()!=null){ 3642 int x=ua.name().compareTo(ub.name()); 3643 if(x!=0){return x;} 3644 } 3645 } 3646 if(verbose){System.err.println("a3");} 3647 3648 if(ua.r==null || ub.r==null){ 3649 if(verbose){System.err.println("b");} 3650 if(verbose){System.err.println("b1");} 3651 if(ua.canonical()){ 3652 if(verbose){System.err.println("c");} 3653 if(ub.canonical()){ 3654 if(ua.prefixes[0]!=ub.prefixes[0]){return ua.prefixes[0]>ub.prefixes[0] ? 1 : -1;} 3655 if(ua.suffixes[0]!=ub.suffixes[0]){return ua.suffixes[0]>ub.suffixes[0] ? 1 : -1;} 3656 }else{ 3657 if(ua.prefixes[0]!=ub.suffixes[0]){return ua.prefixes[0]>ub.suffixes[0] ? 1 : -1;} 3658 if(ua.suffixes[0]!=ub.prefixes[0]){return ua.suffixes[0]>ub.prefixes[0] ? 1 : -1;} 3659 } 3660 }else{ 3661 if(verbose){System.err.println("d");} 3662 if(ub.canonical()){ 3663 if(ua.suffixes[0]!=ub.prefixes[0]){return ua.suffixes[0]>ub.prefixes[0] ? 1 : -1;} 3664 if(ua.prefixes[0]!=ub.suffixes[0]){return ua.prefixes[0]>ub.suffixes[0] ? 1 : -1;} 3665 }else{ 3666 if(ua.suffixes[0]!=ub.suffixes[0]){return ua.suffixes[0]>ub.suffixes[0] ? 1 : -1;} 3667 if(ua.prefixes[0]!=ub.prefixes[0]){return ua.prefixes[0]>ub.prefixes[0] ? 1 : -1;} 3668 } 3669 } 3670 if(verbose){System.err.println("e");} 3671 if(ua.code1!=ub.code1){return ua.code1>ub.code1 ? 1 : -1;} 3672 if(ua.code2!=ub.code2){return ua.code2>ub.code2 ? 1 : -1;} 3673 3674 return ua.pairnum()-ub.pairnum(); 3675 } 3676 if(verbose){System.err.println("f");} 3677 final byte[] a=ua.r.bases, b=ub.r.bases; 3678 if(a==b){return 0;} 3679 if(a==null || b==null){return a==null ? -1 : 1;} 3680 if(verbose){System.err.println("g");} 3681 3682 if(ua.canonical()==ub.canonical()){ 3683 if(verbose){System.err.println("h");} 3684 if(ua.canonical() && ub.canonical()){ 3685 for(int i=0; i<a.length; i++){ 3686 final byte aa=a[i], bb=b[i]; 3687 if(aa!=bb){return aa-bb;} 3688 } 3689 }else{ 3690 for(int i=a.length-1; i>=0; i--){ 3691 final byte aa=baseToComplementExtended[a[i]], bb=baseToComplementExtended[b[i]]; 3692 if(aa!=bb){return aa-bb;} 3693 } 3694 } 3695 }else{ 3696 if(verbose){System.err.println("i");} 3697 if(ua.canonical()){ 3698 for(int i=0, j=b.length-1; i<a.length; i++, j--){ 3699 final byte aa=a[i], bb=baseToComplementExtended[b[j]]; 3700 if(aa!=bb){return aa-bb;} 3701 } 3702 }else{ 3703 for(int i=a.length-1, j=0; i>=0; i--, j++){ 3704 final byte aa=baseToComplementExtended[a[i]], bb=b[j]; 3705 if(aa!=bb){return aa-bb;} 3706 } 3707 } 3708 } 3709 3710 if(verbose){System.err.println("j");} 3711 return ua.pairnum()-ub.pairnum(); 3712 } 3713 3714 private static long hashTip(byte[] bases, boolean prefix, int k, int skipInitialBases){ 3715 if(bases==null || bases.length<k){return -1;} 3716 3717 final int shift=2*k; 3718 final int shift2=shift-2; 3719 final long mask=(shift>63 ? -1L : ~((-1L)<<shift)); 3720 long kmer=0; 3721 long rkmer=0; 3722 int len=0; 3723 3724 final int start=(prefix ? 0+skipInitialBases : bases.length-k-skipInitialBases); 3725 final int stop=start+k; 3726 3727 // if(verbose){ 3728 // System.err.println("\n"+new String(bases)); 3729 // System.err.println("prefix="+prefix+", start="+start+", stop="+stop); 3730 //// System.err.print(new String(bases)); 3731 // } 3732 for(int i=start; i<stop; i++){ 3733 byte b=bases[i]; 3734 // if(verbose){System.err.print((char)b);} 3735 long x=baseToNumber[b]; 3736 long x2=baseToComplementNumber[b]; 3737 kmer=((kmer<<2)|x)&mask; 3738 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; 3739 len++; 3740 } 3741 if(verbose){System.err.println(new String(bases, start, k)+" = "+Tools.max(kmer, rkmer));} 3742 assert(len==k) : len+","+k; 3743 return Tools.max(kmer, rkmer); 3744 } 3745 3746 private static final int calcMaxEdits(int maxEdits, float minIdentityMult, int len){ 3747 return minIdentityMult==0 ? maxEdits : Tools.max(maxEdits, (int)Math.round(len*minIdentityMult)); 3748 } 3749 3750 3751 private class Overlap implements Comparable<Overlap>{ 3752 3753 public Overlap(Unit u1_, Unit u2_, int type_, int start1_, int start2_, int stop1_, int stop2_, int len_, int mismatches_, int edits_, BandedAligner bandy){ 3754 assert(u1_!=u2_); 3755 if(verbose){System.err.println("\nCreating an overlap.");} 3756 u1=u1_; 3757 u2=u2_; 3758 type=type_; 3759 start1=start1_; 3760 start2=start2_; 3761 stop1=stop1_; 3762 stop2=stop2_; 3763 overlapLen=len_; 3764 mismatches=mismatches_; 3765 edits=edits_; 3766 3767 assert(Tools.absdif(Tools.absdif(start1, stop1), Tools.absdif(start2, stop2))<=maxEdits) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases) 3768 +"\n>1a\n"+new String(u1.r.bases, Tools.min(start1, stop1), Tools.max(start1, stop1)-Tools.min(start1, stop1)+1) 3769 +"\n>2a\n"+new String(u2.r.bases, Tools.min(start2, stop2), Tools.max(start2, stop2)-Tools.min(start2, stop2)+1); 3770 3771 assert(start1>=0 && start1<=u1.length()) : "type "+type+": "+start1+", "+stop1+", "+u1.length()+", "+start2+", "+stop2+", "+u2.length(); 3772 assert(stop1>=0 && stop1<=u1.length()) : "type "+type+": "+start1+", "+stop1+", "+u1.length()+", "+start2+", "+stop2+", "+u2.length(); 3773 assert(start2>=0 && start2<=u2.length()) : "type "+type+": "+start1+", "+stop1+", "+u1.length()+", "+start2+", "+stop2+", "+u2.length(); 3774 assert(stop2>=0 && stop2<=u2.length()) : "type "+type+": "+start1+", "+stop1+", "+u1.length()+", "+start2+", "+stop2+", "+u2.length(); 3775 3776 assert(type==FORWARD || type==FORWARDRC || type==REVERSE || type==REVERSERC); 3777 3778 if(verbose){System.err.println(this);} 3779 3780 assert(Tools.absdif(Tools.absdif(start1, stop1), Tools.absdif(start1, stop1))<=maxEdits); 3781 3782 assert(test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases) 3783 +"\n>1a\n"+new String(u1.r.bases, Tools.min(start1, stop1), Tools.max(start1, stop1)-Tools.min(start1, stop1)+1) 3784 +"\n>2a\n"+new String(u2.r.bases, Tools.min(start2, stop2), Tools.max(start2, stop2)-Tools.min(start2, stop2)+1); 3785 if(verbose){System.err.println("Passed test 1.");} 3786 3787 // bandy.verbose=true; 3788 // test(bandy); 3789 // assert(false); 3790 3791 assert(u1!=u2); 3792 u1.firstInOverlap(u2); 3793 u2.firstInOverlap(u1); 3794 assert(u1.length()!=u2.length() || u1.code1!=u2.code1 || u1.code2!=u2.code2 || (u1.r!=null && u1.r.mate!=null)) : "Collision? \n"+this+"\n"+u1+"\n"+u2; 3795 assert(u1.firstInOverlap(u2)!=u2.firstInOverlap(u1)) : 3796 "\nu1.firstInOverlap(u2)="+u1.firstInOverlap(u2)+"\nu2.firstInOverlap(u1)="+u2.firstInOverlap(u1)+"\nu1="+u1+"\nu2="+u2; 3797 3798 if(!u1.firstInOverlap(u2)){ 3799 if(verbose){System.err.println("\nSwapping.");} 3800 swap(); 3801 if(verbose){System.err.println(this);} 3802 3803 if(EA && !customBandwidth && !test(bandy, edits+maxEdits)){ 3804 System.err.println("\n"+this); 3805 swap(); 3806 System.err.println("\n"+this); 3807 assert(test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases)+"\n"; 3808 System.err.println("Passed test 2a, "+bandy.lastEdits+" edits.\n"); 3809 swap(); 3810 System.err.println("\n"+this); 3811 assert(test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases) 3812 +"\n>1a\n"+new String(u1.r.bases, Tools.min(start1, stop1), Tools.max(start1, stop1)-Tools.min(start1, stop1)+1) 3813 +"\n>2a\n"+new String(u2.r.bases, Tools.min(start2, stop2), Tools.max(start2, stop2)-Tools.min(start2, stop2)+1); 3814 System.err.println("Passed test 2b, "+bandy.lastEdits+" edits.\n"); 3815 } 3816 3817 assert(customBandwidth || test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases) 3818 +"\n>1a\n"+new String(u1.r.bases, Tools.min(start1, stop1), Tools.max(start1, stop1)-Tools.min(start1, stop1)+1) 3819 +"\n>2a\n"+new String(u2.r.bases, Tools.min(start2, stop2), Tools.max(start2, stop2)-Tools.min(start2, stop2)+1); 3820 if(verbose){System.err.println("Passed test 2.");} 3821 } 3822 3823 if(type==REVERSE || type==REVERSERC){ 3824 if(verbose){System.err.println("\nReversing.");} 3825 reverseDirection(); 3826 if(verbose){System.err.println(this);} 3827 3828 if(EA && !Shared.anomaly && !customBandwidth && bandy!=null && !test(bandy, edits+maxEdits)){ 3829 Shared.anomaly=true; 3830 BandedAligner.verbose=true; 3831 System.err.println("\n********** Failed test 3, "+bandy.lastEdits+" edits. ***************\n"); 3832 reverseDirection(); 3833 System.err.println(this); 3834 assert(test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases)+"\n"; 3835 System.err.println("Passed test 3a, "+bandy.lastEdits+" edits.\n"); 3836 reverseDirection(); 3837 System.err.println(this); 3838 assert(test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases) 3839 +"\n>1a\n"+new String(u1.r.bases, Tools.min(start1, stop1), Tools.max(start1, stop1)-Tools.min(start1, stop1)+1) 3840 +"\n>2a\n"+new String(u2.r.bases, Tools.min(start2, stop2), Tools.max(start2, stop2)-Tools.min(start2, stop2)+1); 3841 System.err.println("Passed test 3b, "+bandy.lastEdits+" edits.\n"); 3842 BandedAligner.verbose=false; 3843 } 3844 3845 assert(customBandwidth || test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases)+"\n"; 3846 if(verbose){System.err.println("Passed test 3.");} 3847 } 3848 //Now all overlaps should be FORWARD or FORWARDRC and u1 should be at least as big as u2 3849 assert(type==FORWARD || type==FORWARDRC); 3850 assert(u1.length()>=u2.length()); 3851 assert(u1.firstInOverlap(u2)); 3852 assert(!u2.firstInOverlap(u1)); 3853 if(verbose){System.err.println("Finished overlap initialization.");} 3854 } 3855 3856 public boolean test(BandedAligner bandy, int editLimit){ 3857 final int last1=u1.length()-1, last2=u2.length()-1; 3858 if(verbose){System.err.println("Testing "+OVERLAP_TYPE_NAMES[type]+", "+start1+", "+start2);} 3859 if(type==FORWARD){ 3860 assert(start1==0 || start2==0) : "start1="+start1+", stop1="+stop1+", last1="+last1+", start2="+start2+", stop2="+stop2+", last2="+last2; 3861 if(start2==0){ 3862 if(verbose){System.err.println("A");} 3863 return u1.overlapsForward(u2, start1, start2, bandy, false, editLimit);} 3864 else{ 3865 if(verbose){System.err.println("B");} 3866 return u2.overlapsForward(u1, start2, start1, bandy, false, editLimit);} 3867 } 3868 if(type==FORWARDRC){ 3869 assert(start1==0 || start2==last2) : "start1="+start1+", stop1="+stop1+", last1="+last1+", start2="+start2+", stop2="+stop2+", last2="+last2; 3870 if(start2==last2){return u1.overlapsForwardRC(u2, start1, start2, bandy, false, editLimit);} 3871 else{return u2.overlapsReverseRC(u1, start2, start1, bandy, false, editLimit);} 3872 } 3873 if(type==REVERSE){ 3874 assert(start1==last1 || start2==last2) : "start1="+start1+", stop1="+stop1+", last1="+last1+", start2="+start2+", stop2="+stop2+", last2="+last2; 3875 if(start2==last2){return u1.overlapsReverse(u2, start1, start2, bandy, false, editLimit);} 3876 else{return u2.overlapsReverse(u1, start2, start1, bandy, false, editLimit);} 3877 } 3878 if(type==REVERSERC){ 3879 assert(start1==last1 || start2==0) : "start1="+start1+", stop1="+stop1+", last1="+last1+", start2="+start2+", stop2="+stop2+", last2="+last2; 3880 if(start2==0){return u1.overlapsReverseRC(u2, start1, start2, bandy, false, editLimit);} 3881 else{return u2.overlapsForwardRC(u1, start2, start1, bandy, false, editLimit);} 3882 } 3883 throw new RuntimeException(); 3884 } 3885 3886 @Override 3887 public boolean equals(Object o){ 3888 return equals((Overlap)o); 3889 } 3890 3891 public boolean equals(Overlap o){ 3892 if(this==o){return true;} 3893 assert(o!=null) : "*A*\n"+this+"\n"+o+"\n"+u1+"\n"+u2; 3894 assert(u1!=null && u2!=null) : "*B*\n"+this+"\n"+o+"\n"+u1+"\n"+u2; 3895 assert(u1!=o.u2 || u2!=o.u1) : "*C*\n"+this+"\n"+o+"\n"+u1.firstInOverlap(u2)+"\n"+o.u1.firstInOverlap(o.u2)+"\n"+u1+"\n"+u2; 3896 return (u1==o.u1 && u2==o.u2 && type==o.type && start1==o.start1 && start2==o.start2 && stop1==o.stop1 && stop2==o.stop2) 3897 ;//|| (u1==o.u2 && u2==o.u1 && type==reverseType(o.type) && start1==o.start2 && start2==o.start1); 3898 } 3899 3900 // public int compareTo(Overlap o){ 3901 // int a=compareTo2(o); 3902 // int b=o.compareTo2(this); 3903 // assert(a==-b) : "\n"+this+"\n"+o+"\na="+a+", b="+b+", equals="+this.equals(o) 3904 // +"\nu1.compareTo(o.u1)="+u1.compareTo(o.u1)+"\no.u1.compareTo(u1)="+o.u1.compareTo(u1) 3905 // +"\nu2.compareTo(o.u2)="+u2.compareTo(o.u2)+"\no.u2.compareTo(u2)="+o.u2.compareTo(u2); 3906 // return a; 3907 // } 3908 3909 @Override 3910 public int compareTo(Overlap o){ 3911 int score1=overlapLen-50*(mismatches+edits); 3912 int score2=o.overlapLen-50*(o.mismatches+o.edits); 3913 if(score1!=score2){return score2-score1;} 3914 if(overlapLen!=o.overlapLen){return o.overlapLen-overlapLen;} 3915 int x=u1.compareTo(o.u1); 3916 if(x!=0){return -x;} 3917 x=u2.compareTo(o.u2); 3918 if(x!=0){return -x;} 3919 if(type!=o.type){return type-o.type;} 3920 if((u1!=o.u1 || u2!=o.u2) && absorbMatch && !subsetMode){ 3921 boolean oldv=verbose; 3922 verbose=true; 3923 System.err.println(this); 3924 System.err.println(o); 3925 System.err.println("********"); 3926 System.err.println(u1); 3927 System.err.println(u2); 3928 System.err.println(o.u1); 3929 System.err.println(o.u2); 3930 System.err.println("********"); 3931 System.err.println(u1.equals(o.u1)); 3932 System.err.println("********"); 3933 System.err.println(u2.equals(o.u2)); 3934 System.err.println("********"); 3935 System.err.println(u1.compareTo(o.u1)); 3936 System.err.println("********"); 3937 System.err.println(u2.compareTo(o.u2)); 3938 System.err.println("********"); 3939 verbose=oldv; 3940 } 3941 assert(!absorbMatch || (u1==o.u1 && u2==o.u2) || subsetMode) : "\n"+u1.r+"\n"+u2.r+"\n"+o.u1.r+"\n"+o.u2.r 3942 +"\n\n"+u1.r.mate+"\n"+u2.r.mate+"\n"+o.u1.r.mate+"\n"+o.u2.r.mate; 3943 // assert(false) : "\n"+this+"\n"+o+"\n>"+u1.name()+"\n"+new String(u1.bases())+"\n>"+u2.name()+"\n"+new String(u2.bases())+"\n"; 3944 if(start1!=o.start1){return start1-o.start1;} 3945 if(stop1!=o.stop1){return stop1-o.stop1;} 3946 if(start2!=o.start2){return start2-o.start2;} 3947 if(stop2!=o.stop2){return stop2-o.stop2;} 3948 if(this.equals(o)){ 3949 return 0; 3950 }else{ 3951 //TODO: ensure this assumption is valid. 3952 assert(!absorbContainment || !absorbMatch || subsetMode) : "\n"+this+"\n"+o+"\n>"+u1.name()+"\n"+new String(u1.bases())+"\n>"+u2.name()+"\n"+new String(u2.bases())+"\n"; 3953 3954 if(u1.unitID!=o.u1.unitID){return u1.unitID-o.u1.unitID;} 3955 if(u2.unitID!=o.u2.unitID){return u2.unitID-o.u2.unitID;} 3956 } 3957 return 0; 3958 } 3959 3960 @Override 3961 public int hashCode(){ 3962 return u1.hashCode()^u2.hashCode()^overlapLen; 3963 } 3964 3965 public void flip(Unit changed, BandedAligner bandy){ 3966 3967 if(changed==u2){ 3968 if(type==FORWARD){type=FORWARDRC;} 3969 else if(type==FORWARDRC){type=FORWARD;} 3970 else if(type==REVERSE){type=REVERSERC;} 3971 else if(type==REVERSERC){type=REVERSE;} 3972 else{throw new RuntimeException("Unknown overlap type "+type);} 3973 start2=u2.length()-start2-1; 3974 stop2=u2.length()-stop2-1; 3975 }else if(changed==u1){ 3976 if(type==FORWARD){type=REVERSERC;} 3977 else if(type==FORWARDRC){type=REVERSE;} 3978 else if(type==REVERSE){type=FORWARDRC;} 3979 else if(type==REVERSERC){type=FORWARD;} 3980 else{throw new RuntimeException("Unknown overlap type "+type);} 3981 start1=u1.length()-start1-1; 3982 stop1=u1.length()-stop1-1; 3983 }else{throw new RuntimeException("'changed' was not in the Overlap.");} 3984 3985 assert(test(bandy, edits+maxEdits)); 3986 } 3987 3988 public void swap(){ 3989 Unit tempu=u1; 3990 u1=u2; 3991 u2=tempu; 3992 int temp=start1; 3993 start1=start2; 3994 start2=temp; 3995 temp=stop1; 3996 stop1=stop2; 3997 stop2=temp; 3998 if(type==FORWARDRC){type=REVERSERC;} 3999 else if(type==REVERSERC){type=FORWARDRC;} 4000 } 4001 4002 public void reverseDirection(){ 4003 type=reverseType(type); 4004 int temp=start1; 4005 start1=stop1; 4006 stop1=temp; 4007 temp=start2; 4008 start2=stop2; 4009 stop2=temp; 4010 } 4011 4012 @Override 4013 public String toString(){ 4014 StringBuilder sb=new StringBuilder(80); 4015 sb.append("type="); 4016 sb.append(OVERLAP_TYPE_NAMES[type]); 4017 sb.append(", len="); 4018 sb.append(overlapLen); 4019 sb.append(", subs="); 4020 sb.append(mismatches); 4021 sb.append(", edits="); 4022 sb.append(edits); 4023 4024 sb.append(" ("); 4025 sb.append(u1.name()==null ? u1.r.numericID+"" : u1.name()); 4026 if(printLengthInEdges){ 4027 sb.append(", length="); 4028 sb.append(u1.length()); 4029 } 4030 sb.append(", start1="); 4031 sb.append(start1); 4032 sb.append(", stop1="); 4033 sb.append(stop1); 4034 4035 sb.append(") ("); 4036 sb.append(u2.name()==null ? u2.r.numericID+"" : u2.name()); 4037 if(printLengthInEdges){ 4038 sb.append(", length="); 4039 sb.append(u2.length()); 4040 } 4041 sb.append(", start2="); 4042 sb.append(start2); 4043 sb.append(", stop2="); 4044 sb.append(stop2); 4045 sb.append(")"); 4046 return sb.toString(); 4047 } 4048 4049 public String toLabel(){ 4050 StringBuilder sb=new StringBuilder(80); 4051 sb.append(OVERLAP_TYPE_ABBREVIATIONS[type]); 4052 sb.append(','); 4053 sb.append(overlapLen); 4054 sb.append(','); 4055 sb.append(mismatches); 4056 sb.append(','); 4057 sb.append(edits); 4058 4059 if(printLengthInEdges){ 4060 sb.append(','); 4061 sb.append(u1.length()); 4062 } 4063 sb.append(','); 4064 sb.append(start1); 4065 sb.append(','); 4066 sb.append(stop1); 4067 4068 if(printLengthInEdges){ 4069 sb.append(','); 4070 sb.append(u2.length()); 4071 } 4072 sb.append(','); 4073 sb.append(start2); 4074 sb.append(','); 4075 sb.append(stop2); 4076 4077 return sb.toString(); 4078 } 4079 4080 4081 private void setCanonContradiction(boolean b){ 4082 assert(b!=canonContradiction()) : b+", "+canonContradiction(); 4083 if(b){flags|=CANON_CONTRADICTION_MASK;} 4084 else{flags&=~CANON_CONTRADICTION_MASK;} 4085 assert(b==canonContradiction()) : b+", "+canonContradiction(); 4086 } 4087 4088 private void setOffsetContradiction(boolean b){ 4089 assert(b!=offsetContradiction()) : b+", "+offsetContradiction(); 4090 if(b){flags|=OFFSET_CONTRADICTION_MASK;} 4091 else{flags&=~OFFSET_CONTRADICTION_MASK;} 4092 assert(b==offsetContradiction()) : b+", "+offsetContradiction(); 4093 } 4094 4095 private void setMultiJoin(boolean b){ 4096 assert(b!=multiJoin()) : b+", "+multiJoin(); 4097 if(b){flags|=MULTIJOIN_MASK;} 4098 else{flags&=~MULTIJOIN_MASK;} 4099 assert(b==multiJoin()) : b+", "+multiJoin(); 4100 } 4101 4102 private void setVisited(boolean b){ 4103 assert(b!=visited()) : b+", "+visited(); 4104 if(b){flags|=VISITED_MASK;} 4105 else{flags&=~VISITED_MASK;} 4106 assert(b==visited()) : b+", "+visited(); 4107 } 4108 4109 private void setCyclic(boolean b){ 4110 assert(b!=cyclic()) : b+", "+cyclic(); 4111 if(b){flags|=CYCLIC_MASK;} 4112 else{flags&=~CYCLIC_MASK;} 4113 assert(b==cyclic()) : b+", "+cyclic(); 4114 } 4115 4116 private void setInvalid(boolean b){ 4117 assert(b!=invalid()) : b+", "+invalid(); 4118 assert(b!=mst()) : b+", "+mst()+", "+invalid(); 4119 if(b){flags|=INVALID_MASK;} 4120 else{flags&=~INVALID_MASK;} 4121 assert(b==invalid()) : b+", "+invalid(); 4122 } 4123 4124 private void setMst(boolean b){ 4125 assert(b!=mst()) : b+", "+mst(); 4126 assert(b!=invalid()) : b+", "+mst()+", "+invalid(); 4127 if(b){flags|=MST_MASK;} 4128 else{flags&=~MST_MASK;} 4129 assert(b==mst()) : b+", "+mst(); 4130 } 4131 4132 public void clearVolatileFlags(){ 4133 flags=0; 4134 // flags=flags&~(MULTIJOIN_MASK|VISITED_MASK|CANON_CONTRADICTION_MASK|CYCLIC_MASK|OFFSET_CONTRADICTION_MASK|INVALID_MASK); 4135 // assert(!canonContradiction()); 4136 // assert(!offsetContradiction()); 4137 // assert(!multiJoin()); 4138 // assert(!visited()); 4139 // assert(!cyclic()); 4140 // assert(!invalid()); 4141 } 4142 4143 public boolean canonContradiction(){return (CANON_CONTRADICTION_MASK&flags)==CANON_CONTRADICTION_MASK;} 4144 public boolean offsetContradiction(){return (OFFSET_CONTRADICTION_MASK&flags)==OFFSET_CONTRADICTION_MASK;} 4145 public boolean multiJoin(){return (MULTIJOIN_MASK&flags)==MULTIJOIN_MASK;} 4146 public boolean visited(){return (VISITED_MASK&flags)==VISITED_MASK;} 4147 public boolean cyclic(){return (CYCLIC_MASK&flags)==CYCLIC_MASK;} 4148 public boolean invalid(){return (INVALID_MASK&flags)==INVALID_MASK;} 4149 public boolean mst(){return (MST_MASK&flags)==MST_MASK;} 4150 public boolean contradiction(){return canonContradiction() || offsetContradiction();} 4151 4152 private static final long VISITED_MASK=(1L<<0); 4153 private static final long MULTIJOIN_MASK=(1L<<1); 4154 private static final long CYCLIC_MASK=(1L<<2); 4155 private static final long CANON_CONTRADICTION_MASK=(1L<<3); 4156 private static final long OFFSET_CONTRADICTION_MASK=(1L<<4); 4157 private static final long INVALID_MASK=(1L<<5); 4158 private static final long MST_MASK=(1L<<6); 4159 4160 Unit u1; 4161 Unit u2; 4162 int type; 4163 int start1; 4164 int start2; 4165 int stop1; 4166 int stop2; 4167 4168 long flags=0; 4169 4170 final int overlapLen; 4171 final int mismatches; 4172 final int edits; 4173 } 4174 4175 /** 4176 * @return 4177 */ 4178 private int determineCluster2(final int uid) { 4179 assert(clusterNumbers!=null); 4180 boolean stable=false; 4181 int cluster=uid; 4182 while(!stable){ 4183 cluster=clusterNumbers.get(uid); 4184 if(cluster==0 || cluster==uid){return cluster;} 4185 assert(cluster<=uid); 4186 final int next=determineCluster2(cluster); 4187 if(next>=cluster){return cluster;} 4188 stable=clusterNumbers.compareAndSet(uid, cluster, next); 4189 } 4190 return cluster; 4191 } 4192 4193 4194 private int mergeClusterIds(int cluster1, int cluster2) { 4195 assert(clusterNumbers!=null); 4196 4197 // System.err.println("Merging clusters "+cluster1+" and "+cluster2); 4198 4199 while(cluster1!=cluster2){ 4200 int min=Tools.min(cluster1, cluster2); 4201 if(cluster1!=min){ 4202 assert(cluster1>min); 4203 boolean b=clusterNumbers.compareAndSet(cluster1, cluster1, min); 4204 if(!b){ 4205 cluster1=determineCluster2(cluster1); 4206 min=Tools.min(cluster1, cluster2); 4207 } 4208 } 4209 if(cluster2!=min){ 4210 assert(cluster2>min); 4211 boolean b=clusterNumbers.compareAndSet(cluster2, cluster2, min); 4212 if(!b){ 4213 cluster2=determineCluster2(cluster2); 4214 min=Tools.min(cluster1, cluster2); 4215 } 4216 } 4217 } 4218 // System.err.println("Returning "+cluster1); 4219 return cluster1; 4220 } 4221 4222 private class Unit implements Comparable<Unit>, Serializable { 4223 4224 /** 4225 * 4226 */ 4227 private static final long serialVersionUID = -992343322822460643L; 4228 4229 public Unit(Read r_){ 4230 this(r_, isCanonical(r_.bases)); 4231 } 4232 4233 public Unit(Read r_, boolean canonical_){ 4234 // this(r_, canonical_, canonical_ ? hash(r_.bases) : hashReversed(r_.bases)); 4235 this(r_, canonical_, hash(r_.bases), hashReversed(r_.bases)); 4236 } 4237 4238 public Unit(Read r_, boolean canonical_, long codeF_, long codeR_){ 4239 r=r_; 4240 code1=Tools.min(codeF_, codeR_); 4241 code2=Tools.max(codeF_, codeR_); 4242 final int len=r.length(); 4243 for(int i=0; i<prefixes.length; i++){ 4244 if(len>(i+1)*k){ 4245 prefixes[i]=hashTip(r.bases, true, k, i*k); 4246 suffixes[i]=hashTip(r.bases, false, k, i*k); 4247 }else{ 4248 prefixes[i]=-1; 4249 suffixes[i]=-1; 4250 } 4251 } 4252 long f=r.length(); 4253 if(canonical_){f|=CANON_MASK;} 4254 if(r.pairnum()==1){f|=PAIRNUM_MASK;} 4255 flags=f; 4256 assert(canonical()==canonical_); 4257 assert(length()==r.length()); 4258 assert(pairnum()==r.pairnum()); 4259 if(parseDepth){ 4260 int[] quad=KmerNormalize.parseDepth(r.id, null); 4261 if(quad!=null){depth=quad[r.pairnum()];} 4262 } 4263 } 4264 4265 int determineCluster() { 4266 return determineCluster2(unitID); 4267 } 4268 4269 public void absorbMatch(Unit u){ 4270 4271 assert(code1==u.code1 && code2==u.code2 && length()==u.length()); 4272 if(r==null || u.r==null){return;} 4273 u.r.setDiscarded(true); 4274 final byte[] bases1=r.bases, bases2=u.r.bases; 4275 final byte[] quals1=r.quality, quals2=u.r.quality; 4276 4277 assert((r.mate==null) == (u.r.mate==null)); 4278 4279 if(r.mate!=null && !u.r.mate.discarded()){ 4280 ((Unit)r.mate.obj).absorbMatch((Unit)u.r.mate.obj); 4281 } 4282 if(quals1==null || quals2==null){return;} 4283 4284 if(canonical()==u.canonical()){ 4285 for(int i=0; i<bases1.length; i++){ 4286 byte b1=bases1[i], b2=bases2[i]; 4287 if(!AminoAcid.isFullyDefined(b1) && AminoAcid.isFullyDefined(b2)){bases1[i]=b2;} 4288 else{assert(b1==b2);} 4289 if(quals1!=null && quals2!=null){ 4290 quals1[i]=Tools.max(quals1[i], quals2[i]); 4291 } 4292 } 4293 }else{ 4294 for(int i=0, j=bases2.length-1; i<bases1.length; i++, j--){ 4295 byte b1=bases1[i], b2=baseToComplementExtended[bases2[j]]; 4296 if(!AminoAcid.isFullyDefined(b1) && AminoAcid.isFullyDefined(b2)){bases1[i]=b2;} 4297 else{assert(b1==b2);} 4298 if(quals1!=null && quals2!=null){ 4299 quals1[i]=Tools.max(quals1[i], quals2[j]); 4300 } 4301 } 4302 } 4303 } 4304 4305 public boolean alreadyHas(Overlap o){ 4306 if(overlapList==null){return false;} 4307 for(int i=0; i<overlapList.size(); i++){ 4308 Overlap o2=overlapList.get(i); 4309 if(o.equals(o2)){ 4310 assert(overlapList.contains(o)); 4311 assert(o2.equals(o)); 4312 return true; 4313 } 4314 } 4315 assert(!overlapList.contains(o)); 4316 return false; 4317 } 4318 4319 public ArrayList<Unit> makeCluster() { 4320 assert(!visited()); 4321 assert(!clustered()); 4322 assert(valid()); 4323 // assert(set.isEmpty()); 4324 ArrayList<Unit> cluster=new ArrayList<Unit>(overlapList==null ? 1 : overlapList.size()+1); 4325 cluster.add(this); 4326 setClustered(true); 4327 4328 int added=1; 4329 for(int i=0; i<cluster.size(); i++){ 4330 Unit u=cluster.get(i); 4331 added+=u.visit(cluster); 4332 } 4333 4334 assert(added==cluster.size()); 4335 return cluster; 4336 } 4337 4338 public int visit(ArrayList<Unit> cluster) { 4339 assert(!visited()); 4340 assert(clustered()); 4341 assert(valid()); 4342 // assert(cluster.contains(this)); 4343 setVisited(true); 4344 int added=0; 4345 4346 if(r!=null && r.mate!=null){ 4347 Unit u2=(Unit)r.mate.obj; 4348 assert(u2!=this); 4349 assert(u2.valid()); 4350 if(!u2.clustered()){ 4351 u2.setClustered(true); 4352 cluster.add(u2); 4353 added++; 4354 } 4355 } 4356 4357 if(overlapList!=null){ 4358 for(Overlap o : overlapList){ 4359 Unit u2=(o.u1==this ? o.u2 : o.u1); 4360 assert(o.u1==this || o.u2==this); 4361 assert(u2!=this); 4362 assert(u2.valid()); 4363 if(!u2.clustered()){ 4364 u2.setClustered(true); 4365 cluster.add(u2); 4366 added++; 4367 } 4368 } 4369 } 4370 return added; 4371 } 4372 4373 public boolean isTransitive(){ 4374 assert(valid()); 4375 if(overlapList==null || overlapList.size()==0){return true;} 4376 for(Overlap o : overlapList){ 4377 assert(o.u1==this || o.u2==this); 4378 Unit u2=(o.u1==this ? o.u2 : o.u1); 4379 assert(u2!=this); 4380 if(u2.overlapList==null){ 4381 return false; 4382 }else{ 4383 boolean found=false; 4384 for(Overlap o2 : u2.overlapList){ 4385 if(o2.u1==this || o2.u2==this){ 4386 found=true; break; 4387 } 4388 } 4389 if(!found){return false;} 4390 } 4391 } 4392 return true; 4393 } 4394 4395 public boolean isPerfectlyTransitive(){ 4396 assert(valid()); 4397 if(overlapList==null || overlapList.size()==0){return true;} 4398 for(Overlap o : overlapList){ 4399 assert(o.u1==this || o.u2==this); 4400 Unit u2=(o.u1==this ? o.u2 : o.u1); 4401 assert(u2!=this); 4402 if(u2.overlapList==null){ 4403 return false; 4404 }else{ 4405 boolean found=false; 4406 for(Overlap o2 : u2.overlapList){ 4407 if(o2==o){ 4408 found=true; break; 4409 } 4410 } 4411 if(!found){return false;} 4412 } 4413 } 4414 return true; 4415 } 4416 4417 public boolean isNonRedundant(){ 4418 assert(valid()); 4419 if(overlapList==null || overlapList.size()==0){return true;} 4420 for(int i=0; i<overlapList.size(); i++){ 4421 Overlap a=overlapList.get(i); 4422 for(int j=0; j<overlapList.size(); j++){ 4423 Overlap b=overlapList.get(j); 4424 if((i==j)!=(a.equals(b))){ 4425 return false; 4426 } 4427 } 4428 } 4429 return true; 4430 } 4431 4432 public boolean contains(Unit u2, int loc, LongM key, BandedAligner bandy, int tableNum) { 4433 if(verbose){System.err.println("contains: Considering key "+key+", unit "+u2);} 4434 if(u2.length()>this.length()){return false;} //Smaller things cannot contain larger things 4435 if(u2.length()==this.length()){//For equal size, only one can contain the other the other 4436 boolean pass=false; 4437 if(!pass && u2.unitID>this.unitID){return false;}else if(u2.unitID<this.unitID){pass=true;} 4438 if(!pass && u2.r.numericID>this.r.numericID){return false;}else if(u2.r.numericID<this.r.numericID){pass=true;} 4439 if(!pass && u2.code1>this.code1){return false;}else if(u2.code1<this.code1){pass=true;} 4440 if(!pass && u2.r.bases.hashCode()>this.r.bases.hashCode()){return false;}else if(u2.r.bases.hashCode()<this.r.bases.hashCode()){pass=true;} 4441 if(!pass){return false;} 4442 } 4443 if(minLengthPercent>0 && (u2.length()*100f/length())<minLengthPercent){return false;} 4444 assert(u2.code1!=code1 || u2.code2!=code2 || u2.length()!=length() || (r!=null && r.mate!=null) || //REQUIRE_MATCHING_NAMES || 4445 (canonical()==u2.canonical() ? (u2.prefixes[0]!=prefixes[0] && u2.suffixes[0]!=suffixes[0]) : (u2.prefixes[0]!=suffixes[0] && u2.suffixes[0]!=prefixes[0]))) : 4446 "Collision? \n"+this+"\n"+u2+"\n"+r+"\n"+u2.r; 4447 4448 final boolean earlyExit=(tableNum==0); 4449 final int x=(tableNum+1); 4450 final int ktn=k*tableNum; 4451 4452 if(key.value()==u2.prefixes[tableNum]){ 4453 if(verbose){System.err.println("Containment A"+x);} 4454 if(containsForward(u2, loc-k2-ktn, bandy, earlyExit) || containsReverseRC(u2, loc+ktn, bandy, earlyExit)){return true;} 4455 } 4456 if(key.value()==u2.suffixes[tableNum]){ 4457 if(verbose){System.err.println("Containment B"+x);} 4458 if(containsReverse(u2, loc+ktn, bandy, earlyExit) || containsForwardRC(u2, loc-k2-ktn, bandy, earlyExit)){return true;} 4459 } 4460 return false; 4461 } 4462 4463 private boolean containsForward(Unit u2, int start, BandedAligner bandy, boolean earlyExit) { 4464 if(start+u2.length()>length() || start<0 || start>=length()){return false;} 4465 // if(true){return false;} 4466 if(u2.r!=null){ 4467 final byte[] a=bases(), b=u2.bases(); 4468 int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length); 4469 4470 for(int i=start, j=0; j<b.length; i++, j++){ 4471 byte aa=a[i]; 4472 byte bb=b[j]; 4473 if(aa!=bb){ 4474 if(exact || (AminoAcid.isFullyDefined(aa) && AminoAcid.isFullyDefined(bb))){ 4475 if(earlyExit && j<k2){return false;} 4476 if((mismatches=mismatches+1)>maxMismatches){ 4477 if(bandy==null || maxEdits<1){return false;} 4478 int edits=bandy.alignForward(b, a, 0, start, maxEdits, exact); 4479 assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits; 4480 return edits<=maxEdits && bandy.score()>4*edits; 4481 } 4482 } 4483 } 4484 } 4485 return true; 4486 }else{ 4487 assert(false) : "TODO: Verify by hashing and checking both tips"; 4488 return false; 4489 } 4490 } 4491 4492 private boolean containsForwardRC(Unit u2, int start, BandedAligner bandy, boolean earlyExit) { 4493 if(ignoreReverseComplement){return false;} 4494 if(start+u2.length()>length() || start<0 || start>=length()){return false;} 4495 // if(true){return false;} 4496 if(u2.r!=null){ 4497 final byte[] a=bases(), b=u2.bases(); 4498 int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length); 4499 4500 for(int i=start, j=b.length-1, iprefix=start+k2; j>=0; i++, j--){ 4501 byte aa=a[i]; 4502 byte bb=baseToComplementExtended[b[j]]; 4503 if(aa!=bb){ 4504 if(exact || (AminoAcid.isFullyDefined(aa) && AminoAcid.isFullyDefined(bb))){ 4505 if(earlyExit && i<iprefix){return false;} 4506 if((mismatches=mismatches+1)>maxMismatches){ 4507 if(bandy==null || maxEdits<1){return false;} 4508 int edits=bandy.alignForwardRC(b, a, b.length-1, start, maxEdits, exact); 4509 assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits; 4510 return edits<=maxEdits && bandy.score()>4*edits; 4511 } 4512 } 4513 } 4514 } 4515 return true; 4516 }else{ 4517 assert(false) : "TODO: Verify by hashing and checking both tips"; 4518 return false; 4519 } 4520 } 4521 4522 private boolean containsReverse(Unit u2, int start, BandedAligner bandy, boolean earlyExit) { 4523 if(start+1<u2.length() || start<0 || start>=length()){return false;} 4524 // if(true){return false;} 4525 if(u2.r!=null){ 4526 final byte[] a=bases(), b=u2.bases(); 4527 int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length); 4528 4529 for(int i=start, j=b.length-1, iprefix=start-k2; j>=0; i--, j--){ 4530 byte aa=a[i]; 4531 byte bb=b[j]; 4532 if(aa!=bb){ 4533 if(exact || (AminoAcid.isFullyDefined(aa) && AminoAcid.isFullyDefined(bb))){ 4534 if(earlyExit && i>iprefix){return false;} 4535 if((mismatches=mismatches+1)>maxMismatches){ 4536 if(bandy==null || maxEdits<1){return false;} 4537 int edits=bandy.alignReverse(b, a, b.length-1, start, maxEdits, exact); 4538 assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits; 4539 return edits<=maxEdits && bandy.score()>4*edits; 4540 } 4541 } 4542 } 4543 } 4544 return true; 4545 }else{ 4546 assert(false) : "TODO: Verify by hashing and checking both tips"; 4547 return false; 4548 } 4549 } 4550 4551 private boolean containsReverseRC(Unit u2, int start, BandedAligner bandy, boolean earlyExit) { 4552 if(ignoreReverseComplement){return false;} 4553 if(start+1<u2.length() || start<0 || start>=length()){return false;} 4554 // if(true){return false;} 4555 if(u2.r!=null){ 4556 final byte[] a=bases(), b=u2.bases(); 4557 int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length); 4558 4559 for(int i=start, j=0; j<b.length; i--, j++){ 4560 byte aa=a[i]; 4561 byte bb=baseToComplementExtended[b[j]]; 4562 if(aa!=bb){ 4563 if(exact || (AminoAcid.isFullyDefined(aa) && AminoAcid.isFullyDefined(bb))){ 4564 if(earlyExit && j<k2){return false;} 4565 if((mismatches=mismatches+1)>maxMismatches){ 4566 if(bandy==null || maxEdits<1){return false;} 4567 int edits=bandy.alignReverseRC(b, a, 0, start, maxEdits, exact); 4568 assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits; 4569 return edits<=maxEdits && bandy.score()>4*edits; 4570 } 4571 } 4572 } 4573 } 4574 return true; 4575 }else{ 4576 assert(false) : "TODO: Verify by hashing and checking both tips"; 4577 return false; 4578 } 4579 } 4580 4581 4582 public boolean depthCongruent(int aa, int bb){ 4583 if(aa<5 && bb<5){return true;} 4584 final int a=Tools.max(1, Tools.min(aa, bb)); 4585 final int b=Tools.max(aa, bb); 4586 return a*depthRatio>=b; 4587 } 4588 4589 public boolean overlaps(Unit u2, int loc, LongM key, BandedAligner bandy, int tableNum, int editLimit) { 4590 // return makeOverlap(u2, loc, key, bandy, earlyExit)!=null; 4591 4592 // assert(false) : "TODO"; 4593 if(verbose){System.err.println("overlaps: Considering key "+key+", unit "+u2);} 4594 if(parseDepth && !depthCongruent(depth, u2.depth)){return false;} 4595 if(minLengthPercent>0){ 4596 final int len1=length(), len2=u2.length(); 4597 if(Tools.min(len1, len2)*100f/Tools.max(len1, len2)<minLengthPercent){return false;} 4598 } 4599 assert(u2.code1!=code1 || u2.code2!=code2 || u2.length()!=length() || 4600 (canonical()==u2.canonical() ? (u2.prefixes[0]!=prefixes[0] && u2.suffixes[0]!=suffixes[0]) : (u2.prefixes[0]!=suffixes[0] && u2.suffixes[0]!=prefixes[0]))) : 4601 "Collision? \n"+this+"\n"+u2+"\n"+r+"\n"+u2.r; 4602 4603 final boolean earlyExit=(tableNum==0); 4604 final int x=(tableNum+1); 4605 final int ktn=k*tableNum; 4606 4607 if(key.value()==u2.prefixes[tableNum]){ 4608 if(verbose){System.err.println("Testing overlaps A"+x);} 4609 if(overlapsForward(u2, loc-k2-ktn, 0, bandy, earlyExit, editLimit)){ 4610 if(verbose){System.err.println("Found Overlap A"+x+"F");} 4611 return true; 4612 } 4613 if(overlapsReverseRC(u2, loc+ktn, 0, bandy, earlyExit, editLimit)){ 4614 if(verbose){System.err.println("Found Overlap A"+x+"R");} 4615 return true; 4616 } 4617 if(verbose){System.err.println("No Overlap.");} 4618 } 4619 4620 if(key.value()==u2.suffixes[tableNum]){ 4621 if(verbose){System.err.println("Testing overlaps B"+x);} 4622 if(overlapsForwardRC(u2, loc-k2-ktn, u2.length()-1, bandy, earlyExit, editLimit)){ 4623 if(verbose){System.err.println("Found Overlap B"+x+"F");} 4624 return true; 4625 } 4626 if(overlapsReverse(u2, loc+ktn, u2.length()-1, bandy, earlyExit, editLimit)){ 4627 if(verbose){System.err.println("Found Overlap B"+x+"R");} 4628 return true; 4629 } 4630 if(verbose){System.err.println("No Overlap.");} 4631 } 4632 4633 return false; 4634 } 4635 4636 /** 4637 * @param u2 4638 * @param loc 4639 * @param key 4640 * @return 4641 */ 4642 protected Overlap makeOverlap(Unit u2, int loc, LongM key, BandedAligner bandy, int tableNum) { 4643 if(verbose){System.err.println("makeOverlap: Considering key "+key+", unit "+u2);} 4644 if(parseDepth && !depthCongruent(depth, u2.depth)){return null;} 4645 if(minLengthPercent>0){ 4646 final int len1=length(), len2=u2.length(); 4647 if(Tools.min(len1, len2)*100f/Tools.max(len1, len2)<minLengthPercent){return null;} 4648 } 4649 assert(u2.code1!=code1 || u2.code2!=code2 || u2.length()!=length() || (r!=null && r.mate!=null) || 4650 (canonical()==u2.canonical() ? (u2.prefixes[0]!=prefixes[0] && u2.suffixes[0]!=suffixes[0]) : (u2.prefixes[0]!=suffixes[0] && u2.suffixes[0]!=prefixes[0]))) : 4651 "Collision? \n"+this+"\n"+u2+"\n"+r+"\n"+u2.r; 4652 4653 final boolean earlyExit=(tableNum==0); 4654 final int x=(tableNum+1); 4655 final int ktn=k*tableNum; 4656 4657 Overlap o=null; 4658 if(key.value()==u2.prefixes[tableNum]){ 4659 if(verbose){System.err.println("\nTesting makeOverlap A"+x+"F");} 4660 if((o=makeOverlapForward(u2, loc-k2-ktn, bandy, earlyExit))!=null){ 4661 if(verbose){System.err.println("Made Overlap A"+x+"F");} 4662 return o; 4663 } 4664 if(verbose){System.err.println("\nTesting makeOverlap A"+x+"R");} 4665 if((o=makeOverlapReverseRC(u2, loc+ktn, bandy, earlyExit))!=null){ 4666 if(verbose){System.err.println("Made Overlap A"+x+"R");} 4667 return o; 4668 } 4669 if(verbose){System.err.println("No Overlap.");} 4670 } 4671 if(key.value()==u2.suffixes[tableNum]){ 4672 if(verbose){System.err.println("\nTesting makeOverlap B"+x+"F");} 4673 if((o=makeOverlapForwardRC(u2, loc-k2-ktn, bandy, earlyExit))!=null){ 4674 if(verbose){System.err.println("Made Overlap B"+x+"F");} 4675 return o; 4676 } 4677 if(verbose){System.err.println("\nTesting makeOverlap B"+x+"R");} 4678 if((o=makeOverlapReverse(u2, loc+ktn, bandy, earlyExit))!=null){ 4679 if(verbose){System.err.println("Made Overlap B"+x+"R");} 4680 return o; 4681 } 4682 if(verbose){System.err.println("No Overlap.");} 4683 } 4684 return o; 4685 } 4686 4687 private boolean overlapsForward(Unit u2, int start1, int start2, BandedAligner bandy, boolean earlyExit, int maxEdits) { 4688 if(verbose){System.err.println("overlapsForward(u1="+this.name()+", u2="+u2.name()+", start1="+start1+", start2="+start2+", earlyExit="+earlyExit+")");} 4689 4690 final int len1=length(), len2=u2.length(); 4691 if(start1<0){ 4692 start2-=start1; 4693 start1=0; 4694 if(verbose){System.err.println("Modified: start1="+start1+", start2="+start2);} 4695 } 4696 int overlapLength=Tools.min(len1-start1, len2-start2); 4697 int overlapLength2=Tools.max(len1-start1, len2-start2); 4698 int stop1=start1+overlapLength-1, stop2=start2+overlapLength-1; 4699 if(verbose){System.err.println("Calculated stop1="+stop1+", stop2="+stop2+", overlapLength="+overlapLength);} 4700 4701 if(!allowAllContainedOverlaps || overlapLength>Tools.min(len1, len2)){ 4702 if(verbose){ 4703 System.err.println("Side block. allowAllContainedOverlaps="+allowAllContainedOverlaps+", minOverlapCluster="+minOverlapCluster); 4704 System.err.println("start1="+start1+", stop1="+stop1+", len1="+len1+", start2="+start2+", stop2="+stop2+", len2="+len2+ 4705 ", overlapLen="+overlapLength+", maxEdits="+maxEdits); 4706 } 4707 if(overlapLength2<minOverlapCluster){return false;} 4708 if(minOverlapPercentCluster>0f && (overlapLength2*100f/Tools.min(len1, len2))<minOverlapPercentCluster){return false;} 4709 } 4710 4711 final byte[] a=bases(), b=u2.bases(); 4712 assert(a!=null && b!=null) : "Null bases for "+code1+" or "+u2.code1; 4713 int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, overlapLength); 4714 4715 if(verbose){ 4716 System.err.println("start1="+start1+", stop1="+stop1+", len1="+len1+", start2="+start2+", stop2="+stop2+", len2="+len2+ 4717 ", overlapLen="+overlapLength+", maxMismatches="+maxMismatches+", maxEdits="+maxEdits); 4718 } 4719 4720 for(int i=start1, j=start2; j<=stop2; i++, j++){ 4721 byte aa=a[i]; 4722 byte bb=b[j]; 4723 if(aa!=bb){ 4724 if(exact || (AminoAcid.isFullyDefined(aa) && AminoAcid.isFullyDefined(bb))){ 4725 if(earlyExit && j<k2){return false;} 4726 mismatches++; 4727 if(verbose){System.err.println("i="+i+", j="+j+", "+(char)aa+"!="+(char)bb+", mismatches="+mismatches+"/"+maxMismatches);} 4728 if(mismatches>maxMismatches){ 4729 if(bandy==null || maxEdits<1){return false;} 4730 if(verbose){System.err.println("Mismatches exceeded maximum, attempting banded alignment.");} 4731 int edits=bandy.alignForward(b, a, 0, start1, maxEdits, exact); 4732 assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits; 4733 stop2=bandy.lastQueryLoc; 4734 stop1=bandy.lastRefLoc; 4735 return edits<=maxEdits && bandy.score()>2*edits; //Set at 2*edits instead of 4*edits to prevent assertion errors when reversing alignment 4736 } 4737 } 4738 } 4739 } 4740 return true; 4741 } 4742 4743 private boolean overlapsForwardRC(Unit u2, int start1, int start2, BandedAligner bandy, boolean earlyExit, int maxEdits) { 4744 if(verbose){System.err.println("overlapsForwardRC(u1="+this.name()+", u2="+u2.name()+", start1="+start1+", start2="+start2+", earlyExit="+earlyExit+")");} 4745 4746 if(ignoreReverseComplement){return false;} 4747 final int len1=length(), len2=u2.length(); 4748 if(start1<0){ 4749 start2+=start1; 4750 start1=0; 4751 if(verbose){System.err.println("Modified: start1="+start1+", start2="+start2);} 4752 } 4753 final int overlapLength=Tools.min(len1-start1, start2+1); 4754 final int overlapLength2=Tools.max(len1-start1, start2+1); 4755 int stop1=start1+overlapLength-1, stop2=start2-overlapLength+1; 4756 if(verbose){System.err.println("Calculated stop1="+stop1+", stop2="+stop2+", overlapLength="+overlapLength);} 4757 4758 if(!allowAllContainedOverlaps || overlapLength>Tools.min(len1, len2)){ 4759 if(overlapLength2<minOverlapCluster){return false;} 4760 if(minOverlapPercentCluster>0f && (overlapLength2*100f/Tools.min(len1, len2))<minOverlapPercentCluster){return false;} 4761 } 4762 4763 final byte[] a=bases(), b=u2.bases(); 4764 assert(a!=null && b!=null) : "Null bases for "+code1+" or "+u2.code1; 4765 int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length); 4766 4767 if(verbose){ 4768 System.err.println("start1="+start1+", stop1="+stop1+", len1="+len1+", start2="+start2+", stop2="+stop2+", len2="+len2+ 4769 ", overlapLen="+overlapLength+", maxMismatches="+maxMismatches+", maxEdits="+maxEdits); 4770 } 4771 4772 for(int i=start1, j=start2, iprefix=start1+k2; i<=stop1; i++, j--){ 4773 byte aa=a[i]; 4774 byte bb=baseToComplementExtended[b[j]]; 4775 if(aa!=bb){ 4776 if(exact || (AminoAcid.isFullyDefined(aa) && AminoAcid.isFullyDefined(bb))){ 4777 if(earlyExit && i<iprefix){return false;} 4778 mismatches++; 4779 if(verbose){System.err.println("i="+i+", j="+j+", "+(char)aa+"!="+(char)bb+", mismatches="+mismatches+"/"+maxMismatches);} 4780 if(mismatches>maxMismatches){ 4781 if(bandy==null || maxEdits<1){return false;} 4782 if(verbose){System.err.println("Mismatches exceeded maximum, attempting banded alignment.");} 4783 int edits=bandy.alignForwardRC(b, a, b.length-1, start1, maxEdits, exact); 4784 assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits; 4785 stop2=bandy.lastQueryLoc; 4786 stop1=bandy.lastRefLoc; 4787 return edits<=maxEdits && bandy.score()>2*edits; //Set at 2*edits instead of 4*edits to prevent assertion errors when reversing alignment 4788 } 4789 } 4790 } 4791 } 4792 return true; 4793 } 4794 4795 private boolean overlapsReverse(Unit u2, int start1, int start2, BandedAligner bandy, boolean earlyExit, int maxEdits) { 4796 if(verbose){System.err.println("overlapsReverse(u1="+this.name()+", u2="+u2.name()+", start1="+start1+", start2="+start2+", earlyExit="+earlyExit+")");} 4797 4798 final int len1=length(), len2=u2.length(); 4799 if(start1>=len1){ 4800 start2-=(start1-len1+1); 4801 start1=len1-1; 4802 if(verbose){System.err.println("Modified: start1="+start1+", start2="+start2);} 4803 } 4804 final int overlapLength=Tools.min(start1+1, start2+1); 4805 final int overlapLength2=Tools.max(start1+1, start2+1); 4806 int stop1=start1-overlapLength+1, stop2=start2-overlapLength+1; 4807 if(verbose){System.err.println("Calculated stop1="+stop1+", stop2="+stop2+", overlapLength="+overlapLength);} 4808 4809 if(!allowAllContainedOverlaps || overlapLength>Tools.min(len1, len2)){ 4810 if(overlapLength2<minOverlapCluster){return false;} 4811 if(minOverlapPercentCluster>0f && (overlapLength2*100f/Tools.min(len1, len2))<minOverlapPercentCluster){return false;} 4812 } 4813 4814 final byte[] a=bases(), b=u2.bases(); 4815 assert(a!=null && b!=null) : "Null bases for "+code1+" or "+u2.code1; 4816 int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length); 4817 4818 if(verbose){ 4819 System.err.println("start1="+start1+", stop1="+stop1+", len1="+len1+", start2="+start2+", stop2="+stop2+", len2="+len2+ 4820 ", overlapLen="+overlapLength+", maxMismatches="+maxMismatches+", maxEdits="+maxEdits); 4821 } 4822 4823 for(int i=start1, j=start2, iprefix=start1-k2; i>=stop1; i--, j--){ 4824 byte aa=a[i]; 4825 byte bb=b[j]; 4826 if(aa!=bb){ 4827 if(exact || (AminoAcid.isFullyDefined(aa) && AminoAcid.isFullyDefined(bb))){ 4828 if(earlyExit && i>iprefix){return false;} 4829 mismatches++; 4830 if(verbose){System.err.println("i="+i+", j="+j+", "+(char)aa+"!="+(char)bb+", mismatches="+mismatches+"/"+maxMismatches);} 4831 if(mismatches>maxMismatches){ 4832 if(bandy==null || maxEdits<1){return false;} 4833 if(verbose){System.err.println("Mismatches exceeded maximum, attempting banded alignment.");} 4834 int edits=bandy.alignReverse(b, a, b.length-1, start1, maxEdits, exact); 4835 assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits; 4836 stop2=bandy.lastQueryLoc; 4837 stop1=bandy.lastRefLoc; 4838 return edits<=maxEdits && bandy.score()>2*edits; //Set at 2*edits instead of 4*edits to prevent assertion errors when reversing alignment 4839 } 4840 } 4841 } 4842 } 4843 return true; 4844 } 4845 4846 private boolean overlapsReverseRC(Unit u2, int start1, int start2, BandedAligner bandy, boolean earlyExit, int maxEdits) { 4847 if(verbose){System.err.println("overlapsReverseRC(u1="+this.name()+", u2="+u2.name()+", start1="+start1+", start2="+start2+", earlyExit="+earlyExit+")");} 4848 4849 if(ignoreReverseComplement){return false;} 4850 final int len1=length(), len2=u2.length(); 4851 if(start1>=len1){ 4852 start2+=(start1-len1+1); 4853 start1=len1-1; 4854 if(verbose){System.err.println("Modified: start1="+start1+", start2="+start2);} 4855 } 4856 final int overlapLength=Tools.min(start1+1, len2-start2); 4857 final int overlapLength2=Tools.max(start1+1, len2-start2); 4858 int stop1=start1-overlapLength+1, stop2=start2+overlapLength-1; 4859 if(verbose){System.err.println("Calculated stop1="+stop1+", stop2="+stop2+", overlapLength="+overlapLength);} 4860 4861 if(!allowAllContainedOverlaps || overlapLength>Tools.min(len1, len2)){ 4862 if(overlapLength2<minOverlapCluster){return false;} 4863 if(minOverlapPercentCluster>0f && (overlapLength2*100f/Tools.min(len1, len2))<minOverlapPercentCluster){return false;} 4864 } 4865 4866 final byte[] a=bases(), b=u2.bases(); 4867 assert(a!=null && b!=null) : "Null bases for "+code1+" or "+u2.code1; 4868 int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length); 4869 4870 if(verbose){ 4871 System.err.println("start1="+start1+", stop1="+stop1+", len1="+len1+", start2="+start2+", stop2="+stop2+", len2="+len2+ 4872 ", overlapLen="+overlapLength+", maxMismatches="+maxMismatches+", maxEdits="+maxEdits); 4873 } 4874 4875 for(int i=start1, j=start2; j<=stop2; i--, j++){ 4876 byte aa=a[i]; 4877 byte bb=baseToComplementExtended[b[j]]; 4878 if(aa!=bb){ 4879 if(exact || (AminoAcid.isFullyDefined(aa) && AminoAcid.isFullyDefined(bb))){ 4880 if(earlyExit && j<k2){return false;} 4881 mismatches++; 4882 if(verbose){System.err.println("i="+i+", j="+j+", "+(char)aa+"!="+(char)bb+", mismatches="+mismatches+"/"+maxMismatches);} 4883 if(mismatches>maxMismatches){ 4884 if(bandy==null || maxEdits<1){return false;} 4885 if(verbose){System.err.println("Mismatches exceeded maximum, attempting banded alignment.");} 4886 int edits=bandy.alignReverseRC(b, a, 0, start1, maxEdits, exact); 4887 assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits; 4888 stop2=bandy.lastQueryLoc; 4889 stop1=bandy.lastRefLoc; 4890 return edits<=maxEdits && bandy.score()>2*edits; //Set at 2*edits instead of 4*edits to prevent assertion errors when reversing alignment 4891 } 4892 } 4893 } 4894 } 4895 return true; 4896 } 4897 4898 4899 4900 private Overlap makeOverlapForward(Unit u2, int start1, BandedAligner bandy, boolean earlyExit) { 4901 if(verbose){System.err.println("makeOverlapForward(u1="+this.name()+", u2="+u2.name()+", start="+start1+", earlyExit="+earlyExit+")");} 4902 final int len1=length(), len2=u2.length(); 4903 int start2=0; 4904 if(start1<0){ 4905 start2-=start1; 4906 start1=0; 4907 } 4908 final int overlapLength=Tools.min(len1-start1, len2-start2); 4909 final int overlapLength2=Tools.max(len1-start1, len2-start2); 4910 int stop1=start1+overlapLength-1, stop2=start2+overlapLength-1; 4911 if(verbose){System.err.println("Calculated stop1="+stop1+", stop2="+stop2+", overlapLength="+overlapLength);} 4912 4913 if(!allowAllContainedOverlaps || overlapLength>Tools.min(len1, len2)){ 4914 if(overlapLength<minOverlapCluster){return null;} 4915 if(minOverlapPercentCluster>0f && (overlapLength*100f/Tools.min(len1, len2))<minOverlapPercentCluster){return null;} 4916 } 4917 4918 final byte[] a=bases(), b=u2.bases(); 4919 assert(a!=null && b!=null) : "Null bases for "+code1+" or "+u2.code1; 4920 int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, overlapLength); 4921 4922 if(verbose){ 4923 System.err.println("start1="+start1+", stop1="+stop1+", len1="+len1+", start2="+start2+", stop2="+stop2+", len2="+len2+ 4924 ", overlapLen="+overlapLength+", maxMismatches="+maxMismatches+", maxEdits="+maxEdits); 4925 } 4926 4927 for(int i=start1, j=start2; j<=stop2; i++, j++){ 4928 byte aa=a[i]; 4929 byte bb=b[j]; 4930 if(aa!=bb){ 4931 if(exact || (AminoAcid.isFullyDefined(aa) && AminoAcid.isFullyDefined(bb))){ 4932 if(earlyExit && j<k2){return null;} 4933 mismatches++; 4934 if(verbose){System.err.println("i="+i+", j="+j+", "+(char)aa+"!="+(char)bb+", mismatches="+mismatches+"/"+maxMismatches);} 4935 if(mismatches>1 && bandy!=null){ 4936 if(maxEdits<1){return null;} 4937 if(verbose){System.err.println("mismatches exceeded 1, attempting banded alignment.");} 4938 int edits=bandy.alignForward(b, a, start2, start1, maxEdits, exact); 4939 if(edits>maxEdits || bandy.score()<=4*edits){ 4940 if(verbose){System.err.println((edits>maxEdits ? "Too many edits" : "Alignment score too low")+"; returning null.");} 4941 return null; 4942 } 4943 assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits; 4944 stop2=bandy.lastQueryLoc; 4945 stop1=bandy.lastRefLoc; 4946 // if(bandy.lastOffset>0){//Ref longer than query 4947 // for(int k=0; k<bandy.lastOffset; k++){ 4948 // if(stop1+1<=len1){stop1++;} 4949 // else{stop2--;}//I don't think this can happen 4950 // } 4951 // }else if(bandy.lastOffset<0){//Query longer than ref 4952 // for(int k=0; k>bandy.lastOffset; k--){ 4953 // if(stop2+1<=len2){stop2++;} 4954 // else{stop1--;} 4955 // } 4956 // } 4957 return new Overlap(this, u2, FORWARD, start1, start2, stop1, stop2, overlapLength, edits, edits, bandy); 4958 }else if(mismatches>maxMismatches){return null;} 4959 } 4960 } 4961 } 4962 return new Overlap(this, u2, FORWARD, start1, start2, stop1, stop2, overlapLength, mismatches, 0, bandy); 4963 } 4964 4965 private Overlap makeOverlapForwardRC(Unit u2, int start1, BandedAligner bandy, boolean earlyExit) { 4966 if(verbose){System.err.println("makeOverlapForwardRC(u1="+this.name()+", u2="+u2.name()+", start="+start1+", earlyExit="+earlyExit+")");} 4967 if(ignoreReverseComplement){return null;} 4968 final int len1=length(), len2=u2.length(); 4969 int start2=len2-1; 4970 if(start1<0){ 4971 start2+=start1; 4972 start1=0; 4973 } 4974 final int overlapLength=Tools.min(len1-start1, start2+1); 4975 final int overlapLength2=Tools.max(len1-start1, start2+1); 4976 int stop1=start1+overlapLength-1, stop2=start2-overlapLength+1; 4977 if(verbose){System.err.println("Calculated stop1="+stop1+", stop2="+stop2+", overlapLength="+overlapLength);} 4978 4979 if(!allowAllContainedOverlaps || overlapLength>Tools.min(len1, len2)){ 4980 if(overlapLength<minOverlapCluster){return null;} 4981 if(minOverlapPercentCluster>0f && (overlapLength*100f/Tools.min(len1, len2))<minOverlapPercentCluster){return null;} 4982 } 4983 4984 final byte[] a=bases(), b=u2.bases(); 4985 assert(a!=null && b!=null) : "Null bases for "+code1+" or "+u2.code1; 4986 int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length); 4987 4988 for(int i=start1, j=start2, iprefix=start1+k2; i<=stop1; i++, j--){ 4989 byte aa=a[i]; 4990 byte bb=baseToComplementExtended[b[j]]; 4991 if(aa!=bb){ 4992 if(exact || (AminoAcid.isFullyDefined(aa) && AminoAcid.isFullyDefined(bb))){ 4993 if(earlyExit && i<iprefix){return null;} 4994 mismatches++; 4995 if(verbose){System.err.println("i="+i+", j="+j+", "+(char)aa+"!="+(char)bb+", mismatches="+mismatches+"/"+maxMismatches);} 4996 if(mismatches>1 && bandy!=null){ 4997 if(maxEdits<1){return null;} 4998 if(verbose){System.err.println("mismatches exceeded 1, attempting banded alignment.");} 4999 int edits=bandy.alignForwardRC(b, a, start2, start1, maxEdits, exact); 5000 if(edits>maxEdits || bandy.score()<=4*edits){ 5001 if(verbose){System.err.println((edits>maxEdits ? "Too many edits" : "Alignment score too low")+"; returning null.");} 5002 return null; 5003 } 5004 assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits; 5005 stop2=bandy.lastQueryLoc; 5006 stop1=bandy.lastRefLoc; 5007 // if(bandy.lastOffset>0){//Ref longer than query 5008 // for(int k=0; k<bandy.lastOffset; k++){ 5009 // if(stop1+1<=len1){stop1++;} 5010 // else{stop2++;}//I don't think this can happen 5011 // } 5012 // }else if(bandy.lastOffset<0){//Query longer than ref 5013 // for(int k=0; k>bandy.lastOffset; k--){ 5014 // if(stop2>0){stop2--;} 5015 // else{stop1--;} 5016 // } 5017 // } 5018 return new Overlap(this, u2, FORWARDRC, start1, start2, stop1, stop2, overlapLength, edits, edits, bandy); 5019 }else if(mismatches>maxMismatches){return null;} 5020 } 5021 } 5022 } 5023 return new Overlap(this, u2, FORWARDRC, start1, start2, stop1, stop2, overlapLength, mismatches, 0, bandy); 5024 } 5025 5026 private Overlap makeOverlapReverse(Unit u2, int start1, BandedAligner bandy, boolean earlyExit) { 5027 if(verbose){System.err.println("makeOverlapReverse(u1="+this.name()+", u2="+u2.name()+", start="+start1+", earlyExit="+earlyExit+")");} 5028 5029 final int len1=length(), len2=u2.length(); 5030 int start2=len2-1; 5031 if(start1>=len1){ 5032 start2-=(start1-len1+1); 5033 start1=len1-1; 5034 } 5035 final int overlapLength=Tools.min(start1+1, start2+1); 5036 final int overlapLength2=Tools.max(start1+1, start2+1); 5037 int stop1=start1-overlapLength+1, stop2=start2-overlapLength+1; 5038 if(verbose){System.err.println("Calculated stop1="+stop1+", stop2="+stop2+", overlapLength="+overlapLength);} 5039 5040 if(!allowAllContainedOverlaps || overlapLength>Tools.min(len1, len2)){ 5041 if(overlapLength<minOverlapCluster){return null;} 5042 if(minOverlapPercentCluster>0f && (overlapLength*100f/Tools.min(len1, len2))<minOverlapPercentCluster){return null;} 5043 } 5044 5045 final byte[] a=bases(), b=u2.bases(); 5046 assert(a!=null && b!=null) : "Null bases for "+code1+" or "+u2.code1; 5047 int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length); 5048 5049 if(verbose){ 5050 System.err.println("start1="+start1+", stop1="+stop1+", len1="+len1+", start2="+start2+", stop2="+stop2+", len2="+len2+ 5051 ", overlapLen="+overlapLength+", maxMismatches="+maxMismatches+", maxEdits="+maxEdits); 5052 } 5053 5054 for(int i=start1, j=start2, iprefix=start1-k2; i>=stop1; i--, j--){ 5055 byte aa=a[i]; 5056 byte bb=b[j]; 5057 if(aa!=bb){ 5058 if(exact || (AminoAcid.isFullyDefined(aa) && AminoAcid.isFullyDefined(bb))){ 5059 if(earlyExit && i>iprefix){return null;} 5060 mismatches++; 5061 if(verbose){System.err.println("i="+i+", j="+j+", "+(char)aa+"!="+(char)bb+", mismatches="+mismatches+"/"+maxMismatches);} 5062 if(mismatches>1 && bandy!=null){ 5063 if(maxEdits<1){return null;} 5064 if(verbose){System.err.println("mismatches exceeded 1, attempting banded alignment.");} 5065 int edits=bandy.alignReverse(b, a, start2, start1, maxEdits, exact); 5066 if(edits>maxEdits || bandy.score()<=4*edits){ 5067 if(verbose){System.err.println((edits>maxEdits ? "Too many edits" : "Alignment score too low")+"; returning null.");} 5068 return null; 5069 } 5070 assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits; 5071 stop2=bandy.lastQueryLoc; 5072 stop1=bandy.lastRefLoc; 5073 // if(bandy.lastOffset>0){//Ref longer than query 5074 // for(int k=0; k<bandy.lastOffset; k++){ 5075 // if(stop1>0){stop1--;} 5076 // else{stop2++;}//I don't think this can happen 5077 // } 5078 // }else if(bandy.lastOffset<0){//Query longer than ref 5079 // for(int k=0; k>bandy.lastOffset; k--){ 5080 // if(stop2>0){stop2--;} 5081 // else{stop1++;} 5082 // } 5083 // } 5084 return new Overlap(this, u2, REVERSE, start1, start2, stop1, stop2, overlapLength, edits, edits, bandy); 5085 }else if(mismatches>maxMismatches){return null;} 5086 } 5087 } 5088 } 5089 return new Overlap(this, u2, REVERSE, start1, start2, stop1, stop2, overlapLength, mismatches, 0, bandy); 5090 } 5091 5092 private Overlap makeOverlapReverseRC(Unit u2, int start1, BandedAligner bandy, boolean earlyExit) { 5093 if(verbose){System.err.println("makeOverlapReverseRC(u1="+this.name()+", u2="+u2.name()+", start="+start1+", earlyExit="+earlyExit+")");} 5094 if(ignoreReverseComplement){return null;} 5095 final int len1=length(), len2=u2.length(); 5096 int start2=0; 5097 if(start1>=len1){ 5098 start2+=(start1-len1+1); 5099 start1=len1-1; 5100 } 5101 final int overlapLength=Tools.min(start1+1, len2-start2); 5102 final int overlapLength2=Tools.max(start1+1, len2-start2); 5103 int stop1=start1-overlapLength+1, stop2=start2+overlapLength-1; 5104 if(verbose){System.err.println("Calculated stop1="+stop1+", stop2="+stop2+", overlapLength="+overlapLength);} 5105 5106 if(!allowAllContainedOverlaps || overlapLength>Tools.min(len1, len2)){ 5107 if(overlapLength<minOverlapCluster){return null;} 5108 if(minOverlapPercentCluster>0f && (overlapLength*100f/Tools.min(len1, len2))<minOverlapPercentCluster){return null;} 5109 } 5110 5111 final byte[] a=bases(), b=u2.bases(); 5112 assert(a!=null && b!=null) : "Null bases for "+code1+" or "+u2.code1; 5113 int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length); 5114 5115 if(verbose){ 5116 System.err.println("start1="+start1+", stop1="+stop1+", len1="+len1+", start2="+start2+", stop2="+stop2+", len2="+len2+ 5117 ", overlapLen="+overlapLength+", maxMismatches="+maxMismatches+", maxEdits="+maxEdits); 5118 } 5119 5120 for(int i=start1, j=start2; j<=stop2; i--, j++){ 5121 byte aa=a[i]; 5122 byte bb=baseToComplementExtended[b[j]]; 5123 if(aa!=bb){ 5124 if(exact || (AminoAcid.isFullyDefined(aa) && AminoAcid.isFullyDefined(bb))){ 5125 if(earlyExit && j<k2){return null;} 5126 mismatches++; 5127 if(verbose){System.err.println("i="+i+", j="+j+", "+(char)aa+"!="+(char)bb+", mismatches="+mismatches+"/"+maxMismatches);} 5128 if(mismatches>1 && bandy!=null){ 5129 if(maxEdits<1){return null;} 5130 if(verbose){System.err.println("mismatches exceeded 1, attempting banded alignment.");} 5131 int edits=bandy.alignReverseRC(b, a, start2, start1, maxEdits, exact); 5132 if(edits>maxEdits || bandy.score()<=4*edits){ 5133 if(verbose){System.err.println((edits>maxEdits ? "Too many edits" : "Alignment score too low")+"; returning null.");} 5134 return null; 5135 } 5136 assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits; 5137 stop2=bandy.lastQueryLoc; 5138 stop1=bandy.lastRefLoc; 5139 // if(bandy.lastOffset>0){//Ref longer than query 5140 // for(int k=0; k<bandy.lastOffset; k++){ 5141 // if(stop1>0){stop1--;} 5142 // else{stop2--;}//I don't think this can happen 5143 // } 5144 // }else if(bandy.lastOffset<0){//Query longer than ref 5145 // for(int k=0; k>bandy.lastOffset; k--){ 5146 // if(stop2+1<=len2){stop2++;} 5147 // else{stop1++;} 5148 // } 5149 // } 5150 return new Overlap(this, u2, REVERSERC, start1, start2, stop1, stop2, overlapLength, edits, edits, bandy); 5151 }else if(mismatches>maxMismatches){return null;} 5152 } 5153 } 5154 } 5155 return new Overlap(this, u2, REVERSERC, start1, start2, stop1, stop2, overlapLength, mismatches, 0, bandy); 5156 } 5157 5158 @Override 5159 public int compareTo(Unit b) { 5160 int x=comparePairedRC(this, b); 5161 // int y=comparePairedRC(b, this); 5162 // boolean eq1=this.equals(b); 5163 // boolean eq2=b.equals(this); 5164 // 5165 // assert((x==y)==(x==0)) : x+", "+y+"\n"+this+"\n"+b; 5166 // assert((x>0 == y<0) || (x==0 && y==0)) : x+", "+y+"\n"+this+"\n"+b; 5167 // 5168 // assert(eq1==eq2): x+", "+y+"\n"+this+"\n"+b; 5169 // assert(eq1==(x==0)): x+", "+y+"\n"+this+"\n"+b; 5170 // 5171 // assert(eq1 || this!=b); 5172 // 5173 // if(verbose){ //TODO: Remove 5174 // System.err.println(this+"\n"+b+"\n"+this.r.toFastq()+"\n"+this.r.mate.toFastq()+"\n"+b.r.toFastq()+"\n"+b.r.mate.toFastq()+"\n"); 5175 // System.err.println("\n"+x+", "+y+", "+eq1+", "+eq2); 5176 // verbose=false; 5177 // } 5178 5179 return x; 5180 } 5181 5182 @Override 5183 public boolean equals(Object b){return equals((Unit)b);} 5184 public boolean equals(Unit b){ 5185 boolean x=pairedEqualsRC(this, b); 5186 // assert(x==pairedEqualsRC(b, this)); 5187 // assert(x==(comparePairedRC(this, b)==0)); 5188 // assert(x==(comparePairedRC(b, this)==0)); 5189 // assert(x || this!=b); 5190 // System.err.println("\n****EQUALS?****:\n"+this+"\n"+b+"\n**** ****"); //TODO: Remove 5191 return x; 5192 } 5193 5194 @Override 5195 public int hashCode(){ 5196 return (int)((code1^(code1>>>32))&0xFFFFFFFFL); 5197 } 5198 5199 private synchronized void setValid(boolean b){ 5200 assert(b!=valid()); 5201 // if(!b){System.err.println("Setting invalid "+name());} 5202 if(b){flags&=~INVALID_MASK;} 5203 else{flags|=INVALID_MASK;} 5204 assert(b==valid()); 5205 } 5206 5207 private synchronized void setClustered(boolean b){ 5208 assert(b!=clustered()); 5209 if(b){flags|=CLUSTER_MASK;} 5210 else{flags&=~CLUSTER_MASK;} 5211 assert(b==clustered()); 5212 } 5213 5214 private void setVisited(boolean b){ 5215 assert(b!=visited()); 5216 if(b){flags|=VISIT_MASK;} 5217 else{flags&=~VISIT_MASK;} 5218 assert(b==visited()); 5219 } 5220 5221 private synchronized void setCanonical(boolean b){ 5222 assert(b!=canonical()); 5223 if(b){flags|=CANON_MASK;} 5224 else{flags&=~CANON_MASK;} 5225 assert(b==canonical()); 5226 assert(r==null || b==isCanonical(r.bases)); 5227 } 5228 5229 private void setCanonicized(boolean b){ 5230 assert(b!=canonicized()); 5231 if(b){flags|=CANONICIZED_MASK;} 5232 else{flags&=~CANONICIZED_MASK;} 5233 assert(b==canonicized()); 5234 } 5235 5236 private synchronized void setCanonContradiction(boolean b){ 5237 // assert(b!=canonContradiction()); 5238 if(b){flags|=CANON_CONTRADICTION_MASK;} 5239 else{flags&=~CANON_CONTRADICTION_MASK;} 5240 assert(b==canonContradiction()); 5241 } 5242 5243 private synchronized void setOffset(int x){ 5244 offset=x; 5245 setOffsetValid(true); 5246 } 5247 5248 private synchronized void setOffsetValid(boolean b){ 5249 assert(!offsetValid()); 5250 if(b){flags|=OFFSET_VALID_MASK;} 5251 else{flags&=~OFFSET_VALID_MASK;} 5252 assert(b==offsetValid()); 5253 } 5254 5255 private synchronized void setOffsetContradiction(boolean b){ 5256 // assert(b!=offsetContradiction()); 5257 assert(offsetValid()); 5258 if(b){flags|=OFFSET_CONTRADICTION_MASK;} 5259 else{flags&=~OFFSET_CONTRADICTION_MASK;} 5260 assert(b==offsetContradiction()); 5261 } 5262 5263 private void reverseComplement(){ 5264 assert(r!=null); 5265 r.reverseComplement(); 5266 5267 if(prefixes!=null){ 5268 assert(suffixes!=null) : "Can't rcomp with null suffix array."; 5269 for(int i=0; i<prefixes.length; i++){ 5270 long temp=prefixes[i]; 5271 prefixes[i]=suffixes[i]; 5272 suffixes[i]=temp; 5273 } 5274 } 5275 setCanonical(!canonical()); 5276 } 5277 5278 /** Return true if 'this' should be the first Unit in the overlap object */ 5279 public boolean firstInOverlap(Unit u2){ 5280 assert(this!=u2) : "\n"+this.r+"\n"+u2.r; 5281 if(u2.length()!=length()){return u2.length()<length();} 5282 if(u2.code1!=code1){return u2.code1<code1;} 5283 if(u2.code2!=code2){return u2.code2<code2;} 5284 int x=compareTo(u2); 5285 assert(x!=0 || (r!=null && r.mate!=null)); 5286 if(x!=0){return x>=0;} 5287 return r.numericID>=u2.r.numericID; 5288 } 5289 5290 public final boolean inSet(){ 5291 if(subsetCount<2){return true;} 5292 if(r.pairnum()>0){return ((Unit)r.mate.obj).inSet();} 5293 return ((code1&Long.MAX_VALUE)%subsetCount)==subset; 5294 } 5295 5296 public byte[] bases(){return r==null ? null : r.bases;} 5297 5298 public String name(){return r!=null ? r.id : null /*code+""*/;} 5299 @Override 5300 public String toString(){return "("+name()+","+code1+","+code2+","+length()+","+prefixes[0]+","+suffixes[0]+","+(canonical()?"c":"nc")+",d="+depth+")";} 5301 5302 5303 public final Read r; 5304 public final long code1; 5305 public final long code2; 5306 public final long[] prefixes=(numAffixMaps>0 ? new long[numAffixMaps] : null); 5307 public final long[] suffixes=(storeSuffix && numAffixMaps>0 ? new long[numAffixMaps] : null); 5308 5309 /** Distance of leftmost side of this read relative to leftmost side of root. 5310 * Assumes everything is in 'forward' orientation. */ 5311 public int offset=-999999999; 5312 public int depth=1; 5313 // private boolean valid=true; 5314 5315 public int unitID; 5316 5317 public ArrayList<Overlap> overlapList; 5318 5319 private long flags; 5320 /** True if the original read orientation was canonical */ 5321 public final boolean canonical(){return (CANON_MASK&flags)!=0;} 5322 /** True if this contig should be output, false if not */ 5323 public final boolean valid(){return (INVALID_MASK&flags)==0;} 5324 /** Length of this contig */ 5325 public final int length(){return (int)(LEN_MASK&flags);} 5326 /** Position of this contig relative to root */ 5327 public final int offset(){ 5328 assert(offsetValid()); 5329 return offset; 5330 } 5331 public int pairnum(){return (PAIRNUM_MASK&flags)==PAIRNUM_MASK ? 1 : 0;} 5332 5333 public void clearVolatileFlags(){ 5334 flags=flags&~(CANONICIZED_MASK|VISIT_MASK|CANON_CONTRADICTION_MASK|OFFSET_VALID_MASK|OFFSET_CONTRADICTION_MASK); 5335 assert(!visited()); 5336 assert(!canonicized()); 5337 assert(!canonContradiction()); 5338 assert(!offsetValid()); 5339 assert(!offsetContradiction()); 5340 } 5341 5342 public boolean visited(){return (VISIT_MASK&flags)==VISIT_MASK;} 5343 public boolean clustered(){return (CLUSTER_MASK&flags)==CLUSTER_MASK;} 5344 public boolean canonicized(){return (CANONICIZED_MASK&flags)==CANONICIZED_MASK;} 5345 public boolean canonContradiction(){return (CANON_CONTRADICTION_MASK&flags)==CANON_CONTRADICTION_MASK;} 5346 public boolean offsetValid(){return (OFFSET_VALID_MASK&flags)==OFFSET_VALID_MASK;} 5347 public boolean offsetContradiction(){return (OFFSET_CONTRADICTION_MASK&flags)==OFFSET_CONTRADICTION_MASK;} 5348 public boolean contradiction(){return offsetContradiction() || canonContradiction();} 5349 5350 private static final long LEN_MASK=0x7FFFFFFFL; 5351 private static final long CANON_MASK=(1L<<33); 5352 private static final long INVALID_MASK=(1L<<34); 5353 private static final long VISIT_MASK=(1L<<35); 5354 private static final long CLUSTER_MASK=(1L<<36); 5355 private static final long CANONICIZED_MASK=(1L<<37); 5356 private static final long CANON_CONTRADICTION_MASK=(1L<<38); 5357 private static final long OFFSET_VALID_MASK=(1L<<39); 5358 private static final long OFFSET_CONTRADICTION_MASK=(1L<<40); 5359 private static final long PAIRNUM_MASK=(1L<<41); 5360 } 5361 5362 private static final class UnitOffsetComparator implements Comparator<Unit> { 5363 5364 UnitOffsetComparator(){} 5365 5366 /* (non-Javadoc) 5367 * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object) 5368 */ 5369 @Override 5370 public int compare(Unit a, Unit b) { 5371 if(a.offsetValid() && b.offsetValid()){ 5372 int x=a.offset()-b.offset(); 5373 if(x!=0){return x;} 5374 }else{ 5375 if(a.offsetValid()){return -1;} 5376 if(b.offsetValid()){return 1;} 5377 } 5378 return a.compareTo(b); 5379 } 5380 5381 } 5382 5383 private static final class ClusterLengthComparator implements Comparator<ArrayList<Unit>> { 5384 5385 ClusterLengthComparator(){} 5386 5387 /* (non-Javadoc) 5388 * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object) 5389 */ 5390 @Override 5391 public int compare(ArrayList<Unit> a, ArrayList<Unit> b) { 5392 if(a.size()!=b.size()){return b.size()-a.size();} 5393 if(a.isEmpty() && b.isEmpty()){return 0;} 5394 return a.get(0).compareTo(b.get(0)); 5395 } 5396 5397 } 5398 5399 private static final int[] makeNmerIndex(int n){ 5400 final int max=(1<<(2*n))-1; 5401 int[] array=new int[max+1]; 5402 5403 int count=0; 5404 for(int i=0; i<=max; i++){ 5405 final int a=i, b=AminoAcid.reverseComplementBinaryFast(i, n); 5406 int min=Tools.min(a, b); 5407 if(min==a){ 5408 array[a]=array[b]=count; 5409 count++; 5410 } 5411 } 5412 return array; 5413 } 5414 5415 /** Makes a nmer (e.g., tetramer) profile of a cluster */ 5416 private static final float[] makeNmerProfile(ArrayList<Unit> alu, long[] array_){ 5417 final int nbits=2*nmerLength; 5418 final long[] array=(array_==null ? new long[maxNmer+1] : array_); 5419 final int mask=~((-1)<<(nbits)); 5420 5421 long keysCounted=0; 5422 5423 for(Unit u : alu){ 5424 byte[] bases=u.r.bases; 5425 int len=0; 5426 int kmer=0; 5427 for(byte b : bases){ 5428 int x=AminoAcid.baseToNumber[b]; 5429 if(x<0){ 5430 len=0; 5431 kmer=0; 5432 }else{ 5433 kmer=((kmer<<2)|x)&mask; 5434 len++; 5435 if(len>=nmerLength){ 5436 int rkmer=AminoAcid.reverseComplementBinaryFast(kmer, nmerLength); 5437 keysCounted++; 5438 array[nmerIndex[Tools.min(kmer, rkmer)]]++; 5439 } 5440 } 5441 } 5442 } 5443 5444 if(keysCounted==0){keysCounted=1;} 5445 final float mult=1f/keysCounted; 5446 5447 float[] r=new float[array.length]; 5448 for(int i=0; i<array.length; i++){ 5449 r[i]=array[i]*mult; 5450 array[i]=0; 5451 } 5452 return r; 5453 } 5454 5455 private ConcurrentReadInputStream crisa[]; 5456 5457 private final ByteStreamWriter dupeWriter; 5458 5459 private String[] in1=null; 5460 private String[] in2=null; 5461 private String out=null; 5462 private String clusterFilePattern=null; 5463 private String outbest=null; 5464 private String outdupe=null; 5465 private String outcsf=null; 5466 private String outgraph=null; 5467 private int maxNs=-1; 5468 private long maxReads=-1; 5469 public boolean errorState=false; 5470 boolean sort=false; 5471 // boolean ascending=true; 5472 boolean absorbContainment=true; 5473 boolean absorbMatch=true; 5474 boolean findOverlaps=false; 5475 boolean makeClusters=false; 5476 boolean processClusters=false; 5477 boolean renameClusters=false; 5478 boolean absorbOverlap=false; 5479 boolean storeSuffix=true; 5480 boolean storeName=true; 5481 boolean storeQuality=true; 5482 boolean exact=true; 5483 boolean uniqueNames=true; 5484 boolean maxSpanningTree=false; 5485 5486 boolean canonicizeClusters=true; 5487 boolean removeCycles=true; 5488 boolean fixMultiJoins=true; 5489 boolean fixCanonContradictions=false; 5490 boolean fixOffsetContradictions=false; 5491 boolean countTransitive=false; 5492 boolean countRedundant=false; 5493 5494 private boolean multipleInputFiles=false; 5495 private boolean rigorousTransitive=false; 5496 private int numAffixMaps=1; 5497 private int maxAffixCopies=2000000000; 5498 private int maxEdges=2000000000; 5499 private int maxEdges2=2000000000; 5500 private boolean allowAllContainedOverlaps=false; 5501 // private boolean toUpperCase=false; 5502 5503 /** Trim left bases of the read to this position (exclusive, 0-based) */ 5504 private int forceTrimLeft=-1; 5505 /** Trim right bases of the read after this position (exclusive, 0-based) */ 5506 private int forceTrimRight=-1; 5507 5508 private boolean qTrimLeft=false; 5509 private boolean qTrimRight=false; 5510 private float trimQ=6; 5511 /** Error rate for trimming (derived from trimq) */ 5512 private final float trimE; 5513 5514 int maxEdits=0; 5515 int maxSubs=0; 5516 int bandwidth=9; 5517 final boolean customBandwidth; 5518 float minIdentity=100; 5519 float minIdentityMult=0; 5520 float minLengthPercent=0; 5521 int minOverlapCluster=100; 5522 int minOverlapMerge=1; 5523 float minOverlapPercentCluster=0; 5524 float minOverlapPercentMerge=0; 5525 5526 private int minClusterSize=1; 5527 private int minClusterSizeForStats=1; 5528 private boolean pickBestRepresentativePerCluster=false; 5529 5530 long readsProcessed=0; 5531 long basesProcessed=0; 5532 long collisions=0; 5533 long containments=0; 5534 long baseContainments=0; 5535 long containmentCollisions=0; 5536 long matches=0; 5537 long baseMatches=0; 5538 long overlaps=0; 5539 long baseOverlaps=0; 5540 long overlapCollisions=0; 5541 long addedToMain=0; 5542 5543 private final int subset; 5544 private final int subsetCount; 5545 private final boolean subsetMode; 5546 5547 private final int k; 5548 private final int k2; 5549 private final boolean EA=Shared.EA(); 5550 5551 /*--------------------------------------------------------------*/ 5552 /*---------------- Static Fields ----------------*/ 5553 /*--------------------------------------------------------------*/ 5554 5555 private static ReadComparator comparator=ReadLengthComparator.comparator; 5556 5557 private static int tcount=0; 5558 5559 private LinkedHashMap<Long, ArrayList<Unit>> codeMap=new LinkedHashMap<Long, ArrayList<Unit>>(4000000); 5560 // private HashMap<LongM, ArrayList<Unit>> affixMap1=null; 5561 // private HashMap<LongM, ArrayList<Unit>> affixMap2=null; 5562 private HashMap<LongM, ArrayList<Unit>>[] affixMaps=null; 5563 private ArrayDeque<ArrayList<Unit>> clusterQueue=null; 5564 private ArrayList<ArrayList<Unit>> processedClusters=null; 5565 private AtomicIntegerArray clusterNumbers=null; 5566 5567 private static final UnitOffsetComparator UNIT_OFFSET_COMPARATOR=new UnitOffsetComparator(); 5568 private static final ClusterLengthComparator CLUSTER_LENGTH_COMPARATOR=new ClusterLengthComparator(); 5569 private static final long[][] hashcodes=makeCodes2(32); 5570 public static final byte[] baseToNumber=new byte[128]; 5571 public static final byte[] baseToComplementNumber=new byte[128]; 5572 public static final byte[] baseToComplementExtended=new byte[128]; 5573 public static final int nmerLength=4; 5574 public static final int[] nmerIndex=makeNmerIndex(nmerLength); 5575 public static final int maxNmer=Tools.max(nmerIndex); 5576 private static PrintStream outstream=System.err; 5577 /** Permission to overwrite existing files */ 5578 public static boolean overwrite=false; 5579 /** Permission to append to existing files */ 5580 public static boolean append=false; 5581 public static boolean showSpeed=true; 5582 public static boolean verbose=false; 5583 public static boolean ignoreReverseComplement=false; 5584 public static boolean preventTransitiveOverlaps=false; 5585 public static boolean ignoreAffix1=false; 5586 public static boolean parseDepth=false; 5587 public static boolean printLengthInEdges=false; 5588 public static float depthRatio=2; 5589 public static int MINSCAF=0; 5590 public static int THREADS=Shared.threads(); 5591 public static int threadMaxReadsToBuffer=4000; 5592 public static int threadMaxBasesToBuffer=32000000; 5593 public static boolean DISPLAY_PROGRESS=true; 5594 public static boolean UNIQUE_ONLY=false; 5595 public static boolean REQUIRE_MATCHING_NAMES=false; 5596 public static boolean NUMBER_GRAPH_NODES=true; 5597 public static boolean ADD_PAIRNUM_TO_NAME=true; 5598 public static boolean HASH_NS=false; 5599 5600 private static int reverseType(int type){return (type+2)%4;} 5601 public static final int FORWARD=0; 5602 public static final int FORWARDRC=1; 5603 public static final int REVERSE=2; 5604 public static final int REVERSERC=3; 5605 public static final String[] OVERLAP_TYPE_NAMES=new String[] {"FORWARD", "FORWARDRC", "REVERSE", "REVERSERC"}; 5606 public static final String[] OVERLAP_TYPE_ABBREVIATIONS=new String[] {"F", "FRC", "R", "RRC"}; 5607 5608 static{//All others are 0 5609 baseToNumber['A']=baseToNumber['a']=0; 5610 baseToNumber['C']=baseToNumber['c']=1; 5611 baseToNumber['G']=baseToNumber['g']=2; 5612 baseToNumber['T']=baseToNumber['t']=3; 5613 baseToNumber['U']=baseToNumber['u']=3; 5614 5615 baseToComplementNumber['A']=baseToComplementNumber['a']=3; 5616 baseToComplementNumber['C']=baseToComplementNumber['c']=2; 5617 baseToComplementNumber['G']=baseToComplementNumber['g']=1; 5618 baseToComplementNumber['T']=baseToComplementNumber['t']=0; 5619 baseToComplementNumber['U']=baseToComplementNumber['u']=0; 5620 5621 for(int i=0; i<AminoAcid.baseToComplementExtended.length; i++){ 5622 byte b=AminoAcid.baseToComplementExtended[i]; 5623 baseToComplementExtended[i]=(b<0 ? (byte)i : b); 5624 } 5625 } 5626 5627 } 5628