1 package jgi; 2 3 import java.io.File; 4 import java.io.PrintStream; 5 import java.io.Serializable; 6 import java.util.ArrayDeque; 7 import java.util.ArrayList; 8 import java.util.Arrays; 9 import java.util.Comparator; 10 import java.util.HashMap; 11 import java.util.HashSet; 12 import java.util.LinkedHashMap; 13 import java.util.Locale; 14 import java.util.PriorityQueue; 15 import java.util.Random; 16 import java.util.concurrent.atomic.AtomicIntegerArray; 17 18 import align2.BandedAligner; 19 import dna.AminoAcid; 20 import fileIO.ByteFile1; 21 import fileIO.ByteFile2; 22 import fileIO.ByteStreamWriter; 23 import fileIO.FileFormat; 24 import fileIO.ReadWrite; 25 import fileIO.TextStreamWriter; 26 import shared.Parse; 27 import shared.Parser; 28 import shared.PreParser; 29 import shared.ReadStats; 30 import shared.Shared; 31 import shared.Timer; 32 import shared.Tools; 33 import shared.TrimRead; 34 import sort.ReadComparator; 35 import sort.ReadComparatorID; 36 import sort.ReadComparatorName; 37 import sort.ReadLengthComparator; 38 import sort.ReadQualityComparator; 39 import stream.ConcurrentCollectionReadInputStream; 40 import stream.ConcurrentGenericReadInputStream; 41 import stream.ConcurrentReadInputStream; 42 import stream.ConcurrentReadOutputStream; 43 import stream.FASTQ; 44 import stream.FastaReadInputStream; 45 import stream.FastqReadInputStream; 46 import stream.Read; 47 import structures.ListNum; 48 import structures.LongM; 49 50 /** 51 * @author Brian Bushnell 52 * @date Jul 18, 2013 53 * 54 */ 55 public final class Dedupe { 56 main(String[] args)57 public static void main(String[] args){ 58 {//Preparse to see which Dedupe version should be used 59 int nam=1; 60 boolean amino=false; 61 for(int i=0; i<args.length; i++){ 62 final String arg=args[i]; 63 String[] split=arg.split("="); 64 String a=split[0].toLowerCase(); 65 String b=split.length>1 ? split[1] : null; 66 while(a.charAt(0)=='-'){a=a.substring(1);} 67 if(a.equals("nam") || a.equals("numaffixmaps")){ 68 if(b!=null){nam=Integer.parseInt(b);} 69 }else if(a.equals("amino") || a.equals("protein")){ 70 amino=Parse.parseBoolean(b); 71 } 72 } 73 if(amino){ 74 DedupeProtein.main(args); 75 return; 76 } 77 if(nam>2){ 78 Dedupe2.main(args); 79 return; 80 } 81 } 82 83 int rbl0=Shared.bufferLen();; 84 Shared.setBufferLen(16); 85 86 Dedupe dd=new Dedupe(args); 87 dd.process(); 88 89 Shared.setBufferLen(rbl0); 90 91 //Close the print stream if it was redirected 92 Shared.closeStream(outstream); 93 } 94 Dedupe(String[] args)95 public Dedupe(String[] args){ 96 97 {//Preparse block for help, config files, and outstream 98 PreParser pp=new PreParser(args, getClass(), true); 99 args=pp.args; 100 outstream=pp.outstream; 101 } 102 103 ReadWrite.ZIPLEVEL=2; 104 ReadWrite.USE_UNPIGZ=ReadWrite.USE_PIGZ=true; 105 ReadWrite.MAX_ZIP_THREADS=Shared.threads(); 106 Read.TO_UPPER_CASE=true; 107 108 boolean setMcsfs=false; 109 int bandwidth_=-1; 110 int k_=31; 111 int subset_=0, subsetCount_=1; 112 boolean ascending=false; 113 Parser parser=new Parser(); 114 115 for(int i=0; i<args.length; i++){ 116 117 final String arg=args[i]; 118 String[] split=arg.split("="); 119 String a=split[0].toLowerCase(); 120 String b=split.length>1 ? split[1] : null; 121 122 if(Parser.parseZip(arg, a, b)){ 123 //do nothing 124 }else if(Parser.parseCommonStatic(arg, a, b)){ 125 //do nothing 126 }else if(Parser.parseQuality(arg, a, b)){ 127 //do nothing 128 }else if(Parser.parseFasta(arg, a, b)){ 129 //do nothing 130 }else if(parser.parseQTrim(arg, a, b)){ 131 //do nothing 132 }else if(parser.parseInterleaved(arg, a, b)){ 133 //do nothing 134 }else if(a.equals("in") || a.equals("in1")){ 135 if(b!=null && b.indexOf(',')>=0 && !new File(b).exists()){ 136 in1=b.split(","); 137 }else{ 138 in1=new String[] {b}; 139 } 140 }else if(a.equals("in2")){ 141 if(b!=null && b.indexOf(',')>=0 && !new File(b).exists()){ 142 in2=b.split(","); 143 }else{ 144 in2=new String[] {b}; 145 } 146 }else if(a.equals("out") || a.equals("out")){ 147 out=b; 148 }else if(a.equals("out2")){ 149 throw new RuntimeException("Dedupe does not allow 'out2'; for paired reads, output is interleaved."); 150 }else if(a.equals("clusterfilepattern") || a.equals("pattern")){ 151 clusterFilePattern=b; 152 assert(clusterFilePattern==null || clusterFilePattern.contains("%")) : "pattern must contain the % symbol."; 153 }else if(a.equals("outbest")){ 154 outbest=b; 155 }else if(a.equals("outd") || a.equals("outduplicate")){ 156 outdupe=b; 157 }else if(a.equals("csf") || a.equals("clusterstatsfile")){ 158 outcsf=b; 159 }else if(a.equals("dot") || a.equals("graph") || a.equals("outdot") || a.equals("outgraph")){ 160 outgraph=b; 161 }else if(a.equals("mcsfs") || a.equals("minclustersizeforstats")){ 162 minClusterSizeForStats=Integer.parseInt(b); 163 }else if(a.equals("mcs") || a.equals("minclustersize")){ 164 minClusterSize=Integer.parseInt(b); 165 if(!setMcsfs){ 166 minClusterSizeForStats=minClusterSize; 167 } 168 }else if(a.equals("append") || a.equals("app")){ 169 append=ReadStats.append=Parse.parseBoolean(b); 170 }else if(a.equals("overwrite") || a.equals("ow")){ 171 overwrite=Parse.parseBoolean(b); 172 }else if(a.equals("sort")){ 173 if(b==null){sort=true;} 174 else if(b.equalsIgnoreCase("a") || b.equalsIgnoreCase("ascending")){ 175 sort=true; 176 ascending=true; 177 }else if(b.equalsIgnoreCase("d") || b.equalsIgnoreCase("descending")){ 178 sort=true; 179 ascending=false; 180 }else if(b.equalsIgnoreCase("length")){ 181 sort=true; 182 comparator=ReadLengthComparator.comparator; 183 }else if(b.equalsIgnoreCase("quality")){ 184 sort=true; 185 comparator=ReadQualityComparator.comparator; 186 }else if(b.equalsIgnoreCase("name")){ 187 sort=true; 188 comparator=ReadComparatorName.comparator; 189 }else if(b.equalsIgnoreCase("id")){ 190 sort=true; 191 comparator=ReadComparatorID.comparator; 192 }else{ 193 sort=Parse.parseBoolean(b); 194 } 195 }else if(a.equals("ascending")){ 196 ascending=Parse.parseBoolean(b); 197 }else if(a.equals("ordered")){ 198 boolean x=Parse.parseBoolean(b); 199 if(x){ 200 ascending=true; 201 sort=true; 202 comparator=ReadComparatorID.comparator; 203 }else{ 204 //do nothing 205 } 206 }else if(a.equals("arc") || a.equals("absorbrc") || a.equals("trc") || a.equals("testrc")){ 207 ignoreReverseComplement=!Parse.parseBoolean(b); 208 }else if(a.equals("ac") || a.equals("absorbcontainment") || a.equals("absorbcontainments") || a.equals("tc") || a.equals("testcontainment") || a.equals("containment")){ 209 absorbContainment=Parse.parseBoolean(b); 210 }else if(a.equals("am") || a.equals("absorbmatch") || a.equals("absorbmatches") || a.equals("tm") || a.equals("testmatch")){ 211 absorbMatch=Parse.parseBoolean(b); 212 }else if(a.equals("ao") || a.equals("absorboverlap") || a.equals("absorboverlaps") || a.equals("to") || a.equals("testoverlap")){ 213 absorbOverlap=Parse.parseBoolean(b); 214 }else if(a.equals("fo") || a.equals("findoverlap") || a.equals("findoverlaps")){ 215 findOverlaps=Parse.parseBoolean(b); 216 }else if(a.equals("c") || a.equals("cluster") || a.equals("clusters")){ 217 makeClusters=Parse.parseBoolean(b); 218 }else if(a.equals("fmj") || a.equals("fixmultijoin") || a.equals("fixmultijoins")){ 219 fixMultiJoins=Parse.parseBoolean(b); 220 }else if(a.equals("fcc") || a.equals("fixcanoncontradiction") || a.equals("fixcanoncontradictions")){ 221 fixCanonContradictions=Parse.parseBoolean(b); 222 }else if(a.equals("foc") || a.equals("fixoffsetcontradiction") || a.equals("fixoffsetcontradictions")){ 223 fixOffsetContradictions=Parse.parseBoolean(b); 224 }else if(a.equals("pto") || a.equals("preventtransitiveoverlap") || a.equals("preventtransitiveoverlaps")){ 225 preventTransitiveOverlaps=Parse.parseBoolean(b); 226 }else if(a.equals("pbr") || a.equals("pickbestrepresentative")){ 227 pickBestRepresentativePerCluster=Parse.parseBoolean(b); 228 }else if(a.equals("mst") || a.equals("maxspanningtree")){ 229 maxSpanningTree=Parse.parseBoolean(b); 230 }else if(a.equals("cc") || a.equals("canonicizecluster") || a.equals("canonicizeclusters")){ 231 canonicizeClusters=Parse.parseBoolean(b); 232 }else if(a.equals("pc") || a.equals("processcluster") || a.equals("processclusters")){ 233 processClusters=Parse.parseBoolean(b); 234 }else if(a.equals("rnc") || a.equals("renamecluster") || a.equals("renameclusters")){ 235 renameClusters=Parse.parseBoolean(b); 236 if(renameClusters){storeName=false;} 237 }else if(a.equals("rc") || a.equals("removecycles") || a.equals("removecycle")){ 238 removeCycles=Parse.parseBoolean(b); 239 }else if(a.equals("uo") || a.equals("uniqueonly")){ 240 UNIQUE_ONLY=Parse.parseBoolean(b); 241 }else if(a.equals("rmn") || a.equals("requirematchingnames")){ 242 REQUIRE_MATCHING_NAMES=Parse.parseBoolean(b); 243 }else if(a.equals("ngn") || a.equals("numbergraphnodes")){ 244 NUMBER_GRAPH_NODES=Parse.parseBoolean(b); 245 }else if(a.equals("addpairnum")){ 246 ADD_PAIRNUM_TO_NAME=Parse.parseBoolean(b); 247 }else if(a.equals("hashns")){ 248 HASH_NS=Parse.parseBoolean(b); 249 }else if(a.equals("pn") || a.equals("prefixname")){ 250 // PREFIX_NAME=Parse.parseBoolean(b); 251 }else if(a.equals("k")){ 252 k_=Integer.parseInt(b); 253 assert(k_>0 && k_<32) : "k must be between 1 and 31; default is 31, and lower values are slower."; 254 }else if(a.equals("minscaf") || a.equals("ms")){ 255 MINSCAF=FastaReadInputStream.MIN_READ_LEN=Integer.parseInt(b); 256 }else if(a.equals("mlp") || a.equals("minlengthpercent")){ 257 minLengthPercent=Float.parseFloat(b); 258 }else if(a.equals("mop") || a.equals("minoverlappercent")){ 259 minOverlapPercentCluster=minOverlapPercentMerge=Float.parseFloat(b); 260 }else if(a.equals("mopc") || a.equals("minoverlappercentcluster")){ 261 minOverlapPercentCluster=Float.parseFloat(b); 262 }else if(a.equals("mopm") || a.equals("minoverlappercentmerge")){ 263 minOverlapPercentMerge=Float.parseFloat(b); 264 }else if(a.equals("mo") || a.equals("minoverlap")){ 265 minOverlapCluster=minOverlapMerge=Integer.parseInt(b); 266 }else if(a.equals("moc") || a.equals("minoverlapcluster")){ 267 minOverlapCluster=Integer.parseInt(b); 268 }else if(a.equals("mom") || a.equals("minoverlapmerge")){ 269 minOverlapMerge=Integer.parseInt(b); 270 }else if(a.equals("rt") || a.equals("rigoroustransitive")){ 271 rigorousTransitive=Parse.parseBoolean(b); 272 }else if(a.equals("e") || a.equals("maxedits") || a.equals("edits") || a.equals("edist")){ 273 maxEdits=Integer.parseInt(b); 274 }else if(a.equals("s") || a.equals("maxsubs") || a.equals("maxsubstitutions") || a.equals("hdist")){ 275 maxSubs=Integer.parseInt(b); 276 }else if(a.equals("bw") || a.equals("bandwidth")){ 277 bandwidth_=Integer.parseInt(b); 278 }else if(a.equals("mid") || a.equals("minidentity")){ 279 minIdentity=Float.parseFloat(b); 280 minIdentityMult=(minIdentity==100f ? 0 : (100f-minIdentity)/100f); 281 }else if(a.equals("threads") || a.equals("t")){ 282 THREADS=(b==null || b.equalsIgnoreCase("auto") ? Shared.threads() : Integer.parseInt(b)); 283 }else if(a.equals("showspeed") || a.equals("ss")){ 284 showSpeed=Parse.parseBoolean(b); 285 }else if(a.equals("verbose")){ 286 verbose=Parse.parseBoolean(b); 287 // BandedAligner.verbose=verbose; 288 }else if(a.equals("contigbreak") || (arg.contains("=") && (a.equals("n") || a.equals("-n")))){ 289 maxNs=Integer.parseInt(b); 290 }else if(a.equals("reads") || a.startsWith("maxreads")){ 291 maxReads=Parse.parseKMG(b); 292 }else if(a.equals("sn") || a.equals("storename") || a.equals("storenames") || a.equals("keepnames")){ 293 storeName=Parse.parseBoolean(b); 294 }else if(a.equals("ssx") || a.equals("storesuffix") || a.equals("storesuffixes")){ 295 storeSuffix=Parse.parseBoolean(b); 296 }else if(a.equals("numaffixmaps") || a.equals("nam")){ 297 numAffixMaps=Integer.parseInt(b); 298 }else if(a.equals("amino") || a.equals("protein")){ 299 assert(!Parse.parseBoolean(b)) : "This program does not support amino acids; use DedupeProtein."; 300 }else if(a.equals("mac") || a.equals("maxaffixcopies")){ 301 maxAffixCopies=Integer.parseInt(b); 302 }else if(a.equals("me") || a.equals("maxedges")){ 303 maxEdges=Integer.parseInt(b); 304 maxEdges2=maxEdges*2; 305 if(maxEdges2<1){maxEdges2=Integer.MAX_VALUE-1;} 306 }else if(a.equals("ignoreaffix1") || a.equals("ia1")){ 307 ignoreAffix1=Parse.parseBoolean(b); 308 }else if(a.equals("parsedepth") || a.equals("pd")){ 309 parseDepth=Parse.parseBoolean(b); 310 }else if(a.equals("printlengthinedges") || a.equals("ple")){ 311 printLengthInEdges=Parse.parseBoolean(b); 312 }else if(a.equals("depthmult") || a.equals("depthratio") || a.equals("dr")){ 313 depthRatio=Float.parseFloat(b); 314 if(depthRatio<=0){ 315 parseDepth=false; 316 }else{ 317 parseDepth=true; 318 assert(depthRatio>0); 319 if(depthRatio<1){depthRatio=1/depthRatio;} 320 } 321 }else if(a.equals("storequality") || a.equals("sq")){ 322 storeQuality=Parse.parseBoolean(b); 323 }else if(a.equals("exact") || a.equals("ex")){ 324 exact=Parse.parseBoolean(b); 325 }else if(a.equals("uniquenames") || a.equals("un")){ 326 uniqueNames=Parse.parseBoolean(b); 327 }else if(a.equals("mergeheaders") || a.equals("mergenames")){ 328 mergeHeaders=Parse.parseBoolean(b); 329 }else if(a.equals("mergedelimiter") || a.equals("mergesymbol")){ 330 mergeDelimiter=Parse.parseSymbol(b); 331 }else if(a.equals("ftl") || a.equals("forcetrimleft")){ 332 forceTrimLeft=Integer.parseInt(b); 333 }else if(a.equals("ftr") || a.equals("forcetrimright")){ 334 forceTrimRight=Integer.parseInt(b); 335 }else if(a.equals("subset") || a.equals("sst")){ 336 subset_=Integer.parseInt(b); 337 }else if(a.equals("subsets") || a.equals("subsetcount") || a.equals("sstc")){ 338 subsetCount_=Integer.parseInt(b); 339 }else if(i==0 && in1==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){ 340 String c=args[i]; 341 if(c.indexOf(',')>=0 && !new File(c).exists()){ 342 in1=c.split(","); 343 }else{ 344 in1=new String[] {c}; 345 } 346 }else if(i==1 && out==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){ 347 out=args[i]; 348 }else{ 349 throw new RuntimeException("Unknown parameter "+args[i]); 350 } 351 } 352 353 comparator.setAscending(ascending); 354 355 {//Process parser fields 356 Parser.processQuality(); 357 qTrimLeft=parser.qtrimLeft; 358 qTrimRight=parser.qtrimRight; 359 trimQ=parser.trimq; 360 trimE=parser.trimE(); 361 } 362 363 if(verbose){ 364 ReadWrite.verbose=ConcurrentGenericReadInputStream.verbose=ConcurrentReadOutputStream.verbose=ByteFile1.verbose=ByteFile2.verbose=FastqReadInputStream.verbose=true; 365 } 366 367 k=k_; 368 k2=k-1; 369 subset=subset_; 370 subsetCount=subsetCount_; 371 subsetMode=subsetCount>1; 372 assert(subset>=0 && subset<subsetCount) : "subset="+subset+", subsetCount="+subsetCount; 373 374 BandedAligner.penalizeOffCenter=true; 375 376 if(maxSpanningTree){removeCycles=fixMultiJoins=false;} 377 if(absorbOverlap){processClusters=true;} 378 if(processClusters || renameClusters || maxSpanningTree){makeClusters=true;} 379 if(makeClusters){findOverlaps=true;} 380 if(renameClusters){uniqueNames=/*storeName=*/false;} 381 382 if(bandwidth_>-1){ 383 bandwidth=Tools.min(bandwidth_, 2*maxEdits+1); 384 customBandwidth=(bandwidth<2*maxEdits+1); 385 }else{ 386 bandwidth=2*maxEdits+1; 387 customBandwidth=false; 388 } 389 maxSubs=Tools.max(maxSubs, maxEdits); 390 if(maxSubs>0 || minIdentity<100 || findOverlaps){storeSuffix=true;} 391 392 assert(FastaReadInputStream.settingsOK()); 393 394 if(in1==null){throw new RuntimeException("Error - at least one input file is required.");} 395 396 for(int i=0; i<in1.length; i++){ 397 if(in1[i].equalsIgnoreCase("stdin") && !new File(in1[i]).exists()){in1[i]="stdin.fa";} 398 } 399 400 // assert(false) : Arrays.toString(in); 401 402 // if(!setOut && clusterFilePattern==null){out="stdout.fa";} 403 // else 404 // if("stdout".equalsIgnoreCase(out) || "standarddout".equalsIgnoreCase(out)){ 405 // out="stdout.fa"; 406 // outstream=System.err; 407 // } 408 if(!Tools.canWrite(out, overwrite)){throw new RuntimeException("Output file "+out+" already exists, and overwrite="+overwrite);} 409 410 for(int i=0; i<in1.length; i++){ 411 assert(!in1[i].equalsIgnoreCase(out)); 412 } 413 // assert(false) : "\nabsorbContainment="+absorbContainment+", findOverlaps="+findOverlaps+", absorbOverlap="+absorbOverlap+"\n"+ 414 // "processClusters="+processClusters+", renameClusters="+renameClusters+", makeClusters="+makeClusters+", uniqueNames="+uniqueNames+"\n"+ 415 // "storeName="+storeName+", DISPLAY_PROGRESS="+DISPLAY_PROGRESS+", removeCycles="+removeCycles; 416 if(absorbContainment || findOverlaps){ 417 // assert(false); 418 affixMaps=new HashMap[numAffixMaps]; 419 for(int i=0; i<numAffixMaps; i++){ 420 affixMaps[i]=new HashMap<LongM, ArrayList<Unit>>(4000000); 421 } 422 if(affixMaps.length>0){affixMap1=affixMaps[0];} 423 if(affixMaps.length>1){affixMap2=affixMaps[1];} 424 } 425 // assert(false) : absorbContainment+", "+(affixMap==null); 426 427 if(outdupe==null){ 428 dupeWriter=null; 429 }else{ 430 FileFormat ff=FileFormat.testOutput(outdupe, FileFormat.FASTA, null, true, overwrite, append, false); 431 dupeWriter=new ByteStreamWriter(ff); 432 } 433 } 434 process()435 public void process(){ 436 437 Timer t=new Timer(); 438 439 boolean dq0=FASTQ.DETECT_QUALITY; 440 boolean ti0=FASTQ.TEST_INTERLEAVED; 441 442 process2(); 443 444 FASTQ.DETECT_QUALITY=dq0; 445 FASTQ.TEST_INTERLEAVED=ti0; 446 447 t.stop(); 448 449 if(showSpeed){ 450 outstream.println(Tools.timeReadsBasesProcessed(t, readsProcessed, basesProcessed, 8)); 451 } 452 453 if(errorState){ 454 throw new RuntimeException("Dedupe terminated in an error state; the output may be corrupt."); 455 } 456 } 457 process2()458 public void process2(){ 459 if(dupeWriter!=null){dupeWriter.start();} 460 // assert(false) : out; 461 Timer t=new Timer(); 462 463 if(DISPLAY_PROGRESS){ 464 outstream.println("Initial:"); 465 Shared.printMemory(); 466 outstream.println(); 467 } 468 469 processMatches(t); 470 471 forceTrimLeft=forceTrimRight=-1; 472 qTrimLeft=qTrimRight=false; 473 474 475 if(absorbContainment){ 476 processContainments(t); 477 } 478 479 if(dupeWriter!=null){dupeWriter.poisonAndWait();} 480 481 if(findOverlaps){ 482 findOverlaps(t); 483 484 killAffixMaps(); 485 486 if(processClusters || renameClusters || maxSpanningTree){codeMap=null;} 487 488 if(maxSpanningTree){ 489 processMst(t); 490 } 491 492 if(processClusters){ 493 processClusters(t, absorbOverlap); 494 } 495 // if(renameClusters){ 496 // renameClusters(t); 497 // } 498 // assert(false) : (codeMap==null)+", "+(affixMap1==null)+", "+processedClusters; 499 } 500 501 outstream.println("Input: \t"+readsProcessed+" reads \t\t"+basesProcessed+" bases."); 502 503 if(absorbMatch){ 504 outstream.println("Duplicates: \t"+matches+" reads ("+String.format(Locale.ROOT, "%.2f",matches*100.0/readsProcessed)+"%) \t"+ 505 baseMatches+" bases ("+String.format(Locale.ROOT, "%.2f",baseMatches*100.0/basesProcessed)+"%) \t"+collisions+" collisions."); 506 } 507 if(absorbContainment){ 508 outstream.println("Containments: \t"+containments+" reads ("+String.format(Locale.ROOT, "%.2f",containments*100.0/readsProcessed)+"%) \t"+ 509 baseContainments+" bases ("+String.format(Locale.ROOT, "%.2f",baseContainments*100.0/basesProcessed)+"%) \t"+containmentCollisions+" collisions."); 510 } 511 if(findOverlaps){ 512 outstream.println("Overlaps: \t"+overlaps+" reads ("+String.format(Locale.ROOT, "%.2f",overlaps*100.0/readsProcessed)+"%) \t"+ 513 baseOverlaps+" bases ("+String.format(Locale.ROOT, "%.2f",baseOverlaps*100.0/basesProcessed)+"%) \t"+overlapCollisions+" collisions."); 514 } 515 // outstream.println("Result: \t"+(addedToMain-containments)+" reads \t\t"+(basesProcessed-baseMatches-baseContainments)+" bases."); 516 517 long outReads=(addedToMain-containments); 518 if(UNIQUE_ONLY){outReads=readsProcessed-matches-containments;} 519 long outBases=(basesProcessed-baseMatches-baseContainments); 520 outstream.println("Result: \t"+outReads+" reads ("+String.format(Locale.ROOT, "%.2f",outReads*100.0/readsProcessed)+"%) \t"+ 521 outBases+" bases ("+String.format(Locale.ROOT, "%.2f",outBases*100.0/basesProcessed)+"%)"); 522 523 outstream.println(""); 524 525 if(out!=null || clusterFilePattern!=null || outbest!=null || outgraph!=null || outcsf!=null){ 526 writeOutput(outcsf, t); 527 } 528 529 } 530 killAffixMaps()531 private void killAffixMaps(){ 532 if(affixMaps==null){return;} 533 for(int i=0; i<numAffixMaps; i++){ 534 if(affixMaps[i]!=null){affixMaps[i].clear();} 535 affixMaps[i]=null; 536 } 537 affixMap1=null; 538 affixMap2=null; 539 affixMaps=null; 540 } 541 makeCrisArray(ArrayList<Read> list)542 private ConcurrentReadInputStream[] makeCrisArray(ArrayList<Read> list){ 543 final ConcurrentReadInputStream[] array; 544 545 if(list!=null){ 546 array=new ConcurrentReadInputStream[] {new ConcurrentCollectionReadInputStream(list, null, -1)}; 547 array[0].start(); //This deadlocks if ConcurrentReadInputStream extends Thread rather than spawning a new thread. 548 }else{ 549 array=new ConcurrentReadInputStream[in1.length]; 550 multipleInputFiles=array.length>1; 551 for(int i=0; i<in1.length; i++){ 552 if(verbose){System.err.println("Creating cris for "+in1[i]);} 553 554 final ConcurrentReadInputStream cris; 555 { 556 FileFormat ff1=FileFormat.testInput(in1[i], FileFormat.FASTA, null, !multipleInputFiles || ReadWrite.USE_UNPIGZ, true); 557 FileFormat ff2=(in2==null || in2.length<=i ? null : FileFormat.testInput(in2[i], FileFormat.FASTA, null, !multipleInputFiles || ReadWrite.USE_UNPIGZ, true)); 558 cris=ConcurrentReadInputStream.getReadInputStream(maxReads, ff1.samOrBam(), ff1, ff2); 559 cris.start(); 560 if(cris.paired()){ 561 THREADS=1;//Temp fix for losing reads when multithreaded and paired 562 if(absorbContainment){ 563 System.err.println("Set absorbContainment to false because it is not currently supported for paired reads."); 564 absorbContainment=false; 565 } 566 } 567 } 568 array[i]=cris; 569 } 570 } 571 return array; 572 } 573 processMatches(Timer t)574 private void processMatches(Timer t){ 575 crisa=makeCrisArray(null); 576 577 ArrayList<HashThread> alht=new ArrayList<HashThread>(THREADS); 578 for(int i=0; i<THREADS; i++){alht.add(new HashThread(true, (absorbContainment|findOverlaps), absorbMatch, false, false));} 579 for(HashThread ht : alht){ht.start();} 580 for(HashThread ht : alht){ 581 while(ht.getState()!=Thread.State.TERMINATED){ 582 try { 583 ht.join(); 584 } catch (InterruptedException e) { 585 // TODO Auto-generated catch block 586 e.printStackTrace(); 587 } 588 } 589 matches+=ht.matchesT; 590 collisions+=ht.collisionsT; 591 containments+=ht.containmentsT; 592 containmentCollisions+=ht.containmentCollisionsT; 593 baseContainments+=ht.baseContainmentsT; 594 baseMatches+=ht.baseMatchesT; 595 addedToMain+=ht.addedToMainT; 596 readsProcessed+=ht.readsProcessedT; 597 basesProcessed+=ht.basesProcessedT; 598 } 599 alht.clear(); 600 601 if(verbose){System.err.println("Attempting to close input streams (1).");} 602 for(ConcurrentReadInputStream cris : crisa){ 603 errorState|=ReadWrite.closeStream(cris); 604 } 605 crisa=null; 606 607 if(DISPLAY_PROGRESS){ 608 t.stop(); 609 outstream.println("Found "+matches+" duplicates."); 610 outstream.println("Finished exact matches. Time: "+t); 611 Shared.printMemory(); 612 if(verbose){outstream.println(affixMap1);} 613 outstream.println(); 614 t.start(); 615 } 616 } 617 processContainments(Timer t)618 private void processContainments(Timer t){ 619 ArrayList<Read> list=new ArrayList<Read>((int)addedToMain); 620 for(ArrayList<Unit> alu : codeMap.values()){ 621 for(Unit u : alu){ 622 assert(u.r.mate==null) : "Containments are not currently supported with paired reads."; 623 if(u.valid() && u.r.pairnum()==0){list.add(u.r);} 624 } 625 } 626 627 // if(minLengthPercent>0){ 628 // if(verbose){System.err.println("Sorting.");} 629 // Shared.sort(list, comparator); 630 // Collections.reverse(list); 631 // assert(list.isEmpty() || list.get(0).length()<=list.get(list.size()-1).length()) : 632 // list.get(0).length()+", "+list.get(list.size()-1).length(); 633 // } 634 635 crisa=makeCrisArray(subsetMode ? null : list); 636 637 ArrayList<HashThread> alht=new ArrayList<HashThread>(THREADS); 638 for(int i=0; i<THREADS; i++){alht.add(new HashThread(false, false, false, true, false));} 639 640 for(HashThread ht : alht){ht.start();} 641 for(HashThread ht : alht){ 642 while(ht.getState()!=Thread.State.TERMINATED){ 643 try { 644 ht.join(); 645 } catch (InterruptedException e) { 646 // TODO Auto-generated catch block 647 e.printStackTrace(); 648 } 649 } 650 assert(ht.matchesT==0); 651 assert(ht.collisionsT==0); 652 assert(ht.baseMatchesT==0); 653 assert(ht.addedToMainT==0); 654 // assert(ht.readsProcessedT==0); 655 // assert(ht.basesProcessedT==0); 656 // matches+=ht.matchesT; 657 // collisions+=ht.collisionsT; 658 containments+=ht.containmentsT; 659 containmentCollisions+=ht.containmentCollisionsT; 660 baseContainments+=ht.baseContainmentsT; 661 // baseMatches+=ht.baseMatchesT; 662 // addedToMain+=ht.addedToMainT; 663 // readsProcessed+=ht.readsProcessedT; 664 // basesProcessed+=ht.basesProcessedT; 665 } 666 alht.clear(); 667 if(verbose){System.err.println("Attempting to close input streams (2).");} 668 for(ConcurrentReadInputStream cris : crisa){ 669 errorState|=ReadWrite.closeStream(cris); 670 } 671 672 if(DISPLAY_PROGRESS){ 673 t.stop(); 674 outstream.println("Found "+containments+" contained sequences."); 675 outstream.println("Finished containment. Time: "+t); 676 Shared.printMemory(); 677 outstream.println(); 678 t.start(); 679 } 680 crisa=null; 681 if(!findOverlaps){ 682 killAffixMaps(); 683 } 684 685 long x=removeInvalid(list); 686 list.clear(); 687 688 if(DISPLAY_PROGRESS){ 689 t.stop(); 690 outstream.println("Removed "+x+" invalid entries."); 691 outstream.println("Finished invalid removal. Time: "+t); 692 Shared.printMemory(); 693 outstream.println(); 694 t.start(); 695 } 696 } 697 findOverlaps(Timer t)698 private void findOverlaps(Timer t){ 699 700 ArrayList<Read> list=new ArrayList<Read>((int)addedToMain); 701 for(ArrayList<Unit> alu : codeMap.values()){ 702 for(Unit u : alu){ 703 if(u.valid() && u.r.pairnum()==0){ 704 u.unitID=list.size(); 705 list.add(u.r); 706 if(u.r.mate!=null){ 707 Unit u2=(Unit)u.r.mate.obj; 708 u2.unitID=u.unitID; 709 } 710 }else{ 711 u.unitID=Integer.MAX_VALUE; 712 } 713 } 714 } 715 716 if(preventTransitiveOverlaps){ 717 clusterNumbers=new AtomicIntegerArray(list.size()); 718 for(int i=0; i<clusterNumbers.length(); i++){ 719 clusterNumbers.set(i, i); 720 } 721 } 722 723 crisa=makeCrisArray(subsetMode ? null : list); 724 725 ArrayList<HashThread> alht=new ArrayList<HashThread>(THREADS); 726 for(int i=0; i<THREADS; i++){alht.add(new HashThread(false, false, false, false, true));} 727 728 for(HashThread ht : alht){ht.start();} 729 for(HashThread ht : alht){ 730 while(ht.getState()!=Thread.State.TERMINATED){ 731 try { 732 ht.join(); 733 } catch (InterruptedException e) { 734 // TODO Auto-generated catch block 735 e.printStackTrace(); 736 } 737 } 738 assert(ht.matchesT==0); 739 assert(ht.collisionsT==0); 740 assert(ht.baseMatchesT==0); 741 assert(ht.addedToMainT==0); 742 743 overlaps+=ht.overlapsT; 744 baseOverlaps+=ht.baseOverlapsT; 745 overlapCollisions+=ht.overlapCollisionsT; 746 } 747 alht.clear(); 748 if(verbose){System.err.println("Attempting to close input streams (3).");} 749 for(ConcurrentReadInputStream cris : crisa){ 750 errorState|=ReadWrite.closeStream(cris); 751 } 752 753 if(DISPLAY_PROGRESS){ 754 t.stop(); 755 outstream.println("Found "+overlaps+" overlaps."); 756 outstream.println("Finished finding overlaps. Time: "+t); 757 Shared.printMemory(); 758 outstream.println(); 759 t.start(); 760 } 761 762 crisa=null; 763 764 if(makeClusters){ 765 int intransitive=0, redundant=0; 766 assert((intransitive=countIntransitive(t, list, rigorousTransitive))==0); 767 assert((redundant=countRedundant(t, list))==0); 768 long overlaps=countOverlaps(t, list); 769 assert(intransitive==0); 770 assert(redundant==0); 771 // makeTransitive(t, list, rigorousTransitive); 772 if(clusterQueue==null){ 773 clusterQueue=new ArrayDeque<ArrayList<Unit>>(list.size()/4+1); 774 processedClusters=new ArrayList<ArrayList<Unit>>(); 775 }else{ 776 assert(clusterQueue.isEmpty()); 777 } 778 makeClusters(t, list); 779 } 780 781 list.clear(); 782 } 783 makeTransitive(Timer t, ArrayList<Read> list, boolean rigorous)784 private long makeTransitive(Timer t, ArrayList<Read> list, boolean rigorous){ 785 assert(false) : "No longer needed."; 786 long added=0; 787 for(Read r : list){ 788 assert(r!=null); 789 Unit u=(Unit) r.obj; 790 assert(u!=null); 791 assert(u.valid()); 792 // outstream.println("Considering "+r.id+"; valid="+u.valid()+", overlaps="+(u.overlapList==null ? "null" : u.overlapList.size())); 793 if(u.valid()){ 794 795 if(u.overlapList!=null){ 796 for(Overlap o : u.overlapList){ 797 Unit u2=(o.u1==u ? o.u2 : o.u1); 798 assert(u2!=u); 799 if(u2.overlapList==null){ 800 u2.overlapList=new ArrayList<Overlap>(2); 801 u2.overlapList.add(o); 802 }else{ 803 boolean found=false; 804 if(rigorous){ 805 found=u2.overlapList.contains(o); 806 }else{ 807 for(Overlap o2 : u2.overlapList){ 808 if(o2.u1==u || o2.u2==u){found=true; break;} 809 } 810 } 811 if(!found){ 812 added++; 813 u2.overlapList.add(o); 814 } 815 } 816 } 817 } 818 } 819 } 820 821 for(Read r : list){ 822 Unit u=(Unit) r.obj; 823 if(u.valid()){ 824 assert(u.isTransitive()); 825 } 826 } 827 828 if(DISPLAY_PROGRESS){ 829 t.stop(); 830 outstream.println("Added overlaps: "+added); 831 outstream.println("Made overlaps transitive. Time: "+t); 832 Shared.printMemory(); 833 outstream.println(); 834 t.start(); 835 } 836 return added; 837 } 838 countIntransitive(Timer t, ArrayList<Read> list, boolean rigorous)839 private int countIntransitive(Timer t, ArrayList<Read> list, boolean rigorous){ 840 if(!countTransitive){return 0;} 841 int transitive=0, intransitive=0; 842 for(Read r : list){ 843 assert(r!=null); 844 Unit u=(Unit) r.obj; 845 assert(u!=null); 846 assert(u.valid()); 847 // outstream.println("Considering "+r.id+"; valid="+u.valid()+", overlaps="+(u.overlapList==null ? "null" : u.overlapList.size())); 848 if(u.valid()){ 849 if(rigorous ? u.isPerfectlyTransitive() : u.isTransitive()){ 850 transitive++; 851 }else{ 852 intransitive++; 853 } 854 } 855 } 856 857 if(DISPLAY_PROGRESS){ 858 t.stop(); 859 outstream.println("Intransitive: "+intransitive+", \ttransitive: "+transitive); 860 outstream.println("Checked transitivity. Time: "+t); 861 Shared.printMemory(); 862 outstream.println(); 863 t.start(); 864 } 865 866 return intransitive; 867 } 868 countRedundant(Timer t, ArrayList<Read> list)869 private int countRedundant(Timer t, ArrayList<Read> list){ 870 if(!countRedundant){return 0;} 871 int redundant=0, nonredundant=0; 872 for(Read r : list){ 873 assert(r!=null); 874 Unit u=(Unit) r.obj; 875 assert(u!=null); 876 assert(u.valid()); 877 // outstream.println("Considering "+r.id+"; valid="+u.valid()+", overlaps="+(u.overlapList==null ? "null" : u.overlapList.size())); 878 if(u.valid()){ 879 if(u.isNonRedundant()){ 880 nonredundant++; 881 }else{ 882 redundant++; 883 } 884 } 885 } 886 887 if(DISPLAY_PROGRESS){ 888 t.stop(); 889 outstream.println("Redundant: "+redundant+", \tnonredundant: "+nonredundant); 890 outstream.println("Checked redundancy. Time: "+t); 891 Shared.printMemory(); 892 outstream.println(); 893 t.start(); 894 } 895 return redundant; 896 } 897 countOverlaps(Timer t, ArrayList<Read> list)898 private long countOverlaps(Timer t, ArrayList<Read> list){ 899 900 long overlaps=0, length=0; 901 for(Read r : list){ 902 assert(r!=null); 903 Unit u=(Unit) r.obj; 904 assert(u!=null); 905 assert(u.valid()); 906 // outstream.println("Considering "+r.id+"; valid="+u.valid()+", overlaps="+(u.overlapList==null ? "null" : u.overlapList.size())); 907 if(u.valid() && u.overlapList!=null){ 908 for(Overlap o : u.overlapList){ 909 overlaps++; 910 length+=o.overlapLen; 911 } 912 } 913 } 914 915 if(DISPLAY_PROGRESS){ 916 t.stop(); 917 outstream.println("Overlaps: "+overlaps+", \tlength: "+length); 918 outstream.println("Counted overlaps. Time: "+t); 919 Shared.printMemory(); 920 outstream.println(); 921 t.start(); 922 } 923 return overlaps; 924 } 925 fillClusterSizeMatrix(ArrayList<ArrayList<Unit>> clusters, long[][] clusterSize)926 private long fillClusterSizeMatrix(ArrayList<ArrayList<Unit>> clusters, long[][] clusterSize){ 927 int max=0; 928 for(ArrayList<Unit> cluster : clusters){ 929 final int cs=Tools.min(clusterSize.length-1, cluster.size()); 930 { 931 long reads=0, bases=0; 932 for(Unit u2 : cluster){ 933 reads++; 934 bases+=u2.length(); 935 } 936 clusterSize[0][cs]++; 937 clusterSize[1][cs]+=reads; 938 clusterSize[2][cs]+=bases; 939 } 940 max=Tools.max(max, cluster.size()); 941 } 942 return max; 943 } 944 makeClusters(Timer t, ArrayList<Read> list)945 private void makeClusters(Timer t, ArrayList<Read> list){ 946 947 final int clusterlen=70000; 948 long[][] clusterSize=new long[3][clusterlen]; 949 int max=0; 950 for(Read r : list){ 951 Unit u=(Unit) r.obj; 952 953 if(!u.clustered()){ 954 ArrayList<Unit> cluster=u.makeCluster(); 955 if(cluster.size()>2){cluster.trimToSize();} 956 if(cluster.size()==1 || (!processClusters && !maxSpanningTree)){processedClusters.add(cluster);} 957 else{clusterQueue.add(cluster);} 958 final int cs=Tools.min(clusterlen-1, cluster.size()); 959 { 960 long reads=0, bases=0; 961 for(Unit u2 : cluster){ 962 reads++; 963 bases+=u2.length(); 964 } 965 clusterSize[0][cs]++; 966 clusterSize[1][cs]+=reads; 967 clusterSize[2][cs]+=bases; 968 } 969 max=Tools.max(max, cluster.size()); 970 } 971 } 972 973 if(DISPLAY_PROGRESS){ 974 t.stop(); 975 outstream.println(toClusterSizeString(clusterSize)); 976 outstream.println("\nLargest: "+max); 977 outstream.println("Finished making clusters. Time: "+t); 978 Shared.printMemory(); 979 outstream.println(); 980 t.start(); 981 } 982 983 984 long x=removeInvalid(list); 985 986 if(DISPLAY_PROGRESS){ 987 t.stop(); 988 outstream.println("Removed "+x+" invalid entries."); 989 outstream.println("Finished invalid removal. Time: "+t); 990 Shared.printMemory(); 991 outstream.println(); 992 t.start(); 993 } 994 } 995 toClusterSizeString(long[][] clusterSizeMatrix)996 private String toClusterSizeString(long[][] clusterSizeMatrix){ 997 998 long[] clusterSize=clusterSizeMatrix[0]; 999 long[] clusterReads=clusterSizeMatrix[1]; 1000 long[] clusterBases=clusterSizeMatrix[2]; 1001 1002 long totalClusters=Tools.sum(clusterSize); 1003 1004 long bigClusters=0; 1005 for(int i=minClusterSize; i<clusterSize.length; i++){ 1006 bigClusters+=clusterSize[i]; 1007 } 1008 1009 final int spaces=18; 1010 final int spaces2=spaces*2, spaces3=spaces*3; 1011 1012 final StringBuilder sb=new StringBuilder(100), sb2=new StringBuilder(1000); 1013 sb2.append("Clusters:"); 1014 while(sb2.length()<spaces){sb2.append(' ');} 1015 sb2.append(totalClusters+(minClusterSize<2 ? "" : " ("+bigClusters+" of at least size "+minClusterSize+")")+"\n"); 1016 1017 sb.append("Size Range"); 1018 while(sb.length()<spaces){sb.append(' ');} 1019 sb.append("Clusters"); 1020 while(sb.length()<spaces2){sb.append(' ');} 1021 sb.append("Reads"); 1022 while(sb.length()<spaces3){sb.append(' ');} 1023 sb.append("Bases"); 1024 1025 sb2.append('\n'); 1026 sb2.append(sb); 1027 sb.setLength(0); 1028 1029 for(int i=0; i<clusterSize.length-1; i=Tools.max(i+1, i*2)){ 1030 int a=i+1, b=i*2; 1031 if(i<2){ 1032 sb.append(a); 1033 while(sb.length()<spaces){sb.append(' ');} 1034 sb.append(clusterSize[a]); 1035 while(sb.length()<spaces2){sb.append(' ');} 1036 sb.append(clusterReads[a]); 1037 while(sb.length()<spaces3){sb.append(' ');} 1038 sb.append(clusterBases[a]); 1039 }else if(b>=clusterSize.length){ 1040 long x=Tools.sum(clusterSize, a, clusterSize.length-1); 1041 long y=Tools.sum(clusterReads, a, clusterSize.length-1); 1042 long z=Tools.sum(clusterBases, a, clusterSize.length-1); 1043 if(x>0){ 1044 sb.append(a+"+"); 1045 while(sb.length()<spaces){sb.append(' ');} 1046 sb.append(x); 1047 while(sb.length()<spaces2){sb.append(' ');} 1048 sb.append(y); 1049 while(sb.length()<spaces3){sb.append(' ');} 1050 sb.append(z); 1051 } 1052 }else{ 1053 long x=Tools.sum(clusterSize, a, b); 1054 long y=Tools.sum(clusterReads, a, b); 1055 long z=Tools.sum(clusterBases, a, b); 1056 if(x>0){ 1057 sb.append(a+"-"+b); 1058 while(sb.length()<spaces){sb.append(' ');} 1059 sb.append(x); 1060 while(sb.length()<spaces2){sb.append(' ');} 1061 sb.append(y); 1062 while(sb.length()<spaces3){sb.append(' ');} 1063 sb.append(z); 1064 } 1065 } 1066 if(sb.length()>0){ 1067 sb2.append('\n'); 1068 sb2.append(sb); 1069 sb.setLength(0); 1070 } 1071 } 1072 return sb2.toString(); 1073 } 1074 renameClusters(Timer t)1075 private void renameClusters(Timer t){ 1076 assert(false) : "This is now unused; renaming is done at output time."; 1077 int cnum=0; 1078 final StringBuilder sb=new StringBuilder(64); 1079 for(ArrayList<Unit> alu : processedClusters){ 1080 for(int i=0; i<alu.size(); i++){ 1081 Unit u=alu.get(i); 1082 Read r=u.r; 1083 sb.append("Cluster "); 1084 sb.append(cnum); 1085 sb.append(",contig "); 1086 sb.append(i); 1087 if(u.offsetValid()){ 1088 sb.append(",pos "); 1089 sb.append(u.offset()); 1090 } 1091 r.id=sb.toString(); 1092 sb.setLength(0); 1093 } 1094 } 1095 1096 if(DISPLAY_PROGRESS){ 1097 t.stop(); 1098 outstream.println("Finished cluster renaming. Time: "+t); 1099 Shared.printMemory(); 1100 outstream.println(); 1101 t.start(); 1102 } 1103 } 1104 processMst(Timer t)1105 private void processMst(Timer t){ 1106 1107 if(DISPLAY_PROGRESS){outstream.println("Converting to Maximum Spanning Tree.");} 1108 1109 ArrayList<MstThread> alct=new ArrayList<MstThread>(THREADS); 1110 for(int i=0; i<THREADS; i++){ 1111 alct.add(new MstThread()); 1112 } 1113 1114 long overlapsRemoved=0; 1115 long overlapBasesRemoved=0; 1116 long overlapsRetained=0; 1117 long overlapBasesRetained=0; 1118 1119 for(MstThread ct : alct){ct.start();} 1120 for(MstThread ct : alct){ 1121 while(ct.getState()!=Thread.State.TERMINATED){ 1122 try { 1123 ct.join(); 1124 } catch (InterruptedException e) { 1125 e.printStackTrace(); 1126 } 1127 } 1128 1129 overlapsRemoved+=ct.overlapsRemovedT; 1130 overlapBasesRemoved+=ct.overlapBasesRemovedT; 1131 overlapsRetained+=ct.overlapsRetainedT; 1132 overlapBasesRetained+=ct.overlapBasesRetainedT; 1133 } 1134 assert(clusterQueue.isEmpty()); 1135 if(processClusters){ 1136 for(MstThread ct : alct){ 1137 clusterQueue.addAll(ct.processedT); 1138 ct.processedT.clear(); 1139 ct.processedT=null; 1140 } 1141 }else{ 1142 for(MstThread ct : alct){ 1143 processedClusters.addAll(ct.processedT); 1144 ct.processedT.clear(); 1145 ct.processedT=null; 1146 } 1147 clusterQueue=null; 1148 } 1149 alct.clear(); 1150 1151 assert(affixMaps==null); 1152 killAffixMaps(); 1153 1154 1155 if(DISPLAY_PROGRESS){ 1156 t.stop(); 1157 outstream.println("Removed "+(overlapsRemoved)+" edges ("+overlapBasesRemoved+" bases)."); 1158 outstream.println("Retained "+(overlapsRetained)+" edges ("+overlapBasesRetained+" bases)."); 1159 1160 // outstream.println("\nAfter conversion to Maximum Spanning Tree:"); 1161 // final int[] clusterSize=new int[8200]; 1162 // int max=0; 1163 // for(ArrayList<Unit> cluster : processedClusters){ 1164 // clusterSize[Tools.min(clusterSize.length-1, cluster.size())]++; 1165 // max=Tools.max(max, cluster.size()); 1166 // } 1167 // outstream.println(toClusterSizeString(clusterSize)); 1168 // outstream.println("Largest: "+max); 1169 1170 outstream.println("Finished MST conversion. Time: "+t); 1171 Shared.printMemory(); 1172 outstream.println(); 1173 t.start(); 1174 } 1175 } 1176 processClusters(Timer t, boolean mergeClusters)1177 private void processClusters(Timer t, boolean mergeClusters){ 1178 1179 ArrayList<ClusterThread> alct=new ArrayList<ClusterThread>(THREADS); 1180 for(int i=0; i<THREADS; i++){ 1181 alct.add(new ClusterThread(fixMultiJoins, canonicizeClusters, removeCycles, fixCanonContradictions, fixOffsetContradictions, 1182 mergeClusters, mergeClusters, mergeClusters)); 1183 } 1184 1185 long leafMerges=0; 1186 long innerMerges=0; 1187 long leafBaseMerges=0; 1188 long innerBaseMerges=0; 1189 1190 long multiJoinFailures=0; 1191 long multiJoinsFound=0; 1192 long multiJoinBasesFound=0; 1193 long unitsFlipped=0; 1194 long overlapsFlipped=0; 1195 long canonContradictoryOverlaps=0; 1196 long canonContradictoryClusters=0; 1197 long offsetContradictoryOverlaps=0; 1198 long offsetContradictoryClusters=0; 1199 long cycleOverlaps=0; 1200 long cycleClusters=0; 1201 1202 for(ClusterThread ct : alct){ct.start();} 1203 for(ClusterThread ct : alct){ 1204 while(ct.getState()!=Thread.State.TERMINATED){ 1205 try { 1206 ct.join(); 1207 } catch (InterruptedException e) { 1208 e.printStackTrace(); 1209 } 1210 } 1211 1212 leafMerges+=ct.leafMergesT; 1213 innerMerges+=ct.innerMergesT; 1214 leafBaseMerges+=ct.leafBaseMergesT; 1215 innerBaseMerges+=ct.innerBaseMergesT; 1216 1217 multiJoinFailures+=ct.multiJoinFailuresT; 1218 multiJoinsFound+=ct.multiJoinsFoundT; 1219 multiJoinBasesFound+=ct.multiJoinBasesFoundT; 1220 unitsFlipped+=ct.unitsFlippedT; 1221 overlapsFlipped+=ct.overlapsFlippedT; 1222 canonContradictoryOverlaps+=ct.canonContradictoryOverlapsT; 1223 canonContradictoryClusters+=ct.canonContradictoryClustersT; 1224 offsetContradictoryOverlaps+=ct.offsetContradictoryOverlapsT; 1225 offsetContradictoryClusters+=ct.offsetContradictoryClustersT; 1226 cycleOverlaps+=ct.cycleOverlapsT; 1227 cycleClusters+=ct.cycleClustersT; 1228 } 1229 alct.clear(); 1230 1231 assert(affixMaps==null || affixMap1==null); 1232 killAffixMaps(); 1233 1234 assert(clusterQueue.isEmpty()); 1235 clusterQueue=null; 1236 1237 if(DISPLAY_PROGRESS){ 1238 t.stop(); 1239 if(fixMultiJoins){ 1240 outstream.println("Found "+(multiJoinsFound)+" multijoins ("+multiJoinBasesFound+" bases)."); 1241 outstream.println("Experienced "+(multiJoinFailures)+" multijoin removal failures."); 1242 } 1243 if(canonicizeClusters){ 1244 outstream.println("Flipped "+(unitsFlipped)+" reads and "+overlapsFlipped+" overlaps."); 1245 outstream.println("Found "+(canonContradictoryClusters)+" clusters ("+canonContradictoryOverlaps+" overlaps) with contradictory orientation cycles."); 1246 } 1247 if(fixOffsetContradictions){ 1248 outstream.println("Found "+(offsetContradictoryClusters)+" clusters ("+offsetContradictoryOverlaps+" overlaps) with contradictory offset cycles."); 1249 } 1250 outstream.println("Found "+(cycleClusters)+" clusters ("+cycleOverlaps+" overlaps) with remaining cycles."); 1251 if(absorbOverlap){ 1252 outstream.println("Merged "+(leafMerges)+" leaves ("+leafBaseMerges+" bases)."); 1253 outstream.println("Merged "+(innerMerges)+" nonleaves ("+innerBaseMerges+" bases)."); 1254 } 1255 1256 outstream.println("\nAfter processing clusters:"); 1257 final long[][] clusterSize=new long[3][70000]; 1258 final long max=fillClusterSizeMatrix(processedClusters, clusterSize); 1259 outstream.println(toClusterSizeString(clusterSize)); 1260 outstream.println("\nLargest: "+max); 1261 1262 outstream.println("Finished processing. Time: "+t); 1263 Shared.printMemory(); 1264 outstream.println(); 1265 t.start(); 1266 } 1267 } 1268 removeInvalid(ArrayList<Read> list)1269 private long removeInvalid(ArrayList<Read> list){ 1270 final LongM keym=new LongM(); 1271 long removedC=0, removedP=0, removedS=0, invalid=0; 1272 1273 for(int j=0, lim=list.size(); j<lim; j++){ 1274 final Read r=list.get(j); 1275 final Unit u=(Unit)r.obj; 1276 1277 if(!u.valid()){ 1278 1279 invalid++; 1280 1281 if(codeMap!=null && !codeMap.isEmpty()){ 1282 Long key=u.code1; 1283 ArrayList<Unit> alu=codeMap.get(key); 1284 if(alu!=null){ 1285 int valid=0; 1286 for(int i=alu.size()-1; i>=0; i--){ 1287 Unit u2=alu.get(i); 1288 if(u2==null || !u2.valid()){ 1289 alu.remove(i); 1290 removedC++; 1291 } 1292 else{valid++;} 1293 } 1294 if(valid==0){codeMap.remove(key);} 1295 } 1296 } 1297 1298 if(affixMap1!=null && !affixMap1.isEmpty()){ 1299 { 1300 keym.set(u.prefix1); 1301 ArrayList<Unit> alu=affixMap1.get(keym); 1302 if(alu!=null){ 1303 int valid=0; 1304 for(int i=alu.size()-1; i>=0; i--){ 1305 Unit u2=alu.get(i); 1306 if(u2==null || !u2.valid()){ 1307 alu.remove(i); 1308 removedP++; 1309 } 1310 else{valid++;} 1311 } 1312 if(valid==0){affixMap1.remove(keym);} 1313 } 1314 } 1315 if(storeSuffix){ 1316 keym.set(u.suffix1); 1317 ArrayList<Unit> alu=affixMap1.get(keym); 1318 if(alu!=null){ 1319 int valid=0; 1320 for(int i=alu.size()-1; i>=0; i--){ 1321 Unit u2=alu.get(i); 1322 if(u2==null || !u2.valid()){ 1323 alu.remove(i); 1324 removedS++; 1325 } 1326 else{valid++;} 1327 } 1328 if(valid==0){affixMap1.remove(keym);} 1329 } 1330 } 1331 } 1332 if(affixMap2!=null && !affixMap2.isEmpty()){ 1333 if(u.prefix2!=-1){ 1334 keym.set(u.prefix2); 1335 ArrayList<Unit> alu=affixMap2.get(keym); 1336 if(alu!=null){ 1337 int valid=0; 1338 for(int i=alu.size()-1; i>=0; i--){ 1339 Unit u2=alu.get(i); 1340 if(u2==null || !u2.valid()){ 1341 alu.remove(i); 1342 removedP++; 1343 } 1344 else{valid++;} 1345 } 1346 if(valid==0){affixMap2.remove(keym);} 1347 } 1348 } 1349 if(storeSuffix && u.suffix2!=-1){ 1350 keym.set(u.suffix2); 1351 ArrayList<Unit> alu=affixMap2.get(keym); 1352 if(alu!=null){ 1353 int valid=0; 1354 for(int i=alu.size()-1; i>=0; i--){ 1355 Unit u2=alu.get(i); 1356 if(u2==null || !u2.valid()){ 1357 alu.remove(i); 1358 removedS++; 1359 } 1360 else{valid++;} 1361 } 1362 if(valid==0){affixMap2.remove(keym);} 1363 } 1364 } 1365 1366 } 1367 1368 list.set(j, null); 1369 } 1370 } 1371 1372 if(invalid>0){ 1373 Tools.condenseStrict(list); 1374 } 1375 if(verbose){ 1376 outstream.println("Removed invalids: "+removedC+", "+removedP+", "+removedS); 1377 } 1378 return invalid; 1379 } 1380 1381 addToArray(HashMap<Long, ArrayList<Unit>> codeMap, boolean sort, boolean clear, long outNum)1382 private static ArrayList<Read> addToArray(HashMap<Long, ArrayList<Unit>> codeMap, boolean sort, boolean clear, long outNum){ 1383 assert(outNum<=Integer.MAX_VALUE); 1384 if(verbose){System.err.println("Making list.");} 1385 ArrayList<Read> list=new ArrayList<Read>((int)outNum); 1386 if(verbose){System.err.println("Adding.");} 1387 for(ArrayList<Unit> alu : codeMap.values()){ 1388 for(Unit u : alu){ 1389 if(u.valid() && u.r.pairnum()==0){list.add(u.r);} 1390 } 1391 if(clear){alu.clear();} 1392 } 1393 if(clear){codeMap.clear();} 1394 1395 if(sort){ 1396 if(verbose){System.err.println("Sorting.");} 1397 Shared.sort(list, comparator); 1398 // if(ascending){ 1399 // Collections.reverse(list); 1400 // assert(list.isEmpty() || list.get(0).length()<=list.get(list.size()-1).length()) : 1401 // list.get(0).length()+", "+list.get(list.size()-1).length(); 1402 // }else{ 1403 // assert(list.isEmpty() || list.get(0).length()>=list.get(list.size()-1).length()) : 1404 // list.get(0).length()+", "+list.get(list.size()-1).length(); 1405 // } 1406 } 1407 assert(list.size()==outNum || list.size()*2L==outNum || UNIQUE_ONLY) : list.size()+", "+outNum; 1408 return list; 1409 } 1410 writeOutput(String clusterStatsFile, Timer t)1411 private void writeOutput(String clusterStatsFile, Timer t){ 1412 // verbose=true; 1413 // assert(false) : (processedClusters==null)+", "+(processedClusters.isEmpty())+", "+outgraph+", "+out+", "+clusterFilePattern; 1414 if(processedClusters==null || processedClusters.isEmpty()){ 1415 1416 if(out!=null || clusterFilePattern!=null){ 1417 1418 ArrayList<Read> list=addToArray(codeMap, sort, true, addedToMain-containments); 1419 codeMap=null; 1420 1421 if(sort){ 1422 if(DISPLAY_PROGRESS){ 1423 t.stop(); 1424 outstream.println("Sorted output. Time: "+t); 1425 Shared.printMemory(); 1426 outstream.println(); 1427 t.start(); 1428 } 1429 } 1430 1431 writeOutput(list); 1432 } 1433 }else{ 1434 if(outgraph!=null){ 1435 writeGraph(outgraph, processedClusters); 1436 } 1437 if(out!=null || clusterFilePattern!=null || clusterStatsFile!=null || outbest!=null){ 1438 writeOutputClusters(clusterStatsFile, processedClusters); 1439 } 1440 } 1441 1442 if(DISPLAY_PROGRESS){ 1443 t.stop(); 1444 outstream.println("Printed output. Time: "+t); 1445 Shared.printMemory(); 1446 outstream.println(); 1447 t.start(); 1448 } 1449 } 1450 1451 1452 writeOutput(ArrayList<Read> list)1453 private void writeOutput(ArrayList<Read> list){ 1454 1455 final ByteStreamWriter tsw=(out==null ? null : new ByteStreamWriter(out, overwrite, append, true)); 1456 1457 if(verbose){System.err.println("Writing from array.");} 1458 tsw.start(); 1459 1460 HashSet<String> names=((uniqueNames && storeName) ? 1461 new HashSet<String>(Tools.min(Integer.MAX_VALUE, Tools.max((int)addedToMain, (int)(addedToMain*1.35)))) : null); 1462 long rid=0; 1463 for(int x=0; x<list.size(); x++){ 1464 Read r=list.get(x); 1465 list.set(x, null); 1466 1467 if(r.mate!=null && r.pairnum()!=0){r=r.mate;} 1468 1469 assert(r.mate==null || r.mate.discarded()==r.discarded()); 1470 1471 if(!r.discarded()){ 1472 rid++; 1473 1474 for(int i=0; r!=null && i<2; i++){ 1475 if(multipleInputFiles){r.numericID=rid;} 1476 if(names!=null){ 1477 String name=(r.id==null ? ""+r.numericID : r.id); 1478 if(names.contains(name)){ 1479 for(long j=0; j<Integer.MAX_VALUE; j++){ 1480 String name2=name+"_dd"+j; 1481 if(r.mate!=null){name2+=(" /"+(i+1));} 1482 if(!names.contains(name2)){ 1483 r.id=name2; 1484 names.add(name2); 1485 break; 1486 } 1487 } 1488 }else{ 1489 names.add(name); 1490 } 1491 } 1492 tsw.println(r); 1493 r.setDiscarded(true); 1494 r=r.mate; 1495 } 1496 } 1497 } 1498 if(verbose){System.err.println("Shutting down tsw "+tsw.fname);} 1499 tsw.poisonAndWait(); 1500 } 1501 1502 writeOutputClusters(String clusterStatsFile, ArrayList<ArrayList<Unit>> clist)1503 private void writeOutputClusters(String clusterStatsFile, ArrayList<ArrayList<Unit>> clist){ 1504 1505 // Shared.sort(clist, CLUSTER_LENGTH_COMPARATOR); //Causes a crash: java.lang.System.arraycopy(Native Method) 1506 clist.sort(CLUSTER_LENGTH_COMPARATOR); 1507 1508 if(verbose){System.err.println("Writing clusters.");} 1509 1510 final ByteStreamWriter tswAll=(out==null ? null : new ByteStreamWriter(out, overwrite, append, true)); 1511 if(tswAll!=null){tswAll.start();} 1512 ByteStreamWriter tswCluster=null; 1513 ByteStreamWriter tswBest=null; 1514 1515 if(outbest!=null){ 1516 tswBest=new ByteStreamWriter(outbest, overwrite, append, true); 1517 tswBest.start(); 1518 } 1519 1520 TextStreamWriter csf=null; 1521 if(clusterStatsFile!=null){ 1522 csf=new TextStreamWriter(clusterStatsFile, overwrite, false, false); 1523 csf.start(); 1524 csf.print("#Name\tsize\t"+nmerLength+"-mer frequencies\n"); 1525 } 1526 1527 HashSet<String> names=((uniqueNames && storeName) ? 1528 new HashSet<String>(Tools.min(Integer.MAX_VALUE, Tools.max((int)addedToMain, (int)(addedToMain*1.35)))) : null); 1529 long rid=0; 1530 final long[] nmerCounts=new long[maxNmer+1]; 1531 1532 final StringBuilder sb=new StringBuilder(64); 1533 1534 for(int cnum=0; cnum<clist.size(); cnum++){ 1535 final ArrayList<Unit> alu=clist.get(cnum); 1536 // clist.set(cnum, null); //This breaks subsequent output processing 1537 1538 if(alu.size()<minClusterSize){ 1539 if(verbose){System.err.println("Ignoring small cluster "+cnum+", size "+alu.size());} 1540 1541 if(csf!=null && alu.size()>=minClusterSizeForStats){ 1542 float[] profile=makeNmerProfile(alu, nmerCounts); 1543 sb.append("Cluster_"); 1544 sb.append(cnum); 1545 sb.append('\t'); 1546 sb.append(alu.size()); 1547 sb.append('\t'); 1548 for(float f : profile){ 1549 sb.append(String.format(Locale.ROOT, "%.5f ", f)); 1550 } 1551 sb.setCharAt(sb.length()-1, '\n'); 1552 csf.print(sb.toString()); 1553 sb.setLength(0); 1554 } 1555 }else{ 1556 if(verbose){System.err.println("Writing cluster "+cnum+", size "+alu.size());} 1557 1558 if(clusterFilePattern!=null){ 1559 if(tswCluster!=null){ 1560 if(verbose){System.err.println("Shutting down tswCluster "+tswCluster.fname);} 1561 tswCluster.poisonAndWait(); 1562 tswCluster=null; 1563 } 1564 tswCluster=new ByteStreamWriter(clusterFilePattern.replaceFirst("%", ""+cnum), overwrite, append, true); 1565 if(verbose){System.err.println("Starting tswCluster "+tswCluster.fname);} 1566 tswCluster.start(); 1567 } 1568 1569 if(csf!=null && alu.size()>=minClusterSizeForStats){ 1570 float[] profile=makeNmerProfile(alu, nmerCounts); 1571 sb.append("Cluster_"); 1572 sb.append(cnum); 1573 sb.append('\t'); 1574 sb.append(alu.size()); 1575 sb.append('\t'); 1576 for(float f : profile){ 1577 sb.append(String.format(Locale.ROOT, "%.5f ", f)); 1578 } 1579 sb.setCharAt(sb.length()-1, '\n'); 1580 csf.print(sb.toString()); 1581 sb.setLength(0); 1582 } 1583 1584 if(pickBestRepresentativePerCluster){ 1585 pickBestRepresenative(alu, true); 1586 } 1587 1588 if(outbest!=null){ 1589 Unit u=pickBestRepresenative((ArrayList<Unit>)alu.clone(), false); 1590 tswBest.println(u.r); 1591 if(u.r.mate!=null){tswBest.println(u.r.mate);} 1592 } 1593 1594 for(int contig=0; contig<alu.size(); contig++){ 1595 final Unit u0=alu.get(contig); 1596 alu.set(contig, null); 1597 Read r=u0.r; 1598 if(r.mate!=null && r.pairnum()!=0){r=r.mate;} 1599 1600 if(!r.discarded()){ 1601 rid++; 1602 1603 for(int i=0; r!=null && i<2; i++){ 1604 assert(r.pairnum()==i) : i+", "+r.pairnum()+", "+(r.mate==null ? 9 : r.mate.pairnum()); 1605 Unit u=(Unit)r.obj; 1606 if(verbose){System.err.println("Writing read "+r.id);} 1607 r.numericID=rid; 1608 if(renameClusters){ 1609 sb.append("Cluster_"); 1610 sb.append(cnum); 1611 sb.append(",contig_"); 1612 sb.append(contig); 1613 if(u.offsetValid()){ 1614 sb.append(",pos_"); 1615 sb.append(u.offset()); 1616 } 1617 if(r.mate!=null){sb.append(" /"+(i+1));} 1618 r.id=(r.id==null ? sb.toString() : r.id+"\t"+sb); 1619 sb.setLength(0); 1620 }else if(names!=null){ 1621 String name=(r.id==null ? ""+r.numericID : r.id); 1622 if(names.contains(name)){ 1623 for(long j=0; j<Integer.MAX_VALUE; j++){ 1624 String name2=name+"_dd"+j; 1625 if(!names.contains(name2)){ 1626 r.id=name2; 1627 names.add(name2); 1628 break; 1629 } 1630 } 1631 }else{ 1632 names.add(name); 1633 } 1634 } 1635 if(tswAll!=null){tswAll.println(r);} 1636 if(tswCluster!=null){tswCluster.println(r);} 1637 r.setDiscarded(true); 1638 r=r.mate; 1639 } 1640 } 1641 } 1642 } 1643 } 1644 if(csf!=null){ 1645 if(verbose){System.err.println("Shutting down csf "+csf.fname);} 1646 csf.poisonAndWait(); 1647 } 1648 if(tswBest!=null){ 1649 if(verbose){System.err.println("Shutting down tswBest "+tswBest.fname);} 1650 tswBest.poisonAndWait(); 1651 } 1652 if(tswAll!=null){ 1653 if(verbose){System.err.println("Shutting down tswAll "+tswAll.fname);} 1654 tswAll.poisonAndWait(); 1655 } 1656 if(tswCluster!=null){ 1657 if(verbose){System.err.println("Shutting down tswCluster "+tswCluster.fname);} 1658 tswCluster.poisonAndWait(); 1659 } 1660 } 1661 1662 writeGraph(String graphFile, ArrayList<ArrayList<Unit>> clist)1663 private void writeGraph(String graphFile, ArrayList<ArrayList<Unit>> clist){ 1664 // Shared.sort(clist, CLUSTER_LENGTH_COMPARATOR); 1665 clist.sort(CLUSTER_LENGTH_COMPARATOR); 1666 1667 if(verbose){System.err.println("Writing overlap graph.");} 1668 1669 final TextStreamWriter tsw=(graphFile==null ? null : new TextStreamWriter(graphFile, overwrite, append, true)); 1670 if(tsw!=null){ 1671 tsw.start(); 1672 tsw.print("digraph G {\n"); 1673 } 1674 1675 for(int cnum=0; cnum<clist.size(); cnum++){ 1676 final ArrayList<Unit> alu=clist.get(cnum); 1677 // clist.set(cnum, null); //This breaks subsequent output processing 1678 // Shared.sort(alu); //TODO: Remove 1679 1680 if(alu.size()<minClusterSize){ 1681 if(verbose){System.err.println("Ignoring small cluster "+cnum+", size "+alu.size());} 1682 }else{ 1683 if(verbose){System.err.println("Processing cluster "+cnum+", size "+alu.size());} 1684 1685 for(int contig=0; contig<alu.size(); contig++){ 1686 final Unit u0=alu.get(contig); 1687 // alu.set(contig, null); //This breaks subsequent output processing 1688 Read r=u0.r; 1689 if(r.mate!=null && r.pairnum()!=0){r=r.mate;} 1690 1691 { 1692 for(int i=0; r!=null && i<2; i++){ 1693 assert(r.pairnum()==i) : i+", "+r.pairnum()+", "+(r.mate==null ? 9 : r.mate.pairnum()); 1694 Unit u=(Unit)r.obj; 1695 if(verbose){System.err.println("Writing read "+r.id);} 1696 1697 if(tsw!=null){ 1698 tsw.print("\t"+toGraphName(r)+"\n"); 1699 if(r.mate!=null && r.pairnum()==0){ 1700 Read r2=r.mate; 1701 tsw.print(toGraphName(r)+" -> "+toGraphName(r2)+" [label=mate]"); 1702 } 1703 if(u.overlapList!=null){ 1704 for(Overlap o : u.overlapList){ 1705 if(u==o.u1){ 1706 Read r2=o.u2.r; 1707 tsw.print("\t"+toGraphName(r)+" -> "+toGraphName(r2)+" [label=\""+o.toLabel()+"\"]\n"); 1708 } 1709 } 1710 } 1711 } 1712 r=r.mate; 1713 } 1714 } 1715 } 1716 } 1717 } 1718 1719 if(tsw!=null){ 1720 tsw.print("}\n"); 1721 if(verbose){System.err.println("Shutting down tswAll "+tsw.fname);} 1722 tsw.poisonAndWait(); 1723 } 1724 } 1725 toGraphName(Read r)1726 private static String toGraphName(Read r){ 1727 if(NUMBER_GRAPH_NODES || r.id==null){ 1728 return r.numericID+((ADD_PAIRNUM_TO_NAME || r.mate!=null) ? "."+(r.pairnum()+1) : ""); 1729 }else{ 1730 return r.id.replace(' ','_').replace('\t','_'); 1731 } 1732 } 1733 pickBestRepresenative(ArrayList<Unit> alu, boolean clearList)1734 private Unit pickBestRepresenative(ArrayList<Unit> alu, boolean clearList){ 1735 if(alu==null || alu.isEmpty()){return null;} 1736 float[] quality=new float[alu.size()]; 1737 int[] lengths=new int[alu.size()]; 1738 for(int i=0; i<alu.size(); i++){ 1739 Unit u=alu.get(i); 1740 int len=u.r.length(); 1741 quality[i]=u.r.expectedErrors(true, 0)/len; 1742 lengths[i]=len; 1743 } 1744 Arrays.sort(quality); 1745 Arrays.sort(lengths); 1746 int medianLength=lengths[lengths.length/2]; 1747 float bestQuality=quality[0]; 1748 1749 float currentBestQuality=9999999; 1750 Unit best=null; 1751 for(int i=0; i<alu.size(); i++){ 1752 Unit u=alu.get(i); 1753 int len=u.r.length(); 1754 float deviation=Tools.absdif(len, medianLength)*1f/(medianLength+1); 1755 if(deviation<0.05){ 1756 float qual=u.r.expectedErrors(true, 0)/len; 1757 qual=(qual+.001f)*(1+10*deviation); 1758 if(qual<currentBestQuality || best==null){ 1759 currentBestQuality=qual; 1760 best=u; 1761 } 1762 } 1763 } 1764 if(clearList){ 1765 alu.clear(); 1766 alu.add(best); 1767 } 1768 return best; 1769 } 1770 hash(byte[] bases)1771 public static long hash(byte[] bases){ 1772 long code=bases.length; 1773 for(int i=0; i<bases.length; i++){ 1774 byte b=bases[i]; 1775 int mode=(int)(code&31); 1776 assert(hashcodes[b]!=null) : "Invalid sequence character: '"+(char)b+"'"; 1777 code=code^hashcodes[b][mode]; 1778 code=Long.rotateLeft(code, 1); 1779 } 1780 return code; 1781 } 1782 1783 hashReversed(byte[] bases)1784 public static long hashReversed(byte[] bases){ 1785 long code=bases.length; 1786 for(int i=bases.length-1; i>=0; i--){ 1787 byte b=bases[i]; 1788 assert(hashcodes[b]!=null) : "Invalid sequence character: '"+(char)b+"'"; 1789 b=baseToComplementExtended[b]; 1790 int mode=(int)(code&31); 1791 code=code^hashcodes[b][mode]; 1792 code=Long.rotateLeft(code, 1); 1793 } 1794 return code; 1795 } 1796 1797 isCanonical(byte[] bases)1798 public static boolean isCanonical(byte[] bases){ 1799 if(ignoreReverseComplement || bases==null || bases.length==0){return true;} 1800 final int lim=(bases.length+1)/2; 1801 for(int i=0, j=bases.length-1; i<lim; i++, j--){ 1802 byte a=bases[i], b=baseToComplementExtended[bases[j]]; 1803 if(a<b){return true;} 1804 if(b<a){return false;} 1805 } 1806 assert((bases.length&1)==0 || bases[lim-1]==baseToComplementExtended[bases[lim-1]]) : 1807 bases.length+", "+lim+", "+bases[lim-1]+", "+(char)bases[lim-1]+(bases.length<1000 ? "\n'"+new String(bases)+"'\n" : ""); //palindrome absorb 1808 return true; //palindrome 1809 } 1810 1811 1812 private static synchronized long[][] makeCodes(int symbols, int modes){ 1813 Random randy=new Random(1); 1814 long[][] r=new long[symbols][modes]; 1815 for(int i=0; i<symbols; i++){ 1816 for(int j=0; j<modes; j++){ 1817 r[i][j]=randy.nextLong(); 1818 } 1819 } 1820 return r; 1821 } 1822 1823 // /** Handles IUPAC codes */ 1824 // private static synchronized long[][] makeCodes2(int modes){ 1825 // long[][] r0=makeCodes(26, modes); 1826 // long[][] r=new long[Tools.max('Z','z')+1][]; 1827 // for(int i=0; i<26; i++){ 1828 // char c=(char)('A'+i); 1829 // r[c]=r[Tools.toLowerCase(c)]=r0[i]; 1830 // } 1831 // return r; 1832 // } 1833 1834 /** Handles IUPAC codes and invalid symbols */ 1835 private static synchronized long[][] makeCodes2(int modes){ 1836 long[][] r=makeCodes(128, modes); 1837 1838 for(int i=0; i<26; i++){ 1839 char c=(char)('A'+i); 1840 r[Tools.toLowerCase(c)]=r[c]; 1841 } 1842 return r; 1843 } 1844 1845 private void addDupe(Read r, Read primary){ 1846 if(mergeHeaders && primary!=null && r!=null && r.id!=null){ 1847 primary.id+=mergeDelimiter+r.id; 1848 } 1849 if(dupeWriter==null){return;} 1850 if(r.mate==null || r.pairnum()==0){ 1851 synchronized(dupeWriter){ 1852 dupeWriter.println(r); 1853 if(r.mate!=null){ 1854 dupeWriter.println(r.mate); 1855 } 1856 } 1857 } 1858 } 1859 1860 1861 private final class MstThread extends Thread{ 1862 1863 public MstThread(){} 1864 1865 @Override 1866 public void run(){ 1867 1868 ArrayList<Unit> cluster=null; 1869 while((cluster=nextCluster())!=null){ 1870 makeMst(cluster); 1871 processedT.add(cluster); 1872 } 1873 1874 } 1875 1876 public void makeMst(ArrayList<Unit> cluster){ 1877 assert(heap.isEmpty()); 1878 unvisit(cluster); 1879 for(Unit u : cluster){ 1880 u.flags&=~Unit.VISIT_MASK; 1881 Shared.sort(u.overlapList); 1882 } 1883 { 1884 Unit u=cluster.get(0); 1885 u.setVisited(true); 1886 heap.addAll(u.overlapList); 1887 } 1888 // assert(false) : cluster.size(); 1889 while(!heap.isEmpty()){ 1890 Overlap o=heap.poll(); 1891 assert(!o.mst()); 1892 if(!o.invalid()){ 1893 // assert(o.u1.overlapList.contains(o)); //slow 1894 // assert(o.u2.overlapList.contains(o)); //slow 1895 assert(o.u1.visited() || o.u2.visited()); 1896 final Unit u=(!o.u1.visited() ? o.u1 : !o.u2.visited()? o.u2 : null); 1897 if(u!=null){ 1898 o.setMst(true); 1899 u.setVisited(true); 1900 overlapsRetainedT++; 1901 overlapBasesRetainedT+=o.overlapLen; 1902 for(Overlap o2 : u.overlapList){ 1903 if(o2.mst()){ 1904 //do nothing 1905 }else if(!o2.u1.visited() || !o2.u2.visited()){ 1906 if(heap.size()>=Integer.MAX_VALUE){ 1907 removeInvalid(heap); 1908 } 1909 heap.add(o2); 1910 }else if(!o2.invalid()){ 1911 o2.setInvalid(true); 1912 overlapsRemovedT++; 1913 overlapBasesRemovedT+=o2.overlapLen; 1914 } 1915 } 1916 } 1917 } 1918 } 1919 for(Unit u : cluster){ 1920 ArrayList<Overlap> alo=u.overlapList; 1921 int removed=0; 1922 for(int i=0; i<alo.size(); i++){ 1923 Overlap o=alo.get(i); 1924 if(o.invalid()){ 1925 assert(!o.mst()); 1926 alo.set(i, null); 1927 removed++; 1928 }else{ 1929 assert(o.mst()); 1930 } 1931 } 1932 if(removed>0){ 1933 Tools.condenseStrict(alo); 1934 alo.trimToSize(); 1935 } 1936 } 1937 } 1938 1939 private void removeInvalid(PriorityQueue<Overlap> heap){ 1940 ArrayList<Overlap> valid=new ArrayList<Overlap>(heap.size()); 1941 for(Overlap o : heap){ 1942 if(!o.invalid()){ 1943 assert(!o.u1.visited() || !o.u2.visited()); 1944 valid.add(o); 1945 } 1946 } 1947 heap.clear(); 1948 heap.addAll(valid); 1949 } 1950 1951 1952 public long overlapsRemovedT=0; 1953 public long overlapBasesRemovedT=0; 1954 public long overlapsRetainedT=0; 1955 public long overlapBasesRetainedT=0; 1956 1957 private final PriorityQueue<Overlap> heap=new PriorityQueue<Overlap>((1<<16)-1); 1958 private ArrayList<ArrayList<Unit>> processedT=new ArrayList<ArrayList<Unit>>(); 1959 } 1960 1961 1962 /** 1963 * Processes clustered sets of reads. 1964 * @author Brian Bushnell 1965 * @date Aug 9, 2013 1966 * 1967 */ 1968 private final class ClusterThread extends Thread{ 1969 1970 public ClusterThread(boolean fixMultiJoins_, boolean canonicize_, boolean removeCycles_, 1971 boolean fixCanonContradictions_, boolean fixOffsetContradictions_, boolean mergeClusters_, boolean mergeLeaves_, boolean mergeInner_){ 1972 fixMultiJoinsT=fixMultiJoins_; 1973 canonicizeT=canonicize_; 1974 fixCanonContradictionsT=fixCanonContradictions_; 1975 fixOffsetContradictionsT=fixOffsetContradictions_; 1976 mergeClustersT=mergeClusters_; 1977 mergeLeavesT=mergeLeaves_; 1978 mergeInnerT=mergeInner_; 1979 1980 // assert(false) : fixMultiJoinsT+", "+canonicizeT+", "+fixCanonContradictionsT+", "+mergeLeavesT+", "+mergeInnerT; 1981 bandy=(maxEdits>0 ? BandedAligner.makeBandedAligner(bandwidth) : null); 1982 // assert(false) : fixMultiJoinsT+", "+canonicizeT+", "+fixCanonContradictionsT+", "+fixOffsetContradictionsT+", "+mergeClustersT+", "+removeCycles_; 1983 } 1984 1985 @Override 1986 public void run(){ 1987 1988 final ArrayList<Unit> temp=new ArrayList<Unit>(1000); 1989 1990 ArrayList<Unit> cluster=null; 1991 while((cluster=nextCluster())!=null){ 1992 1993 if(EA){ 1994 for(Unit u : cluster){assert(u.r.mate==null) : "Cluster processing/merging is not supported for paired reads, only cluster generation.";} 1995 } 1996 1997 // for(Unit u : cluster){assert(!u.visited());} 1998 unvisit(cluster); 1999 2000 reorderClusterBreadthFirst(cluster); 2001 int multiJoinCount=findMultiJoinsInCluster(cluster, fixMultiJoinsT); 2002 2003 if(EA){ 2004 for(Unit u : cluster){assert(!u.visited());} 2005 } 2006 2007 boolean ok=true; 2008 if(multiJoinCount!=0){ 2009 assert(multiJoinCount>0); 2010 multiJoinsFoundT+=multiJoinCount; 2011 if(!fixMultiJoinsT){ 2012 multiJoinFailuresT++; 2013 ok=false; 2014 } 2015 } 2016 2017 int canonContradictions=0; 2018 if(ok && canonicizeT){ 2019 if(EA){ 2020 for(Unit u : cluster){ 2021 assert(!u.visited()); 2022 assert(!u.canonContradiction()); 2023 assert(!u.canonicized()); 2024 if(u.overlapList!=null){ 2025 for(Overlap o : u.overlapList){ 2026 assert(!o.invalid()); 2027 assert(!o.canonContradiction()) : 2028 o.u1.canonContradiction()+", "+o.u2.canonContradiction()+", "+cluster.contains(o.u1)+", "+cluster.contains(o.u2); 2029 } 2030 } 2031 } 2032 } 2033 canonContradictions=canonicizeClusterBreadthFirst(cluster, temp); 2034 // System.err.println("Canonicized cluster of size "+cluster.size()+"; contradictions = "+canonContradictions+"; canonicized = "+temp.size()); 2035 temp.clear(); 2036 for(Unit u : cluster){assert(!u.visited());} 2037 if(canonContradictions>0){ 2038 canonContradictoryOverlapsT+=canonContradictions; 2039 canonContradictoryClustersT++; 2040 if(fixCanonContradictionsT){ 2041 if(verbose){System.err.println("Pruning cluster to remove canonization contradictions.");} 2042 fullyPruneCluster(cluster, temp); 2043 if(verbose){System.err.println("Resulting size: "+cluster.size());} 2044 if(EA){ 2045 for(Unit u : cluster){ 2046 assert(!u.visited()); 2047 assert(!u.canonContradiction()); 2048 assert(u.canonicized()); 2049 if(u.overlapList!=null){ 2050 for(Overlap o : u.overlapList){ 2051 assert(!o.invalid()); 2052 assert(!o.canonContradiction()); 2053 assert(o.type==FORWARD) : "\n"+o+"\n"+ 2054 o.u1.canonContradiction()+", "+o.u2.canonContradiction()+", "+o.u1.canonicized()+", "+o.u2.canonicized()+ 2055 "\n"+cluster.contains(o.u1)+", "+cluster.contains(o.u2)+", "+cluster.size(); 2056 } 2057 } 2058 } 2059 } 2060 }else{ 2061 ok=false; 2062 } 2063 } 2064 } 2065 2066 int cycleOverlaps=0; 2067 if(ok){ 2068 cycleOverlaps=findCycles(cluster, removeCycles); 2069 for(Unit u : cluster){assert(!u.visited());} 2070 if(cycleOverlaps>0){ 2071 cycleOverlapsT+=cycleOverlaps; 2072 cycleClustersT++; 2073 } 2074 } 2075 2076 int offsetContradictions=0; 2077 if(ok && fixOffsetContradictionsT){ 2078 if(EA){ 2079 for(Unit u : cluster){ 2080 assert(!u.visited()); 2081 assert(!u.offsetContradiction()); 2082 assert(!u.offsetValid()); 2083 assert(u.canonicized()); 2084 if(u.overlapList!=null){ 2085 for(Overlap o : u.overlapList){ 2086 assert(!o.invalid()); 2087 assert(!o.offsetContradiction()); 2088 assert(o.type==FORWARD) : o; 2089 } 2090 } 2091 } 2092 } 2093 offsetContradictions=generateOffsetsBreadthFirst(cluster, temp); 2094 // System.err.println("Made offsets for cluster of size "+cluster.size()+"; contradictions = "+offsetContradictions+"; set = "+temp.size()); 2095 temp.clear(); 2096 for(Unit u : cluster){assert(!u.visited());} 2097 if(offsetContradictions>0){ 2098 offsetContradictoryOverlapsT+=offsetContradictions; 2099 offsetContradictoryClustersT++; 2100 if(fixOffsetContradictionsT){ 2101 if(verbose){System.err.println("Pruning cluster to remove offset contradictions.");} 2102 fullyPruneCluster(cluster, temp); 2103 if(verbose){System.err.println("Resulting size: "+cluster.size());} 2104 if(EA){ 2105 for(Unit u : cluster){ 2106 assert(!u.visited()); 2107 assert(!u.offsetContradiction()); 2108 assert(u.offsetValid()); 2109 if(u.overlapList!=null){ 2110 for(Overlap o : u.overlapList){ 2111 assert(!o.invalid()); 2112 assert(!o.offsetContradiction()); 2113 assert(o.type==FORWARD) : o; 2114 } 2115 } 2116 } 2117 } 2118 }else{ 2119 ok=false; 2120 } 2121 } 2122 if(ok){Shared.sort(cluster, UNIT_OFFSET_COMPARATOR);} 2123 } 2124 2125 if(ok && absorbOverlap){ 2126 mergeCluster(cluster); 2127 } 2128 processedClustersT.add(cluster)2129 processedClustersT.add(cluster); 2130 if(processedClustersT.size()>=threadMaxReadsToBuffer){ 2131 synchronized(processedClusters){ 2132 processedClusters.addAll(processedClustersT); processedClustersT.clear()2133 processedClustersT.clear(); 2134 } 2135 } 2136 } 2137 synchronized(processedClusters){ 2138 processedClusters.addAll(processedClustersT); processedClustersT.clear()2139 processedClustersT.clear(); 2140 } 2141 } 2142 2143 private void fullyPruneCluster(ArrayList<Unit> cluster, ArrayList<Unit> temp){ 2144 assert(cluster.size()>1) : cluster.size(); 2145 ArrayList<Unit> pruned=pruneCluster(cluster, true, true, temp); 2146 assert(temp.isEmpty()); 2147 assert(pruned==null || pruned.size()>0); 2148 while(pruned!=null){ 2149 ArrayList<Unit> subcluster=pruned; 2150 for(Unit u : subcluster){ 2151 u.clearVolatileFlags(); 2152 if(u.overlapList!=null){ 2153 for(Overlap o : u.overlapList){ 2154 o.clearVolatileFlags(); 2155 } 2156 } 2157 } 2158 assert(subcluster.size()>0); 2159 pruned=pruneCluster(subcluster, false, false, temp); 2160 assert(temp.isEmpty()); 2161 assert(pruned==null || pruned.size()>0); 2162 assert(subcluster.size()>0); 2163 if(subcluster.size()==1){ 2164 processedClustersT.add(subcluster); 2165 }else{ 2166 assert(subcluster.size()>1); 2167 synchronized(clusterQueue){ 2168 clusterQueue.add(subcluster); 2169 } 2170 } 2171 } 2172 } 2173 2174 /** 2175 * @param cluster 2176 */ 2177 private void mergeCluster(ArrayList<Unit> cluster) { 2178 if(cluster.size()==1){return;} 2179 if(mergeLeavesT){ 2180 mergeLeaves(cluster); 2181 } 2182 if(mergeInnerT){ 2183 mergeInner(cluster); 2184 } 2185 } 2186 2187 /** 2188 * Finds places in the cluster where two Units are joined by multiple different Overlaps. 2189 * Returns number of multijoins found. 2190 * @param cluster 2191 */ 2192 private int findMultiJoinsInCluster(ArrayList<Unit> cluster, boolean resolveProblems) { 2193 if(cluster.size()<2){return 0;} 2194 int totalMultiJoins=0; 2195 for(Unit ua : cluster){ 2196 ArrayList<Overlap> list=ua.overlapList; 2197 assert(list!=null); 2198 if(list.size()>1){ 2199 Shared.sort(list); 2200 2201 int multiJoins=0; 2202 for(int i=0; i<list.size(); i++){ 2203 Overlap o=list.get(i); 2204 Unit ub=(o.u1==ua ? o.u2 : o.u1); 2205 assert(ua!=ub); 2206 assert(ua==o.u1 || ua==o.u2); 2207 if(ub.visited()){ 2208 multiJoins++; 2209 multiJoinBasesFoundT+=o.overlapLen; 2210 if(!o.multiJoin()){o.setMultiJoin(true);} 2211 if(resolveProblems){list.set(i, null);} 2212 }else{ 2213 ub.setVisited(true); 2214 } 2215 } 2216 2217 if(multiJoins>0){ 2218 totalMultiJoins+=multiJoins; 2219 if(resolveProblems){Tools.condenseStrict(list);} 2220 } 2221 2222 for(int i=0; i<list.size(); i++){ 2223 Overlap o=list.get(i); 2224 Unit ub=(o.u1==ua ? o.u2 : o.u1); 2225 assert(ua!=ub); 2226 assert(ua==o.u1 || ua==o.u2); 2227 assert(ub.visited()); 2228 ub.setVisited(false); 2229 } 2230 } 2231 2232 } 2233 2234 return totalMultiJoins; 2235 } 2236 2237 private ArrayList<Unit> pruneCluster(ArrayList<Unit> cluster, boolean pruneContradictoryNodes, boolean pruneContradictoryOverlaps, ArrayList<Unit> visited){ 2238 if(verbose){System.err.println("pruneCluster(size="+cluster.size()+", "+pruneContradictoryNodes+", "+pruneContradictoryOverlaps+")");} 2239 2240 //pruneContradictoryOverlaps is less strict than pruneContradictoryNodes 2241 assert(pruneContradictoryOverlaps || !pruneContradictoryNodes); 2242 2243 for(Unit ua : cluster){ 2244 assert(!ua.visited()); 2245 assert(ua.isPerfectlyTransitive()) : ua; 2246 if(ua.visited()){ua.setVisited(false);} 2247 } 2248 2249 int prunedOverlaps=0; 2250 int visits=1; 2251 2252 { 2253 final Unit root=cluster.get(0); 2254 assert(!root.contradiction()); 2255 root.setVisited(true); 2256 visited.add(root); 2257 } 2258 2259 for(int i=0; i<visited.size(); i++){ 2260 Unit ua=visited.get(i); 2261 2262 if(ua.visited() && (!ua.contradiction() || !pruneContradictoryNodes)){ 2263 ArrayList<Overlap> list=ua.overlapList; 2264 if(list!=null){ 2265 int removed=0; 2266 for(int j=0; j<list.size(); j++){ 2267 Overlap o=list.get(j); 2268 Unit ub=(o.u1==ua ? o.u2 : o.u1); 2269 assert(o.u1==ua || o.u2==ua); 2270 assert(ua!=ub); 2271 assert(ub.valid()); 2272 2273 assert(!o.canonContradiction() || (ua.canonContradiction() || ub.canonContradiction())) : 2274 "\n"+o.canonContradiction()+", "+ua.canonContradiction()+", "+ub.canonContradiction(); 2275 2276 assert(!o.offsetContradiction() || (ua.offsetContradiction() || ub.offsetContradiction())) : 2277 "\n"+o.offsetContradiction()+", "+ua.offsetContradiction()+", "+ub.offsetContradiction(); 2278 2279 // assert(o.contradiction()==(ua.contradiction() && ub.contradiction())) : 2280 // "\n"+o.canonContradiction()+", "+o.offsetContradiction()+ 2281 // "\n"+ua.canonContradiction()+", "+ua.offsetContradiction()+ 2282 // "\n"+ub.canonContradiction()+", "+ub.offsetContradiction(); 2283 2284 final boolean remove=(pruneContradictoryNodes && ub.contradiction() || (pruneContradictoryOverlaps && o.contradiction())); 2285 if(!remove && !ub.visited()){ 2286 ub.setVisited(true); 2287 visited.add(ub); 2288 visits++; 2289 } 2290 2291 if(remove){ 2292 if(!o.invalid()){o.setInvalid(true);} 2293 list.set(j, null); 2294 removed++; 2295 prunedOverlaps++; 2296 }else{ 2297 assert(!o.invalid()); 2298 } 2299 } 2300 if(removed>0){Tools.condenseStrict(list);} 2301 } 2302 } 2303 } 2304 2305 if(verbose){System.err.println("cluster.size()="+cluster.size()+", visits="+visits+", visited.size()="+visited.size());} 2306 2307 // if(visited.size()==11486){ //TODO: For testing. Remove. 2308 // for(int i=0; i<visited.size(); i++){ 2309 // Unit u=visited.get(i); 2310 // assert(u.visited()); 2311 // assert(!u.canonContradiction()); 2312 // assert(u.canonicized()); 2313 // for(Overlap o : u.overlapList){ 2314 // assert(!o.canonContradiction()); 2315 // assert(o.type==FORWARD) : "\n\no="+o+"\ni="+i+", u.overlapList.size="+u.overlapList.size()+"\n"+ 2316 // o.u1.canonContradiction()+", "+o.u2.canonContradiction()+", "+o.u1.canonicized()+", "+o.u2.canonicized()+ 2317 // "\n"+visited.contains(o.u1)+", "+visited.contains(o.u2)+", "+visited.size()+ 2318 // "\n"+u.overlapList; 2319 // } 2320 // } 2321 // } 2322 2323 final int numUnvisited=cluster.size()-visits; 2324 ArrayList<Unit> pruned=(numUnvisited==0 ? null : new ArrayList<Unit>(numUnvisited)); 2325 assert(visits==visited.size()); 2326 assert(visits>=1 && visits<=cluster.size()); 2327 2328 if(visits<cluster.size()){ 2329 pruned=new ArrayList<Unit>(cluster.size()-visits); 2330 for(Unit ua : cluster){ 2331 if(!ua.visited()){ 2332 pruned.add(ua); 2333 ArrayList<Overlap> list=ua.overlapList; 2334 if(list!=null){ 2335 int removed=0; 2336 for(int j=0; j<list.size(); j++){ 2337 Overlap o=list.get(j); 2338 Unit ub=(o.u1==ua ? o.u2 : o.u1); 2339 assert(o.u1==ua || o.u2==ua); 2340 assert(ua!=ub); 2341 assert(ub.valid()); 2342 2343 if(ub.visited() || o.invalid()){ 2344 assert(ub.visited()==o.invalid()) : "\n"+o+"\n"+ub; 2345 list.set(j, null); 2346 removed++; 2347 } 2348 } 2349 if(removed>0){Tools.condenseStrict(list);} 2350 } 2351 } 2352 } 2353 assert(pruned.size()==numUnvisited); 2354 }else{ 2355 assert(prunedOverlaps==0) : "If this fails then I may need to mark overlaps to remove."; 2356 } 2357 for(Unit u : cluster){ 2358 assert(u.isPerfectlyTransitive()) : u; 2359 if(EA){ 2360 if(u.overlapList!=null){ 2361 for(Overlap o : u.overlapList){assert(!o.invalid());} 2362 } 2363 } 2364 if(u.visited()){u.setVisited(false);} 2365 } 2366 cluster.clear(); 2367 cluster.addAll(visited); 2368 cluster.trimToSize(); 2369 2370 // for(Unit u : cluster){ 2371 //// assert(u.canonicized()); 2372 // for(Overlap o : u.overlapList){ 2373 // assert(pruned==null || !pruned.contains(o.u1)); 2374 // assert(pruned==null || !pruned.contains(o.u2)); 2375 // assert(cluster.contains(o.u1)); 2376 // assert(cluster.contains(o.u2)); 2377 // } 2378 // } 2379 // if(pruned!=null){ 2380 // for(Unit u : pruned){ 2381 // for(Overlap o : u.overlapList){ 2382 // assert(pruned.contains(o.u1)); 2383 // assert(pruned.contains(o.u2)); 2384 // assert(!cluster.contains(o.u1)); 2385 // assert(!cluster.contains(o.u2)); 2386 // } 2387 // } 2388 // } 2389 2390 visited.clear(); 2391 return pruned; 2392 } 2393 2394 /** 2395 * Cluster should already be ordered breadth-first 2396 * This may fail because removing cycles could change breadth-first traversal, but if it fails, an assertion will be thrown 2397 * @param cluster 2398 */ 2399 private int findCycles(ArrayList<Unit> cluster, boolean remove){ 2400 2401 { 2402 final Unit root=cluster.get(0); 2403 assert(root.length()>=cluster.get(cluster.size()-1).length()); 2404 root.setVisited(true); 2405 } 2406 int cycles=0; 2407 2408 for(Unit ua : cluster){ 2409 assert(ua.visited()); 2410 ArrayList<Overlap> list=ua.overlapList; 2411 if(list!=null){ 2412 int removed=0; 2413 for(int i=0; i<list.size(); i++){ 2414 Overlap o=list.get(i); 2415 Unit ub=(o.u1==ua ? o.u2 : o.u1); 2416 assert(o.u1==ua || o.u2==ua); 2417 assert(ua!=ub); 2418 assert(ub.valid()); 2419 2420 if(!o.visited()){ 2421 o.setVisited(true); 2422 if(ub.visited()){ 2423 if(!o.cyclic()){ 2424 o.setCyclic(true); 2425 cycles++; 2426 } 2427 }else{ 2428 ub.setVisited(true); 2429 } 2430 } 2431 if(remove && o.cyclic()){ 2432 list.set(i, null); 2433 removed++; 2434 } 2435 } 2436 if(removed>0){Tools.condenseStrict(list);} 2437 } 2438 } 2439 2440 for(Unit u : cluster){ 2441 if(u.visited()){u.setVisited(false);} 2442 if(u.overlapList!=null){ 2443 for(Overlap o : u.overlapList){ 2444 if(o.visited()){o.setVisited(false);} 2445 } 2446 } 2447 } 2448 2449 return cycles; 2450 } 2451 2452 /** 2453 * Cluster should already be ordered breadth-first 2454 * @param cluster 2455 */ 2456 private int generateOffsetsBreadthFirst(ArrayList<Unit> cluster, ArrayList<Unit> temp){ 2457 2458 2459 assert(temp!=null); 2460 assert(temp.isEmpty()); 2461 { 2462 final Unit root=cluster.get(0); 2463 assert(root.length()>=cluster.get(cluster.size()-1).length()); 2464 root.setOffset(0); 2465 temp.add(root); 2466 } 2467 2468 int contradictions=0; 2469 for(int i=0; i<temp.size(); i++){ 2470 Unit u=temp.get(i); 2471 assert(!u.visited()) : i; 2472 assert(u.offsetValid() || contradictions>0) : i+", "+temp.size()+", "+contradictions+"\n"+toString(temp); 2473 if(u.offsetValid() && !u.offsetContradiction()){ 2474 contradictions+=setOffsetsNeighbors(u, temp); 2475 assert(contradictions==0 || (i>0 && temp.size()>2)); 2476 } 2477 } 2478 2479 int min=0; 2480 for(Unit u : temp){ 2481 if(u.visited()){u.setVisited(false);} 2482 if(u.overlapList!=null){ 2483 for(Overlap o : u.overlapList){ 2484 if(o.visited()){o.setVisited(false);} 2485 } 2486 } 2487 if(u.offsetValid() && !u.offsetContradiction()){ 2488 min=Tools.min(min, u.offset()); 2489 } 2490 } 2491 2492 if(verbose){ 2493 System.err.println("min offset = "+min); 2494 } 2495 2496 for(Unit u : temp){ 2497 if(u.offsetValid()){ 2498 if(verbose){System.err.println("Set "+u.name()+" offset from "+u.offset+" to "+(u.offset-min));} 2499 u.offset=u.offset-min; 2500 } 2501 } 2502 2503 2504 return contradictions; 2505 } 2506 2507 /** 2508 * @param root 2509 */ 2510 private int setOffsetsNeighbors(final Unit root, final ArrayList<Unit> temp) { 2511 if(verbose){System.err.println("\nsetOffsetsNeighbors("+root.name()+")\nroot.code1="+root.code1+"\n");} 2512 assert(root.valid()); 2513 assert(!root.visited()); 2514 assert(root.offsetValid()); 2515 assert(!root.offsetContradiction()); 2516 root.setVisited(true); 2517 if(root.overlapList==null){return 0;} 2518 final int contradictions=countOffsetContradictions(root, false); 2519 if(verbose){System.err.println("\ncontradictions="+contradictions);} 2520 for(Overlap o : root.overlapList){ 2521 Unit u=(o.u1==root ? o.u2 : o.u1); 2522 assert(o.u1==root || o.u2==root); 2523 assert(root!=u); 2524 assert(u.valid()); 2525 2526 if(verbose){System.err.println("\nProcessing Overlap "+o);} 2527 if(!o.visited() && !o.offsetContradiction()){ 2528 o.setVisited(true); 2529 if(!u.offsetContradiction()){ 2530 if(verbose){System.err.println("Calling setOffset: "+o);} 2531 if(!u.offsetValid()){temp.add(u);} 2532 boolean b=setOffset(root, u, o); 2533 if(verbose){System.err.println("Finished setOffset: "+o);} 2534 2535 // if(x>0){ 2536 // if(verbose){System.err.println("\n*********************************************");} 2537 // if(verbose){System.err.println("Problem detected with contig "+u.name());} 2538 // if(verbose){System.err.println("*********************************************\n");} 2539 // verbose=true; 2540 // int y2=countOffsetContradictions(root, false); 2541 // assert(contradictions==y2); 2542 // } 2543 2544 assert(b) : "\n"+contradictions+", "+o.offsetContradiction()+", "+root.offsetContradiction()+", "+u.offsetContradiction()+"\n" 2545 +root.offsetValid()+", "+u.offsetValid()+", "+OVERLAP_TYPE_NAMES[o.type]+"\n"+b 2546 +fixMultiJoins; //This assertion can fail if a multijoin is present 2547 assert(u.offsetValid()); 2548 } 2549 } 2550 } 2551 return contradictions; 2552 } 2553 2554 private int countOffsetContradictions(Unit root, boolean includeKnown){ 2555 if(verbose){System.err.println("\ncountContradictions("+root.name()+", "+includeKnown+")\nroot.code1="+root.code1+"\n");} 2556 assert(root.valid()); 2557 assert(root.visited()); 2558 assert(root.offsetValid()); 2559 // assert(!root.offsetContradiction()); 2560 if(root.overlapList==null){return 0;} 2561 int contradictions=0; 2562 for(Overlap o : root.overlapList){ 2563 Unit u=(o.u1==root ? o.u2 : o.u1); 2564 assert(o.u1==root || o.u2==root); 2565 assert(root!=u); 2566 assert(u.valid()); 2567 2568 if(verbose){System.err.println("\nOverlap "+o+"\nu="+u.name()+", offsetValid="+u.offsetValid());} 2569 2570 boolean contradictory=(u.offsetValid() && u.offset()!=calcOffset(root, u, o)); 2571 if(verbose){System.err.println("contradictory= \t"+contradictory);} 2572 if(contradictory){ 2573 if(includeKnown || !u.offsetContradiction()){ 2574 contradictions++; 2575 if(!root.offsetContradiction()){root.setOffsetContradiction(true);} 2576 } 2577 if(!o.offsetContradiction()){o.setOffsetContradiction(true);} 2578 if(!u.offsetContradiction()){u.setOffsetContradiction(true);} 2579 } 2580 assert(contradictory==o.offsetContradiction()) : contradictory+", "+o.offsetContradiction(); 2581 if(verbose){ 2582 System.err.println("root.offsetContradiction()=\t"+root.offsetContradiction()); 2583 System.err.println("u.offsetContradiction()= \t"+u.offsetContradiction()); 2584 System.err.println("o.offsetContradiction()= \t"+o.offsetContradiction()); 2585 System.err.println("contradictions= \t"+contradictions); 2586 } 2587 } 2588 if(verbose){System.err.println("Final contradictions="+contradictions+"\n");} 2589 return contradictions; 2590 } 2591 2592 /** 2593 * Cluster should already be ordered breadth-first 2594 * @param cluster 2595 */ 2596 private int canonicizeClusterBreadthFirst(ArrayList<Unit> cluster, ArrayList<Unit> temp) { 2597 2598 assert(temp!=null); 2599 assert(temp.isEmpty()); 2600 { 2601 final Unit root=cluster.get(0); 2602 assert(root.length()>=cluster.get(cluster.size()-1).length()); 2603 root.setCanonicized(true); 2604 temp.add(root); 2605 } 2606 2607 int contradictions=0; 2608 for(int i=0; i<temp.size(); i++){ 2609 final Unit u=temp.get(i); 2610 assert(!u.visited()) : i; 2611 assert(u.canonicized() || contradictions>0) : i+", "+temp.size()+", "+contradictions+"\n"+toString(temp); 2612 if(u.canonicized() && !u.canonContradiction()){ 2613 contradictions+=canonicizeNeighbors(u, temp); 2614 assert(contradictions==0 || (i>0 && temp.size()>2)); 2615 2616 if(u.overlapList!=null){ 2617 for(Overlap o : u.overlapList){ 2618 assert(o.type==FORWARD || o.canonContradiction() || o.u1.canonContradiction() || o.u2.canonContradiction()) : 2619 o+"\n"+contradictions+", "+o.canonContradiction()+", "+o.u1.canonContradiction()+", "+o.u2.canonContradiction()+ 2620 "\n"+o.u1.canonicized()+", "+o.u2.canonicized()+", "+o.u1.visited()+", "+o.u2.visited(); 2621 } 2622 } 2623 } 2624 2625 // if(u.r.numericID==59462 || u.r.numericID==56439){ //TODO: remove 2626 // System.err.println("\nid="+u.r.numericID+", canonicized="+u.canonicized()+", contradiction="+u.canonContradiction()+", visited="+u.visited()); 2627 // for(Overlap o : u.overlapList){ 2628 // Unit u2=(o.u1==u ? o.u2 : o.u1); 2629 // assert(o.u1==u || o.u2==u); 2630 // assert(u2!=u); 2631 // assert(u2.valid()); 2632 // System.err.println("o = "+o); 2633 // System.err.println("o.contradiction="+o.canonContradiction()); 2634 // System.err.println("u2.id="+u2.r.numericID+", canonicized="+u2.canonicized()+", contradiction="+u2.canonContradiction()+", visited="+u.visited()); 2635 // } 2636 // } 2637 } 2638 2639 for(Unit u : temp){ 2640 if(u.visited()){u.setVisited(false);} 2641 if(EA){ 2642 if(u.overlapList!=null){ 2643 for(Overlap o : u.overlapList){assert(!o.visited());} 2644 } 2645 } 2646 } 2647 2648 return contradictions; 2649 } 2650 2651 /** 2652 * @param root 2653 */ 2654 private int canonicizeNeighbors(Unit root, ArrayList<Unit> canonicized) { 2655 if(verbose){System.err.println("\ncanonicizeNeighbors("+root.name()+")\nroot.code1="+root.code1+"\n");} 2656 assert(root.valid()); 2657 assert(!root.visited()); 2658 assert(root.canonicized()); 2659 assert(!root.canonContradiction()); 2660 root.setVisited(true); 2661 if(root.overlapList==null){return 0;} 2662 final int contradictions=countCanonContradictions(root, false); 2663 if(verbose){System.err.println("\ncontradictions="+contradictions);} 2664 for(Overlap o : root.overlapList){ 2665 Unit u=(o.u1==root ? o.u2 : o.u1); 2666 assert(o.u1==root || o.u2==root); 2667 assert(root!=u); 2668 assert(u.valid()); 2669 2670 if(verbose){System.err.println("\nProcessing Overlap "+o);} 2671 if(!o.canonContradiction()){ 2672 if(!u.canonContradiction()){ 2673 boolean b=u.canonicized(); 2674 int dir=o.type; 2675 if(verbose){System.err.println("Calling canonicize: "+o);} 2676 int x=canonicize(root, u, o); 2677 if(verbose){System.err.println("Finished canonicize: "+o);} 2678 2679 // if(x>0){ 2680 // if(verbose){System.err.println("\n*********************************************");} 2681 // if(verbose){System.err.println("Problem detected with contig "+u.name());} 2682 // if(verbose){System.err.println("*********************************************\n");} 2683 // verbose=true; 2684 // int y2=countCanonContradictions(root, false); 2685 // assert(contradictions==y2); 2686 // } 2687 2688 assert(x==0 || (u.canonicized() && (o.type==FORWARDRC || o.type==REVERSERC))); 2689 assert(x==0) : "\n"+x+", "+contradictions+", "+o.canonContradiction()+", "+root.canonContradiction()+", "+u.canonContradiction()+"\n" 2690 +root.canonicized()+", "+u.canonicized()+", "+OVERLAP_TYPE_NAMES[o.type]+"\n"+b+", "+dir 2691 +fixMultiJoins; //This assertion can fail if a multijoin is present 2692 if(!u.canonicized()){ 2693 u.setCanonicized(true); 2694 canonicized.add(u); 2695 } 2696 assert(u.canonicized()); 2697 } 2698 } 2699 } 2700 if(EA){ 2701 for(Overlap o : root.overlapList){ 2702 assert(o.type==FORWARD || o.canonContradiction() || o.u1.canonContradiction() || o.u2.canonContradiction()) : 2703 o+"\n"+contradictions+", "+o.canonContradiction()+", "+o.u1.canonContradiction()+", "+o.u2.canonContradiction()+", "+root.canonContradiction()+ 2704 "\n"+o.u1.canonicized()+", "+o.u2.canonicized()+", "+o.u1.visited()+", "+o.u2.visited(); 2705 } 2706 } 2707 return contradictions; 2708 } 2709 2710 private int countCanonContradictions(Unit root, boolean includeKnown){ 2711 if(verbose){System.err.println("\ncountContradictions("+root.name()+", "+includeKnown+")\nroot.code1="+root.code1+"\n");} 2712 assert(root.valid()); 2713 assert(root.visited()); 2714 assert(root.canonicized()); 2715 // assert(!root.canonContradiction()); 2716 if(root.overlapList==null){return 0;} 2717 int contradictions=0; 2718 for(Overlap o : root.overlapList){ 2719 Unit ub=(o.u1==root ? o.u2 : o.u1); 2720 assert(o.u1==root || o.u2==root); 2721 assert(root!=ub); 2722 assert(ub.valid()); 2723 2724 if(verbose){System.err.println("\nOverlap "+o+"\nu="+ub.name()+", canonicized="+ub.canonicized());} 2725 2726 boolean contradictory=(ub.canonicized() && (o.type==FORWARDRC || o.type==REVERSERC)); 2727 if(verbose){System.err.println("contradictory= \t"+contradictory);} 2728 if(contradictory){ 2729 if(!o.canonContradiction()){o.setCanonContradiction(true);} 2730 if(includeKnown || !ub.canonContradiction()){ 2731 contradictions++; 2732 if(!root.canonContradiction()){root.setCanonContradiction(true);} 2733 if(!ub.canonContradiction()){ub.setCanonContradiction(true);} 2734 } 2735 } 2736 2737 assert(!o.canonContradiction() || (root.canonContradiction() || ub.canonContradiction())) : 2738 "\n"+contradictory+", "+o.canonContradiction()+", "+root.canonContradiction()+", "+ub.canonContradiction(); 2739 2740 assert(contradictory==o.canonContradiction()) : contradictory+", "+o.canonContradiction(); 2741 if(verbose){ 2742 System.err.println("root.canonContradiction()=\t"+root.canonContradiction()); 2743 System.err.println("u.canonContradiction()= \t"+ub.canonContradiction()); 2744 System.err.println("o.canonContradiction()= \t"+o.canonContradiction()); 2745 System.err.println("contradictions= \t"+contradictions); 2746 } 2747 } 2748 if(verbose){System.err.println("Final contradictions="+contradictions+"\n");} 2749 return contradictions; 2750 } 2751 2752 private String toString(ArrayList<Unit> cluster){ 2753 for(int i=0; i<cluster.size(); i++){ 2754 Unit u=cluster.get(i); 2755 u.r.id=""+i; 2756 } 2757 StringBuilder sb=new StringBuilder(1000); 2758 for(Unit u : cluster){ 2759 sb.append(">"+u.name()+"\n"); 2760 sb.append(new String(u.bases())); 2761 sb.append("\n"); 2762 } 2763 sb.append("\n*****\n"); 2764 for(Unit u : cluster){ 2765 sb.append("\n"+u.name()+":"); 2766 if(u.overlapList!=null){ 2767 for(Overlap o : u.overlapList){ 2768 Unit ub=(o.u1==u ? o.u2 : o.u1); 2769 sb.append(" "+ub.name()); 2770 } 2771 } 2772 } 2773 sb.append("\n"); 2774 return sb.toString(); 2775 } 2776 2777 private String toShortString(ArrayList<Unit> cluster){ 2778 for(int i=0; i<cluster.size(); i++){ 2779 Unit u=cluster.get(i); 2780 u.r.id=""+i; 2781 } 2782 StringBuilder sb=new StringBuilder(1000); 2783 for(Unit u : cluster){ 2784 sb.append("\n"+u.name()+":"); 2785 if(u.overlapList!=null){ 2786 for(Overlap o : u.overlapList){ 2787 Unit ub=(o.u1==u ? o.u2 : o.u1); 2788 sb.append(" "+ub.name()); 2789 } 2790 } 2791 } 2792 sb.append("\n"); 2793 return sb.toString(); 2794 } 2795 2796 2797 /** 2798 * @param root 2799 * @param u2 2800 * @param o 2801 * @return Number of contradictions 2802 */ 2803 private int canonicize(final Unit root, final Unit u2, final Overlap o){ 2804 if(o.type==FORWARD){return 0;} 2805 if(o.type==FORWARDRC || o.type==REVERSERC){ 2806 if(u2.canonicized()){return 1;} 2807 u2.reverseComplement(); 2808 unitsFlippedT++; 2809 for(Overlap o2 : u2.overlapList){ 2810 overlapsFlippedT++; 2811 o2.flip(u2, bandy); 2812 } 2813 assert(o.type==FORWARD || o.type==REVERSE) : OVERLAP_TYPE_NAMES[o.type]; 2814 } 2815 if(o.type==REVERSE){o.reverseDirection();} 2816 assert(o.type==FORWARD); 2817 assert(o.test(bandy, o.edits+maxEdits)); 2818 return 0; 2819 } 2820 2821 2822 /** 2823 * @param root 2824 * @param u2 2825 * @param o 2826 * @return true if no contradictions 2827 */ 2828 private boolean setOffset(final Unit root, final Unit u2, final Overlap o){ 2829 assert(root.offsetValid()); 2830 assert(!root.offsetContradiction()); 2831 int offset=calcOffset(root, u2, o); 2832 2833 if(u2.offsetValid()){return u2.offset()==offset;} 2834 u2.setOffset(offset); 2835 2836 if(verbose){ 2837 System.err.println("\nroot = "+(root.name()==null ? root.r.numericID+"" : root.name())+", u2 = "+(u2.name()==null ? u2.r.numericID+"" : u2.name()) 2838 +"\no = "+o 2839 +"\nroot.offset = "+root.offset() 2840 +"\nu2.offset = "+u2.offset()); 2841 } 2842 2843 return true; 2844 } 2845 2846 2847 private int calcOffset(final Unit root, final Unit ub, final Overlap o){ 2848 assert(root.offsetValid()); 2849 if(o.type==FORWARD){ 2850 if(root==o.u1){ 2851 int dif=o.start1-o.start2; 2852 if(verbose){System.err.println("root==o.u1=="+root.name()+", start1="+o.start1+"; u2==o.u2=="+ub.name()+", start2="+o.start2+", dif="+dif);} 2853 return root.offset+dif; 2854 }else{ 2855 int dif=o.start2-o.start1; 2856 if(verbose){System.err.println("root==o.u2=="+root.name()+", start2="+o.start2+"; u2==o.u1=="+ub.name()+", start1="+o.start1+", dif="+dif);} 2857 return root.offset+dif; 2858 } 2859 }else{ 2860 assert(false) : o; 2861 throw new RuntimeException("TODO"); 2862 } 2863 } 2864 2865 2866 /** 2867 * @param cluster 2868 */ 2869 private void mergeLeaves(ArrayList<Unit> cluster) { 2870 assert(false) : "TODO"; 2871 for(Unit u : cluster){ 2872 2873 } 2874 } 2875 2876 /** 2877 * @param cluster 2878 */ 2879 private void mergeInner(ArrayList<Unit> cluster) { 2880 assert(false) : "TODO"; 2881 for(Unit u : cluster){ 2882 2883 } 2884 } 2885 2886 private ArrayList<ArrayList<Unit>> processedClustersT=new ArrayList<ArrayList<Unit>>(threadMaxReadsToBuffer); 2887 2888 long leafMergesT=0; 2889 long innerMergesT=0; 2890 long leafBaseMergesT=0; 2891 long innerBaseMergesT=0; 2892 2893 long multiJoinFailuresT=0; 2894 long multiJoinsFoundT=0; 2895 long multiJoinBasesFoundT=0; 2896 long unitsFlippedT=0; 2897 long overlapsFlippedT=0; 2898 long canonContradictoryOverlapsT=0; 2899 long canonContradictoryClustersT=0; 2900 long offsetContradictoryOverlapsT=0; 2901 long offsetContradictoryClustersT=0; 2902 long cycleOverlapsT=0; 2903 long cycleClustersT=0; 2904 2905 private final boolean fixMultiJoinsT; 2906 private final boolean canonicizeT; 2907 private final boolean fixCanonContradictionsT; 2908 private final boolean fixOffsetContradictionsT; 2909 private final boolean mergeClustersT; 2910 private final boolean mergeLeavesT; 2911 private final boolean mergeInnerT; 2912 private final BandedAligner bandy; 2913 } 2914 2915 2916 /** 2917 * @param cluster 2918 */ 2919 private void unvisit(ArrayList<Unit> cluster) { 2920 for(Unit u : cluster){ 2921 if(u.visited()){u.setVisited(false);} 2922 } 2923 } 2924 2925 /** 2926 * @param cluster 2927 */ 2928 private void reorderClusterBreadthFirst(ArrayList<Unit> cluster) { 2929 if(verbose){System.err.println("reorderClusterBreadthFirst");} 2930 2931 final int size=cluster.size(); 2932 Shared.sort(cluster); //Now it is in descending length 2933 final Unit root=cluster.get(0); 2934 assert(root.length()>=cluster.get(size-1).length()) : root.length()+", "+cluster.get(size-1).length()+", "+root.compareTo(cluster.get(size-1)); 2935 2936 ArrayList<Unit> breadthFirst=new ArrayList<Unit>(cluster.size()); 2937 root.setVisited(true); 2938 // System.err.println("root="+root.name()); 2939 breadthFirst.add(root); 2940 for(int i=0; i<breadthFirst.size(); i++){ 2941 Unit u=breadthFirst.get(i); 2942 Shared.sort(u.overlapList); //Sorted in descending overlap length 2943 if(u.overlapList!=null){ 2944 for(Overlap o : u.overlapList){ 2945 if(!o.u1.visited()){ 2946 // System.err.println("Visiting "+o.u1.name()); 2947 o.u1.setVisited(true); 2948 breadthFirst.add(o.u1); 2949 } 2950 if(!o.u2.visited()){ 2951 // System.err.println("Visiting "+o.u2.name()); 2952 o.u2.setVisited(true); 2953 breadthFirst.add(o.u2); 2954 } 2955 // System.err.println("***"); 2956 // System.err.println(toShortString(breadthFirst)); 2957 } 2958 } 2959 } 2960 for(Unit u : cluster){ 2961 assert(u.visited()); 2962 if(u.visited()){u.setVisited(false);} 2963 if(EA){ 2964 if(u.overlapList!=null){ 2965 for(Overlap o : u.overlapList){assert(!o.visited());} 2966 } 2967 } 2968 } 2969 // System.err.println("***"); 2970 // System.err.println("Final:"); 2971 // System.err.println(toShortString(breadthFirst)); 2972 assert(cluster.size()==breadthFirst.size()); 2973 cluster.clear(); 2974 cluster.addAll(breadthFirst); 2975 } 2976 2977 2978 2979 /** Returns next cluster larger than 1 element. 2980 * Singleton clusters are added directly to 'processed'. */ 2981 private ArrayList<Unit> nextCluster(){ 2982 synchronized(clusterQueue){ 2983 ArrayList<Unit> cluster=clusterQueue.poll(); 2984 assert(cluster==null || cluster.size()>1); 2985 // while(cluster!=null && cluster.size()<2){ 2986 //// unmark(cluster); 2987 // processedClustersT.add(cluster); 2988 // cluster=clusterQueue.poll(); 2989 // } 2990 return cluster; 2991 } 2992 } 2993 2994 2995 /** 2996 * Creates Unit objects or uses ones already attached to reads. 2997 * Places them in local storage and percolates them to shared storage (codeMap), removing exact duplicates. 2998 * Also hashes tips and places these in shared affixMap. 2999 * Looks for containments in the affix map. 3000 * @author Brian Bushnell 3001 * @date Jul 24, 2013 3002 * 3003 */ 3004 private final class HashThread extends Thread{ 3005 3006 public HashThread(boolean addToCodeMap_, boolean addToAffixMap_, boolean findMatches_, boolean findContainments_, boolean findOverlaps_){ 3007 addToCodeMapT=addToCodeMap_; 3008 addToAffixMapT=addToAffixMap_; 3009 findContainmentsT=findContainments_; 3010 findOverlapsT=findOverlaps_; 3011 findMatchesT=findMatches_; 3012 tid=getTid(); 3013 crisq=new ArrayDeque<ConcurrentReadInputStream>(crisa.length); 3014 for(int i=0; i<crisa.length; i++){ 3015 // if(verbose){System.err.println("Adding to crisq.");} 3016 crisq.add(crisa[(i+tid)%crisa.length]); 3017 } 3018 bandy=(maxEdits>0 && (findOverlapsT || findContainmentsT) ? BandedAligner.makeBandedAligner(bandwidth) : null); 3019 3020 // assert(addToCodeMapT) : "addToCodeMapT="+addToCodeMapT+", addToAffixMapT="+addToAffixMapT+", findContainmentsT="+findContainmentsT+ 3021 // ", findOverlapsT="+findOverlapsT+", findMatchesT="+findMatchesT+", convertToUpperCaseT="+convertToUpperCaseT+", numAffixMaps="+numAffixMaps; 3022 } 3023 3024 @Override 3025 public void run(){ 3026 3027 ConcurrentReadInputStream cris=crisq.poll(); 3028 3029 // String lastName=null; 3030 3031 while(cris!=null){ 3032 ListNum<Read> ln=cris.nextList(); 3033 ArrayList<Read> reads=(ln!=null ? ln.list : null); 3034 3035 while(ln!=null && reads!=null && reads.size()>0){//ln!=null prevents a compiler potential null access warning 3036 3037 for(Read r : reads){ 3038 // lastName=r.id; 3039 processReadOuter(r); 3040 } 3041 3042 if(codeMapT!=null && (codeMapT.size()>threadMaxReadsToBuffer || basesStoredT>threadMaxBasesToBuffer)){ 3043 assert(addToCodeMapT); 3044 long added=mergeMaps(); 3045 addedToMainT+=added; 3046 } 3047 3048 cris.returnList(ln); 3049 ln=cris.nextList(); 3050 reads=(ln!=null ? ln.list : null); 3051 } 3052 cris.returnList(ln); 3053 if(codeMapT!=null && !codeMapT.isEmpty()){ 3054 long added=mergeMaps(); 3055 addedToMainT+=added; 3056 } 3057 cris=crisq.poll(); 3058 } 3059 3060 // System.err.println("Thread finished; last name = "+lastName); 3061 3062 codeMapT=null; 3063 localConflictList=null; 3064 sharedConflictList=null; 3065 } 3066 3067 /** Return true if this read was a member of this subset. */ 3068 private boolean processReadOuter(Read r1){ 3069 if(r1.length()<MINSCAF){return false;} 3070 Read r2=r1.mate; 3071 3072 assert(r1.pairnum()==0); 3073 assert(r2==null || r2.pairnum()==1); 3074 3075 if(!addToCodeMapT && r1.obj==null){ 3076 if(r1.bases!=null && r1.length()>=MINSCAF){ 3077 final Unit u=(r1.obj!=null ? (Unit)r1.obj : new Unit(r1)); 3078 assert(u.r==r1 && (r1.obj==u || r1.obj==null)); 3079 final long code=u.code1; 3080 r1.obj=u; 3081 assert(u.r==r1 && r1.obj==u); 3082 if(r2!=null && r2.obj==null){r2.obj=new Unit(r2);} 3083 3084 //Check for subset membership 3085 final boolean inSet=u.inSet(); 3086 if(inSet){ 3087 final Long codeL=code; 3088 ArrayList<Unit> list=codeMap.get(codeL); 3089 boolean found=false; 3090 for(Unit u0 : list){ 3091 //Replace with existing read 3092 if(u0.equals(u) && u0.r.numericID==r1.numericID){ 3093 r1=u0.r; 3094 r2=r1.mate; 3095 found=true; 3096 break; 3097 } 3098 } 3099 assert(list!=null); 3100 if(!found){ 3101 return false; 3102 } 3103 } 3104 } 3105 } 3106 boolean b=processRead(r1); 3107 if(r2!=null){processRead(r2);} 3108 return b; 3109 } 3110 3111 /** Return true if this read was a member of this subset. */ 3112 private boolean processRead(Read r){ 3113 if(r.length()<MINSCAF){return false;} 3114 3115 final boolean inSet; 3116 if(!storeName){r.id=null;} 3117 if(!storeQuality){r.quality=null;} 3118 3119 if(forceTrimLeft>0 || forceTrimRight>0){//Added at request of RQC team 3120 if(r!=null && r.length()>0){ 3121 TrimRead.trimToPosition(r, forceTrimLeft>0 ? forceTrimLeft : 0, forceTrimRight>0 ? forceTrimRight : r.length(), 1); 3122 } 3123 } 3124 if(qTrimLeft || qTrimRight){ 3125 TrimRead.trimFast(r, qTrimLeft, qTrimRight, trimQ, trimE, 0); 3126 } 3127 if(r.length()<MINSCAF){return false;} 3128 3129 readsProcessedT++; 3130 basesProcessedT+=r.length(); 3131 3132 final Unit u=(r.obj!=null ? (Unit)r.obj : new Unit(r)); 3133 assert(u.r==r && (r.obj==u || r.obj==null)); 3134 final long code=u.code1; 3135 3136 //Check for subset membership 3137 inSet=u.inSet(); 3138 3139 r.obj=u; 3140 assert(u.r==r && r.obj==u); 3141 if(r.mate!=null && r.mate.obj==null){r.mate.obj=new Unit(r.mate);} 3142 3143 if(verbose){System.err.println("Generated "+code+" for sequence "+u.name()+"\t"+new String(r.bases, 0, Tools.min(40, r.length())));} 3144 Read primary=null; 3145 3146 if(addToCodeMapT && inSet){ 3147 final Long codeL=code; 3148 ArrayList<Unit> list=codeMapT.get(codeL); 3149 if(list==null){ 3150 if(verbose){System.err.println("Unique.");} 3151 list=new ArrayList<Unit>(1); 3152 list.add(u); 3153 basesStoredT+=r.length(); 3154 codeMapT.put(codeL, list); 3155 }else{ 3156 if(verbose){System.err.println("Exists.");} 3157 boolean match=false; 3158 if(findMatchesT){ 3159 for(Unit u2 : list){ 3160 if(pairedEqualsRC(u, u2)){ 3161 // if(u.r.mate!=null){ 3162 // verbose=true; 3163 // 3164 // Unit um=(Unit)u.r.mate.obj; 3165 // Unit u2m=(Unit)u2.r.mate.obj; 3166 // 3167 // if(verbose){ 3168 // System.err.println("********"); 3169 // System.err.println(u.r.toFastq()); 3170 // System.err.println(u.r.mate.toFastq()); 3171 // System.err.println("********"); 3172 // System.err.println(u2.r.toFastq()); 3173 // System.err.println(u2.r.mate.toFastq()); 3174 // System.err.println("********"); 3175 // System.err.println(u); 3176 // System.err.println(u2); 3177 // System.err.println(um); 3178 // System.err.println(u2m); 3179 // System.err.println("********"); 3180 // System.err.println(u.equals(u2)); 3181 // System.err.println(u.compareTo(u2)); 3182 // System.err.println("********"); 3183 // System.err.println(um.equals(u2m)); 3184 // System.err.println(um.compareTo(u2m)); 3185 // System.err.println("********"); 3186 // } 3187 // 3188 // verbose=false; 3189 // } 3190 assert(u.r.mate==null || pairedEqualsRC((Unit)u.r.mate.obj, (Unit)u2.r.mate.obj)) : 3191 u.r.toFastq()+"\n"+u2.r.toFastq()+"\n"+u.r.mate.toFastq()+"\n"+u2.r.mate.toFastq()+ 3192 "\n"+u+"\n"+u2+"\n"+u.r.mate.obj+"\n"+u2.r.mate.obj; 3193 // if(verbose){System.err.println("Matches "+new String(r2.bases, 0, Tools.min(40, r2.length())));} 3194 match=true; 3195 primary=u2.r; 3196 u2.absorbMatch(u); 3197 if(UNIQUE_ONLY){ 3198 synchronized(u2){ 3199 if(u2.valid()){ 3200 matchesT++; 3201 baseMatchesT+=u2.length(); 3202 u2.setValid(false); 3203 addDupe(u2.r, null); 3204 } 3205 } 3206 } 3207 break; 3208 } 3209 } 3210 } 3211 if(match){ 3212 addDupe(r, primary); 3213 matchesT++; 3214 baseMatchesT+=r.length(); 3215 // if(verbose){System.err.println("matchesT="+matchesT+", baseMatchesT="+baseMatchesT);} 3216 }else{ 3217 collisionsT++; 3218 if(verbose){System.err.println("False collision; count = "+collisionsT);} 3219 list.add(u); 3220 basesStoredT+=r.length(); 3221 } 3222 } 3223 } 3224 3225 if(findContainmentsT){ 3226 int x=findContainments(u); 3227 } 3228 3229 if(findOverlapsT){ 3230 int x=findOverlaps(u); 3231 } 3232 3233 return inSet; 3234 } 3235 3236 private int findContainments(final Unit u){ 3237 if(minLengthPercent<=0 && maxSubs<=0 && minIdentity>=100 && !u.valid()){return 0;} 3238 final byte[] bases=u.bases(); 3239 final int shift=2*k; 3240 final int shift2=shift-2; 3241 final long mask=(shift>63 ? -1L : ~((-1L)<<shift)); 3242 long kmer=0; 3243 long rkmer=0; 3244 int hits=0; 3245 int currentContainments=0; 3246 int len=0; 3247 3248 if(bases==null || bases.length<k){return -1;} 3249 final LongM key=new LongM(); 3250 3251 for(int i=0; i<bases.length; i++){ 3252 byte b=bases[i]; 3253 long x=baseToNumber[b]; 3254 long x2=baseToComplementNumber[b]; 3255 kmer=((kmer<<2)|x)&mask; 3256 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; 3257 if(HASH_NS || AminoAcid.isFullyDefined(b)){len++;} 3258 else{len=0; rkmer=0;} 3259 // if(verbose){System.err.println("Scanning i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} 3260 if(len>=k){ 3261 key.set(Tools.max(kmer, rkmer)); //Canonical 3262 for(int am=0; am<affixMaps.length; am++){ 3263 ArrayList<Unit> list=affixMaps[am].get(key); 3264 if(list!=null){ 3265 for(Unit u2 : list){ 3266 if(u!=u2 && !u.equals(u2)){ 3267 if(u2.valid()){ 3268 hits++; 3269 if(verbose){ 3270 System.err.println("\nFound potential containment at am="+am+", i="+i+", key="+key.value()+ 3271 ", pre1="+u2.prefix1+", pre2="+u2.prefix2+ 3272 ", suf1="+u2.suffix1+", suf2="+u2.suffix2+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i, k))); 3273 } 3274 if(u.contains(u2, i, key, bandy, am)){ 3275 synchronized(u2){ 3276 if(u2.valid()){ 3277 currentContainments++; 3278 baseContainmentsT+=u2.length(); 3279 u2.setValid(false); 3280 addDupe(u2.r, u.r); 3281 // u2.absorbContainment(u); 3282 } 3283 } 3284 if(UNIQUE_ONLY){ 3285 synchronized(u){ 3286 if(u.valid()){ 3287 currentContainments++; 3288 baseContainmentsT+=u.length(); 3289 u.setValid(false); 3290 addDupe(u.r, null); 3291 } 3292 } 3293 } 3294 3295 if(verbose){System.err.println("Added containment "+u2);} 3296 } 3297 } 3298 } 3299 } 3300 } 3301 } 3302 } 3303 } 3304 // assert(false) : hits+", "+currentContainments+", "+baseContainments+"\n"+containmentMapT+"\n"; 3305 3306 containmentCollisionsT+=(hits-currentContainments); 3307 // outstream.println("hits="+hits+", currentContainments="+currentContainments); 3308 containmentsT+=currentContainments; 3309 return hits; 3310 } 3311 3312 private int findOverlaps(final Unit u){ 3313 // if(minLengthPercent<=0 && maxSubs<=0 && minIdentity>=100 && !u.valid()){return 0;} 3314 // if(u.overlapList!=null){u.overlapList.clear();} 3315 final byte[] bases=u.bases(); 3316 final int shift=2*k; 3317 final int shift2=shift-2; 3318 final long mask=(shift>63 ? -1L : ~((-1L)<<shift)); 3319 long kmer=0; 3320 long rkmer=0; 3321 int hits=0; 3322 int currentOverlaps=0; 3323 int len=0; 3324 3325 if(bases==null || bases.length<k){return -1;} 3326 final LongM key=new LongM(); 3327 3328 boolean quit=false; 3329 3330 for(int i=0; i<bases.length && !quit; i++){ 3331 byte b=bases[i]; 3332 long x=baseToNumber[b]; 3333 long x2=baseToComplementNumber[b]; 3334 kmer=((kmer<<2)|x)&mask; 3335 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; 3336 if(HASH_NS || AminoAcid.isFullyDefined(b)){len++;} 3337 else{len=0; rkmer=0;} 3338 // if(verbose){System.err.println("Scanning i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} 3339 if(len>=k){ 3340 key.set(Tools.max(kmer, rkmer)); //Canonical key 3341 for(int am=0; am<affixMaps.length; am++){ 3342 ArrayList<Unit> list=affixMaps[am].get(key); 3343 if(list!=null){//found a key collision 3344 for(Unit u2 : list){ 3345 if(quit){break;}//too many edges 3346 int u1cluster=-1, u2cluster=-2; 3347 if(preventTransitiveOverlaps && u!=u2){ 3348 u1cluster=u.determineCluster(); 3349 u2cluster=u2.determineCluster(); 3350 } 3351 if(u1cluster!=u2cluster && u!=u2 && !u.equals(u2) && u2.r!=u.r.mate){//TODO: Not sure why identical things are banned... possibly relates to avoiding inter-pair edges? 3352 if(u2.valid()){ 3353 hits++; 3354 3355 // boolean flag=(u.code1==-3676200394282040623L && u2.code1==-7034423913727372751L) || 3356 // (u2.code1==-3676200394282040623L && u.code1==-7034423913727372751L); 3357 final boolean flag=false; 3358 if(verbose || flag){ 3359 System.err.println("\nFound potential overlap at am="+am+", i="+i+", key="+key.value()+ 3360 ", pre1="+u2.prefix1+", pre2="+u2.prefix2+ 3361 ", suf1="+u2.suffix1+", suf2="+u2.suffix2+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i, k))); 3362 } 3363 3364 final Overlap o; 3365 if(maxEdges>1000000000 || u.overlapList==null || u2.overlapList==null || 3366 (u.overlapList.size()<maxEdges && u2.overlapList.size()<maxEdges2)){ 3367 o=u.makeOverlap(u2, i, key, bandy, am); 3368 3369 }else{ 3370 o=null; 3371 if(u.overlapList.size()>maxEdges){quit=true;} 3372 } 3373 if(o!=null){ 3374 3375 if(preventTransitiveOverlaps){ 3376 mergeClusterIds(u1cluster, u2cluster); 3377 } 3378 3379 assert(o.test(bandy, o.edits+maxEdits)) : o; 3380 if(verbose || flag){System.err.println("Created overlap "+o);} 3381 3382 long comp=u.length()-u2.length(); 3383 if(comp==0){comp=u.code1-u2.code1;} 3384 if(comp==0){comp=u.code2-u2.code2;} 3385 if(comp==0){comp=u.prefix1-u2.prefix1;} 3386 if(comp==0){comp=u.suffix1-u2.suffix1;} 3387 if(comp==0){comp=(u.r.numericID-u2.r.numericID);} 3388 assert(comp!=0) : u+", "+u2; 3389 Unit ua=(comp<0 ? u : u2); 3390 Unit ub=(comp<0 ? u2 : u); 3391 assert(ua!=ub); 3392 if(verbose || flag){ 3393 System.err.println("ua="+ua.code1); 3394 System.err.println("ub="+ub.code1); 3395 System.err.println("u ="+u.code1); 3396 System.err.println("u2="+u2.code1); 3397 System.err.println("u.r ="+u.r.numericID); 3398 System.err.println("u2.r="+u2.r.numericID); 3399 System.err.println("ua contains o? "+ua.alreadyHas(o)); 3400 System.err.println("ub contains o? "+ub.alreadyHas(o)); 3401 System.err.println("ua.list="+ua.overlapList); 3402 System.err.println("ub.list="+ub.overlapList); 3403 } 3404 3405 // assert(ua.alreadyHas(o)==ub.alreadyHas(o)); 3406 3407 final boolean uaContainedOverlap; 3408 3409 synchronized(ua){ 3410 if(ua.overlapList==null){ua.overlapList=new ArrayList<Overlap>(2);} 3411 if(!ua.overlapList.contains(o)){ 3412 if(EA){ 3413 synchronized(ub){ 3414 assert(ub.overlapList==null || !ub.overlapList.contains(o)) : 3415 ua.alreadyHas(o)+", "+ub.alreadyHas(o)+"\n"+o+"\n"+ub.overlapList.get(ub.overlapList.indexOf(o))+ 3416 "\nua.list="+ua.overlapList+"\nub.list="+ub.overlapList+"\nu.code1="+u.code1+"\nu2.code1="+u2.code1; 3417 } 3418 } 3419 currentOverlaps++; 3420 baseOverlapsT+=o.overlapLen; 3421 ua.overlapList.add(o); 3422 if(verbose || flag){System.err.println("Added overlap "+o);} 3423 uaContainedOverlap=false; 3424 }else{ 3425 if(verbose || flag){System.err.println("Already contained overlap "+o);} 3426 hits--; 3427 uaContainedOverlap=true; 3428 } 3429 } 3430 3431 if(!uaContainedOverlap){ 3432 synchronized(ub){ 3433 if(ub.overlapList==null){ub.overlapList=new ArrayList<Overlap>(2);} 3434 assert(!ub.overlapList.contains(o)); 3435 ub.overlapList.add(o); 3436 if(verbose || flag){System.err.println("Added overlap "+o);} 3437 } 3438 }else{ 3439 if(verbose || flag){System.err.println("Already contained overlap "+o);} 3440 } 3441 3442 3443 // assert(ua.alreadyHas(o)); 3444 // assert(ub.alreadyHas(o)); 3445 // assert(ua.overlapList.contains(o)); 3446 // assert(ub.overlapList.contains(o)); 3447 if(verbose || flag){ 3448 System.err.println("ua contains o? "+ua.alreadyHas(o)); 3449 System.err.println("ub contains o? "+ub.alreadyHas(o)); 3450 System.err.println("ua.list="+ua.overlapList); 3451 System.err.println("ub.list="+ub.overlapList); 3452 } 3453 } 3454 } 3455 } 3456 } 3457 } 3458 } 3459 } 3460 } 3461 if(EA){ 3462 synchronized(u){ 3463 if(u.overlapList!=null && u.overlapList.isEmpty()){ 3464 assert(false) : "Why would this happen?"; 3465 u.overlapList=null; 3466 } 3467 } 3468 } 3469 // assert(false) : hits+", "+currentOverlaps+", "+baseOverlaps+"\n"+overlapMapT+"\n"; 3470 3471 // assert(hits==currentOverlaps) : hits+", "+currentOverlaps; 3472 3473 overlapCollisionsT+=(hits-currentOverlaps); 3474 // outstream.println("hits="+hits+", currentOverlaps="+currentOverlaps); 3475 overlapsT+=currentOverlaps; 3476 return hits; 3477 } 3478 3479 /** Insert reads processed by a thread into the shared code and affix maps. 3480 * If operating in subset mode, only store reads with code equal to subset mod subsetCount. */ 3481 private long mergeMaps(){ 3482 if(verbose){System.err.println("Merging maps.");} 3483 long novelReads=0, novelKeys=0; 3484 long collisionReads=0; 3485 long mergedReads=0; 3486 3487 assert(localConflictList.isEmpty()); 3488 assert(sharedConflictList.isEmpty()); 3489 3490 synchronized(codeMap){ 3491 for(Long key : codeMapT.keySet()){ 3492 if(codeMap.containsKey(key)){ 3493 localConflictList.add(codeMapT.get(key)); 3494 sharedConflictList.add(codeMap.get(key)); 3495 }else{ 3496 ArrayList<Unit> list=codeMapT.get(key); 3497 codeMap.put(key, list); 3498 addedList.addAll(list); 3499 novelReads+=list.size(); 3500 novelKeys++; 3501 } 3502 } 3503 } 3504 3505 if(verbose){System.err.println("Novel reads = "+novelReads+", conflicts = "+localConflictList.size());} 3506 3507 for(int i=0; i<localConflictList.size(); i++){ 3508 ArrayList<Unit> listT=localConflictList.get(i); 3509 ArrayList<Unit> list=sharedConflictList.get(i); 3510 synchronized(list){ 3511 for(Unit u : listT){ 3512 if(verbose){System.err.println("Processing novel unit "+u.name());} 3513 boolean match=false; 3514 Read primary=null; 3515 if(findMatchesT){ 3516 for(Unit u2 : list){ 3517 if(pairedEqualsRC(u, u2)){ 3518 // if(verbose){System.err.println("Matches "+new String(r2.bases, 0, Tools.min(40, r2.length())));} 3519 u2.absorbMatch(u); 3520 if(UNIQUE_ONLY){ 3521 synchronized(u2){ 3522 if(u2.valid()){ 3523 mergedReads++; 3524 baseMatchesT+=u2.length(); 3525 u2.setValid(false); 3526 addDupe(u2.r, null); 3527 } 3528 } 3529 } 3530 match=true; 3531 primary=u2.r; 3532 break; 3533 } 3534 } 3535 } 3536 if(match){ 3537 addDupe(u.r, primary); 3538 mergedReads++; 3539 baseMatchesT+=u.length(); 3540 if(verbose){System.err.println("matchesT="+matchesT+", baseMatchesT="+baseMatchesT);} 3541 }else{ 3542 collisionReads++; 3543 if(verbose){System.err.println("False collision; count = "+collisionReads);} 3544 list.add(u); 3545 addedList.add(u); 3546 } 3547 } 3548 } 3549 } 3550 matchesT+=mergedReads; 3551 collisionsT+=collisionReads; 3552 if(verbose){System.err.println("Done Merging.");} 3553 if(verbose){System.err.println("mapT.size="+codeMapT.size()+", basesStoredT="+basesStoredT);} 3554 3555 codeMapT.clear(); 3556 localConflictList.clear(); 3557 sharedConflictList.clear(); 3558 3559 if(!addedList.isEmpty()){ 3560 if(addToAffixMapT){ 3561 final LongM p=new LongM(-1, true); 3562 assert(affixMap1!=null || affixMap2!=null); 3563 if(affixMap1!=null && !ignoreAffix1){//Allows you to not use am1 3564 synchronized(affixMap1){ 3565 for(Unit u : addedList){ 3566 if(verbose){System.err.println("Processing affixes for "+u.name());} 3567 if(u.prefix1!=-1 || u.prefix1!=u.suffix1){ 3568 if(verbose){System.err.println("Using prefix "+u.prefix1);} 3569 p.set(u.prefix1); 3570 ArrayList<Unit> alu=affixMap1.get(p); 3571 if(alu==null){ 3572 if(verbose){System.err.println("Made new alu for "+p);} 3573 alu=new ArrayList<Unit>(2); 3574 affixMap1.put(p.iCopy(), alu); 3575 } 3576 if(alu.size()<maxAffixCopies){ 3577 if(verbose){System.err.println("Added "+u.name());} 3578 alu.add(u); 3579 } 3580 if(verbose){System.err.println(affixMap1.get(p));} 3581 } 3582 if(storeSuffix && u.prefix1!=u.suffix1){ 3583 if(verbose){System.err.println("Using suffix "+u.suffix1);} 3584 p.set(u.suffix1); 3585 ArrayList<Unit> alu=affixMap1.get(p); 3586 if(alu==null){ 3587 if(verbose){System.err.println("Made new alu for "+p);} 3588 alu=new ArrayList<Unit>(2); 3589 affixMap1.put(p.iCopy(), alu); 3590 } 3591 if(alu.size()<maxAffixCopies){ 3592 if(verbose){System.err.println("Added "+u.name());} 3593 alu.add(u); 3594 } 3595 if(verbose){System.err.println(affixMap1.get(p));} 3596 } 3597 } 3598 } 3599 } 3600 if(affixMap2!=null){ 3601 synchronized(affixMap2){ 3602 for(Unit u : addedList){ 3603 if(u.prefix2!=-1 || u.prefix2!=u.suffix2){ 3604 p.set(u.prefix2); 3605 ArrayList<Unit> alu=affixMap2.get(p); 3606 if(alu==null){ 3607 alu=new ArrayList<Unit>(2); 3608 affixMap2.put(p.iCopy(), alu); 3609 } 3610 if(alu.size()<maxAffixCopies){alu.add(u);} 3611 } 3612 if(storeSuffix && u.prefix2!=u.suffix2){ 3613 p.set(u.suffix2); 3614 ArrayList<Unit> alu=affixMap2.get(p); 3615 if(alu==null){ 3616 alu=new ArrayList<Unit>(2); 3617 affixMap2.put(p.iCopy(), alu); 3618 } 3619 if(alu.size()<maxAffixCopies){alu.add(u);} 3620 } 3621 } 3622 } 3623 } 3624 } 3625 } 3626 3627 addedList.clear(); 3628 basesStoredT=0; 3629 return collisionReads+novelReads; 3630 } 3631 3632 private int getTid(){ 3633 synchronized(HashThread.class){ 3634 int x=tcount; 3635 tcount++; 3636 return x; 3637 } 3638 } 3639 3640 private LinkedHashMap<Long, ArrayList<Unit>> codeMapT=new LinkedHashMap<Long, ArrayList<Unit>>(threadMaxReadsToBuffer*8); 3641 private ArrayList<Unit> addedList=new ArrayList<Unit>(threadMaxReadsToBuffer); 3642 private ArrayList<ArrayList<Unit>> localConflictList=new ArrayList<ArrayList<Unit>>(threadMaxReadsToBuffer); 3643 private ArrayList<ArrayList<Unit>> sharedConflictList=new ArrayList<ArrayList<Unit>>(threadMaxReadsToBuffer); 3644 3645 long matchesT=0; 3646 long baseMatchesT=0; 3647 long baseContainmentsT=0; 3648 long collisionsT=0; 3649 long containmentsT=0; 3650 long containmentCollisionsT=0; 3651 long basesStoredT=0; 3652 long addedToMainT=0; 3653 long readsProcessedT=0; 3654 long basesProcessedT=0; 3655 long overlapsT=0; 3656 long baseOverlapsT=0; 3657 long overlapCollisionsT=0; 3658 3659 private final boolean addToCodeMapT; 3660 private final boolean addToAffixMapT; 3661 private final boolean findContainmentsT; 3662 private final boolean findOverlapsT; 3663 private final boolean findMatchesT; 3664 // private final boolean convertToUpperCaseT; 3665 private final int tid; 3666 private final ArrayDeque<ConcurrentReadInputStream> crisq; 3667 private final BandedAligner bandy; 3668 } 3669 3670 public static boolean equalsRC(byte[] a, byte[] b){ 3671 if(a==b){return true;} 3672 if(a==null || b==null){return false;} 3673 if(a.length!=b.length){return false;} 3674 3675 boolean ca=isCanonical(a); 3676 boolean cb=isCanonical(b); 3677 3678 if(ca==cb){ 3679 for(int i=0; i<a.length; i++){ 3680 final byte aa=a[i], bb=b[i]; 3681 if(aa!=bb){return false;} 3682 } 3683 }else{ 3684 for(int i=0, j=b.length-1; i<a.length; i++, j--){ 3685 final byte aa=a[i], bb=baseToComplementExtended[b[j]]; 3686 if(aa!=bb){return false;} 3687 } 3688 } 3689 return true; 3690 } 3691 3692 public static boolean pairedEqualsRC(Unit ua, Unit ub){ 3693 if(verbose){System.err.println("pairedEqualsRC("+ua.name()+", "+ub.name()+")");} 3694 if(verbose){System.err.println("ea");} 3695 boolean b=equalsRC(ua, ub); 3696 if(verbose){System.err.println("eb");} 3697 if(!b){return false;} 3698 if(verbose){System.err.println("ec");} 3699 3700 if(ua.r!=null && ub.r!=null){ 3701 if(verbose){System.err.println("ed");} 3702 assert((ua.r.mate==null)==(ub.r.mate==null)); 3703 if(verbose){System.err.println("ee");} 3704 if(ua.r.mate!=null && ub.r.mate!=null){ 3705 if(verbose){System.err.println("ef");} 3706 return ua.canonical()==ub.canonical() && ua.r.pairnum()==ub.r.pairnum() && Tools.compare(ua.r.mate.bases, ub.r.mate.bases)==0; 3707 } 3708 if(verbose){System.err.println("eg");} 3709 } 3710 if(verbose){System.err.println("eh");} 3711 return true; 3712 } 3713 3714 private static boolean equalsRC(Unit ua, Unit ub){ 3715 if(verbose){System.err.println("equalsRC("+ua.name()+", "+ub.name()+")");} 3716 return ua.code1==ub.code1 && ua.code2==ub.code2 && (ua.canonical()==ub.canonical() ? (ua.prefix1==ub.prefix1 && ua.suffix1==ub.suffix1) : 3717 (ua.prefix1==ub.suffix1 && ua.suffix1==ub.prefix1)) && compareRC(ua, ub)==0; 3718 } 3719 3720 public static int comparePairedRC(Unit ua, Unit ub){ 3721 if(verbose){System.err.println("comparePairedRC("+ua.name()+", "+ub.name()+")");} 3722 int x=compareRC(ua, ub); 3723 if(x!=0){return x;} 3724 3725 if(ua.r!=null && ub.r!=null && ua.r.mate!=null && ub.r.mate!=null){ 3726 if(ua.r.pairnum()!=ub.r.pairnum()){return ua.r.pairnum()-ub.r.pairnum();} 3727 return compareRC((Unit)ua.r.mate.obj, (Unit)ub.r.mate.obj); 3728 } 3729 return 0; 3730 } 3731 3732 //TODO 3733 //This is really for sorting by length. 3734 private static int compareRC(Unit ua, Unit ub){ 3735 if(verbose){System.err.println("compareRC("+ua.name()+", "+ub.name()+")");} 3736 if(ua==ub){return 0;} 3737 if(verbose){System.err.println("a");} 3738 if(verbose){System.err.println("a1");} 3739 if(ua.length()!=ub.length()){return ub.length()-ua.length();} 3740 if(verbose){System.err.println("a2");} 3741 3742 if(REQUIRE_MATCHING_NAMES){ 3743 if(ua.name()!=null && ub.name()!=null){ 3744 int x=ua.name().compareTo(ub.name()); 3745 if(x!=0){return x;} 3746 } 3747 } 3748 if(verbose){System.err.println("a3");} 3749 3750 if(ua.r==null || ub.r==null){ 3751 if(verbose){System.err.println("b");} 3752 if(verbose){System.err.println("b1");} 3753 if(ua.canonical()){ 3754 if(verbose){System.err.println("c");} 3755 if(ub.canonical()){ 3756 if(ua.prefix1!=ub.prefix1){return ua.prefix1>ub.prefix1 ? 1 : -1;} 3757 if(ua.suffix1!=ub.suffix1){return ua.suffix1>ub.suffix1 ? 1 : -1;} 3758 }else{ 3759 if(ua.prefix1!=ub.suffix1){return ua.prefix1>ub.suffix1 ? 1 : -1;} 3760 if(ua.suffix1!=ub.prefix1){return ua.suffix1>ub.prefix1 ? 1 : -1;} 3761 } 3762 }else{ 3763 if(verbose){System.err.println("d");} 3764 if(ub.canonical()){ 3765 if(ua.suffix1!=ub.prefix1){return ua.suffix1>ub.prefix1 ? 1 : -1;} 3766 if(ua.prefix1!=ub.suffix1){return ua.prefix1>ub.suffix1 ? 1 : -1;} 3767 }else{ 3768 if(ua.suffix1!=ub.suffix1){return ua.suffix1>ub.suffix1 ? 1 : -1;} 3769 if(ua.prefix1!=ub.prefix1){return ua.prefix1>ub.prefix1 ? 1 : -1;} 3770 } 3771 } 3772 if(verbose){System.err.println("e");} 3773 if(ua.code1!=ub.code1){return ua.code1>ub.code1 ? 1 : -1;} 3774 if(ua.code2!=ub.code2){return ua.code2>ub.code2 ? 1 : -1;} 3775 3776 return ua.pairnum()-ub.pairnum(); 3777 } 3778 if(verbose){System.err.println("f");} 3779 final byte[] a=ua.r.bases, b=ub.r.bases; 3780 if(a==b){return 0;} 3781 if(a==null || b==null){return a==null ? -1 : 1;} 3782 if(verbose){System.err.println("g");} 3783 3784 if(ua.canonical()==ub.canonical()){ 3785 if(verbose){System.err.println("h");} 3786 if(ua.canonical() && ub.canonical()){ 3787 for(int i=0; i<a.length; i++){ 3788 final byte aa=a[i], bb=b[i]; 3789 if(aa!=bb){return aa-bb;} 3790 } 3791 }else{ 3792 for(int i=a.length-1; i>=0; i--){ 3793 final byte aa=baseToComplementExtended[a[i]], bb=baseToComplementExtended[b[i]]; 3794 if(aa!=bb){return aa-bb;} 3795 } 3796 } 3797 }else{ 3798 if(verbose){System.err.println("i");} 3799 if(ua.canonical()){ 3800 for(int i=0, j=b.length-1; i<a.length; i++, j--){ 3801 final byte aa=a[i], bb=baseToComplementExtended[b[j]]; 3802 if(aa!=bb){return aa-bb;} 3803 } 3804 }else{ 3805 for(int i=a.length-1, j=0; i>=0; i--, j++){ 3806 final byte aa=baseToComplementExtended[a[i]], bb=b[j]; 3807 if(aa!=bb){return aa-bb;} 3808 } 3809 } 3810 } 3811 3812 if(verbose){System.err.println("j");} 3813 return ua.pairnum()-ub.pairnum(); 3814 } 3815 3816 private static long hashTip(byte[] bases, boolean prefix, int k, int skipInitialBases){ 3817 if(bases==null || bases.length<k){return -1;} 3818 3819 final int shift=2*k; 3820 final int shift2=shift-2; 3821 final long mask=(shift>63 ? -1L : ~((-1L)<<shift)); 3822 long kmer=0; 3823 long rkmer=0; 3824 int len=0; 3825 3826 final int start=(prefix ? 0+skipInitialBases : bases.length-k-skipInitialBases); 3827 final int stop=start+k; 3828 3829 // if(verbose){ 3830 // System.err.println("\n"+new String(bases)); 3831 // System.err.println("prefix="+prefix+", start="+start+", stop="+stop); 3832 //// System.err.print(new String(bases)); 3833 // } 3834 for(int i=start; i<stop; i++){ 3835 byte b=bases[i]; 3836 // if(verbose){System.err.print((char)b);} 3837 long x=baseToNumber[b]; 3838 long x2=baseToComplementNumber[b]; 3839 kmer=((kmer<<2)|x)&mask; 3840 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; 3841 len++; 3842 } 3843 if(verbose){System.err.println(new String(bases, start, k)+" = "+Tools.max(kmer, rkmer));} 3844 assert(len==k) : len+","+k; 3845 return Tools.max(kmer, rkmer); 3846 } 3847 3848 private static final int calcMaxEdits(int maxEdits, float minIdentityMult, int len){ 3849 return minIdentityMult==0 ? maxEdits : Tools.max(maxEdits, (int)Math.round(len*minIdentityMult)); 3850 } 3851 3852 3853 private class Overlap implements Comparable<Overlap>{ 3854 3855 public Overlap(Unit u1_, Unit u2_, int type_, int start1_, int start2_, int stop1_, int stop2_, int len_, int mismatches_, int edits_, BandedAligner bandy){ 3856 assert(u1_!=u2_); 3857 if(verbose){System.err.println("\nCreating an overlap.");} 3858 u1=u1_; 3859 u2=u2_; 3860 type=type_; 3861 start1=start1_; 3862 start2=start2_; 3863 stop1=stop1_; 3864 stop2=stop2_; 3865 overlapLen=len_; 3866 mismatches=mismatches_; 3867 edits=edits_; 3868 3869 assert(Tools.absdif(Tools.absdif(start1, stop1), Tools.absdif(start2, stop2))<=maxEdits) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases) 3870 +"\n>1a\n"+new String(u1.r.bases, Tools.min(start1, stop1), Tools.max(start1, stop1)-Tools.min(start1, stop1)+1) 3871 +"\n>2a\n"+new String(u2.r.bases, Tools.min(start2, stop2), Tools.max(start2, stop2)-Tools.min(start2, stop2)+1); 3872 3873 assert(start1>=0 && start1<=u1.length()) : "type "+type+": "+start1+", "+stop1+", "+u1.length()+", "+start2+", "+stop2+", "+u2.length(); 3874 assert(stop1>=0 && stop1<=u1.length()) : "type "+type+": "+start1+", "+stop1+", "+u1.length()+", "+start2+", "+stop2+", "+u2.length(); 3875 assert(start2>=0 && start2<=u2.length()) : "type "+type+": "+start1+", "+stop1+", "+u1.length()+", "+start2+", "+stop2+", "+u2.length(); 3876 assert(stop2>=0 && stop2<=u2.length()) : "type "+type+": "+start1+", "+stop1+", "+u1.length()+", "+start2+", "+stop2+", "+u2.length(); 3877 3878 assert(type==FORWARD || type==FORWARDRC || type==REVERSE || type==REVERSERC); 3879 3880 if(verbose){System.err.println(this);} 3881 3882 assert(Tools.absdif(Tools.absdif(start1, stop1), Tools.absdif(start1, stop1))<=maxEdits); 3883 3884 assert(test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases) 3885 +"\n>1a\n"+new String(u1.r.bases, Tools.min(start1, stop1), Tools.max(start1, stop1)-Tools.min(start1, stop1)+1) 3886 +"\n>2a\n"+new String(u2.r.bases, Tools.min(start2, stop2), Tools.max(start2, stop2)-Tools.min(start2, stop2)+1); 3887 if(verbose){System.err.println("Passed test 1.");} 3888 3889 // bandy.verbose=true; 3890 // test(bandy); 3891 // assert(false); 3892 3893 assert(u1!=u2); 3894 u1.firstInOverlap(u2); 3895 u2.firstInOverlap(u1); 3896 assert(u1.length()!=u2.length() || u1.code1!=u2.code1 || u1.code2!=u2.code2 || (u1.r!=null && u1.r.mate!=null)) : "Collision? \n"+this+"\n"+u1+"\n"+u2; 3897 assert(u1.firstInOverlap(u2)!=u2.firstInOverlap(u1)) : 3898 "\nu1.firstInOverlap(u2)="+u1.firstInOverlap(u2)+"\nu2.firstInOverlap(u1)="+u2.firstInOverlap(u1)+"\nu1="+u1+"\nu2="+u2; 3899 3900 if(!u1.firstInOverlap(u2)){ 3901 if(verbose){System.err.println("\nSwapping.");} 3902 swap(); 3903 if(verbose){System.err.println(this);} 3904 3905 if(EA && !customBandwidth && !test(bandy, edits+maxEdits)){ 3906 System.err.println("\n"+this); 3907 swap(); 3908 System.err.println("\n"+this); 3909 assert(test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases)+"\n"; 3910 System.err.println("Passed test 2a, "+bandy.lastEdits+" edits.\n"); 3911 swap(); 3912 System.err.println("\n"+this); 3913 assert(test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases) 3914 +"\n>1a\n"+new String(u1.r.bases, Tools.min(start1, stop1), Tools.max(start1, stop1)-Tools.min(start1, stop1)+1) 3915 +"\n>2a\n"+new String(u2.r.bases, Tools.min(start2, stop2), Tools.max(start2, stop2)-Tools.min(start2, stop2)+1); 3916 System.err.println("Passed test 2b, "+bandy.lastEdits+" edits.\n"); 3917 } 3918 3919 assert(customBandwidth || test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases) 3920 +"\n>1a\n"+new String(u1.r.bases, Tools.min(start1, stop1), Tools.max(start1, stop1)-Tools.min(start1, stop1)+1) 3921 +"\n>2a\n"+new String(u2.r.bases, Tools.min(start2, stop2), Tools.max(start2, stop2)-Tools.min(start2, stop2)+1); 3922 if(verbose){System.err.println("Passed test 2.");} 3923 } 3924 3925 if(type==REVERSE || type==REVERSERC){ 3926 if(verbose){System.err.println("\nReversing.");} 3927 reverseDirection(); 3928 if(verbose){System.err.println(this);} 3929 3930 if(EA && !Shared.anomaly && !customBandwidth && bandy!=null && !test(bandy, edits+maxEdits)){ 3931 Shared.anomaly=true; 3932 BandedAligner.verbose=true; 3933 System.err.println("\n********** Failed test 3, "+bandy.lastEdits+" edits. **********\n"); 3934 reverseDirection(); 3935 System.err.println(this); 3936 assert(test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases)+"\n"; 3937 System.err.println("Passed test 3a, "+bandy.lastEdits+" edits.\n"); 3938 reverseDirection(); 3939 System.err.println(this); 3940 assert(test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases) 3941 +"\n>1a\n"+new String(u1.r.bases, Tools.min(start1, stop1), Tools.max(start1, stop1)-Tools.min(start1, stop1)+1) 3942 +"\n>2a\n"+new String(u2.r.bases, Tools.min(start2, stop2), Tools.max(start2, stop2)-Tools.min(start2, stop2)+1); 3943 System.err.println("Passed test 3b, "+bandy.lastEdits+" edits.\n"); 3944 BandedAligner.verbose=false; 3945 assert(false); 3946 } 3947 3948 assert(customBandwidth || test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases)+"\n"; 3949 if(verbose){System.err.println("Passed test 3.");} 3950 } 3951 //Now all overlaps should be FORWARD or FORWARDRC and u1 should be at least as big as u2 3952 assert(type==FORWARD || type==FORWARDRC); 3953 assert(u1.length()>=u2.length()); 3954 assert(u1.firstInOverlap(u2)); 3955 assert(!u2.firstInOverlap(u1)); 3956 if(verbose){System.err.println("Finished overlap initialization.");} 3957 } 3958 3959 public boolean test(BandedAligner bandy, int editLimit){ 3960 final int last1=u1.length()-1, last2=u2.length()-1; 3961 if(verbose){System.err.println("Testing "+OVERLAP_TYPE_NAMES[type]+", "+start1+", "+start2);} 3962 if(type==FORWARD){ 3963 assert(start1==0 || start2==0) : "start1="+start1+", stop1="+stop1+", last1="+last1+", start2="+start2+", stop2="+stop2+", last2="+last2; 3964 if(start2==0){ 3965 if(verbose){System.err.println("A");} 3966 return u1.overlapsForward(u2, start1, start2, bandy, false, editLimit);} 3967 else{ 3968 if(verbose){System.err.println("B");} 3969 return u2.overlapsForward(u1, start2, start1, bandy, false, editLimit);} 3970 } 3971 if(type==FORWARDRC){ 3972 assert(start1==0 || start2==last2) : "start1="+start1+", stop1="+stop1+", last1="+last1+", start2="+start2+", stop2="+stop2+", last2="+last2; 3973 if(start2==last2){return u1.overlapsForwardRC(u2, start1, start2, bandy, false, editLimit);} 3974 else{return u2.overlapsReverseRC(u1, start2, start1, bandy, false, editLimit);} 3975 } 3976 if(type==REVERSE){ 3977 assert(start1==last1 || start2==last2) : "start1="+start1+", stop1="+stop1+", last1="+last1+", start2="+start2+", stop2="+stop2+", last2="+last2; 3978 if(start2==last2){return u1.overlapsReverse(u2, start1, start2, bandy, false, editLimit);} 3979 else{return u2.overlapsReverse(u1, start2, start1, bandy, false, editLimit);} 3980 } 3981 if(type==REVERSERC){ 3982 assert(start1==last1 || start2==0) : "start1="+start1+", stop1="+stop1+", last1="+last1+", start2="+start2+", stop2="+stop2+", last2="+last2; 3983 if(start2==0){return u1.overlapsReverseRC(u2, start1, start2, bandy, false, editLimit);} 3984 else{return u2.overlapsForwardRC(u1, start2, start1, bandy, false, editLimit);} 3985 } 3986 throw new RuntimeException(); 3987 } 3988 3989 @Override 3990 public boolean equals(Object o){ 3991 return equals((Overlap)o); 3992 } 3993 3994 public boolean equals(Overlap o){ 3995 if(this==o){return true;} 3996 assert(o!=null) : "*A*\n"+this+"\n"+o+"\n"+u1+"\n"+u2; 3997 assert(u1!=null && u2!=null) : "*B*\n"+this+"\n"+o+"\n"+u1+"\n"+u2; 3998 assert(u1!=o.u2 || u2!=o.u1) : "*C*\n"+this+"\n"+o+"\n"+u1.firstInOverlap(u2)+"\n"+o.u1.firstInOverlap(o.u2)+"\n"+u1+"\n"+u2; 3999 return (u1==o.u1 && u2==o.u2 && type==o.type && start1==o.start1 && start2==o.start2 && stop1==o.stop1 && stop2==o.stop2) 4000 ;//|| (u1==o.u2 && u2==o.u1 && type==reverseType(o.type) && start1==o.start2 && start2==o.start1); 4001 } 4002 4003 // public int compareTo(Overlap o){ 4004 // int a=compareTo2(o); 4005 // int b=o.compareTo2(this); 4006 // assert(a==-b) : "\n"+this+"\n"+o+"\na="+a+", b="+b+", equals="+this.equals(o) 4007 // +"\nu1.compareTo(o.u1)="+u1.compareTo(o.u1)+"\no.u1.compareTo(u1)="+o.u1.compareTo(u1) 4008 // +"\nu2.compareTo(o.u2)="+u2.compareTo(o.u2)+"\no.u2.compareTo(u2)="+o.u2.compareTo(u2); 4009 // return a; 4010 // } 4011 4012 @Override 4013 public int compareTo(Overlap o){ 4014 int score1=overlapLen-50*(mismatches+edits); 4015 int score2=o.overlapLen-50*(o.mismatches+o.edits); 4016 if(score1!=score2){return score2-score1;} 4017 if(overlapLen!=o.overlapLen){return o.overlapLen-overlapLen;} 4018 int x=u1.compareTo(o.u1); 4019 if(x!=0){return -x;} 4020 x=u2.compareTo(o.u2); 4021 if(x!=0){return -x;} 4022 if(type!=o.type){return type-o.type;} 4023 if((u1!=o.u1 || u2!=o.u2) && absorbMatch && !subsetMode){ 4024 boolean oldv=verbose; 4025 verbose=true; 4026 System.err.println(this); 4027 System.err.println(o); 4028 System.err.println("********"); 4029 System.err.println(u1); 4030 System.err.println(u2); 4031 System.err.println(o.u1); 4032 System.err.println(o.u2); 4033 System.err.println("********"); 4034 System.err.println(u1.equals(o.u1)); 4035 System.err.println("********"); 4036 System.err.println(u2.equals(o.u2)); 4037 System.err.println("********"); 4038 System.err.println(u1.compareTo(o.u1)); 4039 System.err.println("********"); 4040 System.err.println(u2.compareTo(o.u2)); 4041 System.err.println("********"); 4042 verbose=oldv; 4043 } 4044 assert(!absorbMatch || (u1==o.u1 && u2==o.u2) || subsetMode) : "\n"+u1.r+"\n"+u2.r+"\n"+o.u1.r+"\n"+o.u2.r 4045 +"\n\n"+u1.r.mate+"\n"+u2.r.mate+"\n"+o.u1.r.mate+"\n"+o.u2.r.mate; 4046 // assert(false) : "\n"+this+"\n"+o+"\n>"+u1.name()+"\n"+new String(u1.bases())+"\n>"+u2.name()+"\n"+new String(u2.bases())+"\n"; 4047 if(start1!=o.start1){return start1-o.start1;} 4048 if(stop1!=o.stop1){return stop1-o.stop1;} 4049 if(start2!=o.start2){return start2-o.start2;} 4050 if(stop2!=o.stop2){return stop2-o.stop2;} 4051 if(this.equals(o)){ 4052 return 0; 4053 }else{ 4054 //TODO: ensure this assumption is valid. 4055 assert(!absorbContainment || !absorbMatch || subsetMode) : "\n"+this+"\n"+o+"\n>"+u1.name()+"\n"+new String(u1.bases())+"\n>"+u2.name()+"\n"+new String(u2.bases())+"\n"; 4056 4057 if(u1.unitID!=o.u1.unitID){return u1.unitID-o.u1.unitID;} 4058 if(u2.unitID!=o.u2.unitID){return u2.unitID-o.u2.unitID;} 4059 } 4060 return 0; 4061 } 4062 4063 @Override 4064 public int hashCode(){ 4065 return u1.hashCode()^u2.hashCode()^overlapLen; 4066 } 4067 4068 public void flip(Unit changed, BandedAligner bandy){ 4069 4070 if(changed==u2){ 4071 if(type==FORWARD){type=FORWARDRC;} 4072 else if(type==FORWARDRC){type=FORWARD;} 4073 else if(type==REVERSE){type=REVERSERC;} 4074 else if(type==REVERSERC){type=REVERSE;} 4075 else{throw new RuntimeException("Unknown overlap type "+type);} 4076 start2=u2.length()-start2-1; 4077 stop2=u2.length()-stop2-1; 4078 }else if(changed==u1){ 4079 if(type==FORWARD){type=REVERSERC;} 4080 else if(type==FORWARDRC){type=REVERSE;} 4081 else if(type==REVERSE){type=FORWARDRC;} 4082 else if(type==REVERSERC){type=FORWARD;} 4083 else{throw new RuntimeException("Unknown overlap type "+type);} 4084 start1=u1.length()-start1-1; 4085 stop1=u1.length()-stop1-1; 4086 }else{throw new RuntimeException("'changed' was not in the Overlap.");} 4087 4088 assert(test(bandy, edits+maxEdits)); 4089 } 4090 4091 public void swap(){ 4092 Unit tempu=u1; 4093 u1=u2; 4094 u2=tempu; 4095 int temp=start1; 4096 start1=start2; 4097 start2=temp; 4098 temp=stop1; 4099 stop1=stop2; 4100 stop2=temp; 4101 if(type==FORWARDRC){type=REVERSERC;} 4102 else if(type==REVERSERC){type=FORWARDRC;} 4103 } 4104 4105 public void reverseDirection(){ 4106 type=reverseType(type); 4107 int temp=start1; 4108 start1=stop1; 4109 stop1=temp; 4110 temp=start2; 4111 start2=stop2; 4112 stop2=temp; 4113 } 4114 4115 @Override 4116 public String toString(){ 4117 StringBuilder sb=new StringBuilder(80); 4118 sb.append("type="); 4119 sb.append(OVERLAP_TYPE_NAMES[type]); 4120 sb.append(", len="); 4121 sb.append(overlapLen); 4122 sb.append(", subs="); 4123 sb.append(mismatches); 4124 sb.append(", edits="); 4125 sb.append(edits); 4126 4127 sb.append(" ("); 4128 sb.append(u1.name()==null ? u1.r.numericID+"" : u1.name()); 4129 if(printLengthInEdges){ 4130 sb.append(", length="); 4131 sb.append(u1.length()); 4132 } 4133 sb.append(", start1="); 4134 sb.append(start1); 4135 sb.append(", stop1="); 4136 sb.append(stop1); 4137 4138 sb.append(") ("); 4139 sb.append(u2.name()==null ? u2.r.numericID+"" : u2.name()); 4140 if(printLengthInEdges){ 4141 sb.append(", length="); 4142 sb.append(u2.length()); 4143 } 4144 sb.append(", start2="); 4145 sb.append(start2); 4146 sb.append(", stop2="); 4147 sb.append(stop2); 4148 sb.append(")"); 4149 return sb.toString(); 4150 } 4151 4152 public String toLabel(){ 4153 StringBuilder sb=new StringBuilder(80); 4154 sb.append(OVERLAP_TYPE_ABBREVIATIONS[type]); 4155 sb.append(','); 4156 sb.append(overlapLen); 4157 sb.append(','); 4158 sb.append(mismatches); 4159 sb.append(','); 4160 sb.append(edits); 4161 4162 if(printLengthInEdges){ 4163 sb.append(','); 4164 sb.append(u1.length()); 4165 } 4166 sb.append(','); 4167 sb.append(start1); 4168 sb.append(','); 4169 sb.append(stop1); 4170 4171 if(printLengthInEdges){ 4172 sb.append(','); 4173 sb.append(u2.length()); 4174 } 4175 sb.append(','); 4176 sb.append(start2); 4177 sb.append(','); 4178 sb.append(stop2); 4179 4180 return sb.toString(); 4181 } 4182 4183 4184 private void setCanonContradiction(boolean b){ 4185 assert(b!=canonContradiction()) : b+", "+canonContradiction(); 4186 if(b){flags|=CANON_CONTRADICTION_MASK;} 4187 else{flags&=~CANON_CONTRADICTION_MASK;} 4188 assert(b==canonContradiction()) : b+", "+canonContradiction(); 4189 } 4190 4191 private void setOffsetContradiction(boolean b){ 4192 assert(b!=offsetContradiction()) : b+", "+offsetContradiction(); 4193 if(b){flags|=OFFSET_CONTRADICTION_MASK;} 4194 else{flags&=~OFFSET_CONTRADICTION_MASK;} 4195 assert(b==offsetContradiction()) : b+", "+offsetContradiction(); 4196 } 4197 4198 private void setMultiJoin(boolean b){ 4199 assert(b!=multiJoin()) : b+", "+multiJoin(); 4200 if(b){flags|=MULTIJOIN_MASK;} 4201 else{flags&=~MULTIJOIN_MASK;} 4202 assert(b==multiJoin()) : b+", "+multiJoin(); 4203 } 4204 4205 private void setVisited(boolean b){ 4206 assert(b!=visited()) : b+", "+visited(); 4207 if(b){flags|=VISITED_MASK;} 4208 else{flags&=~VISITED_MASK;} 4209 assert(b==visited()) : b+", "+visited(); 4210 } 4211 4212 private void setCyclic(boolean b){ 4213 assert(b!=cyclic()) : b+", "+cyclic(); 4214 if(b){flags|=CYCLIC_MASK;} 4215 else{flags&=~CYCLIC_MASK;} 4216 assert(b==cyclic()) : b+", "+cyclic(); 4217 } 4218 4219 private void setInvalid(boolean b){ 4220 assert(b!=invalid()) : b+", "+invalid(); 4221 assert(b!=mst()) : b+", "+mst()+", "+invalid(); 4222 if(b){flags|=INVALID_MASK;} 4223 else{flags&=~INVALID_MASK;} 4224 assert(b==invalid()) : b+", "+invalid(); 4225 } 4226 4227 private void setMst(boolean b){ 4228 assert(b!=mst()) : b+", "+mst(); 4229 assert(b!=invalid()) : b+", "+mst()+", "+invalid(); 4230 if(b){flags|=MST_MASK;} 4231 else{flags&=~MST_MASK;} 4232 assert(b==mst()) : b+", "+mst(); 4233 } 4234 4235 public void clearVolatileFlags(){ 4236 flags=0; 4237 // flags=flags&~(MULTIJOIN_MASK|VISITED_MASK|CANON_CONTRADICTION_MASK|CYCLIC_MASK|OFFSET_CONTRADICTION_MASK|INVALID_MASK); 4238 // assert(!canonContradiction()); 4239 // assert(!offsetContradiction()); 4240 // assert(!multiJoin()); 4241 // assert(!visited()); 4242 // assert(!cyclic()); 4243 // assert(!invalid()); 4244 } 4245 4246 public boolean canonContradiction(){return (CANON_CONTRADICTION_MASK&flags)==CANON_CONTRADICTION_MASK;} 4247 public boolean offsetContradiction(){return (OFFSET_CONTRADICTION_MASK&flags)==OFFSET_CONTRADICTION_MASK;} 4248 public boolean multiJoin(){return (MULTIJOIN_MASK&flags)==MULTIJOIN_MASK;} 4249 public boolean visited(){return (VISITED_MASK&flags)==VISITED_MASK;} 4250 public boolean cyclic(){return (CYCLIC_MASK&flags)==CYCLIC_MASK;} 4251 public boolean invalid(){return (INVALID_MASK&flags)==INVALID_MASK;} 4252 public boolean mst(){return (MST_MASK&flags)==MST_MASK;} 4253 public boolean contradiction(){return canonContradiction() || offsetContradiction();} 4254 4255 private static final long VISITED_MASK=(1L<<0); 4256 private static final long MULTIJOIN_MASK=(1L<<1); 4257 private static final long CYCLIC_MASK=(1L<<2); 4258 private static final long CANON_CONTRADICTION_MASK=(1L<<3); 4259 private static final long OFFSET_CONTRADICTION_MASK=(1L<<4); 4260 private static final long INVALID_MASK=(1L<<5); 4261 private static final long MST_MASK=(1L<<6); 4262 4263 Unit u1; 4264 Unit u2; 4265 int type; 4266 int start1; 4267 int start2; 4268 int stop1; 4269 int stop2; 4270 4271 long flags=0; 4272 4273 final int overlapLen; 4274 final int mismatches; 4275 final int edits; 4276 } 4277 4278 /** 4279 * @return 4280 */ 4281 private int determineCluster2(final int uid) { 4282 assert(clusterNumbers!=null); 4283 boolean stable=false; 4284 int cluster=uid; 4285 while(!stable){ 4286 cluster=clusterNumbers.get(uid); 4287 if(cluster==0 || cluster==uid){return cluster;} 4288 assert(cluster<=uid); 4289 final int next=determineCluster2(cluster); 4290 if(next>=cluster){return cluster;} 4291 stable=clusterNumbers.compareAndSet(uid, cluster, next); 4292 } 4293 return cluster; 4294 } 4295 4296 4297 private int mergeClusterIds(int cluster1, int cluster2) { 4298 assert(clusterNumbers!=null); 4299 4300 // System.err.println("Merging clusters "+cluster1+" and "+cluster2); 4301 4302 while(cluster1!=cluster2){ 4303 int min=Tools.min(cluster1, cluster2); 4304 if(cluster1!=min){ 4305 assert(cluster1>min); 4306 boolean b=clusterNumbers.compareAndSet(cluster1, cluster1, min); 4307 if(!b){ 4308 cluster1=determineCluster2(cluster1); 4309 min=Tools.min(cluster1, cluster2); 4310 } 4311 } 4312 if(cluster2!=min){ 4313 assert(cluster2>min); 4314 boolean b=clusterNumbers.compareAndSet(cluster2, cluster2, min); 4315 if(!b){ 4316 cluster2=determineCluster2(cluster2); 4317 min=Tools.min(cluster1, cluster2); 4318 } 4319 } 4320 } 4321 // System.err.println("Returning "+cluster1); 4322 return cluster1; 4323 } 4324 4325 private class Unit implements Comparable<Unit>, Serializable { 4326 4327 /** 4328 * 4329 */ 4330 private static final long serialVersionUID = 5232407003873807738L; 4331 4332 public Unit(Read r_){ 4333 this(r_, isCanonical(r_.bases)); 4334 } 4335 4336 public Unit(Read r_, boolean canonical_){ 4337 // this(r_, canonical_, canonical_ ? hash(r_.bases) : hashReversed(r_.bases)); 4338 this(r_, canonical_, hash(r_.bases), hashReversed(r_.bases)); 4339 } 4340 4341 public Unit(Read r_, boolean canonical_, long codeF_, long codeR_){ 4342 r=r_; 4343 code1=Tools.min(codeF_, codeR_); 4344 code2=Tools.max(codeF_, codeR_); 4345 long f=r.length(); 4346 prefix1=hashTip(r.bases, true, k, 0); 4347 suffix1=hashTip(r.bases, false, k, 0); 4348 if(r.length()>2*k){ 4349 prefix2=hashTip(r.bases, true, k, k); 4350 suffix2=hashTip(r.bases, false, k, k); 4351 } 4352 if(canonical_){f|=CANON_MASK;} 4353 if(r.pairnum()==1){f|=PAIRNUM_MASK;} 4354 flags=f; 4355 assert(canonical()==canonical_); 4356 assert(length()==r.length()); 4357 assert(pairnum()==r.pairnum()); 4358 if(parseDepth){ 4359 int[] quad=KmerNormalize.parseDepth(r.id, null); 4360 if(quad!=null){depth=quad[r.pairnum()];} 4361 } 4362 } 4363 4364 int determineCluster() { 4365 return determineCluster2(unitID); 4366 } 4367 4368 public void absorbMatch(Unit u){ 4369 4370 assert(code1==u.code1 && code2==u.code2 && length()==u.length()); 4371 if(r==null || u.r==null){return;} 4372 u.r.setDiscarded(true); 4373 final byte[] bases1=r.bases, bases2=u.r.bases; 4374 final byte[] quals1=r.quality, quals2=u.r.quality; 4375 4376 assert((r.mate==null) == (u.r.mate==null)); 4377 4378 if(r.mate!=null && !u.r.mate.discarded()){ 4379 ((Unit)r.mate.obj).absorbMatch((Unit)u.r.mate.obj); 4380 } 4381 if(quals1==null || quals2==null){return;} 4382 4383 if(canonical()==u.canonical()){ 4384 for(int i=0; i<bases1.length; i++){ 4385 byte b1=bases1[i], b2=bases2[i]; 4386 if(!AminoAcid.isFullyDefined(b1) && AminoAcid.isFullyDefined(b2)){bases1[i]=b2;} 4387 else{assert(b1==b2);} 4388 if(quals1!=null && quals2!=null){ 4389 quals1[i]=Tools.max(quals1[i], quals2[i]); 4390 } 4391 } 4392 }else{ 4393 for(int i=0, j=bases2.length-1; i<bases1.length; i++, j--){ 4394 byte b1=bases1[i], b2=baseToComplementExtended[bases2[j]]; 4395 if(!AminoAcid.isFullyDefined(b1) && AminoAcid.isFullyDefined(b2)){bases1[i]=b2;} 4396 else{assert(b1==b2);} 4397 if(quals1!=null && quals2!=null){ 4398 quals1[i]=Tools.max(quals1[i], quals2[j]); 4399 } 4400 } 4401 } 4402 4403 // if(mergeHeaders){ 4404 // r.id+=mergeDelimiter+u.r.id; 4405 // assert(false) : r.id; 4406 // } 4407 // assert(false) : r.id; 4408 } 4409 4410 // public void absorbContainment(Unit u){ 4411 // if(mergeHeaders){ 4412 // r.id+=mergeDelimiter+u.r.id; 4413 // assert(false) : r.id; 4414 // } 4415 // assert(false) : r.id; 4416 // } 4417 4418 public boolean alreadyHas(Overlap o){ 4419 if(overlapList==null){return false;} 4420 for(int i=0; i<overlapList.size(); i++){ 4421 Overlap o2=overlapList.get(i); 4422 if(o.equals(o2)){ 4423 assert(overlapList.contains(o)); 4424 assert(o2.equals(o)); 4425 return true; 4426 } 4427 } 4428 assert(!overlapList.contains(o)); 4429 return false; 4430 } 4431 4432 /** 4433 * @return A cluster 4434 */ 4435 public ArrayList<Unit> makeCluster() { 4436 assert(!visited()); 4437 assert(!clustered()); 4438 assert(valid()); 4439 // assert(set.isEmpty()); 4440 ArrayList<Unit> cluster=new ArrayList<Unit>(overlapList==null ? 1 : overlapList.size()+1); 4441 cluster.add(this); 4442 setClustered(true); 4443 4444 int added=1; 4445 for(int i=0; i<cluster.size(); i++){ 4446 Unit u=cluster.get(i); 4447 added+=u.visit(cluster); 4448 } 4449 4450 assert(added==cluster.size()); 4451 return cluster; 4452 } 4453 4454 /** 4455 * @param cluster 4456 * @return Number of units added to the cluster 4457 */ 4458 public int visit(ArrayList<Unit> cluster) { 4459 assert(!visited()); 4460 assert(clustered()); 4461 assert(valid()); 4462 // assert(cluster.contains(this)); 4463 setVisited(true); 4464 int added=0; 4465 4466 if(r!=null && r.mate!=null){ 4467 Unit u2=(Unit)r.mate.obj; 4468 assert(u2!=this); 4469 assert(u2.valid()); 4470 if(!u2.clustered()){ 4471 u2.setClustered(true); 4472 cluster.add(u2); 4473 added++; 4474 } 4475 } 4476 4477 if(overlapList!=null){ 4478 for(Overlap o : overlapList){ 4479 Unit u2=(o.u1==this ? o.u2 : o.u1); 4480 assert(o.u1==this || o.u2==this); 4481 assert(u2!=this); 4482 assert(u2.valid()); 4483 if(!u2.clustered()){ 4484 u2.setClustered(true); 4485 cluster.add(u2); 4486 added++; 4487 } 4488 } 4489 } 4490 return added; 4491 } 4492 4493 public boolean isTransitive(){ 4494 assert(valid()); 4495 if(overlapList==null || overlapList.size()==0){return true;} 4496 for(Overlap o : overlapList){ 4497 assert(o.u1==this || o.u2==this); 4498 Unit u2=(o.u1==this ? o.u2 : o.u1); 4499 assert(u2!=this); 4500 if(u2.overlapList==null){ 4501 return false; 4502 }else{ 4503 boolean found=false; 4504 for(Overlap o2 : u2.overlapList){ 4505 if(o2.u1==this || o2.u2==this){ 4506 found=true; break; 4507 } 4508 } 4509 if(!found){return false;} 4510 } 4511 } 4512 return true; 4513 } 4514 4515 public boolean isPerfectlyTransitive(){ 4516 assert(valid()); 4517 if(overlapList==null || overlapList.size()==0){return true;} 4518 for(Overlap o : overlapList){ 4519 assert(o.u1==this || o.u2==this); 4520 Unit u2=(o.u1==this ? o.u2 : o.u1); 4521 assert(u2!=this); 4522 if(u2.overlapList==null){ 4523 return false; 4524 }else{ 4525 boolean found=false; 4526 for(Overlap o2 : u2.overlapList){ 4527 if(o2==o){ 4528 found=true; break; 4529 } 4530 } 4531 if(!found){return false;} 4532 } 4533 } 4534 return true; 4535 } 4536 4537 public boolean isNonRedundant(){ 4538 assert(valid()); 4539 if(overlapList==null || overlapList.size()==0){return true;} 4540 for(int i=0; i<overlapList.size(); i++){ 4541 Overlap a=overlapList.get(i); 4542 for(int j=0; j<overlapList.size(); j++){ 4543 Overlap b=overlapList.get(j); 4544 if((i==j)!=(a.equals(b))){ 4545 return false; 4546 } 4547 } 4548 } 4549 return true; 4550 } 4551 4552 public boolean contains(Unit u2, int loc, LongM key, BandedAligner bandy, int tableNum) { 4553 if(verbose){System.err.println("contains: Considering key "+key+", unit "+u2);} 4554 if(u2.length()>this.length()){return false;} //Smaller things cannot contain larger things 4555 if(u2.length()==this.length()){//For equal size, only one can contain the other the other 4556 boolean pass=false; 4557 if(!pass && u2.unitID>this.unitID){return false;}else if(u2.unitID<this.unitID){pass=true;} 4558 if(!pass && u2.r.numericID>this.r.numericID){return false;}else if(u2.r.numericID<this.r.numericID){pass=true;} 4559 if(!pass && u2.code1>this.code1){return false;}else if(u2.code1<this.code1){pass=true;} 4560 if(!pass && u2.r.bases.hashCode()>this.r.bases.hashCode()){return false;}else if(u2.r.bases.hashCode()<this.r.bases.hashCode()){pass=true;} 4561 if(!pass){return false;} 4562 } 4563 if(minLengthPercent>0 && (u2.length()*100f/length())<minLengthPercent){return false;} 4564 assert(u2.code1!=code1 || u2.code2!=code2 || u2.length()!=length() || (r!=null && r.mate!=null) || //REQUIRE_MATCHING_NAMES || 4565 (canonical()==u2.canonical() ? (u2.prefix1!=prefix1 && u2.suffix1!=suffix1) : (u2.prefix1!=suffix1 && u2.suffix1!=prefix1))) : 4566 "Collision? \n"+this+"\n"+u2+"\n"+r+"\n"+u2.r; 4567 4568 if(tableNum==0){ 4569 if(key.value()==u2.prefix1){ 4570 if(verbose){System.err.println("Containment A1");} 4571 if(containsForward(u2, loc-k2, bandy, tableNum==0) || containsReverseRC(u2, loc, bandy, tableNum==0)){return true;} 4572 } 4573 if(key.value()==u2.suffix1){ 4574 if(verbose){System.err.println("Containment B1");} 4575 if(containsReverse(u2, loc, bandy, tableNum==0) || containsForwardRC(u2, loc-k2, bandy, tableNum==0)){return true;} 4576 } 4577 }else{ 4578 if(key.value()==u2.prefix2){ 4579 if(verbose){System.err.println("Containment A2");} 4580 if(containsForward(u2, loc-k2-k, bandy, tableNum==0) || containsReverseRC(u2, loc+k, bandy, tableNum==0)){return true;} 4581 } 4582 if(key.value()==u2.suffix2){ 4583 if(verbose){System.err.println("Containment B2");} 4584 if(containsReverse(u2, loc+k, bandy, tableNum==0) || containsForwardRC(u2, loc-k2-k, bandy, tableNum==0)){return true;} 4585 } 4586 } 4587 return false; 4588 } 4589 4590 private boolean containsForward(Unit u2, int start, BandedAligner bandy, boolean earlyExit) { 4591 if(start+u2.length()>length() || start<0 || start>=length()){return false;} 4592 // if(true){return false;} 4593 if(u2.r!=null){ 4594 final byte[] a=bases(), b=u2.bases(); 4595 int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length); 4596 4597 for(int i=start, j=0; j<b.length; i++, j++){ 4598 byte aa=a[i]; 4599 byte bb=b[j]; 4600 if(aa!=bb){ 4601 if(exact || (AminoAcid.isFullyDefined(aa) && AminoAcid.isFullyDefined(bb))){ 4602 if(earlyExit && j<k2){return false;} 4603 if((mismatches=mismatches+1)>maxMismatches){ 4604 if(bandy==null || maxEdits<1){return false;} 4605 int edits=bandy.alignForward(b, a, 0, start, maxEdits, exact); 4606 assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits; 4607 return edits<=maxEdits && bandy.score()>4*edits; 4608 } 4609 } 4610 } 4611 } 4612 return true; 4613 }else{ 4614 assert(false) : "TODO: Verify by hashing and checking both tips"; 4615 return false; 4616 } 4617 } 4618 4619 private boolean containsForwardRC(Unit u2, int start, BandedAligner bandy, boolean earlyExit) { 4620 if(ignoreReverseComplement){return false;} 4621 if(start+u2.length()>length() || start<0 || start>=length()){return false;} 4622 // if(true){return false;} 4623 if(u2.r!=null){ 4624 final byte[] a=bases(), b=u2.bases(); 4625 int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length); 4626 4627 for(int i=start, j=b.length-1, iprefix=start+k2; j>=0; i++, j--){ 4628 byte aa=a[i]; 4629 byte bb=baseToComplementExtended[b[j]]; 4630 if(aa!=bb){ 4631 if(exact || (AminoAcid.isFullyDefined(aa) && AminoAcid.isFullyDefined(bb))){ 4632 if(earlyExit && i<iprefix){return false;} 4633 if((mismatches=mismatches+1)>maxMismatches){ 4634 if(bandy==null || maxEdits<1){return false;} 4635 int edits=bandy.alignForwardRC(b, a, b.length-1, start, maxEdits, exact); 4636 assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits; 4637 return edits<=maxEdits && bandy.score()>4*edits; 4638 } 4639 } 4640 } 4641 } 4642 return true; 4643 }else{ 4644 assert(false) : "TODO: Verify by hashing and checking both tips"; 4645 return false; 4646 } 4647 } 4648 4649 private boolean containsReverse(Unit u2, int start, BandedAligner bandy, boolean earlyExit) { 4650 if(start+1<u2.length() || start<0 || start>=length()){return false;} 4651 // if(true){return false;} 4652 if(u2.r!=null){ 4653 final byte[] a=bases(), b=u2.bases(); 4654 int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length); 4655 4656 for(int i=start, j=b.length-1, iprefix=start-k2; j>=0; i--, j--){ 4657 byte aa=a[i]; 4658 byte bb=b[j]; 4659 if(aa!=bb){ 4660 if(exact || (AminoAcid.isFullyDefined(aa) && AminoAcid.isFullyDefined(bb))){ 4661 if(earlyExit && i>iprefix){return false;} 4662 if((mismatches=mismatches+1)>maxMismatches){ 4663 if(bandy==null || maxEdits<1){return false;} 4664 int edits=bandy.alignReverse(b, a, b.length-1, start, maxEdits, exact); 4665 assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits; 4666 return edits<=maxEdits && bandy.score()>4*edits; 4667 } 4668 } 4669 } 4670 } 4671 return true; 4672 }else{ 4673 assert(false) : "TODO: Verify by hashing and checking both tips"; 4674 return false; 4675 } 4676 } 4677 4678 private boolean containsReverseRC(Unit u2, int start, BandedAligner bandy, boolean earlyExit) { 4679 if(ignoreReverseComplement){return false;} 4680 if(start+1<u2.length() || start<0 || start>=length()){return false;} 4681 // if(true){return false;} 4682 if(u2.r!=null){ 4683 final byte[] a=bases(), b=u2.bases(); 4684 int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length); 4685 4686 for(int i=start, j=0; j<b.length; i--, j++){ 4687 byte aa=a[i]; 4688 byte bb=baseToComplementExtended[b[j]]; 4689 if(aa!=bb){ 4690 if(exact || (AminoAcid.isFullyDefined(aa) && AminoAcid.isFullyDefined(bb))){ 4691 if(earlyExit && j<k2){return false;} 4692 if((mismatches=mismatches+1)>maxMismatches){ 4693 if(bandy==null || maxEdits<1){return false;} 4694 int edits=bandy.alignReverseRC(b, a, 0, start, maxEdits, exact); 4695 assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits; 4696 return edits<=maxEdits && bandy.score()>4*edits; 4697 } 4698 } 4699 } 4700 } 4701 return true; 4702 }else{ 4703 assert(false) : "TODO: Verify by hashing and checking both tips"; 4704 return false; 4705 } 4706 } 4707 4708 4709 public boolean depthCongruent(int aa, int bb){ 4710 if(aa<5 && bb<5){return true;} 4711 final int a=Tools.max(1, Tools.min(aa, bb)); 4712 final int b=Tools.max(aa, bb); 4713 return a*depthRatio>=b; 4714 } 4715 4716 public boolean overlaps(Unit u2, int loc, LongM key, BandedAligner bandy, int tableNum, int editLimit) { 4717 // return makeOverlap(u2, loc, key, bandy, earlyExit)!=null; 4718 4719 // assert(false) : "TODO"; 4720 if(verbose){System.err.println("overlaps: Considering key "+key+", unit "+u2);} 4721 if(parseDepth && !depthCongruent(depth, u2.depth)){return false;} 4722 if(minLengthPercent>0){ 4723 final int len1=length(), len2=u2.length(); 4724 if(Tools.min(len1, len2)*100f/Tools.max(len1, len2)<minLengthPercent){return false;} 4725 } 4726 assert(u2.code1!=code1 || u2.code2!=code2 || u2.length()!=length() || 4727 (canonical()==u2.canonical() ? (u2.prefix1!=prefix1 && u2.suffix1!=suffix1) : (u2.prefix1!=suffix1 && u2.suffix1!=prefix1))) : 4728 "Collision? \n"+this+"\n"+u2+"\n"+r+"\n"+u2.r; 4729 4730 4731 if(tableNum==0){ 4732 if(key.value()==u2.prefix1){ 4733 if(verbose){System.err.println("Testing overlaps A1");} 4734 if(overlapsForward(u2, loc-k2, 0, bandy, tableNum==0, editLimit)){ 4735 if(verbose){System.err.println("Found Overlap A1F");} 4736 return true; 4737 } 4738 if(overlapsReverseRC(u2, loc, 0, bandy, tableNum==0, editLimit)){ 4739 if(verbose){System.err.println("Found Overlap A1R");} 4740 return true; 4741 } 4742 if(verbose){System.err.println("No Overlap.");} 4743 } 4744 if(key.value()==u2.suffix1){ 4745 if(verbose){System.err.println("Testing overlaps B1");} 4746 if(verbose){System.err.println("Testing overlaps B1F");} 4747 if(overlapsForwardRC(u2, loc-k2, u2.length()-1, bandy, tableNum==0, editLimit)){ 4748 if(verbose){System.err.println("Found Overlap B1F");} 4749 return true; 4750 } 4751 if(verbose){System.err.println("Testing overlaps B1R");} 4752 if(overlapsReverse(u2, loc, u2.length()-1, bandy, tableNum==0, editLimit)){ 4753 if(verbose){System.err.println("Found Overlap B1R");} 4754 return true; 4755 } 4756 if(verbose){System.err.println("No Overlap.");} 4757 } 4758 }else{ 4759 if(key.value()==u2.prefix2){ 4760 if(verbose){System.err.println("Testing overlaps A2");} 4761 if(overlapsForward(u2, loc-k2-k, 0, bandy, tableNum==0, editLimit)){ 4762 if(verbose){System.err.println("Found Overlap A2F");} 4763 return true; 4764 } 4765 if(overlapsReverseRC(u2, loc+k, 0, bandy, tableNum==0, editLimit)){ 4766 if(verbose){System.err.println("Found Overlap A2R");} 4767 return true; 4768 } 4769 if(verbose){System.err.println("No Overlap.");} 4770 } 4771 if(key.value()==u2.suffix2){ 4772 if(verbose){System.err.println("Testing overlaps B2");} 4773 if(overlapsForwardRC(u2, loc-k2-k, u2.length()-1, bandy, tableNum==0, editLimit)){ 4774 if(verbose){System.err.println("Found Overlap B2F");} 4775 return true; 4776 } 4777 if(overlapsReverse(u2, loc+k, u2.length()-1, bandy, tableNum==0, editLimit)){ 4778 if(verbose){System.err.println("Found Overlap B2R");} 4779 return true; 4780 } 4781 if(verbose){System.err.println("No Overlap.");} 4782 } 4783 } 4784 4785 return false; 4786 } 4787 4788 /** 4789 * @param u2 4790 * @param loc 4791 * @param key 4792 * @return 4793 */ 4794 protected Overlap makeOverlap(Unit u2, int loc, LongM key, BandedAligner bandy, int tableNum) { 4795 if(verbose){System.err.println("makeOverlap: Considering key "+key+", unit "+u2);} 4796 if(parseDepth && !depthCongruent(depth, u2.depth)){return null;} 4797 if(minLengthPercent>0){ 4798 final int len1=length(), len2=u2.length(); 4799 if(Tools.min(len1, len2)*100f/Tools.max(len1, len2)<minLengthPercent){return null;} 4800 } 4801 assert(u2.code1!=code1 || u2.code2!=code2 || u2.length()!=length() || (r!=null && r.mate!=null) || 4802 (canonical()==u2.canonical() ? (u2.prefix1!=prefix1 && u2.suffix1!=suffix1) : (u2.prefix1!=suffix1 && u2.suffix1!=prefix1))) : 4803 "Collision? \n"+this+"\n"+u2+"\n"+r+"\n"+u2.r; 4804 4805 4806 Overlap o=null; 4807 if(tableNum==0){ 4808 if(key.value()==u2.prefix1){ 4809 if(verbose){System.err.println("\nTesting makeOverlap A1F");} 4810 if((o=makeOverlapForward(u2, loc-k2, bandy, tableNum==0))!=null){ 4811 if(verbose){System.err.println("Made Overlap A1F");} 4812 return o; 4813 } 4814 if(verbose){System.err.println("\nTesting makeOverlap A1R");} 4815 if((o=makeOverlapReverseRC(u2, loc, bandy, tableNum==0))!=null){ 4816 if(verbose){System.err.println("Made Overlap A1R");} 4817 return o; 4818 } 4819 if(verbose){System.err.println("No Overlap.");} 4820 } 4821 if(key.value()==u2.suffix1){ 4822 if(verbose){System.err.println("\nTesting makeOverlap B1F");} 4823 if((o=makeOverlapForwardRC(u2, loc-k2, bandy, tableNum==0))!=null){ 4824 if(verbose){System.err.println("Made Overlap B1F");} 4825 return o; 4826 } 4827 if(verbose){System.err.println("\nTesting makeOverlap B1R");} 4828 if((o=makeOverlapReverse(u2, loc, bandy, tableNum==0))!=null){ 4829 if(verbose){System.err.println("Made Overlap B1R");} 4830 return o; 4831 } 4832 if(verbose){System.err.println("No Overlap.");} 4833 } 4834 }else{ 4835 if(key.value()==u2.prefix2){ 4836 if(verbose){System.err.println("\nTesting makeOverlap A2F");} 4837 if((o=makeOverlapForward(u2, loc-k2-k, bandy, tableNum==0))!=null){ 4838 if(verbose){System.err.println("Made Overlap A2F");} 4839 return o; 4840 } 4841 if(verbose){System.err.println("\nTesting makeOverlap A2R");} 4842 if((o=makeOverlapReverseRC(u2, loc+k, bandy, tableNum==0))!=null){ 4843 if(verbose){System.err.println("Made Overlap A2R");} 4844 return o; 4845 } 4846 if(verbose){System.err.println("No Overlap.");} 4847 } 4848 if(key.value()==u2.suffix2){ 4849 if(verbose){System.err.println("\nTesting makeOverlap B2F");} 4850 if((o=makeOverlapForwardRC(u2, loc-k2-k, bandy, tableNum==0))!=null){ 4851 if(verbose){System.err.println("Made Overlap B2F");} 4852 return o; 4853 } 4854 if(verbose){System.err.println("\nTesting makeOverlap B2R");} 4855 if((o=makeOverlapReverse(u2, loc+k, bandy, tableNum==0))!=null){ 4856 if(verbose){System.err.println("Made Overlap B2R");} 4857 return o; 4858 } 4859 if(verbose){System.err.println("No Overlap.");} 4860 } 4861 } 4862 return o; 4863 } 4864 4865 private boolean overlapsForward(Unit u2, int start1, int start2, BandedAligner bandy, boolean earlyExit, int maxEdits) { 4866 if(verbose){System.err.println("overlapsForward(u1="+this.name()+", u2="+u2.name()+", start1="+start1+", start2="+start2+", earlyExit="+earlyExit+")");} 4867 4868 final int len1=length(), len2=u2.length(); 4869 if(start1<0){ 4870 start2-=start1; 4871 start1=0; 4872 if(verbose){System.err.println("Modified: start1="+start1+", start2="+start2);} 4873 } 4874 int overlapLength=Tools.min(len1-start1, len2-start2); 4875 int overlapLength2=Tools.max(len1-start1, len2-start2); 4876 int stop1=start1+overlapLength-1, stop2=start2+overlapLength-1; 4877 if(verbose){System.err.println("Calculated stop1="+stop1+", stop2="+stop2+", overlapLength="+overlapLength);} 4878 4879 if(!allowAllContainedOverlaps || overlapLength>Tools.min(len1, len2)){ 4880 if(verbose){ 4881 System.err.println("Side block. allowAllContainedOverlaps="+allowAllContainedOverlaps+", minOverlapCluster="+minOverlapCluster); 4882 System.err.println("start1="+start1+", stop1="+stop1+", len1="+len1+", start2="+start2+", stop2="+stop2+", len2="+len2+ 4883 ", overlapLen="+overlapLength+", maxEdits="+maxEdits); 4884 } 4885 if(overlapLength2<minOverlapCluster){return false;} 4886 if(minOverlapPercentCluster>0f && (overlapLength2*100f/Tools.min(len1, len2))<minOverlapPercentCluster){return false;} 4887 } 4888 4889 final byte[] a=bases(), b=u2.bases(); 4890 assert(a!=null && b!=null) : "Null bases for "+code1+" or "+u2.code1; 4891 int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, overlapLength); 4892 4893 if(verbose){ 4894 System.err.println("start1="+start1+", stop1="+stop1+", len1="+len1+", start2="+start2+", stop2="+stop2+", len2="+len2+ 4895 ", overlapLen="+overlapLength+", maxMismatches="+maxMismatches+", maxEdits="+maxEdits); 4896 } 4897 4898 for(int i=start1, j=start2; j<=stop2; i++, j++){ 4899 byte aa=a[i]; 4900 byte bb=b[j]; 4901 if(aa!=bb){ 4902 if(exact || (AminoAcid.isFullyDefined(aa) && AminoAcid.isFullyDefined(bb))){ 4903 if(earlyExit && j<k2){return false;} 4904 mismatches++; 4905 if(verbose){System.err.println("i="+i+", j="+j+", "+(char)aa+"!="+(char)bb+", mismatches="+mismatches+"/"+maxMismatches);} 4906 if(mismatches>maxMismatches){ 4907 if(bandy==null || maxEdits<1){return false;} 4908 if(verbose){System.err.println("Mismatches exceeded maximum, attempting banded alignment.");} 4909 int edits=bandy.alignForward(b, a, 0, start1, maxEdits, exact); 4910 assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits; 4911 stop2=bandy.lastQueryLoc; 4912 stop1=bandy.lastRefLoc; 4913 return edits<=maxEdits && bandy.score()>2*edits; //Set at 2*edits instead of 4*edits to prevent assertion errors when reversing alignment 4914 } 4915 } 4916 } 4917 } 4918 return true; 4919 } 4920 4921 private boolean overlapsForwardRC(Unit u2, int start1, int start2, BandedAligner bandy, boolean earlyExit, int maxEdits) { 4922 if(verbose){System.err.println("overlapsForwardRC(u1="+this.name()+", u2="+u2.name()+", start1="+start1+", start2="+start2+", earlyExit="+earlyExit+")");} 4923 4924 if(ignoreReverseComplement){return false;} 4925 final int len1=length(), len2=u2.length(); 4926 if(start1<0){ 4927 start2+=start1; 4928 start1=0; 4929 if(verbose){System.err.println("Modified: start1="+start1+", start2="+start2);} 4930 } 4931 final int overlapLength=Tools.min(len1-start1, start2+1); 4932 final int overlapLength2=Tools.max(len1-start1, start2+1); 4933 int stop1=start1+overlapLength-1, stop2=start2-overlapLength+1; 4934 if(verbose){System.err.println("Calculated stop1="+stop1+", stop2="+stop2+", overlapLength="+overlapLength);} 4935 4936 if(!allowAllContainedOverlaps || overlapLength>Tools.min(len1, len2)){ 4937 if(overlapLength2<minOverlapCluster){return false;} 4938 if(minOverlapPercentCluster>0f && (overlapLength2*100f/Tools.min(len1, len2))<minOverlapPercentCluster){return false;} 4939 } 4940 4941 final byte[] a=bases(), b=u2.bases(); 4942 assert(a!=null && b!=null) : "Null bases for "+code1+" or "+u2.code1; 4943 int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length); 4944 4945 if(verbose){ 4946 System.err.println("start1="+start1+", stop1="+stop1+", len1="+len1+", start2="+start2+", stop2="+stop2+", len2="+len2+ 4947 ", overlapLen="+overlapLength+", maxMismatches="+maxMismatches+", maxEdits="+maxEdits); 4948 } 4949 4950 for(int i=start1, j=start2, iprefix=start1+k2; i<=stop1; i++, j--){ 4951 byte aa=a[i]; 4952 // assert(b[j]>=0) : new String(b); //TODO: remove //123 4953 byte bb=baseToComplementExtended[b[j]]; 4954 if(aa!=bb){ 4955 if(exact || (AminoAcid.isFullyDefined(aa) && AminoAcid.isFullyDefined(bb))){ 4956 if(earlyExit && i<iprefix){return false;} 4957 mismatches++; 4958 if(verbose){System.err.println("i="+i+", j="+j+", "+(char)aa+"!="+(char)bb+", mismatches="+mismatches+"/"+maxMismatches);} 4959 if(mismatches>maxMismatches){ 4960 if(bandy==null || maxEdits<1){return false;} 4961 if(verbose){System.err.println("Mismatches exceeded maximum, attempting banded alignment.");} 4962 int edits=bandy.alignForwardRC(b, a, b.length-1, start1, maxEdits, exact); 4963 assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits; 4964 stop2=bandy.lastQueryLoc; 4965 stop1=bandy.lastRefLoc; 4966 return edits<=maxEdits && bandy.score()>2*edits; //Set at 2*edits instead of 4*edits to prevent assertion errors when reversing alignment 4967 } 4968 } 4969 } 4970 } 4971 return true; 4972 } 4973 4974 private boolean overlapsReverse(Unit u2, int start1, int start2, BandedAligner bandy, boolean earlyExit, int maxEdits) { 4975 if(verbose){System.err.println("overlapsReverse(u1="+this.name()+", u2="+u2.name()+", start1="+start1+", start2="+start2+", earlyExit="+earlyExit+")");} 4976 4977 final int len1=length(), len2=u2.length(); 4978 if(start1>=len1){ 4979 start2-=(start1-len1+1); 4980 start1=len1-1; 4981 if(verbose){System.err.println("Modified: start1="+start1+", start2="+start2);} 4982 } 4983 final int overlapLength=Tools.min(start1+1, start2+1); 4984 final int overlapLength2=Tools.max(start1+1, start2+1); 4985 int stop1=start1-overlapLength+1, stop2=start2-overlapLength+1; 4986 if(verbose){System.err.println("Calculated stop1="+stop1+", stop2="+stop2+", overlapLength="+overlapLength);} 4987 4988 if(!allowAllContainedOverlaps || overlapLength>Tools.min(len1, len2)){ 4989 if(overlapLength2<minOverlapCluster){return false;} 4990 if(minOverlapPercentCluster>0f && (overlapLength2*100f/Tools.min(len1, len2))<minOverlapPercentCluster){return false;} 4991 } 4992 4993 final byte[] a=bases(), b=u2.bases(); 4994 assert(a!=null && b!=null) : "Null bases for "+code1+" or "+u2.code1; 4995 int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length); 4996 4997 if(verbose){ 4998 System.err.println("start1="+start1+", stop1="+stop1+", len1="+len1+", start2="+start2+", stop2="+stop2+", len2="+len2+ 4999 ", overlapLen="+overlapLength+", maxMismatches="+maxMismatches+", maxEdits="+maxEdits); 5000 } 5001 5002 for(int i=start1, j=start2, iprefix=start1-k2; i>=stop1; i--, j--){ 5003 byte aa=a[i]; 5004 byte bb=b[j]; 5005 if(aa!=bb){ 5006 if(exact || (AminoAcid.isFullyDefined(aa) && AminoAcid.isFullyDefined(bb))){ 5007 if(earlyExit && i>iprefix){return false;} 5008 mismatches++; 5009 if(verbose){System.err.println("i="+i+", j="+j+", "+(char)aa+"!="+(char)bb+", mismatches="+mismatches+"/"+maxMismatches);} 5010 if(mismatches>maxMismatches){ 5011 if(bandy==null || maxEdits<1){return false;} 5012 if(verbose){System.err.println("Mismatches exceeded maximum, attempting banded alignment.");} 5013 int edits=bandy.alignReverse(b, a, b.length-1, start1, maxEdits, exact); 5014 assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits; 5015 stop2=bandy.lastQueryLoc; 5016 stop1=bandy.lastRefLoc; 5017 return edits<=maxEdits && bandy.score()>2*edits; //Set at 2*edits instead of 4*edits to prevent assertion errors when reversing alignment 5018 } 5019 } 5020 } 5021 } 5022 return true; 5023 } 5024 5025 private boolean overlapsReverseRC(Unit u2, int start1, int start2, BandedAligner bandy, boolean earlyExit, int maxEdits) { 5026 if(verbose){System.err.println("overlapsReverseRC(u1="+this.name()+", u2="+u2.name()+", start1="+start1+", start2="+start2+", earlyExit="+earlyExit+")");} 5027 5028 if(ignoreReverseComplement){return false;} 5029 final int len1=length(), len2=u2.length(); 5030 if(start1>=len1){ 5031 start2+=(start1-len1+1); 5032 start1=len1-1; 5033 if(verbose){System.err.println("Modified: start1="+start1+", start2="+start2);} 5034 } 5035 final int overlapLength=Tools.min(start1+1, len2-start2); 5036 final int overlapLength2=Tools.max(start1+1, len2-start2); 5037 int stop1=start1-overlapLength+1, stop2=start2+overlapLength-1; 5038 if(verbose){System.err.println("Calculated stop1="+stop1+", stop2="+stop2+", overlapLength="+overlapLength);} 5039 5040 if(!allowAllContainedOverlaps || overlapLength>Tools.min(len1, len2)){ 5041 if(overlapLength2<minOverlapCluster){return false;} 5042 if(minOverlapPercentCluster>0f && (overlapLength2*100f/Tools.min(len1, len2))<minOverlapPercentCluster){return false;} 5043 } 5044 5045 final byte[] a=bases(), b=u2.bases(); 5046 assert(a!=null && b!=null) : "Null bases for "+code1+" or "+u2.code1; 5047 int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length); 5048 5049 if(verbose){ 5050 System.err.println("start1="+start1+", stop1="+stop1+", len1="+len1+", start2="+start2+", stop2="+stop2+", len2="+len2+ 5051 ", overlapLen="+overlapLength+", maxMismatches="+maxMismatches+", maxEdits="+maxEdits); 5052 } 5053 5054 for(int i=start1, j=start2; j<=stop2; i--, j++){ 5055 byte aa=a[i]; 5056 byte bb=baseToComplementExtended[b[j]]; 5057 if(aa!=bb){ 5058 if(exact || (AminoAcid.isFullyDefined(aa) && AminoAcid.isFullyDefined(bb))){ 5059 if(earlyExit && j<k2){return false;} 5060 mismatches++; 5061 if(verbose){System.err.println("i="+i+", j="+j+", "+(char)aa+"!="+(char)bb+", mismatches="+mismatches+"/"+maxMismatches);} 5062 if(mismatches>maxMismatches){ 5063 if(bandy==null || maxEdits<1){return false;} 5064 if(verbose){System.err.println("Mismatches exceeded maximum, attempting banded alignment.");} 5065 int edits=bandy.alignReverseRC(b, a, 0, start1, maxEdits, exact); 5066 assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits; 5067 stop2=bandy.lastQueryLoc; 5068 stop1=bandy.lastRefLoc; 5069 return edits<=maxEdits && bandy.score()>2*edits; //Set at 2*edits instead of 4*edits to prevent assertion errors when reversing alignment 5070 } 5071 } 5072 } 5073 } 5074 return true; 5075 } 5076 5077 5078 5079 private Overlap makeOverlapForward(Unit u2, int start1, BandedAligner bandy, boolean earlyExit) { 5080 if(verbose){System.err.println("makeOverlapForward(u1="+this.name()+", u2="+u2.name()+", start="+start1+", earlyExit="+earlyExit+")");} 5081 final int len1=length(), len2=u2.length(); 5082 int start2=0; 5083 if(start1<0){ 5084 start2-=start1; 5085 start1=0; 5086 } 5087 final int overlapLength=Tools.min(len1-start1, len2-start2); 5088 final int overlapLength2=Tools.max(len1-start1, len2-start2); 5089 int stop1=start1+overlapLength-1, stop2=start2+overlapLength-1; 5090 if(verbose){System.err.println("Calculated stop1="+stop1+", stop2="+stop2+", overlapLength="+overlapLength);} 5091 5092 if(!allowAllContainedOverlaps || overlapLength>Tools.min(len1, len2)){ 5093 if(overlapLength<minOverlapCluster){return null;} 5094 if(minOverlapPercentCluster>0f && (overlapLength*100f/Tools.min(len1, len2))<minOverlapPercentCluster){return null;} 5095 } 5096 5097 final byte[] a=bases(), b=u2.bases(); 5098 assert(a!=null && b!=null) : "Null bases for "+code1+" or "+u2.code1; 5099 int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, overlapLength); 5100 5101 if(verbose){ 5102 System.err.println("start1="+start1+", stop1="+stop1+", len1="+len1+", start2="+start2+", stop2="+stop2+", len2="+len2+ 5103 ", overlapLen="+overlapLength+", maxMismatches="+maxMismatches+", maxEdits="+maxEdits); 5104 } 5105 5106 for(int i=start1, j=start2; j<=stop2; i++, j++){ 5107 byte aa=a[i]; 5108 byte bb=b[j]; 5109 if(aa!=bb){ 5110 if(exact || (AminoAcid.isFullyDefined(aa) && AminoAcid.isFullyDefined(bb))){ 5111 if(earlyExit && j<k2){return null;} 5112 mismatches++; 5113 if(verbose){System.err.println("i="+i+", j="+j+", "+(char)aa+"!="+(char)bb+", mismatches="+mismatches+"/"+maxMismatches);} 5114 if(mismatches>1 && bandy!=null){ 5115 if(maxEdits<1){return null;} 5116 if(verbose){System.err.println("mismatches exceeded 1, attempting banded alignment.");} 5117 int edits=bandy.alignForward(b, a, start2, start1, maxEdits, exact); 5118 if(edits>maxEdits || bandy.score()<=4*edits){ 5119 if(verbose){System.err.println((edits>maxEdits ? "Too many edits" : "Alignment score too low")+"; returning null.");} 5120 return null; 5121 } 5122 assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits; 5123 stop2=bandy.lastQueryLoc; 5124 stop1=bandy.lastRefLoc; 5125 // if(bandy.lastOffset>0){//Ref longer than query 5126 // for(int k=0; k<bandy.lastOffset; k++){ 5127 // if(stop1+1<=len1){stop1++;} 5128 // else{stop2--;}//I don't think this can happen 5129 // } 5130 // }else if(bandy.lastOffset<0){//Query longer than ref 5131 // for(int k=0; k>bandy.lastOffset; k--){ 5132 // if(stop2+1<=len2){stop2++;} 5133 // else{stop1--;} 5134 // } 5135 // } 5136 return new Overlap(this, u2, FORWARD, start1, start2, stop1, stop2, overlapLength, edits, edits, bandy); 5137 }else if(mismatches>maxMismatches){return null;} 5138 } 5139 } 5140 } 5141 return new Overlap(this, u2, FORWARD, start1, start2, stop1, stop2, overlapLength, mismatches, 0, bandy); 5142 } 5143 5144 private Overlap makeOverlapForwardRC(Unit u2, int start1, BandedAligner bandy, boolean earlyExit) { 5145 if(verbose){System.err.println("makeOverlapForwardRC(u1="+this.name()+", u2="+u2.name()+", start="+start1+", earlyExit="+earlyExit+")");} 5146 if(ignoreReverseComplement){return null;} 5147 final int len1=length(), len2=u2.length(); 5148 int start2=len2-1; 5149 if(start1<0){ 5150 start2+=start1; 5151 start1=0; 5152 } 5153 final int overlapLength=Tools.min(len1-start1, start2+1); 5154 final int overlapLength2=Tools.max(len1-start1, start2+1); 5155 int stop1=start1+overlapLength-1, stop2=start2-overlapLength+1; 5156 if(verbose){System.err.println("Calculated stop1="+stop1+", stop2="+stop2+", overlapLength="+overlapLength);} 5157 5158 if(!allowAllContainedOverlaps || overlapLength>Tools.min(len1, len2)){ 5159 if(overlapLength<minOverlapCluster){return null;} 5160 if(minOverlapPercentCluster>0f && (overlapLength*100f/Tools.min(len1, len2))<minOverlapPercentCluster){return null;} 5161 } 5162 5163 final byte[] a=bases(), b=u2.bases(); 5164 assert(a!=null && b!=null) : "Null bases for "+code1+" or "+u2.code1; 5165 int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length); 5166 5167 for(int i=start1, j=start2, iprefix=start1+k2; i<=stop1; i++, j--){ 5168 byte aa=a[i]; 5169 byte bb=baseToComplementExtended[b[j]]; 5170 if(aa!=bb){ 5171 if(exact || (AminoAcid.isFullyDefined(aa) && AminoAcid.isFullyDefined(bb))){ 5172 if(earlyExit && i<iprefix){return null;} 5173 mismatches++; 5174 if(verbose){System.err.println("i="+i+", j="+j+", "+(char)aa+"!="+(char)bb+", mismatches="+mismatches+"/"+maxMismatches);} 5175 if(mismatches>1 && bandy!=null){ 5176 if(maxEdits<1){return null;} 5177 if(verbose){System.err.println("mismatches exceeded 1, attempting banded alignment.");} 5178 int edits=bandy.alignForwardRC(b, a, start2, start1, maxEdits, exact); 5179 if(edits>maxEdits || bandy.score()<=4*edits){ 5180 if(verbose){System.err.println((edits>maxEdits ? "Too many edits" : "Alignment score too low")+"; returning null.");} 5181 return null; 5182 } 5183 assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits; 5184 stop2=bandy.lastQueryLoc; 5185 stop1=bandy.lastRefLoc; 5186 // if(bandy.lastOffset>0){//Ref longer than query 5187 // for(int k=0; k<bandy.lastOffset; k++){ 5188 // if(stop1+1<=len1){stop1++;} 5189 // else{stop2++;}//I don't think this can happen 5190 // } 5191 // }else if(bandy.lastOffset<0){//Query longer than ref 5192 // for(int k=0; k>bandy.lastOffset; k--){ 5193 // if(stop2>0){stop2--;} 5194 // else{stop1--;} 5195 // } 5196 // } 5197 return new Overlap(this, u2, FORWARDRC, start1, start2, stop1, stop2, overlapLength, edits, edits, bandy); 5198 }else if(mismatches>maxMismatches){return null;} 5199 } 5200 } 5201 } 5202 return new Overlap(this, u2, FORWARDRC, start1, start2, stop1, stop2, overlapLength, mismatches, 0, bandy); 5203 } 5204 5205 private Overlap makeOverlapReverse(Unit u2, int start1, BandedAligner bandy, boolean earlyExit) { 5206 if(verbose){System.err.println("makeOverlapReverse(u1="+this.name()+", u2="+u2.name()+", start="+start1+", earlyExit="+earlyExit+")");} 5207 5208 final int len1=length(), len2=u2.length(); 5209 int start2=len2-1; 5210 if(start1>=len1){ 5211 start2-=(start1-len1+1); 5212 start1=len1-1; 5213 } 5214 final int overlapLength=Tools.min(start1+1, start2+1); 5215 final int overlapLength2=Tools.max(start1+1, start2+1); 5216 int stop1=start1-overlapLength+1, stop2=start2-overlapLength+1; 5217 if(verbose){System.err.println("Calculated stop1="+stop1+", stop2="+stop2+", overlapLength="+overlapLength);} 5218 5219 if(!allowAllContainedOverlaps || overlapLength>Tools.min(len1, len2)){ 5220 if(overlapLength<minOverlapCluster){return null;} 5221 if(minOverlapPercentCluster>0f && (overlapLength*100f/Tools.min(len1, len2))<minOverlapPercentCluster){return null;} 5222 } 5223 5224 final byte[] a=bases(), b=u2.bases(); 5225 assert(a!=null && b!=null) : "Null bases for "+code1+" or "+u2.code1; 5226 int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length); 5227 5228 if(verbose){ 5229 System.err.println("start1="+start1+", stop1="+stop1+", len1="+len1+", start2="+start2+", stop2="+stop2+", len2="+len2+ 5230 ", overlapLen="+overlapLength+", maxMismatches="+maxMismatches+", maxEdits="+maxEdits); 5231 } 5232 5233 for(int i=start1, j=start2, iprefix=start1-k2; i>=stop1; i--, j--){ 5234 byte aa=a[i]; 5235 byte bb=b[j]; 5236 if(aa!=bb){ 5237 if(exact || (AminoAcid.isFullyDefined(aa) && AminoAcid.isFullyDefined(bb))){ 5238 if(earlyExit && i>iprefix){return null;} 5239 mismatches++; 5240 if(verbose){System.err.println("i="+i+", j="+j+", "+(char)aa+"!="+(char)bb+", mismatches="+mismatches+"/"+maxMismatches);} 5241 if(mismatches>1 && bandy!=null){ 5242 if(maxEdits<1){return null;} 5243 if(verbose){System.err.println("mismatches exceeded 1, attempting banded alignment.");} 5244 int edits=bandy.alignReverse(b, a, start2, start1, maxEdits, exact); 5245 if(edits>maxEdits || bandy.score()<=4*edits){ 5246 if(verbose){System.err.println((edits>maxEdits ? "Too many edits" : "Alignment score too low")+"; returning null.");} 5247 return null; 5248 } 5249 assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits; 5250 stop2=bandy.lastQueryLoc; 5251 stop1=bandy.lastRefLoc; 5252 // if(bandy.lastOffset>0){//Ref longer than query 5253 // for(int k=0; k<bandy.lastOffset; k++){ 5254 // if(stop1>0){stop1--;} 5255 // else{stop2++;}//I don't think this can happen 5256 // } 5257 // }else if(bandy.lastOffset<0){//Query longer than ref 5258 // for(int k=0; k>bandy.lastOffset; k--){ 5259 // if(stop2>0){stop2--;} 5260 // else{stop1++;} 5261 // } 5262 // } 5263 return new Overlap(this, u2, REVERSE, start1, start2, stop1, stop2, overlapLength, edits, edits, bandy); 5264 }else if(mismatches>maxMismatches){return null;} 5265 } 5266 } 5267 } 5268 return new Overlap(this, u2, REVERSE, start1, start2, stop1, stop2, overlapLength, mismatches, 0, bandy); 5269 } 5270 5271 private Overlap makeOverlapReverseRC(Unit u2, int start1, BandedAligner bandy, boolean earlyExit) { 5272 if(verbose){System.err.println("makeOverlapReverseRC(u1="+this.name()+", u2="+u2.name()+", start="+start1+", earlyExit="+earlyExit+")");} 5273 if(ignoreReverseComplement){return null;} 5274 final int len1=length(), len2=u2.length(); 5275 int start2=0; 5276 if(start1>=len1){ 5277 start2+=(start1-len1+1); 5278 start1=len1-1; 5279 } 5280 final int overlapLength=Tools.min(start1+1, len2-start2); 5281 final int overlapLength2=Tools.max(start1+1, len2-start2); 5282 int stop1=start1-overlapLength+1, stop2=start2+overlapLength-1; 5283 if(verbose){System.err.println("Calculated stop1="+stop1+", stop2="+stop2+", overlapLength="+overlapLength);} 5284 5285 if(!allowAllContainedOverlaps || overlapLength>Tools.min(len1, len2)){ 5286 if(overlapLength<minOverlapCluster){return null;} 5287 if(minOverlapPercentCluster>0f && (overlapLength*100f/Tools.min(len1, len2))<minOverlapPercentCluster){return null;} 5288 } 5289 5290 final byte[] a=bases(), b=u2.bases(); 5291 assert(a!=null && b!=null) : "Null bases for "+code1+" or "+u2.code1; 5292 int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length); 5293 5294 if(verbose){ 5295 System.err.println("start1="+start1+", stop1="+stop1+", len1="+len1+", start2="+start2+", stop2="+stop2+", len2="+len2+ 5296 ", overlapLen="+overlapLength+", maxMismatches="+maxMismatches+", maxEdits="+maxEdits); 5297 } 5298 5299 for(int i=start1, j=start2; j<=stop2; i--, j++){ 5300 byte aa=a[i]; 5301 byte bb=baseToComplementExtended[b[j]]; 5302 if(aa!=bb){ 5303 if(exact || (AminoAcid.isFullyDefined(aa) && AminoAcid.isFullyDefined(bb))){ 5304 if(earlyExit && j<k2){return null;} 5305 mismatches++; 5306 if(verbose){System.err.println("i="+i+", j="+j+", "+(char)aa+"!="+(char)bb+", mismatches="+mismatches+"/"+maxMismatches);} 5307 if(mismatches>1 && bandy!=null){ 5308 if(maxEdits<1){return null;} 5309 if(verbose){System.err.println("mismatches exceeded 1, attempting banded alignment.");} 5310 int edits=bandy.alignReverseRC(b, a, start2, start1, maxEdits, exact); 5311 if(edits>maxEdits || bandy.score()<=4*edits){ 5312 if(verbose){System.err.println((edits>maxEdits ? "Too many edits" : "Alignment score too low")+"; returning null.");} 5313 return null; 5314 } 5315 assert(b.length<k || a.length<k || bandy.lastRow>=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits; 5316 stop2=bandy.lastQueryLoc; 5317 stop1=bandy.lastRefLoc; 5318 // if(bandy.lastOffset>0){//Ref longer than query 5319 // for(int k=0; k<bandy.lastOffset; k++){ 5320 // if(stop1>0){stop1--;} 5321 // else{stop2--;}//I don't think this can happen 5322 // } 5323 // }else if(bandy.lastOffset<0){//Query longer than ref 5324 // for(int k=0; k>bandy.lastOffset; k--){ 5325 // if(stop2+1<=len2){stop2++;} 5326 // else{stop1++;} 5327 // } 5328 // } 5329 return new Overlap(this, u2, REVERSERC, start1, start2, stop1, stop2, overlapLength, edits, edits, bandy); 5330 }else if(mismatches>maxMismatches){return null;} 5331 } 5332 } 5333 } 5334 return new Overlap(this, u2, REVERSERC, start1, start2, stop1, stop2, overlapLength, mismatches, 0, bandy); 5335 } 5336 5337 @Override 5338 public int compareTo(Unit b) { 5339 int x=comparePairedRC(this, b); 5340 // int y=comparePairedRC(b, this); 5341 // boolean eq1=this.equals(b); 5342 // boolean eq2=b.equals(this); 5343 // 5344 // assert((x==y)==(x==0)) : x+", "+y+"\n"+this+"\n"+b; 5345 // assert((x>0 == y<0) || (x==0 && y==0)) : x+", "+y+"\n"+this+"\n"+b; 5346 // 5347 // assert(eq1==eq2): x+", "+y+"\n"+this+"\n"+b; 5348 // assert(eq1==(x==0)): x+", "+y+"\n"+this+"\n"+b; 5349 // 5350 // assert(eq1 || this!=b); 5351 // 5352 // if(verbose){ //TODO: Remove 5353 // System.err.println(this+"\n"+b+"\n"+this.r.toFastq()+"\n"+this.r.mate.toFastq()+"\n"+b.r.toFastq()+"\n"+b.r.mate.toFastq()+"\n"); 5354 // System.err.println("\n"+x+", "+y+", "+eq1+", "+eq2); 5355 // verbose=false; 5356 // } 5357 5358 return x; 5359 } 5360 5361 @Override 5362 public boolean equals(Object b){return equals((Unit)b);} 5363 public boolean equals(Unit b){ 5364 boolean x=pairedEqualsRC(this, b); 5365 // assert(x==pairedEqualsRC(b, this)); 5366 // assert(x==(comparePairedRC(this, b)==0)); 5367 // assert(x==(comparePairedRC(b, this)==0)); 5368 // assert(x || this!=b); 5369 // System.err.println("\n****EQUALS?****:\n"+this+"\n"+b+"\n**** ****"); //TODO: Remove 5370 return x; 5371 } 5372 5373 @Override 5374 public int hashCode(){ 5375 return (int)((code1^(code1>>>32))&0xFFFFFFFFL); 5376 } 5377 5378 private synchronized void setValid(boolean b){ 5379 assert(b!=valid()); 5380 // if(!b){System.err.println("Setting invalid "+name());} 5381 if(b){flags&=~INVALID_MASK;} 5382 else{flags|=INVALID_MASK;} 5383 assert(b==valid()); 5384 } 5385 5386 private synchronized void setClustered(boolean b){ 5387 assert(b!=clustered()); 5388 if(b){flags|=CLUSTER_MASK;} 5389 else{flags&=~CLUSTER_MASK;} 5390 assert(b==clustered()); 5391 } 5392 5393 private void setVisited(boolean b){ 5394 assert(b!=visited()); 5395 if(b){flags|=VISIT_MASK;} 5396 else{flags&=~VISIT_MASK;} 5397 assert(b==visited()); 5398 } 5399 5400 private synchronized void setCanonical(boolean b){ 5401 assert(b!=canonical()); 5402 if(b){flags|=CANON_MASK;} 5403 else{flags&=~CANON_MASK;} 5404 assert(b==canonical()); 5405 assert(r==null || b==isCanonical(r.bases)); 5406 } 5407 5408 private void setCanonicized(boolean b){ 5409 assert(b!=canonicized()); 5410 if(b){flags|=CANONICIZED_MASK;} 5411 else{flags&=~CANONICIZED_MASK;} 5412 assert(b==canonicized()); 5413 } 5414 5415 private synchronized void setCanonContradiction(boolean b){ 5416 // assert(b!=canonContradiction()); 5417 if(b){flags|=CANON_CONTRADICTION_MASK;} 5418 else{flags&=~CANON_CONTRADICTION_MASK;} 5419 assert(b==canonContradiction()); 5420 } 5421 5422 private synchronized void setOffset(int x){ 5423 offset=x; 5424 setOffsetValid(true); 5425 } 5426 5427 private synchronized void setOffsetValid(boolean b){ 5428 assert(!offsetValid()); 5429 if(b){flags|=OFFSET_VALID_MASK;} 5430 else{flags&=~OFFSET_VALID_MASK;} 5431 assert(b==offsetValid()); 5432 } 5433 5434 private synchronized void setOffsetContradiction(boolean b){ 5435 // assert(b!=offsetContradiction()); 5436 assert(offsetValid()); 5437 if(b){flags|=OFFSET_CONTRADICTION_MASK;} 5438 else{flags&=~OFFSET_CONTRADICTION_MASK;} 5439 assert(b==offsetContradiction()); 5440 } 5441 5442 private void reverseComplement(){ 5443 assert(r!=null); 5444 r.reverseComplement(); 5445 long temp=prefix1; 5446 prefix1=suffix1; 5447 suffix1=temp; 5448 temp=prefix2; 5449 prefix2=suffix2; 5450 suffix2=temp; 5451 setCanonical(!canonical()); 5452 } 5453 5454 /** Return true if 'this' should be the first Unit in the overlap object */ 5455 public boolean firstInOverlap(Unit u2){ 5456 assert(this!=u2) : "\n"+this.r+"\n"+u2.r; 5457 if(u2.length()!=length()){return u2.length()<length();} 5458 if(u2.code1!=code1){return u2.code1<code1;} 5459 if(u2.code2!=code2){return u2.code2<code2;} 5460 int x=compareTo(u2); 5461 assert(x!=0 || (r!=null && r.mate!=null)); 5462 if(x!=0){return x>=0;} 5463 return r.numericID>=u2.r.numericID; 5464 } 5465 5466 public final boolean inSet(){ 5467 if(subsetCount<2){return true;} 5468 if(r.pairnum()>0){return ((Unit)r.mate.obj).inSet();} 5469 return ((code1&Long.MAX_VALUE)%subsetCount)==subset; 5470 } 5471 5472 public byte[] bases(){return r==null ? null : r.bases;} 5473 5474 public String name(){return r!=null ? r.id : null /*code+""*/;} 5475 @Override 5476 public String toString(){return "("+name()+","+code1+","+code2+","+length()+","+prefix1+","+suffix1+","+(canonical()?"c":"nc")+",d="+depth+")";} 5477 5478 5479 public final Read r; 5480 public final long code1; 5481 public final long code2; 5482 public long prefix1=-1; 5483 public long suffix1=-1; 5484 public long prefix2=-1; 5485 public long suffix2=-1; 5486 /** Distance of leftmost side of this read relative to leftmost side of root. 5487 * Assumes everything is in 'forward' orientation. */ 5488 public int offset=-999999999; 5489 public int depth=1; 5490 // private boolean valid=true; 5491 5492 public int unitID; 5493 5494 public ArrayList<Overlap> overlapList; 5495 5496 private long flags; 5497 /** True if the original read orientation was canonical */ 5498 public final boolean canonical(){return (CANON_MASK&flags)!=0;} 5499 /** True if this contig should be output, false if not */ 5500 public final boolean valid(){return (INVALID_MASK&flags)==0;} 5501 /** Length of this contig */ 5502 public final int length(){return (int)(LEN_MASK&flags);} 5503 /** Position of this contig relative to root */ 5504 public final int offset(){ 5505 assert(offsetValid()); 5506 return offset; 5507 } 5508 public int pairnum(){return (PAIRNUM_MASK&flags)==PAIRNUM_MASK ? 1 : 0;} 5509 5510 public void clearVolatileFlags(){ 5511 flags=flags&~(CANONICIZED_MASK|VISIT_MASK|CANON_CONTRADICTION_MASK|OFFSET_VALID_MASK|OFFSET_CONTRADICTION_MASK); 5512 assert(!visited()); 5513 assert(!canonicized()); 5514 assert(!canonContradiction()); 5515 assert(!offsetValid()); 5516 assert(!offsetContradiction()); 5517 } 5518 5519 public boolean visited(){return (VISIT_MASK&flags)==VISIT_MASK;} 5520 public boolean clustered(){return (CLUSTER_MASK&flags)==CLUSTER_MASK;} 5521 public boolean canonicized(){return (CANONICIZED_MASK&flags)==CANONICIZED_MASK;} 5522 public boolean canonContradiction(){return (CANON_CONTRADICTION_MASK&flags)==CANON_CONTRADICTION_MASK;} 5523 public boolean offsetValid(){return (OFFSET_VALID_MASK&flags)==OFFSET_VALID_MASK;} 5524 public boolean offsetContradiction(){return (OFFSET_CONTRADICTION_MASK&flags)==OFFSET_CONTRADICTION_MASK;} 5525 public boolean contradiction(){return offsetContradiction() || canonContradiction();} 5526 5527 private static final long LEN_MASK=0x7FFFFFFFL; 5528 private static final long CANON_MASK=(1L<<33); 5529 private static final long INVALID_MASK=(1L<<34); 5530 private static final long VISIT_MASK=(1L<<35); 5531 private static final long CLUSTER_MASK=(1L<<36); 5532 private static final long CANONICIZED_MASK=(1L<<37); 5533 private static final long CANON_CONTRADICTION_MASK=(1L<<38); 5534 private static final long OFFSET_VALID_MASK=(1L<<39); 5535 private static final long OFFSET_CONTRADICTION_MASK=(1L<<40); 5536 private static final long PAIRNUM_MASK=(1L<<41); 5537 } 5538 5539 private static final class UnitOffsetComparator implements Comparator<Unit> { 5540 5541 UnitOffsetComparator(){} 5542 5543 /* (non-Javadoc) 5544 * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object) 5545 */ 5546 @Override 5547 public int compare(Unit a, Unit b) { 5548 if(a.offsetValid() && b.offsetValid()){ 5549 int x=a.offset()-b.offset(); 5550 if(x!=0){return x;} 5551 }else{ 5552 if(a.offsetValid()){return -1;} 5553 if(b.offsetValid()){return 1;} 5554 } 5555 return a.compareTo(b); 5556 } 5557 5558 } 5559 5560 private static final class ClusterLengthComparator implements Comparator<ArrayList<Unit>> { 5561 5562 ClusterLengthComparator(){} 5563 5564 /* (non-Javadoc) 5565 * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object) 5566 */ 5567 @Override 5568 public int compare(ArrayList<Unit> a, ArrayList<Unit> b) { 5569 if(a.size()!=b.size()){return b.size()-a.size();} 5570 if(a.isEmpty() && b.isEmpty()){return 0;} 5571 return a.get(0).compareTo(b.get(0)); 5572 } 5573 5574 } 5575 5576 public static final int[] makeNmerIndex(final int n){ 5577 final int max=(1<<(2*n))-1; 5578 int[] array=new int[max+1]; 5579 5580 int count=0; 5581 for(int i=0; i<=max; i++){ 5582 final int a=i, b=AminoAcid.reverseComplementBinaryFast(i, n); 5583 int min=Tools.min(a, b); 5584 if(min==a){ 5585 array[a]=array[b]=count; 5586 count++; 5587 } 5588 } 5589 return array; 5590 } 5591 5592 /** Makes a nmer (e.g., tetramer) profile of a cluster */ 5593 private static final float[] makeNmerProfile(ArrayList<Unit> alu, long[] array_){ 5594 final int nbits=2*nmerLength; 5595 final long[] array=(array_==null ? new long[maxNmer+1] : array_); 5596 final int mask=~((-1)<<(nbits)); 5597 5598 long keysCounted=0; 5599 5600 for(Unit u : alu){ 5601 byte[] bases=u.r.bases; 5602 int len=0; 5603 int kmer=0; 5604 for(byte b : bases){ 5605 int x=AminoAcid.baseToNumber[b]; 5606 if(x<0){ 5607 len=0; 5608 kmer=0; 5609 }else{ 5610 kmer=((kmer<<2)|x)&mask; 5611 len++; 5612 if(len>=nmerLength){ 5613 int rkmer=AminoAcid.reverseComplementBinaryFast(kmer, nmerLength); 5614 keysCounted++; 5615 array[nmerIndex[Tools.min(kmer, rkmer)]]++; 5616 } 5617 } 5618 } 5619 } 5620 5621 if(keysCounted==0){keysCounted=1;} 5622 final float mult=1f/keysCounted; 5623 5624 float[] r=new float[array.length]; 5625 for(int i=0; i<array.length; i++){ 5626 r[i]=array[i]*mult; 5627 array[i]=0; 5628 } 5629 return r; 5630 } 5631 5632 private ConcurrentReadInputStream crisa[]; 5633 5634 private final ByteStreamWriter dupeWriter; 5635 5636 private String[] in1=null; 5637 private String[] in2=null; 5638 private String out=null; 5639 private String clusterFilePattern=null; 5640 private String outbest=null; 5641 private String outdupe=null; 5642 private String outcsf=null; 5643 private String outgraph=null; 5644 private int maxNs=-1; 5645 private long maxReads=-1; 5646 public boolean errorState=false; 5647 boolean sort=false; 5648 // boolean ascending=true; 5649 boolean absorbContainment=true; 5650 boolean absorbMatch=true; 5651 boolean findOverlaps=false; 5652 boolean makeClusters=false; 5653 boolean processClusters=false; 5654 boolean renameClusters=false; 5655 boolean absorbOverlap=false; 5656 boolean storeSuffix=false; 5657 boolean storeName=true; 5658 boolean storeQuality=true; 5659 boolean exact=true; 5660 boolean uniqueNames=true; 5661 boolean mergeHeaders=false; 5662 String mergeDelimiter=">"; 5663 boolean maxSpanningTree=false; 5664 5665 boolean canonicizeClusters=true; 5666 boolean removeCycles=true; 5667 boolean fixMultiJoins=true; 5668 boolean fixCanonContradictions=false; 5669 boolean fixOffsetContradictions=false; 5670 boolean countTransitive=false; 5671 boolean countRedundant=false; 5672 5673 private boolean multipleInputFiles=false; 5674 private boolean rigorousTransitive=false; 5675 private int numAffixMaps=1; 5676 private int maxAffixCopies=2000000000; 5677 private int maxEdges=2000000000; 5678 private int maxEdges2=2000000000; 5679 private boolean allowAllContainedOverlaps=false; 5680 // private boolean toUpperCase=false; 5681 5682 /** Trim left bases of the read to this position (exclusive, 0-based) */ 5683 private int forceTrimLeft=-1; 5684 /** Trim right bases of the read after this position (exclusive, 0-based) */ 5685 private int forceTrimRight=-1; 5686 5687 private boolean qTrimLeft=false; 5688 private boolean qTrimRight=false; 5689 private float trimQ=6; 5690 /** Error rate for trimming (derived from trimq) */ 5691 private final float trimE; 5692 5693 int maxEdits=0; 5694 int maxSubs=0; 5695 int bandwidth=9; 5696 final boolean customBandwidth; 5697 float minIdentity=100; 5698 float minIdentityMult=0; 5699 float minLengthPercent=0; 5700 int minOverlapCluster=100; 5701 int minOverlapMerge=1; 5702 float minOverlapPercentCluster=0; 5703 float minOverlapPercentMerge=0; 5704 5705 private int minClusterSize=1; 5706 private int minClusterSizeForStats=1; 5707 private boolean pickBestRepresentativePerCluster=false; 5708 5709 long readsProcessed=0; 5710 long basesProcessed=0; 5711 long collisions=0; 5712 long containments=0; 5713 long baseContainments=0; 5714 long containmentCollisions=0; 5715 long matches=0; 5716 long baseMatches=0; 5717 long overlaps=0; 5718 long baseOverlaps=0; 5719 long overlapCollisions=0; 5720 long addedToMain=0; 5721 5722 private final int subset; 5723 private final int subsetCount; 5724 private final boolean subsetMode; 5725 5726 private final int k; 5727 private final int k2; 5728 private final boolean EA=Shared.EA(); 5729 5730 /*--------------------------------------------------------------*/ 5731 /*---------------- Static Fields ----------------*/ 5732 /*--------------------------------------------------------------*/ 5733 5734 private static ReadComparator comparator=ReadLengthComparator.comparator; 5735 5736 private static int tcount=0; 5737 5738 private LinkedHashMap<Long, ArrayList<Unit>> codeMap=new LinkedHashMap<Long, ArrayList<Unit>>(4000000); 5739 private HashMap<LongM, ArrayList<Unit>> affixMap1=null; 5740 private HashMap<LongM, ArrayList<Unit>> affixMap2=null; 5741 private HashMap<LongM, ArrayList<Unit>>[] affixMaps=null; 5742 private ArrayDeque<ArrayList<Unit>> clusterQueue=null; 5743 private ArrayList<ArrayList<Unit>> processedClusters=null; 5744 private AtomicIntegerArray clusterNumbers=null; 5745 5746 private static final UnitOffsetComparator UNIT_OFFSET_COMPARATOR=new UnitOffsetComparator(); 5747 private static final ClusterLengthComparator CLUSTER_LENGTH_COMPARATOR=new ClusterLengthComparator(); 5748 private static final long[][] hashcodes=makeCodes2(32); 5749 public static final byte[] baseToNumber=new byte[128]; 5750 public static final byte[] baseToComplementNumber=new byte[128]; 5751 public static final byte[] baseToComplementExtended=new byte[128]; 5752 public static final int nmerLength=4; 5753 public static final int[] nmerIndex=makeNmerIndex(nmerLength); 5754 public static final int maxNmer=Tools.max(nmerIndex); 5755 private static PrintStream outstream=System.err; 5756 /** Permission to overwrite existing files */ 5757 public static boolean overwrite=false; 5758 /** Permission to append to existing files */ 5759 public static boolean append=false; 5760 public static boolean showSpeed=true; 5761 public static boolean verbose=false; 5762 public static boolean ignoreReverseComplement=false; 5763 public static boolean preventTransitiveOverlaps=false; 5764 public static boolean ignoreAffix1=false; 5765 public static boolean parseDepth=false; 5766 public static boolean printLengthInEdges=false; 5767 public static float depthRatio=2; 5768 public static int MINSCAF=0; 5769 public static int THREADS=Shared.threads(); 5770 public static int threadMaxReadsToBuffer=4000; 5771 public static int threadMaxBasesToBuffer=32000000; 5772 public static boolean DISPLAY_PROGRESS=true; 5773 public static boolean UNIQUE_ONLY=false; 5774 public static boolean REQUIRE_MATCHING_NAMES=false; 5775 public static boolean NUMBER_GRAPH_NODES=true; 5776 public static boolean ADD_PAIRNUM_TO_NAME=true; 5777 public static boolean HASH_NS=false; 5778 5779 private static int reverseType(int type){return (type+2)%4;} 5780 public static final int FORWARD=0; 5781 public static final int FORWARDRC=1; 5782 public static final int REVERSE=2; 5783 public static final int REVERSERC=3; 5784 public static final String[] OVERLAP_TYPE_NAMES=new String[] {"FORWARD", "FORWARDRC", "REVERSE", "REVERSERC"}; 5785 public static final String[] OVERLAP_TYPE_ABBREVIATIONS=new String[] {"F", "FRC", "R", "RRC"}; 5786 5787 static{//All others are 0 5788 baseToNumber['A']=baseToNumber['a']=0; 5789 baseToNumber['C']=baseToNumber['c']=1; 5790 baseToNumber['G']=baseToNumber['g']=2; 5791 baseToNumber['T']=baseToNumber['t']=3; 5792 baseToNumber['U']=baseToNumber['u']=3; 5793 5794 baseToComplementNumber['A']=baseToComplementNumber['a']=3; 5795 baseToComplementNumber['C']=baseToComplementNumber['c']=2; 5796 baseToComplementNumber['G']=baseToComplementNumber['g']=1; 5797 baseToComplementNumber['T']=baseToComplementNumber['t']=0; 5798 baseToComplementNumber['U']=baseToComplementNumber['u']=0; 5799 5800 for(int i=0; i<AminoAcid.baseToComplementExtended.length; i++){ 5801 byte b=AminoAcid.baseToComplementExtended[i]; 5802 baseToComplementExtended[i]=(b<0 ? (byte)i : b); 5803 } 5804 } 5805 5806 } 5807