1 package sketch; 2 3 import java.io.PrintStream; 4 import java.util.ArrayList; 5 import java.util.Arrays; 6 import java.util.Collections; 7 import java.util.Random; 8 9 import aligner.SingleStateAlignerFlat2; 10 import aligner.SingleStateAlignerFlatFloat; 11 import dna.AminoAcid; 12 import dna.Data; 13 import prok.GeneCaller; 14 import prok.GeneModel; 15 import prok.GeneModelParser; 16 import prok.ProkObject; 17 import shared.Parse; 18 import shared.Shared; 19 import shared.Timer; 20 import shared.Tools; 21 import stream.Read; 22 import structures.EntropyTracker; 23 import structures.IntList3; 24 import tax.TaxTree; 25 26 public class SketchObject { 27 28 /*--------------------------------------------------------------*/ 29 /*---------------- Parsing ----------------*/ 30 /*--------------------------------------------------------------*/ 31 main(String[] args)32 public static void main(String[] args){ 33 Timer t=new Timer(); 34 for(int i=0; i<1000000; i++){ 35 wkidToAniExact(0.5, 32, 23, 0.000005); 36 } 37 t.stop(); 38 System.out.println(t); 39 } 40 parseSketchFlags(String arg, String a, String b)41 public static boolean parseSketchFlags(String arg, String a, String b){ 42 43 if(parseCoding(arg, a, b)){ 44 // 45 } 46 47 else if(a.equalsIgnoreCase("k")){ 48 if(b.indexOf(',')>=0){ 49 String[] split=b.split(","); 50 assert(split.length==2) : "Bad argument "+arg; 51 int x=Integer.parseInt(split[0]); 52 int y=Integer.parseInt(split[1]); 53 k=Tools.max(x, y); 54 k2=Tools.min(x, y); 55 if(k==k2){k2=0;} 56 }else{ 57 k=Integer.parseInt(b); 58 k2=0; 59 } 60 setK=true; 61 }else if(a.equalsIgnoreCase("rcomp")){ 62 rcomp=Parse.parseBoolean(b); 63 }else if(a.equalsIgnoreCase("minfakeid")){ 64 minFakeID=Integer.parseInt(b); 65 }else if(a.equalsIgnoreCase("hashNames")){ 66 hashNames=Parse.parseBoolean(b); 67 }else if(a.equalsIgnoreCase("preferSSUMap")){ 68 preferSSUMap=Parse.parseBoolean(b); 69 }else if(a.equalsIgnoreCase("preferSSUMapForEuks") || a.equalsIgnoreCase("preferSSUMapEuks")){ 70 preferSSUMapForEuks=Parse.parseBoolean(b); 71 }else if(a.equalsIgnoreCase("useSSUMapOnly")){ 72 useSSUMapOnly=Parse.parseBoolean(b); 73 }else if(a.equalsIgnoreCase("useSSUMapOnlyForEuks") || a.equalsIgnoreCase("SSUMapOnlyEuks")){ 74 useSSUMapOnlyForEuks=Parse.parseBoolean(b); 75 }else if(a.equalsIgnoreCase("skipNonCanonical")){ 76 skipNonCanonical=Parse.parseBoolean(b); 77 }else if(a.equalsIgnoreCase("useSizeEstimateInScore") || a.equalsIgnoreCase("useSizeEstimate")){ 78 useSizeEstimate=Parse.parseBoolean(b); 79 }else if(a.equalsIgnoreCase("blacklist") || a.equalsIgnoreCase("bl")){ 80 blacklist=b; 81 }else if(a.equalsIgnoreCase("whitelist") || a.equalsIgnoreCase("usewhitelist")){ 82 useWhitelist=Parse.parseBoolean(b); 83 } 84 85 else if(a.equalsIgnoreCase("sampleseed")){ 86 sampleseed=Long.parseLong(b); 87 } 88 89 else if(a.equalsIgnoreCase("size") || a.equalsIgnoreCase("sketchsize")){ 90 if("auto".equalsIgnoreCase(b)){ 91 AUTOSIZE=true; 92 AUTOSIZE_LINEAR=false; 93 SET_AUTOSIZE=true; 94 }else if("linear".equalsIgnoreCase(b)){ 95 AUTOSIZE=false; 96 AUTOSIZE_LINEAR=true; 97 SET_AUTOSIZE=true; 98 }else{ 99 AUTOSIZE=false; 100 AUTOSIZE_LINEAR=false; 101 targetSketchSize=Parse.parseIntKMG(b); 102 SET_TARGET_SIZE=true; 103 } 104 }else if(a.equalsIgnoreCase("maxfraction") || a.equalsIgnoreCase("maxgenomefraction") || a.equalsIgnoreCase("mgf")){ 105 maxGenomeFraction=Float.parseFloat(b); 106 }else if(a.equalsIgnoreCase("smallsketchsize")){ 107 smallSketchSize=Integer.parseInt(b); 108 }else if(a.equalsIgnoreCase("minSketchSize")){ 109 minSketchSize=Tools.max(1, Integer.parseInt(b)); 110 }else if(a.equalsIgnoreCase("autosize")){ 111 AUTOSIZE=Parse.parseBoolean(b); 112 if(AUTOSIZE){AUTOSIZE_LINEAR=false;} 113 SET_AUTOSIZE=true; 114 }else if(a.equalsIgnoreCase("autosizefactor") || a.equalsIgnoreCase("autosizefraction") || a.equalsIgnoreCase("autosizemult") || a.equalsIgnoreCase("sizemult")){ 115 AUTOSIZE_FACTOR=Float.parseFloat(b); 116 SET_AUTOSIZE_FACTOR=true; 117 AUTOSIZE_LINEAR=false; 118 SET_AUTOSIZE=true; 119 }else if(a.equalsIgnoreCase("linear") || a.equalsIgnoreCase("lineardensity") || a.equalsIgnoreCase("density")){ 120 if(Tools.isNumeric(b)){ 121 AUTOSIZE_LINEAR_DENSITY=Double.parseDouble(b); 122 AUTOSIZE_LINEAR=true; 123 AUTOSIZE=false; 124 SET_AUTOSIZE=true; 125 }else{ 126 AUTOSIZE_LINEAR=Parse.parseBoolean(b); 127 if(AUTOSIZE_LINEAR){AUTOSIZE=false;} 128 SET_AUTOSIZE=true; 129 } 130 }else if(a.equalsIgnoreCase("maxGenomeFractionSmall")){ 131 maxGenomeFractionSmall=Float.parseFloat(b); 132 }else if(a.equalsIgnoreCase("keyFraction")){ 133 double d=Double.parseDouble(b); 134 setKeyFraction(d); 135 }else if(a.equalsIgnoreCase("prealloc")){ 136 if(b==null || Character.isLetter(b.charAt(0))){ 137 if(Parse.parseBoolean(b)){ 138 prealloc=1.0f; 139 }else{ 140 prealloc=0; 141 } 142 }else{ 143 prealloc=Float.parseFloat(b); 144 } 145 } 146 147 else if(a.equalsIgnoreCase("intmap")){ 148 SketchIndex.useIntMap=Parse.parseBoolean(b); 149 }else if(a.equalsIgnoreCase("intmapsize")){ 150 SketchIndex.intMapSize=Parse.parseIntKMG(b); 151 }else if(a.equalsIgnoreCase("bitsetbits")){ 152 assert(false) : "bitsetbits should be 2."; 153 // bitSetBits=Integer.parseInt(b); 154 } 155 156 // else if(a.equalsIgnoreCase("minkmercount") || a.equalsIgnoreCase("minkeycount")){ 157 // minKeyOccuranceCount=Integer.parseInt(b); 158 // } 159 else if(a.equalsIgnoreCase("sketchHeapFactor")){ 160 sketchHeapFactor=Tools.max(1.0, Double.parseDouble(b)); 161 } 162 163 else if(a.equalsIgnoreCase("killok")){ 164 KILL_OK=Parse.parseBoolean(b); 165 }else if(a.equalsIgnoreCase("ssa") || a.equalsIgnoreCase("usessa")){ 166 useSSA=Parse.parseBoolean(b); 167 }else if(a.equalsIgnoreCase("ssa3") || a.equalsIgnoreCase("usessa3")){ 168 useSSA3=Parse.parseBoolean(b); 169 }else if(a.equalsIgnoreCase("exactani")){ 170 EXACT_ANI=Parse.parseBoolean(b); 171 }else if(a.equalsIgnoreCase("translate") || a.equalsIgnoreCase("callgenes")){ 172 translate=Parse.parseBoolean(b); 173 defaultParams.translate=translate; 174 }else if(a.equalsIgnoreCase("sixframes") || a.equalsIgnoreCase("6frames")){ 175 sixframes=Parse.parseBoolean(b); 176 defaultParams.sixframes=sixframes; 177 if(sixframes){ 178 translate=defaultParams.translate=true; 179 } 180 }else if(a.equalsIgnoreCase("processSSU") || a.equalsIgnoreCase("process16S") || a.equalsIgnoreCase("16S") || a.equalsIgnoreCase("SSU")){ 181 processSSU=Parse.parseBoolean(b); 182 }else if(a.equalsIgnoreCase("hashSeed")){ 183 hashSeed=Long.parseLong(b); 184 //remakeCodes(hashSeed); //Happens in postParse 185 } 186 187 else if(a.equalsIgnoreCase("forceDisableMultithreadedFastq")){ 188 forceDisableMultithreadedFastq=Parse.parseBoolean(b); 189 } 190 191 else if(a.equalsIgnoreCase("verifyentropy")){ 192 EntropyTracker.verify=Parse.parseBoolean(b); 193 }else if(a.equalsIgnoreCase("entropyK")){ 194 entropyK=Integer.parseInt(b); 195 }else if(a.equalsIgnoreCase("fastentropy") || a.equalsIgnoreCase("fentropy")){ 196 if(Parse.parseBoolean(b)){EntropyTracker.speed=EntropyTracker.FAST;} 197 }else if(a.equalsIgnoreCase("mediumentropy") || a.equalsIgnoreCase("mentropy")){ 198 if(Parse.parseBoolean(b)){EntropyTracker.speed=EntropyTracker.MEDIUM;} 199 }else if(a.equalsIgnoreCase("slowentropy") || a.equalsIgnoreCase("sentropy")){ 200 if(Parse.parseBoolean(b)){EntropyTracker.speed=EntropyTracker.SLOW;} 201 }else if(a.equalsIgnoreCase("superslowentropy") || a.equalsIgnoreCase("ssentropy")){ 202 if(Parse.parseBoolean(b)){EntropyTracker.speed=EntropyTracker.SUPERSLOW;} 203 }else if(a.equalsIgnoreCase("verbose2")){ 204 verbose2=Parse.parseBoolean(b); 205 }else if(a.equalsIgnoreCase("loadSketchesFromSketchFile2")){ 206 LOADER2=Parse.parseBoolean(b); 207 } 208 209 else if(a.equalsIgnoreCase("useToValue2") || a.equalsIgnoreCase("ToValue2")){ 210 useToValue2=Parse.parseBoolean(b); 211 }else if(a.equals("ssu") || a.equals("ssufile")){ 212 assert(false) : "ssufile is deprecated; please specify 16Sfile and/or 18Sfile independently"; 213 }else if(a.equalsIgnoreCase("16Sfile")/* || a.equalsIgnoreCase("ssufile") || a.equalsIgnoreCase("silvafile")*/){ 214 SSUMap.r16SFile=b; 215 if("auto".equalsIgnoreCase(SSUMap.r16SFile)){SSUMap.r16SFile=TaxTree.default16SFile();} 216 assert(SSUMap.r16SMap==null); 217 }else if(a.equalsIgnoreCase("18Sfile")){ 218 SSUMap.r18SFile=b; 219 if("auto".equalsIgnoreCase(SSUMap.r18SFile)){SSUMap.r18SFile=TaxTree.default18SFile();} 220 assert(SSUMap.r18SMap==null); 221 } 222 223 // else if(a.equalsIgnoreCase("minHashValue")){ 224 // minHashValue=Tools.max(1, Long.parseLong(b)); 225 // } 226 227 else{ 228 return false; 229 } 230 return true; 231 } 232 parseCoding(String arg, String a, String b)233 private static boolean parseCoding(String arg, String a, String b){ 234 if(a.equalsIgnoreCase("deltaout") || a.equalsIgnoreCase("delta")){ 235 deltaOut=Parse.parseBoolean(b); 236 }else if(a.equalsIgnoreCase("a33") || a.equalsIgnoreCase("a48")){ 237 boolean x=Parse.parseBoolean(b); 238 if(x){CODING=A48;} 239 else if(CODING==A48){CODING=HEX;} 240 }else if(a.equalsIgnoreCase("hex")){ 241 boolean x=Parse.parseBoolean(b); 242 if(x){CODING=HEX;} 243 else if(CODING==HEX){CODING=A48;} 244 }else if(a.equalsIgnoreCase("raw")){ 245 boolean x=Parse.parseBoolean(b); 246 if(x){CODING=RAW;} 247 else if(CODING==RAW){CODING=A48;} 248 }else{ 249 return false; 250 } 251 return true; 252 } 253 parseMode(String[] args)254 static int parseMode(String[] args){ 255 int mode=defaultParams.mode; 256 for(String arg : args){ 257 String[] split=arg.split("="); 258 String a=split[0].toLowerCase(); 259 String b=split.length>1 ? split[1] : null; 260 int x=parseMode(arg, a, b); 261 if(x>-1){mode=x;} 262 } 263 return mode; 264 } 265 parseMode(String arg, String a, String b)266 static int parseMode(String arg, String a, String b){ 267 int mode_=-1; 268 if(a.equalsIgnoreCase("mode")){ 269 assert(b!=null) : "Bad parameter: "+arg; 270 if(Tools.isDigit(b.charAt(0))){ 271 mode_=Integer.parseInt(b); 272 }else if(b.equalsIgnoreCase("single") || b.equalsIgnoreCase("onesketch")){ 273 mode_=ONE_SKETCH; 274 }else if(b.equalsIgnoreCase("taxa") || b.equalsIgnoreCase("pertaxa")){ 275 mode_=PER_TAXA; 276 }else if(b.equalsIgnoreCase("sequence") || b.equalsIgnoreCase("persequence") || b.equalsIgnoreCase("header") || b.equalsIgnoreCase("perheader")){ 277 mode_=PER_SEQUENCE; 278 // }else if(b.equalsIgnoreCase("header") || b.equalsIgnoreCase("perheader")){ 279 // mode_=PER_HEADER; 280 }else if(b.equalsIgnoreCase("img")){ 281 mode_=PER_IMG; 282 }else if(b.equalsIgnoreCase("perfile") || b.equalsIgnoreCase("file")){ 283 mode_=PER_FILE; 284 }else{ 285 assert(false) : "Bad parameter: "+arg; 286 } 287 }else if(a.equalsIgnoreCase("single") || a.equalsIgnoreCase("onesketch")){ 288 mode_=ONE_SKETCH; 289 }else if(a.equalsIgnoreCase("taxa") || a.equalsIgnoreCase("pertaxa")){ 290 mode_=PER_TAXA; 291 }else if(a.equalsIgnoreCase("sequence") || a.equalsIgnoreCase("persequence") || a.equalsIgnoreCase("header") || a.equalsIgnoreCase("perheader")){ 292 mode_=PER_SEQUENCE; 293 // }else if(a.equalsIgnoreCase("header") || a.equalsIgnoreCase("perheader")){ 294 // mode_=PER_HEADER; 295 }else if(a.equalsIgnoreCase("perfile") || a.equalsIgnoreCase("file")){ 296 mode_=PER_FILE; 297 } 298 return mode_; 299 } 300 301 /*--------------------------------------------------------------*/ 302 /*---------------- Initialization ----------------*/ 303 /*--------------------------------------------------------------*/ 304 setTaxtree(String taxTreeFile, PrintStream outstream)305 static synchronized void setTaxtree(String taxTreeFile, PrintStream outstream){ 306 if(taxTreeFile==null){ 307 taxtree=null; 308 return; 309 } 310 if(treefile!=null){ 311 assert(!treefile.equals(taxTreeFile)); 312 if(treefile.equals(taxTreeFile)){return;} 313 treefile=taxTreeFile; 314 } 315 taxtree=TaxTree.loadTaxTree(taxTreeFile, outstream, hashNames, false); 316 } 317 reset()318 public static void reset(){ 319 postparsed=false; 320 blacklist=null; 321 useWhitelist=false; 322 defaultParams=new DisplayParams(); 323 amino=false; 324 translate=false; 325 sixframes=false; 326 } 327 postParse()328 public static void postParse(){ 329 if(postparsed){return;} 330 postparsed=true; 331 IntList3.defaultMode=IntList3.UNIQUE; //Not really safe, if Seal uses Sketch... 332 333 if(defaultParams.amino()){amino=true;} 334 if(amino){Shared.AMINO_IN=true;} 335 336 if(amino8){ 337 AminoAcid.AMINO_SHIFT=3; 338 System.err.println("Set AMINO_SHIFT to "+AminoAcid.AMINO_SHIFT); 339 } 340 341 if(amino || translate){ 342 rcomp=false; 343 if(k>12){ 344 int oldk=k; 345 int oldk2=k2; 346 // k=Tools.min(k, 63/AminoAcid.AMINO_SHIFT); 347 // k2=Tools.min(k2, k); 348 k=defaultKAmino; 349 k2=defaultK2Amino; 350 if(k==k2){k2=0;} 351 if(k!=oldk || k2!=oldk2){System.err.println("Set k to "+k+(k2>0 ? ","+k2 : ""));} 352 } 353 // AUTOSIZE_FACTOR=(AUTOSIZE_FACTOR*defaultK)/k; 354 // System.err.println("Set AUTOSIZE_FACTOR to "+AUTOSIZE_FACTOR); 355 } 356 357 if(aminoOrTranslate()){ 358 bitsPerBase=5;//Maybe it is safe to keep these at 4 and 8 or 5 and 8; need to check. 359 bitsPerCycle=10; 360 }else{ 361 bitsPerBase=2; 362 bitsPerCycle=8; 363 } 364 basesPerCycle=bitsPerCycle/bitsPerBase; 365 hashCycles=((k*bitsPerBase)+bitsPerCycle-1)/bitsPerCycle; 366 367 cycleMask=~((-1L)<<bitsPerCycle); 368 maxCycles=(64+bitsPerCycle-1)/bitsPerCycle; 369 codeIncrement=(int)(cycleMask+1); 370 codes=makeCodes(maxCycles, codeIncrement, hashSeed, false); 371 codes1D=makeCodes1D(codes); 372 373 if(k2>0){ 374 assert(k2<k) : "k2 must be less than k."; 375 // assert(k2%basesPerCycle==0) : "k2 must be a multiple of "+basesPerCycle; //No longer required! Still recommended for speed though. 376 377 k2mask=~((-1L)<<(bitsPerBase*k2)); 378 k2submask=~((-1L)<<(bitsPerBase*(k2%basesPerCycle))); 379 k2shift=(k-k2); //for toValue2 380 k2midmask=(k2mask<<((k2shift*bitsPerBase)/2)); //for toValue2 381 hashCycles2=k2/basesPerCycle; 382 }else{ 383 useToValue2=false; 384 } 385 codeMax=hashCycles*codeIncrement; 386 codeMax2=hashCycles2*codeIncrement; 387 388 // hasher=k2<1 ? new Hasher1() : k2submask==0 ? new Hasher2() : new Hasher3(); 389 390 if(translate || processSSU){ 391 if(pgmFile==null){ 392 pgmFile=Data.findPath("?model.pgm"); 393 } 394 pgm=GeneModelParser.loadModel(pgmFile); 395 GeneCaller.call16S=processSSU; 396 GeneCaller.call18S=false;//processSSU; 397 GeneCaller.call23S=false; 398 GeneCaller.call5S=false; 399 GeneCaller.calltRNA=false; 400 GeneCaller.callCDS=translate; 401 402 if(GeneCaller.call16S || GeneCaller.call18S || GeneCaller.call23S){ 403 ProkObject.loadLongKmers(); 404 ProkObject.loadConsensusSequenceFromFile(true, true); 405 } 406 } 407 408 // if(HASH_VERSION<2 && useToValue2){HASH_VERSION=2;} 409 // else if(HASH_VERSION==2 && !useToValue2){HASH_VERSION=1;} 410 // assert(blacklist!=null) : blacklist+", "+(blacklist==null); 411 if(blacklist!=null){Blacklist.addFiles(blacklist);} 412 413 // System.err.println("amino="+amino+", k="+k+", k2="+k2+", bitsPerCycle="+bitsPerCycle+", bitsPerBase="+bitsPerBase+ 414 // ", basesPerCycle="+basesPerCycle+", hashCycles="+hashCycles+", k2mask="+k2mask+", k2submask="+k2submask+", hashCycles2="+hashCycles2+ 415 // ", codeMax="+codeMax+", codeMax2="+codeMax2); 416 // structures.IntList3.ascending=false;//123 for testing 417 418 alignerPool=new AlignmentThreadPool(Shared.threads()); 419 } 420 421 /*--------------------------------------------------------------*/ 422 /*---------------- Hashing ----------------*/ 423 /*--------------------------------------------------------------*/ 424 425 private static void remakeCodes(long hashSeed){ 426 long[][] codes2=makeCodes(maxCycles, codeIncrement, hashSeed, false); 427 long[] codes1D2=makeCodes1D(codes2); 428 for(int i=0; i<codes2.length; i++){ 429 for(int j=0; j<codes2[i].length; j++){ 430 codes[i][j]=codes2[i][j]; 431 } 432 } 433 for(int i=0; i<codes1D2.length; i++){ 434 codes1D[i]=codes1D2[i]; 435 } 436 } 437 438 public static long[][] makeCodes(int symbols, int modes, long seed, boolean positive){ 439 Random randy=new Random(seed); 440 long mask=positive ? Long.MAX_VALUE : -1L; 441 long[][] r=new long[symbols][modes]; 442 for(int i=0; i<symbols; i++){ 443 for(int j=0; j<modes; j++){ 444 r[i][j]=randy.nextLong()&mask; 445 } 446 } 447 for(int i=0; i<3; i++) { 448 antialias(r, randy); 449 } 450 return r; 451 } 452 453 private static void antialias(long[][] matrix, Random randy){ 454 for(long[] array : matrix){ 455 antialias(array, randy); 456 } 457 } 458 459 private static void antialias(long[] array, Random randy){ 460 for(int i=0; i<64; i++){ 461 antialiasNumbers(array, randy); 462 antialiasBit(array, randy, i); 463 } 464 } 465 466 private static void antialiasBit(long[] array, Random randy, int bit){ 467 int half=array.length/2; 468 long ones=0; 469 for(int i=0; i<array.length; i++){ 470 ones+=(array[i]>>bit)&1L; 471 } 472 final long orMask=1L<<bit; 473 final long andMask=~orMask; 474 while(ones<half-1){ 475 int loc=randy.nextInt(array.length); 476 while((array[loc]&orMask)!=0){ 477 loc=randy.nextInt(array.length); 478 } 479 array[loc]|=orMask; 480 ones++; 481 } 482 while(ones>half+1){ 483 int loc=randy.nextInt(array.length); 484 while((array[loc]&orMask)==0){ 485 loc=randy.nextInt(array.length); 486 } 487 array[loc]&=andMask; 488 ones--; 489 } 490 } 491 492 private static void antialiasNumbers(long[] array, Random randy){ 493 for(int i=0; i<array.length; i++){ 494 array[i]=antialiasNumber(array[i], randy); 495 } 496 } 497 498 private static long antialiasNumber(long number, Random randy){ 499 int ones=Long.bitCount(number); 500 while(Long.bitCount(number)<31){ 501 number=number|(1L<<randy.nextInt(64)); 502 } 503 while(Long.bitCount(number)>33){ 504 number=number&~(1L<<randy.nextInt(64)); 505 } 506 return number; 507 } 508 509 // public static long[] makeCodes1D(int symbols, int modes, long seed, boolean positive){ 510 // Random randy=Shared.threadLocalRandom(seed); 511 // long mask=positive ? Long.MAX_VALUE : -1L; 512 // long[] r=new long[symbols*modes]; 513 // for(int i=0; i<r.length; i++){ 514 // r[i]=randy.nextLong()&mask; 515 // } 516 // return r; 517 // } 518 519 public static long[] makeCodes1D(long[][] codes2D){ 520 final int len=codes2D.length*codes2D[0].length; 521 long[] codes1D=new long[len]; 522 int k=0; 523 for(long[] array : codes2D){ 524 for(long x : array){ 525 codes1D[k]=x; 526 k++; 527 } 528 } 529 return codes1D; 530 } 531 532 public static final long hash(long kmer){//Avoid using this. 533 return rcomp ? hash(kmer, AminoAcid.reverseComplementBinaryFast(kmer, k)) : hash(kmer, kmer); 534 } 535 536 public static final long hash(long kmer, long rkmer){ 537 assert(postparsed); 538 // return hasher.hash_inner(kmer); //Slow 539 // return k2<1 ? hash1(kmer) : hash2(kmer); 540 if(useToValue2){return hashToValue2(kmer, rkmer);} 541 final long key=toValue(kmer, rkmer); 542 return k2<1 ? hash1(key) : k2submask==0 ? hash2(key) : hash3(key); 543 } 544 545 /** 546 * New version. 547 * Generates a hash code from a kmer. 548 * @param kmer Kmer to hash 549 * @return Hash code 550 */ 551 private static final long hashToValue2(final long kmer0, final long rkmer0){ 552 // return Tools.max(hash64shift(kmer0), hash64shift(kmer0&0xFFFFFFFFFFFFL));//Something like this would be around 10% faster; not worth it. 553 long kmer=kmer0, rkmer=rkmer0; 554 final long key; 555 final boolean useK1; 556 if(rcomp){ 557 assert(!amino && !translate); 558 final long kmer2=((kmer&k2midmask)>>>k2shift); 559 final long rkmer2=((rkmer&k2midmask)>>>k2shift); 560 final long max2=Tools.max(kmer2, rkmer2); 561 // long cmasked=max2&kmerChoiceMask; 562 // useK1=kmerChoiceBitset.get((int)cmasked); 563 useK1=((max2%4999L)&1L)==0L; 564 key=useK1 ? Tools.max(kmer, rkmer) : max2; 565 // System.err.println("\n"+longer+", "+max2+", "+(max2%4999L)+", "+((max2%4999L)&1L)+ 566 // ", "+Long.toHexString(kmer)+", "+Long.toHexString(max2)+", "+Long.toHexString(value)); 567 // long value=longer ? (kmer2>rkmer2 ? kmer : rkmer) : max2; 568 // System.err.println("\n"+AminoAcid.kmerToString(kmer, k)+"\n"+AminoAcid.kmerToString(rkmer, k)+ 569 // "\n"+AminoAcid.kmerToString(kmer2, k2)+"\n"+AminoAcid.kmerToString(rkmer2, k2)+"\n"+AminoAcid.kmerToString(value, k)+"\n"); 570 // assert(kmer2==AminoAcid.reverseComplementBinaryFast(rkmer2, k2));) 571 }else{ 572 assert(amino || translate); 573 final long kmer2=(kmer&k2mask); 574 useK1=((kmer2%4999)&1L)==0L; 575 key=useK1 ? kmer : kmer2; 576 } 577 // if(key%5!=0){return -1;}//This makes it a little faster. 578 final long bit=useK1 ? 0 : 1; //Note that this gets reversed later during the subtraction process 579 580 // System.err.println(bit+", "+Long.toHexString(kmer)+", "+Long.toHexString(k2mask)); 581 // assert(bit==0) : bit+", "+Long.toHexString(kmer); 582 583 long hashcode=key, data=key; 584 // for(int i=0; i<codeMax; i+=codeIncrement){ 585 // int x=(int)(data&cycleMask); 586 // data>>>=bitsPerCycle; 587 // hashcode^=codes1D[i+x]; 588 // } 589 // for(int i=0; data!=0; i+=codeIncrement){ 590 // int x=(int)(data&cycleMask); 591 // data>>>=bitsPerCycle; 592 // hashcode^=codes1D[i+x]; 593 // } 594 {//5% faster than other loops. 595 int i=0; 596 do{ 597 int x=(int)(data&cycleMask); 598 data>>>=bitsPerCycle; 599 hashcode^=codes1D[i+x]; 600 i+=codeIncrement; 601 }while(data!=0); 602 } 603 hashcode=(hashcode&(-2L))|bit; 604 605 return hashcode; 606 } 607 608 /** Taken from Thomas Wang, Jan 1997: 609 * http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm 610 * 611 * This is much faster than the table version. Results seem similar y. 612 */ 613 private static long hash64shift(long key){ 614 key = (~key) + (key << 21); // key = (key << 21) - key - 1; 615 key = key ^ (key >>> 24); 616 key = (key + (key << 3)) + (key << 8); // key * 265 617 key = key ^ (key >>> 14); 618 key = (key + (key << 2)) + (key << 4); // key * 21 619 key = key ^ (key >>> 28); 620 key = key + (key << 31); 621 return key; 622 } 623 624 /** 625 * Fastest version! 626 * Generates a hash code from a kmer. 627 * @param kmer Kmer to hash 628 * @return Hash code 629 */ 630 private static final long hash1(long kmer){ 631 // if(kmer%5!=0){return -1;} 632 long code=kmer; 633 for(int i=0; i<codeMax; i+=codeIncrement){ 634 int x=(int)(kmer&cycleMask); 635 kmer>>=bitsPerCycle; 636 code^=codes1D[i+x]; 637 } 638 639 return code; 640 } 641 642 643 /** 644 * Generates a hash code from a kmer, using dual kmer lengths. 645 * @param kmer Kmer to hash 646 * @return Hash code 647 */ 648 private static final long hash2(final long kmer0){ 649 // return hash64shift(kmer0); 650 long kmer=kmer0; 651 long code=0; 652 653 for(int i=0; i<codeMax2; i+=codeIncrement){ 654 int x=(int)(kmer&cycleMask); 655 kmer>>=bitsPerCycle; 656 code^=codes1D[i+x]; 657 } 658 final long code2=code^(k2mask&kmer0); 659 for(int i=codeMax2; i<codeMax; i+=codeIncrement){ 660 int x=(int)(kmer&cycleMask); 661 kmer>>=bitsPerCycle; 662 code^=codes1D[i+x]; 663 } 664 return ((code&1)==1) ? code2 : code^kmer0; //Random; faster than max. 665 } 666 667 668 /** 669 * Generates a hash code from a kmer, using dual kmer lengths, allowing K2 to be a non-multiple-of-4. 670 * Uses one additional and, xor, and lookup. 671 * @param kmer Kmer to hash 672 * @return Hash code 673 */ 674 private static final long hash3(final long kmer0){ 675 // return hash64shift(kmer0); 676 677 long kmer=kmer0; 678 long code=0; 679 assert(k2submask!=0); 680 681 int i=0; 682 for(; i<codeMax2; i+=codeIncrement){ 683 int x=(int)(kmer&cycleMask); 684 kmer>>=bitsPerCycle; 685 code^=codes1D[i+x]; 686 } 687 final int residual=(int)(kmer&k2submask); 688 final long code2=code^(k2mask&kmer0)^codes1D[i+residual]; 689 for(; i<codeMax; i+=codeIncrement){ 690 int x=(int)(kmer&cycleMask); 691 kmer>>=bitsPerCycle; 692 code^=codes1D[i+x]; 693 } 694 return ((code&1)==1) ? code2 : code^kmer0; //Random; faster than max. 695 } 696 697 698 /*--------------------------------------------------------------*/ 699 /*---------------- Convenience Methods ----------------*/ 700 /*--------------------------------------------------------------*/ 701 702 public static final float align(byte[] query, byte[] ref){ 703 SingleStateAlignerFlat2 ssa=GeneCaller.getSSA(); 704 int a=0, b=ref.length-1; 705 int[] max=ssa.fillUnlimited(query, ref, a, b, -9999); 706 if(max==null){return 0;} 707 708 final int rows=max[0]; 709 final int maxCol=max[1]; 710 final int maxState=max[2]; 711 final float id=ssa.tracebackIdentity(query, ref, a, b, rows, maxCol, maxState, null); 712 return id; 713 } 714 715 public static final float alignAndMakeMatch(Read r, byte[] ref){ 716 byte[] query=r.bases; 717 SingleStateAlignerFlat2 ssa=GeneCaller.getSSA(); 718 int a=0, b=ref.length-1; 719 int[] max=ssa.fillUnlimited(query, ref, a, b, -9999); 720 if(max==null){return 0;} 721 722 final int rows=max[0]; 723 final int maxCol=max[1]; 724 final int maxState=max[2]; 725 byte[] match=ssa.traceback(query, ref, a, b, rows, maxCol, 0); 726 int[] score=ssa.score(query, ref, a, b, rows, maxCol, 0); 727 final float id=Read.identity(match); 728 r.match=match; 729 r.start=score[1]; 730 r.stop=score[2]; 731 r.setMapped(true); 732 return id; 733 } 734 735 public static final float alignAndMakeMatch(Read r, byte[] ref, float[] refWeights/*, float[] insWeights, float[] delWeights*/){ 736 byte[] query=r.bases; 737 SingleStateAlignerFlatFloat ssa=GeneCaller.getSSAF(); 738 // ssa.setWeights(refWeights, insWeights, delWeights); 739 ssa.setWeights(refWeights); 740 int a=0, b=ref.length-1; 741 int[] max=ssa.fillUnlimited(query, ref, a, b, -9999); 742 if(max==null){return 0;} 743 744 final int rows=max[0]; 745 final int maxCol=max[1]; 746 final int maxState=max[2]; 747 byte[] match=ssa.traceback(query, ref, a, b, rows, maxCol, 0); 748 int[] score=ssa.score(query, ref, a, b, rows, maxCol, 0); 749 final float id=Read.identity(match); 750 r.match=match; 751 r.start=score[1]; 752 r.stop=score[2]; 753 r.setMapped(true); 754 return id; 755 } 756 757 static String fixMeta(String s){ 758 if(s==null){return null;} 759 int colon=s.indexOf(':'); 760 assert(colon>=0); 761 if(s.length()==colon+5 && s.endsWith(":null")){return null;} 762 return s.replace('\t', ' '); 763 } 764 765 static ArrayList<String> fixMeta(ArrayList<String> list){ 766 if(list==null || list.isEmpty()){return null;} 767 for(int i=0; i<list.size(); i++){ 768 String s=list.get(i); 769 s=fixMeta(s); 770 if(s==null){ 771 list.remove(i); 772 i--; 773 }else{ 774 list.set(i, s); 775 } 776 } 777 if(list.isEmpty()){return null;} 778 list.trimToSize(); 779 Collections.sort(list); 780 return list; 781 } 782 783 public static final float aniToWkid(final double ani){ 784 final float wkid; 785 if(ani<=0){ 786 wkid=0; 787 }else if(k2<1){ 788 wkid=(float)Math.pow(ani, k); 789 }else{ 790 wkid=0.5f*(float)(Math.pow(ani, k)+Math.pow(ani, k2)); 791 } 792 return wkid; 793 } 794 795 public static final float wkidToAniExact(final double wkid, final int A, final int B, final double maxDeviation){ 796 assert(A>B); 797 assert(maxDeviation>0); 798 final double logWkid=Math.log(wkid); 799 final double invA=1.0/A; 800 final double invB=1.0/B; 801 802 double aniUpper=Math.exp(logWkid*invA); 803 double aniLower=Math.exp(logWkid*invB); 804 assert(aniLower<=aniUpper); 805 double aniEst=(aniLower+aniUpper)*0.5f; 806 // System.err.println(aniEst); 807 // if(B>16){//Fast mode 808 // aniUpper=(aniUpper+3*aniEst)*0.25; 809 // aniLower=(aniLower+3*aniEst)*0.25; 810 // } 811 double wkidEst=aniToWkid(aniEst, A, B); 812 int iters=1; 813 // System.err.println(aniLower+"\t"+aniUpper+"\t"+aniEst+"\t"+aniToWkid(aniEst, A, B)+"\t"+(wkidEst-wkid)); 814 while(Math.abs(wkidEst-wkid)>maxDeviation && iters<40){ 815 if(wkidEst<wkid){aniLower=aniEst;} 816 else{aniUpper=aniEst;} 817 aniEst=(aniLower+aniUpper)*0.5f; 818 wkidEst=aniToWkid(aniEst, A, B); 819 // System.err.println(aniLower+"\t"+aniUpper+"\t"+aniEst+"\t"+wkidEst+"\t"+(wkidEst-wkid)); 820 iters++; 821 } 822 // System.err.println("Iterations: "+iters); 823 return (float)aniEst; 824 } 825 826 public static double aniToWkid(double ani, int A, int B){ 827 if(A<B){int C=A; A=B; B=C;}//Swap 828 double aPow=Math.pow(ani, A); 829 double bPow=Math.pow(ani, B); 830 // return 0.5*(aPow+bPow); 831 return useToValue2 ? 0.5*(aPow+bPow) : 0.5*(aPow+(bPow*0.4+aPow*0.6)); 832 } 833 834 //From Kayla 835 // public static double aniToWkid(double ANI, int K2, int K1){ 836 // if(K2<K1){int C=K2; K2=K1; K1=C;}//Swap 837 // 838 // return 0.5*((1 - (1 - Math.pow(ANI, K2-K1))*Math.pow(ANI, K1))*Math.pow(ANI, K2) + 839 // (1 + (1 - Math.pow(ANI, K2-K1))*Math.pow(ANI, K1))*Math.pow(ANI, K1)); 840 // } 841 842 public static double aniToWkid(double ani, int A){ 843 return Math.pow(ani, A); 844 } 845 846 public static final float wkidToAniExact(final double wkid, final int k){ 847 return (float)Math.exp(Math.log(wkid)/(k)); 848 } 849 850 public static final float wkidToAni(final double wkid){ 851 final float ani; 852 if(wkid<=0){ 853 ani=0; 854 }else if(k2<1){ 855 ani=(float)Math.exp(Math.log(wkid)/k); 856 }else{ 857 if(EXACT_ANI){return wkidToAniExact(wkid, k, k2, 0.00005);} 858 859 //This is linear interpolation which is not correct. But for some reason it works really well. 860 final double log=Math.log(wkid); 861 862 // double ani1=Math.exp(log/k); 863 // double ani2=Math.exp(log/k2); 864 // ani=(float)(0.5*(ani1+ani2)); 865 866 //alternatively... this one seems to work better. 867 // ani=(float)Math.exp(2*log/(k*1.1+k2*0.9)); 868 ani=(float)Math.exp(2*log/(1.2*k+.8*k2)); 869 } 870 return ani; 871 } 872 873 // public static void kmerArrayToSketchArray(long[] kmers){ 874 // for(int i=0; i<kmers.length; i++){ 875 // long kmer=kmers[i]; 876 // long hash=SketchTool.hash(kmer); 877 // assert(hash>=minHashValue); 878 // hash=Long.MAX_VALUE-hash; 879 // kmers[i]=hash; 880 // } 881 // Shared.sort(kmers); 882 // } 883 884 public static void hashArrayToSketchArray(long[] keys){ 885 for(int i=0; i<keys.length; i++){ 886 long hash=keys[i]; 887 assert(hash>=minHashValue); 888 hash=Long.MAX_VALUE-hash; 889 keys[i]=hash; 890 } 891 Shared.sort(keys); 892 } 893 894 public static final long genomeSizeEstimate(long min, int length) { 895 if(length==0){return 0;} 896 double est=((double)Long.MAX_VALUE)*2*length/(Tools.max(min, 1)); 897 // if(k2>0){est*=0.5;} //This is necessary if the hash function uses max, but not random. 898 // System.err.println("max="+Long.MAX_VALUE+", min="+min+", len="+length+", est="+(long)est); 899 // new Exception().printStackTrace(System.err); 900 return (long)Math.ceil(est); 901 } 902 903 public static final int toSketchSize(long genomeSizeBases, long genomeSizeKmers, long genomeSizeEstimate, int maxSketchSize){ 904 // assert(false) : genomeSizeBases+", "+genomeSizeKmers+", "+genomeSizeEstimate+", "+maxSketchSize+", "+useSizeEstimate; 905 if(genomeSizeEstimate>0 && useSizeEstimate){ 906 return toSketchSizeKmers(genomeSizeEstimate, maxSketchSize); 907 } 908 if(genomeSizeKmers>0){ 909 return toSketchSizeKmers(genomeSizeKmers, maxSketchSize); 910 } 911 assert(genomeSizeBases>0) : "BBSketch does not currently support empty files.\n" 912 +genomeSizeBases+", "+genomeSizeKmers+", "+genomeSizeEstimate+", "+maxSketchSize+", "+useSizeEstimate; 913 return toSketchSizeBases(genomeSizeBases, maxSketchSize); 914 } 915 916 private static final int toSketchSizeBases(long genomeSizeBases, int maxSketchSize) { 917 return toSketchSizeKmers(Tools.max(0, genomeSizeBases-k+1), maxSketchSize); 918 } 919 920 private static final int toSketchSizeKmers(long genomeSizeKmers, int maxSketchSize) { 921 // System.err.println(genomeSizeKmers+", "+maxSketchSize); 922 // new Exception().printStackTrace(); 923 if(AUTOSIZE){ 924 if(aminoOrTranslate()){ 925 float linear1=Tools.min(60+0.5f*(float)Math.sqrt(genomeSizeKmers), 926 maxGenomeFractionSmall*genomeSizeKmers*2); 927 float linear2=genomeSizeKmers*maxGenomeFraction*2; 928 float poly=0+1f*(float)Math.sqrt(genomeSizeKmers)+90f*(float)Math.pow(genomeSizeKmers, 0.3); 929 float log=Tools.max(1000, -4000+3500*(float)Math.log(Tools.max(1, genomeSizeKmers))); 930 float min=Tools.min(Tools.max(linear1, linear2), poly, log); 931 assert(min>=Integer.MIN_VALUE && min<=Integer.MAX_VALUE) : min; 932 933 // System.err.println(genomeSizeKmers+" -> "+linear1+", "+linear2+", "+poly+", "+log); 934 935 return (int)Tools.min(genomeSizeKmers*keyFraction2, min*AUTOSIZE_FACTOR); 936 // return (int)Tools.max(1, min*AUTOSIZE_FACTOR); 937 }else{ 938 float linear1=Tools.min(smallSketchSize+0.5f*(float)Math.sqrt(genomeSizeKmers), 939 maxGenomeFractionSmall*genomeSizeKmers-10); 940 float linear2=genomeSizeKmers*maxGenomeFraction; 941 float poly=0+1f*(float)Math.sqrt(genomeSizeKmers)+90f*(float)Math.pow(genomeSizeKmers, 0.3); 942 float log=Tools.max(1000, -4000+3500*(float)Math.log(Tools.max(1, genomeSizeKmers))+8f*(float)Math.pow(genomeSizeKmers, 0.3)); 943 float min=Tools.min(Tools.max(linear1, linear2), poly, log); 944 assert(min>=Integer.MIN_VALUE && min<=Integer.MAX_VALUE) : min; 945 946 // System.err.println(genomeSizeKmers+" -> "+linear1+", "+linear2+", "+poly+", "+log); 947 948 int result=(int)Tools.min(genomeSizeKmers*keyFraction2, min*AUTOSIZE_FACTOR); 949 // System.err.println(result); 950 return result; 951 // return (int)Tools.max(1, min*AUTOSIZE_FACTOR); 952 } 953 }else if(AUTOSIZE_LINEAR){ 954 if(aminoOrTranslate()){ 955 return (int)(Tools.min(10000000, Tools.min(maxGenomeFractionSmall, AUTOSIZE_LINEAR_DENSITY)*genomeSizeKmers)); 956 }else{ 957 return (int)(Tools.min(10000000, Tools.min(maxGenomeFractionSmall, AUTOSIZE_LINEAR_DENSITY)*genomeSizeKmers)); 958 } 959 }else{ 960 if(aminoOrTranslate()){//kmers are roughly 3x smaller... 961 return Tools.min((int)(2+3*maxGenomeFraction*genomeSizeKmers), maxSketchSize); 962 }else{ 963 return Tools.min((int)(2+maxGenomeFraction*genomeSizeKmers), maxSketchSize); 964 } 965 } 966 } 967 968 // static final long toValue2(long kmer, long rkmer){ 969 // assert(k2>0 && k2<k) : "k="+k+","+k2; 970 // 971 // final long value; 972 // if(rcomp){ 973 // assert(!amino && !translate); 974 //// if(!rcomp){return kmer;} 975 // long kmer2=((kmer&k2midmask)>>>k2shift); 976 // long rkmer2=((rkmer&k2midmask)>>>k2shift); 977 // 978 //// assert(kmer2>=0); 979 //// assert(kmer<0 || kmer2<=kmer); 980 //// assert(rkmer<0 || rkmer2<=kmer); 981 // 982 //// assert(false) : "\n"+Long.toBinaryString(kmer)+ 983 //// "\n"+Long.toBinaryString(rkmer)+ 984 //// "\n"+Long.toBinaryString(kmer2)+ 985 //// "\n"+Long.toBinaryString(rkmer2); 986 // 987 // //TODO: Slow 988 //// assert(kmer==AminoAcid.reverseComplementBinaryFast(rkmer, k)) : 989 //// "\n"+AminoAcid.kmerToString(kmer, k)+"\n"+AminoAcid.kmerToString(rkmer, k)+"\n"+AminoAcid.kmerToString(AminoAcid.reverseComplementBinaryFast(rkmer, k), k)+"\n"; 990 //// assert(kmer2==AminoAcid.reverseComplementBinaryFast(rkmer2, k2)); 991 // long max2=Tools.max(kmer2, rkmer2); 992 //// long cmasked=max2&kmerChoiceMask; 993 //// boolean longer=kmerChoiceBitset.get((int)cmasked); 994 // boolean longer=((max2%4999L)&1L)==0L; 995 // value=longer ? Tools.max(kmer, rkmer) : max2; 996 //// System.err.println("\n"+longer+", "+max2+", "+(max2%4999L)+", "+((max2%4999L)&1L)+ 997 //// ", "+Long.toHexString(kmer)+", "+Long.toHexString(max2)+", "+Long.toHexString(value)); 998 //// long value=longer ? (kmer2>rkmer2 ? kmer : rkmer) : max2; 999 //// System.err.println("\n"+AminoAcid.kmerToString(kmer, k)+"\n"+AminoAcid.kmerToString(rkmer, k)+ 1000 //// "\n"+AminoAcid.kmerToString(kmer2, k2)+"\n"+AminoAcid.kmerToString(rkmer2, k2)+"\n"+AminoAcid.kmerToString(value, k)+"\n"); 1001 //// assert(kmer2==AminoAcid.reverseComplementBinaryFast(rkmer2, k2));) 1002 // }else{ 1003 // assert(amino || translate); 1004 //// long kmer2=(kmer&k2mask); 1005 // long kmer2=((kmer&k2midmask)>>>k2shift); 1006 // boolean longer=((kmer2%4999)&1)==0; 1007 // value=longer ? kmer : kmer2; 1008 // } 1009 // return value; 1010 // } 1011 1012 private static final long toValue(long kmer, long rkmer){ 1013 // assert(toValue2(kmer, rkmer)==toValue2(rkmer, kmer)); 1014 assert(!useToValue2); 1015 // if(useToValue2){return toValue2(kmer, rkmer);} 1016 long value=(rcomp ? Tools.max(kmer, rkmer) : kmer); 1017 return value; 1018 } 1019 1020 /*--------------------------------------------------------------*/ 1021 /*---------------- Constants ----------------*/ 1022 /*--------------------------------------------------------------*/ 1023 1024 public static final int RAW=0, HEX=1, A48=2; 1025 public static final char[] codingArray={'R', 'H', 'A'}; 1026 1027 static final byte[] hexTable=new byte[128]; 1028 static { 1029 Arrays.fill(hexTable, (byte)-1); 1030 for(int i='0'; i<='9'; i++){ 1031 hexTable[i]=(byte)(i-'0'); 1032 } 1033 for(int i='A'; i<='F'; i++){ 1034 hexTable[i]=hexTable[i+'a'-'A']=(byte)(i-'A'+10); 1035 } 1036 hexTable['x']=hexTable['X']=hexTable['-']=hexTable['+']=0; 1037 } 1038 1039 public static final int ONE_SKETCH=1, PER_SEQUENCE=2, PER_TAXA=3, PER_IMG=4/*, PER_HEADER=5*/, PER_FILE=6; 1040 1041 /*--------------------------------------------------------------*/ 1042 /*---------------- Default Locations ----------------*/ 1043 /*--------------------------------------------------------------*/ 1044 1045 // public static ArrayList<String> IMG_PATH(){return makePaths(IMG_PATH, 31);} 1046 // public static ArrayList<String> NT_PATH(){return makePaths(NT_PATH, 31);} 1047 // public static ArrayList<String> NR_PATH(){throw new RuntimeException("NR is not currently available.");} 1048 // public static ArrayList<String> REFSEQ_PATH(){return makePaths(REFSEQ_PATH, 31);} 1049 // public static ArrayList<String> SILVA_PATH(){return makePaths(SILVA_PATH, 31);} 1050 1051 private static ArrayList<String> makePaths(String pattern, int files){ 1052 ArrayList<String> list=new ArrayList<String>(files); 1053 for(int i=0; i<files; i++){ 1054 list.add(pattern.replace("#", ""+i)); 1055 } 1056 return list; 1057 } 1058 1059 private static final String IMG_PATH="/global/projectb/sandbox/gaag/bbtools/img/current/img#.sketch"; 1060 private static final String NT_PATH="/global/projectb/sandbox/gaag/bbtools/nt/current/taxa#.sketch"; 1061 private static final String NR_PATH="/global/projectb/sandbox/gaag/bbtools/nr/current/taxa#.sketch"; 1062 private static final String REFSEQ_PATH="/global/projectb/sandbox/gaag/bbtools/refseq/current/taxa#.sketch"; 1063 private static final String REFSEQ_PATH_BIG="/global/projectb/sandbox/gaag/bbtools/refseq/current/big#.sketch"; 1064 private static final String SILVA_PATH="/global/projectb/sandbox/gaag/bbtools/silva/latest/both_taxa#.sketch"; 1065 private static final String PROKPROT_PATH="/global/projectb/sandbox/gaag/bbtools/refseq/current/prot/taxa#.sketch"; 1066 private static final String PROKPROT_PATH_BIG="/global/projectb/sandbox/gaag/bbtools/refseq/current/prot/big#.sketch"; 1067 private static final String MITO_PATH="/global/projectb/sandbox/gaag/bbtools/mito2/taxa#.sketch"; 1068 private static final String FUNGI_PATH="/global/projectb/sandbox/gaag/bbtools/mito2/fungi#.sketch"; 1069 1070 private static final String IMG_PATH_AWS=null; 1071 private static final String NT_PATH_AWS="/test1/sketch/latest/nt/taxa#.sketch"; 1072 private static final String NR_PATH_AWS=null; 1073 private static final String REFSEQ_PATH_AWS="/test1/sketch/latest/refseq/taxa#.sketch"; 1074 private static final String SILVA_PATH_AWS="/test1/sketch/latest/ribo/both_taxa#.sketch"; 1075 private static final String PROKPROT_PATH_AWS="/test1/sketch/latest/protein/taxa#.sketch"; 1076 private static final String MITO_PATH_AWS=null; 1077 private static final String FUNGI_PATH_AWS=null; 1078 1079 public static final String IMG_PATH(){return Shared.AWS ? IMG_PATH_AWS : IMG_PATH;} 1080 public static final String NT_PATH(){return Shared.AWS ? NT_PATH_AWS : NT_PATH;} 1081 public static final String NR_PATH(){return Shared.AWS ? NR_PATH_AWS : NR_PATH;} 1082 public static final String REFSEQ_PATH(){return Shared.AWS ? REFSEQ_PATH_AWS : REFSEQ_PATH;} 1083 public static final String REFSEQ_PATH_BIG(){return REFSEQ_PATH_BIG;} 1084 public static final String SILVA_PATH(){return Shared.AWS ? SILVA_PATH_AWS : SILVA_PATH;} 1085 public static final String PROKPROT_PATH(){return Shared.AWS ? PROKPROT_PATH_AWS : PROKPROT_PATH;} 1086 public static final String PROKPROT_PATH_BIG(){return PROKPROT_PATH_BIG;} 1087 public static final String MITO_PATH(){return Shared.AWS ? MITO_PATH_AWS : MITO_PATH;} 1088 public static final String FUNGI_PATH(){return Shared.AWS ? FUNGI_PATH_AWS : FUNGI_PATH;} 1089 1090 /*--------------------------------------------------------------*/ 1091 /*---------------- Getters ----------------*/ 1092 /*--------------------------------------------------------------*/ 1093 1094 public static boolean useWhitelist() {return useWhitelist;} 1095 public static String blacklist() {return blacklist;} 1096 1097 /*--------------------------------------------------------------*/ 1098 /*---------------- Static Fields ----------------*/ 1099 /*--------------------------------------------------------------*/ 1100 1101 public static int CODING=A48; 1102 public static boolean deltaOut=true; 1103 public static boolean rcomp=true; 1104 1105 public static final int defaultK=32; 1106 public static final int defaultK2=24; 1107 public static final int defaultKAmino=12; 1108 public static final int defaultK2Amino=9; 1109 public static int k=defaultK; 1110 public static int k2=defaultK2; 1111 public static int entropyK=3; 1112 public static boolean setK=false; 1113 public static boolean amino=false; 1114 public static boolean amino8=false; 1115 public static boolean translate=false; 1116 public static boolean sixframes=false; 1117 public static boolean processSSU=true; 1118 public static int min_SSU_len=1000; 1119 public static int HASH_VERSION=2; 1120 public static String pgmFile=null; 1121 public static GeneModel pgm=null; 1122 1123 static boolean aminoOrTranslate(){return amino | translate;} 1124 1125 private static int bitsPerCycle=8; //Optimal speed for K=31 is 9bpc (15% faster than 8). 9, 10, and 11 are similar. 1126 private static long cycleMask=~((-1L)<<bitsPerCycle); 1127 private static final long codeOrMask=1L; 1128 private static final long codeAndMask=~1L; 1129 private static int maxCycles=(64+bitsPerCycle-1)/bitsPerCycle; 1130 private static int codeIncrement=(int)(cycleMask+1); 1131 private static int codeMax; 1132 private static int codeMax2; 1133 1134 private static long hashSeed=12345; 1135 private static long[][] codes;//=makeCodes(maxCycles, codeIncrement, hashSeed, false); 1136 private static long[] codes1D;//=makeCodes1D(codes); 1137 static boolean useToValue2=true; 1138 1139 //These are set in postParse() 1140 private static int bitsPerBase=2; 1141 private static int basesPerCycle=bitsPerCycle/bitsPerBase; 1142 private static int hashCycles=64/bitsPerCycle; //Note: Needs to be variable based on k to make k2 codes compatible with k codes 1143 private static int hashCycles2; 1144 private static long k2mask; 1145 private static long k2submask; 1146 //Difference in length of k and k2, in bits 1147 private static int k2shift; 1148 private static long k2midmask; 1149 1150 /** 1151 * Make the SketchHeap size this factor bigger, 1152 * when minKeyOccuranceCount is used 1153 */ 1154 public static double sketchHeapFactor=8; 1155 // static int minKeyOccuranceCount=1; 1156 1157 public static int minSketchSize=3; 1158 public static int targetSketchSize=10000; 1159 public static boolean AUTOSIZE=true; 1160 public static float AUTOSIZE_FACTOR=1; 1161 public static boolean SET_AUTOSIZE_FACTOR=false; 1162 public static boolean SET_AUTOSIZE=false; 1163 public static boolean SET_TARGET_SIZE=false; 1164 public static boolean AUTOSIZE_LINEAR=false; 1165 public static double AUTOSIZE_LINEAR_DENSITY=0.001; 1166 public static float maxGenomeFraction=0.04f;//Was 0.015, but that is often too sparse 1167 public static float maxGenomeFractionSmall=0.10f; 1168 public static int smallSketchSize=150; 1169 public static boolean makeIndex=true; 1170 public static float prealloc=0; 1171 public static boolean allToAll=false; 1172 public static boolean compareSelf=false; 1173 public static boolean skipCompare=false; 1174 public static final int bitSetBits=2; //Needs to be 2 for unique counts. 1175 1176 private static double keyFraction=0.16; 1177 private static double keyFraction2=keyFraction*1.2; 1178 public static long minHashValue=setMinHashValue(); 1179 public static double keyFraction(){return keyFraction;} 1180 public static void setKeyFraction(double d){ 1181 assert(d>0); 1182 keyFraction=Tools.mid(0.0001, d, 0.5); 1183 keyFraction2=Tools.mid(0.0001, keyFraction*1.2, 0.5); 1184 } 1185 public static long setMinHashValue(){ 1186 double mult=1-2*keyFraction; 1187 minHashValue=(long)(mult*Long.MAX_VALUE); 1188 assert(minHashValue>=0 && minHashValue<Long.MAX_VALUE); 1189 return minHashValue; 1190 } 1191 1192 public static int minFakeID=1900000000; 1193 static boolean hashNames=false; 1194 static boolean skipNonCanonical=true; 1195 static boolean useSizeEstimate=true; 1196 public static boolean allowMultithreadedFastq=false; 1197 static boolean forceDisableMultithreadedFastq=false; 1198 1199 static boolean preferSSUMap=false; 1200 static boolean preferSSUMapForEuks=true; 1201 static boolean useSSUMapOnly=false; 1202 static boolean useSSUMapOnlyForEuks=false; 1203 static boolean ban18SForProks=true; 1204 static boolean ban16SForEuks=true; 1205 1206 static long sampleseed=-1L; 1207 1208 public static TaxTree taxtree=null; 1209 private static String treefile=null; 1210 static String blacklist=null; 1211 static boolean useWhitelist=false; 1212 private static boolean postparsed=false; 1213 public static boolean KILL_OK=false; 1214 public static boolean EXACT_ANI=true; 1215 public static boolean useSSA=true; 1216 public static boolean useSSA3=false; 1217 1218 //Needs to be last due to dependencies. 1219 public static DisplayParams defaultParams=new DisplayParams(); 1220 1221 static AlignmentThreadPool alignerPool=null; 1222 1223 public static boolean verbose2=false; 1224 public static boolean LOADER2=true; 1225 1226 } 1227