1 package stream; 2 3 import java.io.Serializable; 4 import java.util.ArrayList; 5 import java.util.Arrays; 6 import java.util.HashSet; 7 8 import align2.GapTools; 9 import align2.QualityTools; 10 import dna.AminoAcid; 11 import dna.ChromosomeArray; 12 import dna.Data; 13 import shared.KillSwitch; 14 import shared.Shared; 15 import shared.Tools; 16 import shared.TrimRead; 17 import structures.ByteBuilder; 18 import ukmer.Kmer; 19 20 public final class Read implements Comparable<Read>, Cloneable, Serializable{ 21 22 /** 23 * 24 */ 25 private static final long serialVersionUID = -1026645233407290096L; 26 main(String[] args)27 public static void main(String[] args){ 28 byte[] a=args[0].getBytes(); 29 System.out.println(new String(a)); 30 byte[] b=toShortMatchString(a); 31 System.out.println(new String(b)); 32 byte[] c=toLongMatchString(b); 33 System.out.println(new String(c)); 34 byte[] d=toLongMatchString(c); 35 System.out.println(new String(d)); 36 // byte[] e=toShortMatchString(b); 37 // System.out.println(new String(e)); 38 39 } 40 Read(byte[] bases_, byte[] quals_, long id_)41 public Read(byte[] bases_, byte[] quals_, long id_){ 42 this(bases_, quals_, Long.toString(id_), id_); 43 } 44 Read(byte[] bases_, byte[] quals_, String name_, long id_)45 public Read(byte[] bases_, byte[] quals_, String name_, long id_){ 46 this(bases_, quals_, name_, id_, 0, -1, -1, -1); 47 } 48 Read(byte[] bases_, byte[] quals_, String name_, long id_, int flag_)49 public Read(byte[] bases_, byte[] quals_, String name_, long id_, int flag_){ 50 this(bases_, quals_, name_, id_, flag_, -1, -1, -1); 51 } 52 Read(byte[] s_, byte[] quals_, long id_, int chrom_, int start_, int stop_, byte strand_)53 public Read(byte[] s_, byte[] quals_, long id_, int chrom_, int start_, int stop_, byte strand_){ 54 this(s_, quals_, Long.toString(id_), id_, (int)strand_, chrom_, start_, stop_); 55 } 56 57 // public Read(byte[] bases_, byte[] quals_, String id_, long numericID_, byte strand_, int chrom_, int start_, int stop_){ 58 // this(bases_, quals_, id_, numericID_, (int)strand_, chrom_, start_, stop_); 59 // assert(strand_==0 || strand_==1); 60 // assert(start_<=stop_) : chrom_+", "+start_+", "+stop_+", "+numericID_; 61 // } 62 63 /** Note that strand can be used as flag */ Read(byte[] bases_, byte[] quals_, String id_, long numericID_, int flags_, int chrom_, int start_, int stop_)64 public Read(byte[] bases_, byte[] quals_, String id_, long numericID_, int flags_, int chrom_, int start_, int stop_){ 65 flags=flags_&~VALIDATEDMASK; 66 bases=bases_; 67 quality=quals_; 68 69 chrom=chrom_; 70 start=start_; 71 stop=stop_; 72 73 id=id_; 74 numericID=numericID_; 75 76 // assert(amino()) : Shared.AMINO_IN;//123 77 if(VALIDATE_IN_CONSTRUCTOR){validate(true);} 78 } 79 80 /*--------------------------------------------------------------*/ 81 /*---------------- Validation ----------------*/ 82 /*--------------------------------------------------------------*/ 83 validate(final boolean processAssertions)84 public boolean validate(final boolean processAssertions){ 85 assert(!validated()); 86 87 // if(false){//This causes problems with error-corrected PacBio reads. 88 // boolean x=(quality==null || quality.length<1 || quality[0]<=80 || !FASTQ.DETECT_QUALITY || FASTQ.IGNORE_BAD_QUALITY); 89 // if(!x){ 90 // if(processAssertions){ 91 // KillSwitch.kill("Quality value ("+quality[0]+") appears too high.\n"+Arrays.toString(quality)+ 92 // "\n"+Arrays.toString(bases)+"\n"+numericID+"\n"+id+"\n"+FASTQ.ASCII_OFFSET); 93 // } 94 // return false; 95 // } 96 // } 97 98 if(bases==null){ 99 quality=null; //I could require this to be true 100 if(FIX_HEADER){fixHeader(processAssertions);} 101 setValidated(true); 102 return true; 103 } 104 105 validateQualityLength(processAssertions); 106 107 final boolean passesJunk; 108 109 // assert(false) : SKIP_SLOW_VALIDATION+","+VALIDATE_BRANCHLESS+","+JUNK_MODE; 110 if(SKIP_SLOW_VALIDATION){ 111 passesJunk=true; 112 }else if(!aminoacid()){ 113 if(U_TO_T){uToT();} 114 if(VALIDATE_BRANCHLESS){ 115 passesJunk=validateCommonCase_branchless(processAssertions); 116 }else{ 117 passesJunk=validateCommonCase(processAssertions); 118 } 119 }else{ 120 // if(U_TO_T){uToT();} This is amino, so... 121 fixCase(); 122 passesJunk=validateJunk(processAssertions); 123 if(CHANGE_QUALITY){fixQuality();} 124 } 125 126 if(FIX_HEADER){fixHeader(processAssertions);} 127 128 setValidated(true); 129 130 return true; 131 } 132 133 134 checkQuality()135 public boolean checkQuality(){ 136 if(quality==null){return true;} 137 for(int i=0; quality!=null && i<bases.length; i++){ 138 if((bases[i]=='N')!=(quality[i]<1)){ 139 assert(false) : (char)bases[i]+", "+quality[i]+", "+i+", "+CHANGE_QUALITY+", "+MIN_CALLED_QUALITY+"\n"+toFastq(); 140 return false; 141 } 142 } 143 return true; 144 } 145 uToT()146 private void uToT(){ 147 if(bases==null){return;} 148 for(int i=0; i<bases.length; i++){ 149 bases[i]=AminoAcid.uToT[bases[i]]; 150 } 151 } 152 tToU()153 private void tToU(){ 154 if(bases==null){return;} 155 for(int i=0; i<bases.length; i++){ 156 bases[i]=AminoAcid.tToU[bases[i]]; 157 } 158 } 159 validateJunk(boolean processAssertions)160 private boolean validateJunk(boolean processAssertions){ 161 assert(bases!=null); 162 if(JUNK_MODE==IGNORE_JUNK){return true;} 163 final byte nocall; 164 final byte[] toNum; 165 final boolean aa=aminoacid(); 166 if(aa){ 167 nocall='X'; 168 toNum=AminoAcid.acidToNumberExtended; 169 }else{ 170 nocall='N'; 171 toNum=AminoAcid.baseToNumberExtended; 172 } 173 for(int i=0; i<bases.length; i++){ 174 byte b=bases[i]; 175 int num=toNum[b]; 176 // System.err.println(Character.toString(b)+" -> "+num); 177 if(num<0){ 178 if(JUNK_MODE==FIX_JUNK){ 179 bases[i]=nocall; 180 }else if(JUNK_MODE==FLAG_JUNK){ 181 setJunk(true); 182 return false; 183 }else{ 184 if(processAssertions){ 185 KillSwitch.kill("\nAn input file appears to be misformatted:\n" 186 + "The character with ASCII code "+b+" appeared where "+(aa ? "an amino acid" : "a base")+" was expected" 187 + (b>31 && b<127 ? ": '"+Character.toString((char)b)+"'\n" : ".\n") 188 + "Sequence #"+numericID+"\n" 189 + "Sequence ID '"+id+"'\n" 190 + "Sequence: "+Tools.toStringSafe(bases)+"\n" 191 + "Flags: "+Long.toBinaryString(flags)+"\n\n" 192 + "This can be bypassed with the flag 'tossjunk', 'fixjunk', or 'ignorejunk'"); 193 } 194 setJunk(true); 195 return false; 196 } 197 } 198 } 199 return true; 200 } 201 validateQualityLength(boolean processAssertions)202 private void validateQualityLength(boolean processAssertions){ 203 if(quality==null || quality.length==bases.length){return;} 204 if(TOSS_BROKEN_QUALITY){ 205 quality=null; 206 setDiscarded(true); 207 setJunk(true); 208 }else if(NULLIFY_BROKEN_QUALITY){ 209 quality=null; 210 setJunk(true); 211 }else if(FLAG_BROKEN_QUALITY){ 212 setJunk(true); 213 }else{ 214 boolean x=false; 215 assert(x=processAssertions); 216 if(x){ 217 KillSwitch.kill("\nMismatch between length of bases and qualities for read "+numericID+" (id="+id+").\n"+ 218 "# qualities="+quality.length+", # bases="+bases.length+"\n\n"+ 219 FASTQ.qualToString(quality)+"\n"+new String(bases)+"\n\n" 220 + "This can be bypassed with the flag 'tossbrokenreads' or 'nullifybrokenquality'"); 221 } 222 } 223 } 224 fixQuality()225 private void fixQuality(){ 226 if(quality==null || !CHANGE_QUALITY){return;} 227 228 final byte[] toNumber=aminoacid() ? AminoAcid.acidToNumber : AminoAcid.baseToNumber; 229 230 for(int i=0; i<quality.length; i++){ 231 byte b=bases[i]; 232 byte q=quality[i]; 233 quality[i]=capQuality(q, b); 234 // if(toNumber[b]>=0){ 235 // if(q<MIN_CALLED_QUALITY){ 236 // quality[i]=MIN_CALLED_QUALITY; 237 // }else if(q>MAX_CALLED_QUALITY){ 238 // quality[i]=MAX_CALLED_QUALITY; 239 // } 240 // }else{ 241 // quality[i]=0; 242 // } 243 } 244 } 245 fixCase()246 private void fixCase(){ 247 if(bases==null || (!DOT_DASH_X_TO_N && !TO_UPPER_CASE && !LOWER_CASE_TO_N)){return;} 248 final boolean aa=aminoacid(); 249 250 final byte[] caseMap, ddxMap; 251 if(!aa){ 252 caseMap=TO_UPPER_CASE ? AminoAcid.toUpperCase : LOWER_CASE_TO_N ? AminoAcid.lowerCaseToNocall : null; 253 ddxMap=DOT_DASH_X_TO_N ? AminoAcid.dotDashXToNocall : null; 254 }else{ 255 caseMap=TO_UPPER_CASE ? AminoAcid.toUpperCase : LOWER_CASE_TO_N ? AminoAcid.lowerCaseToNocallAA : null; 256 ddxMap=DOT_DASH_X_TO_N ? AminoAcid.dotDashXToNocallAA : null; 257 } 258 259 // assert(false) : (AminoAcid.toUpperCase==caseMap)+", "+ddxMap; 260 261 if(DOT_DASH_X_TO_N){ 262 if(TO_UPPER_CASE || LOWER_CASE_TO_N){ 263 for(int i=0; i<bases.length; i++){ 264 byte b=bases[i]; 265 bases[i]=caseMap[ddxMap[b]]; 266 } 267 }else{ 268 for(int i=0; i<bases.length; i++){ 269 byte b=bases[i]; 270 bases[i]=ddxMap[b]; 271 } 272 } 273 }else{ 274 if(TO_UPPER_CASE || LOWER_CASE_TO_N){ 275 for(int i=0; i<bases.length; i++){ 276 byte b=bases[i]; 277 bases[i]=caseMap[b]; 278 } 279 }else{ 280 assert(false); 281 } 282 } 283 } 284 validateCommonCase_branchless(boolean processAssertions)285 private boolean validateCommonCase_branchless(boolean processAssertions){ 286 287 assert(!aminoacid()); 288 assert(bases!=null); 289 290 if(TO_UPPER_CASE || LOWER_CASE_TO_N){fixCase();} 291 292 final byte nocall='N'; 293 final byte[] toNum=AminoAcid.baseToNumber; 294 final byte[] map=(DOT_DASH_X_TO_N && IUPAC_TO_N ? AminoAcid.baseToACGTN : 295 DOT_DASH_X_TO_N ? AminoAcid.dotDashXToNocall : IUPAC_TO_N ? AminoAcid.iupacToNocall : null); 296 297 if(JUNK_MODE==IGNORE_JUNK){ 298 if(quality!=null && CHANGE_QUALITY){ 299 if(DOT_DASH_X_TO_N || IUPAC_TO_N){ 300 for(int i=0; i<bases.length; i++){ 301 final byte b=map[bases[i]]; 302 final byte q=quality[i]; 303 final int num=toNum[b]; 304 bases[i]=b; 305 quality[i]=(num>=0 ? qMap[q] : 0); 306 } 307 }else{ 308 for(int i=0; i<bases.length; i++){ 309 final byte b=bases[i]; 310 final byte q=quality[i]; 311 final int num=toNum[b]; 312 quality[i]=(num>=0 ? qMap[q] : 0); 313 } 314 } 315 }else if(DOT_DASH_X_TO_N || IUPAC_TO_N){ 316 for(int i=0; i<bases.length; i++){ 317 byte b=map[bases[i]]; 318 bases[i]=b; 319 } 320 } 321 return true; 322 } 323 324 int junkOr=0; 325 // int iupacOr=0; 326 final byte[] toNumE=AminoAcid.baseToNumberExtended; 327 328 // asdf 329 330 if(DOT_DASH_X_TO_N || IUPAC_TO_N){ 331 if(quality!=null && CHANGE_QUALITY){ 332 for(int i=0; i<bases.length; i++){ 333 final byte b=map[bases[i]]; 334 final byte q=quality[i]; 335 final int numE=toNumE[b]; 336 final int num=toNum[b]; 337 junkOr|=numE; 338 // iupacOr|=num; 339 bases[i]=b; 340 quality[i]=(num>=0 ? qMap[q] : 0); 341 } 342 }else{ 343 for(int i=0; i<bases.length; i++){ 344 final byte b=map[bases[i]]; 345 final int numE=toNumE[b]; 346 // final int num=toNum[b]; 347 junkOr|=numE; 348 // iupacOr|=num; 349 bases[i]=b; 350 } 351 } 352 }else{ 353 if(quality!=null && CHANGE_QUALITY){ 354 for(int i=0; i<bases.length; i++){ 355 final byte b=bases[i]; 356 final byte q=quality[i]; 357 final int numE=toNumE[b]; 358 final int num=toNum[b]; 359 junkOr|=numE; 360 // iupacOr|=num; 361 quality[i]=(num>=0 ? qMap[q] : 0); 362 } 363 }else{ 364 for(int i=0; i<bases.length; i++){ 365 final byte b=bases[i]; 366 final int numE=toNumE[b]; 367 // final int num=toNum[b]; 368 junkOr|=numE; 369 // iupacOr|=num; 370 } 371 } 372 } 373 374 // System.err.println(junkOr+", "+JUNK_MODE); 375 376 //Common case 377 if(junkOr>=0){return true;} 378 // if(junkOr>=0 && (JUNK_MODE!=FIX_JUNK_AND_IUPAC || iupacOr>=0)){return true;} 379 // 380 // assert(junkOr<0 || (JUNK_MODE==FIX_JUNK_AND_IUPAC && iupacOr<0)); 381 382 //TODO: I could disable VALIDATE_BRANCHLESS here, if it's not final 383 //VALIDATE_BRANCHLESS=false; 384 if(JUNK_MODE==FIX_JUNK){ 385 for(int i=0; i<bases.length; i++){ 386 byte b=bases[i]; 387 final int numE=toNumE[b]; 388 389 if(numE<0){ 390 bases[i]=nocall; 391 if(quality!=null){quality[i]=0;} 392 } 393 } 394 return true; 395 }else if(JUNK_MODE==FLAG_JUNK){ 396 setJunk(true); 397 return false; 398 }else if(JUNK_MODE==FIX_JUNK_AND_IUPAC){ 399 for(int i=0; i<bases.length; i++){ 400 byte b=bases[i]; 401 final byte c=AminoAcid.baseToACGTN[b]; 402 bases[i]=c; 403 } 404 return true; 405 }else{ 406 if(processAssertions){ 407 int i=0; 408 for(; i<bases.length; i++){ 409 if(toNumE[bases[i]]<0){break;} 410 } 411 byte b=bases[i]; 412 KillSwitch.kill("\nAn input file appears to be misformatted:\n" 413 + "The character with ASCII code "+b+" appeared where a base was expected" 414 + (b>31 && b<127 ? ": '"+Character.toString((char)b)+"'\n" : ".\n") 415 + "Sequence #"+numericID+"\n" 416 + "Sequence ID: '"+id+"'\n" 417 + "Sequence: '"+Tools.toStringSafe(bases)+"'\n\n" 418 + "This can be bypassed with the flag 'tossjunk', 'fixjunk', or 'ignorejunk'"); 419 } 420 setJunk(true); 421 return false; 422 } 423 } 424 validateCommonCase(boolean processAssertions)425 private boolean validateCommonCase(boolean processAssertions){ 426 427 assert(!aminoacid()); 428 assert(bases!=null); 429 430 if(TO_UPPER_CASE || LOWER_CASE_TO_N){fixCase();} 431 432 final byte nocall='N'; 433 final byte[] toNum=AminoAcid.baseToNumber; 434 final byte[] ddxMap=AminoAcid.dotDashXToNocall; 435 436 if(JUNK_MODE==IGNORE_JUNK){ 437 if(quality!=null && CHANGE_QUALITY){ 438 if(DOT_DASH_X_TO_N){ 439 for(int i=0; i<bases.length; i++){ 440 final byte b=ddxMap[bases[i]]; 441 final byte q=quality[i]; 442 final int num=toNum[b]; 443 bases[i]=b; 444 quality[i]=(num>=0 ? qMap[q] : 0); 445 } 446 }else{ 447 for(int i=0; i<bases.length; i++){ 448 final byte b=bases[i]; 449 final byte q=quality[i]; 450 final int num=toNum[b]; 451 quality[i]=(num>=0 ? qMap[q] : 0); 452 } 453 } 454 }else if(DOT_DASH_X_TO_N){ 455 for(int i=0; i<bases.length; i++){ 456 byte b=ddxMap[bases[i]]; 457 bases[i]=b; 458 } 459 } 460 }else if(DOT_DASH_X_TO_N){ 461 final byte[] toNumE=AminoAcid.baseToNumberExtended; 462 if(quality!=null && CHANGE_QUALITY){ 463 for(int i=0; i<bases.length; i++){ 464 byte b=ddxMap[bases[i]]; 465 final byte q=quality[i]; 466 final int numE=toNumE[b]; 467 468 if(numE<0){ 469 if(JUNK_MODE==FIX_JUNK){ 470 b=nocall; 471 }else if(JUNK_MODE==FLAG_JUNK){ 472 setJunk(true); 473 return false; 474 }else{ 475 if(processAssertions){ 476 KillSwitch.kill("\nAn input file appears to be misformatted:\n" 477 + "The character with ASCII code "+bases[1]+" appeared where a base was expected.\n" 478 + "Sequence #"+numericID+"\n" 479 + "Sequence ID '"+id+"'\n" 480 + "Sequence: "+Tools.toStringSafe(bases)+"\n\n" 481 + "This can be bypassed with the flag 'tossjunk', 'fixjunk', or 'ignorejunk'"); 482 } 483 setJunk(true); 484 return false; 485 } 486 } 487 488 final int num=toNum[b]; 489 bases[i]=b; 490 quality[i]=(num>=0 ? qMap[q] : 0); 491 } 492 }else{ 493 for(int i=0; i<bases.length; i++){ 494 byte b=ddxMap[bases[i]]; 495 final int numE=toNumE[b]; 496 497 if(numE<0){ 498 if(JUNK_MODE==FIX_JUNK){ 499 b=nocall; 500 }else if(JUNK_MODE==FLAG_JUNK){ 501 setJunk(true); 502 return false; 503 }else{ 504 if(processAssertions){ 505 KillSwitch.kill("\nAn input file appears to be misformatted:\n" 506 + "The character with ASCII code "+bases[1]+" appeared where a base was expected.\n" 507 + "Sequence #"+numericID+"\n" 508 + "Sequence ID '"+id+"'\n" 509 + "Sequence: "+Tools.toStringSafe(bases)+"\n\n" 510 + "This can be bypassed with the flag 'tossjunk', 'fixjunk', or 'ignorejunk'"); 511 } 512 setJunk(true); 513 return false; 514 } 515 } 516 517 bases[i]=b; 518 } 519 } 520 }else{ 521 final byte[] toNumE=AminoAcid.baseToNumberExtended; 522 if(quality!=null && CHANGE_QUALITY){ 523 for(int i=0; i<bases.length; i++){ 524 byte b=bases[i]; 525 final byte q=quality[i]; 526 final int numE=toNumE[b]; 527 528 if(numE<0){ 529 if(JUNK_MODE==FIX_JUNK){ 530 bases[i]=b=nocall; 531 }else if(JUNK_MODE==FLAG_JUNK){ 532 setJunk(true); 533 return false; 534 }else{ 535 if(processAssertions){ 536 KillSwitch.kill("\nAn input file appears to be misformatted:\n" 537 + "The character with ASCII code "+bases[1]+" appeared where a base was expected.\n" 538 + "Sequence #"+numericID+"\n" 539 + "Sequence ID '"+id+"'\n" 540 + "Sequence: "+Tools.toStringSafe(bases)+"\n\n" 541 + "This can be bypassed with the flag 'tossjunk', 'fixjunk', or 'ignorejunk'"); 542 } 543 setJunk(true); 544 return false; 545 } 546 } 547 548 final int num=toNum[b]; 549 bases[i]=b; 550 quality[i]=(num>=0 ? qMap[q] : 0); 551 } 552 }else{ 553 for(int i=0; i<bases.length; i++){ 554 byte b=bases[i]; 555 final int numE=toNumE[b]; 556 557 if(numE<0){ 558 if(JUNK_MODE==FIX_JUNK){ 559 bases[i]=b=nocall; 560 }else if(JUNK_MODE==FLAG_JUNK){ 561 setJunk(true); 562 return false; 563 }else{ 564 if(processAssertions){ 565 KillSwitch.kill("\nAn input file appears to be misformatted:\n" 566 + "The character with ASCII code "+bases[1]+" appeared where a base was expected.\n" 567 + "Sequence #"+numericID+"\n" 568 + "Sequence ID '"+id+"'\n" 569 + "Sequence: "+Tools.toStringSafe(bases)+"\n\n" 570 + "This can be bypassed with the flag 'tossjunk', 'fixjunk', or 'ignorejunk'"); 571 } 572 setJunk(true); 573 return false; 574 } 575 } 576 } 577 } 578 } 579 return true; 580 } 581 fixHeader(boolean processAssertions)582 private final void fixHeader(boolean processAssertions){ 583 id=Tools.fixHeader(id, ALLOW_NULL_HEADER, processAssertions); 584 } 585 586 /*--------------------------------------------------------------*/ 587 /*---------------- Various ----------------*/ 588 /*--------------------------------------------------------------*/ 589 590 absdif(int a, int b)591 private static final int absdif(int a, int b){ 592 return a>b ? a-b : b-a; 593 } 594 595 /** Returns true if these reads are identical, allowing at most n no-calls and m mismatches of max quality q*/ isDuplicateByBases(Read r, int nmax, int mmax, byte qmax, boolean banSameQualityMismatch)596 public boolean isDuplicateByBases(Read r, int nmax, int mmax, byte qmax, boolean banSameQualityMismatch){ 597 return isDuplicateByBases(r, nmax, mmax, qmax, banSameQualityMismatch, false); 598 } 599 600 601 602 /** Returns true if these reads are identical, allowing at most n no-calls and m mismatches of max quality q*/ isDuplicateByBases(Read r, int nmax, int mmax, byte qmax, boolean banSameQualityMismatch, boolean allowDifferentLength)603 public boolean isDuplicateByBases(Read r, int nmax, int mmax, byte qmax, boolean banSameQualityMismatch, boolean allowDifferentLength){ 604 int n=0, m=0; 605 assert(r.length()==bases.length) : "Merging different-length reads is supported but seems to be not useful."; 606 if(!allowDifferentLength && r.length()!=bases.length){return false;} 607 int minLen=Tools.min(bases.length, r.length()); 608 for(int i=0; i<minLen; i++){ 609 byte b1=bases[i]; 610 byte b2=r.bases[i]; 611 if(b1=='N' || b2=='N'){ 612 n++; 613 if(n>nmax){return false;} 614 }else if(b1!=b2){ 615 m++; 616 if(m>mmax){return false;} 617 if(quality[i]>qmax && r.quality[i]>qmax){return false;} 618 if(banSameQualityMismatch && quality[i]==r.quality[i]){return false;} 619 } 620 } 621 return true; 622 } 623 isDuplicateByMapping(Read r, boolean bothEnds, boolean checkAlignment)624 public boolean isDuplicateByMapping(Read r, boolean bothEnds, boolean checkAlignment){ 625 if(bases.length!=r.length()){ 626 return isDuplicateByMappingDifferentLength(r, bothEnds, checkAlignment); 627 } 628 assert(this!=r && mate!=r); 629 assert(!bothEnds || bases.length==r.length()); 630 if(!mapped() || !r.mapped()){return false;} 631 // if(chrom==-1 && start==-1){return false;} 632 if(chrom<1 && start<1){return false;} 633 634 // if(chrom!=r.chrom || strand()!=r.strand() || start!=r.start){return false;} 635 //// if(mate==null && stop!=r.stop){return false;} //For unpaired reads, require both ends match 636 // if(stop!=r.stop){return false;} //For unpaired reads, require both ends match 637 // return true; 638 639 if(chrom!=r.chrom || strand()!=r.strand()){return false;} 640 if(bothEnds){ 641 if(start!=r.start || stop!=r.stop){return false;} 642 }else{ 643 if(strand()==Shared.PLUS){ 644 if(start!=r.start){return false;} 645 }else{ 646 if(stop!=r.stop){return false;} 647 } 648 } 649 if(checkAlignment){ 650 if(perfect() && r.perfect()){return true;} 651 if(match!=null && r.match!=null){ 652 if(match.length!=r.match.length){return false;} 653 for(int i=0; i<match.length; i++){ 654 byte a=match[i]; 655 byte b=r.match[i]; 656 if(a!=b){ 657 if((a=='D') != (b=='D')){return false;} 658 if((a=='I' || a=='X' || a=='Y') != (b=='I' || b=='X' || b=='Y')){return false;} 659 } 660 } 661 } 662 } 663 return true; 664 } 665 isDuplicateByMappingDifferentLength(Read r, boolean bothEnds, boolean checkAlignment)666 public boolean isDuplicateByMappingDifferentLength(Read r, boolean bothEnds, boolean checkAlignment){ 667 assert(this!=r && mate!=r); 668 assert(bases.length!=r.length()); 669 if(bothEnds){return false;} 670 // assert(!bothEnds || bases.length==r.length()); 671 if(!mapped() || !r.mapped()){return false;} 672 // if(chrom==-1 && start==-1){return false;} 673 if(chrom<1 && start<1){return false;} 674 675 // if(chrom!=r.chrom || strand()!=r.strand() || start!=r.start){return false;} 676 //// if(mate==null && stop!=r.stop){return false;} //For unpaired reads, require both ends match 677 // if(stop!=r.stop){return false;} //For unpaired reads, require both ends match 678 // return true; 679 680 if(chrom!=r.chrom || strand()!=r.strand()){return false;} 681 682 if(strand()==Shared.PLUS){ 683 if(start!=r.start){return false;} 684 }else{ 685 if(stop!=r.stop){return false;} 686 } 687 688 if(checkAlignment){ 689 if(perfect() && r.perfect()){return true;} 690 if(match!=null && r.match!=null){ 691 int minLen=Tools.min(match.length, r.match.length); 692 for(int i=0; i<minLen; i++){ 693 byte a=match[i]; 694 byte b=r.match[i]; 695 if(a!=b){ 696 if((a=='D') != (b=='D')){return false;} 697 if((a=='I' || a=='X' || a=='Y') != (b=='I' || b=='X' || b=='Y')){return false;} 698 } 699 } 700 } 701 } 702 return true; 703 } 704 merge(Read r, boolean mergeVectors, boolean mergeN)705 public void merge(Read r, boolean mergeVectors, boolean mergeN){mergePrivate(r, mergeVectors, mergeN, true);} 706 mergePrivate(Read r, boolean mergeVectors, boolean mergeN, boolean mergeMate)707 private void mergePrivate(Read r, boolean mergeVectors, boolean mergeN, boolean mergeMate){ 708 assert(r!=this); 709 assert(r!=this.mate); 710 assert(r!=r.mate); 711 assert(this!=this.mate); 712 assert(r.mate==null || r.mate.mate==r); 713 assert(this.mate==null || this.mate.mate==this); 714 assert(r.mate==null || r.numericID==r.mate.numericID); 715 assert(mate==null || numericID==mate.numericID); 716 mergeN=(mergeN||mergeVectors); 717 718 assert(r.length()==bases.length) : "Merging different-length reads is supported but seems to be not useful."; 719 720 if((mergeN || mergeVectors) && bases.length<r.length()){ 721 int oldLenB=bases.length; 722 start=Tools.min(start, r.start); 723 stop=Tools.max(stop, r.stop); 724 mapScore=Tools.max(mapScore, r.mapScore); 725 726 bases=KillSwitch.copyOfRange(bases, 0, r.length()); 727 quality=KillSwitch.copyOfRange(quality, 0, r.quality.length); 728 for(int i=oldLenB; i<bases.length; i++){ 729 bases[i]='N'; 730 quality[i]=0; 731 } 732 match=null; 733 r.match=null; 734 } 735 736 copies+=r.copies; 737 738 739 // if(numericID==11063941 || r.numericID==11063941 || numericID==8715632){ 740 // System.err.println("***************"); 741 // System.err.println(this.toText()+"\n"); 742 // System.err.println(r.toText()+"\n"); 743 // System.err.println(mergeVectors+", "+mergeN+", "+mergeMate+"\n"); 744 // } 745 746 boolean pflag1=perfect(); 747 boolean pflag2=r.perfect(); 748 749 final int minLenB=Tools.min(bases.length, r.length()); 750 751 if(mergeN){ 752 if(quality==null){ 753 for(int i=0; i<minLenB; i++){ 754 byte b=r.bases[i]; 755 if(bases[i]=='N' && b!='N'){bases[i]=b;} 756 } 757 }else{ 758 for(int i=0; i<minLenB; i++){ 759 final byte b1=bases[i]; 760 final byte b2=r.bases[i]; 761 final byte q1=Tools.max((byte)0, quality[i]); 762 final byte q2=Tools.max((byte)0, r.quality[i]); 763 if(b1==b2){ 764 if(b1=='N'){ 765 //do nothing 766 }else if(mergeVectors){ 767 //merge qualities 768 // quality[i]=(byte) Tools.min(40, q1+q2); 769 if(q1>=q2){ 770 quality[i]=(byte) Tools.min(48, q1+1+q2/4); 771 }else{ 772 quality[i]=(byte) Tools.min(48, q2+1+q1/4); 773 } 774 } 775 }else if(b1=='N'){ 776 bases[i]=b2; 777 quality[i]=q2; 778 }else if(b2=='N'){ 779 //do nothing 780 }else if(mergeVectors){ 781 if(q1<1 && q2<1){ 782 //Special case - e.g. Illumina calls bases at 0 quality. 783 //Possibly best to keep the matching allele if one matches the ref. 784 //But for now, do nothing. 785 //This was causing problems changing perfect match strings into imperfect matches. 786 }else if(q1==q2){ 787 assert(b1!=b2); 788 bases[i]='N'; 789 quality[i]=0; 790 }else if(q1>q2){ 791 bases[i]=b1; 792 quality[i]=(byte)(q1-q2/2); 793 }else{ 794 bases[i]=b2; 795 quality[i]=(byte)(q2-q1/2); 796 } 797 assert(quality[i]>=0 && quality[i]<=48); 798 } 799 } 800 } 801 } 802 803 //TODO: 804 //Note that the read may need to be realigned after merging, so the match string may be rendered incorrect. 805 806 if(mergeN && match!=null){ 807 if(r.match==null){match=null;} 808 else{ 809 if(match.length!=r.match.length){match=null;} 810 else{ 811 boolean ok=true; 812 for(int i=0; i<match.length && ok; i++){ 813 byte a=match[i], b=r.match[i]; 814 if(a!=b){ 815 if((a=='m' || a=='S') && b=='N'){ 816 //do nothing; 817 }else if(a=='N' && (b=='m' || b=='S')){ 818 match[i]=b; 819 }else{ 820 ok=false; 821 } 822 } 823 } 824 if(!ok){match=null;} 825 } 826 } 827 } 828 829 if(mergeMate && mate!=null){ 830 mate.mergePrivate(r.mate, mergeVectors, mergeN, false); 831 assert(copies==mate.copies); 832 } 833 assert(copies>1); 834 835 assert(r!=this); 836 assert(r!=this.mate); 837 assert(r!=r.mate); 838 assert(this!=this.mate); 839 assert(r.mate==null || r.mate.mate==r); 840 assert(this.mate==null || this.mate.mate==this); 841 assert(r.mate==null || r.numericID==r.mate.numericID); 842 assert(mate==null || numericID==mate.numericID); 843 } 844 845 @Override toString()846 public String toString(){return toText(false).toString();} 847 toSites()848 public ByteBuilder toSites(){return toSites((ByteBuilder)null);} 849 toSites(ByteBuilder sb)850 public ByteBuilder toSites(ByteBuilder sb){ 851 if(numSites()==0){ 852 if(sb==null){sb=new ByteBuilder(2);} 853 sb.append('.'); 854 }else{ 855 if(sb==null){sb=new ByteBuilder(sites.size()*20);} 856 int appended=0; 857 for(SiteScore ss : sites){ 858 if(appended>0){sb.append('\t');} 859 if(ss!=null){ 860 ss.toBytes(sb); 861 appended++; 862 } 863 } 864 if(appended==0){sb.append('.');} 865 } 866 return sb; 867 } 868 toInfo()869 public ByteBuilder toInfo(){ 870 if(obj==null){return new ByteBuilder();} 871 if(obj.getClass()==ByteBuilder.class){return (ByteBuilder)obj;} 872 return new ByteBuilder(obj.toString()); 873 } 874 toInfo(ByteBuilder bb)875 public ByteBuilder toInfo(ByteBuilder bb){ 876 if(obj==null){return bb;} 877 if(obj.getClass()==ByteBuilder.class){return bb.append((ByteBuilder)obj);} 878 return bb.append(obj.toString()); 879 } 880 toFastq()881 public ByteBuilder toFastq(){ 882 return FASTQ.toFASTQ(this, (ByteBuilder)null); 883 } 884 toFastq(ByteBuilder bb)885 public ByteBuilder toFastq(ByteBuilder bb){ 886 return FASTQ.toFASTQ(this, bb); 887 } 888 toFasta()889 public ByteBuilder toFasta(){return toFasta(Shared.FASTA_WRAP);} toFasta(ByteBuilder bb)890 public ByteBuilder toFasta(ByteBuilder bb){return toFasta(Shared.FASTA_WRAP, bb);} 891 toFasta(int wrap)892 public ByteBuilder toFasta(int wrap){ 893 return toFasta(wrap, (ByteBuilder)null); 894 } 895 toFasta(int wrap, ByteBuilder bb)896 public ByteBuilder toFasta(int wrap, ByteBuilder bb){ 897 if(wrap<1){wrap=Integer.MAX_VALUE;} 898 int len=(id==null ? Tools.stringLength(numericID) : id.length())+(bases==null ? 0 : bases.length+bases.length/wrap)+5; 899 if(bb==null){bb=new ByteBuilder(len+1);} 900 bb.append('>'); 901 if(id==null){bb.append(numericID);} 902 else{bb.append(id);} 903 if(bases!=null){ 904 int pos=0; 905 while(pos<bases.length-wrap){ 906 bb.append('\n'); 907 bb.append(bases, pos, wrap); 908 pos+=wrap; 909 } 910 if(pos<bases.length){ 911 bb.append('\n'); 912 bb.append(bases, pos, bases.length-pos); 913 } 914 } 915 return bb; 916 } 917 toSam()918 public ByteBuilder toSam(){ 919 return toSam((ByteBuilder)null); 920 } 921 toSam(ByteBuilder bb)922 public ByteBuilder toSam(ByteBuilder bb){ 923 SamLine sl=new SamLine(this, pairnum()); 924 return sl.toBytes(bb);//Note: This used to have a newline 925 } 926 header()927 public static CharSequence header(){ 928 929 StringBuilder sb=new StringBuilder(); 930 sb.append("id"); 931 sb.append('\t'); 932 sb.append("numericID"); 933 sb.append('\t'); 934 sb.append("chrom"); 935 sb.append('\t'); 936 sb.append("strand"); 937 sb.append('\t'); 938 sb.append("start"); 939 sb.append('\t'); 940 sb.append("stop"); 941 sb.append('\t'); 942 943 sb.append("flags"); 944 sb.append('\t'); 945 946 sb.append("copies"); 947 sb.append('\t'); 948 949 sb.append("errors,fixed"); 950 sb.append('\t'); 951 sb.append("mapScore"); 952 sb.append('\t'); 953 sb.append("length"); 954 sb.append('\t'); 955 956 sb.append("bases"); 957 sb.append('\t'); 958 sb.append("quality"); 959 sb.append('\t'); 960 961 sb.append("insert"); 962 sb.append('\t'); 963 { 964 //These are not really necessary... 965 sb.append("avgQual"); 966 sb.append('\t'); 967 } 968 969 sb.append("match"); 970 sb.append('\t'); 971 sb.append("SiteScores: "+SiteScore.header()); 972 return sb; 973 } 974 toText(boolean okToCompressMatch)975 public ByteBuilder toText(boolean okToCompressMatch){ 976 return toText(okToCompressMatch, (ByteBuilder)null); 977 } 978 toText(boolean okToCompressMatch, ByteBuilder bb)979 public ByteBuilder toText(boolean okToCompressMatch, ByteBuilder bb){ 980 981 final byte[] oldmatch=match; 982 final boolean oldshortmatch=this.shortmatch(); 983 if(COMPRESS_MATCH_BEFORE_WRITING && !shortmatch() && okToCompressMatch){ 984 match=toShortMatchString(match); 985 setShortMatch(true); 986 } 987 988 if(bb==null){bb=new ByteBuilder();} 989 bb.append(id); 990 bb.tab(); 991 bb.append(numericID); 992 bb.tab(); 993 bb.append(chrom); 994 bb.tab(); 995 bb.append(Shared.strandCodes2[strand()]); 996 bb.tab(); 997 bb.append(start); 998 bb.tab(); 999 bb.append(stop); 1000 bb.tab(); 1001 1002 for(int i=maskArray.length-1; i>=0; i--){ 1003 bb.append(flagToNumber(maskArray[i])); 1004 } 1005 bb.tab(); 1006 1007 bb.append(copies); 1008 bb.tab(); 1009 1010 bb.append(errors); 1011 bb.tab(); 1012 bb.append(mapScore); 1013 bb.tab(); 1014 1015 if(bases==null){bb.append('.');} 1016 else{bb.append(bases);} 1017 bb.tab(); 1018 1019 // int qualSum=0; 1020 // int qualMin=99999; 1021 1022 if(quality==null){ 1023 bb.append('.'); 1024 }else{ 1025 bb.ensureExtra(quality.length); 1026 for(int i=0, j=bb.length; i<quality.length; i++, j++){ 1027 byte q=quality[i]; 1028 bb.array[j]=(byte)(q+ASCII_OFFSET); 1029 // qualSum+=q; 1030 // qualMin=Tools.min(q, qualMin); 1031 } 1032 bb.length+=quality.length; 1033 } 1034 bb.tab(); 1035 1036 if(insert<1){bb.append('.');}else{bb.append(insert);}; 1037 bb.tab(); 1038 1039 if(true || quality==null){ 1040 bb.append('.'); 1041 bb.tab(); 1042 }else{ 1043 // //These are not really necessary... 1044 // sb.append(qualSum/quality.length); 1045 // sb.append('\t'); 1046 } 1047 1048 if(match==null){bb.append('.');} 1049 else{bb.append(match);} 1050 bb.tab(); 1051 1052 if(gaps==null){ 1053 bb.append('.'); 1054 }else{ 1055 for(int i=0; i<gaps.length; i++){ 1056 if(i>0){bb.append('~');} 1057 bb.append(gaps[i]); 1058 } 1059 } 1060 1061 if(sites!=null && sites.size()>0){ 1062 1063 assert(absdif(start, stop)<3000 || (gaps==null) == (sites.get(0).gaps==null)) : 1064 "\n"+this.numericID+"\n"+Arrays.toString(gaps)+"\n"+sites.toString()+"\n"; 1065 1066 for(SiteScore ss : sites){ 1067 bb.tab(); 1068 if(ss==null){ 1069 bb.append((byte[])null); 1070 }else{ 1071 ss.toBytes(bb); 1072 } 1073 bb.append(ss==null ? "null" : ss.toText()); 1074 } 1075 } 1076 1077 if(originalSite!=null){ 1078 bb.tab(); 1079 bb.append('*'); 1080 originalSite.toBytes(bb); 1081 } 1082 1083 match=oldmatch; 1084 setShortMatch(oldshortmatch); 1085 1086 return bb; 1087 } 1088 1089 public static Read fromText(String line){ 1090 if(line.length()==1 && line.charAt(0)=='.'){return null;} 1091 1092 String[] split=line.split("\t"); 1093 1094 if(split.length<17){ 1095 throw new RuntimeException("Error parsing read from text.\n\n" + 1096 "This may be caused be attempting to parse the wrong format.\n" + 1097 "Please ensure that the file extension is correct:\n" + 1098 "\tFASTQ should end in .fastq or .fq\n" + 1099 "\tFASTA should end in .fasta or .fa, .fas, .fna, .ffn, .frn, .seq, .fsa\n" + 1100 "\tSAM should end in .sam\n" + 1101 "\tNative format should end in .txt or .bread\n" + 1102 "If a file is compressed, there must be a compression extension after the format extension:\n" + 1103 "\tgzipped files should end in .gz or .gzip\n" + 1104 "\tzipped files should end in .zip and have only 1 file per archive\n" + 1105 "\tbz2 files should end in .bz2\n"); 1106 } 1107 1108 final String id=new String(split[0]); 1109 long numericID=Long.parseLong(split[1]); 1110 int chrom=Byte.parseByte(split[2]); 1111 // byte strand=Byte.parseByte(split[3]); 1112 int start=Integer.parseInt(split[4]); 1113 int stop=Integer.parseInt(split[5]); 1114 1115 int flags=Integer.parseInt(split[6], 2); 1116 1117 int copies=Integer.parseInt(split[7]); 1118 1119 int errors; 1120 int errorsCorrected; 1121 if(split[8].indexOf(',')>=0){ 1122 String[] estring=split[8].split(","); 1123 errors=Integer.parseInt(estring[0]); 1124 errorsCorrected=Integer.parseInt(estring[1]); 1125 }else{ 1126 errors=Integer.parseInt(split[8]); 1127 errorsCorrected=0; 1128 } 1129 1130 int mapScore=Integer.parseInt(split[9]); 1131 1132 byte[] basesOriginal=split[10].getBytes(); 1133 byte[] qualityOriginal=(split[11].equals(".") ? null : split[11].getBytes()); 1134 1135 if(qualityOriginal!=null){ 1136 for(int i=0; i<qualityOriginal.length; i++){ 1137 byte b=qualityOriginal[i]; 1138 b=(byte) (b-ASCII_OFFSET); 1139 assert(b>=-1) : b; 1140 qualityOriginal[i]=b; 1141 } 1142 } 1143 1144 int insert=-1; 1145 if(!split[12].equals(".")){insert=Integer.parseInt(split[12]);} 1146 1147 byte[] match=null; 1148 if(!split[14].equals(".")){match=split[14].getBytes();} 1149 int[] gaps=null; 1150 if(!split[15].equals(".")){ 1151 1152 String[] gstring=split[16].split("~"); 1153 gaps=new int[gstring.length]; 1154 for(int i=0; i<gstring.length; i++){ 1155 gaps[i]=Integer.parseInt(gstring[i]); 1156 } 1157 } 1158 1159 // assert(false) : split[16]; 1160 1161 Read r=new Read(basesOriginal, qualityOriginal, id, numericID, flags, chrom, start, stop); 1162 r.match=match; 1163 r.errors=errors; 1164 r.mapScore=mapScore; 1165 r.copies=copies; 1166 r.gaps=gaps; 1167 r.insert=insert; 1168 1169 int firstScore=(ADD_BEST_SITE_TO_LIST_FROM_TEXT) ? 17 : 18; 1170 1171 int scores=split.length-firstScore; 1172 1173 int mSites=0; 1174 for(int i=firstScore; i<split.length; i++){ 1175 if(split[i].charAt(0)!='*'){mSites++;} 1176 } 1177 1178 //This can be disabled to handle very old text format. 1179 if(mSites>0){r.sites=new ArrayList<SiteScore>(mSites);} 1180 for(int i=firstScore; i<split.length; i++){ 1181 SiteScore ss=SiteScore.fromText(split[i]); 1182 if(split[i].charAt(0)=='*'){r.originalSite=ss;} 1183 else{r.sites.add(ss);} 1184 } 1185 1186 if(DECOMPRESS_MATCH_ON_LOAD && r.shortmatch()){ 1187 r.toLongMatchString(true); 1188 } 1189 1190 assert(r.numSites()==0 || absdif(r.start, r.stop)<3000 || (r.gaps==null) == (r.topSite().gaps==null)) : 1191 "\n"+r.numericID+", "+r.chrom+", "+r.strand()+", "+r.start+", "+r.stop+", "+Arrays.toString(r.gaps)+"\n"+r.sites+"\n"+line+"\n"; 1192 1193 return r; 1194 } 1195 1196 /** Inflates gaps between contigs in a scaffold. */ 1197 public void inflateGaps(int minGapIn, int minGapOut) { 1198 assert(minGapIn>0); 1199 if(!containsNocalls()){return;} 1200 final ByteBuilder bbb=new ByteBuilder(); 1201 final ByteBuilder bbq=(quality==null ? null : new ByteBuilder()); 1202 1203 int gap=0; 1204 for(int i=0; i<bases.length; i++){ 1205 byte b=bases[i]; 1206 byte q=(quality==null ? 0 : quality[i]); 1207 if(b=='N'){ 1208 gap++; 1209 }else{ 1210 while(gap>=minGapIn && gap<minGapOut){ 1211 gap++; 1212 bbb.append('N'); 1213 if(bbq!=null){bbq.append(0);} 1214 } 1215 gap=0; 1216 } 1217 bbb.append(b); 1218 if(bbq!=null){bbq.append(q);} 1219 } 1220 1221 while(gap>=minGapIn && gap<minGapOut){//Handle trailing bases 1222 gap++; 1223 bbb.append('N'); 1224 if(bbq!=null){bbq.append(0);} 1225 } 1226 1227 assert(bbb.length()>=bases.length); 1228 if(bbb.length()>bases.length){ 1229 bases=bbb.toBytes(); 1230 if(bbq!=null){quality=bbq.toBytes();} 1231 } 1232 } 1233 1234 public ArrayList<Read> breakAtGaps(final boolean agp, final int minContig){ 1235 ArrayList<Read> list=new ArrayList<Read>(); 1236 byte prev='N'; 1237 int lastN=-1, lastBase=-1; 1238 int contignum=1; 1239 long feature=1; 1240 ByteBuilder bb=(agp ? new ByteBuilder() : null); 1241 assert(obj==null); 1242 for(int i=0; i<bases.length; i++){ 1243 final byte b=bases[i]; 1244 if(b=='N'){ 1245 if(prev!='N'){ 1246 final int start=lastN+1, stop=i; 1247 byte[] b2=KillSwitch.copyOfRange(bases, start, stop); 1248 byte[] q2=(quality==null ? null : KillSwitch.copyOfRange(quality, start, stop)); 1249 Read r=new Read(b2, q2, id+"_c"+contignum, numericID); 1250 if(r.length()>=minContig){list.add(r);} 1251 contignum++; 1252 1253 if(bb!=null){ 1254 bb.append(id).append('\t'); 1255 bb.append(start+1).append('\t'); 1256 bb.append(stop).append('\t'); 1257 bb.append(feature).append('\t'); 1258 feature++; 1259 bb.append('W').append('\t'); 1260 bb.append(r.id).append('\t'); 1261 bb.append(1).append('\t'); 1262 bb.append(r.length()).append('\t'); 1263 bb.append('+').append('\n'); 1264 } 1265 } 1266 lastN=i; 1267 }else{ 1268 if(bb!=null && prev=='N' && lastBase>=0){ 1269 bb.append(id).append('\t'); 1270 bb.append(lastBase+2).append('\t'); 1271 bb.append(i).append('\t'); 1272 bb.append(feature).append('\t'); 1273 feature++; 1274 bb.append('N').append('\t'); 1275 bb.append((i-lastBase-1)).append('\t'); 1276 bb.append("scaffold").append('\t'); 1277 bb.append("yes").append('\t'); 1278 bb.append("paired-ends").append('\n'); 1279 } 1280 lastBase=i; 1281 } 1282 prev=b; 1283 } 1284 if(prev!='N'){ 1285 final int start=lastN+1, stop=bases.length; 1286 byte[] b2=KillSwitch.copyOfRange(bases, start, stop); 1287 byte[] q2=(quality==null ? null : KillSwitch.copyOfRange(quality, start, stop)); 1288 Read r=new Read(b2, q2, id+"_c"+contignum, numericID); 1289 if(r.length()>=minContig){list.add(r);} 1290 contignum++; 1291 1292 if(bb!=null){ 1293 bb.append(id).append('\t'); 1294 bb.append(start+1).append('\t'); 1295 bb.append(stop).append('\t'); 1296 bb.append(feature).append('\t'); 1297 feature++; 1298 bb.append('W').append('\t'); 1299 bb.append(r.id).append('\t'); 1300 bb.append(1).append('\t'); 1301 bb.append(r.length()).append('\t'); 1302 bb.append('+').append('\n'); 1303 } 1304 }else{ 1305 if(bb!=null && prev=='N' && lastBase>=0){ 1306 bb.append(id).append('\t'); 1307 bb.append(lastBase+2).append('\t'); 1308 bb.append(bases.length).append('\t'); 1309 bb.append(feature).append('\t'); 1310 feature++; 1311 bb.append('N').append('\t'); 1312 bb.append((bases.length-lastBase-1)).append('\t'); 1313 bb.append("scaffold").append('\t'); 1314 bb.append("yes").append('\t'); 1315 bb.append("paired-ends").append('\n'); 1316 } 1317 lastBase=bases.length; 1318 } 1319 if(bb!=null){obj=bb.toBytes();} 1320 return list; 1321 } 1322 1323 /** Reverse-complements the read. */ reverseComplement()1324 public void reverseComplement() { 1325 AminoAcid.reverseComplementBasesInPlace(bases); 1326 Tools.reverseInPlace(quality); 1327 setStrand(strand()^1); 1328 } 1329 1330 /** Complements the read. */ complement()1331 public void complement() { 1332 AminoAcid.reverseComplementBasesInPlace(bases); 1333 } 1334 1335 @Override compareTo(Read o)1336 public int compareTo(Read o) { 1337 if(chrom!=o.chrom){return chrom-o.chrom;} 1338 if(start!=o.start){return start-o.start;} 1339 if(stop!=o.stop){return stop-o.stop;} 1340 if(strand()!=o.strand()){return strand()-o.strand();} 1341 return 0; 1342 } 1343 toSite()1344 public SiteScore toSite(){ 1345 assert(start<=stop) : this.toText(false); 1346 SiteScore ss=new SiteScore(chrom, strand(), start, stop, 0, 0, rescued(), perfect()); 1347 if(paired()){ 1348 ss.setSlowPairedScore(mapScore-1, mapScore); 1349 }else{ 1350 ss.setSlowPairedScore(mapScore, 0); 1351 } 1352 ss.setScore(mapScore); 1353 ss.gaps=gaps; 1354 ss.match=match; 1355 originalSite=ss; 1356 return ss; 1357 } 1358 topSite()1359 public SiteScore topSite(){ 1360 final SiteScore ss=(sites==null || sites.isEmpty()) ? null : sites.get(0); 1361 assert(sites==null || sites.isEmpty() || ss!=null) : "Top site is null for read "+this; 1362 return ss; 1363 } 1364 numSites()1365 public int numSites(){ 1366 return (sites==null ? 0 : sites.size()); 1367 } 1368 makeOriginalSite()1369 public SiteScore makeOriginalSite(){ 1370 originalSite=toSite(); 1371 return originalSite; 1372 } 1373 setFromSite(SiteScore ss)1374 public void setFromSite(SiteScore ss){ 1375 assert(ss!=null); 1376 chrom=ss.chrom; 1377 setStrand(ss.strand); 1378 start=ss.start; 1379 stop=ss.stop; 1380 mapScore=ss.slowScore; 1381 setRescued(ss.rescued); 1382 gaps=ss.gaps; 1383 setPerfect(ss.perfect); 1384 1385 match=ss.match; 1386 1387 if(gaps!=null){ 1388 gaps=ss.gaps=GapTools.fixGaps(start, stop, gaps, Shared.MINGAP); 1389 // gaps[0]=Tools.min(gaps[0], start); 1390 // gaps[gaps.length-1]=Tools.max(gaps[gaps.length-1], stop); 1391 } 1392 } 1393 1394 // public static int[] fixGaps(int a, int b, int[] gaps, int minGap){ 1395 //// System.err.println("fixGaps input: "+a+", "+b+", "+Arrays.toString(gaps)+", "+minGap); 1396 // int[] r=GapTools.fixGaps(a, b, gaps, minGap); 1397 //// System.err.println("fixGaps output: "+Arrays.toString(r)); 1398 // return r; 1399 // } 1400 setFromOriginalSite()1401 public void setFromOriginalSite(){ 1402 setFromSite(originalSite); 1403 } setFromTopSite()1404 public void setFromTopSite(){ 1405 final SiteScore ss=topSite(); 1406 if(ss==null){ 1407 clearSite(); 1408 setMapped(false); 1409 return; 1410 } 1411 setMapped(true); 1412 setFromSite(ss); 1413 } 1414 setFromTopSite(boolean randomIfAmbiguous, boolean primary, int maxPairDist)1415 public void setFromTopSite(boolean randomIfAmbiguous, boolean primary, int maxPairDist){ 1416 final SiteScore ss0=topSite(); 1417 if(ss0==null){ 1418 clearSite(); 1419 setMapped(false); 1420 return; 1421 } 1422 setMapped(true); 1423 1424 if(sites.size()==1 || !randomIfAmbiguous || !ambiguous()){ 1425 setFromSite(ss0); 1426 return; 1427 } 1428 1429 if(primary || mate==null || !mate.mapped() || !mate.paired()){ 1430 int count=1; 1431 for(int i=1; i<sites.size(); i++){ 1432 SiteScore ss=sites.get(i); 1433 if(ss.score<ss0.score || (ss0.perfect && !ss.perfect) || (ss0.semiperfect && !ss.semiperfect)){break;} 1434 count++; 1435 } 1436 1437 int x=(int)(numericID%count); 1438 if(x>0){ 1439 SiteScore ss=sites.get(x); 1440 sites.set(0, ss); 1441 sites.set(x, ss0); 1442 } 1443 setFromSite(sites.get(0)); 1444 return; 1445 } 1446 1447 // assert(false) : "TODO: Proper strand orientation, and more."; 1448 //TODO: Also, this code appears to sometimes duplicate sitescores(?) 1449 // for(int i=0; i<list.size(); i++){ 1450 // SiteScore ss=list.get(i); 1451 // if(ss.chrom==mate.chrom && Tools.min(Tools.absdifUnsigned(ss.start, mate.stop), Tools.absdifUnsigned(ss.stop, mate.start))<=maxPairDist){ 1452 // list.set(0, ss); 1453 // list.set(i, ss0); 1454 // setFromSite(ss); 1455 // return; 1456 // } 1457 // } 1458 1459 //If unsuccessful, recur unpaired. 1460 1461 this.setPaired(false); 1462 mate.setPaired(false); 1463 setFromTopSite(randomIfAmbiguous, true, maxPairDist); 1464 } 1465 clearPairMapping()1466 public void clearPairMapping(){ 1467 clearMapping(); 1468 if(mate!=null){mate.clearMapping();} 1469 } 1470 clearMapping()1471 public void clearMapping(){ 1472 clearSite(); 1473 match=null; 1474 sites=null; 1475 setMapped(false); 1476 setPaired(false); 1477 if(mate!=null){mate.setPaired(false);} 1478 } 1479 clearSite()1480 public void clearSite(){ 1481 chrom=-1; 1482 setStrand(0); 1483 start=-1; 1484 stop=-1; 1485 // errors=0; 1486 mapScore=0; 1487 gaps=null; 1488 } 1489 1490 clearAnswers(boolean clearMate)1491 public void clearAnswers(boolean clearMate) { 1492 // assert(mate==null || (pairnum()==0 && mate.pairnum()==1)) : pairnum()+", "+mate.pairnum(); 1493 clearSite(); 1494 match=null; 1495 sites=null; 1496 flags=(flags&(SYNTHMASK|PAIRNUMMASK|SWAPMASK)); 1497 if(clearMate && mate!=null){ 1498 mate.clearSite(); 1499 mate.match=null; 1500 mate.sites=null; 1501 mate.flags=(mate.flags&(SYNTHMASK|PAIRNUMMASK|SWAPMASK)); 1502 } 1503 // assert(mate==null || (pairnum()==0 && mate.pairnum()==1)) : pairnum()+", "+mate.pairnum(); 1504 } 1505 1506 isBadPair(boolean requireCorrectStrands, boolean sameStrandPairs, int maxdist)1507 public boolean isBadPair(boolean requireCorrectStrands, boolean sameStrandPairs, int maxdist){ 1508 if(mate==null || paired()){return false;} 1509 if(!mapped() || !mate.mapped()){return false;} 1510 if(chrom!=mate.chrom){return true;} 1511 1512 { 1513 int inner; 1514 if(start<=mate.start){inner=mate.start-stop;} 1515 else{inner=start-mate.stop;} 1516 if(inner>maxdist){return true;} 1517 } 1518 // if(absdif(start, mate.start)>maxdist){return true;} 1519 if(requireCorrectStrands){ 1520 if((strand()==mate.strand())!=sameStrandPairs){return true;} 1521 } 1522 if(!sameStrandPairs){ 1523 if(strand()==Shared.PLUS && mate.strand()==Shared.MINUS){ 1524 if(start>=mate.stop){return true;} 1525 }else if(strand()==Shared.MINUS && mate.strand()==Shared.PLUS){ 1526 if(mate.start>=stop){return true;} 1527 } 1528 } 1529 return false; 1530 } 1531 countMismatches()1532 public int countMismatches(){ 1533 assert(match!=null); 1534 int x=0; 1535 for(byte b : match){ 1536 if(b=='S'){x++;} 1537 } 1538 return x; 1539 } 1540 1541 /** 1542 * @param k 1543 * @return Number of valid kmers 1544 */ numValidKmers(int k)1545 public int numValidKmers(int k) { 1546 if(bases==null){return 0;} 1547 int len=0, counted=0; 1548 for(int i=0; i<bases.length; i++){ 1549 byte b=bases[i]; 1550 long x=AminoAcid.baseToNumber[b]; 1551 if(x<0){len=0;}else{len++;} 1552 if(len>=k){counted++;} 1553 } 1554 return counted; 1555 } 1556 1557 /** 1558 * @param match string 1559 * @return Total number of match, sub, del, ins, or clip symbols 1560 */ matchToMsdicn(byte[] match)1561 public static final int[] matchToMsdicn(byte[] match) { 1562 if(match==null || match.length<1){return null;} 1563 int[] msdicn=KillSwitch.allocInt1D(6); 1564 1565 byte mode='0', c='0'; 1566 int current=0; 1567 for(int i=0; i<match.length; i++){ 1568 c=match[i]; 1569 if(Tools.isDigit(c)){ 1570 current=(current*10)+(c-'0'); 1571 }else{ 1572 if(mode==c){ 1573 current=Tools.max(current+1, 2); 1574 }else{ 1575 current=Tools.max(current, 1); 1576 1577 if(mode=='m'){ 1578 msdicn[0]+=current; 1579 }else if(mode=='S'){ 1580 msdicn[1]+=current; 1581 }else if(mode=='D'){ 1582 msdicn[2]+=current; 1583 }else if(mode=='I'){ 1584 msdicn[3]+=current; 1585 }else if(mode=='C' || mode=='X' || mode=='Y'){ 1586 msdicn[4]+=current; 1587 }else if(mode=='N' || mode=='R'){ 1588 msdicn[5]+=current; 1589 } 1590 mode=c; 1591 current=0; 1592 } 1593 } 1594 } 1595 if(current>0 || !Tools.isDigit(c)){ 1596 current=Tools.max(current, 1); 1597 if(mode=='m'){ 1598 msdicn[0]+=current; 1599 }else if(mode=='S'){ 1600 msdicn[1]+=current; 1601 }else if(mode=='D'){ 1602 msdicn[2]+=current; 1603 }else if(mode=='I'){ 1604 msdicn[3]+=current; 1605 }else if(mode=='C' || mode=='X' || mode=='Y'){ 1606 msdicn[4]+=current; 1607 }else if(mode=='N' || mode=='R'){ 1608 msdicn[5]+=current; 1609 } 1610 } 1611 return msdicn; 1612 } 1613 1614 1615 /** 1616 * @param match string 1617 * @return Ref length of match string 1618 */ calcMatchLength(byte[] match)1619 public static final int calcMatchLength(byte[] match) { 1620 if(match==null || match.length<1){return 0;} 1621 1622 byte mode='0', c='0'; 1623 int current=0; 1624 int len=0; 1625 for(int i=0; i<match.length; i++){ 1626 c=match[i]; 1627 if(Tools.isDigit(c)){ 1628 current=(current*10)+(c-'0'); 1629 }else{ 1630 if(mode==c){ 1631 current=Tools.max(current+1, 2); 1632 }else{ 1633 current=Tools.max(current, 1); 1634 1635 if(mode=='m'){ 1636 len+=current; 1637 }else if(mode=='S'){ 1638 len+=current; 1639 }else if(mode=='D'){ 1640 len+=current; 1641 }else if(mode=='I'){ //Do nothing 1642 //len+=current; 1643 }else if(mode=='C'){ 1644 len+=current; 1645 }else if(mode=='X'){ //Not sure about this case, but adding seems fine 1646 len+=current; 1647 // assert(false) : new String(match); 1648 }else if(mode=='Y'){ //Do nothing 1649 //len+=current; 1650 // assert(false) : new String(match); 1651 }else if(mode=='N' || mode=='R'){ 1652 len+=current; 1653 } 1654 mode=c; 1655 current=0; 1656 } 1657 } 1658 } 1659 if(current>0 || !Tools.isDigit(c)){ 1660 current=Tools.max(current, 1); 1661 if(mode=='m'){ 1662 len+=current; 1663 }else if(mode=='S'){ 1664 len+=current; 1665 }else if(mode=='D'){ 1666 len+=current; 1667 }else if(mode=='I'){ //Do nothing 1668 //len+=current; 1669 }else if(mode=='C'){ 1670 len+=current; 1671 }else if(mode=='X'){ //Not sure about this case, but adding seems fine 1672 len+=current; 1673 assert(false) : new String(match); 1674 }else if(mode=='Y'){ //Do nothing 1675 //len+=current; 1676 // assert(false) : new String(match); 1677 }else if(mode=='N' || mode=='R'){ 1678 len+=current; 1679 } 1680 } 1681 return len; 1682 } 1683 identity()1684 public final float identity() {return identity(match);} 1685 identity(byte[] match)1686 public static final float identity(byte[] match) { 1687 if(FLAT_IDENTITY){ 1688 return identityFlat(match, true); 1689 }else{ 1690 return identitySkewed(match, true, true, false, false); 1691 } 1692 } 1693 hasLongInsertion(int maxlen)1694 public final boolean hasLongInsertion(int maxlen){ 1695 return hasLongInsertion(match, maxlen); 1696 } 1697 hasLongDeletion(int maxlen)1698 public final boolean hasLongDeletion(int maxlen){ 1699 return hasLongDeletion(match, maxlen); 1700 } 1701 hasLongInsertion(byte[] match, int maxlen)1702 public static final boolean hasLongInsertion(byte[] match, int maxlen){ 1703 if(match==null || match.length<maxlen){return false;} 1704 byte prev='0'; 1705 int len=0; 1706 for(byte b : match){ 1707 if(b=='I' || b=='X' || b=='Y'){ 1708 if(b==prev){len++;} 1709 else{len=1;} 1710 if(len>maxlen){return true;} 1711 }else{ 1712 len=0; 1713 } 1714 prev=b; 1715 } 1716 return false; 1717 } 1718 hasLongDeletion(byte[] match, int maxlen)1719 public static final boolean hasLongDeletion(byte[] match, int maxlen){ 1720 if(match==null || match.length<maxlen){return false;} 1721 byte prev='0'; 1722 int len=0; 1723 for(byte b : match){ 1724 if(b=='D'){ 1725 if(b==prev){len++;} 1726 else{len=1;} 1727 if(len>maxlen){return true;} 1728 }else{ 1729 len=0; 1730 } 1731 prev=b; 1732 } 1733 return false; 1734 } 1735 mappedNonClippedBases()1736 public int mappedNonClippedBases() { 1737 if(!mapped() || match==null || bases==null){return 0;} 1738 1739 int len=0; 1740 byte mode='0', c='0'; 1741 int current=0; 1742 for(int i=0; i<match.length; i++){ 1743 c=match[i]; 1744 if(Tools.isDigit(c)){ 1745 current=(current*10)+(c-'0'); 1746 }else{ 1747 if(mode==c){ 1748 current=Tools.max(current+1, 2); 1749 }else{ 1750 current=Tools.max(current, 1); 1751 1752 if(mode=='D' || mode=='C' || mode=='X' || mode=='Y' || mode=='d'){ 1753 1754 }else{ 1755 len+=current; 1756 } 1757 mode=c; 1758 current=0; 1759 } 1760 } 1761 } 1762 if(current>0 || !Tools.isDigit(c)){ 1763 current=Tools.max(current, 1); 1764 if(mode=='D' || mode=='C' || mode=='X' || mode=='Y' || mode=='d'){ 1765 1766 }else{ 1767 len+=current; 1768 } 1769 mode=c; 1770 current=0; 1771 } 1772 return len; 1773 } 1774 1775 /** 1776 * Handles short or long mode. 1777 * @param match string 1778 * @return Identity based on number of match, sub, del, ins, or N symbols 1779 */ identityFlat(byte[] match, boolean penalizeN)1780 public static final float identityFlat(byte[] match, boolean penalizeN) { 1781 // assert(false) : new String(match); 1782 if(match==null || match.length<1){return 0;} 1783 1784 int good=0, bad=0, n=0; 1785 1786 byte mode='0', c='0'; 1787 int current=0; 1788 for(int i=0; i<match.length; i++){ 1789 c=match[i]; 1790 if(Tools.isDigit(c)){ 1791 current=(current*10)+(c-'0'); 1792 }else{ 1793 if(mode==c){ 1794 current=Tools.max(current+1, 2); 1795 }else{ 1796 current=Tools.max(current, 1); 1797 1798 if(mode=='m'){ 1799 good+=current; 1800 // System.out.println("G: mode="+(char)mode+", c="+(char)c+", current="+current+", good="+good+", bad="+bad); 1801 }else if(mode=='R' || mode=='N'){ 1802 n+=current; 1803 }else if(mode=='C' || mode=='V'){ 1804 //Do nothing 1805 //I assume this is clipped because it went off the end of a scaffold, and thus is irrelevant to identity 1806 }else if(mode!='0'){ 1807 assert(mode=='S' || mode=='D' || mode=='I' || mode=='X' || mode=='Y' || mode=='i' || mode=='d') : (char)mode; 1808 if(mode!='D' || current<SamLine.INTRON_LIMIT){ 1809 bad+=current; 1810 } 1811 // System.out.println("B: mode="+(char)mode+", c="+(char)c+", current="+current+", good="+good+", bad="+bad); 1812 } 1813 mode=c; 1814 current=0; 1815 } 1816 } 1817 } 1818 if(current>0 || !Tools.isDigit(c)){ 1819 current=Tools.max(current, 1); 1820 if(mode=='m'){ 1821 good+=current; 1822 }else if(mode=='R' || mode=='N'){ 1823 n+=current; 1824 }else if(mode=='C' || mode=='V'){ 1825 //Do nothing 1826 //I assume this is clipped because it went off the end of a scaffold, and thus is irrelevant to identity 1827 }else if(mode!='0'){ 1828 assert(mode=='S' || mode=='I' || mode=='X' || mode=='Y') : (char)mode; 1829 if(mode!='D' || current<SamLine.INTRON_LIMIT){ 1830 bad+=current; 1831 } 1832 // System.out.println("B: mode="+(char)mode+", c="+(char)c+", current="+current+", good="+good+", bad="+bad); 1833 } 1834 } 1835 1836 1837 float good2=good+n*(penalizeN ? 0.25f : 0); 1838 float bad2=bad+n*(penalizeN ? 0.75f : 0); 1839 float r=good2/Tools.max(good2+bad2, 1); 1840 // assert(false) : new String(match)+"\nmode='"+(char)mode+"', current="+current+", good="+good+", bad="+bad; 1841 1842 // System.out.println("match="+new String(match)+"\ngood="+good+", bad="+bad+", r="+r); 1843 // System.out.println(Arrays.toString(matchToMsdicn(match))); 1844 1845 return r; 1846 } 1847 1848 /** 1849 * Handles short or long mode. 1850 * @param match string 1851 * @return Identity based on number of match, sub, del, ins, or N symbols 1852 */ identitySkewed(byte[] match, boolean penalizeN, boolean sqrt, boolean log, boolean single)1853 public static final float identitySkewed(byte[] match, boolean penalizeN, boolean sqrt, boolean log, boolean single) { 1854 // assert(false) : new String(match); 1855 if(match==null || match.length<1){return 0;} 1856 1857 int good=0, bad=0, n=0; 1858 1859 byte mode='0', c='0'; 1860 int current=0; 1861 for(int i=0; i<match.length; i++){ 1862 c=match[i]; 1863 if(Tools.isDigit(c)){ 1864 current=(current*10)+(c-'0'); 1865 }else{ 1866 if(mode==c){ 1867 current=Tools.max(current+1, 2); 1868 }else{ 1869 current=Tools.max(current, 1); 1870 1871 if(mode=='m'){ 1872 good+=current; 1873 // System.out.println("G: mode="+(char)mode+", c="+(char)c+", current="+current+", good="+good+", bad="+bad); 1874 }else if(mode=='D'){ 1875 if(current<SamLine.INTRON_LIMIT){ 1876 int x; 1877 1878 if(sqrt){x=(int)Math.ceil(Math.sqrt(current));} 1879 else if(log){x=(int)Math.ceil(Tools.log2(current));} 1880 else{x=1;} 1881 1882 bad+=(Tools.min(x, current)); 1883 } 1884 1885 // System.out.println("D: mode="+(char)mode+", c="+(char)c+", current="+current+", good="+good+", bad="+bad+", x="+x); 1886 }else if(mode=='R' || mode=='N'){ 1887 n+=current; 1888 }else if(mode=='C' || mode=='V'){ 1889 //Do nothing 1890 //I assume this is clipped because it went off the end of a scaffold, and thus is irrelevant to identity 1891 }else if(mode!='0'){ 1892 assert(mode=='S' || mode=='I' || mode=='X' || mode=='Y' || mode=='i' || mode=='d') : (char)mode; 1893 bad+=current; 1894 // System.out.println("B: mode="+(char)mode+", c="+(char)c+", current="+current+", good="+good+", bad="+bad); 1895 } 1896 mode=c; 1897 current=0; 1898 } 1899 } 1900 } 1901 if(current>0 || !Tools.isDigit(c)){ 1902 current=Tools.max(current, 1); 1903 if(mode=='m'){ 1904 good+=current; 1905 }else if(mode=='R' || mode=='N'){ 1906 n+=current; 1907 }else if(mode=='C' || mode=='V'){ 1908 //Do nothing 1909 //I assume this is clipped because it went off the end of a scaffold, and thus is irrelevant to identity 1910 }else if(mode!='0'){ 1911 assert(mode=='S' || mode=='I' || mode=='X' || mode=='Y') : (char)mode; 1912 if(mode!='D' || current<SamLine.INTRON_LIMIT){ 1913 bad+=current; 1914 } 1915 // System.out.println("B: mode="+(char)mode+", c="+(char)c+", current="+current+", good="+good+", bad="+bad); 1916 } 1917 } 1918 1919 1920 float good2=good+n*(penalizeN ? 0.25f : 0); 1921 float bad2=bad+n*(penalizeN ? 0.75f : 0); 1922 float r=good2/Tools.max(good2+bad2, 1); 1923 // assert(false) : new String(match)+"\nmode='"+(char)mode+"', current="+current+", good="+good+", bad="+bad; 1924 1925 // System.out.println("match="+new String(match)+"\ngood="+good+", bad="+bad+", r="+r); 1926 // System.out.println(Arrays.toString(matchToMsdicn(match))); 1927 1928 return r; 1929 } 1930 failsChastity()1931 public boolean failsChastity(){ 1932 return failsChastity(true); 1933 } 1934 failsChastity(boolean processAssertions)1935 public boolean failsChastity(boolean processAssertions){ 1936 if(id==null){return false;} 1937 int space=id.indexOf(' '); 1938 if(space<0 || space+5>id.length()){return false;} 1939 char a=id.charAt(space+1); 1940 char b=id.charAt(space+2); 1941 char c=id.charAt(space+3); 1942 char d=id.charAt(space+4); 1943 1944 if(a=='/'){ 1945 if(b<'1' || b>'4' || c!=':'){ 1946 if(!processAssertions){return false;} 1947 KillSwitch.kill("Strangely formatted read. Please disable chastityfilter with the flag chastityfilter=f. id:"+id); 1948 } 1949 return d=='Y'; 1950 }else{ 1951 if(processAssertions){ 1952 assert(a=='1' || a=='2' || a=='3' || a=='4') : id; 1953 assert(b==':') : id; 1954 assert(d==':'); 1955 } 1956 if(a<'1' || a>'4' || b!=':' || d!=':'){ 1957 if(!processAssertions){return false;} 1958 KillSwitch.kill("Strangely formatted read. Please disable chastityfilter with the flag chastityfilter=f. id:"+id); 1959 } 1960 return c=='Y'; 1961 } 1962 } 1963 failsBarcode(HashSet<String> set, boolean failIfNoBarcode)1964 public boolean failsBarcode(HashSet<String> set, boolean failIfNoBarcode){ 1965 if(id==null){return false;} 1966 1967 final int loc=id.lastIndexOf(':'); 1968 final int loc2=Tools.max(id.indexOf(' '), id.indexOf('/')); 1969 if(loc<0 || loc<=loc2 || loc>=id.length()-1){ 1970 return failIfNoBarcode; 1971 } 1972 1973 if(set==null){ 1974 for(int i=loc+1; i<id.length(); i++){ 1975 char c=id.charAt(i); 1976 boolean ok=(c=='+' || AminoAcid.isFullyDefined(c)); 1977 if(!ok){return true;} 1978 } 1979 return false; 1980 }else{ 1981 String code=id.substring(loc+1); 1982 return !set.contains(code); 1983 } 1984 } 1985 1986 /** 1987 * Parse the barcode from an Illumina header. 1988 * @param failIfNoBarcode Terminate the JVM if no barcode is present. 1989 * @return The barcode 1990 */ barcode(boolean failIfNoBarcode)1991 public String barcode(boolean failIfNoBarcode){ 1992 1993 if(id==null){ 1994 if(failIfNoBarcode && Shared.EA()){ 1995 KillSwitch.kill("Encountered a read header without a barcode:\n"+id+"\n"); 1996 } 1997 return null; 1998 } 1999 2000 final int loc=id.lastIndexOf(':'); 2001 final int loc2=Tools.max(id.indexOf(' '), id.indexOf('/')); 2002 if(loc<0 || loc<=loc2 || loc>=id.length()-1){ 2003 if(failIfNoBarcode && Shared.EA()){ 2004 KillSwitch.kill("Encountered a read header without a barcode:\n"+id+"\n"); 2005 } 2006 return null; 2007 } 2008 2009 String code=id.substring(loc+1); 2010 return code; 2011 } 2012 2013 /** 2014 * @return The rname of this Read's SamLine, if present and mapped. 2015 */ rnameS()2016 public String rnameS() { 2017 if(samline==null || !samline.mapped()){return null;} 2018 return samline.rnameS(); 2019 } 2020 2021 /** Average based on summing quality scores */ avgQuality(boolean countUndefined, int maxBases)2022 public double avgQuality(boolean countUndefined, int maxBases){ 2023 return AVERAGE_QUALITY_BY_PROBABILITY ? avgQualityByProbabilityDouble(countUndefined, maxBases) : avgQualityByScoreDouble(maxBases); 2024 } 2025 2026 /** Average based on summing quality scores */ avgQualityInt(boolean countUndefined, int maxBases)2027 public int avgQualityInt(boolean countUndefined, int maxBases){ 2028 return AVERAGE_QUALITY_BY_PROBABILITY ? avgQualityByProbabilityInt(countUndefined, maxBases) : avgQualityByScoreInt(maxBases); 2029 } 2030 2031 /** Average based on summing error probabilities */ avgQualityByProbabilityInt(boolean countUndefined, int maxBases)2032 public int avgQualityByProbabilityInt(boolean countUndefined, int maxBases){ 2033 if(bases==null || bases.length==0){return 0;} 2034 return avgQualityByProbabilityInt(bases, quality, countUndefined, maxBases); 2035 } 2036 2037 /** Average based on summing error probabilities */ avgQualityByProbabilityDouble(boolean countUndefined, int maxBases)2038 public double avgQualityByProbabilityDouble(boolean countUndefined, int maxBases){ 2039 if(bases==null || bases.length==0){return 0;} 2040 return avgQualityByProbabilityDouble(bases, quality, countUndefined, maxBases); 2041 } 2042 2043 /** Average based on summing error probabilities */ probabilityErrorFree(boolean countUndefined, int maxBases)2044 public double probabilityErrorFree(boolean countUndefined, int maxBases){ 2045 if(bases==null || bases.length==0){return 0;} 2046 return probabilityErrorFree(bases, quality, countUndefined, maxBases); 2047 } 2048 2049 /** Average based on summing error probabilities */ avgQualityByProbabilityInt(byte[] bases, byte[] quality, boolean countUndefined, int maxBases)2050 public static int avgQualityByProbabilityInt(byte[] bases, byte[] quality, boolean countUndefined, int maxBases){ 2051 if(quality==null){return 40;} 2052 if(quality.length==0){return 0;} 2053 float e=expectedErrors(bases, quality, countUndefined, maxBases); 2054 final int div=(maxBases<1 ? quality.length : Tools.min(maxBases, quality.length)); 2055 float p=e/div; 2056 return QualityTools.probErrorToPhred(p); 2057 } 2058 2059 /** Average based on summing error probabilities */ avgQualityByProbabilityDouble(byte[] bases, byte[] quality, boolean countUndefined, int maxBases)2060 public static double avgQualityByProbabilityDouble(byte[] bases, byte[] quality, boolean countUndefined, int maxBases){ 2061 if(quality==null){return 40;} 2062 if(quality.length==0){return 0;} 2063 float e=expectedErrors(bases, quality, countUndefined, maxBases); 2064 final int div=(maxBases<1 ? quality.length : Tools.min(maxBases, quality.length)); 2065 float p=e/div; 2066 return QualityTools.probErrorToPhredDouble(p); 2067 } 2068 2069 /** Average based on summing quality scores */ avgQualityByScoreInt(int maxBases)2070 public int avgQualityByScoreInt(int maxBases){ 2071 if(bases==null || bases.length==0){return 0;} 2072 if(quality==null){return 40;} 2073 int x=0, limit=(maxBases<1 ? quality.length : Tools.min(maxBases, quality.length)); 2074 for(int i=0; i<limit; i++){ 2075 byte b=quality[i]; 2076 x+=(b<0 ? 0 : b); 2077 } 2078 return x/limit; 2079 } 2080 2081 /** Average based on summing quality scores */ avgQualityByScoreDouble(int maxBases)2082 public double avgQualityByScoreDouble(int maxBases){ 2083 if(bases==null || bases.length==0){return 0;} 2084 if(quality==null){return 40;} 2085 int x=0, limit=(maxBases<1 ? quality.length : Tools.min(maxBases, quality.length)); 2086 for(int i=0; i<limit; i++){ 2087 byte b=quality[i]; 2088 x+=(b<0 ? 0 : b); 2089 } 2090 return x/(double)limit; 2091 } 2092 2093 /** Used by BBMap tipsearch. */ avgQualityFirstNBases(int n)2094 public int avgQualityFirstNBases(int n){ 2095 if(bases==null || bases.length==0){return 0;} 2096 if(quality==null || n<1){return 40;} 2097 assert(quality!=null); 2098 int x=0; 2099 if(n>quality.length){return 0;} 2100 for(int i=0; i<n; i++){ 2101 byte b=quality[i]; 2102 x+=(b<0 ? 0 : b); 2103 } 2104 return x/n; 2105 } 2106 2107 /** Used by BBMap tipsearch. */ avgQualityLastNBases(int n)2108 public int avgQualityLastNBases(int n){ 2109 if(bases==null || bases.length==0){return 0;} 2110 if(quality==null || n<1){return 40;} 2111 assert(quality!=null); 2112 int x=0; 2113 if(n>quality.length){return 0;} 2114 for(int i=bases.length-n; i<bases.length; i++){ 2115 byte b=quality[i]; 2116 x+=(b<0 ? 0 : b); 2117 } 2118 return x/n; 2119 } 2120 2121 /** Used by BBDuk. */ minQuality()2122 public int minQuality(){ 2123 byte min=41; 2124 if(bases!=null && quality!=null){ 2125 for(byte q : quality){ 2126 min=Tools.min(min, q); 2127 } 2128 } 2129 return min; 2130 } 2131 2132 /** Used by BBMap tipsearch. */ minQualityFirstNBases(int n)2133 public byte minQualityFirstNBases(int n){ 2134 if(bases==null || bases.length==0){return 0;} 2135 if(quality==null || n<1){return 41;} 2136 assert(quality!=null && n>0); 2137 if(n>quality.length){return 0;} 2138 byte x=quality[0]; 2139 for(int i=1; i<n; i++){ 2140 byte b=quality[i]; 2141 if(b<x){x=b;} 2142 } 2143 return x; 2144 } 2145 2146 /** Used by BBMap tipsearch. */ minQualityLastNBases(int n)2147 public byte minQualityLastNBases(int n){ 2148 if(bases==null || bases.length==0){return 0;} 2149 if(quality==null || n<1){return 41;} 2150 assert(quality!=null && n>0); 2151 if(n>quality.length){return 0;} 2152 byte x=quality[bases.length-n]; 2153 for(int i=bases.length-n; i<bases.length; i++){ 2154 byte b=quality[i]; 2155 if(b<x){x=b;} 2156 } 2157 return x; 2158 } 2159 containsNonM()2160 public boolean containsNonM(){ 2161 assert(match!=null && valid()); 2162 for(int i=0; i<match.length; i++){ 2163 byte b=match[i]; 2164 assert(b!='M'); 2165 if(b>'9' && b!='m'){return true;} 2166 } 2167 return false; 2168 } 2169 containsNonNM()2170 public boolean containsNonNM(){ 2171 assert(match!=null && valid()); 2172 for(int i=0; i<match.length; i++){ 2173 byte b=match[i]; 2174 assert(b!='M'); 2175 if(b>'9' && b!='m' && b!='N'){return true;} 2176 } 2177 return false; 2178 } 2179 containsVariants()2180 public boolean containsVariants(){ 2181 assert(match!=null && valid()) : (match==null)+", "+(valid())+"\n"+samline+"\n"; 2182 for(int i=0; i<match.length; i++){ 2183 byte b=match[i]; 2184 assert(b!='M'); 2185 if(b>'9' && b!='m' && b!='N' && b!='C'){return true;} 2186 } 2187 return false; 2188 } 2189 containsClipping()2190 public boolean containsClipping(){ 2191 assert(match!=null && valid()) : (match==null)+", "+(valid())+"\n"+samline+"\n"; 2192 if(match.length<1){return false;} 2193 if(match[0]=='C'){return true;} 2194 for(int i=match.length-1; i>0; i--){ 2195 if(match[i]=='C'){return true;} 2196 if(match[i]>'9'){break;} 2197 } 2198 return false; 2199 } 2200 2201 /** 2202 * @return {m,S,C,N,I,D}; 2203 */ countMatchSymbols()2204 public int[] countMatchSymbols(){ 2205 int m=0, S=0, C=0, N=0, I=0, D=0; 2206 int current=0; 2207 byte last='?'; 2208 for(byte b : match){ 2209 if(Tools.isDigit(b)){ 2210 current=current*10+b-'0'; 2211 }else{ 2212 current=Tools.max(current, 1); 2213 if(last=='m'){ 2214 m+=current; 2215 }else if(last=='S'){ 2216 S+=current; 2217 }else if(last=='C'){ 2218 C+=current; 2219 }else if(last=='N'){ 2220 N+=current; 2221 }else if(last=='I'){ 2222 I+=current; 2223 }else if(last=='D'){ 2224 D+=current; 2225 }else{ 2226 assert(last=='?') : "Unhandled symbol "+(char)last+"\n"+new String(match); 2227 } 2228 current=0; 2229 last=b; 2230 } 2231 } 2232 current=Tools.max(current, 1); 2233 if(last=='m'){ 2234 m+=current; 2235 }else if(last=='S'){ 2236 S+=current; 2237 }else if(last=='C'){ 2238 C+=current; 2239 }else if(last=='N'){ 2240 N+=current; 2241 }else if(last=='I'){ 2242 I+=current; 2243 }else if(last=='D'){ 2244 D+=current; 2245 }else{ 2246 assert(last=='?') : "Unhandled symbol "+(char)last+"\n"+new String(match); 2247 } 2248 current=0; 2249 return new int[] {m,S,C,N,I,D}; 2250 } 2251 containsNonNMXY()2252 public boolean containsNonNMXY(){ 2253 assert(match!=null && valid()); 2254 for(int i=0; i<match.length; i++){ 2255 byte b=match[i]; 2256 assert(b!='M'); 2257 if(b>'9' && b!='m' && b!='N' && b!='X' && b!='Y'){return true;} 2258 } 2259 return false; 2260 } 2261 containsSDI()2262 public boolean containsSDI(){ 2263 assert(match!=null && valid()); 2264 for(int i=0; i<match.length; i++){ 2265 byte b=match[i]; 2266 assert(b!='M'); 2267 if(b=='S' || b=='s' || b=='D' || b=='I'){return true;} 2268 } 2269 return false; 2270 } 2271 containsNonNMS()2272 public boolean containsNonNMS(){ 2273 assert(match!=null && valid()); 2274 for(int i=0; i<match.length; i++){ 2275 byte b=match[i]; 2276 assert(b!='M'); 2277 if(b>'9' && b!='m' && b!='s' && b!='N' && b!='S'){return true;} 2278 } 2279 return false; 2280 } 2281 containsConsecutiveS(int num)2282 public boolean containsConsecutiveS(int num){ 2283 assert(match!=null && valid() && !shortmatch()); 2284 int cnt=0; 2285 for(int i=0; i<match.length; i++){ 2286 byte b=match[i]; 2287 assert(b!='M'); 2288 if(b=='S'){ 2289 cnt++; 2290 if(cnt>=num){return true;} 2291 }else{ 2292 cnt=0; 2293 } 2294 } 2295 return false; 2296 } 2297 containsIndels()2298 public boolean containsIndels(){ 2299 assert(match!=null && valid()); 2300 for(int i=0; i<match.length; i++){ 2301 byte b=match[i]; 2302 if(b=='I' || b=='D' || b=='X' || b=='Y'){return true;} 2303 } 2304 return false; 2305 } 2306 countSubs()2307 public int countSubs(){ 2308 assert(match!=null && valid()) : (match!=null)+", "+valid()+", "+shortmatch(); 2309 return countSubs(match); 2310 } 2311 containsInMatch(char c)2312 public boolean containsInMatch(char c){ 2313 assert(match!=null && valid()); 2314 for(int i=0; i<match.length; i++){ 2315 byte b=match[i]; 2316 if(b==c){return true;} 2317 } 2318 return false; 2319 } 2320 containsNocalls()2321 public boolean containsNocalls(){ 2322 for(int i=0; i<bases.length; i++){ 2323 byte b=bases[i]; 2324 if(b=='N'){return true;} 2325 } 2326 return false; 2327 } 2328 countNocalls()2329 public int countNocalls(){ 2330 return countNocalls(bases); 2331 } 2332 countSubs(byte[] match)2333 public static int countSubs(byte[] match){ 2334 int S=0; 2335 int current=0; 2336 byte last='?'; 2337 for(byte b : match){ 2338 if(Tools.isDigit(b)){ 2339 current=current*10+b-'0'; 2340 }else{ 2341 if(last=='S'){S+=Tools.max(1, current);} 2342 current=0; 2343 last=b; 2344 } 2345 } 2346 if(last=='S'){S+=Tools.max(1, current);} 2347 // assert(S==0) : S+"\t"+new String(match); 2348 return S; 2349 // int x=0; 2350 // assert(match!=null); 2351 // for(int i=0; i<match.length; i++){ 2352 // byte b=match[i]; 2353 // if(b=='S'){x++;} 2354 // assert(!Tools.isDigit(b)); 2355 // } 2356 // return x; 2357 } 2358 countVars(byte[] match)2359 public static int countVars(byte[] match){ 2360 return countVars(match, true, true, true); 2361 } 2362 countVars(byte[] match, boolean sub, boolean ins, boolean del)2363 public static int countVars(byte[] match, boolean sub, boolean ins, boolean del){ 2364 int S=0, I=0, D=0; 2365 int current=0; 2366 byte last='?'; 2367 for(byte b : match){ 2368 if(Tools.isDigit(b)){ 2369 current=current*10+b-'0'; 2370 }else{ 2371 if(last=='S'){S+=Tools.max(1, current);} 2372 else if(last=='I'){I++;} 2373 else if(last=='D'){D++;} 2374 current=0; 2375 last=b; 2376 } 2377 } 2378 if(last=='S'){S+=Tools.max(1, current);} 2379 else if(last=='I'){I++;} 2380 else if(last=='D'){D++;} 2381 return (sub ? S : 0)+(ins ? I : 0)+(del ? D : 0); 2382 } 2383 containsSubs(byte[] match)2384 public static boolean containsSubs(byte[] match){ 2385 int x=0; 2386 assert(match!=null); 2387 for(int i=0; i<match.length; i++){ 2388 byte b=match[i]; 2389 if(b=='S'){return true;} 2390 } 2391 return false; 2392 } 2393 containsVars(byte[] match)2394 public static boolean containsVars(byte[] match){ 2395 int x=0; 2396 assert(match!=null); 2397 for(int i=0; i<match.length; i++){ 2398 byte b=match[i]; 2399 if(b=='S' || b=='I' || b=='D'){return true;} 2400 } 2401 return false; 2402 } 2403 countNocalls(byte[] match)2404 public static int countNocalls(byte[] match){ 2405 int n=0; 2406 for(int i=0; i<match.length; i++){ 2407 byte b=match[i]; 2408 if(b=='N'){n++;} 2409 } 2410 return n; 2411 } 2412 countInsertions(byte[] match)2413 public static int countInsertions(byte[] match){ 2414 int n=0; 2415 for(int i=0; i<match.length; i++){ 2416 byte b=match[i]; 2417 if(b=='I'){n++;} 2418 } 2419 return n; 2420 } 2421 countDeletions(byte[] match)2422 public static int countDeletions(byte[] match){ 2423 int n=0; 2424 for(int i=0; i<match.length; i++){ 2425 byte b=match[i]; 2426 if(b=='D'){n++;} 2427 } 2428 return n; 2429 } 2430 countInsertionEvents(byte[] match)2431 public static int countInsertionEvents(byte[] match){ 2432 int n=0; 2433 byte prev='N'; 2434 for(int i=0; i<match.length; i++){ 2435 byte b=match[i]; 2436 if(b=='I' && prev!=b){n++;} 2437 prev=b; 2438 } 2439 return n; 2440 } 2441 countDeletionEvents(byte[] match)2442 public static int countDeletionEvents(byte[] match){ 2443 int n=0; 2444 byte prev='N'; 2445 for(int i=0; i<match.length; i++){ 2446 byte b=match[i]; 2447 if(b=='D' && prev!=b){n++;} 2448 prev=b; 2449 } 2450 return n; 2451 } 2452 containsNonACGTN()2453 public boolean containsNonACGTN(){ 2454 if(bases==null){return false;} 2455 for(byte b : bases){ 2456 if(AminoAcid.baseToNumberACGTN[b]<0){return true;} 2457 } 2458 return false; 2459 } 2460 containsUndefined()2461 public boolean containsUndefined(){ 2462 if(bases==null){return false;} 2463 final byte[] symbolToNumber=AminoAcid.symbolToNumber(amino()); 2464 for(byte b : bases){ 2465 if(symbolToNumber[b]<0){return true;} 2466 } 2467 return false; 2468 } 2469 containsLowercase()2470 public boolean containsLowercase(){ 2471 if(bases==null){return false;} 2472 for(byte b : bases){ 2473 if(Tools.isLowerCase(b)){return true;} 2474 } 2475 return false; 2476 } 2477 countUndefined()2478 public int countUndefined(){ 2479 if(bases==null){return 0;} 2480 final byte[] symbolToNumber=AminoAcid.symbolToNumber(amino()); 2481 int n=0; 2482 for(byte b : bases){ 2483 if(symbolToNumber[b]<0){n++;} 2484 } 2485 return n; 2486 } 2487 hasMinConsecutiveBases(final int min)2488 public boolean hasMinConsecutiveBases(final int min){ 2489 if(bases==null){return min<=0;} 2490 final byte[] symbolToNumber=AminoAcid.symbolToNumber(amino()); 2491 int len=0; 2492 for(byte b : bases){ 2493 if(symbolToNumber[b]<0){len=0;} 2494 else{ 2495 len++; 2496 if(len>=min){return true;} 2497 } 2498 } 2499 return false; 2500 } 2501 2502 2503 /** 2504 * @return The number of occurrences of the rarest base. 2505 */ minBaseCount()2506 public int minBaseCount(){ 2507 if(bases==null){return 0;} 2508 int a=0, c=0, g=0, t=0; 2509 for(byte b : bases){ 2510 if(b=='A'){a++;} 2511 else if(b=='C'){c++;} 2512 else if(b=='G'){g++;} 2513 else if(b=='T'){t++;} 2514 } 2515 return Tools.min(a, c, g, t); 2516 } 2517 containsXY()2518 public boolean containsXY(){ 2519 assert(match!=null && valid()); 2520 return containsXY(match); 2521 } 2522 containsXY(byte[] match)2523 public static boolean containsXY(byte[] match){ 2524 if(match==null){return false;} 2525 for(int i=0; i<match.length; i++){ 2526 byte b=match[i]; 2527 if(b=='X' || b=='Y'){return true;} 2528 } 2529 return false; 2530 } 2531 containsXY2()2532 public boolean containsXY2(){ 2533 if(match==null || match.length<1){return false;} 2534 boolean b=(match[0]=='X' || match[match.length-1]=='Y'); 2535 assert(!valid() || b==containsXY()); 2536 return b; 2537 } 2538 containsXYC()2539 public boolean containsXYC(){ 2540 if(match==null || match.length<1){return false;} 2541 boolean b=(match[0]=='X' || match[match.length-1]=='Y'); 2542 assert(!valid() || b==containsXY()); 2543 return b || match[0]=='C' || match[match.length-1]=='C'; 2544 } 2545 2546 /** Replaces 'B' in match string with 'S', 'm', or 'N' */ fixMatchB()2547 public boolean fixMatchB(){ 2548 assert(match!=null); 2549 final ChromosomeArray ca; 2550 if(Data.GENOME_BUILD>=0){ 2551 ca=Data.getChromosome(chrom); 2552 }else{ 2553 ca=null; 2554 } 2555 boolean originallyShort=shortmatch(); 2556 if(originallyShort){match=toLongMatchString(match);} 2557 int mloc=0, cloc=0, rloc=start; 2558 for(; mloc<match.length; mloc++){ 2559 byte m=match[mloc]; 2560 2561 if(m=='B'){ 2562 byte r=(ca==null ? (byte)'?' : ca.get(rloc)); 2563 byte c=bases[cloc]; 2564 if(r=='N' || c=='N'){ 2565 match[mloc]='N'; 2566 }else if(r==c || Tools.toUpperCase(r)==Tools.toUpperCase(c)){ 2567 match[mloc]='m'; 2568 }else{ 2569 if(ca==null){ 2570 if(originallyShort){ 2571 match=toShortMatchString(match); 2572 } 2573 for(int i=0; i<match.length; i++){ 2574 if(match[i]=='B'){match[i]='N';} 2575 } 2576 return false; 2577 } 2578 match[mloc]='S'; 2579 } 2580 cloc++; 2581 rloc++; 2582 }else if(m=='m' || m=='S' || m=='N' || m=='s' || m=='C'){ 2583 cloc++; 2584 rloc++; 2585 }else if(m=='D'){ 2586 rloc++; 2587 }else if(m=='I' || m=='X' || m=='Y'){ 2588 cloc++; 2589 } 2590 } 2591 if(originallyShort){match=toShortMatchString(match);} 2592 return true; 2593 } 2594 expectedTipErrors(boolean countUndefined, int maxBases)2595 public float expectedTipErrors(boolean countUndefined, int maxBases){ 2596 return expectedTipErrors(bases, quality, countUndefined, maxBases); 2597 } 2598 expectedErrorsIncludingMate(boolean countUndefined)2599 public float expectedErrorsIncludingMate(boolean countUndefined){ 2600 float a=expectedErrors(countUndefined, length()); 2601 float b=(mate==null ? 0 : mate.expectedErrors(countUndefined, mate.length())); 2602 return a+b; 2603 } 2604 expectedErrors(boolean countUndefined, int maxBases)2605 public float expectedErrors(boolean countUndefined, int maxBases){ 2606 return expectedErrors(bases, quality, countUndefined, maxBases); 2607 } 2608 probabilityErrorFree(byte[] bases, byte[] quality, boolean countUndefined, int maxBases)2609 public static float probabilityErrorFree(byte[] bases, byte[] quality, boolean countUndefined, int maxBases){ 2610 if(quality==null){return 0;} 2611 final int limit=(maxBases<1 ? quality.length : Tools.min(maxBases, quality.length)); 2612 final float[] array=QualityTools.PROB_CORRECT; 2613 assert(array[0]>0 && array[0]<1); 2614 float product=1; 2615 for(int i=0; i<limit; i++){ 2616 byte b=bases[i]; 2617 byte q=quality[i]; 2618 if(AminoAcid.isFullyDefined(b)){ 2619 product*=array[q]; 2620 }else if(countUndefined){ 2621 return 0; 2622 } 2623 } 2624 return product; 2625 } 2626 expectedErrors(byte[] bases, byte[] quality, boolean countUndefined, int maxBases)2627 public static float expectedErrors(byte[] bases, byte[] quality, boolean countUndefined, int maxBases){ 2628 if(quality==null){return 0;} 2629 final int limit=(maxBases<1 ? quality.length : Tools.min(maxBases, quality.length)); 2630 final float[] array=QualityTools.PROB_ERROR; 2631 assert(array[0]>0 && array[0]<1); 2632 float sum=0; 2633 for(int i=0; i<limit; i++){ 2634 byte b=bases[i]; 2635 boolean d=AminoAcid.isFullyDefined(b); 2636 //assert((quality[i]==0)==d) : "Q="+quality[i]+" for base "+(char)b; 2637 if(d || countUndefined){ 2638 byte q=(d ? quality[i] : 0); 2639 sum+=array[q]; 2640 } 2641 } 2642 return sum; 2643 } 2644 2645 /** Runs backwards instead of forwards */ expectedTipErrors(byte[] bases, byte[] quality, boolean countUndefined, int maxBases)2646 public static float expectedTipErrors(byte[] bases, byte[] quality, boolean countUndefined, int maxBases){ 2647 if(quality==null){return 0;} 2648 final int limit; 2649 { 2650 final int limit0=(maxBases<1 ? quality.length : Tools.min(maxBases, quality.length)); 2651 limit=quality.length-limit0; 2652 } 2653 final float[] array=QualityTools.PROB_ERROR; 2654 assert(array[0]>0 && array[0]<1); 2655 float sum=0; 2656 for(int i=quality.length-1; i>=limit; i--){ 2657 byte b=bases[i]; 2658 byte q=quality[i]; 2659 if(AminoAcid.isFullyDefined(b)){ 2660 sum+=array[q]; 2661 }else{ 2662 assert(q==0); 2663 if(countUndefined){sum+=0.75f;} 2664 } 2665 } 2666 return sum; 2667 } 2668 estimateErrors()2669 public int estimateErrors() { 2670 if(quality==null){return 0;} 2671 assert(match!=null) : this.toText(false); 2672 2673 int count=0; 2674 for(int ci=0, mi=0; ci<bases.length && mi<match.length; mi++){ 2675 2676 // byte b=bases[ci]; 2677 byte q=quality[ci]; 2678 byte m=match[mi]; 2679 if(m=='m' || m=='s' || m=='N'){ 2680 ci++; 2681 }else if(m=='X' || m=='Y'){ 2682 ci++; 2683 count++; 2684 }else if(m=='I'){ 2685 ci++; 2686 }else if(m=='D'){ 2687 2688 }else if(m=='S'){ 2689 ci++; 2690 if(q<19){ 2691 count++; 2692 } 2693 } 2694 2695 } 2696 return count; 2697 } 2698 2699 /** {M, S, D, I, N, splice} */ countErrors(int minSplice)2700 public int[] countErrors(int minSplice) { 2701 assert(match!=null) : this.toText(false); 2702 int m=0; 2703 int s=0; 2704 int d=0; 2705 int i=0; 2706 int n=0; 2707 int splice=0; 2708 2709 byte prev=' '; 2710 int streak=0; 2711 minSplice=Tools.max(minSplice, 1); 2712 2713 for(int pos=0; pos<match.length; pos++){ 2714 final byte b=match[pos]; 2715 2716 if(b==prev){streak++;}else{streak=1;} 2717 2718 if(b=='m'){ 2719 m++; 2720 }else if(b=='N' || b=='C'){ 2721 n++; 2722 }else if(b=='X' || b=='Y'){ 2723 i++; 2724 }else if(b=='I'){ 2725 i++; 2726 }else if(b=='D'){ 2727 d++; 2728 if(streak==minSplice){splice++;} 2729 }else if(b=='S'){ 2730 s++; 2731 }else{ 2732 if(Tools.isDigit(b) && shortmatch()){ 2733 System.err.println("Warning! Found read in shortmatch form during countErrors():\n"+this); //Usually caused by verbose output. 2734 if(mate!=null){System.err.println("mate:\n"+mate.id+"\t"+new String(mate.bases));} 2735 System.err.println("Stack trace: "); 2736 new Exception().printStackTrace(); 2737 match=toLongMatchString(match); 2738 setShortMatch(false); 2739 return countErrors(minSplice); 2740 }else{ 2741 throw new RuntimeException("\nUnknown symbol "+(char)b+":\n"+new String(match)+"\n"+this+"\nshortmatch="+this.shortmatch()); 2742 } 2743 } 2744 2745 prev=b; 2746 } 2747 2748 // assert(i==0) : i+"\n"+this+"\n"+new String(match)+"\n"+Arrays.toString(new int[] {m, s, d, i, n, splice}); 2749 2750 return new int[] {m, s, d, i, n, splice}; 2751 } 2752 isShortMatchString(byte[] match)2753 public static boolean isShortMatchString(byte[] match){ 2754 byte last=' '; 2755 int streak=0; 2756 for(int i=0; i<match.length; i++){ 2757 byte b=match[i]; 2758 if(Tools.isDigit(b)){return true;} 2759 if(b==last){ 2760 streak++; 2761 if(streak>3){return true;} 2762 }else{ 2763 streak=0; 2764 last=b; 2765 } 2766 } 2767 return false; 2768 } 2769 toShortMatchString(boolean doAssertion)2770 public void toShortMatchString(boolean doAssertion){ 2771 if(shortmatch()){ 2772 assert(!doAssertion); 2773 return; 2774 } 2775 match=toShortMatchString(match); 2776 setShortMatch(true); 2777 } 2778 toShortMatchString(byte[] match)2779 public static byte[] toShortMatchString(byte[] match){ 2780 if(match==null){return null;} 2781 assert(match.length>0); 2782 ByteBuilder sb=new ByteBuilder(10); 2783 2784 byte prev=match[0]; 2785 int count=1; 2786 for(int i=1; i<match.length; i++){ 2787 byte m=match[i]; 2788 assert(Tools.isLetter(m) || m==0) : new String(match); 2789 if(m==0){System.err.println("Warning! Converting empty match string to short form.");} 2790 if(m==prev){count++;} 2791 else{ 2792 sb.append(prev); 2793 if(count>1){sb.append(count);} 2794 // else if(count==2){sb.append(prev);} 2795 prev=m; 2796 count=1; 2797 } 2798 } 2799 sb.append(prev); 2800 if(count>1){sb.append(count);} 2801 // else if(count==2){sb.append(prev);} 2802 2803 byte[] r=sb.toBytes(); 2804 return r; 2805 } 2806 toLongMatchString(boolean doAssertion)2807 public void toLongMatchString(boolean doAssertion){ 2808 if(!shortmatch()){ 2809 assert(!doAssertion); 2810 return; 2811 } 2812 match=toLongMatchString(match); 2813 setShortMatch(false); 2814 } 2815 toLongMatchString(byte[] shortmatch)2816 public static byte[] toLongMatchString(byte[] shortmatch){ 2817 if(shortmatch==null){return null;} 2818 assert(shortmatch.length>0); 2819 2820 int count=0; 2821 int current=0; 2822 for(int i=0; i<shortmatch.length; i++){ 2823 byte m=shortmatch[i]; 2824 if(Tools.isLetter(m)){ 2825 count++; 2826 count+=(current>0 ? current-1 : 0); 2827 current=0; 2828 }else{ 2829 assert(Tools.isDigit(m)); 2830 current=(current*10)+(m-48); //48 == '0' 2831 } 2832 } 2833 count+=(current>0 ? current-1 : 0); 2834 2835 2836 byte[] r=new byte[count]; 2837 current=0; 2838 byte lastLetter='?'; 2839 int j=0; 2840 for(int i=0; i<shortmatch.length; i++){ 2841 byte m=shortmatch[i]; 2842 if(Tools.isLetter(m)){ 2843 while(current>1){ 2844 r[j]=lastLetter; 2845 current--; 2846 j++; 2847 } 2848 current=0; 2849 2850 r[j]=m; 2851 j++; 2852 lastLetter=m; 2853 }else{ 2854 assert(Tools.isDigit(m)); 2855 current=(current*10)+(m-48); //48 == '0' 2856 } 2857 } 2858 while(current>1){ 2859 r[j]=lastLetter; 2860 current--; 2861 j++; 2862 } 2863 2864 assert(r[r.length-1]>0); 2865 return r; 2866 } 2867 parseCustomRname()2868 public String parseCustomRname(){ 2869 assert(id.startsWith("SYN")) : "Can't parse header "+id; 2870 return new Header(id, pairnum()).rname; 2871 } 2872 2873 /** Bases of the read. */ 2874 public byte[] bases; 2875 2876 /** Quality of the read. */ 2877 public byte[] quality; 2878 2879 /** Alignment string. E.G. mmmmDDDmmm would have 4 matching bases, then a 3-base deletion, then 3 matching bases. */ 2880 public byte[] match; 2881 2882 public int[] gaps; 2883 name()2884 public String name() {return id;} 2885 public String id; 2886 public long numericID; 2887 public int chrom; 2888 public int start; 2889 public int stop; 2890 2891 public int copies=1; 2892 2893 /** Errors detected */ 2894 public int errors=0; 2895 2896 /** Alignment score from BBMap. Assumed to max at approx 100*bases.length */ 2897 public int mapScore=0; 2898 2899 public ArrayList<SiteScore> sites; 2900 public SiteScore originalSite; //Origin site for synthetic reads 2901 public Object obj=null;//Don't set this to a SamLine; it's for other things. 2902 public SamLine samline=null; 2903 public Read mate; 2904 2905 public int flags; 2906 2907 /** -1 if invalid. TODO: Currently not retained through most processes. */ 2908 private int insert=-1; 2909 2910 /** A random number for deterministic usage. 2911 * May decrease speed in multithreaded applications. 2912 */ 2913 public double rand=-1; 2914 time()2915 public long time(){ 2916 assert(obj!=null && obj.getClass()==Long.class) : obj; 2917 return ((Long)obj).longValue(); 2918 } 2919 /** Number of bases in this pair, including the mate if present. */ pairLength()2920 public int pairLength(){return length()+mateLength();} 2921 /** Number of bases in this pair, including the mate if present. */ numPairKmers(int k)2922 public int numPairKmers(int k){return numKmers(k)+numMateKmers(k);} 2923 /** Number of reads in this pair. Returns 1 if the read has no mate, and 2 if it does. */ pairCount()2924 public int pairCount(){return 1+mateCount();} length()2925 public int length(){return bases==null ? 0 : bases.length;} numKmers(int k)2926 public int numKmers(int k){return bases==null ? 0 : Tools.max(0, bases.length-k+1);} qlength()2927 public int qlength(){return quality==null ? 0 : quality.length;} mateLength()2928 public int mateLength(){return mate==null ? 0 : mate.length();} numMateKmers(int k)2929 public int numMateKmers(int k){return mate==null ? 0 : mate.numKmers(k);} mateId()2930 public String mateId(){return mate==null ? null : mate.id;} mateCount()2931 public int mateCount(){return mate==null ? 0 : 1;} mateMapped()2932 public boolean mateMapped(){return mate==null ? false : mate.mapped();} countMateBytes()2933 public long countMateBytes(){return mate==null ? 0 : mate.countBytes();} countMateFastqBytes()2934 public long countMateFastqBytes(){return mate==null ? 0 : mate.countFastqBytes();} 2935 /** Number of bytes this read pair uses in memory, approximately */ countPairBytes()2936 public long countPairBytes(){return countBytes()+(mate==null ? 0 : mate.countBytes());} 2937 2938 /** Number of bytes this read uses in memory, approximately */ countBytes()2939 public long countBytes(){ 2940 long sum=129; //Approximate per-read overhead 2941 sum+=(bases==null ? 0 : bases.length+16); 2942 sum+=(quality==null ? 0 : quality.length+16); 2943 sum+=(id==null ? 0 : id.length()*2+16); 2944 sum+=(match==null ? 0 : match.length+16); 2945 sum+=(samline==null ? 0 : samline.countBytes()); 2946 return sum; 2947 } 2948 2949 /** Number of bytes this read uses in on disk in Fastq format */ countFastqBytes()2950 public long countFastqBytes(){ 2951 long sum=6;//4 newlines, +, @ 2952 sum+=(bases==null ? 0 : bases.length); 2953 sum+=(quality==null ? 0 : quality.length); 2954 sum+=(id==null ? 0 : id.length()); 2955 return sum; 2956 } 2957 countLeft(final char base)2958 public int countLeft(final char base){return countLeft((byte)base);} countRight(final char base)2959 public int countRight(final char base){return countRight((byte)base);} 2960 countLeft(final byte base)2961 public int countLeft(final byte base){ 2962 for(int i=0; i<bases.length; i++){ 2963 final byte b=bases[i]; 2964 if(b!=base){return i;} 2965 } 2966 return bases.length; 2967 } 2968 countRight(final byte base)2969 public int countRight(final byte base){ 2970 for(int i=bases.length-1; i>=0; i--){ 2971 final byte b=bases[i]; 2972 if(b!=base){return bases.length-i-1;} 2973 } 2974 return bases.length; 2975 } 2976 untrim()2977 public boolean untrim(){ 2978 if(obj==null || obj.getClass()!=TrimRead.class){return false;} 2979 ((TrimRead)obj).untrim(); 2980 obj=null; 2981 return true; 2982 } 2983 trailingLowerCase()2984 public int trailingLowerCase(){ 2985 for(int i=bases.length-1; i>=0;){ 2986 if(Tools.isLowerCase(bases[i])){ 2987 i--; 2988 }else{ 2989 return bases.length-i-1; 2990 } 2991 } 2992 return bases.length; 2993 } leadingLowerCase()2994 public int leadingLowerCase(){ 2995 for(int i=0; i<bases.length; i++){ 2996 if(!Tools.isLowerCase(bases[i])){return i;} 2997 } 2998 return bases.length; 2999 } 3000 strandChar()3001 public char strandChar(){return Shared.strandCodes2[strand()];} strand()3002 public byte strand(){return (byte)(flags&1);} mapped()3003 public boolean mapped(){return (flags&MAPPEDMASK)==MAPPEDMASK;} paired()3004 public boolean paired(){return (flags&PAIREDMASK)==PAIREDMASK;} synthetic()3005 public boolean synthetic(){return (flags&SYNTHMASK)==SYNTHMASK;} ambiguous()3006 public boolean ambiguous(){return (flags&AMBIMASK)==AMBIMASK;} perfect()3007 public boolean perfect(){return (flags&PERFECTMASK)==PERFECTMASK;} 3008 // public boolean semiperfect(){return perfect() ? true : list!=null && list.size()>0 ? list.get(0).semiperfect : false;} //TODO: This is a hack. Add a semiperfect flag. rescued()3009 public boolean rescued(){return (flags&RESCUEDMASK)==RESCUEDMASK;} discarded()3010 public boolean discarded(){return (flags&DISCARDMASK)==DISCARDMASK;} invalid()3011 public boolean invalid(){return (flags&INVALIDMASK)==INVALIDMASK;} swapped()3012 public boolean swapped(){return (flags&SWAPMASK)==SWAPMASK;} shortmatch()3013 public boolean shortmatch(){return (flags&SHORTMATCHMASK)==SHORTMATCHMASK;} insertvalid()3014 public boolean insertvalid(){return (flags&INSERTMASK)==INSERTMASK;} hasAdapter()3015 public boolean hasAdapter(){return (flags&ADAPTERMASK)==ADAPTERMASK;} secondary()3016 public boolean secondary(){return (flags&SECONDARYMASK)==SECONDARYMASK;} aminoacid()3017 public boolean aminoacid(){return (flags&AAMASK)==AAMASK;} amino()3018 public boolean amino(){return (flags&AAMASK)==AAMASK;} junk()3019 public boolean junk(){return (flags&JUNKMASK)==JUNKMASK;} validated()3020 public boolean validated(){return (flags&VALIDATEDMASK)==VALIDATEDMASK;} tested()3021 public boolean tested(){return (flags&TESTEDMASK)==TESTEDMASK;} invertedRepeat()3022 public boolean invertedRepeat(){return (flags&IRMASK)==IRMASK;} trimmed()3023 public boolean trimmed(){return (flags&TRIMMEDMASK)==TRIMMEDMASK;} 3024 3025 /** For paired ends: 0 for read1, 1 for read2 */ pairnum()3026 public int pairnum(){return (flags&PAIRNUMMASK)>>PAIRNUMSHIFT;} valid()3027 public boolean valid(){return !invalid();} 3028 getFlag(int mask)3029 public boolean getFlag(int mask){return (flags&mask)==mask;} flagToNumber(int mask)3030 public int flagToNumber(int mask){return (flags&mask)==mask ? 1 : 0;} 3031 setFlag(int mask, boolean b)3032 public void setFlag(int mask, boolean b){ 3033 flags=(flags&~mask); 3034 if(b){flags|=mask;} 3035 } 3036 setStrand(int b)3037 public void setStrand(int b){ 3038 assert(b==1 || b==0); 3039 flags=(flags&(~1))|b; 3040 } 3041 3042 /** For paired ends: 0 for read1, 1 for read2 */ setPairnum(int b)3043 public void setPairnum(int b){ 3044 // System.err.println("Setting pairnum to "+b+" for "+id); 3045 // assert(!id.equals("2_chr1_0_1853883_1853982_1845883_ecoli_K12") || b==1); 3046 assert(b==1 || b==0); 3047 flags=(flags&(~PAIRNUMMASK))|(b<<PAIRNUMSHIFT); 3048 // assert(pairnum()==b); 3049 } 3050 setPaired(boolean b)3051 public void setPaired(boolean b){ 3052 flags=(flags&~PAIREDMASK); 3053 if(b){flags|=PAIREDMASK;} 3054 } 3055 setSynthetic(boolean b)3056 public void setSynthetic(boolean b){ 3057 flags=(flags&~SYNTHMASK); 3058 if(b){flags|=SYNTHMASK;} 3059 } 3060 setAmbiguous(boolean b)3061 public void setAmbiguous(boolean b){ 3062 flags=(flags&~AMBIMASK); 3063 if(b){flags|=AMBIMASK;} 3064 } 3065 setPerfectFlag(int maxScore)3066 public boolean setPerfectFlag(int maxScore){ 3067 final SiteScore ss=topSite(); 3068 if(ss==null){ 3069 setPerfect(false); 3070 }else{ 3071 assert(ss.slowScore<=maxScore) : maxScore+", "+ss.slowScore+", "+ss.toText(); 3072 3073 if(ss.slowScore==maxScore || ss.perfect){ 3074 assert(testMatchPerfection(true)) : "\n"+ss+"\n"+maxScore+"\n"+this+"\n"+mate+"\n"; 3075 setPerfect(true); 3076 }else{ 3077 boolean flag=testMatchPerfection(false); 3078 setPerfect(flag); 3079 assert(flag || !ss.perfect) : "flag="+flag+", ss.perfect="+ss.perfect+"\nmatch="+new String(match)+"\n"+this.toText(false); 3080 assert(!flag || ss.slowScore>=maxScore) : "\n"+ss+"\n"+maxScore+"\n"+this+"\n"+mate+"\n"; 3081 } 3082 } 3083 return perfect(); 3084 } 3085 testMatchPerfection(boolean returnIfNoMatch)3086 private boolean testMatchPerfection(boolean returnIfNoMatch){ 3087 if(match==null){return returnIfNoMatch;} 3088 boolean flag=(match.length==bases.length); 3089 if(shortmatch()){ 3090 flag=(match.length==0 || match[0]=='m'); 3091 for(int i=0; i<match.length && flag; i++){flag=(match[i]=='m' || Tools.isDigit(match[i]));} 3092 }else{ 3093 for(int i=0; i<match.length && flag; i++){flag=(match[i]=='m');} 3094 } 3095 for(int i=0; i<bases.length && flag; i++){flag=(bases[i]!='N');} 3096 return flag; 3097 } 3098 3099 /** 3100 * @return GC fraction 3101 */ gc()3102 public float gc() { 3103 if(bases==null || bases.length<1){return 0;} 3104 int at=0, gc=0; 3105 for(byte b : bases){ 3106 int x=AminoAcid.baseToNumber[b]; 3107 if(x>-1){ 3108 if(x==0 || x==3){at++;} 3109 else{gc++;} 3110 } 3111 } 3112 if(gc<1){return 0;} 3113 return gc*1f/(at+gc); 3114 } 3115 3116 /** 3117 * @param swapFrom 3118 * @param swapTo 3119 * @return number of swaps 3120 */ swapBase(byte swapFrom, byte swapTo)3121 public int swapBase(byte swapFrom, byte swapTo) { 3122 if(bases==null){return 0;} 3123 int swaps=0; 3124 for(int i=0; i<bases.length; i++){ 3125 if(bases[i]==swapFrom){ 3126 bases[i]=swapTo; 3127 swaps++; 3128 } 3129 } 3130 return swaps; 3131 } 3132 3133 /** 3134 * @param remap Table of new values 3135 */ remap(byte[] remap)3136 public void remap(byte[] remap) { 3137 if(bases==null){return;} 3138 for(int i=0; i<bases.length; i++){ 3139 bases[i]=remap[bases[i]]; 3140 } 3141 } 3142 3143 /** 3144 * @param remap Table of new values 3145 */ remapAndCount(byte[] remap)3146 public int remapAndCount(byte[] remap) { 3147 if(bases==null){return 0;} 3148 int swaps=0; 3149 for(int i=0; i<bases.length; i++){ 3150 byte a=bases[i]; 3151 byte b=remap[a]; 3152 if(a!=b){ 3153 bases[i]=b; 3154 swaps++; 3155 } 3156 } 3157 return swaps; 3158 } 3159 convertUndefinedTo(byte b)3160 public int convertUndefinedTo(byte b){ 3161 if(bases==null){return 0;} 3162 int changed=0; 3163 for(int i=0; i<bases.length; i++){ 3164 if(b<0 || AminoAcid.baseToNumberACGTN[bases[i]]<0){ 3165 changed++; 3166 bases[i]=b; 3167 if(quality!=null){quality[i]=0;} 3168 } 3169 } 3170 return changed; 3171 } 3172 swapBasesWithMate()3173 public void swapBasesWithMate(){ 3174 if(mate==null){ 3175 assert(false); 3176 return; 3177 } 3178 byte[] temp=bases; 3179 bases=mate.bases; 3180 mate.bases=temp; 3181 temp=quality; 3182 quality=mate.quality; 3183 mate.quality=temp; 3184 } 3185 insert()3186 public int insert(){ 3187 return insertvalid() ? insert : -1; 3188 } 3189 insertSizeMapped(boolean ignoreStrand)3190 public int insertSizeMapped(boolean ignoreStrand){ 3191 return insertSizeMapped(this, mate, ignoreStrand); 3192 } 3193 insertSizeMapped(Read r1, Read r2, boolean ignoreStrand)3194 public static int insertSizeMapped(Read r1, Read r2, boolean ignoreStrand){ 3195 // assert(false) : ignoreStrand+", "+(r2==null)+", "+(r1.mapped())+", "+(r2.mapped())+", "+(r1.strand()==r2.strand())+", "+r1.strand()+", "+r2.strand(); 3196 if(ignoreStrand || r2==null || !r1.mapped() || !r2.mapped() || r1.strand()==r2.strand()){return insertSizeMapped_Unstranded(r1, r2);} 3197 return insertSizeMapped_PlusLeft(r1, r2); 3198 } 3199 3200 /** TODO: This is not correct when the insert is shorter than a read's bases with same-strand reads */ insertSizeMapped_PlusLeft(Read r1, Read r2)3201 public static int insertSizeMapped_PlusLeft(Read r1, Read r2){ 3202 if(r1.strand()>r2.strand()){return insertSizeMapped_PlusLeft(r2, r1);} 3203 if(r1.strand()==r2.strand() || r1.start>r2.stop){return insertSizeMapped_Unstranded(r2, r1);} //So r1 is always on the left. 3204 // if(!mapped() || !mate.mapped()){return 0;} 3205 if(r1.chrom!=r2.chrom){return 0;} 3206 if(r1.start==r1.stop || r2.start==r2.stop){return 0;} //??? 3207 3208 int a=r1.length(); 3209 int b=r2.length(); 3210 int mid=r2.start-r1.stop-1; 3211 if(-mid>=a+b){return insertSizeMapped_Unstranded(r1, r2);} //Not properly oriented; plus read is to the right of minus read 3212 return mid+a+b; 3213 } 3214 insertSizeMapped_Unstranded(Read r1, Read r2)3215 public static int insertSizeMapped_Unstranded(Read r1, Read r2){ 3216 if(r2==null){return r1.start==r1.stop ? 0 : r1.stop-r1.start+1;} 3217 3218 if(r1.start>r2.start){return insertSizeMapped_Unstranded(r2, r1);} //So r1 is always on the left side. 3219 3220 // if(!mapped() || !mate.mapped()){return 0;} 3221 if(r1.start==r1.stop || r2.start==r2.stop){return 0;} //??? 3222 3223 if(r1.chrom!=r2.chrom){return 0;} 3224 int a=r1.length(); 3225 int b=r2.length(); 3226 if(false && Tools.overlap(r1.start, r1.stop, r2.start, r2.stop)){ 3227 //This does not handle very short inserts 3228 return Tools.max(r1.stop, r2.stop)-Tools.min(r1.start, r2.start)+1; 3229 3230 }else{ 3231 if(r1.start<r2.start){ 3232 int mid=r2.start-r1.stop-1; 3233 // assert(false) : mid+", "+a+", "+b; 3234 // if(-mid>a && -mid>b){return Tools.min(a, b);} //Strange situation, no way to guess insert size 3235 if(-mid>=a+b){return 0;} //Strange situation, no way to guess insert size 3236 return mid+a+b; 3237 }else{ 3238 assert(r1.start==r2.start); 3239 return Tools.min(a, b); 3240 } 3241 } 3242 } 3243 insertSizeOriginalSite()3244 public int insertSizeOriginalSite(){ 3245 if(mate==null){ 3246 // System.err.println("A: "+(originalSite==null ? "null" : (originalSite.stop-originalSite.start+1))); 3247 return (originalSite==null ? -1 : originalSite.stop-originalSite.start+1); 3248 } 3249 3250 final SiteScore ssa=originalSite, ssb=mate.originalSite; 3251 final int x; 3252 if(ssa==null || ssb==null){ 3253 // System.err.println("B: 0"); 3254 x=0; 3255 }else{ 3256 x=insertSize(ssa, ssb, bases.length, mate.length()); 3257 } 3258 3259 assert(pairnum()>=mate.pairnum() || x==mate.insertSizeOriginalSite()); 3260 return x; 3261 } 3262 insertSize(SiteScore ssa, SiteScore ssb, int lena, int lenb)3263 public static int insertSize(SiteScore ssa, SiteScore ssb, int lena, int lenb){ 3264 return insertSize(ssa.chrom, ssb.chrom, ssa.start, ssb.start, ssa.stop, ssb.stop, lena, lenb); 3265 } 3266 insertSize(int chroma, int chromb, int starta, int startb, int stopa, int stopb, int lena, int lenb)3267 public static int insertSize(int chroma, int chromb, int starta, int startb, int stopa, int stopb, int lena, int lenb){ 3268 3269 final int x; 3270 3271 // if(mate==null || ){return bases==null ? 0 : bases.length;} 3272 if(chroma!=chromb){x=0;} 3273 else{ 3274 3275 if(Tools.overlap(starta, stopa, startb, stopb)){ 3276 x=Tools.max(stopa, stopb)-Tools.min(starta, startb)+1; 3277 // System.err.println("C: "+x); 3278 }else{ 3279 if(starta<=startb){ 3280 int mid=startb-stopa-1; 3281 // assert(false) : mid+", "+a+", "+b; 3282 x=mid+lena+lenb; 3283 // System.err.println("D: "+x); 3284 }else{ 3285 int mid=starta-stopb-1; 3286 // assert(false) : mid+", "+a+", "+b; 3287 x=mid+lena+lenb; 3288 // System.err.println("E: "+x); 3289 } 3290 } 3291 } 3292 return x; 3293 } 3294 subRead(int from, int to)3295 public Read subRead(int from, int to){ 3296 Read r=this.clone(); 3297 r.bases=KillSwitch.copyOfRange(bases, from, to); 3298 r.quality=(quality==null ? null : KillSwitch.copyOfRange(quality, from, to)); 3299 r.mate=null; 3300 // assert(Tools.indexOf(r.bases, (byte)'J')<0); 3301 return r; 3302 } 3303 joinRead()3304 public Read joinRead(){ 3305 if(insert<1 || mate==null || !insertvalid()){return this;} 3306 assert(insert>9 || bases.length<20) : "Perhaps old read format is being used? This appears to be a quality value, not an insert.\n"+this+"\n\n"+mate+"\n"; 3307 return joinRead(this, mate, insert); 3308 } 3309 joinRead(int x)3310 public Read joinRead(int x){ 3311 if(x<1 || mate==null){return this;} 3312 assert(x>9 || bases.length<20) : "Perhaps old read format is being used? This appears to be a quality value, not an insert.\n"+this+"\n\n"+mate+"\n"; 3313 return joinRead(this, mate, x); 3314 } 3315 joinRead(Read a, Read b, int insert)3316 public static Read joinRead(Read a, Read b, int insert){ 3317 assert(a!=null && b!=null && insert>0); 3318 final int lengthSum=a.length()+b.length(); 3319 final int overlap=Tools.min(insert, lengthSum-insert); 3320 3321 // System.err.println(insert); 3322 final byte[] bases=new byte[insert], abases=a.bases, bbases=b.bases; 3323 final byte[] aquals=a.quality, bquals=b.quality; 3324 final byte[] quals=(aquals==null || bquals==null ? null : new byte[insert]); 3325 assert(aquals==null || (aquals.length==abases.length && bquals.length==bbases.length)); 3326 3327 int mismatches=0; 3328 3329 int start, stop; 3330 3331 if(overlap<=0){//Simple join in which there is no overlap 3332 int lim=insert-b.length(); 3333 if(quals==null){ 3334 for(int i=0; i<a.length(); i++){ 3335 bases[i]=abases[i]; 3336 } 3337 for(int i=a.length(); i<lim; i++){ 3338 bases[i]='N'; 3339 } 3340 for(int i=0; i<b.length(); i++){ 3341 bases[i+lim]=bbases[i]; 3342 } 3343 }else{ 3344 for(int i=0; i<a.length(); i++){ 3345 bases[i]=abases[i]; 3346 quals[i]=aquals[i]; 3347 } 3348 for(int i=a.length(); i<lim; i++){ 3349 bases[i]='N'; 3350 quals[i]=0; 3351 } 3352 for(int i=0; i<b.length(); i++){ 3353 bases[i+lim]=bbases[i]; 3354 quals[i+lim]=bquals[i]; 3355 } 3356 } 3357 3358 start=Tools.min(a.start, b.start); 3359 // stop=start+insert-1; 3360 stop=Tools.max(a.stop, b.stop); 3361 3362 // }else if(insert>=a.length() && insert>=b.length()){ //Overlapped join, proper orientation 3363 // final int lim1=a.length()-overlap; 3364 // final int lim2=a.length(); 3365 // for(int i=0; i<lim1; i++){ 3366 // bases[i]=abases[i]; 3367 // quals[i]=aquals[i]; 3368 // } 3369 // for(int i=lim1, j=0; i<lim2; i++, j++){ 3370 // assert(false) : "TODO"; 3371 // bases[i]='N'; 3372 // quals[i]=0; 3373 // } 3374 // for(int i=lim2, j=overlap; i<bases.length; i++, j++){ 3375 // bases[i]=bbases[j]; 3376 // quals[i]=bquals[j]; 3377 // } 3378 }else{ //reads go off ends of molecule. 3379 if(quals==null){ 3380 for(int i=0; i<a.length() && i<bases.length; i++){ 3381 bases[i]=abases[i]; 3382 } 3383 for(int i=bases.length-1, j=b.length()-1; i>=0 && j>=0; i--, j--){ 3384 byte ca=bases[i], cb=bbases[j]; 3385 if(ca==0 || ca=='N'){ 3386 bases[i]=cb; 3387 }else if(ca==cb){ 3388 }else{ 3389 bases[i]=(ca>=cb ? ca : cb); 3390 if(ca!='N' && cb!='N'){mismatches++;} 3391 } 3392 } 3393 }else{ 3394 for(int i=0; i<a.length() && i<bases.length; i++){ 3395 bases[i]=abases[i]; 3396 quals[i]=aquals[i]; 3397 } 3398 for(int i=bases.length-1, j=b.length()-1; i>=0 && j>=0; i--, j--){ 3399 byte ca=bases[i], cb=bbases[j]; 3400 byte qa=quals[i], qb=bquals[j]; 3401 if(ca==0 || ca=='N'){ 3402 bases[i]=cb; 3403 quals[i]=qb; 3404 }else if(cb==0 || cb=='N'){ 3405 //do nothing 3406 }else if(ca==cb){ 3407 quals[i]=(byte)Tools.min((Tools.max(qa, qb)+Tools.min(qa, qb)/4), MAX_MERGE_QUALITY); 3408 }else{ 3409 bases[i]=(qa>qb ? ca : qa<qb ? cb : (byte)'N'); 3410 quals[i]=(byte)(Tools.max(qa, qb)-Tools.min(qa, qb)); 3411 if(ca!='N' && cb!='N'){mismatches++;} 3412 } 3413 } 3414 } 3415 3416 if(a.strand()==0){ 3417 start=a.start; 3418 // stop=start+insert-1; 3419 stop=b.stop; 3420 }else{ 3421 stop=a.stop; 3422 // start=stop-insert+1; 3423 start=b.start; 3424 } 3425 if(start>stop){ 3426 start=Tools.min(a.start, b.start); 3427 stop=Tools.max(a.stop, b.stop); 3428 } 3429 } 3430 // assert(mismatches>=countMismatches(a, b, insert, 999)); 3431 // System.err.println(mismatches); 3432 if(a.chrom==0 || start==stop || (!a.mapped() && !a.synthetic())){start=stop=a.chrom=0;} 3433 3434 // System.err.println(bases.length+", "+start+", "+stop); 3435 3436 Read r=new Read(bases, null, a.id, a.numericID, a.flags, a.chrom, start, stop); 3437 r.quality=quals; //This prevents quality from getting capped. 3438 if(a.chrom==0 || start==stop || (!a.mapped() && !a.synthetic())){r.setMapped(true);} 3439 r.setInsert(insert); 3440 r.setPaired(false); 3441 r.copies=a.copies; 3442 r.mapScore=a.mapScore+b.mapScore; 3443 if(overlap<=0){ 3444 r.mapScore=a.mapScore+b.mapScore; 3445 r.errors=a.errors+b.errors; 3446 //TODO r.gaps=? 3447 }else{//Hard to calculate 3448 r.mapScore=(int)((a.mapScore*(long)a.length()+b.mapScore*(long)b.length())/insert); 3449 r.errors=a.errors; 3450 } 3451 3452 3453 assert(r.insertvalid()) : "\n\n"+a.toText(false)+"\n\n"+b.toText(false)+"\n\n"+r.toText(false)+"\n\n"; 3454 assert(r.insert()==r.length()) : r.insert()+"\n\n"+a.toText(false)+"\n\n"+b.toText(false)+"\n\n"+r.toText(false)+"\n\n"; 3455 // assert(false) : "\n\n"+a.toText(false)+"\n\n"+b.toText(false)+"\n\n"+r.toText(false)+"\n\n"; 3456 3457 //TODO: Triggered by BBMerge in useratio mode for some reason. 3458 // assert(Shared.anomaly || (a.insertSizeMapped(false)>0 == r.insertSizeMapped(false)>0)) : 3459 // "\n"+r.length()+"\n"+r.insert()+"\n"+r.insertSizeMapped(false)+"\n"+a.insert()+"\n"+a.insertSizeMapped(false)+ 3460 // "\n\n"+a.toText(false)+"\n\n"+b.toText(false)+"\n\n"+r.toText(false)+"\n\n"; 3461 3462 return r; 3463 } 3464 3465 /** 3466 * @param minlen 3467 * @param maxlen 3468 * @return A list of read fragments 3469 */ split(int minlen, int maxlen)3470 public ArrayList<Read> split(int minlen, int maxlen) { 3471 int len=bases==null ? 0 : bases.length; 3472 if(len<minlen){return null;} 3473 int parts=(len+maxlen-1)/maxlen; 3474 ArrayList<Read> subreads=new ArrayList<Read>(parts); 3475 if(len<=maxlen){ 3476 subreads.add(this); 3477 }else{ 3478 float ideal=Tools.max(minlen, len/(float)parts); 3479 int actual=(int)ideal; 3480 assert(false) : "TODO"; //Some assertion goes here, I forget what 3481 for(int i=0; i<parts; i++){ 3482 int a=i*actual; 3483 int b=(i+1)*actual; 3484 if(b>bases.length){b=bases.length;} 3485 // if(b-a<) 3486 byte[] subbases=KillSwitch.copyOfRange(bases, a, b); 3487 byte[] subquals=(quality==null ? null : KillSwitch.copyOfRange(quality, a, b+1)); 3488 Read r=new Read(subbases, subquals, id+"_"+i, numericID, flags); 3489 subreads.add(r); 3490 } 3491 } 3492 return subreads; 3493 } 3494 3495 /** Generate and return an array of canonical kmers for this read */ toKmers(final int k, final int gap, long[] kmers, boolean makeCanonical, Kmer longkmer)3496 public long[] toKmers(final int k, final int gap, long[] kmers, boolean makeCanonical, Kmer longkmer) { 3497 if(gap>0){throw new RuntimeException("Gapped reads: TODO");} 3498 if(k>31){return toLongKmers(k, kmers, makeCanonical, longkmer);} 3499 if(bases==null || bases.length<k+gap){return null;} 3500 3501 final int arraylen=bases.length-k+1; 3502 if(kmers==null || kmers.length!=arraylen){kmers=new long[arraylen];} 3503 Arrays.fill(kmers, -1); 3504 3505 final int shift=2*k; 3506 final int shift2=shift-2; 3507 final long mask=(shift>63 ? -1L : ~((-1L)<<shift)); 3508 long kmer=0, rkmer=0; 3509 int len=0; 3510 3511 for(int i=0; i<bases.length; i++){ 3512 byte b=bases[i]; 3513 long x=AminoAcid.baseToNumber[b]; 3514 long x2=AminoAcid.baseToComplementNumber[b]; 3515 kmer=((kmer<<2)|x)&mask; 3516 rkmer=((rkmer>>>2)|(x2<<shift2))&mask; 3517 if(x<0){len=0; rkmer=0;}else{len++;} 3518 if(len>=k){ 3519 kmers[i-k+1]=makeCanonical ? Tools.max(kmer, rkmer) : kmer; 3520 } 3521 } 3522 return kmers; 3523 } 3524 3525 // /** Generate and return an array of canonical kmers for this read */ 3526 // public long[] toKmers(final int k, final int gap, long[] kmers, boolean makeCanonical, Kmer longkmer) { 3527 // if(gap>0){throw new RuntimeException("Gapped reads: TODO");} 3528 // if(k>31){return toLongKmers(k, kmers, makeCanonical, longkmer);} 3529 // if(bases==null || bases.length<k+gap){return null;} 3530 // 3531 // final int kbits=2*k; 3532 // final long mask=(kbits>63 ? -1L : ~((-1L)<<kbits)); 3533 // 3534 // int len=0; 3535 // long kmer=0; 3536 // final int arraylen=bases.length-k+1; 3537 // if(kmers==null || kmers.length!=arraylen){kmers=new long[arraylen];} 3538 // Arrays.fill(kmers, -1); 3539 // 3540 // for(int i=0; i<bases.length; i++){ 3541 // byte b=bases[i]; 3542 // int x=AminoAcid.baseToNumber[b]; 3543 // if(x<0){ 3544 // len=0; 3545 // kmer=0; 3546 // }else{ 3547 // kmer=((kmer<<2)|x)&mask; 3548 // len++; 3549 // 3550 // if(len>=k){ 3551 // kmers[i-k+1]=kmer; 3552 // } 3553 // } 3554 // } 3555 // 3556 //// System.out.println(new String(bases)); 3557 //// System.out.println(Arrays.toString(kmers)); 3558 // 3559 // if(makeCanonical){ 3560 // this.reverseComplement(); 3561 // len=0; 3562 // kmer=0; 3563 // for(int i=0, j=bases.length-1; i<bases.length; i++, j--){ 3564 // byte b=bases[i]; 3565 // int x=AminoAcid.baseToNumber[b]; 3566 // if(x<0){ 3567 // len=0; 3568 // kmer=0; 3569 // }else{ 3570 // kmer=((kmer<<2)|x)&mask; 3571 // len++; 3572 // 3573 // if(len>=k){ 3574 // assert(kmer==AminoAcid.reverseComplementBinaryFast(kmers[j], k)); 3575 // kmers[j]=Tools.max(kmers[j], kmer); 3576 // } 3577 // } 3578 // } 3579 // this.reverseComplement(); 3580 // 3581 //// System.out.println(Arrays.toString(kmers)); 3582 // } 3583 // 3584 // 3585 // return kmers; 3586 // } 3587 3588 /** Generate and return an array of canonical kmers for this read */ toLongKmers(final int k, long[] kmers, boolean makeCanonical, Kmer kmer)3589 public long[] toLongKmers(final int k, long[] kmers, boolean makeCanonical, Kmer kmer) { 3590 assert(k>31) : k; 3591 assert(makeCanonical); 3592 if(bases==null || bases.length<k){return null;} 3593 kmer.clear(); 3594 3595 final int arraylen=bases.length-k+1; 3596 if(kmers==null || kmers.length!=arraylen){kmers=new long[arraylen];} 3597 Arrays.fill(kmers, -1); 3598 3599 for(int i=0; i<bases.length; i++){ 3600 byte b=bases[i]; 3601 kmer.addRight(b); 3602 if(!AminoAcid.isFullyDefined(b)){kmer.clear();} 3603 if(kmer.len>=k){ 3604 kmers[i-k+1]=kmer.xor(); 3605 } 3606 } 3607 3608 return kmers; 3609 } 3610 3611 // /** Generate and return an array of canonical kmers for this read */ 3612 // public long[] toLongKmers(final int k, long[] kmers, boolean makeCanonical, Kmer longkmer) { 3613 // assert(k>31) : k; 3614 // if(bases==null || bases.length<k){return null;} 3615 // 3616 // final int kbits=2*k; 3617 // final long mask=Long.MAX_VALUE; 3618 // 3619 // int len=0; 3620 // long kmer=0; 3621 // final int arraylen=bases.length-k+1; 3622 // if(kmers==null || kmers.length!=arraylen){kmers=new long[arraylen];} 3623 // Arrays.fill(kmers, -1); 3624 // 3625 // 3626 // final int tailshift=k%32; 3627 // final int tailshiftbits=tailshift*2; 3628 // 3629 // for(int i=0; i<bases.length; i++){ 3630 // byte b=bases[i]; 3631 // int x=AminoAcid.baseToNumber[b]; 3632 // if(x<0){ 3633 // len=0; 3634 // kmer=0; 3635 // }else{ 3636 // kmer=Long.rotateLeft(kmer, 2); 3637 // kmer=kmer^x; 3638 // len++; 3639 // 3640 // if(len>=k){ 3641 // long x2=AminoAcid.baseToNumber[bases[i-k]]; 3642 // kmer=kmer^(x2<<tailshiftbits); 3643 // kmers[i-k+1]=kmer; 3644 // } 3645 // } 3646 // } 3647 // if(makeCanonical){ 3648 // this.reverseComplement(); 3649 // len=0; 3650 // kmer=0; 3651 // for(int i=0, j=bases.length-1; i<bases.length; i++, j--){ 3652 // byte b=bases[i]; 3653 // int x=AminoAcid.baseToNumber[b]; 3654 // if(x<0){ 3655 // len=0; 3656 // kmer=0; 3657 // }else{ 3658 // kmer=Long.rotateLeft(kmer, 2); 3659 // kmer=kmer^x; 3660 // len++; 3661 // 3662 // if(len>=k){ 3663 // long x2=AminoAcid.baseToNumber[bases[i-k]]; 3664 // kmer=kmer^(x2<<tailshiftbits); 3665 // kmers[j]=mask&(Tools.max(kmers[j], kmer)); 3666 // } 3667 // } 3668 // } 3669 // this.reverseComplement(); 3670 // }else{ 3671 // assert(false) : "Long kmers should be made canonical here because they cannot be canonicized later."; 3672 // } 3673 // 3674 // return kmers; 3675 // } 3676 CHECKSITES(Read r, byte[] basesM)3677 public static final boolean CHECKSITES(Read r, byte[] basesM){ 3678 return CHECKSITES(r.sites, r.bases, basesM, r.numericID, true); 3679 } 3680 CHECKSITES(Read r, byte[] basesM, boolean verifySorted)3681 public static final boolean CHECKSITES(Read r, byte[] basesM, boolean verifySorted){ 3682 return CHECKSITES(r.sites, r.bases, basesM, r.numericID, verifySorted); 3683 } 3684 CHECKSITES(ArrayList<SiteScore> list, byte[] basesP, byte[] basesM, long id)3685 public static final boolean CHECKSITES(ArrayList<SiteScore> list, byte[] basesP, byte[] basesM, long id){ 3686 return CHECKSITES(list, basesP, basesM, id, true); 3687 } 3688 CHECKSITES(ArrayList<SiteScore> list, byte[] basesP, byte[] basesM, long id, boolean verifySorted)3689 public static final boolean CHECKSITES(ArrayList<SiteScore> list, byte[] basesP, byte[] basesM, long id, boolean verifySorted){ 3690 return true; //Temporarily disabled 3691 // if(list==null || list.isEmpty()){return true;} 3692 // SiteScore prev=null; 3693 // for(int i=0; i<list.size(); i++){ 3694 // SiteScore ss=list.get(i); 3695 // if(ss.strand==Gene.MINUS && basesM==null && basesP!=null){basesM=AminoAcid.reverseComplementBases(basesP);} 3696 // byte[] bases=(ss.strand==Gene.PLUS ? basesP : basesM); 3697 // if(verbose){System.err.println("Checking site "+i+": "+ss);} 3698 // boolean b=CHECKSITE(ss, bases, id); 3699 // assert(b) : id+"\n"+new String(basesP)+"\n"+ss+"\n"; 3700 // if(verbose){System.err.println("Checked site "+i+" = "+ss+"\nss.p="+ss.perfect+", ss.sp="+ss.semiperfect);} 3701 // if(!b){ 3702 //// System.err.println("Error at SiteScore "+i+": ss.p="+ss.perfect+", ss.sp="+ss.semiperfect); 3703 // return false; 3704 // } 3705 // if(verifySorted && prev!=null && ss.score>prev.score){ 3706 // if(verbose){System.err.println("verifySorted failed.");} 3707 // return false; 3708 // } 3709 // prev=ss; 3710 // } 3711 // return true; 3712 } 3713 3714 /** Makes sure 'bases' is for correct strand. */ CHECKORDER(ArrayList<SiteScore> list)3715 public static final boolean CHECKORDER(ArrayList<SiteScore> list){ 3716 if(list==null || list.size()<2){return true;} 3717 SiteScore prev=list.get(0); 3718 for(int i=0; i<list.size(); i++){ 3719 SiteScore ss=list.get(i); 3720 if(ss.score>prev.score){return false;} 3721 prev=ss; 3722 } 3723 return true; 3724 } 3725 3726 /** Makes sure 'bases' is for correct strand. */ CHECKSITE(SiteScore ss, byte[] basesP, byte[] basesM, long id)3727 public static final boolean CHECKSITE(SiteScore ss, byte[] basesP, byte[] basesM, long id){ 3728 return CHECKSITE(ss, ss.plus() ? basesP : basesM, id); 3729 } 3730 3731 /** Make sure 'bases' is for correct strand! */ CHECKSITE(SiteScore ss, byte[] bases, long id)3732 public static final boolean CHECKSITE(SiteScore ss, byte[] bases, long id){ 3733 return true; //Temporarily disabled 3734 // if(ss==null){return true;} 3735 //// System.err.println("Checking site "+ss+"\nss.p="+ss.perfect+", ss.sp="+ss.semiperfect+", bases="+new String(bases)); 3736 // if(ss.perfect){assert(ss.semiperfect) : ss+"\n"+new String(bases);} 3737 // if(ss.gaps!=null){ 3738 // if(ss.gaps[0]!=ss.start || ss.gaps[ss.gaps.length-1]!=ss.stop){return false;} 3739 //// assert(ss.gaps[0]==ss.start && ss.gaps[ss.gaps.length-1]==ss.stop); 3740 // } 3741 // 3742 // if(!(ss.pairedScore<1 || (ss.slowScore<=0 && ss.pairedScore>ss.quickScore ) || ss.pairedScore>ss.slowScore)){ 3743 // System.err.println("Site paired score violation: "+ss.quickScore+", "+ss.slowScore+", "+ss.pairedScore); 3744 // return false; 3745 // } 3746 // 3747 // final boolean xy=ss.matchContainsXY(); 3748 // if(bases!=null){ 3749 // 3750 // final boolean p0=ss.perfect; 3751 // final boolean sp0=ss.semiperfect; 3752 // final boolean p1=ss.isPerfect(bases); 3753 // final boolean sp1=(p1 ? true : ss.isSemiPerfect(bases)); 3754 // 3755 // assert(p0==p1 || (xy && p1)) : p0+"->"+p1+", "+sp0+"->"+sp1+", "+ss.isSemiPerfect(bases)+ 3756 // "\nnumericID="+id+"\n"+new String(bases)+"\n\n"+Data.getChromosome(ss.chrom).getString(ss.start, ss.stop)+"\n\n"+ss+"\n\n"; 3757 // assert(sp0==sp1 || (xy && sp1)) : p0+"->"+p1+", "+sp0+"->"+sp1+", "+ss.isSemiPerfect(bases)+ 3758 // "\nnumericID="+id+"\n"+new String(bases)+"\n\n"+Data.getChromosome(ss.chrom).getString(ss.start, ss.stop)+"\n\n"+ss+"\n\n"; 3759 // 3760 //// ss.setPerfect(bases, false); 3761 // 3762 // assert(p0==ss.perfect) : 3763 // p0+"->"+ss.perfect+", "+sp0+"->"+ss.semiperfect+", "+ss.isSemiPerfect(bases)+"\nnumericID="+id+"\n\n"+new String(bases)+"\n\n"+ 3764 // Data.getChromosome(ss.chrom).getString(ss.start, ss.stop)+"\n"+ss+"\n\n"; 3765 // assert(sp0==ss.semiperfect) : 3766 // p0+"->"+ss.perfect+", "+sp0+"->"+ss.semiperfect+", "+ss.isSemiPerfect(bases)+"\nnumericID="+id+"\n\n"+new String(bases)+"\n\n"+ 3767 // Data.getChromosome(ss.chrom).getString(ss.start, ss.stop)+"\n"+ss+"\n\n"; 3768 // if(ss.perfect){assert(ss.semiperfect);} 3769 // } 3770 // if(ss.match!=null && ss.matchLength()!=ss.mappedLength()){ 3771 // if(verbose){System.err.println("Returning false because matchLength!=mappedLength:\n"+ss.matchLength()+", "+ss.mappedLength()+"\n"+ss);} 3772 // return false; 3773 // } 3774 // return true; 3775 } 3776 setPerfect(boolean b)3777 public void setPerfect(boolean b){ 3778 flags=(flags&~PERFECTMASK); 3779 if(b){flags|=PERFECTMASK;} 3780 } 3781 setRescued(boolean b)3782 public void setRescued(boolean b){ 3783 flags=(flags&~RESCUEDMASK); 3784 if(b){flags|=RESCUEDMASK;} 3785 } 3786 setMapped(boolean b)3787 public void setMapped(boolean b){ 3788 flags=(flags&~MAPPEDMASK); 3789 if(b){flags|=MAPPEDMASK;} 3790 } 3791 setDiscarded(boolean b)3792 public void setDiscarded(boolean b){ 3793 flags=(flags&~DISCARDMASK); 3794 if(b){flags|=DISCARDMASK;} 3795 } 3796 setInvalid(boolean b)3797 public void setInvalid(boolean b){ 3798 flags=(flags&~INVALIDMASK); 3799 if(b){flags|=INVALIDMASK;} 3800 } 3801 setSwapped(boolean b)3802 public void setSwapped(boolean b){ 3803 flags=(flags&~SWAPMASK); 3804 if(b){flags|=SWAPMASK;} 3805 } 3806 setShortMatch(boolean b)3807 public void setShortMatch(boolean b){ 3808 flags=(flags&~SHORTMATCHMASK); 3809 if(b){flags|=SHORTMATCHMASK;} 3810 } 3811 setInsertValid(boolean b)3812 public void setInsertValid(boolean b){ 3813 flags=(flags&~INSERTMASK); 3814 if(b){flags|=INSERTMASK;} 3815 } 3816 setHasAdapter(boolean b)3817 public void setHasAdapter(boolean b){ 3818 flags=(flags&~ADAPTERMASK); 3819 if(b){flags|=ADAPTERMASK;} 3820 } 3821 setSecondary(boolean b)3822 public void setSecondary(boolean b){ 3823 flags=(flags&~SECONDARYMASK); 3824 if(b){flags|=SECONDARYMASK;} 3825 } 3826 setAminoAcid(boolean b)3827 public void setAminoAcid(boolean b){ 3828 flags=(flags&~AAMASK); 3829 if(b){flags|=AAMASK;} 3830 } 3831 setJunk(boolean b)3832 public void setJunk(boolean b){ 3833 flags=(flags&~JUNKMASK); 3834 if(b){flags|=JUNKMASK;} 3835 } 3836 setValidated(boolean b)3837 public void setValidated(boolean b){ 3838 flags=(flags&~VALIDATEDMASK); 3839 if(b){flags|=VALIDATEDMASK;} 3840 } 3841 setTested(boolean b)3842 public void setTested(boolean b){ 3843 flags=(flags&~TESTEDMASK); 3844 if(b){flags|=TESTEDMASK;} 3845 } 3846 setInvertedRepeat(boolean b)3847 public void setInvertedRepeat(boolean b){ 3848 flags=(flags&~IRMASK); 3849 if(b){flags|=IRMASK;} 3850 } 3851 setTrimmed(boolean b)3852 public void setTrimmed(boolean b){ 3853 flags=(flags&~TRIMMEDMASK); 3854 if(b){flags|=TRIMMEDMASK;} 3855 } 3856 setInsert(int x)3857 public void setInsert(int x){ 3858 if(x<1){x=-1;} 3859 // assert(x==-1 || x>9 || length()<20) : x+", "+length(); //Invalid assertion for synthetic reads. 3860 insert=x; 3861 setInsertValid(x>0); 3862 if(mate!=null){ 3863 mate.insert=x; 3864 mate.setInsertValid(x>0); 3865 } 3866 } 3867 makeMaskArray(int max)3868 private static int[] makeMaskArray(int max) { 3869 int[] r=new int[max+1]; 3870 for(int i=0; i<r.length; i++){r[i]=(1<<i);} 3871 return r; 3872 } 3873 3874 3875 getFakeQuality(int len)3876 public static byte[] getFakeQuality(int len){ 3877 if(len>=QUALCACHE.length){ 3878 byte[] r=KillSwitch.allocByte1D(len); 3879 Arrays.fill(r, (byte)30); 3880 return r; 3881 } 3882 if(QUALCACHE[len]==null){ 3883 synchronized(QUALCACHE){ 3884 if(QUALCACHE[len]==null){ 3885 QUALCACHE[len]=KillSwitch.allocByte1D(len); 3886 Arrays.fill(QUALCACHE[len], (byte)30); 3887 } 3888 } 3889 } 3890 return QUALCACHE[len]; 3891 } 3892 getScaffoldName(boolean requireSingleScaffold)3893 public byte[] getScaffoldName(boolean requireSingleScaffold){ 3894 byte[] name=null; 3895 if(mapped()){ 3896 if(!requireSingleScaffold || Data.isSingleScaffold(chrom, start, stop)){ 3897 int idx=Data.scaffoldIndex(chrom, (start+stop)/2); 3898 name=Data.scaffoldNames[chrom][idx]; 3899 // int scaflen=Data.scaffoldLengths[chrom][idx]; 3900 // a1=Data.scaffoldRelativeLoc(chrom, start, idx); 3901 // b1=a1-start1+stop1; 3902 } 3903 } 3904 return name; 3905 } 3906 bisulfite(boolean AtoG, boolean CtoT, boolean GtoA, boolean TtoC)3907 public void bisulfite(boolean AtoG, boolean CtoT, boolean GtoA, boolean TtoC){ 3908 for(int i=0; i<bases.length; i++){ 3909 final int x=AminoAcid.baseToNumber[bases[i]]; 3910 if(x==0 && AtoG){bases[i]='G';} 3911 else if(x==1 && CtoT){bases[i]='T';} 3912 else if(x==2 && GtoA){bases[i]='A';} 3913 else if(x==3 && TtoC){bases[i]='C';} 3914 } 3915 } 3916 copy()3917 public Read copy(){ 3918 Read r=clone(); 3919 r.bases=(r.bases==null ? null : r.bases.clone()); 3920 r.quality=(r.quality==null ? null : r.quality.clone()); 3921 r.match=(r.match==null ? null : r.match.clone()); 3922 r.gaps=(r.gaps==null ? null : r.gaps.clone()); 3923 r.originalSite=(r.originalSite==null ? null : r.originalSite.clone()); 3924 r.sites=(ArrayList<SiteScore>) (r.sites==null ? null : r.sites.clone()); 3925 r.mate=null; 3926 3927 if(r.sites!=null){ 3928 for(int i=0; i<r.sites.size(); i++){ 3929 r.sites.set(i, r.sites.get(i).clone()); 3930 } 3931 } 3932 return r; 3933 } 3934 3935 @Override clone()3936 public Read clone(){ 3937 try { 3938 return (Read) super.clone(); 3939 } catch (CloneNotSupportedException e) { 3940 // TODO Auto-generated catch block 3941 e.printStackTrace(); 3942 } 3943 throw new RuntimeException(); 3944 } 3945 3946 /** 3947 * @return This protein in canonical nucleotide space. 3948 */ aminoToNucleic()3949 public Read aminoToNucleic() { 3950 assert(aminoacid()) : "This read is not flagged as an amino acid sequence."; 3951 Read r=this.clone(); 3952 r.setAminoAcid(false); 3953 r.bases=AminoAcid.toNTs(r.bases); 3954 if(quality!=null){ 3955 byte[] ntquals=new byte[r.quality.length*3]; 3956 for(int i=0; i<quality.length; i++){ 3957 byte q=quality[i]; 3958 byte q2=(byte)Tools.min(q+5, MAX_CALLED_QUALITY); 3959 ntquals[i]=ntquals[i+1]=ntquals[i+2]=q2; 3960 } 3961 r.quality=ntquals; 3962 } 3963 return r; 3964 } 3965 3966 private static final byte[][] QUALCACHE=new byte[1000][]; 3967 3968 3969 public static final int STRANDMASK=1; 3970 public static final int MAPPEDMASK=(1<<1); 3971 public static final int PAIREDMASK=(1<<2); 3972 public static final int PERFECTMASK=(1<<3); 3973 public static final int AMBIMASK=(1<<4); 3974 public static final int RESCUEDMASK=(1<<5); 3975 // public static final int COLORMASK=(1<<6); //TODO: Change to semiperfectmask? 3976 public static final int SYNTHMASK=(1<<7); 3977 public static final int DISCARDMASK=(1<<8); 3978 public static final int INVALIDMASK=(1<<9); 3979 public static final int SWAPMASK=(1<<10); 3980 public static final int SHORTMATCHMASK=(1<<11); 3981 3982 public static final int PAIRNUMSHIFT=12; 3983 public static final int PAIRNUMMASK=(1<<PAIRNUMSHIFT); 3984 3985 public static final int INSERTMASK=(1<<13); 3986 public static final int ADAPTERMASK=(1<<14); 3987 public static final int SECONDARYMASK=(1<<15); 3988 public static final int AAMASK=(1<<16); 3989 public static final int JUNKMASK=(1<<17); 3990 public static final int VALIDATEDMASK=(1<<18); 3991 public static final int TESTEDMASK=(1<<19); 3992 public static final int IRMASK=(1<<20); 3993 public static final int TRIMMEDMASK=(1<<21); 3994 3995 private static final int[] maskArray=makeMaskArray(22); //Be sure this is big enough for all flags! 3996 3997 public static boolean TO_UPPER_CASE=false; 3998 public static boolean LOWER_CASE_TO_N=false; 3999 public static boolean DOT_DASH_X_TO_N=false; 4000 public static boolean AVERAGE_QUALITY_BY_PROBABILITY=true; 4001 public static boolean FIX_HEADER=false; 4002 public static boolean ALLOW_NULL_HEADER=false; 4003 public static boolean SKIP_SLOW_VALIDATION=false; 4004 public static final boolean VALIDATE_BRANCHLESS=true; 4005 4006 public static final int IGNORE_JUNK=0; 4007 public static final int FLAG_JUNK=1; 4008 public static final int FIX_JUNK=2; 4009 public static final int CRASH_JUNK=3; 4010 public static final int FIX_JUNK_AND_IUPAC=4; 4011 public static int JUNK_MODE=CRASH_JUNK; 4012 public static boolean IUPAC_TO_N=false; 4013 4014 public static boolean U_TO_T=false; 4015 public static boolean COMPRESS_MATCH_BEFORE_WRITING=true; 4016 public static boolean DECOMPRESS_MATCH_ON_LOAD=true; //Set to false for some applications, like sorting, perhaps 4017 4018 public static boolean ADD_BEST_SITE_TO_LIST_FROM_TEXT=true; 4019 public static boolean NULLIFY_BROKEN_QUALITY=false; 4020 public static boolean TOSS_BROKEN_QUALITY=false; 4021 public static boolean FLAG_BROKEN_QUALITY=false; 4022 public static boolean FLAT_IDENTITY=true; 4023 public static boolean VALIDATE_IN_CONSTRUCTOR=true; 4024 4025 public static boolean verbose=false; 4026 4027 /*--------------------------------------------------------------*/ 4028 4029 /** Offset for quality scores, normally 33, or 64 for some old Illumina data */ 4030 private static final byte ASCII_OFFSET=33; 4031 4032 /** Allow quality scores to be changed, typically for corrupt Illumina data */ 4033 public static boolean CHANGE_QUALITY=true; //Cap all quality values between MIN_CALLED_QUALITY and MAX_CALLED_QUALITY 4034 4035 /** Minimum allowed quality score for called (ACGT) bases */ 4036 private static byte MIN_CALLED_QUALITY=2; 4037 4038 /** Maximum allowed quality score for called (ACGT) bases */ 4039 private static byte MAX_CALLED_QUALITY=41; 4040 4041 /** Maximum allowed quality score for merged reads, which otherwise would normally be very high */ 4042 public static byte MAX_MERGE_QUALITY=41; 4043 public static byte[] qMap=makeQmap(MIN_CALLED_QUALITY, MAX_CALLED_QUALITY); 4044 MIN_CALLED_QUALITY()4045 public static byte MIN_CALLED_QUALITY(){return MIN_CALLED_QUALITY;} MAX_CALLED_QUALITY()4046 public static byte MAX_CALLED_QUALITY(){return MAX_CALLED_QUALITY;} 4047 setMaxCalledQuality(int x)4048 public static void setMaxCalledQuality(int x){ 4049 x=Tools.mid(1, x, 93); 4050 if(x!=MAX_CALLED_QUALITY){ 4051 MAX_CALLED_QUALITY=(byte)x; 4052 qMap=makeQmap(MIN_CALLED_QUALITY, MAX_CALLED_QUALITY); 4053 } 4054 } 4055 setMinCalledQuality(int x)4056 public static void setMinCalledQuality(int x){ 4057 x=Tools.mid(0, x, 93); 4058 if(x!=MIN_CALLED_QUALITY){ 4059 MIN_CALLED_QUALITY=(byte)x; 4060 qMap=makeQmap(MIN_CALLED_QUALITY, MAX_CALLED_QUALITY); 4061 } 4062 } 4063 capQuality(long q)4064 public static byte capQuality(long q){ 4065 return (byte)Tools.mid(MIN_CALLED_QUALITY, q, MAX_CALLED_QUALITY); 4066 } 4067 capQuality(byte q)4068 public static byte capQuality(byte q){ 4069 return qMap[q]; 4070 } 4071 capQuality(byte q, byte b)4072 public static byte capQuality(byte q, byte b){ 4073 return AminoAcid.isFullyDefined(b) ? qMap[q] : 0; 4074 } 4075 makeQmap(byte min, byte max)4076 private static byte[] makeQmap(byte min, byte max){ 4077 byte[] r=(qMap==null ? new byte[128] : qMap); 4078 for(int i=0; i<r.length; i++){ 4079 r[i]=(byte) Tools.mid(min, i, max); 4080 } 4081 return r; 4082 } 4083 } 4084