package dna;

import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Locale;

import align2.QualityTools;
import shared.KillSwitch;
import shared.Tools;
import structures.ByteBuilder;


/**
 * Describes a single amino acid (name, three-letter symbol, one-letter code and codons)
 * and hosts the static lookup tables and helper methods used to convert between
 * nucleotide symbols, 2-bit base codes, codons and amino acid letters, including
 * reverse-complement and six-frame translation helpers.
 * <p>
 * All tables are filled once by the static initializer at the bottom of the class.
 * Most ASCII-indexed tables have 128 entries, so callers must not index them with
 * negative bytes or characters above 127 unless the method says it checks.
 *
 * @author Brian Bushnell
 * @date July 1, 2010
 *
 */
public final class AminoAcid {

	/** Small demo: translates a fixed nucleotide string and prints the result. */
	public static void main(String[] args){
		String bases="atctgatTGGcgcgatatatcg";
		String acids=stringToAAs(bases);

		System.out.println(bases+" -> "+acids);
	}

	/**
	 * Not used anywhere in this class. Delegating with null makes the parsing
	 * constructor fail, so this can never produce a usable instance.
	 */
	private AminoAcid(){
		this(null);
		assert(false);
		System.exit(0);
	}

	/**
	 * Parses a definition of the form {@code "Name, Sym, L, CODON1, CODON2, ..."}.
	 * @param line Comma-plus-space delimited definition; fields 0-2 are name, symbol
	 * and letter, every remaining field is a codon.
	 */
	private AminoAcid(String line){
		final String[] fields=line.split(", ");

		name=fields[0];
		symbol=fields[1];
		letter=fields[2].charAt(0);
		codeStrings=Arrays.copyOfRange(fields, 3, fields.length);
	}

	/**
	 * @param n Full name
	 * @param c3 Three-letter symbol
	 * @param c1 One-letter code; only the first character is used
	 * @param bases Codons for this amino acid (stored by reference, not copied)
	 */
	private AminoAcid(String n, String c3, String c1, String[] bases){
		name=n;
		symbol=c3;
		letter=c1.charAt(0);
		codeStrings=bases;
	}

	@Override
	public String toString(){
		return name+", "+symbol+", "+letter+", "+Arrays.toString(codeStrings);
	}

	/**
	 * Decodes a 2-bit-per-base kmer into text.
	 * @param kmer Packed kmer; the lowest 2 bits hold the last base
	 * @param k Number of bases to decode
	 * @return The k bases, first base first
	 */
	public static String kmerToString(long kmer, int k){
		ByteBuilder sb=new ByteBuilder(k);
		for(int i=0; i<k; i++){
			int x=(int)(kmer&3);
			sb.append((char)numberToBase[x]);
			kmer>>=2;
		}
		//Bases were appended last-to-first, so flip them.
		return sb.reverse().toString();
	}

	/**
	 * Packs a base string into a kmer, 2 bits per base, first base in the highest bits.
	 * Characters are looked up in {@link #baseToNumber} without validation, so an
	 * undefined base contributes -1 (all bits set) and a character above 127 throws.
	 */
	public static long stringToKmer(String s){
		long kmer=0;
		for(int i=0; i<s.length(); i++){
			char c=s.charAt(i);
			kmer=(kmer<<2)|(baseToNumber[c]);
		}
		return kmer;
	}

	/**
	 * Decodes a 5-bit-per-symbol amino acid kmer into text.
	 * {@link #numberToAcid} has 21 entries, so a 5-bit field above 20 throws.
	 * @param kmer Packed kmer; the lowest 5 bits hold the last symbol
	 * @param k Number of symbols to decode
	 */
	public static String kmerToStringAA(long kmer, int k){
		ByteBuilder sb=new ByteBuilder(k);
		for(int i=0; i<k; i++){
			int x=(int)(kmer&31);
			sb.append((char)numberToAcid[x]);
			kmer>>=5;
		}
		return sb.reverse().toString();
	}

	/** @return The codon text for a 6-bit codon number, or "NNN" when out of range. */
	public static final String codonToString(int codon){
		return codon>=0 && codon<codonToString.length ? codonToString[codon] : "NNN";
	}

	/** @return The first codon listed for this amino acid. */
	public String canonicalCodon(){
		return codeStrings[0];
	}


	/** Full name, e.g. "Alanine" */
	public final String name;
	/** Three-letter symbol, e.g. "Ala" */
	public final String symbol;
	/** One-letter code, e.g. 'A' */
	public final char letter;
	/** Codons for this amino acid. The static initializer rewrites U to T in place. */
	public final String[] codeStrings;


	//a=1
	//c=2
	//g=4
	//t=8

	//	R 	G A (puRine)
	//	Y 	T C (pYrimidine)
	//	K 	G T (Ketone)
	//	M 	A C (aMino group)
	//	S 	G C (Strong interaction)
	//	W 	A T (Weak interaction)
	//	B 	G T C (not A) (B comes after A)
	//	D 	G A T (not C) (D comes after C)
	//	H 	A C T (not G) (H comes after G)
	//	V 	G C A (not T, not U) (V comes after U)
	//	N 	A G C T (aNy)
	//	X 	masked
	//	- 	gap of indeterminate length

	/** Element i is the first codon of AlphabeticalAAs[i], in its original RNA (U) spelling */
	public static final String[] canonicalCodons=new String[21];

	/** Element i is the ASCII base for 2-bit code i; index 4 is 'N' */
	public static final byte[] numberToBase={
		'A','C','G','T','N'
	};

	/** Element i is the one-letter code of AlphabeticalAAs[i] */
	public static final byte[] numberToAcid=new byte[21];

	/** Element i is the ASCII complement of the base with 2-bit code i; index 4 is 'N' */
	public static final byte[] numberToComplementaryBase={
		'T','G','C','A','N'
	};

	/** Element i is the 2-bit code of the complement of code i; 4 maps to itself */
	public static final byte[] numberToComplement={
		3,2,1,0,4
	};

	/** Element i is the IUPAC symbol for bit-flag set i (A=1, C=2, G=4, T=8); 16 is 'X' */
	public static final byte[] numberToBaseExtended={
		' ','A','C','M','G','R','S','V', //0-7
		'T','W','Y','H','K','D','B','N', //8-15
		'X',' ',' ',' ',' ',' ',' ',' ', //16-23
	};

	/** Has 'N' in position 0. Mainly for translating compressed arrays containing zeroes to bases. */
	public static final byte[] numberToBaseExtended2={
		'N','A','C','M','G','R','S','V', //0-7
		'T','W','Y','H','K','D','B','N', //8-15
		'X',' ',' ',' ',' ',' ',' ',' ', //16-23
	};

	/** Like numberToBaseExtended, but only the 2- and 3-base symbols are present; the rest are spaces */
	public static final byte[] degenerateBases={
		' ',' ',' ','M',' ','R','S','V', //0-7
		' ','W','Y','H','K','D','B',' ', //8-15
		' ',' ',' ',' ',' ',' ',' ',' ', //16-23
	};

	/** Element i is the complement of numberToBaseExtended[i] */
	public static final byte[] numberToComplementaryBaseExtended={
		' ','T','G','K','C','Y','W','B', //0-7
		'A','S','R','D','M','H','V','N', //8-15
		'X',' ',' ',' ',' ',' ',' ',' ', //16-23
	};

	/** Returns a table where element i is: N-bit code for a symbol, -1 otherwise */
	public static final byte[] symbolToNumber(boolean amino){
		return amino ? acidToNumber : baseToNumber;
	}

	/** Returns a table where element i is: N-bit code for a symbol, 0 otherwise */
	public static final byte[] symbolToNumber0(boolean amino){
		return amino ? acidToNumber0 : baseToNumber0;
	}

	/**
	 * Returns a table where element i is: N-bit code for the complement of a symbol, -1 otherwise.
	 * In amino mode the plain acidToNumber table is returned.
	 */
	public static final byte[] symbolToComplementNumber(boolean amino){
		return amino ? acidToNumber : baseToComplementNumber;
	}

	/**
	 * Returns a table where element i is: N-bit code for the complement of a symbol, 0 otherwise.
	 * In amino mode the plain acidToNumber0 table is returned.
	 */
	public static final byte[] symbolToComplementNumber0(boolean amino){
		return amino ? acidToNumber0 : baseToComplementNumber0;
	}

	/** Element i is: 5-bit alphabetical code for a symbol, -1 otherwise */
	public static final byte[] acidToNumber=new byte[128];

	/** Element i is: 5-bit alphabetical code for a symbol other than stop, -1 otherwise */
	public static final byte[] acidToNumberNoStops=new byte[128];

	/** Element i is: 5-bit alphabetical code for a symbol, 0 otherwise */
	public static final byte[] acidToNumber0=new byte[128];

	/** Element i is: 5-bit alphabetical code for a symbol (plus X, B, J, Z, . and -), -1 otherwise */
	public static final byte[] acidToNumberExtended=new byte[128];

	/**
	 * Element i is: reduced-alphabet group code (0-7) for an uppercase amino acid letter, -1 otherwise.
	 * Groups: HKR=0, DE=1, STNQ=2, AVLIM=3, FYW=4, PG=5, C*=6, BZ=7.
	 */
	public static final byte[] acidToNumber8=new byte[128];

	/** Element i is: 0 for 'A', 1 for 'C', 2 for 'G', 3 for 'T', -1 otherwise */
	public static final byte[] baseToNumber=new byte[128];

	/** Element i is: 0 for 'A', 1 for 'C', 2 for 'G', 3 for 'T', 0 otherwise */
	public static final byte[] baseToNumber0=new byte[128];

	/** Element i is: 3 for 'A', 2 for 'C', 1 for 'G', 0 for 'T', -1 otherwise */
	public static final byte[] baseToComplementNumber=new byte[128];

	/** Element i is: 3 for 'A', 2 for 'C', 1 for 'G', 0 for 'T', 0 otherwise */
	public static final byte[] baseToComplementNumber0=new byte[128];

	/** Element i is: 0 for 'A', 1 for 'C', 2 for 'G', 3 for 'T', 4 for 'N', -1 otherwise */
	public static final byte[] baseToNumberACGTN=new byte[128];

	/** Element i is: 0 for 'A', 1 for 'C', 2 for 'G', 3 for 'T', 0 for 'N', -1 otherwise */
	public static final byte[] baseToNumberACGTN2=new byte[128];

	/** Element i is: 0 for 'A', 1 for 'C', 2 for 'G', 3 for 'T', 4 otherwise */
	public static final byte[] baseToNumberACGTother=new byte[128];

	/** A>A, C>C, G>G, T/U>T, other>N */
	public static final byte[] baseToACGTN=new byte[128];

	/** Element i is the complement of IUPAC symbol i (case preserved); other symbols map to themselves */
	public static final byte[] baseToComplementExtended=new byte[128];

	/** Element i is the 3-base text of 6-bit codon number i */
	public static final String[] codonToString=new String[64];

	/** Uracil to Thymine, everything else unchanged */
	public static final byte[] uToT=new byte[256];
	/** Thymine to Uracil, everything else unchanged */
	public static final byte[] tToU=new byte[256];
	/** . - X x n to N, everything else unchanged */
	public static final byte[] dotDashXToNocall=new byte[256];
	/** . - x to X, everything else unchanged */
	public static final byte[] dotDashXToNocallAA=new byte[256];
	/** Letters to uppercase, everything else unchanged */
	public static final byte[] toUpperCase=new byte[256];
	/** Lowercase to N, everything else unchanged */
	public static final byte[] lowerCaseToNocall=new byte[256];
	/** Lowercase to ., everything else unchanged */
	public static final byte[] lowerCaseToNocallAA=new byte[256];
	/** Symbols known to baseToNumberExtended that are not A, C, G, T or U (either case) to N; everything else unchanged */
	public static final byte[] iupacToNocall=new byte[256];

	/** Element i is the bitwise OR of the flags of the constituent bases, where A=1, C=2, G=4, T/U=8.<br>
	 * For example, baseToNumberExtended['M'] = (A | C) = (1 | 2) = 3; 'N' is 15 and 'X' is 16 <br>
	 * Invalid characters are -1 */
	public static final byte[] baseToNumberExtended=new byte[128];
	/** The 20 amino acids in alphabetical order of name, followed by END at index 20 */
	public static final AminoAcid[] AlphabeticalAAs=new AminoAcid[21];
	/** Element i is the amino acid for 6-bit codon number i; index 65 is ANY */
	public static final AminoAcid[] codeToAA=new AminoAcid[66];
	/** Element i is the one-letter code for 6-bit codon number i; index 65 is 'X' */
	public static final char[] codeToChar=new char[66];
	/** Same content as codeToChar, as bytes */
	public static final byte[] codeToByte=new byte[66];
	/** Element i is the codon number of the first codon of amino acid letter i; 65 for X/B/Z/J, -1 otherwise */
	public static final byte[] aminoToCode=new byte[128];
	/** Maps names, symbols, letters and codons (plus lowercase forms of each) to amino acids */
	public static final HashMap<String, AminoAcid> stringToAA=new HashMap<String, AminoAcid>(512);

	public static final AminoAcid Alanine=new AminoAcid("Alanine, Ala, A, GCU, GCC, GCA, GCG");
	public static final AminoAcid Arginine=new AminoAcid("Arginine, Arg, R, CGU, CGC, CGA, CGG, AGA, AGG");
	public static final AminoAcid Asparagine=new AminoAcid("Asparagine, Asn, N, AAU, AAC");
	public static final AminoAcid AsparticAcid=new AminoAcid("AsparticAcid, Asp, D, GAU, GAC");
	public static final AminoAcid Cysteine=new AminoAcid("Cysteine, Cys, C, UGU, UGC");
	public static final AminoAcid GlutamicAcid=new AminoAcid("GlutamicAcid, Glu, E, GAA, GAG");
	public static final AminoAcid Glutamine=new AminoAcid("Glutamine, Gln, Q, CAA, CAG");
	public static final AminoAcid Glycine=new AminoAcid("Glycine, Gly, G, GGU, GGC, GGA, GGG");
	public static final AminoAcid Histidine=new AminoAcid("Histidine, His, H, CAU, CAC");
	public static final AminoAcid Isoleucine=new AminoAcid("Isoleucine, Ile, I, AUU, AUC, AUA");
	public static final AminoAcid Leucine=new AminoAcid("Leucine, Leu, L, UUA, UUG, CUU, CUC, CUA, CUG");
	public static final AminoAcid Lysine=new AminoAcid("Lysine, Lys, K, AAA, AAG");
	public static final AminoAcid Methionine=new AminoAcid("Methionine, Met, M, AUG");
	public static final AminoAcid Phenylalanine=new AminoAcid("Phenylalanine, Phe, F, UUU, UUC");
	public static final AminoAcid Proline=new AminoAcid("Proline, Pro, P, CCU, CCC, CCA, CCG");
	public static final AminoAcid Serine=new AminoAcid("Serine, Ser, S, UCU, UCC, UCA, UCG, AGU, AGC");
	public static final AminoAcid Threonine=new AminoAcid("Threonine, Thr, T, ACU, ACC, ACA, ACG");
	public static final AminoAcid Tryptophan=new AminoAcid("Tryptophan, Trp, W, UGG");
	public static final AminoAcid Tyrosine=new AminoAcid("Tyrosine, Tyr, Y, UAU, UAC");
	public static final AminoAcid Valine=new AminoAcid("Valine, Val, V, GUU, GUC, GUA, GUG");
	public static final AminoAcid END=new AminoAcid("End, End, *, UAA, UGA, UAG");
	public static final AminoAcid ANY=new AminoAcid("Any, Any, X, XXX");

	/** Presumably the bits per amino acid in packed kmers (kmerToStringAA shifts by 5) - TODO confirm against users */
	public static int AMINO_SHIFT=5;


	/** Colorspace table; COLORS[a][b] equals a XOR b for 2-bit base codes a and b */
	public static final byte[][] COLORS=new byte[][] {
		{0, 1, 2, 3},
		{1, 0, 3, 2},
		{2, 3, 0, 1},
		{3, 2, 1, 0}
	};

	/** Returns a new reverse-complemented array in ASCII coding*/
	public static final byte[] reverseComplementBases(final byte[] in){
		byte[] out=new byte[in.length];
		final int last=in.length-1;
		for(int i=0; i<in.length; i++){
			out[i]=baseToComplementExtended[in[last-i]];
		}
		return out;
	}

	/** Reverse-complements the whole array in place; null is ignored. */
	public static final void reverseComplementBasesInPlace(final byte[] in){
		if(in!=null){reverseComplementBasesInPlace(in, in.length);}
	}

	/** Complements (without reversing) the whole array in place; null is ignored. */
	public static final void complementBasesInPlace(final byte[] in){
		if(in==null){return;}
		complementBasesInPlace(in, in.length);
	}

	/** Complements (without reversing) the first {@code length} elements in place; null is ignored. */
	public static final void complementBasesInPlace(final byte[] in, final int length){
		if(in==null){return;}
		for(int i=0; i<length; i++){
			in[i]=baseToComplementExtended[in[i]];
		}
	}

	/** Reverse-complements the first {@code length} elements in place; null is ignored. */
	public static final void reverseComplementBasesInPlace(final byte[] in, final int length){
		if(in==null){return;}
		final int last=length-1;
		final int max=length/2;
		//Swap complemented ends, walking inward.
		for(int i=0; i<max; i++){
			byte a=in[i];
			byte b=in[last-i];
			in[i]=baseToComplementExtended[b];
			in[last-i]=baseToComplementExtended[a];
		}
		if((length&1)==1){//Odd length; process middle
			in[max]=baseToComplementExtended[in[max]];
		}
	}

	/**
	 * Returns the reverse complement of a base string, or null for null input.
	 * The text is handled as US-ASCII regardless of the platform charset, so a
	 * non-ASCII character becomes '?' instead of producing an out-of-range table index.
	 */
	public static final String reverseComplementBases(String in){
		if(in==null){return null;}
		final byte[] rcomp=reverseComplementBases(in.getBytes(StandardCharsets.US_ASCII));
		return new String(rcomp, StandardCharsets.US_ASCII);
	}

	/**
	 * Reverse-complements a 2-bit-per-base kmer.
	 * @param kmer Packed kmer
	 * @param k Kmer length in bases
	 */
	public static final int reverseComplementBinary(int kmer, int k){
		int out=0;
		kmer=~kmer;//Inverting both bits of a base code yields its complement (3-x)
		for(int i=0; i<k; i++){
			out=((out<<2)|(kmer&3));
			kmer>>=2;
		}
		return out;
	}

	/**
	 * Reverse-complements a 2-bit-per-base kmer.
	 * @param kmer Packed kmer
	 * @param k Kmer length in bases
	 */
	public static final long reverseComplementBinary(long kmer, int k){
		long out=0;
		kmer=~kmer;//Inverting both bits of a base code yields its complement (3-x)
		for(int i=0; i<k; i++){
			out=((out<<2)|(kmer&3L));
			kmer>>=2;
		}
		return out;
	}

	/**
	 * Table-driven reverse complement: handles k mod 4 bases one at a time, then
	 * the remainder four bases (one byte) at a time via {@link #rcompBinaryTable}.
	 */
	public static final int reverseComplementBinaryFast(int kmer, int k){
		int out=0;
		int extra=k&3;
		for(int i=0; i<extra; i++){
			out=((out<<2)|((~kmer)&3));
			kmer>>=2;
		}
		k-=extra;
		for(int i=0; i<k; i+=4){
			out=((out<<8)|(rcompBinaryTable[kmer&0xFF]));
			kmer>>=8;
		}
		return out;
	}

	/**
	 * Table-driven reverse complement: handles k mod 4 bases one at a time, then
	 * the remainder four bases (one byte) at a time via {@link #rcompBinaryTable}.
	 */
	public static final long reverseComplementBinaryFast(long kmer, int k){
		long out=0;
		int extra=k&3;
		for(int i=0; i<extra; i++){
			out=((out<<2)|((~kmer)&3L));
			kmer>>=2;
		}
		k-=extra;
		for(int i=0; i<k; i+=4){
			out=((out<<8)|(rcompBinaryTable[(int)(kmer&0xFFL)]));
			kmer>>=8;
		}
		return out;
	}

	/**
	 * Encodes a pair of adjacent bases as a color.
	 * @return 'N' when both bases are undefined; otherwise a color 0-3, with a
	 * single undefined base treated as code 3.
	 */
	public static final byte baseToColor(byte base1, byte base2){
		byte a=baseToNumber[base1];
		byte b=baseToNumber[base2];
		if(a<0 && b<0){return 'N';}
		if(a<0){a=3;}
		if(b<0){b=3;}
		return COLORS[a][b];
	}

	/**
	 * Decodes the base that follows {@code base1} given the color between them.
	 * @return The ASCII base, or 'N' when base1 is undefined or color is outside 0-3.
	 */
	public static final byte colorToBase(byte base1, byte color){
		if(!isFullyDefined(base1) || color<0 || color>3){
			return (byte)'N';
		}
		byte a=baseToNumber[base1];

		return numberToBase[COLORS[a][color]];
	}

	/** Translates the first three characters of {@code code}; see {@link #toAA(char, char, char)}. */
	public static final AminoAcid toAA(String code){
		return toAA(code.charAt(0), code.charAt(1), code.charAt(2));
	}

	/** Translates the first three characters of {@code code}; see {@link #toChar(char, char, char)}. */
	public static final char toChar(String code){
		return toChar(code.charAt(0), code.charAt(1), code.charAt(2));
	}

	/**
	 * Expands an IUPAC symbol into its constituent bases, in A, C, G, T order.
	 * The array length is the bit count of the symbol's baseToNumberExtended entry,
	 * so a symbol that is not in that table (entry -1) yields a 32-element array
	 * whose first four elements are A, C, G, T.
	 */
	public static final char[] splitBase(char c){
		byte b=baseToNumberExtended[c];
		int len=Integer.bitCount(b);
		char[] out=new char[len];

		int index=0;
		for(int i=0; i<4; i++){
			if(((1<<i)&b)!=0){
				out[index]=(char)numberToBase[i];
				index++;
			}
		}
		return out;
	}

	/**
	 * Decodes the low 2*n bits of {@code code} into n ASCII bases, first base first.
	 */
	public static final byte[] numberToBases(int code, int n){

		byte[] bytes=KillSwitch.allocByte1D(n);

		//Fill from the end because the lowest bits hold the last base.
		for(int i=n-1; i>=0; i--){
			int temp=code&3;
			code>>=2;
			bytes[i]=numberToBase[temp];
		}

		return bytes;
	}

	/**
	 * Packs ASCII bases into a number, 2 bits per base, first base in the highest bits.
	 * @return The packed value, or -1 if any element is not A, C, G, T or U.
	 */
	public static final int baseTupleToNumber(byte[] tuple){

		int r=0;
		for(int i=0; i<tuple.length; i++){
			int temp=baseToNumberACGTN[tuple[i]];
			if(temp<0 || temp>3){return -1;}
			r=((r<<2)|temp);
		}

		return r;
	}

	/** @return True for A, C, G, T or U in either case; false otherwise, including characters above 127. */
	public static boolean isFullyDefined(char base){
		return base<baseToNumber.length && baseToNumber[base]>=0;
	}

	/** @return True for A, C, G, T or U in either case; false otherwise, including negative bytes. */
	public static boolean isFullyDefined(byte base){
		return base>=0 && baseToNumber[base]>=0;
	}

	/** @return True if {@code acid} has an entry in acidToNumber (amino acid letters and stop). */
	public static boolean isFullyDefinedAA(byte acid){
		return acid>=0 && acidToNumber[acid]>=0;
	}

	/** @return True if {@code acid} has an entry in acidToNumberNoStops. */
	public static boolean isFullyDefinedAANoStops(byte acid){
		return acid>=0 && acidToNumberNoStops[acid]>=0;
	}

	/** @return True for A, C, G, T, U or N in either case; false otherwise, including characters above 127. */
	public static boolean isACGTN(char base){
		return base<baseToNumberACGTN.length && baseToNumberACGTN[base]>=0;
	}

	/** @return True for A, C, G, T, U or N in either case; false otherwise, including negative bytes. */
	public static boolean isACGTN(byte base){
		return base>=0 && baseToNumberACGTN[base]>=0;
	}

	/** @return True if s is null, empty, or every character satisfies {@link #isACGTN(char)}. */
	public static boolean containsOnlyACGTN(String s){
		if(s==null || s.length()==0){return true;}
		for(int i=0; i<s.length(); i++){
			if(!isACGTN(s.charAt(i))){return false;}
		}
		return true;
	}

	/** Like {@link #containsOnlyACGTN(String)}, but '?' is also accepted. */
	public static boolean containsOnlyACGTNQ(String s){
		if(s==null || s.length()==0){return true;}
		for(int i=0; i<s.length(); i++){
			char c=s.charAt(i);
			if(c!='?' && !isACGTN(c)){return false;}
		}
		return true;
	}

	/** @return True if array is null, empty, or every element satisfies {@link #isACGTN(byte)}. */
	public static boolean containsOnlyACGTN(byte[] array){
		if(array==null || array.length==0){return true;}
		for(int i=0; i<array.length; i++){
			byte b=array[i];
			if(b<0 || baseToNumberACGTN[b]<0){return false;}
		}
		return true;
	}

	/** @return True if every character is fully defined; s must not be null. */
	public static boolean isFullyDefined(String s){
		for(int i=0; i<s.length(); i++){
			if(!isFullyDefined(s.charAt(i))){return false;}
		}
		return true;
	}

	/** @return True if every element is fully defined; s must not be null. */
	public static boolean isFullyDefined(byte[] s){
		for(int i=0; i<s.length; i++){
			if(!isFullyDefined(s[i])){return false;}
		}
		return true;
	}

	/** @return The number of elements that are not fully defined; s must not be null. */
	public static int countUndefined(byte[] s){
		int x=0;
		for(int i=0; i<s.length; i++){
			if(!isFullyDefined(s[i])){x++;}
		}
		return x;
	}

	/** @return The number of fully defined elements; 0 for null. */
	public static int countDefined(byte[] s){
		if(s==null){return 0;}
		int x=0;
		for(int i=0; i<s.length; i++){
			if(isFullyDefined(s[i])){x++;}
		}
		return x;
	}

	/**
	 * Converts a 3-base codon to its 6-bit number.
	 * @return The codon number, or -1 if any base is undefined.
	 */
	public static final byte toNumber(String s){
		assert(s.length()==3);
		int num=0;
		for(int i=0; i<3; i++){
			char c=s.charAt(i);
			int x=baseToNumber[c];
			if(x<0){return (byte)-1;}
			num=(num<<2)|x;
		}
		return (byte)num;
	}

	/** Converts a codon to its 6-bit number; N is treated as code 0 (same as A). */
	public static final byte toNumber(char c1, char c2, char c3){
		assert(baseToNumberACGTN2[c1]>=0 && baseToNumberACGTN2[c2]>=0 && baseToNumberACGTN2[c3]>=0);
		int x=(baseToNumberACGTN2[c1]<<4)|(baseToNumberACGTN2[c2]<<2)|(baseToNumberACGTN2[c3]);
		return (byte)x;
	}

	/** Translates a codon to its amino acid; N is treated as code 0 (same as A). */
	public static final AminoAcid toAA(char c1, char c2, char c3){
		assert(baseToNumberACGTN2[c1]>=0 && baseToNumberACGTN2[c2]>=0 && baseToNumberACGTN2[c3]>=0);
		int x=(baseToNumberACGTN2[c1]<<4)|(baseToNumberACGTN2[c2]<<2)|(baseToNumberACGTN2[c3]);
		return codeToAA[x];
	}

	/** Translates a codon to its one-letter code; N is treated as code 0 (same as A). */
	public static final char toChar(char c1, char c2, char c3){
		assert(baseToNumberACGTN2[c1]>=0 && baseToNumberACGTN2[c2]>=0 && baseToNumberACGTN2[c3]>=0);
		int x=(baseToNumberACGTN2[c1]<<4)|(baseToNumberACGTN2[c2]<<2)|(baseToNumberACGTN2[c3]);
		return codeToChar[x];
	}

	/** Translates a codon to its one-letter code, or 'X' if any base is undefined. */
	public static final byte toByte(byte c1, byte c2, byte c3){
		int a=baseToNumber[c1], b=baseToNumber[c2], c=baseToNumber[c3];
		if(a<0 || b<0 || c<0){return (byte)'X';}
		int x=((a<<4)|(b<<2)|c);
		return codeToByte[x];
	}

	/** Translates a codon to its one-letter code; returns '?' if any base code exceeds 3. */
	public static final char toChar(byte c1, byte c2, byte c3){
		assert(baseToNumberACGTN2[c1]>=0 && baseToNumberACGTN2[c2]>=0 && baseToNumberACGTN2[c3]>=0);
		byte n1=baseToNumberACGTN2[c1], n2=baseToNumberACGTN2[c2], n3=baseToNumberACGTN2[c3];
		if(n1>3 || n2>3 || n3>3){return '?';}
		int x=(n1<<4)|(n2<<2)|(n3);
		return codeToChar[x];
	}

	/** Translates frame 0 of a base string; trailing bases that do not fill a codon are ignored. */
	public static final String stringToAAs(String bases){
		StringBuilder sb=new StringBuilder(bases.length()/3);
		for(int i=2; i<bases.length(); i+=3){
			char a=toAA(bases.charAt(i-2), bases.charAt(i-1), bases.charAt(i)).letter;
			sb.append(a);
		}
		return sb.toString();
	}

	/**
	 * Translates all six reading frames.
	 * @return Array of 6: frames 0-2 of the input, then frames 0-2 of its reverse
	 * complement. All entries are null when bases is null or shorter than 3.
	 */
	public static final byte[][] toAAsSixFrames(byte[] bases){
		byte[][] out=new byte[6][];
		if(bases!=null && bases.length>2){
			for(int i=0; i<3; i++){
				out[i]=toAAs(bases, i);
			}
			byte[] rcomp=reverseComplementBases(bases);
			for(int i=0; i<3; i++){
				out[i+3]=toAAs(rcomp, i);
			}
		}
		return out;
	}

	/**
	 * Computes per-amino-acid qualities for all six reading frames.
	 * The input array is reversed in place for the reverse frames and then
	 * restored, so it is temporarily modified during the call.
	 * @param quals Per-base quality values
	 * @param offset Added to every output quality when nonzero
	 * @return Array of 6, laid out like {@link #toAAsSixFrames(byte[])}
	 */
	public static final byte[][] toQualitySixFrames(byte[] quals, int offset){
		byte[][] out=new byte[6][];
		if(quals!=null && quals.length>2){
			for(int i=0; i<3; i++){
				out[i]=toAAQuality(quals, i);
			}
			Tools.reverseInPlace(quals);
			for(int i=0; i<3; i++){
				out[i+3]=toAAQuality(quals, i);
			}
			Tools.reverseInPlace(quals);//Restore the caller's order
		}

		if(offset!=0){
			for(byte[] array : out){
				if(array!=null){
					for(int i=0; i<array.length; i++){
						array[i]+=offset;
					}
				}
			}
		}

		return out;
	}

	/**
	 * Translates one reading frame.
	 * @param bases ASCII bases
	 * @param frame Number of leading bases to skip (0-2)
	 * @return One letter per complete codon, or null if bases is null or fewer
	 * than 3 bases remain after the frame offset.
	 */
	public static final byte[] toAAs(byte[] bases, int frame){
		assert(frame>=0 && frame<3);
		if(bases==null){return null;}
		int blen=bases.length-frame;
		if(blen<3){return null;}
		blen=blen-(blen%3);//Drop the incomplete trailing codon
		final int stop=frame+blen;
		final int alen=blen/3;

		byte[] out=KillSwitch.allocByte1D(alen);
		for(int i=2+frame, j=0; i<stop; i+=3, j++){
			byte a=toByte(bases[i-2], bases[i-1], bases[i]);
			out[j]=a;
		}
		return out;
	}

	/**
	 * Translates a sub-range of bases. The output has (stop-2-start)/3 elements, and
	 * codons are read while the index of their third base is below stop-2.
	 * NOTE(review): if stop is meant as an exclusive end, a trailing complete codon
	 * is skipped (e.g. start=0, stop=3 yields an empty array) - confirm with callers.
	 */
	public static final byte[] toAAs(byte[] bases, int start, int stop){
		if(bases==null){return null;}
		stop-=2;
		final int blen=stop-start;
		final int alen=blen/3;

		byte[] out=KillSwitch.allocByte1D(alen);
		for(int i=2+start, j=0; i<stop; i+=3, j++){
			byte a=toByte(bases[i-2], bases[i-1], bases[i]);
			out[j]=a;
		}
		return out;
	}

	/**
	 * Computes one quality value per codon for a reading frame, by multiplying the
	 * three per-base QualityTools.PROB_CORRECT values and converting the product
	 * with QualityTools.probCorrectToPhred.
	 * @return One quality per complete codon, or null if fewer than 3 values remain
	 * after the frame offset. quals must not be null.
	 */
	public static final byte[] toAAQuality(byte[] quals, int frame){
		assert(frame>=0 && frame<3);
		int blen=quals.length-frame;
		if(blen<3){return null;}
		blen=blen-(blen%3);
		final int stop=frame+blen;
		final int alen=blen/3;

		byte[] out=KillSwitch.allocByte1D(alen);
		for(int i=2+frame, j=0; i<stop; i+=3, j++){
			byte qa=quals[i-2], qb=quals[i-1], qc=quals[i];
			float pa=QualityTools.PROB_CORRECT[qa], pb=QualityTools.PROB_CORRECT[qb], pc=QualityTools.PROB_CORRECT[qc];
			float p=pa*pb*pc;
			byte q=QualityTools.probCorrectToPhred(p);
			out[j]=q;
		}
		return out;
	}

	/**
	 * Back-translates amino acid letters to bases using each acid's first codon.
	 * NOTE(review): only the low 6 bits of the aminoToCode entry are decoded, so
	 * X/B/Z/J (code 65) come out as "AAC" and unmapped symbols (code -1) as "TTT".
	 * @return Three bases per input symbol, or null for null input.
	 */
	public static final byte[] toNTs(final byte[] aminos){
		if(aminos==null){return null;}
		final int alen=aminos.length;
		final int blen=alen*3;

		final byte[] out=KillSwitch.allocByte1D(blen);
		for(int i=0, j=0; i<alen; i++, j+=3){
			int code=aminoToCode[aminos[i]];
			out[j+2]=numberToBase[(code&3)];
			out[j+1]=numberToBase[((code>>2)&3)];
			out[j]=numberToBase[((code>>4)&3)];
		}
		return out;
	}

	/** Element i is the reverse complement of the 4-base (8-bit) kmer i */
	public static final short[] rcompBinaryTable=makeBinaryRcompTable(4);

	/** Builds a table of reverse complements for every kmer of length k. */
	private static final short[] makeBinaryRcompTable(int k){
		int bits=2*k;
		short[] r=new short[1<<bits];
		for(int i=0; i<r.length; i++){
			r[i]=(short)reverseComplementBinary(i, k);
		}
		return r;
	}

	static {

		for(int i=0; i<uToT.length; i++){uToT[i]=(byte)i;}
		uToT['u']='t';
		uToT['U']='T';

		for(int i=0; i<tToU.length; i++){tToU[i]=(byte)i;}
		tToU['t']='u';
		tToU['T']='U';

		//iupacToNocall starts as identity here; its N entries are set at the end of this block.
		for(int i=0; i<dotDashXToNocall.length; i++){
			dotDashXToNocall[i]=(byte)i;
			iupacToNocall[i]=(byte)i;
		}
		dotDashXToNocall['.']='N';
		dotDashXToNocall['-']='N';
		dotDashXToNocall['X']='N';
		dotDashXToNocall['x']='N';
		dotDashXToNocall['n']='N';

		for(int i=0; i<dotDashXToNocallAA.length; i++){dotDashXToNocallAA[i]=(byte)i;}
		dotDashXToNocallAA['.']='X';
		dotDashXToNocallAA['-']='X';
		dotDashXToNocallAA['X']='X';
		dotDashXToNocallAA['x']='X';

		for(int i=0; i<toUpperCase.length; i++){
			final boolean lower=(i>='a' && i<='z');
			toUpperCase[i]=(byte)(lower ? i-32 : i);
			lowerCaseToNocall[i]=(lower ? (byte)'N' : (byte)i);
			lowerCaseToNocallAA[i]=(lower ? (byte)'.' : (byte)i);
		}

		Arrays.fill(baseToACGTN, (byte)'N');
		Arrays.fill(baseToNumberExtended, (byte)-1);
		for(int i=0; i<numberToBaseExtended.length; i++){
			char x=(char)numberToBaseExtended[i];
			if(!Character.isWhitespace(x)){
				baseToNumberExtended[x]=(byte)i;
				baseToNumberExtended[Tools.toLowerCase(x)]=(byte)i;
			}
		}
		baseToNumberExtended['U']=8;
		baseToNumberExtended['u']=8;

		Arrays.fill(baseToNumberACGTN, (byte)-1);
		Arrays.fill(baseToNumberACGTother, (byte)4);
		for(int i=0; i<numberToBase.length; i++){
			char x=(char)numberToBase[i];
			if(!Character.isWhitespace(x)){
				baseToNumberACGTN[x]=baseToNumberACGTother[x]=(byte)i;
				baseToNumberACGTN[Tools.toLowerCase(x)]=baseToNumberACGTother[Tools.toLowerCase(x)]=(byte)i;
				baseToACGTN[x]=baseToACGTN[Tools.toLowerCase(x)]=(byte)x;
			}
		}
		baseToNumberACGTN['U']=baseToNumberACGTN['u']=3;
		baseToNumberACGTother['U']=baseToNumberACGTother['u']=3;
		baseToACGTN['U']=baseToACGTN['u']=(byte)'T';

		System.arraycopy(baseToNumberACGTN, 0, baseToNumberACGTN2, 0, baseToNumberACGTN.length);
		baseToNumberACGTN2['N']=0;
		baseToNumberACGTN2['n']=0;

		Arrays.fill(baseToNumber, (byte)-1);
		Arrays.fill(baseToNumber0, (byte)0);
		for(int i=0; i<numberToBase.length; i++){
			char x=(char)numberToBase[i];
			if(x=='A' || x=='C' || x=='G' || x=='T'){
				baseToNumber0[x]=baseToNumber[x]=(byte)i;
				baseToNumber0[Tools.toLowerCase(x)]=baseToNumber[Tools.toLowerCase(x)]=(byte)i;
			}
		}
		baseToNumber0['U']=baseToNumber['U']=3;
		baseToNumber0['u']=baseToNumber['u']=3;

		Arrays.fill(baseToComplementNumber, (byte)-1);
		baseToComplementNumber['A']=baseToComplementNumber['a']=3;
		baseToComplementNumber['C']=baseToComplementNumber['c']=2;
		baseToComplementNumber['G']=baseToComplementNumber['g']=1;
		baseToComplementNumber['T']=baseToComplementNumber['t']=0;
		baseToComplementNumber['U']=baseToComplementNumber['u']=0;

		Arrays.fill(baseToComplementNumber0, (byte)0);
		baseToComplementNumber0['A']=baseToComplementNumber0['a']=3;
		baseToComplementNumber0['C']=baseToComplementNumber0['c']=2;
		baseToComplementNumber0['G']=baseToComplementNumber0['g']=1;
		baseToComplementNumber0['T']=baseToComplementNumber0['t']=0;
		baseToComplementNumber0['U']=baseToComplementNumber0['u']=0;

		//Invalid symbols are unchanged.
		//This prevents crashes from -1 being out of bounds, and allows
		//consecutive rcomp operations to restore the original sequence.
		for(int i=0; i<baseToComplementExtended.length; i++){
			baseToComplementExtended[i]=(byte)i;
		}
		for(int i=0; i<numberToBaseExtended.length; i++){
			char x=(char)numberToBaseExtended[i];
			char x2=(char)numberToComplementaryBaseExtended[i];
			baseToComplementExtended[x]=(byte)x2;
			baseToComplementExtended[Tools.toLowerCase(x)]=(byte)Tools.toLowerCase(x2);
		}
		baseToComplementExtended['U']=(byte)'A';
		baseToComplementExtended['u']=(byte)'a';
		baseToComplementExtended['?']=(byte)'?';
		baseToComplementExtended[' ']=(byte)' ';
		baseToComplementExtended['-']=(byte)'-';
		baseToComplementExtended['*']=(byte)'*';
		baseToComplementExtended['.']=(byte)'.';


		//ANY is deliberately not part of this table.
		AlphabeticalAAs[0]=Alanine;
		AlphabeticalAAs[1]=Arginine;
		AlphabeticalAAs[2]=Asparagine;
		AlphabeticalAAs[3]=AsparticAcid;
		AlphabeticalAAs[4]=Cysteine;
		AlphabeticalAAs[5]=GlutamicAcid;
		AlphabeticalAAs[6]=Glutamine;
		AlphabeticalAAs[7]=Glycine;
		AlphabeticalAAs[8]=Histidine;
		AlphabeticalAAs[9]=Isoleucine;
		AlphabeticalAAs[10]=Leucine;
		AlphabeticalAAs[11]=Lysine;
		AlphabeticalAAs[12]=Methionine;
		AlphabeticalAAs[13]=Phenylalanine;
		AlphabeticalAAs[14]=Proline;
		AlphabeticalAAs[15]=Serine;
		AlphabeticalAAs[16]=Threonine;
		AlphabeticalAAs[17]=Tryptophan;
		AlphabeticalAAs[18]=Tyrosine;
		AlphabeticalAAs[19]=Valine;
		AlphabeticalAAs[20]=END;

		Arrays.fill(aminoToCode, (byte)-1);
		Arrays.fill(acidToNumber, (byte)-1);
		Arrays.fill(acidToNumber0, (byte)0);
		Arrays.fill(acidToNumber8, (byte)-1);
		for(int i=0; i<AlphabeticalAAs.length; i++){
			AminoAcid aa=AlphabeticalAAs[i];

			acidToNumber[aa.letter]=(byte)i;
			acidToNumber[Tools.toLowerCase(aa.letter)]=(byte)i;
			acidToNumber0[aa.letter]=(byte)i;
			acidToNumber0[Tools.toLowerCase(aa.letter)]=(byte)i;
			numberToAcid[i]=(byte)aa.letter;
			//Captured before the U->T rewrite below, so this keeps the RNA spelling.
			canonicalCodons[i]=aa.canonicalCodon();
			stringToAA.put(aa.name, aa);
			stringToAA.put(aa.symbol, aa);
			stringToAA.put(aa.letter+"", aa);
			for(int j=0; j<aa.codeStrings.length; j++){
				//Register both the RNA (U) and DNA (T) spelling of each codon.
				String s=aa.codeStrings[j];
				stringToAA.put(s, aa);
				aa.codeStrings[j]=s.replace('U', 'T');
				stringToAA.put(aa.codeStrings[j], aa);

				int x=toNumber(s);
				codeToAA[x]=aa;
				codeToChar[x]=aa.letter;
				codeToByte[x]=(byte)(aa.letter);
				if(j==0){
					aminoToCode[aa.letter]=(byte)x;
					aminoToCode[Tools.toLowerCase(aa.letter)]=(byte)x;
				}
			}
		}

		System.arraycopy(acidToNumber, 0, acidToNumberNoStops, 0, acidToNumberNoStops.length);
		acidToNumberNoStops[END.letter]=-1;

		System.arraycopy(acidToNumber, 0, acidToNumberExtended, 0, acidToNumber.length);

		{
			//The ambiguous symbols share the first unused code; the gap gets the next one.
			byte anySym=(byte)(Tools.max(acidToNumberExtended)+1);
			byte dash=(byte)(anySym+1);
			acidToNumberExtended['x']=acidToNumberExtended['X']=acidToNumberExtended['.']=anySym;
			acidToNumberExtended['b']=acidToNumberExtended['B']=anySym;
			acidToNumberExtended['z']=acidToNumberExtended['Z']=anySym;
			acidToNumberExtended['j']=acidToNumberExtended['J']=anySym;
			acidToNumberExtended['-']=dash;
		}

		acidToNumber8['H']=acidToNumber8['K']=acidToNumber8['R']=0;
		acidToNumber8['D']=acidToNumber8['E']=1;
		acidToNumber8['S']=acidToNumber8['T']=acidToNumber8['N']=acidToNumber8['Q']=2;
		acidToNumber8['A']=acidToNumber8['V']=acidToNumber8['L']=acidToNumber8['I']=acidToNumber8['M']=3;
		acidToNumber8['F']=acidToNumber8['Y']=acidToNumber8['W']=4;
		acidToNumber8['P']=acidToNumber8['G']=5;
		acidToNumber8['C']=acidToNumber8['*']=6;
		acidToNumber8['B']=acidToNumber8['Z']=7;

		aminoToCode['X']=aminoToCode['x']=aminoToCode['B']=aminoToCode['b']=
				aminoToCode['Z']=aminoToCode['z']=aminoToCode['J']=aminoToCode['j']=65;
		codeToAA[65]=ANY;
		codeToChar[65]='X';
		codeToByte[65]='X';

		stringToAA.put("X", ANY);
		stringToAA.put("Start", Methionine);
		stringToAA.put("Begin", Methionine);
		stringToAA.put("Stop", END);
		stringToAA.put("Aspartic Acid", AsparticAcid);
		stringToAA.put("Glutamic Acid", GlutamicAcid);

		//Snapshot the keys so the map can be modified while iterating.
		String[] temp=stringToAA.keySet().toArray(new String[0]);

		for(String s : temp){
			AminoAcid aa=stringToAA.get(s);
			assert(aa!=null);
			//Locale.ROOT keeps the keys independent of the default locale
			//(e.g. a Turkish locale would lowercase 'I' to a dotless i).
			stringToAA.put(s.toLowerCase(Locale.ROOT), aa);
		}

		for(int i=0; i<codonToString.length; i++){
			codonToString[i]=kmerToString(i, 3);
		}

		for(int i='A'; i<='z'; i++){
			if(baseToNumber[i]<0 && baseToNumberExtended[i]>=0){
				iupacToNocall[i]='N';
			}
		}

	}

}