1 package dna;
2 import java.util.Arrays;
3 import java.util.HashMap;
4 
5 import align2.QualityTools;
6 import shared.KillSwitch;
7 import shared.Tools;
8 import structures.ByteBuilder;
9 
10 
11 /**
12  * @author Brian Bushnell
13  * @date July 1, 2010
14  *
15  */
16 public final class AminoAcid {
17 
18 
main(String[] args)19 	public static void main(String[] args){
20 //		for(String s : stringToAA.keySet()){
21 //			System.out.println(s+"\t->\t"+stringToAA.get(s));
22 //		}
23 
24 		String bases="atctgatTGGcgcgatatatcg";
25 		String acids=stringToAAs(bases);
26 
27 		System.out.println(bases+" -> "+acids);
28 
29 	}
30 
31 
	/**
	 * No-argument constructor; not meant to be used.
	 * Delegates to the line-parsing constructor with a null line.
	 * NOTE(review): that constructor calls line.split(), so this call
	 * appears to throw before the statements below are reached - confirm.
	 */
	private AminoAcid(){
		this(null);
		assert(false);
		System.exit(0);
	}
37 
AminoAcid(String line)38 	private AminoAcid(String line){
39 		String[] s2=line.split(", ");
40 		String[] s3=new String[s2.length-3];
41 		for(int i=3; i<s2.length; i++){
42 			s3[i-3]=s2[i];
43 		}
44 
45 		name=s2[0];
46 		symbol=s2[1];
47 		letter=s2[2].charAt(0);
48 		codeStrings=s3;
49 	}
50 
AminoAcid(String n, String c3, String c1, String[] bases)51 	private AminoAcid(String n, String c3, String c1, String[] bases){
52 		name=n;
53 		symbol=c3;
54 		letter=c1.charAt(0);
55 		codeStrings=bases;
56 	}
57 
58 	@Override
toString()59 	public String toString(){
60 		return name+", "+symbol+", "+letter+", "+Arrays.toString(codeStrings);
61 	}
62 
kmerToString(long kmer, int k)63 	public static String kmerToString(long kmer, int k){
64 		ByteBuilder sb=new ByteBuilder(k);
65 		for(int i=0; i<k; i++){
66 			int x=(int)(kmer&3);
67 			sb.append((char)numberToBase[x]);
68 			kmer>>=2;
69 		}
70 		return sb.reverse().toString();
71 	}
72 
stringToKmer(String s)73 	public static long stringToKmer(String s){
74 		long kmer=0;
75 		for(int i=0; i<s.length(); i++){
76 			char c=s.charAt(i);
77 			kmer=(kmer<<2)|(baseToNumber[c]);
78 		}
79 		return kmer;
80 	}
81 
kmerToStringAA(long kmer, int k)82 	public static String kmerToStringAA(long kmer, int k){
83 		ByteBuilder sb=new ByteBuilder(k);
84 		for(int i=0; i<k; i++){
85 			int x=(int)(kmer&31);
86 			sb.append((char)numberToAcid[x]);
87 			kmer>>=5;
88 		}
89 		return sb.reverse().toString();
90 	}
91 
codonToString(int codon)92 	public static final String codonToString(int codon){
93 		return codon>=0 && codon<codonToString.length ? codonToString[codon] : "NNN";
94 	}
95 
canonicalCodon()96 	public String canonicalCodon(){
97 		return codeStrings[0];
98 	}
99 
100 
	/** Full name, e.g. "Alanine" */
	public final String name;
	/** Symbol, e.g. "Ala" */
	public final String symbol;
	/** One-letter code, e.g. 'A' */
	public final char letter;
	/** Codons for this amino acid. For entries of AlphabeticalAAs, the static initializer rewrites U to T in place. */
	public final String[] codeStrings;
105 
106 
107 	//a=1
108 	//c=2
109 	//g=4
110 	//t=8
111 
112 //	R 	G A (puRine)
113 //	Y 	T C (pYrimidine)
114 //	K 	G T (Ketone)
115 //	M 	A C (aMino group)
116 //	S 	G C (Strong interaction)
117 //	W 	A T (Weak interaction)
118 //	B 	G T C (not A) (B comes after A)
119 //	D 	G A T (not C) (D comes after C)
120 //	H 	A C T (not G) (H comes after G)
121 //	V 	G C A (not T, not U) (V comes after U)
122 //	N 	A G C T (aNy)
123 //	X 	masked
124 //	- 	gap of indeterminate length
125 
	/** Element i is the first codon of AlphabeticalAAs[i]; filled by the static initializer */
	public static final String[] canonicalCodons=new String[21];

	/** Base number to base symbol: 0=A, 1=C, 2=G, 3=T, 4=N */
	public static final byte[] numberToBase={
		'A','C','G','T','N'
	};

	/** Element i is the one-letter code of AlphabeticalAAs[i]; filled by the static initializer */
	public static final byte[] numberToAcid=new byte[21];

	/** Element i is the complement of numberToBase[i] */
	public static final byte[] numberToComplementaryBase={
		'T','G','C','A','N'
	};

	/** Element i is the base number of the complement of base number i; 4 maps to itself */
	public static final byte[] numberToComplement={
		3,2,1,0,4
	};

	/** Bitmask code (A=1, C=2, G=4, T=8) to IUPAC symbol; 16 is 'X'; unused slots hold a space */
	public static final byte[] numberToBaseExtended={
		' ','A','C','M','G','R','S','V', //0-7
		'T','W','Y','H','K','D','B','N', //8-15
		'X',' ',' ',' ',' ',' ',' ',' ', //16-23
	};

	/** Has 'N' in position 0.  Mainly for translating compressed arrays containing zeroes to bases. */
	public static final byte[] numberToBaseExtended2={
		'N','A','C','M','G','R','S','V', //0-7
		'T','W','Y','H','K','D','B','N', //8-15
		'X',' ',' ',' ',' ',' ',' ',' ', //16-23
	};

	/** Same layout as numberToBaseExtended, keeping only M, R, S, V, W, Y, H, K, D, B; all other slots hold a space */
	public static final byte[] degenerateBases={
		' ',' ',' ','M',' ','R','S','V', //0-7
		' ','W','Y','H','K','D','B',' ', //8-15
		' ',' ',' ',' ',' ',' ',' ',' ', //16-23
	};

	/** Element i is the complement of numberToBaseExtended[i] */
	public static final byte[] numberToComplementaryBaseExtended={
		' ','T','G','K','C','Y','W','B', //0-7
		'A','S','R','D','M','H','V','N', //8-15
		'X',' ',' ',' ',' ',' ',' ',' ', //16-23
	};
166 
167 	/** Element i is: N-bit code for a symbol, -1 otherwise */
symbolToNumber(boolean amino)168 	public static final byte[] symbolToNumber(boolean amino){
169 		return amino ? acidToNumber : baseToNumber;
170 	}
171 
172 	/** Element i is: N-bit code for a symbol, 0 otherwise */
symbolToNumber0(boolean amino)173 	public static final byte[] symbolToNumber0(boolean amino){
174 		return amino ? acidToNumber0 : baseToNumber0;
175 	}
176 
177 	/** Element i is: N-bit code for a symbol, -1 otherwise */
symbolToComplementNumber(boolean amino)178 	public static final byte[] symbolToComplementNumber(boolean amino){
179 		return amino ? acidToNumber : baseToComplementNumber;
180 	}
181 
182 	/** Element i is: N-bit code for a symbol, 0 otherwise */
symbolToComplementNumber0(boolean amino)183 	public static final byte[] symbolToComplementNumber0(boolean amino){
184 		return amino ? acidToNumber0 : baseToComplementNumber0;
185 	}
186 
	/** Element i is: 5-bit alphabetical code for a symbol, -1 otherwise */
	public static final byte[] acidToNumber=new byte[128];

	/** Element i is: 5-bit alphabetical code for a symbol other than stop, -1 otherwise */
	public static final byte[] acidToNumberNoStops=new byte[128];

	/** Element i is: 5-bit alphabetical code for a symbol, 0 otherwise */
	public static final byte[] acidToNumber0=new byte[128];

	/** Element i is: 5-bit alphabetical code for a symbol, -1 otherwise.
	 * Additionally X, B, J, Z and '.' share one code past the normal range, and '-' gets the next one. */
	public static final byte[] acidToNumberExtended=new byte[128];

	/** Element i is: a reduced code in 0-7 for uppercase amino acid letters, -1 otherwise.
	 * Groups: HKR=0, DE=1, STNQ=2, AVLIM=3, FYW=4, PG=5, C*=6, BZ=7 */
	public static final byte[] acidToNumber8=new byte[128];

	/** Element i is: 0 for 'A', 1 for 'C', 2 for 'G', 3 for 'T', -1 otherwise */
	public static final byte[] baseToNumber=new byte[128];

	/** Element i is: 0 for 'A', 1 for 'C', 2 for 'G', 3 for 'T', 0 otherwise */
	public static final byte[] baseToNumber0=new byte[128];

	/** Element i is: 3 for 'A', 2 for 'C', 1 for 'G', 0 for 'T', -1 otherwise */
	public static final byte[] baseToComplementNumber=new byte[128];

	/** Element i is: 3 for 'A', 2 for 'C', 1 for 'G', 0 for 'T', 0 otherwise */
	public static final byte[] baseToComplementNumber0=new byte[128];

	/** Element i is: 0 for 'A', 1 for 'C', 2 for 'G', 3 for 'T', 4 for 'N', -1 otherwise */
	public static final byte[] baseToNumberACGTN=new byte[128];

	/** Element i is: 0 for 'A', 1 for 'C', 2 for 'G', 3 for 'T', 0 for 'N', -1 otherwise */
	public static final byte[] baseToNumberACGTN2=new byte[128];

	/** Element i is: 0 for 'A', 1 for 'C', 2 for 'G', 3 for 'T', 4 otherwise */
	public static final byte[] baseToNumberACGTother=new byte[128];

	/** A>A, C>C, G>G, T/U>T, other>N */
	public static final byte[] baseToACGTN=new byte[128];

	/** Element i is the complement of IUPAC symbol i, preserving case; U maps to A; other symbols map to themselves */
	public static final byte[] baseToComplementExtended=new byte[128];

	/** Element i is the 3-base string for codon number i, as produced by kmerToString(i, 3) */
	public static final String[] codonToString=new String[64];

	/** Uracil to Thymine, everything else unchanged */
	public static final byte[] uToT=new byte[256];
	/** Thymine to Uracil, everything else unchanged */
	public static final byte[] tToU=new byte[256];
	/** . - X x n to N, everything else unchanged */
	public static final byte[] dotDashXToNocall=new byte[256];
	/** . - X x to X, everything else unchanged */
	public static final byte[] dotDashXToNocallAA=new byte[256];
	/** Letters to uppercase, everything else unchanged */
	public static final byte[] toUpperCase=new byte[256];
	/** Lowercase to N, everything else unchanged */
	public static final byte[] lowerCaseToNocall=new byte[256];
	/** Lowercase to ., everything else unchanged */
	public static final byte[] lowerCaseToNocallAA=new byte[256];
	/** Symbols known to baseToNumberExtended but not to baseToNumber become N; everything else unchanged */
	public static final byte[] iupacToNocall=new byte[256];

	/** Element i is the bitwise OR of the constituent base bits, using A=1, C=2, G=4, T=8.<br>
	 * For example, baseToNumberExtended['M'] = (A | C) = (1 | 2) = 3 <br>
	 * Invalid characters are -1 */
	public static final byte[] baseToNumberExtended=new byte[128];
	/** The 20 amino acids in alphabetical order of name, followed by END */
	public static final AminoAcid[] AlphabeticalAAs=new AminoAcid[21];
	/** Codon number (0-63) to amino acid; index 65 is ANY */
	public static final AminoAcid[] codeToAA=new AminoAcid[66];
	/** Codon number (0-63) to one-letter code; index 65 is 'X' */
	public static final char[] codeToChar=new char[66];
	/** Same as codeToChar, as bytes */
	public static final byte[] codeToByte=new byte[66];
	/** One-letter code (either case) to the number of its first listed codon; X, B, Z, J map to 65; -1 otherwise */
	public static final byte[] aminoToCode=new byte[128];
	/** Maps names, symbols, letters and codons (plus lowercase forms) to amino acids */
	public static final HashMap<String, AminoAcid> stringToAA=new HashMap<String, AminoAcid>(512);
257 
	//Each definition is "Name, Symbol, Letter, codons...", parsed by the String constructor.
	public static final AminoAcid Alanine=new AminoAcid("Alanine, Ala, A, GCU, GCC, GCA, GCG");
	public static final AminoAcid Arginine=new AminoAcid("Arginine, Arg, R, CGU, CGC, CGA, CGG, AGA, AGG");
	public static final AminoAcid Asparagine=new AminoAcid("Asparagine, Asn, N, AAU, AAC");
	public static final AminoAcid AsparticAcid=new AminoAcid("AsparticAcid, Asp, D, GAU, GAC");
	public static final AminoAcid Cysteine=new AminoAcid("Cysteine, Cys, C, UGU, UGC");
	public static final AminoAcid GlutamicAcid=new AminoAcid("GlutamicAcid, Glu, E, GAA, GAG");
	public static final AminoAcid Glutamine=new AminoAcid("Glutamine, Gln, Q, CAA, CAG");
	public static final AminoAcid Glycine=new AminoAcid("Glycine, Gly, G, GGU, GGC, GGA, GGG");
	public static final AminoAcid Histidine=new AminoAcid("Histidine, His, H, CAU, CAC");
	public static final AminoAcid Isoleucine=new AminoAcid("Isoleucine, Ile, I, AUU, AUC, AUA");
	public static final AminoAcid Leucine=new AminoAcid("Leucine, Leu, L, UUA, UUG, CUU, CUC, CUA, CUG");
	public static final AminoAcid Lysine=new AminoAcid("Lysine, Lys, K, AAA, AAG");
	public static final AminoAcid Methionine=new AminoAcid("Methionine, Met, M, AUG");
	public static final AminoAcid Phenylalanine=new AminoAcid("Phenylalanine, Phe, F, UUU, UUC");
	public static final AminoAcid Proline=new AminoAcid("Proline, Pro, P, CCU, CCC, CCA, CCG");
	public static final AminoAcid Serine=new AminoAcid("Serine, Ser, S, UCU, UCC, UCA, UCG, AGU, AGC");
	public static final AminoAcid Threonine=new AminoAcid("Threonine, Thr, T, ACU, ACC, ACA, ACG");
	public static final AminoAcid Tryptophan=new AminoAcid("Tryptophan, Trp, W, UGG");
	public static final AminoAcid Tyrosine=new AminoAcid("Tyrosine, Tyr, Y, UAU, UAC");
	public static final AminoAcid Valine=new AminoAcid("Valine, Val, V, GUU, GUC, GUA, GUG");
	/** Stop codons, letter '*' */
	public static final AminoAcid END=new AminoAcid("End, End, *, UAA, UGA, UAG");
	/** Unknown amino acid, letter 'X'; not a member of AlphabeticalAAs */
	public static final AminoAcid ANY=new AminoAcid("Any, Any, X, XXX");

	/** NOTE(review): presumably bits per amino acid in packed kmers (kmerToStringAA shifts by 5) - not referenced in this file, and not final; confirm usage. */
	public static int AMINO_SHIFT=5;


	/** Lookup used by baseToColor and colorToBase; entry [a][b] equals a XOR b for base numbers 0-3 */
	public static final byte[][] COLORS=new byte[][] {
		{0, 1, 2, 3},
		{1, 0, 3, 2},
		{2, 3, 0, 1},
		{3, 2, 1, 0}
	};
290 
291 	/** Returns a new reverse-complemented array in ASCII coding*/
reverseComplementBases(final byte[] in)292 	public static final byte[] reverseComplementBases(final byte[] in){
293 		byte[] out=new byte[in.length];
294 		final int last=in.length-1;
295 		for(int i=0; i<in.length; i++){
296 			out[i]=baseToComplementExtended[in[last-i]];
297 		}
298 		return out;
299 	}
300 
301 
reverseComplementBasesInPlace(final byte[] in)302 	public static final void reverseComplementBasesInPlace(final byte[] in){
303 		if(in!=null){reverseComplementBasesInPlace(in, in.length);}
304 	}
complementBasesInPlace(final byte[] in)305 	public static final void complementBasesInPlace(final byte[] in){
306 		if(in==null){return;}
307 		complementBasesInPlace(in, in.length);
308 	}
complementBasesInPlace(final byte[] in, final int length)309 	public static final void complementBasesInPlace(final byte[] in, final int length){
310 		if(in==null){return;}
311 		for(int i=0; i<length; i++){
312 			in[i]=baseToComplementExtended[in[i]];
313 		}
314 	}
315 
reverseComplementBasesInPlace(final byte[] in, final int length)316 	public static final void reverseComplementBasesInPlace(final byte[] in, final int length){
317 		if(in==null){return;}
318 		final int last=length-1;
319 		final int max=length/2;
320 		for(int i=0; i<max; i++){
321 			byte a=in[i];
322 			byte b=in[last-i];
323 //			assert(b>0 && b<baseToComplementExtended.length) : ((int)b)+"\t"+((char)b)+"\t"+Arrays.toString(in);
324 //			System.out.println((char)a+", "+(char)b+", "+i+", "+last);
325 			in[i]=baseToComplementExtended[b];
326 			in[last-i]=baseToComplementExtended[a];
327 		}
328 		if((length&1)==1){//Odd length; process middle
329 			in[max]=baseToComplementExtended[in[max]];
330 		}
331 	}
332 
reverseComplementBases(String in)333 	public static final String reverseComplementBases(String in){
334 		return in==null ? null : new String(reverseComplementBases(in.getBytes()));
335 	}
336 
reverseComplementBinary(int kmer, int k)337 	public static final int reverseComplementBinary(int kmer, int k){
338 		int out=0;
339 		kmer=~kmer;
340 		for(int i=0; i<k; i++){
341 			out=((out<<2)|(kmer&3));
342 			kmer>>=2;
343 		}
344 		return out;
345 	}
346 
reverseComplementBinary(long kmer, int k)347 	public static final long reverseComplementBinary(long kmer, int k){
348 		long out=0;
349 		kmer=~kmer;
350 		for(int i=0; i<k; i++){
351 			out=((out<<2)|(kmer&3L));
352 			kmer>>=2;
353 		}
354 		return out;
355 	}
356 
reverseComplementBinaryFast(int kmer, int k)357 	public static final int reverseComplementBinaryFast(int kmer, int k){
358 		int out=0;
359 		int extra=k&3;
360 		for(int i=0; i<extra; i++){
361 			out=((out<<2)|((~kmer)&3));
362 			kmer>>=2;
363 		}
364 		k-=extra;
365 		for(int i=0; i<k; i+=4){
366 			out=((out<<8)|(rcompBinaryTable[kmer&0xFF]));
367 			kmer>>=8;
368 		}
369 		return out;
370 	}
371 
reverseComplementBinaryFast(long kmer, int k)372 	public static final long reverseComplementBinaryFast(long kmer, int k){
373 		long out=0;
374 		int extra=k&3;
375 		for(int i=0; i<extra; i++){
376 			out=((out<<2)|((~kmer)&3L));
377 			kmer>>=2;
378 		}
379 		k-=extra;
380 		for(int i=0; i<k; i+=4){
381 			out=((out<<8)|(rcompBinaryTable[(int)(kmer&0xFFL)]));
382 			kmer>>=8;
383 		}
384 		return out;
385 	}
386 
baseToColor(byte base1, byte base2)387 	public static final byte baseToColor(byte base1, byte base2){
388 		byte a=baseToNumber[base1];
389 		byte b=baseToNumber[base2];
390 		if(a<0 && b<0){return 'N';}
391 		if(a<0){a=3;}
392 		if(b<0){b=3;}
393 		return COLORS[a][b];
394 	}
395 
colorToBase(byte base1, byte color)396 	public static final byte colorToBase(byte base1, byte color){
397 		if(!isFullyDefined(base1) || color<0 || color>3){
398 			return (byte)'N';
399 		}
400 		byte a=baseToNumber[base1];
401 
402 		return numberToBase[COLORS[a][color]];
403 	}
404 
405 //	public static final byte toNumber(String code){
406 //		return toNumber(code.charAt(0), code.charAt(1), code.charAt(2));
407 //	}
408 
toAA(String code)409 	public static final AminoAcid toAA(String code){
410 		return toAA(code.charAt(0), code.charAt(1), code.charAt(2));
411 	}
412 
toChar(String code)413 	public static final char toChar(String code){
414 		return toChar(code.charAt(0), code.charAt(1), code.charAt(2));
415 	}
416 
splitBase(char c)417 	public static final char[] splitBase(char c){
418 		byte b=baseToNumberExtended[c];
419 		int len=Integer.bitCount(b);
420 		char[] out=new char[len];
421 
422 		int index=0;
423 		for(int i=0; i<4; i++){
424 			if(((1<<i)&b)!=0){
425 				out[index]=(char)numberToBase[i];
426 				index++;
427 			}
428 		}
429 		return out;
430 	}
431 
432 
433 
434 
numberToBases(int code, int n)435 	public static final byte[] numberToBases(int code, int n){
436 
437 		byte[] bytes=KillSwitch.allocByte1D(n);
438 
439 		for(int i=n-1; i>=0; i--){
440 			int temp=code&3;
441 			code>>=2;
442 			bytes[i]=numberToBase[temp];
443 		}
444 
445 		return bytes;
446 	}
447 
baseTupleToNumber(byte[] tuple)448 	public static final int baseTupleToNumber(byte[] tuple){
449 
450 		int r=0;
451 		for(int i=0; i<tuple.length; i++){
452 			int temp=baseToNumberACGTN[tuple[i]];
453 			if(temp<0 || temp>3){return -1;}
454 			r=((r<<2)|temp);
455 		}
456 
457 		return r;
458 	}
459 
isFullyDefined(char base)460 	public static boolean isFullyDefined(char base){
461 		return baseToNumber[base]>=0;
462 	}
463 
isFullyDefined(byte base)464 	public static boolean isFullyDefined(byte base){
465 		return base>=0 && baseToNumber[base]>=0;
466 	}
467 
isFullyDefinedAA(byte acid)468 	public static boolean isFullyDefinedAA(byte acid){
469 		return acid>=0 && acidToNumber[acid]>=0;
470 	}
471 
isFullyDefinedAANoStops(byte acid)472 	public static boolean isFullyDefinedAANoStops(byte acid){
473 		return acid>=0 && acidToNumberNoStops[acid]>=0;
474 	}
475 
isACGTN(char base)476 	public static boolean isACGTN(char base){
477 		return baseToNumberACGTN[base]>=0;
478 	}
479 
isACGTN(byte base)480 	public static boolean isACGTN(byte base){
481 		return base>=0 && baseToNumberACGTN[base]>=0;
482 	}
483 
containsOnlyACGTN(String s)484 	public static boolean containsOnlyACGTN(String s){
485 		if(s==null || s.length()==0){return true;}
486 		for(int i=0; i<s.length(); i++){
487 			char c=s.charAt(i);
488 			if(baseToNumberACGTN[c]<0){return false;}
489 		}
490 		return true;
491 	}
492 
containsOnlyACGTNQ(String s)493 	public static boolean containsOnlyACGTNQ(String s){
494 		if(s==null || s.length()==0){return true;}
495 		for(int i=0; i<s.length(); i++){
496 			char c=s.charAt(i);
497 			if(c!='?' && baseToNumberACGTN[c]<0){return false;}
498 		}
499 		return true;
500 	}
501 
containsOnlyACGTN(byte[] array)502 	public static boolean containsOnlyACGTN(byte[] array){
503 		if(array==null || array.length==0){return true;}
504 		for(int i=0; i<array.length; i++){
505 			byte b=array[i];
506 			if(b<0 || baseToNumberACGTN[b]<0){return false;}
507 		}
508 		return true;
509 	}
510 
isFullyDefined(String s)511 	public static boolean isFullyDefined(String s){
512 		for(int i=0; i<s.length(); i++){
513 			if(!isFullyDefined(s.charAt(i))){return false;}
514 		}
515 		return true;
516 	}
517 
isFullyDefined(byte[] s)518 	public static boolean isFullyDefined(byte[] s){
519 		for(int i=0; i<s.length; i++){
520 			if(!isFullyDefined(s[i])){return false;}
521 		}
522 		return true;
523 	}
524 
countUndefined(byte[] s)525 	public static int countUndefined(byte[] s){
526 		int x=0;
527 		for(int i=0; i<s.length; i++){
528 			if(!isFullyDefined(s[i])){x++;}
529 		}
530 		return x;
531 	}
532 
countDefined(byte[] s)533 	public static int countDefined(byte[] s){
534 		if(s==null){return 0;}
535 		int x=0;
536 		for(int i=0; i<s.length; i++){
537 			if(isFullyDefined(s[i])){x++;}
538 		}
539 		return x;
540 	}
541 
toNumber(String s)542 	public static final byte toNumber(String s){
543 		assert(s.length()==3);
544 		int num=0;
545 		for(int i=0; i<3; i++){
546 			char c=s.charAt(i);
547 			int x=baseToNumber[c];
548 			if(x<0){return (byte)-1;}
549 			num=(num<<2)|x;
550 		}
551 		return (byte)num;
552 	}
553 
toNumber(char c1, char c2, char c3)554 	public static final byte toNumber(char c1, char c2, char c3){
555 		assert(baseToNumberACGTN2[c1]>=0 && baseToNumberACGTN2[c2]>=0 && baseToNumberACGTN2[c3]>=0);
556 		int x=(baseToNumberACGTN2[c1]<<4)|(baseToNumberACGTN2[c2]<<2)|(baseToNumberACGTN2[c3]);
557 		return (byte)x;
558 	}
559 
toAA(char c1, char c2, char c3)560 	public static final AminoAcid toAA(char c1, char c2, char c3){
561 		assert(baseToNumberACGTN2[c1]>=0 && baseToNumberACGTN2[c2]>=0 && baseToNumberACGTN2[c3]>=0);
562 		int x=(baseToNumberACGTN2[c1]<<4)|(baseToNumberACGTN2[c2]<<2)|(baseToNumberACGTN2[c3]);
563 		return codeToAA[x];
564 	}
565 
toChar(char c1, char c2, char c3)566 	public static final char toChar(char c1, char c2, char c3){
567 		assert(baseToNumberACGTN2[c1]>=0 && baseToNumberACGTN2[c2]>=0 && baseToNumberACGTN2[c3]>=0);
568 		int x=(baseToNumberACGTN2[c1]<<4)|(baseToNumberACGTN2[c2]<<2)|(baseToNumberACGTN2[c3]);
569 		return codeToChar[x];
570 	}
571 
toByte(byte c1, byte c2, byte c3)572 	public static final byte toByte(byte c1, byte c2, byte c3){
573 		int a=baseToNumber[c1], b=baseToNumber[c2], c=baseToNumber[c3];
574 		if(a<0 || b<0 || c<0){return (byte)'X';}
575 		int x=((a<<4)|(b<<2)|c);
576 		return codeToByte[x];
577 	}
578 
toChar(byte c1, byte c2, byte c3)579 	public static final char toChar(byte c1, byte c2, byte c3){
580 		assert(baseToNumberACGTN2[c1]>=0 && baseToNumberACGTN2[c2]>=0 && baseToNumberACGTN2[c3]>=0);
581 		byte n1=baseToNumberACGTN2[c1], n2=baseToNumberACGTN2[c2], n3=baseToNumberACGTN2[c3];
582 		if(n1>3 || n2>3 || n3>3){return '?';}
583 		int x=(n1<<4)|(n2<<2)|(n3);
584 //		return (x<codeToChar.length ? codeToChar[x] : '?');
585 		return codeToChar[x];
586 	}
587 
stringToAAs(String bases)588 	public static final String stringToAAs(String bases){
589 		StringBuilder sb=new StringBuilder(bases.length()/3);
590 		for(int i=2; i<bases.length(); i+=3){
591 			char a=toAA(bases.charAt(i-2), bases.charAt(i-1), bases.charAt(i)).letter;
592 			sb.append(a);
593 		}
594 		return sb.toString();
595 	}
596 
toAAsSixFrames(byte[] bases)597 	public static final byte[][] toAAsSixFrames(byte[] bases){
598 		byte[][] out=new byte[6][];
599 		if(bases!=null && bases.length>2){
600 			for(int i=0; i<3; i++){
601 				out[i]=toAAs(bases, i);
602 			}
603 			byte[] rcomp=reverseComplementBases(bases);
604 			for(int i=0; i<3; i++){
605 				out[i+3]=toAAs(rcomp, i);
606 			}
607 		}
608 		return out;
609 	}
610 
toQualitySixFrames(byte[] quals, int offset)611 	public static final byte[][] toQualitySixFrames(byte[] quals, int offset){
612 		byte[][] out=new byte[6][];
613 		if(quals!=null && quals.length>2){
614 			for(int i=0; i<3; i++){
615 				out[i]=toAAQuality(quals, i);
616 			}
617 			Tools.reverseInPlace(quals);
618 			for(int i=0; i<3; i++){
619 				out[i+3]=toAAQuality(quals, i);
620 			}
621 			Tools.reverseInPlace(quals);
622 		}
623 
624 		if(offset!=0){
625 			for(byte[] array : out){
626 				if(array!=null){
627 					for(int i=0; i<array.length; i++){
628 						array[i]+=offset;
629 					}
630 				}
631 			}
632 		}
633 
634 		return out;
635 	}
636 
toAAs(byte[] bases, int frame)637 	public static final byte[] toAAs(byte[] bases, int frame){
638 		assert(frame>=0 && frame<3);
639 		if(bases==null){return null;}
640 		int blen=bases.length-frame;
641 		if(blen<3){return null;}
642 		blen=blen-(blen%3);
643 		final int stop=frame+blen;
644 		final int alen=blen/3;
645 
646 		byte[] out=KillSwitch.allocByte1D(alen);
647 		for(int i=2+frame, j=0; i<stop; i+=3, j++){
648 			byte a=toByte(bases[i-2], bases[i-1], bases[i]);
649 			out[j]=a;
650 		}
651 		return out;
652 	}
653 
toAAs(byte[] bases, int start, int stop)654 	public static final byte[] toAAs(byte[] bases, int start, int stop){
655 		if(bases==null){return null;}
656 		stop-=2;
657 		final int blen=stop-start;
658 		final int alen=blen/3;
659 
660 		byte[] out=KillSwitch.allocByte1D(alen);
661 		for(int i=2+start, j=0; i<stop; i+=3, j++){
662 			byte a=toByte(bases[i-2], bases[i-1], bases[i]);
663 			out[j]=a;
664 		}
665 		return out;
666 	}
667 
toAAQuality(byte[] quals, int frame)668 	public static final byte[] toAAQuality(byte[] quals, int frame){
669 		assert(frame>=0 && frame<3);
670 		int blen=quals.length-frame;
671 		if(blen<3){return null;}
672 		blen=blen-(blen%3);
673 		final int stop=frame+blen;
674 		final int alen=blen/3;
675 
676 		byte[] out=KillSwitch.allocByte1D(alen);
677 		for(int i=2+frame, j=0; i<stop; i+=3, j++){
678 			byte qa=quals[i-2], qb=quals[i-1], qc=quals[i];
679 			float pa=QualityTools.PROB_CORRECT[qa], pb=QualityTools.PROB_CORRECT[qb], pc=QualityTools.PROB_CORRECT[qc];
680 			float p=pa*pb*pc;
681 			byte q=QualityTools.probCorrectToPhred(p);
682 			out[j]=q;
683 
684 //			System.out.println();
685 //			System.out.println(qa+", "+qb+", "+qc+" -> "+q);
686 //			System.out.println(pa+", "+pb+", "+pc+" -> "+p);
687 
688 		}
689 //		System.out.println(Arrays.toString(out));
690 		return out;
691 	}
692 
toNTs(final byte[] aminos)693 	public static final byte[] toNTs(final byte[] aminos){
694 		if(aminos==null){return null;}
695 		final int alen=aminos.length;
696 		final int blen=alen*3;
697 
698 		final byte[] out=KillSwitch.allocByte1D(blen);
699 		for(int i=0, j=0; i<alen; i++, j+=3){
700 			int code=aminoToCode[aminos[i]];
701 			out[j+2]=numberToBase[(code&3)];
702 			out[j+1]=numberToBase[((code>>2)&3)];
703 			out[j]=numberToBase[((code>>4)&3)];
704 		}
705 		return out;
706 	}
707 
	/** Element i is the reverse complement of the 4-base (8-bit) kmer i; used by reverseComplementBinaryFast */
	public static final short[] rcompBinaryTable=makeBinaryRcompTable(4);
709 
makeBinaryRcompTable(int k)710 	private static final short[] makeBinaryRcompTable(int k){
711 		int bits=2*k;
712 		short[] r=new short[1<<bits];
713 		for(int i=0; i<r.length; i++){
714 			r[i]=(short)reverseComplementBinary(i, k);
715 		}
716 		return r;
717 	}
718 
	/* Populates every lookup table declared above. Later sections read tables
	 * filled by earlier ones (for example toNumber reads baseToNumber, and
	 * iupacToNocall reads baseToNumber and baseToNumberExtended), so the
	 * statement order matters. */
	static {

		//U<->T conversion tables: identity except for the two letters.
		for(int i=0; i<uToT.length; i++){uToT[i]=(byte)i;}
		uToT['u']='t';
		uToT['U']='T';

		for(int i=0; i<tToU.length; i++){tToU[i]=(byte)i;}
		tToU['t']='u';
		tToU['T']='U';

		//Nocall tables start as identity; iupacToNocall is completed at the end of this block.
		for(int i=0; i<dotDashXToNocall.length; i++){
			dotDashXToNocall[i]=(byte)i;
			iupacToNocall[i]=(byte)i;
		}
		dotDashXToNocall['.']='N';
		dotDashXToNocall['-']='N';
		dotDashXToNocall['X']='N';
		dotDashXToNocall['x']='N';
		dotDashXToNocall['n']='N';

		for(int i=0; i<dotDashXToNocallAA.length; i++){dotDashXToNocallAA[i]=(byte)i;}
		dotDashXToNocallAA['.']='X';
		dotDashXToNocallAA['-']='X';
		dotDashXToNocallAA['X']='X';
		dotDashXToNocallAA['x']='X';

		//Case tables: 'a'-'z' are shifted by 32 or replaced by a nocall symbol.
		for(int i=0; i<toUpperCase.length; i++){
			toUpperCase[i]=(byte) ((i>='a' && i<='z') ? i-32 : i);
			lowerCaseToNocall[i]=((i>='a' && i<='z') ? (byte)'N' : (byte)i);
			lowerCaseToNocallAA[i]=((i>='a' && i<='z') ? (byte)'.' : (byte)i);
		}


		Arrays.fill(baseToACGTN, (byte)'N');

		//IUPAC symbol -> bitmask code, both cases; U shares T's code.
		Arrays.fill(baseToNumberExtended, (byte)-1);
		for(int i=0; i<numberToBaseExtended.length; i++){
			char x=(char)numberToBaseExtended[i];
			if(!Character.isWhitespace(x)){
				baseToNumberExtended[x]=(byte)i;
				baseToNumberExtended[Tools.toLowerCase(x)]=(byte)i;
			}
		}
		baseToNumberExtended['U']=8;
		baseToNumberExtended['u']=8;

		//A, C, G, T, N -> 0-4, both cases; U shares T's code.
		Arrays.fill(baseToNumberACGTN, (byte)-1);
		Arrays.fill(baseToNumberACGTother, (byte)4);
		for(int i=0; i<numberToBase.length; i++){
			char x=(char)numberToBase[i];
			if(!Character.isWhitespace(x)){
				baseToNumberACGTN[x]=baseToNumberACGTother[x]=(byte)i;
				baseToNumberACGTN[Tools.toLowerCase(x)]=baseToNumberACGTother[Tools.toLowerCase(x)]=(byte)i;
				baseToACGTN[x]=baseToACGTN[Tools.toLowerCase(x)]=(byte)x;
			}
		}
		baseToNumberACGTN['U']=baseToNumberACGTN['u']=3;
		baseToNumberACGTother['U']=baseToNumberACGTother['u']=3;
		baseToACGTN['U']=baseToACGTN['u']=(byte)'T';

		//ACGTN2 is a copy of ACGTN in which N is coded as 0.
		for(int i=0; i<baseToNumberACGTN.length; i++){baseToNumberACGTN2[i]=baseToNumberACGTN[i];}
		baseToNumberACGTN2['N']=0;
		baseToNumberACGTN2['n']=0;

		//Strict A, C, G, T -> 0-3, both cases; N is left undefined here.
		Arrays.fill(baseToNumber, (byte)-1);
		Arrays.fill(baseToNumber0, (byte)0);
		for(int i=0; i<numberToBase.length; i++){
			char x=(char)numberToBase[i];
			if(x=='A' || x=='C' || x=='G' || x=='T'){
				baseToNumber0[x]=baseToNumber[x]=(byte)i;
				baseToNumber0[Tools.toLowerCase(x)]=baseToNumber[Tools.toLowerCase(x)]=(byte)i;
			}
		}
		baseToNumber0['U']=baseToNumber['U']=3;
		baseToNumber0['u']=baseToNumber['u']=3;

		Arrays.fill(baseToComplementNumber, (byte)-1);
		baseToComplementNumber['A']=baseToComplementNumber['a']=3;
		baseToComplementNumber['C']=baseToComplementNumber['c']=2;
		baseToComplementNumber['G']=baseToComplementNumber['g']=1;
		baseToComplementNumber['T']=baseToComplementNumber['t']=0;
		baseToComplementNumber['U']=baseToComplementNumber['u']=0;

		Arrays.fill(baseToComplementNumber0, (byte)0);
		baseToComplementNumber0['A']=baseToComplementNumber0['a']=3;
		baseToComplementNumber0['C']=baseToComplementNumber0['c']=2;
		baseToComplementNumber0['G']=baseToComplementNumber0['g']=1;
		baseToComplementNumber0['T']=baseToComplementNumber0['t']=0;
		baseToComplementNumber0['U']=baseToComplementNumber0['u']=0;

		//Invalid symbols are unchanged.
		//This prevents crashes from -1 being out of bounds, and allows
		//consecutive rcomp operations to restore the original sequence.
		for(int i=0; i<baseToComplementExtended.length; i++){
			baseToComplementExtended[i]=(byte)i;
		}
//		Arrays.fill(baseToComplementExtended, (byte)-1);
		for(int i=0; i<numberToBaseExtended.length; i++){
			char x=(char)numberToBaseExtended[i];
			char x2=(char)numberToComplementaryBaseExtended[i];
			baseToComplementExtended[x]=(byte)x2;
			baseToComplementExtended[Tools.toLowerCase(x)]=(byte)Tools.toLowerCase(x2);
		}
		baseToComplementExtended['U']=(byte)'A';
		baseToComplementExtended['u']=(byte)'a';
		baseToComplementExtended['?']=(byte)'?';
		baseToComplementExtended[' ']=(byte)' ';
		baseToComplementExtended['-']=(byte)'-';
		baseToComplementExtended['*']=(byte)'*';
		baseToComplementExtended['.']=(byte)'.';


		//The index in this array is the amino acid's number in acidToNumber and numberToAcid.
		AlphabeticalAAs[0]=Alanine;
		AlphabeticalAAs[1]=Arginine;
		AlphabeticalAAs[2]=Asparagine;
		AlphabeticalAAs[3]=AsparticAcid;
		AlphabeticalAAs[4]=Cysteine;
		AlphabeticalAAs[5]=GlutamicAcid;
		AlphabeticalAAs[6]=Glutamine;
		AlphabeticalAAs[7]=Glycine;
		AlphabeticalAAs[8]=Histidine;
		AlphabeticalAAs[9]=Isoleucine;
		AlphabeticalAAs[10]=Leucine;
		AlphabeticalAAs[11]=Lysine;
		AlphabeticalAAs[12]=Methionine;
		AlphabeticalAAs[13]=Phenylalanine;
		AlphabeticalAAs[14]=Proline;
		AlphabeticalAAs[15]=Serine;
		AlphabeticalAAs[16]=Threonine;
		AlphabeticalAAs[17]=Tryptophan;
		AlphabeticalAAs[18]=Tyrosine;
		AlphabeticalAAs[19]=Valine;
		AlphabeticalAAs[20]=END;
//		AlphabeticalAAs[21]=ANY;

		Arrays.fill(aminoToCode, (byte)-1);
		Arrays.fill(acidToNumber, (byte)-1);
		Arrays.fill(acidToNumber0, (byte)0);
		Arrays.fill(acidToNumber8, (byte)-1);
		for(int i=0; i<AlphabeticalAAs.length; i++){
			AminoAcid aa=AlphabeticalAAs[i];

			acidToNumber[aa.letter]=(byte)i;
			acidToNumber[Tools.toLowerCase(aa.letter)]=(byte)i;
			acidToNumber0[aa.letter]=(byte)i;
			acidToNumber0[Tools.toLowerCase(aa.letter)]=(byte)i;
			numberToAcid[i]=(byte)aa.letter;
			canonicalCodons[i]=aa.canonicalCodon();

			stringToAA.put(aa.name, aa);
			stringToAA.put(aa.symbol, aa);
			stringToAA.put(aa.letter+"", aa);
			for(int j=0; j<aa.codeStrings.length; j++){
				//Register the codon in both its U and T spellings, then keep the T form.
				String s=aa.codeStrings[j];
				stringToAA.put(s, aa);
				aa.codeStrings[j]=s.replace('U', 'T');
				stringToAA.put(aa.codeStrings[j], aa);

				int x=toNumber(s);
//				System.out.println("x="+x+", aa="+aa);
				codeToAA[x]=aa;
				codeToChar[x]=aa.letter;
				codeToByte[x]=(byte)(aa.letter);
				if(j==0){
					//The first listed codon is the one used for back-translation.
					aminoToCode[aa.letter]=(byte)x;
					aminoToCode[Tools.toLowerCase(aa.letter)]=(byte)x;
				}
			}
		}

		for(int i=0; i<acidToNumberNoStops.length; i++){acidToNumberNoStops[i]=acidToNumber[i];}
		acidToNumberNoStops[END.letter]=-1;

		for(int i=0; i<acidToNumber.length; i++){
			acidToNumberExtended[i]=acidToNumber[i];
		}

		{
			//Ambiguity symbols share one code just past the largest regular code; '-' gets the next.
			byte anySym=(byte)(Tools.max(acidToNumberExtended)+1);
			byte dash=(byte)(anySym+1);
			acidToNumberExtended['x']=acidToNumberExtended['X']=acidToNumberExtended['.']=anySym;
			acidToNumberExtended['b']=acidToNumberExtended['B']=anySym;
			acidToNumberExtended['z']=acidToNumberExtended['Z']=anySym;
			acidToNumberExtended['j']=acidToNumberExtended['J']=anySym;
			acidToNumberExtended['-']=dash;
		}

		//Reduced 8-group code; only uppercase letters are assigned.
		acidToNumber8['H']=acidToNumber8['K']=acidToNumber8['R']=0;
		acidToNumber8['D']=acidToNumber8['E']=1;
		acidToNumber8['S']=acidToNumber8['T']=acidToNumber8['N']=acidToNumber8['Q']=2;
		acidToNumber8['A']=acidToNumber8['V']=acidToNumber8['L']=acidToNumber8['I']=acidToNumber8['M']=3;
		acidToNumber8['F']=acidToNumber8['Y']=acidToNumber8['W']=4;
		acidToNumber8['P']=acidToNumber8['G']=5;
		acidToNumber8['C']=acidToNumber8['*']=6;
		acidToNumber8['B']=acidToNumber8['Z']=7;

		//Code 65 lies outside the 0-63 codon range and stands for the unknown amino acid.
		aminoToCode['X']=aminoToCode['x']=aminoToCode['B']=aminoToCode['b']=
				aminoToCode['Z']=aminoToCode['z']=aminoToCode['J']=aminoToCode['j']=65;
		codeToAA[65]=ANY;
		codeToChar[65]='X';
		codeToByte[65]='X';

		stringToAA.put("X", ANY);
		stringToAA.put("Start", Methionine);
		stringToAA.put("Begin", Methionine);
		stringToAA.put("Stop", END);
		stringToAA.put("Aspartic Acid", AsparticAcid);
		stringToAA.put("Glutamic Acid", GlutamicAcid);

		//Snapshot the keys first so the map is not modified while it is being iterated.
		String[] temp=stringToAA.keySet().toArray(new String[0]);

		for(String s : temp){
			AminoAcid aa=stringToAA.get(s);
			assert(aa!=null);
			stringToAA.put(s.toLowerCase(), aa);
		}

		for(int i=0; i<codonToString.length; i++){
			codonToString[i]=kmerToString(i, 3);
		}

		//Symbols that have an extended code but no strict ACGT code become N.
		for(int i='A'; i<='z'; i++){
			if(baseToNumber[i]<0 && baseToNumberExtended[i]>=0){
				iupacToNocall[i]='N';
			}
		}

	}
947 
948 }
949