1 /* $Id: faatran.c $ */
2
3 /* copyright (c) 1996, 1997, 1998, 1999, 2014 by William R. Pearson and
4 The Rector & Visitors of the University of Virginia */
5
6 /* Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9
10 http://www.apache.org/licenses/LICENSE-2.0
11
12 Unless required by applicable law or agreed to in writing,
13 software distributed under this License is distributed on an "AS
IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
15 express or implied. See the License for the specific language
16 governing permissions and limitations under the License.
17 */
18
19 /* aatran.c translates from nt to aa, 1 char codes */
20 /* modified July 2, 1987 for all 6 frames */
21 /* 23 Jan 1991 fixed bug for short sequences */
22
23 /* this mapping is not alphabet independent */
24
25 #define XTERNAL
26 #include <stdio.h>
27 #include <stdlib.h>
28
29 #include "upam.h"
30 #include "uascii.h"
31
32 /*
33 1. The Standard Code (transl_table=1)
34
35 By default all transl_table in GenBank flatfiles are equal to id 1, and this
36 is not shown. When transl_table is not equal to id 1, it is shown as a
37 qualifier on the CDS feature.
38
39 */
/* transl_table=1 residues in NCBI order: one 16-codon row per first base */
static char *AA1 =
  "FFLLSSSSYY**CC*W"	/* T.. */
  "LLLLPPPPHHQQRRRR"	/* C.. */
  "IIIMTTTTNNKKSSRR"	/* A.. */
  "VVVVAAAADDEEGGGG";	/* G.. */
42 /*
43 Starts = ---M---------------M---------------M----------------------------
44 Base1 = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
45 Base2 = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
46 Base3 = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
47
48 2. The Vertebrate Mitochondrial Code (transl_table=2)
49 */
/* transl_table=2 residues in NCBI order: one 16-codon row per first base */
static char *AA2 =
  "FFLLSSSSYY**CCWW"	/* T.. */
  "LLLLPPPPHHQQRRRR"	/* C.. */
  "IIMMTTTTNNKKSS**"	/* A.. */
  "VVVVAAAADDEEGGGG";	/* G.. */
52 /*
53 Starts = --------------------------------MMMM---------------M------------
54 Base1 = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
55 Base2 = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
56 Base3 = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
57
58 3. The Yeast Mitochondrial Code (transl_table=3)
59 */
/* transl_table=3 residues in NCBI order: one 16-codon row per first base */
static char *AA3 =
  "FFLLSSSSYY**CCWW"	/* T.. */
  "TTTTPPPPHHQQRRRR"	/* C.. */
  "IIMMTTTTNNKKSSRR"	/* A.. */
  "VVVVAAAADDEEGGGG";	/* G.. */
62 /*
63 Starts = -----------------------------------M----------------------------
64 Base1 = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
65 Base2 = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
66 Base3 = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
67
68 4. The Mold, Protozoan, and Coelenterate Mitochondrial Code and the
69 Mycoplasma/Spiroplasma Code (transl_table=4)
70 */
/* transl_table=4 residues in NCBI order: one 16-codon row per first base */
static char *AA4 =
  "FFLLSSSSYY**CCWW"	/* T.. */
  "LLLLPPPPHHQQRRRR"	/* C.. */
  "IIIMTTTTNNKKSSRR"	/* A.. */
  "VVVVAAAADDEEGGGG";	/* G.. */
73 /*
74 Starts = --MM---------------M------------MMMM---------------M------------
75 Base1 = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
76 Base2 = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
77 Base3 = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
78
79 5. The Invertebrate Mitochondrial Code (transl_table=5)
80 */
/* transl_table=5 residues in NCBI order: one 16-codon row per first base */
static char *AA5 =
  "FFLLSSSSYY**CCWW"	/* T.. */
  "LLLLPPPPHHQQRRRR"	/* C.. */
  "IIMMTTTTNNKKSSSS"	/* A.. */
  "VVVVAAAADDEEGGGG";	/* G.. */
83 /*
84 Starts = ---M----------------------------MMMM---------------M------------
85 Base1 = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
86 Base2 = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
87 Base3 = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
88
89 6. The Ciliate, Dasycladacean and Hexamita Nuclear Code (transl_table=6)
90 */
/* transl_table=6 residues in NCBI order: one 16-codon row per first base */
static char *AA6 =
  "FFLLSSSSYYQQCC*W"	/* T.. */
  "LLLLPPPPHHQQRRRR"	/* C.. */
  "IIIMTTTTNNKKSSRR"	/* A.. */
  "VVVVAAAADDEEGGGG";	/* G.. */
93 /*
94 Starts = -----------------------------------M----------------------------
95 Base1 = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
96 Base2 = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
97 Base3 = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
98
99 9. The Echinoderm Mitochondrial Code (transl_table=9)
100 */
/* Echinoderm mitochondrial residues (labelled transl_table=9 in the
   surrounding comment, but named AA7 and selected by tr_type 7 in aainit());
   NCBI order, one 16-codon row per first base */
static char *AA7 =
  "FFLLSSSSYY**CCWW"	/* T.. */
  "LLLLPPPPHHQQRRRR"	/* C.. */
  "IIIMTTTTNNNKSSSS"	/* A.. */
  "VVVVAAAADDEEGGGG";	/* G.. */
103 /*
104 Starts = -----------------------------------M----------------------------
105 Base1 = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
106 Base2 = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
107 Base3 = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
108
109 10. The Euplotid Nuclear Code (transl_table=10)
110 */
/* transl_table=10 residues in NCBI order: one 16-codon row per first base */
static char *AA10 =
  "FFLLSSSSYY**CCCW"	/* T.. */
  "LLLLPPPPHHQQRRRR"	/* C.. */
  "IIIMTTTTNNKKSSRR"	/* A.. */
  "VVVVAAAADDEEGGGG";	/* G.. */
113 /*
114 Starts = -----------------------------------M----------------------------
115 Base1 = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
116 Base2 = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
117 Base3 = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
118
119 11. The Bacterial "Code" (transl_table=11)
120 */
/* transl_table=11 residues in NCBI order: one 16-codon row per first base
   (same residues as the standard code; only the start codons differ) */
static char *AA11 =
  "FFLLSSSSYY**CC*W"	/* T.. */
  "LLLLPPPPHHQQRRRR"	/* C.. */
  "IIIMTTTTNNKKSSRR"	/* A.. */
  "VVVVAAAADDEEGGGG";	/* G.. */
123 /*
124 Starts = ---M---------------M------------MMMM---------------M------------
125 Base1 = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
126 Base2 = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
127 Base3 = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
128
129 12. The Alternative Yeast Nuclear Code (transl_table=12)
130 */
/* transl_table=12 residues in NCBI order: one 16-codon row per first base */
static char *AA12 =
  "FFLLSSSSYY**CC*W"	/* T.. */
  "LLLSPPPPHHQQRRRR"	/* C.. */
  "IIIMTTTTNNKKSSRR"	/* A.. */
  "VVVVAAAADDEEGGGG";	/* G.. */
133 /*
134 Starts = -------------------M---------------M----------------------------
135 Base1 = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
136 Base2 = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
137 Base3 = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
138
139 13. The Ascidian Mitochondrial Code (transl_table=13)
140 */
/* transl_table=13 residues in NCBI order: one 16-codon row per first base */
static char *AA13 =
  "FFLLSSSSYY**CCWW"	/* T.. */
  "LLLLPPPPHHQQRRRR"	/* C.. */
  "IIMMTTTTNNKKSSGG"	/* A.. */
  "VVVVAAAADDEEGGGG";	/* G.. */
143 /*
144 Starts = -----------------------------------M----------------------------
145 Base1 = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
146 Base2 = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
147 Base3 = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
148
149 14. The Flatworm Mitochondrial Code (transl_table=14)
150 */
/* transl_table=14 residues in NCBI order: one 16-codon row per first base */
static char *AA14 =
  "FFLLSSSSYYY*CCWW"	/* T.. */
  "LLLLPPPPHHQQRRRR"	/* C.. */
  "IIIMTTTTNNNKSSSS"	/* A.. */
  "VVVVAAAADDEEGGGG";	/* G.. */
153 /*
154 Starts = -----------------------------------M----------------------------
155 Base1 = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
156 Base2 = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
157 Base3 = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
158
159 15. Blepharisma Nuclear Code (transl_table=15)
160 */
/* transl_table=15 residues in NCBI order: one 16-codon row per first base */
static char *AA15 =
  "FFLLSSSSYY*QCC*W"	/* T.. */
  "LLLLPPPPHHQQRRRR"	/* C.. */
  "IIIMTTTTNNKKSSRR"	/* A.. */
  "VVVVAAAADDEEGGGG";	/* G.. */
163 /*
164 Starts = -----------------------------------M----------------------------
165 Base1 = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
166 Base2 = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
167 Base3 = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
168 */
169
/* genetic code id 16 residues in NCBI order: one 16-codon row per first base */
static char *AA16 =
  "FFLLSSSSYY*LCC*W"	/* T.. */
  "LLLLPPPPHHQQRRRR"	/* C.. */
  "IIIMTTTTNNKKSSRR"	/* A.. */
  "VVVVAAAADDEEGGGG";	/* G.. */
172 /*
173 id 16 ,
174 name "Chlorophycean Mitochondrial" ,
175 sncbieaa "-----------------------------------M----------------------------"
176 -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
177 -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
178 -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
179 */
180
/* genetic code id 21 residues in NCBI order: one 16-codon row per first base */
static char *AA21 =
  "FFLLSSSSYY**CCWW"	/* T.. */
  "LLLLPPPPHHQQRRRR"	/* C.. */
  "IIMMTTTTNNNKSSSS"	/* A.. */
  "VVVVAAAADDEEGGGG";	/* G.. */
183 /*
184 name "Trematode Mitochondrial" ,
185 id 21 ,
186 sncbieaa "-----------------------------------M---------------M------------"
187 -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
188 -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
189 -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
190 */
191
/* genetic code id 22 residues in NCBI order: one 16-codon row per first base */
static char *AA22 =
  "FFLLSS*SYY*LCC*W"	/* T.. */
  "LLLLPPPPHHQQRRRR"	/* C.. */
  "IIIMTTTTNNKKSSRR"	/* A.. */
  "VVVVAAAADDEEGGGG";	/* G.. */
194 /*
195 name "Scenedesmus obliquus Mitochondrial" ,
196 id 22 ,
197 sncbieaa "-----------------------------------M----------------------------"
198 -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
199 -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
200 -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
201 */
202
/* genetic code id 23 residues in NCBI order: one 16-codon row per first base */
static char *AA23 =
  "FF*LSSSSYY**CC*W"	/* T.. */
  "LLLLPPPPHHQQRRRR"	/* C.. */
  "IIIMTTTTNNKKSSRR"	/* A.. */
  "VVVVAAAADDEEGGGG";	/* G.. */
205 /*
206 name "Thraustochytrium Mitochondrial" ,
207 id 23 ,
208 sncbieaa "--------------------------------M--M---------------M------------"
209 -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
210 -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
211 -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
212 */
213
214
/* Working codon -> one-letter residue table, initialised to the standard
   code and overwritten in place by aainit() for other codes.
   Index = (base1<<4) + (base2<<2) + base3 with A=0, C=1, G=2, T=3;
   each row below lists the four third-base choices A, C, G, T. */
static char aacmap[64] = {
  'K','N','K','N',	/* AA. */
  'T','T','T','T',	/* AC. */
  'R','S','R','S',	/* AG. */
  'I','I','M','I',	/* AT. */
  'Q','H','Q','H',	/* CA. */
  'P','P','P','P',	/* CC. */
  'R','R','R','R',	/* CG. */
  'L','L','L','L',	/* CT. */
  'E','D','E','D',	/* GA. */
  'A','A','A','A',	/* GC. */
  'G','G','G','G',	/* GG. */
  'V','V','V','V',	/* GT. */
  '*','Y','*','Y',	/* TA. */
  'S','S','S','S',	/* TC. */
  '*','C','W','C',	/* TG. */
  'L','F','L','F'	/* TT. */
};
221
/* residue codes per codon index, filled in by aainit() from aacmap */
static int aamap[64] = {0};	/* forward strand */
static int aamapr[64] = {0};	/* reverse strand: entry i holds the residue
				   for the complemented codon (~i)&63 */
224
/* tnt maps an encoded nucleotide to a 2-bit base value (A=0, C=1, G=2,
   T/U=3) for aatran().  It is used only in this file and must stay
   consistent with lascii and the nt alphabet; T and U are separate codes,
   hence the repeated 3,3.
   NOTE(review): the letter labels below assume the same
   "- A C G T U R Y M W S K D H V B N X" ordering documented for snt, and
   that codes 18-34 are a second copy of that alphabet -- confirm against
   lascii. */
static int tnt[] = {
  0,					/* code 0 (presumably '-') */
  0,1,2,3,3,0,1,0,0,1,2,0,0,0,1,0,0,	/* codes 1-17:  A C G T U R Y M W S K D H V B N X */
  0,1,2,3,3,0,1,0,0,1,2,0,0,0,1,0,0	/* codes 18-34: same pattern repeated */
};
230
/* copy of the debug argument last passed to aainit(); enables the
   DEBUG-build range check inside aatran() */
static int debug_set = 0;
232
233 int
aatran(const unsigned char * ntseq,unsigned char * aaseq,int maxs,int frame)234 aatran(const unsigned char *ntseq, unsigned char *aaseq, int maxs, int frame)
235 {
236 int iaa, im, nna, i;
237 register int *nnp;
238 const unsigned char *nts0;
239 register int *aamp;
240 register unsigned char *aap;
241
242 iaa=nna=(maxs-(frame<3?frame:frame-3))/3;
243 if (nna <= 3 ) {
244 aaseq[0]=EOSEQ;
245 return 0;
246 }
247
248 nnp = tnt;
249
250 if (frame < 3) {
251 aamp = aamap;
252 nts0 = &ntseq[frame];
253 aap = aaseq;
254 while (nna--) {
255 im = nnp[*nts0++]<<4;
256 im += nnp[*nts0++]<<2;
257 im += nnp[*nts0++];
258 *aap++ = aamp[im];
259
260 /* this check is included because of a bug in tfasty
261 which occurs only during the alignment process */
262
263 #ifdef DEBUG
264 if (debug_set && aamp[im] > MAXUC) {
265 fprintf(stderr,"faatran: %d %d %d %d %d?%d\n",
266 *(nts0-3),*(nts0-2),*(nts0-1), im, aamp[im],aamap[im]);
267
268 /* this allows recovery, but should not be done frequently */
269 for (i=0; i<64; i++) {
270 aamap[i]=aascii[aacmap[i]];
271 aamapr[i]=aascii[aacmap[(~i)&63]];
272 }
273 *(aap-1) = aamp[im];
274 }
275 #endif
276 }
277 }
278 else {
279 aamp = aamapr;
280 nts0 = &ntseq[maxs-(frame-3)];
281 aap = aaseq;
282 while (nna--) {
283 im = nnp[*--nts0]<<4;
284 im += nnp[*--nts0]<<2;
285 im += nnp[*--nts0];
286 *aap++ = aamp[im];
287 /* this check is included because of a bug in tfasty
288 which occurs only during the alignment process */
289
290 #ifdef DEBUG
291 if (debug_set && aamp[im] > MAXUC) {
292 fprintf(stderr,"faatran: %d %d %d %d %d?%d\n",
293 *(nts0-3),*(nts0-2),*(nts0-1), im, aamp[im],aamap[im]);
294
295 /* this allows recovery, but should not be done frequently */
296 for (i=0; i<64; i++) {
297 aamap[i]=aascii[aacmap[i]];
298 aamapr[i]=aascii[aacmap[(~i)&63]];
299 }
300 *(aap-1) = aamp[im];
301 }
302 #endif
303 }
304 }
305 aaseq[iaa]=EOSEQ;
306 return iaa;
307 }
308
309 /* slower version that masks out NNN,XXX */
310
/* snt: encoded nucleotide -> 2-bit base value for saatran(); the value 4
   marks a code that saatran() reports as 'X' instead of translating.
   NOTE(review): snt has 18 entries while tnt has 35 -- confirm that
   saatran() never sees nucleotide codes above 17. */
static int snt[] = {
  0,				/* -               */
  0, 1, 2, 3, 3,		/* A C G T U       */
  0, 1, 0, 0,			/* R Y M W         */
  4, 4, 4, 4, 4, 4, 4, 4	/* S K D H V B N X */
};
313
314 int
saatran(const unsigned char * ntseq,unsigned char * aaseq,int maxs,int frame)315 saatran(const unsigned char *ntseq,
316 unsigned char *aaseq, int maxs, int frame)
317 {
318 int iaa, im, it, nna, xflag;
319 register int *nnp;
320 const unsigned char *nts0;
321 register int *aamp;
322 register unsigned char *aap;
323
324 iaa=nna=(maxs-(frame<3?frame:frame-3))/3;
325 if (nna <= 3 ) {
326 aaseq[0]=EOSEQ;
327 return 0;
328 }
329
330 nnp = snt;
331 if (frame < 3) {
332 aamp = aamap;
333 nts0 = &ntseq[frame];
334 aap = aaseq;
335 while (nna--) {
336 xflag = 0;
337 if ((it=nnp[*nts0++])<4) {im = it<<4;}
338 else {xflag = 1; im=0;}
339 if ((it=nnp[*nts0++])<4) {im += it<<2;}
340 else xflag = 1;
341 if ((it=nnp[*nts0++])<4) {im += it;}
342 else xflag = 1;
343 if (xflag) *aap++ = aascii['X'];
344 else *aap++ = aamp[im];
345 }
346 }
347 else {
348 aamp = aamapr;
349 nts0 = &ntseq[maxs-(frame-3)];
350 aap = aaseq;
351 while (nna--) {
352 xflag = 0;
353 if ((it=nnp[*--nts0]) < 4) im = it<<4;
354 else {xflag = 1; im=0;}
355 if ((it=nnp[*--nts0]) < 4) im += it<<2;
356 else xflag = 1;
357 if ((it=nnp[*--nts0]) < 4) im += it;
358 else xflag = 1;
359 if (xflag) *aap++ = aascii['X'];
360 else *aap++ = aamp[im];
361 }
362 }
363 aaseq[iaa]=EOSEQ;
364 return iaa;
365 }
366
367 void
aainit(int tr_type,int debug)368 aainit(int tr_type, int debug)
369 {
370 int i,j;
371 char *aasmap;
372 int ascii_star;
373 int imap[4]={3,1,0,2}, i0, i1, i2, ii;
374
375 debug_set = debug;
376
377 aasmap = AA1;
378
379 ascii_star = aascii['*'];
380 aascii['*'] = TERM;
381
382 if (tr_type > 0) {
383 /* need to put in a new translation table */
384 switch (tr_type) {
385 case 1: aasmap = AA1; break;
386 case 2: aasmap = AA2; break;
387 case 3: aasmap = AA3; break;
388 case 4: aasmap = AA4; break;
389 case 5: aasmap = AA5; break;
390 case 6: aasmap = AA6; break;
391 case 7: aasmap = AA7; break;
392 case 10: aasmap = AA10; break;
393 case 11: aasmap = AA11; break;
394 case 12: aasmap = AA12; break;
395 case 13: aasmap = AA13; break;
396 case 14: aasmap = AA14; break;
397 case 15: aasmap = AA15; break;
398 case 16: aasmap = AA16; break;
399 case 21: aasmap = AA21; break;
400 case 22: aasmap = AA22; break;
401 case 23: aasmap = AA23; break;
402
403 default: aasmap = AA1; break;
404 }
405
406 if (debug) fprintf(stderr," codon table: %d\n new old\n",tr_type);
407 for (i0 = 0; i0 < 4; i0++)
408 for (i1 = 0; i1 < 4; i1++)
409 for (i2 = 0; i2 < 4; i2++) {
410 ii = (imap[i0]<<4) + (imap[i1]<<2) + imap[i2];
411 if (debug && aacmap[ii] != *aasmap) {
412 fprintf(stderr," %c%c%c: %c - %c\n",
413 nt[imap[i0]+1],nt[imap[i1]+1],nt[imap[i2]+1],
414 *aasmap,aacmap[ii]);
415 }
416 aacmap[ii]= *aasmap++;
417 }
418
419
420 for (i=0; i<64; i++) {
421 fprintf(stderr,"'%c',",aacmap[i]);
422 if ((i%16)==15) fputc('\n',stderr);
423 }
424 fputc('\n',stderr);
425
426 }
427 for (i=0; i<64; i++) {
428 aamap[i]=aascii[aacmap[i]];
429 if (aamap[i] > TERM) {
430 fprintf(stderr," *** error - codon out of range: %d %d (%c)\n",i,aamap[i], NCBIstdaa_l[aamap[i]] );
431 }
432 aamapr[i]=aascii[aacmap[(~i)&63]];
433 if (aamapr[i] > TERM) {
434 fprintf(stderr," *** error - codon_r out of range: %d %d (%c)\n",i,aamapr[i], NCBIstdaa_l[aamapr[i]]);
435 }
436 }
437 aascii['*'] = ascii_star;
438 }
439
440 void
aagetmap(char * to,int n)441 aagetmap(char *to, int n)
442 {
443 int i;
444 for (i=0; i<n; i++) to[i] = aacmap[i];
445 }
446