1 /****************************************************************\
2 *                                                                *
3 *  Nucleotide Translation Code                                   *
4 *                                                                *
5 *  Guy St.C. Slater..   mailto:guy@ebi.ac.uk                     *
6 *  Copyright (C) 2000-2009.  All Rights Reserved.                *
7 *                                                                *
8 *  This source code is distributed under the terms of the        *
9 *  GNU General Public License, version 3. See the file COPYING   *
10 *  or http://www.gnu.org/licenses/gpl.txt for details            *
11 *                                                                *
12 *  If you use this code, please keep this notice intact.         *
13 *                                                                *
14 \****************************************************************/
15 
16 #include <ctype.h>
17 #include <string.h>
18 #include <stdlib.h> /* For atoi() */
19 
20 #include "translate.h"
21 
22 /* ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
23      CTAG
24  0 - 0000 - blank          Representation of Nucleotides
25  1 G 0001 C                -----------------------------
26  2 A 0010 T
27  3 R 0011 Y (AG)       The standard (IUB) nucleotides
28  4 T 0100 A            are used in the array "-GARTKWDCSMVYBHN"
29  5 K 0101 M (GT)       so that:
30  6 W 0110 W (AT)
31  7 D 0111 H (AGT)       1: Four bits per base are used.
32  8 C 1000 G             2: A bit is set for each base represented.
33  9 S 1001 S (CG)        3: Any base reversed is it's complement.
34 10 M 1010 K (AC)
35 11 V 1011 B (ACG)
36 12 Y 1100 R (CT)
37 13 B 1101 V (CGT)
38 14 H 1110 D (ACT)
39 15 N 1111 N (ATGC)
40  :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: */
41 
42 /**/
43 
Translate_ArgumentSet_create(Argument * arg)44 Translate_ArgumentSet *Translate_ArgumentSet_create(Argument *arg){
45     register ArgumentSet *as;
46     static Translate_ArgumentSet tas = {NULL};
47     if(arg){
48         as = ArgumentSet_create("Translation Options");
49         ArgumentSet_add_option(as, '\0', "geneticcode", NULL,
50             "Use built-in or custom genetic code", "1",
51             Argument_parse_string, &tas.genetic_code);
52         Argument_absorb_ArgumentSet(arg, as);
53         }
54     return &tas;
55     }
56 
57 /**/
58 
Translate_initialise_nucleotide_data(Translate * t)59 static void Translate_initialise_nucleotide_data(Translate *t){
60     register gint i;
61     for(i = 0; i < Translate_NT_SET_SIZE; i++)
62         t->nt2d[t->nt[i]] = t->nt2d[tolower(t->nt[i])] = i;
63     t->nt2d['X'] = t->nt2d['x'] = t->nt2d['N'];
64     t->nt2d['U'] = t->nt2d['u'] = t->nt2d['T'];
65     return;
66     }
67 
Translate_initialise_peptide_data(Translate * t)68 static void Translate_initialise_peptide_data(Translate *t){
69     register gint i, j;
70     guchar pimagrp[Translate_PIMA_SET_SIZE][6] = {
71      "aIV", "bLM",  "dFWY", "lND",   "kDE",  "oEQ",
72      "nKR", "iST",  "hAG",  "cab",   "edH",  "mlk",
73      "pon", "jihP", "fCcd", "rHmpi", "xfrj", "Xx*" };
74     for(i = 0; i < Translate_AA_SET_SIZE; i++)
75         t->aa2d[t->aa[i]] = i;
76     for(i = 1; i < 23; i++)
77         t->aamask[i] = (1L<<(i-1));  /* First is zero  */
78     for(i = 0; i < Translate_PIMA_SET_SIZE; i++){
79         t->aamask[t->aa2d[pimagrp[i][0]]]
80       = t->aamask[t->aa2d[pimagrp[i][1]]];
81         for(j = 2; pimagrp[i][j]; j++)
82             t->aamask[t->aa2d[pimagrp[i][0]]]
83          |= t->aamask[t->aa2d[pimagrp[i][j]]];
84         }
85     return;
86     }
87 
Translate_initialise_translation_data(Translate * t)88 static void Translate_initialise_translation_data(Translate *t){
89     register gchar a, b, c, x, y, z;
90     register gint i, tmp;
91     for(x = 0; x < 16; x++){
92         for(y = 0; y < 16; y++){
93             for(z = 0; z < 16; z++){
94                 tmp = 0;
95                 for(a = 0; a < 4; a++){
96                     if(x != (x|1<<a))
97                         continue;
98                     for(b = 0; b < 4; b++){
99                         if(y != (y|1<<b))
100                             continue;
101                         for(c = 0; c < 4; c++){
102                             if(z != (z|1<<c))
103                                 continue;
104                             tmp = (tmp|t->aamask[
105                                 t->aa2d[t->code[((a<<4)|(b<<2)|c)]]]);
106                             }
107                         }
108                     }
109                 for(i = 0; t->aamask[i] != (tmp|t->aamask[i]); i++);
110                 t->trans[x|(y<<4)|(z<<8)] = i;
111                 }
112             }
113        }
114     return;
115     }
116 
Translate_initialise_reverse_translate_data(Translate * t)117 static void Translate_initialise_reverse_translate_data(Translate *t){
118     register gint i;
119     for(i = 0; t->code[i]; i++){
120         if(!t->revtrans[t->code[i]])
121             t->revtrans[t->code[i]] = g_ptr_array_new();
122         g_ptr_array_add(t->revtrans[t->code[i]], GINT_TO_POINTER(i));
123         }
124     return;
125     }
126 
127 /**/
128 
Translate_convert_genetic_code(gchar * code)129 static gchar *Translate_convert_genetic_code(gchar *code){
130     register gint a, b, c, n = 0;
131     gint table[4] = {3, 2, 0, 1};
132     register gchar *result;
133     g_assert(code && (strlen(code) == 64));
134     result = g_new(gchar, 65);
135     result[64] = '\0';
136     for(a = 0; a < 4; a++)
137         for(b = 0; b < 4; b++)
138             for(c = 0; c < 4; c++)
139                 result[n++] = code[(table[a] << 4)
140                                  | (table[b] << 2)
141                                  |  table[c]];
142     return result;
143     }
144 /* Converts genetic code from NCBI format (TCAG order)
145  * to the format internally by exonerate (GATC order)
146  */
147 
148 typedef struct {
149      gint  id;
150     gchar *name;
151     gchar *code;
152 } Translate_GeneticCode;
153 
Translate_get_builtin_genetic_code(gint id)154 static gchar *Translate_get_builtin_genetic_code(gint id){
155     register gint i;
156     Translate_GeneticCode code[17] = {
157         { 1, "The Standard Code",
158         "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"},
159         { 2, "The Vertebrate Mitochondrial Code",
160         "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG"},
161         { 3, "The Yeast Mitochondrial Code",
162         "FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG"},
163         { 4, "The Mold, Protozoan, and Coelenterate Mitochondrial Code"
164              " and the Mycoplasma/Spiroplasma Code",
165         "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"},
166         { 5, "The Invertebrate Mitochondrial Code",
167         "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG"},
168         { 6, "The Ciliate, Dasycladacean and Hexamita Nuclear Code",
169         "FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"},
170         { 9, "The Echinoderm and Flatworm Mitochondrial Code",
171         "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG"},
172         {10, "The Euplotid Nuclear Code",
173         "FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"},
174         {11, "The Bacterial and Plant Plastid Code",
175         "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"},
176         {12, "The Alternative Yeast Nuclear Code",
177         "FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"},
178         {13, "The Ascidian Mitochondrial Code",
179         "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG"},
180         {14, "The Alternative Flatworm Mitochondrial Code",
181         "FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG"},
182         {15, "Blepharisma Nuclear Code",
183         "FFLLSSSSYY*QCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"},
184         {16, "Chlorophycean Mitochondrial Code",
185         "FFLLSSSSYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"},
186         {21, "Trematode Mitochondrial Code",
187         "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG"},
188         {22, "Scenedesmus obliquus mitochondrial Code",
189         "FFLLSS*SYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"},
190         {23, "Thraustochytrium Mitochondrial Code",
191         "FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"}
192         };
193     for(i = 0; i < 17; i++){
194         if(code[i].id == id)
195             return Translate_convert_genetic_code(code[i].code);
196         }
197     g_error("No built in genetic code corresponding to id [%d]", id);
198     return NULL;
199     }
200 /* Built in genetic codes taken from:
201  * http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi
202  */
203 
Translate_get_genetic_code(gchar * code)204 static gchar *Translate_get_genetic_code(gchar *code){
205     register gint len, id;
206     if(!code)
207         return g_strdup(
208          "GGGGEEDDVVVVAAAARRSSKKNNMIIITTTTW*CC**YYLLFFSSSSRRRRQQHHLLLLPPPP");
209     len = strlen(code);
210     if(len == 64)
211         return Translate_convert_genetic_code(code);
212     if((len == 1) || (len == 2)){
213         id = atoi(code);
214         return Translate_get_builtin_genetic_code(id);
215         }
216     g_error("Could not use genetic code [%s]", code);
217     return NULL;
218     }
219 
220 /**/
221 
Translate_create(gboolean use_pima)222 Translate *Translate_create(gboolean use_pima){
223     register Translate *t = g_new0(Translate, 1);
224     register Translate_ArgumentSet *tas = Translate_ArgumentSet_create(NULL);
225     t->ref_count = 1;
226     t->nt = (guchar*)"-GARTKWDCSMVYBHN";
227     t->aa = (guchar*)"-ARNDCQEGHILKMFPSTWYV*ablkonihdmcepjfrxX";
228     t->code = (guchar*)Translate_get_genetic_code(tas->genetic_code);
229     Translate_initialise_nucleotide_data(t);
230     Translate_initialise_peptide_data(t);
231     Translate_initialise_translation_data(t);
232     Translate_initialise_reverse_translate_data(t);
233     if(!use_pima)
234         t->aa = (guchar*)"-ARNDCQEGHILKMFPSTWYV*XXXXXXXXXXXXXXXXXX";
235     return t;
236     }
237 
Translate_share(Translate * t)238 Translate *Translate_share(Translate *t){
239     g_assert(t);
240     t->ref_count++;
241     return t;
242     }
243 
Translate_destroy(Translate * t)244 void Translate_destroy(Translate *t){
245     register gint i;
246     g_assert(t);
247     if(--t->ref_count)
248         return;
249     for(i = 0; t->code[i]; i++){
250         if(t->revtrans[t->code[i]]){
251             g_ptr_array_free(t->revtrans[t->code[i]], TRUE);
252             t->revtrans[t->code[i]] = NULL;
253             }
254         }
255     g_free(t->code);
256     g_free(t);
257     return;
258     }
259 
Translate_sequence(Translate * t,gchar * dna,gint dna_length,gint frame,gchar * aaseq,guchar * filter)260 gint Translate_sequence(Translate *t, gchar *dna, gint dna_length,
261                         gint frame, gchar *aaseq, guchar *filter){
262     register gchar *dp, *ap = aaseq, *end;
263     if((frame > 0) && (frame < 4)){
264         end = dna + dna_length-2;
265         if(filter){
266             for(dp = dna + frame - 1; dp < end; dp += 3)
267                 *ap++ = Translate_base(t, filter[(guchar)dp[0]],
268                                           filter[(guchar)dp[1]],
269                                           filter[(guchar)dp[2]]);
270         } else {
271             for(dp = dna + frame - 1; dp < end; dp += 3)
272                 *ap++ = Translate_codon(t, dp);
273             }
274         *ap = '\0';
275         return ap - aaseq;
276         }
277     if((frame < 0) && (frame > -4)){
278         if(filter){
279             for(dp = dna + dna_length + frame - 2; dp >= dna; dp -= 3)
280                 *ap++ = Translate_base(t, filter[(guchar)dp[0]],
281                                           filter[(guchar)dp[1]],
282                                           filter[(guchar)dp[2]]);
283         } else {
284             for(dp = dna + dna_length + frame - 2; dp >= dna; dp -= 3)
285                 *ap++ = Translate_codon(t, dp);
286             }
287         *ap = '\0';
288         return ap - aaseq;
289         }
290     g_error("Invalid reading frame [%d]", frame);
291     return 0; /* Never reached */
292     }
293 /* Returns length of peptide generated
294  */
295 
Translate_reverse(Translate * t,gchar * aaseq,gint length,Translate_reverse_func trf,gpointer user_data)296 void Translate_reverse(Translate *t, gchar *aaseq, gint length,
297                        Translate_reverse_func trf, gpointer user_data){
298     register gint i, j, total, *prod, id, codon;
299     register gchar *nucleotide;
300     g_assert(length > 0);
301     g_assert(aaseq);
302     g_assert(aaseq[0]);
303     g_assert(trf);
304     prod = g_new0(gint, length);
305     nucleotide = g_new(gchar, (length*3)+1);
306     nucleotide[length*3] = '\0';
307     total = t->revtrans[(guchar)aaseq[0]]
308           ? t->revtrans[(guchar)aaseq[0]]->len
309           : 1;
310     for(i = 1; i < length; i++){
311         if(t->revtrans[(guchar)aaseq[i]])
312             total *= t->revtrans[(guchar)aaseq[i]]->len;
313         }
314     prod[0] = 1;
315     for(i = 1; i < length; i++)
316         prod[i] = prod[i-1]
317                 * (t->revtrans[(guchar)aaseq[i-1]]
318                 ? t->revtrans[(guchar)aaseq[i-1]]->len
319                 : 1);
320     for(i = 0; i < total; i++){
321         for(j = 0; j < length; j++){
322             if(t->revtrans[(guchar)aaseq[j]]){
323                 id = (i/prod[j])%t->revtrans[(guchar)aaseq[j]]->len;
324                 codon = GPOINTER_TO_INT(
325                         t->revtrans[(guchar)aaseq[j]]->pdata[id]);
326                 nucleotide[j*3]     = t->nt[1<<((codon>>4)&3)];
327                 nucleotide[(j*3)+1] = t->nt[1<<((codon>>2)&3)];
328                 nucleotide[(j*3)+2] = t->nt[1<<(codon&3)];
329             } else {
330                 nucleotide[j*3]     =
331                 nucleotide[(j*3)+1] =
332                 nucleotide[(j*3)+2] = 'N';
333                 }
334             }
335         trf(nucleotide, length*3, user_data);
336         }
337     g_free(nucleotide);
338     g_free(prod);
339     return;
340     }
341 
342 /* TODO: Add alternative genetic codes (when required) from:
343  * http://www.ncbi.nlm.nih.gov/htbin-post/Taxonomy/wprintgc?mode=c
344  */
345 
346