1 /****************************************************************\
2 * *
3 * Nucleotide Translation Code *
4 * *
5 * Guy St.C. Slater.. mailto:guy@ebi.ac.uk *
6 * Copyright (C) 2000-2009. All Rights Reserved. *
7 * *
8 * This source code is distributed under the terms of the *
9 * GNU General Public License, version 3. See the file COPYING *
10 * or http://www.gnu.org/licenses/gpl.txt for details *
11 * *
12 * If you use this code, please keep this notice intact. *
13 * *
14 \****************************************************************/
15
16 #include <ctype.h>
17 #include <string.h>
18 #include <stdlib.h> /* For atoi() */
19
20 #include "translate.h"
21
22 /* ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
23 CTAG
24 0 - 0000 - blank Representation of Nucleotides
25 1 G 0001 C -----------------------------
26 2 A 0010 T
27 3 R 0011 Y (AG) The standard (IUB) nucleotides
28 4 T 0100 A are used in the array "-GARTKWDCSMVYBHN"
29 5 K 0101 M (GT) so that:
30 6 W 0110 W (AT)
31 7 D 0111 H (AGT) 1: Four bits per base are used.
32 8 C 1000 G 2: A bit is set for each base represented.
33 9 S 1001 S (CG) 3: Any base reversed is it's complement.
34 10 M 1010 K (AC)
35 11 V 1011 B (ACG)
36 12 Y 1100 R (CT)
37 13 B 1101 V (CGT)
38 14 H 1110 D (ACT)
39 15 N 1111 N (ATGC)
40 :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: */
41
42 /**/
43
Translate_ArgumentSet_create(Argument * arg)44 Translate_ArgumentSet *Translate_ArgumentSet_create(Argument *arg){
45 register ArgumentSet *as;
46 static Translate_ArgumentSet tas = {NULL};
47 if(arg){
48 as = ArgumentSet_create("Translation Options");
49 ArgumentSet_add_option(as, '\0', "geneticcode", NULL,
50 "Use built-in or custom genetic code", "1",
51 Argument_parse_string, &tas.genetic_code);
52 Argument_absorb_ArgumentSet(arg, as);
53 }
54 return &tas;
55 }
56
57 /**/
58
Translate_initialise_nucleotide_data(Translate * t)59 static void Translate_initialise_nucleotide_data(Translate *t){
60 register gint i;
61 for(i = 0; i < Translate_NT_SET_SIZE; i++)
62 t->nt2d[t->nt[i]] = t->nt2d[tolower(t->nt[i])] = i;
63 t->nt2d['X'] = t->nt2d['x'] = t->nt2d['N'];
64 t->nt2d['U'] = t->nt2d['u'] = t->nt2d['T'];
65 return;
66 }
67
Translate_initialise_peptide_data(Translate * t)68 static void Translate_initialise_peptide_data(Translate *t){
69 register gint i, j;
70 guchar pimagrp[Translate_PIMA_SET_SIZE][6] = {
71 "aIV", "bLM", "dFWY", "lND", "kDE", "oEQ",
72 "nKR", "iST", "hAG", "cab", "edH", "mlk",
73 "pon", "jihP", "fCcd", "rHmpi", "xfrj", "Xx*" };
74 for(i = 0; i < Translate_AA_SET_SIZE; i++)
75 t->aa2d[t->aa[i]] = i;
76 for(i = 1; i < 23; i++)
77 t->aamask[i] = (1L<<(i-1)); /* First is zero */
78 for(i = 0; i < Translate_PIMA_SET_SIZE; i++){
79 t->aamask[t->aa2d[pimagrp[i][0]]]
80 = t->aamask[t->aa2d[pimagrp[i][1]]];
81 for(j = 2; pimagrp[i][j]; j++)
82 t->aamask[t->aa2d[pimagrp[i][0]]]
83 |= t->aamask[t->aa2d[pimagrp[i][j]]];
84 }
85 return;
86 }
87
Translate_initialise_translation_data(Translate * t)88 static void Translate_initialise_translation_data(Translate *t){
89 register gchar a, b, c, x, y, z;
90 register gint i, tmp;
91 for(x = 0; x < 16; x++){
92 for(y = 0; y < 16; y++){
93 for(z = 0; z < 16; z++){
94 tmp = 0;
95 for(a = 0; a < 4; a++){
96 if(x != (x|1<<a))
97 continue;
98 for(b = 0; b < 4; b++){
99 if(y != (y|1<<b))
100 continue;
101 for(c = 0; c < 4; c++){
102 if(z != (z|1<<c))
103 continue;
104 tmp = (tmp|t->aamask[
105 t->aa2d[t->code[((a<<4)|(b<<2)|c)]]]);
106 }
107 }
108 }
109 for(i = 0; t->aamask[i] != (tmp|t->aamask[i]); i++);
110 t->trans[x|(y<<4)|(z<<8)] = i;
111 }
112 }
113 }
114 return;
115 }
116
Translate_initialise_reverse_translate_data(Translate * t)117 static void Translate_initialise_reverse_translate_data(Translate *t){
118 register gint i;
119 for(i = 0; t->code[i]; i++){
120 if(!t->revtrans[t->code[i]])
121 t->revtrans[t->code[i]] = g_ptr_array_new();
122 g_ptr_array_add(t->revtrans[t->code[i]], GINT_TO_POINTER(i));
123 }
124 return;
125 }
126
127 /**/
128
Translate_convert_genetic_code(gchar * code)129 static gchar *Translate_convert_genetic_code(gchar *code){
130 register gint a, b, c, n = 0;
131 gint table[4] = {3, 2, 0, 1};
132 register gchar *result;
133 g_assert(code && (strlen(code) == 64));
134 result = g_new(gchar, 65);
135 result[64] = '\0';
136 for(a = 0; a < 4; a++)
137 for(b = 0; b < 4; b++)
138 for(c = 0; c < 4; c++)
139 result[n++] = code[(table[a] << 4)
140 | (table[b] << 2)
141 | table[c]];
142 return result;
143 }
144 /* Converts genetic code from NCBI format (TCAG order)
145 * to the format internally by exonerate (GATC order)
146 */
147
148 typedef struct {
149 gint id;
150 gchar *name;
151 gchar *code;
152 } Translate_GeneticCode;
153
Translate_get_builtin_genetic_code(gint id)154 static gchar *Translate_get_builtin_genetic_code(gint id){
155 register gint i;
156 Translate_GeneticCode code[17] = {
157 { 1, "The Standard Code",
158 "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"},
159 { 2, "The Vertebrate Mitochondrial Code",
160 "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG"},
161 { 3, "The Yeast Mitochondrial Code",
162 "FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG"},
163 { 4, "The Mold, Protozoan, and Coelenterate Mitochondrial Code"
164 " and the Mycoplasma/Spiroplasma Code",
165 "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"},
166 { 5, "The Invertebrate Mitochondrial Code",
167 "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG"},
168 { 6, "The Ciliate, Dasycladacean and Hexamita Nuclear Code",
169 "FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"},
170 { 9, "The Echinoderm and Flatworm Mitochondrial Code",
171 "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG"},
172 {10, "The Euplotid Nuclear Code",
173 "FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"},
174 {11, "The Bacterial and Plant Plastid Code",
175 "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"},
176 {12, "The Alternative Yeast Nuclear Code",
177 "FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"},
178 {13, "The Ascidian Mitochondrial Code",
179 "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG"},
180 {14, "The Alternative Flatworm Mitochondrial Code",
181 "FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG"},
182 {15, "Blepharisma Nuclear Code",
183 "FFLLSSSSYY*QCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"},
184 {16, "Chlorophycean Mitochondrial Code",
185 "FFLLSSSSYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"},
186 {21, "Trematode Mitochondrial Code",
187 "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG"},
188 {22, "Scenedesmus obliquus mitochondrial Code",
189 "FFLLSS*SYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"},
190 {23, "Thraustochytrium Mitochondrial Code",
191 "FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"}
192 };
193 for(i = 0; i < 17; i++){
194 if(code[i].id == id)
195 return Translate_convert_genetic_code(code[i].code);
196 }
197 g_error("No built in genetic code corresponding to id [%d]", id);
198 return NULL;
199 }
200 /* Built in genetic codes taken from:
201 * http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi
202 */
203
Translate_get_genetic_code(gchar * code)204 static gchar *Translate_get_genetic_code(gchar *code){
205 register gint len, id;
206 if(!code)
207 return g_strdup(
208 "GGGGEEDDVVVVAAAARRSSKKNNMIIITTTTW*CC**YYLLFFSSSSRRRRQQHHLLLLPPPP");
209 len = strlen(code);
210 if(len == 64)
211 return Translate_convert_genetic_code(code);
212 if((len == 1) || (len == 2)){
213 id = atoi(code);
214 return Translate_get_builtin_genetic_code(id);
215 }
216 g_error("Could not use genetic code [%s]", code);
217 return NULL;
218 }
219
220 /**/
221
Translate_create(gboolean use_pima)222 Translate *Translate_create(gboolean use_pima){
223 register Translate *t = g_new0(Translate, 1);
224 register Translate_ArgumentSet *tas = Translate_ArgumentSet_create(NULL);
225 t->ref_count = 1;
226 t->nt = (guchar*)"-GARTKWDCSMVYBHN";
227 t->aa = (guchar*)"-ARNDCQEGHILKMFPSTWYV*ablkonihdmcepjfrxX";
228 t->code = (guchar*)Translate_get_genetic_code(tas->genetic_code);
229 Translate_initialise_nucleotide_data(t);
230 Translate_initialise_peptide_data(t);
231 Translate_initialise_translation_data(t);
232 Translate_initialise_reverse_translate_data(t);
233 if(!use_pima)
234 t->aa = (guchar*)"-ARNDCQEGHILKMFPSTWYV*XXXXXXXXXXXXXXXXXX";
235 return t;
236 }
237
Translate_share(Translate * t)238 Translate *Translate_share(Translate *t){
239 g_assert(t);
240 t->ref_count++;
241 return t;
242 }
243
Translate_destroy(Translate * t)244 void Translate_destroy(Translate *t){
245 register gint i;
246 g_assert(t);
247 if(--t->ref_count)
248 return;
249 for(i = 0; t->code[i]; i++){
250 if(t->revtrans[t->code[i]]){
251 g_ptr_array_free(t->revtrans[t->code[i]], TRUE);
252 t->revtrans[t->code[i]] = NULL;
253 }
254 }
255 g_free(t->code);
256 g_free(t);
257 return;
258 }
259
Translate_sequence(Translate * t,gchar * dna,gint dna_length,gint frame,gchar * aaseq,guchar * filter)260 gint Translate_sequence(Translate *t, gchar *dna, gint dna_length,
261 gint frame, gchar *aaseq, guchar *filter){
262 register gchar *dp, *ap = aaseq, *end;
263 if((frame > 0) && (frame < 4)){
264 end = dna + dna_length-2;
265 if(filter){
266 for(dp = dna + frame - 1; dp < end; dp += 3)
267 *ap++ = Translate_base(t, filter[(guchar)dp[0]],
268 filter[(guchar)dp[1]],
269 filter[(guchar)dp[2]]);
270 } else {
271 for(dp = dna + frame - 1; dp < end; dp += 3)
272 *ap++ = Translate_codon(t, dp);
273 }
274 *ap = '\0';
275 return ap - aaseq;
276 }
277 if((frame < 0) && (frame > -4)){
278 if(filter){
279 for(dp = dna + dna_length + frame - 2; dp >= dna; dp -= 3)
280 *ap++ = Translate_base(t, filter[(guchar)dp[0]],
281 filter[(guchar)dp[1]],
282 filter[(guchar)dp[2]]);
283 } else {
284 for(dp = dna + dna_length + frame - 2; dp >= dna; dp -= 3)
285 *ap++ = Translate_codon(t, dp);
286 }
287 *ap = '\0';
288 return ap - aaseq;
289 }
290 g_error("Invalid reading frame [%d]", frame);
291 return 0; /* Never reached */
292 }
293 /* Returns length of peptide generated
294 */
295
Translate_reverse(Translate * t,gchar * aaseq,gint length,Translate_reverse_func trf,gpointer user_data)296 void Translate_reverse(Translate *t, gchar *aaseq, gint length,
297 Translate_reverse_func trf, gpointer user_data){
298 register gint i, j, total, *prod, id, codon;
299 register gchar *nucleotide;
300 g_assert(length > 0);
301 g_assert(aaseq);
302 g_assert(aaseq[0]);
303 g_assert(trf);
304 prod = g_new0(gint, length);
305 nucleotide = g_new(gchar, (length*3)+1);
306 nucleotide[length*3] = '\0';
307 total = t->revtrans[(guchar)aaseq[0]]
308 ? t->revtrans[(guchar)aaseq[0]]->len
309 : 1;
310 for(i = 1; i < length; i++){
311 if(t->revtrans[(guchar)aaseq[i]])
312 total *= t->revtrans[(guchar)aaseq[i]]->len;
313 }
314 prod[0] = 1;
315 for(i = 1; i < length; i++)
316 prod[i] = prod[i-1]
317 * (t->revtrans[(guchar)aaseq[i-1]]
318 ? t->revtrans[(guchar)aaseq[i-1]]->len
319 : 1);
320 for(i = 0; i < total; i++){
321 for(j = 0; j < length; j++){
322 if(t->revtrans[(guchar)aaseq[j]]){
323 id = (i/prod[j])%t->revtrans[(guchar)aaseq[j]]->len;
324 codon = GPOINTER_TO_INT(
325 t->revtrans[(guchar)aaseq[j]]->pdata[id]);
326 nucleotide[j*3] = t->nt[1<<((codon>>4)&3)];
327 nucleotide[(j*3)+1] = t->nt[1<<((codon>>2)&3)];
328 nucleotide[(j*3)+2] = t->nt[1<<(codon&3)];
329 } else {
330 nucleotide[j*3] =
331 nucleotide[(j*3)+1] =
332 nucleotide[(j*3)+2] = 'N';
333 }
334 }
335 trf(nucleotide, length*3, user_data);
336 }
337 g_free(nucleotide);
338 g_free(prod);
339 return;
340 }
341
342 /* TODO: Add alternative genetic codes (when required) from:
343 * http://www.ncbi.nlm.nih.gov/htbin-post/Taxonomy/wprintgc?mode=c
344 */
345
346