1 /* @source cutgextract application
2 **
3 ** Create EMBOSS codon usage files from the CUTG database
4 **
5 ** @author Copyright (C) Alan Bleasby (ableasby@hgmp.mrc.ac.uk)
6 ** @@
7 **
8 ** This program is free software; you can redistribute it and/or
9 ** modify it under the terms of the GNU General Public License
10 ** as published by the Free Software Foundation; either version 2
11 ** of the License, or (at your option) any later version.
12 **
13 ** This program is distributed in the hope that it will be useful,
14 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 ** GNU General Public License for more details.
17 **
18 ** You should have received a copy of the GNU General Public License
19 ** along with this program; if not, write to the Free Software
20 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
21 ******************************************************************************/
22
23 #include "emboss.h"
24
25 #define CODONS 64
26 #define TABLE_ESTIMATE 1000
27
28
29
30
31 /* @datastatic CutgPValues ****************************************************
32 **
33 ** Codon usage table data values
34 **
35 ** @alias CutgSValues
36 ** @alias CutgOValues
37 **
38 ** @attr Count [ajint[CODONS]] Number of occurrences for each codon
39 ** in standard order
40 ** @attr Division [AjPStr] EMBL/GenBank division
41 ** @attr Doc [AjPStr] Documentation string
42 ** @attr Species [AjPStr] Species
43 ** @attr Warn [ajint] Number of warnings issued
44 ** @attr Skip [ajint] Number of CDSs skipped
45 ** @attr CdsCount [ajint] Number of CDSs counted
46 ** @attr Padding [char[4]] Padding to alignment boundary
47 ******************************************************************************/
48
49 typedef struct CutgSValues
50 {
51 ajint Count[CODONS];
52 AjPStr Division;
53 AjPStr Doc;
54 AjPStr Species;
55 ajint Warn;
56 ajint Skip;
57 ajint CdsCount;
58 char Padding[4];
59 } CutgOValues;
60 #define CutgPValues CutgOValues*
61
62 static AjPStr cutgextractSavepid = NULL;
63 static AjPStr cutgextractLine = NULL;
64 static AjPStr cutgextractOrg = NULL;
65
66 static char *cutgextract_next(AjPFile inf, const AjPStr wildspecies,
67 AjPStr* pspecies, AjPStr* pdoc);
68 static ajint cutgextract_readcodons(AjPFile inf, AjBool allrecords,
69 ajint *count);
70
71
72
73
74 /* @prog cutgextract **********************************************************
75 **
76 ** Extract data from CUTG.
77 **
78 ** Reads all *.codon files in the input, and builds a table for each organism.
79 ** Writes the tables to the EMBOSS_DATA/CODONS directory at the end.
80 **
81 ******************************************************************************/
82
main(int argc,char ** argv)83 int main(int argc, char **argv)
84 {
85 const char *codons[]=
86 {
87 "TAG","TAA","TGA","GCG","GCA","GCT","GCC","TGT", /* 00-07 */
88 "TGC","GAT","GAC","GAA","GAG","TTT","TTC","GGT", /* 08-15 */
89 "GGG","GGA","GGC","CAT","CAC","ATA","ATT","ATC", /* 16-23 */
90 "AAA","AAG","CTA","TTA","TTG","CTT","CTC","CTG", /* 24-31 */
91 "ATG","AAT","AAC","CCG","CCA","CCT","CCC","CAA", /* 32-39 */
92 "CAG","CGT","CGA","CGC","AGG","AGA","CGG","TCG", /* 40-47 */
93 "TCA","AGT","TCT","TCC","AGC","ACG","ACT","ACA", /* 48-55 */
94 "ACC","GTA","GTT","GTC","GTG","TGG","TAT","TAC" /* 56-63 */
95 };
96
97 const char *aa=
98 "***AAAACCDDEEFFGGGGHHIIIKKLLLLLLMNNPPPPQQRRRRRRSSSSSSTTTTVVVVWYY";
99
100 AjPFile inf = NULL;
101 AjPFile outf = NULL;
102 char *entryname = NULL;
103 AjPStr fname = NULL;
104 AjPStr key = NULL;
105 AjPStr tmpkey = NULL;
106 AjBool allrecords = AJFALSE;
107
108 AjPTable table = NULL;
109 ajint i = 0;
110 ajint j = 0;
111 ajint k = 0;
112 ajint x = 0;
113 ajint savecount[3];
114
115 AjPStr *keyarray = NULL;
116 CutgPValues *valarray = NULL;
117 AjPCod codon = NULL;
118 ajint sum = 0;
119 char c;
120
121 AjPList flist = NULL;
122 AjPFile logf = NULL;
123 AjPStr entry = NULL;
124 AjPStr baseentry = NULL;
125 AjPStr wild = NULL;
126 AjPStr division = NULL;
127 AjPStr release = NULL;
128 AjPStr wildspecies = NULL;
129 CutgPValues value = NULL;
130 AjPStr docstr = NULL;
131 AjPStr species = NULL;
132 AjPStr filename = NULL;
133 ajint nstops;
134
135 embInit("cutgextract",argc,argv);
136
137 tmpkey = ajStrNew();
138 fname = ajStrNew();
139
140
141 table = ajTablestrNewConst(TABLE_ESTIMATE);
142
143
144 flist = ajAcdGetDirlist("directory");
145 wild = ajAcdGetString("wildspec");
146 release = ajAcdGetString("release");
147 logf = ajAcdGetOutfile("outfile");
148 wildspecies = ajAcdGetString("species");
149 filename = ajAcdGetString("filename");
150 allrecords = ajAcdGetBoolean("allrecords");
151
152 ajStrInsertC(&release, 0, "CUTG");
153 ajStrRemoveWhite(&release);
154
155 while(ajListPop(flist,(void **)&entry))
156 {
157 ajStrAssignS(&baseentry, entry);
158 ajFilenameTrimPath(&baseentry);
159 ajDebug("Testing file '%S'\n", entry);
160 if(!ajStrMatchWildS(baseentry,wild))
161 {
162 ajStrDel(&entry);
163 continue;
164 }
165
166 ajDebug("... matched wildcard '%S'\n", wild);
167 inf = ajFileNewInNameS(entry);
168 if(!inf)
169 ajFatal("cannot open file %S",entry);
170
171 ajFmtPrintS(&division, "%F", inf);
172 ajFilenameTrimAll(&division);
173
174 while((entryname = cutgextract_next(inf, wildspecies,
175 &species, &docstr)))
176 {
177 if(ajStrGetLen(filename))
178 ajStrAssignS(&tmpkey,filename);
179 else
180 ajStrAssignC(&tmpkey,entryname);
181
182 /* See if organism is already in the table */
183
184 value = ajTableFetchmodV(table,tmpkey);
185 if(!value) /* Initialise */
186 {
187 key = ajStrNewS(tmpkey);
188 AJNEW0(value);
189 ajStrAssignS(&value->Species,species);
190 ajStrAssignS(&value->Division, division);
191 ajTablePut(table,(void *)key,(void *)value);
192 }
193 for(k=0;k<3;k++)
194 savecount[k] = value->Count[k];
195 nstops = cutgextract_readcodons(inf,allrecords, value->Count);
196 if(nstops < 1)
197 {
198 value->Skip++;
199 continue;
200 }
201 value->CdsCount++;
202 if(nstops>1)
203 {
204 value->CdsCount += (nstops - 1);
205 value->Warn++;
206 ajWarn("Found %d stop codons (%d %d %d) for CDS '%S'",
207 nstops,
208 value->Count[0] - savecount[0],
209 value->Count[1] - savecount[1],
210 value->Count[2] - savecount[2],
211 cutgextractSavepid);
212 }
213 }
214 ajStrDel(&entry);
215 ajFileClose(&inf);
216 }
217
218 ajTableToarrayKeysValues(table,(void***) &keyarray, (void***) &valarray);
219
220 i = 0;
221 while(keyarray[i])
222 {
223 key = keyarray[i];
224 value = (CutgPValues) valarray[i++];
225 codon = ajCodNew();
226 sum = 0;
227 for(j=0;j<CODONS;++j)
228 {
229 sum += value->Count[j];
230 x = ajCodIndexC(codons[j]);
231 codon->num[x] = value->Count[j];
232
233 c = aa[j];
234 if(c=='*')
235 codon->aa[x] = 27;
236 else
237 codon->aa[x] = c-'A';
238 }
239 ajCodCalcUsage(codon,sum);
240
241 ajStrAppendC(&key, ".cut");
242 if(allrecords)
243 {
244 if(value->Warn)
245 ajFmtPrintF(logf, "Writing %S CDS: %d Warnings: %d\n",
246 key, value->CdsCount, value->Warn);
247 else
248 ajFmtPrintF(logf, "Writing %S CDS: %d\n",
249 key, value->CdsCount);
250 }
251 else
252 {
253 if(value->Skip)
254 ajFmtPrintF(logf, "Writing %S CDS: %d Skipped: %d\n",
255 key, value->CdsCount, value->Skip);
256 else
257 ajFmtPrintF(logf, "Writing %S CDS: %d\n",
258 key, value->CdsCount);
259 }
260
261 ajFmtPrintS(&fname,"CODONS/%S",key);
262 outf = ajDatafileNewOutNameS(fname);
263 if(!outf)
264 ajFatal("Cannot open output file %S",fname);
265
266 ajCodSetNameS(codon, key);
267 ajCodSetSpeciesS(codon, value->Species);
268 ajCodSetDivisionS(codon, value->Division);
269 ajCodSetReleaseS(codon, release);
270 ajCodSetNumcds(codon, value->CdsCount);
271 ajCodSetNumcodons(codon, sum);
272
273 ajCodWrite(codon, outf);
274 ajFileClose(&outf);
275
276
277 ajStrDel(&key);
278 ajStrDel(&value->Division);
279 ajStrDel(&value->Doc);
280 ajStrDel(&value->Species);
281 AJFREE(value);
282 ajCodDel(&codon);
283 }
284
285 AJFREE(keyarray);
286 AJFREE(valarray);
287
288 ajTableFree(&table);
289 ajListFree(&flist);
290 ajStrDel(&wild);
291 ajStrDel(&release);
292 ajStrDel(&wildspecies);
293 ajStrDel(&filename);
294 ajFileClose(&logf);
295
296 ajStrDel(&cutgextractSavepid);
297 ajStrDel(&cutgextractLine);
298 ajStrDel(&cutgextractOrg);
299
300 ajStrDel(&fname);
301 ajStrDel(&tmpkey);
302 ajStrDel(&species);
303 ajStrDel(&docstr);
304 ajStrDel(&division);
305 ajStrDel(&baseentry);
306
307 embExit();
308
309 return 0;
310 }
311
312
313
314
315 /* @funcstatic cutgextract_next ***********************************************
316 **
317 ** Reads the first line of a CUTG database entry,
318 ** returning the name of the species.
319 **
320 ** Each entry has one line beginning with '>' followed by fields
321 ** delimited by a backslash:
322 **
323 ** GenBank entry name
324 ** GenBank accession number
325 ** GenBank feature location
326 ** Length of feature in nucleotides
327 ** Protein ID
328 ** Genus and species
329 ** Genbank entry description and feature qualifiers
330 **
331 ** The second line is 64 integers giving the number of times each codon
332 ** appears in this coding sequence.
333 **
334 ** @param [u] inf [AjPFile] Input CUTG database file
335 ** @param [r] wildspecies [const AjPStr] Wildcard species to select
336 ** @param [w] pspecies [AjPStr*] Species for this entry
337 ** @param [w] pdoc [AjPStr*] Documentation for this entry
338 ** @return [char*] Undocumented
339 ** @@
340 ******************************************************************************/
341
cutgextract_next(AjPFile inf,const AjPStr wildspecies,AjPStr * pspecies,AjPStr * pdoc)342 static char* cutgextract_next(AjPFile inf, const AjPStr wildspecies,
343 AjPStr* pspecies, AjPStr* pdoc)
344 {
345 AjPStrTok handle = NULL;
346 AjPStr token = NULL;
347 ajint i;
348 ajint len;
349 char *p = NULL;
350 char c;
351 AjBool done = ajFalse;
352
353 if(!cutgextractLine)
354 cutgextractLine = ajStrNew();
355
356 if(!cutgextractOrg)
357 cutgextractOrg = ajStrNew();
358
359 ajStrAssignC(&cutgextractLine,"");
360 ajStrAssignC(pdoc,"");
361 while (!done)
362 {
363
364 while(ajStrGetCharFirst(cutgextractLine) != '>')
365 if(!ajReadlineTrim(inf,&cutgextractLine))
366 return NULL;
367
368 handle = ajStrTokenNewC(cutgextractLine,"\\\n\t\r");
369 for(i=0;i<7;++i) {
370 ajStrTokenNextParseC(handle,"\\\n\t\r",&token);
371 if(i==5)
372 {
373 ajStrAssignC(&cutgextractOrg,"E");
374 ajStrAppendS(&cutgextractOrg, token);
375 ajStrAssignS(pspecies, token);
376 if(ajStrMatchWildS(token,wildspecies))
377 {
378 done = ajTrue;
379 }
380 }
381
382 switch(i)
383 {
384 case 0:
385 ajStrAppendC(pdoc, "#ID ");
386 ajStrAppendS(pdoc, token);
387 ajStrAppendC(pdoc, "\n");
388 break;
389 case 1:
390 ajStrAppendC(pdoc, "#AC ");
391 ajStrAppendS(pdoc, token);
392 ajStrAppendC(pdoc, "\n");
393 break;
394 case 2:
395 ajStrAppendC(pdoc, "#FT ");
396 ajStrAppendS(pdoc, token);
397 ajStrAppendC(pdoc, "\n");
398 break;
399 case 3:
400 ajStrAppendC(pdoc, "#FL ");
401 ajStrAppendS(pdoc, token);
402 ajStrAppendC(pdoc, "\n");
403 break;
404 case 4:
405 ajStrAppendC(pdoc, "#PI ");
406 ajStrAppendS(pdoc, token);
407 ajStrAppendC(pdoc, "\n");
408 ajStrAssignS(&cutgextractSavepid, token);
409 break;
410 case 5:
411 ajStrAppendC(pdoc, "#OS ");
412 ajStrAppendS(pdoc, token);
413 ajStrAppendC(pdoc, "\n");
414 break;
415 case 6:
416 ajStrAppendC(pdoc, "#DE ");
417 ajStrAppendS(pdoc, token);
418 ajStrAppendC(pdoc, "\n");
419 break;
420 default:
421 break;
422 }
423 }
424
425 ajStrTokenDel(&handle);
426 ajStrDel(&token);
427 if(!done)
428 if(!ajReadlineTrim(inf,&cutgextractLine))
429 return NULL;
430 }
431
432 len = ajStrGetLen(cutgextractOrg);
433 p = ajStrGetuniquePtr(&cutgextractOrg);
434 for(i=0;i<len;++i)
435 {
436 c = p[i];
437 if(c=='/' || c==' ' || c=='.' || c=='\'')
438 p[i]='_';
439 }
440
441 if(p[strlen(p)-1]=='_')
442 p[strlen(p)-1]='\0';
443 ajStrDel(&token);
444 ajStrTokenDel(&handle);
445
446 return p;
447 }
448
449
450
451
452 /* @funcstatic cutgextract_readcodons *****************************************
453 **
454 ** Reads the codon frequency line from a CUTG database entry.
455 **
456 ** Codons are reported according to the number of codons per amino acid
457 ** (6, 4, 2, 3, 1, stop) in the order:
458 **
459 ** CGA CGC CGG CGU AGA AGG CUA CUC CUG CUU UUA UUG UCA UCC UCG UCU AGC AGU
460 ** R R R R R R L L L L L L S S S S S S
461 ** ACA ACC ACG ACU CCA CCC CCG CCU GCA GCC GCG GCU GGA GGC GGG GGU
462 ** T T T T P P P P A A A A G G G G
463 ** GUA GUC GUG GUU AAA AAG AAC AAU CAA CAG CAC CAU GAA GAG GAC GAU
464 ** V V V V K K N N Q Q H H E E D D
465 ** UAC UAU UGC UGU UUC UUU AUA AUC AUU AUG UGG UAA UAG UGA
466 ** Y Y C C F F I I I M W * * *
467 ** @param [u] inf [AjPFile] Input CUTG database file
468 ** @param [r] allrecords [AjBool] If false, skip bad records with more than
469 ** one stop codon in the standard genetic code
470 ** @param [w] count [ajint*] Codon usage total so far for this species
471 ** @return [ajint] Number of stop codons, assuming a standard genetic code
472 ** @@
473 ******************************************************************************/
474
cutgextract_readcodons(AjPFile inf,AjBool allrecords,ajint * count)475 static ajint cutgextract_readcodons(AjPFile inf, AjBool allrecords,
476 ajint *count)
477 {
478 static int cutidx[] =
479 {
480 42,43,46,41,45,44,26,30,31,29,27,28,48,51,47,50,
481 52,49,55,56,53,54,36,38,35,37, 4, 6, 3, 5,17,18,
482 16,15,57,59,60,58,24,25,34,33,39,40,20,19,11,12,
483 10, 9,63,62, 8, 7,14,13,21,23,22,32,61, 1, 0, 2
484 };
485 AjPStr line = NULL;
486 AjPStr value = NULL;
487 ajint thiscount[64];
488
489 AjPStrTok token = NULL;
490 ajint i;
491 ajint n = 0;
492 ajint nstops = 0;
493
494
495 if(!line)
496 {
497 line = ajStrNew();
498 value = ajStrNew();
499 }
500
501 if(!ajReadlineTrim(inf,&line))
502 ajFatal("Premature end of file");
503
504
505 token = ajStrTokenNewC(line," \n\t\r");
506 for(i=0;i<CODONS;++i)
507 {
508 ajStrTokenNextParseC(token," \n\t\r",&value);
509 ajStrToInt(value,&n);
510 thiscount[cutidx[i]] = n;
511 if(i>60)
512 nstops += n;
513 }
514
515 ajStrDel(&line);
516 ajStrDel(&value);
517 ajStrTokenDel(&token);
518
519 if(!allrecords)
520 if(nstops > 1)
521 return -1;
522
523 for(i=0;i<CODONS;++i)
524 {
525 count[i] += thiscount[i];
526 }
527
528 return nstops;
529 }
530