1 /* @source cutgextract application
2 **
3 ** Create EMBOSS codon usage files from the CUTG database
4 **
5 ** @author Copyright (C) Alan Bleasby (ableasby@hgmp.mrc.ac.uk)
6 ** @@
7 **
8 ** This program is free software; you can redistribute it and/or
9 ** modify it under the terms of the GNU General Public License
10 ** as published by the Free Software Foundation; either version 2
11 ** of the License, or (at your option) any later version.
12 **
13 ** This program is distributed in the hope that it will be useful,
14 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 ** GNU General Public License for more details.
17 **
18 ** You should have received a copy of the GNU General Public License
19 ** along with this program; if not, write to the Free Software
20 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
21 ******************************************************************************/
22 
23 #include "emboss.h"
24 
25 #define CODONS 64
26 #define TABLE_ESTIMATE 1000
27 
28 
29 
30 
31 /* @datastatic CutgPValues ****************************************************
32 **
33 ** Codon usage table data values
34 **
35 ** @alias CutgSValues
36 ** @alias CutgOValues
37 **
38 ** @attr Count [ajint[CODONS]] Number of occurrences for each codon
39 **                             in standard order
40 ** @attr Division [AjPStr] EMBL/GenBank division
41 ** @attr Doc [AjPStr] Documentation string
42 ** @attr Species [AjPStr] Species
43 ** @attr Warn [ajint] Number of warnings issued
44 ** @attr Skip [ajint] Number of CDSs skipped
45 ** @attr CdsCount [ajint] Number of CDSs counted
46 ** @attr Padding [char[4]] Padding to alignment boundary
47 ******************************************************************************/
48 
49 typedef struct CutgSValues
50 {
51     ajint Count[CODONS];
52     AjPStr Division;
53     AjPStr Doc;
54     AjPStr Species;
55     ajint Warn;
56     ajint Skip;
57     ajint CdsCount;
58     char  Padding[4];
59 } CutgOValues;
60 #define CutgPValues CutgOValues*
61 
62 static AjPStr cutgextractSavepid = NULL;
63 static AjPStr cutgextractLine = NULL;
64 static AjPStr cutgextractOrg  = NULL;
65 
66 static char *cutgextract_next(AjPFile inf, const AjPStr wildspecies,
67 			      AjPStr* pspecies, AjPStr* pdoc);
68 static ajint cutgextract_readcodons(AjPFile inf, AjBool allrecords,
69 				    ajint *count);
70 
71 
72 
73 
74 /* @prog cutgextract **********************************************************
75 **
76 ** Extract data from CUTG.
77 **
78 ** Reads all *.codon files in the input, and builds a table for each organism.
79 ** Writes the tables to the EMBOSS_DATA/CODONS directory at the end.
80 **
81 ******************************************************************************/
82 
main(int argc,char ** argv)83 int main(int argc, char **argv)
84 {
85     const char *codons[]=
86     {
87 	"TAG","TAA","TGA","GCG","GCA","GCT","GCC","TGT", /* 00-07 */
88 	"TGC","GAT","GAC","GAA","GAG","TTT","TTC","GGT", /* 08-15 */
89 	"GGG","GGA","GGC","CAT","CAC","ATA","ATT","ATC", /* 16-23 */
90 	"AAA","AAG","CTA","TTA","TTG","CTT","CTC","CTG", /* 24-31 */
91 	"ATG","AAT","AAC","CCG","CCA","CCT","CCC","CAA", /* 32-39 */
92 	"CAG","CGT","CGA","CGC","AGG","AGA","CGG","TCG", /* 40-47 */
93 	"TCA","AGT","TCT","TCC","AGC","ACG","ACT","ACA", /* 48-55 */
94 	"ACC","GTA","GTT","GTC","GTG","TGG","TAT","TAC"	 /* 56-63 */
95     };
96 
97     const char *aa=
98 	"***AAAACCDDEEFFGGGGHHIIIKKLLLLLLMNNPPPPQQRRRRRRSSSSSSTTTTVVVVWYY";
99 
100     AjPFile inf     = NULL;
101     AjPFile outf    = NULL;
102     char *entryname = NULL;
103     AjPStr fname    = NULL;
104     AjPStr key      = NULL;
105     AjPStr tmpkey   = NULL;
106     AjBool allrecords = AJFALSE;
107 
108     AjPTable table  = NULL;
109     ajint i = 0;
110     ajint j = 0;
111     ajint k = 0;
112     ajint x = 0;
113     ajint savecount[3];
114 
115     AjPStr *keyarray = NULL;
116     CutgPValues *valarray = NULL;
117     AjPCod codon  = NULL;
118     ajint sum = 0;
119     char c;
120 
121     AjPList flist = NULL;
122     AjPFile logf = NULL;
123     AjPStr  entry = NULL;
124     AjPStr  baseentry = NULL;
125     AjPStr  wild  = NULL;
126     AjPStr division = NULL;
127     AjPStr release = NULL;
128     AjPStr wildspecies = NULL;
129     CutgPValues value = NULL;
130     AjPStr docstr = NULL;
131     AjPStr species = NULL;
132     AjPStr filename = NULL;
133     ajint nstops;
134 
135     embInit("cutgextract",argc,argv);
136 
137     tmpkey = ajStrNew();
138     fname  = ajStrNew();
139 
140 
141     table = ajTablestrNewConst(TABLE_ESTIMATE);
142 
143 
144     flist = ajAcdGetDirlist("directory");
145     wild  = ajAcdGetString("wildspec");
146     release  = ajAcdGetString("release");
147     logf = ajAcdGetOutfile("outfile");
148     wildspecies = ajAcdGetString("species");
149     filename = ajAcdGetString("filename");
150     allrecords = ajAcdGetBoolean("allrecords");
151 
152     ajStrInsertC(&release, 0, "CUTG");
153     ajStrRemoveWhite(&release);
154 
155     while(ajListPop(flist,(void **)&entry))
156     {
157 	ajStrAssignS(&baseentry, entry);
158 	ajFilenameTrimPath(&baseentry);
159 	ajDebug("Testing file '%S'\n", entry);
160 	if(!ajStrMatchWildS(baseentry,wild))
161 	{
162 	    ajStrDel(&entry);
163 	    continue;
164 	}
165 
166 	ajDebug("... matched wildcard '%S'\n", wild);
167 	inf = ajFileNewInNameS(entry);
168 	if(!inf)
169 	    ajFatal("cannot open file %S",entry);
170 
171 	ajFmtPrintS(&division, "%F", inf);
172 	ajFilenameTrimAll(&division);
173 
174 	while((entryname = cutgextract_next(inf, wildspecies,
175 					    &species, &docstr)))
176 	{
177 	    if(ajStrGetLen(filename))
178 		ajStrAssignS(&tmpkey,filename);
179 	    else
180 		ajStrAssignC(&tmpkey,entryname);
181 
182 	    /* See if organism is already in the table */
183 
184 	    value = ajTableFetchmodV(table,tmpkey);
185             if(!value)			/* Initialise */
186 	    {
187 		key = ajStrNewS(tmpkey);
188 		AJNEW0(value);
189 		ajStrAssignS(&value->Species,species);
190 		ajStrAssignS(&value->Division, division);
191 		ajTablePut(table,(void *)key,(void *)value);
192 	    }
193 	    for(k=0;k<3;k++)
194 		savecount[k] = value->Count[k];
195 	    nstops = cutgextract_readcodons(inf,allrecords, value->Count);
196 	    if(nstops < 1)
197 	    {
198 		value->Skip++;
199 		continue;
200 	    }
201 	    value->CdsCount++;
202 	    if(nstops>1)
203 	    {
204 		value->CdsCount += (nstops - 1);
205 		value->Warn++;
206 		ajWarn("Found %d stop codons (%d %d %d) for CDS '%S'",
207 		       nstops,
208 		       value->Count[0] - savecount[0],
209 		       value->Count[1] - savecount[1],
210 		       value->Count[2] - savecount[2],
211 		       cutgextractSavepid);
212 	    }
213 	}
214 	ajStrDel(&entry);
215 	ajFileClose(&inf);
216     }
217 
218     ajTableToarrayKeysValues(table,(void***) &keyarray, (void***) &valarray);
219 
220     i = 0;
221     while(keyarray[i])
222     {
223 	key   = keyarray[i];
224 	value = (CutgPValues) valarray[i++];
225 	codon = ajCodNew();
226 	sum   = 0;
227 	for(j=0;j<CODONS;++j)
228 	{
229 	    sum += value->Count[j];
230 	    x = ajCodIndexC(codons[j]);
231 	    codon->num[x] = value->Count[j];
232 
233 	    c = aa[j];
234 	    if(c=='*')
235 		codon->aa[x] = 27;
236 	    else
237 		codon->aa[x] = c-'A';
238 	}
239 	ajCodCalcUsage(codon,sum);
240 
241 	ajStrAppendC(&key, ".cut");
242 	if(allrecords)
243 	{
244 	    if(value->Warn)
245 		ajFmtPrintF(logf, "Writing %S CDS: %d Warnings: %d\n",
246 			    key, value->CdsCount, value->Warn);
247 	    else
248 		ajFmtPrintF(logf, "Writing %S CDS: %d\n",
249 			    key, value->CdsCount);
250 	}
251 	else
252 	{
253 	    if(value->Skip)
254 		ajFmtPrintF(logf, "Writing %S CDS: %d Skipped: %d\n",
255 			    key, value->CdsCount, value->Skip);
256 	    else
257 		ajFmtPrintF(logf, "Writing %S CDS: %d\n",
258 			    key, value->CdsCount);
259 	}
260 
261 	ajFmtPrintS(&fname,"CODONS/%S",key);
262 	outf = ajDatafileNewOutNameS(fname);
263 	if(!outf)
264 	    ajFatal("Cannot open output file %S",fname);
265 
266 	ajCodSetNameS(codon, key);
267 	ajCodSetSpeciesS(codon, value->Species);
268 	ajCodSetDivisionS(codon, value->Division);
269 	ajCodSetReleaseS(codon, release);
270 	ajCodSetNumcds(codon, value->CdsCount);
271 	ajCodSetNumcodons(codon, sum);
272 
273 	ajCodWrite(codon, outf);
274 	ajFileClose(&outf);
275 
276 
277 	ajStrDel(&key);
278 	ajStrDel(&value->Division);
279 	ajStrDel(&value->Doc);
280 	ajStrDel(&value->Species);
281 	AJFREE(value);
282 	ajCodDel(&codon);
283     }
284 
285     AJFREE(keyarray);
286     AJFREE(valarray);
287 
288     ajTableFree(&table);
289     ajListFree(&flist);
290     ajStrDel(&wild);
291     ajStrDel(&release);
292     ajStrDel(&wildspecies);
293     ajStrDel(&filename);
294     ajFileClose(&logf);
295 
296     ajStrDel(&cutgextractSavepid);
297     ajStrDel(&cutgextractLine);
298     ajStrDel(&cutgextractOrg);
299 
300     ajStrDel(&fname);
301     ajStrDel(&tmpkey);
302     ajStrDel(&species);
303     ajStrDel(&docstr);
304     ajStrDel(&division);
305     ajStrDel(&baseentry);
306 
307     embExit();
308 
309     return 0;
310 }
311 
312 
313 
314 
315 /* @funcstatic cutgextract_next ***********************************************
316 **
317 ** Reads the first line of a CUTG database entry,
318 ** returning the name of the species.
319 **
320 ** Each entry has one line beginning with '>' followed by fields
321 ** delimited by a backslash:
322 **
323 ** GenBank entry name
324 ** GenBank accession number
325 ** GenBank feature location
326 ** Length of feature in nucleotides
327 ** Protein ID
328 ** Genus and species
329 ** Genbank entry description and feature qualifiers
330 **
331 ** The second line is 64 integers giving the number of times each codon
332 ** appears in this coding sequence.
333 **
334 ** @param [u] inf [AjPFile] Input CUTG database file
335 ** @param [r] wildspecies [const AjPStr] Wildcard species to select
336 ** @param [w] pspecies [AjPStr*] Species for this entry
337 ** @param [w] pdoc [AjPStr*] Documentation for this entry
338 ** @return [char*] Undocumented
339 ** @@
340 ******************************************************************************/
341 
cutgextract_next(AjPFile inf,const AjPStr wildspecies,AjPStr * pspecies,AjPStr * pdoc)342 static char* cutgextract_next(AjPFile inf, const AjPStr wildspecies,
343 			      AjPStr* pspecies, AjPStr* pdoc)
344 {
345     AjPStrTok handle    = NULL;
346     AjPStr token = NULL;
347     ajint i;
348     ajint len;
349     char *p = NULL;
350     char c;
351     AjBool done = ajFalse;
352 
353     if(!cutgextractLine)
354 	cutgextractLine = ajStrNew();
355 
356     if(!cutgextractOrg)
357 	cutgextractOrg  = ajStrNew();
358 
359     ajStrAssignC(&cutgextractLine,"");
360     ajStrAssignC(pdoc,"");
361     while (!done)
362     {
363 
364 	while(ajStrGetCharFirst(cutgextractLine) != '>')
365 	    if(!ajReadlineTrim(inf,&cutgextractLine))
366 		return NULL;
367 
368 	handle = ajStrTokenNewC(cutgextractLine,"\\\n\t\r");
369 	for(i=0;i<7;++i) {
370 	    ajStrTokenNextParseC(handle,"\\\n\t\r",&token);
371 	    if(i==5)
372 	    {
373 		ajStrAssignC(&cutgextractOrg,"E");
374 		ajStrAppendS(&cutgextractOrg, token);
375 		ajStrAssignS(pspecies, token);
376 		if(ajStrMatchWildS(token,wildspecies))
377 		{
378 		    done = ajTrue;
379 		}
380 	    }
381 
382 	    switch(i)
383 	    {
384 	    case 0:
385 		ajStrAppendC(pdoc, "#ID ");
386 		ajStrAppendS(pdoc, token);
387 		ajStrAppendC(pdoc, "\n");
388 		break;
389 	    case 1:
390 		ajStrAppendC(pdoc, "#AC ");
391 		ajStrAppendS(pdoc, token);
392 		ajStrAppendC(pdoc, "\n");
393 		break;
394 	    case 2:
395 		ajStrAppendC(pdoc, "#FT ");
396 		ajStrAppendS(pdoc, token);
397 		ajStrAppendC(pdoc, "\n");
398 		break;
399 	    case 3:
400 		ajStrAppendC(pdoc, "#FL ");
401 		ajStrAppendS(pdoc, token);
402 		ajStrAppendC(pdoc, "\n");
403 		break;
404 	    case 4:
405 		ajStrAppendC(pdoc, "#PI ");
406 		ajStrAppendS(pdoc, token);
407 		ajStrAppendC(pdoc, "\n");
408 		ajStrAssignS(&cutgextractSavepid, token);
409 		break;
410 	    case 5:
411 		ajStrAppendC(pdoc, "#OS ");
412 		ajStrAppendS(pdoc, token);
413 		ajStrAppendC(pdoc, "\n");
414 		break;
415 	    case 6:
416 		ajStrAppendC(pdoc, "#DE ");
417 		ajStrAppendS(pdoc, token);
418 		ajStrAppendC(pdoc, "\n");
419 		break;
420 	    default:
421 		break;
422 	    }
423 	}
424 
425 	ajStrTokenDel(&handle);
426 	ajStrDel(&token);
427 	if(!done)
428 	    if(!ajReadlineTrim(inf,&cutgextractLine))
429 		return NULL;
430     }
431 
432     len = ajStrGetLen(cutgextractOrg);
433     p   = ajStrGetuniquePtr(&cutgextractOrg);
434     for(i=0;i<len;++i)
435     {
436 	c = p[i];
437 	if(c=='/' || c==' ' || c=='.' || c=='\'')
438 	    p[i]='_';
439     }
440 
441     if(p[strlen(p)-1]=='_')
442 	p[strlen(p)-1]='\0';
443     ajStrDel(&token);
444     ajStrTokenDel(&handle);
445 
446     return p;
447 }
448 
449 
450 
451 
452 /* @funcstatic cutgextract_readcodons *****************************************
453 **
454 ** Reads the codon frequency line from a CUTG database entry.
455 **
456 ** Codons are reported according to the number of codons per amino acid
457 ** (6, 4, 2, 3, 1, stop) in the order:
458 **
459 ** CGA CGC CGG CGU AGA AGG CUA CUC CUG CUU UUA UUG UCA UCC UCG UCU AGC AGU
460 **  R   R   R   R   R   R   L   L   L   L   L   L   S   S   S   S   S   S
461 ** ACA ACC ACG ACU CCA CCC CCG CCU GCA GCC GCG GCU GGA GGC GGG GGU
462 **  T   T   T   T   P   P   P   P   A   A   A   A   G   G   G   G
463 ** GUA GUC GUG GUU AAA AAG AAC AAU CAA CAG CAC CAU GAA GAG GAC GAU
464 **  V   V   V   V   K   K   N   N   Q   Q   H   H   E   E   D   D
465 ** UAC UAU UGC UGU UUC UUU AUA AUC AUU AUG UGG UAA UAG UGA
466 **  Y   Y   C   C   F   F   I   I   I   M   W   *   *   *
467 ** @param [u] inf [AjPFile] Input CUTG database file
468 ** @param [r] allrecords [AjBool] If false, skip bad records with more than
469 **                             one stop codon in the standard genetic code
470 ** @param [w] count [ajint*] Codon usage total so far for this species
471 ** @return [ajint] Number of stop codons, assuming a standard genetic code
472 ** @@
473 ******************************************************************************/
474 
cutgextract_readcodons(AjPFile inf,AjBool allrecords,ajint * count)475 static ajint cutgextract_readcodons(AjPFile inf, AjBool allrecords,
476 				    ajint *count)
477 {
478     static int cutidx[] =
479     {
480 	42,43,46,41,45,44,26,30,31,29,27,28,48,51,47,50,
481 	52,49,55,56,53,54,36,38,35,37, 4, 6, 3, 5,17,18,
482 	16,15,57,59,60,58,24,25,34,33,39,40,20,19,11,12,
483 	10, 9,63,62, 8, 7,14,13,21,23,22,32,61, 1, 0, 2
484     };
485     AjPStr line  = NULL;
486     AjPStr value = NULL;
487     ajint thiscount[64];
488 
489     AjPStrTok token = NULL;
490     ajint i;
491     ajint n = 0;
492     ajint nstops = 0;
493 
494 
495     if(!line)
496     {
497 	line  = ajStrNew();
498 	value = ajStrNew();
499     }
500 
501     if(!ajReadlineTrim(inf,&line))
502 	ajFatal("Premature end of file");
503 
504 
505     token = ajStrTokenNewC(line," \n\t\r");
506     for(i=0;i<CODONS;++i)
507     {
508 	ajStrTokenNextParseC(token," \n\t\r",&value);
509 	ajStrToInt(value,&n);
510 	thiscount[cutidx[i]] = n;
511 	if(i>60)
512 	    nstops += n;
513     }
514 
515     ajStrDel(&line);
516     ajStrDel(&value);
517     ajStrTokenDel(&token);
518 
519     if(!allrecords)
520 	if(nstops > 1)
521 	    return -1;
522 
523     for(i=0;i<CODONS;++i)
524     {
525 	count[i] += thiscount[i];
526     }
527 
528     return nstops;
529 }
530