1 /* @source etandem application
2 **
3 ** Tandem searches for tandem repeats
4 ** @author Copyright (C) Richard Durbin (rd@sanger.ac.uk)
5 ** and Jean Thierry-Mieg 1992
6 ** @@
7 ** The original application is part of the ACEDB genome database
8 ** package, written by ** Richard Durbin (MRC LMB, UK)
9 ** rd@mrc-lmba.cam.ac.uk, and Jean Thierry-Mieg (CRBM du CNRS,
10 ** France) mieg@crbm1.cnusc.fr
11 **
12 ** This program is free software; you can redistribute it and/or
13 ** modify it under the terms of the GNU General Public License
14 ** as published by the Free Software Foundation; either version 2
15 ** of the License, or (at your option) any later version.
16 **
17 ** This program is distributed in the hope that it will be useful,
18 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
19 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20 ** GNU General Public License for more details.
21 **
22 ** You should have received a copy of the GNU General Public License
23 ** along with this program; if not, write to the Free Software
24 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
25 ******************************************************************************/
26 
27 #include "emboss.h"
28 
29 
30 
31 
32 #define DEBUG 0
33 
34 #define NMAX 1000
35 
36 
37 
38 
39 /* @datastatic EtandemPCons ***************************************************
40 **
41 ** Consensus pattern structure
42 **
43 ** @alias EtandemSCons
44 ** @alias EtandemOCons
45 **
46 ** @attr tab [ajint*] Undocumented
47 ** @attr max [ajint*] Undocumented
48 ** @attr start [ajint] Undocumented
49 ** @attr score [ajint] Undocumented
50 ** @attr bestScore [ajint] Undocumented
51 ** @attr ibest [ajint] Undocumented
52 ** @attr bestMax [ajint*] Undocumented
53 ** @attr phase [ajint] Undocumented
54 ** @attr repeat [ajint] Undocumented
55 ** @attr next [struct EtandemSCons*] Next node in linked list
56 ** @@
57 ******************************************************************************/
58 
59 typedef struct EtandemSCons
60 {
61     ajint* tab ;
62     ajint* max ;
63     ajint start ;
64     ajint score ;
65     ajint bestScore ;
66     ajint ibest ;
67     ajint* bestMax ;
68     ajint phase ;
69     ajint repeat ;
70     struct EtandemSCons* next ;
71 } EtandemOCons, *EtandemPCons ;
72 
73 
74 
75 
76 static EtandemOCons rootStruct ;
77 static EtandemPCons root = &rootStruct ;
78 static ajint *ring ;
79 static char letter[5]  = "acgtn" ;
80 static AjBool mismatch = AJFALSE ;
81 static AjBool uniform  = AJFALSE ;
82 static ajint thresh = 20 ;
83 static ajint nbase;
84 static ajint nmin;
85 static ajint nmax;
86 static AjPSeqCvt cvt;
87 
88 static EtandemPCons etandem_consCreate(void);
89 static void etandem_consDestroy(EtandemPCons *cons);
90 static void etandem_basicReport(AjPFeattable tab, AjPFile outfile,
91 				const EtandemPCons a);
92 static void etandem_report(EtandemPCons *a);
93 static void etandem_finalReport(AjPFeattable tab, AjPFile outfile);
94 
95 #define ATAB(x,y) (a->tab[x+5*y])
96 
97 static ajint nCons = 0 ;
98 
99 
100 
101 
102 /* @funcstatic etandem_consCreate *********************************************
103 **
104 ** Undocumented.
105 **
106 ** @return [EtandemPCons] Undocumented
107 ** @@
108 ******************************************************************************/
109 
etandem_consCreate(void)110 static EtandemPCons etandem_consCreate(void)
111 {
112     static EtandemPCons res;
113 
114     AJNEW0(res);
115     AJCNEW0(res->max, nmax+1);
116     AJCNEW0(res->bestMax, nmax+1);
117     AJCNEW0(res->tab, 5*nmax+5);
118     ++nCons;
119 
120     return res;
121 }
122 
123 
124 
125 
126 /* @funcstatic etandem_consDestroy ********************************************
127 **
128 ** Undocumented.
129 **
130 ** @param [d] cons [EtandemPCons*] Undocumented
131 ** @@
132 ******************************************************************************/
133 
etandem_consDestroy(EtandemPCons * cons)134 static void etandem_consDestroy(EtandemPCons *cons)
135 {
136 
137     if(!*cons)
138 	return;
139     --nCons;
140     AJFREE((*cons)->max);
141     AJFREE((*cons)->bestMax);
142     AJFREE((*cons)->tab);
143     AJFREE(*cons);
144 
145     return;
146 }
147 
148 
149 
150 
151 /***************** reporting code *****************/
152 
153 static EtandemOCons reportRootStruct;
154 static EtandemPCons reportRoot = &reportRootStruct;
155 
156 
157 
158 
159 /* @funcstatic etandem_basicReport ********************************************
160 **
161 ** Undocumented.
162 **
163 ** @param [u] tab [AjPFeattable] Feature table
164 ** @param [u] outfile [AjPFile] Output file (null unless original output
165 **                              is needed)
166 ** @param [r] a [const EtandemPCons] Undocumented
167 ** @@
168 ******************************************************************************/
169 
etandem_basicReport(AjPFeattable tab,AjPFile outfile,const EtandemPCons a)170 static void etandem_basicReport(AjPFeattable tab, AjPFile outfile,
171 				const EtandemPCons a)
172 {
173     ajint j;
174     ajint copies;
175     ajint n;
176     float perc;
177     AjPStr constr = NULL;
178     AjPStr rpthit = NULL;
179     AjPStr s = NULL;
180     AjPFeature gf;
181 
182     ajDebug("basicReport\n");
183 
184     n = a->repeat;
185 
186     if(!rpthit)
187       ajStrAssignC(&rpthit, "SO:0000705");
188 
189     copies = (a->ibest - a->start + 1) / n;
190     perc = (float)100.0 * (a->bestScore + n * (copies + 1)) /
191 	((float)2.0 * n * copies);
192     if(outfile)
193       ajFmtPrintF(outfile, "%6d %10d %10d %2d %3d %5.1f ",
194 		  a->bestScore, a->start+1, a->ibest+1,
195 		  n, copies, perc);
196 
197     gf = ajFeatNew(tab, NULL, rpthit,
198 		   a->start+1, a->ibest+1,
199 		   (float) a->bestScore, '+', 0);
200     ajFeatTagAddCC(gf, "rpt_type", "TANDEM");
201     ajFmtPrintS(&s, "*rpt_size %d", n);
202     ajFeatTagAddSS(gf, NULL, s);
203     ajFmtPrintS(&s, "*rpt_count %d", copies);
204     ajFeatTagAddSS(gf, NULL, s);
205     ajFmtPrintS(&s, "*identity %.1f", perc);
206     ajFeatTagAddSS(gf, NULL, s);
207 
208     /* make the consensus */
209 
210     for(j = (a->phase+1) % n; j < n; ++j)
211     {
212 	/*ajDebug("      bestMax[%d] letter[%d] '%c'\n",
213 		j, a->bestMax[j], letter[a->bestMax[j]]);*/
214 	ajStrAppendK(&constr, letter[a->bestMax[j]]);
215     }
216 
217     if((a->phase+1) % n)
218 	for(j = 0; j <= a->phase; ++j)
219 	{
220 	    /*ajDebug("more: bestMax[%d] letter[%d] '%c'\n",
221 		    j, a->bestMax[j], letter[a->bestMax[j]]);*/
222 	    ajStrAppendK(&constr, letter[a->bestMax[j]]);
223 	}
224 
225     ajFmtPrintS(&s, "*consensus %S", constr);
226     ajFeatTagAddSS(gf, NULL, s);
227 
228     if(outfile)
229       ajFmtPrintF(outfile, "%S\n", constr);
230 
231     ajStrDel(&rpthit);
232     ajStrDel(&s);
233     ajStrDel(&constr);
234 
235     return;
236 }
237 
238 
239 
240 
241 /* @funcstatic etandem_report *************************************************
242 **
243 ** Undocumented.
244 **
245 ** @param [d] a [EtandemPCons*] Undocumented
246 ** @@
247 ******************************************************************************/
248 
etandem_report(EtandemPCons * a)249 static void etandem_report(EtandemPCons *a)
250 {
251     ajint j;
252     ajint firstchar;
253 
254     if((*a)->bestScore >= thresh)
255     {
256 	if(uniform)
257 	    goto good;
258 
259 	/* else check not a single letter pattern */
260 	firstchar = (*a)->bestMax[0];
261 
262 	for(j = 1; j < (*a)->repeat; j++)
263 	    if((*a)->bestMax[j] != firstchar)
264 		goto good;
265     }
266 
267     etandem_consDestroy(a);	/* don't destroy, since reporting repeatedly */
268     return;
269 
270  good:
271     (*a)->next = reportRoot->next;
272     reportRoot->next = *a;
273 
274     return;
275 }
276 
277 
278 
279 
280 /* @funcstatic etandem_finalReport ********************************************
281 **
282 ** Undocumented.
283 **
284 ** @param [u] tab [AjPFeattable] Feature table
285 ** @param [u] outfile [AjPFile] Output file (null unless original output
286 **                              is needed)
287 ** @@
288 ******************************************************************************/
289 
etandem_finalReport(AjPFeattable tab,AjPFile outfile)290 static void etandem_finalReport(AjPFeattable tab, AjPFile outfile)
291 {
292     ajint start;
293     ajint end;
294     EtandemPCons a;
295     EtandemPCons top;
296     EtandemPCons olda;
297 
298     ajDebug("finalReport\n");
299     while(reportRoot->next)
300     {					/* find top score */
301 	top = reportRoot;
302 	for(a = reportRoot->next; a; a = a->next)
303 	    if(a->bestScore > top->bestScore ||
304 		(a->bestScore == top->bestScore && a->repeat < top->repeat))
305 		top = a;
306 
307 	/* report that */
308 	etandem_basicReport(tab, outfile, top);
309 	/* destroy all overlapping entries, including self  */
310 	start = top->start;
311 	end = top->ibest;
312 	olda = reportRoot;
313 	for(a = olda->next; a; olda = a, a = a->next)
314 	    if(a->ibest >= start && a->start <= end)
315 	    {
316 		olda->next = a->next;
317 		etandem_consDestroy(&a);
318 		a = olda;
319 	    }
320     }
321 
322     return;
323 }
324 
325 
326 
327 
328 /* @prog etandem **************************************************************
329 **
330 ** Looks for tandem repeats in a nucleotide sequence
331 **
332 ******************************************************************************/
333 
main(int argc,char ** argv)334 int main(int argc, char **argv)
335 {
336 
337     ajint ibase;
338     ajint base;
339     const char *cp;
340     AjPSeq sequence = NULL;
341     ajint i;
342     ajint j;
343     ajint x;
344     ajint phase;
345     ajint n;
346     EtandemPCons new;
347     EtandemPCons a;
348     EtandemPCons b;
349     EtandemPCons olda;
350     EtandemPCons oldb;
351     AjPStr nseq = NULL;
352     AjPFeattable tab = NULL;
353     AjPReport report = NULL;
354     AjPFile outfile  = NULL;
355     AjPStr tmpstr    = NULL;
356 
357     embInit("etandem", argc, argv);
358 
359     nmin     = ajAcdGetInt("minrepeat");
360     nmax     = ajAcdGetInt("maxrepeat");
361     mismatch = ajAcdGetBoolean("mismatch");
362     thresh   = ajAcdGetInt("threshold");
363     uniform  = ajAcdGetBoolean("uniform");
364     report   = ajAcdGetReport("outfile");
365     outfile  = ajAcdGetOutfile("origfile");
366     sequence = ajAcdGetSeq("sequence");
367     nbase    = ajSeqGetLen(sequence);
368 
369     tab = ajFeattableNewSeq(sequence);
370 
371     ajFmtPrintAppS(&tmpstr, "Threshold: %d\n", thresh);
372     ajFmtPrintAppS(&tmpstr, "Minrepeat: %d\n", nmin);
373     ajFmtPrintAppS(&tmpstr, "Maxrepeat: %d\n", nmax);
374     ajFmtPrintAppS(&tmpstr, "Mismatch: %B\n", mismatch);
375     ajFmtPrintAppS(&tmpstr, "Uniform: %B\n", uniform);
376     ajReportSetHeaderS(report, tmpstr);
377 
378     cvt = ajSeqcvtNewC("ACGTN");
379     ajSeqConvertNum(sequence, cvt, &nseq);
380 
381     AJCNEW(ring, nbase);
382 
383     reportRoot->bestScore = thresh - 1;
384 
385     for(n = nmin; n <= nmax; ++n)
386     {
387 	cp = ajStrGetPtr(nseq);
388 	for(ibase = 0; ibase < nbase; ++ibase, ++cp)
389 	{
390 	    base = *cp - 1;
391             if(base < 0) base = 4;
392 
393 	    /* set up local ring */
394 	    phase = ibase % n;
395 	    ring[phase] = base;
396 
397 	    if(ibase < n-1)
398 		continue;
399 
400 	    /* start new Cons */
401 	    new = etandem_consCreate();
402 	    new->start = ibase - n + 1;
403 	    new->next = root->next;
404 	    new->score = new->bestScore = -n;
405 	    new->phase = phase;	/* phase of the last base of pattern */
406 	    new->repeat = n;
407 	    root->next = new;
408 
409 	    /* add last nmer to active Cons's */
410 	    olda = root;
411 
412 	    if(DEBUG)
413 		ajDebug("%d\n", ibase);
414 
415 	    for(a = olda->next; a; olda = a, a = a->next)
416 	    {
417 		if(a->phase == phase)
418 		{
419 		    for(i = 0; i < n; ++i)
420 		    {
421 			x = ring[i];
422                         if((x+5*i) < 0) ajErr("ATAB(%d,%d) n:%d ibase:%d", x, i, n, ibase);
423 			if(x == 4 && mismatch)
424 			{
425 			    --a->score;
426 			    continue;
427 			}
428 			++ATAB(x,i);
429 			if(x == a->max[i])
430 			    ++a->score;
431 			else if(ATAB(x,i) > ATAB(a->max[i],i))
432 			{
433 			    a->max[i] = x;
434 			    ++a->score;
435 			}
436 			else
437 			{
438 			    --a->score;
439 			    if(ATAB(x,i) == ATAB(a->max[i], i))
440 				a->max[i] = x;
441 			}
442 		    }
443 
444 		    if(a->score > a->bestScore)
445 		    {
446 			a->bestScore = a->score;
447 			a->ibest = ibase;
448 			for(j = 0; j < n; ++j)
449 			    a->bestMax[j] = a->max[j];
450 		    }
451 		    else if(a->score < 0)
452 		    {
453 			if(DEBUG) ajDebug("D");
454 			olda->next = a->next;
455 			etandem_report(&a);
456 			a = olda;
457 		    }
458 		}
459 	    }
460 
461 	    /* remove duplicate max tables */
462 	    olda = root;
463 	    for(a = olda->next; a; olda = a, a = a->next)
464 	    {
465 		if(a->phase == phase)
466 		{
467 		    oldb = a;
468 		    for(b = a->next; b; b = b->next)
469 		    {			/* all phases */
470 			for(j = 0; j < n; ++j)
471 			{
472 			    if(a->max[j] != b->max[j])
473 				goto nextb;
474 			}
475 
476 			if(a->bestScore > b->bestScore)
477 			{		/* remove b */
478 			    oldb->next = b->next;
479 			    if(DEBUG)
480 			    {
481 				ajDebug("B");
482 				etandem_basicReport(tab, outfile, b);
483 			    }
484 			    etandem_consDestroy(&b);
485 			    b = oldb;
486 			}
487 			else
488 			{		/* remove a */
489 			    olda->next = a->next;
490 			    if(DEBUG)
491 			    {
492 				ajDebug("A");
493 				etandem_basicReport(tab, outfile, a);
494 			    }
495 			    etandem_consDestroy(&a);
496 			    a = olda;
497 			    goto nexta;
498 			}
499 
500 		    nextb:
501 			oldb = b;
502 		    }
503 		nexta:
504 		   ;
505 		}
506 	    }
507 
508 	    if(DEBUG)
509 		for(a = root->next; a; a = a->next)
510 		    etandem_basicReport(tab, outfile, a);
511 	}
512 
513 	while((a = root->next))
514 	{
515 	    root->next = a->next;
516 	    etandem_report(&a);
517 	}
518     }
519 
520     etandem_finalReport(tab, outfile);
521     ajReportWrite(report, tab, sequence);
522     ajReportDel(&report);
523     ajFeattableDel(&tab);
524     ajSeqDel(&sequence);
525     ajFileClose(&outfile);
526 
527     ajSeqcvtDel(&cvt);
528     ajStrDel(&nseq);
529     ajStrDel(&tmpstr);
530     AJFREE(ring);
531 
532     embExit();
533 
534     return 0;
535 }
536