1 /* @source etandem application
2 **
3 ** Tandem searches for tandem repeats
4 ** @author Copyright (C) Richard Durbin (rd@sanger.ac.uk)
5 ** and Jean Thierry-Mieg 1992
6 ** @@
7 ** The original application is part of the ACEDB genome database
8 ** package, written by ** Richard Durbin (MRC LMB, UK)
9 ** rd@mrc-lmba.cam.ac.uk, and Jean Thierry-Mieg (CRBM du CNRS,
10 ** France) mieg@crbm1.cnusc.fr
11 **
12 ** This program is free software; you can redistribute it and/or
13 ** modify it under the terms of the GNU General Public License
14 ** as published by the Free Software Foundation; either version 2
15 ** of the License, or (at your option) any later version.
16 **
17 ** This program is distributed in the hope that it will be useful,
18 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
19 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 ** GNU General Public License for more details.
21 **
22 ** You should have received a copy of the GNU General Public License
23 ** along with this program; if not, write to the Free Software
24 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
25 ******************************************************************************/
26
27 #include "emboss.h"
28
29
30
31
32 #define DEBUG 0
33
34 #define NMAX 1000
35
36
37
38
39 /* @datastatic EtandemPCons ***************************************************
40 **
41 ** Consensus pattern structure
42 **
43 ** @alias EtandemSCons
44 ** @alias EtandemOCons
45 **
46 ** @attr tab [ajint*] Undocumented
47 ** @attr max [ajint*] Undocumented
48 ** @attr start [ajint] Undocumented
49 ** @attr score [ajint] Undocumented
50 ** @attr bestScore [ajint] Undocumented
51 ** @attr ibest [ajint] Undocumented
52 ** @attr bestMax [ajint*] Undocumented
53 ** @attr phase [ajint] Undocumented
54 ** @attr repeat [ajint] Undocumented
55 ** @attr next [struct EtandemSCons*] Next node in linked list
56 ** @@
57 ******************************************************************************/
58
59 typedef struct EtandemSCons
60 {
61 ajint* tab ;
62 ajint* max ;
63 ajint start ;
64 ajint score ;
65 ajint bestScore ;
66 ajint ibest ;
67 ajint* bestMax ;
68 ajint phase ;
69 ajint repeat ;
70 struct EtandemSCons* next ;
71 } EtandemOCons, *EtandemPCons ;
72
73
74
75
76 static EtandemOCons rootStruct ;
77 static EtandemPCons root = &rootStruct ;
78 static ajint *ring ;
79 static char letter[5] = "acgtn" ;
80 static AjBool mismatch = AJFALSE ;
81 static AjBool uniform = AJFALSE ;
82 static ajint thresh = 20 ;
83 static ajint nbase;
84 static ajint nmin;
85 static ajint nmax;
86 static AjPSeqCvt cvt;
87
88 static EtandemPCons etandem_consCreate(void);
89 static void etandem_consDestroy(EtandemPCons *cons);
90 static void etandem_basicReport(AjPFeattable tab, AjPFile outfile,
91 const EtandemPCons a);
92 static void etandem_report(EtandemPCons *a);
93 static void etandem_finalReport(AjPFeattable tab, AjPFile outfile);
94
95 #define ATAB(x,y) (a->tab[x+5*y])
96
97 static ajint nCons = 0 ;
98
99
100
101
102 /* @funcstatic etandem_consCreate *********************************************
103 **
104 ** Undocumented.
105 **
106 ** @return [EtandemPCons] Undocumented
107 ** @@
108 ******************************************************************************/
109
etandem_consCreate(void)110 static EtandemPCons etandem_consCreate(void)
111 {
112 static EtandemPCons res;
113
114 AJNEW0(res);
115 AJCNEW0(res->max, nmax+1);
116 AJCNEW0(res->bestMax, nmax+1);
117 AJCNEW0(res->tab, 5*nmax+5);
118 ++nCons;
119
120 return res;
121 }
122
123
124
125
126 /* @funcstatic etandem_consDestroy ********************************************
127 **
128 ** Undocumented.
129 **
130 ** @param [d] cons [EtandemPCons*] Undocumented
131 ** @@
132 ******************************************************************************/
133
etandem_consDestroy(EtandemPCons * cons)134 static void etandem_consDestroy(EtandemPCons *cons)
135 {
136
137 if(!*cons)
138 return;
139 --nCons;
140 AJFREE((*cons)->max);
141 AJFREE((*cons)->bestMax);
142 AJFREE((*cons)->tab);
143 AJFREE(*cons);
144
145 return;
146 }
147
148
149
150
151 /***************** reporting code *****************/
152
153 static EtandemOCons reportRootStruct;
154 static EtandemPCons reportRoot = &reportRootStruct;
155
156
157
158
159 /* @funcstatic etandem_basicReport ********************************************
160 **
161 ** Undocumented.
162 **
163 ** @param [u] tab [AjPFeattable] Feature table
164 ** @param [u] outfile [AjPFile] Output file (null unless original output
165 ** is needed)
166 ** @param [r] a [const EtandemPCons] Undocumented
167 ** @@
168 ******************************************************************************/
169
etandem_basicReport(AjPFeattable tab,AjPFile outfile,const EtandemPCons a)170 static void etandem_basicReport(AjPFeattable tab, AjPFile outfile,
171 const EtandemPCons a)
172 {
173 ajint j;
174 ajint copies;
175 ajint n;
176 float perc;
177 AjPStr constr = NULL;
178 AjPStr rpthit = NULL;
179 AjPStr s = NULL;
180 AjPFeature gf;
181
182 ajDebug("basicReport\n");
183
184 n = a->repeat;
185
186 if(!rpthit)
187 ajStrAssignC(&rpthit, "SO:0000705");
188
189 copies = (a->ibest - a->start + 1) / n;
190 perc = (float)100.0 * (a->bestScore + n * (copies + 1)) /
191 ((float)2.0 * n * copies);
192 if(outfile)
193 ajFmtPrintF(outfile, "%6d %10d %10d %2d %3d %5.1f ",
194 a->bestScore, a->start+1, a->ibest+1,
195 n, copies, perc);
196
197 gf = ajFeatNew(tab, NULL, rpthit,
198 a->start+1, a->ibest+1,
199 (float) a->bestScore, '+', 0);
200 ajFeatTagAddCC(gf, "rpt_type", "TANDEM");
201 ajFmtPrintS(&s, "*rpt_size %d", n);
202 ajFeatTagAddSS(gf, NULL, s);
203 ajFmtPrintS(&s, "*rpt_count %d", copies);
204 ajFeatTagAddSS(gf, NULL, s);
205 ajFmtPrintS(&s, "*identity %.1f", perc);
206 ajFeatTagAddSS(gf, NULL, s);
207
208 /* make the consensus */
209
210 for(j = (a->phase+1) % n; j < n; ++j)
211 {
212 /*ajDebug(" bestMax[%d] letter[%d] '%c'\n",
213 j, a->bestMax[j], letter[a->bestMax[j]]);*/
214 ajStrAppendK(&constr, letter[a->bestMax[j]]);
215 }
216
217 if((a->phase+1) % n)
218 for(j = 0; j <= a->phase; ++j)
219 {
220 /*ajDebug("more: bestMax[%d] letter[%d] '%c'\n",
221 j, a->bestMax[j], letter[a->bestMax[j]]);*/
222 ajStrAppendK(&constr, letter[a->bestMax[j]]);
223 }
224
225 ajFmtPrintS(&s, "*consensus %S", constr);
226 ajFeatTagAddSS(gf, NULL, s);
227
228 if(outfile)
229 ajFmtPrintF(outfile, "%S\n", constr);
230
231 ajStrDel(&rpthit);
232 ajStrDel(&s);
233 ajStrDel(&constr);
234
235 return;
236 }
237
238
239
240
241 /* @funcstatic etandem_report *************************************************
242 **
243 ** Undocumented.
244 **
245 ** @param [d] a [EtandemPCons*] Undocumented
246 ** @@
247 ******************************************************************************/
248
etandem_report(EtandemPCons * a)249 static void etandem_report(EtandemPCons *a)
250 {
251 ajint j;
252 ajint firstchar;
253
254 if((*a)->bestScore >= thresh)
255 {
256 if(uniform)
257 goto good;
258
259 /* else check not a single letter pattern */
260 firstchar = (*a)->bestMax[0];
261
262 for(j = 1; j < (*a)->repeat; j++)
263 if((*a)->bestMax[j] != firstchar)
264 goto good;
265 }
266
267 etandem_consDestroy(a); /* don't destroy, since reporting repeatedly */
268 return;
269
270 good:
271 (*a)->next = reportRoot->next;
272 reportRoot->next = *a;
273
274 return;
275 }
276
277
278
279
280 /* @funcstatic etandem_finalReport ********************************************
281 **
282 ** Undocumented.
283 **
284 ** @param [u] tab [AjPFeattable] Feature table
285 ** @param [u] outfile [AjPFile] Output file (null unless original output
286 ** is needed)
287 ** @@
288 ******************************************************************************/
289
etandem_finalReport(AjPFeattable tab,AjPFile outfile)290 static void etandem_finalReport(AjPFeattable tab, AjPFile outfile)
291 {
292 ajint start;
293 ajint end;
294 EtandemPCons a;
295 EtandemPCons top;
296 EtandemPCons olda;
297
298 ajDebug("finalReport\n");
299 while(reportRoot->next)
300 { /* find top score */
301 top = reportRoot;
302 for(a = reportRoot->next; a; a = a->next)
303 if(a->bestScore > top->bestScore ||
304 (a->bestScore == top->bestScore && a->repeat < top->repeat))
305 top = a;
306
307 /* report that */
308 etandem_basicReport(tab, outfile, top);
309 /* destroy all overlapping entries, including self */
310 start = top->start;
311 end = top->ibest;
312 olda = reportRoot;
313 for(a = olda->next; a; olda = a, a = a->next)
314 if(a->ibest >= start && a->start <= end)
315 {
316 olda->next = a->next;
317 etandem_consDestroy(&a);
318 a = olda;
319 }
320 }
321
322 return;
323 }
324
325
326
327
328 /* @prog etandem **************************************************************
329 **
330 ** Looks for tandem repeats in a nucleotide sequence
331 **
332 ******************************************************************************/
333
main(int argc,char ** argv)334 int main(int argc, char **argv)
335 {
336
337 ajint ibase;
338 ajint base;
339 const char *cp;
340 AjPSeq sequence = NULL;
341 ajint i;
342 ajint j;
343 ajint x;
344 ajint phase;
345 ajint n;
346 EtandemPCons new;
347 EtandemPCons a;
348 EtandemPCons b;
349 EtandemPCons olda;
350 EtandemPCons oldb;
351 AjPStr nseq = NULL;
352 AjPFeattable tab = NULL;
353 AjPReport report = NULL;
354 AjPFile outfile = NULL;
355 AjPStr tmpstr = NULL;
356
357 embInit("etandem", argc, argv);
358
359 nmin = ajAcdGetInt("minrepeat");
360 nmax = ajAcdGetInt("maxrepeat");
361 mismatch = ajAcdGetBoolean("mismatch");
362 thresh = ajAcdGetInt("threshold");
363 uniform = ajAcdGetBoolean("uniform");
364 report = ajAcdGetReport("outfile");
365 outfile = ajAcdGetOutfile("origfile");
366 sequence = ajAcdGetSeq("sequence");
367 nbase = ajSeqGetLen(sequence);
368
369 tab = ajFeattableNewSeq(sequence);
370
371 ajFmtPrintAppS(&tmpstr, "Threshold: %d\n", thresh);
372 ajFmtPrintAppS(&tmpstr, "Minrepeat: %d\n", nmin);
373 ajFmtPrintAppS(&tmpstr, "Maxrepeat: %d\n", nmax);
374 ajFmtPrintAppS(&tmpstr, "Mismatch: %B\n", mismatch);
375 ajFmtPrintAppS(&tmpstr, "Uniform: %B\n", uniform);
376 ajReportSetHeaderS(report, tmpstr);
377
378 cvt = ajSeqcvtNewC("ACGTN");
379 ajSeqConvertNum(sequence, cvt, &nseq);
380
381 AJCNEW(ring, nbase);
382
383 reportRoot->bestScore = thresh - 1;
384
385 for(n = nmin; n <= nmax; ++n)
386 {
387 cp = ajStrGetPtr(nseq);
388 for(ibase = 0; ibase < nbase; ++ibase, ++cp)
389 {
390 base = *cp - 1;
391 if(base < 0) base = 4;
392
393 /* set up local ring */
394 phase = ibase % n;
395 ring[phase] = base;
396
397 if(ibase < n-1)
398 continue;
399
400 /* start new Cons */
401 new = etandem_consCreate();
402 new->start = ibase - n + 1;
403 new->next = root->next;
404 new->score = new->bestScore = -n;
405 new->phase = phase; /* phase of the last base of pattern */
406 new->repeat = n;
407 root->next = new;
408
409 /* add last nmer to active Cons's */
410 olda = root;
411
412 if(DEBUG)
413 ajDebug("%d\n", ibase);
414
415 for(a = olda->next; a; olda = a, a = a->next)
416 {
417 if(a->phase == phase)
418 {
419 for(i = 0; i < n; ++i)
420 {
421 x = ring[i];
422 if((x+5*i) < 0) ajErr("ATAB(%d,%d) n:%d ibase:%d", x, i, n, ibase);
423 if(x == 4 && mismatch)
424 {
425 --a->score;
426 continue;
427 }
428 ++ATAB(x,i);
429 if(x == a->max[i])
430 ++a->score;
431 else if(ATAB(x,i) > ATAB(a->max[i],i))
432 {
433 a->max[i] = x;
434 ++a->score;
435 }
436 else
437 {
438 --a->score;
439 if(ATAB(x,i) == ATAB(a->max[i], i))
440 a->max[i] = x;
441 }
442 }
443
444 if(a->score > a->bestScore)
445 {
446 a->bestScore = a->score;
447 a->ibest = ibase;
448 for(j = 0; j < n; ++j)
449 a->bestMax[j] = a->max[j];
450 }
451 else if(a->score < 0)
452 {
453 if(DEBUG) ajDebug("D");
454 olda->next = a->next;
455 etandem_report(&a);
456 a = olda;
457 }
458 }
459 }
460
461 /* remove duplicate max tables */
462 olda = root;
463 for(a = olda->next; a; olda = a, a = a->next)
464 {
465 if(a->phase == phase)
466 {
467 oldb = a;
468 for(b = a->next; b; b = b->next)
469 { /* all phases */
470 for(j = 0; j < n; ++j)
471 {
472 if(a->max[j] != b->max[j])
473 goto nextb;
474 }
475
476 if(a->bestScore > b->bestScore)
477 { /* remove b */
478 oldb->next = b->next;
479 if(DEBUG)
480 {
481 ajDebug("B");
482 etandem_basicReport(tab, outfile, b);
483 }
484 etandem_consDestroy(&b);
485 b = oldb;
486 }
487 else
488 { /* remove a */
489 olda->next = a->next;
490 if(DEBUG)
491 {
492 ajDebug("A");
493 etandem_basicReport(tab, outfile, a);
494 }
495 etandem_consDestroy(&a);
496 a = olda;
497 goto nexta;
498 }
499
500 nextb:
501 oldb = b;
502 }
503 nexta:
504 ;
505 }
506 }
507
508 if(DEBUG)
509 for(a = root->next; a; a = a->next)
510 etandem_basicReport(tab, outfile, a);
511 }
512
513 while((a = root->next))
514 {
515 root->next = a->next;
516 etandem_report(&a);
517 }
518 }
519
520 etandem_finalReport(tab, outfile);
521 ajReportWrite(report, tab, sequence);
522 ajReportDel(&report);
523 ajFeattableDel(&tab);
524 ajSeqDel(&sequence);
525 ajFileClose(&outfile);
526
527 ajSeqcvtDel(&cvt);
528 ajStrDel(&nseq);
529 ajStrDel(&tmpstr);
530 AJFREE(ring);
531
532 embExit();
533
534 return 0;
535 }
536