1 static char const rcsid[] = "$Id: dust.c,v 6.5 2003/05/30 17:25:36 coulouri Exp $";
2
3 /* dust.c
4 * ===========================================================================
5 *
6 * PUBLIC DOMAIN NOTICE
7 * National Center for Biotechnology Information
8 *
9 * This software/database is a "United States Government Work" under the
10 * terms of the United States Copyright Act. It was written as part of
11 * the author's official duties as a United States Government employee and
12 * thus cannot be copyrighted. This software/database is freely available
13 * to the public for use. The National Library of Medicine and the U.S.
14 * Government have not placed any restriction on its use or reproduction.
15 *
16 * Although all reasonable efforts have been taken to ensure the accuracy
17 * and reliability of the software and data, the NLM and the U.S.
18 * Government do not and cannot warrant the performance or results that
19 * may be obtained by using this software or data. The NLM and the U.S.
20 * Government disclaim all warranties, express or implied, including
21 * warranties of performance, merchantability or fitness for any particular
22 * purpose.
23 *
24 * Please cite the author in any work or product based on this material.
25 *
26 * ===========================================================================
27 *
28 * File Name: dust.c
29 *
30 * Author(s): Roma Tatusov, John Kuzio
31 *
32 * Version Creation Date: 5/26/95
33 *
34 * $Revision: 6.5 $
35 *
36 * File Description: a utility to find low complexity NA regions
37 *
38 * Modifications:
39 * --------------------------------------------------------------------------
40 * Date Name Description of modification
41 * ------- ---------- -----------------------------------------------------
42 *
43 * $Log: dust.c,v $
44 * Revision 6.5 2003/05/30 17:25:36 coulouri
45 * add rcsid
46 *
47 * Revision 6.4 1999/08/18 17:50:47 kans
48 * include dust.h only after other headers are included to provide prototypes on Mac
49 *
50 * Revision 6.3 1999/08/18 16:26:17 sicotte
51 * Optimized by Moving a Malloc/Free outside of calling functions and other misc optimizations
52 *
53 * Revision 6.2 1999/03/12 17:03:29 kuzio
54 * cast
55 *
56 * Revision 6.1 1998/09/24 15:26:39 egorov
57 * Fix lint complaints
58 *
59 * Revision 6.0 1997/08/25 18:53:04 madden
60 * Revision changed to 6.0
61 *
62 * Revision 5.5 1997/04/24 20:50:41 kuzio
63 * quantify speed up -- remove useless loop
64 *
65 * Revision 5.4 1997/03/12 21:43:48 kuzio
66 * for interactive dusting: FilterSeq from: ,Int4 window, to: ,Int4Ptr exval,
67 *
68 * Revision 5.3 1997/02/26 15:59:55 kuzio
69 * cast: wo1 ((Int2) (flen-i), seq+i, i, cloc);
70 *
71 * Revision 5.2 1997/02/14 22:29:48 kuzio
72 * filter function updates
73 *
74 * Revision 5.1 1997/02/11 00:20:45 kuzio
75 * support for graphs added
76 *
77 * Revision 5.0 1996/05/28 13:43:15 ostell
78 * Set to revision 5.0
79 *
80 * Revision 4.3 1996/01/22 18:25:58 kuzio
81 * function info in err code levels
82 *
83 * Revision 4.2 1995/12/07 18:56:19 kuzio
84 * Trimmed error reports for invalid residues
85 *
86 * Revision 4.1 1995/11/01 12:31:59 kuzio
87 * fixed nested comment line 491
88 *
89 * Revision 4.0 1995/07/26 13:50:15 ostell
90 * force revision to 4.0
91 *
92 * Revision 1.2 1995/07/05 14:06:22 kuzio
93 * fix EOS typo in dusttripfind array setup
94 *
95 * Revision 1.1 1995/05/26 18:47:47 kuzio
96 * Initial revision
97 *
98 *
99 * ==========================================================================
100 */
101
102 #include <sequtil.h>
103 #include <objmgr.h>
104 #include <seqport.h>
105 #include <dust.h>
106
107 static char _this_module[] = "dust";
108 #undef THIS_MODULE
109 #define THIS_MODULE _this_module
110 static char _this_file[] = __FILE__;
111 #undef THIS_FILE
112 #define THIS_FILE _this_file
113
114 /* local, file scope, structures and variables */
115
116 typedef struct endpoints {
117 struct endpoints PNTR next;
118 Int4 from, to;
119 } DREGION;
120
121 typedef struct localcurrents {
122 Int4 curlevel, curstart, curend;
123 } DCURLOC;
124
125 /* local functions */
126
127 static Int4 dust_segs PROTO ((Int4, SeqPortPtr, Int4, DREGION PNTR,
128 Int4, Int4, Int4, Int4));
129 static Int4 wo PROTO ((Int4, SeqPortPtr, Int4, DCURLOC PNTR, Int4 PNTR, UcharPtr seq));
130 static void wo1 PROTO ((Int4, UcharPtr, Int4, DCURLOC PNTR));
131 static Int4 dusttripfind PROTO ((SeqPortPtr, UcharPtr, Int4, Int4,
132 Int4 PNTR));
133 static SeqLocPtr slpDust PROTO ((SeqPortPtr, SeqLocPtr, SeqIdPtr,
134 ValNodePtr PNTR, DREGION PNTR,
135 Int4, Int4));
136
137 /* a simple bioseq */
138
BioseqDust(BioseqPtr bsp,Int4 start,Int4 end,Int2 level,Int2 window,Int2 minwin,Int2 linker)139 SeqLocPtr BioseqDust (BioseqPtr bsp, Int4 start, Int4 end,
140 Int2 level, Int2 window, Int2 minwin, Int2 linker)
141 {
142 SeqLocPtr slp = NULL; /* initialize */
143 ValNodePtr vnp = NULL;
144
145 SeqPortPtr spp;
146
147 DREGION PNTR reg, PNTR regold;
148 Int4 nreg;
149 Int4 l;
150 Int4 loopDustMax = 1;
151
152 /* error msg stuff */
153 /* ErrSetOptFlags (EO_MSG_CODES | EO_SHOW_FILELINE | EO_BEEP); */
154 ErrSetOptFlags (EO_MSG_CODES);
155
156 /* make sure bioseq is there */
157 if (!bsp)
158 {
159 ErrPostEx (SEV_ERROR, 1, 1,
160 "no bioseq");
161 ErrShow ();
162 return slp;
163 }
164 if (!ISA_na (bsp->mol))
165 {
166 ErrPostEx (SEV_WARNING, 1, 2,
167 "not nucleic acid");
168 ErrShow ();
169 return slp;
170 }
171
172 /* place for dusted regions */
173 reg = MemNew (sizeof (DREGION));
174 if (!reg)
175 {
176 ErrPostEx (SEV_FATAL, 1, 3,
177 "memory allocation error");
178 ErrShow ();
179 return slp;
180 }
181 reg->from = 0;
182 reg->to = 0;
183 reg->next = NULL;
184
185 /* do it */
186 spp = SeqPortNew (bsp, start, end, 0, Seq_code_ncbi2na);
187 if (!spp)
188 {
189 ErrPostEx (SEV_ERROR, 1, 4,
190 "sequence port open failed");
191 ErrShow ();
192 MemFree (reg);
193 return slp;
194 }
195
196 l = spp->totlen;
197 nreg = dust_segs (l, spp, start, reg, (Int4)level, (Int4)window, (Int4)minwin, (Int4)linker);
198 slp = slpDust (spp, NULL, bsp->id, &vnp, reg, nreg, loopDustMax);
199
200 /* clean up memory */
201 SeqPortFree (spp);
202 while (reg)
203 {
204 regold = reg;
205 reg = reg->next;
206 MemFree (regold);
207 }
208
209 return slp;
210 }
211
212 /* sequence location pointers */
213
SeqLocDust(SeqLocPtr this_slp,Int2 level,Int2 window,Int2 minwin,Int2 linker)214 SeqLocPtr SeqLocDust (SeqLocPtr this_slp,
215 Int2 level, Int2 window, Int2 minwin, Int2 linker)
216 {
217 SeqLocPtr next_slp, slp = NULL;
218 ValNodePtr vnp = NULL;
219
220 SeqIdPtr id;
221 BioseqPtr bsp;
222 SeqPortPtr spp;
223
224 DREGION PNTR reg, PNTR regold;
225 Int4 nreg;
226 Int4 start, end, l;
227 Int2 loopDustMax = 0;
228
229 /* error msg stuff */
230 ErrSetOptFlags (EO_MSG_CODES);
231
232 if (!this_slp)
233 {
234 ErrPostEx (SEV_ERROR, 2, 1,
235 "no sequence location given for dusting");
236 ErrShow ();
237 return slp;
238 }
239
240 /* place for dusted regions */
241 regold = reg = MemNew (sizeof (DREGION));
242 if (!reg)
243 {
244 ErrPostEx (SEV_FATAL, 2, 2,
245 "memory allocation error");
246 ErrShow ();
247 return slp;
248 }
249 reg->from = 0;
250 reg->to = 0;
251 reg->next = NULL;
252
253 /* count seqlocs */
254 next_slp = NULL;
255 while ((next_slp = SeqLocFindNext (this_slp, next_slp)) != NULL)
256 loopDustMax++;
257 if (!loopDustMax)
258 {
259 ErrPostEx (SEV_ERROR, 2, 3,
260 "can not find next seq loc");
261 ErrShow ();
262 }
263
264 /* loop for dusting as needed */
265 next_slp = NULL;
266 while ((next_slp = SeqLocFindNext (this_slp, next_slp)) != NULL)
267 {
268 /* offsets into actual sequence */
269 start = SeqLocStart (next_slp);
270 end = SeqLocStop (next_slp);
271
272 /* if all goes okay should get a seqport pointer */
273 id = SeqLocId (next_slp);
274 if (!id)
275 {
276 ErrPostEx (SEV_ERROR, 2, 4,
277 "no bioseq id");
278 ErrShow ();
279 continue;
280 }
281 bsp = BioseqLockById (id);
282 if (!bsp)
283 {
284 ErrPostEx (SEV_ERROR, 2, 5,
285 "no bioseq");
286 ErrShow ();
287 continue;
288 }
289 if (!ISA_na (bsp->mol))
290 {
291 ErrPostEx (SEV_WARNING, 2, 6,
292 "not nucleic acid");
293 ErrShow ();
294 BioseqUnlock (bsp);
295 continue;
296 }
297 spp = SeqPortNew (bsp, start, end, 0, Seq_code_ncbi2na);
298 BioseqUnlock (bsp);
299 if (!spp)
300 {
301 ErrPostEx (SEV_ERROR, 2, 7,
302 "sequence port open failed");
303 ErrShow ();
304 continue;
305 }
306
307 l = spp->totlen;
308 nreg = dust_segs (l, spp, start, reg,
309 (Int4)level, (Int4)window, (Int4)minwin, (Int4)linker);
310 slp = slpDust (spp, slp, id, &vnp,
311 reg, nreg, loopDustMax);
312 /* find tail - this way avoids referencing the pointer */
313 while (reg->next) reg = reg->next;
314
315 SeqPortFree (spp);
316 }
317
318 /* clean up memory */
319 reg = regold;
320 while (reg)
321 {
322 regold = reg;
323 reg = reg->next;
324 MemFree (regold);
325 }
326
327 return slp;
328 }
329
330 /* entry point for dusting - from BioseqDust or SeqLocDust */
331
dust_segs(Int4 length,SeqPortPtr spp,Int4 start,DREGION PNTR reg,Int4 level,Int4 windowsize,Int4 minwin,Int4 linker)332 static Int4 dust_segs (Int4 length, SeqPortPtr spp, Int4 start,
333 DREGION PNTR reg,
334 Int4 level, Int4 windowsize, Int4 minwin, Int4 linker)
335 {
336 Int4 len;
337 Int4 i, invrescount;
338 Int4 retlen;
339 UcharPtr seq;
340 DREGION PNTR regold = NULL;
341 DCURLOC cloc;
342 Int4 nreg, windowhalf;
343
344 /* defaults are more-or-less in keeping with original dust */
345 if (level < 2 || level > 64) level = 20;
346 if (windowsize < 8 || windowsize > 64) windowsize = 64;
347 if (minwin < 4 || minwin > 128) minwin = 4;
348 if (linker < 1 || linker > 32) linker = 1;
349 windowhalf = windowsize / 2;
350
351 /* keep track of weird invalid residues */
352 invrescount = 0;
353
354 nreg = 0;
355 seq = MemNew (windowsize); /* triplets */
356 if (!seq)
357 {
358 ErrPostEx (SEV_FATAL, 4, 1,
359 "failed to allocate triplet buffer");
360 ErrShow ();
361 return nreg;
362 }
363
364 for (i = 0; i < length; i += windowhalf)
365 {
366 len = (Int4) ((length > i+windowsize) ? windowsize : length-i);
367 len -= 2;
368 retlen = wo (len, spp, i, &cloc, &invrescount, seq);
369
370 /* get rid of itsy-bitsy's, dusttripfind aborts - move 1 triplet away */
371 if ((cloc.curend - cloc.curstart + 1) < minwin)
372 {
373 if (retlen != len)
374 {
375 i += (retlen - windowhalf + 3);
376 }
377 continue;
378 }
379
380 if (cloc.curlevel > level)
381 {
382 if (nreg &&
383 regold->to + linker >= cloc.curstart+i+start &&
384 regold->from <= cloc.curstart + i + start)
385 {
386 /* overlap windows nicely if needed */
387 regold->to = cloc.curend + i + start;
388 }
389 else
390 {
391 /* new window or dusted regions do not overlap */
392 reg->from = cloc.curstart + i + start;
393 reg->to = cloc.curend + i + start;
394 /* 5' edge effects - 3' edge effects - are best handled interactively */
395 /* it prbly would be good to put 'linker' as an interactive option */
396 /* interactive means wrapping stuff up in a graphics shell */
397 regold = reg;
398 reg = MemNew (sizeof (DREGION));
399 if (!reg)
400 {
401 ErrPostEx (SEV_FATAL, 3, 1,
402 "memory allocation error");
403 ErrShow ();
404 MemFree(seq);
405 return nreg;
406 }
407 reg->next = NULL;
408 regold->next = reg;
409 nreg++;
410 }
411 /* kill virtually all 3' tiling anomalies */
412 if (cloc.curend < windowhalf)
413 {
414 i += (cloc.curend - windowhalf);
415 }
416 } /* end 'if' high score */
417 } /* end for */
418 MemFree (seq);
419 if (invrescount > 0)
420 {
421 ErrPostEx (SEV_INFO, 3, 2,
422 "Total invalid residues found: %ld", (long) invrescount);
423 ErrShow ();
424 }
425 return nreg;
426 }
427
wo(Int4 len,SeqPortPtr spp,Int4 iseg,DCURLOC PNTR cloc,Int4 PNTR invrescount,UcharPtr seq)428 static Int4 wo (Int4 len, SeqPortPtr spp, Int4 iseg, DCURLOC PNTR cloc,
429 Int4 PNTR invrescount, UcharPtr seq)
430 {
431 Int4 i, flen;
432
433 cloc->curlevel = 0;
434 cloc->curstart = 0;
435 cloc->curend = 0;
436
437 /* get the chunk of sequence in triplets */
438
439 SeqPortSeek (spp, iseg, SEEK_SET);
440 MemSet (seq,0,len+2); /* Zero the triplet buffer */
441 flen = dusttripfind (spp, seq, iseg, len, invrescount);
442
443 /* dust the chunk */
444 for (i = 0; i < flen; i++)
445 {
446 wo1 (flen-i, seq+i, i, cloc);
447 }
448
449 cloc->curend += cloc->curstart;
450
451 return flen;
452 }
453
wo1(Int4 len,UcharPtr seq,Int4 iwo,DCURLOC PNTR cloc)454 static void wo1 (Int4 len, UcharPtr seq, Int4 iwo, DCURLOC PNTR cloc)
455 {
456 Uint4 sum;
457 Int4 loop;
458 Int4 newlevel;
459
460 Int2 PNTR countsptr;
461 Int2 counts[4*4*4];
462 MemSet (counts, 0, sizeof (counts));
463 /* zero everything */
464 sum = 0;
465 newlevel = 0;
466
467 /* dust loop -- specific for triplets */
468 for (loop = 0; loop < len; loop++)
469 {
470 countsptr = &counts[*seq++];
471 if (*countsptr)
472 {
473 sum += (Uint8)(*countsptr);
474
475 newlevel = 10 * sum / loop;
476
477 if (cloc->curlevel < newlevel)
478 {
479 cloc->curlevel = newlevel;
480 cloc->curstart = iwo;
481 cloc->curend = loop + 2; /* triplets */
482 }
483 }
484 (*countsptr)++;
485 }
486 return;
487 }
488
489 /* fill an array with 2-bit encoded triplets */
490
dusttripfind(SeqPortPtr spp,UcharPtr s1,Int4 icur,Int4 max,Int4 PNTR invrescount)491 static Int4 dusttripfind (SeqPortPtr spp, UcharPtr s1, Int4 icur, Int4 max,
492 Int4 PNTR invrescount)
493 {
494 Int4 pos;
495 Int4 n;
496 UcharPtr s2, s3;
497 Int2 c;
498 Boolean flagVD;
499
500 n = 0;
501
502 s2 = s1 + 1;
503 s3 = s1 + 2;
504
505 SeqPortSeek (spp, icur, SEEK_SET);
506
507 /* set up needs streamlining */
508 /* start again at segment or virtual sequence bounderies */
509 /* set up 1 */
510 if ((c = SeqPortGetResidue (spp)) == SEQPORT_EOF)
511 return n;
512 if (c == SEQPORT_EOS || c == SEQPORT_VIRT)
513 return n;
514 if (!IS_residue (c))
515 {
516 c = 0; /* 255 it's 'A' */
517 if (*invrescount < 3)
518 {
519 pos = SeqPortTell (spp);
520 ErrPostEx (SEV_INFO, 5, 1,
521 "Invalid residue converted to 'A': %ld", (long) pos);
522 ErrShow ();
523 }
524 (*invrescount)++;
525 }
526 *s1 |= c;
527 *s1 <<= 2;
528
529 /* set up 2 */
530 if ((c = SeqPortGetResidue (spp)) == SEQPORT_EOF)
531 return n;
532 if (c == SEQPORT_EOS || c == SEQPORT_VIRT)
533 return n;
534 if (!IS_residue (c))
535 {
536 c = 0; /* 255 it's 'A' */
537 if (*invrescount < 3)
538 {
539 pos = SeqPortTell (spp);
540 ErrPostEx (SEV_INFO, 5, 1,
541 "Invalid residue converted to 'A': %ld", (long) pos);
542 ErrShow ();
543 }
544 (*invrescount)++;
545 }
546 *s1 |= c;
547 *s2 |= c;
548
549 /* triplet fill loop */
550 flagVD = TRUE;
551 while ((c = SeqPortGetResidue (spp)) != SEQPORT_EOF && n < max)
552 {
553 if (c == INVALID_RESIDUE)
554 {
555 c = 0; /* 255 it's 'A' */
556 if (*invrescount < 3)
557 {
558 pos = SeqPortTell (spp);
559 ErrPostEx (SEV_INFO, 5, 1,
560 "Invalid residue converted to 'A': %ld", (long) pos);
561 ErrShow ();
562 }
563 (*invrescount)++;
564 }
565 if (IS_residue (c))
566 {
567 *s1 <<= 2;
568 *s2 <<= 2;
569 *s1 |= c;
570 *s2 |= c;
571 *s3 |= c;
572 s1++;
573 s2++;
574 s3++;
575 n++;
576 }
577 else
578 {
579 switch (c)
580 {
581 case SEQPORT_EOS: /* 252 rare */
582 break;
583 /* VIRT if there is an undetermined segment of sequence */
584 case SEQPORT_VIRT: /* 251 ignore ? */
585 default:
586 /* flagVD = TRUE; dust across v-seg */
587 flagVD = FALSE; /* don't dust across */
588 break;
589 }
590 if (!flagVD) break;
591 }
592 } /* end while */
593
594 return n;
595 }
596
597 /* look for dustable locations */
598
slpDust(SeqPortPtr spp,SeqLocPtr slp,SeqIdPtr id,ValNodePtr PNTR vnp,DREGION PNTR reg,Int4 nreg,Int4 loopDustMax)599 static SeqLocPtr slpDust (SeqPortPtr spp, SeqLocPtr slp, SeqIdPtr id,
600 ValNodePtr PNTR vnp, DREGION PNTR reg,
601 Int4 nreg, Int4 loopDustMax)
602 {
603 SeqIntPtr sintp;
604 Int4 i;
605 Boolean flagNoPack;
606
607 /* point to dusted locations */
608 if (nreg)
609 {
610
611 /* loopDustMax == 1 forces PACKED_INT IN - PACKED_INT OUT as needed */
612 flagNoPack = FALSE;
613 if (nreg == 1 && loopDustMax == 1) flagNoPack = TRUE;
614
615 if (!slp)
616 {
617 if ((slp = ValNodeNew (NULL)) == NULL)
618 {
619 ErrPostEx (SEV_ERROR, 6, 1,
620 "val node new failed");
621 ErrShow ();
622 return slp;
623 }
624 }
625
626 if (flagNoPack)
627 {
628 slp->choice = SEQLOC_INT;
629 }
630 else
631 {
632 slp->choice = SEQLOC_PACKED_INT;
633 }
634
635 for (i = 0; i < nreg; i++)
636 {
637 sintp = SeqIntNew ();
638 if (!sintp)
639 {
640 ErrPostEx (SEV_FATAL, 6, 2,
641 "memory allocation error");
642 ErrShow ();
643 return slp;
644 }
645 sintp->id = SeqIdDup (id);
646 sintp->from = reg->from;
647 sintp->to = reg->to;
648 if (!flagNoPack) ValNodeAddPointer
649 (vnp, SEQLOC_INT, sintp);
650 reg = reg->next;
651 }
652
653 if (flagNoPack)
654 {
655 slp->data.ptrvalue = (Pointer) sintp;
656 }
657 else
658 {
659 slp->data.ptrvalue = *vnp;
660 }
661 }
662 return slp;
663 }
664
DustGraph(SeqPortPtr spp,Int4 length,Int2 level,Int2 window,Int2 minwin,Int2 linker)665 extern FloatHiPtr DustGraph (SeqPortPtr spp, Int4 length, Int2 level,
666 Int2 window, Int2 minwin, Int2 linker)
667 {
668 DCURLOC cloc;
669 Int2 windowhalf;
670 Int2 retlen;
671 Int4 len;
672 Int4 i, n;
673 Int4 invrescount;
674 FloatHi maxval, curval;
675 FloatHiPtr fhead, ftemp;
676 FloatHiPtr fptr;
677 UcharPtr seq;
678
679 invrescount = 0;
680 windowhalf = window / 2;
681
682 maxval = level * 5.0;
683 if (maxval > 100.0)
684 maxval = 100.0;
685
686 fptr = (FloatHiPtr) MemNew ((sizeof (FloatHi)) * length);
687 fhead = fptr;
688 for (i = 0; i < length; i++)
689 *fptr++ = 0.0;
690
691 fptr = fhead;
692 seq = MemNew ((size_t)window); /* triplets */
693 if (!seq)
694 {
695 ErrPostEx (SEV_FATAL, 4, 1,
696 "failed to allocate triplet buffer");
697 ErrShow ();
698 return 0;
699 }
700
701 for (i = 0; i < length; i += windowhalf)
702 {
703 len = (length > i+window) ? window : (Int2) (length-i);
704 len -= 2;
705 retlen = wo (len, spp, i, &cloc, &invrescount, seq);
706
707 if ((cloc.curend - cloc.curstart + 1) < minwin)
708 {
709 if (retlen != len)
710 i += (retlen - windowhalf + 3);
711 continue;
712 }
713 curval = (FloatHi) cloc.curlevel;
714 if (curval > maxval)
715 curval = maxval;
716 fptr = fhead + cloc.curstart + i;
717 for (n = cloc.curstart; n < cloc.curend+1; n++)
718 *fptr++ = curval;
719 }
720 MemFree(seq);
721 fptr = fhead;
722 for (i = 0; i < length-((Int4)linker); i++)
723 {
724 if (*fptr != 0)
725 {
726 ftemp = fptr + linker;
727 if (*ftemp != 0)
728 {
729 curval = (*ftemp + *fptr) / 2;
730 fptr++;
731 for (n = 1; n < linker; n++)
732 *fptr++ = curval;
733 i += (linker - 1);
734 }
735 }
736 else
737 {
738 fptr++;
739 }
740 }
741
742 return fhead;
743 }
744