1 static char const rcsid[] = "$Id: urkfltr.c,v 6.8 2003/05/30 17:25:38 coulouri Exp $";
2
3 /*
4 * ===========================================================================
5 *
6 * PUBLIC DOMAIN NOTICE
7 * National Center for Biotechnology Information
8 *
9 * This software/database is a "United States Government Work" under the
10 * terms of the United States Copyright Act. It was written as part of
11 * the author's official duties as a United States Government employee and
12 * thus cannot be copyrighted. This software/database is freely available
13 * to the public for use. The National Library of Medicine and the U.S.
14 * Government have not placed any restriction on its use or reproduction.
15 *
16 * Although all reasonable efforts have been taken to ensure the accuracy
17 * and reliability of the software and data, the NLM and the U.S.
18 * Government do not and cannot warrant the performance or results that
19 * may be obtained by using this software or data. The NLM and the U.S.
20 * Government disclaim all warranties, express or implied, including
21 * warranties of performance, merchantability or fitness for any particular
22 * purpose.
23 *
24 * Please cite the author in any work or product based on this material.
25 *
26 * ===========================================================================
27 *
28 * File Name: urkfltr.c
29 *
30 * Author(s): John Kuzio
31 *
32 * Version Creation Date: 98-01-01
33 *
34 * $Revision: 6.8 $
35 *
36 * File Description: filter
37 *
38 * Modifications:
39 * --------------------------------------------------------------------------
40 * Date Name Description of modification
41 * --------------------------------------------------------------------------
42 * $Log: urkfltr.c,v $
43 * Revision 6.8 2003/05/30 17:25:38 coulouri
44 * add rcsid
45 *
46 * Revision 6.7 1998/12/23 21:34:04 kuzio
47 * SeqPortStuff
48 *
49 * Revision 6.6 1998/09/16 18:03:34 kuzio
50 * cvs logging
51 *
52 *
53 * ==========================================================================
54 */
55
56 #include <urkfltr.h>
57
ReadFilterData(FilterDatPtr fltp)58 extern Int4 ReadFilterData (FilterDatPtr fltp)
59 {
60 FILE *fin;
61 Int2 i, val;
62 Int4 type;
63 FloatHi score, temp;
64 Char buf[PATH_MAX], buff[PATH_MAX];
65
66 if (fltp == NULL)
67 return 0;
68
69 if (fltp->filterdatafile == NULL || fltp->scr == NULL)
70 return 0;
71
72 if (FindPath ("ncbi", "ncbi", "data", buf, sizeof (buf)))
73 FileBuildPath (buf, NULL, fltp->filterdatafile);
74 else
75 StrCpy(buf, fltp->filterdatafile);
76
77 if ((fin = FileOpen (buf, "r")) == NULL)
78 {
79 if ((fin = FileOpen (fltp->filterdatafile, "r")) == NULL)
80 {
81 return 0;
82 }
83 }
84
85 val = 0;
86 type = -1;
87 while (FileGets (buff, sizeof (buff), fin) != NULL)
88 {
89 if (buff[0] == ';')
90 continue;
91 if (type == -1)
92 {
93 sscanf (buff, "%d", &type);
94 if (type != 1)
95 {
96 FileClose (fin);
97 return 0;
98 }
99 if (FileGets (buff, sizeof (buff), fin) == NULL)
100 {
101 FileClose (fin);
102 return 0;
103 }
104 sscanf (buff, "%d", &type);
105 fltp->window = type;
106 continue;
107 }
108 sscanf (buff, "%lf8.3", &score);
109 fltp->scr[val] = score;
110 val++;
111 }
112 FileClose (fin);
113 fltp->maxscr = 0.0;
114 for (i = 0; i < val; i++)
115 {
116 temp = fltp->scr[i];
117 if (temp < 0.0)
118 temp *= -1.0;
119 if (temp > fltp->maxscr)
120 fltp->maxscr = temp;
121 }
122 fltp->maxscr *= (fltp->window);
123 return val;
124 }
125
FilterDatNew(Int4 filtertype,Int4 window)126 extern FilterDatPtr FilterDatNew (Int4 filtertype, Int4 window)
127 {
128 FilterDatPtr fltp;
129
130 if ((fltp = (FilterDatPtr) MemNew (sizeof (FilterDat))) != NULL)
131 {
132 fltp->window = window;
133 fltp->filtertype = filtertype;
134 if (fltp->filtertype >= AA_FILTER_COMP)
135 fltp->filtersize = 24;
136 else
137 fltp->filtersize = 15;
138 fltp->filterdatafile = NULL;
139 fltp->maxscr = 0.0;
140
141 if (fltp->filtertype >= AA_FILTER_COMP)
142 /* no data available for selenocysteine U -- 24 characters */
143 fltp->res = StringSave ("ABCDEFGHIKLMNPQRSTVWXYZ*");
144 else
145 /* standard -- 15 characters */
146 fltp->res = StringSave ("ABCDGHKMNRSTUWY");
147
148 fltp->scr =
149 (FloatHiPtr) MemNew ((size_t)(sizeof(FloatHi)*fltp->filtersize));
150 if (fltp->scr != NULL)
151 {
152 MemSet ((Pointer) fltp->scr, 0,
153 (size_t) (sizeof (FloatHi) * fltp->filtersize));
154 }
155 else
156 {
157 fltp = (FilterDatPtr) MemFree (fltp);
158 }
159 }
160 return fltp;
161 }
162
FilterDatFree(FilterDatPtr fltp)163 extern FilterDatPtr FilterDatFree (FilterDatPtr fltp)
164 {
165 if (fltp != NULL)
166 {
167 MemFree (fltp->filterdatafile);
168 MemFree (fltp->res);
169 MemFree (fltp->scr);
170 fltp = (FilterDatPtr) MemFree (fltp);
171 }
172 return fltp;
173 }
174
urkFilterSeq(CharPtr seq,Int4 start,Int4 end,FilterDatPtr fltp)175 extern FloatHiPtr urkFilterSeq (CharPtr seq, Int4 start, Int4 end,
176 FilterDatPtr fltp)
177 {
178 FloatHiPtr ringptr;
179 FloatHiPtr fhead, fptr;
180 Int4 i, n, iring, length;
181 FloatHi run_score;
182 Boolean flagMatch;
183
184 length = end - start + 1;
185 if (length < fltp->window)
186 return NULL;
187
188 if ((ringptr = (FloatHiPtr)
189 MemNew ((size_t) (sizeof (FloatHi) * fltp->window))) == NULL)
190 return NULL;
191
192 if ((fhead = fptr = (FloatHiPtr)
193 MemNew ((size_t) (sizeof (FloatHi) * length))) == NULL)
194 {
195 MemFree (ringptr);
196 return NULL;
197 }
198
199 MemSet ((Pointer) fptr, 0, (size_t) (sizeof (FloatHi) * length));
200
201 iring = 0;
202 run_score = 0.0;
203 for (i = 0; i < (fltp->window)-1; i++)
204 {
205 flagMatch = FALSE;
206 n = 0;
207 while (fltp->res[n] != '\0')
208 {
209 if (seq[i] == fltp->res[n])
210 {
211 flagMatch = TRUE;
212 break;
213 }
214 n++;
215 }
216 if (flagMatch)
217 {
218 run_score += fltp->scr[n];
219 ringptr[iring] = fltp->scr[n];
220 }
221 else
222 {
223 ringptr[iring] = 0.0;
224 }
225 iring++;
226 }
227 ringptr[iring] = 0.0; /* dummy last value in setup */
228
229 fptr += (fltp->window/2);
230 for (i = fltp->window; i < length; i++)
231 {
232 run_score = run_score - ringptr[iring];
233 flagMatch = FALSE;
234 n = 0;
235 while (fltp->res[n] != '\0')
236 {
237 if (seq[i] == fltp->res[n])
238 {
239 flagMatch = TRUE;
240 break;
241 }
242 n++;
243 }
244 if (flagMatch)
245 {
246 run_score += fltp->scr[n];
247 ringptr[iring] = fltp->scr[n];
248 }
249 else
250 {
251 ringptr[iring] = 0.0;
252 }
253 iring++;
254 if (iring == fltp->window)
255 iring = 0;
256 /* separate
257 *fptr = run_score/fltp->window;
258 */
259 *fptr = run_score;
260 fptr++;
261 }
262 MemFree (ringptr);
263 return fhead;
264 }
265
FilterSeqPort(SeqPortPtr spp,Int4 start,Int4 end,FilterDatPtr fltp)266 extern FloatHiPtr FilterSeqPort (SeqPortPtr spp,
267 Int4 start, Int4 end,
268 FilterDatPtr fltp)
269 {
270 CharPtr seq, seqhead;
271 Int4 i, loop, this_start, this_end;
272 FloatHiPtr fltscr, fltmp, flthead = NULL;
273
274 if (spp == NULL || fltp == NULL)
275 return NULL;
276
277 if (end-start+1 > MAXSEQCHUNK)
278 {
279 this_start = start;
280 this_end = this_start + MAXSEQCHUNK;
281 }
282 else
283 {
284 this_start = start;
285 this_end = end;
286 }
287
288 loop = 0;
289 while (this_end <= end)
290 {
291 seq = seqhead = (CharPtr) MemNew ((size_t) (sizeof (Char) *
292 (this_end-this_start+1)));
293 if (seq == NULL)
294 return NULL;
295
296 SeqPortSeek (spp, this_start, SEEK_SET);
297 for (i = this_start; i <= this_end; i++)
298 *seq++ = (Char) SeqPortGetResidue (spp);
299 *seq = '\0';
300 fltscr = urkFilterSeq (seqhead, this_start, this_end, fltp);
301 MemFree (seqhead);
302 if (this_start == start && this_end == end)
303 {
304 flthead = fltscr;
305 break;
306 }
307 if (this_end == end)
308 {
309 flthead = Realloc (flthead,
310 (size_t) ((loop * MAXSEQCHUNK) +
311 (sizeof (FloatHi) * (this_end-this_start+1))));
312 i = fltp->window/2;
313 if (!(fltp->window/2))
314 i++;
315 fltmp = flthead + (loop * (MAXSEQCHUNK-i));
316 fltscr += ((fltp->window)/2);
317 for (i = this_start; i <= this_end; i++)
318 *fltmp++ = *fltscr++;
319 break;
320 }
321 if (this_start == start)
322 {
323 flthead = fltscr;
324 }
325 else
326 {
327 flthead = Realloc (flthead, (size_t) ((loop+1)*(MAXSEQCHUNK)));
328 i = fltp->window/2;
329 if (!(fltp->window/2))
330 i++;
331 fltmp = flthead + (loop * (MAXSEQCHUNK-i));
332 for (i = this_start; i <= this_end; i++)
333 *fltmp++ = *fltscr++;
334 }
335 this_start = this_end+1-fltp->window;
336 this_end = this_start + MAXSEQCHUNK;
337 if (this_end > end)
338 this_end = end;
339 loop++;
340 }
341 return flthead;
342 }
343
FilterBioseq(BioseqPtr bsp,Int4 start,Int4 end,FilterDatPtr fltp)344 extern FloatHiPtr FilterBioseq (BioseqPtr bsp,
345 Int4 start, Int4 end,
346 FilterDatPtr fltp)
347 {
348 SeqPortPtr spp;
349 FloatHiPtr fltscr;
350
351 if (bsp == NULL || fltp == NULL)
352 return NULL;
353
354 if (fltp->filtertype >= AA_FILTER_COMP && !ISA_aa (bsp->mol))
355 return NULL;
356
357 if (fltp->filtertype < AA_FILTER_COMP && !ISA_na (bsp->mol))
358 return NULL;
359
360 if (ISA_aa (bsp->mol))
361 spp = SeqPortNew (bsp, 0, bsp->length-1, 0, Seq_code_iupacaa);
362 else
363 spp = SeqPortNew (bsp, 0, bsp->length-1, 0, Seq_code_iupacna);
364 fltscr = FilterSeqPort (spp, start, end, fltp);
365 SeqPortFree (spp);
366 return fltscr;
367 }
368
FilterSeqLoc(SeqLocPtr slp,FilterDatPtr fltp)369 extern FloatHiPtr FilterSeqLoc (SeqLocPtr slp,
370 FilterDatPtr fltp)
371 {
372 BioseqPtr bsp;
373 Int4 start, end;
374 FloatHiPtr fltscr;
375
376 if (slp == NULL || fltp == NULL)
377 return NULL;
378
379 if (slp->choice != SEQLOC_INT)
380 return NULL;
381
382 if ((bsp = BioseqLockById (SeqLocId (slp))) == NULL)
383 return NULL;
384
385 start = SeqLocStart (slp);
386 end = SeqLocStop (slp);
387 fltscr = FilterBioseq (bsp, start, end, fltp);
388 BioseqUnlock (bsp);
389 return fltscr;
390 }
391
FilterFilter(FloatHiPtr score,FloatHi maxscore,Int4 window,FloatHi percentcut,Int4 length,SeqIdPtr sip,Boolean flagMeld,Boolean flagHighPass)392 extern SeqLocPtr FilterFilter (FloatHiPtr score, FloatHi maxscore,
393 Int4 window, FloatHi percentcut,
394 Int4 length, SeqIdPtr sip,
395 Boolean flagMeld, Boolean flagHighPass)
396 {
397 Int4 i;
398 Int4 start, stop, left, right;
399 SeqLocPtr slp, slpt, slph = NULL;
400 SeqIntPtr sint, sin1, sin2;
401 FloatHiPtr scr;
402
403 if (score == NULL)
404 return NULL;
405
406 scr = score;
407 for (i = 0; i < length; i++)
408 {
409 *scr = *scr / maxscore * 100;
410 scr++;
411 }
412
413 if (flagHighPass)
414 {
415 for (i = 0; i < length; i++)
416 {
417 if (*score < percentcut)
418 break;
419 score++;
420 }
421 if (i == length)
422 return NULL;
423 /* just continue */
424 while (i < length)
425 {
426 if (*score < percentcut)
427 {
428 start = i;
429 while (*score < percentcut && i < length)
430 {
431 score++;
432 i++;
433 }
434 stop = i - 1;
435 slp = SeqLocIntNew (start, stop, Seq_strand_unknown, sip);
436 ValNodeLink (&slph, slp);
437 }
438 else
439 {
440 score++;
441 i++;
442 }
443 }
444 }
445 else
446 {
447 for (i = 0; i < length; i++)
448 {
449 if (*score >= percentcut)
450 break;
451 score++;
452 }
453 if (i == length)
454 return NULL;
455 /* just continue */
456 while (i < length)
457 {
458 if (*score >= percentcut)
459 {
460 start = i;
461 while (*score >= percentcut && i < length)
462 {
463 score++;
464 i++;
465 }
466 stop = i - 1;
467 slp = SeqLocIntNew (start, stop, Seq_strand_unknown, sip);
468 ValNodeLink (&slph, slp);
469 }
470 else
471 {
472 score++;
473 i++;
474 }
475 }
476 }
477 if (window != 0)
478 {
479 left = right = window / 2;
480 if (window%2 == 0)
481 left--;
482 slp = slph;
483 while (slp != NULL)
484 {
485 sint = slp->data.ptrvalue;
486 start = sint->from;
487 stop = sint->to;
488 start -= left;
489 if (start < 0)
490 start = 0;
491 stop += right;
492 if (stop > length-1)
493 stop = length - 1;
494 sint->from = start;
495 sint->to = stop;
496 slp = slp->next;
497 }
498 }
499 if (flagMeld)
500 {
501 slp = slph;
502 while (slp != NULL)
503 {
504 sin1 = slp->data.ptrvalue;
505 while (slp->next != NULL)
506 {
507 slpt = slp->next;
508 sin2 = slpt->data.ptrvalue;
509 if (sin2->from > sin1->to)
510 break;
511 sin1->to = sin2->to;
512 slp->next = slpt->next;
513 slpt->next = NULL;
514 SeqLocFree (slpt);
515 }
516 slp = slp->next;
517 }
518 }
519 return slph;
520 }
521