1 static char const rcsid[] = "$Id: urkfltr.c,v 6.8 2003/05/30 17:25:38 coulouri Exp $";
2 
3 /*
4 * ===========================================================================
5 *
6 *                            PUBLIC DOMAIN NOTICE
7 *               National Center for Biotechnology Information
8 *
9 *  This software/database is a "United States Government Work" under the
10 *  terms of the United States Copyright Act.  It was written as part of
11 *  the author's official duties as a United States Government employee and
12 *  thus cannot be copyrighted.  This software/database is freely available
13 *  to the public for use. The National Library of Medicine and the U.S.
14 *  Government have not placed any restriction on its use or reproduction.
15 *
16 *  Although all reasonable efforts have been taken to ensure the accuracy
17 *  and reliability of the software and data, the NLM and the U.S.
18 *  Government do not and cannot warrant the performance or results that
19 *  may be obtained by using this software or data. The NLM and the U.S.
20 *  Government disclaim all warranties, express or implied, including
21 *  warranties of performance, merchantability or fitness for any particular
22 *  purpose.
23 *
24 *  Please cite the author in any work or product based on this material.
25 *
26 * ===========================================================================
27 *
28 * File Name: urkfltr.c
29 *
30 * Author(s): John Kuzio
31 *
32 * Version Creation Date: 98-01-01
33 *
34 * $Revision: 6.8 $
35 *
36 * File Description: filter
37 *
38 * Modifications:
39 * --------------------------------------------------------------------------
40 * Date       Name        Description of modification
41 * --------------------------------------------------------------------------
42 * $Log: urkfltr.c,v $
43 * Revision 6.8  2003/05/30 17:25:38  coulouri
44 * add rcsid
45 *
46 * Revision 6.7  1998/12/23 21:34:04  kuzio
47 * SeqPortStuff
48 *
49 * Revision 6.6  1998/09/16 18:03:34  kuzio
50 * cvs logging
51 *
52 *
53 * ==========================================================================
54 */
55 
56 #include <urkfltr.h>
57 
ReadFilterData(FilterDatPtr fltp)58 extern Int4 ReadFilterData (FilterDatPtr fltp)
59 {
60   FILE    *fin;
61   Int2    i, val;
62   Int4    type;
63   FloatHi score, temp;
64   Char    buf[PATH_MAX], buff[PATH_MAX];
65 
66   if (fltp == NULL)
67     return 0;
68 
69   if (fltp->filterdatafile == NULL || fltp->scr == NULL)
70     return 0;
71 
72   if (FindPath ("ncbi", "ncbi", "data", buf, sizeof (buf)))
73     FileBuildPath (buf, NULL, fltp->filterdatafile);
74   else
75     StrCpy(buf, fltp->filterdatafile);
76 
77   if ((fin = FileOpen (buf, "r")) == NULL)
78   {
79     if ((fin = FileOpen (fltp->filterdatafile, "r")) == NULL)
80     {
81       return 0;
82     }
83   }
84 
85   val = 0;
86   type = -1;
87   while (FileGets (buff, sizeof (buff), fin) != NULL)
88   {
89     if (buff[0] == ';')
90       continue;
91     if (type == -1)
92     {
93       sscanf (buff, "%d", &type);
94       if (type != 1)
95       {
96         FileClose (fin);
97         return 0;
98       }
99       if (FileGets (buff, sizeof (buff), fin) == NULL)
100       {
101         FileClose (fin);
102         return 0;
103       }
104       sscanf (buff, "%d", &type);
105       fltp->window = type;
106       continue;
107     }
108     sscanf (buff, "%lf8.3", &score);
109     fltp->scr[val] = score;
110     val++;
111   }
112   FileClose (fin);
113   fltp->maxscr = 0.0;
114   for (i = 0; i < val; i++)
115   {
116     temp = fltp->scr[i];
117     if (temp < 0.0)
118       temp *= -1.0;
119     if (temp > fltp->maxscr)
120       fltp->maxscr = temp;
121   }
122   fltp->maxscr *= (fltp->window);
123   return val;
124 }
125 
FilterDatNew(Int4 filtertype,Int4 window)126 extern FilterDatPtr FilterDatNew (Int4 filtertype, Int4 window)
127 {
128   FilterDatPtr fltp;
129 
130   if ((fltp = (FilterDatPtr) MemNew (sizeof (FilterDat))) != NULL)
131   {
132     fltp->window = window;
133     fltp->filtertype = filtertype;
134     if (fltp->filtertype >= AA_FILTER_COMP)
135       fltp->filtersize = 24;
136     else
137       fltp->filtersize = 15;
138     fltp->filterdatafile = NULL;
139     fltp->maxscr = 0.0;
140 
141     if (fltp->filtertype >= AA_FILTER_COMP)
142 /* no data available for selenocysteine U -- 24 characters */
143       fltp->res = StringSave ("ABCDEFGHIKLMNPQRSTVWXYZ*");
144     else
145 /* standard -- 15 characters */
146       fltp->res = StringSave ("ABCDGHKMNRSTUWY");
147 
148     fltp->scr =
149      (FloatHiPtr) MemNew ((size_t)(sizeof(FloatHi)*fltp->filtersize));
150     if (fltp->scr != NULL)
151     {
152       MemSet ((Pointer) fltp->scr, 0,
153               (size_t) (sizeof (FloatHi) * fltp->filtersize));
154     }
155     else
156     {
157       fltp = (FilterDatPtr) MemFree (fltp);
158     }
159   }
160   return fltp;
161 }
162 
FilterDatFree(FilterDatPtr fltp)163 extern FilterDatPtr FilterDatFree (FilterDatPtr fltp)
164 {
165   if (fltp != NULL)
166   {
167     MemFree (fltp->filterdatafile);
168     MemFree (fltp->res);
169     MemFree (fltp->scr);
170     fltp = (FilterDatPtr) MemFree (fltp);
171   }
172   return fltp;
173 }
174 
urkFilterSeq(CharPtr seq,Int4 start,Int4 end,FilterDatPtr fltp)175 extern FloatHiPtr urkFilterSeq (CharPtr seq, Int4 start, Int4 end,
176                                 FilterDatPtr fltp)
177 {
178   FloatHiPtr   ringptr;
179   FloatHiPtr   fhead, fptr;
180   Int4         i, n, iring, length;
181   FloatHi      run_score;
182   Boolean      flagMatch;
183 
184   length = end - start + 1;
185   if (length < fltp->window)
186     return NULL;
187 
188   if ((ringptr = (FloatHiPtr)
189       MemNew ((size_t) (sizeof (FloatHi) * fltp->window))) == NULL)
190     return NULL;
191 
192   if ((fhead = fptr = (FloatHiPtr)
193       MemNew ((size_t) (sizeof (FloatHi) * length))) == NULL)
194   {
195     MemFree (ringptr);
196     return NULL;
197   }
198 
199   MemSet ((Pointer) fptr, 0, (size_t) (sizeof (FloatHi) * length));
200 
201   iring = 0;
202   run_score = 0.0;
203   for (i = 0; i < (fltp->window)-1; i++)
204   {
205     flagMatch = FALSE;
206     n = 0;
207     while (fltp->res[n] != '\0')
208     {
209       if (seq[i] == fltp->res[n])
210       {
211         flagMatch = TRUE;
212         break;
213       }
214       n++;
215     }
216     if (flagMatch)
217     {
218       run_score += fltp->scr[n];
219       ringptr[iring] = fltp->scr[n];
220     }
221     else
222     {
223       ringptr[iring] = 0.0;
224     }
225     iring++;
226   }
227   ringptr[iring] = 0.0; /* dummy last value in setup */
228 
229   fptr += (fltp->window/2);
230   for (i = fltp->window; i < length; i++)
231   {
232     run_score = run_score - ringptr[iring];
233     flagMatch = FALSE;
234     n = 0;
235     while (fltp->res[n] != '\0')
236     {
237       if (seq[i] == fltp->res[n])
238       {
239         flagMatch = TRUE;
240         break;
241       }
242       n++;
243     }
244     if (flagMatch)
245     {
246       run_score += fltp->scr[n];
247       ringptr[iring] = fltp->scr[n];
248     }
249     else
250     {
251       ringptr[iring] = 0.0;
252     }
253     iring++;
254     if (iring == fltp->window)
255       iring = 0;
256 /* separate
257     *fptr = run_score/fltp->window;
258 */
259     *fptr = run_score;
260     fptr++;
261   }
262   MemFree (ringptr);
263   return fhead;
264 }
265 
FilterSeqPort(SeqPortPtr spp,Int4 start,Int4 end,FilterDatPtr fltp)266 extern FloatHiPtr FilterSeqPort (SeqPortPtr spp,
267                                  Int4 start, Int4 end,
268                                  FilterDatPtr fltp)
269 {
270   CharPtr    seq, seqhead;
271   Int4       i, loop, this_start, this_end;
272   FloatHiPtr fltscr, fltmp, flthead = NULL;
273 
274   if (spp == NULL || fltp == NULL)
275     return NULL;
276 
277   if (end-start+1 > MAXSEQCHUNK)
278   {
279     this_start = start;
280     this_end = this_start + MAXSEQCHUNK;
281   }
282   else
283   {
284     this_start = start;
285     this_end = end;
286   }
287 
288   loop = 0;
289   while (this_end <= end)
290   {
291     seq = seqhead = (CharPtr) MemNew ((size_t) (sizeof (Char) *
292                                       (this_end-this_start+1)));
293     if (seq == NULL)
294       return NULL;
295 
296     SeqPortSeek (spp, this_start, SEEK_SET);
297     for (i = this_start; i <= this_end; i++)
298       *seq++ = (Char) SeqPortGetResidue (spp);
299     *seq = '\0';
300     fltscr = urkFilterSeq (seqhead, this_start, this_end, fltp);
301     MemFree (seqhead);
302     if (this_start == start && this_end == end)
303     {
304       flthead = fltscr;
305       break;
306     }
307     if (this_end == end)
308     {
309       flthead = Realloc (flthead,
310                 (size_t) ((loop * MAXSEQCHUNK) +
311                  (sizeof (FloatHi) * (this_end-this_start+1))));
312       i = fltp->window/2;
313       if (!(fltp->window/2))
314         i++;
315       fltmp = flthead + (loop * (MAXSEQCHUNK-i));
316       fltscr += ((fltp->window)/2);
317       for (i = this_start; i <= this_end; i++)
318         *fltmp++ = *fltscr++;
319       break;
320     }
321     if (this_start == start)
322     {
323       flthead = fltscr;
324     }
325     else
326     {
327       flthead = Realloc (flthead, (size_t) ((loop+1)*(MAXSEQCHUNK)));
328       i = fltp->window/2;
329       if (!(fltp->window/2))
330         i++;
331       fltmp = flthead + (loop * (MAXSEQCHUNK-i));
332       for (i = this_start; i <= this_end; i++)
333         *fltmp++ = *fltscr++;
334     }
335     this_start = this_end+1-fltp->window;
336     this_end = this_start + MAXSEQCHUNK;
337     if (this_end > end)
338       this_end = end;
339     loop++;
340   }
341   return flthead;
342 }
343 
FilterBioseq(BioseqPtr bsp,Int4 start,Int4 end,FilterDatPtr fltp)344 extern FloatHiPtr FilterBioseq (BioseqPtr bsp,
345                                 Int4 start, Int4 end,
346                                 FilterDatPtr fltp)
347 {
348   SeqPortPtr spp;
349   FloatHiPtr fltscr;
350 
351   if (bsp == NULL || fltp == NULL)
352     return NULL;
353 
354   if (fltp->filtertype >= AA_FILTER_COMP && !ISA_aa (bsp->mol))
355     return NULL;
356 
357   if (fltp->filtertype < AA_FILTER_COMP && !ISA_na (bsp->mol))
358     return NULL;
359 
360   if (ISA_aa (bsp->mol))
361     spp = SeqPortNew (bsp, 0, bsp->length-1, 0, Seq_code_iupacaa);
362   else
363     spp = SeqPortNew (bsp, 0, bsp->length-1, 0, Seq_code_iupacna);
364   fltscr = FilterSeqPort (spp, start, end, fltp);
365   SeqPortFree (spp);
366   return fltscr;
367 }
368 
FilterSeqLoc(SeqLocPtr slp,FilterDatPtr fltp)369 extern FloatHiPtr FilterSeqLoc (SeqLocPtr slp,
370                                 FilterDatPtr fltp)
371 {
372   BioseqPtr  bsp;
373   Int4       start, end;
374   FloatHiPtr fltscr;
375 
376   if (slp == NULL || fltp == NULL)
377     return NULL;
378 
379   if (slp->choice != SEQLOC_INT)
380     return NULL;
381 
382   if ((bsp = BioseqLockById (SeqLocId (slp))) == NULL)
383     return NULL;
384 
385   start = SeqLocStart (slp);
386   end = SeqLocStop (slp);
387   fltscr = FilterBioseq (bsp, start, end, fltp);
388   BioseqUnlock (bsp);
389   return fltscr;
390 }
391 
FilterFilter(FloatHiPtr score,FloatHi maxscore,Int4 window,FloatHi percentcut,Int4 length,SeqIdPtr sip,Boolean flagMeld,Boolean flagHighPass)392 extern SeqLocPtr FilterFilter (FloatHiPtr score, FloatHi maxscore,
393                                Int4 window, FloatHi percentcut,
394                                Int4 length, SeqIdPtr sip,
395                                Boolean flagMeld, Boolean flagHighPass)
396 {
397   Int4       i;
398   Int4       start, stop, left, right;
399   SeqLocPtr  slp, slpt, slph = NULL;
400   SeqIntPtr  sint, sin1, sin2;
401   FloatHiPtr scr;
402 
403   if (score == NULL)
404     return NULL;
405 
406   scr = score;
407   for (i = 0; i < length; i++)
408   {
409     *scr = *scr / maxscore * 100;
410     scr++;
411   }
412 
413   if (flagHighPass)
414   {
415     for (i = 0; i < length; i++)
416     {
417       if (*score < percentcut)
418         break;
419       score++;
420     }
421     if (i == length)
422       return NULL;
423 /* just continue */
424     while (i < length)
425     {
426       if (*score < percentcut)
427       {
428         start = i;
429         while (*score < percentcut && i < length)
430         {
431           score++;
432           i++;
433         }
434         stop = i - 1;
435         slp = SeqLocIntNew (start, stop, Seq_strand_unknown, sip);
436         ValNodeLink (&slph, slp);
437       }
438       else
439       {
440         score++;
441         i++;
442       }
443     }
444   }
445   else
446   {
447     for (i = 0; i < length; i++)
448     {
449       if (*score >= percentcut)
450         break;
451       score++;
452     }
453     if (i == length)
454       return NULL;
455 /* just continue */
456     while (i < length)
457     {
458       if (*score >= percentcut)
459       {
460         start = i;
461         while (*score >= percentcut && i < length)
462         {
463           score++;
464           i++;
465         }
466         stop = i - 1;
467         slp = SeqLocIntNew (start, stop, Seq_strand_unknown, sip);
468         ValNodeLink (&slph, slp);
469       }
470       else
471       {
472         score++;
473         i++;
474       }
475     }
476   }
477   if (window != 0)
478   {
479     left = right = window / 2;
480     if (window%2 == 0)
481       left--;
482     slp = slph;
483     while (slp != NULL)
484     {
485       sint = slp->data.ptrvalue;
486       start = sint->from;
487       stop = sint->to;
488       start -= left;
489       if (start < 0)
490         start = 0;
491       stop += right;
492       if (stop > length-1)
493         stop = length - 1;
494       sint->from = start;
495       sint->to = stop;
496       slp = slp->next;
497     }
498   }
499   if (flagMeld)
500   {
501     slp = slph;
502     while (slp != NULL)
503     {
504       sin1 = slp->data.ptrvalue;
505       while (slp->next != NULL)
506       {
507         slpt = slp->next;
508         sin2 = slpt->data.ptrvalue;
509         if (sin2->from > sin1->to)
510           break;
511         sin1->to = sin2->to;
512         slp->next = slpt->next;
513         slpt->next = NULL;
514         SeqLocFree (slpt);
515       }
516       slp = slp->next;
517     }
518   }
519   return slph;
520 }
521