1 static char const rcsid[] = "$Id: urkpcc.c,v 6.13 2003/05/30 17:25:38 coulouri Exp $";
2 
3 /*
4 * ===========================================================================
5 *
6 *                            PUBLIC DOMAIN NOTICE
7 *               National Center for Biotechnology Information
8 *
9 *  This software/database is a "United States Government Work" under the
10 *  terms of the United States Copyright Act.  It was written as part of
11 *  the author's official duties as a United States Government employee and
12 *  thus cannot be copyrighted.  This software/database is freely available
13 *  to the public for use. The National Library of Medicine and the U.S.
14 *  Government have not placed any restriction on its use or reproduction.
15 *
16 *  Although all reasonable efforts have been taken to ensure the accuracy
17 *  and reliability of the software and data, the NLM and the U.S.
18 *  Government do not and cannot warrant the performance or results that
19 *  may be obtained by using this software or data. The NLM and the U.S.
20 *  Government disclaim all warranties, express or implied, including
21 *  warranties of performance, merchantability or fitness for any particular
22 *  purpose.
23 *
24 *  Please cite the author in any work or product based on this material.
25 *
26 * ===========================================================================
27 *
28 * File Name: urkpcc.c
29 *
30 * Author(s): John Kuzio
31 *
32 * Version Creation Date: 98-01-01
33 *
34 * $Revision: 6.13 $
35 *
36 * File Description: coiled-coils
37 *
38 * Modifications:
39 * --------------------------------------------------------------------------
40 * Date       Name        Description of modification
41 * --------------------------------------------------------------------------
42 * $Log: urkpcc.c,v $
43 * Revision 6.13  2003/05/30 17:25:38  coulouri
44 * add rcsid
45 *
46 * Revision 6.12  1998/11/16 14:29:49  kuzio
47 * flagBoundaryCondition
48 *
49 * Revision 6.11  1998/09/16 18:03:35  kuzio
50 * cvs logging
51 *
52 *
53 * ==========================================================================
54 */
55 
56 #include <urkpcc.h>
57 
ReadPccData(PccDatPtr pccp)58 extern Int4 ReadPccData (PccDatPtr pccp)
59 {
60   FILE    *fin;
61   Int4    i, n, val, count;
62   FloatHi score;
63   Char    buf[PATH_MAX], buff[PATH_MAX], tbuf[4];
64   CharPtr bptr;
65 
66   if (pccp == NULL)
67     return 0;
68 
69   if (pccp->pccdatafile == NULL || pccp->scr == NULL)
70     return 0;
71 
72   if (pccp->window < 7 || pccp->window > 42)
73     pccp->window = 22;
74 
75   if (FindPath ("ncbi", "ncbi", "data", buf, sizeof (buf)))
76     FileBuildPath (buf, NULL, pccp->pccdatafile);
77   else
78     StrCpy (buf, pccp->pccdatafile);
79 
80   if ((fin = FileOpen (buf, "r")) == NULL)
81   {
82     if ((fin = FileOpen (pccp->pccdatafile, "r")) == NULL)
83     {
84       return 0;
85     }
86   }
87 
88 /* no data available for selenocysteine U -- 24 characters */
89   pccp->res = StringSave ("ABCDEFGHIKLMNPQRSTVUWXYZ* ");
90   pccp->res[0] = '\0';
91 
92   val = 0;
93   while ((FileGets (buff, sizeof (buff), fin)) != NULL)
94   {
95     if ((bptr = StrChr (buff, ';')) != NULL)
96       *bptr = '\0';
97     if (StrLen (buff) < 16)
98       continue;
99     bptr = buff;
100     while (IS_WHITESP (*bptr))
101       bptr++;
102     sprintf (tbuf, "%c", *bptr);
103     StrCat (pccp->res, tbuf);
104     for (n = 0; n < 7; n++)
105     {
106       while (!IS_WHITESP (*bptr))
107         bptr++;
108       while (IS_WHITESP (*bptr))
109         bptr++;
110       sscanf (bptr, "%lf", &score);
111       pccp->scr[val][n] = score;
112     }
113     val++;
114     if (val == 24)
115       break;
116   }
117   FileClose (fin);
118   count = val;
119 
120   for (n = 7; n < pccp->window; n++)
121   {
122     if (n == 42)
123       break;
124     val = n % 7;
125     for (i = 0; i < count; i++)
126       pccp->scr[i][n] = pccp->scr[i][val];
127   }
128   return count;
129 }
130 
PccDatNew(void)131 extern PccDatPtr PccDatNew (void)
132 {
133   PccDatPtr pccp;
134   Int4      i;
135 
136   if ((pccp = (PccDatPtr) MemNew (sizeof (PccDat))) != NULL)
137   {
138     pccp->meang  = 0.77;
139     pccp->stdg   = 0.20;
140     pccp->meanc  = 1.63;
141     pccp->stdc   = 0.24;
142     pccp->window = 22;
143     pccp->linker = 32;
144 
145     pccp->pccdatafile = StringSave ("KSpcc.mat");
146 
147     if ((pccp->scr = (Pointer) MemNew ((size_t)(sizeof(Pointer)*24)))
148          != NULL)
149     {
150       for (i = 0; i < 24; i++)
151       {
152         pccp->scr[i] = (FloatHiPtr)
153                         MemNew ((size_t)(sizeof(FloatHi)*42));
154       }
155     }
156     else
157     {
158       pccp = (PccDatPtr) MemFree (pccp);
159     }
160   }
161   return pccp;
162 }
163 
PccDatFree(PccDatPtr pccp)164 extern PccDatPtr PccDatFree (PccDatPtr pccp)
165 {
166   Int4      i;
167 
168   if (pccp != NULL)
169   {
170     MemFree (pccp->pccdatafile);
171     MemFree (pccp->res);
172     for (i = 0; i < 24; i++)
173       MemFree (pccp->scr[i]);
174     MemFree (pccp->scr);
175     pccp = (PccDatPtr) MemFree (pccp);
176   }
177   return pccp;
178 }
179 
ProbPredictCC(FloatHi score,PccDatPtr pccp,FloatHi sqrt2pi)180 static FloatHi ProbPredictCC (FloatHi score, PccDatPtr pccp,
181                               FloatHi sqrt2pi)
182 {
183   FloatHi   constg, constc;
184   FloatHi   evalg, evalc;
185   FloatHi   Gg, Gc;
186   FloatHi   prob;
187 
188   constg = 1.0 / (pccp->stdg * sqrt2pi);
189   constc = 1.0 / (pccp->stdc * sqrt2pi);
190 
191   evalg = (((score - pccp->meang) / pccp->stdg) *
192            ((score - pccp->meang) / pccp->stdg)) / -2.0;
193   evalc = (((score - pccp->meanc) / pccp->stdc) *
194            ((score - pccp->meanc) / pccp->stdc)) / -2.0;
195 
196   evalg = exp (evalg);
197   evalc = exp (evalc);
198   Gg = constg * evalg;
199   Gc = constc * evalc;
200   prob = Gc / ((30 * Gg) + Gc);
201 
202   return prob;
203 }
204 
nroot(FloatHi fv,Int4 wv)205 static FloatHi nroot (FloatHi fv, Int4 wv)
206 {
207   FloatHi  fi = 1.0;
208 
209   fi = fi / (FloatHi) wv;
210   fv = pow (fv, fi);
211 
212   return fv;
213 }
214 
PredictCC(CharPtr seq,Int4 start,Int4 end,PccDatPtr pccp)215 extern FloatHiPtr PredictCC (CharPtr seq, Int4 start, Int4 end,
216                              PccDatPtr pccp)
217 {
218   Int4        i, n, col;
219   Int4        stop;
220   CharPtr     seqhead;
221   FloatHiPtr  fptrhead, fptrtemp, fptr;
222   FloatHi     sqrt2pi, temp;
223   Boolean     flagMatch;
224 
225   if (seq == NULL || pccp == NULL)
226     return NULL;
227 
228   if ((fptr = (FloatHiPtr) MemNew ((size_t) (sizeof (FloatHi) *
229                                    (end-start+1+pccp->window)))) == NULL)
230   {
231     return fptr;
232   }
233 
234   sqrt2pi = nroot ((2.0*PI), 2);
235 
236   fptrhead = fptrtemp = fptr;
237   seqhead = seq;
238   stop = end - pccp->window;
239   for (i = start; i < stop; i++)
240   {
241     seq = seqhead;
242     temp = 1.0;
243     for (col = 0; col < pccp->window; col++)
244     {
245       flagMatch = FALSE;
246       n = 0;
247       while (pccp->res[n] != '\0')
248       {
249         if (*seq == pccp->res[n])
250         {
251           flagMatch = TRUE;
252           break;
253         }
254         n++;
255       }
256       if (flagMatch)
257         temp *= pccp->scr[n][col];
258       seq++;
259     }
260     temp = nroot (temp, pccp->window);
261     temp = ProbPredictCC (temp, pccp, sqrt2pi);
262     if (temp < 0.0)
263       temp = 0.0;
264     fptr = fptrtemp;
265     for (n = 0; n < pccp->window; n++)
266     {
267       if (temp > *fptr)
268         *fptr = temp;
269       fptr++;
270     }
271     fptrtemp++;
272     seqhead++;
273   }
274   return fptrhead;
275 }
276 
277 /*
278  seqport should be opened full length (0 to bsp->length-1)
279  start and end reflect where you want to search
280 */
281 
PredictCCSeqPort(SeqPortPtr spp,Int4 start,Int4 end,PccDatPtr pccp)282 extern FloatHiPtr PredictCCSeqPort (SeqPortPtr spp,
283                                     Int4 start, Int4 end,
284                                     PccDatPtr pccp)
285 {
286   CharPtr    seq, seqhead;
287   Int4       i;
288   FloatHiPtr pccprob;
289 
290   if (spp == NULL || pccp == NULL)
291     return NULL;
292 
293   seq = seqhead = (CharPtr) MemNew ((size_t) (sizeof (Char) *
294                                     (end-start+1)));
295   if (seq == NULL)
296     return NULL;
297 
298   SeqPortSeek (spp, start, SEEK_SET);
299   for (i = start; i <= end; i++)
300   {
301     *seq = SeqPortGetResidue (spp);
302     seq++;
303   }
304   pccprob = PredictCC (seqhead, start, end, pccp);
305   MemFree (seqhead);
306   return pccprob;
307 }
308 
PredictCCBioseq(BioseqPtr bsp,Int4 start,Int4 end,PccDatPtr pccp)309 extern FloatHiPtr PredictCCBioseq (BioseqPtr bsp,
310                                    Int4 start, Int4 end,
311                                    PccDatPtr pccp)
312 {
313   SeqPortPtr spp;
314   FloatHiPtr pccprob;
315 
316   if (bsp == NULL || pccp == NULL)
317     return NULL;
318 
319   if (!ISA_aa (bsp->mol))
320     return NULL;
321 
322   spp = SeqPortNew (bsp, 0, bsp->length-1, 0, Seq_code_iupacaa);
323   pccprob = PredictCCSeqPort (spp, start, end, pccp);
324   SeqPortFree (spp);
325   return pccprob;
326 }
327 
PredictCCSeqLoc(SeqLocPtr slp,PccDatPtr pccp)328 extern FloatHiPtr PredictCCSeqLoc (SeqLocPtr slp,
329                                    PccDatPtr pccp)
330 {
331   BioseqPtr  bsp;
332   Int4       start, end;
333   FloatHiPtr pccprob;
334 
335   if (slp == NULL || pccp == NULL)
336     return NULL;
337 
338   if (slp->choice != SEQLOC_INT)
339     return NULL;
340 
341   if ((bsp = BioseqLockById (SeqLocId (slp))) == NULL)
342     return NULL;
343 
344   if (!ISA_aa (bsp->mol))
345   {
346     BioseqUnlock (bsp);
347     return NULL;
348   }
349 
350   start = SeqLocStart (slp);
351   end = SeqLocStop (slp);
352   pccprob = PredictCCBioseq (bsp, start, end, pccp);
353   BioseqUnlock (bsp);
354   return pccprob;
355 }
356 
FilterCC(FloatHiPtr score,FloatHi percentcut,Int4 length,Int4 linker,SeqIdPtr sip,Boolean flagBoundaryCondition)357 extern SeqLocPtr FilterCC (FloatHiPtr score, FloatHi percentcut,
358                            Int4 length, Int4 linker, SeqIdPtr sip,
359                            Boolean flagBoundaryCondition)
360 {
361   Int4       i;
362   Int4       start, stop;
363   FloatHi    lopr, hipr;
364   SeqLocPtr  nextslp, slp, slph = NULL;
365   SeqIntPtr  sint;
366 
367   if (score == NULL)
368     return NULL;
369 
370   percentcut /= 100.0;
371   lopr = percentcut * 0.80;
372   hipr = percentcut + (percentcut - lopr);
373 
374   if (flagBoundaryCondition)
375   {
376     for (i = 0; i < length; i++)
377     {
378       if (*score >= lopr && *score <= hipr)
379         break;
380       score++;
381     }
382   }
383   else
384   {
385     for (i = 0; i < length; i++)
386     {
387       if (*score > percentcut)
388         break;
389       score++;
390     }
391   }
392   if (i == length)
393     return NULL;
394 
395   if (flagBoundaryCondition)
396   {
397     while (i < length)
398     {
399       if (*score >= lopr && *score <= hipr)
400       {
401         start = i;
402         while (*score >= lopr && *score <= hipr && i < length)
403         {
404           score++;
405           i++;
406         }
407         stop = i - 1;
408         slp = SeqLocIntNew (start, stop, Seq_strand_unknown, sip);
409         ValNodeLink (&slph, slp);
410       }
411       else
412       {
413         score++;
414         i++;
415       }
416     }
417   }
418   else
419   {
420     while (i < length)
421     {
422       if (*score > percentcut)
423       {
424         start = i;
425         while (*score > percentcut && i < length)
426         {
427           score++;
428           i++;
429         }
430         stop = i - 1;
431         slp = SeqLocIntNew (start, stop, Seq_strand_unknown, sip);
432         ValNodeLink (&slph, slp);
433       }
434       else
435       {
436         score++;
437         i++;
438       }
439     }
440   }
441 
442   if (linker > 0)
443   {
444     slp = slph;
445     while (slp != NULL)
446     {
447       if (slp->next != NULL)
448       {
449         nextslp = slp->next;
450         stop = SeqLocStop (slp);
451         start = SeqLocStart (nextslp);
452         if (start-stop-1 <= linker)
453         {
454           sint = slp->data.ptrvalue;
455           sint->to = SeqLocStop (nextslp);
456           slp->next = nextslp->next;
457           nextslp->next = NULL;
458           SeqLocFree (nextslp);
459           continue;
460         }
461       }
462       slp = slp->next;
463     }
464   }
465 
466   return slph;
467 }
468 
FilterCCVS(FloatHiPtr score,FloatHi percentcut,Int4 length,Int4 linker,SeqIdPtr sip,Boolean flagBoundaryCondition)469 extern SeqLocPtr FilterCCVS (FloatHiPtr score, FloatHi percentcut,
470                              Int4 length, Int4 linker, SeqIdPtr sip,
471                              Boolean flagBoundaryCondition)
472 
473 {
474   Int4       i;
475   Int4       start, stop;
476   FloatHi    lopr, hipr;
477   SeqLocPtr  nextslp, slp, slph = NULL;
478   SeqIntPtr  sint;
479   Int4       cccount;
480 
481   if (score == NULL)
482     return NULL;
483 
484   percentcut /= 100.0;
485   lopr = percentcut * 0.80;
486   hipr = percentcut + (percentcut - lopr);
487 
488   if (flagBoundaryCondition)
489   {
490     for (i = 0; i < length; i++)
491     {
492       if (*score >= lopr && *score <= hipr)
493         break;
494       score++;
495     }
496   }
497   else
498   {
499     for (i = 0; i < length; i++)
500     {
501       if (*score > percentcut)
502         break;
503       score++;
504     }
505   }
506   if (i == length)
507     return NULL;
508 
509   if (flagBoundaryCondition)
510   {
511     while (i < length)
512     {
513       if (*score >= lopr && *score <= hipr)
514       {
515         start = i;
516         while (*score >= lopr && *score <= hipr && i < length)
517         {
518           score++;
519           i++;
520         }
521         stop = i - 1;
522         slp = SeqLocIntNew (start, stop, Seq_strand_unknown, sip);
523         ValNodeLink (&slph, slp);
524       }
525       else
526       {
527         score++;
528         i++;
529       }
530     }
531   }
532   else
533   {
534     while (i < length)
535     {
536       if (*score > percentcut)
537       {
538         start = i;
539         while (*score > percentcut && i < length)
540         {
541           score++;
542           i++;
543         }
544         stop = i - 1;
545         slp = SeqLocIntNew (start, stop, Seq_strand_unknown, sip);
546         ValNodeLink (&slph, slp);
547       }
548       else
549       {
550         score++;
551         i++;
552       }
553     }
554   }
555 
556   if (linker > 0)
557   {
558     cccount = 0;
559     slp = slph;
560     while (slp != NULL)
561     {
562       cccount++;
563       slp = slp->next;
564     }
565     linker *= (cccount-1);
566     slp = slph;
567     while (slp != NULL)
568     {
569       if (slp->next != NULL)
570       {
571         nextslp = slp->next;
572         stop = SeqLocStop (slp);
573         start = SeqLocStart (nextslp);
574         if (start-stop-1 <= linker)
575         {
576           sint = slp->data.ptrvalue;
577           sint->to = SeqLocStop (nextslp);
578           slp->next = nextslp->next;
579           nextslp->next = NULL;
580           SeqLocFree (nextslp);
581           continue;
582         }
583       }
584       slp = slp->next;
585     }
586   }
587 
588   return slph;
589 }
590