1 static char const rcsid[] = "$Id: urkpcc.c,v 6.13 2003/05/30 17:25:38 coulouri Exp $";
2
3 /*
4 * ===========================================================================
5 *
6 * PUBLIC DOMAIN NOTICE
7 * National Center for Biotechnology Information
8 *
9 * This software/database is a "United States Government Work" under the
10 * terms of the United States Copyright Act. It was written as part of
11 * the author's official duties as a United States Government employee and
12 * thus cannot be copyrighted. This software/database is freely available
13 * to the public for use. The National Library of Medicine and the U.S.
14 * Government have not placed any restriction on its use or reproduction.
15 *
16 * Although all reasonable efforts have been taken to ensure the accuracy
17 * and reliability of the software and data, the NLM and the U.S.
18 * Government do not and cannot warrant the performance or results that
19 * may be obtained by using this software or data. The NLM and the U.S.
20 * Government disclaim all warranties, express or implied, including
21 * warranties of performance, merchantability or fitness for any particular
22 * purpose.
23 *
24 * Please cite the author in any work or product based on this material.
25 *
26 * ===========================================================================
27 *
28 * File Name: urkpcc.c
29 *
30 * Author(s): John Kuzio
31 *
32 * Version Creation Date: 98-01-01
33 *
34 * $Revision: 6.13 $
35 *
36 * File Description: coiled-coils
37 *
38 * Modifications:
39 * --------------------------------------------------------------------------
40 * Date Name Description of modification
41 * --------------------------------------------------------------------------
42 * $Log: urkpcc.c,v $
43 * Revision 6.13 2003/05/30 17:25:38 coulouri
44 * add rcsid
45 *
46 * Revision 6.12 1998/11/16 14:29:49 kuzio
47 * flagBoundaryCondition
48 *
49 * Revision 6.11 1998/09/16 18:03:35 kuzio
50 * cvs logging
51 *
52 *
53 * ==========================================================================
54 */
55
56 #include <urkpcc.h>
57
ReadPccData(PccDatPtr pccp)58 extern Int4 ReadPccData (PccDatPtr pccp)
59 {
60 FILE *fin;
61 Int4 i, n, val, count;
62 FloatHi score;
63 Char buf[PATH_MAX], buff[PATH_MAX], tbuf[4];
64 CharPtr bptr;
65
66 if (pccp == NULL)
67 return 0;
68
69 if (pccp->pccdatafile == NULL || pccp->scr == NULL)
70 return 0;
71
72 if (pccp->window < 7 || pccp->window > 42)
73 pccp->window = 22;
74
75 if (FindPath ("ncbi", "ncbi", "data", buf, sizeof (buf)))
76 FileBuildPath (buf, NULL, pccp->pccdatafile);
77 else
78 StrCpy (buf, pccp->pccdatafile);
79
80 if ((fin = FileOpen (buf, "r")) == NULL)
81 {
82 if ((fin = FileOpen (pccp->pccdatafile, "r")) == NULL)
83 {
84 return 0;
85 }
86 }
87
88 /* no data available for selenocysteine U -- 24 characters */
89 pccp->res = StringSave ("ABCDEFGHIKLMNPQRSTVUWXYZ* ");
90 pccp->res[0] = '\0';
91
92 val = 0;
93 while ((FileGets (buff, sizeof (buff), fin)) != NULL)
94 {
95 if ((bptr = StrChr (buff, ';')) != NULL)
96 *bptr = '\0';
97 if (StrLen (buff) < 16)
98 continue;
99 bptr = buff;
100 while (IS_WHITESP (*bptr))
101 bptr++;
102 sprintf (tbuf, "%c", *bptr);
103 StrCat (pccp->res, tbuf);
104 for (n = 0; n < 7; n++)
105 {
106 while (!IS_WHITESP (*bptr))
107 bptr++;
108 while (IS_WHITESP (*bptr))
109 bptr++;
110 sscanf (bptr, "%lf", &score);
111 pccp->scr[val][n] = score;
112 }
113 val++;
114 if (val == 24)
115 break;
116 }
117 FileClose (fin);
118 count = val;
119
120 for (n = 7; n < pccp->window; n++)
121 {
122 if (n == 42)
123 break;
124 val = n % 7;
125 for (i = 0; i < count; i++)
126 pccp->scr[i][n] = pccp->scr[i][val];
127 }
128 return count;
129 }
130
PccDatNew(void)131 extern PccDatPtr PccDatNew (void)
132 {
133 PccDatPtr pccp;
134 Int4 i;
135
136 if ((pccp = (PccDatPtr) MemNew (sizeof (PccDat))) != NULL)
137 {
138 pccp->meang = 0.77;
139 pccp->stdg = 0.20;
140 pccp->meanc = 1.63;
141 pccp->stdc = 0.24;
142 pccp->window = 22;
143 pccp->linker = 32;
144
145 pccp->pccdatafile = StringSave ("KSpcc.mat");
146
147 if ((pccp->scr = (Pointer) MemNew ((size_t)(sizeof(Pointer)*24)))
148 != NULL)
149 {
150 for (i = 0; i < 24; i++)
151 {
152 pccp->scr[i] = (FloatHiPtr)
153 MemNew ((size_t)(sizeof(FloatHi)*42));
154 }
155 }
156 else
157 {
158 pccp = (PccDatPtr) MemFree (pccp);
159 }
160 }
161 return pccp;
162 }
163
PccDatFree(PccDatPtr pccp)164 extern PccDatPtr PccDatFree (PccDatPtr pccp)
165 {
166 Int4 i;
167
168 if (pccp != NULL)
169 {
170 MemFree (pccp->pccdatafile);
171 MemFree (pccp->res);
172 for (i = 0; i < 24; i++)
173 MemFree (pccp->scr[i]);
174 MemFree (pccp->scr);
175 pccp = (PccDatPtr) MemFree (pccp);
176 }
177 return pccp;
178 }
179
ProbPredictCC(FloatHi score,PccDatPtr pccp,FloatHi sqrt2pi)180 static FloatHi ProbPredictCC (FloatHi score, PccDatPtr pccp,
181 FloatHi sqrt2pi)
182 {
183 FloatHi constg, constc;
184 FloatHi evalg, evalc;
185 FloatHi Gg, Gc;
186 FloatHi prob;
187
188 constg = 1.0 / (pccp->stdg * sqrt2pi);
189 constc = 1.0 / (pccp->stdc * sqrt2pi);
190
191 evalg = (((score - pccp->meang) / pccp->stdg) *
192 ((score - pccp->meang) / pccp->stdg)) / -2.0;
193 evalc = (((score - pccp->meanc) / pccp->stdc) *
194 ((score - pccp->meanc) / pccp->stdc)) / -2.0;
195
196 evalg = exp (evalg);
197 evalc = exp (evalc);
198 Gg = constg * evalg;
199 Gc = constc * evalc;
200 prob = Gc / ((30 * Gg) + Gc);
201
202 return prob;
203 }
204
nroot(FloatHi fv,Int4 wv)205 static FloatHi nroot (FloatHi fv, Int4 wv)
206 {
207 FloatHi fi = 1.0;
208
209 fi = fi / (FloatHi) wv;
210 fv = pow (fv, fi);
211
212 return fv;
213 }
214
PredictCC(CharPtr seq,Int4 start,Int4 end,PccDatPtr pccp)215 extern FloatHiPtr PredictCC (CharPtr seq, Int4 start, Int4 end,
216 PccDatPtr pccp)
217 {
218 Int4 i, n, col;
219 Int4 stop;
220 CharPtr seqhead;
221 FloatHiPtr fptrhead, fptrtemp, fptr;
222 FloatHi sqrt2pi, temp;
223 Boolean flagMatch;
224
225 if (seq == NULL || pccp == NULL)
226 return NULL;
227
228 if ((fptr = (FloatHiPtr) MemNew ((size_t) (sizeof (FloatHi) *
229 (end-start+1+pccp->window)))) == NULL)
230 {
231 return fptr;
232 }
233
234 sqrt2pi = nroot ((2.0*PI), 2);
235
236 fptrhead = fptrtemp = fptr;
237 seqhead = seq;
238 stop = end - pccp->window;
239 for (i = start; i < stop; i++)
240 {
241 seq = seqhead;
242 temp = 1.0;
243 for (col = 0; col < pccp->window; col++)
244 {
245 flagMatch = FALSE;
246 n = 0;
247 while (pccp->res[n] != '\0')
248 {
249 if (*seq == pccp->res[n])
250 {
251 flagMatch = TRUE;
252 break;
253 }
254 n++;
255 }
256 if (flagMatch)
257 temp *= pccp->scr[n][col];
258 seq++;
259 }
260 temp = nroot (temp, pccp->window);
261 temp = ProbPredictCC (temp, pccp, sqrt2pi);
262 if (temp < 0.0)
263 temp = 0.0;
264 fptr = fptrtemp;
265 for (n = 0; n < pccp->window; n++)
266 {
267 if (temp > *fptr)
268 *fptr = temp;
269 fptr++;
270 }
271 fptrtemp++;
272 seqhead++;
273 }
274 return fptrhead;
275 }
276
277 /*
278 seqport should be opened full length (0 to bsp->length-1)
279 start and end reflect where you want to search
280 */
281
PredictCCSeqPort(SeqPortPtr spp,Int4 start,Int4 end,PccDatPtr pccp)282 extern FloatHiPtr PredictCCSeqPort (SeqPortPtr spp,
283 Int4 start, Int4 end,
284 PccDatPtr pccp)
285 {
286 CharPtr seq, seqhead;
287 Int4 i;
288 FloatHiPtr pccprob;
289
290 if (spp == NULL || pccp == NULL)
291 return NULL;
292
293 seq = seqhead = (CharPtr) MemNew ((size_t) (sizeof (Char) *
294 (end-start+1)));
295 if (seq == NULL)
296 return NULL;
297
298 SeqPortSeek (spp, start, SEEK_SET);
299 for (i = start; i <= end; i++)
300 {
301 *seq = SeqPortGetResidue (spp);
302 seq++;
303 }
304 pccprob = PredictCC (seqhead, start, end, pccp);
305 MemFree (seqhead);
306 return pccprob;
307 }
308
PredictCCBioseq(BioseqPtr bsp,Int4 start,Int4 end,PccDatPtr pccp)309 extern FloatHiPtr PredictCCBioseq (BioseqPtr bsp,
310 Int4 start, Int4 end,
311 PccDatPtr pccp)
312 {
313 SeqPortPtr spp;
314 FloatHiPtr pccprob;
315
316 if (bsp == NULL || pccp == NULL)
317 return NULL;
318
319 if (!ISA_aa (bsp->mol))
320 return NULL;
321
322 spp = SeqPortNew (bsp, 0, bsp->length-1, 0, Seq_code_iupacaa);
323 pccprob = PredictCCSeqPort (spp, start, end, pccp);
324 SeqPortFree (spp);
325 return pccprob;
326 }
327
PredictCCSeqLoc(SeqLocPtr slp,PccDatPtr pccp)328 extern FloatHiPtr PredictCCSeqLoc (SeqLocPtr slp,
329 PccDatPtr pccp)
330 {
331 BioseqPtr bsp;
332 Int4 start, end;
333 FloatHiPtr pccprob;
334
335 if (slp == NULL || pccp == NULL)
336 return NULL;
337
338 if (slp->choice != SEQLOC_INT)
339 return NULL;
340
341 if ((bsp = BioseqLockById (SeqLocId (slp))) == NULL)
342 return NULL;
343
344 if (!ISA_aa (bsp->mol))
345 {
346 BioseqUnlock (bsp);
347 return NULL;
348 }
349
350 start = SeqLocStart (slp);
351 end = SeqLocStop (slp);
352 pccprob = PredictCCBioseq (bsp, start, end, pccp);
353 BioseqUnlock (bsp);
354 return pccprob;
355 }
356
FilterCC(FloatHiPtr score,FloatHi percentcut,Int4 length,Int4 linker,SeqIdPtr sip,Boolean flagBoundaryCondition)357 extern SeqLocPtr FilterCC (FloatHiPtr score, FloatHi percentcut,
358 Int4 length, Int4 linker, SeqIdPtr sip,
359 Boolean flagBoundaryCondition)
360 {
361 Int4 i;
362 Int4 start, stop;
363 FloatHi lopr, hipr;
364 SeqLocPtr nextslp, slp, slph = NULL;
365 SeqIntPtr sint;
366
367 if (score == NULL)
368 return NULL;
369
370 percentcut /= 100.0;
371 lopr = percentcut * 0.80;
372 hipr = percentcut + (percentcut - lopr);
373
374 if (flagBoundaryCondition)
375 {
376 for (i = 0; i < length; i++)
377 {
378 if (*score >= lopr && *score <= hipr)
379 break;
380 score++;
381 }
382 }
383 else
384 {
385 for (i = 0; i < length; i++)
386 {
387 if (*score > percentcut)
388 break;
389 score++;
390 }
391 }
392 if (i == length)
393 return NULL;
394
395 if (flagBoundaryCondition)
396 {
397 while (i < length)
398 {
399 if (*score >= lopr && *score <= hipr)
400 {
401 start = i;
402 while (*score >= lopr && *score <= hipr && i < length)
403 {
404 score++;
405 i++;
406 }
407 stop = i - 1;
408 slp = SeqLocIntNew (start, stop, Seq_strand_unknown, sip);
409 ValNodeLink (&slph, slp);
410 }
411 else
412 {
413 score++;
414 i++;
415 }
416 }
417 }
418 else
419 {
420 while (i < length)
421 {
422 if (*score > percentcut)
423 {
424 start = i;
425 while (*score > percentcut && i < length)
426 {
427 score++;
428 i++;
429 }
430 stop = i - 1;
431 slp = SeqLocIntNew (start, stop, Seq_strand_unknown, sip);
432 ValNodeLink (&slph, slp);
433 }
434 else
435 {
436 score++;
437 i++;
438 }
439 }
440 }
441
442 if (linker > 0)
443 {
444 slp = slph;
445 while (slp != NULL)
446 {
447 if (slp->next != NULL)
448 {
449 nextslp = slp->next;
450 stop = SeqLocStop (slp);
451 start = SeqLocStart (nextslp);
452 if (start-stop-1 <= linker)
453 {
454 sint = slp->data.ptrvalue;
455 sint->to = SeqLocStop (nextslp);
456 slp->next = nextslp->next;
457 nextslp->next = NULL;
458 SeqLocFree (nextslp);
459 continue;
460 }
461 }
462 slp = slp->next;
463 }
464 }
465
466 return slph;
467 }
468
FilterCCVS(FloatHiPtr score,FloatHi percentcut,Int4 length,Int4 linker,SeqIdPtr sip,Boolean flagBoundaryCondition)469 extern SeqLocPtr FilterCCVS (FloatHiPtr score, FloatHi percentcut,
470 Int4 length, Int4 linker, SeqIdPtr sip,
471 Boolean flagBoundaryCondition)
472
473 {
474 Int4 i;
475 Int4 start, stop;
476 FloatHi lopr, hipr;
477 SeqLocPtr nextslp, slp, slph = NULL;
478 SeqIntPtr sint;
479 Int4 cccount;
480
481 if (score == NULL)
482 return NULL;
483
484 percentcut /= 100.0;
485 lopr = percentcut * 0.80;
486 hipr = percentcut + (percentcut - lopr);
487
488 if (flagBoundaryCondition)
489 {
490 for (i = 0; i < length; i++)
491 {
492 if (*score >= lopr && *score <= hipr)
493 break;
494 score++;
495 }
496 }
497 else
498 {
499 for (i = 0; i < length; i++)
500 {
501 if (*score > percentcut)
502 break;
503 score++;
504 }
505 }
506 if (i == length)
507 return NULL;
508
509 if (flagBoundaryCondition)
510 {
511 while (i < length)
512 {
513 if (*score >= lopr && *score <= hipr)
514 {
515 start = i;
516 while (*score >= lopr && *score <= hipr && i < length)
517 {
518 score++;
519 i++;
520 }
521 stop = i - 1;
522 slp = SeqLocIntNew (start, stop, Seq_strand_unknown, sip);
523 ValNodeLink (&slph, slp);
524 }
525 else
526 {
527 score++;
528 i++;
529 }
530 }
531 }
532 else
533 {
534 while (i < length)
535 {
536 if (*score > percentcut)
537 {
538 start = i;
539 while (*score > percentcut && i < length)
540 {
541 score++;
542 i++;
543 }
544 stop = i - 1;
545 slp = SeqLocIntNew (start, stop, Seq_strand_unknown, sip);
546 ValNodeLink (&slph, slp);
547 }
548 else
549 {
550 score++;
551 i++;
552 }
553 }
554 }
555
556 if (linker > 0)
557 {
558 cccount = 0;
559 slp = slph;
560 while (slp != NULL)
561 {
562 cccount++;
563 slp = slp->next;
564 }
565 linker *= (cccount-1);
566 slp = slph;
567 while (slp != NULL)
568 {
569 if (slp->next != NULL)
570 {
571 nextslp = slp->next;
572 stop = SeqLocStop (slp);
573 start = SeqLocStart (nextslp);
574 if (start-stop-1 <= linker)
575 {
576 sint = slp->data.ptrvalue;
577 sint->to = SeqLocStop (nextslp);
578 slp->next = nextslp->next;
579 nextslp->next = NULL;
580 SeqLocFree (nextslp);
581 continue;
582 }
583 }
584 slp = slp->next;
585 }
586 }
587
588 return slph;
589 }
590