1 static char const rcsid[] = "$Id: urkepi.c,v 6.19 2003/05/30 17:25:38 coulouri Exp $";
2
3 /*
4 * ===========================================================================
5 *
6 * PUBLIC DOMAIN NOTICE
7 * National Center for Biotechnology Information
8 *
9 * This software/database is a "United States Government Work" under the
10 * terms of the United States Copyright Act. It was written as part of
11 * the author's official duties as a United States Government employee and
12 * thus cannot be copyrighted. This software/database is freely available
13 * to the public for use. The National Library of Medicine and the U.S.
14 * Government have not placed any restriction on its use or reproduction.
15 *
16 * Although all reasonable efforts have been taken to ensure the accuracy
17 * and reliability of the software and data, the NLM and the U.S.
18 * Government do not and cannot warrant the performance or results that
19 * may be obtained by using this software or data. The NLM and the U.S.
20 * Government disclaim all warranties, express or implied, including
21 * warranties of performance, merchantability or fitness for any particular
22 * purpose.
23 *
24 * Please cite the author in any work or product based on this material.
25 *
26 * ===========================================================================
27 *
28 * File Name: urkepi.c
29 *
30 * Author(s): John Kuzio
31 *
32 * Version Creation Date: 98-01-01
33 *
34 * $Revision: 6.19 $
35 *
36 * File Description: epi - low complexity
37 *
38 * Modifications:
39 * --------------------------------------------------------------------------
40 * Date Name Description of modification
41 * --------------------------------------------------------------------------
42 * $Log: urkepi.c,v $
43 * Revision 6.19 2003/05/30 17:25:38 coulouri
44 * add rcsid
45 *
46 * Revision 6.18 1999/02/25 15:06:48 kuzio
47 * commutative prob func optimization
48 *
49 * Revision 6.17 1998/11/16 14:29:50 kuzio
50 * flagBoundaryCondition
51 *
52 * Revision 6.16 1998/09/16 18:03:33 kuzio
53 * cvs logging
54 *
55 *
56 * ==========================================================================
57 */
58
59 #include <urkepi.h>
60
61 /* ideal */
62 /* #define POSSIBLE_RESIDUES 20 */
63 /* 20 amino acids + selenocysteine (U) + stop (*) + unknown (X) */
64 /* #define POSSIBLE_RESIDUES 23 */
65 /* realistic; ideal + others */
66 #define POSSIBLE_RESIDUES 21
67 #define POSSIBLE_AA_RESIDUES 21
68 /* nucleotides ... just A C G T */
69 /* #define POSSIBLE_NA_RESIDUES 4 */
70 /* nucleotides ... A C G T N */
71 #define POSSIBLE_NA_RESIDUES 5
72
EpiDatNew(void)73 extern EpiDatPtr EpiDatNew (void)
74 {
75 EpiDatPtr epip;
76
77 if ((epip = (EpiDatPtr) MemNew (sizeof (EpiDat))) != NULL)
78 {
79 epip->method = 0; /* no optimization */
80 epip->linker = 5; /* less than "classical" epitope */
81 epip->mwindow = 6;
82 epip->mpercentcut = 0.0;
83 epip->window = 16;
84 epip->percentcut = 4.0;
85 epip->nucleotuple = 3;
86 if ((epip->epiarray = (Int4Ptr) MemNew ((size_t) (sizeof (Int4) * 256)))
87 == NULL)
88 {
89 return (EpiDatPtr) MemFree (epip);
90 }
91 }
92 return epip;
93 }
94
EpiDatFree(EpiDatPtr epip)95 extern EpiDatPtr EpiDatFree (EpiDatPtr epip)
96 {
97 if (epip != NULL)
98 {
99 MemFree (epip->sequence);
100 MemFree (epip->epiarray);
101 MemFree (epip->score);
102 epip = (EpiDatPtr) MemFree (epip);
103 }
104 return epip;
105 }
106
ArrayScore(Int4Ptr epiarray)107 static FloatHi ArrayScore (Int4Ptr epiarray)
108 {
109 FloatHi episcore;
110 Int4 i;
111
112 episcore = 1.0;
113 for (i = (Int4) 'A'; i <= (Int4) 'Z'; i++)
114 episcore *= (FloatHi) epiarray[i];
115
116 return episcore;
117 }
118
NucArrayScore(Int4Ptr epiarray,Int4 maxarray)119 static FloatHi NucArrayScore (Int4Ptr epiarray, Int4 maxarray)
120 {
121 FloatHi episcore;
122 Int4 i;
123
124 episcore = 1.0;
125 for (i = 0; i < maxarray; i++)
126 episcore *= (FloatHi) epiarray[i];
127
128 return episcore;
129 }
130
DownArrayScore(FloatHi episcore,FloatHi arrayvalue)131 static FloatHi DownArrayScore (FloatHi episcore, FloatHi arrayvalue)
132 {
133 return episcore / arrayvalue;
134 }
135
UpArrayScore(FloatHi episcore,FloatHi arrayvalue)136 static FloatHi UpArrayScore (FloatHi episcore, FloatHi arrayvalue)
137 {
138 return episcore * arrayvalue;
139 }
140
UpdateScore(FloatHiPtr fptrscan,FloatHi episcore,Int4 start,Int4 end)141 static void UpdateScore (FloatHiPtr fptrscan, FloatHi episcore,
142 Int4 start, Int4 end)
143 {
144 Int4 i;
145
146 for (i = start; i <= end; i++)
147 {
148 if (episcore < *fptrscan)
149 *fptrscan = episcore;
150 fptrscan++;
151 }
152 return;
153 }
154
MaxScore(Int4 window,Int4 possibleresidues)155 static FloatHi MaxScore (Int4 window, Int4 possibleresidues)
156 {
157 FloatHi maxscore;
158 Int4 a, b, m, n;
159
160 a = window / possibleresidues;
161 if ((m = window % possibleresidues) != 0)
162 a++;
163 else
164 m = possibleresidues;
165 b = a + 1;
166 n = possibleresidues - m;
167 maxscore = (FloatHi) (pow ((FloatHi) a, (FloatHi) n) *
168 pow ((FloatHi) b, (FloatHi) m));
169 return maxscore;
170 }
171
PercentEpiScore(FloatHi episcore,Int4 win,FloatHi maxscore)172 static FloatHi PercentEpiScore (FloatHi episcore, Int4 win, FloatHi maxscore)
173 {
174 return (episcore - (FloatHi) (win + 1)) * 100.0 / maxscore;
175 }
176
PredictEpiI0(CharPtr seqin,Int4 start,Int4 end,Int4 window,EpiDatPtr epip,FloatHiPtr fptrhead,FloatHi percentcut)177 static Int4 PredictEpiI0 (CharPtr seqin, Int4 start, Int4 end, Int4 window,
178 EpiDatPtr epip, FloatHiPtr fptrhead,
179 FloatHi percentcut)
180 {
181 Int4 i, j, k;
182 CharPtr seq;
183 FloatHiPtr fptr;
184 FloatHi maxscore, episcore, percentscore;
185 Int4 minwin;
186
187 if (seqin == NULL || epip == NULL)
188 return -1;
189
190 seq = &seqin[start];
191 fptr = &fptrhead[start];
192 minwin = window;
193
194 /* "zero" array for every cycle */
195 for (j = (Int4) 'A'; j <= (Int4) 'Z'; j++)
196 epip->epiarray[j] = 1;
197
198 /* minwin-mer set up */
199 k = start + minwin - 1;
200 for (j = start; j < k && j <= end; j++)
201 {
202 switch (*seq)
203 {
204 case '*':
205 case 'B':
206 case 'J':
207 case 'O':
208 case 'U':
209 case 'X':
210 case 'Z':
211 epip->epiarray['X'] += 1;
212 break;
213 default:
214 epip->epiarray[*seq] += 1;
215 break;
216 }
217 seq++;
218 }
219 maxscore = MaxScore (window, POSSIBLE_AA_RESIDUES);
220 episcore = ArrayScore (epip->epiarray);
221
222 /* run -- rolling scan */
223 for (i = k; i <= end; i++)
224 {
225 switch (*seq)
226 {
227 case '*':
228 case 'B':
229 case 'J':
230 case 'O':
231 case 'U':
232 case 'X':
233 case 'Z':
234 episcore = DownArrayScore (episcore, epip->epiarray['X']);
235 epip->epiarray['X'] += 1;
236 episcore = UpArrayScore (episcore, epip->epiarray['X']);
237 break;
238 default:
239 episcore = DownArrayScore (episcore, epip->epiarray[*seq]);
240 epip->epiarray[*seq] += 1;
241 episcore = UpArrayScore (episcore, epip->epiarray[*seq]);
242 break;
243 }
244 if ((percentscore = PercentEpiScore (episcore, window, maxscore))
245 <= percentcut)
246 UpdateScore (fptr, percentscore, i-window+1, i);
247 fptr++;
248 seq++;
249 switch (*(seq-window))
250 {
251 case '*':
252 case 'B':
253 case 'J':
254 case 'O':
255 case 'U':
256 case 'X':
257 case 'Z':
258 episcore = DownArrayScore (episcore, epip->epiarray['X']);
259 epip->epiarray['X'] -= 1;
260 episcore = UpArrayScore (episcore, epip->epiarray['X']);
261 break;
262 default:
263 episcore = DownArrayScore (episcore, epip->epiarray[*(seq-window)]);
264 epip->epiarray[*(seq-window)] -= 1;
265 episcore = UpArrayScore (episcore, epip->epiarray[*(seq-window)]);
266 break;
267 }
268 }
269 return (i - 1);
270 }
271
NumericSequence(CharPtr seq,Int4 start,Int4 stop,Int4 epi)272 static Int4Ptr NumericSequence (CharPtr seq, Int4 start, Int4 stop, Int4 epi)
273 {
274 Int4Ptr sn, seqnumarray;
275 Int4 i, k;
276
277 if ((sn = seqnumarray = (Int4Ptr) MemNew ((size_t)
278 (sizeof (Int4) * (stop-start+1)))) == NULL)
279 {
280 return NULL;
281 }
282 /* rock */
283 for (i = start; i < start+epi-1; i++)
284 {
285 *sn *= POSSIBLE_NA_RESIDUES;
286 switch (*seq)
287 {
288 case 'A':
289 *sn += 0;
290 break;
291 case 'C':
292 *sn += 1;
293 break;
294 case 'G':
295 *sn += 2;
296 break;
297 case 'T':
298 *sn += 3;
299 break;
300 default:
301 *sn += 4;
302 break;
303 }
304 seq++;
305 }
306 /* roll */
307 k = (Int4) pow (POSSIBLE_NA_RESIDUES, (FloatHi) (epi-1));
308 for (i = start+epi-1; i <= stop; i++)
309 {
310 *sn *= POSSIBLE_NA_RESIDUES;
311 switch (*seq)
312 {
313 case 'A':
314 *sn += 0;
315 break;
316 case 'C':
317 *sn += 1;
318 break;
319 case 'G':
320 *sn += 2;
321 break;
322 case 'T':
323 *sn += 3;
324 break;
325 default:
326 *sn += 4;
327 break;
328 }
329 *(sn+1) = *sn;
330 seq++;
331 switch (*(seq-epi))
332 {
333 case 'A':
334 *(sn+1) -= (0 * k);
335 break;
336 case 'C':
337 *(sn+1) -= (1 * k);
338 break;
339 case 'G':
340 *(sn+1) -= (2 * k);
341 break;
342 case 'T':
343 *(sn+1) -= (3 * k);
344 break;
345 default:
346 *(sn+1) -= (4 * k);
347 break;
348 }
349 sn++;
350 }
351 return seqnumarray;
352 }
353
PredictEpiI1(CharPtr seqin,Int4 start,Int4 end,Int4 window,EpiDatPtr epip,FloatHiPtr fptrhead,FloatHi percentcut,Int4 episode)354 static Int4 PredictEpiI1 (CharPtr seqin, Int4 start, Int4 end, Int4 window,
355 EpiDatPtr epip, FloatHiPtr fptrhead,
356 FloatHi percentcut, Int4 episode)
357 {
358 Int4 i, j, k, kpow;
359 CharPtr seq;
360 FloatHiPtr fptr;
361 FloatHi maxscore, episcore, percentscore;
362 Int4Ptr sn, seqnumarray;
363
364 if (seqin == NULL || epip == NULL)
365 return -1;
366
367 seq = &seqin[start];
368 fptr = &fptrhead[start];
369
370 /* "zero" array for every cycle */
371 MemFree (epip->epiarray);
372 kpow = (Int4) pow (POSSIBLE_NA_RESIDUES, (FloatHi) episode);
373 if ((epip->epiarray = (Int4Ptr) MemNew ((size_t)
374 (sizeof (Int4) * (kpow+1)))) == NULL)
375 {
376 return -1;
377 }
378 for (j = 0; j < kpow; j++)
379 epip->epiarray[j] = 1;
380
381 /* ACGTN for nucleotide sequence */
382 if ((sn = seqnumarray = NumericSequence (seq, start, end, episode)) == NULL)
383 return -1;
384
385 /* min_window_mer set up */
386 k = start + window - 1;
387 i = 0;
388 for (j = start; j < k && j <= end; j++)
389 {
390 epip->epiarray[sn[i]] += 1;
391 i++;
392 }
393 maxscore = MaxScore (window, kpow);
394 episcore = NucArrayScore (epip->epiarray, kpow);
395
396 /* run -- rolling scan */
397 for (j = k; j <= end; j++)
398 {
399 episcore = DownArrayScore (episcore, epip->epiarray[sn[i]]);
400 epip->epiarray[sn[i]] += 1;
401 episcore = UpArrayScore (episcore, epip->epiarray[sn[i]]);
402 if ((percentscore = PercentEpiScore (episcore, window, maxscore))
403 <= percentcut)
404 UpdateScore (fptr, percentscore, i-window+1, i+episode-1);
405 fptr++;
406 i++;
407 episcore = DownArrayScore (episcore, epip->epiarray[sn[i-window]]);
408 epip->epiarray[sn[i-window]] -= 1;
409 episcore = UpArrayScore (episcore, epip->epiarray[sn[i-window]]);
410 }
411 MemFree (seqnumarray);
412 return (i - 1);
413 }
414
PredictEpi(CharPtr seqin,Int4 start,Int4 end,EpiDatPtr epip,Boolean flagIsAA)415 extern FloatHiPtr PredictEpi (CharPtr seqin, Int4 start, Int4 end,
416 EpiDatPtr epip, Boolean flagIsAA)
417 {
418 Int4 i, n, retval;
419 FloatHiPtr epiprob, epitemp;
420 FloatHi pcut, dcut;
421 Int4 minwin;
422 FloatHi mcut;
423
424 if (seqin == NULL || epip == NULL)
425 return NULL;
426
427 if (start + (epip->window - 1) > end)
428 return NULL;
429
430 if ((epiprob = (FloatHiPtr) MemNew ((size_t) (sizeof (FloatHi) *
431 (end-start+2)))) == NULL)
432 {
433 return NULL;
434 }
435
436 if ((epitemp = (FloatHiPtr) MemNew ((size_t) (sizeof (FloatHi) *
437 (end-start+2)))) == NULL)
438 {
439 MemFree (epiprob);
440 return NULL;
441 }
442
443 minwin = epip->mwindow;
444 mcut = epip->mpercentcut;
445 for (i = start; i <= end; i++)
446 {
447 epiprob[i] = 100.0;
448 epitemp[i] = 100.0;
449 }
450 dcut = (epip->percentcut - mcut) / (FloatHi) (epip->window - minwin);
451 for (n = minwin; n <= epip->window; n++)
452 {
453 pcut = mcut + ((FloatHi) (n - minwin) * dcut);
454 if (flagIsAA)
455 retval = PredictEpiI0 (seqin, start, end, n, epip, epitemp, pcut);
456 else
457 retval = PredictEpiI1 (seqin, start, end, n, epip, epitemp, pcut,
458 epip->nucleotuple);
459 if (retval < 0)
460 {
461 ErrPostEx (SEV_ERROR, 1, 1, "EPI prediction error");
462 ErrShow ();
463 break;
464 }
465 for (i = start; i <= end; i++)
466 {
467 if (epitemp[i] < epiprob[i])
468 epiprob[i] = epitemp[i];
469 epitemp[i] = 100.0;
470 }
471 }
472 MemFree (epitemp);
473 return epiprob;
474 }
475
476 /*
477 seqport should be opened full length (0 to bsp->length-1)
478 start and end reflect where you want to search
479 */
480
PredictEpiSeqPort(SeqPortPtr spp,Int4 start,Int4 end,EpiDatPtr epip,Boolean flagIsAA)481 extern FloatHiPtr PredictEpiSeqPort (SeqPortPtr spp,
482 Int4 start, Int4 end,
483 EpiDatPtr epip, Boolean flagIsAA)
484 {
485 Int4 i, chunk;
486 Uint1Ptr seq, sq;
487 FloatHiPtr epiprob;
488
489 if (spp == NULL || epip == NULL)
490 return NULL;
491
492 sq = seq = (Uint1Ptr) MemNew ((size_t) (sizeof (Char) * (end-start+2)));
493 if (seq == NULL)
494 return NULL;
495
496 i = start;
497 chunk = 4096;
498 while (i < end)
499 {
500 SeqPortRead (spp, sq, (Int2) chunk);
501 sq += chunk;
502 i += chunk;
503 }
504 seq[end-start+1] = 0;
505 epip->sequence = seq;
506 epiprob = PredictEpi ((CharPtr) seq, start, end, epip, flagIsAA);
507 return epiprob;
508 }
509
PredictEpiBioseq(BioseqPtr bsp,Int4 start,Int4 end,EpiDatPtr epip)510 extern FloatHiPtr PredictEpiBioseq (BioseqPtr bsp,
511 Int4 start, Int4 end,
512 EpiDatPtr epip)
513 {
514 SeqPortPtr spp;
515 FloatHiPtr epiprob;
516 Boolean flagIsAA;
517
518 if (bsp == NULL || epip == NULL)
519 return NULL;
520
521 flagIsAA = (Boolean) ISA_aa (bsp->mol);
522
523 if (flagIsAA)
524 spp = SeqPortNew (bsp, 0, bsp->length-1, 0, Seq_code_iupacaa);
525 else
526 spp = SeqPortNew (bsp, 0, bsp->length-1, 0, Seq_code_iupacna);
527 epiprob = PredictEpiSeqPort (spp, start, end, epip, flagIsAA);
528 SeqPortFree (spp);
529 return epiprob;
530 }
531
PredictEpiSeqLoc(SeqLocPtr slp,EpiDatPtr epip)532 extern FloatHiPtr PredictEpiSeqLoc (SeqLocPtr slp,
533 EpiDatPtr epip)
534 {
535 BioseqPtr bsp;
536 Int4 start, end;
537 FloatHiPtr epiprob;
538
539 if (slp == NULL || epip == NULL)
540 return NULL;
541
542 if (slp->choice != SEQLOC_INT)
543 return NULL;
544
545 if ((bsp = BioseqLockById (SeqLocId (slp))) == NULL)
546 return NULL;
547
548 start = SeqLocStart (slp);
549 end = SeqLocStop (slp);
550 epiprob = PredictEpiBioseq (bsp, start, end, epip);
551 BioseqUnlock (bsp);
552 return epiprob;
553 }
554
OptimizeEpiComp(CharPtr seq,Int4 window,SeqLocPtr slp,Boolean flagIsAA)555 static void OptimizeEpiComp (CharPtr seq, Int4 window, SeqLocPtr slp,
556 Boolean flagIsAA)
557 {
558 Int4 i, start, stop, span;
559 SeqIntPtr sint;
560 Int4 bintot, bin[256];
561 FloatHi expect, fbin[256];
562
563 if (seq == NULL || slp == NULL)
564 return;
565 if (StrLen (seq) == 0)
566 return;
567
568 while (slp != NULL)
569 {
570 bintot = 0;
571 for (i = (Int4) 'A'; i <= (Int4) 'Z'; i++)
572 bin[i] = 0;
573 start = SeqLocStart (slp);
574 stop = SeqLocStop (slp);
575 /* don't have to examine the whole sequence; just end optimization */
576 if (flagIsAA)
577 {
578 span = (stop - start) / 2;
579 if (span > window / 2)
580 span = window / 2;
581 }
582 else
583 {
584 span = (stop - start) - 3;
585 if (span > window - 2)
586 span = window - 2;
587 }
588 if (flagIsAA)
589 {
590 for (i = start; i <= stop; i++)
591 {
592 switch (seq[i])
593 {
594 case '*':
595 case 'B':
596 case 'J':
597 case 'O':
598 case 'U':
599 case 'X':
600 case 'Z':
601 bin['X'] += 1;
602 bintot++;
603 break;
604 default:
605 bin[seq[i]] += 1;
606 bintot++;
607 break;
608 }
609 }
610 }
611 else
612 {
613 for (i = start; i <= stop; i++)
614 {
615 switch (seq[i])
616 {
617 case 'A':
618 case 'C':
619 case 'G':
620 case 'T':
621 bin[seq[i]] += 1;
622 bintot++;
623 break;
624 default:
625 bin['N'] += 1;
626 bintot++;
627 break;
628 }
629 }
630 }
631 /* very small windows biased */
632 expect = 1.0;
633 if (flagIsAA)
634 {
635 if (bintot > POSSIBLE_AA_RESIDUES/2)
636 expect++;
637 expect += (FloatHi) (bintot / POSSIBLE_AA_RESIDUES);
638 }
639 else
640 {
641 if (bintot > POSSIBLE_NA_RESIDUES/2)
642 expect++;
643 expect += (FloatHi) (bintot / POSSIBLE_NA_RESIDUES);
644 }
645 expect /= (FloatHi) bintot;
646 for (i = (Int4) 'A'; i <= (Int4) 'Z'; i++)
647 fbin[i] = (FloatHi) bin[i] / (FloatHi) bintot;
648 sint = slp->data.ptrvalue;
649 for (i = start; i <= start + span; i++)
650 {
651 if (fbin[seq[i]] > expect)
652 break;
653 sint->from += 1;
654 }
655 for (i = stop; i >= stop - span; i--)
656 {
657 if (fbin[seq[i]] > expect)
658 break;
659 sint->to -= 1;
660 }
661 slp = slp->next;
662 }
663 return;
664 }
665
666 typedef struct HPArray
667 {
668 FloatHi score;
669 Int4 left, right;
670 struct HPArray PNTR next;
671 } HPA, PNTR HPAPtr;
672
HPACompProc(VoidPtr ptr1,VoidPtr ptr2)673 static int LIBCALLBACK HPACompProc (VoidPtr ptr1, VoidPtr ptr2)
674 {
675 HPAPtr hpap1, hpap2;
676
677 if (ptr1 != NULL && ptr2 != NULL)
678 {
679 hpap1 = *((HPAPtr PNTR) ptr1);
680 hpap2 = *((HPAPtr PNTR) ptr2);
681 if (hpap1 != NULL && hpap2 != NULL)
682 {
683 if (hpap1->score > hpap2->score)
684 return 1;
685 else if (hpap1->score < hpap2->score)
686 return -1;
687 }
688 }
689 return 0;
690 }
691
SortHPArray(HPAPtr PNTR hpap)692 extern void SortHPArray (HPAPtr PNTR hpap)
693 {
694 HPAPtr hpapt, PNTR hpaph;
695 Int4 num, i;
696
697 num = 0;
698 hpapt = *hpap;
699 while (hpapt != NULL)
700 {
701 num++;
702 hpapt = hpapt->next;
703 }
704
705 if (num > 1)
706 {
707 hpaph = MemNew ((size_t) ((num+1) * sizeof (HPAPtr)));
708 i = 0;
709 hpapt = *hpap;
710 while (hpapt != NULL && i < num)
711 {
712 hpaph[i] = hpapt;
713 hpapt = hpapt->next;
714 i++;
715 }
716 HeapSort (hpaph, num, sizeof (HPAPtr), HPACompProc);
717 for (i = 0; i < num; ++i)
718 {
719 hpapt = hpaph[i];
720 hpapt->next = hpaph[i+1];
721 }
722 *hpap = hpaph[0];
723 MemFree (hpaph);
724 }
725 return;
726 }
727
OptimizeEpiHP(CharPtr seq,Int4 window,SeqLocPtr slp,Boolean flagIsAA)728 static void OptimizeEpiHP (CharPtr seq, Int4 window, SeqLocPtr slp,
729 Boolean flagIsAA)
730 {
731 Int4 i, n, j, k, start, stop, span;
732 SeqIntPtr sint;
733 HPAPtr hpaph = NULL, hpapt, hpap;
734 EpiDatPtr epip;
735 FloatHi mincom, maxcom;
736 Int4 wincom;
737
738 if (seq == NULL || slp == NULL)
739 return;
740 if (StrLen (seq) == 0)
741 return;
742
743 epip = EpiDatNew ();
744 while (slp != NULL)
745 {
746 sint = slp->data.ptrvalue;
747 start = SeqLocStart (slp);
748 stop = SeqLocStop (slp);
749 /* don't have to examine the whole sequence; just end optimization */
750 if (flagIsAA)
751 {
752 span = (stop - start) / 2;
753 if (span > window / 2)
754 span = window / 2;
755 }
756 else
757 {
758 span = (stop - start) - 3;
759 if (span > window - 2)
760 span = window - 2;
761 }
762 for (i = start; i <= start + span; i++)
763 {
764 for (k = (Int4) 'A'; k <= (Int4) 'Z'; k++)
765 epip->epiarray[k] = 1;
766 if (flagIsAA)
767 {
768 for (j = i; j <= stop; j++)
769 {
770 switch (seq[j])
771 {
772 case '*':
773 case 'B':
774 case 'J':
775 case 'O':
776 case 'U':
777 case 'X':
778 case 'Z':
779 epip->epiarray['X'] += 1;
780 break;
781 default:
782 epip->epiarray[seq[j]] += 1;
783 break;
784 }
785 }
786 }
787 else
788 {
789 for (j = i; j <= stop; j++)
790 {
791 switch (seq[j])
792 {
793 case 'A':
794 case 'C':
795 case 'G':
796 case 'T':
797 epip->epiarray[seq[j]] += 1;
798 break;
799 default:
800 epip->epiarray['N'] += 1;
801 break;
802 }
803 }
804 }
805 for (n = stop; n >= stop - span; n--)
806 {
807 hpap = (HPAPtr) MemNew (sizeof (HPA));
808 if (hpaph == NULL)
809 {
810 hpaph = hpap;
811 }
812 else
813 {
814 hpapt = hpaph;
815 while (hpapt->next != NULL)
816 hpapt = hpapt->next;
817 hpapt->next = hpap;
818 }
819 hpap->score = ArrayScore (epip->epiarray);
820 hpap->left = i;
821 hpap->right = n;
822 wincom = hpap->right - hpap->left + 1;
823 maxcom = MaxScore (wincom, POSSIBLE_AA_RESIDUES);
824 hpap->score = PercentEpiScore (hpap->score, wincom, maxcom);
825 switch (seq[n])
826 {
827 case '*':
828 case 'B':
829 case 'J':
830 case 'O':
831 case 'U':
832 case 'X':
833 case 'Z':
834 epip->epiarray['X'] -= 1;
835 break;
836 default:
837 epip->epiarray[seq[n]] -= 1;
838 break;
839 }
840 }
841 }
842 if (hpaph != NULL)
843 {
844 SortHPArray (&hpaph);
845 hpap = hpaph;
846 mincom = hpap->score;
847 sint->from = hpap->left;
848 sint->to = hpap->right;
849 while (hpap != NULL && mincom >= hpap->score)
850 {
851 if (sint->from > hpap->left)
852 sint->from = hpap->left;
853 if (sint->to < hpap->right)
854 sint->to = hpap->right;
855 hpap = hpap->next;
856 }
857 hpap = hpaph;
858 while (hpap != NULL)
859 {
860 hpaph = hpap->next;
861 MemFree (hpap);
862 hpap = hpaph;
863 }
864 }
865 slp = slp->next;
866 }
867 EpiDatFree (epip);
868 return;
869 }
870
871 typedef struct CFArray
872 {
873 FloatHi score;
874 Int4 left, right;
875 struct CFArray PNTR next;
876 } CFA, PNTR CFAPtr;
877
CFACompProc(VoidPtr ptr1,VoidPtr ptr2)878 static int LIBCALLBACK CFACompProc (VoidPtr ptr1, VoidPtr ptr2)
879 {
880 CFAPtr cfap1, cfap2;
881
882 if (ptr1 != NULL && ptr2 != NULL)
883 {
884 cfap1 = *((CFAPtr PNTR) ptr1);
885 cfap2 = *((CFAPtr PNTR) ptr2);
886 if (cfap1 != NULL && cfap2 != NULL)
887 {
888 if (cfap1->score > cfap2->score)
889 return 1;
890 else if (cfap1->score < cfap2->score)
891 return -1;
892 }
893 }
894 return 0;
895 }
896
SortCFArray(CFAPtr PNTR cfap)897 extern void SortCFArray (CFAPtr PNTR cfap)
898 {
899 CFAPtr cfapt, PNTR cfaph;
900 Int4 num, i;
901
902 num = 0;
903 cfapt = *cfap;
904 while (cfapt != NULL)
905 {
906 num++;
907 cfapt = cfapt->next;
908 }
909
910 if (num > 1)
911 {
912 cfaph = MemNew ((size_t) ((num+1) * sizeof (CFAPtr)));
913 i = 0;
914 cfapt = *cfap;
915 while (cfapt != NULL && i < num)
916 {
917 cfaph[i] = cfapt;
918 cfapt = cfapt->next;
919 i++;
920 }
921 HeapSort (cfaph, num, sizeof (CFAPtr), CFACompProc);
922 for (i = 0; i < num; ++i)
923 {
924 cfapt = cfaph[i];
925 cfapt->next = cfaph[i+1];
926 }
927 *cfap = cfaph[0];
928 MemFree (cfaph);
929 }
930 return;
931 }
932
OptimizeEpiProbFunc(CharPtr seq,Int4 window,SeqLocPtr slp,Boolean flagIsAA)933 static void OptimizeEpiProbFunc (CharPtr seq, Int4 window, SeqLocPtr slp,
934 Boolean flagIsAA)
935 {
936 Int4 i, n, j, k, start, stop, loop, fact, left, right;
937 Int4 factarr[32], PNTR dfactarr;
938 Int4 alphabet;
939 FloatHi score, tscore;
940 EpiDatPtr epip;
941 CFAPtr cfaph = NULL, cfapt, cfap;
942 SeqIntPtr sint;
943
944 if (seq == NULL || slp == NULL)
945 return;
946 if (StrLen (seq) == 0)
947 return;
948
949 if (flagIsAA)
950 alphabet = 21;
951 else
952 alphabet = 5;
953 epip = EpiDatNew ();
954 dfactarr = (Int4Ptr) MemNew ((size_t) (sizeof (Int4) * window));
955 while (slp != NULL)
956 {
957 sint = slp->data.ptrvalue;
958 start = SeqLocStart (slp);
959 stop = SeqLocStop (slp);
960 for (i = start; i <= stop - window; i++)
961 {
962 for (k = (Int4) 'A'; k <= (Int4) 'Z'; k++)
963 epip->epiarray[k] = 0;
964 loop = 0;
965 score = -1.0;
966 for (n = i; n <= stop; n++)
967 {
968 if (flagIsAA)
969 {
970 switch (seq[n])
971 {
972 case '*':
973 case 'B':
974 case 'J':
975 case 'O':
976 case 'U':
977 case 'X':
978 case 'Z':
979 epip->epiarray['X'] += 1;
980 break;
981 default:
982 epip->epiarray[seq[n]] += 1;
983 break;
984 }
985 }
986 else
987 {
988 switch (seq[n])
989 {
990 case 'A':
991 case 'C':
992 case 'G':
993 case 'T':
994 epip->epiarray[seq[n]] += 1;
995 break;
996 default:
997 epip->epiarray['N'] += 1;
998 break;
999 }
1000 }
1001 loop++;
1002 if (loop >= window)
1003 {
1004 for (j = 0; j < 32; j++)
1005 factarr[j] = 0;
1006 for (j = 0; j < window; j++)
1007 dfactarr[j] = 0;
1008 for (j = 0, k = (Int4) 'A'; k <= (Int4) 'Z'; j++, k++)
1009 if (epip->epiarray[k] > 0)
1010 factarr[j] = epip->epiarray[k];
1011 for (j = 0; j < 32; j++)
1012 if (factarr[j] > 0)
1013 dfactarr[factarr[j]] += 1;
1014 tscore = 1.0;
1015 /* window */
1016 for (j = 2; j <= loop; j++)
1017 tscore *= (FloatHi) j;
1018 /* residues */
1019 for (j = 0; j < 32; j++)
1020 for (k = 2; k <= factarr[j]; k++)
1021 tscore /= (FloatHi) k;
1022 /* alphabet */
1023 for (j = 2; j <= alphabet; j++)
1024 tscore *= (FloatHi) j;
1025 /* counts */
1026 for (j = 0; j < window; j++)
1027 for (k = 2; k <= dfactarr[j]; k++)
1028 tscore /= (FloatHi) k;
1029 fact = 0;
1030 for (j = 0; j < window; j++)
1031 if (dfactarr[j] > 0)
1032 fact++;
1033 fact = alphabet - fact;
1034 for (j = 2; j < fact; j++)
1035 tscore /= (FloatHi) j;
1036 /* score */
1037 if (score < 0.0)
1038 {
1039 left = i;
1040 right = n;
1041 score = tscore;
1042 }
1043 if (tscore < score)
1044 {
1045 left = i;
1046 right = n;
1047 score = tscore;
1048 }
1049 }
1050 cfap = (CFAPtr) MemNew (sizeof (CFA));
1051 cfap->score = score;
1052 cfap->left = left;
1053 cfap->right = right;
1054 if (cfaph == NULL)
1055 {
1056 cfaph = cfap;
1057 }
1058 else
1059 {
1060 cfapt = cfaph;
1061 while (cfapt->next != NULL)
1062 cfapt = cfapt->next;
1063 cfapt->next = cfap;
1064 }
1065 }
1066 if (cfaph != NULL)
1067 {
1068 SortCFArray (&cfaph);
1069 cfap = cfaph;
1070 tscore = cfap->score;
1071 sint->from = cfap->left;
1072 sint->to = cfap->right;
1073 while (cfap != NULL && tscore >= cfap->score)
1074 {
1075 if (sint->from > cfap->left)
1076 sint->from = cfap->left;
1077 if (sint->to < cfap->right)
1078 sint->to = cfap->right;
1079 cfap = cfap->next;
1080 }
1081 cfap = cfaph;
1082 while (cfap != NULL)
1083 {
1084 cfaph = cfap->next;
1085 MemFree (cfap);
1086 cfap = cfaph;
1087 }
1088 }
1089 }
1090 slp = slp->next;
1091 }
1092 EpiDatFree (epip);
1093 MemFree (dfactarr);
1094 return;
1095 }
1096
FilterEpi(EpiDatPtr epip,CharPtr seq,Int4 length,SeqIdPtr sip,Boolean flagHighPass,Boolean flagIsAA,Boolean flagBoundaryCondition)1097 extern SeqLocPtr FilterEpi (EpiDatPtr epip, CharPtr seq, Int4 length,
1098 SeqIdPtr sip, Boolean flagHighPass,
1099 Boolean flagIsAA,
1100 Boolean flagBoundaryCondition)
1101 {
1102 Int4 i;
1103 Int4 start, stop;
1104 SeqLocPtr nextslp, slp, slptmp, slphp, slph = NULL;
1105 SeqIntPtr sint;
1106 FloatHi curscore, lopr, hipr;
1107
1108 if (epip == NULL || epip->score == NULL)
1109 return NULL;
1110
1111 if (flagBoundaryCondition)
1112 {
1113 lopr = epip->percentcut * 0.85;
1114 hipr = epip->percentcut + (epip->percentcut - lopr);
1115 for (i = 0; i < length; i++)
1116 {
1117 if (epip->score[i] >= lopr && epip->score[i] <= hipr)
1118 break;
1119 }
1120 if (i == length)
1121 return NULL;
1122
1123 while (i < length)
1124 {
1125 curscore = epip->score[i];
1126 if (epip->score[i] >= lopr && epip->score[i] <= hipr)
1127 {
1128 start = i;
1129 while (epip->score[i] >= lopr && epip->score[i] <= hipr && i < length)
1130 {
1131 if (epip->score[i] > curscore)
1132 curscore = epip->score[i];
1133 i++;
1134 }
1135 stop = i - 1;
1136 slp = SeqLocIntNew (start, stop, Seq_strand_unknown, sip);
1137 ValNodeLink (&slph, slp);
1138 }
1139 else
1140 {
1141 i++;
1142 }
1143 }
1144 }
1145 else
1146 {
1147 for (i = 0; i < length; i++)
1148 {
1149 if (epip->score[i] <= epip->percentcut)
1150 break;
1151 }
1152 if (i == length)
1153 return NULL;
1154
1155 while (i < length)
1156 {
1157 curscore = epip->score[i];
1158 if (epip->score[i] <= epip->percentcut)
1159 {
1160 start = i;
1161 while (epip->score[i] <= epip->percentcut && i < length)
1162 {
1163 if (epip->score[i] > curscore)
1164 curscore = epip->score[i];
1165 i++;
1166 }
1167 stop = i - 1;
1168 slp = SeqLocIntNew (start, stop, Seq_strand_unknown, sip);
1169 ValNodeLink (&slph, slp);
1170 }
1171 else
1172 {
1173 i++;
1174 }
1175 }
1176 }
1177
1178 switch (epip->method)
1179 {
1180 default:
1181 case 0:
1182 break;
1183 case 1:
1184 OptimizeEpiComp (seq, epip->window, slph, flagIsAA);
1185 break;
1186 case 2:
1187 OptimizeEpiHP (seq, epip->window, slph, flagIsAA);
1188 break;
1189 case 3:
1190 OptimizeEpiProbFunc (seq, epip->window, slph, flagIsAA);
1191 break;
1192 }
1193
1194 if (epip->linker > 0)
1195 {
1196 slp = slph;
1197 while (slp != NULL)
1198 {
1199 if (slp->next != NULL)
1200 {
1201 nextslp = slp->next;
1202 stop = SeqLocStop (slp);
1203 start = SeqLocStart (nextslp);
1204 if (start-stop-1 <= epip->linker)
1205 {
1206 sint = slp->data.ptrvalue;
1207 sint->to = SeqLocStop (nextslp);
1208 slp->next = nextslp->next;
1209 nextslp->next = NULL;
1210 SeqLocFree (nextslp);
1211 continue;
1212 }
1213 }
1214 slp = slp->next;
1215 }
1216 }
1217
1218 if (flagHighPass)
1219 {
1220 slp = slph;
1221 slphp = NULL;
1222 stop = -1;
1223 while (slp != NULL)
1224 {
1225 start = SeqLocStart (slp);
1226 if (start > 0)
1227 {
1228 slptmp = SeqLocIntNew (stop+1, start-1, Seq_strand_unknown, sip);
1229 ValNodeLink (&slphp, slptmp);
1230 }
1231 stop = SeqLocStop (slp);
1232 nextslp = slp->next;
1233 slp->next = NULL;
1234 SeqLocFree (slp);
1235 slp = nextslp;
1236 }
1237 if (stop != length-1)
1238 {
1239 slptmp = SeqLocIntNew (stop+1, length-1, Seq_strand_unknown, sip);
1240 ValNodeLink (&slphp, slptmp);
1241 }
1242 slph = slphp;
1243 }
1244
1245 return slph;
1246 }
1247
FilterEpiBioseq(EpiDatPtr epip,BioseqPtr bsp,Boolean flagHighPass,Boolean flagBoundaryCondition)1248 extern SeqLocPtr FilterEpiBioseq (EpiDatPtr epip, BioseqPtr bsp,
1249 Boolean flagHighPass,
1250 Boolean flagBoundaryCondition)
1251 {
1252 Int4 i, chunk;
1253 SeqLocPtr slp;
1254 SeqPortPtr spp;
1255 Uint1Ptr sq;
1256 Boolean flagIsAA;
1257
1258 if (epip == NULL || epip->score == NULL || bsp == NULL)
1259 return NULL;
1260
1261 flagIsAA = (Boolean) ISA_aa (bsp->mol);
1262
1263 if (epip->method != 0)
1264 {
1265 if (epip->sequence == NULL)
1266 {
1267 sq = epip->sequence = (Uint1Ptr) MemNew ((size_t) (sizeof (Char) *
1268 (bsp->length + 2)));
1269 if (flagIsAA)
1270 spp = SeqPortNew (bsp, 0, bsp->length-1, 0, Seq_code_iupacaa);
1271 else
1272 spp = SeqPortNew (bsp, 0, bsp->length-1, 0, Seq_code_iupacna);
1273 i = 0;
1274 chunk = 4096;
1275 while (i < bsp->length)
1276 {
1277 SeqPortRead (spp, sq, (Int2) chunk);
1278 sq += chunk;
1279 i += chunk;
1280 }
1281 epip->sequence[bsp->length] = 0;
1282 SeqPortFree (spp);
1283 }
1284 slp = FilterEpi (epip, (CharPtr) epip->sequence, bsp->length, bsp->id,
1285 flagHighPass, flagIsAA, flagBoundaryCondition);
1286 }
1287 else
1288 {
1289 slp = FilterEpi (epip, NULL, bsp->length, bsp->id, flagHighPass, flagIsAA,
1290 flagBoundaryCondition);
1291 }
1292
1293 return slp;
1294 }
1295