1 static char const rcsid[] = "$Id: urkepi.c,v 6.19 2003/05/30 17:25:38 coulouri Exp $";
2 
3 /*
4 * ===========================================================================
5 *
6 *                            PUBLIC DOMAIN NOTICE
7 *               National Center for Biotechnology Information
8 *
9 *  This software/database is a "United States Government Work" under the
10 *  terms of the United States Copyright Act.  It was written as part of
11 *  the author's official duties as a United States Government employee and
12 *  thus cannot be copyrighted.  This software/database is freely available
13 *  to the public for use. The National Library of Medicine and the U.S.
14 *  Government have not placed any restriction on its use or reproduction.
15 *
16 *  Although all reasonable efforts have been taken to ensure the accuracy
17 *  and reliability of the software and data, the NLM and the U.S.
18 *  Government do not and cannot warrant the performance or results that
19 *  may be obtained by using this software or data. The NLM and the U.S.
20 *  Government disclaim all warranties, express or implied, including
21 *  warranties of performance, merchantability or fitness for any particular
22 *  purpose.
23 *
24 *  Please cite the author in any work or product based on this material.
25 *
26 * ===========================================================================
27 *
28 * File Name: urkepi.c
29 *
30 * Author(s): John Kuzio
31 *
32 * Version Creation Date: 98-01-01
33 *
34 * $Revision: 6.19 $
35 *
36 * File Description: epi - low complexity
37 *
38 * Modifications:
39 * --------------------------------------------------------------------------
40 * Date       Name        Description of modification
41 * --------------------------------------------------------------------------
42 * $Log: urkepi.c,v $
43 * Revision 6.19  2003/05/30 17:25:38  coulouri
44 * add rcsid
45 *
46 * Revision 6.18  1999/02/25 15:06:48  kuzio
47 * commutative prob func optimization
48 *
49 * Revision 6.17  1998/11/16 14:29:50  kuzio
50 * flagBoundaryCondition
51 *
52 * Revision 6.16  1998/09/16 18:03:33  kuzio
53 * cvs logging
54 *
55 *
56 * ==========================================================================
57 */
58 
59 #include <urkepi.h>
60 
61 /* ideal */
62 /* #define POSSIBLE_RESIDUES 20 */
63 /* 20 amino acids + selenocysteine (U) + stop (*) + unknown (X) */
64 /* #define POSSIBLE_RESIDUES 23 */
65 /* realistic; ideal + others */
66 #define POSSIBLE_RESIDUES 21
67 #define POSSIBLE_AA_RESIDUES 21
68 /* nucleotides ... just A C G T */
69 /* #define POSSIBLE_NA_RESIDUES 4 */
70 /* nucleotides ... A C G T N */
71 #define POSSIBLE_NA_RESIDUES 5
72 
EpiDatNew(void)73 extern EpiDatPtr EpiDatNew (void)
74 {
75   EpiDatPtr epip;
76 
77   if ((epip = (EpiDatPtr) MemNew (sizeof (EpiDat))) != NULL)
78   {
79     epip->method = 0;        /* no optimization */
80     epip->linker = 5;        /* less than "classical" epitope */
81     epip->mwindow = 6;
82     epip->mpercentcut = 0.0;
83     epip->window = 16;
84     epip->percentcut = 4.0;
85     epip->nucleotuple = 3;
86     if ((epip->epiarray = (Int4Ptr) MemNew ((size_t) (sizeof (Int4) * 256)))
87         == NULL)
88     {
89       return (EpiDatPtr) MemFree (epip);
90     }
91   }
92   return epip;
93 }
94 
EpiDatFree(EpiDatPtr epip)95 extern EpiDatPtr EpiDatFree (EpiDatPtr epip)
96 {
97   if (epip != NULL)
98   {
99     MemFree (epip->sequence);
100     MemFree (epip->epiarray);
101     MemFree (epip->score);
102     epip = (EpiDatPtr) MemFree (epip);
103   }
104   return epip;
105 }
106 
ArrayScore(Int4Ptr epiarray)107 static FloatHi ArrayScore (Int4Ptr epiarray)
108 {
109   FloatHi  episcore;
110   Int4     i;
111 
112   episcore = 1.0;
113   for (i = (Int4) 'A'; i <= (Int4) 'Z'; i++)
114     episcore *= (FloatHi) epiarray[i];
115 
116   return episcore;
117 }
118 
NucArrayScore(Int4Ptr epiarray,Int4 maxarray)119 static FloatHi NucArrayScore (Int4Ptr epiarray, Int4 maxarray)
120 {
121   FloatHi  episcore;
122   Int4     i;
123 
124   episcore = 1.0;
125   for (i = 0; i < maxarray; i++)
126     episcore *= (FloatHi) epiarray[i];
127 
128   return episcore;
129 }
130 
DownArrayScore(FloatHi episcore,FloatHi arrayvalue)131 static FloatHi DownArrayScore (FloatHi episcore, FloatHi arrayvalue)
132 {
133   return episcore / arrayvalue;
134 }
135 
UpArrayScore(FloatHi episcore,FloatHi arrayvalue)136 static FloatHi UpArrayScore (FloatHi episcore, FloatHi arrayvalue)
137 {
138   return episcore * arrayvalue;
139 }
140 
UpdateScore(FloatHiPtr fptrscan,FloatHi episcore,Int4 start,Int4 end)141 static void UpdateScore (FloatHiPtr fptrscan, FloatHi episcore,
142                          Int4 start, Int4 end)
143 {
144   Int4 i;
145 
146   for (i = start; i <= end; i++)
147   {
148     if (episcore < *fptrscan)
149       *fptrscan = episcore;
150     fptrscan++;
151   }
152   return;
153 }
154 
MaxScore(Int4 window,Int4 possibleresidues)155 static FloatHi MaxScore (Int4 window, Int4 possibleresidues)
156 {
157   FloatHi maxscore;
158   Int4    a, b, m, n;
159 
160   a = window / possibleresidues;
161   if ((m = window % possibleresidues) != 0)
162     a++;
163   else
164     m = possibleresidues;
165   b = a + 1;
166   n = possibleresidues - m;
167   maxscore = (FloatHi) (pow ((FloatHi) a, (FloatHi) n) *
168                         pow ((FloatHi) b, (FloatHi) m));
169   return maxscore;
170 }
171 
PercentEpiScore(FloatHi episcore,Int4 win,FloatHi maxscore)172 static FloatHi PercentEpiScore (FloatHi episcore, Int4 win, FloatHi maxscore)
173 {
174   return (episcore - (FloatHi) (win + 1)) * 100.0 / maxscore;
175 }
176 
PredictEpiI0(CharPtr seqin,Int4 start,Int4 end,Int4 window,EpiDatPtr epip,FloatHiPtr fptrhead,FloatHi percentcut)177 static Int4 PredictEpiI0 (CharPtr seqin, Int4 start, Int4 end, Int4 window,
178                           EpiDatPtr epip, FloatHiPtr fptrhead,
179                           FloatHi percentcut)
180 {
181   Int4       i, j, k;
182   CharPtr    seq;
183   FloatHiPtr fptr;
184   FloatHi    maxscore, episcore, percentscore;
185   Int4       minwin;
186 
187   if (seqin == NULL || epip == NULL)
188     return -1;
189 
190   seq = &seqin[start];
191   fptr = &fptrhead[start];
192   minwin = window;
193 
194 /* "zero" array for every cycle */
195   for (j = (Int4) 'A'; j <= (Int4) 'Z'; j++)
196     epip->epiarray[j] = 1;
197 
198 /* minwin-mer set up */
199   k = start + minwin - 1;
200   for (j = start; j < k && j <= end; j++)
201   {
202     switch (*seq)
203     {
204      case '*':
205      case 'B':
206      case 'J':
207      case 'O':
208      case 'U':
209      case 'X':
210      case 'Z':
211       epip->epiarray['X'] += 1;
212       break;
213      default:
214       epip->epiarray[*seq] += 1;
215       break;
216     }
217     seq++;
218   }
219   maxscore = MaxScore (window, POSSIBLE_AA_RESIDUES);
220   episcore = ArrayScore (epip->epiarray);
221 
222 /* run -- rolling scan */
223   for (i = k; i <= end; i++)
224   {
225     switch (*seq)
226     {
227      case '*':
228      case 'B':
229      case 'J':
230      case 'O':
231      case 'U':
232      case 'X':
233      case 'Z':
234       episcore = DownArrayScore (episcore, epip->epiarray['X']);
235       epip->epiarray['X'] += 1;
236       episcore = UpArrayScore (episcore, epip->epiarray['X']);
237       break;
238      default:
239       episcore = DownArrayScore (episcore, epip->epiarray[*seq]);
240       epip->epiarray[*seq] += 1;
241       episcore = UpArrayScore (episcore, epip->epiarray[*seq]);
242       break;
243     }
244     if ((percentscore = PercentEpiScore (episcore, window, maxscore))
245           <= percentcut)
246       UpdateScore (fptr, percentscore, i-window+1, i);
247     fptr++;
248     seq++;
249     switch (*(seq-window))
250     {
251      case '*':
252      case 'B':
253      case 'J':
254      case 'O':
255      case 'U':
256      case 'X':
257      case 'Z':
258       episcore = DownArrayScore (episcore, epip->epiarray['X']);
259       epip->epiarray['X'] -= 1;
260       episcore = UpArrayScore (episcore, epip->epiarray['X']);
261       break;
262      default:
263       episcore = DownArrayScore (episcore, epip->epiarray[*(seq-window)]);
264       epip->epiarray[*(seq-window)] -= 1;
265       episcore = UpArrayScore (episcore, epip->epiarray[*(seq-window)]);
266       break;
267     }
268   }
269   return (i - 1);
270 }
271 
NumericSequence(CharPtr seq,Int4 start,Int4 stop,Int4 epi)272 static Int4Ptr NumericSequence (CharPtr seq, Int4 start, Int4 stop, Int4 epi)
273 {
274   Int4Ptr    sn, seqnumarray;
275   Int4       i, k;
276 
277   if ((sn = seqnumarray = (Int4Ptr) MemNew ((size_t)
278                            (sizeof (Int4) * (stop-start+1)))) == NULL)
279   {
280     return NULL;
281   }
282 /* rock */
283   for (i = start; i < start+epi-1; i++)
284   {
285     *sn *= POSSIBLE_NA_RESIDUES;
286     switch (*seq)
287     {
288      case 'A':
289       *sn += 0;
290       break;
291      case 'C':
292       *sn += 1;
293       break;
294      case 'G':
295       *sn += 2;
296       break;
297      case 'T':
298       *sn += 3;
299       break;
300      default:
301       *sn += 4;
302       break;
303     }
304     seq++;
305   }
306 /* roll */
307   k = (Int4) pow (POSSIBLE_NA_RESIDUES, (FloatHi) (epi-1));
308   for (i = start+epi-1; i <= stop; i++)
309   {
310     *sn *= POSSIBLE_NA_RESIDUES;
311     switch (*seq)
312     {
313      case 'A':
314       *sn += 0;
315       break;
316      case 'C':
317       *sn += 1;
318       break;
319      case 'G':
320       *sn += 2;
321       break;
322      case 'T':
323       *sn += 3;
324       break;
325      default:
326       *sn += 4;
327       break;
328     }
329     *(sn+1) = *sn;
330     seq++;
331     switch (*(seq-epi))
332     {
333      case 'A':
334       *(sn+1) -= (0 * k);
335       break;
336      case 'C':
337       *(sn+1) -= (1 * k);
338       break;
339      case 'G':
340       *(sn+1) -= (2 * k);
341       break;
342      case 'T':
343       *(sn+1) -= (3 * k);
344       break;
345      default:
346       *(sn+1) -= (4 * k);
347       break;
348     }
349     sn++;
350   }
351   return seqnumarray;
352 }
353 
PredictEpiI1(CharPtr seqin,Int4 start,Int4 end,Int4 window,EpiDatPtr epip,FloatHiPtr fptrhead,FloatHi percentcut,Int4 episode)354 static Int4 PredictEpiI1 (CharPtr seqin, Int4 start, Int4 end, Int4 window,
355                           EpiDatPtr epip, FloatHiPtr fptrhead,
356                           FloatHi percentcut, Int4 episode)
357 {
358   Int4       i, j, k, kpow;
359   CharPtr    seq;
360   FloatHiPtr fptr;
361   FloatHi    maxscore, episcore, percentscore;
362   Int4Ptr    sn, seqnumarray;
363 
364   if (seqin == NULL || epip == NULL)
365     return -1;
366 
367   seq = &seqin[start];
368   fptr = &fptrhead[start];
369 
370 /* "zero" array for every cycle */
371   MemFree (epip->epiarray);
372   kpow = (Int4) pow (POSSIBLE_NA_RESIDUES, (FloatHi) episode);
373   if ((epip->epiarray = (Int4Ptr) MemNew ((size_t)
374        (sizeof (Int4) * (kpow+1)))) == NULL)
375   {
376     return -1;
377   }
378   for (j = 0; j < kpow; j++)
379     epip->epiarray[j] = 1;
380 
381 /* ACGTN for nucleotide sequence */
382   if ((sn = seqnumarray = NumericSequence (seq, start, end, episode)) == NULL)
383     return -1;
384 
385 /* min_window_mer set up */
386   k = start + window - 1;
387   i = 0;
388   for (j = start; j < k && j <= end; j++)
389   {
390     epip->epiarray[sn[i]] += 1;
391     i++;
392   }
393   maxscore = MaxScore (window, kpow);
394   episcore = NucArrayScore (epip->epiarray, kpow);
395 
396 /* run -- rolling scan */
397   for (j = k; j <= end; j++)
398   {
399     episcore = DownArrayScore (episcore, epip->epiarray[sn[i]]);
400     epip->epiarray[sn[i]] += 1;
401     episcore = UpArrayScore (episcore, epip->epiarray[sn[i]]);
402     if ((percentscore = PercentEpiScore (episcore, window, maxscore))
403           <= percentcut)
404       UpdateScore (fptr, percentscore, i-window+1, i+episode-1);
405     fptr++;
406     i++;
407     episcore = DownArrayScore (episcore, epip->epiarray[sn[i-window]]);
408     epip->epiarray[sn[i-window]] -= 1;
409     episcore = UpArrayScore (episcore, epip->epiarray[sn[i-window]]);
410   }
411   MemFree (seqnumarray);
412   return (i - 1);
413 }
414 
PredictEpi(CharPtr seqin,Int4 start,Int4 end,EpiDatPtr epip,Boolean flagIsAA)415 extern FloatHiPtr PredictEpi (CharPtr seqin, Int4 start, Int4 end,
416                               EpiDatPtr epip, Boolean flagIsAA)
417 {
418   Int4       i, n, retval;
419   FloatHiPtr epiprob, epitemp;
420   FloatHi    pcut, dcut;
421   Int4       minwin;
422   FloatHi    mcut;
423 
424   if (seqin == NULL || epip == NULL)
425     return NULL;
426 
427   if (start + (epip->window - 1) > end)
428     return NULL;
429 
430   if ((epiprob = (FloatHiPtr) MemNew ((size_t) (sizeof (FloatHi) *
431                                       (end-start+2)))) == NULL)
432   {
433     return NULL;
434   }
435 
436   if ((epitemp = (FloatHiPtr) MemNew ((size_t) (sizeof (FloatHi) *
437                                       (end-start+2)))) == NULL)
438   {
439     MemFree (epiprob);
440     return NULL;
441   }
442 
443   minwin = epip->mwindow;
444   mcut = epip->mpercentcut;
445   for (i = start; i <= end; i++)
446   {
447     epiprob[i] = 100.0;
448     epitemp[i] = 100.0;
449   }
450   dcut = (epip->percentcut - mcut) / (FloatHi) (epip->window - minwin);
451   for (n = minwin; n <= epip->window; n++)
452   {
453     pcut = mcut + ((FloatHi) (n - minwin) * dcut);
454     if (flagIsAA)
455       retval = PredictEpiI0 (seqin, start, end, n, epip, epitemp, pcut);
456     else
457       retval = PredictEpiI1 (seqin, start, end, n, epip, epitemp, pcut,
458                epip->nucleotuple);
459     if (retval < 0)
460     {
461       ErrPostEx (SEV_ERROR, 1, 1, "EPI prediction error");
462       ErrShow ();
463       break;
464     }
465     for (i = start; i <= end; i++)
466     {
467       if (epitemp[i] < epiprob[i])
468         epiprob[i] = epitemp[i];
469       epitemp[i] = 100.0;
470     }
471   }
472   MemFree (epitemp);
473   return epiprob;
474 }
475 
476 /*
477  seqport should be opened full length (0 to bsp->length-1)
478  start and end reflect where you want to search
479 */
480 
PredictEpiSeqPort(SeqPortPtr spp,Int4 start,Int4 end,EpiDatPtr epip,Boolean flagIsAA)481 extern FloatHiPtr PredictEpiSeqPort (SeqPortPtr spp,
482                                      Int4 start, Int4 end,
483                                      EpiDatPtr epip, Boolean flagIsAA)
484 {
485   Int4        i, chunk;
486   Uint1Ptr    seq, sq;
487   FloatHiPtr  epiprob;
488 
489   if (spp == NULL || epip == NULL)
490     return NULL;
491 
492   sq = seq = (Uint1Ptr) MemNew ((size_t) (sizeof (Char) * (end-start+2)));
493   if (seq == NULL)
494     return NULL;
495 
496   i = start;
497   chunk = 4096;
498   while (i < end)
499   {
500     SeqPortRead (spp, sq, (Int2) chunk);
501     sq += chunk;
502     i += chunk;
503   }
504   seq[end-start+1] = 0;
505   epip->sequence = seq;
506   epiprob = PredictEpi ((CharPtr) seq, start, end, epip, flagIsAA);
507   return epiprob;
508 }
509 
PredictEpiBioseq(BioseqPtr bsp,Int4 start,Int4 end,EpiDatPtr epip)510 extern FloatHiPtr PredictEpiBioseq (BioseqPtr bsp,
511                                     Int4 start, Int4 end,
512                                     EpiDatPtr epip)
513 {
514   SeqPortPtr spp;
515   FloatHiPtr epiprob;
516   Boolean    flagIsAA;
517 
518   if (bsp == NULL || epip == NULL)
519     return NULL;
520 
521   flagIsAA = (Boolean) ISA_aa (bsp->mol);
522 
523   if (flagIsAA)
524     spp = SeqPortNew (bsp, 0, bsp->length-1, 0, Seq_code_iupacaa);
525   else
526     spp = SeqPortNew (bsp, 0, bsp->length-1, 0, Seq_code_iupacna);
527   epiprob = PredictEpiSeqPort (spp, start, end, epip, flagIsAA);
528   SeqPortFree (spp);
529   return epiprob;
530 }
531 
PredictEpiSeqLoc(SeqLocPtr slp,EpiDatPtr epip)532 extern FloatHiPtr PredictEpiSeqLoc (SeqLocPtr slp,
533                                     EpiDatPtr epip)
534 {
535   BioseqPtr  bsp;
536   Int4       start, end;
537   FloatHiPtr epiprob;
538 
539   if (slp == NULL || epip == NULL)
540     return NULL;
541 
542   if (slp->choice != SEQLOC_INT)
543     return NULL;
544 
545   if ((bsp = BioseqLockById (SeqLocId (slp))) == NULL)
546     return NULL;
547 
548   start = SeqLocStart (slp);
549   end = SeqLocStop (slp);
550   epiprob = PredictEpiBioseq (bsp, start, end, epip);
551   BioseqUnlock (bsp);
552   return epiprob;
553 }
554 
OptimizeEpiComp(CharPtr seq,Int4 window,SeqLocPtr slp,Boolean flagIsAA)555 static void OptimizeEpiComp (CharPtr seq, Int4 window, SeqLocPtr slp,
556                              Boolean flagIsAA)
557 {
558   Int4      i, start, stop, span;
559   SeqIntPtr sint;
560   Int4      bintot, bin[256];
561   FloatHi   expect, fbin[256];
562 
563   if (seq == NULL || slp == NULL)
564     return;
565   if (StrLen (seq) == 0)
566     return;
567 
568   while (slp != NULL)
569   {
570     bintot = 0;
571     for (i = (Int4) 'A'; i <= (Int4) 'Z'; i++)
572       bin[i] = 0;
573     start = SeqLocStart (slp);
574     stop = SeqLocStop (slp);
575 /* don't have to examine the whole sequence; just end optimization */
576     if (flagIsAA)
577     {
578       span = (stop - start) / 2;
579       if (span > window / 2)
580         span = window / 2;
581     }
582     else
583     {
584       span = (stop - start) - 3;
585       if (span > window - 2)
586         span = window - 2;
587     }
588     if (flagIsAA)
589     {
590       for (i = start; i <= stop; i++)
591       {
592         switch (seq[i])
593         {
594          case '*':
595          case 'B':
596          case 'J':
597          case 'O':
598          case 'U':
599          case 'X':
600          case 'Z':
601           bin['X'] += 1;
602           bintot++;
603           break;
604          default:
605           bin[seq[i]] += 1;
606           bintot++;
607           break;
608         }
609       }
610     }
611     else
612     {
613       for (i = start; i <= stop; i++)
614       {
615         switch (seq[i])
616         {
617          case 'A':
618          case 'C':
619          case 'G':
620          case 'T':
621           bin[seq[i]] += 1;
622           bintot++;
623           break;
624          default:
625           bin['N'] += 1;
626           bintot++;
627           break;
628         }
629       }
630     }
631 /* very small windows biased */
632     expect = 1.0;
633     if (flagIsAA)
634     {
635       if (bintot > POSSIBLE_AA_RESIDUES/2)
636         expect++;
637       expect += (FloatHi) (bintot / POSSIBLE_AA_RESIDUES);
638     }
639     else
640     {
641       if (bintot > POSSIBLE_NA_RESIDUES/2)
642         expect++;
643       expect += (FloatHi) (bintot / POSSIBLE_NA_RESIDUES);
644     }
645     expect /= (FloatHi) bintot;
646     for (i = (Int4) 'A'; i <= (Int4) 'Z'; i++)
647       fbin[i] = (FloatHi) bin[i] / (FloatHi) bintot;
648     sint = slp->data.ptrvalue;
649     for (i = start; i <= start + span; i++)
650     {
651       if (fbin[seq[i]] > expect)
652         break;
653       sint->from += 1;
654     }
655     for (i = stop; i >= stop - span; i--)
656     {
657       if (fbin[seq[i]] > expect)
658         break;
659       sint->to -= 1;
660     }
661     slp = slp->next;
662   }
663   return;
664 }
665 
666 typedef struct HPArray
667 {
668   FloatHi score;
669   Int4    left, right;
670   struct  HPArray PNTR next;
671 } HPA, PNTR HPAPtr;
672 
HPACompProc(VoidPtr ptr1,VoidPtr ptr2)673 static int LIBCALLBACK HPACompProc (VoidPtr ptr1, VoidPtr ptr2)
674 {
675   HPAPtr hpap1, hpap2;
676 
677   if (ptr1 != NULL && ptr2 != NULL)
678   {
679     hpap1 = *((HPAPtr PNTR) ptr1);
680     hpap2 = *((HPAPtr PNTR) ptr2);
681     if (hpap1 != NULL && hpap2 != NULL)
682     {
683       if (hpap1->score > hpap2->score)
684         return 1;
685       else if (hpap1->score < hpap2->score)
686         return -1;
687     }
688   }
689   return 0;
690 }
691 
SortHPArray(HPAPtr PNTR hpap)692 extern void SortHPArray (HPAPtr PNTR hpap)
693 {
694   HPAPtr hpapt, PNTR hpaph;
695   Int4 num, i;
696 
697   num = 0;
698   hpapt = *hpap;
699   while (hpapt != NULL)
700   {
701     num++;
702     hpapt = hpapt->next;
703   }
704 
705   if (num > 1)
706   {
707     hpaph = MemNew ((size_t) ((num+1) * sizeof (HPAPtr)));
708     i = 0;
709     hpapt = *hpap;
710     while (hpapt != NULL && i < num)
711     {
712       hpaph[i] = hpapt;
713       hpapt = hpapt->next;
714       i++;
715     }
716     HeapSort (hpaph, num, sizeof (HPAPtr), HPACompProc);
717     for (i = 0; i < num; ++i)
718     {
719       hpapt = hpaph[i];
720       hpapt->next = hpaph[i+1];
721     }
722     *hpap = hpaph[0];
723     MemFree (hpaph);
724   }
725   return;
726 }
727 
OptimizeEpiHP(CharPtr seq,Int4 window,SeqLocPtr slp,Boolean flagIsAA)728 static void OptimizeEpiHP (CharPtr seq, Int4 window, SeqLocPtr slp,
729                            Boolean flagIsAA)
730 {
731   Int4      i, n, j, k, start, stop, span;
732   SeqIntPtr sint;
733   HPAPtr    hpaph = NULL, hpapt, hpap;
734   EpiDatPtr epip;
735   FloatHi   mincom, maxcom;
736   Int4      wincom;
737 
738   if (seq == NULL || slp == NULL)
739     return;
740   if (StrLen (seq) == 0)
741     return;
742 
743   epip = EpiDatNew ();
744   while (slp != NULL)
745   {
746     sint = slp->data.ptrvalue;
747     start = SeqLocStart (slp);
748     stop = SeqLocStop (slp);
749 /* don't have to examine the whole sequence; just end optimization */
750     if (flagIsAA)
751     {
752       span = (stop - start) / 2;
753       if (span > window / 2)
754         span = window / 2;
755     }
756     else
757     {
758       span = (stop - start) - 3;
759       if (span > window - 2)
760         span = window - 2;
761     }
762     for (i = start; i <= start + span; i++)
763     {
764       for (k = (Int4) 'A'; k <= (Int4) 'Z'; k++)
765         epip->epiarray[k] = 1;
766       if (flagIsAA)
767       {
768         for (j = i; j <= stop; j++)
769         {
770           switch (seq[j])
771           {
772            case '*':
773            case 'B':
774            case 'J':
775            case 'O':
776            case 'U':
777            case 'X':
778            case 'Z':
779             epip->epiarray['X'] += 1;
780             break;
781            default:
782             epip->epiarray[seq[j]] += 1;
783             break;
784           }
785         }
786       }
787       else
788       {
789         for (j = i; j <= stop; j++)
790         {
791           switch (seq[j])
792           {
793            case 'A':
794            case 'C':
795            case 'G':
796            case 'T':
797             epip->epiarray[seq[j]] += 1;
798             break;
799            default:
800             epip->epiarray['N'] += 1;
801             break;
802           }
803         }
804       }
805       for (n = stop; n >= stop - span; n--)
806       {
807         hpap = (HPAPtr) MemNew (sizeof (HPA));
808         if (hpaph == NULL)
809         {
810           hpaph = hpap;
811         }
812         else
813         {
814           hpapt = hpaph;
815           while (hpapt->next != NULL)
816             hpapt = hpapt->next;
817           hpapt->next = hpap;
818         }
819         hpap->score = ArrayScore (epip->epiarray);
820         hpap->left = i;
821         hpap->right = n;
822         wincom = hpap->right - hpap->left + 1;
823         maxcom = MaxScore (wincom, POSSIBLE_AA_RESIDUES);
824         hpap->score = PercentEpiScore (hpap->score, wincom, maxcom);
825         switch (seq[n])
826         {
827          case '*':
828          case 'B':
829          case 'J':
830          case 'O':
831          case 'U':
832          case 'X':
833          case 'Z':
834           epip->epiarray['X'] -= 1;
835           break;
836          default:
837           epip->epiarray[seq[n]] -= 1;
838           break;
839         }
840       }
841     }
842     if (hpaph != NULL)
843     {
844       SortHPArray (&hpaph);
845       hpap = hpaph;
846       mincom = hpap->score;
847       sint->from = hpap->left;
848       sint->to = hpap->right;
849       while (hpap != NULL && mincom >= hpap->score)
850       {
851         if (sint->from > hpap->left)
852           sint->from = hpap->left;
853         if (sint->to < hpap->right)
854           sint->to = hpap->right;
855         hpap = hpap->next;
856       }
857       hpap = hpaph;
858       while (hpap != NULL)
859       {
860         hpaph = hpap->next;
861         MemFree (hpap);
862         hpap = hpaph;
863       }
864     }
865     slp = slp->next;
866   }
867   EpiDatFree (epip);
868   return;
869 }
870 
871 typedef struct CFArray
872 {
873   FloatHi score;
874   Int4    left, right;
875   struct  CFArray PNTR next;
876 } CFA, PNTR CFAPtr;
877 
CFACompProc(VoidPtr ptr1,VoidPtr ptr2)878 static int LIBCALLBACK CFACompProc (VoidPtr ptr1, VoidPtr ptr2)
879 {
880   CFAPtr cfap1, cfap2;
881 
882   if (ptr1 != NULL && ptr2 != NULL)
883   {
884     cfap1 = *((CFAPtr PNTR) ptr1);
885     cfap2 = *((CFAPtr PNTR) ptr2);
886     if (cfap1 != NULL && cfap2 != NULL)
887     {
888       if (cfap1->score > cfap2->score)
889         return 1;
890       else if (cfap1->score < cfap2->score)
891         return -1;
892     }
893   }
894   return 0;
895 }
896 
SortCFArray(CFAPtr PNTR cfap)897 extern void SortCFArray (CFAPtr PNTR cfap)
898 {
899   CFAPtr cfapt, PNTR cfaph;
900   Int4 num, i;
901 
902   num = 0;
903   cfapt = *cfap;
904   while (cfapt != NULL)
905   {
906     num++;
907     cfapt = cfapt->next;
908   }
909 
910   if (num > 1)
911   {
912     cfaph = MemNew ((size_t) ((num+1) * sizeof (CFAPtr)));
913     i = 0;
914     cfapt = *cfap;
915     while (cfapt != NULL && i < num)
916     {
917       cfaph[i] = cfapt;
918       cfapt = cfapt->next;
919       i++;
920     }
921     HeapSort (cfaph, num, sizeof (CFAPtr), CFACompProc);
922     for (i = 0; i < num; ++i)
923     {
924       cfapt = cfaph[i];
925       cfapt->next = cfaph[i+1];
926     }
927     *cfap = cfaph[0];
928     MemFree (cfaph);
929   }
930   return;
931 }
932 
OptimizeEpiProbFunc(CharPtr seq,Int4 window,SeqLocPtr slp,Boolean flagIsAA)933 static void OptimizeEpiProbFunc (CharPtr seq, Int4 window, SeqLocPtr slp,
934                                  Boolean flagIsAA)
935 {
936   Int4      i, n, j, k, start, stop, loop, fact, left, right;
937   Int4      factarr[32], PNTR dfactarr;
938   Int4      alphabet;
939   FloatHi   score, tscore;
940   EpiDatPtr epip;
941   CFAPtr    cfaph = NULL, cfapt, cfap;
942   SeqIntPtr sint;
943 
944   if (seq == NULL || slp == NULL)
945     return;
946   if (StrLen (seq) == 0)
947     return;
948 
949   if (flagIsAA)
950     alphabet = 21;
951   else
952     alphabet = 5;
953   epip = EpiDatNew ();
954   dfactarr = (Int4Ptr) MemNew ((size_t) (sizeof (Int4) * window));
955   while (slp != NULL)
956   {
957     sint = slp->data.ptrvalue;
958     start = SeqLocStart (slp);
959     stop = SeqLocStop (slp);
960     for (i = start; i <= stop - window; i++)
961     {
962       for (k = (Int4) 'A'; k <= (Int4) 'Z'; k++)
963         epip->epiarray[k] = 0;
964       loop = 0;
965       score = -1.0;
966       for (n = i; n <= stop; n++)
967       {
968         if (flagIsAA)
969         {
970           switch (seq[n])
971           {
972            case '*':
973            case 'B':
974            case 'J':
975            case 'O':
976            case 'U':
977            case 'X':
978            case 'Z':
979             epip->epiarray['X'] += 1;
980             break;
981            default:
982             epip->epiarray[seq[n]] += 1;
983             break;
984           }
985         }
986         else
987         {
988           switch (seq[n])
989           {
990            case 'A':
991            case 'C':
992            case 'G':
993            case 'T':
994             epip->epiarray[seq[n]] += 1;
995             break;
996            default:
997             epip->epiarray['N'] += 1;
998             break;
999           }
1000         }
1001         loop++;
1002         if (loop >= window)
1003         {
1004           for (j = 0; j < 32; j++)
1005             factarr[j] = 0;
1006           for (j = 0; j < window; j++)
1007             dfactarr[j] = 0;
1008           for (j = 0, k = (Int4) 'A'; k <= (Int4) 'Z'; j++, k++)
1009             if (epip->epiarray[k] > 0)
1010               factarr[j] = epip->epiarray[k];
1011           for (j = 0; j < 32; j++)
1012             if (factarr[j] > 0)
1013               dfactarr[factarr[j]] += 1;
1014           tscore = 1.0;
1015 /* window */
1016           for (j = 2; j <= loop; j++)
1017             tscore *= (FloatHi) j;
1018 /* residues */
1019           for (j = 0; j < 32; j++)
1020             for (k = 2; k <= factarr[j]; k++)
1021               tscore /= (FloatHi) k;
1022 /* alphabet */
1023           for (j = 2; j <= alphabet; j++)
1024             tscore *= (FloatHi) j;
1025 /* counts */
1026           for (j = 0; j < window; j++)
1027             for (k = 2; k <= dfactarr[j]; k++)
1028               tscore /= (FloatHi) k;
1029           fact = 0;
1030           for (j = 0; j < window; j++)
1031             if (dfactarr[j] > 0)
1032               fact++;
1033           fact = alphabet - fact;
1034           for (j = 2; j < fact; j++)
1035             tscore /= (FloatHi) j;
1036 /* score */
1037           if (score < 0.0)
1038           {
1039             left = i;
1040             right = n;
1041             score = tscore;
1042           }
1043           if (tscore < score)
1044           {
1045             left = i;
1046             right = n;
1047             score = tscore;
1048           }
1049         }
1050         cfap = (CFAPtr) MemNew (sizeof (CFA));
1051         cfap->score = score;
1052         cfap->left = left;
1053         cfap->right = right;
1054         if (cfaph == NULL)
1055         {
1056           cfaph = cfap;
1057         }
1058         else
1059         {
1060           cfapt = cfaph;
1061           while (cfapt->next != NULL)
1062             cfapt = cfapt->next;
1063           cfapt->next = cfap;
1064         }
1065       }
1066       if (cfaph != NULL)
1067       {
1068         SortCFArray (&cfaph);
1069         cfap = cfaph;
1070         tscore = cfap->score;
1071         sint->from = cfap->left;
1072         sint->to = cfap->right;
1073         while (cfap != NULL && tscore >= cfap->score)
1074         {
1075           if (sint->from > cfap->left)
1076             sint->from = cfap->left;
1077           if (sint->to < cfap->right)
1078             sint->to = cfap->right;
1079           cfap = cfap->next;
1080         }
1081         cfap = cfaph;
1082         while (cfap != NULL)
1083         {
1084           cfaph = cfap->next;
1085           MemFree (cfap);
1086           cfap = cfaph;
1087         }
1088       }
1089     }
1090     slp = slp->next;
1091   }
1092   EpiDatFree (epip);
1093   MemFree (dfactarr);
1094   return;
1095 }
1096 
FilterEpi(EpiDatPtr epip,CharPtr seq,Int4 length,SeqIdPtr sip,Boolean flagHighPass,Boolean flagIsAA,Boolean flagBoundaryCondition)1097 extern SeqLocPtr FilterEpi (EpiDatPtr epip, CharPtr seq, Int4 length,
1098                             SeqIdPtr sip, Boolean flagHighPass,
1099                             Boolean flagIsAA,
1100                             Boolean flagBoundaryCondition)
1101 {
1102   Int4       i;
1103   Int4       start, stop;
1104   SeqLocPtr  nextslp, slp, slptmp, slphp, slph = NULL;
1105   SeqIntPtr  sint;
1106   FloatHi    curscore, lopr, hipr;
1107 
1108   if (epip == NULL || epip->score == NULL)
1109     return NULL;
1110 
1111   if (flagBoundaryCondition)
1112   {
1113     lopr = epip->percentcut * 0.85;
1114     hipr = epip->percentcut + (epip->percentcut - lopr);
1115     for (i = 0; i < length; i++)
1116     {
1117       if (epip->score[i] >= lopr && epip->score[i] <= hipr)
1118         break;
1119     }
1120     if (i == length)
1121       return NULL;
1122 
1123     while (i < length)
1124     {
1125       curscore = epip->score[i];
1126       if (epip->score[i] >= lopr && epip->score[i] <= hipr)
1127       {
1128         start = i;
1129         while (epip->score[i] >= lopr && epip->score[i] <= hipr && i < length)
1130         {
1131           if (epip->score[i] > curscore)
1132             curscore = epip->score[i];
1133           i++;
1134         }
1135         stop = i - 1;
1136         slp = SeqLocIntNew (start, stop, Seq_strand_unknown, sip);
1137         ValNodeLink (&slph, slp);
1138       }
1139       else
1140       {
1141         i++;
1142       }
1143     }
1144   }
1145   else
1146   {
1147     for (i = 0; i < length; i++)
1148     {
1149       if (epip->score[i] <= epip->percentcut)
1150         break;
1151     }
1152     if (i == length)
1153       return NULL;
1154 
1155     while (i < length)
1156     {
1157       curscore = epip->score[i];
1158       if (epip->score[i] <= epip->percentcut)
1159       {
1160         start = i;
1161         while (epip->score[i] <= epip->percentcut && i < length)
1162         {
1163           if (epip->score[i] > curscore)
1164             curscore = epip->score[i];
1165           i++;
1166         }
1167         stop = i - 1;
1168         slp = SeqLocIntNew (start, stop, Seq_strand_unknown, sip);
1169         ValNodeLink (&slph, slp);
1170       }
1171       else
1172       {
1173         i++;
1174       }
1175     }
1176   }
1177 
1178   switch (epip->method)
1179   {
1180    default:
1181    case 0:
1182     break;
1183    case 1:
1184     OptimizeEpiComp (seq, epip->window, slph, flagIsAA);
1185     break;
1186    case 2:
1187     OptimizeEpiHP (seq, epip->window, slph, flagIsAA);
1188     break;
1189    case 3:
1190     OptimizeEpiProbFunc (seq, epip->window, slph, flagIsAA);
1191     break;
1192   }
1193 
1194   if (epip->linker > 0)
1195   {
1196     slp = slph;
1197     while (slp != NULL)
1198     {
1199       if (slp->next != NULL)
1200       {
1201         nextslp = slp->next;
1202         stop = SeqLocStop (slp);
1203         start = SeqLocStart (nextslp);
1204         if (start-stop-1 <= epip->linker)
1205         {
1206           sint = slp->data.ptrvalue;
1207           sint->to = SeqLocStop (nextslp);
1208           slp->next = nextslp->next;
1209           nextslp->next = NULL;
1210           SeqLocFree (nextslp);
1211           continue;
1212         }
1213       }
1214       slp = slp->next;
1215     }
1216   }
1217 
1218   if (flagHighPass)
1219   {
1220     slp = slph;
1221     slphp = NULL;
1222     stop = -1;
1223     while (slp != NULL)
1224     {
1225       start = SeqLocStart (slp);
1226       if (start > 0)
1227       {
1228         slptmp = SeqLocIntNew (stop+1, start-1, Seq_strand_unknown, sip);
1229         ValNodeLink (&slphp, slptmp);
1230       }
1231       stop = SeqLocStop (slp);
1232       nextslp = slp->next;
1233       slp->next = NULL;
1234       SeqLocFree (slp);
1235       slp = nextslp;
1236     }
1237     if (stop != length-1)
1238     {
1239       slptmp = SeqLocIntNew (stop+1, length-1, Seq_strand_unknown, sip);
1240       ValNodeLink (&slphp, slptmp);
1241     }
1242     slph = slphp;
1243   }
1244 
1245   return slph;
1246 }
1247 
FilterEpiBioseq(EpiDatPtr epip,BioseqPtr bsp,Boolean flagHighPass,Boolean flagBoundaryCondition)1248 extern SeqLocPtr FilterEpiBioseq (EpiDatPtr epip, BioseqPtr bsp,
1249                                   Boolean flagHighPass,
1250                                   Boolean flagBoundaryCondition)
1251 {
1252   Int4       i, chunk;
1253   SeqLocPtr  slp;
1254   SeqPortPtr spp;
1255   Uint1Ptr   sq;
1256   Boolean    flagIsAA;
1257 
1258   if (epip == NULL || epip->score == NULL || bsp == NULL)
1259     return NULL;
1260 
1261   flagIsAA = (Boolean) ISA_aa (bsp->mol);
1262 
1263   if (epip->method != 0)
1264   {
1265     if (epip->sequence == NULL)
1266     {
1267       sq = epip->sequence = (Uint1Ptr) MemNew ((size_t) (sizeof (Char) *
1268                                                (bsp->length + 2)));
1269       if (flagIsAA)
1270         spp = SeqPortNew (bsp, 0, bsp->length-1, 0, Seq_code_iupacaa);
1271       else
1272         spp = SeqPortNew (bsp, 0, bsp->length-1, 0, Seq_code_iupacna);
1273       i = 0;
1274       chunk = 4096;
1275       while (i < bsp->length)
1276       {
1277         SeqPortRead (spp, sq, (Int2) chunk);
1278         sq += chunk;
1279         i += chunk;
1280       }
1281       epip->sequence[bsp->length] = 0;
1282       SeqPortFree (spp);
1283     }
1284     slp = FilterEpi (epip, (CharPtr) epip->sequence, bsp->length, bsp->id,
1285                      flagHighPass, flagIsAA, flagBoundaryCondition);
1286   }
1287   else
1288   {
1289     slp = FilterEpi (epip, NULL, bsp->length, bsp->id, flagHighPass, flagIsAA,
1290                      flagBoundaryCondition);
1291   }
1292 
1293   return slp;
1294 }
1295