1 /*  seqport.c
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * File Name:  seqport.c
27 *
28 * Author:  James Ostell
29 *
30 * Version Creation Date: 7/13/91
31 *
32 * $Revision: 6.198 $
33 *
34 * File Description:  Ports onto Bioseqs
35 *
36 * Modifications:
37 * --------------------------------------------------------------------------
38 * Date       Name        Description of modification
39 * -------  ----------  -----------------------------------------------------
40 *
41 * ==========================================================================
42 */
43 
44 /** for ErrPostEx() ****/
45 
/* Module name string; exposed through the THIS_MODULE macro
   (for ErrPostEx(), per the section banner above). */
static char *this_module = "ncbiapi";
#define THIS_MODULE this_module
/* Name of this source file as given by __FILE__; exposed through the
   THIS_FILE macro (for ErrPostEx(), per the section banner above). */
static char *this_file = __FILE__;
#define THIS_FILE this_file
50 
51 /**********************/
52 
53 
54 #include <seqport.h>
55 #include <edutil.h>    /* for SeqLoc creation functions */
56 #include <gather.h>    /* for SeqLocOffset function */
57 #include <sqnutils.h>
58 #include <explore.h>   /* for BioseqFindFromSeqLoc function */
59 #include <subutil.h>
60 #include <tofasta.h>   /* for FastaSeqLineEx function */
61 #include <salutil.h>
62 #include <alignmgr2.h> /* for correcting alignments when converting to delta */
63 
64 
/* Forward declaration; the definition of SeqPortAdjustLength() appears
   further down in this file. */
NLM_EXTERN Boolean LIBCALL SeqPortAdjustLength (SeqPortPtr spp);
66 
67 /*****************************************************************************
68 *
69 *   Fast mapping arrays
70 *
71 *****************************************************************************/
72 
/* Each table starts out NULL and is allocated and filled on first use by the
   matching Init...() function below; it is indexed by the value of one
   packed input byte. */
static Uint1Ptr Na2toIUPAC = NULL;         /* 1024 bytes: 4 IUPAC letters per packed byte */
static Uint1Ptr Na4toIUPAC = NULL;         /* 512 bytes: 2 IUPAC letters per packed byte */
static Uint1Ptr Na4toIUPACplusGap = NULL;  /* 512 bytes: as Na4toIUPAC, but code 0 maps to '-' */
static Uint1Ptr Na2toNa4 = NULL;           /* 512 bytes: 2 output bytes per packed byte */
static Uint1Ptr Na2to4Bit = NULL;          /* 1024 bytes: 4 output bytes (1, 2, 4 or 8) per packed byte */
static Uint1Ptr Na4to4Bit = NULL;          /* 512 bytes: 2 output bytes (1..15) per packed byte */
/* Locked by every Init...() function while it checks and builds its table. */
static TNlmMutex seqport_mutex = NULL;
80 
81 
82 /*****************************************************************************
83 *
84 *   MapNa2ByteToIUPACString and MapNa4ByteToIUPACString now copy directly to
85 *     the expanded character buffer for efficiency
86 *
87 *****************************************************************************/
88 
InitNa2toIUPAC(void)89 static void InitNa2toIUPAC (void)
90 
91 {
92   Int2  base [4], index, j;
93   Char  convert [4] = {'A', 'C', 'G', 'T'};
94   Int4  ret;
95   Uint1Ptr Na2toIUPAC_local = NULL;
96 
97   ret = NlmMutexLockEx (&seqport_mutex);  /* protect this section */
98   if (ret) {
99     ErrPostEx (SEV_FATAL, 0, 0, "MapNa2ByteToIUPACString mutex failed [%ld]", (long) ret);
100     return;
101   }
102 
103   if (Na2toIUPAC == NULL) {
104     Na2toIUPAC_local = MemNew (sizeof (Uint1) * 1024);
105 
106     if (Na2toIUPAC_local != NULL) {
107       for (base [0] = 0; base [0] < 4; (base [0])++) {
108         for (base [1] = 0; base [1] < 4; (base [1])++) {
109           for (base [2] = 0; base [2] < 4; (base [2])++) {
110             for (base [3] = 0; base [3] < 4; (base [3])++) {
111               index = 4 * (base [0] * 64 + base [1] * 16 + base [2] * 4 + base [3]);
112               for (j = 0; j < 4; j++) {
113                 Na2toIUPAC_local [index + j] = convert [(base [j])];
114               }
115             }
116           }
117         }
118       }
119     }
120     Na2toIUPAC = Na2toIUPAC_local;
121   }
122 
123   NlmMutexUnlock (seqport_mutex);
124 }
125 
MapNa2ByteToIUPACString(Uint1Ptr bytep,Uint4Ptr buf,Int4 total)126 NLM_EXTERN Uint4Ptr LIBCALL MapNa2ByteToIUPACString (Uint1Ptr bytep, Uint4Ptr buf, Int4 total)
127 
128 {
129   Uint4Ptr  bp;
130   Uint1     byte;
131   Int2      index;
132   Int4      k;
133   Uint4Ptr  ptr;
134 
135   if (bytep == NULL || buf == NULL) return buf;
136   ptr = buf;
137 
138   /* initialize array if not yet set (first time function is called) */
139 
140   if (Na2toIUPAC == NULL) {
141     InitNa2toIUPAC ();
142   }
143 
144   if (Na2toIUPAC == NULL) return buf;
145 
146   /* now return 4 character string for each compressed byte */
147 
148   for (k = 0; k < total; k++) {
149     byte = *bytep;
150     bytep++;
151     index = 4 * byte;
152     bp = (Uint4Ptr) (Na2toIUPAC + index);
153     /* copy 4 bytes at a time */
154     /*
155     for (j = 0; j < 4; j++) {
156       *ptr = *bp;
157       ptr++;
158       bp++;
159     }
160     */
161     *ptr = *bp;
162     ptr++;
163   }
164 
165   return ptr;
166 }
167 
InitNa4toIUPAC(void)168 static void InitNa4toIUPAC (void)
169 
170 {
171   Int2  base [2], index, j;
172   Char  convert [16] = {'N', 'A', 'C', 'M', 'G', 'R', 'S', 'V',
173                         'T', 'W', 'Y', 'H', 'K', 'D', 'B', 'N'};
174   Int4  ret;
175   Uint1Ptr Na4toIUPAC_local = NULL;
176 
177   ret = NlmMutexLockEx (&seqport_mutex);  /* protect this section */
178   if (ret) {
179     ErrPostEx (SEV_FATAL, 0, 0, "MapNa4ByteToIUPACString mutex failed [%ld]", (long) ret);
180     return;
181   }
182 
183   if (Na4toIUPAC == NULL) {
184     Na4toIUPAC_local = MemNew (sizeof (Uint1) * 512);
185 
186     if (Na4toIUPAC_local != NULL) {
187       for (base [0] = 0; base [0] < 16; (base [0])++) {
188         for (base [1] = 0; base [1] < 16; (base [1])++) {
189           index = 2 * (base [0] * 16 + base [1]);
190           for (j = 0; j < 2; j++) {
191             Na4toIUPAC_local [index + j] = convert [(base [j])];
192           }
193         }
194       }
195     }
196     Na4toIUPAC = Na4toIUPAC_local;
197   }
198 
199   NlmMutexUnlock (seqport_mutex);
200 }
201 
MapNa4ByteToIUPACString(Uint1Ptr bytep,Uint2Ptr buf,Int4 total)202 NLM_EXTERN Uint2Ptr LIBCALL MapNa4ByteToIUPACString (Uint1Ptr bytep, Uint2Ptr buf, Int4 total)
203 
204 {
205   Uint2Ptr  bp;
206   Uint1     byte;
207   Int2      index;
208   Int4      k;
209   Uint2Ptr  ptr;
210 
211   if (bytep == NULL || buf == NULL) return buf;
212   ptr = buf;
213 
214   /* initialize array if not yet set (first time function is called) */
215 
216   if (Na4toIUPAC == NULL) {
217     InitNa4toIUPAC ();
218   }
219 
220   if (Na4toIUPAC == NULL) return buf;
221 
222   /* now return 2 character string for each compressed byte */
223 
224   for (k = 0; k < total; k++) {
225     byte = *bytep;
226     bytep++;
227     index = 2 * byte;
228     bp = (Uint2Ptr)  (Na4toIUPAC + index);
229     /* copy 2 bytes at a time */
230     /*
231     for (j = 0; j < 2; j++) {
232       *ptr = *bp;
233       ptr++;
234       bp++;
235     }
236     */
237     *ptr = *bp;
238     ptr++;
239   }
240 
241   return ptr;
242 }
243 
InitNa4toIUPACplusGap(void)244 static void InitNa4toIUPACplusGap (void)
245 
246 {
247   Int2  base [2], index, j;
248   Char  convert [16] = {'-', 'A', 'C', 'M', 'G', 'R', 'S', 'V',
249                         'T', 'W', 'Y', 'H', 'K', 'D', 'B', 'N'};
250   Int4  ret;
251   Uint1Ptr Na4toIUPACplusGap_local = NULL;
252 
253   ret = NlmMutexLockEx (&seqport_mutex);  /* protect this section */
254   if (ret) {
255     ErrPostEx (SEV_FATAL, 0, 0, "MapNa4ByteToIUPACplusGapString mutex failed [%ld]", (long) ret);
256     return;
257   }
258 
259   if (Na4toIUPACplusGap == NULL) {
260     Na4toIUPACplusGap_local = MemNew (sizeof (Uint1) * 512);
261 
262     if (Na4toIUPACplusGap_local != NULL) {
263       for (base [0] = 0; base [0] < 16; (base [0])++) {
264         for (base [1] = 0; base [1] < 16; (base [1])++) {
265           index = 2 * (base [0] * 16 + base [1]);
266           for (j = 0; j < 2; j++) {
267             Na4toIUPACplusGap_local [index + j] = convert [(base [j])];
268           }
269         }
270       }
271     }
272     Na4toIUPACplusGap = Na4toIUPACplusGap_local;
273   }
274 
275   NlmMutexUnlock (seqport_mutex);
276 }
277 
MapNa4ByteToIUPACplusGapString(Uint1Ptr bytep,Uint2Ptr buf,Int4 total)278 NLM_EXTERN Uint2Ptr LIBCALL MapNa4ByteToIUPACplusGapString (Uint1Ptr bytep, Uint2Ptr buf, Int4 total)
279 
280 {
281   Uint2Ptr  bp;
282   Uint1     byte;
283   Int2      index;
284   Int4      k;
285   Uint2Ptr  ptr;
286 
287   if (bytep == NULL || buf == NULL) return buf;
288   ptr = buf;
289 
290   /* initialize array if not yet set (first time function is called) */
291 
292   if (Na4toIUPACplusGap == NULL) {
293     InitNa4toIUPACplusGap ();
294   }
295 
296   if (Na4toIUPACplusGap == NULL) return buf;
297 
298   /* now return 2 character string for each compressed byte */
299 
300   for (k = 0; k < total; k++) {
301     byte = *bytep;
302     bytep++;
303     index = 2 * byte;
304     bp = (Uint2Ptr)  (Na4toIUPACplusGap + index);
305     /* copy 2 bytes at a time */
306     /*
307     for (j = 0; j < 2; j++) {
308       *ptr = *bp;
309       ptr++;
310       bp++;
311     }
312     */
313     *ptr = *bp;
314     ptr++;
315   }
316 
317   return ptr;
318 }
319 
InitNa2toNa4(void)320 static void InitNa2toNa4 (void)
321 
322 {
323   Int2   pair [2], index, j;
324   Uint1  convert [16] = {17,  18,  20,  24,  33,  34,  36,  40,
325                          65,  66,  68,  72, 129, 130, 132, 136};
326   Int4   ret;
327   Uint1Ptr Na2toNa4_local = NULL;
328 
329   ret = NlmMutexLockEx (&seqport_mutex);  /* protect this section */
330   if (ret) {
331     ErrPostEx (SEV_FATAL, 0, 0, "MapNa2ByteToIUPACString mutex failed [%ld]", (long) ret);
332     return;
333   }
334 
335   if (Na2toNa4 == NULL) {
336     Na2toNa4_local = MemNew (sizeof (Uint1) * 512);
337 
338     if (Na2toNa4_local != NULL) {
339       for (pair [0] = 0; pair [0] < 16; (pair [0])++) {
340         for (pair [1] = 0; pair [1] < 16; (pair [1])++) {
341           index = 2 * (pair [0] * 16 + pair [1]);
342           for (j = 0; j < 2; j++) {
343             Na2toNa4_local [index + j] = convert [(pair [j])];
344           }
345         }
346       }
347     }
348     Na2toNa4 = Na2toNa4_local;
349   }
350 
351   NlmMutexUnlock (seqport_mutex);
352 }
353 
MapNa2ByteToNa4String(Uint1Ptr bytep,Uint2Ptr buf,Int4 total)354 NLM_EXTERN Uint2Ptr LIBCALL MapNa2ByteToNa4String (Uint1Ptr bytep, Uint2Ptr buf, Int4 total)
355 
356 {
357   Uint2Ptr  bp;
358   Uint1     byte;
359   Int2      index;
360   Int4      k;
361   Uint2Ptr  ptr;
362 
363   if (bytep == NULL || buf == NULL) return buf;
364   ptr = buf;
365 
366   /* initialize array if not yet set (first time function is called) */
367 
368   if (Na2toNa4 == NULL) {
369     InitNa2toNa4 ();
370   }
371 
372   if (Na2toNa4 == NULL) return buf;
373 
374   /* now return 2 character byte for each compressed byte */
375 
376   for (k = 0; k < total; k++) {
377     byte = *bytep;
378     bytep++;
379     index = 2 * byte;
380     bp = (Uint2Ptr)  (Na2toNa4 + index);
381     /* copy 2 bytes at a time */
382     /*
383     for (j = 0; j < 2; j++) {
384       *ptr = *bp;
385       ptr++;
386       bp++;
387     }
388     */
389     *ptr = *bp;
390     ptr++;
391   }
392 
393   return ptr;
394 }
395 
InitNa2to4Bit(void)396 static void InitNa2to4Bit (void)
397 
398 {
399   Int2  base [4], index, j;
400   Uint1  convert [4] = {1, 2, 4, 8};
401   Int4  ret;
402   Uint1Ptr Na2to4Bit_local = NULL;
403 
404   ret = NlmMutexLockEx (&seqport_mutex);  /* protect this section */
405   if (ret) {
406     ErrPostEx (SEV_FATAL, 0, 0, "MapNa2ByteTo4BitString mutex failed [%ld]", (long) ret);
407     return;
408   }
409 
410   if (Na2to4Bit == NULL) {
411     Na2to4Bit_local = MemNew (sizeof (Uint1) * 1024);
412 
413     if (Na2to4Bit_local != NULL) {
414       for (base [0] = 0; base [0] < 4; (base [0])++) {
415         for (base [1] = 0; base [1] < 4; (base [1])++) {
416           for (base [2] = 0; base [2] < 4; (base [2])++) {
417             for (base [3] = 0; base [3] < 4; (base [3])++) {
418               index = 4 * (base [0] * 64 + base [1] * 16 + base [2] * 4 + base [3]);
419               for (j = 0; j < 4; j++) {
420                 Na2to4Bit_local [index + j] = convert [(base [j])];
421               }
422             }
423           }
424         }
425       }
426     }
427     Na2to4Bit = Na2to4Bit_local;
428   }
429 
430   NlmMutexUnlock (seqport_mutex);
431 }
432 
MapNa2ByteTo4BitString(Uint1Ptr bytep,Uint4Ptr buf,Int4 total)433 NLM_EXTERN Uint4Ptr LIBCALL MapNa2ByteTo4BitString (Uint1Ptr bytep, Uint4Ptr buf, Int4 total)
434 
435 {
436   Uint4Ptr  bp;
437   Uint1     byte;
438   Int2      index;
439   Int4      k;
440   Uint4Ptr  ptr;
441 
442   if (bytep == NULL || buf == NULL) return buf;
443   ptr = buf;
444 
445   /* initialize array if not yet set (first time function is called) */
446 
447   if (Na2to4Bit == NULL) {
448     InitNa2to4Bit ();
449   }
450 
451   if (Na2to4Bit == NULL) return buf;
452 
453   /* now return 4 byte string for each compressed byte */
454 
455   for (k = 0; k < total; k++) {
456     byte = *bytep;
457     bytep++;
458     index = 4 * byte;
459     bp = (Uint4Ptr) (Na2to4Bit + index);
460     /* copy 4 bytes at a time */
461     /*
462     for (j = 0; j < 4; j++) {
463       *ptr = *bp;
464       ptr++;
465       bp++;
466     }
467     */
468     *ptr = *bp;
469     ptr++;
470   }
471 
472   return ptr;
473 }
474 
InitNa4to4Bit(void)475 static void InitNa4to4Bit (void)
476 
477 {
478   Int2  base [2], index, j;
479   Char  convert [16] = {15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
480   Int4  ret;
481   Uint1Ptr Na4to4Bit_local = NULL;
482 
483   ret = NlmMutexLockEx (&seqport_mutex);  /* protect this section */
484   if (ret) {
485     ErrPostEx (SEV_FATAL, 0, 0, "MapNa4ByteToIUPACString mutex failed [%ld]", (long) ret);
486     return;
487   }
488 
489   if (Na4to4Bit == NULL) {
490     Na4to4Bit_local = MemNew (sizeof (Uint1) * 512);
491 
492     if (Na4to4Bit_local != NULL) {
493       for (base [0] = 0; base [0] < 16; (base [0])++) {
494         for (base [1] = 0; base [1] < 16; (base [1])++) {
495           index = 2 * (base [0] * 16 + base [1]);
496           for (j = 0; j < 2; j++) {
497             Na4to4Bit_local [index + j] = convert [(base [j])];
498           }
499         }
500       }
501     }
502     Na4to4Bit = Na4to4Bit_local;
503   }
504 
505   NlmMutexUnlock (seqport_mutex);
506 }
507 
MapNa4ByteTo4BitString(Uint1Ptr bytep,Uint2Ptr buf,Int4 total)508 NLM_EXTERN Uint2Ptr LIBCALL MapNa4ByteTo4BitString (Uint1Ptr bytep, Uint2Ptr buf, Int4 total)
509 
510 {
511   Uint2Ptr  bp;
512   Uint1     byte;
513   Int2      index;
514   Int4      k;
515   Uint2Ptr  ptr;
516 
517   if (bytep == NULL || buf == NULL) return buf;
518   ptr = buf;
519 
520   /* initialize array if not yet set (first time function is called) */
521 
522   if (Na4to4Bit == NULL) {
523     InitNa4to4Bit ();
524   }
525 
526   if (Na4to4Bit == NULL) return buf;
527 
528   /* now return 2 character string for each compressed byte */
529 
530   for (k = 0; k < total; k++) {
531     byte = *bytep;
532     bytep++;
533     index = 2 * byte;
534     bp = (Uint2Ptr)  (Na4to4Bit + index);
535     /* copy 2 bytes at a time */
536     /*
537     for (j = 0; j < 2; j++) {
538       *ptr = *bp;
539       ptr++;
540       bp++;
541     }
542     */
543     *ptr = *bp;
544     ptr++;
545   }
546 
547   return ptr;
548 }
549 
550 /*****************************************************************************
551 *
552 *   SeqPort Routines
553 *
554 *****************************************************************************/
555 
556 /*****************************************************************************
557 *
558 *   SeqPortFree(spp)
559 *
560 *****************************************************************************/
SeqPortFree(SeqPortPtr spp)561 NLM_EXTERN SeqPortPtr SeqPortFree (SeqPortPtr spp)
562 
563 {
564     SeqPortPtr tspp, nextspp;
565 
566     if (spp == NULL)
567         return NULL;
568 
569     if (spp->locked)              /* locked during access */
570         BioseqUnlock(spp->bsp);   /* make available for freeing */
571 
572     tspp = spp->segs;
573     while (tspp != NULL)
574     {
575         nextspp = tspp->next;
576         SeqPortFree(tspp);
577         tspp = nextspp;
578     }
579 
580     MemFree(spp->cache);
581     MemFree (spp->cacheq);
582 
583     MemFree(spp);
584 
585     return NULL;
586 }
587 
588 /*****************************************************************************
589 *
590 *   SeqPortSetValues(spp)
591 *      Copies the values is_circle, is_seg, and do_virtual from spp to
592 *        any dependent SeqPortPtrs it contains. This is necessary for segmented
593 *        reference, or delta types of Bioseqs and on SeqPortNewByLoc()
594 *
595 *      SeqPortSet_... functions call this function
596 *
597 *****************************************************************************/
SeqPortSetValues(SeqPortPtr spp)598 NLM_EXTERN Boolean LIBCALL SeqPortSetValues (SeqPortPtr spp)
599 {
600     SeqPortPtr tmp;
601 
602     if (spp == NULL)
603         return FALSE;
604 
605     for (tmp = spp->segs; tmp != NULL; tmp = tmp->next)
606     {
607         tmp->is_circle = spp->is_circle;
608         tmp->is_seg = spp->is_seg;
609         tmp->do_virtual = spp->do_virtual;
610         tmp->gapIsZero = spp->gapIsZero;
611 
612         if (tmp->segs != NULL)
613             SeqPortSetValues(tmp);
614     }
615 
616     return TRUE;
617 }
618 
619 
SeqPortSet_is_circle(SeqPortPtr spp,Boolean value)620 NLM_EXTERN Boolean LIBCALL SeqPortSet_is_circle (SeqPortPtr spp, Boolean value)
621 {
622     if (spp == NULL)
623         return FALSE;
624     spp->is_circle = value;
625     return SeqPortSetValues(spp);
626 }
627 
SeqPortSet_is_seg(SeqPortPtr spp,Boolean value)628 NLM_EXTERN Boolean LIBCALL SeqPortSet_is_seg (SeqPortPtr spp, Boolean value)
629 {
630     if (spp == NULL)
631         return FALSE;
632     spp->is_seg = value;
633     return SeqPortSetValues(spp);
634 }
635 
636 /**************************************************************
637 *
638 *  This function adjusts the length of seqport to remove virtual
639 *   segments or add them back as needed
640 *
641 **************************************************************/
SeqPortAdjustLength(SeqPortPtr spp)642 NLM_EXTERN Boolean LIBCALL SeqPortAdjustLength (SeqPortPtr spp)
643 {
644     SeqPortPtr tmp;
645     Int4 len = 0;
646 
647     if (spp == NULL)
648         return FALSE;
649 
650 
651     if (spp->isa_virtual)
652     {
653         if (spp->do_virtual)
654             spp->totlen = spp->stop - spp->start + 1;
655         else
656             spp->totlen = 0;
657         if (spp->totlen == 0)
658             spp->isa_null = TRUE;
659         else
660             spp->isa_null = FALSE;
661     }
662     else if (spp->segs != NULL)
663     {
664         for (tmp = spp->segs; tmp != NULL; tmp = tmp->next)
665         {
666             SeqPortAdjustLength (tmp);
667             len += tmp->totlen;
668         }
669         spp->totlen = len;
670     }
671     else if (! spp->isa_null)
672         spp->totlen = spp->stop - spp->start + 1;
673     spp->curpos = -1;  /* reset to unused */
674 
675     return TRUE;
676 
677 }
678 
SeqPortSet_do_virtualEx(SeqPortPtr spp,Boolean value,Boolean gapIsZero)679 NLM_EXTERN Boolean LIBCALL SeqPortSet_do_virtualEx (SeqPortPtr spp, Boolean value, Boolean gapIsZero)
680 {
681     Boolean do_it = FALSE, has_virtual=FALSE;
682     SeqPortPtr tmp;
683 
684     if (spp == NULL)
685         return FALSE;
686 
687     if (spp->isa_virtual == TRUE)
688         has_virtual = TRUE;
689     if (spp->do_virtual != value)
690         do_it = TRUE;
691     if (spp->gapIsZero != gapIsZero)
692         do_it = TRUE;
693     for (tmp = spp->segs; tmp != NULL; tmp = tmp->next)
694     {
695         if (tmp->isa_virtual == TRUE)
696             has_virtual = TRUE;
697         if (tmp->do_virtual != value)
698             do_it = TRUE;
699         if (tmp->gapIsZero != gapIsZero)
700             do_it = TRUE;
701     }
702 
703     if (! do_it)   /* no change needed */
704         return TRUE;
705 
706 
707     spp->do_virtual = value;
708     spp->gapIsZero = gapIsZero;
709     SeqPortSetValues(spp);
710     if (has_virtual)   /* have to check the SeqPort */
711     {
712         SeqPortAdjustLength(spp);
713         SeqPortSeek(spp, 0, SEEK_SET);
714     }
715 
716     return TRUE;
717 }
718 
719 
SeqPortSet_do_virtual(SeqPortPtr spp,Boolean value)720 NLM_EXTERN Boolean LIBCALL SeqPortSet_do_virtual (SeqPortPtr spp, Boolean value)
721 {
722     return SeqPortSet_do_virtualEx (spp, value, FALSE);
723 }
724 
SeqPortSetUpFields(SeqPortPtr spp,Int4 start,Int4 stop,Uint1 strand,Uint1 newcode)725 NLM_EXTERN Boolean LIBCALL SeqPortSetUpFields (SeqPortPtr spp, Int4 start, Int4 stop, Uint1
726 strand, Uint1 newcode)
727 {
728     if (spp == NULL) return FALSE;
729     spp->start = start;
730     spp->stop = stop;
731     spp->strand = strand;
732     spp->curpos = -1;    /* not set */
733     spp->totlen = stop - start + 1;
734     spp->newcode = newcode;
735     spp->sctp = SeqCodeTableFind(newcode);
736 
737     return TRUE;
738 }
SeqPortSetUpAlphabet(SeqPortPtr spp,Uint1 curr_code,Uint1 newcode)739 NLM_EXTERN Boolean LIBCALL SeqPortSetUpAlphabet(SeqPortPtr spp, Uint1 curr_code, Uint1
740 newcode)
741 {
742     if (spp == NULL) return FALSE;
743 
744         spp->oldcode = curr_code;
745         spp->sctp = SeqCodeTableFind(curr_code);
746 
747         switch (curr_code)
748         {
749             case Seq_code_ncbi2na:
750                 spp->bc = 4;            /* bit shifts needed */
751                 spp->rshift = 6;
752                 spp->lshift = 2;
753                 spp->mask = 192;
754                 break;
755             case Seq_code_ncbi4na:
756                 spp->bc = 2;
757                 spp->rshift = 4;
758                 spp->lshift = 4;
759                 spp->mask = 240;
760                 break;
761             default:
762                 spp->bc = 1;
763                 spp->mask = 255;
764                 break;
765         }
766 
767         if ((newcode) && (newcode != curr_code))    /* conversion alphabet */
768         {
769             if ((spp->smtp = SeqMapTableFind(newcode, curr_code)) != NULL)
770                 spp->sctp = SeqCodeTableFind(newcode);
771         }
772 
773         return TRUE;
774 }
775 
776 /*****************************************************************************
777 *
778 *   SeqPortNew(bsp, start, stop, strand, newcode)
779 *       if bsp == NULL, creates an empty port
780 *       see objloc.h for strand defines
781 *
782 *****************************************************************************/
SeqPortNew(BioseqPtr bsp,Int4 start,Int4 stop,Uint1 strand,Uint1 newcode)783 NLM_EXTERN SeqPortPtr SeqPortNew (BioseqPtr bsp, Int4 start, Int4 stop, Uint1 strand, Uint1
784 newcode)
785 
786 {
787     SeqPortPtr spp, spps, sppcurr = NULL, spprev, prev, curr = NULL;
788     Uint1 curr_code, repr, tstrand = 0;
789     SeqLocPtr the_segs = NULL, currseg;
790     Int4 len, ctr, tlen = 0, tfrom = 0, tto = 0, xfrom, xto, tstart, tstop;
791     Char errbuf[41], idbuf[41];
792     ValNode fake;
793     Boolean done, started;
794     BioseqPtr tbsp;
795     ValNodePtr currchunk;  /* can be a SeqLoc or an element of a Delta Seq
796 */
797     Boolean do_multi_loc, cycle2;
798     SeqLitPtr slitp = NULL;
799     SeqIdPtr tsip;
800 
801     spp = (SeqPortPtr) MemNew(sizeof(SeqPort));
802     errbuf[0] = '\0';
803 
804     if (bsp == NULL)     /* a NULL section */
805         return spp;
806 
807     spp->bsp = bsp;                    /* get ready for error
808 msgs */
809     SeqIdWrite(SeqIdFindBest(bsp->id, 0), errbuf, PRINTID_FASTA_SHORT, 40);
810     len = BioseqGetLen(bsp);
811     if (start < 0)
812     {
813         ErrPostEx(SEV_ERROR, 0,0  ,
814                  "SeqPortNew: %s start (%ld)< 0", errbuf,
815 (long)start);
816         MemFree(spp);
817         return NULL;
818     }
819     if (start >= len)
820     {
821         ErrPostEx(SEV_ERROR,0,0,
822                  "SeqPortNew: %s start(%ld) >= len(%ld)",
823                     errbuf, (long)start, (long)len);
824         MemFree(spp);
825         return NULL;
826     }
827     if (stop == LAST_RESIDUE)
828         stop = len - 1;
829     else if (stop < start)
830     {
831         ErrPostEx(SEV_ERROR,0,0,
832                  "SeqPortNew: %s stop(%ld) < start(%ld)",
833                     errbuf, (long)stop, (long)start);
834         MemFree(spp);
835         return NULL;
836     }
837     else if (stop >= len)
838     {
839         ErrPostEx(SEV_ERROR,0,0,
840                  "SeqPortNew: %s stop(%ld) >= len(%ld)",
841                     errbuf, (long)stop, (long)len);
842         MemFree(spp);
843         return NULL;
844     }
845 
846     SeqPortSetUpFields (spp, start,stop, strand, newcode);
847 
848     spp->currnum = BioseqGetSeqDescr(bsp, Seq_descr_num, NULL);
849     if (spp->currnum == NULL)   /* no numbering set */
850         spp->currnum = NumberingDefaultGet();   /* use default */
851 
852     repr = Bioseq_repr(bsp);
853     if ((repr == Seq_repr_virtual) ||    /* virtual sequence */
854         (repr == Seq_repr_map ))         /* map sequence */
855     {
856         spp->isa_virtual = TRUE;
857         spp->curpos = 0;
858     }
859     else if ((repr == Seq_repr_seg) ||   /* segmented */
860         (repr == Seq_repr_ref) ||     /* reference */
861         (repr == Seq_repr_delta))     /* delta */
862     {
863         spp->oldcode = 0;        /* no code, not raw */
864 
865         if (repr == Seq_repr_seg)  /* segmented */
866         {
867             fake.choice = SEQLOC_MIX;   /* make SEQUENCE OF Seq-loc,
868 into one */
869             fake.data.ptrvalue = bsp->seq_ext;
870             fake.next = NULL;
871             the_segs = (SeqLocPtr)&fake;
872         }
873         else if (repr == Seq_repr_ref)        /* reference: is a Seq-loc
874 */
875             the_segs = (SeqLocPtr)bsp->seq_ext;
876 
877         if (repr == Seq_repr_delta)   /* chain of deltas to follow */
878             currchunk = (ValNodePtr)(bsp->seq_ext);
879         else                          /* seqlocs */
880             currchunk = (ValNodePtr)SeqLocFindNext(the_segs, NULL);
881 
882         currseg = NULL;
883         ctr = 0;
884         done = FALSE;
885         started = FALSE;
886         while ((! done) && (currchunk != NULL))
887         {
888             do_multi_loc = FALSE;
889             cycle2 = TRUE;     /* only really needed for complicated
890 delta seq locs */
891             currseg = NULL;
892             if (repr == Seq_repr_delta)
893             {
894                 if (currchunk->choice == 1)  /* it's a SeqLocPtr
895 */
896                 {
897                     currseg =
898 (SeqLocPtr)(currchunk->data.ptrvalue);
899                     if (! IS_one_loc(currseg, FALSE)) /*
900 don't do complicated cases here */
901                     {
902                         do_multi_loc = TRUE;
903                         currseg =
904 SeqLocFindNext((SeqLocPtr)(currchunk->data.ptrvalue), NULL);
905                     }
906                 }
907                 else                         /* it's a SeqLitPtr
908 */
909                 {
910                     currseg = NULL;
911                     slitp =
912 (SeqLitPtr)(currchunk->data.ptrvalue);
913                     tlen = slitp->length;
914                     tstrand = Seq_strand_plus;
915                     tfrom = 0;
916                     tto = tlen - 1;
917                 }
918             }
919             else
920                 currseg = (SeqLocPtr)currchunk;
921 
922             while (cycle2)   /* normally once, except for
923 complicated delta locs */
924             {
925                 if (currseg != NULL)   /* for segs and deltas of
926 type loc */
927                 {
928                     tlen = SeqLocLen(currseg);
929                     tstrand = SeqLocStrand(currseg);
930                     tfrom = SeqLocStart(currseg);
931                     tto = SeqLocStop(currseg);
932                 }
933 
934                 if (! started)
935                 {
936                     if ((ctr + tlen - 1) >= start)
937                     {
938                         tstart = start - ctr;
939                         started = TRUE;
940                     }
941                     else
942                         tstart = -1;
943                 }
944                 else
945                     tstart = 0;
946 
947                 if (tstart >= 0)   /* have a start */
948                 {
949                     if ((ctr + tlen - 1) >= stop)
950                     {
951                         done = TRUE;   /* hit the end */
952                         tstop = ((ctr + tlen - 1) -
953 stop);
954                     }
955                     else
956                         tstop = 0;
957 
958                     if (tstrand == Seq_strand_minus)
959                     {
960                         xfrom = tfrom + tstop;
961                         xto = tto - tstart;
962                     }
963                     else
964                     {
965                         xfrom = tfrom + tstart;
966                         xto = tto - tstop;
967                     }
968 
969                     if (currseg != NULL)    /* working off locs */
970                     {
971                         if (currseg->choice == SEQLOC_NULL)
972                         {
973                             tbsp = NULL;
974                             spps = SeqPortNew(tbsp, xfrom, xto, tstrand, newcode);
975                             spps->isa_null = TRUE;
976                         }
977                         else
978                         {
979                             tsip = SeqLocId(currseg);
980                             tbsp = BioseqLockById(tsip);
981                             if (tbsp != NULL)
982                                 spps = SeqPortNew(tbsp, xfrom, xto, tstrand, newcode);
983                             else
984                             {
985                                 spps = NULL;
986                                 if (tsip != NULL)
987                                     SeqIdWrite(tsip, idbuf, PRINTID_FASTA_SHORT, 40);
988                                 else
989                                     StringMove(idbuf,"seqid=NULL");
990                                 ErrPostEx(SEV_ERROR,0,0,
991                                       "SeqPortNew: %s could not find component %s",
992                                       errbuf, idbuf);
993                                 return SeqPortFree(spp);
994                             }
995                         }
996 
997                     }
998                     else
999                     {
1000                         spps = (SeqPortPtr) MemNew(sizeof(SeqPort));
1001                         SeqPortSetUpFields (spps, xfrom,
1002 xto, tstrand, newcode);
1003                         SeqPortSetUpAlphabet(spps,
1004 slitp->seq_data_type, newcode);
1005                         if (slitp->seq_data != NULL)
1006                             spps->bp = (ByteStorePtr)
1007 slitp->seq_data;
1008                         else
1009                         {
1010                             spps->isa_virtual = TRUE;
1011                                 if (slitp->length == 0)
1012                                 spps->isa_null = TRUE;
1013                             else
1014                             {   /* default for delta gaps */
1015                                 spps->do_virtual = TRUE;
1016                             }
1017 
1018                         }
1019                     }
1020 
1021                     if (spps == NULL)
1022                     {
1023                         ErrPostEx(SEV_ERROR,0,0,
1024                          "SeqPortNew: %s unexpected null during recursion",
1025                                  errbuf);
1026                         return SeqPortFree(spp);
1027                     }
1028 
1029                     if (currseg != NULL)
1030                         spps->locked = TRUE;
1031 
1032                     if (sppcurr == NULL)
1033                         spp->segs = spps;
1034                     else
1035                         sppcurr->next = spps;
1036                     sppcurr = spps;
1037                 }
1038 
1039                 ctr += tlen;
1040 
1041                 if (! do_multi_loc)
1042                     cycle2 = FALSE;
1043                 else
1044                 {
1045                     currseg =
1046 SeqLocFindNext((SeqLocPtr)(currchunk->data.ptrvalue), currseg);
1047                     if (currseg == NULL)
1048                         cycle2 = FALSE;
1049                 }
1050             }
1051 
1052             if (repr == Seq_repr_delta)
1053                 currchunk = currchunk->next;
1054             else
1055                 currchunk = SeqLocFindNext(the_segs, currchunk);
1056         }
1057         if (strand == Seq_strand_minus)  /* reverse seqport order */
1058         {
1059             prev = spp->segs;
1060             spprev = spp->segs;
1061             spp->segs = NULL;
1062             sppcurr = NULL;
1063             while (prev != NULL)
1064             {
1065                 curr = spprev;
1066                 prev = NULL;
1067                 while (curr->next != NULL)  /* end of chain */
1068                 {
1069                     prev = curr;
1070                     curr = curr->next;
1071                 }
1072                 if (prev != NULL)
1073                     prev->next = NULL;
1074                 if (sppcurr == NULL)
1075                     spp->segs = curr;
1076                 else
1077                     sppcurr->next = curr;
1078                 sppcurr = curr;
1079             }
1080             curr->next = NULL;   /* last one in chain */
1081         }
1082         spp->curr = spp->segs;
1083 
1084           if (! started)   /* nothing found */
1085           {
1086                ErrPostEx(SEV_ERROR,0,0,"SeqPortNew: no data found for %s",
1087                            errbuf);
1088              return SeqPortFree(spp);
1089           }
1090     }
1091     else if ((repr == Seq_repr_raw) ||   /* sequence not by reference */
1092         (repr == Seq_repr_const))
1093     {
1094         curr_code = BioseqGetCode(bsp);
1095 
1096         SeqPortSetUpAlphabet(spp, curr_code, newcode);
1097         spp->bp = (ByteStorePtr) bsp->seq_data;
1098 
1099      /* allocate fast lookup caches for 2na or 4na to iupacna or 4na conversion */
1100 
1101         if ((newcode == Seq_code_iupacna || newcode == Seq_code_ncbi4na) &&
1102             (curr_code == Seq_code_ncbi2na || curr_code == Seq_code_ncbi4na)) {
1103             spp->cacheq = (SPCacheQPtr) MemNew (sizeof (SPCacheQ));
1104         }
1105 
1106     }
1107 
1108     SeqPortAdjustLength (spp);
1109     SeqPortSeek(spp, 0, SEEK_SET);
1110     return spp;
1111 }
1112 
1113 /*****************************************************************************
1114 *
1115 *   SeqPortNewByLoc(loc, code)
1116 *       builds a new seqport based on a SeqLoc
1117 *
1118 *****************************************************************************/
SeqPortNewByLoc(SeqLocPtr loc,Uint1 code)1119 NLM_EXTERN SeqPortPtr SeqPortNewByLoc (SeqLocPtr loc, Uint1 code)
1120 
1121 {
1122     BioseqPtr bsp = NULL;
1123     SeqPortPtr spp = NULL, sppcurr, spps;
1124     Int4 start = 0, stop = 0;
1125     Uint1 strand = Seq_strand_unknown;
1126     SeqLocPtr currloc = NULL;
1127     CharPtr locptr, currlocptr;
1128 
1129     if (loc == NULL)
1130         return spp;
1131 
1132                    /* get the needed components */
1133 
1134     switch (loc->choice)
1135     {
1136         case SEQLOC_INT:      /* int */
1137         case SEQLOC_PNT:      /* pnt */
1138         case SEQLOC_PACKED_PNT:      /* packed-pnt   */
1139             start = SeqLocStart(loc);
1140             stop = SeqLocStop(loc);
1141             strand = SeqLocStrand(loc);
1142         case SEQLOC_WHOLE:      /* whole */
1143             bsp = BioseqLockById(SeqLocId(loc));  /* need the bioseq
1144 now */
1145             if (bsp == NULL)
1146                 return NULL;    /* can't do it */
1147     }
1148 
1149 
1150 
1151     switch (loc->choice)
1152     {
1153         case SEQLOC_EMPTY:      /* empty */
1154         case SEQLOC_EQUIV:     /* equiv */
1155         case SEQLOC_BOND:      /* bond */
1156             break;
1157 
1158         case SEQLOC_NULL:      /* null */
1159             spp = SeqPortNew(NULL, FIRST_RESIDUE, LAST_RESIDUE, 0,
1160 code);
1161             spp->isa_null = TRUE;
1162             break;
1163 
1164         case SEQLOC_WHOLE:      /* whole */
1165             spp = SeqPortNew(bsp, FIRST_RESIDUE, LAST_RESIDUE, 0, code);
1166             if (spp != NULL)
1167                 spp->locked = TRUE;
1168             else
1169                 BioseqUnlock(bsp);
1170             break;
1171 
1172         case SEQLOC_INT:      /* int */
1173         case SEQLOC_PNT:      /* pnt */
1174         case SEQLOC_PACKED_PNT:      /* packed-pnt   */
1175             spp = SeqPortNew(bsp, start, stop, strand, code);
1176             if (spp != NULL)
1177                 spp->locked = TRUE;
1178             else
1179                 BioseqUnlock(bsp);
1180             break;
1181 
1182         case SEQLOC_PACKED_INT:      /* packed seqint */
1183         case SEQLOC_MIX:      /* mix */
1184             spp = (SeqPortPtr) MemNew(sizeof(SeqPort));
1185             spp->totlen = SeqLocLen(loc);
1186             spp->start = 0;
1187             spp->stop = spp->totlen - 1;
1188             spp->curpos = -1;    /* not set */
1189             spp->currnum = NULL;   /* use numbering from parts */
1190             currloc = NULL;
1191             sppcurr = NULL;
1192             while ((currloc = SeqLocFindNext(loc, currloc)) != NULL)
1193             {
1194                 spps = SeqPortNewByLoc(currloc, code);
1195                 if (spps == NULL)
1196                 {
1197                     locptr = SeqLocPrint(loc);
1198                     currlocptr = SeqLocPrint(currloc);
1199                     ErrPostEx(SEV_ERROR, 0,0  ,
1200         "SeqPortNewByLoc unexpected null during recursion [loc=%s][curr=%s]",
1201                     locptr, currlocptr);
1202                     MemFree(locptr);
1203                     MemFree(currlocptr);
1204                     SeqPortFree(spp);
1205                     return NULL;
1206                 }
1207                 if (sppcurr == NULL)
1208                     spp->segs = spps;
1209                 else
1210                     sppcurr->next = spps;
1211                 sppcurr = spps;
1212             }
1213             spp->curr = spp->segs;
1214             break;
1215         case SEQLOC_FEAT:
1216             ErrPostEx(SEV_ERROR, 0,0  ,
1217                  "SeqLocNewByLoc: Seq-loc.feat not supported");
1218             break;
1219     }
1220 
1221     SeqPortAdjustLength (spp);
1222     SeqPortSeek(spp, 0, SEEK_SET);
1223 
1224     return spp;
1225 }
1226 
1227 /*****************************************************************************
1228 *
1229 *   SeqPortSeek(spp, offset, origin)
1230 *       works like fseek()
1231 *           returns 0 on success   (weird but true)
1232 *           non-zero on fail
1233 *       uses coordinates 0-(len - 1)  no matter what region seqport covers
1234 *
1235 *
1236 *****************************************************************************/
/*
*   ClearQCache(spp, sp)
*       Records sp as the current position of spp and marks the pending
*       byte as SEQPORT_EOF.  When spp owns a fast-lookup cache, its
*       counters are zeroed so the next read has to refill it.
*/
static void ClearQCache (SeqPortPtr spp, Int4 sp)

{
    SPCacheQPtr qcache = spp->cacheq;

    spp->curpos = sp;
    spp->byte = SEQPORT_EOF;

    if (qcache == NULL)
        return;

    /* empty counters force a new read */
    qcache->total = 0;
    qcache->ctr = 0;
}
1250 
SeqPortSeek(SeqPortPtr spp,Int4 offset,Int2 origin)1251 NLM_EXTERN Int2 SeqPortSeek (SeqPortPtr spp, Int4 offset, Int2 origin)
1252 
1253 {
1254     Int4 sp, curpos, left, pos, lim, diff;
1255     Boolean plus_strand;
1256     Uint1 the_byte, the_residue;
1257     Int2 bitctr;
1258     SeqPortPtr curspp;
1259     Uint1Ptr buf;
1260     SPCachePtr spcp;
1261 
1262     if (spp == NULL)
1263         return 1;
1264 
1265     spp->eos = FALSE;   /* unset flag set when moving off segment */
1266 
1267                                 /* get position as positive offset from 0 */
1268     if (spp->strand == Seq_strand_minus)
1269         plus_strand = FALSE;
1270     else
1271         plus_strand = TRUE;
1272 
1273     sp = spp->curpos;    /* current offset, 0 - (totlen - 1)  */
1274     switch (origin)
1275     {
1276         case SEEK_SET:
1277             spp->backing = FALSE;  /* reset.. not backing */
1278             if ((offset > spp->totlen) || (offset < 0)) {
1279                 ClearQCache (spp, sp);
1280                 return 1;
1281             }
1282             sp = offset;
1283             break;
1284         case SEEK_CUR:
1285             if (((sp + offset) > spp->totlen) ||
1286                 ((sp + offset) < 0 ))
1287             {
1288         /** check for reverse complement backing **/
1289         if ((sp + offset < 0) && (offset == -2))
1290         {
1291             if (spp->backing == 1)
1292             {
1293                 ClearQCache(spp, -1);
1294                 spp->eos = TRUE; /* note backing off segment */
1295                 return 0;
1296             }
1297             if (spp->curpos == -1)  /* not set */
1298                 return 0;
1299         }
1300 
1301                 if (! spp->is_circle) {
1302                     ClearQCache (spp, sp);
1303                     return 1;
1304                 }
1305             }
1306             else
1307                 sp += offset;
1308             if (spp->is_circle)
1309             {
1310                 while (sp >= spp->totlen)   /* circle adjustments */
1311                     sp -= spp->totlen;
1312                 while (sp < 0)
1313                     sp += spp->totlen;
1314             }
1315             break;
1316         case SEEK_END:
1317             if ((ABS(offset) > spp->totlen) || (offset > 0)) {
1318                 ClearQCache (spp, sp);
1319                 return 1;
1320             }
1321             sp = spp->totlen + offset;
1322             break;
1323         default:
1324             ClearQCache (spp, sp);
1325             return 1;
1326     }
1327 
1328     if (sp == spp->curpos)     /* already in right position */
1329         return 0;
1330 
1331     if (sp == spp->totlen)    /* seek to EOF */
1332     {
1333         spp->curpos = sp;
1334         spp->byte = SEQPORT_EOF;    /* set to nothing */
1335         ClearQCache (spp, sp);
1336         return 0;
1337     }
1338 
1339     if (spp->oldcode)       /* has data, is raw or const type */
1340     {
1341 
1342         /* if 2na or 4na to iupacna, now only need fast lookup caches */
1343         if (spp->cacheq != NULL) {
1344             ClearQCache (spp, sp);
1345             return 0; /* bypass remaining code */
1346         }
1347 
1348     /* original code using cache direct from byte store */
1349 
1350         if (spp->cache == NULL)     /* allocate a cache */
1351             spp->cache = (SPCachePtr)MemNew(sizeof(SPCache));
1352         spcp = spp->cache;
1353         buf = spcp->buf;
1354 
1355         if (plus_strand)
1356         {
1357             curpos = sp + spp->start;
1358             pos = curpos / (Int4) (spp->bc);
1359             lim = spp->stop / (Int4) (spp->bc);
1360             diff = lim - pos + 1;
1361             if (diff > 100)
1362             {
1363                 diff = 100;
1364                 lim = pos + diff - 1;
1365             }
1366             BSSeek(spp->bp, pos, SEEK_SET);
1367             spcp->total = (Int2) BSRead(spp->bp, (VoidPtr)buf,
1368 diff);
1369             spcp->ctr = 0;
1370             spp->bytepos = lim;
1371         }
1372         else
1373         {
1374             curpos = spp->stop - sp;
1375             pos = curpos / (Int4) (spp->bc);
1376             lim = spp->start / (Int4) (spp->bc);
1377             diff = pos - lim + 1;
1378             if (diff > 100)
1379             {
1380                 diff = 100;
1381                 lim = pos - diff + 1;
1382             }
1383             BSSeek(spp->bp, lim, SEEK_SET);
1384             spcp->total = (Int2) BSRead(spp->bp, (VoidPtr)buf,
1385 diff);
1386             spcp->ctr = (Int2)(diff - 1);
1387             spp->bytepos = lim;
1388         }
1389         left = curpos % (Int4) (spp->bc);
1390         the_byte = spcp->buf[spcp->ctr];
1391         if ((plus_strand) || (spp->bc == 1))
1392             the_residue = the_byte;
1393         else        /* reverse compressed bit orders */
1394         {
1395             left = spp->bc - 1 - left;
1396             the_residue = 0;
1397             bitctr = spp->bc;
1398             while (bitctr)
1399             {
1400                 the_residue |= the_byte & spp->mask;
1401                 bitctr--;
1402                 if (bitctr)
1403                 {
1404                     the_residue >>= spp->lshift;
1405                     the_byte <<= spp->lshift;
1406                 }
1407             }
1408         }
1409         bitctr = spp->bc;
1410         while (left)
1411         {
1412             the_residue <<= spp->lshift;
1413             left--; bitctr--;
1414         }
1415         spp->byte = the_residue;
1416         spp->bitctr = (Uint1) bitctr;
1417         spp->curpos = sp;
1418         return 0;
1419     }
1420     else if ((spp->isa_virtual) || (spp->isa_null))   /* virtual or NULL */
1421     {
1422         spp->curpos = sp;
1423         return 0;
1424     }
1425     else                    /* segmented, reference sequences */
1426     {
1427 
1428         if (spp->backing == 1)  /* check for backing off segment */
1429         {
1430             if ((spp->curr->curpos == 1) &&
1431                 (! spp->curr->backing))  /* yup */
1432             {
1433                 spp->curr->curpos = -1;  /* just set the flag */
1434                 spp->curpos -= 2;
1435                 return 0;                /* no eos needed, -1
1436 will do */
1437             }
1438         }
1439 
1440         curpos = 0;
1441         curspp = spp->segs;
1442         if (curspp == NULL) return 1;
1443         while ((curpos + curspp->totlen) <= sp)
1444         {
1445             curpos += curspp->totlen;
1446             curspp = curspp->next;
1447             if (curspp == NULL)
1448                 return 1;
1449         }
1450         if (plus_strand)
1451             curpos = sp - curpos;
1452         else
1453             curpos = (curspp->totlen - 1) - (sp - curpos);
1454         curspp->backing = spp->backing;
1455         if (! SeqPortSeek(curspp, curpos, SEEK_SET))
1456         {
1457             curspp->backing = FALSE;
1458             spp->curr = curspp;
1459             spp->curpos = sp;
1460             return 0;
1461         }
1462         else
1463         {
1464             curspp->backing = FALSE;
1465             return 1;
1466         }
1467     }
1468 }
1469 
1470 /*****************************************************************************
1471 *
1472 *   Int4 SeqPortTell(spp)
1473 *
1474 *****************************************************************************/
SeqPortTell(SeqPortPtr spp)1475 NLM_EXTERN Int4 SeqPortTell (SeqPortPtr spp)
1476 
1477 {
1478     if (spp == NULL)
1479         return -1L;
1480 
1481     return spp->curpos;
1482 }
1483 
1484 /*****************************************************************************
1485 *
1486 *   SeqPortGetResidue(spp)
1487 *       returns residue at current location in requested coding
1488 *       SEQPORT_EOF = end of file
1489 *
1490 *****************************************************************************/
SeqPortQuickGetResidue(SeqPortPtr spp,SPCacheQPtr spcpq,Boolean plus_strand)1491 static Uint1 LIBCALL SeqPortQuickGetResidue (SeqPortPtr spp, SPCacheQPtr spcpq, Boolean plus_strand)
1492 
1493 {
1494   Uint1    bytes [100];
1495   Int4     curpos, pos, lim, diff;
1496   CharPtr  ptr;
1497   Uint1    residue = INVALID_RESIDUE;
1498   Int2     total, i, j;
1499 
1500   if (spp == NULL || spcpq == NULL) return INVALID_RESIDUE;
1501 
1502   if (spp->curpos == spp->totlen) return SEQPORT_EOF;
1503 
1504   if (spp->curpos < spp->totlen) {
1505 
1506     if (spcpq->ctr >= spcpq->total) {
1507 
1508       /* read next buffer of bytes */
1509 
1510       if (plus_strand) {
1511 
1512         curpos = spp->curpos + spp->start;
1513         pos = curpos / (Int4) (spp->bc);
1514         lim = spp->stop / (Int4) (spp->bc);
1515         diff = lim - pos + 1;
1516         if (diff > 100) {
1517           diff = 100;
1518           lim = pos + diff - 1;
1519         }
1520         BSSeek (spp->bp, pos, SEEK_SET);
1521         total = (Int2) BSRead (spp->bp, (VoidPtr) bytes, diff);
1522         spp->bytepos = lim;
1523 
1524       } else {
1525 
1526         curpos = spp->stop - spp->curpos;
1527         pos = curpos / (Int4) (spp->bc);
1528         lim = spp->start / (Int4) (spp->bc);
1529         diff = pos - lim + 1;
1530         if (diff > 100) {
1531           diff = 100;
1532           lim = pos - diff + 1;
1533         }
1534         BSSeek (spp->bp, lim, SEEK_SET);
1535         total = (Int2) BSRead (spp->bp, (VoidPtr) bytes, diff);
1536         spp->bytepos = lim;
1537 
1538       }
1539 
1540       /* buffer is not null terminated, so uses special copy function */
1541 
1542       ptr = spcpq->buf;
1543 
1544       if (spp->newcode == Seq_code_iupacna) {
1545         if (spp->oldcode == Seq_code_ncbi2na) {
1546           ptr = (CharPtr) MapNa2ByteToIUPACString (bytes, (Uint4Ptr) ptr, total);
1547         } else if (spp->oldcode == Seq_code_ncbi4na) {
1548           ptr = (CharPtr) MapNa4ByteToIUPACString (bytes, (Uint2Ptr) ptr, total);
1549         }
1550       } else if (spp->newcode == Seq_code_ncbi4na) {
1551         if (spp->oldcode == Seq_code_ncbi2na) {
1552           ptr = (CharPtr) MapNa2ByteTo4BitString (bytes, (Uint4Ptr) ptr, total);
1553         } else if (spp->oldcode == Seq_code_ncbi4na) {
1554           ptr = (CharPtr) MapNa4ByteTo4BitString (bytes, (Uint2Ptr) ptr, total);
1555         }
1556       }
1557 
1558       spcpq->total = ptr - spcpq->buf;
1559       spcpq->ctr = 0;
1560 
1561       /* deal with end conditions */
1562 
1563       if (plus_strand) {
1564         spcpq->ctr += (curpos % (Int4) (spp->bc));
1565         if (lim == (spp->stop / (Int4) (spp->bc))) {
1566           diff = (spp->stop + 1) % (Int4) (spp->bc);
1567           if (diff > 0) {
1568             spcpq->total -= (Int4) (spp->bc) - diff;
1569           }
1570         }
1571       } else {
1572         if (pos == (curpos / (Int4) (spp->bc))) {
1573           diff = (curpos + 1) % (Int4) (spp->bc);
1574           if (diff > 0) {
1575             spcpq->total -= (Int4) (spp->bc) - diff;
1576           }
1577         }
1578         if (lim == (spp->start / (Int4) (spp->bc))) {
1579           spcpq->ctr += (spp->start) % (Int4) (spp->bc);
1580         }
1581 
1582         /* reverse complement */
1583 
1584         for (i = spcpq->ctr, j = spcpq->total - 1; i < j; i++, j--) {
1585           residue = spcpq->buf [i];
1586           spcpq->buf [i] = spcpq->buf [j];
1587           spcpq->buf [j] = residue;
1588         }
1589         for (i = spcpq->ctr; i < spcpq->total; i++) {
1590           residue = spcpq->buf [i];
1591           spcpq->buf [i] = SeqCodeTableComp (spp->sctp, residue);
1592         }
1593 
1594       }
1595 
1596     }
1597 
1598     /* now get residue directly from uncompressed buffer */
1599 
1600     residue = spcpq->buf [spcpq->ctr];
1601     spcpq->ctr++;
1602   }
1603 
1604   spp->curpos++;
1605 
1606   return residue;
1607 }
1608 
SeqPortGetResidue(SeqPortPtr spp)1609 NLM_EXTERN Uint1 LIBCALL SeqPortGetResidue (SeqPortPtr spp)
1610 
1611 {
1612     Uint1 residue = INVALID_RESIDUE, the_byte, the_residue, the_code;
1613     Boolean plus_strand = TRUE, moveup;
1614     Int2 bitctr, index;
1615     Int4 pos, lim, diff;
1616     SPCachePtr spcp;
1617     SeqPortPtr tmp, prev;
1618     SPCacheQPtr spcpq;
1619 
1620     if (spp != NULL)
1621         spp->backing = FALSE;  /* clear it on read */
1622 
1623     if (spp != NULL && spp->cacheq != NULL && spp->curpos < spp->totlen) {
1624         spcpq = spp->cacheq;
1625         if (spcpq->ctr < spcpq->total) {
1626             residue = spcpq->buf [spcpq->ctr];
1627             spcpq->ctr++;
1628             spp->curpos++;
1629             return residue;
1630         }
1631     }
1632 
1633     if ((spp == NULL) || ((spp->bp == NULL) && (spp->oldcode)))
1634         return SEQPORT_EOF;
1635 
1636     if (spp->isa_null) { /* NULL interval */
1637         spp->eos = TRUE; /* moving off the segment */
1638         return SEQPORT_VIRT;
1639     }
1640 
1641     if (spp->eos)       /* end of reverse complement spp */
1642         return SEQPORT_EOF;
1643 
1644     if (spp->curpos == spp->totlen)
1645     {
1646         if (spp->is_circle)
1647         {
1648             SeqPortSeek(spp, 0, SEEK_SET);  /* go to start */
1649             if (spp->is_seg)   /* give EOS? */
1650                 return SEQPORT_EOS;
1651         }
1652         else
1653             return SEQPORT_EOF;         /* EOF really */
1654     }
1655 
1656     if (spp->curpos == -1)        /* backed off end */
1657     {
1658         if (spp->is_circle)
1659         {
1660             SeqPortSeek(spp, -1, SEEK_END);  /* go to end */
1661             if (spp->is_seg)   /* give EOS? */
1662                 return SEQPORT_EOS;
1663         }
1664         else
1665             return SEQPORT_EOF;         /* EOF really */
1666     }
1667 
1668     if (spp->strand == Seq_strand_minus)
1669         plus_strand = FALSE;
1670 
1671     if (spp->oldcode)    /* its a raw or const sequence */
1672     {
1673 
1674     /* separate function for quick lookup to avoid cluttering old code */
1675         if (spp->cacheq != NULL) {
1676             return SeqPortQuickGetResidue (spp, spp->cacheq, plus_strand);
1677         }
1678 
1679         residue = spp->byte & spp->mask;
1680         residue >>= spp->rshift;
1681         spp->byte <<= spp->lshift;
1682         spp->bitctr--;
1683         if (spp->curpos < (spp->totlen - 1))  /* curpos not incremented yet */
1684         {
1685             if (spp->bitctr == 0)
1686             {
1687                 spcp = spp->cache;
1688                 if (! plus_strand) /* need previous byte */
1689                 {
1690                     spcp->ctr--;
1691                     if (spcp->ctr < 0)
1692                     {
1693                         pos = spp->bytepos - 1;
1694                         lim = spp->start /
1695 (Int4)(spp->bc);
1696                         diff = pos - lim + 1;
1697                         if (diff > 100)
1698                         {
1699                             diff = 100;
1700                             lim = pos - 100 + 1;
1701                         }
1702                         BSSeek(spp->bp, lim, SEEK_SET);
1703                         spcp->total =
1704 (Int2)BSRead(spp->bp, (VoidPtr)(spcp->buf), diff);
1705                         spcp->ctr = (Int2)(diff - 1);
1706                         spp->bytepos = lim;
1707                     }
1708                 }
1709                 else                /* need next
1710 byte */
1711                 {
1712                     spcp->ctr++;
1713                     if (spcp->ctr >= spcp->total)
1714                     {
1715                         pos = spp->bytepos + 1;
1716                         lim = spp->stop /
1717 (Int4)(spp->bc);
1718                         diff = lim - pos + 1;
1719                         if (diff > 100)
1720                         {
1721                             diff = 100;
1722                             lim = pos + diff - 1;
1723                         }
1724                         BSSeek(spp->bp, pos, SEEK_SET);
1725                         spcp->total =
1726 (Int2)BSRead(spp->bp, (VoidPtr)(spcp->buf), diff);
1727                         spcp->ctr = 0;
1728                         spp->bytepos = lim;
1729                     }
1730                 }
1731                 the_byte = spcp->buf[spcp->ctr];
1732 
1733                 if ((plus_strand) || (spp->bc == 1))
1734                     the_residue = the_byte;
1735                 else        /* reverse compressed bit orders */
1736                 {
1737                     the_residue = 0;
1738                     bitctr = spp->bc;
1739                     while (bitctr)
1740                     {
1741                         the_residue |= the_byte & spp->mask;
1742                         bitctr--;
1743                         if (bitctr)
1744                         {
1745                             the_residue >>= spp->lshift;
1746                             the_byte <<= spp->lshift;
1747                         }
1748                     }
1749                 }
1750                 spp->byte = the_residue;
1751                 spp->bitctr = spp->bc;
1752             }
1753         }
1754 
1755         if (spp->smtp == NULL)   /* no conversion, check now */
1756         {
1757             if (spp->sctp != NULL)  {
1758                 index = (Int2)residue - (Int2)(spp->sctp->start_at);
1759                 if ((index < 0) || (index >= (Int2)(spp->sctp->num)))
1760                     residue = INVALID_RESIDUE;
1761                 else if (*(spp->sctp->names[index]) == '\0')
1762                     residue = INVALID_RESIDUE;
1763             } else {
1764                 residue = INVALID_RESIDUE;
1765             }
1766         }
1767     }
1768     else if (spp->isa_virtual)  /* virtual */
1769     {
1770         if (spp->do_virtual)
1771         {
1772             if (spp->newcode)
1773                 the_code = spp->newcode;
1774             else
1775                 the_code = spp->oldcode;
1776             if (spp->gapIsZero && the_code == Seq_code_ncbi4na) {
1777                 residue = 0;
1778             } else {
1779                 residue = GetGapCode (the_code);
1780             }
1781             spp->curpos++;
1782             return residue;
1783         }
1784         else
1785         {
1786             spp->curpos++;
1787             return SEQPORT_VIRT;
1788         }
1789     }
1790     else              /* segmented or reference sequence */
1791     {
1792         residue = SeqPortGetResidue(spp->curr);
1793         while (! IS_residue(residue))
1794         {
1795             /* spp->curr->eos = FALSE;  just in case was set */
1796             moveup = FALSE;
1797 
1798             switch (residue)
1799             {
1800                 case SEQPORT_VIRT:
1801                 case SEQPORT_EOS:
1802                     if (spp->curr->segs == NULL)  /* this
1803 did not come up a layer */
1804                         moveup = TRUE;
1805                     break;
1806                 case SEQPORT_EOF:
1807                     moveup = TRUE;
1808                     break;
1809                 default:
1810                     break;
1811             }
1812 
1813             if (moveup)
1814             {
1815                 if ((spp->curr->curpos == -1) && (!
1816 spp->curr->eos))   /* moving backwards, many layers deep */
1817                 {
1818                     prev = NULL;
1819                     for (tmp = spp->segs; tmp != spp->curr;
1820 tmp = tmp->next)
1821                         prev = tmp;
1822                     if (prev != NULL)
1823                         spp->curr = prev;
1824                     else if (spp->is_circle)  /* go to end
1825 */
1826                     {
1827                         for (tmp = spp->segs; tmp->next
1828 != NULL; tmp = tmp->next)
1829                             continue;
1830                         spp->curr = tmp;
1831                     }
1832                     else
1833                         return SEQPORT_EOF;
1834 
1835                     if (! plus_strand)
1836                         SeqPortSeek(spp->curr, 0,
1837 SEEK_SET);
1838                     else if (! (spp->curr->isa_null))
1839                         SeqPortSeek(spp->curr, -1,
1840 SEEK_END);
1841                     else
1842                         spp->curr->curpos = -1;   /*
1843 flag the null for next time around */
1844                 }
1845                 else                           /* moving
1846 forwards */
1847                 {
1848                     if (spp->curr->next != NULL)
1849                         spp->curr = spp->curr->next;
1850                     else if (spp->is_circle)
1851                         spp->curr = spp->segs;
1852                     else
1853                         return SEQPORT_EOF;
1854 
1855                     if (plus_strand)
1856                         SeqPortSeek(spp->curr, 0,
1857 SEEK_SET);
1858                     else
1859                         SeqPortSeek(spp->curr, -1,
1860 SEEK_END);
1861                 }
1862 
1863                 if (spp->is_seg)
1864                     return SEQPORT_EOS;
1865             }
1866 
1867             if ((residue == SEQPORT_VIRT) || (residue ==
1868 INVALID_RESIDUE))
1869                 return residue;
1870             residue = SeqPortGetResidue(spp->curr);
1871         }
1872 
1873         if (! plus_strand)
1874         {
1875             spp->curr->backing++;     /* signal we are backing
1876 up */
1877             if (SeqPortSeek(spp->curr, -2, SEEK_CUR))  /* back up to "next" */
1878                 spp->curr->eos = TRUE;
1879 
1880         }
1881     }
1882 
1883     if (spp->smtp != NULL)
1884         residue = SeqMapTableConvert(spp->smtp, residue);
1885 
1886     if (! plus_strand)
1887         residue = SeqCodeTableComp(spp->sctp, residue);
1888 
1889     spp->curpos++;
1890     return residue;
1891 }
1892 
1893 /*****************************************************************************
1894 *
1895 *   GetGapCode(seqcode)
1896 *       returns code to use for virtual sequence residues for sequence
1897 *         code seqcode
1898 *       returns INVALID_RESIDUE if seqcode invalid
1899 *
1900 *****************************************************************************/
GetGapCode(Uint1 seqcode)1901 NLM_EXTERN Uint1 GetGapCode (Uint1 seqcode)
1902 {
1903     Uint1 residue = INVALID_RESIDUE;
1904 
1905     switch (seqcode)
1906     {
1907         case Seq_code_iupacna:
1908             residue = 'N';
1909             break;
1910         case Seq_code_iupacaa:
1911         case Seq_code_ncbieaa:
1912             residue = 'X';
1913             break;
1914         case Seq_code_ncbi2na:    /* there isn't ambiguity */
1915             break;
1916         case Seq_code_ncbi8na:
1917         case Seq_code_ncbi4na:
1918             residue = 15;
1919             break;
1920         case Seq_code_iupacaa3:  /* no 1 letter character */
1921         case Seq_code_ncbipna:
1922         case Seq_code_ncbipaa:
1923             break;
1924         case Seq_code_ncbistdaa:
1925             residue = 21;
1926             break;
1927 
1928     }
1929 
1930     return residue;
1931 }
1932 
1933 
1934 /*****************************************************************************
1935 *
1936 *   SeqPortRead(spp, buf, len)
1937 *       returns bytes read
1938 *       if returns a negative number, then ABS(return value) gives the
1939 *         same codes as SeqPortGetResidue for EOF or EOS
1940 *
1941 *****************************************************************************/
SeqPortRead(SeqPortPtr spp,Uint1Ptr buf,Int2 len)1942 NLM_EXTERN Int2 LIBCALL SeqPortRead (SeqPortPtr spp, Uint1Ptr buf, Int2 len)
1943 
1944 {
1945     Int2 ctr = 0;
1946     Int4 loopmax;
1947     Uint1 retval;
1948     SPCacheQPtr spcpq;
1949 
1950     if ((spp == NULL) || (buf == NULL) || (len <= 0))
1951         return 0;
1952 
1953     if (spp->lastmsg)    /* previous EOF or EOS saved */
1954     {
1955         ctr = spp->lastmsg;
1956         spp->lastmsg = 0;
1957         ctr *= -1;
1958         return ctr;
1959     }
1960 
1961     spcpq = spp->cacheq;
1962     while (ctr < len) {
1963         loopmax = 0;
1964         if (spcpq != NULL && spp->curpos < spp->totlen && spcpq->ctr < spcpq->total) {
1965             loopmax = MIN ((spp->totlen - spp->curpos), (spcpq->total - spcpq->ctr));
1966             loopmax = MIN (loopmax, (Int4) (len - ctr));
1967         }
1968         /* loopmax saves multiple comparisons, speeds up significantly */
1969         if (loopmax > 0) {
1970             while (loopmax > 0) {
1971                 retval = spcpq->buf [spcpq->ctr];
1972                 spcpq->ctr++;
1973                 spp->curpos++;
1974                 loopmax--;
1975                 if (IS_residue (retval)) {
1976                     *buf = retval;
1977                     buf++;
1978                     ctr++;
1979                 } else {
1980                     if (! ctr)   /* first one */
1981                     {
1982                         ctr = retval;   /* send return as negative number */
1983                         ctr *= -1;
1984                         return ctr;
1985                     } else {
1986                         spp->lastmsg = retval;
1987                         return ctr;
1988                     }
1989                 }
1990             }
1991         } else {
1992             retval = SeqPortGetResidue(spp);
1993             if (IS_residue(retval))
1994             {
1995                 *buf = retval;
1996                 buf++;
1997                 ctr++;
1998             }
1999             else
2000             {
2001                 if (! ctr)   /* first one */
2002                 {
2003                     ctr = retval;   /* send return as negative number */
2004                     ctr *= -1;
2005                     return ctr;
2006                 }
2007                 else
2008                 {
2009                     spp->lastmsg = retval;
2010                     return ctr;
2011                 }
2012             }
2013         }
2014     }
2015     return ctr;
2016 }
2017 
2018 /*******************************************************************************
2019 *
2020 *   SeqPortStream (bsp, flags, userdata, proc)
2021 *   SeqPortStreamInt (bsp, start, stop, strand, flags, userdata, proc)
2022 *   SeqPortStreamLoc (slp, flags, userdata, proc)
2023 *       Efficient functions to stream through sequence
2024 *
2025 ********************************************************************************/
2026 
/* structure for passing common arguments to the internal streaming functions */

typedef struct streamdata {
  StreamFlgType      flags;              /* option bits tested by the helpers (gap rendering, invalid-residue correction, negative gi handling, ...) */
  Pointer            userdata;           /* opaque value handed back to proc on every call */
  SeqPortStreamProc  proc;               /* callback that receives each null-terminated piece of sequence text */
  Uint1              letterToComp [256]; /* lookup indexed by character, used to complement each base of a reversed block */
  CharPtr            tmp;                /* not referenced by the helpers in this part of the file - TODO confirm use */
  Boolean            failed;             /* set to TRUE by helpers on error; component loops stop once it is set */
  Int2               depth;              /* current SeqPortStreamWork nesting level, checked against a fixed limit */
  SeqEntryPtr        scope;              /* passed to SeqEntrySetScope around BioseqLockById when locking a component */
} StreamData, PNTR StreamDataPtr;
2039 
/* prototype for main internal recursive processing function; declared ahead of
   its definition because SeqPortStreamSeqLoc calls it before it is defined */

static Int4 SeqPortStreamWork (
  BioseqPtr bsp,
  Int4 start,
  Int4 stop,
  Uint1 strand,
  StreamDataPtr sdp
);

/* union of the gap-rendering flag bits; callers mask flags with this and compare
   the result for equality against exactly one of the three values */

#define STREAM_GAP_MASK (STREAM_EXPAND_GAPS | GAP_TO_SINGLE_DASH | EXPAND_GAPS_TO_DASHES)
2051 
/* SeqPortStreamGap sends the text that represents a gap of the given length to
   the stream callback.

   length      number of positions covered by the gap
   is_na       TRUE for nucleotide (fill with N), otherwise fill with X
   is_virt     gap comes from a virtual Bioseq
   is_known    gap is a literal without fuzz
   is_seq_gap  gap comes from Seq-data.gap
   sdp         stream state (flags, callback, userdata)

   Returns the number of characters counted as sequence: length when the gap
   was expanded, 0 when it was ignored, suppressed, or shown as one marker. */

static Int4 SeqPortStreamGap (
  Int4 length,
  Boolean is_na,
  Boolean is_virt,
  Boolean is_known,
  Boolean is_seq_gap,
  StreamDataPtr sdp
)

{
  Boolean  as_dashes, as_one_dash, as_pluses = FALSE, as_residues, as_tildes;
  Char     chunk [4004];
  Char     fill;
  Char     marker = '-';
  Int4     remaining;

  if (sdp == NULL) return 0;

  as_tildes = (Boolean) ((sdp->flags & SEQ_GAP_AS_TILDE) != 0);

  /* pick the marker character from the kind of gap and the flags */

  if (is_virt) {
    if ((sdp->flags & SUPPRESS_VIRT_SEQ) != 0) return 0;
    if ((sdp->flags & STREAM_VIRT_AS_PLUS) != 0) {
      as_pluses = TRUE;
      marker = '+';
    }
  } else if (is_seq_gap) {
    if (as_tildes) {
      marker = '~';
    }
  } else if (is_known && (sdp->flags & KNOWN_GAP_AS_PLUS) != 0) {
    as_pluses = TRUE;
    marker = '+';
  }

  as_residues = (Boolean) ((sdp->flags & STREAM_GAP_MASK) == STREAM_EXPAND_GAPS);
  as_one_dash = (Boolean) ((sdp->flags & STREAM_GAP_MASK) == GAP_TO_SINGLE_DASH);
  as_dashes = (Boolean) ((sdp->flags & STREAM_GAP_MASK) == EXPAND_GAPS_TO_DASHES);

  /* no gap flag of any kind means the gap is skipped entirely */

  if (! (as_residues || as_one_dash || as_dashes || as_tildes)) return 0;

  /* presence-only mode: one marker character, not counted as sequence */

  if (as_one_dash) {
    chunk [0] = marker;
    chunk [1] = '\0';

    sdp->proc (chunk, sdp->userdata);

    return 0;
  }

  /* expanded modes need at least one position to emit */

  if (length < 1) return 0;

  if (as_dashes || as_pluses || as_tildes) {
    fill = marker;
  } else {
    fill = is_na ? 'N' : 'X';
  }

  /* prepare one buffer of at most 4000 fill characters and reuse it */

  remaining = MIN (length, 4000L);
  MemSet ((Pointer) chunk, fill, remaining);
  chunk [(int) (Int2) remaining] = '\0';

  remaining = length;
  while (remaining > 0) {

    /* final pass is shortened to whatever is left */

    if (remaining < 4000L) {
      chunk [(int) (Int2) remaining] = '\0';
    }

    sdp->proc (chunk, sdp->userdata);

    remaining -= 4000L;
  }

  /* number of N or X or gap characters sent */

  return length;
}
2140 
MapNa8ByteToIUPACString(Uint1Ptr bytep,Uint1Ptr buf,Int4 total,Uint1 badchar,SeqMapTablePtr smtp,StreamDataPtr sdp)2141 static Uint1Ptr LIBCALL MapNa8ByteToIUPACString (
2142   Uint1Ptr bytep,
2143   Uint1Ptr buf,
2144   Int4 total,
2145   Uint1 badchar,
2146   SeqMapTablePtr smtp,
2147   StreamDataPtr sdp
2148 )
2149 
2150 {
2151   Uint1     ch;
2152   Int4      k;
2153   Uint1Ptr  ptr;
2154   Uint1     residue;
2155 
2156   if (bytep == NULL || buf == NULL || sdp == NULL) return buf;
2157   ptr = buf;
2158 
2159   for (k = 0; k < total; k++) {
2160     residue = *bytep;
2161     if (smtp != NULL) {
2162       ch = SeqMapTableConvert (smtp, residue);
2163       if (ch == INVALID_RESIDUE && (Boolean) ((sdp->flags & STREAM_CORRECT_INVAL) != 0)) {
2164         *ptr = badchar;
2165       } else {
2166         *ptr = ch;
2167       }
2168     } else {
2169       *ptr = residue;
2170     }
2171     bytep++;
2172     ptr++;
2173   }
2174 
2175   return ptr;
2176 }
2177 
/* SeqPortStreamBlock reads one chunk of up to 1000 bytes from the byte store,
   expands it to characters, trims it to the requested start..stop range,
   optionally reverse-complements it, and sends it to the stream callback.

   bs        byte store holding the sequence data
   blk       byte offset of the chunk within the byte store
   compress  residues per byte (cumulative position = blk * compress)
   alphabet  encoding of the stored data
   badchar   replacement for invalid residues (one-byte alphabets only)
   smtp      map table for one-byte alphabets, may be NULL
   start     first requested position, in sequence coordinates
   stop      last requested position, in sequence coordinates
   revcomp   TRUE to reverse and complement the chunk before sending
   sdp       stream state (flags, callback, complement table, failed flag)

   Returns the number of characters sent.  Sets sdp->failed and returns 0
   when start lies at or beyond the end of the data actually read. */

static Int4 SeqPortStreamBlock (
  ByteStorePtr bs,
  Int4 blk,
  Int4 compress,
  Uint1 alphabet,
  Uint1 badchar,
  SeqMapTablePtr smtp,
  Int4 start,
  Int4 stop,
  Boolean revcomp,
  StreamDataPtr sdp
)

{
  Uint4     uncomp [1001]; /* 4000 characters + extra for end-of-string null byte */
  Uint4     compr [251];   /* 1000 bytes + extra for safety */
  CharPtr   buf;
  Uint1Ptr  bytes;
  Char      ch;
  Int4      count = 0, cumulative, total;
  Int2      from, to;
  Boolean   many_dashes, single_dash;
  CharPtr   nd, ptr, str, tmp;

  if (bs == NULL || sdp == NULL) return 0;

  /* Uint4 arrays ensure 4-byte address alignment by the compiler, no need for & since array is pointer */

  buf = (CharPtr) uncomp;
  bytes = (Uint1Ptr) compr;

  BSSeek (bs, blk, SEEK_SET);

  total = BSRead (bs, (VoidPtr) bytes, 1000L);
  if (total < 1) return 0;

  /* expand stored bytes into one character per residue */

  ptr = buf;
  switch (alphabet) {
    case Seq_code_ncbi2na :
      ptr = (CharPtr) MapNa2ByteToIUPACString (bytes, (Uint4Ptr) ptr, total);
      break;
    case Seq_code_ncbi4na :
      single_dash = (Boolean) ((sdp->flags & STREAM_GAP_MASK) == GAP_TO_SINGLE_DASH);
      many_dashes = (Boolean) ((sdp->flags & STREAM_GAP_MASK) == EXPAND_GAPS_TO_DASHES);
      if (single_dash || many_dashes) {
        ptr = (CharPtr) MapNa4ByteToIUPACplusGapString (bytes, (Uint2Ptr) ptr, total);
      } else {
        ptr = (CharPtr) MapNa4ByteToIUPACString (bytes, (Uint2Ptr) ptr, total);
      }
      break;
    default :
      ptr = (CharPtr) MapNa8ByteToIUPACString (bytes, (Uint1Ptr) ptr, total, badchar, smtp, sdp);
      break;
  }
  *ptr = '\0';

  /* sequence coordinate of the first character in this chunk */

  cumulative = blk * compress;

  /* from here on total is the number of characters, not bytes */

  total = ptr - buf;

  /* check for bsp->length > actual raw data: the chunk covers positions
     cumulative .. cumulative + total - 1, so a start equal to
     cumulative + total is already outside it and must fail as well,
     otherwise the whole chunk would be sent from offset 0 */

  if (start >= cumulative + total) {
    sdp->failed = TRUE;
    return 0;
  }

  /* skip leading characters that precede start */

  from = 0;
  if (start > cumulative) {
    from = (Int2) (start - cumulative);
  }

  /* cut off trailing characters that follow stop */

  if (stop < cumulative + total) {
    to = (Int2) (stop - cumulative + 1);
    buf [to] = '\0';
  }

  str = buf + from;

  if (revcomp) {

    /* reverse string first - middle base not touched, so cannot also complement here */

    nd = str;
    while (*nd != '\0') {
      nd++;
    }

    /* only step back from the terminator when the string is not empty */

    if (nd > str) {
      nd--;

      tmp = str;
      while (nd > tmp) {
        ch = *nd;
        *nd = *tmp;
        *tmp = ch;
        nd--;
        tmp++;
      }
    }

    /* now complement every base in string */

    nd = str;
    ch = *nd;
    while (ch != '\0') {
      *nd = sdp->letterToComp [(int) (Uint1) ch];
      nd++;
      ch = *nd;
    }

  }

  /* send characters to stream callback */

  sdp->proc (str, sdp->userdata);

  /* return number of characters sent */

  tmp = str;
  while (*tmp != '\0') {
    count++;
    tmp++;
  }

  return count;
}
2308 
/* SeqPortStreamRaw streams positions start..stop of a raw or const Bioseq.
   Seq-data.gap is delegated to SeqPortStreamGap; otherwise the byte store is
   walked in 1000-byte chunks through SeqPortStreamBlock, last chunk first
   when a nucleotide is requested on the minus strand.  Returns the number of
   characters sent, 0 if the input is unusable or no map table is found. */

static Int4 SeqPortStreamRaw (
  BioseqPtr bsp,
  Int4 start,
  Int4 stop,
  Uint1 strand,
  StreamDataPtr sdp
)

{
  Uint1           alphabet, target;
  Char            badchar;
  ByteStorePtr    bs;
  Int4            blk, compress, first, last, sent = 0;
  Boolean         is_na, revcomp;
  SeqMapTablePtr  smtp = NULL;

  if (bsp == NULL || sdp == NULL) return 0;
  if (bsp->repr != Seq_repr_raw && bsp->repr != Seq_repr_const) return 0;

  is_na = (Boolean) ISA_na (bsp->mol);

  /* support for new Seq-data.gap */

  if (bsp->seq_data_type == Seq_code_gap) {
    return SeqPortStreamGap (stop - start + 1, is_na, FALSE, FALSE, TRUE, sdp);
  }

  /* otherwise Seq-data is a byte store */

  bs = (ByteStorePtr) bsp->seq_data;
  if (bs == NULL) return 0;

  alphabet = bsp->seq_data_type;

  /* only nucleotides are reverse-complemented */

  revcomp = (strand == Seq_strand_minus && is_na) ? TRUE : FALSE;

  /* output alphabet and the character used for invalid residues */

  target = is_na ? Seq_code_iupacna : Seq_code_ncbieaa;
  badchar = is_na ? 'N' : 'X';

  /* residues packed into each stored byte */

  if (alphabet == Seq_code_ncbi2na) {
    compress = 4;
  } else if (alphabet == Seq_code_ncbi4na) {
    compress = 2;
  } else {
    compress = 1;
  }

  /* a conversion table is needed only when the stored alphabet differs */

  if (target != alphabet) {
    smtp = SeqMapTableFind (target, alphabet);
    if (smtp == NULL) return 0;
  }

  /* byte offsets, rounded down to multiples of 1000, of the chunks that
     hold start and stop */

  first = ((start / compress) / 1000L) * 1000L;
  last = ((stop / compress) / 1000L) * 1000L;

  if (revcomp) {

    blk = last;
    while (blk >= first) {
      sent += SeqPortStreamBlock (bs, blk, compress, alphabet, badchar, smtp, start, stop, TRUE, sdp);
      blk -= 1000;
    }

  } else {

    blk = first;
    while (blk <= last) {
      sent += SeqPortStreamBlock (bs, blk, compress, alphabet, badchar, smtp, start, stop, FALSE, sdp);
      blk += 1000;
    }
  }

  return sent;
}
2399 
/* SeqPortStreamSeqLit streams positions start..stop of a delta literal.
   A literal with no data, or with Seq-data.gap, is sent as a gap; anything
   else is wrapped in a temporary raw Bioseq and passed to SeqPortStreamRaw.
   Returns the number of characters sent. */

static Int4 SeqPortStreamSeqLit (
  SeqLitPtr slitp,
  Boolean is_na,
  Int4 start,
  Int4 stop,
  Uint1 strand,
  StreamDataPtr sdp
)

{
  Bioseq   fake;
  Boolean  is_known, is_seq_gap;

  if (slitp == NULL || sdp == NULL) return 0;

  /* ignore gaps of unknown length */

  if (slitp->length < 1) return 0;

  if (slitp->seq_data == NULL || slitp->seq_data_type == Seq_code_gap) {

    /* literal without sequence data is a virtual gap, literal with data of
       the new gap type is a seq gap; fuzz marks either as not known */

    is_known = (Boolean) (slitp->fuzz == NULL);
    is_seq_gap = (Boolean) (slitp->seq_data != NULL);

    return SeqPortStreamGap (stop - start + 1, is_na, FALSE, is_known, is_seq_gap, sdp);
  }

  /* otherwise fake a Bioseq with the literal as its data */

  MemSet ((Pointer) &fake, 0, sizeof (Bioseq));

  fake.repr = Seq_repr_raw;
  fake.mol = is_na ? Seq_mol_dna : Seq_mol_aa;
  fake.seq_data_type = slitp->seq_data_type;
  fake.seq_data = slitp->seq_data;
  fake.length = slitp->length;

  /* SeqPortStreamRaw handles the sequence data in the byte store */

  return SeqPortStreamRaw (&fake, start, stop, strand, sdp);
}
2466 
/* retry settings used by SeqPortStreamSeqLoc when a component fails to load;
   each value is read from its environment variable on first need (inside the
   OS_UNIX section only) and the matching flag prevents reading it again */

static Int2     stream_retry_attempts = 0;       /* from SEQPORT_STREAM_FETCH_ATTEMPTS; retries are made only when > 1 */
static Boolean  stream_retry_count_set = FALSE;  /* TRUE once the attempts variable has been looked up */

static Int2     stream_retry_sleep = 0;          /* from SEQPORT_STREAM_RETRY_SLEEP; passed to sleep () before each retry when > 0 */
static Boolean  stream_retryp_sleep_set = FALSE; /* TRUE once the sleep variable has been looked up */
2472 
/* SeqPortStreamSeqLoc locks the Bioseq referenced by a component location and
   streams positions start..stop of it on the given strand.

   slp       location whose SeqId identifies the component Bioseq
   start     first position within the component, must not be negative
   stop      last position within the component, must not be negative
   strand    strand to stream
   sdp       stream state; sdp->scope is used as the scope for the lookup
   parentID  id of the referencing Bioseq, used only in error messages,
             may be NULL

   Returns the count from SeqPortStreamWork.  Returns 0 with sdp->failed set
   when the gi is not positive (unless STREAM_ALLOW_NEG_GIS is set) or when
   the Bioseq cannot be loaded.  On OS_UNIX a failed load is retried as
   configured by SEQPORT_STREAM_FETCH_ATTEMPTS and SEQPORT_STREAM_RETRY_SLEEP. */

static Int4 SeqPortStreamSeqLoc (
  SeqLocPtr slp,
  Int4 start,
  Int4 stop,
  Uint1 strand,
  StreamDataPtr sdp,
  SeqIdPtr parentID
)

{
  BioseqPtr    bsp;
  Char         buf [64];
  Int4         count = 0;
  SeqEntryPtr  oldscope = NULL;
  Char         pid [64];
  SeqIdPtr     sip;
#ifdef OS_UNIX
  Int2         attempts;
  CharPtr      str;
  int          val = 0;
#endif

  if (slp == NULL || sdp == NULL) return 0;

  if (start < 0 || stop < 0) return 0;

  sip = SeqLocId (slp);
  if (sip == NULL) return 0;

  if (sip->choice == SEQID_GI && sip->data.intvalue <= 0 &&
      (Boolean) ((sdp->flags & STREAM_ALLOW_NEG_GIS) == 0)) {

    /* gi 0 is always a data error, just report and bail */
    /* negative gi sometimes used in-house, allow if flag set */

    SeqIdWrite (sip, buf, PRINTID_FASTA_SHORT, sizeof (buf) - 1);
    if (parentID != NULL) {
      SeqIdWrite (parentID, pid, PRINTID_FASTA_LONG, sizeof (pid) - 1);
      ErrPostEx (SEV_ERROR, 0, 0, "SeqPortStream ignoring Bioseq %s component of %s", buf, pid);
    } else {
      ErrPostEx (SEV_ERROR, 0, 0, "SeqPortStream ignoring Bioseq %s", buf);
    }
    sdp->failed = TRUE;
    return 0;
  }

  /* look the component up within the caller-supplied scope, then restore */

  oldscope = SeqEntrySetScope (sdp->scope);
  bsp = BioseqLockById (sip);
  SeqEntrySetScope (oldscope);

#ifdef OS_UNIX
  if (bsp == NULL) {

    /* number of retries and sleep between retries now configured by environment variable */

    if (! stream_retry_count_set) {
      str = (CharPtr) getenv ("SEQPORT_STREAM_FETCH_ATTEMPTS");
      if (StringDoesHaveText (str)) {
        if (sscanf (str, "%d", &val) == 1) {
          stream_retry_attempts = (Uint2) val;
        }
      }
      stream_retry_count_set = TRUE;
    }

    if (! stream_retryp_sleep_set) {
      str = (CharPtr) getenv ("SEQPORT_STREAM_RETRY_SLEEP");
      if (StringDoesHaveText (str)) {
        if (sscanf (str, "%d", &val) == 1) {
          stream_retry_sleep = (Uint2) val;
        }
      }
      stream_retryp_sleep_set = TRUE;
    }

    /* retry failed fetch attempt up to specified limit; the initial fetch
       above counts as attempt number one */

    if (stream_retry_attempts > 1) {
      attempts = 1;
      while (bsp == NULL && attempts < stream_retry_attempts) {
        if (stream_retry_sleep > 0) {
          sleep (stream_retry_sleep);
        }

        oldscope = SeqEntrySetScope (sdp->scope);
        bsp = BioseqLockById (sip);
        SeqEntrySetScope (oldscope);
        attempts++;
      }
      if (bsp != NULL) {
        SeqIdWrite (sip, buf, PRINTID_FASTA_SHORT, sizeof (buf) - 1);
        if (parentID != NULL) {
          SeqIdWrite (parentID, pid, PRINTID_FASTA_LONG, sizeof (pid) - 1);
          ErrPostEx (SEV_WARNING, 0, 0,
                     "SeqPortStream loaded Bioseq %s component of %s after %d attempts",
                     buf, pid, (int) attempts);
        } else {
          ErrPostEx (SEV_WARNING, 0, 0,
                     "SeqPortStream loaded Bioseq %s after %d attempts",
                     buf, (int) attempts);
        }
      }
    }
  }
#endif

  if (bsp == NULL) {
    SeqIdWrite (sip, buf, PRINTID_FASTA_SHORT, sizeof (buf) - 1);
    if (parentID != NULL) {
      SeqIdWrite (parentID, pid, PRINTID_FASTA_LONG, sizeof (pid) - 1);

      /* sizeof yields size_t; cast so the argument matches the %d conversion */

      ErrPostEx (SEV_ERROR, 0, 0, "SeqPortStream failed to load Bioseq %s component of %s, size = %d",
                 buf, pid, (int) sizeof (sip->data.intvalue));
    } else {
      ErrPostEx (SEV_ERROR, 0, 0, "SeqPortStream failed to load Bioseq %s", buf);
    }
    sdp->failed = TRUE;
    return 0;
  }

  count = SeqPortStreamWork (bsp, start, stop, strand, sdp);

  BioseqUnlock (bsp);

  return count;
}
2598 
/* structure for processing components in forward or reverse direction;
   one is queued per component by the delta and segmented streamers */

typedef struct streamobj {
  SeqLocPtr  slp;     /* component given as a location, NULL for a literal */
  SeqLitPtr  slitp;   /* component given as a literal, NULL for a location */
  Int4       from;    /* first position to stream, in component coordinates */
  Int4       to;      /* last position to stream, in component coordinates */
  Uint1      strand;  /* strand on which to stream the component */
} StreamObj, PNTR StreamObjPtr;
2608 
StreamObjNew(SeqLocPtr slp,SeqLitPtr slitp,Int4 from,Int4 to,Uint1 strand)2609 static StreamObjPtr StreamObjNew (
2610   SeqLocPtr slp,
2611   SeqLitPtr slitp,
2612   Int4 from,
2613   Int4 to,
2614   Uint1 strand
2615 )
2616 
2617 {
2618   StreamObjPtr  sop;
2619 
2620   sop = (StreamObjPtr) MemNew (sizeof (StreamObj));
2621   if (sop == NULL) return NULL;
2622 
2623   sop->slp = slp;
2624   sop->slitp = slitp;
2625   sop->from = from;
2626   sop->to = to;
2627   sop->strand = strand;
2628 
2629   return sop;
2630 }
2631 
/* SeqPortStreamDelta streams positions start..stop of a delta Bioseq.
   The delta chain is scanned once to queue every component that overlaps
   the range (trimmed to the overlap); the queue is built back to front when
   a nucleotide is requested on the minus strand, and each queued component
   then has its strand flipped.  Components are streamed in queue order until
   done or until sdp->failed is set.  Returns the number of characters sent. */

static Int4 SeqPortStreamDelta (
  BioseqPtr bsp,
  Int4 start,
  Int4 stop,
  Uint1 strand,
  StreamDataPtr sdp
)

{
  Int4          cumulative, from, len, sent = 0, to, trim_head, trim_tail;
  Uint1         cstrand;
  DeltaSeqPtr   dsp;
  ValNodePtr    head = NULL, tail = NULL, vnp;
  Boolean       is_na, revcomp;
  SeqLitPtr     slitp;
  SeqLocPtr     slp;
  StreamObjPtr  sop;

  if (bsp == NULL || sdp == NULL) return 0;

  is_na = (Boolean) ISA_na (bsp->mol);
  revcomp = (strand == Seq_strand_minus && is_na) ? TRUE : FALSE;

  /* cumulative is the parent coordinate at which the current component
     begins; len stays 0 for components that are skipped before their
     length is known, so they do not advance it */

  for (dsp = (DeltaSeqPtr) bsp->seq_ext, cumulative = 0;
       dsp != NULL && cumulative <= stop;
       dsp = dsp->next, cumulative += len) {

    len = 0;
    slp = NULL;
    slitp = NULL;

    /* extract the component's own interval and strand */

    if (dsp->choice == 1) {

      slp = (SeqLocPtr) dsp->data.ptrvalue;
      if (slp == NULL || slp->choice == SEQLOC_NULL) continue;

      from = SeqLocStart (slp);
      to = SeqLocStop (slp);
      cstrand = SeqLocStrand (slp);

    } else if (dsp->choice == 2) {

      slitp = (SeqLitPtr) dsp->data.ptrvalue;
      if (slitp == NULL) continue;

      from = 0;
      to = slitp->length - 1;
      cstrand = Seq_strand_plus;

    } else {
      continue;
    }

    if (from < 0 || to < 0) continue;

    len = to - from + 1;

    /* component ends before the requested range begins */

    if (cumulative + len <= start) continue;

    /* amount to drop at the parent-side beginning and end of the component
       when the range covers only part of it */

    trim_head = (start > cumulative) ? (start - cumulative) : 0;
    trim_tail = (stop < cumulative + len) ? (cumulative + len - stop - 1) : 0;

    if (cstrand == Seq_strand_minus) {
      to -= trim_head;
      from += trim_tail;
    } else {
      from += trim_head;
      to -= trim_tail;
    }

    if (revcomp) {
      cstrand = (cstrand == Seq_strand_minus) ? Seq_strand_plus : Seq_strand_minus;
    }

    sop = StreamObjNew (slp, slitp, from, to, cstrand);
    if (sop == NULL) continue;

    if (revcomp) {

      /* push on the front so components come out in reverse order */

      vnp = ValNodeAddPointer (NULL, 0, (Pointer) sop);
      vnp->next = head;
      head = vnp;

    } else {

      /* append after the current tail */

      vnp = ValNodeAddPointer (&tail, 0, (Pointer) sop);
      if (head == NULL) {
        head = vnp;
      }
      tail = vnp;
    }
  }

  /* process components in correct order */

  vnp = head;
  while (vnp != NULL && (! sdp->failed)) {

    sop = (StreamObjPtr) vnp->data.ptrvalue;

    if (sop != NULL) {
      if (sop->slp != NULL) {
        sent += SeqPortStreamSeqLoc (sop->slp, sop->from, sop->to, sop->strand, sdp, bsp->id);
      } else if (sop->slitp != NULL) {
        sent += SeqPortStreamSeqLit (sop->slitp, is_na, sop->from, sop->to, sop->strand, sdp);
      }
    }

    vnp = vnp->next;
  }

  /* free control list */

  ValNodeFreeData (head);

  return sent;
}
2813 
/* SeqPortStreamSeg streams positions start..stop of a segmented Bioseq.
   The segment locations are scanned once to queue every segment that
   overlaps the range (trimmed to the overlap); the queue is built back to
   front when a nucleotide is requested on the minus strand, and each queued
   segment then has its strand flipped.  Segments are streamed in queue order
   until done or until sdp->failed is set.  Returns the number of characters
   sent. */

static Int4 SeqPortStreamSeg (
  BioseqPtr bsp,
  Int4 start,
  Int4 stop,
  Uint1 strand,
  StreamDataPtr sdp
)

{
  Int4          cumulative, from, len, sent = 0, to, trim_head, trim_tail;
  Uint1         cstrand;
  ValNodePtr    head = NULL, tail = NULL, vnp;
  Boolean       is_na, revcomp;
  SeqLocPtr     slp;
  StreamObjPtr  sop;

  if (bsp == NULL || sdp == NULL) return 0;

  is_na = (Boolean) ISA_na (bsp->mol);
  revcomp = (strand == Seq_strand_minus && is_na) ? TRUE : FALSE;

  /* cumulative is the parent coordinate at which the current segment
     begins; len stays 0 for segments that are skipped before their length
     is known, so they do not advance it */

  for (slp = (SeqLocPtr) bsp->seq_ext, cumulative = 0;
       slp != NULL && cumulative <= stop;
       slp = slp->next, cumulative += len) {

    len = 0;

    if (slp->choice == SEQLOC_NULL) continue;

    from = SeqLocStart (slp);
    to = SeqLocStop (slp);
    cstrand = SeqLocStrand (slp);

    if (from < 0 || to < 0) continue;

    len = to - from + 1;

    /* segment ends before the requested range begins */

    if (cumulative + len <= start) continue;

    /* amount to drop at the parent-side beginning and end of the segment
       when the range covers only part of it */

    trim_head = (start > cumulative) ? (start - cumulative) : 0;
    trim_tail = (stop < cumulative + len) ? (cumulative + len - stop - 1) : 0;

    if (cstrand == Seq_strand_minus) {
      to -= trim_head;
      from += trim_tail;
    } else {
      from += trim_head;
      to -= trim_tail;
    }

    if (revcomp) {
      cstrand = (cstrand == Seq_strand_minus) ? Seq_strand_plus : Seq_strand_minus;
    }

    sop = StreamObjNew (slp, NULL, from, to, cstrand);
    if (sop == NULL) continue;

    if (revcomp) {

      /* push on the front so segments come out in reverse order */

      vnp = ValNodeAddPointer (NULL, 0, (Pointer) sop);
      vnp->next = head;
      head = vnp;

    } else {

      /* append after the current tail */

      vnp = ValNodeAddPointer (&tail, 0, (Pointer) sop);
      if (head == NULL) {
        head = vnp;
      }
      tail = vnp;
    }
  }

  /* process components in correct order */

  vnp = head;
  while (vnp != NULL && (! sdp->failed)) {

    sop = (StreamObjPtr) vnp->data.ptrvalue;

    if (sop != NULL && sop->slp != NULL) {
      sent += SeqPortStreamSeqLoc (sop->slp, sop->from, sop->to, sop->strand, sdp, bsp->id);
    }

    vnp = vnp->next;
  }

  /* free control list */

  ValNodeFreeData (head);

  return sent;
}
2927 
/* SeqPortStreamRef streams positions start..stop of a reference Bioseq by
   mapping the range onto the single location in seq_ext and passing that to
   SeqPortStreamSeqLoc.  The location's strand is flipped when a nucleotide
   is requested on the minus strand.  Returns the number of characters sent,
   0 if there is no usable location or start lies beyond it. */

static Int4 SeqPortStreamRef (
  BioseqPtr bsp,
  Int4 start,
  Int4 stop,
  Uint1 strand,
  StreamDataPtr sdp
)

{
  Int4       from, len, to, trim_head, trim_tail;
  Uint1      cstrand;
  Boolean    is_na, revcomp;
  SeqLocPtr  slp;

  if (bsp == NULL || sdp == NULL) return 0;

  is_na = (Boolean) ISA_na (bsp->mol);
  revcomp = (strand == Seq_strand_minus && is_na) ? TRUE : FALSE;

  /* the referenced interval */

  slp = (SeqLocPtr) bsp->seq_ext;
  if (slp == NULL || slp->choice == SEQLOC_NULL) return 0;

  from = SeqLocStart (slp);
  to = SeqLocStop (slp);
  cstrand = SeqLocStrand (slp);

  if (from < 0 || to < 0) return 0;

  len = to - from + 1;

  if (len <= start) return 0;

  /* amount to drop at the beginning and end of the interval when the
     requested range covers only part of it */

  trim_head = (start > 0) ? start : 0;
  trim_tail = (stop < len) ? (len - stop - 1) : 0;

  if (cstrand == Seq_strand_minus) {
    to -= trim_head;
    from += trim_tail;
  } else {
    from += trim_head;
    to -= trim_tail;
  }

  if (revcomp) {
    cstrand = (cstrand == Seq_strand_minus) ? Seq_strand_plus : Seq_strand_minus;
  }

  return SeqPortStreamSeqLoc (slp, from, to, cstrand, sdp, bsp->id);
}
3003 
3004 /* SeqPortStreamWork calls appropriate representation-specific function */
3005 
SeqPortStreamWork(BioseqPtr bsp,Int4 start,Int4 stop,Uint1 strand,StreamDataPtr sdp)3006 static Int4 SeqPortStreamWork (
3007   BioseqPtr bsp,
3008   Int4 start,
3009   Int4 stop,
3010   Uint1 strand,
3011   StreamDataPtr sdp
3012 )
3013 
3014 {
3015   Int4  count = 0;
3016 
3017   if (bsp == NULL || sdp == NULL) return 0;
3018 
3019   /* start and stop position reality checks */
3020 
3021   if (start < 0) {
3022     start = 0;
3023   }
3024   if (stop < 0) {
3025     stop = bsp->length - 1;
3026   }
3027 
3028   /* if start or stop are beyond sequence length, set failed flag */
3029 
3030   if (start >= bsp->length || stop >= bsp->length) {
3031     sdp->failed = TRUE;
3032     return 0;
3033   }
3034 
3035   if (start > stop) return 0;
3036 
3037   /* stack depth overflow check for recursively-defined sequence instances */
3038 
3039   (sdp->depth)++;
3040 
3041   if (sdp->depth > 20) {
3042     ErrPostEx (SEV_ERROR, 0, 0, "SeqPortStreamWork stack depth overflow");
3043     sdp->failed = TRUE;
3044     return 0;
3045   }
3046 
3047   /* call appropriate stream function */
3048 
3049   switch (bsp->repr) {
3050 
3051     case Seq_repr_virtual :
3052       count += SeqPortStreamGap (stop - start + 1, ISA_na (bsp->mol), TRUE, FALSE, FALSE, sdp);
3053       break;
3054 
3055     case Seq_repr_raw :
3056     case Seq_repr_const :
3057       count += SeqPortStreamRaw (bsp, start, stop, strand, sdp);
3058       break;
3059 
3060     case Seq_repr_seg :
3061       if (bsp->seq_ext_type == 1) {
3062         count += SeqPortStreamSeg (bsp, start, stop, strand, sdp);
3063       }
3064       break;
3065 
3066     case Seq_repr_delta :
3067       if (bsp->seq_ext_type == 4) {
3068         count += SeqPortStreamDelta (bsp, start, stop, strand, sdp);
3069       }
3070       break;
3071 
3072     case Seq_repr_ref :
3073       if (bsp->seq_ext_type == 2) {
3074         count += SeqPortStreamRef (bsp, start, stop, strand, sdp);
3075       }
3076       break;
3077 
3078     default :
3079       break;
3080   }
3081 
3082   /* restore stack depth value */
3083 
3084   (sdp->depth)--;
3085 
3086   return count;
3087 }
3088 
3089 /* default callback for copying to allocated buffer */
3090 
SaveStreamSequence(CharPtr sequence,Pointer userdata)3091 static void LIBCALLBACK SaveStreamSequence (
3092   CharPtr sequence,
3093   Pointer userdata
3094 )
3095 
3096 {
3097   CharPtr       tmp;
3098   CharPtr PNTR  tmpp;
3099 
3100   tmpp = (CharPtr PNTR) userdata;
3101   tmp = *tmpp;
3102 
3103   tmp = StringMove (tmp, sequence);
3104 
3105   *tmpp = tmp;
3106 }
3107 
3108 /* SeqPortStreamSetup creates revcomp table, calls SeqPortStreamWork  */
3109 
SeqPortStreamSetup(BioseqPtr bsp,Int4 start,Int4 stop,Uint1 strand,SeqLocPtr loc,SeqLitPtr lit,StreamFlgType flags,Pointer userdata,SeqPortStreamProc proc)3110 static Int4 SeqPortStreamSetup (
3111   BioseqPtr bsp,
3112   Int4 start,
3113   Int4 stop,
3114   Uint1 strand,
3115   SeqLocPtr loc,
3116   SeqLitPtr lit,
3117   StreamFlgType flags,
3118   Pointer userdata,
3119   SeqPortStreamProc proc
3120 )
3121 
3122 {
3123   Char        ch, lttr;
3124   CharPtr     complementBase = " TVGH  CD  M KN   YSAABW R ";
3125   Int4        count = 0, from, to;
3126   Uint2       entityID;
3127   Int2        i;
3128   Boolean     is_na;
3129   StreamData  sd;
3130   SeqLocPtr   slp;
3131 
3132   if (bsp == NULL && loc == NULL && lit == NULL) return 0;
3133   if (proc == NULL && userdata == NULL) return 0;
3134 
3135   MemSet ((Pointer) &sd, 0, sizeof (StreamData));
3136 
3137   sd.flags = flags;
3138   sd.userdata = userdata;
3139   sd.proc = proc;
3140   sd.tmp = NULL;
3141   sd.failed = FALSE;
3142   sd.depth = 0;
3143 
3144   /* if NULL callback, copy into allocated userdata string */
3145 
3146   if (proc == NULL) {
3147     sd.proc = SaveStreamSequence;
3148     sd.tmp = userdata;
3149     sd.userdata = &(sd.tmp);
3150   }
3151 
3152   /* set up nucleotide complementation lookup table */
3153 
3154   for (i = 0; i < 256; i++) {
3155     sd.letterToComp [i] = '\0';
3156   }
3157   for (ch = 'A', i = 1; ch <= 'Z'; ch++, i++) {
3158     lttr = complementBase [i];
3159     if (lttr != ' ') {
3160       sd.letterToComp [(int) (Uint1) ch] = lttr;
3161     }
3162   }
3163   for (ch = 'a', i = 1; ch <= 'z'; ch++, i++) {
3164     lttr = complementBase [i];
3165     if (lttr != ' ') {
3166       sd.letterToComp [(int) (Uint1) ch] = lttr;
3167     }
3168   }
3169 
3170   /* commence streaming */
3171 
3172   if (bsp != NULL) {
3173 
3174     entityID = ObjMgrGetEntityIDForPointer (bsp);
3175     sd.scope = GetTopSeqEntryForEntityID (entityID);
3176 
3177     count += SeqPortStreamWork (bsp, start, stop, strand, &sd);
3178 
3179   } else if (loc != NULL) {
3180 
3181     sd.scope = SeqEntryGetScope ();
3182 
3183     slp = SeqLocFindNext (loc, NULL);
3184     while (slp != NULL) {
3185 
3186       from = SeqLocStart (slp);
3187       to = SeqLocStop (slp);
3188       strand = SeqLocStrand (slp);
3189 
3190       if (from < 0 || to < 0) {
3191         sd.failed = TRUE;
3192         return -1;
3193       }
3194 
3195       count += SeqPortStreamSeqLoc (slp, from, to, strand, &sd, NULL);
3196 
3197       slp = SeqLocFindNext (loc, slp);
3198     }
3199 
3200   } else if (lit != NULL) {
3201 
3202     is_na = TRUE;
3203     switch (lit->seq_data_type) {
3204       case Seq_code_iupacaa :
3205       case Seq_code_ncbi8aa :
3206       case Seq_code_ncbieaa :
3207       case Seq_code_ncbipaa :
3208       case Seq_code_iupacaa3 :
3209       case Seq_code_ncbistdaa :
3210         is_na = FALSE;
3211         break;
3212       default :
3213         break;
3214     }
3215 
3216     count += SeqPortStreamSeqLit (lit, is_na, 0, lit->length - 1, Seq_strand_plus, &sd);
3217   }
3218 
3219   /* return number of bases or residues streamed to callback */
3220 
3221   if (sd.failed) {
3222     if (count  < 1) return -1;
3223     return -count;
3224   }
3225 
3226   return count;
3227 }
3228 
3229 /* public functions all call SeqPortStreamSetup */
3230 
SeqPortStream(BioseqPtr bsp,StreamFlgType flags,Pointer userdata,SeqPortStreamProc proc)3231 NLM_EXTERN Int4 SeqPortStream (
3232   BioseqPtr bsp,
3233   StreamFlgType flags,
3234   Pointer userdata,
3235   SeqPortStreamProc proc
3236 )
3237 
3238 {
3239   return SeqPortStreamSetup (bsp, 0, -1, Seq_strand_unknown, NULL, NULL, flags, userdata, proc);
3240 }
3241 
SeqPortStreamInt(BioseqPtr bsp,Int4 start,Int4 stop,Uint1 strand,StreamFlgType flags,Pointer userdata,SeqPortStreamProc proc)3242 NLM_EXTERN Int4 SeqPortStreamInt (
3243   BioseqPtr bsp,
3244   Int4 start,
3245   Int4 stop,
3246   Uint1 strand,
3247   StreamFlgType flags,
3248   Pointer userdata,
3249   SeqPortStreamProc proc
3250 )
3251 
3252 {
3253   return SeqPortStreamSetup (bsp, start, stop, strand, NULL, NULL, flags, userdata, proc);
3254 }
3255 
SeqPortStreamLoc(SeqLocPtr slp,StreamFlgType flags,Pointer userdata,SeqPortStreamProc proc)3256 NLM_EXTERN Int4 SeqPortStreamLoc (
3257   SeqLocPtr slp,
3258   StreamFlgType flags,
3259   Pointer userdata,
3260   SeqPortStreamProc proc
3261 )
3262 
3263 {
3264   return SeqPortStreamSetup (NULL, 0, 0, 0, slp, NULL, flags, userdata, proc);
3265 }
3266 
SeqPortStreamLit(SeqLitPtr lit,StreamFlgType flags,Pointer userdata,SeqPortStreamProc proc)3267 NLM_EXTERN Int4 SeqPortStreamLit (
3268   SeqLitPtr lit,
3269   StreamFlgType flags,
3270   Pointer userdata,
3271   SeqPortStreamProc proc
3272 )
3273 
3274 {
3275   return SeqPortStreamSetup (NULL, 0, 0, 0, NULL, lit, flags, userdata, proc);
3276 }
3277 
3278 /*******************************************************************************
3279 *
3280 *   StreamCacheSetup (bsp, slp, flags, scp)
3281 *   StreamCacheGetResidue (scp)
3282 *   StreamCacheSetPosition (scp, pos)
3283 *       SeqPort functional replacement implemented on top of SeqPortStreams
3284 *
3285 ********************************************************************************/
3286 
StreamCacheSetup(BioseqPtr bsp,SeqLocPtr slp,StreamFlgType flags,StreamCache PNTR scp)3287 NLM_EXTERN Boolean StreamCacheSetup (
3288   BioseqPtr bsp,
3289   SeqLocPtr slp,
3290   StreamFlgType flags,
3291   StreamCache PNTR scp
3292 )
3293 
3294 {
3295   if (bsp == NULL && slp == NULL) return FALSE;
3296   if (scp == NULL) return FALSE;
3297 
3298   MemSet ((Pointer) scp, 0, sizeof (StreamCache));
3299 
3300   if (bsp != NULL) {
3301     scp->bsp = bsp;
3302     scp->length = bsp->length;
3303   } else {
3304     scp->slp = slp;
3305     scp->length = SeqLocLen (slp);
3306   }
3307   scp->flags = flags;
3308 
3309   return TRUE;
3310 }
3311 
StreamCacheRefreshBuffer(StreamCache PNTR scp)3312 static Boolean StreamCacheRefreshBuffer (
3313   StreamCache PNTR scp
3314 )
3315 
3316 {
3317   Bioseq         bsq;
3318   Int4           count;
3319   StreamFlgType  flags;
3320   SeqLocPtr      loc;
3321   SeqLoc         sl;
3322   SeqLocPtr      slp;
3323   Int4           stop;
3324 
3325   if (scp == NULL) return FALSE;
3326 
3327   if (scp->ctr >= scp->total) {
3328     scp->offset += (Int4) scp->total;
3329     scp->ctr = 0;
3330     scp->total = 0;
3331 
3332     MemSet ((Pointer) &(scp->buf), 0, sizeof (scp->buf));
3333 
3334     if (scp->offset < 0 || scp->offset >= scp->length) return FALSE;
3335 
3336     stop = MIN (scp->offset + 4000L, scp->length);
3337 
3338     flags = scp->flags;
3339     if ((flags & STREAM_GAP_MASK) == GAP_TO_SINGLE_DASH || (flags & STREAM_GAP_MASK) == 0) {
3340       /* if expand_gaps_to_dashes not equal to gaps_to_single_dash + stream_gap_mask, need to clear other bits first */
3341       flags |= EXPAND_GAPS_TO_DASHES;
3342     }
3343     if ((flags & SUPPRESS_VIRT_SEQ) != 0) {
3344       flags ^= SUPPRESS_VIRT_SEQ;
3345       flags |= STREAM_VIRT_AS_PLUS;
3346     }
3347 
3348     if (scp->bsp != NULL) {
3349 
3350       count = SeqPortStreamInt (scp->bsp, scp->offset, stop - 1, Seq_strand_plus,
3351                                 flags, (Pointer) &(scp->buf), NULL);
3352       if (count < 0) {
3353         scp->failed = TRUE;
3354       }
3355 
3356     } else if (scp->slp != NULL) {
3357 
3358       slp = scp->slp;
3359       MemSet ((Pointer) &bsq, 0, sizeof (Bioseq));
3360       MemSet ((Pointer) &sl, 0, sizeof (SeqLoc));
3361       bsq.repr = Seq_repr_seg;
3362       bsq.mol = Seq_mol_na;
3363       bsq.seq_ext_type = 1;
3364       bsq.length = SeqLocLen (slp);
3365       bsq.seq_ext = &sl;
3366       if (slp->choice == SEQLOC_MIX || slp->choice == SEQLOC_PACKED_INT) {
3367         loc = (SeqLocPtr) slp->data.ptrvalue;
3368         if (loc != NULL) {
3369           sl.choice = loc->choice;
3370           sl.data.ptrvalue = (Pointer) loc->data.ptrvalue;
3371           sl.next = loc->next;
3372         }
3373       } else {
3374         sl.choice = slp->choice;
3375         sl.data.ptrvalue = (Pointer) slp->data.ptrvalue;
3376         sl.next = NULL;
3377       }
3378 
3379       SeqPortStreamInt (&bsq, scp->offset, stop - 1, Seq_strand_plus,
3380                         flags, (Pointer) &(scp->buf), NULL);
3381     }
3382 
3383     scp->total = StringLen (scp->buf);
3384   }
3385 
3386   return TRUE;
3387 }
3388 
StreamCacheGetResidue(StreamCache PNTR scp)3389 NLM_EXTERN Uint1 StreamCacheGetResidue (
3390   StreamCache PNTR scp
3391 )
3392 
3393 {
3394   Uint1  residue = '\0';
3395 
3396   if (scp == NULL) return '\0';
3397 
3398   if (scp->ctr >= scp->total) {
3399     if (! StreamCacheRefreshBuffer (scp)) return '\0';
3400   }
3401 
3402   if (scp->ctr < scp->total) {
3403     residue = scp->buf [(int) scp->ctr];
3404     (scp->ctr)++;
3405 
3406     if (residue == '-') {
3407 
3408       if ((scp->flags & STREAM_GAP_MASK) == 0) {
3409         while (residue == '-') {
3410           if (scp->ctr >= scp->total) {
3411             if (! StreamCacheRefreshBuffer (scp)) return '\0';
3412           }
3413 
3414           while (scp->ctr < scp->total && residue == '-') {
3415             residue = scp->buf [(int) scp->ctr];
3416             (scp->ctr)++;
3417           }
3418         }
3419         if (residue == '-') return '\0';
3420 
3421       } else if ((scp->flags & STREAM_GAP_MASK) == GAP_TO_SINGLE_DASH) {
3422 
3423         while (residue == '-') {
3424           if (scp->ctr >= scp->total) {
3425             if (! StreamCacheRefreshBuffer (scp)) return '-';
3426           }
3427 
3428           while (scp->ctr < scp->total && residue == '-') {
3429             residue = scp->buf [(int) scp->ctr];
3430             if (residue != '-') return '-';
3431             (scp->ctr)++;
3432           }
3433         }
3434       }
3435 
3436     } else if (residue == '+') {
3437 
3438       if ((scp->flags & SUPPRESS_VIRT_SEQ) != 0) {
3439         while (residue == '+') {
3440           if (scp->ctr >= scp->total) {
3441             if (! StreamCacheRefreshBuffer (scp)) return '\0';
3442           }
3443 
3444           while (scp->ctr < scp->total && residue == '+') {
3445             residue = scp->buf [(int) scp->ctr];
3446             (scp->ctr)++;
3447           }
3448         }
3449         if (residue == '+') return '\0';
3450       }
3451     }
3452   }
3453 
3454   return residue;
3455 }
3456 
StreamCacheSetPosition(StreamCache PNTR scp,Int4 pos)3457 NLM_EXTERN Boolean StreamCacheSetPosition (
3458   StreamCache PNTR scp,
3459   Int4 pos
3460 )
3461 
3462 {
3463   if (scp == NULL) return FALSE;
3464 
3465   if (scp->offset <= pos && scp->offset + (Int4) scp->total >= pos) {
3466     scp->ctr = (Int2) (pos - scp->offset);
3467     return TRUE;
3468   }
3469 
3470   scp->ctr = 0;
3471   scp->total = 0;
3472   scp->offset = pos;
3473 
3474   if (scp->offset < 0 || scp->offset >= scp->length) {
3475     scp->offset = 0;
3476     return FALSE;
3477   }
3478 
3479   return TRUE;
3480 }
3481 
3482 /*******************************************************************************
3483 *
3484 *    ProteinFromCdRegionEx ( SeqFeatPtr sfp, Boolean include_stop, Boolean remove_trailingX)
3485 *        replacement for old ProteinFromCdRegionEx, but using TransTableTranslateCdRegion.
3486 *
3487 ********************************************************************************/
3488 
ProteinFromCdRegionExEx(SeqFeatPtr sfp,Boolean include_stop,Boolean remove_trailingX,BoolPtr altStartP,Boolean farProdFetchOK)3489 NLM_EXTERN ByteStorePtr ProteinFromCdRegionExEx (SeqFeatPtr sfp, Boolean include_stop, Boolean remove_trailingX, BoolPtr altStartP, Boolean farProdFetchOK)
3490 
3491 {
3492   ByteStorePtr   bs;
3493   CdRegionPtr    crp;
3494   Int2           genCode = 0;
3495   Char           str [32];
3496   Boolean        tableExists = FALSE;
3497   TransTablePtr  tbl = NULL;
3498   ValNodePtr     vnp;
3499 
3500   if (sfp == NULL || sfp->data.choice != SEQFEAT_CDREGION) return NULL;
3501   crp = (CdRegionPtr) sfp->data.value.ptrvalue;
3502   if (crp == NULL) return NULL;
3503 
3504   /* find genetic code */
3505 
3506   if (crp->genetic_code != NULL) {
3507     vnp = (ValNodePtr) crp->genetic_code->data.ptrvalue;
3508     while (vnp != NULL) {
3509       if (vnp->choice == 2) {
3510         genCode = (Int2) vnp->data.intvalue;
3511       }
3512       vnp = vnp->next;
3513     }
3514   }
3515 
3516   if (genCode == 7) {
3517     genCode = 4;
3518   } else if (genCode == 8) {
3519     genCode = 1;
3520   } else if (genCode == 0) {
3521     genCode = 1;
3522   }
3523 
3524   /* set app property name for storing desired FSA */
3525 
3526   sprintf (str, "TransTableFSAforGenCode%d", (int) genCode);
3527 
3528   /* get FSA for desired genetic code if it already exists */
3529 
3530   tbl = (TransTablePtr) GetAppProperty (str);
3531   tableExists = (Boolean) (tbl != NULL);
3532 
3533   bs = TransTableTranslateCdRegionEx (&tbl, sfp, include_stop, remove_trailingX,
3534                                       FALSE, altStartP, farProdFetchOK);
3535 
3536   /* save FSA in genetic code-specific app property name */
3537 
3538   if (! tableExists) {
3539     SetAppProperty (str, (Pointer) tbl);
3540   }
3541 
3542   return bs;
3543 }
3544 
ProteinFromCdRegionEx(SeqFeatPtr sfp,Boolean include_stop,Boolean remove_trailingX)3545 NLM_EXTERN ByteStorePtr ProteinFromCdRegionEx (SeqFeatPtr sfp, Boolean include_stop, Boolean remove_trailingX)
3546 
3547 {
3548   return ProteinFromCdRegionExEx (sfp, include_stop, remove_trailingX, NULL, TRUE);
3549 }
3550 
ProteinFromCdRegionExWithTrailingCodonHandling(SeqFeatPtr sfp,Boolean include_stop,Boolean remove_trailingX,Boolean no_stop_at_end_of_complete_cds)3551 NLM_EXTERN ByteStorePtr ProteinFromCdRegionExWithTrailingCodonHandling
3552 (
3553   SeqFeatPtr sfp,
3554   Boolean include_stop,
3555   Boolean remove_trailingX,
3556   Boolean no_stop_at_end_of_complete_cds
3557 )
3558 
3559 {
3560   ByteStorePtr   bs;
3561   CdRegionPtr    crp;
3562   Int2           genCode = 0;
3563   Char           str [32];
3564   Boolean        tableExists = FALSE;
3565   TransTablePtr  tbl = NULL;
3566   ValNodePtr     vnp;
3567 
3568   if (sfp == NULL || sfp->data.choice != SEQFEAT_CDREGION) return NULL;
3569   crp = (CdRegionPtr) sfp->data.value.ptrvalue;
3570   if (crp == NULL) return NULL;
3571 
3572   /* find genetic code */
3573 
3574   if (crp->genetic_code != NULL) {
3575     vnp = (ValNodePtr) crp->genetic_code->data.ptrvalue;
3576     while (vnp != NULL) {
3577       if (vnp->choice == 2) {
3578         genCode = (Int2) vnp->data.intvalue;
3579       }
3580       vnp = vnp->next;
3581     }
3582   }
3583 
3584   if (genCode == 7) {
3585     genCode = 4;
3586   } else if (genCode == 8) {
3587     genCode = 1;
3588   } else if (genCode == 0) {
3589     genCode = 1;
3590   }
3591 
3592   /* set app property name for storing desired FSA */
3593 
3594   sprintf (str, "TransTableFSAforGenCode%d", (int) genCode);
3595 
3596   /* get FSA for desired genetic code if it already exists */
3597 
3598   tbl = (TransTablePtr) GetAppProperty (str);
3599   tableExists = (Boolean) (tbl != NULL);
3600 
3601   bs = TransTableTranslateCdRegion (&tbl, sfp, include_stop, remove_trailingX,
3602                                     no_stop_at_end_of_complete_cds);
3603 
3604   /* save FSA in genetic code-specific app property name */
3605 
3606   if (! tableExists) {
3607     SetAppProperty (str, (Pointer) tbl);
3608   }
3609 
3610   return bs;
3611 }
3612 
3613 NLM_EXTERN Uint1 AAForCodon (Uint1Ptr codon, CharPtr codes);
3614 
3615 /*****************************************************************************
3616 *
3617 *   ProteinFromCdRegion(sfp, include_stop)
3618 *       produces a ByteStorePtr containing the protein sequence in
3619 *   ncbieaa code for the CdRegion sfp.  If include_stop, will translate
3620 *   through stop codons.  If NOT include_stop, will stop at first stop
3621 *   codon and return the protein sequence NOT including the terminating
3622 *   stop.  Supports reading frame, alternate genetic codes, and code breaks
3623 *   in the CdRegion. Removes trailing "X" from partial translation.
3624 *
3625 *****************************************************************************/
ProteinFromCdRegion(SeqFeatPtr sfp,Boolean include_stop)3626 NLM_EXTERN ByteStorePtr ProteinFromCdRegion(SeqFeatPtr sfp, Boolean include_stop)
3627 {
3628     return ProteinFromCdRegionEx(sfp, include_stop, TRUE);
3629 }
3630 
3631 
3632 /* old version of ProteinFromCdRegionEx no longer compiled (below) */
3633 
3634 #if 0
/* Everything down to the matching #endif is excluded from compilation. */
/* It is the earlier SeqPort-based translation, kept for reference only. */
3635 /*******************************************************************************
*
*    Old_ProteinFromCdRegionEx (SeqFeatPtr sfp, Boolean include_stop, Boolean remove_trailingX)
*        same behavior as ProteinFromCdRegion, but another Boolean remove_trailingX
*    specifies whether trailing X's should be removed.
*        Returns the protein as a ByteStorePtr, or NULL if sfp is NULL, is not
*    a choice 3 feature, or the translation ends up empty or cannot be set up.
*
********************************************************************************/
3642
3643 NLM_EXTERN ByteStorePtr Old_ProteinFromCdRegionEx (SeqFeatPtr sfp, Boolean include_stop, Boolean remove_trailingX)
3644 {
3645     SeqPortPtr spp = NULL;
3646     ByteStorePtr bs = NULL;
3647     Uint1 residue = 0;
3648     Int4 pos1, pos2, pos, len;
3649     Int4Ptr the_breaks = NULL;
3650     Uint1Ptr the_residues = NULL;
3651     Int2 num_code_break = 0, use_break;
3652     SeqLocPtr tmp;
3653     Int2 i;
3654     Uint1 codon[3], aa;
3655     CdRegionPtr crp;
3656     ValNodePtr vnp;
3657     GeneticCodePtr gcp;
3658     CharPtr vals, codes;
3659     CodeBreakPtr cbp;
3660     Boolean bad_base, no_start, check_start, got_stop;
3661     Uint2 part_prod = 0, part_loc = 0;
3662     Boolean incompleteLastCodon;
3663
    /* presumably 3 is SEQFEAT_CDREGION, which the current implementation tests by name - confirm */
3664     if ((sfp == NULL) || (sfp->data.choice != 3))
3665         return NULL;
3666
    /* NOTE(review): crp is dereferenced below without a NULL check */
3667     crp = (CdRegionPtr) sfp->data.value.ptrvalue;
3668     len = SeqLocLen(sfp->location);
3669
    /* first pass counts the code breaks so the two parallel arrays can be sized */
3670     num_code_break = 0;
3671     if (crp->code_break != NULL)
3672     {
3673         cbp = crp->code_break;
3674         while (cbp != NULL)
3675         {
3676             num_code_break++;
3677             cbp = cbp->next;
3678         }
3679         the_breaks = (Int4Ptr) MemNew((size_t)(num_code_break * sizeof(Int4)));
3680         the_residues = (Uint1Ptr) MemNew((size_t)(num_code_break * sizeof(Uint1)));
3681
        /* second pass keeps, for each usable code break, its offset within */
        /* the coding region and the residue to substitute there */
3682         num_code_break = 0;
3683         cbp = crp->code_break;
3684         while (cbp != NULL)
3685         {
            /* pos1 = smallest start offset, pos2 = largest stop offset over all pieces of the break */
3686             pos1 = INT4_MAX;
3687             pos2 = -10;
3688             tmp = NULL;
3689             while ((tmp = SeqLocFindNext(cbp->loc, tmp)) != NULL)
3690             {
3691                 pos = GetOffsetInLoc(tmp, sfp->location,
3692 SEQLOC_START);
3693                 if (pos < pos1)
3694                     pos1 = pos;
3695                 pos = GetOffsetInLoc(tmp, sfp->location,
3696 SEQLOC_STOP);
3697                 if (pos > pos2)
3698                     pos2 = pos;
3699             }
3700             pos = pos2 - pos1; /* codon length */
3701             if (pos == 2 || (pos >= 0 && pos <= 1 && pos2 == len - 1))   /*  a codon */
3702             /* allowing a partial codon at the end */
3703             {
3704                 the_breaks[num_code_break] = pos1;
3705                 the_residues[num_code_break] = (Uint1)
3706 cbp->aa.value.intvalue;
3707                 num_code_break++;
3708             }
3709             else
3710             {
3711                 ErrPost(CTX_NCBIOBJ, 1, "Invalid Code-break.loc");
3712             }
3713
3714             cbp = cbp->next;
3715         }
3716     }
3717
    /* look up the genetic code by the first usable name or id in the CdRegion */
3718     gcp = NULL;
3719     if (crp->genetic_code != NULL)
3720     {
3721         vnp = (ValNodePtr)(crp->genetic_code->data.ptrvalue);
3722         while ((vnp != NULL) && (gcp == NULL))
3723         {
3724             switch (vnp->choice)
3725             {
3726             case 1:   /* name */
3727                 gcp = GeneticCodeFind(0,
3728 (CharPtr)vnp->data.ptrvalue);
3729                 break;
3730             case 2:   /* id */
3731                 gcp = GeneticCodeFind(vnp->data.intvalue, NULL);
3732                 break;
3733             case 3:   /* ncbieaa */
3734             case 6:   /* sncbieaa */
3735             case 4:   /* ncbi8aa */
3736             case 5:      /* ncbistdaa */
3737             case 7:   /* sncbi8aa */
3738             case 8:   /* sncbistdaa */
3739             default:
3740                 break;
3741             }
3742             vnp = vnp->next;
3743         }
3744     }
3745     if (gcp == NULL)
3746         gcp = GeneticCodeFind(1, NULL);   /* use universal */
3747     if (gcp == NULL)
3748         goto erret;
3749
    /* vals = table used for the first codon, codes = table used for all other codons */
3750     vals = NULL;
3751     codes = NULL;
3752     for (vnp = (ValNodePtr)gcp->data.ptrvalue; vnp != NULL; vnp = vnp->next)
3753     {
3754         if (vnp->choice == 6)   /* sncbieaa */
3755             vals = (CharPtr)vnp->data.ptrvalue;
3756         else if (vnp->choice == 3)  /* ncbieaa */
3757             codes = (CharPtr)vnp->data.ptrvalue;
3758     }
3759     if (codes == NULL)
3760         goto erret;
3761
    /* a location or product that is partial at its start is translated without start codon handling */
3762     no_start = FALSE;
3763     part_loc = SeqLocPartialCheck(sfp->location);
3764     part_prod = SeqLocPartialCheck(sfp->product);
3765     if ((part_loc & SLP_START) || (part_prod & SLP_START))
3766         no_start = TRUE;
3767
3768     if ((vals == NULL) || (no_start) || (crp->frame > 1))  /* no special starts */
3770     {
3771         vals = codes;
3772         check_start = FALSE;
3773     }
3774     else
3775         check_start = TRUE;
3776
    /* the coding region is read through a SeqPort in ncbi4na code */
3777     spp = SeqPortNewByLoc(sfp->location, Seq_code_ncbi4na);
3778     if (spp == NULL)
3779         goto erret;
3780
3781     /* len = SeqLocLen(sfp->location); - saved above */    /* size of coding region */
3782     len /= 3;                           /* size of protein */
3784     len += 1;                           /* allow partial codon at end */
3786     bs = BSNew(len);
3787     if (bs == NULL)
3788         goto erret;
3789
3790     if (crp->frame == 2)     /* skip partial first codon */
3791         pos = 1;
3792     else if (crp->frame == 3)
3793         pos = 2;
3794     else
3795         pos = 0;
3796     SeqPortSeek(spp, pos, SEEK_SET);
3797     got_stop = FALSE;
3798
3799     incompleteLastCodon = FALSE;
3800
    /* each pass of the loop translates one codon starting at offset pos */
3801     do
3802     {
        /* a code break recorded at this offset overrides the table lookup */
3803         use_break = -1;
3804         for (i = 0; i < num_code_break; i++)
3805         {
3806             if (pos == the_breaks[i])
3807             {
3808                 use_break = i;
3809                 i = num_code_break;
3810             }
3811         }
3812
3813         bad_base = FALSE;
3814         for (i = 0; i < 3; i++)
3815         {
3816             residue = SeqPortGetResidue(spp);
3817             if (residue == SEQPORT_VIRT || residue == SEQPORT_EOS) {
3818                 /* skip past null NULL in seqport, get next - JK */
3819                 residue = SeqPortGetResidue(spp);
3820             }
3821             if (residue == SEQPORT_EOF)
3822                 break;
3823             if (residue == INVALID_RESIDUE)
3824                 bad_base = TRUE;
3825             codon[i] = residue;
3826         }
3827         if (! i)   /* no bases */
3828             break;
3829         while (i < 3)      /* incomplete last codon */
3830         {
3831             codon[i] = 15;   /* N */
3832             i++;
3833             incompleteLastCodon = TRUE;
3834         }
3835
3836         pos += 3;
3837         if (use_break >= 0)
3838             aa = the_residues[use_break];
3839         else if (bad_base)
3840             aa = 'X';
3841         else
3842         {
3843             aa = AAForCodon(codon, vals);
3844             if (check_start)   /* first codon on possibly complete CDS */
3846             {
3847                 if (aa == '-')   /* invalid start */
3848                 {
3849                     /* if no explicit partial at either end, but feature is */
3851                     /* annotated as partial, then guess should use internal */
3853                     /* amino acid code */
3854
3855                     if ((! ((part_loc & SLP_STOP) ||
3856 (part_prod & SLP_STOP))) &&
3857                         (sfp->partial))
3858                         aa = AAForCodon(codon, codes);
3859 /* get internal aa */
3860                 }
3861                 check_start = FALSE;
3862             }
3863         }
3864
3865         if ((! include_stop) && (aa == '*'))
3866         {
3867             got_stop = TRUE;
3868             break;
3869         }
3870
3871         BSPutByte(bs, (Int2)aa);
3872
3873         vals = codes;     /* not a start codon anymore */
3874
3875     } while (residue != SEQPORT_EOF);
3876
3877     if ((! got_stop) && incompleteLastCodon) {
3878         BSSeek(bs, -1, SEEK_END);  /* remove last X if incomplete last codon */
3879         aa = (Uint1)BSGetByte(bs);
3880         if ((aa == 'X') && (BSLen(bs)))
3881         {
3882             BSSeek(bs, -1, SEEK_END);
3883             BSDelete(bs, 1);
3884             BSSeek(bs, -1, SEEK_END);
3885         }
3886     }
3887     if ((! got_stop) && remove_trailingX)   /* only remove trailing X on partial CDS */
3888     {
3889         BSSeek(bs, -1, SEEK_END);  /* back up to last residue */
3890         aa = (Uint1)BSGetByte(bs);
3891         while ((aa == 'X') && (BSLen(bs)))
3892         {
3893             BSSeek(bs, -1, SEEK_END);
3894             BSDelete(bs, 1);
3895             BSSeek(bs, -1, SEEK_END);
3896             aa = (Uint1)BSGetByte(bs);
3897         }
3898     }
3899
    /* an empty translation is treated as a failure */
3900     if (! BSLen(bs)) goto erret;
3901
    /* common exit releases the port and both arrays; erret frees the result first, then joins it */
3902 ret:
3903     SeqPortFree(spp);
3904     MemFree(the_breaks);
3905     MemFree(the_residues);
3906     return bs;
3907 erret:
3908     bs = BSFree(bs);
3909     goto ret;
3910 }
3911 #endif
3912 
3913 /* old version of ProteinFromCdRegionEx no longer compiled (above) */
3914 
3915 
3916 /*****************************************************************************
3917 *
3918 *   Uint1 AAForCodon (Uint1Ptr codon, CharPtr codes)
3919 *       codon is 3 values in ncbi4na code
3920 *       codes is the geneic code array to use
3921 *          MUST have 'X' as unknown amino acid
3922 *
3923 *****************************************************************************/
AAForCodon(Uint1Ptr codon,CharPtr codes)3924 NLM_EXTERN Uint1 AAForCodon (Uint1Ptr codon, CharPtr codes)
3925 {
3926     register Uint1 aa = 0, taa;
3927     register int i, j, k, index0, index1, index2;
3928     static Uint1 mapping[4] = { 8,     /* T in ncbi4na */
3929                                 2,     /* C */
3930                                 1,     /* A */
3931                                 4 };   /* G */
3932 
3933 
3934     for (i = 0; i < 4; i++)
3935     {
3936         if (codon[0] & mapping[i])
3937         {
3938             index0 = i * 16;
3939             for (j = 0; j < 4; j++)
3940             {
3941                 if (codon[1] & mapping[j])
3942                 {
3943                     index1 = index0 + (j * 4);
3944                     for (k = 0; k < 4; k++)
3945                     {
3946                         if (codon[2] & mapping[k])
3947                         {
3948                             index2 = index1 + k;
3949                             taa = codes[index2];
3950                             if (! aa)
3951                                 aa = taa;
3952                             else
3953                             {
3954                                 if (taa != aa)
3955                                 {
3956                                     aa =
3957 'X';
3958                                     break;
3959                                 }
3960                             }
3961                         }
3962                         if (aa == 'X')
3963                             break;
3964                     }
3965                 }
3966                 if (aa == 'X')
3967                     break;
3968             }
3969         }
3970         if (aa == 'X')
3971             break;
3972     }
3973     return aa;
3974 }
3975 
3976 static    Uint1 codon_xref [4] = {   /* mapping from NCBI2na to codon codes */
3977         2,  /* A */
3978         1,  /* C */
3979         3,  /* G */
3980         0 }; /* T */
3981 
3982 /*****************************************************************************
3983 *
3984 *   Uint1 IndexForCodon (codon, code)
3985 *       returns index into genetic codes codon array, give 3 bases of the
3986 *       codon in any alphabet
3987 *       returns INVALID_RESIDUE on failure
3988 *
3989 *****************************************************************************/
IndexForCodon(Uint1Ptr codon,Uint1 code)3990 NLM_EXTERN Uint1 IndexForCodon (Uint1Ptr codon, Uint1 code)
3991 {
3992     Int2 i, j;
3993     SeqMapTablePtr smtp;
3994     Uint1 residue, index = 0;
3995 
3996     smtp = SeqMapTableFind(Seq_code_ncbi2na, code);
3997     if (smtp == NULL) return INVALID_RESIDUE;
3998 
3999     for (i=0, j=16; i < 3; i++, j /= 4)
4000     {
4001         residue = SeqMapTableConvert(smtp, codon[i]);
4002         if (residue > 3) return INVALID_RESIDUE;
4003         residue = codon_xref[residue];
4004         index += (Uint1)(residue * j);
4005     }
4006 
4007     return index;
4008 }
4009 
4010 /*****************************************************************************
4011 *
4012 *   Boolean CodonForIndex (index, code, codon)
4013 *       Fills codon (3 Uint1 array) with codon corresponding to index,
4014 *       in sequence alphabet code.
4015 *       Index is the Genetic code index.
4016 *       returns TRUE on success.
4017 *
4018 *****************************************************************************/
CodonForIndex(Uint1 index,Uint1 code,Uint1Ptr codon)4019 NLM_EXTERN Boolean CodonForIndex (Uint1 index, Uint1 code, Uint1Ptr codon)
4020 {
4021     Int2 i, j, k;
4022     SeqMapTablePtr smtp;
4023     Uint1 residue;
4024 
4025     if (codon == NULL) return FALSE;
4026     if (index > 63) return FALSE;
4027 
4028     smtp = SeqMapTableFind(code, Seq_code_ncbi2na);
4029     if (smtp == NULL) return FALSE;
4030 
4031     for (i = 0, j = 16; i < 3; i++, j /= 4)
4032     {
4033         residue = (Uint1)((Int2)index / j);
4034         index -= (Uint1)(residue * j);
4035         for (k = 0; k < 4; k++)
4036         {
4037             if (codon_xref[k] == residue)
4038             {
4039                 residue = (Uint1)k;
4040                 break;
4041             }
4042         }
4043         residue = SeqMapTableConvert(smtp, residue);
4044         codon[i] = residue;
4045     }
4046 
4047     return TRUE;
4048 }
4049 
4050 /*----------- GetFrameFromLoc()-----------------*/
4051 
4052 /*****************************************************************************
4053 *
4054 *   Int2 GetFrameFromLoc (slp)
4055 *       returns 1,2,3 if can find the frame
4056 *       0 if not
4057 *
4058 *****************************************************************************/
GetFrameFromLoc(SeqLocPtr slp)4059 NLM_EXTERN Uint1 GetFrameFromLoc (SeqLocPtr slp)
4060 {
4061     Uint1 frame = 0;
4062     SeqLocPtr curr, last;
4063     Boolean is_partial;
4064     SeqIntPtr sip;
4065     SeqPntPtr spp;
4066 
4067     if (slp == NULL)
4068         return frame;
4069 
4070     curr = SeqLocFindNext(slp, NULL);
4071 
4072     is_partial = FALSE;
4073     switch (curr->choice)
4074     {
4075         case SEQLOC_INT:
4076             sip = (SeqIntPtr)curr->data.ptrvalue;
4077             if (sip->strand == Seq_strand_minus)
4078             {
4079                 if (sip->if_to != NULL)
4080                     is_partial = TRUE;
4081             }
4082             else if (sip->if_from != NULL)
4083                 is_partial = TRUE;
4084             break;
4085         case SEQLOC_PNT:
4086             spp = (SeqPntPtr)curr->data.ptrvalue;
4087             if (spp->fuzz != NULL)
4088                 is_partial = TRUE;
4089             break;
4090         default:
4091             return frame;
4092     }
4093 
4094 
4095     if (! is_partial)
4096         return (Int2) 1;    /* complete 5' end, it's frame 1 */
4097 
4098     is_partial = FALSE;
4099     last = curr;
4100     while ((curr = SeqLocFindNext(slp, last)) != NULL)
4101         last = curr;
4102 
4103     switch (last->choice)
4104     {
4105         case SEQLOC_INT:
4106             sip = (SeqIntPtr) last->data.ptrvalue;
4107             if (sip->strand == Seq_strand_minus)
4108             {
4109                 if (sip->if_from != NULL)
4110                     return frame;
4111             }
4112             else if (sip->if_to != NULL)
4113                 return frame;
4114             break;
4115         case SEQLOC_PNT:
4116             spp = (SeqPntPtr) last->data.ptrvalue;
4117             if (spp->fuzz != NULL)
4118                 return frame;
4119             break;
4120         default:
4121             return frame;
4122     }
4123 
4124                       /* have complete last codon, get frame
4125 from length */
4126     frame = (Uint1)(SeqLocLen(slp) % 3);
4127     if (frame == 0)
4128         frame = 1;
4129     else if (frame == 1)
4130         frame = 2;
4131     else
4132         frame = 3;
4133 
4134     return frame;
4135 }
4136 
/*****************************************************************************
*
*   static Boolean add_fuzziness_to_loc (slp, less)
*       attaches a newly allocated IntFuzz with choice 4 to slp, which
*       must be a SEQLOC_INT or SEQLOC_PNT
*       (choice 4 is "lim" and the values 2 / 1 are "lt" / "gt" per the
*       NCBI Int-fuzz definition -- confirm against the ASN.1 spec)
*
*       less == TRUE  : a = 2, stored in if_from of an interval
*       less == FALSE : a = 1, stored in if_to of an interval
*       for a point the fuzz is stored in its single fuzz slot either way
*
*       returns FALSE, changing nothing, if slp is NULL, is of another
*       type, has no data attached, or the IntFuzz cannot be allocated
*
*       NOTE(review): a fuzz already present in the target slot is
*       overwritten without being freed.
*
*****************************************************************************/
static Boolean add_fuzziness_to_loc (SeqLocPtr slp, Boolean less)
{
    IntFuzzPtr ifp;
    SeqIntPtr sint = NULL;
    SeqPntPtr spnt = NULL;

    if (slp == NULL)
        return FALSE;

    if (slp->choice == SEQLOC_INT)
        sint = (SeqIntPtr) slp->data.ptrvalue;
    else if (slp->choice == SEQLOC_PNT)
        spnt = (SeqPntPtr) slp->data.ptrvalue;
    else
        return FALSE;

    /* nothing to attach the fuzz to: bail out before allocating it */
    if ((sint == NULL) && (spnt == NULL))
        return FALSE;

    ifp = IntFuzzNew();
    if (ifp == NULL)
        return FALSE;
    ifp->choice = 4;
    ifp->a = less ? 2 : 1;

    if (spnt != NULL)
        spnt->fuzz = ifp;
    else if (less)
        sint->if_from = ifp;
    else
        sint->if_to = ifp;

    return TRUE;
}
4171 
4172 
/*****************************************************************************
*
*   static Boolean load_fuzz_to_DNA (dnaLoc, aaLoc, first)
*       Carries a choice-4 fuzz from one end of aaLoc over to dnaLoc.
*
*       first == TRUE  examines the leading end of aaLoc and requires a
*                      fuzz value of 2;
*       first == FALSE examines the trailing end and requires a value of 1.
*       For an interval the leading end is if_from, unless aaLoc is on the
*       minus strand, in which case the roles of if_from / if_to swap.
*       For a point the single fuzz slot is examined.
*
*       When the required fuzz is present, add_fuzziness_to_loc() is called
*       on dnaLoc (with the sense inverted if dnaLoc is on the minus
*       strand) and TRUE is returned; otherwise returns FALSE.
*
*****************************************************************************/
static Boolean load_fuzz_to_DNA(SeqLocPtr dnaLoc, SeqLocPtr aaLoc, Boolean
first)
{
    IntFuzzPtr fuzz = NULL;
    SeqIntPtr  aa_int;
    SeqPntPtr  aa_pnt;
    Uint1      aa_strand;
    Int4       wanted = 0;      /* fuzz value that triggers the transfer */
    Boolean    less;

    aa_strand = SeqLocStrand(aaLoc);

    switch (aaLoc->choice)
    {
        case SEQLOC_INT:
            aa_int = (SeqIntPtr) aaLoc->data.ptrvalue;
            if (first ? (aa_strand != Seq_strand_minus)
                      : (aa_strand == Seq_strand_minus))
            {
                fuzz = aa_int->if_from;
                wanted = 2;
            }
            else
            {
                fuzz = aa_int->if_to;
                wanted = 1;
            }
            break;

        case SEQLOC_PNT:
            aa_pnt = (SeqPntPtr) aaLoc->data.ptrvalue;
            fuzz = aa_pnt->fuzz;
            wanted = first ? 2 : 1;
            break;

        default:
            break;
    }

    if ((fuzz == NULL) || (fuzz->choice != 4) || (fuzz->a != wanted))
        return FALSE;

    /* on the minus strand the ends of dnaLoc are swapped */
    if (SeqLocStrand(dnaLoc) == Seq_strand_minus)
        less = (Boolean)(! first);
    else
        less = first;

    add_fuzziness_to_loc (dnaLoc, less);
    return TRUE;
}
4227 
4228 /******************************************************************
4229 *
4230 *    aaLoc_to_dnaLoc(sfp, aa_loc)
4231 *    map a SeqLoc on the amino acid sequence
4232 *       to a Seq-loc in the    DNA sequence
4233 *       through a CdRegion feature
4234 *
4235 *       This now calls the more general productLoc_to_locationLoc(sfp, productLoc)
4236 *
4237 ******************************************************************/
aaLoc_to_dnaLoc(SeqFeatPtr sfp,SeqLocPtr aa_loc)4238 NLM_EXTERN SeqLocPtr LIBCALL aaLoc_to_dnaLoc(SeqFeatPtr sfp, SeqLocPtr aa_loc)
4239 {
4240     return productLoc_to_locationLoc(sfp, aa_loc);
4241 }
4242 
4243 /******************************************************************
4244 *
4245 *    aaLoc_to_dnaLoc(sfp, productLoc)
4246 *    map a SeqLoc on the product sequence
4247 *       to a Seq-loc in the    location sequence
4248 *       through a feature.
4249 *
4250 *       if the feature is a CdRegion, converts by modulo 3
4251 *       to support aaLoc_to_dnaLoc() function
4252 *
4253 ******************************************************************/
productLoc_to_locationLoc(SeqFeatPtr sfp,SeqLocPtr productLoc)4254 NLM_EXTERN SeqLocPtr LIBCALL productLoc_to_locationLoc(SeqFeatPtr sfp, SeqLocPtr productLoc)
4255 {
4256     SeqLocPtr head = NULL, slp, tmp, next;
4257     Int4 product_start, product_stop;
4258     SeqBondPtr sbp;
4259     ValNode vn;
4260     Boolean is_cdregion = FALSE;
4261   Boolean partial5, partial3;
4262 
4263     if ((sfp == NULL) || (productLoc == NULL)) return head;
4264     if (sfp->data.choice == 3) is_cdregion = TRUE;
4265     if (sfp->product == NULL) return head;
4266     if (! (SeqIdForSameBioseq(SeqLocId(productLoc), SeqLocId(sfp->product))))
4267         return head;
4268 
4269     if (productLoc->choice == SEQLOC_BOND)   /* fake this one in */
4270     {
4271         sbp = (SeqBondPtr)(productLoc->data.ptrvalue);
4272         tmp = productInterval_to_locationIntervals(sfp, sbp->a->point, sbp->a->point, FALSE);
4273         if (sbp->b == NULL)  /* one point in bond */
4274             return tmp;
4275 
4276         SeqLocAdd(&head, tmp, TRUE, FALSE);
4277         tmp = productInterval_to_locationIntervals(sfp, sbp->b->point, sbp->b->point, FALSE);
4278         if (tmp == NULL)
4279             return head;
4280 
4281         vn.choice = SEQLOC_NULL;  /* make a mix with an internal NULL */
4282         vn.next = NULL;
4283         vn.data.ptrvalue = NULL;
4284 
4285         SeqLocAdd(&head, &vn, TRUE, TRUE);  /* copy it in */
4286         SeqLocAdd(&head, tmp, TRUE, FALSE); /* put real 3 base int in */
4287 
4288         goto ret;
4289     }
4290 
4291   CheckSeqLocForPartial (productLoc, &partial5, &partial3);
4292     slp = NULL;
4293     while ((slp = SeqLocFindNext(productLoc, slp)) != NULL)
4294     {
4295         product_start = SeqLocStart(slp);
4296         product_stop = SeqLocStop(slp);
4297         if ((product_start >= 0) && (product_stop >= 0))
4298         {
4299            tmp = productInterval_to_locationIntervals(sfp, product_start, product_stop, partial5);
4300            if(tmp != NULL)
4301             load_fuzz_to_DNA(tmp, slp, TRUE);
4302            while (tmp != NULL)
4303            {
4304                next = tmp->next;
4305                tmp->next = NULL;
4306                if(next == NULL)
4307                 load_fuzz_to_DNA(tmp, slp, FALSE);
4308                SeqLocAdd(&head, tmp, TRUE, FALSE);
4309                tmp = next;
4310            }
4311         } else if (slp->choice == SEQLOC_NULL) {
4312             vn.choice = SEQLOC_NULL;  /* make a mix with an internal NULL */
4313             vn.next = NULL;
4314             vn.data.ptrvalue = NULL;
4315             SeqLocAdd(&head, &vn, TRUE, TRUE);
4316         }
4317     }
4318 ret:
4319     return SeqLocPackage(head);
4320 }
4321 
4322 /******************************************************************
4323 *
4324 *       aaFeatLoc_to_dnaFeatLoc(sfp, aa_loc)
4325 *       map a SeqLoc on the amino acid sequence
4326 *       to a Seq-loc in the     DNA sequence
4327 *       through a CdRegion feature
4328 *
4329 *       uses aaLoc_to_dnaLoc() but does additional checks to
4330 *       extend dnaLoc at either end to compensate for positions in
4331 *       the dna which do not corresspond to the amino acid sequence
4332 *       (partial codons which are not translated).
4333 *
4334 ******************************************************************/
aaFeatLoc_to_dnaFeatLoc(SeqFeatPtr sfp,SeqLocPtr aa_loc)4335 NLM_EXTERN SeqLocPtr LIBCALL aaFeatLoc_to_dnaFeatLoc(SeqFeatPtr sfp,
4336                                                      SeqLocPtr aa_loc)
4337 {
4338     SeqLocPtr dnaLoc = NULL;
4339     Uint2 dnaPartial;
4340     Int4 aaPos;
4341     SeqLocPtr tmp1 = NULL, tmp2 = NULL, tmp;
4342     SeqIdPtr sip;
4343     CdRegionPtr crp;
4344     SeqIntPtr sp1, sp2;
4345     BioseqPtr bsp;
4346   Boolean   aa_partialn, aa_partialc;
4347 
4348     dnaLoc = aaLoc_to_dnaLoc(sfp, aa_loc);
4349     if (dnaLoc == NULL) return dnaLoc;
4350 
4351     if (! sfp->partial)  /* no partial checks needed */
4352         return dnaLoc;
4353 
4354 
4355   CheckSeqLocForPartial (aa_loc, &aa_partialn, &aa_partialc);
4356     crp = (CdRegionPtr)(sfp->data.value.ptrvalue);
4357 
4358     aaPos = SeqLocStart(aa_loc);
4359     if ((! aaPos) && (crp->frame > 1) && aa_partialn)   /* using first amino acid */
4360     {
4361         tmp1 = SeqLocFindNext(sfp->location, NULL);
4362         tmp2 = SeqLocFindNext(dnaLoc, NULL);
4363 
4364         if ((tmp1->choice == SEQLOC_INT) &&
4365                          (tmp2->choice == SEQLOC_INT))
4366         {
4367             sp1 = (SeqIntPtr)(tmp1->data.ptrvalue);
4368             sp2 = (SeqIntPtr)(tmp2->data.ptrvalue);
4369             if (sp1->strand ==  Seq_strand_minus)
4370             {
4371                 sp2->to = sp1->to;  /* add partial codon */
4372             }
4373             else
4374             {
4375                 sp2->from = sp1->from;
4376             }
4377         }
4378     }
4379 
4380     dnaPartial = SeqLocPartialCheck(sfp->location);
4381     if ((dnaPartial & SLP_STOP) && aa_partialc)   /* missing 3' end of cdregion */
4382     {
4383         sip = SeqLocId(aa_loc);
4384         bsp = BioseqFindCore(sip);
4385         if (bsp != NULL)
4386         {
4387             aaPos = SeqLocStop(aa_loc);
4388             if (aaPos == (bsp->length - 1)) /* last amino acid */
4389             {
4390                 tmp = NULL;
4391                 while ((tmp = SeqLocFindNext(sfp->location,tmp)) != NULL)
4392                 {
4393                     tmp1 = tmp;
4394                 }
4395                 tmp = NULL;
4396                 while ((tmp = SeqLocFindNext(dnaLoc,tmp)) != NULL)
4397                 {
4398                     tmp2 = tmp;
4399                 }
4400 
4401                 if (tmp1 != NULL && tmp2 != NULL && (tmp1->choice == SEQLOC_INT) &&
4402                     (tmp2->choice == SEQLOC_INT))
4403                 {
4404                     sp1 = (SeqIntPtr)(tmp1->data.ptrvalue);
4405                     sp2 = (SeqIntPtr)(tmp2->data.ptrvalue);
4406                     if (sp1->strand ==  Seq_strand_minus)
4407                     {
4408                         sp2->from = sp1->from;  /* add partial codon */
4409                     }
4410                     else
4411                     {
4412                         sp2->to = sp1->to;
4413                     }
4414                 }
4415             }
4416 
4417         }
4418     }
4419     return dnaLoc;
4420 }
4421 
4422 
/******************************************************************
*
*    NucLocFromProtInterval(cds, prot_start, prot_stop, n_partial)
*       maps the protein interval prot_start..prot_stop (inclusive)
*       through the location of the coding region feature cds and
*       returns a chain of nucleotide intervals, one per interval of
*       cds->location that contributes to the requested range
*
*       returns NULL if cds is NULL, is not a SEQFEAT_CDREGION, has
*       no CdRegion data, if prot_start < 0, or if
*       prot_stop < prot_start
*
*       a CdRegion frame greater than 1 removes frame - 1 leading
*       nucleotides from the first interval before codons are
*       counted; if n_partial is TRUE and the result begins at the
*       start of the first interval, those nucleotides are put back
*
*       codons may straddle two intervals: the nucleotides left over
*       at the end of one interval (prev_nt) are counted together
*       with the next interval
*
******************************************************************/
static SeqLocPtr
NucLocFromProtInterval
(SeqFeatPtr cds,
 Int4 prot_start,
 Int4 prot_stop,
 Boolean n_partial)
{
  CdRegionPtr crp;
  Int4        aa_before = 0, nt_this, prev_nt = 0, part_codon;
  SeqLocPtr   result = NULL;
  SeqLocPtr   slp = NULL; /* used for iterating through locations in the coding region */
  SeqLocPtr   loc; /* used for creating interval on NT sequence */
  Boolean     first_loc = TRUE;
  Int4        cds_int_start, cds_int_stop, cds_int_len;
  Int4        frame_start = 0;
  Int4        aa_int_start = 0, aa_int_stop = 0, aa_len, this_aa, aa_needed, aa_unneeded, aa_accumulated = 0;
  Int4        aa_from_this_interval;
  Uint1       strand;

  if (cds == NULL || cds->data.choice != SEQFEAT_CDREGION || prot_start < 0 || prot_stop < prot_start) {
    return NULL;
  }

  crp = (CdRegionPtr) cds->data.value.ptrvalue;
  if (crp == NULL) {
    return NULL;
  }
  /* frame 2 or 3 means 1 or 2 untranslated nucleotides at the 5' end */
  if (crp->frame > 1) {
    frame_start = crp->frame - 1;
  }

  /* number of amino acids requested */
  aa_len = prot_stop - prot_start + 1;

  while((slp = SeqLocFindNext(cds->location, slp)) != NULL) {
    cds_int_len = SeqLocLen (slp);
    cds_int_start = SeqLocStart (slp);
    cds_int_stop = SeqLocStop (slp);
    strand = SeqLocStrand (slp);

    /* trim the frame offset off the 5' end of the first interval
     * (the right end on the minus strand, the left end otherwise)
     */
    if (first_loc) {
      if (strand == Seq_strand_minus) {
        cds_int_stop -= frame_start;
      } else {
        cds_int_start += frame_start;
      }
      cds_int_len -= frame_start;
    }

    /* calculate the number of NT that "count" for this interval -
      * don't include the NT in a partial codon at the beginning
      * of the feature, but do include NT from a partial codon at
      * the end of the previous interval.
      */
    nt_this = cds_int_len + prev_nt;
    part_codon = nt_this % 3;
    nt_this -= part_codon;

    /* calculate how many AA are covered by this interval */
    this_aa = nt_this / 3;

    /* aa_before is the number of amino acids covered by earlier intervals;
     * only intervals that reach prot_start can contribute
     */
    if (aa_before + this_aa >= prot_start) {

      /* figure out whether to take all of this interval, or just part of it */
      aa_from_this_interval = this_aa;

      /* 5' end (left for plus strand, right for minus) */
      if (aa_before < prot_start) {
        /* skip some at the beginning */
        aa_unneeded = prot_start - aa_before;
        aa_from_this_interval -= aa_unneeded;

        /* prev_nt steps back to where the first codon counted in this
         * interval really begins, then whole unneeded codons are skipped
         */
        if (strand == Seq_strand_minus) {
          aa_int_stop = cds_int_stop + prev_nt - (3 * aa_unneeded);
        } else {
          aa_int_start = cds_int_start - prev_nt + (3 * aa_unneeded);
        }
      } else {
        /* start at the beginning */
        /* note: prev_nt is still 0 whenever first_loc is TRUE, because it
         * is only assigned at the bottom of the loop, after first_loc has
         * been cleared
         */
        if (strand == Seq_strand_minus) {
          aa_int_stop = cds_int_stop;
          if (first_loc) {
            if (n_partial) {
              /* put frame shift back in, if first loc and n-partial */
              aa_int_stop += frame_start;
            } else if (aa_before == prot_start) {
              /* starts in this interval, but after "remainder" of previous codon */
              aa_int_stop -= prev_nt;
            }
          }
        } else {
          aa_int_start = cds_int_start;
          if (first_loc) {
            if (n_partial) {
              /* put frame shift back in, if first loc and n-partial */
              aa_int_start -= frame_start;
            } else if (aa_before == prot_start) {
              /* starts in this interval, but after "remainder" of previous codon */
              aa_int_start += prev_nt;
            }
          }
        }
      }

      /* 3' end (right for plus strand, left for minus) */
      if (aa_accumulated + aa_from_this_interval < aa_len) {
        /* more amino acids are still needed: run to the end of the interval */
        if (strand == Seq_strand_minus) {
          aa_int_start = cds_int_start;
        } else {
          aa_int_stop = cds_int_stop;
        }
      } else {
        /* just take the part that we need */
        aa_needed = aa_len - aa_accumulated;
        aa_unneeded = aa_from_this_interval - aa_needed;

        /* drop the trailing partial codon and the unneeded whole codons */
        if (strand == Seq_strand_minus) {
          aa_int_start = cds_int_start + part_codon + (3 * aa_unneeded);
        } else {
          aa_int_stop = cds_int_stop - part_codon - (3 * aa_unneeded);
        }
        aa_from_this_interval -= aa_unneeded;
      }

      /* note - if aa_int_start > aa_int_stop, that means we eliminated
       * both ends of the interval.
       */
      if (aa_int_start <= aa_int_stop) {
        /* aa_accumulated now includes the number of complete codons that have
        * been accounted for (not counting a partial codon at the end of this
        * interval, if any)
        */
        aa_accumulated += aa_from_this_interval;

        /* add interval to result */
            loc = SeqLocIntNew(aa_int_start, aa_int_stop, strand, SeqLocId(slp));
            SeqLocAdd(&result, loc, TRUE, FALSE);
      }
    }

    /* carry the codon count and the leftover nucleotides to the next interval */
    first_loc = FALSE;
    aa_before += this_aa;
    prev_nt = part_codon;

    /* everything up to prot_stop has been covered */
    if (aa_before > prot_stop) {
      break;
    }
  }

  return result;
}
4573 
4574 
NaLocFromNaInterval(SeqFeatPtr sfp,Int4 product_start,Int4 product_stop)4575 static SeqLocPtr NaLocFromNaInterval (SeqFeatPtr sfp, Int4 product_start, Int4 product_stop)
4576 {
4577   SeqLocPtr slp = NULL;
4578   SeqLocPtr location_loc, loc;            /*for the sfp.location location*/
4579 
4580   Boolean is_end;            /**is the end for process reached?**/
4581   Int4 p_start=0, p_stop=0;        /**product sequence start & stop in defined
4582                     corresponding sfp.product **/
4583   Int4 cur_pos;            /**current sfp.product sequence position in process**/
4584   Int4 product_len;        /**length of the sfp.product **/
4585 
4586   Int4 d_start, d_stop;        /*the start and the stop of the sfp.location sequence*/
4587   Int4 offset;            /*offset from the start of the current exon*/
4588   Int4 aa_len;
4589   Uint1 strand;
4590   Int4 p_end_pos;    /*the end of the product sequence in the current loc*/
4591 
4592   cur_pos= product_start;
4593   product_len = 0;
4594   is_end = FALSE;
4595   p_start = 0;
4596   slp = NULL;
4597   location_loc= NULL;
4598   while(!is_end && ((slp = SeqLocFindNext(sfp->location, slp))!=NULL))
4599   {
4600       product_len += SeqLocLen(slp);
4601     p_stop = product_len - 1;
4602 
4603       p_end_pos = p_stop;
4604 
4605       if(p_stop >= product_stop)
4606       {
4607         p_stop = product_stop;        /**check if the end is reached**/
4608         is_end = TRUE;
4609       }
4610 
4611       if(p_stop >= cur_pos)    /*get the exon*/
4612       {
4613         offset = cur_pos - p_start;
4614 
4615           strand = SeqLocStrand(slp);
4616           if(strand == Seq_strand_minus)
4617             d_start = SeqLocStop(slp) - offset;
4618           else
4619             d_start = SeqLocStart(slp) + offset;
4620 
4621           d_stop = d_start;
4622 
4623       aa_len = MIN(p_stop, product_stop) - cur_pos +1;
4624 
4625       if(strand == Seq_strand_minus)
4626           {
4627               if(aa_len >= 0)
4628               {
4629                   d_stop -= (aa_len - 1);
4630               }
4631               else
4632         {
4633                   ++d_stop;
4634         }
4635 
4636               d_stop = MAX(d_stop, SeqLocStart(slp));
4637               loc = SeqLocIntNew(d_stop, d_start, strand, SeqLocId(slp));
4638           }
4639           else
4640           {
4641               if(aa_len >= 0)
4642               {
4643                   d_stop += (aa_len - 1);
4644               }
4645               else
4646                   --d_stop;
4647 
4648               d_stop = MIN(d_stop, SeqLocStop(slp));
4649               loc = SeqLocIntNew(d_start, d_stop, strand, SeqLocId(slp));
4650           }
4651           SeqLocAdd(&location_loc, loc, TRUE, FALSE);
4652 
4653           cur_pos = p_stop+1;
4654     }
4655 
4656     p_start = p_stop +1;
4657 
4658   }/**end of while(slp && !is_end) **/
4659 
4660   return location_loc;
4661 }
4662 
4663 /******************************************************************
4664 *
4665 *    productInterval_to_locationIntervals(sfp, product_start, product_stop)
4666 *    map the amino acid sequence to a chain of Seq-locs in the
4667 *    DNA sequence through a CdRegion feature
4668 *
4669 ******************************************************************/
4670 NLM_EXTERN SeqLocPtr LIBCALL
productInterval_to_locationIntervals(SeqFeatPtr sfp,Int4 product_start,Int4 product_stop,Boolean aa_partialn)4671 productInterval_to_locationIntervals
4672 (SeqFeatPtr sfp,
4673  Int4 product_start,
4674  Int4 product_stop,
4675  Boolean aa_partialn)
4676 {
4677 
4678   if (sfp->data.choice == SEQFEAT_CDREGION) {
4679     return NucLocFromProtInterval (sfp, product_start, product_stop, aa_partialn);
4680   } else {
4681     return NaLocFromNaInterval (sfp, product_start, product_stop);
4682   }
4683 
4684 }
4685 
/* Prototype for load_fuzz_to_DNA(); the function is defined earlier in this
   file, so this declaration only restates its signature. */
static Boolean load_fuzz_to_DNA PROTO((SeqLocPtr dnaLoc, SeqLocPtr aaLoc,
Boolean first));
4688 /******************************************************************
4689 *
4690 *    dnaLoc_to_aaLoc(sfp, location_loc, merge)
4691 *    map a SeqLoc on the DNA sequence
4692 *       to a Seq-loc in the    protein sequence
4693 *       through a CdRegion feature
4694 *   if (merge) adjacent intervals on the amino acid sequence
4695 *      are merged into one. This should be the usual case.
4696 *
4697 ******************************************************************/
dnaLoc_to_aaLoc(SeqFeatPtr sfp,SeqLocPtr location_loc,Boolean merge,Int4Ptr frame,Boolean allowTerminator)4698 NLM_EXTERN SeqLocPtr LIBCALL dnaLoc_to_aaLoc(SeqFeatPtr sfp, SeqLocPtr location_loc, Boolean
4699 merge, Int4Ptr frame, Boolean allowTerminator)
4700 {
4701     SeqLocPtr aa_loc = NULL, loc;
4702     CdRegionPtr crp;
4703     Int4 product_len, end_pos, frame_offset;
4704     GatherRange gr;
4705     Int4 a_left = 0, a_right, last_aa = -20, aa_from, aa_to;
4706   Int4 cds_left, cds_right;
4707     SeqLocPtr slp;
4708     Int2 cmpval;
4709     SeqIdPtr aa_sip;
4710     BioseqPtr bsp;
4711   Boolean partial5, partial3;
4712   Uint1 strand;
4713 
4714     if ((sfp == NULL) || (location_loc == NULL)) return aa_loc;
4715     if (sfp->data.choice != 3) return aa_loc;
4716     if (sfp->product == NULL) return aa_loc;
4717 
4718     crp = (CdRegionPtr) sfp->data.value.ptrvalue;
4719     if(crp == NULL) return aa_loc;
4720 
4721     /* location_loc must be equal or contained in feature */
4722     cmpval = SeqLocCompare(location_loc, sfp->location);
4723     if (! ((cmpval == SLC_A_IN_B) || (cmpval == SLC_A_EQ_B)))
4724         return aa_loc;
4725 
4726     aa_sip = SeqLocId(sfp->product);
4727     if (aa_sip == NULL) return aa_loc;
4728     bsp = BioseqLockById(aa_sip);
4729     if (bsp == NULL) return aa_loc;
4730     end_pos = bsp->length - 1;
4731     BioseqUnlock(bsp);
4732 
4733     if(crp->frame == 0)
4734         frame_offset = 0;
4735     else
4736         frame_offset = (Int4)crp->frame-1;
4737 
4738   cds_left = SeqLocStart (sfp->location);
4739   cds_right = SeqLocStop (sfp->location);
4740 
4741 
4742     slp = NULL;
4743     product_len = 0;
4744     loc = NULL;
4745     while ((slp = SeqLocFindNext(sfp->location, slp))!=NULL)
4746     {
4747     if (SeqLocOffset(location_loc, slp, &gr, 0))
4748       {
4749             SeqLocOffset(slp, location_loc, &gr, 0);
4750 
4751             a_left = gr.left + product_len;
4752             a_right = gr.right + product_len;
4753       if (frame_offset > 0) {
4754         a_left -= frame_offset;
4755         a_right -= frame_offset;
4756       }
4757 
4758             if (a_left < 0)
4759       {
4760         CheckSeqLocForPartial (slp, &partial5, &partial3);
4761         strand = SeqLocStrand (slp);
4762         if ((partial5 && strand != Seq_strand_minus) || (partial3 && strand == Seq_strand_minus)) {
4763                   a_left = gr.left;
4764         } else {
4765           a_left += 3;
4766         }
4767       }
4768       if (a_right > (bsp->length) * 3 - 1 && !allowTerminator) {
4769         CheckSeqLocForPartial (slp, &partial5, &partial3);
4770         strand = SeqLocStrand (slp);
4771         if (partial3 && a_right == bsp->length * 3) {
4772           /* it's ok, leave it alone */
4773         } else if ((partial5 && strand != Seq_strand_minus) || (partial3 && strand == Seq_strand_minus)) {
4774                   a_right = (bsp->length * 3) - 1;
4775         } else {
4776           a_right -= 3;
4777         }
4778       }
4779 
4780             aa_from = a_left / 3;
4781             aa_to = a_right / 3;
4782 
4783             if (aa_to > end_pos && !allowTerminator)
4784                 aa_to = end_pos;
4785 
4786             if (merge)
4787             {
4788                 if (aa_from <= last_aa)  /* overlap due to codons */
4789                     aa_from = last_aa+1;  /* set up to merge */
4790             }
4791 
4792       /* NOTE - if a_left is not <= a_right, then a correction for frame may have
4793        * caused the location to not actually be mappable to the protein sequence.
4794        */
4795             if ((aa_from <= aa_to || (allowTerminator && aa_from == aa_to + 1)) && a_left <= a_right)
4796             {
4797                 if(loc != NULL)
4798                 {
4799                     if(aa_loc == NULL)
4800                         load_fuzz_to_DNA(loc, location_loc, TRUE);
4801                     SeqLocAdd(&aa_loc, loc, merge, FALSE);
4802                 }
4803                 loc = SeqLocIntNew(aa_from, aa_to, 0, aa_sip);
4804                 last_aa = aa_to;
4805             }
4806     }
4807 
4808     product_len += SeqLocLen(slp);
4809     }
4810 
4811     if(loc != NULL)
4812     {
4813         if(aa_loc == NULL)
4814             load_fuzz_to_DNA(loc, location_loc, TRUE);
4815         load_fuzz_to_DNA(loc, location_loc, FALSE);
4816         SeqLocAdd(&aa_loc, loc, merge, FALSE);
4817     }
4818     if (frame != NULL)
4819         *frame = a_left % 3;
4820 
4821     return SeqLocPackage(aa_loc);
4822 }
4823 
4824 /*****************************************************************************
4825 *
4826 *   BioseqHash(bsp)
4827 *       Computes a (almost) unique hash code for a bioseq
4828 *
4829 *****************************************************************************/
BioseqHash(BioseqPtr bsp)4830 NLM_EXTERN Uint4 BioseqHash (BioseqPtr bsp)
4831 {
4832     Uint4 hashval = 0;
4833     SeqPortPtr spp;
4834     Uint1 code;
4835     Int2 residue;
4836 
4837     if (bsp == NULL) return hashval;
4838 
4839     if (ISA_na(bsp->mol))
4840         code = Seq_code_iupacna;
4841     else
4842         code = Seq_code_ncbieaa;
4843 
4844     spp = SeqPortNew(bsp, 0, -1, 0, code);
4845     if (spp == NULL) return hashval;
4846 
4847     while ((residue = SeqPortGetResidue(spp)) != SEQPORT_EOF)
4848     {
4849         hashval *= 1103515245;
4850         hashval += (Uint4)residue + 12345;
4851     }
4852 
4853     SeqPortFree(spp);
4854 
4855     return hashval;
4856 }
4857 
4858 
4859 /*-------------- BioseqRevComp () ---------------------------*/
4860 /***********************************************************************
4861 *   BioseqRevComp:   Takes the nucleic acid sequence from Bioseq
4862 *    Entry and gives the reverse complement sequence in place
4863 *       Does not change features.
4864 ************************************************************************/
BioseqRevComp(BioseqPtr bsp)4865 NLM_EXTERN Boolean LIBCALL BioseqRevComp (BioseqPtr bsp)
4866 {
4867     Boolean retval;
4868 
4869     retval = BioseqReverse (bsp);
4870     if (retval)
4871         retval = BioseqComplement(bsp);
4872     return retval;
4873 }
4874 
ComplementSeqData(Uint1 seqtype,Int4 seqlen,SeqDataPtr sdp)4875 NLM_EXTERN Boolean ComplementSeqData (Uint1 seqtype, Int4 seqlen, SeqDataPtr sdp)
4876 {
4877     SeqCodeTablePtr sctp;
4878     ByteStorePtr    bysp;
4879     long            readbyte, bslen;
4880     Uint1           byte = 0, byte_to, newbyte = 0, residue;
4881     Uint1           comp, bitctr, mask, lshift, rshift, bc;
4882 
4883     if (seqtype == Seq_code_gap) return FALSE;
4884 
4885     bysp = (ByteStorePtr) sdp;
4886     if (bysp == NULL)
4887     {
4888         ErrPostEx(SEV_ERROR,0,0, "Error:  no sequence data\n");
4889         return FALSE;
4890     }
4891 
4892     if ((sctp = SeqCodeTableFind (seqtype)) == NULL)
4893     {
4894         ErrPostEx(SEV_ERROR,0,0, "Can't open table\n");
4895         return FALSE;
4896     }
4897     switch (seqtype)        /*determine type of base encoding*/
4898     {
4899         case Seq_code_ncbi2na:
4900             bc = 4;
4901             rshift = 6;
4902             lshift = 2;
4903             mask = 192;
4904             break;
4905 
4906         case Seq_code_ncbi4na:
4907             bc = 2;
4908             rshift = 4;
4909             lshift = 4;
4910             mask = 240;
4911             break;
4912 
4913                 case Seq_code_iupacna:
4914                 case Seq_code_ncbi8na:
4915             bc = 1;
4916             rshift = 0;
4917             lshift = 0;
4918             mask = 255;
4919             break;
4920     case Seq_code_iupacaa:
4921     case Seq_code_ncbi8aa:
4922     case Seq_code_ncbieaa:
4923     case Seq_code_ncbipaa:
4924     case Seq_code_iupacaa3:
4925     case Seq_code_ncbistdaa:             /* ignore amino acid */
4926       ErrPostEx(SEV_ERROR,0,0, "Error:  cannot complement aa ; No ->mol flag on Bioseq\n");
4927       return FALSE;
4928     case Seq_code_ncbipna:
4929       ErrPostEx(SEV_WARNING,0,0, "Error: Don't yet know how to complement profile\n");
4930             return FALSE;
4931         default:
4932             return FALSE;
4933     }
4934 
4935     bslen = BSLen(bysp);
4936     bitctr = 0;
4937     readbyte = 0;
4938 
4939     while (readbyte < bslen)
4940     {
4941         if (!bitctr)
4942         {                /*get new byte*/
4943             BSSeek (bysp, readbyte, SEEK_SET);
4944             newbyte = byte_to = byte = residue = 0;
4945             byte = (Uint1)BSGetByte (bysp);
4946             bitctr = bc;
4947             readbyte++;
4948         }
4949 
4950         for (; bitctr; bitctr--)
4951         {
4952             residue = byte & mask;    /*mask out all but one base*/
4953             residue >>= rshift;
4954             byte <<= lshift;
4955 
4956             comp = SeqCodeTableComp (sctp, residue); /*get
4957 complement*/
4958 
4959             newbyte <<= lshift;
4960             byte_to = newbyte;
4961             newbyte = (comp | byte_to);    /*put complements
4962 together*/
4963 
4964         }
4965 
4966         if (readbyte)            /*put back byte with comps*/
4967         {
4968             BSSeek (bysp, readbyte-1, SEEK_SET);
4969             BSPutByte (bysp, newbyte);
4970         }
4971     }
4972     return TRUE;
4973 
4974 }
4975 
4976 
DeltaBioseqComplement(BioseqPtr bsp)4977 static Boolean DeltaBioseqComplement (BioseqPtr bsp)
4978 {
4979   DeltaSeqPtr dsp;
4980   SeqLitPtr   slip;
4981   Boolean     rval = FALSE;
4982 
4983   if (bsp == NULL || bsp->repr != Seq_repr_delta)
4984   {
4985     return rval;
4986   }
4987 
4988   dsp = (DeltaSeqPtr) bsp->seq_ext;
4989   while (dsp != NULL)
4990   {
4991     if (dsp->choice != 2)
4992     {
4993       ErrPostEx(SEV_ERROR,0,0, "Error: Can't complement delta sequences with far locs\n");
4994       return FALSE;
4995     }
4996     dsp = dsp->next;
4997   }
4998   rval = TRUE;
4999   dsp = (DeltaSeqPtr) bsp->seq_ext;
5000   while (dsp != NULL)
5001   {
5002     slip = (SeqLitPtr) dsp->data.ptrvalue;
5003     /* complement data */
5004     if (slip->seq_data != NULL)
5005     {
5006       rval &= ComplementSeqData (slip->seq_data_type, slip->length, slip->seq_data);
5007     }
5008     dsp = dsp->next;
5009   }
5010   return rval;
5011 }
5012 
5013 
5014 /*-------------- BioseqComplement () ---------------------------*/
5015 /***********************************************************************
5016 *   BioseqComplement:   Takes the nucleic acid sequence from Bioseq
5017 *    Entry and gives the complement sequence in place
5018 *       Does not change features.
5019 ************************************************************************/
BioseqComplement(BioseqPtr bsp)5020 NLM_EXTERN Boolean LIBCALL BioseqComplement (BioseqPtr bsp)
5021 {
5022     Boolean         rval = FALSE;
5023 
5024   if (bsp == NULL)
5025   {
5026     ErrPostEx(SEV_ERROR,0,0, "Error: not a BioseqPtr\n");
5027     rval = FALSE;
5028   }
5029   else if (ISA_aa(bsp->mol))
5030   {
5031     ErrPostEx(SEV_ERROR,0,0, "Error:  cannot complement aa\n");
5032         rval = FALSE;
5033   }
5034   else if (bsp->repr == Seq_repr_delta)
5035   {
5036     rval = DeltaBioseqComplement (bsp);
5037   }
5038   else if (bsp->repr == Seq_repr_raw)
5039   {
5040     rval = ComplementSeqData (bsp->seq_data_type, bsp->length, bsp->seq_data);
5041   }
5042   else
5043   {
5044     ErrPostEx(SEV_ERROR,0,0, "Error: not a raw or delta sequence\n");
5045         rval = FALSE;
5046   }
5047   return rval;
5048 
5049 } /* BioseqComplement */
5050 
5051 
ReverseSeqData(Uint1 seqtype,Int4 seqlen,SeqDataPtr sdp)5052 NLM_EXTERN Boolean LIBCALL ReverseSeqData (Uint1 seqtype, Int4 seqlen, SeqDataPtr sdp)
5053 {
5054     ByteStorePtr     bysp1, bysp2 = '\0';
5055     long         readbyte, bslen = 0;
5056     Int4         count = 0;
5057     Uint1     byte = 0, byte2, byte_to = 0, byte_to2, newbyte = 0;
5058     Uint1        newbyte2, finalbyte, residue, residue2, bitctr, bc2 = 0;
5059     Uint1         bitctr2, mask, mask2, lshift, rshift, bc = 0, jagged;
5060 
5061     if (seqtype == Seq_code_gap) return FALSE;
5062 
5063     bysp1 = (ByteStorePtr) sdp;
5064 
5065   if (bysp1 == NULL)
5066   {
5067     ErrPostEx(SEV_ERROR,0,0, "Error:  No sequence data\n");
5068     return FALSE;
5069   }
5070 
5071     switch (seqtype){
5072         case Seq_code_ncbi2na:        /*bitshifts needed*/
5073             mask = 192;
5074             mask2 = 3;
5075             lshift = 2;
5076             rshift = 6;
5077                         jagged = seqlen%4;
5078             switch (jagged)    /*change if jagged last byte*/
5079             {
5080                 case 1:
5081                     bc = 1;
5082                     bc2 = 3;
5083                     break;
5084                 case 2:
5085                     bc = 2;
5086                     bc2 = 2;
5087                     break;
5088                 case 3:
5089                     bc = 3;
5090                     bc2 = 1;
5091                     break;
5092                 default:
5093                     bc = 4;
5094                     bc2 = 0;
5095                     break;
5096             }
5097             break;
5098         case Seq_code_ncbi4na:
5099             mask = 240;
5100             mask2 = 15;
5101             lshift = 4;
5102             rshift = 4;
5103                         jagged = seqlen%2;
5104             switch (jagged)
5105             {
5106                 case 1:
5107                     bc = 1;
5108                     bc2 = 1;
5109                     break;
5110                 default:
5111                     bc = 2;
5112                     bc2 = 0;
5113                     break;
5114             }
5115             break;
5116                 case Seq_code_iupacna:
5117                 case Seq_code_ncbi8na:
5118 
5119                 case Seq_code_iupacaa:
5120                 case Seq_code_ncbi8aa:
5121                 case Seq_code_ncbieaa:
5122                 case Seq_code_ncbistdaa:
5123             bc = 1;
5124                         bc2 = 0;
5125             rshift = 0;
5126             lshift = 0;
5127                         jagged = 0;
5128             mask = 255;
5129                         mask2 = 0;
5130             break;
5131                 case Seq_code_ncbipaa:
5132                 case Seq_code_iupacaa3:
5133                     ErrPostEx(SEV_ERROR,0,0, "Error:  cannot  reverse %s protein alphabet",(int)seqtype);
5134                     return FALSE;
5135                 case Seq_code_ncbipna:
5136                     ErrPostEx(SEV_WARNING,0,0, "Error: Don't yet know how to reverse profile\n");
5137         default:        /*ignores amino acid sequence*/
5138             return FALSE;
5139     }
5140     bysp2 = BSDup(bysp1);
5141     bslen = BSLen (bysp1);
5142     bitctr = bitctr2 = 0;
5143     readbyte = 0;
5144     count = 0;
5145 
5146     if (!jagged)            /*no jagged last byte*/
5147     {
5148         while ((readbyte != BSLen(bysp1)))
5149         {
5150             count = rshift;
5151             if (!bitctr)        /*get new byte*/
5152             {
5153                 newbyte = byte_to = byte = residue = 0;
5154                 BSSeek (bysp2, --bslen, SEEK_SET);
5155                 byte = (Uint1)BSGetByte (bysp2);
5156                 bitctr = bc;
5157                 readbyte++;
5158             }
5159 
5160             for (;bitctr; bitctr--)
5161             {
5162                 residue = byte & mask;
5163                 residue >>= count;
5164                 byte <<= lshift;
5165                 count = count - lshift;
5166 
5167                 newbyte = (residue | byte_to);
5168                 byte_to = newbyte;
5169             }
5170 
5171             BSSeek (bysp1, readbyte-1, SEEK_SET);
5172             BSPutByte (bysp1, newbyte);
5173 
5174         }
5175     }
5176     else                /*jagged last byte*/
5177     {
5178         /*Gets two bytes prior to loop*/
5179         newbyte = newbyte2 = byte_to = byte_to2 = 0;
5180         byte2 = residue = residue2 = 0;
5181         BSSeek (bysp2, bslen-2, SEEK_SET);
5182         byte2 = (Uint1) BSGetByte (bysp2);    /*byte closer to beginning*/
5183         byte = (Uint1) BSGetByte (bysp2);
5184         bitctr = bc;
5185         bitctr2 = bc2;
5186         bslen = bslen - 2;
5187         readbyte = 1;
5188 
5189         while (readbyte != BSLen(bysp1))
5190         {
5191             count = rshift;
5192             if (!bitctr)        /*when needed gets another
5193 byte*/
5194             {
5195                 newbyte = newbyte2 = byte_to = byte_to2 = 0;
5196                 byte2 = finalbyte = residue = residue2 = 0;
5197                 BSSeek (bysp2, --bslen, SEEK_SET);
5198                 byte2 = (Uint1) BSGetByte (bysp2);
5199                 bitctr = bc;
5200                 bitctr2 = bc2;
5201                 ++readbyte;
5202             }
5203             for (; bitctr; bitctr--)
5204             {
5205                 residue = byte & mask;        /*reverses 1st
5206 byte*/
5207                 residue >>= count;
5208                 byte <<= lshift;
5209                 byte_to = newbyte;
5210                 newbyte = (residue | byte_to);
5211                 count = count - lshift;
5212             }
5213             for (; bitctr2; bitctr2--)
5214             {
5215                 residue2 = byte2 & mask2;   /*reverses 2nd */
5216                 byte2 >>= lshift;        /*partially to
5217 join*/
5218                 newbyte2 <<= lshift;        /*with the 1st*/
5219                 byte_to2 = newbyte2;
5220                 newbyte2 = (residue2 | byte_to2);
5221             }
5222             newbyte <<= (8 - (bc*lshift));    /*joins 1st & 2nd
5223 bytes*/
5224             finalbyte = (newbyte | newbyte2);
5225             byte2 <<= (bc2 * lshift);
5226             byte = byte2;
5227 
5228             BSSeek (bysp1, readbyte-1, SEEK_SET);
5229             BSPutByte (bysp1, finalbyte);
5230         }
5231     }
5232     BSFree(bysp2);
5233     return TRUE;
5234 } /* ReverseSeqData */
5235 
5236 
DeltaBioseqReverse(BioseqPtr bsp)5237 static Boolean DeltaBioseqReverse (BioseqPtr bsp)
5238 {
5239   DeltaSeqPtr dsp, next_dsp, newchain = NULL;
5240   SeqLitPtr   slip;
5241   Boolean     rval = FALSE;
5242 
5243   if (bsp == NULL || bsp->repr != Seq_repr_delta)
5244   {
5245     return rval;
5246   }
5247 
5248   dsp = (DeltaSeqPtr) bsp->seq_ext;
5249   while (dsp != NULL)
5250   {
5251     if (dsp->choice != 2)
5252     {
5253       ErrPostEx(SEV_ERROR,0,0, "Error: Can't reverse delta sequences with far locs\n");
5254       return FALSE;
5255     }
5256     dsp = dsp->next;
5257   }
5258 
5259   dsp = (DeltaSeqPtr) bsp->seq_ext;
5260   rval = TRUE;
5261   while (dsp != NULL)
5262   {
5263     slip = (SeqLitPtr) dsp->data.ptrvalue;
5264     /* reverse data */
5265     if (slip->seq_data != NULL)
5266     {
5267       rval &= ReverseSeqData (slip->seq_data_type, slip->length, slip->seq_data);
5268     }
5269 
5270     /* reverse the chain */
5271     next_dsp = dsp->next;
5272     dsp->next = newchain;
5273     newchain = dsp;
5274 
5275     dsp = next_dsp;
5276   }
5277   bsp->seq_ext = newchain;
5278   return rval;
5279 }
5280 
5281 /*-------------- BioseqReverse () ---------------------------*/
5282 /***********************************************************************
5283 *   BioseqReverse:   Takes nucleic acid sequence from Bioseq Entry and
5284 *    reverses the whole sequence in place
5285 *       Does not change features.
5286 ************************************************************************/
BioseqReverse(BioseqPtr bsp)5287 NLM_EXTERN Boolean LIBCALL BioseqReverse (BioseqPtr bsp)
5288 {
5289     Boolean       rval;
5290 
5291   if (bsp == NULL)
5292   {
5293     ErrPostEx(SEV_ERROR,0,0, "Error: not a BioseqPtr\n");
5294     rval = FALSE;
5295   }
5296   else if (bsp->repr == Seq_repr_delta)
5297   {
5298     rval = DeltaBioseqReverse (bsp);
5299   }
5300   else if (bsp->repr == Seq_repr_raw)
5301   {
5302     rval = ReverseSeqData (bsp->seq_data_type, bsp->length, bsp->seq_data);
5303   }
5304   else
5305   {
5306     ErrPostEx(SEV_ERROR,0,0, "Error: not a raw or delta sequence\n");
5307     rval = FALSE;
5308   }
5309 
5310     return rval;
5311 } /* BioseqReverse */
5312 
5313 #define SPC_BUFF_CHUNK 1024
5314 
5315 /*****************************************************************************
5316 *
5317 *  ContigRevComp
5318 *
5319 *****************************************************************************/
5320 
SegRevComp(BioseqPtr bsp)5321 static Boolean SegRevComp (BioseqPtr bsp)
5322 
5323 {
5324   ValNodePtr  head = NULL;
5325   Int4        from, to, tmp;
5326   Boolean     partial5, partial3;
5327   SeqIntPtr   sintp;
5328   SeqLocPtr   slp;
5329   ValNode     vn;
5330   ValNodePtr  vnp;
5331 
5332   MemSet ((Pointer) &vn, 0, sizeof (ValNode));
5333   vn.choice = SEQLOC_MIX;
5334   vn.data.ptrvalue = bsp->seq_ext;
5335 
5336   /* get each location component */
5337 
5338   slp = SeqLocFindNext (&vn, NULL);
5339   while (slp != NULL) {
5340 
5341     /* copy component, reversing strand */
5342 
5343     vnp = NULL;
5344     if (slp->choice == SEQLOC_NULL) {
5345 
5346       vnp = ValNodeAddPointer (NULL, SEQLOC_NULL, NULL);
5347 
5348     } else if (slp->choice == SEQLOC_INT) {
5349 
5350       sintp = (SeqIntPtr) slp->data.ptrvalue;
5351       if (sintp != NULL) {
5352         CheckSeqLocForPartial (slp, &partial5, &partial3);
5353         from = sintp->from;
5354         to = sintp->to;
5355         if (sintp->strand != Seq_strand_minus) {
5356           tmp = from;
5357           from = to;
5358           to = tmp;
5359         }
5360         vnp = AddIntervalToLocation (NULL, sintp->id, from, to, partial3, partial5);
5361       }
5362 
5363     }
5364 
5365     /* save in new list in reverse order */
5366 
5367     if (vnp != NULL) {
5368       vnp->next = head;
5369       head = vnp;
5370     }
5371 
5372     slp = SeqLocFindNext (&vn, slp);
5373   }
5374 
5375   if (head == NULL) return FALSE;
5376 
5377   bsp->seq_ext = SeqLocSetFree ((ValNodePtr) bsp->seq_ext);
5378   bsp->seq_ext = head;
5379 
5380   bsp->hist = SeqHistFree (bsp->hist);
5381 
5382   return TRUE;
5383 }
5384 
DeltaRevComp(BioseqPtr bsp)5385 static Boolean DeltaRevComp (BioseqPtr bsp)
5386 
5387 {
5388   DeltaSeqPtr  dsp, dspnext;
5389   ValNodePtr   head = NULL;
5390   Int4         from, to, tmp;
5391   Boolean      partial5, partial3;
5392   SeqIntPtr    sintp;
5393   SeqLocPtr    slp;
5394   SeqLitPtr    slitp, slip;
5395   ValNodePtr   vnp;
5396 
5397   for (dsp = (DeltaSeqPtr) bsp->seq_ext; dsp != NULL; dsp = dsp->next) {
5398     vnp = NULL;
5399 
5400     if (dsp->choice == 1) {
5401 
5402       slp = (SeqLocPtr) dsp->data.ptrvalue;
5403       if (slp != NULL) {
5404 
5405         if (slp->choice == SEQLOC_NULL) {
5406 
5407           vnp = ValNodeAddPointer (NULL, SEQLOC_NULL, NULL);
5408 
5409         } else if (slp->choice == SEQLOC_INT) {
5410 
5411           sintp = (SeqIntPtr) slp->data.ptrvalue;
5412           if (sintp != NULL) {
5413             CheckSeqLocForPartial (slp, &partial5, &partial3);
5414             from = sintp->from;
5415             to = sintp->to;
5416             if (sintp->strand != Seq_strand_minus) {
5417               tmp = from;
5418               from = to;
5419               to = tmp;
5420             }
5421             vnp = AddIntervalToLocation (NULL, sintp->id, from, to, partial3, partial5);
5422           }
5423         }
5424       }
5425 
5426     } else if (dsp->choice == 2) {
5427 
5428       slitp = (SeqLitPtr) dsp->data.ptrvalue;
5429       if (slitp != NULL && slitp->seq_data == NULL) {
5430         slip = SeqLitNew ();
5431         if (slip != NULL) {
5432           slip->length = slitp->length;
5433           /* not copying fuzz */
5434           slip->seq_data_type = slitp->seq_data_type;
5435           vnp = ValNodeAddPointer (NULL, 2, (Pointer) slip);
5436         }
5437       } else {
5438         ValNodeFree (head);
5439         return FALSE;
5440       }
5441     }
5442 
5443     /* save in new list in reverse order */
5444 
5445     if (vnp != NULL) {
5446       vnp->next = head;
5447       head = vnp;
5448     }
5449   }
5450 
5451   if (head == NULL) return FALSE;
5452 
5453   dsp = (DeltaSeqPtr) bsp->seq_ext;
5454   while (dsp != NULL) {
5455     dspnext = dsp->next;
5456     dsp->next = NULL;
5457     DeltaSeqFree (dsp);
5458     dsp = dsp->next;
5459   }
5460   bsp->seq_ext = head;
5461 
5462   bsp->hist = SeqHistFree (bsp->hist);
5463 
5464   return TRUE;
5465 }
5466 
ContigRevComp(BioseqPtr bsp)5467 NLM_EXTERN Boolean LIBCALL ContigRevComp (BioseqPtr bsp)
5468 
5469 {
5470   if (bsp == NULL) {
5471     ErrPostEx (SEV_ERROR, 0, 0, "ContigRevComp: empty BioseqPtr");
5472     return FALSE;
5473   }
5474 
5475   if (bsp->repr == Seq_repr_seg && bsp->seq_ext_type == 1 && bsp->seq_ext != NULL) {
5476     return SegRevComp (bsp);
5477   }
5478   if (bsp->repr == Seq_repr_delta && bsp->seq_ext_type == 4 && bsp->seq_ext != NULL) {
5479     return DeltaRevComp (bsp);
5480   }
5481 
5482   ErrPostEx (SEV_ERROR, 0, 0, "ContigRevComp: not a segmented or delta BioseqPtr");
5483   return FALSE;
5484 }
5485 
5486 /*****************************************************************************
5487 *
5488 *  SPCompressNew(void); - allocated memory for SPCompress structure
5489 *
5490 *****************************************************************************/
SPCompressNew(void)5491 NLM_EXTERN SPCompressPtr SPCompressNew(void)
5492 {
5493     SPCompressPtr spc;
5494 
5495     spc = (SPCompressPtr) MemNew(sizeof(SPCompress));
5496     spc->buffer = (Uint1Ptr) MemNew(SPC_BUFF_CHUNK);
5497     spc->allocated = SPC_BUFF_CHUNK;
5498     spc->residues = 0;
5499     spc->lbytes = NULL;
5500 
5501     return spc;
5502 }
5503 /*****************************************************************************
5504 *
5505 *  SPCompressFree(SPCompressPtr spc); -  free SPCompress structure
5506 *
5507 *****************************************************************************/
SPCompressFree(SPCompressPtr spc)5508 NLM_EXTERN void SPCompressFree(SPCompressPtr spc)
5509 {
5510 
5511   MemFree(spc->buffer);
5512   MemFree(spc->lbytes);
5513   MemFree(spc);
5514 
5515 }
/*****************************************************************************
*
*  Int4 SPCompressRead (Pointer data, Uint1Ptr buf, Int4 length);
*        Hook read-function for SPCompressDNA()
*
*        data   - the SeqPort to read residues from
*        buf    - output buffer, cleared first, then filled with two
*                 residues per byte (first residue in the high half)
*        length - size of buf in bytes
*
*        Returns the number of residues stored, or -1 after posting a
*        warning when the port yields something that is neither a residue
*        nor SEQPORT_VIRT.
*
*****************************************************************************/
static Int4 SPCompressRead (Pointer data, Uint1Ptr buf, Int4 length);
static Int4 SPCompressRead (Pointer data, Uint1Ptr buf, Int4 length)
{
  SeqPortPtr port = (SeqPortPtr) data;
  Uint1      residue = 0;
  Int4       stored = 0;
  Int4       pos = 0;
  Boolean    low_half_next = FALSE;

  MemSet(buf, 0, length);  /* Clear buffer first */

  while (pos < length) {
    residue = SeqPortGetResidue(port);
    if (residue == SEQPORT_EOF) {
      break;
    }

    if (residue == SEQPORT_VIRT && ! (IS_residue(residue))) {
      /* virtual segment: no residue to store, keep reading */
      continue;
    }

    if (! (IS_residue(residue))) {
      ErrPostEx(SEV_WARNING, 0, 0,"[Bad residue]\n");
      return -1;
    }

    if (low_half_next) {
      /* second residue of the pair completes the byte */
      buf[pos] += residue;
      pos++;
      low_half_next = FALSE;
    } else {
      /* first residue of the pair goes into the high half */
      buf[pos] += (Uint1) (residue << 4);
      low_half_next = TRUE;
    }
    stored++;
  }

  return stored;
}
5555 
/*****************************************************************************
*
*  Int4 SPCompressWrite (Pointer data, Uint1Ptr buf, Int4 length);
*        Hook write-function for SPCompressDNA()
*
*        data   - the SPCompress structure being filled
*        buf    - bytes to append
*        length - number of bytes in buf
*
*        Appends buf to spc->buffer, enlarging the buffer in steps of
*        SPC_BUFF_CHUNK until the new data fits.  Returns length on
*        success, or -1 on bad arguments or when the buffer cannot be
*        enlarged (the existing contents are then left untouched).
*
*****************************************************************************/
static Int4 SPCompressWrite (Pointer data, Uint1Ptr buf, Int4 length);
static Int4 SPCompressWrite (Pointer data, Uint1Ptr buf, Int4 length)
{
  SPCompressPtr spc;
  Uint1Ptr      grown;
  Int4          newsize;

  spc = (SPCompressPtr) data;

  if (spc == NULL || length < 0)
    return -1;
  if (buf == NULL && length > 0)
    return -1;

  /* a single chunk is not enough when length exceeds SPC_BUFF_CHUNK,
     so keep adding chunks until the write fits */
  newsize = spc->allocated;
  while ((spc->used + length) >= newsize) {
    newsize += SPC_BUFF_CHUNK;
  }

  if (newsize != spc->allocated) {
    /* keep the old pointer until the reallocation is known to succeed */
    grown = (Uint1Ptr) Realloc(spc->buffer, newsize);
    if (grown == NULL)
      return -1;
    spc->buffer = grown;
    spc->allocated = newsize;
  }

  if (length > 0) {
    MemCpy(spc->buffer + spc->used, buf, length);
  }

  spc->used += length;

  return length;
}
5581 
5582 /*****************************************************************************
5583 *
5584 *   SPRebuildDNA(SPCompressPtr spc);
5585 *       translates spc ncbi2na encoding buffer into
5586 *       spc ncbi4na encoding buffer with rebuild ambiguities
5587 *
5588 *       spc - must be valid SPCompress structure returned
5589 *       from SPCompressDNA() function in ncbi2na encoding
5590 *
5591 *****************************************************************************/
SPRebuildDNA(SPCompressPtr spc)5592 NLM_EXTERN Boolean SPRebuildDNA(SPCompressPtr spc)
5593 {
5594     ByteStorePtr bsp, bsp_plain;
5595     Int4 residues;
5596 
5597     if(spc == NULL || spc->type != Seq_code_ncbi2na)
5598         return FALSE;
5599 
5600     residues = (spc->used-1)*4 + (spc->buffer[spc->used-1] & 0x3);
5601     bsp = BSNew(spc->used);
5602     BSWrite(bsp, spc->buffer, spc->used);
5603 
5604     if((bsp_plain = BSConvertSeq(bsp, Seq_code_ncbi4na,
5605                                  Seq_code_ncbi2na, residues)) == NULL) {
5606         return FALSE;
5607     }
5608 
5609     BSRebuildDNA_4na(bsp_plain, spc->lbytes);
5610 
5611     spc->buffer = (Uint1Ptr) Realloc(spc->buffer, residues/2+1);
5612     BSRead(bsp_plain, spc->buffer, residues/2+1);
5613     spc->type = Seq_code_ncbi4na;
5614     spc->residues = residues;
5615     BSFree(bsp_plain);
5616 
5617     return TRUE;
5618 }
5619 
5620 /*****************************************************************************
5621 *
5622 *   SPCompressDNA(SeqPortPtr spp);
5623 *       converts a ncbi4na taken from spp into ncbi2na
5624 *       buffer stored inside SPCompress structue together
5625 *       with ambiguity information
5626 *       returns pointer SPCompress structure or NULL if error
5627 *
5628 *       NOTE: In this function we do not know - what is length
5629 *             of sequence to compress. Terminated flag for this
5630 *             function is SEQPORT_EOF returned from spp.
5631 *
5632 *****************************************************************************/
SPCompressDNA(SeqPortPtr spp)5633 NLM_EXTERN SPCompressPtr SPCompressDNA(SeqPortPtr spp)
5634 {
5635   SPCompressPtr spc;
5636 
5637   if (spp == NULL || spp->newcode != Seq_code_ncbi4na)
5638     return NULL;
5639 
5640   spc = SPCompressNew();
5641   if(!GenericCompressDNA((VoidPtr) spp, (VoidPtr) spc,
5642                          (Uint4) -1, /* Length of sequence unknown */
5643                          SPCompressRead,
5644                          SPCompressWrite,
5645                          &spc->lbytes
5646                          )) {
5647     return NULL;
5648   }
5649   spc->type = Seq_code_ncbi2na;
5650   return spc;
5651 }
5652 
5653 /*****************************************************************************
5654 *
5655 *   ComposeCodonsRecognizedString (trna, buf, buflen);
5656 *       Copies codon recognized string to buf, returns number of codons
5657 *
5658 *****************************************************************************/
5659 
SortCodonByName(VoidPtr ptr1,VoidPtr ptr2)5660 static int LIBCALLBACK SortCodonByName (VoidPtr ptr1, VoidPtr ptr2)
5661 
5662 {
5663   CharPtr     str1;
5664   CharPtr     str2;
5665   ValNodePtr  vnp1;
5666   ValNodePtr  vnp2;
5667 
5668   if (ptr1 != NULL && ptr2 != NULL) {
5669     vnp1 = *((ValNodePtr PNTR) ptr1);
5670     vnp2 = *((ValNodePtr PNTR) ptr2);
5671     if (vnp1 != NULL && vnp2 != NULL) {
5672       str1 = (CharPtr) vnp1->data.ptrvalue;
5673       str2 = (CharPtr) vnp2->data.ptrvalue;
5674       if (str1 != NULL && str2 != NULL) {
5675         return StringICmp (str1, str2);
5676       } else {
5677         return 0;
5678       }
5679     } else {
5680       return 0;
5681     }
5682   } else {
5683     return 0;
5684   }
5685 }
5686 
MakeDegenerateBase(Uint1 ch1,Uint1 ch2,Uint1Ptr chrToInt,CharPtr intToChr)5687 static Uint1 MakeDegenerateBase (Uint1 ch1, Uint1 ch2, Uint1Ptr chrToInt, CharPtr intToChr)
5688 
5689 {
5690   Uint1  idx;
5691 
5692   idx = chrToInt [(int) ch1] | chrToInt [(int) ch2];
5693   return intToChr [(int) idx];
5694 }
5695 
ComposeCodonsRecognizedString(tRNAPtr trna,CharPtr buf,size_t buflen)5696 NLM_EXTERN Int2 ComposeCodonsRecognizedString (tRNAPtr trna, CharPtr buf, size_t buflen)
5697 
5698 {
5699   Char          ch;
5700   Uint1         chrToInt [256];
5701   Uint1         codon [4];
5702   Int2          count = 0;
5703   ValNodePtr    head, next, vnp;
5704   Int2          k;
5705   Uint1         i, j;
5706   CharPtr       intToChr = "?ACMGRSVUWYHKDBN";
5707   CharPtr       prefix, ptr, str1, str2;
5708   Pointer PNTR  prev;
5709 
5710   if (trna == NULL || buf == NULL || buflen < 25) return 0;
5711 
5712   *buf = '\0';
5713   codon [3] = '\0';
5714   head = NULL;
5715 
5716   for (j = 0; j < 6; j++) {
5717     if (trna->codon [j] < 64) {
5718       if (CodonForIndex (trna->codon [j], Seq_code_iupacna, codon)) {
5719         for (k = 0; k < 3; k++) {
5720           if (codon [k] == 'T') {
5721             codon [k] = 'U';
5722           }
5723         }
5724         ValNodeCopyStr (&head, 0, (CharPtr) codon);
5725       }
5726     }
5727   }
5728 
5729   head = ValNodeSort (head, SortCodonByName);
5730 
5731   if (head == NULL) return 0;
5732 
5733   for (k = 0; k < 256; k++) {
5734     chrToInt [k] = 0;
5735   }
5736   for (i = 1; i < 16; i++) {
5737     ch = intToChr [i];
5738     chrToInt [(int) ch] = i;
5739   }
5740 
5741   count = ValNodeLen (head);
5742   str1 = (CharPtr) head->data.ptrvalue;
5743   vnp = head->next;
5744   prev = (Pointer PNTR) &(head->next);
5745   while (vnp != NULL) {
5746     next = vnp->next;
5747     str2 = (CharPtr) vnp->data.ptrvalue;
5748     if (str1 != NULL && str2 != NULL &&
5749         str1 [0] == str2 [0] && str1 [1] == str2 [1]) {
5750       str1 [2] = MakeDegenerateBase (str1 [2], str2 [2], chrToInt, intToChr);
5751       *prev = next;
5752       vnp->next = NULL;
5753       ValNodeFreeData (vnp);
5754     } else {
5755       str1 = str2;
5756       prev = (Pointer PNTR) &(vnp->next);
5757     }
5758     vnp = next;
5759   }
5760 
5761   for (vnp = head, ptr = buf, i = 0, prefix = NULL; vnp != NULL;
5762        vnp = vnp->next, prefix = ", ", i++) {
5763     ptr = StringMove (ptr, prefix);
5764     ptr = StringMove (ptr, (CharPtr) vnp->data.ptrvalue);
5765   }
5766 
5767   ValNodeFreeData (head);
5768   return count;
5769 }
5770 
5771 /*****************************************************************************
5772 *
5773 *   TransTableNew (Int2 genCode);
5774 *       Initializes TransTable finite state machine for 6-frame translation
5775 *       and open reading frame search, allowing nucleotide ambiguity characters
5776 *
5777 *****************************************************************************/
5778 
SetGenCode(Int2 genCode,CharPtr PNTR ncbieaa,CharPtr PNTR sncbieaa)5779 static Boolean SetGenCode (Int2 genCode, CharPtr PNTR ncbieaa, CharPtr PNTR sncbieaa)
5780 
5781 {
5782   GeneticCodePtr  codes;
5783   GeneticCodePtr  gcp;
5784   Int4            id;
5785   ValNodePtr      vnp;
5786 
5787   if (ncbieaa == NULL || sncbieaa == NULL) return FALSE;
5788 
5789   codes = GeneticCodeTableLoad ();
5790   if (codes == NULL) return FALSE;
5791   for (gcp = codes; gcp != NULL; gcp = gcp->next) {
5792     id = 0;
5793     *ncbieaa = NULL;
5794     *sncbieaa = NULL;
5795     for (vnp = (ValNodePtr) gcp->data.ptrvalue; vnp != NULL; vnp = vnp->next) {
5796       switch (vnp->choice) {
5797         case 2 :
5798           id = vnp->data.intvalue;
5799           break;
5800         case 3 :
5801           *ncbieaa = (CharPtr) vnp->data.ptrvalue;
5802           break;
5803         case 6 :
5804           *sncbieaa = (CharPtr) vnp->data.ptrvalue;
5805           break;
5806         default :
5807           break;
5808       }
5809     }
5810     if (genCode == id) return TRUE;
5811   }
5812 
5813   return FALSE;
5814 }
5815 
/* Nucleotide symbols numbered 0 through 14; the trailing comment on each
   line lists the plain bases the symbol stands for. */
typedef enum {
  BASE_A = 0,   /* A    */
  BASE_C = 1,   /* C    */
  BASE_G = 2,   /* G    */
  BASE_T = 3,   /* T    */
  BASE_M = 4,   /* AC   */
  BASE_R = 5,   /* AG   */
  BASE_W = 6,   /* AT   */
  BASE_S = 7,   /* CG   */
  BASE_Y = 8,   /* CT   */
  BASE_K = 9,   /* GT   */
  BASE_V = 10,  /* ACG  */
  BASE_H = 11,  /* ACT  */
  BASE_D = 12,  /* AGT  */
  BASE_B = 13,  /* CGT  */
  BASE_N = 14   /* ACGT */
} BaseCode;
5833 
TransTableNew(Int2 genCode)5834 NLM_EXTERN TransTablePtr TransTableNew (Int2 genCode)
5835 
5836 {
5837   Char     ch, tpaa, btaa, tporf, btorf;
5838   Char     charToBase [16] = "ACGTMRWSYKVHDBN";
5839   Int2     fournaToBase [16] = {
5840              BASE_N, BASE_A, BASE_C, BASE_M, BASE_G, BASE_R, BASE_S, BASE_V,
5841              BASE_T, BASE_W, BASE_Y, BASE_H, BASE_K, BASE_D, BASE_B, BASE_N};
5842   Int2     expansions [75] = {
5843              BASE_A, -1,     -1,     -1,     -1,
5844              BASE_C, -1,     -1,     -1,     -1,
5845              BASE_G, -1,     -1,     -1,     -1,
5846              BASE_T, -1,     -1,     -1,     -1,
5847              BASE_A, BASE_C, -1,     -1,     -1,
5848              BASE_A, BASE_G, -1,     -1,     -1,
5849              BASE_A, BASE_T, -1,     -1,     -1,
5850              BASE_C, BASE_G, -1,     -1,     -1,
5851              BASE_C, BASE_T, -1,     -1,     -1,
5852              BASE_G, BASE_T, -1,     -1,     -1,
5853              BASE_A, BASE_C, BASE_G, -1,     -1,
5854              BASE_A, BASE_C, BASE_T, -1,     -1,
5855              BASE_A, BASE_G, BASE_T, -1,     -1,
5856              BASE_C, BASE_G, BASE_T, -1,     -1,
5857              BASE_A, BASE_C, BASE_G, BASE_T, -1};
5858   Boolean  goOn;
5859   Int2     i, j, k, st, nx, cd;
5860   Int2     p, q, r, x, y, z;
5861   Uint1    ui;
5862   Int2     codonidx [4] = {2, 1, 3, 0};  /* in genetic code table, T = 0, C = 1, A = 2, G = 3, */
5863   Int2     complidx [4] = {0, 3, 1, 2};  /* and index = (base1 * 16) + (base2 * 4) + base3 */
5864   CharPtr  ncbieaa = NULL, sncbieaa = NULL;
5865   TransTablePtr  tbl;
5866 
5867   tbl = (TransTablePtr) MemNew (sizeof (TransTable));
5868   if (tbl == NULL) return NULL;
5869   MemSet ((Pointer) tbl, 0, sizeof (TransTable));
5870 
5871   if (genCode == 7) {
5872     genCode = 4;
5873   } else if (genCode == 8) {
5874     genCode = 1;
5875   } else if (genCode == 0) {
5876     genCode = 1;
5877   }
5878 
5879   if ((! SetGenCode (genCode, &ncbieaa, &sncbieaa)) || ncbieaa == NULL || sncbieaa == NULL) {
5880     ncbieaa =  "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG";
5881     sncbieaa = "---M------**--*----M---------------M----------------------------";
5882   }
5883 
5884   tbl->genCode = genCode;
5885   StringNCpy_0 (tbl->ncbieaa, ncbieaa, sizeof (tbl->ncbieaa));
5886   StringNCpy_0 (tbl->sncbieaa, sncbieaa, sizeof (tbl->sncbieaa));
5887 
5888   /* table to convert any ASCII character to BASE_x integer from 0 through 14 */
5889   for (i = 0; i < 256; i++) {
5890     tbl->basesToIdx [i] = BASE_N;
5891   }
5892 
5893   /* map iupacna alphabet to BaseCode */
5894   for (ui = BASE_A; ui <= BASE_N; ui++) {
5895     ch = charToBase [ui];
5896     tbl->basesToIdx [(int) ch] = ui;
5897     ch = TO_LOWER (ch);
5898     tbl->basesToIdx [(int) ch] = ui;
5899   }
5900   tbl->basesToIdx [(int) 'U'] = BASE_T;
5901   tbl->basesToIdx [(int) 'u'] = BASE_T;
5902   tbl->basesToIdx [(int) 'X'] = BASE_N;
5903   tbl->basesToIdx [(int) 'x'] = BASE_N;
5904 
5905   /* also map ncbi4na alphabet to BaseCode */
5906   for (i = 0; i < 16; i++) {
5907     tbl->basesToIdx [(int) i] = fournaToBase [i];
5908   }
5909 
5910   /* add tbl->basesToIdx [(int) ch] to tbl->nextBase [state] to get next state */
5911 
5912   /* treat state 0 as already having seen NN, avoiding single and double letter states */
5913   tbl->nextBase [0] = 3361;
5914 
5915   /* states 1 through 3375 are triple letter states (AAA, AAC, ..., NNT, NNN) */
5916   for (i = BASE_A, st = 1; i <= BASE_N; i++) {
5917     for (j = BASE_A, nx = 1; j <= BASE_N; j++) {
5918       for (k = BASE_A; k <= BASE_N; k++, st++, nx += 15) {
5919         tbl->nextBase [st] = nx;
5920       }
5921     }
5922   }
5923 
5924   /* tbl->aminoAcid [state] [strand] contains amino acid encoded by state */
5925 
5926   /* initialize all states to return unknown amino acid X */
5927   for (st = 0; st < 3376; st++) {
5928     tbl->aminoAcid [st] [TTBL_TOP_STRAND] = 'X';
5929     tbl->aminoAcid [st] [TTBL_BOT_STRAND] = 'X';
5930     tbl->orfStart [st] [TTBL_TOP_STRAND] = '-';
5931     tbl->orfStart [st] [TTBL_BOT_STRAND] = '-';
5932   }
5933 
5934   /* lookup amino acid for each codon in genetic code table */
5935   for (i = BASE_A, st = 1; i <= BASE_N; i++) {
5936     for (j = BASE_A; j <= BASE_N; j++) {
5937       for (k = BASE_A; k <= BASE_N; k++, st++) {
5938         /* st = 225 * i + 15 * j + k + 1; */
5939 
5940         tpaa = '\0';
5941         btaa = '\0';
5942         tporf = '\0';
5943         btorf = '\0';
5944         goOn = TRUE;
5945 
5946         /* expand ambiguous IJK nucleotide symbols into component bases XYZ */
5947         for (p = i * 5, x = expansions [p]; x != -1 && goOn; p++, x = expansions [p]) {
5948           for (q = j * 5, y = expansions [q]; y != -1 && goOn; q++, y = expansions [q]) {
5949             for (r = k * 5, z = expansions [r]; z != -1 && goOn; r++, z = expansions [r]) {
5950 
5951               /* lookup amino acid for codon XYZ */
5952               cd = 16 * codonidx [x] + 4 * codonidx [y] + codonidx [z];
5953               ch = ncbieaa [cd];
5954               if (tpaa == '\0') {
5955                 tpaa = ch;
5956               } else if (tpaa != ch) {
5957                 /* allow Asx (Asp or Asn) and Glx (Glu or Gln) and Xle (Leu or Ile) */
5958                 if ((tpaa == 'B' || tpaa == 'D' || tpaa == 'N') && (ch == 'D' || ch == 'N')) {
5959                   tpaa = 'B';
5960                 } else if ((tpaa == 'Z' || tpaa == 'E' || tpaa == 'Q') && (ch == 'E' || ch == 'Q')) {
5961                   tpaa = 'Z';
5962                 } else if ((tpaa == 'J' || tpaa == 'I' || tpaa == 'L') && (ch == 'I' || ch == 'L')) {
5963                   tpaa = 'J';
5964                 } else {
5965                   tpaa = 'X';
5966                 }
5967               }
5968               /* and translation start flag on top strand */
5969               ch = sncbieaa [cd];
5970               if (tporf == '\0') {
5971                 tporf = ch;
5972               } else if (tporf != ch) {
5973                 tporf = 'X'; /* was '-' */
5974               }
5975 
5976               /* lookup amino acid for complement of reversed ZYX */
5977               cd = 16 * complidx [z] + 4 * complidx [y] + complidx [x];
5978               ch = ncbieaa [cd];
5979               if (btaa == '\0') {
5980                 btaa = ch;
5981               } else if (btaa != ch) {
5982                 /* allow Asx (Asp or Asn) and Glx (Glu or Gln) and Xle (Leu or Ile) */
5983                 if ((btaa == 'B' || btaa == 'D' || btaa == 'N') && (ch == 'D' || ch == 'N')) {
5984                   btaa = 'B';
5985                 } else if ((btaa == 'Z' || btaa == 'E' || btaa == 'Q') && (ch == 'E' || ch == 'Q')) {
5986                   btaa = 'Z';
5987                 } else if ((btaa == 'J' || btaa == 'I' || btaa == 'L') && (ch == 'I' || ch == 'L')) {
5988                   btaa = 'J';
5989                 } else {
5990                   btaa = 'X';
5991                 }
5992               }
5993               /* and translation start flag on bottom strand */
5994               ch = sncbieaa [cd];
5995               if (btorf == '\0') {
5996                 btorf = ch;
5997               } else if (btorf != ch) {
5998                 btorf = 'X'; /* was '-' */
5999               }
6000 
6001               /* drop out of loop as soon as answer is known */
6002               if (tpaa == 'X' && btaa == 'X' && tporf == 'X' && btorf == 'X') { /* was '-' for orfs */
6003                 goOn = FALSE;
6004               }
6005             }
6006           }
6007         }
6008 
6009         /* assign amino acid */
6010         tbl->aminoAcid [st] [TTBL_TOP_STRAND] = tpaa;
6011         tbl->aminoAcid [st] [TTBL_BOT_STRAND] = btaa;
6012 
6013         /* assign orf start */
6014         tbl->orfStart [st] [TTBL_TOP_STRAND] = tporf;
6015         tbl->orfStart [st] [TTBL_BOT_STRAND] = btorf;
6016       }
6017     }
6018   }
6019 
6020   /* finite state machine for 6-frame translation and ORF search is now initialized */
6021   return tbl;
6022 }
6023 
TransTableFree(TransTablePtr tbl)6024 NLM_EXTERN TransTablePtr TransTableFree (TransTablePtr tbl)
6025 
6026 {
6027   return MemFree (tbl);
6028 }
6029 
TransTableFreeAll(void)6030 NLM_EXTERN void TransTableFreeAll (void)
6031 
6032 {
6033     Int2           genCode;
6034     Char           str [32];
6035     TransTablePtr  tbl;
6036 
6037     for (genCode = 1; genCode < 40; genCode++) {
6038         sprintf (str, "TransTableFSAforGenCode%d", (int) genCode);
6039         tbl = (TransTablePtr) GetAppProperty (str);
6040         if (tbl != NULL) {
6041             SetAppProperty (str, NULL);
6042             TransTableFree (tbl);
6043         }
6044     }
6045     return;
6046 }
6047 
6048 /* convenience function does translation for entire bioseq */
6049 
TransTableProcessBioseq(TransTablePtr tbl,TransTableMatchProc matchProc,Pointer userdata,BioseqPtr bsp)6050 NLM_EXTERN void TransTableProcessBioseq (
6051   TransTablePtr tbl,
6052   TransTableMatchProc matchProc,
6053   Pointer userdata,
6054   BioseqPtr bsp
6055 )
6056 
6057 {
6058   Boolean     altStart, atgStart, orfStop;
6059   Byte        bases [400];
6060   Char        ch;
6061   Int2        ctr, frame, i, j, state;
6062   Int4        position;
6063   Uint1       residue;
6064   SeqPortPtr  spp;
6065 
6066   if (tbl == NULL || matchProc == NULL || bsp == NULL) return;
6067 
6068   if (! ISA_na (bsp->mol)) return;
6069 
6070   spp = SeqPortNew (bsp, 0, -1, 0, Seq_code_iupacna);
6071   if (spp == NULL) return;
6072 
6073   if (bsp->repr == Seq_repr_delta) {
6074     SeqPortSet_do_virtual (spp, TRUE);
6075   }
6076 
6077   /* read first block of bases, reality check on length */
6078 
6079   ctr = SeqPortRead (spp, bases, sizeof (bases));
6080   if (ctr < 6) {
6081     SeqPortFree (spp);
6082     return;
6083   }
6084 
6085   state = 0;
6086   position = 0;
6087   frame = 0;
6088 
6089   i = 0;
6090   residue = (Uint1) bases [i];
6091 
6092   /* prime finite state machine with first two bases */
6093 
6094   for (j = 0; j < 2 && residue != SEQPORT_EOF; j++) {
6095     if (IS_residue (residue)) {
6096       state = NextCodonState (tbl, state, residue);
6097     }
6098     i++;
6099     residue = (Uint1) bases [i];
6100   }
6101 
6102   /* loop on all remaining bases */
6103 
6104   while (residue != SEQPORT_EOF) {
6105     if (IS_residue (residue)) {
6106       state = NextCodonState (tbl, state, residue);
6107 
6108       /* get amino acid for codon on top strand */
6109 
6110       ch = GetCodonResidue (tbl, state, TTBL_TOP_STRAND);
6111       atgStart = IsATGStart (tbl, state, TTBL_TOP_STRAND);
6112       altStart = IsAltStart (tbl, state, TTBL_TOP_STRAND);
6113       orfStop = IsOrfStop (tbl, state, TTBL_TOP_STRAND);
6114       matchProc (position, ch, atgStart, altStart, orfStop, frame, Seq_strand_plus, userdata);
6115 
6116       /* get amino acid for codon on top strand */
6117 
6118       ch = GetCodonResidue (tbl, state, TTBL_BOT_STRAND);
6119       atgStart = IsATGStart (tbl, state, TTBL_BOT_STRAND);
6120       altStart = IsAltStart (tbl, state, TTBL_BOT_STRAND);
6121       orfStop = IsOrfStop (tbl, state, TTBL_BOT_STRAND);
6122       matchProc (position, ch, atgStart, altStart, orfStop, frame, Seq_strand_minus, userdata);
6123 
6124       /* advance base position, also keep track of frame */
6125 
6126       position++;
6127       frame++;
6128       if (frame > 2) {
6129         frame = 0;
6130       }
6131     }
6132 
6133     /* increment base counter */
6134 
6135     i++;
6136     if (i >= ctr) {
6137       i = 0;
6138 
6139       /* read next block of bases */
6140 
6141       ctr = SeqPortRead (spp, bases, sizeof (bases));
6142       if (ctr < 0) {
6143         bases [0] = -ctr;
6144       } else if (ctr < 1) {
6145         bases [0] = SEQPORT_EOF;
6146       }
6147     }
6148     residue = (Uint1) bases [i];
6149   }
6150 
6151   SeqPortFree (spp);
6152 }
6153 
6154 /* trans table translation function can be passed cds feature or individual parameters */
6155 
6156 typedef struct readcdsdata {
6157   CharPtr  tmp;
6158   size_t   frame;
6159   Int4     max;
6160   Boolean  overflow;
6161 } ReadCdsData, PNTR ReadCdsPtr;
6162 
6163 /* callback allows skipping one or two bases at beginning */
6164 
SaveCdsBases(CharPtr sequence,Pointer userdata)6165 static void LIBCALLBACK SaveCdsBases (
6166   CharPtr sequence,
6167   Pointer userdata
6168 )
6169 
6170 {
6171   Char        ch;
6172   CharPtr     from, to;
6173   unsigned int len;
6174   Int4        max;
6175   ReadCdsPtr  rcp;
6176 
6177   rcp = (ReadCdsPtr) userdata;
6178 
6179   if (rcp->frame > 0) {
6180     len = 0;
6181     ch = sequence [len];
6182     while (ch != '\0' && len <= rcp->frame) {
6183       len++;
6184       ch = sequence [len];
6185     }
6186     /* len = StringLen (sequence); */
6187     if (rcp->frame >= len) {
6188 
6189       /* unusual locations can have fewer bases in the first segments than the frame, so just decrement */
6190 
6191       rcp->frame -= len;
6192       return;
6193     }
6194   }
6195 
6196   /* rcp->tmp = StringMove (rcp->tmp, sequence + rcp->frame); */
6197 
6198   from = sequence + rcp->frame;
6199   to = rcp->tmp;
6200   max = rcp->max;
6201 
6202   ch = *from;
6203   while (ch != '\0' && max > 0) {
6204     *to = ch;
6205     to++;
6206     from++;
6207     ch = *from;
6208     max--;
6209   }
6210   *to = '\0';
6211   if (ch != '\0') {
6212     rcp->overflow = TRUE;
6213   }
6214 
6215   rcp->tmp = to;
6216   rcp->max = max;
6217 
6218   rcp->frame = 0;
6219 }
6220 
ReadCodingRegionBases(SeqLocPtr location,Int4 len,Uint1 frame,Int4Ptr totalP)6221 NLM_EXTERN CharPtr ReadCodingRegionBases (SeqLocPtr location, Int4 len, Uint1 frame, Int4Ptr totalP)
6222 
6223 {
6224   CharPtr      bases, txt;
6225   Int4         mod;
6226   ReadCdsData  rcd;
6227   /*
6228   Int2         actual, cnt;
6229   BioseqPtr    bsp;
6230   Int4         mod, position;
6231   SeqIdPtr     sip;
6232   SeqLocPtr    slp;
6233   SeqPortPtr   spp;
6234   */
6235 
6236   bases = MemNew ((size_t) (len + 6));
6237   if (bases == NULL)
6238       return NULL;
6239 
6240   rcd.tmp = bases;
6241   rcd.max = len;
6242   rcd.overflow = FALSE;
6243 
6244   /* adjust start position */
6245 
6246   if (frame == 2) {
6247     rcd.frame = 1;
6248   } else if (frame == 3) {
6249     rcd.frame = 2;
6250   } else {
6251     rcd.frame = 0;
6252   }
6253 
6254   SeqPortStreamLoc (location, STREAM_EXPAND_GAPS, (Pointer) &rcd, SaveCdsBases);
6255 
6256   txt = rcd.tmp;
6257 
6258   if (rcd.overflow) {
6259     ErrPostEx (SEV_ERROR, 0, 0, "ReadCodingRegionBases overflow caught");
6260   }
6261 
6262 #if 0
6263   spp = SeqPortNewByLoc (location, Seq_code_iupacna);
6264   if (spp == NULL) {
6265     MemFree (bases);
6266     return NULL;
6267   }
6268 
6269   slp = SeqLocFindNext (location, NULL);
6270   while (slp != NULL) {
6271     sip = SeqLocId (slp);
6272     if (sip != NULL) {
6273       bsp = BioseqFind (sip);
6274       if (bsp != NULL) {
6275         if (bsp->repr == Seq_repr_delta || bsp->repr == Seq_repr_virtual) {
6276           SeqPortSet_do_virtual (spp, TRUE);
6277         }
6278       }
6279     }
6280     slp = SeqLocFindNext (location, slp);
6281   }
6282 
6283   /* adjust start position */
6284 
6285   if (frame == 2) {
6286     position = 1;
6287   } else if (frame == 3) {
6288     position = 2;
6289   } else {
6290     position = 0;
6291   }
6292   SeqPortSeek (spp, position, SEEK_SET);
6293   len -= position;
6294 
6295   /* read nucleotides into temporary buffer */
6296 
6297   cnt = (Int2) MIN (len, 32000L);
6298   txt = bases;
6299   actual = 1;
6300   while (cnt > 0 && len > 0 && actual > 0) {
6301     actual = SeqPortRead (spp, (BytePtr) txt, cnt);
6302     if (actual < 0) {
6303       actual = -actual;
6304       if (actual == SEQPORT_VIRT || actual == SEQPORT_EOS) {
6305         actual = 1; /* ignore, keep going */
6306       } else if (actual == SEQPORT_EOF) {
6307         actual = 0; /* stop */
6308       }
6309     } else if (actual > 0) {
6310       len -= actual;
6311       txt += actual;
6312       cnt = (Int2) MIN (len, 32000L);
6313     }
6314   }
6315 
6316   SeqPortFree (spp);
6317 #endif
6318 
6319   /* pad incomplete last codon with Ns */
6320 
6321   len = StringLen (bases);
6322   if (len > 0) {
6323     mod = len % 3;
6324     if (mod == 1) {
6325       txt = StringMove (txt, "NN");
6326     } else if (mod == 2) {
6327       txt = StringMove (txt, "N");
6328     }
6329   }
6330   if (totalP != NULL) {
6331     *totalP = len;
6332   }
6333 
6334   return bases;
6335 }
6336 
MakeCodeBreakList(SeqLocPtr cdslocation,Int4 len,CodeBreakPtr cbp,Uint1 frame)6337 NLM_EXTERN ValNodePtr MakeCodeBreakList (SeqLocPtr cdslocation, Int4 len, CodeBreakPtr cbp, Uint1 frame)
6338 
6339 {
6340   Int4        adjust = 0, pos, pos1, pos2;
6341   SeqLocPtr   tmp;
6342   ValNodePtr  vnphead = NULL;
6343 
6344   if (cdslocation == NULL || cbp == NULL) return NULL;
6345 
6346   if (frame == 2) {
6347     adjust = 1;
6348   } else if (frame == 3) {
6349     adjust = 2;
6350   } else {
6351     adjust = 0;
6352   }
6353 
6354   while (cbp != NULL) {
6355     pos1 = INT4_MAX;
6356     pos2 = -10;
6357     tmp = NULL;
6358 
6359     while ((tmp = SeqLocFindNext (cbp->loc, tmp)) != NULL) {
6360       pos = GetOffsetInLoc (tmp, cdslocation, SEQLOC_START);
6361       if (pos < pos1) {
6362         pos1 = pos;
6363       }
6364       pos = GetOffsetInLoc (tmp, cdslocation, SEQLOC_STOP);
6365       if (pos > pos2) {
6366         pos2 = pos;
6367       }
6368     }
6369 
6370     pos = pos2 - pos1; /* codon length */
6371     /* allow partial codon at the end */
6372     if (pos == 2 || (pos >= 0 && pos <= 1 && pos2 == len - 1)) {
6373       pos1 -= adjust;
6374       ValNodeAddInt (&vnphead, (Int2) cbp->aa.value.intvalue, (Int4) (pos1 / 3));
6375     }
6376 
6377     cbp = cbp->next;
6378   }
6379 
6380   vnphead = ValNodeSort (vnphead, SortByIntvalue);
6381 
6382   return vnphead;
6383 }
6384 
TransTableTranslateCommon(TransTablePtr PNTR tblptr,SeqLocPtr location,SeqLocPtr product,Boolean partial,Int2 genCode,Uint1 frame,CodeBreakPtr code_break,Boolean include_stop,Boolean remove_trailingX,Boolean no_stop_at_end_of_complete_cds,BoolPtr altStartP,Boolean farProdFetchOK)6385 static ByteStorePtr TransTableTranslateCommon (
6386   TransTablePtr  PNTR tblptr,
6387   SeqLocPtr location,
6388   SeqLocPtr product,
6389   Boolean partial,
6390   Int2 genCode,
6391   Uint1 frame,
6392   CodeBreakPtr code_break,
6393   Boolean include_stop,
6394   Boolean remove_trailingX,
6395   Boolean no_stop_at_end_of_complete_cds,
6396   BoolPtr altStartP,
6397   Boolean farProdFetchOK
6398 )
6399 
6400 {
6401   Char           aa;
6402   Int2           j, state = 0;
6403   Boolean        bad_base, no_start, check_start, got_stop,
6404                  incompleteLastCodon, use_break = FALSE, is_first;
6405   CharPtr        bases, txt, protseq;
6406   ByteStorePtr   bs;
6407   ValNodePtr     codebreakhead = NULL, vnp;
6408   TransTablePtr  localtbl = NULL, tbl;
6409   Uint2          part_prod = 0, part_loc = 0;
6410   Int4           dnalen, protlen, total, k, p, q;
6411   Uint1          residue = 0;
6412 
6413   /* if table pointer not passed in from calling stack, use local table */
6414 
6415   if (tblptr == NULL) {
6416     tblptr = &localtbl;
6417   }
6418 
6419   if (location == NULL) return NULL;
6420   dnalen = SeqLocLen (location);
6421   if (dnalen < 1) return NULL;
6422 
6423   /* adjust for obsolete genetic code numbers */
6424 
6425   if (genCode == 7) {
6426     genCode = 4;
6427   } else if (genCode == 8) {
6428     genCode = 1;
6429   } else if (genCode == 0) {
6430     genCode = 1;
6431   }
6432 
6433   /* can store table for reuse on calling function's stack, replace if code is changing */
6434 
6435   tbl = *tblptr;
6436   if (tbl != NULL && genCode != tbl->genCode) {
6437     tbl = TransTableFree (tbl);
6438     *tblptr = tbl;
6439   }
6440   if (tbl == NULL) {
6441     tbl = TransTableNew (genCode);
6442     *tblptr = tbl;
6443   }
6444   if (tbl == NULL) return NULL;
6445 
6446   /* read bases, pad last codon with Ns, get total base count without padding */
6447 
6448   bases = ReadCodingRegionBases (location, dnalen, frame, &total);
6449   if (bases == NULL) {
6450     TransTableFree (localtbl);
6451     return NULL;
6452   }
6453 
6454   /* reality check on length */
6455 
6456   if (StringLen (bases) < 3) {
6457     MemFree (bases);
6458     TransTableFree (localtbl);
6459     return NULL;
6460   }
6461 
6462   /* process code breaks into list of aa (choice) and protein offset (data.intvalue) */
6463 
6464   codebreakhead = MakeCodeBreakList (location, dnalen, code_break, frame);
6465 
6466   no_start = FALSE;
6467   part_loc = SeqLocPartialCheck (location);
6468   part_prod = SeqLocPartialCheckEx (product, farProdFetchOK);
6469   if ((part_loc & SLP_START) /* || (part_prod & SLP_START) */) {
6470     no_start = TRUE;
6471   }
6472   if (StringHasNoText (tbl->sncbieaa) || no_start || frame > 1) {
6473     check_start = FALSE;
6474   } else {
6475     check_start = TRUE;
6476   }
6477 
6478   /* size of protein, allow partial codon at end */
6479 
6480   protlen = dnalen;
6481   protlen /= 3;
6482   protlen += 1;
6483 
6484   protseq = (CharPtr) MemNew ((size_t) protlen + 2);
6485   if (protseq == NULL) {
6486     MemFree (bases);
6487     ValNodeFree (codebreakhead);
6488     TransTableFree (localtbl);
6489     return NULL;
6490   }
6491 
6492   bs = BSNew (protlen);
6493   if (bs == NULL) {
6494     MemFree (bases);
6495     MemFree (protseq);
6496     ValNodeFree (codebreakhead);
6497     TransTableFree (localtbl);
6498     return NULL;
6499   }
6500 
6501   got_stop = FALSE;
6502   incompleteLastCodon = FALSE;
6503   is_first = TRUE;
6504   use_break = FALSE;
6505   state = 0;
6506 
6507   k = 0;
6508   p = 0;
6509   q = 0;
6510   txt = bases;
6511   residue = (Uint1) *txt;
6512 
6513   if (altStartP != NULL) {
6514     *altStartP = FALSE;
6515   }
6516 
6517   /* loop through all codons */
6518 
6519   while (residue != '\0') {
6520     for (j = 0, bad_base = FALSE; j < 3; j++, k++, txt++, residue = (Uint1) *txt) {
6521       if (IS_residue (residue)) {
6522         state = NextCodonState (tbl, state, residue);
6523       } else {
6524         state = NextCodonState (tbl, state, 'N');
6525         bad_base = TRUE;
6526       }
6527     }
6528 
6529     for (vnp = codebreakhead; vnp != NULL && vnp->data.intvalue != p; vnp = vnp->next) continue;
6530     use_break = (Boolean) (vnp != NULL);
6531 
6532     if (use_break) {
6533       aa = (Char) vnp->choice;
6534     } else if (bad_base) {
6535       aa = 'X';
6536     } else if (is_first && check_start) {
6537 
6538       /* ambiguous start codon that MAY be an initiator now translated to ambiguous X amino acid */
6539       aa = GetStartResidue (tbl, state, TTBL_TOP_STRAND);
6540       if (aa == '-') {
6541         if ((! ((part_loc & SLP_STOP) || (part_prod & SLP_STOP))) && (partial)) {
6542           aa = GetCodonResidue (tbl, state, TTBL_TOP_STRAND);
6543         }
6544       } else {
6545         if (altStartP != NULL) {
6546           if (IsAltStart (tbl, state, TTBL_TOP_STRAND)) {
6547             *altStartP = TRUE;
6548           }
6549         }
6550       }
6551     } else {
6552 
6553       aa = GetCodonResidue (tbl, state, TTBL_TOP_STRAND);
6554     }
6555     is_first = FALSE;
6556 
6557     if (aa == '*'
6558       && (! include_stop
6559         || (no_stop_at_end_of_complete_cds && ! partial && *(txt + 1) == 0)))
6560     {
6561       got_stop = TRUE;
6562       residue = '\0'; /* signal end of loop */
6563 
6564     } else {
6565 
6566       if (q < protlen) { /* protect against accidental buffer overflow */
6567         protseq [q] = aa;
6568       }
6569       q++;
6570       /*
6571       BSPutByte (bs, (Int2) aa);
6572       */
6573     }
6574 
6575     /* advance protein position for code break test */
6576 
6577     p++;
6578   }
6579 
6580   if (q > protlen) {
6581     ErrPostEx (SEV_ERROR, 0, 0, "TransTableTranslate - %ld characters written, %ld characters expected", (long) q, (long) protlen);
6582   }
6583 
6584   if (k > total) {
6585     incompleteLastCodon = TRUE;
6586   }
6587 
6588   if ((! got_stop) && (! incompleteLastCodon) && q > 0 && (! partial) && (! use_break)) {
6589     /* check for stop codon that normally encodes an amino acid */
6590     aa = GetStartResidue (tbl, state, TTBL_TOP_STRAND);
6591     if (aa == '*') {
6592       if (include_stop) {
6593         protseq [q - 1] = aa;
6594       } else {
6595         q--;
6596       }
6597       got_stop = TRUE;
6598     }
6599   }
6600 
6601   if ((! got_stop) && incompleteLastCodon && q > 0) {
6602     aa = protseq [q - 1];
6603     if ((aa == 'X' /* || aa == 'B' || aa == 'Z' */) && q > 0) {
6604       q--;
6605     }
6606 #if 0
6607     BSSeek (bs, -1, SEEK_END);  /* remove last X if incomplete last codon */
6608     aa = (Char) BSGetByte (bs);
6609     if ((aa == 'X' /* || aa == 'B' || aa == 'Z' */) && BSLen (bs) > 0) {
6610       BSSeek (bs, -1, SEEK_END);
6611       BSDelete (bs, 1);
6612       BSSeek (bs, -1, SEEK_END);
6613     }
6614 #endif
6615   }
6616 
6617   if ((! got_stop) && remove_trailingX && q > 0) { /* only remove trailing X on partial CDS */
6618     aa = protseq [q - 1];
6619     while ((aa == 'X' /* || aa == 'B' || aa == 'Z' */) && q > 0) {
6620       q--;
6621       aa = protseq [q - 1];
6622     }
6623 #if 0
6624     BSSeek (bs, -1, SEEK_END);  /* back up to last residue */
6625     aa = (Char) BSGetByte (bs);
6626     while ((aa == 'X' /* || aa == 'B' || aa == 'Z' */) && BSLen (bs) > 0) {
6627       BSSeek (bs, -1, SEEK_END);
6628       BSDelete (bs, 1);
6629       BSSeek (bs, -1, SEEK_END);
6630       aa = (Char) BSGetByte (bs);
6631     }
6632 #endif
6633   }
6634 
6635   BSWrite (bs, (Pointer) protseq, q);
6636 
6637   if (BSLen (bs) < 1) {
6638     bs = BSFree (bs);
6639   }
6640 
6641   /* clean up temporarily allocated memory */
6642 
6643   MemFree (bases);
6644   MemFree (protseq);
6645   ValNodeFree (codebreakhead);
6646 
6647   /* free local table, if allocated */
6648 
6649   TransTableFree (localtbl);
6650 
6651   return bs;
6652 }
6653 
6654 /* public functions for trans table translation */
6655 
TransTableTranslateSeqLoc(TransTablePtr PNTR tblptr,SeqLocPtr location,Int2 genCode,Uint1 frame,Boolean include_stop,Boolean remove_trailingX)6656 NLM_EXTERN ByteStorePtr TransTableTranslateSeqLoc (
6657   TransTablePtr  PNTR tblptr,
6658   SeqLocPtr location,
6659   Int2 genCode,
6660   Uint1 frame,
6661   Boolean include_stop,
6662   Boolean remove_trailingX
6663 )
6664 
6665 {
6666   return TransTableTranslateCommon (tblptr, location, NULL, FALSE, genCode,
6667                                     frame, NULL, include_stop,
6668                                     remove_trailingX, FALSE, NULL, TRUE);
6669 }
6670 
TransTableTranslateCdRegionEx(TransTablePtr PNTR tblptr,SeqFeatPtr cds,Boolean include_stop,Boolean remove_trailingX,Boolean no_stop_at_end_of_complete_cds,BoolPtr altStartP,Boolean farProdFetchOK)6671 NLM_EXTERN ByteStorePtr TransTableTranslateCdRegionEx (
6672   TransTablePtr  PNTR tblptr,
6673   SeqFeatPtr cds,
6674   Boolean include_stop,
6675   Boolean remove_trailingX,
6676   Boolean no_stop_at_end_of_complete_cds,
6677   BoolPtr altStartP,
6678   Boolean farProdFetchOK
6679 )
6680 
6681 {
6682   CdRegionPtr  crp;
6683   Int2         genCode = 0;
6684   ValNodePtr   vnp;
6685   Boolean      partial5, partial3;
6686 
6687   if (cds == NULL || cds->data.choice != SEQFEAT_CDREGION) return NULL;
6688   crp = (CdRegionPtr) cds->data.value.ptrvalue;
6689   if (crp == NULL) return NULL;
6690 
6691   /* set genCode variable from genetic_code parameter, if id choice is used */
6692 
6693   if (crp->genetic_code != NULL) {
6694     vnp = (ValNodePtr) crp->genetic_code->data.ptrvalue;
6695     while (vnp != NULL) {
6696       if (vnp->choice == 2) {
6697         genCode = (Int2) vnp->data.intvalue;
6698       }
6699       vnp = vnp->next;
6700     }
6701   }
6702   CheckSeqLocForPartial (cds->location, &partial5, &partial3);
6703 
6704   return TransTableTranslateCommon (tblptr, cds->location, cds->product, partial3,
6705                                     genCode, crp->frame, crp->code_break,
6706                                     include_stop, remove_trailingX,
6707                                     no_stop_at_end_of_complete_cds, altStartP, farProdFetchOK);
6708 }
6709 
TransTableTranslateCdRegion(TransTablePtr PNTR tblptr,SeqFeatPtr cds,Boolean include_stop,Boolean remove_trailingX,Boolean no_stop_at_end_of_complete_cds)6710 NLM_EXTERN ByteStorePtr TransTableTranslateCdRegion (
6711   TransTablePtr  PNTR tblptr,
6712   SeqFeatPtr cds,
6713   Boolean include_stop,
6714   Boolean remove_trailingX,
6715   Boolean no_stop_at_end_of_complete_cds
6716 )
6717 
6718 {
6719   return TransTableTranslateCdRegionEx (tblptr, cds, include_stop, remove_trailingX,
6720                                         no_stop_at_end_of_complete_cds, NULL, TRUE);
6721 }
6722 
6723 /* allow reuse of translation tables by saving as AppProperty */
6724 
PersistentTransTableCommon(SeqFeatPtr cds,Int2 genCode)6725 static TransTablePtr  PersistentTransTableCommon (
6726   SeqFeatPtr cds,
6727   Int2 genCode
6728 )
6729 
6730 {
6731   CdRegionPtr    crp;
6732   Char           str [32];
6733   TransTablePtr  tbl = NULL;
6734   ValNodePtr     vnp;
6735 
6736   if (cds != NULL && cds->data.choice == SEQFEAT_CDREGION) {
6737     crp = (CdRegionPtr) cds->data.value.ptrvalue;
6738     if (crp != NULL && crp->genetic_code != NULL) {
6739       vnp = (ValNodePtr) crp->genetic_code->data.ptrvalue;
6740       while (vnp != NULL) {
6741         if (vnp->choice == 2) {
6742           genCode = (Int2) vnp->data.intvalue;
6743         }
6744         vnp = vnp->next;
6745       }
6746     }
6747   }
6748 
6749   if (genCode == 7) {
6750     genCode = 4;
6751   } else if (genCode == 8) {
6752     genCode = 1;
6753   } else if (genCode == 0) {
6754     genCode = 1;
6755   }
6756 
6757   /* set app property name for storing desired FSA */
6758 
6759   sprintf (str, "TransTableFSAforGenCode%d", (int) genCode);
6760 
6761   /* get FSA for desired genetic code if it already exists */
6762 
6763   tbl = (TransTablePtr) GetAppProperty (str);
6764 
6765   /* if not already exists, save FSA in genetic code-specific app property name */
6766 
6767   if (tbl == NULL) {
6768     tbl = TransTableNew (genCode);
6769     SetAppProperty (str, (Pointer) tbl);
6770   }
6771 
6772   return tbl;
6773 }
6774 
PersistentTransTableByGenCode(Int2 genCode)6775 NLM_EXTERN TransTablePtr PersistentTransTableByGenCode (
6776   Int2 genCode
6777 )
6778 
6779 {
6780   return PersistentTransTableCommon (NULL, genCode);
6781 }
6782 
PersistentTransTableByCdRegion(SeqFeatPtr cds)6783 NLM_EXTERN TransTablePtr PersistentTransTableByCdRegion (
6784   SeqFeatPtr cds
6785 )
6786 
6787 {
6788   return PersistentTransTableCommon (cds, 0);
6789 }
6790 
6791 /*****************************************************************************
6792 *
6793 *   SeqSearch
6794 *       Initializes SeqSearch finite state machine for sequence searching
6795 *       Based on Practical Algorithms for Programmers by Binstock and Rex
6796 *
6797 *****************************************************************************/
6798 
6799 /* general purpose DNA sequence search finite state machine */
6800 
6801 typedef struct seqpattern {
6802   CharPtr           name;
6803   CharPtr           pattern;
6804   Int2              cutSite;
6805   Uint1             strand;
6806   struct seqpattern * next;
6807 } SeqPatternItem, PNTR SeqPatternPtr;
6808 
6809 typedef struct seqmatch {
6810   CharPtr         name;
6811   CharPtr         pattern;
6812   Int2            cutSite;
6813   Uint1           strand;
6814   struct seqmatch * next;
6815 } SeqMatchItem, PNTR SeqMatchPtr;
6816 
6817 typedef struct seqstate {
6818   Int2         onfailure;
6819   Int2         transitions [15]; /* order is ACGTMRWSYKVHDBN */
6820   SeqMatchPtr  matches;
6821 } SeqStateItem, PNTR SeqStatePtr;
6822 
6823 typedef struct SeqSearch {
6824   SeqStatePtr         stateArray;
6825   SeqPatternPtr       patternList;
6826   Int4                maxPatLen;
6827   Int2                maxState;
6828   Int2                highState;
6829   Int2                currentState;
6830   Int4                currentPos;
6831   Boolean             primed;
6832   SeqSearchMatchProc  matchproc;
6833   Pointer             userdata;
6834   Uint1               letterToIdx [256];
6835   Uint1               letterToComp [256];
6836 } SeqSearchData;
6837 
#define FAIL_STATE (-1)  /* returned by SeqSearchGotoState when no transition exists */
6839 
6840 /* returns next state given current state and next character */
6841 
/*
 * Looks up the transition from state on character ch.
 *
 * Returns the stored next state when it is non-zero.  With no stored
 * transition the result is 0 if state is 0 and zeroFailureReturnsZero is set,
 * and FAIL_STATE in every other case.
 */

static Int2 SeqSearchGotoState (
  SeqSearchPtr tbl,
  Int2 state,
  Char ch,
  Boolean zeroFailureReturnsZero
)

{
  int   column;
  Int2  target;

  column = tbl->letterToIdx [(int) (Uint1) ch];
  target = tbl->stateArray [(int) state].transitions [column];

  if (target == 0) {
    if (state == 0 && zeroFailureReturnsZero) {
      return 0;
    }
    return FAIL_STATE;
  }

  return target;
}
6864 
6865 /* returns state to check next if current pattern broken */
6866 
/* returns the onfailure entry stored for the given state */

static Int2 SeqSearchFailState (
  SeqSearchPtr tbl,
  Int2 state
)

{
  return tbl->stateArray [(int) state].onfailure;
}
6878 
6879 /* add a single character transition from one state to another */
6880 
/* records that character ch leads from oldState to newState */

static void SeqSearchAddTransition (
  SeqSearchPtr tbl,
  Int2 oldState,
  Char ch,
  Int2 newState
)

{
  int  column;

  column = tbl->letterToIdx [(int) (Uint1) ch];
  tbl->stateArray [(int) oldState].transitions [column] = newState;
}
6896 
6897 /* given state should report a successful match */
6898 
/*
 * Attaches a match record to state unless one with the same name is already
 * there.  name and pattern are copied with StringSave; the new record goes to
 * the front of the state's match list.  Does nothing if the record cannot be
 * allocated.
 */

static void SeqSearchAddOutput (
  SeqSearchPtr tbl,
  Int2 state,
  CharPtr name,
  CharPtr pattern,
  Int2 cutSite,
  Uint1 strand
)

{
  SeqMatchPtr  hit;
  SeqStatePtr  node;

  node = &(tbl->stateArray [(int) state]);

  /* a name is reported at most once per state */

  hit = node->matches;
  while (hit != NULL) {
    if (StringCmp (name, hit->name) == 0) return;
    hit = hit->next;
  }

  hit = (SeqMatchPtr) MemNew (sizeof (SeqMatchItem));
  if (hit == NULL) return;

  hit->name = StringSave (name);
  hit->pattern = StringSave (pattern);
  hit->cutSite = cutSite;
  hit->strand = strand;

  /* push onto the head of the list */

  hit->next = node->matches;
  node->matches = hit;
}
6928 
6929 /* add one nucleotide sequence pattern to the finite state machine */
6930 
SeqSearchEnterNucWord(SeqSearchPtr tbl,Int2 highState,Int2 maxState,CharPtr name,CharPtr pattern,Int2 cutSite,Uint1 strand)6931 static Int2 SeqSearchEnterNucWord (
6932   SeqSearchPtr tbl,
6933   Int2 highState,
6934   Int2 maxState,
6935   CharPtr name,
6936   CharPtr pattern,
6937   Int2 cutSite,
6938   Uint1 strand
6939 )
6940 
6941 {
6942   Char     ch;
6943   Int2     next, patLen, state;
6944   CharPtr  ptr;
6945 
6946   state = 0;
6947   next = 0;
6948 
6949   patLen = StringLen (pattern);
6950 
6951   /* try to overlay beginning of pattern onto existing table */
6952 
6953   for (ptr = pattern, ch = *ptr; ch != '\0'; ptr++, ch = *ptr) {
6954     next = SeqSearchGotoState (tbl, state, ch, FALSE);
6955     if (next == FAIL_STATE) break;
6956     state = next;
6957   }
6958 
6959   /* now create new states for remaining characters in pattern */
6960 
6961   for ( ; ch != '\0'; ptr++, ch = *ptr) {
6962     highState++;
6963     SeqSearchAddTransition (tbl, state, ch, highState);
6964     state = highState;
6965   }
6966 
6967   /* at end of pattern record match information */
6968 
6969   SeqSearchAddOutput (tbl, state, name, pattern, cutSite, strand);
6970 
6971   return highState;
6972 }
6973 
6974 /* FIFO queue and other functions for building failure states */
6975 
/*
   SeqSearchQueueAdd - append 'val' to the chain that starts at qbeg.

   The queue is threaded through the array itself: queue [x] holds the
   entry that follows x, and 0 ends the chain.  The chain is walked from
   qbeg to its last entry, val is linked there, and val becomes the new
   end of the chain.
*/

static void SeqSearchQueueAdd (
  Int2Ptr queue,
  Int2 qbeg,
  Int2 val
)

{
  Int2  tail;

  tail = qbeg;
  while (queue [tail] != 0) {
    tail = queue [tail];
  }

  queue [tail] = val;
  queue [val] = 0;
}
6994 
/*
   SeqSearchFindFail - set the failure link of newState.

   Starting at 'state', failure links are followed until some state has a
   transition on 'ch' (state 0 always answers, because the lookup is made
   with zeroFailureReturnsZero set).  The state found becomes newState's
   onfailure link, and every match recorded there is also recorded on
   newState.
*/

static void SeqSearchFindFail (
  SeqSearchPtr tbl,
  Int2 state,
  Int2 newState,
  Char ch
)

{
  SeqMatchPtr  inherited;
  Int2         target;

  /* traverse existing failure path */

  for (;;) {
    target = SeqSearchGotoState (tbl, state, ch, TRUE);
    if (target != FAIL_STATE) break;
    state = SeqSearchFailState (tbl, state);
  }

  /* add new failure state */

  tbl->stateArray [(int) newState].onfailure = target;

  /* add matches of substring at new state */

  inherited = tbl->stateArray [(int) target].matches;
  while (inherited != NULL) {
    SeqSearchAddOutput (tbl, newState, inherited->name, inherited->pattern,
                        inherited->cutSite, inherited->strand);
    inherited = inherited->next;
  }
}
7026 
/*
   SeqSearchComputeFail - fill in the onfailure link of every state.

   States are visited breadth first.  'queue' is scratch space indexed by
   state number: queue [x] holds the state queued after x and 0 ends the
   chain (see SeqSearchQueueAdd).  States reached directly from state 0
   fail back to state 0; for each deeper state the link is derived from
   its parent's link by SeqSearchFindFail.
*/

static void SeqSearchComputeFail (
  SeqSearchPtr tbl,
  Int2Ptr queue
)

{
  CharPtr      charToNuc = "ACGTMRWSYKVHDBN";
  Char         ch;
  Int2         qbeg, r, s, state;
  int          index;
  SeqStatePtr  sp;

  qbeg = 0;
  queue [0] = 0;

  /* queue up states reached directly from state 0 (depth 1) */

  sp = &(tbl->stateArray [0]);
  sp->onfailure = 0;
  for (index = 0; index < 15; index++) {
    s = sp->transitions [index];
    if (s == 0) continue;

    /* the depth 1 state itself, not state 0, is what fails back to 0 */

    tbl->stateArray [(int) s].onfailure = 0;
    SeqSearchQueueAdd (queue, qbeg, s);
  }

  while (queue [qbeg] != 0) {
    r = queue [qbeg];
    qbeg = r;

    /* depth 1 states beget depth 2 states, etc. */

    sp = &(tbl->stateArray [r]);
    for (index = 0; index < 15; index++) {
      ch = charToNuc [index];
      s = sp->transitions [index];
      if (s == 0) continue;
      SeqSearchQueueAdd (queue, qbeg, s);

      /*
         Search for nucleotide sequences GTCGAC and TCATGA

         State   Substring   Transitions   Failure
           2       GT          C ->   3       7
           3       GTC         G ->   4       ?
           ...
           7       T           C ->   8       0
           8       TC          A ->   9

         For example, r = 2 (GT), if 'C' would go to s = 3 (GTC).
         From previous computation, 2 (GT) fails to 7 (T).  So we
         are not in a pattern starting with GT, but we may be in
         a pattern starting with the next character after G, or T.
         Thus, check state 7 (T) for any transitions using 'C'.
         Since 7 (T) 'C' -> 8 (TC), therefore set fail [3] -> 8.
      */

      state = SeqSearchFailState (tbl, r);
      SeqSearchFindFail (tbl, state, s, ch);
    }
  }
}
7088 
7089 /* on first character, populate state transition table */
7090 
SeqSearchPrimeStateArray(SeqSearchPtr tbl)7091 static void SeqSearchPrimeStateArray (
7092   SeqSearchPtr tbl
7093 )
7094 
7095 {
7096   Int2           highState, maxState;
7097   SeqPatternPtr  pp;
7098   Int2Ptr        queue;
7099   SeqStatePtr    stateArray;
7100 
7101   if (tbl == NULL || tbl->primed || tbl->patternList == NULL) return;
7102 
7103   for (maxState = 1, pp = tbl->patternList; pp != NULL; pp = pp->next) {
7104     maxState += StringLen (pp->pattern);
7105   }
7106 
7107   if (maxState > 4000) {
7108     Message (MSG_POST, "FiniteStateSearch cannot handle %d states", (int) maxState);
7109     return;
7110   }
7111 
7112   stateArray = (SeqStatePtr) MemNew (sizeof (SeqStateItem) * (size_t) maxState);
7113   queue = (Int2Ptr) MemNew (sizeof (Int2) * maxState);
7114 
7115   if (stateArray == NULL || queue == NULL) {
7116     MemFree (stateArray);
7117     MemFree (queue);
7118     Message (MSG_POST, "SequenceSearch unable to allocate buffers");
7119     return;
7120   }
7121 
7122   tbl->stateArray = stateArray;
7123   tbl->maxState = maxState;
7124 
7125   for (highState = 0, pp = tbl->patternList; pp != NULL; pp = pp->next) {
7126     highState = SeqSearchEnterNucWord (tbl, highState, maxState, pp->name,
7127                                        pp->pattern, pp->cutSite, pp->strand);
7128   }
7129 
7130   SeqSearchComputeFail (tbl, queue);
7131 
7132   MemFree (queue);
7133 
7134   tbl->highState = highState;
7135   tbl->currentState = 0;
7136   tbl->currentPos = 0;
7137   tbl->primed = TRUE;
7138 }
7139 
7140 /* for testing, print summary of transition table */
7141 
7142 /*
7143 static void PrintSeqSearchTable (
7144   SeqSearchPtr tbl,
7145   FILE *fp
7146 )
7147 
7148 {
7149   Int2         i;
7150   SeqMatchPtr  mp;
7151   SeqStatePtr  sp;
7152   Int2         state;
7153 
7154   if (tbl == NULL || fp == NULL) return;
7155   if (! tbl->primed) {
7156     SeqSearchPrimeStateArray (tbl);
7157   }
7158   if (tbl->stateArray == NULL) return;
7159   if (tbl->highState > 99) return;
7160 
7161   fprintf (fp, "State Fail A  C  G  T  M  R  W  S  Y  K  V  H  D  B  N\n");
7162 
7163   for (state = 0; state <= tbl->highState; state++) {
7164     sp = &(tbl->stateArray [(int) state]);
7165     fprintf (fp, " %3d  %3d", (int) state, (int) sp->onfailure);
7166 
7167     for (i = 0; i < 15; i++) {
7168       if (sp->transitions [i] != 0) {
7169         fprintf (fp, "%3d", (int) sp->transitions [i]);
7170       } else {
7171         fprintf (fp, "   ");
7172       }
7173     }
7174 
7175     for (mp = sp->matches; mp != NULL; mp = mp->next) {
7176       fprintf (fp, " %s", mp->name);
7177     }
7178 
7179     fprintf (fp, "\n");
7180   }
7181 }
7182 */
7183 
7184 /* create empty nucleotide sequence search finite state machine */
7185 
SeqSearchNew(SeqSearchMatchProc matchproc,Pointer userdata)7186 NLM_EXTERN SeqSearchPtr SeqSearchNew (
7187   SeqSearchMatchProc matchproc,
7188   Pointer userdata
7189 )
7190 
7191 {
7192   CharPtr       charToNuc = "ACGTMRWSYKVHDBN";
7193   Char          ch, lttr;
7194   CharPtr       complementBase = " TVGH  CD  M KN   YSAABW R ";
7195   Int2          i;
7196   Uint1         k;
7197   SeqSearchPtr  tbl;
7198 
7199   if (matchproc == NULL) return NULL;
7200   tbl = (SeqSearchPtr) MemNew (sizeof (SeqSearchData));
7201   if (tbl == NULL) return NULL;
7202 
7203   tbl->stateArray = NULL;
7204   tbl->patternList = NULL;
7205   tbl->maxPatLen = 0;
7206   tbl->maxState = 0;
7207   tbl->highState = 0;
7208   tbl->currentState = 0;
7209   tbl->currentPos = 0;
7210   tbl->matchproc = matchproc;
7211   tbl->userdata = userdata;
7212   tbl->primed = FALSE;
7213 
7214   /* initialize table to convert character to transition index from 0 (A) to 14 (N) */
7215 
7216   for (i = 0; i < 256; i++) {
7217     tbl->letterToIdx [i] = 14;
7218   }
7219   for (k = 0; k < 15; k++) {
7220     ch = charToNuc [k];
7221     tbl->letterToIdx [(int) ch] = k;
7222     ch = TO_LOWER (ch);
7223     tbl->letterToIdx [(int) ch] = k;
7224   }
7225   tbl->letterToIdx [(int) 'U'] = tbl->letterToIdx [(int) 'T'];
7226   tbl->letterToIdx [(int) 'u'] = tbl->letterToIdx [(int) 'T'];
7227   tbl->letterToIdx [(int) 'X'] = tbl->letterToIdx [(int) 'N'];
7228   tbl->letterToIdx [(int) 'x'] = tbl->letterToIdx [(int) 'N'];
7229 
7230   /* initialize table to convert character to complement character */
7231 
7232   for (i = 0; i < 256; i++) {
7233     tbl->letterToComp [i] = '\0';
7234   }
7235   for (ch = 'A', i = 1; ch <= 'Z'; ch++, i++) {
7236     lttr = complementBase [i];
7237     if (lttr != ' ') {
7238       tbl->letterToComp [(int) (Uint1) ch] = lttr;
7239     }
7240   }
7241   for (ch = 'a', i = 1; ch <= 'z'; ch++, i++) {
7242     lttr = complementBase [i];
7243     if (lttr != ' ') {
7244       tbl->letterToComp [(int) (Uint1) ch] = lttr;
7245     }
7246   }
7247 
7248   return tbl;
7249 }
7250 
/*
   table to expand ambiguity letter to all matching nucleotide letters;
   entry [letter - 'A'] lists the bases that the letter stands for, and an
   empty string means the letter has no expansion
*/

static CharPtr  nucExpandList [26] = {
  "A",     /* A */
  "CGT",   /* B */
  "C",     /* C */
  "AGT",   /* D */
  "",      /* E */
  "",      /* F */
  "G",     /* G */
  "ACT",   /* H */
  "",      /* I */
  "",      /* J */
  "GT",    /* K */
  "",      /* L */
  "AC",    /* M */
  "ACGT",  /* N */
  "",      /* O */
  "",      /* P */
  "",      /* Q */
  "AG",    /* R */
  "CG",    /* S */
  "T",     /* T */
  "T",     /* U */
  "ACG",   /* V */
  "AT",    /* W */
  "",      /* X */
  "CT",    /* Y */
  ""       /* Z */
};
7281 
7282 /* recursive function to expand and store appropriate individual patterns */
7283 
StoreSeqPattern(SeqSearchPtr tbl,CharPtr name,CharPtr str,Int2 cutSite,Uint1 strand)7284 static void StoreSeqPattern (
7285   SeqSearchPtr tbl,
7286   CharPtr name,
7287   CharPtr str,
7288   Int2 cutSite,
7289   Uint1 strand
7290 )
7291 
7292 {
7293   Int4           patLen;
7294   SeqPatternPtr  pp;
7295 
7296   pp = (SeqPatternPtr) MemNew (sizeof (SeqPatternItem));
7297   if (pp == NULL) return;
7298 
7299   pp->name = StringSave (name);
7300   pp->pattern = StringSave (str);
7301   pp->cutSite = cutSite;
7302   pp->strand = strand;
7303 
7304   pp->next = tbl->patternList;
7305   tbl->patternList = pp;
7306   patLen = StringLen (str);
7307   if (patLen > tbl->maxPatLen) {
7308     tbl->maxPatLen = patLen;
7309   }
7310 }
7311 
/*
   ExpandSeqPattern - recursive function to expand and store appropriate
   individual patterns.

   str is a caller-supplied work buffer that is filled one position per
   recursion level.  With SEQ_SEARCH_EXPAND_PATTERN set, the letter at
   pattern [position] is replaced in turn by every base listed for it in
   nucExpandList; a character outside 'A' through 'Z', or a letter with
   an empty list, yields no patterns.  Without that flag the character is
   copied unchanged.

   Once position reaches patLen the finished string is stored.  With
   SEQ_SEARCH_ALLOW_MISMATCH set, one extra pattern per position is then
   stored, with that position replaced by 'N'.
*/

static void ExpandSeqPattern (
  SeqSearchPtr tbl,
  CharPtr name,
  CharPtr pattern,
  Int2 cutSite,
  Uint1 strand,
  size_t patLen,
  CharPtr str,
  Uint2 position,
  SearchFlgType flags
)

{
  Char     ch, lttr;
  Uint2    idx;
  CharPtr  ptr;

  if (position < patLen) {

    if ((Boolean) ((flags & SEQ_SEARCH_EXPAND_PATTERN) != 0)) {

      /*
         given ambiguity letter, get its entry in nucExpandList; anything
         that is not an upper case letter must not be turned into an
         index, since it would land outside the 26 entry table
      */

      ch = pattern [position];
      ptr = "";
      if (ch >= 'A' && ch <= 'Z') {
        ptr = nucExpandList [ch - 'A'];
      }

      /* put every ACGT letter at current position, recurse for next position */

      for (lttr = *ptr; lttr != '\0'; ptr++, lttr = *ptr) {
        str [position] = lttr;
        ExpandSeqPattern (tbl, name, pattern, cutSite, strand,
                          patLen, str, position + 1, flags);
      }

    } else {

      /* if matching ambiguity characters in sequence, do not expand each base */

      str [position] = pattern [position];
      ExpandSeqPattern (tbl, name, pattern, cutSite, strand,
                        patLen, str, position + 1, flags);
    }

    /* do not run into pattern storage section of code located below */

    return;
  }

  /* when position reaches pattern length, store one fully expanded string */

  StoreSeqPattern (tbl, name, str, cutSite, strand);

  if ((Boolean) ((flags & SEQ_SEARCH_ALLOW_MISMATCH) == 0)) return;

  for (idx = 0; idx < patLen; idx++) {
    ch = str [idx];

    /* put N at every position if a single mismatch is allowed */

    str [idx] = 'N';

    StoreSeqPattern (tbl, name, str, cutSite, strand);

    /* now restore proper character, go on to put N in next position */

    str [idx] = ch;
  }
}
7381 
7382 /* add restriction site to sequence search finite state machine */
7383 
SeqSearchAddNucleotidePattern(SeqSearchPtr tbl,CharPtr name,CharPtr pattern,Int2 cutSite,SearchFlgType flags)7384 NLM_EXTERN void SeqSearchAddNucleotidePattern (
7385   SeqSearchPtr tbl,
7386   CharPtr name,
7387   CharPtr pattern,
7388   Int2 cutSite,
7389   SearchFlgType flags
7390 )
7391 
7392 {
7393   Char     ch, comp [128], pat [128], str [128];
7394   Int2     i, j;
7395   size_t   len;
7396   Uint1    strand;
7397   Boolean  symmetric = TRUE;
7398 
7399   if (tbl == NULL || StringHasNoText (name) || StringHasNoText (pattern)) return;
7400 
7401   StringNCpy_0 (pat, pattern, sizeof (pat));
7402   TrimSpacesAroundString (pat);
7403 
7404   len = StringLen (pat);
7405 
7406   /* upper case working copy of pattern string */
7407 
7408   for (i = 0; i < len; i++) {
7409     ch = pat [i];
7410     pat [i] = TO_UPPER (ch);
7411   }
7412 
7413   /* reverse complement pattern to see if it is symetrical */
7414 
7415   for (i = 0, j = len - 1; i < len; i++, j--) {
7416     ch = pat [i];
7417     comp [j] = tbl->letterToComp [(int) (Uint1) ch];
7418   }
7419   comp [len] = '\0';
7420   symmetric = (Boolean) (StringICmp (pat, comp) == 0);
7421 
7422   if (symmetric) {
7423     strand = Seq_strand_both;
7424   } else {
7425     strand = Seq_strand_plus;
7426   }
7427 
7428   /* record expansion of entered pattern */
7429 
7430   MemSet ((Pointer) str, 0, sizeof (str));
7431   ExpandSeqPattern (tbl, name, pat, cutSite, strand,
7432                     len, str, 0, flags);
7433 
7434   if (symmetric) return;
7435   if ((Boolean) ((flags & SEQ_SEARCH_JUST_TOP_STRAND) != 0)) return;
7436 
7437   /* record expansion of reverse complement of asymmetric pattern */
7438 
7439   MemSet ((Pointer) str, 0, sizeof (str));
7440   ExpandSeqPattern (tbl, name, comp, len - cutSite, Seq_strand_minus,
7441                     len, str, 0, flags);
7442 }
7443 
7444 /* program passes each character in turn to finite state machine */
7445 
SeqSearchProcessCharacterEx(SeqSearchPtr tbl,Char ch,Int4 length)7446 static void SeqSearchProcessCharacterEx (
7447   SeqSearchPtr tbl,
7448   Char ch,
7449   Int4 length
7450 )
7451 
7452 {
7453   Int2         curr, next;
7454   SeqMatchPtr  mp;
7455   Int4         patLen;
7456   SeqStatePtr  sp;
7457 
7458   if (tbl == NULL) return;
7459   if (! tbl->primed) {
7460     SeqSearchPrimeStateArray (tbl);
7461   }
7462   if (tbl->stateArray == NULL) return;
7463 
7464   curr = tbl->currentState;
7465 
7466   /* loop through failure states until match or back to state 0 */
7467 
7468   while ((next = SeqSearchGotoState (tbl, curr, ch, TRUE)) == FAIL_STATE) {
7469     curr = SeqSearchFailState (tbl, curr);
7470   }
7471 
7472   tbl->currentState = next;
7473   (tbl->currentPos)++;
7474 
7475   /*
7476      States while traversing search sequence containing EcoRI site (GAATTC)
7477                                                         ------
7478      AAGCTTGCATGCCTGCAGGTCGACTCTAGAGGATCCCCGGGTACCGAGCTCGAATTCGAGCTCGGTACCCGGGGATCCTC
7479      00100010001000100110012000001211200000111000012100012345612100011000001111200000
7480                                                         *
7481   */
7482 
7483   /* report any matches at current state to callback function */
7484 
7485   sp = &(tbl->stateArray [(int) next]);
7486   for (mp = sp->matches; mp != NULL; mp = mp->next) {
7487 
7488     /* for circular sequences, prevent multiple reports of patterns */
7489 
7490     patLen = StringLen (mp->pattern);
7491     if (tbl->currentPos - patLen < length) {
7492       tbl->matchproc (tbl->currentPos - patLen,
7493                       mp->name, mp->pattern, mp->cutSite,
7494                       mp->strand, tbl->userdata);
7495     }
7496   }
7497 }
7498 
/*
   SeqSearchProcessCharacter - public entry point for feeding a single
   character to the machine.  Passing INT4_MAX as the length means that
   SeqSearchProcessCharacterEx's start-position filter rejects nothing
   an Int4 position can hold.
*/

NLM_EXTERN void SeqSearchProcessCharacter (
  SeqSearchPtr tbl,
  Char ch
)

{
  SeqSearchProcessCharacterEx (tbl, ch, INT4_MAX);
}
7507 
7508 /* convenience function calls SeqSearchProcessCharacter for entire nucleotide bioseq */
7509 
/* context handed to SearchSeqProc through the streaming userdata pointer */

typedef struct seqsrchdata {
  SeqSearchPtr  tbl;     /* machine that receives each streamed character */
  Int4          length;  /* bioseq length, forwarded as the match start limit */
} SeqSrchData, PNTR SeqSrchPtr;
7514 
SearchSeqProc(CharPtr sequence,Pointer userdata)7515 static void LIBCALLBACK SearchSeqProc (
7516   CharPtr sequence,
7517   Pointer userdata
7518 )
7519 
7520 {
7521   Char        ch;
7522   CharPtr     ptr;
7523   SeqSrchPtr  ssp;
7524 
7525   ssp = (SeqSrchPtr) userdata;
7526 
7527   ptr = sequence;
7528   ch = *ptr;
7529   while (ch != '\0') {
7530     ch = TO_UPPER (ch);
7531     SeqSearchProcessCharacterEx (ssp->tbl, ch, ssp->length);
7532     ptr++;
7533     ch = *ptr;
7534   }
7535 }
7536 
SeqSearchProcessBioseq(SeqSearchPtr tbl,BioseqPtr bsp)7537 NLM_EXTERN void SeqSearchProcessBioseq (
7538   SeqSearchPtr tbl,
7539   BioseqPtr bsp
7540 )
7541 
7542 {
7543   SeqSrchData  ssd;
7544 
7545   SeqSearchReset (tbl);
7546 
7547   if (tbl == NULL || bsp == NULL) return;
7548 
7549   if (! ISA_na (bsp->mol)) return;
7550 
7551   ssd.tbl = tbl;
7552   ssd.length = bsp->length;
7553 
7554   SeqPortStream (bsp, STREAM_EXPAND_GAPS, (Pointer) &ssd, SearchSeqProc);
7555 
7556   /* for circular molecules, check for patterns spanning origin */
7557 
7558   if (bsp->topology == TOPOLOGY_CIRCULAR && bsp->length > tbl->maxPatLen) {
7559     SeqPortStreamInt (bsp, 0, tbl->maxPatLen, Seq_strand_plus, STREAM_EXPAND_GAPS, (Pointer) &ssd, SearchSeqProc);
7560   }
7561 
7562   SeqSearchReset (tbl);
7563 }
7564 
7565 /* reset state and position to allow another run with same search patterns */
7566 
SeqSearchReset(SeqSearchPtr tbl)7567 NLM_EXTERN void SeqSearchReset (
7568   SeqSearchPtr tbl
7569 )
7570 
7571 {
7572   if (tbl == NULL) return;
7573 
7574   tbl->currentState = 0;
7575   tbl->currentPos = 0;
7576 }
7577 
7578 /* clean up sequence search finite state machine allocated memory */
7579 
/*
   FreePatternList - free every record of a pattern list together with
   its name and pattern strings.  Always returns NULL.
*/

static SeqPatternPtr FreePatternList (
  SeqPatternPtr pp
)

{
  SeqPatternPtr  following;

  for ( ; pp != NULL; pp = following) {
    following = pp->next;
    pp->next = NULL;
    MemFree (pp->name);
    MemFree (pp->pattern);
    MemFree (pp);
  }

  return NULL;
}
7598 
/*
   FreeMatchList - free every record of a match list together with its
   name and pattern strings.  Always returns NULL.
*/

static SeqMatchPtr FreeMatchList (
  SeqMatchPtr mp
)

{
  SeqMatchPtr  following;

  for ( ; mp != NULL; mp = following) {
    following = mp->next;
    mp->next = NULL;
    MemFree (mp->name);
    MemFree (mp->pattern);
    MemFree (mp);
  }

  return NULL;
}
7617 
SeqSearchFree(SeqSearchPtr tbl)7618 NLM_EXTERN SeqSearchPtr SeqSearchFree (
7619   SeqSearchPtr tbl
7620 )
7621 
7622 {
7623   Int2  maxState, state;
7624 
7625   if (tbl == NULL) return NULL;
7626 
7627   maxState = tbl->maxState;
7628 
7629   for (state = 0; state < maxState; state++) {
7630     FreeMatchList (tbl->stateArray [state].matches);
7631   }
7632 
7633   FreePatternList (tbl->patternList);
7634 
7635   MemFree (tbl->stateArray);
7636   return MemFree (tbl);
7637 }
7638 
7639 /*
7640 
7641 static CharPtr testseq =
7642  "AAGCTTGCATGCCTGCAGGTCGACTCTAGAGGATCCGRATYCCCGGGTACCGAGCTATYCCGAATTCGAGCTCGGTACCCGGGGATCCTCGANTTCATTCGAPTTCCAGTC";
7643 
7644 static void MatchProc (Int4 position, CharPtr name, CharPtr pattern,
7645                        Int2 cutSite, Uint1 strand, Pointer userdata)
7646 
7647 {
7648   Message (MSG_POST, "Name '%s', Pattern '%s', Position %ld",
7649            name, pattern, (long) position);
7650 }
7651 
7652 
7653 extern void TestSeqSearch (void);
7654 extern void TestSeqSearch (void)
7655 
7656 {
7657   Char          ch;
7658   CharPtr       ptr;
7659   SeqSearchPtr  tbl;
7660 
7661   tbl = SeqSearchNew (MatchProc, NULL);
7662   if (tbl == NULL) return;
7663 
7664   SeqSearchAddNucleotidePattern (tbl, "AmbiG", "GRATYC", 1, SEQ_SEARCH_EXPAND_PATTERN);
7665   SeqSearchAddNucleotidePattern (tbl, "ExacT", "GRAT", 1, SEQ_SEARCH_JUST_TOP_STRAND);
7666 
7667   for (ptr = testseq, ch = *ptr; ch != '\0'; ptr++, ch = *ptr) {
7668     SeqSearchProcessCharacter (tbl, ch);
7669   }
7670 
7671   SeqSearchFree (tbl);
7672 }
7673 
7674 */
7675 
7676 /*****************************************************************************
7677 *
7678 *   ProtSearch
7679 *       Initializes ProtSearch finite state machine for sequence searching
7680 *       Based on Practical Algorithms for Programmers by Binstock and Rex
7681 *
7682 *****************************************************************************/
7683 
7684 /* general purpose protein sequence search finite state machine */
7685 
/*
   one entry of the singly linked pattern list; ProtSearchPrimeStateArray
   walks the list and enters each name/pattern pair into the state table
*/

typedef struct protpattern {
  CharPtr           name;     /* label passed along with the pattern */
  CharPtr           pattern;  /* residue string to search for */
  struct protpattern * next;  /* following entry, NULL at end of list */
} ProtPatternItem, PNTR ProtPatternPtr;
7691 
/*
   one match attached to a state; records are created by
   ProtSearchAddOutput and hold StringSave copies of both strings
*/

typedef struct protmatch {
  CharPtr          name;     /* label of the matched pattern */
  CharPtr          pattern;  /* text of the matched pattern */
  struct protmatch * next;   /* following match on the same state */
} ProtMatchItem, PNTR ProtMatchPtr;
7697 
/* one state of the protein search machine */

typedef struct protstate {
  Int2          onfailure;         /* state to try when no transition fits */
  Int2          transitions [27];  /* order is ABCDEFGHIJKLMNOPQRSTUVWXYZ; 0 means no transition */
                                   /* NOTE(review): ProtSearchNew only maps to slots 0 through 25, */
                                   /* so the 27th slot looks unused here - confirm before resizing */
  ProtMatchPtr  matches;           /* patterns that end at this state */
} ProtStateItem, PNTR ProtStatePtr;
7703 
/* protein search machine; allocated and initialized by ProtSearchNew */

typedef struct ProtSearch {
  ProtStatePtr         stateArray;         /* state table, NULL until primed */
  ProtPatternPtr       patternList;        /* patterns waiting to be entered */
  Int4                 maxPatLen;          /* starts at 0 */
  Int2                 maxState;           /* number of entries in stateArray */
  Int2                 highState;          /* highest state number in use */
  Int2                 currentState;       /* reset to 0 when the table is primed */
  Int4                 currentPos;         /* reset to 0 when the table is primed */
  Boolean              primed;             /* TRUE once stateArray has been built */
  ProtSearchMatchProc  matchproc;          /* callback given to ProtSearchNew */
  Pointer              userdata;           /* opaque pointer given to ProtSearchNew */
  Uint1                letterToIdx [256];  /* character -> transition slot */
} ProtSearchData;
7717 
/* value returned by ProtSearchGotoState when no transition exists */
#define FAIL_STATE -1
7719 
7720 /* returns next state given current state and next character */
7721 
ProtSearchGotoState(ProtSearchPtr tbl,Int2 state,Char ch,Boolean zeroFailureReturnsZero)7722 static Int2 ProtSearchGotoState (
7723   ProtSearchPtr tbl,
7724   Int2 state,
7725   Char ch,
7726   Boolean zeroFailureReturnsZero
7727 )
7728 
7729 {
7730   int           index;
7731   Int2          newstate;
7732   ProtStatePtr  sp;
7733 
7734   sp = &(tbl->stateArray [(int) state]);
7735   index = tbl->letterToIdx [(int) (Uint1) ch];
7736   newstate = sp->transitions [index];
7737 
7738   if (newstate != 0) return newstate;
7739 
7740   if (state == 0 && zeroFailureReturnsZero) return 0;
7741 
7742   return FAIL_STATE;
7743 }
7744 
7745 /* returns state to check next if current pattern broken */
7746 
ProtSearchFailState(ProtSearchPtr tbl,Int2 state)7747 static Int2 ProtSearchFailState (
7748   ProtSearchPtr tbl,
7749   Int2 state
7750 )
7751 
7752 {
7753   ProtStatePtr  sp;
7754 
7755   sp = &(tbl->stateArray [(int) state]);
7756   return sp->onfailure;
7757 }
7758 
7759 /* add a single character transition from one state to another */
7760 
ProtSearchAddTransition(ProtSearchPtr tbl,Int2 oldState,Char ch,Int2 newState)7761 static void ProtSearchAddTransition (
7762   ProtSearchPtr tbl,
7763   Int2 oldState,
7764   Char ch,
7765   Int2 newState
7766 )
7767 
7768 {
7769   int           index;
7770   ProtStatePtr  sp;
7771 
7772   sp = &(tbl->stateArray [(int) oldState]);
7773   index = tbl->letterToIdx [(int) (Uint1) ch];
7774   sp->transitions [index] = newState;
7775 }
7776 
7777 /* given state should report a successful match */
7778 
ProtSearchAddOutput(ProtSearchPtr tbl,Int2 state,CharPtr name,CharPtr pattern)7779 static void ProtSearchAddOutput (
7780   ProtSearchPtr tbl,
7781   Int2 state,
7782   CharPtr name,
7783   CharPtr pattern
7784 )
7785 
7786 {
7787   ProtMatchPtr  mp;
7788   ProtStatePtr  sp;
7789 
7790   sp = &(tbl->stateArray [(int) state]);
7791   for (mp = sp->matches; mp != NULL; mp = mp->next) {
7792     if (StringCmp (name, mp->name) == 0) return;
7793   }
7794 
7795   mp = (ProtMatchPtr) MemNew (sizeof (ProtMatchItem));
7796   if (mp == NULL) return;
7797 
7798   mp->name = StringSave (name);
7799   mp->pattern = StringSave (pattern);
7800 
7801   mp->next = sp->matches;
7802   sp->matches = mp;
7803 }
7804 
7805 /* add one protein sequence pattern to the finite state machine */
7806 
ProtSearchEnterProtWord(ProtSearchPtr tbl,Int2 highState,Int2 maxState,CharPtr name,CharPtr pattern)7807 static Int2 ProtSearchEnterProtWord (
7808   ProtSearchPtr tbl,
7809   Int2 highState,
7810   Int2 maxState,
7811   CharPtr name,
7812   CharPtr pattern
7813 )
7814 
7815 {
7816   Char     ch;
7817   Int2     next, patLen, state;
7818   CharPtr  ptr;
7819 
7820   state = 0;
7821   next = 0;
7822 
7823   patLen = StringLen (pattern);
7824 
7825   /* try to overlay beginning of pattern onto existing table */
7826 
7827   for (ptr = pattern, ch = *ptr; ch != '\0'; ptr++, ch = *ptr) {
7828     next = ProtSearchGotoState (tbl, state, ch, FALSE);
7829     if (next == FAIL_STATE) break;
7830     state = next;
7831   }
7832 
7833   /* now create new states for remaining characters in pattern */
7834 
7835   for ( ; ch != '\0'; ptr++, ch = *ptr) {
7836     highState++;
7837     ProtSearchAddTransition (tbl, state, ch, highState);
7838     state = highState;
7839   }
7840 
7841   /* at end of pattern record match information */
7842 
7843   ProtSearchAddOutput (tbl, state, name, pattern);
7844 
7845   return highState;
7846 }
7847 
7848 /* FIFO queue and other functions for building failure states */
7849 
/*
   ProtSearchQueueAdd - append 'val' to the chain that starts at qbeg.

   The queue is threaded through the array itself: queue [x] holds the
   entry that follows x, and 0 ends the chain.  The chain is walked from
   qbeg to its last entry, val is linked there, and val becomes the new
   end of the chain.
*/

static void ProtSearchQueueAdd (
  Int2Ptr queue,
  Int2 qbeg,
  Int2 val
)

{
  Int2  tail;

  tail = qbeg;
  while (queue [tail] != 0) {
    tail = queue [tail];
  }

  queue [tail] = val;
  queue [val] = 0;
}
7868 
/*
   ProtSearchFindFail - set the failure link of newState.

   Starting at 'state', failure links are followed until some state has a
   transition on 'ch' (state 0 always answers, because the lookup is made
   with zeroFailureReturnsZero set).  The state found becomes newState's
   onfailure link, and every match recorded there is also recorded on
   newState.
*/

static void ProtSearchFindFail (
  ProtSearchPtr tbl,
  Int2 state,
  Int2 newState,
  Char ch
)

{
  ProtMatchPtr  inherited;
  Int2          target;

  /* traverse existing failure path */

  for (;;) {
    target = ProtSearchGotoState (tbl, state, ch, TRUE);
    if (target != FAIL_STATE) break;
    state = ProtSearchFailState (tbl, state);
  }

  /* add new failure state */

  tbl->stateArray [(int) newState].onfailure = target;

  /* add matches of substring at new state */

  inherited = tbl->stateArray [(int) target].matches;
  while (inherited != NULL) {
    ProtSearchAddOutput (tbl, newState, inherited->name, inherited->pattern);
    inherited = inherited->next;
  }
}
7899 
/*
   ProtSearchComputeFail - fill in the onfailure link of every state.

   States are visited breadth first.  'queue' is scratch space indexed by
   state number: queue [x] holds the state queued after x and 0 ends the
   chain (see ProtSearchQueueAdd).  States reached directly from state 0
   fail back to state 0; for each deeper state the link is derived from
   its parent's link by ProtSearchFindFail.
*/

static void ProtSearchComputeFail (
  ProtSearchPtr tbl,
  Int2Ptr queue
)

{
  CharPtr       charToProt = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
  Char          ch;
  Int2          qbeg, r, s, state;
  int           index;
  ProtStatePtr  sp;

  qbeg = 0;
  queue [0] = 0;

  /* queue up states reached directly from state 0 (depth 1) */

  sp = &(tbl->stateArray [0]);
  sp->onfailure = 0;
  for (index = 0; index < 26; index++) {
    s = sp->transitions [index];
    if (s == 0) continue;

    /* the depth 1 state itself, not state 0, is what fails back to 0 */

    tbl->stateArray [(int) s].onfailure = 0;
    ProtSearchQueueAdd (queue, qbeg, s);
  }

  while (queue [qbeg] != 0) {
    r = queue [qbeg];
    qbeg = r;

    /* depth 1 states beget depth 2 states, etc. */

    sp = &(tbl->stateArray [r]);
    for (index = 0; index < 26; index++) {
      ch = charToProt [index];
      s = sp->transitions [index];
      if (s == 0) continue;
      ProtSearchQueueAdd (queue, qbeg, s);

      /* derive the link of s from the link already known for its parent r */

      state = ProtSearchFailState (tbl, r);
      ProtSearchFindFail (tbl, state, s, ch);
    }
  }
}
7943 
7944 /* on first character, populate state transition table */
7945 
ProtSearchPrimeStateArray(ProtSearchPtr tbl)7946 static void ProtSearchPrimeStateArray (
7947   ProtSearchPtr tbl
7948 )
7949 
7950 {
7951   Int2            highState, maxState;
7952   ProtPatternPtr  pp;
7953   Int2Ptr         queue;
7954   ProtStatePtr    stateArray;
7955 
7956   if (tbl == NULL || tbl->primed || tbl->patternList == NULL) return;
7957 
7958   for (maxState = 1, pp = tbl->patternList; pp != NULL; pp = pp->next) {
7959     maxState += StringLen (pp->pattern);
7960   }
7961 
7962   if (maxState > 4000) {
7963     Message (MSG_POST, "FiniteStateSearch cannot handle %d states", (int) maxState);
7964     return;
7965   }
7966 
7967   stateArray = (ProtStatePtr) MemNew (sizeof (ProtStateItem) * (size_t) maxState);
7968   queue = (Int2Ptr) MemNew (sizeof (Int2) * maxState);
7969 
7970   if (stateArray == NULL || queue == NULL) {
7971     MemFree (stateArray);
7972     MemFree (queue);
7973     Message (MSG_POST, "SequenceSearch unable to allocate buffers");
7974     return;
7975   }
7976 
7977   tbl->stateArray = stateArray;
7978   tbl->maxState = maxState;
7979 
7980   for (highState = 0, pp = tbl->patternList; pp != NULL; pp = pp->next) {
7981     highState = ProtSearchEnterProtWord (tbl, highState, maxState, pp->name,
7982                                          pp->pattern);
7983   }
7984 
7985   ProtSearchComputeFail (tbl, queue);
7986 
7987   MemFree (queue);
7988 
7989   tbl->highState = highState;
7990   tbl->currentState = 0;
7991   tbl->currentPos = 0;
7992   tbl->primed = TRUE;
7993 }
7994 
7995 /* for testing, print summary of transition table */
7996 
7997 /*
7998 static void PrintProtSearchTable (
7999   ProtSearchPtr tbl,
8000   FILE *fp
8001 )
8002 
8003 {
8004   Int2         i;
8005   ProtMatchPtr  mp;
8006   ProtStatePtr  sp;
8007   Int2         state;
8008 
8009   if (tbl == NULL || fp == NULL) return;
8010   if (! tbl->primed) {
8011     ProtSearchPrimeStateArray (tbl);
8012   }
8013   if (tbl->stateArray == NULL) return;
8014   if (tbl->highState > 99) return;
8015 
8016   fprintf (fp, "State Fail A  B  C  D  E  F  G  H  I  J  K  L  M  N  O  P  Q  R  S  T  U  V  W  X  Y  Z\n");
8017 
8018   for (state = 0; state <= tbl->highState; state++) {
8019     sp = &(tbl->stateArray [(int) state]);
8020     fprintf (fp, " %3d  %3d", (int) state, (int) sp->onfailure);
8021 
8022     for (i = 0; i < 26; i++) {
8023       if (sp->transitions [i] != 0) {
8024         fprintf (fp, "%3d", (int) sp->transitions [i]);
8025       } else {
8026         fprintf (fp, "   ");
8027       }
8028     }
8029 
8030     for (mp = sp->matches; mp != NULL; mp = mp->next) {
8031       fprintf (fp, " %s", mp->name);
8032     }
8033 
8034     fprintf (fp, "\n");
8035   }
8036 }
8037 */
8038 
8039 /* create empty protein sequence search finite state machine */
8040 
ProtSearchNew(ProtSearchMatchProc matchproc,Pointer userdata)8041 NLM_EXTERN ProtSearchPtr ProtSearchNew (
8042   ProtSearchMatchProc matchproc,
8043   Pointer userdata
8044 )
8045 
8046 {
8047   CharPtr        charToProt = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
8048   Char           ch;
8049   Int2           i;
8050   ProtSearchPtr  tbl;
8051 
8052   if (matchproc == NULL) return NULL;
8053   tbl = (ProtSearchPtr) MemNew (sizeof (ProtSearchData));
8054   if (tbl == NULL) return NULL;
8055 
8056   tbl->stateArray = NULL;
8057   tbl->patternList = NULL;
8058   tbl->maxPatLen = 0;
8059   tbl->maxState = 0;
8060   tbl->highState = 0;
8061   tbl->currentState = 0;
8062   tbl->currentPos = 0;
8063   tbl->matchproc = matchproc;
8064   tbl->userdata = userdata;
8065   tbl->primed = FALSE;
8066 
8067   /* initialize table to convert character to transition index from 0 (A) to 25 (Z) */
8068 
8069   for (i = 0; i < 256; i++) {
8070     tbl->letterToIdx [i] = 23;
8071   }
8072   for (i = 0; i < 26; i++) {
8073     ch = charToProt [i];
8074     tbl->letterToIdx [(int) ch] = i;
8075     ch = TO_LOWER (ch);
8076     tbl->letterToIdx [(int) ch] = i;
8077   }
8078 
8079   return tbl;
8080 }
8081 
/* table to expand ambiguity letter to all matching protein letters */

/*
 * Indexed by (letter - 'A').  Unambiguous letters expand to themselves;
 * B, J and Z expand to two letters each, and X expands to every letter
 * except the ambiguity codes B, J, X and Z.  ExpandProtPattern walks the
 * selected string one character at a time.
 */

static CharPtr  protExpandList [26] = {
  "A",
  "DN",                      /* B -> D or N */
  "C",
  "D",
  "E",
  "F",
  "G",
  "H",
  "I",
  "IL",                      /* J -> I or L */
  "K",
  "L",
  "M",
  "N",
  "O",
  "P",
  "Q",
  "R",
  "S",
  "T",
  "U",
  "V",
  "W",
  "ACDEFGHIKLMNOPQRSTUVWY",  /* X -> any letter other than B, J, X, Z */
  "Y",
  "EQ"                       /* Z -> E or Q */
};
8112 
8113 /* recursive function to expand and store appropriate individual patterns */
8114 
StoreProtPattern(ProtSearchPtr tbl,CharPtr name,CharPtr str)8115 static void StoreProtPattern (
8116   ProtSearchPtr tbl,
8117   CharPtr name,
8118   CharPtr str
8119 )
8120 
8121 {
8122   Int4            patLen;
8123   ProtPatternPtr  pp;
8124 
8125   pp = (ProtPatternPtr) MemNew (sizeof (ProtPatternItem));
8126   if (pp == NULL) return;
8127 
8128   pp->name = StringSave (name);
8129   pp->pattern = StringSave (str);
8130 
8131   pp->next = tbl->patternList;
8132   tbl->patternList = pp;
8133   patLen = StringLen (str);
8134   if (patLen > tbl->maxPatLen) {
8135     tbl->maxPatLen = patLen;
8136   }
8137 }
8138 
ExpandProtPattern(ProtSearchPtr tbl,CharPtr name,CharPtr pattern,size_t patLen,CharPtr str,Int2 position,SearchFlgType flags)8139 static void ExpandProtPattern (
8140   ProtSearchPtr tbl,
8141   CharPtr name,
8142   CharPtr pattern,
8143   size_t patLen,
8144   CharPtr str,
8145   Int2 position,
8146   SearchFlgType flags
8147 )
8148 
8149 {
8150   Char     ch, lttr;
8151   Int2     idx;
8152   CharPtr  ptr;
8153 
8154   if (position < patLen) {
8155 
8156     if ((Boolean) ((flags & SEQ_SEARCH_EXPAND_PATTERN) != 0)) {
8157 
8158       /* given ambiguity letter, get index into protExpandList */
8159 
8160       ch = pattern [position];
8161       idx = ch - 'A';
8162       ptr = protExpandList [idx];
8163 
8164       /* put every unambiguous amino acid letter at current
8165          position, recurse for next position */
8166 
8167       for (lttr = *ptr; lttr != '\0'; ptr++, lttr = *ptr) {
8168         str [position] = lttr;
8169         ExpandProtPattern (tbl, name, pattern, patLen, str, position + 1, flags);
8170       }
8171 
8172     } else {
8173 
8174       /* if matching ambiguity characters in sequence, do not expand each base */
8175 
8176       str [position] = pattern [position];
8177       ExpandProtPattern (tbl, name, pattern, patLen, str, position + 1, flags);
8178     }
8179 
8180     /* do not run into pattern storage section of code located below */
8181 
8182     return;
8183   }
8184 
8185   /* when position reaches pattern length, store one fully expanded string */
8186 
8187   StoreProtPattern (tbl, name, str);
8188 
8189   if ((Boolean) ((flags & SEQ_SEARCH_ALLOW_MISMATCH) == 0)) return;
8190 
8191   for (idx = 0; idx < patLen; idx++) {
8192     ch = str [idx];
8193 
8194     /* put X at every position if a single mismatch is allowed */
8195 
8196     str [idx] = 'X';
8197 
8198     StoreProtPattern (tbl, name, str);
8199 
8200     /* now restore proper character, go on to put X in next position */
8201 
8202     str [idx] = ch;
8203   }
8204 }
8205 
8206 /* add protein to sequence search finite state machine */
8207 
ProtSearchAddProteinPattern(ProtSearchPtr tbl,CharPtr name,CharPtr pattern,SearchFlgType flags)8208 NLM_EXTERN void ProtSearchAddProteinPattern (
8209   ProtSearchPtr tbl,
8210   CharPtr name,
8211   CharPtr pattern,
8212   SearchFlgType flags
8213 )
8214 
8215 {
8216   Char     ch, pat [128], str [128];
8217   Int2     i;
8218   size_t   len;
8219 
8220   if (tbl == NULL || StringHasNoText (name) || StringHasNoText (pattern)) return;
8221 
8222   StringNCpy_0 (pat, pattern, sizeof (pat));
8223   TrimSpacesAroundString (pat);
8224 
8225   len = StringLen (pat);
8226 
8227   /* upper case working copy of pattern string */
8228 
8229   for (i = 0; i < len; i++) {
8230     ch = pat [i];
8231     pat [i] = TO_UPPER (ch);
8232   }
8233 
8234   /* record expansion of entered pattern */
8235 
8236   MemSet ((Pointer) str, 0, sizeof (str));
8237   ExpandProtPattern (tbl, name, pat, len, str, 0, flags);
8238 }
8239 
8240 /* program passes each character in turn to finite state machine */
8241 
ProtSearchProcessCharacterEx(ProtSearchPtr tbl,Char ch,Int4 length)8242 static void ProtSearchProcessCharacterEx (
8243   ProtSearchPtr tbl,
8244   Char ch,
8245   Int4 length
8246 )
8247 
8248 {
8249   Int2          curr, next;
8250   ProtMatchPtr  mp;
8251   Int4          patLen;
8252   ProtStatePtr  sp;
8253 
8254   if (tbl == NULL) return;
8255   if (! tbl->primed) {
8256     ProtSearchPrimeStateArray (tbl);
8257   }
8258   if (tbl->stateArray == NULL) return;
8259 
8260   curr = tbl->currentState;
8261 
8262   /* loop through failure states until match or back to state 0 */
8263 
8264   while ((next = ProtSearchGotoState (tbl, curr, ch, TRUE)) == FAIL_STATE) {
8265     curr = ProtSearchFailState (tbl, curr);
8266   }
8267 
8268   tbl->currentState = next;
8269   (tbl->currentPos)++;
8270 
8271   /* report any matches at current state to callback function */
8272 
8273   sp = &(tbl->stateArray [(int) next]);
8274   for (mp = sp->matches; mp != NULL; mp = mp->next) {
8275 
8276     /* for circular sequences, prevent multiple reports of patterns */
8277 
8278     patLen = StringLen (mp->pattern);
8279     if (tbl->currentPos - patLen < length) {
8280       tbl->matchproc (tbl->currentPos - patLen,
8281                       mp->name, mp->pattern, tbl->userdata);
8282     }
8283   }
8284 }
8285 
ProtSearchProcessCharacter(ProtSearchPtr tbl,Char ch)8286 NLM_EXTERN void ProtSearchProcessCharacter (
8287   ProtSearchPtr tbl,
8288   Char ch
8289 )
8290 
8291 {
8292   ProtSearchProcessCharacterEx (tbl, ch, INT4_MAX);
8293 }
8294 
8295 /* convenience function calls ProtSearchProcessCharacter for entire protein bioseq */
8296 
/* context handed to SearchProtProc through the SeqPortStream userdata pointer */

typedef struct protsrchdata {
  ProtSearchPtr  tbl;     /* search table that receives each character */
  Int4          length;   /* bioseq length, used as the match start limit */
} ProtSrchData, PNTR ProtSrchPtr;
8301 
SearchProtProc(CharPtr sequence,Pointer userdata)8302 static void LIBCALLBACK SearchProtProc (
8303   CharPtr sequence,
8304   Pointer userdata
8305 )
8306 
8307 {
8308   Char        ch;
8309   CharPtr     ptr;
8310   ProtSrchPtr  ssp;
8311 
8312   ssp = (ProtSrchPtr) userdata;
8313 
8314   ptr = sequence;
8315   ch = *ptr;
8316   while (ch != '\0') {
8317     ch = TO_UPPER (ch);
8318     ProtSearchProcessCharacterEx (ssp->tbl, ch, ssp->length);
8319     ptr++;
8320     ch = *ptr;
8321   }
8322 }
8323 
ProtSearchProcessBioseq(ProtSearchPtr tbl,BioseqPtr bsp)8324 NLM_EXTERN void ProtSearchProcessBioseq (
8325   ProtSearchPtr tbl,
8326   BioseqPtr bsp
8327 )
8328 
8329 {
8330   ProtSrchData  ssd;
8331 
8332   ProtSearchReset (tbl);
8333 
8334   if (tbl == NULL || bsp == NULL) return;
8335 
8336   if (! ISA_aa (bsp->mol)) return;
8337 
8338   ssd.tbl = tbl;
8339   ssd.length = bsp->length;
8340 
8341   SeqPortStream (bsp, STREAM_EXPAND_GAPS, (Pointer) &ssd, SearchProtProc);
8342 
8343   ProtSearchReset (tbl);
8344 }
8345 
8346 /* reset state and position to allow another run with same search patterns */
8347 
ProtSearchReset(ProtSearchPtr tbl)8348 NLM_EXTERN void ProtSearchReset (
8349   ProtSearchPtr tbl
8350 )
8351 
8352 {
8353   if (tbl == NULL) return;
8354 
8355   tbl->currentState = 0;
8356   tbl->currentPos = 0;
8357 }
8358 
8359 /* clean up sequence search finite state machine allocated memory */
8360 
FreeProtPatternList(ProtPatternPtr pp)8361 static ProtPatternPtr FreeProtPatternList (
8362   ProtPatternPtr pp
8363 )
8364 
8365 {
8366   ProtPatternPtr  next;
8367 
8368   while (pp != NULL) {
8369     next = pp->next;
8370     pp->next = NULL;
8371     MemFree (pp->name);
8372     MemFree (pp->pattern);
8373     MemFree (pp);
8374     pp = next;
8375   }
8376 
8377   return NULL;
8378 }
8379 
FreeProtMatchList(ProtMatchPtr mp)8380 static ProtMatchPtr FreeProtMatchList (
8381   ProtMatchPtr mp
8382 )
8383 
8384 {
8385   ProtMatchPtr  next;
8386 
8387   while (mp != NULL) {
8388     next = mp->next;
8389     mp->next = NULL;
8390     MemFree (mp->name);
8391     MemFree (mp->pattern);
8392     MemFree (mp);
8393     mp = next;
8394   }
8395 
8396   return NULL;
8397 }
8398 
ProtSearchFree(ProtSearchPtr tbl)8399 NLM_EXTERN ProtSearchPtr ProtSearchFree (
8400   ProtSearchPtr tbl
8401 )
8402 
8403 {
8404   Int2  maxState, state;
8405 
8406   if (tbl == NULL) return NULL;
8407 
8408   maxState = tbl->maxState;
8409 
8410   for (state = 0; state < maxState; state++) {
8411     FreeProtMatchList (tbl->stateArray [state].matches);
8412   }
8413 
8414   FreeProtPatternList (tbl->patternList);
8415 
8416   MemFree (tbl->stateArray);
8417   return MemFree (tbl);
8418 }
8419 
8420 /*
8421 
8422 static CharPtr testseq =
8423  "MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN";
8424 
8425 static void MatchProc (Int4 position, CharPtr name, CharPtr pattern, Pointer userdata)
8426 
8427 {
8428   Message (MSG_POST, "Name '%s', Pattern '%s', Position %ld",
8429            name, pattern, (long) position);
8430 }
8431 
8432 
8433 extern void TestProtSearch (void);
8434 extern void TestProtSearch (void)
8435 
8436 {
8437   Char           ch;
8438   CharPtr        ptr;
8439   ProtSearchPtr  tbl;
8440 
8441   tbl = ProtSearchNew (MatchProc, NULL);
8442   if (tbl == NULL) return;
8443 
  ProtSearchAddProteinPattern (tbl, "AmbiG", "GRATYC", SEQ_SEARCH_EXPAND_PATTERN);
8445 
8446   for (ptr = testseq, ch = *ptr; ch != '\0'; ptr++, ch = *ptr) {
8447     ProtSearchProcessCharacter (tbl, ch);
8448   }
8449 
8450   ProtSearchFree (tbl);
8451 }
8452 
8453 */
8454 
8455 /*****************************************************************************
8456 *
8457 *  Convenience functions for genome processing use BioseqLockById to get sequence
8458 *  record (perhaps with phrap quality score graphs) so fetching from some network
8459 *  or local server must be enabled, or sequences must already be in memory.
8460 *
8461 *****************************************************************************/
8462 
GetSequenceByFeatureEx(SeqFeatPtr sfp,StreamFlgType flags)8463 NLM_EXTERN CharPtr GetSequenceByFeatureEx (SeqFeatPtr sfp, StreamFlgType flags)
8464 
8465 {
8466   Int4     len;
8467   CharPtr  str = NULL;
8468 
8469   if (sfp == NULL) return NULL;
8470   len = SeqLocLen (sfp->location);
8471   if (len > 0 && len < MAXALLOC) {
8472     str = MemNew (sizeof (Char) * (len + 2));
8473     if (str != NULL) {
8474       SeqPortStreamLoc (sfp->location, flags, (Pointer) str, NULL);
8475     }
8476   }
8477 
8478   return str;
8479 }
8480 
GetSequenceByLocationEx(SeqLocPtr slp,StreamFlgType flags)8481 NLM_EXTERN CharPtr GetSequenceByLocationEx (SeqLocPtr slp, StreamFlgType flags)
8482 
8483 {
8484   Int4     len;
8485   CharPtr  str = NULL;
8486 
8487   if (slp == NULL) return NULL;
8488   len = SeqLocLen (slp);
8489   if (len > 0 && len < MAXALLOC) {
8490     str = MemNew (sizeof (Char) * (len + 2));
8491     if (str != NULL) {
8492       SeqPortStreamLoc (slp, flags, (Pointer) str, NULL);
8493     }
8494   }
8495 
8496   return str;
8497 }
8498 
GetSequenceByBspEx(BioseqPtr bsp,StreamFlgType flags)8499 NLM_EXTERN CharPtr GetSequenceByBspEx (BioseqPtr bsp, StreamFlgType flags)
8500 
8501 {
8502   CharPtr  str = NULL;
8503 
8504   if (bsp == NULL || bsp->length >= MAXALLOC) return NULL;
8505 
8506   str = MemNew (sizeof (Char) * (bsp->length + 2));
8507   if (str == NULL) return NULL;
8508 
8509   SeqPortStream (bsp, flags, (Pointer) str, NULL);
8510 
8511   return str;
8512 }
8513 
GetSequenceByIdOrAccnDotVerEx(SeqIdPtr sip,CharPtr accession,Boolean is_na,StreamFlgType flags)8514 NLM_EXTERN CharPtr GetSequenceByIdOrAccnDotVerEx (SeqIdPtr sip, CharPtr accession, Boolean is_na, StreamFlgType flags)
8515 
8516 {
8517   BioseqPtr  bsp;
8518   SeqIdPtr   deleteme = NULL;
8519   CharPtr    str = NULL;
8520 
8521   if (sip == NULL) {
8522     if (StringHasNoText (accession)) return NULL;
8523     sip = SeqIdFromAccessionDotVersion (accession);
8524     deleteme = sip; /* allocated seqid, so must later delete it */
8525   }
8526   if (sip == NULL) return NULL;
8527 
8528   bsp = BioseqLockById (sip);
8529   SeqIdFree (deleteme);
8530   if (bsp == NULL) return NULL;
8531 
8532   if ((ISA_na (bsp->mol) && is_na) || (ISA_aa (bsp->mol) && (! is_na))) {
8533     if (bsp->length < MAXALLOC) {
8534       str = GetSequenceByBspEx (bsp, flags);
8535     }
8536   }
8537 
8538   BioseqUnlock (bsp);
8539   return str;
8540 }
8541 
GetSequenceByFeature(SeqFeatPtr sfp)8542 NLM_EXTERN CharPtr GetSequenceByFeature (SeqFeatPtr sfp)
8543 
8544 {
8545   return GetSequenceByFeatureEx (sfp, STREAM_EXPAND_GAPS);
8546 }
8547 
GetSequenceByLocation(SeqLocPtr slp)8548 NLM_EXTERN CharPtr GetSequenceByLocation (SeqLocPtr slp)
8549 
8550 {
8551   return GetSequenceByLocationEx (slp, STREAM_EXPAND_GAPS);
8552 }
8553 
GetSequenceByBsp(BioseqPtr bsp)8554 NLM_EXTERN CharPtr GetSequenceByBsp (BioseqPtr bsp)
8555 
8556 {
8557   return GetSequenceByBspEx (bsp, STREAM_EXPAND_GAPS);
8558 }
8559 
GetSequenceByIdOrAccnDotVer(SeqIdPtr sip,CharPtr accession,Boolean is_na)8560 NLM_EXTERN CharPtr GetSequenceByIdOrAccnDotVer (SeqIdPtr sip, CharPtr accession, Boolean is_na)
8561 
8562 {
8563   return GetSequenceByIdOrAccnDotVerEx (sip, accession, is_na, STREAM_EXPAND_GAPS);
8564 }
8565 
/* original convenience function now calls more advanced version that can get proteins */

/* looks the record up by accession text only (no Seq-id) and requires a nucleotide */

NLM_EXTERN CharPtr GetDNAbyAccessionDotVersion (CharPtr accession)

{
  return GetSequenceByIdOrAccnDotVer (NULL, accession, TRUE);
}
8573 
8574 
FixGapLength(BioseqPtr bsp,Int4 offset,Int4 diff)8575 static void FixGapLength (BioseqPtr bsp, Int4 offset, Int4 diff)
8576 {
8577   CharPtr     extra_ns;
8578   SeqLocPtr   slp;
8579   ValNodePtr  align_annot_list, vnp;
8580   SeqAnnotPtr sanp;
8581 
8582   if (bsp == NULL || bsp->id == NULL || diff == 0) return;
8583 
8584   align_annot_list = FindAlignSeqAnnotsForBioseq (bsp);
8585 
8586   if (diff > 0)
8587   {
8588     extra_ns = (CharPtr)MemNew ((diff + 1) * sizeof (Char));
8589     if (extra_ns != NULL)
8590     {
8591       MemSet (extra_ns, 'N', diff);
8592       extra_ns [diff] = 0;
8593       insertchar (extra_ns, offset, bsp->id, bsp->mol, FALSE);
8594     }
8595       slp = SeqLocIntNew (offset, offset + diff - 1, Seq_strand_plus, bsp->id);
8596     for (vnp = align_annot_list; vnp != NULL; vnp = vnp->next)
8597     {
8598       sanp = vnp->data.ptrvalue;
8599       if (sanp != NULL && sanp->type == 2)
8600       {
8601         sanp->data = SeqAlignInsertByLoc (slp, sanp->data);
8602       }
8603     }
8604     SeqLocFree (slp);
8605   }
8606   else
8607   {
8608       slp = SeqLocIntNew (offset, offset - diff - 1, Seq_strand_plus, bsp->id);
8609     SeqDeleteByLoc (slp, TRUE, FALSE);
8610 
8611     for (vnp = align_annot_list; vnp != NULL; vnp = vnp->next)
8612     {
8613       sanp = vnp->data.ptrvalue;
8614       if (sanp != NULL && sanp->type == 2)
8615       {
8616         sanp->data = SeqAlignDeleteByLoc (slp, sanp->data);
8617       }
8618     }
8619 
8620     SeqLocFree (slp);
8621   }
8622 }
8623 
AddSeqLitData(CharPtr str,ValNodePtr PNTR seq_ext)8624 static Int4 AddSeqLitData (CharPtr str, ValNodePtr PNTR seq_ext)
8625 {
8626   Int4      len;
8627   SeqLitPtr slp;
8628 
8629   if (StringHasNoText (str)) {
8630     return 0;
8631   }
8632   len = StringLen (str);
8633   slp = (SeqLitPtr) MemNew (sizeof (SeqLit));
8634   if (slp != NULL) {
8635     slp->length = len;
8636     ValNodeAddPointer (seq_ext, (Int2) 2, (Pointer) slp);
8637     slp->seq_data = (SeqDataPtr) BSNew (slp->length);
8638     slp->seq_data_type = Seq_code_iupacna;
8639     AddBasesToByteStore ((ByteStorePtr) slp->seq_data, str);
8640   }
8641   return len;
8642 }
8643 
IsGapUnknown(Int4 gap_len,Int4 unknown_gap_size,Int4 known_gap_size,Boolean unknown_greater_than_or_equal,Boolean known_greater_than_or_equal)8644 static Boolean IsGapUnknown (Int4 gap_len,
8645                              Int4 unknown_gap_size,
8646                              Int4 known_gap_size,
8647                              Boolean unknown_greater_than_or_equal,
8648                              Boolean known_greater_than_or_equal)
8649 {
8650   Boolean    make_unknown_size = FALSE;
8651 
8652   if (gap_len == 0)
8653   {
8654     make_unknown_size = FALSE;
8655   }
8656   else if (gap_len == unknown_gap_size)
8657   {
8658     make_unknown_size = TRUE;
8659   }
8660   else if (gap_len == known_gap_size)
8661   {
8662     make_unknown_size = FALSE;
8663   }
8664   else if (gap_len > unknown_gap_size && unknown_greater_than_or_equal)
8665   {
8666     if (!known_greater_than_or_equal)
8667     {
8668       make_unknown_size = TRUE;
8669     }
8670     else if (unknown_gap_size > known_gap_size)
8671     {
8672       make_unknown_size = TRUE;
8673     }
8674     else if (gap_len < known_gap_size)
8675     {
8676       make_unknown_size = TRUE;
8677     }
8678   }
8679   return make_unknown_size;
8680 }
8681 
8682 
AddGap(Int4 gap_len,Boolean make_unknown_size,BioseqPtr bsp,Int4 len,ValNodePtr PNTR seq_ext)8683 static Int4 AddGap(Int4 gap_len,
8684                   Boolean    make_unknown_size,
8685                    BioseqPtr bsp,
8686                    Int4      len,
8687                    ValNodePtr PNTR seq_ext)
8688 {
8689   Int4       added_len = 0;
8690   SeqLitPtr  slp;
8691   IntFuzzPtr ifp;
8692 
8693   if (gap_len > 0) {
8694     slp = (SeqLitPtr) MemNew (sizeof (SeqLit));
8695     if (slp != NULL) {
8696       slp->length = gap_len;
8697       ValNodeAddPointer (seq_ext, (Int2) 2, (Pointer) slp);
8698       if (make_unknown_size) {
8699         ifp = IntFuzzNew ();
8700         ifp->choice = 4;
8701         slp->fuzz = ifp;
8702         if (slp->length != 100) {
8703           FixGapLength (bsp, len, 100 - slp->length);
8704              slp->length = 100;
8705         }
8706       }
8707       added_len += slp->length;
8708     }
8709   }
8710   return added_len;
8711 }
8712 
8713 
NeedToConvert(CharPtr bases,Boolean unknown_greater_than_or_equal,Boolean known_greater_than_or_equal,Int4 unknown_gap_size,Int4 known_gap_size)8714 static Boolean NeedToConvert
8715 (CharPtr  bases,
8716  Boolean  unknown_greater_than_or_equal,
8717  Boolean     known_greater_than_or_equal,
8718  Int4        unknown_gap_size,
8719  Int4        known_gap_size)
8720 {
8721   Int4        gap_len;
8722   CharPtr     cp;
8723 
8724   if (StringHasNoText(bases)) {
8725       return FALSE;
8726   }
8727   cp = bases;
8728   while (*cp != '\0') {
8729 
8730     gap_len = StringSpn (cp, "N");
8731     if (gap_len > 0 ) {
8732       if ((gap_len == unknown_gap_size
8733             || (gap_len > unknown_gap_size && unknown_greater_than_or_equal)
8734             || gap_len == known_gap_size
8735             || (gap_len > known_gap_size && known_greater_than_or_equal))) {
8736         return TRUE;
8737       } else {
8738         cp += gap_len;
8739       }
8740     } else {
8741       gap_len = StringCSpn (cp, "N");
8742       cp += gap_len;
8743     }
8744   }
8745   return FALSE;
8746 }
8747 
8748 /*****************************************************************************
8749 *
8750 *   ConvertNsToGaps
8751 *       Assumes string of Ns means a gap of known length
8752 *
8753 *****************************************************************************/
8754 
ConvertNsToGaps(BioseqPtr bsp,Pointer userdata)8755 NLM_EXTERN void ConvertNsToGaps (
8756   BioseqPtr bsp,
8757   Pointer userdata
8758 )
8759 
8760 {
8761   CharPtr     bases, str, txt;
8762   Char        ch;
8763   Int4        len;
8764   ValNodePtr  seq_ext;
8765   Boolean     unknown_greater_than_or_equal = FALSE;
8766   Boolean     known_greater_than_or_equal = FALSE;
8767   Int4Ptr     gap_sizes;
8768   Int4        unknown_gap_size = 0;
8769   Int4        known_gap_size = 0;
8770   Int4        gap_len;
8771   Boolean     make_unknown_size;
8772 
8773   if (bsp == NULL || bsp->repr != Seq_repr_raw || ISA_aa (bsp->mol)) return;
8774   if (userdata == NULL)
8775   {
8776     known_greater_than_or_equal = TRUE;
8777   }
8778   else
8779   {
8780     gap_sizes = (Int4Ptr) userdata;
8781     unknown_gap_size = gap_sizes[0];
8782     known_gap_size = gap_sizes[1];
8783     if (unknown_gap_size < 0)
8784     {
8785       unknown_greater_than_or_equal = TRUE;
8786       unknown_gap_size = 0 - unknown_gap_size;
8787     }
8788     if (known_gap_size < 0)
8789     {
8790       known_greater_than_or_equal = TRUE;
8791       known_gap_size = 0 - known_gap_size;
8792     }
8793   }
8794 
8795   bases = GetSequenceByBsp (bsp);
8796   if (bases == NULL) return;
8797 
8798   if (!NeedToConvert(bases, unknown_greater_than_or_equal, known_greater_than_or_equal, unknown_gap_size, known_gap_size)) {
8799     MemFree (bases);
8800     return;
8801   }
8802 
8803   seq_ext = NULL;
8804   len = 0;
8805 
8806   txt = bases;
8807   str = txt;
8808   ch = *txt;
8809 
8810   while (*str != '\0') {
8811 
8812     gap_len = StringSpn (str, "N");
8813     if (gap_len > 0 ) {
8814       if ((gap_len == unknown_gap_size
8815             || (gap_len > unknown_gap_size && unknown_greater_than_or_equal)
8816             || gap_len == known_gap_size
8817             || (gap_len > known_gap_size && known_greater_than_or_equal))) {
8818         /* add any prior sequence data as literal */
8819         ch = *str;
8820         *str = '\0';
8821         len += AddSeqLitData (txt, &(seq_ext));
8822         *str = ch;
8823         /* add a gap */
8824         make_unknown_size = IsGapUnknown (gap_len,
8825                                           unknown_gap_size,
8826                                           known_gap_size,
8827                                           unknown_greater_than_or_equal,
8828                                           known_greater_than_or_equal);
8829 
8830         len += AddGap(gap_len,
8831                       make_unknown_size,
8832                       bsp,
8833                       len,
8834                       &(seq_ext));
8835         txt = str + gap_len;
8836       }
8837       str += gap_len;
8838     } else {
8839       gap_len = StringCSpn (str, "N");
8840       str += gap_len;
8841     }
8842   }
8843   /* at end, add last sequence data literal */
8844   len += AddSeqLitData (txt, &(seq_ext));
8845 
8846   MemFree (bases);
8847 
8848   bsp->seq_data = SeqDataFree (bsp->seq_data, bsp->seq_data_type);
8849   bsp->seq_data_type = 0;
8850   bsp->repr = Seq_repr_delta;
8851   bsp->seq_ext_type = 4;
8852   bsp->seq_ext = seq_ext;
8853   bsp->length = len;
8854 
8855   BioseqPack (bsp);
8856 }
8857 
8858 
8859 /* Protein Molecular Weight Section */
8860 
/* Atom counts per residue, indexed by (letter - 'A'), A through Z order:
   B is really D or N, but they are close so is treated as D
   Z is really E or Q, but they are close so is treated as E
   J carries the same counts as I and L
   X is hard to guess, so all of its counts are 0 and the calculation fails on X
   - and * are skipped
   water molecule is removed for in-peptide atom counts

   The MolWt functions use a zero entry in H_atoms as the marker for an
   unsupported residue.

  A  B  C  D  E  F  G  H   I   J   K   L  M  N   O  P  Q   R  S  T  U  V   W  X  Y  Z
*/
/* carbon atoms per residue */
Uint1 C_atoms[26] =
{ 3, 4, 3, 4, 5, 9, 2, 6,  6,  6,  6,  6, 5, 4, 12, 5, 5,  6, 3, 4, 3, 5, 11, 0, 9, 5};
/* hydrogen atoms per residue; 0 marks an unsupported letter */
Uint1 H_atoms[26] =
{ 5, 5, 5, 5, 7, 9, 3, 7, 11, 11, 12, 11, 9, 6, 19, 7, 8, 12, 5, 7, 5, 9, 10, 0, 9, 7};
/* nitrogen atoms per residue */
Uint1 N_atoms[26] =
{ 1, 1, 1, 1, 1, 1, 1, 3,  1,  1,  2,  1, 1, 2,  3, 1, 2,  4, 1, 1, 1, 1,  2, 0, 1, 1};
/* oxygen atoms per residue */
Uint1 O_atoms[26] =
{ 1, 3, 1, 3, 3, 1, 1, 1,  1,  1,  1,  1, 1, 2,  2, 1, 2,  1, 2, 2, 1, 1,  1, 0, 2, 3};
/* sulfur atoms per residue (C and M only) */
Uint1 S_atoms[26] =
{ 0, 0, 1, 0, 0, 0, 0, 0,  0,  0,  0,  0, 1, 0,  0, 0, 0,  0, 0, 0, 0, 0,  0, 0, 0, 0};
/* selenium atoms per residue (U only) */
Uint1 Se_atoms[26] =
{ 0, 0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  0, 0, 0,  0, 0, 0,  0, 0, 0, 1, 0,  0, 0, 0, 0};
8882 
8883 /**************************************************************
8884 *
8885 *  Returns a protein molecular weight for a SeqLoc
8886 *    If it cannot calculate the value it returns -1.0
8887 *    If sequence contains X,B,U,*,orZ it fails
8888 *
8889 ***************************************************************/
MolWtForLoc(SeqLocPtr slp)8890 NLM_EXTERN FloatHi MolWtForLoc (SeqLocPtr slp)
8891 {
8892     StreamCache sc;
8893     Int2 res;
8894     int residue;
8895     Int4    Ccnt,
8896         Hcnt,
8897         Ncnt,
8898         Ocnt,
8899         Scnt,
8900         Secnt;
8901     FloatHi retval = -1.0;
8902 
8903     if (slp == NULL) return retval;
8904     StreamCacheSetup (NULL, slp, 0, &sc);
8905 
8906     Ccnt = 0;  /* initialize counters */
8907     Hcnt = 2;  /* always start with water */
8908     Ocnt = 1;  /* H20 */
8909     Ncnt = 0;
8910     Scnt = 0;
8911     Secnt = 0;
8912 
8913     while ((res = StreamCacheGetResidue (&sc)) != '\0')
8914     {
8915         if (IS_LOWER (res)) {
8916             res = TO_UPPER (res);
8917         }
8918         if (IS_UPPER (res)) {
8919             residue = res - 'A';
8920             if (H_atoms[residue] == 0) { /* unsupported AA */
8921                 return retval;    /* bail out */
8922             }
8923             Ccnt += C_atoms[residue];
8924             Hcnt += H_atoms[residue];
8925             Ncnt += N_atoms[residue];
8926             Ocnt += O_atoms[residue];
8927             Scnt += S_atoms[residue];
8928             Secnt += Se_atoms[residue];
8929         } else if (res != '-' && res != '*') {
8930             return retval;    /* bail out */
8931         }
8932     }
8933 
8934     retval = (12.01115 * Ccnt) + (1.0079 * Hcnt) +
8935          (14.0067 * Ncnt) + (15.9994 * Ocnt) +
8936          (32.064 * Scnt) + (78.96 * Secnt);
8937 
8938     return retval;
8939 }
8940 
MolWtForBsp(BioseqPtr bsp)8941 NLM_EXTERN FloatHi MolWtForBsp (BioseqPtr bsp)
8942 {
8943     StreamCache sc;
8944     Int2 res;
8945     int residue;
8946     Int4    Ccnt,
8947         Hcnt,
8948         Ncnt,
8949         Ocnt,
8950         Scnt,
8951         Secnt;
8952     FloatHi retval = -1.0;
8953 
8954     if (bsp == NULL) return retval;
8955     if (! ISA_aa (bsp->mol)) return retval;
8956     StreamCacheSetup (bsp, NULL, 0, &sc);
8957 
8958     Ccnt = 0;  /* initialize counters */
8959     Hcnt = 2;  /* always start with water */
8960     Ocnt = 1;  /* H20 */
8961     Ncnt = 0;
8962     Scnt = 0;
8963     Secnt = 0;
8964 
8965     while ((res = StreamCacheGetResidue (&sc)) != '\0')
8966     {
8967         if (IS_LOWER (res)) {
8968             res = TO_UPPER (res);
8969         }
8970         if (IS_UPPER (res)) {
8971             residue = res - 'A';
8972             if (H_atoms[residue] == 0) { /* unsupported AA */
8973                 return retval;    /* bail out */
8974             }
8975             Ccnt += C_atoms[residue];
8976             Hcnt += H_atoms[residue];
8977             Ncnt += N_atoms[residue];
8978             Ocnt += O_atoms[residue];
8979             Scnt += S_atoms[residue];
8980             Secnt += Se_atoms[residue];
8981         } else if (res != '-' && res != '*') {
8982             return retval;    /* bail out */
8983         }
8984     }
8985 
8986     retval = (12.01115 * Ccnt) + (1.0079 * Hcnt) +
8987          (14.0067 * Ncnt) + (15.9994 * Ocnt) +
8988          (32.064 * Scnt) + (78.96 * Secnt);
8989 
8990     return retval;
8991 }
8992 
MolWtForStr(CharPtr str)8993 NLM_EXTERN FloatHi MolWtForStr (CharPtr str)
8994 {
8995     Char res;
8996     int residue;
8997     Int4    Ccnt,
8998         Hcnt,
8999         Ncnt,
9000         Ocnt,
9001         Scnt,
9002         Secnt;
9003     FloatHi retval = -1.0;
9004 
9005     if (str == NULL) return retval;
9006 
9007     Ccnt = 0;  /* initialize counters */
9008     Hcnt = 2;  /* always start with water */
9009     Ocnt = 1;  /* H20 */
9010     Ncnt = 0;
9011     Scnt = 0;
9012     Secnt = 0;
9013 
9014     res = *str;
9015     while (res != '\0')
9016     {
9017         if (IS_LOWER (res)) {
9018             res = TO_UPPER (res);
9019         }
9020         if (IS_UPPER (res)) {
9021             residue = res - 'A';
9022             if (H_atoms[residue] == 0) { /* unsupported AA */
9023                 return retval;    /* bail out */
9024             }
9025             Ccnt += C_atoms[residue];
9026             Hcnt += H_atoms[residue];
9027             Ncnt += N_atoms[residue];
9028             Ocnt += O_atoms[residue];
9029             Scnt += S_atoms[residue];
9030             Secnt += Se_atoms[residue];
9031         } else if (res != '-' && res != '*') {
9032             return retval;    /* bail out */
9033         }
9034         str++;
9035         res = *str;
9036     }
9037 
9038     retval = (12.01115 * Ccnt) + (1.0079 * Hcnt) +
9039          (14.0067 * Ncnt) + (15.9994 * Ocnt) +
9040          (32.064 * Scnt) + (78.96 * Secnt);
9041 
9042     return retval;
9043 }
9044 
9045