1 /* seqport.c
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * File Name: seqport.c
27 *
28 * Author: James Ostell
29 *
30 * Version Creation Date: 7/13/91
31 *
32 * $Revision: 6.198 $
33 *
34 * File Description: Ports onto Bioseqs
35 *
36 * Modifications:
37 * --------------------------------------------------------------------------
38 * Date Name Description of modification
39 * ------- ---------- -----------------------------------------------------
40 *
41 * ==========================================================================
42 */
43
44 /** for ErrPostEx() ****/
45
/* Module name and source-file name for this translation unit, exposed through
 * the THIS_MODULE / THIS_FILE macros.  Per the "for ErrPostEx()" note above
 * these are presumably picked up by the error-posting macros -- confirm
 * against the header that defines ErrPostEx. */
46 static char *this_module = "ncbiapi";
47 #define THIS_MODULE this_module
48 static char *this_file = __FILE__;
49 #define THIS_FILE this_file
50
51 /**********************/
52
53
54 #include <seqport.h>
55 #include <edutil.h> /* for SeqLoc creation functions */
56 #include <gather.h> /* for SeqLocOffset function */
57 #include <sqnutils.h>
58 #include <explore.h> /* for BioseqFindFromSeqLoc function */
59 #include <subutil.h>
60 #include <tofasta.h> /* for FastaSeqLineEx function */
61 #include <salutil.h>
62 #include <alignmgr2.h> /* for correcting alignments when converting to delta */
63
64
/* Prototype for SeqPortAdjustLength, which is defined further down in this
 * file.  NOTE(review): presumably declared here because it is not exported
 * by seqport.h -- confirm before removing. */
65 NLM_EXTERN Boolean LIBCALL SeqPortAdjustLength (SeqPortPtr spp);
66
67 /*****************************************************************************
68 *
69 * Fast mapping arrays
70 *
71 *****************************************************************************/
72
/* Lookup tables, each allocated and filled on first use by the matching
 * Init... function below and indexed by a packed input byte value:
 *   Na2toIUPAC        - 4 bytes per input value, letters from "ACGT"
 *   Na4toIUPAC        - 2 bytes per input value, 16-letter table with 'N' in slot 0
 *   Na4toIUPACplusGap - 2 bytes per input value, same table with '-' in slot 0
 *   Na2toNa4          - 2 bytes per input value
 *   Na2to4Bit         - 4 bytes per input value, values 1/2/4/8
 *   Na4to4Bit         - 2 bytes per input value, values 1..15 (slot 0 maps to 15)
 * seqport_mutex is the single mutex held by every Init... function while it
 * builds its table. */
73 static Uint1Ptr Na2toIUPAC = NULL;
74 static Uint1Ptr Na4toIUPAC = NULL;
75 static Uint1Ptr Na4toIUPACplusGap = NULL;
76 static Uint1Ptr Na2toNa4 = NULL;
77 static Uint1Ptr Na2to4Bit = NULL;
78 static Uint1Ptr Na4to4Bit = NULL;
79 static TNlmMutex seqport_mutex = NULL;
80
81
82 /*****************************************************************************
83 *
84 * MapNa2ByteToIUPACString and MapNa4ByteToIUPACString now copy directly to
85 * the expanded character buffer for efficiency
86 *
87 *****************************************************************************/
88
InitNa2toIUPAC(void)89 static void InitNa2toIUPAC (void)
90
91 {
92 Int2 base [4], index, j;
93 Char convert [4] = {'A', 'C', 'G', 'T'};
94 Int4 ret;
95 Uint1Ptr Na2toIUPAC_local = NULL;
96
97 ret = NlmMutexLockEx (&seqport_mutex); /* protect this section */
98 if (ret) {
99 ErrPostEx (SEV_FATAL, 0, 0, "MapNa2ByteToIUPACString mutex failed [%ld]", (long) ret);
100 return;
101 }
102
103 if (Na2toIUPAC == NULL) {
104 Na2toIUPAC_local = MemNew (sizeof (Uint1) * 1024);
105
106 if (Na2toIUPAC_local != NULL) {
107 for (base [0] = 0; base [0] < 4; (base [0])++) {
108 for (base [1] = 0; base [1] < 4; (base [1])++) {
109 for (base [2] = 0; base [2] < 4; (base [2])++) {
110 for (base [3] = 0; base [3] < 4; (base [3])++) {
111 index = 4 * (base [0] * 64 + base [1] * 16 + base [2] * 4 + base [3]);
112 for (j = 0; j < 4; j++) {
113 Na2toIUPAC_local [index + j] = convert [(base [j])];
114 }
115 }
116 }
117 }
118 }
119 }
120 Na2toIUPAC = Na2toIUPAC_local;
121 }
122
123 NlmMutexUnlock (seqport_mutex);
124 }
125
MapNa2ByteToIUPACString(Uint1Ptr bytep,Uint4Ptr buf,Int4 total)126 NLM_EXTERN Uint4Ptr LIBCALL MapNa2ByteToIUPACString (Uint1Ptr bytep, Uint4Ptr buf, Int4 total)
127
128 {
129 Uint4Ptr bp;
130 Uint1 byte;
131 Int2 index;
132 Int4 k;
133 Uint4Ptr ptr;
134
135 if (bytep == NULL || buf == NULL) return buf;
136 ptr = buf;
137
138 /* initialize array if not yet set (first time function is called) */
139
140 if (Na2toIUPAC == NULL) {
141 InitNa2toIUPAC ();
142 }
143
144 if (Na2toIUPAC == NULL) return buf;
145
146 /* now return 4 character string for each compressed byte */
147
148 for (k = 0; k < total; k++) {
149 byte = *bytep;
150 bytep++;
151 index = 4 * byte;
152 bp = (Uint4Ptr) (Na2toIUPAC + index);
153 /* copy 4 bytes at a time */
154 /*
155 for (j = 0; j < 4; j++) {
156 *ptr = *bp;
157 ptr++;
158 bp++;
159 }
160 */
161 *ptr = *bp;
162 ptr++;
163 }
164
165 return ptr;
166 }
167
InitNa4toIUPAC(void)168 static void InitNa4toIUPAC (void)
169
170 {
171 Int2 base [2], index, j;
172 Char convert [16] = {'N', 'A', 'C', 'M', 'G', 'R', 'S', 'V',
173 'T', 'W', 'Y', 'H', 'K', 'D', 'B', 'N'};
174 Int4 ret;
175 Uint1Ptr Na4toIUPAC_local = NULL;
176
177 ret = NlmMutexLockEx (&seqport_mutex); /* protect this section */
178 if (ret) {
179 ErrPostEx (SEV_FATAL, 0, 0, "MapNa4ByteToIUPACString mutex failed [%ld]", (long) ret);
180 return;
181 }
182
183 if (Na4toIUPAC == NULL) {
184 Na4toIUPAC_local = MemNew (sizeof (Uint1) * 512);
185
186 if (Na4toIUPAC_local != NULL) {
187 for (base [0] = 0; base [0] < 16; (base [0])++) {
188 for (base [1] = 0; base [1] < 16; (base [1])++) {
189 index = 2 * (base [0] * 16 + base [1]);
190 for (j = 0; j < 2; j++) {
191 Na4toIUPAC_local [index + j] = convert [(base [j])];
192 }
193 }
194 }
195 }
196 Na4toIUPAC = Na4toIUPAC_local;
197 }
198
199 NlmMutexUnlock (seqport_mutex);
200 }
201
MapNa4ByteToIUPACString(Uint1Ptr bytep,Uint2Ptr buf,Int4 total)202 NLM_EXTERN Uint2Ptr LIBCALL MapNa4ByteToIUPACString (Uint1Ptr bytep, Uint2Ptr buf, Int4 total)
203
204 {
205 Uint2Ptr bp;
206 Uint1 byte;
207 Int2 index;
208 Int4 k;
209 Uint2Ptr ptr;
210
211 if (bytep == NULL || buf == NULL) return buf;
212 ptr = buf;
213
214 /* initialize array if not yet set (first time function is called) */
215
216 if (Na4toIUPAC == NULL) {
217 InitNa4toIUPAC ();
218 }
219
220 if (Na4toIUPAC == NULL) return buf;
221
222 /* now return 2 character string for each compressed byte */
223
224 for (k = 0; k < total; k++) {
225 byte = *bytep;
226 bytep++;
227 index = 2 * byte;
228 bp = (Uint2Ptr) (Na4toIUPAC + index);
229 /* copy 2 bytes at a time */
230 /*
231 for (j = 0; j < 2; j++) {
232 *ptr = *bp;
233 ptr++;
234 bp++;
235 }
236 */
237 *ptr = *bp;
238 ptr++;
239 }
240
241 return ptr;
242 }
243
InitNa4toIUPACplusGap(void)244 static void InitNa4toIUPACplusGap (void)
245
246 {
247 Int2 base [2], index, j;
248 Char convert [16] = {'-', 'A', 'C', 'M', 'G', 'R', 'S', 'V',
249 'T', 'W', 'Y', 'H', 'K', 'D', 'B', 'N'};
250 Int4 ret;
251 Uint1Ptr Na4toIUPACplusGap_local = NULL;
252
253 ret = NlmMutexLockEx (&seqport_mutex); /* protect this section */
254 if (ret) {
255 ErrPostEx (SEV_FATAL, 0, 0, "MapNa4ByteToIUPACplusGapString mutex failed [%ld]", (long) ret);
256 return;
257 }
258
259 if (Na4toIUPACplusGap == NULL) {
260 Na4toIUPACplusGap_local = MemNew (sizeof (Uint1) * 512);
261
262 if (Na4toIUPACplusGap_local != NULL) {
263 for (base [0] = 0; base [0] < 16; (base [0])++) {
264 for (base [1] = 0; base [1] < 16; (base [1])++) {
265 index = 2 * (base [0] * 16 + base [1]);
266 for (j = 0; j < 2; j++) {
267 Na4toIUPACplusGap_local [index + j] = convert [(base [j])];
268 }
269 }
270 }
271 }
272 Na4toIUPACplusGap = Na4toIUPACplusGap_local;
273 }
274
275 NlmMutexUnlock (seqport_mutex);
276 }
277
MapNa4ByteToIUPACplusGapString(Uint1Ptr bytep,Uint2Ptr buf,Int4 total)278 NLM_EXTERN Uint2Ptr LIBCALL MapNa4ByteToIUPACplusGapString (Uint1Ptr bytep, Uint2Ptr buf, Int4 total)
279
280 {
281 Uint2Ptr bp;
282 Uint1 byte;
283 Int2 index;
284 Int4 k;
285 Uint2Ptr ptr;
286
287 if (bytep == NULL || buf == NULL) return buf;
288 ptr = buf;
289
290 /* initialize array if not yet set (first time function is called) */
291
292 if (Na4toIUPACplusGap == NULL) {
293 InitNa4toIUPACplusGap ();
294 }
295
296 if (Na4toIUPACplusGap == NULL) return buf;
297
298 /* now return 2 character string for each compressed byte */
299
300 for (k = 0; k < total; k++) {
301 byte = *bytep;
302 bytep++;
303 index = 2 * byte;
304 bp = (Uint2Ptr) (Na4toIUPACplusGap + index);
305 /* copy 2 bytes at a time */
306 /*
307 for (j = 0; j < 2; j++) {
308 *ptr = *bp;
309 ptr++;
310 bp++;
311 }
312 */
313 *ptr = *bp;
314 ptr++;
315 }
316
317 return ptr;
318 }
319
InitNa2toNa4(void)320 static void InitNa2toNa4 (void)
321
322 {
323 Int2 pair [2], index, j;
324 Uint1 convert [16] = {17, 18, 20, 24, 33, 34, 36, 40,
325 65, 66, 68, 72, 129, 130, 132, 136};
326 Int4 ret;
327 Uint1Ptr Na2toNa4_local = NULL;
328
329 ret = NlmMutexLockEx (&seqport_mutex); /* protect this section */
330 if (ret) {
331 ErrPostEx (SEV_FATAL, 0, 0, "MapNa2ByteToIUPACString mutex failed [%ld]", (long) ret);
332 return;
333 }
334
335 if (Na2toNa4 == NULL) {
336 Na2toNa4_local = MemNew (sizeof (Uint1) * 512);
337
338 if (Na2toNa4_local != NULL) {
339 for (pair [0] = 0; pair [0] < 16; (pair [0])++) {
340 for (pair [1] = 0; pair [1] < 16; (pair [1])++) {
341 index = 2 * (pair [0] * 16 + pair [1]);
342 for (j = 0; j < 2; j++) {
343 Na2toNa4_local [index + j] = convert [(pair [j])];
344 }
345 }
346 }
347 }
348 Na2toNa4 = Na2toNa4_local;
349 }
350
351 NlmMutexUnlock (seqport_mutex);
352 }
353
MapNa2ByteToNa4String(Uint1Ptr bytep,Uint2Ptr buf,Int4 total)354 NLM_EXTERN Uint2Ptr LIBCALL MapNa2ByteToNa4String (Uint1Ptr bytep, Uint2Ptr buf, Int4 total)
355
356 {
357 Uint2Ptr bp;
358 Uint1 byte;
359 Int2 index;
360 Int4 k;
361 Uint2Ptr ptr;
362
363 if (bytep == NULL || buf == NULL) return buf;
364 ptr = buf;
365
366 /* initialize array if not yet set (first time function is called) */
367
368 if (Na2toNa4 == NULL) {
369 InitNa2toNa4 ();
370 }
371
372 if (Na2toNa4 == NULL) return buf;
373
374 /* now return 2 character byte for each compressed byte */
375
376 for (k = 0; k < total; k++) {
377 byte = *bytep;
378 bytep++;
379 index = 2 * byte;
380 bp = (Uint2Ptr) (Na2toNa4 + index);
381 /* copy 2 bytes at a time */
382 /*
383 for (j = 0; j < 2; j++) {
384 *ptr = *bp;
385 ptr++;
386 bp++;
387 }
388 */
389 *ptr = *bp;
390 ptr++;
391 }
392
393 return ptr;
394 }
395
InitNa2to4Bit(void)396 static void InitNa2to4Bit (void)
397
398 {
399 Int2 base [4], index, j;
400 Uint1 convert [4] = {1, 2, 4, 8};
401 Int4 ret;
402 Uint1Ptr Na2to4Bit_local = NULL;
403
404 ret = NlmMutexLockEx (&seqport_mutex); /* protect this section */
405 if (ret) {
406 ErrPostEx (SEV_FATAL, 0, 0, "MapNa2ByteTo4BitString mutex failed [%ld]", (long) ret);
407 return;
408 }
409
410 if (Na2to4Bit == NULL) {
411 Na2to4Bit_local = MemNew (sizeof (Uint1) * 1024);
412
413 if (Na2to4Bit_local != NULL) {
414 for (base [0] = 0; base [0] < 4; (base [0])++) {
415 for (base [1] = 0; base [1] < 4; (base [1])++) {
416 for (base [2] = 0; base [2] < 4; (base [2])++) {
417 for (base [3] = 0; base [3] < 4; (base [3])++) {
418 index = 4 * (base [0] * 64 + base [1] * 16 + base [2] * 4 + base [3]);
419 for (j = 0; j < 4; j++) {
420 Na2to4Bit_local [index + j] = convert [(base [j])];
421 }
422 }
423 }
424 }
425 }
426 }
427 Na2to4Bit = Na2to4Bit_local;
428 }
429
430 NlmMutexUnlock (seqport_mutex);
431 }
432
MapNa2ByteTo4BitString(Uint1Ptr bytep,Uint4Ptr buf,Int4 total)433 NLM_EXTERN Uint4Ptr LIBCALL MapNa2ByteTo4BitString (Uint1Ptr bytep, Uint4Ptr buf, Int4 total)
434
435 {
436 Uint4Ptr bp;
437 Uint1 byte;
438 Int2 index;
439 Int4 k;
440 Uint4Ptr ptr;
441
442 if (bytep == NULL || buf == NULL) return buf;
443 ptr = buf;
444
445 /* initialize array if not yet set (first time function is called) */
446
447 if (Na2to4Bit == NULL) {
448 InitNa2to4Bit ();
449 }
450
451 if (Na2to4Bit == NULL) return buf;
452
453 /* now return 4 byte string for each compressed byte */
454
455 for (k = 0; k < total; k++) {
456 byte = *bytep;
457 bytep++;
458 index = 4 * byte;
459 bp = (Uint4Ptr) (Na2to4Bit + index);
460 /* copy 4 bytes at a time */
461 /*
462 for (j = 0; j < 4; j++) {
463 *ptr = *bp;
464 ptr++;
465 bp++;
466 }
467 */
468 *ptr = *bp;
469 ptr++;
470 }
471
472 return ptr;
473 }
474
InitNa4to4Bit(void)475 static void InitNa4to4Bit (void)
476
477 {
478 Int2 base [2], index, j;
479 Char convert [16] = {15, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
480 Int4 ret;
481 Uint1Ptr Na4to4Bit_local = NULL;
482
483 ret = NlmMutexLockEx (&seqport_mutex); /* protect this section */
484 if (ret) {
485 ErrPostEx (SEV_FATAL, 0, 0, "MapNa4ByteToIUPACString mutex failed [%ld]", (long) ret);
486 return;
487 }
488
489 if (Na4to4Bit == NULL) {
490 Na4to4Bit_local = MemNew (sizeof (Uint1) * 512);
491
492 if (Na4to4Bit_local != NULL) {
493 for (base [0] = 0; base [0] < 16; (base [0])++) {
494 for (base [1] = 0; base [1] < 16; (base [1])++) {
495 index = 2 * (base [0] * 16 + base [1]);
496 for (j = 0; j < 2; j++) {
497 Na4to4Bit_local [index + j] = convert [(base [j])];
498 }
499 }
500 }
501 }
502 Na4to4Bit = Na4to4Bit_local;
503 }
504
505 NlmMutexUnlock (seqport_mutex);
506 }
507
MapNa4ByteTo4BitString(Uint1Ptr bytep,Uint2Ptr buf,Int4 total)508 NLM_EXTERN Uint2Ptr LIBCALL MapNa4ByteTo4BitString (Uint1Ptr bytep, Uint2Ptr buf, Int4 total)
509
510 {
511 Uint2Ptr bp;
512 Uint1 byte;
513 Int2 index;
514 Int4 k;
515 Uint2Ptr ptr;
516
517 if (bytep == NULL || buf == NULL) return buf;
518 ptr = buf;
519
520 /* initialize array if not yet set (first time function is called) */
521
522 if (Na4to4Bit == NULL) {
523 InitNa4to4Bit ();
524 }
525
526 if (Na4to4Bit == NULL) return buf;
527
528 /* now return 2 character string for each compressed byte */
529
530 for (k = 0; k < total; k++) {
531 byte = *bytep;
532 bytep++;
533 index = 2 * byte;
534 bp = (Uint2Ptr) (Na4to4Bit + index);
535 /* copy 2 bytes at a time */
536 /*
537 for (j = 0; j < 2; j++) {
538 *ptr = *bp;
539 ptr++;
540 bp++;
541 }
542 */
543 *ptr = *bp;
544 ptr++;
545 }
546
547 return ptr;
548 }
549
550 /*****************************************************************************
551 *
552 * SeqPort Routines
553 *
554 *****************************************************************************/
555
556 /*****************************************************************************
557 *
558 * SeqPortFree(spp)
559 *
560 *****************************************************************************/
SeqPortFree(SeqPortPtr spp)561 NLM_EXTERN SeqPortPtr SeqPortFree (SeqPortPtr spp)
562
563 {
564 SeqPortPtr tspp, nextspp;
565
566 if (spp == NULL)
567 return NULL;
568
569 if (spp->locked) /* locked during access */
570 BioseqUnlock(spp->bsp); /* make available for freeing */
571
572 tspp = spp->segs;
573 while (tspp != NULL)
574 {
575 nextspp = tspp->next;
576 SeqPortFree(tspp);
577 tspp = nextspp;
578 }
579
580 MemFree(spp->cache);
581 MemFree (spp->cacheq);
582
583 MemFree(spp);
584
585 return NULL;
586 }
587
588 /*****************************************************************************
589 *
590 * SeqPortSetValues(spp)
591 * Copies the values is_circle, is_seg, and do_virtual from spp to
592 * any dependent SeqPortPtrs it contains. This is necessary for segmented
593 * reference, or delta types of Bioseqs and on SeqPortNewByLoc()
594 *
595 * SeqPortSet_... functions call this function
596 *
597 *****************************************************************************/
SeqPortSetValues(SeqPortPtr spp)598 NLM_EXTERN Boolean LIBCALL SeqPortSetValues (SeqPortPtr spp)
599 {
600 SeqPortPtr tmp;
601
602 if (spp == NULL)
603 return FALSE;
604
605 for (tmp = spp->segs; tmp != NULL; tmp = tmp->next)
606 {
607 tmp->is_circle = spp->is_circle;
608 tmp->is_seg = spp->is_seg;
609 tmp->do_virtual = spp->do_virtual;
610 tmp->gapIsZero = spp->gapIsZero;
611
612 if (tmp->segs != NULL)
613 SeqPortSetValues(tmp);
614 }
615
616 return TRUE;
617 }
618
619
SeqPortSet_is_circle(SeqPortPtr spp,Boolean value)620 NLM_EXTERN Boolean LIBCALL SeqPortSet_is_circle (SeqPortPtr spp, Boolean value)
621 {
622 if (spp == NULL)
623 return FALSE;
624 spp->is_circle = value;
625 return SeqPortSetValues(spp);
626 }
627
SeqPortSet_is_seg(SeqPortPtr spp,Boolean value)628 NLM_EXTERN Boolean LIBCALL SeqPortSet_is_seg (SeqPortPtr spp, Boolean value)
629 {
630 if (spp == NULL)
631 return FALSE;
632 spp->is_seg = value;
633 return SeqPortSetValues(spp);
634 }
635
636 /**************************************************************
637 *
638 * This function adjusts the length of seqport to remove virtual
639 * segments or add them back as needed
640 *
641 **************************************************************/
SeqPortAdjustLength(SeqPortPtr spp)642 NLM_EXTERN Boolean LIBCALL SeqPortAdjustLength (SeqPortPtr spp)
643 {
644 SeqPortPtr tmp;
645 Int4 len = 0;
646
647 if (spp == NULL)
648 return FALSE;
649
650
651 if (spp->isa_virtual)
652 {
653 if (spp->do_virtual)
654 spp->totlen = spp->stop - spp->start + 1;
655 else
656 spp->totlen = 0;
657 if (spp->totlen == 0)
658 spp->isa_null = TRUE;
659 else
660 spp->isa_null = FALSE;
661 }
662 else if (spp->segs != NULL)
663 {
664 for (tmp = spp->segs; tmp != NULL; tmp = tmp->next)
665 {
666 SeqPortAdjustLength (tmp);
667 len += tmp->totlen;
668 }
669 spp->totlen = len;
670 }
671 else if (! spp->isa_null)
672 spp->totlen = spp->stop - spp->start + 1;
673 spp->curpos = -1; /* reset to unused */
674
675 return TRUE;
676
677 }
678
SeqPortSet_do_virtualEx(SeqPortPtr spp,Boolean value,Boolean gapIsZero)679 NLM_EXTERN Boolean LIBCALL SeqPortSet_do_virtualEx (SeqPortPtr spp, Boolean value, Boolean gapIsZero)
680 {
681 Boolean do_it = FALSE, has_virtual=FALSE;
682 SeqPortPtr tmp;
683
684 if (spp == NULL)
685 return FALSE;
686
687 if (spp->isa_virtual == TRUE)
688 has_virtual = TRUE;
689 if (spp->do_virtual != value)
690 do_it = TRUE;
691 if (spp->gapIsZero != gapIsZero)
692 do_it = TRUE;
693 for (tmp = spp->segs; tmp != NULL; tmp = tmp->next)
694 {
695 if (tmp->isa_virtual == TRUE)
696 has_virtual = TRUE;
697 if (tmp->do_virtual != value)
698 do_it = TRUE;
699 if (tmp->gapIsZero != gapIsZero)
700 do_it = TRUE;
701 }
702
703 if (! do_it) /* no change needed */
704 return TRUE;
705
706
707 spp->do_virtual = value;
708 spp->gapIsZero = gapIsZero;
709 SeqPortSetValues(spp);
710 if (has_virtual) /* have to check the SeqPort */
711 {
712 SeqPortAdjustLength(spp);
713 SeqPortSeek(spp, 0, SEEK_SET);
714 }
715
716 return TRUE;
717 }
718
719
SeqPortSet_do_virtual(SeqPortPtr spp,Boolean value)720 NLM_EXTERN Boolean LIBCALL SeqPortSet_do_virtual (SeqPortPtr spp, Boolean value)
721 {
722 return SeqPortSet_do_virtualEx (spp, value, FALSE);
723 }
724
SeqPortSetUpFields(SeqPortPtr spp,Int4 start,Int4 stop,Uint1 strand,Uint1 newcode)725 NLM_EXTERN Boolean LIBCALL SeqPortSetUpFields (SeqPortPtr spp, Int4 start, Int4 stop, Uint1
726 strand, Uint1 newcode)
727 {
728 if (spp == NULL) return FALSE;
729 spp->start = start;
730 spp->stop = stop;
731 spp->strand = strand;
732 spp->curpos = -1; /* not set */
733 spp->totlen = stop - start + 1;
734 spp->newcode = newcode;
735 spp->sctp = SeqCodeTableFind(newcode);
736
737 return TRUE;
738 }
SeqPortSetUpAlphabet(SeqPortPtr spp,Uint1 curr_code,Uint1 newcode)739 NLM_EXTERN Boolean LIBCALL SeqPortSetUpAlphabet(SeqPortPtr spp, Uint1 curr_code, Uint1
740 newcode)
741 {
742 if (spp == NULL) return FALSE;
743
744 spp->oldcode = curr_code;
745 spp->sctp = SeqCodeTableFind(curr_code);
746
747 switch (curr_code)
748 {
749 case Seq_code_ncbi2na:
750 spp->bc = 4; /* bit shifts needed */
751 spp->rshift = 6;
752 spp->lshift = 2;
753 spp->mask = 192;
754 break;
755 case Seq_code_ncbi4na:
756 spp->bc = 2;
757 spp->rshift = 4;
758 spp->lshift = 4;
759 spp->mask = 240;
760 break;
761 default:
762 spp->bc = 1;
763 spp->mask = 255;
764 break;
765 }
766
767 if ((newcode) && (newcode != curr_code)) /* conversion alphabet */
768 {
769 if ((spp->smtp = SeqMapTableFind(newcode, curr_code)) != NULL)
770 spp->sctp = SeqCodeTableFind(newcode);
771 }
772
773 return TRUE;
774 }
775
776 /*****************************************************************************
777 *
778 * SeqPortNew(bsp, start, stop, strand, newcode)
779 * if bsp == NULL, creates an empty port
780 * see objloc.h for strand defines
781 *
782 *****************************************************************************/
SeqPortNew(BioseqPtr bsp,Int4 start,Int4 stop,Uint1 strand,Uint1 newcode)783 NLM_EXTERN SeqPortPtr SeqPortNew (BioseqPtr bsp, Int4 start, Int4 stop, Uint1 strand, Uint1
784 newcode)
785
786 {
787 SeqPortPtr spp, spps, sppcurr = NULL, spprev, prev, curr = NULL;
788 Uint1 curr_code, repr, tstrand = 0;
789 SeqLocPtr the_segs = NULL, currseg;
790 Int4 len, ctr, tlen = 0, tfrom = 0, tto = 0, xfrom, xto, tstart, tstop;
791 Char errbuf[41], idbuf[41];
792 ValNode fake;
793 Boolean done, started;
794 BioseqPtr tbsp;
795 ValNodePtr currchunk; /* can be a SeqLoc or an element of a Delta Seq
796 */
797 Boolean do_multi_loc, cycle2;
798 SeqLitPtr slitp = NULL;
799 SeqIdPtr tsip;
800
801 spp = (SeqPortPtr) MemNew(sizeof(SeqPort));
802 errbuf[0] = '\0';
803
804 if (bsp == NULL) /* a NULL section */
805 return spp;
806
807 spp->bsp = bsp; /* get ready for error
808 msgs */
809 SeqIdWrite(SeqIdFindBest(bsp->id, 0), errbuf, PRINTID_FASTA_SHORT, 40);
810 len = BioseqGetLen(bsp);
811 if (start < 0)
812 {
813 ErrPostEx(SEV_ERROR, 0,0 ,
814 "SeqPortNew: %s start (%ld)< 0", errbuf,
815 (long)start);
816 MemFree(spp);
817 return NULL;
818 }
819 if (start >= len)
820 {
821 ErrPostEx(SEV_ERROR,0,0,
822 "SeqPortNew: %s start(%ld) >= len(%ld)",
823 errbuf, (long)start, (long)len);
824 MemFree(spp);
825 return NULL;
826 }
827 if (stop == LAST_RESIDUE)
828 stop = len - 1;
829 else if (stop < start)
830 {
831 ErrPostEx(SEV_ERROR,0,0,
832 "SeqPortNew: %s stop(%ld) < start(%ld)",
833 errbuf, (long)stop, (long)start);
834 MemFree(spp);
835 return NULL;
836 }
837 else if (stop >= len)
838 {
839 ErrPostEx(SEV_ERROR,0,0,
840 "SeqPortNew: %s stop(%ld) >= len(%ld)",
841 errbuf, (long)stop, (long)len);
842 MemFree(spp);
843 return NULL;
844 }
845
846 SeqPortSetUpFields (spp, start,stop, strand, newcode);
847
848 spp->currnum = BioseqGetSeqDescr(bsp, Seq_descr_num, NULL);
849 if (spp->currnum == NULL) /* no numbering set */
850 spp->currnum = NumberingDefaultGet(); /* use default */
851
852 repr = Bioseq_repr(bsp);
853 if ((repr == Seq_repr_virtual) || /* virtual sequence */
854 (repr == Seq_repr_map )) /* map sequence */
855 {
856 spp->isa_virtual = TRUE;
857 spp->curpos = 0;
858 }
859 else if ((repr == Seq_repr_seg) || /* segmented */
860 (repr == Seq_repr_ref) || /* reference */
861 (repr == Seq_repr_delta)) /* delta */
862 {
863 spp->oldcode = 0; /* no code, not raw */
864
865 if (repr == Seq_repr_seg) /* segmented */
866 {
867 fake.choice = SEQLOC_MIX; /* make SEQUENCE OF Seq-loc,
868 into one */
869 fake.data.ptrvalue = bsp->seq_ext;
870 fake.next = NULL;
871 the_segs = (SeqLocPtr)&fake;
872 }
873 else if (repr == Seq_repr_ref) /* reference: is a Seq-loc
874 */
875 the_segs = (SeqLocPtr)bsp->seq_ext;
876
877 if (repr == Seq_repr_delta) /* chain of deltas to follow */
878 currchunk = (ValNodePtr)(bsp->seq_ext);
879 else /* seqlocs */
880 currchunk = (ValNodePtr)SeqLocFindNext(the_segs, NULL);
881
882 currseg = NULL;
883 ctr = 0;
884 done = FALSE;
885 started = FALSE;
886 while ((! done) && (currchunk != NULL))
887 {
888 do_multi_loc = FALSE;
889 cycle2 = TRUE; /* only really needed for complicated
890 delta seq locs */
891 currseg = NULL;
892 if (repr == Seq_repr_delta)
893 {
894 if (currchunk->choice == 1) /* it's a SeqLocPtr
895 */
896 {
897 currseg =
898 (SeqLocPtr)(currchunk->data.ptrvalue);
899 if (! IS_one_loc(currseg, FALSE)) /*
900 don't do complicated cases here */
901 {
902 do_multi_loc = TRUE;
903 currseg =
904 SeqLocFindNext((SeqLocPtr)(currchunk->data.ptrvalue), NULL);
905 }
906 }
907 else /* it's a SeqLitPtr
908 */
909 {
910 currseg = NULL;
911 slitp =
912 (SeqLitPtr)(currchunk->data.ptrvalue);
913 tlen = slitp->length;
914 tstrand = Seq_strand_plus;
915 tfrom = 0;
916 tto = tlen - 1;
917 }
918 }
919 else
920 currseg = (SeqLocPtr)currchunk;
921
922 while (cycle2) /* normally once, except for
923 complicated delta locs */
924 {
925 if (currseg != NULL) /* for segs and deltas of
926 type loc */
927 {
928 tlen = SeqLocLen(currseg);
929 tstrand = SeqLocStrand(currseg);
930 tfrom = SeqLocStart(currseg);
931 tto = SeqLocStop(currseg);
932 }
933
934 if (! started)
935 {
936 if ((ctr + tlen - 1) >= start)
937 {
938 tstart = start - ctr;
939 started = TRUE;
940 }
941 else
942 tstart = -1;
943 }
944 else
945 tstart = 0;
946
947 if (tstart >= 0) /* have a start */
948 {
949 if ((ctr + tlen - 1) >= stop)
950 {
951 done = TRUE; /* hit the end */
952 tstop = ((ctr + tlen - 1) -
953 stop);
954 }
955 else
956 tstop = 0;
957
958 if (tstrand == Seq_strand_minus)
959 {
960 xfrom = tfrom + tstop;
961 xto = tto - tstart;
962 }
963 else
964 {
965 xfrom = tfrom + tstart;
966 xto = tto - tstop;
967 }
968
969 if (currseg != NULL) /* working off locs */
970 {
971 if (currseg->choice == SEQLOC_NULL)
972 {
973 tbsp = NULL;
974 spps = SeqPortNew(tbsp, xfrom, xto, tstrand, newcode);
975 spps->isa_null = TRUE;
976 }
977 else
978 {
979 tsip = SeqLocId(currseg);
980 tbsp = BioseqLockById(tsip);
981 if (tbsp != NULL)
982 spps = SeqPortNew(tbsp, xfrom, xto, tstrand, newcode);
983 else
984 {
985 spps = NULL;
986 if (tsip != NULL)
987 SeqIdWrite(tsip, idbuf, PRINTID_FASTA_SHORT, 40);
988 else
989 StringMove(idbuf,"seqid=NULL");
990 ErrPostEx(SEV_ERROR,0,0,
991 "SeqPortNew: %s could not find component %s",
992 errbuf, idbuf);
993 return SeqPortFree(spp);
994 }
995 }
996
997 }
998 else
999 {
1000 spps = (SeqPortPtr) MemNew(sizeof(SeqPort));
1001 SeqPortSetUpFields (spps, xfrom,
1002 xto, tstrand, newcode);
1003 SeqPortSetUpAlphabet(spps,
1004 slitp->seq_data_type, newcode);
1005 if (slitp->seq_data != NULL)
1006 spps->bp = (ByteStorePtr)
1007 slitp->seq_data;
1008 else
1009 {
1010 spps->isa_virtual = TRUE;
1011 if (slitp->length == 0)
1012 spps->isa_null = TRUE;
1013 else
1014 { /* default for delta gaps */
1015 spps->do_virtual = TRUE;
1016 }
1017
1018 }
1019 }
1020
1021 if (spps == NULL)
1022 {
1023 ErrPostEx(SEV_ERROR,0,0,
1024 "SeqPortNew: %s unexpected null during recursion",
1025 errbuf);
1026 return SeqPortFree(spp);
1027 }
1028
1029 if (currseg != NULL)
1030 spps->locked = TRUE;
1031
1032 if (sppcurr == NULL)
1033 spp->segs = spps;
1034 else
1035 sppcurr->next = spps;
1036 sppcurr = spps;
1037 }
1038
1039 ctr += tlen;
1040
1041 if (! do_multi_loc)
1042 cycle2 = FALSE;
1043 else
1044 {
1045 currseg =
1046 SeqLocFindNext((SeqLocPtr)(currchunk->data.ptrvalue), currseg);
1047 if (currseg == NULL)
1048 cycle2 = FALSE;
1049 }
1050 }
1051
1052 if (repr == Seq_repr_delta)
1053 currchunk = currchunk->next;
1054 else
1055 currchunk = SeqLocFindNext(the_segs, currchunk);
1056 }
1057 if (strand == Seq_strand_minus) /* reverse seqport order */
1058 {
1059 prev = spp->segs;
1060 spprev = spp->segs;
1061 spp->segs = NULL;
1062 sppcurr = NULL;
1063 while (prev != NULL)
1064 {
1065 curr = spprev;
1066 prev = NULL;
1067 while (curr->next != NULL) /* end of chain */
1068 {
1069 prev = curr;
1070 curr = curr->next;
1071 }
1072 if (prev != NULL)
1073 prev->next = NULL;
1074 if (sppcurr == NULL)
1075 spp->segs = curr;
1076 else
1077 sppcurr->next = curr;
1078 sppcurr = curr;
1079 }
1080 curr->next = NULL; /* last one in chain */
1081 }
1082 spp->curr = spp->segs;
1083
1084 if (! started) /* nothing found */
1085 {
1086 ErrPostEx(SEV_ERROR,0,0,"SeqPortNew: no data found for %s",
1087 errbuf);
1088 return SeqPortFree(spp);
1089 }
1090 }
1091 else if ((repr == Seq_repr_raw) || /* sequence not by reference */
1092 (repr == Seq_repr_const))
1093 {
1094 curr_code = BioseqGetCode(bsp);
1095
1096 SeqPortSetUpAlphabet(spp, curr_code, newcode);
1097 spp->bp = (ByteStorePtr) bsp->seq_data;
1098
1099 /* allocate fast lookup caches for 2na or 4na to iupacna or 4na conversion */
1100
1101 if ((newcode == Seq_code_iupacna || newcode == Seq_code_ncbi4na) &&
1102 (curr_code == Seq_code_ncbi2na || curr_code == Seq_code_ncbi4na)) {
1103 spp->cacheq = (SPCacheQPtr) MemNew (sizeof (SPCacheQ));
1104 }
1105
1106 }
1107
1108 SeqPortAdjustLength (spp);
1109 SeqPortSeek(spp, 0, SEEK_SET);
1110 return spp;
1111 }
1112
1113 /*****************************************************************************
1114 *
1115 * SeqPortNewByLoc(loc, code)
1116 * builds a new seqport based on a SeqLoc
1117 *
1118 *****************************************************************************/
SeqPortNewByLoc(SeqLocPtr loc,Uint1 code)1119 NLM_EXTERN SeqPortPtr SeqPortNewByLoc (SeqLocPtr loc, Uint1 code)
1120
1121 {
1122 BioseqPtr bsp = NULL;
1123 SeqPortPtr spp = NULL, sppcurr, spps;
1124 Int4 start = 0, stop = 0;
1125 Uint1 strand = Seq_strand_unknown;
1126 SeqLocPtr currloc = NULL;
1127 CharPtr locptr, currlocptr;
1128
1129 if (loc == NULL)
1130 return spp;
1131
1132 /* get the needed components */
1133
1134 switch (loc->choice)
1135 {
1136 case SEQLOC_INT: /* int */
1137 case SEQLOC_PNT: /* pnt */
1138 case SEQLOC_PACKED_PNT: /* packed-pnt */
1139 start = SeqLocStart(loc);
1140 stop = SeqLocStop(loc);
1141 strand = SeqLocStrand(loc);
1142 case SEQLOC_WHOLE: /* whole */
1143 bsp = BioseqLockById(SeqLocId(loc)); /* need the bioseq
1144 now */
1145 if (bsp == NULL)
1146 return NULL; /* can't do it */
1147 }
1148
1149
1150
1151 switch (loc->choice)
1152 {
1153 case SEQLOC_EMPTY: /* empty */
1154 case SEQLOC_EQUIV: /* equiv */
1155 case SEQLOC_BOND: /* bond */
1156 break;
1157
1158 case SEQLOC_NULL: /* null */
1159 spp = SeqPortNew(NULL, FIRST_RESIDUE, LAST_RESIDUE, 0,
1160 code);
1161 spp->isa_null = TRUE;
1162 break;
1163
1164 case SEQLOC_WHOLE: /* whole */
1165 spp = SeqPortNew(bsp, FIRST_RESIDUE, LAST_RESIDUE, 0, code);
1166 if (spp != NULL)
1167 spp->locked = TRUE;
1168 else
1169 BioseqUnlock(bsp);
1170 break;
1171
1172 case SEQLOC_INT: /* int */
1173 case SEQLOC_PNT: /* pnt */
1174 case SEQLOC_PACKED_PNT: /* packed-pnt */
1175 spp = SeqPortNew(bsp, start, stop, strand, code);
1176 if (spp != NULL)
1177 spp->locked = TRUE;
1178 else
1179 BioseqUnlock(bsp);
1180 break;
1181
1182 case SEQLOC_PACKED_INT: /* packed seqint */
1183 case SEQLOC_MIX: /* mix */
1184 spp = (SeqPortPtr) MemNew(sizeof(SeqPort));
1185 spp->totlen = SeqLocLen(loc);
1186 spp->start = 0;
1187 spp->stop = spp->totlen - 1;
1188 spp->curpos = -1; /* not set */
1189 spp->currnum = NULL; /* use numbering from parts */
1190 currloc = NULL;
1191 sppcurr = NULL;
1192 while ((currloc = SeqLocFindNext(loc, currloc)) != NULL)
1193 {
1194 spps = SeqPortNewByLoc(currloc, code);
1195 if (spps == NULL)
1196 {
1197 locptr = SeqLocPrint(loc);
1198 currlocptr = SeqLocPrint(currloc);
1199 ErrPostEx(SEV_ERROR, 0,0 ,
1200 "SeqPortNewByLoc unexpected null during recursion [loc=%s][curr=%s]",
1201 locptr, currlocptr);
1202 MemFree(locptr);
1203 MemFree(currlocptr);
1204 SeqPortFree(spp);
1205 return NULL;
1206 }
1207 if (sppcurr == NULL)
1208 spp->segs = spps;
1209 else
1210 sppcurr->next = spps;
1211 sppcurr = spps;
1212 }
1213 spp->curr = spp->segs;
1214 break;
1215 case SEQLOC_FEAT:
1216 ErrPostEx(SEV_ERROR, 0,0 ,
1217 "SeqLocNewByLoc: Seq-loc.feat not supported");
1218 break;
1219 }
1220
1221 SeqPortAdjustLength (spp);
1222 SeqPortSeek(spp, 0, SEEK_SET);
1223
1224 return spp;
1225 }
1226
1227 /*****************************************************************************
1228 *
1229 * SeqPortSeek(spp, offset, origin)
1230 * works like fseek()
1231 * returns 0 on success (weird but true)
1232 * non-zero on fail
1233 * uses coordinates 0-(len - 1) no matter what region seqport covers
1234 *
1235 *
1236 *****************************************************************************/
/* ClearQCache (spp, sp)
*    Marks the quick-lookup cache (if the port has one) as empty, records sp
*    as the current position and sets the pending byte to SEQPORT_EOF.
*    spp must not be NULL.
*/
static void ClearQCache (SeqPortPtr spp, Int4 sp)

{
  SPCacheQPtr  quick = spp->cacheq;

  spp->byte = SEQPORT_EOF;
  spp->curpos = sp;

  if (quick == NULL) return;

  /* zeroed counters make the next read refill the buffer */
  quick->total = 0;
  quick->ctr = 0;
}
1250
SeqPortSeek(SeqPortPtr spp,Int4 offset,Int2 origin)1251 NLM_EXTERN Int2 SeqPortSeek (SeqPortPtr spp, Int4 offset, Int2 origin)
1252
1253 {
1254 Int4 sp, curpos, left, pos, lim, diff;
1255 Boolean plus_strand;
1256 Uint1 the_byte, the_residue;
1257 Int2 bitctr;
1258 SeqPortPtr curspp;
1259 Uint1Ptr buf;
1260 SPCachePtr spcp;
1261
1262 if (spp == NULL)
1263 return 1;
1264
1265 spp->eos = FALSE; /* unset flag set when moving off segment */
1266
1267 /* get position as positive offset from 0 */
1268 if (spp->strand == Seq_strand_minus)
1269 plus_strand = FALSE;
1270 else
1271 plus_strand = TRUE;
1272
1273 sp = spp->curpos; /* current offset, 0 - (totlen - 1) */
1274 switch (origin)
1275 {
1276 case SEEK_SET:
1277 spp->backing = FALSE; /* reset.. not backing */
1278 if ((offset > spp->totlen) || (offset < 0)) {
1279 ClearQCache (spp, sp);
1280 return 1;
1281 }
1282 sp = offset;
1283 break;
1284 case SEEK_CUR:
1285 if (((sp + offset) > spp->totlen) ||
1286 ((sp + offset) < 0 ))
1287 {
1288 /** check for reverse complement backing **/
1289 if ((sp + offset < 0) && (offset == -2))
1290 {
1291 if (spp->backing == 1)
1292 {
1293 ClearQCache(spp, -1);
1294 spp->eos = TRUE; /* note backing off segment */
1295 return 0;
1296 }
1297 if (spp->curpos == -1) /* not set */
1298 return 0;
1299 }
1300
1301 if (! spp->is_circle) {
1302 ClearQCache (spp, sp);
1303 return 1;
1304 }
1305 }
1306 else
1307 sp += offset;
1308 if (spp->is_circle)
1309 {
1310 while (sp >= spp->totlen) /* circle adjustments */
1311 sp -= spp->totlen;
1312 while (sp < 0)
1313 sp += spp->totlen;
1314 }
1315 break;
1316 case SEEK_END:
1317 if ((ABS(offset) > spp->totlen) || (offset > 0)) {
1318 ClearQCache (spp, sp);
1319 return 1;
1320 }
1321 sp = spp->totlen + offset;
1322 break;
1323 default:
1324 ClearQCache (spp, sp);
1325 return 1;
1326 }
1327
1328 if (sp == spp->curpos) /* already in right position */
1329 return 0;
1330
1331 if (sp == spp->totlen) /* seek to EOF */
1332 {
1333 spp->curpos = sp;
1334 spp->byte = SEQPORT_EOF; /* set to nothing */
1335 ClearQCache (spp, sp);
1336 return 0;
1337 }
1338
1339 if (spp->oldcode) /* has data, is raw or const type */
1340 {
1341
1342 /* if 2na or 4na to iupacna, now only need fast lookup caches */
1343 if (spp->cacheq != NULL) {
1344 ClearQCache (spp, sp);
1345 return 0; /* bypass remaining code */
1346 }
1347
1348 /* original code using cache direct from byte store */
1349
1350 if (spp->cache == NULL) /* allocate a cache */
1351 spp->cache = (SPCachePtr)MemNew(sizeof(SPCache));
1352 spcp = spp->cache;
1353 buf = spcp->buf;
1354
1355 if (plus_strand)
1356 {
1357 curpos = sp + spp->start;
1358 pos = curpos / (Int4) (spp->bc);
1359 lim = spp->stop / (Int4) (spp->bc);
1360 diff = lim - pos + 1;
1361 if (diff > 100)
1362 {
1363 diff = 100;
1364 lim = pos + diff - 1;
1365 }
1366 BSSeek(spp->bp, pos, SEEK_SET);
1367 spcp->total = (Int2) BSRead(spp->bp, (VoidPtr)buf,
1368 diff);
1369 spcp->ctr = 0;
1370 spp->bytepos = lim;
1371 }
1372 else
1373 {
1374 curpos = spp->stop - sp;
1375 pos = curpos / (Int4) (spp->bc);
1376 lim = spp->start / (Int4) (spp->bc);
1377 diff = pos - lim + 1;
1378 if (diff > 100)
1379 {
1380 diff = 100;
1381 lim = pos - diff + 1;
1382 }
1383 BSSeek(spp->bp, lim, SEEK_SET);
1384 spcp->total = (Int2) BSRead(spp->bp, (VoidPtr)buf,
1385 diff);
1386 spcp->ctr = (Int2)(diff - 1);
1387 spp->bytepos = lim;
1388 }
1389 left = curpos % (Int4) (spp->bc);
1390 the_byte = spcp->buf[spcp->ctr];
1391 if ((plus_strand) || (spp->bc == 1))
1392 the_residue = the_byte;
1393 else /* reverse compressed bit orders */
1394 {
1395 left = spp->bc - 1 - left;
1396 the_residue = 0;
1397 bitctr = spp->bc;
1398 while (bitctr)
1399 {
1400 the_residue |= the_byte & spp->mask;
1401 bitctr--;
1402 if (bitctr)
1403 {
1404 the_residue >>= spp->lshift;
1405 the_byte <<= spp->lshift;
1406 }
1407 }
1408 }
1409 bitctr = spp->bc;
1410 while (left)
1411 {
1412 the_residue <<= spp->lshift;
1413 left--; bitctr--;
1414 }
1415 spp->byte = the_residue;
1416 spp->bitctr = (Uint1) bitctr;
1417 spp->curpos = sp;
1418 return 0;
1419 }
1420 else if ((spp->isa_virtual) || (spp->isa_null)) /* virtual or NULL */
1421 {
1422 spp->curpos = sp;
1423 return 0;
1424 }
1425 else /* segmented, reference sequences */
1426 {
1427
1428 if (spp->backing == 1) /* check for backing off segment */
1429 {
1430 if ((spp->curr->curpos == 1) &&
1431 (! spp->curr->backing)) /* yup */
1432 {
1433 spp->curr->curpos = -1; /* just set the flag */
1434 spp->curpos -= 2;
1435 return 0; /* no eos needed, -1
1436 will do */
1437 }
1438 }
1439
1440 curpos = 0;
1441 curspp = spp->segs;
1442 if (curspp == NULL) return 1;
1443 while ((curpos + curspp->totlen) <= sp)
1444 {
1445 curpos += curspp->totlen;
1446 curspp = curspp->next;
1447 if (curspp == NULL)
1448 return 1;
1449 }
1450 if (plus_strand)
1451 curpos = sp - curpos;
1452 else
1453 curpos = (curspp->totlen - 1) - (sp - curpos);
1454 curspp->backing = spp->backing;
1455 if (! SeqPortSeek(curspp, curpos, SEEK_SET))
1456 {
1457 curspp->backing = FALSE;
1458 spp->curr = curspp;
1459 spp->curpos = sp;
1460 return 0;
1461 }
1462 else
1463 {
1464 curspp->backing = FALSE;
1465 return 1;
1466 }
1467 }
1468 }
1469
1470 /*****************************************************************************
1471 *
1472 * Int4 SeqPortTell(spp)
1473 *
1474 *****************************************************************************/
SeqPortTell(SeqPortPtr spp)1475 NLM_EXTERN Int4 SeqPortTell (SeqPortPtr spp)
1476
1477 {
1478 if (spp == NULL)
1479 return -1L;
1480
1481 return spp->curpos;
1482 }
1483
/*****************************************************************************
*
*   SeqPortGetResidue(spp)
*       returns residue at current location in requested coding
*       SEQPORT_EOF = end of file
*
*****************************************************************************/
SeqPortQuickGetResidue(SeqPortPtr spp,SPCacheQPtr spcpq,Boolean plus_strand)1491 static Uint1 LIBCALL SeqPortQuickGetResidue (SeqPortPtr spp, SPCacheQPtr spcpq, Boolean plus_strand)
1492
1493 {
1494 Uint1 bytes [100];
1495 Int4 curpos, pos, lim, diff;
1496 CharPtr ptr;
1497 Uint1 residue = INVALID_RESIDUE;
1498 Int2 total, i, j;
1499
1500 if (spp == NULL || spcpq == NULL) return INVALID_RESIDUE;
1501
1502 if (spp->curpos == spp->totlen) return SEQPORT_EOF;
1503
1504 if (spp->curpos < spp->totlen) {
1505
1506 if (spcpq->ctr >= spcpq->total) {
1507
1508 /* read next buffer of bytes */
1509
1510 if (plus_strand) {
1511
1512 curpos = spp->curpos + spp->start;
1513 pos = curpos / (Int4) (spp->bc);
1514 lim = spp->stop / (Int4) (spp->bc);
1515 diff = lim - pos + 1;
1516 if (diff > 100) {
1517 diff = 100;
1518 lim = pos + diff - 1;
1519 }
1520 BSSeek (spp->bp, pos, SEEK_SET);
1521 total = (Int2) BSRead (spp->bp, (VoidPtr) bytes, diff);
1522 spp->bytepos = lim;
1523
1524 } else {
1525
1526 curpos = spp->stop - spp->curpos;
1527 pos = curpos / (Int4) (spp->bc);
1528 lim = spp->start / (Int4) (spp->bc);
1529 diff = pos - lim + 1;
1530 if (diff > 100) {
1531 diff = 100;
1532 lim = pos - diff + 1;
1533 }
1534 BSSeek (spp->bp, lim, SEEK_SET);
1535 total = (Int2) BSRead (spp->bp, (VoidPtr) bytes, diff);
1536 spp->bytepos = lim;
1537
1538 }
1539
1540 /* buffer is not null terminated, so uses special copy function */
1541
1542 ptr = spcpq->buf;
1543
1544 if (spp->newcode == Seq_code_iupacna) {
1545 if (spp->oldcode == Seq_code_ncbi2na) {
1546 ptr = (CharPtr) MapNa2ByteToIUPACString (bytes, (Uint4Ptr) ptr, total);
1547 } else if (spp->oldcode == Seq_code_ncbi4na) {
1548 ptr = (CharPtr) MapNa4ByteToIUPACString (bytes, (Uint2Ptr) ptr, total);
1549 }
1550 } else if (spp->newcode == Seq_code_ncbi4na) {
1551 if (spp->oldcode == Seq_code_ncbi2na) {
1552 ptr = (CharPtr) MapNa2ByteTo4BitString (bytes, (Uint4Ptr) ptr, total);
1553 } else if (spp->oldcode == Seq_code_ncbi4na) {
1554 ptr = (CharPtr) MapNa4ByteTo4BitString (bytes, (Uint2Ptr) ptr, total);
1555 }
1556 }
1557
1558 spcpq->total = ptr - spcpq->buf;
1559 spcpq->ctr = 0;
1560
1561 /* deal with end conditions */
1562
1563 if (plus_strand) {
1564 spcpq->ctr += (curpos % (Int4) (spp->bc));
1565 if (lim == (spp->stop / (Int4) (spp->bc))) {
1566 diff = (spp->stop + 1) % (Int4) (spp->bc);
1567 if (diff > 0) {
1568 spcpq->total -= (Int4) (spp->bc) - diff;
1569 }
1570 }
1571 } else {
1572 if (pos == (curpos / (Int4) (spp->bc))) {
1573 diff = (curpos + 1) % (Int4) (spp->bc);
1574 if (diff > 0) {
1575 spcpq->total -= (Int4) (spp->bc) - diff;
1576 }
1577 }
1578 if (lim == (spp->start / (Int4) (spp->bc))) {
1579 spcpq->ctr += (spp->start) % (Int4) (spp->bc);
1580 }
1581
1582 /* reverse complement */
1583
1584 for (i = spcpq->ctr, j = spcpq->total - 1; i < j; i++, j--) {
1585 residue = spcpq->buf [i];
1586 spcpq->buf [i] = spcpq->buf [j];
1587 spcpq->buf [j] = residue;
1588 }
1589 for (i = spcpq->ctr; i < spcpq->total; i++) {
1590 residue = spcpq->buf [i];
1591 spcpq->buf [i] = SeqCodeTableComp (spp->sctp, residue);
1592 }
1593
1594 }
1595
1596 }
1597
1598 /* now get residue directly from uncompressed buffer */
1599
1600 residue = spcpq->buf [spcpq->ctr];
1601 spcpq->ctr++;
1602 }
1603
1604 spp->curpos++;
1605
1606 return residue;
1607 }
1608
SeqPortGetResidue(SeqPortPtr spp)1609 NLM_EXTERN Uint1 LIBCALL SeqPortGetResidue (SeqPortPtr spp)
1610
1611 {
1612 Uint1 residue = INVALID_RESIDUE, the_byte, the_residue, the_code;
1613 Boolean plus_strand = TRUE, moveup;
1614 Int2 bitctr, index;
1615 Int4 pos, lim, diff;
1616 SPCachePtr spcp;
1617 SeqPortPtr tmp, prev;
1618 SPCacheQPtr spcpq;
1619
1620 if (spp != NULL)
1621 spp->backing = FALSE; /* clear it on read */
1622
1623 if (spp != NULL && spp->cacheq != NULL && spp->curpos < spp->totlen) {
1624 spcpq = spp->cacheq;
1625 if (spcpq->ctr < spcpq->total) {
1626 residue = spcpq->buf [spcpq->ctr];
1627 spcpq->ctr++;
1628 spp->curpos++;
1629 return residue;
1630 }
1631 }
1632
1633 if ((spp == NULL) || ((spp->bp == NULL) && (spp->oldcode)))
1634 return SEQPORT_EOF;
1635
1636 if (spp->isa_null) { /* NULL interval */
1637 spp->eos = TRUE; /* moving off the segment */
1638 return SEQPORT_VIRT;
1639 }
1640
1641 if (spp->eos) /* end of reverse complement spp */
1642 return SEQPORT_EOF;
1643
1644 if (spp->curpos == spp->totlen)
1645 {
1646 if (spp->is_circle)
1647 {
1648 SeqPortSeek(spp, 0, SEEK_SET); /* go to start */
1649 if (spp->is_seg) /* give EOS? */
1650 return SEQPORT_EOS;
1651 }
1652 else
1653 return SEQPORT_EOF; /* EOF really */
1654 }
1655
1656 if (spp->curpos == -1) /* backed off end */
1657 {
1658 if (spp->is_circle)
1659 {
1660 SeqPortSeek(spp, -1, SEEK_END); /* go to end */
1661 if (spp->is_seg) /* give EOS? */
1662 return SEQPORT_EOS;
1663 }
1664 else
1665 return SEQPORT_EOF; /* EOF really */
1666 }
1667
1668 if (spp->strand == Seq_strand_minus)
1669 plus_strand = FALSE;
1670
1671 if (spp->oldcode) /* its a raw or const sequence */
1672 {
1673
1674 /* separate function for quick lookup to avoid cluttering old code */
1675 if (spp->cacheq != NULL) {
1676 return SeqPortQuickGetResidue (spp, spp->cacheq, plus_strand);
1677 }
1678
1679 residue = spp->byte & spp->mask;
1680 residue >>= spp->rshift;
1681 spp->byte <<= spp->lshift;
1682 spp->bitctr--;
1683 if (spp->curpos < (spp->totlen - 1)) /* curpos not incremented yet */
1684 {
1685 if (spp->bitctr == 0)
1686 {
1687 spcp = spp->cache;
1688 if (! plus_strand) /* need previous byte */
1689 {
1690 spcp->ctr--;
1691 if (spcp->ctr < 0)
1692 {
1693 pos = spp->bytepos - 1;
1694 lim = spp->start /
1695 (Int4)(spp->bc);
1696 diff = pos - lim + 1;
1697 if (diff > 100)
1698 {
1699 diff = 100;
1700 lim = pos - 100 + 1;
1701 }
1702 BSSeek(spp->bp, lim, SEEK_SET);
1703 spcp->total =
1704 (Int2)BSRead(spp->bp, (VoidPtr)(spcp->buf), diff);
1705 spcp->ctr = (Int2)(diff - 1);
1706 spp->bytepos = lim;
1707 }
1708 }
1709 else /* need next
1710 byte */
1711 {
1712 spcp->ctr++;
1713 if (spcp->ctr >= spcp->total)
1714 {
1715 pos = spp->bytepos + 1;
1716 lim = spp->stop /
1717 (Int4)(spp->bc);
1718 diff = lim - pos + 1;
1719 if (diff > 100)
1720 {
1721 diff = 100;
1722 lim = pos + diff - 1;
1723 }
1724 BSSeek(spp->bp, pos, SEEK_SET);
1725 spcp->total =
1726 (Int2)BSRead(spp->bp, (VoidPtr)(spcp->buf), diff);
1727 spcp->ctr = 0;
1728 spp->bytepos = lim;
1729 }
1730 }
1731 the_byte = spcp->buf[spcp->ctr];
1732
1733 if ((plus_strand) || (spp->bc == 1))
1734 the_residue = the_byte;
1735 else /* reverse compressed bit orders */
1736 {
1737 the_residue = 0;
1738 bitctr = spp->bc;
1739 while (bitctr)
1740 {
1741 the_residue |= the_byte & spp->mask;
1742 bitctr--;
1743 if (bitctr)
1744 {
1745 the_residue >>= spp->lshift;
1746 the_byte <<= spp->lshift;
1747 }
1748 }
1749 }
1750 spp->byte = the_residue;
1751 spp->bitctr = spp->bc;
1752 }
1753 }
1754
1755 if (spp->smtp == NULL) /* no conversion, check now */
1756 {
1757 if (spp->sctp != NULL) {
1758 index = (Int2)residue - (Int2)(spp->sctp->start_at);
1759 if ((index < 0) || (index >= (Int2)(spp->sctp->num)))
1760 residue = INVALID_RESIDUE;
1761 else if (*(spp->sctp->names[index]) == '\0')
1762 residue = INVALID_RESIDUE;
1763 } else {
1764 residue = INVALID_RESIDUE;
1765 }
1766 }
1767 }
1768 else if (spp->isa_virtual) /* virtual */
1769 {
1770 if (spp->do_virtual)
1771 {
1772 if (spp->newcode)
1773 the_code = spp->newcode;
1774 else
1775 the_code = spp->oldcode;
1776 if (spp->gapIsZero && the_code == Seq_code_ncbi4na) {
1777 residue = 0;
1778 } else {
1779 residue = GetGapCode (the_code);
1780 }
1781 spp->curpos++;
1782 return residue;
1783 }
1784 else
1785 {
1786 spp->curpos++;
1787 return SEQPORT_VIRT;
1788 }
1789 }
1790 else /* segmented or reference sequence */
1791 {
1792 residue = SeqPortGetResidue(spp->curr);
1793 while (! IS_residue(residue))
1794 {
1795 /* spp->curr->eos = FALSE; just in case was set */
1796 moveup = FALSE;
1797
1798 switch (residue)
1799 {
1800 case SEQPORT_VIRT:
1801 case SEQPORT_EOS:
1802 if (spp->curr->segs == NULL) /* this
1803 did not come up a layer */
1804 moveup = TRUE;
1805 break;
1806 case SEQPORT_EOF:
1807 moveup = TRUE;
1808 break;
1809 default:
1810 break;
1811 }
1812
1813 if (moveup)
1814 {
1815 if ((spp->curr->curpos == -1) && (!
1816 spp->curr->eos)) /* moving backwards, many layers deep */
1817 {
1818 prev = NULL;
1819 for (tmp = spp->segs; tmp != spp->curr;
1820 tmp = tmp->next)
1821 prev = tmp;
1822 if (prev != NULL)
1823 spp->curr = prev;
1824 else if (spp->is_circle) /* go to end
1825 */
1826 {
1827 for (tmp = spp->segs; tmp->next
1828 != NULL; tmp = tmp->next)
1829 continue;
1830 spp->curr = tmp;
1831 }
1832 else
1833 return SEQPORT_EOF;
1834
1835 if (! plus_strand)
1836 SeqPortSeek(spp->curr, 0,
1837 SEEK_SET);
1838 else if (! (spp->curr->isa_null))
1839 SeqPortSeek(spp->curr, -1,
1840 SEEK_END);
1841 else
1842 spp->curr->curpos = -1; /*
1843 flag the null for next time around */
1844 }
1845 else /* moving
1846 forwards */
1847 {
1848 if (spp->curr->next != NULL)
1849 spp->curr = spp->curr->next;
1850 else if (spp->is_circle)
1851 spp->curr = spp->segs;
1852 else
1853 return SEQPORT_EOF;
1854
1855 if (plus_strand)
1856 SeqPortSeek(spp->curr, 0,
1857 SEEK_SET);
1858 else
1859 SeqPortSeek(spp->curr, -1,
1860 SEEK_END);
1861 }
1862
1863 if (spp->is_seg)
1864 return SEQPORT_EOS;
1865 }
1866
1867 if ((residue == SEQPORT_VIRT) || (residue ==
1868 INVALID_RESIDUE))
1869 return residue;
1870 residue = SeqPortGetResidue(spp->curr);
1871 }
1872
1873 if (! plus_strand)
1874 {
1875 spp->curr->backing++; /* signal we are backing
1876 up */
1877 if (SeqPortSeek(spp->curr, -2, SEEK_CUR)) /* back up to "next" */
1878 spp->curr->eos = TRUE;
1879
1880 }
1881 }
1882
1883 if (spp->smtp != NULL)
1884 residue = SeqMapTableConvert(spp->smtp, residue);
1885
1886 if (! plus_strand)
1887 residue = SeqCodeTableComp(spp->sctp, residue);
1888
1889 spp->curpos++;
1890 return residue;
1891 }
1892
1893 /*****************************************************************************
1894 *
1895 * GetGapCode(seqcode)
1896 * returns code to use for virtual sequence residues for sequence
1897 * code seqcode
1898 * returns INVALID_RESIDUE if seqcode invalid
1899 *
1900 *****************************************************************************/
GetGapCode(Uint1 seqcode)1901 NLM_EXTERN Uint1 GetGapCode (Uint1 seqcode)
1902 {
1903 Uint1 residue = INVALID_RESIDUE;
1904
1905 switch (seqcode)
1906 {
1907 case Seq_code_iupacna:
1908 residue = 'N';
1909 break;
1910 case Seq_code_iupacaa:
1911 case Seq_code_ncbieaa:
1912 residue = 'X';
1913 break;
1914 case Seq_code_ncbi2na: /* there isn't ambiguity */
1915 break;
1916 case Seq_code_ncbi8na:
1917 case Seq_code_ncbi4na:
1918 residue = 15;
1919 break;
1920 case Seq_code_iupacaa3: /* no 1 letter character */
1921 case Seq_code_ncbipna:
1922 case Seq_code_ncbipaa:
1923 break;
1924 case Seq_code_ncbistdaa:
1925 residue = 21;
1926 break;
1927
1928 }
1929
1930 return residue;
1931 }
1932
1933
1934 /*****************************************************************************
1935 *
1936 * SeqPortRead(spp, buf, len)
1937 * returns bytes read
1938 * if returns a negative number, then ABS(return value) gives the
1939 * same codes as SeqPortGetResidue for EOF or EOS
1940 *
1941 *****************************************************************************/
SeqPortRead(SeqPortPtr spp,Uint1Ptr buf,Int2 len)1942 NLM_EXTERN Int2 LIBCALL SeqPortRead (SeqPortPtr spp, Uint1Ptr buf, Int2 len)
1943
1944 {
1945 Int2 ctr = 0;
1946 Int4 loopmax;
1947 Uint1 retval;
1948 SPCacheQPtr spcpq;
1949
1950 if ((spp == NULL) || (buf == NULL) || (len <= 0))
1951 return 0;
1952
1953 if (spp->lastmsg) /* previous EOF or EOS saved */
1954 {
1955 ctr = spp->lastmsg;
1956 spp->lastmsg = 0;
1957 ctr *= -1;
1958 return ctr;
1959 }
1960
1961 spcpq = spp->cacheq;
1962 while (ctr < len) {
1963 loopmax = 0;
1964 if (spcpq != NULL && spp->curpos < spp->totlen && spcpq->ctr < spcpq->total) {
1965 loopmax = MIN ((spp->totlen - spp->curpos), (spcpq->total - spcpq->ctr));
1966 loopmax = MIN (loopmax, (Int4) (len - ctr));
1967 }
1968 /* loopmax saves multiple comparisons, speeds up significantly */
1969 if (loopmax > 0) {
1970 while (loopmax > 0) {
1971 retval = spcpq->buf [spcpq->ctr];
1972 spcpq->ctr++;
1973 spp->curpos++;
1974 loopmax--;
1975 if (IS_residue (retval)) {
1976 *buf = retval;
1977 buf++;
1978 ctr++;
1979 } else {
1980 if (! ctr) /* first one */
1981 {
1982 ctr = retval; /* send return as negative number */
1983 ctr *= -1;
1984 return ctr;
1985 } else {
1986 spp->lastmsg = retval;
1987 return ctr;
1988 }
1989 }
1990 }
1991 } else {
1992 retval = SeqPortGetResidue(spp);
1993 if (IS_residue(retval))
1994 {
1995 *buf = retval;
1996 buf++;
1997 ctr++;
1998 }
1999 else
2000 {
2001 if (! ctr) /* first one */
2002 {
2003 ctr = retval; /* send return as negative number */
2004 ctr *= -1;
2005 return ctr;
2006 }
2007 else
2008 {
2009 spp->lastmsg = retval;
2010 return ctr;
2011 }
2012 }
2013 }
2014 }
2015 return ctr;
2016 }
2017
2018 /*******************************************************************************
2019 *
2020 * SeqPortStream (bsp, flags, userdata, proc)
2021 * SeqPortStreamInt (bsp, start, stop, strand, flags, userdata, proc)
2022 * SeqPortStreamLoc (slp, flags, userdata, proc)
2023 * Efficient functions to stream through sequence
2024 *
2025 ********************************************************************************/
2026
2027 /* structure for passing common arguments internal functions */
2028
2029 typedef struct streamdata {
2030 StreamFlgType flags;
2031 Pointer userdata;
2032 SeqPortStreamProc proc;
2033 Uint1 letterToComp [256];
2034 CharPtr tmp;
2035 Boolean failed;
2036 Int2 depth;
2037 SeqEntryPtr scope;
2038 } StreamData, PNTR StreamDataPtr;
2039
2040 /* prototype for main internal recursive processing function */
2041
2042 static Int4 SeqPortStreamWork (
2043 BioseqPtr bsp,
2044 Int4 start,
2045 Int4 stop,
2046 Uint1 strand,
2047 StreamDataPtr sdp
2048 );
2049
2050 #define STREAM_GAP_MASK (STREAM_EXPAND_GAPS | GAP_TO_SINGLE_DASH | EXPAND_GAPS_TO_DASHES)
2051
/* SeqPortStreamGap
*    Sends the text for a gap of the given length to sdp->proc and returns
*    the number of characters that count toward the sequence (0 when the
*    gap is suppressed, ignored, or shown as a single marker).
*    The fill character is '+' for virtual or known-length gaps when the
*    matching flag is set, '~' for Seq-data.gap when SEQ_GAP_AS_TILDE is
*    set, otherwise '-'; in plain expand mode it is 'N' (nucleotide) or 'X'.
*    Text is delivered in pieces of at most 4000 characters.
*/
static Int4 SeqPortStreamGap (
  Int4 length,
  Boolean is_na,
  Boolean is_virt,
  Boolean is_known,
  Boolean is_seq_gap,
  StreamDataPtr sdp
)

{
  Boolean  as_dashes, as_letters, as_plus = FALSE, as_single, as_tilde;
  Char     fill = '-';
  Int4     remaining;
  Char     text [4004];

  if (sdp == NULL) return 0;

  as_tilde = (Boolean) ((sdp->flags & SEQ_GAP_AS_TILDE) != 0);

  /* pick the marker character for this kind of gap */

  if (is_virt) {
    if ((sdp->flags & SUPPRESS_VIRT_SEQ) != 0) return 0;
    as_plus = (Boolean) ((sdp->flags & STREAM_VIRT_AS_PLUS) != 0);
  } else if (is_seq_gap) {
    if (as_tilde) {
      fill = '~';
    }
  } else if (is_known) {
    as_plus = (Boolean) ((sdp->flags & KNOWN_GAP_AS_PLUS) != 0);
  }
  if (as_plus) {
    fill = '+';
  }

  as_letters = (Boolean) ((sdp->flags & STREAM_GAP_MASK) == STREAM_EXPAND_GAPS);
  as_single = (Boolean) ((sdp->flags & STREAM_GAP_MASK) == GAP_TO_SINGLE_DASH);
  as_dashes = (Boolean) ((sdp->flags & STREAM_GAP_MASK) == EXPAND_GAPS_TO_DASHES);

  /* with no gap mode requested the gap is skipped entirely */

  if (! (as_letters || as_single || as_dashes || as_tilde)) return 0;

  if (as_single) {

    /* presence marker only: one character, nothing added to the count */

    text [0] = fill;
    text [1] = '\0';

    sdp->proc (text, sdp->userdata);

    return 0;
  }

  /* expanded forms need at least one position to fill */

  if (length < 1) return 0;

  if (! (as_dashes || as_plus || as_tilde)) {
    fill = is_na ? 'N' : 'X';
  }

  remaining = MIN (length, 4000L);
  MemSet ((Pointer) text, fill, remaining);
  text [(int) (Int2) remaining] = '\0';

  remaining = length;
  while (remaining > 0) {

    /* final piece may be shorter than the buffer */

    if (remaining < 4000L) {
      text [(int) (Int2) remaining] = '\0';
    }

    sdp->proc (text, sdp->userdata);

    remaining -= 4000L;
  }

  /* number of N or X or gap characters sent */

  return length;
}
2140
MapNa8ByteToIUPACString(Uint1Ptr bytep,Uint1Ptr buf,Int4 total,Uint1 badchar,SeqMapTablePtr smtp,StreamDataPtr sdp)2141 static Uint1Ptr LIBCALL MapNa8ByteToIUPACString (
2142 Uint1Ptr bytep,
2143 Uint1Ptr buf,
2144 Int4 total,
2145 Uint1 badchar,
2146 SeqMapTablePtr smtp,
2147 StreamDataPtr sdp
2148 )
2149
2150 {
2151 Uint1 ch;
2152 Int4 k;
2153 Uint1Ptr ptr;
2154 Uint1 residue;
2155
2156 if (bytep == NULL || buf == NULL || sdp == NULL) return buf;
2157 ptr = buf;
2158
2159 for (k = 0; k < total; k++) {
2160 residue = *bytep;
2161 if (smtp != NULL) {
2162 ch = SeqMapTableConvert (smtp, residue);
2163 if (ch == INVALID_RESIDUE && (Boolean) ((sdp->flags & STREAM_CORRECT_INVAL) != 0)) {
2164 *ptr = badchar;
2165 } else {
2166 *ptr = ch;
2167 }
2168 } else {
2169 *ptr = residue;
2170 }
2171 bytep++;
2172 ptr++;
2173 }
2174
2175 return ptr;
2176 }
2177
/* SeqPortStreamBlock
*    Reads up to 1000 bytes from bs at byte offset blk, expands them to
*    letters according to alphabet (compress residues per byte), trims the
*    text to the start..stop range, optionally reverse complements it, and
*    sends it to sdp->proc.  Returns the number of characters sent.
*    Returns 0 if bs or sdp is NULL or nothing could be read.  If start lies
*    at or beyond the end of the data actually present in this block,
*    sdp->failed is set and 0 is returned.
*/
static Int4 SeqPortStreamBlock (
  ByteStorePtr bs,
  Int4 blk,
  Int4 compress,
  Uint1 alphabet,
  Uint1 badchar,
  SeqMapTablePtr smtp,
  Int4 start,
  Int4 stop,
  Boolean revcomp,
  StreamDataPtr sdp
)

{
  Uint4     uncomp [1001];  /* 4000 characters + extra for end-of-string null byte */
  Uint4     compr [251];    /* 1000 bytes + extra for safety */
  CharPtr   buf;
  Uint1Ptr  bytes;
  Char      ch;
  Int4      count = 0, cumulative, total;
  Int2      from, to;
  Boolean   many_dashes, single_dash;
  CharPtr   nd, ptr, str, tmp;

  if (bs == NULL || sdp == NULL) return 0;

  /* Uint4 arrays ensure 4-byte address alignment by the compiler */

  buf = (CharPtr) uncomp;
  bytes = (Uint1Ptr) compr;

  BSSeek (bs, blk, SEEK_SET);

  total = BSRead (bs, (VoidPtr) bytes, 1000L);
  if (total < 1) return 0;

  ptr = buf;
  switch (alphabet) {
    case Seq_code_ncbi2na :
      ptr = (CharPtr) MapNa2ByteToIUPACString (bytes, (Uint4Ptr) ptr, total);
      break;
    case Seq_code_ncbi4na :
      single_dash = (Boolean) ((sdp->flags & STREAM_GAP_MASK) == GAP_TO_SINGLE_DASH);
      many_dashes = (Boolean) ((sdp->flags & STREAM_GAP_MASK) == EXPAND_GAPS_TO_DASHES);
      if (single_dash || many_dashes) {
        ptr = (CharPtr) MapNa4ByteToIUPACplusGapString (bytes, (Uint2Ptr) ptr, total);
      } else {
        ptr = (CharPtr) MapNa4ByteToIUPACString (bytes, (Uint2Ptr) ptr, total);
      }
      break;
    default :
      ptr = (CharPtr) MapNa8ByteToIUPACString (bytes, (Uint1Ptr) ptr, total, badchar, smtp, sdp);
      break;
  }
  *ptr = '\0';

  /* sequence coordinate of the first letter in this block */

  cumulative = blk * compress;

  /* from here on total is the number of letters produced */

  total = ptr - buf;

  /* check for bsp->length > actual raw data; cumulative + total is one
     past the last letter available, so equality is also out of range */

  if (start >= cumulative + total) {
    sdp->failed = TRUE;
    return 0;
  }

  /* deal with end conditions */

  from = 0;
  if (start > cumulative) {
    from += start - cumulative;
  }

  if (stop < cumulative + total) {
    to = (Int2) (stop - cumulative + 1);
    buf [to] = '\0';
  }

  str = buf + from;

  if (revcomp) {

    /* reverse string first - middle base not touched, so cannot also complement here */

    nd = str;
    while (*nd != '\0') {
      nd++;
    }
    nd--;

    tmp = str;
    while (nd > tmp) {
      ch = *nd;
      *nd = *tmp;
      *tmp = ch;
      nd--;
      tmp++;
    }

    /* now complement every base in string */

    nd = str;
    ch = *nd;
    while (ch != '\0') {
      *nd = sdp->letterToComp [(int) (Uint1) ch];
      nd++;
      ch = *nd;
    }

  }

  /* send characters to stream callback */

  sdp->proc (str, sdp->userdata);

  /* return number of characters sent */

  tmp = str;
  while (*tmp != '\0') {
    count++;
    tmp++;
  }

  return count;
}
2308
/* SeqPortStreamRaw
*    Streams residues start..stop of a raw or const Bioseq to the callback
*    in sdp, returning the number of characters sent.  A Seq-data.gap body
*    is delegated to SeqPortStreamGap.  Otherwise the byte store is walked
*    in 1000-byte blocks, last block first and reverse complemented when a
*    nucleotide is requested on the minus strand.
*    Returns 0 for NULL arguments, other representations, a missing byte
*    store, or when no map table exists between the stored and target codes.
*/
static Int4 SeqPortStreamRaw (
  BioseqPtr bsp,
  Int4 start,
  Int4 stop,
  Uint1 strand,
  StreamDataPtr sdp
)

{
  Uint1           alphabet, target;
  Char            badchar;
  Int4            blk, first_blk, last_blk, per_byte, sent = 0;
  Boolean         is_na, revcomp;
  SeqMapTablePtr  smtp = NULL;
  ByteStorePtr    store;

  if (bsp == NULL || sdp == NULL) return 0;
  if (bsp->repr != Seq_repr_raw && bsp->repr != Seq_repr_const) return 0;

  is_na = (Boolean) ISA_na (bsp->mol);

  /* support for new Seq-data.gap */

  if (bsp->seq_data_type == Seq_code_gap) {
    return SeqPortStreamGap (stop - start + 1, is_na, FALSE, FALSE, TRUE, sdp);
  }

  /* anything else keeps its residues in a byte store */

  store = (ByteStorePtr) bsp->seq_data;
  if (store == NULL) return 0;

  alphabet = bsp->seq_data_type;

  /* only nucleotides are reverse complemented */

  revcomp = (Boolean) (strand == Seq_strand_minus && is_na);

  /* output code and replacement for invalid residues */

  if (is_na) {
    target = Seq_code_iupacna;
    badchar = 'N';
  } else {
    target = Seq_code_ncbieaa;
    badchar = 'X';
  }

  /* residues packed into each stored byte */

  if (alphabet == Seq_code_ncbi2na) {
    per_byte = 4;
  } else if (alphabet == Seq_code_ncbi4na) {
    per_byte = 2;
  } else {
    per_byte = 1;
  }

  if (target != alphabet) {
    smtp = SeqMapTableFind (target, alphabet);
    if (smtp == NULL) return 0;
  }

  /* byte store offsets of the 1000-byte blocks holding start and stop */

  first_blk = ((start / per_byte) / 1000L) * 1000L;
  last_blk = ((stop / per_byte) / 1000L) * 1000L;

  if (revcomp) {
    blk = last_blk;
    while (blk >= first_blk) {
      sent += SeqPortStreamBlock (store, blk, per_byte, alphabet, badchar, smtp, start, stop, TRUE, sdp);
      blk -= 1000;
    }
  } else {
    blk = first_blk;
    while (blk <= last_blk) {
      sent += SeqPortStreamBlock (store, blk, per_byte, alphabet, badchar, smtp, start, stop, FALSE, sdp);
      blk += 1000;
    }
  }

  return sent;
}
2399
/* SeqPortStreamSeqLit
*    Streams positions start..stop of a Seq-literal and returns the number
*    of characters sent.  Literals with length below 1 are ignored.
*    A literal with no data, or whose data is Seq-data.gap, is streamed as a
*    gap (treated as unknown length when fuzz is present).  Any other
*    literal is wrapped in a temporary raw Bioseq and passed to
*    SeqPortStreamRaw.
*/
static Int4 SeqPortStreamSeqLit (
  SeqLitPtr slitp,
  Boolean is_na,
  Int4 start,
  Int4 stop,
  Uint1 strand,
  StreamDataPtr sdp
)

{
  Bioseq   fake;
  Boolean  is_known, is_seq_gap;

  if (slitp == NULL || sdp == NULL) return 0;

  /* ignore gaps of unknown length */

  if (slitp->length < 1) return 0;

  if (slitp->seq_data == NULL || slitp->seq_data_type == Seq_code_gap) {

    /* missing data is a virtual gap, otherwise it is the new gap type */

    is_seq_gap = (Boolean) (slitp->seq_data != NULL);
    is_known = (Boolean) (slitp->fuzz == NULL);

    return SeqPortStreamGap (stop - start + 1, is_na, FALSE, is_known, is_seq_gap, sdp);
  }

  /* present the literal's byte store as the data of a raw Bioseq */

  MemSet ((Pointer) &fake, 0, sizeof (Bioseq));

  fake.repr = Seq_repr_raw;
  fake.mol = is_na ? Seq_mol_dna : Seq_mol_aa;
  fake.length = slitp->length;
  fake.seq_data = slitp->seq_data;
  fake.seq_data_type = slitp->seq_data_type;

  return SeqPortStreamRaw (&fake, start, stop, strand, sdp);
}
2466
2467 static Int2 stream_retry_attempts = 0;
2468 static Boolean stream_retry_count_set = FALSE;
2469
2470 static Int2 stream_retry_sleep = 0;
2471 static Boolean stream_retryp_sleep_set = FALSE;
2472
SeqPortStreamSeqLoc(SeqLocPtr slp,Int4 start,Int4 stop,Uint1 strand,StreamDataPtr sdp,SeqIdPtr parentID)2473 static Int4 SeqPortStreamSeqLoc (
2474 SeqLocPtr slp,
2475 Int4 start,
2476 Int4 stop,
2477 Uint1 strand,
2478 StreamDataPtr sdp,
2479 SeqIdPtr parentID
2480 )
2481
2482 {
2483 BioseqPtr bsp;
2484 Char buf [64];
2485 Int4 count = 0;
2486 SeqEntryPtr oldscope = NULL;
2487 Char pid [64];
2488 SeqIdPtr sip;
2489 #ifdef OS_UNIX
2490 Int2 attempts;
2491 CharPtr str;
2492 int val = 0;
2493 #endif
2494
2495 if (slp == NULL || sdp == NULL) return 0;
2496
2497 if (start < 0 || stop < 0) return 0;
2498
2499 sip = SeqLocId (slp);
2500 if (sip == NULL) return 0;
2501
2502 if (sip->choice == SEQID_GI && sip->data.intvalue <= 0 &&
2503 (Boolean) ((sdp->flags & STREAM_ALLOW_NEG_GIS) == 0)) {
2504
2505 /* gi 0 is always a data error, just report and bail */
2506 /* negative gi sometimes used in-house, allow if flag set */
2507
2508 SeqIdWrite (sip, buf, PRINTID_FASTA_SHORT, sizeof (buf) - 1);
2509 if (parentID != NULL) {
2510 SeqIdWrite (parentID, pid, PRINTID_FASTA_LONG, sizeof (pid) - 1);
2511 ErrPostEx (SEV_ERROR, 0, 0, "SeqPortStream ignoring Bioseq %s component of %s", buf, pid);
2512 } else {
2513 ErrPostEx (SEV_ERROR, 0, 0, "SeqPortStream ignoring Bioseq %s", buf);
2514 }
2515 sdp->failed = TRUE;
2516 return 0;
2517 }
2518
2519 oldscope = SeqEntrySetScope (sdp->scope);
2520 bsp = BioseqLockById (sip);
2521 SeqEntrySetScope (oldscope);
2522
2523 #ifdef OS_UNIX
2524 if (bsp == NULL) {
2525
2526 /* number of retries and sleep between retries now configured by environment variable */
2527
2528 if (! stream_retry_count_set) {
2529 str = (CharPtr) getenv ("SEQPORT_STREAM_FETCH_ATTEMPTS");
2530 if (StringDoesHaveText (str)) {
2531 if (sscanf (str, "%d", &val) == 1) {
2532 stream_retry_attempts = (Uint2) val;
2533 }
2534 }
2535 stream_retry_count_set = TRUE;
2536 }
2537
2538 if (! stream_retryp_sleep_set) {
2539 str = (CharPtr) getenv ("SEQPORT_STREAM_RETRY_SLEEP");
2540 if (StringDoesHaveText (str)) {
2541 if (sscanf (str, "%d", &val) == 1) {
2542 stream_retry_sleep = (Uint2) val;
2543 }
2544 }
2545 stream_retryp_sleep_set = TRUE;
2546 }
2547
2548 /* retry failed fetch attempt up to specified limit */
2549
2550 if (stream_retry_attempts > 1) {
2551 attempts = 1;
2552 while (bsp == NULL && attempts < stream_retry_attempts) {
2553 if (stream_retry_sleep > 0) {
2554 sleep (stream_retry_sleep);
2555 }
2556
2557 oldscope = SeqEntrySetScope (sdp->scope);
2558 bsp = BioseqLockById (sip);
2559 SeqEntrySetScope (oldscope);
2560 attempts++;
2561 }
2562 if (bsp != NULL) {
2563 SeqIdWrite (sip, buf, PRINTID_FASTA_SHORT, sizeof (buf) - 1);
2564 if (parentID != NULL) {
2565 SeqIdWrite (parentID, pid, PRINTID_FASTA_LONG, sizeof (pid) - 1);
2566 ErrPostEx (SEV_WARNING, 0, 0,
2567 "SeqPortStream loaded Bioseq %s component of %s after %d attempts",
2568 buf, pid, (int) attempts);
2569 } else {
2570 ErrPostEx (SEV_WARNING, 0, 0,
2571 "SeqPortStream loaded Bioseq %s after %d attempts",
2572 buf, (int) attempts);
2573 }
2574 }
2575 }
2576 }
2577 #endif
2578
2579 if (bsp == NULL) {
2580 SeqIdWrite (sip, buf, PRINTID_FASTA_SHORT, sizeof (buf) - 1);
2581 if (parentID != NULL) {
2582 SeqIdWrite (parentID, pid, PRINTID_FASTA_LONG, sizeof (pid) - 1);
2583 ErrPostEx (SEV_ERROR, 0, 0, "SeqPortStream failed to load Bioseq %s component of %s, size = %d",
2584 buf, pid, sizeof( sip->data.intvalue));
2585 } else {
2586 ErrPostEx (SEV_ERROR, 0, 0, "SeqPortStream failed to load Bioseq %s", buf);
2587 }
2588 sdp->failed = TRUE;
2589 return 0;
2590 }
2591
2592 count = SeqPortStreamWork (bsp, start, stop, strand, sdp);
2593
2594 BioseqUnlock (bsp);
2595
2596 return count;
2597 }
2598
2599 /* structure for processing components in forward or reverse direction */
2600
2601 typedef struct streamobj {
2602 SeqLocPtr slp;
2603 SeqLitPtr slitp;
2604 Int4 from;
2605 Int4 to;
2606 Uint1 strand;
2607 } StreamObj, PNTR StreamObjPtr;
2608
StreamObjNew(SeqLocPtr slp,SeqLitPtr slitp,Int4 from,Int4 to,Uint1 strand)2609 static StreamObjPtr StreamObjNew (
2610 SeqLocPtr slp,
2611 SeqLitPtr slitp,
2612 Int4 from,
2613 Int4 to,
2614 Uint1 strand
2615 )
2616
2617 {
2618 StreamObjPtr sop;
2619
2620 sop = (StreamObjPtr) MemNew (sizeof (StreamObj));
2621 if (sop == NULL) return NULL;
2622
2623 sop->slp = slp;
2624 sop->slitp = slitp;
2625 sop->from = from;
2626 sop->to = to;
2627 sop->strand = strand;
2628
2629 return sop;
2630 }
2631
SeqPortStreamDelta(BioseqPtr bsp,Int4 start,Int4 stop,Uint1 strand,StreamDataPtr sdp)2632 static Int4 SeqPortStreamDelta (
2633 BioseqPtr bsp,
2634 Int4 start,
2635 Int4 stop,
2636 Uint1 strand,
2637 StreamDataPtr sdp
2638 )
2639
2640 {
2641 Int4 count = 0, cumulative, from, to, len;
2642 DeltaSeqPtr dsp;
2643 ValNodePtr head = NULL, last = NULL, vnp;
2644 Boolean is_na;
2645 Boolean revcomp = FALSE;
2646 SeqLitPtr slitp;
2647 SeqLocPtr slp;
2648 StreamObjPtr sop;
2649
2650 if (bsp == NULL || sdp == NULL) return 0;
2651
2652 is_na = (Boolean) ISA_na (bsp->mol);
2653
2654 if (strand == Seq_strand_minus && is_na) {
2655 revcomp = TRUE;
2656 }
2657
2658 /* build linked list in forward or reverse order, depending upon input strand */
2659
2660 for (dsp = (DeltaSeqPtr) bsp->seq_ext, cumulative = 0;
2661 dsp != NULL && cumulative <= stop;
2662 dsp = dsp->next, cumulative += len) {
2663
2664 len = 0;
2665
2666 switch (dsp->choice) {
2667
2668 case 1 :
2669 slp = (SeqLocPtr) dsp->data.ptrvalue;
2670 if (slp == NULL) continue;
2671
2672 if (slp->choice == SEQLOC_NULL) continue;
2673
2674 from = SeqLocStart (slp);
2675 to = SeqLocStop (slp);
2676 strand = SeqLocStrand (slp);
2677
2678 if (from < 0 || to < 0) continue;
2679
2680 len = to - from + 1;
2681
2682 if (cumulative + len <= start) continue;
2683
2684 /* adjust from and to if not using entire interval */
2685
2686 if (strand == Seq_strand_minus) {
2687
2688 if (start > cumulative) {
2689 to -= start - cumulative;
2690 }
2691
2692 if (stop < cumulative + len) {
2693 from += cumulative + len - stop - 1;
2694 }
2695
2696 } else {
2697
2698 if (start > cumulative) {
2699 from += start - cumulative;
2700 }
2701
2702 if (stop < cumulative + len) {
2703 to -= cumulative + len - stop - 1;
2704 }
2705 }
2706
2707 if (revcomp) {
2708 if (strand == Seq_strand_minus) {
2709 strand = Seq_strand_plus;
2710 } else {
2711 strand = Seq_strand_minus;
2712 }
2713 }
2714
2715 sop = StreamObjNew (slp, NULL, from, to, strand);
2716 if (sop == NULL) continue;
2717
2718 if (revcomp) {
2719
2720 vnp = ValNodeAddPointer (NULL, 0, (Pointer) sop);
2721 vnp->next = head;
2722 head = vnp;
2723
2724 } else {
2725
2726 vnp = ValNodeAddPointer (&last, 0, (Pointer) sop);
2727 if (head == NULL) {
2728 head = vnp;
2729 }
2730 last = vnp;
2731 }
2732 break;
2733
2734 case 2 :
2735 slitp = (SeqLitPtr) dsp->data.ptrvalue;
2736 if (slitp == NULL) continue;
2737
2738 from = 0;
2739 to = slitp->length - 1;
2740 strand = Seq_strand_plus;
2741
2742 if (from < 0 || to < 0) continue;
2743
2744 len = to - from + 1;
2745
2746 if (cumulative + len <= start) continue;
2747
2748 /* adjust from and to if not using entire interval */
2749
2750 if (start > cumulative) {
2751 from += start - cumulative;
2752 }
2753
2754 if (stop < cumulative + len) {
2755 to -= cumulative + len - stop - 1;
2756 }
2757
2758 if (revcomp) {
2759 if (strand == Seq_strand_minus) {
2760 strand = Seq_strand_plus;
2761 } else {
2762 strand = Seq_strand_minus;
2763 }
2764 }
2765
2766 sop = StreamObjNew (NULL, slitp, from, to, strand);
2767 if (sop == NULL) continue;
2768
2769 if (revcomp) {
2770
2771 vnp = ValNodeAddPointer (NULL, 0, (Pointer) sop);
2772 vnp->next = head;
2773 head = vnp;
2774
2775 } else {
2776
2777 vnp = ValNodeAddPointer (&last, 0, (Pointer) sop);
2778 if (head == NULL) {
2779 head = vnp;
2780 }
2781 last = vnp;
2782 }
2783 break;
2784
2785 default :
2786 break;
2787 }
2788 }
2789
2790 /* process components in correct order */
2791
2792 for (vnp = head; vnp != NULL && (! sdp->failed); vnp = vnp->next) {
2793
2794 sop = (StreamObjPtr) vnp->data.ptrvalue;
2795 if (sop == NULL) continue;
2796
2797 if (sop->slp != NULL) {
2798
2799 count += SeqPortStreamSeqLoc (sop->slp, sop->from, sop->to, sop->strand, sdp, bsp->id);
2800
2801 } else if (sop->slitp != NULL) {
2802
2803 count += SeqPortStreamSeqLit (sop->slitp, is_na, sop->from, sop->to, sop->strand, sdp);
2804 }
2805 }
2806
2807 /* free control list */
2808
2809 ValNodeFreeData (head);
2810
2811 return count;
2812 }
2813
/*
*  SeqPortStreamSeg streams positions start..stop of a segmented Bioseq.
*
*  The segment locations that overlap the requested range are first collected
*  into a control list, trimmed to the requested range; the list is built in
*  reverse order, with each segment's strand flipped, when the minus strand of
*  a nucleotide is requested.  Each queued segment is then streamed through
*  SeqPortStreamSeqLoc until sdp->failed is set.
*
*  Returns the total count reported by SeqPortStreamSeqLoc.
*/

static Int4 SeqPortStreamSeg (
  BioseqPtr bsp,
  Int4 start,
  Int4 stop,
  Uint1 strand,
  StreamDataPtr sdp
)

{
  Uint1         seg_strand;
  Int4          count = 0, cumulative, from, to, len;
  ValNodePtr    head = NULL, last = NULL, vnp;
  Boolean       is_na;
  Boolean       revcomp = FALSE;
  SeqLocPtr     slp;
  StreamObjPtr  sop;

  if (bsp == NULL || sdp == NULL) return 0;

  is_na = (Boolean) ISA_na (bsp->mol);

  if (strand == Seq_strand_minus && is_na) {
    revcomp = TRUE;
  }

  /* build linked list in forward or reverse order, depending upon input strand */
  /* cumulative is the offset of the current segment; every continue still advances it by len */

  for (slp = (SeqLocPtr) bsp->seq_ext, cumulative = 0;
       slp != NULL && cumulative <= stop;
       slp = slp->next, cumulative += len) {

    len = 0;

    if (slp->choice == SEQLOC_NULL) continue;

    from = SeqLocStart (slp);
    to = SeqLocStop (slp);
    seg_strand = SeqLocStrand (slp);

    if (from < 0 || to < 0) continue;

    len = to - from + 1;

    /* segment lies entirely before the requested range */

    if (cumulative + len <= start) continue;

    /* adjust from and to if not using entire interval */

    if (seg_strand == Seq_strand_minus) {

      if (start > cumulative) {
        to -= start - cumulative;
      }

      if (stop < cumulative + len) {
        from += cumulative + len - stop - 1;
      }

    } else {

      if (start > cumulative) {
        from += start - cumulative;
      }

      if (stop < cumulative + len) {
        to -= cumulative + len - stop - 1;
      }
    }

    if (revcomp) {
      if (seg_strand == Seq_strand_minus) {
        seg_strand = Seq_strand_plus;
      } else {
        seg_strand = Seq_strand_minus;
      }
    }

    sop = StreamObjNew (slp, NULL, from, to, seg_strand);
    if (sop == NULL) continue;

    if (revcomp) {

      vnp = ValNodeAddPointer (NULL, 0, (Pointer) sop);
      if (vnp == NULL) {
        /* no list node: release the object instead of dereferencing NULL */
        MemFree (sop);
        continue;
      }
      vnp->next = head;
      head = vnp;

    } else {

      vnp = ValNodeAddPointer (&last, 0, (Pointer) sop);
      if (vnp == NULL) {
        /* leave last untouched so later segments still append to the same list */
        MemFree (sop);
        continue;
      }
      if (head == NULL) {
        head = vnp;
      }
      last = vnp;
    }
  }

  /* process components in correct order */

  for (vnp = head; vnp != NULL && (! sdp->failed); vnp = vnp->next) {

    sop = (StreamObjPtr) vnp->data.ptrvalue;
    if (sop == NULL) continue;

    if (sop->slp != NULL) {

      count += SeqPortStreamSeqLoc (sop->slp, sop->from, sop->to, sop->strand, sdp, bsp->id);
    }
  }

  /* free control list */

  ValNodeFreeData (head);

  return count;
}
2927
/*
*  SeqPortStreamRef streams positions start..stop of a reference Bioseq, whose
*  seq_ext is a single location.  The location's interval is trimmed to the
*  requested range, its strand is flipped when the minus strand of a nucleotide
*  is requested, and the result is passed to SeqPortStreamSeqLoc.
*
*  Returns the count reported by SeqPortStreamSeqLoc, or 0 if there is nothing to stream.
*/

static Int4 SeqPortStreamRef (
  BioseqPtr bsp,
  Int4 start,
  Int4 stop,
  Uint1 strand,
  StreamDataPtr sdp
)

{
  Int4       first, last, span;
  Int4       head_trim, tail_trim;
  SeqLocPtr  ref;
  Uint1      ref_strand;
  Boolean    revcomp;

  if (bsp == NULL || sdp == NULL) return 0;

  /* reverse complement only applies to the minus strand of a nucleotide */

  revcomp = (Boolean) (strand == Seq_strand_minus && ISA_na (bsp->mol));

  /* the referenced location is the only component */

  ref = (SeqLocPtr) bsp->seq_ext;
  if (ref == NULL || ref->choice == SEQLOC_NULL) return 0;

  first = SeqLocStart (ref);
  last = SeqLocStop (ref);
  ref_strand = SeqLocStrand (ref);

  if (first < 0 || last < 0) return 0;

  span = last - first + 1;
  if (span <= start) return 0;

  /* number of positions to drop from each end if not using entire interval */

  head_trim = (start > 0) ? start : 0;
  tail_trim = (stop < span) ? span - stop - 1 : 0;

  if (ref_strand == Seq_strand_minus) {
    last -= head_trim;
    first += tail_trim;
  } else {
    first += head_trim;
    last -= tail_trim;
  }

  if (revcomp) {
    ref_strand = (ref_strand == Seq_strand_minus) ? Seq_strand_plus : Seq_strand_minus;
  }

  return SeqPortStreamSeqLoc (ref, first, last, ref_strand, sdp, bsp->id);
}
3003
3004 /* SeqPortStreamWork calls appropriate representation-specific function */
3005
SeqPortStreamWork(BioseqPtr bsp,Int4 start,Int4 stop,Uint1 strand,StreamDataPtr sdp)3006 static Int4 SeqPortStreamWork (
3007 BioseqPtr bsp,
3008 Int4 start,
3009 Int4 stop,
3010 Uint1 strand,
3011 StreamDataPtr sdp
3012 )
3013
3014 {
3015 Int4 count = 0;
3016
3017 if (bsp == NULL || sdp == NULL) return 0;
3018
3019 /* start and stop position reality checks */
3020
3021 if (start < 0) {
3022 start = 0;
3023 }
3024 if (stop < 0) {
3025 stop = bsp->length - 1;
3026 }
3027
3028 /* if start or stop are beyond sequence length, set failed flag */
3029
3030 if (start >= bsp->length || stop >= bsp->length) {
3031 sdp->failed = TRUE;
3032 return 0;
3033 }
3034
3035 if (start > stop) return 0;
3036
3037 /* stack depth overflow check for recursively-defined sequence instances */
3038
3039 (sdp->depth)++;
3040
3041 if (sdp->depth > 20) {
3042 ErrPostEx (SEV_ERROR, 0, 0, "SeqPortStreamWork stack depth overflow");
3043 sdp->failed = TRUE;
3044 return 0;
3045 }
3046
3047 /* call appropriate stream function */
3048
3049 switch (bsp->repr) {
3050
3051 case Seq_repr_virtual :
3052 count += SeqPortStreamGap (stop - start + 1, ISA_na (bsp->mol), TRUE, FALSE, FALSE, sdp);
3053 break;
3054
3055 case Seq_repr_raw :
3056 case Seq_repr_const :
3057 count += SeqPortStreamRaw (bsp, start, stop, strand, sdp);
3058 break;
3059
3060 case Seq_repr_seg :
3061 if (bsp->seq_ext_type == 1) {
3062 count += SeqPortStreamSeg (bsp, start, stop, strand, sdp);
3063 }
3064 break;
3065
3066 case Seq_repr_delta :
3067 if (bsp->seq_ext_type == 4) {
3068 count += SeqPortStreamDelta (bsp, start, stop, strand, sdp);
3069 }
3070 break;
3071
3072 case Seq_repr_ref :
3073 if (bsp->seq_ext_type == 2) {
3074 count += SeqPortStreamRef (bsp, start, stop, strand, sdp);
3075 }
3076 break;
3077
3078 default :
3079 break;
3080 }
3081
3082 /* restore stack depth value */
3083
3084 (sdp->depth)--;
3085
3086 return count;
3087 }
3088
3089 /* default callback for copying to allocated buffer */
3090
SaveStreamSequence(CharPtr sequence,Pointer userdata)3091 static void LIBCALLBACK SaveStreamSequence (
3092 CharPtr sequence,
3093 Pointer userdata
3094 )
3095
3096 {
3097 CharPtr tmp;
3098 CharPtr PNTR tmpp;
3099
3100 tmpp = (CharPtr PNTR) userdata;
3101 tmp = *tmpp;
3102
3103 tmp = StringMove (tmp, sequence);
3104
3105 *tmpp = tmp;
3106 }
3107
3108 /* SeqPortStreamSetup creates revcomp table, calls SeqPortStreamWork */
3109
SeqPortStreamSetup(BioseqPtr bsp,Int4 start,Int4 stop,Uint1 strand,SeqLocPtr loc,SeqLitPtr lit,StreamFlgType flags,Pointer userdata,SeqPortStreamProc proc)3110 static Int4 SeqPortStreamSetup (
3111 BioseqPtr bsp,
3112 Int4 start,
3113 Int4 stop,
3114 Uint1 strand,
3115 SeqLocPtr loc,
3116 SeqLitPtr lit,
3117 StreamFlgType flags,
3118 Pointer userdata,
3119 SeqPortStreamProc proc
3120 )
3121
3122 {
3123 Char ch, lttr;
3124 CharPtr complementBase = " TVGH CD M KN YSAABW R ";
3125 Int4 count = 0, from, to;
3126 Uint2 entityID;
3127 Int2 i;
3128 Boolean is_na;
3129 StreamData sd;
3130 SeqLocPtr slp;
3131
3132 if (bsp == NULL && loc == NULL && lit == NULL) return 0;
3133 if (proc == NULL && userdata == NULL) return 0;
3134
3135 MemSet ((Pointer) &sd, 0, sizeof (StreamData));
3136
3137 sd.flags = flags;
3138 sd.userdata = userdata;
3139 sd.proc = proc;
3140 sd.tmp = NULL;
3141 sd.failed = FALSE;
3142 sd.depth = 0;
3143
3144 /* if NULL callback, copy into allocated userdata string */
3145
3146 if (proc == NULL) {
3147 sd.proc = SaveStreamSequence;
3148 sd.tmp = userdata;
3149 sd.userdata = &(sd.tmp);
3150 }
3151
3152 /* set up nucleotide complementation lookup table */
3153
3154 for (i = 0; i < 256; i++) {
3155 sd.letterToComp [i] = '\0';
3156 }
3157 for (ch = 'A', i = 1; ch <= 'Z'; ch++, i++) {
3158 lttr = complementBase [i];
3159 if (lttr != ' ') {
3160 sd.letterToComp [(int) (Uint1) ch] = lttr;
3161 }
3162 }
3163 for (ch = 'a', i = 1; ch <= 'z'; ch++, i++) {
3164 lttr = complementBase [i];
3165 if (lttr != ' ') {
3166 sd.letterToComp [(int) (Uint1) ch] = lttr;
3167 }
3168 }
3169
3170 /* commence streaming */
3171
3172 if (bsp != NULL) {
3173
3174 entityID = ObjMgrGetEntityIDForPointer (bsp);
3175 sd.scope = GetTopSeqEntryForEntityID (entityID);
3176
3177 count += SeqPortStreamWork (bsp, start, stop, strand, &sd);
3178
3179 } else if (loc != NULL) {
3180
3181 sd.scope = SeqEntryGetScope ();
3182
3183 slp = SeqLocFindNext (loc, NULL);
3184 while (slp != NULL) {
3185
3186 from = SeqLocStart (slp);
3187 to = SeqLocStop (slp);
3188 strand = SeqLocStrand (slp);
3189
3190 if (from < 0 || to < 0) {
3191 sd.failed = TRUE;
3192 return -1;
3193 }
3194
3195 count += SeqPortStreamSeqLoc (slp, from, to, strand, &sd, NULL);
3196
3197 slp = SeqLocFindNext (loc, slp);
3198 }
3199
3200 } else if (lit != NULL) {
3201
3202 is_na = TRUE;
3203 switch (lit->seq_data_type) {
3204 case Seq_code_iupacaa :
3205 case Seq_code_ncbi8aa :
3206 case Seq_code_ncbieaa :
3207 case Seq_code_ncbipaa :
3208 case Seq_code_iupacaa3 :
3209 case Seq_code_ncbistdaa :
3210 is_na = FALSE;
3211 break;
3212 default :
3213 break;
3214 }
3215
3216 count += SeqPortStreamSeqLit (lit, is_na, 0, lit->length - 1, Seq_strand_plus, &sd);
3217 }
3218
3219 /* return number of bases or residues streamed to callback */
3220
3221 if (sd.failed) {
3222 if (count < 1) return -1;
3223 return -count;
3224 }
3225
3226 return count;
3227 }
3228
3229 /* public functions all call SeqPortStreamSetup */
3230
SeqPortStream(BioseqPtr bsp,StreamFlgType flags,Pointer userdata,SeqPortStreamProc proc)3231 NLM_EXTERN Int4 SeqPortStream (
3232 BioseqPtr bsp,
3233 StreamFlgType flags,
3234 Pointer userdata,
3235 SeqPortStreamProc proc
3236 )
3237
3238 {
3239 return SeqPortStreamSetup (bsp, 0, -1, Seq_strand_unknown, NULL, NULL, flags, userdata, proc);
3240 }
3241
SeqPortStreamInt(BioseqPtr bsp,Int4 start,Int4 stop,Uint1 strand,StreamFlgType flags,Pointer userdata,SeqPortStreamProc proc)3242 NLM_EXTERN Int4 SeqPortStreamInt (
3243 BioseqPtr bsp,
3244 Int4 start,
3245 Int4 stop,
3246 Uint1 strand,
3247 StreamFlgType flags,
3248 Pointer userdata,
3249 SeqPortStreamProc proc
3250 )
3251
3252 {
3253 return SeqPortStreamSetup (bsp, start, stop, strand, NULL, NULL, flags, userdata, proc);
3254 }
3255
SeqPortStreamLoc(SeqLocPtr slp,StreamFlgType flags,Pointer userdata,SeqPortStreamProc proc)3256 NLM_EXTERN Int4 SeqPortStreamLoc (
3257 SeqLocPtr slp,
3258 StreamFlgType flags,
3259 Pointer userdata,
3260 SeqPortStreamProc proc
3261 )
3262
3263 {
3264 return SeqPortStreamSetup (NULL, 0, 0, 0, slp, NULL, flags, userdata, proc);
3265 }
3266
SeqPortStreamLit(SeqLitPtr lit,StreamFlgType flags,Pointer userdata,SeqPortStreamProc proc)3267 NLM_EXTERN Int4 SeqPortStreamLit (
3268 SeqLitPtr lit,
3269 StreamFlgType flags,
3270 Pointer userdata,
3271 SeqPortStreamProc proc
3272 )
3273
3274 {
3275 return SeqPortStreamSetup (NULL, 0, 0, 0, NULL, lit, flags, userdata, proc);
3276 }
3277
3278 /*******************************************************************************
3279 *
3280 * StreamCacheSetup (bsp, slp, flags, scp)
3281 * StreamCacheGetResidue (scp)
3282 * StreamCacheSetPosition (scp, pos)
3283 * SeqPort functional replacement implemented on top of SeqPortStreams
3284 *
3285 ********************************************************************************/
3286
StreamCacheSetup(BioseqPtr bsp,SeqLocPtr slp,StreamFlgType flags,StreamCache PNTR scp)3287 NLM_EXTERN Boolean StreamCacheSetup (
3288 BioseqPtr bsp,
3289 SeqLocPtr slp,
3290 StreamFlgType flags,
3291 StreamCache PNTR scp
3292 )
3293
3294 {
3295 if (bsp == NULL && slp == NULL) return FALSE;
3296 if (scp == NULL) return FALSE;
3297
3298 MemSet ((Pointer) scp, 0, sizeof (StreamCache));
3299
3300 if (bsp != NULL) {
3301 scp->bsp = bsp;
3302 scp->length = bsp->length;
3303 } else {
3304 scp->slp = slp;
3305 scp->length = SeqLocLen (slp);
3306 }
3307 scp->flags = flags;
3308
3309 return TRUE;
3310 }
3311
StreamCacheRefreshBuffer(StreamCache PNTR scp)3312 static Boolean StreamCacheRefreshBuffer (
3313 StreamCache PNTR scp
3314 )
3315
3316 {
3317 Bioseq bsq;
3318 Int4 count;
3319 StreamFlgType flags;
3320 SeqLocPtr loc;
3321 SeqLoc sl;
3322 SeqLocPtr slp;
3323 Int4 stop;
3324
3325 if (scp == NULL) return FALSE;
3326
3327 if (scp->ctr >= scp->total) {
3328 scp->offset += (Int4) scp->total;
3329 scp->ctr = 0;
3330 scp->total = 0;
3331
3332 MemSet ((Pointer) &(scp->buf), 0, sizeof (scp->buf));
3333
3334 if (scp->offset < 0 || scp->offset >= scp->length) return FALSE;
3335
3336 stop = MIN (scp->offset + 4000L, scp->length);
3337
3338 flags = scp->flags;
3339 if ((flags & STREAM_GAP_MASK) == GAP_TO_SINGLE_DASH || (flags & STREAM_GAP_MASK) == 0) {
3340 /* if expand_gaps_to_dashes not equal to gaps_to_single_dash + stream_gap_mask, need to clear other bits first */
3341 flags |= EXPAND_GAPS_TO_DASHES;
3342 }
3343 if ((flags & SUPPRESS_VIRT_SEQ) != 0) {
3344 flags ^= SUPPRESS_VIRT_SEQ;
3345 flags |= STREAM_VIRT_AS_PLUS;
3346 }
3347
3348 if (scp->bsp != NULL) {
3349
3350 count = SeqPortStreamInt (scp->bsp, scp->offset, stop - 1, Seq_strand_plus,
3351 flags, (Pointer) &(scp->buf), NULL);
3352 if (count < 0) {
3353 scp->failed = TRUE;
3354 }
3355
3356 } else if (scp->slp != NULL) {
3357
3358 slp = scp->slp;
3359 MemSet ((Pointer) &bsq, 0, sizeof (Bioseq));
3360 MemSet ((Pointer) &sl, 0, sizeof (SeqLoc));
3361 bsq.repr = Seq_repr_seg;
3362 bsq.mol = Seq_mol_na;
3363 bsq.seq_ext_type = 1;
3364 bsq.length = SeqLocLen (slp);
3365 bsq.seq_ext = &sl;
3366 if (slp->choice == SEQLOC_MIX || slp->choice == SEQLOC_PACKED_INT) {
3367 loc = (SeqLocPtr) slp->data.ptrvalue;
3368 if (loc != NULL) {
3369 sl.choice = loc->choice;
3370 sl.data.ptrvalue = (Pointer) loc->data.ptrvalue;
3371 sl.next = loc->next;
3372 }
3373 } else {
3374 sl.choice = slp->choice;
3375 sl.data.ptrvalue = (Pointer) slp->data.ptrvalue;
3376 sl.next = NULL;
3377 }
3378
3379 SeqPortStreamInt (&bsq, scp->offset, stop - 1, Seq_strand_plus,
3380 flags, (Pointer) &(scp->buf), NULL);
3381 }
3382
3383 scp->total = StringLen (scp->buf);
3384 }
3385
3386 return TRUE;
3387 }
3388
StreamCacheGetResidue(StreamCache PNTR scp)3389 NLM_EXTERN Uint1 StreamCacheGetResidue (
3390 StreamCache PNTR scp
3391 )
3392
3393 {
3394 Uint1 residue = '\0';
3395
3396 if (scp == NULL) return '\0';
3397
3398 if (scp->ctr >= scp->total) {
3399 if (! StreamCacheRefreshBuffer (scp)) return '\0';
3400 }
3401
3402 if (scp->ctr < scp->total) {
3403 residue = scp->buf [(int) scp->ctr];
3404 (scp->ctr)++;
3405
3406 if (residue == '-') {
3407
3408 if ((scp->flags & STREAM_GAP_MASK) == 0) {
3409 while (residue == '-') {
3410 if (scp->ctr >= scp->total) {
3411 if (! StreamCacheRefreshBuffer (scp)) return '\0';
3412 }
3413
3414 while (scp->ctr < scp->total && residue == '-') {
3415 residue = scp->buf [(int) scp->ctr];
3416 (scp->ctr)++;
3417 }
3418 }
3419 if (residue == '-') return '\0';
3420
3421 } else if ((scp->flags & STREAM_GAP_MASK) == GAP_TO_SINGLE_DASH) {
3422
3423 while (residue == '-') {
3424 if (scp->ctr >= scp->total) {
3425 if (! StreamCacheRefreshBuffer (scp)) return '-';
3426 }
3427
3428 while (scp->ctr < scp->total && residue == '-') {
3429 residue = scp->buf [(int) scp->ctr];
3430 if (residue != '-') return '-';
3431 (scp->ctr)++;
3432 }
3433 }
3434 }
3435
3436 } else if (residue == '+') {
3437
3438 if ((scp->flags & SUPPRESS_VIRT_SEQ) != 0) {
3439 while (residue == '+') {
3440 if (scp->ctr >= scp->total) {
3441 if (! StreamCacheRefreshBuffer (scp)) return '\0';
3442 }
3443
3444 while (scp->ctr < scp->total && residue == '+') {
3445 residue = scp->buf [(int) scp->ctr];
3446 (scp->ctr)++;
3447 }
3448 }
3449 if (residue == '+') return '\0';
3450 }
3451 }
3452 }
3453
3454 return residue;
3455 }
3456
StreamCacheSetPosition(StreamCache PNTR scp,Int4 pos)3457 NLM_EXTERN Boolean StreamCacheSetPosition (
3458 StreamCache PNTR scp,
3459 Int4 pos
3460 )
3461
3462 {
3463 if (scp == NULL) return FALSE;
3464
3465 if (scp->offset <= pos && scp->offset + (Int4) scp->total >= pos) {
3466 scp->ctr = (Int2) (pos - scp->offset);
3467 return TRUE;
3468 }
3469
3470 scp->ctr = 0;
3471 scp->total = 0;
3472 scp->offset = pos;
3473
3474 if (scp->offset < 0 || scp->offset >= scp->length) {
3475 scp->offset = 0;
3476 return FALSE;
3477 }
3478
3479 return TRUE;
3480 }
3481
3482 /*******************************************************************************
3483 *
3484 * ProteinFromCdRegionEx ( SeqFeatPtr sfp, Boolean include_stop, Boolean remove_trailingX)
3485 * replacement for old ProteinFromCdRegionEx, but using TransTableTranslateCdRegion.
3486 *
3487 ********************************************************************************/
3488
ProteinFromCdRegionExEx(SeqFeatPtr sfp,Boolean include_stop,Boolean remove_trailingX,BoolPtr altStartP,Boolean farProdFetchOK)3489 NLM_EXTERN ByteStorePtr ProteinFromCdRegionExEx (SeqFeatPtr sfp, Boolean include_stop, Boolean remove_trailingX, BoolPtr altStartP, Boolean farProdFetchOK)
3490
3491 {
3492 ByteStorePtr bs;
3493 CdRegionPtr crp;
3494 Int2 genCode = 0;
3495 Char str [32];
3496 Boolean tableExists = FALSE;
3497 TransTablePtr tbl = NULL;
3498 ValNodePtr vnp;
3499
3500 if (sfp == NULL || sfp->data.choice != SEQFEAT_CDREGION) return NULL;
3501 crp = (CdRegionPtr) sfp->data.value.ptrvalue;
3502 if (crp == NULL) return NULL;
3503
3504 /* find genetic code */
3505
3506 if (crp->genetic_code != NULL) {
3507 vnp = (ValNodePtr) crp->genetic_code->data.ptrvalue;
3508 while (vnp != NULL) {
3509 if (vnp->choice == 2) {
3510 genCode = (Int2) vnp->data.intvalue;
3511 }
3512 vnp = vnp->next;
3513 }
3514 }
3515
3516 if (genCode == 7) {
3517 genCode = 4;
3518 } else if (genCode == 8) {
3519 genCode = 1;
3520 } else if (genCode == 0) {
3521 genCode = 1;
3522 }
3523
3524 /* set app property name for storing desired FSA */
3525
3526 sprintf (str, "TransTableFSAforGenCode%d", (int) genCode);
3527
3528 /* get FSA for desired genetic code if it already exists */
3529
3530 tbl = (TransTablePtr) GetAppProperty (str);
3531 tableExists = (Boolean) (tbl != NULL);
3532
3533 bs = TransTableTranslateCdRegionEx (&tbl, sfp, include_stop, remove_trailingX,
3534 FALSE, altStartP, farProdFetchOK);
3535
3536 /* save FSA in genetic code-specific app property name */
3537
3538 if (! tableExists) {
3539 SetAppProperty (str, (Pointer) tbl);
3540 }
3541
3542 return bs;
3543 }
3544
ProteinFromCdRegionEx(SeqFeatPtr sfp,Boolean include_stop,Boolean remove_trailingX)3545 NLM_EXTERN ByteStorePtr ProteinFromCdRegionEx (SeqFeatPtr sfp, Boolean include_stop, Boolean remove_trailingX)
3546
3547 {
3548 return ProteinFromCdRegionExEx (sfp, include_stop, remove_trailingX, NULL, TRUE);
3549 }
3550
ProteinFromCdRegionExWithTrailingCodonHandling(SeqFeatPtr sfp,Boolean include_stop,Boolean remove_trailingX,Boolean no_stop_at_end_of_complete_cds)3551 NLM_EXTERN ByteStorePtr ProteinFromCdRegionExWithTrailingCodonHandling
3552 (
3553 SeqFeatPtr sfp,
3554 Boolean include_stop,
3555 Boolean remove_trailingX,
3556 Boolean no_stop_at_end_of_complete_cds
3557 )
3558
3559 {
3560 ByteStorePtr bs;
3561 CdRegionPtr crp;
3562 Int2 genCode = 0;
3563 Char str [32];
3564 Boolean tableExists = FALSE;
3565 TransTablePtr tbl = NULL;
3566 ValNodePtr vnp;
3567
3568 if (sfp == NULL || sfp->data.choice != SEQFEAT_CDREGION) return NULL;
3569 crp = (CdRegionPtr) sfp->data.value.ptrvalue;
3570 if (crp == NULL) return NULL;
3571
3572 /* find genetic code */
3573
3574 if (crp->genetic_code != NULL) {
3575 vnp = (ValNodePtr) crp->genetic_code->data.ptrvalue;
3576 while (vnp != NULL) {
3577 if (vnp->choice == 2) {
3578 genCode = (Int2) vnp->data.intvalue;
3579 }
3580 vnp = vnp->next;
3581 }
3582 }
3583
3584 if (genCode == 7) {
3585 genCode = 4;
3586 } else if (genCode == 8) {
3587 genCode = 1;
3588 } else if (genCode == 0) {
3589 genCode = 1;
3590 }
3591
3592 /* set app property name for storing desired FSA */
3593
3594 sprintf (str, "TransTableFSAforGenCode%d", (int) genCode);
3595
3596 /* get FSA for desired genetic code if it already exists */
3597
3598 tbl = (TransTablePtr) GetAppProperty (str);
3599 tableExists = (Boolean) (tbl != NULL);
3600
3601 bs = TransTableTranslateCdRegion (&tbl, sfp, include_stop, remove_trailingX,
3602 no_stop_at_end_of_complete_cds);
3603
3604 /* save FSA in genetic code-specific app property name */
3605
3606 if (! tableExists) {
3607 SetAppProperty (str, (Pointer) tbl);
3608 }
3609
3610 return bs;
3611 }
3612
3613 NLM_EXTERN Uint1 AAForCodon (Uint1Ptr codon, CharPtr codes);
3614
3615 /*****************************************************************************
3616 *
3617 * ProteinFromCdRegion(sfp, include_stop)
3618 * produces a ByteStorePtr containing the protein sequence in
3619 * ncbieaa code for the CdRegion sfp. If include_stop, will translate
3620 * through stop codons. If NOT include_stop, will stop at first stop
3621 * codon and return the protein sequence NOT including the terminating
3622 * stop. Supports reading frame, alternate genetic codes, and code breaks
3623 * in the CdRegion. Removes trailing "X" from partial translation.
3624 *
3625 *****************************************************************************/
ProteinFromCdRegion(SeqFeatPtr sfp,Boolean include_stop)3626 NLM_EXTERN ByteStorePtr ProteinFromCdRegion(SeqFeatPtr sfp, Boolean include_stop)
3627 {
3628 return ProteinFromCdRegionEx(sfp, include_stop, TRUE);
3629 }
3630
3631
3632 /* old version of ProteinFromCdRegionEx no longer compiled (below) */
3633
3634 #if 0
3635 /*******************************************************************************
3636 *
3637 * ProteinFromCdRegionEx( SeqFeatPtr sfp, Boolean include_stop, Boolean remove_trailingX)
3638 * same behavior as ProteinFromCdRegion, but another Boolean remove_trailingX
3639 * specifies whether trailing X's should be removed.
3640 *
3641 ********************************************************************************/
3642
3643 NLM_EXTERN ByteStorePtr Old_ProteinFromCdRegionEx (SeqFeatPtr sfp, Boolean include_stop, Boolean remove_trailingX)
3644 {
3645 SeqPortPtr spp = NULL;
3646 ByteStorePtr bs = NULL;
3647 Uint1 residue = 0;
3648 Int4 pos1, pos2, pos, len;
3649 Int4Ptr the_breaks = NULL;
3650 Uint1Ptr the_residues = NULL;
3651 Int2 num_code_break = 0, use_break;
3652 SeqLocPtr tmp;
3653 Int2 i;
3654 Uint1 codon[3], aa;
3655 CdRegionPtr crp;
3656 ValNodePtr vnp;
3657 GeneticCodePtr gcp;
3658 CharPtr vals, codes;
3659 CodeBreakPtr cbp;
3660 Boolean bad_base, no_start, check_start, got_stop;
3661 Uint2 part_prod = 0, part_loc = 0;
3662 Boolean incompleteLastCodon;
3663
3664 if ((sfp == NULL) || (sfp->data.choice != 3))
3665 return NULL;
3666
3667 crp = (CdRegionPtr) sfp->data.value.ptrvalue;
3668 len = SeqLocLen(sfp->location);
3669
3670 num_code_break = 0;
3671 if (crp->code_break != NULL)
3672 {
3673 cbp = crp->code_break;
3674 while (cbp != NULL)
3675 {
3676 num_code_break++;
3677 cbp = cbp->next;
3678 }
3679 the_breaks = (Int4Ptr) MemNew((size_t)(num_code_break * sizeof(Int4)));
3680 the_residues = (Uint1Ptr) MemNew((size_t)(num_code_break * sizeof(Uint1)));
3681
3682 num_code_break = 0;
3683 cbp = crp->code_break;
3684 while (cbp != NULL)
3685 {
3686 pos1 = INT4_MAX;
3687 pos2 = -10;
3688 tmp = NULL;
3689 while ((tmp = SeqLocFindNext(cbp->loc, tmp)) != NULL)
3690 {
3691 pos = GetOffsetInLoc(tmp, sfp->location,
3692 SEQLOC_START);
3693 if (pos < pos1)
3694 pos1 = pos;
3695 pos = GetOffsetInLoc(tmp, sfp->location,
3696 SEQLOC_STOP);
3697 if (pos > pos2)
3698 pos2 = pos;
3699 }
3700 pos = pos2 - pos1; /* codon length */
3701 if (pos == 2 || (pos >= 0 && pos <= 1 && pos2 == len - 1)) /* a codon */
3702 /* allowing a partial codon at the end */
3703 {
3704 the_breaks[num_code_break] = pos1;
3705 the_residues[num_code_break] = (Uint1)
3706 cbp->aa.value.intvalue;
3707 num_code_break++;
3708 }
3709 else
3710 {
3711 ErrPost(CTX_NCBIOBJ, 1, "Invalid Code-break.loc");
3712 }
3713
3714 cbp = cbp->next;
3715 }
3716 }
3717
3718 gcp = NULL;
3719 if (crp->genetic_code != NULL)
3720 {
3721 vnp = (ValNodePtr)(crp->genetic_code->data.ptrvalue);
3722 while ((vnp != NULL) && (gcp == NULL))
3723 {
3724 switch (vnp->choice)
3725 {
3726 case 1: /* name */
3727 gcp = GeneticCodeFind(0,
3728 (CharPtr)vnp->data.ptrvalue);
3729 break;
3730 case 2: /* id */
3731 gcp = GeneticCodeFind(vnp->data.intvalue, NULL);
3732 break;
3733 case 3: /* ncbieaa */
3734 case 6: /* sncbieaa */
3735 case 4: /* ncbi8aa */
3736 case 5: /* ncbistdaa */
3737 case 7: /* sncbi8aa */
3738 case 8: /* sncbistdaa */
3739 default:
3740 break;
3741 }
3742 vnp = vnp->next;
3743 }
3744 }
3745 if (gcp == NULL)
3746 gcp = GeneticCodeFind(1, NULL); /* use universal */
3747 if (gcp == NULL)
3748 goto erret;
3749
3750 vals = NULL;
3751 codes = NULL;
3752 for (vnp = (ValNodePtr)gcp->data.ptrvalue; vnp != NULL; vnp = vnp->next)
3753 {
3754 if (vnp->choice == 6) /* sncbieaa */
3755 vals = (CharPtr)vnp->data.ptrvalue;
3756 else if (vnp->choice == 3) /* ncbieaa */
3757 codes = (CharPtr)vnp->data.ptrvalue;
3758 }
3759 if (codes == NULL)
3760 goto erret;
3761
3762 no_start = FALSE;
3763 part_loc = SeqLocPartialCheck(sfp->location);
3764 part_prod = SeqLocPartialCheck(sfp->product);
3765 if ((part_loc & SLP_START) || (part_prod & SLP_START))
3766 no_start = TRUE;
3767
3768 if ((vals == NULL) || (no_start) || (crp->frame > 1)) /* no special
3769 starts */
3770 {
3771 vals = codes;
3772 check_start = FALSE;
3773 }
3774 else
3775 check_start = TRUE;
3776
3777 spp = SeqPortNewByLoc(sfp->location, Seq_code_ncbi4na);
3778 if (spp == NULL)
3779 goto erret;
3780
3781 /* len = SeqLocLen(sfp->location); - saved above */ /* size of coding region */
3782 len /= 3; /* size of
3783 protein */
3784 len += 1; /* allow
3785 partial codon at end */
3786 bs = BSNew(len);
3787 if (bs == NULL)
3788 goto erret;
3789
3790 if (crp->frame == 2) /* skip partial first codon */
3791 pos = 1;
3792 else if (crp->frame == 3)
3793 pos = 2;
3794 else
3795 pos = 0;
3796 SeqPortSeek(spp, pos, SEEK_SET);
3797 got_stop = FALSE;
3798
3799 incompleteLastCodon = FALSE;
3800
3801 do
3802 {
3803 use_break = -1;
3804 for (i = 0; i < num_code_break; i++)
3805 {
3806 if (pos == the_breaks[i])
3807 {
3808 use_break = i;
3809 i = num_code_break;
3810 }
3811 }
3812
3813 bad_base = FALSE;
3814 for (i = 0; i < 3; i++)
3815 {
3816 residue = SeqPortGetResidue(spp);
3817 if (residue == SEQPORT_VIRT || residue == SEQPORT_EOS) {
3818 /* skip past null NULL in seqport, get next - JK */
3819 residue = SeqPortGetResidue(spp);
3820 }
3821 if (residue == SEQPORT_EOF)
3822 break;
3823 if (residue == INVALID_RESIDUE)
3824 bad_base = TRUE;
3825 codon[i] = residue;
3826 }
3827 if (! i) /* no bases */
3828 break;
3829 while (i < 3) /* incomplete last codon */
3830 {
3831 codon[i] = 15; /* N */
3832 i++;
3833 incompleteLastCodon = TRUE;
3834 }
3835
3836 pos += 3;
3837 if (use_break >= 0)
3838 aa = the_residues[use_break];
3839 else if (bad_base)
3840 aa = 'X';
3841 else
3842 {
3843 aa = AAForCodon(codon, vals);
3844 if (check_start) /* first codon on possibly complete
3845 CDS */
3846 {
3847 if (aa == '-') /* invalid start */
3848 {
3849 /* if no explict partial at either end, but
3850 feature is */
3851 /* annotated as partial, then guess should
3852 use internal */
3853 /* amino acid code */
3854
3855 if ((! ((part_loc & SLP_STOP) ||
3856 (part_prod & SLP_STOP))) &&
3857 (sfp->partial))
3858 aa = AAForCodon(codon, codes);
3859 /* get internal aa */
3860 }
3861 check_start = FALSE;
3862 }
3863 }
3864
3865 if ((! include_stop) && (aa == '*'))
3866 {
3867 got_stop = TRUE;
3868 break;
3869 }
3870
3871 BSPutByte(bs, (Int2)aa);
3872
3873 vals = codes; /* not a start codon anymore */
3874
3875 } while (residue != SEQPORT_EOF);
3876
3877 if ((! got_stop) && incompleteLastCodon) {
3878 BSSeek(bs, -1, SEEK_END); /* remove last X if incomplete last codon */
3879 aa = (Uint1)BSGetByte(bs);
3880 if ((aa == 'X') && (BSLen(bs)))
3881 {
3882 BSSeek(bs, -1, SEEK_END);
3883 BSDelete(bs, 1);
3884 BSSeek(bs, -1, SEEK_END);
3885 }
3886 }
3887 if ((! got_stop) && remove_trailingX) /* only remove trailing X on partial CDS */
3888 {
3889 BSSeek(bs, -1, SEEK_END); /* back up to last residue */
3890 aa = (Uint1)BSGetByte(bs);
3891 while ((aa == 'X') && (BSLen(bs)))
3892 {
3893 BSSeek(bs, -1, SEEK_END);
3894 BSDelete(bs, 1);
3895 BSSeek(bs, -1, SEEK_END);
3896 aa = (Uint1)BSGetByte(bs);
3897 }
3898 }
3899
3900 if (! BSLen(bs)) goto erret;
3901
3902 ret:
3903 SeqPortFree(spp);
3904 MemFree(the_breaks);
3905 MemFree(the_residues);
3906 return bs;
3907 erret:
3908 bs = BSFree(bs);
3909 goto ret;
3910 }
3911 #endif
3912
3913 /* old version of ProteinFromCdRegionEx no longer compiled (above) */
3914
3915
3916 /*****************************************************************************
3917 *
3918 * Uint1 AAForCodon (Uint1Ptr codon, CharPtr codes)
3919 * codon is 3 values in ncbi4na code
3920 * codes is the geneic code array to use
3921 * MUST have 'X' as unknown amino acid
3922 *
3923 *****************************************************************************/
AAForCodon(Uint1Ptr codon,CharPtr codes)3924 NLM_EXTERN Uint1 AAForCodon (Uint1Ptr codon, CharPtr codes)
3925 {
3926 register Uint1 aa = 0, taa;
3927 register int i, j, k, index0, index1, index2;
3928 static Uint1 mapping[4] = { 8, /* T in ncbi4na */
3929 2, /* C */
3930 1, /* A */
3931 4 }; /* G */
3932
3933
3934 for (i = 0; i < 4; i++)
3935 {
3936 if (codon[0] & mapping[i])
3937 {
3938 index0 = i * 16;
3939 for (j = 0; j < 4; j++)
3940 {
3941 if (codon[1] & mapping[j])
3942 {
3943 index1 = index0 + (j * 4);
3944 for (k = 0; k < 4; k++)
3945 {
3946 if (codon[2] & mapping[k])
3947 {
3948 index2 = index1 + k;
3949 taa = codes[index2];
3950 if (! aa)
3951 aa = taa;
3952 else
3953 {
3954 if (taa != aa)
3955 {
3956 aa =
3957 'X';
3958 break;
3959 }
3960 }
3961 }
3962 if (aa == 'X')
3963 break;
3964 }
3965 }
3966 if (aa == 'X')
3967 break;
3968 }
3969 }
3970 if (aa == 'X')
3971 break;
3972 }
3973 return aa;
3974 }
3975
3976 static Uint1 codon_xref [4] = { /* mapping from NCBI2na to codon codes */
3977 2, /* A */
3978 1, /* C */
3979 3, /* G */
3980 0 }; /* T */
3981
3982 /*****************************************************************************
3983 *
3984 * Uint1 IndexForCodon (codon, code)
3985 * returns index into genetic codes codon array, give 3 bases of the
3986 * codon in any alphabet
3987 * returns INVALID_RESIDUE on failure
3988 *
3989 *****************************************************************************/
IndexForCodon(Uint1Ptr codon,Uint1 code)3990 NLM_EXTERN Uint1 IndexForCodon (Uint1Ptr codon, Uint1 code)
3991 {
3992 Int2 i, j;
3993 SeqMapTablePtr smtp;
3994 Uint1 residue, index = 0;
3995
3996 smtp = SeqMapTableFind(Seq_code_ncbi2na, code);
3997 if (smtp == NULL) return INVALID_RESIDUE;
3998
3999 for (i=0, j=16; i < 3; i++, j /= 4)
4000 {
4001 residue = SeqMapTableConvert(smtp, codon[i]);
4002 if (residue > 3) return INVALID_RESIDUE;
4003 residue = codon_xref[residue];
4004 index += (Uint1)(residue * j);
4005 }
4006
4007 return index;
4008 }
4009
4010 /*****************************************************************************
4011 *
4012 * Boolean CodonForIndex (index, code, codon)
4013 * Fills codon (3 Uint1 array) with codon corresponding to index,
4014 * in sequence alphabet code.
4015 * Index is the Genetic code index.
4016 * returns TRUE on success.
4017 *
4018 *****************************************************************************/
CodonForIndex(Uint1 index,Uint1 code,Uint1Ptr codon)4019 NLM_EXTERN Boolean CodonForIndex (Uint1 index, Uint1 code, Uint1Ptr codon)
4020 {
4021 Int2 i, j, k;
4022 SeqMapTablePtr smtp;
4023 Uint1 residue;
4024
4025 if (codon == NULL) return FALSE;
4026 if (index > 63) return FALSE;
4027
4028 smtp = SeqMapTableFind(code, Seq_code_ncbi2na);
4029 if (smtp == NULL) return FALSE;
4030
4031 for (i = 0, j = 16; i < 3; i++, j /= 4)
4032 {
4033 residue = (Uint1)((Int2)index / j);
4034 index -= (Uint1)(residue * j);
4035 for (k = 0; k < 4; k++)
4036 {
4037 if (codon_xref[k] == residue)
4038 {
4039 residue = (Uint1)k;
4040 break;
4041 }
4042 }
4043 residue = SeqMapTableConvert(smtp, residue);
4044 codon[i] = residue;
4045 }
4046
4047 return TRUE;
4048 }
4049
4050 /*----------- GetFrameFromLoc()-----------------*/
4051
4052 /*****************************************************************************
4053 *
4054 * Int2 GetFrameFromLoc (slp)
4055 * returns 1,2,3 if can find the frame
4056 * 0 if not
4057 *
4058 *****************************************************************************/
GetFrameFromLoc(SeqLocPtr slp)4059 NLM_EXTERN Uint1 GetFrameFromLoc (SeqLocPtr slp)
4060 {
4061 Uint1 frame = 0;
4062 SeqLocPtr curr, last;
4063 Boolean is_partial;
4064 SeqIntPtr sip;
4065 SeqPntPtr spp;
4066
4067 if (slp == NULL)
4068 return frame;
4069
4070 curr = SeqLocFindNext(slp, NULL);
4071
4072 is_partial = FALSE;
4073 switch (curr->choice)
4074 {
4075 case SEQLOC_INT:
4076 sip = (SeqIntPtr)curr->data.ptrvalue;
4077 if (sip->strand == Seq_strand_minus)
4078 {
4079 if (sip->if_to != NULL)
4080 is_partial = TRUE;
4081 }
4082 else if (sip->if_from != NULL)
4083 is_partial = TRUE;
4084 break;
4085 case SEQLOC_PNT:
4086 spp = (SeqPntPtr)curr->data.ptrvalue;
4087 if (spp->fuzz != NULL)
4088 is_partial = TRUE;
4089 break;
4090 default:
4091 return frame;
4092 }
4093
4094
4095 if (! is_partial)
4096 return (Int2) 1; /* complete 5' end, it's frame 1 */
4097
4098 is_partial = FALSE;
4099 last = curr;
4100 while ((curr = SeqLocFindNext(slp, last)) != NULL)
4101 last = curr;
4102
4103 switch (last->choice)
4104 {
4105 case SEQLOC_INT:
4106 sip = (SeqIntPtr) last->data.ptrvalue;
4107 if (sip->strand == Seq_strand_minus)
4108 {
4109 if (sip->if_from != NULL)
4110 return frame;
4111 }
4112 else if (sip->if_to != NULL)
4113 return frame;
4114 break;
4115 case SEQLOC_PNT:
4116 spp = (SeqPntPtr) last->data.ptrvalue;
4117 if (spp->fuzz != NULL)
4118 return frame;
4119 break;
4120 default:
4121 return frame;
4122 }
4123
4124 /* have complete last codon, get frame
4125 from length */
4126 frame = (Uint1)(SeqLocLen(slp) % 3);
4127 if (frame == 0)
4128 frame = 1;
4129 else if (frame == 1)
4130 frame = 2;
4131 else
4132 frame = 3;
4133
4134 return frame;
4135 }
4136
/*****************************************************************************
*
*   static Boolean add_fuzziness_to_loc (slp, less)
*       Attaches a newly allocated IntFuzz (choice 4, a = 2 when "less" is
*       TRUE, a = 1 otherwise) to slp:
*         SEQLOC_PNT  - stored in the point's fuzz
*         SEQLOC_INT  - stored in if_from when "less", in if_to otherwise
*       NOTE(review): choice 4 with a = 2 / 1 presumably is the Int-fuzz
*       "lim" variant with values lt / gt - confirm against the ASN.1 spec.
*       Any fuzz already stored in the target field is overwritten, as
*       before; it is not freed here.
*
*       returns TRUE if the fuzz was attached; FALSE if slp is NULL, is
*       neither an interval nor a point, has no data, or the allocation
*       failed.
*
*****************************************************************************/
static Boolean add_fuzziness_to_loc (SeqLocPtr slp, Boolean less)
{
    IntFuzzPtr ifp;
    SeqIntPtr sint = NULL;
    SeqPntPtr spnt = NULL;

    if (slp == NULL)
        return FALSE;

    if (slp->choice == SEQLOC_INT)
        sint = (SeqIntPtr) slp->data.ptrvalue;
    else if (slp->choice == SEQLOC_PNT)
        spnt = (SeqPntPtr) slp->data.ptrvalue;
    else
        return FALSE;

    /* nothing to attach to: bail out before allocating so that the
       new fuzz is not leaked */
    if ((sint == NULL) && (spnt == NULL))
        return FALSE;

    ifp = IntFuzzNew();
    if (ifp == NULL)
        return FALSE;
    ifp->choice = 4;
    ifp->a = less ? 2 : 1;

    if (spnt != NULL)
        spnt->fuzz = ifp;
    else if (less)
        sint->if_from = ifp;
    else
        sint->if_to = ifp;

    return TRUE;
}
4171
4172
/*****************************************************************************
*
*   static Boolean load_fuzz_to_DNA (dnaLoc, aaLoc, first)
*       Looks at one end of aaLoc for an IntFuzz of choice 4 and, when it
*       has the expected value, puts matching fuzz on dnaLoc through
*       add_fuzziness_to_loc().
*
*       first selects the end: TRUE for the beginning of the location,
*       FALSE for its end.
*         interval - the beginning is if_from (if_to on the minus strand);
*                    if_from must hold a == 2 and if_to a == 1
*         point    - the single fuzz must hold a == 2 when first,
*                    a == 1 otherwise
*       On dnaLoc the "less" side is chosen the same way, flipped when
*       dnaLoc is on the minus strand.
*
*       returns TRUE if fuzz was found on aaLoc and add_fuzziness_to_loc()
*       was called, FALSE otherwise.
*
*****************************************************************************/
static Boolean load_fuzz_to_DNA(SeqLocPtr dnaLoc, SeqLocPtr aaLoc, Boolean first)
{
    IntFuzzPtr fuzz;
    SeqIntPtr aa_int;
    SeqPntPtr aa_pnt;
    Uint1 aa_strand;
    Boolean use_from;
    Boolean as_less;
    Boolean found = FALSE;

    aa_strand = SeqLocStrand(aaLoc);

    switch (aaLoc->choice)
    {
        case SEQLOC_INT:
            aa_int = (SeqIntPtr) aaLoc->data.ptrvalue;

            /* which side of the interval is the requested end */
            if (aa_strand == Seq_strand_minus)
                use_from = (Boolean)(! first);
            else
                use_from = first;

            if (use_from)
            {
                fuzz = aa_int->if_from;
                found = (Boolean)((fuzz != NULL) && (fuzz->choice == 4) &&
                                  (fuzz->a == 2));
            }
            else
            {
                fuzz = aa_int->if_to;
                found = (Boolean)((fuzz != NULL) && (fuzz->choice == 4) &&
                                  (fuzz->a == 1));
            }
            break;

        case SEQLOC_PNT:
            aa_pnt = (SeqPntPtr) aaLoc->data.ptrvalue;
            fuzz = aa_pnt->fuzz;
            if ((fuzz != NULL) && (fuzz->choice == 4))
            {
                if (first)
                    found = (Boolean)(fuzz->a == 2);
                else
                    found = (Boolean)(fuzz->a == 1);
            }
            break;

        default:
            break;
    }

    if (! found)
        return FALSE;

    /* the beginning of a minus strand location is its high coordinate */
    if (SeqLocStrand(dnaLoc) == Seq_strand_minus)
        as_less = (Boolean)(! first);
    else
        as_less = first;

    add_fuzziness_to_loc (dnaLoc, as_less);
    return TRUE;
}
4227
4228 /******************************************************************
4229 *
4230 * aaLoc_to_dnaLoc(sfp, aa_loc)
4231 * map a SeqLoc on the amino acid sequence
4232 * to a Seq-loc in the DNA sequence
4233 * through a CdRegion feature
4234 *
4235 * This now calls the more general productLoc_to_locationLoc(sfp, productLoc)
4236 *
4237 ******************************************************************/
aaLoc_to_dnaLoc(SeqFeatPtr sfp,SeqLocPtr aa_loc)4238 NLM_EXTERN SeqLocPtr LIBCALL aaLoc_to_dnaLoc(SeqFeatPtr sfp, SeqLocPtr aa_loc)
4239 {
4240 return productLoc_to_locationLoc(sfp, aa_loc);
4241 }
4242
4243 /******************************************************************
4244 *
4245 * aaLoc_to_dnaLoc(sfp, productLoc)
4246 * map a SeqLoc on the product sequence
4247 * to a Seq-loc in the location sequence
4248 * through a feature.
4249 *
4250 * if the feature is a CdRegion, converts by modulo 3
4251 * to support aaLoc_to_dnaLoc() function
4252 *
4253 ******************************************************************/
productLoc_to_locationLoc(SeqFeatPtr sfp,SeqLocPtr productLoc)4254 NLM_EXTERN SeqLocPtr LIBCALL productLoc_to_locationLoc(SeqFeatPtr sfp, SeqLocPtr productLoc)
4255 {
4256 SeqLocPtr head = NULL, slp, tmp, next;
4257 Int4 product_start, product_stop;
4258 SeqBondPtr sbp;
4259 ValNode vn;
4260 Boolean is_cdregion = FALSE;
4261 Boolean partial5, partial3;
4262
4263 if ((sfp == NULL) || (productLoc == NULL)) return head;
4264 if (sfp->data.choice == 3) is_cdregion = TRUE;
4265 if (sfp->product == NULL) return head;
4266 if (! (SeqIdForSameBioseq(SeqLocId(productLoc), SeqLocId(sfp->product))))
4267 return head;
4268
4269 if (productLoc->choice == SEQLOC_BOND) /* fake this one in */
4270 {
4271 sbp = (SeqBondPtr)(productLoc->data.ptrvalue);
4272 tmp = productInterval_to_locationIntervals(sfp, sbp->a->point, sbp->a->point, FALSE);
4273 if (sbp->b == NULL) /* one point in bond */
4274 return tmp;
4275
4276 SeqLocAdd(&head, tmp, TRUE, FALSE);
4277 tmp = productInterval_to_locationIntervals(sfp, sbp->b->point, sbp->b->point, FALSE);
4278 if (tmp == NULL)
4279 return head;
4280
4281 vn.choice = SEQLOC_NULL; /* make a mix with an internal NULL */
4282 vn.next = NULL;
4283 vn.data.ptrvalue = NULL;
4284
4285 SeqLocAdd(&head, &vn, TRUE, TRUE); /* copy it in */
4286 SeqLocAdd(&head, tmp, TRUE, FALSE); /* put real 3 base int in */
4287
4288 goto ret;
4289 }
4290
4291 CheckSeqLocForPartial (productLoc, &partial5, &partial3);
4292 slp = NULL;
4293 while ((slp = SeqLocFindNext(productLoc, slp)) != NULL)
4294 {
4295 product_start = SeqLocStart(slp);
4296 product_stop = SeqLocStop(slp);
4297 if ((product_start >= 0) && (product_stop >= 0))
4298 {
4299 tmp = productInterval_to_locationIntervals(sfp, product_start, product_stop, partial5);
4300 if(tmp != NULL)
4301 load_fuzz_to_DNA(tmp, slp, TRUE);
4302 while (tmp != NULL)
4303 {
4304 next = tmp->next;
4305 tmp->next = NULL;
4306 if(next == NULL)
4307 load_fuzz_to_DNA(tmp, slp, FALSE);
4308 SeqLocAdd(&head, tmp, TRUE, FALSE);
4309 tmp = next;
4310 }
4311 } else if (slp->choice == SEQLOC_NULL) {
4312 vn.choice = SEQLOC_NULL; /* make a mix with an internal NULL */
4313 vn.next = NULL;
4314 vn.data.ptrvalue = NULL;
4315 SeqLocAdd(&head, &vn, TRUE, TRUE);
4316 }
4317 }
4318 ret:
4319 return SeqLocPackage(head);
4320 }
4321
4322 /******************************************************************
4323 *
4324 * aaFeatLoc_to_dnaFeatLoc(sfp, aa_loc)
4325 * map a SeqLoc on the amino acid sequence
4326 * to a Seq-loc in the DNA sequence
4327 * through a CdRegion feature
4328 *
4329 * uses aaLoc_to_dnaLoc() but does additional checks to
4330 * extend dnaLoc at either end to compensate for positions in
4331 * the dna which do not corresspond to the amino acid sequence
4332 * (partial codons which are not translated).
4333 *
4334 ******************************************************************/
aaFeatLoc_to_dnaFeatLoc(SeqFeatPtr sfp,SeqLocPtr aa_loc)4335 NLM_EXTERN SeqLocPtr LIBCALL aaFeatLoc_to_dnaFeatLoc(SeqFeatPtr sfp,
4336 SeqLocPtr aa_loc)
4337 {
4338 SeqLocPtr dnaLoc = NULL;
4339 Uint2 dnaPartial;
4340 Int4 aaPos;
4341 SeqLocPtr tmp1 = NULL, tmp2 = NULL, tmp;
4342 SeqIdPtr sip;
4343 CdRegionPtr crp;
4344 SeqIntPtr sp1, sp2;
4345 BioseqPtr bsp;
4346 Boolean aa_partialn, aa_partialc;
4347
4348 dnaLoc = aaLoc_to_dnaLoc(sfp, aa_loc);
4349 if (dnaLoc == NULL) return dnaLoc;
4350
4351 if (! sfp->partial) /* no partial checks needed */
4352 return dnaLoc;
4353
4354
4355 CheckSeqLocForPartial (aa_loc, &aa_partialn, &aa_partialc);
4356 crp = (CdRegionPtr)(sfp->data.value.ptrvalue);
4357
4358 aaPos = SeqLocStart(aa_loc);
4359 if ((! aaPos) && (crp->frame > 1) && aa_partialn) /* using first amino acid */
4360 {
4361 tmp1 = SeqLocFindNext(sfp->location, NULL);
4362 tmp2 = SeqLocFindNext(dnaLoc, NULL);
4363
4364 if ((tmp1->choice == SEQLOC_INT) &&
4365 (tmp2->choice == SEQLOC_INT))
4366 {
4367 sp1 = (SeqIntPtr)(tmp1->data.ptrvalue);
4368 sp2 = (SeqIntPtr)(tmp2->data.ptrvalue);
4369 if (sp1->strand == Seq_strand_minus)
4370 {
4371 sp2->to = sp1->to; /* add partial codon */
4372 }
4373 else
4374 {
4375 sp2->from = sp1->from;
4376 }
4377 }
4378 }
4379
4380 dnaPartial = SeqLocPartialCheck(sfp->location);
4381 if ((dnaPartial & SLP_STOP) && aa_partialc) /* missing 3' end of cdregion */
4382 {
4383 sip = SeqLocId(aa_loc);
4384 bsp = BioseqFindCore(sip);
4385 if (bsp != NULL)
4386 {
4387 aaPos = SeqLocStop(aa_loc);
4388 if (aaPos == (bsp->length - 1)) /* last amino acid */
4389 {
4390 tmp = NULL;
4391 while ((tmp = SeqLocFindNext(sfp->location,tmp)) != NULL)
4392 {
4393 tmp1 = tmp;
4394 }
4395 tmp = NULL;
4396 while ((tmp = SeqLocFindNext(dnaLoc,tmp)) != NULL)
4397 {
4398 tmp2 = tmp;
4399 }
4400
4401 if (tmp1 != NULL && tmp2 != NULL && (tmp1->choice == SEQLOC_INT) &&
4402 (tmp2->choice == SEQLOC_INT))
4403 {
4404 sp1 = (SeqIntPtr)(tmp1->data.ptrvalue);
4405 sp2 = (SeqIntPtr)(tmp2->data.ptrvalue);
4406 if (sp1->strand == Seq_strand_minus)
4407 {
4408 sp2->from = sp1->from; /* add partial codon */
4409 }
4410 else
4411 {
4412 sp2->to = sp1->to;
4413 }
4414 }
4415 }
4416
4417 }
4418 }
4419 return dnaLoc;
4420 }
4421
4422
4423 static SeqLocPtr
NucLocFromProtInterval(SeqFeatPtr cds,Int4 prot_start,Int4 prot_stop,Boolean n_partial)4424 NucLocFromProtInterval
4425 (SeqFeatPtr cds,
4426 Int4 prot_start,
4427 Int4 prot_stop,
4428 Boolean n_partial)
4429 {
4430 CdRegionPtr crp;
4431 Int4 aa_before = 0, nt_this, prev_nt = 0, part_codon;
4432 SeqLocPtr result = NULL;
4433 SeqLocPtr slp = NULL; /* used for iterating through locations in the coding region */
4434 SeqLocPtr loc; /* used for creating interval on NT sequence */
4435 Boolean first_loc = TRUE;
4436 Int4 cds_int_start, cds_int_stop, cds_int_len;
4437 Int4 frame_start = 0;
4438 Int4 aa_int_start = 0, aa_int_stop = 0, aa_len, this_aa, aa_needed, aa_unneeded, aa_accumulated = 0;
4439 Int4 aa_from_this_interval;
4440 Uint1 strand;
4441
4442 if (cds == NULL || cds->data.choice != SEQFEAT_CDREGION || prot_start < 0 || prot_stop < prot_start) {
4443 return NULL;
4444 }
4445
4446 crp = (CdRegionPtr) cds->data.value.ptrvalue;
4447 if (crp == NULL) {
4448 return NULL;
4449 }
4450 if (crp->frame > 1) {
4451 frame_start = crp->frame - 1;
4452 }
4453
4454 aa_len = prot_stop - prot_start + 1;
4455
4456 while((slp = SeqLocFindNext(cds->location, slp)) != NULL) {
4457 cds_int_len = SeqLocLen (slp);
4458 cds_int_start = SeqLocStart (slp);
4459 cds_int_stop = SeqLocStop (slp);
4460 strand = SeqLocStrand (slp);
4461
4462 if (first_loc) {
4463 if (strand == Seq_strand_minus) {
4464 cds_int_stop -= frame_start;
4465 } else {
4466 cds_int_start += frame_start;
4467 }
4468 cds_int_len -= frame_start;
4469 }
4470
4471 /* calculate the number of NT that "count" for this interval -
4472 * don't include the NT in a partial codon at the beginning of
4473 * of the feature, but do include NT from a partial codon at
4474 * the end of the previous interval.
4475 */
4476 nt_this = cds_int_len + prev_nt;
4477 part_codon = nt_this % 3;
4478 nt_this -= part_codon;
4479
4480 /* calculate how many AA are covered by this interval */
4481 this_aa = nt_this / 3;
4482
4483 if (aa_before + this_aa >= prot_start) {
4484
4485 /* figure out whether to take all of this interval, or just part of it */
4486 aa_from_this_interval = this_aa;
4487
4488 /* 5' end (left for plus strand, right for minus) */
4489 if (aa_before < prot_start) {
4490 /* skip some at the beginning */
4491 aa_unneeded = prot_start - aa_before;
4492 aa_from_this_interval -= aa_unneeded;
4493
4494 if (strand == Seq_strand_minus) {
4495 aa_int_stop = cds_int_stop + prev_nt - (3 * aa_unneeded);
4496 } else {
4497 aa_int_start = cds_int_start - prev_nt + (3 * aa_unneeded);
4498 }
4499 } else {
4500 /* start at the beginning */
4501 if (strand == Seq_strand_minus) {
4502 aa_int_stop = cds_int_stop;
4503 if (first_loc) {
4504 if (n_partial) {
4505 /* put frame shift back in, if first loc and n-partial */
4506 aa_int_stop += frame_start;
4507 } else if (aa_before == prot_start) {
4508 /* starts in this interval, but after "remainder" of previous codon */
4509 aa_int_stop -= prev_nt;
4510 }
4511 }
4512 } else {
4513 aa_int_start = cds_int_start;
4514 if (first_loc) {
4515 if (n_partial) {
4516 /* put frame shift back in, if first loc and n-partial */
4517 aa_int_start -= frame_start;
4518 } else if (aa_before == prot_start) {
4519 /* starts in this interval, but after "remainder" of previous codon */
4520 aa_int_start += prev_nt;
4521 }
4522 }
4523 }
4524 }
4525
4526 /* 3' end (right for plus strand, left for minus) */
4527 if (aa_accumulated + aa_from_this_interval < aa_len) {
4528 if (strand == Seq_strand_minus) {
4529 aa_int_start = cds_int_start;
4530 } else {
4531 aa_int_stop = cds_int_stop;
4532 }
4533 } else {
4534 /* just take the part that we need */
4535 aa_needed = aa_len - aa_accumulated;
4536 aa_unneeded = aa_from_this_interval - aa_needed;
4537
4538 if (strand == Seq_strand_minus) {
4539 aa_int_start = cds_int_start + part_codon + (3 * aa_unneeded);
4540 } else {
4541 aa_int_stop = cds_int_stop - part_codon - (3 * aa_unneeded);
4542 }
4543 aa_from_this_interval -= aa_unneeded;
4544 }
4545
4546 /* note - if aa_int_start > aa_int_stop, that means we eliminated
4547 * both ends of the interval.
4548 */
4549 if (aa_int_start <= aa_int_stop) {
4550 /* aa_accumulated now includes the number of complete codons that have
4551 * been accounted for (not counting a partial codon at the end of this
4552 * interval, if any
4553 */
4554 aa_accumulated += aa_from_this_interval;
4555
4556 /* add interval to result */
4557 loc = SeqLocIntNew(aa_int_start, aa_int_stop, strand, SeqLocId(slp));
4558 SeqLocAdd(&result, loc, TRUE, FALSE);
4559 }
4560 }
4561
4562 first_loc = FALSE;
4563 aa_before += this_aa;
4564 prev_nt = part_codon;
4565
4566 if (aa_before > prot_stop) {
4567 break;
4568 }
4569 }
4570
4571 return result;
4572 }
4573
4574
NaLocFromNaInterval(SeqFeatPtr sfp,Int4 product_start,Int4 product_stop)4575 static SeqLocPtr NaLocFromNaInterval (SeqFeatPtr sfp, Int4 product_start, Int4 product_stop)
4576 {
4577 SeqLocPtr slp = NULL;
4578 SeqLocPtr location_loc, loc; /*for the sfp.location location*/
4579
4580 Boolean is_end; /**is the end for process reached?**/
4581 Int4 p_start=0, p_stop=0; /**product sequence start & stop in defined
4582 corresponding sfp.product **/
4583 Int4 cur_pos; /**current sfp.product sequence position in process**/
4584 Int4 product_len; /**length of the sfp.product **/
4585
4586 Int4 d_start, d_stop; /*the start and the stop of the sfp.location sequence*/
4587 Int4 offset; /*offset from the start of the current exon*/
4588 Int4 aa_len;
4589 Uint1 strand;
4590 Int4 p_end_pos; /*the end of the product sequence in the current loc*/
4591
4592 cur_pos= product_start;
4593 product_len = 0;
4594 is_end = FALSE;
4595 p_start = 0;
4596 slp = NULL;
4597 location_loc= NULL;
4598 while(!is_end && ((slp = SeqLocFindNext(sfp->location, slp))!=NULL))
4599 {
4600 product_len += SeqLocLen(slp);
4601 p_stop = product_len - 1;
4602
4603 p_end_pos = p_stop;
4604
4605 if(p_stop >= product_stop)
4606 {
4607 p_stop = product_stop; /**check if the end is reached**/
4608 is_end = TRUE;
4609 }
4610
4611 if(p_stop >= cur_pos) /*get the exon*/
4612 {
4613 offset = cur_pos - p_start;
4614
4615 strand = SeqLocStrand(slp);
4616 if(strand == Seq_strand_minus)
4617 d_start = SeqLocStop(slp) - offset;
4618 else
4619 d_start = SeqLocStart(slp) + offset;
4620
4621 d_stop = d_start;
4622
4623 aa_len = MIN(p_stop, product_stop) - cur_pos +1;
4624
4625 if(strand == Seq_strand_minus)
4626 {
4627 if(aa_len >= 0)
4628 {
4629 d_stop -= (aa_len - 1);
4630 }
4631 else
4632 {
4633 ++d_stop;
4634 }
4635
4636 d_stop = MAX(d_stop, SeqLocStart(slp));
4637 loc = SeqLocIntNew(d_stop, d_start, strand, SeqLocId(slp));
4638 }
4639 else
4640 {
4641 if(aa_len >= 0)
4642 {
4643 d_stop += (aa_len - 1);
4644 }
4645 else
4646 --d_stop;
4647
4648 d_stop = MIN(d_stop, SeqLocStop(slp));
4649 loc = SeqLocIntNew(d_start, d_stop, strand, SeqLocId(slp));
4650 }
4651 SeqLocAdd(&location_loc, loc, TRUE, FALSE);
4652
4653 cur_pos = p_stop+1;
4654 }
4655
4656 p_start = p_stop +1;
4657
4658 }/**end of while(slp && !is_end) **/
4659
4660 return location_loc;
4661 }
4662
4663 /******************************************************************
4664 *
4665 * productInterval_to_locationIntervals(sfp, product_start, product_stop)
4666 * map the amino acid sequence to a chain of Seq-locs in the
4667 * DNA sequence through a CdRegion feature
4668 *
4669 ******************************************************************/
4670 NLM_EXTERN SeqLocPtr LIBCALL
productInterval_to_locationIntervals(SeqFeatPtr sfp,Int4 product_start,Int4 product_stop,Boolean aa_partialn)4671 productInterval_to_locationIntervals
4672 (SeqFeatPtr sfp,
4673 Int4 product_start,
4674 Int4 product_stop,
4675 Boolean aa_partialn)
4676 {
4677
4678 if (sfp->data.choice == SEQFEAT_CDREGION) {
4679 return NucLocFromProtInterval (sfp, product_start, product_stop, aa_partialn);
4680 } else {
4681 return NaLocFromNaInterval (sfp, product_start, product_stop);
4682 }
4683
4684 }
4685
4686 static Boolean load_fuzz_to_DNA PROTO((SeqLocPtr dnaLoc, SeqLocPtr aaLoc,
4687 Boolean first));
4688 /******************************************************************
4689 *
4690 * dnaLoc_to_aaLoc(sfp, location_loc, merge)
4691 * map a SeqLoc on the DNA sequence
4692 * to a Seq-loc in the protein sequence
4693 * through a CdRegion feature
4694 * if (merge) adjacent intervals on the amino acid sequence
4695 * are merged into one. This should be the usual case.
4696 *
4697 ******************************************************************/
dnaLoc_to_aaLoc(SeqFeatPtr sfp,SeqLocPtr location_loc,Boolean merge,Int4Ptr frame,Boolean allowTerminator)4698 NLM_EXTERN SeqLocPtr LIBCALL dnaLoc_to_aaLoc(SeqFeatPtr sfp, SeqLocPtr location_loc, Boolean
4699 merge, Int4Ptr frame, Boolean allowTerminator)
4700 {
4701 SeqLocPtr aa_loc = NULL, loc;
4702 CdRegionPtr crp;
4703 Int4 product_len, end_pos, frame_offset;
4704 GatherRange gr;
4705 Int4 a_left = 0, a_right, last_aa = -20, aa_from, aa_to;
4706 Int4 cds_left, cds_right;
4707 SeqLocPtr slp;
4708 Int2 cmpval;
4709 SeqIdPtr aa_sip;
4710 BioseqPtr bsp;
4711 Boolean partial5, partial3;
4712 Uint1 strand;
4713
4714 if ((sfp == NULL) || (location_loc == NULL)) return aa_loc;
4715 if (sfp->data.choice != 3) return aa_loc;
4716 if (sfp->product == NULL) return aa_loc;
4717
4718 crp = (CdRegionPtr) sfp->data.value.ptrvalue;
4719 if(crp == NULL) return aa_loc;
4720
4721 /* location_loc must be equal or contained in feature */
4722 cmpval = SeqLocCompare(location_loc, sfp->location);
4723 if (! ((cmpval == SLC_A_IN_B) || (cmpval == SLC_A_EQ_B)))
4724 return aa_loc;
4725
4726 aa_sip = SeqLocId(sfp->product);
4727 if (aa_sip == NULL) return aa_loc;
4728 bsp = BioseqLockById(aa_sip);
4729 if (bsp == NULL) return aa_loc;
4730 end_pos = bsp->length - 1;
4731 BioseqUnlock(bsp);
4732
4733 if(crp->frame == 0)
4734 frame_offset = 0;
4735 else
4736 frame_offset = (Int4)crp->frame-1;
4737
4738 cds_left = SeqLocStart (sfp->location);
4739 cds_right = SeqLocStop (sfp->location);
4740
4741
4742 slp = NULL;
4743 product_len = 0;
4744 loc = NULL;
4745 while ((slp = SeqLocFindNext(sfp->location, slp))!=NULL)
4746 {
4747 if (SeqLocOffset(location_loc, slp, &gr, 0))
4748 {
4749 SeqLocOffset(slp, location_loc, &gr, 0);
4750
4751 a_left = gr.left + product_len;
4752 a_right = gr.right + product_len;
4753 if (frame_offset > 0) {
4754 a_left -= frame_offset;
4755 a_right -= frame_offset;
4756 }
4757
4758 if (a_left < 0)
4759 {
4760 CheckSeqLocForPartial (slp, &partial5, &partial3);
4761 strand = SeqLocStrand (slp);
4762 if ((partial5 && strand != Seq_strand_minus) || (partial3 && strand == Seq_strand_minus)) {
4763 a_left = gr.left;
4764 } else {
4765 a_left += 3;
4766 }
4767 }
4768 if (a_right > (bsp->length) * 3 - 1 && !allowTerminator) {
4769 CheckSeqLocForPartial (slp, &partial5, &partial3);
4770 strand = SeqLocStrand (slp);
4771 if (partial3 && a_right == bsp->length * 3) {
4772 /* it's ok, leave it alone */
4773 } else if ((partial5 && strand != Seq_strand_minus) || (partial3 && strand == Seq_strand_minus)) {
4774 a_right = (bsp->length * 3) - 1;
4775 } else {
4776 a_right -= 3;
4777 }
4778 }
4779
4780 aa_from = a_left / 3;
4781 aa_to = a_right / 3;
4782
4783 if (aa_to > end_pos && !allowTerminator)
4784 aa_to = end_pos;
4785
4786 if (merge)
4787 {
4788 if (aa_from <= last_aa) /* overlap due to codons */
4789 aa_from = last_aa+1; /* set up to merge */
4790 }
4791
4792 /* NOTE - if a_left is not <= a_right, then a correction for frame may have
4793 * caused the location to not actually be mappable to the protein sequence.
4794 */
4795 if ((aa_from <= aa_to || (allowTerminator && aa_from == aa_to + 1)) && a_left <= a_right)
4796 {
4797 if(loc != NULL)
4798 {
4799 if(aa_loc == NULL)
4800 load_fuzz_to_DNA(loc, location_loc, TRUE);
4801 SeqLocAdd(&aa_loc, loc, merge, FALSE);
4802 }
4803 loc = SeqLocIntNew(aa_from, aa_to, 0, aa_sip);
4804 last_aa = aa_to;
4805 }
4806 }
4807
4808 product_len += SeqLocLen(slp);
4809 }
4810
4811 if(loc != NULL)
4812 {
4813 if(aa_loc == NULL)
4814 load_fuzz_to_DNA(loc, location_loc, TRUE);
4815 load_fuzz_to_DNA(loc, location_loc, FALSE);
4816 SeqLocAdd(&aa_loc, loc, merge, FALSE);
4817 }
4818 if (frame != NULL)
4819 *frame = a_left % 3;
4820
4821 return SeqLocPackage(aa_loc);
4822 }
4823
4824 /*****************************************************************************
4825 *
4826 * BioseqHash(bsp)
4827 * Computes a (almost) unique hash code for a bioseq
4828 *
4829 *****************************************************************************/
BioseqHash(BioseqPtr bsp)4830 NLM_EXTERN Uint4 BioseqHash (BioseqPtr bsp)
4831 {
4832 Uint4 hashval = 0;
4833 SeqPortPtr spp;
4834 Uint1 code;
4835 Int2 residue;
4836
4837 if (bsp == NULL) return hashval;
4838
4839 if (ISA_na(bsp->mol))
4840 code = Seq_code_iupacna;
4841 else
4842 code = Seq_code_ncbieaa;
4843
4844 spp = SeqPortNew(bsp, 0, -1, 0, code);
4845 if (spp == NULL) return hashval;
4846
4847 while ((residue = SeqPortGetResidue(spp)) != SEQPORT_EOF)
4848 {
4849 hashval *= 1103515245;
4850 hashval += (Uint4)residue + 12345;
4851 }
4852
4853 SeqPortFree(spp);
4854
4855 return hashval;
4856 }
4857
4858
4859 /*-------------- BioseqRevComp () ---------------------------*/
4860 /***********************************************************************
4861 * BioseqRevComp: Takes the nucleic acid sequence from Bioseq
4862 * Entry and gives the reverse complement sequence in place
4863 * Does not change features.
4864 ************************************************************************/
BioseqRevComp(BioseqPtr bsp)4865 NLM_EXTERN Boolean LIBCALL BioseqRevComp (BioseqPtr bsp)
4866 {
4867 Boolean retval;
4868
4869 retval = BioseqReverse (bsp);
4870 if (retval)
4871 retval = BioseqComplement(bsp);
4872 return retval;
4873 }
4874
ComplementSeqData(Uint1 seqtype,Int4 seqlen,SeqDataPtr sdp)4875 NLM_EXTERN Boolean ComplementSeqData (Uint1 seqtype, Int4 seqlen, SeqDataPtr sdp)
4876 {
4877 SeqCodeTablePtr sctp;
4878 ByteStorePtr bysp;
4879 long readbyte, bslen;
4880 Uint1 byte = 0, byte_to, newbyte = 0, residue;
4881 Uint1 comp, bitctr, mask, lshift, rshift, bc;
4882
4883 if (seqtype == Seq_code_gap) return FALSE;
4884
4885 bysp = (ByteStorePtr) sdp;
4886 if (bysp == NULL)
4887 {
4888 ErrPostEx(SEV_ERROR,0,0, "Error: no sequence data\n");
4889 return FALSE;
4890 }
4891
4892 if ((sctp = SeqCodeTableFind (seqtype)) == NULL)
4893 {
4894 ErrPostEx(SEV_ERROR,0,0, "Can't open table\n");
4895 return FALSE;
4896 }
4897 switch (seqtype) /*determine type of base encoding*/
4898 {
4899 case Seq_code_ncbi2na:
4900 bc = 4;
4901 rshift = 6;
4902 lshift = 2;
4903 mask = 192;
4904 break;
4905
4906 case Seq_code_ncbi4na:
4907 bc = 2;
4908 rshift = 4;
4909 lshift = 4;
4910 mask = 240;
4911 break;
4912
4913 case Seq_code_iupacna:
4914 case Seq_code_ncbi8na:
4915 bc = 1;
4916 rshift = 0;
4917 lshift = 0;
4918 mask = 255;
4919 break;
4920 case Seq_code_iupacaa:
4921 case Seq_code_ncbi8aa:
4922 case Seq_code_ncbieaa:
4923 case Seq_code_ncbipaa:
4924 case Seq_code_iupacaa3:
4925 case Seq_code_ncbistdaa: /* ignore amino acid */
4926 ErrPostEx(SEV_ERROR,0,0, "Error: cannot complement aa ; No ->mol flag on Bioseq\n");
4927 return FALSE;
4928 case Seq_code_ncbipna:
4929 ErrPostEx(SEV_WARNING,0,0, "Error: Don't yet know how to complement profile\n");
4930 return FALSE;
4931 default:
4932 return FALSE;
4933 }
4934
4935 bslen = BSLen(bysp);
4936 bitctr = 0;
4937 readbyte = 0;
4938
4939 while (readbyte < bslen)
4940 {
4941 if (!bitctr)
4942 { /*get new byte*/
4943 BSSeek (bysp, readbyte, SEEK_SET);
4944 newbyte = byte_to = byte = residue = 0;
4945 byte = (Uint1)BSGetByte (bysp);
4946 bitctr = bc;
4947 readbyte++;
4948 }
4949
4950 for (; bitctr; bitctr--)
4951 {
4952 residue = byte & mask; /*mask out all but one base*/
4953 residue >>= rshift;
4954 byte <<= lshift;
4955
4956 comp = SeqCodeTableComp (sctp, residue); /*get
4957 complement*/
4958
4959 newbyte <<= lshift;
4960 byte_to = newbyte;
4961 newbyte = (comp | byte_to); /*put complements
4962 together*/
4963
4964 }
4965
4966 if (readbyte) /*put back byte with comps*/
4967 {
4968 BSSeek (bysp, readbyte-1, SEEK_SET);
4969 BSPutByte (bysp, newbyte);
4970 }
4971 }
4972 return TRUE;
4973
4974 }
4975
4976
DeltaBioseqComplement(BioseqPtr bsp)4977 static Boolean DeltaBioseqComplement (BioseqPtr bsp)
4978 {
4979 DeltaSeqPtr dsp;
4980 SeqLitPtr slip;
4981 Boolean rval = FALSE;
4982
4983 if (bsp == NULL || bsp->repr != Seq_repr_delta)
4984 {
4985 return rval;
4986 }
4987
4988 dsp = (DeltaSeqPtr) bsp->seq_ext;
4989 while (dsp != NULL)
4990 {
4991 if (dsp->choice != 2)
4992 {
4993 ErrPostEx(SEV_ERROR,0,0, "Error: Can't complement delta sequences with far locs\n");
4994 return FALSE;
4995 }
4996 dsp = dsp->next;
4997 }
4998 rval = TRUE;
4999 dsp = (DeltaSeqPtr) bsp->seq_ext;
5000 while (dsp != NULL)
5001 {
5002 slip = (SeqLitPtr) dsp->data.ptrvalue;
5003 /* complement data */
5004 if (slip->seq_data != NULL)
5005 {
5006 rval &= ComplementSeqData (slip->seq_data_type, slip->length, slip->seq_data);
5007 }
5008 dsp = dsp->next;
5009 }
5010 return rval;
5011 }
5012
5013
5014 /*-------------- BioseqComplement () ---------------------------*/
5015 /***********************************************************************
5016 * BioseqComplement: Takes the nucleic acid sequence from Bioseq
5017 * Entry and gives the complement sequence in place
5018 * Does not change features.
5019 ************************************************************************/
BioseqComplement(BioseqPtr bsp)5020 NLM_EXTERN Boolean LIBCALL BioseqComplement (BioseqPtr bsp)
5021 {
5022 Boolean rval = FALSE;
5023
5024 if (bsp == NULL)
5025 {
5026 ErrPostEx(SEV_ERROR,0,0, "Error: not a BioseqPtr\n");
5027 rval = FALSE;
5028 }
5029 else if (ISA_aa(bsp->mol))
5030 {
5031 ErrPostEx(SEV_ERROR,0,0, "Error: cannot complement aa\n");
5032 rval = FALSE;
5033 }
5034 else if (bsp->repr == Seq_repr_delta)
5035 {
5036 rval = DeltaBioseqComplement (bsp);
5037 }
5038 else if (bsp->repr == Seq_repr_raw)
5039 {
5040 rval = ComplementSeqData (bsp->seq_data_type, bsp->length, bsp->seq_data);
5041 }
5042 else
5043 {
5044 ErrPostEx(SEV_ERROR,0,0, "Error: not a raw or delta sequence\n");
5045 rval = FALSE;
5046 }
5047 return rval;
5048
5049 } /* BioseqComplement */
5050
5051
ReverseSeqData(Uint1 seqtype,Int4 seqlen,SeqDataPtr sdp)5052 NLM_EXTERN Boolean LIBCALL ReverseSeqData (Uint1 seqtype, Int4 seqlen, SeqDataPtr sdp)
5053 {
5054 ByteStorePtr bysp1, bysp2 = '\0';
5055 long readbyte, bslen = 0;
5056 Int4 count = 0;
5057 Uint1 byte = 0, byte2, byte_to = 0, byte_to2, newbyte = 0;
5058 Uint1 newbyte2, finalbyte, residue, residue2, bitctr, bc2 = 0;
5059 Uint1 bitctr2, mask, mask2, lshift, rshift, bc = 0, jagged;
5060
5061 if (seqtype == Seq_code_gap) return FALSE;
5062
5063 bysp1 = (ByteStorePtr) sdp;
5064
5065 if (bysp1 == NULL)
5066 {
5067 ErrPostEx(SEV_ERROR,0,0, "Error: No sequence data\n");
5068 return FALSE;
5069 }
5070
5071 switch (seqtype){
5072 case Seq_code_ncbi2na: /*bitshifts needed*/
5073 mask = 192;
5074 mask2 = 3;
5075 lshift = 2;
5076 rshift = 6;
5077 jagged = seqlen%4;
5078 switch (jagged) /*change if jagged last byte*/
5079 {
5080 case 1:
5081 bc = 1;
5082 bc2 = 3;
5083 break;
5084 case 2:
5085 bc = 2;
5086 bc2 = 2;
5087 break;
5088 case 3:
5089 bc = 3;
5090 bc2 = 1;
5091 break;
5092 default:
5093 bc = 4;
5094 bc2 = 0;
5095 break;
5096 }
5097 break;
5098 case Seq_code_ncbi4na:
5099 mask = 240;
5100 mask2 = 15;
5101 lshift = 4;
5102 rshift = 4;
5103 jagged = seqlen%2;
5104 switch (jagged)
5105 {
5106 case 1:
5107 bc = 1;
5108 bc2 = 1;
5109 break;
5110 default:
5111 bc = 2;
5112 bc2 = 0;
5113 break;
5114 }
5115 break;
5116 case Seq_code_iupacna:
5117 case Seq_code_ncbi8na:
5118
5119 case Seq_code_iupacaa:
5120 case Seq_code_ncbi8aa:
5121 case Seq_code_ncbieaa:
5122 case Seq_code_ncbistdaa:
5123 bc = 1;
5124 bc2 = 0;
5125 rshift = 0;
5126 lshift = 0;
5127 jagged = 0;
5128 mask = 255;
5129 mask2 = 0;
5130 break;
5131 case Seq_code_ncbipaa:
5132 case Seq_code_iupacaa3:
5133 ErrPostEx(SEV_ERROR,0,0, "Error: cannot reverse %s protein alphabet",(int)seqtype);
5134 return FALSE;
5135 case Seq_code_ncbipna:
5136 ErrPostEx(SEV_WARNING,0,0, "Error: Don't yet know how to reverse profile\n");
5137 default: /*ignores amino acid sequence*/
5138 return FALSE;
5139 }
5140 bysp2 = BSDup(bysp1);
5141 bslen = BSLen (bysp1);
5142 bitctr = bitctr2 = 0;
5143 readbyte = 0;
5144 count = 0;
5145
5146 if (!jagged) /*no jagged last byte*/
5147 {
5148 while ((readbyte != BSLen(bysp1)))
5149 {
5150 count = rshift;
5151 if (!bitctr) /*get new byte*/
5152 {
5153 newbyte = byte_to = byte = residue = 0;
5154 BSSeek (bysp2, --bslen, SEEK_SET);
5155 byte = (Uint1)BSGetByte (bysp2);
5156 bitctr = bc;
5157 readbyte++;
5158 }
5159
5160 for (;bitctr; bitctr--)
5161 {
5162 residue = byte & mask;
5163 residue >>= count;
5164 byte <<= lshift;
5165 count = count - lshift;
5166
5167 newbyte = (residue | byte_to);
5168 byte_to = newbyte;
5169 }
5170
5171 BSSeek (bysp1, readbyte-1, SEEK_SET);
5172 BSPutByte (bysp1, newbyte);
5173
5174 }
5175 }
5176 else /*jagged last byte*/
5177 {
5178 /*Gets two bytes prior to loop*/
5179 newbyte = newbyte2 = byte_to = byte_to2 = 0;
5180 byte2 = residue = residue2 = 0;
5181 BSSeek (bysp2, bslen-2, SEEK_SET);
5182 byte2 = (Uint1) BSGetByte (bysp2); /*byte closer to beginning*/
5183 byte = (Uint1) BSGetByte (bysp2);
5184 bitctr = bc;
5185 bitctr2 = bc2;
5186 bslen = bslen - 2;
5187 readbyte = 1;
5188
5189 while (readbyte != BSLen(bysp1))
5190 {
5191 count = rshift;
5192 if (!bitctr) /*when needed gets another
5193 byte*/
5194 {
5195 newbyte = newbyte2 = byte_to = byte_to2 = 0;
5196 byte2 = finalbyte = residue = residue2 = 0;
5197 BSSeek (bysp2, --bslen, SEEK_SET);
5198 byte2 = (Uint1) BSGetByte (bysp2);
5199 bitctr = bc;
5200 bitctr2 = bc2;
5201 ++readbyte;
5202 }
5203 for (; bitctr; bitctr--)
5204 {
5205 residue = byte & mask; /*reverses 1st
5206 byte*/
5207 residue >>= count;
5208 byte <<= lshift;
5209 byte_to = newbyte;
5210 newbyte = (residue | byte_to);
5211 count = count - lshift;
5212 }
5213 for (; bitctr2; bitctr2--)
5214 {
5215 residue2 = byte2 & mask2; /*reverses 2nd */
5216 byte2 >>= lshift; /*partially to
5217 join*/
5218 newbyte2 <<= lshift; /*with the 1st*/
5219 byte_to2 = newbyte2;
5220 newbyte2 = (residue2 | byte_to2);
5221 }
5222 newbyte <<= (8 - (bc*lshift)); /*joins 1st & 2nd
5223 bytes*/
5224 finalbyte = (newbyte | newbyte2);
5225 byte2 <<= (bc2 * lshift);
5226 byte = byte2;
5227
5228 BSSeek (bysp1, readbyte-1, SEEK_SET);
5229 BSPutByte (bysp1, finalbyte);
5230 }
5231 }
5232 BSFree(bysp2);
5233 return TRUE;
5234 } /* ReverseSeqData */
5235
5236
DeltaBioseqReverse(BioseqPtr bsp)5237 static Boolean DeltaBioseqReverse (BioseqPtr bsp)
5238 {
5239 DeltaSeqPtr dsp, next_dsp, newchain = NULL;
5240 SeqLitPtr slip;
5241 Boolean rval = FALSE;
5242
5243 if (bsp == NULL || bsp->repr != Seq_repr_delta)
5244 {
5245 return rval;
5246 }
5247
5248 dsp = (DeltaSeqPtr) bsp->seq_ext;
5249 while (dsp != NULL)
5250 {
5251 if (dsp->choice != 2)
5252 {
5253 ErrPostEx(SEV_ERROR,0,0, "Error: Can't reverse delta sequences with far locs\n");
5254 return FALSE;
5255 }
5256 dsp = dsp->next;
5257 }
5258
5259 dsp = (DeltaSeqPtr) bsp->seq_ext;
5260 rval = TRUE;
5261 while (dsp != NULL)
5262 {
5263 slip = (SeqLitPtr) dsp->data.ptrvalue;
5264 /* reverse data */
5265 if (slip->seq_data != NULL)
5266 {
5267 rval &= ReverseSeqData (slip->seq_data_type, slip->length, slip->seq_data);
5268 }
5269
5270 /* reverse the chain */
5271 next_dsp = dsp->next;
5272 dsp->next = newchain;
5273 newchain = dsp;
5274
5275 dsp = next_dsp;
5276 }
5277 bsp->seq_ext = newchain;
5278 return rval;
5279 }
5280
5281 /*-------------- BioseqReverse () ---------------------------*/
5282 /***********************************************************************
5283 * BioseqReverse: Takes nucleic acid sequence from Bioseq Entry and
5284 * reverses the whole sequence in place
5285 * Does not change features.
5286 ************************************************************************/
BioseqReverse(BioseqPtr bsp)5287 NLM_EXTERN Boolean LIBCALL BioseqReverse (BioseqPtr bsp)
5288 {
5289 Boolean rval;
5290
5291 if (bsp == NULL)
5292 {
5293 ErrPostEx(SEV_ERROR,0,0, "Error: not a BioseqPtr\n");
5294 rval = FALSE;
5295 }
5296 else if (bsp->repr == Seq_repr_delta)
5297 {
5298 rval = DeltaBioseqReverse (bsp);
5299 }
5300 else if (bsp->repr == Seq_repr_raw)
5301 {
5302 rval = ReverseSeqData (bsp->seq_data_type, bsp->length, bsp->seq_data);
5303 }
5304 else
5305 {
5306 ErrPostEx(SEV_ERROR,0,0, "Error: not a raw or delta sequence\n");
5307 rval = FALSE;
5308 }
5309
5310 return rval;
5311 } /* BioseqReverse */
5312
/* Size in bytes of the initial SPCompress buffer (SPCompressNew) and of */
/* each increment by which SPCompressWrite enlarges it.                  */
#define SPC_BUFF_CHUNK 1024
5314
5315 /*****************************************************************************
5316 *
5317 * ContigRevComp
5318 *
5319 *****************************************************************************/
5320
SegRevComp(BioseqPtr bsp)5321 static Boolean SegRevComp (BioseqPtr bsp)
5322
5323 {
5324 ValNodePtr head = NULL;
5325 Int4 from, to, tmp;
5326 Boolean partial5, partial3;
5327 SeqIntPtr sintp;
5328 SeqLocPtr slp;
5329 ValNode vn;
5330 ValNodePtr vnp;
5331
5332 MemSet ((Pointer) &vn, 0, sizeof (ValNode));
5333 vn.choice = SEQLOC_MIX;
5334 vn.data.ptrvalue = bsp->seq_ext;
5335
5336 /* get each location component */
5337
5338 slp = SeqLocFindNext (&vn, NULL);
5339 while (slp != NULL) {
5340
5341 /* copy component, reversing strand */
5342
5343 vnp = NULL;
5344 if (slp->choice == SEQLOC_NULL) {
5345
5346 vnp = ValNodeAddPointer (NULL, SEQLOC_NULL, NULL);
5347
5348 } else if (slp->choice == SEQLOC_INT) {
5349
5350 sintp = (SeqIntPtr) slp->data.ptrvalue;
5351 if (sintp != NULL) {
5352 CheckSeqLocForPartial (slp, &partial5, &partial3);
5353 from = sintp->from;
5354 to = sintp->to;
5355 if (sintp->strand != Seq_strand_minus) {
5356 tmp = from;
5357 from = to;
5358 to = tmp;
5359 }
5360 vnp = AddIntervalToLocation (NULL, sintp->id, from, to, partial3, partial5);
5361 }
5362
5363 }
5364
5365 /* save in new list in reverse order */
5366
5367 if (vnp != NULL) {
5368 vnp->next = head;
5369 head = vnp;
5370 }
5371
5372 slp = SeqLocFindNext (&vn, slp);
5373 }
5374
5375 if (head == NULL) return FALSE;
5376
5377 bsp->seq_ext = SeqLocSetFree ((ValNodePtr) bsp->seq_ext);
5378 bsp->seq_ext = head;
5379
5380 bsp->hist = SeqHistFree (bsp->hist);
5381
5382 return TRUE;
5383 }
5384
DeltaRevComp(BioseqPtr bsp)5385 static Boolean DeltaRevComp (BioseqPtr bsp)
5386
5387 {
5388 DeltaSeqPtr dsp, dspnext;
5389 ValNodePtr head = NULL;
5390 Int4 from, to, tmp;
5391 Boolean partial5, partial3;
5392 SeqIntPtr sintp;
5393 SeqLocPtr slp;
5394 SeqLitPtr slitp, slip;
5395 ValNodePtr vnp;
5396
5397 for (dsp = (DeltaSeqPtr) bsp->seq_ext; dsp != NULL; dsp = dsp->next) {
5398 vnp = NULL;
5399
5400 if (dsp->choice == 1) {
5401
5402 slp = (SeqLocPtr) dsp->data.ptrvalue;
5403 if (slp != NULL) {
5404
5405 if (slp->choice == SEQLOC_NULL) {
5406
5407 vnp = ValNodeAddPointer (NULL, SEQLOC_NULL, NULL);
5408
5409 } else if (slp->choice == SEQLOC_INT) {
5410
5411 sintp = (SeqIntPtr) slp->data.ptrvalue;
5412 if (sintp != NULL) {
5413 CheckSeqLocForPartial (slp, &partial5, &partial3);
5414 from = sintp->from;
5415 to = sintp->to;
5416 if (sintp->strand != Seq_strand_minus) {
5417 tmp = from;
5418 from = to;
5419 to = tmp;
5420 }
5421 vnp = AddIntervalToLocation (NULL, sintp->id, from, to, partial3, partial5);
5422 }
5423 }
5424 }
5425
5426 } else if (dsp->choice == 2) {
5427
5428 slitp = (SeqLitPtr) dsp->data.ptrvalue;
5429 if (slitp != NULL && slitp->seq_data == NULL) {
5430 slip = SeqLitNew ();
5431 if (slip != NULL) {
5432 slip->length = slitp->length;
5433 /* not copying fuzz */
5434 slip->seq_data_type = slitp->seq_data_type;
5435 vnp = ValNodeAddPointer (NULL, 2, (Pointer) slip);
5436 }
5437 } else {
5438 ValNodeFree (head);
5439 return FALSE;
5440 }
5441 }
5442
5443 /* save in new list in reverse order */
5444
5445 if (vnp != NULL) {
5446 vnp->next = head;
5447 head = vnp;
5448 }
5449 }
5450
5451 if (head == NULL) return FALSE;
5452
5453 dsp = (DeltaSeqPtr) bsp->seq_ext;
5454 while (dsp != NULL) {
5455 dspnext = dsp->next;
5456 dsp->next = NULL;
5457 DeltaSeqFree (dsp);
5458 dsp = dsp->next;
5459 }
5460 bsp->seq_ext = head;
5461
5462 bsp->hist = SeqHistFree (bsp->hist);
5463
5464 return TRUE;
5465 }
5466
ContigRevComp(BioseqPtr bsp)5467 NLM_EXTERN Boolean LIBCALL ContigRevComp (BioseqPtr bsp)
5468
5469 {
5470 if (bsp == NULL) {
5471 ErrPostEx (SEV_ERROR, 0, 0, "ContigRevComp: empty BioseqPtr");
5472 return FALSE;
5473 }
5474
5475 if (bsp->repr == Seq_repr_seg && bsp->seq_ext_type == 1 && bsp->seq_ext != NULL) {
5476 return SegRevComp (bsp);
5477 }
5478 if (bsp->repr == Seq_repr_delta && bsp->seq_ext_type == 4 && bsp->seq_ext != NULL) {
5479 return DeltaRevComp (bsp);
5480 }
5481
5482 ErrPostEx (SEV_ERROR, 0, 0, "ContigRevComp: not a segmented or delta BioseqPtr");
5483 return FALSE;
5484 }
5485
5486 /*****************************************************************************
5487 *
5488 * SPCompressNew(void); - allocated memory for SPCompress structure
5489 *
5490 *****************************************************************************/
SPCompressNew(void)5491 NLM_EXTERN SPCompressPtr SPCompressNew(void)
5492 {
5493 SPCompressPtr spc;
5494
5495 spc = (SPCompressPtr) MemNew(sizeof(SPCompress));
5496 spc->buffer = (Uint1Ptr) MemNew(SPC_BUFF_CHUNK);
5497 spc->allocated = SPC_BUFF_CHUNK;
5498 spc->residues = 0;
5499 spc->lbytes = NULL;
5500
5501 return spc;
5502 }
5503 /*****************************************************************************
5504 *
5505 * SPCompressFree(SPCompressPtr spc); - free SPCompress structure
5506 *
5507 *****************************************************************************/
SPCompressFree(SPCompressPtr spc)5508 NLM_EXTERN void SPCompressFree(SPCompressPtr spc)
5509 {
5510
5511 MemFree(spc->buffer);
5512 MemFree(spc->lbytes);
5513 MemFree(spc);
5514
5515 }
/*****************************************************************************
*
*   Int4 SPCompressRead (Pointer data, Uint1Ptr buf, Int4 length);
*       Hook read-function for SPCompressDNA()
*
*       data   - the SeqPort to read from
*       buf    - output buffer, cleared first, then filled with two
*                residues per byte (first residue in the high 4 bits)
*       length - size of buf in bytes
*
*       Reading stops when buf is full or the port returns SEQPORT_EOF.
*       SEQPORT_VIRT values are skipped.  Returns the number of
*       residues stored, or -1 after posting a warning if any other
*       non-residue value is read.
*
*****************************************************************************/
static Int4 SPCompressRead (Pointer data, Uint1Ptr buf, Int4 length);
static Int4 SPCompressRead (Pointer data, Uint1Ptr buf, Int4 length)
{
    SeqPortPtr port = (SeqPortPtr) data;
    Uint1      res = 0;
    Int4       stored = 0;
    Int4       pos = 0;
    Boolean    low_half = FALSE;   /* TRUE when the high 4 bits of buf[pos] are filled */

    MemSet(buf, 0, length);  /* Clear buffer first */

    for (;;) {
        if (pos >= length)
            break;

        res = SeqPortGetResidue(port);
        if (res == SEQPORT_EOF)
            break;

        if (! IS_residue(res)) {
            if (res == SEQPORT_VIRT)
                continue;          /* no sequence here, just move on */
            ErrPostEx(SEV_WARNING, 0, 0,"[Bad residue]\n");
            return -1;
        }

        if (low_half) {
            /* second residue of the pair completes the byte */
            buf[pos] += res;
            pos++;
            low_half = FALSE;
        } else {
            /* first residue of the pair goes into the high 4 bits */
            res <<= 4;
            buf[pos] += res;
            low_half = TRUE;
        }
        stored++;
    }

    return stored;
}
5555
/*****************************************************************************
*
*   Int4 SPCompressWrite (Pointer data, Uint1Ptr buf, Int4 length);
*       Hook write-function for SPCompressDNA()
*
*       data   - the SPCompress structure to append to
*       buf    - bytes to append
*       length - number of bytes in buf
*
*       Appends buf to spc->buffer, first enlarging the buffer in
*       SPC_BUFF_CHUNK steps until used + length is strictly below the
*       allocated size.  Returns length on success, or -1 if spc is
*       NULL, length is negative, the buffer cannot be enlarged (the
*       existing buffer is then kept), or the copy fails.
*
*****************************************************************************/
static Int4 SPCompressWrite (Pointer data, Uint1Ptr buf, Int4 length);
static Int4 SPCompressWrite (Pointer data, Uint1Ptr buf, Int4 length)
{
    SPCompressPtr spc;
    Uint1Ptr      grown;
    Int4          wanted;   /* assumes used/allocated fit in Int4 - TODO confirm field types */

    spc = (SPCompressPtr) data;
    if (spc == NULL || length < 0)
        return -1;

    if((spc->used + length) >= spc->allocated) {
        /* a single chunk is not enough when length exceeds */
        /* SPC_BUFF_CHUNK, so keep adding chunks until it fits */
        wanted = spc->allocated;
        while ((spc->used + length) >= wanted) {
            wanted += SPC_BUFF_CHUNK;
        }

        /* keep the old pointer until the new block is known to be valid */
        grown = (Uint1Ptr)Realloc(spc->buffer, wanted);
        if (grown == NULL)
            return -1;

        spc->buffer = grown;
        spc->allocated = wanted;
    }

    if((MemCpy(spc->buffer + spc->used, buf, length)) == NULL)
        return -1;

    spc->used += length;

    return length;
}
5581
5582 /*****************************************************************************
5583 *
5584 * SPRebuildDNA(SPCompressPtr spc);
5585 * translates spc ncbi2na encoding buffer into
5586 * spc ncbi4na encoding buffer with rebuild ambiguities
5587 *
5588 * spc - must be valid SPCompress structure returned
5589 * from SPCompressDNA() function in ncbi2na encoding
5590 *
5591 *****************************************************************************/
SPRebuildDNA(SPCompressPtr spc)5592 NLM_EXTERN Boolean SPRebuildDNA(SPCompressPtr spc)
5593 {
5594 ByteStorePtr bsp, bsp_plain;
5595 Int4 residues;
5596
5597 if(spc == NULL || spc->type != Seq_code_ncbi2na)
5598 return FALSE;
5599
5600 residues = (spc->used-1)*4 + (spc->buffer[spc->used-1] & 0x3);
5601 bsp = BSNew(spc->used);
5602 BSWrite(bsp, spc->buffer, spc->used);
5603
5604 if((bsp_plain = BSConvertSeq(bsp, Seq_code_ncbi4na,
5605 Seq_code_ncbi2na, residues)) == NULL) {
5606 return FALSE;
5607 }
5608
5609 BSRebuildDNA_4na(bsp_plain, spc->lbytes);
5610
5611 spc->buffer = (Uint1Ptr) Realloc(spc->buffer, residues/2+1);
5612 BSRead(bsp_plain, spc->buffer, residues/2+1);
5613 spc->type = Seq_code_ncbi4na;
5614 spc->residues = residues;
5615 BSFree(bsp_plain);
5616
5617 return TRUE;
5618 }
5619
5620 /*****************************************************************************
5621 *
5622 * SPCompressDNA(SeqPortPtr spp);
5623 * converts a ncbi4na taken from spp into ncbi2na
5624 * buffer stored inside SPCompress structue together
5625 * with ambiguity information
5626 * returns pointer SPCompress structure or NULL if error
5627 *
5628 * NOTE: In this function we do not know - what is length
5629 * of sequence to compress. Terminated flag for this
5630 * function is SEQPORT_EOF returned from spp.
5631 *
5632 *****************************************************************************/
SPCompressDNA(SeqPortPtr spp)5633 NLM_EXTERN SPCompressPtr SPCompressDNA(SeqPortPtr spp)
5634 {
5635 SPCompressPtr spc;
5636
5637 if (spp == NULL || spp->newcode != Seq_code_ncbi4na)
5638 return NULL;
5639
5640 spc = SPCompressNew();
5641 if(!GenericCompressDNA((VoidPtr) spp, (VoidPtr) spc,
5642 (Uint4) -1, /* Length of sequence unknown */
5643 SPCompressRead,
5644 SPCompressWrite,
5645 &spc->lbytes
5646 )) {
5647 return NULL;
5648 }
5649 spc->type = Seq_code_ncbi2na;
5650 return spc;
5651 }
5652
5653 /*****************************************************************************
5654 *
5655 * ComposeCodonsRecognizedString (trna, buf, buflen);
5656 * Copies codon recognized string to buf, returns number of codons
5657 *
5658 *****************************************************************************/
5659
SortCodonByName(VoidPtr ptr1,VoidPtr ptr2)5660 static int LIBCALLBACK SortCodonByName (VoidPtr ptr1, VoidPtr ptr2)
5661
5662 {
5663 CharPtr str1;
5664 CharPtr str2;
5665 ValNodePtr vnp1;
5666 ValNodePtr vnp2;
5667
5668 if (ptr1 != NULL && ptr2 != NULL) {
5669 vnp1 = *((ValNodePtr PNTR) ptr1);
5670 vnp2 = *((ValNodePtr PNTR) ptr2);
5671 if (vnp1 != NULL && vnp2 != NULL) {
5672 str1 = (CharPtr) vnp1->data.ptrvalue;
5673 str2 = (CharPtr) vnp2->data.ptrvalue;
5674 if (str1 != NULL && str2 != NULL) {
5675 return StringICmp (str1, str2);
5676 } else {
5677 return 0;
5678 }
5679 } else {
5680 return 0;
5681 }
5682 } else {
5683 return 0;
5684 }
5685 }
5686
MakeDegenerateBase(Uint1 ch1,Uint1 ch2,Uint1Ptr chrToInt,CharPtr intToChr)5687 static Uint1 MakeDegenerateBase (Uint1 ch1, Uint1 ch2, Uint1Ptr chrToInt, CharPtr intToChr)
5688
5689 {
5690 Uint1 idx;
5691
5692 idx = chrToInt [(int) ch1] | chrToInt [(int) ch2];
5693 return intToChr [(int) idx];
5694 }
5695
ComposeCodonsRecognizedString(tRNAPtr trna,CharPtr buf,size_t buflen)5696 NLM_EXTERN Int2 ComposeCodonsRecognizedString (tRNAPtr trna, CharPtr buf, size_t buflen)
5697
5698 {
5699 Char ch;
5700 Uint1 chrToInt [256];
5701 Uint1 codon [4];
5702 Int2 count = 0;
5703 ValNodePtr head, next, vnp;
5704 Int2 k;
5705 Uint1 i, j;
5706 CharPtr intToChr = "?ACMGRSVUWYHKDBN";
5707 CharPtr prefix, ptr, str1, str2;
5708 Pointer PNTR prev;
5709
5710 if (trna == NULL || buf == NULL || buflen < 25) return 0;
5711
5712 *buf = '\0';
5713 codon [3] = '\0';
5714 head = NULL;
5715
5716 for (j = 0; j < 6; j++) {
5717 if (trna->codon [j] < 64) {
5718 if (CodonForIndex (trna->codon [j], Seq_code_iupacna, codon)) {
5719 for (k = 0; k < 3; k++) {
5720 if (codon [k] == 'T') {
5721 codon [k] = 'U';
5722 }
5723 }
5724 ValNodeCopyStr (&head, 0, (CharPtr) codon);
5725 }
5726 }
5727 }
5728
5729 head = ValNodeSort (head, SortCodonByName);
5730
5731 if (head == NULL) return 0;
5732
5733 for (k = 0; k < 256; k++) {
5734 chrToInt [k] = 0;
5735 }
5736 for (i = 1; i < 16; i++) {
5737 ch = intToChr [i];
5738 chrToInt [(int) ch] = i;
5739 }
5740
5741 count = ValNodeLen (head);
5742 str1 = (CharPtr) head->data.ptrvalue;
5743 vnp = head->next;
5744 prev = (Pointer PNTR) &(head->next);
5745 while (vnp != NULL) {
5746 next = vnp->next;
5747 str2 = (CharPtr) vnp->data.ptrvalue;
5748 if (str1 != NULL && str2 != NULL &&
5749 str1 [0] == str2 [0] && str1 [1] == str2 [1]) {
5750 str1 [2] = MakeDegenerateBase (str1 [2], str2 [2], chrToInt, intToChr);
5751 *prev = next;
5752 vnp->next = NULL;
5753 ValNodeFreeData (vnp);
5754 } else {
5755 str1 = str2;
5756 prev = (Pointer PNTR) &(vnp->next);
5757 }
5758 vnp = next;
5759 }
5760
5761 for (vnp = head, ptr = buf, i = 0, prefix = NULL; vnp != NULL;
5762 vnp = vnp->next, prefix = ", ", i++) {
5763 ptr = StringMove (ptr, prefix);
5764 ptr = StringMove (ptr, (CharPtr) vnp->data.ptrvalue);
5765 }
5766
5767 ValNodeFreeData (head);
5768 return count;
5769 }
5770
5771 /*****************************************************************************
5772 *
5773 * TransTableNew (Int2 genCode);
5774 * Initializes TransTable finite state machine for 6-frame translation
5775 * and open reading frame search, allowing nucleotide ambiguity characters
5776 *
5777 *****************************************************************************/
5778
SetGenCode(Int2 genCode,CharPtr PNTR ncbieaa,CharPtr PNTR sncbieaa)5779 static Boolean SetGenCode (Int2 genCode, CharPtr PNTR ncbieaa, CharPtr PNTR sncbieaa)
5780
5781 {
5782 GeneticCodePtr codes;
5783 GeneticCodePtr gcp;
5784 Int4 id;
5785 ValNodePtr vnp;
5786
5787 if (ncbieaa == NULL || sncbieaa == NULL) return FALSE;
5788
5789 codes = GeneticCodeTableLoad ();
5790 if (codes == NULL) return FALSE;
5791 for (gcp = codes; gcp != NULL; gcp = gcp->next) {
5792 id = 0;
5793 *ncbieaa = NULL;
5794 *sncbieaa = NULL;
5795 for (vnp = (ValNodePtr) gcp->data.ptrvalue; vnp != NULL; vnp = vnp->next) {
5796 switch (vnp->choice) {
5797 case 2 :
5798 id = vnp->data.intvalue;
5799 break;
5800 case 3 :
5801 *ncbieaa = (CharPtr) vnp->data.ptrvalue;
5802 break;
5803 case 6 :
5804 *sncbieaa = (CharPtr) vnp->data.ptrvalue;
5805 break;
5806 default :
5807 break;
5808 }
5809 }
5810 if (genCode == id) return TRUE;
5811 }
5812
5813 return FALSE;
5814 }
5815
/* Index of each nucleotide symbol in the translation tables: the four   */
/* unambiguous bases first, then the two-, three- and four-base          */
/* ambiguity symbols.  The comment on each line lists the bases the      */
/* symbol stands for.                                                    */
typedef enum {
  BASE_A = 0,   /* A    */
  BASE_C = 1,   /* C    */
  BASE_G = 2,   /* G    */
  BASE_T = 3,   /* T    */
  BASE_M = 4,   /* AC   */
  BASE_R = 5,   /* AG   */
  BASE_W = 6,   /* AT   */
  BASE_S = 7,   /* CG   */
  BASE_Y = 8,   /* CT   */
  BASE_K = 9,   /* GT   */
  BASE_V = 10,  /* ACG  */
  BASE_H = 11,  /* ACT  */
  BASE_D = 12,  /* AGT  */
  BASE_B = 13,  /* CGT  */
  BASE_N = 14   /* ACGT */
} BaseCode;
5833
/* TransTableNew: allocates and fills a TransTable for genetic code       */
/* genCode.                                                               */
/*                                                                        */
/* genCode values 7, 8 and 0 are first remapped to 4, 1 and 1.  The amino */
/* acid string (ncbieaa) and start-codon string (sncbieaa) come from      */
/* SetGenCode; if that fails or leaves either string NULL, the two        */
/* literal strings below are used instead (presumably the standard        */
/* genetic code - confirm).                                               */
/*                                                                        */
/* Fields filled in:                                                      */
/*   genCode, ncbieaa, sncbieaa - the code id and copies of both strings  */
/*   basesToIdx [256]  - character (letters and ncbi4na values 0..15)     */
/*                       to BaseCode index                                */
/*   nextBase [state]  - base value for the state transition              */
/*   aminoAcid [state] [strand], orfStart [state] [strand] - translation  */
/*                       and start flag of the codon a state represents,  */
/*                       for the top strand and the bottom strand         */
/*                                                                        */
/* Returns the new table, or NULL if it cannot be allocated.              */
NLM_EXTERN TransTablePtr TransTableNew (Int2 genCode)

{
  Char           ch, tpaa, btaa, tporf, btorf;
  /* BaseCode index -> iupacna letter */
  Char           charToBase [16] = "ACGTMRWSYKVHDBN";
  /* ncbi4na value (0..15) -> BaseCode index */
  Int2           fournaToBase [16] = {
    BASE_N, BASE_A, BASE_C, BASE_M, BASE_G, BASE_R, BASE_S, BASE_V,
    BASE_T, BASE_W, BASE_Y, BASE_H, BASE_K, BASE_D, BASE_B, BASE_N};
  /* 15 rows of 5, one row per BaseCode: the unambiguous bases the */
  /* symbol expands to, terminated by -1                           */
  Int2           expansions [75] = {
    BASE_A, -1, -1, -1, -1,
    BASE_C, -1, -1, -1, -1,
    BASE_G, -1, -1, -1, -1,
    BASE_T, -1, -1, -1, -1,
    BASE_A, BASE_C, -1, -1, -1,
    BASE_A, BASE_G, -1, -1, -1,
    BASE_A, BASE_T, -1, -1, -1,
    BASE_C, BASE_G, -1, -1, -1,
    BASE_C, BASE_T, -1, -1, -1,
    BASE_G, BASE_T, -1, -1, -1,
    BASE_A, BASE_C, BASE_G, -1, -1,
    BASE_A, BASE_C, BASE_T, -1, -1,
    BASE_A, BASE_G, BASE_T, -1, -1,
    BASE_C, BASE_G, BASE_T, -1, -1,
    BASE_A, BASE_C, BASE_G, BASE_T, -1};
  Boolean        goOn;
  Int2           i, j, k, st, nx, cd;
  Int2           p, q, r, x, y, z;
  Uint1          ui;
  Int2           codonidx [4] = {2, 1, 3, 0}; /* in genetic code table, T = 0, C = 1, A = 2, G = 3, */
  Int2           complidx [4] = {0, 3, 1, 2}; /* and index = (base1 * 16) + (base2 * 4) + base3 */
  CharPtr        ncbieaa = NULL, sncbieaa = NULL;
  TransTablePtr  tbl;

  tbl = (TransTablePtr) MemNew (sizeof (TransTable));
  if (tbl == NULL) return NULL;
  MemSet ((Pointer) tbl, 0, sizeof (TransTable));

  /* remap genetic code ids 7, 8 and 0 before the lookup */
  if (genCode == 7) {
    genCode = 4;
  } else if (genCode == 8) {
    genCode = 1;
  } else if (genCode == 0) {
    genCode = 1;
  }

  /* fall back to built-in strings when the requested code is unavailable */
  if ((! SetGenCode (genCode, &ncbieaa, &sncbieaa)) || ncbieaa == NULL || sncbieaa == NULL) {
    ncbieaa = "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG";
    sncbieaa = "---M------**--*----M---------------M----------------------------";
  }

  tbl->genCode = genCode;
  StringNCpy_0 (tbl->ncbieaa, ncbieaa, sizeof (tbl->ncbieaa));
  StringNCpy_0 (tbl->sncbieaa, sncbieaa, sizeof (tbl->sncbieaa));

  /* table to convert any ASCII character to BASE_x integer from 0 through 14 */
  /* (anything not mapped below stays BASE_N) */
  for (i = 0; i < 256; i++) {
    tbl->basesToIdx [i] = BASE_N;
  }

  /* map iupacna alphabet to BaseCode */
  /* (both the letter in charToBase and its TO_LOWER form) */
  for (ui = BASE_A; ui <= BASE_N; ui++) {
    ch = charToBase [ui];
    tbl->basesToIdx [(int) ch] = ui;
    ch = TO_LOWER (ch);
    tbl->basesToIdx [(int) ch] = ui;
  }
  tbl->basesToIdx [(int) 'U'] = BASE_T;
  tbl->basesToIdx [(int) 'u'] = BASE_T;
  tbl->basesToIdx [(int) 'X'] = BASE_N;
  tbl->basesToIdx [(int) 'x'] = BASE_N;

  /* also map ncbi4na alphabet to BaseCode */
  /* (this overwrites entries 0 through 15 of the table) */
  for (i = 0; i < 16; i++) {
    tbl->basesToIdx [(int) i] = fournaToBase [i];
  }

  /* add tbl->basesToIdx [(int) ch] to tbl->nextBase [state] to get next state */

  /* treat state 0 as already having seen NN, avoiding single and double letter states */
  /* 3361 = 225 * BASE_N + 15 * BASE_N + 1 */
  tbl->nextBase [0] = 3361;

  /* states 1 through 3375 are triple letter states (AAA, AAC, ..., NNT, NNN) */
  /* nx restarts at 1 for every i and grows by 15 per (j, k) step, so */
  /* nextBase for state IJK is 225 * j + 15 * k + 1                   */
  for (i = BASE_A, st = 1; i <= BASE_N; i++) {
    for (j = BASE_A, nx = 1; j <= BASE_N; j++) {
      for (k = BASE_A; k <= BASE_N; k++, st++, nx += 15) {
        tbl->nextBase [st] = nx;
      }
    }
  }

  /* tbl->aminoAcid [state] [strand] contains amino acid encoded by state */

  /* initialize all states to return unknown amino acid X */
  /* (and '-' as the orf start flag) */
  for (st = 0; st < 3376; st++) {
    tbl->aminoAcid [st] [TTBL_TOP_STRAND] = 'X';
    tbl->aminoAcid [st] [TTBL_BOT_STRAND] = 'X';
    tbl->orfStart [st] [TTBL_TOP_STRAND] = '-';
    tbl->orfStart [st] [TTBL_BOT_STRAND] = '-';
  }

  /* lookup amino acid for each codon in genetic code table */
  for (i = BASE_A, st = 1; i <= BASE_N; i++) {
    for (j = BASE_A; j <= BASE_N; j++) {
      for (k = BASE_A; k <= BASE_N; k++, st++) {
        /* st = 225 * i + 15 * j + k + 1; */

        /* '\0' marks "no value seen yet" for the four accumulators */
        tpaa = '\0';
        btaa = '\0';
        tporf = '\0';
        btorf = '\0';
        goOn = TRUE;

        /* expand ambiguous IJK nucleotide symbols into component bases XYZ */
        for (p = i * 5, x = expansions [p]; x != -1 && goOn; p++, x = expansions [p]) {
          for (q = j * 5, y = expansions [q]; y != -1 && goOn; q++, y = expansions [q]) {
            for (r = k * 5, z = expansions [r]; z != -1 && goOn; r++, z = expansions [r]) {

              /* lookup amino acid for codon XYZ */
              cd = 16 * codonidx [x] + 4 * codonidx [y] + codonidx [z];
              ch = ncbieaa [cd];
              if (tpaa == '\0') {
                tpaa = ch;
              } else if (tpaa != ch) {
                /* allow Asx (Asp or Asn) and Glx (Glu or Gln) and Xle (Leu or Ile) */
                if ((tpaa == 'B' || tpaa == 'D' || tpaa == 'N') && (ch == 'D' || ch == 'N')) {
                  tpaa = 'B';
                } else if ((tpaa == 'Z' || tpaa == 'E' || tpaa == 'Q') && (ch == 'E' || ch == 'Q')) {
                  tpaa = 'Z';
                } else if ((tpaa == 'J' || tpaa == 'I' || tpaa == 'L') && (ch == 'I' || ch == 'L')) {
                  tpaa = 'J';
                } else {
                  tpaa = 'X';
                }
              }
              /* and translation start flag on top strand */
              ch = sncbieaa [cd];
              if (tporf == '\0') {
                tporf = ch;
              } else if (tporf != ch) {
                tporf = 'X'; /* was '-' */
              }

              /* lookup amino acid for complement of reversed ZYX */
              cd = 16 * complidx [z] + 4 * complidx [y] + complidx [x];
              ch = ncbieaa [cd];
              if (btaa == '\0') {
                btaa = ch;
              } else if (btaa != ch) {
                /* allow Asx (Asp or Asn) and Glx (Glu or Gln) and Xle (Leu or Ile) */
                if ((btaa == 'B' || btaa == 'D' || btaa == 'N') && (ch == 'D' || ch == 'N')) {
                  btaa = 'B';
                } else if ((btaa == 'Z' || btaa == 'E' || btaa == 'Q') && (ch == 'E' || ch == 'Q')) {
                  btaa = 'Z';
                } else if ((btaa == 'J' || btaa == 'I' || btaa == 'L') && (ch == 'I' || ch == 'L')) {
                  btaa = 'J';
                } else {
                  btaa = 'X';
                }
              }
              /* and translation start flag on bottom strand */
              ch = sncbieaa [cd];
              if (btorf == '\0') {
                btorf = ch;
              } else if (btorf != ch) {
                btorf = 'X'; /* was '-' */
              }

              /* drop out of loop as soon as answer is known */
              if (tpaa == 'X' && btaa == 'X' && tporf == 'X' && btorf == 'X') { /* was '-' for orfs */
                goOn = FALSE;
              }
            }
          }
        }

        /* assign amino acid */
        tbl->aminoAcid [st] [TTBL_TOP_STRAND] = tpaa;
        tbl->aminoAcid [st] [TTBL_BOT_STRAND] = btaa;

        /* assign orf start */
        tbl->orfStart [st] [TTBL_TOP_STRAND] = tporf;
        tbl->orfStart [st] [TTBL_BOT_STRAND] = btorf;
      }
    }
  }

  /* finite state machine for 6-frame translation and ORF search is now initialized */
  return tbl;
}
6023
TransTableFree(TransTablePtr tbl)6024 NLM_EXTERN TransTablePtr TransTableFree (TransTablePtr tbl)
6025
6026 {
6027 return MemFree (tbl);
6028 }
6029
TransTableFreeAll(void)6030 NLM_EXTERN void TransTableFreeAll (void)
6031
6032 {
6033 Int2 genCode;
6034 Char str [32];
6035 TransTablePtr tbl;
6036
6037 for (genCode = 1; genCode < 40; genCode++) {
6038 sprintf (str, "TransTableFSAforGenCode%d", (int) genCode);
6039 tbl = (TransTablePtr) GetAppProperty (str);
6040 if (tbl != NULL) {
6041 SetAppProperty (str, NULL);
6042 TransTableFree (tbl);
6043 }
6044 }
6045 return;
6046 }
6047
6048 /* convenience function does translation for entire bioseq */
6049
TransTableProcessBioseq(TransTablePtr tbl,TransTableMatchProc matchProc,Pointer userdata,BioseqPtr bsp)6050 NLM_EXTERN void TransTableProcessBioseq (
6051 TransTablePtr tbl,
6052 TransTableMatchProc matchProc,
6053 Pointer userdata,
6054 BioseqPtr bsp
6055 )
6056
6057 {
6058 Boolean altStart, atgStart, orfStop;
6059 Byte bases [400];
6060 Char ch;
6061 Int2 ctr, frame, i, j, state;
6062 Int4 position;
6063 Uint1 residue;
6064 SeqPortPtr spp;
6065
6066 if (tbl == NULL || matchProc == NULL || bsp == NULL) return;
6067
6068 if (! ISA_na (bsp->mol)) return;
6069
6070 spp = SeqPortNew (bsp, 0, -1, 0, Seq_code_iupacna);
6071 if (spp == NULL) return;
6072
6073 if (bsp->repr == Seq_repr_delta) {
6074 SeqPortSet_do_virtual (spp, TRUE);
6075 }
6076
6077 /* read first block of bases, reality check on length */
6078
6079 ctr = SeqPortRead (spp, bases, sizeof (bases));
6080 if (ctr < 6) {
6081 SeqPortFree (spp);
6082 return;
6083 }
6084
6085 state = 0;
6086 position = 0;
6087 frame = 0;
6088
6089 i = 0;
6090 residue = (Uint1) bases [i];
6091
6092 /* prime finite state machine with first two bases */
6093
6094 for (j = 0; j < 2 && residue != SEQPORT_EOF; j++) {
6095 if (IS_residue (residue)) {
6096 state = NextCodonState (tbl, state, residue);
6097 }
6098 i++;
6099 residue = (Uint1) bases [i];
6100 }
6101
6102 /* loop on all remaining bases */
6103
6104 while (residue != SEQPORT_EOF) {
6105 if (IS_residue (residue)) {
6106 state = NextCodonState (tbl, state, residue);
6107
6108 /* get amino acid for codon on top strand */
6109
6110 ch = GetCodonResidue (tbl, state, TTBL_TOP_STRAND);
6111 atgStart = IsATGStart (tbl, state, TTBL_TOP_STRAND);
6112 altStart = IsAltStart (tbl, state, TTBL_TOP_STRAND);
6113 orfStop = IsOrfStop (tbl, state, TTBL_TOP_STRAND);
6114 matchProc (position, ch, atgStart, altStart, orfStop, frame, Seq_strand_plus, userdata);
6115
6116 /* get amino acid for codon on top strand */
6117
6118 ch = GetCodonResidue (tbl, state, TTBL_BOT_STRAND);
6119 atgStart = IsATGStart (tbl, state, TTBL_BOT_STRAND);
6120 altStart = IsAltStart (tbl, state, TTBL_BOT_STRAND);
6121 orfStop = IsOrfStop (tbl, state, TTBL_BOT_STRAND);
6122 matchProc (position, ch, atgStart, altStart, orfStop, frame, Seq_strand_minus, userdata);
6123
6124 /* advance base position, also keep track of frame */
6125
6126 position++;
6127 frame++;
6128 if (frame > 2) {
6129 frame = 0;
6130 }
6131 }
6132
6133 /* increment base counter */
6134
6135 i++;
6136 if (i >= ctr) {
6137 i = 0;
6138
6139 /* read next block of bases */
6140
6141 ctr = SeqPortRead (spp, bases, sizeof (bases));
6142 if (ctr < 0) {
6143 bases [0] = -ctr;
6144 } else if (ctr < 1) {
6145 bases [0] = SEQPORT_EOF;
6146 }
6147 }
6148 residue = (Uint1) bases [i];
6149 }
6150
6151 SeqPortFree (spp);
6152 }
6153
/* trans table translation function can be passed cds feature or individual parameters */

/* state shared between ReadCodingRegionBases and its SaveCdsBases streaming callback */

typedef struct readcdsdata {
  CharPtr  tmp;       /* write cursor in the destination buffer, advanced as bases are copied */
  size_t   frame;     /* number of leading bases still to be skipped before copying begins */
  Int4     max;       /* number of characters that may still be copied into the buffer */
  Boolean  overflow;  /* set to TRUE when a segment held more bases than max allowed */
} ReadCdsData, PNTR ReadCdsPtr;
6162
6163 /* callback allows skipping one or two bases at beginning */
6164
SaveCdsBases(CharPtr sequence,Pointer userdata)6165 static void LIBCALLBACK SaveCdsBases (
6166 CharPtr sequence,
6167 Pointer userdata
6168 )
6169
6170 {
6171 Char ch;
6172 CharPtr from, to;
6173 unsigned int len;
6174 Int4 max;
6175 ReadCdsPtr rcp;
6176
6177 rcp = (ReadCdsPtr) userdata;
6178
6179 if (rcp->frame > 0) {
6180 len = 0;
6181 ch = sequence [len];
6182 while (ch != '\0' && len <= rcp->frame) {
6183 len++;
6184 ch = sequence [len];
6185 }
6186 /* len = StringLen (sequence); */
6187 if (rcp->frame >= len) {
6188
6189 /* unusual locations can have fewer bases in the first segments than the frame, so just decrement */
6190
6191 rcp->frame -= len;
6192 return;
6193 }
6194 }
6195
6196 /* rcp->tmp = StringMove (rcp->tmp, sequence + rcp->frame); */
6197
6198 from = sequence + rcp->frame;
6199 to = rcp->tmp;
6200 max = rcp->max;
6201
6202 ch = *from;
6203 while (ch != '\0' && max > 0) {
6204 *to = ch;
6205 to++;
6206 from++;
6207 ch = *from;
6208 max--;
6209 }
6210 *to = '\0';
6211 if (ch != '\0') {
6212 rcp->overflow = TRUE;
6213 }
6214
6215 rcp->tmp = to;
6216 rcp->max = max;
6217
6218 rcp->frame = 0;
6219 }
6220
ReadCodingRegionBases(SeqLocPtr location,Int4 len,Uint1 frame,Int4Ptr totalP)6221 NLM_EXTERN CharPtr ReadCodingRegionBases (SeqLocPtr location, Int4 len, Uint1 frame, Int4Ptr totalP)
6222
6223 {
6224 CharPtr bases, txt;
6225 Int4 mod;
6226 ReadCdsData rcd;
6227 /*
6228 Int2 actual, cnt;
6229 BioseqPtr bsp;
6230 Int4 mod, position;
6231 SeqIdPtr sip;
6232 SeqLocPtr slp;
6233 SeqPortPtr spp;
6234 */
6235
6236 bases = MemNew ((size_t) (len + 6));
6237 if (bases == NULL)
6238 return NULL;
6239
6240 rcd.tmp = bases;
6241 rcd.max = len;
6242 rcd.overflow = FALSE;
6243
6244 /* adjust start position */
6245
6246 if (frame == 2) {
6247 rcd.frame = 1;
6248 } else if (frame == 3) {
6249 rcd.frame = 2;
6250 } else {
6251 rcd.frame = 0;
6252 }
6253
6254 SeqPortStreamLoc (location, STREAM_EXPAND_GAPS, (Pointer) &rcd, SaveCdsBases);
6255
6256 txt = rcd.tmp;
6257
6258 if (rcd.overflow) {
6259 ErrPostEx (SEV_ERROR, 0, 0, "ReadCodingRegionBases overflow caught");
6260 }
6261
6262 #if 0
6263 spp = SeqPortNewByLoc (location, Seq_code_iupacna);
6264 if (spp == NULL) {
6265 MemFree (bases);
6266 return NULL;
6267 }
6268
6269 slp = SeqLocFindNext (location, NULL);
6270 while (slp != NULL) {
6271 sip = SeqLocId (slp);
6272 if (sip != NULL) {
6273 bsp = BioseqFind (sip);
6274 if (bsp != NULL) {
6275 if (bsp->repr == Seq_repr_delta || bsp->repr == Seq_repr_virtual) {
6276 SeqPortSet_do_virtual (spp, TRUE);
6277 }
6278 }
6279 }
6280 slp = SeqLocFindNext (location, slp);
6281 }
6282
6283 /* adjust start position */
6284
6285 if (frame == 2) {
6286 position = 1;
6287 } else if (frame == 3) {
6288 position = 2;
6289 } else {
6290 position = 0;
6291 }
6292 SeqPortSeek (spp, position, SEEK_SET);
6293 len -= position;
6294
6295 /* read nucleotides into temporary buffer */
6296
6297 cnt = (Int2) MIN (len, 32000L);
6298 txt = bases;
6299 actual = 1;
6300 while (cnt > 0 && len > 0 && actual > 0) {
6301 actual = SeqPortRead (spp, (BytePtr) txt, cnt);
6302 if (actual < 0) {
6303 actual = -actual;
6304 if (actual == SEQPORT_VIRT || actual == SEQPORT_EOS) {
6305 actual = 1; /* ignore, keep going */
6306 } else if (actual == SEQPORT_EOF) {
6307 actual = 0; /* stop */
6308 }
6309 } else if (actual > 0) {
6310 len -= actual;
6311 txt += actual;
6312 cnt = (Int2) MIN (len, 32000L);
6313 }
6314 }
6315
6316 SeqPortFree (spp);
6317 #endif
6318
6319 /* pad incomplete last codon with Ns */
6320
6321 len = StringLen (bases);
6322 if (len > 0) {
6323 mod = len % 3;
6324 if (mod == 1) {
6325 txt = StringMove (txt, "NN");
6326 } else if (mod == 2) {
6327 txt = StringMove (txt, "N");
6328 }
6329 }
6330 if (totalP != NULL) {
6331 *totalP = len;
6332 }
6333
6334 return bases;
6335 }
6336
MakeCodeBreakList(SeqLocPtr cdslocation,Int4 len,CodeBreakPtr cbp,Uint1 frame)6337 NLM_EXTERN ValNodePtr MakeCodeBreakList (SeqLocPtr cdslocation, Int4 len, CodeBreakPtr cbp, Uint1 frame)
6338
6339 {
6340 Int4 adjust = 0, pos, pos1, pos2;
6341 SeqLocPtr tmp;
6342 ValNodePtr vnphead = NULL;
6343
6344 if (cdslocation == NULL || cbp == NULL) return NULL;
6345
6346 if (frame == 2) {
6347 adjust = 1;
6348 } else if (frame == 3) {
6349 adjust = 2;
6350 } else {
6351 adjust = 0;
6352 }
6353
6354 while (cbp != NULL) {
6355 pos1 = INT4_MAX;
6356 pos2 = -10;
6357 tmp = NULL;
6358
6359 while ((tmp = SeqLocFindNext (cbp->loc, tmp)) != NULL) {
6360 pos = GetOffsetInLoc (tmp, cdslocation, SEQLOC_START);
6361 if (pos < pos1) {
6362 pos1 = pos;
6363 }
6364 pos = GetOffsetInLoc (tmp, cdslocation, SEQLOC_STOP);
6365 if (pos > pos2) {
6366 pos2 = pos;
6367 }
6368 }
6369
6370 pos = pos2 - pos1; /* codon length */
6371 /* allow partial codon at the end */
6372 if (pos == 2 || (pos >= 0 && pos <= 1 && pos2 == len - 1)) {
6373 pos1 -= adjust;
6374 ValNodeAddInt (&vnphead, (Int2) cbp->aa.value.intvalue, (Int4) (pos1 / 3));
6375 }
6376
6377 cbp = cbp->next;
6378 }
6379
6380 vnphead = ValNodeSort (vnphead, SortByIntvalue);
6381
6382 return vnphead;
6383 }
6384
TransTableTranslateCommon(TransTablePtr PNTR tblptr,SeqLocPtr location,SeqLocPtr product,Boolean partial,Int2 genCode,Uint1 frame,CodeBreakPtr code_break,Boolean include_stop,Boolean remove_trailingX,Boolean no_stop_at_end_of_complete_cds,BoolPtr altStartP,Boolean farProdFetchOK)6385 static ByteStorePtr TransTableTranslateCommon (
6386 TransTablePtr PNTR tblptr,
6387 SeqLocPtr location,
6388 SeqLocPtr product,
6389 Boolean partial,
6390 Int2 genCode,
6391 Uint1 frame,
6392 CodeBreakPtr code_break,
6393 Boolean include_stop,
6394 Boolean remove_trailingX,
6395 Boolean no_stop_at_end_of_complete_cds,
6396 BoolPtr altStartP,
6397 Boolean farProdFetchOK
6398 )
6399
6400 {
6401 Char aa;
6402 Int2 j, state = 0;
6403 Boolean bad_base, no_start, check_start, got_stop,
6404 incompleteLastCodon, use_break = FALSE, is_first;
6405 CharPtr bases, txt, protseq;
6406 ByteStorePtr bs;
6407 ValNodePtr codebreakhead = NULL, vnp;
6408 TransTablePtr localtbl = NULL, tbl;
6409 Uint2 part_prod = 0, part_loc = 0;
6410 Int4 dnalen, protlen, total, k, p, q;
6411 Uint1 residue = 0;
6412
6413 /* if table pointer not passed in from calling stack, use local table */
6414
6415 if (tblptr == NULL) {
6416 tblptr = &localtbl;
6417 }
6418
6419 if (location == NULL) return NULL;
6420 dnalen = SeqLocLen (location);
6421 if (dnalen < 1) return NULL;
6422
6423 /* adjust for obsolete genetic code numbers */
6424
6425 if (genCode == 7) {
6426 genCode = 4;
6427 } else if (genCode == 8) {
6428 genCode = 1;
6429 } else if (genCode == 0) {
6430 genCode = 1;
6431 }
6432
6433 /* can store table for reuse on calling function's stack, replace if code is changing */
6434
6435 tbl = *tblptr;
6436 if (tbl != NULL && genCode != tbl->genCode) {
6437 tbl = TransTableFree (tbl);
6438 *tblptr = tbl;
6439 }
6440 if (tbl == NULL) {
6441 tbl = TransTableNew (genCode);
6442 *tblptr = tbl;
6443 }
6444 if (tbl == NULL) return NULL;
6445
6446 /* read bases, pad last codon with Ns, get total base count without padding */
6447
6448 bases = ReadCodingRegionBases (location, dnalen, frame, &total);
6449 if (bases == NULL) {
6450 TransTableFree (localtbl);
6451 return NULL;
6452 }
6453
6454 /* reality check on length */
6455
6456 if (StringLen (bases) < 3) {
6457 MemFree (bases);
6458 TransTableFree (localtbl);
6459 return NULL;
6460 }
6461
6462 /* process code breaks into list of aa (choice) and protein offset (data.intvalue) */
6463
6464 codebreakhead = MakeCodeBreakList (location, dnalen, code_break, frame);
6465
6466 no_start = FALSE;
6467 part_loc = SeqLocPartialCheck (location);
6468 part_prod = SeqLocPartialCheckEx (product, farProdFetchOK);
6469 if ((part_loc & SLP_START) /* || (part_prod & SLP_START) */) {
6470 no_start = TRUE;
6471 }
6472 if (StringHasNoText (tbl->sncbieaa) || no_start || frame > 1) {
6473 check_start = FALSE;
6474 } else {
6475 check_start = TRUE;
6476 }
6477
6478 /* size of protein, allow partial codon at end */
6479
6480 protlen = dnalen;
6481 protlen /= 3;
6482 protlen += 1;
6483
6484 protseq = (CharPtr) MemNew ((size_t) protlen + 2);
6485 if (protseq == NULL) {
6486 MemFree (bases);
6487 ValNodeFree (codebreakhead);
6488 TransTableFree (localtbl);
6489 return NULL;
6490 }
6491
6492 bs = BSNew (protlen);
6493 if (bs == NULL) {
6494 MemFree (bases);
6495 MemFree (protseq);
6496 ValNodeFree (codebreakhead);
6497 TransTableFree (localtbl);
6498 return NULL;
6499 }
6500
6501 got_stop = FALSE;
6502 incompleteLastCodon = FALSE;
6503 is_first = TRUE;
6504 use_break = FALSE;
6505 state = 0;
6506
6507 k = 0;
6508 p = 0;
6509 q = 0;
6510 txt = bases;
6511 residue = (Uint1) *txt;
6512
6513 if (altStartP != NULL) {
6514 *altStartP = FALSE;
6515 }
6516
6517 /* loop through all codons */
6518
6519 while (residue != '\0') {
6520 for (j = 0, bad_base = FALSE; j < 3; j++, k++, txt++, residue = (Uint1) *txt) {
6521 if (IS_residue (residue)) {
6522 state = NextCodonState (tbl, state, residue);
6523 } else {
6524 state = NextCodonState (tbl, state, 'N');
6525 bad_base = TRUE;
6526 }
6527 }
6528
6529 for (vnp = codebreakhead; vnp != NULL && vnp->data.intvalue != p; vnp = vnp->next) continue;
6530 use_break = (Boolean) (vnp != NULL);
6531
6532 if (use_break) {
6533 aa = (Char) vnp->choice;
6534 } else if (bad_base) {
6535 aa = 'X';
6536 } else if (is_first && check_start) {
6537
6538 /* ambiguous start codon that MAY be an initiator now translated to ambiguous X amino acid */
6539 aa = GetStartResidue (tbl, state, TTBL_TOP_STRAND);
6540 if (aa == '-') {
6541 if ((! ((part_loc & SLP_STOP) || (part_prod & SLP_STOP))) && (partial)) {
6542 aa = GetCodonResidue (tbl, state, TTBL_TOP_STRAND);
6543 }
6544 } else {
6545 if (altStartP != NULL) {
6546 if (IsAltStart (tbl, state, TTBL_TOP_STRAND)) {
6547 *altStartP = TRUE;
6548 }
6549 }
6550 }
6551 } else {
6552
6553 aa = GetCodonResidue (tbl, state, TTBL_TOP_STRAND);
6554 }
6555 is_first = FALSE;
6556
6557 if (aa == '*'
6558 && (! include_stop
6559 || (no_stop_at_end_of_complete_cds && ! partial && *(txt + 1) == 0)))
6560 {
6561 got_stop = TRUE;
6562 residue = '\0'; /* signal end of loop */
6563
6564 } else {
6565
6566 if (q < protlen) { /* protect against accidental buffer overflow */
6567 protseq [q] = aa;
6568 }
6569 q++;
6570 /*
6571 BSPutByte (bs, (Int2) aa);
6572 */
6573 }
6574
6575 /* advance protein position for code break test */
6576
6577 p++;
6578 }
6579
6580 if (q > protlen) {
6581 ErrPostEx (SEV_ERROR, 0, 0, "TransTableTranslate - %ld characters written, %ld characters expected", (long) q, (long) protlen);
6582 }
6583
6584 if (k > total) {
6585 incompleteLastCodon = TRUE;
6586 }
6587
6588 if ((! got_stop) && (! incompleteLastCodon) && q > 0 && (! partial) && (! use_break)) {
6589 /* check for stop codon that normally encodes an amino acid */
6590 aa = GetStartResidue (tbl, state, TTBL_TOP_STRAND);
6591 if (aa == '*') {
6592 if (include_stop) {
6593 protseq [q - 1] = aa;
6594 } else {
6595 q--;
6596 }
6597 got_stop = TRUE;
6598 }
6599 }
6600
6601 if ((! got_stop) && incompleteLastCodon && q > 0) {
6602 aa = protseq [q - 1];
6603 if ((aa == 'X' /* || aa == 'B' || aa == 'Z' */) && q > 0) {
6604 q--;
6605 }
6606 #if 0
6607 BSSeek (bs, -1, SEEK_END); /* remove last X if incomplete last codon */
6608 aa = (Char) BSGetByte (bs);
6609 if ((aa == 'X' /* || aa == 'B' || aa == 'Z' */) && BSLen (bs) > 0) {
6610 BSSeek (bs, -1, SEEK_END);
6611 BSDelete (bs, 1);
6612 BSSeek (bs, -1, SEEK_END);
6613 }
6614 #endif
6615 }
6616
6617 if ((! got_stop) && remove_trailingX && q > 0) { /* only remove trailing X on partial CDS */
6618 aa = protseq [q - 1];
6619 while ((aa == 'X' /* || aa == 'B' || aa == 'Z' */) && q > 0) {
6620 q--;
6621 aa = protseq [q - 1];
6622 }
6623 #if 0
6624 BSSeek (bs, -1, SEEK_END); /* back up to last residue */
6625 aa = (Char) BSGetByte (bs);
6626 while ((aa == 'X' /* || aa == 'B' || aa == 'Z' */) && BSLen (bs) > 0) {
6627 BSSeek (bs, -1, SEEK_END);
6628 BSDelete (bs, 1);
6629 BSSeek (bs, -1, SEEK_END);
6630 aa = (Char) BSGetByte (bs);
6631 }
6632 #endif
6633 }
6634
6635 BSWrite (bs, (Pointer) protseq, q);
6636
6637 if (BSLen (bs) < 1) {
6638 bs = BSFree (bs);
6639 }
6640
6641 /* clean up temporarily allocated memory */
6642
6643 MemFree (bases);
6644 MemFree (protseq);
6645 ValNodeFree (codebreakhead);
6646
6647 /* free local table, if allocated */
6648
6649 TransTableFree (localtbl);
6650
6651 return bs;
6652 }
6653
6654 /* public functions for trans table translation */
6655
TransTableTranslateSeqLoc(TransTablePtr PNTR tblptr,SeqLocPtr location,Int2 genCode,Uint1 frame,Boolean include_stop,Boolean remove_trailingX)6656 NLM_EXTERN ByteStorePtr TransTableTranslateSeqLoc (
6657 TransTablePtr PNTR tblptr,
6658 SeqLocPtr location,
6659 Int2 genCode,
6660 Uint1 frame,
6661 Boolean include_stop,
6662 Boolean remove_trailingX
6663 )
6664
6665 {
6666 return TransTableTranslateCommon (tblptr, location, NULL, FALSE, genCode,
6667 frame, NULL, include_stop,
6668 remove_trailingX, FALSE, NULL, TRUE);
6669 }
6670
TransTableTranslateCdRegionEx(TransTablePtr PNTR tblptr,SeqFeatPtr cds,Boolean include_stop,Boolean remove_trailingX,Boolean no_stop_at_end_of_complete_cds,BoolPtr altStartP,Boolean farProdFetchOK)6671 NLM_EXTERN ByteStorePtr TransTableTranslateCdRegionEx (
6672 TransTablePtr PNTR tblptr,
6673 SeqFeatPtr cds,
6674 Boolean include_stop,
6675 Boolean remove_trailingX,
6676 Boolean no_stop_at_end_of_complete_cds,
6677 BoolPtr altStartP,
6678 Boolean farProdFetchOK
6679 )
6680
6681 {
6682 CdRegionPtr crp;
6683 Int2 genCode = 0;
6684 ValNodePtr vnp;
6685 Boolean partial5, partial3;
6686
6687 if (cds == NULL || cds->data.choice != SEQFEAT_CDREGION) return NULL;
6688 crp = (CdRegionPtr) cds->data.value.ptrvalue;
6689 if (crp == NULL) return NULL;
6690
6691 /* set genCode variable from genetic_code parameter, if id choice is used */
6692
6693 if (crp->genetic_code != NULL) {
6694 vnp = (ValNodePtr) crp->genetic_code->data.ptrvalue;
6695 while (vnp != NULL) {
6696 if (vnp->choice == 2) {
6697 genCode = (Int2) vnp->data.intvalue;
6698 }
6699 vnp = vnp->next;
6700 }
6701 }
6702 CheckSeqLocForPartial (cds->location, &partial5, &partial3);
6703
6704 return TransTableTranslateCommon (tblptr, cds->location, cds->product, partial3,
6705 genCode, crp->frame, crp->code_break,
6706 include_stop, remove_trailingX,
6707 no_stop_at_end_of_complete_cds, altStartP, farProdFetchOK);
6708 }
6709
TransTableTranslateCdRegion(TransTablePtr PNTR tblptr,SeqFeatPtr cds,Boolean include_stop,Boolean remove_trailingX,Boolean no_stop_at_end_of_complete_cds)6710 NLM_EXTERN ByteStorePtr TransTableTranslateCdRegion (
6711 TransTablePtr PNTR tblptr,
6712 SeqFeatPtr cds,
6713 Boolean include_stop,
6714 Boolean remove_trailingX,
6715 Boolean no_stop_at_end_of_complete_cds
6716 )
6717
6718 {
6719 return TransTableTranslateCdRegionEx (tblptr, cds, include_stop, remove_trailingX,
6720 no_stop_at_end_of_complete_cds, NULL, TRUE);
6721 }
6722
6723 /* allow reuse of translation tables by saving as AppProperty */
6724
PersistentTransTableCommon(SeqFeatPtr cds,Int2 genCode)6725 static TransTablePtr PersistentTransTableCommon (
6726 SeqFeatPtr cds,
6727 Int2 genCode
6728 )
6729
6730 {
6731 CdRegionPtr crp;
6732 Char str [32];
6733 TransTablePtr tbl = NULL;
6734 ValNodePtr vnp;
6735
6736 if (cds != NULL && cds->data.choice == SEQFEAT_CDREGION) {
6737 crp = (CdRegionPtr) cds->data.value.ptrvalue;
6738 if (crp != NULL && crp->genetic_code != NULL) {
6739 vnp = (ValNodePtr) crp->genetic_code->data.ptrvalue;
6740 while (vnp != NULL) {
6741 if (vnp->choice == 2) {
6742 genCode = (Int2) vnp->data.intvalue;
6743 }
6744 vnp = vnp->next;
6745 }
6746 }
6747 }
6748
6749 if (genCode == 7) {
6750 genCode = 4;
6751 } else if (genCode == 8) {
6752 genCode = 1;
6753 } else if (genCode == 0) {
6754 genCode = 1;
6755 }
6756
6757 /* set app property name for storing desired FSA */
6758
6759 sprintf (str, "TransTableFSAforGenCode%d", (int) genCode);
6760
6761 /* get FSA for desired genetic code if it already exists */
6762
6763 tbl = (TransTablePtr) GetAppProperty (str);
6764
6765 /* if not already exists, save FSA in genetic code-specific app property name */
6766
6767 if (tbl == NULL) {
6768 tbl = TransTableNew (genCode);
6769 SetAppProperty (str, (Pointer) tbl);
6770 }
6771
6772 return tbl;
6773 }
6774
PersistentTransTableByGenCode(Int2 genCode)6775 NLM_EXTERN TransTablePtr PersistentTransTableByGenCode (
6776 Int2 genCode
6777 )
6778
6779 {
6780 return PersistentTransTableCommon (NULL, genCode);
6781 }
6782
PersistentTransTableByCdRegion(SeqFeatPtr cds)6783 NLM_EXTERN TransTablePtr PersistentTransTableByCdRegion (
6784 SeqFeatPtr cds
6785 )
6786
6787 {
6788 return PersistentTransTableCommon (cds, 0);
6789 }
6790
6791 /*****************************************************************************
6792 *
6793 * SeqSearch
6794 * Initializes SeqSearch finite state machine for sequence searching
6795 * Based on Practical Algorithms for Programmers by Binstock and Rex
6796 *
6797 *****************************************************************************/
6798
/* general purpose DNA sequence search finite state machine */

/* one stored search pattern; items form a singly linked list headed by patternList */

typedef struct seqpattern {
  CharPtr              name;     /* saved copy of the pattern's name */
  CharPtr              pattern;  /* saved copy of the pattern string */
  Int2                 cutSite;  /* cut site value supplied with the pattern, copied into match records */
  Uint1                strand;   /* strand value supplied with the pattern, copied into match records */
  struct seqpattern *  next;     /* next pattern in the list */
} SeqPatternItem, PNTR SeqPatternPtr;
6808
/* one pattern reported when a state is reached; items hang off a state's matches list */

typedef struct seqmatch {
  CharPtr            name;     /* saved copy of the pattern name; also used to avoid duplicate entries on a state */
  CharPtr            pattern;  /* saved copy of the pattern string */
  Int2               cutSite;  /* cut site value copied from the pattern */
  Uint1              strand;   /* strand value copied from the pattern */
  struct seqmatch *  next;     /* next match recorded on the same state */
} SeqMatchItem, PNTR SeqMatchPtr;
6816
/* one state of the search machine */

typedef struct seqstate {
  Int2         onfailure;         /* state to continue from when no transition exists for the next letter */
  Int2         transitions [15];  /* order is ACGTMRWSYKVHDBN; 0 means no transition */
  SeqMatchPtr  matches;           /* patterns to report on reaching this state, NULL if none */
} SeqStateItem, PNTR SeqStatePtr;
6822
/* complete search machine: stored patterns, state table, scan position and lookup tables */

typedef struct SeqSearch {
  SeqStatePtr         stateArray;          /* state table, allocated when the machine is primed */
  SeqPatternPtr       patternList;         /* stored patterns, most recently added first */
  Int4                maxPatLen;           /* length of the longest stored pattern */
  Int2                maxState;            /* number of entries allocated in stateArray */
  Int2                highState;           /* highest state number assigned while entering patterns */
  Int2                currentState;        /* current state of the machine, reset to 0 on priming */
  Int4                currentPos;          /* current position counter, reset to 0 on priming */
  Boolean             primed;              /* TRUE once the state table has been built */
  SeqSearchMatchProc  matchproc;           /* callback supplied to SeqSearchNew */
  Pointer             userdata;            /* caller data supplied to SeqSearchNew */
  Uint1               letterToIdx [256];   /* character code to transitions index, 0 (A) to 14 (N) */
  Uint1               letterToComp [256];  /* character code to complement letter, '\0' if none */
} SeqSearchData;
6837
/* sentinel returned by SeqSearchGotoState when there is no transition; parenthesized so it expands safely inside any expression */
#define FAIL_STATE (-1)
6839
6840 /* returns next state given current state and next character */
6841
SeqSearchGotoState(SeqSearchPtr tbl,Int2 state,Char ch,Boolean zeroFailureReturnsZero)6842 static Int2 SeqSearchGotoState (
6843 SeqSearchPtr tbl,
6844 Int2 state,
6845 Char ch,
6846 Boolean zeroFailureReturnsZero
6847 )
6848
6849 {
6850 int index;
6851 Int2 newstate;
6852 SeqStatePtr sp;
6853
6854 sp = &(tbl->stateArray [(int) state]);
6855 index = tbl->letterToIdx [(int) (Uint1) ch];
6856 newstate = sp->transitions [index];
6857
6858 if (newstate != 0) return newstate;
6859
6860 if (state == 0 && zeroFailureReturnsZero) return 0;
6861
6862 return FAIL_STATE;
6863 }
6864
6865 /* returns state to check next if current pattern broken */
6866
SeqSearchFailState(SeqSearchPtr tbl,Int2 state)6867 static Int2 SeqSearchFailState (
6868 SeqSearchPtr tbl,
6869 Int2 state
6870 )
6871
6872 {
6873 SeqStatePtr sp;
6874
6875 sp = &(tbl->stateArray [(int) state]);
6876 return sp->onfailure;
6877 }
6878
6879 /* add a single character transition from one state to another */
6880
SeqSearchAddTransition(SeqSearchPtr tbl,Int2 oldState,Char ch,Int2 newState)6881 static void SeqSearchAddTransition (
6882 SeqSearchPtr tbl,
6883 Int2 oldState,
6884 Char ch,
6885 Int2 newState
6886 )
6887
6888 {
6889 int index;
6890 SeqStatePtr sp;
6891
6892 sp = &(tbl->stateArray [(int) oldState]);
6893 index = tbl->letterToIdx [(int) (Uint1) ch];
6894 sp->transitions [index] = newState;
6895 }
6896
6897 /* given state should report a successful match */
6898
SeqSearchAddOutput(SeqSearchPtr tbl,Int2 state,CharPtr name,CharPtr pattern,Int2 cutSite,Uint1 strand)6899 static void SeqSearchAddOutput (
6900 SeqSearchPtr tbl,
6901 Int2 state,
6902 CharPtr name,
6903 CharPtr pattern,
6904 Int2 cutSite,
6905 Uint1 strand
6906 )
6907
6908 {
6909 SeqMatchPtr mp;
6910 SeqStatePtr sp;
6911
6912 sp = &(tbl->stateArray [(int) state]);
6913 for (mp = sp->matches; mp != NULL; mp = mp->next) {
6914 if (StringCmp (name, mp->name) == 0) return;
6915 }
6916
6917 mp = (SeqMatchPtr) MemNew (sizeof (SeqMatchItem));
6918 if (mp == NULL) return;
6919
6920 mp->name = StringSave (name);
6921 mp->pattern = StringSave (pattern);
6922 mp->cutSite = cutSite;
6923 mp->strand = strand;
6924
6925 mp->next = sp->matches;
6926 sp->matches = mp;
6927 }
6928
6929 /* add one nucleotide sequence pattern to the finite state machine */
6930
SeqSearchEnterNucWord(SeqSearchPtr tbl,Int2 highState,Int2 maxState,CharPtr name,CharPtr pattern,Int2 cutSite,Uint1 strand)6931 static Int2 SeqSearchEnterNucWord (
6932 SeqSearchPtr tbl,
6933 Int2 highState,
6934 Int2 maxState,
6935 CharPtr name,
6936 CharPtr pattern,
6937 Int2 cutSite,
6938 Uint1 strand
6939 )
6940
6941 {
6942 Char ch;
6943 Int2 next, patLen, state;
6944 CharPtr ptr;
6945
6946 state = 0;
6947 next = 0;
6948
6949 patLen = StringLen (pattern);
6950
6951 /* try to overlay beginning of pattern onto existing table */
6952
6953 for (ptr = pattern, ch = *ptr; ch != '\0'; ptr++, ch = *ptr) {
6954 next = SeqSearchGotoState (tbl, state, ch, FALSE);
6955 if (next == FAIL_STATE) break;
6956 state = next;
6957 }
6958
6959 /* now create new states for remaining characters in pattern */
6960
6961 for ( ; ch != '\0'; ptr++, ch = *ptr) {
6962 highState++;
6963 SeqSearchAddTransition (tbl, state, ch, highState);
6964 state = highState;
6965 }
6966
6967 /* at end of pattern record match information */
6968
6969 SeqSearchAddOutput (tbl, state, name, pattern, cutSite, strand);
6970
6971 return highState;
6972 }
6973
6974 /* FIFO queue and other functions for building failure states */
6975
SeqSearchQueueAdd(Int2Ptr queue,Int2 qbeg,Int2 val)6976 static void SeqSearchQueueAdd (
6977 Int2Ptr queue,
6978 Int2 qbeg,
6979 Int2 val
6980 )
6981
6982 {
6983 Int2 q;
6984
6985 q = queue [qbeg];
6986 if (q == 0) {
6987 queue [qbeg] = val;
6988 } else {
6989 for ( ; queue [q] != 0; q = queue [q]) continue;
6990 queue [q] = val;
6991 }
6992 queue [val] = 0;
6993 }
6994
/*
  Sets the failure link of newState.  Starting from state, failure links are
  followed until some state has a transition on ch (state 0 always yields a
  result); that target becomes newState's onfailure.  Every match recorded
  on the target is also recorded on newState.
*/

static void SeqSearchFindFail (
  SeqSearchPtr tbl,
  Int2 state,
  Int2 newState,
  Char ch
)

{
  Int2         target;
  SeqMatchPtr  hit;

  /* traverse existing failure path */

  target = SeqSearchGotoState (tbl, state, ch, TRUE);
  while (target == FAIL_STATE) {
    state = SeqSearchFailState (tbl, state);
    target = SeqSearchGotoState (tbl, state, ch, TRUE);
  }

  /* add new failure state */

  tbl->stateArray [(int) newState].onfailure = target;

  /* add matches of substring at new state */

  for (hit = tbl->stateArray [(int) target].matches; hit != NULL; hit = hit->next) {
    SeqSearchAddOutput (tbl, newState, hit->name, hit->pattern,
                        hit->cutSite, hit->strand);
  }
}
7026
/*
  Computes the failure link of every state, visiting states in order of
  increasing depth by means of the array-based queue.  States reached
  directly from state 0 fail back to state 0; deeper states get their link
  from SeqSearchFindFail, using the link already computed for their parent.

  queue must have room for one entry per state.
*/

static void SeqSearchComputeFail (
  SeqSearchPtr tbl,
  Int2Ptr queue
)

{
  CharPtr      charToNuc = "ACGTMRWSYKVHDBN";
  Char         ch;
  Int2         qbeg, r, s, state;
  int          index;
  SeqStatePtr  sp;

  qbeg = 0;
  queue [0] = 0;

  /* queue up states reached directly from state 0 (depth 1) */

  sp = &(tbl->stateArray [0]);
  for (index = 0; index < 15; index++) {
    s = sp->transitions [index];
    if (s == 0) continue;

    /* the depth 1 state itself (not state 0) is the one that fails to state 0 */

    tbl->stateArray [(int) s].onfailure = 0;
    SeqSearchQueueAdd (queue, qbeg, s);
  }

  while (queue [qbeg] != 0) {
    r = queue [qbeg];
    qbeg = r;

    /* depth 1 states beget depth 2 states, etc. */

    sp = &(tbl->stateArray [r]);
    for (index = 0; index < 15; index++) {
      ch = charToNuc [index];
      s = sp->transitions [index];
      if (s == 0) continue;
      SeqSearchQueueAdd (queue, qbeg, s);

      /*
         Search for nucleotide sequences GTCGAC and TCATGA

         State   Substring   Transitions   Failure
           2       GT          C ->   3       7
           3       GTC         G ->   4       ?
           ...
           7       T           C ->   8       0
           8       TC          A ->   9

         For example, r = 2 (GT), if 'C' would go to s = 3 (GTC).
         From previous computation, 2 (GT) fails to 7 (T).  So we
         are not in a pattern starting with GT, but we may be in
         a pattern starting with the next character after G, or T.
         Thus, check state 7 (T) for any transitions using 'C'.
         Since 7 (T) 'C' -> 8 (TC), therefore set fail [3] -> 8.
      */

      state = SeqSearchFailState (tbl, r);
      SeqSearchFindFail (tbl, state, s, ch);
    }
  }
}
7088
7089 /* on first character, populate state transition table */
7090
SeqSearchPrimeStateArray(SeqSearchPtr tbl)7091 static void SeqSearchPrimeStateArray (
7092 SeqSearchPtr tbl
7093 )
7094
7095 {
7096 Int2 highState, maxState;
7097 SeqPatternPtr pp;
7098 Int2Ptr queue;
7099 SeqStatePtr stateArray;
7100
7101 if (tbl == NULL || tbl->primed || tbl->patternList == NULL) return;
7102
7103 for (maxState = 1, pp = tbl->patternList; pp != NULL; pp = pp->next) {
7104 maxState += StringLen (pp->pattern);
7105 }
7106
7107 if (maxState > 4000) {
7108 Message (MSG_POST, "FiniteStateSearch cannot handle %d states", (int) maxState);
7109 return;
7110 }
7111
7112 stateArray = (SeqStatePtr) MemNew (sizeof (SeqStateItem) * (size_t) maxState);
7113 queue = (Int2Ptr) MemNew (sizeof (Int2) * maxState);
7114
7115 if (stateArray == NULL || queue == NULL) {
7116 MemFree (stateArray);
7117 MemFree (queue);
7118 Message (MSG_POST, "SequenceSearch unable to allocate buffers");
7119 return;
7120 }
7121
7122 tbl->stateArray = stateArray;
7123 tbl->maxState = maxState;
7124
7125 for (highState = 0, pp = tbl->patternList; pp != NULL; pp = pp->next) {
7126 highState = SeqSearchEnterNucWord (tbl, highState, maxState, pp->name,
7127 pp->pattern, pp->cutSite, pp->strand);
7128 }
7129
7130 SeqSearchComputeFail (tbl, queue);
7131
7132 MemFree (queue);
7133
7134 tbl->highState = highState;
7135 tbl->currentState = 0;
7136 tbl->currentPos = 0;
7137 tbl->primed = TRUE;
7138 }
7139
7140 /* for testing, print summary of transition table */
7141
7142 /*
7143 static void PrintSeqSearchTable (
7144 SeqSearchPtr tbl,
7145 FILE *fp
7146 )
7147
7148 {
7149 Int2 i;
7150 SeqMatchPtr mp;
7151 SeqStatePtr sp;
7152 Int2 state;
7153
7154 if (tbl == NULL || fp == NULL) return;
7155 if (! tbl->primed) {
7156 SeqSearchPrimeStateArray (tbl);
7157 }
7158 if (tbl->stateArray == NULL) return;
7159 if (tbl->highState > 99) return;
7160
7161 fprintf (fp, "State Fail A C G T M R W S Y K V H D B N\n");
7162
7163 for (state = 0; state <= tbl->highState; state++) {
7164 sp = &(tbl->stateArray [(int) state]);
7165 fprintf (fp, " %3d %3d", (int) state, (int) sp->onfailure);
7166
7167 for (i = 0; i < 15; i++) {
7168 if (sp->transitions [i] != 0) {
7169 fprintf (fp, "%3d", (int) sp->transitions [i]);
7170 } else {
7171 fprintf (fp, " ");
7172 }
7173 }
7174
7175 for (mp = sp->matches; mp != NULL; mp = mp->next) {
7176 fprintf (fp, " %s", mp->name);
7177 }
7178
7179 fprintf (fp, "\n");
7180 }
7181 }
7182 */
7183
7184 /* create empty nucleotide sequence search finite state machine */
7185
SeqSearchNew(SeqSearchMatchProc matchproc,Pointer userdata)7186 NLM_EXTERN SeqSearchPtr SeqSearchNew (
7187 SeqSearchMatchProc matchproc,
7188 Pointer userdata
7189 )
7190
7191 {
7192 CharPtr charToNuc = "ACGTMRWSYKVHDBN";
7193 Char ch, lttr;
7194 CharPtr complementBase = " TVGH CD M KN YSAABW R ";
7195 Int2 i;
7196 Uint1 k;
7197 SeqSearchPtr tbl;
7198
7199 if (matchproc == NULL) return NULL;
7200 tbl = (SeqSearchPtr) MemNew (sizeof (SeqSearchData));
7201 if (tbl == NULL) return NULL;
7202
7203 tbl->stateArray = NULL;
7204 tbl->patternList = NULL;
7205 tbl->maxPatLen = 0;
7206 tbl->maxState = 0;
7207 tbl->highState = 0;
7208 tbl->currentState = 0;
7209 tbl->currentPos = 0;
7210 tbl->matchproc = matchproc;
7211 tbl->userdata = userdata;
7212 tbl->primed = FALSE;
7213
7214 /* initialize table to convert character to transition index from 0 (A) to 14 (N) */
7215
7216 for (i = 0; i < 256; i++) {
7217 tbl->letterToIdx [i] = 14;
7218 }
7219 for (k = 0; k < 15; k++) {
7220 ch = charToNuc [k];
7221 tbl->letterToIdx [(int) ch] = k;
7222 ch = TO_LOWER (ch);
7223 tbl->letterToIdx [(int) ch] = k;
7224 }
7225 tbl->letterToIdx [(int) 'U'] = tbl->letterToIdx [(int) 'T'];
7226 tbl->letterToIdx [(int) 'u'] = tbl->letterToIdx [(int) 'T'];
7227 tbl->letterToIdx [(int) 'X'] = tbl->letterToIdx [(int) 'N'];
7228 tbl->letterToIdx [(int) 'x'] = tbl->letterToIdx [(int) 'N'];
7229
7230 /* initialize table to convert character to complement character */
7231
7232 for (i = 0; i < 256; i++) {
7233 tbl->letterToComp [i] = '\0';
7234 }
7235 for (ch = 'A', i = 1; ch <= 'Z'; ch++, i++) {
7236 lttr = complementBase [i];
7237 if (lttr != ' ') {
7238 tbl->letterToComp [(int) (Uint1) ch] = lttr;
7239 }
7240 }
7241 for (ch = 'a', i = 1; ch <= 'z'; ch++, i++) {
7242 lttr = complementBase [i];
7243 if (lttr != ' ') {
7244 tbl->letterToComp [(int) (Uint1) ch] = lttr;
7245 }
7246 }
7247
7248 return tbl;
7249 }
7250
7251 /* table to expand ambiguity letter to all matching nucleotide letters */
7252
/* indexed by (letter - 'A'); each entry lists the A/C/G/T letters that the
   pattern letter expands to, and an empty string means the letter has no
   expansion, so a pattern containing it yields no expanded strings */
static CharPtr nucExpandList [26] = {
  "A",     /* A */
  "CGT",   /* B */
  "C",     /* C */
  "AGT",   /* D */
  "",      /* E - no expansion */
  "",      /* F - no expansion */
  "G",     /* G */
  "ACT",   /* H */
  "",      /* I - no expansion */
  "",      /* J - no expansion */
  "GT",    /* K */
  "",      /* L - no expansion */
  "AC",    /* M */
  "ACGT",  /* N */
  "",      /* O - no expansion */
  "",      /* P - no expansion */
  "",      /* Q - no expansion */
  "AG",    /* R */
  "CG",    /* S */
  "T",     /* T */
  "T",     /* U - same as T */
  "ACG",   /* V */
  "AT",    /* W */
  "",      /* X - no expansion */
  "CT",    /* Y */
  ""       /* Z - no expansion */
};
7281
7282 /* recursive function to expand and store appropriate individual patterns */
7283
/*
 * prepends one pattern record to tbl->patternList and widens tbl->maxPatLen
 * if this string is longer than any stored so far.  the record keeps the
 * StringSave results for name and str, which FreePatternList later releases.
 * silently does nothing if the record cannot be allocated.
 */
static void StoreSeqPattern (
  SeqSearchPtr tbl,
  CharPtr name,
  CharPtr str,
  Int2 cutSite,
  Uint1 strand
)

{
  SeqPatternPtr  item;
  Int4           len;

  item = (SeqPatternPtr) MemNew (sizeof (SeqPatternItem));
  if (item == NULL) return;

  item->name = StringSave (name);
  item->pattern = StringSave (str);
  item->cutSite = cutSite;
  item->strand = strand;

  /* newest record becomes the head of the list */

  item->next = tbl->patternList;
  tbl->patternList = item;

  /* track the longest stored pattern */

  len = StringLen (str);
  if (tbl->maxPatLen < len) {
    tbl->maxPatLen = len;
  }
}
7311
/*
 * recursively builds concrete search strings from pattern and records each
 * one with StoreSeqPattern
 *
 *   tbl       table that receives the stored patterns
 *   name      label stored with every generated pattern
 *   pattern   source pattern, at least patLen characters long
 *   cutSite   cut position stored with every generated pattern
 *   strand    strand value stored with every generated pattern
 *   patLen    number of pattern characters to process
 *   str       scratch buffer being filled in; it must hold '\0' at index
 *             patLen (the visible caller zero-fills it)
 *   position  index handled by this call, 0 for the outermost call
 *   flags     SEQ_SEARCH_EXPAND_PATTERN expands each letter through
 *             nucExpandList; SEQ_SEARCH_ALLOW_MISMATCH additionally stores
 *             one variant per position with that position replaced by N
 *
 * when expanding, a character outside A-Z or a letter with an empty
 * nucExpandList entry produces no stored patterns for that branch.
 */
static void ExpandSeqPattern (
  SeqSearchPtr tbl,
  CharPtr name,
  CharPtr pattern,
  Int2 cutSite,
  Uint1 strand,
  size_t patLen,
  CharPtr str,
  Uint2 position,
  SearchFlgType flags
)

{
  Char     ch, lttr;
  Uint2    idx;
  CharPtr  ptr;

  if (position < patLen) {

    if ((Boolean) ((flags & SEQ_SEARCH_EXPAND_PATTERN) != 0)) {

      ch = pattern [position];

      /* nucExpandList has exactly 26 entries, one per upper case letter;
         any other character would index outside the table, so treat it
         like a letter with no expansion and generate nothing */

      if (ch < 'A' || ch > 'Z') return;

      /* given ambiguity letter, get index into nucExpandList */

      idx = (Uint2) (ch - 'A');
      ptr = nucExpandList [idx];

      /* put every ACGT letter at current position, recurse for next position */

      for (lttr = *ptr; lttr != '\0'; ptr++, lttr = *ptr) {
        str [position] = lttr;
        ExpandSeqPattern (tbl, name, pattern, cutSite, strand,
                          patLen, str, position + 1, flags);
      }

    } else {

      /* if matching ambiguity characters in sequence, do not expand each base */

      str [position] = pattern [position];
      ExpandSeqPattern (tbl, name, pattern, cutSite, strand,
                        patLen, str, position + 1, flags);
    }

    /* do not run into pattern storage section of code located below */

    return;
  }

  /* when position reaches pattern length, store one fully expanded string */

  StoreSeqPattern (tbl, name, str, cutSite, strand);

  if ((Boolean) ((flags & SEQ_SEARCH_ALLOW_MISMATCH) == 0)) return;

  for (idx = 0; idx < patLen; idx++) {
    ch = str [idx];

    /* put N at every position if a single mismatch is allowed */

    str [idx] = 'N';

    StoreSeqPattern (tbl, name, str, cutSite, strand);

    /* now restore proper character, go on to put N in next position */

    str [idx] = ch;
  }
}
7381
7382 /* add restriction site to sequence search finite state machine */
7383
SeqSearchAddNucleotidePattern(SeqSearchPtr tbl,CharPtr name,CharPtr pattern,Int2 cutSite,SearchFlgType flags)7384 NLM_EXTERN void SeqSearchAddNucleotidePattern (
7385 SeqSearchPtr tbl,
7386 CharPtr name,
7387 CharPtr pattern,
7388 Int2 cutSite,
7389 SearchFlgType flags
7390 )
7391
7392 {
7393 Char ch, comp [128], pat [128], str [128];
7394 Int2 i, j;
7395 size_t len;
7396 Uint1 strand;
7397 Boolean symmetric = TRUE;
7398
7399 if (tbl == NULL || StringHasNoText (name) || StringHasNoText (pattern)) return;
7400
7401 StringNCpy_0 (pat, pattern, sizeof (pat));
7402 TrimSpacesAroundString (pat);
7403
7404 len = StringLen (pat);
7405
7406 /* upper case working copy of pattern string */
7407
7408 for (i = 0; i < len; i++) {
7409 ch = pat [i];
7410 pat [i] = TO_UPPER (ch);
7411 }
7412
7413 /* reverse complement pattern to see if it is symetrical */
7414
7415 for (i = 0, j = len - 1; i < len; i++, j--) {
7416 ch = pat [i];
7417 comp [j] = tbl->letterToComp [(int) (Uint1) ch];
7418 }
7419 comp [len] = '\0';
7420 symmetric = (Boolean) (StringICmp (pat, comp) == 0);
7421
7422 if (symmetric) {
7423 strand = Seq_strand_both;
7424 } else {
7425 strand = Seq_strand_plus;
7426 }
7427
7428 /* record expansion of entered pattern */
7429
7430 MemSet ((Pointer) str, 0, sizeof (str));
7431 ExpandSeqPattern (tbl, name, pat, cutSite, strand,
7432 len, str, 0, flags);
7433
7434 if (symmetric) return;
7435 if ((Boolean) ((flags & SEQ_SEARCH_JUST_TOP_STRAND) != 0)) return;
7436
7437 /* record expansion of reverse complement of asymmetric pattern */
7438
7439 MemSet ((Pointer) str, 0, sizeof (str));
7440 ExpandSeqPattern (tbl, name, comp, len - cutSite, Seq_strand_minus,
7441 len, str, 0, flags);
7442 }
7443
7444 /* program passes each character in turn to finite state machine */
7445
/*
 * feeds one sequence character to the state machine and reports every
 * pattern that ends at it
 *
 *   tbl     search table; the state array is built on first use
 *   ch      next character of the sequence being scanned
 *   length  matches whose start offset is >= length are not reported
 *           (used to suppress duplicates when a circular sequence is
 *           re-scanned past its origin)
 */
static void SeqSearchProcessCharacterEx (
  SeqSearchPtr tbl,
  Char ch,
  Int4 length
)

{
  SeqMatchPtr  hit;
  Int4         hitLen, startPos;
  SeqStatePtr  node;
  Int2         state, target;

  if (tbl == NULL) return;
  if (! tbl->primed) {
    SeqSearchPrimeStateArray (tbl);
  }
  if (tbl->stateArray == NULL) return;

  /* follow failure links until some state accepts ch; the goto call is
     asked to return 0 rather than fail when it is probed at state 0 */

  state = tbl->currentState;
  for (;;) {
    target = SeqSearchGotoState (tbl, state, ch, TRUE);
    if (target != FAIL_STATE) break;
    state = SeqSearchFailState (tbl, state);
  }

  tbl->currentState = target;
  (tbl->currentPos)++;

  /*
    States while traversing search sequence containing EcoRI site (GAATTC)
    AAGCTTGCATGCCTGCAGGTCGACTCTAGAGGATCCCCGGGTACCGAGCTCGAATTCGAGCTCGGTACCCGGGGATCCTC
    00100010001000100110012000001211200000111000012100012345612100011000001111200000
    (state 6 is reached on the final C of GAATTC)
  */

  /* hand each match recorded on the new state to the callback */

  node = &(tbl->stateArray [(int) target]);
  for (hit = node->matches; hit != NULL; hit = hit->next) {

    /* start offset of the match; currentPos already counts ch */

    hitLen = StringLen (hit->pattern);
    startPos = tbl->currentPos - hitLen;

    /* for circular sequences, prevent multiple reports of patterns */

    if (startPos >= length) continue;

    tbl->matchproc (startPos,
                    hit->name, hit->pattern, hit->cutSite,
                    hit->strand, tbl->userdata);
  }
}
7498
/* public single-character entry point: forwards to
   SeqSearchProcessCharacterEx with INT4_MAX as the length limit */
NLM_EXTERN void SeqSearchProcessCharacter (
  SeqSearchPtr tbl,
  Char ch
)

{
  SeqSearchProcessCharacterEx (tbl, ch, INT4_MAX);
}
7507
7508 /* convenience function calls SeqSearchProcessCharacter for entire nucleotide bioseq */
7509
/* context handed to SearchSeqProc through the stream callback userdata */
typedef struct seqsrchdata {
  SeqSearchPtr  tbl;     /* search table to feed characters into */
  Int4          length;  /* bioseq length, passed on as the match start limit */
} SeqSrchData, PNTR SeqSrchPtr;
7514
SearchSeqProc(CharPtr sequence,Pointer userdata)7515 static void LIBCALLBACK SearchSeqProc (
7516 CharPtr sequence,
7517 Pointer userdata
7518 )
7519
7520 {
7521 Char ch;
7522 CharPtr ptr;
7523 SeqSrchPtr ssp;
7524
7525 ssp = (SeqSrchPtr) userdata;
7526
7527 ptr = sequence;
7528 ch = *ptr;
7529 while (ch != '\0') {
7530 ch = TO_UPPER (ch);
7531 SeqSearchProcessCharacterEx (ssp->tbl, ch, ssp->length);
7532 ptr++;
7533 ch = *ptr;
7534 }
7535 }
7536
SeqSearchProcessBioseq(SeqSearchPtr tbl,BioseqPtr bsp)7537 NLM_EXTERN void SeqSearchProcessBioseq (
7538 SeqSearchPtr tbl,
7539 BioseqPtr bsp
7540 )
7541
7542 {
7543 SeqSrchData ssd;
7544
7545 SeqSearchReset (tbl);
7546
7547 if (tbl == NULL || bsp == NULL) return;
7548
7549 if (! ISA_na (bsp->mol)) return;
7550
7551 ssd.tbl = tbl;
7552 ssd.length = bsp->length;
7553
7554 SeqPortStream (bsp, STREAM_EXPAND_GAPS, (Pointer) &ssd, SearchSeqProc);
7555
7556 /* for circular molecules, check for patterns spanning origin */
7557
7558 if (bsp->topology == TOPOLOGY_CIRCULAR && bsp->length > tbl->maxPatLen) {
7559 SeqPortStreamInt (bsp, 0, tbl->maxPatLen, Seq_strand_plus, STREAM_EXPAND_GAPS, (Pointer) &ssd, SearchSeqProc);
7560 }
7561
7562 SeqSearchReset (tbl);
7563 }
7564
7565 /* reset state and position to allow another run with same search patterns */
7566
SeqSearchReset(SeqSearchPtr tbl)7567 NLM_EXTERN void SeqSearchReset (
7568 SeqSearchPtr tbl
7569 )
7570
7571 {
7572 if (tbl == NULL) return;
7573
7574 tbl->currentState = 0;
7575 tbl->currentPos = 0;
7576 }
7577
7578 /* clean up sequence search finite state machine allocated memory */
7579
/* frees every record of a pattern list together with its name and pattern
   strings; accepts NULL and always returns NULL */
static SeqPatternPtr FreePatternList (
  SeqPatternPtr pp
)

{
  SeqPatternPtr  following;

  for ( ; pp != NULL; pp = following) {

    /* remember the successor before the record is released */

    following = pp->next;
    pp->next = NULL;
    MemFree (pp->name);
    MemFree (pp->pattern);
    MemFree (pp);
  }

  return NULL;
}
7598
/* frees every record of a match list together with its name and pattern
   strings; accepts NULL and always returns NULL */
static SeqMatchPtr FreeMatchList (
  SeqMatchPtr mp
)

{
  SeqMatchPtr  following;

  for ( ; mp != NULL; mp = following) {

    /* remember the successor before the record is released */

    following = mp->next;
    mp->next = NULL;
    MemFree (mp->name);
    MemFree (mp->pattern);
    MemFree (mp);
  }

  return NULL;
}
7617
SeqSearchFree(SeqSearchPtr tbl)7618 NLM_EXTERN SeqSearchPtr SeqSearchFree (
7619 SeqSearchPtr tbl
7620 )
7621
7622 {
7623 Int2 maxState, state;
7624
7625 if (tbl == NULL) return NULL;
7626
7627 maxState = tbl->maxState;
7628
7629 for (state = 0; state < maxState; state++) {
7630 FreeMatchList (tbl->stateArray [state].matches);
7631 }
7632
7633 FreePatternList (tbl->patternList);
7634
7635 MemFree (tbl->stateArray);
7636 return MemFree (tbl);
7637 }
7638
7639 /*
7640
7641 static CharPtr testseq =
7642 "AAGCTTGCATGCCTGCAGGTCGACTCTAGAGGATCCGRATYCCCGGGTACCGAGCTATYCCGAATTCGAGCTCGGTACCCGGGGATCCTCGANTTCATTCGAPTTCCAGTC";
7643
7644 static void MatchProc (Int4 position, CharPtr name, CharPtr pattern,
7645 Int2 cutSite, Uint1 strand, Pointer userdata)
7646
7647 {
7648 Message (MSG_POST, "Name '%s', Pattern '%s', Position %ld",
7649 name, pattern, (long) position);
7650 }
7651
7652
7653 extern void TestSeqSearch (void);
7654 extern void TestSeqSearch (void)
7655
7656 {
7657 Char ch;
7658 CharPtr ptr;
7659 SeqSearchPtr tbl;
7660
7661 tbl = SeqSearchNew (MatchProc, NULL);
7662 if (tbl == NULL) return;
7663
7664 SeqSearchAddNucleotidePattern (tbl, "AmbiG", "GRATYC", 1, SEQ_SEARCH_EXPAND_PATTERN);
7665 SeqSearchAddNucleotidePattern (tbl, "ExacT", "GRAT", 1, SEQ_SEARCH_JUST_TOP_STRAND);
7666
7667 for (ptr = testseq, ch = *ptr; ch != '\0'; ptr++, ch = *ptr) {
7668 SeqSearchProcessCharacter (tbl, ch);
7669 }
7670
7671 SeqSearchFree (tbl);
7672 }
7673
7674 */
7675
7676 /*****************************************************************************
7677 *
7678 * ProtSearch
7679 * Initializes ProtSearch finite state machine for sequence searching
7680 * Based on Practical Algorithms for Programmers by Binstock and Rex
7681 *
7682 *****************************************************************************/
7683
7684 /* general purpose protein sequence search finite state machine */
7685
/* one registered (already expanded) protein pattern, kept in a singly
   linked list headed by ProtSearchData.patternList */
typedef struct protpattern {
  CharPtr               name;     /* label saved by StoreProtPattern */
  CharPtr               pattern;  /* pattern string saved by StoreProtPattern */
  struct protpattern *  next;     /* next record, NULL at end of list */
} ProtPatternItem, PNTR ProtPatternPtr;
7691
/* one pattern to report when a state is reached, kept in a singly linked
   list headed by ProtStateItem.matches */
typedef struct protmatch {
  CharPtr             name;     /* label passed to the match callback */
  CharPtr             pattern;  /* matched pattern; its length gives the match start */
  struct protmatch *  next;     /* next record, NULL at end of list */
} ProtMatchItem, PNTR ProtMatchPtr;
7697
/* one state of the protein search machine */
typedef struct protstate {
  Int2          onfailure;         /* state to retry from when no transition applies */
  Int2          transitions [27];  /* order is ABCDEFGHIJKLMNOPQRSTUVWXYZ;
                                      0 means no transition.  the code in this
                                      file only uses indices 0 through 25 */
  ProtMatchPtr  matches;           /* patterns reported on reaching this state */
} ProtStateItem, PNTR ProtStatePtr;
7703
/* protein sequence search table: registered patterns, the state machine
   built from them, and the current scan position */
typedef struct ProtSearch {
  ProtStatePtr         stateArray;         /* states, allocated by ProtSearchPrimeStateArray */
  ProtPatternPtr       patternList;        /* registered patterns, newest first */
  Int4                 maxPatLen;          /* length of the longest registered pattern */
  Int2                 maxState;           /* number of allocated entries in stateArray */
  Int2                 highState;          /* highest state number in use */
  Int2                 currentState;       /* state after the last processed character */
  Int4                 currentPos;         /* number of characters processed since reset */
  Boolean              primed;             /* TRUE once stateArray has been built */
  ProtSearchMatchProc  matchproc;          /* callback invoked for each match */
  Pointer              userdata;           /* passed through to matchproc */
  Uint1                letterToIdx [256];  /* byte value to transitions index */
} ProtSearchData;
7717
/* sentinel returned by ProtSearchGotoState when the state has no transition
   for the character.  NOTE(review): FAIL_STATE is already used by the
   nucleotide search code above this line, so an earlier definition
   presumably exists; this one must stay token-identical to it - confirm
   before changing the spelling (e.g. adding parentheses) */
#define FAIL_STATE -1
7719
7720 /* returns next state given current state and next character */
7721
/*
 * looks up the transition out of state for character ch
 *
 * returns the target state if one is recorded (non-zero entry).  with no
 * recorded transition it returns 0 when state is 0 and
 * zeroFailureReturnsZero is set, and FAIL_STATE in every other case.
 */
static Int2 ProtSearchGotoState (
  ProtSearchPtr tbl,
  Int2 state,
  Char ch,
  Boolean zeroFailureReturnsZero
)

{
  ProtStatePtr  node;
  Int2          target;

  node = &(tbl->stateArray [(int) state]);
  target = node->transitions [tbl->letterToIdx [(int) (Uint1) ch]];

  if (target != 0) return target;

  /* the root may absorb an unmatched character instead of failing */

  if (zeroFailureReturnsZero && state == 0) return 0;

  return FAIL_STATE;
}
7744
7745 /* returns state to check next if current pattern broken */
7746
/* returns the onfailure link stored on the given state */
static Int2 ProtSearchFailState (
  ProtSearchPtr tbl,
  Int2 state
)

{
  return tbl->stateArray [(int) state].onfailure;
}
7758
7759 /* add a single character transition from one state to another */
7760
/* records that reading ch in oldState leads to newState, overwriting any
   transition previously stored in that slot */
static void ProtSearchAddTransition (
  ProtSearchPtr tbl,
  Int2 oldState,
  Char ch,
  Int2 newState
)

{
  int  column;

  column = tbl->letterToIdx [(int) (Uint1) ch];
  tbl->stateArray [(int) oldState].transitions [column] = newState;
}
7776
7777 /* given state should report a successful match */
7778
/*
 * attaches a match record (saved copies of name and pattern) to the head of
 * the given state's match list.  nothing is added if the state already has
 * a record with the same name, or if the record cannot be allocated.
 */
static void ProtSearchAddOutput (
  ProtSearchPtr tbl,
  Int2 state,
  CharPtr name,
  CharPtr pattern
)

{
  ProtMatchPtr  entry, scan;
  ProtStatePtr  node;

  node = &(tbl->stateArray [(int) state]);

  /* duplicates are detected by name only */

  scan = node->matches;
  while (scan != NULL) {
    if (StringCmp (name, scan->name) == 0) return;
    scan = scan->next;
  }

  entry = (ProtMatchPtr) MemNew (sizeof (ProtMatchItem));
  if (entry == NULL) return;

  entry->name = StringSave (name);
  entry->pattern = StringSave (pattern);

  entry->next = node->matches;
  node->matches = entry;
}
7804
7805 /* add one protein sequence pattern to the finite state machine */
7806
/*
 * threads one pattern into the transition table
 *
 *   highState  highest state number allocated so far
 *   maxState   not used by this function
 *   name       label recorded on the pattern's final state
 *   pattern    pattern string to enter
 *
 * returns the updated highest state number.
 */
static Int2 ProtSearchEnterProtWord (
  ProtSearchPtr tbl,
  Int2 highState,
  Int2 maxState,
  CharPtr name,
  CharPtr pattern
)

{
  CharPtr  cursor;
  Int2     state, target;

  state = 0;
  cursor = pattern;

  /* walk the leading characters that already have states in the table */

  while (*cursor != '\0') {
    target = ProtSearchGotoState (tbl, state, *cursor, FALSE);
    if (target == FAIL_STATE) break;
    state = target;
    cursor++;
  }

  /* every remaining character gets a brand new state */

  while (*cursor != '\0') {
    highState++;
    ProtSearchAddTransition (tbl, state, *cursor, highState);
    state = highState;
    cursor++;
  }

  /* the state reached by the last character reports the match */

  ProtSearchAddOutput (tbl, state, name, pattern);

  return highState;
}
7847
7848 /* FIFO queue and other functions for building failure states */
7849
/*
 * appends val to the queue.  the queue is a chain stored in an array:
 * queue [x] holds the entry that follows x, and 0 terminates the chain.
 * the chain is walked from qbeg to its last entry, val is linked after it,
 * and val becomes the new terminator.
 */
static void ProtSearchQueueAdd (
  Int2Ptr queue,
  Int2 qbeg,
  Int2 val
)

{
  Int2  tail;

  /* find the entry whose successor link is still 0 */

  tail = qbeg;
  while (queue [tail] != 0) {
    tail = queue [tail];
  }

  queue [tail] = val;
  queue [val] = 0;
}
7868
/*
 * computes the failure link of newState, which is entered by character ch
 *
 *   state     state to start the search from (the caller passes the
 *             failure state of newState's parent)
 *   newState  state whose onfailure link is being set
 *   ch        character on the transition into newState
 *
 * the matches recorded on the failure target are also added to newState.
 */
static void ProtSearchFindFail (
  ProtSearchPtr tbl,
  Int2 state,
  Int2 newState,
  Char ch
)

{
  ProtMatchPtr  inherited;
  Int2          target;

  /* back off along failure links until some state accepts ch */

  for (;;) {
    target = ProtSearchGotoState (tbl, state, ch, TRUE);
    if (target != FAIL_STATE) break;
    state = ProtSearchFailState (tbl, state);
  }

  /* that state's successor is where newState falls back to */

  tbl->stateArray [(int) newState].onfailure = target;

  /* copy the fallback state's matches onto newState */

  for (inherited = tbl->stateArray [(int) target].matches;
       inherited != NULL;
       inherited = inherited->next) {
    ProtSearchAddOutput (tbl, newState, inherited->name, inherited->pattern);
  }
}
7899
/*
 * fills in the onfailure link of every state, visiting states in order of
 * increasing depth
 *
 *   tbl    table whose transitions have already been entered
 *   queue  scratch array with one slot per state, used as the linked FIFO
 *          maintained by ProtSearchQueueAdd
 */
static void ProtSearchComputeFail (
  ProtSearchPtr tbl,
  Int2Ptr queue
)

{
  CharPtr       charToProt = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
  Char          ch;
  Int2          qbeg, r, s, state;
  int           index;
  ProtStatePtr  sp;

  qbeg = 0;
  queue [0] = 0;

  /* queue up states reached directly from state 0 (depth 1) */

  sp = &(tbl->stateArray [0]);
  for (index = 0; index < 26; index++) {
    s = sp->transitions [index];
    if (s == 0) continue;

    /* a depth 1 state falls back to state 0; the link belongs to state s,
       not to state 0 itself */

    tbl->stateArray [(int) s].onfailure = 0;
    ProtSearchQueueAdd (queue, qbeg, s);
  }

  while (queue [qbeg] != 0) {
    r = queue [qbeg];
    qbeg = r;

    /* depth 1 states beget depth 2 states, etc. */

    sp = &(tbl->stateArray [r]);
    for (index = 0; index < 26; index++) {
      s = sp->transitions [index];
      if (s == 0) continue;
      ch = charToProt [index];
      ProtSearchQueueAdd (queue, qbeg, s);

      /* the child's failure link is searched for starting at the
         parent's failure state */

      state = ProtSearchFailState (tbl, r);
      ProtSearchFindFail (tbl, state, s, ch);
    }
  }
}
7943
7944 /* on first character, populate state transition table */
7945
/*
 * builds the state machine from the registered patterns.  does nothing if
 * tbl is NULL, already primed, or has no patterns.  on success sets
 * stateArray, maxState and highState, resets the scan position and marks
 * the table primed.  if more than 4000 states would be needed, or a buffer
 * cannot be allocated, a message is posted and the table is left unprimed
 * with no state array.
 */
static void ProtSearchPrimeStateArray (
  ProtSearchPtr tbl
)

{
  Int2            highState, maxState;
  Int4            len, total;
  ProtPatternPtr  pp;
  Int2Ptr         queue;
  ProtStatePtr    stateArray;

  if (tbl == NULL || tbl->primed || tbl->patternList == NULL) return;

  /* upper bound on states: one per pattern character plus state 0.
     the sum is kept in an Int4 and saturated, because a heavily expanded
     pattern list can exceed the Int2 range and a wrapped count could slip
     under the limit check below */

  total = 1;
  for (pp = tbl->patternList; pp != NULL; pp = pp->next) {
    len = (Int4) StringLen (pp->pattern);
    if (total > INT4_MAX - len) {
      total = INT4_MAX;
      break;
    }
    total += len;
  }

  if (total > 4000) {
    Message (MSG_POST, "FiniteStateSearch cannot handle %d states", (int) total);
    return;
  }

  /* safe to narrow: total is at most 4000 here */

  maxState = (Int2) total;

  stateArray = (ProtStatePtr) MemNew (sizeof (ProtStateItem) * (size_t) maxState);
  queue = (Int2Ptr) MemNew (sizeof (Int2) * maxState);

  if (stateArray == NULL || queue == NULL) {
    MemFree (stateArray);
    MemFree (queue);
    Message (MSG_POST, "SequenceSearch unable to allocate buffers");
    return;
  }

  tbl->stateArray = stateArray;
  tbl->maxState = maxState;

  /* enter every pattern, then derive the failure links */

  for (highState = 0, pp = tbl->patternList; pp != NULL; pp = pp->next) {
    highState = ProtSearchEnterProtWord (tbl, highState, maxState, pp->name,
                                         pp->pattern);
  }

  ProtSearchComputeFail (tbl, queue);

  /* the queue is only needed while computing failure links */

  MemFree (queue);

  tbl->highState = highState;
  tbl->currentState = 0;
  tbl->currentPos = 0;
  tbl->primed = TRUE;
}
7994
7995 /* for testing, print summary of transition table */
7996
7997 /*
7998 static void PrintProtSearchTable (
7999 ProtSearchPtr tbl,
8000 FILE *fp
8001 )
8002
8003 {
8004 Int2 i;
8005 ProtMatchPtr mp;
8006 ProtStatePtr sp;
8007 Int2 state;
8008
8009 if (tbl == NULL || fp == NULL) return;
8010 if (! tbl->primed) {
8011 ProtSearchPrimeStateArray (tbl);
8012 }
8013 if (tbl->stateArray == NULL) return;
8014 if (tbl->highState > 99) return;
8015
8016 fprintf (fp, "State Fail A B C D E F G H I J K L M N O P Q R S T U V W X Y Z\n");
8017
8018 for (state = 0; state <= tbl->highState; state++) {
8019 sp = &(tbl->stateArray [(int) state]);
8020 fprintf (fp, " %3d %3d", (int) state, (int) sp->onfailure);
8021
8022 for (i = 0; i < 26; i++) {
8023 if (sp->transitions [i] != 0) {
8024 fprintf (fp, "%3d", (int) sp->transitions [i]);
8025 } else {
8026 fprintf (fp, " ");
8027 }
8028 }
8029
8030 for (mp = sp->matches; mp != NULL; mp = mp->next) {
8031 fprintf (fp, " %s", mp->name);
8032 }
8033
8034 fprintf (fp, "\n");
8035 }
8036 }
8037 */
8038
8039 /* create empty protein sequence search finite state machine */
8040
ProtSearchNew(ProtSearchMatchProc matchproc,Pointer userdata)8041 NLM_EXTERN ProtSearchPtr ProtSearchNew (
8042 ProtSearchMatchProc matchproc,
8043 Pointer userdata
8044 )
8045
8046 {
8047 CharPtr charToProt = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
8048 Char ch;
8049 Int2 i;
8050 ProtSearchPtr tbl;
8051
8052 if (matchproc == NULL) return NULL;
8053 tbl = (ProtSearchPtr) MemNew (sizeof (ProtSearchData));
8054 if (tbl == NULL) return NULL;
8055
8056 tbl->stateArray = NULL;
8057 tbl->patternList = NULL;
8058 tbl->maxPatLen = 0;
8059 tbl->maxState = 0;
8060 tbl->highState = 0;
8061 tbl->currentState = 0;
8062 tbl->currentPos = 0;
8063 tbl->matchproc = matchproc;
8064 tbl->userdata = userdata;
8065 tbl->primed = FALSE;
8066
8067 /* initialize table to convert character to transition index from 0 (A) to 25 (Z) */
8068
8069 for (i = 0; i < 256; i++) {
8070 tbl->letterToIdx [i] = 23;
8071 }
8072 for (i = 0; i < 26; i++) {
8073 ch = charToProt [i];
8074 tbl->letterToIdx [(int) ch] = i;
8075 ch = TO_LOWER (ch);
8076 tbl->letterToIdx [(int) ch] = i;
8077 }
8078
8079 return tbl;
8080 }
8081
8082 /* table to expand ambiguity letter to all matching protein letters */
8083
/* indexed by (letter - 'A'); each entry lists the letters that the pattern
   letter expands to.  most letters expand to themselves; B, J, X and Z
   expand to several letters */
static CharPtr protExpandList [26] = {
  "A",                       /* A */
  "DN",                      /* B */
  "C",                       /* C */
  "D",                       /* D */
  "E",                       /* E */
  "F",                       /* F */
  "G",                       /* G */
  "H",                       /* H */
  "I",                       /* I */
  "IL",                      /* J */
  "K",                       /* K */
  "L",                       /* L */
  "M",                       /* M */
  "N",                       /* N */
  "O",                       /* O */
  "P",                       /* P */
  "Q",                       /* Q */
  "R",                       /* R */
  "S",                       /* S */
  "T",                       /* T */
  "U",                       /* U */
  "V",                       /* V */
  "W",                       /* W */
  "ACDEFGHIKLMNOPQRSTUVWY",  /* X - every letter except B, J, X and Z */
  "Y",                       /* Y */
  "EQ"                       /* Z */
};
8112
8113 /* recursive function to expand and store appropriate individual patterns */
8114
/*
 * prepends one pattern record to tbl->patternList and widens tbl->maxPatLen
 * if this string is longer than any stored so far.  the record keeps the
 * StringSave results for name and str, which FreeProtPatternList later
 * releases.  silently does nothing if the record cannot be allocated.
 */
static void StoreProtPattern (
  ProtSearchPtr tbl,
  CharPtr name,
  CharPtr str
)

{
  ProtPatternPtr  item;
  Int4            len;

  item = (ProtPatternPtr) MemNew (sizeof (ProtPatternItem));
  if (item == NULL) return;

  item->name = StringSave (name);
  item->pattern = StringSave (str);

  /* newest record becomes the head of the list */

  item->next = tbl->patternList;
  tbl->patternList = item;

  /* track the longest stored pattern */

  len = StringLen (str);
  if (tbl->maxPatLen < len) {
    tbl->maxPatLen = len;
  }
}
8138
/*
 * recursively builds concrete search strings from pattern and records each
 * one with StoreProtPattern
 *
 *   tbl       table that receives the stored patterns
 *   name      label stored with every generated pattern
 *   pattern   source pattern, at least patLen characters long
 *   patLen    number of pattern characters to process
 *   str       scratch buffer being filled in; it must hold '\0' at index
 *             patLen (the visible caller zero-fills it)
 *   position  index handled by this call, 0 for the outermost call
 *   flags     SEQ_SEARCH_EXPAND_PATTERN expands each letter through
 *             protExpandList; SEQ_SEARCH_ALLOW_MISMATCH additionally stores
 *             one variant per position with that position replaced by X
 *
 * when expanding, a character outside A-Z produces no stored patterns for
 * that branch.
 */
static void ExpandProtPattern (
  ProtSearchPtr tbl,
  CharPtr name,
  CharPtr pattern,
  size_t patLen,
  CharPtr str,
  Int2 position,
  SearchFlgType flags
)

{
  Char     ch, lttr;
  Int2     idx;
  CharPtr  ptr;

  if (position < patLen) {

    if ((Boolean) ((flags & SEQ_SEARCH_EXPAND_PATTERN) != 0)) {

      ch = pattern [position];

      /* protExpandList has exactly 26 entries, one per upper case letter;
         any other character would index outside the table, so it cannot
         be expanded and generates nothing */

      if (ch < 'A' || ch > 'Z') return;

      /* given ambiguity letter, get index into protExpandList */

      idx = (Int2) (ch - 'A');
      ptr = protExpandList [idx];

      /* put every unambiguous amino acid letter at current
         position, recurse for next position */

      for (lttr = *ptr; lttr != '\0'; ptr++, lttr = *ptr) {
        str [position] = lttr;
        ExpandProtPattern (tbl, name, pattern, patLen, str, position + 1, flags);
      }

    } else {

      /* if matching ambiguity characters in sequence, do not expand each residue */

      str [position] = pattern [position];
      ExpandProtPattern (tbl, name, pattern, patLen, str, position + 1, flags);
    }

    /* do not run into pattern storage section of code located below */

    return;
  }

  /* when position reaches pattern length, store one fully expanded string */

  StoreProtPattern (tbl, name, str);

  if ((Boolean) ((flags & SEQ_SEARCH_ALLOW_MISMATCH) == 0)) return;

  for (idx = 0; idx < patLen; idx++) {
    ch = str [idx];

    /* put X at every position if a single mismatch is allowed */

    str [idx] = 'X';

    StoreProtPattern (tbl, name, str);

    /* now restore proper character, go on to put X in next position */

    str [idx] = ch;
  }
}
8205
8206 /* add protein to sequence search finite state machine */
8207
ProtSearchAddProteinPattern(ProtSearchPtr tbl,CharPtr name,CharPtr pattern,SearchFlgType flags)8208 NLM_EXTERN void ProtSearchAddProteinPattern (
8209 ProtSearchPtr tbl,
8210 CharPtr name,
8211 CharPtr pattern,
8212 SearchFlgType flags
8213 )
8214
8215 {
8216 Char ch, pat [128], str [128];
8217 Int2 i;
8218 size_t len;
8219
8220 if (tbl == NULL || StringHasNoText (name) || StringHasNoText (pattern)) return;
8221
8222 StringNCpy_0 (pat, pattern, sizeof (pat));
8223 TrimSpacesAroundString (pat);
8224
8225 len = StringLen (pat);
8226
8227 /* upper case working copy of pattern string */
8228
8229 for (i = 0; i < len; i++) {
8230 ch = pat [i];
8231 pat [i] = TO_UPPER (ch);
8232 }
8233
8234 /* record expansion of entered pattern */
8235
8236 MemSet ((Pointer) str, 0, sizeof (str));
8237 ExpandProtPattern (tbl, name, pat, len, str, 0, flags);
8238 }
8239
8240 /* program passes each character in turn to finite state machine */
8241
/*
 * feeds one sequence character to the state machine and reports every
 * pattern that ends at it
 *
 *   tbl     search table; the state array is built on first use
 *   ch      next character of the sequence being scanned
 *   length  matches whose start offset is >= length are not reported
 */
static void ProtSearchProcessCharacterEx (
  ProtSearchPtr tbl,
  Char ch,
  Int4 length
)

{
  ProtMatchPtr  hit;
  Int4          hitLen, startPos;
  ProtStatePtr  node;
  Int2          state, target;

  if (tbl == NULL) return;
  if (! tbl->primed) {
    ProtSearchPrimeStateArray (tbl);
  }
  if (tbl->stateArray == NULL) return;

  /* follow failure links until some state accepts ch; state 0 absorbs
     an unmatched character instead of failing */

  state = tbl->currentState;
  for (;;) {
    target = ProtSearchGotoState (tbl, state, ch, TRUE);
    if (target != FAIL_STATE) break;
    state = ProtSearchFailState (tbl, state);
  }

  tbl->currentState = target;
  (tbl->currentPos)++;

  /* hand each match recorded on the new state to the callback */

  node = &(tbl->stateArray [(int) target]);
  for (hit = node->matches; hit != NULL; hit = hit->next) {

    /* start offset of the match; currentPos already counts ch */

    hitLen = StringLen (hit->pattern);
    startPos = tbl->currentPos - hitLen;

    /* skip matches that start at or beyond the length limit */

    if (startPos >= length) continue;

    tbl->matchproc (startPos,
                    hit->name, hit->pattern, tbl->userdata);
  }
}
8285
/* public single-character entry point: forwards to
   ProtSearchProcessCharacterEx with INT4_MAX as the length limit */
NLM_EXTERN void ProtSearchProcessCharacter (
  ProtSearchPtr tbl,
  Char ch
)

{
  ProtSearchProcessCharacterEx (tbl, ch, INT4_MAX);
}
8294
8295 /* convenience function calls ProtSearchProcessCharacter for entire protein bioseq */
8296
/* context handed to SearchProtProc through the stream callback userdata */
typedef struct protsrchdata {
  ProtSearchPtr  tbl;     /* search table to feed characters into */
  Int4           length;  /* bioseq length, passed on as the match start limit */
} ProtSrchData, PNTR ProtSrchPtr;
8301
SearchProtProc(CharPtr sequence,Pointer userdata)8302 static void LIBCALLBACK SearchProtProc (
8303 CharPtr sequence,
8304 Pointer userdata
8305 )
8306
8307 {
8308 Char ch;
8309 CharPtr ptr;
8310 ProtSrchPtr ssp;
8311
8312 ssp = (ProtSrchPtr) userdata;
8313
8314 ptr = sequence;
8315 ch = *ptr;
8316 while (ch != '\0') {
8317 ch = TO_UPPER (ch);
8318 ProtSearchProcessCharacterEx (ssp->tbl, ch, ssp->length);
8319 ptr++;
8320 ch = *ptr;
8321 }
8322 }
8323
ProtSearchProcessBioseq(ProtSearchPtr tbl,BioseqPtr bsp)8324 NLM_EXTERN void ProtSearchProcessBioseq (
8325 ProtSearchPtr tbl,
8326 BioseqPtr bsp
8327 )
8328
8329 {
8330 ProtSrchData ssd;
8331
8332 ProtSearchReset (tbl);
8333
8334 if (tbl == NULL || bsp == NULL) return;
8335
8336 if (! ISA_aa (bsp->mol)) return;
8337
8338 ssd.tbl = tbl;
8339 ssd.length = bsp->length;
8340
8341 SeqPortStream (bsp, STREAM_EXPAND_GAPS, (Pointer) &ssd, SearchProtProc);
8342
8343 ProtSearchReset (tbl);
8344 }
8345
8346 /* reset state and position to allow another run with same search patterns */
8347
ProtSearchReset(ProtSearchPtr tbl)8348 NLM_EXTERN void ProtSearchReset (
8349 ProtSearchPtr tbl
8350 )
8351
8352 {
8353 if (tbl == NULL) return;
8354
8355 tbl->currentState = 0;
8356 tbl->currentPos = 0;
8357 }
8358
8359 /* clean up sequence search finite state machine allocated memory */
8360
/* frees every record of a pattern list together with its name and pattern
   strings; accepts NULL and always returns NULL */
static ProtPatternPtr FreeProtPatternList (
  ProtPatternPtr pp
)

{
  ProtPatternPtr  following;

  for ( ; pp != NULL; pp = following) {

    /* remember the successor before the record is released */

    following = pp->next;
    pp->next = NULL;
    MemFree (pp->name);
    MemFree (pp->pattern);
    MemFree (pp);
  }

  return NULL;
}
8379
/* frees every record of a match list together with its name and pattern
   strings; accepts NULL and always returns NULL */
static ProtMatchPtr FreeProtMatchList (
  ProtMatchPtr mp
)

{
  ProtMatchPtr  following;

  for ( ; mp != NULL; mp = following) {

    /* remember the successor before the record is released */

    following = mp->next;
    mp->next = NULL;
    MemFree (mp->name);
    MemFree (mp->pattern);
    MemFree (mp);
  }

  return NULL;
}
8398
ProtSearchFree(ProtSearchPtr tbl)8399 NLM_EXTERN ProtSearchPtr ProtSearchFree (
8400 ProtSearchPtr tbl
8401 )
8402
8403 {
8404 Int2 maxState, state;
8405
8406 if (tbl == NULL) return NULL;
8407
8408 maxState = tbl->maxState;
8409
8410 for (state = 0; state < maxState; state++) {
8411 FreeProtMatchList (tbl->stateArray [state].matches);
8412 }
8413
8414 FreeProtPatternList (tbl->patternList);
8415
8416 MemFree (tbl->stateArray);
8417 return MemFree (tbl);
8418 }
8419
8420 /*
8421
8422 static CharPtr testseq =
8423 "MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN";
8424
8425 static void MatchProc (Int4 position, CharPtr name, CharPtr pattern, Pointer userdata)
8426
8427 {
8428 Message (MSG_POST, "Name '%s', Pattern '%s', Position %ld",
8429 name, pattern, (long) position);
8430 }
8431
8432
8433 extern void TestProtSearch (void);
8434 extern void TestProtSearch (void)
8435
8436 {
8437 Char ch;
8438 CharPtr ptr;
8439 ProtSearchPtr tbl;
8440
8441 tbl = ProtSearchNew (MatchProc, NULL);
8442 if (tbl == NULL) return;
8443
  ProtSearchAddProteinPattern (tbl, "AmbiG", "GRATYC", SEQ_SEARCH_EXPAND_PATTERN);
8445
8446 for (ptr = testseq, ch = *ptr; ch != '\0'; ptr++, ch = *ptr) {
8447 ProtSearchProcessCharacter (tbl, ch);
8448 }
8449
8450 ProtSearchFree (tbl);
8451 }
8452
8453 */
8454
8455 /*****************************************************************************
8456 *
8457 * Convenience functions for genome processing use BioseqLockById to get sequence
8458 * record (perhaps with phrap quality score graphs) so fetching from some network
8459 * or local server must be enabled, or sequences must already be in memory.
8460 *
8461 *****************************************************************************/
8462
GetSequenceByFeatureEx(SeqFeatPtr sfp,StreamFlgType flags)8463 NLM_EXTERN CharPtr GetSequenceByFeatureEx (SeqFeatPtr sfp, StreamFlgType flags)
8464
8465 {
8466 Int4 len;
8467 CharPtr str = NULL;
8468
8469 if (sfp == NULL) return NULL;
8470 len = SeqLocLen (sfp->location);
8471 if (len > 0 && len < MAXALLOC) {
8472 str = MemNew (sizeof (Char) * (len + 2));
8473 if (str != NULL) {
8474 SeqPortStreamLoc (sfp->location, flags, (Pointer) str, NULL);
8475 }
8476 }
8477
8478 return str;
8479 }
8480
GetSequenceByLocationEx(SeqLocPtr slp,StreamFlgType flags)8481 NLM_EXTERN CharPtr GetSequenceByLocationEx (SeqLocPtr slp, StreamFlgType flags)
8482
8483 {
8484 Int4 len;
8485 CharPtr str = NULL;
8486
8487 if (slp == NULL) return NULL;
8488 len = SeqLocLen (slp);
8489 if (len > 0 && len < MAXALLOC) {
8490 str = MemNew (sizeof (Char) * (len + 2));
8491 if (str != NULL) {
8492 SeqPortStreamLoc (slp, flags, (Pointer) str, NULL);
8493 }
8494 }
8495
8496 return str;
8497 }
8498
GetSequenceByBspEx(BioseqPtr bsp,StreamFlgType flags)8499 NLM_EXTERN CharPtr GetSequenceByBspEx (BioseqPtr bsp, StreamFlgType flags)
8500
8501 {
8502 CharPtr str = NULL;
8503
8504 if (bsp == NULL || bsp->length >= MAXALLOC) return NULL;
8505
8506 str = MemNew (sizeof (Char) * (bsp->length + 2));
8507 if (str == NULL) return NULL;
8508
8509 SeqPortStream (bsp, flags, (Pointer) str, NULL);
8510
8511 return str;
8512 }
8513
GetSequenceByIdOrAccnDotVerEx(SeqIdPtr sip,CharPtr accession,Boolean is_na,StreamFlgType flags)8514 NLM_EXTERN CharPtr GetSequenceByIdOrAccnDotVerEx (SeqIdPtr sip, CharPtr accession, Boolean is_na, StreamFlgType flags)
8515
8516 {
8517 BioseqPtr bsp;
8518 SeqIdPtr deleteme = NULL;
8519 CharPtr str = NULL;
8520
8521 if (sip == NULL) {
8522 if (StringHasNoText (accession)) return NULL;
8523 sip = SeqIdFromAccessionDotVersion (accession);
8524 deleteme = sip; /* allocated seqid, so must later delete it */
8525 }
8526 if (sip == NULL) return NULL;
8527
8528 bsp = BioseqLockById (sip);
8529 SeqIdFree (deleteme);
8530 if (bsp == NULL) return NULL;
8531
8532 if ((ISA_na (bsp->mol) && is_na) || (ISA_aa (bsp->mol) && (! is_na))) {
8533 if (bsp->length < MAXALLOC) {
8534 str = GetSequenceByBspEx (bsp, flags);
8535 }
8536 }
8537
8538 BioseqUnlock (bsp);
8539 return str;
8540 }
8541
GetSequenceByFeature(SeqFeatPtr sfp)8542 NLM_EXTERN CharPtr GetSequenceByFeature (SeqFeatPtr sfp)
8543
8544 {
8545 return GetSequenceByFeatureEx (sfp, STREAM_EXPAND_GAPS);
8546 }
8547
GetSequenceByLocation(SeqLocPtr slp)8548 NLM_EXTERN CharPtr GetSequenceByLocation (SeqLocPtr slp)
8549
8550 {
8551 return GetSequenceByLocationEx (slp, STREAM_EXPAND_GAPS);
8552 }
8553
GetSequenceByBsp(BioseqPtr bsp)8554 NLM_EXTERN CharPtr GetSequenceByBsp (BioseqPtr bsp)
8555
8556 {
8557 return GetSequenceByBspEx (bsp, STREAM_EXPAND_GAPS);
8558 }
8559
GetSequenceByIdOrAccnDotVer(SeqIdPtr sip,CharPtr accession,Boolean is_na)8560 NLM_EXTERN CharPtr GetSequenceByIdOrAccnDotVer (SeqIdPtr sip, CharPtr accession, Boolean is_na)
8561
8562 {
8563 return GetSequenceByIdOrAccnDotVerEx (sip, accession, is_na, STREAM_EXPAND_GAPS);
8564 }
8565
8566 /* original convenience function now calls more advanced version that can get proteins */
8567
GetDNAbyAccessionDotVersion(CharPtr accession)8568 NLM_EXTERN CharPtr GetDNAbyAccessionDotVersion (CharPtr accession)
8569
8570 {
8571 return GetSequenceByIdOrAccnDotVer (NULL, accession, TRUE);
8572 }
8573
8574
/*
 *  FixGapLength
 *    Grows or shrinks the Bioseq at 'offset' by 'diff' positions and
 *    applies the same change to the alignment annotations returned by
 *    FindAlignSeqAnnotsForBioseq.
 *      diff > 0 : inserts 'diff' N characters at 'offset'
 *      diff < 0 : deletes '-diff' positions starting at 'offset'
 *    Does nothing when bsp or bsp->id is NULL or diff is 0.
 *
 *    NOTE(review): neither the N filler string nor the list returned by
 *    FindAlignSeqAnnotsForBioseq is freed here.  Whether insertchar or the
 *    finder retain ownership is not visible from this function - confirm.
 */
static void FixGapLength (BioseqPtr bsp, Int4 offset, Int4 diff)
{
  ValNodePtr   annots, node;
  SeqAnnotPtr  annot;
  SeqLocPtr    span;
  CharPtr      filler;
  Boolean      growing;

  if (bsp == NULL || bsp->id == NULL || diff == 0) return;

  annots = FindAlignSeqAnnotsForBioseq (bsp);
  growing = (Boolean) (diff > 0);

  if (growing) {
    /* build a string of 'diff' Ns and splice it into the sequence */
    filler = (CharPtr) MemNew ((diff + 1) * sizeof (Char));
    if (filler != NULL) {
      MemSet (filler, 'N', diff);
      filler [diff] = 0;
      insertchar (filler, offset, bsp->id, bsp->mol, FALSE);
    }
    span = SeqLocIntNew (offset, offset + diff - 1, Seq_strand_plus, bsp->id);
  } else {
    /* diff is negative here, so the interval covers -diff positions */
    span = SeqLocIntNew (offset, offset - diff - 1, Seq_strand_plus, bsp->id);
    SeqDeleteByLoc (span, TRUE, FALSE);
  }

  /* mirror the edit in every annotation of type 2
     (presumably the alignment annotation type - confirm) */
  for (node = annots; node != NULL; node = node->next) {
    annot = node->data.ptrvalue;
    if (annot == NULL || annot->type != 2) continue;
    if (growing) {
      annot->data = SeqAlignInsertByLoc (span, annot->data);
    } else {
      annot->data = SeqAlignDeleteByLoc (span, annot->data);
    }
  }

  SeqLocFree (span);
}
8623
AddSeqLitData(CharPtr str,ValNodePtr PNTR seq_ext)8624 static Int4 AddSeqLitData (CharPtr str, ValNodePtr PNTR seq_ext)
8625 {
8626 Int4 len;
8627 SeqLitPtr slp;
8628
8629 if (StringHasNoText (str)) {
8630 return 0;
8631 }
8632 len = StringLen (str);
8633 slp = (SeqLitPtr) MemNew (sizeof (SeqLit));
8634 if (slp != NULL) {
8635 slp->length = len;
8636 ValNodeAddPointer (seq_ext, (Int2) 2, (Pointer) slp);
8637 slp->seq_data = (SeqDataPtr) BSNew (slp->length);
8638 slp->seq_data_type = Seq_code_iupacna;
8639 AddBasesToByteStore ((ByteStorePtr) slp->seq_data, str);
8640 }
8641 return len;
8642 }
8643
/*
 *  IsGapUnknown
 *    Decides whether a run of gap_len Ns should be recorded as a gap of
 *    unknown length (TRUE) or of known length (FALSE).
 *      - a zero-length run is never unknown
 *      - an exact match with unknown_gap_size is unknown
 *      - an exact match with known_gap_size is known
 *      - a run longer than unknown_gap_size, when
 *        unknown_greater_than_or_equal is set, is unknown unless the
 *        "known, or longer" rule also applies and takes precedence:
 *        that happens when known_greater_than_or_equal is set,
 *        unknown_gap_size <= known_gap_size and gap_len >= known_gap_size
 *      - everything else is known
 */
static Boolean IsGapUnknown (Int4 gap_len,
                             Int4 unknown_gap_size,
                             Int4 known_gap_size,
                             Boolean unknown_greater_than_or_equal,
                             Boolean known_greater_than_or_equal)
{
  if (gap_len == 0) return FALSE;
  if (gap_len == unknown_gap_size) return TRUE;
  if (gap_len == known_gap_size) return FALSE;

  /* the open-ended "unknown" rule does not apply */
  if (gap_len <= unknown_gap_size || ! unknown_greater_than_or_equal) {
    return FALSE;
  }

  if (! known_greater_than_or_equal) return TRUE;
  if (unknown_gap_size > known_gap_size) return TRUE;
  if (gap_len < known_gap_size) return TRUE;

  return FALSE;
}
8681
8682
/*
 *  AddGap
 *    Appends a gap Seq-literal (length only, no sequence data) to the
 *    *seq_ext list and returns the length recorded for it.
 *
 *    gap_len            number of Ns in the run being replaced
 *    make_unknown_size  when TRUE the literal is given an IntFuzz with
 *                       choice 4 and its length is normalized to 100;
 *                       FixGapLength is called so the Bioseq and its
 *                       alignments grow or shrink to match
 *    bsp                Bioseq being converted (passed to FixGapLength)
 *    len                offset of the gap within bsp
 *    seq_ext            list receiving the new literal
 *
 *    Returns 0 when gap_len is not positive or the SeqLit cannot be
 *    allocated.  If the IntFuzz cannot be allocated the gap is left as a
 *    known-length gap of gap_len instead of dereferencing a NULL pointer.
 */
static Int4 AddGap(Int4 gap_len,
                   Boolean make_unknown_size,
                   BioseqPtr bsp,
                   Int4 len,
                   ValNodePtr PNTR seq_ext)
{
  const Int4  unknown_gap_length = 100;  /* length recorded for unknown-size gaps */
  Int4        added_len = 0;
  SeqLitPtr   slp;
  IntFuzzPtr  ifp;

  if (gap_len > 0) {
    slp = (SeqLitPtr) MemNew (sizeof (SeqLit));
    if (slp != NULL) {
      slp->length = gap_len;
      ValNodeAddPointer (seq_ext, (Int2) 2, (Pointer) slp);
      if (make_unknown_size) {
        ifp = IntFuzzNew ();
        if (ifp != NULL) {
          ifp->choice = 4;
          slp->fuzz = ifp;
          if (slp->length != unknown_gap_length) {
            /* resize the underlying sequence so coordinates stay in step */
            FixGapLength (bsp, len, unknown_gap_length - slp->length);
            slp->length = unknown_gap_length;
          }
        }
      }
      added_len += slp->length;
    }
  }
  return added_len;
}
8712
8713
/*
 *  NeedToConvert
 *    Scans 'bases' for at least one run of 'N' characters that qualifies
 *    for conversion to a gap.  A run qualifies when its length equals
 *    unknown_gap_size or known_gap_size, or exceeds one of them while the
 *    matching *_greater_than_or_equal flag is set.
 *    Returns FALSE for a string with no text or with no qualifying run.
 */
static Boolean NeedToConvert
(CharPtr bases,
 Boolean unknown_greater_than_or_equal,
 Boolean known_greater_than_or_equal,
 Int4    unknown_gap_size,
 Int4    known_gap_size)
{
  CharPtr  scan;
  Int4     run;

  if (StringHasNoText(bases)) return FALSE;

  for (scan = bases; *scan != '\0'; scan += run) {
    run = StringSpn (scan, "N");
    if (run <= 0) {
      /* not at an N: skip ahead to the next N (or the end of the string) */
      run = StringCSpn (scan, "N");
      continue;
    }
    if (run == unknown_gap_size) return TRUE;
    if (run > unknown_gap_size && unknown_greater_than_or_equal) return TRUE;
    if (run == known_gap_size) return TRUE;
    if (run > known_gap_size && known_greater_than_or_equal) return TRUE;
    /* run of Ns too short to convert: step over it */
  }
  return FALSE;
}
8747
8748 /*****************************************************************************
8749 *
8750 * ConvertNsToGaps
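*   Converts runs of Ns in a raw nucleotide Bioseq into delta-sequence gaps.
*   With NULL userdata every run of Ns becomes a gap of known length.
*   Otherwise userdata points to two Int4 values: [0] is the run length
*   treated as a gap of unknown length, [1] is the run length treated as a
*   gap of known length; a negative value means "that length or longer".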
8752 *
8753 *****************************************************************************/
8754
/*
 *  bsp       Bioseq to convert; ignored unless it is a raw, non-protein
 *            sequence
 *  userdata  NULL, or a pointer to two Int4 gap sizes (see below)
 *
 *  The sequence is fetched as text, split at each qualifying run of Ns
 *  into alternating literal and gap segments, and the Bioseq is then
 *  switched from raw to delta representation using those segments.
 */
NLM_EXTERN void ConvertNsToGaps (
  BioseqPtr bsp,
  Pointer userdata
)

{
  CharPtr     bases, str, txt;
  Char        ch;
  Int4        len;
  ValNodePtr  seq_ext;
  Boolean     unknown_greater_than_or_equal = FALSE;
  Boolean     known_greater_than_or_equal = FALSE;
  Int4Ptr     gap_sizes;
  Int4        unknown_gap_size = 0;
  Int4        known_gap_size = 0;
  Int4        gap_len;
  Boolean     make_unknown_size;

  if (bsp == NULL || bsp->repr != Seq_repr_raw || ISA_aa (bsp->mol)) return;
  if (userdata == NULL)
  {
    /* no sizes supplied: known size 0 "or greater", so every run of Ns
       qualifies and is treated as a gap of known length */
    known_greater_than_or_equal = TRUE;
  }
  else
  {
    /* userdata is read as two Int4 values: [0] unknown size, [1] known size */
    gap_sizes = (Int4Ptr) userdata;
    unknown_gap_size = gap_sizes[0];
    known_gap_size = gap_sizes[1];
    /* a negative size means "this size or greater"; keep the magnitude */
    if (unknown_gap_size < 0)
    {
      unknown_greater_than_or_equal = TRUE;
      unknown_gap_size = 0 - unknown_gap_size;
    }
    if (known_gap_size < 0)
    {
      known_greater_than_or_equal = TRUE;
      known_gap_size = 0 - known_gap_size;
    }
  }

  bases = GetSequenceByBsp (bsp);
  if (bases == NULL) return;

  /* leave the Bioseq untouched when no run of Ns qualifies */
  if (!NeedToConvert(bases, unknown_greater_than_or_equal, known_greater_than_or_equal, unknown_gap_size, known_gap_size)) {
    MemFree (bases);
    return;
  }

  seq_ext = NULL;
  len = 0;

  /* txt marks the start of the pending literal, str is the scan position */
  txt = bases;
  str = txt;
  ch = *txt;   /* value is overwritten before it is read */

  while (*str != '\0') {

    gap_len = StringSpn (str, "N");
    if (gap_len > 0 ) {
      if ((gap_len == unknown_gap_size
            || (gap_len > unknown_gap_size && unknown_greater_than_or_equal)
            || gap_len == known_gap_size
            || (gap_len > known_gap_size && known_greater_than_or_equal))) {
        /* add any prior sequence data as literal */
        /* temporarily terminate the buffer at the gap so txt is a C string */
        ch = *str;
        *str = '\0';
        len += AddSeqLitData (txt, &(seq_ext));
        *str = ch;
        /* add a gap */
        make_unknown_size = IsGapUnknown (gap_len,
                                          unknown_gap_size,
                                          known_gap_size,
                                          unknown_greater_than_or_equal,
                                          known_greater_than_or_equal);

        /* len is passed as the gap's offset; AddGap may resize the Bioseq
           through FixGapLength when the gap is of unknown size */
        len += AddGap(gap_len,
                      make_unknown_size,
                      bsp,
                      len,
                      &(seq_ext));
        /* next literal starts right after this run of Ns */
        txt = str + gap_len;
      }
      /* a non-qualifying run of Ns stays part of the pending literal */
      str += gap_len;
    } else {
      /* skip ahead to the next N or the end of the string */
      gap_len = StringCSpn (str, "N");
      str += gap_len;
    }
  }
  /* at end, add last sequence data literal */
  len += AddSeqLitData (txt, &(seq_ext));

  MemFree (bases);

  /* replace the raw sequence data with the delta segment list built above */
  bsp->seq_data = SeqDataFree (bsp->seq_data, bsp->seq_data_type);
  bsp->seq_data_type = 0;
  bsp->repr = Seq_repr_delta;
  bsp->seq_ext_type = 4;   /* presumably the delta seq-ext type - confirm */
  bsp->seq_ext = seq_ext;
  bsp->length = len;

  BioseqPack (bsp);
}
8857
8858
8859 /* Protein Molecular Weight Section */
8860
8861 /* Values are A through Z order:
8862 B is really D or N, but they are close so is treated as D
8863 Z is really E or Q, but they are close so is treated as E
8864 X is hard to guess, so the calculation fails on X
8865 - and * are skipped
8866 water molecule is removed for in-peptide atom counts
8867
8868 A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
8869 */
/* Per-residue atom counts, indexed by (letter - 'A').
   The molecular weight routines treat a zero H_atoms entry (X) as an
   unsupported residue. */

/* carbon */
Uint1 C_atoms[26] =
{ 3, 4, 3, 4, 5, 9, 2, 6, 6, 6, 6, 6, 5, 4, 12, 5, 5, 6, 3, 4, 3, 5, 11, 0, 9, 5};
/* hydrogen */
Uint1 H_atoms[26] =
{ 5, 5, 5, 5, 7, 9, 3, 7, 11, 11, 12, 11, 9, 6, 19, 7, 8, 12, 5, 7, 5, 9, 10, 0, 9, 7};
/* nitrogen */
Uint1 N_atoms[26] =
{ 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 2, 1, 1, 2, 3, 1, 2, 4, 1, 1, 1, 1, 2, 0, 1, 1};
/* oxygen */
Uint1 O_atoms[26] =
{ 1, 3, 1, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 2, 1, 2, 2, 1, 1, 1, 0, 2, 3};
/* sulfur (C and M only) */
Uint1 S_atoms[26] =
{ 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
/* selenium (U only) */
Uint1 Se_atoms[26] =
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0};
8882
8883 /**************************************************************
8884 *
8885 * Returns a protein molecular weight for a SeqLoc
8886 * If it cannot calculate the value it returns -1.0
*   It fails on X and on any character other than a letter, '-' or '*'
*   B is counted as D and Z as E; '-' and '*' are skipped
8888 *
8889 ***************************************************************/
MolWtForLoc(SeqLocPtr slp)8890 NLM_EXTERN FloatHi MolWtForLoc (SeqLocPtr slp)
8891 {
8892 StreamCache sc;
8893 Int2 res;
8894 int residue;
8895 Int4 Ccnt,
8896 Hcnt,
8897 Ncnt,
8898 Ocnt,
8899 Scnt,
8900 Secnt;
8901 FloatHi retval = -1.0;
8902
8903 if (slp == NULL) return retval;
8904 StreamCacheSetup (NULL, slp, 0, &sc);
8905
8906 Ccnt = 0; /* initialize counters */
8907 Hcnt = 2; /* always start with water */
8908 Ocnt = 1; /* H20 */
8909 Ncnt = 0;
8910 Scnt = 0;
8911 Secnt = 0;
8912
8913 while ((res = StreamCacheGetResidue (&sc)) != '\0')
8914 {
8915 if (IS_LOWER (res)) {
8916 res = TO_UPPER (res);
8917 }
8918 if (IS_UPPER (res)) {
8919 residue = res - 'A';
8920 if (H_atoms[residue] == 0) { /* unsupported AA */
8921 return retval; /* bail out */
8922 }
8923 Ccnt += C_atoms[residue];
8924 Hcnt += H_atoms[residue];
8925 Ncnt += N_atoms[residue];
8926 Ocnt += O_atoms[residue];
8927 Scnt += S_atoms[residue];
8928 Secnt += Se_atoms[residue];
8929 } else if (res != '-' && res != '*') {
8930 return retval; /* bail out */
8931 }
8932 }
8933
8934 retval = (12.01115 * Ccnt) + (1.0079 * Hcnt) +
8935 (14.0067 * Ncnt) + (15.9994 * Ocnt) +
8936 (32.064 * Scnt) + (78.96 * Secnt);
8937
8938 return retval;
8939 }
8940
MolWtForBsp(BioseqPtr bsp)8941 NLM_EXTERN FloatHi MolWtForBsp (BioseqPtr bsp)
8942 {
8943 StreamCache sc;
8944 Int2 res;
8945 int residue;
8946 Int4 Ccnt,
8947 Hcnt,
8948 Ncnt,
8949 Ocnt,
8950 Scnt,
8951 Secnt;
8952 FloatHi retval = -1.0;
8953
8954 if (bsp == NULL) return retval;
8955 if (! ISA_aa (bsp->mol)) return retval;
8956 StreamCacheSetup (bsp, NULL, 0, &sc);
8957
8958 Ccnt = 0; /* initialize counters */
8959 Hcnt = 2; /* always start with water */
8960 Ocnt = 1; /* H20 */
8961 Ncnt = 0;
8962 Scnt = 0;
8963 Secnt = 0;
8964
8965 while ((res = StreamCacheGetResidue (&sc)) != '\0')
8966 {
8967 if (IS_LOWER (res)) {
8968 res = TO_UPPER (res);
8969 }
8970 if (IS_UPPER (res)) {
8971 residue = res - 'A';
8972 if (H_atoms[residue] == 0) { /* unsupported AA */
8973 return retval; /* bail out */
8974 }
8975 Ccnt += C_atoms[residue];
8976 Hcnt += H_atoms[residue];
8977 Ncnt += N_atoms[residue];
8978 Ocnt += O_atoms[residue];
8979 Scnt += S_atoms[residue];
8980 Secnt += Se_atoms[residue];
8981 } else if (res != '-' && res != '*') {
8982 return retval; /* bail out */
8983 }
8984 }
8985
8986 retval = (12.01115 * Ccnt) + (1.0079 * Hcnt) +
8987 (14.0067 * Ncnt) + (15.9994 * Ocnt) +
8988 (32.064 * Scnt) + (78.96 * Secnt);
8989
8990 return retval;
8991 }
8992
MolWtForStr(CharPtr str)8993 NLM_EXTERN FloatHi MolWtForStr (CharPtr str)
8994 {
8995 Char res;
8996 int residue;
8997 Int4 Ccnt,
8998 Hcnt,
8999 Ncnt,
9000 Ocnt,
9001 Scnt,
9002 Secnt;
9003 FloatHi retval = -1.0;
9004
9005 if (str == NULL) return retval;
9006
9007 Ccnt = 0; /* initialize counters */
9008 Hcnt = 2; /* always start with water */
9009 Ocnt = 1; /* H20 */
9010 Ncnt = 0;
9011 Scnt = 0;
9012 Secnt = 0;
9013
9014 res = *str;
9015 while (res != '\0')
9016 {
9017 if (IS_LOWER (res)) {
9018 res = TO_UPPER (res);
9019 }
9020 if (IS_UPPER (res)) {
9021 residue = res - 'A';
9022 if (H_atoms[residue] == 0) { /* unsupported AA */
9023 return retval; /* bail out */
9024 }
9025 Ccnt += C_atoms[residue];
9026 Hcnt += H_atoms[residue];
9027 Ncnt += N_atoms[residue];
9028 Ocnt += O_atoms[residue];
9029 Scnt += S_atoms[residue];
9030 Secnt += Se_atoms[residue];
9031 } else if (res != '-' && res != '*') {
9032 return retval; /* bail out */
9033 }
9034 str++;
9035 res = *str;
9036 }
9037
9038 retval = (12.01115 * Ccnt) + (1.0079 * Hcnt) +
9039 (14.0067 * Ncnt) + (15.9994 * Ocnt) +
9040 (32.064 * Scnt) + (78.96 * Secnt);
9041
9042 return retval;
9043 }
9044
9045