1 /* sequtil.c
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * File Name: sequtil.c
27 *
28 * Author: James Ostell
29 *
30 * Version Creation Date: 4/1/91
31 *
32 * $Revision: 6.410 $
33 *
34 * File Description: Sequence Utilities for objseq and objsset
35 *
36 * Modifications:
37 * --------------------------------------------------------------------------
38 * Date Name Description of modification
39 * ------- ---------- -----------------------------------------------------
40 *
41 * ==========================================================================
42 */
43
/* Module and file identifiers referenced through THIS_MODULE / THIS_FILE
   (set up for ErrPostEx()). */

static char *this_module = "ncbiapi";
static char *this_file = __FILE__;

#define THIS_MODULE this_module
#define THIS_FILE this_file

/**********************/
52
53 #include <sequtil.h>
54 #include <gather.h>
55 #include <seqport.h>
56 #include <sqnutils.h> /* prototype for SeqIdFindWorst */
57 #include <edutil.h>
58 #include <subutil.h>
59
60 /**** Static variables used for randomized sequence conversions ****/
61
62 /* This array contains final residues for ncbi2na encoding.
63 Na42[4] - number of possible choises for ambiguous residues
64 and these residues plased in Na42[0-3] */
65
66 static Int1 Na42[16][5] = {
67 { 0, 1, 2, 3, 4} , { 0, 0, 0, 0, 1 }, { 1, 1, 1, 1, 1} , { 0, 1, 0, 1, 2},
68 { 2, 2, 2, 2, 1} , { 0, 2, 0, 2, 2 }, { 1, 2, 1, 2, 2} , { 0, 1, 2, 2, 3},
69 { 3, 3, 3, 3, 1} , { 0, 3, 0, 3, 2 }, { 1, 3, 1, 3, 2} , { 0, 1, 3, 3, 3},
70 { 2, 3, 2, 3, 2} , { 0, 2, 3, 3, 3 }, { 1, 2, 3, 3, 3} , { 0, 1, 2, 3, 4}
71 };
72
73 /* This array contains check values if we can do direct conversion */
74
75 static Int1 Na42Set[16] = { -1, 0, 1, -1, 2, -1, -1, -1,
76 3, -1, -1, -1, -1, -1, -1, -1 };
77
78 /* Analog arrays for ASCII --> ncbi2na conversion
79 NOTE: dimensions for NaI2 are reversed to allocate it
80 dynamically */
81
82 static Int1 NaI2Set[256];
83 static Int1Ptr NaI2[5];
84
85 static Boolean NaI2InitOk = FALSE; /* We will allocate it only ones */
86
87 /* Macros for random conversion */
88
89 #define CONVERT_42_RAND(from) Na42[from][(Nlm_RandomNum()>>8)%Na42[from][4]]
90 #define CONVERT_I2_RAND(from) NaI2[(Nlm_RandomNum()>>8)%NaI2[4][from]][from]
91
92 static Boolean InitNaI2Table(void);
93
/**********************************************************************/

/* Defines for compression/rebuild DNA.

   An ambiguity element is a packed 32-bit word.  The field extractors
   below are fully parenthesized (argument and whole expansion) so they
   stay correct next to casts and other operators: with the former
   unparenthesized form, "(Uint1)RES_VALUE(x)" expanded to "(Uint1)x>>28"
   (cast applied before the shift, always yielding 0) and
   "RES_LEN(x) == n" parsed as "(x>>24) & (0xF == n)". */

#define BSC_BUFF_CHUNK 1024
#define RES_OFFSET(x)  ((x) & 0xFFFFFF)         /* low 24 bits             */
#define RES_VALUE(x)   ((x) >> 28)              /* top 4 bits              */
#define RES_LEN(x)     (((x) >> 24) & 0xF)      /* bits 24..27             */
#define RES_LEN_NEW(x) (((x) >> 16) & 0xFFF)    /* bits 16..27, new format */
#define LEN_STEP_MASK 0x1000000
#define LEN_STEP_MASK_NEW 0x10000
105
106 static NumberingPtr stdnum = NULL; /* std Numbering object (start at 1) */
107
108 /* find the last nucleotide bioseq in the bioseqset */
109 /* Used by SeqEntryExplore. */
FindNuc(SeqEntryPtr sep,Pointer data,Int4 index,Int2 indent)110 NLM_EXTERN void FindNuc(SeqEntryPtr sep, Pointer data, Int4 index, Int2 indent)
111 {
112 BioseqPtr PNTR bp;
113 BioseqPtr local_bsp;
114
115 bp = (BioseqPtr PNTR) data;
116 if (IS_Bioseq(sep))
117 {
118 local_bsp = (BioseqPtr) sep->data.ptrvalue;
119 if (ISA_na(local_bsp->mol))
120 *bp = local_bsp;
121 }
122 }
123
124 /* find the last protein bioseq in the bioseqset */
125 /* Used by SeqEntryExplore. */
FindProt(SeqEntryPtr sep,Pointer data,Int4 index,Int2 indent)126 NLM_EXTERN void FindProt(SeqEntryPtr sep, Pointer data, Int4 index, Int2 indent)
127 {
128 BioseqPtr PNTR bp;
129 BioseqPtr local_bsp;
130
131 bp = (BioseqPtr PNTR) data;
132 if (IS_Bioseq(sep))
133 {
134 local_bsp = (BioseqPtr) sep->data.ptrvalue;
135 if (ISA_aa(local_bsp->mol))
136 *bp = local_bsp;
137 }
138 }
139
140 /*****************************************************************************
141 *
142 * Boolean BioseqMatch(bsp, seqid)
143 * returns TRUE if bsp points to the Bioseq identified by seqid
144 *
145 *****************************************************************************/
BioseqMatch(BioseqPtr bsp,SeqIdPtr seqid)146 NLM_EXTERN Boolean BioseqMatch (BioseqPtr bsp, SeqIdPtr seqid)
147 {
148 if (bsp == NULL) return FALSE;
149 return SeqIdIn(seqid, bsp->id);
150 }
151
152
153 typedef struct findse {
154 SeqIdPtr sip;
155 Boolean found;
156 BioseqPtr bsp;
157 Int4 indent;
158 } fse, PNTR fseptr;
159
160 typedef struct {
161 SeqLocPtr slp;
162 Boolean findOnProtein;
163 } SpliceInfo, *SpliceInfoPtr;
164
165 typedef struct {
166 SeqIdPtr sip;
167 Boolean isProtein;
168 Boolean retval;
169 } SeqIdChecker, *SeqIdCheckerPtr;
170
171 typedef struct {
172 SeqIdPtr sip;
173 Int2 mtype;
174 } SeqIdMolType, PNTR SeqIdMolTypePtr;
175
176 /*****************************************************************************
177 *
178 * FindSE()
179 * SeqEntryExplore function used by SeqEntryFind()
180 *
181 *****************************************************************************/
182 NLM_EXTERN void FindSE (SeqEntryPtr sep, Pointer data, Int4 index, Int2 indent);
FindSE(SeqEntryPtr sep,Pointer data,Int4 index,Int2 indent)183 NLM_EXTERN void FindSE (SeqEntryPtr sep, Pointer data, Int4 index, Int2 indent)
184 {
185 fseptr fep;
186 BioseqPtr bsp;
187
188 fep = (fseptr)data;
189 if (fep->found) /* already found it */
190 return;
191
192 if (! IS_Bioseq(sep))
193 return;
194
195 bsp = (BioseqPtr)(sep->data.ptrvalue);
196 if (BioseqMatch(bsp, fep->sip))
197 {
198 fep->found = TRUE;
199 fep->bsp = bsp;
200 fep->indent = indent;
201 }
202
203 return;
204 }
205
206 /*****************************************************************************
207 *
208 * BioseqFindInSeqEntry(sip, sep)
209 * Finds a Bioseq within a SeqEntry by SeqId
210 *
211 *****************************************************************************/
BioseqFindInSeqEntry(SeqIdPtr sip,SeqEntryPtr sep)212 NLM_EXTERN BioseqPtr BioseqFindInSeqEntry(SeqIdPtr sip, SeqEntryPtr sep)
213 {
214 BioseqPtr bsp = NULL;
215 fse fe;
216
217 if (sip == NULL) return bsp;
218 if (sep == NULL) return bsp;
219
220 fe.found = FALSE;
221 fe.sip = sip;
222 fe.bsp = NULL;
223
224 SeqEntryExplore(sep, (Pointer)(&fe), FindSE);
225 if (fe.found)
226 return fe.bsp;
227 else
228 return bsp;
229 }
230
231 /*****************************************************************************
232 *
233 * BioseqGetSeqDescr(bsp, type, curr)
234 * returns pointer to the next SeqDescr of this type
235 * type gives type of Seq-descr
236 * if 0, gets them all
237 * curr is NULL or previous node of this type found
238 *
239 *****************************************************************************/
BioseqGetSeqDescr(BioseqPtr bsp,Int2 type,ValNodePtr curr)240 NLM_EXTERN ValNodePtr BioseqGetSeqDescr (BioseqPtr bsp, Int2 type, ValNodePtr curr) /* the last one you used */
241
242 {
243 if (bsp == NULL) return NULL;
244
245 if (curr == NULL)
246 curr = bsp->descr;
247 else
248 curr = curr->next; /* move past last one */
249
250 while (curr != NULL)
251 {
252 if ((! type) || ((Int2)curr->choice == type))
253 return curr;
254 else
255 curr = curr->next;
256 }
257 return NULL;
258 }
259
260 /*****************************************************************************
261 *
262 * BioseqGetTitle(bsp)
263 * returns pointer to the first title of this Bioseq
264 *
265 *****************************************************************************/
BioseqGetTitle(BioseqPtr bsp)266 NLM_EXTERN CharPtr BioseqGetTitle (BioseqPtr bsp)
267
268 {
269 ValNodePtr ptr;
270
271 ptr = BioseqGetSeqDescr(bsp, Seq_descr_title, NULL);
272 if (ptr != NULL)
273 return (CharPtr)ptr->data.ptrvalue;
274 else
275 return NULL;
276 }
277
278 /*****************************************************************************
279 *
280 * BioseqGetNumbering(bsp)
281 * Gets either user supplied, or default number for a Bioseq
282 * looks first for num Seqdescr, then in Pubdesc, then returns
283 * default numbering
284 *
285 *****************************************************************************/
BioseqGetNumbering(BioseqPtr bsp)286 NLM_EXTERN NumberingPtr BioseqGetNumbering (BioseqPtr bsp)
287
288 {
289 NumberingPtr np = NULL;
290 ValNodePtr anp;
291 PubdescPtr pdp;
292
293 if (bsp == NULL)
294 return NULL;
295
296 anp = BioseqGetSeqDescr(bsp, Seq_descr_num, NULL);
297 if (anp != NULL) /* Numbering on this Bioseq */
298 np = (NumberingPtr)anp->data.ptrvalue;
299 else do /* look for Pubdesc */
300 {
301 anp = BioseqGetSeqDescr(bsp, Seq_descr_pub, anp);
302 if (anp != NULL)
303 {
304 pdp = (PubdescPtr)anp->data.ptrvalue;
305 np = pdp->num;
306 }
307 } while ((anp != NULL) && (np == NULL));
308
309 if (np == NULL) /* no numbering found */
310 np = NumberingDefaultGet(); /* fallback position */
311
312 return np;
313 }
314
315
316 /*****************************************************************************
317 *
318 * Bioseq_repr (BioseqPtr bsp)
319 *
320 *****************************************************************************/
Bioseq_repr(BioseqPtr bsp)321 NLM_EXTERN Uint1 Bioseq_repr (BioseqPtr bsp)
322
323 {
324 return bsp->repr;
325 }
326
327 /*****************************************************************************
328 *
329 * Int4 BioseqGetLen (bsp)
330 * returns total length of sequence in residues
331 * if segmented:
332 * includes length of virtual sequences with fixed length
333 * does not include lengths of NULL gaps
334 * returns -1 for error
335 *
336 *****************************************************************************/
BioseqGetLen(BioseqPtr bsp)337 NLM_EXTERN Int4 BioseqGetLen (BioseqPtr bsp)
338
339 {
340 if (bsp == NULL)
341 return -1;
342
343 return bsp->length;
344 }
345
346 /*****************************************************************************
347 *
348 * Int4 BioseqGetGaps (bsp)
349 * returns total number of NULL gaps in sequence
350 * virtual sequence with length set does not count as a gap
351 * returns -1 for error
352 *
353 *****************************************************************************/
BioseqGetGaps(BioseqPtr bsp)354 NLM_EXTERN Int4 BioseqGetGaps (BioseqPtr bsp)
355
356 {
357 ValNodePtr anp;
358 Int4 gaps = 0;
359 Uint1 repr;
360
361 if (bsp == NULL)
362 return -1;
363
364 repr = Bioseq_repr(bsp);
365
366 switch (repr)
367 {
368 case Seq_repr_seg:
369 case Seq_repr_ref:
370 anp = (ValNodePtr)bsp->seq_ext;
371 while (anp != NULL) /* go through Seq-loc chain */
372 {
373 gaps = SeqLocGetSegLens((SeqLocPtr)anp, NULL, gaps, TRUE);
374 anp = anp->next;
375 }
376 break;
377 case Seq_repr_delta:
378 anp = (ValNodePtr)bsp->seq_ext;
379 while (anp != NULL) /* go through delta seq chain */
380 {
381 if (anp->choice == 1)
382 gaps = SeqLocGetSegLens((SeqLocPtr)(anp->data.ptrvalue), NULL, gaps, TRUE);
383 anp = anp->next;
384 }
385 break;
386 default:
387 break;
388 }
389
390 return gaps;
391 }
392
393 /*****************************************************************************
394 *
395 * Int4 BioseqGetSegLens (bsp, lens)
396 * returns total number of segments in sequence including NULLS
397 * returns -1 for error
398 * if lens != NULL fills with lengths of segments, 0 = NULL
399 *
400 *****************************************************************************/
BioseqGetSegLens(BioseqPtr bsp,Int4Ptr lens)401 NLM_EXTERN Int4 BioseqGetSegLens (BioseqPtr bsp, Int4Ptr lens)
402
403 {
404 ValNodePtr anp;
405 Int4 segs = 0;
406 Uint1 repr;
407 SeqLitPtr slitp;
408
409 if (bsp == NULL)
410 return -1;
411
412 repr = Bioseq_repr(bsp);
413
414 switch (repr)
415 {
416 case Seq_repr_seg:
417 case Seq_repr_ref:
418 anp = (ValNodePtr)bsp->seq_ext;
419 while (anp != NULL) /* go through Seq-loc chain */
420 {
421 segs = SeqLocGetSegLens((SeqLocPtr)anp, lens, segs, FALSE);
422 anp = anp->next;
423 }
424 break;
425 case Seq_repr_delta:
426 anp = (ValNodePtr)bsp->seq_ext;
427 while (anp != NULL) /* go through delta seq chain */
428 {
429 if (anp->choice == 1)
430 segs = SeqLocGetSegLens((SeqLocPtr)(anp->data.ptrvalue), lens, segs, FALSE);
431 else
432 {
433 slitp = (SeqLitPtr)(anp->data.ptrvalue);
434 if (lens != NULL)
435 lens[segs] = slitp->length;
436 segs++;
437 }
438 anp = anp->next;
439 }
440 break;
441 default:
442 if (lens != NULL)
443 lens[0] = BioseqGetLen(bsp);
444 segs = 1;
445 break;
446 }
447 return segs;
448 }
449
450 /*****************************************************************************
451 *
452 * BioseqGetCode(bsp)
453 * returns type of code for data in sequence
454 * if not bioseq or not raw returns 0
455 * otherwise returns #defines from objseq.h
456 *
457 *****************************************************************************/
BioseqGetCode(BioseqPtr bsp)458 NLM_EXTERN Uint1 BioseqGetCode (BioseqPtr bsp)
459
460 {
461 if (bsp == NULL)
462 return 0;
463
464 if ((Bioseq_repr(bsp) == Seq_repr_raw) ||
465 (Bioseq_repr(bsp) == Seq_repr_const))
466 return bsp->seq_data_type;
467 else
468 return 0;
469 }
470
471 /*****************************************************************************
472 *
473 * Boolean BioseqConvert(bsp, newcode)
474 * converts a raw or const bioseq or delta to a new sequence code
475 *
476 *****************************************************************************/
BioseqConvert(BioseqPtr bsp,Uint1 newcode)477 NLM_EXTERN Boolean BioseqConvert (BioseqPtr bsp, Uint1 newcode)
478
479 {
480 ByteStorePtr to;
481 ValNodePtr vnp;
482 SeqLitPtr slp;
483
484 if (bsp == NULL) return FALSE;
485
486 if ((Bioseq_repr(bsp) == Seq_repr_raw) ||
487 (Bioseq_repr(bsp) == Seq_repr_const))
488 return BioseqRawConvert(bsp, newcode);
489
490 if (Bioseq_repr(bsp) != Seq_repr_delta)
491 return FALSE;
492
493 /* go through the delta chain */
494 for (vnp = (ValNodePtr)(bsp->seq_ext); vnp != NULL; vnp = vnp->next)
495 {
496 if (vnp->choice == 2) /* SeqLit */
497 {
498 slp = (SeqLitPtr)(vnp->data.ptrvalue);
499 if (slp->length > 0 && slp->seq_data != NULL
500 && slp->seq_data_type != Seq_code_gap)
501 {
502 to = BSConvertSeq((ByteStorePtr) slp->seq_data, newcode, slp->seq_data_type, slp->length);
503 if (to != NULL)
504 {
505 slp->seq_data = (SeqDataPtr) to;
506 slp->seq_data_type = newcode;
507 }
508 }
509 }
510 }
511
512 return TRUE;
513 }
514
515 /*****************************************************************************
516 *
517 * Boolean BioseqRawPack(bsp)
518 * converts a raw or const bioseq to it's densist possible code
519 *
520 *****************************************************************************/
BioseqRawPack(BioseqPtr bsp)521 NLM_EXTERN Boolean BioseqRawPack (BioseqPtr bsp)
522
523 {
524 ByteStorePtr to;
525 Uint1 newcode;
526
527 if (bsp == NULL) return FALSE;
528
529 if (! ((Bioseq_repr(bsp) == Seq_repr_raw) ||
530 (Bioseq_repr(bsp) == Seq_repr_const)))
531 return FALSE;
532
533 if(! ISA_na(bsp->mol)) { /* protein ? */
534 if(!BioseqRawConvert (bsp, Seq_code_ncbieaa)) {
535 return FALSE;
536 }
537 } else if (bsp->seq_data_type != Seq_code_gap) {
538 if((to = BSPack((ByteStorePtr) bsp->seq_data,
539 BioseqGetCode(bsp),
540 BioseqGetLen(bsp),
541 &newcode)) == NULL) {
542 return FALSE;
543 }
544 bsp->seq_data = (SeqDataPtr) to;
545 bsp->seq_data_type = newcode;
546 }
547 return TRUE;
548 }
549
550 /*****************************************************************************
551 *
552 * Boolean BioseqRawConvert(bsp, newcode)
553 * converts a raw or const bioseq to a new sequence code
554 *
555 *****************************************************************************/
BioseqRawConvert(BioseqPtr bsp,Uint1 newcode)556 NLM_EXTERN Boolean BioseqRawConvert (BioseqPtr bsp, Uint1 newcode)
557
558 {
559 ByteStorePtr to;
560 Int4 seqlen;
561 Uint1 oldcode;
562
563 if (bsp == NULL) return FALSE;
564
565 if (! ((Bioseq_repr(bsp) == Seq_repr_raw) ||
566 (Bioseq_repr(bsp) == Seq_repr_const)))
567 return FALSE;
568
569 oldcode = BioseqGetCode(bsp);
570 if (! oldcode) /* not a coded sequence */
571 return FALSE;
572
573 if (oldcode == Seq_code_gap || newcode == Seq_code_gap) return FALSE;
574
575 seqlen = BioseqGetLen(bsp);
576
577 to = BSConvertSeq((ByteStorePtr) bsp->seq_data, newcode, oldcode, seqlen);
578 if (to == NULL)
579 return FALSE;
580
581 bsp->seq_data = (SeqDataPtr) to;
582 bsp->seq_data_type = newcode;
583
584 return TRUE;
585 }
586
587 /*****************************************************************************
588 *
589 * Boolean BioseqPack(bsp)
590 * converts a raw or const or delta bioseq to it's densist possible code
591 *
592 *****************************************************************************/
BioseqPack(BioseqPtr bsp)593 NLM_EXTERN Boolean BioseqPack (BioseqPtr bsp)
594
595 {
596 ValNodePtr vnp;
597
598 if (bsp == NULL) return FALSE;
599
600 if ((Bioseq_repr(bsp) == Seq_repr_raw) ||
601 (Bioseq_repr(bsp) == Seq_repr_const))
602 return BioseqRawPack(bsp);
603
604 if (Bioseq_repr(bsp) != Seq_repr_delta)
605 return FALSE;
606
607 /* not set up to compress delta proteins */
608
609 if (ISA_aa (bsp->mol)) return FALSE;
610
611 /* go through the delta chain */
612
613 for (vnp = (ValNodePtr)(bsp->seq_ext); vnp != NULL; vnp = vnp->next) {
614 if (vnp->choice == 2) /* SeqLit */
615 SeqLitPack((SeqLitPtr)(vnp->data.ptrvalue));
616 }
617 return TRUE;
618 }
619
620 /****************************************************************************
621 *
622 * Boolean SeqLitPack(slp)
623 * Pack a SeqLit as dense as possible
624 *
625 *****************************************************************************/
SeqLitPack(SeqLitPtr slp)626 NLM_EXTERN Boolean SeqLitPack (SeqLitPtr slp)
627 {
628 ByteStorePtr to = NULL;
629 Uint1 newcode = 0;
630
631 if (slp == NULL) return FALSE;
632
633 if ((slp->length == 0) || (slp->seq_data == NULL))
634 return FALSE;
635
636 if (slp->seq_data_type == Seq_code_gap) return FALSE;
637
638 to = BSPack((ByteStorePtr) slp->seq_data, slp->seq_data_type, slp->length, &newcode);
639
640 if (to != NULL)
641 {
642 slp->seq_data = (SeqDataPtr) to;
643 slp->seq_data_type = newcode;
644 }
645
646 return TRUE;
647 }
648
649 /**************************************************************************
650 *
651 * ByteStorePtr BSPack(from, oldcode, length, newcodeptr)
652 *
653 * packs a bytestore containing a nucleic acid code as dense as possible
654 * returns a new bytestoreptr and fills in newcodeptr if it can pack it
655 * more. Otherwise returns null. length is number of residues.
656 *
657 * if BSPack returns non-NULL, then it has already BSFree'd from.
658 *
659 ***************************************************************************/
BSPack(ByteStorePtr from,Uint1 oldcode,Int4 length,Uint1Ptr newcodeptr)660 NLM_EXTERN ByteStorePtr BSPack (ByteStorePtr from, Uint1 oldcode,
661 Int4 length, Uint1Ptr newcodeptr)
662 {
663 Int4 i, seqlen;
664 Uint1 newcode, byte;
665 Char Code4na[256], CodeIna[256];
666 Boolean remained;
667 Int2 actual, j;
668 Int4 cntr;
669 Uint1 tmp [401];
670
671 Uint1 set4na[16] = {17, 18, 20, 24, 33, 34, 36, 40,
672 65, 66, 68, 72, 129, 130, 132, 136};
673 Uint1 setIna[4] = {65, 67, 71, 84};
674
675 if ((! oldcode) || (! length) || (from == NULL))/* not a coded sequence */
676 return NULL;
677
678 if (oldcode == Seq_code_ncbi2na) /* already packed */
679 return NULL;
680
681 if (oldcode == Seq_code_gap) return NULL;
682
683 MemSet ((Pointer) tmp, 0, sizeof (tmp));
684
685 BSSeek(from, 0L, SEEK_SET);
686 newcode = Seq_code_ncbi2na; /* go for broke */
687
688 switch (oldcode) {
689
690 case Seq_code_ncbi4na:
691 remained = length%2;
692 seqlen = length/2;
693
694 MemSet(Code4na, 1, sizeof(Code4na));
695 for(i=0; i< 16; i++)
696 Code4na[set4na[i]] = 0;
697
698 cntr = (Int4) MIN ((Int4) seqlen, (Int4) (sizeof (tmp) - 1));
699 actual = (Int2) BSRead (from, tmp, (Int4) cntr);
700 j = 0;
701
702 while(seqlen && actual > 0) {
703 if (j == actual) {
704 cntr = (Int4) MIN ((Int4) seqlen, (Int4) (sizeof (tmp) - 1));
705 actual = (Int2) BSRead (from, tmp, (Int4) cntr);
706 j = 0;
707 }
708 /* byte = (Uint1) BSGetByte(from); */
709 byte = (Uint1) tmp [j];
710 j++;
711 if(Code4na[byte]) {
712 newcode = Seq_code_ncbi4na;
713 if (newcodeptr != NULL) {
714 *newcodeptr = newcode;
715 }
716 return BSConvertSeq(from, newcode, oldcode, length);
717 }
718 seqlen--;
719 }
720 if(remained) { /* one more uncompleted byte */
721 byte = (Uint1) BSGetByte(from);
722 if(Code4na[byte+1])
723 newcode = Seq_code_ncbi4na;
724 }
725 break;
726 case Seq_code_iupacna:
727 MemSet(CodeIna, 1, sizeof(CodeIna));
728 for(i=0; i < 4; i++)
729 CodeIna[setIna[i]] = 0;
730 seqlen = length;
731
732 cntr = (Int4) MIN ((Int4) seqlen, (Int4) (sizeof (tmp) - 1));
733 actual = (Int2) BSRead (from, tmp, (Int4) cntr);
734 j = 0;
735
736 while(seqlen && actual > 0) {
737 if (j == actual) {
738 cntr = (Int4) MIN ((Int4) seqlen, (Int4) (sizeof (tmp) - 1));
739 actual = (Int2) BSRead (from, tmp, (Int4) cntr);
740 j = 0;
741 }
742 /* byte = (Uint1) BSGetByte(from); */
743 byte = (Uint1) tmp [j];
744 j++;
745 if(CodeIna[byte]) {
746 newcode = Seq_code_ncbi4na;
747 break;
748 }
749 seqlen--;
750 }
751 break;
752 default:
753 break;
754 }
755 if (newcodeptr != NULL) {
756 *newcodeptr = newcode;
757 }
758 return BSConvertSeq(from, newcode, oldcode, length);
759 }
760
IsNASeqCode(Uint1 seqcode)761 static Boolean IsNASeqCode (Uint1 seqcode)
762 {
763 if (seqcode == Seq_code_iupacna
764 || seqcode == Seq_code_ncbi2na
765 || seqcode == Seq_code_ncbi4na
766 || seqcode == Seq_code_ncbi8na
767 || seqcode == Seq_code_ncbipna)
768 {
769 return TRUE;
770 }
771 else
772 {
773 return FALSE;
774 }
775 }
776
IsAASeqCode(Uint1 seqcode)777 static Boolean IsAASeqCode (Uint1 seqcode)
778 {
779 if (seqcode == Seq_code_iupacaa
780 || seqcode == Seq_code_ncbi8aa
781 || seqcode == Seq_code_ncbieaa
782 || seqcode == Seq_code_ncbipaa
783 || seqcode == Seq_code_iupacaa3
784 || seqcode == Seq_code_ncbistdaa)
785 {
786 return TRUE;
787 }
788 else
789 {
790 return FALSE;
791 }
792 }
793
794 /*****************************************************************************
795 *
796 * BSConvertSeq(bytestoreptr, newcode, oldcode, len)
797 * converts a bytestore to a new sequence representation
798 * frees old bytestore
799 * returns pointer to new one, or NULL on fail.
800 * len is residues
801 *
802 *****************************************************************************/
803
BSConvertSeq(ByteStorePtr from,Uint1 newcode,Uint1 oldcode,Int4 len)804 NLM_EXTERN ByteStorePtr BSConvertSeq (ByteStorePtr from, Uint1 newcode,
805 Uint1 oldcode, Int4 len)
806
807 {
808 ByteStorePtr to;
809 Uint1 byte_from, residue_from, bitctr_from, mask_from;
810 Uint1 lshift_from, rshift_from, bc_from, byte_to, bitctr_to;
811 Uint1 lshift_to[5], bc_to, byte_tmp;
812 SeqMapTablePtr smtp;
813 Int4 storelen, in_index = 0, out_index = 0;
814 Uint1Ptr out_buff, in_buff;
815
816 if ((from == NULL) || (! oldcode) || (! newcode) || (len <= 0))
817 return NULL;
818
819 if (oldcode == Seq_code_gap || newcode == Seq_code_gap) return NULL;
820
821 if (oldcode == newcode)
822 return from;
823
824 /* if we are converting from a protein to a nucleotide or vice versa,
825 * need this intermediate step.
826 */
827 if (IsAASeqCode (oldcode) && IsNASeqCode (newcode))
828 {
829 from = BSConvertSeq (from, Seq_code_iupacaa, oldcode, len);
830 oldcode = Seq_code_iupacna;
831 }
832 else if (IsNASeqCode (oldcode) && IsAASeqCode (newcode))
833 {
834 from = BSConvertSeq (from, Seq_code_iupacna, oldcode, len);
835 oldcode = Seq_code_iupacaa;
836 }
837 if (oldcode == newcode)
838 return from;
839
840 if ((smtp = SeqMapTableFind(newcode, oldcode)) == NULL)
841 return NULL;
842
843 if (newcode == Seq_code_ncbi2na)
844 storelen = (len / 4) + 1;
845 else if (newcode == Seq_code_ncbi4na)
846 storelen = (len / 2) + 1;
847 else
848 storelen = len;
849
850 if((to = BSNew((Uint4)storelen)) == NULL)
851 return NULL;
852
853 BSSeek(from, 0, 0);
854 BSSeek(to, 0, 0);
855
856 in_buff = (Uint1Ptr)MemNew(BSC_BUFF_CHUNK);
857 out_buff = (Uint1Ptr)MemNew(BSC_BUFF_CHUNK);
858
859 switch (oldcode) {
860
861 case Seq_code_ncbi2na:
862 bc_from = 4; /* bit shifts needed */
863 rshift_from = 6;
864 lshift_from = 2;
865 mask_from = 192;
866 break;
867
868 case Seq_code_ncbi4na:
869 bc_from = 2;
870 rshift_from = 4;
871 lshift_from = 4;
872 mask_from = 240;
873 break;
874
875 default:
876 bc_from = 1;
877 rshift_from = 0;
878 lshift_from = 0;
879 mask_from = 255;
880 break;
881 }
882
883 lshift_to[1] = 0;
884
885 switch (newcode) {
886
887 case Seq_code_ncbi2na:
888 bc_to = 4; /* bit shifts needed */
889 lshift_to[2] = 2;
890 lshift_to[3] = 4;
891 lshift_to[4] = 6;
892 break;
893
894 case Seq_code_ncbi4na:
895 bc_to = 2;
896 lshift_to[2] = 4;
897 break;
898
899 default:
900 bc_to = 1;
901 break;
902 }
903
904 bitctr_to = bc_to;
905 byte_to = 0;
906 bitctr_from = 0;
907
908 in_index = BSC_BUFF_CHUNK;
909
910 while (len) {
911 if (in_index == BSC_BUFF_CHUNK) {
912 in_index = (Int2) BSRead(from, (VoidPtr)in_buff, (Int4)BSC_BUFF_CHUNK);
913 in_index = 0;
914 }
915
916 if (! bitctr_from) { /* need a new byte */
917 byte_from = in_buff[in_index];
918 in_index++;
919 bitctr_from = bc_from;
920 }
921
922 residue_from = byte_from & mask_from;
923 residue_from >>= rshift_from;
924 byte_from <<= lshift_from;
925 bitctr_from--;
926
927 byte_tmp = SeqMapTableConvert(smtp, residue_from);
928
929 if (byte_tmp == INVALID_RESIDUE) {
930 ErrPostEx(SEV_ERROR, 0, 0, "BSConvertSeq: invalid residue [%d=%c]",
931 (int)residue_from, (char)residue_from);
932 BSFree(to);
933 MemFree(in_buff);
934 MemFree(out_buff);
935 return NULL;
936 }
937
938 byte_tmp <<= lshift_to[bitctr_to];
939 byte_to |= byte_tmp;
940 bitctr_to--;
941
942 if (! bitctr_to) {
943 if (out_index == BSC_BUFF_CHUNK) {
944
945 /* Flush buffer if it is full */
946
947 out_index = (Int2) BSWrite(to, (VoidPtr)out_buff, out_index);
948 out_index = 0;
949 }
950 out_buff[out_index] = byte_to;
951 out_index++;
952
953 bitctr_to = bc_to;
954 byte_to = 0;
955 }
956 len--;
957 }
958
959 /* Now we will BSWrite() all recorded bytes in buffer */
960
961 out_index = (Int2) BSWrite(to, (VoidPtr)out_buff, out_index);
962
963 /* And finaly partial byte not written */
964
965 if (bitctr_to != bc_to)
966 BSPutByte(to, byte_to);
967
968 BSFree(from);
969 MemFree(in_buff);
970 MemFree(out_buff);
971
972 return to;
973 }
974
975 /*****************************************************************************
976 *
977 * BSRebuildDNA(bytestoreptr, len, lbytes)
978 * restore ASCII sequence with abmiguity characters
979 * lbytes[0] == length of this storage
980 * frees old bytestore
981 * returns pointer to new one, or NULL on fail.
982 * len is residues
983 * lbytes is pointer to ambiguity storage
984 *
985 *****************************************************************************/
BSRebuildDNA(ByteStorePtr from,Int4 len,Uint4Ptr PNTR lbytes)986 NLM_EXTERN ByteStorePtr BSRebuildDNA (ByteStorePtr from, Int4 len,
987 Uint4Ptr PNTR lbytes)
988
989 {
990 Int4 i, am_num;
991 Uint4Ptr am_buff;
992 Uint1 char_to;
993 Int4 row_len, j;
994 SeqMapTablePtr smtp;
995
996 if(from == NULL || len <=0)
997 return NULL;
998
999 if(*lbytes == NULL)
1000 return from;
1001
1002 if ((smtp = SeqMapTableFind(Seq_code_iupacna,
1003 Seq_code_ncbi4na)) == NULL)
1004 return NULL;
1005
1006 am_num = **lbytes;
1007 am_buff = *lbytes + 1;
1008
1009 for(i = 0; i < am_num; i++) {
1010 char_to = (Uint1)RES_VALUE(am_buff[i]);
1011 row_len = (Int4)RES_LEN(am_buff[i]);
1012
1013 BSSeek(from, RES_OFFSET(am_buff[i]), SEEK_SET);
1014 for(j = 0; j <= row_len; j++)
1015 BSPutByte(from, SeqMapTableConvert(smtp, char_to));
1016 }
1017 return from;
1018 }
1019 /*****************************************************************************
1020 *
1021 * RebuildDNA_4na(buffer, length, lbytes)
1022 works with Uint1 buffer, not ByteStore.
1023 * restore ncbi4na sequence with abmiguity characters
1024 * returns TRUE on success, FALSE on failure.
1025 * lbytes is pointer to ambiguity storage
1026 *
1027 *****************************************************************************/
RebuildDNA_4na(Uint1Ptr buffer,Int4 length,Uint4Ptr lbytes)1028 NLM_EXTERN Boolean RebuildDNA_4na (Uint1Ptr buffer, Int4 length, Uint4Ptr lbytes)
1029
1030 {
1031 Boolean new = FALSE;
1032 Uint4 i;
1033 Uint4 amb_num;
1034 Uint4Ptr amb_buff;
1035 Uint1 char_l, char_r;
1036 Int4 row_len;
1037 Uint1 C_Mask[] = {0x0F, 0xF0};
1038 Int4 j, position = 0, pos =0 , rem =0 , index;
1039
1040 if(buffer == NULL || length == 0)
1041 return FALSE;
1042
1043 if(lbytes == NULL)
1044 return TRUE;
1045
1046 amb_num = *lbytes;
1047 amb_buff = lbytes + 1;
1048
1049 /* Check if highest order bit set. */
1050 if (amb_num & 0x80000000)
1051 {
1052 new = TRUE;
1053 amb_num &= 0x7FFFFFFF;
1054 }
1055
1056 for(i = 0; i < amb_num; i++) {
1057
1058 if (new)
1059 {
1060 char_r = (Uint1)(RES_VALUE(amb_buff[i]));
1061 row_len = (Int4)(RES_LEN_NEW(amb_buff[i]));
1062 position = amb_buff[i+1];
1063 }
1064 else
1065 {
1066 char_r = (Uint1)(RES_VALUE(amb_buff[i]));
1067 row_len = (Int4)(RES_LEN(amb_buff[i]));
1068 position = RES_OFFSET(amb_buff[i]);
1069 }
1070
1071 pos = position/2;
1072 rem = position%2; /* 0 or 1 */
1073 char_l = char_r << 4;
1074
1075 for(index = pos, j =0; j <=row_len; j++) {
1076
1077 buffer[index] = (buffer[index] & C_Mask[rem]) + (rem ? char_r : char_l);
1078 rem = !rem;
1079
1080 if(!rem) index++;
1081 }
1082
1083 if (new) /* for new format we have 8 bytes for each element. */
1084 i++;
1085 }
1086
1087 return TRUE;
1088 }
1089 /*****************************************************************************
1090 *
1091 * BSRebuildDNA_4na(bytestoreptr, lbytes)
1092 * restore ncbi4na sequence with abmiguity characters
1093 * lbytes[0] == length of this storage
1094 * frees old bytestore
1095 * returns pointer to new one, or NULL on fail.
1096 * lbytes is pointer to ambiguity storage
1097 *
1098 *****************************************************************************/
BSRebuildDNA_4na(ByteStorePtr from,Uint4Ptr lbytes)1099 NLM_EXTERN ByteStorePtr BSRebuildDNA_4na (ByteStorePtr from, Uint4Ptr lbytes)
1100
1101 {
1102 Int4 bs_length;
1103 Uint1Ptr buffer;
1104 Int4 num_bytes;
1105
1106 if(from == NULL)
1107 return NULL;
1108
1109 if(lbytes == NULL)
1110 return from;
1111
1112 bs_length = BSLen(from);
1113 buffer = (Uint1Ptr) Nlm_Malloc(bs_length);
1114 if (buffer == NULL)
1115 return NULL;
1116
1117 BSSeek(from, 0, SEEK_SET);
1118
1119 if((num_bytes = BSRead(from, buffer, bs_length)) != bs_length)
1120 return NULL;
1121
1122 if (RebuildDNA_4na(buffer, bs_length, lbytes) == FALSE)
1123 return NULL;
1124
1125 BSSeek(from, 0, SEEK_SET);
1126 BSWrite(from, buffer, bs_length);
1127
1128 MemFree(buffer);
1129 return from;
1130 }
1131
/*****************************************************************************
*
*   Int4 BSCompressRead (Pointer data, Uint1Ptr buf, Int4 length)
*       Hook function reading up to "length" bytes from the bytestore
*       "data" into "buf".
*
*   NOTE!! The hook has to report a number of residues, so it returns
*          twice the number of bytes actually read.  It may therefore be
*          used ONLY when the real number of residues in the sequence is
*          known and passed to GenericCompressDNA().
*
*****************************************************************************/
static Int4 BSCompressRead (Pointer data, Uint1Ptr buf, Int4 length);
static Int4 BSCompressRead (Pointer data, Uint1Ptr buf, Int4 length)
{
    Int4 bytes_read = (Int4) BSRead((ByteStorePtr) data, (VoidPtr) buf, length);

    return bytes_read * 2;
}
1151
/*****************************************************************************
*
*   Int4 BSCompressWrite (Pointer data, Uint1Ptr buf, Int4 length)
*       Write hook for GenericCompressDNA(): "data" is treated as a
*       ByteStorePtr and "length" bytes from "buf" are written to it.
*
*       Returns the value returned by BSWrite().
*
*****************************************************************************/
static Int4 BSCompressWrite (Pointer data, Uint1Ptr buf, Int4 length);
static Int4 BSCompressWrite (Pointer data, Uint1Ptr buf, Int4 length)
{
    ByteStorePtr store = (ByteStorePtr) data;
    Int4         written;

    written = (Int4) BSWrite(store, (VoidPtr) buf, length);
    return written;
}
1164
1165 /*****************************************************************************
1166 *
1167 * BSCompressDNA(bytestoreptr, len, lbytes)
1168 * converts a ncbi4na bytestore into ncbi2na
1169 * returns pointer to ambiguity storage
1170 * lbytes[0] == length of this storage
1171 * frees old bytestore
1172 * returns pointer to new one, or NULL on fail.
1173 * len is residues
1174 *
1175 *****************************************************************************/
BSCompressDNA(ByteStorePtr from,Int4 len,Uint4Ptr PNTR lbytes)1176 NLM_EXTERN ByteStorePtr BSCompressDNA(ByteStorePtr from, Int4 len,
1177 Uint4Ptr PNTR lbytes)
1178 {
1179 ByteStorePtr to;
1180 to = BSNew((Uint4)len/4+1);
1181
1182 BSSeek(from, 0, 0);
1183 BSSeek(to, 0, 0);
1184
1185 if(!GenericCompressDNA((VoidPtr) from, (VoidPtr) to,
1186 (Uint4)len,
1187 BSCompressRead,
1188 BSCompressWrite,
1189 lbytes
1190 )) {
1191 return NULL;
1192 }
1193
1194 BSFree(from);
1195 return to;
1196 }
1197
1198 /*****************************************************************************
1199 *
1200 * BSCompressDNANew(bytestoreptr, len, lbytes)
1201 * converts a ncbi4na bytestore into ncbi2na
1202 * returns pointer to ambiguity storage
1203 * lbytes[0] == length of this storage
1204 * frees old bytestore
1205 * returns pointer to new one, or NULL on fail.
1206 * len is residues
1207 *
1208 * This function stores the ambiguity code in 8 bytes so
1209 * that there is no cutoff for sequences greater than 16 million bps.
1210 * as there is for BSCompressDNA.
1211 *
1212 *****************************************************************************/
BSCompressDNANew(ByteStorePtr from,Int4 len,Uint4Ptr PNTR lbytes)1213 NLM_EXTERN ByteStorePtr BSCompressDNANew(ByteStorePtr from, Int4 len,
1214 Uint4Ptr PNTR lbytes)
1215 {
1216 ByteStorePtr to;
1217 to = BSNew((Uint4)len/4+1);
1218
1219 BSSeek(from, 0, 0);
1220 BSSeek(to, 0, 0);
1221
1222 if(!GenericCompressDNAEx((VoidPtr) from, (VoidPtr) to,
1223 (Uint4)len,
1224 BSCompressRead,
1225 BSCompressWrite,
1226 lbytes, TRUE)) {
1227 return NULL;
1228 }
1229
1230 BSFree(from);
1231 return to;
1232 }
1233
1234 /*****************************************************************************
1235 *
1236 * GenericCompressDNA()
1237 * converts from VoidPtr "from" in 4na encoding to
1238 * VoidPtr "to" in 2Na encoding
1239 * returns pointer to ambiguity storage
1240 * lbytes[0] == length of this storage
1241 * returns TRUE if succeded, or FALSE on fail.
1242 * seq_len is maximum number of residues in sequence
1243 * or ((Uint4) -1) if final length is unknown.
1244 * read_func and write_func - hook functions to read from "from"
1245 * and to write to "to"
1246 *
1247 * NOTE! read_func must return number of residues read, that usualy
1248 * twice as much as returned number of bytes. Only last returned
1249 * byte may have only one residue and this will be handled by
1250 * seq_len value or returned value from read_func()
1251 *****************************************************************************/
GenericCompressDNA(VoidPtr from,VoidPtr to,Uint4 seq_len,CompressRWFunc read_func,CompressRWFunc write_func,Uint4Ptr PNTR lbytes)1252 NLM_EXTERN Boolean GenericCompressDNA(VoidPtr from,
1253 VoidPtr to,
1254 Uint4 seq_len,
1255 CompressRWFunc read_func,
1256 CompressRWFunc write_func,
1257 Uint4Ptr PNTR lbytes)
1258 {
1259 return GenericCompressDNAEx(from, to, seq_len, read_func, write_func, lbytes, FALSE);
1260 }
1261
GenericCompressDNAEx(VoidPtr from,VoidPtr to,Uint4 seq_len,CompressRWFunc read_func,CompressRWFunc write_func,Uint4Ptr PNTR lbytes,Boolean x_new)1262 NLM_EXTERN Boolean GenericCompressDNAEx(VoidPtr from,
1263 VoidPtr to,
1264 Uint4 seq_len,
1265 CompressRWFunc read_func,
1266 CompressRWFunc write_func,
1267 Uint4Ptr PNTR lbytes,
1268 Boolean x_new)
1269 {
1270 Int4 total_read, chunk_used, seq_offset;
1271 Int4 in_index = 0, out_index = 0;
1272 Uint1Ptr out_buff, in_buff;
1273 Uint1 bc_from, rshift_from, lshift_from, mask_from;
1274 Uint1 bc_to, byte_tmp;
1275 Uint1 bitctr_to, byte_to, byte_from, bitctr_from, residue_from;
1276 Uint1 lshift_to[5] = {0, 0, 2, 4, 6 };
1277
1278 Int4 row_len =0;
1279 Uint1 last_ambchar = INVALID_RESIDUE;
1280 Uint4Ptr ambchar;
1281 Int4 ambsize = 2*(BSC_BUFF_CHUNK/2); /* we need this to be a multiple of two for the new format. */
1282
1283 if(from == NULL) /* Invalid ByteStore format */
1284 return FALSE;
1285
1286 /* Translation tables Initialization fot ncbi4na->ncbi2na*/
1287
1288 in_buff = (Uint1Ptr)MemNew(BSC_BUFF_CHUNK);
1289 out_buff = (Uint1Ptr)MemNew(BSC_BUFF_CHUNK);
1290
1291 bc_from = 2;
1292 rshift_from = 4;
1293 lshift_from = 4;
1294 mask_from = 240;
1295 bc_to = 4; /* bit shifts needed */
1296
1297 bitctr_to = bc_to;
1298 byte_to = 0;
1299 bitctr_from = 0;
1300
1301 ambchar = (Uint4Ptr) Nlm_Malloc(sizeof(Uint4)*(ambsize + 1)); /* all plus one */
1302 *ambchar = 0;
1303
1304 seq_offset = chunk_used = in_index = total_read = 0;
1305
1306 while(seq_offset != seq_len) {
1307 if (chunk_used == total_read) {
1308 /* supposed, that in 4na total_read = in_index*2 or in_index*2-1 */
1309 if((total_read = read_func(from, in_buff, (Int4)BSC_BUFF_CHUNK)) == 0)
1310 break;
1311 if(total_read < 0) { /* ERROR!!! */
1312 MemFree(ambchar);
1313 MemFree(in_buff);
1314 MemFree(out_buff);
1315 return FALSE;
1316 }
1317 in_index = 0;
1318 chunk_used = 0;
1319 }
1320
1321 if (!bitctr_from) { /* need a new byte */
1322 byte_from = in_buff[in_index];
1323 bitctr_from = bc_from;
1324 in_index++;
1325 }
1326 residue_from = byte_from & mask_from;
1327 residue_from >>= rshift_from;
1328 byte_from <<= lshift_from;
1329 bitctr_from--;
1330 if(!Convert4NaRandom(residue_from, &byte_tmp)) {
1331
1332 /* We have to handle invalid residues in a good way */
1333
1334 if(*ambchar >= (Uint4)(ambsize-1)) { /* Reallocating buffer if necessary */
1335 ambsize += 2*(BSC_BUFF_CHUNK/2); /* we need this to be a multiple of two for the new format. */
1336 ambchar = (Uint4Ptr) Realloc(ambchar, (ambsize+1)*sizeof(Uint4));
1337 }
1338
1339 /* Constructing integer as <1111. 1111. 11111111.11111111.11111111
1340 * <char><length><--------- offset -------->
1341 * First interer in array will be length of array
1342 */
1343
1344 if (x_new && seq_len >= 0xFFFFFF)
1345 {
1346 if(last_ambchar != residue_from || row_len == 0xFFF) {
1347 if ((*ambchar) == 0)
1348 (*ambchar)++;
1349 else
1350 (*ambchar) += 2;
1351 ambchar[*ambchar] = 0;
1352 ambchar[*ambchar] += residue_from;
1353 ambchar[*ambchar] <<= 28;
1354 /* Put the seq_offset in the 2nd integer. */
1355 ambchar[(*ambchar)+1] = seq_offset;
1356
1357 last_ambchar = residue_from;
1358 row_len = 0;
1359 /* printf("Ambchar = %u(%u)(%u) : %u %u %u\n",
1360 residue_from, row_len, total_len-len,
1361 RES_VALUE(ambchar[*ambchar]),
1362 RES_LEN(ambchar[*ambchar]),
1363 RES_OFFSET(ambchar[*ambchar])); */
1364 } else {
1365 (ambchar[*ambchar]) += LEN_STEP_MASK_NEW;
1366 row_len++;
1367 /* printf("Ambchar = %u(%u)(%u) : %u %u %u\n",
1368 residue_from, row_len, total_len-len,
1369 RES_VALUE(ambchar[*ambchar]),
1370 RES_LEN(ambchar[*ambchar]),
1371 RES_OFFSET(ambchar[*ambchar])); */
1372 }
1373 }
1374 else
1375 {
1376 if(last_ambchar != residue_from || row_len == 15) {
1377 (*ambchar)++;
1378 ambchar[*ambchar] = 0;
1379 ambchar[*ambchar] += residue_from;
1380 ambchar[*ambchar] <<= 28;
1381 ambchar[*ambchar] += seq_offset;
1382
1383 last_ambchar = residue_from;
1384 row_len = 0;
1385 /* printf("Ambchar = %u(%u)(%u) : %u %u %u\n",
1386 residue_from, row_len, total_len-len,
1387 RES_VALUE(ambchar[*ambchar]),
1388 RES_LEN(ambchar[*ambchar]),
1389 RES_OFFSET(ambchar[*ambchar])); */
1390 } else {
1391 (ambchar[*ambchar]) += LEN_STEP_MASK;
1392 row_len++;
1393 /* printf("Ambchar = %u(%u)(%u) : %u %u %u\n",
1394 residue_from, row_len, total_len-len,
1395 RES_VALUE(ambchar[*ambchar]),
1396 RES_LEN(ambchar[*ambchar]),
1397 RES_OFFSET(ambchar[*ambchar])); */
1398 }
1399 }
1400 } else {
1401 last_ambchar = INVALID_RESIDUE; /* reset of last residue */
1402 }
1403 byte_tmp <<= lshift_to[bitctr_to];
1404 byte_to |= byte_tmp;
1405 bitctr_to--;
1406 if (! bitctr_to) {
1407 if (out_index == BSC_BUFF_CHUNK) {
1408
1409 /* Flush buffer if it is full */
1410
1411 out_index = write_func(to, out_buff, out_index);
1412 out_index = 0;
1413 }
1414
1415 out_buff[out_index] = byte_to;
1416 out_index++;
1417
1418 bitctr_to = bc_to;
1419 byte_to = 0;
1420 }
1421 chunk_used++;
1422 seq_offset++;
1423 } /* while TRUE */
1424
1425 /* Now we will BSWrite() all recorded bytes in buffer */
1426
1427 out_index = write_func(to, out_buff, out_index);
1428
1429 if (bitctr_to != bc_to) { /* partial byte not written */
1430 byte_to += (seq_len)%4; /* last 2 bits will be remainder */
1431 write_func(to, &byte_to, 1);
1432 } else {
1433 write_func(to, &byte_to, 1); /* NULLB anyway */
1434 }
1435
1436 if(!*ambchar) { /* no ambiguous characters found */
1437 MemFree(ambchar);
1438 *lbytes = NULL;
1439 } else {
1440 if (x_new && seq_len >= 0xFFFFFF)
1441 {
1442 (*ambchar)++;
1443 *ambchar += 0x80000000;
1444 }
1445 *lbytes = (Uint4Ptr)ambchar;
1446 }
1447 MemFree(in_buff);
1448 MemFree(out_buff);
1449 return TRUE;
1450 }
1451
1452 /*****************************************************************************
1453 * --- To be deleted ---
1454 * BSCompressDNA(bytestoreptr, len, lbytes)
1455 * converts a ncbi4na bytestore into ncbi2na
1456 * returns pointer to ambiguity storage
1457 * lbytes[0] == length of this storage
1458 * frees old bytestore
1459 * returns pointer to new one, or NULL on fail.
1460 * len is residues
1461 *
1462 *****************************************************************************/
BSCompressDNAOld(ByteStorePtr from,Int4 len,Uint4Ptr PNTR lbytes)1463 NLM_EXTERN ByteStorePtr BSCompressDNAOld(ByteStorePtr from, Int4 len,
1464 Uint4Ptr PNTR lbytes)
1465 {
1466 ByteStorePtr to;
1467 Int4 total_len = len;
1468 Int4 storelen = len/4 + 1, in_index = 0, out_index = 0;
1469 Uint1Ptr out_buff, in_buff;
1470 Uint1 bc_from, rshift_from, lshift_from, mask_from;
1471 Uint1 bc_to, byte_tmp;
1472 Uint1 bitctr_to, byte_to, byte_from, bitctr_from, residue_from;
1473 Uint1 lshift_to[5] = {0, 0, 2, 4, 6 };
1474
1475 Uint1 row_len =0, last_ambchar = INVALID_RESIDUE;
1476 Uint4Ptr ambchar;
1477 Int4 ambsize = BSC_BUFF_CHUNK;
1478
1479 if(from == NULL) /* Invalid ByteStore format */
1480 return NULL;
1481
1482 /* Translation tables Initialization fot ncbi4na->ncbi2na*/
1483
1484 if((to = BSNew((Uint4)storelen)) == NULL)
1485 return NULL;
1486
1487 BSSeek(from, 0, 0);
1488 BSSeek(to, 0, 0);
1489
1490 in_buff = (Uint1Ptr)MemNew(BSC_BUFF_CHUNK);
1491 out_buff = (Uint1Ptr)MemNew(BSC_BUFF_CHUNK);
1492
1493 bc_from = 2;
1494 rshift_from = 4;
1495 lshift_from = 4;
1496 mask_from = 240;
1497 bc_to = 4; /* bit shifts needed */
1498
1499 bitctr_to = bc_to;
1500 byte_to = 0;
1501 bitctr_from = 0;
1502
1503 ambchar = (Uint4Ptr) MemNew(sizeof(Uint4)*(ambsize + 1)); /* all plus one */
1504 *ambchar = 0;
1505
1506 in_index = BSC_BUFF_CHUNK;
1507
1508 while(len) {
1509 if (in_index == BSC_BUFF_CHUNK) {
1510 in_index = (Int2) BSRead(from, (VoidPtr)in_buff, (Int4)BSC_BUFF_CHUNK);
1511 in_index = 0;
1512 }
1513
1514 if (! bitctr_from) { /* need a new byte */
1515 byte_from = in_buff[in_index];
1516 in_index++;
1517 bitctr_from = bc_from;
1518 }
1519 residue_from = byte_from & mask_from;
1520 residue_from >>= rshift_from;
1521 byte_from <<= lshift_from;
1522 bitctr_from--;
1523 if(!Convert4NaRandom(residue_from, &byte_tmp)) {
1524
1525 /* We have to handle invalid residues in a good way */
1526
1527 if(*ambchar >= (Uint4)ambsize) { /* Reallocating buffer if necessary */
1528 ambsize += BSC_BUFF_CHUNK;
1529 ambchar = (Uint4Ptr) Realloc(ambchar, (ambsize+1)*sizeof(Uint4));
1530 }
1531
1532 /* Constructing integer as <1111. 1111. 11111111.11111111.11111111
1533 * <char><length><--------- offset -------->
1534 * First interer in array will be length of array
1535 */
1536
1537 if(last_ambchar != residue_from || row_len == 15) {
1538 (*ambchar)++;
1539 ambchar[*ambchar] = 0;
1540 ambchar[*ambchar] += residue_from;
1541 ambchar[*ambchar] <<= 28;
1542 ambchar[*ambchar] += (total_len-len);
1543
1544 last_ambchar = residue_from;
1545 row_len = 0;
1546 /* printf("Ambchar = %u(%u)(%u) : %u %u %u\n",
1547 residue_from, row_len, total_len-len,
1548 RES_VALUE(ambchar[*ambchar]),
1549 RES_LEN(ambchar[*ambchar]),
1550 RES_OFFSET(ambchar[*ambchar])); */
1551 } else {
1552 (ambchar[*ambchar]) += LEN_STEP_MASK;
1553 row_len++;
1554 /* printf("Ambchar = %u(%u)(%u) : %u %u %u\n",
1555 residue_from, row_len, total_len-len,
1556 RES_VALUE(ambchar[*ambchar]),
1557 RES_LEN(ambchar[*ambchar]),
1558 RES_OFFSET(ambchar[*ambchar])); */
1559 }
1560 } else {
1561 last_ambchar = INVALID_RESIDUE; /* reset of last residue */
1562 }
1563 byte_tmp <<= lshift_to[bitctr_to];
1564 byte_to |= byte_tmp;
1565 bitctr_to--;
1566 if (! bitctr_to) {
1567 if (out_index == BSC_BUFF_CHUNK) {
1568
1569 /* Flush buffer if it is full */
1570
1571 out_index = (Int2) BSWrite(to, (VoidPtr)out_buff, out_index);
1572 out_index = 0;
1573 }
1574
1575 out_buff[out_index] = byte_to;
1576 out_index++;
1577
1578 bitctr_to = bc_to;
1579 byte_to = 0;
1580 }
1581 len--;
1582 }
1583
1584 /* Now we will BSWrite() all recorded bytes in buffer */
1585
1586 out_index = (Int2) BSWrite(to, (VoidPtr)out_buff, out_index);
1587
1588 if (bitctr_to != bc_to) { /* partial byte not written */
1589 byte_to += total_len%4; /* last 2 bits will be remainder */
1590 BSPutByte(to, byte_to);
1591 } else {
1592 BSPutByte(to, byte_to); /* NULLB anyway */
1593 }
1594 BSFree(from);
1595
1596 if(!*ambchar) { /* no ambiguous characters found */
1597 MemFree(ambchar);
1598 *lbytes = NULL;
1599 } else {
1600 *lbytes = (Uint4Ptr)ambchar;
1601 }
1602 MemFree(in_buff);
1603 MemFree(out_buff);
1604 return to;
1605 }
1606
1607 /*****************************************************************************
1608 *
1609 * void CorrectGeneFeatLocation(sep, data, n, m)
1610 *
1611 * Correct gene location for mRNA sequences, i.e.
1612 * puts start = 0, end = total_length_of_sequence - 1.
1613 *
1614 *****************************************************************************/
CorrectGeneFeatLocation(SeqEntryPtr sep,Pointer data,Int4 n,Int2 m)1615 NLM_EXTERN void CorrectGeneFeatLocation(SeqEntryPtr sep, Pointer data,
1616 Int4 n, Int2 m)
1617 {
1618 BioseqPtr bsp;
1619 ValNodePtr vnp;
1620 MolInfoPtr mip;
1621 SeqAnnotPtr sap;
1622 SeqFeatPtr sfp;
1623 SeqIntPtr sip;
1624 SeqDescrPtr sdp;
1625 BioSourcePtr biop;
1626 OrgRefPtr orp;
1627
1628 if(sep == NULL)
1629 return;
1630
1631 /* We need only Bioseqs
1632 */
1633 if(IS_Bioseq(sep) != TRUE)
1634 return;
1635
1636 bsp = sep->data.ptrvalue;
1637 if(bsp == NULL)
1638 return;
1639
1640 /* Looks at nucleic acids with the non-zero length only
1641 */
1642 if(ISA_na(bsp->mol) != TRUE || bsp->length == 0)
1643 return;
1644
1645 /* Checks bioseq if it is mRNA
1646 */
1647 for(vnp = bsp->descr; vnp != NULL; vnp = vnp->next) {
1648 if(vnp->choice != Seq_descr_molinfo)
1649 continue;
1650 mip = vnp->data.ptrvalue;
1651 if(mip == NULL || mip->biomol != 3) /* not mRNA */
1652 continue;
1653 break;
1654 }
1655
1656 /* If bioseq is not mRNA, does nothing, just return
1657 */
1658 if(vnp == NULL)
1659 return;
1660
1661 sdp = GetNextDescriptorUnindexed (bsp, Seq_descr_source, NULL);
1662 if (sdp != NULL) {
1663 biop = (BioSourcePtr) sdp->data.ptrvalue;
1664 if (biop != NULL) {
1665 if (biop->origin == ORG_ARTIFICIAL) {
1666 orp = biop->org;
1667 if (orp != NULL) {
1668 if (StringICmp (orp->taxname, "synthetic construct") == 0) return;
1669 }
1670 }
1671 }
1672 }
1673
1674 /* Otherwise go ahead
1675 */
1676 for(sap = bsp->annot; sap != NULL; sap = sap->next) {
1677 if(sap->type != 1)
1678 continue;
1679
1680 for(sfp = sap->data; sfp != NULL; sfp = sfp->next) {
1681 /* Is it gene feature ?
1682 */
1683 if(sfp->data.choice != SEQFEAT_GENE)
1684 continue;
1685
1686 /* If so, is it not empty ?
1687 */
1688 if(sfp->data.value.ptrvalue == NULL)
1689 continue;
1690
1691 /* Then correct location
1692 */
1693 for(vnp = sfp->location; vnp != NULL; vnp = vnp->next) {
1694 if(vnp->choice != SEQLOC_INT)
1695 continue;
1696 sip = vnp->data.ptrvalue;
1697 if(sip == NULL)
1698 continue;
1699 if(sip->from != 0 || sip->to != bsp->length - 1) {
1700 ErrPostEx(SEV_WARNING, 0, 0,
1701 "Incorrect gene location: [%d..%d] "
1702 "instead of [0..%d]. Fixed.",
1703 sip->from, sip->to, bsp->length - 1);
1704 sip->from = 0;
1705 sip->to = bsp->length - 1;
1706 }
1707 }
1708 }
1709 }
1710 }
1711
1712 /*****************************************************************************
1713 *
1714 * Int4 NumberingOffset(np, value)
1715 * returns an offset to the sequence based on value
1716 * returns -1 if invalid
1717 * does NOT deal with Num-ref types
1718 * does NOT deal with specified ranges on the sequence
1719 *
1720 *****************************************************************************/
NumberingOffset(NumberingPtr np,DataValPtr vp)1721 NLM_EXTERN Int4 NumberingOffset (NumberingPtr np, DataValPtr vp)
1722
1723 {
1724 Int4 offset = -1, i, num;
1725 NumContPtr ncp;
1726 NumEnumPtr nep;
1727 NumRealPtr nrp;
1728 CharPtr PNTR ptr;
1729 CharPtr name;
1730 FloatHi foffset;
1731
1732 if ((np == NULL) || (vp == NULL)) return -1;
1733
1734 switch (np->choice)
1735 {
1736 case Numbering_cont:
1737 ncp = (NumContPtr)np->data.ptrvalue;
1738 if (ncp->ascending)
1739 {
1740 offset = vp->intvalue - ncp->refnum;
1741 if ((ncp->refnum < 0) && (! ncp->has_zero) &&
1742 (vp->intvalue > 0))
1743 offset--;
1744 }
1745 else
1746 {
1747 offset = ncp->refnum - vp->intvalue;
1748 if ((ncp->refnum > 0) && (! ncp->has_zero) &&
1749 (vp->intvalue < 0))
1750 offset--;
1751 }
1752 break;
1753 case Numbering_enum:
1754 nep = (NumEnumPtr)np->data.ptrvalue;
1755 name = (CharPtr)vp->ptrvalue;
1756 num = nep->num;
1757 ptr = nep->names;
1758 for (i = 0; i < num; i++, ptr++)
1759 {
1760 if (! StringCmp(name, *ptr))
1761 {
1762 offset = i;
1763 break;
1764 }
1765 }
1766 break;
1767 case Numbering_ref_source:
1768 case Numbering_ref_align:
1769 ErrPostEx(SEV_ERROR, 0,0, "Num-ref not supported yet");
1770 break;
1771 case Numbering_real:
1772 nrp = (NumRealPtr)np->data.ptrvalue;
1773 foffset = (vp->realvalue - nrp->b) / nrp->a;
1774 offset = (Int4) foffset;
1775 if ((foffset - (FloatHi)offset) >= 0.5)
1776 offset++;
1777 break;
1778 }
1779 return offset;
1780 }
1781
1782 /*****************************************************************************
1783 *
1784 * NumberingValue (np, offset, value)
1785 * fills value with the display value of offset
1786 * return type indicates type of value
1787 * 0 = failed
1788 * 1 = intvalue
1789 * 2 = realvalue
1790 * 3 = ptrvalue (string)
1791 *
1792 *****************************************************************************/
NumberingValue(NumberingPtr np,Int4 offset,DataValPtr vp)1793 NLM_EXTERN Int2 NumberingValue (NumberingPtr np, Int4 offset, DataValPtr vp)
1794
1795 {
1796 NumContPtr ncp;
1797 NumEnumPtr nep;
1798 NumRealPtr nrp;
1799 Int2 type = 0;
1800 Int4 intval;
1801 FloatHi fval;
1802
1803 if ((np == NULL) || (vp == NULL)) return -1;
1804
1805 switch (np->choice)
1806 {
1807 case Numbering_cont:
1808 ncp = (NumContPtr)np->data.ptrvalue;
1809 if (ncp->ascending)
1810 {
1811 intval = offset + ncp->refnum;
1812 if ((ncp->refnum < 0) && (! ncp->has_zero) &&
1813 (intval >= 0))
1814 intval++;
1815 }
1816 else
1817 {
1818 intval = ncp->refnum - offset;
1819 if ((ncp->refnum > 0) && (! ncp->has_zero) &&
1820 (intval <= 0))
1821 intval--;
1822 }
1823 vp->intvalue = intval;
1824 type = 1;
1825 break;
1826 case Numbering_enum:
1827 nep = (NumEnumPtr)np->data.ptrvalue;
1828 if (offset < nep->num)
1829 {
1830 vp->ptrvalue = nep->names[offset];
1831 type = 3;
1832 }
1833 break;
1834 case Numbering_ref_source:
1835 case Numbering_ref_align:
1836 ErrPostEx(SEV_ERROR, 0,0, "Num-ref not supported yet");
1837 break;
1838 case Numbering_real:
1839 nrp = (NumRealPtr)np->data.ptrvalue;
1840 fval = ((FloatHi)offset * nrp->a) + nrp->b;
1841 type = 2;
1842 vp->realvalue = fval;
1843 break;
1844 }
1845
1846 return type;
1847 }
1848
1849 /*****************************************************************************
1850 *
1851 * NumberingValueBySeqId(sip, offset, vp)
1852 *
1853 *****************************************************************************/
NumberingValueBySeqId(SeqIdPtr sip,Int4 offset,DataValPtr vp)1854 NLM_EXTERN Int2 NumberingValueBySeqId (SeqIdPtr sip, Int4 offset, DataValPtr vp)
1855
1856 {
1857 BioseqPtr bsp;
1858 NumberingPtr np = NULL;
1859
1860 if ((sip == NULL) || (vp == NULL)) return -1;
1861
1862 bsp = BioseqFind(sip);
1863 if (bsp == NULL)
1864 np = NumberingDefaultGet();
1865 else
1866 np = BioseqGetNumbering(bsp);
1867
1868 return NumberingValue(np, offset, vp);
1869 }
1870
1871 /*****************************************************************************
1872 *
1873 * NumberingDefaultLoad()
1874 *
1875 *****************************************************************************/
NumberingDefaultLoad(void)1876 NLM_EXTERN void NumberingDefaultLoad (void)
1877
1878 {
1879 NumContPtr ncp;
1880
1881 if (stdnum != NULL)
1882 return;
1883
1884 stdnum = ValNodeNew(NULL); /* set up numbering from 1 */
1885 stdnum->choice = Numbering_cont;
1886 ncp = NumContNew();
1887 ncp->refnum = 1; /* number from one */
1888 ncp->ascending = TRUE;
1889 stdnum->data.ptrvalue = (Pointer) ncp;
1890 return;
1891 }
1892
1893 /*****************************************************************************
1894 *
1895 * NumberingDefaultGet()
1896 * returns a default numbering object (start at 1, ascending, no 0)
1897 *
1898 *****************************************************************************/
NumberingDefaultGet(void)1899 NLM_EXTERN NumberingPtr NumberingDefaultGet (void)
1900
1901 {
1902 if (stdnum == NULL)
1903 NumberingDefaultLoad();
1904 return stdnum;
1905 }
1906
1907 /*****************************************************************************
1908 *
1909 * SeqCodeTablePtr SeqCodeTableFind(code)
1910 * Sequence codes defined in objseq.h
1911 *
1912 *****************************************************************************/
SeqCodeTableFind(Uint1 code)1913 NLM_EXTERN SeqCodeTablePtr LIBCALL SeqCodeTableFind (Uint1 code)
1914 {
1915 return SeqCodeTableFindObj (code);
1916 }
1917
1918 /*****************************************************************************
1919 *
1920 * OneLetterCode(sctp)
1921 * returns TRUE if sequence code table sctp uses one letter symbols
1922 *
1923 *****************************************************************************/
OneLetterCode(SeqCodeTablePtr sctp)1924 NLM_EXTERN Boolean OneLetterCode (SeqCodeTablePtr sctp)
1925 {
1926 if (sctp == NULL) return FALSE;
1927 return sctp->one_letter;
1928 }
1929
1930 /*****************************************************************************
1931 *
1932 * FirstResidueInCode(sctp)
1933 * returns first valid residue code in sequence code table
1934 *
1935 *****************************************************************************/
FirstResidueInCode(SeqCodeTablePtr sctp)1936 NLM_EXTERN Uint1 FirstResidueInCode (SeqCodeTablePtr sctp)
1937 {
1938 if (sctp == NULL) return INVALID_RESIDUE;
1939 return sctp->start_at;
1940 }
1941
1942 /*****************************************************************************
1943 *
1944 * LastResidueInCode(sctp)
1945 * returns last valid residue code in sequence code table
1946 * nb: some codes have "holes", a range of invalid values between first
1947 * and last.
1948 *
1949 *****************************************************************************/
LastResidueInCode(SeqCodeTablePtr sctp)1950 NLM_EXTERN Uint1 LastResidueInCode (SeqCodeTablePtr sctp)
1951 {
1952 if (sctp == NULL) return INVALID_RESIDUE;
1953 return (Uint1)((int)(sctp->start_at) + (int)(sctp->num) - 1);
1954 }
1955
1956 /*****************************************************************************
1957 *
1958 * GetIndexForResidue(sctp, residue)
1959 * gets index into sctp structs for residue
1960 * returns INVALID_RESIDUE if no good
1961 *
1962 *****************************************************************************/
1963 NLM_EXTERN Uint1 GetIndexForResidue(SeqCodeTablePtr sctp, Uint1 residue);
GetIndexForResidue(SeqCodeTablePtr sctp,Uint1 residue)1964 NLM_EXTERN Uint1 GetIndexForResidue(SeqCodeTablePtr sctp, Uint1 residue)
1965 {
1966 if (sctp == NULL) return INVALID_RESIDUE;
1967 if (residue < sctp->start_at) return INVALID_RESIDUE;
1968 residue -= sctp->start_at;
1969 if (residue >= sctp->num) return INVALID_RESIDUE;
1970 return residue;
1971 }
1972
1973
1974 /*****************************************************************************
1975 *
1976 * GetSymbolForResidue(sctp, residue)
1977 * returns the ONE LETTER symbol for residue if sequence code has one
1978 * letter symbols. returns INVALID_RESIDUE if not a valid residue or if
1979 * sequence code uses multi-letter symbols
1980 *
1981 *****************************************************************************/
GetSymbolForResidue(SeqCodeTablePtr sctp,Uint1 residue)1982 NLM_EXTERN Uint1 GetSymbolForResidue (SeqCodeTablePtr sctp, Uint1 residue)
1983 {
1984 Uint1 offset;
1985
1986 offset = GetIndexForResidue (sctp, residue);
1987 if (offset == INVALID_RESIDUE) return offset;
1988 if (! sctp->one_letter) return INVALID_RESIDUE;
1989 if (sctp->letters[offset] == '\0') return INVALID_RESIDUE;
1990 return (Uint1)(sctp->letters[offset]);
1991 }
1992
1993 /*****************************************************************************
1994 *
1995 * GetResidueForSymbol(sctp, residue)
1996 * returns the residue for a ONE LETTER if sequence code has one
1997 * letter symbols. returns INVALID_RESIDUE if not a valid symbol or if
1998 * sequence code uses multi-letter symbols
1999 * CASE matters
2000 *
2001 *****************************************************************************/
GetResidueForSymbol(SeqCodeTablePtr sctp,Uint1 symbol)2002 NLM_EXTERN Uint1 GetResidueForSymbol (SeqCodeTablePtr sctp, Uint1 symbol)
2003 {
2004 Int2 ctr;
2005 CharPtr letters;
2006
2007 if (sctp == NULL) return INVALID_RESIDUE;
2008 if (! sctp->one_letter) return INVALID_RESIDUE;
2009
2010 letters = sctp->letters;
2011 for (ctr = 0; ctr < (Int2)sctp->num; ctr++, letters++)
2012 {
2013 if ((Char)symbol == *letters)
2014 return ((Uint1)ctr + sctp->start_at);
2015 }
2016 return INVALID_RESIDUE;
2017 }
2018
2019 /*****************************************************************************
2020 *
2021 * GetLongSymbolForResidue(sctp, symbol)
2022 * returns string symbol for residue if sequence code has string
2023 * symbols. returns NULL if not a valid residue or if
2024 * sequence code uses One letter symbols
2025 *
2026 *****************************************************************************/
GetLongSymbolForResidue(SeqCodeTablePtr sctp,Uint1 residue)2027 NLM_EXTERN const char * GetLongSymbolForResidue (SeqCodeTablePtr sctp, Uint1 residue)
2028 {
2029 Uint1 offset;
2030
2031 offset = GetIndexForResidue (sctp, residue);
2032 if (offset == INVALID_RESIDUE) return NULL;
2033 if (sctp->one_letter) return NULL;
2034
2035 return (const char *)(sctp->symbols[offset]);
2036
2037 }
2038
2039 /*****************************************************************************
2040 *
2041 * GetResidueForLongSymbol(sctp, symbol)
2042 * returns the residue for a STRING symbol if sequence code has string
2043 * symbols. returns INVALID_RESIDUE if not a valid symbol or if
2044 * sequence code uses one-letter symbols
2045 * CASE matters
2046 *
2047 *****************************************************************************/
GetResidueForLongSymbol(SeqCodeTablePtr sctp,CharPtr symbol)2048 NLM_EXTERN Uint1 GetResidueForLongSymbol (SeqCodeTablePtr sctp, CharPtr symbol)
2049 {
2050 Int2 ctr;
2051 CharPtr PNTR symbols;
2052
2053 if ((sctp == NULL) || (symbol == NULL)) return INVALID_RESIDUE;
2054 if (sctp->one_letter) return INVALID_RESIDUE;
2055
2056 symbols = sctp->symbols;
2057 for (ctr = 0; ctr < (Int2)sctp->num; ctr++, symbols++)
2058 {
2059 if (! StringCmp(*symbols, symbol))
2060 return ((Uint1)ctr + sctp->start_at);
2061 }
2062 return INVALID_RESIDUE;
2063 }
2064
2065 /*****************************************************************************
2066 *
2067 * const char * GetNameForResidue (sctp, residue)
2068 * returns the descriptive name (eg. "Leucine") for a residue in the
2069 * sequence code defined by sctp
2070 * returns NULL if not a valid code in the alphabet
2071 * nb: some codes have "holes" in them, regions of values that are
2072 * invalid.
2073 *
2074 *****************************************************************************/
GetNameForResidue(SeqCodeTablePtr sctp,Uint1 residue)2075 NLM_EXTERN const char * GetNameForResidue (SeqCodeTablePtr sctp, Uint1 residue)
2076 {
2077 Uint1 offset;
2078
2079 offset = GetIndexForResidue (sctp, residue);
2080 if (offset == INVALID_RESIDUE) return NULL;
2081
2082 return (const char *)(sctp->names[offset]);
2083
2084 }
2085
2086 /*****************************************************************************
2087 *
2088 * SeqMapTablePtr SeqMapTableFind(to, from)
2089 * Map from sequence code "from" to sequence code "to"
2090 * Sequence codes defined in objseq.h
2091 * For to == ncbi2na initialize Random generator and for
2092 * Seq_code_iupacna --> Seq_code_ncbi2na initialize conversion table
2093 *****************************************************************************/
SeqMapTableFind(Uint1 to,Uint1 from)2094 NLM_EXTERN SeqMapTablePtr LIBCALL SeqMapTableFind (Uint1 to, Uint1 from)
2095 {
2096
2097 /* If we want to convert iupacna to ncbi4na initialize
2098 randomize conversion table */
2099
2100 if(to == Seq_code_ncbi2na) {
2101 /* Nlm_RandomSeed(Nlm_GetSecs()); */
2102
2103 if(from == Seq_code_iupacna && !NaI2InitOk) {
2104 if(!InitNaI2Table())
2105 return NULL;
2106 }
2107 }
2108 return SeqMapTableFindObj (to, from);
2109 }
2110
2111 /*****************************************************************************
2112 *
2113 * void NaI2TableFree(void)
2114 * Free allocated memory for
2115 * Seq_code_iupacna --> Seq_code_ncbi2na transfer
2116 *****************************************************************************/
NaI2TableFree(void)2117 NLM_EXTERN void NaI2TableFree(void)
2118 {
2119 Int4 i;
2120 for(i=0; i < 5; i++)
2121 MemFree(NaI2[i]);
2122 }
2123
2124 /*****************************************************************************
2125 *
2126 * Boolean InitNaI2Table(void)
2127 * Initialize random conversion table for
2128 * Seq_code_iupacna --> Seq_code_ncbi2na transfer
2129 *****************************************************************************/
InitNaI2Table(void)2130 static Boolean InitNaI2Table(void)
2131 {
2132 SeqMapTablePtr smtp;
2133 register Int4 i, j;
2134 Uint1 ch;
2135
2136 /* Initialization of random function by some long value */
2137
2138 if((smtp = SeqMapTableFindObj(Seq_code_iupacna,
2139 Seq_code_ncbi4na)) == NULL)
2140 return FALSE;
2141
2142 for(i = 0; i < 5; i++) {
2143 NaI2[i] = (Int1Ptr) MemNew(256);
2144 MemSet((CharPtr) NaI2[i], -1, 256);
2145 }
2146
2147 MemSet((CharPtr)NaI2Set, -1, sizeof(NaI2Set));
2148
2149 for(i = 0 ; i < 16; i ++) {
2150 NaI2Set[ch = (Uint1)SeqMapTableConvert(smtp, (Uint1)i)] = Na42Set[i];
2151 for(j = 0; j < 5; j++)
2152 NaI2[j][ch] = Na42[i][j];
2153 }
2154 NaI2InitOk = TRUE;
2155 return TRUE;
2156 }
2157
2158 /*****************************************************************************
2159 *
2160 * Convert4NaRandom(from, to)
2161 * Converts Seq_code_ncbi4na "from" to Seq_code_ncbi2na "to"
2162 * with random conversions
2163 * Return TRUE if conversion done without randomization
2164 * Nlm_RandomSeed(Nlm_GetSecs()); recommended in calling function
2165 *****************************************************************************/
Convert4NaRandom(Uint1 from,Uint1 PNTR to)2166 NLM_EXTERN Boolean Convert4NaRandom(Uint1 from, Uint1 PNTR to)
2167 {
2168 Boolean retvalue;
2169
2170 *to = (Uint1) (retvalue = (Na42Set[from] >= 0)) ?
2171 Na42Set[from] : CONVERT_42_RAND(from);
2172 return retvalue;
2173 }
2174
2175 /*****************************************************************************
2176 *
2177 * SeqMapTableConvert(smtp, from)
2178 * returns conversion of "from" using SeqMapTable smtp
2179 * To to == Seq_code_ncbi2na use random conversion table
2180 *
2181 *****************************************************************************/
SeqMapTableConvert(SeqMapTablePtr smtp,Uint1 from)2182 NLM_EXTERN Uint1 SeqMapTableConvert (SeqMapTablePtr smtp, Uint1 from)
2183
2184 {
2185 Int2 index;
2186
2187 if (smtp == NULL) return (Uint1)(INVALID_RESIDUE);
2188
2189 /* For conversions into ncbi2na encoding we will use randomized
2190 generation of residues */
2191
2192 if(smtp->to == Seq_code_ncbi2na) {
2193 if(smtp->from == Seq_code_ncbi4na)
2194 return (Uint1) (Na42Set[from] < 0) ?
2195 CONVERT_42_RAND(from) : Na42Set[from];
2196 else if(smtp->from == Seq_code_iupacna)
2197 return (Uint1) (NaI2Set[from] < 0) ?
2198 CONVERT_I2_RAND(from) : NaI2Set[from];
2199 }
2200
2201 /* This will handle all other cases */
2202
2203 index = (Int2)from - (Int2)(smtp->start_at);
2204 if ((index >= 0) && (index < (Int2)(smtp->num)))
2205 return (Uint1)(smtp->table[index]);
2206 else
2207 return (Uint1)(INVALID_RESIDUE);
2208 }
2209
2210 /*****************************************************************************
2211 *
2212 * SeqCodeTableComp(sctp, residue)
2213 * returns complement of residue if possible
2214 * or residue, if not
2215 * assumes residue is in the same code as sctp
2216 *
2217 *****************************************************************************/
SeqCodeTableComp(SeqCodeTablePtr sctp,Uint1 residue)2218 NLM_EXTERN Uint1 SeqCodeTableComp (SeqCodeTablePtr sctp, Uint1 residue)
2219
2220 {
2221 Int2 index;
2222
2223 if ((sctp == NULL) || (sctp->comps == NULL)) /* no complement table */
2224 return INVALID_RESIDUE;
2225
2226 index = (Int2)residue - (Int2)(sctp->start_at);
2227 if ((index < 0 ) || (index >= (Int2)(sctp->num)))
2228 return INVALID_RESIDUE;
2229 else
2230 return sctp->comps[index];
2231 }
2232
2233 /*****************************************************************************
2234 *
2235 * SeqEntryList(sep, mydata, mycallback, index, indent)
2236 * traverses all Seq-entry nodes beginning with sep
2237 * calls mycallback() at each node
2238 *
2239 *****************************************************************************/
SeqEntryList(SeqEntryPtr sep,Pointer mydata,SeqEntryFunc mycallback,Int4 index,Int2 indent)2240 NLM_EXTERN Int4 SeqEntryList (SeqEntryPtr sep, Pointer mydata, SeqEntryFunc mycallback, Int4 index, Int2 indent)
2241
2242 {
2243 if (sep == NULL)
2244 return index;
2245
2246 if (mycallback != NULL)
2247 (*mycallback)(sep, mydata, index, indent);
2248 index++;
2249
2250 if (IS_Bioseq(sep)) /* bioseq, no contained sequences */
2251 return index;
2252
2253 sep = ((BioseqSetPtr)sep->data.ptrvalue)->seq_set;
2254 indent++;
2255 while (sep != NULL)
2256 {
2257 index = SeqEntryList(sep, mydata, mycallback, index, indent);
2258 sep = sep->next;
2259 }
2260 return index;
2261 }
2262
2263 /*****************************************************************************
2264 *
2265 * BioseqList(sep, mydata, mycallback, index, indent)
2266 * traverses all Seq-entry nodes beginning with sep
2267 * calls mycallback() at each node that is a Bioseq
2268 * Does NOT enter BioseqSets of _class "parts"
2269 * Does NOT increment indent
2270 *
2271 *****************************************************************************/
BioseqList(SeqEntryPtr sep,Pointer mydata,SeqEntryFunc mycallback,Int4 index,Int2 indent)2272 NLM_EXTERN Int4 BioseqList (SeqEntryPtr sep, Pointer mydata, SeqEntryFunc mycallback, Int4 index, Int2 indent)
2273
2274 {
2275 if (sep == NULL)
2276 return index;
2277
2278 if (IS_Bioseq(sep)) /* bioseq, no contained sequences */
2279 {
2280 if (mycallback != NULL)
2281 (*mycallback)(sep, mydata, index, indent);
2282 return index+1;
2283 }
2284
2285 if (Bioseq_set_class(sep) == 4) /* parts, do not enter */
2286 return index;
2287
2288 sep = ((BioseqSetPtr)sep->data.ptrvalue)->seq_set;
2289 while (sep != NULL)
2290 {
2291 index = BioseqList(sep, mydata, mycallback, index, indent);
2292 sep = sep->next;
2293 }
2294 return index;
2295 }
2296
2297 /*****************************************************************************
2298 *
2299 * SeqEntryGetSeqDescr(sep, type, curr)
2300 * returns pointer to the next SeqDescr of this type
2301 * type gives type of Seq-descr
2302 * if 0, gives all types
2303 * curr is NULL or previous node of this type found
2304 *
2305 *****************************************************************************/
SeqEntryGetSeqDescr(SeqEntryPtr sep,Int2 type,ValNodePtr curr)2306 NLM_EXTERN ValNodePtr SeqEntryGetSeqDescr (SeqEntryPtr sep, Int2 type, ValNodePtr curr) /* the last one you used */
2307
2308 {
2309
2310 if (sep == NULL) return NULL;
2311
2312 if (curr == NULL)
2313 {
2314 if (IS_Bioseq(sep))
2315 curr = ((BioseqPtr)sep->data.ptrvalue)->descr;
2316 else
2317 curr = ((BioseqSetPtr)sep->data.ptrvalue)->descr;
2318 }
2319 else
2320 curr = curr->next; /* move past last one */
2321
2322 while (curr != NULL)
2323 {
2324 if ((! type) || ((Int2)curr->choice == type))
2325 return curr;
2326 else
2327 curr = curr->next;
2328 }
2329 return NULL;
2330 }
2331 /*****************************************************************************
2332 *
2333 * SeqEntryGetTitle(sep)
2334 * returns pointer to the first title of this SeqEntry
2335 *
2336 *****************************************************************************/
SeqEntryGetTitle(SeqEntryPtr sep)2337 NLM_EXTERN CharPtr SeqEntryGetTitle (SeqEntryPtr sep)
2338
2339 {
2340 ValNodePtr ptr;
2341
2342 ptr = SeqEntryGetSeqDescr(sep, Seq_descr_title, NULL);
2343 if (ptr != NULL)
2344 return (CharPtr)ptr->data.ptrvalue;
2345 else
2346 return NULL;
2347 }
2348
2349 /*****************************************************************************
2350 *
2351 * Bioseq_set_class (SeqEntryPtr sep)
2352 * returns class of set as is enumerated in ASN.1 spec
2353 * returns 0 if not a Bioseq-set
2354 *
2355 *****************************************************************************/
Bioseq_set_class(SeqEntryPtr sep)2356 NLM_EXTERN Uint1 Bioseq_set_class (SeqEntryPtr sep)
2357
2358 {
2359 if (sep == NULL) return 0;
2360
2361 if (IS_Bioseq_set(sep))
2362 return ((BioseqSetPtr)sep->data.ptrvalue)->_class;
2363 else
2364 return 0;
2365 }
2366
2367 /*****************************************************************************
2368 *
2369 * SeqEntryDoConvert(sep, newcode, index, indent)
2370 * converts a seqentry which is a raw bioseq to newcode
2371 * callback used by SeqEntryConvert()
2372 *
2373 *****************************************************************************/
2374 NLM_EXTERN void SeqEntryDoConvert (SeqEntryPtr sep, Pointer data, Int4 index, Int2 indent);
SeqEntryDoConvert(SeqEntryPtr sep,Pointer data,Int4 index,Int2 indent)2375 NLM_EXTERN void SeqEntryDoConvert (SeqEntryPtr sep, Pointer data, Int4 index, Int2 indent)
2376
2377 {
2378 if (! IS_Bioseq(sep))
2379 return;
2380
2381 if (((Uint1Ptr)data)[0] != 0)
2382 {
2383 if (BioseqConvert((BioseqPtr)sep->data.ptrvalue, * ((Uint1Ptr)data)))
2384 ((Uint1Ptr)data)[1]++;
2385 }
2386 else
2387 {
2388 if (BioseqPack((BioseqPtr)sep->data.ptrvalue))
2389 ((Uint1Ptr)data)[1]++;
2390 }
2391 return;
2392 }
2393
2394 /*****************************************************************************
2395 *
2396 * SeqEntryConvert(sep, newcode)
2397 * converts any seqentry to newcode
2398 * if (newcode == 0)
2399 * calls BioseqRawPack instead of BioseqRawConvert
2400 *
2401 *****************************************************************************/
SeqEntryConvert(SeqEntryPtr sep,Uint1 newcode)2402 NLM_EXTERN Boolean SeqEntryConvert (SeqEntryPtr sep, Uint1 newcode)
2403
2404 {
2405 Uint1 tbuf[2];
2406 tbuf[0] = newcode;
2407 tbuf[1] = 0;
2408
2409 if (sep == NULL) return FALSE;
2410
2411 SeqEntryExplore(sep, (Pointer)tbuf, SeqEntryDoConvert);
2412 if (tbuf[1])
2413 return TRUE; /* at least one success */
2414 else
2415 return FALSE;
2416 }
2417
2418 /*****************************************************************************
2419 *
2420 * SeqIdBestRank(buf, num)
2421 * fill buf of length num with std ranks used by SeqIdFindBest
2422 * returns full length of list (useful if num is too small)
2423 * std ranks always between 50 and 100
2424 * rank < 50 guarantees SeqIdSelect() chooses over std rank
2425 * rank > 100 guarantees SeqIdSelect() never chooses over std rank
2426 * rank = 255 guarantees SeqIdSelect() never choses
2427 * if buf == NULL, just returns count of supported Seq-ids
2428 *
2429 *****************************************************************************/
SeqIdBestRank(Uint1Ptr buf,Int2 num)2430 NLM_EXTERN Int2 SeqIdBestRank (Uint1Ptr buf, Int2 num)
2431 {
2432 static Uint1 std_order[NUM_SEQID] = {
2433 83, /* 0 = not set */
2434 80, /* 1 = local Object-id */
2435 70, /* 2 = gibbsq */
2436 70, /* 3 = gibbmt */
2437 70, /* 4 = giim Giimport-id */
2438 60, /* 5 = genbank */
2439 60, /* 6 = embl */
2440 60, /* 7 = pir */
2441 60, /* 8 = swissprot */
2442 67, /* 9 = patent */
2443 65, /* 10 = other TextSeqId */
2444 80, /* 11 = general Dbtag */
2445 51, /* 12 = gi */
2446 60, /* 13 = ddbj */
2447 60, /* 14 = prf */
2448 60, /* 15 = pdb */
2449 60, /* 16 = tpg */
2450 60, /* 17 = tpe */
2451 60, /* 18 = tpd */
2452 68, /* 19 = gpp */
2453 69 /* 20 = nat */
2454 };
2455
2456 if (buf == NULL) return NUM_SEQID;
2457
2458 if (num > NUM_SEQID)
2459 num = NUM_SEQID;
2460 MemCopy(buf, std_order, (size_t)(num * sizeof(Uint1)));
2461 return NUM_SEQID;
2462 }
2463
2464 /*****************************************************************************
2465 *
2466 * SeqIdFindBest(sip)
2467 * Find the most reliable SeqId in a chain
2468 *
2469 *****************************************************************************/
SeqIdFindBest(SeqIdPtr sip,Uint1 target)2470 NLM_EXTERN SeqIdPtr SeqIdFindBest (SeqIdPtr sip, Uint1 target)
2471 {
2472 Uint1 order[NUM_SEQID];
2473
2474 if (sip == NULL)
2475 return NULL;
2476
2477 SeqIdBestRank(order, NUM_SEQID);
2478 if ((target > 0) && (target < NUM_SEQID))
2479 order[target] = 0; /* select target */
2480 else if (target >= NUM_SEQID)
2481 ErrPostEx(SEV_ERROR, 0, 0, "SeqIdFindBest: target [%d] out of range [%d]",
2482 (int)target, (int)NUM_SEQID);
2483
2484 return SeqIdSelect (sip, order, NUM_SEQID);
2485 }
2486 /*****************************************************************************
2487 *
2488 * SeqIdFindBestAccn(sip)
2489 * Find the most reliable Accession SeqId in a chain
2490 * else returns gi;
2491 *
2492 *****************************************************************************/
SeqIdFindBestAccession(SeqIdPtr sip)2493 NLM_EXTERN SeqIdPtr SeqIdFindBestAccession (SeqIdPtr sip)
2494 {
2495 Uint1 order[NUM_SEQID];
2496
2497 if (sip == NULL)
2498 return NULL;
2499 SeqIdBestRank(order, NUM_SEQID);
2500 order[SEQID_GI]=order[SEQID_LOCAL]+1;
2501 return SeqIdSelect (sip, order, NUM_SEQID);
2502 }
2503
2504 /*****************************************************************************
2505 *
2506 * SeqIdPtr SeqIdLocate (sip, order, num)
2507 * Given a SeqId (sip):
2508 * Locates the Bioseq in memory or cached
2509 * Then calls SeqIdSelect with the Bioseq.id chain to find the
2510 * SeqId type you want.
2511 *
2512 *****************************************************************************/
SeqIdLocate(SeqIdPtr sip,Uint1Ptr order,Int2 num)2513 NLM_EXTERN SeqIdPtr SeqIdLocate (SeqIdPtr sip, Uint1Ptr order, Int2 num)
2514 {
2515 BioseqPtr bsp;
2516 SeqIdPtr res = NULL;
2517 Boolean locked = FALSE;
2518
2519 bsp = BioseqFindCore(sip);
2520 if (bsp == NULL)
2521 {
2522 bsp = BioseqLockById(sip);
2523 if (bsp != NULL)
2524 locked = TRUE;
2525 else
2526 return res;
2527 }
2528 res = SeqIdSelect(bsp->id, order, num);
2529 if (locked)
2530 BioseqUnlock(bsp);
2531 return res;
2532 }
2533
2534 /*****************************************************************************
2535 *
2536 * SeqIdPtr SeqIdSelect (sip, order, num)
2537 * takes an array (order) num long.
2538 * goes down chain starting with sip.
2539 * finds lowest value of order[sip->choice] and returns it.
2540 * if order[] == 255, it is skipped.
2541 * if nothing is found < 255, NULL is returned
2542 * ErrorMessage if sip->choice >= num
2543 *
2544 *****************************************************************************/
SeqIdSelect(SeqIdPtr sip,Uint1Ptr order,Int2 num)2545 NLM_EXTERN SeqIdPtr SeqIdSelect (SeqIdPtr sip, Uint1Ptr order, Int2 num)
2546 {
2547 SeqIdPtr bestid;
2548
2549 if ((sip == NULL) || (order == NULL))
2550 return NULL;
2551
2552 for ( bestid = NULL; sip != NULL; sip = sip -> next)
2553 {
2554 if ((Int2)sip->choice < num)
2555 {
2556 if (order[sip->choice] < 255)
2557 {
2558 if (bestid == NULL)
2559 bestid = sip;
2560 else if (order[sip->choice] < order[bestid->choice])
2561 bestid = sip;
2562 }
2563 } else {
2564 ErrPostEx(SEV_ERROR, 0,0, "SeqIdSelect: choice [%d] out of range [%d]",
2565 (int)(sip->choice), (int)num);
2566 if(sip->choice > NUM_SEQID) /*** something is really wrong ***/
2567 return NULL;
2568 }
2569 }
2570
2571 return bestid;
2572 }
2573
2574 static char * delim = "|";
2575 static char * txtid [NUM_SEQID] = { /* FASTA_LONG formats */
2576 "???" , /* not-set = ??? */
2577 "lcl", /* local = lcl|integer or string */
2578 "bbs", /* gibbsq = bbs|integer */
2579 "bbm", /* gibbmt = bbm|integer */
2580 "gim", /* giim = gim|integer */
2581 "gb", /* genbank = gb|accession|locus */
2582 "emb", /* embl = emb|accession|locus */
2583 "pir", /* pir = pir|accession|name */
2584 "sp", /* swissprot = sp|accession|name */
2585 "pat", /* patent = pat|country|patent number (string)|seq number (integer) - use pgp for pre-grant pub */
2586 "ref", /* other = ref|accession|name|release - changed from oth to ref */
2587 "gnl", /* general = gnl|database(string)|id (string or number) */
2588 "gi", /* gi = gi|integer */
2589 "dbj", /* ddbj = dbj|accession|locus */
2590 "prf", /* prf = prf|accession|name */
2591 "pdb", /* pdb = pdb|entry name (string)|chain id (char) */
2592 "tpg", /* tpg = tpg|accession|name */
2593 "tpe", /* tpe = tpe|accession|name */
2594 "tpd", /* tpd = tpd|accession|name */
2595 "gpp", /* gpp = gpp|accession|name */
2596 "nat"}; /* nat = nat|accession|name */
2597
2598 /*****************************************************************************
2599 *
2600 * SeqIdPrint(sip, buf, format)
2601 * PRINTID_FASTA_LONG treats sip as a chain, printing gi|other id
2602 * other id is as given in the comments for txtid. Empty fields
2603 * do not eliminate | delimiters
2604 * PRINTID_FASTA_SHORT prints only the sip.
2605 * same format as FASTA_LONG (for other id)
2606 *
2607 * PRINTID_TEXTID_LOCUS or ACCESSION
2608 * --------------------------------------------------------
2609 * | OLDWAY: |
2610 * | TextSeqId types- fills request or first char in |
2611 * | buffer \0 if cannot be filled |
2612 * | gibbmt, gibbsq = fills with _M or _S [number] |
2613 * | other types- fills in as FASTA_SHORT |
2614 * --------------------------------------------------------
2615 * CURRENTLY:
2616 * for SEQID_GENBANK,SEQID_EMBL,SEQID_DDBJ, takes accession
2617 * or locus field; for SEQID_LOCAL, takes str
2618 * as accession only
2619 * ALL others as FASTA_SHORT
2620 *
2621 * PRINTID_REPORT- similar to FASTA_SHORT but removes extra optional
2622 * fields and | to make more human readable (but less parseable)
2623 *
2624 * if format is in the range ' ' to 127 (32-12y) ASCII, then the character
2625 * given is used as a separator instead of '|' and the format is
2626 * PRINTID_FASTA_SHORT. 127 is translated as TAB (ASCII 9)
2627 * This makes this function flexible for bulk
2628 * data processing. Note that this invalidates SeqIdParse() and may create
2629 * conflicts with names. Use with caution.
2630 *
2631 * return value points to \0 at end of buf
2632 *
2633 *****************************************************************************/
SeqIdPrint(SeqIdPtr isip,CharPtr buf,Uint1 format)2634 NLM_EXTERN CharPtr SeqIdPrint (SeqIdPtr isip, CharPtr buf, Uint1 format)
2635
2636 {
2637 return SeqIdWrite (isip, buf, format, 255); /* no knowledge of buffer size */
2638 }
2639
2640 /*****************************************************************************
2641 *
2642 * SeqIdWrite (isip, buf, format, buflen)
2643 * Similar to SeqIdPrint, has additional argument buflen,
2644 * checks the buflen, writes up to buflen chars,
2645 * makes the last character '>'
2646 * always puts one '\0' to terminate the string in buf
2647 * buf MUST be one character longer than buflen to leave room for the
2648 * last '\0'
2649 *
2650 *****************************************************************************/
SeqIdWrite(SeqIdPtr isip,CharPtr buf,Uint1 format,Uint4 buflen)2651 NLM_EXTERN CharPtr SeqIdWrite (SeqIdPtr isip, CharPtr buf, Uint1 format, Uint4 buflen)
2652
2653 {
2654 SeqIdPtr sip;
2655 char localbuf[32]; /* for MS Windows */
2656 char *ldelim;
2657 char d [2];
2658 CharPtr tmp;
2659 static Uint1 fasta_order[NUM_SEQID] = { /* order for other id FASTA_LONG */
2660 33, /* 0 = not set */
2661 20, /* 1 = local Object-id */
2662 15, /* 2 = gibbsq */
2663 16, /* 3 = gibbmt */
2664 30, /* 4 = giim Giimport-id */
2665 10, /* 5 = genbank */
2666 10, /* 6 = embl */
2667 10, /* 7 = pir */
2668 10, /* 8 = swissprot */
2669 15, /* 9 = patent */
2670 10, /* 10 = other = refseq */
2671 13, /* 11 = general Dbtag */
2672 255, /* 12 = gi */
2673 10, /* 13 = ddbj */
2674 10, /* 14 = prf */
2675 12, /* 15 = pdb */
2676 10, /* 16 = tpg */
2677 10, /* 17 = tpe */
2678 10, /* 18 = tpd */
2679 15, /* 19 = gpp */
2680 15 /* 20 = nat */
2681 };
2682 static Uint1 tmsmart_order[NUM_SEQID] = { /* order for other id FASTA_LONG */
2683 33, /* 0 = not set */
2684 20, /* 1 = local Object-id */
2685 15, /* 2 = gibbsq */
2686 16, /* 3 = gibbmt */
2687 30, /* 4 = giim Giimport-id */
2688 10, /* 5 = genbank */
2689 10, /* 6 = embl */
2690 10, /* 7 = pir */
2691 10, /* 8 = swissprot */
2692 15, /* 9 = patent */
2693 10, /* 10 = other = refseq */
2694 29, /* 11 = general Dbtag */
2695 255, /* 12 = gi */
2696 10, /* 13 = ddbj */
2697 10, /* 14 = prf */
2698 12, /* 15 = pdb */
2699 10, /* 16 = tpg */
2700 10, /* 17 = tpe */
2701 10, /* 18 = tpd */
2702 15, /* 19 = gpp */
2703 15 /* 20 = nat */
2704 };
2705 static Uint1 general_order[NUM_SEQID] = { /* order for other id FASTA_LONG */
2706 33, /* 0 = not set */
2707 20, /* 1 = local Object-id */
2708 15, /* 2 = gibbsq */
2709 16, /* 3 = gibbmt */
2710 30, /* 4 = giim Giimport-id */
2711 10, /* 5 = genbank */
2712 10, /* 6 = embl */
2713 10, /* 7 = pir */
2714 10, /* 8 = swissprot */
2715 15, /* 9 = patent */
2716 10, /* 10 = other = refseq */
2717 12, /* 11 = general Dbtag */
2718 255, /* 12 = gi */
2719 10, /* 13 = ddbj */
2720 10, /* 14 = prf */
2721 12, /* 15 = pdb */
2722 10, /* 16 = tpg */
2723 10, /* 17 = tpe */
2724 10, /* 18 = tpd */
2725 15, /* 19 = gpp */
2726 15 /* 20 = nat */
2727 };
2728 Boolean useGeneral = FALSE;
2729 TextSeqIdPtr tsip;
2730 PDBSeqIdPtr psip;
2731 ObjectIdPtr oip;
2732 PatentSeqIdPtr patsip;
2733 IdPatPtr ipp;
2734 Boolean got_gi = FALSE;
2735 Boolean got_tmsmart = FALSE;
2736 Boolean is_us_pre_grant = FALSE;
2737 DbtagPtr dbt;
2738 Char chainbuf[3];
2739 Char versionbuf[10];
2740 Int2 version = 0;
2741 CharPtr release = NULL;
2742
2743 buf[0] = '\0';
2744 buflen--;
2745 tmp = buf;
2746 if (isip == NULL)
2747 return tmp;
2748
2749 d [0] = *delim;
2750 d [1] = '\0';
2751 ldelim = &(d [0]);
2752 if ((format >= ' ') && (format <= 127)) /* change delimiter */
2753 {
2754 if (format == 127)
2755 d [0] = '\t';
2756 else
2757 d [0] = (char) format;
2758 format = PRINTID_FASTA_SHORT;
2759 }
2760
2761 if (format == PRINTID_FASTA_GENERAL) {
2762 useGeneral = TRUE;
2763 format = PRINTID_FASTA_LONG;
2764 }
2765
2766 if (format == PRINTID_FASTA_ALL) {
2767 Char allbuf [41];
2768 ValNodePtr vnp, head = NULL;
2769 size_t len = 0;
2770 CharPtr str;
2771 Boolean notfirst;
2772
2773 for (sip = isip; sip != NULL; sip = sip->next) {
2774 SeqIdWrite (sip, allbuf, PRINTID_FASTA_SHORT, sizeof (allbuf) - 1);
2775 ValNodeCopyStr (&head, 0, allbuf);
2776 }
2777 for (vnp = head; vnp != NULL; vnp = vnp->next) {
2778 str = (CharPtr) vnp->data.ptrvalue;
2779 if (! StringHasNoText (str)) {
2780 len += StringLen (str) + 1;
2781 }
2782 }
2783 if (len < 1) return buf;
2784 tmp = MemNew (len + 2);
2785 if (tmp == NULL) return buf;
2786 notfirst = FALSE;
2787 for (vnp = head; vnp != NULL; vnp = vnp->next) {
2788 str = (CharPtr) vnp->data.ptrvalue;
2789 if (! StringHasNoText (str)) {
2790 if (notfirst) {
2791 StringCat (tmp, "|");
2792 }
2793 StringCat (tmp, str);
2794 notfirst = TRUE;
2795 }
2796 }
2797 ValNodeFreeData (head);
2798 StringNCpy_0 (buf, tmp, buflen + 1);
2799 MemFree (tmp);
2800 return buf;
2801 }
2802
2803 localbuf[0] = '\0';
2804 /* error on input, return ??? */
2805 if ( (! (isip -> choice)) || (format < PRINTID_FASTA_SHORT)
2806 || (format > PRINTID_REPORT))
2807 {
2808 Nlm_LabelCopyNext(&tmp, txtid[0], &buflen);
2809 return tmp;
2810 }
2811
2812 if (format == PRINTID_FASTA_LONG) /* find the ids in the chain */
2813 {
2814 for (sip = isip; sip != NULL; sip = sip->next) /* GI present? */
2815 {
2816 if (sip->choice == SEQID_GI)
2817 {
2818 sprintf(localbuf, "%s%s%lld", txtid[SEQID_GI], ldelim,
2819 (long long)(sip->data.intvalue));
2820 Nlm_LabelCopyNext(&tmp, localbuf, &buflen);
2821 got_gi = TRUE;
2822 } else if (sip->choice == SEQID_GENERAL) {
2823 dbt = (DbtagPtr) sip->data.ptrvalue;
2824 if (dbt != NULL && StringICmp (dbt->db, "TMSMART") == 0) {
2825 got_tmsmart = TRUE;
2826 }
2827 } else if (sip->choice == SEQID_PATENT) {
2828 patsip = (PatentSeqIdPtr) sip->data.ptrvalue;
2829 if (patsip != NULL) {
2830 ipp = patsip->cit;
2831 if (ipp != NULL && StringDoesHaveText (ipp->app_number)) {
2832 is_us_pre_grant = TRUE;
2833 }
2834 }
2835 }
2836 }
2837 if (useGeneral) {
2838 sip = SeqIdSelect(isip, general_order, NUM_SEQID);
2839 } else if (got_tmsmart) {
2840 sip = SeqIdSelect(isip, tmsmart_order, NUM_SEQID);
2841 } else {
2842 sip = SeqIdSelect(isip, fasta_order, NUM_SEQID);
2843 }
2844 if (sip == NULL) /* only GI */
2845 return tmp;
2846 else if (got_gi)
2847 {
2848 if (sip->choice == SEQID_GIIM) /* don't show GIIM with GI */
2849 return tmp;
2850
2851 Nlm_LabelCopyNext(&tmp, ldelim, &buflen);
2852 }
2853 format = PRINTID_FASTA_SHORT; /* put on second (or only) SeqId in this format */
2854 }
2855 else {
2856 sip = isip; /* only one id processed */
2857 if (sip != NULL && sip->choice == SEQID_PATENT) {
2858 patsip = (PatentSeqIdPtr) sip->data.ptrvalue;
2859 if (patsip != NULL) {
2860 ipp = patsip->cit;
2861 if (ipp != NULL && StringDoesHaveText (ipp->app_number)) {
2862 is_us_pre_grant = TRUE;
2863 }
2864 }
2865 }
2866 }
2867
2868 /* deal with LOCUS and ACCESSION */
2869 if ((format == PRINTID_TEXTID_ACCESSION) || (format == PRINTID_TEXTID_LOCUS) ||
2870 (format == PRINTID_TEXTID_ACC_VER) || (format == PRINTID_TEXTID_ACC_ONLY))
2871 {
2872 if (format == PRINTID_TEXTID_ACCESSION) {
2873 format = PRINTID_TEXTID_ACC_ONLY; /* current default */
2874 }
2875 switch (sip->choice) /* get the real TextSeqId types */
2876 {
2877 case SEQID_GENBANK:
2878 case SEQID_EMBL:
2879 case SEQID_DDBJ:
2880 case SEQID_PIR:
2881 case SEQID_SWISSPROT:
2882 case SEQID_PRF:
2883 case SEQID_OTHER:
2884 case SEQID_TPG:
2885 case SEQID_TPE:
2886 case SEQID_TPD:
2887 case SEQID_GPIPE:
2888 case SEQID_NAMED_ANNOT_TRACK:
2889 tsip = (TextSeqIdPtr)sip->data.ptrvalue;
2890 release = tsip->release;
2891 if (sip->choice == SEQID_SWISSPROT) {
2892 release = NULL;
2893 }
2894 if ((format == PRINTID_TEXTID_LOCUS) && (tsip->name != NULL)) {
2895 Nlm_LabelCopyNext(&tmp, tsip->name, &buflen);
2896 return tmp;
2897 } else if ((format == PRINTID_TEXTID_ACC_ONLY || format == PRINTID_TEXTID_LOCUS)
2898 && (tsip->accession != NULL)) {
2899 Nlm_LabelCopyNext(&tmp, tsip->accession, &buflen);
2900 return tmp;
2901 } else if ((format == PRINTID_TEXTID_ACC_VER)
2902 && (tsip->accession != NULL)) {
2903 if (tsip->version > 0 && release == NULL) {
2904 sprintf(localbuf, "%s.%d", tsip->accession,
2905 (int)(tsip->version));
2906 } else {
2907 sprintf(localbuf, "%s", tsip->accession);
2908 }
2909 Nlm_LabelCopyNext(&tmp, localbuf, &buflen);
2910 return tmp;
2911 }
2912 break;
2913 default:
2914 break;
2915 }
2916 }
2917
2918 if (format == PRINTID_FASTA_SHORT)
2919 {
2920 if (sip->choice == SEQID_PATENT && is_us_pre_grant) {
2921 Nlm_LabelCopyNext(&tmp, "pgp", &buflen);
2922 } else if (sip->choice == SEQID_SWISSPROT) {
2923 tsip = (TextSeqIdPtr)sip->data.ptrvalue;
2924 if (tsip->release && StringCmp(tsip->release, "unreviewed") == 0)
2925 Nlm_LabelCopyNext(&tmp, "tr", &buflen);
2926 else
2927 Nlm_LabelCopyNext(&tmp, txtid[sip->choice], &buflen);
2928 } else {
2929 Nlm_LabelCopyNext(&tmp, txtid[sip->choice], &buflen);
2930 }
2931 Nlm_LabelCopyNext(&tmp, ldelim, &buflen);
2932 }
2933
2934 switch (sip->choice)
2935 {
2936 case SEQID_LOCAL: /* object id */
2937 if ((((ObjectIdPtr)sip->data.ptrvalue)->str) == NULL)
2938 {
2939 sprintf(localbuf, "%ld",
2940 (long)((ObjectIdPtr)sip->data.ptrvalue)->id);
2941 Nlm_LabelCopyNext(&tmp, localbuf, &buflen);
2942 }
2943 else
2944 Nlm_LabelCopyNext(&tmp,
2945 ((ObjectIdPtr)sip->data.ptrvalue)->str, &buflen);
2946 break;
2947 case SEQID_GIBBSQ:
2948 case SEQID_GIBBMT:
2949 sprintf(localbuf, "%ld", (long)sip->data.intvalue);
2950 Nlm_LabelCopyNext(&tmp, localbuf, &buflen);
2951 break;
2952 case SEQID_GI:
2953 sprintf(localbuf, "%lld", (long long)sip->data.intvalue);
2954 Nlm_LabelCopyNext(&tmp, localbuf, &buflen);
2955 break;
2956 case SEQID_GIIM:
2957 sprintf(localbuf, "%ld", (long)((GiimPtr)sip->data.ptrvalue)->id);
2958 Nlm_LabelCopyNext(&tmp, localbuf, &buflen);
2959 break;
2960 case SEQID_GENBANK:
2961 case SEQID_EMBL:
2962 case SEQID_DDBJ:
2963 case SEQID_OTHER:
2964 case SEQID_TPG:
2965 case SEQID_TPE:
2966 case SEQID_TPD:
2967 case SEQID_GPIPE:
2968 case SEQID_NAMED_ANNOT_TRACK:
2969 case SEQID_SWISSPROT:
2970 tsip = (TextSeqIdPtr)(sip->data.ptrvalue);
2971 release = tsip->release;
2972 if (sip->choice == SEQID_SWISSPROT) {
2973 release = NULL;
2974 }
2975 if ((tsip->version > 0) && (release == NULL) && SHOWVERSION)
2976 version = tsip->version; /* show versions */
2977 sprintf(versionbuf, ".%d", (int)version);
2978 case SEQID_PIR:
2979 case SEQID_PRF:
2980 tsip = (TextSeqIdPtr)sip->data.ptrvalue;
2981 if (tsip->accession != NULL)
2982 {
2983 Nlm_LabelCopyNext(&tmp, tsip->accession, &buflen);
2984 if (version)
2985 Nlm_LabelCopyNext(&tmp, versionbuf,&buflen);
2986 if (format != PRINTID_FASTA_SHORT)
2987 break;
2988 }
2989 if (format == PRINTID_FASTA_SHORT)
2990 Nlm_LabelCopyNext(&tmp, ldelim, &buflen);
2991 if (tsip->name != NULL)
2992 Nlm_LabelCopyNext(&tmp, tsip->name, &buflen);
2993 /*
2994 if (sip->choice == SEQID_OTHER) {
2995 Nlm_LabelCopyNext(&tmp, ldelim, &buflen);
2996 if (tsip->release != NULL)
2997 Nlm_LabelCopyNext(&tmp, tsip->release, &buflen);
2998 }
2999 */
3000 break;
3001 case SEQID_PATENT:
3002 patsip = (PatentSeqIdPtr)(sip->data.ptrvalue);
3003 Nlm_LabelCopyNext(&tmp, patsip->cit->country, &buflen);
3004 if (format == PRINTID_FASTA_SHORT)
3005 Nlm_LabelCopyNext(&tmp, ldelim, &buflen);
3006 if (is_us_pre_grant) {
3007 Nlm_LabelCopyNext(&tmp, patsip->cit->app_number, &buflen);
3008 } else {
3009 Nlm_LabelCopyNext(&tmp, patsip->cit->number, &buflen);
3010 }
3011 if (format == PRINTID_FASTA_SHORT)
3012 Nlm_LabelCopyNext(&tmp, ldelim, &buflen);
3013 else
3014 Nlm_LabelCopyNext(&tmp, "_", &buflen);
3015 sprintf(localbuf, "%d", (int)patsip->seqid);
3016 Nlm_LabelCopyNext(&tmp, localbuf, &buflen);
3017 break;
3018 case SEQID_GENERAL:
3019 oip = ((DbtagPtr)sip->data.ptrvalue)->tag;
3020 if((format == PRINTID_FASTA_SHORT) || (format == PRINTID_REPORT))
3021 Nlm_LabelCopyNext(&tmp,
3022 ((DbtagPtr)sip->data.ptrvalue)->db, &buflen);
3023 if (format == PRINTID_FASTA_SHORT)
3024 Nlm_LabelCopyNext(&tmp, ldelim, &buflen);
3025 else if (format == PRINTID_REPORT)
3026 Nlm_LabelCopyNext(&tmp, ":", &buflen);
3027
3028 if (oip->str == NULL)
3029 {
3030 sprintf(localbuf, "%ld", (long) oip->id);
3031 Nlm_LabelCopyNext(&tmp, localbuf, &buflen);
3032 }
3033 else
3034 Nlm_LabelCopyNext(&tmp, oip->str, &buflen);
3035 break;
3036 case SEQID_PDB:
3037 psip = (PDBSeqIdPtr) sip->data.ptrvalue;
3038 chainbuf[0] = TO_UPPER (psip->chain);
3039 chainbuf[1] = '\0';
3040 chainbuf[2] = '\0';
3041 if (IS_LOWER (psip->chain)) {
3042 chainbuf[1] = chainbuf [0];
3043 }
3044 Nlm_LabelCopyNext(&tmp, psip->mol, &buflen);
3045 if (format == PRINTID_FASTA_SHORT)
3046 {
3047 Nlm_LabelCopyNext(&tmp, ldelim, &buflen);
3048 if (chainbuf[0] == '|') /* special */
3049 Nlm_LabelCopyNext(&tmp, "VB",&buflen);
3050 else if (chainbuf[0] != '\0')
3051 Nlm_LabelCopyNext(&tmp,chainbuf, &buflen);
3052 else
3053 Nlm_LabelCopyNext(&tmp, " ", &buflen);
3054 }
3055 else if (psip->chain > ' ')
3056 {
3057 Nlm_LabelCopyNext(&tmp, "_", &buflen);
3058 Nlm_LabelCopyNext(&tmp,chainbuf, &buflen);
3059 }
3060 break;
3061 default:
3062 Nlm_LabelCopyNext(&tmp, txtid[0], &buflen);
3063 break;
3064
3065 }
3066 return tmp;
3067 }
3068
3069
SeqIdLabelLen(SeqIdPtr isip,Uint1 format)3070 NLM_EXTERN Int4 SeqIdLabelLen (SeqIdPtr isip, Uint1 format)
3071
3072 {
3073 Int4 label_len = 0;
3074 SeqIdPtr sip;
3075 char localbuf[32]; /* for MS Windows */
3076 char *ldelim;
3077 char d [2];
3078 static Uint1 fasta_order[NUM_SEQID] = { /* order for other id FASTA_LONG */
3079 33, /* 0 = not set */
3080 20, /* 1 = local Object-id */
3081 15, /* 2 = gibbsq */
3082 16, /* 3 = gibbmt */
3083 30, /* 4 = giim Giimport-id */
3084 10, /* 5 = genbank */
3085 10, /* 6 = embl */
3086 10, /* 7 = pir */
3087 10, /* 8 = swissprot */
3088 15, /* 9 = patent */
3089 12, /* 10 = other TextSeqId */
3090 13, /* 11 = general Dbtag */
3091 255, /* 12 = gi */
3092 10, /* 13 = ddbj */
3093 10, /* 14 = prf */
3094 12, /* 15 = pdb */
3095 10, /* 16 = tpg */
3096 10, /* 17 = tpe */
3097 10, /* 18 = tpd */
3098 15, /* 19 = gpp */
3099 15 /* 20 = nat */
3100 };
3101 static Uint1 tmsmart_order[NUM_SEQID] = { /* order for other id FASTA_LONG */
3102 33, /* 0 = not set */
3103 20, /* 1 = local Object-id */
3104 15, /* 2 = gibbsq */
3105 16, /* 3 = gibbmt */
3106 30, /* 4 = giim Giimport-id */
3107 10, /* 5 = genbank */
3108 10, /* 6 = embl */
3109 10, /* 7 = pir */
3110 10, /* 8 = swissprot */
3111 15, /* 9 = patent */
3112 12, /* 10 = other TextSeqId */
3113 29, /* 11 = general Dbtag */
3114 255, /* 12 = gi */
3115 10, /* 13 = ddbj */
3116 10, /* 14 = prf */
3117 12, /* 15 = pdb */
3118 10, /* 16 = tpg */
3119 10, /* 17 = tpe */
3120 10, /* 18 = tpd */
3121 15, /* 19 = gpp */
3122 15 /* 20 = nat */
3123 };
3124 static Uint1 general_order[NUM_SEQID] = { /* order for other id FASTA_LONG */
3125 33, /* 0 = not set */
3126 20, /* 1 = local Object-id */
3127 15, /* 2 = gibbsq */
3128 16, /* 3 = gibbmt */
3129 30, /* 4 = giim Giimport-id */
3130 10, /* 5 = genbank */
3131 10, /* 6 = embl */
3132 10, /* 7 = pir */
3133 10, /* 8 = swissprot */
3134 15, /* 9 = patent */
3135 13, /* 10 = other TextSeqId */
3136 12, /* 11 = general Dbtag */
3137 255, /* 12 = gi */
3138 10, /* 13 = ddbj */
3139 10, /* 14 = prf */
3140 12, /* 15 = pdb */
3141 10, /* 16 = tpg */
3142 10, /* 17 = tpe */
3143 10, /* 18 = tpd */
3144 15, /* 19 = gpp */
3145 15 /* 20 = nat */
3146 };
3147 Boolean useGeneral = FALSE;
3148 TextSeqIdPtr tsip;
3149 PDBSeqIdPtr psip;
3150 ObjectIdPtr oip;
3151 PatentSeqIdPtr patsip;
3152 IdPatPtr ipp;
3153 Boolean got_gi = FALSE;
3154 Boolean got_tmsmart = FALSE;
3155 Boolean is_us_pre_grant = FALSE;
3156 DbtagPtr dbt;
3157 Char chainbuf[3];
3158 Char versionbuf[10];
3159 Int2 version = 0;
3160 CharPtr release = NULL;
3161
3162 if (isip == NULL)
3163 return 0;
3164
3165 d [0] = *delim;
3166 d [1] = '\0';
3167 ldelim = &(d [0]);
3168 if ((format >= ' ') && (format <= 127)) /* change delimiter */
3169 {
3170 if (format == 127)
3171 d [0] = '\t';
3172 else
3173 d [0] = (char) format;
3174 format = PRINTID_FASTA_SHORT;
3175 }
3176
3177 if (format == PRINTID_FASTA_GENERAL) {
3178 useGeneral = TRUE;
3179 format = PRINTID_FASTA_LONG;
3180 }
3181
3182 if (format == PRINTID_FASTA_ALL) {
3183 for (sip = isip; sip != NULL; sip = sip->next) {
3184 label_len += SeqIdLabelLen (sip, PRINTID_FASTA_SHORT) + 1;
3185 }
3186 label_len += 2;
3187 return label_len;
3188 }
3189 /* error on input, return ??? */
3190 if ( (! (isip -> choice)) || (format < PRINTID_FASTA_SHORT)
3191 || (format > PRINTID_REPORT))
3192 {
3193 return StringLen (txtid[0]) + 1; /* have to include 1 for extra terminator from Nlm_LabelCopyNext */
3194 }
3195
3196 if (format == PRINTID_FASTA_LONG) /* find the ids in the chain */
3197 {
3198 for (sip = isip; sip != NULL; sip = sip->next) /* GI present? */
3199 {
3200 if (sip->choice == SEQID_GI)
3201 {
3202 sprintf(localbuf, "%s%s%lld", txtid[SEQID_GI], ldelim,
3203 (long long)(sip->data.intvalue));
3204 label_len += StringLen (localbuf) + 1; /* have to include 1 for extra terminator from Nlm_LabelCopyNext */
3205 got_gi = TRUE;
3206 } else if (sip->choice == SEQID_GENERAL) {
3207 dbt = (DbtagPtr) sip->data.ptrvalue;
3208 if (dbt != NULL && StringICmp (dbt->db, "TMSMART") == 0) {
3209 got_tmsmart = TRUE;
3210 }
3211 } else if (sip->choice == SEQID_PATENT) {
3212 patsip = (PatentSeqIdPtr) sip->data.ptrvalue;
3213 if (patsip != NULL) {
3214 ipp = patsip->cit;
3215 if (ipp != NULL && StringDoesHaveText (ipp->app_number)) {
3216 is_us_pre_grant = TRUE;
3217 }
3218 }
3219 }
3220 }
3221 if (useGeneral) {
3222 sip = SeqIdSelect(isip, general_order, NUM_SEQID);
3223 } else if (got_tmsmart) {
3224 sip = SeqIdSelect(isip, tmsmart_order, NUM_SEQID);
3225 } else {
3226 sip = SeqIdSelect(isip, fasta_order, NUM_SEQID);
3227 }
3228 if (sip == NULL) /* only GI */
3229 return label_len;
3230 else if (got_gi)
3231 {
3232 if (sip->choice == SEQID_GIIM) /* don't show GIIM with GI */
3233 return label_len;
3234
3235 label_len += StringLen (ldelim) + 1; /* have to include 1 for extra terminator from Nlm_LabelCopyNext */
3236 }
3237 format = PRINTID_FASTA_SHORT; /* put on second (or only) SeqId in this format */
3238 }
3239 else {
3240 sip = isip; /* only one id processed */
3241 if (sip != NULL && sip->choice == SEQID_PATENT) {
3242 patsip = (PatentSeqIdPtr) sip->data.ptrvalue;
3243 if (patsip != NULL) {
3244 ipp = patsip->cit;
3245 if (ipp != NULL && StringDoesHaveText (ipp->app_number)) {
3246 is_us_pre_grant = TRUE;
3247 }
3248 }
3249 }
3250 }
3251
3252 /* deal with LOCUS and ACCESSION */
3253 if ((format == PRINTID_TEXTID_ACCESSION) || (format == PRINTID_TEXTID_LOCUS) ||
3254 (format == PRINTID_TEXTID_ACC_VER) || (format == PRINTID_TEXTID_ACC_ONLY))
3255 {
3256 if (format == PRINTID_TEXTID_ACCESSION) {
3257 format = PRINTID_TEXTID_ACC_ONLY; /* current default */
3258 }
3259 switch (sip->choice) /* get the real TextSeqId types */
3260 {
3261 case SEQID_GENBANK:
3262 case SEQID_EMBL:
3263 case SEQID_DDBJ:
3264 case SEQID_PIR:
3265 case SEQID_SWISSPROT:
3266 case SEQID_PRF:
3267 case SEQID_OTHER:
3268 case SEQID_TPG:
3269 case SEQID_TPE:
3270 case SEQID_TPD:
3271 case SEQID_GPIPE:
3272 case SEQID_NAMED_ANNOT_TRACK:
3273 tsip = (TextSeqIdPtr)sip->data.ptrvalue;
3274 release = tsip->release;
3275 if (sip->choice == SEQID_SWISSPROT) {
3276 release = NULL;
3277 }
3278 if ((format == PRINTID_TEXTID_LOCUS) && (tsip->name != NULL)) {
3279 label_len += StringLen (tsip->name) + 1; /* have to include 1 for extra terminator from Nlm_LabelCopyNext */
3280 return label_len;
3281 } else if ((format == PRINTID_TEXTID_ACC_ONLY || format == PRINTID_TEXTID_LOCUS)
3282 && (tsip->accession != NULL)) {
3283 label_len += StringLen (tsip->accession) + 1; /* have to include 1 for extra terminator from Nlm_LabelCopyNext */
3284 return label_len;
3285 } else if ((format == PRINTID_TEXTID_ACC_VER)
3286 && (tsip->accession != NULL)) {
3287 label_len += StringLen (tsip->accession) + 1; /* have to include 1 for extra terminator from Nlm_LabelCopyNext */
3288 if (tsip->version > 0 && release == NULL) {
3289 sprintf(localbuf, ".%d", (int)(tsip->version));
3290 label_len += StringLen (localbuf) + 1; /* have to include 1 for extra terminator from Nlm_LabelCopyNext */
3291 }
3292 return label_len;
3293 }
3294 break;
3295 default:
3296 break;
3297 }
3298 }
3299
3300 if (format == PRINTID_FASTA_SHORT)
3301 {
3302 if (sip->choice == SEQID_PATENT && is_us_pre_grant) {
3303 label_len += 4;
3304 } else if (sip->choice == SEQID_SWISSPROT) {
3305 tsip = (TextSeqIdPtr)sip->data.ptrvalue;
3306 if (tsip->release && StringCmp(tsip->release, "unreviewed") == 0)
3307 label_len += 3;
3308 else
3309 label_len += StringLen (txtid[sip->choice]) + 1; /* have to include 1 for extra terminator from Nlm_LabelCopyNext */
3310 } else {
3311 label_len += StringLen (txtid[sip->choice]) + 1; /* have to include 1 for extra terminator from Nlm_LabelCopyNext */
3312 }
3313 label_len += StringLen (ldelim) + 1; /* have to include 1 for extra terminator from Nlm_LabelCopyNext */
3314 }
3315
3316 switch (sip->choice)
3317 {
3318 case SEQID_LOCAL: /* object id */
3319 if ((((ObjectIdPtr)sip->data.ptrvalue)->str) == NULL)
3320 {
3321 sprintf(localbuf, "%ld",
3322 (long)((ObjectIdPtr)sip->data.ptrvalue)->id);
3323 label_len += StringLen (localbuf) + 1; /* have to include 1 for extra terminator from Nlm_LabelCopyNext */
3324 }
3325 else
3326 {
3327 label_len += StringLen (((ObjectIdPtr)sip->data.ptrvalue)->str) + 1; /* have to include 1 for extra terminator from Nlm_LabelCopyNext */
3328 }
3329 break;
3330 case SEQID_GIBBSQ:
3331 case SEQID_GIBBMT:
3332 sprintf(localbuf, "%ld", (long)sip->data.intvalue);
3333 label_len += StringLen (localbuf) + 1; /* have to include 1 for extra terminator from Nlm_LabelCopyNext */
3334 break;
3335 case SEQID_GI:
3336 sprintf(localbuf, "%lld", (long long)sip->data.intvalue);
3337 label_len += StringLen (localbuf) + 1; /* have to include 1 for extra terminator from Nlm_LabelCopyNext */
3338 break;
3339 case SEQID_GIIM:
3340 sprintf(localbuf, "%ld", (long)((GiimPtr)sip->data.ptrvalue)->id);
3341 label_len += StringLen (localbuf) + 1; /* have to include 1 for extra terminator from Nlm_LabelCopyNext */
3342 break;
3343 case SEQID_GENBANK:
3344 case SEQID_EMBL:
3345 case SEQID_DDBJ:
3346 case SEQID_OTHER:
3347 case SEQID_TPG:
3348 case SEQID_TPE:
3349 case SEQID_TPD:
3350 case SEQID_GPIPE:
3351 case SEQID_NAMED_ANNOT_TRACK:
3352 case SEQID_SWISSPROT:
3353 tsip = (TextSeqIdPtr)(sip->data.ptrvalue);
3354 release = tsip->release;
3355 if (sip->choice == SEQID_SWISSPROT) {
3356 release = NULL;
3357 }
3358 if ((tsip->version > 0) && (release == NULL) && SHOWVERSION)
3359 version = tsip->version; /* show versions */
3360 sprintf(versionbuf, ".%d", (int)version);
3361 case SEQID_PIR:
3362 case SEQID_PRF:
3363 tsip = (TextSeqIdPtr)sip->data.ptrvalue;
3364 if (tsip->accession != NULL)
3365 {
3366 label_len += StringLen (tsip->accession) + 1; /* have to include 1 for extra terminator from Nlm_LabelCopyNext */
3367 if (version)
3368 {
3369 label_len += StringLen (versionbuf) + 1; /* have to include 1 for extra terminator from Nlm_LabelCopyNext */
3370 }
3371 if (format != PRINTID_FASTA_SHORT)
3372 break;
3373 }
3374 if (format == PRINTID_FASTA_SHORT)
3375 label_len += StringLen (ldelim) + 1; /* have to include 1 for extra terminator from Nlm_LabelCopyNext */
3376 if (tsip->name != NULL)
3377 label_len += StringLen (tsip->name) + 1; /* have to include 1 for extra terminator from Nlm_LabelCopyNext */
3378 /*
3379 if (sip->choice == SEQID_OTHER) {
3380 Nlm_LabelCopyNext(&tmp, ldelim, &buflen);
3381 if (tsip->release != NULL)
3382 Nlm_LabelCopyNext(&tmp, tsip->release, &buflen);
3383 }
3384 */
3385 break;
3386 case SEQID_PATENT:
3387 patsip = (PatentSeqIdPtr)(sip->data.ptrvalue);
3388 label_len += StringLen (patsip->cit->country);
3389 if (format == PRINTID_FASTA_SHORT)
3390 label_len += StringLen (ldelim) + 1; /* have to include 1 for extra terminator from Nlm_LabelCopyNext */
3391 if (is_us_pre_grant) {
3392 label_len += StringLen (patsip->cit->app_number) + 1; /* have to include 1 for extra terminator from Nlm_LabelCopyNext */
3393 } else {
3394 label_len += StringLen (patsip->cit->number) + 1; /* have to include 1 for extra terminator from Nlm_LabelCopyNext */
3395 }
3396 if (format == PRINTID_FASTA_SHORT)
3397 label_len += StringLen (ldelim) + 1; /* have to include 1 for extra terminator from Nlm_LabelCopyNext */
3398 else
3399 label_len += 1;
3400 sprintf(localbuf, "%d", (int)patsip->seqid);
3401 label_len += StringLen (localbuf) + 1; /* have to include 1 for extra terminator from Nlm_LabelCopyNext */
3402 break;
3403 case SEQID_GENERAL:
3404 oip = ((DbtagPtr)sip->data.ptrvalue)->tag;
3405 if((format == PRINTID_FASTA_SHORT) || (format == PRINTID_REPORT))
3406 label_len += StringLen (((DbtagPtr)sip->data.ptrvalue)->db) + 1; /* have to include 1 for extra terminator from Nlm_LabelCopyNext */
3407 if (format == PRINTID_FASTA_SHORT)
3408 label_len += StringLen (ldelim) + 1; /* have to include 1 for extra terminator from Nlm_LabelCopyNext */
3409 else if (format == PRINTID_REPORT)
3410 label_len += 2;
3411
3412 if (oip->str == NULL)
3413 {
3414 sprintf(localbuf, "%ld", (long) oip->id);
3415 label_len += StringLen (localbuf) + 1; /* have to include 1 for extra terminator from Nlm_LabelCopyNext */
3416 }
3417 else
3418 label_len += StringLen (oip->str) + 1; /* have to include 1 for extra terminator from Nlm_LabelCopyNext */
3419 break;
3420 case SEQID_PDB:
3421 psip = (PDBSeqIdPtr) sip->data.ptrvalue;
3422 chainbuf[0] = TO_UPPER (psip->chain);
3423 chainbuf[1] = '\0';
3424 chainbuf[2] = '\0';
3425 if (IS_LOWER (psip->chain)) {
3426 chainbuf[1] = chainbuf [0];
3427 }
3428 label_len += StringLen (psip->mol) + 1; /* have to include 1 for extra terminator from Nlm_LabelCopyNext */
3429 if (format == PRINTID_FASTA_SHORT)
3430 {
3431 label_len += StringLen (ldelim);
3432 if (chainbuf[0] == '|') /* special */
3433 label_len += 3;
3434 else if (chainbuf[0] != '\0')
3435 label_len += StringLen (chainbuf) + 1; /* have to include 1 for extra terminator from Nlm_LabelCopyNext */
3436 else
3437 label_len += 2;
3438 }
3439 else if (psip->chain > ' ')
3440 {
3441 label_len += 2;
3442 label_len += StringLen (chainbuf) + 1; /* have to include 1 for extra terminator from Nlm_LabelCopyNext */
3443 }
3444 break;
3445 default:
3446 label_len += StringLen (txtid[0]) + 1; /* have to include 1 for extra terminator from Nlm_LabelCopyNext */
3447 break;
3448
3449 }
3450 return label_len;
3451 }
3452
3453
SeqIdWholeLabel(SeqIdPtr isip,Uint1 format)3454 NLM_EXTERN CharPtr SeqIdWholeLabel (SeqIdPtr isip, Uint1 format)
3455 {
3456 CharPtr label = NULL;
3457 Int4 id_len;
3458
3459 if (isip == NULL)
3460 {
3461 return NULL;
3462 }
3463
3464 id_len = SeqIdLabelLen (isip, format) + 1;
3465 label = (CharPtr) MemNew (sizeof (Char) * id_len);
3466 SeqIdWrite (isip, label, format, id_len);
3467 return label;
3468 }
3469
3470
3471 /* The following function finds either an integer or a string id from
3472 SeqIdPtr */
3473
GetAccessionFromSeqId(SeqIdPtr sip,BIG_ID_PNTR gi,CharPtr PNTR id)3474 Boolean GetAccessionFromSeqId(SeqIdPtr sip, BIG_ID_PNTR gi, CharPtr PNTR id)
3475 {
3476 return GetAccessionVersionFromSeqId(sip, gi, id, FALSE);
3477 }
3478
3479 /* Maximal length of a version number in Accession.version identifiers */
3480 #define MAX_VERSION_LENGTH 10
3481
GetAccessionVersionFromSeqId(SeqIdPtr sip,BIG_ID_PNTR gi,CharPtr PNTR id,Boolean get_version)3482 Boolean GetAccessionVersionFromSeqId(SeqIdPtr sip, BIG_ID_PNTR gi,
3483 CharPtr PNTR id, Boolean get_version)
3484 {
3485 Boolean numeric_id_type = FALSE;
3486 Int2 id_len;
3487 GiimPtr gip;
3488 ObjectIdPtr oip;
3489 TextSeqIdPtr textsip;
3490 DbtagPtr dbtag;
3491 PatentSeqIdPtr psip;
3492 PDBSeqIdPtr pdbsip;
3493
3494 *id = NULL;
3495 *gi = 0;
3496
3497 switch (sip->choice) {
3498 case SEQID_GI: case SEQID_GIBBSQ: case SEQID_GIBBMT:
3499 *gi = sip->data.intvalue;
3500 numeric_id_type = TRUE;
3501 break;
3502 case SEQID_GIIM:
3503 gip = (GiimPtr) sip->data.ptrvalue;
3504 *gi = gip->id;
3505 numeric_id_type = TRUE;
3506 break;
3507 case SEQID_LOCAL:
3508 oip = (ObjectIdPtr) sip->data.ptrvalue;
3509
3510 if (oip->str) {
3511 id_len = StringLen(oip->str);
3512 *id = (CharPtr) MemNew(id_len+1);
3513 sprintf(*id, "%s", oip->str);
3514 } else {
3515 *gi = oip->id;
3516 numeric_id_type = TRUE;
3517 }
3518 break;
3519 case SEQID_GENBANK:
3520 case SEQID_EMBL:
3521 case SEQID_PIR:
3522 case SEQID_SWISSPROT:
3523 case SEQID_DDBJ:
3524 case SEQID_PRF:
3525 case SEQID_OTHER:
3526 case SEQID_TPG:
3527 case SEQID_TPE:
3528 case SEQID_TPD:
3529 case SEQID_GPIPE:
3530 case SEQID_NAMED_ANNOT_TRACK:
3531 textsip = (TextSeqIdPtr)sip->data.ptrvalue;
3532 if (textsip->accession) {
3533 if (get_version && textsip->version > 0) {
3534 /* Assume versions are no longer than MAX_VERSION_LENGTH digits */
3535 id_len = StringLen(textsip->accession) + MAX_VERSION_LENGTH + 1;
3536 *id = (CharPtr) MemNew(id_len+1);
3537 sprintf(*id, "%s.%ld", textsip->accession, (long) textsip->version);
3538 } else {
3539 id_len = StringLen(textsip->accession);
3540 *id = (CharPtr) MemNew(id_len+1);
3541 sprintf(*id, "%s", textsip->accession);
3542 }
3543 } else if (textsip->name) {
3544 id_len = StringLen(textsip->name);
3545 *id = (CharPtr) MemNew(id_len+1);
3546 sprintf(*id, "%s", textsip->name);
3547 }
3548 break;
3549 case SEQID_GENERAL:
3550 dbtag = (DbtagPtr) sip->data.ptrvalue;
3551 if (dbtag->tag->str == NULL) {
3552 numeric_id_type = TRUE;
3553 *gi = dbtag->tag->id;
3554 } else {
3555 id_len = StringLen(dbtag->tag->str);
3556 *id = (CharPtr) MemNew(id_len+1);
3557 sprintf(*id, "%s", dbtag->tag->str);
3558 }
3559 break;
3560 case SEQID_PATENT:
3561 psip = (PatentSeqIdPtr) sip->data.ptrvalue;
3562 *gi = (Int4) psip->seqid;
3563 numeric_id_type = TRUE;
3564 break;
3565 case SEQID_PDB:
3566 pdbsip = (PDBSeqIdPtr) sip->data.ptrvalue;
3567 id_len = StringLen(pdbsip->mol);
3568 *id = (CharPtr) MemNew(id_len+4);
3569 sprintf(*id, "%s", pdbsip->mol);
3570 break;
3571 default: break;
3572 }
3573
3574 return numeric_id_type;
3575 }
3576
3577 /*****************************************************************************
3578 *
3579 * SeqIdPtr SeqIdParse(buf)
3580 * parses a string containing SeqIds formated by SeqIdPrint using
3581 * FASTA_LONG or FASTA_SHORT, separated by |
3582 * returns a SeqId linked list for them
3583 * or NULL on failure for any SeqId
3584 *
3585 *****************************************************************************/
3586 #define SEQID_PARSE_BUF_SIZE 200
SeqIdParse(CharPtr buf)3587 NLM_EXTERN SeqIdPtr SeqIdParse(CharPtr buf)
3588 {
3589 char localbuf[SEQID_PARSE_BUF_SIZE + 2];
3590 char * tmp, *strt, * tokens[6], *chain;
3591 char d;
3592 long long num;
3593 CharPtr tp;
3594 Int2 numtoken, i, type = 0, j, ctr=0, numdigits; /* ctr is number of OK ids done */
3595 SeqIdPtr sip = NULL, head = NULL, last = NULL, tmpsip;
3596 ObjectIdPtr oip;
3597 DbtagPtr dp;
3598 TextSeqIdPtr tsip;
3599 PatentSeqIdPtr patsip;
3600 IdPatPtr ipp;
3601 PDBSeqIdPtr psip;
3602 GiimPtr gim;
3603 Boolean done = FALSE, is_us_pre_grant = FALSE;
3604 static Uint1 expect_tokens[NUM_SEQID] = { /* number of tokens to expect */
3605 0, /* 0 = not set */
3606 1, /* 1 = local Object-id */
3607 1, /* 2 = gibbsq */
3608 1, /* 3 = gibbmt */
3609 1, /* 4 = giim Giimport-id */
3610 2, /* 5 = genbank */
3611 2, /* 6 = embl */
3612 2, /* 7 = pir */
3613 2, /* 8 = swissprot */
3614 3, /* 9 = patent */
3615 3, /* 10 = other TextSeqId */
3616 2, /* 11 = general Dbtag */
3617 1, /* 12 = gi */
3618 2, /* 13 = ddbj */
3619 2, /* 14 = prf */
3620 2, /* 15 = pdb */
3621 2, /* 16 = tpg */
3622 2, /* 17 = tpe */
3623 2, /* 18 = tpd */
3624 2, /* 19 = gpp */
3625 2, /* 20 = nat */
3626 };
3627
3628 if ((buf == NULL) || (*buf == '\0'))
3629 return NULL;
3630
3631 d = *delim; /* delimiter */
3632 while (! done)
3633 {
3634 Boolean sp_prelim = FALSE; /* Used to set release field in Swissprot TextSeqId */
3635 /* set all tokens pointing to \0 */
3636 localbuf[SEQID_PARSE_BUF_SIZE + 1] = '\0';
3637 for (i = 0; i < 6; i++)
3638 tokens[i] = &localbuf[SEQID_PARSE_BUF_SIZE + 1];
3639 tp = buf; /* save start of string */
3640 /* copy and tokenize - token\0token\0\n */
3641 for (tmp=localbuf, i=0; ((*buf != d) && (*buf != '\0') && (i < SEQID_PARSE_BUF_SIZE));
3642 i++,buf++,tmp++)
3643 *tmp = *buf;
3644 if (*buf != d) goto erret; /* didn't get delimiter */
3645 *tmp = '\0';
3646 tmp++;
3647 buf++;
3648 for (j = 0, type = 0; j < NUM_SEQID; j++)
3649 {
3650 if (! StringCmp(localbuf, txtid[j]))
3651 {
3652 type = j;
3653 break;
3654 }
3655 }
3656
3657 /* oth now ref, but still want to parse old style */
3658 if ((! type) && (! StringCmp(localbuf, "oth"))) {
3659 type = SEQID_OTHER;
3660 }
3661
3662 /* pgp is for pre-grant patent publications */
3663 if ((! type) && (! StringCmp(localbuf, "pgp"))) {
3664 type = SEQID_PATENT;
3665 is_us_pre_grant = TRUE;
3666 }
3667
3668 /* Trembl ID is really Swissprot with release field of TextSeqId set to "unreviewed" */
3669 if ((! type) && (! StringCmp(localbuf, "tr"))) {
3670 type = SEQID_SWISSPROT;
3671 sp_prelim = TRUE;
3672 }
3673
3674 if (! type) goto erret;
3675
3676 /* copy and tokenize - token\0token\0\n */
3677 for (numtoken=0, strt=tmp;
3678 ((i < SEQID_PARSE_BUF_SIZE) && (numtoken < (Int2)(expect_tokens[type])) && (! done));
3679 i++,buf++,tmp++)
3680 {
3681 if ((*buf == d) || (*buf == '\0'))
3682 {
3683 *tmp = '\0';
3684 tokens[numtoken] = strt;
3685 numtoken++;
3686 if (*buf == '\0')
3687 {
3688 if (type == SEQID_OTHER && (numtoken == 2 || numtoken == 1))
3689 done = TRUE;
3690 else if ((type == SEQID_GENBANK || type == SEQID_EMBL ||
3691 type == SEQID_DDBJ || type == SEQID_TPG ||
3692 type == SEQID_TPE || type == SEQID_TPD ||
3693 type == SEQID_GPIPE || type == SEQID_NAMED_ANNOT_TRACK) &&
3694 numtoken == 1)
3695 done = TRUE;
3696 else if (numtoken < (Int2)(expect_tokens[type]))
3697 goto erret;
3698 else
3699 done = TRUE;
3700 }
3701 strt = tmp+1;
3702 }
3703 else
3704 *tmp = *buf;
3705 }
3706 if (i == SEQID_PARSE_BUF_SIZE) goto erret;
3707
3708 sip = ValNodeNew(head);
3709 if (head == NULL) head = sip;
3710 sip->choice = (Uint1) type;
3711 switch (type)
3712 {
3713 case SEQID_LOCAL: /* object id */
3714 if (*tokens[0] == '\0') goto erret;
3715 oip = ObjectIdNew();
3716 sip->data.ptrvalue = oip;
3717 for (tmp = tokens[0], numdigits = 0; *tmp != '\0'; tmp++, numdigits++)
3718 {
3719 if (! IS_DIGIT(*tmp)) /* string type */
3720 {
3721 oip->str = StringSave(tokens[0]);
3722 break;
3723 }
3724 }
3725 if (oip->str == NULL)
3726 {
3727 sscanf(tokens[0], "%lld", &num);
3728 oip->id = (Int4)num;
3729 if (*tokens[0] != '0' && (numdigits < 10 ||
3730 (numdigits == 10 && StringCmp (tokens [0], "2147483647") <= 0))) {
3731 sscanf(tokens[0], "%lld", &num);
3732 oip->id = (Int4)num;
3733 } else {
3734 oip->str = StringSave(tokens[0]);
3735 }
3736 }
3737 break;
3738 case SEQID_GIBBSQ:
3739 case SEQID_GIBBMT:
3740 if (! IS_DIGIT(*tokens[0]))
3741 goto erret;
3742 sscanf(tokens[0], "%lld", &num);
3743 sip->data.intvalue = (BIG_ID)num;
3744 break;
3745 case SEQID_GI:
3746 if (! IS_DIGIT(*tokens[0]))
3747 goto erret;
3748 sscanf(tokens[0], "%lld", &num);
3749 sip->data.intvalue = (BIG_ID)num;
3750 break;
3751 case SEQID_GIIM:
3752 if (! IS_DIGIT(*tokens[0])) goto erret;
3753 gim = GiimNew();
3754 sip->data.ptrvalue = gim;
3755 sscanf(tokens[0], "%lld", &num);
3756 gim->id = (BIG_ID)num;
3757 break;
3758 case SEQID_GENBANK:
3759 case SEQID_EMBL:
3760 case SEQID_PIR:
3761 case SEQID_SWISSPROT:
3762 case SEQID_DDBJ:
3763 case SEQID_PRF:
3764 case SEQID_OTHER:
3765 case SEQID_TPG:
3766 case SEQID_TPE:
3767 case SEQID_TPD:
3768 case SEQID_GPIPE:
3769 case SEQID_NAMED_ANNOT_TRACK:
3770 if ((*tokens[0] == '\0') && (*tokens[1] == '\0'))
3771 goto erret;
3772 tsip = TextSeqIdNew();
3773 sip->data.ptrvalue = tsip;
3774 if (*tokens[0] != '\0')
3775 {
3776 tmp = tokens[0]; /* check for version */
3777 while (*tmp != '\0')
3778 {
3779 if (*tmp == '.')
3780 {
3781 if (IS_DIGIT(*(tmp+1)))
3782 {
3783 *tmp = '\0';
3784 sscanf((tmp+1),"%lld",&num);
3785 tsip->version =(Int2)num;
3786 }
3787 else
3788 tmp++;
3789 }
3790 else
3791 tmp++;
3792 }
3793 tsip->accession = StringSave(tokens[0]);
3794 *(tsip->accession) = TO_UPPER(*(tsip->accession));
3795 }
3796 if (*tokens[1] != '\0')
3797 {
3798 tsip->name = StringSave(tokens[1]);
3799 if (type != SEQID_OTHER) {
3800 tmp = tsip->name;
3801 while (*tmp != '\0')
3802 {
3803 *tmp = TO_UPPER(*tmp);
3804 tmp++;
3805 }
3806 }
3807 }
3808 if (type == SEQID_SWISSPROT)
3809 {
3810 if (sp_prelim)
3811 tsip->release = StringSave("unreviewed");
3812 else
3813 tsip->release = StringSave("reviewed");
3814 }
3815 break;
3816 case SEQID_PATENT:
3817 if ((*tokens[0] == '\0') || (*tokens[1] == '\0')) goto erret;
3818 if (! IS_DIGIT(*tokens[2])) goto erret;
3819 patsip = PatentSeqIdNew();
3820 sip->data.ptrvalue = patsip;
3821 ipp = IdPatNew();
3822 patsip->cit = ipp;
3823 ipp->country = StringSave(tokens[0]);
3824 if (is_us_pre_grant) {
3825 ipp->app_number = StringSave(tokens[1]);
3826 } else {
3827 ipp->number = StringSave(tokens[1]);
3828 }
3829 sscanf(tokens[2], "%lld", &num);
3830 patsip->seqid = (Int2)num;
3831 break;
3832 case SEQID_GENERAL:
3833 if ((*tokens[0] == '\0') || (*tokens[1] == '\0')) goto erret;
3834 dp = DbtagNew();
3835 sip->data.ptrvalue = dp;
3836 oip = ObjectIdNew();
3837 dp->tag = oip;
3838 dp->db = StringSave(tokens[0]);
3839 for (tmp = tokens[1], numdigits = 0; *tmp != '\0'; tmp++, numdigits++)
3840 {
3841 if (! IS_DIGIT(*tmp)) /* string type */
3842 {
3843 oip->str = StringSave(tokens[1]);
3844 break;
3845 }
3846 }
3847 if (oip->str == NULL)
3848 {
3849 if (*tokens[1] != '0' && (numdigits < 10 ||
3850 (numdigits == 10 && StringCmp (tokens [1], "2147483647") <= 0))) {
3851 sscanf(tokens[1], "%lld", &num);
3852 oip->id = (Int4)num;
3853 } else {
3854 oip->str = StringSave(tokens[1]);
3855 }
3856 }
3857 break;
3858 case SEQID_PDB:
3859 if (*tokens[0] == '\0') goto erret;
3860 psip = PDBSeqIdNew();
3861 sip->data.ptrvalue = psip;
3862 psip->mol = StringSave(tokens[0]);
3863 tmp = psip->mol;
3864 while (*tmp != '\0')
3865 {
3866 *tmp = TO_UPPER(*tmp);
3867 tmp++;
3868 }
3869 chain = tokens [1];
3870 if ((! StringICmp(tokens[1], "VB")) ||
3871 *(buf-1) == d)
3872 psip->chain = '|';
3873 else if (! StringHasNoText (tokens[1]))
3874 psip->chain = *tokens[1];
3875 /* double letter for chain indicates lower case */
3876 if (StringLen (chain) == 2 && TO_UPPER (chain [0]) == TO_UPPER (chain [1])) {
3877 psip->chain = TO_LOWER(psip->chain);
3878 } else {
3879 psip->chain = TO_UPPER(psip->chain);
3880 }
3881 break;
3882 }
3883 last = sip;
3884 sip = NULL;
3885 ctr++;
3886 }
3887 ret:
3888 return head;
3889 erret:
3890 StringNCpy(localbuf, tp, SEQID_PARSE_BUF_SIZE);
3891 localbuf[SEQID_PARSE_BUF_SIZE] = '\0';
3892 ErrPostEx(SEV_INFO, 0,0, "SeqIdParse Failure at %s", localbuf);
3893 if (sip == head)
3894 head = NULL;
3895 else
3896 {
3897 if (last != NULL)
3898 last->next = NULL;
3899 if (! ctr) /* no good SeqIds */
3900 head = SeqIdSetFree(head);
3901 else /* at least one good SeqId.. keep it */
3902 {
3903 tmpsip = head;
3904 last = NULL;
3905 for (i = 0; i < ctr; i++)
3906 {
3907 last = tmpsip;
3908 tmpsip = tmpsip->next;
3909 }
3910 if (last != NULL)
3911 last->next = NULL;
3912 SeqIdSetFree(tmpsip);
3913 }
3914 }
3915 ValNodeFree(sip);
3916 goto ret;
3917 }
3918
3919
3920 /*****************************************************************************
3921 *
3922 * Boolean SeqIdMatch(a, b)
3923 * returns TRUE if SeqIds could be compared and are the same
3924 * returns FALSE both if SeqIds could not be compared OR if they were
3925 * compared but are different
3926 *
3927 * WARNING!!!! use SeqIdComp() instead of SeqIdMatch() in most cases
3928 *
3929 * The code here must work the same is in two idloader
3930 * context: function id_flatten_seq_obj (idsybase.c)
3931 * and proc id_id_flatten_seq_obj
3932 *
3933 *****************************************************************************/
SeqIdMatch(SeqIdPtr a,SeqIdPtr b)3934 NLM_EXTERN Boolean SeqIdMatch (SeqIdPtr a, SeqIdPtr b)
3935 {
3936 Uint1 retval;
3937
3938 retval = SeqIdComp(a, b);
3939 if (retval == SIC_YES)
3940 return TRUE;
3941 else
3942 return FALSE;
3943 }
3944
GetGiFromSeqIdGeneral(SeqIdPtr seq_id)3945 static Int8 GetGiFromSeqIdGeneral( SeqIdPtr seq_id)
3946 {
3947 if( seq_id->choice != SEQID_GENERAL) return 0;
3948 DbtagPtr db_tag = (DbtagPtr) seq_id->data.ptrvalue;
3949 if( StringICmp( db_tag->db, "GI")) return 0;
3950 ObjectIdPtr tag = db_tag->tag;
3951 if( (tag == NULL) || (tag->str == NULL)) return 0;
3952 return atol( tag->str);
3953 }
3954
3955 /*****************************************************************************
3956 *
3957 * SeqIdComp(a, b)
3958 * Compares a to b and returns
3959 *
3960 * SIC_DIFF = different types, could not be compared
3961 * SIC_NO = types could be compared, and ids are different
3962 * SIC_YES = types could be compared, and ids are the same
3963 *
3964 *****************************************************************************/
SeqIdComp(SeqIdPtr a,SeqIdPtr b)3965 NLM_EXTERN Uint1 SeqIdComp (SeqIdPtr a, SeqIdPtr b)
3966 {
3967 Uint1 choice;
3968 TextSeqIdPtr at, bt;
3969
3970 if ((a == NULL) || (b == NULL))
3971 return SIC_DIFF;
3972
3973 choice = a->choice;
3974 if (choice != b->choice)
3975 {
3976 switch (choice)
3977 {
3978 case SEQID_GENBANK: /* these could be confused */
3979 case SEQID_EMBL:
3980 case SEQID_DDBJ:
3981 case SEQID_TPG:
3982 case SEQID_TPE:
3983 case SEQID_TPD:
3984 case SEQID_GPIPE:
3985 case SEQID_NAMED_ANNOT_TRACK:
3986 switch (b->choice)
3987 {
3988 case SEQID_GENBANK: /* its ok */
3989 case SEQID_EMBL:
3990 case SEQID_DDBJ:
3991 case SEQID_TPG:
3992 case SEQID_TPE:
3993 case SEQID_TPD:
3994 case SEQID_GPIPE:
3995 case SEQID_NAMED_ANNOT_TRACK:
3996 break;
3997 default:
3998 return SIC_DIFF;
3999 }
4000 break;
4001 case SEQID_GI:
4002 {
4003 Int8 gi = GetGiFromSeqIdGeneral( b);
4004 if( a->data.intvalue == gi) return SIC_YES;
4005 return SIC_DIFF;
4006 }
4007 case SEQID_GENERAL:
4008 {
4009 if( b->choice != SEQID_GI) return SIC_DIFF;
4010 Int8 gi = GetGiFromSeqIdGeneral( a);
4011 if( b->data.intvalue == gi) return SIC_YES;
4012 return SIC_DIFF;
4013 }
4014 default:
4015 return SIC_DIFF;
4016 }
4017 }
4018
4019 switch (choice)
4020 {
4021 case SEQID_NOT_SET:
4022 return SIC_DIFF;
4023 case SEQID_LOCAL:
4024 if (ObjectIdMatch((ObjectIdPtr)a->data.ptrvalue, (ObjectIdPtr)b->data.ptrvalue))
4025 return SIC_YES;
4026 else
4027 return SIC_NO;
4028 case SEQID_GIBBSQ: /* gibbsq */
4029 case SEQID_GIBBMT: /* gibbmt */
4030 case SEQID_GI: /* gi */
4031 if (a->data.intvalue == b->data.intvalue)
4032 return SIC_YES;
4033 else
4034 return SIC_NO;
4035 case SEQID_GIIM: /* giim */
4036 if (((GiimPtr)a->data.ptrvalue)->id == ((GiimPtr)b->data.ptrvalue)->id)
4037 return SIC_YES;
4038 else
4039 return SIC_NO;
4040 case SEQID_PATENT: /* patent seq */
4041 if (((PatentSeqIdPtr)a->data.ptrvalue)->seqid !=
4042 ((PatentSeqIdPtr)b->data.ptrvalue)->seqid)
4043 return SIC_NO;
4044 if (IdPatMatch(((PatentSeqIdPtr)a->data.ptrvalue)->cit,
4045 ((PatentSeqIdPtr)b->data.ptrvalue)->cit))
4046 return SIC_YES;
4047 else
4048 return SIC_NO;
4049 case SEQID_PDB: /* pdb */
4050 if ( StringICmp(((PDBSeqIdPtr)a->data.ptrvalue)->mol,
4051 ((PDBSeqIdPtr)b->data.ptrvalue)->mol))
4052 return SIC_NO;
4053 /*
4054 if (TO_UPPER(((PDBSeqIdPtr)a->data.ptrvalue)->chain) !=
4055 TO_UPPER(((PDBSeqIdPtr)b->data.ptrvalue)->chain))
4056 return SIC_NO;
4057 */
4058 if (((PDBSeqIdPtr)a->data.ptrvalue)->chain !=
4059 ((PDBSeqIdPtr)b->data.ptrvalue)->chain)
4060 return SIC_NO;
4061 return SIC_YES;
4062 case SEQID_GENERAL: /* general */
4063 if (DbtagMatch((DbtagPtr)a->data.ptrvalue,
4064 (DbtagPtr)b->data.ptrvalue))
4065 return SIC_YES;
4066 else if (StringICmp(((DbtagPtr)a->data.ptrvalue)->db,
4067 ((DbtagPtr)b->data.ptrvalue)->db))
4068 return SIC_DIFF; /* db strings do not match, okay */
4069 else
4070 return SIC_NO;
4071
4072 case SEQID_GENBANK:
4073 case SEQID_EMBL:
4074 case SEQID_DDBJ:
4075 case SEQID_PIR:
4076 case SEQID_SWISSPROT:
4077 case SEQID_PRF:
4078 case SEQID_OTHER:
4079 case SEQID_TPG:
4080 case SEQID_TPE:
4081 case SEQID_TPD:
4082 case SEQID_GPIPE:
4083 case SEQID_NAMED_ANNOT_TRACK:
4084
4085 at = (TextSeqIdPtr)a->data.ptrvalue;
4086 bt = (TextSeqIdPtr)b->data.ptrvalue;
4087 if ((at->accession != NULL) && (bt->accession != NULL))
4088 {
4089 if (! StringICmp(at->accession, bt->accession)) {
4090 if (at->version > 0 &&
4091 bt->version > 0 &&
4092 at->version != bt->version) {
4093 return SIC_NO;
4094 }
4095 return SIC_YES;
4096 } else {
4097 return SIC_NO;
4098 }
4099 }
4100 else if ((at->name != NULL) && (bt->name != NULL))
4101 {
4102 if (! StringICmp(at->name, bt->name)) {
4103 if (at->version > 0 &&
4104 bt->version > 0 &&
4105 at->version != bt->version) {
4106 return SIC_NO;
4107 }
4108 return SIC_YES;
4109 } else {
4110 return SIC_NO;
4111 }
4112 }
4113 else
4114 return SIC_DIFF;
4115 default:
4116 ErrPostEx(SEV_ERROR, 0,0, "SeqIdComp: unsupported type [%d]",
4117 (int)choice);
4118 return SIC_DIFF;
4119 }
4120 }
4121
4122 /*****************************************************************************
4123 *
4124 * Boolean SeqIdIn(a, b)
4125 * Looks for single SeqId, "a" in chain of SeqIds, "b"
4126 *
4127 *****************************************************************************/
SeqIdIn(SeqIdPtr a,SeqIdPtr b)4128 NLM_EXTERN Boolean SeqIdIn (SeqIdPtr a, SeqIdPtr b)
4129
4130 {
4131 SeqIdPtr now;
4132 Uint1 retval;
4133
4134 if (a == NULL)
4135 return FALSE;
4136
4137 for (now =b; now != NULL; now = now -> next)
4138 {
4139 retval = SeqIdComp(a, now);
4140 switch (retval)
4141 {
4142 case SIC_YES:
4143 return TRUE;
4144 case SIC_NO:
4145 return FALSE;
4146 }
4147 }
4148 return FALSE;
4149 }
4150
4151 /*****************************************************************************
4152 *
4153 * SeqIdForSameBioseq(a,b)
4154 *
4155 *****************************************************************************/
SeqIdForSameBioseq(SeqIdPtr a,SeqIdPtr b)4156 NLM_EXTERN Boolean SeqIdForSameBioseq (SeqIdPtr a, SeqIdPtr b)
4157
4158 {
4159 BioseqPtr bsp;
4160 Uint1 retval;
4161 Boolean res = FALSE;
4162 /*
4163 Boolean locked = FALSE;
4164 */
4165
4166 if ((a == NULL) || (b == NULL)) return FALSE;
4167
4168 retval = SeqIdComp(a,b); /* if match, all set */
4169 switch (retval)
4170 {
4171 case SIC_YES:
4172 return TRUE;
4173 case SIC_NO:
4174 return FALSE;
4175 }
4176
4177 bsp = BioseqFindCore(a);
4178 if (bsp == NULL)
4179 {
4180 return FALSE;
4181 /*
4182 bsp = BioseqLockById(a);
4183 if (bsp != NULL)
4184 locked = TRUE;
4185 else
4186 return res;
4187 */
4188 }
4189
4190 res = SeqIdIn(b, bsp->id);
4191 /*
4192 if (locked)
4193 BioseqUnlock(bsp);
4194 */
4195
4196 return res;
4197 }
4198
4199 /*****************************************************************************
4200 *
4201 * MakeNewProteinSeqId(SeqLocPtr slp, SeqIdPtr sip)
4202 * Makes a new protein SeqId of attempting to keep it unique
4203 * Trys to match it to the input seqid type
4204 * slp is the location on the DNA of the coding region making the protein
4205 * sip is the SeqId of the DNA coding for the protein
4206 * if (sip != NULL) uses it for a "base" first
4207 * else if (slp != NULL) uses a SeqId from it for a base
4208 * else base is the string tmpseq
4209 *
4210 * id is then base_X where X is a number assigned as a serial number
4211 * the returned id is guaranteed to be unique among all Bioseqs currently
4212 * loaded in memory.
4213 *
4214 *
4215 *****************************************************************************/
MakeNewProteinSeqIdExMT(SeqLocPtr slp,SeqIdPtr sip,CharPtr prefix,Int2Ptr ctrptr,Boolean is_MT_safe)4216 NLM_EXTERN SeqIdPtr LIBCALL MakeNewProteinSeqIdExMT (SeqLocPtr slp, SeqIdPtr sip, CharPtr prefix, Int2Ptr ctrptr, Boolean is_MT_safe)
4217 {
4218 Char buf[60];
4219 CharPtr tmp;
4220 Int2 ctr = 0;
4221 Int2 start = 1;
4222 SeqLocPtr tslp;
4223 ValNodePtr newid;
4224 ObjectIdPtr oid;
4225 ValNode vn;
4226 TextSeqId tsi;
4227 ValNodePtr altid;
4228 size_t len;
4229 static Uint4 counter;
4230 static TNlmMutex lock = NULL;
4231
4232
4233 if (lock == NULL) {
4234 NlmMutexInit(&lock);
4235 }
4236
4237 /* create a possible GenBankStyle id as well */
4238 altid = &vn;
4239 vn.choice = SEQID_GENBANK;
4240 vn.next = NULL;
4241 vn.data.ptrvalue = &tsi;
4242 tsi.name = NULL;
4243 tsi.accession = NULL;
4244 tsi.version = INT2_MIN;
4245 tsi.release = NULL;
4246
4247 if ((sip == NULL) && (slp != NULL)) {
4248 tslp = NULL;
4249 while ((tslp = SeqLocFindNext(slp, tslp)) != NULL) {
4250 sip = SeqLocId(tslp);
4251 if (sip != NULL)
4252 break;
4253 }
4254 }
4255
4256 if (sip != NULL) {
4257 SeqIdWrite(sip, buf, PRINTID_TEXTID_ACCESSION, 50);
4258 tmp = buf;
4259 while (*tmp != '\0')
4260 tmp++;
4261 if (*(tmp-1) == '>')
4262 tmp--;
4263 *tmp = '_';
4264 tmp++;
4265 *tmp = '\0';
4266 } else {
4267 len = StringLen (prefix);
4268 if (len > 0 && len < 52) {
4269 tmp = StringMove(buf, prefix);
4270 } else {
4271 tmp = StringMove(buf, "tmpseq_");
4272 }
4273 }
4274
4275 newid = ValNodeNew(NULL);
4276 oid = ObjectIdNew();
4277 oid->str = buf; /* allocate this later */
4278 newid->choice = SEQID_LOCAL;
4279 newid->data.ptrvalue = oid;
4280
4281 tsi.name = buf; /* check for alternative form */
4282
4283 if (ctrptr != NULL) {
4284 start = *ctrptr;
4285 }
4286 if (start < 1) {
4287 start = 1;
4288 }
4289
4290 /* Very dangerous way to create new id - don't use if you can */
4291
4292 if (is_MT_safe == FALSE) {
4293 for (ctr = start; ctr < 32000; ctr++) {
4294 sprintf(tmp, "%d", (int)ctr);
4295 if ((BioseqFindCore(newid) == NULL) && (BioseqFindCore(altid) == NULL)) {
4296 oid->str = StringSave(buf);
4297 if (ctrptr != NULL) {
4298 *ctrptr = ctr + 1;
4299 }
4300 return newid;
4301 }
4302 }
4303 }
4304
4305 NlmMutexLock(lock);
4306
4307 sprintf(tmp, "%d", (int)counter);
4308 oid->str = StringSave(buf);
4309 if (ctrptr != NULL) {
4310 *ctrptr = ctr + 1;
4311 }
4312
4313 counter++;
4314 NlmMutexUnlock(lock);
4315
4316 return newid;
4317 }
4318
MakeNewProteinSeqIdEx(SeqLocPtr slp,SeqIdPtr sip,CharPtr prefix,Int2Ptr ctrptr)4319 NLM_EXTERN SeqIdPtr LIBCALL MakeNewProteinSeqIdEx (SeqLocPtr slp, SeqIdPtr sip, CharPtr prefix, Int2Ptr ctrptr)
4320 {
4321 return MakeNewProteinSeqIdExMT (slp, sip, prefix, ctrptr, FALSE);
4322 }
4323
MakeNewProteinSeqId(SeqLocPtr slp,SeqIdPtr sip)4324 NLM_EXTERN SeqIdPtr LIBCALL MakeNewProteinSeqId (SeqLocPtr slp, SeqIdPtr sip)
4325 {
4326 return MakeNewProteinSeqIdEx (slp, sip, NULL, NULL);
4327 }
4328
UniqueLocalId(void)4329 NLM_EXTERN ObjectIdPtr UniqueLocalId(void)
4330 {
4331 static TNlmMutex lock = NULL;
4332 static long count = 0;
4333 ObjectIdPtr oip;
4334 long l;
4335 Char buf[128];
4336
4337 if (lock == NULL) {
4338 NlmMutexInit(&lock);
4339 }
4340 NlmMutexLock(lock);
4341 l = count;
4342 if (++count < 0) {
4343 count = 0;
4344 }
4345 NlmMutexUnlock(lock);
4346 sprintf(buf, "lcl|unique%08ld", l);
4347 oip = ObjectIdNew();
4348 oip->str = StringSave(buf);
4349 return oip;
4350 }
4351
4352 /*****************************************************************************
4353 *
4354 * Traversal routine for SeqLocFindNext
4355 *
4356 *****************************************************************************/
SeqLocNext(SeqLocPtr seqlochead,SeqLocPtr currseqloc,Uint1 equiv_status,BoolPtr founditptr)4357 static SeqLocPtr SeqLocNext (SeqLocPtr seqlochead, SeqLocPtr currseqloc, Uint1 equiv_status, BoolPtr founditptr)
4358
4359 {
4360 SeqLocPtr currloc, retval;
4361 Boolean equiv_is_one, foundit=FALSE;
4362
4363 switch (equiv_status)
4364 {
4365 case EQUIV_IS_ONE:
4366 equiv_is_one = TRUE;
4367 break;
4368 case FIRST_EQUIV_IS_MANY:
4369 equiv_status = EQUIV_IS_ONE;
4370 case EQUIV_IS_MANY:
4371 default:
4372 equiv_is_one = FALSE;
4373 break;
4374 }
4375
4376 while (seqlochead != NULL)
4377 {
4378 if (IS_one_loc(seqlochead, equiv_is_one))
4379 {
4380 if (currseqloc == NULL)
4381 return seqlochead;
4382 else if (currseqloc == seqlochead) /* found it */
4383 {
4384 *founditptr = TRUE;
4385 if (seqlochead -> next != NULL)
4386 {
4387 if (IS_one_loc(seqlochead->next, equiv_is_one))
4388 return seqlochead->next;
4389 else
4390 return SeqLocNext(seqlochead->next, NULL, equiv_status, &foundit);
4391 }
4392 else
4393 {
4394 return NULL;
4395 }
4396 }
4397 }
4398 else
4399 {
4400 currloc = (SeqLocPtr)seqlochead->data.ptrvalue;
4401 if (currloc != NULL)
4402 {
4403 if ((retval = SeqLocNext(currloc, currseqloc, equiv_status, &foundit)) != NULL)
4404 return retval;
4405 else
4406 if (foundit)
4407 currseqloc = NULL; /* no need to keep looking */
4408 }
4409 }
4410
4411 seqlochead = seqlochead->next;
4412 }
4413 return NULL;
4414 }
4415
4416 /*****************************************************************************
4417 *
4418 * SeqLocFindNext(seqlochead, currseqloc)
4419 * finds the next Seq-loc after currseqloc
4420 * seqlochead is the first of a chain of Seq-locs
4421 * treats SEQLOC_EQUIV as multiple seq-locs
4422 *
4423 *****************************************************************************/
SeqLocFindNext(SeqLocPtr seqlochead,SeqLocPtr currseqloc)4424 NLM_EXTERN SeqLocPtr SeqLocFindNext (SeqLocPtr seqlochead, SeqLocPtr currseqloc)
4425 {
4426 return SeqLocFindPart(seqlochead, currseqloc, EQUIV_IS_MANY);
4427 }
4428
4429 /*****************************************************************************
4430 *
4431 * SeqLocFindPart(seqlochead, currseqloc, equiv_status)
4432 * finds the next Seq-loc after currseqloc
4433 * seqlochead is the first of a chain of Seq-locs
4434 * equiv_status defines how to treat SEQLOC_EQUIV
4435 *
4436 *****************************************************************************/
SeqLocFindPart(SeqLocPtr seqlochead,SeqLocPtr currseqloc,Uint1 equiv_status)4437 NLM_EXTERN SeqLocPtr SeqLocFindPart (SeqLocPtr seqlochead, SeqLocPtr currseqloc, Uint1 equiv_status)
4438 {
4439 SeqLocPtr tmp, oldnext;
4440 Boolean equiv_is_one, foundit=FALSE;
4441
4442 if (seqlochead == NULL) return NULL;
4443
4444 if (equiv_status == EQUIV_IS_ONE)
4445 equiv_is_one = TRUE;
4446 else
4447 equiv_is_one = FALSE;
4448
4449 if (IS_one_loc(seqlochead, equiv_is_one)) /* not a chain */
4450 {
4451 if (currseqloc == NULL) /* first call */
4452 return seqlochead;
4453 else if (currseqloc == seqlochead) /* second call */
4454 return NULL;
4455 else /* oops */
4456 goto erret;
4457 }
4458
4459 if (currseqloc != NULL)
4460 {
4461 if (! IS_one_loc(currseqloc, equiv_is_one)) /* oops */
4462 goto erret;
4463 tmp = currseqloc->next;
4464 if (tmp != NULL)
4465 {
4466 if (IS_one_loc(tmp, equiv_is_one))
4467 return tmp;
4468 }
4469 }
4470
4471 oldnext = seqlochead->next; /* protect from accidental chains */
4472 seqlochead->next = NULL;
4473
4474 tmp = SeqLocNext(seqlochead, currseqloc, equiv_status, &foundit);
4475
4476 seqlochead->next = oldnext;
4477 return tmp;
4478
4479 erret:
4480 ErrPostEx(SEV_ERROR,0,0, "Invalid arguments to SeqLocFindNext");
4481 return NULL;
4482 }
4483
4484 /*****************************************************************************
4485 *
4486 * IS_one_loc(anp, equiv_is_one)
4487 * returns TRUE if is a sequence location which refers to one piece
4488 * of sequence
4489 * used for moving through complicated Seq-locs
4490 * if equiv_is_one == TRUE, then considers a SEQ_LOC_EQUIV a single
4491 * location. If FALSE, does not.
4492 *
4493 *****************************************************************************/
IS_one_loc(SeqLocPtr anp,Boolean equiv_is_one)4494 NLM_EXTERN Boolean IS_one_loc (SeqLocPtr anp, Boolean equiv_is_one) /* a SeqLoc */
4495
4496 {
4497 Boolean retval = FALSE;
4498
4499 if (anp == NULL) return FALSE;
4500
4501 switch (anp->choice)
4502 {
4503 case SEQLOC_NULL: /* null - not a valid single region */
4504 case SEQLOC_EMPTY: /* empty */
4505 case SEQLOC_WHOLE: /* whole */
4506 case SEQLOC_INT: /* int */
4507 case SEQLOC_PNT: /* pnt */
4508 case SEQLOC_PACKED_PNT: /* packed-pnt */
4509 case SEQLOC_BOND: /* bond */
4510 retval = TRUE;
4511 break;
4512
4513 case SEQLOC_EQUIV: /* equiv */
4514 retval = equiv_is_one;
4515 break;
4516
4517 case SEQLOC_PACKED_INT: /* packed seqint */
4518 case SEQLOC_MIX: /* mix */
4519 case SEQLOC_FEAT:
4520 retval = FALSE;
4521 break;
4522
4523 default:
4524 ErrPostEx(SEV_ERROR,0,0, "IS_one_seq: unsupported seqloc [%d]",
4525 (int)(anp->choice));
4526 retval = TRUE;
4527 break;
4528 }
4529 return retval;
4530 }
4531 /*****************************************************************************
4532 *
4533 * SeqLocId(loc)
4534 *
4535 *****************************************************************************/
SeqLocId(SeqLocPtr anp)4536 NLM_EXTERN SeqIdPtr SeqLocId (SeqLocPtr anp)
4537
4538 {
4539 SeqIdPtr seqid = NULL, currseqid = NULL;
4540 SeqLocPtr loc;
4541
4542 if (anp == NULL) return NULL;
4543
4544 switch (anp->choice)
4545 {
4546 case SEQLOC_NULL: /* NULL */
4547 case SEQLOC_FEAT: /* feat -- can't track yet */
4548 break;
4549 case SEQLOC_BOND: /* bond -- 2 seqs */
4550 if (((SeqBondPtr)(anp->data.ptrvalue))->a != NULL)
4551 seqid = ((SeqBondPtr)(anp->data.ptrvalue))->a->id;
4552 break;
4553 case SEQLOC_EMPTY: /* empty */
4554 case SEQLOC_WHOLE: /* whole */
4555 seqid = (SeqIdPtr)anp->data.ptrvalue;
4556 break;
4557 case SEQLOC_INT: /* int */
4558 seqid = ((SeqIntPtr)anp->data.ptrvalue)->id;
4559 break;
4560 case SEQLOC_PACKED_INT: /* packed int */
4561 case SEQLOC_MIX: /* mix -- could be more than one seq */
4562 case SEQLOC_EQUIV: /* equiv -- ditto */
4563 loc = (SeqLocPtr)anp->data.ptrvalue;
4564 while (loc != NULL)
4565 {
4566 if (loc->choice == SEQLOC_NULL) {
4567 loc = loc->next;
4568 continue;
4569 }
4570 currseqid = SeqLocId(loc);
4571 if (seqid == NULL)
4572 seqid = currseqid;
4573 else
4574 {
4575 if (! SeqIdMatch(seqid, currseqid))
4576 {
4577 seqid = NULL;
4578 loc = NULL;
4579 break;
4580 }
4581 }
4582 loc = loc->next;
4583 }
4584 break;
4585 case SEQLOC_PNT: /* pnt */
4586 seqid = ((SeqPntPtr)anp->data.ptrvalue)->id;
4587 break;
4588 case SEQLOC_PACKED_PNT: /* packed pnt */
4589 seqid = ((PackSeqPntPtr)anp->data.ptrvalue)->id;
4590 break;
4591 default:
4592 break;
4593 }
4594 return seqid;
4595 }
4596
4597 /*****************************************************************************
4598 *
4599 * SeqLocStart(loc)
4600 * returns lowest number position for Seq-loc all on one bioseq
4601 * returns -1 if impossible to meet that condition
4602 *
4603 *****************************************************************************/
SeqLocStart(SeqLocPtr anp)4604 NLM_EXTERN Int4 SeqLocStart (SeqLocPtr anp) /* seqloc */
4605
4606 {
4607 Int4 pos = -1L, tpos, numpnt;
4608 SeqIdPtr sip;
4609 SeqLocPtr slp;
4610 SeqIntPtr sintp;
4611
4612 if (anp == NULL)
4613 return pos;
4614
4615 switch (anp->choice)
4616 {
4617 case SEQLOC_BOND: /* bond -- 2 seqs */
4618 if (((SeqBondPtr)(anp->data.ptrvalue))->a != NULL)
4619 pos = ((SeqBondPtr)(anp->data.ptrvalue))->a->point;
4620 break;
4621 case SEQLOC_FEAT: /* feat -- can't track yet */
4622 case SEQLOC_NULL: /* NULL */
4623 case SEQLOC_EMPTY: /* empty */
4624 break;
4625 case SEQLOC_WHOLE: /* whole */
4626 pos = 0L;
4627 break;
4628 case SEQLOC_MIX: /* mix -- more than one seq */
4629 case SEQLOC_EQUIV: /* equiv -- ditto */
4630 case SEQLOC_PACKED_INT: /* packed int */
4631 sip = SeqLocId(anp);
4632 if (sip != NULL) /* all on one Bioseq */
4633 {
4634 slp = (SeqLocPtr)anp->data.ptrvalue;
4635 while (slp != NULL)
4636 {
4637 tpos = SeqLocStart(slp);
4638 if (pos < 0)
4639 pos = tpos;
4640 else if (tpos < pos)
4641 pos = tpos;
4642 slp = slp->next;
4643 }
4644 }
4645 break;
4646 case SEQLOC_INT: /* int */
4647 sintp = (SeqIntPtr) anp->data.ptrvalue;
4648 pos = sintp->from;
4649 break;
4650 case SEQLOC_PNT: /* pnt */
4651 pos = ((SeqPntPtr)anp->data.ptrvalue)->point;
4652 break;
4653 case SEQLOC_PACKED_PNT: /* packed pnt */
4654 numpnt = PackSeqPntNum((PackSeqPntPtr)anp->data.ptrvalue);
4655 while (numpnt)
4656 {
4657 numpnt--;
4658 tpos = PackSeqPntGet((PackSeqPntPtr)anp->data.ptrvalue, numpnt);
4659 if (pos < 0)
4660 pos = tpos;
4661 else if (tpos < pos)
4662 pos = tpos;
4663 }
4664 break;
4665 default:
4666 break;
4667 }
4668 return pos;
4669 }
4670
4671 /*****************************************************************************
4672 *
4673 * SeqLocStop(loc)
4674 * looks for highest position number on loc if on one Bioseq
4675 * if fails, returns -1
4676 *
4677 *****************************************************************************/
SeqLocStop(SeqLocPtr anp)4678 NLM_EXTERN Int4 SeqLocStop (SeqLocPtr anp) /* seqloc */
4679
4680 {
4681 BioseqPtr bsp;
4682 Int4 pos = -1L, tpos, numpnt;
4683 SeqIdPtr sip;
4684 SeqLocPtr slp;
4685 Boolean locked = FALSE;
4686
4687
4688 if (anp == NULL)
4689 return pos;
4690
4691 switch (anp->choice)
4692 {
4693 case SEQLOC_BOND: /* bond -- 2 seqs */
4694 if (((SeqBondPtr)(anp->data.ptrvalue))->b != NULL)
4695 pos = ((SeqBondPtr)(anp->data.ptrvalue))->b->point;
4696 else if (((SeqBondPtr)(anp->data.ptrvalue))->a != NULL)
4697 pos = ((SeqBondPtr)(anp->data.ptrvalue))->a->point;
4698 break;
4699 case SEQLOC_FEAT: /* feat -- can't track yet */
4700 case SEQLOC_NULL: /* NULL */
4701 case SEQLOC_EMPTY: /* empty */
4702 break;
4703 case SEQLOC_WHOLE: /* whole */
4704 bsp = BioseqFindCore((SeqIdPtr)anp->data.ptrvalue);
4705 if (bsp == NULL)
4706 {
4707 bsp = BioseqLockById((SeqIdPtr)anp->data.ptrvalue);
4708 if (bsp != NULL)
4709 locked = TRUE;
4710 }
4711 pos = BioseqGetLen(bsp) - 1;
4712 if (locked)
4713 BioseqUnlock(bsp);
4714 break;
4715 case SEQLOC_MIX: /* mix -- more than one seq */
4716 case SEQLOC_EQUIV: /* equiv -- ditto */
4717 case SEQLOC_PACKED_INT: /* packed int */
4718 sip = SeqLocId(anp);
4719 if (sip != NULL) /* all on one Bioseq */
4720 {
4721 slp = (SeqLocPtr)anp->data.ptrvalue;
4722 while (slp != NULL)
4723 {
4724 tpos = SeqLocStop(slp);
4725 if (pos < 0)
4726 pos = tpos;
4727 else if (tpos > pos)
4728 pos = tpos;
4729 slp = slp->next;
4730 }
4731 }
4732 break;
4733 case SEQLOC_INT: /* int */
4734 pos = ((SeqIntPtr)anp->data.ptrvalue)->to;
4735 break;
4736 case SEQLOC_PNT: /* pnt */
4737 pos = ((SeqPntPtr)anp->data.ptrvalue)->point;
4738 break;
4739 case SEQLOC_PACKED_PNT: /* packed pnt */
4740 numpnt = PackSeqPntNum((PackSeqPntPtr)anp->data.ptrvalue);
4741 while (numpnt)
4742 {
4743 numpnt--;
4744 tpos = PackSeqPntGet((PackSeqPntPtr)anp->data.ptrvalue, numpnt);
4745 if (pos < 0)
4746 pos = tpos;
4747 else if (tpos > pos)
4748 pos = tpos;
4749 }
4750 break;
4751 default:
4752 break;
4753 }
4754 return pos;
4755 }
4756
4757 /*****************************************************************************
4758 *
4759 * SeqLocStrand(loc)
4760 * see objloc.h for strand value defines
4761 * returns Seq_strand_other when series of locs on different strands
4762 *
4763 *****************************************************************************/
SeqLocStrand(SeqLocPtr anp)4764 NLM_EXTERN Uint1 SeqLocStrand (SeqLocPtr anp) /* seqloc */
4765
4766 {
4767 SeqIdPtr sip;
4768 SeqLocPtr slp;
4769 Uint1 strand = Seq_strand_unknown, tstrand;
4770
4771 if (anp == NULL)
4772 return strand;
4773
4774 switch (anp->choice)
4775 {
4776 case SEQLOC_BOND: /* bond -- 2 seqs */
4777 if (((SeqBondPtr)(anp->data.ptrvalue))->a != NULL)
4778 strand = ((SeqBondPtr)(anp->data.ptrvalue))->a->strand;
4779 break;
4780 case SEQLOC_FEAT: /* feat -- can't track yet */
4781 case SEQLOC_NULL: /* NULL */
4782 case SEQLOC_EMPTY: /* empty */
4783 break;
4784 case SEQLOC_WHOLE: /* whole */
4785 strand = Seq_strand_both;
4786 break;
4787 case SEQLOC_MIX: /* mix -- more than one seq */
4788 case SEQLOC_EQUIV: /* equiv -- ditto */
4789 case SEQLOC_PACKED_INT: /* packed int */
4790 sip = SeqLocId(anp);
4791 if (sip != NULL) /* all on one Bioseq */
4792 {
4793 for (slp = (SeqLocPtr)anp->data.ptrvalue,
4794 strand = SeqLocStrand(slp), slp = slp -> next;
4795 slp != NULL ; slp = slp->next)
4796 {
4797 if (slp->choice == SEQLOC_NULL || slp->choice == SEQLOC_EMPTY) continue;
4798 tstrand = SeqLocStrand(slp);
4799 if (strand == Seq_strand_unknown && tstrand == Seq_strand_plus) {
4800 strand = Seq_strand_plus;
4801 }
4802 if (strand == Seq_strand_plus && tstrand == Seq_strand_unknown) {
4803 tstrand = Seq_strand_plus;
4804 }
4805 if (strand != tstrand)
4806 {
4807 strand = Seq_strand_other;
4808 break;
4809 }
4810 }
4811 }
4812 break;
4813 case SEQLOC_INT: /* int */
4814 strand = ((SeqIntPtr)anp->data.ptrvalue)->strand;
4815 break;
4816 case SEQLOC_PNT: /* pnt */
4817 strand = ((SeqPntPtr)anp->data.ptrvalue)->strand;
4818 break;
4819 case SEQLOC_PACKED_PNT: /* packed pnt */
4820 strand = ((PackSeqPntPtr)anp->data.ptrvalue)->strand;
4821 break;
4822 default:
4823 break;
4824 }
4825 return strand;
4826 }
4827
4828 /*****************************************************************************
4829 *
4830 * Int4 SeqLocGetSegLens (slp, lens, ctr, gaps)
4831 * returns total number of segments in SeqLoc including NULLS
4832 * returns -1 for error
4833 * if lens != NULL fills with lengths of segments, 0 = NULL
4834 *
4835 *****************************************************************************/
SeqLocGetSegLens(SeqLocPtr slp,Int4Ptr lens,Int4 ctr,Boolean gaps)4836 NLM_EXTERN Int4 SeqLocGetSegLens (SeqLocPtr slp, Int4Ptr lens, Int4 ctr, Boolean gaps)
4837 {
4838 SeqLocPtr slp2;
4839 BioseqPtr bsp;
4840 Boolean locked = FALSE;
4841
4842 if (slp == NULL)
4843 return -1;
4844
4845 switch (slp->choice)
4846 {
4847 case SEQLOC_BOND: /* bond -- 2 seqs */
4848 case SEQLOC_FEAT: /* feat -- can't track yet */
4849 break;
4850 case SEQLOC_NULL: /* NULL */
4851 case SEQLOC_EMPTY: /* empty */
4852 if (lens != NULL)
4853 lens[ctr] = 0;
4854 ctr++;
4855 break;
4856 case SEQLOC_WHOLE: /* whole */
4857 if (gaps)
4858 break;
4859 if (lens != NULL)
4860 {
4861 bsp = BioseqFindCore((SeqIdPtr)slp->data.ptrvalue);
4862 if (bsp == NULL)
4863 {
4864 bsp = BioseqLockById((SeqIdPtr)slp->data.ptrvalue);
4865 if (bsp != NULL)
4866 locked = TRUE;
4867 }
4868 lens[ctr] = BioseqGetLen(bsp);
4869 if (locked)
4870 BioseqUnlock(bsp);
4871 }
4872 ctr++;
4873 break;
4874 case SEQLOC_MIX: /* mix -- more than one seq */
4875 case SEQLOC_EQUIV: /* equiv -- ditto */
4876 case SEQLOC_PACKED_INT: /* packed int */
4877 slp2 = (SeqLocPtr)slp->data.ptrvalue;
4878 while (slp2 != NULL)
4879 {
4880 ctr = SeqLocGetSegLens(slp2, lens, ctr, gaps);
4881 slp2 = slp2->next;
4882 }
4883 break;
4884 case SEQLOC_INT: /* int */
4885 if (gaps) break;
4886 if (lens != NULL)
4887 lens[ctr] = ((SeqIntPtr)slp->data.ptrvalue)->to - ((SeqIntPtr)slp->data.ptrvalue)->from + 1;
4888 ctr++;
4889 break;
4890 case SEQLOC_PNT: /* pnt */
4891 if (gaps) break;
4892 if (lens != NULL)
4893 lens[ctr] = 1;
4894 ctr++;
4895 break;
4896 case SEQLOC_PACKED_PNT: /* packed pnt */
4897 if (gaps) break;
4898 if (lens != NULL)
4899 lens[ctr] = SeqLocStop(slp) - SeqLocStart(slp) + 1;
4900 ctr++;
4901 break;
4902 default:
4903 break;
4904 }
4905 return ctr;
4906 }
4907
4908 /*****************************************************************************
4909 *
4910 * SeqLocLen(loc)
4911 * returns total length in residues of loc
4912 * if fails, returns -1
4913 *
4914 *****************************************************************************/
SeqLocLen(SeqLocPtr anp)4915 NLM_EXTERN Int4 SeqLocLen (SeqLocPtr anp) /* seqloc */
4916
4917 {
4918 BioseqPtr bsp;
4919 Int4 len = -1L, tmp;
4920 SeqLocPtr slp;
4921 Boolean locked = FALSE;
4922 ErrSev logsev;
4923 Boolean average = FALSE;
4924 Int2 num;
4925 SeqIdPtr sip;
4926 BIG_ID gi;
4927 SeqMgrPtr smp;
4928 SeqLenLookupFunc func;
4929
4930
4931 if (anp == NULL)
4932 return len;
4933
4934 switch (anp->choice)
4935 {
4936 case SEQLOC_BOND: /* bond -- 2 seqs */
4937 case SEQLOC_FEAT: /* feat -- can't track yet */
4938 break;
4939 case SEQLOC_NULL: /* NULL */
4940 case SEQLOC_EMPTY: /* empty */
4941 len = 0;
4942 break;
4943 case SEQLOC_WHOLE: /* whole */
4944 sip = (SeqIdPtr) anp->data.ptrvalue;
4945 bsp = BioseqFindCore(sip);
4946 if (bsp == NULL) {
4947 if (sip != NULL && sip->choice == SEQID_GI) {
4948 gi = (BIG_ID) sip->data.intvalue;
4949 /* try registered service for rapid length lookup */
4950 smp = SeqMgrWriteLock ();
4951 if (smp != NULL) {
4952 func = smp->seq_len_lookup_func;
4953 SeqMgrUnlock ();
4954 if (func != NULL) {
4955 len = (*func) (gi);
4956 if (len > 0) break;
4957 }
4958 }
4959 }
4960 logsev = ErrSetLogLevel (SEV_MAX);
4961 bsp = BioseqLockById(sip);
4962 ErrSetLogLevel (logsev);
4963 if (bsp != NULL)
4964 locked = TRUE;
4965 }
4966 len = BioseqGetLen(bsp);
4967 if (locked)
4968 BioseqUnlock(bsp);
4969 break;
4970 case SEQLOC_EQUIV: /* equiv -- ditto */
4971 average = TRUE;
4972 case SEQLOC_MIX: /* mix -- more than one seq */
4973 case SEQLOC_PACKED_INT: /* packed int */
4974 slp = (SeqLocPtr)anp->data.ptrvalue;
4975 len = 0;
4976 num = 0;
4977 while (slp != NULL)
4978 {
4979 tmp = SeqLocLen(slp);
4980 if (tmp == -1)
4981 return -1;
4982 len += tmp;
4983 num++;
4984 slp = slp->next;
4985 }
4986 if (average && num != 0) {
4987 len /= num;
4988 }
4989 break;
4990 case SEQLOC_INT: /* int */
4991 len = ((SeqIntPtr)anp->data.ptrvalue)->to - ((SeqIntPtr)anp->data.ptrvalue)->from + 1;
4992 break;
4993 case SEQLOC_PNT: /* pnt */
4994 len = 1;
4995 break;
4996 case SEQLOC_PACKED_PNT: /* packed pnt */
4997 len = SeqLocStop(anp) - SeqLocStart(anp) + 1;
4998 break;
4999 default:
5000 break;
5001 }
5002 return len;
5003 }
5004
5005 /*****************************************************************************
5006 *
5007 * SeqLocRevCmp(loc)
5008 * reverse complements a SeqLoc
5009 * NO Check to be sure its on a nucleic acid
5010 *
5011 *****************************************************************************/
SeqLocRevCmp(SeqLocPtr anp)5012 NLM_EXTERN Boolean SeqLocRevCmp (SeqLocPtr anp) /* seqloc */
5013
5014 {
5015 SeqLocPtr slp, first, curr, prev;
5016 SeqPntPtr spp;
5017
5018
5019 if (anp == NULL)
5020 return FALSE;
5021
5022 switch (anp->choice)
5023 {
5024 case SEQLOC_BOND: /* bond -- 2 seqs */
5025 spp = ((SeqBondPtr)anp->data.ptrvalue)->a;
5026 spp->strand = StrandCmp(spp->strand);
5027 spp = ((SeqBondPtr)anp->data.ptrvalue)->b;
5028 if (spp != NULL)
5029 spp->strand = StrandCmp(spp->strand);
5030 break;
5031 case SEQLOC_FEAT: /* feat -- can't track yet */
5032 case SEQLOC_NULL: /* NULL */
5033 case SEQLOC_EMPTY: /* empty */
5034 case SEQLOC_WHOLE: /* whole */
5035 break;
5036 case SEQLOC_MIX: /* mix -- more than one seq */
5037 case SEQLOC_EQUIV: /* equiv -- ditto */
5038 case SEQLOC_PACKED_INT: /* packed int */
5039 slp = (SeqLocPtr)anp->data.ptrvalue;
5040 while (slp != NULL)
5041 {
5042 SeqLocRevCmp(slp); /* RevCmp subparts */
5043 slp = slp->next;
5044 }
5045 first = NULL;
5046 curr = NULL;
5047 prev = (SeqLocPtr)anp->data.ptrvalue;
5048 while (prev != NULL) /* reverse order of parts */
5049 { /* no effect on meaning of SEQLOC_EQUIV */
5050 slp = (SeqLocPtr)anp->data.ptrvalue;
5051 prev = NULL;
5052 while (slp->next != NULL)
5053 {
5054 prev = slp;
5055 slp = slp->next;
5056 }
5057 if (prev != NULL)
5058 prev->next = NULL;
5059 if (first == NULL)
5060 first = slp;
5061 else
5062 curr->next = slp;
5063 slp->next = NULL;
5064 curr = slp;
5065 }
5066 anp->data.ptrvalue = first;
5067 break;
5068 case SEQLOC_INT: /* int */
5069 ((SeqIntPtr)anp->data.ptrvalue)->strand = StrandCmp(((SeqIntPtr)anp->data.ptrvalue)->strand);
5070 break;
5071 case SEQLOC_PNT: /* pnt */
5072 ((SeqPntPtr)anp->data.ptrvalue)->strand = StrandCmp(((SeqPntPtr)anp->data.ptrvalue)->strand);
5073 break;
5074 case SEQLOC_PACKED_PNT: /* packed pnt */
5075 ((PackSeqPntPtr)anp->data.ptrvalue)->strand = StrandCmp(((PackSeqPntPtr)anp->data.ptrvalue)->strand);
5076 break;
5077 default:
5078 return FALSE;
5079 }
5080 return TRUE;
5081 }
5082
5083 /*****************************************************************************
5084 *
5085 * Uint1 StrandCmp(strand)
5086 * returns the complement of a Strand
5087 *
5088 *****************************************************************************/
StrandCmp(Uint1 strand)5089 NLM_EXTERN Uint1 StrandCmp (Uint1 strand)
5090
5091 {
5092 switch(strand)
5093 {
5094 case Seq_strand_unknown: /* default to plus for this */
5095 case Seq_strand_plus:
5096 return (Uint1) Seq_strand_minus;
5097 case Seq_strand_minus:
5098 return (Uint1) Seq_strand_plus;
5099 case Seq_strand_both:
5100 return (Uint1) Seq_strand_both_rev;
5101 case Seq_strand_both_rev:
5102 return (Uint1) Seq_strand_both;
5103 }
5104 return strand;
5105 }
5106
5107
DoStrandsMatch(Uint1 strand1,Uint2 strand2)5108 static Boolean DoStrandsMatch(Uint1 strand1, Uint2 strand2)
5109 {
5110 if (strand1 == Seq_strand_minus && strand2 == Seq_strand_minus) {
5111 return TRUE;
5112 } else if (strand1 != Seq_strand_minus && strand2 != Seq_strand_minus) {
5113 return TRUE;
5114 } else {
5115 return FALSE;
5116 }
5117 }
5118
5119
SeqLocMixFromPackedSeqPnt(PackSeqPntPtr pspp)5120 static SeqLocPtr SeqLocMixFromPackedSeqPnt (PackSeqPntPtr pspp)
5121 {
5122 SeqPntPtr pnt;
5123 SeqLocPtr list = NULL, slp = NULL;
5124 Uint1 i;
5125
5126 if (pspp == NULL)
5127 {
5128 return NULL;
5129 }
5130
5131 while (pspp != NULL)
5132 {
5133 for (i = 0; i < pspp->used; i++)
5134 {
5135 pnt = SeqPntNew();
5136 pnt->id = SeqIdDup (pspp->id);
5137 pnt->strand = pspp->strand;
5138 pnt->point = pspp->pnts[i];
5139 ValNodeAddPointer (&list, SEQLOC_PNT, pnt);
5140 }
5141 pspp = pspp->next;
5142 }
5143 slp = ValNodeNew (NULL);
5144 slp->choice = SEQLOC_MIX;
5145 slp->data.ptrvalue = list;
5146 return slp;
5147 }
5148
5149
SeqLocMixFromSeqBond(SeqBondPtr sbp)5150 static SeqLocPtr SeqLocMixFromSeqBond (SeqBondPtr sbp)
5151 {
5152 SeqPntPtr pnt;
5153 SeqLocPtr list = NULL, slp = NULL;
5154
5155 if (sbp == NULL || (sbp->a == NULL && sbp->b == NULL)) {
5156 return NULL;
5157 }
5158 if (sbp->a != NULL) {
5159 pnt = AsnIoMemCopy (sbp->a, (AsnReadFunc) SeqPntAsnRead, (AsnWriteFunc) SeqPntAsnWrite);
5160 ValNodeAddPointer (&list, SEQLOC_PNT, pnt);
5161 }
5162 if (sbp->b != NULL) {
5163 pnt = AsnIoMemCopy (sbp->b, (AsnReadFunc) SeqPntAsnRead, (AsnWriteFunc) SeqPntAsnWrite);
5164 ValNodeAddPointer (&list, SEQLOC_PNT, pnt);
5165 }
5166 slp = ValNodeNew (NULL);
5167 slp->choice = SEQLOC_MIX;
5168 slp->data.ptrvalue = list;
5169 return slp;
5170 }
5171
5172 static
CreateSortedSeqLoc_comparator(VoidPtr ptr1,VoidPtr ptr2)5173 int LIBCALLBACK CreateSortedSeqLoc_comparator (VoidPtr ptr1, VoidPtr ptr2)
5174 {
5175 SeqLocPtr loc_piece1 = *(SeqLocPtr PNTR)ptr1;
5176 SeqLocPtr loc_piece2 = *(SeqLocPtr PNTR)ptr2;
5177 SeqIdPtr sip1;
5178 SeqIdPtr sip2;
5179 Char sip_name1[50];
5180 Char sip_name2[50];
5181 Int4 sip_name_comp;
5182 Int4 start1;
5183 Int4 start2;
5184 Int4 end1;
5185 Int4 end2;
5186
5187 sip1 = SeqLocId( loc_piece1 );
5188 sip2 = SeqLocId( loc_piece2 );
5189 if( NULL == sip1 && NULL != sip2 ) {
5190 return -1;
5191 } else if( NULL != sip1 && NULL == sip2 ) {
5192 return 1;
5193 } else if( NULL != sip1 && NULL != sip2 ) {
5194 /* compare Seq-ids */
5195 if( ! seqid_name( sip1, sip_name1, FALSE, FALSE ) ) {
5196 sip_name1[0] = '\0';
5197 }
5198 if( ! seqid_name( sip2, sip_name2, FALSE, FALSE ) ) {
5199 sip_name2[0] = '\0';
5200 }
5201 sip_name_comp = StrCmp( sip_name1, sip_name2 );
5202 if( 0 != sip_name_comp ) {
5203 return sip_name_comp;
5204 }
5205 }
5206
5207 start1 = SeqLocStart(loc_piece1);
5208 start2 = SeqLocStart(loc_piece2);
5209 if( start1 != start2 ) {
5210 return (start1 - start2);
5211 }
5212
5213 end1 = SeqLocStop(loc_piece1);
5214 end2 = SeqLocStop(loc_piece2);
5215 return (end2 - end1);
5216 }
5217
5218 /* Note that this doesn't return a SeqLocPtr because it's not creating
5219 a real usable SeqLoc. Rather, it's returning an array of pointers
5220 into the given loc which points to them in order. */
5221 static SeqLocPtr PNTR
CreateSortedSeqLoc(SeqLocPtr loc,Uint4Ptr out_len)5222 CreateSortedSeqLoc( SeqLocPtr loc, Uint4Ptr out_len )
5223 {
5224 Int4 jj = 0;
5225 SeqLocPtr loc_piece = NULL;
5226 SeqLocPtr PNTR retval = NULL;
5227
5228 *out_len = 0;
5229
5230 /* First, see how big loc is */
5231 loc_piece = (SeqLocPtr)loc->data.ptrvalue;
5232 while( NULL != loc_piece ) {
5233 ++(*out_len);
5234 loc_piece = loc_piece->next;
5235 }
5236
5237 /* allocate enough memory to fit everything, and copy
5238 the (not-yet-sorted) pointers over */
5239 loc_piece = (SeqLocPtr)loc->data.ptrvalue;
5240 retval = (SeqLocPtr PNTR) MemNew( sizeof(SeqLocPtr) * (*out_len) );
5241 for( jj = 0; jj < (*out_len); ++jj ) {
5242 retval[jj] = loc_piece;
5243 loc_piece = loc_piece->next;
5244 }
5245
5246 /* now, sort what we have */
5247 StableMergeSort( retval, (*out_len), sizeof(SeqLocPtr),
5248 CreateSortedSeqLoc_comparator );
5249 return retval;
5250 }
5251
CompareMultiPartLocToMultiPartLoc(SeqLocPtr a,SeqLocPtr b,Boolean compare_strand)5252 static Int2 CompareMultiPartLocToMultiPartLoc(SeqLocPtr a, SeqLocPtr b, Boolean compare_strand)
5253 {
5254 Boolean got_one = FALSE; /* for any overlap */
5255 Int2 retval = SLC_NO_MATCH,
5256 retval2 = SLC_NO_MATCH;
5257 /* Points to the pieces of a and b in sorted order */
5258 SeqLocPtr PNTR a_sorted = NULL;
5259 Uint4 a_sorted_len = 0;
5260 Uint4 a_idx = 0; /* used to iterate through */
5261 SeqLocPtr PNTR b_sorted = NULL;
5262 Uint4 b_sorted_len = 0;
5263 Uint4 b_idx = 0; /* used to iterate through */
5264
5265 if (a == NULL || b == NULL) {
5266 return SLC_NO_MATCH;
5267 }
5268 if (a->choice != SEQLOC_MIX && a->choice != SEQLOC_EQUIV && a->choice != SEQLOC_PACKED_INT) {
5269 return SLC_NO_MATCH;
5270 }
5271 if (b->choice != SEQLOC_MIX && b->choice != SEQLOC_EQUIV && b->choice != SEQLOC_PACKED_INT) {
5272 return SLC_NO_MATCH;
5273 }
5274
5275 /* create an array of pointers to the pieces of the seqloc, in order */
5276 a_sorted = CreateSortedSeqLoc( a, &a_sorted_len );
5277 b_sorted = CreateSortedSeqLoc( b, &b_sorted_len );
5278
5279 /* check for identity */
5280 retval = SeqLocCompareEx(a_sorted[0], b_sorted[0], compare_strand);
5281 a_idx = 1;
5282 b_idx = 1;
5283 while ((a_idx < a_sorted_len) && (b_idx < b_sorted_len) && (retval == SLC_A_EQ_B))
5284 {
5285 retval = SeqLocCompareEx(a_sorted[a_idx], b_sorted[b_idx], compare_strand);
5286 ++a_idx;
5287 ++b_idx;
5288 }
5289 if ((a_idx == a_sorted_len) && (b_idx == b_sorted_len) && (retval == SLC_A_EQ_B))
5290 goto done;
5291
5292 /* check for a in b */
5293 a_idx = 0;
5294 b_idx = 0;
5295 while ((a_idx < a_sorted_len) && (b_idx < b_sorted_len))
5296 {
5297 retval2 = SeqLocCompareEx(a_sorted[a_idx], b_sorted[b_idx], compare_strand);
5298 if (retval2 > SLC_NO_MATCH)
5299 got_one = TRUE;
5300 switch (retval2)
5301 {
5302 case SLC_NO_MATCH:
5303 ++b_idx;
5304 break;
5305 case SLC_A_EQ_B:
5306 ++a_idx;
5307 ++b_idx;
5308 break;
5309 case SLC_A_IN_B:
5310 ++a_idx;
5311 break;
5312 case SLC_B_IN_A:
5313 case SLC_A_OVERLAP_B:
5314 b_idx = b_sorted_len;
5315 break;
5316 }
5317 }
5318 if (a_idx == a_sorted_len) { /* a all in b */
5319 retval = SLC_A_IN_B;
5320 goto done;
5321 }
5322
5323 /* check for b in a */
5324 a_idx = 0;
5325 b_idx = 0;
5326 while ((a_idx < a_sorted_len) && (b_idx < b_sorted_len))
5327 {
5328 retval2 = SeqLocCompareEx(b_sorted[b_idx], a_sorted[a_idx], compare_strand);
5329 if (retval2 > SLC_NO_MATCH)
5330 got_one = TRUE;
5331 switch (retval2)
5332 {
5333 case SLC_NO_MATCH:
5334 ++a_idx;
5335 break;
5336 case SLC_A_EQ_B:
5337 ++a_idx;
5338 ++b_idx;
5339 break;
5340 case SLC_A_IN_B:
5341 ++b_idx;
5342 break;
5343 case SLC_B_IN_A:
5344 case SLC_A_OVERLAP_B:
5345 a_idx = a_sorted_len;
5346 break;
5347 }
5348 }
5349 if (b_idx == b_sorted_len) { /* b all in a */
5350 retval = SLC_B_IN_A;
5351 goto done;
5352 }
5353
5354 if (got_one) {
5355 retval = SLC_A_OVERLAP_B;
5356 goto done;
5357 }
5358
5359 /* goto here instead of just calling "return" so we can clean up */
5360 done:
5361
5362 if( NULL != a_sorted ) {
5363 a_sorted = MemFree(a_sorted);
5364 }
5365 if( NULL != b_sorted ) {
5366 b_sorted = MemFree(b_sorted);
5367 }
5368
5369 return retval;
5370 }
5371
5372 /*****************************************************************************
5373 *
5374 * SeqLocCompare(a, b)
5375 * returns
5376 * 0 = no overlap
5377 * 1 = a is completely contained in b
5378 * 2 = b is completely contained in a
5379 * 3 = a == b
5380 * 4 = a and b overlap, but neither completely contained in the other
5381 *
5382 *
5383 *****************************************************************************/
SeqLocCompareEx(SeqLocPtr a,SeqLocPtr b,Boolean compare_strand)5384 NLM_EXTERN Int2 SeqLocCompareEx (SeqLocPtr a, SeqLocPtr b, Boolean compare_strand) /* seqloc */
5385
5386 {
5387 BioseqPtr bsp;
5388 Int4 len = -1L, i, j, num, num2, point, hits;
5389 Uint1 strand;
5390 SeqLocPtr slp, tmp_a = NULL, tmp_b = NULL;
5391 ValNode tmp;
5392 SeqBondPtr sbp;
5393 SeqIntPtr sip, sip2;
5394 SeqIdPtr sidp;
5395 PackSeqPntPtr pspp, pspp2;
5396 Boolean got_one, missed_one, locked = FALSE;
5397 Int2 retval = SLC_NO_MATCH,
5398 retval2 = SLC_NO_MATCH;
5399 static Uint1 rettable [5][5] = { /* for developing return values */
5400 { 0,4,2,2,4 } , /* when a is longer than b */
5401 { 4,1,4,1,4 } ,
5402 { 2,4,2,2,4 } ,
5403 { 2,1,2,3,4 } ,
5404 { 4,4,4,4,4 }};
5405 static Uint1 rettable2 [5][5] = { /* for developing return values */
5406 { 0,1,4,1,4 } , /* when b is longer than a */
5407 { 1,1,1,1,1 } ,
5408 { 4,1,2,2,4 } ,
5409 { 1,1,4,3,4 } ,
5410 { 4,1,4,4,4 }};
5411
5412 if ((a == NULL) || (b == NULL))
5413 return retval;
5414
5415 switch (a->choice)
5416 {
5417 case SEQLOC_MIX: /* mix -- more than one seq */
5418 case SEQLOC_EQUIV: /* equiv -- ditto */
5419 case SEQLOC_PACKED_INT: /* packed int */
5420 case SEQLOC_PACKED_PNT: /* packed points (need to convert to SEQLOC_MIX) */
5421 case SEQLOC_BOND: /* bond (need to convert to SEQLOC_MIX) */
5422 if (a->choice == SEQLOC_PACKED_PNT)
5423 {
5424 tmp_a = SeqLocMixFromPackedSeqPnt ((PackSeqPntPtr)a->data.ptrvalue);
5425 a = tmp_a;
5426 }
5427 else if (a->choice == SEQLOC_BOND)
5428 {
5429 tmp_a = SeqLocMixFromSeqBond ((SeqBondPtr)a->data.ptrvalue);
5430 a = tmp_a;
5431 }
5432 if ((b->choice == SEQLOC_MIX) || /* check for identity */
5433 (b->choice == SEQLOC_EQUIV) ||
5434 (b->choice == SEQLOC_PACKED_INT) ||
5435 (b->choice == SEQLOC_PACKED_PNT) ||
5436 (b->choice == SEQLOC_BOND))
5437 {
5438 if (b->choice == SEQLOC_PACKED_PNT)
5439 {
5440 tmp_b = SeqLocMixFromPackedSeqPnt ((PackSeqPntPtr)b->data.ptrvalue);
5441 b = tmp_b;
5442 }
5443 else if (b->choice == SEQLOC_BOND)
5444 {
5445 tmp_b = SeqLocMixFromSeqBond ((SeqBondPtr)b->data.ptrvalue);
5446 b = tmp_b;
5447 }
5448 retval = CompareMultiPartLocToMultiPartLoc (a, b, compare_strand);
5449 if (retval != SLC_NO_MATCH) {
5450 tmp_a = SeqLocFree (tmp_a);
5451 tmp_b = SeqLocFree (tmp_b);
5452 return retval;
5453 }
5454 }
5455
5456 slp = (SeqLocPtr)a->data.ptrvalue; /* check for any overlap */
5457 retval = SeqLocCompareEx(slp, b, compare_strand);
5458 slp = slp->next;
5459 while (slp != NULL)
5460 {
5461 retval2 = SeqLocCompareEx(slp, b, compare_strand);
5462 retval = (Int2) rettable[retval][retval2];
5463 slp = slp->next;
5464 }
5465 tmp_a = SeqLocFree (tmp_a);
5466 tmp_b = SeqLocFree (tmp_b);
5467 return retval;
5468 break;
5469 default:
5470 break;
5471 }
5472 switch (b->choice)
5473 {
5474 case SEQLOC_MIX: /* mix -- more than one seq */
5475 case SEQLOC_EQUIV: /* equiv -- ditto */
5476 case SEQLOC_PACKED_INT: /* packed int */
5477 slp = (SeqLocPtr)b->data.ptrvalue;
5478 retval = SeqLocCompareEx(a, slp, compare_strand);
5479 slp = slp->next;
5480 while (slp != NULL)
5481 {
5482 retval2 = SeqLocCompareEx(a, slp, compare_strand);
5483 retval = (Int2)rettable2[retval][retval2];
5484 slp = slp->next;
5485 }
5486 return retval;
5487 break;
5488 default:
5489 break;
5490 }
5491
5492 tmp.next = NULL;
5493 switch (a->choice)
5494 {
5495 case SEQLOC_NULL: /* NULL, can't match */
5496 if (b->choice == SEQLOC_NULL)
5497 retval = SLC_A_EQ_B;
5498 break;
5499 case SEQLOC_FEAT: /* feat -- can't track yet */
5500 break;
5501 case SEQLOC_EMPTY: /* empty */
5502 if (b->choice == SEQLOC_EMPTY)
5503 {
5504 if (SeqIdForSameBioseq((SeqIdPtr)a->data.ptrvalue, (SeqIdPtr)b->data.ptrvalue))
5505 retval = SLC_A_EQ_B;
5506 }
5507 break;
5508 case SEQLOC_BOND: /* bond -- 2 seqs */
5509 sbp = (SeqBondPtr)a->data.ptrvalue;
5510 tmp.choice = SEQLOC_PNT; /* check the points */
5511 tmp.data.ptrvalue = (Pointer)sbp->a;
5512 retval = SeqLocCompareEx(&tmp, b, compare_strand);
5513 if (sbp->b != NULL)
5514 {
5515 tmp.data.ptrvalue = (Pointer)sbp->b;
5516 retval2 = SeqLocCompareEx(&tmp, b, compare_strand);
5517 retval = (Int2) rettable[retval][retval2];
5518 }
5519 break;
5520 case SEQLOC_WHOLE: /* whole */
5521 sidp = (SeqIdPtr)a->data.ptrvalue;
5522 switch (b->choice)
5523 {
5524 case SEQLOC_BOND: /* bond -- 2 seqs */
5525 sbp = (SeqBondPtr)b->data.ptrvalue;
5526 if (SeqIdForSameBioseq(sbp->a->id, sidp))
5527 retval = SLC_B_IN_A;
5528 if (sbp->b != NULL)
5529 {
5530 if (SeqIdForSameBioseq(sbp->b->id, sidp))
5531 retval2 = SLC_B_IN_A;
5532 retval = (Int2) rettable2[retval][retval2];
5533 }
5534 break;
5535 case SEQLOC_WHOLE: /* whole */
5536 if (SeqIdForSameBioseq(sidp, (SeqIdPtr)b->data.ptrvalue))
5537 retval = SLC_A_EQ_B;
5538 break;
5539 case SEQLOC_INT: /* int */
5540 sip = (SeqIntPtr)b->data.ptrvalue;
5541 if (SeqIdForSameBioseq(sidp, sip->id))
5542 {
5543 retval = SLC_B_IN_A;
5544 bsp = BioseqFindCore(sidp);
5545 if (bsp == NULL)
5546 {
5547 bsp = BioseqLockById(sidp);
5548 if (bsp != NULL)
5549 locked = TRUE;
5550 }
5551 if (bsp != NULL)
5552 {
5553 len = BioseqGetLen(bsp);
5554 if ((sip->from == 0) && (sip->to == (len - 1)))
5555 retval = SLC_A_EQ_B;
5556 }
5557 if (locked)
5558 BioseqUnlock(bsp);
5559 }
5560 break;
5561 case SEQLOC_PNT: /* pnt */
5562 if (SeqIdForSameBioseq(sidp, ((SeqPntPtr)b->data.ptrvalue)->id))
5563 retval = SLC_B_IN_A;
5564 break;
5565 case SEQLOC_PACKED_PNT: /* packed pnt */
5566 got_one = FALSE;
5567 missed_one = FALSE;
5568 for (pspp = (PackSeqPntPtr)b->data.ptrvalue;
5569 pspp != NULL;
5570 pspp = pspp->next)
5571 {
5572 if (SeqIdForSameBioseq(sidp, pspp->id))
5573 {
5574 got_one = TRUE;
5575 }
5576 else
5577 {
5578 missed_one = TRUE;
5579 }
5580 }
5581 if (got_one)
5582 {
5583 if (missed_one)
5584 {
5585 retval = SLC_A_OVERLAP_B;
5586 }
5587 else
5588 {
5589 retval = SLC_B_IN_A;
5590 }
5591 }
5592 break;
5593 default:
5594 break;
5595 }
5596 break;
5597 case SEQLOC_INT: /* int */
5598 sip = (SeqIntPtr)a->data.ptrvalue;
5599 sidp = sip->id;
5600 switch (b->choice)
5601 {
5602 case SEQLOC_BOND: /* bond -- 2 seqs */
5603 sbp = (SeqBondPtr)b->data.ptrvalue;
5604 if (SeqIdForSameBioseq(sbp->a->id, sidp))
5605 {
5606 if ((sip->from <= sbp->a->point) &&
5607 (sip->to >= sbp->a->point) &&
5608 (!compare_strand || DoStrandsMatch(sip->strand, sbp->a->strand)))
5609 {
5610 retval = SLC_B_IN_A;
5611 }
5612 }
5613 if (sbp->b != NULL)
5614 {
5615 if (SeqIdForSameBioseq(sbp->b->id, sidp))
5616 {
5617 if ((sip->from <= sbp->b->point) &&
5618 (sip->to >= sbp->b->point) &&
5619 (!compare_strand || DoStrandsMatch(sip->strand, sbp->b->strand)))
5620 {
5621 retval2 = SLC_B_IN_A;
5622 }
5623 }
5624 retval = (Int2) rettable2[retval][retval2];
5625 }
5626 break;
5627 case SEQLOC_WHOLE: /* whole */
5628 if (SeqIdForSameBioseq(sidp, (SeqIdPtr)b->data.ptrvalue))
5629 {
5630 retval = SLC_A_IN_B;
5631 bsp = BioseqFindCore((SeqIdPtr)b->data.ptrvalue);
5632 if (bsp == NULL)
5633 {
5634 bsp = BioseqLockById((SeqIdPtr)b->data.ptrvalue);
5635 if (bsp != NULL)
5636 locked = TRUE;
5637 }
5638 if (bsp != NULL)
5639 {
5640 len = BioseqGetLen(bsp);
5641 if ((sip->from == 0) && (sip->to == (len - 1)))
5642 retval = SLC_A_EQ_B;
5643 }
5644 if (locked)
5645 BioseqUnlock(bsp);
5646 }
5647 break;
5648 case SEQLOC_INT: /* int */
5649 sip2 = (SeqIntPtr)b->data.ptrvalue;
5650 if (SeqIdForSameBioseq(sidp, sip2->id)
5651 && (!compare_strand || DoStrandsMatch (sip->strand, sip2->strand)))
5652 {
5653 if ((sip->from == sip2->from) && (sip->to == sip2->to))
5654 retval = SLC_A_EQ_B;
5655 else if ((sip->from <= sip2->from) && (sip->to >= sip2->to))
5656 retval = SLC_B_IN_A;
5657 else if ((sip->from >= sip2->from) && (sip->to <= sip2->to))
5658 retval = SLC_A_IN_B;
5659 else if ((sip->from >= sip2->from) && (sip->from <= sip2->to))
5660 retval = SLC_A_OVERLAP_B;
5661 else if ((sip->to >= sip2->from) && (sip->to <= sip2->to))
5662 retval = SLC_A_OVERLAP_B;
5663 }
5664 break;
5665 case SEQLOC_PNT: /* pnt */
5666 if (SeqIdForSameBioseq(sidp, ((SeqPntPtr)b->data.ptrvalue)->id)
5667 && (!compare_strand || DoStrandsMatch (sip->strand, ((SeqPntPtr)b->data.ptrvalue)->strand)))
5668 {
5669 point = ((SeqPntPtr)b->data.ptrvalue)->point;
5670 if ((point == sip->from) && (point == sip->to))
5671 retval = SLC_A_EQ_B;
5672 else if ((point >= sip->from) && (point <= sip->to))
5673 retval = SLC_B_IN_A;
5674 }
5675 break;
5676 case SEQLOC_PACKED_PNT: /* packed pnt */
5677 pspp = (PackSeqPntPtr)b->data.ptrvalue;
5678 got_one = FALSE;
5679 missed_one = FALSE;
5680 while (pspp != NULL)
5681 {
5682 if (SeqIdForSameBioseq(sidp, pspp->id)
5683 && (!compare_strand || DoStrandsMatch (sip->strand, pspp->strand)))
5684 {
5685 num = pspp->used;
5686 for (i = 0; i < num; i++)
5687 {
5688 point = pspp->pnts[i];
5689 if ((point < sip->from) || (point > sip->to))
5690 {
5691 missed_one = TRUE;
5692 }
5693 else
5694 {
5695 got_one = TRUE;
5696 }
5697 }
5698 }
5699 pspp = pspp->next;
5700 }
5701 if (got_one)
5702 {
5703 if (missed_one)
5704 retval = SLC_A_OVERLAP_B;
5705 else
5706 retval = SLC_B_IN_A;
5707 }
5708 break;
5709 default:
5710 break;
5711 }
5712 break;
5713 case SEQLOC_PNT: /* pnt */
5714 sidp = ((SeqPntPtr)a->data.ptrvalue)->id;
5715 point = ((SeqPntPtr)a->data.ptrvalue)->point;
5716 strand = ((SeqPntPtr)a->data.ptrvalue)->strand;
5717 switch (b->choice)
5718 {
5719 case SEQLOC_BOND: /* bond -- 2 seqs */
5720 sbp = (SeqBondPtr)b->data.ptrvalue;
5721 if (SeqIdForSameBioseq(sbp->a->id, sidp)
5722 && (!compare_strand || DoStrandsMatch (sbp->a->strand, strand)))
5723 {
5724 if (point == sbp->a->point)
5725 retval = SLC_A_EQ_B;
5726 }
5727 if (sbp->b != NULL)
5728 {
5729 if (SeqIdForSameBioseq(sbp->b->id, sidp)
5730 && (!compare_strand || DoStrandsMatch (sbp->b->strand, strand)))
5731 {
5732 if (point == sbp->b->point)
5733 retval2 = SLC_A_EQ_B;
5734 }
5735 retval = (Int2) rettable2[retval][retval2];
5736 }
5737 break;
5738 case SEQLOC_WHOLE: /* whole */
5739 if (SeqIdForSameBioseq(sidp, (SeqIdPtr)b->data.ptrvalue))
5740 retval = SLC_A_IN_B;
5741 break;
5742 case SEQLOC_INT: /* int */
5743 sip2 = (SeqIntPtr)b->data.ptrvalue;
5744 if (SeqIdForSameBioseq(sidp, sip2->id)
5745 && (!compare_strand || DoStrandsMatch (sip2->strand, strand)))
5746 {
5747 if ((point == sip2->from) && (point == sip2->to))
5748 retval = SLC_A_EQ_B;
5749 else if ((point >= sip2->from) && (point <= sip2->to))
5750 retval = SLC_A_IN_B;
5751 }
5752 break;
5753 case SEQLOC_PNT: /* pnt */
5754 if (SeqIdForSameBioseq(sidp, ((SeqPntPtr)b->data.ptrvalue)->id)
5755 && (!compare_strand || DoStrandsMatch (strand, ((SeqPntPtr)b->data.ptrvalue)->strand)))
5756 {
5757 if (point == ((SeqPntPtr)b->data.ptrvalue)->point)
5758 retval = SLC_A_EQ_B;
5759 }
5760 break;
5761 case SEQLOC_PACKED_PNT: /* packed pnt */
5762 pspp = (PackSeqPntPtr)b->data.ptrvalue;
5763 got_one = FALSE;
5764 missed_one = FALSE;
5765 while (pspp != NULL) {
5766 if (SeqIdForSameBioseq(sidp, pspp->id)
5767 && (!compare_strand || DoStrandsMatch (strand, pspp->strand)))
5768 {
5769 num = pspp->used;
5770 for (i = 0; i < num; i++)
5771 {
5772 if (point == pspp->pnts[i])
5773 {
5774 got_one = TRUE;
5775 }
5776 else
5777 {
5778 missed_one = TRUE;
5779 }
5780 }
5781 }
5782 else
5783 {
5784 missed_one = TRUE;
5785 }
5786 pspp = pspp->next;
5787 }
5788 if (got_one)
5789 {
5790 if (missed_one)
5791 {
5792 retval = SLC_A_IN_B;
5793 }
5794 else
5795 {
5796 retval = SLC_A_EQ_B;
5797 }
5798 }
5799 break;
5800 default:
5801 break;
5802 }
5803 break;
5804 case SEQLOC_PACKED_PNT: /* packed pnt */
5805 pspp = (PackSeqPntPtr)a->data.ptrvalue;
5806 num = PackSeqPntNum(pspp);
5807 sidp = pspp->id;
5808 switch (b->choice)
5809 {
5810 case SEQLOC_BOND: /* bond -- 2 seqs */
5811 sbp = (SeqBondPtr)b->data.ptrvalue;
5812 if (SeqIdForSameBioseq(sbp->a->id, sidp)
5813 && (!compare_strand || DoStrandsMatch (pspp->strand, sbp->a->strand)))
5814 {
5815 point = sbp->a->point;
5816 for (i = 0; i < num; i++)
5817 {
5818 if (point == PackSeqPntGet(pspp, i))
5819 {
5820 retval = SLC_B_IN_A;
5821 i = num;
5822 }
5823 }
5824 }
5825 if (sbp->b != NULL)
5826 {
5827 if (SeqIdForSameBioseq(sbp->b->id, sidp)
5828 && (!compare_strand || DoStrandsMatch(pspp->strand, sbp->b->strand)))
5829 {
5830 point = sbp->b->point;
5831 for (i = 0; i < num; i++)
5832 {
5833 if (point == PackSeqPntGet(pspp, i))
5834 {
5835 if (retval != SLC_B_IN_A)
5836 retval = SLC_A_OVERLAP_B;
5837 i = num + 1;
5838 }
5839 }
5840 if ((i != num) && (retval == SLC_B_IN_A))
5841 retval = SLC_A_OVERLAP_B;
5842 }
5843 }
5844 break;
5845 case SEQLOC_WHOLE: /* whole */
5846 if (SeqIdForSameBioseq(sidp, (SeqIdPtr)b->data.ptrvalue))
5847 retval = SLC_A_IN_B;
5848 break;
5849 case SEQLOC_INT: /* int */
5850 sip = (SeqIntPtr)b->data.ptrvalue;
5851 if (SeqIdForSameBioseq(sidp, sip->id)
5852 && (!compare_strand || DoStrandsMatch(sip->strand, pspp->strand)))
5853 {
5854 got_one = FALSE;
5855 missed_one = FALSE;
5856 for (i = 0; i < num; i++)
5857 {
5858 point = PackSeqPntGet(pspp, i);
5859 if ((point < sip->from) || (point > sip->to))
5860 {
5861 missed_one = TRUE;
5862 if (got_one)
5863 i = num + 1;
5864 }
5865 else
5866 {
5867 got_one = TRUE;
5868 if (missed_one)
5869 i = num + 1;
5870 }
5871 }
5872 if (got_one)
5873 {
5874 if (missed_one)
5875 retval = SLC_A_OVERLAP_B;
5876 else
5877 retval = SLC_A_IN_B;
5878 }
5879 }
5880 break;
5881 case SEQLOC_PNT: /* pnt */
5882 if (SeqIdForSameBioseq(sidp, ((SeqPntPtr)b->data.ptrvalue)->id)
5883 && (!compare_strand || DoStrandsMatch (pspp->strand, ((SeqPntPtr)b->data.ptrvalue)->strand)))
5884 {
5885 point = ((SeqPntPtr)b->data.ptrvalue)->point;
5886 for (i = 0; i < num; i++)
5887 {
5888 if (point == PackSeqPntGet(pspp, i))
5889 {
5890 retval = SLC_B_IN_A;
5891 i = num + 1;
5892 }
5893 }
5894 }
5895 break;
5896 case SEQLOC_PACKED_PNT: /* packed pnt */
5897 pspp2 = (PackSeqPntPtr)b->data.ptrvalue;
5898 if (SeqIdForSameBioseq(sidp, pspp->id)
5899 && (!compare_strand || DoStrandsMatch(pspp->strand, pspp2->strand)))
5900 {
5901 num2 = PackSeqPntNum(pspp2);
5902 if (num == num2) /* check for identity */
5903 {
5904 for (i = 0; i < num; i++)
5905 {
5906 if ( PackSeqPntGet(pspp, i) !=
5907 PackSeqPntGet(pspp2, i))
5908 i = num + 1;
5909 }
5910 if (i == num)
5911 retval = SLC_A_EQ_B;
5912 }
5913 if (retval != SLC_A_EQ_B)
5914 {
5915 hits = 0;
5916 for (i = 0; i < num; i++)
5917 {
5918 point = PackSeqPntGet(pspp, i);
5919 for (j = 0; j < num2; j++)
5920 {
5921 if (point == PackSeqPntGet(pspp2, j))
5922 hits++;
5923 }
5924 }
5925 if (hits == num)
5926 retval = SLC_A_IN_B;
5927 else if (hits == num2)
5928 retval = SLC_B_IN_A;
5929 }
5930 }
5931 break;
5932 default:
5933 break;
5934 }
5935 break;
5936 default:
5937 break;
5938 }
5939 return retval;
5940 }
5941
5942
SeqLocCompare(SeqLocPtr a,SeqLocPtr b)5943 NLM_EXTERN Int2 SeqLocCompare (SeqLocPtr a, SeqLocPtr b) /* seqloc */
5944 {
5945 return SeqLocCompareEx (a, b, FALSE);
5946 }
5947
5948
ComplementLocCompare(Uint1 val)5949 static Uint1 ComplementLocCompare (Uint1 val)
5950 {
5951 if (val == SLC_A_IN_B) {
5952 val = SLC_B_IN_A;
5953 } else if (val == SLC_B_IN_A) {
5954 val = SLC_A_IN_B;
5955 }
5956 return val;
5957 }
5958
5959
CheckSeqLocCompResults(SeqLocPtr a,SeqLocPtr b,Uint1 allow_strand,Uint1 check_strand)5960 static Boolean CheckSeqLocCompResults (SeqLocPtr a, SeqLocPtr b, Uint1 allow_strand, Uint1 check_strand)
5961 {
5962 Boolean rval = TRUE;
5963
5964 if (SeqLocCompare(a, b) != allow_strand) {
5965 rval = FALSE;
5966 } else if (SeqLocCompareEx(a, b, TRUE) != check_strand) {
5967 rval = FALSE;
5968 } else if (SeqLocCompare(b, a) != ComplementLocCompare(allow_strand)) {
5969 rval = FALSE;
5970 } else if (SeqLocCompareEx(b, a, TRUE) != ComplementLocCompare(check_strand)) {
5971 rval = FALSE;
5972 }
5973 return rval;
5974 }
5975
5976
UnitTestSeqLocCompare(void)5977 NLM_EXTERN Boolean UnitTestSeqLocCompare (void)
5978 {
5979 SeqLocPtr a, b;
5980 SeqIdPtr sip, sip2 = NULL;
5981 SeqIntPtr sint1, sint2, sint3, sint4;
5982 TextSeqIdPtr tsip, tsip2 = NULL;
5983 ValNodePtr list = NULL, list2 = NULL;
5984 SeqPntPtr pnt1, pnt2, pnt3, pnt4;
5985 PackSeqPntPtr pspp1, pspp2;
5986 SeqBondPtr sbp1, sbp2;
5987 Boolean rval = FALSE;
5988
5989 a = ValNodeNew (NULL);
5990
5991 b = ValNodeNew (NULL);
5992
5993 tsip = TextSeqIdNew ();
5994 tsip->accession = StringSave ("AY123456");
5995 sip = ValNodeNew (NULL);
5996 sip->choice = SEQID_GENBANK;
5997 sip->data.ptrvalue = tsip;
5998
5999 tsip2 = TextSeqIdNew ();
6000 tsip2->accession = StringSave ("AY123457");
6001 sip2 = ValNodeNew (NULL);
6002 sip2->choice = SEQID_GENBANK;
6003 sip2->data.ptrvalue = tsip2;
6004
6005 sint1 = SeqIntNew ();
6006 sint1->id = sip;
6007 sint1->from = 0;
6008 sint1->to = 10;
6009
6010 sint2 = SeqIntNew ();
6011 sint2->id = sip;
6012 sint2->from = 15;
6013 sint2->to = 25;
6014
6015 sint3 = SeqIntNew ();
6016 sint3->id = sip;
6017 sint3->from = 0;
6018 sint3->to = 10;
6019
6020 sint4 = SeqIntNew ();
6021 sint4->id = sip;
6022 sint4->from = 15;
6023 sint4->to = 25;
6024
6025 pnt1 = SeqPntNew ();
6026 pnt1->id = sip;
6027 pnt1->point = 5;
6028
6029 pnt2 = SeqPntNew ();
6030 pnt2->id = sip;
6031 pnt2->point = 16;
6032
6033 pnt3 = SeqPntNew ();
6034 pnt3->id = sip;
6035 pnt3->point = 5;
6036
6037 pnt4 = SeqPntNew ();
6038 pnt4->id = sip;
6039 pnt4->point = 16;
6040
6041 sbp1 = SeqBondNew ();
6042 sbp1->a = pnt1;
6043 sbp1->b = pnt2;
6044
6045 sbp2 = SeqBondNew ();
6046 sbp2->a = pnt3;
6047 sbp2->b = pnt4;
6048
6049 pspp1 = PackSeqPntNew ();
6050 pspp1->id = sip;
6051 pspp1->used = 2;
6052 pspp1->pnts[0] = 5;
6053 pspp1->pnts[1] = 16;
6054
6055 pspp2 = PackSeqPntNew ();
6056 pspp2->id = sip;
6057 pspp2->used = 2;
6058 pspp2->pnts[0] = 5;
6059 pspp2->pnts[1] = 16;
6060
6061 /* NULL */
6062 /* NULL vs NULL */
6063 a->choice = SEQLOC_NULL;
6064 b->choice = SEQLOC_NULL;
6065 if (!CheckSeqLocCompResults(a, b, SLC_A_EQ_B, SLC_A_EQ_B)) {
6066 goto UnitTestSeqLocCompare_end;
6067 }
6068 /* NULL vs EMPTY */
6069 b->choice = SEQLOC_EMPTY;
6070 b->data.ptrvalue = sip;
6071 if (!CheckSeqLocCompResults(a, b, SLC_NO_MATCH, SLC_NO_MATCH)) {
6072 goto UnitTestSeqLocCompare_end;
6073 }
6074
6075 /* NULL vs WHOLE */
6076 b->choice = SEQLOC_WHOLE;
6077 if (!CheckSeqLocCompResults(a, b, SLC_NO_MATCH, SLC_NO_MATCH)) {
6078 goto UnitTestSeqLocCompare_end;
6079 }
6080
6081 /* NULL vs INT */
6082 b->choice = SEQLOC_INT;
6083 b->data.ptrvalue = sint1;
6084 if (!CheckSeqLocCompResults(a, b, SLC_NO_MATCH, SLC_NO_MATCH)) {
6085 goto UnitTestSeqLocCompare_end;
6086 }
6087
6088 /* NULL vs PACKED INT */
6089 ValNodeAddPointer (&list, SEQLOC_INT, sint1);
6090 ValNodeAddPointer (&list, SEQLOC_INT, sint2);
6091 b->choice = SEQLOC_PACKED_INT;
6092 b->data.ptrvalue = list;
6093 if (!CheckSeqLocCompResults(a, b, SLC_NO_MATCH, SLC_NO_MATCH)) {
6094 goto UnitTestSeqLocCompare_end;
6095 }
6096 list = ValNodeFree (list);
6097
6098 /* NULL vs point */
6099 b->choice = SEQLOC_PNT;
6100 b->data.ptrvalue = pnt1;
6101 if (!CheckSeqLocCompResults(a, b, SLC_NO_MATCH, SLC_NO_MATCH)) {
6102 goto UnitTestSeqLocCompare_end;
6103 }
6104
6105 /* NULL vs. packed pnt */
6106 b->choice = SEQLOC_PACKED_PNT;
6107 b->data.ptrvalue = pspp1;
6108 if (!CheckSeqLocCompResults(a, b, SLC_NO_MATCH, SLC_NO_MATCH)) {
6109 goto UnitTestSeqLocCompare_end;
6110 }
6111
6112 /* NULL vs MIX */
6113 list = ValNodeNew (NULL);
6114 list->choice = SEQLOC_INT;
6115 list->data.ptrvalue = sint1;
6116 b->choice = SEQLOC_MIX;
6117 b->data.ptrvalue = list;
6118 if (!CheckSeqLocCompResults(a, b, SLC_NO_MATCH, SLC_NO_MATCH)) {
6119 goto UnitTestSeqLocCompare_end;
6120 }
6121 list = ValNodeFree (list);
6122
6123 /* NULL vs BOND */
6124 b->choice = SEQLOC_BOND;
6125 b->data.ptrvalue = sbp1;
6126 if (!CheckSeqLocCompResults(a, b, SLC_NO_MATCH, SLC_NO_MATCH)) {
6127 goto UnitTestSeqLocCompare_end;
6128 }
6129
6130 /* EMPTY vs EMPTY */
6131 a->choice = SEQLOC_EMPTY;
6132 a->data.ptrvalue = sip;
6133 b->choice = SEQLOC_EMPTY;
6134 b->data.ptrvalue = sip;
6135 if (!CheckSeqLocCompResults(a, b, SLC_A_EQ_B, SLC_A_EQ_B)) {
6136 goto UnitTestSeqLocCompare_end;
6137 }
6138
6139 b->data.ptrvalue = sip2;
6140 if (!CheckSeqLocCompResults(a, b, SLC_NO_MATCH, SLC_NO_MATCH)) {
6141 goto UnitTestSeqLocCompare_end;
6142 }
6143
6144 /* EMPTY vs WHOLE */
6145 b->choice = SEQLOC_WHOLE;
6146 b->data.ptrvalue = sip;
6147 if (!CheckSeqLocCompResults(a, b, SLC_NO_MATCH, SLC_NO_MATCH)) {
6148 goto UnitTestSeqLocCompare_end;
6149 }
6150
6151 /* EMPTY vs INT */
6152 b->choice = SEQLOC_INT;
6153 sint1->id = sip;
6154 b->data.ptrvalue = sint1;
6155 if (!CheckSeqLocCompResults(a, b, SLC_NO_MATCH, SLC_NO_MATCH)) {
6156 goto UnitTestSeqLocCompare_end;
6157 }
6158
6159 /* EMPTY vs packed-int */
6160 list = NULL;
6161 ValNodeAddPointer (&list, 0, sint1);
6162 ValNodeAddPointer (&list, 0, sint2);
6163 b->choice = SEQLOC_PACKED_INT;
6164 b->data.ptrvalue = list;
6165 if (!CheckSeqLocCompResults(a, b, SLC_NO_MATCH, SLC_NO_MATCH)) {
6166 goto UnitTestSeqLocCompare_end;
6167 }
6168 list = ValNodeFree (list);
6169
6170 /* EMPTY vs point */
6171 b->choice = SEQLOC_PNT;
6172 b->data.ptrvalue = pnt1;
6173 if (!CheckSeqLocCompResults(a, b, SLC_NO_MATCH, SLC_NO_MATCH)) {
6174 goto UnitTestSeqLocCompare_end;
6175 }
6176
6177 /* EMPTY vs. packed pnt */
6178 b->choice = SEQLOC_PACKED_PNT;
6179 b->data.ptrvalue = pspp1;
6180 if (!CheckSeqLocCompResults(a, b, SLC_NO_MATCH, SLC_NO_MATCH)) {
6181 goto UnitTestSeqLocCompare_end;
6182 }
6183
6184 /* EMPTY vs MIX */
6185 list = ValNodeNew (NULL);
6186 list->choice = SEQLOC_INT;
6187 list->data.ptrvalue = sint1;
6188 b->choice = SEQLOC_MIX;
6189 b->data.ptrvalue = list;
6190 if (!CheckSeqLocCompResults(a, b, SLC_NO_MATCH, SLC_NO_MATCH)) {
6191 goto UnitTestSeqLocCompare_end;
6192 }
6193 list = ValNodeFree (list);
6194
6195 /* EMPTY vs BOND */
6196 b->choice = SEQLOC_BOND;
6197 b->data.ptrvalue = sbp1;
6198 if (!CheckSeqLocCompResults(a, b, SLC_NO_MATCH, SLC_NO_MATCH)) {
6199 goto UnitTestSeqLocCompare_end;
6200 }
6201
6202 a->choice = SEQLOC_WHOLE;
6203 /* WHOLE vs INT */
6204 b->choice = SEQLOC_INT;
6205 sint1->id = sip;
6206 sint1->from = 0;
6207 sint1->to = 10;
6208 sint1->strand = Seq_strand_plus;
6209 b->data.ptrvalue = sint1;
6210 if (!CheckSeqLocCompResults(a, b, SLC_B_IN_A, SLC_B_IN_A)) {
6211 goto UnitTestSeqLocCompare_end;
6212 }
6213 sint1->from = 0;
6214 sint1->to = 484;
6215 if (!CheckSeqLocCompResults(a, b, SLC_A_EQ_B, SLC_A_EQ_B)) {
6216 goto UnitTestSeqLocCompare_end;
6217 }
6218 sint1->strand = Seq_strand_minus;
6219 if (!CheckSeqLocCompResults(a, b, SLC_A_EQ_B, SLC_A_EQ_B)) {
6220 goto UnitTestSeqLocCompare_end;
6221 }
6222 sint1->id = sip2;
6223 if (!CheckSeqLocCompResults(a, b, SLC_NO_MATCH, SLC_NO_MATCH)) {
6224 goto UnitTestSeqLocCompare_end;
6225 }
6226 sint1->id = sip;
6227 sint1->from = 0;
6228 sint1->to = 10;
6229 sint1->strand = 0;
6230
6231 /* WHOLE vs packed int */
6232 list = NULL;
6233 ValNodeAddPointer (&list, SEQLOC_INT, sint1);
6234 ValNodeAddPointer (&list, SEQLOC_INT, sint2);
6235 b->choice = SEQLOC_PACKED_INT;
6236 b->data.ptrvalue = list;
6237 if (!CheckSeqLocCompResults(a, b, SLC_B_IN_A, SLC_B_IN_A)) {
6238 goto UnitTestSeqLocCompare_end;
6239 }
6240 sint1->id = sip2;
6241 if (!CheckSeqLocCompResults(a, b, SLC_A_OVERLAP_B, SLC_A_OVERLAP_B)) {
6242 goto UnitTestSeqLocCompare_end;
6243 }
6244 sint2->id = sip2;
6245 if (!CheckSeqLocCompResults(a, b, SLC_NO_MATCH, SLC_NO_MATCH)) {
6246 goto UnitTestSeqLocCompare_end;
6247 }
6248 list = ValNodeFree (list);
6249 sint1->id = sip;
6250 sint2->id = sip;
6251
6252 /* WHOLE vs pnt */
6253 b->choice = SEQLOC_PNT;
6254 b->data.ptrvalue = pnt1;
6255 if (!CheckSeqLocCompResults(a, b, SLC_B_IN_A, SLC_B_IN_A)) {
6256 goto UnitTestSeqLocCompare_end;
6257 }
6258 pnt1->strand = Seq_strand_minus;
6259 if (!CheckSeqLocCompResults(a, b, SLC_B_IN_A, SLC_B_IN_A)) {
6260 goto UnitTestSeqLocCompare_end;
6261 }
6262 pnt1->strand = 0;
6263 pnt1->id = sip2;
6264 if (!CheckSeqLocCompResults(a, b, SLC_NO_MATCH, SLC_NO_MATCH)) {
6265 goto UnitTestSeqLocCompare_end;
6266 }
6267 pnt1->id = sip;
6268
6269 /* WHOLE vs SEQLOC_PACKED_PNT */
6270 b->choice = SEQLOC_PACKED_PNT;
6271 b->data.ptrvalue = pspp1;
6272 if (!CheckSeqLocCompResults(a, b, SLC_B_IN_A, SLC_B_IN_A)) {
6273 goto UnitTestSeqLocCompare_end;
6274 }
6275 pspp1->strand = Seq_strand_minus;
6276 if (!CheckSeqLocCompResults(a, b, SLC_B_IN_A, SLC_B_IN_A)) {
6277 goto UnitTestSeqLocCompare_end;
6278 }
6279 pspp1->strand = 0;
6280 pspp1->id = sip2;
6281 if (!CheckSeqLocCompResults(a, b, SLC_NO_MATCH, SLC_NO_MATCH)) {
6282 goto UnitTestSeqLocCompare_end;
6283 }
6284 pspp1->next = pspp2;
6285 if (!CheckSeqLocCompResults(a, b, SLC_A_OVERLAP_B, SLC_A_OVERLAP_B)) {
6286 pspp1->next = NULL;
6287 goto UnitTestSeqLocCompare_end;
6288 }
6289 pspp1->next = NULL;
6290 pspp1->id = sip;
6291
6292 /* WHOLE vs SEQLOC_MIX */
6293 list = NULL;
6294 ValNodeAddPointer (&list, SEQLOC_INT, sint1);
6295 ValNodeAddPointer (&list, SEQLOC_INT, sint2);
6296 b->choice = SEQLOC_MIX;
6297 b->data.ptrvalue = list;
6298 if (!CheckSeqLocCompResults(a, b, SLC_B_IN_A, SLC_B_IN_A)) {
6299 goto UnitTestSeqLocCompare_end;
6300 }
6301 sint1->strand = Seq_strand_minus;
6302 if (!CheckSeqLocCompResults(a, b, SLC_B_IN_A, SLC_B_IN_A)) {
6303 goto UnitTestSeqLocCompare_end;
6304 }
6305 sint1->strand = 0;
6306 sint1->id = sip2;
6307 if (!CheckSeqLocCompResults(a, b, SLC_A_OVERLAP_B, SLC_A_OVERLAP_B)) {
6308 goto UnitTestSeqLocCompare_end;
6309 }
6310 sint2->id = sip2;
6311 if (!CheckSeqLocCompResults(a, b, SLC_NO_MATCH, SLC_NO_MATCH)) {
6312 goto UnitTestSeqLocCompare_end;
6313 }
6314 sint1->id = sip;
6315 sint2->id = sip;
6316 list = ValNodeFree (list);
6317
6318 /* WHOLE vs SEQLOC_BOND */
6319 b->choice = SEQLOC_BOND;
6320 b->data.ptrvalue = sbp1;
6321 if (!CheckSeqLocCompResults(a, b, SLC_B_IN_A, SLC_B_IN_A)) {
6322 goto UnitTestSeqLocCompare_end;
6323 }
6324 sbp1->a->strand = Seq_strand_minus;
6325 if (!CheckSeqLocCompResults(a, b, SLC_B_IN_A, SLC_B_IN_A)) {
6326 goto UnitTestSeqLocCompare_end;
6327 }
6328 sbp1->a->strand = 0;
6329 sbp1->a->id = sip2;
6330 if (!CheckSeqLocCompResults(a, b, SLC_A_OVERLAP_B, SLC_A_OVERLAP_B)) {
6331 goto UnitTestSeqLocCompare_end;
6332 }
6333 sbp1->b->id = sip2;
6334 if (!CheckSeqLocCompResults(a, b, SLC_NO_MATCH, SLC_NO_MATCH)) {
6335 goto UnitTestSeqLocCompare_end;
6336 }
6337 sbp1->a->id = sip;
6338 sbp1->b->id = sip;
6339
6340 /* INT */
6341 a->choice = SEQLOC_INT;
6342 a->data.ptrvalue = sint3;
6343 /* INT vs SEQLOC_INT */
6344 b->choice = SEQLOC_INT;
6345 b->data.ptrvalue = sint1;
6346 if (!CheckSeqLocCompResults(a, b, SLC_A_EQ_B, SLC_A_EQ_B)) {
6347 goto UnitTestSeqLocCompare_end;
6348 }
6349 sint1->strand = Seq_strand_minus;
6350 if (!CheckSeqLocCompResults(a, b, SLC_A_EQ_B, SLC_NO_MATCH)) {
6351 goto UnitTestSeqLocCompare_end;
6352 }
6353 sint1->strand = 0;
6354 sint1->to = 9;
6355 if (!CheckSeqLocCompResults(a, b, SLC_B_IN_A, SLC_B_IN_A)) {
6356 goto UnitTestSeqLocCompare_end;
6357 }
6358 sint1->strand = Seq_strand_minus;
6359 if (!CheckSeqLocCompResults(a, b, SLC_B_IN_A, SLC_NO_MATCH)) {
6360 goto UnitTestSeqLocCompare_end;
6361 }
6362 sint1->strand = 0;
6363 sint1->to = 11;
6364 if (!CheckSeqLocCompResults(a, b, SLC_A_IN_B, SLC_A_IN_B)) {
6365 goto UnitTestSeqLocCompare_end;
6366 }
6367 sint1->strand = Seq_strand_minus;
6368 if (!CheckSeqLocCompResults(a, b, SLC_A_IN_B, SLC_NO_MATCH)) {
6369 goto UnitTestSeqLocCompare_end;
6370 }
6371 sint1->strand = 0;
6372 sint1->from = 1;
6373 if (!CheckSeqLocCompResults(a, b, SLC_A_OVERLAP_B, SLC_A_OVERLAP_B)) {
6374 goto UnitTestSeqLocCompare_end;
6375 }
6376 sint1->strand = Seq_strand_minus;
6377 if (!CheckSeqLocCompResults(a, b, SLC_A_OVERLAP_B, SLC_NO_MATCH)) {
6378 goto UnitTestSeqLocCompare_end;
6379 }
6380 sint1->strand = 0;
6381 sint1->from = 0;
6382 sint1->to = 10;
6383
6384 /* INT vs PACKED_INT */
6385 list = NULL;
6386 ValNodeAddPointer (&list, SEQLOC_INT, sint1);
6387 ValNodeAddPointer (&list, SEQLOC_INT, sint2);
6388 b->choice = SEQLOC_PACKED_INT;
6389 b->data.ptrvalue = list;
6390 if (!CheckSeqLocCompResults(a, b, SLC_A_IN_B, SLC_A_IN_B)) {
6391 goto UnitTestSeqLocCompare_end;
6392 }
6393 sint1->strand = Seq_strand_minus;
6394 if (!CheckSeqLocCompResults(a, b, SLC_A_IN_B, SLC_NO_MATCH)) {
6395 goto UnitTestSeqLocCompare_end;
6396 }
6397 sint1->strand = 0;
6398 sint1->to = 11;
6399 if (!CheckSeqLocCompResults(a, b, SLC_A_IN_B, SLC_A_IN_B)) {
6400 goto UnitTestSeqLocCompare_end;
6401 }
6402 sint1->strand = Seq_strand_minus;
6403 if (!CheckSeqLocCompResults(a, b, SLC_A_IN_B, SLC_NO_MATCH)) {
6404 goto UnitTestSeqLocCompare_end;
6405 }
6406 sint1->strand = 0;
6407 sint1->from = 1;
6408 if (!CheckSeqLocCompResults(a, b, SLC_A_OVERLAP_B, SLC_A_OVERLAP_B)) {
6409 goto UnitTestSeqLocCompare_end;
6410 }
6411 sint1->strand = Seq_strand_minus;
6412 if (!CheckSeqLocCompResults(a, b, SLC_A_OVERLAP_B, SLC_NO_MATCH)) {
6413 goto UnitTestSeqLocCompare_end;
6414 }
6415 sint1->strand = 0;
6416 sint1->from = 11;
6417 sint1->to = 24;
6418 if (!CheckSeqLocCompResults(a, b, SLC_NO_MATCH, SLC_NO_MATCH)) {
6419 goto UnitTestSeqLocCompare_end;
6420 }
6421 sint1->strand = Seq_strand_minus;
6422 if (!CheckSeqLocCompResults(a, b, SLC_NO_MATCH, SLC_NO_MATCH)) {
6423 goto UnitTestSeqLocCompare_end;
6424 }
6425 sint1->strand = 0;
6426 sint1->from = 0;
6427 sint1->to = 10;
6428 list = ValNodeFree (list);
6429
6430 /* INT vs PNT */
6431 b->choice = SEQLOC_PNT;
6432 b->data.ptrvalue = pnt1;
6433 if (!CheckSeqLocCompResults(a, b, SLC_B_IN_A, SLC_B_IN_A)) {
6434 goto UnitTestSeqLocCompare_end;
6435 }
6436 pnt1->strand = Seq_strand_minus;
6437 if (!CheckSeqLocCompResults(a, b, SLC_B_IN_A, SLC_NO_MATCH)) {
6438 goto UnitTestSeqLocCompare_end;
6439 }
6440 pnt1->strand = 0;
6441 pnt1->point = 13;
6442 if (!CheckSeqLocCompResults(a, b, SLC_NO_MATCH, SLC_NO_MATCH)) {
6443 goto UnitTestSeqLocCompare_end;
6444 }
6445 pnt1->strand = Seq_strand_minus;
6446 if (!CheckSeqLocCompResults(a, b, SLC_NO_MATCH, SLC_NO_MATCH)) {
6447 goto UnitTestSeqLocCompare_end;
6448 }
6449 pnt1->strand = 0;
6450 pnt1->point = 5;
6451 pnt1->id = sip2;
6452 if (!CheckSeqLocCompResults(a, b, SLC_NO_MATCH, SLC_NO_MATCH)) {
6453 goto UnitTestSeqLocCompare_end;
6454 }
6455 pnt1->id = sip;
6456
6457 /* INT vs PACKED_PNT */
6458 b->choice = SEQLOC_PACKED_PNT;
6459 b->data.ptrvalue = pspp1;
6460 if (!CheckSeqLocCompResults(a, b, SLC_A_OVERLAP_B, SLC_A_OVERLAP_B)) {
6461 goto UnitTestSeqLocCompare_end;
6462 }
6463 pspp1->strand = Seq_strand_minus;
6464 if (!CheckSeqLocCompResults(a, b, SLC_A_OVERLAP_B, SLC_NO_MATCH)) {
6465 goto UnitTestSeqLocCompare_end;
6466 }
6467 pspp1->strand = 0;
6468 pspp1->id = sip2;
6469 if (!CheckSeqLocCompResults(a, b, SLC_NO_MATCH, SLC_NO_MATCH)) {
6470 goto UnitTestSeqLocCompare_end;
6471 }
6472 pspp1->next = pspp2;
6473 if (!CheckSeqLocCompResults(a, b, SLC_A_OVERLAP_B, SLC_A_OVERLAP_B)) {
6474 pspp1->next = NULL;
6475 goto UnitTestSeqLocCompare_end;
6476 }
6477 pspp1->next = NULL;
6478 pspp1->id = sip;
6479 pspp1->pnts[1] = 9;
6480 if (!CheckSeqLocCompResults(a, b, SLC_B_IN_A, SLC_B_IN_A)) {
6481 goto UnitTestSeqLocCompare_end;
6482 }
6483 pspp1->strand = Seq_strand_minus;
6484 if (!CheckSeqLocCompResults(a, b, SLC_B_IN_A, SLC_NO_MATCH)) {
6485 goto UnitTestSeqLocCompare_end;
6486 }
6487 pspp1->strand = 0;
6488 pspp1->pnts[1] = 16;
6489
6490 /* INT vs BOND */
6491 b->choice = SEQLOC_BOND;
6492 b->data.ptrvalue = sbp1;
6493 if (!CheckSeqLocCompResults(a, b, SLC_A_OVERLAP_B, SLC_A_OVERLAP_B)) {
6494 goto UnitTestSeqLocCompare_end;
6495 }
6496 sbp1->a->strand = Seq_strand_minus;
6497 if (!CheckSeqLocCompResults(a, b, SLC_A_OVERLAP_B, SLC_NO_MATCH)) {
6498 goto UnitTestSeqLocCompare_end;
6499 }
6500 sbp1->a->strand = 0;
6501 sbp1->b->strand = Seq_strand_minus;
6502 if (!CheckSeqLocCompResults(a, b, SLC_A_OVERLAP_B, SLC_A_OVERLAP_B)) {
6503 goto UnitTestSeqLocCompare_end;
6504 }
6505
6506 sbp1->b->strand = 0;
6507 sbp1->b->point = 9;
6508 if (!CheckSeqLocCompResults(a, b, SLC_B_IN_A, SLC_B_IN_A)) {
6509 goto UnitTestSeqLocCompare_end;
6510 }
6511 sbp1->a->strand = Seq_strand_minus;
6512 if (!CheckSeqLocCompResults(a, b, SLC_B_IN_A, SLC_A_OVERLAP_B)) {
6513 goto UnitTestSeqLocCompare_end;
6514 }
6515 sbp1->b->strand = Seq_strand_minus;
6516 if (!CheckSeqLocCompResults(a, b, SLC_B_IN_A, SLC_NO_MATCH)) {
6517 goto UnitTestSeqLocCompare_end;
6518 }
6519 sbp1->a->strand = 0;
6520 if (!CheckSeqLocCompResults(a, b, SLC_B_IN_A, SLC_A_OVERLAP_B)) {
6521 goto UnitTestSeqLocCompare_end;
6522 }
6523 sbp1->b->strand = 0;
6524
6525 sbp1->a->id = sip2;
6526 if (!CheckSeqLocCompResults(a, b, SLC_A_OVERLAP_B, SLC_A_OVERLAP_B)) {
6527 goto UnitTestSeqLocCompare_end;
6528 }
6529 sbp1->b->id = sip2;
6530 if (!CheckSeqLocCompResults(a, b, SLC_NO_MATCH, SLC_NO_MATCH)) {
6531 goto UnitTestSeqLocCompare_end;
6532 }
6533 sbp1->a->id = sip;
6534 sbp1->b->id = sip;
6535 sbp1->b->point = 16;
6536
6537 /* PACKED_INT */
6538 a->choice = SEQLOC_PACKED_INT;
6539 ValNodeAddPointer (&list2, SEQLOC_INT, sint3);
6540 ValNodeAddPointer (&list2, SEQLOC_INT, sint4);
6541 a->data.ptrvalue = list2;
6542
6543 /* PACKED_INT vs PACKED_INT */
6544 b->choice = SEQLOC_PACKED_INT;
6545 ValNodeAddPointer (&list, SEQLOC_INT, sint1);
6546 ValNodeAddPointer (&list, SEQLOC_INT, sint2);
6547 b->data.ptrvalue = list;
6548 if (!CheckSeqLocCompResults(a, b, SLC_A_EQ_B, SLC_A_EQ_B)) {
6549 goto UnitTestSeqLocCompare_end;
6550 }
6551 sint1->strand = Seq_strand_minus;
6552 if (!CheckSeqLocCompResults(a, b, SLC_A_EQ_B, SLC_A_OVERLAP_B)) {
6553 goto UnitTestSeqLocCompare_end;
6554 }
6555 sint2->strand = Seq_strand_minus;
6556 if (!CheckSeqLocCompResults(a, b, SLC_A_EQ_B, SLC_NO_MATCH)) {
6557 goto UnitTestSeqLocCompare_end;
6558 }
6559 sint1->strand = 0;
6560 sint2->strand = 0;
6561 sint1->from = 1;
6562 if (!CheckSeqLocCompResults(a, b, SLC_B_IN_A, SLC_B_IN_A)) {
6563 goto UnitTestSeqLocCompare_end;
6564 }
6565 sint1->strand = Seq_strand_minus;
6566 if (!CheckSeqLocCompResults(a, b, SLC_B_IN_A, SLC_A_OVERLAP_B)) {
6567 goto UnitTestSeqLocCompare_end;
6568 }
6569 sint1->strand = 0;
6570 sint1->from = 11;
6571 sint1->to = 14;
6572 if (!CheckSeqLocCompResults(a, b, SLC_A_OVERLAP_B, SLC_A_OVERLAP_B)) {
6573 goto UnitTestSeqLocCompare_end;
6574 }
6575 sint1->from = 0;
6576 sint1->to = 10;
6577
6578 /* PACKED_INT vs PNT */
6579 b->choice = SEQLOC_PNT;
6580 b->data.ptrvalue = pnt1;
6581 if (!CheckSeqLocCompResults(a, b, SLC_B_IN_A, SLC_B_IN_A)) {
6582 goto UnitTestSeqLocCompare_end;
6583 }
6584 pnt1->strand = Seq_strand_minus;
6585 if (!CheckSeqLocCompResults(a, b, SLC_B_IN_A, SLC_NO_MATCH)) {
6586 goto UnitTestSeqLocCompare_end;
6587 }
6588 pnt1->strand = 0;
6589 pnt1->point = 11;
6590 if (!CheckSeqLocCompResults(a, b, SLC_NO_MATCH, SLC_NO_MATCH)) {
6591 goto UnitTestSeqLocCompare_end;
6592 }
6593 pnt1->point = 16;
6594 if (!CheckSeqLocCompResults(a, b, SLC_B_IN_A, SLC_B_IN_A)) {
6595 goto UnitTestSeqLocCompare_end;
6596 }
6597
6598 /* PACKED_INT vs SEQLOC_PACKED_PNT */
6599 b->choice = SEQLOC_PACKED_PNT;
6600 b->data.ptrvalue = pspp1;
6601 if (!CheckSeqLocCompResults(a, b, SLC_B_IN_A, SLC_B_IN_A)) {
6602 goto UnitTestSeqLocCompare_end;
6603 }
6604 pspp1->strand = Seq_strand_minus;
6605 if (!CheckSeqLocCompResults(a, b, SLC_B_IN_A, SLC_NO_MATCH)) {
6606 goto UnitTestSeqLocCompare_end;
6607 }
6608 pspp1->strand = 0;
6609 pspp1->pnts[0] = 11;
6610 if (!CheckSeqLocCompResults(a, b, SLC_A_OVERLAP_B, SLC_A_OVERLAP_B)) {
6611 goto UnitTestSeqLocCompare_end;
6612 }
6613 pspp1->strand = Seq_strand_minus;
6614 if (!CheckSeqLocCompResults(a, b, SLC_A_OVERLAP_B, SLC_NO_MATCH)) {
6615 goto UnitTestSeqLocCompare_end;
6616 }
6617 pspp1->strand = 0;
6618 pspp1->pnts[0] = 5;
6619 pspp1->id = sip2;
6620 if (!CheckSeqLocCompResults(a, b, SLC_NO_MATCH, SLC_NO_MATCH)) {
6621 goto UnitTestSeqLocCompare_end;
6622 }
6623 pspp1->next = pspp2;
6624 if (!CheckSeqLocCompResults(a, b, SLC_A_OVERLAP_B, SLC_A_OVERLAP_B)) {
6625 pspp1->next = NULL;
6626 goto UnitTestSeqLocCompare_end;
6627 }
6628 pspp1->id = sip;
6629 pspp1->next = NULL;
6630
6631
6632 /* PACKED_INT vs SEQLOC_BOND */
6633 b->choice = SEQLOC_BOND;
6634 b->data.ptrvalue = sbp1;
6635 if (!CheckSeqLocCompResults(a, b, SLC_B_IN_A, SLC_B_IN_A)) {
6636 goto UnitTestSeqLocCompare_end;
6637 }
6638 sbp1->a->strand = Seq_strand_minus;
6639 if (!CheckSeqLocCompResults(a, b, SLC_B_IN_A, SLC_A_OVERLAP_B)) {
6640 goto UnitTestSeqLocCompare_end;
6641 }
6642 sbp1->b->strand = Seq_strand_minus;
6643 if (!CheckSeqLocCompResults(a, b, SLC_B_IN_A, SLC_NO_MATCH)) {
6644 goto UnitTestSeqLocCompare_end;
6645 }
6646 sbp1->a->strand = 0;
6647 sbp1->b->strand = 0;
6648 sbp1->a->point = 11;
6649 if (!CheckSeqLocCompResults(a, b, SLC_A_OVERLAP_B, SLC_A_OVERLAP_B)) {
6650 goto UnitTestSeqLocCompare_end;
6651 }
6652 sbp1->b->strand = Seq_strand_minus;
6653 if (!CheckSeqLocCompResults(a, b, SLC_A_OVERLAP_B, SLC_NO_MATCH)) {
6654 goto UnitTestSeqLocCompare_end;
6655 }
6656 sbp1->b->strand = 0;
6657 sbp1->b->point = 13;
6658 if (!CheckSeqLocCompResults(a, b, SLC_NO_MATCH, SLC_NO_MATCH)) {
6659 goto UnitTestSeqLocCompare_end;
6660 }
6661 sbp1->a->point = 5;
6662 sbp1->b->point = 16;
6663
6664 list2 = ValNodeFree (list2);
6665
6666 /* PNT */
6667 a->choice = SEQLOC_PNT;
6668 a->data.ptrvalue = pnt3;
6669
6670 /* PNT vs PNT */
6671 b->choice = SEQLOC_PNT;
6672 b->data.ptrvalue = pnt1;
6673 if (!CheckSeqLocCompResults(a, b, SLC_A_EQ_B, SLC_A_EQ_B)) {
6674 goto UnitTestSeqLocCompare_end;
6675 }
6676 pnt1->strand = Seq_strand_minus;
6677 if (!CheckSeqLocCompResults(a, b, SLC_A_EQ_B, SLC_NO_MATCH)) {
6678 goto UnitTestSeqLocCompare_end;
6679 }
6680 pnt1->strand = 0;
6681 pnt1->id = sip2;
6682 if (!CheckSeqLocCompResults(a, b, SLC_NO_MATCH, SLC_NO_MATCH)) {
6683 goto UnitTestSeqLocCompare_end;
6684 }
6685 pnt1->id = sip;
6686 pnt1->point = 6;
6687 if (!CheckSeqLocCompResults(a, b, SLC_NO_MATCH, SLC_NO_MATCH)) {
6688 goto UnitTestSeqLocCompare_end;
6689 }
6690 pnt1->point = 5;
6691
6692 /* PNT vs PACKED_PNT */
6693 b->choice = SEQLOC_PACKED_PNT;
6694 b->data.ptrvalue = pspp1;
6695 if (!CheckSeqLocCompResults(a, b, SLC_A_IN_B, SLC_A_IN_B)) {
6696 goto UnitTestSeqLocCompare_end;
6697 }
6698 pspp1->strand = Seq_strand_minus;
6699 if (!CheckSeqLocCompResults(a, b, SLC_A_IN_B, SLC_NO_MATCH)) {
6700 goto UnitTestSeqLocCompare_end;
6701 }
6702 pspp1->strand = 0;
6703 pspp1->id = sip2;
6704 if (!CheckSeqLocCompResults(a, b, SLC_NO_MATCH, SLC_NO_MATCH)) {
6705 goto UnitTestSeqLocCompare_end;
6706 }
6707 pspp1->next = pspp2;
6708 if (!CheckSeqLocCompResults(a, b, SLC_A_IN_B, SLC_A_IN_B)) {
6709 pspp1->next = NULL;
6710 goto UnitTestSeqLocCompare_end;
6711 }
6712 pspp1->next = NULL;
6713 pspp1->id = sip;
6714 pspp1->pnts[0] = 6;
6715 if (!CheckSeqLocCompResults(a, b, SLC_NO_MATCH, SLC_NO_MATCH)) {
6716 goto UnitTestSeqLocCompare_end;
6717 }
6718 pspp1->pnts[0] = 4;
6719 pspp1->pnts[1] = 5;
6720 if (!CheckSeqLocCompResults(a, b, SLC_A_IN_B, SLC_A_IN_B)) {
6721 goto UnitTestSeqLocCompare_end;
6722 }
6723 pspp1->pnts[0] = 5;
6724 pspp1->pnts[1] = 16;
6725
6726 /* PNT vs BOND */
6727 b->choice = SEQLOC_BOND;
6728 b->data.ptrvalue = sbp1;
6729 if (!CheckSeqLocCompResults(a, b, SLC_A_IN_B, SLC_A_IN_B)) {
6730 goto UnitTestSeqLocCompare_end;
6731 }
6732 sbp1->a->strand = Seq_strand_minus;
6733 if (!CheckSeqLocCompResults(a, b, SLC_A_IN_B, SLC_NO_MATCH)) {
6734 goto UnitTestSeqLocCompare_end;
6735 }
6736 sbp1->a->strand = 0;
6737 pnt3->point = 16;
6738 if (!CheckSeqLocCompResults(a, b, SLC_A_IN_B, SLC_A_IN_B)) {
6739 goto UnitTestSeqLocCompare_end;
6740 }
6741 sbp1->b->strand = Seq_strand_minus;
6742 if (!CheckSeqLocCompResults(a, b, SLC_A_IN_B, SLC_NO_MATCH)) {
6743 goto UnitTestSeqLocCompare_end;
6744 }
6745 sbp1->b->strand = 0;
6746 pnt3->point = 5;
6747 sbp1->a->id = sip2;
6748 if (!CheckSeqLocCompResults(a, b, SLC_NO_MATCH, SLC_NO_MATCH)) {
6749 goto UnitTestSeqLocCompare_end;
6750 }
6751 sbp1->a->id = sip;
6752
6753 /* PACKED_PNT */
6754 a->choice = SEQLOC_PACKED_PNT;
6755 a->data.ptrvalue = pspp2;
6756 /* PACKED_PNT vs PACKED_PNT */
6757 b->choice = SEQLOC_PACKED_PNT;
6758 b->data.ptrvalue = pspp1;
6759 if (!CheckSeqLocCompResults(a, b, SLC_A_EQ_B, SLC_A_EQ_B)) {
6760 goto UnitTestSeqLocCompare_end;
6761 }
6762 pspp1->strand = Seq_strand_minus;
6763 if (!CheckSeqLocCompResults(a, b, SLC_A_EQ_B, SLC_NO_MATCH)) {
6764 goto UnitTestSeqLocCompare_end;
6765 }
6766 pspp1->strand = 0;
6767 pspp1->pnts[0] = 6;
6768 if (!CheckSeqLocCompResults(a, b, SLC_A_OVERLAP_B, SLC_A_OVERLAP_B)) {
6769 goto UnitTestSeqLocCompare_end;
6770 }
6771 pspp1->pnts[0] = 5;
6772 pspp1->pnts[1] = 17;
6773 if (!CheckSeqLocCompResults(a, b, SLC_A_OVERLAP_B, SLC_A_OVERLAP_B)) {
6774 goto UnitTestSeqLocCompare_end;
6775 }
6776 pspp1->pnts[1] = 16;
6777 pspp1->id = sip2;
6778 if (!CheckSeqLocCompResults(a, b, SLC_NO_MATCH, SLC_NO_MATCH)) {
6779 goto UnitTestSeqLocCompare_end;
6780 }
6781 pspp1->next = pspp2;
6782 if (!CheckSeqLocCompResults(a, b, SLC_A_IN_B, SLC_A_IN_B)) {
6783 pspp1->next = NULL;
6784 goto UnitTestSeqLocCompare_end;
6785 }
6786 pspp1->next = NULL;
6787 pspp1->id = sip;
6788 pspp1->used = 3;
6789 pspp1->pnts[2] = 23;
6790 if (!CheckSeqLocCompResults(a, b, SLC_A_IN_B, SLC_A_IN_B)) {
6791 goto UnitTestSeqLocCompare_end;
6792 }
6793 pspp1->used = 2;
6794
6795 /* PACKED_PNT vs BOND */
6796 b->choice = SEQLOC_BOND;
6797 b->data.ptrvalue = sbp1;
6798 if (!CheckSeqLocCompResults(a, b, SLC_A_EQ_B, SLC_A_EQ_B)) {
6799 goto UnitTestSeqLocCompare_end;
6800 }
6801 pspp2->used = 3;
6802 pspp2->pnts[2] = 23;
6803 if (!CheckSeqLocCompResults(a, b, SLC_B_IN_A, SLC_B_IN_A)) {
6804 goto UnitTestSeqLocCompare_end;
6805 }
6806 pspp2->used = 2;
6807 pspp2->id = sip2;
6808 if (!CheckSeqLocCompResults(a, b, SLC_NO_MATCH, SLC_NO_MATCH)) {
6809 goto UnitTestSeqLocCompare_end;
6810 }
6811 pspp2->next = pspp1;
6812 if (!CheckSeqLocCompResults(a, b, SLC_B_IN_A, SLC_B_IN_A)) {
6813 pspp2->next = NULL;
6814 goto UnitTestSeqLocCompare_end;
6815 }
6816 pspp2->next = NULL;
6817 pspp2->id = sip;
6818 sbp1->a->strand = Seq_strand_minus;
6819 if (!CheckSeqLocCompResults(a, b, SLC_A_EQ_B, SLC_A_OVERLAP_B)) {
6820 goto UnitTestSeqLocCompare_end;
6821 }
6822 sbp1->b->strand = Seq_strand_minus;
6823 if (!CheckSeqLocCompResults(a, b, SLC_A_EQ_B, SLC_NO_MATCH)) {
6824 goto UnitTestSeqLocCompare_end;
6825 }
6826 sbp1->a->strand = 0;
6827 if (!CheckSeqLocCompResults(a, b, SLC_A_EQ_B, SLC_A_OVERLAP_B)) {
6828 goto UnitTestSeqLocCompare_end;
6829 }
6830 sbp1->b->strand = 0;
6831 sbp1->a->point = 4;
6832 if (!CheckSeqLocCompResults(a, b, SLC_A_OVERLAP_B, SLC_A_OVERLAP_B)) {
6833 goto UnitTestSeqLocCompare_end;
6834 }
6835 sbp1->b->point = 5;
6836 if (!CheckSeqLocCompResults(a, b, SLC_A_OVERLAP_B, SLC_A_OVERLAP_B)) {
6837 goto UnitTestSeqLocCompare_end;
6838 }
6839 sbp1->a->point = 5;
6840 sbp1->b->point = 16;
6841
6842 /* BOND */
6843 a->choice = SEQLOC_BOND;
6844 a->data.ptrvalue = sbp2;
6845 /* BOND vs BOND */
6846 b->choice = SEQLOC_BOND;
6847 b->data.ptrvalue = sbp1;
6848 if (!CheckSeqLocCompResults(a, b, SLC_A_EQ_B, SLC_A_EQ_B)) {
6849 goto UnitTestSeqLocCompare_end;
6850 }
6851 sbp1->a->strand = Seq_strand_minus;
6852 if (!CheckSeqLocCompResults(a, b, SLC_A_EQ_B, SLC_A_OVERLAP_B)) {
6853 goto UnitTestSeqLocCompare_end;
6854 }
6855 sbp1->b->strand = Seq_strand_minus;
6856 if (!CheckSeqLocCompResults(a, b, SLC_A_EQ_B, SLC_NO_MATCH)) {
6857 goto UnitTestSeqLocCompare_end;
6858 }
6859 sbp1->a->strand = 0;
6860 if (!CheckSeqLocCompResults(a, b, SLC_A_EQ_B, SLC_A_OVERLAP_B)) {
6861 goto UnitTestSeqLocCompare_end;
6862 }
6863 sbp1->b->strand = 0;
6864 sbp1->a->point = 4;
6865 if (!CheckSeqLocCompResults(a, b, SLC_A_OVERLAP_B, SLC_A_OVERLAP_B)) {
6866 goto UnitTestSeqLocCompare_end;
6867 }
6868 sbp1->b->point = 5;
6869 if (!CheckSeqLocCompResults(a, b, SLC_A_OVERLAP_B, SLC_A_OVERLAP_B)) {
6870 goto UnitTestSeqLocCompare_end;
6871 }
6872 sbp1->a->point = 5;
6873 sbp1->b->point = 16;
6874
6875 rval = TRUE;
6876
6877 UnitTestSeqLocCompare_end:
6878 sint1->id = NULL;
6879 sint1 = SeqIntFree (sint1);
6880 sint2->id = NULL;
6881 sint2 = SeqIntFree (sint2);
6882 sint3->id = NULL;
6883 sint3 = SeqIntFree (sint3);
6884 sint4->id = NULL;
6885 sint4 = SeqIntFree (sint4);
6886 pnt1->id = NULL;
6887 pnt1 = SeqPntFree (pnt1);
6888 pnt2->id = NULL;
6889 pnt2 = SeqPntFree (pnt2);
6890 pspp1->id = NULL;
6891 pspp1 = PackSeqPntFree (pspp1);
6892 pspp2->id = NULL;
6893 pspp2 = PackSeqPntFree (pspp2);
6894 sbp1->a = NULL;
6895 sbp1->b = NULL;
6896 sbp1 = SeqBondFree (sbp1);
6897 sbp2->a = NULL;
6898 sbp2->b = NULL;
6899 sbp2 = SeqBondFree (sbp2);
6900
6901 sip = SeqIdFree (sip);
6902 sip2 = SeqIdFree(sip2);
6903 a = ValNodeFree (a);
6904 b = ValNodeFree (b);
6905 return rval;
6906 }
6907
6908 /* returns the number of unique nucleotides covered by slp */
SeqLocCoverage(SeqLocPtr slp)6909 static Int4 SeqLocCoverage (SeqLocPtr slp)
6910 {
6911 Int4Ptr ivals;
6912 Int4 numivals = 0;
6913 SeqLocPtr tmp;
6914 SeqIdPtr sip = NULL;
6915 SeqIdPtr PNTR id_list;
6916 Int4 coverage = 0, i = 0, from, to, j;
6917 Int4 i_from, i_to, j_from, j_to;
6918 Boolean added_to_prev;
6919
6920 tmp = NULL;
6921 while ((tmp = SeqLocFindNext (slp, tmp)) != NULL) {
6922 numivals++;
6923 }
6924 if (numivals > 0) {
6925 ivals = MemNew (sizeof (Int4) * (numivals * 2));
6926 id_list = (SeqIdPtr PNTR) MemNew (sizeof (SeqIdPtr) * numivals);
6927 tmp = NULL;
6928 i = 0;
6929 while ((tmp = SeqLocFindNext (slp, tmp)) != NULL) {
6930 from = SeqLocStart (tmp);
6931 to = SeqLocStop (tmp);
6932 sip = SeqLocId (tmp);
6933 id_list [i / 2] = sip;
6934 ivals [i] = from;
6935 i++;
6936 ivals [i] = to;
6937 i++;
6938 }
6939 /* now combine overlapping intervals */
6940 for (j = 0; j < numivals; j++) {
6941 i = j + 1;
6942 while (i < numivals) {
6943 added_to_prev = FALSE;
6944 if (SeqIdComp (sip, id_list[j]) == SIC_YES) {
6945 i_from = ivals[2 * i];
6946 i_to = ivals[2 * i + 1];
6947 j_from = ivals[2 * j];
6948 j_to = ivals[2 * j + 1];
6949
6950 if ((i_from <= j_from && i_to >= j_from)
6951 || (i_from <= j_to && i_to >= j_to)
6952 || (i_from >= j_from && i_to <= j_to)) {
6953
6954 /* merge i into j */
6955 ivals[2 * j] = MIN (i_from, j_from);
6956 ivals[2 * j + 1] = MAX (i_to, j_to);
6957
6958 /* copy last piece into where i was, and delete last piece */
6959 /* This is okay since order doesn't matter, and this is
6960 cheaper than moving everything down after deleting i */
6961 if( i != (numivals - 1) ) {
6962 ivals[2 * i] = ivals[2 * (numivals -1)];
6963 ivals[2 * i + 1] = ivals[2 * (numivals -1) + 1];
6964 }
6965 numivals --;
6966
6967 /* restart checking against j since j changed */
6968 i = j + 1;
6969 added_to_prev = TRUE;
6970 }
6971 }
6972 if (added_to_prev) {
6973 /* do not increment i */
6974 } else {
6975 i++;
6976 }
6977 }
6978 }
6979 /* now add up lengths of intervals */
6980 for (j = 0; j < numivals; j++) {
6981 /* The "if" checks for NULLs, etc. which have a range like "-1 to -1" */
6982 if( ivals [2 * j + 1] >= 0 || ivals [2 * j] >= 0 ) {
6983 coverage += ivals [2 * j + 1] - ivals [2 * j] + 1;
6984 }
6985 }
6986 ivals = MemFree (ivals);
6987 id_list = MemFree (id_list);
6988 }
6989 return coverage;
6990 }
6991
6992
6993 /*****************************************************************************
6994 *
6995 * SeqLocAinB(a, b)
6996 * if a is completely contained in b, a positive number is returned
6997 * if 0, a is identical with b
6998 * if not 0, is the number of residues bigger b is than a
6999 * if a negative number is returned, a is not contained in b
7000 * could overlap or not
7001 * used to find features contained in genes
7002 *
7003 *****************************************************************************/
SeqLocAinB(SeqLocPtr a,SeqLocPtr b)7004 NLM_EXTERN Int4 SeqLocAinB (SeqLocPtr a, SeqLocPtr b)
7005 {
7006 Int4 diff = -1;
7007 Int2 res;
7008
7009 if ((a == NULL) || (b == NULL))
7010 return diff;
7011
7012 res = SeqLocCompare(a, b);
7013 switch (res)
7014 {
7015 case SLC_A_EQ_B:
7016 diff = 0;
7017 break;
7018 case SLC_A_IN_B:
7019 diff = (SeqLocCoverage(b) - SeqLocCoverage(a));
7020 break;
7021 default:
7022 break;
7023 }
7024 return diff;
7025 }
7026
7027 /*****************************************************************************
7028 *
7029 * Boolean SeqIntCheck(sip)
7030 * checks that a seq interval is valid
7031 *
7032 *****************************************************************************/
SeqIntCheck(SeqIntPtr sip)7033 NLM_EXTERN Boolean SeqIntCheck (SeqIntPtr sip)
7034
7035 {
7036 Int4 len = INT4_MAX;
7037 BioseqPtr bsp;
7038 Boolean locked = FALSE;
7039
7040 if (sip == NULL) return TRUE; /* makes it ok to pass a NULL */
7041
7042 bsp = BioseqFindCore(sip->id);
7043 if (bsp == NULL)
7044 {
7045 bsp = BioseqLockById(sip->id);
7046 if (bsp != NULL)
7047 locked = TRUE;
7048 }
7049 if (bsp != NULL)
7050 len = BioseqGetLen(bsp);
7051
7052 if (locked)
7053 BioseqUnlock(bsp);
7054 if ((sip->from < 0) || (sip->from > sip->to) || (sip->to >= len))
7055 {
7056 return FALSE;
7057 }
7058 else
7059 return TRUE;
7060 }
7061
7062 /*****************************************************************************
7063 *
7064 * Boolean SeqPntCheck(SeqPntPtr spp)
7065 * checks that a seq point is valid
7066 *
7067 *****************************************************************************/
SeqPntCheck(SeqPntPtr spp)7068 NLM_EXTERN Boolean SeqPntCheck (SeqPntPtr spp)
7069
7070 {
7071 Int4 len = INT4_MAX;
7072 BioseqPtr bsp;
7073 Boolean locked = FALSE;
7074
7075 if (spp == NULL) return TRUE; /* cant compare */
7076
7077 bsp = BioseqFindCore(spp->id);
7078 if (bsp == NULL)
7079 {
7080 bsp = BioseqLockById(spp->id);
7081 if (bsp != NULL)
7082 locked = TRUE;
7083 }
7084 if (bsp != NULL)
7085 len = BioseqGetLen(bsp);
7086
7087 if (locked)
7088 BioseqUnlock(bsp);
7089 if ((spp->point < 0) || (spp->point >= len))
7090 {
7091 return FALSE;
7092 }
7093 else
7094 return TRUE;
7095 }
7096
7097 /*****************************************************************************
7098 *
7099 * PackSeqPntCheck (pspp)
7100 *
7101 *****************************************************************************/
PackSeqPntCheck(PackSeqPntPtr pspp)7102 NLM_EXTERN Boolean PackSeqPntCheck (PackSeqPntPtr pspp)
7103 {
7104 Int4 len = INT4_MAX;
7105 BioseqPtr bsp;
7106 Int4 num, index, point;
7107 Boolean locked = FALSE;
7108
7109 if (pspp == NULL) return TRUE; /* cant compare */
7110
7111 bsp = BioseqFindCore(pspp->id);
7112 if (bsp == NULL)
7113 {
7114 bsp = BioseqLockById(pspp->id);
7115 if (bsp != NULL)
7116 locked = TRUE;
7117 }
7118 if (bsp != NULL)
7119 len = BioseqGetLen(bsp);
7120
7121 if (locked)
7122 BioseqUnlock(bsp);
7123 num = PackSeqPntNum(pspp); /* total number of points */
7124 for (index = 0; index < num; index++)
7125 {
7126 point = PackSeqPntGet(pspp, index);
7127
7128 if ((point < 0) || (point >= len))
7129 return FALSE;
7130 }
7131
7132 return TRUE;
7133
7134 }
7135
7136
7137 /*****************************************************************************
7138 *
7139 * SeqLocCheck (slp)
7140 *
7141 *****************************************************************************/
SeqLocCheck(SeqLocPtr slp)7142 NLM_EXTERN Uint1 SeqLocCheck (SeqLocPtr slp)
7143 {
7144 SeqLocPtr tmp;
7145 Uint1 thisstrand, laststrand=0;
7146 Boolean first = TRUE;
7147 Uint1 retval = SEQLOCCHECK_OK;
7148
7149 if (slp == NULL) return TRUE;
7150
7151 tmp = NULL;
7152 while ((tmp = SeqLocFindNext(slp, tmp)) != NULL)
7153 {
7154 if (tmp->choice == SEQLOC_NULL)
7155 {
7156 continue;
7157 }
7158 thisstrand = SeqLocStrand(tmp);
7159 if (! first)
7160 {
7161 if (thisstrand != laststrand)
7162 {
7163 ErrPostEx(SEV_WARNING,0,0,"Mixed strand location");
7164 retval = SEQLOCCHECK_WARNING;
7165 }
7166 }
7167 first = FALSE;
7168 laststrand = thisstrand;
7169
7170 switch (tmp->choice)
7171 {
7172 case SEQLOC_INT:
7173 if (! SeqIntCheck ((SeqIntPtr)(tmp->data.ptrvalue)))
7174 return SEQLOCCHECK_ERROR;
7175 break;
7176 case SEQLOC_PNT:
7177 if (! SeqPntCheck ((SeqPntPtr)(tmp->data.ptrvalue)))
7178 return SEQLOCCHECK_ERROR;
7179 break;
7180 case SEQLOC_PACKED_PNT:
7181 if (! PackSeqPntCheck ((PackSeqPntPtr)(tmp->data.ptrvalue)))
7182 return SEQLOCCHECK_ERROR;
7183 break;
7184 default:
7185 break;
7186 }
7187 }
7188
7189 return retval;
7190 }
7191
7192
7193 /*****************************************************************************
7194 *
7195 * SeqLocPartialCheck(head)
7196 * sets bits for incomplete location and/or errors
7197 * incomplete defined as Int-fuzz on start or stop with
7198 * lim.unk, lim.gt, or lim.lt set
7199 *
7200 * returns defined in header file
7201 *
7202 *****************************************************************************/
SeqLocPartialCheckEx(SeqLocPtr head,Boolean farFetch)7203 NLM_EXTERN Uint2 SeqLocPartialCheckEx (SeqLocPtr head, Boolean farFetch)
7204 {
7205 SeqLocPtr slp = NULL, first = NULL, last = NULL;
7206 Uint2 retval = 0;
7207 BioseqPtr bsp;
7208 SeqIntPtr sip;
7209 SeqPntPtr spp;
7210 PackSeqPntPtr pspp;
7211 IntFuzzPtr ifp;
7212 Boolean miss_end;
7213 ValNodePtr vnp, vnp2;
7214 Boolean locked, found_molinfo;
7215 MolInfoPtr mip;
7216
7217 if (head == NULL) return retval;
7218
7219 while ((slp = SeqLocFindNext(head, slp)) != NULL)
7220 {
7221 if (first == NULL)
7222 first = slp;
7223 last = slp;
7224 }
7225
7226 if (first == NULL) return retval;
7227
7228 slp = NULL;
7229 while ((slp = SeqLocFindNext(head, slp)) != NULL)
7230 {
7231 switch (slp->choice)
7232 {
7233 case SEQLOC_NULL:
7234 if (slp == first)
7235 retval |= SLP_START;
7236 else if (slp == last)
7237 retval |= SLP_STOP;
7238 else
7239 retval |= SLP_INTERNAL;
7240 break;
7241 case SEQLOC_INT:
7242 sip = (SeqIntPtr)(slp->data.ptrvalue);
7243 ifp = sip->if_from;
7244 if (ifp != NULL)
7245 {
7246 if (ifp->choice == 4) /* lim */
7247 {
7248 if (ifp->a == 1) /* gt */
7249 retval |= SLP_LIM_WRONG;
7250 else if ((ifp->a == 2) || (ifp->a == 0)) /* lt,unk */
7251 {
7252 if (sip->strand == Seq_strand_minus) /* stop */
7253 {
7254 if (slp == last)
7255 retval |= SLP_STOP;
7256 else
7257 retval |= SLP_INTERNAL;
7258 if (sip->from != 0)
7259 {
7260 if (slp == last)
7261 retval |= SLP_NOSTOP;
7262 else
7263 retval |= SLP_NOINTERNAL;
7264 }
7265 }
7266 else /* start */
7267 {
7268 if (slp == first)
7269 retval |= SLP_START;
7270 else
7271 retval |= SLP_INTERNAL;
7272 if (sip->from != 0)
7273 {
7274 if (slp == first)
7275 retval |= SLP_NOSTART;
7276 else
7277 retval |= SLP_NOINTERNAL;
7278 }
7279 }
7280 }
7281 } else if (ifp->choice == 2) /* range */ {
7282 if (sip->strand == Seq_strand_minus) {
7283 if (slp == last) {
7284 retval |= SLP_STOP;
7285 }
7286 } else {
7287 if (slp == first) {
7288 retval |= SLP_START;
7289 }
7290 }
7291 }
7292
7293 }
7294 ifp = sip->if_to;
7295 if (ifp != NULL)
7296 {
7297 if (ifp->choice == 4) /* lim */
7298 {
7299 if (ifp->a == 2) /* lt */
7300 retval |= SLP_LIM_WRONG;
7301 else if ((ifp->a == 1) || (ifp->a == 0)) /* gt,unk */
7302 {
7303 locked = FALSE;
7304 bsp = BioseqFindCore(sip->id);
7305 if (bsp == NULL && farFetch)
7306 {
7307 bsp = BioseqLockById(sip->id);
7308 if (bsp != NULL)
7309 locked = TRUE;
7310 }
7311 miss_end = FALSE;
7312 if (bsp != NULL)
7313 {
7314 if (sip->to != (bsp->length - 1))
7315 miss_end = TRUE;
7316 }
7317 if (locked)
7318 BioseqUnlock(bsp);
7319 if (sip->strand == Seq_strand_minus) /* start */
7320 {
7321 if (slp == first)
7322 retval |= SLP_START;
7323 else
7324 retval |= SLP_INTERNAL;
7325 if (miss_end)
7326 {
7327 if (slp == first /* was last */)
7328 retval |= SLP_NOSTART;
7329 else
7330 retval |= SLP_NOINTERNAL;
7331 }
7332 }
7333 else /* stop */
7334 {
7335 if (slp == last)
7336 retval |= SLP_STOP;
7337 else
7338 retval |= SLP_INTERNAL;
7339 if (miss_end)
7340 {
7341 if (slp == last)
7342 retval |= SLP_NOSTOP;
7343 else
7344 retval |= SLP_NOINTERNAL;
7345 }
7346 }
7347 }
7348 } else if (ifp->choice == 2) /* range */ {
7349 if (sip->strand == Seq_strand_minus) {
7350 if (slp == first) {
7351 retval |= SLP_START;
7352 }
7353 } else {
7354 if (slp == last) {
7355 retval |= SLP_STOP;
7356 }
7357 }
7358 }
7359 }
7360 break;
7361 case SEQLOC_PNT:
7362 spp = (SeqPntPtr)(slp->data.ptrvalue);
7363 ifp = spp->fuzz;
7364 if (ifp != NULL)
7365 {
7366 if (ifp->choice == 4) /* lim */
7367 {
7368 if ((ifp->a >= 0) && (ifp->a <= 2)) /* gt, lt,unk */
7369 {
7370 if (slp == first)
7371 retval |= SLP_START;
7372 if (slp == last)
7373 retval |= SLP_STOP;
7374 if ((slp != first) && (slp != last))
7375 retval |= SLP_INTERNAL;
7376 }
7377 }
7378 }
7379 break;
7380 case SEQLOC_PACKED_PNT:
7381 pspp = (PackSeqPntPtr)(slp->data.ptrvalue);
7382 ifp = pspp->fuzz;
7383 if (ifp != NULL)
7384 {
7385 if (ifp->choice == 4) /* lim */
7386 {
7387 if ((ifp->a >= 0) && (ifp->a <= 2)) /* gt, lt, unk */
7388 {
7389 if (slp == first)
7390 retval |= SLP_START;
7391 if (slp == last)
7392 retval |= SLP_STOP;
7393 if ((slp != first) && (slp != last))
7394 retval |= SLP_INTERNAL;
7395 }
7396 }
7397 }
7398 break;
7399 case SEQLOC_WHOLE:
7400 found_molinfo = FALSE;
7401 locked = FALSE;
7402 bsp = BioseqFindCore((SeqIdPtr)(slp->data.ptrvalue));
7403 if (bsp == NULL && farFetch)
7404 {
7405 bsp = BioseqLockById((SeqIdPtr)(slp->data.ptrvalue));
7406 if (bsp != NULL)
7407 locked = TRUE;
7408 }
7409 if (bsp != NULL) {
7410 vnp = NULL;
7411 while ((vnp = GetNextDescriptorUnindexed(bsp, Seq_descr_molinfo, vnp)) != NULL)
7412 {
7413 found_molinfo = TRUE;
7414 mip = (MolInfoPtr)(vnp->data.ptrvalue);
7415 switch (mip->completeness)
7416 {
7417 case 3: /* no left */
7418 if (slp == first)
7419 retval |= SLP_START;
7420 else
7421 retval |= SLP_INTERNAL;
7422 break;
7423 case 4: /* no right */
7424 if (slp == last)
7425 retval |= SLP_STOP;
7426 else
7427 retval |= SLP_INTERNAL;
7428 break;
7429 case 2: /* partial */
7430 retval |= SLP_OTHER;
7431 break;
7432 case 5: /* no ends */
7433 retval |= SLP_START;
7434 retval |= SLP_STOP;
7435 break;
7436 default:
7437 break;
7438 }
7439 }
7440 if (! found_molinfo)
7441 {
7442 while ((vnp = GetNextDescriptorUnindexed(bsp, Seq_descr_modif, vnp)) != NULL)
7443 {
7444 for (vnp2 = (ValNodePtr)(vnp->data.ptrvalue); vnp2 != NULL; vnp2 = vnp2->next)
7445 {
7446 switch (vnp2->data.intvalue)
7447 {
7448
7449 case 16: /* no left */
7450
7451 if (slp == first)
7452
7453 retval |= SLP_START;
7454
7455 else
7456 retval |= SLP_INTERNAL;
7457 break;
7458 case 17: /* no right */
7459 if (slp == last)
7460 retval |= SLP_STOP;
7461 else
7462 retval |= SLP_INTERNAL;
7463 break;
7464 case 10: /* partial */
7465 retval |= SLP_OTHER;
7466 break;
7467 }
7468 }
7469 }
7470 }
7471 }
7472 if (locked)
7473 BioseqUnlock (bsp);
7474 break;
7475 default:
7476 break;
7477
7478 }
7479 }
7480
7481 return retval;
7482 }
7483
SeqLocPartialCheck(SeqLocPtr head)7484 NLM_EXTERN Uint2 SeqLocPartialCheck(SeqLocPtr head)
7485
7486 {
7487 return SeqLocPartialCheckEx (head, TRUE);
7488 }
7489
7490 /*****************************************************************************
7491 *
7492 * StringForSeqMethod(Int2 method)
7493 * returns a descriptive string for sequencing method.
7494 *
7495 *****************************************************************************/
StringForSeqMethod(Int2 method)7496 NLM_EXTERN CharPtr StringForSeqMethod (Int2 method)
7497 {
7498 #define MAX_METHOD 6
7499 static char * methods[MAX_METHOD] = {
7500 "conceptual translation",
7501 "direct peptide sequencing",
7502 "conceptual translation with partial peptide sequencing",
7503 "sequenced peptide, ordered by overlap",
7504 "sequenced peptide, ordered by homology",
7505 "conceptual translation supplied by author" };
7506
7507 if ((method < 1) || (method > MAX_METHOD))
7508 return NULL;
7509
7510 return methods[method - 1];
7511 }
7512
7513 /*****************************************************************************
7514 *
7515 * StringForSeqTech(Int2 tech)
7516 * returns a descriptive string for sequencing method.
7517 * uses MolInfo from asn spec 4.0
7518 *****************************************************************************/
StringForSeqTech(Int2 tech)7519 NLM_EXTERN CharPtr StringForSeqTech (Int2 tech)
7520 {
7521 #define MAX_TECH 13
7522 static char * techs[MAX_TECH] = {
7523 NULL, /*"standard sequencing", */
7524 NULL, /*"Expressed Sequence Tag", */
7525 NULL, /*"Sequence Tagged Site", */
7526 NULL, /*"one-pass genomic sequence", */
7527 NULL, /*"from genetic mapping techniques", */
7528 NULL, /*"from physical mapping techniques", */
7529 NULL, /*"derived from other data, not a primary entity", */
7530 "conceptual translation",
7531 "direct peptide sequencing",
7532 "conceptual translation with partial peptide sequencing",
7533 "sequenced peptide, ordered by overlap",
7534 "sequenced peptide, ordered by homology",
7535 "conceptual translation supplied by author" };
7536
7537 if ((tech < 1) || (tech > MAX_TECH))
7538 return NULL;
7539
7540 return techs[tech - 1];
7541 }
7542
7543 static Boolean GetThePointForOffset(SeqLocPtr of, SeqPntPtr target, Uint1 which_end);
7544 Boolean GetThePointForOffsetEx(SeqLocPtr of, SeqPntPtr target, Uint1 which_end, Boolean is_circular);
7545 Boolean GetPointsForLeftAndRightOffsets(SeqLocPtr of, SeqPntPtr left, SeqPntPtr right, Boolean is_circular);
7546 static Int4 CheckOffsetInLoc(SeqLocPtr in, Int4 pos, BioseqPtr bsp, SeqIdPtr the_id);
7547 NLM_EXTERN Int4 CheckPointInBioseq(SeqPntPtr sp, BioseqPtr in, BoolPtr flip_strand, Boolean relaxed);
7548
7549 /*****************************************************************************
7550 *
7551 * Int4 GetOffsetInLoc (SeqLocPtr of, SeqLocPtr in, Uint1 which_end)
7552 * returns -1 if of not in, in
7553 *
7554 *****************************************************************************/
GetOffsetInLoc(SeqLocPtr of,SeqLocPtr in,Uint1 which_end)7555 NLM_EXTERN Int4 GetOffsetInLoc (SeqLocPtr of, SeqLocPtr in, Uint1 which_end)
7556 {
7557 SeqPnt sp;
7558 BioseqPtr bsp;
7559 Boolean locked = FALSE;
7560 Int4 result;
7561
7562 if ((in == NULL) || (of == NULL))
7563 return -1L;
7564
7565 if (! GetThePointForOffset(of, &sp, which_end))
7566 return -1L;
7567
7568 if (! IS_one_loc(in, FALSE)) /* optimize for multiple hits */
7569 {
7570 bsp = BioseqFindCore(sp.id); /* only need SeqIds */
7571 if (bsp == NULL)
7572 {
7573 bsp = BioseqLockById(sp.id);
7574 if (bsp != NULL)
7575 locked = TRUE;
7576 }
7577 }
7578 else
7579 bsp = NULL;
7580
7581 result = CheckOffsetInLoc(in, sp.point, bsp, sp.id);
7582 if (locked)
7583 BioseqUnlock(bsp);
7584 return result;
7585 }
7586
7587
7588 /*****************************************************************************
7589 *
7590 * Int4 GetOffsetInBioseq (SeqLocPtr of, BioseqPtr in, Uint1 which_end)
7591 * return -1 if of not in "in"
7592 *
7593 *****************************************************************************/
GetOffsetInBioseq(SeqLocPtr of,BioseqPtr in,Uint1 which_end)7594 NLM_EXTERN Int4 GetOffsetInBioseq (SeqLocPtr of, BioseqPtr in, Uint1 which_end)
7595 {
7596 SeqPnt sp;
7597
7598 if ((of == NULL) || (in == NULL))
7599 return -1;
7600
7601 if (! GetThePointForOffset(of, &sp, which_end))
7602 return -1L;
7603
7604 return CheckPointInBioseq(&sp, in, NULL, FALSE);
7605 }
7606
7607
GetOffsetInBioseqEx(SeqLocPtr of,BioseqPtr in,Uint1 which_end,Boolean is_circular,Boolean relaxed)7608 NLM_EXTERN Int4 GetOffsetInBioseqEx (SeqLocPtr of, BioseqPtr in, Uint1 which_end, Boolean is_circular, Boolean relaxed)
7609 {
7610 SeqPnt sp;
7611
7612 if ((of == NULL) || (in == NULL))
7613 return -1;
7614
7615 if (! GetThePointForOffsetEx(of, &sp, which_end, is_circular))
7616 return -1L;
7617
7618 return CheckPointInBioseq(&sp, in, NULL, relaxed);
7619 }
7620
7621
GetLeftAndRightOffsetsInBioseq(SeqLocPtr of,BioseqPtr in,Int4Ptr left,Int4Ptr right,Boolean is_circular,Boolean relaxed,BoolPtr left_flip,BoolPtr right_flip)7622 NLM_EXTERN void GetLeftAndRightOffsetsInBioseq (SeqLocPtr of, BioseqPtr in, Int4Ptr left, Int4Ptr right, Boolean is_circular, Boolean relaxed, BoolPtr left_flip, BoolPtr right_flip)
7623 {
7624 SeqPnt l, r;
7625
7626 if (left != NULL) {
7627 *left = -1;
7628 }
7629 if (right != NULL) {
7630 *right = -1;
7631 }
7632 if ((of == NULL) || (in == NULL))
7633 return;
7634
7635 if (!GetPointsForLeftAndRightOffsets (of, &l, &r, is_circular)) {
7636 return;
7637 }
7638 if (left != NULL) {
7639 *left = CheckPointInBioseq (&l, in, left_flip, relaxed);
7640 }
7641 if (right != NULL) {
7642 *right = CheckPointInBioseq (&r, in, right_flip, relaxed);
7643 }
7644 }
7645
7646 /*****************************************************************************
7647 *
7648 * CheckPointInBioseq(pnt, in)
7649 *
7650 *****************************************************************************/
CheckPointInBioseq(SeqPntPtr sp,BioseqPtr in,BoolPtr flip_strand,Boolean relaxed)7651 NLM_EXTERN Int4 CheckPointInBioseq (SeqPntPtr sp, BioseqPtr in, BoolPtr flip_strand, Boolean relaxed)
7652 {
7653 ValNode sl;
7654 BioseqPtr bsp;
7655 Int4 retval = -1;
7656 SeqLocPtr slp = NULL, curr;
7657 Int4 offset, offset2, strt, stp;
7658 SeqIdPtr sip;
7659 Boolean locked = FALSE;
7660
7661 if (SeqIdIn(sp->id, in->id)) /* in this one */
7662 return sp->point;
7663
7664 switch (in->repr)
7665 {
7666 case Seq_repr_virtual:
7667 case Seq_repr_raw:
7668 case Seq_repr_const:
7669 case Seq_repr_map:
7670 return -1; /* nothing more can be done */
7671
7672 case Seq_repr_ref:
7673 slp = (ValNodePtr) in->seq_ext;
7674 break;
7675
7676 case Seq_repr_seg:
7677 sl.choice = SEQLOC_MIX;
7678 sl.data.ptrvalue = in->seq_ext;
7679 slp = &sl;
7680 break;
7681
7682 case Seq_repr_delta:
7683 break;
7684
7685 default:
7686 return -1;
7687 }
7688
7689 bsp = BioseqFindCore(sp->id); /* only need SeqIds */
7690 if (bsp == NULL)
7691 {
7692 bsp = BioseqLockById(sp->id);
7693 if (bsp != NULL)
7694 locked = TRUE;
7695 }
7696 if (in->repr == Seq_repr_seg || in->repr == Seq_repr_delta) {
7697 retval = SeqMgrMapPartToSegmentedBioseq (in, sp->point, bsp, sp->id, flip_strand, relaxed);
7698 }
7699 if (retval == -1) {
7700 retval = CheckOffsetInLoc(slp, sp->point, bsp, sp->id);
7701 }
7702
7703 if (locked)
7704 BioseqUnlock(bsp);
7705
7706 if (retval >= 0) return retval; /* got it on segments */
7707
7708 /* look for segmented segments */
7709 offset = 0;
7710 curr = NULL;
7711 while ((curr = SeqLocFindNext(slp, curr)) != NULL)
7712 {
7713 sip = SeqLocId(curr);
7714 if (sip != NULL)
7715 {
7716 bsp = BioseqLockById(sip);
7717 if (bsp != NULL)
7718 {
7719 switch (bsp->repr)
7720 {
7721 case Seq_repr_ref: /* could have more levels */
7722 case Seq_repr_seg:
7723 offset2 = CheckPointInBioseq(sp, bsp, flip_strand, relaxed);
7724 if (offset2 >= 0) /* got it */
7725 {
7726 strt = SeqLocStart(curr);
7727 stp = SeqLocStop(curr);
7728 if ((offset2 >= strt) && (offset2 <= stp))
7729 {
7730 if (SeqLocStrand(curr) == Seq_strand_minus)
7731 offset2 = stp - offset2;
7732 else
7733 offset2 -= strt;
7734 retval = offset2 + offset;
7735 return retval;
7736 }
7737 }
7738 break;
7739 default: /* one level, already checked */
7740 break;
7741 }
7742 BioseqUnlock(bsp);
7743 }
7744 }
7745 offset += SeqLocLen(curr);
7746 }
7747
7748 return retval; /* all failed */
7749 }
7750
7751
/*****************************************************************************
*
*   SeqIdPtr GetEarlierSeqIdPtr (sip1, sip2)
*   	Decides which of two SeqIds refers to the Bioseq that appears
*   	first in their common parent BioseqSet, and returns that SeqId.
*
*   	- exactly one id NULL: the other id is returned
*   	- SeqIdComp() says SIC_YES: sip1
*   	- neither Bioseq found by BioseqFind(): sip1; only one found: the
*   	  id of the one that was found
*   	- both have a parent pointer and the parents differ, or no parent
*   	  BioseqSet can be determined, or neither Bioseq is a direct
*   	  member of the set: NULL
*
*****************************************************************************/
static SeqIdPtr GetEarlierSeqIdPtr (SeqIdPtr sip1, SeqIdPtr sip2)
{
	BioseqPtr    first_bsp, second_bsp;
	BioseqSetPtr parent_set = NULL;
	SeqEntryPtr  entry;

	/* exactly one id missing: the other one wins */
	if (sip1 == NULL && sip2 != NULL)
		return sip2;
	if (sip1 != NULL && sip2 == NULL)
		return sip1;

	if (SeqIdComp(sip1, sip2) == SIC_YES)
		return sip1;

	first_bsp = BioseqFind (sip1);
	second_bsp = BioseqFind (sip2);
	if (first_bsp == NULL)
		return (second_bsp == NULL) ? sip1 : sip2;
	if (second_bsp == NULL)
		return sip1;

	/* both have a parent, but not the same one: no common ordering */
	if (first_bsp->idx.parentptr != NULL
	    && second_bsp->idx.parentptr != 0
	    && first_bsp->idx.parentptr != second_bsp->idx.parentptr)
	{
		return NULL;
	}

	/* use whichever Bioseq has a BioseqSet parent, preferring the first */
	if (first_bsp->idx.parentptr != NULL && first_bsp->idx.parenttype == OBJ_BIOSEQSET)
		parent_set = first_bsp->idx.parentptr;
	else if (second_bsp->idx.parentptr != NULL && second_bsp->idx.parenttype == OBJ_BIOSEQSET)
		parent_set = second_bsp->idx.parentptr;

	if (parent_set == NULL)
		return NULL;

	/* the first of the two met while walking the set is the earlier one */
	entry = parent_set->seq_set;
	while (entry != NULL)
	{
		if (entry->data.ptrvalue == first_bsp)
			return sip1;
		if (entry->data.ptrvalue == second_bsp)
			return sip2;
		entry = entry->next;
	}

	return NULL;
}
7811
7812 /*****************************************************************************
7813 *
7814 * Boolean GetThePointForOffset(SeqLocPtr of, SeqPntPtr target, Uint1 which_end)
7815 *
7816 *****************************************************************************/
GetThePointForOffsetEx(SeqLocPtr of,SeqPntPtr target,Uint1 which_end,Boolean is_circular)7817 Boolean GetThePointForOffsetEx(SeqLocPtr of, SeqPntPtr target, Uint1 which_end, Boolean is_circular)
7818 {
7819 SeqLocPtr pnt, first=NULL, last=NULL;
7820 Uint1 first_strand, last_strand;
7821 Boolean all_minus = TRUE;
7822 Boolean all_non_minus = TRUE;
7823 Int4 lowest = -1, highest = 0, tmp;
7824 SeqIdPtr low_sip = NULL, high_sip = NULL, first_sip = NULL, last_sip = NULL;
7825 Boolean id_same;
7826
7827 pnt = NULL; /* get first or last single span type in "of"*/
7828
7829 while ((pnt = SeqLocFindNext(of, pnt)) != NULL)
7830 {
7831 if( pnt->choice == SEQLOC_NULL )
7832 {
7833 /* Skip NULL parts when determining offsets */
7834 continue;
7835 }
7836 last_strand = SeqLocStrand (pnt);
7837 last_sip = SeqLocId (pnt);
7838 if (last_strand == Seq_strand_minus) {
7839 all_non_minus = FALSE;
7840 } else {
7841 all_minus = FALSE;
7842 }
7843 last = pnt;
7844 if (first == NULL)
7845 {
7846 first = pnt;
7847 first_strand = last_strand;
7848 first_sip = last_sip;
7849 lowest = SeqLocStart(pnt);
7850 highest = SeqLocStop (pnt);
7851 low_sip = last_sip;
7852 high_sip = last_sip;
7853 }
7854 else
7855 {
7856 tmp = SeqLocStart (pnt);
7857 if (SeqIdComp (last_sip, low_sip))
7858 {
7859 id_same = TRUE;
7860 }
7861 else
7862 {
7863 id_same = FALSE;
7864 }
7865 if ((id_same && tmp < lowest)
7866 || (!id_same && last_sip == GetEarlierSeqIdPtr (last_sip, low_sip)))
7867 {
7868 lowest = tmp;
7869 low_sip = last_sip;
7870 }
7871 tmp = SeqLocStop (pnt);
7872
7873 if (SeqIdComp (last_sip, high_sip))
7874 {
7875 id_same = TRUE;
7876 }
7877 else
7878 {
7879 id_same = FALSE;
7880 }
7881 if ((id_same && tmp > highest)
7882 || (!id_same && high_sip == GetEarlierSeqIdPtr (high_sip, last_sip)))
7883 {
7884 highest = tmp;
7885 high_sip = last_sip;
7886 }
7887 }
7888 } /* otherwise, get last */
7889 if (first == NULL)
7890 return FALSE;
7891
7892 /* ignore circularity if strandedness is mixed */
7893 if( ! all_minus && ! all_non_minus ) {
7894 is_circular = FALSE;
7895 }
7896
7897 switch (which_end)
7898 {
7899 case SEQLOC_LEFT_END:
7900 if (is_circular) {
7901 if (all_minus) {
7902 target->point = SeqLocStart (last);
7903 target->id = last_sip;
7904 } else {
7905 target->point = SeqLocStart (first);
7906 target->id = first_sip;
7907 }
7908 } else {
7909 target->point = lowest;
7910 target->id = low_sip;
7911 }
7912 break;
7913 case SEQLOC_RIGHT_END:
7914 if (is_circular) {
7915 if (all_minus) {
7916 target->point = SeqLocStop (first);
7917 target->id = first_sip;
7918 } else {
7919 target->point = SeqLocStop (last);
7920 target->id = last_sip;
7921 }
7922 } else {
7923 target->point = highest;
7924 target->id = high_sip;
7925 }
7926 break;
7927 case SEQLOC_START:
7928 if (all_minus)
7929 {
7930 target->point = SeqLocStop (first);
7931 target->id = first_sip;
7932 }
7933 else
7934 {
7935 if (first_strand == Seq_strand_minus)
7936 {
7937 target->point = SeqLocStop (first);
7938 }
7939 else
7940 {
7941 target->point = SeqLocStart (first);
7942 }
7943 target->id = first_sip;
7944 }
7945 break;
7946 case SEQLOC_STOP:
7947 if (all_minus)
7948 {
7949 target->point = SeqLocStart (last);
7950 target->id = last_sip;
7951 }
7952 else
7953 {
7954 if (last_strand == Seq_strand_minus)
7955 {
7956 target->point = SeqLocStart (last);
7957 }
7958 else
7959 {
7960 target->point = SeqLocStop (last);
7961 }
7962 target->id = last_sip;
7963 }
7964 break;
7965 default:
7966 return FALSE; /* error */
7967 }
7968
7969 /* SeqLocStart returns 'from', and SeqLocStop returns 'to', regardless of strand! */
7970
7971 if ((target->point < 0) || (target->id == NULL))
7972 return FALSE;
7973
7974 return TRUE;
7975 }
7976
7977
GetThePointForOffset(SeqLocPtr of,SeqPntPtr target,Uint1 which_end)7978 Boolean GetThePointForOffset(SeqLocPtr of, SeqPntPtr target, Uint1 which_end)
7979 {
7980 BioseqPtr bsp;
7981 Boolean is_circular = FALSE;
7982
7983 bsp = BioseqFind (SeqLocId(of));
7984 if (bsp != NULL && bsp->topology == TOPOLOGY_CIRCULAR) {
7985 is_circular = TRUE;
7986 }
7987 return GetThePointForOffsetEx (of, target, which_end, is_circular);
7988 }
7989
7990
/*****************************************************************************
*
*   Boolean GetPointsForLeftAndRightOffsets(of, left, right, is_circular)
*   	Single-pass variant of GetThePointForOffsetEx() that produces both
*   	the left end and the right end of "of" (point and id only).
*   	SEQLOC_NULL parts are ignored.  is_circular selects the first/last
*   	part (depending on strand) instead of the lowest/highest
*   	coordinate; it is disregarded when the parts have mixed strands.
*
*   	Returns FALSE if "of" has no usable part, or if either resulting
*   	point is negative or has no id.
*
*****************************************************************************/
Boolean GetPointsForLeftAndRightOffsets(SeqLocPtr of, SeqPntPtr left, SeqPntPtr right, Boolean is_circular)
{
	SeqLocPtr cur = NULL;                 /* current single-span piece of "of" */
	SeqLocPtr first = NULL, last = NULL;
	Uint1     cur_strand;
	Boolean   any_minus = FALSE;          /* some part is on the minus strand */
	Boolean   any_other = FALSE;          /* some part is not on the minus strand */
	Boolean   ids_agree;
	Int4      lowest = -1, highest = 0, coord;
	SeqIdPtr  low_sip = NULL, high_sip = NULL, first_sip = NULL, last_sip = NULL;

	while ((cur = SeqLocFindNext(of, cur)) != NULL)
	{
		if (cur->choice == SEQLOC_NULL)
			continue;   /* NULL parts do not take part in offsets */

		cur_strand = SeqLocStrand (cur);
		last_sip = SeqLocId (cur);
		if (cur_strand == Seq_strand_minus)
			any_minus = TRUE;
		else
			any_other = TRUE;
		last = cur;

		if (first == NULL)
		{
			/* first usable part seeds every running value */
			first = cur;
			first_sip = last_sip;
			lowest = SeqLocStart(cur);
			highest = SeqLocStop (cur);
			low_sip = last_sip;
			high_sip = last_sip;
			continue;
		}

		/* NOTE(review): any nonzero SeqIdComp() result counts as "same id"
		   here, while GetEarlierSeqIdPtr() tests for SIC_YES - confirm
		   that this is intended. */
		coord = SeqLocStart (cur);
		ids_agree = (Boolean)(SeqIdComp (last_sip, low_sip) != 0);
		if (ids_agree ? (coord < lowest)
		              : (last_sip == GetEarlierSeqIdPtr (last_sip, low_sip)))
		{
			lowest = coord;
			low_sip = last_sip;
		}

		coord = SeqLocStop (cur);
		ids_agree = (Boolean)(SeqIdComp (last_sip, high_sip) != 0);
		if (ids_agree ? (coord > highest)
		              : (high_sip == GetEarlierSeqIdPtr (high_sip, last_sip)))
		{
			highest = coord;
			high_sip = last_sip;
		}
	}

	if (first == NULL)
		return FALSE;

	/* ignore circularity if strandedness is mixed */
	if (any_minus && any_other)
		is_circular = FALSE;

	if (! is_circular)
	{
		/* linear: plain extremes */
		left->point = lowest;
		left->id = low_sip;
		right->point = highest;
		right->id = high_sip;
	}
	else if (! any_other)
	{
		/* circular, all minus: left comes from the last part, right from the first */
		left->point = SeqLocStart (last);
		left->id = last_sip;
		right->point = SeqLocStop (first);
		right->id = first_sip;
	}
	else
	{
		/* circular, no minus parts: left comes from the first part, right from the last */
		left->point = SeqLocStart (first);
		left->id = first_sip;
		right->point = SeqLocStop (last);
		right->id = last_sip;
	}

	if (left->point < 0 || left->id == NULL)
		return FALSE;
	if (right->point < 0 || right->id == NULL)
		return FALSE;

	return TRUE;
}
8105
8106
8107 /*****************************************************************************
8108 *
8109 * CheckOffsetInLoc()
8110 *
8111 *****************************************************************************/
CheckOffsetInLoc(SeqLocPtr in,Int4 pos,BioseqPtr bsp,SeqIdPtr the_id)8112 static Int4 CheckOffsetInLoc(SeqLocPtr in, Int4 pos, BioseqPtr bsp, SeqIdPtr the_id)
8113 {
8114 SeqIdPtr tsip, sip;
8115 SeqLocPtr tmp;
8116 SeqIntPtr sipp;
8117 Boolean checkin, doit;
8118 Int4 ctr = 0, len;
8119
8120 if (bsp != NULL)
8121 {
8122 tsip = bsp->id;
8123 checkin = 1;
8124 }
8125 else
8126 {
8127 tsip = the_id;
8128 checkin = 0;
8129 }
8130
8131 tmp = NULL;
8132 while ((tmp = SeqLocFindNext(in, tmp)) != NULL)
8133 {
8134 sip = SeqLocId(tmp);
8135 if (checkin) /* optimizer */
8136 doit = SeqIdIn(sip, tsip);
8137 else
8138 doit = SeqIdForSameBioseq(sip, tsip);
8139 switch (tmp->choice)
8140 {
8141 case SEQLOC_PNT:
8142 if (doit)
8143 {
8144 if (pos == ((SeqPntPtr)(tmp->data.ptrvalue))->point)
8145 return ctr;
8146 }
8147 ctr++;
8148 break;
8149 case SEQLOC_INT:
8150 sipp = (SeqIntPtr)(tmp->data.ptrvalue);
8151 if (doit)
8152 {
8153 if ((pos >= sipp->from) && (pos <= sipp->to))
8154 {
8155 if (sipp->strand == Seq_strand_minus)
8156 ctr += (sipp->to - pos);
8157 else
8158 ctr += (pos - sipp->from);
8159 return ctr;
8160 }
8161 }
8162 ctr += (sipp->to - sipp->from + 1);
8163 break;
8164 case SEQLOC_WHOLE:
8165 if (doit)
8166 {
8167 ctr += pos;
8168 return ctr;
8169 }
8170 default:
8171 len = SeqLocLen(tmp);
8172 if (len > 0) ctr += len;
8173 break;
8174 }
8175 }
8176
8177 return -1; /* failed */
8178 }
8179
8180
8181 /*****************************************************************************
8182 *
8183 * Int2 SeqLocOrder(SeqLocPtr a, SeqLocPtr b, BioseqPtr in);
8184 * This function is used to sort SeqLocs into ascending order by
8185 * location on a Bioseq (segmented or otherwise)
8186 * The first position is the point sorted on.
8187 * Returns
8188 * 0 a and b start at same offset
8189 * 1 a > b
8190 * -1 a < b
8191 * -2 a in bsp, b not
8192 * 2 b in bsp, a not
8193 * 3 neither a nor b in bsp
8194 * This function will attempt to sort locs not in bsp to the end of
8195 * the list. Values -2,2,3 can also be used to detect error conditions.
8196 *
8197 *****************************************************************************/
SeqLocOrder(SeqLocPtr a,SeqLocPtr b,BioseqPtr in)8198 NLM_EXTERN Int2 SeqLocOrder (SeqLocPtr a, SeqLocPtr b, BioseqPtr in)
8199 {
8200 Int4 aoffset, boffset;
8201
8202
8203 if ((a == NULL) || (b == NULL) || (in == NULL))
8204 return 3;
8205
8206 aoffset = GetOffsetInBioseq(a, in, SEQLOC_LEFT_END);
8207 boffset = GetOffsetInBioseq(b, in, SEQLOC_LEFT_END);
8208
8209 if ((aoffset == -1) && (boffset >= 0))
8210 return 2;
8211 else if ((aoffset >= 0) && (boffset == -1))
8212 return -2;
8213 else if ((aoffset == -1) && (boffset == -1))
8214 return 3;
8215 else if (aoffset == boffset)
8216 return 0;
8217 else if (aoffset < boffset)
8218 return -1;
8219 else
8220 return 1;
8221 }
8222
8223 /*****************************************************************************
8224 *
8225 * Int2 SeqLocMol(seqloc)
8226 * returns Seq-inst.mol for all Bioseqs this seqloc points to.
8227 * if all Seq-inst.mol the same, returns that.
8228 * if mixed dna,rna, returns na
8229 * if mixed na,aa or can't find any Bioseq, or bsp->mol = 0, or 255
8230 * returns 0
8231 *
8232 *****************************************************************************/
SeqLocMol(SeqLocPtr seqloc)8233 NLM_EXTERN Int2 SeqLocMol (SeqLocPtr seqloc)
8234 {
8235 SeqLocPtr slp = NULL;
8236 SeqIdPtr sip;
8237 static Uint1 cases[5][4] = {
8238 { 1,2,3,4 } , /* was 0, not-set */
8239 { 1,4,0,4 } , /* was 1, dna */
8240 { 4,2,0,4 } , /* was 2, rna */
8241 { 0,0,3,0 } , /* was 3, aa */
8242 { 4,4,0,4 }}; /* was 4, na */
8243 Int2 the_mol = 0, tmp;
8244 BioseqPtr bsp;
8245 Boolean locked = FALSE;
8246
8247 while ((slp = SeqLocFindNext(seqloc, slp)) != NULL)
8248 {
8249 sip = SeqLocId(slp);
8250 if (sip != NULL)
8251 {
8252 bsp = BioseqFindCore(sip);
8253 if (bsp == NULL)
8254 {
8255 bsp = BioseqLockById(sip);
8256 if (bsp != NULL)
8257 locked = TRUE;
8258 }
8259 if (bsp == NULL)
8260 return 0;
8261
8262 tmp = (Int2)bsp->mol;
8263 if (locked)
8264 BioseqUnlock(bsp);
8265 if ((tmp == 0) || (tmp == Seq_mol_other))
8266 return 0;
8267 the_mol = (Int2)cases[the_mol][tmp-1];
8268 if (! the_mol)
8269 return 0;
8270 }
8271 }
8272 return the_mol;
8273 }
8274
/* Forward declarations of the static print helpers defined below. */
static SeqIdPtr SeqLocPrintProc(SeqLocPtr slp, ByteStorePtr bsp, Boolean first, SeqIdPtr lastid, Boolean use_best_id);
static void BSstring(ByteStorePtr bsp, CharPtr str);
8277
8278
SeqLocPrintEx(SeqLocPtr slp,Boolean use_best_id)8279 static CharPtr SeqLocPrintEx (SeqLocPtr slp, Boolean use_best_id)
8280 {
8281 ByteStorePtr bsp;
8282 CharPtr str;
8283 SeqLocPtr tmp;
8284
8285 if (slp == NULL) return NULL;
8286
8287 bsp = BSNew(80);
8288
8289 tmp = slp->next; /* save possible chain */
8290 slp->next = NULL; /* take out of possible chain */
8291
8292 SeqLocPrintProc(slp, bsp, TRUE, NULL, use_best_id);
8293
8294 slp->next = tmp; /* replace possible chain */
8295 str = (CharPtr)BSMerge(bsp, NULL);
8296 BSFree(bsp);
8297
8298 return str;
8299 }
8300
8301 /*****************************************************************************
8302 *
8303 * SeqLocPrint(slp)
8304 *
8305 *****************************************************************************/
SeqLocPrint(SeqLocPtr slp)8306 NLM_EXTERN CharPtr SeqLocPrint(SeqLocPtr slp)
8307 {
8308 return SeqLocPrintEx (slp, FALSE);
8309 }
8310
SeqLocPrintUseBestID(SeqLocPtr slp)8311 NLM_EXTERN CharPtr SeqLocPrintUseBestID(SeqLocPtr slp)
8312 {
8313 return SeqLocPrintEx (slp, TRUE);
8314 }
8315
/* Prototypes for the point/fuzz printers defined below. */
NLM_EXTERN SeqIdPtr SeqPointWrite(SeqPntPtr spp, CharPtr buf, SeqIdPtr lastid, Int2 buflen);
NLM_EXTERN SeqIdPtr SeqPointPrint(SeqPntPtr spp, CharPtr buf, SeqIdPtr lastid);
NLM_EXTERN void IntFuzzPrint(IntFuzzPtr ifp, Int4 pos, CharPtr buf, Boolean right);
/* One-character strand markers, looked up with the strand field of an
 * interval or point as index; a '\0' entry means "print nothing".
 * The table has only 5 entries, so it covers index values 0..4.
 * NOTE(review): presumably 'c', 'b', 'r' stand for minus (complement),
 * both and both-reverse - confirm against the Seq_strand_* definitions. */
static char strandsymbol[5] = { '\0', '\0', 'c', 'b', 'r' };
static SeqIdPtr SeqPointWriteEx (SeqPntPtr spp, CharPtr buf, SeqIdPtr lastid, Int2 buflen, Boolean use_best_id);
8321
8322
8323 /*****************************************************************************
8324 *
8325 * SeqLocPrintProc(slp, bsp, first, lastid)
8326 * print traversal routine
8327 * goes down slp chain
8328 *
8329 *****************************************************************************/
8330 static SeqIdPtr
SeqLocPrintProc(SeqLocPtr slp,ByteStorePtr bsp,Boolean first,SeqIdPtr lastid,Boolean use_best_id)8331 SeqLocPrintProc
8332 (SeqLocPtr slp,
8333 ByteStorePtr bsp,
8334 Boolean first,
8335 SeqIdPtr lastid,
8336 Boolean use_best_id)
8337 {
8338 Char buf[128];
8339 SeqBondPtr sbp;
8340 PackSeqPntPtr pspp;
8341 SeqIntPtr sip;
8342 IntFuzzPtr ifp1, ifp2;
8343 Int4 from, to;
8344 Int2 delim, delim2;
8345 BioseqPtr seq;
8346 SeqIdPtr thisid;
8347
8348 while (slp != NULL)
8349 {
8350 if (! first)
8351 {
8352 BSPutByte(bsp, ',');
8353 BSPutByte(bsp, ' ');
8354 }
8355 first = FALSE;
8356
8357 delim = 0;
8358 switch (slp->choice)
8359 {
8360 case SEQLOC_BOND: /* bond -- 2 seqs */
8361 sbp = (SeqBondPtr)(slp->data.ptrvalue);
8362 if (sbp->a != NULL)
8363 {
8364 lastid = SeqPointWriteEx(sbp->a, buf, lastid, sizeof(buf) - 1, use_best_id);
8365 BSstring(bsp, buf);
8366 }
8367 else
8368 BSPutByte(bsp, '?');
8369
8370 BSPutByte(bsp, '=');
8371
8372 if (sbp->b != NULL)
8373 {
8374 lastid = SeqPointWriteEx(sbp->b, buf, lastid, sizeof(buf) - 1, use_best_id);
8375 BSstring(bsp, buf);
8376 }
8377 else
8378 BSPutByte(bsp, '?');
8379 break;
8380 case SEQLOC_FEAT: /* feat -- can't track yet */
8381 BSstring(bsp, "(feat)");
8382 break;
8383 case SEQLOC_NULL: /* NULL */
8384 BSPutByte(bsp, '~');
8385 break;
8386 case SEQLOC_EMPTY: /* empty */
8387 BSPutByte(bsp, '{');
8388 SeqIdWrite((SeqIdPtr)(slp->data.ptrvalue),
8389 buf, PRINTID_FASTA_SHORT, sizeof(buf) - 1);
8390 BSstring(bsp, buf);
8391 BSPutByte(bsp, '}');
8392 break;
8393 case SEQLOC_WHOLE: /* whole */
8394 SeqIdWrite((SeqIdPtr)(slp->data.ptrvalue),
8395 buf, PRINTID_FASTA_SHORT, sizeof(buf) - 1);
8396 BSstring(bsp, buf);
8397 break;
8398 case SEQLOC_MIX: /* mix -- more than one seq */
8399 case SEQLOC_PACKED_INT: /* packed int */
8400 delim = '(';
8401 delim2 = ')';
8402 case SEQLOC_EQUIV: /* equiv -- ditto */
8403 if (! delim)
8404 {
8405 delim = '[';
8406 delim2 = ']';
8407 }
8408 BSPutByte(bsp, delim);
8409 lastid = SeqLocPrintProc((SeqLocPtr)(slp->data.ptrvalue), bsp, TRUE, lastid, use_best_id);
8410 BSPutByte(bsp, delim2);
8411 break;
8412 case SEQLOC_INT: /* int */
8413 {
8414 Uint1 seqid_format = PRINTID_FASTA_SHORT;
8415 sip = (SeqIntPtr)(slp->data.ptrvalue);
8416 thisid = sip->id;
8417 if (use_best_id)
8418 {
8419 seq = BioseqFind (thisid);
8420 if (seq != NULL)
8421 {
8422 /* JIRA ID-3530 : Find Seq-id containing accession */
8423 thisid = SeqIdFindBestAccession (seq->id);
8424 seqid_format = PRINTID_TEXTID_ACC_VER;
8425 }
8426 }
8427 if (! SeqIdMatch(sip->id, lastid))
8428 {
8429 SeqIdWrite(thisid, buf, seqid_format, sizeof(buf) - 1);
8430 BSstring(bsp, buf);
8431 BSPutByte(bsp, ':');
8432 }
8433 lastid = thisid;
8434 if (strandsymbol[sip->strand])
8435 BSPutByte(bsp, (Int2)strandsymbol[sip->strand]);
8436 if ((sip->strand == Seq_strand_minus) ||
8437 (sip->strand == Seq_strand_both_rev))
8438 {
8439 ifp1 = sip->if_to;
8440 ifp2 = sip->if_from;
8441 to = sip->from;
8442 from = sip->to;
8443 }
8444 else
8445 {
8446 ifp1 = sip->if_from;
8447 ifp2 = sip->if_to;
8448 to = sip->to;
8449 from = sip->from;
8450
8451 }
8452 IntFuzzPrint(ifp1, from, buf, FALSE);
8453 BSstring(bsp, buf);
8454 BSPutByte(bsp, '-');
8455 IntFuzzPrint(ifp2, to, buf, TRUE);
8456 BSstring(bsp, buf);
8457
8458 break;
8459 }
8460 case SEQLOC_PNT: /* pnt */
8461 lastid = SeqPointWriteEx((SeqPntPtr)(slp->data.ptrvalue),
8462 buf, lastid, sizeof(buf) - 1, use_best_id);
8463 BSstring(bsp, buf);
8464 break;
8465 case SEQLOC_PACKED_PNT: /* packed pnt */
8466 pspp = (PackSeqPntPtr)(slp->data.ptrvalue);
8467 if (pspp != NULL)
8468 BSstring(bsp, "PackSeqPnt");
8469 break;
8470 default:
8471 BSstring(bsp, "(\?\?)");
8472 break;
8473 }
8474 slp = slp->next;
8475 }
8476 return lastid;
8477 }
8478
8479
8480 /*****************************************************************************
8481 *
8482 * BSstring(bsp, str)
8483 *
8484 *****************************************************************************/
BSstring(ByteStorePtr bsp,CharPtr str)8485 static void BSstring(ByteStorePtr bsp, CharPtr str)
8486 {
8487 BSWrite(bsp, str, (Int4)(StringLen(str)));
8488 return;
8489 }
8490
8491 /*****************************************************************************
8492 *
8493 * SeqPointPrint(spp, buf, lastid)
8494 *
8495 *****************************************************************************/
SeqPointPrint(SeqPntPtr spp,CharPtr buf,SeqIdPtr lastid)8496 NLM_EXTERN SeqIdPtr SeqPointPrint(SeqPntPtr spp, CharPtr buf, SeqIdPtr lastid)
8497 {
8498 CharPtr tmp;
8499
8500 if ((spp == NULL) || (buf == NULL)) return NULL;
8501
8502 tmp = buf;
8503 *tmp = '\0';
8504 if (! SeqIdMatch(spp->id, lastid))
8505 {
8506 SeqIdPrint(spp->id, tmp, PRINTID_FASTA_SHORT);
8507 while (*tmp != '\0') tmp++;
8508 *tmp = ':';
8509 tmp++; *tmp = '\0';
8510 }
8511 if (strandsymbol[spp->strand])
8512 {
8513 *tmp = strandsymbol[spp->strand];
8514 tmp++; *tmp = '\0';
8515 }
8516 IntFuzzPrint(spp->fuzz, spp->point, tmp, TRUE);
8517
8518 return spp->id;
8519 }
8520
8521 static SeqIdPtr
SeqPointWriteEx(SeqPntPtr spp,CharPtr buf,SeqIdPtr lastid,Int2 buflen,Boolean use_best_id)8522 SeqPointWriteEx
8523 (SeqPntPtr spp,
8524 CharPtr buf,
8525 SeqIdPtr lastid,
8526 Int2 buflen,
8527 Boolean use_best_id)
8528 {
8529 CharPtr tmp;
8530 SeqIdPtr best_id, tmp_next;
8531 BioseqPtr bsp;
8532 Int4 fuzzlen, id_len;
8533 Char fuzzbuf[100];
8534
8535 if ((spp == NULL) || (buf == NULL)) return NULL;
8536
8537 tmp = buf;
8538 *tmp = '\0';
8539 if (buflen < 2) return NULL;
8540
8541 best_id = spp->id;
8542 if (use_best_id)
8543 {
8544 bsp = BioseqFind (spp->id);
8545 if (bsp != NULL)
8546 {
8547 best_id = SeqIdFindBest (bsp->id, SEQID_GENBANK);
8548 }
8549 }
8550 tmp_next = best_id->next;
8551 best_id->next = NULL;
8552
8553 IntFuzzPrint(spp->fuzz, spp->point, fuzzbuf, TRUE);
8554 fuzzlen = StringLen (fuzzbuf);
8555
8556 if (! SeqIdMatch(best_id, lastid))
8557 {
8558 SeqIdWrite(best_id, tmp, PRINTID_FASTA_SHORT, buflen - 2);
8559 while (*tmp != '\0') tmp++;
8560 *tmp = ':';
8561 tmp++; *tmp = '\0';
8562 }
8563 if (strandsymbol[spp->strand])
8564 {
8565 *tmp = strandsymbol[spp->strand];
8566 tmp++; *tmp = '\0';
8567 }
8568
8569 id_len = StringLen (buf);
8570 if (id_len < buflen - 1) {
8571 StringNCat (buf, fuzzbuf, buflen - id_len - 1);
8572 buf[buflen - 1] = 0;
8573 }
8574
8575 best_id->next = tmp_next;
8576
8577 return best_id;
8578 }
8579
8580 /*****************************************************************************
8581 *
8582 * SeqPointWrite(spp, buf, lastid, buflen)
8583 *
8584 *****************************************************************************/
SeqPointWrite(SeqPntPtr spp,CharPtr buf,SeqIdPtr lastid,Int2 buflen)8585 NLM_EXTERN SeqIdPtr SeqPointWrite(SeqPntPtr spp, CharPtr buf, SeqIdPtr lastid, Int2 buflen)
8586 {
8587 return SeqPointWriteEx (spp, buf, lastid, buflen, FALSE);
8588 }
8589
8590 /*****************************************************************************
8591 *
8592 * IntFuzzPrint(ifp, pos, buf, right)
8593 *
8594 *****************************************************************************/
IntFuzzPrint(IntFuzzPtr ifp,Int4 pos,CharPtr buf,Boolean right)8595 NLM_EXTERN void IntFuzzPrint(IntFuzzPtr ifp, Int4 pos, CharPtr buf, Boolean right)
8596 {
8597 Char lim=0;
8598 CharPtr tmp;
8599 Char tbuf[40];
8600
8601 if (buf == NULL) return;
8602 pos++; /* number from 1 */
8603 tmp = buf;
8604 *tmp = '\0';
8605 *tbuf = '\0';
8606 if (ifp != NULL)
8607 {
8608 switch (ifp->choice)
8609 {
8610 case 1: /* plus minus */
8611 sprintf(tbuf, "<+-%ld>", (long)ifp->a);
8612 break;
8613 case 2: /* range */
8614 sprintf(tbuf, "<%ld.%ld>", (long)ifp->b, (long)ifp->a);
8615 break;
8616 case 3: /* percent */
8617 sprintf(tbuf, "<%ld%%>", (long)ifp->a);
8618 break;
8619 case 4: /* limit */
8620 switch (ifp->a)
8621 {
8622 case 0: /* unknown */
8623 case 255: /* other */
8624 sprintf(tbuf, "<?>");
8625 break;
8626 case 1: /* gt */
8627 lim = '>';
8628 break;
8629 case 2: /* lt */
8630 lim = '<';
8631 break;
8632 case 3:
8633 lim = 'r';
8634 break;
8635 case 4:
8636 lim = '^';
8637 break;
8638 }
8639 break;
8640 }
8641 }
8642
8643 if ((lim) && (lim != 'r'))
8644 {
8645 *tmp = lim;
8646 tmp++; *tmp = '\0';
8647 lim = 0;
8648 }
8649
8650 if (right)
8651 {
8652 sprintf(tmp, "%ld", (long)pos);
8653 while (*tmp != '\0') tmp++;
8654 }
8655 if (lim == 'r')
8656 {
8657 *tmp = '^';
8658 tmp++;
8659 *tmp = '\0';
8660 }
8661 if (tbuf[0] != '\0')
8662 {
8663 tmp = StringMove(tmp, tbuf);
8664 }
8665 if (! right)
8666 sprintf(tmp, "%ld", (long)pos);
8667
8668 return;
8669
8670 }
8671 /*****************************************************************************
8672 *
8673 * TaxNameFromCommon(common)
8674 *
8675 *****************************************************************************/
/* One row of the lookup table used by TaxNameFromCommon(): a common
 * organism name and the taxonomy name stored for it. */
typedef struct sturct_Nlm_taxcommon {
	char * common;    /* common name, compared case-insensitively by TaxNameFromCommon() */
	char * taxname;   /* name copied out and returned when the common name matches */
} Nlm_TaxCommon, PNTR Nlm_TaxCommonPtr;
8680
TaxNameFromCommon(CharPtr common)8681 NLM_EXTERN CharPtr TaxNameFromCommon (CharPtr common)
8682 {
8683 CharPtr taxname = NULL;
8684 CharPtr query = (CharPtr)MemNew(StringLen(common) + 2);
8685 int tax_try, dex;
8686
8687 static Nlm_TaxCommon taxcommon[40] = {
8688 "Chinese hamsters", "Cricetulus sp."
8689 ,"Syrian golden hamsters", "Mesocricetus auratus"
8690 ,"Syrian hamsters", "Mesocricetus sp."
8691 ,"barley", "Hordeum sp."
8692 ,"carrots", "Daucus sp."
8693 ,"cats", "Felis sp."
8694 ,"cattles", "Bos sp."
8695 ,"chickens", "Gallus sp."
8696 ,"chimpanzees", "Pan sp."
8697 ,"chimpanzes", "Pan sp."
8698 ,"corn", "Zea sp."
8699 ,"cucumber", "Cucumis sativus"
8700 ,"dogs", "Canis sp."
8701 ,"goats", "Capra sp."
8702 ,"gorillas", "Gorilla sp."
8703 ,"guinea pigs", "Cavia sp."
8704 ,"hamsters", "Cricetidae gen. sp."
8705 ,"horses", "Equus sp."
8706 ,"humans", "Homo sapiens"
8707 ,"maize", "Zea sp."
8708 ,"mice", "Mus sp."
8709 ,"mouse", "Mus sp."
8710 ,"peas", "Pisum sp."
8711 ,"potatoes", "Solanum sp."
8712 ,"potato", "Solanum sp."
8713 ,"quails", "Phasianidae gen. sp."
8714 ,"rabbits", "Oryctolagus sp."
8715 ,"rats", "Rattus sp."
8716 ,"rices", "Oryza sp."
8717 ,"sheeps", "Ovis sp."
8718 ,"sorghums", "Sorghum sp."
8719 ,"soybeans", "Glycine sp."
8720 ,"spinach", "Spinacia sp."
8721 ,"swine", "Sus sp."
8722 ,"tobacco", "Nicotiania sp."
8723 ,"tomatoes", "Lycopersicon sp."
8724 ,"tomato", "Lycopersicon sp."
8725 ,"turkeys", "Meleagris sp."
8726 ,"wheat", "Triticum sp."
8727 ,"zebrafish", "Brachydanio sp."
8728 };
8729
8730 if (common == NULL) return NULL;
8731
8732 StringCpy(query,common); /* space for 's' is at end */
8733 for (tax_try = 0; tax_try < 2; tax_try ++){
8734 for (dex = 0; dex < 40; dex ++ ){
8735 if (StringICmp(query,taxcommon[dex].common) == 0)
8736 break;
8737 }
8738 if ( dex < 40)
8739 break;
8740 if (tax_try == 0)
8741 StringCat(query,"s");
8742 }
8743 MemFree (query);
8744 if (dex < 40)
8745 taxname = StringSave (taxcommon[dex].taxname);
8746
8747 return taxname;
8748 }
8749
8750 /*****************************************************************************
8751 *
8752 * QualLocCreate(from, to)
8753 * creates a UserObject of _class NCBI, type 1
8754 * adds a field of type "qual_loc"
8755 * puts the from and to numbers in
8756 * no range check, no strand, no seqid
8757 * this just carries locations for the qualifiers anticodon and rpt_unit
8758 * Intended to go on SeqFeat.ext
8759 *
8760 *****************************************************************************/
QualLocCreate(Int4 from,Int4 to)8761 NLM_EXTERN UserObjectPtr QualLocCreate (Int4 from, Int4 to)
8762 {
8763 UserObjectPtr usop;
8764 UserFieldPtr ufp;
8765 ObjectIdPtr oip;
8766 Int4Ptr ints;
8767
8768 usop = UserObjectNew();
8769 usop->_class = StringSave("NCBI");
8770 oip = ObjectIdNew();
8771 oip->id = 1;
8772 usop->type = oip;
8773
8774 ufp = UserFieldNew();
8775 usop->data = ufp;
8776 oip = ObjectIdNew();
8777 oip->str = StringSave("qual_loc");
8778 ufp->label = oip;
8779 ufp->num = 2;
8780 ufp->choice = 8; /* ints */
8781
8782 ints = (Int4Ptr) MemNew((size_t)(sizeof(Int4) * 2));
8783 ints[0] = from;
8784 ints[1] = to;
8785 ufp->data.ptrvalue = (Pointer)ints;
8786
8787 return usop;
8788
8789 }
8790
8791 /*****************************************************************************
8792 *
8793 * QualLocWrite(uop, buf)
8794 * Checks a SeqFeat.ext to see if it is
8795 * 1) not null
8796 * 2) has a UserObject of _class NCBI, type 1
8797 * 3) has a field of label "qual_loc"
8798 * 4) if so, prints the two integers as a qualifier location
8799 * from..to and returns a pointer to the \0 after "to"
8800 * If any of the above fail, returns NULL
8801 *
8802 *****************************************************************************/
QualLocWrite(UserObjectPtr uop,CharPtr buf)8803 NLM_EXTERN CharPtr QualLocWrite(UserObjectPtr uop, CharPtr buf)
8804 {
8805 CharPtr tmp=NULL;
8806 UserFieldPtr ufp;
8807 Int4Ptr ints;
8808
8809 if ((uop == NULL) || (buf == NULL))
8810 return tmp;
8811
8812 if (StringCmp(uop->_class, "NCBI"))
8813 return tmp;
8814
8815 if (uop->type->id != 1)
8816 return tmp;
8817
8818 for (ufp = uop->data; ufp != NULL; ufp = ufp->next)
8819 {
8820 if (! StringCmp(ufp->label->str, "qual_loc"))
8821 {
8822 if (ufp->choice != 8) /* not ints */
8823 return NULL;
8824 if (ufp->num < 2) /* not enough */
8825 return NULL;
8826 ints = (Int4Ptr)(ufp->data.ptrvalue);
8827 if (ints == NULL)
8828 return tmp;
8829 tmp = buf;
8830 sprintf(tmp, "%ld..%ld", (long)(ints[0]+1),
8831 (long)(ints[1]+1));
8832 while (*tmp != '\0')
8833 tmp++;
8834 return tmp;
8835 }
8836 }
8837
8838 return tmp;
8839 }
8840
8841 /*****************************************************************************
8842 *
8843 * EntrezASN1Detected detects records retrieved from Entrez, which should
8844 * not be edited by Sequin and replaced into ID.
8845 *
8846 *****************************************************************************/
8847
/* SeqEntryExplore callback: sets the Boolean that mydata points to when the
   entry (Bioseq or BioseqSet) carries a user descriptor whose _class is
   "gbfix" or "pdbfix".  The flag is only ever raised, never cleared.
   index and indent are unused. */
static void EntrezAsnCallback (SeqEntryPtr sep, Pointer mydata, Int4 index, Int2 indent)

{
  SeqDescrPtr    cur;
  BoolPtr        flag;
  SeqDescrPtr    head;
  UserObjectPtr  uop;

  if (sep == NULL) return;
  if (sep->data.ptrvalue == NULL) return;
  if (mydata == NULL) return;

  flag = (BoolPtr) mydata;

  /* descriptor chain lives in a different struct for sequences and sets */
  if (IS_Bioseq (sep)) {
    head = ((BioseqPtr) sep->data.ptrvalue)->descr;
  } else {
    head = ((BioseqSetPtr) sep->data.ptrvalue)->descr;
  }
  if (head == NULL) return;

  for (cur = ValNodeFindNext (head, NULL, Seq_descr_user);
       cur != NULL;
       cur = ValNodeFindNext (head, cur, Seq_descr_user)) {
    uop = (UserObjectPtr) cur->data.ptrvalue;
    if (uop == NULL) continue;
    if (StringCmp (uop->_class, "gbfix") == 0 || StringCmp (uop->_class, "pdbfix") == 0) {
      *flag = TRUE;
    }
  }
}
8872
EntrezASN1Detected(SeqEntryPtr sep)8873 NLM_EXTERN Boolean EntrezASN1Detected (SeqEntryPtr sep)
8874
8875 {
8876 Boolean rsult;
8877
8878 rsult = FALSE;
8879 SeqEntryExplore (sep, (Pointer) &rsult, EntrezAsnCallback);
8880 return rsult;
8881 }
8882
8883 /*****************************************************************************
8884 *
8885 * SeqLocIntNew(Int4 from, Int4 to, Uint1 strand, SeqIdPtr sip)
8886 * makes copy of incoming SeqId
8887 *
8888 *****************************************************************************/
SeqLocIntNew(Int4 from,Int4 to,Uint1 strand,SeqIdPtr sip)8889 NLM_EXTERN SeqLocPtr LIBCALL SeqLocIntNew (Int4 from, Int4 to, Uint1 strand, SeqIdPtr sip)
8890 {
8891 SeqIntPtr sintp;
8892 SeqLocPtr slp;
8893
8894 if (sip == NULL) return NULL;
8895 sintp = SeqIntNew();
8896 sintp->id = SeqIdDup(sip);
8897 sintp->from = from;
8898 sintp->to = to;
8899 sintp->strand = strand;
8900
8901 slp = ValNodeNew(NULL);
8902 slp->choice = SEQLOC_INT;
8903 slp->data.ptrvalue = (Pointer)sintp;
8904
8905 return slp;
8906 }
8907
8908 /*****************************************************************************
8909 *
8910 * SeqLocPntNew(Int4 pos, Uint1 strand, SeqIdPtr sip, Boolean is_fuzz)
8911 * makes copy of incoming SeqId
8912 *
8913 *****************************************************************************/
SeqLocPntNew(Int4 pos,Uint1 strand,SeqIdPtr sip,Boolean is_fuzz)8914 NLM_EXTERN SeqLocPtr LIBCALL SeqLocPntNew(Int4 pos, Uint1 strand, SeqIdPtr sip, Boolean is_fuzz)
8915 {
8916 SeqLocPtr slp;
8917 SeqPntPtr spp;
8918 IntFuzzPtr ifp;
8919
8920 slp = ValNodeNew(NULL);
8921 slp->choice = SEQLOC_PNT;
8922 spp = SeqPntNew();
8923 spp->point = pos;
8924 spp->strand = strand;
8925 spp->id = SeqIdDup(sip);
8926 if(is_fuzz)
8927 {
8928 ifp = IntFuzzNew();
8929 ifp->choice = 4;
8930 ifp->a = 0; /*unknown value*/
8931 spp->fuzz = ifp;
8932 }
8933 slp->data.ptrvalue = spp;
8934
8935 return slp;
8936
8937 }
8938
FreeSeqLocSetComponents(SeqLocPtr list)8939 NLM_EXTERN void FreeSeqLocSetComponents (SeqLocPtr list)
8940
8941 {
8942 BioseqPtr bsp;
8943 SeqIdPtr sip;
8944 SeqLocPtr slp;
8945 Uint2 entityID;
8946
8947 for (slp = list; slp != NULL; slp = slp->next) {
8948 sip = SeqLocId (slp);
8949 if (sip == NULL) continue;
8950 bsp = BioseqFind (sip);
8951 if (bsp == NULL) continue;
8952 entityID = ObjMgrGetEntityIDForPointer (bsp);
8953 if (entityID < 1) continue;
8954 ObjMgrFreeByEntityID (entityID);
8955 }
8956 }
8957
8958
8959 /* a "gather routine" which collects information about the coding region
8960 features */
gatherCodingRegions(GatherContextPtr gcp)8961 static Boolean gatherCodingRegions(GatherContextPtr gcp)
8962 {
8963 SpliceInfoPtr sip;
8964 SeqFeatPtr sfp;
8965 SeqLocPtr slp, current, next, protSlp, chain;
8966 Uint1 strand;
8967
8968 if (gcp == NULL || gcp->thisitem == NULL) return TRUE;
8969
8970 /* although we had to include higher-level types in GatherScope's
8971 "ignore" parameter so that Gather could "see" features, we're
8972 really only interested in features */
8973 if (gcp->thistype != OBJ_SEQFEAT) return TRUE;
8974
8975 sip = (SpliceInfoPtr) gcp->userdata;
8976
8977 if (sip == NULL || sip->slp == NULL) return TRUE;
8978
8979 sfp = (SeqFeatPtr) gcp->thisitem;
8980
8981 /* we're only interested in coding regions */
8982 if (sfp->data.choice != SEQFEAT_CDREGION || sfp->product == NULL) return TRUE;
8983
8984 /* traverse all (but the last) intervals of the coding region feature */
8985 for (current = SeqLocFindNext(sfp->location, NULL);
8986 (next = SeqLocFindNext(sfp->location, current)) != NULL;
8987 current = next) {
8988 /* consider the last DNA base on the interval */
8989 strand = SeqLocStrand(current);
8990 slp = SeqLocPntNew(strand == Seq_strand_minus ?
8991 SeqLocStart(current) : SeqLocStop(current), strand,
8992 SeqLocId(current), FALSE);
8993 if (sip->findOnProtein)
8994 {
8995 /* find the corresponding location on the protein */
8996 protSlp = dnaLoc_to_aaLoc(sfp, slp, TRUE, NULL, FALSE);
8997
8998 protSlp->next = NULL;
8999 SeqLocFree(slp);
9000
9001 if (sip->slp->data.ptrvalue == NULL)
9002 {
9003 sip->slp->data.ptrvalue = protSlp;
9004 } else {
9005 for (chain = (SeqLocPtr) sip->slp->data.ptrvalue;
9006 chain->next != NULL; chain = chain->next) {
9007 }
9008 chain->next = protSlp;
9009 }
9010 } else {
9011 if (sip->slp->data.ptrvalue == NULL)
9012 {
9013 sip->slp->data.ptrvalue = slp;
9014 } else {
9015 for (chain = (SeqLocPtr) sip->slp->data.ptrvalue;
9016 chain->next != NULL; chain = chain->next) {
9017 }
9018 chain->next = slp;
9019 }
9020 }
9021 }
9022
9023 return TRUE;
9024 }
9025
9026 /*****************************************************************************
9027 *
9028 * SeqLocPtr FindSpliceSites(SeqEntryPtr sep, Boolean findOnProtein)
9029 * Finds the splice sites on this SeqEntry and returns them as a
9030 * SeqLoc.
9031 *
9032 *****************************************************************************/
FindSpliceSites(SeqEntryPtr sep,Boolean findOnProtein)9033 NLM_EXTERN SeqLocPtr LIBCALL FindSpliceSites(SeqEntryPtr sep, Boolean findOnProtein)
9034 {
9035 SpliceInfo si;
9036 GatherScope gs;
9037 Int2 i;
9038 SeqLocPtr slp;
9039
9040 MemSet ((Pointer) (&gs), 0, sizeof (GatherScope));
9041 gs.get_feats_location = FALSE;
9042 for (i = 0; i < OBJ_MAX; i++)
9043 gs.ignore[i] = TRUE;
9044 gs.ignore[OBJ_SEQFEAT] = FALSE;
9045 gs.ignore[OBJ_SEQANNOT] = FALSE;
9046 slp = ValNodeNew(NULL);
9047 slp->choice = SEQLOC_EQUIV;
9048 slp->data.ptrvalue = NULL; /* to be filled in within Gather */
9049 si.slp = slp;
9050 si.findOnProtein = findOnProtein;
9051 GatherSeqEntry(sep, &si, gatherCodingRegions, &gs);
9052 if (slp->data.ptrvalue == NULL)
9053 {
9054 SeqLocFree(slp);
9055 return NULL;
9056 }
9057
9058 return slp;
9059 }
9060
9061
9062 /* a "gather routine" which collects information about the coding region
9063 feature (based upon the assumption that there is only one) */
gatherTheCodingRegion(GatherContextPtr gcp)9064 static Boolean gatherTheCodingRegion(GatherContextPtr gcp)
9065 {
9066 SeqFeatPtr PNTR sffp;
9067 SeqFeatPtr sfp;
9068
9069 if (gcp == NULL || gcp->thisitem == NULL) return TRUE;
9070
9071 /* although we had to include higher-level types in GatherScope's
9072 "ignore" parameter so that Gather could "see" features, we're
9073 really only interested in features */
9074 if (gcp->thistype != OBJ_SEQFEAT) return TRUE;
9075
9076 sffp = (SeqFeatPtr PNTR) gcp->userdata;
9077
9078 if (sffp == NULL) return FALSE;
9079
9080 sfp = (SeqFeatPtr) gcp->thisitem;
9081
9082 /* we're only interested in coding regions */
9083 if (sfp->data.choice != SEQFEAT_CDREGION || sfp->product == NULL) return TRUE;
9084
9085 *sffp = sfp;
9086
9087 return FALSE;
9088 }
9089
9090 /*****************************************************************************
9091 *
9092 * SeqFeatPtr FindCodingRegion(SeqEntryPtr sep)
9093 * Finds the coding region feature on this protein SeqEntry and
9094 * returns a copy of it.
9095 *
9096 *****************************************************************************/
FindCodingRegion(SeqEntryPtr sep)9097 NLM_EXTERN SeqFeatPtr LIBCALL FindCodingRegion(SeqEntryPtr sep)
9098 {
9099 SeqFeatPtr sfp = NULL;
9100 GatherScope gs;
9101 Int2 i;
9102
9103 MemSet ((Pointer) (&gs), 0, sizeof (GatherScope));
9104 gs.get_feats_location = FALSE;
9105 for (i = 0; i < OBJ_MAX; i++)
9106 gs.ignore[i] = TRUE;
9107 gs.ignore[OBJ_SEQFEAT] = FALSE;
9108 gs.ignore[OBJ_SEQANNOT] = FALSE;
9109 GatherSeqEntry(sep, &sfp, gatherTheCodingRegion, &gs);
9110 /* make a copy of it */
9111 if (sfp != NULL)
9112 sfp = (SeqFeatPtr)AsnIoMemCopy((Pointer)sfp, (AsnReadFunc)SeqFeatAsnRead, (AsnWriteFunc)SeqFeatAsnWrite);
9113 return sfp;
9114 }
9115
gatherMolTypeCheck(GatherContextPtr gcp)9116 static Boolean gatherMolTypeCheck(GatherContextPtr gcp)
9117 {
9118 SeqIdCheckerPtr sicp;
9119 BioseqPtr bsp;
9120
9121 if (gcp == NULL || gcp->thisitem == NULL) return TRUE;
9122
9123 sicp = (SeqIdCheckerPtr) gcp->userdata;
9124 bsp = (BioseqPtr) gcp->thisitem;
9125
9126 if (bsp == NULL) return TRUE;
9127 if (sicp == NULL) return FALSE;
9128
9129 /* check for mol-type mismatch */
9130 if (ISA_na(bsp->mol) == sicp->isProtein) return TRUE;
9131
9132 if (sicp->sip != NULL)
9133 {
9134 if (SeqIdComp(bsp->id, sicp->sip) != SIC_YES)
9135 return TRUE;
9136 }
9137
9138 sicp->retval = TRUE;
9139
9140 /* no need to examine other Bioseqs */
9141 return FALSE;
9142 }
9143
9144 /*****************************************************************************
9145 *
9146 * Boolean SeqEntryContainsSeqIdOfMolType(SeqEntryPtr sep, SeqIdPtr sip, Boolean isProtein)
9147 * Tests to see if this SeqEntry contains a bioseq of the specified moltype
9148 * (protein or DNA)
9149 * if sip != NULL then it also insists upon finding a bioseq of the
9150 * specified moltype where the SeqIds match
9151 *
9152 *****************************************************************************/
SeqEntryContainsSeqIdOfMolType(SeqEntryPtr sep,SeqIdPtr sip,Boolean isProtein)9153 NLM_EXTERN Boolean LIBCALL SeqEntryContainsSeqIdOfMolType(SeqEntryPtr sep, SeqIdPtr sip, Boolean isProtein)
9154 {
9155 SeqIdChecker sic;
9156 GatherScope gs;
9157 Int2 i;
9158
9159 MemSet ((Pointer) (&gs), 0, sizeof (GatherScope));
9160 gs.get_feats_location = FALSE;
9161 for (i = 0; i < OBJ_MAX; i++)
9162 gs.ignore[i] = TRUE;
9163 gs.ignore[OBJ_BIOSEQ] = FALSE;
9164 sic.sip = sip;
9165 sic.isProtein = isProtein;
9166 sic.retval = FALSE;
9167 GatherSeqEntry(sep, &sic, gatherMolTypeCheck, &gs);
9168
9169 return sic.retval;
9170 }
9171
GIMolTypeCheck(GatherContextPtr gcp)9172 static Boolean GIMolTypeCheck(GatherContextPtr gcp)
9173 {
9174 SeqIdMolTypePtr sicp;
9175 BioseqPtr bsp;
9176
9177 if (gcp == NULL || gcp->thisitem == NULL) return TRUE;
9178
9179 sicp = (SeqIdMolTypePtr) gcp->userdata;
9180 bsp = (BioseqPtr) gcp->thisitem;
9181
9182 if (bsp == NULL) return TRUE;
9183 if (sicp == NULL) return FALSE;
9184
9185 if (sicp->sip != NULL) {
9186 if (SeqIdIn(sicp->sip, bsp->id) == FALSE)
9187 return TRUE;
9188 }
9189 if (ISA_na(bsp->mol)) {
9190 sicp->mtype = 2;
9191 } else {
9192 sicp->mtype = 1;
9193 }
9194
9195 /* no need to examine other Bioseqs */
9196 return FALSE;
9197 }
9198
9199 /*****************************************************************************
9200 *
9201 * Tests to see if this SeqEntry contains a bioseq of the specified uid
9202 * returns moltype of the bioseq where the SeqIds match
9203 * 0 id not found in this SeqEntry
9204 * 1 Amino Acid sequence
9205 * 2 Nucleotide sequence
9206 *
9207 *****************************************************************************/
MolTypeForGI(SeqEntryPtr sep,Int4 uid)9208 NLM_EXTERN Int2 LIBCALL MolTypeForGI(SeqEntryPtr sep, Int4 uid)
9209 {
9210 SeqIdMolType sic;
9211 SeqIdPtr sip;
9212 GatherScope gs;
9213
9214 MemSet ((Pointer) (&gs), 0, sizeof (GatherScope));
9215 MemSet ((Pointer) (gs.ignore), (int)(TRUE),
9216 (size_t) (OBJ_MAX * sizeof(Boolean)));
9217 gs.get_feats_location = FALSE;
9218 gs.ignore[OBJ_BIOSEQ] = FALSE;
9219 sip = ValNodeNew(NULL);
9220 sip->choice = SEQID_GI;
9221 sip->data.intvalue = uid;
9222 sic.sip = sip;
9223 sic.mtype = 0;
9224 GatherSeqEntry(sep, &sic, GIMolTypeCheck, &gs);
9225 ValNodeFree(sip);
9226
9227 return sic.mtype;
9228 }
9229
9230 /******************************************************************
9231 ***
9232 * local_id_make(): make SeqIdPtr with SEQID_LOCAL choice
9233 *
9234 *******************************************************************
9235 ***/
local_id_make(CharPtr name)9236 NLM_EXTERN SeqIdPtr local_id_make(CharPtr name)
9237 {
9238 SeqIdPtr new_id;
9239 ObjectIdPtr obj_id;
9240
9241
9242 new_id=(SeqIdPtr)ValNodeNew(NULL);
9243 new_id->choice = SEQID_LOCAL;
9244
9245 obj_id = ObjectIdNew();
9246 obj_id->str = StringSave(name);
9247 new_id->data.ptrvalue = obj_id;
9248
9249 return new_id;
9250 }
9251
9252
9253
9254 /*************************************************************
9255 *
9256 * MuskSeqIdWrite (sip, buf, buflen, format, do_find, do_entrez_find)
9257 * print Seq-id to the buffer with the chromoscope format
9258 * sip, buf, buflen, format is similar to what is defined in
9259 * SeqIdWrite.
9260 * do_find, if TRUE, find the most printable id
9261 * do_entrez_find. if TRUE, if the id is a gi, find the printable id
9262 *
9263 ***************************************************************/
9264
9265 /* a kludge function to make the special formatting for
9266 TIGRs THC sequences
9267 */
format_tigr_thc(SeqIdPtr sip,CharPtr buf,Int2 buflen)9268 static Boolean format_tigr_thc (SeqIdPtr sip, CharPtr buf, Int2 buflen)
9269 {
9270 DbtagPtr db_tag;
9271 Char temp[101];
9272 ObjectIdPtr oip;
9273
9274 while(sip)
9275 {
9276 if(sip->choice == SEQID_GENERAL)
9277 {
9278 db_tag = (DbtagPtr) sip->data.ptrvalue;
9279 if(db_tag->db && StringCmp(db_tag->db, "THC") == 0)
9280 {
9281 oip = db_tag->tag;
9282 if(oip->id != 0)
9283 {
9284 sprintf(temp, "THC%ld", (long) oip->id);
9285 StringNCpy(buf, temp, buflen);
9286 return TRUE;
9287 }
9288 }
9289 }
9290
9291 sip = sip->next;
9292 }
9293
9294 return FALSE;
9295 }
9296
9297
9298
format_human_map_id(SeqIdPtr sip,CharPtr buf,Int2 buflen)9299 static Boolean format_human_map_id (SeqIdPtr sip, CharPtr buf, Int2 buflen)
9300 {
9301 DbtagPtr db_tag;
9302 Char temp[101];
9303 ObjectIdPtr oip;
9304
9305 for (; sip; sip = sip->next) {
9306 if(sip->choice == SEQID_GENERAL) {
9307 break;
9308 }
9309 }
9310 if (sip == NULL) {
9311 return FALSE;
9312 }
9313 db_tag = (DbtagPtr) sip->data.ptrvalue;
9314 if (db_tag->db == NULL) {
9315 return FALSE;
9316 }
9317 if (StringCmp(db_tag->db, "MIT") == 0 || StringCmp(db_tag->db, "GENETHON") == 0 || StringCmp(db_tag->db, "CHLC") == 0 || StringCmp(db_tag->db, "GDB") == 0 || StringCmp(db_tag->db, "Stanford") == 0 || StringCmp(db_tag->db, "NCBI") == 0) {
9318 oip = db_tag->tag;
9319 if (oip->str != 0) {
9320 sprintf(temp, "%s", oip->str);
9321 StringNCpy(buf, temp, buflen);
9322 return TRUE;
9323 }
9324 }
9325
9326 return FALSE;
9327 }
9328
MuskSeqIdWrite(SeqIdPtr sip,CharPtr buf,Int2 buflen,Uint1 format,Boolean do_find,Boolean do_entrez_find)9329 NLM_EXTERN Boolean MuskSeqIdWrite (SeqIdPtr sip, CharPtr buf, Int2 buflen, Uint1 format, Boolean do_find, Boolean do_entrez_find)
9330 {
9331 SeqIdPtr entrez_id;
9332 Boolean retval;
9333 BioseqPtr bsp;
9334 Boolean bsp_found = FALSE;
9335
9336 if(sip == NULL || buf == NULL)
9337 return FALSE;
9338
9339 if(do_find)
9340 {
9341 if(sip->next == NULL)
9342 {
9343 bsp = BioseqFindCore(sip);
9344 if(bsp !=NULL)
9345 {
9346 bsp_found = TRUE;
9347 sip = bsp->id;
9348 }
9349 }
9350 sip = SeqIdFindWorst(sip);
9351 }
9352 if(sip->choice == SEQID_GI && do_entrez_find && !bsp_found)
9353 {
9354 entrez_id = GetSeqIdForGI(sip->data.intvalue);
9355 if(entrez_id !=NULL)
9356 {
9357 retval = MuskSeqIdWrite(entrez_id, buf, buflen, format, TRUE, FALSE);
9358 SeqIdSetFree(entrez_id);
9359 return retval;
9360 }
9361 }
9362
9363 if(format_tigr_thc (sip, buf, buflen))
9364 { /* give special format to the THC sequence*/
9365 return TRUE;
9366 }
9367 if(format_human_map_id (sip, buf, buflen))
9368 { /* special format for human map ids */
9369 return TRUE;
9370 }
9371 SeqIdWrite(sip, buf,format, buflen); /*in case this function can not work*/
9372
9373 if(buf[0] == '\0')
9374 LabelCopy(buf, "Unidentified", buflen);
9375
9376 return TRUE;
9377 }
9378
9379
9380 /***********************************************************************
9381 ***
9382 * seqid_name(): return the most informative name from a SeqIdPtr
9383 *
9384 ************************************************************************
9385 ***/
seqid_name(SeqIdPtr hsip,CharPtr name,Boolean use_locus,Boolean check_chain)9386 NLM_EXTERN Boolean seqid_name(SeqIdPtr hsip, CharPtr name, Boolean use_locus, Boolean check_chain)
9387 {
9388 Uint1 format;
9389
9390 if(use_locus)
9391 format = PRINTID_TEXTID_LOCUS;
9392 else
9393 format = PRINTID_TEXTID_ACCESSION;
9394 return MuskSeqIdWrite (hsip, name, 20, format, check_chain, check_chain);
9395 }
9396
9397 /***********************************************************************
9398 ***
9399 * update_seq_loc(): update the start, stop, strand info in SeqLoc
9400 *
9401 ************************************************************************
9402 ***/
update_seq_loc(Int4 start,Int4 stop,Uint1 strand,SeqLocPtr loc)9403 NLM_EXTERN SeqLocPtr update_seq_loc(Int4 start, Int4 stop, Uint1 strand, SeqLocPtr loc)
9404 {
9405 SeqIntPtr sint;
9406 SeqPntPtr spp;
9407
9408 if(loc->choice == SEQLOC_INT)
9409 {
9410 sint = (SeqIntPtr) loc->data.ptrvalue;
9411 if(start != -1)
9412 sint->from = start;
9413 if(stop != -1)
9414 sint->to = stop;
9415 if(strand != 0)
9416 sint->strand = strand;
9417 loc->data.ptrvalue = sint;
9418 }
9419 else if(loc->choice == SEQLOC_PNT)
9420 {
9421 spp = (SeqPntPtr)(loc->data.ptrvalue);
9422 spp->point = start;
9423 spp->strand = strand;
9424 loc->data.ptrvalue = spp;
9425 }
9426
9427 return loc;
9428
9429 }
9430
9431
9432 /*
9433 Gets the SeqIdPtr for the subject or query sequence from the first SeqAlign.
9434 The SeqIdPtr is not saved and should not be deleted.
9435 */
9436 static SeqIdPtr LIBCALL
TxGetIdFromSeqAlign(SeqAlignPtr seqalign,Boolean subject)9437 TxGetIdFromSeqAlign(SeqAlignPtr seqalign, Boolean subject)
9438
9439 {
9440 DenseDiagPtr ddp;
9441 DenseSegPtr dsp;
9442 StdSegPtr ssp;
9443 SeqIdPtr sip;
9444
9445 if (seqalign == NULL)
9446 return NULL;
9447
9448 sip = NULL;
9449 switch (seqalign->segtype) {
9450 case 1: /*Dense-diag*/
9451 ddp = seqalign->segs;
9452 if (subject == TRUE)
9453 sip = ddp->id->next;
9454 else
9455 sip = ddp->id;
9456 break;
9457 case 2: /*Dense-seq */
9458 dsp = seqalign->segs;
9459 if (subject == TRUE)
9460 sip = dsp->ids->next;
9461 else
9462 sip = dsp->ids;
9463 break;
9464 case 3: /* Std-seg */
9465 ssp = seqalign->segs;
9466 if(ssp && ssp->loc && ssp->loc->next) {
9467 if (subject == TRUE)
9468 sip = SeqLocId(ssp->loc->next);
9469 else
9470 sip = SeqLocId(ssp->loc);
9471 }
9472 break;
9473 case 5: /* Discontinuous alignment */
9474
9475 sip = TxGetIdFromSeqAlign(seqalign->segs, subject);
9476 break;
9477 default:
9478 break;
9479 }
9480
9481 return sip;
9482 }
9483
9484 /*
9485 Obtains the query (i.e., the first) SeqIdPtr from
9486 the first SeqAlignPtr.
9487 */
9488 NLM_EXTERN SeqIdPtr LIBCALL
TxGetQueryIdFromSeqAlign(SeqAlignPtr seqalign)9489 TxGetQueryIdFromSeqAlign(SeqAlignPtr seqalign)
9490
9491 {
9492 return TxGetIdFromSeqAlign(seqalign, FALSE);
9493 }
9494
9495 /*
9496 Obtains the subject (i.e., the second) SeqIdPtr from
9497 the first SeqAlignPtr.
9498 */
9499 NLM_EXTERN SeqIdPtr LIBCALL
TxGetSubjectIdFromSeqAlign(SeqAlignPtr seqalign)9500 TxGetSubjectIdFromSeqAlign(SeqAlignPtr seqalign)
9501
9502 {
9503 return TxGetIdFromSeqAlign(seqalign, TRUE);
9504 }
9505
9506 static Boolean
GetBestScoreAndEvalueFromScorePtr(ScorePtr sp,Int4 * score,Nlm_FloatHi * bit_score,Nlm_FloatHi * evalue,Int4 * number)9507 GetBestScoreAndEvalueFromScorePtr(ScorePtr sp, Int4 *score, Nlm_FloatHi *bit_score, Nlm_FloatHi *evalue, Int4 *number)
9508
9509 {
9510 Boolean score_set=FALSE, evalue_set=FALSE, sum_set=FALSE, bit_set=FALSE;
9511 ObjectIdPtr obid;
9512 ScorePtr scrp;
9513
9514 for (scrp=sp; scrp; scrp = scrp->next)
9515 {
9516 obid = scrp->id;
9517 if(obid && obid->str)
9518 {
9519 if (StringICmp(obid->str, "score") == 0)
9520 {
9521 if (*score < scrp->value.intvalue)
9522 {
9523 score_set = TRUE;
9524 *score = scrp->value.intvalue;
9525 }
9526 continue;
9527 }
9528 else if (StringICmp(obid->str, "e_value") == 0 || StringICmp(obid->str, "sum_e") == 0)
9529 {
9530 if (*evalue > scrp->value.realvalue)
9531 {
9532 evalue_set = TRUE;
9533 *evalue = scrp->value.realvalue;
9534 }
9535 continue;
9536 }
9537 else if (StringICmp(obid->str, "sum_n") == 0)
9538 {
9539 if (*number < scrp->value.intvalue)
9540 {
9541 sum_set = TRUE;
9542 *number = scrp->value.intvalue;
9543 }
9544 continue;
9545 }
9546 else if (StringICmp(obid->str, "bit_score") == 0)
9547 {
9548 if (*bit_score < scrp->value.realvalue)
9549 {
9550 bit_set = TRUE;
9551 *bit_score = scrp->value.realvalue;
9552 }
9553 continue;
9554 }
9555 }
9556 }
9557
9558 /* Don't check for 'sum_set', as it's not always there. */
9559 if (score_set && evalue_set && bit_set)
9560 return TRUE;
9561 else if(score_set && evalue_set)
9562 {
9563 *bit_score = (FloatHi)(*score);
9564 return TRUE;
9565 }
9566
9567 return FALSE;
9568 }
9569
9570
9571 NLM_EXTERN Boolean LIBCALL
GetScoreAndEvalue(SeqAlignPtr seqalign,Int4 * score,Nlm_FloatHi * bit_score,Nlm_FloatHi * evalue,Int4 * number)9572 GetScoreAndEvalue(SeqAlignPtr seqalign, Int4 *score, Nlm_FloatHi *bit_score, Nlm_FloatHi *evalue, Int4 *number)
9573
9574 {
9575 Boolean local_retval, retval=FALSE;
9576 ScorePtr sp;
9577 DenseDiagPtr ddp;
9578 DenseSegPtr dsp;
9579 StdSegPtr ssp;
9580
9581 *score = 0;
9582 *bit_score = 0.0;
9583 *number = 1;
9584 *evalue = DBL_MAX;
9585
9586 sp = seqalign->score;
9587 if (sp == NULL)
9588 {
9589 switch (seqalign->segtype)
9590 {
9591 case 1: /*Dense-diag*/
9592 {
9593 Nlm_FloatHi best_evalue = *evalue;
9594 ddp = seqalign->segs;
9595 while (ddp)
9596 {
9597 Int4 number_tmp = 1;
9598 local_retval =
9599 GetBestScoreAndEvalueFromScorePtr(ddp->scores, score,
9600 bit_score, evalue, &number_tmp);
9601 /* Use number corresponding to best evalue. */
9602 if (*evalue < best_evalue)
9603 {
9604 best_evalue = *evalue;
9605 *number = number_tmp;
9606 }
9607 if (local_retval == TRUE)
9608 retval = TRUE;
9609 ddp = ddp->next;
9610 }
9611 break;
9612 }
9613 case 2:
9614 dsp = seqalign->segs;
9615 if (dsp)
9616 {
9617 retval = GetBestScoreAndEvalueFromScorePtr(dsp->scores, score, bit_score, evalue, number);
9618 }
9619 break;
9620 case 3:
9621 ssp = seqalign->segs;
9622 while (ssp)
9623 {
9624 local_retval = GetBestScoreAndEvalueFromScorePtr(ssp->scores, score, bit_score, evalue, number);
9625 if (local_retval == TRUE)
9626 retval = TRUE;
9627 ssp = ssp->next;
9628 }
9629 break;
9630 default:
9631 break;
9632 }
9633 }
9634 else
9635 {
9636 retval = GetBestScoreAndEvalueFromScorePtr(sp, score, bit_score, evalue, number);
9637 }
9638
9639 return retval;
9640 }
9641
9642 /***********************************************************************
9643 *
9644 * Adjust the Offset in the SeqAlign to correspond to the beginning
9645 * of the sequence and not where BLAST started.
9646 *
9647 **********************************************************************/
9648
9649 NLM_EXTERN void LIBCALL
AdjustOffSetsInSeqAlign(SeqAlignPtr salp,SeqLocPtr slp1,SeqLocPtr slp2)9650 AdjustOffSetsInSeqAlign(SeqAlignPtr salp, SeqLocPtr slp1, SeqLocPtr slp2)
9651
9652 {
9653 CharPtr err_string1, err_string2;
9654 DenseDiagPtr ddp;
9655 DenseSegPtr dsp;
9656 Int4 offset1=0, offset2=0, index;
9657 SeqIdPtr sip1=NULL, sip2=NULL;
9658 SeqIntPtr seq_int;
9659 SeqLocPtr seqloc, whole_slp;
9660 StdSegPtr ssp;
9661
9662 while (salp)
9663 {
9664 if (salp->segtype == 1)
9665 {
9666 ddp = salp->segs;
9667 while (ddp)
9668 { /* Get the offset on the first call. */
9669 if (sip1 == NULL)
9670 {
9671 sip1 = ddp->id;
9672 whole_slp =
9673 ValNodeAddPointer(NULL, SEQLOC_WHOLE, sip1);
9674 if(SeqLocStrand(slp1) == Seq_strand_minus)
9675 offset1 = GetOffsetInLoc(slp1, whole_slp, SEQLOC_STOP);
9676 else
9677 offset1 = GetOffsetInLoc(slp1, whole_slp, SEQLOC_START);
9678 if (offset1 == -1)
9679 {
9680 err_string1 = SeqLocPrint(slp1);
9681 err_string2 = SeqLocPrint(whole_slp);
9682 ErrPostEx(SEV_ERROR, 0, 0, "AdjustOffSetInSeqAnnot: %s not in %s", err_string1, err_string2);
9683 }
9684 whole_slp = ValNodeFree(whole_slp);
9685 }
9686 if (sip2 == NULL && slp2)
9687 {
9688 sip2 = ddp->id->next;
9689 whole_slp =
9690 ValNodeAddPointer(NULL, SEQLOC_WHOLE, sip2);
9691 if(SeqLocStrand(slp2) == Seq_strand_minus)
9692 offset2 = GetOffsetInLoc(slp2, whole_slp, SEQLOC_STOP);
9693 else
9694 offset2 = GetOffsetInLoc(slp2, whole_slp, SEQLOC_START);
9695 if (offset2 == -1)
9696 {
9697 err_string1 = SeqLocPrint(slp2);
9698 err_string2 = SeqLocPrint(whole_slp);
9699 ErrPostEx(SEV_ERROR, 0, 0, "AdjustOffSetInSeqAnnot: %s not in %s", err_string1, err_string2);
9700 }
9701 whole_slp = ValNodeFree(whole_slp);
9702 }
9703 ddp->starts[0] += offset1;
9704 ddp->starts[1] += offset2;
9705 ddp = ddp->next;
9706 }
9707 }
9708 else if (salp->segtype == 2)
9709 {
9710 dsp = salp->segs;
9711 if (sip1 == NULL)
9712 {
9713 sip1 = dsp->ids;
9714 whole_slp =
9715 ValNodeAddPointer(NULL, SEQLOC_WHOLE, sip1);
9716 if(SeqLocStrand(slp1) == Seq_strand_minus)
9717 offset1 = GetOffsetInLoc(slp1, whole_slp, SEQLOC_STOP);
9718 else
9719 offset1 = GetOffsetInLoc(slp1, whole_slp, SEQLOC_START);
9720 if (offset1 == -1)
9721 {
9722 err_string1 = SeqLocPrint(slp1);
9723 err_string2 = SeqLocPrint(whole_slp);
9724 ErrPostEx(SEV_ERROR, 0, 0, "AdjustOffSetInSeqAnnot: %s not in %s", err_string1, err_string2);
9725 }
9726 whole_slp = ValNodeFree(whole_slp);
9727 }
9728 if (sip2 == NULL && slp2)
9729 {
9730 sip2 = dsp->ids->next;
9731 whole_slp =
9732 ValNodeAddPointer(NULL, SEQLOC_WHOLE, sip2);
9733 offset2 =
9734 GetOffsetInLoc(slp2, whole_slp, SEQLOC_START);
9735 if(SeqLocStrand(slp2) == Seq_strand_minus)
9736 offset2 = GetOffsetInLoc(slp2, whole_slp, SEQLOC_STOP);
9737 else
9738 offset2 = GetOffsetInLoc(slp2, whole_slp, SEQLOC_START);
9739 if (offset2 == -1)
9740 {
9741 err_string1 = SeqLocPrint(slp2);
9742 err_string2 = SeqLocPrint(whole_slp);
9743 ErrPostEx(SEV_ERROR, 0, 0, "AdjustOffSetInSeqAnnot: %s not in %s", err_string1, err_string2);
9744 }
9745 whole_slp = ValNodeFree(whole_slp);
9746 }
9747
9748 for (index=0; index<dsp->numseg; index++)
9749 {
9750 if (dsp->starts[2*index] != -1)
9751 dsp->starts[2*index] += offset1;
9752 if (dsp->starts[2*index+1] != -1)
9753 dsp->starts[2*index+1] += offset2;
9754 }
9755 }
9756 else if (salp->segtype == 3)
9757 {
9758 ssp = salp->segs;
9759 while (ssp)
9760 {
9761 if (sip1 == NULL)
9762 {
9763 sip1 = ssp->ids;
9764 whole_slp =
9765 ValNodeAddPointer(NULL, SEQLOC_WHOLE, sip1);
9766 if(SeqLocStrand(slp1) == Seq_strand_minus)
9767 offset1 = GetOffsetInLoc(slp1, whole_slp,
9768 SEQLOC_STOP);
9769 else
9770 offset1 = GetOffsetInLoc(slp1, whole_slp,
9771 SEQLOC_START);
9772
9773 if (offset1 == -1)
9774 {
9775 err_string1 = SeqLocPrint(slp1);
9776 err_string2 = SeqLocPrint(whole_slp);
9777 ErrPostEx(SEV_ERROR, 0, 0, "AdjustOffSetInSeqAnnot: %s not in %s", err_string1, err_string2);
9778 }
9779 whole_slp = ValNodeFree(whole_slp);
9780 }
9781 if (sip2 == NULL && slp2)
9782 {
9783 sip2 = ssp->ids->next;
9784 whole_slp =
9785 ValNodeAddPointer(NULL, SEQLOC_WHOLE, sip2);
9786 if(SeqLocStrand(slp2) == Seq_strand_minus)
9787 offset2 = GetOffsetInLoc(slp2, whole_slp,
9788 SEQLOC_STOP);
9789 else
9790 offset2 = GetOffsetInLoc(slp2, whole_slp,
9791 SEQLOC_START);
9792
9793 if (offset2 == -1)
9794 {
9795 err_string1 = SeqLocPrint(slp2);
9796 err_string2 = SeqLocPrint(whole_slp);
9797 ErrPostEx(SEV_ERROR, 0, 0, "AdjustOffSetInSeqAnnot: %s not in %s", err_string1, err_string2);
9798 }
9799 whole_slp = ValNodeFree(whole_slp);
9800 }
9801 seqloc = ssp->loc;
9802 if (seqloc->choice == SEQLOC_INT) {
9803 seq_int = seqloc->data.ptrvalue;
9804 seq_int->from += offset1;
9805 seq_int->to += offset1;
9806 }
9807 seqloc = ssp->loc->next;
9808 if (seqloc->choice == SEQLOC_INT) {
9809 seq_int = seqloc->data.ptrvalue;
9810 seq_int->from += offset2;
9811 seq_int->to += offset2;
9812 }
9813 ssp = ssp->next;
9814 }
9815 }
9816 salp = salp->next;
9817 }
9818 }
9819
9820
9821 /*****************************************************************************
9822 *
9823 * Boolean SeqIdOrderInList(a, b)
9824 * Looks for single SeqId, "a" in chain of SeqIds, "b"
9825 * returns the position (>0) if found.. else returns 0;
9826 *
9827 *****************************************************************************/
9828
SeqIdOrderInList(SeqIdPtr a,SeqIdPtr list)9829 NLM_EXTERN Uint4 LIBCALL SeqIdOrderInList (SeqIdPtr a, SeqIdPtr list) {
9830 SeqIdPtr now;
9831 Uint4 order;
9832 Uint1 retval;
9833
9834 if (a == NULL)
9835 return 0;
9836
9837 for (now =list,order=1; now != NULL; now = now -> next,order++)
9838 {
9839 retval = SeqIdComp(a, now);
9840 if(retval==SIC_YES)
9841 return order;
9842 }
9843 return 0;
9844 }
9845
9846 /*****************************************************************************
9847 *
9848 * Boolean SeqIdOrderInBioseqIdList(a, b)
9849 * Looks for single SeqId, "a" in chain of SeqIds, "b"
9850 * and looks at all synonymous SeqIds of the Bioseq "b"
9851 * returns the position (>0) if found.. else returns 0;
9852 *
9853 *****************************************************************************/
9854
SeqIdOrderInBioseqIdList(SeqIdPtr a,SeqIdPtr list)9855 NLM_EXTERN Uint4 LIBCALL SeqIdOrderInBioseqIdList (SeqIdPtr a, SeqIdPtr list) {
9856 SeqIdPtr now;
9857 Uint4 order;
9858
9859 if (a == NULL)
9860 return 0;
9861
9862 for (now =list,order=1; now != NULL; now = now -> next,order++)
9863 {
9864 if(SeqIdForSameBioseq(a, now))
9865 return order;
9866 }
9867 return 0;
9868 }
9869
9870 /* Function to extract the Accession and version number from
9871 a \usedin GBQual string.
9872 (works for a plain Accession Number with version too)
9873 User must provide string buffers for answer.
9874 and make sure that last Character of Accession is not a ')'
9875 using a statement like
9876 if ((ptr = StringChr (accession, ')')) != NULL) *ptr = '\0';
9877 and user must StringTok the GBQual for ',' and repeatedly call this.
9878 */
9879
ExtractAccession(CharPtr accn,CharPtr accession,CharPtr version)9880 NLM_EXTERN void LIBCALL ExtractAccession(CharPtr accn,CharPtr accession,CharPtr version) {
9881 CharPtr verptr;
9882 if(accn!=NULL) {
9883 if (*accn == '(') {
9884 accn++;
9885 }
9886 verptr = StrChr(accn,'.');
9887 if(verptr==NULL) {
9888 if(version!=NULL)
9889 version[0]='\0';
9890 if(accession!=NULL) {
9891 StringCpy(accession,accn);
9892 }
9893 } else {
9894 Int4 len;
9895 if(version!=NULL)
9896 StringCpy(version,verptr+1);
9897 len = verptr-accn;
9898 if(accession!=NULL) {
9899 StringNCpy(accession,accn,len);
9900 accession[len]=NULLB;
9901 }
9902 }
9903 } else {
9904 if(accession)
9905 accession[0]=NULLB;
9906 if(version)
9907 version[0]=NULLB;
9908 }
9909 }
9910
9911
9912 /*
9913 Hugues Sicotte:
9914 Function to make a proper type SeqId given a string that represents
9915 an accession Number.
9916 If version number is unknown, set version=0 for latest.
9917 name is ignored because it is not always consistently used in databases.
9918 User may need to Call ExtractAccession to parse out accession and version.
9919
9920 *** WARNING *** In the non-network mode, this function depends on hardcoded
9921 accession prefix list to guess at the right prefix type.
9922
9923 There is an inherent conflict in name space between pir proteins and
9924 nucleotide genbank accessions (or swissprot. )
   There is a VERY low probability of conflict between pir and swissprot,
   so this code ignores it (no known cases).
9927 so Refseq, swissprot proteins and non-swissprot proteins have an independent name space.
9928
9929 - some PIR names(locus-name looking) have no conflicts
9930 ([A-Z][0-9,A-Z]{3,5}) with nucleotide accession.
9931 - some PIR accessions have conflicts with 1+5 nucleotide accession, but the
9932 2+5 nucleotide accession have no conflict with pir.
9933
9934 The Boolean flag AllowPIR: If TRUE, allows that accessions may be PIR.
9935 The Boolean flag Permissive,
9936 if FALSE,
9937 - completely ignores PIR accessions,
          - doesn't guess at unassigned accession prefixes.
9939 (even if they look like accession)
9940 - the network will NOT be used.
9941 if TRUE
9942 - allows unassigned accessions (as long as they fit the
9943 accession patterns)
9944 - allows for PIR accessions if AllowPIR==TRUE;
9945 - allow for Network Access if UseNetwork==TRUE to resolve conflicts.
9946 - if UseNetwork == FALSE, uses the boolean flag FavorNucleotide
9947 to resolve conflicts.
9948
9949 The Boolean flag FavorNucleotide chooses to believe that the conflicts are best
9950 resolved by believing that the sequence is a nucleotide (unless UseNetwork is set).
9951
   The Boolean flag UseNetwork supersedes FavorNucleotide, and uses the network to
   resolve conflicts and for 'unknown' or 'unassigned' accessions.
9954
9955 *** .. Assumes that any new accession type is of nucleotide type
9956 (in permissive mode)
9957
9958 *** Using UseNetwork will not prevent unknown (even not in database)
9959 from resulting in a valid seqid.
9960
9961 */
SeqIdFromAccession(CharPtr accession,Uint4 version,CharPtr name)9962 NLM_EXTERN SeqIdPtr LIBCALL SeqIdFromAccession(CharPtr accession, Uint4 version,CharPtr name) {
9963 Boolean Permissive = TRUE;
9964 Boolean UseNetwork = FALSE;
9965 Boolean FavorNucleotide = TRUE;
9966 Boolean AllowPIR = FALSE;
9967 return SeqIdFromAccessionEx(accession,version,name,Permissive, AllowPIR,UseNetwork,FavorNucleotide);
9968 }
9969
9970
SeqIdFromAccessionEx(CharPtr accession,Uint4 version,CharPtr name,Boolean Permissive,Boolean AllowPIR,Boolean UseNetwork,Boolean FavorNucleotide)9971 NLM_EXTERN SeqIdPtr LIBCALL SeqIdFromAccessionEx(CharPtr accession, Uint4 version,CharPtr name,Boolean Permissive, Boolean AllowPIR,Boolean UseNetwork,Boolean FavorNucleotide) {
9972 SeqIdPtr sip;
9973 BioseqPtr bsp=NULL;
9974 TextSeqIdPtr tsp;
9975 Uint4 status;
9976 if(accession==NULL || accession[0]=='\0' || accession[0]=='\n' || accession[0]=='\r')
9977 return NULL;
9978 sip=NULL;
9979 status = WHICH_db_accession(accession);
9980 if(!(ACCN_IS_UNKNOWN(status))) {
9981 Boolean formally_assigned;
9982 formally_assigned = !(ACCN_IS_UNASSIGNED(status));
9983 if(formally_assigned || Permissive) {
9984 /* new support for PDB */
9985 if (status == ACCN_PDB) {
9986 Char pdbstr [41];
9987 if (StringLen (accession) < 8) {
9988 sprintf (pdbstr, "pdb|%s", accession);
9989 sip = SeqIdParse (pdbstr);
9990 return sip;
9991 }
9992 return NULL;
9993 }
9994 sip = ValNodeNew(NULL);
9995 tsp = TextSeqIdNew();
9996 tsp->accession = StringSave(accession);
9997 sip->data.ptrvalue = tsp;
9998 tsp->name = NULL;
9999 tsp->version = version;
10000 if(ACCN_IS_REFSEQ(status)) {
10001 sip->choice = SEQID_OTHER;
10002 } else if(ACCN_IS_SWISSPROT(status)) {
10003 sip->choice = SEQID_SWISSPROT;
10004 } else {
10005 Boolean PIR=FALSE;
10006 if(Permissive && AllowPIR) {
10007 /* In this loop.. can only be PIR of type 1+5 accession */
10008 if( ACCN_PIR_FORMAT(accession) && ((!FavorNucleotide) || UseNetwork)) {
10009 if(UseNetwork) {
10010 sip->choice = SEQID_GENBANK;
10011 bsp = BioseqLockById(sip);
10012 if(bsp) {
10013 if(bsp->mol==Seq_mol_aa)
10014 PIR=TRUE;
10015 BioseqUnlock(bsp);
10016 } else {
10017 sip->choice = SEQID_PIR;
10018 bsp = BioseqLockById(sip);
10019 if(bsp) {
10020 if(bsp->mol==Seq_mol_aa)
10021 PIR=TRUE;
10022 BioseqUnlock(bsp);
10023 } else if (!FavorNucleotide) {
10024 PIR = TRUE;
10025 }
10026 }
10027 } else if(!FavorNucleotide) {
10028 PIR = TRUE;
10029 }
10030 }
10031 }
10032 if(PIR) {
10033 sip->choice = SEQID_PIR;
10034 } else {
10035 if(ACCN_IS_GENBANK(status)) {
10036 sip->choice = SEQID_GENBANK;
10037 } else if (ACCN_IS_EMBL(status)) {
10038 sip->choice = SEQID_EMBL;
10039 } else if (ACCN_IS_DDBJ(status)) {
10040 sip->choice = SEQID_DDBJ;
10041 } else if (ACCN_IS_TPA(status)) {
10042 if (status == ACCN_NCBI_TPA || status == ACCN_NCBI_TPA_PROT) {
10043 sip->choice = SEQID_TPG;
10044 } else if (status == ACCN_EMBL_TPA || status == ACCN_EMBL_TPA_PROT) {
10045 sip->choice = SEQID_TPE;
10046 } else if (status == ACCN_DDBJ_TPA || status == ACCN_DDBJ_TPA_PROT) {
10047 sip->choice = SEQID_TPD;
10048 } else { /* default TPA */
10049 sip->choice = SEQID_TPG;
10050 }
10051 } else /* default */
10052 sip->choice = SEQID_GENBANK;
10053 }
10054 }
10055 }
10056 } else if(Permissive) {
10057 /* can only be a locus name type accession.
10058 (i.e. an arbitrary string .. or a completely
10059 new type/format of accession)
10060 .. any 1+5, 2+6 3+5 refseq accession.. are
10061 handled above.
10062 */
10063 Boolean PIR = FALSE;
10064 sip = ValNodeNew(NULL);
10065 tsp = TextSeqIdNew();
10066 tsp->accession = StringSave(accession);
10067 sip->data.ptrvalue = tsp;
10068 tsp->name = NULL;
10069 tsp->version = version;
10070 sip->choice = SEQID_GENBANK; /* default */
10071 if(AllowPIR && ACCN_PIR_FORMAT(accession) && ( UseNetwork || !FavorNucleotide )) {
10072 if(UseNetwork) { /* Only if user application has
10073 allowed ID1 bioseq Fetching
10074 ID1Init();ID1BioseqFetchEnable("prog",TRUE);
10075 */
10076 bsp = BioseqLockById(sip);
10077 if(bsp) {
10078 if(bsp->mol==Seq_mol_aa) {
10079 SeqIdPtr sip2;
10080 ErrPostEx(SEV_WARNING,0,0,"%s Should NOT be a protein but IS\n",accession);
10081 sip2 = SeqIdFindBestAccession(bsp->id);
10082 sip->choice = sip2->choice;
10083 /* when fetching .. allow for the possibility
10084 of new protein prefix of non PIR type */
10085 if(sip->choice == SEQID_PIR) {
10086 tsp->name = tsp->accession;
10087 tsp->accession = NULL;
10088 PIR=TRUE;
10089 }
10090 } else {
10091 SeqIdPtr sip2;
10092 sip2 = SeqIdFindBestAccession(bsp->id);
10093 sip->choice = sip2->choice;
10094 if(StringCmp(accession,((TextSeqIdPtr)(sip2->data.ptrvalue))->name)==0) {
10095 /*
10096 --> "accession" is the LOCUS
10097 */
10098 tsp->name = tsp->accession;
10099 tsp->accession = NULL;
10100 } /* else unknown(not hardcoded) accession type (not locus)
10101 */
10102 }
10103 } else {
10104 sip->choice = SEQID_PIR;
10105 tsp->name = tsp->accession;
10106 tsp->accession = NULL;
10107 bsp = BioseqLockById(sip);
10108 if(bsp) {
10109 if(bsp->mol==Seq_mol_aa) {
10110 SeqIdPtr sip2;
10111 sip2 = SeqIdFindBestAccession(bsp->id);
10112 if(sip->choice != SEQID_PIR) {
10113 ErrPostEx(SEV_WARNING,0,0,"PIR SeqId retrieve non-PIR sequence!\n");
10114 } else
10115 PIR=TRUE;
10116 } else {
10117 ErrPostEx(SEV_WARNING,0,0,"PIR SeqId retrieve non-amino-acid sequence!\n");
10118 }
10119 }
10120 if(!PIR) {
10121 /* revert to original accession <-> name order
10122 */
10123 tsp->accession = tsp->name;
10124 tsp->name = NULL;
10125 }
10126 }
10127 if(!bsp) { /* No network was available .
10128 or SeqIdFetch failed */
10129 if(!FavorNucleotide) {
10130 PIR = TRUE;
10131 }
10132 if(PIR) {
10133 sip->choice = SEQID_PIR;
10134 tsp->name = tsp->accession;
10135 tsp->accession = NULL;
10136 } else { /* LOCUS NAME SeqId */
10137 sip->choice = SEQID_GENBANK;
10138 tsp->name = tsp->accession;
10139 tsp->accession = NULL;
10140 }
10141 } else
10142 BioseqUnlock(bsp);
10143 } else { /* !UseNetwork */
10144 if(!FavorNucleotide && !UseNetwork) {
10145 PIR = TRUE;
10146 } else if(FavorNucleotide && !UseNetwork) {
10147 PIR = FALSE; /* XXX Should never be called */
10148 }
10149 if(PIR) {
10150 sip->choice = SEQID_PIR;
10151 tsp->name = tsp->accession;
10152 tsp->accession = NULL;
10153 } else { /* LOCUS NAME SeqId */
10154 sip->choice = SEQID_GENBANK;
10155 tsp->name = tsp->accession;
10156 tsp->accession = NULL;
10157 }
10158 }
10159 } else {
10160 /* Permissive .. but
10161 FavorNucleotide && !UseNetwork OR
10162 it doesn't look like a PIR (so it will be assumed it's a NUC .. Independent of FavorNucleotide is.)
10163 */
10164 if(UseNetwork) {
10165 /*
10166 Use network to decide if it is genbank, embl or ddbj
10167 */
10168 sip->choice = SEQID_GENBANK;
10169 bsp = BioseqLockById(sip);
10170 if(bsp) {
10171 SeqIdPtr sip2;
10172 sip2 = SeqIdFindBestAccession(bsp->id);
10173 sip->choice = sip2->choice;
10174 if(StringCmp(accession,((TextSeqIdPtr)(sip2->data.ptrvalue))->name)==0) {
10175 /*
10176 --> "accession" is the LOCUS
10177 */
10178 tsp->name = tsp->accession;
10179 tsp->accession = NULL;
10180 }
10181 BioseqUnlock(bsp);
10182 } /* .. if not found .. Make it anyways */
10183 }
10184 }
10185 }
10186 return sip;
10187 }
10188
10189
10190 /* Variant of SeqIdFromAccession that works on accession.version string (JK) */
10191
SeqIdFromAccessionDotVersion(CharPtr accession)10192 NLM_EXTERN SeqIdPtr SeqIdFromAccessionDotVersion (CharPtr accession)
10193
10194 {
10195 Char accn [41];
10196 CharPtr ptr;
10197 long int ver = INT2_MIN;
10198
10199 StringNCpy_0 (accn, accession, sizeof (accn));
10200 ptr = StringChr (accn, '.');
10201 if (ptr != NULL) {
10202 *ptr = '\0';
10203 ptr++;
10204 if (sscanf (ptr, "%ld", &ver) != 1) {
10205 ver = INT2_MIN;
10206 }
10207 }
10208 return SeqIdFromAccession (accn, (Uint4) ver, NULL);
10209 }
10210
10211
10212 /* N* GSDB accession numbers were made secondary to
10213 genbank or embl or ddbj or genbank records
10214 .. but some of these N numbers had already been assigned by
10215 embl OR ddbj OR genbank.
10216
10217 The net result is that N numbers can belong to either 3 databases,
10218 and the same N-numbers can point to two completely different sequences.
10219 .. One which was an N* from GSDB, the other one from one of the
10220 major databases.
10221
10222 status as of 12/2000 : using the [ACCN] field in Entrez.
10223 Maintenance by H. Sicotte and M. Cavanaug
10224
10225 */
10226 static CharPtr gb_N_numbers = "00008/00013/00018/00019/00027/00041/00046/00048/00052/00054/18624/";
10227 static CharPtr embl_N_numbers = "00060/00064/";
10228 static CharPtr ddbj_N_numbers = "00028/00035/00037/00053/00061/00062/00063/00065/00066/00067/00068/00069/00078/00079/00083/00088/00090/00091/00092/00093/00094/";
10229 static CharPtr embl_ddbj_N_numbers = "00070/";
10230 static CharPtr embl_gb_N_numbers = "00001/00002/00011/00057/";
10231 static CharPtr embl_gb_ddbj_N_numbers = "00005/00009/00012/00020/00022/00025/00058/";
10232 /* No N_* accession assigned for these and N00095 .. N0****
10233 .. and only N18624 assigned in the N1**** range.
10234 N2*..N9* are genbank EST's.
10235 .. all other numbers (below 0have been assigned to BOTH ddbj and genbank.
10236
10237 */
10238 static CharPtr nonexistant_N_numbers = "00071/00072/00073/00074/00075/00076/00077/00080/00081/00082/00084/00085/00086/00087/00089/00095/";
10239
10240 /* N00004 was replaced by another ID () which was withdrawn
10241 */
10242 static CharPtr gb_ddbj_N_numbers = "00003/00004/00006/00007/00010/00014/00015/00016/00017/00021/00023/00024/00026/00029/00030/00031/00032/00033/00034/00036/00038/00039/00040/00042/00043/00044/00045/00047/00049/00050/00051/00055/00056/00059/";
10243
10244
N_accession(CharPtr s)10245 static Uint4 LIBCALL N_accession (CharPtr s) {
10246 Uint4 retcode=ACCN_UNKNOWN;
10247 if(s && (*s=='N' || *s == 'n')) {
10248 Int4 id;
10249 id = atoi(s+1);
10250 if(id>20000) {
10251 retcode = ACCN_NCBI_EST;
10252 } else {
10253 if(id==0 || (id>=95 && id !=18624))
10254 retcode = ACCN_UNKNOWN;
10255 else if(StringStr(embl_N_numbers,s+1)!=NULL)
10256 retcode = ACCN_EMBL_OTHER;
10257 else if (StringStr(ddbj_N_numbers,s+1)!=NULL)
10258 retcode = ACCN_DDBJ_OTHER;
10259 else if (StringStr(gb_N_numbers,s+1)!=NULL)
10260 retcode = ACCN_NCBI_OTHER;
10261 else if (StringStr(nonexistant_N_numbers,s+1)!=NULL)
10262 retcode = ACCN_UNKNOWN;
10263 else if (StringStr(embl_gb_N_numbers,s+1)!=NULL)
10264 retcode = ACCN_EMBL_GB;
10265 else if (StringStr(embl_ddbj_N_numbers,s+1)!=NULL)
10266 retcode = ACCN_EMBL_DDBJ;
10267 else if (StringStr(gb_ddbj_N_numbers,s+1)!=NULL)
10268 retcode = ACCN_GB_DDBJ;
10269 else if (StringStr(embl_gb_ddbj_N_numbers,s+1)!=NULL)
10270 retcode = ACCN_EMBL_GB_DDBJ;
10271 else {
10272 ErrPostEx(SEV_WARNING,0,0,"sequtil::N_accession: Missing N-accession, not accounted for: %s\n",s);
10273 retcode = ACCN_UNKNOWN;
10274 }
10275 }
10276 } else {
10277 ErrPostEx(SEV_WARNING,0,0,"sequtil::N_accession: Function called with non-N accession: %s\n",s == NULL ? "NULL Accession" : s);
10278 retcode = ACCN_UNKNOWN;
10279
10280 }
10281 return retcode;
10282 }
10283
10284
/*
  Functions NAccnIsGENBANK(), NAccnIsEMBL() and NAccnIsDDBJ()
  take an N-accession number and return TRUE if
  it is from the named database.
  They take into account that an N-accession can belong to several databases.
*/
10291
NAccnIsGENBANK(CharPtr s)10292 NLM_EXTERN Boolean LIBCALL NAccnIsGENBANK (CharPtr s) {
10293 Boolean retstatus;
10294 Int4 id;
10295 id = atoi(s+1);
10296 if(*s != 'n' || *s != 'N')
10297 return FALSE;
10298 if(id == 0) {
10299 retstatus = FALSE;
10300 } else if(id>=20000) {
10301 retstatus = TRUE;
10302 } else {
10303 if(StringStr(gb_N_numbers,s+1)!=NULL
10304 || StringStr(embl_gb_N_numbers,s+1)!=NULL
10305 || StringStr(embl_gb_ddbj_N_numbers,s+1)!=NULL
10306 || StringStr(gb_ddbj_N_numbers,s+1)!=NULL)
10307 retstatus = TRUE;
10308 else
10309 retstatus = FALSE;
10310 }
10311 return retstatus;
10312 }
10313
NAccnIsEMBL(CharPtr s)10314 NLM_EXTERN Boolean LIBCALL NAccnIsEMBL (CharPtr s) {
10315 Boolean retstatus;
10316 Int4 id;
10317 id = atoi(s+1);
10318 if(*s != 'n' || *s != 'N')
10319 return FALSE;
10320 if(id == 0 || id>20000) {
10321 retstatus = FALSE;
10322 } else {
10323 if(StringStr(embl_N_numbers,s+1)!=NULL
10324 || StringStr(embl_gb_N_numbers,s+1)!=NULL
10325 || StringStr(embl_ddbj_N_numbers,s+1)!=NULL
10326 || StringStr(embl_gb_ddbj_N_numbers,s+1)!=NULL)
10327 retstatus = TRUE;
10328 else
10329 retstatus = FALSE;
10330 }
10331 return retstatus;
10332 }
10333
NAccnIsDDBJ(CharPtr s)10334 NLM_EXTERN Boolean LIBCALL NAccnIsDDBJ (CharPtr s) {
10335 Boolean retstatus;
10336 Int4 id;
10337 id = atoi(s+1);
10338 if(*s != 'n' || *s != 'N')
10339 return FALSE;
10340 if(id == 0 || id>20000) {
10341 retstatus = FALSE;
10342 } else {
10343 if(StringStr(ddbj_N_numbers,s+1)!=NULL
10344 || StringStr(embl_ddbj_N_numbers,s+1)!=NULL
10345 || StringStr(embl_gb_ddbj_N_numbers,s+1)!=NULL
10346 || StringStr(gb_ddbj_N_numbers,s+1)!=NULL)
10347 retstatus = TRUE;
10348 else
10349 retstatus = FALSE;
10350
10351 }
10352 return retstatus;
10353 }
10354
AccnIsSWISSPROT(CharPtr s)10355 NLM_EXTERN Boolean LIBCALL AccnIsSWISSPROT( CharPtr s) {
10356 Boolean retstatus = FALSE;
10357 if(s && *s && *(s+1) && *(s+2) && *(s+3) && *(s+4) && *(s+5) && *(s+6) ==NULLB) {
10358 if(*s == 'o' || *s == 'O' ||
10359 *s == 'p' || *s == 'P' ||
10360 *s == 'q' || *s == 'Q') {
10361 if(IS_DIGIT(*(s+1))) {
10362 if(IS_ALPHA(*(s+2)) || IS_DIGIT(*(s+2))) {
10363 if(IS_ALPHA(*(s+3)) || IS_DIGIT(*(s+3))) {
10364 if(IS_ALPHA(*(s+4)) || IS_DIGIT(*(s+4))) {
10365 if(IS_DIGIT(*(s+5))) {
10366 retstatus = TRUE;
10367 }
10368 }
10369 }
10370 }
10371 }
10372 }
10373 }
10374
10375 return retstatus;
10376 }
10377
AccnIsUniProt(CharPtr s)10378 NLM_EXTERN Boolean LIBCALL AccnIsUniProt (CharPtr s)
10379
10380 {
10381 Char ch;
10382
10383 if (StringLen (s) != 6) return FALSE;
10384
10385 ch = *s;
10386 if (! IS_ALPHA (ch)) return FALSE;
10387
10388 s++;
10389 ch = *s;
10390 if (! IS_DIGIT (ch)) return FALSE;
10391
10392 s++;
10393 ch = *s;
10394 if (! IS_ALPHA (ch)) return FALSE;
10395
10396 s++;
10397 ch = *s;
10398 if (! (IS_ALPHA (ch) || IS_DIGIT (ch))) return FALSE;
10399
10400 s++;
10401 ch = *s;
10402 if (! (IS_ALPHA (ch) || IS_DIGIT (ch))) return FALSE;
10403
10404 s++;
10405 ch = *s;
10406 if (! IS_DIGIT (ch)) return FALSE;
10407
10408 return TRUE;
10409 }
10410
10411 /*
10412 function to tell if an accession is in the format
10413 of a PIR accession number.
10414 (either a 1+5 accession, or a locus name of length 4-6 alphanumerics)
10415 */
ACCN_PIR_FORMAT(CharPtr s)10416 NLM_EXTERN Boolean LIBCALL ACCN_PIR_FORMAT( CharPtr s) {
10417 Boolean retstatus = FALSE;
10418 if(s) {
10419 Int4 i,l;
10420 l = StringLen(s);
10421 if(*s && *(s+1) && *(s+2) && *(s+3) && l>=4 && l<=6) {
10422 if(IS_ALPHA(*s)) {
10423 retstatus = TRUE;
10424 for(i=1;i<l;i++) {
10425 if(!(IS_ALPHA(*(s+i)) || IS_DIGIT(*(s+i))))
10426 retstatus = FALSE;
10427 }
10428 }
10429 }
10430 }
10431
10432 return retstatus;
10433 }
10434
10435
ACCN_1_5_FORMAT(CharPtr s)10436 NLM_EXTERN Boolean LIBCALL ACCN_1_5_FORMAT( CharPtr s) {
10437 Boolean retstatus = FALSE;
10438 if(s) {
10439 Int4 i;
10440 if(*s && StringLen(s) ==6) {
10441 if(IS_ALPHA(*s)) {
10442 retstatus = TRUE;
10443 for(i=1;i<6;i++) {
10444 if(!(IS_DIGIT(*(s+i))))
10445 retstatus = FALSE;
10446 }
10447 }
10448 }
10449 }
10450 return retstatus;
10451 }
10452
10453
10454 /*****************************************************************************
10455 *
10456 * Function: WHICH_db_accession
10457 *
*   Description: Returns a non-zero code if the input string is a validly
*                formatted database accession number.
*                The return code can be used to infer known information
*                about which database this accession belongs to,
*                using a set of macros in accutils.h
*                (GenBank, EMBL, DDBJ, Swissprot)
10464 * *****WARNING****
10465 *
10466 * this function must be maintained.
10467 * *****WARNING****
10468 *
10469 * Arguments: s : CharPtr; pointer to accession number string.
10470 * Must be null terminated.
10471 *
10472 * Author: Mark Cavanaug, Hugues Sicotte (3/99)
10473 * Date: 7/96(IS_ntdb_accession),3/99(WHICH_db_accession)
10474 *
10475 * WARNING: WHICH_db_accession() does not communicate with any central
10476 * resource about accession numbers. So there's no way to
10477 * inform it automatically about new accession number prefixes.
10478 *
10479 * Version Number ".integer" MUST have been stripped out
10480 * before calling this function.
10481 *****************************************************************************/
WHICH_db_accession(CharPtr s)10482 NLM_EXTERN Uint4 LIBCALL WHICH_db_accession (CharPtr s)
10483 {
10484 Uint4 retcode = 0;
10485 Boolean retval = TRUE;
10486 Boolean first = TRUE;
10487 size_t len;
10488 Int2 i;
10489 Char temp [16];
10490
10491 if (s == NULL || ! *s)
10492 return FALSE;
10493
10494 len = StringLen (s);
10495
10496 if (IS_DIGIT (*s)) {
10497 if (len == 4 || (len > 4 && s [4] == '|')) {
10498 return ACCN_PDB;
10499 }
10500 return ACCN_UNKNOWN;
10501 }
10502
10503 switch (len) {
10504
10505 case 6: /* Old-style 6-character accession */
10506 if (AccnIsUniProt (s)) {
10507 return ACCN_SWISSPROT;
10508 }
10509 while (*s) {
10510 if (retval == FALSE)
10511 break;
10512
10513 if (first) {
10514 if (! IS_ALPHA(*s)) {
10515 retval = FALSE;
10516 break;
10517 }
10518
10519 switch (TO_UPPER(*s)) {
10520
10521 /* Protein SWISS-PROT accessions */
10522 case 'O': case 'P': case 'Q':
10523 if (AccnIsSWISSPROT(s)) {
10524 retcode = ACCN_SWISSPROT;
10525 }
10526 break;
10527
10528 /* GenBank : EST */
10529 case 'H': case 'R': case 'T': case 'W':
10530 retcode = ACCN_NCBI_EST;
10531 break;
10532 case 'N':
10533 retcode = N_accession(s);
10534 break;
10535 /* GenBank : non-EST */
10536 case 'B':
10537 retcode = ACCN_NCBI_GSS;
10538 break;
10539 case 'G':
10540 retcode = ACCN_NCBI_STS;
10541 break;
10542 case 'S':
10543 retcode = ACCN_NCBI_BACKBONE; /* Scanned journal articles */
10544 break;
10545 case 'U':
10546 retcode = ACCN_NCBI_EST;
10547 break;
10548
10549 /* GenBank : before NCBI */
10550 case 'J': case 'K': case 'L': case 'M':
10551 retcode = ACCN_GSDB_DIRSUB;
10552 break;
10553
10554 /* EMBL */
10555 case 'A':
10556 retcode = ACCN_EMBL_PATENT;
10557 break;
10558 case 'F':
10559 retcode = ACCN_EMBL_EST;
10560 break;
10561 case 'V': case 'X': case 'Y': case 'Z':
10562 retcode = ACCN_EMBL_DIRSUB;
10563 break;
10564
10565 /* DDBJ */
10566 case 'C':
10567 retcode = ACCN_DDBJ_EST;
10568 break;
10569 case 'D':
10570 retcode = ACCN_DDBJ_DIRSUB;
10571 break;
10572 case 'E':
10573 retcode = ACCN_DDBJ_PATENT;
10574 break;
10575
10576 /* Case I can be confused with pir accessions which
10577 use the I* protein namespace
10578 */
10579
10580 case 'I' : /* NCBI patent */
10581 retcode = ACCN_NCBI_PATENT;
10582 break;
10583 default: /* should not happen.. all A-Z assigned */
10584 retcode = ACCN_IS_NT;
10585 ErrPostEx(SEV_WARNING,0,0,"sequtil:WHICH_db_accession : Bug in IS_ALPHA macro or memory trashing!!!; accession %s \n",s ==NULL ? "NULL Accession" : s);
10586 break;
10587 }
10588 first = FALSE;
10589 } else {
10590 switch (retcode) {
10591 case ACCN_SWISSPROT:
10592 break;
10593 default:
10594 if (! IS_DIGIT(*s)) {
10595 retval = FALSE;
10596 }
10597 }
10598 }
10599 s++;
10600 }
10601 break;
10602 case 8: /* New 8-character accession, two letters + 6 digits */
10603 /* OR three letters + 5 digits for proteins */
10604 /* Check that have 3 letters */
10605 if(!IS_ALPHA(*s) || !IS_ALPHA(*(s+1)))
10606 break;
10607 if(IS_ALPHA(*(s+2))) {
10608 /* New(1999) 8-character protein accession, three letters + 5 digits */
10609 temp[0] = *s; s++;
10610 temp[1] = *s; s++;
10611 temp[2] = *s; s++;
10612 temp[3] = '\0';
10613
10614 if ((StringICmp(temp,"AAA") >= 0) && (StringICmp(temp,"AZZ") <= 0)) {
10615 retcode = ACCN_NCBI_PROT;
10616 } else if ((StringICmp(temp,"BAA") >= 0) && (StringICmp(temp,"BZZ") <= 0)) {
10617 retcode = ACCN_DDBJ_PROT;
10618 } else if ((StringICmp(temp,"CAA") >= 0) && (StringICmp(temp,"CZZ") <= 0)) {
10619 retcode = ACCN_EMBL_PROT;
10620 } else if ((StringICmp(temp,"DAA") >= 0) && (StringICmp(temp,"DZZ") <= 0)) {
10621 retcode = ACCN_NCBI_TPA_PROT;
10622 } else if ((StringICmp(temp,"EAA") >= 0) && (StringICmp(temp,"EZZ") <= 0)) {
10623 retcode = ACCN_NCBI_WGS_PROT;
10624 } else if ((StringICmp(temp,"FAA") >= 0) && (StringICmp(temp,"FZZ") <= 0)) {
10625 retcode = ACCN_DDBJ_TPA_PROT;
10626 } else if ((StringICmp(temp,"GAA") >= 0) && (StringICmp(temp,"GZZ") <= 0)) {
10627 retcode = ACCN_DDBJ_WGS_PROT;
10628 } else if ((StringICmp(temp,"HAA") >= 0) && (StringICmp(temp,"HZZ") <= 0)) {
10629 retcode = ACCN_NCBI_TPA_PROT;
10630 } else if ((StringICmp(temp,"IAA") >= 0) && (StringICmp(temp,"IZZ") <= 0)) {
10631 retcode = ACCN_DDBJ_TPA_PROT;
10632 } else if ((StringICmp(temp,"JAA") >= 0) && (StringICmp(temp,"JZZ") <= 0)) {
10633 retcode = ACCN_NCBI_TPA_PROT;
10634 } else if ((StringICmp(temp,"KAA") >= 0) && (StringICmp(temp,"KZZ") <= 0)) {
10635 retcode = ACCN_NCBI_WGS_PROT;
10636 } else if ((StringICmp(temp,"LAA") >= 0) && (StringICmp(temp,"LZZ") <= 0)) {
10637 retcode = ACCN_DDBJ_TPA_PROT;
10638 } else if ((StringICmp(temp,"OAA") >= 0) && (StringICmp(temp,"OZZ") <= 0)) {
10639 retcode = ACCN_NCBI_WGS_PROT;
10640 } else if ((StringICmp(temp,"PAA") >= 0) && (StringICmp(temp,"PZZ") <= 0)) {
10641 retcode = ACCN_NCBI_WGS_PROT;
10642 } else if ((StringICmp(temp,"SAA") >= 0) && (StringICmp(temp,"SZZ") <= 0)) {
10643 retcode = ACCN_EMBL_PROT;
10644 } else {
10645 retcode = ACCN_IS_PROTEIN;
10646 retval = TRUE;
10647 break;
10648 }
10649 } else if (IS_DIGIT(*(s+2))) {
10650 /* New 8-character accession, two letters + 6 digits */
10651 temp[0] = *s; s++;
10652 temp[1] = *s; s++;
10653 temp[2] = '\0';
10654
10655 if ((StringICmp(temp,"AA") == 0) ||
10656 (StringICmp(temp,"AI") == 0) ||
10657 (StringICmp(temp,"AW") == 0) ||
10658 (StringICmp(temp,"BE") == 0) ||
10659 (StringICmp(temp,"BF") == 0) ||
10660 (StringICmp(temp,"BG") == 0) ||
10661 (StringICmp(temp,"BI") == 0) ||
10662 (StringICmp(temp,"BM") == 0) ||
10663 (StringICmp(temp,"BQ") == 0) ||
10664 (StringICmp(temp,"BU") == 0) ||
10665 (StringICmp(temp,"CA") == 0) ||
10666 (StringICmp(temp,"CB") == 0) ||
10667 (StringICmp(temp,"CD") == 0) ||
10668 (StringICmp(temp,"CF") == 0) ||
10669 (StringICmp(temp,"CK") == 0) ||
10670 (StringICmp(temp,"CN") == 0) ||
10671 (StringICmp(temp,"CO") == 0) ||
10672 (StringICmp(temp,"CV") == 0) ||
10673 (StringICmp(temp,"CX") == 0) ||
10674 (StringICmp(temp,"DN") == 0) ||
10675 (StringICmp(temp,"DR") == 0) ||
10676 (StringICmp(temp,"DT") == 0) ||
10677 (StringICmp(temp,"DV") == 0) ||
10678 (StringICmp(temp,"DW") == 0) ||
10679 (StringICmp(temp,"DY") == 0) ||
10680 (StringICmp(temp,"EB") == 0) ||
10681 (StringICmp(temp,"EC") == 0) ||
10682 (StringICmp(temp,"EE") == 0) ||
10683 (StringICmp(temp,"EG") == 0) ||
10684 (StringICmp(temp,"EH") == 0) ||
10685 (StringICmp(temp,"EL") == 0) ||
10686 (StringICmp(temp,"ES") == 0) ||
10687 (StringICmp(temp,"EV") == 0) ||
10688 (StringICmp(temp,"EW") == 0) ||
10689 (StringICmp(temp,"EX") == 0) ||
10690 (StringICmp(temp,"EY") == 0) ||
10691 (StringICmp(temp,"FC") == 0) ||
10692 (StringICmp(temp,"FD") == 0) ||
10693 (StringICmp(temp,"FE") == 0) ||
10694 (StringICmp(temp,"FF") == 0) ||
10695 (StringICmp(temp,"FG") == 0) ||
10696 (StringICmp(temp,"FK") == 0) ||
10697 (StringICmp(temp,"FL") == 0) ||
10698 (StringICmp(temp,"GD") == 0) ||
10699 (StringICmp(temp,"GE") == 0) ||
10700 (StringICmp(temp,"GH") == 0) ||
10701 (StringICmp(temp,"GO") == 0) ||
10702 (StringICmp(temp,"GR") == 0) ||
10703 (StringICmp(temp,"GT") == 0) ||
10704 (StringICmp(temp,"GW") == 0) ||
10705 (StringICmp(temp,"HO") == 0) ||
10706 (StringICmp(temp,"HS") == 0) ||
10707 (StringICmp(temp,"JG") == 0) ||
10708 (StringICmp(temp,"JK") == 0) ||
10709 (StringICmp(temp,"JZ") == 0) ) { /* NCBI EST */
10710 retcode = ACCN_NCBI_EST;
10711 } else if ((StringICmp(temp,"BV") == 0) ||
10712 (StringICmp(temp,"GF") == 0)) { /* NCBI STS */
10713 retcode = ACCN_NCBI_STS;
10714 } else if ((StringICmp(temp,"AC") == 0) ||
10715 (StringICmp(temp,"DP") == 0)) { /* NCBI HTGS */
10716 retcode = ACCN_NCBI_HTGS;
10717 } else if ((StringICmp(temp,"AF") == 0) ||
10718 (StringICmp(temp,"AY") == 0) ||
10719 (StringICmp(temp,"DQ") == 0) ||
10720 (StringICmp(temp,"EF") == 0) ||
10721 (StringICmp(temp,"EU") == 0) ||
10722 (StringICmp(temp,"FJ") == 0) ||
10723 (StringICmp(temp,"GQ") == 0) ||
10724 (StringICmp(temp,"GU") == 0) ||
10725 (StringICmp(temp,"HM") == 0) ||
10726 (StringICmp(temp,"JF") == 0)) { /* NCBI direct submission */
10727 retcode = ACCN_NCBI_DIRSUB;
10728 } else if ((StringICmp(temp,"AE") == 0) ||
10729 (StringICmp(temp,"CP") == 0) ||
10730 (StringICmp(temp,"CY") == 0)) { /* NCBI genome project data */
10731 retcode = ACCN_NCBI_GENOME;
10732 } else if ((StringICmp(temp,"AH") == 0)) { /* NCBI segmented set header Bioseq */
10733 retcode = ACCN_NCBI_SEGSET | ACCN_AMBIGOUS_MOL; /* A few segmented proteins are AH */
10734 } else if ((StringICmp(temp,"CH") == 0) ||
10735 (StringICmp(temp,"CM") == 0) ||
10736 (StringICmp(temp,"DS") == 0) ||
10737 (StringICmp(temp,"EM") == 0) ||
10738 (StringICmp(temp,"EN") == 0) ||
10739 (StringICmp(temp,"EP") == 0) ||
10740 (StringICmp(temp,"EQ") == 0) ||
10741 (StringICmp(temp,"FA") == 0) ||
10742 (StringICmp(temp,"GG") == 0) ||
10743 (StringICmp(temp,"GL") == 0) ||
10744 (StringICmp(temp,"JH") == 0) ||
10745 (StringICmp(temp,"KB") == 0) ||
10746 (StringICmp(temp,"KD") == 0) ||
10747 (StringICmp(temp,"KE") == 0) ||
10748 (StringICmp(temp,"KI") == 0) ||
10749 (StringICmp(temp,"KK") == 0) ||
10750 (StringICmp(temp,"KL") == 0) ||
10751 (StringICmp(temp,"KN") == 0) ||
10752 (StringICmp(temp,"KQ") == 0) ||
10753 (StringICmp(temp,"KV") == 0)) { /* NCBI segmented set header Bioseq */
10754 retcode = ACCN_NCBI_SEGSET;
10755 } else if ((StringICmp(temp,"AS") == 0) ||
10756 (StringICmp(temp,"HR") == 0) ||
10757 (StringICmp(temp,"HS") == 0)) { /* NCBI "other" */
10758 retcode = ACCN_NCBI_OTHER;
10759 } else if ((StringICmp(temp,"AD") == 0)) { /* NCBI accessions assigned to GSDB entries */
10760 retcode = ACCN_NCBI_GSDB;
10761 } else if ((StringICmp(temp,"AQ") == 0) ||
10762 (StringICmp(temp,"AZ") == 0) ||
10763 (StringICmp(temp,"BH") == 0) ||
10764 (StringICmp(temp,"BZ") == 0) ||
10765 (StringICmp(temp,"CC") == 0) ||
10766 (StringICmp(temp,"CE") == 0) ||
10767 (StringICmp(temp,"CG") == 0) ||
10768 (StringICmp(temp,"CL") == 0) ||
10769 (StringICmp(temp,"CW") == 0) ||
10770 (StringICmp(temp,"CZ") == 0) ||
10771 (StringICmp(temp,"DU") == 0) ||
10772 (StringICmp(temp,"DX") == 0) ||
10773 (StringICmp(temp,"ED") == 0) ||
10774 (StringICmp(temp,"EI") == 0) ||
10775 (StringICmp(temp,"EJ") == 0) ||
10776 (StringICmp(temp,"EK") == 0) ||
10777 (StringICmp(temp,"ER") == 0) ||
10778 (StringICmp(temp,"ET") == 0) ||
10779 (StringICmp(temp,"FH") == 0) ||
10780 (StringICmp(temp,"FI") == 0) ||
10781 (StringICmp(temp,"GS") == 0) ||
10782 (StringICmp(temp,"HN") == 0) ||
10783 (StringICmp(temp,"HR") == 0) ||
10784 (StringICmp(temp,"JJ") == 0) ||
10785 (StringICmp(temp,"JM") == 0) ||
10786 (StringICmp(temp,"JS") == 0) ||
10787 (StringICmp(temp,"JY") == 0) ||
10788 (StringICmp(temp,"KG") == 0) ||
10789 (StringICmp(temp,"KO") == 0) ||
10790 (StringICmp(temp,"KS") == 0) ) { /* NCBI GSS */
10791 retcode = ACCN_NCBI_GSS;
10792 } else if ((StringICmp(temp,"AR") == 0) ||
10793 (StringICmp(temp,"DZ") == 0) ||
10794 (StringICmp(temp,"EA") == 0) ||
10795 (StringICmp(temp,"GC") == 0) ||
10796 (StringICmp(temp,"GP") == 0) ||
10797 (StringICmp(temp,"GV") == 0) ||
10798 (StringICmp(temp,"GX") == 0) ||
10799 (StringICmp(temp,"GY") == 0) ||
10800 (StringICmp(temp,"GZ") == 0) ||
10801 (StringICmp(temp,"HJ") == 0) ||
10802 (StringICmp(temp,"HK") == 0) ||
10803 (StringICmp(temp,"HL") == 0) ||
10804 (StringICmp(temp,"KH") == 0)) { /* NCBI patent */
10805 retcode = ACCN_NCBI_PATENT;
10806 } else if((StringICmp(temp,"BC")==0)) { /* NCBI long cDNA project : MGC */
10807 retcode = ACCN_NCBI_cDNA;
10808 } else if((StringICmp(temp,"BT")==0)) { /* NCBI FLI_cDNA */
10809 retcode = ACCN_NCBI_cDNA;
10810 } else if ((StringICmp(temp,"BK") == 0) ||
10811 (StringICmp(temp,"BL") == 0) ||
10812 (StringICmp(temp,"GJ") == 0) ||
10813 (StringICmp(temp,"GK") == 0) ||
10814 (StringICmp(temp,"JP") == 0)) { /* NCBI third-party annotation */
10815 retcode = ACCN_NCBI_TPA;
10816 } else if ((StringICmp(temp,"BN") == 0)) { /* EMBL third-party annotation */
10817 retcode = ACCN_EMBL_TPA;
10818 } else if ((StringICmp(temp,"BR") == 0) ||
10819 (StringICmp(temp,"HT") == 0) ||
10820 (StringICmp(temp,"HU") == 0)) { /* DDBJ third-party annotation */
10821 retcode = ACCN_DDBJ_TPA;
10822 } else if((StringICmp(temp,"EZ") == 0) ||
10823 (StringICmp(temp,"HP") == 0) ||
10824 (StringICmp(temp,"HQ") == 0) ||
10825 (StringICmp(temp,"JI") == 0) ||
10826 (StringICmp(temp,"JL") == 0) ||
10827 (StringICmp(temp,"JN") == 0) ||
10828 (StringICmp(temp,"JO") == 0) ||
10829 (StringICmp(temp,"JQ") == 0) ||
10830 (StringICmp(temp,"JR") == 0) ||
10831 (StringICmp(temp,"JT") == 0) ||
10832 (StringICmp(temp,"JU") == 0) ||
10833 (StringICmp(temp,"JV") == 0) ||
10834 (StringICmp(temp,"JW") == 0) ||
10835 (StringICmp(temp,"JX") == 0) ||
10836 (StringICmp(temp,"KA") == 0) ||
10837 (StringICmp(temp,"KC") == 0) ||
10838 (StringICmp(temp,"KF") == 0) ||
10839 (StringICmp(temp,"KJ") == 0) ||
10840 (StringICmp(temp,"KM") == 0) ||
10841 (StringICmp(temp,"KP") == 0) ||
10842 (StringICmp(temp,"KR") == 0) ||
10843 (StringICmp(temp,"KT") == 0) ||
10844 (StringICmp(temp,"KU") == 0) ||
10845 (StringICmp(temp,"KX") == 0) ||
10846 (StringICmp(temp,"KY") == 0)) {
10847 retcode = ACCN_NCBI_TSA;
10848 } else if((StringICmp(temp,"FX") == 0) ||
10849 (StringICmp(temp,"LA") == 0) ||
10850 (StringICmp(temp,"LE") == 0) ||
10851 (StringICmp(temp,"LH") == 0) ||
10852 (StringICmp(temp,"LI") == 0) ||
10853 (StringICmp(temp,"LJ") == 0)) {
10854 retcode = ACCN_DDBJ_TSA;
10855 } else if ((StringICmp(temp,"AJ") == 0) ||
10856 (StringICmp(temp,"AM") == 0) ||
10857 (StringICmp(temp,"FM") == 0) ||
10858 (StringICmp(temp,"FN") == 0) ||
10859 (StringICmp(temp,"FO") == 0) ||
10860 (StringICmp(temp,"FP") == 0) ||
10861 (StringICmp(temp,"FQ") == 0) ||
10862 (StringICmp(temp,"FR") == 0) ||
10863 (StringICmp(temp,"HE") == 0) ||
10864 (StringICmp(temp,"HF") == 0) ||
10865 (StringICmp(temp,"HG") == 0) ||
10866 (StringICmp(temp,"HI") == 0) ||
10867 (StringICmp(temp,"LK") == 0) ||
10868 (StringICmp(temp,"LL") == 0) ||
10869 (StringICmp(temp,"LM") == 0) ||
10870 (StringICmp(temp,"LN") == 0) ||
10871 (StringICmp(temp,"LO") == 0) ||
10872 (StringICmp(temp,"LP") == 0) ||
10873 (StringICmp(temp,"LQ") == 0) ||
10874 (StringICmp(temp,"LR") == 0) ||
10875 (StringICmp(temp,"LS") == 0) ||
10876 (StringICmp(temp,"LT") == 0)) { /* EMBL direct submission */
10877 retcode = ACCN_EMBL_DIRSUB;
10878 } else if ((StringICmp(temp,"AL") == 0) ||
10879 (StringICmp(temp,"BX") == 0)||
10880 (StringICmp(temp,"CR") == 0)||
10881 (StringICmp(temp,"CT") == 0)||
10882 (StringICmp(temp,"CU") == 0)) { /* EMBL genome project data */
10883 retcode = ACCN_EMBL_GENOME;
10884 } else if ((StringICmp(temp,"AN") == 0)) { /* EMBL CON division */
10885 retcode = ACCN_EMBL_CON;
10886 } else if ((StringICmp(temp,"AX") == 0) ||
10887 (StringICmp(temp,"CQ") == 0) ||
10888 (StringICmp(temp,"CS") == 0) ||
10889 (StringICmp(temp,"FB") == 0) ||
10890 (StringICmp(temp,"GM") == 0) ||
10891 (StringICmp(temp,"GN") == 0) ||
10892 (StringICmp(temp,"HA") == 0) ||
10893 (StringICmp(temp,"HB") == 0) ||
10894 (StringICmp(temp,"HC") == 0) ||
10895 (StringICmp(temp,"HD") == 0) ||
10896 (StringICmp(temp,"HH") == 0) ||
10897 (StringICmp(temp,"JA") == 0) ||
10898 (StringICmp(temp,"JB") == 0) ||
10899 (StringICmp(temp,"JC") == 0) ||
10900 (StringICmp(temp,"JD") == 0) ||
10901 (StringICmp(temp,"JE") == 0)) { /* EMBL patent division */
10902 retcode = ACCN_EMBL_PATENT;
10903 } else if ((StringICmp(temp,"AT") == 0) ||
10904 (StringICmp(temp,"AU") == 0) ||
10905 (StringICmp(temp,"AV") == 0) ||
10906 (StringICmp(temp,"BB") == 0) ||
10907 (StringICmp(temp,"BJ") == 0) ||
10908 (StringICmp(temp,"BP") == 0) ||
10909 (StringICmp(temp,"BW") == 0) ||
10910 (StringICmp(temp,"BY") == 0) ||
10911 (StringICmp(temp,"CI") == 0) ||
10912 (StringICmp(temp,"CJ") == 0) ||
10913 (StringICmp(temp,"DA") == 0) ||
10914 (StringICmp(temp,"DB") == 0) ||
10915 (StringICmp(temp,"DC") == 0) ||
10916 (StringICmp(temp,"DK") == 0) ||
10917 (StringICmp(temp,"FS") == 0) ||
10918 (StringICmp(temp,"FY") == 0) ||
10919 (StringICmp(temp,"HX") == 0) ||
10920 (StringICmp(temp,"HY") == 0) ||
10921 (StringICmp(temp,"LU") == 0)) { /* DDBJ EST's */
10922 retcode = ACCN_DDBJ_EST;
10923 } else if ((StringICmp(temp,"AB") == 0) ||
10924 (StringICmp(temp,"LC") == 0)) { /* DDBJ direct submission */
10925 retcode = ACCN_DDBJ_DIRSUB;
10926 } else if ((StringICmp(temp,"AG") == 0) ||
10927 (StringICmp(temp,"AP") == 0) ||
10928 (StringICmp(temp,"BS") == 0)) { /* DDBJ genome project data */
10929 retcode = ACCN_DDBJ_GENOME;
10930 } else if ((StringICmp(temp,"AK") == 0)) { /* DDBJ HTGS */
10931 retcode = ACCN_DDBJ_HTGS;
10932 } else if ((StringICmp(temp,"BA") == 0) ||
10933 (StringICmp(temp,"DF") == 0) ||
10934 (StringICmp(temp,"DG") == 0) ||
10935 (StringICmp(temp,"LD") == 0)) { /* DDBJ CON division */
10936 retcode = ACCN_DDBJ_CON;
10937 } else if ((StringICmp(temp,"BD") == 0) ||
10938 (StringICmp(temp,"DD") == 0) ||
10939 (StringICmp(temp,"DI") == 0) ||
10940 (StringICmp(temp,"DJ") == 0) ||
10941 (StringICmp(temp,"DL") == 0) ||
10942 (StringICmp(temp,"DM") == 0) ||
10943 (StringICmp(temp,"FU") == 0) ||
10944 (StringICmp(temp,"FV") == 0) ||
10945 (StringICmp(temp,"FW") == 0) ||
10946 (StringICmp(temp,"FZ") == 0) ||
10947 (StringICmp(temp,"GB") == 0) ||
10948 (StringICmp(temp,"HV") == 0) ||
10949 (StringICmp(temp,"HW") == 0) ||
10950 (StringICmp(temp,"HZ") == 0) ||
10951 (StringICmp(temp,"LF") == 0) ||
10952 (StringICmp(temp,"LG") == 0) ||
10953 (StringICmp(temp,"LV") == 0) ||
10954 (StringICmp(temp,"LX") == 0)) { /* DDBJ patent division */
10955 retcode = ACCN_DDBJ_PATENT;
10956 } else if ((StringICmp(temp,"DE") == 0) ||
10957 (StringICmp(temp,"DH") == 0) ||
10958 (StringICmp(temp,"FT") == 0) ||
10959 (StringICmp(temp,"GA") == 0) ||
10960 (StringICmp(temp,"LB") == 0)) { /* DDBJ GSS */
10961 retcode = ACCN_DDBJ_GSS;
10962 } else {
10963 retcode = ACCN_IS_NT;
10964 break;
10965 }
10966
10967 while (*s) {
10968 if (! IS_DIGIT(*s)) {
10969 retval = FALSE;
10970 break;
10971 }
10972 s++;
10973 }
10974 break;
10975 } else {
10976 retval = FALSE;
10977 break;
10978 }
10979 break;
10980 case 9: /* New 9-character accession, two letters +"_"+ 6 digits */
10981 if(!IS_ALPHA(*s) || !IS_ALPHA(*(s+1)))
10982 break;
10983 if(*(s+2)!='_')
10984 break;
10985 /* New(1999) 8-character protein accession, three letters + 5 digits */
10986 temp[0] = *s; s++;
10987 temp[1] = *s; s++;
10988 temp[2] = NULLB; s++;
10989
10990 if ((StringICmp(temp,"NP") == 0) || (StringICmp(temp,"AP") == 0)) {
10991 retcode = ACCN_REFSEQ_PROT;
10992 } else if ((StringICmp(temp,"NM") == 0)) {
10993 retcode = ACCN_REFSEQ_mRNA;
10994 } else if ((StringICmp(temp,"NT") == 0)) {
10995 retcode = ACCN_REFSEQ_CONTIG;
10996 } else if ((StringICmp(temp,"NW") == 0)) {
10997 retcode = ACCN_REFSEQ_CONTIG;
10998 } else if ((StringICmp(temp,"NC") == 0)) {
10999 retcode = ACCN_REFSEQ_CHROMOSOME;
11000 } else if ((StringICmp(temp,"XM") == 0)) {
11001 retcode = ACCN_REFSEQ_mRNA_PREDICTED;
11002 } else if ((StringICmp(temp,"XP") == 0)) {
11003 retcode = ACCN_REFSEQ_PROT_PREDICTED;
11004 } else if ((StringICmp(temp,"NG") == 0) || (StringICmp(temp,"AC") == 0)) {
11005 retcode = ACCN_REFSEQ_GENOMIC;
11006 } else if ((StringICmp(temp,"NS") == 0)) {
11007 retcode = ACCN_REFSEQ_ARTIFICIAL_ASSEMBLY;
11008 } else if (IS_ALPHA(*temp) && IS_ALPHA(*(temp+1))) {
11009 retcode =ACCN_REFSEQ | ACCN_AMBIGOUS_MOL;
11010 } else
11011 retval = FALSE;
11012 while (*s) {
11013 if (! IS_DIGIT(*s)) {
11014 retval = FALSE;
11015 break;
11016 }
11017 s++;
11018 }
11019 break;
11020 case 10: /* New 10-character accession, three letters +"_"+ 6 digits */
11021 if(!IS_ALPHA(*s) || !IS_ALPHA(*(s+1)))
11022 break;
11023 if(*(s+3)!='_')
11024 break;
11025 temp[0] = *s; s++;
11026 temp[1] = *s; s++;
11027 temp[2] = *s; s++;
11028 temp[3] = NULLB; s++;
11029
11030 if ((StringICmp(temp,"MAP") == 0)) {
11031 while (*s) {
11032 if (! IS_DIGIT(*s)) {
11033 retval = FALSE;
11034 break;
11035 }
11036 s++;
11037 }
11038 retcode = ACCN_NCBI_OTHER;
11039 } else
11040 retval = FALSE;
11041 break;
11042 case 11: /* New 11-character accession, two letters +"_"+ 8 digits */
11043 if(!IS_ALPHA(*s) || !IS_ALPHA(*(s+1)))
11044 break;
11045 if(*(s+2)!='_')
11046 break;
11047 temp[0] = *s; s++;
11048 temp[1] = *s; s++;
11049 temp[2] = NULLB; s++;
11050
11051 if ((StringICmp(temp,"ZP") == 0)) {
11052 retcode = ACCN_REFSEQ_PROT_PREDICTED;
11053 } else
11054 retval = FALSE;
11055 while (*s) {
11056 if (! IS_DIGIT(*s)) {
11057 retval = FALSE;
11058 break;
11059 }
11060 s++;
11061 }
11062 break;
11063 case 12:
11064 case 13:
11065 case 14:
11066 if(IS_ALPHA(*s) && IS_ALPHA(*(s+1)) && IS_ALPHA(*(s+2)) && IS_ALPHA(*(s+3))) {
11067 /* whole genome shotgun 12-14-character accession, four letters + 8-10 digits */
11068 temp[0] = *s; s++;
11069 temp[1] = *s; s++;
11070 temp[2] = *s; s++;
11071 temp[3] = *s; s++;
11072 temp[4] = '\0';
11073 if ((StringNICmp(temp,"A", 1) == 0)) {
11074 retcode = ACCN_NCBI_WGS;
11075 } else if ((StringNICmp(temp,"B", 1) == 0)) {
11076 retcode = ACCN_DDBJ_WGS;
11077 } else if ((StringNICmp(temp,"C", 1) == 0)) {
11078 retcode = ACCN_EMBL_WGS;
11079 } else if ((StringNICmp(temp,"D", 1) == 0)) {
11080 retcode = ACCN_NCBI_WGS_TPA;
11081 } else if ((StringNICmp(temp,"E", 1) == 0)) {
11082 retcode = ACCN_DDBJ_WGS_TPA;
11083 } else if ((StringNICmp(temp,"F", 1) == 0)) {
11084 retcode = ACCN_EMBL_WGS;
11085 } else if ((StringNICmp(temp,"G", 1) == 0)) {
11086 retcode = ACCN_NCBI_TSA;
11087 } else if ((StringNICmp(temp,"H", 1) == 0)) {
11088 retcode = ACCN_EMBL_TSA;
11089 } else if ((StringNICmp(temp,"I", 1) == 0)) {
11090 retcode = ACCN_DDBJ_TSA;
11091 } else if ((StringNICmp(temp,"J", 1) == 0)) {
11092 retcode = ACCN_NCBI_WGS;
11093 } else if ((StringNICmp(temp,"K", 1) == 0)) {
11094 retcode = ACCN_NCBI_TARGETED;
11095 } else if ((StringNICmp(temp,"L", 1) == 0)) {
11096 retcode = ACCN_NCBI_WGS;
11097 } else if ((StringNICmp(temp,"M", 1) == 0)) {
11098 retcode = ACCN_NCBI_WGS;
11099 } else if ((StringNICmp(temp,"N", 1) == 0)) {
11100 retcode = ACCN_NCBI_WGS;
11101 } else if ((StringNICmp(temp,"O", 1) == 0)) {
11102 retcode = ACCN_EMBL_WGS;
11103 } else if ((StringNICmp(temp,"P", 1) == 0)) {
11104 retcode = ACCN_NCBI_WGS;
11105 } else if ((StringNICmp(temp,"Q", 1) == 0)) {
11106 retcode = ACCN_NCBI_WGS;
11107 } else if ((StringNICmp(temp,"R", 1) == 0)) {
11108 retcode = ACCN_NCBI_WGS;
11109 } else if ((StringNICmp(temp,"S", 1) == 0)) {
11110 retcode = ACCN_NCBI_PATENT;
11111 } else
11112 retval = FALSE;
11113 while (*s) {
11114 if (! IS_DIGIT(*s)) {
11115 retval = FALSE;
11116 break;
11117 }
11118 s++;
11119 }
11120 } else if(len == 12 && IS_ALPHA(*s) && IS_ALPHA(*(s+1)) && (*(s+2)=='_')) {
11121 /* New 12-character accession, two letters +"_"+ 9 digits */
11122 temp[0] = *s; s++;
11123 temp[1] = *s; s++;
11124 temp[2] = NULLB; s++;
11125
11126 if ((StringICmp(temp,"NP") == 0)) {
11127 retcode = ACCN_REFSEQ_PROT;
11128 } else if ((StringICmp(temp,"NM") == 0)) {
11129 retcode = ACCN_REFSEQ_mRNA;
11130 } else if ((StringICmp(temp,"NW") == 0)) {
11131 retcode = ACCN_REFSEQ_CONTIG;
11132 } else if ((StringICmp(temp,"XM") == 0)) {
11133 retcode = ACCN_REFSEQ_mRNA_PREDICTED;
11134 } else if ((StringICmp(temp,"XP") == 0)) {
11135 retcode = ACCN_REFSEQ_PROT_PREDICTED;
11136 } else if (IS_ALPHA(*temp) && IS_ALPHA(*(temp+1))) {
11137 retcode =ACCN_REFSEQ | ACCN_AMBIGOUS_MOL;
11138 } else
11139 retval = FALSE;
11140 while (*s) {
11141 if (! IS_DIGIT(*s)) {
11142 retval = FALSE;
11143 break;
11144 }
11145 s++;
11146 }
11147 }
11148 break;
11149 case 15:
11150 case 16:
11151 if (IS_ALPHA(*s) && IS_ALPHA(*(s+1)) && (*(s+2)=='_')) {
11152 /* New 15-16-character accession, two letters +"_"+ four letters + 8-9 digits */
11153 temp[0] = *s; s++;
11154 temp[1] = *s; s++;
11155 temp[2] = NULLB; s++;
11156
11157 if ((StringICmp(temp,"NZ") == 0)) {
11158 retcode = ACCN_REFSEQ_WGS;
11159 } else
11160 retval = FALSE;
11161 for (i = 0; i < 4; i++) {
11162 if (! IS_ALPHA (*s)) {
11163 retval = FALSE;
11164 break;
11165 }
11166 s++;
11167 }
11168 while (*s) {
11169 if (! IS_DIGIT(*s)) {
11170 retval = FALSE;
11171 break;
11172 }
11173 s++;
11174 }
11175 }
11176 break;
11177 default:
11178 retval = FALSE;
11179 break;
11180 } /* Endswitch, StringLen(s) */
11181
11182 return (retval ? retcode : ACCN_UNKNOWN);
11183 }
11184
11185 /****************************************************************************
11186 *
11187 * Function: IS_ntdb_accession
11188 *
11189 * Description: Return TRUE if the input string is a validly formatted
11190 * nucleotide database accession number (GenBank, EMBL, DDBJ, REFSEQ)
11191 * ***WARNING*** DOES NO network access, relies on hardcoding in WHICH_db_accession.
11192 *
11193 * Arguments: s : CharPtr; pointer to accession number string.
11194 * Must be null terminated.
11195 *
11196 * Author: Mark Cavanaugh, Hugues Sicotte
11197 * Date: 7/96,HS 12/2000
11198 *
11199 * WARNING: IS_ntdb_accession() does not communicate with any central
11200 * resource about accession numbers. So there's no way to
11201 * inform it automatically about new accession number prefixes.
11202 *
11203 * Version Number ".integer" MUST have been stripped out
11204 * before calling this function.
11205 *****************************************************************************/
11206
IS_ntdb_accession(CharPtr s)11207 NLM_EXTERN Boolean LIBCALL IS_ntdb_accession (CharPtr s) {
11208 Uint4 status;
11209 status = WHICH_db_accession(s);
11210 return (Boolean)(ACCN_IS_NUC(status));
11211 }
11212
11213 /*****************************************************************************
11214 *
11215 * Function: IS_protdb_accession
11216 *
11217 * Description: Return TRUE if the input string is a validly formatted
11218 * protein database accession number (SWISS-PROT)
11219 * or the new 3 letter protein ID.
11220 *
11221 * ***WARNING*** DOES NO network access, relies on hardcoding in WHICH_db_accession.
11222 *
11223 * Arguments: s : CharPtr; pointer to accession number string.
11224 * Must be null terminated.
11225 *
11226 * Author: Mark Cavanaugh, Hugues Sicotte (3/99)
11227 * Date: 8/96, 3/99HS,12/2000
11228 *
11229 * WARNING: IS_protdb_accession() does not communicate with any central
11230 * resource about accession numbers. So there's no way to
11231 * inform it automatically about new accession number prefixes.
11232 *
11233 * Version Number ".integer" MUST have been stripped out
11234 * before calling this function.
11235 *****************************************************************************/
11236
IS_protdb_accession(CharPtr s)11237 NLM_EXTERN Boolean LIBCALL IS_protdb_accession (CharPtr s) {
11238 Uint4 status;
11239 status = WHICH_db_accession(s);
11240 return (Boolean)(ACCN_IS_PROT(status));
11241 }
11242
/*
  Determine whether the Bioseq identified by a SeqId is referenced by any
  SeqLoc in a list of SeqLocs.
  May fetch the Bioseq to get all the synonymous SeqIds.
*/
11247
SeqIdInSeqLocList(SeqIdPtr sip,ValNodePtr list)11248 NLM_EXTERN Boolean LIBCALL SeqIdInSeqLocList(SeqIdPtr sip, ValNodePtr list) {
11249 ValNodePtr vnptmp;
11250 SeqIdPtr siptmp;
11251 SeqLocPtr slp;
11252
11253 for (vnptmp=list; vnptmp!=NULL; vnptmp=vnptmp->next)
11254 {
11255 siptmp = SeqLocId((SeqLocPtr)vnptmp->data.ptrvalue);
11256 if (siptmp!=NULL) {
11257 if (SeqIdForSameBioseq(sip, siptmp))
11258 return TRUE;
11259 } else if((slp=(SeqLocPtr)vnptmp->data.ptrvalue)!=NULL && (
11260 slp->choice == SEQLOC_PACKED_INT ||
11261 slp->choice == SEQLOC_MIX ||
11262 slp->choice == SEQLOC_EQUIV)) {
11263 slp = (SeqLocPtr)slp->data.ptrvalue;
11264 while(slp!=NULL) {
11265 siptmp = SeqLocId(slp);
11266 if (siptmp!=NULL) {
11267 if (SeqIdForSameBioseq(sip, siptmp))
11268 return TRUE;
11269 }
11270 slp=slp->next;
11271 }
11272 }
11273 }
11274 return FALSE;
11275 }
11276
11277 /*********************************************************
11278 ***
11279 *** AddSeqId : create a new seqid and add at the end
11280 *** of the list starting with sip_head
11281 ***
11282 **********************************************************/
AddSeqId(SeqIdPtr * sip_head,SeqIdPtr sip)11283 NLM_EXTERN SeqIdPtr AddSeqId (SeqIdPtr *sip_head, SeqIdPtr sip)
11284 {
11285 SeqIdPtr sip_tmp,
11286 sip_copy;
11287
11288 sip_copy = SeqIdDup (sip);
11289 sip_tmp = sip_copy->next;
11290 sip_copy->next = NULL;
11291 if (sip_tmp!=NULL)
11292 SeqIdFree (sip_tmp);
11293 if ( (sip_tmp = *sip_head) != NULL ) {
11294 while (sip_tmp->next != NULL)
11295 sip_tmp = sip_tmp->next;
11296 sip_tmp->next = sip_copy;
11297 }
11298 else {
11299 *sip_head = sip_copy;
11300 }
11301 return (*sip_head);
11302
11303 }
11304
11305 /*******************************************************
11306 ***
11307 *** SeqIdDupList : duplicate a list of SeqIdPtr
11308 ***
11309 *******************************************************/
SeqIdDupList(SeqIdPtr id_list)11310 NLM_EXTERN SeqIdPtr SeqIdDupList (SeqIdPtr id_list)
11311 {
11312 SeqIdPtr sip=NULL;
11313 SeqIdPtr sid;
11314
11315 for (sid = id_list; sid != NULL; sid = sid->next) {
11316 sip = AddSeqId (&sip, sid);
11317 }
11318 return sip;
11319 }
11320
SeqIdDupBestList(SeqIdPtr id_list)11321 NLM_EXTERN SeqIdPtr SeqIdDupBestList (SeqIdPtr id_list)
11322 {
11323 SeqIdPtr sip=NULL;
11324 SeqIdPtr sid, sid2;
11325 BioseqPtr bsp;
11326
11327 for (sid = id_list; sid != NULL; sid = sid->next) {
11328 sid2 = NULL;
11329 bsp = BioseqLockById (sid);
11330 if (bsp!=NULL) {
11331 sid2 = SeqIdFindBest(bsp->id, 0);
11332 BioseqUnlock (bsp);
11333 }
11334 if (sid2!=NULL)
11335 sip = AddSeqId (&sip, sid2);
11336 else
11337 sip = AddSeqId (&sip, sid);
11338 }
11339 return sip;
11340 }
11341
SeqIdListfromSeqLoc(ValNodePtr vnpslp)11342 NLM_EXTERN SeqIdPtr SeqIdListfromSeqLoc (ValNodePtr vnpslp)
11343 {
11344 SeqIdPtr sip=NULL, siptmp;
11345 ValNodePtr vnp=NULL;
11346 Int2 j = 0, k;
11347 for (vnp = vnpslp; vnp != NULL; vnp = vnp->next)
11348 {
11349 sip = AddSeqId (&sip, SeqLocId ((SeqLocPtr) vnp->data.ptrvalue));
11350 j++;
11351 }
11352 if (sip!=NULL) {
11353 for (siptmp=sip, k=0; k<j-1; siptmp=siptmp->next, k++) continue;
11354 siptmp->next = NULL;
11355 }
11356 return sip;
11357 }
11358
11359
11360 /* We frequently do not want to use TMSMART, BankIt, and NCBIFILE IDs
11361 * in displays, formatting, etc.
11362 */
IsSkippableDbtag(DbtagPtr dbt)11363 NLM_EXTERN Boolean IsSkippableDbtag (DbtagPtr dbt)
11364 {
11365 if (dbt == NULL
11366 || StringICmp (dbt->db, "TMSMART") == 0
11367 || StringICmp (dbt->db, "BankIt") == 0
11368 || StringICmp (dbt->db, "NCBIFILE") == 0) {
11369 return TRUE;
11370 } else {
11371 return FALSE;
11372 }
11373 }
11374
11375
DoesCDSEndWithStopCodon(SeqFeatPtr cds)11376 NLM_EXTERN Boolean DoesCDSEndWithStopCodon (SeqFeatPtr cds)
11377 {
11378 ByteStorePtr bs;
11379 CharPtr prot_str;
11380 Boolean retval = FALSE;
11381
11382 if (cds == NULL || cds->data.choice != SEQFEAT_CDREGION) {
11383 return FALSE;
11384 }
11385 bs = ProteinFromCdRegionEx (cds, TRUE, FALSE);
11386 if (bs == NULL) return FALSE;
11387 prot_str = BSMerge (bs, NULL);
11388 bs = BSFree (bs);
11389 if (prot_str == NULL) return FALSE;
11390
11391 if (prot_str[StringLen (prot_str) - 1] == '*') {
11392 retval = TRUE;
11393 } else {
11394 retval = FALSE;
11395 }
11396 prot_str = MemFree (prot_str);
11397 return retval;
11398 }
11399
11400
11401