1 /*  sequtil.h
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * File Name:  sequtil.h
27 *
28 * Author:  James Ostell
29 *
30 * Version Creation Date: 4/1/91
31 *
32 * $Revision: 6.65 $
33 *
34 * File Description:  Sequence Utilities for objseq and objsset
35 *
36 * ==========================================================================
37 */
38 
39 #ifndef _NCBI_SeqUtil_
40 #define _NCBI_SeqUtil_
41 
42 #ifndef _NCBI_Seqset_
43 #include <objsset.h>		   /* the object loader interface */
44 #endif
45 
46 #ifndef _NCBI_SeqMgr_
47 #include <seqmgr.h>		   /* the Bioseq and SeqEntry manager */
48 #endif
49 
50 #undef NLM_EXTERN
51 #ifdef NLM_IMPORT
52 #define NLM_EXTERN NLM_IMPORT
53 #else
54 #define NLM_EXTERN extern
55 #endif
56 
57 #ifdef __cplusplus
58 extern "C" {
59 #endif
60 
61   /*************************************************************
62    *    this define decides if SeqIdWrite shows versions,
63    *    if seqmgr seqid indexing functions use it
64    *    and if e2index uses it
65    *    files depending on SHOWVERSION are:
66    *    sequtil.c, seqmgr.c, e2iloc.c
67    *    SHOWVERSION should be removed entirely when we are through
68    *    the transition
69    ************************************************************/
70 
71 #define SHOWVERSION 1    /* do show versions */
72 
73 /*****************************************************************************
74 *
75 *   What am I?
76 *
77 *****************************************************************************/
78 NLM_EXTERN Uint1 Bioseq_repr(BioseqPtr bsp);
79 NLM_EXTERN Uint1 BioseqGetCode(BioseqPtr bsp);
80 
81 NLM_EXTERN ValNodePtr BioseqGetSeqDescr(BioseqPtr bsp, Int2 type, ValNodePtr curr);
82 NLM_EXTERN CharPtr BioseqGetTitle(BioseqPtr bsp);
83 NLM_EXTERN NumberingPtr BioseqGetNumbering(BioseqPtr bsp);
84 
85 NLM_EXTERN Int4 BioseqGetLen(BioseqPtr bsp);
86 NLM_EXTERN Int4 BioseqGetGaps(BioseqPtr bsp);
87 NLM_EXTERN Int4 BioseqGetSegLens(BioseqPtr bsp, Int4Ptr lens);
88 #define BioseqCountSegs(x) BioseqGetSegLens(x, NULL)   /* passes NULL for lens; presumably only the segment count is returned - confirm in sequtil.c */
89 
90 NLM_EXTERN Boolean BioseqConvert(BioseqPtr bsp, Uint1 newcode);
91 NLM_EXTERN Boolean BioseqPack(BioseqPtr bsp);
92 NLM_EXTERN Boolean SeqLitPack(SeqLitPtr slp);
93 NLM_EXTERN Boolean BioseqRawConvert(BioseqPtr bsp, Uint1 newcode);
94 NLM_EXTERN Boolean BioseqRawPack(BioseqPtr bsp);
95 NLM_EXTERN ByteStorePtr BSConvertSeq(ByteStorePtr bsp, Uint1 newcode, Uint1 oldcode, Int4 seqlen);
96 NLM_EXTERN ByteStorePtr BSPack(ByteStorePtr from, Uint1 oldcode, Int4 length, Uint1Ptr newcodeptr);
97 
98 NLM_EXTERN CharPtr StringForSeqMethod(Int2 method);
99 
100 NLM_EXTERN CharPtr StringForSeqTech(Int2 tech);
101 
102 /*****************************************************************************
103 *
104 *  Hook function definition for DNA Compression: the type of the
105 *  read_func / write_func callbacks taken by GenericCompressDNA[Ex]()
106 *****************************************************************************/
107 typedef Int4 (*CompressRWFunc)(Pointer data,
108                                        Uint1Ptr buf, Int4 length);
109 
110 /*****************************************************************************
111 *
112 *   SeqCodeTable routines
113 *   SeqMapTable routines
114 *     Convert and Comp return INVALID_RESIDUE when a residue is out of range
115 *
116 *****************************************************************************/
117 #define INVALID_RESIDUE 255
118 
119 /*****************************************************************************
120 *
121 *   SeqCodeTablePtr SeqCodeTableFind(code)
122 *   	Sequence codes defined in objseq.h
123 *
124 *****************************************************************************/
125 NLM_EXTERN SeqCodeTablePtr LIBCALL SeqCodeTableFind(Uint1 code);
126 
127 /*****************************************************************************
128 *
129 *   SeqCodeTableComp(sctp, residue)
130 *       returns complement of residue if possible
131 *       or residue, if not
132 *       assumes residue is in the same code as sctp
133 *
134 *****************************************************************************/
135 NLM_EXTERN Uint1 SeqCodeTableComp(SeqCodeTablePtr sctp, Uint1 residue);
136 
137 /*****************************************************************************
138 *
139 *   OneLetterCode(sctp)
140 *   	returns TRUE if sequence code table sctp uses one letter symbols
141 *
142 *****************************************************************************/
143 NLM_EXTERN Boolean OneLetterCode(SeqCodeTablePtr sctp);
144 
145 /*****************************************************************************
146 *
147 *   FirstResidueInCode(sctp)
148 *   	returns first valid residue code in sequence code table
149 *
150 *****************************************************************************/
151 NLM_EXTERN Uint1 FirstResidueInCode(SeqCodeTablePtr sctp);
152 
153 /*****************************************************************************
154 *
155 *   LastResidueInCode(sctp)
156 *      returns last valid residue code in sequence code table
157 *      nb: some codes have "holes", a range of invalid values between first
158 *      and last.
159 *
160 *****************************************************************************/
161 NLM_EXTERN Uint1 LastResidueInCode(SeqCodeTablePtr sctp);
162 
163 /*****************************************************************************
164 *
165 *   GetSymbolForResidue(sctp, residue)
166 *   	returns the ONE LETTER symbol for residue if sequence code has one
167 *       letter symbols. returns INVALID_RESIDUE if not a valid residue or if
168 *       sequence code uses multi-letter symbols
169 *
170 *****************************************************************************/
171 NLM_EXTERN Uint1 GetSymbolForResidue(SeqCodeTablePtr sctp, Uint1 residue);
172 
173 /*****************************************************************************
174 *
175 *   GetResidueForSymbol(sctp, symbol)
176 *   	returns the residue for a ONE LETTER symbol if sequence code has one
177 *       letter symbols. returns INVALID_RESIDUE if not a valid symbol or if
178 *       sequence code uses multi-letter symbols
179 *       CASE matters
180 *
181 *****************************************************************************/
182 NLM_EXTERN Uint1 GetResidueForSymbol(SeqCodeTablePtr sctp, Uint1 symbol);
183 
184 /*****************************************************************************
185 *
186 *   GetLongSymbolForResidue(sctp, residue)
187 *   	returns string symbol for residue if sequence code has string
188 *       symbols. returns NULL if not a valid residue or if
189 *       sequence code uses One letter symbols
190 *
191 *****************************************************************************/
192 NLM_EXTERN const char * GetLongSymbolForResidue(SeqCodeTablePtr sctp, Uint1 residue);
193 
194 /*****************************************************************************
195 *
196 *   GetResidueForLongSymbol(sctp, symbol)
197 *   	returns the residue for a STRING symbol if sequence code has string
198 *       symbols. returns INVALID_RESIDUE if not a valid symbol or if
199 *       sequence code uses one-letter symbols
200 *       CASE matters
201 *
202 *****************************************************************************/
203 NLM_EXTERN Uint1 GetResidueForLongSymbol(SeqCodeTablePtr sctp, CharPtr symbol);
204 
205 /*****************************************************************************
206 *
207 *   const char * GetNameForResidue (sctp, residue)
208 *      returns the descriptive name (eg. "Leucine") for a residue in the
209 *      sequence code defined by sctp
210 *      returns NULL if not a valid code in the alphabet
211 *      nb: some codes have "holes" in them, regions of values that are
212 *       invalid.
213 *
214 *****************************************************************************/
215 NLM_EXTERN const char * GetNameForResidue(SeqCodeTablePtr sctp, Uint1 residue);
216 
217 /*****************************************************************************
218 *
219 *   SeqMapTablePtr SeqMapTableFind(to, from)
220 *      Map from sequence code "from" to sequence code "to"
221 *      Sequence codes defined in objseq.h
222 *
223 *****************************************************************************/
224 NLM_EXTERN SeqMapTablePtr LIBCALL SeqMapTableFind(Uint1 to, Uint1 from);
225 
226 /*****************************************************************************
227 *
228 *   SeqMapTableConvert(smtp, residue)
229 *       returns conversion of "residue" using SeqMapTable smtp
230 *
231 *****************************************************************************/
232 NLM_EXTERN Uint1 SeqMapTableConvert(SeqMapTablePtr smtp, Uint1 residue);
233 
234 /*****************************************************************************
235 *
236 *   Convert4NaRandom(from, to)
237 *       Converts Seq_code_ncbi4na "from" to  Seq_code_ncbi2na "to"
238 *       with random conversions
239 *       Return TRUE if conversion done without randomization
240 *****************************************************************************/
241 NLM_EXTERN Boolean Convert4NaRandom(Uint1 from, Uint1 PNTR to);
242 
243 /*****************************************************************************
244 *
245 *   BSCompressDNA(bytestoreptr, len, lbytes)
246 *       converts a ncbi4na bytestore into ncbi2na
247 *       returns pointer to ambiguity storage
248 *       lbytes[0] == length of this storage
249 *       frees old bytestore
250 *       returns pointer to new one, or NULL on fail.
251 *       len is residues
252 *
253 *****************************************************************************/
254 NLM_EXTERN ByteStorePtr BSCompressDNA(ByteStorePtr from, Int4 len,
255                                   Uint4Ptr PNTR lbytes);
256 NLM_EXTERN ByteStorePtr BSCompressDNANew(ByteStorePtr from, Int4 len,
257                                   Uint4Ptr PNTR lbytes);
258   /* To be removed */
259 NLM_EXTERN ByteStorePtr BSCompressDNAOld(ByteStorePtr from, Int4 len,
260                                      Uint4Ptr PNTR lbytes);
261 
262 /*****************************************************************************
263 *
264 *   GenericCompressDNA()
265 *       converts from VoidPtr "from" in 4na encoding to
266 *       VoidPtr "to" in 2Na encoding
267 *       returns pointer to ambiguity storage
268 *       lbytes[0] == length of this storage
269 *       returns TRUE if succeeded, or FALSE on fail.
270 *       seq_len is maximum number of residues in sequence
271 *       or ((Uint4) -1) if final length is unknown.
272 *       read_func and write_func - hook functions to read from "from"
273 *       and to write to "to"
274 *
275 *       NOTE! read_func must return the number of residues read, which is
276 *             usually twice the number of bytes returned. Only the last byte
277 *             returned may hold a single residue; this is handled by the
278 *             seq_len value or the value returned from read_func()
279 *****************************************************************************/
280 NLM_EXTERN Boolean GenericCompressDNA(VoidPtr from,
281                                   VoidPtr to,
282                                   Uint4 length,
283                                   CompressRWFunc read_func,
284                                   CompressRWFunc write_func,
285                                   Uint4Ptr PNTR lbytes);
286 
287 NLM_EXTERN Boolean GenericCompressDNAEx(VoidPtr from,
288                                   VoidPtr to,
289                                   Uint4 length,
290                                   CompressRWFunc read_func,
291                                   CompressRWFunc write_func,
292                                   Uint4Ptr PNTR lbytes,
293                                   Boolean x_new);
294 
295 /*****************************************************************************
296 *
297 *   BSRebuildDNA(bytestoreptr, len, lbytes)
298 *       restore ASCII sequence with ambiguity characters
299 *       lbytes[0] == length of this storage
300 *       frees old bytestore
301 *       returns pointer to new one, or NULL on fail.
302 *       len is residues
303 *       lbytes is pointer to ambiguity storage
304 *
305 *****************************************************************************/
306 NLM_EXTERN ByteStorePtr BSRebuildDNA(ByteStorePtr from, Int4 len,
307                                  Uint4Ptr PNTR lbytes);
308 NLM_EXTERN Boolean RebuildDNA_4na (Uint1Ptr buffer, Int4 length, Uint4Ptr lbytes);
309 
310 /*****************************************************************************
311 *
312 *   BSRebuildDNA_4na(bytestoreptr, lbytes)
313 *       restore ncbi4na sequence with ambiguity characters
314 *       lbytes[0] == length of this storage
315 *       frees old bytestore
316 *       returns pointer to new one, or NULL on fail.
317 *       lbytes is pointer to ambiguity storage
318 *
319 *****************************************************************************/
320 NLM_EXTERN ByteStorePtr BSRebuildDNA_4na (ByteStorePtr from, Uint4Ptr lbytes);
321 
322 
323 /*****************************************************************************
324 *
325 *   void NaI2TableFree(void)
326 *      Free allocated memory for
327 *      Seq_code_iupacna --> Seq_code_ncbi2na transfer
328 *****************************************************************************/
329 NLM_EXTERN void NaI2TableFree(void);
330 
331 /*****************************************************************************
332 *
333 *   Numbering routines
334 *
335 *****************************************************************************/
336                               /* convert any numbering value to seq offset */
337 NLM_EXTERN Int4 NumberingOffset(NumberingPtr np, DataValPtr avp);
338                               /* convert seq offset to numbering value */
339 NLM_EXTERN Int2 NumberingValue(NumberingPtr np, Int4 offset, DataValPtr avp);
340 NLM_EXTERN Int2 NumberingValueBySeqId(SeqIdPtr sip, Int4 offset, DataValPtr avp);
341 
342 NLM_EXTERN void NumberingDefaultLoad(void);
343 NLM_EXTERN NumberingPtr NumberingDefaultGet(void);
344 
345 /*****************************************************************************
346 *
347 *   SeqEntry and BioseqSet stuff
348 *
349 *****************************************************************************/
350 
351 NLM_EXTERN Uint1 Bioseq_set_class(SeqEntryPtr sep);
352 
353 /*****************************************************************************
354 *
355 *   traversal routines
356 *       SeqEntry - any type
357 *
358 *****************************************************************************/
359 typedef void (* SeqEntryFunc)(SeqEntryPtr sep, Pointer mydata, Int4 index, Int2 indent);
360 NLM_EXTERN Int4 SeqEntryList(SeqEntryPtr sep, Pointer mydata, SeqEntryFunc mycallback, Int4 index, Int2 indent);
361 
362 #define SeqEntryCount( a )  SeqEntryList( a ,NULL,NULL,0,0)   /* SeqEntryList with no mydata and no callback; index and indent start at 0 */
363 #define SeqEntryExplore(a,b,c) SeqEntryList(a, b, c, 0L, 0)   /* a = sep, b = mydata, c = mycallback; index and indent start at 0 */
364 
365 /*****************************************************************************
366  *
367  *   void CorrectGeneFeatLocation(sep, data, n, m)
368  *
369  *	Correct gene location for mRNA sequences, i.e.
370  *   puts start = 0, end = total_length_of_sequence - 1.
371  *
372  *****************************************************************************/
373 NLM_EXTERN void CorrectGeneFeatLocation(SeqEntryPtr sep, Pointer data,
374                              Int4 n, Int2 m);
375 
376 /*****************************************************************************
377 *
378 *   traversal routines
379 *       Bioseq types only - "individual" sequences
380 *       do NOT traverse component parts of segmented or constructed types
381 *
382 *****************************************************************************/
383 NLM_EXTERN Int4 BioseqList(SeqEntryPtr sep, Pointer mydata, SeqEntryFunc mycallback, Int4 index, Int2 indent);
384 
385 #define BioseqCount( a )  BioseqList( a ,NULL,NULL,0,0)   /* BioseqList with no mydata and no callback; index and indent start at 0 */
386 #define BioseqExplore(a,b,c) BioseqList(a, b, c, 0L, 0)   /* a = sep, b = mydata, c = mycallback; index and indent start at 0 */
387 
388 /*****************************************************************************
389 *
390 *   Get parts routines
391 *
392 *****************************************************************************/
393                        /* gets next Seqdescr after curr in sep of type type */
394 NLM_EXTERN ValNodePtr SeqEntryGetSeqDescr(SeqEntryPtr sep, Int2 type, ValNodePtr curr);
395                        /* gets first title from sep */
396 NLM_EXTERN CharPtr SeqEntryGetTitle(SeqEntryPtr sep);
397 
398 /*****************************************************************************
399 *
400 *   Manipulations
401 *
402 *****************************************************************************/
403 
404 NLM_EXTERN Boolean SeqEntryConvert(SeqEntryPtr sep, Uint1 newcode);
405 #define SeqEntryPack(x) SeqEntryConvert(x, (Uint1)0)   /* newcode 0; presumably selects packing rather than a specific target code - confirm in sequtil.c */
406 
407 
408 /*****************************************************************************
409 *
410 *   SeqLoc stuff
411 *   NOTE(review): PRINTID_* look like the "format" values for SeqIdPrint/SeqIdWrite - confirm
412 *****************************************************************************/
413 #define PRINTID_FASTA_SHORT ( (Uint1)1)
414 #define PRINTID_FASTA_LONG ( (Uint1)2)
415 #define PRINTID_TEXTID_LOCUS ( (Uint1)3)
416 #define PRINTID_TEXTID_ACCESSION ( (Uint1)4)
417 #define PRINTID_TEXTID_ACC_VER ( (Uint1)5)
418 #define PRINTID_TEXTID_ACC_ONLY ( (Uint1)6)
419 #define PRINTID_REPORT ( (Uint1)7)
420 #define PRINTID_FASTA_GENERAL ( (Uint1)8)
421 #define PRINTID_FASTA_ALL ( (Uint1)9)
422 
423 
424 /*****************************************************************************
425 *
426 *   SeqIdPtr SeqIdLocate (sip, order, num)
427 *   	Given a SeqId (sip):
428 *   		Locates the Bioseq in memory or cached
429 *   		Then calls SeqIdSelect with the Bioseq.id chain to find the
430 *             SeqId type you want.
431 *
432 *****************************************************************************/
433 NLM_EXTERN SeqIdPtr SeqIdLocate(SeqIdPtr sip, Uint1Ptr order, Int2 num);
434 
435 /*****************************************************************************
436 *
437 *   SeqIdPtr SeqIdSelect (sip, order, num)
438 *   	takes an array (order) num long.
439 *   	goes down chain starting with sip.
440 *       finds lowest value of order[sip->choice] and returns it.
441 *       if order[] == 255, it is skipped.
442 *       if nothing is found < 255, NULL is returned
443 *   	ErrorMessage if sip->choice >= num
444 *
445 *****************************************************************************/
446 NLM_EXTERN SeqIdPtr SeqIdSelect(SeqIdPtr sip, Uint1Ptr order, Int2 num);
447 
448 NLM_EXTERN Int2 SeqIdBestRank(Uint1Ptr buf, Int2 num);
449 NLM_EXTERN SeqIdPtr SeqIdFindBest(SeqIdPtr sip, Uint1 target);
450 NLM_EXTERN SeqIdPtr SeqIdFindBestAccession (SeqIdPtr sip);
451 NLM_EXTERN CharPtr SeqIdPrint(SeqIdPtr sip, CharPtr buf, Uint1 format);
452 NLM_EXTERN CharPtr SeqIdWrite(SeqIdPtr sip, CharPtr buf, Uint1 format, Uint4 buflen);
453 NLM_EXTERN Int4 SeqIdLabelLen (SeqIdPtr isip, Uint1 format);
454 NLM_EXTERN CharPtr SeqIdWholeLabel (SeqIdPtr isip, Uint1 format);
455 NLM_EXTERN Boolean GetAccessionFromSeqId(SeqIdPtr sip, BIG_ID_PNTR gi,
456 				     CharPtr PNTR id);
457 NLM_EXTERN Boolean GetAccessionVersionFromSeqId(SeqIdPtr sip, BIG_ID_PNTR gi,
458                                      CharPtr PNTR id, Boolean get_version);
459 NLM_EXTERN SeqIdPtr SeqIdParse(CharPtr buf);
460 
461 /*****************************************************************************
462 *
463 *   Int2 ValidateAccn (accession)
464 *   Int2 ValidateAccnDotVer (accession)
465 *   Int2 ValidateSeqID (SeqIdPtr)
466 *   	Return values are:
467 *   	 0: no problem - Accession is in proper format
468 *       -1: Accession did not start with a letter (or two or four letters)
469 *       -2: Accession did not contain legal number of digits after letters
470 *       -3: the original Accession number to be validated was NULL
471 *   	-4: the original Accession number is too long (>16)
472 *   	-5: missing version number (required by ValidateAccnDotVer)
473 *   	-6: Bad version number (required by ValidateAccnDotVer)
474 *
475 *****************************************************************************/
476 
477 NLM_EXTERN Int2 ValidateAccn (CharPtr accession);
478 NLM_EXTERN Int2 ValidateAccnDotVer (CharPtr accession);
479 NLM_EXTERN Int2 ValidateSeqID (SeqIdPtr sip);
480 
481 /*****************************************************************************
482 *
483 *   MakeNewProteinSeqId(SeqLocPtr slp, SeqIdPtr sip)
484 *   	Makes a new protein SeqId, attempting to keep it unique
485 *       Tries to match it to the input seqid type
486 *       slp is the location on the DNA of the coding region making the protein
487 *       sip is the SeqId of the DNA coding for the protein
488 *       if (sip != NULL) uses it for a "base" first
489 *       else if (slp != NULL) uses a SeqId from it for a base
490 *       else base is the string tmpprot
491 *
492 *       id is then base_X where X is a number assigned as a serial number
493 *       the returned id is guaranteed to be unique among all Bioseqs currently
494 *       loaded in memory.
495 *
496 *   MakeNewProteinSeqIdEx(SeqLocPtr slp, SeqIdPtr sip, CharPtr prefix, Int2Ptr ctrptr)
497 *   	Allows you to indicate a starting count for the X in base_X, and returns
498 *       the next count for improved speed when allocating many protein bioseqs
499 *
500 *****************************************************************************/
501 NLM_EXTERN SeqIdPtr LIBCALL MakeNewProteinSeqIdExMT(SeqLocPtr slp, SeqIdPtr sip, CharPtr prefix, Int2Ptr ctrptr, Boolean is_MT_safe);
502 NLM_EXTERN SeqIdPtr LIBCALL MakeNewProteinSeqIdEx(SeqLocPtr slp, SeqIdPtr sip, CharPtr prefix, Int2Ptr ctrptr);
503 NLM_EXTERN SeqIdPtr LIBCALL MakeNewProteinSeqId(SeqLocPtr slp, SeqIdPtr sip);
504 NLM_EXTERN ObjectIdPtr UniqueLocalId(void);
505 
506 /*****************************************************************************
507 *
508 *   Boolean BioseqMatch(bsp, seqid)
509 *       returns TRUE if bsp points to the Bioseq identified by seqid
510 *
511 *****************************************************************************/
512 NLM_EXTERN Boolean BioseqMatch(BioseqPtr bsp, SeqIdPtr sip);
513 
514 NLM_EXTERN BioseqPtr BioseqFindInSeqEntry(SeqIdPtr sip, SeqEntryPtr sep);
515 
516 /*****************************************************************************
517 *
518 *   Boolean SeqIdMatch(a, b)
519 *   	returns TRUE if SeqIds could be compared and are the same
520 *       returns FALSE both if SeqIds could not be compared OR if they were
521 *                        compared but are different
522 *
523 *   WARNING!!!! use SeqIdComp() instead of SeqIdMatch() in most cases
524 *
525 *  The code here must work the same as in two idloader
526 *  contexts: function id_flatten_seq_obj (idsybase.c)
527 *  and proc id_id_flatten_seq_obj
528 *
529 *****************************************************************************/
530 NLM_EXTERN Boolean SeqIdMatch(SeqIdPtr a, SeqIdPtr b);
531 
532 /*****************************************************************************
533 *
534 *   SeqIdComp(a, b)
535 *   	Compares a to b and returns
536 *
537 *   SIC_DIFF   = different types, could not be compared
538 *   SIC_NO     = types could be compared, and ids are different
539 *   SIC_YES    = types could be compared, and ids are the same
540 *
541 *****************************************************************************/
542 NLM_EXTERN Uint1 SeqIdComp(SeqIdPtr a, SeqIdPtr b);
543 #define SIC_DIFF 1   /* different types, could not be compared */
544 #define SIC_NO 0   /* types could be compared, and ids are different */
545 #define SIC_YES 2   /* types could be compared, and ids are the same */
546 
547 /*************************
548    SeqIdForSameBioseq(a,b)
549    tries to locate all ids for a or b and determine
550    if (a and b refer to the same Bioseq)
551 **************************/
552 NLM_EXTERN Boolean SeqIdForSameBioseq(SeqIdPtr a, SeqIdPtr b);
553 
554 /*************************
555  *      Boolean SeqIdIn (a,b)
556  *   returns TRUE if a in list of b
557  ******************/
558 NLM_EXTERN Boolean SeqIdIn(SeqIdPtr a, SeqIdPtr b);
559 
560 
561 /*****************************************************************************
562 *
563 *   SeqLocFindNext()
564 *     just calls SeqLocFindPart(seqlochead, currseqloc, EQUIV_IS_MANY)
565 *
566 *****************************************************************************/
567 NLM_EXTERN SeqLocPtr SeqLocFindNext(SeqLocPtr seqlochead, SeqLocPtr currseqloc);
568 
569 /*****************************************************************************
570 *
571 *   SeqLocFindPart(seqlochead, currseqloc, equiv_status)
572 *       finds the next Seq-loc after currseqloc
573 *       seqlochead is the first of a chain of Seq-locs
574 *       equiv_status defines how to treat SEQLOC_EQUIV
575 *         EQUIV_IS_MANY = treat same as SEQLOC_MIX
576 *         EQUIV_IS_ONE = return SEQLOC_EQUIV as one Seq-loc
577 *         FIRST_EQUIV_IS_MANY = if seqlochead is a SEQLOC_EQUIV, enter the
578 *            chain of Seq-locs, but treat any later EQUIVs as
579 *            EQUIV_IS_ONE.
580 *
581 *****************************************************************************/
582 NLM_EXTERN SeqLocPtr SeqLocFindPart(SeqLocPtr seqlochead, SeqLocPtr currseqloc, Uint1 equiv_status);
583 
584 #define EQUIV_IS_MANY 0   /* treat SEQLOC_EQUIV same as SEQLOC_MIX */
585 #define EQUIV_IS_ONE 1	  /* treat SEQLOC_EQUIV as one Seq-loc */
586 #define FIRST_EQUIV_IS_MANY 2 /* treat only first EQUIV as SEQLOC_MIX */
587 
588 NLM_EXTERN Boolean IS_one_loc(SeqLocPtr anp, Boolean equiv_is_one);  /* for SeqLoc */
589 
590 NLM_EXTERN Int4 SeqLocStart(SeqLocPtr seqloc);
591 NLM_EXTERN Int4 SeqLocStop(SeqLocPtr seqloc);
592 NLM_EXTERN Uint1 SeqLocStrand(SeqLocPtr seqloc);
593 NLM_EXTERN Int4 SeqLocLen(SeqLocPtr seqloc);
594 NLM_EXTERN Int4 SeqLocGetSegLens(SeqLocPtr slp, Int4Ptr lens, Int4 ctr, Boolean gaps);
595 #define SeqLocCountSegs(x) SeqLocGetSegLens(x, NULL,0,FALSE)   /* lens = NULL, ctr = 0, gaps = FALSE */
596 #define SeqLocGetGaps(x) SeqLocGetSegLens(x,NULL,0,TRUE)   /* lens = NULL, ctr = 0, gaps = TRUE */
597 NLM_EXTERN SeqIdPtr SeqLocId(SeqLocPtr seqloc);
598 NLM_EXTERN Uint1 StrandCmp(Uint1 strand);
599 NLM_EXTERN Boolean SeqLocRevCmp(SeqLocPtr anp);
600 
601 /**** defines for "which_end" below ****/
602 
603 #define SEQLOC_LEFT_END  1    /* low numbered end of SeqLoc */
604 #define SEQLOC_RIGHT_END 2    /* high numbered end of SeqLoc */
605 #define SEQLOC_START     3	  /* beginning of SeqLoc (low on plus, high on minus)  */
606 #define SEQLOC_STOP      4	  /* end of SeqLoc (high on plus, low on minus)  */
607 
608 NLM_EXTERN Int4 GetOffsetInLoc(SeqLocPtr of, SeqLocPtr in, Uint1 which_end);
609 NLM_EXTERN Int4 GetOffsetInBioseq(SeqLocPtr of, BioseqPtr in, Uint1 which_end);
610 NLM_EXTERN Int4 GetOffsetInBioseqEx (SeqLocPtr of, BioseqPtr in, Uint1 which_end, Boolean is_circular, Boolean relaxed);
611 NLM_EXTERN void GetLeftAndRightOffsetsInBioseq (SeqLocPtr of, BioseqPtr in, Int4Ptr left, Int4Ptr right, Boolean is_circular, Boolean relaxed, BoolPtr left_flip, BoolPtr right_flip );
612 NLM_EXTERN Int2 SeqLocOrder(SeqLocPtr a, SeqLocPtr b, BioseqPtr in);
613 
614 NLM_EXTERN Int2 SeqLocMol(SeqLocPtr seqloc);
615 
616 NLM_EXTERN CharPtr SeqLocPrint(SeqLocPtr slp);
617 NLM_EXTERN CharPtr SeqLocPrintUseBestID(SeqLocPtr slp);
618 
619 /*****************************************************************************
620 *
621 *   SeqLocCompare(a, b)
622 *   	returns
623 *   	0 = no overlap
624 *   	1 = a is completely contained in b
625 *   	2 = b is completely contained in a
626 *   	3 = a == b
627 *   	4 = a and b overlap, but neither completely contained in the other
628 *
629 *****************************************************************************/
630 NLM_EXTERN Int2 SeqLocCompare(SeqLocPtr a, SeqLocPtr b);
631 #define SLC_NO_MATCH 0   /* no overlap */
632 #define SLC_A_IN_B 1   /* a is completely contained in b */
633 #define SLC_B_IN_A 2   /* b is completely contained in a */
634 #define SLC_A_EQ_B 3   /* a == b */
635 #define SLC_A_OVERLAP_B 4   /* a and b overlap, but neither completely contained in the other */
636 NLM_EXTERN Int2 SeqLocCompareEx (SeqLocPtr a, SeqLocPtr b, Boolean compare_strand);
637 
638 NLM_EXTERN Boolean UnitTestSeqLocCompare (void);
639 
640 /*****************************************************************************
641 *
642 *   SeqLocAinB(a, b)
643 *      if a is completely contained in b, a non-negative number is returned
644 *         if 0, a is identical with b
645 *         if not 0, is the number of residues bigger b is than a
646 *      if a negative number is returned, a is not contained in b
647 *         could overlap or not
648 *      used to find features contained in genes
649 *
650 *****************************************************************************/
651 NLM_EXTERN Int4 SeqLocAinB(SeqLocPtr a, SeqLocPtr b);
652 
653 NLM_EXTERN Boolean SeqIntCheck(SeqIntPtr sip);   /* checks for valid interval */
654 NLM_EXTERN Boolean SeqPntCheck(SeqPntPtr spp);  /* checks valid pnt */
655 NLM_EXTERN Boolean PackSeqPntCheck(PackSeqPntPtr pspp);
656 NLM_EXTERN Uint1 SeqLocCheck(SeqLocPtr slp);
657 #define SEQLOCCHECK_OK 2      /* location is fine */
658 #define SEQLOCCHECK_WARNING 1   /* location ok, but has mixed strands */
659 #define SEQLOCCHECK_ERROR 0     /* error in location */
660 /*****************************************************************************
661 *
662 *   SeqLocPartialCheck(head)
663 *       sets bits for incomplete location and/or errors
664 *       incomplete defined as Int-fuzz on start or stop with
665 *         lim.unk, lim.gt, or lim.lt set
666 *
667 * SLP_COMPLETE = not partial and no errors
668 * SLP_START = incomplete on start (high number on minus strand, low on plus)
669 * SLP_STOP     = incomplete on stop
670 * SLP_INTERNAL = lim set on internal intervals
671 * SLP_OTHER    = partial location, but no details available
672 * SLP_NOSTART  = start does not include end of sequence
673 * SLP_NOSTOP   = stop does not include end of sequence
674 * SLP_NOINTERNAL = internal interval not on end of sequence
675 * SLP_LIM_WRONG  = lim gt/lt used inconsistently with position in location
676 *
677 * SLP_HAD_ERROR  = if AND with return, is TRUE if any errors encountered
678 *
679 *****************************************************************************/
680 
681 #define SLP_COMPLETE	0
682 #define SLP_START		1
683 #define SLP_STOP		2
684 #define SLP_INTERNAL	4
685 #define SLP_OTHER		8
686 #define SLP_NOSTART		16
687 #define SLP_NOSTOP		32
688 #define SLP_NOINTERNAL	64
689 #define SLP_LIM_WRONG	128
690 
691 #define SLP_HAD_ERROR   240   /* = 16 + 32 + 64 + 128: the SLP_NOSTART, SLP_NOSTOP, SLP_NOINTERNAL and SLP_LIM_WRONG bits */
692 
693 NLM_EXTERN Uint2 SeqLocPartialCheck(SeqLocPtr head);
694 NLM_EXTERN Uint2 SeqLocPartialCheckEx (SeqLocPtr head, Boolean farFetch);
695 
696 /*
697     FreeSeqLocSetComponents loops through a chain of SeqLocs and frees
698     the referenced components.  Call SeqLocSetFree to the list itself.
699 */
700 
701 NLM_EXTERN void FreeSeqLocSetComponents (SeqLocPtr list);
702 
703 NLM_EXTERN CharPtr TaxNameFromCommon(CharPtr common);
704 
705 /*****************************************************************************
706 *
707 *   QualLocCreate(from, to)
708 *   	creates a UserObject of _class NCBI, type 1
709 *       adds a field of type "qual_loc"
710 *       puts the from and to numbers in
711 *       These should be offsets, as in a Seq-loc, not numbers starting from
712 *           one.
713 *       no range check, no strand, no seqid
714 *       this just carries locations for the qualifiers anticodon and rpt_unit
715 *       Intended to go on SeqFeat.ext
716 *
717 *****************************************************************************/
718 NLM_EXTERN UserObjectPtr QualLocCreate(Int4 from, Int4 to);
719 
720 /*****************************************************************************
721 *
722 *   QualLocWrite(uop, buf)
723 *   	Checks a SeqFeat.ext to see if it is
724 *   		1) not null
725 *           2) has a UserObject of _class NCBI, type 1
726 *           3) has a field of label "qual_loc"
727 *           4) if so, prints the two integers as a qualifier location
728 *               from..to and returns a pointer to the \0 after "to"
729 *           Adds 1 to the internal numbers to convert from offset to
730 *               number starting with 1
731 *       If any of the above fail, returns NULL
732 *
733 *****************************************************************************/
734 NLM_EXTERN CharPtr QualLocWrite(UserObjectPtr uop, CharPtr buf);
735 
736 /*****************************************************************************
737 *
738 *   EntrezASN1Detected detects records retrieved from Entrez, which should
739 *       not be edited by Sequin and replaced into ID.
740 *
741 *****************************************************************************/
742 
743 NLM_EXTERN Boolean EntrezASN1Detected (SeqEntryPtr sep);
744 
745 /*****************************************************************************
746 *
747 *   SeqLocIntNew(Int4 from, Int4 to, Uint1 strand, SeqIdPtr sip)
748 *      creates a new SeqLoc of type SeqInt
749 *      makes copy of incoming SeqId
750 *
751 *****************************************************************************/
752 NLM_EXTERN SeqLocPtr LIBCALL SeqLocIntNew (Int4 from, Int4 to, Uint1 strand, SeqIdPtr sip);
753 
754 /*****************************************************************************
755 *
756 *   SeqLocPntNew(Int4 pos, Uint1 strand, SeqIdPtr sip, Boolean is_fuzz)
757 *      creates a new SeqLoc of type SeqPnt
758 *      makes copy of incoming SeqId
759 *
760 *****************************************************************************/
761 NLM_EXTERN SeqLocPtr LIBCALL SeqLocPntNew (Int4 pos, Uint1 strand, SeqIdPtr sip, Boolean is_fuzz);
762 
763 /*****************************************************************************
764 *
765 *   SeqLocPtr FindSpliceSites(SeqEntryPtr sep, Boolean findOnProtein)
766 *      Finds the splice sites on this SeqEntry and returns them as a
767 *      SeqLoc.
768 *
769 *****************************************************************************/
770 NLM_EXTERN SeqLocPtr LIBCALL FindSpliceSites(SeqEntryPtr sep, Boolean findOnProtein);
771 
772 /***************************************************************************
773 **
774 *
775 *   SeqFeatPtr FindCodingRegion(SeqEntryPtr sep)
776 *      Finds the coding region feature on this protein SeqEntry and
777 *      returns a copy of it.
778 *
779 ****************************************************************************
780 */
781 NLM_EXTERN SeqFeatPtr LIBCALL FindCodingRegion(SeqEntryPtr sep);
782 
783 /*****************************************************************************
784 *
785 *   Boolean LIBCALL SeqEntryContainsSeqIdOfMolType(SeqEntryPtr sep, SeqIdPtr sip, Boolean isProtein)
786 *      Tests to see if this SeqEntry contains a bioseq of the specified moltype
787 *        (protein or DNA)
788 *      if sip != NULL then it also insists upon finding a bioseq of the
789 *        specified moltype where the SeqIds match
790 *
791 *****************************************************************************/
792 NLM_EXTERN Boolean LIBCALL SeqEntryContainsSeqIdOfMolType(SeqEntryPtr sep, SeqIdPtr sip, Boolean isProtein);
793 
794 /*****************************************************************************
795 *
796 *      Tests to see if this SeqEntry contains a bioseq of the specified uid
797 *      returns moltype of the bioseq where the SeqIds match
798 *			  0     id not found in this SeqEntry
799 *		      1     Amino Acid sequence
800 *			  2     Nucleotide sequence
801 *
802 *****************************************************************************/
803 NLM_EXTERN Int2 LIBCALL MolTypeForGI(SeqEntryPtr sep, Int4 uid);
804 
805 /* moved from jzmisc.h */
806 NLM_EXTERN Boolean seqid_name(SeqIdPtr, CharPtr, Boolean, Boolean);
807 NLM_EXTERN Boolean MuskSeqIdWrite(SeqIdPtr sip, CharPtr buf, Int2 buflen, Uint1 format, Boolean do_find, Boolean do_entrez_find);
808 NLM_EXTERN SeqIdPtr local_id_make(CharPtr);
809 NLM_EXTERN SeqLocPtr update_seq_loc(Int4, Int4, Uint1, SeqLocPtr );
810 NLM_EXTERN SeqIdPtr LIBCALL TxGetSubjectIdFromSeqAlign(SeqAlignPtr seqalign);
811 NLM_EXTERN SeqIdPtr LIBCALL TxGetQueryIdFromSeqAlign(SeqAlignPtr seqalign);
812 NLM_EXTERN Boolean LIBCALL GetScoreAndEvalue(
813                     SeqAlignPtr seqalign, Int4 *score,
814                     Nlm_FloatHi *bit_score,
815                     Nlm_FloatHi *evalue, Int4 *number
816 );
817 
818 /***********************************************************************
819 *
820 *       Adjust the Offset in the SeqAlign to correspond to the beginning
821 *       of the sequence and not where BLAST (or some other tool) started.
822 *
823 **********************************************************************/
824 
825 NLM_EXTERN void LIBCALL AdjustOffSetsInSeqAlign(SeqAlignPtr salp, SeqLocPtr slp1, SeqLocPtr slp2);
826 
827 
828 /* Used with SeqEntryExplore to find Bioseq's in a SeqEntry. */
829 NLM_EXTERN void FindNuc(SeqEntryPtr sep, Pointer data, Int4 index, Int2 indent);
830 NLM_EXTERN void FindProt(SeqEntryPtr sep, Pointer data, Int4 index, Int2 indent);
831 
832 /*****************************************************************************
833 *
834 *   Boolean SeqIdOrderInList(a, b)
835 *     Looks for single SeqId, "a" in chain of SeqIds, "b"
836 *     returns the position (>0) if found.. else returns 0;
837 *
838 *****************************************************************************/
839 
840 NLM_EXTERN Uint4 LIBCALL SeqIdOrderInList (SeqIdPtr a, SeqIdPtr list);
841 
842 /*****************************************************************************
843 *
844 *   Boolean SeqIdOrderInBioseqIdList(a, b)
845 *     Looks for single SeqId, "a" in chain of SeqIds, "b"
846 *              and looks at all synonymous SeqIds of the Bioseq "b"
847 *     returns the position (>0) if found.. else returns 0;
848 *
849 *****************************************************************************/
850 NLM_EXTERN Uint4 LIBCALL SeqIdOrderInBioseqIdList (SeqIdPtr a, SeqIdPtr list);
851 
852 /* Function to extract the Accession and version number
853    User must provide string buffers for answer.
854    */
855 NLM_EXTERN void LIBCALL ExtractAccession(CharPtr accn,CharPtr accession,CharPtr version);
856 
857 
858 
859 /*
860   Function to make a proper type SeqId given a string that represents
861   an accession Number
862   User must Call ExtractAccession function separately before calling this.
863   to split accession and version number.
864 */
865 NLM_EXTERN SeqIdPtr LIBCALL SeqIdFromAccession(CharPtr accession, Uint4 version,CharPtr name);
866 
867     /* Variant that also work with PIR accessions and LOCUS names
868        .. and can resolve conflict with network access (if pre-enabled)
869      */
870     NLM_EXTERN  SeqIdPtr LIBCALL SeqIdFromAccessionEx(CharPtr accession, Uint4 version,CharPtr name,Boolean Permissive, Boolean AllowPIR,Boolean UseNetwork,Boolean FavorNucleotide);
871 
872 /* Variant of SeqIdFromAccession that works on accession.version string */
873 
874 NLM_EXTERN SeqIdPtr SeqIdFromAccessionDotVersion (CharPtr accession);
875 
876 
877     /*
878       Following functions and defines moved from accutils.ch
879       */
880 NLM_EXTERN Uint4 LIBCALL WHICH_db_accession (CharPtr s);
881 NLM_EXTERN Boolean LIBCALL IS_ntdb_accession (CharPtr s);
882 NLM_EXTERN Boolean LIBCALL IS_protdb_accession (CharPtr s);
883 NLM_EXTERN Boolean LIBCALL ACCN_PIR_FORMAT( CharPtr s);
884 NLM_EXTERN Boolean LIBCALL ACCN_1_5_FORMAT( CharPtr s);
885 NLM_EXTERN Boolean LIBCALL AccnIsSWISSPROT( CharPtr s);
886 NLM_EXTERN Boolean LIBCALL AccnIsUniProt (CharPtr s);
887 NLM_EXTERN Boolean LIBCALL NAccnIsGENBANK (CharPtr s);
888 NLM_EXTERN Boolean LIBCALL NAccnIsEMBL (CharPtr s);
889 NLM_EXTERN Boolean LIBCALL NAccnIsDDBJ (CharPtr s);
890 
891 
/*
  #defines and macros for WHICH_ntdb_accession and
                          WHICH_protdb_accession
  (the interpretation macros further down describe these same codes as the
  values returned by WHICH_db_accession).

 The "divisions" implied by the following #defines are not all-inclusive:
   a GSS or EST sequence submitted through DIRSUB will have the
   ACCN_NCBI_DIRSUB code.
   A sequence can very well be in the GSS, EST, etc. division
   without having the corresponding accession number if it was submitted
   through DIRSUB.

*/
#define ACCN_UNKNOWN 0

#define ACCN_AMBIGOUS_DB 2 /* Primary can be from any Nucleotide database */
#define ACCN_SWISSPROT 3
#define ACCN_NCBI_PROT 4
#define ACCN_EMBL_PROT 5
#define ACCN_DDBJ_PROT 6

#define ACCN_GSDB_DIRSUB 7

#define ACCN_NCBI_GSDB  8 /* NCBI-assigned Accn to GSDB records */

#define ACCN_NCBI_EST 9
#define ACCN_NCBI_DIRSUB 10
#define ACCN_NCBI_GENOME 11
#define ACCN_NCBI_PATENT 12 /* Not used .. because all are Ambigous_mol */
#define ACCN_NCBI_HTGS 13
#define ACCN_NCBI_GSS 14
#define ACCN_NCBI_STS 15
#define ACCN_NCBI_BACKBONE 16 /* "S" record, typed from publications */
#define ACCN_NCBI_SEGSET 17
#define ACCN_NCBI_OTHER 18 /* unknown or 'other' nucleotide division */

#define ACCN_EMBL_EST 19
#define ACCN_EMBL_DIRSUB 20
#define ACCN_EMBL_GENOME 21
#define ACCN_EMBL_PATENT 22
#define ACCN_EMBL_HTGS 23 /* Not defined yet */
#define ACCN_EMBL_CON 24
#define ACCN_EMBL_OTHER 25 /* unknown or 'other' nucleotide division */

#define ACCN_DDBJ_EST 26
#define ACCN_DDBJ_DIRSUB 27
#define ACCN_DDBJ_GENOME 28
#define ACCN_DDBJ_PATENT 29
#define ACCN_DDBJ_HTGS 30
#define ACCN_DDBJ_CON 31 /* Not defined*/
#define ACCN_DDBJ_OTHER 32 /* unknown or 'other' nucleotide division */

#define ACCN_REFSEQ_PROT 33
#define ACCN_REFSEQ_mRNA 34
#define ACCN_REFSEQ_CONTIG 35
#define ACCN_REFSEQ_CHROMOSOME 36
#define ACCN_REFSEQ_mRNA_PREDICTED 37
#define ACCN_REFSEQ_PROT_PREDICTED 38
#define ACCN_REFSEQ_GENOMIC 39

#define ACCN_NCBI_cDNA 40
#define ACCN_IS_PROTEIN 41 /* unreserved 3 letter code .. must be protein*/
#define ACCN_IS_NT 42  /* unreserved 1 or 2 letter code .. must be nuc */
#define ACCN_REFSEQ 43  /* unreserved refseq-type two_letters and underscore*/
#define ACCN_EMBL_GB 44
#define ACCN_EMBL_DDBJ 45
#define ACCN_GB_DDBJ 46
#define ACCN_EMBL_GB_DDBJ 47

#define ACCN_NCBI_TPA 48
#define ACCN_NCBI_TPA_PROT 49
#define ACCN_EMBL_TPA 50
#define ACCN_EMBL_TPA_PROT 51
#define ACCN_DDBJ_TPA 52
#define ACCN_DDBJ_TPA_PROT 53

#define ACCN_NCBI_WGS 54
#define ACCN_NCBI_WGS_PROT 55
#define ACCN_EMBL_WGS 56
#define ACCN_EMBL_WGS_PROT 57
#define ACCN_DDBJ_WGS 58
#define ACCN_DDBJ_WGS_PROT 59

#define ACCN_PDB 60

#define ACCN_DDBJ_GSS 61

#define ACCN_NCBI_TSA 62
#define ACCN_NCBI_TSA_PROT 63
#define ACCN_EMBL_TSA 64
#define ACCN_EMBL_TSA_PROT 65
#define ACCN_DDBJ_TSA 66
#define ACCN_DDBJ_TSA_PROT 67

#define ACCN_REFSEQ_ARTIFICIAL_ASSEMBLY 68
#define ACCN_REFSEQ_WGS 69

#define ACCN_NCBI_OPTICAL 70

#define ACCN_NCBI_WGS_TPA 71
#define ACCN_NCBI_WGS_TPA_PROT 72
#define ACCN_EMBL_WGS_TPA 73
#define ACCN_EMBL_WGS_TPA_PROT 74
#define ACCN_DDBJ_WGS_TPA 75
#define ACCN_DDBJ_WGS_TPA_PROT 76

#define ACCN_NCBI_TARGETED 77


/* Some accession prefixes can be either protein or nucleotide,
   such as NCBI PATENT I, AR .. or segmented set Bioseqs 'AH'.
   This flag is a separate bit that can be combined with one of the codes
   above; the macros that use "(c)&65535" strip it before comparing.
*/
#define ACCN_AMBIGOUS_MOL 65536 /* Ambiguous Molecule */

/*
   Macros to interpret the above #define codes returned by
   WHICH_db_accession.

   All of them are plain macros that may evaluate their argument several
   times, so do not pass an expression with side effects.
*/


/*
 Accession definitively points to a protein record.
 The *_TSA_PROT codes are included; leaving them out would also make
 ACCN_IS_NUC report them as nucleotide.
*/
#define ACCN_IS_PROT(c) ( \
    ((c) == ACCN_SWISSPROT)              || \
    ((c) == ACCN_NCBI_PROT)              || \
    ((c) == ACCN_EMBL_PROT)              || \
    ((c) == ACCN_DDBJ_PROT)              || \
    ((c) == ACCN_REFSEQ_PROT)            || \
    ((c) == ACCN_IS_PROTEIN)             || \
    ((c) == ACCN_REFSEQ_PROT_PREDICTED)  || \
    ((c) == ACCN_NCBI_TPA_PROT)          || \
    ((c) == ACCN_EMBL_TPA_PROT)          || \
    ((c) == ACCN_DDBJ_TPA_PROT)          || \
    ((c) == ACCN_NCBI_WGS_PROT)          || \
    ((c) == ACCN_EMBL_WGS_PROT)          || \
    ((c) == ACCN_DDBJ_WGS_PROT)          || \
    ((c) == ACCN_NCBI_TSA_PROT)          || \
    ((c) == ACCN_EMBL_TSA_PROT)          || \
    ((c) == ACCN_DDBJ_TSA_PROT)          || \
    ((c) == ACCN_NCBI_WGS_TPA_PROT)      || \
    ((c) == ACCN_EMBL_WGS_TPA_PROT)      || \
    ((c) == ACCN_DDBJ_WGS_TPA_PROT))

/*
  Accession definitively points to a nucleotide record
   . note that ACCN_dbname_OTHER is a nucleotide.
  True for every code that is not flagged ACCN_AMBIGOUS_MOL, is not
  ACCN_UNKNOWN and is not a protein code.
*/
#define ACCN_IS_NUC(c) ( \
    (((c) & ACCN_AMBIGOUS_MOL) == 0) && \
    ((c) != ACCN_UNKNOWN)            && \
    (!ACCN_IS_PROT(c)))

#define ACCN_IS_AMBIGOUS_MOL(c) (((c) & ACCN_AMBIGOUS_MOL) == ACCN_AMBIGOUS_MOL)

/*
   Detects GenBank accessions: GenBank-subsumed GSDB accession numbers
   are defined to be GenBank's, as are GSDB DIRSUB records.
*/
#define ACCN_IS_GENBANK(c) ( \
    (((c) & 65535) == ACCN_NCBI_GSDB)     || \
    (((c) & 65535) == ACCN_GSDB_DIRSUB)   || \
    (((c) & 65535) == ACCN_NCBI_EST)      || \
    (((c) & 65535) == ACCN_NCBI_DIRSUB)   || \
    (((c) & 65535) == ACCN_NCBI_GENOME)   || \
    (((c) & 65535) == ACCN_NCBI_PATENT)   || \
    (((c) & 65535) == ACCN_NCBI_HTGS)     || \
    (((c) & 65535) == ACCN_NCBI_GSS)      || \
    (((c) & 65535) == ACCN_NCBI_STS)      || \
    (((c) & 65535) == ACCN_NCBI_BACKBONE) || \
    (((c) & 65535) == ACCN_NCBI_SEGSET)   || \
    (((c) & 65535) == ACCN_NCBI_WGS)      || \
    (((c) & 65535) == ACCN_NCBI_OTHER)    || \
    (((c) & 65535) == ACCN_NCBI_OPTICAL)  || \
    (((c) & 65535) == ACCN_NCBI_PROT)     || \
    (((c) & 65535) == ACCN_NCBI_cDNA)     || \
    (((c) & 65535) == ACCN_NCBI_TSA)      || \
    (((c) & 65535) == ACCN_NCBI_TSA_PROT) || \
    (((c) & 65535) == ACCN_EMBL_GB)       || \
    (((c) & 65535) == ACCN_EMBL_GB_DDBJ)  || \
    (((c) & 65535) == ACCN_GB_DDBJ))

/* XM_,NP_,NM_,NT_,NC_ reference sequence records created and curated by NCBI
   REFSEQ project
*/
#define ACCN_IS_REFSEQ(c) ( \
    ((c) == ACCN_REFSEQ_PROT)                 || \
    ((c) == ACCN_REFSEQ_mRNA)                 || \
    ((c) == ACCN_REFSEQ_CONTIG)               || \
    ((c) == ACCN_REFSEQ_CHROMOSOME)           || \
    ((c) == ACCN_REFSEQ_mRNA_PREDICTED)       || \
    ((c) == ACCN_REFSEQ_PROT_PREDICTED)       || \
    ((c) == ACCN_REFSEQ_GENOMIC)              || \
    ((c) == ACCN_REFSEQ_ARTIFICIAL_ASSEMBLY)  || \
    ((c) == ACCN_REFSEQ_WGS)                  || \
    (((c) & 65535) == ACCN_REFSEQ))

#define ACCN_IS_TPA(c) ( \
    ((c) == ACCN_NCBI_TPA)           || \
    ((c) == ACCN_NCBI_TPA_PROT)      || \
    ((c) == ACCN_EMBL_TPA)           || \
    ((c) == ACCN_EMBL_TPA_PROT)      || \
    ((c) == ACCN_DDBJ_TPA)           || \
    ((c) == ACCN_DDBJ_TPA_PROT)      || \
    ((c) == ACCN_NCBI_WGS_TPA)       || \
    ((c) == ACCN_NCBI_WGS_TPA_PROT)  || \
    ((c) == ACCN_EMBL_WGS_TPA)       || \
    ((c) == ACCN_EMBL_WGS_TPA_PROT)  || \
    ((c) == ACCN_DDBJ_WGS_TPA)       || \
    ((c) == ACCN_DDBJ_WGS_TPA_PROT))

#define ACCN_IS_WGS(c) ( \
    ((c) == ACCN_NCBI_WGS)           || \
    ((c) == ACCN_NCBI_WGS_PROT)      || \
    ((c) == ACCN_EMBL_WGS)           || \
    ((c) == ACCN_EMBL_WGS_PROT)      || \
    ((c) == ACCN_DDBJ_WGS)           || \
    ((c) == ACCN_DDBJ_WGS_PROT)      || \
    ((c) == ACCN_REFSEQ_WGS)         || \
    ((c) == ACCN_NCBI_WGS_TPA)       || \
    ((c) == ACCN_NCBI_WGS_TPA_PROT)  || \
    ((c) == ACCN_EMBL_WGS_TPA)       || \
    ((c) == ACCN_EMBL_WGS_TPA_PROT)  || \
    ((c) == ACCN_DDBJ_WGS_TPA)       || \
    ((c) == ACCN_DDBJ_WGS_TPA_PROT))

#define ACCN_IS_TSA(c) ( \
    ((c) == ACCN_NCBI_TSA)       || \
    ((c) == ACCN_NCBI_TSA_PROT)  || \
    ((c) == ACCN_EMBL_TSA)       || \
    ((c) == ACCN_EMBL_TSA_PROT)  || \
    ((c) == ACCN_DDBJ_TSA)       || \
    ((c) == ACCN_DDBJ_TSA_PROT))

#define ACCN_IS_NCBI(c) ( \
    ACCN_IS_REFSEQ((c))              || \
    ACCN_IS_GENBANK((c))             || \
    ((c) == ACCN_NCBI_TPA)           || \
    ((c) == ACCN_NCBI_TPA_PROT)      || \
    ((c) == ACCN_NCBI_WGS)           || \
    ((c) == ACCN_NCBI_WGS_PROT)      || \
    ((c) == ACCN_NCBI_TSA)           || \
    ((c) == ACCN_NCBI_WGS_TPA)       || \
    ((c) == ACCN_NCBI_WGS_TPA_PROT)  || \
    ((c) == ACCN_NCBI_TARGETED))

/*
  Macro to detect EMBL accession numbers  (can also belong to another DB)
  NOTE(review): the EMBL TPA, TSA and WGS_PROT codes are not listed here;
  confirm whether that omission is intended.
 */
#define ACCN_IS_EMBL(c) ( \
    (((c) & 65535) == ACCN_EMBL_EST)           || \
    (((c) & 65535) == ACCN_EMBL_DIRSUB)        || \
    (((c) & 65535) == ACCN_EMBL_GENOME)        || \
    (((c) & 65535) == ACCN_EMBL_PATENT)        || \
    (((c) & 65535) == ACCN_EMBL_HTGS)          || \
    (((c) & 65535) == ACCN_EMBL_CON)           || \
    (((c) & 65535) == ACCN_EMBL_WGS)           || \
    (((c) & 65535) == ACCN_EMBL_OTHER)         || \
    (((c) & 65535) == ACCN_EMBL_PROT)          || \
    (((c) & 65535) == ACCN_EMBL_GB)            || \
    (((c) & 65535) == ACCN_EMBL_DDBJ)          || \
    (((c) & 65535) == ACCN_EMBL_GB_DDBJ)       || \
    (((c) & 65535) == ACCN_EMBL_WGS_TPA)       || \
    (((c) & 65535) == ACCN_EMBL_WGS_TPA_PROT))

/*
  Macro to detect DDBJ accession numbers  (can also belong to another DB)
  The WGS TPA entries are the DDBJ codes, mirroring ACCN_IS_EMBL, which
  lists the EMBL ones.
  NOTE(review): the DDBJ TPA, TSA and WGS_PROT codes are not listed here;
  confirm whether that omission is intended.
 */
#define ACCN_IS_DDBJ(c) ( \
    (((c) & 65535) == ACCN_DDBJ_EST)           || \
    (((c) & 65535) == ACCN_DDBJ_DIRSUB)        || \
    (((c) & 65535) == ACCN_DDBJ_GENOME)        || \
    (((c) & 65535) == ACCN_DDBJ_PATENT)        || \
    (((c) & 65535) == ACCN_DDBJ_HTGS)          || \
    (((c) & 65535) == ACCN_DDBJ_CON)           || \
    (((c) & 65535) == ACCN_DDBJ_WGS)           || \
    (((c) & 65535) == ACCN_DDBJ_OTHER)         || \
    (((c) & 65535) == ACCN_DDBJ_PROT)          || \
    (((c) & 65535) == ACCN_DDBJ_GSS)           || \
    (((c) & 65535) == ACCN_GB_DDBJ)            || \
    (((c) & 65535) == ACCN_EMBL_DDBJ)          || \
    (((c) & 65535) == ACCN_EMBL_GB_DDBJ)       || \
    (((c) & 65535) == ACCN_DDBJ_WGS_TPA)       || \
    (((c) & 65535) == ACCN_DDBJ_WGS_TPA_PROT))

#define ACCN_IS_SWISSPROT(c) ((c) == ACCN_SWISSPROT)
/*
   Detects the few accession numbers (N000*-N1*) that have been assigned to
   many databases .. as well as unassigned accessions.
*/
#define ACCN_IS_AMBIGOUSDB(c) ( \
    (((c) & 65535) == ACCN_AMBIGOUS_DB)   || \
    ((c) == ACCN_IS_PROTEIN)              || \
    ((c) == ACCN_IS_NT)                   || \
    (((c) & 65535) == ACCN_EMBL_GB)       || \
    (((c) & 65535) == ACCN_EMBL_DDBJ)     || \
    (((c) & 65535) == ACCN_GB_DDBJ)       || \
    (((c) & 65535) == ACCN_EMBL_GB_DDBJ))
    /*
      Does not resemble any accession type (with the possible exception
      of PIR .. but ACCN_PIR_FORMAT() must be called to check that).
      The argument is parenthesized so that an expression such as
      "code & 65535" is compared as a whole.
     */
#define ACCN_IS_UNKNOWN(c) ((c) == ACCN_UNKNOWN)
    /* Unassigned : is of 3+5 (proteins) OR
                          2+5 (nucleotides) OR
                          [A-Z][A-Z]_ (refseq type)
                          format, but
       has not been formally assigned (hardcoded)
    */
#define ACCN_IS_UNASSIGNED(c) ( \
    ((c) == ACCN_IS_PROTEIN)  || \
    ((c) == ACCN_IS_NT)       || \
    ((c) == ACCN_UNKNOWN)     || \
    ((c) == ACCN_REFSEQ))
1068 
1069 /*
1070   Try to Find if the Bioseq represented by a SeqId is a SeqLoc List;
1071   May fetch the Bioseq to get all the synonymous SeqIds.
1072  */
1073 
1074 NLM_EXTERN Boolean LIBCALL SeqIdInSeqLocList(SeqIdPtr sip, ValNodePtr list);
1075 
1076 NLM_EXTERN SeqIdPtr     AddSeqId (SeqIdPtr *sip_head, SeqIdPtr sip);
1077 NLM_EXTERN SeqIdPtr     SeqIdDupList (SeqIdPtr id_list);
1078 NLM_EXTERN SeqIdPtr     SeqIdDupBestList (SeqIdPtr id_list);
1079 NLM_EXTERN SeqIdPtr     SeqIdListfromSeqLoc (ValNodePtr vnpslp);
1080 
1081 NLM_EXTERN Boolean IsSkippableDbtag (DbtagPtr dbt);
1082 NLM_EXTERN Boolean DoesCDSEndWithStopCodon (SeqFeatPtr cds);
1083 
1084 
1085 #ifdef __cplusplus
1086 }
1087 #endif
1088 
1089 #undef NLM_EXTERN
1090 #ifdef NLM_EXPORT
1091 #define NLM_EXTERN NLM_EXPORT
1092 #else
1093 #define NLM_EXTERN
1094 #endif
1095 
1096 #endif
1097