/*   sequtil.h
* ===========================================================================
*
*                            PUBLIC DOMAIN NOTICE
*               National Center for Biotechnology Information
*
*  This software/database is a "United States Government Work" under the
*  terms of the United States Copyright Act.  It was written as part of
*  the author's official duties as a United States Government employee and
*  thus cannot be copyrighted.  This software/database is freely available
*  to the public for use. The National Library of Medicine and the U.S.
*  Government have not placed any restriction on its use or reproduction.
*
*  Although all reasonable efforts have been taken to ensure the accuracy
*  and reliability of the software and data, the NLM and the U.S.
*  Government do not and cannot warrant the performance or results that
*  may be obtained by using this software or data. The NLM and the U.S.
*  Government disclaim all warranties, express or implied, including
*  warranties of performance, merchantability or fitness for any particular
*  purpose.
*
*  Please cite the author in any work or product based on this material.
23 * 24 * =========================================================================== 25 * 26 * File Name: sequtil.h 27 * 28 * Author: James Ostell 29 * 30 * Version Creation Date: 4/1/91 31 * 32 * $Revision: 6.65 $ 33 * 34 * File Description: Sequence Utilities for objseq and objsset 35 * 36 * ========================================================================== 37 */ 38 39 #ifndef _NCBI_SeqUtil_ 40 #define _NCBI_SeqUtil_ 41 42 #ifndef _NCBI_Seqset_ 43 #include <objsset.h> /* the object loader interface */ 44 #endif 45 46 #ifndef _NCBI_SeqMgr_ 47 #include <seqmgr.h> /* the Bioseq and SeqEntry manager */ 48 #endif 49 50 #undef NLM_EXTERN 51 #ifdef NLM_IMPORT 52 #define NLM_EXTERN NLM_IMPORT 53 #else 54 #define NLM_EXTERN extern 55 #endif 56 57 #ifdef __cplusplus 58 extern "C" { 59 #endif 60 61 /************************************************************* 62 * this define decides if SeqIdWrite shows versions, 63 * if seqmgr seqid indexing functions use it 64 * and if e2index uses it 65 * files depending on SHOWVERSION are: 66 * sequtil.c, segmgr.c, e2iloc.c 67 * SHOWVERSION should be removed entirely when we are through 68 * the transition 69 ************************************************************/ 70 71 #define SHOWVERSION 1 /* do show versions */ 72 73 /***************************************************************************** 74 * 75 * What am I? 
76 * 77 *****************************************************************************/ 78 NLM_EXTERN Uint1 Bioseq_repr(BioseqPtr bsp); 79 NLM_EXTERN Uint1 BioseqGetCode(BioseqPtr bsp); 80 81 NLM_EXTERN ValNodePtr BioseqGetSeqDescr(BioseqPtr bsp, Int2 type, ValNodePtr curr); 82 NLM_EXTERN CharPtr BioseqGetTitle(BioseqPtr bsp); 83 NLM_EXTERN NumberingPtr BioseqGetNumbering(BioseqPtr bsp); 84 85 NLM_EXTERN Int4 BioseqGetLen(BioseqPtr bsp); 86 NLM_EXTERN Int4 BioseqGetGaps(BioseqPtr bsp); 87 NLM_EXTERN Int4 BioseqGetSegLens(BioseqPtr bsp, Int4Ptr lens); 88 #define BioseqCountSegs(x) BioseqGetSegLens(x, NULL) 89 90 NLM_EXTERN Boolean BioseqConvert(BioseqPtr bsp, Uint1 newcode); 91 NLM_EXTERN Boolean BioseqPack(BioseqPtr bsp); 92 NLM_EXTERN Boolean SeqLitPack(SeqLitPtr slp); 93 NLM_EXTERN Boolean BioseqRawConvert(BioseqPtr bsp, Uint1 newcode); 94 NLM_EXTERN Boolean BioseqRawPack(BioseqPtr bsp); 95 NLM_EXTERN ByteStorePtr BSConvertSeq(ByteStorePtr bsp, Uint1 newcode, Uint1 oldcode, Int4 seqlen); 96 NLM_EXTERN ByteStorePtr BSPack(ByteStorePtr from, Uint1 oldcode, Int4 length, Uint1Ptr newcodeptr); 97 98 NLM_EXTERN CharPtr StringForSeqMethod(Int2 method); 99 100 NLM_EXTERN CharPtr StringForSeqTech(Int2 tech); 101 102 /***************************************************************************** 103 * 104 * Hook function definition for DNA Compression 105 * 106 *****************************************************************************/ 107 typedef Int4 (*CompressRWFunc)(Pointer data, 108 Uint1Ptr buf, Int4 length); 109 110 /***************************************************************************** 111 * 112 * SeqCodeTable routines 113 * SeqMapTable routines 114 * Convert and Comp return INVALID_RESIDUE when a residue is out of range 115 * 116 *****************************************************************************/ 117 #define INVALID_RESIDUE 255 118 119 /***************************************************************************** 120 * 121 * SeqCodeTablePtr 
SeqCodeTableFind(code) 122 * Sequence codes defined in objseq.h 123 * 124 *****************************************************************************/ 125 NLM_EXTERN SeqCodeTablePtr LIBCALL SeqCodeTableFind(Uint1 code); 126 127 /***************************************************************************** 128 * 129 * SeqCodeTableComp(sctp, residue) 130 * returns complement of residue if possible 131 * or residue, if not 132 * assumes residue is in the same code as sctp 133 * 134 *****************************************************************************/ 135 NLM_EXTERN Uint1 SeqCodeTableComp(SeqCodeTablePtr sctp, Uint1 residue); 136 137 /***************************************************************************** 138 * 139 * OneLetterCode(sctp) 140 * returns TRUE if sequence code table sctp uses one letter symbols 141 * 142 *****************************************************************************/ 143 NLM_EXTERN Boolean OneLetterCode(SeqCodeTablePtr sctp); 144 145 /***************************************************************************** 146 * 147 * FirstResidueInCode(sctp) 148 * returns first valid residue code in sequence code table 149 * 150 *****************************************************************************/ 151 NLM_EXTERN Uint1 FirstResidueInCode(SeqCodeTablePtr sctp); 152 153 /***************************************************************************** 154 * 155 * LastResidueInCode(sctp) 156 * returns last valid residue code in sequence code table 157 * nb: some codes have "holes", a range of invalid values between first 158 * and last. 159 * 160 *****************************************************************************/ 161 NLM_EXTERN Uint1 LastResidueInCode(SeqCodeTablePtr sctp); 162 163 /***************************************************************************** 164 * 165 * GetSymbolForResidue(sctp, residue) 166 * returns the ONE LETTER symbol for residue if sequence code has one 167 * letter symbols. 
returns INVALID_RESIDUE if not a valid residue or if 168 * sequence code uses multi-letter symbols 169 * 170 *****************************************************************************/ 171 NLM_EXTERN Uint1 GetSymbolForResidue(SeqCodeTablePtr sctp, Uint1 residue); 172 173 /***************************************************************************** 174 * 175 * GetResidueForSymbol(sctp, residue) 176 * returns the residue for a ONE LETTER if sequence code has one 177 * letter symbols. returns INVALID_RESIDUE if not a valid symbol or if 178 * sequence code uses multi-letter symbols 179 * CASE matters 180 * 181 *****************************************************************************/ 182 NLM_EXTERN Uint1 GetResidueForSymbol(SeqCodeTablePtr sctp, Uint1 symbol); 183 184 /***************************************************************************** 185 * 186 * GetLongSymbolForResidue(sctp, residue) 187 * returns string symbol for residue if sequence code has string 188 * symbols. returns NULL if not a valid residue or if 189 * sequence code uses One letter symbols 190 * 191 *****************************************************************************/ 192 NLM_EXTERN const char * GetLongSymbolForResidue(SeqCodeTablePtr sctp, Uint1 residue); 193 194 /***************************************************************************** 195 * 196 * GetResidueForLongSymbol(sctp, symbol) 197 * returns the residue for a STRING symbol if sequence code has string 198 * symbols. returns INVALID_RESIDUE if not a valid symbol or if 199 * sequence code uses one-letter symbols 200 * CASE matters 201 * 202 *****************************************************************************/ 203 NLM_EXTERN Uint1 GetResidueForLongSymbol(SeqCodeTablePtr sctp, CharPtr symbol); 204 205 /***************************************************************************** 206 * 207 * const char * GetNameForResidue (sctp, residue) 208 * returns the descriptive name (eg. 
"Leucine") for a residue in the 209 * sequence code defined by sctp 210 * returns NULL if not a valid code in the alphabet 211 * nb: some codes have "holes" in them, regions of values that are 212 * invalid. 213 * 214 *****************************************************************************/ 215 NLM_EXTERN const char * GetNameForResidue(SeqCodeTablePtr sctp, Uint1 residue); 216 217 /***************************************************************************** 218 * 219 * SeqMapTablePtr SeqMapTableFind(to, from) 220 * Map from sequence code "from" to sequence code "to" 221 * Sequence codes defined in objseq.h 222 * 223 *****************************************************************************/ 224 NLM_EXTERN SeqMapTablePtr LIBCALL SeqMapTableFind(Uint1 to, Uint1 from); 225 226 /***************************************************************************** 227 * 228 * SeqMapTableConvert(smtp, from) 229 * returns conversion of "from" using SeqMapTable smtp 230 * 231 *****************************************************************************/ 232 NLM_EXTERN Uint1 SeqMapTableConvert(SeqMapTablePtr smtp, Uint1 residue); 233 234 /***************************************************************************** 235 * 236 * Convert4NaRandom(from, to) 237 * Converts Seq_code_ncbi4na "from" to Seq_code_ncbi2na "to" 238 * with random conversions 239 * Return TRUE if conversion done without randomization 240 *****************************************************************************/ 241 NLM_EXTERN Boolean Convert4NaRandom(Uint1 from, Uint1 PNTR to); 242 243 /***************************************************************************** 244 * 245 * BSCompressDNA(bytestoreptr, len, lbytes) 246 * converts a ncbi4na bytestore into ncbi2na 247 * returns pointer to ambiguity storage 248 * lbytes[0] == length of this storage 249 * frees old bytestore 250 * returns pointer to new one, or NULL on fail. 
251 * len is residues 252 * 253 *****************************************************************************/ 254 NLM_EXTERN ByteStorePtr BSCompressDNA(ByteStorePtr from, Int4 len, 255 Uint4Ptr PNTR lbytes); 256 NLM_EXTERN ByteStorePtr BSCompressDNANew(ByteStorePtr from, Int4 len, 257 Uint4Ptr PNTR lbytes); 258 /* To be removed */ 259 NLM_EXTERN ByteStorePtr BSCompressDNAOld(ByteStorePtr from, Int4 len, 260 Uint4Ptr PNTR lbytes); 261 262 /***************************************************************************** 263 * 264 * GenericCompressDNA() 265 * converts from VoidPtr "from" in 4na encoding to 266 * VoidPtr "to" in 2Na encoding 267 * returns pointer to ambiguity storage 268 * lbytes[0] == length of this storage 269 * returns TRUE if succeded, or FALSE on fail. 270 * seq_len is maximum number of residues in sequence 271 * or ((Uint4) -1) if final length is unknown. 272 * read_func and write_func - hook functions to read from "from" 273 * and to write to "to" 274 * 275 * NOTE! read_func must return number of residues read, that usualy 276 * twice as much as returned number of bytes. 
Only last returned 277 * byte may have only one residue and this will be handled by 278 * seq_len value or returned value from read_func() 279 *****************************************************************************/ 280 NLM_EXTERN Boolean GenericCompressDNA(VoidPtr from, 281 VoidPtr to, 282 Uint4 length, 283 CompressRWFunc read_func, 284 CompressRWFunc write_func, 285 Uint4Ptr PNTR lbytes); 286 287 NLM_EXTERN Boolean GenericCompressDNAEx(VoidPtr from, 288 VoidPtr to, 289 Uint4 length, 290 CompressRWFunc read_func, 291 CompressRWFunc write_func, 292 Uint4Ptr PNTR lbytes, 293 Boolean x_new); 294 295 /***************************************************************************** 296 * 297 * BSRebuildDNA(bytestoreptr, len, lbytes) 298 * restore ASCII sequence with abmiguity characters 299 * lbytes[0] == length of this storage 300 * frees old bytestore 301 * returns pointer to new one, or NULL on fail. 302 * len is residues 303 * lbytes is pointer to ambiguity storage 304 * 305 *****************************************************************************/ 306 NLM_EXTERN ByteStorePtr BSRebuildDNA(ByteStorePtr from, Int4 len, 307 Uint4Ptr PNTR lbytes); 308 NLM_EXTERN Boolean RebuildDNA_4na (Uint1Ptr buffer, Int4 length, Uint4Ptr lbytes); 309 310 /***************************************************************************** 311 * 312 * BSRebuildDNA_4na(bytestoreptr, lbytes) 313 * restore ncbi4na sequence with abmiguity characters 314 * lbytes[0] == length of this storage 315 * frees old bytestore 316 * returns pointer to new one, or NULL on fail. 
317 * lbytes is pointer to ambiguity storage 318 * 319 *****************************************************************************/ 320 NLM_EXTERN ByteStorePtr BSRebuildDNA_4na (ByteStorePtr from, Uint4Ptr lbytes); 321 322 323 /***************************************************************************** 324 * 325 * void NaI2TableFree(void) 326 * Free allocated memory for 327 * Seq_code_iupacna --> Seq_code_ncbi2na transfer 328 *****************************************************************************/ 329 NLM_EXTERN void NaI2TableFree(void); 330 331 /***************************************************************************** 332 * 333 * Numbering routines 334 * 335 *****************************************************************************/ 336 /* convert any numbering value to seq offset */ 337 NLM_EXTERN Int4 NumberingOffset(NumberingPtr np, DataValPtr avp); 338 /* convert seq offset to numbering value */ 339 NLM_EXTERN Int2 NumberingValue(NumberingPtr np, Int4 offset, DataValPtr avp); 340 NLM_EXTERN Int2 NumberingValueBySeqId(SeqIdPtr sip, Int4 offset, DataValPtr avp); 341 342 NLM_EXTERN void NumberingDefaultLoad(void); 343 NLM_EXTERN NumberingPtr NumberingDefaultGet(void); 344 345 /***************************************************************************** 346 * 347 * SeqEntry and BioseqSet stuff 348 * 349 *****************************************************************************/ 350 351 NLM_EXTERN Uint1 Bioseq_set_class(SeqEntryPtr sep); 352 353 /***************************************************************************** 354 * 355 * traversal routines 356 * SeqEntry - any type 357 * 358 *****************************************************************************/ 359 typedef void (* SeqEntryFunc)(SeqEntryPtr sep, Pointer mydata, Int4 index, Int2 indent); 360 NLM_EXTERN Int4 SeqEntryList(SeqEntryPtr sep, Pointer mydata, SeqEntryFunc mycallback, Int4 index, Int2 indent); 361 362 #define SeqEntryCount( a ) SeqEntryList( a ,NULL,NULL,0,0) 363 
#define SeqEntryExplore(a,b,c) SeqEntryList(a, b, c, 0L, 0) 364 365 /***************************************************************************** 366 * 367 * void CorrectGeneFeatLocation(sep, data, n, m) 368 * 369 * Correct gene location for mRNA sequences, i.e. 370 * puts start = 0, end = total_length_of_sequence - 1. 371 * 372 *****************************************************************************/ 373 NLM_EXTERN void CorrectGeneFeatLocation(SeqEntryPtr sep, Pointer data, 374 Int4 n, Int2 m); 375 376 /***************************************************************************** 377 * 378 * traversal routines 379 * Bioseq types only - "individual" sequences 380 * do NOT traverse component parts of seqmented or constructed types 381 * 382 *****************************************************************************/ 383 NLM_EXTERN Int4 BioseqList(SeqEntryPtr sep, Pointer mydata, SeqEntryFunc mycallback, Int4 index, Int2 indent); 384 385 #define BioseqCount( a ) BioseqList( a ,NULL,NULL,0,0) 386 #define BioseqExplore(a,b,c) BioseqList(a, b, c, 0L, 0) 387 388 /***************************************************************************** 389 * 390 * Get parts routines 391 * 392 *****************************************************************************/ 393 /* gets next Seqdescr after curr in sep of type type */ 394 NLM_EXTERN ValNodePtr SeqEntryGetSeqDescr(SeqEntryPtr sep, Int2 type, ValNodePtr curr); 395 /* gets first title from sep */ 396 NLM_EXTERN CharPtr SeqEntryGetTitle(SeqEntryPtr sep); 397 398 /***************************************************************************** 399 * 400 * Manipulations 401 * 402 *****************************************************************************/ 403 404 NLM_EXTERN Boolean SeqEntryConvert(SeqEntryPtr sep, Uint1 newcode); 405 #define SeqEntryPack(x) SeqEntryConvert(x, (Uint1)0) 406 407 408 /***************************************************************************** 409 * 410 * SeqLoc stuff 411 * 412 
*****************************************************************************/ 413 #define PRINTID_FASTA_SHORT ( (Uint1)1) 414 #define PRINTID_FASTA_LONG ( (Uint1)2) 415 #define PRINTID_TEXTID_LOCUS ( (Uint1)3) 416 #define PRINTID_TEXTID_ACCESSION ( (Uint1)4) 417 #define PRINTID_TEXTID_ACC_VER ( (Uint1)5) 418 #define PRINTID_TEXTID_ACC_ONLY ( (Uint1)6) 419 #define PRINTID_REPORT ( (Uint1)7) 420 #define PRINTID_FASTA_GENERAL ( (Uint1)8) 421 #define PRINTID_FASTA_ALL ( (Uint1)9) 422 423 424 /***************************************************************************** 425 * 426 * SeqIdPtr SeqIdLocate (sip, order, num) 427 * Given a SeqId (sip): 428 * Locates the Bioseq in memory or cached 429 * Then calls SeqIdSelect with the Bioseq.id chain to find the 430 * SeqId type you want. 431 * 432 *****************************************************************************/ 433 NLM_EXTERN SeqIdPtr SeqIdLocate(SeqIdPtr sip, Uint1Ptr order, Int2 num); 434 435 /***************************************************************************** 436 * 437 * SeqIdPtr SeqIdSelect (sip, order, num) 438 * takes an array (order) num long. 439 * goes down chain starting with sip. 440 * finds lowest value of order[sip->choice] and returns it. 441 * if order[] == 255, it is skipped. 
442 * if nothing is found < 255, NULL is returned 443 * ErrorMessage if sip->choice >= num 444 * 445 *****************************************************************************/ 446 NLM_EXTERN SeqIdPtr SeqIdSelect(SeqIdPtr sip, Uint1Ptr order, Int2 num); 447 448 NLM_EXTERN Int2 SeqIdBestRank(Uint1Ptr buf, Int2 num); 449 NLM_EXTERN SeqIdPtr SeqIdFindBest(SeqIdPtr sip, Uint1 target); 450 NLM_EXTERN SeqIdPtr SeqIdFindBestAccession (SeqIdPtr sip); 451 NLM_EXTERN CharPtr SeqIdPrint(SeqIdPtr sip, CharPtr buf, Uint1 format); 452 NLM_EXTERN CharPtr SeqIdWrite(SeqIdPtr sip, CharPtr buf, Uint1 format, Uint4 buflen); 453 NLM_EXTERN Int4 SeqIdLabelLen (SeqIdPtr isip, Uint1 format); 454 NLM_EXTERN CharPtr SeqIdWholeLabel (SeqIdPtr isip, Uint1 format); 455 NLM_EXTERN Boolean GetAccessionFromSeqId(SeqIdPtr sip, BIG_ID_PNTR gi, 456 CharPtr PNTR id); 457 NLM_EXTERN Boolean GetAccessionVersionFromSeqId(SeqIdPtr sip, BIG_ID_PNTR gi, 458 CharPtr PNTR id, Boolean get_version); 459 NLM_EXTERN SeqIdPtr SeqIdParse(CharPtr buf); 460 461 /***************************************************************************** 462 * 463 * Int2 ValidateAccn (accession) 464 * Int2 ValidateAccnDotVer (accession) 465 * Int2 ValidateSeqID (SeqIdPtr) 466 * Return values are: 467 * 0: no problem - Accession is in proper format 468 * -1: Accession did not start with a letter (or two or four letters) 469 * -2: Accession did not contain legal number of digits after letters 470 * -3: the original Accession number to be validated was NULL 471 * -4: the original Accession number is too long (>16) 472 * -5: missing version number (required by ValidateAccnDotVer) 473 * -6: Bad version number (required by ValidateAccnDotVer) 474 * 475 *****************************************************************************/ 476 477 NLM_EXTERN Int2 ValidateAccn (CharPtr accession); 478 NLM_EXTERN Int2 ValidateAccnDotVer (CharPtr accession); 479 NLM_EXTERN Int2 ValidateSeqID (SeqIdPtr sip); 480 481 
/***************************************************************************** 482 * 483 * MakeNewProteinSeqId(SeqLocPtr slp, SeqIdPtr sip) 484 * Makes a new protein SeqId of attempting to keep it unique 485 * Trys to match it to the input seqid type 486 * slp is the location on the DNA of the coding region making the protein 487 * sip is the SeqId of the DNA coding for the protein 488 * if (sip != NULL) uses it for a "base" first 489 * else if (slp != NULL) uses a SeqId from it for a base 490 * else base is the string tmpprot 491 * 492 * id is then base_X where X is a number assigned as a serial number 493 * the returned id is guaranteed to be unique among all Bioseqs currently 494 * loaded in memory. 495 * 496 * MakeNewProteinSeqIdEx(SeqLocPtr slp, SeqIdPtr sip, prefix, Int2 ctrptr) 497 * Allows you to indicate a starting count for the X in base_X, and returns 498 * the next count for improved speed when allocating many protein bioseqs 499 * 500 *****************************************************************************/ 501 NLM_EXTERN SeqIdPtr LIBCALL MakeNewProteinSeqIdExMT(SeqLocPtr slp, SeqIdPtr sip, CharPtr prefix, Int2Ptr ctrptr, Boolean is_MT_safe); 502 NLM_EXTERN SeqIdPtr LIBCALL MakeNewProteinSeqIdEx(SeqLocPtr slp, SeqIdPtr sip, CharPtr prefix, Int2Ptr ctrptr); 503 NLM_EXTERN SeqIdPtr LIBCALL MakeNewProteinSeqId(SeqLocPtr slp, SeqIdPtr sip); 504 NLM_EXTERN ObjectIdPtr UniqueLocalId(void); 505 506 /***************************************************************************** 507 * 508 * Boolean BioseqMatch(bsp, seqid) 509 * returns TRUE if bsp points to the Bioseq identified by seqid 510 * 511 *****************************************************************************/ 512 NLM_EXTERN Boolean BioseqMatch(BioseqPtr bsp, SeqIdPtr sip); 513 514 NLM_EXTERN BioseqPtr BioseqFindInSeqEntry(SeqIdPtr sip, SeqEntryPtr sep); 515 516 /***************************************************************************** 517 * 518 * Boolean SeqIdMatch(a, b) 519 * returns 
TRUE if SeqIds could be compared and are the same 520 * returns FALSE both if SeqIds could not be compared OR if they were 521 * compared but are different 522 * 523 * WARNING!!!! use SeqIdComp() instead of SeqIdMatch() in most cases 524 * 525 * The code here must work the same is in two idloader 526 * context: function id_flatten_seq_obj (idsybase.c) 527 * and proc id_id_flatten_seq_obj 528 * 529 *****************************************************************************/ 530 NLM_EXTERN Boolean SeqIdMatch(SeqIdPtr a, SeqIdPtr b); 531 532 /***************************************************************************** 533 * 534 * SeqIdComp(a, b) 535 * Compares a to b and returns 536 * 537 * SIC_DIFF = different types, could not be compared 538 * SIC_NO = types could be compared, and ids are different 539 * SIC_YES = types could be compared, and ids are the same 540 * 541 *****************************************************************************/ 542 NLM_EXTERN Uint1 SeqIdComp(SeqIdPtr a, SeqIdPtr b); 543 #define SIC_DIFF 1 544 #define SIC_NO 0 545 #define SIC_YES 2 546 547 /************************* 548 SeqIdForSameBioseq(a,b) 549 trys to locate all ids for a or b and determine 550 if (a and b refer the the same Bioseq) 551 **************************/ 552 NLM_EXTERN Boolean SeqIdForSameBioseq(SeqIdPtr a, SeqIdPtr b); 553 554 /************************* 555 * Boolean SeqIdIn (a,b) 556 * returns TRUE if a in list of b 557 ******************/ 558 NLM_EXTERN Boolean SeqIdIn(SeqIdPtr a, SeqIdPtr b); 559 560 561 /***************************************************************************** 562 * 563 * SeqLocFindNext() 564 * just calls SeqLocFindPart(seqlochead, currseqloc, EQUIV_IS_MANY) 565 * 566 *****************************************************************************/ 567 NLM_EXTERN SeqLocPtr SeqLocFindNext(SeqLocPtr seqlochead, SeqLocPtr currseqloc); 568 569 /***************************************************************************** 570 * 571 * 
SeqLocFindPart(seqlochead, currseqloc, equiv_status) 572 * finds the next Seq-loc after currseqloc 573 * seqlochead is the first of a chain of Seq-locs 574 * equiv_status defines how to treat SEQLOC_EQUIV 575 * EQUIV_IS_MANY = treat same as SEQLOC_MIX 576 * EQUIV_IS_ONE = return SEQLOC_EQUIV as one Seq-loc 577 * FIRST_EQUIV_IS_MANY = if seqlochead is a SEQLOC_EQUIV, enter the 578 * the chain of Seq-locs, but treat any later EQUIVs as 579 * EQUIV_IS_ONE. 580 * 581 *****************************************************************************/ 582 NLM_EXTERN SeqLocPtr SeqLocFindPart(SeqLocPtr seqlochead, SeqLocPtr currseqloc, Uint1 equiv_status); 583 584 #define EQUIV_IS_MANY 0 /* treat SEQLOC_EQUIV same as SEQLOC_MIX */ 585 #define EQUIV_IS_ONE 1 /* treat SEQLOC_EQUIV as one Seq-loc */ 586 #define FIRST_EQUIV_IS_MANY 2 /* treat only first EQUIV as SEQ_LOC_MIX */ 587 588 NLM_EXTERN Boolean IS_one_loc(SeqLocPtr anp, Boolean equiv_is_one); /* for SeqLoc */ 589 590 NLM_EXTERN Int4 SeqLocStart(SeqLocPtr seqloc); 591 NLM_EXTERN Int4 SeqLocStop(SeqLocPtr seqloc); 592 NLM_EXTERN Uint1 SeqLocStrand(SeqLocPtr seqloc); 593 NLM_EXTERN Int4 SeqLocLen(SeqLocPtr seqloc); 594 NLM_EXTERN Int4 SeqLocGetSegLens(SeqLocPtr slp, Int4Ptr lens, Int4 ctr, Boolean gaps); 595 #define SeqLocCountSegs(x) SeqLocGetSegLens(x, NULL,0,FALSE) 596 #define SeqLocGetGaps(x) SeqLocGetSegLens(x,NULL,0,TRUE) 597 NLM_EXTERN SeqIdPtr SeqLocId(SeqLocPtr seqloc); 598 NLM_EXTERN Uint1 StrandCmp(Uint1 strand); 599 NLM_EXTERN Boolean SeqLocRevCmp(SeqLocPtr anp); 600 601 /**** defines for "which_end" below ****/ 602 603 #define SEQLOC_LEFT_END 1 /* low numbered end of SeqLoc */ 604 #define SEQLOC_RIGHT_END 2 /* high numbered end of SeqLoc */ 605 #define SEQLOC_START 3 /* beginning of SeqLoc (low on plus, high on minus) */ 606 #define SEQLOC_STOP 4 /* end of SeqLoc (high on plus, low on minus) */ 607 608 NLM_EXTERN Int4 GetOffsetInLoc(SeqLocPtr of, SeqLocPtr in, Uint1 which_end); 609 NLM_EXTERN Int4 
GetOffsetInBioseq(SeqLocPtr of, BioseqPtr in, Uint1 which_end); 610 NLM_EXTERN Int4 GetOffsetInBioseqEx (SeqLocPtr of, BioseqPtr in, Uint1 which_end, Boolean is_circular, Boolean relaxed); 611 NLM_EXTERN void GetLeftAndRightOffsetsInBioseq (SeqLocPtr of, BioseqPtr in, Int4Ptr left, Int4Ptr right, Boolean is_circular, Boolean relaxed, BoolPtr left_flip, BoolPtr right_flip ); 612 NLM_EXTERN Int2 SeqLocOrder(SeqLocPtr a, SeqLocPtr b, BioseqPtr in); 613 614 NLM_EXTERN Int2 SeqLocMol(SeqLocPtr seqloc); 615 616 NLM_EXTERN CharPtr SeqLocPrint(SeqLocPtr slp); 617 NLM_EXTERN CharPtr SeqLocPrintUseBestID(SeqLocPtr slp); 618 619 /***************************************************************************** 620 * 621 * SeqLocCompare(a, b) 622 * returns 623 * 0 = no overlap 624 * 1 = a is completely contained in b 625 * 2 = b is completely contained in a 626 * 3 = a == b 627 * 4 = a and b overlap, but neither completely contained in the other 628 * 629 *****************************************************************************/ 630 NLM_EXTERN Int2 SeqLocCompare(SeqLocPtr a, SeqLocPtr b); 631 #define SLC_NO_MATCH 0 632 #define SLC_A_IN_B 1 633 #define SLC_B_IN_A 2 634 #define SLC_A_EQ_B 3 635 #define SLC_A_OVERLAP_B 4 636 NLM_EXTERN Int2 SeqLocCompareEx (SeqLocPtr a, SeqLocPtr b, Boolean compare_strand); 637 638 NLM_EXTERN Boolean UnitTestSeqLocCompare (void); 639 640 /***************************************************************************** 641 * 642 * SeqLocAinB(a, b) 643 * if a is completely contained in b, a positive number is returned 644 * if 0, a is identical with b 645 * if not 0, is the number of residues bigger b is than a 646 * if a negative number is returned, a is not contained in b 647 * could overlap or not 648 * used to find features contained in genes 649 * 650 *****************************************************************************/ 651 NLM_EXTERN Int4 SeqLocAinB(SeqLocPtr a, SeqLocPtr b); 652 653 NLM_EXTERN Boolean SeqIntCheck(SeqIntPtr sip); /* 
checks for valid interval */ 654 NLM_EXTERN Boolean SeqPntCheck(SeqPntPtr spp); /* checks valid pnt */ 655 NLM_EXTERN Boolean PackSeqPntCheck(PackSeqPntPtr pspp); 656 NLM_EXTERN Uint1 SeqLocCheck(SeqLocPtr slp); 657 #define SEQLOCCHECK_OK 2 /* location is fine */ 658 #define SEQLOCCHECK_WARNING 1 /* location ok, but has mixed strands */ 659 #define SEQLOCCHECK_ERROR 0 /* error in location */ 660 /***************************************************************************** 661 * 662 * SeqLocPartialCheck(head) 663 * sets bits for incomplete location and/or errors 664 * incomplete defined as Int-fuzz on start or stop with 665 * lim.unk, lim.gt, or lim.lt set 666 * 667 * SLP_COMPLETE = not partial and no errors 668 * SLP_START = incomplete on start (high number on minus strand, low on plus) 669 * SLP_STOP = incomplete on stop 670 * SLP_INTERNAL = lim set on internal intervals 671 * SLP_OTHER = partial location, but no details available 672 * SLP_NOSTART = start does not include end of sequence 673 * SLP_NOSTOP = stop does not include end of sequence 674 * SLP_NOINTERNAL = internal interval not on end of sequence 675 * SLP_LIM_WRONG = lim gt/lt used inconsistently with position in location 676 * 677 * SLP_HAD_ERROR = if AND with return, is TRUE if any errors encountered 678 * 679 *****************************************************************************/ 680 681 #define SLP_COMPLETE 0 682 #define SLP_START 1 683 #define SLP_STOP 2 684 #define SLP_INTERNAL 4 685 #define SLP_OTHER 8 686 #define SLP_NOSTART 16 687 #define SLP_NOSTOP 32 688 #define SLP_NOINTERNAL 64 689 #define SLP_LIM_WRONG 128 690 691 #define SLP_HAD_ERROR 240 692 693 NLM_EXTERN Uint2 SeqLocPartialCheck(SeqLocPtr head); 694 NLM_EXTERN Uint2 SeqLocPartialCheckEx (SeqLocPtr head, Boolean farFetch); 695 696 /* 697 FreeSeqLocSetComponents loops through a chain of SeqLocs and frees 698 the referenced components. Call SeqLocSetFree to the list itself. 
699 */ 700 701 NLM_EXTERN void FreeSeqLocSetComponents (SeqLocPtr list); 702 703 NLM_EXTERN CharPtr TaxNameFromCommon(CharPtr common); 704 705 /***************************************************************************** 706 * 707 * QualLocCreate(from, to) 708 * creates a UserObject of _class NCBI, type 1 709 * adds a field of type "qual_loc" 710 * puts the from and to numbers in 711 * These should be offsets, as in a Seq-loc, not numbers starting from 712 * one. 713 * no range check, no strand, no seqid 714 * this just carries locations for the qualifiers anticodon and rpt_unit 715 * Intended to go on SeqFeat.ext 716 * 717 *****************************************************************************/ 718 NLM_EXTERN UserObjectPtr QualLocCreate(Int4 from, Int4 to); 719 720 /***************************************************************************** 721 * 722 * QualLocWrite(uop, buf) 723 * Checks a SeqFeat.ext to see if it is 724 * 1) not null 725 * 2) has a UserObject of _class NCBI, type 1 726 * 3) has a field of label "qual_loc" 727 * 4) if so, prints the two integers as a qualifier location 728 * from..to and returns a pointer to the \0 after "to" 729 * Adds 1 to the internal numbers to convert from offset to 730 * number starting with 1 731 * If any of the above fail, returns NULL 732 * 733 *****************************************************************************/ 734 NLM_EXTERN CharPtr QualLocWrite(UserObjectPtr uop, CharPtr buf); 735 736 /***************************************************************************** 737 * 738 * EntrezASN1Detected detects records retrieved from Entrez, which should 739 * not be edited by Sequin and replaced into ID. 
740 * 741 *****************************************************************************/ 742 743 NLM_EXTERN Boolean EntrezASN1Detected (SeqEntryPtr sep); 744 745 /***************************************************************************** 746 * 747 * SeqLocIntNew(Int4 from, Int4 to, Uint1 strand, SeqIdPtr sip) 748 * creates a new SeqLoc of type SeqInt 749 * makes copy of incoming SeqId 750 * 751 *****************************************************************************/ 752 NLM_EXTERN SeqLocPtr LIBCALL SeqLocIntNew (Int4 from, Int4 to, Uint1 strand, SeqIdPtr sip); 753 754 /***************************************************************************** 755 * 756 * SeqLocPntNew(Int4 pos, Uint1 strand, SeqIdPtr sip, Boolean is_fuzz) 757 * creates a new SeqLoc of type SeqPnt 758 * makes copy of incoming SeqId 759 * 760 *****************************************************************************/ 761 NLM_EXTERN SeqLocPtr LIBCALL SeqLocPntNew (Int4 pos, Uint1 strand, SeqIdPtr sip, Boolean is_fuzz); 762 763 /***************************************************************************** 764 * 765 * SeqLocPtr FindSpliceSites(SeqEntryPtr sep, Boolean findOnProtein) 766 * Finds the splice sites on this SeqEntry and returns them as a 767 * SeqLoc. 768 * 769 *****************************************************************************/ 770 NLM_EXTERN SeqLocPtr LIBCALL FindSpliceSites(SeqEntryPtr sep, Boolean findOnProtein); 771 772 /*************************************************************************** 773 ** 774 * 775 * SeqFeatPtr FindCodingRegion(SeqEntryPtr sep) 776 * Finds the coding region feature on this protein SeqEntry and 777 * returns a copy of it. 
778 * 779 **************************************************************************** 780 */ 781 NLM_EXTERN SeqFeatPtr LIBCALL FindCodingRegion(SeqEntryPtr sep); 782 783 /***************************************************************************** 784 * 785 * Boolean LIBCALL SeqEntryContainsSeqIdOfMolType(SeqEntryPtr sep, SeqIdPtr sip, Boolean isProtein) 786 * Tests to see if this SeqEntry contains a bioseq of the specified moltype 787 * (protein or DNA) 788 * if sip != NULL then it also insists upon finding a bioseq of the 789 * specified moltype where the SeqIds match 790 * 791 *****************************************************************************/ 792 NLM_EXTERN Boolean LIBCALL SeqEntryContainsSeqIdOfMolType(SeqEntryPtr sep, SeqIdPtr sip, Boolean isProtein); 793 794 /***************************************************************************** 795 * 796 * Tests to see if this SeqEntry contains a bioseq of the specified uid 797 * returns moltype of the bioseq where the SeqIds match 798 * 0 id not found in this SeqEntry 799 * 1 Amino Acid sequence 800 * 2 Nucleotide sequence 801 * 802 *****************************************************************************/ 803 NLM_EXTERN Int2 LIBCALL MolTypeForGI(SeqEntryPtr sep, Int4 uid); 804 805 /* moved from jzmisc.h */ 806 NLM_EXTERN Boolean seqid_name(SeqIdPtr, CharPtr, Boolean, Boolean); 807 NLM_EXTERN Boolean MuskSeqIdWrite(SeqIdPtr sip, CharPtr buf, Int2 buflen, Uint1 format, Boolean do_find, Boolean do_entrez_find); 808 NLM_EXTERN SeqIdPtr local_id_make(CharPtr); 809 NLM_EXTERN SeqLocPtr update_seq_loc(Int4, Int4, Uint1, SeqLocPtr ); 810 NLM_EXTERN SeqIdPtr LIBCALL TxGetSubjectIdFromSeqAlign(SeqAlignPtr seqalign); 811 NLM_EXTERN SeqIdPtr LIBCALL TxGetQueryIdFromSeqAlign(SeqAlignPtr seqalign); 812 NLM_EXTERN Boolean LIBCALL GetScoreAndEvalue( 813 SeqAlignPtr seqalign, Int4 *score, 814 Nlm_FloatHi *bit_score, 815 Nlm_FloatHi *evalue, Int4 *number 816 ); 817 818 
/*********************************************************************** 819 * 820 * Adjust the Offset in the SeqAlign to correspond to the beginning 821 * of the sequence and not where BLAST (or some other tool) started. 822 * 823 **********************************************************************/ 824 825 NLM_EXTERN void LIBCALL AdjustOffSetsInSeqAlign(SeqAlignPtr salp, SeqLocPtr slp1, SeqLocPtr slp2); 826 827 828 /* Used with SeqEntryExplore to find Bioseq's in a SeqEntry. */ 829 NLM_EXTERN void FindNuc(SeqEntryPtr sep, Pointer data, Int4 index, Int2 indent); 830 NLM_EXTERN void FindProt(SeqEntryPtr sep, Pointer data, Int4 index, Int2 indent); 831 832 /***************************************************************************** 833 * 834 * Boolean SeqIdOrderInList(a, b) 835 * Looks for single SeqId, "a" in chain of SeqIds, "b" 836 * returns the position (>0) if found.. else returns 0; 837 * 838 *****************************************************************************/ 839 840 NLM_EXTERN Uint4 LIBCALL SeqIdOrderInList (SeqIdPtr a, SeqIdPtr list); 841 842 /***************************************************************************** 843 * 844 * Boolean SeqIdOrderInBioseqIdList(a, b) 845 * Looks for single SeqId, "a" in chain of SeqIds, "b" 846 * and looks at all synonymous SeqIds of the Bioseq "b" 847 * returns the position (>0) if found.. else returns 0; 848 * 849 *****************************************************************************/ 850 NLM_EXTERN Uint4 LIBCALL SeqIdOrderInBioseqIdList (SeqIdPtr a, SeqIdPtr list); 851 852 /* Function to extract the Accession and version number 853 User must provide string buffers for answer. 854 */ 855 NLM_EXTERN void LIBCALL ExtractAccession(CharPtr accn,CharPtr accession,CharPtr version); 856 857 858 859 /* 860 Function to make a proper type SeqId given a string that represents 861 an accession Number 862 User must Call ExtractAccession function separately before calling this. 
863 to split accession and version number. 864 */ 865 NLM_EXTERN SeqIdPtr LIBCALL SeqIdFromAccession(CharPtr accession, Uint4 version,CharPtr name); 866 867 /* Variant that also work with PIR accessions and LOCUS names 868 .. and can resolve conflict with network access (if pre-enabled) 869 */ 870 NLM_EXTERN SeqIdPtr LIBCALL SeqIdFromAccessionEx(CharPtr accession, Uint4 version,CharPtr name,Boolean Permissive, Boolean AllowPIR,Boolean UseNetwork,Boolean FavorNucleotide); 871 872 /* Variant of SeqIdFromAccession that works on accession.version string */ 873 874 NLM_EXTERN SeqIdPtr SeqIdFromAccessionDotVersion (CharPtr accession); 875 876 877 /* 878 Following functions and defines moved from accutils.ch 879 */ 880 NLM_EXTERN Uint4 LIBCALL WHICH_db_accession (CharPtr s); 881 NLM_EXTERN Boolean LIBCALL IS_ntdb_accession (CharPtr s); 882 NLM_EXTERN Boolean LIBCALL IS_protdb_accession (CharPtr s); 883 NLM_EXTERN Boolean LIBCALL ACCN_PIR_FORMAT( CharPtr s); 884 NLM_EXTERN Boolean LIBCALL ACCN_1_5_FORMAT( CharPtr s); 885 NLM_EXTERN Boolean LIBCALL AccnIsSWISSPROT( CharPtr s); 886 NLM_EXTERN Boolean LIBCALL AccnIsUniProt (CharPtr s); 887 NLM_EXTERN Boolean LIBCALL NAccnIsGENBANK (CharPtr s); 888 NLM_EXTERN Boolean LIBCALL NAccnIsEMBL (CharPtr s); 889 NLM_EXTERN Boolean LIBCALL NAccnIsDDBJ (CharPtr s); 890 891 892 /* 893 #defines and macros for WHICH_ntdb_accession and 894 WHICH_protdb_accession 895 896 The "divisions" implied by the following #defines are not all inclusives. 897 a GSS or EST sequence submitted through DIRSUB, will have the 898 ACCN_NCBI_DIRSUB code. 899 a sequence can full well be in GSS,EST,etc.. division 900 but not have the appropriate accession number if they were submitted 901 through DIRSUB. 
902 903 */ 904 #define ACCN_UNKNOWN 0 905 906 #define ACCN_AMBIGOUS_DB 2 /* Primary can be from any Nucleotide database */ 907 #define ACCN_SWISSPROT 3 908 #define ACCN_NCBI_PROT 4 909 #define ACCN_EMBL_PROT 5 910 #define ACCN_DDBJ_PROT 6 911 912 #define ACCN_GSDB_DIRSUB 7 913 914 #define ACCN_NCBI_GSDB 8 /* NCBI-assigned Accn to GSDB records */ 915 916 #define ACCN_NCBI_EST 9 917 #define ACCN_NCBI_DIRSUB 10 918 #define ACCN_NCBI_GENOME 11 919 #define ACCN_NCBI_PATENT 12 /* Not used .. because all are Ambigous_mol */ 920 #define ACCN_NCBI_HTGS 13 921 #define ACCN_NCBI_GSS 14 922 #define ACCN_NCBI_STS 15 923 #define ACCN_NCBI_BACKBONE 16 /* "S" record, typed from publications */ 924 #define ACCN_NCBI_SEGSET 17 925 #define ACCN_NCBI_OTHER 18 /* unknown or 'other' nucleotide division */ 926 927 #define ACCN_EMBL_EST 19 928 #define ACCN_EMBL_DIRSUB 20 929 #define ACCN_EMBL_GENOME 21 930 #define ACCN_EMBL_PATENT 22 931 #define ACCN_EMBL_HTGS 23 /* Not defined yet */ 932 #define ACCN_EMBL_CON 24 933 #define ACCN_EMBL_OTHER 25 /* unknown or 'other' nucleotide division */ 934 935 #define ACCN_DDBJ_EST 26 936 #define ACCN_DDBJ_DIRSUB 27 937 #define ACCN_DDBJ_GENOME 28 938 #define ACCN_DDBJ_PATENT 29 939 #define ACCN_DDBJ_HTGS 30 940 #define ACCN_DDBJ_CON 31 /* Not defined*/ 941 #define ACCN_DDBJ_OTHER 32 /* unknown or 'other' nucleotide division */ 942 943 #define ACCN_REFSEQ_PROT 33 944 #define ACCN_REFSEQ_mRNA 34 945 #define ACCN_REFSEQ_CONTIG 35 946 #define ACCN_REFSEQ_CHROMOSOME 36 947 #define ACCN_REFSEQ_mRNA_PREDICTED 37 948 #define ACCN_REFSEQ_PROT_PREDICTED 38 949 #define ACCN_REFSEQ_GENOMIC 39 950 951 #define ACCN_NCBI_cDNA 40 952 #define ACCN_IS_PROTEIN 41 /* unreserved 3 letter code .. must be protein*/ 953 #define ACCN_IS_NT 42 /* unreserved 1 or 2 letter code .. 
must be nuc */ 954 #define ACCN_REFSEQ 43 /* unreserved refseq-type two_letters and underscore*/ 955 #define ACCN_EMBL_GB 44 956 #define ACCN_EMBL_DDBJ 45 957 #define ACCN_GB_DDBJ 46 958 #define ACCN_EMBL_GB_DDBJ 47 959 960 #define ACCN_NCBI_TPA 48 961 #define ACCN_NCBI_TPA_PROT 49 962 #define ACCN_EMBL_TPA 50 963 #define ACCN_EMBL_TPA_PROT 51 964 #define ACCN_DDBJ_TPA 52 965 #define ACCN_DDBJ_TPA_PROT 53 966 967 #define ACCN_NCBI_WGS 54 968 #define ACCN_NCBI_WGS_PROT 55 969 #define ACCN_EMBL_WGS 56 970 #define ACCN_EMBL_WGS_PROT 57 971 #define ACCN_DDBJ_WGS 58 972 #define ACCN_DDBJ_WGS_PROT 59 973 974 #define ACCN_PDB 60 975 976 #define ACCN_DDBJ_GSS 61 977 978 #define ACCN_NCBI_TSA 62 979 #define ACCN_NCBI_TSA_PROT 63 980 #define ACCN_EMBL_TSA 64 981 #define ACCN_EMBL_TSA_PROT 65 982 #define ACCN_DDBJ_TSA 66 983 #define ACCN_DDBJ_TSA_PROT 67 984 985 #define ACCN_REFSEQ_ARTIFICIAL_ASSEMBLY 68 986 #define ACCN_REFSEQ_WGS 69 987 988 #define ACCN_NCBI_OPTICAL 70 989 990 #define ACCN_NCBI_WGS_TPA 71 991 #define ACCN_NCBI_WGS_TPA_PROT 72 992 #define ACCN_EMBL_WGS_TPA 73 993 #define ACCN_EMBL_WGS_TPA_PROT 74 994 #define ACCN_DDBJ_WGS_TPA 75 995 #define ACCN_DDBJ_WGS_TPA_PROT 76 996 997 #define ACCN_NCBI_TARGETED 77 998 999 1000 /* Some accessions prefix can be either protein or nucleotide 1001 such as NCBI PATENT I, AR .. 
or segmented set Bioseqs 'AH' 1002 */ 1003 #define ACCN_AMBIGOUS_MOL 65536 /* Ambigous Molecule */ 1004 1005 /* 1006 Macros to interpret above #defines codes returned by 1007 WHICH_db_accession 1008 */ 1009 1010 1011 /* 1012 Accession definitively points to a protein record 1013 */ 1014 #define ACCN_IS_PROT(c) (((c)==ACCN_SWISSPROT) || ( (c)==ACCN_NCBI_PROT) || ((c)== ACCN_EMBL_PROT) || ((c)== ACCN_DDBJ_PROT) || ((c)== ACCN_REFSEQ_PROT) || ((c)== ACCN_IS_PROTEIN) || ((c)== ACCN_REFSEQ_PROT_PREDICTED) || ((c)== ACCN_NCBI_TPA_PROT) || ((c)== ACCN_EMBL_TPA_PROT) || ((c)== ACCN_DDBJ_TPA_PROT) || ((c)== ACCN_NCBI_WGS_PROT) || ((c)== ACCN_EMBL_WGS_PROT) || ((c)== ACCN_DDBJ_WGS_PROT) || ((c)== ACCN_NCBI_WGS_TPA_PROT) || ((c)== ACCN_EMBL_WGS_TPA_PROT) || ((c)== ACCN_DDBJ_WGS_TPA_PROT)) 1015 1016 /* 1017 Accession definitively points to a nucleotide record 1018 . note that ACCN_dbname_OTHER is a nucleotide. 1019 */ 1020 #define ACCN_IS_NUC(c) ((((c)&ACCN_AMBIGOUS_MOL)==0) && ((c)!=ACCN_UNKNOWN) && (!ACCN_IS_PROT(c)) ) 1021 1022 #define ACCN_IS_AMBIGOUS_MOL(c) (((c)&ACCN_AMBIGOUS_MOL) == ACCN_AMBIGOUS_MOL) 1023 1024 /* 1025 Define to detect Genbank's accessions: Genbank-subsumed GSDB accession numbers 1026 are defined to be Genbank's as well as GSDB DIRSUB records. 
1027 */ 1028 #define ACCN_IS_GENBANK(c) ((((c)&65535) == ACCN_NCBI_GSDB) || (((c)&65535)==ACCN_GSDB_DIRSUB) || (((c)&65535) == ACCN_NCBI_EST) || (((c)&65535) == ACCN_NCBI_DIRSUB) || (((c)&65535) == ACCN_NCBI_GENOME) || (((c)&65535) == ACCN_NCBI_PATENT) || (((c)&65535) == ACCN_NCBI_HTGS) || (((c)&65535) == ACCN_NCBI_GSS) || (((c)&65535) == ACCN_NCBI_STS) || (((c)&65535) == ACCN_NCBI_BACKBONE) || (((c)&65535) == ACCN_NCBI_SEGSET) || (((c)&65535) == ACCN_NCBI_WGS) || (((c)&65535) == ACCN_NCBI_OTHER) || (((c)&65535) == ACCN_NCBI_OPTICAL) || (((c)&65535) == ACCN_NCBI_PROT) || (((c)&65535) == ACCN_NCBI_cDNA) || (((c)&65535) == ACCN_NCBI_TSA) || (((c)&65535) == ACCN_NCBI_TSA_PROT) || (((c)&65535) == ACCN_EMBL_GB) || (((c)&65535) == ACCN_EMBL_GB_DDBJ || (((c)&65535) == ACCN_GB_DDBJ)) ) 1029 1030 /* XM_,NP_,NM_,NT_,NC_ reference sequence records created and curated by NCBI 1031 REFSEQ project 1032 */ 1033 #define ACCN_IS_REFSEQ(c) (((c)== ACCN_REFSEQ_PROT) || ((c)== ACCN_REFSEQ_mRNA) || ((c)== ACCN_REFSEQ_CONTIG) || ((c)== ACCN_REFSEQ_CHROMOSOME) || ((c)== ACCN_REFSEQ_mRNA_PREDICTED) || ((c)== ACCN_REFSEQ_PROT_PREDICTED) || ((c)== ACCN_REFSEQ_GENOMIC) || ((c)== ACCN_REFSEQ_ARTIFICIAL_ASSEMBLY) || ((c)== ACCN_REFSEQ_WGS) || (((c)&65535)== ACCN_REFSEQ) ) 1034 1035 #define ACCN_IS_TPA(c) (((c)== ACCN_NCBI_TPA) || ((c)== ACCN_NCBI_TPA_PROT) || ((c)== ACCN_EMBL_TPA) || ((c)== ACCN_EMBL_TPA_PROT) || ((c)== ACCN_DDBJ_TPA) || ((c)== ACCN_DDBJ_TPA_PROT) || ((c)== ACCN_NCBI_WGS_TPA) || ((c)== ACCN_NCBI_WGS_TPA_PROT) || ((c)== ACCN_EMBL_WGS_TPA) || ((c)== ACCN_EMBL_WGS_TPA_PROT) || ((c)== ACCN_DDBJ_WGS_TPA) || ((c)== ACCN_DDBJ_WGS_TPA_PROT)) 1036 1037 #define ACCN_IS_WGS(c) (((c)== ACCN_NCBI_WGS) || ((c)== ACCN_NCBI_WGS_PROT) || ((c)== ACCN_EMBL_WGS) || ((c)== ACCN_EMBL_WGS_PROT) || ((c)== ACCN_DDBJ_WGS) || ((c)== ACCN_DDBJ_WGS_PROT) || ((c)== ACCN_REFSEQ_WGS) || ((c)== ACCN_NCBI_WGS_TPA) || ((c)== ACCN_NCBI_WGS_TPA_PROT) || ((c)== ACCN_EMBL_WGS_TPA) || ((c)== ACCN_EMBL_WGS_TPA_PROT) 
|| ((c)== ACCN_DDBJ_WGS_TPA) || ((c)== ACCN_DDBJ_WGS_TPA_PROT)) 1038 1039 #define ACCN_IS_TSA(c) (((c)== ACCN_NCBI_TSA) || ((c)== ACCN_NCBI_TSA_PROT) || ((c)== ACCN_EMBL_TSA) || ((c)== ACCN_EMBL_TSA_PROT) || ((c)== ACCN_DDBJ_TSA) || ((c)== ACCN_DDBJ_TSA_PROT)) 1040 1041 #define ACCN_IS_NCBI(c) (ACCN_IS_REFSEQ((c)) || ACCN_IS_GENBANK((c)) || ((c)== ACCN_NCBI_TPA) || ((c)== ACCN_NCBI_TPA_PROT) || ((c)== ACCN_NCBI_WGS) || ((c)== ACCN_NCBI_WGS_PROT) || ((c)== ACCN_NCBI_TSA) || ((c)== ACCN_NCBI_WGS_TPA) || ((c)== ACCN_NCBI_WGS_TPA_PROT) || ((c)== ACCN_NCBI_TARGETED)) 1042 1043 /* 1044 Macro to detect EMBL accession numbers (can also belong to another DB) 1045 */ 1046 #define ACCN_IS_EMBL(c) ( (((c)&65535) == ACCN_EMBL_EST) || (((c)&65535) == ACCN_EMBL_DIRSUB) || (((c)&65535) == ACCN_EMBL_GENOME) || (((c)&65535) == ACCN_EMBL_PATENT) || (((c)&65535) == ACCN_EMBL_HTGS) || (((c)&65535) == ACCN_EMBL_CON) || (((c)&65535) == ACCN_EMBL_WGS) || (((c)&65535) == ACCN_EMBL_OTHER) || (((c)&65535) == ACCN_EMBL_PROT) || (((c)&65535) == ACCN_EMBL_GB) || (((c)&65535) == ACCN_EMBL_DDBJ) || (((c)&65535) == ACCN_EMBL_GB_DDBJ) || (((c)&65535) == ACCN_EMBL_WGS_TPA) || (((c)&65535) == ACCN_EMBL_WGS_TPA_PROT)) 1047 1048 #define ACCN_IS_DDBJ(c) ((((c)&65535) == ACCN_DDBJ_EST) || (((c)&65535) == ACCN_DDBJ_DIRSUB) || (((c)&65535) == ACCN_DDBJ_GENOME) || (((c)&65535) == ACCN_DDBJ_PATENT) || (((c)&65535) == ACCN_DDBJ_HTGS) || (((c)&65535) == ACCN_DDBJ_CON) || (((c)&65535) == ACCN_DDBJ_WGS) || (((c)&65535) == ACCN_DDBJ_OTHER) || (((c)&65535) == ACCN_DDBJ_PROT) || (((c)&65535) == ACCN_DDBJ_GSS) || (((c)&65535) == ACCN_GB_DDBJ) || (((c)&65535) == ACCN_EMBL_DDBJ) || (((c)&65535) == ACCN_EMBL_GB_DDBJ) || (((c)&65535) == ACCN_EMBL_WGS_TPA) || (((c)&65535) == ACCN_EMBL_WGS_TPA_PROT)) 1049 1050 #define ACCN_IS_SWISSPROT(c) ((c)== ACCN_SWISSPROT) 1051 /* 1052 detect the few accessions numbers (N000*-N1*) have been assigned to many databases 1053 .. as well as unnasigned accessions. 
1054 */ 1055 #define ACCN_IS_AMBIGOUSDB(c) (((c)&65535)==ACCN_AMBIGOUS_DB || (c)== ACCN_IS_PROTEIN || (c)== ACCN_IS_NT || (((c)&65535) == ACCN_EMBL_GB) || (((c)&65535) == ACCN_EMBL_DDBJ) || (((c)&65535) == ACCN_GB_DDBJ) || (((c)&65535) == ACCN_EMBL_GB_DDBJ)) 1056 /* 1057 does not ressemble any accession types. (with the possible exception 1058 of PIR.. but must call ACCN_PIR_FORMAT() to check that. 1059 */ 1060 #define ACCN_IS_UNKNOWN(c) (c==ACCN_UNKNOWN) 1061 /* Unassigned : is of 3+5 (proteins) OR 1062 2+5 (amino acids) OR 1063 [A-Z][A-Z]_ (refseq type) 1064 , but 1065 has not been formally been formally assigned (hardcoded) 1066 */ 1067 #define ACCN_IS_UNASSIGNED(c) ((c)== ACCN_IS_PROTEIN || (c)== ACCN_IS_NT || (c) == ACCN_UNKNOWN || (c)==ACCN_REFSEQ) 1068 1069 /* 1070 Try to Find if the Bioseq represented by a SeqId is a SeqLoc List; 1071 May fetch the Bioseq to get all the synonymous SeqIds. 1072 */ 1073 1074 NLM_EXTERN Boolean LIBCALL SeqIdInSeqLocList(SeqIdPtr sip, ValNodePtr list); 1075 1076 NLM_EXTERN SeqIdPtr AddSeqId (SeqIdPtr *sip_head, SeqIdPtr sip); 1077 NLM_EXTERN SeqIdPtr SeqIdDupList (SeqIdPtr id_list); 1078 NLM_EXTERN SeqIdPtr SeqIdDupBestList (SeqIdPtr id_list); 1079 NLM_EXTERN SeqIdPtr SeqIdListfromSeqLoc (ValNodePtr vnpslp); 1080 1081 NLM_EXTERN Boolean IsSkippableDbtag (DbtagPtr dbt); 1082 NLM_EXTERN Boolean DoesCDSEndWithStopCodon (SeqFeatPtr cds); 1083 1084 1085 #ifdef __cplusplus 1086 } 1087 #endif 1088 1089 #undef NLM_EXTERN 1090 #ifdef NLM_EXPORT 1091 #define NLM_EXTERN NLM_EXPORT 1092 #else 1093 #define NLM_EXTERN 1094 #endif 1095 1096 #endif 1097