1 /* seqport.h 2 * =========================================================================== 3 * 4 * PUBLIC DOMAIN NOTICE 5 * National Center for Biotechnology Information 6 * 7 * This software/database is a "United States Government Work" under the 8 * terms of the United States Copyright Act. It was written as part of 9 * the author's official duties as a United States Government employee and 10 * thus cannot be copyrighted. This software/database is freely available 11 * to the public for use. The National Library of Medicine and the U.S. 12 * Government have not placed any restriction on its use or reproduction. 13 * 14 * Although all reasonable efforts have been taken to ensure the accuracy 15 * and reliability of the software and data, the NLM and the U.S. 16 * Government do not and cannot warrant the performance or results that 17 * may be obtained by using this software or data. The NLM and the U.S. 18 * Government disclaim all warranties, express or implied, including 19 * warranties of performance, merchantability or fitness for any particular 20 * purpose. 21 * 22 * Please cite the author in any work or product based on this material. 23 * 24 * =========================================================================== 25 * 26 * File Name: seqport.h 27 * 28 * Author: James Ostell 29 * 30 * Version Creation Date: 7/13/91 31 * 32 * $Revision: 6.67 $ 33 * 34 * File Description: Ports onto Bioseqs 35 * 36 * Modifications: 37 * -------------------------------------------------------------------------- 38 * Date Name Description of modification 39 * ------- ---------- ----------------------------------------------------- 40 * 41 * ========================================================================== 42 */ 43 44 #ifndef _NCBI_Seqport_ 45 #define _NCBI_Seqport_ 46 47 #include <sequtil.h> 48 49 #undef NLM_EXTERN 50 #ifdef NLM_IMPORT 51 #define NLM_EXTERN NLM_IMPORT 52 #else 53 #define NLM_EXTERN extern 54 #endif 55 56 #ifdef __cplusplus 57 extern "C" { 58 #endif 59 60 /***************************************************************************** 61 * 62 * SeqPort 63 * will attach only to a Bioseq (SeqPortNew) or to a Seq-loc 64 * (SeqPortNewByLoc) in any selected alphabet 65 * You can then treat the sequence or location as a single contiguous 66 * piece. You can Seek (SeqPortSeek) to any location. You can 67 * SeqPortGetResidue, which get the residue at the current position 68 * and seeks to the next residue. You can read a buffer of residues. 69 * 70 * Special characters are returned from SeqPortGetResidue 71 * SEQPORT_EOF (end of sequence reached) 72 * SEQPORT_VIRT (hit a virtual sequence or gap) 73 * INVALID_RESIDUE (residue not valid in original Bioseq) 74 * SEQPORT_EOS (end of segment, not normally seen) 75 * 76 * Some defined values are provided for the Int4 values passed as 77 * start or stop 78 * FIRST_RESIDUE 0 (first residue of sequence) 79 * LAST_RESIDUE -1 (last residue of sequence.. interpreted as 80 * length - 1) 81 * APPEND_RESIDUE -2 (interpreted as length.. off the end of the 82 * sequence. Not valid for SeqPort.. only 83 * used by editing functions ) 84 * 85 * 86 * 87 *****************************************************************************/ 88 #define SEQPORT_EOF 253 /* end of sequence data */ 89 #define SEQPORT_EOS 252 /* end of segment */ 90 #define SEQPORT_VIRT 251 /* skipping virtual sequence or gap */ 91 #define IS_residue(x) (x <= 250) 92 93 #define FIRST_RESIDUE 0 94 #define LAST_RESIDUE -1 95 #define APPEND_RESIDUE -2 96 97 typedef struct spcache { 98 Int2 ctr, total; 99 Uint1 buf[100]; 100 } SPCache, PNTR SPCachePtr; 101 102 typedef struct spcacheq { 103 Int2 ctr, total; 104 Char buf[400]; 105 } SPCacheQ, PNTR SPCacheQPtr; 106 107 typedef struct seqport { 108 BioseqPtr bsp; /* 1 seqentry per port */ 109 Boolean locked; /* TRUE if Lock function used */ 110 Int4 start, stop, /* region of bsp covered */ 111 curpos, /* current position 0-(totlen-1) */ 112 totlen, /* total length of covered region */ 113 bytepos; /* current byte position in bsp->data */ 114 NumberingPtr currnum; /* current numbering info */ 115 Uint1 strand, /* as in seqloc */ 116 lastmsg; /* used by SeqPortRead() */ 117 Boolean is_circle , /* go around the end of a circle? */ 118 is_seg , /* return EOS at the end of segments? */ 119 do_virtual, /* deliver 'N''X' over virtual seqs */ 120 gapIsZero, /* deliver 0 for ncbi4na over virtual seqs - also needs do_virtual */ 121 eos, /* set when comp strand tries to back off */ 122 isa_null, /* TRUE if seqport represents a NULL location */ 123 isa_virtual, /* represents a virtual interval or Bioseq */ 124 backing; /* signal to SeqPortSeek for backing up a layered SeqPort */ 125 SeqMapTablePtr smtp; /* for mapping to requested alphabet */ 126 SeqCodeTablePtr sctp; /* for getting symbols */ 127 Uint1 newcode, /* requested output code */ 128 oldcode; /* current input seq code (0 if not raw) */ 129 Uint1 byte, /* current byte in buf */ 130 bc, /* value to start bitctr */ 131 bitctr, /* current shift */ 132 lshift, /* amount to left shift on decompact */ 133 rshift, /* amount to right shift residue value */ 134 mask; /* mask for compact byte */ 135 struct seqport PNTR curr , /* current active seqport if seg or ref */ 136 PNTR segs, /* segments if seg or ref */ 137 PNTR next; /* if part of a segment chain */ 138 SPCachePtr cache; 139 SPCacheQPtr cacheq; /* used instead of cache for ncbi2na or ncbi4na to iupacna fasta lookup */ 140 ByteStorePtr bp; /* used by both raw and delta seq pieces */ 141 } SeqPort, PNTR SeqPortPtr; 142 143 /***************************************************************************** 144 * 145 * Structure used in SeqPort DNA Compression 146 * 147 *****************************************************************************/ 148 typedef struct SPCompress { 149 Uint1Ptr buffer; /* Buffer with 2na DNA sequence */ 150 Int4 type; /* Type of stored sequence */ 151 Int4 residues; /* Number of residues in buffer */ 152 Int4 used; /* Number of bytes used in buffer */ 153 Int4 allocated; /* Number of bytes allocated in buffer */ 154 Uint4Ptr lbytes; /* Ambiguity information */ 155 } SPCompress, PNTR SPCompressPtr; 156 157 158 NLM_EXTERN SeqPortPtr SeqPortNew PROTO((BioseqPtr bsp, Int4 start, Int4 stop, Uint1 strand, Uint1 code)); 159 NLM_EXTERN SeqPortPtr SeqPortNewByLoc PROTO((SeqLocPtr seqloc, Uint1 code)); 160 NLM_EXTERN SeqPortPtr SeqPortFree PROTO((SeqPortPtr spp)); 161 NLM_EXTERN Int4 SeqPortTell PROTO((SeqPortPtr spp)); 162 NLM_EXTERN Int2 SeqPortSeek PROTO((SeqPortPtr spp, Int4 offset, Int2 origin)); 163 NLM_EXTERN Int4 SeqPortLen PROTO((SeqPortPtr spp)); 164 NLM_EXTERN Uint1 LIBCALL SeqPortGetResidue PROTO((SeqPortPtr spp)); 165 NLM_EXTERN Int2 LIBCALL SeqPortRead PROTO((SeqPortPtr spp, BytePtr buf, Int2 len)); 166 NLM_EXTERN Uint1 GetGapCode PROTO((Uint1 seqcode)); 167 NLM_EXTERN Boolean LIBCALL SeqPortSetUpFields PROTO((SeqPortPtr spp, Int4 start, Int4 stop, Uint1 strand, Uint1 newcode)); 168 NLM_EXTERN Boolean LIBCALL SeqPortSetUpAlphabet PROTO((SeqPortPtr spp, Uint1 curr_code, Uint1 newcode)); 169 170 /******************************************************************************* 171 * 172 * SeqPortStream (bsp, flags, userdata, proc) 173 * SeqPortStreamInt (bsp, start, stop, strand, flags, userdata, proc) 174 * SeqPortStreamLoc (slp, flags, userdata, proc) 175 * SeqPortStreamLit (lit, flags, userdata, proc) 176 * Efficient functions to stream through sequence 177 * 178 ********************************************************************************/ 179 180 typedef void (LIBCALLBACK *SeqPortStreamProc) ( 181 CharPtr sequence, 182 Pointer userdata 183 ); 184 185 typedef unsigned long StreamFlgType; 186 187 #define STREAM_EXPAND_GAPS 1 188 #define GAP_TO_SINGLE_DASH 2 189 #define EXPAND_GAPS_TO_DASHES 3 190 191 #define KNOWN_GAP_AS_PLUS 4 192 #define SEQ_GAP_AS_TILDE 8 193 194 #define SUPPRESS_VIRT_SEQ 16 195 #define STREAM_VIRT_AS_PLUS 32 196 197 #define STREAM_CORRECT_INVAL 64 198 199 #define STREAM_ALLOW_NEG_GIS 128 /* for internal use only by NCBI ID group */ 200 201 #define STREAM_HTML_SPANS 256 /* show span tags at begining of each line */ 202 203 #define STREAM_ALL_FASTA_IDS 512 /* in FASTA streamer, show all Seq-ids */ 204 205 #define STREAM_TAGGED_DEFLINE 1024 /* in FASTA streamer, show [key=value] pairs */ 206 207 NLM_EXTERN Int4 SeqPortStream ( 208 BioseqPtr bsp, 209 StreamFlgType flags, 210 Pointer userdata, 211 SeqPortStreamProc proc 212 ); 213 214 NLM_EXTERN Int4 SeqPortStreamInt ( 215 BioseqPtr bsp, 216 Int4 start, 217 Int4 stop, 218 Uint1 strand, 219 StreamFlgType flags, 220 Pointer userdata, 221 SeqPortStreamProc proc 222 ); 223 224 NLM_EXTERN Int4 SeqPortStreamLoc ( 225 SeqLocPtr slp, 226 StreamFlgType flags, 227 Pointer userdata, 228 SeqPortStreamProc proc 229 ); 230 231 NLM_EXTERN Int4 SeqPortStreamLit ( 232 SeqLitPtr lit, 233 StreamFlgType flags, 234 Pointer userdata, 235 SeqPortStreamProc proc 236 ); 237 238 /******************************************************************************* 239 * 240 * StreamCacheSetup (bsp, slp, flags, scp) 241 * StreamCacheGetResidue (scp) 242 * StreamCacheSetPosition (scp, pos) 243 * SeqPort functional replacement implemented on top of SeqPortStreams 244 * 245 ********************************************************************************/ 246 247 typedef struct streamcache { 248 BioseqPtr bsp; 249 SeqLocPtr slp; 250 Char buf [4004]; 251 Int2 ctr; 252 Int2 total; 253 Int4 offset; 254 Int4 length; 255 StreamFlgType flags; 256 Boolean failed; 257 } StreamCache, PNTR StreamCachePtr; 258 259 NLM_EXTERN Boolean StreamCacheSetup ( 260 BioseqPtr bsp, 261 SeqLocPtr slp, 262 StreamFlgType flags, 263 StreamCache PNTR scp 264 ); 265 266 NLM_EXTERN Uint1 StreamCacheGetResidue ( 267 StreamCache PNTR scp 268 ); 269 270 NLM_EXTERN Boolean StreamCacheSetPosition ( 271 StreamCache PNTR scp, 272 Int4 pos 273 ); 274 275 /* 276 the following functions are for quick alphabet expansion, and require buffers 277 allocated with 4-byte or 2-byte alignment, because they cast 2 or 4 bytes into 278 Uint2 or Uint4 for fast integer copying. 279 */ 280 281 NLM_EXTERN Uint4Ptr LIBCALL MapNa2ByteToIUPACString PROTO((Uint1Ptr bytep, Uint4Ptr buf, Int4 total)); 282 NLM_EXTERN Uint2Ptr LIBCALL MapNa4ByteToIUPACString PROTO((Uint1Ptr bytep, Uint2Ptr buf, Int4 total)); 283 NLM_EXTERN Uint2Ptr LIBCALL MapNa4ByteToIUPACplusGapString PROTO((Uint1Ptr bytep, Uint2Ptr buf, Int4 total)); 284 NLM_EXTERN Uint2Ptr LIBCALL MapNa2ByteToNa4String PROTO((Uint1Ptr bytep, Uint2Ptr buf, Int4 total)); 285 NLM_EXTERN Uint4Ptr LIBCALL MapNa2ByteTo4BitString PROTO((Uint1Ptr bytep, Uint4Ptr buf, Int4 total)); 286 NLM_EXTERN Uint2Ptr LIBCALL MapNa4ByteTo4BitString PROTO((Uint1Ptr bytep, Uint2Ptr buf, Int4 total)); 287 288 289 /***************************************************************************** 290 * 291 * SeqPortSetValues(spp) 292 * Copies the values is_circle, is_seg, and do_virtual from spp to 293 * any dependent SeqPortPtrs it contains. This is necessary for segmented 294 * reference, or delta types of Bioseqs and on SeqPortNewByLoc() 295 * 296 * SeqPortSet_... functions call this function 297 * 298 *****************************************************************************/ 299 NLM_EXTERN Boolean LIBCALL SeqPortSetValues (SeqPortPtr spp); 300 301 /***************************************************************************** 302 * 303 * SeqPortSet_is_circle(spp, value) 304 * if (value) is TRUE, then SeqPort will go around the ends of a circular 305 * molecule without stopping. 306 * Default is FALSE 307 * 308 *****************************************************************************/ 309 NLM_EXTERN Boolean LIBCALL SeqPortSet_is_circle (SeqPortPtr spp, Boolean value); 310 311 /***************************************************************************** 312 * 313 * SeqPortSet_is_seg(spp, value) 314 * if (value) is TRUE, then SeqPort will return SEQPORT_EOS whenever it 315 * crosses a segment boundary in the SeqPort. When is_seg = TRUE, 316 * SEQPORT_VIRT will NOT be returned on virtual or NULL segments. Instead 317 * SEQPORT_EOS will be received only as the SeqPort passes over those 318 * segments. 319 * Default is FALSE, SeqPort will NEVER return SEQPORT_EOS 320 * 321 *****************************************************************************/ 322 NLM_EXTERN Boolean LIBCALL SeqPortSet_is_seg (SeqPortPtr spp, Boolean value); 323 324 /***************************************************************************** 325 * 326 * SeqPortSet_do_virtual(spp, value) 327 * if (value) is TRUE, then SeqPort will return the appropriate ambiguity 328 * character (e.g. "N" or "X") for the length of a virtual Bioseq or delta 329 * gap segment. It will still return SEQPORT_VIRT for a "NULL" segment 330 * (ie. gap of unknown length). 331 * 332 * Default is FALSE. In this case SeqPort will return a single SEQPORT_VIRT 333 * when encountering a virtual Bioseq, just as for a "NULL" segment. 334 * 335 *****************************************************************************/ 336 NLM_EXTERN Boolean LIBCALL SeqPortSet_do_virtual (SeqPortPtr spp, Boolean value); 337 NLM_EXTERN Boolean LIBCALL SeqPortSet_do_virtualEx (SeqPortPtr spp, Boolean value, Boolean gapIsZero); 338 339 /***************************************************************************** 340 * 341 * BioseqHash(bsp) 342 * Computes a (almost) unique hash code for a bioseq 343 * 344 *****************************************************************************/ 345 NLM_EXTERN Uint4 BioseqHash PROTO((BioseqPtr bsp)); 346 347 /***************************************************************************** 348 * 349 * ProteinFromCdRegion(sfp, include_stop) 350 * produces a ByteStorePtr containing the protein sequence in 351 * ncbieaa code for the CdRegion sfp. If include_stop, will translate 352 * through stop codons. If NOT include_stop, will stop at first stop 353 * codon and return the protein sequence NOT including the terminating 354 * stop. Supports reading frame, alternate genetic codes, and code breaks 355 * in the CdRegion. Removes trailing "X" on partial translation. 356 * 357 * if no explict partial at either end, but feature is 358 * annotated as partial, then guess should use internal 359 * amino acid code 360 * 361 *****************************************************************************/ 362 NLM_EXTERN ByteStorePtr ProteinFromCdRegion PROTO(( SeqFeatPtr sfp, Boolean include_stop)); 363 NLM_EXTERN ByteStorePtr ProteinFromCdRegionEx PROTO((SeqFeatPtr sfp, Boolean include_stop, Boolean remove_trailingX)); 364 NLM_EXTERN ByteStorePtr ProteinFromCdRegionExEx PROTO((SeqFeatPtr sfp, Boolean include_stop, Boolean remove_trailingX, BoolPtr altStartP, Boolean farProdFetchOK)); 365 NLM_EXTERN ByteStorePtr ProteinFromCdRegionExWithTrailingCodonHandling PROTO((SeqFeatPtr sfp, Boolean include_stop, Boolean remove_trailingX, Boolean no_stop_at_end_of_complete_cds)); 366 367 /***************************************************************************** 368 * 369 * Uint1 AAForCodon (Uint1Ptr codon, CharPtr codes) 370 * codon is 3 values in ncbi4na code 371 * codes is the geneic code array to use 372 * MUST have 'X' as unknown amino acid 373 * 374 *****************************************************************************/ 375 NLM_EXTERN Uint1 AAForCodon PROTO((Uint1Ptr codon, CharPtr codes)); 376 377 /***************************************************************************** 378 * 379 * Uint1 IndexForCodon (codon, code) 380 * returns index into genetic codes codon array, give 3 bases of the 381 * codon in any alphabet 382 * returns INVALID_RESIDUE on failure 383 * 384 *****************************************************************************/ 385 NLM_EXTERN Uint1 IndexForCodon PROTO((Uint1Ptr codon, Uint1 code)); 386 387 /***************************************************************************** 388 * 389 * Boolean CodonForIndex (index, code, codon) 390 * Fills codon (3 Uint1 array) with codon corresponding to index, 391 * in sequence alphabet code. 392 * Index is the Genetic code index. 393 * returns TRUE on success. 394 * 395 *****************************************************************************/ 396 NLM_EXTERN Boolean CodonForIndex PROTO((Uint1 index, Uint1 code, Uint1Ptr codon)); 397 398 /***************************************************************************** 399 * 400 * Int2 GetFrameFromLoc (slp) 401 * returns 1,2,3 if can find the frame 402 * 0 if not 403 * 404 *****************************************************************************/ 405 NLM_EXTERN Uint1 GetFrameFromLoc PROTO((SeqLocPtr slp)); 406 407 /****************************************************************** 408 * 409 * dnaLoc_to_aaLoc(sfp, dna_loc, merge, frame, allowTerminator) 410 * map a SeqLoc on the DNA sequence 411 * to a Seq-loc in the protein sequence 412 * through a CdRegion feature 413 * if (merge) adjacent intervals on the amino acid sequence 414 * are merged into one. This should be the usual case. 415 * We try to report the frame if the caller provides a suitable pointer 416 * If allowTerminator, can map the termination codon as a legal location 417 * 418 ******************************************************************/ 419 NLM_EXTERN SeqLocPtr LIBCALL dnaLoc_to_aaLoc(SeqFeatPtr sfp, SeqLocPtr dna_loc, Boolean merge, Int4Ptr frame, Boolean allowTerminator); 420 421 /****************************************************************** 422 * 423 * productLoc_to_locationLoc(sfp, product_loc) 424 * map a SeqLoc on the product sequence 425 * to a Seq-loc in the location sequence 426 * through a feature 427 * 428 * this more general function is now called by 429 * aaLoc_to_dnaLoc() 430 * 431 ******************************************************************/ 432 NLM_EXTERN SeqLocPtr LIBCALL productLoc_to_locationLoc(SeqFeatPtr sfp, SeqLocPtr product_loc); 433 434 /****************************************************************** 435 * 436 * aaLoc_to_dnaLoc(sfp, aa_loc) 437 * map a SeqLoc on the amino acid sequence 438 * to a Seq-loc in the DNA sequence 439 * through a CdRegion feature 440 * 441 ******************************************************************/ 442 NLM_EXTERN SeqLocPtr LIBCALL aaLoc_to_dnaLoc(SeqFeatPtr sfp, SeqLocPtr aa_loc); 443 444 /****************************************************************** 445 * 446 * aaFeatLoc_to_dnaFeatLoc(sfp, aa_loc) 447 * map a SeqLoc on the amino acid sequence 448 * to a Seq-loc in the DNA sequence 449 * through a CdRegion feature 450 * 451 * uses aaLoc_to_dnaLoc() but does additional checks to 452 * extend dnaLoc at either end to compensate for positions in 453 * the dna which do not corresspond to the amino acid sequence 454 * (partial codons which are not translated). 455 * 456 ******************************************************************/ 457 NLM_EXTERN SeqLocPtr LIBCALL aaFeatLoc_to_dnaFeatLoc(SeqFeatPtr sfp, SeqLocPtr aa_loc); 458 459 /****************************************************************** 460 * 461 * productInterval_to_locationIntervals(sfp, aa_start, aa_stop) 462 * map the amino acid sequence to a chain of Seq-locs in the 463 * DNA sequence through a CdRegion feature 464 * 465 ******************************************************************/ 466 NLM_EXTERN SeqLocPtr LIBCALL productInterval_to_locationIntervals (SeqFeatPtr sfp, Int4 aa_start, Int4 aa_stop, Boolean aa_partialn); 467 468 /*-------------- BioseqRevComp () ---------------------------*/ 469 /*********************************************************************** 470 * BioseqRevComp: Takes the nucleic acid sequence from Bioseq 471 * Entry and gives the reverse complement sequence in place 472 * Does not change features. 473 ************************************************************************/ 474 NLM_EXTERN Boolean LIBCALL BioseqRevComp (BioseqPtr bsp); 475 476 477 /*-------------- BioseqComplement () ---------------------------*/ 478 /*********************************************************************** 479 * BioseqComplement: Takes the nucleic acid sequence from Bioseq 480 * Entry and gives the complement sequence in place 481 * Does not change features. 482 ************************************************************************/ 483 NLM_EXTERN Boolean LIBCALL BioseqComplement (BioseqPtr bsp); 484 485 486 /*-------------- BioseqReverse () ---------------------------*/ 487 /*********************************************************************** 488 * BioseqReverse: Takes nucleic acid sequence from Bioseq Entry and 489 * reverses the whole sequence in place 490 * Does not change features. 491 ************************************************************************/ 492 NLM_EXTERN Boolean LIBCALL BioseqReverse (BioseqPtr bsp); 493 494 495 /*-------------- ContigRevComp () ---------------------------*/ 496 /*********************************************************************** 497 * ContigRevComp: Reverse complement segmented or delta bioseq 498 ************************************************************************/ 499 NLM_EXTERN Boolean LIBCALL ContigRevComp (BioseqPtr bsp); 500 501 /***************************************************************************** 502 * 503 * SPCompressNew(void); - allocated memory for SPCompress structure 504 * 505 *****************************************************************************/ 506 NLM_EXTERN SPCompressPtr SPCompressNew(void); 507 508 /***************************************************************************** 509 * 510 * SPCompressFree(SPCompressPtr spc); - free SPCompress structure 511 * 512 *****************************************************************************/ 513 NLM_EXTERN void SPCompressFree(SPCompressPtr spc); 514 515 /***************************************************************************** 516 * 517 * SPCompressDNA(SeqPortPtr spp); 518 * converts a ncbi4na taken from spp into ncbi2na 519 * buffer stored inside SPCompress structue together 520 * with ambiguity information 521 * returns pointer SPCompress structure or NULL if error 522 * 523 * NOTE: In this function we do not know - what is length 524 * of sequence to compress. Terminated flag for this 525 * function is SEQPORT_EOF returned from spp. 526 * 527 *****************************************************************************/ 528 NLM_EXTERN SPCompressPtr SPCompressDNA(SeqPortPtr spp); 529 530 /***************************************************************************** 531 * 532 * SPRebuildDNA(SPCompressPtr spc); 533 * translates spc ncbi2na encoding buffer into 534 * spc ncbi4na encoding buffer with rebuild ambiguities 535 * 536 * spc - must be valid SPCompress structure returned 537 * from SPCompressDNA() function in ncbi2na encoding 538 * 539 *****************************************************************************/ 540 NLM_EXTERN Boolean SPRebuildDNA(SPCompressPtr spc); 541 542 /***************************************************************************** 543 * 544 * ComposeCodonsRecognizedString (trna, buf, buflen); 545 * Copies codon recognized string to buf, returns number of codons 546 * 547 *****************************************************************************/ 548 549 NLM_EXTERN Int2 ComposeCodonsRecognizedString (tRNAPtr trna, CharPtr buf, size_t buflen); 550 551 /***************************************************************************** 552 * 553 * TransTableNew (Int2 genCode); 554 * Initializes TransTable finite state machine for 6-frame translation 555 * and open reading frame search, allowing nucleotide ambiguity characters 556 * 557 *****************************************************************************/ 558 559 typedef struct fsatranstable { 560 Int2 genCode; 561 Char ncbieaa [65]; 562 Char sncbieaa [65]; 563 Uint2 nextBase [3376]; 564 Char aminoAcid [3376] [2]; 565 Char orfStart [3376] [2]; 566 Uint1 basesToIdx [256]; 567 } TransTable, PNTR TransTablePtr; 568 569 /* allocate 6-frame finite state translation table and initialize with indicated genetic code */ 570 NLM_EXTERN TransTablePtr TransTableNew (Int2 genCode); 571 NLM_EXTERN TransTablePtr TransTableFree (TransTablePtr tbl); 572 NLM_EXTERN void TransTableFreeAll (void); 573 574 #define TTBL_TOP_STRAND 0 575 #define TTBL_BOT_STRAND 1 576 577 #define TTBL_ATG_STATE 48 578 #define TTBL_CAT_STATE 229 579 580 /* macros for using finite state machine for 6-frame translation */ 581 #define NextCodonState(tbl,cur,ch) (tbl->nextBase [(int) (Uint2) cur] + tbl->basesToIdx [(int) (Uint1) ch]) 582 #define GetCodonResidue(tbl,cur,stnd) (tbl->aminoAcid [(int) (Uint2) cur] [stnd]) 583 #define GetStartResidue(tbl,cur,stnd) (tbl->orfStart [(int) (Uint2) cur] [stnd]) 584 #define IsOrfStart(tbl,cur,stnd) ((Boolean) (GetStartResidue(tbl,cur,stnd) == 'M')) 585 #define IsAmbigStart(tbl,cur,stnd) ((Boolean) (GetStartResidue(tbl,cur,stnd) == 'X')) 586 #define IsAnyStart(tbl,cur,stnd) ((Boolean) (GetStartResidue(tbl,cur,stnd) != '-')) 587 #define IsOrfStop(tbl,cur,stnd) ((Boolean) (GetCodonResidue(tbl,cur,stnd) == '*')) 588 #define IsATGStart(tbl,cur,stnd) ((Boolean) (IsOrfStart(tbl,cur,stnd) && (stnd ? (cur == TTBL_CAT_STATE) : (cur == TTBL_ATG_STATE)))) 589 #define IsAltStart(tbl,cur,stnd) ((Boolean) (IsOrfStart(tbl,cur,stnd) && (stnd ? (cur != TTBL_CAT_STATE) : (cur != TTBL_ATG_STATE)))) 590 591 typedef void (LIBCALLBACK *TransTableMatchProc) (Int4 position, Char residue, Boolean atgStart, Boolean altStart, Boolean orfStop, Int2 frame, Uint1 strand, Pointer userdata); 592 593 /* convenience function calls user callback for each strand of entire bioseq */ 594 595 NLM_EXTERN void TransTableProcessBioseq ( 596 TransTablePtr tbl, 597 TransTableMatchProc matchProc, 598 Pointer userdata, 599 BioseqPtr bsp 600 ); 601 602 /* trans table translation functions can be passed cds feature or individual parameters */ 603 604 NLM_EXTERN ByteStorePtr TransTableTranslateCdRegion ( 605 TransTablePtr PNTR tblptr, 606 SeqFeatPtr cds, 607 Boolean include_stop, 608 Boolean remove_trailingX, 609 Boolean no_stop_at_end_of_complete_cds 610 ); 611 612 NLM_EXTERN ByteStorePtr TransTableTranslateCdRegionEx ( 613 TransTablePtr PNTR tblptr, 614 SeqFeatPtr cds, 615 Boolean include_stop, 616 Boolean remove_trailingX, 617 Boolean no_stop_at_end_of_complete_cds, 618 BoolPtr altStartP, 619 Boolean farProdFetchOK 620 ); 621 622 NLM_EXTERN ByteStorePtr TransTableTranslateSeqLoc ( 623 TransTablePtr PNTR tblptr, 624 SeqLocPtr location, 625 Int2 genCode, 626 Uint1 frame, 627 Boolean include_stop, 628 Boolean remove_trailingX 629 ); 630 631 /* returns string of bases to translate */ 632 633 NLM_EXTERN CharPtr ReadCodingRegionBases ( 634 SeqLocPtr location, 635 Int4 len, 636 Uint1 frame, 637 Int4Ptr totalP 638 ); 639 640 /* allow reuse of translation tables by saving as AppProperty, avoids unnecessary initializations */ 641 642 NLM_EXTERN TransTablePtr PersistentTransTableByGenCode ( 643 Int2 genCode 644 ); 645 646 NLM_EXTERN TransTablePtr PersistentTransTableByCdRegion ( 647 SeqFeatPtr cds 648 ); 649 650 NLM_EXTERN ValNodePtr MakeCodeBreakList ( 651 SeqLocPtr cdslocation, 652 Int4 len, 653 CodeBreakPtr cbp, 654 Uint1 frame 655 ); 656 657 /***************************************************************************** 658 * 659 * SeqSearch 660 * Initializes SeqSearch finite state machine for sequence searching 661 * Based on Practical Algorithms for Programmers by Binstock and Rex 662 * 663 *****************************************************************************/ 664 665 struct SeqSearch; 666 typedef struct SeqSearch* SeqSearchPtr; 667 668 typedef void (LIBCALLBACK *SeqSearchMatchProc) (Int4 position, CharPtr name, CharPtr pattern, Int2 cutSite, Uint1 strand, Pointer userdata); 669 670 /* create empty nucleotide sequence search finite state machine */ 671 672 NLM_EXTERN SeqSearchPtr SeqSearchNew ( 673 SeqSearchMatchProc matchproc, 674 Pointer userdata 675 ); 676 677 /* 678 add nucleotide pattern or restriction site to sequence search finite state 679 machine, expands using ambiguity codes R = A and G, H = A, C and T, etc. 680 */ 681 682 typedef unsigned long SearchFlgType; 683 684 #define SEQ_SEARCH_JUST_TOP_STRAND 1 685 #define SEQ_SEARCH_EXPAND_PATTERN 2 686 #define SEQ_SEARCH_ALLOW_MISMATCH 4 687 688 NLM_EXTERN void SeqSearchAddNucleotidePattern ( 689 SeqSearchPtr tbl, 690 CharPtr name, 691 CharPtr pattern, 692 Int2 cutSite, 693 SearchFlgType flags 694 ); 695 696 /* program passes each character in turn to finite state machine */ 697 698 NLM_EXTERN void SeqSearchProcessCharacter ( 699 SeqSearchPtr tbl, 700 Char ch 701 ); 702 703 /* convenience function calls SeqSearchProcessCharacter for entire bioseq */ 704 705 NLM_EXTERN void SeqSearchProcessBioseq ( 706 SeqSearchPtr tbl, 707 BioseqPtr bsp 708 ); 709 710 /* reset state and position to allow another run with same search patterns */ 711 712 NLM_EXTERN void SeqSearchReset ( 713 SeqSearchPtr tbl 714 ); 715 716 /* clean up sequence search finite state machine allocated memory */ 717 718 NLM_EXTERN SeqSearchPtr SeqSearchFree ( 719 SeqSearchPtr tbl 720 ); 721 722 723 /***************************************************************************** 724 * 725 * ProtSearch 726 * Initializes ProtSearch finite state machine for sequence searching 727 * Based on Practical Algorithms for Programmers by Binstock and Rex 728 * 729 *****************************************************************************/ 730 731 struct ProtSearch; 732 typedef struct ProtSearch* ProtSearchPtr; 733 734 typedef void (LIBCALLBACK *ProtSearchMatchProc) (Int4 position, CharPtr name, CharPtr pattern, Pointer userdata); 735 736 /* create empty protein sequence search finite state machine */ 737 738 NLM_EXTERN ProtSearchPtr ProtSearchNew ( 739 ProtSearchMatchProc matchproc, 740 Pointer userdata 741 ); 742 743 /* 744 add protein pattern to protein sequence search finite state machine, 745 expands using ambiguity codes B = D and N, Z = E and Q, etc. 746 */ 747 748 NLM_EXTERN void ProtSearchAddProteinPattern ( 749 ProtSearchPtr tbl, 750 CharPtr name, 751 CharPtr pattern, 752 SearchFlgType flags 753 ); 754 755 /* program passes each character in turn to finite state machine */ 756 757 NLM_EXTERN void ProtSearchProcessCharacter ( 758 ProtSearchPtr tbl, 759 Char ch 760 ); 761 762 /* convenience function calls ProtSearchProcessCharacter for entire bioseq */ 763 764 NLM_EXTERN void ProtSearchProcessBioseq ( 765 ProtSearchPtr tbl, 766 BioseqPtr bsp 767 ); 768 769 770 /* reset state and position to allow another run with same search patterns */ 771 772 NLM_EXTERN void ProtSearchReset ( 773 ProtSearchPtr tbl 774 ); 775 776 /* clean up sequence search finite state machine allocated memory */ 777 778 NLM_EXTERN ProtSearchPtr ProtSearchFree ( 779 ProtSearchPtr tbl 780 ); 781 782 783 /***************************************************************************** 784 * 785 * Convenience functions for genome processing use BioseqLockById to get sequence 786 * record (perhaps with phrap quality score graphs) so fetching from some network 787 * or local server must be enabled, or sequences must already be in memory. 788 * 789 *****************************************************************************/ 790 791 NLM_EXTERN CharPtr GetSequenceByBsp ( 792 BioseqPtr bsp 793 ); 794 795 NLM_EXTERN CharPtr GetSequenceByIdOrAccnDotVer ( 796 SeqIdPtr sip, 797 CharPtr accession, 798 Boolean is_na 799 ); 800 801 NLM_EXTERN CharPtr GetSequenceByFeature ( 802 SeqFeatPtr sfp 803 ); 804 805 NLM_EXTERN CharPtr GetSequenceByLocation ( 806 SeqLocPtr slp 807 ); 808 809 NLM_EXTERN CharPtr GetSequenceByBspEx ( 810 BioseqPtr bsp, 811 StreamFlgType flags 812 ); 813 814 NLM_EXTERN CharPtr GetSequenceByIdOrAccnDotVerEx ( 815 SeqIdPtr sip, 816 CharPtr accession, 817 Boolean is_na, 818 StreamFlgType flags 819 ); 820 821 NLM_EXTERN CharPtr GetSequenceByFeatureEx ( 822 SeqFeatPtr sfp, 823 StreamFlgType flags 824 ); 825 826 NLM_EXTERN CharPtr GetSequenceByLocationEx ( 827 SeqLocPtr slp, 828 StreamFlgType flags 829 ); 830 831 NLM_EXTERN CharPtr GetDNAbyAccessionDotVersion ( 832 CharPtr accession 833 ); 834 835 NLM_EXTERN BytePtr GetScoresbyAccessionDotVersion ( 836 CharPtr accession, 837 Int4Ptr bsplength 838 ); 839 840 NLM_EXTERN BytePtr GetScoresbySeqId ( 841 SeqIdPtr sip, 842 Int4Ptr bsplength 843 ); 844 845 /***************************************************************************** 846 * 847 * ConvertNsToGaps 848 * Assumes string of Ns means a gap of known length 849 * 850 *****************************************************************************/ 851 852 NLM_EXTERN void ConvertNsToGaps ( 853 BioseqPtr bsp, 854 Pointer userdata 855 ); 856 857 /************************************************************** 858 * 859 * Returns a protein molecular weight for a SeqLoc 860 * If it cannot calculate the value it returns -1.0 861 * If sequence contains X, J, or O it fails 862 * 863 ***************************************************************/ 864 NLM_EXTERN FloatHi MolWtForLoc (SeqLocPtr slp); 865 866 NLM_EXTERN FloatHi MolWtForBsp (BioseqPtr bsp); 867 868 NLM_EXTERN FloatHi MolWtForStr (CharPtr str); 869 870 871 872 NLM_EXTERN Boolean LIBCALL ReverseSeqData (Uint1 seqtype, Int4 seqlen, SeqDataPtr sdp); 873 NLM_EXTERN Boolean ComplementSeqData (Uint1 seqtype, Int4 seqlen, SeqDataPtr sdp); 874 875 876 #ifdef __cplusplus 877 } 878 #endif 879 880 #undef NLM_EXTERN 881 #ifdef NLM_EXPORT 882 #define NLM_EXTERN NLM_EXPORT 883 #else 884 #define NLM_EXTERN 885 #endif 886 887 #endif 888