1 /*  seqport.h
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * File Name:  seqport.h
27 *
28 * Author:  James Ostell
29 *
30 * Version Creation Date: 7/13/91
31 *
32 * $Revision: 6.67 $
33 *
34 * File Description:  Ports onto Bioseqs
35 *
36 * Modifications:
37 * --------------------------------------------------------------------------
38 * Date	   Name        Description of modification
39 * -------  ----------  -----------------------------------------------------
40 *
41 * ==========================================================================
42 */
43 
44 #ifndef _NCBI_Seqport_
45 #define _NCBI_Seqport_
46 
47 #include <sequtil.h>
48 
49 #undef NLM_EXTERN
50 #ifdef NLM_IMPORT
51 #define NLM_EXTERN NLM_IMPORT
52 #else
53 #define NLM_EXTERN extern
54 #endif
55 
56 #ifdef __cplusplus
57 extern "C" {
58 #endif
59 
60 /*****************************************************************************
61 *
62 *   SeqPort
63 *       will attach only to a Bioseq (SeqPortNew) or to a Seq-loc
64 *          (SeqPortNewByLoc) in any selected alphabet
65 *       You can then treat the sequence or location as a single contiguous
*          piece. You can Seek (SeqPortSeek) to any location. You can call
*          SeqPortGetResidue, which gets the residue at the current position
*          and seeks to the next residue. You can read a buffer of residues.
69 *
70 *       Special characters are returned from SeqPortGetResidue
71 *   		SEQPORT_EOF   (end of sequence reached)
72 *           SEQPORT_VIRT  (hit a virtual sequence or gap)
73 *           INVALID_RESIDUE (residue not valid in original Bioseq)
74 *   		SEQPORT_EOS   (end of segment, not normally seen)
75 *
76 *       Some defined values are provided for the Int4 values passed as
77 *           start or stop
78 *       FIRST_RESIDUE   0	(first residue of sequence)
79 *       LAST_RESIDUE    -1  (last residue of sequence.. interpreted as
80 *                              length - 1)
81 *       APPEND_RESIDUE  -2  (interpreted as length.. off the end of the
82 *                              sequence. Not valid for SeqPort.. only
83 *                              used by editing functions )
84 *
85 *
86 *
87 *****************************************************************************/
/* Sentinel values returned by SeqPortGetResidue in place of a residue. */
#define SEQPORT_EOF 253       /* end of sequence data */
#define SEQPORT_EOS 252       /* end of segment */
#define SEQPORT_VIRT 251      /* skipping virtual sequence or gap */

/* Non-zero when x is an ordinary residue value (at or below 250, i.e. below
   the SEQPORT_* sentinels defined above).
   The argument is parenthesized so that an expression containing operators
   of lower precedence than <= (for example a | b or c ? d : e) is compared
   as a whole instead of only its last operand. */
#define IS_residue(x) ((x) <= 250)
92 
/* Symbolic positions accepted where an Int4 start or stop is passed.
   The negative values are parenthesized so they expand as a single operand
   in any surrounding expression. */
#define FIRST_RESIDUE 0        /* first residue of sequence */
#define LAST_RESIDUE (-1)      /* last residue of sequence; interpreted as length - 1 */
#define APPEND_RESIDUE (-2)    /* interpreted as length (off the end of the sequence);
                                  not valid for SeqPort, only used by editing functions */
96 
97 typedef struct spcache {
98 	Int2 ctr, total;
99 	Uint1 buf[100];
100 } SPCache, PNTR SPCachePtr;
101 
102 typedef struct spcacheq {
103 	Int2 ctr, total;
104 	Char buf[400];
105 } SPCacheQ, PNTR SPCacheQPtr;
106 
107 typedef struct seqport {
108     BioseqPtr bsp;             /* 1 seqentry per port */
109 	Boolean locked;              /* TRUE if Lock function used */
110     Int4 start, stop,            /* region of bsp covered */
111         curpos,                  /* current position 0-(totlen-1) */
112         totlen,                  /* total length of covered region */
113 		bytepos;                 /* current byte position in bsp->data */
114     NumberingPtr currnum;        /* current numbering info */
115     Uint1 strand,                /* as in seqloc */
116         lastmsg;                 /* used by SeqPortRead() */
117     Boolean is_circle ,          /* go around the end of a circle? */
118             is_seg ,             /* return EOS at the end of segments? */
119             do_virtual,          /* deliver 'N''X' over virtual seqs */
120             gapIsZero,           /* deliver 0 for ncbi4na over virtual seqs - also needs do_virtual */
121             eos,                 /* set when comp strand tries to back off */
122 			isa_null,            /* TRUE if seqport represents a NULL location */
123 			isa_virtual,         /* represents a virtual interval or Bioseq */
124 			backing;             /* signal to SeqPortSeek for backing up a layered SeqPort */
125     SeqMapTablePtr smtp;         /* for mapping to requested alphabet */
126     SeqCodeTablePtr sctp;        /* for getting symbols */
127     Uint1 newcode,               /* requested output code */
128           oldcode;               /* current input seq code (0 if not raw) */
129     Uint1 byte,                    /* current byte in buf */
130         bc,                      /* value to start bitctr */
131         bitctr,                  /* current shift */
132         lshift,                  /* amount to left shift on decompact */
133         rshift,                  /* amount to right shift residue value */
134         mask;                    /* mask for compact byte */
135     struct seqport PNTR curr ,   /* current active seqport if seg or ref */
136         PNTR segs,                    /* segments if seg or ref */
137         PNTR next;                    /* if part of a segment chain */
138 	SPCachePtr cache;
139 	SPCacheQPtr cacheq;          /* used instead of cache for ncbi2na or ncbi4na to iupacna fasta lookup */
140 	ByteStorePtr bp;             /* used by both raw and delta seq pieces */
141 } SeqPort, PNTR SeqPortPtr;
142 
143 /*****************************************************************************
144 *
145 *  Structure used in SeqPort DNA Compression
146 *
147 *****************************************************************************/
148 typedef struct SPCompress {
149   Uint1Ptr buffer;  /* Buffer with 2na DNA sequence */
150   Int4 type;        /* Type of stored sequence */
151   Int4 residues;    /* Number of residues in buffer */
152   Int4 used;        /* Number of bytes used in buffer */
153   Int4 allocated;   /* Number of bytes allocated in buffer */
154   Uint4Ptr lbytes;  /* Ambiguity information */
155 } SPCompress, PNTR SPCompressPtr;
156 
157 
158 NLM_EXTERN SeqPortPtr SeqPortNew PROTO((BioseqPtr bsp, Int4 start, Int4 stop, Uint1 strand, Uint1 code));
159 NLM_EXTERN SeqPortPtr SeqPortNewByLoc PROTO((SeqLocPtr seqloc, Uint1 code));
160 NLM_EXTERN SeqPortPtr SeqPortFree PROTO((SeqPortPtr spp));
161 NLM_EXTERN Int4 SeqPortTell PROTO((SeqPortPtr spp));
162 NLM_EXTERN Int2 SeqPortSeek PROTO((SeqPortPtr spp, Int4 offset, Int2 origin));
163 NLM_EXTERN Int4 SeqPortLen PROTO((SeqPortPtr spp));
164 NLM_EXTERN Uint1 LIBCALL SeqPortGetResidue PROTO((SeqPortPtr spp));
165 NLM_EXTERN Int2 LIBCALL SeqPortRead PROTO((SeqPortPtr spp, BytePtr buf, Int2 len));
166 NLM_EXTERN Uint1 GetGapCode PROTO((Uint1 seqcode));
167 NLM_EXTERN Boolean LIBCALL SeqPortSetUpFields PROTO((SeqPortPtr spp, Int4 start, Int4 stop, Uint1 strand, Uint1 newcode));
168 NLM_EXTERN Boolean LIBCALL SeqPortSetUpAlphabet PROTO((SeqPortPtr spp, Uint1 curr_code, Uint1 newcode));
169 
170 /*******************************************************************************
171 *
172 *   SeqPortStream (bsp, flags, userdata, proc)
173 *   SeqPortStreamInt (bsp, start, stop, strand, flags, userdata, proc)
174 *   SeqPortStreamLoc (slp, flags, userdata, proc)
175 *   SeqPortStreamLit (lit, flags, userdata, proc)
176 *       Efficient functions to stream through sequence
177 *
178 ********************************************************************************/
179 
/* Callback type supplied as `proc` to the SeqPortStream functions.
   It receives a character string `sequence` and an opaque `userdata` pointer.
   NOTE(review): userdata is presumably the same pointer the caller handed to
   the streaming function - confirm in seqport.c. */
typedef void (LIBCALLBACK *SeqPortStreamProc) (
  CharPtr sequence,
  Pointer userdata
);
184 
/* Flag word passed as `flags` to the SeqPortStream, StreamCache and
   GetSequenceBy...Ex functions.  Values are written in hexadecimal; all but
   EXPAND_GAPS_TO_DASHES occupy a single bit.
   NOTE(review): EXPAND_GAPS_TO_DASHES equals
   STREAM_EXPAND_GAPS | GAP_TO_SINGLE_DASH numerically - presumably an
   intentional combination, confirm in seqport.c. */
typedef unsigned long StreamFlgType;

#define STREAM_EXPAND_GAPS      0x0001
#define GAP_TO_SINGLE_DASH      0x0002
#define EXPAND_GAPS_TO_DASHES   0x0003

#define KNOWN_GAP_AS_PLUS       0x0004
#define SEQ_GAP_AS_TILDE        0x0008

#define SUPPRESS_VIRT_SEQ       0x0010
#define STREAM_VIRT_AS_PLUS     0x0020

#define STREAM_CORRECT_INVAL    0x0040

#define STREAM_ALLOW_NEG_GIS    0x0080 /* for internal use only by NCBI ID group */

#define STREAM_HTML_SPANS       0x0100 /* show span tags at beginning of each line */

#define STREAM_ALL_FASTA_IDS    0x0200 /* in FASTA streamer, show all Seq-ids */

#define STREAM_TAGGED_DEFLINE   0x0400 /* in FASTA streamer, show [key=value] pairs */
206 
207 NLM_EXTERN Int4 SeqPortStream (
208   BioseqPtr bsp,
209   StreamFlgType flags,
210   Pointer userdata,
211   SeqPortStreamProc proc
212 );
213 
214 NLM_EXTERN Int4 SeqPortStreamInt (
215   BioseqPtr bsp,
216   Int4 start,
217   Int4 stop,
218   Uint1 strand,
219   StreamFlgType flags,
220   Pointer userdata,
221   SeqPortStreamProc proc
222 );
223 
224 NLM_EXTERN Int4 SeqPortStreamLoc (
225   SeqLocPtr slp,
226   StreamFlgType flags,
227   Pointer userdata,
228   SeqPortStreamProc proc
229 );
230 
231 NLM_EXTERN Int4 SeqPortStreamLit (
232   SeqLitPtr lit,
233   StreamFlgType flags,
234   Pointer userdata,
235   SeqPortStreamProc proc
236 );
237 
238 /*******************************************************************************
239 *
240 *   StreamCacheSetup (bsp, slp, flags, scp)
241 *   StreamCacheGetResidue (scp)
242 *   StreamCacheSetPosition (scp, pos)
243 *       SeqPort functional replacement implemented on top of SeqPortStreams
244 *
245 ********************************************************************************/
246 
247 typedef struct streamcache {
248   BioseqPtr     bsp;
249   SeqLocPtr     slp;
250   Char          buf [4004];
251   Int2          ctr;
252   Int2          total;
253   Int4          offset;
254   Int4          length;
255   StreamFlgType flags;
256   Boolean       failed;
257 } StreamCache, PNTR StreamCachePtr;
258 
259 NLM_EXTERN Boolean StreamCacheSetup (
260   BioseqPtr bsp,
261   SeqLocPtr slp,
262   StreamFlgType flags,
263   StreamCache PNTR scp
264 );
265 
266 NLM_EXTERN Uint1 StreamCacheGetResidue (
267   StreamCache PNTR scp
268 );
269 
270 NLM_EXTERN Boolean StreamCacheSetPosition (
271   StreamCache PNTR scp,
272   Int4 pos
273 );
274 
275 /*
276 the following functions are for quick alphabet expansion, and require buffers
277 allocated with 4-byte or 2-byte alignment, because they cast 2 or 4 bytes into
278 Uint2 or Uint4 for fast integer copying.
279 */
280 
281 NLM_EXTERN Uint4Ptr LIBCALL MapNa2ByteToIUPACString PROTO((Uint1Ptr bytep, Uint4Ptr buf, Int4 total));
282 NLM_EXTERN Uint2Ptr LIBCALL MapNa4ByteToIUPACString PROTO((Uint1Ptr bytep, Uint2Ptr buf, Int4 total));
283 NLM_EXTERN Uint2Ptr LIBCALL MapNa4ByteToIUPACplusGapString PROTO((Uint1Ptr bytep, Uint2Ptr buf, Int4 total));
284 NLM_EXTERN Uint2Ptr LIBCALL MapNa2ByteToNa4String PROTO((Uint1Ptr bytep, Uint2Ptr buf, Int4 total));
285 NLM_EXTERN Uint4Ptr LIBCALL MapNa2ByteTo4BitString PROTO((Uint1Ptr bytep, Uint4Ptr buf, Int4 total));
286 NLM_EXTERN Uint2Ptr LIBCALL MapNa4ByteTo4BitString PROTO((Uint1Ptr bytep, Uint2Ptr buf, Int4 total));
287 
288 
289 /*****************************************************************************
290 *
291 *   SeqPortSetValues(spp)
*      Copies the values is_circle, is_seg, and do_virtual from spp to
*        any dependent SeqPortPtrs it contains. This is necessary for segmented,
*        reference, or delta types of Bioseqs, and for SeqPortNewByLoc()
295 *
296 *      SeqPortSet_... functions call this function
297 *
298 *****************************************************************************/
299 NLM_EXTERN Boolean LIBCALL SeqPortSetValues (SeqPortPtr spp);
300 
301 /*****************************************************************************
302 *
303 *   SeqPortSet_is_circle(spp, value)
304 *      if (value) is TRUE, then SeqPort will go around the ends of a circular
305 *        molecule without stopping.
306 *      Default is FALSE
307 *
308 *****************************************************************************/
309 NLM_EXTERN Boolean LIBCALL SeqPortSet_is_circle (SeqPortPtr spp, Boolean value);
310 
311 /*****************************************************************************
312 *
313 *   SeqPortSet_is_seg(spp, value)
314 *      if (value) is TRUE, then SeqPort will return SEQPORT_EOS whenever it
315 *         crosses a segment boundary in the SeqPort. When is_seg = TRUE,
316 *         SEQPORT_VIRT will NOT be returned on virtual or NULL segments. Instead
317 *         SEQPORT_EOS will be received only as the SeqPort passes over those
318 *         segments.
319 *      Default is FALSE, SeqPort will NEVER return SEQPORT_EOS
320 *
321 *****************************************************************************/
322 NLM_EXTERN Boolean LIBCALL SeqPortSet_is_seg (SeqPortPtr spp, Boolean value);
323 
324 /*****************************************************************************
325 *
326 *   SeqPortSet_do_virtual(spp, value)
327 *      if (value) is TRUE, then SeqPort will return the appropriate ambiguity
328 *         character (e.g. "N" or "X") for the length of a virtual Bioseq or delta
329 *         gap segment. It will still return SEQPORT_VIRT for a "NULL" segment
330 *         (ie. gap of unknown length).
331 *
332 *      Default is FALSE. In this case SeqPort will return a single SEQPORT_VIRT
333 *         when encountering a virtual Bioseq, just as for a "NULL" segment.
334 *
335 *****************************************************************************/
336 NLM_EXTERN Boolean LIBCALL SeqPortSet_do_virtual (SeqPortPtr spp, Boolean value);
337 NLM_EXTERN Boolean LIBCALL SeqPortSet_do_virtualEx (SeqPortPtr spp, Boolean value, Boolean gapIsZero);
338 
339 /*****************************************************************************
340 *
341 *   BioseqHash(bsp)
*   	Computes an (almost) unique hash code for a bioseq
343 *
344 *****************************************************************************/
345 NLM_EXTERN Uint4 BioseqHash PROTO((BioseqPtr bsp));
346 
347 /*****************************************************************************
348 *
349 *   ProteinFromCdRegion(sfp, include_stop)
350 *   	produces a ByteStorePtr containing the protein sequence in
351 *   ncbieaa code for the CdRegion sfp.  If include_stop, will translate
352 *   through stop codons.  If NOT include_stop, will stop at first stop
353 *   codon and return the protein sequence NOT including the terminating
354 *   stop.  Supports reading frame, alternate genetic codes, and code breaks
355 *   in the CdRegion. Removes trailing "X" on partial translation.
356 *
* If there is no explicit partial at either end, but the feature is
* annotated as partial, then the guess should use the internal
* amino acid code.
360 *
361 *****************************************************************************/
362 NLM_EXTERN ByteStorePtr ProteinFromCdRegion PROTO(( SeqFeatPtr sfp, Boolean include_stop));
363 NLM_EXTERN ByteStorePtr ProteinFromCdRegionEx PROTO((SeqFeatPtr sfp, Boolean include_stop, Boolean remove_trailingX));
364 NLM_EXTERN ByteStorePtr ProteinFromCdRegionExEx PROTO((SeqFeatPtr sfp, Boolean include_stop, Boolean remove_trailingX, BoolPtr altStartP, Boolean farProdFetchOK));
365 NLM_EXTERN ByteStorePtr ProteinFromCdRegionExWithTrailingCodonHandling PROTO((SeqFeatPtr sfp, Boolean include_stop, Boolean remove_trailingX, Boolean no_stop_at_end_of_complete_cds));
366 
367 /*****************************************************************************
368 *
369 *   Uint1 AAForCodon (Uint1Ptr codon, CharPtr codes)
370 *   	codon is 3 values in ncbi4na code
*       codes is the genetic code array to use
372 *          MUST have 'X' as unknown amino acid
373 *
374 *****************************************************************************/
375 NLM_EXTERN Uint1 AAForCodon PROTO((Uint1Ptr codon, CharPtr codes));
376 
377 /*****************************************************************************
378 *
379 *   Uint1 IndexForCodon (codon, code)
*   	returns index into genetic codes codon array, given 3 bases of the
381 *       codon in any alphabet
382 *       returns INVALID_RESIDUE on failure
383 *
384 *****************************************************************************/
385 NLM_EXTERN Uint1 IndexForCodon PROTO((Uint1Ptr codon, Uint1 code));
386 
387 /*****************************************************************************
388 *
389 *   Boolean CodonForIndex (index, code, codon)
390 *   	Fills codon (3 Uint1 array) with codon corresponding to index,
391 *       in sequence alphabet code.
392 *       Index is the Genetic code index.
393 *       returns TRUE on success.
394 *
395 *****************************************************************************/
396 NLM_EXTERN Boolean CodonForIndex PROTO((Uint1 index, Uint1 code, Uint1Ptr codon));
397 
398 /*****************************************************************************
399 *
*   Uint1 GetFrameFromLoc (slp)
401 *   	returns 1,2,3 if can find the frame
402 *   	0 if not
403 *
404 *****************************************************************************/
405 NLM_EXTERN Uint1 GetFrameFromLoc PROTO((SeqLocPtr slp));
406 
407 /******************************************************************
408 *
409 *	dnaLoc_to_aaLoc(sfp, dna_loc, merge, frame, allowTerminator)
410 *	map a SeqLoc on the DNA sequence
411 *       to a Seq-loc in the	protein sequence
412 *       through a CdRegion feature
413 *   if (merge) adjacent intervals on the amino acid sequence
414 *      are merged into one. This should be the usual case.
415 *   We try to report the frame if the caller provides a suitable pointer
416 *   If allowTerminator, can map the termination codon as a legal location
417 *
418 ******************************************************************/
419 NLM_EXTERN SeqLocPtr LIBCALL dnaLoc_to_aaLoc(SeqFeatPtr sfp, SeqLocPtr dna_loc, Boolean merge, Int4Ptr frame, Boolean allowTerminator);
420 
421 /******************************************************************
422 *
423 *	productLoc_to_locationLoc(sfp, product_loc)
424 *	map a SeqLoc on the product sequence
425 *       to a Seq-loc in the location sequence
426 *       through a feature
427 *
428 *       this more general function is now called by
429 *          aaLoc_to_dnaLoc()
430 *
431 ******************************************************************/
432 NLM_EXTERN SeqLocPtr LIBCALL productLoc_to_locationLoc(SeqFeatPtr sfp, SeqLocPtr product_loc);
433 
434 /******************************************************************
435 *
436 *	aaLoc_to_dnaLoc(sfp, aa_loc)
437 *	map a SeqLoc on the amino acid sequence
438 *       to a Seq-loc in the	DNA sequence
439 *       through a CdRegion feature
440 *
441 ******************************************************************/
442 NLM_EXTERN SeqLocPtr LIBCALL aaLoc_to_dnaLoc(SeqFeatPtr sfp, SeqLocPtr aa_loc);
443 
444 /******************************************************************
445 *
446 *	aaFeatLoc_to_dnaFeatLoc(sfp, aa_loc)
447 *	map a SeqLoc on the amino acid sequence
448 *       to a Seq-loc in the	DNA sequence
449 *       through a CdRegion feature
450 *
451 *       uses aaLoc_to_dnaLoc() but does additional checks to
452 *       extend dnaLoc at either end to compensate for positions in
*       the dna which do not correspond to the amino acid sequence
454 *       (partial codons which are not translated).
455 *
456 ******************************************************************/
457 NLM_EXTERN SeqLocPtr LIBCALL aaFeatLoc_to_dnaFeatLoc(SeqFeatPtr sfp, SeqLocPtr aa_loc);
458 
459 /******************************************************************
460 *
*	productInterval_to_locationIntervals(sfp, aa_start, aa_stop, aa_partialn)
462 *	map the amino acid sequence to a chain of Seq-locs in the
463 *	DNA sequence through a CdRegion feature
464 *
465 ******************************************************************/
466 NLM_EXTERN SeqLocPtr LIBCALL productInterval_to_locationIntervals (SeqFeatPtr sfp, Int4 aa_start, Int4 aa_stop, Boolean aa_partialn);
467 
468 /*-------------- BioseqRevComp () ---------------------------*/
469 /***********************************************************************
470 *   BioseqRevComp:   Takes the nucleic acid sequence from Bioseq
471 *	Entry and gives the reverse complement sequence in place
472 *       Does not change features.
473 ************************************************************************/
474 NLM_EXTERN Boolean LIBCALL BioseqRevComp (BioseqPtr bsp);
475 
476 
477 /*-------------- BioseqComplement () ---------------------------*/
478 /***********************************************************************
479 *   BioseqComplement:   Takes the nucleic acid sequence from Bioseq
480 *	Entry and gives the complement sequence in place
481 *       Does not change features.
482 ************************************************************************/
483 NLM_EXTERN Boolean LIBCALL BioseqComplement (BioseqPtr bsp);
484 
485 
486 /*-------------- BioseqReverse () ---------------------------*/
487 /***********************************************************************
488 *   BioseqReverse:   Takes nucleic acid sequence from Bioseq Entry and
489 *	reverses the whole sequence in place
490 *       Does not change features.
491 ************************************************************************/
492 NLM_EXTERN Boolean LIBCALL BioseqReverse (BioseqPtr bsp);
493 
494 
495 /*-------------- ContigRevComp () ---------------------------*/
496 /***********************************************************************
497 *   ContigRevComp:   Reverse complement segmented or delta bioseq
498 ************************************************************************/
499 NLM_EXTERN Boolean LIBCALL ContigRevComp (BioseqPtr bsp);
500 
501 /*****************************************************************************
502 *
*  SPCompressNew(void); - allocates memory for an SPCompress structure
504 *
505 *****************************************************************************/
506 NLM_EXTERN SPCompressPtr SPCompressNew(void);
507 
508 /*****************************************************************************
509 *
510 *  SPCompressFree(SPCompressPtr spc); -  free SPCompress structure
511 *
512 *****************************************************************************/
513 NLM_EXTERN void SPCompressFree(SPCompressPtr spc);
514 
515 /*****************************************************************************
516 *
517 *   SPCompressDNA(SeqPortPtr spp);
*       converts the ncbi4na sequence taken from spp into an ncbi2na
*       buffer stored inside an SPCompress structure together
*       with ambiguity information
*       returns a pointer to the SPCompress structure, or NULL on error
522 *
523 *       NOTE: In this function we do not know - what is length
524 *             of sequence to compress. Terminated flag for this
525 *             function is SEQPORT_EOF returned from spp.
526 *
527 *****************************************************************************/
528 NLM_EXTERN SPCompressPtr SPCompressDNA(SeqPortPtr spp);
529 
530 /*****************************************************************************
531 *
532 *   SPRebuildDNA(SPCompressPtr spc);
533 *       translates spc ncbi2na encoding buffer into
534 *       spc ncbi4na encoding buffer with rebuild ambiguities
535 *
536 *       spc - must be valid SPCompress structure returned
537 *       from SPCompressDNA() function in ncbi2na encoding
538 *
539 *****************************************************************************/
540 NLM_EXTERN Boolean SPRebuildDNA(SPCompressPtr spc);
541 
542 /*****************************************************************************
543 *
544 *   ComposeCodonsRecognizedString (trna, buf, buflen);
545 *       Copies codon recognized string to buf, returns number of codons
546 *
547 *****************************************************************************/
548 
549 NLM_EXTERN Int2 ComposeCodonsRecognizedString (tRNAPtr trna, CharPtr buf, size_t buflen);
550 
551 /*****************************************************************************
552 *
553 *   TransTableNew (Int2 genCode);
554 *       Initializes TransTable finite state machine for 6-frame translation
555 *       and open reading frame search, allowing nucleotide ambiguity characters
556 *
557 *****************************************************************************/
558 
559 typedef struct fsatranstable {
560   Int2     genCode;
561   Char     ncbieaa [65];
562   Char     sncbieaa [65];
563   Uint2    nextBase [3376];
564   Char     aminoAcid [3376] [2];
565   Char     orfStart [3376] [2];
566   Uint1    basesToIdx [256];
567 } TransTable, PNTR TransTablePtr;
568 
569 /* allocate 6-frame finite state translation table and initialize with indicated genetic code */
570 NLM_EXTERN TransTablePtr TransTableNew (Int2 genCode);
571 NLM_EXTERN TransTablePtr TransTableFree (TransTablePtr tbl);
572 NLM_EXTERN void TransTableFreeAll (void);
573 
/* strand selectors: second index of TransTable.aminoAcid and .orfStart */
#define TTBL_TOP_STRAND  0
#define TTBL_BOT_STRAND  1

/* states compared against by IsATGStart / IsAltStart: TTBL_ATG_STATE on the
   top strand, TTBL_CAT_STATE on the bottom strand */
#define TTBL_ATG_STATE  48
#define TTBL_CAT_STATE 229

/* macros for using finite state machine for 6-frame translation

   tbl  - TransTablePtr (any pointer expression)
   cur  - current state
   ch   - next input character
   stnd - TTBL_TOP_STRAND or TTBL_BOT_STRAND

   Every parameter is parenthesized in the expansion so that the casts and
   comparisons apply to the whole argument expression, not just to its first
   token.  Arguments may still be evaluated more than once, so do not pass
   expressions with side effects. */

/* state reached from state cur after reading character ch */
#define NextCodonState(tbl,cur,ch) ((tbl)->nextBase [(int) (Uint2) (cur)] + (tbl)->basesToIdx [(int) (Uint1) (ch)])
/* residue stored for state cur on strand stnd */
#define GetCodonResidue(tbl,cur,stnd) ((tbl)->aminoAcid [(int) (Uint2) (cur)] [(stnd)])
/* start residue stored for state cur on strand stnd ('M', 'X' or '-') */
#define GetStartResidue(tbl,cur,stnd) ((tbl)->orfStart [(int) (Uint2) (cur)] [(stnd)])
#define IsOrfStart(tbl,cur,stnd) ((Boolean) (GetStartResidue(tbl,cur,stnd) == 'M'))
#define IsAmbigStart(tbl,cur,stnd) ((Boolean) (GetStartResidue(tbl,cur,stnd) == 'X'))
#define IsAnyStart(tbl,cur,stnd) ((Boolean) (GetStartResidue(tbl,cur,stnd) != '-'))
#define IsOrfStop(tbl,cur,stnd) ((Boolean) (GetCodonResidue(tbl,cur,stnd) == '*'))
/* ORF start whose state is the ATG state (top strand) or the CAT state (bottom strand) */
#define IsATGStart(tbl,cur,stnd) ((Boolean) (IsOrfStart(tbl,cur,stnd) && ((stnd) ? ((cur) == TTBL_CAT_STATE) : ((cur) == TTBL_ATG_STATE))))
/* ORF start whose state is anything other than the ATG / CAT state */
#define IsAltStart(tbl,cur,stnd) ((Boolean) (IsOrfStart(tbl,cur,stnd) && ((stnd) ? ((cur) != TTBL_CAT_STATE) : ((cur) != TTBL_ATG_STATE))))
590 
591 typedef void (LIBCALLBACK *TransTableMatchProc) (Int4 position, Char residue, Boolean atgStart, Boolean altStart, Boolean orfStop, Int2 frame, Uint1 strand, Pointer userdata);
592 
593 /* convenience function calls user callback for each strand of entire bioseq */
594 
595 NLM_EXTERN void TransTableProcessBioseq (
596   TransTablePtr tbl,
597   TransTableMatchProc matchProc,
598   Pointer userdata,
599   BioseqPtr bsp
600 );
601 
602 /* trans table translation functions can be passed cds feature or individual parameters */
603 
604 NLM_EXTERN ByteStorePtr TransTableTranslateCdRegion (
605   TransTablePtr  PNTR tblptr,
606   SeqFeatPtr cds,
607   Boolean include_stop,
608   Boolean remove_trailingX,
609   Boolean no_stop_at_end_of_complete_cds
610 );
611 
612 NLM_EXTERN ByteStorePtr TransTableTranslateCdRegionEx (
613   TransTablePtr  PNTR tblptr,
614   SeqFeatPtr cds,
615   Boolean include_stop,
616   Boolean remove_trailingX,
617   Boolean no_stop_at_end_of_complete_cds,
618   BoolPtr altStartP,
619   Boolean farProdFetchOK
620 );
621 
622 NLM_EXTERN ByteStorePtr TransTableTranslateSeqLoc (
623   TransTablePtr  PNTR tblptr,
624   SeqLocPtr location,
625   Int2 genCode,
626   Uint1 frame,
627   Boolean include_stop,
628   Boolean remove_trailingX
629 );
630 
631 /* returns string of bases to translate */
632 
633 NLM_EXTERN CharPtr ReadCodingRegionBases (
634   SeqLocPtr location,
635   Int4 len,
636   Uint1 frame,
637   Int4Ptr totalP
638 );
639 
640 /* allow reuse of translation tables by saving as AppProperty, avoids unnecessary initializations */
641 
642 NLM_EXTERN TransTablePtr PersistentTransTableByGenCode (
643   Int2 genCode
644 );
645 
646 NLM_EXTERN TransTablePtr PersistentTransTableByCdRegion (
647   SeqFeatPtr cds
648 );
649 
650 NLM_EXTERN ValNodePtr MakeCodeBreakList (
651   SeqLocPtr cdslocation,
652   Int4 len,
653   CodeBreakPtr cbp,
654   Uint1 frame
655 );
656 
657 /*****************************************************************************
658 *
659 *   SeqSearch
660 *       Initializes SeqSearch finite state machine for sequence searching
661 *       Based on Practical Algorithms for Programmers by Binstock and Rex
662 *
663 *****************************************************************************/
664 
665 struct SeqSearch;
666 typedef struct SeqSearch* SeqSearchPtr;
667 
668 typedef void (LIBCALLBACK *SeqSearchMatchProc) (Int4 position, CharPtr name, CharPtr pattern, Int2 cutSite, Uint1 strand, Pointer userdata);
669 
670 /* create empty nucleotide sequence search finite state machine */
671 
672 NLM_EXTERN SeqSearchPtr SeqSearchNew (
673   SeqSearchMatchProc matchproc,
674   Pointer userdata
675 );
676 
677 /*
678    add nucleotide pattern or restriction site to sequence search finite state
679    machine, expands using ambiguity codes R = A and G, H = A, C and T, etc.
680 */
681 
682 typedef unsigned long SearchFlgType;
683 
684 #define SEQ_SEARCH_JUST_TOP_STRAND  1
685 #define SEQ_SEARCH_EXPAND_PATTERN   2
686 #define SEQ_SEARCH_ALLOW_MISMATCH   4
687 
688 NLM_EXTERN void SeqSearchAddNucleotidePattern (
689   SeqSearchPtr tbl,
690   CharPtr name,
691   CharPtr pattern,
692   Int2 cutSite,
693   SearchFlgType flags
694 );
695 
696 /* program passes each character in turn to finite state machine */
697 
698 NLM_EXTERN void SeqSearchProcessCharacter (
699   SeqSearchPtr tbl,
700   Char ch
701 );
702 
703 /* convenience function calls SeqSearchProcessCharacter for entire bioseq */
704 
705 NLM_EXTERN void SeqSearchProcessBioseq (
706   SeqSearchPtr tbl,
707   BioseqPtr bsp
708 );
709 
710 /* reset state and position to allow another run with same search patterns */
711 
712 NLM_EXTERN void SeqSearchReset (
713   SeqSearchPtr tbl
714 );
715 
716 /* clean up sequence search finite state machine allocated memory */
717 
718 NLM_EXTERN SeqSearchPtr SeqSearchFree (
719   SeqSearchPtr tbl
720 );
721 
722 
723 /*****************************************************************************
724 *
725 *   ProtSearch
726 *       Initializes ProtSearch finite state machine for sequence searching
727 *       Based on Practical Algorithms for Programmers by Binstock and Rex
728 *
729 *****************************************************************************/
730 
731 struct ProtSearch;
732 typedef struct ProtSearch* ProtSearchPtr;
733 
734 typedef void (LIBCALLBACK *ProtSearchMatchProc) (Int4 position, CharPtr name, CharPtr pattern, Pointer userdata);
735 
736 /* create empty protein sequence search finite state machine */
737 
738 NLM_EXTERN ProtSearchPtr ProtSearchNew (
739   ProtSearchMatchProc matchproc,
740   Pointer userdata
741 );
742 
743 /*
744    add protein pattern to protein sequence search finite state machine,
745    expands using ambiguity codes B = D and N, Z = E and Q, etc.
746 */
747 
748 NLM_EXTERN void ProtSearchAddProteinPattern (
749   ProtSearchPtr tbl,
750   CharPtr name,
751   CharPtr pattern,
752   SearchFlgType flags
753 );
754 
755 /* program passes each character in turn to finite state machine */
756 
757 NLM_EXTERN void ProtSearchProcessCharacter (
758   ProtSearchPtr tbl,
759   Char ch
760 );
761 
762 /* convenience function calls ProtSearchProcessCharacter for entire bioseq */
763 
764 NLM_EXTERN void ProtSearchProcessBioseq (
765   ProtSearchPtr tbl,
766   BioseqPtr bsp
767 );
768 
769 
770 /* reset state and position to allow another run with same search patterns */
771 
772 NLM_EXTERN void ProtSearchReset (
773   ProtSearchPtr tbl
774 );
775 
776 /* clean up sequence search finite state machine allocated memory */
777 
778 NLM_EXTERN ProtSearchPtr ProtSearchFree (
779   ProtSearchPtr tbl
780 );
781 
782 
783 /*****************************************************************************
784 *
785 *  Convenience functions for genome processing use BioseqLockById to get sequence
786 *  record (perhaps with phrap quality score graphs) so fetching from some network
787 *  or local server must be enabled, or sequences must already be in memory.
788 *
789 *****************************************************************************/
790 
791 NLM_EXTERN CharPtr GetSequenceByBsp (
792   BioseqPtr bsp
793 );
794 
795 NLM_EXTERN CharPtr GetSequenceByIdOrAccnDotVer (
796   SeqIdPtr sip,
797   CharPtr accession,
798   Boolean is_na
799 );
800 
801 NLM_EXTERN CharPtr GetSequenceByFeature (
802   SeqFeatPtr sfp
803 );
804 
805 NLM_EXTERN CharPtr GetSequenceByLocation (
806   SeqLocPtr slp
807 );
808 
809 NLM_EXTERN CharPtr GetSequenceByBspEx (
810   BioseqPtr bsp,
811   StreamFlgType flags
812 );
813 
814 NLM_EXTERN CharPtr GetSequenceByIdOrAccnDotVerEx (
815   SeqIdPtr sip,
816   CharPtr accession,
817   Boolean is_na,
818   StreamFlgType flags
819 );
820 
821 NLM_EXTERN CharPtr GetSequenceByFeatureEx (
822   SeqFeatPtr sfp,
823   StreamFlgType flags
824 );
825 
826 NLM_EXTERN CharPtr GetSequenceByLocationEx (
827   SeqLocPtr slp,
828   StreamFlgType flags
829 );
830 
831 NLM_EXTERN CharPtr GetDNAbyAccessionDotVersion (
832   CharPtr accession
833 );
834 
835 NLM_EXTERN BytePtr GetScoresbyAccessionDotVersion (
836   CharPtr accession,
837   Int4Ptr bsplength
838 );
839 
840 NLM_EXTERN BytePtr GetScoresbySeqId (
841   SeqIdPtr sip,
842   Int4Ptr bsplength
843 );
844 
845 /*****************************************************************************
846 *
847 *   ConvertNsToGaps
848 *       Assumes string of Ns means a gap of known length
849 *
850 *****************************************************************************/
851 
852 NLM_EXTERN void ConvertNsToGaps (
853   BioseqPtr bsp,
854   Pointer userdata
855 );
856 
857 /**************************************************************
858 *
859 *  Returns a protein molecular weight for a SeqLoc
860 *    If it cannot calculate the value it returns -1.0
861 *    If sequence contains X, J, or O it fails
862 *
863 ***************************************************************/
864 NLM_EXTERN FloatHi MolWtForLoc (SeqLocPtr slp);
865 
866 NLM_EXTERN FloatHi MolWtForBsp (BioseqPtr bsp);
867 
868 NLM_EXTERN FloatHi MolWtForStr (CharPtr str);
869 
870 
871 
872 NLM_EXTERN Boolean LIBCALL ReverseSeqData (Uint1 seqtype, Int4 seqlen, SeqDataPtr sdp);
873 NLM_EXTERN Boolean ComplementSeqData (Uint1 seqtype, Int4 seqlen, SeqDataPtr sdp);
874 
875 
876 #ifdef __cplusplus
877 }
878 #endif
879 
880 #undef NLM_EXTERN
881 #ifdef NLM_EXPORT
882 #define NLM_EXTERN NLM_EXPORT
883 #else
884 #define NLM_EXTERN
885 #endif
886 
887 #endif
888