1 #ifndef SEQUENCE__HPP
2 #define SEQUENCE__HPP
3 
4 /*  $Id: sequence.hpp 626747 2021-03-03 18:46:39Z ivanov $
5 * ===========================================================================
6 *
7 *                            PUBLIC DOMAIN NOTICE
8 *               National Center for Biotechnology Information
9 *
10 *  This software/database is a "United States Government Work" under the
11 *  terms of the United States Copyright Act.  It was written as part of
12 *  the author's official duties as a United States Government employee and
13 *  thus cannot be copyrighted.  This software/database is freely available
14 *  to the public for use. The National Library of Medicine and the U.S.
15 *  Government have not placed any restriction on its use or reproduction.
16 *
17 *  Although all reasonable efforts have been taken to ensure the accuracy
18 *  and reliability of the software and data, the NLM and the U.S.
19 *  Government do not and cannot warrant the performance or results that
20 *  may be obtained by using this software or data. The NLM and the U.S.
21 *  Government disclaim all warranties, express or implied, including
22 *  warranties of performance, merchantability or fitness for any particular
23 *  purpose.
24 *
25 *  Please cite the author in any work or product based on this material.
26 *
27 * ===========================================================================
28 *
29 * Author:  Clifford Clausen & Aaron Ucko
30 *
31 * File Description:
32 *   Sequence utilities requiring CScope
33 *   Obtains or constructs a sequence's title.  (Corresponds to
34 *   CreateDefLine in the C toolkit.)
35 */
36 
37 #include <corelib/ncbistd.hpp>
38 #include <serial/serial.hpp>
39 #include <serial/objistr.hpp>
40 #include <serial/objostr.hpp>
41 
42 #include <objects/seqfeat/SeqFeatData.hpp>
43 #include <objects/seqfeat/Cdregion.hpp>
44 #include <objects/seqloc/Na_strand.hpp>
45 #include <objects/seqloc/Seq_interval.hpp>
46 #include <objects/seqloc/Seq_loc.hpp>
47 #include <util/strsearch.hpp>
48 #include <objects/seq/seq_id_mapper.hpp>
49 #include <util/range_coll.hpp>
50 #include <objmgr/feat_ci.hpp>
51 #include <objmgr/bioseq_ci.hpp>
52 #include <objmgr/scope.hpp>
53 #include <objmgr/util/seq_loc_util.hpp>
54 #include <objmgr/util/create_defline.hpp>
55 
56 BEGIN_NCBI_SCOPE
57 BEGIN_SCOPE(objects)
58 
59 // Forward declarations
60 class CSeq_id;
61 class CSeq_loc_mix;
62 class CSeq_point;
63 class CPacked_seqpnt;
64 class CBioseq_Handle;
65 class CSeq_loc_Mapper;
66 class CSeqVector;
67 class CCdregion;
68 class CSeq_feat;
69 class CSeq_entry;
70 class CSeq_entry_handle;
71 class CGenetic_code;
72 class CMolInfo;
73 class CSeq_gap;
74 
75 BEGIN_SCOPE(sequence)
76 
77 /** @addtogroup ObjUtilSequence
78  *
79  * @{
80  */
81 
82 
83 /** @name SeqIdConv
84  * Conversions between seq-id types
85  * @{
86  */
87 
88 
89 enum EAccessionVersion {
90     eWithAccessionVersion,    ///< accession.version (when possible)
91     eWithoutAccessionVersion  ///< accession only, even if version is available
92 };
93 
94 
95 /// Retrieve a particular seq-id from a given bioseq handle.  This uses
96 /// CSynonymsSet internally to decide which seq-id should be used.
97 enum EGetIdFlags {
98     eGetId_ForceGi  = 0x0000,  ///< return only a gi-based seq-id
99     eGetId_ForceAcc = 0x0001,  ///< return only an accession based seq-id
100     eGetId_Best     = 0x0002,  ///< return the "best" gi (uses FindBestScore(),
101                                ///< with CSeq_id::CalculateScore() as the score
102                                ///< function
103     eGetId_HandleDefault = 0x0003, ///< returns the ID associated with a bioseq-handle
104 
105     eGetId_Seq_id_Score       = 0x0004, ///< use CSeq_id::Score() as the scoring function
106     eGetId_Seq_id_BestRank    = 0x0005, ///< use CSeq_id::BestRank() as the scoring function
107     eGetId_Seq_id_WorstRank   = 0x0006, ///< use CSeq_id::WorstRank() as the scoring function
108     eGetId_Seq_id_FastaAARank = 0x0007, ///< use CSeq_id::FastaAARank() as the scoring function
109     eGetId_Seq_id_FastaNARank = 0x0008, ///< use CSeq_id::FastaNARank() as the scoring function
110 
111     ///< "canonical" here means "most specific"; this differs from "best" in
112     ///< that "best" is intended for display purposes
113     eGetId_Canonical = eGetId_Seq_id_BestRank,
114 
115     eGetId_TypeMask = 0x00FF,  ///< Mask for requested id type
116 
117     /// Check if the seq-id is present in the scope
118     eGetId_VerifyId = 0x0100,
119 
120     /// Throw exception on errors. If not set, an empty value is returned.
121     eGetId_ThrowOnError = 0x0200,
122 
123     eGetId_Default = eGetId_Best | eGetId_ThrowOnError,
124 };
125 typedef int EGetIdType;
126 
127 
128 /// Given an accession string retrieve the GI id.
129 /// If no GI was found returns 0 or throws CSeqIdFromHandleException
130 /// depending on the flags.
131 /// Id type in the flags is ignored, only VerifyId and ThrowOnError
132 /// flags are checked.
133 NCBI_XOBJUTIL_EXPORT
134 TGi GetGiForAccession(const string& acc,
135                       CScope& scope,
136                       EGetIdType flags = 0);
137 
138 /// Retrieve the accession for a given GI.
139 /// If no accession was found returns an empty string or throws
140 /// CSeqIdFromHandleException depending on the flags.
141 /// Id type in the flags is ignored, only VerifyId and ThrowOnError
142 /// flags are checked.
143 NCBI_XOBJUTIL_EXPORT
144 string GetAccessionForGi(TGi           gi,
145                          CScope&       scope,
146                          EAccessionVersion use_version = eWithAccessionVersion,
147                          EGetIdType flags = 0);
148 
149 /// Given a Seq-id retrieve the corresponding GI.
150 /// If no GI was found returns 0 or throws CSeqIdFromHandleException
151 /// depending on the flags.
152 /// Id type in the flags is ignored, only VerifyId and ThrowOnError
153 /// flags are checked.
154 NCBI_XOBJUTIL_EXPORT
155 TGi GetGiForId(const objects::CSeq_id& id,
156                CScope& scope,
157                EGetIdType flags = 0);
158 
159 /// Retrieve the accession string for a Seq-id.
160 /// If no accession was found returns an empty string or throws
161 /// CSeqIdFromHandleException depending on the flags.
162 /// Id type in the flags is ignored, only VerifyId and ThrowOnError
163 /// flags are checked.
164 NCBI_XOBJUTIL_EXPORT
165 string GetAccessionForId(const objects::CSeq_id& id,
166                          CScope&       scope,
167                          EAccessionVersion use_version = eWithAccessionVersion,
168                          EGetIdType flags = 0);
169 
170 /// Return a selected ID type for a given bioseq handle.  This function
171 /// will try to use the most efficient method possible to determine which
172 /// ID fulfills the requested parameter.  This version will call
173 /// sequence::GetId() with the bioseq handle's seq-id.
174 ///
175 /// @param id Source id to evaluate
176 /// @param scope Scope for seq-id resolution.
177 /// @param type Type of ID to return
178 /// @return A requested seq-id.
179 /// Depending on the flags set in 'type' this function can verify
180 /// if the requested ID exists in the scope and throw
181 /// CSeqIdFromHandleException if the request cannot be satisfied.
182 NCBI_XOBJUTIL_EXPORT
183 CSeq_id_Handle GetId(const CBioseq_Handle& handle,
184                      EGetIdType type = eGetId_Default);
185 
186 /// Return a selected ID type for a seq-id.  This function
187 /// will try to use the most efficient method possible to determine which
188 /// ID fulfills the requested parameter.  The following logic is used:
189 ///
190 /// - For seq-id type eGetId_HandleDefault, the original seq-id is returned.
191 ///   This satisfies the condition of returning a bioseq-handle's seq-id if
192 ///   sequence::GetId() is applied to a CBioseq_Handle.
193 ///
194 /// - For seq-id type eGetId_ForceAcc, the returned set of seq-ids will first
195 ///   be evaluated for a "best" id (which, given seq-id scoring, will be
196 ///   a textseq-id if one exists). If the returned best ID is a textseq-id,
197 ///   this id will be returned.  Otherwise, an exception is thrown or an
198 ///   empty handle returned.
199 ///
200 /// - For seq-id type eGetId_ForceGi, the returned set of IDs is scanned for
201 ///   an ID of type gi. If this is found, it is returned; otherwise, an
202 ///   exception is thrown or an empty handle returned. If the supplied ID is
203 ///   already a gi and eGetId_VerifyId flag is not set, no work is done.
204 ///
205 /// @param id Source id to evaluate
206 /// @param scope Scope for seq-id resolution.
207 /// @param type Type of ID to return
208 /// @return A requested seq-id.
209 /// Depending on the flags set in 'type' this function can verify
210 /// if the requested ID exists in the scope and throw
211 /// CSeqIdFromHandleException if the request cannot be satisfied.
212 NCBI_XOBJUTIL_EXPORT
213 CSeq_id_Handle GetId(const CSeq_id& id, CScope& scope,
214                      EGetIdType type = eGetId_Default);
215 
216 /// Return a selected ID type for a seq-id handle.
217 /// Arguments (except 'id') and behavior is the same as of
218 /// GetId(const CSeq_id& id, ...).
219 NCBI_XOBJUTIL_EXPORT
220 CSeq_id_Handle GetId(const CSeq_id_Handle& id, CScope& scope,
221                      EGetIdType type = eGetId_Default);
222 
223 /// Return a selected ID type from a set of Seq-ids
224 /// Arguments (except 'id') and behavior is the same as of
225 /// GetId(const CSeq_id& id, ...).
226 NCBI_XOBJUTIL_EXPORT
227 CSeq_id_Handle GetId(const CBioseq::TId& id,
228                      EGetIdType type = eGetId_Default);
229 
230 /// Return a selected ID type from a Bioseq
231 /// Arguments (except 'seq') and behavior is the same as of
232 /// GetId(const CBioseq_Handle& seq, ...).
233 NCBI_XOBJUTIL_EXPORT
234 CSeq_id_Handle GetId(const CBioseq& seq,
235                      EGetIdType type = eGetId_Default);
236 
237 /* @} */
238 
239 
240 /** @name FindLatestSequence
241  * Walk the replace history to find the latest revision of a sequence
242  * @{
243  */
244 
245 /// Given a seq-id check its replace history and try to find the latest
246 /// revision. The function stops and returns NULL if it detects some
247 /// strange conditions like an infinite recursion. If the bioseq
248 /// contains no history information, the original id is returned.
249 NCBI_XOBJUTIL_EXPORT
250 CConstRef<CSeq_id> FindLatestSequence(const CSeq_id& id, CScope& scope);
251 
252 NCBI_XOBJUTIL_EXPORT
253 CSeq_id_Handle FindLatestSequence(const CSeq_id_Handle& idh, CScope& scope);
254 
255 /// Check replace history up to the specified date. Returns the latest
256 /// bioseq up to the date or the original id if the bioseq contains no
257 /// history or is already newer than the specified date.
258 NCBI_XOBJUTIL_EXPORT
259 CConstRef<CSeq_id> FindLatestSequence(const CSeq_id& id,
260                                       CScope&        scope,
261                                       const CTime&   tlim);
262 
263 NCBI_XOBJUTIL_EXPORT
264 CSeq_id_Handle FindLatestSequence(const CSeq_id_Handle& idh,
265                                   CScope&               scope,
266                                   const CTime&          tlim);
267 
268 /* @} */
269 
270 
271 /** @name GetTitle
272  * Get sequence's title (used in various flat-file formats.)
273  * Deprecated in favor of CDeflineGenerator.
274  * @{
275  */
276 
277 /// This function is here rather than in CBioseq because it may need
278 /// to inspect other sequences.  The reconstruct flag indicates that it
279 /// should ignore any existing title Seqdesc.
280 enum EGetTitleFlags {
281     fGetTitle_Reconstruct = 0x1, ///< ignore existing title Seqdesc.
282     fGetTitle_Organism    = 0x2, ///< append [organism]
283     fGetTitle_AllProteins = 0x4, ///< normally just names the first
284     fGetTitle_NoExpensive = 0x8  ///< skip potential expensive operations
285 };
286 typedef int TGetTitleFlags;
287 
288 NCBI_DEPRECATED NCBI_XOBJUTIL_EXPORT
289 string GetTitle(const CBioseq_Handle& hnd, TGetTitleFlags flags = 0);
290 NCBI_DEPRECATED NCBI_XOBJUTIL_EXPORT
291 bool GetTitle(const CBioseq& seq, string* title_ptr, TGetTitleFlags flags = 0);
292 
293 /* @} */
294 
295 
296 /** @name Source and Product
297  * Mapping locations through features
298  * @{
299  */
300 
301 enum ES2PFlags {
302     fS2P_NoMerge  = 0x1, ///< don't merge adjacent intervals on the product
303     fS2P_AllowTer = 0x2  ///< map the termination codon as a legal location
304 };
305 typedef int TS2PFlags; // binary OR of ES2PFlags
306 
307 NCBI_XOBJUTIL_EXPORT
308 CRef<CSeq_loc> SourceToProduct(const CSeq_feat& feat,
309                                const CSeq_loc& source_loc, TS2PFlags flags = 0,
310                                CScope* scope = 0, int* frame = 0);
311 
312 enum EP2SFlags {
313     fP2S_Extend = 0x1  ///< if hitting ends, extend to include partial codons
314 };
315 typedef int TP2SFlags; // binary OR of ES2PFlags
316 
317 NCBI_XOBJUTIL_EXPORT
318 CRef<CSeq_loc> ProductToSource(const CSeq_feat& feat, const CSeq_loc& prod_loc,
319                                TP2SFlags flags = 0, CScope* scope = 0);
320 
321 /* @} */
322 
323 
324 /** @name Overlapping
325  * Searching for features
326  * @{
327  */
328 
329 enum EBestFeatOpts {
330     /// requires explicit association, rather than analysis based on overlaps
331     fBestFeat_StrictMatch = 0x01,
332 
333     /// don't perform any expensive tests, such as ones that require fetching
334     /// additional sequences
335     fBestFeat_NoExpensive = 0x02,
336 
337     /// favor longer features over shorter features
338     fBestFeat_FavorLonger = 0x04,
339 
340     /// Pay no attention to strands when finding the best feat.  This may be
341     /// useful for, e.g., trans-spliced genes.
342     fBestFeat_IgnoreStrand = 0x08,
343 
344     /// default options: do everything
345     fBestFeat_Defaults = 0
346 };
347 typedef int TBestFeatOpts;
348 
349 
350 /// Storage for features and scores.
351 typedef pair<Int8, CConstRef<CSeq_feat> > TFeatScore;
352 typedef vector<TFeatScore> TFeatScores;
353 
354 // To avoid putting custom logic into the GetOverlappingFeatures
355 // function, we allow plugins
356 class CGetOverlappingFeaturesPlugin {
357 public:
~CGetOverlappingFeaturesPlugin()358     virtual ~CGetOverlappingFeaturesPlugin() {}
359     virtual void processSAnnotSelector(
360         SAnnotSelector &sel ) = 0;
361 
362     virtual void setUpFeatureIterator (
363         CBioseq_Handle &bioseq_handle,
364         auto_ptr<CFeat_CI> &feat_ci,
365         TSeqPos circular_length ,
366         CRange<TSeqPos> &range,
367         const CSeq_loc& loc,
368         SAnnotSelector &sel,
369         CScope &scope,
370         ENa_strand &strand ) = 0;
371 
372     virtual void processLoc(
373         CBioseq_Handle &bioseq_handle,
374         CRef<CSeq_loc> &loc,
375         TSeqPos circular_length ) = 0;
376 
377     virtual void processMainLoop(
378         bool &shouldContinueToNextIteration,
379         CRef<CSeq_loc> &cleaned_loc_this_iteration,
380         CRef<CSeq_loc> &candidate_feat_loc,
381         EOverlapType &overlap_type_this_iteration,
382         bool &revert_locations_this_iteration,
383         CBioseq_Handle &bioseq_handle,
384         const CMappedFeat &feat,
385         TSeqPos circular_length,
386         SAnnotSelector::EOverlapType annot_overlap_type ) = 0;
387 
388     virtual void postProcessDiffAmount(
389         Int8 &cur_diff,
390         CRef<CSeq_loc> &cleaned_loc_this_iteration,
391         CRef<CSeq_loc> &candidate_feat_loc,
392         CScope &scope,
393         SAnnotSelector &sel,
394         TSeqPos circular_length ) = 0;
395 };
396 
397 /// Find all features overlapping the location. Features and corresponding
398 /// scores are stored in the 'feats' vector. The scores are calculated as
399 /// difference between the input location and each feature's location.
400 /// NOTE: 'overlap_type' defines how the location must be related to the feature.
401 /// For eOverlap_Subset, eOverlap_SubsetRev, eOverlap_CheckIntervals,
402 /// eOverlap_CheckIntRev and eOverlap_Interval the relationship is
403 /// reversed. E.g. with eOverlap_Contains, the location will contain
404 /// the feature, but with eOverlap_Subset the feature will be defined
405 /// on a subset of the location.
406 NCBI_XOBJUTIL_EXPORT
407 void GetOverlappingFeatures(const CSeq_loc& loc,
408                             CSeqFeatData::E_Choice feat_type,
409                             CSeqFeatData::ESubtype feat_subtype,
410                             EOverlapType overlap_type,
411                             TFeatScores& feats,
412                             CScope& scope,
413                             const TBestFeatOpts opts = 0,
414                             CGetOverlappingFeaturesPlugin *plugin = NULL );
415 
416 
417 /// See the note above on 'overlap_type' meaning.
418 NCBI_XOBJUTIL_EXPORT
419 CConstRef<CSeq_feat> GetBestOverlappingFeat(const CSeq_loc& loc,
420                                             CSeqFeatData::E_Choice feat_type,
421                                             EOverlapType overlap_type,
422                                             CScope& scope,
423                                             TBestFeatOpts opts = fBestFeat_Defaults,
424                                             CGetOverlappingFeaturesPlugin *plugin = NULL );
425 /// See the note above on 'overlap_type' meaning.
426 NCBI_XOBJUTIL_EXPORT
427 CConstRef<CSeq_feat> GetBestOverlappingFeat(const CSeq_loc& loc,
428                                             CSeqFeatData::ESubtype feat_type,
429                                             EOverlapType overlap_type,
430                                             CScope& scope,
431                                             TBestFeatOpts opts = fBestFeat_Defaults,
432                                             CGetOverlappingFeaturesPlugin *plugin = NULL );
433 
434 NCBI_XOBJUTIL_EXPORT
435 CConstRef<CSeq_feat>
436 GetBestGeneForMrna(const CSeq_feat& mrna_feat,
437                    CScope& scope,
438                    TBestFeatOpts opts = fBestFeat_Defaults,
439                    CGetOverlappingFeaturesPlugin *plugin = NULL );
440 
441 NCBI_XOBJUTIL_EXPORT
442 CConstRef<CSeq_feat>
443 GetBestGeneForCds(const CSeq_feat& cds_feat,
444                   CScope& scope,
445                   TBestFeatOpts opts = fBestFeat_Defaults,
446                   CGetOverlappingFeaturesPlugin *plugin = NULL );
447 
448 NCBI_XOBJUTIL_EXPORT
449 CConstRef<CSeq_feat>
450 GetBestMrnaForCds(const CSeq_feat& cds_feat,
451                   CScope& scope,
452                   TBestFeatOpts opts = fBestFeat_Defaults,
453                   CGetOverlappingFeaturesPlugin *plugin = NULL );
454 
455 NCBI_XOBJUTIL_EXPORT
456 CConstRef<CSeq_feat>
457 GetBestCdsForMrna(const CSeq_feat& mrna_feat,
458                   CScope& scope,
459                   TBestFeatOpts opts = fBestFeat_Defaults,
460                   CGetOverlappingFeaturesPlugin *plugin = NULL );
461 
462 NCBI_XOBJUTIL_EXPORT
463 void GetMrnasForGene(const CSeq_feat& gene_feat,
464                      CScope& scope,
465                      list< CConstRef<CSeq_feat> >& mrna_feats,
466                      TBestFeatOpts opts = fBestFeat_Defaults,
467                      CGetOverlappingFeaturesPlugin *plugin = NULL );
468 
469 NCBI_XOBJUTIL_EXPORT
470 void GetCdssForGene(const CSeq_feat& gene_feat,
471                     CScope& scope,
472                     list< CConstRef<CSeq_feat> >& cds_feats,
473                     TBestFeatOpts opts = fBestFeat_Defaults,
474                     CGetOverlappingFeaturesPlugin *plugin = NULL );
475 
476 /////////////////////////////////////////////////////////////////////////////
477 // Versions of functions with lookup by feature id
478 NCBI_XOBJUTIL_EXPORT
479 CConstRef<CSeq_feat>
480 GetBestGeneForMrna(const CSeq_feat& mrna_feat,
481                    const CTSE_Handle& tse,
482                    TBestFeatOpts opts = fBestFeat_Defaults,
483                    CGetOverlappingFeaturesPlugin *plugin = NULL );
484 
485 NCBI_XOBJUTIL_EXPORT
486 CConstRef<CSeq_feat>
487 GetBestGeneForCds(const CSeq_feat& cds_feat,
488                   const CTSE_Handle& tse,
489                   TBestFeatOpts opts = fBestFeat_Defaults,
490                   CGetOverlappingFeaturesPlugin *plugin = NULL );
491 
492 NCBI_XOBJUTIL_EXPORT
493 CConstRef<CSeq_feat>
494 GetBestMrnaForCds(const CSeq_feat& cds_feat,
495                   const CTSE_Handle& tse,
496                   TBestFeatOpts opts = fBestFeat_Defaults,
497                   CGetOverlappingFeaturesPlugin *plugin = NULL );
498 
499 NCBI_XOBJUTIL_EXPORT
500 CConstRef<CSeq_feat>
501 GetBestCdsForMrna(const CSeq_feat& mrna_feat,
502                   const CTSE_Handle& tse,
503                   TBestFeatOpts opts = fBestFeat_Defaults,
504                   CGetOverlappingFeaturesPlugin *plugin = NULL );
505 
506 NCBI_XOBJUTIL_EXPORT
507 void GetMrnasForGene(const CSeq_feat& gene_feat,
508                      const CTSE_Handle& tse,
509                      list< CConstRef<CSeq_feat> >& mrna_feats,
510                      TBestFeatOpts opts = fBestFeat_Defaults,
511                      CGetOverlappingFeaturesPlugin *plugin = NULL );
512 
513 NCBI_XOBJUTIL_EXPORT
514 void GetCdssForGene(const CSeq_feat& gene_feat,
515                     const CTSE_Handle& tse,
516                     list< CConstRef<CSeq_feat> >& cds_feats,
517                     TBestFeatOpts opts = fBestFeat_Defaults,
518                     CGetOverlappingFeaturesPlugin *plugin = NULL );
519 
520 NCBI_XOBJUTIL_EXPORT
521 CConstRef<CSeq_feat>
522 GetBestOverlappingFeat(const CSeq_feat& feat,
523                        CSeqFeatData::E_Choice feat_type,
524                        sequence::EOverlapType overlap_type,
525                        CScope& scope,
526                        TBestFeatOpts opts = fBestFeat_Defaults,
527                        CGetOverlappingFeaturesPlugin *plugin = NULL );
528 
529 NCBI_XOBJUTIL_EXPORT
530 CConstRef<CSeq_feat>
531 GetBestOverlappingFeat(const CSeq_feat& feat,
532                        CSeqFeatData::ESubtype feat_type,
533                        sequence::EOverlapType overlap_type,
534                        CScope& scope,
535                        TBestFeatOpts opts = fBestFeat_Defaults,
536                        CGetOverlappingFeaturesPlugin *plugin = NULL );
537 
538 NCBI_XOBJUTIL_EXPORT
539 CConstRef<CSeq_feat> GetmRNAforCDS(const CSeq_feat& cds, CScope& scope);
540 
541 
542 /// Get the best overlapping feature for a SNP (variation) feature
543 /// @param snp_feat
544 ///   SNP feature object
545 /// @param type
546 ///   type of overlapping feature
547 /// @param scope
548 /// @param search_both_strands
549 ///   search is performed on both strands, starting with the one specified
550 ///   by the feature's location.
551 /// @return
552 ///   the overlapping fetaure or NULL if not found
553 NCBI_XOBJUTIL_EXPORT
554 CConstRef<CSeq_feat> GetBestOverlapForSNP(const CSeq_feat& snp_feat,
555                                           CSeqFeatData::E_Choice type,
556                                           CScope& scope,
557                                           bool search_both_strands = true);
558 
559 /// Get the best overlapping feature for a SNP (variation)
560 /// @param snp_feat
561 ///   SNP feature object
562 /// @param subtype
563 ///   subtype of overlapping feature
564 /// @param scope
565 /// @param search_both_strands
566 ///   search is performed on both strands, starting with the one specified
567 ///   by the feature's location.
568 /// @return
569 ///   the overlapping fetaure or NULL if not found
570 NCBI_XOBJUTIL_EXPORT
571 CConstRef<CSeq_feat> GetBestOverlapForSNP(const CSeq_feat& snp_feat,
572                                           CSeqFeatData::ESubtype subtype,
573                                           CScope& scope,
574                                           bool search_both_strands = true);
575 
576 
577 /// Convenience functions for popular overlapping types
578 enum ETransSplicing {
579     eTransSplicing_No = 0,
580     eTransSplicing_Yes,
581     eTransSplicing_Auto ///< Ignore overlap strand if the source location
582                         ///< has mixed/both strand.
583 };
584 NCBI_XOBJUTIL_EXPORT
585 CConstRef<CSeq_feat> GetOverlappingGene(
586     const CSeq_loc& loc, CScope& scope,
587     ETransSplicing eTransSplicing = eTransSplicing_Auto);
588 
589 
590 /// Finds gene for feature, but obeys SeqFeatXref directives
591 NCBI_XOBJUTIL_EXPORT
592 CConstRef<CSeq_feat> GetGeneForFeature(const CSeq_feat& feat, CScope& scope);
593 
594 /// Determines whether given feature is pseudo, using gene associated with feature
595 /// if necessary
596 /// Checks to see if a feature is pseudo. Looks for pseudo flag set on feature,
597 /// looks for pseudogene qualifier on feature, performs same checks for gene
598 /// associated with feature
599 /// @param feat Seq-feat to check
600 /// @param scope CScope to use when looking for associated gene
601 /// @return Boolean return value indicates whether any of the "pseudo" markers are found
602 NCBI_XOBJUTIL_EXPORT
603 bool IsPseudo(const CSeq_feat& feat, CScope& scope);
604 
605 
606 NCBI_XOBJUTIL_EXPORT
607 CConstRef<CSeq_feat> GetOverlappingmRNA(const CSeq_loc& loc, CScope& scope);
608 
609 
610 NCBI_XOBJUTIL_EXPORT
611 CConstRef<CSeq_feat> GetOverlappingCDS(const CSeq_loc& loc, CScope& scope);
612 
613 
614 NCBI_XOBJUTIL_EXPORT
615 CConstRef<CSeq_feat> GetOverlappingPub(const CSeq_loc& loc, CScope& scope);
616 
617 
618 NCBI_XOBJUTIL_EXPORT
619 CConstRef<CSeq_feat> GetOverlappingSource(const CSeq_loc& loc, CScope& scope);
620 
621 
622 NCBI_XOBJUTIL_EXPORT
623 CConstRef<CSeq_feat> GetOverlappingOperon(const CSeq_loc& loc, CScope& scope);
624 
625 
626 /// Get the encoding CDS feature of a given protein sequence.
627 NCBI_XOBJUTIL_EXPORT
628 const CSeq_feat* GetCDSForProduct(const CBioseq& product, CScope* scope);
629 NCBI_XOBJUTIL_EXPORT
630 const CSeq_feat* GetCDSForProduct(const CBioseq_Handle& product);
631 NCBI_XOBJUTIL_EXPORT
632 CMappedFeat GetMappedCDSForProduct(const CBioseq_Handle& product);
633 
634 
635 /// Get the mature peptide feature of a protein
636 NCBI_XOBJUTIL_EXPORT
637 const CSeq_feat* GetPROTForProduct(const CBioseq& product, CScope* scope);
638 NCBI_XOBJUTIL_EXPORT
639 const CSeq_feat* GetPROTForProduct(const CBioseq_Handle& product);
640 
641 
642 /// Get the encoding mRNA feature of a given mRNA (cDNA) bioseq.
643 NCBI_XOBJUTIL_EXPORT
644 const CSeq_feat* GetmRNAForProduct(const CBioseq& product, CScope* scope);
645 NCBI_XOBJUTIL_EXPORT
646 const CSeq_feat* GetmRNAForProduct(const CBioseq_Handle& product);
647 NCBI_XOBJUTIL_EXPORT
648 CMappedFeat GetMappedmRNAForProduct(const CBioseq_Handle& product);
649 
650 /* @} */
651 
652 
653 /** @name Sequences
654  * Searching for bioseqs etc.
655  * @{
656  */
657 
658 /// Get the encoding nucleotide sequnce of a protein.
659 NCBI_XOBJUTIL_EXPORT
660 const CBioseq* GetNucleotideParent(const CBioseq& product, CScope* scope);
661 NCBI_XOBJUTIL_EXPORT
662 CBioseq_Handle GetNucleotideParent(const CBioseq_Handle& product);
663 
664 /// Get the parent bioseq for a part of a segmented bioseq
665 NCBI_XOBJUTIL_EXPORT
666 CBioseq_Handle GetParentForPart(const CBioseq_Handle& part);
667 
668 /// Return the org-ref associated with a given sequence.  This will throw
669 /// a CException if there is no org-ref associated with the sequence
670 NCBI_XOBJUTIL_EXPORT
671 const COrg_ref& GetOrg_ref(const CBioseq_Handle& handle);
672 /// Return the pointer to org-ref associated with a given sequence
673 /// or null if there is no org-ref associated with the sequence
674 NCBI_XOBJUTIL_EXPORT
675 const COrg_ref* GetOrg_refOrNull(const CBioseq_Handle& handle);
676 
677 /// return the tax-id associated with a given sequence.  This will return 0
678 /// if no tax-id can be found.
679 NCBI_XOBJUTIL_EXPORT
680 TTaxId GetTaxId(const CBioseq_Handle& handle);
681 
682 /// Retrieve the MolInfo object for a given bioseq handle.  If the supplied
683 /// sequence does not have a MolInfo associated with it, this will return NULL
684 NCBI_XOBJUTIL_EXPORT
685 const CMolInfo* GetMolInfo(const CBioseq& bioseq);
686 
687 NCBI_XOBJUTIL_EXPORT
688 const CMolInfo* GetMolInfo(const CBioseq_Handle& handle);
689 
690 /// Retrieve the BioSource object for a given bioseq handle.  If the supplied
691 /// sequence does not have a MolInfo associated with it, this will return NULL
692 NCBI_XOBJUTIL_EXPORT
693 const CBioSource* GetBioSource(const CBioseq& bioseq);
694 
695 NCBI_XOBJUTIL_EXPORT
696 const CBioSource* GetBioSource(const CBioseq_Handle& handle);
697 
698 
699 /// Retrieve the Bioseq Handle from a location.
700 /// location refers to a single bioseq:
701 ///   return the bioseq
702 /// location referes to multiple bioseqs:
703 ///   if parts of a segmentd bioseq, returns the segmentd bioseq.
704 ///   otherwise, return the first bioseq that could be found
705 ///   (first localy then, if flag is eGetBioseq_All, remote)
706 NCBI_XOBJUTIL_EXPORT
707 CBioseq_Handle GetBioseqFromSeqLoc(const CSeq_loc& loc, CScope& scope,
708     CScope::EGetBioseqFlag flag = CScope::eGetBioseq_Loaded);
709 
710 
711 /// Return protein name from corresponding Prot-ref feature.
712 /// Throws exception if the sequence is not a protein,
713 /// or if there is no unambiguously best Prot-ref feature,
714 /// or if the feature doesn't return non-empty label.
715 NCBI_XOBJUTIL_EXPORT
716 string GetProteinName(const CBioseq_Handle& seq);
717 
718 
719 /* @} */
720 
721 
722 class NCBI_XOBJUTIL_EXPORT
723 CSeqIdFromHandleException : EXCEPTION_VIRTUAL_BASE public CException
724 {
725 public:
726     // Enumerated list of document management errors
727     enum EErrCode {
728         eNoSynonyms,
729         eRequestedIdNotFound
730     };
731 
732     // Translate the specific error code into a string representations of
733     // that error code.
734     virtual const char* GetErrCodeString(void) const override;
735 
736     NCBI_EXCEPTION_DEFAULT(CSeqIdFromHandleException, CException);
737 };
738 
739 
740 END_SCOPE(sequence)
741 
742 
743 /// FASTA-format output; see also ReadFasta in <objtools/readers/fasta.hpp>
744 
745 class NCBI_XOBJUTIL_EXPORT CFastaOstream {
746 public:
747     enum EFlags : long {
748         fAssembleParts      = 1 <<  0, ///< assemble FAR delta sequences; on by dflt
749         fInstantiateGaps    = 1 <<  1, ///< honor specifed gap mode; on by default
750         fSuppressRange      = 1 <<  2, ///< never include location details in defline
751         fReverseStrand      = 1 <<  3, ///< flip the (implicit) location
752         fKeepGTSigns        = 1 <<  4, ///< don't convert '>' to '_' in title
753         fMapMasksUp         = 1 <<  5, ///< honor masks specified at a lower level
754         fMapMasksDown       = 1 <<  6, ///< honor masks specified at a higher level
755         fNoExpensiveOps     = 1 <<  7, ///< don't try too hard to find titles
756         fShowModifiers      = 1 <<  8, ///< show key-value pair modifiers (e.g. "[organism=Homo sapiens]")
757         fNoDupCheck         = 1 <<  9, ///< skip check for duplicate sequence IDs
758         fShowGapModifiers   = 1 << 10, ///< show gap key-value pair modifiers (e.g. "[linkage-evidence=map;strobe]"). Only works if gap mode is eGM_count.
759         fKeepUnknGapNomLen  = 1 << 11, ///< Keep unknown gap's nominal length.  That is, when a gap has an unknown length but nominal length, use that instead of just making it 100.
760         fShowGapsOfSizeZero = 1 << 12, ///< Use this to show gaps of size zero as a lone hyphen at the end of a line.
761         fEnableGI           = 1 << 13, ///< Use this flag to enable GI output in the defline
762         fHideGenBankPrefix  = 1 << 14, ///< Hide gb| prefix for genbank only seq_id's
763         fHTMLEncode         = 1 << 15, ///< Encode the title string for HTML display
764         fIgnoreOriginalID   = 1 << 16, ///< Disregard original ID when constructing defline
765         // historically misnamed as eFlagName
766         eAssembleParts   = fAssembleParts,
767         eInstantiateGaps = fInstantiateGaps,
768         fUseAutoDef         = 1 << 17,  ///< Disregard original ID when constructing defline
769         fBaseFirstUnused    = 1 << 18, ///< first avalailabe for derived classes
770         fDoNotUseAutoDef    = 1 << 19
771     };
772     typedef long TFlags; ///< binary OR of EFlags
773 
774     /// How to represent gaps with fInstantiateGaps enabled, as it is
775     /// by default.  (Disabling fInstantiateGaps is equivalent to
776     /// requesting eGM_one_dash.)
777     enum EGapMode {
778         eGM_one_dash, ///< A single dash, followed by a line break.
779         eGM_dashes,   ///< Multiple inline dashes.
780         eGM_letters,  ///< Multiple inline Ns or Xs as appropriate (default).
781         eGM_count     ///< >?N or >?unk100, as appropriate.
782     };
783 
784     CFastaOstream(CNcbiOstream& out);
785     virtual ~CFastaOstream();
786 
787     /// Unspecified locations designate complete sequences;
788     /// non-empty custom titles override the usual title determination logic
789     virtual void Write        (const CSeq_entry_Handle& handle,
790                                const CSeq_loc* location = 0);
791     virtual void Write        (const CBioseq_Handle& handle,
792                                const CSeq_loc* location = 0,
793                                const string& custom_title = kEmptyStr);
794     virtual void WriteTitle   (const CBioseq_Handle& handle,
795                                const CSeq_loc* location = 0,
796                                const string& custom_title = kEmptyStr);
797     virtual void WriteSequence(const CBioseq_Handle& handle,
798                                const CSeq_loc* location = 0,
799                                CSeq_loc::EOpFlags merge_flags=CSeq_loc::fMerge_AbuttingOnly);
800 
801     /// These versions may set up a temporary object manager scope
802     /// In the common case of a raw bioseq, no scope is needed
803     void Write(const CSeq_entry& entry, const CSeq_loc* location = 0,
804                bool no_scope = false);
805     void Write(const CBioseq&    seq,   const CSeq_loc* location = 0,
806                bool no_scope = false,   const string& custom_title = kEmptyStr);
807     void WriteTitle(const CBioseq& seq, const CSeq_loc* location = 0,
808                     bool no_scope=false, const string& custom_title=kEmptyStr);
809 
810     /// Used only by Write(CSeq_entry[_Handle], ...); permissive by default
SkipBioseq(const CBioseq &)811     virtual bool SkipBioseq(const CBioseq& /* seq */) { return false; }
812     /// Delegates to the non-handle version by default for
813     /// compatibility with older code; newer code should override this
814     /// version.
SkipBioseq(const CBioseq_Handle & handle)815     virtual bool SkipBioseq(const CBioseq_Handle& handle)
816     { return SkipBioseq(*handle.GetCompleteBioseq()); }
817 
818     /// Which residues to mask out in subsequent output.
819     /// These do NOT automatically reset between calls to Write;
820     /// you must do so yourself by setting them to null.
821     enum EMaskType {
822         eSoftMask = 1, ///< write as lowercase rather than uppercase
823         eHardMask = 2  ///< write as N for nucleotides, X for peptides
824     };
825     CConstRef<CSeq_loc> GetMask(EMaskType type) const;
826     void                SetMask(EMaskType type, CConstRef<CSeq_loc> location);
827 
828     /// Other parameters...
GetWidth(void) const829     TSeqPos GetWidth   (void) const    { return m_Width;   }
830     void    SetWidth   (TSeqPos width);
GetAllFlags(void) const831     TFlags  GetAllFlags(void) const    { return m_Flags;   }
SetAllFlags(TFlags flags)832     void    SetAllFlags(TFlags flags)  { m_Flags = flags;  }
SetFlag(EFlags flag)833     void    SetFlag    (EFlags flag)   { m_Flags |=  flag; }
ResetFlag(EFlags flag)834     void    ResetFlag  (EFlags flag)   { m_Flags &= ~flag; }
SetGapMode(EGapMode mode)835     void    SetGapMode (EGapMode mode) { m_GapMode = mode; }
GetGapMode(void) const836     EGapMode GetGapMode(void) const    { return m_GapMode; }
837 
838     /// This indicates the text of the modifiers of a gap.
839     struct NCBI_XOBJUTIL_EXPORT SGapModText {
840         /// String representing the gap type.
841         /// Examples: "short_arm", "telomere", etc.
842         string gap_type;
843         /// A vector representing the linkage-evidences of the gap.
844         /// Example linkage-evidences: "align genus", "within clone", etc.
845         vector<string> gap_linkage_evidences;
846 
847         // more fields may be added in the future.
848 
849         /// This will write the modifiers in
850         /// FASTA format.  (example: "[gap-type=short_arm]")
851         void WriteAllModsAsFasta( CNcbiOstream & out ) const;
852     };
853 
854     /// Given a CSeq_gap object, this outputs the
855     /// Gap information
856     ///
857     /// @param seq_gap
858     ///   This is the seq_gap information we're using to figure out
859     ///   the gap mod text
860     /// @param out_gap_mod_text
861     ///   This holds the result.
862     static void
863     GetGapModText(
864         const CSeq_gap & seq_gap,
865         SGapModText & out_gap_mod_text );
866 
867 protected:
868     CNcbiOstream&       m_Out;
869     auto_ptr<sequence::CDeflineGenerator> m_Gen;
870 
871     virtual void x_WriteSeqIds    ( const CBioseq& bioseq,
872                                     const CSeq_loc* location);
873     virtual void x_WriteAsFasta   ( const CBioseq& bioseq );
874     virtual void x_GetBestId(CConstRef<CSeq_id>& gi_id, CConstRef<CSeq_id>& best_id, bool& hide_prefix, const CBioseq& bioseq);
875     //virtual void x_WriteModifiers ( const CBioseq_Handle & handle );
876     virtual void x_WriteSeqTitle( const CBioseq_Handle & handle,
877                                     const string& custom_title);
x_WriteBuffer(const char * buf,unsigned int count)878     virtual void x_WriteBuffer( const char* buf, unsigned int count) { m_Out.write(buf, count); };
879 
880     TFlags              m_Flags;
881 
882 private:
883     CConstRef<CSeq_loc> m_SoftMask;
884     CConstRef<CSeq_loc> m_HardMask;
885     TSeqPos             m_Width;
886     EGapMode            m_GapMode;
887     TSeq_id_HandleSet   m_PreviousWholeIds;
888     // avoid recomputing for every sequence
889     typedef AutoPtr<char, ArrayDeleter<char> > TCharBuf;
890     TCharBuf            m_Dashes, m_LC_Ns, m_LC_Xs, m_UC_Ns, m_UC_Xs;
891 
892     sequence::CDeflineGenerator::TUserFlags x_GetTitleFlags(void) const;
893 
894     //void x_PrintStringModIfNotDup(
895     //    bool *seen, const CTempString & key, const CTempString & value );
896     //void x_PrintIntModIfNotDup(
897     //    bool *seen, const CTempString & key, const int value );
898 
899     CConstRef<CSeq_loc> x_MapMask(CSeq_loc_Mapper& mapper, const CSeq_loc& mask,
900                                   const CSeq_id* base_seq_id, CScope* scope);
901 
902     typedef map<TSeqPos, int> TMSMap;
903     void x_GetMaskingStates(TMSMap& masking_states,
904                             const CSeq_id* base_seq_id,
905                             const CSeq_loc* location,
906                             CScope* scope);
907 
908     void x_WriteSequence(const CSeqVector& vec,
909                          const TMSMap& masking_state);
910 };
911 
912 
913 /// Public interface for coding region translation function
914 /// Uses CTrans_table in <objects/seqfeat/Genetic_code_table.hpp>
915 /// for rapid translation from a given genetic code, allowing all
916 /// of the iupac nucleotide ambiguity characters
917 
918 class NCBI_XOBJUTIL_EXPORT CCdregion_translate
919 {
920 public:
921 
922     enum ETranslationLengthProblemOptions {
923         eThrowException = 0,
924         eTruncate,
925         ePad
926     };
927 
928     /// translation coding region into ncbieaa protein sequence
929     NCBI_DEPRECATED
930     static void TranslateCdregion (string& prot,
931                                    const CBioseq_Handle& bsh,
932                                    const CSeq_loc& loc,
933                                    const CCdregion& cdr,
934                                    bool include_stop = true,
935                                    bool remove_trailing_X = false,
936                                    bool* alt_start = 0,
937                                    ETranslationLengthProblemOptions options = eThrowException);
938 
939     NCBI_DEPRECATED
940     static void TranslateCdregion(string& prot,
941                                   const CSeq_feat& cds,
942                                   CScope& scope,
943                                   bool include_stop = true,
944                                   bool remove_trailing_X = false,
945                                   bool* alt_start = 0,
946                                   ETranslationLengthProblemOptions options = eThrowException);
947 };
948 
949 
950 class NCBI_XOBJUTIL_EXPORT CSeqTranslator
951 {
952 public:
953     /// @sa TTranslationFlags
954     enum ETranslationFlags {
955         fDefault = 0,
956         fNoStop = (1<<0),          ///< = 0x1 Do not include stop in translation
957         fRemoveTrailingX = (1<<1), ///< = 0x2 Remove trailing Xs from protein
958         fIs5PrimePartial = (1<<2), ///< = 0x4 Translate first codon even if not start codon (because sequence is 5' partial)
959         fIs3PrimePartial = (1<<3)  ///< = 0x8 May not end in stop codon (because sequence is 3' partial)
960     };
961 
962     typedef int TTranslationFlags;
963 
964 
965 
966     /// Translate a string using a specified genetic code
967     /// @param seq
968     ///   String containing IUPAC representation of sequence to be translated
969     /// @param code
970     ///   Genetic code to use for translation (NULL to use default)
971     /// @param include_stop
972     ///   If true, translate through stop codons and include trailing stop
973     ///   (true by default)
974     /// @param remove_trailing_X
975     ///   If true, remove trailing Xs from protein translation (false by
976     ///   default)
977     /// @param alt_start
978     ///   Pointer to bool to indicate whether an alternative start codon was
979     ///   used
980     /// @param is_5prime_complete
981     ///   If true, only translate first codon if start codon, otherwise
982     ///   translate as dash (-) to indicate problem with sequence
983 
984     NCBI_DEPRECATED static void Translate(const string& seq,
985                           string& prot,
986                           const CGenetic_code* code,
987                           bool include_stop = true,
988                           bool remove_trailing_X = false,
989                           bool* alt_start = NULL,
990                           bool is_5prime_complete = true,
991                           bool is_3prime_complete = true);
992 
993     /// Translate a string using a specified genetic code
994     /// @param seq
995     ///   String containing IUPAC representation of sequence to be translated
996     /// @param code
997     ///   Genetic code to use for translation (NULL to use default)
998     /// @param flags
999     ///   Binary OR of "ETranslationFlags"
1000     /// @param alt_start
1001     ///   Pointer to bool to indicate whether an alternative start codon was
1002     ///   used
1003     static void Translate(const string& seq,
1004                           string& prot,
1005                           TTranslationFlags flags = fDefault,
1006                           const CGenetic_code* code = NULL,
1007                           bool* alt_start = NULL);
1008 
1009     /// Translate a seq-vector using a specified genetic code
1010     /// if the code is NULL, then the default genetic code is used
1011     /// @param seq
1012     ///   CSeqVector of sequence to be translated
1013     /// @param code
1014     ///   Genetic code to use for translation (NULL to use default)
1015     /// @param include_stop
1016     ///   If true, translate through stop codons and include trailing stop
1017     ///   (true by default)
1018     /// @param remove_trailing_X
1019     ///   If true, remove trailing Xs from protein translation (false by
1020     ///   default)
1021     /// @param alt_start
1022     ///   Pointer to bool to indicate whether an alternative start codon was
1023     ///   used
1024     /// @param is_5prime_complete
1025     ///   If true, only translate first codon if start codon, otherwise
1026     ///   translate as dash (-) to indicate problem with sequence
1027     NCBI_DEPRECATED static void Translate(const CSeqVector& seq,
1028                           string& prot,
1029                           const CGenetic_code* code,
1030                           bool include_stop = true,
1031                           bool remove_trailing_X = false,
1032                           bool* alt_start = NULL,
1033                           bool is_5prime_complete = true,
1034                           bool is_3prime_complete = true);
1035 
1036     /// Translate a seq-vector using a specified genetic code
1037     /// if the code is NULL, then the default genetic code is used
1038     /// @param seq
1039     ///   CSeqVector of sequence to be translated
1040     /// @param code
1041     ///   Genetic code to use for translation (NULL to use default)
1042     /// @param flags
1043     ///   Binary OR of "ETranslationFlags"
1044     /// @param alt_start
1045     ///   Pointer to bool to indicate whether an alternative start codon was
1046     ///   used
1047     static void Translate(const CSeqVector& seq,
1048                           string& prot,
1049                           TTranslationFlags flags = fDefault,
1050                           const CGenetic_code* code = NULL,
1051                           bool* alt_start = NULL);
1052 
1053     /// utility function: translate a given location on a sequence
1054     NCBI_DEPRECATED
1055     static void Translate(const CSeq_loc& loc,
1056                           const CBioseq_Handle& handle,
1057                           string& prot,
1058                           const CGenetic_code* code = NULL,
1059                           bool include_stop = true,
1060                           bool remove_trailing_X = false,
1061                           bool* alt_start = 0);
1062 
1063     /// utility function: translate a given location on a sequence
1064     static void Translate(const CSeq_loc& loc,
1065                           CScope& scope,
1066                           string& prot,
1067                           const CGenetic_code* code = NULL,
1068                           bool include_stop = true,
1069                           bool remove_trailing_X = false,
1070                           bool* alt_start = 0);
1071 
1072     /// Translate a CDRegion into a protein
1073     static void Translate(const CSeq_feat& cds,
1074                           CScope& scope,
1075                           string& prot,
1076                           bool include_stop = true,
1077                           bool remove_trailing_X = false,
1078                           bool* alt_start = 0);
1079 
1080     static CRef<CBioseq> TranslateToProtein(const CSeq_feat& cds,
1081                                               CScope& scope);
1082 
1083     static bool ChangeDeltaProteinToRawProtein(CRef<CBioseq> protein);
1084 
1085     /// Find "best" frame for a coding region. "Best" frame has no
1086     /// internal stop codons.
1087     static CCdregion::EFrame FindBestFrame(const CSeq_feat& cds, CScope& scope);
1088     static CCdregion::EFrame FindBestFrame(const CSeq_feat& cds, CScope& scope, bool& ambiguous);
1089 
1090 };
1091 
1092 
1093 
1094 /// Location relative to a base Seq-loc: one (usually) or more ranges
1095 /// of offsets.
1096 /// XXX - handle fuzz?
1097 struct NCBI_XOBJUTIL_EXPORT SRelLoc
1098 {
1099     enum EFlags {
1100         fNoMerge = 0x1 ///< don't merge adjacent intervals
1101     };
1102     typedef int TFlags; ///< binary OR of EFlags
1103 
1104     /// For relative ranges (ONLY), id is irrelevant and normally unset.
1105     typedef CSeq_interval         TRange;
1106     typedef vector<CRef<TRange> > TRanges;
1107 
1108     /// Beware: treats locations corresponding to different sequences as
1109     /// disjoint, even if one is actually a segment of the other. :-/
1110     SRelLoc(const CSeq_loc& parent, const CSeq_loc& child, CScope* scope = 0,
1111             TFlags flags = 0);
1112 
1113     /// For manual work.  As noted above, ranges need not contain any IDs.
SRelLocSRelLoc1114     SRelLoc(const CSeq_loc& parent, const TRanges& ranges)
1115         : m_ParentLoc(&parent), m_Ranges(ranges) { }
1116 
ResolveSRelLoc1117     CRef<CSeq_loc> Resolve(CScope* scope = 0, TFlags flags = 0) const
1118         { return Resolve(*m_ParentLoc, scope, flags); }
1119     CRef<CSeq_loc> Resolve(const CSeq_loc& new_parent, CScope* scope = 0,
1120                            TFlags flags = 0) const;
1121 
1122     CConstRef<CSeq_loc> m_ParentLoc;
1123     TRanges             m_Ranges;
1124 };
1125 
1126 
1127 
1128 ///============================================================================//
1129 ///                             Sequence Search                                //
1130 ///============================================================================//
1131 
1132 /// CSeqSearch
1133 /// ==========
1134 ///
1135 /// Search a nucleotide sequence for one or more patterns
1136 ///
1137 
1138 class NCBI_XOBJUTIL_EXPORT CSeqSearch
1139 {
1140 public:
1141 
1142     /// Holds information associated with a pattern, such as the name of the
1143     /// restriction enzyme, location of cut site etc.
1144     class CPatternInfo
1145     {
1146     public:
1147         /// constructor
CPatternInfo(const string & name,const string & sequence,Int2 cut_site)1148         CPatternInfo(const      string& name,
1149                      const      string& sequence,
1150                      Int2       cut_site) :
1151             m_Name(name), m_Sequence(sequence), m_CutSite(cut_site),
1152             m_Strand(eNa_strand_unknown)
1153         {}
1154 
GetName(void) const1155         const string& GetName     (void) const { return m_Name;     }
GetSequence(void) const1156         const string& GetSequence (void) const { return m_Sequence; }
GetCutSite(void) const1157         Int2          GetCutSite  (void) const { return m_CutSite;  }
GetStrand(void) const1158         ENa_strand    GetStrand   (void) const { return m_Strand;   }
1159 
1160     private:
1161         friend class CSeqSearch;
1162 
1163         /// data
1164         string      m_Name;      /// user defined name
1165         string      m_Sequence;  /// nucleotide sequence
1166         Int2        m_CutSite;
1167         ENa_strand  m_Strand;
1168     };
1169     typedef CPatternInfo    TPatternInfo;
1170 
1171     /// Client interface:
1172     /// ==================
1173     /// A class that uses the SeqSearch facility should implement the Client
1174     /// interface and register itself with the search utility to be notified
1175     /// of matches detection.
1176     class IClient
1177     {
1178     public:
~IClient()1179         virtual ~IClient() { }
1180 
1181         typedef CSeqSearch::TPatternInfo    TPatternInfo;
1182 
1183         virtual bool OnPatternFound(const TPatternInfo& pat_info, TSeqPos pos) = 0;
1184     };
1185 
1186 public:
1187 
1188     enum ESearchFlag {
1189         fNoFlags       = 0,
1190         fJustTopStrand = 1,
1191         fExpandPattern = 2,
1192         fAllowMismatch = 4
1193     };
1194     typedef unsigned int TSearchFlags; ///< binary OR of ESearchFlag
1195 
1196     /// constructors
1197     /// @param client
1198     ///   pointer to a client object (receives pattern match notifications)
1199     /// @param flags
1200     ///   specify search flags
1201     CSeqSearch(IClient *client = 0, TSearchFlags flags = fNoFlags);
1202     /// destructor
1203     ~CSeqSearch(void);
1204 
1205     /// Add nucleotide pattern or restriction site to sequence search.
1206     /// Uses ambiguity codes, e.g., R = A and G, H = A, C and T
1207     void AddNucleotidePattern(const string& name,       /// pattern's name
1208                               const string& sequence,   /// pattern's sequence
1209                               Int2          cut_site,
1210                               TSearchFlags  flags = fNoFlags);
1211 
1212     /// Search the sequence for patterns
1213     /// @sa
1214     ///   AddNucleotidePattern
1215     void Search(const CBioseq_Handle& seq);
1216 
1217     /// Low level search method.
1218     /// The user is responsible for feeding each character in turn,
1219     /// keep track of the position in the text and provide the length in case of
1220     /// a circular topoloy.
1221     int Search(int current_state, char ch, int position, int length = kMax_Int);
1222 
1223     /// Get / Set client.
GetClient() const1224     const IClient* GetClient() const { return m_Client; }
SetClient(IClient * client)1225     void SetClient(IClient* client) { m_Client = client; }
1226 
1227 private:
1228 
1229     void x_AddNucleotidePattern(const string& name, string& pattern,
1230         Int2 cut_site, ENa_strand strand, TSearchFlags flags);
1231 
1232     void x_ExpandPattern(string& sequence, string& buffer, size_t pos,
1233         TPatternInfo& pat_info, TSearchFlags flags);
1234 
1235     void x_AddPattern(TPatternInfo& pat_info, string& sequence, TSearchFlags flags);
1236     void x_StorePattern(TPatternInfo& pat_info, string& sequence);
1237 
x_IsJustTopStrand(TSearchFlags flags) const1238     bool x_IsJustTopStrand(TSearchFlags flags) const {
1239         return ((m_Flags | flags) & fJustTopStrand) != 0;
1240     }
x_IsExpandPattern(TSearchFlags flags) const1241     bool x_IsExpandPattern(TSearchFlags flags) const {
1242         return ((m_Flags | flags) & fExpandPattern) != 0;
1243     }
x_IsAllowMismatch(TSearchFlags flags) const1244     bool x_IsAllowMismatch(TSearchFlags flags) const {
1245         return ((m_Flags | flags) & fAllowMismatch) != 0;
1246     }
1247 
1248     // data
1249     IClient*                m_Client;          // pointer to client object
1250     TSearchFlags            m_Flags;           // search flags
1251     size_t                  m_LongestPattern;  // longets search pattern
1252     CTextFsm<TPatternInfo>  m_Fsa;             // finite state machine
1253 };  //  end of CSeqSearch
1254 
1255 
1256 /// This trims ambiguous bases from the start and/or end of
1257 /// sequences, using customizable rules.
1258 class NCBI_XOBJUTIL_EXPORT CSequenceAmbigTrimmer : public CObject
1259 {
1260 public:
1261 
1262     /// This enum is used to set what is meant by "ambiguous".
1263     enum EMeaningOfAmbig {
1264         /// Here, only N for nucleotides and X for amino acids is considered
1265         /// ambiguous.
1266         eMeaningOfAmbig_OnlyCompletelyUnknown,
1267         /// Here, anything that's not certain is considered
1268         /// ambiguous.  That is, anything but A, C, G, T for nucleotides,
1269         /// and most amino acids except, for example, B (which can be
1270         /// aspartic acid or asparagine), X (completely ambiguous), etc.
1271         eMeaningOfAmbig_AnyAmbig,
1272     };
1273 
1274     enum EFlags {
1275         fFlags_DoNotTrimBeginning = (1 << 0), ///< 0x01 ("Beginning" as defined by CSeqVector)
1276         fFlags_DoNotTrimEnd       = (1 << 1), ///< 0x02 ("End" as defined by CSeqVector)
1277 
1278         fFlags_DoNotTrimSeqGap    = (1 << 2), ///< 0x04 (Seq-gaps are not considered trimmable if this flag is set, only letter gaps (e.g. N's for nucs))
1279 
1280         // we might support this in the future
1281         // fFlags_TrimAnnot         = (1 << 3)   ///< 0x08 (Trim annots based on trimmed bioseq location)
1282     };
1283     typedef int TFlags;
1284 
1285     /// For example, if bases_to_check is 10 and max_bases_allowed_to_be_ambig
1286     /// is 5, then on each iteration we check the 10 terminal bases and
1287     /// trim off those 10 if there are more than 5 ambiguous bases there.
1288     struct NCBI_XOBJUTIL_EXPORT STrimRule {
1289         TSignedSeqPos bases_to_check;
1290         TSignedSeqPos max_bases_allowed_to_be_ambig;
1291     };
1292     /// Multiple STrimRules are allowed, which are applied from
1293     /// smallest bases_to_check to largest bases_to_check, and
1294     /// redundant rules are automatically removed.  When a rule is applied,
1295     /// we start over at the first sorted rule again.
1296     typedef vector<STrimRule> TTrimRuleVec;
1297 
1298     /// This returns a reasonable default for trimming rules.
1299     static const TTrimRuleVec & GetDefaultTrimRules(void);
1300 
1301     /// This sets up the parameters for how this trimmer will act
1302     ///
1303     /// @param eMeaningOfAmbig
1304     ///   This indicates exactly what ambiguous means (e.g. just "N" or
1305     ///   do all ambiguous symbols count? )
1306     /// @param fFlags
1307     ///   miscellaneous parameters to control this.  See TFlags.
1308     /// @param vecTrimRules
1309     ///   This indicates how trimming will occur.  See TTrimRuleVec.
1310     /// @param uMinSeqLen
1311     ///   Trimming tries to halt if the sequence becomes smaller than this size.
1312     ///   It is possible for the resulting sequence to be below the
1313     ///   uMinSeqLen size (or even trimmed to nothing), but the trimmer
1314     ///   will at least <i>try</i> not to do that.
1315     CSequenceAmbigTrimmer(
1316         EMeaningOfAmbig eMeaningOfAmbig,
1317         TFlags fFlags = 0,
1318         const TTrimRuleVec & vecTrimRules = GetDefaultTrimRules(),
1319         TSignedSeqPos uMinSeqLen = 50 );
1320 
1321     /// Do-nothing destructor just to allow inheritance.
~CSequenceAmbigTrimmer()1322     virtual ~CSequenceAmbigTrimmer() { }
1323 
1324     /// This indicates what happened with the trim.
1325     /// Error states are indicated by an exception, not EResult.
1326     enum EResult {
1327         /// Bioseq is now trimmed.
1328         eResult_SuccessfullyTrimmed,
1329 
1330         /// Bioseq is left unchanged because it did not need to be trimmed
1331         /// at all.  This is NOT an error.
1332         eResult_NoTrimNeeded
1333     };
1334 
1335     /// This trims the given bioseq, using params
1336     /// set in the CSequenceAmbigTrimmer constructor.  It will properly
1337     /// handle the annots and descs inside the bioseq, too, if requested.
1338     ///
1339     /// @param bioseq_handle
1340     ///   The bioseq to trim.
1341     /// @param trimmed_ranges
1342     ///   The ranges trimmed by DoTrim will be added to this.
1343     /// @return
1344     ///   This returns how the trimming went.  On error, an exception
1345     ///   is thrown and the bioseq may be in an undefined state.
1346     virtual EResult DoTrim( CBioseq_Handle &bioseq_handle,
1347                             CRangeCollection<TSeqPos> *trimmed_ranges = nullptr);
1348 
1349 protected:
1350     /// This holds the current interpretation for "ambiguous".  For example,
1351     /// it indicates whether just 'N' is ambiguous or if any non-ACGT
1352     /// letter is ambiguous.  Works for amino acids, too (e.g. 'X' for
1353     /// completely unknown, etc.)
1354     EMeaningOfAmbig m_eMeaningOfAmbig;
1355     /// This holds the flags that affect the behavior of this class.
1356     TFlags          m_fFlags;
1357     /// This holds the trimming rules that will be applied.
1358     /// It should be normalized by the constructor
1359     /// to eliminate dups and to sort it from least to most bases.
1360     TTrimRuleVec    m_vecTrimRules;
1361     /// When the bioseq gets trimmed down to less than this size,
1362     /// we halt the trimming.
1363     TSignedSeqPos         m_uMinSeqLen;
1364 
1365     /// Test if a given flag is set.
x_TestFlag(TFlags fFlag)1366     bool x_TestFlag(TFlags fFlag) {
1367         return ( ( m_fFlags & fFlag ) != 0 );
1368     }
1369 
1370     /// This prepares the vector of trimming rules to be used
1371     /// by the trimming algorithm. For example, it eliminate duplicates
1372     /// and puts the rules in the correct order.
1373     ///
1374     /// @param vecTrimRules
1375     ///   Input and output.
1376     virtual void x_NormalizeVecTrimRules( TTrimRuleVec & vecTrimRules );
1377 
1378     /// The bioseq is trimmed to size 0.
1379     ///
1380     /// @param bioseq_handle
1381     ///   The bioseq to trim to nothing.
1382     /// @returns
1383     ///   Works just like the DoTrim return value.
1384     virtual EResult x_TrimToNothing( CBioseq_Handle &bioseq_handle );
1385 
1386     // below this point, left/right means positions going numerically upward,
1387     // but start/end is relative to direction. That is,
1388     // a negative direction would imply end &lt;= start.
1389 
1390     /// This returns the last good base that won't be trimmed
1391     /// (note: last really means "first" when we're starting from the end)
1392     ///
1393     /// @param seqvec
1394     ///   This lets us explore the Bioseq to find out where to trim.
1395     /// @param iStartPosInclusive_arg
1396     ///   This is the where we start our trimming.  Depending on
1397     ///   direction, this could be &lt; or &gt; iEndPosInclusive_arg.
1398     /// @param iEndPosInclusive_arg
1399     ///   This is where the trimming ends (inclusive).  Analogous to
1400     ///   iStartPosInclusive_arg.
1401     /// @param iTrimDirection
1402     ///   1 to trim from left to right, -1 to trim from right to left.
1403     /// @return
1404     ///   The last good base (remember: last means "lower number" when we're
1405     ///   checking from the end).  If trimming would trim off the entire
1406     ///   sequence, it returns a position past the end of the sequence.
1407     virtual TSignedSeqPos x_FindWhereToTrim(
1408         const CSeqVector & seqvec,
1409         const TSignedSeqPos iStartPosInclusive_arg,
1410         const TSignedSeqPos iEndPosInclusive_arg,
1411         TSignedSeqPos iTrimDirection );
1412 
1413     /// This adjusts in_out_uStartOfGoodBasesSoFar if we're at
1414     /// a CSeqMap gap.  It does not notice ambiguous bases that
1415     /// are inside a normal sequence.
1416     ///
1417     /// @param seqvec
1418     ///   This is used to access information about the sequence.
1419     /// @param in_out_uStartOfGoodBasesSoFar
1420     ///   This is the start of where we check for a gap.
1421     ///   It will be updated to be past the gap, if a gap is found.
1422     /// @param in_out_uRightmostGoodBaseSoFar
1423     ///   Analogous to in_out_uLeftmostGoodBaseSoFar.  It's inclusive.
1424     /// @param uEndOfGoodBasesSoFar
1425     ///   This limits how far this function may search (inclusive)
1426     ///   when looking for
1427     ///   the end of a gap segment.
1428     /// @param iTrimDirection
1429     ///   1 to trim from left to right, -1 to trim from right to left.
1430     /// @param uChunkSize
1431     ///   The gap size that we chop off must be a multiple of uChunkSize.
1432     ///   We will chop off less if we would go more than 1 past the
1433     ///   uEndOfGoodBasesSoFar.
1434     ///   A uChunkSize of 1 means no chunking for obvious math reasons.
1435     virtual void x_EdgeSeqMapGapAdjust(
1436         const CSeqVector & seqvec,
1437         TSignedSeqPos & in_out_uStartOfGoodBasesSoFar,
1438         const TSignedSeqPos uEndOfGoodBasesSoFar,
1439         const TSignedSeqPos iTrimDirection,
1440         const TSignedSeqPos uChunkSize );
1441 
1442     /// This holds the output of x_CountAmbigInRange
1443     struct NCBI_XOBJUTIL_EXPORT SAmbigCount : public CObject {
SAmbigCountCSequenceAmbigTrimmer::SAmbigCount1444         SAmbigCount(const TSignedSeqPos iTrimDirection) :
1445             num_ambig_bases(0),
1446             pos_after_last_gap(
1447                 (iTrimDirection > 0)
1448                 ? numeric_limits<TSignedSeqPos >::max()
1449                 : numeric_limits<TSignedSeqPos >::min() )
1450             { }
1451 
1452         /// the number of ambiguous bases found in the range
1453         /// supplied to x_CountAmbigInRange
1454         TSignedSeqPos num_ambig_bases;
1455         /// Inclusive. This is far past the end if the whole range
1456         /// is ambiguous.
1457         TSignedSeqPos pos_after_last_gap;
1458     };
1459 
1460     /// This counts the number of ambiguous bases in the range
1461     /// [leftmost_pos_to_check, rightmost_pos_to_check].  Note that
1462     /// rightmost_pos_to_check is inclusive.
1463     ///
1464     /// @param out_result
1465     ///   This will store the result.  Pass in a struct initialized
1466     ///   by the default constructor.
1467     /// @param seqvec
1468     ///   This is used to get the bases.
1469     /// @param iStartPosInclusive
1470     ///   This is where we start our count.
1471     /// @param iEndPosInclusive
1472     ///   This is where we end our count.  Note that it can be &lt; or
1473     ///   &gt; iStartPosInclusive, depending on trim direction.
1474     /// @param iTrimDirection
1475     ///   1 to trim from left to right, -1 to trim from right to left.
1476     virtual void x_CountAmbigInRange(
1477         SAmbigCount & out_result,
1478         const CSeqVector & seqvec,
1479         const TSignedSeqPos iStartPosInclusive_arg,
1480         const TSignedSeqPos iEndPosInclusive_arg,
1481         const TSignedSeqPos iTrimDirection );
1482 
1483     /// This returns the (inclusive) position at the beginning of the
1484     /// segment.
1485     ///
1486     /// @param segment
1487     ///   This is the segment we're trying to find the beginning of.
1488     /// @param iTrimDirection
1489     ///   This is which direction in which we're trimming.  The beginning
1490     ///   will be in the opposite direction.
1491     /// @return
1492     ///   This returns the (inclusive) position at the beginning of the given
1493     ///   segment. As always,
1494     ///   the definition of "beginning" depends on iTrimDirection.
x_SegmentGetBeginningInclusive(const CSeqMap_CI & segment,const TSignedSeqPos iTrimDirection)1495     TSignedSeqPos x_SegmentGetBeginningInclusive(
1496         const CSeqMap_CI & segment,
1497         const TSignedSeqPos iTrimDirection )
1498     {
1499         // symmetrical
1500         return x_SegmentGetEndInclusive( segment, -iTrimDirection );
1501     }
1502 
1503     /// This returns the (inclusive) position at the end of the
1504     /// segment currently at iStartPosInclusive_arg.
1505     ///
1506     /// @param segment
1507     ///   This is the segment we're trying to find the end of.
1508     /// @param iTrimDirection
1509     ///   This is which direction in which we're trimming.  The end
1510     ///   of the segment will be found by looking in that direction.
1511     /// @return
1512     ///   This returns the (inclusive) position at the end of the given
1513     ///   segment. The definition of "end" depends on iTrimDirection.
1514     TSignedSeqPos x_SegmentGetEndInclusive(
1515         const CSeqMap_CI & segment,
1516         const TSignedSeqPos iTrimDirection );
1517 
1518     /// Returns the "next" segment.  The definition of "next"
1519     /// depends on iTrimDirection
1520     ///
1521     /// @param in_out_segment
1522     ///   Caller gives the current CSeqMap_CI, which will be
1523     ///   returned adjusted in the trim direction.
1524     /// @param iTrimDirection
1525     ///   The direction in which to increment.  1 means normal incrementing
1526     ///   and -1 really means decrementing.
1527     /// @return
1528     ///   Reference to in_out_segment_it.
1529     CSeqMap_CI & x_SeqMapIterDoNext(
1530         CSeqMap_CI & in_out_segment_it,
1531         const TSignedSeqPos iTrimDirection );
1532 
1533     void x_SliceBioseq(
1534         TSignedSeqPos leftmost_good_base,
1535         TSignedSeqPos rightmost_good_base,
1536         CBioseq_Handle & bioseq_handle );
1537 
1538     // For each letter of the alphabet, returns whether or not it's
1539     // ambiguous. Index 0 is 'A', index 1 is 'B', etc.
1540     typedef bool TAmbigLookupTable[26];
1541     TAmbigLookupTable m_arrNucAmbigLookupTable;
1542     TAmbigLookupTable m_arrProtAmbigLookupTable;
1543 };
1544 
1545 /// This iterates over the runs of Ns of each sequence
1546 class NCBI_XOBJUTIL_EXPORT CBioseqGaps_CI : public CObject
1547 {
1548 public:
1549 
1550     /// The params that control the behavior of CBioseqGaps_CI
1551     struct NCBI_XOBJUTIL_EXPORT Params {
1552         /// Default ctor gives params which are usually reasonable.
ParamsCBioseqGaps_CI::Params1553         Params(void) : max_gap_len_to_ignore(10),
1554             max_num_gaps_per_seq( numeric_limits<TSeqPos>::max() ),
1555             max_num_seqs( numeric_limits<TSeqPos>::max() ),
1556             mol_filter(CSeq_inst::eMol_not_set),
1557             level_filter(CBioseq_CI::eLevel_All)
1558         {
1559         }
1560 
1561         /// We completely ignore any gaps we find that have this
1562         /// number of bases or fewer.
1563         TSeqPos                       max_gap_len_to_ignore;
1564         /// We only return up to this many gaps for each sequence
1565         TSeqPos                       max_num_gaps_per_seq;
1566         /// We only return gaps on up to this many sequences.
1567         TSeqPos                       max_num_seqs;
1568 
1569         /// CSeq_inst::eMol_na to only look at gaps on nucleotide
1570         /// sequences.  CSeq_inst::eMol_aa to only look at gaps
1571         /// on amino acid sequences.
1572         /// CSeq_inst::eMol_not_set to avoid filtering.
1573         CSeq_inst::EMol               mol_filter;
1574         /// Works like the level filter in CBioseq_CI
1575         CBioseq_CI::EBioseqLevelFlag  level_filter;
1576     };
1577 
1578     /// This constructor initializes the iterator.
1579     ///
1580     /// @param entry_h
1581     ///   This will iterate over all descendents of this entry.
1582     /// @param params
1583     ///   Controls the behavior of the iterator.  If not specified,
1584     ///   a reasonable default will be used.
1585     CBioseqGaps_CI(
1586         const CSeq_entry_Handle & entry_h,
1587         const Params & params = Params() );
1588 
1589     /// Move the iterator forward to next gap
1590     /// (or the end, if there are no more to return)
operator ++(void)1591     CBioseqGaps_CI & operator ++ (void) { x_Next(); return *this; }
1592 
1593     DECLARE_OPERATOR_BOOL(m_bioseq_CI);
1594 
1595     /// This indicates the state of the iterator right now.
1596     /// This structure is undefined if the iterator
1597     /// has reached the end, though the caller probably
1598     /// won't be able to access it anyway since x_GetCurrent
1599     /// will throw an exception in that case.
1600     struct NCBI_XOBJUTIL_EXPORT SCurrentGapInfo {
1601         /// Constructor initializes to state that it
1602         /// should be when the iterator first starts.
SCurrentGapInfoCBioseqGaps_CI::SCurrentGapInfo1603         SCurrentGapInfo(void) : num_seqs_seen_so_far(0),
1604             start_pos(0),
1605             length(0),
1606             num_gaps_seen_so_far_on_this_seq(0) { }
1607 
1608         /// The seq-id that this gap is on.
1609         CSeq_id_Handle  seq_id;
1610         /// This indicates how many sequences we've seen so far,
1611         /// including the one we're currently on.
1612         /// For example, 3 means we're on the 3rd sequence to
1613         /// contain a relevant gap.
1614         size_t          num_seqs_seen_so_far;
1615 
1616         /// the 0-based position at which the current gap starts
1617         /// on the current sequence.
1618         TSeqPos         start_pos;
1619         /// the length of the current gap
1620         TSeqPos         length;
1621         /// how many gaps we've seen so far on this sequence.
1622         /// For example, 2 would mean we're currently on
1623         /// the second relevant gap on this sequence.
1624         size_t          num_gaps_seen_so_far_on_this_seq;
1625     };
1626 
1627     /// Get information about the gap we're currently on.
operator *(void) const1628     const SCurrentGapInfo & operator*(void) const {
1629         return x_GetCurrent();
1630     }
1631 
1632     /// Get information about the gap we're currently on.
operator ->(void) const1633     const SCurrentGapInfo * operator ->(void) const {
1634         return &x_GetCurrent();
1635     }
1636 
1637 protected:
1638     /// This points to the bioseq we're currently on.
1639     /// When this iterator becomes invalid, that means this
1640     /// CBioseqGaps_CI is invalid, too.
1641     CBioseq_CI  m_bioseq_CI;
1642     /// This indicates information about the gap we're currently on.
1643     SCurrentGapInfo    m_infoOnCurrentGap;
1644     /// This holds the params the caller gave when this
1645     /// object was initially created.
1646     Params      m_Params;
1647 
1648     /// This gives info on the gap we're currently on.
1649     /// Throws if this iterator has finished.
1650     virtual const SCurrentGapInfo & x_GetCurrent(void) const;
1651 
1652     /// This moves this iterator to the next relevant gap.
1653     /// Throws if this iterator has finished.
1654     virtual void x_Next(void);
1655 
1656     /// This advances m_bioseq_CI although it
1657     /// has extra logic to terminate m_bioseq_CI
1658     /// if we've exceeded the number of bioseqs we can look for.
1659     virtual void x_NextBioseq(void);
1660 
1661     /// This indicates what happened when we tried to run
1662     /// x_FindNextGapOnBioseq.
1663     enum EFindNext {
1664         /// No more relevant gaps were found on this bioseq.  The other output
1665         /// parameters will be in an undefined state.
1666         eFindNext_NotFound,
1667         /// Another relevant gap was found, and the output parameters are
1668         /// filled in to represent information about it.
1669         eFindNext_Found
1670     };
1671 
1672     /// This finds the next gap on the bioseq, starting at given pos.
1673     ///
1674     /// @param bioseq_h
1675     ///   the bioseq on which we're seeking the next relevant gap.
1676     /// @param pos_to_start_looking
1677     ///   This is the position on bioseq_h to start looking for a
1678     ///   relevant gap.
1679     /// @param out_pos_of_gap
1680     ///   If a gap is found, this holds the 0-based position of the
1681     ///   start of that gap.  This is undefined if no gap was found.
1682     /// @param out_len_of_gap
1683     ///   If a gap is found, this holds the length of the
1684     ///   gap.  This is undefined if no gap was found.
1685     /// @return
1686     ///   This indicates whether or not a relevant gap was found.
1687     virtual EFindNext x_FindNextGapOnBioseq(
1688         const CBioseq_Handle & bioseq_h,
1689         const TSeqPos pos_to_start_looking,
1690         TSeqPos & out_pos_of_gap,
1691         TSeqPos & out_len_of_gap ) const;
1692 };
1693 
1694 /* @} */
1695 
1696 /// Reverse complement a Bioseq in place.
1697 /// If delta sequence, will also need to reverse order of segments
1698 void NCBI_XOBJUTIL_EXPORT ReverseComplement(CSeq_inst& seq, CScope* scope);
1699 
1700 
1701 END_SCOPE(objects)
1702 END_NCBI_SCOPE
1703 
1704 #endif  /* SEQUENCE__HPP */
1705