1 #ifndef SEQUENCE__HPP 2 #define SEQUENCE__HPP 3 4 /* $Id: sequence.hpp 626747 2021-03-03 18:46:39Z ivanov $ 5 * =========================================================================== 6 * 7 * PUBLIC DOMAIN NOTICE 8 * National Center for Biotechnology Information 9 * 10 * This software/database is a "United States Government Work" under the 11 * terms of the United States Copyright Act. It was written as part of 12 * the author's official duties as a United States Government employee and 13 * thus cannot be copyrighted. This software/database is freely available 14 * to the public for use. The National Library of Medicine and the U.S. 15 * Government have not placed any restriction on its use or reproduction. 16 * 17 * Although all reasonable efforts have been taken to ensure the accuracy 18 * and reliability of the software and data, the NLM and the U.S. 19 * Government do not and cannot warrant the performance or results that 20 * may be obtained by using this software or data. The NLM and the U.S. 21 * Government disclaim all warranties, express or implied, including 22 * warranties of performance, merchantability or fitness for any particular 23 * purpose. 24 * 25 * Please cite the author in any work or product based on this material. 26 * 27 * =========================================================================== 28 * 29 * Author: Clifford Clausen & Aaron Ucko 30 * 31 * File Description: 32 * Sequence utilities requiring CScope 33 * Obtains or constructs a sequence's title. (Corresponds to 34 * CreateDefLine in the C toolkit.) 35 */ 36 37 #include <corelib/ncbistd.hpp> 38 #include <serial/serial.hpp> 39 #include <serial/objistr.hpp> 40 #include <serial/objostr.hpp> 41 42 #include <objects/seqfeat/SeqFeatData.hpp> 43 #include <objects/seqfeat/Cdregion.hpp> 44 #include <objects/seqloc/Na_strand.hpp> 45 #include <objects/seqloc/Seq_interval.hpp> 46 #include <objects/seqloc/Seq_loc.hpp> 47 #include <util/strsearch.hpp> 48 #include <objects/seq/seq_id_mapper.hpp> 49 #include <util/range_coll.hpp> 50 #include <objmgr/feat_ci.hpp> 51 #include <objmgr/bioseq_ci.hpp> 52 #include <objmgr/scope.hpp> 53 #include <objmgr/util/seq_loc_util.hpp> 54 #include <objmgr/util/create_defline.hpp> 55 56 BEGIN_NCBI_SCOPE 57 BEGIN_SCOPE(objects) 58 59 // Forward declarations 60 class CSeq_id; 61 class CSeq_loc_mix; 62 class CSeq_point; 63 class CPacked_seqpnt; 64 class CBioseq_Handle; 65 class CSeq_loc_Mapper; 66 class CSeqVector; 67 class CCdregion; 68 class CSeq_feat; 69 class CSeq_entry; 70 class CSeq_entry_handle; 71 class CGenetic_code; 72 class CMolInfo; 73 class CSeq_gap; 74 75 BEGIN_SCOPE(sequence) 76 77 /** @addtogroup ObjUtilSequence 78 * 79 * @{ 80 */ 81 82 83 /** @name SeqIdConv 84 * Conversions between seq-id types 85 * @{ 86 */ 87 88 89 enum EAccessionVersion { 90 eWithAccessionVersion, ///< accession.version (when possible) 91 eWithoutAccessionVersion ///< accession only, even if version is available 92 }; 93 94 95 /// Retrieve a particular seq-id from a given bioseq handle. This uses 96 /// CSynonymsSet internally to decide which seq-id should be used. 97 enum EGetIdFlags { 98 eGetId_ForceGi = 0x0000, ///< return only a gi-based seq-id 99 eGetId_ForceAcc = 0x0001, ///< return only an accession based seq-id 100 eGetId_Best = 0x0002, ///< return the "best" gi (uses FindBestScore(), 101 ///< with CSeq_id::CalculateScore() as the score 102 ///< function 103 eGetId_HandleDefault = 0x0003, ///< returns the ID associated with a bioseq-handle 104 105 eGetId_Seq_id_Score = 0x0004, ///< use CSeq_id::Score() as the scoring function 106 eGetId_Seq_id_BestRank = 0x0005, ///< use CSeq_id::BestRank() as the scoring function 107 eGetId_Seq_id_WorstRank = 0x0006, ///< use CSeq_id::WorstRank() as the scoring function 108 eGetId_Seq_id_FastaAARank = 0x0007, ///< use CSeq_id::FastaAARank() as the scoring function 109 eGetId_Seq_id_FastaNARank = 0x0008, ///< use CSeq_id::FastaNARank() as the scoring function 110 111 ///< "canonical" here means "most specific"; this differs from "best" in 112 ///< that "best" is intended for display purposes 113 eGetId_Canonical = eGetId_Seq_id_BestRank, 114 115 eGetId_TypeMask = 0x00FF, ///< Mask for requested id type 116 117 /// Check if the seq-id is present in the scope 118 eGetId_VerifyId = 0x0100, 119 120 /// Throw exception on errors. If not set, an empty value is returned. 121 eGetId_ThrowOnError = 0x0200, 122 123 eGetId_Default = eGetId_Best | eGetId_ThrowOnError, 124 }; 125 typedef int EGetIdType; 126 127 128 /// Given an accession string retrieve the GI id. 129 /// If no GI was found returns 0 or throws CSeqIdFromHandleException 130 /// depending on the flags. 131 /// Id type in the flags is ignored, only VerifyId and ThrowOnError 132 /// flags are checked. 133 NCBI_XOBJUTIL_EXPORT 134 TGi GetGiForAccession(const string& acc, 135 CScope& scope, 136 EGetIdType flags = 0); 137 138 /// Retrieve the accession for a given GI. 139 /// If no accession was found returns an empty string or throws 140 /// CSeqIdFromHandleException depending on the flags. 141 /// Id type in the flags is ignored, only VerifyId and ThrowOnError 142 /// flags are checked. 143 NCBI_XOBJUTIL_EXPORT 144 string GetAccessionForGi(TGi gi, 145 CScope& scope, 146 EAccessionVersion use_version = eWithAccessionVersion, 147 EGetIdType flags = 0); 148 149 /// Given a Seq-id retrieve the corresponding GI. 150 /// If no GI was found returns 0 or throws CSeqIdFromHandleException 151 /// depending on the flags. 152 /// Id type in the flags is ignored, only VerifyId and ThrowOnError 153 /// flags are checked. 154 NCBI_XOBJUTIL_EXPORT 155 TGi GetGiForId(const objects::CSeq_id& id, 156 CScope& scope, 157 EGetIdType flags = 0); 158 159 /// Retrieve the accession string for a Seq-id. 160 /// If no accession was found returns an empty string or throws 161 /// CSeqIdFromHandleException depending on the flags. 162 /// Id type in the flags is ignored, only VerifyId and ThrowOnError 163 /// flags are checked. 164 NCBI_XOBJUTIL_EXPORT 165 string GetAccessionForId(const objects::CSeq_id& id, 166 CScope& scope, 167 EAccessionVersion use_version = eWithAccessionVersion, 168 EGetIdType flags = 0); 169 170 /// Return a selected ID type for a given bioseq handle. This function 171 /// will try to use the most efficient method possible to determine which 172 /// ID fulfills the requested parameter. This version will call 173 /// sequence::GetId() with the bioseq handle's seq-id. 174 /// 175 /// @param id Source id to evaluate 176 /// @param scope Scope for seq-id resolution. 177 /// @param type Type of ID to return 178 /// @return A requested seq-id. 179 /// Depending on the flags set in 'type' this function can verify 180 /// if the requested ID exists in the scope and throw 181 /// CSeqIdFromHandleException if the request cannot be satisfied. 182 NCBI_XOBJUTIL_EXPORT 183 CSeq_id_Handle GetId(const CBioseq_Handle& handle, 184 EGetIdType type = eGetId_Default); 185 186 /// Return a selected ID type for a seq-id. This function 187 /// will try to use the most efficient method possible to determine which 188 /// ID fulfills the requested parameter. The following logic is used: 189 /// 190 /// - For seq-id type eGetId_HandleDefault, the original seq-id is returned. 191 /// This satisfies the condition of returning a bioseq-handle's seq-id if 192 /// sequence::GetId() is applied to a CBioseq_Handle. 193 /// 194 /// - For seq-id type eGetId_ForceAcc, the returned set of seq-ids will first 195 /// be evaluated for a "best" id (which, given seq-id scoring, will be 196 /// a textseq-id if one exists). If the returned best ID is a textseq-id, 197 /// this id will be returned. Otherwise, an exception is thrown or an 198 /// empty handle returned. 199 /// 200 /// - For seq-id type eGetId_ForceGi, the returned set of IDs is scanned for 201 /// an ID of type gi. If this is found, it is returned; otherwise, an 202 /// exception is thrown or an empty handle returned. If the supplied ID is 203 /// already a gi and eGetId_VerifyId flag is not set, no work is done. 204 /// 205 /// @param id Source id to evaluate 206 /// @param scope Scope for seq-id resolution. 207 /// @param type Type of ID to return 208 /// @return A requested seq-id. 209 /// Depending on the flags set in 'type' this function can verify 210 /// if the requested ID exists in the scope and throw 211 /// CSeqIdFromHandleException if the request cannot be satisfied. 212 NCBI_XOBJUTIL_EXPORT 213 CSeq_id_Handle GetId(const CSeq_id& id, CScope& scope, 214 EGetIdType type = eGetId_Default); 215 216 /// Return a selected ID type for a seq-id handle. 217 /// Arguments (except 'id') and behavior is the same as of 218 /// GetId(const CSeq_id& id, ...). 219 NCBI_XOBJUTIL_EXPORT 220 CSeq_id_Handle GetId(const CSeq_id_Handle& id, CScope& scope, 221 EGetIdType type = eGetId_Default); 222 223 /// Return a selected ID type from a set of Seq-ids 224 /// Arguments (except 'id') and behavior is the same as of 225 /// GetId(const CSeq_id& id, ...). 226 NCBI_XOBJUTIL_EXPORT 227 CSeq_id_Handle GetId(const CBioseq::TId& id, 228 EGetIdType type = eGetId_Default); 229 230 /// Return a selected ID type from a Bioseq 231 /// Arguments (except 'seq') and behavior is the same as of 232 /// GetId(const CBioseq_Handle& seq, ...). 233 NCBI_XOBJUTIL_EXPORT 234 CSeq_id_Handle GetId(const CBioseq& seq, 235 EGetIdType type = eGetId_Default); 236 237 /* @} */ 238 239 240 /** @name FindLatestSequence 241 * Walk the replace history to find the latest revision of a sequence 242 * @{ 243 */ 244 245 /// Given a seq-id check its replace history and try to find the latest 246 /// revision. The function stops and returns NULL if it detects some 247 /// strange conditions like an infinite recursion. If the bioseq 248 /// contains no history information, the original id is returned. 249 NCBI_XOBJUTIL_EXPORT 250 CConstRef<CSeq_id> FindLatestSequence(const CSeq_id& id, CScope& scope); 251 252 NCBI_XOBJUTIL_EXPORT 253 CSeq_id_Handle FindLatestSequence(const CSeq_id_Handle& idh, CScope& scope); 254 255 /// Check replace history up to the specified date. Returns the latest 256 /// bioseq up to the date or the original id if the bioseq contains no 257 /// history or is already newer than the specified date. 258 NCBI_XOBJUTIL_EXPORT 259 CConstRef<CSeq_id> FindLatestSequence(const CSeq_id& id, 260 CScope& scope, 261 const CTime& tlim); 262 263 NCBI_XOBJUTIL_EXPORT 264 CSeq_id_Handle FindLatestSequence(const CSeq_id_Handle& idh, 265 CScope& scope, 266 const CTime& tlim); 267 268 /* @} */ 269 270 271 /** @name GetTitle 272 * Get sequence's title (used in various flat-file formats.) 273 * Deprecated in favor of CDeflineGenerator. 274 * @{ 275 */ 276 277 /// This function is here rather than in CBioseq because it may need 278 /// to inspect other sequences. The reconstruct flag indicates that it 279 /// should ignore any existing title Seqdesc. 280 enum EGetTitleFlags { 281 fGetTitle_Reconstruct = 0x1, ///< ignore existing title Seqdesc. 282 fGetTitle_Organism = 0x2, ///< append [organism] 283 fGetTitle_AllProteins = 0x4, ///< normally just names the first 284 fGetTitle_NoExpensive = 0x8 ///< skip potential expensive operations 285 }; 286 typedef int TGetTitleFlags; 287 288 NCBI_DEPRECATED NCBI_XOBJUTIL_EXPORT 289 string GetTitle(const CBioseq_Handle& hnd, TGetTitleFlags flags = 0); 290 NCBI_DEPRECATED NCBI_XOBJUTIL_EXPORT 291 bool GetTitle(const CBioseq& seq, string* title_ptr, TGetTitleFlags flags = 0); 292 293 /* @} */ 294 295 296 /** @name Source and Product 297 * Mapping locations through features 298 * @{ 299 */ 300 301 enum ES2PFlags { 302 fS2P_NoMerge = 0x1, ///< don't merge adjacent intervals on the product 303 fS2P_AllowTer = 0x2 ///< map the termination codon as a legal location 304 }; 305 typedef int TS2PFlags; // binary OR of ES2PFlags 306 307 NCBI_XOBJUTIL_EXPORT 308 CRef<CSeq_loc> SourceToProduct(const CSeq_feat& feat, 309 const CSeq_loc& source_loc, TS2PFlags flags = 0, 310 CScope* scope = 0, int* frame = 0); 311 312 enum EP2SFlags { 313 fP2S_Extend = 0x1 ///< if hitting ends, extend to include partial codons 314 }; 315 typedef int TP2SFlags; // binary OR of ES2PFlags 316 317 NCBI_XOBJUTIL_EXPORT 318 CRef<CSeq_loc> ProductToSource(const CSeq_feat& feat, const CSeq_loc& prod_loc, 319 TP2SFlags flags = 0, CScope* scope = 0); 320 321 /* @} */ 322 323 324 /** @name Overlapping 325 * Searching for features 326 * @{ 327 */ 328 329 enum EBestFeatOpts { 330 /// requires explicit association, rather than analysis based on overlaps 331 fBestFeat_StrictMatch = 0x01, 332 333 /// don't perform any expensive tests, such as ones that require fetching 334 /// additional sequences 335 fBestFeat_NoExpensive = 0x02, 336 337 /// favor longer features over shorter features 338 fBestFeat_FavorLonger = 0x04, 339 340 /// Pay no attention to strands when finding the best feat. This may be 341 /// useful for, e.g., trans-spliced genes. 342 fBestFeat_IgnoreStrand = 0x08, 343 344 /// default options: do everything 345 fBestFeat_Defaults = 0 346 }; 347 typedef int TBestFeatOpts; 348 349 350 /// Storage for features and scores. 351 typedef pair<Int8, CConstRef<CSeq_feat> > TFeatScore; 352 typedef vector<TFeatScore> TFeatScores; 353 354 // To avoid putting custom logic into the GetOverlappingFeatures 355 // function, we allow plugins 356 class CGetOverlappingFeaturesPlugin { 357 public: ~CGetOverlappingFeaturesPlugin()358 virtual ~CGetOverlappingFeaturesPlugin() {} 359 virtual void processSAnnotSelector( 360 SAnnotSelector &sel ) = 0; 361 362 virtual void setUpFeatureIterator ( 363 CBioseq_Handle &bioseq_handle, 364 auto_ptr<CFeat_CI> &feat_ci, 365 TSeqPos circular_length , 366 CRange<TSeqPos> &range, 367 const CSeq_loc& loc, 368 SAnnotSelector &sel, 369 CScope &scope, 370 ENa_strand &strand ) = 0; 371 372 virtual void processLoc( 373 CBioseq_Handle &bioseq_handle, 374 CRef<CSeq_loc> &loc, 375 TSeqPos circular_length ) = 0; 376 377 virtual void processMainLoop( 378 bool &shouldContinueToNextIteration, 379 CRef<CSeq_loc> &cleaned_loc_this_iteration, 380 CRef<CSeq_loc> &candidate_feat_loc, 381 EOverlapType &overlap_type_this_iteration, 382 bool &revert_locations_this_iteration, 383 CBioseq_Handle &bioseq_handle, 384 const CMappedFeat &feat, 385 TSeqPos circular_length, 386 SAnnotSelector::EOverlapType annot_overlap_type ) = 0; 387 388 virtual void postProcessDiffAmount( 389 Int8 &cur_diff, 390 CRef<CSeq_loc> &cleaned_loc_this_iteration, 391 CRef<CSeq_loc> &candidate_feat_loc, 392 CScope &scope, 393 SAnnotSelector &sel, 394 TSeqPos circular_length ) = 0; 395 }; 396 397 /// Find all features overlapping the location. Features and corresponding 398 /// scores are stored in the 'feats' vector. The scores are calculated as 399 /// difference between the input location and each feature's location. 400 /// NOTE: 'overlap_type' defines how the location must be related to the feature. 401 /// For eOverlap_Subset, eOverlap_SubsetRev, eOverlap_CheckIntervals, 402 /// eOverlap_CheckIntRev and eOverlap_Interval the relationship is 403 /// reversed. E.g. with eOverlap_Contains, the location will contain 404 /// the feature, but with eOverlap_Subset the feature will be defined 405 /// on a subset of the location. 406 NCBI_XOBJUTIL_EXPORT 407 void GetOverlappingFeatures(const CSeq_loc& loc, 408 CSeqFeatData::E_Choice feat_type, 409 CSeqFeatData::ESubtype feat_subtype, 410 EOverlapType overlap_type, 411 TFeatScores& feats, 412 CScope& scope, 413 const TBestFeatOpts opts = 0, 414 CGetOverlappingFeaturesPlugin *plugin = NULL ); 415 416 417 /// See the note above on 'overlap_type' meaning. 418 NCBI_XOBJUTIL_EXPORT 419 CConstRef<CSeq_feat> GetBestOverlappingFeat(const CSeq_loc& loc, 420 CSeqFeatData::E_Choice feat_type, 421 EOverlapType overlap_type, 422 CScope& scope, 423 TBestFeatOpts opts = fBestFeat_Defaults, 424 CGetOverlappingFeaturesPlugin *plugin = NULL ); 425 /// See the note above on 'overlap_type' meaning. 426 NCBI_XOBJUTIL_EXPORT 427 CConstRef<CSeq_feat> GetBestOverlappingFeat(const CSeq_loc& loc, 428 CSeqFeatData::ESubtype feat_type, 429 EOverlapType overlap_type, 430 CScope& scope, 431 TBestFeatOpts opts = fBestFeat_Defaults, 432 CGetOverlappingFeaturesPlugin *plugin = NULL ); 433 434 NCBI_XOBJUTIL_EXPORT 435 CConstRef<CSeq_feat> 436 GetBestGeneForMrna(const CSeq_feat& mrna_feat, 437 CScope& scope, 438 TBestFeatOpts opts = fBestFeat_Defaults, 439 CGetOverlappingFeaturesPlugin *plugin = NULL ); 440 441 NCBI_XOBJUTIL_EXPORT 442 CConstRef<CSeq_feat> 443 GetBestGeneForCds(const CSeq_feat& cds_feat, 444 CScope& scope, 445 TBestFeatOpts opts = fBestFeat_Defaults, 446 CGetOverlappingFeaturesPlugin *plugin = NULL ); 447 448 NCBI_XOBJUTIL_EXPORT 449 CConstRef<CSeq_feat> 450 GetBestMrnaForCds(const CSeq_feat& cds_feat, 451 CScope& scope, 452 TBestFeatOpts opts = fBestFeat_Defaults, 453 CGetOverlappingFeaturesPlugin *plugin = NULL ); 454 455 NCBI_XOBJUTIL_EXPORT 456 CConstRef<CSeq_feat> 457 GetBestCdsForMrna(const CSeq_feat& mrna_feat, 458 CScope& scope, 459 TBestFeatOpts opts = fBestFeat_Defaults, 460 CGetOverlappingFeaturesPlugin *plugin = NULL ); 461 462 NCBI_XOBJUTIL_EXPORT 463 void GetMrnasForGene(const CSeq_feat& gene_feat, 464 CScope& scope, 465 list< CConstRef<CSeq_feat> >& mrna_feats, 466 TBestFeatOpts opts = fBestFeat_Defaults, 467 CGetOverlappingFeaturesPlugin *plugin = NULL ); 468 469 NCBI_XOBJUTIL_EXPORT 470 void GetCdssForGene(const CSeq_feat& gene_feat, 471 CScope& scope, 472 list< CConstRef<CSeq_feat> >& cds_feats, 473 TBestFeatOpts opts = fBestFeat_Defaults, 474 CGetOverlappingFeaturesPlugin *plugin = NULL ); 475 476 ///////////////////////////////////////////////////////////////////////////// 477 // Versions of functions with lookup by feature id 478 NCBI_XOBJUTIL_EXPORT 479 CConstRef<CSeq_feat> 480 GetBestGeneForMrna(const CSeq_feat& mrna_feat, 481 const CTSE_Handle& tse, 482 TBestFeatOpts opts = fBestFeat_Defaults, 483 CGetOverlappingFeaturesPlugin *plugin = NULL ); 484 485 NCBI_XOBJUTIL_EXPORT 486 CConstRef<CSeq_feat> 487 GetBestGeneForCds(const CSeq_feat& cds_feat, 488 const CTSE_Handle& tse, 489 TBestFeatOpts opts = fBestFeat_Defaults, 490 CGetOverlappingFeaturesPlugin *plugin = NULL ); 491 492 NCBI_XOBJUTIL_EXPORT 493 CConstRef<CSeq_feat> 494 GetBestMrnaForCds(const CSeq_feat& cds_feat, 495 const CTSE_Handle& tse, 496 TBestFeatOpts opts = fBestFeat_Defaults, 497 CGetOverlappingFeaturesPlugin *plugin = NULL ); 498 499 NCBI_XOBJUTIL_EXPORT 500 CConstRef<CSeq_feat> 501 GetBestCdsForMrna(const CSeq_feat& mrna_feat, 502 const CTSE_Handle& tse, 503 TBestFeatOpts opts = fBestFeat_Defaults, 504 CGetOverlappingFeaturesPlugin *plugin = NULL ); 505 506 NCBI_XOBJUTIL_EXPORT 507 void GetMrnasForGene(const CSeq_feat& gene_feat, 508 const CTSE_Handle& tse, 509 list< CConstRef<CSeq_feat> >& mrna_feats, 510 TBestFeatOpts opts = fBestFeat_Defaults, 511 CGetOverlappingFeaturesPlugin *plugin = NULL ); 512 513 NCBI_XOBJUTIL_EXPORT 514 void GetCdssForGene(const CSeq_feat& gene_feat, 515 const CTSE_Handle& tse, 516 list< CConstRef<CSeq_feat> >& cds_feats, 517 TBestFeatOpts opts = fBestFeat_Defaults, 518 CGetOverlappingFeaturesPlugin *plugin = NULL ); 519 520 NCBI_XOBJUTIL_EXPORT 521 CConstRef<CSeq_feat> 522 GetBestOverlappingFeat(const CSeq_feat& feat, 523 CSeqFeatData::E_Choice feat_type, 524 sequence::EOverlapType overlap_type, 525 CScope& scope, 526 TBestFeatOpts opts = fBestFeat_Defaults, 527 CGetOverlappingFeaturesPlugin *plugin = NULL ); 528 529 NCBI_XOBJUTIL_EXPORT 530 CConstRef<CSeq_feat> 531 GetBestOverlappingFeat(const CSeq_feat& feat, 532 CSeqFeatData::ESubtype feat_type, 533 sequence::EOverlapType overlap_type, 534 CScope& scope, 535 TBestFeatOpts opts = fBestFeat_Defaults, 536 CGetOverlappingFeaturesPlugin *plugin = NULL ); 537 538 NCBI_XOBJUTIL_EXPORT 539 CConstRef<CSeq_feat> GetmRNAforCDS(const CSeq_feat& cds, CScope& scope); 540 541 542 /// Get the best overlapping feature for a SNP (variation) feature 543 /// @param snp_feat 544 /// SNP feature object 545 /// @param type 546 /// type of overlapping feature 547 /// @param scope 548 /// @param search_both_strands 549 /// search is performed on both strands, starting with the one specified 550 /// by the feature's location. 551 /// @return 552 /// the overlapping fetaure or NULL if not found 553 NCBI_XOBJUTIL_EXPORT 554 CConstRef<CSeq_feat> GetBestOverlapForSNP(const CSeq_feat& snp_feat, 555 CSeqFeatData::E_Choice type, 556 CScope& scope, 557 bool search_both_strands = true); 558 559 /// Get the best overlapping feature for a SNP (variation) 560 /// @param snp_feat 561 /// SNP feature object 562 /// @param subtype 563 /// subtype of overlapping feature 564 /// @param scope 565 /// @param search_both_strands 566 /// search is performed on both strands, starting with the one specified 567 /// by the feature's location. 568 /// @return 569 /// the overlapping fetaure or NULL if not found 570 NCBI_XOBJUTIL_EXPORT 571 CConstRef<CSeq_feat> GetBestOverlapForSNP(const CSeq_feat& snp_feat, 572 CSeqFeatData::ESubtype subtype, 573 CScope& scope, 574 bool search_both_strands = true); 575 576 577 /// Convenience functions for popular overlapping types 578 enum ETransSplicing { 579 eTransSplicing_No = 0, 580 eTransSplicing_Yes, 581 eTransSplicing_Auto ///< Ignore overlap strand if the source location 582 ///< has mixed/both strand. 583 }; 584 NCBI_XOBJUTIL_EXPORT 585 CConstRef<CSeq_feat> GetOverlappingGene( 586 const CSeq_loc& loc, CScope& scope, 587 ETransSplicing eTransSplicing = eTransSplicing_Auto); 588 589 590 /// Finds gene for feature, but obeys SeqFeatXref directives 591 NCBI_XOBJUTIL_EXPORT 592 CConstRef<CSeq_feat> GetGeneForFeature(const CSeq_feat& feat, CScope& scope); 593 594 /// Determines whether given feature is pseudo, using gene associated with feature 595 /// if necessary 596 /// Checks to see if a feature is pseudo. Looks for pseudo flag set on feature, 597 /// looks for pseudogene qualifier on feature, performs same checks for gene 598 /// associated with feature 599 /// @param feat Seq-feat to check 600 /// @param scope CScope to use when looking for associated gene 601 /// @return Boolean return value indicates whether any of the "pseudo" markers are found 602 NCBI_XOBJUTIL_EXPORT 603 bool IsPseudo(const CSeq_feat& feat, CScope& scope); 604 605 606 NCBI_XOBJUTIL_EXPORT 607 CConstRef<CSeq_feat> GetOverlappingmRNA(const CSeq_loc& loc, CScope& scope); 608 609 610 NCBI_XOBJUTIL_EXPORT 611 CConstRef<CSeq_feat> GetOverlappingCDS(const CSeq_loc& loc, CScope& scope); 612 613 614 NCBI_XOBJUTIL_EXPORT 615 CConstRef<CSeq_feat> GetOverlappingPub(const CSeq_loc& loc, CScope& scope); 616 617 618 NCBI_XOBJUTIL_EXPORT 619 CConstRef<CSeq_feat> GetOverlappingSource(const CSeq_loc& loc, CScope& scope); 620 621 622 NCBI_XOBJUTIL_EXPORT 623 CConstRef<CSeq_feat> GetOverlappingOperon(const CSeq_loc& loc, CScope& scope); 624 625 626 /// Get the encoding CDS feature of a given protein sequence. 627 NCBI_XOBJUTIL_EXPORT 628 const CSeq_feat* GetCDSForProduct(const CBioseq& product, CScope* scope); 629 NCBI_XOBJUTIL_EXPORT 630 const CSeq_feat* GetCDSForProduct(const CBioseq_Handle& product); 631 NCBI_XOBJUTIL_EXPORT 632 CMappedFeat GetMappedCDSForProduct(const CBioseq_Handle& product); 633 634 635 /// Get the mature peptide feature of a protein 636 NCBI_XOBJUTIL_EXPORT 637 const CSeq_feat* GetPROTForProduct(const CBioseq& product, CScope* scope); 638 NCBI_XOBJUTIL_EXPORT 639 const CSeq_feat* GetPROTForProduct(const CBioseq_Handle& product); 640 641 642 /// Get the encoding mRNA feature of a given mRNA (cDNA) bioseq. 643 NCBI_XOBJUTIL_EXPORT 644 const CSeq_feat* GetmRNAForProduct(const CBioseq& product, CScope* scope); 645 NCBI_XOBJUTIL_EXPORT 646 const CSeq_feat* GetmRNAForProduct(const CBioseq_Handle& product); 647 NCBI_XOBJUTIL_EXPORT 648 CMappedFeat GetMappedmRNAForProduct(const CBioseq_Handle& product); 649 650 /* @} */ 651 652 653 /** @name Sequences 654 * Searching for bioseqs etc. 655 * @{ 656 */ 657 658 /// Get the encoding nucleotide sequnce of a protein. 659 NCBI_XOBJUTIL_EXPORT 660 const CBioseq* GetNucleotideParent(const CBioseq& product, CScope* scope); 661 NCBI_XOBJUTIL_EXPORT 662 CBioseq_Handle GetNucleotideParent(const CBioseq_Handle& product); 663 664 /// Get the parent bioseq for a part of a segmented bioseq 665 NCBI_XOBJUTIL_EXPORT 666 CBioseq_Handle GetParentForPart(const CBioseq_Handle& part); 667 668 /// Return the org-ref associated with a given sequence. This will throw 669 /// a CException if there is no org-ref associated with the sequence 670 NCBI_XOBJUTIL_EXPORT 671 const COrg_ref& GetOrg_ref(const CBioseq_Handle& handle); 672 /// Return the pointer to org-ref associated with a given sequence 673 /// or null if there is no org-ref associated with the sequence 674 NCBI_XOBJUTIL_EXPORT 675 const COrg_ref* GetOrg_refOrNull(const CBioseq_Handle& handle); 676 677 /// return the tax-id associated with a given sequence. This will return 0 678 /// if no tax-id can be found. 679 NCBI_XOBJUTIL_EXPORT 680 TTaxId GetTaxId(const CBioseq_Handle& handle); 681 682 /// Retrieve the MolInfo object for a given bioseq handle. If the supplied 683 /// sequence does not have a MolInfo associated with it, this will return NULL 684 NCBI_XOBJUTIL_EXPORT 685 const CMolInfo* GetMolInfo(const CBioseq& bioseq); 686 687 NCBI_XOBJUTIL_EXPORT 688 const CMolInfo* GetMolInfo(const CBioseq_Handle& handle); 689 690 /// Retrieve the BioSource object for a given bioseq handle. If the supplied 691 /// sequence does not have a MolInfo associated with it, this will return NULL 692 NCBI_XOBJUTIL_EXPORT 693 const CBioSource* GetBioSource(const CBioseq& bioseq); 694 695 NCBI_XOBJUTIL_EXPORT 696 const CBioSource* GetBioSource(const CBioseq_Handle& handle); 697 698 699 /// Retrieve the Bioseq Handle from a location. 700 /// location refers to a single bioseq: 701 /// return the bioseq 702 /// location referes to multiple bioseqs: 703 /// if parts of a segmentd bioseq, returns the segmentd bioseq. 704 /// otherwise, return the first bioseq that could be found 705 /// (first localy then, if flag is eGetBioseq_All, remote) 706 NCBI_XOBJUTIL_EXPORT 707 CBioseq_Handle GetBioseqFromSeqLoc(const CSeq_loc& loc, CScope& scope, 708 CScope::EGetBioseqFlag flag = CScope::eGetBioseq_Loaded); 709 710 711 /// Return protein name from corresponding Prot-ref feature. 712 /// Throws exception if the sequence is not a protein, 713 /// or if there is no unambiguously best Prot-ref feature, 714 /// or if the feature doesn't return non-empty label. 715 NCBI_XOBJUTIL_EXPORT 716 string GetProteinName(const CBioseq_Handle& seq); 717 718 719 /* @} */ 720 721 722 class NCBI_XOBJUTIL_EXPORT 723 CSeqIdFromHandleException : EXCEPTION_VIRTUAL_BASE public CException 724 { 725 public: 726 // Enumerated list of document management errors 727 enum EErrCode { 728 eNoSynonyms, 729 eRequestedIdNotFound 730 }; 731 732 // Translate the specific error code into a string representations of 733 // that error code. 734 virtual const char* GetErrCodeString(void) const override; 735 736 NCBI_EXCEPTION_DEFAULT(CSeqIdFromHandleException, CException); 737 }; 738 739 740 END_SCOPE(sequence) 741 742 743 /// FASTA-format output; see also ReadFasta in <objtools/readers/fasta.hpp> 744 745 class NCBI_XOBJUTIL_EXPORT CFastaOstream { 746 public: 747 enum EFlags : long { 748 fAssembleParts = 1 << 0, ///< assemble FAR delta sequences; on by dflt 749 fInstantiateGaps = 1 << 1, ///< honor specifed gap mode; on by default 750 fSuppressRange = 1 << 2, ///< never include location details in defline 751 fReverseStrand = 1 << 3, ///< flip the (implicit) location 752 fKeepGTSigns = 1 << 4, ///< don't convert '>' to '_' in title 753 fMapMasksUp = 1 << 5, ///< honor masks specified at a lower level 754 fMapMasksDown = 1 << 6, ///< honor masks specified at a higher level 755 fNoExpensiveOps = 1 << 7, ///< don't try too hard to find titles 756 fShowModifiers = 1 << 8, ///< show key-value pair modifiers (e.g. "[organism=Homo sapiens]") 757 fNoDupCheck = 1 << 9, ///< skip check for duplicate sequence IDs 758 fShowGapModifiers = 1 << 10, ///< show gap key-value pair modifiers (e.g. "[linkage-evidence=map;strobe]"). Only works if gap mode is eGM_count. 759 fKeepUnknGapNomLen = 1 << 11, ///< Keep unknown gap's nominal length. That is, when a gap has an unknown length but nominal length, use that instead of just making it 100. 760 fShowGapsOfSizeZero = 1 << 12, ///< Use this to show gaps of size zero as a lone hyphen at the end of a line. 761 fEnableGI = 1 << 13, ///< Use this flag to enable GI output in the defline 762 fHideGenBankPrefix = 1 << 14, ///< Hide gb| prefix for genbank only seq_id's 763 fHTMLEncode = 1 << 15, ///< Encode the title string for HTML display 764 fIgnoreOriginalID = 1 << 16, ///< Disregard original ID when constructing defline 765 // historically misnamed as eFlagName 766 eAssembleParts = fAssembleParts, 767 eInstantiateGaps = fInstantiateGaps, 768 fUseAutoDef = 1 << 17, ///< Disregard original ID when constructing defline 769 fBaseFirstUnused = 1 << 18, ///< first avalailabe for derived classes 770 fDoNotUseAutoDef = 1 << 19 771 }; 772 typedef long TFlags; ///< binary OR of EFlags 773 774 /// How to represent gaps with fInstantiateGaps enabled, as it is 775 /// by default. (Disabling fInstantiateGaps is equivalent to 776 /// requesting eGM_one_dash.) 777 enum EGapMode { 778 eGM_one_dash, ///< A single dash, followed by a line break. 779 eGM_dashes, ///< Multiple inline dashes. 780 eGM_letters, ///< Multiple inline Ns or Xs as appropriate (default). 781 eGM_count ///< >?N or >?unk100, as appropriate. 782 }; 783 784 CFastaOstream(CNcbiOstream& out); 785 virtual ~CFastaOstream(); 786 787 /// Unspecified locations designate complete sequences; 788 /// non-empty custom titles override the usual title determination logic 789 virtual void Write (const CSeq_entry_Handle& handle, 790 const CSeq_loc* location = 0); 791 virtual void Write (const CBioseq_Handle& handle, 792 const CSeq_loc* location = 0, 793 const string& custom_title = kEmptyStr); 794 virtual void WriteTitle (const CBioseq_Handle& handle, 795 const CSeq_loc* location = 0, 796 const string& custom_title = kEmptyStr); 797 virtual void WriteSequence(const CBioseq_Handle& handle, 798 const CSeq_loc* location = 0, 799 CSeq_loc::EOpFlags merge_flags=CSeq_loc::fMerge_AbuttingOnly); 800 801 /// These versions may set up a temporary object manager scope 802 /// In the common case of a raw bioseq, no scope is needed 803 void Write(const CSeq_entry& entry, const CSeq_loc* location = 0, 804 bool no_scope = false); 805 void Write(const CBioseq& seq, const CSeq_loc* location = 0, 806 bool no_scope = false, const string& custom_title = kEmptyStr); 807 void WriteTitle(const CBioseq& seq, const CSeq_loc* location = 0, 808 bool no_scope=false, const string& custom_title=kEmptyStr); 809 810 /// Used only by Write(CSeq_entry[_Handle], ...); permissive by default SkipBioseq(const CBioseq &)811 virtual bool SkipBioseq(const CBioseq& /* seq */) { return false; } 812 /// Delegates to the non-handle version by default for 813 /// compatibility with older code; newer code should override this 814 /// version. SkipBioseq(const CBioseq_Handle & handle)815 virtual bool SkipBioseq(const CBioseq_Handle& handle) 816 { return SkipBioseq(*handle.GetCompleteBioseq()); } 817 818 /// Which residues to mask out in subsequent output. 819 /// These do NOT automatically reset between calls to Write; 820 /// you must do so yourself by setting them to null. 821 enum EMaskType { 822 eSoftMask = 1, ///< write as lowercase rather than uppercase 823 eHardMask = 2 ///< write as N for nucleotides, X for peptides 824 }; 825 CConstRef<CSeq_loc> GetMask(EMaskType type) const; 826 void SetMask(EMaskType type, CConstRef<CSeq_loc> location); 827 828 /// Other parameters... GetWidth(void) const829 TSeqPos GetWidth (void) const { return m_Width; } 830 void SetWidth (TSeqPos width); GetAllFlags(void) const831 TFlags GetAllFlags(void) const { return m_Flags; } SetAllFlags(TFlags flags)832 void SetAllFlags(TFlags flags) { m_Flags = flags; } SetFlag(EFlags flag)833 void SetFlag (EFlags flag) { m_Flags |= flag; } ResetFlag(EFlags flag)834 void ResetFlag (EFlags flag) { m_Flags &= ~flag; } SetGapMode(EGapMode mode)835 void SetGapMode (EGapMode mode) { m_GapMode = mode; } GetGapMode(void) const836 EGapMode GetGapMode(void) const { return m_GapMode; } 837 838 /// This indicates the text of the modifiers of a gap. 839 struct NCBI_XOBJUTIL_EXPORT SGapModText { 840 /// String representing the gap type. 841 /// Examples: "short_arm", "telomere", etc. 842 string gap_type; 843 /// A vector representing the linkage-evidences of the gap. 844 /// Example linkage-evidences: "align genus", "within clone", etc. 845 vector<string> gap_linkage_evidences; 846 847 // more fields may be added in the future. 848 849 /// This will write the modifiers in 850 /// FASTA format. (example: "[gap-type=short_arm]") 851 void WriteAllModsAsFasta( CNcbiOstream & out ) const; 852 }; 853 854 /// Given a CSeq_gap object, this outputs the 855 /// Gap information 856 /// 857 /// @param seq_gap 858 /// This is the seq_gap information we're using to figure out 859 /// the gap mod text 860 /// @param out_gap_mod_text 861 /// This holds the result. 862 static void 863 GetGapModText( 864 const CSeq_gap & seq_gap, 865 SGapModText & out_gap_mod_text ); 866 867 protected: 868 CNcbiOstream& m_Out; 869 auto_ptr<sequence::CDeflineGenerator> m_Gen; 870 871 virtual void x_WriteSeqIds ( const CBioseq& bioseq, 872 const CSeq_loc* location); 873 virtual void x_WriteAsFasta ( const CBioseq& bioseq ); 874 virtual void x_GetBestId(CConstRef<CSeq_id>& gi_id, CConstRef<CSeq_id>& best_id, bool& hide_prefix, const CBioseq& bioseq); 875 //virtual void x_WriteModifiers ( const CBioseq_Handle & handle ); 876 virtual void x_WriteSeqTitle( const CBioseq_Handle & handle, 877 const string& custom_title); x_WriteBuffer(const char * buf,unsigned int count)878 virtual void x_WriteBuffer( const char* buf, unsigned int count) { m_Out.write(buf, count); }; 879 880 TFlags m_Flags; 881 882 private: 883 CConstRef<CSeq_loc> m_SoftMask; 884 CConstRef<CSeq_loc> m_HardMask; 885 TSeqPos m_Width; 886 EGapMode m_GapMode; 887 TSeq_id_HandleSet m_PreviousWholeIds; 888 // avoid recomputing for every sequence 889 typedef AutoPtr<char, ArrayDeleter<char> > TCharBuf; 890 TCharBuf m_Dashes, m_LC_Ns, m_LC_Xs, m_UC_Ns, m_UC_Xs; 891 892 sequence::CDeflineGenerator::TUserFlags x_GetTitleFlags(void) const; 893 894 //void x_PrintStringModIfNotDup( 895 // bool *seen, const CTempString & key, const CTempString & value ); 896 //void x_PrintIntModIfNotDup( 897 // bool *seen, const CTempString & key, const int value ); 898 899 CConstRef<CSeq_loc> x_MapMask(CSeq_loc_Mapper& mapper, const CSeq_loc& mask, 900 const CSeq_id* base_seq_id, CScope* scope); 901 902 typedef map<TSeqPos, int> TMSMap; 903 void x_GetMaskingStates(TMSMap& masking_states, 904 const CSeq_id* base_seq_id, 905 const CSeq_loc* location, 906 CScope* scope); 907 908 void x_WriteSequence(const CSeqVector& vec, 909 const TMSMap& masking_state); 910 }; 911 912 913 /// Public interface for coding region translation function 914 /// Uses CTrans_table in <objects/seqfeat/Genetic_code_table.hpp> 915 /// for rapid translation from a given genetic code, allowing all 916 /// of the iupac nucleotide ambiguity characters 917 918 class NCBI_XOBJUTIL_EXPORT CCdregion_translate 919 { 920 public: 921 922 enum ETranslationLengthProblemOptions { 923 eThrowException = 0, 924 eTruncate, 925 ePad 926 }; 927 928 /// translation coding region into ncbieaa protein sequence 929 NCBI_DEPRECATED 930 static void TranslateCdregion (string& prot, 931 const CBioseq_Handle& bsh, 932 const CSeq_loc& loc, 933 const CCdregion& cdr, 934 bool include_stop = true, 935 bool remove_trailing_X = false, 936 bool* alt_start = 0, 937 ETranslationLengthProblemOptions options = eThrowException); 938 939 NCBI_DEPRECATED 940 static void TranslateCdregion(string& prot, 941 const CSeq_feat& cds, 942 CScope& scope, 943 bool include_stop = true, 944 bool remove_trailing_X = false, 945 bool* alt_start = 0, 946 ETranslationLengthProblemOptions options = eThrowException); 947 }; 948 949 950 class NCBI_XOBJUTIL_EXPORT CSeqTranslator 951 { 952 public: 953 /// @sa TTranslationFlags 954 enum ETranslationFlags { 955 fDefault = 0, 956 fNoStop = (1<<0), ///< = 0x1 Do not include stop in translation 957 fRemoveTrailingX = (1<<1), ///< = 0x2 Remove trailing Xs from protein 958 fIs5PrimePartial = (1<<2), ///< = 0x4 Translate first codon even if not start codon (because sequence is 5' partial) 959 fIs3PrimePartial = (1<<3) ///< = 0x8 May not end in stop codon (because sequence is 3' partial) 960 }; 961 962 typedef int TTranslationFlags; 963 964 965 966 /// Translate a string using a specified genetic code 967 /// @param seq 968 /// String containing IUPAC representation of sequence to be translated 969 /// @param code 970 /// Genetic code to use for translation (NULL to use default) 971 /// @param include_stop 972 /// If true, translate through stop codons and include trailing stop 973 /// (true by default) 974 /// @param remove_trailing_X 975 /// If true, remove trailing Xs from protein translation (false by 976 /// default) 977 /// @param alt_start 978 /// Pointer to bool to indicate whether an alternative start codon was 979 /// used 980 /// @param is_5prime_complete 981 /// If true, only translate first codon if start codon, otherwise 982 /// translate as dash (-) to indicate problem with sequence 983 984 NCBI_DEPRECATED static void Translate(const string& seq, 985 string& prot, 986 const CGenetic_code* code, 987 bool include_stop = true, 988 bool remove_trailing_X = false, 989 bool* alt_start = NULL, 990 bool is_5prime_complete = true, 991 bool is_3prime_complete = true); 992 993 /// Translate a string using a specified genetic code 994 /// @param seq 995 /// String containing IUPAC representation of sequence to be translated 996 /// @param code 997 /// Genetic code to use for translation (NULL to use default) 998 /// @param flags 999 /// Binary OR of "ETranslationFlags" 1000 /// @param alt_start 1001 /// Pointer to bool to indicate whether an alternative start codon was 1002 /// used 1003 static void Translate(const string& seq, 1004 string& prot, 1005 TTranslationFlags flags = fDefault, 1006 const CGenetic_code* code = NULL, 1007 bool* alt_start = NULL); 1008 1009 /// Translate a seq-vector using a specified genetic code 1010 /// if the code is NULL, then the default genetic code is used 1011 /// @param seq 1012 /// CSeqVector of sequence to be translated 1013 /// @param code 1014 /// Genetic code to use for translation (NULL to use default) 1015 /// @param include_stop 1016 /// If true, translate through stop codons and include trailing stop 1017 /// (true by default) 1018 /// @param remove_trailing_X 1019 /// If true, remove trailing Xs from protein translation (false by 1020 /// default) 1021 /// @param alt_start 1022 /// Pointer to bool to indicate whether an alternative start codon was 1023 /// used 1024 /// @param is_5prime_complete 1025 /// If true, only translate first codon if start codon, otherwise 1026 /// translate as dash (-) to indicate problem with sequence 1027 NCBI_DEPRECATED static void Translate(const CSeqVector& seq, 1028 string& prot, 1029 const CGenetic_code* code, 1030 bool include_stop = true, 1031 bool remove_trailing_X = false, 1032 bool* alt_start = NULL, 1033 bool is_5prime_complete = true, 1034 bool is_3prime_complete = true); 1035 1036 /// Translate a seq-vector using a specified genetic code 1037 /// if the code is NULL, then the default genetic code is used 1038 /// @param seq 1039 /// CSeqVector of sequence to be translated 1040 /// @param code 1041 /// Genetic code to use for translation (NULL to use default) 1042 /// @param flags 1043 /// Binary OR of "ETranslationFlags" 1044 /// @param alt_start 1045 /// Pointer to bool to indicate whether an alternative start codon was 1046 /// used 1047 static void Translate(const CSeqVector& seq, 1048 string& prot, 1049 TTranslationFlags flags = fDefault, 1050 const CGenetic_code* code = NULL, 1051 bool* alt_start = NULL); 1052 1053 /// utility function: translate a given location on a sequence 1054 NCBI_DEPRECATED 1055 static void Translate(const CSeq_loc& loc, 1056 const CBioseq_Handle& handle, 1057 string& prot, 1058 const CGenetic_code* code = NULL, 1059 bool include_stop = true, 1060 bool remove_trailing_X = false, 1061 bool* alt_start = 0); 1062 1063 /// utility function: translate a given location on a sequence 1064 static void Translate(const CSeq_loc& loc, 1065 CScope& scope, 1066 string& prot, 1067 const CGenetic_code* code = NULL, 1068 bool include_stop = true, 1069 bool remove_trailing_X = false, 1070 bool* alt_start = 0); 1071 1072 /// Translate a CDRegion into a protein 1073 static void Translate(const CSeq_feat& cds, 1074 CScope& scope, 1075 string& prot, 1076 bool include_stop = true, 1077 bool remove_trailing_X = false, 1078 bool* alt_start = 0); 1079 1080 static CRef<CBioseq> TranslateToProtein(const CSeq_feat& cds, 1081 CScope& scope); 1082 1083 static bool ChangeDeltaProteinToRawProtein(CRef<CBioseq> protein); 1084 1085 /// Find "best" frame for a coding region. "Best" frame has no 1086 /// internal stop codons. 1087 static CCdregion::EFrame FindBestFrame(const CSeq_feat& cds, CScope& scope); 1088 static CCdregion::EFrame FindBestFrame(const CSeq_feat& cds, CScope& scope, bool& ambiguous); 1089 1090 }; 1091 1092 1093 1094 /// Location relative to a base Seq-loc: one (usually) or more ranges 1095 /// of offsets. 1096 /// XXX - handle fuzz? 1097 struct NCBI_XOBJUTIL_EXPORT SRelLoc 1098 { 1099 enum EFlags { 1100 fNoMerge = 0x1 ///< don't merge adjacent intervals 1101 }; 1102 typedef int TFlags; ///< binary OR of EFlags 1103 1104 /// For relative ranges (ONLY), id is irrelevant and normally unset. 1105 typedef CSeq_interval TRange; 1106 typedef vector<CRef<TRange> > TRanges; 1107 1108 /// Beware: treats locations corresponding to different sequences as 1109 /// disjoint, even if one is actually a segment of the other. :-/ 1110 SRelLoc(const CSeq_loc& parent, const CSeq_loc& child, CScope* scope = 0, 1111 TFlags flags = 0); 1112 1113 /// For manual work. As noted above, ranges need not contain any IDs. SRelLocSRelLoc1114 SRelLoc(const CSeq_loc& parent, const TRanges& ranges) 1115 : m_ParentLoc(&parent), m_Ranges(ranges) { } 1116 ResolveSRelLoc1117 CRef<CSeq_loc> Resolve(CScope* scope = 0, TFlags flags = 0) const 1118 { return Resolve(*m_ParentLoc, scope, flags); } 1119 CRef<CSeq_loc> Resolve(const CSeq_loc& new_parent, CScope* scope = 0, 1120 TFlags flags = 0) const; 1121 1122 CConstRef<CSeq_loc> m_ParentLoc; 1123 TRanges m_Ranges; 1124 }; 1125 1126 1127 1128 ///============================================================================// 1129 /// Sequence Search // 1130 ///============================================================================// 1131 1132 /// CSeqSearch 1133 /// ========== 1134 /// 1135 /// Search a nucleotide sequence for one or more patterns 1136 /// 1137 1138 class NCBI_XOBJUTIL_EXPORT CSeqSearch 1139 { 1140 public: 1141 1142 /// Holds information associated with a pattern, such as the name of the 1143 /// restriction enzyme, location of cut site etc. 1144 class CPatternInfo 1145 { 1146 public: 1147 /// constructor CPatternInfo(const string & name,const string & sequence,Int2 cut_site)1148 CPatternInfo(const string& name, 1149 const string& sequence, 1150 Int2 cut_site) : 1151 m_Name(name), m_Sequence(sequence), m_CutSite(cut_site), 1152 m_Strand(eNa_strand_unknown) 1153 {} 1154 GetName(void) const1155 const string& GetName (void) const { return m_Name; } GetSequence(void) const1156 const string& GetSequence (void) const { return m_Sequence; } GetCutSite(void) const1157 Int2 GetCutSite (void) const { return m_CutSite; } GetStrand(void) const1158 ENa_strand GetStrand (void) const { return m_Strand; } 1159 1160 private: 1161 friend class CSeqSearch; 1162 1163 /// data 1164 string m_Name; /// user defined name 1165 string m_Sequence; /// nucleotide sequence 1166 Int2 m_CutSite; 1167 ENa_strand m_Strand; 1168 }; 1169 typedef CPatternInfo TPatternInfo; 1170 1171 /// Client interface: 1172 /// ================== 1173 /// A class that uses the SeqSearch facility should implement the Client 1174 /// interface and register itself with the search utility to be notified 1175 /// of matches detection. 1176 class IClient 1177 { 1178 public: ~IClient()1179 virtual ~IClient() { } 1180 1181 typedef CSeqSearch::TPatternInfo TPatternInfo; 1182 1183 virtual bool OnPatternFound(const TPatternInfo& pat_info, TSeqPos pos) = 0; 1184 }; 1185 1186 public: 1187 1188 enum ESearchFlag { 1189 fNoFlags = 0, 1190 fJustTopStrand = 1, 1191 fExpandPattern = 2, 1192 fAllowMismatch = 4 1193 }; 1194 typedef unsigned int TSearchFlags; ///< binary OR of ESearchFlag 1195 1196 /// constructors 1197 /// @param client 1198 /// pointer to a client object (receives pattern match notifications) 1199 /// @param flags 1200 /// specify search flags 1201 CSeqSearch(IClient *client = 0, TSearchFlags flags = fNoFlags); 1202 /// destructor 1203 ~CSeqSearch(void); 1204 1205 /// Add nucleotide pattern or restriction site to sequence search. 1206 /// Uses ambiguity codes, e.g., R = A and G, H = A, C and T 1207 void AddNucleotidePattern(const string& name, /// pattern's name 1208 const string& sequence, /// pattern's sequence 1209 Int2 cut_site, 1210 TSearchFlags flags = fNoFlags); 1211 1212 /// Search the sequence for patterns 1213 /// @sa 1214 /// AddNucleotidePattern 1215 void Search(const CBioseq_Handle& seq); 1216 1217 /// Low level search method. 1218 /// The user is responsible for feeding each character in turn, 1219 /// keep track of the position in the text and provide the length in case of 1220 /// a circular topoloy. 1221 int Search(int current_state, char ch, int position, int length = kMax_Int); 1222 1223 /// Get / Set client. GetClient() const1224 const IClient* GetClient() const { return m_Client; } SetClient(IClient * client)1225 void SetClient(IClient* client) { m_Client = client; } 1226 1227 private: 1228 1229 void x_AddNucleotidePattern(const string& name, string& pattern, 1230 Int2 cut_site, ENa_strand strand, TSearchFlags flags); 1231 1232 void x_ExpandPattern(string& sequence, string& buffer, size_t pos, 1233 TPatternInfo& pat_info, TSearchFlags flags); 1234 1235 void x_AddPattern(TPatternInfo& pat_info, string& sequence, TSearchFlags flags); 1236 void x_StorePattern(TPatternInfo& pat_info, string& sequence); 1237 x_IsJustTopStrand(TSearchFlags flags) const1238 bool x_IsJustTopStrand(TSearchFlags flags) const { 1239 return ((m_Flags | flags) & fJustTopStrand) != 0; 1240 } x_IsExpandPattern(TSearchFlags flags) const1241 bool x_IsExpandPattern(TSearchFlags flags) const { 1242 return ((m_Flags | flags) & fExpandPattern) != 0; 1243 } x_IsAllowMismatch(TSearchFlags flags) const1244 bool x_IsAllowMismatch(TSearchFlags flags) const { 1245 return ((m_Flags | flags) & fAllowMismatch) != 0; 1246 } 1247 1248 // data 1249 IClient* m_Client; // pointer to client object 1250 TSearchFlags m_Flags; // search flags 1251 size_t m_LongestPattern; // longets search pattern 1252 CTextFsm<TPatternInfo> m_Fsa; // finite state machine 1253 }; // end of CSeqSearch 1254 1255 1256 /// This trims ambiguous bases from the start and/or end of 1257 /// sequences, using customizable rules. 1258 class NCBI_XOBJUTIL_EXPORT CSequenceAmbigTrimmer : public CObject 1259 { 1260 public: 1261 1262 /// This enum is used to set what is meant by "ambiguous". 1263 enum EMeaningOfAmbig { 1264 /// Here, only N for nucleotides and X for amino acids is considered 1265 /// ambiguous. 1266 eMeaningOfAmbig_OnlyCompletelyUnknown, 1267 /// Here, anything that's not certain is considered 1268 /// ambiguous. That is, anything but A, C, G, T for nucleotides, 1269 /// and most amino acids except, for example, B (which can be 1270 /// aspartic acid or asparagine), X (completely ambiguous), etc. 1271 eMeaningOfAmbig_AnyAmbig, 1272 }; 1273 1274 enum EFlags { 1275 fFlags_DoNotTrimBeginning = (1 << 0), ///< 0x01 ("Beginning" as defined by CSeqVector) 1276 fFlags_DoNotTrimEnd = (1 << 1), ///< 0x02 ("End" as defined by CSeqVector) 1277 1278 fFlags_DoNotTrimSeqGap = (1 << 2), ///< 0x04 (Seq-gaps are not considered trimmable if this flag is set, only letter gaps (e.g. N's for nucs)) 1279 1280 // we might support this in the future 1281 // fFlags_TrimAnnot = (1 << 3) ///< 0x08 (Trim annots based on trimmed bioseq location) 1282 }; 1283 typedef int TFlags; 1284 1285 /// For example, if bases_to_check is 10 and max_bases_allowed_to_be_ambig 1286 /// is 5, then on each iteration we check the 10 terminal bases and 1287 /// trim off those 10 if there are more than 5 ambiguous bases there. 1288 struct NCBI_XOBJUTIL_EXPORT STrimRule { 1289 TSignedSeqPos bases_to_check; 1290 TSignedSeqPos max_bases_allowed_to_be_ambig; 1291 }; 1292 /// Multiple STrimRules are allowed, which are applied from 1293 /// smallest bases_to_check to largest bases_to_check, and 1294 /// redundant rules are automatically removed. When a rule is applied, 1295 /// we start over at the first sorted rule again. 1296 typedef vector<STrimRule> TTrimRuleVec; 1297 1298 /// This returns a reasonable default for trimming rules. 1299 static const TTrimRuleVec & GetDefaultTrimRules(void); 1300 1301 /// This sets up the parameters for how this trimmer will act 1302 /// 1303 /// @param eMeaningOfAmbig 1304 /// This indicates exactly what ambiguous means (e.g. just "N" or 1305 /// do all ambiguous symbols count? ) 1306 /// @param fFlags 1307 /// miscellaneous parameters to control this. See TFlags. 1308 /// @param vecTrimRules 1309 /// This indicates how trimming will occur. See TTrimRuleVec. 1310 /// @param uMinSeqLen 1311 /// Trimming tries to halt if the sequence becomes smaller than this size. 1312 /// It is possible for the resulting sequence to be below the 1313 /// uMinSeqLen size (or even trimmed to nothing), but the trimmer 1314 /// will at least <i>try</i> not to do that. 1315 CSequenceAmbigTrimmer( 1316 EMeaningOfAmbig eMeaningOfAmbig, 1317 TFlags fFlags = 0, 1318 const TTrimRuleVec & vecTrimRules = GetDefaultTrimRules(), 1319 TSignedSeqPos uMinSeqLen = 50 ); 1320 1321 /// Do-nothing destructor just to allow inheritance. ~CSequenceAmbigTrimmer()1322 virtual ~CSequenceAmbigTrimmer() { } 1323 1324 /// This indicates what happened with the trim. 1325 /// Error states are indicated by an exception, not EResult. 1326 enum EResult { 1327 /// Bioseq is now trimmed. 1328 eResult_SuccessfullyTrimmed, 1329 1330 /// Bioseq is left unchanged because it did not need to be trimmed 1331 /// at all. This is NOT an error. 1332 eResult_NoTrimNeeded 1333 }; 1334 1335 /// This trims the given bioseq, using params 1336 /// set in the CSequenceAmbigTrimmer constructor. It will properly 1337 /// handle the annots and descs inside the bioseq, too, if requested. 1338 /// 1339 /// @param bioseq_handle 1340 /// The bioseq to trim. 1341 /// @param trimmed_ranges 1342 /// The ranges trimmed by DoTrim will be added to this. 1343 /// @return 1344 /// This returns how the trimming went. On error, an exception 1345 /// is thrown and the bioseq may be in an undefined state. 1346 virtual EResult DoTrim( CBioseq_Handle &bioseq_handle, 1347 CRangeCollection<TSeqPos> *trimmed_ranges = nullptr); 1348 1349 protected: 1350 /// This holds the current interpretation for "ambiguous". For example, 1351 /// it indicates whether just 'N' is ambiguous or if any non-ACGT 1352 /// letter is ambiguous. Works for amino acids, too (e.g. 'X' for 1353 /// completely unknown, etc.) 1354 EMeaningOfAmbig m_eMeaningOfAmbig; 1355 /// This holds the flags that affect the behavior of this class. 1356 TFlags m_fFlags; 1357 /// This holds the trimming rules that will be applied. 1358 /// It should be normalized by the constructor 1359 /// to eliminate dups and to sort it from least to most bases. 1360 TTrimRuleVec m_vecTrimRules; 1361 /// When the bioseq gets trimmed down to less than this size, 1362 /// we halt the trimming. 1363 TSignedSeqPos m_uMinSeqLen; 1364 1365 /// Test if a given flag is set. x_TestFlag(TFlags fFlag)1366 bool x_TestFlag(TFlags fFlag) { 1367 return ( ( m_fFlags & fFlag ) != 0 ); 1368 } 1369 1370 /// This prepares the vector of trimming rules to be used 1371 /// by the trimming algorithm. For example, it eliminate duplicates 1372 /// and puts the rules in the correct order. 1373 /// 1374 /// @param vecTrimRules 1375 /// Input and output. 1376 virtual void x_NormalizeVecTrimRules( TTrimRuleVec & vecTrimRules ); 1377 1378 /// The bioseq is trimmed to size 0. 1379 /// 1380 /// @param bioseq_handle 1381 /// The bioseq to trim to nothing. 1382 /// @returns 1383 /// Works just like the DoTrim return value. 1384 virtual EResult x_TrimToNothing( CBioseq_Handle &bioseq_handle ); 1385 1386 // below this point, left/right means positions going numerically upward, 1387 // but start/end is relative to direction. That is, 1388 // a negative direction would imply end <= start. 1389 1390 /// This returns the last good base that won't be trimmed 1391 /// (note: last really means "first" when we're starting from the end) 1392 /// 1393 /// @param seqvec 1394 /// This lets us explore the Bioseq to find out where to trim. 1395 /// @param iStartPosInclusive_arg 1396 /// This is the where we start our trimming. Depending on 1397 /// direction, this could be < or > iEndPosInclusive_arg. 1398 /// @param iEndPosInclusive_arg 1399 /// This is where the trimming ends (inclusive). Analogous to 1400 /// iStartPosInclusive_arg. 1401 /// @param iTrimDirection 1402 /// 1 to trim from left to right, -1 to trim from right to left. 1403 /// @return 1404 /// The last good base (remember: last means "lower number" when we're 1405 /// checking from the end). If trimming would trim off the entire 1406 /// sequence, it returns a position past the end of the sequence. 1407 virtual TSignedSeqPos x_FindWhereToTrim( 1408 const CSeqVector & seqvec, 1409 const TSignedSeqPos iStartPosInclusive_arg, 1410 const TSignedSeqPos iEndPosInclusive_arg, 1411 TSignedSeqPos iTrimDirection ); 1412 1413 /// This adjusts in_out_uStartOfGoodBasesSoFar if we're at 1414 /// a CSeqMap gap. It does not notice ambiguous bases that 1415 /// are inside a normal sequence. 1416 /// 1417 /// @param seqvec 1418 /// This is used to access information about the sequence. 1419 /// @param in_out_uStartOfGoodBasesSoFar 1420 /// This is the start of where we check for a gap. 1421 /// It will be updated to be past the gap, if a gap is found. 1422 /// @param in_out_uRightmostGoodBaseSoFar 1423 /// Analogous to in_out_uLeftmostGoodBaseSoFar. It's inclusive. 1424 /// @param uEndOfGoodBasesSoFar 1425 /// This limits how far this function may search (inclusive) 1426 /// when looking for 1427 /// the end of a gap segment. 1428 /// @param iTrimDirection 1429 /// 1 to trim from left to right, -1 to trim from right to left. 1430 /// @param uChunkSize 1431 /// The gap size that we chop off must be a multiple of uChunkSize. 1432 /// We will chop off less if we would go more than 1 past the 1433 /// uEndOfGoodBasesSoFar. 1434 /// A uChunkSize of 1 means no chunking for obvious math reasons. 1435 virtual void x_EdgeSeqMapGapAdjust( 1436 const CSeqVector & seqvec, 1437 TSignedSeqPos & in_out_uStartOfGoodBasesSoFar, 1438 const TSignedSeqPos uEndOfGoodBasesSoFar, 1439 const TSignedSeqPos iTrimDirection, 1440 const TSignedSeqPos uChunkSize ); 1441 1442 /// This holds the output of x_CountAmbigInRange 1443 struct NCBI_XOBJUTIL_EXPORT SAmbigCount : public CObject { SAmbigCountCSequenceAmbigTrimmer::SAmbigCount1444 SAmbigCount(const TSignedSeqPos iTrimDirection) : 1445 num_ambig_bases(0), 1446 pos_after_last_gap( 1447 (iTrimDirection > 0) 1448 ? numeric_limits<TSignedSeqPos >::max() 1449 : numeric_limits<TSignedSeqPos >::min() ) 1450 { } 1451 1452 /// the number of ambiguous bases found in the range 1453 /// supplied to x_CountAmbigInRange 1454 TSignedSeqPos num_ambig_bases; 1455 /// Inclusive. This is far past the end if the whole range 1456 /// is ambiguous. 1457 TSignedSeqPos pos_after_last_gap; 1458 }; 1459 1460 /// This counts the number of ambiguous bases in the range 1461 /// [leftmost_pos_to_check, rightmost_pos_to_check]. Note that 1462 /// rightmost_pos_to_check is inclusive. 1463 /// 1464 /// @param out_result 1465 /// This will store the result. Pass in a struct initialized 1466 /// by the default constructor. 1467 /// @param seqvec 1468 /// This is used to get the bases. 1469 /// @param iStartPosInclusive 1470 /// This is where we start our count. 1471 /// @param iEndPosInclusive 1472 /// This is where we end our count. Note that it can be < or 1473 /// > iStartPosInclusive, depending on trim direction. 1474 /// @param iTrimDirection 1475 /// 1 to trim from left to right, -1 to trim from right to left. 1476 virtual void x_CountAmbigInRange( 1477 SAmbigCount & out_result, 1478 const CSeqVector & seqvec, 1479 const TSignedSeqPos iStartPosInclusive_arg, 1480 const TSignedSeqPos iEndPosInclusive_arg, 1481 const TSignedSeqPos iTrimDirection ); 1482 1483 /// This returns the (inclusive) position at the beginning of the 1484 /// segment. 1485 /// 1486 /// @param segment 1487 /// This is the segment we're trying to find the beginning of. 1488 /// @param iTrimDirection 1489 /// This is which direction in which we're trimming. The beginning 1490 /// will be in the opposite direction. 1491 /// @return 1492 /// This returns the (inclusive) position at the beginning of the given 1493 /// segment. As always, 1494 /// the definition of "beginning" depends on iTrimDirection. x_SegmentGetBeginningInclusive(const CSeqMap_CI & segment,const TSignedSeqPos iTrimDirection)1495 TSignedSeqPos x_SegmentGetBeginningInclusive( 1496 const CSeqMap_CI & segment, 1497 const TSignedSeqPos iTrimDirection ) 1498 { 1499 // symmetrical 1500 return x_SegmentGetEndInclusive( segment, -iTrimDirection ); 1501 } 1502 1503 /// This returns the (inclusive) position at the end of the 1504 /// segment currently at iStartPosInclusive_arg. 1505 /// 1506 /// @param segment 1507 /// This is the segment we're trying to find the end of. 1508 /// @param iTrimDirection 1509 /// This is which direction in which we're trimming. The end 1510 /// of the segment will be found by looking in that direction. 1511 /// @return 1512 /// This returns the (inclusive) position at the end of the given 1513 /// segment. The definition of "end" depends on iTrimDirection. 1514 TSignedSeqPos x_SegmentGetEndInclusive( 1515 const CSeqMap_CI & segment, 1516 const TSignedSeqPos iTrimDirection ); 1517 1518 /// Returns the "next" segment. The definition of "next" 1519 /// depends on iTrimDirection 1520 /// 1521 /// @param in_out_segment 1522 /// Caller gives the current CSeqMap_CI, which will be 1523 /// returned adjusted in the trim direction. 1524 /// @param iTrimDirection 1525 /// The direction in which to increment. 1 means normal incrementing 1526 /// and -1 really means decrementing. 1527 /// @return 1528 /// Reference to in_out_segment_it. 1529 CSeqMap_CI & x_SeqMapIterDoNext( 1530 CSeqMap_CI & in_out_segment_it, 1531 const TSignedSeqPos iTrimDirection ); 1532 1533 void x_SliceBioseq( 1534 TSignedSeqPos leftmost_good_base, 1535 TSignedSeqPos rightmost_good_base, 1536 CBioseq_Handle & bioseq_handle ); 1537 1538 // For each letter of the alphabet, returns whether or not it's 1539 // ambiguous. Index 0 is 'A', index 1 is 'B', etc. 1540 typedef bool TAmbigLookupTable[26]; 1541 TAmbigLookupTable m_arrNucAmbigLookupTable; 1542 TAmbigLookupTable m_arrProtAmbigLookupTable; 1543 }; 1544 1545 /// This iterates over the runs of Ns of each sequence 1546 class NCBI_XOBJUTIL_EXPORT CBioseqGaps_CI : public CObject 1547 { 1548 public: 1549 1550 /// The params that control the behavior of CBioseqGaps_CI 1551 struct NCBI_XOBJUTIL_EXPORT Params { 1552 /// Default ctor gives params which are usually reasonable. ParamsCBioseqGaps_CI::Params1553 Params(void) : max_gap_len_to_ignore(10), 1554 max_num_gaps_per_seq( numeric_limits<TSeqPos>::max() ), 1555 max_num_seqs( numeric_limits<TSeqPos>::max() ), 1556 mol_filter(CSeq_inst::eMol_not_set), 1557 level_filter(CBioseq_CI::eLevel_All) 1558 { 1559 } 1560 1561 /// We completely ignore any gaps we find that have this 1562 /// number of bases or fewer. 1563 TSeqPos max_gap_len_to_ignore; 1564 /// We only return up to this many gaps for each sequence 1565 TSeqPos max_num_gaps_per_seq; 1566 /// We only return gaps on up to this many sequences. 1567 TSeqPos max_num_seqs; 1568 1569 /// CSeq_inst::eMol_na to only look at gaps on nucleotide 1570 /// sequences. CSeq_inst::eMol_aa to only look at gaps 1571 /// on amino acid sequences. 1572 /// CSeq_inst::eMol_not_set to avoid filtering. 1573 CSeq_inst::EMol mol_filter; 1574 /// Works like the level filter in CBioseq_CI 1575 CBioseq_CI::EBioseqLevelFlag level_filter; 1576 }; 1577 1578 /// This constructor initializes the iterator. 1579 /// 1580 /// @param entry_h 1581 /// This will iterate over all descendents of this entry. 1582 /// @param params 1583 /// Controls the behavior of the iterator. If not specified, 1584 /// a reasonable default will be used. 1585 CBioseqGaps_CI( 1586 const CSeq_entry_Handle & entry_h, 1587 const Params & params = Params() ); 1588 1589 /// Move the iterator forward to next gap 1590 /// (or the end, if there are no more to return) operator ++(void)1591 CBioseqGaps_CI & operator ++ (void) { x_Next(); return *this; } 1592 1593 DECLARE_OPERATOR_BOOL(m_bioseq_CI); 1594 1595 /// This indicates the state of the iterator right now. 1596 /// This structure is undefined if the iterator 1597 /// has reached the end, though the caller probably 1598 /// won't be able to access it anyway since x_GetCurrent 1599 /// will throw an exception in that case. 1600 struct NCBI_XOBJUTIL_EXPORT SCurrentGapInfo { 1601 /// Constructor initializes to state that it 1602 /// should be when the iterator first starts. SCurrentGapInfoCBioseqGaps_CI::SCurrentGapInfo1603 SCurrentGapInfo(void) : num_seqs_seen_so_far(0), 1604 start_pos(0), 1605 length(0), 1606 num_gaps_seen_so_far_on_this_seq(0) { } 1607 1608 /// The seq-id that this gap is on. 1609 CSeq_id_Handle seq_id; 1610 /// This indicates how many sequences we've seen so far, 1611 /// including the one we're currently on. 1612 /// For example, 3 means we're on the 3rd sequence to 1613 /// contain a relevant gap. 1614 size_t num_seqs_seen_so_far; 1615 1616 /// the 0-based position at which the current gap starts 1617 /// on the current sequence. 1618 TSeqPos start_pos; 1619 /// the length of the current gap 1620 TSeqPos length; 1621 /// how many gaps we've seen so far on this sequence. 1622 /// For example, 2 would mean we're currently on 1623 /// the second relevant gap on this sequence. 1624 size_t num_gaps_seen_so_far_on_this_seq; 1625 }; 1626 1627 /// Get information about the gap we're currently on. operator *(void) const1628 const SCurrentGapInfo & operator*(void) const { 1629 return x_GetCurrent(); 1630 } 1631 1632 /// Get information about the gap we're currently on. operator ->(void) const1633 const SCurrentGapInfo * operator ->(void) const { 1634 return &x_GetCurrent(); 1635 } 1636 1637 protected: 1638 /// This points to the bioseq we're currently on. 1639 /// When this iterator becomes invalid, that means this 1640 /// CBioseqGaps_CI is invalid, too. 1641 CBioseq_CI m_bioseq_CI; 1642 /// This indicates information about the gap we're currently on. 1643 SCurrentGapInfo m_infoOnCurrentGap; 1644 /// This holds the params the caller gave when this 1645 /// object was initially created. 1646 Params m_Params; 1647 1648 /// This gives info on the gap we're currently on. 1649 /// Throws if this iterator has finished. 1650 virtual const SCurrentGapInfo & x_GetCurrent(void) const; 1651 1652 /// This moves this iterator to the next relevant gap. 1653 /// Throws if this iterator has finished. 1654 virtual void x_Next(void); 1655 1656 /// This advances m_bioseq_CI although it 1657 /// has extra logic to terminate m_bioseq_CI 1658 /// if we've exceeded the number of bioseqs we can look for. 1659 virtual void x_NextBioseq(void); 1660 1661 /// This indicates what happened when we tried to run 1662 /// x_FindNextGapOnBioseq. 1663 enum EFindNext { 1664 /// No more relevant gaps were found on this bioseq. The other output 1665 /// parameters will be in an undefined state. 1666 eFindNext_NotFound, 1667 /// Another relevant gap was found, and the output parameters are 1668 /// filled in to represent information about it. 1669 eFindNext_Found 1670 }; 1671 1672 /// This finds the next gap on the bioseq, starting at given pos. 1673 /// 1674 /// @param bioseq_h 1675 /// the bioseq on which we're seeking the next relevant gap. 1676 /// @param pos_to_start_looking 1677 /// This is the position on bioseq_h to start looking for a 1678 /// relevant gap. 1679 /// @param out_pos_of_gap 1680 /// If a gap is found, this holds the 0-based position of the 1681 /// start of that gap. This is undefined if no gap was found. 1682 /// @param out_len_of_gap 1683 /// If a gap is found, this holds the length of the 1684 /// gap. This is undefined if no gap was found. 1685 /// @return 1686 /// This indicates whether or not a relevant gap was found. 1687 virtual EFindNext x_FindNextGapOnBioseq( 1688 const CBioseq_Handle & bioseq_h, 1689 const TSeqPos pos_to_start_looking, 1690 TSeqPos & out_pos_of_gap, 1691 TSeqPos & out_len_of_gap ) const; 1692 }; 1693 1694 /* @} */ 1695 1696 /// Reverse complement a Bioseq in place. 1697 /// If delta sequence, will also need to reverse order of segments 1698 void NCBI_XOBJUTIL_EXPORT ReverseComplement(CSeq_inst& seq, CScope* scope); 1699 1700 1701 END_SCOPE(objects) 1702 END_NCBI_SCOPE 1703 1704 #endif /* SEQUENCE__HPP */ 1705