1 /* $Id: variation_util2.hpp 590763 2019-08-05 00:34:23Z vakatov $ 2 * =========================================================================== 3 * 4 * PUBLIC DOMAIN NOTICE 5 * National Center for Biotechnology Information 6 * 7 * This software/database is a "United States Government Work" under the 8 * terms of the United States Copyright Act. It was written as part of 9 * the author's official duties as a United States Government employee and 10 * thus cannot be copyrighted. This software/database is freely available 11 * to the public for use. The National Library of Medicine and the U.S. 12 * Government have not placed any restriction on its use or reproduction. 13 * 14 * Although all reasonable efforts have been taken to ensure the accuracy 15 * and reliability of the software and data, the NLM and the U.S. 16 * Government do not and cannot warrant the performance or results that 17 * may be obtained by using this software or data. The NLM and the U.S. 18 * Government disclaim all warranties, express or implied, including 19 * warranties of performance, merchantability or fitness for any particular 20 * purpose. 21 * 22 * Please cite the author in any work or product based on this material. 23 * 24 * =========================================================================== 25 * 26 * File Description: 27 * 28 * Utility functions pretaining to Variation 29 * 30 * =========================================================================== 31 */ 32 33 #ifndef VARIATION_UTIL2_HPP_ 34 #define VARIATION_UTIL2_HPP_ 35 36 #include <corelib/ncbiobj.hpp> 37 #include <corelib/ncbistd.hpp> 38 39 #include <objects/seqloc/Seq_loc.hpp> 40 #include <objects/seqloc/Seq_interval.hpp> 41 #include <objects/seq/Seq_literal.hpp> 42 #include <objects/seqalign/Seq_align.hpp> 43 #include <objects/seqalign/Seq_align_set.hpp> 44 #include <objects/seq/Seq_data.hpp> 45 46 #include <objects/seqfeat/Seq_feat.hpp> 47 #include <objects/seqfeat/Variation_ref.hpp> 48 #include <objects/seqfeat/VariantProperties.hpp> 49 50 #include <objects/variation/Variation.hpp> 51 #include <objects/variation/VariantPlacement.hpp> 52 #include <objects/seqfeat/Variation_inst.hpp> 53 54 #include <util/rangemap.hpp> 55 56 #include <objmgr/scope.hpp> 57 #include <objmgr/feat_ci.hpp> 58 #include <objmgr/seq_vector.hpp> 59 #include <objmgr/seq_loc_mapper.hpp> 60 #include <objmgr/util/feature.hpp> 61 62 BEGIN_NCBI_SCOPE 63 USING_SCOPE(objects); 64 65 namespace variation { 66 67 class CVariationUtil : public CObject 68 { 69 public: 70 enum EOptions 71 { 72 fOpt_cache_exon_sequence = 1, ///< Use when there will be many calls to calculate protein consequnece per sequence 73 fOpt_default = 0 74 }; 75 typedef int TOptions; 76 CVariationUtil(CScope & scope,TOptions options=fOpt_default)77 CVariationUtil(CScope& scope, TOptions options = fOpt_default) 78 : m_scope(&scope) 79 , m_variant_properties_index(scope) 80 , m_cdregion_index(scope, options) 81 {} 82 ClearCache()83 void ClearCache() 84 { 85 m_variant_properties_index.Clear(); 86 m_cdregion_index.Clear(); 87 } 88 89 CRef<CVariation> AsVariation(const CSeq_feat& variation_ref); 90 void AsVariation_feats(const CVariation& v, CSeq_annot::TData::TFtable& feats); 91 92 93 /// Methods to remap a VariantPlacement 94 95 /*! 96 * Remap using provided alignment. Ids of at least one row must match. 97 * When remapping between transcript/genomic coordiane systems, this method should be used 98 * instead of the mapper-based, as if the location is intronic, the method 99 * will use the spliced-alignment to create or resolve offset-placement as appropriate. 100 101 RSNP-2558 - Sometimes we don't need to check the validity of resulting placements, so this costly step can 102 be skipped 103 */ 104 CRef<CVariantPlacement> Remap(const CVariantPlacement& p, const CSeq_align& aln, bool check_placements = true); 105 106 // Remap a placement with a specified mapper. If a cdna->genomic or genomic->cdna mapping 107 // is attempted, this will throw. 108 CRef<CVariantPlacement> Remap(const CVariantPlacement& p, CSeq_loc_Mapper& mapper); 109 110 111 /// Remap variation from product coordinates onto a nucleotide sequence on which this product is annotated. 112 /// e.g. {NM,NR,NP} -> {NG,NT,NW,AC,NC}. Use annotated seq-align, if available; otherwise use spliced rna or cdregion. 113 /// If starting from NP, remap to the parent mRNA first. 114 CRef<CVariantPlacement> RemapToAnnotatedTarget(const CVariation& v, const CSeq_id& target); 115 116 117 /// Methods to convert between nucleotide and protein 118 119 /// Convert protein-variation (single-AA missense/nonsense) to nuc-variation on the precursor in the parent nuc-prot. 120 typedef int TAA2NAFlags; 121 enum EAA2NAFlags 122 { 123 fAA2NA_truncate_common_prefix_and_suffix = 1, 124 fAA2NA_default = 0 125 }; 126 CRef<CVariation> InferNAfromAA(const CVariation& prot_variation, TAA2NAFlags flags = fAA2NA_default); 127 128 129 /// Evaluate protein effect of a single-inst @ single-placement 130 /// 131 /// Two placements will be added: The protein placement, and an additional placement 132 /// on the original nucleotide level, representing the location and sequence of affected codons. 133 /// 134 /// The inst will be at the nucleotide level (describe variant codons). Rationale: the user may 135 /// want to know the codons, or translate it to AA if necessary. 136 /// 137 /// Only a subset of sequence edits is supported (mismatch, ins, del, indel) 138 /// Nuc-variation's placement must fall within cds-feat location. 139 CRef<CVariation> TranslateNAtoAA(const CVariation_inst& nuc_inst, const CVariantPlacement& p, const CSeq_feat& cds_feat); 140 141 142 /// Find the CDSes for the first placement; Compute prot consequence using TranslateNAtoAA for each 143 /// and attach results to nuc_variation.consequnece. 144 /// If seq-id specified, use only the placement with the specified id. 145 /// If ignore_genomic set to true, then consequences will be evaluated for cDNA or MT placements only. 146 /// Note: Alternatively, the API could be "create and return consequence protein variation(s)", rather than attach, 147 /// but that for hierarchical input it would be hard to tell which consequence corresponds to which node. 148 void AttachProteinConsequences(CVariation& nuc_variation, const CSeq_id* = NULL, bool ignore_genomic = false); 149 150 151 ///Other utility methods: 152 153 /// Flipping strand involves flipping all placements, and insts. 154 /// Note: for insertions, the placement(s) must be dinucleotide where 155 /// insertion occurs, such that semantic correctness is maintained. 156 void FlipStrand(CVariation& v) const; 157 158 /// Flipping a placement involves flipping strand on the seq-loc and swapping the offsets (and signs). 159 /// If there's a seq-literal, it is reverse-complemented. 160 void FlipStrand(CVariantPlacement& vp) const; 161 162 /// Flip strand on delta-items and reverse the order. 163 void FlipStrand(CVariation_inst& vi) const; 164 165 /// Reverse-complement seq-literals, flip strand on the seq-locs. 166 void FlipStrand(CDelta_item& di) const; 167 168 enum { kMaxAttachSeqLen = 100000} ; //VAR-724 169 /// If have offsets (intronic) or too long, return false; 170 /// else set seq field on the placement and return true. 171 bool AttachSeq(CVariantPlacement& p, TSeqPos max_len = kMaxAttachSeqLen); 172 173 //Call AttachSeq on every placement; Try to find asserted-type subvariation. 174 //If the asserted sequence is different from the attached reference, add 175 //corresponding exception object to the placement. Return true iff there were no exceptions added 176 bool AttachSeq(CVariation& v, TSeqPos max_len = kMaxAttachSeqLen); 177 178 179 CVariantPlacement::TMol GetMolType(const CSeq_id& id); 180 181 /*! 182 * If placement is not intronic, or alignment is not spliced-seg -> eNotApplicable 183 * Else if variation is intronic but location is not at exon boundary -> eFail 184 * Else -> ePass 185 */ 186 enum ETestStatus 187 { 188 eFail, 189 ePass, 190 eNotApplicable, 191 }; 192 ETestStatus CheckExonBoundary(const CVariantPlacement& p, const CSeq_align& aln); 193 194 // Similar to above, except verify exon boundary using exon annotation on refseq transcripts, if exists 195 ETestStatus CheckExonBoundary(const CVariantPlacement& p); 196 197 /// if placement is invalid SeqLocCheck fails, or offsets out of order, attach VariationException and return true 198 /// otherwise return false; 199 bool CheckPlacement(CVariantPlacement& p); 200 201 /// if variation.data contains a seq-literal with non-ACGT residues, attach VariationException to the first placement 202 /// (if exists) and return true. otherwise return false; 203 bool CheckAmbiguitiesInLiterals(CVariation& v); 204 205 /// Calculate upstream (first) and downstream(second) flanks for loc 206 struct SFlankLocs 207 { 208 CRef<CSeq_loc> upstream; 209 CRef<CSeq_loc> downstream; 210 }; 211 SFlankLocs CreateFlankLocs(const CSeq_loc& loc, TSeqPos len); 212 213 214 /// Methods to compute properties 215 216 /*! 217 * Fill location-specific variant-properties for a placement. 218 * Attach related GeneID(s) to placement with respect to which the properties are defined. 219 */ 220 void SetPlacementProperties(CVariantPlacement& placement); 221 222 /* 223 * Set placement-specific variant properties for a variation by applying 224 * x_SetVariantProperties(...) on each found placement and collapse those 225 * onto parent VariationProperties 226 */ 227 void SetVariantProperties(CVariation& v); 228 229 230 /// Supported SO-terms 231 enum ESOTerm 232 { 233 //location-specific terms 234 eSO_intergenic_variant =1628, 235 eSO_2KB_upstream_variant =1636, 236 eSO_500B_downstream_variant =1634, 237 eSO_splice_donor_variant =1575, 238 eSO_splice_acceptor_variant =1574, 239 eSO_intron_variant =1627, 240 eSO_5_prime_UTR_variant =1623, 241 eSO_3_prime_UTR_variant =1624, 242 eSO_coding_sequence_variant =1580, 243 eSO_nc_transcript_variant =1619, 244 eSO_initiator_codon_variant =1582, 245 eSO_terminator_codon_variant=1590, 246 247 //variation-specific terms 248 eSO_synonymous_variant =1819, 249 eSO_missense_variant =1583, 250 eSO_inframe_indel =1820, 251 eSO_frameshift_variant =1589, 252 eSO_stop_gained =1587, 253 eSO_stop_lost =1578 254 }; 255 typedef vector<ESOTerm> TSOTerms; 256 void AsSOTerms(const CVariantProperties& p, TSOTerms& terms); 257 static string AsString(ESOTerm term); 258 259 260 /// Find location properties based on alignment. 261 void FindLocationProperties(const CSeq_align& transcript_aln, 262 const CSeq_loc& query_loc, 263 TSOTerms& terms); 264 265 266 static void s_FindLocationProperties(CConstRef<CSeq_loc> rna_loc, 267 CConstRef<CSeq_loc> cds_loc, 268 const CSeq_loc& query_loc, 269 TSOTerms& terms); 270 271 272 static TSeqPos s_GetLength(const CVariantPlacement& p, CScope* scope); 273 274 /// If at any level in variation-set all variations have all same placements, move them to the parent level. 275 static void s_FactorOutPlacements(CVariation& v); 276 277 /*! 278 * If this variation has placements defined, return it; otherwise, recursively try 279 * parent all the way to the root; return NULL if nothing found. 280 * Precondition: root Variation must have been indexed (links to parents defined as necessary) 281 */ 282 static const CVariation::TPlacements* s_GetPlacements(const CVariation& v); 283 284 285 /// Find attached consequence variation in v that corresponds to p (has same seq-id). 286 static CConstRef<CVariation> s_FindConsequenceForPlacement(const CVariation& v, const CVariantPlacement& p); 287 288 289 /// Length up to last position of the last exon (i.e. not including polyA) 290 /// If neither is annotated, return normal length 291 TSeqPos GetEffectiveTranscriptLength(const CBioseq_Handle& bsh); 292 293 294 CRef<CSeq_literal> GetLiteralAtLoc(const CSeq_loc& loc); 295 private: 296 297 298 CRef<CVariantPlacement> x_Remap(const CVariantPlacement& p, CSeq_loc_Mapper& mapper); 299 300 void ChangeToDelins(CVariation& v); 301 302 void x_SetVariantProperties(CVariantProperties& p, const CVariation_inst& vi, const CSeq_loc& loc); 303 void x_SetVariantPropertiesForIntronic(CVariantPlacement& p, int offset, const CSeq_loc& loc, CBioseq_Handle& bsh); 304 305 void x_ChangeToDelins(CVariation& v); 306 307 void x_InferNAfromAA(CVariation& v, TAA2NAFlags flags); 308 309 void x_TranslateNAtoAA(CVariation& prot_variation); 310 311 CRef<CVariation> x_CreateUnknownVariation(const CSeq_id& id, CVariantPlacement::TMol mol); 312 313 //return iupacna or ncbieaa literals 314 CRef<CSeq_literal> x_GetLiteralAtLoc(const CSeq_loc& loc); 315 316 CConstRef<CSeq_literal> x_FindOrCreateLiteral(const CVariation& v); 317 318 /* 319 * If we are flipping an insertion, we must verify that the location is a multinucleotide (normally dinucleotide) 320 * (in this case insertion is interpreted as "insert-between", and can be strand-flipped). 321 * It is impossible, however, to correctly flip the strand of insert-before-a-point variation, because 322 * "before" would become "after". 323 */ 324 static bool s_IsInstStrandFlippable(const CVariation& v, const CVariation_inst& inst); 325 326 /// join two seq-literals 327 static CRef<CSeq_literal> s_CatLiterals(const CSeq_literal& a, const CSeq_literal& b); 328 329 /// insert seq-literal payload into ref before pos (pos=0 -> prepend; pos=ref.len -> append) 330 static CRef<CSeq_literal> s_SpliceLiterals(const CSeq_literal& payload, const CSeq_literal& ref, TSeqPos pos); 331 332 static void s_AttachGeneIDdbxref(CVariantPlacement& p, int gene_id); 333 334 static void s_UntranslateProt(const string& prot_str, vector<string>& codons); 335 336 static size_t s_CountMatches(const string& a, const string& b); 337 338 void s_CalcPrecursorVariationCodon( 339 const string& codon_from, //codon on cDNA 340 const string& prot_to, //missense/nonsense AA 341 vector<string>& codons_to); //calculated variation-codon 342 343 static string s_CollapseAmbiguities(const vector<string>& seqs); 344 345 /*! 346 * Apply offsets to the variation's location (variation must be inst) 347 * E.g. the original variation could be a transcript location with offsets specifying intronic location. 348 * After original transcript lociation is remapped, the offsets are to be applied to the remapped location 349 * to get absolute offset-free genomic location. 350 */ 351 static void s_ResolveIntronicOffsets(CVariantPlacement& p); 352 353 /*! 354 * If start|stop don't fall into an exon, adjust start|stop to the closest exon boundary and add offset to inst. 355 * This is to be applied before remapping a genomic variation to transcript coordinates 356 */ 357 static void s_AddIntronicOffsets(CVariantPlacement& p, const CSpliced_seg& ss, CScope* scope); 358 359 /*! 360 * Extend delins to specified location (adjust location and delta) 361 * 362 * Precondition: 363 * -variation must be a inst-type normalized delins (via x_ChangeToDelins) 364 * -loc must be a superset of variation's location. 365 */ 366 void x_AdjustDelinsToInterval(CVariation& v, const CSeq_loc& loc); 367 368 369 370 /*! 371 * Call s_GetPlacements and find first seq-literal in any of them. 372 */ 373 static const CConstRef<CSeq_literal> s_FindFirstLiteral(const CVariation& v); 374 375 /* 376 * Find a sibling variation with with observation=asserted; 377 * if not found, recursively try parent, or return null if root. 378 * Note: Assumes CVariation::Index has been called. 379 */ 380 static const CConstRef<CSeq_literal> s_FindAssertedLiteral(const CVariation& v); 381 382 383 CRef<CVariation> x_AsVariation(const CVariation_ref& vr); 384 CRef<CVariation_ref> x_AsVariation_ref(const CVariation& v, const CVariantPlacement& p); 385 386 387 /* 388 * In variation-ref the intronic offsets are encoded as last and/or first inst delta-items. 389 * In Variation the intronic offsets are part of the placement. After creating a variation 390 * from a variation-ref we need to migrate the offsets from inst into placement. 391 */ 392 static void s_ConvertInstOffsetsToPlacementOffsets(CVariation& v, CVariantPlacement& p); 393 static void s_AddInstOffsetsFromPlacementOffsets(CVariation_inst& vi, const CVariantPlacement& p); 394 395 396 private: 397 398 /// Cache seq-data in the CDS regions and the cds features by location 399 class CCdregionIndex 400 { 401 public: 402 struct SCdregion 403 { 404 CConstRef<CSeq_feat> cdregion_feat; operator <variation::CVariationUtil::CCdregionIndex::SCdregion405 bool operator<(const SCdregion& other) const 406 { 407 return !this->cdregion_feat ? false 408 : !other.cdregion_feat ? true 409 : *this->cdregion_feat < *other.cdregion_feat; 410 } 411 }; 412 typedef vector<SCdregion> TCdregions; 413 CCdregionIndex(CScope & scope,TOptions options)414 CCdregionIndex(CScope& scope, TOptions options) 415 : m_scope(&scope) 416 , m_options(options) 417 {} 418 419 void Get(const CSeq_loc& loc, TCdregions& cdregions); 420 421 //return NULL if not cached in its entirety 422 CRef<CSeq_literal> GetCachedLiteralAtLoc(const CSeq_loc& loc); 423 Clear()424 void Clear() 425 { 426 m_data.clear(); 427 m_seq_data_map.clear(); 428 } 429 private: 430 431 struct SSeqData 432 { 433 mutable CRef<CSeq_loc_Mapper> mapper; //will map original location consisting of merged exon locations on the seq_data coordinate system 434 string seq_data; 435 }; 436 typedef map<CSeq_id_Handle, SSeqData> TSeqDataMap; 437 typedef CRangeMap<TCdregions, TSeqPos> TRangeMap; 438 typedef map<CSeq_id_Handle, TRangeMap> TIdRangeMap; 439 440 void x_Index(const CSeq_id_Handle& idh); 441 void x_CacheSeqData(const CSeq_loc& loc, const CSeq_id_Handle& idh); 442 443 CRef<CScope> m_scope; 444 TIdRangeMap m_data; 445 TSeqDataMap m_seq_data_map; 446 TOptions m_options; 447 }; 448 449 /// Given a seq-loc, compute CVariantProperties::TGene_location from annotation. 450 /// Precompute for the whole sequence and keep the cache to avoid objmgr annotation 451 /// lookups on every call. 452 class CVariantPropertiesIndex 453 { 454 public: 455 typedef pair<int, CVariantProperties::TGene_location> TGeneIDAndProp; 456 typedef vector<TGeneIDAndProp> TGeneIDAndPropVector; 457 CVariantPropertiesIndex(CScope & scope)458 CVariantPropertiesIndex(CScope& scope) 459 : m_scope(&scope) 460 {} 461 462 void GetLocationProperties(const CSeq_loc& loc, TGeneIDAndPropVector& v); Clear()463 void Clear() 464 { 465 m_loc2prop.clear(); 466 } 467 468 typedef pair<CRef<CSeq_loc>, CRef<CSeq_loc> > TLocsPair; //5' and 3' respectively 469 static TLocsPair s_GetStartAndStopCodonsLocs(const CSeq_loc& cds_loc); 470 static TLocsPair s_GetUTRLocs(const CSeq_loc& cds_loc, const CSeq_loc& parent_loc); 471 static TLocsPair s_GetNeighborhoodLocs(const CSeq_loc& gene_loc, TSeqPos max_pos); 472 static TLocsPair s_GetIntronsAndSpliceSiteLocs(const CSeq_loc& rna_loc); 473 static int s_GetGeneID(const CMappedFeat& mf, feature::CFeatTree& ft); 474 static int s_GetGeneIdForProduct(CBioseq_Handle bsh); 475 476 private: 477 void x_Index(const CSeq_id_Handle& idh); 478 void x_Add(const CSeq_loc& loc, int gene_id, CVariantProperties::TGene_location prop); 479 480 typedef CRangeMap<TGeneIDAndPropVector, TSeqPos> TRangeMap; 481 typedef map<CSeq_id_Handle, TRangeMap> TIdRangeMap; 482 483 CRef<CScope> m_scope; 484 TIdRangeMap m_loc2prop; 485 }; 486 487 488 private: 489 CRef<CScope> m_scope; 490 CVariantPropertiesIndex m_variant_properties_index; 491 CCdregionIndex m_cdregion_index; 492 }; 493 494 } 495 496 END_NCBI_SCOPE 497 #endif 498