1 /*  $Id: variation_util2.hpp 590763 2019-08-05 00:34:23Z vakatov $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * File Description:
27 *
28 *   Utility functions pretaining to Variation
29 *
30 * ===========================================================================
31 */
32 
33 #ifndef VARIATION_UTIL2_HPP_
34 #define VARIATION_UTIL2_HPP_
35 
36 #include <corelib/ncbiobj.hpp>
37 #include <corelib/ncbistd.hpp>
38 
39 #include <objects/seqloc/Seq_loc.hpp>
40 #include <objects/seqloc/Seq_interval.hpp>
41 #include <objects/seq/Seq_literal.hpp>
42 #include <objects/seqalign/Seq_align.hpp>
43 #include <objects/seqalign/Seq_align_set.hpp>
44 #include <objects/seq/Seq_data.hpp>
45 
46 #include <objects/seqfeat/Seq_feat.hpp>
47 #include <objects/seqfeat/Variation_ref.hpp>
48 #include <objects/seqfeat/VariantProperties.hpp>
49 
50 #include <objects/variation/Variation.hpp>
51 #include <objects/variation/VariantPlacement.hpp>
52 #include <objects/seqfeat/Variation_inst.hpp>
53 
54 #include <util/rangemap.hpp>
55 
56 #include <objmgr/scope.hpp>
57 #include <objmgr/feat_ci.hpp>
58 #include <objmgr/seq_vector.hpp>
59 #include <objmgr/seq_loc_mapper.hpp>
60 #include <objmgr/util/feature.hpp>
61 
62 BEGIN_NCBI_SCOPE
63 USING_SCOPE(objects);
64 
65 namespace variation {
66 
67 class CVariationUtil : public CObject
68 {
69 public:
70     enum EOptions
71     {
72         fOpt_cache_exon_sequence = 1, ///< Use when there will be many calls to calculate protein consequnece per sequence
73         fOpt_default = 0
74     };
75     typedef int TOptions;
76 
CVariationUtil(CScope & scope,TOptions options=fOpt_default)77     CVariationUtil(CScope& scope, TOptions options = fOpt_default)
78       : m_scope(&scope)
79       , m_variant_properties_index(scope)
80       , m_cdregion_index(scope, options)
81     {}
82 
ClearCache()83     void ClearCache()
84     {
85         m_variant_properties_index.Clear();
86         m_cdregion_index.Clear();
87     }
88 
89     CRef<CVariation> AsVariation(const CSeq_feat& variation_ref);
90     void AsVariation_feats(const CVariation& v, CSeq_annot::TData::TFtable& feats);
91 
92 
93 /// Methods to remap a VariantPlacement
94 
95     /*!
96      * Remap using provided alignment. Ids of at least one row must match.
97      * When remapping between transcript/genomic coordiane systems, this method should be used
98      * instead of the mapper-based, as if the location is intronic, the method
99      * will use the spliced-alignment to create or resolve offset-placement as appropriate.
100 
101      RSNP-2558 - Sometimes we don't need to check the validity of resulting placements, so this costly step can
102                  be skipped
103      */
104     CRef<CVariantPlacement> Remap(const CVariantPlacement& p, const CSeq_align& aln, bool check_placements = true);
105 
106     // Remap a placement with a specified mapper. If a cdna->genomic or genomic->cdna mapping
107     // is attempted, this will throw.
108     CRef<CVariantPlacement> Remap(const CVariantPlacement& p, CSeq_loc_Mapper& mapper);
109 
110 
111     /// Remap variation from product coordinates onto a nucleotide sequence on which this product is annotated.
112     /// e.g. {NM,NR,NP} -> {NG,NT,NW,AC,NC}. Use annotated seq-align, if available; otherwise use spliced rna or cdregion.
113     /// If starting from NP, remap to the parent mRNA first.
114     CRef<CVariantPlacement> RemapToAnnotatedTarget(const CVariation& v, const CSeq_id& target);
115 
116 
117 /// Methods to convert between nucleotide and protein
118 
119     /// Convert protein-variation (single-AA missense/nonsense) to nuc-variation on the precursor in the parent nuc-prot.
120     typedef int TAA2NAFlags;
121     enum EAA2NAFlags
122     {
123         fAA2NA_truncate_common_prefix_and_suffix = 1,
124         fAA2NA_default = 0
125     };
126     CRef<CVariation> InferNAfromAA(const CVariation& prot_variation, TAA2NAFlags flags = fAA2NA_default);
127 
128 
129     /// Evaluate protein effect of a single-inst @ single-placement
130     ///
131     /// Two placements will be added: The protein placement, and an additional placement
132     /// on the original nucleotide level, representing the location and sequence of affected codons.
133     ///
134     /// The inst will be at the nucleotide level (describe variant codons). Rationale: the user may
135     /// want to know the codons, or translate it to AA if necessary.
136     ///
137     /// Only a subset of sequence edits is supported (mismatch, ins, del, indel)
138     /// Nuc-variation's placement must fall within cds-feat location.
139     CRef<CVariation> TranslateNAtoAA(const CVariation_inst& nuc_inst, const CVariantPlacement& p, const CSeq_feat& cds_feat);
140 
141 
142     /// Find the CDSes for the first placement; Compute prot consequence using TranslateNAtoAA for each
143     /// and attach results to nuc_variation.consequnece.
144     /// If seq-id specified, use only the placement with the specified id.
145     /// If ignore_genomic set to true, then consequences will be evaluated for cDNA or MT placements only.
146     /// Note: Alternatively, the API could be "create and return consequence protein variation(s)", rather than attach,
147     ///       but that for hierarchical input it would be hard to tell which consequence corresponds to which node.
148     void AttachProteinConsequences(CVariation& nuc_variation, const CSeq_id* = NULL, bool ignore_genomic = false);
149 
150 
151 ///Other utility methods:
152 
153     /// Flipping strand involves flipping all placements, and insts.
154     /// Note: for insertions, the placement(s) must be dinucleotide where
155     /// insertion occurs, such that semantic correctness is maintained.
156     void FlipStrand(CVariation& v) const;
157 
158     /// Flipping a placement involves flipping strand on the seq-loc and swapping the offsets (and signs).
159     /// If there's a seq-literal, it is reverse-complemented.
160     void FlipStrand(CVariantPlacement& vp) const;
161 
162     /// Flip strand on delta-items and reverse the order.
163     void FlipStrand(CVariation_inst& vi) const;
164 
165     /// Reverse-complement seq-literals, flip strand on the seq-locs.
166     void FlipStrand(CDelta_item& di) const;
167 
168     enum { kMaxAttachSeqLen = 100000} ; //VAR-724
169     /// If have offsets (intronic) or too long, return false;
170     /// else set seq field on the placement and return true.
171     bool AttachSeq(CVariantPlacement& p, TSeqPos max_len = kMaxAttachSeqLen);
172 
173     //Call AttachSeq on every placement; Try to find asserted-type subvariation.
174     //If the asserted sequence is different from the attached reference, add
175     //corresponding exception object to the placement. Return true iff there were no exceptions added
176     bool AttachSeq(CVariation& v, TSeqPos max_len = kMaxAttachSeqLen);
177 
178 
179     CVariantPlacement::TMol GetMolType(const CSeq_id& id);
180 
181     /*!
182      * If placement is not intronic, or alignment is not spliced-seg -> eNotApplicable
183      * Else if variation is intronic but location is not at exon boundary -> eFail
184      * Else -> ePass
185      */
186     enum ETestStatus
187     {
188         eFail,
189         ePass,
190         eNotApplicable,
191     };
192     ETestStatus CheckExonBoundary(const CVariantPlacement& p, const CSeq_align& aln);
193 
194     // Similar to above, except verify exon boundary using exon annotation on refseq transcripts, if exists
195     ETestStatus CheckExonBoundary(const CVariantPlacement& p);
196 
197     /// if placement is invalid SeqLocCheck fails, or offsets out of order, attach VariationException and return true
198     /// otherwise return false;
199     bool CheckPlacement(CVariantPlacement& p);
200 
201     /// if variation.data contains a seq-literal with non-ACGT residues, attach VariationException to the first placement
202     /// (if exists) and return true. otherwise return false;
203     bool CheckAmbiguitiesInLiterals(CVariation& v);
204 
205     /// Calculate upstream (first) and downstream(second) flanks for loc
206     struct SFlankLocs
207     {
208         CRef<CSeq_loc> upstream;
209         CRef<CSeq_loc> downstream;
210     };
211     SFlankLocs CreateFlankLocs(const CSeq_loc& loc, TSeqPos len);
212 
213 
214 /// Methods to compute properties
215 
216     /*!
217      * Fill location-specific variant-properties for a placement.
218      * Attach related GeneID(s) to placement with respect to which the properties are defined.
219      */
220     void SetPlacementProperties(CVariantPlacement& placement);
221 
222     /*
223      * Set placement-specific variant properties for a variation by applying
224      * x_SetVariantProperties(...) on each found placement and collapse those
225      * onto parent VariationProperties
226      */
227     void SetVariantProperties(CVariation& v);
228 
229 
230     /// Supported SO-terms
231     enum ESOTerm
232     {
233         //location-specific terms
234         eSO_intergenic_variant      =1628,
235         eSO_2KB_upstream_variant    =1636,
236         eSO_500B_downstream_variant =1634,
237         eSO_splice_donor_variant    =1575,
238         eSO_splice_acceptor_variant =1574,
239         eSO_intron_variant          =1627,
240         eSO_5_prime_UTR_variant     =1623,
241         eSO_3_prime_UTR_variant     =1624,
242         eSO_coding_sequence_variant =1580,
243         eSO_nc_transcript_variant   =1619,
244         eSO_initiator_codon_variant =1582,
245         eSO_terminator_codon_variant=1590,
246 
247         //variation-specific terms
248         eSO_synonymous_variant      =1819,
249         eSO_missense_variant        =1583,
250         eSO_inframe_indel           =1820,
251         eSO_frameshift_variant      =1589,
252         eSO_stop_gained             =1587,
253         eSO_stop_lost               =1578
254     };
255     typedef vector<ESOTerm> TSOTerms;
256     void AsSOTerms(const CVariantProperties& p, TSOTerms& terms);
257     static string AsString(ESOTerm term);
258 
259 
260     /// Find location properties based on alignment.
261     void FindLocationProperties(const CSeq_align& transcript_aln,
262                                 const CSeq_loc& query_loc,
263                                 TSOTerms& terms);
264 
265 
266     static void s_FindLocationProperties(CConstRef<CSeq_loc> rna_loc,
267                                          CConstRef<CSeq_loc> cds_loc,
268                                          const CSeq_loc& query_loc,
269                                          TSOTerms& terms);
270 
271 
272     static TSeqPos s_GetLength(const CVariantPlacement& p, CScope* scope);
273 
274     /// If at any level in variation-set all variations have all same placements, move them to the parent level.
275     static void s_FactorOutPlacements(CVariation& v);
276 
277     /*!
278      * If this variation has placements defined, return it; otherwise, recursively try
279      * parent all the way to the root; return NULL if nothing found.
280      * Precondition: root Variation must have been indexed (links to parents defined as necessary)
281      */
282     static const CVariation::TPlacements* s_GetPlacements(const CVariation& v);
283 
284 
285     /// Find attached consequence variation in v that corresponds to p (has same seq-id).
286     static CConstRef<CVariation> s_FindConsequenceForPlacement(const CVariation& v, const CVariantPlacement& p);
287 
288 
289     /// Length up to last position of the last exon (i.e. not including polyA)
290     /// If neither is annotated, return normal length
291     TSeqPos GetEffectiveTranscriptLength(const CBioseq_Handle& bsh);
292 
293 
294     CRef<CSeq_literal> GetLiteralAtLoc(const CSeq_loc& loc);
295 private:
296 
297 
298     CRef<CVariantPlacement> x_Remap(const CVariantPlacement& p, CSeq_loc_Mapper& mapper);
299 
300     void ChangeToDelins(CVariation& v);
301 
302     void x_SetVariantProperties(CVariantProperties& p, const CVariation_inst& vi, const CSeq_loc& loc);
303     void x_SetVariantPropertiesForIntronic(CVariantPlacement& p, int offset, const CSeq_loc& loc, CBioseq_Handle& bsh);
304 
305     void x_ChangeToDelins(CVariation& v);
306 
307     void x_InferNAfromAA(CVariation& v, TAA2NAFlags flags);
308 
309     void x_TranslateNAtoAA(CVariation& prot_variation);
310 
311     CRef<CVariation> x_CreateUnknownVariation(const CSeq_id& id, CVariantPlacement::TMol mol);
312 
313     //return iupacna or ncbieaa literals
314     CRef<CSeq_literal> x_GetLiteralAtLoc(const CSeq_loc& loc);
315 
316     CConstRef<CSeq_literal> x_FindOrCreateLiteral(const CVariation& v);
317 
318     /*
319      * If we are flipping an insertion, we must verify that the location is a multinucleotide (normally dinucleotide)
320      * (in this case insertion is interpreted as "insert-between", and can be strand-flipped).
321      * It is impossible, however, to correctly flip the strand of insert-before-a-point variation, because
322      * "before" would become "after".
323      */
324     static bool s_IsInstStrandFlippable(const CVariation& v, const CVariation_inst& inst);
325 
326     /// join two seq-literals
327     static CRef<CSeq_literal> s_CatLiterals(const CSeq_literal& a, const CSeq_literal& b);
328 
329     /// insert seq-literal payload into ref before pos (pos=0 -> prepend; pos=ref.len -> append)
330     static CRef<CSeq_literal> s_SpliceLiterals(const CSeq_literal& payload, const CSeq_literal& ref, TSeqPos pos);
331 
332     static void s_AttachGeneIDdbxref(CVariantPlacement& p, int gene_id);
333 
334     static void s_UntranslateProt(const string& prot_str, vector<string>& codons);
335 
336     static size_t s_CountMatches(const string& a, const string& b);
337 
338     void s_CalcPrecursorVariationCodon(
339             const string& codon_from, //codon on cDNA
340             const string& prot_to,    //missense/nonsense AA
341             vector<string>& codons_to);           //calculated variation-codon
342 
343     static string s_CollapseAmbiguities(const vector<string>& seqs);
344 
345     /*!
346      * Apply offsets to the variation's location (variation must be inst)
347      * E.g. the original variation could be a transcript location with offsets specifying intronic location.
348      * After original transcript lociation is remapped, the offsets are to be applied to the remapped location
349      * to get absolute offset-free genomic location.
350      */
351     static void s_ResolveIntronicOffsets(CVariantPlacement& p);
352 
353     /*!
354      * If start|stop don't fall into an exon, adjust start|stop to the closest exon boundary and add offset to inst.
355      * This is to be applied before remapping a genomic variation to transcript coordinates
356      */
357     static void s_AddIntronicOffsets(CVariantPlacement& p, const CSpliced_seg& ss, CScope* scope);
358 
359     /*!
360      * Extend delins to specified location (adjust location and delta)
361      *
362      * Precondition:
363      *    -variation must be a inst-type normalized delins (via x_ChangeToDelins)
364      *    -loc must be a superset of variation's location.
365      */
366     void x_AdjustDelinsToInterval(CVariation& v, const CSeq_loc& loc);
367 
368 
369 
370     /*!
371      * Call s_GetPlacements and find first seq-literal in any of them.
372      */
373     static const CConstRef<CSeq_literal> s_FindFirstLiteral(const CVariation& v);
374 
375     /*
376      * Find a sibling variation with with observation=asserted;
377      * if not found, recursively try parent, or return null if root.
378      * Note: Assumes CVariation::Index has been called.
379      */
380     static const CConstRef<CSeq_literal> s_FindAssertedLiteral(const CVariation& v);
381 
382 
383     CRef<CVariation> x_AsVariation(const CVariation_ref& vr);
384     CRef<CVariation_ref> x_AsVariation_ref(const CVariation& v, const CVariantPlacement& p);
385 
386 
387     /*
388      * In variation-ref the intronic offsets are encoded as last and/or first inst delta-items.
389      * In Variation the intronic offsets are part of the placement. After creating a variation
390      * from a variation-ref we need to migrate the offsets from inst into placement.
391      */
392     static void s_ConvertInstOffsetsToPlacementOffsets(CVariation& v, CVariantPlacement& p);
393     static void s_AddInstOffsetsFromPlacementOffsets(CVariation_inst& vi, const CVariantPlacement& p);
394 
395 
396 private:
397 
398     /// Cache seq-data in the CDS regions and the cds features by location
399     class CCdregionIndex
400     {
401     public:
402         struct SCdregion
403         {
404             CConstRef<CSeq_feat> cdregion_feat;
operator <variation::CVariationUtil::CCdregionIndex::SCdregion405             bool operator<(const SCdregion& other) const
406             {
407                 return !this->cdregion_feat ? false
408                      : !other.cdregion_feat ? true
409                      : *this->cdregion_feat < *other.cdregion_feat;
410             }
411         };
412         typedef vector<SCdregion> TCdregions;
413 
CCdregionIndex(CScope & scope,TOptions options)414         CCdregionIndex(CScope& scope, TOptions options)
415           : m_scope(&scope)
416           , m_options(options)
417         {}
418 
419         void Get(const CSeq_loc& loc, TCdregions& cdregions);
420 
421         //return NULL if not cached in its entirety
422         CRef<CSeq_literal> GetCachedLiteralAtLoc(const CSeq_loc& loc);
423 
Clear()424         void Clear()
425         {
426             m_data.clear();
427             m_seq_data_map.clear();
428         }
429     private:
430 
431         struct SSeqData
432         {
433             mutable CRef<CSeq_loc_Mapper> mapper; //will map original location consisting of merged exon locations on the seq_data coordinate system
434             string seq_data;
435         };
436         typedef map<CSeq_id_Handle, SSeqData> TSeqDataMap;
437         typedef CRangeMap<TCdregions, TSeqPos> TRangeMap;
438         typedef map<CSeq_id_Handle, TRangeMap> TIdRangeMap;
439 
440         void x_Index(const CSeq_id_Handle& idh);
441         void x_CacheSeqData(const CSeq_loc& loc, const CSeq_id_Handle& idh);
442 
443         CRef<CScope> m_scope;
444         TIdRangeMap m_data;
445         TSeqDataMap m_seq_data_map;
446         TOptions m_options;
447     };
448 
449     /// Given a seq-loc, compute CVariantProperties::TGene_location from annotation.
450     /// Precompute for the whole sequence and keep the cache to avoid objmgr annotation
451     /// lookups on every call.
452     class CVariantPropertiesIndex
453     {
454     public:
455         typedef pair<int, CVariantProperties::TGene_location> TGeneIDAndProp;
456         typedef vector<TGeneIDAndProp> TGeneIDAndPropVector;
457 
CVariantPropertiesIndex(CScope & scope)458         CVariantPropertiesIndex(CScope& scope)
459           : m_scope(&scope)
460         {}
461 
462         void GetLocationProperties(const CSeq_loc& loc, TGeneIDAndPropVector& v);
Clear()463         void Clear()
464         {
465             m_loc2prop.clear();
466         }
467 
468         typedef pair<CRef<CSeq_loc>, CRef<CSeq_loc> > TLocsPair; //5' and 3' respectively
469         static TLocsPair s_GetStartAndStopCodonsLocs(const CSeq_loc& cds_loc);
470         static TLocsPair s_GetUTRLocs(const CSeq_loc& cds_loc, const CSeq_loc& parent_loc);
471         static TLocsPair s_GetNeighborhoodLocs(const CSeq_loc& gene_loc, TSeqPos max_pos);
472         static TLocsPair s_GetIntronsAndSpliceSiteLocs(const CSeq_loc& rna_loc);
473         static int s_GetGeneID(const CMappedFeat& mf, feature::CFeatTree& ft);
474         static int s_GetGeneIdForProduct(CBioseq_Handle bsh);
475 
476     private:
477         void x_Index(const CSeq_id_Handle& idh);
478         void x_Add(const CSeq_loc& loc, int gene_id, CVariantProperties::TGene_location prop);
479 
480         typedef CRangeMap<TGeneIDAndPropVector, TSeqPos> TRangeMap;
481         typedef map<CSeq_id_Handle, TRangeMap> TIdRangeMap;
482 
483         CRef<CScope> m_scope;
484         TIdRangeMap m_loc2prop;
485     };
486 
487 
488 private:
489     CRef<CScope> m_scope;
490     CVariantPropertiesIndex m_variant_properties_index;
491     CCdregionIndex m_cdregion_index;
492 };
493 
494 }
495 
496 END_NCBI_SCOPE
497 #endif
498