1 /* $Id: Seq_align.hpp 604149 2020-03-23 18:36:28Z mozese2 $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  .......
27  *
28  * File Description:
29  *   .......
30  *
31  * Remark:
32  *   This code was originally generated by application DATATOOL
33  *   using specifications from the data definition file
34  *   'seqalign.asn'.
35  */
36 
37 #ifndef OBJECTS_SEQALIGN_SEQ_ALIGN_HPP
38 #define OBJECTS_SEQALIGN_SEQ_ALIGN_HPP
39 
40 
41 // generated includes
42 #include <objects/seqalign/Seq_align_.hpp>
43 #include <objects/seqloc/Na_strand.hpp>
44 #include <util/range_coll.hpp>
45 
46 // generated classes
47 
48 BEGIN_NCBI_SCOPE
49 
50 BEGIN_objects_SCOPE /// namespace ncbi::objects::
51 
52 class CSeq_id;
53 
54 class NCBI_SEQALIGN_EXPORT CSeq_align : public CSeq_align_Base
55 {
56     typedef CSeq_align_Base Tparent;
57 public:
58     /// enum controlling known named scores
59     ///
60     /// A word on scores, since definitions are quite important:  Scores here
61     /// are to be represented as a standard set of computations, comparable
62     /// across alignment generation algorithms.  The class CScoreBuilder holds
63     /// reference implementations for most of the scores below (modulo
64     /// interpretation difficulties with a score named 'score').  The intended
65     /// meanings of scores are as follows:
66     ///
67     /// 'score'
68     ///     generic score, definable by any algorithm; not comparable
69     ///     across algorithms.
70     ///
71     /// 'bit_score'
72     ///     BLAST-specific bit score; reference implementation in the BLAST
73     ///     code, which is also exposed in CScoreBuilder.
74     ///
75     /// 'e_value'
76     ///     BLAST-specific e-value; reference implementation in the BLAST
77     ///     code, which is also exposed in CScoreBuilder.
78     ///
79     /// alignment length
80     ///     not a score per se, but a useful metric nonetheless.  This is the
81     ///     sum of all aligned segments and all gaps; this excludes introns and
82     ///     discontinuities
83     ///
84     /// percent identity
85     ///     percent identity, computed as *UNGAPPED* percent identity.  That
86     ///     is, the computation is strictly (matches) / (matches + mismatches)
87     ///     NOTE: there are, historically, at least four separate methods of
88     ///     computation for this:
89     ///         1.  C++ toolkit (CScoreBuilder): ungapped percent identity:
90     ///                 (matches) / (matches + mismatches)
91     ///         2.  BLAST: gapped percent identity:
92     ///                 (matches) / (alignment length)
93     ///         3.  gbDNA and contig processes: modified alignment length to
94     ///             count each gap as 1 base long always
95     ///                 (matches) / (matches + mismatches + gap count)
96     ///         4.  Spidey: compute identity per coverage:
97     ///                 (matches) / (stop - start + 1)
98     ///     Several of these are provided directly, as:
99     ///         pct_identity_gap
100     ///         pct_identity_ungap
101     ///         pct_identity_gapopen_only
102     ///
103     ///  alignable bases
104     ///     More a concept than a score.  The alignable region of a sequence,
105     ///     as defined by the underlying sequencing technology or biology -
106     ///     that is, the region of a sequence noted as high quality minus a
107     ///     poly-A tail in the case of a cDNA.  This quantity can be used in
108     ///     computing more appropriate coverage scores.
109     ///
110     /// 'pct_coverage'
111     ///     Percent of a sequence (query) aligned to another (subject), minus
112     ///     poly-A tail:
113     ///         (matches + mismatches) / (query length - polyA tail)
114     ///
115     /// alignable percent coverage
116     ///     (NB: no named score yet)
117     ///     Percent of the alignable region actually aligned to a subject
118     ///         (matches + mismatches in alignable region) / (length of alignable region)
119     ///
120     /// exonic coverage
121     ///     (NB: no named score yet)
122     ///     Measure of the percent of the query represented in a set of exons
123     ///         (sum of lengths of exon on query) / (query length - polyA)
124     ///
125     /// @warning The order of this enumeration and the sc_ScoreNames,
126     ///     sc_ScoreHelpText, and sc_IsInteger arrays must correspond!
127     ///
128     enum EScoreType {
129         //< generic 'score'
130         eScore_Score,
131 
132         //< blast 'score'
133         eScore_Blast,
134 
135         //< blast-style 'bit_score'
136         eScore_BitScore,
137 
138         //< blast-style 'e_value'
139         eScore_EValue,
140 
141         //< alignment length (align_length)
142         eScore_AlignLength,
143 
144         //< count of identities (num_ident)
145         eScore_IdentityCount,
146 
147         //< count of positives (num_positives); protein-to-DNA score
148         eScore_PositiveCount,
149 
150         //< count of negatives (num_negatives)
151         eScore_NegativeCount,
152 
153         //< count of mismatches (num_mismatch)
154         eScore_MismatchCount,
155 
156         //< number of gap bases in the alignment
157         //< (= length of all gap segments)
158         eScore_GapCount,
159 
160         //< percent identity (0.0-100.0) (pct_identity)
161         //< NOTE: there are multiple ways to express this; the default is to
162         //< consider a gap as a mismatch, as noted above.
163         eScore_PercentIdentity_Gapped,
164         eScore_PercentIdentity_Ungapped,
165         eScore_PercentIdentity_GapOpeningOnly,
166 
167         //< percent coverage (0.0-100.0) (pct_coverage)
168         eScore_PercentCoverage,
169 
170         //< blast-style 'sum_e'
171         eScore_SumEValue,
172 
173         //< Composition-adjustment method from BLAST (comp_adjustment_method)
174         eScore_CompAdjMethod,
175 
176         //< percent coverage (0.0-100.0) of high quality region (high_quality_pct_coverage)
177         eScore_HighQualityPercentCoverage,
178 
179         //< Scores calculated by Splign
180         eScore_Matches,
181         eScore_OverallIdentity,
182         eScore_Splices,
183         eScore_ConsensusSplices,
184         eScore_ProductCoverage,
185         eScore_ExonIdentity,
186 
187         //< generic percent identity is an alias for gapped percent identity
188         //< (i.e., BLAST-style percent identity)
189         eScore_PercentIdentity = eScore_PercentIdentity_Gapped
190     };
191 
192     typedef pair<TSeqPos, TSeqPos> TLengthRange;
193 
194     typedef map<string, EScoreType> TScoreNameMap;
195 
196     struct NCBI_SEQALIGN_EXPORT SIndel {
197         TSeqPos genomic_pos;
198         TDim row;
199         TSeqPos length;
200 
SIndelCSeq_align::SIndel201         SIndel(TSeqPos p = 0, TDim r = 0, TSeqPos l = 0)
202         : genomic_pos(p), row(r), length(l)
203         {}
204 
205         string AsString() const;
206     };
207 
208     /// constructor
209     CSeq_align(void);
210     /// destructor
211     ~CSeq_align(void);
212 
213     /// Validatiors
214     TDim CheckNumRows(void)                   const;
215     void Validate    (bool full_test = false) const;
216 
217     /// GetSeqRange
218     /// NB: On a Spliced-seg, in case the product-type is protein,
219     /// these only return the amin part of Prot-pos.  The frame is
220     /// ignored.
221     CRange<TSeqPos> GetSeqRange(TDim row) const;
222     TSeqPos         GetSeqStart(TDim row) const;
223     TSeqPos         GetSeqStop (TDim row) const;
224 
225     /// Get strand (the first one if segments have different strands).
226     ENa_strand      GetSeqStrand(TDim row) const;
227 
228     /// Get seq-id (the first one if segments have different ids).
229     /// Throw exception if row is invalid.
230     const CSeq_id&  GetSeq_id(TDim row) const;
231 
232     /// Retrieves the total number of gaps in the given row an alignment;
233     /// all gaps by default
234     /// @throws CSeqalignException if alignment type is not supported
235     TSeqPos         GetTotalGapCount(TDim row = -1) const;
236     TSeqPos         GetTotalGapCountWithinRange(const TSeqRange &range,
237                                                 TDim row = -1) const;
238     TSeqPos         GetTotalGapCountWithinRanges(const CRangeCollection<TSeqPos> &ranges,
239                                                  TDim row = -1) const;
240 
241     /// Retrieves the number of gap openings in a given row in an alignment
242     /// (ignoring how many gaps are in the gapped region); all gaps by default
243     /// @throws CSeqalignException if alignment type is not supported
244     TSeqPos         GetNumGapOpenings(TDim row = -1) const;
245     TSeqPos         GetNumGapOpeningsWithinRange(const TSeqRange &range,
246                                                  TDim row = -1) const;
247     TSeqPos         GetNumGapOpeningsWithinRanges(const CRangeCollection<TSeqPos> &ranges,
248                                                   TDim row = -1) const;
249 
250     /// Retrieves the number of times a given row shifts frames; i.e. the number
251     /// of gaps with a length that is not a multiple of 3.
252     /// @throws CSeqalignException if alignment type is not supported
253     TSeqPos         GetNumFrameshifts(TDim row = -1) const;
254     TSeqPos         GetNumFrameshiftsWithinRange(const TSeqRange &range,
255                                                  TDim row = -1) const;
256     TSeqPos         GetNumFrameshiftsWithinRanges(const CRangeCollection<TSeqPos> &ranges,
257                                                   TDim row = -1) const;
258 
259     /// Retrieves descriptions of all indels on a given row
260     /// @throws CSeqalignException if alignment type is not supported
261     vector<SIndel>  GetIndels(TDim row = -1) const;
262     vector<SIndel>  GetIndelsWithinRange(const TSeqRange &range,
263                                               TDim row = -1) const;
264     vector<SIndel>  GetIndelsWithinRanges(const CRangeCollection<TSeqPos> &ranges,
265                                                TDim row = -1) const;
266 
267     /// Retrieves descriptions of all frameshifts on a given row; i.e.
268     /// all gaps with a length that is not a multiple of 3.
269     /// @throws CSeqalignException if alignment type is not supported
270     vector<SIndel>  GetFrameshifts(TDim row = -1) const;
271     vector<SIndel>  GetFrameshiftsWithinRange(const TSeqRange &range,
272                                               TDim row = -1) const;
273     vector<SIndel>  GetFrameshiftsWithinRanges(const CRangeCollection<TSeqPos> &ranges,
274                                                TDim row = -1) const;
275 
276     /// Retrieves descriptions of all non-frameshift indels on a given row; i.e.
277     /// all gaps with a length that is a multiple of 3.
278     /// @throws CSeqalignException if alignment type is not supported
279     vector<SIndel>  GetNonFrameshifts(TDim row = -1) const;
280     vector<SIndel>  GetNonFrameshiftsWithinRange(const TSeqRange &range,
281                                               TDim row = -1) const;
282     vector<SIndel>  GetNonFrameshiftsWithinRanges(const CRangeCollection<TSeqPos> &ranges,
283                                                TDim row = -1) const;
284 
285     /// Retrieves the locations of aligned bases in the given row, excluding
286     /// gaps and incontinuities
287     /// @throws CSeqalignException if alignment type is not supported
288     CRangeCollection<TSeqPos> GetAlignedBases(TDim row) const;
289 
290     /// Get the length of this alignment.  This length corresponds to the score
291     /// 'align_length'.  By default, this function computes an alignment length
292     /// including all gap segments.
293     TSeqPos         GetAlignLength(bool include_gaps = true) const;
294 
295     /// Get the length of this alignment within a specified range
296     /// By default, this function computes an alignment length
297     /// including all gap segments.
298     TSeqPos         GetAlignLengthWithinRange(const TSeqRange &range,
299                                               bool include_gaps = true) const;
300 
301     /// Get the length of this alignment within a specified range
302     /// By default, this function computes an alignment length
303     /// including all gap segments.
304     TSeqPos         GetAlignLengthWithinRanges(const CRangeCollection<TSeqPos> &ranges,
305                                                bool include_gaps = true) const;
306 
307     double          AlignLengthRatio() const;
308 
309     /// Get score
310     bool GetNamedScore(const string& id, int &score) const;
311     bool GetNamedScore(const string& id, double &score) const;
312 
313     bool GetNamedScore(EScoreType type, int &score) const;
314     bool GetNamedScore(EScoreType type, double &score) const;
315 
316     void SetNamedScore(const string& id, int score);
317     void SetNamedScore(const string& id, double score);
318 
319     void SetNamedScore(EScoreType type, int score);
320     void SetNamedScore(EScoreType type, double score);
321 
322     void ResetNamedScore(const string& name);
323     void ResetNamedScore(EScoreType    type);
324 
325 
326     /// Reverse the segments' orientation
327     /// NOTE: currently *only* works for dense-seg
328     void Reverse(void);
329 
330     /// Swap the position of two rows in the alignment
331     /// NOTE: currently *only* works for dense-seg & disc
332     void SwapRows(TDim row1, TDim row2);
333 
334     /// Create a Dense-seg from a Std-seg
335     /// Used by AlnMgr to handle nucl2prot alignments
336     //
337 
338     /// NOTE: Here we assume that the same rows on different segments
339     /// contain the same sequence. Without access to OM we can only check
340     /// if the ids are the same via SerialEquals, and we throw an exception
341     /// if not equal. Since the same sequence can be represented with a
342     /// different type of seq-id, we provide an optional callback mechanism
343     /// to compare id1 and id2, and if both resolve to the same sequence
344     /// and id2 is preferred, to SerialAssign it to id1. Otherwise, again,
345     /// an exception should be thrown.
346     struct SSeqIdChooser : CObject
347     {
348         virtual void ChooseSeqId(CSeq_id& id1, const CSeq_id& id2) = 0;
349     };
350     CRef<CSeq_align> CreateDensegFromStdseg(SSeqIdChooser* SeqIdChooser = 0) const;
351 
352     CRef<CSeq_align> CreateDensegFromDisc(SSeqIdChooser* SeqIdChooser = 0) const;
353 
354     /// Create a Dense-seg with widths from Dense-seg of nucleotides
355     /// Used by AlnMgr to handle translated nucl2nucl alignments
356     /// IMPORTANT NOTE: Do *NOT* use for alignments containing proteins;
357     ///                 the code will not check for this
358     CRef<CSeq_align> CreateTranslatedDensegFromNADenseg(void) const;
359 
360     /// Split the alignment at any discontinuity greater than threshold;
361     /// populate aligns list with new alignments. If alignment contains no long
362     /// discontinuities, populate aligns list with a singleton reference
363     /// to self.
364     /// Splitting works only for Denseg and Disc; for all other segment types, this
365     /// function simply populates aligns with a singleton reference to self.
366     /// This function is only implemented for pairwise alignments; it throws an
367     /// exception if Dim is more than 2.
368     void SplitOnLongDiscontinuity(list< CRef<CSeq_align> >& aligns,
369                                   TSeqPos discontinuity_threshold) const;
370 
371 
372     /// Offset row's coords
373     void OffsetRow(TDim row, TSignedSeqPos offset);
374 
375     /// @deprecated (use sequence::RemapAlignToLoc())
376     /// @sa RemapAlignToLoc
377     NCBI_DEPRECATED void RemapToLoc(TDim row,
378                                     const CSeq_loc& dst_loc,
379                                     bool ignore_strand = false);
380 
381     CRef<CSeq_loc> CreateRowSeq_loc(TDim row) const;
382 
383     TLengthRange GapLengthRange() const;
384 
385     TLengthRange IntronLengthRange() const;
386 
387     TLengthRange ExonLengthRange() const;
388 
389     /// Find extension by type in ext container.
390     /// @param ext_type
391     ///   String id of the extension to find.
392     /// @result
393     ///   User-object of the requested type or NULL.
394     CConstRef<CUser_object> FindExt(const string& ext_type) const;
395     /// Non-const version of FindExt().
396     CRef<CUser_object> FindExt(const string& ext_type);
397 
398     static const TScoreNameMap &ScoreNameMap();
399 
400     static string ScoreName(EScoreType score);
401 
402     static string HelpText(EScoreType score);
403 
404     static bool IsIntegerScore(EScoreType score);
405 
406 protected:
407     /// retrieve a named score object
408     CConstRef<CScore> x_GetNamedScore(const string& name) const;
409     CRef<CScore>      x_SetNamedScore(const string& name);
410 
411 private:
412 
413     /// Prohibit copy constructor and assignment operator
414     CSeq_align(const CSeq_align& value);
415     CSeq_align& operator=(const CSeq_align& value);
416 
417     /// Create a partial alignment containing the specified range of segments.
418     /// Can only be called for Denseg alignments.
419     CRef<CSeq_align>  x_CreateSubsegAlignment(int from, int to) const;
420 };
421 
422 
423 /// Remap seq-align row to the seq-loc.
424 /// Treats the given row as being relative to the location, maps it
425 /// to the sequence(s) referenced by this location.
426 /// @param align
427 ///   The seq-align object to be mapped (the object will be modified!).
428 /// @param row
429 ///   Row to be mapped.
430 /// @param loc
431 ///   Seq-loc to which the row should be mapped.
432 /// @result
433 ///   Reference to the new seq-align with the mapped row.
434 NCBI_SEQALIGN_EXPORT
435 CRef<CSeq_align> RemapAlignToLoc(const CSeq_align& align,
436                                  CSeq_align::TDim  row,
437                                  const CSeq_loc&   loc);
438 
439 
440 /////////////////// CSeq_align inline methods
441 
442 // constructor
443 inline
CSeq_align(void)444 CSeq_align::CSeq_align(void)
445 {
446 }
447 
448 
449 /////////////////// end of CSeq_align inline methods
450 
451 
452 END_objects_SCOPE /// namespace ncbi::objects::
453 
454 END_NCBI_SCOPE
455 
456 #endif /// OBJECTS_SEQALIGN_SEQ_ALIGN_HPP
457