1 /* $Id: Seq_align.hpp 604149 2020-03-23 18:36:28Z mozese2 $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: .......
27 *
28 * File Description:
29 * .......
30 *
31 * Remark:
32 * This code was originally generated by application DATATOOL
33 * using specifications from the data definition file
34 * 'seqalign.asn'.
35 */
36
37 #ifndef OBJECTS_SEQALIGN_SEQ_ALIGN_HPP
38 #define OBJECTS_SEQALIGN_SEQ_ALIGN_HPP
39
40
41 // generated includes
42 #include <objects/seqalign/Seq_align_.hpp>
43 #include <objects/seqloc/Na_strand.hpp>
44 #include <util/range_coll.hpp>
45
46 // generated classes
47
48 BEGIN_NCBI_SCOPE
49
50 BEGIN_objects_SCOPE /// namespace ncbi::objects::
51
52 class CSeq_id;
53
54 class NCBI_SEQALIGN_EXPORT CSeq_align : public CSeq_align_Base
55 {
56 typedef CSeq_align_Base Tparent;
57 public:
58 /// enum controlling known named scores
59 ///
60 /// A word on scores, since definitions are quite important: Scores here
61 /// are to be represented as a standard set of computations, comparable
62 /// across alignment generation algorithms. The class CScoreBuilder holds
63 /// reference implementations for most of the scores below (modulo
64 /// interpretation difficulties with a score named 'score'). The intended
65 /// meanings of scores are as follows:
66 ///
67 /// 'score'
68 /// generic score, definable by any algorithm; not comparable
69 /// across algorithms.
70 ///
71 /// 'bit_score'
72 /// BLAST-specific bit score; reference implementation in the BLAST
73 /// code, which is also exposed in CScoreBuilder.
74 ///
75 /// 'e_value'
76 /// BLAST-specific e-value; reference implementation in the BLAST
77 /// code, which is also exposed in CScoreBuilder.
78 ///
79 /// alignment length
80 /// not a score per se, but a useful metric nonetheless. This is the
81 /// sum of all aligned segments and all gaps; this excludes introns and
82 /// discontinuities
83 ///
84 /// percent identity
85 /// percent identity, computed as *UNGAPPED* percent identity. That
86 /// is, the computation is strictly (matches) / (matches + mismatches)
87 /// NOTE: there are, historically, at least four separate methods of
88 /// computation for this:
89 /// 1. C++ toolkit (CScoreBuilder): ungapped percent identity:
90 /// (matches) / (matches + mismatches)
91 /// 2. BLAST: gapped percent identity:
92 /// (matches) / (alignment length)
93 /// 3. gbDNA and contig processes: modified alignment length to
94 /// count each gap as 1 base long always
95 /// (matches) / (matches + mismatches + gap count)
96 /// 4. Spidey: compute identity per coverage:
97 /// (matches) / (stop - start + 1)
98 /// Several of these are provided directly, as:
99 /// pct_identity_gap
100 /// pct_identity_ungap
101 /// pct_identity_gapopen_only
102 ///
103 /// alignable bases
104 /// More a concept than a score. The alignable region of a sequence,
105 /// as defined by the underlying sequencing technology or biology -
106 /// that is, the region of a sequence noted as high quality minus a
107 /// poly-A tail in the case of a cDNA. This quantity can be used in
108 /// computing more appropriate coverage scores.
109 ///
110 /// 'pct_coverage'
111 /// Percent of a sequence (query) aligned to another (subject), minus
112 /// poly-A tail:
113 /// (matches + mismatches) / (query length - polyA tail)
114 ///
115 /// alignable percent coverage
116 /// (NB: no named score yet)
117 /// Percent of the alignable region actually aligned to a subject
118 /// (matches + mismatches in alignable region) / (length of alignable region)
119 ///
120 /// exonic coverage
121 /// (NB: no named score yet)
122 /// Measure of the percent of the query represented in a set of exons
123 /// (sum of lengths of exon on query) / (query length - polyA)
124 ///
125 /// @warning The order of this enumeration and the sc_ScoreNames,
126 /// sc_ScoreHelpText, and sc_IsInteger arrays must correspond!
127 ///
128 enum EScoreType {
129 //< generic 'score'
130 eScore_Score,
131
132 //< blast 'score'
133 eScore_Blast,
134
135 //< blast-style 'bit_score'
136 eScore_BitScore,
137
138 //< blast-style 'e_value'
139 eScore_EValue,
140
141 //< alignment length (align_length)
142 eScore_AlignLength,
143
144 //< count of identities (num_ident)
145 eScore_IdentityCount,
146
147 //< count of positives (num_positives); protein-to-DNA score
148 eScore_PositiveCount,
149
150 //< count of negatives (num_negatives)
151 eScore_NegativeCount,
152
153 //< count of mismatches (num_mismatch)
154 eScore_MismatchCount,
155
156 //< number of gap bases in the alignment
157 //< (= length of all gap segments)
158 eScore_GapCount,
159
160 //< percent identity (0.0-100.0) (pct_identity)
161 //< NOTE: there are multiple ways to express this; the default is to
162 //< consider a gap as a mismatch, as noted above.
163 eScore_PercentIdentity_Gapped,
164 eScore_PercentIdentity_Ungapped,
165 eScore_PercentIdentity_GapOpeningOnly,
166
167 //< percent coverage (0.0-100.0) (pct_coverage)
168 eScore_PercentCoverage,
169
170 //< blast-style 'sum_e'
171 eScore_SumEValue,
172
173 //< Composition-adjustment method from BLAST (comp_adjustment_method)
174 eScore_CompAdjMethod,
175
176 //< percent coverage (0.0-100.0) of high quality region (high_quality_pct_coverage)
177 eScore_HighQualityPercentCoverage,
178
179 //< Scores calculated by Splign
180 eScore_Matches,
181 eScore_OverallIdentity,
182 eScore_Splices,
183 eScore_ConsensusSplices,
184 eScore_ProductCoverage,
185 eScore_ExonIdentity,
186
187 //< generic percent identity is an alias for gapped percent identity
188 //< (i.e., BLAST-style percent identity)
189 eScore_PercentIdentity = eScore_PercentIdentity_Gapped
190 };
191
192 typedef pair<TSeqPos, TSeqPos> TLengthRange;
193
194 typedef map<string, EScoreType> TScoreNameMap;
195
196 struct NCBI_SEQALIGN_EXPORT SIndel {
197 TSeqPos genomic_pos;
198 TDim row;
199 TSeqPos length;
200
SIndelCSeq_align::SIndel201 SIndel(TSeqPos p = 0, TDim r = 0, TSeqPos l = 0)
202 : genomic_pos(p), row(r), length(l)
203 {}
204
205 string AsString() const;
206 };
207
208 /// constructor
209 CSeq_align(void);
210 /// destructor
211 ~CSeq_align(void);
212
213 /// Validatiors
214 TDim CheckNumRows(void) const;
215 void Validate (bool full_test = false) const;
216
217 /// GetSeqRange
218 /// NB: On a Spliced-seg, in case the product-type is protein,
219 /// these only return the amin part of Prot-pos. The frame is
220 /// ignored.
221 CRange<TSeqPos> GetSeqRange(TDim row) const;
222 TSeqPos GetSeqStart(TDim row) const;
223 TSeqPos GetSeqStop (TDim row) const;
224
225 /// Get strand (the first one if segments have different strands).
226 ENa_strand GetSeqStrand(TDim row) const;
227
228 /// Get seq-id (the first one if segments have different ids).
229 /// Throw exception if row is invalid.
230 const CSeq_id& GetSeq_id(TDim row) const;
231
232 /// Retrieves the total number of gaps in the given row an alignment;
233 /// all gaps by default
234 /// @throws CSeqalignException if alignment type is not supported
235 TSeqPos GetTotalGapCount(TDim row = -1) const;
236 TSeqPos GetTotalGapCountWithinRange(const TSeqRange &range,
237 TDim row = -1) const;
238 TSeqPos GetTotalGapCountWithinRanges(const CRangeCollection<TSeqPos> &ranges,
239 TDim row = -1) const;
240
241 /// Retrieves the number of gap openings in a given row in an alignment
242 /// (ignoring how many gaps are in the gapped region); all gaps by default
243 /// @throws CSeqalignException if alignment type is not supported
244 TSeqPos GetNumGapOpenings(TDim row = -1) const;
245 TSeqPos GetNumGapOpeningsWithinRange(const TSeqRange &range,
246 TDim row = -1) const;
247 TSeqPos GetNumGapOpeningsWithinRanges(const CRangeCollection<TSeqPos> &ranges,
248 TDim row = -1) const;
249
250 /// Retrieves the number of times a given row shifts frames; i.e. the number
251 /// of gaps with a length that is not a multiple of 3.
252 /// @throws CSeqalignException if alignment type is not supported
253 TSeqPos GetNumFrameshifts(TDim row = -1) const;
254 TSeqPos GetNumFrameshiftsWithinRange(const TSeqRange &range,
255 TDim row = -1) const;
256 TSeqPos GetNumFrameshiftsWithinRanges(const CRangeCollection<TSeqPos> &ranges,
257 TDim row = -1) const;
258
259 /// Retrieves descriptions of all indels on a given row
260 /// @throws CSeqalignException if alignment type is not supported
261 vector<SIndel> GetIndels(TDim row = -1) const;
262 vector<SIndel> GetIndelsWithinRange(const TSeqRange &range,
263 TDim row = -1) const;
264 vector<SIndel> GetIndelsWithinRanges(const CRangeCollection<TSeqPos> &ranges,
265 TDim row = -1) const;
266
267 /// Retrieves descriptions of all frameshifts on a given row; i.e.
268 /// all gaps with a length that is not a multiple of 3.
269 /// @throws CSeqalignException if alignment type is not supported
270 vector<SIndel> GetFrameshifts(TDim row = -1) const;
271 vector<SIndel> GetFrameshiftsWithinRange(const TSeqRange &range,
272 TDim row = -1) const;
273 vector<SIndel> GetFrameshiftsWithinRanges(const CRangeCollection<TSeqPos> &ranges,
274 TDim row = -1) const;
275
276 /// Retrieves descriptions of all non-frameshift indels on a given row; i.e.
277 /// all gaps with a length that is a multiple of 3.
278 /// @throws CSeqalignException if alignment type is not supported
279 vector<SIndel> GetNonFrameshifts(TDim row = -1) const;
280 vector<SIndel> GetNonFrameshiftsWithinRange(const TSeqRange &range,
281 TDim row = -1) const;
282 vector<SIndel> GetNonFrameshiftsWithinRanges(const CRangeCollection<TSeqPos> &ranges,
283 TDim row = -1) const;
284
285 /// Retrieves the locations of aligned bases in the given row, excluding
286 /// gaps and incontinuities
287 /// @throws CSeqalignException if alignment type is not supported
288 CRangeCollection<TSeqPos> GetAlignedBases(TDim row) const;
289
290 /// Get the length of this alignment. This length corresponds to the score
291 /// 'align_length'. By default, this function computes an alignment length
292 /// including all gap segments.
293 TSeqPos GetAlignLength(bool include_gaps = true) const;
294
295 /// Get the length of this alignment within a specified range
296 /// By default, this function computes an alignment length
297 /// including all gap segments.
298 TSeqPos GetAlignLengthWithinRange(const TSeqRange &range,
299 bool include_gaps = true) const;
300
301 /// Get the length of this alignment within a specified range
302 /// By default, this function computes an alignment length
303 /// including all gap segments.
304 TSeqPos GetAlignLengthWithinRanges(const CRangeCollection<TSeqPos> &ranges,
305 bool include_gaps = true) const;
306
307 double AlignLengthRatio() const;
308
309 /// Get score
310 bool GetNamedScore(const string& id, int &score) const;
311 bool GetNamedScore(const string& id, double &score) const;
312
313 bool GetNamedScore(EScoreType type, int &score) const;
314 bool GetNamedScore(EScoreType type, double &score) const;
315
316 void SetNamedScore(const string& id, int score);
317 void SetNamedScore(const string& id, double score);
318
319 void SetNamedScore(EScoreType type, int score);
320 void SetNamedScore(EScoreType type, double score);
321
322 void ResetNamedScore(const string& name);
323 void ResetNamedScore(EScoreType type);
324
325
326 /// Reverse the segments' orientation
327 /// NOTE: currently *only* works for dense-seg
328 void Reverse(void);
329
330 /// Swap the position of two rows in the alignment
331 /// NOTE: currently *only* works for dense-seg & disc
332 void SwapRows(TDim row1, TDim row2);
333
334 /// Create a Dense-seg from a Std-seg
335 /// Used by AlnMgr to handle nucl2prot alignments
336 //
337
338 /// NOTE: Here we assume that the same rows on different segments
339 /// contain the same sequence. Without access to OM we can only check
340 /// if the ids are the same via SerialEquals, and we throw an exception
341 /// if not equal. Since the same sequence can be represented with a
342 /// different type of seq-id, we provide an optional callback mechanism
343 /// to compare id1 and id2, and if both resolve to the same sequence
344 /// and id2 is preferred, to SerialAssign it to id1. Otherwise, again,
345 /// an exception should be thrown.
346 struct SSeqIdChooser : CObject
347 {
348 virtual void ChooseSeqId(CSeq_id& id1, const CSeq_id& id2) = 0;
349 };
350 CRef<CSeq_align> CreateDensegFromStdseg(SSeqIdChooser* SeqIdChooser = 0) const;
351
352 CRef<CSeq_align> CreateDensegFromDisc(SSeqIdChooser* SeqIdChooser = 0) const;
353
354 /// Create a Dense-seg with widths from Dense-seg of nucleotides
355 /// Used by AlnMgr to handle translated nucl2nucl alignments
356 /// IMPORTANT NOTE: Do *NOT* use for alignments containing proteins;
357 /// the code will not check for this
358 CRef<CSeq_align> CreateTranslatedDensegFromNADenseg(void) const;
359
360 /// Split the alignment at any discontinuity greater than threshold;
361 /// populate aligns list with new alignments. If alignment contains no long
362 /// discontinuities, populate aligns list with a singleton reference
363 /// to self.
364 /// Splitting works only for Denseg and Disc; for all other segment types, this
365 /// function simply populates aligns with a singleton reference to self.
366 /// This function is only implemented for pairwise alignments; it throws an
367 /// exception if Dim is more than 2.
368 void SplitOnLongDiscontinuity(list< CRef<CSeq_align> >& aligns,
369 TSeqPos discontinuity_threshold) const;
370
371
372 /// Offset row's coords
373 void OffsetRow(TDim row, TSignedSeqPos offset);
374
375 /// @deprecated (use sequence::RemapAlignToLoc())
376 /// @sa RemapAlignToLoc
377 NCBI_DEPRECATED void RemapToLoc(TDim row,
378 const CSeq_loc& dst_loc,
379 bool ignore_strand = false);
380
381 CRef<CSeq_loc> CreateRowSeq_loc(TDim row) const;
382
383 TLengthRange GapLengthRange() const;
384
385 TLengthRange IntronLengthRange() const;
386
387 TLengthRange ExonLengthRange() const;
388
389 /// Find extension by type in ext container.
390 /// @param ext_type
391 /// String id of the extension to find.
392 /// @result
393 /// User-object of the requested type or NULL.
394 CConstRef<CUser_object> FindExt(const string& ext_type) const;
395 /// Non-const version of FindExt().
396 CRef<CUser_object> FindExt(const string& ext_type);
397
398 static const TScoreNameMap &ScoreNameMap();
399
400 static string ScoreName(EScoreType score);
401
402 static string HelpText(EScoreType score);
403
404 static bool IsIntegerScore(EScoreType score);
405
406 protected:
407 /// retrieve a named score object
408 CConstRef<CScore> x_GetNamedScore(const string& name) const;
409 CRef<CScore> x_SetNamedScore(const string& name);
410
411 private:
412
413 /// Prohibit copy constructor and assignment operator
414 CSeq_align(const CSeq_align& value);
415 CSeq_align& operator=(const CSeq_align& value);
416
417 /// Create a partial alignment containing the specified range of segments.
418 /// Can only be called for Denseg alignments.
419 CRef<CSeq_align> x_CreateSubsegAlignment(int from, int to) const;
420 };
421
422
423 /// Remap seq-align row to the seq-loc.
424 /// Treats the given row as being relative to the location, maps it
425 /// to the sequence(s) referenced by this location.
426 /// @param align
427 /// The seq-align object to be mapped (the object will be modified!).
428 /// @param row
429 /// Row to be mapped.
430 /// @param loc
431 /// Seq-loc to which the row should be mapped.
432 /// @result
433 /// Reference to the new seq-align with the mapped row.
434 NCBI_SEQALIGN_EXPORT
435 CRef<CSeq_align> RemapAlignToLoc(const CSeq_align& align,
436 CSeq_align::TDim row,
437 const CSeq_loc& loc);
438
439
440 /////////////////// CSeq_align inline methods
441
442 // constructor
443 inline
CSeq_align(void)444 CSeq_align::CSeq_align(void)
445 {
446 }
447
448
449 /////////////////// end of CSeq_align inline methods
450
451
452 END_objects_SCOPE /// namespace ncbi::objects::
453
454 END_NCBI_SCOPE
455
456 #endif /// OBJECTS_SEQALIGN_SEQ_ALIGN_HPP
457