1 /* $Id: score_builder_base.cpp 628386 2021-03-30 17:19:29Z mozese2 $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Mike DiCuccio
27 *
28 * File Description:
29 *
30 */
31
32 #include <ncbi_pch.hpp>
33 #include <objtools/alnmgr/score_builder_base.hpp>
34 #include <objtools/alnmgr/alntext.hpp>
35
36 #include <util/sequtil/sequtil_manip.hpp>
37
38 #include <objtools/alnmgr/alnvec.hpp>
39 #include <objtools/alnmgr/pairwise_aln.hpp>
40 #include <objtools/alnmgr/aln_converters.hpp>
41
42 #include <objmgr/objmgr_exception.hpp>
43 #include <objmgr/seq_vector.hpp>
44 #include <objmgr/feat_ci.hpp>
45
46 #include <objects/seqloc/Seq_loc.hpp>
47 #include <objects/seq/Annot_descr.hpp>
48 #include <objects/seq/Annotdesc.hpp>
49 #include <objects/seqalign/Seq_align.hpp>
50 #include <objects/seqalign/Std_seg.hpp>
51 #include <objects/seqalign/Spliced_seg.hpp>
52 #include <objects/seqalign/Spliced_exon.hpp>
53 #include <objects/seqalign/Spliced_exon_chunk.hpp>
54 #include <objects/seqalign/Product_pos.hpp>
55 #include <objects/seqalign/Prot_pos.hpp>
56
57 #include <objmgr/util/sequence.hpp>
58 #include <objects/seqfeat/Org_ref.hpp>
59 #include <objects/seqfeat/OrgName.hpp>
60 #include <objects/seqfeat/Genetic_code_table.hpp>
61
62 BEGIN_NCBI_SCOPE
63 USING_SCOPE(objects);
64
65 /// Default constructor
CScoreBuilderBase()66 CScoreBuilderBase::CScoreBuilderBase()
67 : m_ErrorMode(eError_Throw)
68 , m_SubstMatrixName("BLOSUM62")
69 {
70 }
71
72 /// Destructor
~CScoreBuilderBase()73 CScoreBuilderBase::~CScoreBuilderBase()
74 {
75 }
76
77 /// Get length of intersection between a range and a range collection
78 static inline
s_IntersectionLength(const CRangeCollection<TSeqPos> & ranges,const TSeqRange & range)79 TSeqPos s_IntersectionLength(const CRangeCollection<TSeqPos> &ranges,
80 const TSeqRange &range)
81 {
82 TSeqPos length = 0;
83 ITERATE (CRangeCollection<TSeqPos>, it, ranges) {
84 length += it->IntersectionWith(range).GetLength();
85 }
86 return length;
87 }
88
89 ///
90 /// calculate mismatches and identities in a seq-align
91 ///
92
s_GetNucIdentityMismatch(const vector<string> & data,int * identities,int * mismatches)93 static void s_GetNucIdentityMismatch(const vector<string>& data,
94 int* identities,
95 int* mismatches)
96 {
97 if ( data.empty() ) {
98 return;
99 }
100 size_t rows = data.size();
101 size_t size = data[0].size();
102 for (size_t i = 1; i < rows; ++i ) {
103 if ( data[i].size() != size ) {
104 NCBI_THROW(CSeqalignException, eInvalidInputData,
105 "Rows have different lengths");
106 }
107 }
108 for (size_t a = 0; a < size; ++a) {
109 bool is_mismatch = false;
110 char c = data[0][a];
111 for (size_t b = 1; b < rows; ++b) {
112 if (data[b][a] != c) {
113 is_mismatch = true;
114 break;
115 }
116 }
117
118 if (is_mismatch) {
119 ++(*mismatches);
120 } else {
121 ++(*identities);
122 }
123 }
124 }
125
126
s_GetSplicedSegIdentityMismatch(CScope & scope,const CSeq_align & align,const CRangeCollection<TSeqPos> & ranges,int * identities,int * mismatches)127 static void s_GetSplicedSegIdentityMismatch(CScope& scope,
128 const CSeq_align& align,
129 const CRangeCollection<TSeqPos> &ranges,
130 int* identities,
131 int* mismatches)
132 {
133 ///
134 /// easy route
135 /// use the alignment manager
136 ///
137 TAlnSeqIdIRef id1(new CAlnSeqId(align.GetSeq_id(0)));
138 TAlnSeqIdIRef id2(new CAlnSeqId(align.GetSeq_id(1)));
139 CRef<CPairwiseAln> pairwise(new CPairwiseAln(id1, id2));
140 ConvertSeqAlignToPairwiseAln(*pairwise, align, 0, 1);
141
142 CBioseq_Handle prod_bsh = scope.GetBioseqHandle(align.GetSeq_id(0));
143 CBioseq_Handle genomic_bsh = scope.GetBioseqHandle(align.GetSeq_id(1));
144 if ( !prod_bsh || !genomic_bsh ) {
145 const CSeq_id &failed_id = align.GetSeq_id(genomic_bsh ? 0 : 1);
146 NCBI_THROW(CSeqalignException, eInvalidSeqId,
147 "Can't get sequence data for " + failed_id.AsFastaString() +
148 " in order to count identities/mismatches");
149 }
150
151 CSeqVector prod(prod_bsh, CBioseq_Handle::eCoding_Iupac);
152
153 switch (align.GetSegs().GetSpliced().GetProduct_type()) {
154 case CSpliced_seg::eProduct_type_transcript:
155 {{
156 CSeqVector gen(genomic_bsh, CBioseq_Handle::eCoding_Iupac);
157 ITERATE (CPairwiseAln, it, *pairwise) {
158 const CPairwiseAln::TAlnRng& range = *it;
159 TSeqRange r1(range.GetFirstFrom(), range.GetFirstTo());
160 TSeqRange r2(range.GetSecondFrom(), range.GetSecondTo());
161 string prod_data;
162 prod.GetSeqData(r1.GetFrom(), r1.GetTo() + 1, prod_data);
163 string gen_data;
164 gen.GetSeqData(r2.GetFrom(), r2.GetTo() + 1, gen_data);
165 if (range.IsReversed()) {
166 CSeqManip::ReverseComplement(gen_data,
167 CSeqUtil::e_Iupacna,
168 0, gen_data.size());
169 }
170
171 CRangeCollection<TSeqPos> seg_ranges = ranges;
172 seg_ranges.IntersectWith(r1);
173 ITERATE (CRangeCollection<TSeqPos>, range_it, seg_ranges) {
174 TSeqPos start_offset = range_it->GetFrom() - r1.GetFrom(),
175 end_offset = range_it->GetToOpen() - r1.GetFrom();
176 string::const_iterator pit = prod_data.begin()
177 + start_offset;
178 string::const_iterator pit_end = prod_data.begin()
179 + end_offset;
180 string::const_iterator git = gen_data.begin()
181 + start_offset;
182 string::const_iterator git_end = gen_data.begin()
183 + end_offset;
184
185 for ( ; pit != pit_end && git != git_end; ++pit, ++git)
186 {
187 bool match = (*pit == *git);
188 *identities += match;
189 *mismatches += !match;
190 }
191 }
192 }
193 }}
194 break;
195
196 case CSpliced_seg::eProduct_type_protein:
197 {{
198 int gcode = 1;
199 try {
200 const COrg_ref& org_ref = sequence::GetOrg_ref(genomic_bsh);
201 gcode = org_ref.GetOrgname().GetGcode();
202 }
203 catch (CException&) {
204 }
205 const CTrans_table& tbl = CGen_code_table::GetTransTable(gcode);
206
207 char codon[3];
208 codon[0] = codon[1] = codon[2] = 'N';
209
210 TSeqRange last_r1(0, 0);
211 ITERATE (CPairwiseAln, it, *pairwise) {
212 const CPairwiseAln::TAlnRng& range = *it;
213 TSeqRange r1(range.GetFirstFrom(), range.GetFirstTo());
214 TSeqRange r2(range.GetSecondFrom(), range.GetSecondTo());
215
216 if (last_r1.GetTo() + 1 != r1.GetFrom()) {
217 size_t i = last_r1.GetTo() + 1;
218 size_t count = 0;
219 for ( ; i != r1.GetFrom() && count < 3; ++i, ++count) {
220 codon[ i % 3 ] = 'N';
221 }
222 }
223 last_r1 = r1;
224
225 string gen_data;
226 CSeqVector gen(genomic_bsh, CBioseq_Handle::eCoding_Iupac);
227 gen.GetSeqData(r2.GetFrom(), r2.GetTo() + 1, gen_data);
228 if (range.IsReversed()) {
229 CSeqManip::ReverseComplement(gen_data,
230 CSeqUtil::e_Iupacna,
231 0, gen_data.size());
232
233 //LOG_POST(Error << "reverse range: [" << r1.GetFrom() << ", " << r1.GetTo() << "] - [" << r2.GetFrom() << ", " << r2.GetTo() << "]");
234 } else {
235 //LOG_POST(Error << "forward range: [" << r1.GetFrom() << ", " << r1.GetTo() << "] - [" << r2.GetFrom() << ", " << r2.GetTo() << "]");
236 }
237
238 /// compare product range to conceptual translation
239 TSeqPos prod_pos = r1.GetFrom();
240 //LOG_POST(Error << " genomic = " << gen_data);
241 for (size_t i = 0; i < gen_data.size(); ++i, ++prod_pos) {
242 codon[ prod_pos % 3 ] = gen_data[i];
243 //LOG_POST(Error << " filling: " << prod_pos << ": " << prod_pos % 3 << ": " << gen_data[i]);
244
245 if (prod_pos % 3 == 2) {
246 int state = tbl.SetCodonState(codon[0], codon[1], codon[2]);
247 char residue = (prod_pos == 2
248 ? tbl.GetStartResidue(state)
249 : tbl.GetCodonResidue(state));
250
251 /// NOTE:
252 /// we increment identities/mismatches by 3 here,
253 /// counting identities in nucleotide space!!
254 if (residue == prod[prod_pos / 3] &&
255 residue != 'X' && residue != '-') {
256 *identities += 3;
257 } else {
258 *mismatches += 3;
259 }
260 }
261 }
262 }
263 }}
264 break;
265
266 default:
267 break;
268 }
269
270 /*
271 * NB: leave this here; it's useful for validation
272 int actual_identities = 0;
273 if (align.GetNamedScore("N of matches", actual_identities)) {
274 if (actual_identities != *identities) {
275 LOG_POST(Error << "actual identities: " << actual_identities
276 << " computed identities: " << *identities);
277
278 //cerr << MSerial_AsnText << align;
279 }
280 }
281 **/
282 }
283
284
s_GetCountIdentityMismatch(CScope & scope,const CSeq_align & align,int * identities,int * mismatches,const CRangeCollection<TSeqPos> & ranges=CRangeCollection<TSeqPos> (TSeqRange::GetWhole ()))285 static void s_GetCountIdentityMismatch(CScope& scope, const CSeq_align& align,
286 int* identities, int* mismatches,
287 const CRangeCollection<TSeqPos> &ranges =
288 CRangeCollection<TSeqPos>(TSeqRange::GetWhole()))
289 {
290 _ASSERT(identities && mismatches);
291 if (ranges.empty()) {
292 return;
293 }
294
295 {{
296 ///
297 /// shortcut: if 'num_ident' is present, we trust it
298 ///
299 int num_ident = 0;
300 if (ranges.begin()->IsWhole() &&
301 align.GetNamedScore(CSeq_align::eScore_IdentityCount, num_ident))
302 {
303 size_t len = align.GetAlignLength(false /*ignore gaps*/);
304 *identities += num_ident;
305 *mismatches += (len - num_ident);
306 return;
307 }
308 }}
309
310 switch (align.GetSegs().Which()) {
311 case CSeq_align::TSegs::e_Denseg:
312 {{
313 const CDense_seg& ds = align.GetSegs().GetDenseg();
314 vector<string> data;
315 CAlnVec vec(ds, scope);
316 data.resize(vec.GetNumRows());
317 for (int seg = 0; seg < vec.GetNumSegs(); ++seg) {
318 bool has_gap = false;
319 for (int i = 0; !has_gap && i < vec.GetNumRows(); ++i) {
320 if (vec.GetStart(i, seg) == -1) {
321 has_gap = true;
322 }
323 }
324 if (has_gap) {
325 /// we compute ungapped identities
326 /// gap on at least one row, so we skip this segment
327 continue;
328 }
329
330 TSeqPos seg_start = vec.GetStart(0, seg),
331 seg_stop = vec.GetStop(0, seg);
332 CRangeCollection<TSeqPos> seg_ranges = ranges;
333 seg_ranges.IntersectWith(TSeqRange(seg_start, seg_stop));
334 for (int i = 0; i < vec.GetNumRows(); ++i) {
335 TSeqPos offset = vec.GetStart(i, seg) - seg_start;
336 ITERATE (CRangeCollection<TSeqPos>, range_it, seg_ranges) {
337 string seq_string;
338 vec.GetSeqString(seq_string, i,
339 range_it->GetFrom()+offset,
340 range_it->GetTo()+offset);
341 data[i] += seq_string;
342 }
343 }
344 }
345 s_GetNucIdentityMismatch(data, identities, mismatches);
346 }}
347 break;
348
349 case CSeq_align::TSegs::e_Disc:
350 {{
351 ITERATE (CSeq_align::TSegs::TDisc::Tdata, iter,
352 align.GetSegs().GetDisc().Get()) {
353 s_GetCountIdentityMismatch(scope, **iter,
354 identities, mismatches, ranges);
355 }
356 }}
357 break;
358
359 case CSeq_align::TSegs::e_Std:
360 NCBI_THROW(CSeqalignException, eNotImplemented,
361 "identity + mismatch function not implemented for std-seg");
362 break;
363
364 case CSeq_align::TSegs::e_Spliced:
365 {{
366 int aln_identities = 0;
367 int aln_mismatches = 0;
368 bool has_non_standard = false;
369 ITERATE (CSpliced_seg::TExons, iter,
370 align.GetSegs().GetSpliced().GetExons()) {
371 const CSpliced_exon& exon = **iter;
372 TSeqRange product_span;
373 product_span.Set(exon.GetProduct_start().AsSeqPos(),
374 exon.GetProduct_end().AsSeqPos());
375 if (exon.IsSetParts()) {
376 TSeqPos part_start = product_span.GetFrom();
377 ITERATE (CSpliced_exon::TParts, it, exon.GetParts()) {
378 const CSpliced_exon_chunk& chunk = **it;
379 int part_len = 0;
380 switch (chunk.Which()) {
381 case CSpliced_exon_chunk::e_Match:
382 part_len = chunk.GetMatch();
383 aln_identities += s_IntersectionLength(ranges,
384 TSeqRange(part_start,
385 part_start+part_len-1));
386 break;
387
388 case CSpliced_exon_chunk::e_Mismatch:
389 part_len = chunk.GetMismatch();
390 aln_mismatches += s_IntersectionLength(ranges,
391 TSeqRange(part_start,
392 part_start+part_len-1));
393 break;
394
395 case CSpliced_exon_chunk::e_Diag:
396 part_len = chunk.GetDiag();
397 if (s_IntersectionLength(ranges,
398 TSeqRange(part_start,
399 part_start+part_len-1)))
400 {
401 has_non_standard = true;
402 }
403 break;
404
405 case CSpliced_exon_chunk::e_Product_ins:
406 part_len = chunk.GetProduct_ins();
407 break;
408
409 default:
410 break;
411 }
412 part_start += part_len;
413 }
414 } else {
415 has_non_standard = true;
416 break;
417 }
418 }
419 if ( !has_non_standard ) {
420 *identities += aln_identities;
421 *mismatches += aln_mismatches;
422 break;
423 }
424
425 /// we must compute match and mismatch based on first
426 /// prinicples. Sometimes loader will fail in getting
427 /// all components of the genomic sequence; in that case
428 /// throw an exception, but make it somewhat more informative
429 try {
430 s_GetSplicedSegIdentityMismatch(scope, align, ranges,
431 identities, mismatches);
432 } catch (CLoaderException &e) {
433 NCBI_RETHROW_SAME(e,
434 "Can't calculate identities/mismatches for "
435 "alignment with genomic sequence " +
436 align.GetSeq_id(1).AsFastaString() +
437 "; Loader can't load all required "
438 "components of sequence");
439 }
440 }}
441 break;
442
443 default:
444 _ASSERT(false);
445 break;
446 }
447 }
448
449 ///
450 /// calculate the percent identity
451 /// we also return the count of identities and mismatches
452 ///
s_GetPercentIdentity(CScope & scope,const CSeq_align & align,int * identities,int * mismatches,double * pct_identity,CScoreBuilderBase::EPercentIdentityType type,const CRangeCollection<TSeqPos> & ranges=CRangeCollection<TSeqPos> (TSeqRange::GetWhole ()))453 static void s_GetPercentIdentity(CScope& scope, const CSeq_align& align,
454 int* identities,
455 int* mismatches,
456 double* pct_identity,
457 CScoreBuilderBase::EPercentIdentityType type,
458 const CRangeCollection<TSeqPos> &ranges =
459 CRangeCollection<TSeqPos>(TSeqRange::GetWhole()))
460 {
461 size_t count_aligned = 0;
462 switch (type) {
463 case CScoreBuilderBase::eGapped:
464 count_aligned = align.GetAlignLengthWithinRanges(ranges, true /* include gaps */);
465 break;
466
467 case CScoreBuilderBase::eUngapped:
468 count_aligned = align.GetAlignLengthWithinRanges(ranges, false /* omit gaps */);
469 break;
470
471 case CScoreBuilderBase::eGBDNA:
472 count_aligned = align.GetAlignLengthWithinRanges(ranges, false /* omit gaps */);
473 count_aligned += align.GetNumGapOpeningsWithinRanges(ranges);
474 break;
475 }
476
477 s_GetCountIdentityMismatch(scope, align, identities, mismatches, ranges);
478 if (count_aligned) {
479 *pct_identity = 100.0f * double(*identities) / count_aligned;
480 } else {
481 *pct_identity = 0;
482 }
483 }
484
485
486 ///
487 /// calculate the percent coverage
488 ///
s_SequenceIsProtein(CScope & scope,const CSeq_id & id)489 static bool s_SequenceIsProtein(CScope& scope,
490 const CSeq_id& id)
491 {
492 CSeq_inst::TMol mol = scope.GetSequenceType(id);
493 if (mol == CSeq_inst::eMol_not_set) {
494 CBioseq_Handle bsh = scope.GetBioseqHandle(id);
495 if ( !bsh ) {
496 NCBI_THROW(CException, eUnknown,
497 "failed to retrieve sequence: " + id.AsFastaString());
498 }
499 return bsh.IsAa();
500 }
501
502 return (mol == CSeq_inst::eMol_aa);
503 }
504
505
s_IsProteinToGenomic(CScope & scope,const CSeq_align & align)506 static bool s_IsProteinToGenomic(CScope& scope,
507 const CSeq_align& align)
508 {
509 if (align.GetSegs().IsSpliced()) {
510 return align.GetSegs().GetSpliced()
511 .GetProduct_type() == CSpliced_seg::eProduct_type_protein;
512 }
513
514 if (align.GetSegs().IsDenseg()) {
515 const CDense_seg& seg = align.GetSegs().GetDenseg();
516 if (seg.IsSetWidths()) {
517 // FIXME: I can't remember what the rule for widths is
518 //
519 }
520 else {
521 // we must be protein-to-protein or nuc-to-nuc
522 return false;
523 }
524 }
525
526 // our short-cuts are exhausted
527 // fall back to a check of sequence type
528 const CSeq_id& id0 = align.GetSeq_id(0);
529 if ( !s_SequenceIsProtein(scope, id0) ) {
530 return false;
531 }
532 const CSeq_id& id1 = align.GetSeq_id(1);
533 return s_SequenceIsProtein(scope, id1);
534 }
535
536
s_GetPercentCoverage(CScope & scope,const CSeq_align & align,const CRangeCollection<TSeqPos> & ranges,double * pct_coverage,unsigned query=0)537 static void s_GetPercentCoverage(CScope& scope, const CSeq_align& align,
538 const CRangeCollection<TSeqPos>& ranges,
539 double* pct_coverage, unsigned query = 0)
540 {
541 if (!ranges.empty() && ranges.begin()->IsWhole() &&
542 align.GetNamedScore(CSeq_align::eScore_PercentCoverage,
543 *pct_coverage)) {
544 return;
545 }
546
547 size_t covered_bases = align.GetAlignLengthWithinRanges
548 (ranges, false /* don't include gaps */);
549 size_t seq_len = 0;
550 if(ranges.empty() || !ranges.begin()->IsWhole()){
551 seq_len = ranges.GetCoveredLength();
552 } else {
553 if (align.GetSegs().IsSpliced() &&
554 align.GetSegs().GetSpliced().IsSetProduct_length())
555 {
556 seq_len = align.GetSegs().GetSpliced().GetProduct_length();
557 } else {
558 const auto &query_id = align.GetSeq_id(query);
559 const objects::CBioseq_Handle& bsh_seq = scope.GetBioseqHandle(query_id);
560 if (!bsh_seq) {
561 *pct_coverage = 0;
562 NCBI_THROW(CSeqalignException, eInvalidSeqId,
563 "Can't get sequence data for " + query_id.AsFastaString() +
564 " in order to calculate coverage");
565 }
566 seq_len = bsh_seq.GetBioseqLength();
567 }
568 if (align.GetSegs().IsSpliced() &&
569 align.GetSegs().GetSpliced().IsSetPoly_a()) {
570
571 if (align.GetSegs().GetSpliced().IsSetProduct_strand() &&
572 align.GetSegs().GetSpliced().GetProduct_strand() == eNa_strand_minus) {
573 seq_len -= align.GetSegs().GetSpliced().GetPoly_a();
574 } else {
575 seq_len = align.GetSegs().GetSpliced().GetPoly_a();
576 }
577 }
578
579
580 //
581 // determine if the alignment is protein-to-genomic
582 //
583 bool is_protein_to_genomic = s_IsProteinToGenomic(scope, align);
584 if (is_protein_to_genomic) {
585 /// alignment is protein-to-genomic alignment
586 /// NOTE: alignment length is always reported in nucleotide
587 /// coordinates
588 seq_len *= 3;
589 if (align.GetSegs().IsStd()) {
590 /// odd corner case:
591 /// std-seg alignments of protein to nucleotide
592 covered_bases *= 3;
593 }
594 }
595 }
596
597 if (covered_bases) {
598 *pct_coverage = 100.0f * double(covered_bases) / double(seq_len);
599 } else {
600 *pct_coverage = 0;
601 }
602 }
603
604 /////////////////////////////////////////////////////////////////////////////
x_GetMatrixCounts(CScope & scope,const CSeq_align & align,int * positives,int * negatives)605 void CScoreBuilderBase::x_GetMatrixCounts(CScope& scope,
606 const CSeq_align& align,
607 int* positives, int* negatives)
608 {
609 if (!align.GetSegs().IsSpliced() ||
610 align.GetSegs().GetSpliced().GetProduct_type() !=
611 CSpliced_seg::eProduct_type_protein)
612 {
613 NCBI_THROW(CSeqalignException, eUnsupported,
614 "num_positives and num_negatives scores only defined "
615 "for protein alignment");
616 }
617 CProteinAlignText pro_text(scope, align, m_SubstMatrixName);
618 const string& prot = pro_text.GetProtein();
619 const string& dna = pro_text.GetDNA();
620 const string& match = pro_text.GetMatch();
621 for(string::size_type i=0;i<match.size(); ++i) {
622 if( isalpha(prot[i]) && (dna[i] != '-')) {
623 int increment = isupper(prot[i]) ? 3 : 1;
624 switch(match[i]) {
625 case '|':
626 case '+':
627 *positives += increment;
628 break;
629 case 'X': /// skip introns and bad parts
630 break;
631 default://mismatch
632 *negatives += increment;
633 break;
634 }
635 }
636 }
637 }
638
639
640
641
SetSubstMatrix(const string & name)642 void CScoreBuilderBase::SetSubstMatrix(const string &name)
643 {
644 m_SubstMatrixName = name;
645 }
646
GetPercentIdentity(CScope & scope,const CSeq_align & align,EPercentIdentityType type)647 double CScoreBuilderBase::GetPercentIdentity(CScope& scope,
648 const CSeq_align& align,
649 EPercentIdentityType type)
650 {
651 int identities = 0;
652 int mismatches = 0;
653 double pct_identity = 0;
654 s_GetPercentIdentity(scope, align,
655 &identities, &mismatches, &pct_identity, type);
656 return pct_identity;
657 }
658
659
GetPercentIdentity(CScope & scope,const CSeq_align & align,const TSeqRange & range,EPercentIdentityType type)660 double CScoreBuilderBase::GetPercentIdentity(CScope& scope,
661 const CSeq_align& align,
662 const TSeqRange &range,
663 EPercentIdentityType type)
664 {
665 int identities = 0;
666 int mismatches = 0;
667 double pct_identity = 0;
668 s_GetPercentIdentity(scope, align,
669 &identities, &mismatches, &pct_identity, type,
670 CRangeCollection<TSeqPos>(range));
671 return pct_identity;
672 }
673
674
GetPercentIdentity(CScope & scope,const CSeq_align & align,const CRangeCollection<TSeqPos> & ranges,EPercentIdentityType type)675 double CScoreBuilderBase::GetPercentIdentity(CScope& scope,
676 const CSeq_align& align,
677 const CRangeCollection<TSeqPos> &ranges,
678 EPercentIdentityType type)
679 {
680 int identities = 0;
681 int mismatches = 0;
682 double pct_identity = 0;
683 s_GetPercentIdentity(scope, align,
684 &identities, &mismatches, &pct_identity, type, ranges);
685 return pct_identity;
686 }
687
688
GetPercentCoverage(CScope & scope,const CSeq_align & align,unsigned query)689 double CScoreBuilderBase::GetPercentCoverage(CScope& scope,
690 const CSeq_align& align,
691 unsigned query)
692 {
693 double pct_coverage = 0;
694 s_GetPercentCoverage(scope, align,
695 CRangeCollection<TSeqPos>(TSeqRange::GetWhole()),
696 &pct_coverage,
697 query);
698 return pct_coverage;
699 }
700
GetPercentCoverage(CScope & scope,const CSeq_align & align,const TSeqRange & range,unsigned query)701 double CScoreBuilderBase::GetPercentCoverage(CScope& scope,
702 const CSeq_align& align,
703 const TSeqRange& range,
704 unsigned query)
705 {
706 double pct_coverage = 0;
707 s_GetPercentCoverage(scope, align, CRangeCollection<TSeqPos>(range), &pct_coverage, query);
708 return pct_coverage;
709 }
710
GetPercentCoverage(CScope & scope,const CSeq_align & align,const CRangeCollection<TSeqPos> & ranges,unsigned query)711 double CScoreBuilderBase::GetPercentCoverage(CScope& scope,
712 const CSeq_align& align,
713 const CRangeCollection<TSeqPos>& ranges,
714 unsigned query)
715 {
716 double pct_coverage = 0;
717 s_GetPercentCoverage(scope, align, ranges, &pct_coverage, query);
718 return pct_coverage;
719 }
720
GetIdentityCount(CScope & scope,const CSeq_align & align)721 int CScoreBuilderBase::GetIdentityCount(CScope& scope, const CSeq_align& align)
722 {
723 int identities = 0;
724 int mismatches = 0;
725 s_GetCountIdentityMismatch(scope, align, &identities, &mismatches);
726 return identities;
727 }
728
729
GetMismatchCount(CScope & scope,const CSeq_align & align)730 int CScoreBuilderBase::GetMismatchCount(CScope& scope, const CSeq_align& align)
731 {
732 int identities = 0;
733 int mismatches = 0;
734 s_GetCountIdentityMismatch(scope, align, &identities,&mismatches);
735 return mismatches;
736 }
737
738
GetMismatchCount(CScope & scope,const CSeq_align & align,int & identities,int & mismatches)739 void CScoreBuilderBase::GetMismatchCount(CScope& scope, const CSeq_align& align,
740 int& identities, int& mismatches)
741 {
742 identities = 0;
743 mismatches = 0;
744 s_GetCountIdentityMismatch(scope, align, &identities, &mismatches);
745 }
746
747
GetIdentityCount(CScope & scope,const CSeq_align & align,const TSeqRange & range)748 int CScoreBuilderBase::GetIdentityCount(CScope& scope, const CSeq_align& align,
749 const TSeqRange& range)
750 {
751 int identities = 0;
752 int mismatches = 0;
753 s_GetCountIdentityMismatch(scope, align, &identities, &mismatches,
754 CRangeCollection<TSeqPos>(range));
755 return identities;
756 }
757
758
GetMismatchCount(CScope & scope,const CSeq_align & align,const TSeqRange & range)759 int CScoreBuilderBase::GetMismatchCount(CScope& scope, const CSeq_align& align,
760 const TSeqRange& range)
761 {
762 int identities = 0;
763 int mismatches = 0;
764 s_GetCountIdentityMismatch(scope, align, &identities,&mismatches,
765 CRangeCollection<TSeqPos>(range));
766 return mismatches;
767 }
768
769
GetMismatchCount(CScope & scope,const CSeq_align & align,const TSeqRange & range,int & identities,int & mismatches)770 void CScoreBuilderBase::GetMismatchCount(CScope& scope, const CSeq_align& align,
771 const TSeqRange& range,
772 int& identities, int& mismatches)
773 {
774 identities = 0;
775 mismatches = 0;
776 s_GetCountIdentityMismatch(scope, align, &identities, &mismatches,
777 CRangeCollection<TSeqPos>(range));
778 }
779
780
GetIdentityCount(CScope & scope,const CSeq_align & align,const CRangeCollection<TSeqPos> & ranges)781 int CScoreBuilderBase::GetIdentityCount(CScope& scope, const CSeq_align& align,
782 const CRangeCollection<TSeqPos> &ranges)
783 {
784 int identities = 0;
785 int mismatches = 0;
786 s_GetCountIdentityMismatch(scope, align, &identities, &mismatches, ranges);
787 return identities;
788 }
789
790
GetMismatchCount(CScope & scope,const CSeq_align & align,const CRangeCollection<TSeqPos> & ranges)791 int CScoreBuilderBase::GetMismatchCount(CScope& scope, const CSeq_align& align,
792 const CRangeCollection<TSeqPos> &ranges)
793 {
794 int identities = 0;
795 int mismatches = 0;
796 s_GetCountIdentityMismatch(scope, align, &identities,&mismatches, ranges);
797 return mismatches;
798 }
799
800
GetMismatchCount(CScope & scope,const CSeq_align & align,const CRangeCollection<TSeqPos> & ranges,int & identities,int & mismatches)801 void CScoreBuilderBase::GetMismatchCount(CScope& scope, const CSeq_align& align,
802 const CRangeCollection<TSeqPos> &ranges,
803 int& identities, int& mismatches)
804 {
805 identities = 0;
806 mismatches = 0;
807 s_GetCountIdentityMismatch(scope, align, &identities, &mismatches, ranges);
808 }
809
810
GetPositiveCount(CScope & scope,const CSeq_align & align)811 int CScoreBuilderBase::GetPositiveCount(CScope& scope, const CSeq_align& align)
812 {
813 int positives = 0;
814 int negatives = 0;
815 x_GetMatrixCounts(scope, align, &positives, &negatives);
816 return positives;
817 }
818
819
GetNegativeCount(CScope & scope,const CSeq_align & align)820 int CScoreBuilderBase::GetNegativeCount(CScope& scope, const CSeq_align& align)
821 {
822 int positives = 0;
823 int negatives = 0;
824 x_GetMatrixCounts(scope, align, &positives, &negatives);
825 return negatives;
826 }
827
828
GetMatrixCounts(CScope & scope,const CSeq_align & align,int & positives,int & negatives)829 void CScoreBuilderBase::GetMatrixCounts(CScope& scope, const CSeq_align& align,
830 int& positives, int& negatives)
831 {
832 positives = 0;
833 negatives = 0;
834 x_GetMatrixCounts(scope, align, &positives, &negatives);
835 }
836
837
GetGapBaseCount(const CSeq_align & align)838 int CScoreBuilderBase::GetGapBaseCount(const CSeq_align& align)
839 {
840 return align.GetTotalGapCount();
841 }
842
843
GetGapCount(const CSeq_align & align)844 int CScoreBuilderBase::GetGapCount(const CSeq_align& align)
845 {
846 return align.GetNumGapOpenings();
847 }
848
849
GetAlignLength(const CSeq_align & align,bool ungapped)850 TSeqPos CScoreBuilderBase::GetAlignLength(const CSeq_align& align, bool ungapped)
851 {
852 return align.GetAlignLength( !ungapped /* true = include gaps = !ungapped */);
853 }
854
855
GetGapBaseCount(const CSeq_align & align,const TSeqRange & range)856 int CScoreBuilderBase::GetGapBaseCount(const CSeq_align& align,
857 const TSeqRange &range)
858 {
859 return align.GetTotalGapCountWithinRange(range);
860 }
861
862
GetGapCount(const CSeq_align & align,const TSeqRange & range)863 int CScoreBuilderBase::GetGapCount(const CSeq_align& align,
864 const TSeqRange &range)
865 {
866 return align.GetNumGapOpeningsWithinRange(range);
867 }
868
869
GetAlignLength(const CSeq_align & align,const TSeqRange & range,bool ungapped)870 TSeqPos CScoreBuilderBase::GetAlignLength(const CSeq_align& align,
871 const TSeqRange &range,
872 bool ungapped)
873 {
874 return align.GetAlignLengthWithinRange(range, !ungapped
875 /* true = include gaps = !ungapped */);
876 }
877
878
GetGapBaseCount(const CSeq_align & align,const CRangeCollection<TSeqPos> & ranges)879 int CScoreBuilderBase::GetGapBaseCount(const CSeq_align& align,
880 const CRangeCollection<TSeqPos> &ranges)
881 {
882 return align.GetTotalGapCountWithinRanges(ranges);
883 }
884
885
GetGapCount(const CSeq_align & align,const CRangeCollection<TSeqPos> & ranges)886 int CScoreBuilderBase::GetGapCount(const CSeq_align& align,
887 const CRangeCollection<TSeqPos> &ranges)
888 {
889 return align.GetNumGapOpeningsWithinRanges(ranges);
890 }
891
892
GetAlignLength(const CSeq_align & align,const CRangeCollection<TSeqPos> & ranges,bool ungapped)893 TSeqPos CScoreBuilderBase::GetAlignLength(const CSeq_align& align,
894 const CRangeCollection<TSeqPos> &ranges,
895 bool ungapped)
896 {
897 return align.GetAlignLengthWithinRanges(ranges, !ungapped
898 /* true = include gaps = !ungapped */);
899 }
900
901
902 /////////////////////////////////////////////////////////////////////////////
903
AddScore(CScope & scope,list<CRef<CSeq_align>> & aligns,CSeq_align::EScoreType score)904 void CScoreBuilderBase::AddScore(CScope& scope, list< CRef<CSeq_align> >& aligns,
905 CSeq_align::EScoreType score)
906 {
907 NON_CONST_ITERATE (list< CRef<CSeq_align> >, iter, aligns) {
908 AddScore(scope, **iter, score);
909 }
910 }
911
AddScore(CScope & scope,CSeq_align & align,CSeq_align::EScoreType score)912 void CScoreBuilderBase::AddScore(CScope& scope, CSeq_align& align,
913 CSeq_align::EScoreType score)
914 {
915 try {
916 switch (score) {
917 /// Special cases for the three precent-identity scores, to add
918 /// the num_ident and num_mismatch scores as well
919 case CSeq_align::eScore_PercentIdentity_Gapped:
920 case CSeq_align::eScore_PercentIdentity_Ungapped:
921 case CSeq_align::eScore_PercentIdentity_GapOpeningOnly:
922 {{
923 int identities = 0;
924 int mismatches = 0;
925 double pct_identity = 0;
926 s_GetPercentIdentity(scope, align, &identities, &mismatches,
927 &pct_identity,
928 static_cast<EPercentIdentityType>(
929 score - CSeq_align::eScore_PercentIdentity_Gapped));
930 align.SetNamedScore(score, pct_identity);
931 align.SetNamedScore(CSeq_align::eScore_IdentityCount, identities);
932 align.SetNamedScore(CSeq_align::eScore_MismatchCount, mismatches);
933 }}
934 break;
935
936 default:
937 {{
938 double score_value = ComputeScore(scope, align, score);
939 if (CSeq_align::IsIntegerScore(score)) {
940 align.SetNamedScore(score, (int)score_value);
941 } else {
942 if (score_value == numeric_limits<double>::infinity()) {
943 score_value = numeric_limits<double>::max() / 10.0;
944 }
945 align.SetNamedScore(score, score_value);
946 }
947 }}
948 }
949 } catch (CSeqalignException& e) {
950 // Unimplemented (code missing) or unsupported (score cannot be defined)
951 // is handled according to the error handling mode. All other
952 // errors always throw.
953 switch (e.GetErrCode()) {
954 case CSeqalignException::eUnsupported:
955 case CSeqalignException::eNotImplemented:
956 break;
957 default:
958 throw;
959 }
960
961 switch (GetErrorMode()) {
962 case eError_Throw:
963 throw;
964 case eError_Report:
965 ERR_POST(Error
966 << "CScoreBuilderBase::AddScore(): error computing score: "
967 << e);
968 default:
969 break;
970 }
971 }
972 }
973
GetDonor(const objects::CSpliced_exon & exon)974 string GetDonor(const objects::CSpliced_exon& exon) {
975 if( exon.CanGetDonor_after_exon() && exon.GetDonor_after_exon().CanGetBases() ) {
976 return exon.GetDonor_after_exon().GetBases();
977 }
978 return string();
979 }
980
GetAcceptor(const objects::CSpliced_exon & exon)981 string GetAcceptor(const objects::CSpliced_exon& exon) {
982 if( exon.CanGetAcceptor_before_exon() && exon.GetAcceptor_before_exon().CanGetBases() ) {
983 return exon.GetAcceptor_before_exon().GetBases();
984 }
985 return string();
986 }
987
988 //returns true for GT/AG, GC/AG AND AT/AC
IsConsSplice(const string & donor,const string acc)989 bool IsConsSplice(const string& donor, const string acc) {
990 if(donor.length()<2 || acc.length()<2) return false;
991 if(toupper(Uchar(acc.c_str()[0])) != 'A') return false;
992 switch(toupper(Uchar(acc.c_str()[1]))) {
993 case 'C':
994 if( toupper(Uchar(donor.c_str()[0])) == 'A' && toupper(Uchar(donor.c_str()[1])) == 'T' ) return true;
995 else return false;
996 break;
997 case 'G':
998 if( toupper(Uchar(donor.c_str()[0])) == 'G' ) {
999 char don2 = toupper(Uchar(donor.c_str()[1]));
1000 if(don2 == 'T' || don2 == 'C') return true;
1001 }
1002 return false;
1003 break;
1004 default:
1005 return false;
1006 break;
1007 }
1008 return false;
1009 }
1010
1011
AddSplignScores(const CSeq_align & align,CSeq_align::TScore & scores)1012 void CScoreBuilderBase::AddSplignScores(const CSeq_align& align,
1013 CSeq_align::TScore &scores)
1014 {
1015 typedef CSeq_align::TSegs::TSpliced TSpliced;
1016 const TSpliced & spliced (align.GetSegs().GetSpliced());
1017 if(spliced.GetProduct_type() != CSpliced_seg::eProduct_type_transcript) {
1018 NCBI_THROW(CSeqalignException, eUnsupported,
1019 "CScoreBuilderBase::AddSplignScores(): Unsupported product type");
1020 }
1021
1022 const bool qstrand (spliced.GetProduct_strand() != eNa_strand_minus);
1023
1024 typedef TSpliced::TExons TExons;
1025 const TExons & exons (spliced.GetExons());
1026
1027 size_t matches (0),
1028 aligned_query_bases (0), // matches, mismatches and indels
1029 aln_length_exons (0),
1030 aln_length_gaps (0),
1031 splices_total (0), // twice the number of introns
1032 splices_consensus (0);
1033
1034 const TSeqPos qlen (spliced.GetProduct_length());
1035 const TSeqPos polya (spliced.CanGetPoly_a()?
1036 spliced.GetPoly_a(): (qstrand? qlen: TSeqPos(-1)));
1037 const TSeqPos prod_length_no_polya (qstrand? polya: qlen - 1 - polya);
1038
1039 typedef CSpliced_exon TExon;
1040 TSeqPos qprev (qstrand? TSeqPos(-1): qlen);
1041 string donor;
1042 ITERATE(TExons, ii2, exons) {
1043
1044 const TExon & exon (**ii2);
1045 const TSeqPos qmin (exon.GetProduct_start().GetNucpos()),
1046 qmax (exon.GetProduct_end().GetNucpos());
1047
1048 const TSeqPos qgap (qstrand? qmin - qprev - 1: qprev - qmax - 1);
1049
1050 if(qgap > 0) {
1051 aln_length_gaps += qgap;
1052 donor.clear();
1053 }
1054 else if (ii2 != exons.begin()) {
1055 splices_total += 2;
1056 if(IsConsSplice(donor, GetAcceptor(exon))) { splices_consensus += 2; }
1057 }
1058
1059 typedef TExon::TParts TParts;
1060 const TParts & parts (exon.GetParts());
1061 string errmsg;
1062 ITERATE(TParts, ii3, parts) {
1063 const CSpliced_exon_chunk & part (**ii3);
1064 const CSpliced_exon_chunk::E_Choice choice (part.Which());
1065 TSeqPos len (0);
1066 switch(choice) {
1067 case CSpliced_exon_chunk::e_Match:
1068 len = part.GetMatch();
1069 matches += len;
1070 aligned_query_bases += len;
1071 break;
1072 case CSpliced_exon_chunk::e_Mismatch:
1073 len = part.GetMismatch();
1074 aligned_query_bases += len;
1075 break;
1076 case CSpliced_exon_chunk::e_Product_ins:
1077 len = part.GetProduct_ins();
1078 aligned_query_bases += len;
1079 break;
1080 case CSpliced_exon_chunk::e_Genomic_ins:
1081 len = part.GetGenomic_ins();
1082 break;
1083 default:
1084 errmsg = "Unexpected spliced exon chunk part: "
1085 + part.SelectionName(choice);
1086 NCBI_THROW(CSeqalignException, eUnsupported, errmsg);
1087 }
1088 aln_length_exons += len;
1089 }
1090
1091 donor = GetDonor(exon);
1092 qprev = qstrand? qmax: qmin;
1093 } // TExons
1094
1095 const TSeqPos qgap (qstrand? polya - qprev - 1: qprev - polya - 1);
1096 aln_length_gaps += qgap;
1097
1098 for (CSeq_align::TScore::iterator it = scores.begin(); it != scores.end(); )
1099 {
1100 CSeq_align::EScoreType score_type = CSeq_align::eScore_Score;
1101 if ((*it)->GetId().IsStr()) {
1102 CSeq_align::TScoreNameMap::const_iterator score =
1103 CSeq_align::ScoreNameMap()
1104 . find((*it)->GetId().GetStr());
1105 if (score != CSeq_align::ScoreNameMap().end()) {
1106 score_type = score->second;
1107 }
1108 }
1109 if (score_type >= CSeq_align::eScore_Matches &&
1110 score_type <= CSeq_align::eScore_ExonIdentity)
1111 {
1112 it = scores.erase(it);
1113 } else {
1114 ++it;
1115 }
1116 }
1117
1118 {
1119 CRef<CScore> score_matches (new CScore());
1120 score_matches->SetId().SetStr(
1121 CSeq_align::ScoreName(CSeq_align::eScore_Matches));
1122 score_matches->SetValue().SetInt(matches);
1123 scores.push_back(score_matches);
1124 }
1125 {
1126 CRef<CScore> score_overall_identity (new CScore());
1127 score_overall_identity->SetId().SetStr(
1128 CSeq_align::ScoreName(CSeq_align::eScore_OverallIdentity));
1129 score_overall_identity->SetValue().
1130 SetReal(double(matches)/(aln_length_exons + aln_length_gaps));
1131 scores.push_back(score_overall_identity);
1132 }
1133 {
1134 CRef<CScore> score_splices (new CScore());
1135 score_splices->SetId().SetStr(
1136 CSeq_align::ScoreName(CSeq_align::eScore_Splices));
1137 score_splices->SetValue().SetInt(splices_total);
1138 scores.push_back(score_splices);
1139 }
1140 {
1141 CRef<CScore> score_splices_consensus (new CScore());
1142 score_splices_consensus->SetId().SetStr(
1143 CSeq_align::ScoreName(CSeq_align::eScore_ConsensusSplices));
1144 score_splices_consensus->SetValue().SetInt(splices_consensus);
1145 scores.push_back(score_splices_consensus);
1146 }
1147 {
1148 CRef<CScore> score_coverage (new CScore());
1149 score_coverage->SetId().SetStr(
1150 CSeq_align::ScoreName(CSeq_align::eScore_ProductCoverage));
1151 score_coverage->SetValue().
1152 SetReal(double(aligned_query_bases) / prod_length_no_polya);
1153 scores.push_back(score_coverage);
1154 }
1155 {
1156 CRef<CScore> score_exon_identity (new CScore());
1157 score_exon_identity->SetId().SetStr(
1158 CSeq_align::ScoreName(CSeq_align::eScore_ExonIdentity));
1159 score_exon_identity->SetValue().
1160 SetReal(double(matches) / aln_length_exons);
1161 scores.push_back(score_exon_identity);
1162 }
1163
1164 }
1165
ComputeScore(CScope & scope,const CSeq_align & align,CSeq_align::EScoreType score)1166 double CScoreBuilderBase::ComputeScore(CScope& scope, const CSeq_align& align,
1167 CSeq_align::EScoreType score)
1168 {
1169 return ComputeScore(scope, align,
1170 CRangeCollection<TSeqPos>(TSeqRange::GetWhole()), score);
1171 }
1172
ComputeScore(CScope & scope,const CSeq_align & align,const TSeqRange & range,CSeq_align::EScoreType score)1173 double CScoreBuilderBase::ComputeScore(CScope& scope, const CSeq_align& align,
1174 const TSeqRange &range,
1175 CSeq_align::EScoreType score)
1176 {
1177 return ComputeScore(scope, align, CRangeCollection<TSeqPos>(range), score);
1178 }
1179
ComputeScore(CScope & scope,const CSeq_align & align,const CRangeCollection<TSeqPos> & ranges,CSeq_align::EScoreType score)1180 double CScoreBuilderBase::ComputeScore(CScope& scope, const CSeq_align& align,
1181 const CRangeCollection<TSeqPos> &ranges,
1182 CSeq_align::EScoreType score)
1183 {
1184 switch (score) {
1185 case CSeq_align::eScore_Score:
1186 {{
1187 NCBI_THROW(CSeqalignException, eUnsupported,
1188 "CScoreBuilderBase::ComputeScore(): "
1189 "generic 'score' computation is undefined");
1190 }}
1191 break;
1192
1193 case CSeq_align::eScore_Blast:
1194 case CSeq_align::eScore_BitScore:
1195 case CSeq_align::eScore_EValue:
1196 case CSeq_align::eScore_SumEValue:
1197 case CSeq_align::eScore_CompAdjMethod:
1198 NCBI_THROW(CSeqalignException, eNotImplemented,
1199 "CScoreBuilderBase::ComputeScore(): "
1200 "BLAST scores are available in CScoreBuilder, "
1201 "not CScoreBuilderBase");
1202 break;
1203
1204 case CSeq_align::eScore_IdentityCount:
1205 return GetIdentityCount(scope, align, ranges);
1206
1207 case CSeq_align::eScore_PositiveCount:
1208 if (ranges.empty() || !ranges.begin()->IsWhole()) {
1209 NCBI_THROW(CSeqalignException, eNotImplemented,
1210 "positive-count score not supported within a range");
1211 }
1212 return GetPositiveCount(scope, align);
1213
1214 case CSeq_align::eScore_NegativeCount:
1215 if (ranges.empty() || !ranges.begin()->IsWhole()) {
1216 NCBI_THROW(CSeqalignException, eNotImplemented,
1217 "positive-count score not supported within a range");
1218 }
1219 return GetNegativeCount(scope, align);
1220
1221 case CSeq_align::eScore_MismatchCount:
1222 return GetMismatchCount(scope, align, ranges);
1223
1224 case CSeq_align::eScore_GapCount:
1225 return GetGapCount(align, ranges);
1226
1227 case CSeq_align::eScore_AlignLength:
1228 return align.GetAlignLengthWithinRanges(ranges, true /* include gaps */);
1229
1230 case CSeq_align::eScore_PercentIdentity_Gapped:
1231 {{
1232 int identities = 0;
1233 int mismatches = 0;
1234 double pct_identity = 0;
1235 s_GetPercentIdentity(scope, align,
1236 &identities, &mismatches, &pct_identity,
1237 eGapped, ranges);
1238 return pct_identity;
1239 }}
1240 break;
1241
1242 case CSeq_align::eScore_PercentIdentity_Ungapped:
1243 {{
1244 int identities = 0;
1245 int mismatches = 0;
1246 double pct_identity = 0;
1247 s_GetPercentIdentity(scope, align,
1248 &identities, &mismatches, &pct_identity,
1249 eUngapped, ranges);
1250 return pct_identity;
1251 }}
1252 break;
1253
1254 case CSeq_align::eScore_PercentIdentity_GapOpeningOnly:
1255 {{
1256 int identities = 0;
1257 int mismatches = 0;
1258 double pct_identity = 0;
1259 s_GetPercentIdentity(scope, align,
1260 &identities, &mismatches, &pct_identity,
1261 eGBDNA, ranges);
1262 return pct_identity;
1263 }}
1264 break;
1265
1266 case CSeq_align::eScore_PercentCoverage:
1267 {{
1268 double pct_coverage = 0;
1269 s_GetPercentCoverage(scope, align, ranges, &pct_coverage);
1270 return pct_coverage;
1271 }}
1272 break;
1273
1274 case CSeq_align::eScore_HighQualityPercentCoverage:
1275 {{
1276 if(align.GetSegs().Which() == CSeq_align::TSegs::e_Std)
1277 /// high-quality-coverage calculatino is not possbile for standard segs
1278 NCBI_THROW(CSeqalignException, eUnsupported,
1279 "High-quality percent coverage not supported "
1280 "for standard seg representation");
1281
1282 if (ranges.empty() || !ranges.begin()->IsWhole()) {
1283 NCBI_THROW(CSeqalignException, eNotImplemented,
1284 "High-quality percent coverage not supported "
1285 "within a range");
1286 }
1287 /// If we have annotation for a high-quality region, it is in a ftable named
1288 /// "NCBI_GPIPE", containing a region Seq-feat named "alignable"
1289 TSeqRange alignable_range = TSeqRange::GetWhole();
1290 CBioseq_Handle query = scope.GetBioseqHandle(align.GetSeq_id(0));
1291 for(CFeat_CI feat_it(query,
1292 SAnnotSelector(CSeqFeatData::e_Region).
1293 SetExcludeExternal());
1294 feat_it; ++feat_it)
1295 {
1296 if(feat_it->GetData().GetRegion() == "alignable" &&
1297 feat_it->GetAnnot().IsNamed() &&
1298 feat_it->GetAnnot().GetName() == "NCBI_GPIPE")
1299 {
1300 alignable_range = feat_it->GetRange();
1301 break;
1302 }
1303 }
1304 double pct_coverage = 0;
1305 s_GetPercentCoverage(scope, align,
1306 CRangeCollection<TSeqPos>(alignable_range),
1307 &pct_coverage);
1308 return pct_coverage;
1309 }}
1310 break;
1311
1312 case CSeq_align::eScore_Matches:
1313 case CSeq_align::eScore_OverallIdentity:
1314 case CSeq_align::eScore_Splices:
1315 case CSeq_align::eScore_ConsensusSplices:
1316 case CSeq_align::eScore_ProductCoverage:
1317 case CSeq_align::eScore_ExonIdentity:
1318 {{
1319 if (ranges.empty() || !ranges.begin()->IsWhole()) {
1320 NCBI_THROW(CSeqalignException, eNotImplemented,
1321 "splign scores not supported within a range");
1322 }
1323 CSeq_align::TScore scores;
1324 AddSplignScores(align, scores);
1325 ITERATE (CSeq_align::TScore, it, scores) {
1326 if ((*it)->GetId().GetStr() == CSeq_align::ScoreName(score))
1327 {
1328 if ((*it)->GetValue().IsInt()) {
1329 return (*it)->GetValue().GetInt();
1330 } else {
1331 return (*it)->GetValue().GetReal();
1332 }
1333 }
1334 }
1335 NCBI_ASSERT(false, "Should never reach this point");
1336 }}
1337
1338 default:
1339 {{
1340 NCBI_THROW(CSeqalignException, eNotImplemented,
1341 "Unknown score");
1342 return 0;
1343 }}
1344 }
1345 }
1346
1347
1348
1349
1350
1351 END_NCBI_SCOPE
1352
1353