1 /* $Id: align_compare.cpp 522944 2016-12-26 16:08:29Z mozese2 $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Mike DiCuccio
27 *
28 * File Description:
29 *
30 */
31
32 #include <ncbi_pch.hpp>
33
34 #include <objects/seq/Seq_annot.hpp>
35 #include <objects/seqloc/Seq_loc.hpp>
36 #include <objects/seqalign/Seq_align.hpp>
37 #include <objects/seqalign/Seq_align_set.hpp>
38 #include <objects/seqalign/Std_seg.hpp>
39 #include <objects/seqalign/Dense_seg.hpp>
40 #include <objects/seqalign/Spliced_seg.hpp>
41 #include <objects/seqalign/Spliced_exon.hpp>
42 #include <objects/seqalign/Spliced_exon_chunk.hpp>
43 #include <objects/seqalign/Prot_pos.hpp>
44 #include <objects/seqalign/Product_pos.hpp>
45 #include <objects/seqalign/Score.hpp>
46 #include <objects/seq/seq_id_handle.hpp>
47 #include <objects/general/Object_id.hpp>
48
49 #include <objtools/alnmgr/pairwise_aln.hpp>
50 #include <objtools/alnmgr/aln_converters.hpp>
51
52 #include <algo/align/util/align_compare.hpp>
53 #include <algo/align/util/score_lookup.hpp>
54 #include <algo/align/util/algo_align_util_exceptions.hpp>
55
56 #include <cmath>
57 #include <ctype.h>
58
59 BEGIN_NCBI_SCOPE
60 USING_SCOPE(objects);
61
62
63 // Retrieve a list of interval-by-interval accounting within an alignment
64 //
65 bool s_IsOverlapping(CAlignCompare::SAlignment const* lhs, CAlignCompare::SAlignment const* rhs, CAlignCompare::ERowComparison row);
66
s_UpdateSpans(const TSeqRange & query_range,const TSeqRange & subject_range,CAlignCompare::SAlignment & align_info,CAlignCompare::ERowComparison row)67 static void s_UpdateSpans(const TSeqRange &query_range,
68 const TSeqRange &subject_range,
69 CAlignCompare::SAlignment& align_info,
70 CAlignCompare::ERowComparison row)
71 {
72 align_info.spans[row == CAlignCompare::e_Query ? query_range : subject_range] =
73 row == CAlignCompare::e_Subject ? subject_range : query_range;
74 }
75
s_GetAlignmentSpans_Interval(const CSeq_align & align,CAlignCompare::SAlignment & align_info,CAlignCompare::ERowComparison row)76 static void s_GetAlignmentSpans_Interval(const CSeq_align& align,
77 CAlignCompare::SAlignment& align_info,
78 CAlignCompare::ERowComparison row)
79 {
80 switch (align.GetSegs().Which()) {
81 case CSeq_align::TSegs::e_Disc:
82 ITERATE (CSeq_align::TSegs::TDisc::Tdata, it,
83 align.GetSegs().GetDisc().Get()) {
84 s_GetAlignmentSpans_Interval(**it, align_info, row);
85 }
86 break;
87
88 case CSeq_align::TSegs::e_Std:
89 {{
90 ITERATE (CSeq_align::TSegs::TStd, seg_it,
91 align.GetSegs().GetStd()) {
92 // our expectation is to find a set of two locs
93 // each loc is expected to be a Seq-interval
94
95 // we expect std-seg alignments to be single intervals
96 // check this here
97 if ((*seg_it)->GetLoc().size() != 2) {
98 NCBI_THROW(CException, eUnknown,
99 "Pairwise Std-seg alignments in comparison "
100 "should always have two locs");
101 }
102 CConstRef<CSeq_loc> loc1 = (*seg_it)->GetLoc()[0];
103 CConstRef<CSeq_loc> loc2 = (*seg_it)->GetLoc()[1];
104
105 if (loc1->IsEmpty() || loc2->IsEmpty() ||
106 loc1->IsNull() || loc2->IsNull()) {
107 // gaps - omit
108 continue;
109 }
110
111 if (!loc1->IsInt() || !loc2->IsInt()) {
112 NCBI_THROW(CException, eUnknown,
113 "Pairwise Std-set alignments in comparison "
114 "should always be intervals");
115 }
116 s_UpdateSpans(loc1->GetTotalRange(),
117 loc2->GetTotalRange(),
118 align_info, row);
119 }
120 }}
121 break;
122
123 default:
124 {{
125 CAlnSeqId id0(align.GetSeq_id(0));
126 CAlnSeqId id1(align.GetSeq_id(1));
127
128 TAlnSeqIdIRef r0(&id0);
129 TAlnSeqIdIRef r1(&id1);
130 CPairwiseAln pw(r0, r1);
131 ConvertSeqAlignToPairwiseAln(pw, align, 0, 1);
132 ITERATE (CPairwiseAln, it, pw) {
133 const CPairwiseAln::TAlignRange& r = *it;
134 s_UpdateSpans(TSeqRange(r.GetFirstFrom(), r.GetFirstTo()),
135 TSeqRange(r.GetSecondFrom(), r.GetSecondTo()),
136 align_info, row);
137 }
138 }}
139 break;
140 }
141 }
142
s_GetAlignmentMismatches(const CSeq_align & align,CAlignCompare::SAlignment & align_info,CAlignCompare::ERowComparison row)143 static void s_GetAlignmentMismatches(const CSeq_align& align,
144 CAlignCompare::SAlignment& align_info,
145 CAlignCompare::ERowComparison row)
146 {
147 if (!align.GetSegs().IsSpliced()) {
148 CScoreLookup lookup;
149 string traceback = lookup.GetTraceback(align, 0);
150 if (traceback.empty()) {
151 NCBI_THROW(CException, eUnknown,
152 "Comparing mismatches for dense-seg alignments requires "
153 "traceback information");
154 }
155 int product_dir = align.GetSeqStrand(0) == eNa_strand_minus
156 ? -1 : 1;
157 TSeqPos product_pos = product_dir == 1 ? align.GetSeqStart(0)
158 : align.GetSeqStop(0);
159 int genomic_dir = align.GetSeqStrand(1) == eNa_strand_minus
160 ? -1 : 1;
161 TSeqPos genomic_pos = product_dir == 1 ? align.GetSeqStart(1)
162 : align.GetSeqStop(1);
163 unsigned match = 0;
164 ITERATE (string, it, traceback) {
165 if (isdigit(*it)) {
166 match = match * 10 + (*it - '0');
167 continue;
168 }
169 product_pos += match * product_dir;
170 genomic_pos += match * genomic_dir;
171 match = 0;
172 bool genomic_ins = *it == '-';
173 bool product_ins = *++it == '-';
174 if (!genomic_ins && !product_ins) {
175 /// mismatch
176 if (row != CAlignCompare::e_Subject) {
177 align_info.query_mismatches += TSeqRange(
178 product_pos, product_pos);
179
180 }
181 if (row != CAlignCompare::e_Query) {
182 align_info.subject_mismatches += TSeqRange(
183 genomic_pos, genomic_pos);
184 }
185 }
186 if (!genomic_ins) {
187 product_pos += product_dir;
188 }
189 if (!product_ins) {
190 genomic_pos += genomic_dir;
191 }
192 }
193 if (product_pos != (product_dir == 1 ? align.GetSeqStop(0)+1
194 : align.GetSeqStart(0)-1)
195 || genomic_pos != (genomic_dir == 1 ? align.GetSeqStop(1)+1
196 : align.GetSeqStart(1)-1))
197 {
198 NCBI_THROW(CException, eUnknown,
199 "Inconsistent length of traceback string " + traceback);
200 }
201 return;
202 }
203
204 bool is_product_minus = align.GetSegs().GetSpliced().IsSetProduct_strand() &&
205 align.GetSegs().GetSpliced().GetProduct_strand() == eNa_strand_minus;
206 bool is_genomic_minus = align.GetSegs().GetSpliced().IsSetGenomic_strand() &&
207 align.GetSegs().GetSpliced().GetGenomic_strand() == eNa_strand_minus;
208 ITERATE (CSpliced_seg::TExons, it, align.GetSegs().GetSpliced().GetExons())
209 {
210 const CSpliced_exon& exon = **it;
211 if (!exon.IsSetParts()) {
212 continue;
213 }
214 int product_dir = is_product_minus ||
215 (exon.IsSetProduct_strand() &&
216 exon.GetProduct_strand() == eNa_strand_minus)
217 ? -1 : 1;
218 TSeqPos product_pos = (product_dir == 1 ? exon.GetProduct_start()
219 : exon.GetProduct_end())
220 . AsSeqPos();
221 int genomic_dir = is_genomic_minus ||
222 (exon.IsSetGenomic_strand() &&
223 exon.GetGenomic_strand() == eNa_strand_minus)
224 ? -1 : 1;
225 TSeqPos genomic_pos = genomic_dir == 1 ? exon.GetGenomic_start()
226 : exon.GetGenomic_end();
227 ITERATE (CSpliced_exon::TParts, part_it, exon.GetParts()) {
228 switch ((*part_it)->Which()) {
229 case CSpliced_exon_chunk::e_Mismatch:
230 if (row != CAlignCompare::e_Subject) {
231 TSeqPos product_mismatch_end = product_pos +
232 product_dir * ((*part_it)->GetMismatch()-1);
233 align_info.query_mismatches += TSeqRange(
234 min(product_pos,product_mismatch_end),
235 max(product_pos,product_mismatch_end));
236
237 }
238 if (row != CAlignCompare::e_Query) {
239 TSeqPos genomic_mismatch_end = genomic_pos +
240 genomic_dir * ((*part_it)->GetMismatch()-1);
241 align_info.subject_mismatches += TSeqRange(
242 min(genomic_pos,genomic_mismatch_end),
243 max(genomic_pos,genomic_mismatch_end));
244 }
245 product_pos += product_dir * (*part_it)->GetMismatch();
246 genomic_pos += genomic_dir * (*part_it)->GetMismatch();
247 break;
248
249 case CSpliced_exon_chunk::e_Match:
250 product_pos += product_dir * (*part_it)->GetMatch();
251 genomic_pos += genomic_dir * (*part_it)->GetMatch();
252 break;
253
254 case CSpliced_exon_chunk::e_Product_ins:
255 product_pos += product_dir * (*part_it)->GetProduct_ins();
256 break;
257
258 case CSpliced_exon_chunk::e_Genomic_ins:
259 genomic_pos += genomic_dir * (*part_it)->GetGenomic_ins();
260 break;
261
262 default:
263 NCBI_THROW(CException, eUnknown,
264 "Unsupported exon part");
265 }
266 }
267 }
268 }
269
270 // Retrieve a list of exon-by-exon accounting within an alignment
271 //
s_GetAlignmentSpans_Exon(const CSeq_align & align,CAlignCompare::SAlignment & align_info,CAlignCompare::ERowComparison row)272 static void s_GetAlignmentSpans_Exon(const CSeq_align& align,
273 CAlignCompare::SAlignment &align_info,
274 CAlignCompare::ERowComparison row)
275 {
276 switch (align.GetSegs().Which()) {
277 case CSeq_align::TSegs::e_Denseg:
278 s_UpdateSpans(align.GetSeqRange(0), align.GetSeqRange(1), align_info, row);
279 break;
280
281 case CSeq_align::TSegs::e_Disc:
282 /* UNTESTED */
283 ITERATE (CSeq_align::TSegs::TDisc::Tdata, it,
284 align.GetSegs().GetDisc().Get()) {
285 s_UpdateSpans((*it)->GetSeqRange(0), (*it)->GetSeqRange(1), align_info, row);
286 }
287 break;
288
289 case CSeq_align::TSegs::e_Std:
290 /* UNTESTED */
291 ITERATE (CSeq_align::TSegs::TStd, it, align.GetSegs().GetStd()) {
292 const CStd_seg& seg = **it;
293 s_UpdateSpans(seg.GetLoc()[0]->GetTotalRange(),
294 seg.GetLoc()[1]->GetTotalRange(), align_info, row);
295 }
296 break;
297
298 case CSeq_align::TSegs::e_Spliced:
299 ITERATE (CSpliced_seg::TExons, it,
300 align.GetSegs().GetSpliced().GetExons()) {
301 const CSpliced_exon& exon = **it;
302 TSeqRange genomic(exon.GetGenomic_start(), exon.GetGenomic_end());
303 TSeqRange product;
304 product.SetFrom(exon.GetProduct_start().AsSeqPos());
305 product.SetTo(exon.GetProduct_end().AsSeqPos());
306 s_UpdateSpans(product, genomic, align_info, row);
307 }
308 break;
309
310 default:
311 NCBI_THROW(CException, eUnknown,
312 "unhandled alignment type");
313 }
314 }
315
316
317 // Retrieve a list of intron-by-intron accounting within an alignment;
318 // meaningful only for Spliced-seg alignments
319 //
s_GetAlignmentSpans_Intron(const CSeq_align & align,CAlignCompare::SAlignment & align_info,CAlignCompare::ERowComparison row)320 static void s_GetAlignmentSpans_Intron(const CSeq_align& align,
321 CAlignCompare::SAlignment &align_info,
322 CAlignCompare::ERowComparison row)
323 {
324 if (!align.GetSegs().IsSpliced() ||
325 !align.GetSegs().GetSpliced().CanGetProduct_strand() ||
326 !align.GetSegs().GetSpliced().CanGetGenomic_strand())
327 {
328 NCBI_THROW(CException, eUnknown,
329 "intron mode only meaningful for Spliced-seg alignments");
330 }
331
332 bool is_reverse = align.GetSegs().GetSpliced().GetProduct_strand() !=
333 align.GetSegs().GetSpliced().GetGenomic_strand();
334
335 CRef<CSpliced_exon> last_exon;
336 ITERATE (CSpliced_seg::TExons, it,
337 align.GetSegs().GetSpliced().GetExons()) {
338 CRef<CSpliced_exon> exon = *it;
339 if (last_exon) {
340 CRef<CSpliced_exon> first_exon = is_reverse ? exon : last_exon;
341 CRef<CSpliced_exon> second_exon = is_reverse ? last_exon : exon;
342 TSeqRange genomic(first_exon->GetGenomic_end(),
343 second_exon->GetGenomic_start());
344 TSeqRange product;
345 product.SetFrom(last_exon->GetProduct_end().AsSeqPos());
346 product.SetTo(exon->GetProduct_start().AsSeqPos());
347 s_UpdateSpans(product, genomic, align_info, row);
348 }
349 last_exon = exon;
350 }
351 }
352
353
354 // Retrieve a list of total range spans for an alignment
355 //
s_GetAlignmentSpans_Span(const CSeq_align & align,CAlignCompare::SAlignment & align_info,CAlignCompare::ERowComparison row)356 static void s_GetAlignmentSpans_Span(const CSeq_align& align,
357 CAlignCompare::SAlignment &align_info,
358 CAlignCompare::ERowComparison row)
359 {
360 s_UpdateSpans(align.GetSeqRange(0), align.GetSeqRange(1), align_info, row);
361 }
362
363 template<typename T>
s_PopulateScores(const CSeq_align & align,const vector<string> & score_list,vector<T> & scores,bool required=true)364 void s_PopulateScores(const CSeq_align &align,
365 const vector<string> &score_list,
366 vector<T> &scores,
367 bool required = true)
368 {
369 CScoreLookup lookup;
370 ITERATE (vector<string>, it, score_list) {
371 T value = 0;
372 try {
373 value = lookup.GetScore(align, *it);
374 } catch(CAlgoAlignUtilException &e) {
375 /// If scores are not required, use value of 0 for scores that were not found
376 if (required || e.GetErrCode() != CAlgoAlignUtilException::eScoreNotFound) {
377 throw;
378 }
379 }
380 scores.push_back(value);
381 }
382 }
383
s_PopulateScoreSet(const CSeq_align & align,const set<string> & score_set,bool score_set_as_blacklist,CAlignCompare::TIntegerScoreSet & integer_scores,CAlignCompare::TRealScoreSet & real_scores)384 static void s_PopulateScoreSet(const CSeq_align &align,
385 const set<string> &score_set,
386 bool score_set_as_blacklist,
387 CAlignCompare::TIntegerScoreSet &integer_scores,
388 CAlignCompare::TRealScoreSet &real_scores)
389 {
390 if (score_set_as_blacklist) {
391 ITERATE (CSeq_align::TScore, score_it, align.GetScore()) {
392 if ((*score_it)->GetId().IsStr() &&
393 !score_set.count((*score_it)->GetId().GetStr()))
394 {
395 if ((*score_it)->GetValue().IsInt()) {
396 integer_scores[(*score_it)->GetId().GetStr()] =
397 (*score_it)->GetValue().GetInt();
398 } else {
399 real_scores[(*score_it)->GetId().GetStr()] =
400 (*score_it)->GetValue().GetReal();
401 }
402 }
403 }
404 } else {
405 CScoreLookup lookup;
406 ITERATE (set<string>, score_it, score_set) {
407 double value = lookup.GetScore(align, *score_it);
408 if (lookup.IsIntegerScore(align, *score_it)) {
409 integer_scores[*score_it] = static_cast<int>(value);
410 } else {
411 real_scores[*score_it] = value;
412 }
413 }
414 }
415 }
416
s_EquivalentScores(const CAlignCompare::TRealScoreSet & scores1,const CAlignCompare::TRealScoreSet & scores2,double real_score_tolerance)417 static bool s_EquivalentScores(const CAlignCompare::TRealScoreSet &scores1,
418 const CAlignCompare::TRealScoreSet &scores2,
419 double real_score_tolerance)
420 {
421 for (CAlignCompare::TRealScoreSet::const_iterator it1 = scores1.begin(),
422 it2 = scores2.begin();
423 it1 != scores1.end() || it2 != scores2.end(); ++it1, ++it2)
424 {
425 if (it1 == scores1.end() || it2 == scores2.end()
426 || it1->first != it2->first)
427 {
428 /// The two don't have the same set of real-value scores
429 return false;
430 }
431 double allowed_diff = max(abs(it1->second), abs(it2->second))
432 * real_score_tolerance;
433 if (abs(it1->second - it2->second) > allowed_diff) {
434 return false;
435 }
436 }
437 return true;
438 }
439
440 //////////////////////////////////////////////////////////////////////////////
441 //
442 // SComparison constructor does the hard work of verifying that two sets of alignment
443 // span ranges actually overlap, and determines by how much
444 //
445 struct SComparison
446 {
447 SComparison();
448
449 SComparison(const CAlignCompare::SAlignment& first,
450 const CAlignCompare::SAlignment& second,
451 double real_score_tolerance);
452
453 size_t spans_in_common;
454 size_t spans_overlap;
455 Int8 spans_unique_first;
456 Int8 spans_unique_second;
457 bool is_equivalent;
458 float overlap;
459 };
460
SComparison()461 SComparison::SComparison()
462 : spans_in_common(0)
463 , spans_overlap(0)
464 , spans_unique_first(0)
465 , spans_unique_second(0)
466 , is_equivalent(false)
467 , overlap(0)
468 {
469 }
470
SComparison(const CAlignCompare::SAlignment & first,const CAlignCompare::SAlignment & second,double real_score_tolerance)471 SComparison::SComparison(const CAlignCompare::SAlignment& first,
472 const CAlignCompare::SAlignment& second,
473 double real_score_tolerance)
474 : spans_in_common(0)
475 , spans_overlap(0)
476 , spans_unique_first(0)
477 , spans_unique_second(0)
478 , is_equivalent(false)
479 , overlap(0)
480 {
481 if (first.CompareGroup(second, false) != 0) {
482 /// Alignments have different disambiguiting score values, can't be compared
483 return;
484 }
485
486 float dot = 0;
487 float sum_a = 0;
488 float sum_b = 0;
489
490 CAlignCompare::TAlignmentSpans::const_iterator first_it = first.spans.begin();
491 CAlignCompare::TAlignmentSpans::const_iterator second_it = second.spans.begin();
492 for ( ; first_it != first.spans.end() && second_it != second.spans.end(); ) {
493 if (*first_it == *second_it) {
494 TSeqPos intersecting_len = first_it->first.GetLength();
495 dot += float(intersecting_len) * float(intersecting_len);
496 sum_a += first_it->first.GetLength() * first_it->first.GetLength();
497 sum_b += second_it->first.GetLength() * second_it->first.GetLength();
498
499 spans_in_common += intersecting_len;
500 ++first_it;
501 ++second_it;
502 } else {
503 bool overlap =
504 first_it->first.IntersectingWith(second_it->first) &&
505 first_it->second.IntersectingWith(second_it->second);
506 TSeqPos intersecting_len = 0;
507 if (overlap) {
508 TSeqRange r = first_it->first;
509 r.IntersectWith(second_it->first);
510
511 intersecting_len = r.GetLength();
512 dot += float(intersecting_len) * float(intersecting_len);
513
514 spans_overlap += intersecting_len;
515 spans_unique_first -= intersecting_len;
516 spans_unique_second -= intersecting_len;
517 }
518 if (*first_it < *second_it) {
519 sum_a += first_it->first.GetLength() * first_it->first.GetLength();
520 spans_unique_first += first_it->first.GetLength();
521 ++first_it;
522 } else {
523 sum_b += second_it->first.GetLength() * second_it->first.GetLength();
524 spans_unique_second += second_it->first.GetLength();
525 ++second_it;
526 }
527 }
528 }
529 is_equivalent = spans_in_common == first.length &&
530 spans_in_common == second.length &&
531 first.query_mismatches == second.query_mismatches &&
532 first.subject_mismatches == second.subject_mismatches &&
533 first.integer_scores == second.integer_scores &&
534 s_EquivalentScores(first.real_scores, second.real_scores,
535 real_score_tolerance);
536 for ( ; first_it != first.spans.end(); ++first_it) {
537 sum_a += first_it->first.GetLength() * first_it->first.GetLength();
538 spans_unique_first += first_it->first.GetLength();
539 }
540 for ( ; second_it != second.spans.end(); ++second_it) {
541 sum_b += second_it->first.GetLength() * second_it->first.GetLength();
542 spans_unique_second += second_it->first.GetLength();
543 }
544
545 overlap = dot == 0 ? 0 : dot / ::sqrt(sum_a * sum_b);
546 }
547
548
549 struct SAlignment_PtrLess
550 {
operator ()SAlignment_PtrLess551 bool operator()(const CAlignCompare::SAlignment *ptr1,
552 const CAlignCompare::SAlignment *ptr2) const
553 {
554 const CAlignCompare::SAlignment& k1 = *ptr1;
555 const CAlignCompare::SAlignment& k2 = *ptr2;
556
557 if (k1.query < k2.query) { return true; }
558 if (k2.query < k1.query) { return false; }
559 if (k1.subject < k2.subject) { return true; }
560 if (k2.subject < k1.subject) { return false; }
561
562 if (k1.scores < k2.scores) { return true; }
563 if (k2.scores < k1.scores) { return false; }
564
565 if (k1.query_strand < k2.query_strand) { return true; }
566 if (k2.query_strand < k1.query_strand) { return false; }
567 if (k1.subject_strand < k2.subject_strand) { return true; }
568 if (k2.subject_strand < k1.subject_strand) { return false; }
569
570 if (k1.subject_range < k2.subject_range) { return true; }
571 if (k2.subject_range < k1.subject_range) { return false; }
572 if (k1.query_range < k2.query_range) { return true; }
573 if (k2.query_range < k1.query_range) { return false; }
574
575 return ptr1 < ptr2;
576 }
577 };
578
579 typedef set<CAlignCompare::SAlignment *, SAlignment_PtrLess> TAlignPtrSet;
580 typedef pair<CAlignCompare::SAlignment *, CAlignCompare::SAlignment *> TPtrPair;
581 typedef pair<TPtrPair, SComparison> TComp;
582
583 struct SComp_Less
584 {
585 bool strict_compare;
586
SComp_LessSComp_Less587 SComp_Less(bool strict = false)
588 : strict_compare(strict)
589 {
590 }
591
operator ()SComp_Less592 bool operator()(const TComp& c1, const TComp& c2) const
593 {
594 // strict comparison amounts to placing all identical pairs either before
595 // or after non-identical ones
596 // putting identical pairs first means that we evaluate the best examples first, and
597 // can establish equality without polluting the comparison with weaker
598 // alignments; non-strict means we combine weaker overlapping
599 // alignments together into equivalence groups with alignments that are
600 // identical
601 if (strict_compare) {
602 if (c1.second.is_equivalent && !c2.second.is_equivalent) {
603 return true;
604 }
605 if (c2.second.is_equivalent && !c1.second.is_equivalent) {
606 return false;
607 }
608 }
609 else {
610 if (c1.second.is_equivalent && !c2.second.is_equivalent) {
611 return false;
612 }
613 if (c2.second.is_equivalent && !c1.second.is_equivalent) {
614 return true;
615 }
616 }
617
618 if (c1.first.first->subject_range < c2.first.first->subject_range)
619 {
620 return false;
621 }
622 if (c2.first.first->subject_range < c1.first.first->subject_range)
623 {
624 return true;
625 }
626 return c1.first.second->query_range < c2.first.second->query_range;
627 }
628 };
629
630 CAlignCompare::SAlignment::
SAlignment(int s,const CRef<CSeq_align> & al,CAlignCompare & compare,bool is_slice)631 SAlignment(int s, const CRef<CSeq_align> &al, CAlignCompare &compare,
632 bool is_slice)
633 : source_set(s)
634 , query_strand(eNa_strand_unknown)
635 , subject_strand(eNa_strand_unknown)
636 , length(0)
637 , align(al)
638 , match_level(CAlignCompare::e_NoMatch)
639 , compare_object(compare)
640 {
641 try {
642 if (compare.m_Row != e_Subject) {
643 query = CSeq_id_Handle::GetHandle(align->GetSeq_id(0));
644 query_strand = align->GetSeqStrand(0);
645 query_range = align->GetSeqRange(0);
646 }
647 if (compare.m_Row != e_Query) {
648 subject = CSeq_id_Handle::GetHandle(align->GetSeq_id(1));
649 subject_strand = align->GetSeqStrand(1);
650 subject_range = align->GetSeqRange(1);
651 }
652 s_PopulateScores(*align, compare.m_DisambiguitingScores.first, scores.first);
653 s_PopulateScores(*align, compare.m_DisambiguitingScores.second, scores.second, false);
654 s_PopulateScores(*align, compare.m_QualityScores, quality_scores);
655 s_PopulateScoreSet(*align, compare.m_ScoreSet, compare.m_ScoreSetAsBlacklist,
656 integer_scores, real_scores);
657 switch (compare.m_Mode) {
658 case e_Full:
659 /// If this alignment was created by slicing an input alignment,
660 /// it doesn't have traceback data so mismatches can't be calculated
661 if (!is_slice) {
662 s_GetAlignmentMismatches(*align, *this, compare.m_Row);
663 }
664 // fall through
665
666 case e_Interval:
667 s_GetAlignmentSpans_Interval(*align, *this, compare.m_Row);
668 break;
669
670 case e_Exon:
671 s_GetAlignmentSpans_Exon(*align, *this, compare.m_Row);
672 break;
673
674 case e_Span:
675 s_GetAlignmentSpans_Span(*align, *this, compare.m_Row);
676 break;
677
678 case e_Intron:
679 s_GetAlignmentSpans_Intron(*align, *this, compare.m_Row);
680 break;
681 }
682 }
683 catch (CException& e) {
684 ERR_POST(Error << "alignment not processed: " << MSerial_AsnText << *align << e);
685 spans.clear();
686 }
687 ITERATE (TAlignmentSpans, it, spans) {
688 length += it->first.GetLength();
689 }
690 }
691
692 int CAlignCompare::SAlignment::
CompareGroup(const SAlignment & o,bool strict_only) const693 CompareGroup(const SAlignment &o, bool strict_only) const
694 {
695 if (query.AsString() < o.query.AsString()) { return -1; }
696 if (o.query.AsString() < query.AsString()) { return 1; }
697
698 if (subject.AsString() < o.subject.AsString()) { return -1; }
699 if (o.subject.AsString() < subject.AsString()) { return 1; }
700
701 if (scores.first < o.scores.first) { return -1; }
702 if (o.scores.first < scores.first) { return 1; }
703
704 if (strict_only) {
705 return 0;
706 }
707
708 for (unsigned score_index = 0; score_index < scores.second.size(); ++score_index) {
709 if (scores.second[score_index] && o.scores.second[score_index]) {
710 if (scores.second[score_index] < o.scores.second[score_index]) { return -1; }
711 if (o.scores.second[score_index] < scores.second[score_index]) { return 1; }
712 }
713 }
714
715 return 0;
716 }
717
x_DetermineNextGroupSet()718 int CAlignCompare::x_DetermineNextGroupSet()
719 {
720 if (m_NextSet1Group.empty()) {
721 if (m_Set1.EndOfData()) {
722 return 2;
723 } else {
724 m_NextSet1Group.push_back(x_NextAlignment(1));
725 }
726 }
727 if (m_NextSet2Group.empty()) {
728 if (m_Set2.EndOfData()) {
729 return 1;
730 } else {
731 m_NextSet2Group.push_back(x_NextAlignment(2));
732 }
733 }
734 int compare_group = m_NextSet1Group.front()
735 ->CompareGroup(*m_NextSet2Group.front(), true);
736 if (compare_group < 0) {
737 return 1;
738 } else if (compare_group > 0) {
739 return 2;
740 } else {
741 return 3;
742 }
743 }
744
x_GetCurrentGroup(int set)745 void CAlignCompare::x_GetCurrentGroup(int set)
746 {
747 IAlignSource &source = set == 1 ? m_Set1 : m_Set2;
748 list< AutoPtr<SAlignment> > ¤t_group =
749 set == 1 ? m_CurrentSet1Group : m_CurrentSet2Group;
750 list< AutoPtr<SAlignment> > &next_group =
751 set == 1 ? m_NextSet1Group : m_NextSet2Group;
752 current_group.clear();
753 current_group.splice(current_group.end(), next_group);
754 while (!source.EndOfData() && next_group.empty()) {
755 AutoPtr<SAlignment> align = x_NextAlignment(set);
756 if (current_group.empty() || align->CompareGroup(*current_group.front(), true) == 0)
757 {
758 current_group.push_back(align);
759 } else {
760 next_group.push_back(align);
761 }
762 }
763 }
764
PopulateBoundariesMap() const765 void CAlignCompare::SAlignment::PopulateBoundariesMap() const
766 {
767 ITERATE (TAlignmentSpans, it, spans) {
768 if (query) {
769 compare_object.m_BoundariesMap[query].insert(it->second.GetFrom());
770 compare_object.m_BoundariesMap[query].insert(it->second.GetToOpen());
771 }
772 if (subject) {
773 compare_object.m_BoundariesMap[subject].insert(it->first.GetFrom());
774 compare_object.m_BoundariesMap[subject].insert(it->first.GetToOpen());
775 }
776 }
777 }
778
779 list< AutoPtr<CAlignCompare::SAlignment> > CAlignCompare::SAlignment::
BreakOnBoundaries(int row) const780 BreakOnBoundaries(int row) const
781 {
782 list< AutoPtr<SAlignment> > align_parts;
783 const set<TSeqPos> &boundaries =
784 compare_object.m_BoundariesMap[row == 0 ? query : subject];
785 TSeqRange range = row == 0 ? query_range : subject_range;
786 TSeqPos last_boundary = range.GetFrom();
787 for (set<TSeqPos>::const_iterator it = boundaries.upper_bound(range.GetFrom());
788 it != boundaries.end() && *it <= range.GetToOpen(); ++it)
789 {
790 /// Extract slice, as long as it's not the the entire alignment
791 if (last_boundary > range.GetFrom() || *it < range.GetToOpen()) {
792 AutoPtr<SAlignment> part = Slice(row, last_boundary, *it-1);
793 if (part.get()) {
794 align_parts.push_back(part);
795 }
796 }
797 last_boundary = *it;
798 }
799 if (!align_parts.empty() && last_boundary < range.GetToOpen()) {
800 AutoPtr<SAlignment> part = Slice(row, last_boundary, range.GetTo());
801 if (part.get()) {
802 align_parts.push_back(part);
803 }
804 }
805 return align_parts;
806 }
807
808 AutoPtr<CAlignCompare::SAlignment> CAlignCompare::SAlignment::
Slice(int row,TSeqPos from,TSeqPos to) const809 Slice(int row, TSeqPos from, TSeqPos to) const
810 {
811 if (align->GetSegs().IsDisc()) {
812 vector< AutoPtr<SAlignment> > seg_slices;
813 TSeqRange slice_range(min(from, to), max(from, to));
814 ITERATE (CSeq_align_set::Tdata, seg_it, align->GetSegs().GetDisc().Get())
815 {
816 TSeqRange seg_slice_range =
817 slice_range & (*seg_it)->GetSeqRange(row);
818 if (seg_slice_range.Empty()) {
819 continue;
820 }
821 AutoPtr<SAlignment> slice =
822 SAlignment(source_set, *seg_it, compare_object) . Slice(
823 row, seg_slice_range.GetFrom(), seg_slice_range.GetTo());
824 if (slice.get()) {
825 seg_slices.push_back(slice);
826 }
827 }
828 AutoPtr<SAlignment> complete_slice;
829 if (seg_slices.size() == 1) {
830 complete_slice = seg_slices.front();
831 } else if (seg_slices.size() > 1) {
832 CRef<CSeq_align> complete_align(new CSeq_align);
833 complete_slice.reset(new SAlignment(source_set, complete_align,
834 compare_object, true));
835 ITERATE (vector< AutoPtr<SAlignment> >, seg_it, seg_slices) {
836 complete_align->SetSegs().SetDisc().Set().push_back(
837 (*seg_it)->align);
838 complete_slice->query_mismatches += (*seg_it)->query_mismatches;
839 complete_slice->subject_mismatches += (*seg_it)->query_mismatches;
840 }
841 }
842 return complete_slice;
843 }
844
845 if (!align->GetSegs().IsDenseg()) {
846 NCBI_THROW(CException, eUnknown,
847 "Alignment splitting supported only for Dense-seq and "
848 "Disc-seg alignments");
849 }
850 AutoPtr<SAlignment> slice;
851 CRef<CSeq_align> slice_align(new CSeq_align);
852 slice_align->SetType(align->GetType());
853 if (align->IsSetDim()) {
854 slice_align->SetDim(align->GetDim());
855 }
856 CRef<CDense_seg> slice_seg = align->GetSegs().GetDenseg()
857 .ExtractSlice(row, from, to);
858 bool all_gaps = true;
859 for (int seg = 0; seg < slice_seg->GetNumseg() && all_gaps; ++seg) {
860 if (slice_seg->GetStarts()[seg*2] >= 0 &&
861 slice_seg->GetStarts()[seg*2+1] >= 0)
862 {
863 all_gaps = false;
864 }
865 }
866 if (all_gaps) {
867 /// Slice is completely within gaps, so no alignment
868 return slice;
869 }
870 slice_align->SetSegs().SetDenseg(*slice_seg);
871 ITERATE (CSeq_align::TScore, score_it, align->GetScore()) {
872 if ((*score_it)->GetId().IsStr() &&
873 compare_object.m_DistributiveScores.count(
874 (*score_it)->GetId().GetStr()))
875 {
876 slice_align->SetScore().push_back(*score_it);
877 }
878 }
879 slice.reset(new SAlignment(source_set, slice_align, compare_object, true));
880 /// Special case for full mode; extract the mismatches positions from the
881 /// ones calculated for the full alignment
882 if (compare_object.m_Mode == e_Full) {
883 slice->query_mismatches = query_mismatches;
884 slice->query_mismatches &= slice_align->GetSeqRange(0);
885 slice->subject_mismatches = subject_mismatches;
886 slice->subject_mismatches &= slice_align->GetSeqRange(1);
887 }
888 return slice;
889 }
890
x_SplitOnOverlaps(int group,int row)891 void CAlignCompare::x_SplitOnOverlaps(int group, int row)
892 {
893 list< AutoPtr<SAlignment> > orig_set;
894 list< AutoPtr<SAlignment> > &transformed_set =
895 group == 1 ? m_CurrentSet1Group : m_CurrentSet2Group;
896 CSeq_id_Handle id = row == 0 ? transformed_set.front()->query
897 : transformed_set.front()->subject;
898 if (!id) {
899 return;
900 }
901 orig_set.swap(transformed_set);
902 ITERATE (list< AutoPtr<SAlignment> >, it, orig_set) {
903 list< AutoPtr<CAlignCompare::SAlignment> > parts =
904 (*it)->BreakOnBoundaries(row);
905 if (parts.empty()) {
906 transformed_set.push_back(*it);
907 } else {
908 transformed_set.splice(transformed_set.end(), parts);
909 }
910 }
911 }
912
PopulateBoundariesMap()913 void CAlignCompare::PopulateBoundariesMap()
914 {
915 while (!m_Set1.EndOfData()) {
916 x_NextAlignment(1, false)->PopulateBoundariesMap();
917 }
918 m_Set1.Reset();
919 while (!m_Set2.EndOfData()) {
920 x_NextAlignment(2, false)->PopulateBoundariesMap();
921 }
922 m_Set2.Reset();
923 }
924
NextGroup()925 vector<const CAlignCompare::SAlignment *> CAlignCompare::NextGroup()
926 {
927 int next_group_set = x_DetermineNextGroupSet();
928 if (next_group_set & 1) {
929 x_GetCurrentGroup(1);
930 }
931 if (next_group_set & 2) {
932 x_GetCurrentGroup(2);
933 }
934
935 vector<const SAlignment *> group;
936 switch (next_group_set) {
937 case 1:
938 if (!m_IgnoreNotPresent) {
939 m_CountOnlySet1 += m_CurrentSet1Group.size();
940 m_CountSplitSet1 += m_CurrentSet1Group.size();
941 ITERATE (list< AutoPtr<SAlignment> >, it, m_CurrentSet1Group) {
942 m_CountBasesOnlySet1 += (*it)->length;
943 group.push_back(&**it);
944 }
945 }
946 break;
947
948 case 2:
949 if (!m_IgnoreNotPresent) {
950 m_CountOnlySet2 += m_CurrentSet2Group.size();
951 m_CountSplitSet2 += m_CurrentSet2Group.size();
952 ITERATE (list< AutoPtr<SAlignment> >, it, m_CurrentSet2Group) {
953 m_CountBasesOnlySet2 += (*it)->length;
954 group.push_back(&**it);
955 }
956 }
957 break;
958
959 default:
960 {{
961 if (!m_BoundariesMap.empty()) {
962 x_SplitOnOverlaps(1, 0);
963 x_SplitOnOverlaps(1, 1);
964 x_SplitOnOverlaps(2, 0);
965 x_SplitOnOverlaps(2, 1);
966 m_CountSplitSet1 += m_CurrentSet1Group.size();
967 m_CountSplitSet2 += m_CurrentSet2Group.size();
968 }
969 TAlignPtrSet set1_aligns;
970 NON_CONST_ITERATE (list< AutoPtr<SAlignment> >, it, m_CurrentSet1Group)
971 {
972 set1_aligns.insert(&**it);
973 }
974
975 TAlignPtrSet set2_aligns;
976 NON_CONST_ITERATE (list< AutoPtr<SAlignment> >, it, m_CurrentSet2Group)
977 {
978 set2_aligns.insert(&**it);
979 }
980
981 set<SAlignment const*> red_color; // alignments from set2 that have equivalent mate
982 // from opposite set.
983 vector<TComp> comparisons;
984
985 ITERATE (TAlignPtrSet, set1_it, set1_aligns) {
986 SAlignment const* lhs = *set1_it;
987
988 ITERATE (TAlignPtrSet, set2_it, set2_aligns) {
989 SAlignment const* rhs = *set2_it;
990 // Check for equivalent alignment.
991 // In strict mode we do not combine overlapping and equiv. alignments.
992 if ( m_Strict && red_color.count(rhs) > 0 ) {
993 continue;
994 }
995
996 if ( false == s_IsOverlapping(lhs, rhs, m_Row) ) {
997 continue;
998 }
999 // Check for overlap.
1000 comparisons.push_back(TComp(TPtrPair(const_cast<SAlignment*>(lhs), const_cast<SAlignment*>(rhs)),
1001 SComparison(*lhs, *rhs,
1002 m_RealScoreTolerance)));
1003
1004 // Post-processing:
1005 // - if two alignments are equivalent
1006 // -- color both alignments in red,
1007 // -- break out of the loop.
1008 TComp const& record = comparisons.back();
1009 SComparison const& comp = record.second;
1010 if ( comp.is_equivalent ) {
1011 red_color.insert(rhs); // set2's alignment.
1012 // In strict mode we do not combine overlapping and equiv. alignments.
1013 if ( m_Strict ) {
1014 break;
1015 }
1016 }
1017 }
1018 }
1019
1020 std::sort(comparisons.begin(), comparisons.end(), SComp_Less(m_Strict));
1021
1022 typedef pair<TAlignPtrSet, EMatchLevel> TAlignGroup;
1023
1024 list<TAlignGroup> groups;
1025 map<const SAlignment *, list<TAlignGroup>::iterator> group_map;
1026
1027 ITERATE (vector<TComp>, it, comparisons) {
1028 bool is_equivalent = it->second.is_equivalent;
1029 /// This comparison counts if the two alignments are equivalent, or
1030 /// they overlap and we haven't yet seen an equivalence for either
1031 if (is_equivalent ||
1032 (it->second.overlap > 0 &&
1033 it->first.first->match_level != e_Equiv &&
1034 it->first.second->match_level != e_Equiv))
1035 {
1036 list<TAlignGroup>::iterator align1_group = groups.end(),
1037 align2_group = groups.end();
1038 if (set1_aligns.erase(it->first.first)) {
1039 it->first.first->match_level =
1040 is_equivalent ? e_Equiv : e_Overlap;
1041 group.push_back(it->first.first);
1042 if (is_equivalent) {
1043 ++m_CountEquivSet1;
1044 m_CountBasesEquivSet1 += it->second.spans_in_common;
1045 } else {
1046 ++m_CountOverlapSet1;
1047 m_CountBasesOverlapSet1 += it->second.spans_in_common
1048 + it->second.spans_overlap;
1049 m_CountBasesOnlySet1 += it->second.spans_unique_first;
1050 }
1051 } else {
1052 align1_group = group_map[it->first.first];
1053 }
1054 if (set2_aligns.erase(it->first.second)) {
1055 it->first.second->match_level =
1056 is_equivalent ? e_Equiv : e_Overlap;
1057 group.push_back(it->first.second);
1058 if (is_equivalent) {
1059 ++m_CountEquivSet2;
1060 m_CountBasesEquivSet2 += it->second.spans_in_common;
1061 } else {
1062 ++m_CountOverlapSet2;
1063 m_CountBasesOverlapSet2 += it->second.spans_in_common
1064 + it->second.spans_overlap;
1065 m_CountBasesOnlySet2 += it->second.spans_unique_second;
1066 }
1067 } else {
1068 align2_group = group_map[it->first.second];
1069 }
1070 if (align1_group == groups.end() &&
1071 align2_group == groups.end())
1072 {
1073 /// Neither alignemnts was encountered before, so create
1074 /// new group
1075 list<TAlignGroup>::iterator new_group =
1076 groups.insert(groups.end(), TAlignGroup());
1077 new_group->first.insert(it->first.first);
1078 new_group->first.insert(it->first.second);
1079 new_group->second = it->first.first->match_level;
1080 group_map[it->first.first] = new_group;
1081 group_map[it->first.second] = new_group;
1082 ++(is_equivalent ? m_CountEquivGroups
1083 : m_CountOverlapGroups);
1084 } else if(align1_group == groups.end()) {
1085 /// alignment 1 is new, add it to existing group
1086 align2_group->first.insert(it->first.first);
1087 group_map[it->first.first] = align2_group;
1088 } else if(align2_group == groups.end()) {
1089 /// alignment 2 is new, add it to existing group
1090 align1_group->first.insert(it->first.second);
1091 group_map[it->first.second] = align1_group;
1092 } else if (align1_group != align2_group) {
1093 /// The alignments are in two separate groups; merge them
1094 ITERATE (TAlignPtrSet, group2_it, align2_group->first) {
1095 align1_group->first.insert(*group2_it);
1096 group_map[*group2_it] = align1_group;
1097 }
1098 if (align2_group->second == e_Overlap) {
1099 --m_CountOverlapGroups;
1100 if (align1_group->second == e_Equiv) {
1101 /// Change the group from equivalence to overlap
1102 align1_group->second = e_Overlap;
1103 ++m_CountOverlapGroups;
1104 --m_CountEquivGroups;
1105 }
1106 } else {
1107 --m_CountEquivGroups;
1108 }
1109 groups.erase(align2_group);
1110 }
1111 }
1112 }
1113
1114 ITERATE (list<TAlignGroup>, group_it, groups) {
1115 if (group_it->second == e_NoMatch) {
1116 continue;
1117 }
1118 if (group_it->second == e_Overlap && !m_QualityScores.empty())
1119 {
1120 /// Find which side is better
1121 vector<SAlignment *> best(3, static_cast<SAlignment *>(NULL));
1122 ITERATE (TAlignPtrSet, align_it, group_it->first) {
1123 SAlignment *&side_best = best[(*align_it)->source_set];
1124 if (!side_best || (*align_it)->quality_scores >
1125 side_best->quality_scores)
1126 {
1127 side_best = *align_it;
1128 }
1129 }
1130 if (best[1]->quality_scores != best[2]->quality_scores) {
1131 int better_side =
1132 best[1]->quality_scores > best[2]->quality_scores
1133 ? 1 : 2;
1134 ITERATE (TAlignPtrSet, align_it, group_it->first) {
1135 (*align_it)->match_level =
1136 (*align_it)->source_set == better_side
1137 ? e_OverlapBetter : e_OverlapWorse;
1138 }
1139 }
1140 }
1141 ITERATE (TAlignPtrSet, align1_it, group_it->first) {
1142 if ((*align1_it)->source_set != 1) {
1143 continue;
1144 }
1145 ITERATE (TAlignPtrSet, align2_it, group_it->first) {
1146 if ((*align2_it)->source_set != 2) {
1147 continue;
1148 }
1149 (*align1_it)->matched_alignments.push_back(*align2_it);
1150 (*align2_it)->matched_alignments.push_back(*align1_it);
1151 }
1152 }
1153 }
1154
1155 /// Add remaining alignments, for which no match was found, in order
1156 /// of their appearance in alignment comparisons
1157 m_CountOnlySet1 += set1_aligns.size();
1158 m_CountOnlySet2 += set2_aligns.size();
1159 ITERATE (vector<TComp>, comp_it, comparisons) {
1160 if (set1_aligns.empty() && set2_aligns.empty()) {
1161 /// Found best comparison for all alignments
1162 break;
1163 }
1164 if (comp_it->second.overlap == 0) {
1165 if (set1_aligns.erase(comp_it->first.first)) {
1166 group.push_back(comp_it->first.first);
1167 m_CountBasesOnlySet1 += comp_it->first.first->length;
1168 }
1169 if (set2_aligns.erase(comp_it->first.second)) {
1170 group.push_back(comp_it->first.second);
1171 m_CountBasesOnlySet2 += comp_it->first.second->length;
1172 }
1173 }
1174 }
1175 ITERATE (TAlignPtrSet, set1_it, set1_aligns) {
1176 group.push_back(*set1_it);
1177 m_CountBasesOnlySet1 += (*set1_it)->length;
1178 }
1179 ITERATE (TAlignPtrSet, set2_it, set2_aligns) {
1180 group.push_back(*set2_it);
1181 m_CountBasesOnlySet2 += (*set2_it)->length;
1182 }
1183 }}
1184 }
1185
1186 return group;
1187 }
1188
s_IsOverlapping(CAlignCompare::SAlignment const * lhs,CAlignCompare::SAlignment const * rhs,CAlignCompare::ERowComparison row)1189 bool s_IsOverlapping(CAlignCompare::SAlignment const* lhs, CAlignCompare::SAlignment const* rhs, CAlignCompare::ERowComparison row)
1190 {
1191 bool overlap;
1192
1193 switch ( row ) {
1194 case CAlignCompare::e_Query:
1195 overlap = lhs->query_range.IntersectingWith(rhs->query_range);
1196 break;
1197
1198 case CAlignCompare::e_Subject:
1199 overlap = lhs->subject_range.IntersectingWith(rhs->subject_range);
1200 break;
1201
1202 default: //CAlignCompare::e_Both:
1203 if ( false == lhs->subject_range.IntersectingWith(rhs->subject_range) &&
1204 false == lhs->query_range.IntersectingWith(rhs->query_range) )
1205 {
1206 overlap = false;
1207 }
1208 else {
1209 overlap = true;
1210 }
1211 break;
1212 }
1213 return overlap;
1214 }
1215
1216 END_NCBI_SCOPE
1217