1 /*  $Id: unit_test_alnmgr.cpp 601341 2020-02-05 20:27:33Z vasilche $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author:  Aleksey Grichenko
27 *
28 * File Description:
29 *   Alignment Manager unit test.
30 *
31 * ===========================================================================
32 */
33 
34 #include <ncbi_pch.hpp>
35 
36 #include <corelib/ncbiapp.hpp>
37 #include <corelib/test_boost.hpp>
38 
39 #include <serial/objistr.hpp>
40 #include <objects/seqloc/Seq_id.hpp>
41 #include <objects/general/Object_id.hpp>
42 
43 #include <objtools/alnmgr/aln_user_options.hpp>
44 #include <objtools/alnmgr/aln_asn_reader.hpp>
45 #include <objtools/alnmgr/aln_container.hpp>
46 #include <objtools/alnmgr/aln_tests.hpp>
47 #include <objtools/alnmgr/aln_stats.hpp>
48 #include <objtools/alnmgr/pairwise_aln.hpp>
49 #include <objtools/alnmgr/aln_serial.hpp>
50 #include <objtools/alnmgr/sparse_aln.hpp>
51 #include <objtools/alnmgr/aln_converters.hpp>
52 #include <objtools/alnmgr/aln_builders.hpp>
53 #include <objtools/alnmgr/sparse_ci.hpp>
54 #include <objtools/alnmgr/aln_generators.hpp>
55 
56 #include <objtools/data_loaders/genbank/gbloader.hpp>
57 #include <objmgr/object_manager.hpp>
58 #include <objmgr/scope.hpp>
59 
60 #include <common/test_assert.h>  /* This header must go last */
61 
62 USING_NCBI_SCOPE;
63 USING_SCOPE(objects);
64 
65 
66 
NCBITEST_INIT_CMDLINE(descrs)67 NCBITEST_INIT_CMDLINE(descrs)
68 {
69     descrs->AddFlag("print_exp", "Print expected values instead of testing");
70     descrs->AddFlag("verbose", "Print detailed test progress");
71 }
72 
73 
DumpExpected(void)74 bool DumpExpected(void)
75 {
76     return CNcbiApplication::Instance()->GetArgs()["print_exp"];
77 }
78 
79 
Verbose(void)80 bool Verbose(void)
81 {
82     return CNcbiApplication::Instance()->GetArgs()["verbose"];
83 }
84 
85 
GetScope(void)86 CScope& GetScope(void)
87 {
88     static CRef<CScope> s_Scope;
89     if ( !s_Scope ) {
90         CRef<CObjectManager> objmgr = CObjectManager::GetInstance();
91         CGBDataLoader::RegisterInObjectManager(*objmgr);
92 
93         s_Scope.Reset(new CScope(*objmgr));
94         s_Scope->AddDefaults();
95     }
96     return *s_Scope;
97 }
98 
99 
100 // Helper function for loading alignments. Returns number of alignments loaded
101 // into the container. The expected format is:
102 // <int-number-of-alignments>
103 // <Seq-align>+
LoadAligns(CNcbiIstream & in,CAlnContainer & aligns,size_t limit=0)104 size_t LoadAligns(CNcbiIstream& in,
105                   CAlnContainer& aligns,
106                   size_t limit = 0)
107 {
108     size_t num_aligns = 0;
109     while ( !in.eof() ) {
110         try {
111             CRef<CSeq_align> align(new CSeq_align);
112             in >> MSerial_AsnText >> *align;
113             align->Validate(true);
114             aligns.insert(*align);
115             num_aligns++;
116             if (limit > 0  &&  num_aligns >= limit) break;
117         }
118         catch (CIOException e) {
119             break;
120         }
121     }
122     return num_aligns;
123 }
124 
125 
126 // The aln_id_map argument is required because TScopeAlnStat keeps const
127 // reference to the id-map. So, the map needs to be returned to the caller.
InitAlnStats(const CAlnContainer & aligns,auto_ptr<TScopeAlnIdMap> & aln_id_map)128 CRef<TScopeAlnStats> InitAlnStats(const CAlnContainer& aligns,
129                                   auto_ptr<TScopeAlnIdMap>& aln_id_map)
130 {
131     TScopeAlnSeqIdConverter id_conv(&GetScope());
132     TScopeIdExtract id_extract(id_conv);
133     aln_id_map.reset(new TScopeAlnIdMap(id_extract, aligns.size()));
134     ITERATE(CAlnContainer, aln_it, aligns) {
135         aln_id_map->push_back(**aln_it);
136     }
137     CRef<TScopeAlnStats> aln_stats(new TScopeAlnStats(*aln_id_map));
138     return aln_stats;
139 }
140 
141 
CheckPairwiseAln(CNcbiIstream & in_exp,const CPairwiseAln & pw,size_t aln_idx,int anchor,int row)142 void CheckPairwiseAln(CNcbiIstream&       in_exp,
143                       const CPairwiseAln& pw,
144                       size_t aln_idx,
145                       int anchor,
146                       int row)
147 {
148     // For each alignment/anchor/row:
149     // <int-align-index> <int-anchor_index> <int-row-index> <int-pairwise-flags>
150     //   <int-align-first-from> <int-align-first-to> <int-base-width> <Seq-id>
151     //   <int-align-second-from> <int-align-second-to> <int-base-width> <Seq-id>
152     //     For each CPairwiseAln element:
153     //     <int-first-from> <int-second-from> <int-len> <int-is-direct>
154     //   <int-insertions-count>
155     //     For each insertion:
156     //     <int-first-from> <int-second-from> <int-len> <int-is-direct>
157     //   For each iterator:
158     //   <int-first-from> <int-first-to> <int-second-from> <int-second-to> ...
159     //   ... <int-direct> <int-first-direct> <int-aligned>
160 
161     TSignedSeqPos first_from, first_to, second_from, second_to, len;
162     int flags;
163 
164     if ( DumpExpected() ) {
165         cout << aln_idx << " " << anchor << " " << row << " " << pw.GetFlags() << endl;
166         cout << "  " << pw.GetFirstFrom() << " "
167             << pw.GetFirstTo() << " "
168             << pw.GetFirstBaseWidth() << " "
169             << MSerial_AsnText << pw.GetFirstId()->GetSeqId();
170         cout << "  " << pw.GetSecondPosByFirstPos(pw.GetFirstFrom()) << " "
171             << pw.GetSecondPosByFirstPos(pw.GetFirstTo()) << " "
172             << pw.GetSecondBaseWidth() << " "
173             << MSerial_AsnText << pw.GetSecondId()->GetSeqId();
174     }
175     else {
176         if ( Verbose() ) {
177             cout << "  aln=" << aln_idx << ", anchor=" << anchor << ", row=" << row << endl;
178         }
179         size_t expected_aln_idx;
180         int expected_row, expected_anchor;
181         int first_width, second_width;
182         CSeq_id first_id, second_id;
183         in_exp >> expected_aln_idx >> expected_anchor >> expected_row >> flags;
184         in_exp >> first_from >> first_to >> first_width;
185         in_exp >> MSerial_AsnText >> first_id;
186         in_exp >> second_from >> second_to >> second_width;
187         in_exp >> MSerial_AsnText >> second_id;
188         BOOST_CHECK(in_exp.good());
189         BOOST_CHECK(aln_idx == expected_aln_idx);
190         BOOST_CHECK(anchor == expected_anchor);
191         BOOST_CHECK(row == expected_row);
192         BOOST_CHECK(pw.GetFlags() == flags);
193         BOOST_CHECK(pw.GetFirstId()->GetSeqId().Equals(first_id));
194         BOOST_CHECK(pw.GetSecondId()->GetSeqId().Equals(second_id));
195         BOOST_CHECK(pw.GetFirstBaseWidth() == first_width);
196         BOOST_CHECK(pw.GetSecondBaseWidth() == second_width);
197         BOOST_CHECK(pw.GetFirstFrom() == first_from);
198         BOOST_CHECK(pw.GetFirstTo() == first_to);
199         BOOST_CHECK(pw.GetSecondPosByFirstPos(pw.GetFirstFrom()) == second_from);
200         BOOST_CHECK(pw.GetSecondPosByFirstPos(pw.GetFirstTo()) == second_to);
201     }
202 
203     ITERATE(CPairwiseAln, rg_it, pw) {
204         CAlignRange<TSignedSeqPos> rg = *rg_it;
205 
206         if ( DumpExpected() ) {
207             cout << "    " << rg_it->GetFirstFrom() << " "
208                 << rg_it->GetSecondFrom() << " "
209                 << rg_it->GetLength() << " "
210                 << (rg.IsDirect() ? 1 : 0) << endl;
211         }
212         else {
213             in_exp >> first_from >> second_from >> len >> flags;
214             BOOST_CHECK(in_exp.good());
215             BOOST_CHECK(rg_it->GetFirstFrom() == first_from);
216             BOOST_CHECK(rg_it->GetSecondFrom() == second_from);
217             BOOST_CHECK(rg_it->GetLength() == len);
218             BOOST_CHECK(rg_it->IsDirect() == (flags != 0));
219         }
220     }
221 
222     if ( DumpExpected() ) {
223         cout << "  " << pw.GetInsertions().size() << endl;
224     }
225     else {
226         size_t ins_count;
227         in_exp >> ins_count;
228         BOOST_CHECK(in_exp.good());
229         BOOST_CHECK(pw.GetInsertions().size() == ins_count);
230     }
231 
232     if (!pw.GetInsertions().empty()) {
233         ITERATE(CPairwiseAln::TInsertions, gap, pw.GetInsertions()) {
234             if ( DumpExpected() ) {
235                 cout << "    " << gap->GetFirstFrom() << " "
236                     << gap->GetSecondFrom() << " "
237                     << gap->GetLength() << " "
238                     << (gap->IsDirect() ? 1 : 0) << endl;
239             }
240             else {
241                 in_exp >> first_from >> second_from >> len >> flags;
242                 BOOST_CHECK(in_exp.good());
243                 BOOST_CHECK(gap->GetFirstFrom() == first_from);
244                 BOOST_CHECK(gap->GetSecondFrom() == second_from);
245                 BOOST_CHECK(gap->GetLength() == len);
246                 BOOST_CHECK(gap->IsDirect() == (flags != 0));
247             }
248         }
249     }
250 }
251 
252 
CheckSparseAln(CNcbiIstream & in_exp,const CSparseAln & sparse,bool check_sequence)253 void CheckSparseAln(CNcbiIstream&     in_exp,
254                     const CSparseAln& sparse,
255                     bool              check_sequence)
256 {
257     // Format:
258     // <dim> <numrows> <anchor row>
259     // <aln-from> <aln-to>
260     // For each row:
261     // <row> <width> <seq-id>
262     // <aln-from> <aln_to> <seq-from> <seq-to> <native-from> <native-to>
263     // [<aln-sequence>]
264     // [<row-sequence>]
265 
266     static const char* kGap = "<GAP>";
267     static const char* kNoData = "<NO SEQUENCE DATA>";
268 
269     TSignedSeqPos expected_aln_from, expected_aln_to;
270 
271     if ( DumpExpected() ) {
272         cout << sparse.GetDim() << " "
273             << sparse.GetNumRows() << " "
274             << sparse.GetAnchor() << endl;
275         cout << sparse.GetAlnRange().GetFrom() << " "
276             << sparse.GetAlnRange().GetTo() << endl;
277     }
278     else {
279         int expected_dim, expected_numrows, expected_anchor;
280         in_exp >> expected_dim >> expected_numrows >> expected_anchor;
281         in_exp >> expected_aln_from >> expected_aln_to;
282         BOOST_CHECK(in_exp.good());
283         BOOST_CHECK(sparse.GetDim() == expected_dim);
284         BOOST_CHECK(sparse.GetNumRows() == expected_numrows);
285         BOOST_CHECK(sparse.GetAnchor() == expected_anchor);
286         BOOST_CHECK(sparse.GetAlnRange().GetFrom() == expected_aln_from);
287         BOOST_CHECK(sparse.GetAlnRange().GetTo() == expected_aln_to);
288     }
289 
290     for (CSparseAln::TDim row = 0; row < sparse.GetDim(); ++row) {
291         if ( DumpExpected() ) {
292             cout << row << " "
293                 << sparse.GetBaseWidth(row) << " "
294                 << MSerial_AsnText << sparse.GetSeqId(row);
295         }
296         else {
297             int expected_row, expected_width;
298             CSeq_id expected_id;
299             in_exp >> expected_row >> expected_width >> MSerial_AsnText >> expected_id;
300             BOOST_CHECK(in_exp.good());
301             BOOST_CHECK(row == expected_row);
302             BOOST_CHECK(sparse.GetBaseWidth(row) == expected_width);
303             BOOST_CHECK(sparse.GetSeqId(row).Equals(expected_id));
304         }
305 
306         CSparseAln::TRange rg = sparse.GetSeqRange(row);
307         CSparseAln::TRange native_rg = sparse.AlnRangeToNativeSeqRange(row, rg);
308 
309         TSeqPos expected_seq_from, expected_seq_to,
310             expected_native_from, expected_native_to;
311 
312         if ( DumpExpected() ) {
313             cout << sparse.GetSeqAlnStart(row) << " " << sparse.GetSeqAlnStop(row) << " "
314                 << rg.GetFrom() << " " << rg.GetTo() << " "
315                 << native_rg.GetFrom() << " " << native_rg.GetTo() << endl;
316         }
317         else {
318             in_exp >> expected_aln_from >> expected_aln_to
319                 >> expected_seq_from >> expected_seq_to
320                 >> expected_native_from >> expected_native_to;
321             BOOST_CHECK(in_exp.good());
322             BOOST_CHECK(sparse.GetSeqAlnStart(row) == expected_aln_from);
323             BOOST_CHECK(sparse.GetSeqAlnStop(row) == expected_aln_to);
324             BOOST_CHECK(rg.GetFrom() == expected_seq_from);
325             BOOST_CHECK(rg.GetTo() == expected_seq_to);
326             BOOST_CHECK(native_rg.GetFrom() == expected_native_from);
327             BOOST_CHECK(native_rg.GetTo() == expected_native_to);
328         }
329 
330         if ( check_sequence ) {
331             string aln_sequence, row_sequence;
332             sparse.GetAlnSeqString(row, aln_sequence, CSparseAln::TSignedRange::GetWhole());
333             sparse.GetSeqString(row, row_sequence, CSparseAln::TRange::GetWhole());
334             if ( aln_sequence.empty() ) {
335                 aln_sequence = kNoData;
336             }
337             if ( row_sequence.empty() ) {
338                 row_sequence = kNoData;
339             }
340 
341             if ( DumpExpected() ) {
342                 cout << aln_sequence << endl;
343                 cout << row_sequence << endl;
344             }
345             else {
346                 if ( Verbose() ) {
347                     cout << "  whole row=" << row << endl;
348                 }
349                 string expected_aln_sequence, expected_row_sequence;
350                 in_exp >> ws;
351                 getline(in_exp, expected_aln_sequence);
352                 getline(in_exp, expected_row_sequence);
353                 BOOST_CHECK(in_exp.good());
354                 BOOST_CHECK(aln_sequence == expected_aln_sequence);
355                 BOOST_CHECK(row_sequence == expected_row_sequence);
356             }
357         }
358 
359         CSparse_CI sparse_ci(sparse, row, CSparse_CI::eAllSegments);
360         for (; sparse_ci; ++sparse_ci) {
361             const IAlnSegment& seg = *sparse_ci;
362             CSparse_CI::TSignedRange aln_rg = seg.GetAlnRange();
363             CSparse_CI::TSignedRange seq_rg = seg.GetRange();
364 
365             if ( DumpExpected() ) {
366                 cout << "  " << seg.GetType() << " "
367                     << aln_rg.GetFrom() << " " << aln_rg.GetTo() << " "
368                     << seq_rg.GetFrom() << " " << seq_rg.GetTo() << endl;
369             }
370             else {
371                 if ( Verbose() ) {
372                     cout << "  segment: row=" << row << ", range=" <<
373                         aln_rg.GetFrom() << ".." << aln_rg.GetTo() << endl;
374                 }
375                 unsigned expected_seg_type;
376                 TSignedSeqPos expected_seq_from, expected_seq_to;
377 
378                 in_exp >> expected_seg_type
379                     >> expected_aln_from >> expected_aln_to
380                     >> expected_seq_from >> expected_seq_to;
381                 BOOST_CHECK(in_exp.good());
382                 BOOST_CHECK(seg.GetType() == expected_seg_type);
383                 BOOST_CHECK(aln_rg.GetFrom() == expected_aln_from);
384                 BOOST_CHECK(aln_rg.GetTo() == expected_aln_to);
385                 BOOST_CHECK(seq_rg.GetFrom() == expected_seq_from);
386                 BOOST_CHECK(seq_rg.GetTo() == expected_seq_to);
387             }
388 
389             if ( check_sequence ) {
390                 string aln_sequence, row_sequence;
391                 string expected_aln_sequence, expected_row_sequence;
392 
393                 if ( !aln_rg.Empty() ) {
394                     sparse.GetAlnSeqString(row, aln_sequence, aln_rg);
395                     if ( aln_sequence.empty() ) {
396                         aln_sequence = kNoData;
397                     }
398                 }
399                 else {
400                     aln_sequence = kGap;
401                 }
402                 if ( !seq_rg.Empty() ) {
403                     sparse.GetSeqString(row, row_sequence,
404                         IAlnExplorer::TRange(seq_rg.GetFrom(), seq_rg.GetTo()));
405                     if ( row_sequence.empty() ) {
406                         row_sequence = kNoData;
407                     }
408                 }
409                 else {
410                     row_sequence = kGap;
411                 }
412 
413                 if ( DumpExpected() ) {
414                     cout << aln_sequence << endl;
415                     cout << row_sequence << endl;
416                 }
417                 else {
418                     in_exp >> ws;
419                     getline(in_exp, expected_aln_sequence);
420                     getline(in_exp, expected_row_sequence);
421                     BOOST_CHECK(in_exp.good());
422                     BOOST_CHECK(aln_sequence == expected_aln_sequence);
423                     BOOST_CHECK(row_sequence == expected_row_sequence);
424                 }
425             }
426         }
427     }
428 }
429 
430 
BOOST_AUTO_TEST_CASE(s_TestLoadAlign)431 BOOST_AUTO_TEST_CASE(s_TestLoadAlign)
432 {
433     cout << "Test CAlnContainer and CAlnStats... (aligns1.asn / expected01.txt)" << endl;
434 
435     CNcbiIfstream in_data("data/aligns1.asn");
436     CNcbiIfstream in_exp("data/expected01.txt");
437     // File format:
438     // <int-number-of-alignments>
439     // For each seq-id:
440     // <int-base-width> <Seq-id>
441 
442     CAlnContainer aligns;
443     size_t num_aligns = LoadAligns(in_data, aligns);
444 
445     if ( DumpExpected() ) {
446         cout << num_aligns << endl;
447     }
448     else {
449         size_t expected_num_aligns;
450         in_exp >> expected_num_aligns;
451         BOOST_CHECK(num_aligns == expected_num_aligns);
452     }
453 
454     auto_ptr<TScopeAlnIdMap> aln_id_map;
455     CRef<TScopeAlnStats> aln_stats = InitAlnStats(aligns, aln_id_map);
456 
457     if ( !DumpExpected() ) {
458         BOOST_CHECK(aln_stats->GetAlnCount() == num_aligns);
459         BOOST_CHECK(aln_stats->CanBeAnchored());
460     }
461 
462     ITERATE(TScopeAlnStats::TIdVec, id_it, aln_stats->GetIdVec()) {
463         if ( DumpExpected() ) {
464             cout << (*id_it)->GetBaseWidth() << " ";
465             cout << MSerial_AsnText << (*id_it)->GetSeqId();
466         }
467         else {
468             int width;
469             CSeq_id id;
470             in_exp >> width;
471             in_exp >> MSerial_AsnText >> id;
472             BOOST_CHECK(id.Equals((*id_it)->GetSeqId()));
473             BOOST_CHECK((*id_it)->GetBaseWidth() == width);
474         }
475     }
476 }
477 
478 
BOOST_AUTO_TEST_CASE(s_TestPairwiseAln)479 BOOST_AUTO_TEST_CASE(s_TestPairwiseAln)
480 {
481     cout << "Test CPairwiseAln... (aligns1.asn / expected02.txt)" << endl;
482 
483     CNcbiIfstream in_data("data/aligns1.asn");
484     CNcbiIfstream in_exp("data/expected02.txt");
485 
486     // File format:
487     // <int-number-of-alignments>
488     // see CheckPairwiseAln
489 
490     CAlnContainer aligns;
491     size_t num_aligns = LoadAligns(in_data, aligns);
492 
493     if ( DumpExpected() ) {
494         cout << num_aligns << endl;
495     }
496     else {
497         size_t expected_num_aligns;
498         in_exp >> expected_num_aligns;
499         BOOST_CHECK(num_aligns == expected_num_aligns);
500     }
501 
502     auto_ptr<TScopeAlnIdMap> aln_id_map;
503     CRef<TScopeAlnStats> aln_stats = InitAlnStats(aligns, aln_id_map);
504 
505     const TScopeAlnStats::TAlnVec& aln_vec = aln_stats->GetAlnVec();
506 
507     // For each source alignment create a set of pairwise alignments.
508     for (size_t aln_idx = 0; aln_idx < aln_vec.size(); ++aln_idx) {
509         const CSeq_align& aln = *aln_vec[aln_idx];
510         // Use row 0 as anchor, get pairwise alignments with all other rows.
511         const TScopeAlnStats::TIdVec& aln_ids = aln_stats->GetSeqIdsForAln(aln_idx);
512         // Test two different anchor values.
513         for (int anchor = 0; anchor < 2; anchor++) {
514             TAlnSeqIdIRef anchor_id = aln_ids[anchor];
515             for (int row = 0; row < aln_stats->GetDimForAln(aln_idx); ++row) {
516                 if (row == anchor) continue;
517                 TAlnSeqIdIRef row_id = aln_ids[row];
518                 CPairwiseAln pw(anchor_id, row_id, CPairwiseAln::fDefaultPolicy);
519                 ConvertSeqAlignToPairwiseAln(pw, aln, anchor, row,
520                     CAlnUserOptions::eBothDirections, &aln_ids);
521 
522                 CheckPairwiseAln(in_exp, pw, aln_idx, anchor, row);
523 
524                 for (CPairwise_CI seg(pw); seg; ++seg) {
525                     if ( DumpExpected() ) {
526                         cout << "  " << seg.GetFirstRange().GetFrom() << " "
527                             << seg.GetFirstRange().GetTo() << " "
528                             << seg.GetSecondRange().GetFrom() << " "
529                             << seg.GetSecondRange().GetTo() << " "
530                             << (seg.IsDirect() ? 1 : 0) << " "
531                             << (seg.IsFirstDirect() ? 1 : 0) << " "
532                             << (seg.GetSegType() == CPairwise_CI::eAligned ? 1 : 0) << endl;
533                     }
534                     else {
535                         int direct, first_direct, aligned;
536                         TSignedSeqPos first_from, first_to, second_from, second_to;
537                         in_exp >> first_from >> first_to >> second_from >> second_to >>
538                             direct >> first_direct >> aligned;
539                         BOOST_CHECK(in_exp.good());
540                         BOOST_CHECK(seg.GetFirstRange().GetFrom() == first_from);
541                         BOOST_CHECK(seg.GetFirstRange().GetTo() == first_to);
542                         BOOST_CHECK(seg.GetSecondRange().GetFrom() == second_from);
543                         BOOST_CHECK(seg.GetSecondRange().GetTo() == second_to);
544                         BOOST_CHECK(seg.IsDirect() == (direct != 0));
545                         BOOST_CHECK(seg.IsFirstDirect() == (first_direct != 0));
546                         BOOST_CHECK((seg.GetSegType() == CPairwise_CI::eAligned) == (aligned != 0));
547                     }
548                 }
549             }
550         }
551     }
552 
553 }
554 
555 
BOOST_AUTO_TEST_CASE(s_TestPairwiseAlnRanged)556 BOOST_AUTO_TEST_CASE(s_TestPairwiseAlnRanged)
557 {
558     cout << "Test ranged CPairwise_CI... (aligns1.asn / expected03.txt)" << endl;
559 
560     CNcbiIfstream in_data("data/aligns1.asn");
561     CNcbiIfstream in_exp("data/expected03.txt");
562 
563     // File format:
564     // <int-number-of-alignments>
565     //
566     // For each alignment/anchor/row:
567     // <int-align-index> <int-anchor_index> <int-row-index> <int-pairwise-flags>
568     //   <int-align-first-from> <int-align-first-to> <int-base-width> <Seq-id>
569     //   <int-align-second-from> <int-align-second-to> <int-base-width> <Seq-id>
570     //     For each CPairwiseAln element:
571     //     <int-first-from> <int-second-from> <int-len> <int-is-direct>
572     //   <int-insertions-count>
573     //     For each insertion:
574     //     <int-first-from> <int-second-from> <int-len> <int-is-direct>
575     //   For each iterator:
576     //   <int-first-from> <int-first-to> <int-second-from> <int-second-to> ...
577     //   ... <int-direct> <int-first-direct> <int-aligned>
578 
579     CAlnContainer aligns;
580     size_t num_aligns = LoadAligns(in_data, aligns);
581 
582     if ( DumpExpected() ) {
583         cout << num_aligns << endl;
584     }
585     else {
586         size_t expected_num_aligns;
587         in_exp >> expected_num_aligns;
588         BOOST_CHECK(num_aligns == expected_num_aligns);
589     }
590 
591     auto_ptr<TScopeAlnIdMap> aln_id_map;
592     CRef<TScopeAlnStats> aln_stats = InitAlnStats(aligns, aln_id_map);
593 
594     const TScopeAlnStats::TAlnVec& aln_vec = aln_stats->GetAlnVec();
595 
596     // For each source alignment create a set of pairwise alignments.
597     for (size_t aln_idx = 0; aln_idx < aln_vec.size(); ++aln_idx) {
598         const CSeq_align& aln = *aln_vec[aln_idx];
599         // Use row 0 as anchor, get pairwise alignments with all other rows.
600         const TScopeAlnStats::TIdVec& aln_ids = aln_stats->GetSeqIdsForAln(aln_idx);
601         // Test two different anchor values.
602         for (int anchor = 0; anchor < 2; anchor++) {
603             TAlnSeqIdIRef anchor_id = aln_ids[anchor];
604             for (int row = 0; row < aln_stats->GetDimForAln(aln_idx); ++row) {
605                 if (row == anchor) continue;
606                 TAlnSeqIdIRef row_id = aln_ids[row];
607                 CPairwiseAln pw(anchor_id, row_id, CPairwiseAln::fDefaultPolicy);
608                 ConvertSeqAlignToPairwiseAln(pw, aln, anchor, row,
609                     CAlnUserOptions::eBothDirections, &aln_ids);
610 
611                 TSignedSeqPos first_from, first_to, second_from, second_to;
612 
613                 if ( DumpExpected() ) {
614                     cout << aln_idx << " " << anchor << " " << row << " " << pw.GetFlags() << endl;
615                     cout << "  " << pw.GetFirstFrom() << " "
616                         << pw.GetFirstTo() << " "
617                         << pw.GetFirstBaseWidth() << " "
618                         << MSerial_AsnText << pw.GetFirstId()->GetSeqId();
619                     cout << "  " << pw.GetSecondPosByFirstPos(pw.GetFirstFrom()) << " "
620                         << pw.GetSecondPosByFirstPos(pw.GetFirstTo()) << " "
621                         << pw.GetSecondBaseWidth() << " "
622                         << MSerial_AsnText << pw.GetSecondId()->GetSeqId();
623                 }
624                 else {
625                     if ( Verbose() ) {
626                         cout << "  aln=" << aln_idx << ", anchor=" << anchor << ", row=" << row << endl;
627                     }
628                     size_t expected_aln_idx;
629                     int expected_row, expected_anchor;
630                     int first_width, second_width, flags;
631                     CSeq_id first_id, second_id;
632                     in_exp >> expected_aln_idx >> expected_anchor >> expected_row >> flags;
633                     in_exp >> first_from >> first_to >> first_width;
634                     in_exp >> MSerial_AsnText >> first_id;
635                     in_exp >> second_from >> second_to >> second_width;
636                     in_exp >> MSerial_AsnText >> second_id;
637                     BOOST_CHECK(in_exp.good());
638                     BOOST_CHECK(aln_idx == expected_aln_idx);
639                     BOOST_CHECK(anchor == expected_anchor);
640                     BOOST_CHECK(row == expected_row);
641                     BOOST_CHECK(pw.GetFlags() == flags);
642                     BOOST_CHECK(pw.GetFirstId()->GetSeqId().Equals(first_id));
643                     BOOST_CHECK(pw.GetSecondId()->GetSeqId().Equals(second_id));
644                     BOOST_CHECK(pw.GetFirstBaseWidth() == first_width);
645                     BOOST_CHECK(pw.GetSecondBaseWidth() == second_width);
646                     BOOST_CHECK(pw.GetFirstFrom() == first_from);
647                     BOOST_CHECK(pw.GetFirstTo() == first_to);
648                     BOOST_CHECK(pw.GetSecondPosByFirstPos(pw.GetFirstFrom()) == second_from);
649                     BOOST_CHECK(pw.GetSecondPosByFirstPos(pw.GetFirstTo()) == second_to);
650                 }
651 
652                 if (pw.size() < 2) continue;
653 
654                 first_from = (pw.begin()+1)->GetFirstFrom() + 1;
655                 first_to = pw.rbegin()->GetFirstFrom() - 2;
656                 CPairwise_CI::TSignedRange rg(first_from, first_to);
657 
658                 for (CPairwise_CI seg(pw, rg); seg; ++seg) {
659                     if ( DumpExpected() ) {
660                         cout << "  " << seg.GetFirstRange().GetFrom() << " "
661                             << seg.GetFirstRange().GetTo() << " "
662                             << seg.GetSecondRange().GetFrom() << " "
663                             << seg.GetSecondRange().GetTo() << " "
664                             << (seg.IsDirect() ? 1 : 0) << " "
665                             << (seg.IsFirstDirect() ? 1 : 0) << " "
666                             << (seg.GetSegType() == CPairwise_CI::eAligned ? 1 : 0) << endl;
667                     }
668                     else {
669                         int direct, first_direct, aligned;
670                         in_exp >> first_from >> first_to >> second_from >> second_to >>
671                             direct >> first_direct >> aligned;
672                         BOOST_CHECK(in_exp.good());
673                         BOOST_CHECK(seg.GetFirstRange().GetFrom() == first_from);
674                         BOOST_CHECK(seg.GetFirstRange().GetTo() == first_to);
675                         BOOST_CHECK(seg.GetSecondRange().GetFrom() == second_from);
676                         BOOST_CHECK(seg.GetSecondRange().GetTo() == second_to);
677                         BOOST_CHECK(seg.IsDirect() == (direct != 0));
678                         BOOST_CHECK(seg.IsFirstDirect() == (first_direct != 0));
679                         BOOST_CHECK((seg.GetSegType() == CPairwise_CI::eAligned) == (aligned != 0));
680                     }
681                 }
682             }
683         }
684     }
685 }
686 
687 
BOOST_AUTO_TEST_CASE(s_TestAnchoredAln)688 BOOST_AUTO_TEST_CASE(s_TestAnchoredAln)
689 {
690     cout << "Test CAnchoredAln... (aligns1.asn / expected04.txt)" << endl;
691 
692     CNcbiIfstream in_data("data/aligns1.asn");
693     CNcbiIfstream in_exp("data/expected04.txt");
694 
695     CAlnContainer aligns;
696     size_t num_aligns = LoadAligns(in_data, aligns);
697 
698     if ( DumpExpected() ) {
699         cout << num_aligns << endl;
700     }
701     else {
702         size_t expected_num_aligns;
703         in_exp >> expected_num_aligns;
704         BOOST_CHECK(num_aligns == expected_num_aligns);
705     }
706 
707     auto_ptr<TScopeAlnIdMap> aln_id_map;
708     CRef<TScopeAlnStats> aln_stats = InitAlnStats(aligns, aln_id_map);
709 
710     for (size_t aln_idx = 0; aln_idx < aligns.size(); ++aln_idx) {
711         for (int anchor = 0; anchor < aln_stats->GetDimForAln(aln_idx); ++anchor) {
712             CAlnUserOptions user_options;
713             CRef<CAnchoredAln> anchored_aln = CreateAnchoredAlnFromAln(*aln_stats,
714                 aln_idx, user_options, anchor);
715 
716             for (int row = 0; row < (int)anchored_aln->GetPairwiseAlns().size(); ++row) {
717                 CheckPairwiseAln(in_exp,
718                     *anchored_aln->GetPairwiseAlns()[row],
719                     aln_idx, anchored_aln->GetAnchorRow(), row);
720             }
721         }
722     }
723 }
724 
725 
BOOST_AUTO_TEST_CASE(s_TestAnchoredAlnBuilt)726 BOOST_AUTO_TEST_CASE(s_TestAnchoredAlnBuilt)
727 {
728     static const int kMergeFlags[] = {
729         0,
730         CAlnUserOptions::fTruncateOverlaps,
731         CAlnUserOptions::fAllowTranslocation,
732         CAlnUserOptions::fUseAnchorAsAlnSeq,
733         CAlnUserOptions::fAnchorRowFirst,
734         CAlnUserOptions::fIgnoreInsertions
735     };
736 
737     cout << "Test built CAnchoredAln... (aligns2.asn / expected05.txt)" << endl;
738 
739     CNcbiIfstream in_data("data/aligns2.asn");
740     CNcbiIfstream in_exp("data/expected05.txt");
741 
742     int setnum = 0;
743     while (!in_data.eof()  &&  in_data.good()) {
744         ++setnum;
745         size_t num_to_merge;
746         in_data >> num_to_merge;
747         if (num_to_merge == 0  ||  !in_data.good()) break;
748 
749         CAlnContainer aligns;
750         size_t num_aligns = LoadAligns(in_data, aligns, num_to_merge);
751 
752         if ( DumpExpected() ) {
753             cout << num_aligns << endl;
754         }
755         else {
756             size_t expected_num_aligns;
757             in_exp >> expected_num_aligns;
758             BOOST_CHECK(num_aligns == expected_num_aligns);
759         }
760 
761         auto_ptr<TScopeAlnIdMap> aln_id_map;
762         CRef<TScopeAlnStats> aln_stats = InitAlnStats(aligns, aln_id_map);
763 
764         const TScopeAlnStats::TIdVec& aln_ids = aln_stats->GetAnchorIdVec();
765         for (size_t anchor_idx = 0; anchor_idx < aln_ids.size(); anchor_idx++) {
766             TAlnSeqIdIRef anchor_id = aln_ids[anchor_idx];
767             CAlnUserOptions user_options;
768             user_options.SetAnchorId(anchor_id);
769 
770             for (size_t flags_idx = 0; flags_idx < sizeof(kMergeFlags)/sizeof(kMergeFlags[0]); flags_idx++) {
771                 user_options.m_MergeFlags = kMergeFlags[flags_idx];
772 
773                 if ( DumpExpected() ) {
774                     cout << anchor_idx << " "
775                         << user_options.m_MergeFlags << endl;
776                 }
777                 else {
778                     if ( Verbose() ) {
779                         cout << "  set=" << setnum << ", anchor=" << anchor_idx << " (" <<
780                             anchor_id->GetSeqId().AsFastaString() << ")" <<
781                             ", flags_idx=" << flags_idx << endl;
782                     }
783                     size_t expected_anchor_idx;
784                     int expected_flags;
785                     in_exp >> expected_anchor_idx >> expected_flags;
786                     BOOST_CHECK(in_exp.good());
787                     BOOST_CHECK(anchor_idx == expected_anchor_idx);
788                     BOOST_CHECK(user_options.m_MergeFlags == expected_flags);
789                 }
790 
791                 TAnchoredAlnVec anchored_aln_vec;
792                 CreateAnchoredAlnVec(*aln_stats, anchored_aln_vec, user_options);
793 
794                 CRef<CSeq_id> id(new CSeq_id);
795                 id->SetLocal().SetStr("pseudo-id");
796                 TAlnSeqIdIRef pseudo_id(Ref(new CAlnSeqId(*id)));
797                 CAnchoredAln built_aln;
798                 BuildAln(anchored_aln_vec, built_aln, user_options, pseudo_id);
799 
800                 for (int row = 0; row < (int)built_aln.GetPairwiseAlns().size(); ++row) {
801                     CheckPairwiseAln(in_exp,
802                         *built_aln.GetPairwiseAlns()[row],
803                         0, built_aln.GetAnchorRow(), row);
804                 }
805             }
806         }
807     }
808 }
809 
810 
BOOST_AUTO_TEST_CASE(s_TestSparseAlnCoords)811 BOOST_AUTO_TEST_CASE(s_TestSparseAlnCoords)
812 {
813     cout << "Test CSparseAln coordinates... (aligns2.asn / expected06.txt)" << endl;
814 
815     CNcbiIfstream in_data("data/aligns2.asn");
816     CNcbiIfstream in_exp("data/expected06.txt");
817 
818     int setnum = 0;
819     while (!in_data.eof()  &&  in_data.good()) {
820         ++setnum;
821         size_t num_to_merge;
822         in_data >> num_to_merge;
823         if (num_to_merge == 0  ||  !in_data.good()) break;
824 
825         CAlnContainer aligns;
826         size_t num_aligns = LoadAligns(in_data, aligns, num_to_merge);
827 
828         if ( DumpExpected() ) {
829             cout << num_aligns << endl;
830         }
831         else {
832             size_t expected_num_aligns;
833             in_exp >> expected_num_aligns;
834             BOOST_CHECK(num_aligns == expected_num_aligns);
835         }
836 
837         auto_ptr<TScopeAlnIdMap> aln_id_map;
838         CRef<TScopeAlnStats> aln_stats = InitAlnStats(aligns, aln_id_map);
839 
840         const TScopeAlnStats::TIdVec& aln_ids = aln_stats->GetAnchorIdVec();
841         for (size_t anchor_idx = 0; anchor_idx < aln_ids.size(); anchor_idx++) {
842             TAlnSeqIdIRef anchor_id = aln_ids[anchor_idx];
843             CAlnUserOptions user_options;
844             user_options.SetAnchorId(anchor_id);
845             user_options.m_MergeFlags = CAlnUserOptions::fTruncateOverlaps;
846 
847             if ( DumpExpected() ) {
848                 cout << anchor_idx << " "
849                     << MSerial_AsnText << anchor_id->GetSeqId();
850             }
851             else {
852                 if ( Verbose() ) {
853                     cout << "  set=" << setnum << ", anchor=" << anchor_idx << " (" <<
854                         anchor_id->GetSeqId().AsFastaString() << ")" << endl;
855                 }
856                 size_t expected_anchor_idx;
857                 in_exp >> expected_anchor_idx;
858                 CSeq_id expected_anchor_id;
859                 in_exp >> MSerial_AsnText >> expected_anchor_id;
860                 BOOST_CHECK(in_exp.good());
861                 BOOST_CHECK(anchor_idx == expected_anchor_idx);
862                 BOOST_CHECK(anchor_id->GetSeqId().Equals(expected_anchor_id));
863             }
864 
865             TAnchoredAlnVec anchored_aln_vec;
866             CreateAnchoredAlnVec(*aln_stats, anchored_aln_vec, user_options);
867 
868             CRef<CSeq_id> id(new CSeq_id);
869             id->SetLocal().SetStr("pseudo-id");
870             TAlnSeqIdIRef pseudo_id(Ref(new CAlnSeqId(*id)));
871             CAnchoredAln built_aln;
872             BuildAln(anchored_aln_vec, built_aln, user_options, pseudo_id);
873 
874             CSparseAln sparse_aln(built_aln, GetScope());
875 
876             CheckSparseAln(in_exp, sparse_aln, false);
877         }
878     }
879 }
880 
881 
BOOST_AUTO_TEST_CASE(s_TestSparseAlnData)882 BOOST_AUTO_TEST_CASE(s_TestSparseAlnData)
883 {
884     cout << "Test CSparseAln sequence... (aligns3.asn / expected07.txt)" << endl;
885 
886     CNcbiIfstream in_data("data/aligns3.asn");
887     CNcbiIfstream in_exp("data/expected07.txt");
888 
889     int setnum = 0;
890     while (!in_data.eof()  &&  in_data.good()) {
891         ++setnum;
892         size_t num_to_merge;
893         in_data >> num_to_merge;
894         if (num_to_merge == 0  ||  !in_data.good()) break;
895 
896         CAlnContainer aligns;
897         size_t num_aligns = LoadAligns(in_data, aligns, num_to_merge);
898 
899         if ( DumpExpected() ) {
900             cout << num_aligns << endl;
901         }
902         else {
903             size_t expected_num_aligns;
904             in_exp >> expected_num_aligns;
905             BOOST_CHECK(num_aligns == expected_num_aligns);
906         }
907 
908         auto_ptr<TScopeAlnIdMap> aln_id_map;
909         CRef<TScopeAlnStats> aln_stats = InitAlnStats(aligns, aln_id_map);
910 
911         const TScopeAlnStats::TIdVec& aln_ids = aln_stats->GetAnchorIdVec();
912         for (size_t anchor_idx = 0; anchor_idx < aln_ids.size(); anchor_idx++) {
913             TAlnSeqIdIRef anchor_id = aln_ids[anchor_idx];
914             CAlnUserOptions user_options;
915             user_options.SetAnchorId(anchor_id);
916             user_options.m_MergeFlags = CAlnUserOptions::fTruncateOverlaps;
917 
918             if ( DumpExpected() ) {
919                 cout << anchor_idx << " "
920                     << MSerial_AsnText << anchor_id->GetSeqId();
921             }
922             else {
923                 if ( Verbose() ) {
924                     cout << "  set=" << setnum << ", anchor=" << anchor_idx << " (" <<
925                         anchor_id->GetSeqId().AsFastaString() << ")" << endl;
926                 }
927                 size_t expected_anchor_idx;
928                 in_exp >> expected_anchor_idx;
929                 CSeq_id expected_anchor_id;
930                 in_exp >> MSerial_AsnText >> expected_anchor_id;
931                 BOOST_CHECK(in_exp.good());
932                 BOOST_CHECK(anchor_idx == expected_anchor_idx);
933                 BOOST_CHECK(anchor_id->GetSeqId().Equals(expected_anchor_id));
934             }
935 
936             TAnchoredAlnVec anchored_aln_vec;
937             CreateAnchoredAlnVec(*aln_stats, anchored_aln_vec, user_options);
938 
939             CRef<CSeq_id> id(new CSeq_id);
940             id->SetLocal().SetStr("pseudo-id");
941             TAlnSeqIdIRef pseudo_id(Ref(new CAlnSeqId(*id)));
942             CAnchoredAln built_aln;
943             BuildAln(anchored_aln_vec, built_aln, user_options, pseudo_id);
944 
945             CSparseAln sparse_aln(built_aln, GetScope());
946 
947             CheckSparseAln(in_exp, sparse_aln, true);
948 
949             CRef<CSeq_align> aln = CreateSeqAlignFromAnchoredAln(built_aln, CSeq_align::TSegs::e_Denseg, &GetScope());
950             BOOST_CHECK(aln);
951 
952             if ( DumpExpected() ) {
953                 cout << MSerial_AsnText << *aln;
954             }
955             else {
956                 CSeq_align expected_aln;
957                 in_exp >> MSerial_AsnText >> expected_aln;
958                 BOOST_CHECK(aln->Equals(expected_aln));
959             }
960         }
961     }
962 }
963 
BOOST_AUTO_TEST_CASE(s_TestAlnTypes)964 BOOST_AUTO_TEST_CASE(s_TestAlnTypes)
965 {
966     cout << "Test seq-align types... (aligns4.asn / expected08.txt)" << endl;
967 
968     CNcbiIfstream in_data("data/aligns4.asn");
969     CNcbiIfstream in_exp("data/expected08.txt");
970 
971     CAlnContainer aligns;
972     size_t num_aligns = LoadAligns(in_data, aligns);
973 
974     if ( DumpExpected() ) {
975         cout << num_aligns << endl;
976     }
977     else {
978         size_t expected_num_aligns;
979         in_exp >> expected_num_aligns;
980         BOOST_CHECK(num_aligns == expected_num_aligns);
981     }
982 
983     auto_ptr<TScopeAlnIdMap> aln_id_map;
984     CRef<TScopeAlnStats> aln_stats = InitAlnStats(aligns, aln_id_map);
985 
986     for (size_t aln_idx = 0; aln_idx < aligns.size(); ++aln_idx) {
987         for (int anchor = 0; anchor < aln_stats->GetDimForAln(aln_idx); ++anchor) {
988 
989             // Special case - sparse-segs can be anchored only to the first row.
990             if (aln_stats->GetAlnVec()[aln_idx]->GetSegs().IsSparse()  &&  anchor > 0) {
991                 break;
992             }
993 
994             CAlnUserOptions user_options;
995             CRef<CAnchoredAln> anchored_aln = CreateAnchoredAlnFromAln(*aln_stats,
996                 aln_idx, user_options, anchor);
997 
998             for (int row = 0; row < (int)anchored_aln->GetPairwiseAlns().size(); ++row) {
999                 CheckPairwiseAln(in_exp,
1000                     *anchored_aln->GetPairwiseAlns()[row],
1001                     aln_idx, anchored_aln->GetAnchorRow(), row);
1002             }
1003         }
1004     }
1005 }
1006