1 /*  $Id: seqalignfilter_unit_test.cpp 637484 2021-09-13 18:27:59Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors:  Vahram Avagyan
27  *
28  * File Description:
29  *   CSeqAlignFilter unit test.
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 
35 #include <objects/seqloc/Seq_loc.hpp>
36 #include <objects/seqalign/Seq_align_set.hpp>
37 #include <objects/seqalign/Seq_align.hpp>
38 #include <objects/seqalign/Score.hpp>
39 #include <objects/general/Object_id.hpp>
40 
41 #include <algo/blast/api/seqinfosrc_seqdb.hpp>
42 #include <algo/blast/api/blast_seqinfosrc_aux.hpp>
43 
44 #include <objtools/blast/seqdb_reader/seqdb.hpp>
45 #include <objtools/align_format/seqalignfilter.hpp>
46 
47 #include <serial/serial.hpp>
48 #include <serial/objistr.hpp>
49 #include <serial/objostr.hpp>
50 #include <serial/iterator.hpp>
51 #include <sstream>
52 #undef NCBI_BOOST_NO_AUTO_TEST_MAIN
53 #include <corelib/test_boost.hpp>
54 
55 #ifndef SKIP_DOXYGEN_PROCESSING
56 
57 USING_NCBI_SCOPE;
58 USING_SCOPE(objects);
59 USING_SCOPE(align_format);
60 
61 template<class ASNOBJ>
s_Stringify(const ASNOBJ & a,string & s)62 void s_Stringify(const ASNOBJ & a, string & s)
63 {
64     CNcbiOstrstream oss;
65     oss << MSerial_AsnText << a;
66     s = CNcbiOstrstreamToString(oss);
67 }
68 
69 template<class ASNOBJ>
s_Unstringify(const string & s,ASNOBJ & a)70 void s_Unstringify(const string & s, ASNOBJ & a)
71 {
72     istringstream iss;
73     iss.str(s);
74     iss >> MSerial_AsnText >> a;
75 }
76 
77 template<class ASNOBJ>
s_Duplicate(const ASNOBJ & a)78 CRef<ASNOBJ> s_Duplicate(const ASNOBJ & a)
79 {
80     CRef<ASNOBJ> newobj(new ASNOBJ);
81 
82     string s;
83     s_Stringify(a, s);
84     s_Unstringify(s, *newobj);
85 
86     return newobj;
87 }
88 
89 /////////////////////////////////////////////////////////////////////////////
90 // List-based static helper functions
91 
s_GetUseThisGiEntries(CRef<CSeq_align> sa,list<TGi> & list_gis)92 static void s_GetUseThisGiEntries(CRef<CSeq_align> sa, list<TGi>& list_gis)
93 {
94     list_gis.clear();
95 
96     CSeq_align::TScore& score_entries = sa->SetScore();
97     CSeq_align::TScore::iterator iter_score = score_entries.begin();
98     while (iter_score != score_entries.end())
99     {
100         CRef<CScore> score_entry = *iter_score++;
101         if (score_entry->CanGetId() && score_entry->GetId().IsStr())
102         {
103             string str_id = score_entry->GetId().GetStr();
104             if (str_id == "use_this_gi")
105             {
106                 bool bIsLegalGiEntry = score_entry->CanGetValue() && score_entry->GetValue().IsInt();
107                 BOOST_REQUIRE(bIsLegalGiEntry);
108                 Uint4 gi_v = (Uint4) (score_entry->GetValue().GetInt());
109                 list_gis.push_back(GI_FROM(Uint4, gi_v));
110             }
111         }
112     }
113 }
114 
s_GetAlignedSeqGi(CRef<CSeq_align> sa)115 static TGi s_GetAlignedSeqGi(CRef<CSeq_align> sa)
116 {
117     CConstRef<CSeq_id> id(&(sa->GetSeq_id(1)));
118 
119     BOOST_REQUIRE(id->IsGi());
120     return id->GetGi();
121 }
122 
s_GetFullGiList(CRef<CSeq_align> sa,list<TGi> & list_gis)123 static void s_GetFullGiList(CRef<CSeq_align> sa, list<TGi>& list_gis)
124 {
125     s_GetUseThisGiEntries(sa, list_gis);
126     list_gis.push_back(s_GetAlignedSeqGi(sa));
127 }
128 
s_IsGiInList(TGi gi,list<TGi> & list_gis)129 static bool s_IsGiInList(TGi gi, list<TGi>& list_gis)
130 {
131     return find(list_gis.begin(), list_gis.end(), gi) != list_gis.end();
132 }
133 
s_IsListSubset(list<TGi> & list_all,list<TGi> & list_sub)134 static bool s_IsListSubset(list<TGi>& list_all, list<TGi>& list_sub)
135 {
136     bool is_missing = false;
137 
138     list<TGi>::iterator it;
139     for (it = list_sub.begin(); it != list_sub.end() && !is_missing; it++)
140     {
141         is_missing = !s_IsGiInList(*it, list_all);
142     }
143 
144     return !is_missing;
145 }
146 
s_AreListsEqual(list<TGi> & list1,list<TGi> & list2)147 static bool s_AreListsEqual(list<TGi>& list1, list<TGi>& list2)
148 {
149     return s_IsListSubset(list1, list2) && s_IsListSubset(list2, list1);
150 }
151 
152 /////////////////////////////////////////////////////////////////////////////
153 // Vector-based static helper functions
154 
s_IsGiInVector(TGi gi,vector<TGi> & vec_gis)155 static bool s_IsGiInVector(TGi gi, vector<TGi>& vec_gis)
156 {
157     return binary_search(vec_gis.begin(), vec_gis.end(), gi);
158 }
159 
s_GetFilteredGiList(CRef<CSeq_align> sa,vector<TGi> & vec_all_gis,list<TGi> & list_sa_filtered)160 static bool s_GetFilteredGiList(CRef<CSeq_align> sa, vector<TGi>& vec_all_gis,
161                                list<TGi>& list_sa_filtered)
162 {
163     list<TGi> list_sa_full;
164     s_GetFullGiList(sa, list_sa_full);
165 
166     for (list<TGi>::iterator it = list_sa_full.begin();
167          it != list_sa_full.end(); it++)
168     {
169         if (s_IsGiInVector(*it, vec_all_gis))
170         {
171             list_sa_filtered.push_back(*it);
172         }
173     }
174 
175     return !list_sa_filtered.empty();
176 }
177 
178 /////////////////////////////////////////////////////////////////////////////
179 // Functions to test filtering results for individual seqaligns
180 
181 static void
s_Check_GiListConsistency(CRef<CSeq_align>,CRef<CSeq_align> sa_new,list<TGi> & list_orig_filtered,list<TGi> & list_new_filtered)182 s_Check_GiListConsistency(CRef<CSeq_align> /*sa_orig*/,
183                           CRef<CSeq_align> sa_new,
184                           list<TGi>& list_orig_filtered,
185                           list<TGi>& list_new_filtered)
186 {
187     list<TGi> list_new;
188     s_GetFullGiList(sa_new, list_new);
189 
190     BOOST_REQUIRE(s_AreListsEqual(list_new, list_new_filtered));    // new list is indeed filtered
191     BOOST_REQUIRE(s_IsListSubset(list_new, list_orig_filtered));    // all original gi's who survived filtering
192                                                             // are included in the new list
193 }
194 
s_Check_GiEquivalenceInDB(TGi gi1,TGi gi2,CRef<CSeqDB> db)195 static void s_Check_GiEquivalenceInDB(TGi gi1, TGi gi2, CRef<CSeqDB> db)
196 {
197     int oid1 = -1, oid2 = -1;
198     db->GiToOid(gi1, oid1);
199     db->GiToOid(gi2, oid2);
200 
201     BOOST_REQUIRE(oid1 > 0);
202     BOOST_REQUIRE(oid2 > 0);
203     BOOST_REQUIRE(oid1 == oid2);
204 }
205 
206 /////////////////////////////////////////////////////////////////////////////
207 // Pre-processing and testing individual seqaligns
208 
s_DoConsistencyCheck(CRef<CSeq_align> sa_orig,CRef<CSeq_align> sa_new,vector<TGi> & vec_all_gis)209 static void s_DoConsistencyCheck(CRef<CSeq_align> sa_orig, CRef<CSeq_align> sa_new,
210                                         vector<TGi>& vec_all_gis)
211 {
212     list<TGi> list_orig_filtered;
213     list<TGi> list_new, list_new_filtered;
214 
215     s_GetFilteredGiList(sa_orig, vec_all_gis, list_orig_filtered);
216     s_GetFilteredGiList(sa_new, vec_all_gis, list_new_filtered);
217 
218     s_Check_GiListConsistency(sa_orig, sa_new,
219                             list_orig_filtered, list_new_filtered);
220 }
221 
s_DoEquivalenceCheck(CRef<CSeq_align> sa_new,CRef<CSeqDB> db)222 static void s_DoEquivalenceCheck(CRef<CSeq_align> sa_new, CRef<CSeqDB> db)
223 {
224     TGi main_gi = s_GetAlignedSeqGi(sa_new);
225 
226     list<TGi> list_extra_gis;
227     s_GetUseThisGiEntries(sa_new, list_extra_gis);
228 
229     for (list<TGi>::iterator it_extra_gi = list_extra_gis.begin();
230          it_extra_gi != list_extra_gis.end(); it_extra_gi++)
231     {
232         s_Check_GiEquivalenceInDB(main_gi, *it_extra_gi, db);
233     }
234 }
235 
236 /////////////////////////////////////////////////////////////////////////////
237 // Other pre-processing
238 
s_LoadSeqAlignsFromFile(CSeq_align_set & aln_all,const string & fname)239 static void s_LoadSeqAlignsFromFile(CSeq_align_set& aln_all, const string& fname)
240 {
241     auto_ptr<CObjectIStream> asn_in(CObjectIStream::Open(fname, eSerial_AsnText));
242     *asn_in >> aln_all;
243 }
244 
245 /////////////////////////////////////////////////////////////////////////////
246 // Actual test cases
247 
248 BOOST_AUTO_TEST_SUITE(seqalignfilter)
249 
BOOST_AUTO_TEST_CASE(s_TestSimpleFiltering)250 BOOST_AUTO_TEST_CASE(s_TestSimpleFiltering)
251 {
252     string fname_in = "data/in_test.txt";
253     string fname_out = "data/out_test.txt";
254     string fname_gis = "data/gilist_test.txt";
255 
256 	CSeq_align_set aln_all;
257     s_LoadSeqAlignsFromFile(aln_all, fname_in);
258 
259     CSeqAlignFilter filter;
260     filter.FilterSeqaligns(fname_in, fname_out, fname_gis);
261 
262     CSeq_align_set aln_filtered;
263     s_LoadSeqAlignsFromFile(aln_filtered, fname_out);
264 
265     list<TGi> list_gis;
266     filter.ReadGiList(fname_gis, list_gis);
267 
268     ITERATE(CSeq_align_set::Tdata, iter, aln_all.Get())
269     {
270         TGi gi = s_GetAlignedSeqGi(*iter);
271         if (s_IsGiInList(gi, list_gis))
272         {
273             bool found_gi = false;
274             ITERATE(CSeq_align_set::Tdata, iter_filtered, aln_filtered.Get())
275             {
276                 TGi gi_filtered = s_GetAlignedSeqGi(*iter_filtered);
277                 if (gi == gi_filtered)
278                 {
279                     found_gi = true;
280                     break;
281                 }
282             }
283             BOOST_REQUIRE(found_gi);
284         }
285     }
286 }
287 
BOOST_AUTO_TEST_CASE(s_TestDBBasedFiltering)288 BOOST_AUTO_TEST_CASE(s_TestDBBasedFiltering)
289 {
290     string fname_in = "data/in_test.txt";
291     string fname_out = "data/out_test.txt";
292     string fname_gis = "data/gilist_test.txt";
293 
294     string db_name = "nt";
295     bool use_prot = false;
296 
297     CSeqAlignFilter filter;
298     CRef<CSeqDB> db;
299 
300     BOOST_REQUIRE_NO_THROW(db = filter.PrepareSeqDB(db_name, use_prot, fname_gis););
301     BOOST_REQUIRE_NO_THROW(filter.FilterSeqalignsExt(fname_in, fname_out, db););
302 
303     // check the results
304 
305     CSeq_align_set aln_all;
306     s_LoadSeqAlignsFromFile(aln_all, fname_in);
307 
308     CSeq_align_set aln_filtered;
309     s_LoadSeqAlignsFromFile(aln_filtered, fname_out);
310 
311     vector<TGi> vec_gis;    // sorted vector of all available gi's
312     filter.ReadGiVector(fname_gis, vec_gis, true);
313 
314     ITERATE(CSeq_align_set::Tdata, iter, aln_all.Get())
315     {
316         TGi gi = s_GetAlignedSeqGi(*iter);
317         ITERATE(CSeq_align_set::Tdata, iter_filtered, aln_filtered.Get())
318         {
319             TGi gi_filtered = s_GetAlignedSeqGi(*iter_filtered);
320             if (gi == gi_filtered)
321             {
322                 // main gi's coincide - check the concistency of all the gi's
323                 s_DoConsistencyCheck(*iter, *iter_filtered, vec_gis);
324                 // check the equivalence of all gi's in the filtered seqalign
325                 s_DoEquivalenceCheck(*iter_filtered, db);
326             }
327         }
328     }
329 }
330 BOOST_AUTO_TEST_SUITE_END()
331 #endif /* SKIP_DOXYGEN_PROCESSING */
332