1 /* $Id: seqalignfilter_unit_test.cpp 637484 2021-09-13 18:27:59Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Vahram Avagyan
27 *
28 * File Description:
29 * CSeqAlignFilter unit test.
30 *
31 */
32
33 #include <ncbi_pch.hpp>
34
35 #include <objects/seqloc/Seq_loc.hpp>
36 #include <objects/seqalign/Seq_align_set.hpp>
37 #include <objects/seqalign/Seq_align.hpp>
38 #include <objects/seqalign/Score.hpp>
39 #include <objects/general/Object_id.hpp>
40
41 #include <algo/blast/api/seqinfosrc_seqdb.hpp>
42 #include <algo/blast/api/blast_seqinfosrc_aux.hpp>
43
44 #include <objtools/blast/seqdb_reader/seqdb.hpp>
45 #include <objtools/align_format/seqalignfilter.hpp>
46
47 #include <serial/serial.hpp>
48 #include <serial/objistr.hpp>
49 #include <serial/objostr.hpp>
50 #include <serial/iterator.hpp>
51 #include <sstream>
52 #undef NCBI_BOOST_NO_AUTO_TEST_MAIN
53 #include <corelib/test_boost.hpp>
54
55 #ifndef SKIP_DOXYGEN_PROCESSING
56
57 USING_NCBI_SCOPE;
58 USING_SCOPE(objects);
59 USING_SCOPE(align_format);
60
61 template<class ASNOBJ>
s_Stringify(const ASNOBJ & a,string & s)62 void s_Stringify(const ASNOBJ & a, string & s)
63 {
64 CNcbiOstrstream oss;
65 oss << MSerial_AsnText << a;
66 s = CNcbiOstrstreamToString(oss);
67 }
68
69 template<class ASNOBJ>
s_Unstringify(const string & s,ASNOBJ & a)70 void s_Unstringify(const string & s, ASNOBJ & a)
71 {
72 istringstream iss;
73 iss.str(s);
74 iss >> MSerial_AsnText >> a;
75 }
76
77 template<class ASNOBJ>
s_Duplicate(const ASNOBJ & a)78 CRef<ASNOBJ> s_Duplicate(const ASNOBJ & a)
79 {
80 CRef<ASNOBJ> newobj(new ASNOBJ);
81
82 string s;
83 s_Stringify(a, s);
84 s_Unstringify(s, *newobj);
85
86 return newobj;
87 }
88
89 /////////////////////////////////////////////////////////////////////////////
90 // List-based static helper functions
91
s_GetUseThisGiEntries(CRef<CSeq_align> sa,list<TGi> & list_gis)92 static void s_GetUseThisGiEntries(CRef<CSeq_align> sa, list<TGi>& list_gis)
93 {
94 list_gis.clear();
95
96 CSeq_align::TScore& score_entries = sa->SetScore();
97 CSeq_align::TScore::iterator iter_score = score_entries.begin();
98 while (iter_score != score_entries.end())
99 {
100 CRef<CScore> score_entry = *iter_score++;
101 if (score_entry->CanGetId() && score_entry->GetId().IsStr())
102 {
103 string str_id = score_entry->GetId().GetStr();
104 if (str_id == "use_this_gi")
105 {
106 bool bIsLegalGiEntry = score_entry->CanGetValue() && score_entry->GetValue().IsInt();
107 BOOST_REQUIRE(bIsLegalGiEntry);
108 Uint4 gi_v = (Uint4) (score_entry->GetValue().GetInt());
109 list_gis.push_back(GI_FROM(Uint4, gi_v));
110 }
111 }
112 }
113 }
114
s_GetAlignedSeqGi(CRef<CSeq_align> sa)115 static TGi s_GetAlignedSeqGi(CRef<CSeq_align> sa)
116 {
117 CConstRef<CSeq_id> id(&(sa->GetSeq_id(1)));
118
119 BOOST_REQUIRE(id->IsGi());
120 return id->GetGi();
121 }
122
s_GetFullGiList(CRef<CSeq_align> sa,list<TGi> & list_gis)123 static void s_GetFullGiList(CRef<CSeq_align> sa, list<TGi>& list_gis)
124 {
125 s_GetUseThisGiEntries(sa, list_gis);
126 list_gis.push_back(s_GetAlignedSeqGi(sa));
127 }
128
s_IsGiInList(TGi gi,list<TGi> & list_gis)129 static bool s_IsGiInList(TGi gi, list<TGi>& list_gis)
130 {
131 return find(list_gis.begin(), list_gis.end(), gi) != list_gis.end();
132 }
133
s_IsListSubset(list<TGi> & list_all,list<TGi> & list_sub)134 static bool s_IsListSubset(list<TGi>& list_all, list<TGi>& list_sub)
135 {
136 bool is_missing = false;
137
138 list<TGi>::iterator it;
139 for (it = list_sub.begin(); it != list_sub.end() && !is_missing; it++)
140 {
141 is_missing = !s_IsGiInList(*it, list_all);
142 }
143
144 return !is_missing;
145 }
146
s_AreListsEqual(list<TGi> & list1,list<TGi> & list2)147 static bool s_AreListsEqual(list<TGi>& list1, list<TGi>& list2)
148 {
149 return s_IsListSubset(list1, list2) && s_IsListSubset(list2, list1);
150 }
151
152 /////////////////////////////////////////////////////////////////////////////
153 // Vector-based static helper functions
154
s_IsGiInVector(TGi gi,vector<TGi> & vec_gis)155 static bool s_IsGiInVector(TGi gi, vector<TGi>& vec_gis)
156 {
157 return binary_search(vec_gis.begin(), vec_gis.end(), gi);
158 }
159
s_GetFilteredGiList(CRef<CSeq_align> sa,vector<TGi> & vec_all_gis,list<TGi> & list_sa_filtered)160 static bool s_GetFilteredGiList(CRef<CSeq_align> sa, vector<TGi>& vec_all_gis,
161 list<TGi>& list_sa_filtered)
162 {
163 list<TGi> list_sa_full;
164 s_GetFullGiList(sa, list_sa_full);
165
166 for (list<TGi>::iterator it = list_sa_full.begin();
167 it != list_sa_full.end(); it++)
168 {
169 if (s_IsGiInVector(*it, vec_all_gis))
170 {
171 list_sa_filtered.push_back(*it);
172 }
173 }
174
175 return !list_sa_filtered.empty();
176 }
177
178 /////////////////////////////////////////////////////////////////////////////
179 // Functions to test filtering results for individual seqaligns
180
181 static void
s_Check_GiListConsistency(CRef<CSeq_align>,CRef<CSeq_align> sa_new,list<TGi> & list_orig_filtered,list<TGi> & list_new_filtered)182 s_Check_GiListConsistency(CRef<CSeq_align> /*sa_orig*/,
183 CRef<CSeq_align> sa_new,
184 list<TGi>& list_orig_filtered,
185 list<TGi>& list_new_filtered)
186 {
187 list<TGi> list_new;
188 s_GetFullGiList(sa_new, list_new);
189
190 BOOST_REQUIRE(s_AreListsEqual(list_new, list_new_filtered)); // new list is indeed filtered
191 BOOST_REQUIRE(s_IsListSubset(list_new, list_orig_filtered)); // all original gi's who survived filtering
192 // are included in the new list
193 }
194
s_Check_GiEquivalenceInDB(TGi gi1,TGi gi2,CRef<CSeqDB> db)195 static void s_Check_GiEquivalenceInDB(TGi gi1, TGi gi2, CRef<CSeqDB> db)
196 {
197 int oid1 = -1, oid2 = -1;
198 db->GiToOid(gi1, oid1);
199 db->GiToOid(gi2, oid2);
200
201 BOOST_REQUIRE(oid1 > 0);
202 BOOST_REQUIRE(oid2 > 0);
203 BOOST_REQUIRE(oid1 == oid2);
204 }
205
206 /////////////////////////////////////////////////////////////////////////////
207 // Pre-processing and testing individual seqaligns
208
s_DoConsistencyCheck(CRef<CSeq_align> sa_orig,CRef<CSeq_align> sa_new,vector<TGi> & vec_all_gis)209 static void s_DoConsistencyCheck(CRef<CSeq_align> sa_orig, CRef<CSeq_align> sa_new,
210 vector<TGi>& vec_all_gis)
211 {
212 list<TGi> list_orig_filtered;
213 list<TGi> list_new, list_new_filtered;
214
215 s_GetFilteredGiList(sa_orig, vec_all_gis, list_orig_filtered);
216 s_GetFilteredGiList(sa_new, vec_all_gis, list_new_filtered);
217
218 s_Check_GiListConsistency(sa_orig, sa_new,
219 list_orig_filtered, list_new_filtered);
220 }
221
s_DoEquivalenceCheck(CRef<CSeq_align> sa_new,CRef<CSeqDB> db)222 static void s_DoEquivalenceCheck(CRef<CSeq_align> sa_new, CRef<CSeqDB> db)
223 {
224 TGi main_gi = s_GetAlignedSeqGi(sa_new);
225
226 list<TGi> list_extra_gis;
227 s_GetUseThisGiEntries(sa_new, list_extra_gis);
228
229 for (list<TGi>::iterator it_extra_gi = list_extra_gis.begin();
230 it_extra_gi != list_extra_gis.end(); it_extra_gi++)
231 {
232 s_Check_GiEquivalenceInDB(main_gi, *it_extra_gi, db);
233 }
234 }
235
236 /////////////////////////////////////////////////////////////////////////////
237 // Other pre-processing
238
s_LoadSeqAlignsFromFile(CSeq_align_set & aln_all,const string & fname)239 static void s_LoadSeqAlignsFromFile(CSeq_align_set& aln_all, const string& fname)
240 {
241 auto_ptr<CObjectIStream> asn_in(CObjectIStream::Open(fname, eSerial_AsnText));
242 *asn_in >> aln_all;
243 }
244
245 /////////////////////////////////////////////////////////////////////////////
246 // Actual test cases
247
248 BOOST_AUTO_TEST_SUITE(seqalignfilter)
249
BOOST_AUTO_TEST_CASE(s_TestSimpleFiltering)250 BOOST_AUTO_TEST_CASE(s_TestSimpleFiltering)
251 {
252 string fname_in = "data/in_test.txt";
253 string fname_out = "data/out_test.txt";
254 string fname_gis = "data/gilist_test.txt";
255
256 CSeq_align_set aln_all;
257 s_LoadSeqAlignsFromFile(aln_all, fname_in);
258
259 CSeqAlignFilter filter;
260 filter.FilterSeqaligns(fname_in, fname_out, fname_gis);
261
262 CSeq_align_set aln_filtered;
263 s_LoadSeqAlignsFromFile(aln_filtered, fname_out);
264
265 list<TGi> list_gis;
266 filter.ReadGiList(fname_gis, list_gis);
267
268 ITERATE(CSeq_align_set::Tdata, iter, aln_all.Get())
269 {
270 TGi gi = s_GetAlignedSeqGi(*iter);
271 if (s_IsGiInList(gi, list_gis))
272 {
273 bool found_gi = false;
274 ITERATE(CSeq_align_set::Tdata, iter_filtered, aln_filtered.Get())
275 {
276 TGi gi_filtered = s_GetAlignedSeqGi(*iter_filtered);
277 if (gi == gi_filtered)
278 {
279 found_gi = true;
280 break;
281 }
282 }
283 BOOST_REQUIRE(found_gi);
284 }
285 }
286 }
287
BOOST_AUTO_TEST_CASE(s_TestDBBasedFiltering)288 BOOST_AUTO_TEST_CASE(s_TestDBBasedFiltering)
289 {
290 string fname_in = "data/in_test.txt";
291 string fname_out = "data/out_test.txt";
292 string fname_gis = "data/gilist_test.txt";
293
294 string db_name = "nt";
295 bool use_prot = false;
296
297 CSeqAlignFilter filter;
298 CRef<CSeqDB> db;
299
300 BOOST_REQUIRE_NO_THROW(db = filter.PrepareSeqDB(db_name, use_prot, fname_gis););
301 BOOST_REQUIRE_NO_THROW(filter.FilterSeqalignsExt(fname_in, fname_out, db););
302
303 // check the results
304
305 CSeq_align_set aln_all;
306 s_LoadSeqAlignsFromFile(aln_all, fname_in);
307
308 CSeq_align_set aln_filtered;
309 s_LoadSeqAlignsFromFile(aln_filtered, fname_out);
310
311 vector<TGi> vec_gis; // sorted vector of all available gi's
312 filter.ReadGiVector(fname_gis, vec_gis, true);
313
314 ITERATE(CSeq_align_set::Tdata, iter, aln_all.Get())
315 {
316 TGi gi = s_GetAlignedSeqGi(*iter);
317 ITERATE(CSeq_align_set::Tdata, iter_filtered, aln_filtered.Get())
318 {
319 TGi gi_filtered = s_GetAlignedSeqGi(*iter_filtered);
320 if (gi == gi_filtered)
321 {
322 // main gi's coincide - check the concistency of all the gi's
323 s_DoConsistencyCheck(*iter, *iter_filtered, vec_gis);
324 // check the equivalence of all gi's in the filtered seqalign
325 s_DoEquivalenceCheck(*iter_filtered, db);
326 }
327 }
328 }
329 }
330 BOOST_AUTO_TEST_SUITE_END()
331 #endif /* SKIP_DOXYGEN_PROCESSING */
332