1 /*  $Id: seqalignfilter.cpp 637482 2021-09-13 18:27:47Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors:  Vahram Avagyan
27  *
28  */
29 
30 /// @file seqalignfilter.cpp
31 /// Implementation of the alignment filtering class.
32 ///
33 #include <ncbi_pch.hpp>
34 
35 #include <objects/general/User_object.hpp>
36 #include <objtools/align_format/align_format_util.hpp>
37 #include <objtools/align_format/seqalignfilter.hpp>
38 #include <algo/blast/blastinput/blast_scope_src.hpp>
39 
40 #include <serial/serial.hpp>
41 #include <serial/objistr.hpp>
42 #include <serial/objostr.hpp>
43 #include <serial/iterator.hpp>
44 
45 #include <list>
46 #include <algorithm>
47 
48 BEGIN_NCBI_SCOPE
49 USING_SCOPE(objects);
BEGIN_SCOPE(align_format)50 BEGIN_SCOPE(align_format)
51 
52 /////////////////////////////////////////////////////////////////////////////
53 
54 CSeqAlignFilter::CSeqAlignFilter(EResultsFormat eFormat)
55 : m_eFormat(eFormat)
56 {
57 }
58 
~CSeqAlignFilter(void)59 CSeqAlignFilter::~CSeqAlignFilter(void)
60 {
61 }
62 
63 /////////////////////////////////////////////////////////////////////////////
64 
FilterSeqaligns(const string & fname_in_seqaligns,const string & fname_out_seqaligns,const string & fname_gis_to_filter)65 void CSeqAlignFilter::FilterSeqaligns(const string& fname_in_seqaligns,
66                                       const string& fname_out_seqaligns,
67                                       const string& fname_gis_to_filter)
68 {
69     CSeq_align_set full_aln;
70     ReadSeqalignSet(fname_in_seqaligns, full_aln);
71 
72     CSeq_align_set filtered_aln;
73     FilterByGiListFromFile(full_aln, fname_gis_to_filter, filtered_aln);
74 
75     WriteSeqalignSet(fname_out_seqaligns, filtered_aln);
76 }
77 
FilterSeqalignsExt(const string & fname_in_seqaligns,const string & fname_out_seqaligns,CRef<CSeqDB> db)78 void CSeqAlignFilter::FilterSeqalignsExt(const string& fname_in_seqaligns,
79                                             const string& fname_out_seqaligns,
80                                             CRef<CSeqDB> db)
81 {
82     CSeq_align_set full_aln;
83     ReadSeqalignSet(fname_in_seqaligns, full_aln);
84 
85     CSeq_align_set filtered_aln;
86     FilterBySeqDB(full_aln, db, filtered_aln);
87 
88     WriteSeqalignSet(fname_out_seqaligns, filtered_aln);
89 }
90 
91 /////////////////////////////////////////////////////////////////////////////
92 
FilterByGiListFromFile(const CSeq_align_set & full_aln,const string & fname_gis_to_filter,CSeq_align_set & filtered_aln)93 void CSeqAlignFilter::FilterByGiListFromFile(const CSeq_align_set& full_aln,
94                                                 const string& fname_gis_to_filter,
95                                                 CSeq_align_set& filtered_aln)
96 {
97     CRef<CSeqDBFileGiList> seqdb_gis(new CSeqDBFileGiList(fname_gis_to_filter));
98 
99     CConstRef<CSeq_id> id_aligned_seq;
100     filtered_aln.Set().clear();
101 
102     ITERATE(CSeq_align_set::Tdata, iter, full_aln.Get()) {
103         if (!((*iter)->GetSegs().IsDisc())) {
104 
105             // process a single alignment
106 
107             id_aligned_seq = &((*iter)->GetSeq_id(1));
108             TGi gi = id_aligned_seq->GetGi();
109 
110             if (seqdb_gis->FindGi(gi)) {
111                 filtered_aln.Set().push_back(*iter);
112             }
113         }
114         else {
115 
116             // recursively process a set of alignments
117 
118             CRef<CSeq_align_set> filtered_sub_aln(new CSeq_align_set);
119             FilterByGiListFromFile((*iter)->GetSegs().GetDisc(), fname_gis_to_filter, *filtered_sub_aln);
120 
121             CRef<CSeq_align> aln_disc(new CSeq_align);
122             aln_disc->Assign(**iter);
123             aln_disc->SetSegs().SetDisc(*filtered_sub_aln);
124 
125             filtered_aln.Set().push_back(aln_disc);
126         }
127     }
128 }
129 
130 
FilterByGiList(const CSeq_align_set & full_aln,const list<TGi> & list_gis,CSeq_align_set & filtered_aln)131 void CSeqAlignFilter::FilterByGiList(const CSeq_align_set& full_aln,
132                                         const list<TGi>& list_gis,
133                                         CSeq_align_set& filtered_aln)
134 {
135     CConstRef<CSeq_id> id_aligned_seq;
136     filtered_aln.Set().clear();
137 
138     ITERATE(CSeq_align_set::Tdata, iter, full_aln.Get()) {
139         if (!((*iter)->GetSegs().IsDisc())) {
140 
141             // process a single alignment
142 
143             id_aligned_seq = &((*iter)->GetSeq_id(1));
144             TGi gi = id_aligned_seq->GetGi();
145 
146             if (find(list_gis.begin(), list_gis.end(), gi) != list_gis.end()) {
147                 filtered_aln.Set().push_back(*iter);
148             }
149         }
150         else {
151 
152             // recursively process a set of alignments
153 
154             CRef<CSeq_align_set> filtered_sub_aln(new CSeq_align_set);
155             FilterByGiList((*iter)->GetSegs().GetDisc(), list_gis, *filtered_sub_aln);
156 
157             CRef<CSeq_align> aln_disc(new CSeq_align);
158             aln_disc->Assign(**iter);
159             aln_disc->SetSegs().SetDisc(*filtered_sub_aln);
160 
161             filtered_aln.Set().push_back(aln_disc);
162         }
163     }
164 }
165 
s_GetFilteredRedundantGis(CRef<CSeqDB> db,int oid,vector<TGi> & gis)166 static void s_GetFilteredRedundantGis(CRef<CSeqDB> db,
167                                       int oid,
168                                       vector<TGi>& gis)
169 {
170     // Note: copied from algo/blast/api to avoid dependencies
171 
172     gis.resize(0);
173     if (!db->GetGiList()) {
174         return;
175     }
176 
177     list< CRef<CSeq_id> > seqid_list = db->GetSeqIDs(oid);
178     gis.reserve(seqid_list.size());
179 
180     ITERATE(list< CRef<CSeq_id> >, id, seqid_list) {
181         if ((**id).IsGi()) {
182             gis.push_back((**id).GetGi());
183         }
184     }
185 
186 	sort(gis.begin(), gis.end());
187 }
188 
FilterBySeqDB(const CSeq_align_set & full_aln,CRef<CSeqDB> db,CSeq_align_set & filtered_aln)189 void CSeqAlignFilter::FilterBySeqDB(const CSeq_align_set& full_aln,
190                                     CRef<CSeqDB> db,
191                                     CSeq_align_set& filtered_aln)
192 {
193     filtered_aln.Set().clear();
194 
195     ITERATE(CSeq_align_set::Tdata, iter_aln, full_aln.Get()) {
196         if (!((*iter_aln)->GetSegs().IsDisc())) {
197 
198             // process a single alignment
199 
200             // get the gi of the aligned sequence
201             CConstRef<CSeq_id> id_aligned_seq;
202             id_aligned_seq = &((*iter_aln)->GetSeq_id(1));
203             TGi gi_aligned_seq = id_aligned_seq->GetGi();
204 
205             // get the corresponding oid from the db (!!! can we rely on this? !!!)
206             int oid_aligned_seq = -1;
207             db->GiToOid(gi_aligned_seq, oid_aligned_seq);
208 
209             // retrieve the filtered list of gi's corresponding to this oid
210             vector<TGi> vec_gis_from_DB;
211 
212             if (oid_aligned_seq > 0)
213                 s_GetFilteredRedundantGis(db, oid_aligned_seq, vec_gis_from_DB);
214 
215             // if that list is non-empty, add seq-align's with those gi's to the filtered alignment set
216             if (!vec_gis_from_DB.empty()) {
217 
218                 x_CreateOusputSeqaligns(*iter_aln, gi_aligned_seq, filtered_aln, vec_gis_from_DB);
219             }
220         }
221         else {
222 
223             // recursively process a set of alignments
224 
225             CRef<CSeq_align_set> filtered_sub_aln(new CSeq_align_set);
226             FilterBySeqDB((*iter_aln)->GetSegs().GetDisc(), db, *filtered_sub_aln);
227 
228             CRef<CSeq_align> aln_disc(new CSeq_align);
229             aln_disc->Assign(**iter_aln);
230             aln_disc->SetSegs().SetDisc(*filtered_sub_aln);
231 
232             filtered_aln.Set().push_back(aln_disc);
233         }
234     }
235 }
236 
237 
238 
239 
240 /////////////////////////////////////////////////////////////////////////////
241 
x_CreateOusputSeqaligns(CConstRef<CSeq_align> in_aln,TGi in_gi,CSeq_align_set & out_aln,const vector<TGi> & out_gi_vec)242 void CSeqAlignFilter::x_CreateOusputSeqaligns(CConstRef<CSeq_align> in_aln, TGi in_gi,
243                                             CSeq_align_set& out_aln, const vector<TGi>& out_gi_vec)
244 {
245     if (out_gi_vec.size() == 0)
246         return;
247 
248     if (m_eFormat == eMultipleSeqaligns)
249     {
250         for (vector<TGi>::const_iterator it_gi_out = out_gi_vec.begin();
251                 it_gi_out != out_gi_vec.end(); it_gi_out++)
252         {
253             // get a copy of the input seq-align and change the gi of
254             // the aligned sequence to the gi that must go into the output
255 
256             bool success = false;
257             CRef<CSeq_align> sa_copy = x_UpdateGiInSeqalign(in_aln, 1,
258                                                             in_gi, *it_gi_out, success);
259 
260             // if the update was successful, add the new seq-align to the results
261             if (success)
262             {
263                 // remove any "use_this_gi" entries as the selected format option requires
264                 x_RemoveExtraGis(sa_copy);
265 
266                 out_aln.Set().push_back(sa_copy);
267             }
268         }
269     }
270     else if (m_eFormat == eCombined)
271     {
272         // update the main gi of the aligned sequence & add any extra gi's as "use this gi" entries
273 
274         vector<TGi> vec_old_extra_gis;
275         x_ReadExtraGis(in_aln, vec_old_extra_gis);
276 
277         TGi main_new_gi;
278         vector<TGi> vec_new_extra_gis;
279         x_GenerateNewGis(in_gi, vec_old_extra_gis, out_gi_vec, main_new_gi, vec_new_extra_gis);
280 
281         bool success = false;
282         CRef<CSeq_align> sa_copy = x_UpdateGiInSeqalign(in_aln, 1, in_gi,
283                                                         main_new_gi, success);
284 
285         if (success)
286         {
287             x_RemoveExtraGis(sa_copy);
288             x_WriteExtraGis(sa_copy, vec_new_extra_gis);
289 
290             out_aln.Set().push_back(sa_copy);
291         }
292     }
293 }
294 
295 /////////////////////////////////////////////////////////////////////////////
296 
x_GenerateNewGis(TGi main_old_gi,const vector<TGi> & vec_old_extra_gis,const vector<TGi> & vec_out_gis,TGi & main_new_gi,vector<TGi> & vec_new_extra_gis)297 void CSeqAlignFilter::x_GenerateNewGis(
298                     TGi main_old_gi,                        // in: main gi stored before filtering
299                     const vector<TGi>& vec_old_extra_gis,    // in: extra gi's stored before filtering
300                     const vector<TGi>& vec_out_gis,            // in: list of all gi's after filtering
301                     TGi& main_new_gi,                        // out: main gi after filtering
302                     vector<TGi>& vec_new_extra_gis)            // out: extra gi's after filtering
303 {
304     if (vec_out_gis.empty())
305         return;
306 
307     int i_out_gi = 0, i_old_gi = 0, i_new_gi = 0;
308 
309     // set the main gi
310 
311     if (find(vec_out_gis.begin(), vec_out_gis.end(), main_old_gi) != vec_out_gis.end())
312         main_new_gi = main_old_gi;
313     else
314         main_new_gi = vec_out_gis[0];  //main_new_gi = vec_out_gis[i_out_gi++];
315 
316     int num_gis_left = vec_out_gis.size();  //int num_gis_left = vec_out_gis.size() - 1;
317     if (num_gis_left > 0)
318     {
319         // set the extra gi's (copy & filter the old ones, then add the new ones)
320         // we do not copy the vec_out_gis directly to preserve the original order
321         // (older gi's will appear before newly added gi's)
322 
323         vec_new_extra_gis.resize(num_gis_left);
324 
325         for (; i_old_gi < (int)(vec_old_extra_gis.size()); i_old_gi++)
326         {
327             TGi old_gi = vec_old_extra_gis[i_old_gi];
328             if (find(vec_out_gis.begin(), vec_out_gis.end(), old_gi) != vec_out_gis.end())
329                 vec_new_extra_gis[i_new_gi++] = old_gi;
330         }
331 
332         for (; i_out_gi < (int)(vec_out_gis.size()); i_out_gi++)
333         {
334             TGi out_gi = vec_out_gis[i_out_gi];
335             if (find(vec_old_extra_gis.begin(), vec_old_extra_gis.end(), out_gi)
336                 == vec_old_extra_gis.end())    // not one of the old gis (already copied)
337             {
338                 // if (out_gi != main_new_gi)    // not the main gi (already set)
339                      vec_new_extra_gis[i_new_gi++] = out_gi;
340             }
341         }
342     }
343     else
344     {
345         // no extra gi's to copy
346 
347         vec_new_extra_gis.clear();
348     }
349 }
350 
351 /////////////////////////////////////////////////////////////////////////////
352 
x_UpdateGiInSeqalign(CConstRef<CSeq_align> sa,unsigned int n_row,TGi old_gi,TGi new_gi,bool & success)353 CRef<CSeq_align> CSeqAlignFilter::x_UpdateGiInSeqalign(CConstRef<CSeq_align> sa, unsigned int n_row,
354                                                      TGi old_gi, TGi new_gi, bool& success)
355 {
356     // create a copy of the given alignment
357 
358     CRef<CSeq_align> sa_copy(new CSeq_align);
359     sa_copy->Assign(*(sa.GetNonNullPointer()));
360 
361     // update the gi of sequence #n_row in the copied alignment structure
362 
363     bool gi_changed = false;
364 
365     if (sa_copy->GetSegs().IsDendiag())
366     {
367         // find and update gi's in every appropriate diag entry
368 
369         CSeq_align::C_Segs::TDendiag& dendiag = sa_copy->SetSegs().SetDendiag();
370         NON_CONST_ITERATE(CSeq_align::C_Segs::TDendiag, iter_diag, dendiag)
371         {
372             if ((*iter_diag)->IsSetIds() && n_row < (*iter_diag)->GetIds().size())
373             {
374                 const CSeq_id& id_to_change = *((*iter_diag)->GetIds()[n_row]);
375                 if (id_to_change.IsGi() &&
376                     id_to_change.GetGi() == old_gi)
377                 {
378                     (*iter_diag)->SetIds()[n_row]->SetGi(new_gi);
379                     gi_changed = true;
380                 }
381             }
382         }
383     }
384     else if (sa_copy->GetSegs().IsDenseg())
385     {
386         // update the gi in the dense-seg entry
387 
388         CSeq_align::C_Segs::TDenseg& denseg = sa_copy->SetSegs().SetDenseg();
389         if (denseg.IsSetIds() && n_row < denseg.GetIds().size())
390         {
391             const CSeq_id& id_to_change = *(denseg.GetIds()[n_row]);
392             if (id_to_change.IsGi() &&
393                 id_to_change.GetGi() == old_gi)
394             {
395                 denseg.SetIds()[n_row]->SetGi(new_gi);
396                 gi_changed = true;
397             }
398         }
399     }
400     else if (sa_copy->GetSegs().IsStd())
401     {
402         // find and update gi's in every appropriate seq-loc entry in the std-segs
403 
404         CSeq_align::C_Segs::TStd& stdsegs = sa_copy->SetSegs().SetStd();
405         NON_CONST_ITERATE(CSeq_align::C_Segs::TStd, iter_std, stdsegs)
406         {
407             if ((*iter_std)->IsSetLoc() && n_row < (*iter_std)->GetLoc().size())
408             {
409                 CSeq_loc& loc_to_change = *((*iter_std)->SetLoc()[n_row]);
410 
411                 // question: do seq-locs ever contain parts of different sequences?
412 
413                 const CSeq_id* p_id_to_change = loc_to_change.GetId();
414                 if (p_id_to_change)        // one and only one id is associated with this seq-loc
415                 {
416                     if (p_id_to_change->IsGi() &&
417                         p_id_to_change->GetGi() == old_gi)
418                     {
419                         CRef<CSeq_id> id_updated(new CSeq_id(CSeq_id::e_Gi, new_gi));
420                         loc_to_change.SetId(*id_updated);
421                         gi_changed = true;
422                     }
423                 }
424             }
425         }
426     }
427     else
428     {
429         // these alignment types are not supported here
430     }
431 
432     success = gi_changed;
433     return sa_copy;
434 }
435 
x_ReadExtraGis(CConstRef<CSeq_align> sa,vector<TGi> & vec_extra_gis)436 void CSeqAlignFilter::x_ReadExtraGis(CConstRef<CSeq_align> sa, vector<TGi>& vec_extra_gis)
437 {
438     vec_extra_gis.clear();
439 
440     CSeq_align::TScore score_entries = sa->GetScore();
441     ITERATE(CSeq_align::TScore, iter_score, score_entries)
442     {
443         CRef<CScore> score_entry = *iter_score;
444 
445         if (score_entry->CScore_Base::IsSetId())
446             if (score_entry->GetId().IsStr())
447             {
448                 string str_id = score_entry->GetId().GetStr();
449                 if (str_id == "use_this_gi")
450                 {
451                 	Uint4 gi_v = (Uint4) (score_entry->GetValue().GetInt());
452                     TGi gi = GI_FROM(Uint4, gi_v);
453                     vec_extra_gis.push_back(gi);
454                 }
455             }
456     }
457 }
458 
x_WriteExtraGis(CRef<CSeq_align> sa,const vector<TGi> & vec_extra_gis)459 void CSeqAlignFilter::x_WriteExtraGis(CRef<CSeq_align> sa, const vector<TGi>& vec_extra_gis)
460 {
461     for (int i_gi = 0; i_gi < (int)(vec_extra_gis.size()); i_gi++)
462         x_AddUseGiEntryInSeqalign(sa, vec_extra_gis[i_gi]);
463 }
464 
x_RemoveExtraGis(CRef<CSeq_align> sa)465 void CSeqAlignFilter::x_RemoveExtraGis(CRef<CSeq_align> sa)
466 {
467     CSeq_align::TScore& score_entries = sa->SetScore();
468 
469     CSeq_align::TScore::iterator iter_score = score_entries.begin();
470     while (iter_score != score_entries.end())
471     {
472         CRef<CScore> score_entry = *iter_score;
473         bool erase_entry = false;
474 
475         if (score_entry->IsSetId())
476             if (score_entry->GetId().IsStr())
477             {
478                 string str_id = score_entry->GetId().GetStr();
479                 erase_entry = (str_id == "use_this_gi");
480             }
481 
482         if (erase_entry)
483             iter_score = score_entries.erase(iter_score);
484         else
485             iter_score++;
486     }
487 }
488 
x_AddUseGiEntryInSeqalign(CRef<CSeq_align> sa,TGi new_gi)489 bool CSeqAlignFilter::x_AddUseGiEntryInSeqalign(CRef<CSeq_align> sa, TGi new_gi)
490 {
491     // add a "use this gi" entry with the new gi to the score section of the alignment
492 
493     CRef<CScore> score_entry(new CScore);
494     score_entry->SetId().SetStr("use_this_gi");
495     score_entry->SetValue().SetInt(GI_TO(CScore::C_Value::TInt, new_gi));
496 
497     sa->SetScore().push_back(score_entry);
498 
499     return true;
500 }
501 
PrepareSeqDB(const string & fname_db,bool is_prot,const string & fname_gis)502 CRef<CSeqDB> CSeqAlignFilter::PrepareSeqDB(const string& fname_db, bool is_prot,
503                                             const string& fname_gis)
504 {
505     CRef<CSeqDBFileGiList> seqdb_gis;
506     seqdb_gis = new CSeqDBFileGiList(fname_gis);
507 
508     CRef<CSeqDB> seqdb;
509     seqdb = new CSeqDB(fname_db,
510                         is_prot? CSeqDB::eProtein : CSeqDB::eNucleotide,
511                         seqdb_gis);
512     return seqdb;
513 }
514 
ReadSeqalignSet(const string & fname,CSeq_align_set & aln)515 void CSeqAlignFilter::ReadSeqalignSet(const string& fname, CSeq_align_set& aln)
516 {
517     auto_ptr<CObjectIStream> asn_in(CObjectIStream::Open(fname, eSerial_AsnText));
518     *asn_in >> aln;
519 }
520 
WriteSeqalignSet(const string & fname,const CSeq_align_set & aln)521 void CSeqAlignFilter::WriteSeqalignSet(const string& fname, const CSeq_align_set& aln)
522 {
523     auto_ptr<CObjectOStream> asn_out(CObjectOStream::Open(fname, eSerial_AsnText));
524     *asn_out << aln;
525 }
526 
ReadGiList(const string & fname,list<TGi> & list_gis,bool sorted)527 void CSeqAlignFilter::ReadGiList(const string& fname, list<TGi>& list_gis, bool sorted)
528 {
529     CRef<CSeqDBFileGiList> seqdb_gis;
530     seqdb_gis = new CSeqDBFileGiList(fname);
531 
532     vector<TGi> vec_gis;
533     seqdb_gis->GetGiList(vec_gis);
534 
535     if (sorted)
536         sort(vec_gis.begin(), vec_gis.end());
537 
538     list_gis.clear();
539     for (vector<TGi>::iterator it = vec_gis.begin(); it != vec_gis.end(); it++)
540         list_gis.push_back(*it);
541 }
542 
ReadGiVector(const string & fname,vector<TGi> & vec_gis,bool sorted)543 void CSeqAlignFilter::ReadGiVector(const string& fname, vector<TGi>& vec_gis, bool sorted)
544 {
545     CRef<CSeqDBFileGiList> seqdb_gis;
546     seqdb_gis = new CSeqDBFileGiList(fname);
547 
548     seqdb_gis->GetGiList(vec_gis);
549     if (sorted)
550         sort(vec_gis.begin(), vec_gis.end());
551 }
552 
s_UpdateSubjectInSeqalign(CRef<CSeq_align> & in_align,CRef<CSeq_id> & newSeqID)553 static CRef<CSeq_align> s_UpdateSubjectInSeqalign(CRef<CSeq_align> &in_align, CRef<CSeq_id> &newSeqID)
554 
555 {
556     // create a copy of the given alignment
557     CRef<CSeq_align> sa_copy(new CSeq_align);
558     sa_copy->Assign(*(in_align.GetNonNullPointer()));
559 
560     const CSeq_id& subjid = in_align->GetSeq_id(1);
561     if(!subjid.Match(*newSeqID)) {
562         if (sa_copy->GetSegs().IsDenseg())
563         {
564             CSeq_align::C_Segs::TDenseg& denseg = sa_copy->SetSegs().SetDenseg();
565             if (denseg.IsSetIds() && denseg.GetIds().size() == 2)
566             {
567                 denseg.SetIds()[1] = newSeqID;
568             }
569         }
570     }
571     return sa_copy;
572 }
573 
574 
s_UpdateSeqAlnWithFilteredSeqIDs(CRef<CSeqDB> filteredDB,int oid,CRef<CSeq_align> & in_align)575 CRef<CSeq_align> s_UpdateSeqAlnWithFilteredSeqIDs(CRef<CSeqDB> filteredDB,
576                                               int oid,
577                                               CRef<CSeq_align> &in_align)
578 {
579     CRef<CSeq_align> sa_copy;
580     CRef<CSeq_id> newSubjectID;
581 
582     const CSeq_id& subjid = in_align->GetSeq_id(1);
583 
584     vector< CRef<CSeq_id> > seqids;
585     list< CRef<CSeq_id> > seqid_list = filteredDB->GetSeqIDs(oid);
586     seqids.reserve(seqid_list.size());
587 
588     ITERATE(list< CRef<CSeq_id> >, id, seqid_list) {
589         if(subjid.IsGi() && (**id).IsGi()) {
590             seqids.push_back(*id);
591         }
592         else if(!subjid.IsGi() && !(**id).IsGi()) {
593             seqids.push_back(*id);
594         }
595     }
596 
597    if(!seqids.empty()) {
598         //update main subject and and add use_this_seq
599         newSubjectID = seqids[0];
600         sa_copy = s_UpdateSubjectInSeqalign(in_align,newSubjectID);
601         vector <string> useThisSeqs;
602         for(size_t i = 0; i < seqids.size(); i++) {
603             string textSeqID;
604             CAlignFormatUtil::GetTextSeqID(seqids[i], &textSeqID);
605             if(seqids[0]->IsGi()) {
606                 useThisSeqs.push_back("gi:" + textSeqID);
607              }
608              else {
609                 useThisSeqs.push_back("seqid:" + textSeqID);
610              }
611          }
612          CRef<CUser_object> userObject(new CUser_object());
613 	     userObject->SetType().SetStr("use_this_seqid");
614 	     userObject->AddField("SEQIDS", useThisSeqs);
615          sa_copy->ResetExt();
616 	     sa_copy->SetExt().push_back(userObject);
617     }
618     return sa_copy;
619 }
620 
621 
FilterBySeqDB(const CSeq_align_set & seqalign,CRef<CSeqDB> & filteredDB,vector<int> & oid_vec)622 CRef<CSeq_align_set> CSeqAlignFilter::FilterBySeqDB(const CSeq_align_set& seqalign, //CRef<CSeq_align_set> &seqalign
623                                                     CRef<CSeqDB> &filteredDB,
624                                                     vector<int>& oid_vec)
625 {
626     CConstRef<CSeq_id> previous_id, subjid;
627     size_t i = 0;
628     bool success = false;
629     CRef<CSeq_id> newSubjectID;
630 
631     CRef<CSeq_align_set> new_aln(new CSeq_align_set);
632 
633     ITERATE(CSeq_align_set::Tdata, iter, seqalign.Get()){
634         CRef<CSeq_align> currAlign = *iter;
635         CRef<CSeq_align> filtered_aln;
636 
637         subjid = &(currAlign->GetSeq_id(1));
638         if(previous_id.Empty() || !subjid->Match(*previous_id))
639         {
640             success = false;
641             if (oid_vec[i] > 0) {
642                 // retrieve the filtered list of gi's corresponding to this oid
643                 filtered_aln = s_UpdateSeqAlnWithFilteredSeqIDs(filteredDB, oid_vec[i], currAlign);
644                 if(!filtered_aln.Empty()) { //found sequence with this oid oid_vec[i] in filtered seqdb
645                     newSubjectID.Reset(const_cast<CSeq_id*>(&filtered_aln->GetSeq_id(1)));
646                     success = true;
647                 }
648             }
649             i++;
650         }
651         else {
652             if(success) {
653                 filtered_aln = s_UpdateSubjectInSeqalign(currAlign,newSubjectID);
654             }
655         }
656         previous_id = subjid;
657 
658         if(success && !filtered_aln.Empty()) {
659             new_aln->Set().push_back(filtered_aln);
660         }
661     }
662     return new_aln;
663 }
664 
s_IncludeDeflineTaxid(const CBlast_def_line & def,const set<TTaxId> & user_tax_ids)665 bool static s_IncludeDeflineTaxid(const CBlast_def_line & def, const set<TTaxId> & user_tax_ids)
666 {
667     CBlast_def_line::TTaxIds tax_ids;
668     if (def.IsSetTaxid()) {
669         tax_ids.insert(def.GetTaxid());
670     }
671     if(def.IsSetLinks()) {
672         CBlast_def_line::TLinks leaf_ids = def.GetLinks();
673 #ifdef NCBI_STRICT_TAX_ID
674         ITERATE(CBlast_def_line::TLinks, it, leaf_ids) tax_ids.insert(TAX_ID_FROM(int, *it));
675 #else
676         tax_ids.insert(leaf_ids.begin(), leaf_ids.end());
677 #endif
678     }
679 
680     if(user_tax_ids.size() > tax_ids.size()) {
681         ITERATE(CBlast_def_line::TTaxIds, itr, tax_ids) {
682           if(user_tax_ids.find(*itr) != user_tax_ids.end()) {
683             return true;
684           }
685         }
686     }
687     else {
688         ITERATE(set<TTaxId>, itr, user_tax_ids) {
689             if(tax_ids.find(*itr) != tax_ids.end()) {
690                 return true;
691             }
692         }
693     }
694     return false;
695 }
696 
697 
s_ModifySeqAlnWithFilteredSeqIDs(CRef<CBlast_def_line_set> & bdlRef,const set<TTaxId> & taxids,CRef<CSeq_align> & in_align)698 static CRef<CSeq_align> s_ModifySeqAlnWithFilteredSeqIDs(CRef<CBlast_def_line_set>  &bdlRef,
699                                                          const set<TTaxId>& taxids,
700                                                          CRef<CSeq_align> &in_align)
701 {
702     CRef<CSeq_align> sa_copy;
703 
704     const list< CRef< CBlast_def_line > >& bdlSet = bdlRef->Get();
705     vector <string> useThisSeqs;
706     ITERATE(list<CRef<CBlast_def_line> >, iter, bdlSet) {
707         const CBlast_def_line & defline = **iter;
708         bool has_match = s_IncludeDeflineTaxid(defline, taxids);
709         CRef<CSeq_id> seqID;
710         string textSeqID;
711         if(has_match) {
712             const CBioseq::TId& cur_id = (*iter)->GetSeqid();
713             seqID = FindBestChoice(cur_id, CSeq_id::WorstRank);
714             CAlignFormatUtil::GetTextSeqID(seqID, &textSeqID);
715 
716             list<string> use_this_seq;
717             CAlignFormatUtil::GetUseThisSequence(*in_align,use_this_seq);
718             if(use_this_seq.size() > 0) {
719                 has_match = CAlignFormatUtil::MatchSeqInUseThisSeqList(use_this_seq, textSeqID);
720             }
721         }
722         if(has_match) {
723             if(sa_copy.Empty()) {
724                 sa_copy = s_UpdateSubjectInSeqalign(in_align,seqID);
725             }
726             if(seqID->IsGi()) {
727                 useThisSeqs.push_back("gi:" + textSeqID);
728             }
729             else {
730                 useThisSeqs.push_back("seqid:" + textSeqID);
731             }
732             CRef<CUser_object> userObject(new CUser_object());
733 	        userObject->SetType().SetStr("use_this_seqid");
734 	        userObject->AddField("SEQIDS", useThisSeqs);
735             sa_copy->ResetExt();
736 	        sa_copy->SetExt().push_back(userObject);
737         }
738     }
739     return sa_copy;
740 }
741 
742 
743 
FilterByTaxonomy(const CSeq_align_set & seqalign,CRef<CSeqDB> & seqdb,const set<TTaxId> & taxids)744 CRef<CSeq_align_set> CSeqAlignFilter::FilterByTaxonomy(const CSeq_align_set& seqalign, //CRef<CSeq_align_set> &seqalign
745                                                         CRef<CSeqDB> &seqdb,
746                                                         const set<TTaxId>& taxids)
747 {
748     CConstRef<CSeq_id> previous_id, subjid;
749     bool success = false;
750     CRef<CSeq_id> newSubjectID;
751 
752     CRef<CSeq_align_set> new_aln(new CSeq_align_set);
753 
754     ITERATE(CSeq_align_set::Tdata, iter, seqalign.Get()){
755         CRef<CSeq_align> currAlign = *iter;
756         CRef<CSeq_align> filtered_aln;
757 
758         subjid = &(currAlign->GetSeq_id(1));
759         if(previous_id.Empty() || !subjid->Match(*previous_id))
760         {
761             success = false;
762             int oid;
763             seqdb->SeqidToOid(*subjid, oid);
764             CRef<CBlast_def_line_set> bdlSet = seqdb->GetHdr(oid);
765             filtered_aln = s_ModifySeqAlnWithFilteredSeqIDs(bdlSet, taxids, currAlign);
766             if(!filtered_aln.Empty()) { //found sequence with this oid oid_vec[i] in filtered seqdb
767                 newSubjectID.Reset(const_cast<CSeq_id*>(&filtered_aln->GetSeq_id(1)));
768                 success = true;
769             }
770         }
771         else {
772             if(success) {
773                 filtered_aln = s_UpdateSubjectInSeqalign(currAlign,newSubjectID);
774             }
775         }
776         previous_id = subjid;
777 
778         if(success && !filtered_aln.Empty()) {
779             new_aln->Set().push_back(filtered_aln);
780         }
781     }
782     return new_aln;
783 }
784 
s_ModifySeqAlnWithFilteredSeqIDs(CRef<CScope> & scope,CSeq_id::EAccessionInfo accType,CRef<CSeq_align> & in_align)785 static CRef<CSeq_align> s_ModifySeqAlnWithFilteredSeqIDs(CRef <CScope> &scope,
786                                                          CSeq_id::EAccessionInfo accType,
787                                                          CRef<CSeq_align> &in_align)
788 {
789     CRef<CSeq_align> sa_copy;
790 
791     list<string> use_this_seq;
792     CAlignFormatUtil::GetUseThisSequence(*in_align,use_this_seq);
793     bool modifyAlignment = false;
794     if(use_this_seq.size() > 0) {
795         modifyAlignment = CAlignFormatUtil::RemoveSeqsOfAccessionTypeFromSeqInUse(use_this_seq, accType);
796     }
797     else { //go through blast deflines
798         const CSeq_id &subjid = in_align->GetSeq_id(1);
799         const CBioseq_Handle& handle = scope->GetBioseqHandle(subjid);
800         CRef<CBlast_def_line_set> bdlRef = CSeqDB::ExtractBlastDefline(handle);
801         const list< CRef< CBlast_def_line > >& bdlSet = bdlRef->Get();
802         ITERATE(list<CRef<CBlast_def_line> >, iter, bdlSet) {
803             string textSeqID;
804             const CBioseq::TId& cur_id = (*iter)->GetSeqid();
805             CRef<CSeq_id> seqID = FindBestChoice(cur_id, CSeq_id::WorstRank);
806             CSeq_id::EAccessionInfo accInfo = seqID->IdentifyAccession();
807             if(accInfo != accType) {//not CSeq_id::eAcc_refseq_prot
808                 CAlignFormatUtil::GetTextSeqID(seqID, &textSeqID);
809                 use_this_seq.push_back(textSeqID);
810             }
811             else {
812                 modifyAlignment = true;
813             }
814         }
815     }
816     if(!modifyAlignment) {
817         sa_copy = in_align;
818     }
819     if(modifyAlignment && use_this_seq.size() > 0) {
820         vector <string> useThisSeqs;
821         CRef<CSeq_id> seqID (new CSeq_id(*(use_this_seq.begin())));
822         sa_copy = s_UpdateSubjectInSeqalign(in_align,seqID);
823         ITERATE(list<string>, iter_seq, use_this_seq){
824             if(seqID->IsGi()) {
825                 useThisSeqs.push_back("gi:" + *iter_seq);
826             }
827             else {
828                 useThisSeqs.push_back("seqid:" + *iter_seq);
829             }
830             CRef<CUser_object> userObject(new CUser_object());
831 	        userObject->SetType().SetStr("use_this_seqid");
832 	        userObject->AddField("SEQIDS", useThisSeqs);
833             sa_copy->ResetExt();
834 	        sa_copy->SetExt().push_back(userObject);
835         }
836     }
837     return sa_copy;
838 }
839 
FilterByAccessionType(const CSeq_align_set & seqalign,CRef<CScope> & scope,vector<CSeq_id::EAccessionInfo> accTypes)840 CRef<CSeq_align_set> CSeqAlignFilter::FilterByAccessionType(const CSeq_align_set& seqalign, //CRef<CSeq_align_set> &seqalign
841                                                             CRef <CScope> &scope,
842                                                             vector <CSeq_id::EAccessionInfo> accTypes)
843 {
844     CConstRef<CSeq_id> previous_id, subjid;
845     bool success = false;
846     CRef<CSeq_id> newSubjectID;
847 
848     CRef<CSeq_align_set> new_aln(new CSeq_align_set);
849 
850     ITERATE(CSeq_align_set::Tdata, iter, seqalign.Get()){
851         CRef<CSeq_align> currAlign = *iter;
852         CRef<CSeq_align> filtered_aln;
853 
854         subjid = &(currAlign->GetSeq_id(1));
855         if(previous_id.Empty() || !subjid->Match(*previous_id))
856         {
857             success = false;
858             for (size_t i = 0; i < accTypes.size(); i++) {
859                 if(!currAlign.Empty()) {
860                     filtered_aln = s_ModifySeqAlnWithFilteredSeqIDs(scope, accTypes[i], currAlign);
861                     currAlign = filtered_aln;
862                 }
863             }
864             if(!filtered_aln.Empty()) {
865                 newSubjectID.Reset(const_cast<CSeq_id*>(&filtered_aln->GetSeq_id(1)));
866                 success = true;
867             }
868         }
869         else {
870             if(success) {
871                 filtered_aln = s_UpdateSubjectInSeqalign(currAlign,newSubjectID);
872             }
873         }
874         previous_id = subjid;
875 
876         if(success && !filtered_aln.Empty()) {
877             new_aln->Set().push_back(filtered_aln);
878         }
879     }
880     return new_aln;
881 }
882 END_SCOPE(align_format)
883 END_NCBI_SCOPE
884