1 /* $Id: seqalignfilter.cpp 637482 2021-09-13 18:27:47Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Vahram Avagyan
27 *
28 */
29
30 /// @file seqalignfilter.cpp
31 /// Implementation of the alignment filtering class.
32 ///
33 #include <ncbi_pch.hpp>
34
35 #include <objects/general/User_object.hpp>
36 #include <objtools/align_format/align_format_util.hpp>
37 #include <objtools/align_format/seqalignfilter.hpp>
38 #include <algo/blast/blastinput/blast_scope_src.hpp>
39
40 #include <serial/serial.hpp>
41 #include <serial/objistr.hpp>
42 #include <serial/objostr.hpp>
43 #include <serial/iterator.hpp>
44
45 #include <list>
46 #include <algorithm>
47
48 BEGIN_NCBI_SCOPE
49 USING_SCOPE(objects);
BEGIN_SCOPE(align_format)50 BEGIN_SCOPE(align_format)
51
52 /////////////////////////////////////////////////////////////////////////////
53
54 CSeqAlignFilter::CSeqAlignFilter(EResultsFormat eFormat)
55 : m_eFormat(eFormat)
56 {
57 }
58
~CSeqAlignFilter(void)59 CSeqAlignFilter::~CSeqAlignFilter(void)
60 {
61 }
62
63 /////////////////////////////////////////////////////////////////////////////
64
FilterSeqaligns(const string & fname_in_seqaligns,const string & fname_out_seqaligns,const string & fname_gis_to_filter)65 void CSeqAlignFilter::FilterSeqaligns(const string& fname_in_seqaligns,
66 const string& fname_out_seqaligns,
67 const string& fname_gis_to_filter)
68 {
69 CSeq_align_set full_aln;
70 ReadSeqalignSet(fname_in_seqaligns, full_aln);
71
72 CSeq_align_set filtered_aln;
73 FilterByGiListFromFile(full_aln, fname_gis_to_filter, filtered_aln);
74
75 WriteSeqalignSet(fname_out_seqaligns, filtered_aln);
76 }
77
FilterSeqalignsExt(const string & fname_in_seqaligns,const string & fname_out_seqaligns,CRef<CSeqDB> db)78 void CSeqAlignFilter::FilterSeqalignsExt(const string& fname_in_seqaligns,
79 const string& fname_out_seqaligns,
80 CRef<CSeqDB> db)
81 {
82 CSeq_align_set full_aln;
83 ReadSeqalignSet(fname_in_seqaligns, full_aln);
84
85 CSeq_align_set filtered_aln;
86 FilterBySeqDB(full_aln, db, filtered_aln);
87
88 WriteSeqalignSet(fname_out_seqaligns, filtered_aln);
89 }
90
91 /////////////////////////////////////////////////////////////////////////////
92
FilterByGiListFromFile(const CSeq_align_set & full_aln,const string & fname_gis_to_filter,CSeq_align_set & filtered_aln)93 void CSeqAlignFilter::FilterByGiListFromFile(const CSeq_align_set& full_aln,
94 const string& fname_gis_to_filter,
95 CSeq_align_set& filtered_aln)
96 {
97 CRef<CSeqDBFileGiList> seqdb_gis(new CSeqDBFileGiList(fname_gis_to_filter));
98
99 CConstRef<CSeq_id> id_aligned_seq;
100 filtered_aln.Set().clear();
101
102 ITERATE(CSeq_align_set::Tdata, iter, full_aln.Get()) {
103 if (!((*iter)->GetSegs().IsDisc())) {
104
105 // process a single alignment
106
107 id_aligned_seq = &((*iter)->GetSeq_id(1));
108 TGi gi = id_aligned_seq->GetGi();
109
110 if (seqdb_gis->FindGi(gi)) {
111 filtered_aln.Set().push_back(*iter);
112 }
113 }
114 else {
115
116 // recursively process a set of alignments
117
118 CRef<CSeq_align_set> filtered_sub_aln(new CSeq_align_set);
119 FilterByGiListFromFile((*iter)->GetSegs().GetDisc(), fname_gis_to_filter, *filtered_sub_aln);
120
121 CRef<CSeq_align> aln_disc(new CSeq_align);
122 aln_disc->Assign(**iter);
123 aln_disc->SetSegs().SetDisc(*filtered_sub_aln);
124
125 filtered_aln.Set().push_back(aln_disc);
126 }
127 }
128 }
129
130
FilterByGiList(const CSeq_align_set & full_aln,const list<TGi> & list_gis,CSeq_align_set & filtered_aln)131 void CSeqAlignFilter::FilterByGiList(const CSeq_align_set& full_aln,
132 const list<TGi>& list_gis,
133 CSeq_align_set& filtered_aln)
134 {
135 CConstRef<CSeq_id> id_aligned_seq;
136 filtered_aln.Set().clear();
137
138 ITERATE(CSeq_align_set::Tdata, iter, full_aln.Get()) {
139 if (!((*iter)->GetSegs().IsDisc())) {
140
141 // process a single alignment
142
143 id_aligned_seq = &((*iter)->GetSeq_id(1));
144 TGi gi = id_aligned_seq->GetGi();
145
146 if (find(list_gis.begin(), list_gis.end(), gi) != list_gis.end()) {
147 filtered_aln.Set().push_back(*iter);
148 }
149 }
150 else {
151
152 // recursively process a set of alignments
153
154 CRef<CSeq_align_set> filtered_sub_aln(new CSeq_align_set);
155 FilterByGiList((*iter)->GetSegs().GetDisc(), list_gis, *filtered_sub_aln);
156
157 CRef<CSeq_align> aln_disc(new CSeq_align);
158 aln_disc->Assign(**iter);
159 aln_disc->SetSegs().SetDisc(*filtered_sub_aln);
160
161 filtered_aln.Set().push_back(aln_disc);
162 }
163 }
164 }
165
s_GetFilteredRedundantGis(CRef<CSeqDB> db,int oid,vector<TGi> & gis)166 static void s_GetFilteredRedundantGis(CRef<CSeqDB> db,
167 int oid,
168 vector<TGi>& gis)
169 {
170 // Note: copied from algo/blast/api to avoid dependencies
171
172 gis.resize(0);
173 if (!db->GetGiList()) {
174 return;
175 }
176
177 list< CRef<CSeq_id> > seqid_list = db->GetSeqIDs(oid);
178 gis.reserve(seqid_list.size());
179
180 ITERATE(list< CRef<CSeq_id> >, id, seqid_list) {
181 if ((**id).IsGi()) {
182 gis.push_back((**id).GetGi());
183 }
184 }
185
186 sort(gis.begin(), gis.end());
187 }
188
FilterBySeqDB(const CSeq_align_set & full_aln,CRef<CSeqDB> db,CSeq_align_set & filtered_aln)189 void CSeqAlignFilter::FilterBySeqDB(const CSeq_align_set& full_aln,
190 CRef<CSeqDB> db,
191 CSeq_align_set& filtered_aln)
192 {
193 filtered_aln.Set().clear();
194
195 ITERATE(CSeq_align_set::Tdata, iter_aln, full_aln.Get()) {
196 if (!((*iter_aln)->GetSegs().IsDisc())) {
197
198 // process a single alignment
199
200 // get the gi of the aligned sequence
201 CConstRef<CSeq_id> id_aligned_seq;
202 id_aligned_seq = &((*iter_aln)->GetSeq_id(1));
203 TGi gi_aligned_seq = id_aligned_seq->GetGi();
204
205 // get the corresponding oid from the db (!!! can we rely on this? !!!)
206 int oid_aligned_seq = -1;
207 db->GiToOid(gi_aligned_seq, oid_aligned_seq);
208
209 // retrieve the filtered list of gi's corresponding to this oid
210 vector<TGi> vec_gis_from_DB;
211
212 if (oid_aligned_seq > 0)
213 s_GetFilteredRedundantGis(db, oid_aligned_seq, vec_gis_from_DB);
214
215 // if that list is non-empty, add seq-align's with those gi's to the filtered alignment set
216 if (!vec_gis_from_DB.empty()) {
217
218 x_CreateOusputSeqaligns(*iter_aln, gi_aligned_seq, filtered_aln, vec_gis_from_DB);
219 }
220 }
221 else {
222
223 // recursively process a set of alignments
224
225 CRef<CSeq_align_set> filtered_sub_aln(new CSeq_align_set);
226 FilterBySeqDB((*iter_aln)->GetSegs().GetDisc(), db, *filtered_sub_aln);
227
228 CRef<CSeq_align> aln_disc(new CSeq_align);
229 aln_disc->Assign(**iter_aln);
230 aln_disc->SetSegs().SetDisc(*filtered_sub_aln);
231
232 filtered_aln.Set().push_back(aln_disc);
233 }
234 }
235 }
236
237
238
239
240 /////////////////////////////////////////////////////////////////////////////
241
x_CreateOusputSeqaligns(CConstRef<CSeq_align> in_aln,TGi in_gi,CSeq_align_set & out_aln,const vector<TGi> & out_gi_vec)242 void CSeqAlignFilter::x_CreateOusputSeqaligns(CConstRef<CSeq_align> in_aln, TGi in_gi,
243 CSeq_align_set& out_aln, const vector<TGi>& out_gi_vec)
244 {
245 if (out_gi_vec.size() == 0)
246 return;
247
248 if (m_eFormat == eMultipleSeqaligns)
249 {
250 for (vector<TGi>::const_iterator it_gi_out = out_gi_vec.begin();
251 it_gi_out != out_gi_vec.end(); it_gi_out++)
252 {
253 // get a copy of the input seq-align and change the gi of
254 // the aligned sequence to the gi that must go into the output
255
256 bool success = false;
257 CRef<CSeq_align> sa_copy = x_UpdateGiInSeqalign(in_aln, 1,
258 in_gi, *it_gi_out, success);
259
260 // if the update was successful, add the new seq-align to the results
261 if (success)
262 {
263 // remove any "use_this_gi" entries as the selected format option requires
264 x_RemoveExtraGis(sa_copy);
265
266 out_aln.Set().push_back(sa_copy);
267 }
268 }
269 }
270 else if (m_eFormat == eCombined)
271 {
272 // update the main gi of the aligned sequence & add any extra gi's as "use this gi" entries
273
274 vector<TGi> vec_old_extra_gis;
275 x_ReadExtraGis(in_aln, vec_old_extra_gis);
276
277 TGi main_new_gi;
278 vector<TGi> vec_new_extra_gis;
279 x_GenerateNewGis(in_gi, vec_old_extra_gis, out_gi_vec, main_new_gi, vec_new_extra_gis);
280
281 bool success = false;
282 CRef<CSeq_align> sa_copy = x_UpdateGiInSeqalign(in_aln, 1, in_gi,
283 main_new_gi, success);
284
285 if (success)
286 {
287 x_RemoveExtraGis(sa_copy);
288 x_WriteExtraGis(sa_copy, vec_new_extra_gis);
289
290 out_aln.Set().push_back(sa_copy);
291 }
292 }
293 }
294
295 /////////////////////////////////////////////////////////////////////////////
296
x_GenerateNewGis(TGi main_old_gi,const vector<TGi> & vec_old_extra_gis,const vector<TGi> & vec_out_gis,TGi & main_new_gi,vector<TGi> & vec_new_extra_gis)297 void CSeqAlignFilter::x_GenerateNewGis(
298 TGi main_old_gi, // in: main gi stored before filtering
299 const vector<TGi>& vec_old_extra_gis, // in: extra gi's stored before filtering
300 const vector<TGi>& vec_out_gis, // in: list of all gi's after filtering
301 TGi& main_new_gi, // out: main gi after filtering
302 vector<TGi>& vec_new_extra_gis) // out: extra gi's after filtering
303 {
304 if (vec_out_gis.empty())
305 return;
306
307 int i_out_gi = 0, i_old_gi = 0, i_new_gi = 0;
308
309 // set the main gi
310
311 if (find(vec_out_gis.begin(), vec_out_gis.end(), main_old_gi) != vec_out_gis.end())
312 main_new_gi = main_old_gi;
313 else
314 main_new_gi = vec_out_gis[0]; //main_new_gi = vec_out_gis[i_out_gi++];
315
316 int num_gis_left = vec_out_gis.size(); //int num_gis_left = vec_out_gis.size() - 1;
317 if (num_gis_left > 0)
318 {
319 // set the extra gi's (copy & filter the old ones, then add the new ones)
320 // we do not copy the vec_out_gis directly to preserve the original order
321 // (older gi's will appear before newly added gi's)
322
323 vec_new_extra_gis.resize(num_gis_left);
324
325 for (; i_old_gi < (int)(vec_old_extra_gis.size()); i_old_gi++)
326 {
327 TGi old_gi = vec_old_extra_gis[i_old_gi];
328 if (find(vec_out_gis.begin(), vec_out_gis.end(), old_gi) != vec_out_gis.end())
329 vec_new_extra_gis[i_new_gi++] = old_gi;
330 }
331
332 for (; i_out_gi < (int)(vec_out_gis.size()); i_out_gi++)
333 {
334 TGi out_gi = vec_out_gis[i_out_gi];
335 if (find(vec_old_extra_gis.begin(), vec_old_extra_gis.end(), out_gi)
336 == vec_old_extra_gis.end()) // not one of the old gis (already copied)
337 {
338 // if (out_gi != main_new_gi) // not the main gi (already set)
339 vec_new_extra_gis[i_new_gi++] = out_gi;
340 }
341 }
342 }
343 else
344 {
345 // no extra gi's to copy
346
347 vec_new_extra_gis.clear();
348 }
349 }
350
351 /////////////////////////////////////////////////////////////////////////////
352
x_UpdateGiInSeqalign(CConstRef<CSeq_align> sa,unsigned int n_row,TGi old_gi,TGi new_gi,bool & success)353 CRef<CSeq_align> CSeqAlignFilter::x_UpdateGiInSeqalign(CConstRef<CSeq_align> sa, unsigned int n_row,
354 TGi old_gi, TGi new_gi, bool& success)
355 {
356 // create a copy of the given alignment
357
358 CRef<CSeq_align> sa_copy(new CSeq_align);
359 sa_copy->Assign(*(sa.GetNonNullPointer()));
360
361 // update the gi of sequence #n_row in the copied alignment structure
362
363 bool gi_changed = false;
364
365 if (sa_copy->GetSegs().IsDendiag())
366 {
367 // find and update gi's in every appropriate diag entry
368
369 CSeq_align::C_Segs::TDendiag& dendiag = sa_copy->SetSegs().SetDendiag();
370 NON_CONST_ITERATE(CSeq_align::C_Segs::TDendiag, iter_diag, dendiag)
371 {
372 if ((*iter_diag)->IsSetIds() && n_row < (*iter_diag)->GetIds().size())
373 {
374 const CSeq_id& id_to_change = *((*iter_diag)->GetIds()[n_row]);
375 if (id_to_change.IsGi() &&
376 id_to_change.GetGi() == old_gi)
377 {
378 (*iter_diag)->SetIds()[n_row]->SetGi(new_gi);
379 gi_changed = true;
380 }
381 }
382 }
383 }
384 else if (sa_copy->GetSegs().IsDenseg())
385 {
386 // update the gi in the dense-seg entry
387
388 CSeq_align::C_Segs::TDenseg& denseg = sa_copy->SetSegs().SetDenseg();
389 if (denseg.IsSetIds() && n_row < denseg.GetIds().size())
390 {
391 const CSeq_id& id_to_change = *(denseg.GetIds()[n_row]);
392 if (id_to_change.IsGi() &&
393 id_to_change.GetGi() == old_gi)
394 {
395 denseg.SetIds()[n_row]->SetGi(new_gi);
396 gi_changed = true;
397 }
398 }
399 }
400 else if (sa_copy->GetSegs().IsStd())
401 {
402 // find and update gi's in every appropriate seq-loc entry in the std-segs
403
404 CSeq_align::C_Segs::TStd& stdsegs = sa_copy->SetSegs().SetStd();
405 NON_CONST_ITERATE(CSeq_align::C_Segs::TStd, iter_std, stdsegs)
406 {
407 if ((*iter_std)->IsSetLoc() && n_row < (*iter_std)->GetLoc().size())
408 {
409 CSeq_loc& loc_to_change = *((*iter_std)->SetLoc()[n_row]);
410
411 // question: do seq-locs ever contain parts of different sequences?
412
413 const CSeq_id* p_id_to_change = loc_to_change.GetId();
414 if (p_id_to_change) // one and only one id is associated with this seq-loc
415 {
416 if (p_id_to_change->IsGi() &&
417 p_id_to_change->GetGi() == old_gi)
418 {
419 CRef<CSeq_id> id_updated(new CSeq_id(CSeq_id::e_Gi, new_gi));
420 loc_to_change.SetId(*id_updated);
421 gi_changed = true;
422 }
423 }
424 }
425 }
426 }
427 else
428 {
429 // these alignment types are not supported here
430 }
431
432 success = gi_changed;
433 return sa_copy;
434 }
435
x_ReadExtraGis(CConstRef<CSeq_align> sa,vector<TGi> & vec_extra_gis)436 void CSeqAlignFilter::x_ReadExtraGis(CConstRef<CSeq_align> sa, vector<TGi>& vec_extra_gis)
437 {
438 vec_extra_gis.clear();
439
440 CSeq_align::TScore score_entries = sa->GetScore();
441 ITERATE(CSeq_align::TScore, iter_score, score_entries)
442 {
443 CRef<CScore> score_entry = *iter_score;
444
445 if (score_entry->CScore_Base::IsSetId())
446 if (score_entry->GetId().IsStr())
447 {
448 string str_id = score_entry->GetId().GetStr();
449 if (str_id == "use_this_gi")
450 {
451 Uint4 gi_v = (Uint4) (score_entry->GetValue().GetInt());
452 TGi gi = GI_FROM(Uint4, gi_v);
453 vec_extra_gis.push_back(gi);
454 }
455 }
456 }
457 }
458
x_WriteExtraGis(CRef<CSeq_align> sa,const vector<TGi> & vec_extra_gis)459 void CSeqAlignFilter::x_WriteExtraGis(CRef<CSeq_align> sa, const vector<TGi>& vec_extra_gis)
460 {
461 for (int i_gi = 0; i_gi < (int)(vec_extra_gis.size()); i_gi++)
462 x_AddUseGiEntryInSeqalign(sa, vec_extra_gis[i_gi]);
463 }
464
x_RemoveExtraGis(CRef<CSeq_align> sa)465 void CSeqAlignFilter::x_RemoveExtraGis(CRef<CSeq_align> sa)
466 {
467 CSeq_align::TScore& score_entries = sa->SetScore();
468
469 CSeq_align::TScore::iterator iter_score = score_entries.begin();
470 while (iter_score != score_entries.end())
471 {
472 CRef<CScore> score_entry = *iter_score;
473 bool erase_entry = false;
474
475 if (score_entry->IsSetId())
476 if (score_entry->GetId().IsStr())
477 {
478 string str_id = score_entry->GetId().GetStr();
479 erase_entry = (str_id == "use_this_gi");
480 }
481
482 if (erase_entry)
483 iter_score = score_entries.erase(iter_score);
484 else
485 iter_score++;
486 }
487 }
488
x_AddUseGiEntryInSeqalign(CRef<CSeq_align> sa,TGi new_gi)489 bool CSeqAlignFilter::x_AddUseGiEntryInSeqalign(CRef<CSeq_align> sa, TGi new_gi)
490 {
491 // add a "use this gi" entry with the new gi to the score section of the alignment
492
493 CRef<CScore> score_entry(new CScore);
494 score_entry->SetId().SetStr("use_this_gi");
495 score_entry->SetValue().SetInt(GI_TO(CScore::C_Value::TInt, new_gi));
496
497 sa->SetScore().push_back(score_entry);
498
499 return true;
500 }
501
PrepareSeqDB(const string & fname_db,bool is_prot,const string & fname_gis)502 CRef<CSeqDB> CSeqAlignFilter::PrepareSeqDB(const string& fname_db, bool is_prot,
503 const string& fname_gis)
504 {
505 CRef<CSeqDBFileGiList> seqdb_gis;
506 seqdb_gis = new CSeqDBFileGiList(fname_gis);
507
508 CRef<CSeqDB> seqdb;
509 seqdb = new CSeqDB(fname_db,
510 is_prot? CSeqDB::eProtein : CSeqDB::eNucleotide,
511 seqdb_gis);
512 return seqdb;
513 }
514
ReadSeqalignSet(const string & fname,CSeq_align_set & aln)515 void CSeqAlignFilter::ReadSeqalignSet(const string& fname, CSeq_align_set& aln)
516 {
517 auto_ptr<CObjectIStream> asn_in(CObjectIStream::Open(fname, eSerial_AsnText));
518 *asn_in >> aln;
519 }
520
WriteSeqalignSet(const string & fname,const CSeq_align_set & aln)521 void CSeqAlignFilter::WriteSeqalignSet(const string& fname, const CSeq_align_set& aln)
522 {
523 auto_ptr<CObjectOStream> asn_out(CObjectOStream::Open(fname, eSerial_AsnText));
524 *asn_out << aln;
525 }
526
ReadGiList(const string & fname,list<TGi> & list_gis,bool sorted)527 void CSeqAlignFilter::ReadGiList(const string& fname, list<TGi>& list_gis, bool sorted)
528 {
529 CRef<CSeqDBFileGiList> seqdb_gis;
530 seqdb_gis = new CSeqDBFileGiList(fname);
531
532 vector<TGi> vec_gis;
533 seqdb_gis->GetGiList(vec_gis);
534
535 if (sorted)
536 sort(vec_gis.begin(), vec_gis.end());
537
538 list_gis.clear();
539 for (vector<TGi>::iterator it = vec_gis.begin(); it != vec_gis.end(); it++)
540 list_gis.push_back(*it);
541 }
542
ReadGiVector(const string & fname,vector<TGi> & vec_gis,bool sorted)543 void CSeqAlignFilter::ReadGiVector(const string& fname, vector<TGi>& vec_gis, bool sorted)
544 {
545 CRef<CSeqDBFileGiList> seqdb_gis;
546 seqdb_gis = new CSeqDBFileGiList(fname);
547
548 seqdb_gis->GetGiList(vec_gis);
549 if (sorted)
550 sort(vec_gis.begin(), vec_gis.end());
551 }
552
s_UpdateSubjectInSeqalign(CRef<CSeq_align> & in_align,CRef<CSeq_id> & newSeqID)553 static CRef<CSeq_align> s_UpdateSubjectInSeqalign(CRef<CSeq_align> &in_align, CRef<CSeq_id> &newSeqID)
554
555 {
556 // create a copy of the given alignment
557 CRef<CSeq_align> sa_copy(new CSeq_align);
558 sa_copy->Assign(*(in_align.GetNonNullPointer()));
559
560 const CSeq_id& subjid = in_align->GetSeq_id(1);
561 if(!subjid.Match(*newSeqID)) {
562 if (sa_copy->GetSegs().IsDenseg())
563 {
564 CSeq_align::C_Segs::TDenseg& denseg = sa_copy->SetSegs().SetDenseg();
565 if (denseg.IsSetIds() && denseg.GetIds().size() == 2)
566 {
567 denseg.SetIds()[1] = newSeqID;
568 }
569 }
570 }
571 return sa_copy;
572 }
573
574
s_UpdateSeqAlnWithFilteredSeqIDs(CRef<CSeqDB> filteredDB,int oid,CRef<CSeq_align> & in_align)575 CRef<CSeq_align> s_UpdateSeqAlnWithFilteredSeqIDs(CRef<CSeqDB> filteredDB,
576 int oid,
577 CRef<CSeq_align> &in_align)
578 {
579 CRef<CSeq_align> sa_copy;
580 CRef<CSeq_id> newSubjectID;
581
582 const CSeq_id& subjid = in_align->GetSeq_id(1);
583
584 vector< CRef<CSeq_id> > seqids;
585 list< CRef<CSeq_id> > seqid_list = filteredDB->GetSeqIDs(oid);
586 seqids.reserve(seqid_list.size());
587
588 ITERATE(list< CRef<CSeq_id> >, id, seqid_list) {
589 if(subjid.IsGi() && (**id).IsGi()) {
590 seqids.push_back(*id);
591 }
592 else if(!subjid.IsGi() && !(**id).IsGi()) {
593 seqids.push_back(*id);
594 }
595 }
596
597 if(!seqids.empty()) {
598 //update main subject and and add use_this_seq
599 newSubjectID = seqids[0];
600 sa_copy = s_UpdateSubjectInSeqalign(in_align,newSubjectID);
601 vector <string> useThisSeqs;
602 for(size_t i = 0; i < seqids.size(); i++) {
603 string textSeqID;
604 CAlignFormatUtil::GetTextSeqID(seqids[i], &textSeqID);
605 if(seqids[0]->IsGi()) {
606 useThisSeqs.push_back("gi:" + textSeqID);
607 }
608 else {
609 useThisSeqs.push_back("seqid:" + textSeqID);
610 }
611 }
612 CRef<CUser_object> userObject(new CUser_object());
613 userObject->SetType().SetStr("use_this_seqid");
614 userObject->AddField("SEQIDS", useThisSeqs);
615 sa_copy->ResetExt();
616 sa_copy->SetExt().push_back(userObject);
617 }
618 return sa_copy;
619 }
620
621
FilterBySeqDB(const CSeq_align_set & seqalign,CRef<CSeqDB> & filteredDB,vector<int> & oid_vec)622 CRef<CSeq_align_set> CSeqAlignFilter::FilterBySeqDB(const CSeq_align_set& seqalign, //CRef<CSeq_align_set> &seqalign
623 CRef<CSeqDB> &filteredDB,
624 vector<int>& oid_vec)
625 {
626 CConstRef<CSeq_id> previous_id, subjid;
627 size_t i = 0;
628 bool success = false;
629 CRef<CSeq_id> newSubjectID;
630
631 CRef<CSeq_align_set> new_aln(new CSeq_align_set);
632
633 ITERATE(CSeq_align_set::Tdata, iter, seqalign.Get()){
634 CRef<CSeq_align> currAlign = *iter;
635 CRef<CSeq_align> filtered_aln;
636
637 subjid = &(currAlign->GetSeq_id(1));
638 if(previous_id.Empty() || !subjid->Match(*previous_id))
639 {
640 success = false;
641 if (oid_vec[i] > 0) {
642 // retrieve the filtered list of gi's corresponding to this oid
643 filtered_aln = s_UpdateSeqAlnWithFilteredSeqIDs(filteredDB, oid_vec[i], currAlign);
644 if(!filtered_aln.Empty()) { //found sequence with this oid oid_vec[i] in filtered seqdb
645 newSubjectID.Reset(const_cast<CSeq_id*>(&filtered_aln->GetSeq_id(1)));
646 success = true;
647 }
648 }
649 i++;
650 }
651 else {
652 if(success) {
653 filtered_aln = s_UpdateSubjectInSeqalign(currAlign,newSubjectID);
654 }
655 }
656 previous_id = subjid;
657
658 if(success && !filtered_aln.Empty()) {
659 new_aln->Set().push_back(filtered_aln);
660 }
661 }
662 return new_aln;
663 }
664
s_IncludeDeflineTaxid(const CBlast_def_line & def,const set<TTaxId> & user_tax_ids)665 bool static s_IncludeDeflineTaxid(const CBlast_def_line & def, const set<TTaxId> & user_tax_ids)
666 {
667 CBlast_def_line::TTaxIds tax_ids;
668 if (def.IsSetTaxid()) {
669 tax_ids.insert(def.GetTaxid());
670 }
671 if(def.IsSetLinks()) {
672 CBlast_def_line::TLinks leaf_ids = def.GetLinks();
673 #ifdef NCBI_STRICT_TAX_ID
674 ITERATE(CBlast_def_line::TLinks, it, leaf_ids) tax_ids.insert(TAX_ID_FROM(int, *it));
675 #else
676 tax_ids.insert(leaf_ids.begin(), leaf_ids.end());
677 #endif
678 }
679
680 if(user_tax_ids.size() > tax_ids.size()) {
681 ITERATE(CBlast_def_line::TTaxIds, itr, tax_ids) {
682 if(user_tax_ids.find(*itr) != user_tax_ids.end()) {
683 return true;
684 }
685 }
686 }
687 else {
688 ITERATE(set<TTaxId>, itr, user_tax_ids) {
689 if(tax_ids.find(*itr) != tax_ids.end()) {
690 return true;
691 }
692 }
693 }
694 return false;
695 }
696
697
s_ModifySeqAlnWithFilteredSeqIDs(CRef<CBlast_def_line_set> & bdlRef,const set<TTaxId> & taxids,CRef<CSeq_align> & in_align)698 static CRef<CSeq_align> s_ModifySeqAlnWithFilteredSeqIDs(CRef<CBlast_def_line_set> &bdlRef,
699 const set<TTaxId>& taxids,
700 CRef<CSeq_align> &in_align)
701 {
702 CRef<CSeq_align> sa_copy;
703
704 const list< CRef< CBlast_def_line > >& bdlSet = bdlRef->Get();
705 vector <string> useThisSeqs;
706 ITERATE(list<CRef<CBlast_def_line> >, iter, bdlSet) {
707 const CBlast_def_line & defline = **iter;
708 bool has_match = s_IncludeDeflineTaxid(defline, taxids);
709 CRef<CSeq_id> seqID;
710 string textSeqID;
711 if(has_match) {
712 const CBioseq::TId& cur_id = (*iter)->GetSeqid();
713 seqID = FindBestChoice(cur_id, CSeq_id::WorstRank);
714 CAlignFormatUtil::GetTextSeqID(seqID, &textSeqID);
715
716 list<string> use_this_seq;
717 CAlignFormatUtil::GetUseThisSequence(*in_align,use_this_seq);
718 if(use_this_seq.size() > 0) {
719 has_match = CAlignFormatUtil::MatchSeqInUseThisSeqList(use_this_seq, textSeqID);
720 }
721 }
722 if(has_match) {
723 if(sa_copy.Empty()) {
724 sa_copy = s_UpdateSubjectInSeqalign(in_align,seqID);
725 }
726 if(seqID->IsGi()) {
727 useThisSeqs.push_back("gi:" + textSeqID);
728 }
729 else {
730 useThisSeqs.push_back("seqid:" + textSeqID);
731 }
732 CRef<CUser_object> userObject(new CUser_object());
733 userObject->SetType().SetStr("use_this_seqid");
734 userObject->AddField("SEQIDS", useThisSeqs);
735 sa_copy->ResetExt();
736 sa_copy->SetExt().push_back(userObject);
737 }
738 }
739 return sa_copy;
740 }
741
742
743
FilterByTaxonomy(const CSeq_align_set & seqalign,CRef<CSeqDB> & seqdb,const set<TTaxId> & taxids)744 CRef<CSeq_align_set> CSeqAlignFilter::FilterByTaxonomy(const CSeq_align_set& seqalign, //CRef<CSeq_align_set> &seqalign
745 CRef<CSeqDB> &seqdb,
746 const set<TTaxId>& taxids)
747 {
748 CConstRef<CSeq_id> previous_id, subjid;
749 bool success = false;
750 CRef<CSeq_id> newSubjectID;
751
752 CRef<CSeq_align_set> new_aln(new CSeq_align_set);
753
754 ITERATE(CSeq_align_set::Tdata, iter, seqalign.Get()){
755 CRef<CSeq_align> currAlign = *iter;
756 CRef<CSeq_align> filtered_aln;
757
758 subjid = &(currAlign->GetSeq_id(1));
759 if(previous_id.Empty() || !subjid->Match(*previous_id))
760 {
761 success = false;
762 int oid;
763 seqdb->SeqidToOid(*subjid, oid);
764 CRef<CBlast_def_line_set> bdlSet = seqdb->GetHdr(oid);
765 filtered_aln = s_ModifySeqAlnWithFilteredSeqIDs(bdlSet, taxids, currAlign);
766 if(!filtered_aln.Empty()) { //found sequence with this oid oid_vec[i] in filtered seqdb
767 newSubjectID.Reset(const_cast<CSeq_id*>(&filtered_aln->GetSeq_id(1)));
768 success = true;
769 }
770 }
771 else {
772 if(success) {
773 filtered_aln = s_UpdateSubjectInSeqalign(currAlign,newSubjectID);
774 }
775 }
776 previous_id = subjid;
777
778 if(success && !filtered_aln.Empty()) {
779 new_aln->Set().push_back(filtered_aln);
780 }
781 }
782 return new_aln;
783 }
784
s_ModifySeqAlnWithFilteredSeqIDs(CRef<CScope> & scope,CSeq_id::EAccessionInfo accType,CRef<CSeq_align> & in_align)785 static CRef<CSeq_align> s_ModifySeqAlnWithFilteredSeqIDs(CRef <CScope> &scope,
786 CSeq_id::EAccessionInfo accType,
787 CRef<CSeq_align> &in_align)
788 {
789 CRef<CSeq_align> sa_copy;
790
791 list<string> use_this_seq;
792 CAlignFormatUtil::GetUseThisSequence(*in_align,use_this_seq);
793 bool modifyAlignment = false;
794 if(use_this_seq.size() > 0) {
795 modifyAlignment = CAlignFormatUtil::RemoveSeqsOfAccessionTypeFromSeqInUse(use_this_seq, accType);
796 }
797 else { //go through blast deflines
798 const CSeq_id &subjid = in_align->GetSeq_id(1);
799 const CBioseq_Handle& handle = scope->GetBioseqHandle(subjid);
800 CRef<CBlast_def_line_set> bdlRef = CSeqDB::ExtractBlastDefline(handle);
801 const list< CRef< CBlast_def_line > >& bdlSet = bdlRef->Get();
802 ITERATE(list<CRef<CBlast_def_line> >, iter, bdlSet) {
803 string textSeqID;
804 const CBioseq::TId& cur_id = (*iter)->GetSeqid();
805 CRef<CSeq_id> seqID = FindBestChoice(cur_id, CSeq_id::WorstRank);
806 CSeq_id::EAccessionInfo accInfo = seqID->IdentifyAccession();
807 if(accInfo != accType) {//not CSeq_id::eAcc_refseq_prot
808 CAlignFormatUtil::GetTextSeqID(seqID, &textSeqID);
809 use_this_seq.push_back(textSeqID);
810 }
811 else {
812 modifyAlignment = true;
813 }
814 }
815 }
816 if(!modifyAlignment) {
817 sa_copy = in_align;
818 }
819 if(modifyAlignment && use_this_seq.size() > 0) {
820 vector <string> useThisSeqs;
821 CRef<CSeq_id> seqID (new CSeq_id(*(use_this_seq.begin())));
822 sa_copy = s_UpdateSubjectInSeqalign(in_align,seqID);
823 ITERATE(list<string>, iter_seq, use_this_seq){
824 if(seqID->IsGi()) {
825 useThisSeqs.push_back("gi:" + *iter_seq);
826 }
827 else {
828 useThisSeqs.push_back("seqid:" + *iter_seq);
829 }
830 CRef<CUser_object> userObject(new CUser_object());
831 userObject->SetType().SetStr("use_this_seqid");
832 userObject->AddField("SEQIDS", useThisSeqs);
833 sa_copy->ResetExt();
834 sa_copy->SetExt().push_back(userObject);
835 }
836 }
837 return sa_copy;
838 }
839
FilterByAccessionType(const CSeq_align_set & seqalign,CRef<CScope> & scope,vector<CSeq_id::EAccessionInfo> accTypes)840 CRef<CSeq_align_set> CSeqAlignFilter::FilterByAccessionType(const CSeq_align_set& seqalign, //CRef<CSeq_align_set> &seqalign
841 CRef <CScope> &scope,
842 vector <CSeq_id::EAccessionInfo> accTypes)
843 {
844 CConstRef<CSeq_id> previous_id, subjid;
845 bool success = false;
846 CRef<CSeq_id> newSubjectID;
847
848 CRef<CSeq_align_set> new_aln(new CSeq_align_set);
849
850 ITERATE(CSeq_align_set::Tdata, iter, seqalign.Get()){
851 CRef<CSeq_align> currAlign = *iter;
852 CRef<CSeq_align> filtered_aln;
853
854 subjid = &(currAlign->GetSeq_id(1));
855 if(previous_id.Empty() || !subjid->Match(*previous_id))
856 {
857 success = false;
858 for (size_t i = 0; i < accTypes.size(); i++) {
859 if(!currAlign.Empty()) {
860 filtered_aln = s_ModifySeqAlnWithFilteredSeqIDs(scope, accTypes[i], currAlign);
861 currAlign = filtered_aln;
862 }
863 }
864 if(!filtered_aln.Empty()) {
865 newSubjectID.Reset(const_cast<CSeq_id*>(&filtered_aln->GetSeq_id(1)));
866 success = true;
867 }
868 }
869 else {
870 if(success) {
871 filtered_aln = s_UpdateSubjectInSeqalign(currAlign,newSubjectID);
872 }
873 }
874 previous_id = subjid;
875
876 if(success && !filtered_aln.Empty()) {
877 new_aln->Set().push_back(filtered_aln);
878 }
879 }
880 return new_aln;
881 }
882 END_SCOPE(align_format)
883 END_NCBI_SCOPE
884