1 /*  $Id: seqdboidlist.cpp 610974 2020-06-26 12:59:33Z grichenk $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Kevin Bealer
27  *
28  */
29 
30 /// @file seqdboidlist.cpp
31 /// Implementation for the CSeqDBOIDList class, an array of bits
32 /// describing a subset of the virtual oid space.
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistr.hpp>
35 #include "seqdboidlist.hpp"
36 #include "seqdbfilter.hpp"
37 #include <objtools/blast/seqdb_reader/impl/seqdbfile.hpp>
38 #include "seqdbgilistset.hpp"
39 #include <algorithm>
40 
41 BEGIN_NCBI_SCOPE
42 
CSeqDBOIDList(CSeqDBAtlas & atlas,const CSeqDBVolSet & volset,CSeqDB_FilterTree & filters,CRef<CSeqDBGiList> & gi_list,CRef<CSeqDBNegativeList> & neg_list,CSeqDBLockHold & locked,const CSeqDBLMDBSet & lmdb_set)43 CSeqDBOIDList::CSeqDBOIDList(CSeqDBAtlas              & atlas,
44                              const CSeqDBVolSet       & volset,
45                              CSeqDB_FilterTree        & filters,
46                              CRef<CSeqDBGiList>       & gi_list,
47                              CRef<CSeqDBNegativeList> & neg_list,
48                              CSeqDBLockHold           & locked,
49                              const CSeqDBLMDBSet      & lmdb_set)
50     : m_Atlas   (atlas),
51       m_Lease   (atlas),
52       m_NumOIDs (0)
53 {
54     _ASSERT(gi_list.NotEmpty() || neg_list.NotEmpty() || filters.HasFilter());
55     x_Setup( volset, filters, gi_list, neg_list, locked, lmdb_set);
56 }
57 
~CSeqDBOIDList()58 CSeqDBOIDList::~CSeqDBOIDList()
59 {
60 }
61 
62 // The general rule I am following in these methods is to use byte
63 // computations except during actual looping.
64 
x_Setup(const CSeqDBVolSet & volset,CSeqDB_FilterTree & filters,CRef<CSeqDBGiList> & gi_list,CRef<CSeqDBNegativeList> & neg_list,CSeqDBLockHold & locked,const CSeqDBLMDBSet & lmdb_set)65 void CSeqDBOIDList::x_Setup(const CSeqDBVolSet       & volset,
66                             CSeqDB_FilterTree        & filters,
67                             CRef<CSeqDBGiList>       & gi_list,
68                             CRef<CSeqDBNegativeList> & neg_list,
69                             CSeqDBLockHold           & locked,
70 							const CSeqDBLMDBSet      & lmdb_set)
71 {
72     // First, get the memory space for the OID bitmap and clear it.
73 
74     // Pad memory space to word boundary, add 8 bytes for "insurance".  Some
75     // of the algorithms here need to do bit shifting and OR half of a source
76     // element into this destination element, and the other half into this
77     // other destination element.  Rather than sprinkle this code with range
78     // checks, padding is used.
79 
80     m_NumOIDs = volset.GetNumOIDs();
81 
82     m_AllBits.Reset(new CSeqDB_BitSet(0, m_NumOIDs));
83 
84     CSeqDBGiListSet gi_list_set(m_Atlas,
85                                 volset,
86                                 gi_list,
87                                 neg_list,
88                                 locked,
89                                 lmdb_set);
90     // Then get the list of filenames and offsets to overlay onto it.
91 
92     for(int i = 0; i < volset.GetNumVols(); i++) {
93         const CSeqDBVolEntry * v1 = volset.GetVolEntry(i);
94 
95         CRef<CSeqDB_BitSet> vol_bits =
96             x_ComputeFilters(filters, *v1, gi_list_set, locked, lmdb_set.IsBlastDBVersion5());
97 
98         m_AllBits->UnionWith(*vol_bits, true);
99     }
100 
101     if (lmdb_set.IsBlastDBVersion5()  && filters.HasFilter()) {
102    		CSeqDB_BitSet f_bits(0, m_NumOIDs);
103     	f_bits.AssignBitRange(0, m_NumOIDs, true);
104     	if(x_ComputeFilters(volset, filters, lmdb_set, f_bits, gi_list, neg_list)) {
105     		m_AllBits->IntersectWith(f_bits, true);
106     	}
107     }
108 
109     if (gi_list.NotEmpty()) {
110         x_ApplyUserGiList(*gi_list);
111     }
112     if (neg_list.NotEmpty()) {
113         x_ApplyNegativeList(*neg_list, lmdb_set.IsBlastDBVersion5());
114     }
115 
116     while(m_NumOIDs && (! x_IsSet(m_NumOIDs - 1))) {
117         -- m_NumOIDs;
118     }
119 }
120 
121 CRef<CSeqDB_BitSet>
x_ComputeFilters(const CSeqDB_FilterTree & filters,const CSeqDBVolEntry & vol,CSeqDBGiListSet & gis,CSeqDBLockHold & locked,bool isBlastDBv5)122 CSeqDBOIDList::x_ComputeFilters(const CSeqDB_FilterTree & filters,
123                                 const CSeqDBVolEntry    & vol,
124                                 CSeqDBGiListSet         & gis,
125                                 CSeqDBLockHold          & locked,
126                                 bool					isBlastDBv5)
127 
128 {
129     const string & vn = vol.Vol()->GetVolName();
130     CRef<CSeqDB_FilterTree> ft = filters.Specialize(vn);
131 
132     int vol_start = vol.OIDStart();
133     int vol_end   = vol.OIDEnd();
134 
135     CRef<CSeqDB_BitSet> volume_map;
136 
137     // Step 1: Compute the bitmap representing the filtering done by
138     // all subnodes.  This is a "union".
139 
140     int vols = ft->GetVolumes().size();
141 
142     _ASSERT(vols || ft->GetNodes().size());
143 
144     if (vols > 0) {
145         // This filter tree is filtered by volume name, so all nodes
146         // below this point can be ignored if this node contains a
147         // volume.  This volume will be ORred with those nodes,
148         // flushing them to all "1"s anyway (at least until this
149         // node's filtering is applied.)
150 
151         // This loop really just verifies that specialization was done
152         // properly in the case where there are multiple volume names
153         // (which must be the same).
154 
155         for(int j = 1; j < vols; j++) {
156             _ASSERT(ft->GetVolumes()[j] == ft->GetVolumes()[0]);
157         }
158 
159         volume_map.Reset(new CSeqDB_BitSet(vol_start,
160                                      vol_end,
161                                      CSeqDB_BitSet::eAllSet));
162     } else {
163         // Since this node did not have a volume, we OR together all
164         // of its subnodes.
165 
166         volume_map.Reset(new CSeqDB_BitSet(vol_start,
167                                      vol_end,
168                                      CSeqDB_BitSet::eAllClear));
169 
170         ITERATE(vector< CRef< CSeqDB_FilterTree > >, sub, ft->GetNodes()) {
171             CRef<CSeqDB_BitSet> sub_bits =
172                 x_ComputeFilters(**sub, vol, gis, locked, isBlastDBv5);
173 
174             volume_map->UnionWith(*sub_bits, true);
175         }
176     }
177 
178     // Now we apply this level's filtering.  The first question is, is
179     // it appropriate for a node to use multiple filtering mechanisms
180     // (GI list, OID list, or OID range), either of the same or
181     // different types?  The second question is how are multiply
182     // filtered nodes interpreted?
183 
184     // The SeqDB unit tests assume that multiple filters at a given
185     // level are ANDed together.  The unit tests assume this for the
186     // case of combining OID masks and OID ranges, but in the absence
187     // of another motivating example, I'll assume it means ANDing of
188     // all such mechanisms.
189 
190     CRef<CSeqDB_BitSet> filter(new CSeqDB_BitSet(vol_start,
191                                                  vol_end,
192                                                  CSeqDB_BitSet::eAllSet));
193 
194     // First, apply any 'range' filters, because they can be combined
195     // very efficiently.
196 
197     typedef CSeqDB_FilterTree::TFilters TFilters;
198 
199     ITERATE(TFilters, range, ft->GetFilters()) {
200         const CSeqDB_AliasMask & mask = **range;
201 
202         if (mask.GetType() == CSeqDB_AliasMask::eOidRange) {
203             CSeqDB_BitSet r2(mask.GetBegin(),
204                                 mask.GetEnd(),
205                                 CSeqDB_BitSet::eAllSet);
206             filter->IntersectWith(r2, true);
207         } else if (mask.GetType() == CSeqDB_AliasMask::eMemBit) {
208             // TODO, adding vol-specific OR and AND
209             vol.Vol()->SetMemBit(mask.GetMemBit());
210             // No filter->IntersectWith here since
211             // MEMBIT can not be done at OID level, therefore,
212             // we delegate to seqdbvol (in x_GetFilteredHeader())
213             // for further process.
214         }
215     }
216 
217     ITERATE(TFilters, filt, ft->GetFilters()) {
218         const CSeqDB_AliasMask & mask = **filt;
219 
220         if ((mask.GetType() == CSeqDB_AliasMask::eOidRange)
221             || (mask.GetType() == CSeqDB_AliasMask::eMemBit)
222             || (isBlastDBv5 && (mask.GetType() == CSeqDB_AliasMask::eSiList))
223             || (mask.GetType() == CSeqDB_AliasMask::eTaxIdList)) {
224             continue;
225         }
226 
227         CRef<CSeqDB_BitSet> f;
228         CRef<CSeqDBGiList> idlist;
229         switch(mask.GetType()) {
230         case CSeqDB_AliasMask::eOidList:
231             f = x_GetOidMask(mask.GetPath(), vol_start, vol_end);
232             break;
233 
234         case CSeqDB_AliasMask::eSiList:
235             idlist = gis.GetNodeIdList(mask.GetPath(),
236                                        vol.Vol(),
237                                        CSeqDBGiListSet::eSiList,
238                                        locked);
239             f = x_IdsToBitSet(*idlist, vol_start, vol_end);
240             break;
241 
242         case CSeqDB_AliasMask::eTiList:
243             idlist = gis.GetNodeIdList(mask.GetPath(),
244                                        vol.Vol(),
245                                        CSeqDBGiListSet::eTiList,
246                                        locked);
247             f = x_IdsToBitSet(*idlist, vol_start, vol_end);
248             break;
249 
250         case CSeqDB_AliasMask::eGiList:
251             idlist = gis.GetNodeIdList(mask.GetPath(),
252                                        vol.Vol(),
253                                        CSeqDBGiListSet::eGiList,
254                                        locked);
255             f = x_IdsToBitSet(*idlist, vol_start, vol_end);
256             break;
257 
258         case CSeqDB_AliasMask::eOidRange:
259         case CSeqDB_AliasMask::eMemBit:
260         case CSeqDB_AliasMask::eTaxIdList:
261 
262             // these should have been handled in the previous loop.
263             break;
264         }
265 
266         filter->IntersectWith(*f, true);
267     }
268 
269     volume_map->IntersectWith(*filter, true);
270 
271     return volume_map;
272 }
273 
x_ApplyUserGiList(CSeqDBGiList & gis)274 void CSeqDBOIDList::x_ApplyUserGiList(CSeqDBGiList   & gis)
275 
276 {
277     //m_Atlas.Lock(locked);
278 
279     if (gis.Empty()) {
280         x_ClearBitRange(0, m_NumOIDs);
281         m_NumOIDs = 0;
282         return;
283     }
284 
285     // This is the trivial way to 'sort' OIDs; build a bit vector
286     // spanning the OID range, turn on the bit indexed by each
287     // included OID, and then scan the vector sequentially.  This
288     // technique also uniqifies the set, which is desireable here.
289 
290 
291     int j = 0;
292 
293     if (gis.GetNumGis() || gis.GetNumSis() || gis.GetNumTis() || gis.GetNumPigs()){
294     CRef<CSeqDB_BitSet> gilist_oids(new CSeqDB_BitSet(0, m_NumOIDs));
295     if (gis.GetNumGis()) {
296         for(j = 0; j < gis.GetNumGis(); j++) {
297             int oid = gis.GetGiOid(j).oid;
298             if ((oid != -1) && (oid < m_NumOIDs)) {
299                 gilist_oids->SetBit(oid);
300             }
301         }
302     }
303 
304     if(gis.GetNumSis()) {
305         for(j = 0; j < gis.GetNumSis(); j++) {
306             int oid = gis.GetSiOid(j).oid;
307             if ((oid != -1) && (oid < m_NumOIDs)) {
308                 gilist_oids->SetBit(oid);
309             }
310         }
311     }
312 
313     if(gis.GetNumTis()) {
314         for(j = 0; j < gis.GetNumTis(); j++) {
315             int oid = gis.GetTiOid(j).oid;
316             if ((oid != -1) && (oid < m_NumOIDs)) {
317                 gilist_oids->SetBit(oid);
318             }
319         }
320     }
321 
322     if(gis.GetNumPigs()) {
323         for(j = 0; j < gis.GetNumPigs(); j++) {
324             int oid = gis.GetPigOid(j).oid;
325             if ((oid != -1) && (oid < m_NumOIDs)) {
326                 gilist_oids->SetBit(oid);
327             }
328         }
329     }
330     m_AllBits->IntersectWith(*gilist_oids, true);
331     }
332     const vector<blastdb::TOid> & oids_tax = gis.GetOidsForTaxIdsList();
333     if(oids_tax.size()) {
334         CRef<CSeqDB_BitSet> taxlist_oids(new CSeqDB_BitSet(0, m_NumOIDs));
335         for(unsigned int k = 0; k < oids_tax.size(); k++) {
336             if (oids_tax[k] < m_NumOIDs) {
337                 taxlist_oids->SetBit(oids_tax[k]);
338             }
339         }
340         m_AllBits->IntersectWith(*taxlist_oids, true);
341     }
342 
343 }
344 
x_ApplyNegativeList(CSeqDBNegativeList & nlist,bool is_v5)345 void CSeqDBOIDList::x_ApplyNegativeList(CSeqDBNegativeList & nlist, bool is_v5)
346 
347 {
348     // We require a normalized list in order to turn bits off.
349 	m_AllBits->Normalize();
350     const vector<blastdb::TOid> & excluded_oids = nlist.GetExcludedOids();
351 	for(unsigned int i=0; i < excluded_oids.size(); i++) {
352 	    m_AllBits->ClearBit(excluded_oids[i]);
353 	}
354 
355 	if((!is_v5 && nlist.GetNumSis() > 0) || nlist.GetNumGis() > 0 || nlist.GetNumTis() >  0) {
356 
357     // Intersect the user GI list with the OID bit map.
358 
359     // Iterate over the bitmap, clearing bits we find there but not in
360     // the bool vector.  For very dense OID bit maps, it might be
361     // faster to use two similarly implemented bitmaps and AND them
362     // together word-by-word.
363 
364     int max = nlist.GetNumOids();
365 
366     // Clear any OIDs after the included range.
367 
368     if (max < m_NumOIDs) {
369         CSeqDB_BitSet new_range(0, max, CSeqDB_BitSet::eAllSet);
370         m_AllBits->IntersectWith(new_range, true);
371     }
372 
373     // If a 'get next included oid' method was added to the negative
374     // list, the following loop could be made a bit faster.
375 
376     for(int oid = 0; oid < max; oid++) {
377         if (! nlist.GetOidStatus(oid)) {
378             m_AllBits->ClearBit(oid);
379         }
380     }
381 	}
382 
383 
384 }
385 
386 CRef<CSeqDB_BitSet>
x_IdsToBitSet(const CSeqDBGiList & gilist,int oid_start,int oid_end)387 CSeqDBOIDList::x_IdsToBitSet(const CSeqDBGiList & gilist,
388                              int                  oid_start,
389                              int                  oid_end)
390 {
391     CRef<CSeqDB_BitSet> bits
392         (new CSeqDB_BitSet(oid_start, oid_end, CSeqDB_BitSet::eNone));
393 
394     CSeqDB_BitSet & bitset = *bits;
395 
396     int num_gis = gilist.GetNumGis();
397     int num_tis = gilist.GetNumTis();
398     int num_sis = gilist.GetNumSis();
399     int prev_oid = -1;
400 
401     for(int i = 0; i < num_gis; i++) {
402         int oid = gilist.GetGiOid(i).oid;
403 
404         if (oid != prev_oid) {
405             if ((oid >= oid_start) && (oid < oid_end)) {
406                 bitset.SetBit(oid);
407             }
408             prev_oid = oid;
409         }
410     }
411 
412     for(int i = 0; i < num_tis; i++) {
413         int oid = gilist.GetTiOid(i).oid;
414 
415         if (oid != prev_oid) {
416             if ((oid >= oid_start) && (oid < oid_end)) {
417                 bitset.SetBit(oid);
418             }
419             prev_oid = oid;
420         }
421     }
422 
423     for(int i = 0; i < num_sis; i++) {
424         int oid = gilist.GetSiOid(i).oid;
425 
426         if (oid != prev_oid) {
427             if ((oid >= oid_start) && (oid < oid_end)) {
428                 bitset.SetBit(oid);
429             }
430             prev_oid = oid;
431         }
432     }
433 
434     return bits;
435 }
436 
x_ClearBitRange(int oid_start,int oid_end)437 void CSeqDBOIDList::x_ClearBitRange(int oid_start,
438                                     int oid_end)
439 {
440     m_AllBits->AssignBitRange(oid_start, oid_end, false);
441 }
442 
443 CRef<CSeqDB_BitSet>
x_GetOidMask(const CSeqDB_Path & fn,int vol_start,int vol_end)444 CSeqDBOIDList::x_GetOidMask(const CSeqDB_Path & fn,
445                             int                 vol_start,
446                             int                 vol_end)
447 
448 {
449 
450     // Open file and get pointers
451 
452     TCUC* bitmap = 0;
453     TCUC* bitend = 0;
454 
455     CSeqDBRawFile volmask(m_Atlas);
456     CSeqDBFileMemMap lease(m_Atlas);
457 
458     Uint4 num_oids = 0;
459 
460     {
461         volmask.Open(fn);
462         lease.Init(fn.GetPathS());
463         volmask.ReadSwapped(lease, 0, & num_oids);
464 
465         // This is the index of the last oid, not the count of oids...
466         num_oids++;
467 
468         size_t file_length = (size_t) volmask.GetFileLength();
469 
470         // Cast forces signed/unsigned conversion.
471 
472         volmask.GetFileDataPtr(lease, sizeof(Int4), file_length);
473 
474         bitmap = (TCUC*) lease.GetFileDataPtr(sizeof(Int4));
475 
476         bitend = bitmap + (((num_oids + 31) / 32) * 4);
477     }
478     CRef<CSeqDB_BitSet> bitset(new CSeqDB_BitSet(vol_start, vol_end, bitmap, bitend));
479 
480 
481     // Disable any enabled bits occuring after the volume end point
482     // [this should not normally occur.]
483 
484     for(size_t oid = vol_end; bitset->CheckOrFindBit(oid); oid++) {
485         bitset->ClearBit(oid);
486     }
487 
488     return bitset;
489 }
490 
491 void
DebugDump(CDebugDumpContext ddc,unsigned int depth) const492 CSeqDBOIDList::DebugDump(CDebugDumpContext ddc, unsigned int depth) const
493 {
494     ddc.SetFrame("CSeqDBOIDList");
495     CObject::DebugDump(ddc, depth);
496     ddc.Log("m_NumOIDs", m_NumOIDs);
497     ddc.Log("m_AllBits", m_AllBits, depth);
498 }
499 
500 void
s_GetFilteredOidRange(const CSeqDBVolSet & volset,const vector<string> & vol_basenames,vector<const CSeqDBVolEntry * > & excluded_vols,CRef<CSeqDBGiList> & si_list)501 s_GetFilteredOidRange(const CSeqDBVolSet & volset, const vector<string> &  vol_basenames,
502 		              vector<const CSeqDBVolEntry * >  & excluded_vols,
503 		              CRef<CSeqDBGiList> & si_list)
504 {
505 	unsigned int num_vol = volset.GetNumVols();
506 	vector<bool>  vol_included(num_vol, false);
507 	excluded_vols.clear();
508 	for(unsigned int i=0; i < num_vol; i++) {
509 		const CSeqDBVol * vol = volset.GetVol(i);
510 		if(std::find(vol_basenames.begin(), vol_basenames.end(), vol->GetVolName()) != vol_basenames.end()) {
511 			vol->AttachVolumeGiList(si_list);
512 			continue;
513 		}
514 		excluded_vols.push_back(volset.GetVolEntry(i));
515 	}
516 }
517 
518 bool
s_IsOidInFilteredVol(blastdb::TOid oid,vector<const CSeqDBVolEntry * > & excluded_vols)519 s_IsOidInFilteredVol(blastdb::TOid oid, vector<const CSeqDBVolEntry * >  & excluded_vols)
520 {
521 	for(unsigned int i = 0; i < excluded_vols.size(); i++) {
522 		 const CSeqDBVolEntry & entry = *(excluded_vols[i]);
523 		 if ((entry.OIDStart() <= oid) && (entry.OIDEnd()   >  oid)) {
524 			 return true;
525 		 }
526 	}
527 	return false;
528 }
529 
/// Record that a filter file applies to a volume.
///
/// @p fnames and @p fnames_vols are parallel vectors: fnames_vols[i] holds
/// the volume names associated with fnames[i].  If @p name is already in
/// @p fnames, @p vn is appended to the volume list of its first
/// occurrence; otherwise a new entry is added to both vectors.
///
/// @param name         Filter file name.
/// @param vn           Volume name to associate with the file.
/// @param fnames       Known filter file names (may grow by one).
/// @param fnames_vols  Volume names per filter file (parallel to fnames).
void s_AddFilterFile(string & name, const string & vn, vector<string> & fnames, vector<vector<string> > & fnames_vols)
{
    const vector<string>::iterator known =
        std::find(fnames.begin(), fnames.end(), name);

    if (known != fnames.end()) {
        fnames_vols[known - fnames.begin()].push_back(vn);
        return;
    }

    // First time this file is seen: start its volume list with vn.
    fnames.push_back(name);
    fnames_vols.push_back(vector<string>(1, vn));
}
545 
s_CompareSeqId(const string & id1,const string & id2)546 bool s_CompareSeqId(const string & id1, const string & id2)
547 {
548 	if (id1 == id2){
549 		return false;
550 	}
551 	CSeq_id seq_id1(id1, (CSeq_id::fParse_AnyRaw | CSeq_id::fParse_ValidLocal));
552 	CSeq_id seq_id2(id2, (CSeq_id::fParse_AnyRaw | CSeq_id::fParse_ValidLocal));
553 	if (seq_id1.Match(seq_id2)) {
554 		return false;
555 	}
556 	return (id1 < id2);
557 }
558 
s_ProcessSeqIdFilters(const vector<string> & fnames,vector<vector<string>> & fnames_vols,CRef<CSeqDBGiList> user_list,CRef<CSeqDBNegativeList> neg_user_list,const CSeqDBLMDBSet & lmdb_set,const CSeqDBVolSet & volset,CSeqDB_BitSet & filter_bit)559 void s_ProcessSeqIdFilters(const vector<string>     & fnames,
560 						   vector<vector<string> >  & fnames_vols,
561 		                   CRef<CSeqDBGiList>		  user_list,
562                            CRef<CSeqDBNegativeList>   neg_user_list,
563                            const CSeqDBLMDBSet      & lmdb_set,
564                            const CSeqDBVolSet       & volset,
565                            CSeqDB_BitSet 			& filter_bit)
566 {
567 	if (fnames.size() == 0) {
568 		return;
569 	}
570 	vector<string> user_accs;
571 	if ((!user_list.Empty()) && (user_list->GetNumSis() > 0)) {
572 		user_list->GetSiList(user_accs);
573 		sort(user_accs.begin(), user_accs.end(), s_CompareSeqId);
574 	}
575 	vector<string> neg_user_accs;
576 	if ((!neg_user_list.Empty()) && (neg_user_list->GetNumSis() > 0)) {
577 		neg_user_accs = neg_user_list->GetSiList();
578 		sort(neg_user_accs.begin(), neg_user_accs.end());
579 	}
580 
581 	for(unsigned int k=0; k < fnames.size(); k++) {
582 		vector<const CSeqDBVolEntry * > excluded_vols;
583 		vector<blastdb::TOid> oids;
584 		CRef<CSeqDBGiList> list(new CSeqDBFileGiList(fnames[k], CSeqDBFileGiList::eSiList));
585 		s_GetFilteredOidRange(volset, fnames_vols[k], excluded_vols, list);
586 		vector<string> accs;
587 		list->GetSiList(accs);
588 		if(accs.size() == 0){
589 				continue;
590 		}
591 		if((user_accs.size() > 0)  || (neg_user_accs.size() > 0)){
592 			sort(accs.begin(), accs.end(), s_CompareSeqId);
593 			if (user_accs.size() > 0) {
594 				vector<string> common;
595 				common.resize(accs.size());
596 				vector<string>::iterator itr = set_intersection(accs.begin(), accs.end(),
597 					                                            user_accs.begin(), user_accs.end(), common.begin(), s_CompareSeqId);
598 				common.resize(itr-common.begin());
599 				if(common.size() == 0){
600 					continue;
601 				}
602 				swap(accs, common);
603 			}
604 			if(neg_user_accs.size() > 0) {
605 				vector<string> difference;
606 				difference.resize(accs.size());
607 				vector<string>::iterator itr = set_difference(accs.begin(), accs.end(),
608 									                          neg_user_accs.begin(), neg_user_accs.end(), difference.begin(), s_CompareSeqId);
609 				difference.resize(itr-difference.begin());
610 				if(difference.size() == 0){
611 					continue;
612 				}
613 				swap(accs, difference);
614 			}
615 		}
616 
617 		lmdb_set.AccessionsToOids(accs, oids);
618 		for(unsigned int i=0; i < accs.size(); i++) {
619 			if(oids[i] == kSeqDBEntryNotFound) {
620 				continue;
621 			}
622 			if(excluded_vols.size() != 0) {
623 				if (s_IsOidInFilteredVol(oids[i], excluded_vols)) {
624 					continue;
625 				}
626 			}
627 			filter_bit.SetBit(oids[i]);
628 		}
629 	}
630 }
631 
s_ProcessTaxIdFilters(const vector<string> & fnames,vector<vector<string>> & fnames_vols,CRef<CSeqDBGiList> user_list,CRef<CSeqDBNegativeList> neg_user_list,const CSeqDBLMDBSet & lmdb_set,const CSeqDBVolSet & volset,CSeqDB_BitSet & filter_bit)632 void s_ProcessTaxIdFilters(const vector<string> &     fnames,
633 						   vector<vector<string> >  & fnames_vols,
634 		                   CRef<CSeqDBGiList>		  user_list,
635                            CRef<CSeqDBNegativeList>   neg_user_list,
636                            const CSeqDBLMDBSet      & lmdb_set,
637                            const CSeqDBVolSet       & volset,
638                            CSeqDB_BitSet 			& filter_bit)
639 {
640 	if (fnames.size() == 0) {
641 		return;
642 	}
643 
644 	set<TTaxId> user_taxids;
645 	if(!user_list.Empty() && (user_list->GetNumTaxIds() > 0)) {
646 		user_taxids = user_list->GetTaxIdsList();
647 	}
648 	set<TTaxId> neg_user_taxids;
649 	if(!neg_user_list.Empty() && (neg_user_list->GetNumTaxIds() > 0)) {
650 		neg_user_taxids = neg_user_list->GetTaxIdsList();
651 	}
652 
653 	for(unsigned int k=0; k < fnames.size(); k++) {
654 		vector<const CSeqDBVolEntry * > excluded_vols;
655 		vector<blastdb::TOid> oids;
656 		CRef<CSeqDBGiList> list(new CSeqDBFileGiList(fnames[k], CSeqDBFileGiList::eTaxIdList));
657 		s_GetFilteredOidRange(volset, fnames_vols[k], excluded_vols, list);
658 		set<TTaxId> taxids;
659 		taxids = list->GetTaxIdsList();
660 		if(taxids.size() == 0){
661 			continue;
662 		}
663 		if(user_taxids.size() > 0){
664 			vector<TTaxId> common;
665 			common.resize(taxids.size());
666 			vector<TTaxId>::iterator itr = set_intersection(taxids.begin(), taxids.end(),
667 					                                      user_taxids.begin(), user_taxids.end(), common.begin());
668 			common.resize(itr-common.begin());
669 			if( common.size() == 0) {
670 				continue;
671 			}
672 			taxids.clear();
673 			taxids.insert(common.begin(), common.end());
674 		}
675 		if(neg_user_taxids.size() > 0) {
676 			vector<TTaxId> difference;
677 			difference.resize(taxids.size());
678 			vector<TTaxId>::iterator itr = set_difference(taxids.begin(), taxids.end(),
679 								                        neg_user_taxids.begin(), neg_user_taxids.end(), difference.begin());
680 			difference.resize(itr-difference.begin());
681 			if(difference.size() == 0){
682 				continue;
683 			}
684 			taxids.clear();
685 			taxids.insert(difference.begin(), difference.end());
686 		}
687 
688 		lmdb_set.TaxIdsToOids(taxids, oids);
689 		for(unsigned int i=0; i < oids.size(); i++) {
690 			if(excluded_vols.size() != 0) {
691 				if (s_IsOidInFilteredVol(oids[i], excluded_vols)) {
692 					continue;
693 				}
694 			}
695 			filter_bit.SetBit(oids[i]);
696 		}
697 	}
698 }
699 
700 bool
x_ComputeFilters(const CSeqDBVolSet & volset,const CSeqDB_FilterTree & filters,const CSeqDBLMDBSet & lmdb_set,CSeqDB_BitSet & filter_bit,CRef<CSeqDBGiList> user_list,CRef<CSeqDBNegativeList> neg_user_list)701 CSeqDBOIDList::x_ComputeFilters(const CSeqDBVolSet       & volset,
702 		                        const CSeqDB_FilterTree  & filters,
703    		                        const CSeqDBLMDBSet      & lmdb_set,
704    		                        CSeqDB_BitSet 			 & filter_bit,
705    		                        CRef<CSeqDBGiList>		   user_list,
706    		                        CRef<CSeqDBNegativeList>   neg_user_list)
707 {
708 	vector<string> seqid_fnames;
709 	vector<string> taxid_fnames;
710 	vector< vector<string> > seqid_fnames_vols;
711 	vector< vector<string> > taxid_fnames_vols;
712 
713 	for(int i = 0; i < volset.GetNumVols(); i++) {
714 		const CSeqDBVolEntry & vol = *(volset.GetVolEntry(i));
715 	    const string & vn = vol.Vol()->GetVolName();
716         CRef<CSeqDB_FilterTree> ft = filters.Specialize(vn);
717        	ITERATE(CSeqDB_FilterTree::TFilters, itr, ft->GetFilters()){
718         	if(((*itr)->GetType() == CSeqDB_AliasMask::eSiList) ||
719         	   ((*itr)->GetType() == CSeqDB_AliasMask::eTaxIdList)) {
720         		string name = (*itr)->GetPath().GetPathS();
721         		if((*itr)->GetType() == CSeqDB_AliasMask::eSiList) {
722         			s_AddFilterFile(name, vn, seqid_fnames, seqid_fnames_vols);
723         		}
724         		else {
725         			s_AddFilterFile(name, vn, taxid_fnames, taxid_fnames_vols);
726         		}
727         		filter_bit.AssignBitRange(vol.OIDStart(), vol.OIDEnd(), false);
728         	}
729         }
730 	}
731 
732 	if (seqid_fnames.size() > 0) {
733 		s_ProcessSeqIdFilters(seqid_fnames, seqid_fnames_vols, user_list, neg_user_list,
734 	                          lmdb_set, volset, filter_bit);
735 	}
736 	if (taxid_fnames.size() > 0) {
737 		s_ProcessTaxIdFilters(taxid_fnames, taxid_fnames_vols, user_list, neg_user_list,
738 	                          lmdb_set, volset, filter_bit);
739 	}
740 
741 	return ((seqid_fnames.size() + taxid_fnames.size()) > 0 ? true:false);
742 }
743 
744 
745 END_NCBI_SCOPE
746 
747