1 /* $Id: seqdboidlist.cpp 610974 2020-06-26 12:59:33Z grichenk $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Kevin Bealer
27 *
28 */
29
30 /// @file seqdboidlist.cpp
31 /// Implementation for the CSeqDBOIDList class, an array of bits
32 /// describing a subset of the virtual oid space.
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistr.hpp>
35 #include "seqdboidlist.hpp"
36 #include "seqdbfilter.hpp"
37 #include <objtools/blast/seqdb_reader/impl/seqdbfile.hpp>
38 #include "seqdbgilistset.hpp"
39 #include <algorithm>
40
41 BEGIN_NCBI_SCOPE
42
CSeqDBOIDList(CSeqDBAtlas & atlas,const CSeqDBVolSet & volset,CSeqDB_FilterTree & filters,CRef<CSeqDBGiList> & gi_list,CRef<CSeqDBNegativeList> & neg_list,CSeqDBLockHold & locked,const CSeqDBLMDBSet & lmdb_set)43 CSeqDBOIDList::CSeqDBOIDList(CSeqDBAtlas & atlas,
44 const CSeqDBVolSet & volset,
45 CSeqDB_FilterTree & filters,
46 CRef<CSeqDBGiList> & gi_list,
47 CRef<CSeqDBNegativeList> & neg_list,
48 CSeqDBLockHold & locked,
49 const CSeqDBLMDBSet & lmdb_set)
50 : m_Atlas (atlas),
51 m_Lease (atlas),
52 m_NumOIDs (0)
53 {
54 _ASSERT(gi_list.NotEmpty() || neg_list.NotEmpty() || filters.HasFilter());
55 x_Setup( volset, filters, gi_list, neg_list, locked, lmdb_set);
56 }
57
~CSeqDBOIDList()58 CSeqDBOIDList::~CSeqDBOIDList()
59 {
60 }
61
62 // The general rule I am following in these methods is to use byte
63 // computations except during actual looping.
64
x_Setup(const CSeqDBVolSet & volset,CSeqDB_FilterTree & filters,CRef<CSeqDBGiList> & gi_list,CRef<CSeqDBNegativeList> & neg_list,CSeqDBLockHold & locked,const CSeqDBLMDBSet & lmdb_set)65 void CSeqDBOIDList::x_Setup(const CSeqDBVolSet & volset,
66 CSeqDB_FilterTree & filters,
67 CRef<CSeqDBGiList> & gi_list,
68 CRef<CSeqDBNegativeList> & neg_list,
69 CSeqDBLockHold & locked,
70 const CSeqDBLMDBSet & lmdb_set)
71 {
72 // First, get the memory space for the OID bitmap and clear it.
73
74 // Pad memory space to word boundary, add 8 bytes for "insurance". Some
75 // of the algorithms here need to do bit shifting and OR half of a source
76 // element into this destination element, and the other half into this
77 // other destination element. Rather than sprinkle this code with range
78 // checks, padding is used.
79
80 m_NumOIDs = volset.GetNumOIDs();
81
82 m_AllBits.Reset(new CSeqDB_BitSet(0, m_NumOIDs));
83
84 CSeqDBGiListSet gi_list_set(m_Atlas,
85 volset,
86 gi_list,
87 neg_list,
88 locked,
89 lmdb_set);
90 // Then get the list of filenames and offsets to overlay onto it.
91
92 for(int i = 0; i < volset.GetNumVols(); i++) {
93 const CSeqDBVolEntry * v1 = volset.GetVolEntry(i);
94
95 CRef<CSeqDB_BitSet> vol_bits =
96 x_ComputeFilters(filters, *v1, gi_list_set, locked, lmdb_set.IsBlastDBVersion5());
97
98 m_AllBits->UnionWith(*vol_bits, true);
99 }
100
101 if (lmdb_set.IsBlastDBVersion5() && filters.HasFilter()) {
102 CSeqDB_BitSet f_bits(0, m_NumOIDs);
103 f_bits.AssignBitRange(0, m_NumOIDs, true);
104 if(x_ComputeFilters(volset, filters, lmdb_set, f_bits, gi_list, neg_list)) {
105 m_AllBits->IntersectWith(f_bits, true);
106 }
107 }
108
109 if (gi_list.NotEmpty()) {
110 x_ApplyUserGiList(*gi_list);
111 }
112 if (neg_list.NotEmpty()) {
113 x_ApplyNegativeList(*neg_list, lmdb_set.IsBlastDBVersion5());
114 }
115
116 while(m_NumOIDs && (! x_IsSet(m_NumOIDs - 1))) {
117 -- m_NumOIDs;
118 }
119 }
120
121 CRef<CSeqDB_BitSet>
x_ComputeFilters(const CSeqDB_FilterTree & filters,const CSeqDBVolEntry & vol,CSeqDBGiListSet & gis,CSeqDBLockHold & locked,bool isBlastDBv5)122 CSeqDBOIDList::x_ComputeFilters(const CSeqDB_FilterTree & filters,
123 const CSeqDBVolEntry & vol,
124 CSeqDBGiListSet & gis,
125 CSeqDBLockHold & locked,
126 bool isBlastDBv5)
127
128 {
129 const string & vn = vol.Vol()->GetVolName();
130 CRef<CSeqDB_FilterTree> ft = filters.Specialize(vn);
131
132 int vol_start = vol.OIDStart();
133 int vol_end = vol.OIDEnd();
134
135 CRef<CSeqDB_BitSet> volume_map;
136
137 // Step 1: Compute the bitmap representing the filtering done by
138 // all subnodes. This is a "union".
139
140 int vols = ft->GetVolumes().size();
141
142 _ASSERT(vols || ft->GetNodes().size());
143
144 if (vols > 0) {
145 // This filter tree is filtered by volume name, so all nodes
146 // below this point can be ignored if this node contains a
147 // volume. This volume will be ORred with those nodes,
148 // flushing them to all "1"s anyway (at least until this
149 // node's filtering is applied.)
150
151 // This loop really just verifies that specialization was done
152 // properly in the case where there are multiple volume names
153 // (which must be the same).
154
155 for(int j = 1; j < vols; j++) {
156 _ASSERT(ft->GetVolumes()[j] == ft->GetVolumes()[0]);
157 }
158
159 volume_map.Reset(new CSeqDB_BitSet(vol_start,
160 vol_end,
161 CSeqDB_BitSet::eAllSet));
162 } else {
163 // Since this node did not have a volume, we OR together all
164 // of its subnodes.
165
166 volume_map.Reset(new CSeqDB_BitSet(vol_start,
167 vol_end,
168 CSeqDB_BitSet::eAllClear));
169
170 ITERATE(vector< CRef< CSeqDB_FilterTree > >, sub, ft->GetNodes()) {
171 CRef<CSeqDB_BitSet> sub_bits =
172 x_ComputeFilters(**sub, vol, gis, locked, isBlastDBv5);
173
174 volume_map->UnionWith(*sub_bits, true);
175 }
176 }
177
178 // Now we apply this level's filtering. The first question is, is
179 // it appropriate for a node to use multiple filtering mechanisms
180 // (GI list, OID list, or OID range), either of the same or
181 // different types? The second question is how are multiply
182 // filtered nodes interpreted?
183
184 // The SeqDB unit tests assume that multiple filters at a given
185 // level are ANDed together. The unit tests assume this for the
186 // case of combining OID masks and OID ranges, but in the absence
187 // of another motivating example, I'll assume it means ANDing of
188 // all such mechanisms.
189
190 CRef<CSeqDB_BitSet> filter(new CSeqDB_BitSet(vol_start,
191 vol_end,
192 CSeqDB_BitSet::eAllSet));
193
194 // First, apply any 'range' filters, because they can be combined
195 // very efficiently.
196
197 typedef CSeqDB_FilterTree::TFilters TFilters;
198
199 ITERATE(TFilters, range, ft->GetFilters()) {
200 const CSeqDB_AliasMask & mask = **range;
201
202 if (mask.GetType() == CSeqDB_AliasMask::eOidRange) {
203 CSeqDB_BitSet r2(mask.GetBegin(),
204 mask.GetEnd(),
205 CSeqDB_BitSet::eAllSet);
206 filter->IntersectWith(r2, true);
207 } else if (mask.GetType() == CSeqDB_AliasMask::eMemBit) {
208 // TODO, adding vol-specific OR and AND
209 vol.Vol()->SetMemBit(mask.GetMemBit());
210 // No filter->IntersectWith here since
211 // MEMBIT can not be done at OID level, therefore,
212 // we delegate to seqdbvol (in x_GetFilteredHeader())
213 // for further process.
214 }
215 }
216
217 ITERATE(TFilters, filt, ft->GetFilters()) {
218 const CSeqDB_AliasMask & mask = **filt;
219
220 if ((mask.GetType() == CSeqDB_AliasMask::eOidRange)
221 || (mask.GetType() == CSeqDB_AliasMask::eMemBit)
222 || (isBlastDBv5 && (mask.GetType() == CSeqDB_AliasMask::eSiList))
223 || (mask.GetType() == CSeqDB_AliasMask::eTaxIdList)) {
224 continue;
225 }
226
227 CRef<CSeqDB_BitSet> f;
228 CRef<CSeqDBGiList> idlist;
229 switch(mask.GetType()) {
230 case CSeqDB_AliasMask::eOidList:
231 f = x_GetOidMask(mask.GetPath(), vol_start, vol_end);
232 break;
233
234 case CSeqDB_AliasMask::eSiList:
235 idlist = gis.GetNodeIdList(mask.GetPath(),
236 vol.Vol(),
237 CSeqDBGiListSet::eSiList,
238 locked);
239 f = x_IdsToBitSet(*idlist, vol_start, vol_end);
240 break;
241
242 case CSeqDB_AliasMask::eTiList:
243 idlist = gis.GetNodeIdList(mask.GetPath(),
244 vol.Vol(),
245 CSeqDBGiListSet::eTiList,
246 locked);
247 f = x_IdsToBitSet(*idlist, vol_start, vol_end);
248 break;
249
250 case CSeqDB_AliasMask::eGiList:
251 idlist = gis.GetNodeIdList(mask.GetPath(),
252 vol.Vol(),
253 CSeqDBGiListSet::eGiList,
254 locked);
255 f = x_IdsToBitSet(*idlist, vol_start, vol_end);
256 break;
257
258 case CSeqDB_AliasMask::eOidRange:
259 case CSeqDB_AliasMask::eMemBit:
260 case CSeqDB_AliasMask::eTaxIdList:
261
262 // these should have been handled in the previous loop.
263 break;
264 }
265
266 filter->IntersectWith(*f, true);
267 }
268
269 volume_map->IntersectWith(*filter, true);
270
271 return volume_map;
272 }
273
x_ApplyUserGiList(CSeqDBGiList & gis)274 void CSeqDBOIDList::x_ApplyUserGiList(CSeqDBGiList & gis)
275
276 {
277 //m_Atlas.Lock(locked);
278
279 if (gis.Empty()) {
280 x_ClearBitRange(0, m_NumOIDs);
281 m_NumOIDs = 0;
282 return;
283 }
284
285 // This is the trivial way to 'sort' OIDs; build a bit vector
286 // spanning the OID range, turn on the bit indexed by each
287 // included OID, and then scan the vector sequentially. This
288 // technique also uniqifies the set, which is desireable here.
289
290
291 int j = 0;
292
293 if (gis.GetNumGis() || gis.GetNumSis() || gis.GetNumTis() || gis.GetNumPigs()){
294 CRef<CSeqDB_BitSet> gilist_oids(new CSeqDB_BitSet(0, m_NumOIDs));
295 if (gis.GetNumGis()) {
296 for(j = 0; j < gis.GetNumGis(); j++) {
297 int oid = gis.GetGiOid(j).oid;
298 if ((oid != -1) && (oid < m_NumOIDs)) {
299 gilist_oids->SetBit(oid);
300 }
301 }
302 }
303
304 if(gis.GetNumSis()) {
305 for(j = 0; j < gis.GetNumSis(); j++) {
306 int oid = gis.GetSiOid(j).oid;
307 if ((oid != -1) && (oid < m_NumOIDs)) {
308 gilist_oids->SetBit(oid);
309 }
310 }
311 }
312
313 if(gis.GetNumTis()) {
314 for(j = 0; j < gis.GetNumTis(); j++) {
315 int oid = gis.GetTiOid(j).oid;
316 if ((oid != -1) && (oid < m_NumOIDs)) {
317 gilist_oids->SetBit(oid);
318 }
319 }
320 }
321
322 if(gis.GetNumPigs()) {
323 for(j = 0; j < gis.GetNumPigs(); j++) {
324 int oid = gis.GetPigOid(j).oid;
325 if ((oid != -1) && (oid < m_NumOIDs)) {
326 gilist_oids->SetBit(oid);
327 }
328 }
329 }
330 m_AllBits->IntersectWith(*gilist_oids, true);
331 }
332 const vector<blastdb::TOid> & oids_tax = gis.GetOidsForTaxIdsList();
333 if(oids_tax.size()) {
334 CRef<CSeqDB_BitSet> taxlist_oids(new CSeqDB_BitSet(0, m_NumOIDs));
335 for(unsigned int k = 0; k < oids_tax.size(); k++) {
336 if (oids_tax[k] < m_NumOIDs) {
337 taxlist_oids->SetBit(oids_tax[k]);
338 }
339 }
340 m_AllBits->IntersectWith(*taxlist_oids, true);
341 }
342
343 }
344
x_ApplyNegativeList(CSeqDBNegativeList & nlist,bool is_v5)345 void CSeqDBOIDList::x_ApplyNegativeList(CSeqDBNegativeList & nlist, bool is_v5)
346
347 {
348 // We require a normalized list in order to turn bits off.
349 m_AllBits->Normalize();
350 const vector<blastdb::TOid> & excluded_oids = nlist.GetExcludedOids();
351 for(unsigned int i=0; i < excluded_oids.size(); i++) {
352 m_AllBits->ClearBit(excluded_oids[i]);
353 }
354
355 if((!is_v5 && nlist.GetNumSis() > 0) || nlist.GetNumGis() > 0 || nlist.GetNumTis() > 0) {
356
357 // Intersect the user GI list with the OID bit map.
358
359 // Iterate over the bitmap, clearing bits we find there but not in
360 // the bool vector. For very dense OID bit maps, it might be
361 // faster to use two similarly implemented bitmaps and AND them
362 // together word-by-word.
363
364 int max = nlist.GetNumOids();
365
366 // Clear any OIDs after the included range.
367
368 if (max < m_NumOIDs) {
369 CSeqDB_BitSet new_range(0, max, CSeqDB_BitSet::eAllSet);
370 m_AllBits->IntersectWith(new_range, true);
371 }
372
373 // If a 'get next included oid' method was added to the negative
374 // list, the following loop could be made a bit faster.
375
376 for(int oid = 0; oid < max; oid++) {
377 if (! nlist.GetOidStatus(oid)) {
378 m_AllBits->ClearBit(oid);
379 }
380 }
381 }
382
383
384 }
385
386 CRef<CSeqDB_BitSet>
x_IdsToBitSet(const CSeqDBGiList & gilist,int oid_start,int oid_end)387 CSeqDBOIDList::x_IdsToBitSet(const CSeqDBGiList & gilist,
388 int oid_start,
389 int oid_end)
390 {
391 CRef<CSeqDB_BitSet> bits
392 (new CSeqDB_BitSet(oid_start, oid_end, CSeqDB_BitSet::eNone));
393
394 CSeqDB_BitSet & bitset = *bits;
395
396 int num_gis = gilist.GetNumGis();
397 int num_tis = gilist.GetNumTis();
398 int num_sis = gilist.GetNumSis();
399 int prev_oid = -1;
400
401 for(int i = 0; i < num_gis; i++) {
402 int oid = gilist.GetGiOid(i).oid;
403
404 if (oid != prev_oid) {
405 if ((oid >= oid_start) && (oid < oid_end)) {
406 bitset.SetBit(oid);
407 }
408 prev_oid = oid;
409 }
410 }
411
412 for(int i = 0; i < num_tis; i++) {
413 int oid = gilist.GetTiOid(i).oid;
414
415 if (oid != prev_oid) {
416 if ((oid >= oid_start) && (oid < oid_end)) {
417 bitset.SetBit(oid);
418 }
419 prev_oid = oid;
420 }
421 }
422
423 for(int i = 0; i < num_sis; i++) {
424 int oid = gilist.GetSiOid(i).oid;
425
426 if (oid != prev_oid) {
427 if ((oid >= oid_start) && (oid < oid_end)) {
428 bitset.SetBit(oid);
429 }
430 prev_oid = oid;
431 }
432 }
433
434 return bits;
435 }
436
x_ClearBitRange(int oid_start,int oid_end)437 void CSeqDBOIDList::x_ClearBitRange(int oid_start,
438 int oid_end)
439 {
440 m_AllBits->AssignBitRange(oid_start, oid_end, false);
441 }
442
443 CRef<CSeqDB_BitSet>
x_GetOidMask(const CSeqDB_Path & fn,int vol_start,int vol_end)444 CSeqDBOIDList::x_GetOidMask(const CSeqDB_Path & fn,
445 int vol_start,
446 int vol_end)
447
448 {
449
450 // Open file and get pointers
451
452 TCUC* bitmap = 0;
453 TCUC* bitend = 0;
454
455 CSeqDBRawFile volmask(m_Atlas);
456 CSeqDBFileMemMap lease(m_Atlas);
457
458 Uint4 num_oids = 0;
459
460 {
461 volmask.Open(fn);
462 lease.Init(fn.GetPathS());
463 volmask.ReadSwapped(lease, 0, & num_oids);
464
465 // This is the index of the last oid, not the count of oids...
466 num_oids++;
467
468 size_t file_length = (size_t) volmask.GetFileLength();
469
470 // Cast forces signed/unsigned conversion.
471
472 volmask.GetFileDataPtr(lease, sizeof(Int4), file_length);
473
474 bitmap = (TCUC*) lease.GetFileDataPtr(sizeof(Int4));
475
476 bitend = bitmap + (((num_oids + 31) / 32) * 4);
477 }
478 CRef<CSeqDB_BitSet> bitset(new CSeqDB_BitSet(vol_start, vol_end, bitmap, bitend));
479
480
481 // Disable any enabled bits occuring after the volume end point
482 // [this should not normally occur.]
483
484 for(size_t oid = vol_end; bitset->CheckOrFindBit(oid); oid++) {
485 bitset->ClearBit(oid);
486 }
487
488 return bitset;
489 }
490
491 void
DebugDump(CDebugDumpContext ddc,unsigned int depth) const492 CSeqDBOIDList::DebugDump(CDebugDumpContext ddc, unsigned int depth) const
493 {
494 ddc.SetFrame("CSeqDBOIDList");
495 CObject::DebugDump(ddc, depth);
496 ddc.Log("m_NumOIDs", m_NumOIDs);
497 ddc.Log("m_AllBits", m_AllBits, depth);
498 }
499
500 void
s_GetFilteredOidRange(const CSeqDBVolSet & volset,const vector<string> & vol_basenames,vector<const CSeqDBVolEntry * > & excluded_vols,CRef<CSeqDBGiList> & si_list)501 s_GetFilteredOidRange(const CSeqDBVolSet & volset, const vector<string> & vol_basenames,
502 vector<const CSeqDBVolEntry * > & excluded_vols,
503 CRef<CSeqDBGiList> & si_list)
504 {
505 unsigned int num_vol = volset.GetNumVols();
506 vector<bool> vol_included(num_vol, false);
507 excluded_vols.clear();
508 for(unsigned int i=0; i < num_vol; i++) {
509 const CSeqDBVol * vol = volset.GetVol(i);
510 if(std::find(vol_basenames.begin(), vol_basenames.end(), vol->GetVolName()) != vol_basenames.end()) {
511 vol->AttachVolumeGiList(si_list);
512 continue;
513 }
514 excluded_vols.push_back(volset.GetVolEntry(i));
515 }
516 }
517
518 bool
s_IsOidInFilteredVol(blastdb::TOid oid,vector<const CSeqDBVolEntry * > & excluded_vols)519 s_IsOidInFilteredVol(blastdb::TOid oid, vector<const CSeqDBVolEntry * > & excluded_vols)
520 {
521 for(unsigned int i = 0; i < excluded_vols.size(); i++) {
522 const CSeqDBVolEntry & entry = *(excluded_vols[i]);
523 if ((entry.OIDStart() <= oid) && (entry.OIDEnd() > oid)) {
524 return true;
525 }
526 }
527 return false;
528 }
529
/// Record that a filter file applies to a volume.
///
/// fnames and fnames_vols are parallel vectors: fnames_vols[i] lists
/// the volumes for fnames[i].  If name is already in fnames, vn is
/// appended to that entry's volume list (first match only); otherwise
/// a new entry holding just vn is appended to both vectors.
///
/// @param name        Filter file name.
/// @param vn          Volume name.
/// @param fnames      Known filter file names.
/// @param fnames_vols Volume names per known filter file.
void s_AddFilterFile(std::string & name,
                     const std::string & vn,
                     std::vector<std::string> & fnames,
                     std::vector<std::vector<std::string> > & fnames_vols)
{
    const std::vector<std::string>::iterator hit =
        std::find(fnames.begin(), fnames.end(), name);

    if (hit != fnames.end()) {
        // Known file: extend the volume list stored at the same index.
        fnames_vols[hit - fnames.begin()].push_back(vn);
        return;
    }

    // New file: start a fresh entry in both vectors.
    fnames.push_back(name);
    fnames_vols.push_back(std::vector<std::string>(1, vn));
}
545
s_CompareSeqId(const string & id1,const string & id2)546 bool s_CompareSeqId(const string & id1, const string & id2)
547 {
548 if (id1 == id2){
549 return false;
550 }
551 CSeq_id seq_id1(id1, (CSeq_id::fParse_AnyRaw | CSeq_id::fParse_ValidLocal));
552 CSeq_id seq_id2(id2, (CSeq_id::fParse_AnyRaw | CSeq_id::fParse_ValidLocal));
553 if (seq_id1.Match(seq_id2)) {
554 return false;
555 }
556 return (id1 < id2);
557 }
558
s_ProcessSeqIdFilters(const vector<string> & fnames,vector<vector<string>> & fnames_vols,CRef<CSeqDBGiList> user_list,CRef<CSeqDBNegativeList> neg_user_list,const CSeqDBLMDBSet & lmdb_set,const CSeqDBVolSet & volset,CSeqDB_BitSet & filter_bit)559 void s_ProcessSeqIdFilters(const vector<string> & fnames,
560 vector<vector<string> > & fnames_vols,
561 CRef<CSeqDBGiList> user_list,
562 CRef<CSeqDBNegativeList> neg_user_list,
563 const CSeqDBLMDBSet & lmdb_set,
564 const CSeqDBVolSet & volset,
565 CSeqDB_BitSet & filter_bit)
566 {
567 if (fnames.size() == 0) {
568 return;
569 }
570 vector<string> user_accs;
571 if ((!user_list.Empty()) && (user_list->GetNumSis() > 0)) {
572 user_list->GetSiList(user_accs);
573 sort(user_accs.begin(), user_accs.end(), s_CompareSeqId);
574 }
575 vector<string> neg_user_accs;
576 if ((!neg_user_list.Empty()) && (neg_user_list->GetNumSis() > 0)) {
577 neg_user_accs = neg_user_list->GetSiList();
578 sort(neg_user_accs.begin(), neg_user_accs.end());
579 }
580
581 for(unsigned int k=0; k < fnames.size(); k++) {
582 vector<const CSeqDBVolEntry * > excluded_vols;
583 vector<blastdb::TOid> oids;
584 CRef<CSeqDBGiList> list(new CSeqDBFileGiList(fnames[k], CSeqDBFileGiList::eSiList));
585 s_GetFilteredOidRange(volset, fnames_vols[k], excluded_vols, list);
586 vector<string> accs;
587 list->GetSiList(accs);
588 if(accs.size() == 0){
589 continue;
590 }
591 if((user_accs.size() > 0) || (neg_user_accs.size() > 0)){
592 sort(accs.begin(), accs.end(), s_CompareSeqId);
593 if (user_accs.size() > 0) {
594 vector<string> common;
595 common.resize(accs.size());
596 vector<string>::iterator itr = set_intersection(accs.begin(), accs.end(),
597 user_accs.begin(), user_accs.end(), common.begin(), s_CompareSeqId);
598 common.resize(itr-common.begin());
599 if(common.size() == 0){
600 continue;
601 }
602 swap(accs, common);
603 }
604 if(neg_user_accs.size() > 0) {
605 vector<string> difference;
606 difference.resize(accs.size());
607 vector<string>::iterator itr = set_difference(accs.begin(), accs.end(),
608 neg_user_accs.begin(), neg_user_accs.end(), difference.begin(), s_CompareSeqId);
609 difference.resize(itr-difference.begin());
610 if(difference.size() == 0){
611 continue;
612 }
613 swap(accs, difference);
614 }
615 }
616
617 lmdb_set.AccessionsToOids(accs, oids);
618 for(unsigned int i=0; i < accs.size(); i++) {
619 if(oids[i] == kSeqDBEntryNotFound) {
620 continue;
621 }
622 if(excluded_vols.size() != 0) {
623 if (s_IsOidInFilteredVol(oids[i], excluded_vols)) {
624 continue;
625 }
626 }
627 filter_bit.SetBit(oids[i]);
628 }
629 }
630 }
631
s_ProcessTaxIdFilters(const vector<string> & fnames,vector<vector<string>> & fnames_vols,CRef<CSeqDBGiList> user_list,CRef<CSeqDBNegativeList> neg_user_list,const CSeqDBLMDBSet & lmdb_set,const CSeqDBVolSet & volset,CSeqDB_BitSet & filter_bit)632 void s_ProcessTaxIdFilters(const vector<string> & fnames,
633 vector<vector<string> > & fnames_vols,
634 CRef<CSeqDBGiList> user_list,
635 CRef<CSeqDBNegativeList> neg_user_list,
636 const CSeqDBLMDBSet & lmdb_set,
637 const CSeqDBVolSet & volset,
638 CSeqDB_BitSet & filter_bit)
639 {
640 if (fnames.size() == 0) {
641 return;
642 }
643
644 set<TTaxId> user_taxids;
645 if(!user_list.Empty() && (user_list->GetNumTaxIds() > 0)) {
646 user_taxids = user_list->GetTaxIdsList();
647 }
648 set<TTaxId> neg_user_taxids;
649 if(!neg_user_list.Empty() && (neg_user_list->GetNumTaxIds() > 0)) {
650 neg_user_taxids = neg_user_list->GetTaxIdsList();
651 }
652
653 for(unsigned int k=0; k < fnames.size(); k++) {
654 vector<const CSeqDBVolEntry * > excluded_vols;
655 vector<blastdb::TOid> oids;
656 CRef<CSeqDBGiList> list(new CSeqDBFileGiList(fnames[k], CSeqDBFileGiList::eTaxIdList));
657 s_GetFilteredOidRange(volset, fnames_vols[k], excluded_vols, list);
658 set<TTaxId> taxids;
659 taxids = list->GetTaxIdsList();
660 if(taxids.size() == 0){
661 continue;
662 }
663 if(user_taxids.size() > 0){
664 vector<TTaxId> common;
665 common.resize(taxids.size());
666 vector<TTaxId>::iterator itr = set_intersection(taxids.begin(), taxids.end(),
667 user_taxids.begin(), user_taxids.end(), common.begin());
668 common.resize(itr-common.begin());
669 if( common.size() == 0) {
670 continue;
671 }
672 taxids.clear();
673 taxids.insert(common.begin(), common.end());
674 }
675 if(neg_user_taxids.size() > 0) {
676 vector<TTaxId> difference;
677 difference.resize(taxids.size());
678 vector<TTaxId>::iterator itr = set_difference(taxids.begin(), taxids.end(),
679 neg_user_taxids.begin(), neg_user_taxids.end(), difference.begin());
680 difference.resize(itr-difference.begin());
681 if(difference.size() == 0){
682 continue;
683 }
684 taxids.clear();
685 taxids.insert(difference.begin(), difference.end());
686 }
687
688 lmdb_set.TaxIdsToOids(taxids, oids);
689 for(unsigned int i=0; i < oids.size(); i++) {
690 if(excluded_vols.size() != 0) {
691 if (s_IsOidInFilteredVol(oids[i], excluded_vols)) {
692 continue;
693 }
694 }
695 filter_bit.SetBit(oids[i]);
696 }
697 }
698 }
699
700 bool
x_ComputeFilters(const CSeqDBVolSet & volset,const CSeqDB_FilterTree & filters,const CSeqDBLMDBSet & lmdb_set,CSeqDB_BitSet & filter_bit,CRef<CSeqDBGiList> user_list,CRef<CSeqDBNegativeList> neg_user_list)701 CSeqDBOIDList::x_ComputeFilters(const CSeqDBVolSet & volset,
702 const CSeqDB_FilterTree & filters,
703 const CSeqDBLMDBSet & lmdb_set,
704 CSeqDB_BitSet & filter_bit,
705 CRef<CSeqDBGiList> user_list,
706 CRef<CSeqDBNegativeList> neg_user_list)
707 {
708 vector<string> seqid_fnames;
709 vector<string> taxid_fnames;
710 vector< vector<string> > seqid_fnames_vols;
711 vector< vector<string> > taxid_fnames_vols;
712
713 for(int i = 0; i < volset.GetNumVols(); i++) {
714 const CSeqDBVolEntry & vol = *(volset.GetVolEntry(i));
715 const string & vn = vol.Vol()->GetVolName();
716 CRef<CSeqDB_FilterTree> ft = filters.Specialize(vn);
717 ITERATE(CSeqDB_FilterTree::TFilters, itr, ft->GetFilters()){
718 if(((*itr)->GetType() == CSeqDB_AliasMask::eSiList) ||
719 ((*itr)->GetType() == CSeqDB_AliasMask::eTaxIdList)) {
720 string name = (*itr)->GetPath().GetPathS();
721 if((*itr)->GetType() == CSeqDB_AliasMask::eSiList) {
722 s_AddFilterFile(name, vn, seqid_fnames, seqid_fnames_vols);
723 }
724 else {
725 s_AddFilterFile(name, vn, taxid_fnames, taxid_fnames_vols);
726 }
727 filter_bit.AssignBitRange(vol.OIDStart(), vol.OIDEnd(), false);
728 }
729 }
730 }
731
732 if (seqid_fnames.size() > 0) {
733 s_ProcessSeqIdFilters(seqid_fnames, seqid_fnames_vols, user_list, neg_user_list,
734 lmdb_set, volset, filter_bit);
735 }
736 if (taxid_fnames.size() > 0) {
737 s_ProcessTaxIdFilters(taxid_fnames, taxid_fnames_vols, user_list, neg_user_list,
738 lmdb_set, volset, filter_bit);
739 }
740
741 return ((seqid_fnames.size() + taxid_fnames.size()) > 0 ? true:false);
742 }
743
744
745 END_NCBI_SCOPE
746
747