1 #ifndef OBJTOOLS_READERS_SEQDB__SEQDBOIDLIST_HPP
2 #define OBJTOOLS_READERS_SEQDB__SEQDBOIDLIST_HPP
3 
4 /*  $Id: seqdboidlist.hpp 578711 2019-01-24 15:40:06Z fongah2 $
5  * ===========================================================================
6  *
7  *                            PUBLIC DOMAIN NOTICE
8  *               National Center for Biotechnology Information
9  *
10  *  This software/database is a "United States Government Work" under the
11  *  terms of the United States Copyright Act.  It was written as part of
12  *  the author's official duties as a United States Government employee and
13  *  thus cannot be copyrighted.  This software/database is freely available
14  *  to the public for use. The National Library of Medicine and the U.S.
15  *  Government have not placed any restriction on its use or reproduction.
16  *
17  *  Although all reasonable efforts have been taken to ensure the accuracy
18  *  and reliability of the software and data, the NLM and the U.S.
19  *  Government do not and cannot warrant the performance or results that
20  *  may be obtained by using this software or data. The NLM and the U.S.
21  *  Government disclaim all warranties, express or implied, including
22  *  warranties of performance, merchantability or fitness for any particular
23  *  purpose.
24  *
25  *  Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Author:  Kevin Bealer
30  *
31  */
32 
33 /// @file seqdboidlist.hpp
34 /// The SeqDB oid filtering layer.
35 ///
36 /// Defines classes:
37 ///     CSeqDBOIDList
38 ///
39 /// Implemented for: UNIX, MS-Windows
40 
41 #include <objtools/blast/seqdb_reader/seqdb.hpp>
42 #include <objtools/blast/seqdb_reader/impl/seqdbfile.hpp>
43 #include "seqdbvolset.hpp"
44 #include "seqdbfilter.hpp"
45 #include "seqdbgilistset.hpp"
46 #include "seqdbbitset.hpp"
47 
48 BEGIN_NCBI_SCOPE
49 
50 using namespace ncbi::objects;
51 
/// CSeqDBOIDList
///
/// This class defines a set of included oids over the entire oid
/// range.  The underlying implementation is a large bit map.  If the
/// database has one volume, which uses an OID mask file, this object
/// will memory map that file and use it directly.  Otherwise, an area
/// of memory will be allocated (one bit per OID), and the relevant
/// bits will be turned on in that space.  This information may come
/// from memory mapped oid lists, or it may come from GI lists, which
/// are converted to OIDs using ISAM indices.  Because of these two
/// modes of operation, care must be taken to ensure that the
/// placement of the bits exactly corresponds to the layout of the
/// memory mappable oid mask files.
65 
66 class CSeqDBOIDList : public CObject {
67 public:
68     /// A large enough type to span all OIDs.
69     typedef int TOID;
70 
71     /// A type which spans possible file offsets.
72     typedef CSeqDBAtlas::TIndx TIndx;
73 
74     /// Constructor.
75     ///
76     /// All processing to build the oid mask array is done in the
77     /// constructor.  The volumes will be queried for information on
78     /// how many and what filter files to apply to each volume, and
79     /// these files will be used to build the oid bit array.
80     ///
81     /// @param atlas
82     ///   The CSeqDBAtlas object.
83     /// @param volumes
84     ///   The set of database volumes.
85     /// @param filters
86     ///   The filtering to apply to the database volumes.
87     /// @param gi_list
88     ///   The User GI List (if there is one).
89     /// @param neg_list
90     ///   The Negative User GI List (if there is one).
91     /// @param locked
92     ///   The lock holder object for this thread.
93     CSeqDBOIDList(CSeqDBAtlas              & atlas,
94                   const CSeqDBVolSet       & volumes,
95                   CSeqDB_FilterTree        & filters,
96                   CRef<CSeqDBGiList>       & gi_list,
97                   CRef<CSeqDBNegativeList> & neg_list,
98                   CSeqDBLockHold           & locked,
99                   const CSeqDBLMDBSet	   & lmdb_set);
100 
101     /// Destructor.
102     ///
103     /// All resources will be freed (returned to the atlas).  This
104     /// class uses the atlas to get the memory it needs, so the space
105     /// for the oid bit array is counted toward the memory bound.
106     ~CSeqDBOIDList();
107 
108     /// Find an included oid from the specified point.
109     ///
110     /// This call tests whether the specified oid is included in the
111     /// map.  If it is, true is returned and the argument is not
112     /// modified.  If it is not included, but a subsequent oid is, the
113     /// argument is adjusted to the next included oid, and true is
114     /// returned.  If no oids exist from here to the end of the array,
115     /// false is returned.
116     ///
117     /// @param next_oid
118     ///   The oid to check, and also the returned oid.
119     /// @return
120     ///   True if an oid was found.
CheckOrFindOID(TOID & next_oid) const121     bool CheckOrFindOID(TOID & next_oid) const
122     {
123         size_t bit = next_oid;
124         bool found = m_AllBits->CheckOrFindBit(bit);
125 
126         next_oid = bit;
127         _ASSERT(size_t(next_oid) == bit);
128 
129         return found;
130     }
131 
132     /// Deallocate the memory ranges owned by this object.
133     ///
134     /// This object may hold a lease on a file owned by the atlas.  If
135     /// so, this method will release that memory.  It should only be
136     /// called during destruction, since this class has no facilities
137     /// for reacquiring the memory lease.
UnLease()138     void UnLease()
139     {
140         m_Lease.Clear();
141     }
142 
143     /// Dump debug information for this object
144     /// @sa CDebugDumpable
145     void DebugDump(CDebugDumpContext ddc, unsigned int depth) const;
146 
147 private:
148     /// Shorthand type to clarify code that iterates over memory.
149     typedef const unsigned char TCUC;
150 
151     /// Shorthand type to clarify code that iterates over memory.
152     typedef unsigned char TUC;
153 
154     /// Check if a bit is set.
155     ///
156     /// Returns true if the specified oid is included.
157     ///
158     /// @param oid
159     ///   The oid to check.
160     /// @return
161     ///   true if the oid is included.
162     inline bool x_IsSet(TOID oid) const;
163 
164     /// Build an oid mask in memory.
165     ///
166     /// This method allocates an oid bit array which spans the entire
167     /// oid range in use.  It then maps all OID mask files and GI list
168     /// files.  It copies the bit data from the oid mask files into
169     /// this array, translates all GI lists into OIDs and enables the
170     /// associated bits, and sets all bits to 1 for any "fully
171     /// included" volumes.  This up-front work is intended to make
172     /// access to the data as fast as possible later on.  In some
173     /// cases, this is not the most efficient way to do this.  Faster
174     /// and more efficient storage methods are possible in cases where
175     /// very sparse GI lists are used.  More efficient storage is
176     /// possible in cases where small masked databases are mixed with
177     /// large, "fully-in" volumes.
178     ///
179     /// @param volset
180     ///   The set of volumes to build an oid mask for.
181     /// @param filters
182     ///   The filtering to apply to the database volumes.
183     /// @param gi_list
184     ///   Gi list object.
185     /// @param neg_list
186     ///   Negative ID list object.
187     /// @param locked
188     ///   The lock holder object for this thread.
189     void x_Setup(const CSeqDBVolSet       & volset,
190                  CSeqDB_FilterTree        & filters,
191                  CRef<CSeqDBGiList>       & gi_list,
192                  CRef<CSeqDBNegativeList> & neg_list,
193                  CSeqDBLockHold           & locked,
194                  const CSeqDBLMDBSet	   & lmdb_set);
195 
196     /// Clear all bits in a range.
197     ///
198     /// This method turns off all bits in the specified oid range.  It
199     /// is used after alias file processing to turn off bit ranges
200     /// that are masked by a user specified GI list.
201     ///
202     /// @param oid_start
203     ///   The volume's starting oid.
204     /// @param oid_end
205     ///   The volume's ending oid.
206     void x_ClearBitRange(int oid_start, int oid_end);
207 
208     /// Compute the oid mask bitset for a database volume.
209     ///
210     /// The filter tree will be specialized to this database volume and
211     /// the OID mask bitset for this volume will be computed.
212     ///
213     /// @param ft The filter tree for all volumes.
214     /// @param vol The volume entry object for this volume.
215     /// @param gis An object that manages the GI lists used here.
216     /// @param locked The lock holder object for this thread.
217     /// @return An OID bitset object.
218     CRef<CSeqDB_BitSet>
219     x_ComputeFilters(const CSeqDB_FilterTree & ft,
220                      const CSeqDBVolEntry    & vol,
221                      CSeqDBGiListSet         & gis,
222                      CSeqDBLockHold          & locked,
223                      bool					 isBlastDBv5);
224 
225     /// Load the named OID mask file into a bitset object.
226     ///
227     /// @param fn The filename from which to load the OID mask.
228     /// @param vol_start The first OID included in this volume.
229     /// @param vol_end The first OID after this volume.
230     /// @param locked The lock holder object for this thread.
231     /// @return An OID bitset object.
232     CRef<CSeqDB_BitSet>
233     x_GetOidMask(const CSeqDB_Path & fn,
234                  int                 vol_start,
235                  int                 vol_end);
236 
237 
238     /// Load an ID (GI or TI) list file into a bitset object.
239     ///
240     /// @param ids A set of included GIs or TIs.
241     /// @param vol_start The first OID included in this volume.
242     /// @param vol_end The first OID after this volume.
243     /// @return An OID bitset object.
244     CRef<CSeqDB_BitSet>
245     x_IdsToBitSet(const CSeqDBGiList & ids, int vol_start, int vol_end);
246 
247     /// Apply a user GI list to a volume.
248     ///
249     /// This method applies a user-specified filter to the OID list.
250     /// Unlike x_ApplyFilter, which turns on the bits of the filter,
251     /// this method turns OFF the disincluded bits.  It is therefore
252     /// an AND operation between the user filter and the (already
253     /// applied) alias file filters.
254     ///
255     /// @param gis
256     ///   The user gi list to apply to the volumes.
257     /// @param locked
258     ///   The lock holder object for this thread.
259     void x_ApplyUserGiList(CSeqDBGiList   & gis);
260 
261 
262     /// Apply a negative user GI list to a volume.
263     ///
264     /// This method applies a user-specified filter to the OID list.
265     /// It serves the same purpose for negative GI lists that
266     /// x_ApplyUserGiList serves for positive GI lists.  The operation
267     /// performed here is an AND operation between the the (already
268     /// applied) alias file filters and the negation of the user
269     /// filter.
270     ///
271     /// @param neg
272     ///   The negative user gi list to apply to the volumes.
273     /// @param is_v5
274     ///   True if db is v5
275     void x_ApplyNegativeList(CSeqDBNegativeList & neg, bool is_v5);
276 
277     bool x_ComputeFilters(const CSeqDBVolSet       & volset,
278     					  const CSeqDB_FilterTree  & filters,
279                		      const CSeqDBLMDBSet      & lmdb_set,
280                		      CSeqDB_BitSet 		   & filter_bit,
281                		      CRef<CSeqDBGiList>	     user_list,
282                           CRef<CSeqDBNegativeList>   neg_user_list);
283 
284     /// The memory management layer object.
285     CSeqDBAtlas & m_Atlas;
286 
287     /// A memory lease which holds the mask file (if only one is used).
288     CSeqDBFileMemMap m_Lease;
289 
290     /// The total number of OIDs represented in the bit set.
291     int m_NumOIDs;
292 
293     /// An OID bit set covering all volumes.
294     CRef<CSeqDB_BitSet> m_AllBits;
295 };
296 
297 inline bool
x_IsSet(TOID oid) const298 CSeqDBOIDList::x_IsSet(TOID oid) const
299 {
300     _ASSERT(m_AllBits.NotEmpty());
301     return (oid < m_NumOIDs) && m_AllBits->GetBit(oid);
302 }
303 
304 END_NCBI_SCOPE
305 
306 #endif // OBJTOOLS_READERS_SEQDB__SEQDBOIDLIST_HPP
307 
308