1 #ifndef OBJTOOLS_READERS_SEQDB__SEQDBOIDLIST_HPP
2 #define OBJTOOLS_READERS_SEQDB__SEQDBOIDLIST_HPP
3
4 /* $Id: seqdboidlist.hpp 578711 2019-01-24 15:40:06Z fongah2 $
5 * ===========================================================================
6 *
7 * PUBLIC DOMAIN NOTICE
8 * National Center for Biotechnology Information
9 *
10 * This software/database is a "United States Government Work" under the
11 * terms of the United States Copyright Act. It was written as part of
12 * the author's official duties as a United States Government employee and
13 * thus cannot be copyrighted. This software/database is freely available
14 * to the public for use. The National Library of Medicine and the U.S.
15 * Government have not placed any restriction on its use or reproduction.
16 *
17 * Although all reasonable efforts have been taken to ensure the accuracy
18 * and reliability of the software and data, the NLM and the U.S.
19 * Government do not and cannot warrant the performance or results that
20 * may be obtained by using this software or data. The NLM and the U.S.
21 * Government disclaim all warranties, express or implied, including
22 * warranties of performance, merchantability or fitness for any particular
23 * purpose.
24 *
25 * Please cite the author in any work or product based on this material.
26 *
27 * ===========================================================================
28 *
29 * Author: Kevin Bealer
30 *
31 */
32
33 /// @file seqdboidlist.hpp
34 /// The SeqDB oid filtering layer.
35 ///
36 /// Defines classes:
37 /// CSeqDBOIDList
38 ///
39 /// Implemented for: UNIX, MS-Windows
40
41 #include <objtools/blast/seqdb_reader/seqdb.hpp>
42 #include <objtools/blast/seqdb_reader/impl/seqdbfile.hpp>
43 #include "seqdbvolset.hpp"
44 #include "seqdbfilter.hpp"
45 #include "seqdbgilistset.hpp"
46 #include "seqdbbitset.hpp"
47
48 BEGIN_NCBI_SCOPE
49
50 using namespace ncbi::objects;
51
52 /// CSeqDBOIDList
53 ///
54 /// This class defines a set of included oids over the entire oid
55 /// range. The underlying implementation is a large bit map. If the
56 /// database has one volume, which uses an OID mask file, this object
57 /// will memory map that file and use it directly. Otherwise, an area
58 /// of memory will be allocated (one bit per OID), and the relevant
59 /// bits will be turned on in that space. This information may come
60 /// from memory mapped oid lists, or it may come from GI lists, which
61 /// are converted to OIDs using ISAM indices. Because of these two
62 /// modes of operation, care must be taken to ensure that the
63 /// placement of the bits exactly corresponds to the layout of the
64 /// memory mappable oid mask files.
65
66 class CSeqDBOIDList : public CObject {
67 public:
68 /// A large enough type to span all OIDs.
69 typedef int TOID;
70
71 /// A type which spans possible file offsets.
72 typedef CSeqDBAtlas::TIndx TIndx;
73
74 /// Constructor.
75 ///
76 /// All processing to build the oid mask array is done in the
77 /// constructor. The volumes will be queried for information on
78 /// how many and what filter files to apply to each volume, and
79 /// these files will be used to build the oid bit array.
80 ///
81 /// @param atlas
82 /// The CSeqDBAtlas object.
83 /// @param volumes
84 /// The set of database volumes.
85 /// @param filters
86 /// The filtering to apply to the database volumes.
87 /// @param gi_list
88 /// The User GI List (if there is one).
89 /// @param neg_list
90 /// The Negative User GI List (if there is one).
91 /// @param locked
92 /// The lock holder object for this thread.
93 CSeqDBOIDList(CSeqDBAtlas & atlas,
94 const CSeqDBVolSet & volumes,
95 CSeqDB_FilterTree & filters,
96 CRef<CSeqDBGiList> & gi_list,
97 CRef<CSeqDBNegativeList> & neg_list,
98 CSeqDBLockHold & locked,
99 const CSeqDBLMDBSet & lmdb_set);
100
101 /// Destructor.
102 ///
103 /// All resources will be freed (returned to the atlas). This
104 /// class uses the atlas to get the memory it needs, so the space
105 /// for the oid bit array is counted toward the memory bound.
106 ~CSeqDBOIDList();
107
108 /// Find an included oid from the specified point.
109 ///
110 /// This call tests whether the specified oid is included in the
111 /// map. If it is, true is returned and the argument is not
112 /// modified. If it is not included, but a subsequent oid is, the
113 /// argument is adjusted to the next included oid, and true is
114 /// returned. If no oids exist from here to the end of the array,
115 /// false is returned.
116 ///
117 /// @param next_oid
118 /// The oid to check, and also the returned oid.
119 /// @return
120 /// True if an oid was found.
CheckOrFindOID(TOID & next_oid) const121 bool CheckOrFindOID(TOID & next_oid) const
122 {
123 size_t bit = next_oid;
124 bool found = m_AllBits->CheckOrFindBit(bit);
125
126 next_oid = bit;
127 _ASSERT(size_t(next_oid) == bit);
128
129 return found;
130 }
131
132 /// Deallocate the memory ranges owned by this object.
133 ///
134 /// This object may hold a lease on a file owned by the atlas. If
135 /// so, this method will release that memory. It should only be
136 /// called during destruction, since this class has no facilities
137 /// for reacquiring the memory lease.
UnLease()138 void UnLease()
139 {
140 m_Lease.Clear();
141 }
142
143 /// Dump debug information for this object
144 /// @sa CDebugDumpable
145 void DebugDump(CDebugDumpContext ddc, unsigned int depth) const;
146
147 private:
148 /// Shorthand type to clarify code that iterates over memory.
149 typedef const unsigned char TCUC;
150
151 /// Shorthand type to clarify code that iterates over memory.
152 typedef unsigned char TUC;
153
154 /// Check if a bit is set.
155 ///
156 /// Returns true if the specified oid is included.
157 ///
158 /// @param oid
159 /// The oid to check.
160 /// @return
161 /// true if the oid is included.
162 inline bool x_IsSet(TOID oid) const;
163
164 /// Build an oid mask in memory.
165 ///
166 /// This method allocates an oid bit array which spans the entire
167 /// oid range in use. It then maps all OID mask files and GI list
168 /// files. It copies the bit data from the oid mask files into
169 /// this array, translates all GI lists into OIDs and enables the
170 /// associated bits, and sets all bits to 1 for any "fully
171 /// included" volumes. This up-front work is intended to make
172 /// access to the data as fast as possible later on. In some
173 /// cases, this is not the most efficient way to do this. Faster
174 /// and more efficient storage methods are possible in cases where
175 /// very sparse GI lists are used. More efficient storage is
176 /// possible in cases where small masked databases are mixed with
177 /// large, "fully-in" volumes.
178 ///
179 /// @param volset
180 /// The set of volumes to build an oid mask for.
181 /// @param filters
182 /// The filtering to apply to the database volumes.
183 /// @param gi_list
184 /// Gi list object.
185 /// @param neg_list
186 /// Negative ID list object.
187 /// @param locked
188 /// The lock holder object for this thread.
189 void x_Setup(const CSeqDBVolSet & volset,
190 CSeqDB_FilterTree & filters,
191 CRef<CSeqDBGiList> & gi_list,
192 CRef<CSeqDBNegativeList> & neg_list,
193 CSeqDBLockHold & locked,
194 const CSeqDBLMDBSet & lmdb_set);
195
196 /// Clear all bits in a range.
197 ///
198 /// This method turns off all bits in the specified oid range. It
199 /// is used after alias file processing to turn off bit ranges
200 /// that are masked by a user specified GI list.
201 ///
202 /// @param oid_start
203 /// The volume's starting oid.
204 /// @param oid_end
205 /// The volume's ending oid.
206 void x_ClearBitRange(int oid_start, int oid_end);
207
208 /// Compute the oid mask bitset for a database volume.
209 ///
210 /// The filter tree will be specialized to this database volume and
211 /// the OID mask bitset for this volume will be computed.
212 ///
213 /// @param ft The filter tree for all volumes.
214 /// @param vol The volume entry object for this volume.
215 /// @param gis An object that manages the GI lists used here.
216 /// @param locked The lock holder object for this thread.
217 /// @return An OID bitset object.
218 CRef<CSeqDB_BitSet>
219 x_ComputeFilters(const CSeqDB_FilterTree & ft,
220 const CSeqDBVolEntry & vol,
221 CSeqDBGiListSet & gis,
222 CSeqDBLockHold & locked,
223 bool isBlastDBv5);
224
225 /// Load the named OID mask file into a bitset object.
226 ///
227 /// @param fn The filename from which to load the OID mask.
228 /// @param vol_start The first OID included in this volume.
229 /// @param vol_end The first OID after this volume.
230 /// @param locked The lock holder object for this thread.
231 /// @return An OID bitset object.
232 CRef<CSeqDB_BitSet>
233 x_GetOidMask(const CSeqDB_Path & fn,
234 int vol_start,
235 int vol_end);
236
237
238 /// Load an ID (GI or TI) list file into a bitset object.
239 ///
240 /// @param ids A set of included GIs or TIs.
241 /// @param vol_start The first OID included in this volume.
242 /// @param vol_end The first OID after this volume.
243 /// @return An OID bitset object.
244 CRef<CSeqDB_BitSet>
245 x_IdsToBitSet(const CSeqDBGiList & ids, int vol_start, int vol_end);
246
247 /// Apply a user GI list to a volume.
248 ///
249 /// This method applies a user-specified filter to the OID list.
250 /// Unlike x_ApplyFilter, which turns on the bits of the filter,
251 /// this method turns OFF the disincluded bits. It is therefore
252 /// an AND operation between the user filter and the (already
253 /// applied) alias file filters.
254 ///
255 /// @param gis
256 /// The user gi list to apply to the volumes.
257 /// @param locked
258 /// The lock holder object for this thread.
259 void x_ApplyUserGiList(CSeqDBGiList & gis);
260
261
262 /// Apply a negative user GI list to a volume.
263 ///
264 /// This method applies a user-specified filter to the OID list.
265 /// It serves the same purpose for negative GI lists that
266 /// x_ApplyUserGiList serves for positive GI lists. The operation
267 /// performed here is an AND operation between the the (already
268 /// applied) alias file filters and the negation of the user
269 /// filter.
270 ///
271 /// @param neg
272 /// The negative user gi list to apply to the volumes.
273 /// @param is_v5
274 /// True if db is v5
275 void x_ApplyNegativeList(CSeqDBNegativeList & neg, bool is_v5);
276
277 bool x_ComputeFilters(const CSeqDBVolSet & volset,
278 const CSeqDB_FilterTree & filters,
279 const CSeqDBLMDBSet & lmdb_set,
280 CSeqDB_BitSet & filter_bit,
281 CRef<CSeqDBGiList> user_list,
282 CRef<CSeqDBNegativeList> neg_user_list);
283
284 /// The memory management layer object.
285 CSeqDBAtlas & m_Atlas;
286
287 /// A memory lease which holds the mask file (if only one is used).
288 CSeqDBFileMemMap m_Lease;
289
290 /// The total number of OIDs represented in the bit set.
291 int m_NumOIDs;
292
293 /// An OID bit set covering all volumes.
294 CRef<CSeqDB_BitSet> m_AllBits;
295 };
296
297 inline bool
x_IsSet(TOID oid) const298 CSeqDBOIDList::x_IsSet(TOID oid) const
299 {
300 _ASSERT(m_AllBits.NotEmpty());
301 return (oid < m_NumOIDs) && m_AllBits->GetBit(oid);
302 }
303
304 END_NCBI_SCOPE
305
306 #endif // OBJTOOLS_READERS_SEQDB__SEQDBOIDLIST_HPP
307
308