1 #ifndef OBJTOOLS_READERS_SEQDB__SEQDBBITSET_HPP
2 #define OBJTOOLS_READERS_SEQDB__SEQDBBITSET_HPP
3 
4 /*  $Id: seqdbbitset.hpp 599293 2019-12-23 18:45:56Z zaretska $
5  * ===========================================================================
6  *
7  *                            PUBLIC DOMAIN NOTICE
8  *               National Center for Biotechnology Information
9  *
10  *  This software/database is a "United States Government Work" under the
11  *  terms of the United States Copyright Act.  It was written as part of
12  *  the author's official duties as a United States Government employee and
13  *  thus cannot be copyrighted.  This software/database is freely available
14  *  to the public for use. The National Library of Medicine and the U.S.
15  *  Government have not placed any restriction on its use or reproduction.
16  *
17  *  Although all reasonable efforts have been taken to ensure the accuracy
18  *  and reliability of the software and data, the NLM and the U.S.
19  *  Government do not and cannot warrant the performance or results that
20  *  may be obtained by using this software or data. The NLM and the U.S.
21  *  Government disclaim all warranties, express or implied, including
22  *  warranties of performance, merchantability or fitness for any particular
23  *  purpose.
24  *
25  *  Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Author:  Kevin Bealer
30  *
31  */
32 
33 /// @file seqdbbitset.hpp
34 /// Implementation for the CSeqDB_BitSet class, a bit vector.
35 ///
36 /// Defines classes:
37 ///     CSeqDB_BitSet
38 ///
39 /// Implemented for: UNIX, MS-Windows
40 
41 #include <objtools/blast/seqdb_reader/impl/seqdbvol.hpp>
42 
43 BEGIN_NCBI_SCOPE
44 
45 /// Import definitions from the ncbi::objects namespace.
46 USING_SCOPE(objects);
47 
48 /// Bit set class.
49 class CSeqDB_BitSet : public CObject {
50     // Some of this code might be more efficient if a larger word size
51     // were used.  However, I think the difference would probably be
52     // tiny, and reading of OID mask file data on such platforms would
53     // then require special byte-order-aware algorithms.  Working at
54     // the byte level completely sidesteps all such issues and results
55     // in simpler, clearer code.
56 
57     /// Word size for vector elements.
58     typedef unsigned char TByte;
59 
60     /// Some useful constants related to word size.
61     enum {
62         /// Number of bits per stored word.
63         eWordBits = 8,
64 
65         /// Shift to convert from bit index to vector index.
66         eWordShift = 3,
67 
68         /// Mask to compute bit index within word.
69         eWordMask = eWordBits-1
70     };
71 
72 public:
73     /// Special edge cases (all-set and all-clear).
74     enum ESpecialCase {
75         eNone,    ///< Normal OID list.
76         eAllSet,  ///< All OIDs are set.
77         eAllClear ///< All OIDs are clear.
78     };
79 
80     /// Default constructor for zero-length empty bit array.
CSeqDB_BitSet()81     CSeqDB_BitSet()
82         : m_Start  (0),
83           m_End    (0),
84           m_Special(eNone)
85     {
86         _ASSERT(TByte(0) < TByte(-1)); // must be unsigned
87     }
88 
89     /// Constructor for bit array with start/end range.
90     ///
91     /// This constructs a bit array with the given start and end
92     /// points.  If the `sp' parameter is set to `eNone', a normal
93     /// bit array is constructed, with all bits set to `false'.  If sp
94     /// is `eAllSet' or `eAllClear', the array is not constructed, but
95     /// rather an object that acts as though the given range of OIDs
96     /// is all `true' or all `false' respectively.  These are termed
97     /// `special case' bit sets.  Special cases tend to be much more
98     /// efficient in terms of memory usage and in the efficiency of
99     /// boolean operations.
100     ///
101     /// @param start Starting OID for this bit set.
102     /// @param end Ending OID for this bit set.
103     /// @param sp Special case (homogeneous value) arrays.
CSeqDB_BitSet(size_t start,size_t end,ESpecialCase sp=eNone)104     CSeqDB_BitSet(size_t start, size_t end, ESpecialCase sp = eNone)
105         : m_Start  (start),
106           m_End    (end),
107           m_Special(sp)
108     {
109         _ASSERT(TByte(0) < TByte(-1)); // must be unsigned
110 
111         if (sp == eNone) {
112             x_Alloc(end-start);
113         }
114     }
115 
116     /// Constructor.
117     ///
118     /// This constructs a normal (eNone) bit array with the given
119     /// start and end points, and populates it from the byte data
120     /// in the memory found between addresses p1 and p2.  This method
121     /// is meant for reading data from OID mask files and follows the
122     /// format of those files, but the start pointer should be to the
123     /// start of the bit map data within the file, not to the start of
124     /// the file (the file contains a header that should be skipped).
125     ///
126     /// @param start The first OID represented by this data.
127     /// @param end The OID after the last OID represented by this data.
128     /// @param p1 A pointer to the beginning of the byte data.
129     /// @param p2 A pointer to past the end of the byte data.
130     CSeqDB_BitSet(size_t        start,
131                   size_t        end,
132                   const TByte * p1,
133                   const TByte * p2);
134 
135     /// Set the specified bit (to true).
136     /// @param index The index of the bit to set.
137     void SetBit(size_t index);
138 
139     /// Clear the specified bit (to false).
140     /// @param index The index of the bit to clear.
141     void ClearBit(size_t index);
142 
143     /// Store a value at the given index.
144     /// @param index The index of the bit to clear.
145     /// @param value The value to store in this bit.
146     void AssignBit(size_t i, bool value);
147 
148     /// Store the provided value in a range of bits.
149     /// @param start The index of the first bit to assign.
150     /// @param end The index after the last bit to assign.
151     /// @param value The value to store in this range.
152     void AssignBitRange(size_t start, size_t end, bool value);
153 
154     /// Get the value of the specified bit.
155     /// @param index The index of the bit to get.
156     bool GetBit(size_t index) const;
157 
158     /// Check if a bit is true or find the next bit that is.
159     ///
160     /// If the index points to a `false' value, the index is increased
161     /// until a `true' value is found (true is returned) or until the
162     /// index exceeds the OID range (false is returned).  If the index
163     /// initially points to a `true' bit, the index will not change.
164     ///
165     /// @param index The start index, and the returned index [in|out].
166     /// @return true if the index points to a true value.
167     bool CheckOrFindBit(size_t & index) const;
168 
169     /// Swap two bitsets.
170     ///
171     /// All fields of this bitset are swapped with those of `other'.
172     ///
173     /// @param other A bitset to swap values with.
174     void Swap(CSeqDB_BitSet & other);
175 
176     /// This bitset is assigned to the union of it and another.
177     ///
178     /// Each bit in this bitset will be `true' if it was true in
179     /// either this bitset or `other'.  The `consume' flag can be
180     /// specified as true if the value of the `other' bitset will not
181     /// be used after this operation.  Specifying `true' for consume
182     /// may change the data in the other bitset but can sometimes use
183     /// a more efficient algorithm.
184     ///
185     /// @param other The bitset to union with this bitset.
186     /// @param consume Specify true if the other bitset is expendable.
187     void UnionWith(CSeqDB_BitSet & other, bool consume);
188 
189     /// This bitset is assigned to the intersection of it and another.
190     ///
191     /// Each bit in this bitset will be `true' if it was true in both
192     /// this bitset and `other'.  The `consume' flag can be specified
193     /// as true if the value of the `other' bitset will not be used
194     /// after this operation.  Specifying `true' for consume may
195     /// change the data in the other bitset but can sometimes use a
196     /// more efficient algorithm.
197     ///
198     /// @param other The bitset to intersect with this bitset.
199     /// @param consume Specify true if the other bitset is expendable.
200     void IntersectWith(CSeqDB_BitSet & other, bool consume);
201 
202     /// If this is a special case bitset, convert it to a normal one.
203     ///
204     /// Operations on normal (`eNone') bitsets can be more expensive
205     /// in terms of memory and CPU time than on special case (eAllSet
206     /// and eAllClear) bitsets, but normal bitsets support operations
207     /// such as SetBit() and ClearBit() that special bitsets don't.
208     ///
209     /// This method checks if this bitset is a special case, and
210     /// converts it to a normal (`eNone') bitset if so.
211     void Normalize();
212 
213 private:
214     /// Set all bits that are true in `src'.
215     /// @param src The bitset to read from.
216     void x_CopyBits(const CSeqDB_BitSet & src);
217 
218     /// Set all bits in the given range that are true in `src'.
219     /// @param src The bitset to read from.
220     /// @param begin The start of the index range to read.
221     /// @param end The index past the end of the index range.
222     void x_CopyBits(const CSeqDB_BitSet & src, size_t start, size_t end);
223 
224     /// Set this bitset to the value of the provided one.
225     ///
226     /// This is like a normal "copy assignment" operation, except that
227     /// if `consume' is specified as true, a more efficient algorithm
228     /// may be used.  (Implementation: if `consume' is true, and the
229     /// source is a normal (eNone) bitset, this is a `swap'.)
230     ///
231     /// @param other The bitset to copy.
232     /// @param consume Specify true if the other bitset is expendable.
233     void x_Copy(CSeqDB_BitSet & other, bool consume);
234 
235     /// Replace `special' with normal bitsets, adjust the index range.
236     ///
237     /// If this bitset is special, it becomes a normal bitset.  If the
238     /// start or end point is outside of the current one, the bitset
239     /// expands (but does not contract).  All bits that are `true' in
240     /// the initial bitset will be true in the resulting bitset.
241     ///
242     /// @param start Move start point down (but not up) to here.
243     /// @param end Move end point up (but not down) to here.
244     void x_Normalize(size_t start, size_t end);
245 
246     /// Allocate memory for the bit data.
247     /// @param bits Allocate enough storage to hold this many bits.
x_Alloc(size_t bits)248     void x_Alloc(size_t bits)
249     {
250         m_Bits.resize((bits + eWordBits - 1) >> eWordShift);
251     }
252 
253     /// Prevent copy construction.
254     CSeqDB_BitSet(const CSeqDB_BitSet &);
255 
256     /// Prevent copy assignment.
257     CSeqDB_BitSet & operator=(const CSeqDB_BitSet &);
258 
259     /// Number of bits stored here.
260     size_t m_Start;
261 
262     /// Number of bits stored here.
263     size_t m_End;
264 
265     /// Special edge cases.
266     ESpecialCase m_Special;
267 
268     /// Representation of bit data.
269     vector<TByte> m_Bits;
270 
271 public:
272     /// Allows to dump a snapshot of the object
273     /// @todo this doesn't do anything for locality eRemote
274     void DebugDump(CDebugDumpContext ddc, unsigned int depth) const;
275 };
276 
277 END_NCBI_SCOPE
278 
279 #endif // OBJTOOLS_READERS_SEQDB__SEQDBBITSET_HPP
280 
281