1 #ifndef OBJTOOLS_READERS_SEQDB__SEQDBVOLSET_HPP
2 #define OBJTOOLS_READERS_SEQDB__SEQDBVOLSET_HPP
3 
4 /*  $Id: seqdbvolset.hpp 538739 2017-06-13 18:26:55Z rackerst $
5  * ===========================================================================
6  *
7  *                            PUBLIC DOMAIN NOTICE
8  *               National Center for Biotechnology Information
9  *
10  *  This software/database is a "United States Government Work" under the
11  *  terms of the United States Copyright Act.  It was written as part of
12  *  the author's official duties as a United States Government employee and
13  *  thus cannot be copyrighted.  This software/database is freely available
14  *  to the public for use. The National Library of Medicine and the U.S.
15  *  Government have not placed any restriction on its use or reproduction.
16  *
17  *  Although all reasonable efforts have been taken to ensure the accuracy
18  *  and reliability of the software and data, the NLM and the U.S.
19  *  Government do not and cannot warrant the performance or results that
20  *  may be obtained by using this software or data. The NLM and the U.S.
21  *  Government disclaim all warranties, express or implied, including
22  *  warranties of performance, merchantability or fitness for any particular
23  *  purpose.
24  *
25  *  Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Author:  Kevin Bealer
30  *
31  */
32 
33 /// @file seqdbvolset.hpp
34 /// Manages a set of database volumes.
35 ///
36 /// Defines classes:
37 ///     CSeqDBVolSet
///     CSeqDBVolEntry
39 ///
40 /// Implemented for: UNIX, MS-Windows
41 
42 #include <objtools/blast/seqdb_reader/impl/seqdbvol.hpp>
43 #include "seqdbfilter.hpp"
44 #include <algo/blast/core/ncbi_std.h>
45 
46 BEGIN_NCBI_SCOPE
47 
48 /// Import definitions from the ncbi::objects namespace.
49 USING_SCOPE(objects);
50 
51 /// CSeqDBVolEntry
52 ///
53 /// This class controls access to the CSeqDBVol class.  It contains
54 /// data that is not relevant to the internal operation of a volume,
55 /// but is associated with that volume for operations over the volume
56 /// set as a whole, such as the starting OID of the volume and masking
57 /// information (GI and OID lists).
58 
59 class CSeqDBVolEntry {
60 public:
61     /// Constructor
62     ///
63     /// This creates a object containing the specified volume object
64     /// pointer.  Although this object owns the pointer, it uses a
65     /// vector, so it does not keep an auto pointer or CRef<>.
66     /// Instead, the destructor of the CSeqDBVolSet class deletes the
67     /// volumes by calling Free() in a destructor.  Using indirect
68     /// pointers (CRef<> for example) would require slightly more
69     /// cycles in several performance critical paths.
70     ///
71     /// @param new_vol
72     ///   A pointer to a volume.
CSeqDBVolEntry(CSeqDBVol * new_vol)73     CSeqDBVolEntry(CSeqDBVol * new_vol)
74         : m_Vol        (new_vol),
75           m_OIDStart   (0),
76           m_OIDEnd     (0)
77     {
78     }
79 
80     /// Free the volume object
81     ///
82     /// The associated volume object is deleted.
Free()83     void Free()
84     {
85         if (m_Vol) {
86             delete m_Vol;
87             m_Vol = 0;
88         }
89     }
90 
91     /// Set the OID range
92     ///
93     /// The volume is queried for the number of OIDs it contains, and
94     /// the starting and ending OIDs are set.
95     ///
96     /// @param start The first OID in the range.
SetStartAndEnd(int start)97     void SetStartAndEnd(int start)
98     {
99         m_OIDStart = start;
100         m_OIDEnd   = start + m_Vol->GetNumOIDs();
101     }
102 
103     /// Get the starting OID in this volume's range.
104     ///
105     /// This returns the first OID in this volume's OID range.
106     ///
107     /// @return The starting OID of the range
OIDStart() const108     int OIDStart() const
109     {
110         return m_OIDStart;
111     }
112 
113     /// Get the ending OID in this volume's range.
114     ///
115     /// This returns the first OID past the end of this volume's OID
116     /// range.
117     ///
118     /// @return
119     ///   The ending OID of the range
OIDEnd() const120     int OIDEnd() const
121     {
122         return m_OIDEnd;
123     }
124 
125     /// Get a pointer to the underlying volume object.
Vol()126     CSeqDBVol * Vol()
127     {
128         return m_Vol;
129     }
130 
131     /// Get a const pointer to the underlying volume object.
Vol() const132     const CSeqDBVol * Vol() const
133     {
134         return m_Vol;
135     }
136 
137 private:
138     /// The underlying volume object
139     CSeqDBVol     * m_Vol;
140 
141     /// The start of the OID range.
142     int             m_OIDStart;
143 
144     /// The end of the OID range.
145     int             m_OIDEnd;
146 };
147 
148 
149 /// CSeqDBVolSet
150 ///
151 /// This class stores a set of CSeqDBVol objects and defines an
152 /// interface to control usage of them.  Several methods are provided
153 /// to create the set of volumes, or to get the required volumes by
154 /// different criteria.  Also, certain methods perform operations over
155 /// the set of volumes.  The CSeqDBVolEntry class, defined internally
156 /// to this one, provides some of this abstraction.
157 class CSeqDBVolSet {
158 public:
159     /// Standard Constructor
160     ///
161     /// An object of this class will be constructed after the alias
162     /// files have been read, and the volume names will come from that
163     /// processing step.  All of the specified volumes will be opened
164     /// and the metadata will be verified during construction.
165     ///
166     /// @param atlas
167     ///   The memory management object to use.
168     /// @param vol_names
169     ///   The names of the volumes this object will manage.
170     /// @param prot_nucl
171     ///   Whether these are protein or nucleotide sequences.
172     /// @param user_list
173     ///   If specified, will be used to include deflines by GI or TI.
174     /// @param neg_list
175     ///   If specified, will be used to exclude deflines by GI or TI.
176     CSeqDBVolSet(CSeqDBAtlas          & atlas,
177                  const vector<string> & vol_names,
178                  char                   prot_nucl,
179                  CSeqDBGiList         * user_list,
180                  CSeqDBNegativeList   * neg_list);
181 
182     /// Default Constructor
183     ///
184     /// An empty volume set will be created; this is in support of the
185     /// CSeqDBExpert class's default constructor.
186     CSeqDBVolSet();
187 
188     /// Destructor
189     ///
190     /// The destructor will release all resources still held, but some
191     /// of the resources will probably already be cleaned up via a
192     /// call to the UnLease method.
193     ~CSeqDBVolSet();
194 
195     /// Find a volume by OID.
196     ///
197     /// Many of the CSeqDB methods identify which sequence to use by
198     /// OID.  That OID applies to all sequences in all volumes of the
199     /// opened database(s).  This method is used to find the volume
200     /// (if any) that contains this OID, and to return both a pointer
201     /// to that volume and the OID within that volume that corresponds
202     /// to the global input OID.
203     ///
204     /// @param oid
205     ///   The global OID to search for.
206     /// @param vol_oid
207     ///   The returned OID within the relevant volume.
208     /// @return
209     ///   A pointer to the volume containing the oid, or NULL.
FindVol(int oid,int & vol_oid) const210     CSeqDBVol * FindVol(int oid, int & vol_oid) const
211     {
212         // The 'const' usage here should be cleaned up, i.e. const
213         // should be removed from most of SeqDB's methods.  Since the
214         // atlas often remaps the actual file data due to seemingly
215         // read-only user requests, there are very few parts of this
216         // code that can really be considered const.  "Conceptual"
217         // const is not worth the trouble, particularly for internal
218         // methods.
219 
220         // A good technique would be to remove all or nearly all of
221         // the 'mutable' keywords, then remove the word 'const' from
222         // almost everything the compiler complains about.
223 
224         int vol_idx(0);
225         return const_cast<CSeqDBVol*>(FindVol(oid, vol_oid, vol_idx));
226     }
227 
228     /// Find a volume by OID.
229     ///
230     /// Many of the CSeqDB methods identify which sequence to use by
231     /// OID.  That OID applies to all sequences in all volumes of the
232     /// opened database(s).  This method is used to find the volume
233     /// (if any) that contains this OID, and to return a pointer to
234     /// that volume, the OID within that volume that corresponds to
235     /// the global input OID, and the volume index.
236     ///
237     /// @param oid
238     ///   The global OID to search for.
239     /// @param vol_oid
240     ///   The returned OID within the relevant volume.
241     /// @param vol_idx
242     ///   The returned index of the relevant volume.
243     /// @return
244     ///   A pointer to the volume containing the oid, or NULL.
FindVol(int oid,int & vol_oid,int & vol_idx) const245     const CSeqDBVol * FindVol(int oid, int & vol_oid, int & vol_idx) const
246     {
247         int rec_indx = m_RecentVol;
248 
249         if (rec_indx < (int) m_VolList.size()) {
250             const CSeqDBVolEntry & rvol = m_VolList[rec_indx];
251 
252             if ((rvol.OIDStart() <= oid) &&
253                 (rvol.OIDEnd()   >  oid)) {
254 
255                 vol_oid = oid - rvol.OIDStart();
256                 vol_idx = rec_indx;
257 
258                 return rvol.Vol();
259             }
260         }
261 
262         for(int index = 0; index < (int) m_VolList.size(); index++) {
263             if ((m_VolList[index].OIDStart() <= oid) &&
264                 (m_VolList[index].OIDEnd()   >  oid)) {
265 
266                 m_RecentVol = index;
267 
268                 vol_oid = oid - m_VolList[index].OIDStart();
269                 vol_idx = index;
270 
271                 return m_VolList[index].Vol();
272             }
273         }
274 
275         return NULL;
276     }
277 
278     /// Find a volume by OID.
279     ///
280     /// Many of the CSeqDB methods identify which sequence to use by
281     /// OID.  That OID applies to all sequences in all volumes of the
282     /// opened database(s).  This method is used to find the volume
283     /// (if any) that contains this OID, and to return both a pointer
284     /// to that volume and the OID within that volume that corresponds
285     /// to the global input OID.
286     ///
287     /// @param oid
288     ///   The global OID to search for.
289     /// @param vol_oid
290     ///   The returned OID within the relevant volume.
291     /// @return
292     ///   A pointer to the volume containing the oid, or NULL.
FindVol(int oid,int & vol_oid)293     CSeqDBVol * FindVol(int oid, int & vol_oid)
294     {
295         int rec_indx = m_RecentVol;
296 
297         if (rec_indx < (int) m_VolList.size()) {
298             CSeqDBVolEntry & rvol = m_VolList[rec_indx];
299 
300             if ((rvol.OIDStart() <= oid) &&
301                 (rvol.OIDEnd()   >  oid)) {
302 
303                 vol_oid = oid - rvol.OIDStart();
304 
305                 return rvol.Vol();
306             }
307         }
308 
309         for(int index = 0; index < (int) m_VolList.size(); index++) {
310             if ((m_VolList[index].OIDStart() <= oid) &&
311                 (m_VolList[index].OIDEnd()   >  oid)) {
312 
313                 m_RecentVol = index;
314 
315                 vol_oid = oid - m_VolList[index].OIDStart();
316 
317                 return m_VolList[index].Vol();
318             }
319         }
320 
321         return 0;
322     }
323 
324     /// Find a volume by index.
325     ///
326     /// This method returns a volume by index, so that 0 is the first
327     /// volume, and N-1 is the last volume of a set of N.
328     ///
329     /// @param i
330     ///   The index of the volume to return.
331     /// @return
332     ///   A pointer to the indicated volume, or NULL.
GetVol(int i) const333     const CSeqDBVol * GetVol(int i) const
334     {
335         if (m_VolList.empty()) {
336             return 0;
337         }
338 
339         if (i >= (int) m_VolList.size()) {
340             return 0;
341         }
342 
343         m_RecentVol = i;
344 
345         return m_VolList[i].Vol();
346     }
347 
348     /// Find a volume by index.
349     ///
350     /// This method returns a volume by index, so that 0 is the first
351     /// volume, and N-1 is the last volume of a set of N.
352     ///
353     /// @param i
354     ///   The index of the volume to return.
355     /// @return
356     ///   A pointer to the indicated volume, or NULL.
GetVolNonConst(int i)357     CSeqDBVol * GetVolNonConst(int i)
358     {
359         if (m_VolList.empty()) {
360             return 0;
361         }
362 
363         if (i >= (int) m_VolList.size()) {
364             return 0;
365         }
366 
367         m_RecentVol = i;
368 
369         return m_VolList[i].Vol();
370     }
371 
372     /// Find a volume entry by index.
373     ///
374     /// This method returns a CSeqDBVolEntry by index, so that 0 is
375     /// the first volume, and N-1 is the last volume of a set of N.
376     ///
377     /// @param i
378     ///   The index of the volume entry to return.
379     /// @return
380     ///   A pointer to the indicated volume entry, or NULL.
GetVolEntry(int i) const381     const CSeqDBVolEntry * GetVolEntry(int i) const
382     {
383         if (m_VolList.empty()) {
384             return 0;
385         }
386 
387         if (i >= (int) m_VolList.size()) {
388             return 0;
389         }
390 
391         m_RecentVol = i;
392 
393         return & m_VolList[i];
394     }
395 
396     /// Find a volume by name.
397     ///
398     /// Each volume has a name, which is the name of the component
399     /// files (.pin, .psq, etc), without the file extension.  This
400     /// method returns a const pointer to the volume matching the
401     /// specified name.
402     ///
403     /// @param volname
404     ///   The name of the volume to search for.
405     /// @return
406     ///   A pointer to the volume matching the specified name, or NULL.
GetVol(const string & volname) const407     const CSeqDBVol * GetVol(const string & volname) const
408     {
409         if (const CSeqDBVolEntry * v = x_FindVolName(volname)) {
410             return v->Vol();
411         }
412         return 0;
413     }
414 
415     /// Find a volume by name (non-const version).
416     ///
417     /// Each volume has a name, which is the name of the component
418     /// files (.pin, .psq, etc), without the file extension.  This
419     /// method returns a non-const pointer to the volume matching the
420     /// specified name.
421     ///
422     /// @param volname
423     ///   The name of the volume to search for.
424     /// @return
425     ///   A pointer to the volume matching the specified name, or NULL.
GetVol(const string & volname)426     CSeqDBVol * GetVol(const string & volname)
427     {
428         if (CSeqDBVolEntry * v = x_FindVolName(volname)) {
429             return v->Vol();
430         }
431         return 0;
432     }
433 
434     /// Get the number of volumes
435     ///
436     /// This returns the number of volumes available from this set.
437     /// It would be needed, for example, in order to iterate over all
438     /// volumes with the GetVol(int) method.
439     /// @return
440     ///   The number of volumes available from this set.
GetNumVols() const441     int GetNumVols() const
442     {
443         return (int)m_VolList.size();
444     }
445 
446     /// Get the size of the OID range.
447     ///
448     /// This method returns the total size of the combined (global)
449     /// OID range of this database.
450     ///
451     /// @return
452     ///   The number of OIDs.
GetNumOIDs() const453     int GetNumOIDs() const
454     {
455         return x_GetNumOIDs();
456     }
457 
458     /// Return storage held by the volumes
459     ///
460     /// This method returns any storage held by CSeqDBMemLease objects
461     /// which are part of this set of volumes.  The memory leases will
462     /// be reacquired by the volumes if the data is requested again.
UnLease()463     void UnLease()
464     {
465         for(int index = 0; index < (int) m_VolList.size(); index++) {
466             m_VolList[index].Vol()->UnLease();
467         }
468     }
469 
470     /// Get the first OID in a volume.
471     ///
472     /// Each volume is considered to span a range of OIDs.  This
473     /// method returns the first OID in the OID range of the indicated
474     /// volume.  The returned OID may not be included (ie. it may be
475     /// turned off via a filtering mechanism).
476     ///
477     /// @param i
478     ///   The index of the volume.
GetVolOIDStart(int i) const479     int GetVolOIDStart(int i) const
480     {
481         if (m_VolList.empty()) {
482             return 0;
483         }
484 
485         if (i >= (int) m_VolList.size()) {
486             return 0;
487         }
488 
489         m_RecentVol = i;
490 
491         return m_VolList[i].OIDStart();
492     }
493 
494     /// Find total volume length for all volumes
495     ///
496     /// Each volume in the set has an internally stored length, which
497     /// indicates the length (in nucleotides/residues/bases) of all of
498     /// the sequences in the volume.  This returns the total of these
499     /// lengths.
500     ///
501     /// @return
502     ///   The sum of the lengths of all volumes.
GetVolumeSetLength() const503     Uint8 GetVolumeSetLength() const
504     {
505         Uint8 vol_total = 0;
506 
507         for(int index = 0; index < (int) m_VolList.size(); index++) {
508             vol_total += m_VolList[index].Vol()->GetVolumeLength();
509         }
510 
511         return vol_total;
512     }
513 
GetMaxLength() const514     int GetMaxLength() const
515     {
516         int max_len = 0;
517 
518         for(int index = 0; index < (int) m_VolList.size(); index++) {
519             max_len = max( max_len, m_VolList[index].Vol()->GetMaxLength());
520         }
521 
522         return max_len;
523     }
524 
GetMinLength() const525     int GetMinLength() const
526     {
527         int min_len = INT4_MAX;
528 
529         for(int index = 0; index < (int) m_VolList.size(); index++) {
530             min_len = min( min_len, m_VolList[index].Vol()->GetMinLength());
531         }
532 
533         return min_len;
534     }
535 
536     /// Optimize the GI list configuration.
537     ///
538     /// This tells the volumes to examine and optimize their GI list
539     /// configuration.  It should not be called until all GI lists
540     /// have been added to the volumes (by alias file processing).
OptimizeGiLists()541     void OptimizeGiLists()
542     {
543         for(int i = 0; i< (int) m_VolList.size(); i++) {
544             m_VolList[i].Vol()->OptimizeGiLists();
545         }
546     }
547 
548 private:
549     /// Private constructor to prevent copy operation.
550     CSeqDBVolSet(const CSeqDBVolSet &);
551 
552     /// Private operator to prevent assignment.
553     CSeqDBVolSet & operator=(const CSeqDBVolSet &);
554 
555     /// Get the size of the entire OID range.
x_GetNumOIDs() const556     int x_GetNumOIDs() const
557     {
558         if (m_VolList.empty())
559             return 0;
560 
561         return m_VolList.back().OIDEnd();
562     }
563 
564     /// Add a volume
565     ///
566     /// This method adds a volume to the set.
567     ///
568     /// @param atlas
569     ///   The memory management layer object.
570     /// @param nm
571     ///   The name of the volume.
572     /// @param pn
573     ///   The sequence type.
574     /// @param user_list
575     ///   If specified, will be used to include deflines by ID.
576     /// @param neg_list
577     ///   If specified, will be used to exclude deflines by ID.
578     /// @param locked
579     ///   The lock holder object for this thread.
580     void x_AddVolume(CSeqDBAtlas        & atlas,
581                      const string       & nm,
582                      char                 pn,
583                      CSeqDBGiList       * user_list,
584                      CSeqDBNegativeList * neg_list,
585                      CSeqDBLockHold     & locked);
586 
587     /// Find a volume by name
588     ///
589     /// This returns the CSeqDBVolEntry object for the volume matching
590     /// the specified name.
591     ///
592     /// @param volname
593     ///   The name of the volume.
594     /// @return
595     ///   A const pointer to the CSeqDBVolEntry object, or NULL.
x_FindVolName(const string & volname) const596     const CSeqDBVolEntry * x_FindVolName(const string & volname) const
597     {
598         for(int i = 0; i< (int) m_VolList.size(); i++) {
599             if (volname == m_VolList[i].Vol()->GetVolName()) {
600                 return & m_VolList[i];
601             }
602         }
603 
604         return 0;
605     }
606 
607     /// Find a volume by name
608     ///
609     /// This returns the CSeqDBVolEntry object for the volume matching
610     /// the specified name (non const version).
611     ///
612     /// @param volname
613     ///   The name of the volume.
614     /// @return
615     ///   A non-const pointer to the CSeqDBVolEntry object, or NULL.
x_FindVolName(const string & volname)616     CSeqDBVolEntry * x_FindVolName(const string & volname)
617     {
618         for(int i = 0; i < (int) m_VolList.size(); i++) {
619             if (volname == m_VolList[i].Vol()->GetVolName()) {
620                 return & m_VolList[i];
621             }
622         }
623 
624         return 0;
625     }
626 
627     /// The actual set of volumes.
628     vector<CSeqDBVolEntry> m_VolList;
629 
630     /// The index of the most recently used volume
631     ///
632     /// This variable is mutable and volatile, but is not protected by
633     /// locking.  Instead, the following precautions are always taken.
634     ///
635     /// 1. First, the value is copied into a local variable.
636     /// 2. Secondly, the range is always checked.
637     /// 3. It is always treated as a hint; there is always fallback
638     ///    code to search for the correct volume.
639     mutable volatile int m_RecentVol;
640 };
641 
642 END_NCBI_SCOPE
643 
644 #endif // OBJTOOLS_READERS_SEQDB__SEQDBVOLSET_HPP
645 
646 
647