1 #ifndef OBJTOOLS_READERS_SEQDB__SEQDBVOL_HPP
2 #define OBJTOOLS_READERS_SEQDB__SEQDBVOL_HPP
3 
4 /*  $Id: seqdbvol.hpp 631537 2021-05-19 13:50:49Z ivanov $
5  * ===========================================================================
6  *
7  *                            PUBLIC DOMAIN NOTICE
8  *               National Center for Biotechnology Information
9  *
10  *  This software/database is a "United States Government Work" under the
11  *  terms of the United States Copyright Act.  It was written as part of
12  *  the author's official duties as a United States Government employee and
13  *  thus cannot be copyrighted.  This software/database is freely available
14  *  to the public for use. The National Library of Medicine and the U.S.
15  *  Government have not placed any restriction on its use or reproduction.
16  *
17  *  Although all reasonable efforts have been taken to ensure the accuracy
18  *  and reliability of the software and data, the NLM and the U.S.
19  *  Government do not and cannot warrant the performance or results that
20  *  may be obtained by using this software or data. The NLM and the U.S.
21  *  Government disclaim all warranties, express or implied, including
22  *  warranties of performance, merchantability or fitness for any particular
23  *  purpose.
24  *
25  *  Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Author:  Kevin Bealer
30  *
31  */
32 
33 /// @file seqdbvol.hpp
34 /// Defines database volume access classes.
35 ///
36 /// Defines classes:
37 ///     CSeqDBVol
38 ///
39 /// Implemented for: UNIX, MS-Windows
40 
41 #include <objtools/blast/seqdb_reader/impl/seqdbatlas.hpp>
42 #include <objtools/blast/seqdb_reader/impl/seqdbgeneral.hpp>
43 #include <objtools/blast/seqdb_reader/impl/seqdbtax.hpp>
44 #include "seqdbcol.hpp"
45 #include <objects/seq/seq__.hpp>
46 
47 BEGIN_NCBI_SCOPE
48 
49 /// Import definitions from the objects namespace.
50 USING_SCOPE(objects);
51 
52 /// CSeqDBGiIndex
53 ///
54 /// This class maintains the OID->GI translation
55 class CSeqDBGiIndex : public CObject {
56 public:
57     typedef CSeqDBAtlas::TIndx   TIndx;
58     typedef int TOid;
59 //    typedef int TGi;
60 
CSeqDBGiIndex(CSeqDBAtlas & atlas,const string & dbname,char prot_nucl)61     CSeqDBGiIndex(CSeqDBAtlas  & atlas,
62                   const string & dbname,
63                   char           prot_nucl)
64       : m_Lease    (atlas),
65           //m_Fname    (dbname + '.' + prot_nucl + "og"),
66           m_NumOIDs  (0) {
67               m_Lease.Init(dbname + '.' + prot_nucl + "og");
68         }
69 
~CSeqDBGiIndex()70     ~CSeqDBGiIndex()
71     {
72         m_Lease.Clear();
73     }
74 
IndexExists(const string & name,const char prot_nucl)75     static bool IndexExists(const string & name,
76                             const char     prot_nucl)
77     {
78         string fn(name + '.' + prot_nucl + "og");
79         return CFile(fn).Exists();
80     }
81 
82     TGi GetSeqGI(TOid oid, CSeqDBLockHold & locked);
83 
84 private:
85     CSeqDBFileMemMap m_Lease;
86     //string         m_Fname;
87     Int4           m_Size;
88     Int4           m_NumOIDs;
89 };
90 
91 
92 /// CSeqDBRangeList
93 ///
94 /// This class maintains a list of ranges of sequence offsets that are
95 /// desired for performance optimization.  For large sequences that
96 /// need to be unpacked, this class describes the subsets of those
97 /// sequences that will actually be used.  Each instance of this class
98 /// corresponds to sequence data for one OID.
99 
100 class CSeqDBRangeList : public CObject {
101 public:
102     /// Constructor.
103     /// @param atlas The SeqDB memory management layer. [in]
CSeqDBRangeList()104     CSeqDBRangeList()
105       : m_CacheData (false)
106     {
107         // Sequence caching is not implemented yet.  It would increase
108         // performance further, but requires some consideration of the
109         // design with respect to locking and correctness.
110     }
111 
112     /// Destructor.
~CSeqDBRangeList()113     ~CSeqDBRangeList()
114     {
115         FlushSequence();
116     }
117 
118     /// Returns true if the sequence data is cached.
IsCached()119     bool IsCached()
120     {
121         return false;
122     }
123 
124     /// List of sequence offset ranges.
125     typedef set< pair<int, int> > TRangeList;
126 
127     /// Set ranges of the sequence that will be used.
128     /// @param ranges Offset ranges of the sequence that are needed. [in]
129     /// @param append_ranges If true, combine new ranges with old. [in]
130     /// @param cache_data If true, SeqDB is allowed to cache data. [in]
131     void SetRanges(const TRangeList & ranges,
132                    bool               append_ranges,
133                    bool               cache_data);
134 
135     /// Get ranges of sequence offsets that will be used.
GetRanges()136     const TRangeList & GetRanges()
137     {
138         return m_Ranges;
139     }
140 
141     /// Flush cached sequence data (if any).
FlushSequence()142     void FlushSequence()
143     {
144     }
145 
146     /// Sequences shorter than this will not use ranges in any case.
ImmediateLength()147     static int ImmediateLength()
148     {
149         return 10240;
150     }
151 
152 private:
153     /// Range of offsets needed for this sequence.
154     TRangeList m_Ranges;
155 
156     /// True if caching of sequence data is required for this sequence.
157     bool m_CacheData;
158 };
159 
160 /// CSeqDBVol class.
161 ///
162 /// This object defines access to one database volume.  It aggregates
163 /// file objects associated with the sequence and header data, and
164 /// ISAM objects used for translation of GIs and PIGs for data in this
165 /// volume.  The extensions managed here include those with file
166 /// extensions (pin, phr, psq, nin, nhr, and nsq), plus the optional
167 /// ISAM objects via the CSeqDBIsam class.
168 
169 class CSeqDBVol {
170 public:
171     /// Import TIndx definition from the CSeqDBAtlas class.
172     typedef CSeqDBAtlas::TIndx   TIndx;
173 
174     /// Constructor.
175     ///
176     /// All files connected with the database volume will be opened,
177     /// metadata about the volume will be read from the index file,
178     /// and identifier translation indices will be opened.  The name
179     /// of these files is the specified name of the volume plus an
180     /// extension.
181     ///
182     /// @param atlas
183     ///   The memory management layer object. [in]
184     /// @param name
    ///   The base name of the volume's files. [in]
186     /// @param prot_nucl
187     ///   The sequence type, kSeqTypeProt, or kSeqTypeNucl. [in]
188     /// @param user_list
189     ///   Specifies GIs or TIs of sequences to include. [in]
190     /// @param neg_list
191     ///   Specifies GIs or TIs of sequences to exclude. [in]
192     /// @param vol_start
193     ///   The volume's starting OID. [in]
194     /// @param locked
195     ///   The lock holder object for this thread. [in]
196     CSeqDBVol(CSeqDBAtlas        & atlas,
197               const string       & name,
198               char                 prot_nucl,
199               CSeqDBGiList       * user_list,
200               CSeqDBNegativeList * neg_list,
201               int                  vol_start,
202               CSeqDBLockHold     & locked);
203 
204     /// Open sequence file
205     ///
206     /// By default, sequence file is opened on a "lazy" schedule.
207     /// This method will force the sequence file to be opened.
208     ///
209     /// @param locked
210     ///     The lock holder object for this thread. [in]
211     void OpenSeqFile(CSeqDBLockHold &locked) const;
212 
213     /// Sequence length for protein databases.
214     ///
215     /// This method returns the length of the sequence in bases, and
216     /// should only be called for protein sequences.  It does not
217     /// require synchronization via the atlas object's lock.
218     ///
    /// @param oid
    ///   The OID of the sequence. [in]
    /// @return
    ///   The length in bases of the sequence.
225     int GetSeqLengthProt(int oid) const;
226 
227     /// Approximate sequence length for nucleotide databases.
228     ///
229     /// This method returns the length of the sequence using a fast
230     /// method that may be off by as much as 4 bases.  The method is
231     /// designed to be unbiased, meaning that the total length of
232     /// large numbers of sequences will approximate what the exact
233     /// length would be.  The approximate lengths will change if the
234     /// database is regenerated.  It does not require synchronization.
235     ///
    /// @param oid
    ///   The OID of the sequence. [in]
    /// @return
    ///   The approximate length in bases of the sequence.
242     int GetSeqLengthApprox(int oid) const;
243 
244     /// Exact sequence length for nucleotide databases.
245     ///
246     /// This method returns the length of the sequence in bases, and
247     /// should only be called for nucleotide sequences.  It requires
248     /// synchronization via the atlas object's lock, which must be
249     /// done in the calling code.
250     ///
    /// @param oid
    ///   The OID of the sequence. [in]
    /// @return
    ///   The length in bases of the sequence.
257     int GetSeqLengthExact(int oid) const;
258 
259     /// Get filtered sequence header information.
260     ///
261     /// This method returns the set of Blast-def-line objects stored
262     /// for each sequence.  These contain descriptive information
263     /// related to the sequence.  If OID filtering is enabled and a
264     /// membership bit is used, only deflines with that membership bit
265     /// set will be returned.  The OID list existence and membership
266     /// bit are contained in filt_info.  This field may be NULL, in
267     /// which case OID list bit filtering is not done (in this case
268     /// the deflines are not cached).
269     ///
270     /// @param oid
271     ///   The OID of the sequence. [in]
272     /// @param locked
273     ///   The lock holder object for this thread. [in]
274     /// @return
275     ///   The set of blast-def-lines describing this sequence.
276     CRef<CBlast_def_line_set>
277     GetFilteredHeader(int                    oid,
278                       CSeqDBLockHold       & locked) const;
279 
280     /// Get the sequence type stored in this database.
281     ///
282     /// This method returns the type of sequences stored in this
283     /// database, either kSeqTypeProt for protein, or kSeqTypeNucl for
284     /// nucleotide.
285     ///
286     /// @return
287     ///   Either kSeqTypeProt for protein, or kSeqTypeNucl for nucleotide.
288     char GetSeqType() const;
289 
290     /// Get a CBioseq object for this sequence.
291     ///
292     /// This method builds and returns a Bioseq for this sequence.
293     /// The taxonomy information is cached in this volume, so it
294     /// should not be modified directly, or other Bioseqs from this
295     /// SeqDB object may be affected.  If the CBioseq has an OID list,
296     /// and it uses a membership bit, the deflines included in the
297     /// CBioseq will be filtered based on the membership bit.  Zero
298     /// for the membership bit means no filtering.  Filtering can also
299     /// be done by a GI, in which case, only the defline matching that
300     /// GI will be returned.  The seqdata parameter can be specified
301     /// as false to indicate that sequence data should not be included
302     /// in this object; in this case the CSeq_inst object attached to
303     /// the bioseq will be configured to a "not set" state.  This is
304     /// used to allow Bioseq summary data to be provided without the
305     /// performance penalty of loading (possibly very large) sequence
306     /// data from disk.
307     ///
308     /// @param oid
309     ///   The OID of the sequence. [in]
310     /// @param pref_gi
311     ///   If specified, only return deflines containing this GI. [in]
    /// @param pref_seq_id
    ///   If specified, only return deflines containing this Seq_id. [in]
    /// @param seqdata
    ///   Include sequence data in the returned Bioseq. [in]
318     /// @param locked
319     ///   The lock holder object for this thread. [in]
320     /// @return
321     ///   A CBioseq describing this sequence.
322     CRef<CBioseq>
323     GetBioseq(int                    oid,
324               TGi                    pref_gi,
325               const CSeq_id        * pref_seq_id,
326               bool                   seqdata,
327               CSeqDBLockHold       & locked);
328 
329     /// Get the sequence data.
330     ///
331     /// This method gets the sequence data, returning a pointer and
332     /// the length of the sequence.  The atlas will be locked, but the
333     /// lock may also be returned during this method.  The computation
334     /// of the length of a nucleotide sequence involves a one byte
335     /// read that is likely to cause a page fault.  Releasing the
336     /// atlas lock before this (potential) page fault can help the
337     /// average performance in the multithreaded case.  It is safe to
338     /// release the lock because the sequence data is pinned down by
339     /// the reference count we have acquired to return to the user.
340     /// The returned sequence data is intended for blast searches, and
341     /// will contain random values in any ambiguous regions.
342     ///
    /// @param oid
    ///   The OID of the sequence. [in]
    /// @param buffer
    ///   The returned sequence data. [out]
    /// @return
    ///   The length of this sequence in bases.
GetSequence(int oid,const char ** buffer) const354     int GetSequence(int oid, const char ** buffer) const
355     {
356         return x_GetSequence(oid, buffer);
357     }
358 
359     /// Get a sequence with ambiguous regions.
360     ///
361     /// This method gets the sequence data, returning a pointer and
362     /// the length of the sequence.  For nucleotide sequences, the
363     /// data can be returned in one of two encodings.  Specify either
364     /// (kSeqDBNuclNcbiNA8) for NCBI/NA8, or (kSeqDBNuclBlastNA8) for
365     /// Blast/NA8.  The data can also be allocated in one of three
366     /// ways, enumerated in ESeqDBAllocType.  Specify eAtlas to use
367     /// the Atlas code, eMalloc to use the malloc() function, or eNew
368     /// to use the new operator.
369     ///
370     /// @param oid
371     ///   The OID of the sequence. [in]
372     /// @param buffer
373     ///   The returned sequence data. [out]
374     /// @param nucl_code
375     ///   The encoding of the returned sequence data. [in]
376     /// @param alloc_type
377     ///   The allocation routine used. [in]
    /// @param region
    ///   If non-null, the offset range to get. [in]
    /// @param masks
    ///   Sequence ranges supplied by the caller as masks; presumably
    ///   optional (may be null) -- confirm against the implementation. [in]
    /// @return
    ///   The length of this sequence in bases.
384     int GetAmbigSeq(int               oid,
385                     char           ** buffer,
386                     int               nucl_code,
387                     ESeqDBAllocType   alloc_type,
388                     SSeqDBSlice     * region,
389                     CSeqDB::TSequenceRanges * masks) const;
390 
391     int GetAmbigPartialSeq(int                oid,
392                            char            ** buffer,
393                            int                nucl_code,
394                            ESeqDBAllocType    alloc_type,
395                            CSeqDB::TSequenceRanges  * partial_ranges,
396                            CSeqDB::TSequenceRanges  * masks) const;
397 
398     /// Get the Seq-ids associated with a sequence.
399     ///
400     /// This method returns a list containing all the CSeq_id objects
401     /// associated with a sequence.
402     ///
    /// @param oid
    ///   The OID of the sequence. [in]
    /// @return
    ///   The list of Seq-id objects for this sequence.
409     list< CRef<CSeq_id> > GetSeqIDs(int  oid) const;
410     // same as above version with cached CObjectIStreamAsnBinary
411     list< CRef<CSeq_id> > GetSeqIDs(int  oid, CObjectIStreamAsnBinary  *inpstr) const;
412 
    /// Get the GI of a sequence.
    ///
    /// This method returns the GI of the sequence.
    ///
    /// @param oid
    ///   The OID of the sequence. [in]
    /// @param locked
    ///   The lock holder object for this thread. [in]
    /// @return
    ///   The GI of the sequence.
420     TGi GetSeqGI(int oid, CSeqDBLockHold & locked) const;
421 
422     /// Get the volume title.
423     /// @return The volume's title.
424     string GetTitle() const;
425 
    /// Get the LMDB file name associated with this volume.
    /// @return The file name; empty string if version 4.
428     string GetLMDBFileName() const;
429 
430     /// Get the formatting date of the volume.
431     /// @return The create-date of the volume.
432     string GetDate() const;
433 
434     /// Get the number of OIDs for this volume.
435     /// @return The number of OIDs.
436     int GetNumOIDs() const;
437 
438     /// Get the total length of this volume (in bases).
439     /// @return The total volume length.
440     Uint8 GetVolumeLength() const;
441 
442     /// Get the length of the largest sequence in this volume.
443     /// @return The largest sequence's length.
444     int GetMaxLength() const;
445 
446     /// Get the length of the smallest sequence in this volume.
447     /// @return The smallest sequence's length.
448     int GetMinLength() const;
449 
450     /// Get the volume name.
451     /// @return The volume name.
GetVolName() const452     const string & GetVolName() const
453     {
454         return m_VolName;
455     }
456 
457     /// Return expendable resources held by this volume.
458     ///
459     /// This volume holds resources acquired via the atlas.  This
460     /// method returns all such resources which can be automatically
461     /// reacquired (but not, for example, the index file data).
462     void UnLease();
463 
464 
465     /// Find the OID given a PIG.
466     ///
467     /// A lookup is done for the PIG, and if found, the corresponding
468     /// OID is returned.
469     ///
470     /// @param pig
471     ///   The pig to look up. [in]
    /// @param oid
    ///   The returned ordinal ID. [out]
    /// @return
    ///   True if the PIG was found.
478     bool PigToOid(int pig, int & oid) const;
479 
480     /// Find the PIG given an OID.
481     ///
482     /// If this OID is associated with a PIG, the PIG is returned.
483     ///
484     /// @param oid
485     ///   The oid of the sequence. [in]
486     /// @param pig
487     ///   The returned PIG. [out]
488     /// @param locked
489     ///   The lock holder object for this thread. [in]
490     /// @return
491     ///   True if a PIG was returned.
492     bool GetPig(int oid, int & pig, CSeqDBLockHold & locked) const;
493 
494     /// Find the OID given a TI.
495     ///
496     /// A lookup is done for the TI, and if found, the corresponding
497     /// OID is returned.
498     ///
499     /// @param ti
500     ///   The ti to look up. [in]
501     /// @param oid
502     ///   The returned ordinal ID. [out]
503     /// @param locked
504     ///   The lock holder object for this thread. [in]
505     /// @return
506     ///   True if the TI was found.
507     bool TiToOid(Int8                   ti,
508                  int                  & oid,
509                  CSeqDBLockHold       & locked) const;
510 
511     /// Find the OID given a GI.
512     ///
513     /// A lookup is done for the GI, and if found, the corresponding
514     /// OID is returned.
515     ///
516     /// @param gi
517     ///   The gi to look up. [in]
518     /// @param oid
519     ///   The returned ordinal ID. [out]
520     /// @param locked
521     ///   The lock holder object for this thread. [in]
522     /// @return
523     ///   True if an OID was returned.
524     bool GiToOid(TGi gi, int & oid, CSeqDBLockHold & locked) const;
525 
526     /// Find the GI given an OID.
527     ///
528     /// If this OID is associated with a GI, the GI is returned.
529     ///
530     /// @param oid
531     ///   The oid of the sequence. [in]
532     /// @param gi
533     ///   The returned GI. [out]
534     /// @param locked
535     ///   The lock holder object for this thread. [in]
536     /// @return
537     ///   True if a GI was returned.
538     bool GetGi(int                    oid,
539                TGi                  & gi,
540                CSeqDBLockHold       & locked) const;
541 
542     /// Find OIDs for the specified accession or formatted Seq-id.
543     ///
544     /// An attempt will be made to simplify the string by parsing it
545     /// into a list of Seq-ids.  If this works, the best Seq-id (for
546     /// lookup purposes) will be formatted and the resulting string
547     /// will be looked up in the string ISAM file.  The resulting set
548     /// of OIDs will be returned.  If the string is not found, the
549     /// array will be left empty.  Most matches only produce one OID.
550     ///
551     /// @param acc
552     ///   An accession or formatted Seq-id for which to search. [in]
553     /// @param oids
554     ///   A set of OIDs found for this sequence. [out]
555     /// @param locked
556     ///   The lock holder object for this thread. [in]
557     void AccessionToOids(const string         & acc,
558                          vector<int>          & oids,
559                          CSeqDBLockHold       & locked) const;
560 
561     /// Find OIDs for the specified Seq-id.
562     ///
563     /// The Seq-id will be formatted and the resulting string will be
564     /// looked up in the string ISAM file.  The resulting set of OIDs
565     /// will be returned.  If the string is not found, the array will
566     /// be left empty.  Most matches only produce one OID.
567     ///
568     /// @param seqid
569     ///   A Seq-id for which to search. [in]
570     /// @param oids
571     ///   A set of OIDs found for this sequence. [out]
572     /// @param locked
573     ///   The lock holder object for this thread. [in]
574     void SeqidToOids(CSeq_id              & seqid,
575                      vector<int>          & oids,
576                      CSeqDBLockHold       & locked) const;
577 
578     /// Find the OID at a given index into the database.
579     ///
580     /// This method considers the database as one long array of bases,
581     /// and finds the base at an offset into that array.  The sequence
582     /// nearest that base is determined, and the sequence's OID is
583     /// returned.  The OIDs are assigned to volumes in a different
584     /// order than with the readdb library, which can be an issue when
585     /// splitting the database for load balancing purposes.  When
586     /// computing the OID range, be sure to use GetNumOIDs(), not
587     /// GetNumSeqs().
588     ///
589     /// @param first_seq
590     ///   This OID or later is always returned. [in]
591     /// @param residue
592     ///   The position to find relative to the total length. [in]
593     /// @param locked
594     ///   The lock holder object for this thread. [in]
595     /// @return
596     ///   The OID of the sequence nearest the specified residue.
597     int GetOidAtOffset(int              first_seq,
598                        Uint8            residue,
599                        CSeqDBLockHold & locked) const;
600 
601     /// Translate Gis to Oids for the given vector of Gi/Oid pairs.
602     ///
603     /// This method iterates over a vector of Gi/Oid pairs.  For each
604     /// pair where OID is -1, the GI will be looked up in the ISAM
605     /// file, and (if found) the correct OID will be stored (otherwise
606     /// the -1 will remain).  This method will normally be called once
607     /// for each volume.
608     ///
609     /// @param gis
610     ///   The set of GI/OID, TI/OID, and Seq-id/OID pairs. [in|out]
611     /// @param locked
612     ///   The lock holder object for this thread. [in]
613     void IdsToOids(CSeqDBGiList   & gis,
614                    CSeqDBLockHold & locked) const;
615 
616     /// Add OIDs for this volume, filtered by negative ID lists.
617     ///
618     /// This method iterates over a vector of Gis or Tis.  For each
619     /// GI+OID or TI+OID line in the ISAM file, the OID's bit will be
620     /// enabled in the ID list, if the GI or TI is not found in the
621     /// negated GI or TI lists.  This method will normally be called
622     /// once for each volume.
623     ///
624     /// @param gis
625     ///   The set of GIs, TIs, and the OID bitmap. [in|out]
626     /// @param locked
627     ///   The lock holder object for this thread. [in]
628     void IdsToOids(CSeqDBNegativeList & gis,
629                    CSeqDBLockHold     & locked) const;
630 
631     /// Filter this volume using the specified GI list.
632     ///
633     /// A volume can be filtered by a GI list.  This method attaches a
634     /// GI list to the volume, in addition to any GI lists that are
635     /// already attached.
636     ///
637     /// @param gilist
638     ///   A list of GIs to use as a filter. [in]
AttachVolumeGiList(CRef<CSeqDBGiList> gilist) const639     void AttachVolumeGiList(CRef<CSeqDBGiList> gilist) const
640     {
641         m_VolumeGiLists.push_back(gilist);
642     }
643 
644     /// Simplify the GI list configuration.
645     ///
646     /// When all user and volume GI lists have been attached, the user
647     /// GI list may be removed; this is only possible if neither the
648     /// user nor volume GI lists contain Seq-id data.
649     void OptimizeGiLists() const;
650 
651     /// Fetch data as a CSeq_data object.
652     ///
653     /// All or part of the sequence is fetched in a CSeq_data object.
654     /// The portion of the sequence returned is specified by begin and
655     /// end.  An exception will be thrown if begin is greater than or
656     /// equal to end, or if end is greater than or equal to the length
657     /// of the sequence.  Begin and end should be specified in bases;
658     /// a range like (0,1) specifies 1 base, not 2.  Nucleotide data
659     /// will always be returned in ncbi4na format.
660     ///
661     /// @param oid    Specifies the sequence to fetch. [in]
662     /// @param begin  Specifies the start of the data to get. [in]
663     /// @param end    Specifies the end of the data to get.   [in]
664     /// @param locked The lock holder object for this thread. [in]
665     /// @return The sequence data as a Seq-data object.
666     CRef<CSeq_data> GetSeqData(int              oid,
667                                TSeqPos          begin,
668                                TSeqPos          end,
669                                CSeqDBLockHold & locked) const;
670 
671     /// Get Raw Sequence and Ambiguity Data.
672     ///
673     /// Get a pointer to the raw sequence and ambiguity data, and the
674     /// length of each.  The encoding for these is not defined here
675     /// and should not be relied on to be compatible between different
676     /// database format versions.  NULL can be supplied for parameters
677     /// that are not needed (except oid).  RetSequence() must be
678     /// called with the pointer returned by 'buffer' if and only if
679     /// that pointer is supplied as non-null by the user.  Protein
680     /// sequences will never have ambiguity data.  Ambiguity data will
681     /// be packed in the returned buffer at offset *seq_length.
682     ///
    /// @param oid Ordinal id of the sequence. [in]
    /// @param buffer Buffer of raw data. [out]
    /// @param seq_length Returned length of the sequence data. [out]
    /// @param ambig_length Returned length of the ambiguity data. [out]
688     void GetRawSeqAndAmbig(int              oid,
689                            const char    ** buffer,
690                            int            * seq_length,
691                            int            * ambig_length) const;
692 
693     /// Get GI Bounds.
694     ///
695     /// Fetch the lowest, highest, and total number of GIs.  If the
696     /// operation fails, zero will be returned for count.
697     ///
698     /// @param low_id Lowest GI value in database. [out]
699     /// @param high_id Highest GI value in database. [out]
700     /// @param count Number of GI values in database. [out]
701     /// @param locked Lock holder object for this thread. [in]
702     void GetGiBounds(TGi            & low_id,
703                      TGi            & high_id,
704                      int            & count,
705                      CSeqDBLockHold & locked) const;
706 
707     /// Get PIG Bounds.
708     ///
709     /// Fetch the lowest, highest, and total number of PIGs.  If the
710     /// operation fails, zero will be returned for count.
711     ///
712     /// @param low_id Lowest PIG value in database. [out]
713     /// @param high_id Highest PIG value in database. [out]
714     /// @param count Number of PIG values in database. [out]
715     /// @param locked Lock holder object for this thread. [in]
716     void GetPigBounds(int            & low_id,
717                       int            & high_id,
718                       int            & count,
719                       CSeqDBLockHold & locked) const;
720 
721     /// Get String Bounds.
722     ///
723     /// Fetch the lowest, highest, and total number of string keys in
724     /// the database index.  If the operation fails, zero will be
725     /// returned for count.
726     ///
    /// @param low_id Lowest string value in database. [out]
    /// @param high_id Highest string value in database. [out]
    /// @param count Number of string values in database. [out]
731     void GetStringBounds(string         & low_id,
732                          string         & high_id,
733                          int            & count) const;
734 
735     /// List of sequence offset ranges.
736     typedef set< pair<int, int> > TRangeList;
737 
    /// Apply a range of offsets to a database sequence.
    ///
    /// The GetAmbigSeq() method requires an amount of work (and I/O)
    /// which is proportional to the size of the sequence data (more
    /// if ambiguities are present).  In some cases, only certain
    /// subranges of this data will be utilized.  This method allows
    /// the user to specify which parts of a sequence are actually
    /// needed by the user.  (Care should be taken if one SeqDB object
    /// is shared by several program components.)  (Note that offsets
    /// above the length of the sequence will not generate an error,
    /// and are replaced by the sequence length.)
    ///
    /// If ranges are specified for a sequence, data areas in
    /// specified sequences will be accurate, but data outside the
    /// specified ranges should not be accessed, and no guarantees are
    /// made about what data they will contain.  If the append_ranges
    /// flag is true, the ranges will be added to existing ranges.  If
    /// false, existing ranges will be flushed and replaced by new
    /// ranges.  To remove ranges, call this method with an empty list
    /// of ranges; future calls will return the complete sequence.
    ///
    /// If the cache_data flag is provided, data for this sequence
    /// will be kept for the duration of SeqDB's lifetime.  To disable
    /// caching (and flush cached data) for this sequence, call the
    /// method again, but specify cache_data to be false.
    ///
    /// @param oid           OID of the sequence. [in]
    /// @param offset_ranges Ranges of sequence data to return. [in]
    /// @param append_ranges Append new ranges to existing list. [in]
    /// @param cache_data    Keep sequence data for future callers. [in]
    void SetOffsetRanges(int                oid,
                         const TRangeList & offset_ranges,
                         bool               append_ranges,
                         bool               cache_data) const;
773 
    /// Flush all cached offset ranges.
    ///
    /// Takes no arguments and, unlike SetOffsetRanges(), is not
    /// declared const.
    void FlushOffsetRangeCache();
777 
    /// Get the sequence hash for a given OID.
    ///
    /// The sequence data is fetched and the sequence hash is
    /// computed and returned.  The hash is returned as an unsigned
    /// value, matching the `hash' argument of HashToOids().
    ///
    /// @param oid The sequence to compute the hash of. [in]
    /// @return The sequence hash.
    unsigned GetSequenceHash(int oid);
786 
    /// Get the OIDs for a given sequence hash.
    ///
    /// The OIDs corresponding to a hash value (if any) are found and
    /// returned.  If none are found, the vector will be empty.  If
    /// the index does not exist for this volume, an exception will be
    /// thrown.  Some false positives may be returned due to hash
    /// value collisions, so callers needing an exact match must
    /// verify the returned sequences themselves.
    ///
    /// @param hash The sequence hash to look up. [in]
    /// @param oids OIDs of sequences with this hash. [out]
    /// @param locked Lock holder object for this thread. [in|out]
    void HashToOids(unsigned         hash,
                    vector<int>    & oids,
                    CSeqDBLockHold & locked) const;
801 
    /// List the titles of all columns for this volume.
    ///
    /// @param titles Receives the column titles. [out]
    ///   NOTE(review): whether entries already in the set are kept or
    ///   cleared is not visible from this declaration -- confirm.
    /// @param locked The lock holder object for this thread. [in]
    void ListColumns(set<string>    & titles,
                     CSeqDBLockHold & locked);
805 
    /// Get an ID number for a given column title.
    ///
    /// For a given column title, this returns an ID that can be used
    /// to access that column in the future.  The returned ID number
    /// is specific to this instance of SeqDB.  If the database does
    /// not have a column with this name, -1 will be returned.
    ///
    /// @param title Column title to search for. [in]
    /// @param locked The lock holder object for this thread. [in]
    /// @return Column ID number for this column, or -1 if not found.
    int GetColumnId(const string   & title,
                    CSeqDBLockHold & locked);
818 
    /// Get all metadata for the specified column.
    ///
    /// Columns may contain user-defined metadata as a list of
    /// key-value pairs.  For the specified column, this returns that
    /// column's metadata as a map.  If multiple volumes
    /// are present, and they define contradictory metadata (this is
    /// more common when multiple databases are opened at once), this
    /// method returns the first value it finds for each metadata key.
    /// If this is unsatisfactory, the two-argument version of this
    /// method may be used to get more precise values for specific
    /// volumes.
    ///
    /// The map is returned by const reference.
    /// NOTE(review): the lifetime of the referenced map is not
    /// visible from this declaration -- confirm before storing it.
    ///
    /// @param col_id The column id from GetColumnId. [in]
    /// @param locked The lock holder object for this thread. [in]
    /// @return The map of metadata for this column.
    const map<string,string> &
    GetColumnMetaData(int              col_id,
                      CSeqDBLockHold & locked);
837 
    /// Fetch the data blob for the given column and oid.
    ///
    /// This method finds the blob data for this OID and column, and
    /// stores a reference to it in the provided blob.  If `keep' is
    /// true, a `lifetime' object is attached to the blob to ensure
    /// the memory is not unmapped when the atlas lock is released.
    ///
    /// It is important to specify `keep' correctly to avoid memory
    /// faults and/or deadlocks.  If `keep' is false, the blob must
    /// not be returned to the user or accessed after the atlas lock
    /// is released, since the memory it references may no longer be
    /// mmapped.  On the other hand, if `keep' is true, the blob may
    /// be safely returned to the user, but must not be reassigned or
    /// destructed until the atlas lock is released (or a deadlock
    /// will occur).  This includes destruction due to `stack
    /// unwinding'.
    ///
    /// For similar reasons, the blob should be empty on input.
    ///
    /// @param col_id The column to fetch data from. [in]
    /// @param oid    The OID of the blob. [in]
    /// @param blob   The data will be returned here. [out]
    /// @param keep   If true, increment the memory region. [in]
    /// @param locked The lock holder object for this thread. [in]
    void GetColumnBlob(int              col_id,
                       int              oid,
                       CBlastDbBlob   & blob,
                       bool             keep,
                       CSeqDBLockHold & locked);
866 
867     /// Set the MEMB_BIT fitlering for this volume.
868     ///
869     /// This method sets the MEMB_BIT for the volume.  If the
870     /// MEMB_BIT has already been set, and the new bit is different,
871     /// exception will be thrown.   This prevents conflicting MEMB_BIT
872     /// settings within an alias tree; nevertheless, it also prevents
873     /// aggregating the same volume with different MEMB_BIT settings,
874     /// such as "DBLIST swissprot pdb".   The latter case is probably
875     /// not desired.  Support for this "paralogous" case will probably
876     /// come later.
877     ///
878     /// @param mbit  The bit to set [in]
SetMemBit(int mbit) const879     void SetMemBit(int mbit) const {
880         if (m_MemBit && mbit != m_MemBit) {
881             NCBI_THROW(CSeqDBException, eFileErr,
882                    "MEMB_BIT error: conflicting bit found.");
883         }
884         m_MemBit = mbit;
885     }
886 
887 private:
    /// Translate a string-form identifier into OIDs.
    ///
    /// NOTE(review): this declaration is undocumented in the header;
    /// the parameter roles below are inferred from their names and
    /// types only and must be confirmed against the implementation.
    ///
    /// @param acc        Presumably the accession or formatted Seq-id
    ///                   as supplied by the caller. [in]
    /// @param id_type    Presumably the kind of identifier that acc
    ///                   was recognized as. [in]
    /// @param ident      Presumably the numeric form of the
    ///                   identifier, for numeric id types. [in]
    /// @param str_id     Presumably the string form of the
    ///                   identifier, for string id types. [in]
    /// @param simplified Presumably whether the id was simplified
    ///                   before lookup. [in]
    /// @param oids       Receives the OIDs found. [out]
    void x_StringToOids(const string         & acc,
                        ESeqDBIdType           id_type,
                        Int8                   ident,
                        const string         & str_id,
                        bool                   simplified,
                        vector<int>          & oids) const;
894 
    /// A collection of GI lists, held as a vector of CRef handles
    /// (so a list may be shared rather than copied).
    typedef vector< CRef<CSeqDBGiList> > TGiLists;
897 
898     /// Returns true if this volume has a positive ID list.
x_HaveGiList(void) const899     bool x_HaveGiList(void) const
900     {
901         return ! (m_UserGiList.Empty() && m_VolumeGiLists.empty());
902     }
903 
904     /// Returns true if this volume has a negative ID list.
x_HaveNegativeList(void) const905     bool x_HaveNegativeList(void) const
906     {
907         return m_NegativeList.NotEmpty();
908     }
909 
910     /// Returns true if this volume has an ID list.
x_HaveIdFilter(void) const911     bool x_HaveIdFilter(void) const
912     {
913         return x_HaveGiList() || x_HaveNegativeList();
914     }
915 
916     /// Determine if a user ID list affects this ID, and how.
917     ///
918     /// This is used to accumulate information about a Seq-id in two
919     /// boolean variables.  In order for a Seq-id to be considered
920     /// `included', it must pass filtering by both the user ID list
921     /// (if one was specified) and at least one of the set of ID lists
922     /// attached to the volume (if any exist).  This function will be
923     /// called repeatedly for each ID in a defline to determine if the
924     /// defline as a whole passes the filtering tests.  If the
925     /// booleans are set to true, this code never sets it to false,
926     /// and can skip the associated test.  This is because a defline
927     /// is included if one of its Seq-ids matches the volume ID list
928     /// but a different one matches the user ID list.  For negative ID
929     /// lists this returns true if the type of ID matches the kind
930     /// used by the negative list, but the ID is not found therein.
931     ///
932     /// @param id Sequence id to check for. [in]
933     /// @param have_user Will be set if the user list has id. [in|out]
934     /// @param have_vol Will be set if the volume list has id. [in|out]
x_FilterHasId(const CSeq_id & id,bool & have_user,bool & have_vol) const935     void x_FilterHasId(const CSeq_id & id,
936                        bool          & have_user,
937                        bool          & have_vol) const
938     {
939         if (! have_user) {
940             if (m_UserGiList.NotEmpty() && m_UserGiList->GetNumTaxIds() == 0 && m_UserGiList->GetNumPigs() == 0 ) {
941                 have_user |= x_ListIncludesId(*m_UserGiList, id);
942             } else if (m_NegativeList.NotEmpty() && m_NegativeList->GetNumTaxIds() == 0 && m_NegativeList->GetNumPigs() == 0 ) {
943                 have_user |= x_ListIncludesId(*m_NegativeList, id);
944             } else {
945                 have_user = true;
946             }
947         }
948 
949         if (! have_vol) {
950             if (m_VolumeGiLists.empty()) {
951                 have_vol = true;
952             } else {
953                 NON_CONST_ITERATE(TGiLists, gilist, m_VolumeGiLists) {
954                     if (x_ListIncludesId(**gilist, id)) {
955                         have_vol = true;
956                         break;
957                     }
958                 }
959             }
960         }
961     }
962 
963     /// Returns true if this volume's ID list has this Seq-id.
964     /// @param L A GI list to test against. [in]
965     /// @param id A Seq-id to test against L. [in]
966     /// @return True if the list contains the specified Seq-id.
x_ListIncludesId(CSeqDBGiList & L,const CSeq_id & id) const967     bool x_ListIncludesId(CSeqDBGiList & L, const CSeq_id & id) const
968     {
969         return L.FindId(id);
970     }
971 
972     /// Returns true if this ID is not found in the negative ID list.
973     ///
974     /// This checks whether an ID is found in the negative ID list,
975     /// and whether the ID is the right type (so that it might
976     /// possibly be found).  If the ID is the right type, and is not
977     /// found, this method returns true.  In other cases it returns
978     /// false.  This technique could be described as treating the
979     /// negative GI list as the list of all GIs not mentioned in the
980     /// vector stored in the list, and similarly for the TIs.  This
981     /// means that every TI and GI in the ASN.1 for this defline must
982     /// be mentioned in the negative ID list in order to exclude the
983     /// defline.  In normal practice, only one GI or TI ever exists
984     /// for a defline.
985     ///
986     /// @param L A GI list to test against. [in]
987     /// @param id A Seq-id to test against L. [in]
988     /// @return True if the list contains the specified Seq-id.
x_ListIncludesId(CSeqDBNegativeList & L,const CSeq_id & id) const989     bool x_ListIncludesId(CSeqDBNegativeList & L, const CSeq_id & id) const
990     {
991         // A defline is included IFF either a GI or TI is found, and
992         // that ID is not on the list.
993 
994         // I use the terms 'included' and 'mentioned' to describe the
995         // negative list processing as follows: "A negative list
996         // INCLUDES a TI or GI if that ID is not MENTIONED in the
997         // negative list."
998 
999         bool match_type = false;
1000         bool found = L.FindId(id, match_type);
1001 
1002         return (! found) && match_type;
1003     }
1004 
    /// Get sequence header object.
    ///
    /// This method returns the sequence header information as an
    /// ASN.1 object.  Seq-ids of type "gnl|BL_ORD_ID|#" are stored as
    /// values relative to this volume.  If they will be returned to
    /// the user in any way, specify true for adjust_oids to adjust
    /// them to the global OID range.
    ///
    /// @param oid
    ///   The OID of the sequence. [in]
    /// @param adjust_oids
    ///   If true, BL_ORD_ID ids will be adjusted from volume-relative
    ///   values to the global OID range. [in]
    /// @param changed
    ///   Indicates whether ASN.1 data needed changes (optional). [out]
    /// @return
    ///   The Blast-def-line-set describing this sequence.
    CRef<CBlast_def_line_set>
    x_GetHdrAsn1(int              oid,
                 bool             adjust_oids,
                 bool           * changed) const;
1027 
    /// Get sequence header object, with a caller-supplied stream.
    ///
    /// Overload of x_GetHdrAsn1() that takes an additional pointer to
    /// an ASN.1 binary input stream.
    /// NOTE(review): presumably the stream is reused for decoding the
    /// header instead of creating a new one per call, and may be
    /// null -- confirm against the implementation.
    ///
    /// @param oid
    ///   The OID of the sequence. [in]
    /// @param adjust_oids
    ///   Same flag as in the three-argument overload. [in]
    /// @param changed
    ///   Same as in the three-argument overload. [out]
    /// @param inpstr
    ///   ASN.1 binary input stream object (see note above).
    /// @return
    ///   The Blast-def-line-set describing this sequence.
    CRef<CBlast_def_line_set>
    x_GetHdrAsn1(int              oid,
                 bool             adjust_oids,
                 bool           * changed,
		 CObjectIStreamAsnBinary  *inpstr) const;
    /// Get sequence header binary data.
    ///
    /// This method returns the sequence header information as a
    /// reference to raw ASN.1 binary data.  This reference can be
    /// used until the next access to the Atlas layer or the header
    /// data memory lease.
    ///
    /// @param oid
    ///   The OID of the sequence. [in]
    /// @return
    ///   A CTempString referring to the raw binary ASN.1 header data
    ///   for this sequence (not a decoded Blast-def-line-set).
    CTempString x_GetHdrAsn1Binary(int oid) const;
1047 
    /// Get binary sequence header information.
    ///
    /// This method reads the sequence header information (as binary
    /// encoded ASN.1) into a supplied char vector.
    ///
    /// @param oid
    ///   The OID of the sequence. [in]
    /// @param hdr_data
    ///   The returned binary ASN.1 of the Blast-def-line-set. [out]
    void
    x_GetFilteredBinaryHeader(int                    oid,
                              vector<char>         & hdr_data) const;
1062 
1063 
    /// Get sequence header information.
    ///
    /// This method returns the set of Blast-def-line objects stored
    /// for each sequence.  These contain descriptive information
    /// related to the sequence.  If OID filtering is enabled and a
    /// membership bit is used, only deflines with that membership bit
    /// set will be returned.
    ///
    /// @param oid
    ///   The OID of the sequence. [in]
    /// @param changed
    ///   Indicates whether ASN.1 data needed changes (optional). [out]
    /// @return
    ///   The set of blast-def-lines describing this sequence.
    CRef<CBlast_def_line_set>
    x_GetFilteredHeader(int                    oid,
                        bool                 * changed) const;
1083 
    /// Get sequence header information, with a caller-supplied stream.
    ///
    /// Overload of x_GetFilteredHeader() that takes an additional
    /// pointer to an ASN.1 binary input stream.
    /// NOTE(review): presumably the stream is reused for decoding the
    /// header and may be null -- confirm against the implementation.
    ///
    /// @param oid
    ///   The OID of the sequence. [in]
    /// @param changed
    ///   Same as in the two-argument overload. [out]
    /// @param inpstr
    ///   ASN.1 binary input stream object (see note above).
    /// @return
    ///   The set of blast-def-lines describing this sequence.
    CRef<CBlast_def_line_set>
    x_GetFilteredHeader(int                    oid,
                        bool                 * changed,
			CObjectIStreamAsnBinary *inpstr ) const;
1088 
    /// Get sequence header information structures.
    ///
    /// This method reads the sequence header information and returns
    /// a Seqdesc suitable for inclusion in a CBioseq.  This object
    /// will contain an opaque type, storing the sequence headers as
    /// binary ASN.1, wrapped in a C++ ASN.1 structure (CSeqdesc).
    ///
    /// @param oid
    ///   The OID of the sequence. [in]
    /// @return
    ///   The CSeqdesc to include in the CBioseq.
    CRef<CSeqdesc> x_GetAsnDefline(int oid) const;
1103 
    /// Get the sequence type of this volume as a character code.
    ///
    /// @return 'p' for protein databases, or 'n' for nucleotide.
    char x_GetSeqType() const;
1106 
    /// Get ambiguity information.
    ///
    /// This method is used to fetch the ambiguity data for sequences
    /// in a nucleotide database.  The ambiguity data describes
    /// sections of the nucleotide sequence for which more than one of
    /// 'A', 'C', 'G', or 'T' are possible.  The integers returned by
    /// this function contain a packed description of the ranges of
    /// the sequence which have such data.  This method only returns
    /// the array of integers, and does not interpret them, except for
    /// byte swapping.
    ///
    /// @param oid
    ///   The OID of the sequence. [in]
    /// @param ambchars
    ///   The returned array of ambiguity descriptors. [out]
    void x_GetAmbChar(int              oid,
                      vector<Int4>   & ambchars) const;
1126 
    /// Get a sequence with ambiguous regions.
    ///
    /// This method gets the sequence data, returning a pointer and
    /// the length of the sequence.  For nucleotide sequences, the
    /// data can be returned in one of two encodings.  Specify either
    /// (kSeqDBNuclNcbiNA8) for NCBI/NA8, or (kSeqDBNuclBlastNA8) for
    /// Blast/NA8.  The data can also be allocated in one of three
    /// ways, enumerated in ESeqDBAllocType.  Specify eAtlas to use
    /// the Atlas code, eMalloc to use the malloc() function, or eNew
    /// to use the new operator.
    ///
    /// @param oid
    ///   The OID of the sequence. [in]
    /// @param buffer
    ///   The returned sequence data. [out]
    /// @param nucl_code
    ///   The encoding of the returned sequence data. [in]
    /// @param alloc_type
    ///   The allocation routine used. [in]
    /// @param region
    ///   If non-null, the offset range to get. [in]
    /// @param masks
    ///   Pointer to a set of sequence ranges.
    ///   NOTE(review): presumably masking ranges applied to the
    ///   returned data, and may be null -- confirm against the
    ///   implementation.
    /// @return
    ///   The length of this sequence in bases.
    int x_GetAmbigSeq(int                oid,
                      char            ** buffer,
                      int                nucl_code,
                      ESeqDBAllocType    alloc_type,
                      SSeqDBSlice      * region,
                      CSeqDB::TSequenceRanges *masks) const;
1158 
    /// Allocate memory in one of several ways.
    ///
    /// This method provides functionality to allocate memory with the
    /// atlas layer, using malloc, or using the new [] operator.  The
    /// user is expected to return the data using the corresponding
    /// deallocation technique.
    ///
    /// @param length
    ///     The number of bytes to get. [in]
    /// @param alloc_type
    ///     The type of allocation routine to use. [in]
    /// @return
    ///     A pointer to the allocated memory.
    char * x_AllocType(size_t            length,
                       ESeqDBAllocType   alloc_type) const;
1176 
    /// Get sequence data.
    ///
    /// The sequence data is found and returned for the specified
    /// sequence.  The caller owns the data and a hold on the
    /// underlying memory region.  There is a memory access in this
    /// code that tends to trigger a soft (and possibly hard) page
    /// fault in the nucleotide case.
    ///
    /// NOTE(review): earlier documentation for this overload
    /// described `keep', `locked', `can_release' and `in_lease'
    /// arguments and lock-release behavior tied to them.  This
    /// overload takes none of those arguments; only the six-argument
    /// x_GetSequence() overload has `keep', `locked' and
    /// `can_release'.  Confirm the current locking behavior against
    /// the implementation.
    ///
    /// @param oid
    ///     The ordinal ID of the sequence to get. [in]
    /// @param buffer
    ///     The returned sequence data buffer. [out]
    /// @return
    ///     The length of the sequence in bases.
    int x_GetSequence(int              oid,
                      const char    ** buffer) const;
1204 
    /// Get partial sequence data.
    ///
    /// The sequence data is found and returned for the specified oid
    /// and offset range.  If the region argument is non-null, the
    /// region endpoints are verified against the sequence endpoints.
    /// Otherwise, this method is the same as the two-argument
    /// x_GetSequence() overload.  Note that the code returns the
    /// length of the region in bases, but buffer is set to a pointer
    /// to the beginning of the sequence, not the beginning of the
    /// region.
    ///
    /// @param oid
    ///   The ordinal ID of the sequence to get. [in]
    /// @param buffer
    ///   The returned sequence data buffer. [out]
    /// @param keep
    ///   Specify true if the caller wants a hold on the sequence. [in]
    /// @param locked
    ///   The lock holder object for this thread. [in]
    /// @param can_release
    ///   Specify true if the atlas lock can be released. [in]
    /// @param region
    ///   If non-null, the offset range to get. [in]
    /// @return
    ///   The length of the returned portion in bases.
    int x_GetSequence(int              oid,
                      const char    ** buffer,
                      bool             keep,
                      CSeqDBLockHold & locked,
                      bool             can_release,
                      SSeqDBSlice    * region) const;
1235 
    /// Get defline filtered by several criteria.
    ///
    /// This method returns the set of deflines for a sequence.  If
    /// there is an OID list and membership bit, these will be
    /// filtered by membership bit.  If a preferred GI is
    /// specified, the defline matching that GI (if found) will be
    /// moved to the front of the set.
    ///
    /// @param oid
    ///     The ordinal ID of the sequence to get. [in]
    /// @param preferred_gi
    ///     This GI's defline (if non-zero and found) will be put at the front of the list. [in]
    /// @param preferred_seq_id
    ///     This SeqID's defline (if non-NULL and found) will be put at the front of the list. [in]
    /// @return
    ///     The defline set for the specified oid.
    CRef<CBlast_def_line_set>
    x_GetTaxDefline(int                    oid,
                    TGi                    preferred_gi,
                    const CSeq_id        * preferred_seq_id);
1258 
1259 
    /// Get taxonomic descriptions of a sequence.
    ///
    /// This method builds a set of CSeqdesc objects from taxonomic
    /// information and blast deflines.  If there is an OID list and
    /// membership bit, the deflines will be filtered by membership
    /// bit.  If a preferred GI is specified, the defline
    /// matching that GI (if found) will be moved to the front of the
    /// set.  This method is called as part of the processing for
    /// building a CBioseq object.
    ///
    /// @param oid
    ///     The ordinal ID of the sequence to get. [in]
    /// @param preferred_gi
    ///     This GI's defline (if non-zero and found) will be put at the front of the list. [in]
    /// @param preferred_seq_id
    ///     This SeqID's defline (if non-NULL and found) will be put at the front of the list. [in]
    /// @return
    ///     A list of CSeqdesc objects for the specified oid.
    list< CRef<CSeqdesc> >
    x_GetTaxonomy(int                    oid,
                  TGi                    preferred_gi,
                  const CSeq_id        * preferred_seq_id);
1286 
1287 
    /// Returns the base-offset of the specified oid.
    ///
    /// This method finds the starting offset of the OID relative to
    /// the start of the volume, and returns that distance as a number
    /// of bytes.  The range of the return value should be from zero
    /// to the size of the sequence file in bytes.  Note that the
    /// total volume length in bytes can be found by submitting the
    /// OID count as the input oid, because the index file contains
    /// one more array element than there are sequences.
    ///
    /// @param oid
    ///     The sequence of which to get the starting offset. [in]
    /// @return
    ///     The offset in the volume of that sequence in bytes.
    Uint8 x_GetSeqResidueOffset(int oid) const;
1305 
    /// Find all columns for this volume.
    ///
    /// This method looks for and opens any columns that might be
    /// associated with this database volume.
    /// NOTE(review): presumably guarded by m_HaveColumns so the work
    /// is done only once -- confirm against the implementation.
    ///
    /// @param locked
    ///     The lock holder object for this thread. [in]
    void x_OpenAllColumns(CSeqDBLockHold & locked);
1314 
    /// Check Seq-id versions for special sparse-id support case.
    ///
    /// The BlastDB `sparse indexing' feature omits versions when
    /// emitting (string) ISAM indices.  If a search for a Seq-id with
    /// a version fails, SeqDB strips the version and tries the search
    /// again.  However, for non-sparse databases, this second search
    /// has the harmful side effect that it can find IDs with the same
    /// accession but an incorrect version.  This method scans the OID
    /// list and removes the OIDs with incorrect versions.  It should
    /// only be called in cases when the version removal needed to be
    /// done to get results.
    ///
    /// @param acc
    ///   An accession or formatted Seq-id for which to search. [in]
    /// @param oids
    ///   The set of OIDs found for this sequence; entries with an
    ///   incorrect version are removed. [in|out]
    void x_CheckVersions(const string         & acc,
                         vector<int>          & oids) const;
1335 
    // Open / unlease helpers for the individual files of this volume
    // (sequence, header, PIG, GI, string, TI, hash and OID files).
    // All are const and take no arguments.
    // NOTE(review): presumably each x_Open*File() initializes the
    // corresponding mutable member (m_Seq, m_Hdr, m_IsamPig, m_IsamGi,
    // m_IsamStr, m_IsamTi, m_IsamHash, m_GiIndex) on first use, and
    // each x_Unlease*File() releases the memory lease of its ISAM
    // file -- confirm against the implementation.
    void x_OpenSeqFile(void) const;
    void x_OpenHdrFile(void) const;
    void x_OpenPigFile(void) const;
    void x_UnleasePigFile(void) const;
    void x_OpenGiFile(void) const;
    void x_UnleaseGiFile(void) const;
    void x_OpenStrFile(void) const;
    void x_UnleaseStrFile(void) const;
    void x_OpenTiFile(void) const;
    void x_UnleaseTiFile(void) const;
    void x_OpenHashFile(void) const;
    void x_OpenOidFile(void) const;
1348 
    /// The memory management layer.
    CSeqDBAtlas & m_Atlas;

    /// True if the volume is protein, false for nucleotide.
    bool m_IsAA;

    /// The name of this volume.
    string m_VolName;

    /// Metadata plus offsets into the sequence, header, and ambiguity data.
    CRef<CSeqDBIdxFile> m_Idx;

    /// Contains sequence data for this volume.
    mutable CRef<CSeqDBSeqFile> m_Seq;

    /// Contains header (defline) information for this volume.
    mutable CRef<CSeqDBHdrFile> m_Hdr;

    // These are mutable because they defer initialization.

    /// Handles translation of PIGs to OIDs.
    mutable CRef<CSeqDBIsam> m_IsamPig;

    /// Handles translation of GIs to OIDs.
    mutable CRef<CSeqDBIsam> m_IsamGi;

    /// Handles translation of strings (accessions) to OIDs.
    mutable CRef<CSeqDBIsam> m_IsamStr;

    /// Handles translation of TI (trace ids) to OIDs.
    mutable CRef<CSeqDBIsam> m_IsamTi;

    /// Handles translation of sequence hash value to OIDs.
    mutable CRef<CSeqDBIsam> m_IsamHash;

    /// The GI index file (for fast oid->gi conversion)
    mutable CRef<CSeqDBGiIndex> m_GiIndex;

    /// This cache allows CBioseqs to share taxonomic objects.
    mutable CSeqDBIntCache< CRef<CSeqdesc> > m_TaxCache;

    /// The user ID list, if one exists.
    mutable CRef<CSeqDBGiList> m_UserGiList;

    /// The negative ID list, if one exists.
    mutable CRef<CSeqDBNegativeList> m_NegativeList;

    /// The volume GI lists, if any exist.
    mutable TGiLists m_VolumeGiLists;

    /// The filtering MEMB_BIT.  Zero is treated as `not set' by
    /// SetMemBit(); mutable so that const method can assign it.
    mutable int m_MemBit;

    /// Cached/ranged sequence info type (keyed by int).
    /// NOTE(review): presumably the key is the OID -- confirm.
    typedef map<int, CRef<CSeqDBRangeList> > TRangeCache;

    /// Cached/ranged sequence info.
    mutable TRangeCache m_RangeCache;

    /// Starting OID of this volume.
    int m_VolStart;

    /// First OID past end of this volume.
    int m_VolEnd;

    /// Filtered defline plus whether binary data needed changes.
    typedef pair<CRef<CBlast_def_line_set>, bool> TDeflineCacheItem;

    /// Cache of filtered deflines.
    mutable CSeqDBIntCache<TDeflineCacheItem> m_DeflineCache;

    /// True if we have opened the columns for this volume.
    bool m_HaveColumns;

    /// For each of the following flags: true if opening the
    /// corresponding volume file (sequence, header, hash, OID) has
    /// been attempted, whether or not it succeeded.
    mutable bool m_SeqFileOpened;
    mutable bool m_HdrFileOpened;
    mutable bool m_HashFileOpened;
    mutable bool m_OidFileOpened;

    // Per-resource mutexes.
    // NOTE(review): presumably each one serializes the lazy opening
    // of (or access to) the resource named in its suffix -- GI, PIG,
    // string and TI ISAM files, the sequence and header files, and
    // the cached offset ranges -- confirm against the implementation.
    mutable CFastMutex m_MtxGi;
    mutable CFastMutex m_MtxPig;
    mutable CFastMutex m_MtxStr;
    mutable CFastMutex m_MtxTi;
    mutable CFastMutex m_MtxSeq;
    mutable CFastMutex m_MtxHdr;
    mutable CFastMutex m_MtxCachedRange;

    // The column list is compiled out for WorkShop compilers at
    // version 550 or below and for MIPSpro.
#if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION  > 550)) && \
     (!defined(NCBI_COMPILER_MIPSPRO)) )
    /// Set of columns defined for this volume.
    vector< CRef<CSeqDBColumn> > m_Columns;
#endif
1442 };
1443 
1444 END_NCBI_SCOPE
1445 
1446 #endif // OBJTOOLS_READERS_SEQDB__SEQDBVOL_HPP
1447 
1448 
1449