1 #ifndef OBJTOOLS_READERS_SEQDB__SEQDBFILE_HPP
2 #define OBJTOOLS_READERS_SEQDB__SEQDBFILE_HPP
3 
4 /*  $Id: seqdbfile.hpp 553487 2017-12-18 14:23:38Z fongah2 $
5  * ===========================================================================
6  *
7  *                            PUBLIC DOMAIN NOTICE
8  *               National Center for Biotechnology Information
9  *
10  *  This software/database is a "United States Government Work" under the
11  *  terms of the United States Copyright Act.  It was written as part of
12  *  the author's official duties as a United States Government employee and
13  *  thus cannot be copyrighted.  This software/database is freely available
14  *  to the public for use. The National Library of Medicine and the U.S.
15  *  Government have not placed any restriction on its use or reproduction.
16  *
17  *  Although all reasonable efforts have been taken to ensure the accuracy
18  *  and reliability of the software and data, the NLM and the U.S.
19  *  Government do not and cannot warrant the performance or results that
20  *  may be obtained by using this software or data. The NLM and the U.S.
21  *  Government disclaim all warranties, express or implied, including
22  *  warranties of performance, merchantability or fitness for any particular
23  *  purpose.
24  *
25  *  Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Author:  Kevin Bealer
30  *
31  */
32 
33 /// @file seqdbfile.hpp
34 /// File access objects for CSeqDB.
35 ///
36 /// Defines classes:
37 ///     CSeqDBRawFile
38 ///     CSeqDBExtFile
39 ///     CSeqDBIdxFile
40 ///     CSeqDBSeqFile
41 ///     CSeqDBHdrFile
42 ///
43 /// Implemented for: UNIX, MS-Windows
44 
45 #include <objtools/blast/seqdb_reader/impl/seqdbgeneral.hpp>
46 #include <objtools/blast/seqdb_reader/impl/seqdbatlas.hpp>
47 
48 #include <corelib/ncbistr.hpp>
49 #include <corelib/ncbifile.hpp>
50 #include <corelib/ncbi_bswap.hpp>
51 #include <corelib/ncbiobj.hpp>
52 #include <objtools/blast/seqdb_reader/seqdbcommon.hpp>
53 #include <set>
54 
55 BEGIN_NCBI_SCOPE
56 
57 /// Raw file.
58 ///
59 /// This is the lowest level of SeqDB file object.  It controls basic
60 /// (byte data) access to the file, isolating higher levels from
61 /// differences in handling mmapped vs opened files.  This has mostly
62 /// become a thin wrapper around the Atlas functionality.
63 
64 class CSeqDBRawFile {
65 public:
66     /// Type which spans possible file offsets.
67     typedef CSeqDBAtlas::TIndx TIndx;
68 
69     /// Constructor
70     ///
71     /// Builds a "raw" file object, which is the lowest level of the
72     /// SeqDB file objects.  It provides byte swapping and reading
73     /// methods, which are implemented via the atlas layer.
74     ///
75     /// @param atlas
76     ///     The memory management layer object.
CSeqDBRawFile(CSeqDBAtlas & atlas)77     CSeqDBRawFile(CSeqDBAtlas & atlas)
78         : m_Atlas(atlas)
79     {
80     }
81 
82     /// MMap or Open a file.
83     ///
84     /// This serves to verify the existence of, open, and cache the
85     /// length of a file.
86     ///
87     /// @param name
88     ///   The filename to open.
89     /// @param locked
90     ///   The lock holder object for this thread.
91     /// @return
92     ///   true if the file was opened successfully.
Open(const CSeqDB_Path & name)93     bool Open(const CSeqDB_Path & name)
94     {
95         _ASSERT(name.Valid());
96 
97         // FIXME: should use path even in atlas code
98         bool success = m_Atlas.GetFileSizeL(name.GetPathS(), m_Length);
99 
100         if (success) {
101             m_FileName = name.GetPathS();
102         }
103 
104         return success;
105     }
106 
107     /// Get a pointer to a section of the file.
108     ///
109     /// This method insures that the memory lease has a hold that
110     /// includes the requested section of the file, and returns a
111     /// pointer to the start offset.
112     ///
113     /// @param lease
114     ///     The memory lease object for this file.
115     /// @param start
116     ///     The starting offset for the first byte of the region.
117     /// @param end
118     ///     The offset for the first byte after the region.
119     /// @param locked
120     ///     The lock holder object for this thread.
121     /// @return
122     ///     A pointer to the file data at the start offset.
GetFileDataPtr(CSeqDBFileMemMap & lease,TIndx start,TIndx end) const123     const char * GetFileDataPtr(CSeqDBFileMemMap & lease, // commented
124                            TIndx            start,
125                            TIndx            end) const
126     {
127         _ASSERT(! m_FileName.empty());
128         SEQDB_FILE_ASSERT(start    <  end);
129         SEQDB_FILE_ASSERT(m_Length >= end);
130 
131         const char *p = (const char *)lease.GetFileDataPtr(m_FileName,start);
132 
133         return p;
134     }
135 
136     /// Get the length of the file.
137     ///
138     /// The file length is returned as a four byte integer, which is
139     /// the current maximum size for the blastdb component files.
140     ///
141     /// @return
142     ///     The length of the file.
GetFileLength() const143     TIndx GetFileLength() const
144     {
145         return m_Length;
146     }
147 
148     /// Read a four byte numerical object from the file
149     ///
150     /// Given a pointer to an object in memory, this reads a numerical
151     /// value for it from the file.  The data in the file is assumed
152     /// to be in network byte order, and the user version in the local
153     /// default byte order (host order).  The size of the object is
154     /// taken as sizeof(Uint4).
155     ///
156     /// @param lease
157     ///     A memory lease object to use for the read.
158     /// @param offset
159     ///     The starting offset of the value in the file.
160     /// @param value
161     ///     A pointer to the object.
162     /// @param
163     ///     The lock holder object for this thread.
164     /// @return
165     ///     The offset of the first byte after the object.
166     TIndx ReadSwapped(CSeqDBFileMemMap & lease,
167                       TIndx            offset,
168                       Uint4          * value) const;
169 
170 
171     /// Read an eight byte numerical object from the file
172     ///
173     /// Given a pointer to an object in memory, this reads a numerical
174     /// value for it from the file.  The data in the file is assumed
175     /// to be in network byte order, and the user version in the local
176     /// default byte order (host order).  The size of the object is
177     /// taken as sizeof(Uint8).
178     ///
179     /// @param lease
180     ///     A memory lease object to use for the read.
181     /// @param offset
182     ///     The starting offset of the value in the file.
183     /// @param value
184     ///     A pointer to the object.
185     /// @param locked
186     ///     The lock holder object for this thread.
187     /// @return
188     ///     The offset of the first byte after the object.
189     TIndx ReadSwapped(CSeqDBFileMemMap & lease,
190                       TIndx            offset,
191                       Uint8          * value) const;
192 
193 
194     /// Read a string object from the file
195     ///
196     /// Given a pointer to a string object, this reads a string value
197     /// for it from the file.  The data in the file is assumed to be a
198     /// four byte length in network byte order, followed by the bytes
199     /// of the string.  The amount of data is this length + 4.
200     ///
201     /// @param lease
202     ///     A memory lease object to use for the read.
203     /// @param offset
204     ///     The starting offset of the string length in the file.
205     /// @param value
206     ///     A pointer to the returned string.
207     /// @param locked
208     ///     The lock holder object for this thread.
209     /// @return
210     ///     The offset of the first byte after the string.
211     TIndx ReadSwapped(CSeqDBFileMemMap & lease,
212                       TIndx            offset,
213                       string         * value) const;
214 
215 
216     /// Read part of the file into a buffer
217     ///
218     /// Copy the file data from offsets start to end into the array at
219     /// buf, which is assumed to already have been allocated.  This
220     /// method assumes the atlas lock is held.
221     ///
222     /// @param lease
223     ///     A memory lease object to use for the read.
224     /// @param buf
225     ///     The destination for the data to be read.
226     /// @param start
227     ///     The starting offset for the first byte to read.
228     /// @param end
229     ///     The offset for the first byte after the area to read.
230     inline void ReadBytes(CSeqDBFileMemMap & lease,
231                           char           * buf,
232                           TIndx            start,
233                           TIndx            end) const;
234 
235 private:
236     /// The memory management layer object.
237     CSeqDBAtlas & m_Atlas;
238 
239     /// The name of this file.
240     string m_FileName;
241 
242     /// The length of this file.
243     TIndx m_Length;
244 };
245 
246 
247 
248 /// Database component file
249 ///
250 /// This represents any database component file with an extension like
251 /// "pxx" or "nxx".  This finds the correct type (protein or
252 /// nucleotide) if that is unknown, and computes the filename based on
253 /// a filename template like "path/to/file/basename.-in".
254 ///
255 /// This also provides a 'protected' interface to the specific db
256 /// files, and defines a few useful methods.
257 
258 class CSeqDBExtFile : public CObject {
259 public:
260     /// Type which spans possible file offsets.
261     typedef CSeqDBAtlas::TIndx TIndx;
262 
263     /// Constructor
264     ///
265     /// This builds an object which has a few properties required by
266     /// most or all database volume component files.  This object
267     /// keeps a lease on the file from the first access until
268     /// instructed not to, moving and expanding that lease to cover
269     /// incoming requests.  By keeping a lease, lookups, file opens,
270     /// and other expensive operations are usually avoided on
271     /// subsequent calls.  This object also provides some methods to
272     /// read data in a byte swapped or direct way.
273     /// @param atlas
274     ///   The memory management layer object.
275     /// @param dbfilename
276     ///   The name of the managed file.
277     /// @param prot_nucl
278     ///   The sequence data type.
279     /// @param locked
280     ///   The lock holder object for this thread.
281     CSeqDBExtFile(CSeqDBAtlas    & atlas,
282                   const string   & dbfilename,
283                   char             prot_nucl);
284 
285 
286     /// Destructor
~CSeqDBExtFile()287     virtual ~CSeqDBExtFile()
288     {
289     }
290 
291 
292     /// Release memory held in the atlas layer by this object.
UnLease()293     void UnLease()
294     {
295         m_Lease.Clear();
296     }
297 
298 protected:
299 
300     /// Read part of the file into a buffer
301     ///
302     /// Copy the file data from offsets start to end into the array at
303     /// buf, which is assumed to already have been allocated.  This
304     /// method assumes the atlas lock is held.
305     ///
306     /// @param buf
307     ///     The destination for the data to be read.
308     /// @param start
309     ///     The starting offset for the first byte to read.
310     /// @param end
311     ///     The offset for the first byte after the area to read.
x_ReadBytes(char * buf,TIndx start,TIndx end) const312     void x_ReadBytes(char  * buf,
313                      TIndx   start,
314                      TIndx   end) const
315     {
316         m_File.ReadBytes(m_Lease, buf, start, end);
317     }
318 
319     /// Read a numerical object from the file
320     ///
321     /// Given a pointer to an object in memory, this reads a numerical
322     /// value for it from the file.  The data in the file is assumed
323     /// to be in network byte order, and the user version in the local
324     /// default byte order (host order).  The offset of the data is
325     /// provided, and the size of the object is taken as sizeof(T).
326     ///
327     /// @param lease
328     ///     A memory lease object to use for the read.
329     /// @param offset
330     ///     The starting offset of the object in the file.
331     /// @param value
332     ///     A pointer to the object.
333     /// @param locked
334     ///     The lock holder object for this thread.
335     /// @return
336     ///     The offset of the first byte after the object.
337     template<class T>
x_ReadSwapped(CSeqDBFileMemMap & lease,TIndx offset,T * value)338     TIndx x_ReadSwapped(CSeqDBFileMemMap & lease,
339                         TIndx            offset,
340                         T              * value)
341 
342     {
343         return m_File.ReadSwapped(lease, offset, value);
344     }
345 
346     /// Get the volume's sequence data type.
347     ///
348     /// This object knows which type of sequence data it deals with -
349     /// this method returns that information.
350     ///
351     /// @return
352     ///     The type of sequence data in use.
x_GetSeqType() const353     char x_GetSeqType() const
354     {
355         return m_ProtNucl;
356     }
357 
358     /// Sets the sequence data type.
359     ///
360     /// The sequence data will be set as protein or nucleotide.  An
361     /// exception is thrown if an invalid type is provided.  The first
362     /// character of the file extension will be modified to reflect
363     /// the sequence data type.
364     ///
365     /// @param prot_nucl
366     ///     Either 'p' or 'n' for protein or nucleotide.
367     void x_SetFileType(char prot_nucl);
368 
369     // Data
370 
371     /// The memory layer management object.
372     CSeqDBAtlas & m_Atlas;
373 
374     /// The name of this file.
375     string m_FileName;
376 
377     /// Either 'p' for protein or 'n' for nucleotide.
378     char m_ProtNucl;
379 
380     /// A memory lease used by this file.
381     mutable CSeqDBFileMemMap m_Lease;
382 
383     /// The raw file object.
384     CSeqDBRawFile m_File;
385 };
386 
x_SetFileType(char prot_nucl)387 void inline CSeqDBExtFile::x_SetFileType(char prot_nucl)
388 {
389     m_ProtNucl = prot_nucl;
390 
391     if ((m_ProtNucl != 'p') &&
392         (m_ProtNucl != 'n')) {
393 
394         NCBI_THROW(CSeqDBException, eArgErr,
395                    "Invalid argument: seq type must be 'p' or 'n'.");
396     }
397 
398     _ASSERT(m_FileName.size() >= 5);
399 
400     m_FileName[m_FileName.size() - 3] = m_ProtNucl;
401 }
402 
403 
404 /// Index file
405 ///
406 /// This is the .pin or .nin file; it provides indices into the other
407 /// files.  The version, title, date, and other summary information is
408 /// also stored here.
409 
410 class CSeqDBIdxFile : public CSeqDBExtFile {
411 public:
412     /// Constructor
413     ///
414     /// This builds an object which provides access to the index file
415     /// for a volume.  The index file contains metadata about the
416     /// volume, such as the title and construction date.  The index
417     /// file also contains indices into the header and sequence data
418     /// files.  Because these offsets are four byte integers, all
419     /// volumes have a size of no more than 2^32 bytes, but in
420     /// practice, they are usually kept under 2^30 bytes.
421     ///
422     /// @param atlas
423     ///   The memory management layer object.
424     /// @param dbname
425     ///   The name of the database volume.
426     /// @param prot_nucl
427     ///   The sequence data type.
428     /// @param locked
429     ///   The lock holder object for this thread.
430     CSeqDBIdxFile(CSeqDBAtlas    & atlas,
431                   const string   & dbname,
432                   char             prot_nucl);
433 
434 
435     /// Destructor
~CSeqDBIdxFile()436     virtual ~CSeqDBIdxFile()
437     {
438         // Synchronization removed from this path - it was causing a
439         // deadlock in an error path, and destruction and construction
440         // are necessarily single threaded in any case.
441 
442         //Verify();
443         UnLease();
444     }
445 
446     /// Get the location of a sequence's ambiguity data
447     ///
448     /// This method returns the offsets of the start and end of the
449     /// ambiguity data for a specific nucleotide sequence.  If this
450     /// range is non-empty, then this sequence has ambiguous regions,
451     /// which are encoded as a series of instructions for modifying
452     /// the compressed 4 base/byte nucleotide data.  The ambiguity
453     /// data is encoded as randomized noise, with the intention of
454     /// minimizing accidental matches.
455     ///
456     /// @param oid
457     ///   The sequence to get data for.
458     /// @param start
459     ///   The returned start offset of the sequence.
460     /// @param end
461     ///   The returned end offset of the sequence.
462     /// @return
463     ///   true if the sequence has ambiguity data.
464     inline bool
465     GetAmbStartEnd(int     oid,
466                    TIndx & start,
467                    TIndx & end) const;
468 
469     /// Get the location of a sequence's header data
470     ///
471     /// This method returns the offsets of the start and end of the
472     /// header data for a specific database sequence.  The header data
473     /// is a Blast-def-line-set in binary ASN.1.  This data includes
474     /// associated taxonomy data, Seq-ids, and membership bits.
475     ///
476     /// @param oid
477     ///   The sequence to get data for.
478     /// @param start
479     ///   The returned start offset of the sequence.
480     /// @param end
481     ///   The returned end offset of the sequence.
482     inline void
483     GetHdrStartEnd(int     oid,
484                    TIndx & start,
485                    TIndx & end) const;
486 
487     /// Get the location of a sequence's packed sequence data
488     ///
489     /// This method returns the offsets of the start and end of the
490     /// packed sequence data for a specific database sequence.  For
491     /// protein data, the packed version is the only supported
492     /// encoding, and is stored at one base per byte.  The header data
493     /// is encoded as a Blast-def-line-set in binary ASN.1.  This data
494     /// includes taxonomy information, Seq-ids for this sequence, and
495     /// membership bits.
496     ///
497     /// @param oid
498     ///   The sequence to get data for.
499     /// @param start
500     ///   The returned start offset of the sequence.
501     /// @param end
502     ///   The returned end offset of the sequence.
503     inline void
504     GetSeqStartEnd(int     oid,
505                    TIndx & start,
506                    TIndx & end) const;
507 
508     /// Get the location of a sequence's packed sequence data
509     ///
510     /// This method returns the offsets of the start and end of the
511     /// packed sequence data for a specific database sequence.  For
512     /// protein data, the packed version is the only supported
513     /// encoding, and is stored at one base per byte.  The header data
514     /// is encoded as a Blast-def-line-set in binary ASN.1.  This data
515     /// includes taxonomy information, Seq-ids for this sequence, and
516     /// membership bits.
517     ///
518     /// @param oid
519     ///   The sequence to get data for.
520     /// @param start
521     ///   The returned start offset of the sequence.
522     inline void
523     GetSeqStart(int     oid,
524                 TIndx & start) const;
525 
526     /// Get the sequence data type.
GetSeqType() const527     char GetSeqType() const
528     {
529         return x_GetSeqType();
530     }
531 
532     /// Get the volume title.
GetTitle() const533     string GetTitle() const
534     {
535         return m_Title;
536     }
537 
538     /// Get the construction date of the volume.
GetDate() const539     string GetDate() const
540     {
541         return m_Date;
542     }
543 
544     /// Get the number of oids in this volume.
GetNumOIDs() const545     int GetNumOIDs() const
546     {
547         return m_NumOIDs;
548     }
549 
550     /// Get the length of the volume (in bases).
GetVolumeLength() const551     Uint8 GetVolumeLength() const
552     {
553         return m_VolLen;
554     }
555 
556     /// Get the length of the longest sequence in this volume.
GetMaxLength() const557     int GetMaxLength() const
558     {
559         return m_MaxLen;
560     }
561 
562     /// Get the length of the shortest sequence in this volume.
GetMinLength() const563     int GetMinLength() const
564     {
565         return m_MinLen;
566     }
567 
568     /// Release any memory leases temporarily held here.
UnLease()569     void UnLease()
570     {
571         //Verify();
572         x_ClrHdr();
573         x_ClrSeq();
574         x_ClrAmb();
575     }
576 
GetLMDBFileName() const577     string GetLMDBFileName()const {return m_LMDBFile;}
578 
579     /// Verify the integrity of this object and subobjects.
580     /*
581     void Verify()
582     {
583         m_HdrLease.Verify();
584         m_SeqLease.Verify();
585         m_AmbLease.Verify();
586     }
587     */
588 private:
589 
590     /// A memory lease used by the header section of this file.
591     mutable CSeqDBFileMemMap m_HdrLease;
592     //mutable CMemoryFile *m_MmappedHdrIndex;
593 
594     /// A memory lease used by the sequence section of this file.
595     mutable CSeqDBFileMemMap m_SeqLease;
596     //mutable CMemoryFile* m_MmappedSeqIndex;
597 
598     /// A memory lease used by the ambiguity section of this file.
599     mutable CSeqDBFileMemMap m_AmbLease;
600     //mutable CMemoryFile *m_MmappedAmbIndex;
601 
602     // Swapped data from .[pn]in file
603 
604     /// The volume title.
605     string m_Title;
606 
607     /// The construction date of the volume.
608     string m_Date;
609 
610     /// The number of oids in this volume.
611     Uint4 m_NumOIDs;
612 
613     /// The length of the volume (in bases).
614     Uint8 m_VolLen;
615 
616     /// The length of the longest sequence in this volume.
617     Uint4 m_MaxLen;
618 
619     /// The length of the shortest sequence in this volume.
620     Uint4 m_MinLen;
621 
622     // Other pointers and indices
623 
624     // These can be mutable because they:
625     // 1. Do not constitute true object state.
626     // 2. Are modified only under lock (CSeqDBRawFile::m_Atlas.m_Lock).
627 
628     /// Return header data (assumes locked).
x_ClrHdr() const629     void x_ClrHdr() const
630     {
631        m_HdrLease.Clear();
632     }
633 
634     /// Return sequence data (assumes locked).
x_ClrSeq() const635     void x_ClrSeq() const
636     {
637         m_SeqLease.Clear();
638     }
639 
640     /// Return ambiguity data (assumes locked).
x_ClrAmb() const641     void x_ClrAmb() const
642     {
643         m_AmbLease.Clear();
644     }
645 
646     /// Get header data (assumes locked).
x_GetHdr() const647     Uint4 * x_GetHdr() const
648     {
649 
650         return (Uint4*) m_HdrLease.GetFileDataPtr(m_FileName, m_OffHdr);
651     }
652 
653     /// Get sequence data (assumes locked).
x_GetSeq() const654     Uint4 * x_GetSeq() const
655     {
656 
657         return (Uint4*) m_SeqLease.GetFileDataPtr(m_FileName, m_OffSeq);
658     }
659 
660     /// Get ambiguity data (assumes locked).
x_GetAmb() const661     Uint4 * x_GetAmb() const
662     {
663         _ASSERT(x_GetSeqType() == 'n');
664 
665         return (Uint4*) m_AmbLease.GetFileDataPtr(m_FileName, m_OffAmb);
666     }
667 
668 
669     /// offset of the start of the header section.
670     TIndx m_OffHdr;
671 
672     /// Offset of the end of the header section.
673     TIndx m_EndHdr;
674 
675     /// Offset of the start of the sequence section.
676     TIndx m_OffSeq;
677 
678     /// Offset of the end of the sequence section.
679     TIndx m_EndSeq;
680 
681     /// Offset of the start of the ambiguity section.
682     TIndx m_OffAmb;
683 
684     /// Offset of the end of the ambiguity section.
685     TIndx m_EndAmb;
686 
687     /// Name of matching SQLite file (empty if version 4 DB)
688     string m_LMDBFile;
689     /// Volume number (only set in version 5 DBs)
690     Uint4 m_Volume;
691 };
692 
693 bool
GetAmbStartEnd(int oid,TIndx & start,TIndx & end) const694 CSeqDBIdxFile::GetAmbStartEnd(int oid, TIndx & start, TIndx & end) const
695 {
696     if(!m_Lease.IsMapped()) m_Lease.Init();
697     if ('n' == x_GetSeqType()) {
698         start = SeqDB_GetStdOrd(& x_GetAmb()[oid]);
699         end   = SeqDB_GetStdOrd(& x_GetSeq()[oid+1]);
700 
701         return (start <= end);
702     }
703 
704     return false;
705 }
706 
707 void
GetHdrStartEnd(int oid,TIndx & start,TIndx & end) const708 CSeqDBIdxFile::GetHdrStartEnd(int oid, TIndx & start, TIndx & end) const
709 {
710     if(!m_Lease.IsMapped()) m_Lease.Init();
711     start = SeqDB_GetStdOrd(& x_GetHdr()[oid]);
712     end   = SeqDB_GetStdOrd(& x_GetHdr()[oid+1]);
713 }
714 
715 void
GetSeqStartEnd(int oid,TIndx & start,TIndx & end) const716 CSeqDBIdxFile::GetSeqStartEnd(int oid, TIndx & start, TIndx & end) const
717 {
718     if(!m_Lease.IsMapped()) m_Lease.Init();
719     start = SeqDB_GetStdOrd(& x_GetSeq()[oid]);
720 
721     if ('p' == x_GetSeqType()) {
722         end = SeqDB_GetStdOrd(& x_GetSeq()[oid+1]);
723     } else {
724         end = SeqDB_GetStdOrd(& x_GetAmb()[oid]);
725     }
726 }
727 
728 void
GetSeqStart(int oid,TIndx & start) const729 CSeqDBIdxFile::GetSeqStart(int oid, TIndx & start) const
730 {
731     if(!m_Lease.IsMapped()) m_Lease.Init();
732     start = SeqDB_GetStdOrd(& x_GetSeq()[oid]);
733 }
734 
735 
736 /// Sequence data file
737 ///
738 /// This is the .psq or .nsq file; it provides the raw sequence data,
739 /// and for nucleotide sequences, ambiguity data.  For nucleotide
740 /// sequences, the last byte will contain a two bit marker with a
741 /// number from 0-3, which indicates how much of the rest of that byte
742 /// is filled with base information (0-3 bases, which is 0-6 bits).
743 /// For ambiguous regions, the sequence data is normally randomized in
744 /// this file, to reduce the number of accidental false positives
745 /// during the search.  The ambiguity data encodes the location of,
746 /// and actual data for, those regions.
747 
748 class CSeqDBSeqFile : public CSeqDBExtFile {
749 public:
750     /// Type which spans possible file offsets.
751     typedef CSeqDBAtlas::TIndx TIndx;
752 
753     /// Constructor
754     ///
755     /// This builds an object which provides access to the sequence
756     /// data file for a volume.  This file is simply a concatenation
757     /// of all the sequence data for the database sequences.  In a
758     /// protein file, these are just the database sequences seperated
759     /// by NUL bytes.  In a nucleotide volume, the packed data for
760     /// each sequence is followed by ambiguity data for that sequence
761     /// (if any such data exists).
762     ///
763     /// @param atlas
764     ///   The memory management layer object.
765     /// @param dbname
766     ///   The name of the database volume.
767     /// @param prot_nucl
768     ///   The sequence data type.
769     /// @param locked
770     ///   The lock holder object for this thread.
CSeqDBSeqFile(CSeqDBAtlas & atlas,const string & dbname,char prot_nucl)771     CSeqDBSeqFile(CSeqDBAtlas    & atlas,
772                   const string   & dbname,
773                   char             prot_nucl)
774         : CSeqDBExtFile(atlas, dbname + ".-sq", prot_nucl)
775     {
776     }
777 
778     /// Destructor
~CSeqDBSeqFile()779     virtual ~CSeqDBSeqFile()
780     {
781     }
782 
783     /// Read part of the file into a buffer
784     ///
785     /// Copy the sequence data from offsets start to end into the
786     /// array at buf, which is assumed to already have been allocated.
787     /// This method assumes the atlas lock is held.
788     ///
789     /// @param buf
790     ///     The destination for the data to be read.
791     /// @param start
792     ///     The starting offset for the first byte to read.
793     /// @param end
794     ///     The offset for the first byte after the area to read.
ReadBytes(char * buf,TIndx start,TIndx end) const795     void ReadBytes(char  * buf,
796                    TIndx   start,
797                    TIndx   end) const
798     {
799         x_ReadBytes(buf, start, end);
800     }
801 
802     /// Get a pointer into the file contents.
803     ///
804     /// Copy the sequence data from offsets start to end into the
805     /// array at buf, which is assumed to already have been allocated.
806     /// This method assumes the atlas lock is held.  If the user will
807     /// take ownership of the memory region hold, the keep argument
808     /// should be specified as true.
809     ///
810     /// @param start
811     ///     The starting offset for the first byte to read.
812     /// @param end
813     ///     The offset for the first byte after the area to read.
814     /// @param keep
815     ///     True if an extra hold should be acquired on the data.
816     /// @param hold
817     ///     Specify true to get a request-duration hold.
818     /// @param locked
819     ///     The lock holder object for this thread.
820     /// @return
821     ///     A pointer into the file data.
GetFileDataPtr(TIndx start) const822     const char * GetFileDataPtr(TIndx            start) const // commented
823     {
824         const char *p = (const char *)m_Lease.GetFileDataPtr(start);
825 
826         return p;
827     }
828 };
829 
830 
831 /// Header file
832 ///
833 /// This is the .phr or .nhr file.  It contains descriptive data for
834 /// each sequence, including taxonomic information and identifiers for
835 /// sequence files.  The version, title, date, and other summary
836 /// information is also stored here.
837 
838 class CSeqDBHdrFile : public CSeqDBExtFile {
839 public:
840     /// Type which spans possible file offsets.
841     typedef CSeqDBAtlas::TIndx TIndx;
842 
843     /// Constructor
844     ///
845     /// This builds an object which provides access to the header data
846     /// file for a volume.  This file is simply a concatenation of the
847     /// header data for each object, stored as a Blast-def-line-set
848     /// objects in binary ASN.1.
849     ///
850     /// @param atlas
851     ///   The memory management layer object.
852     /// @param dbname
853     ///   The name of the database volume.
854     /// @param prot_nucl
855     ///   The sequence data type.
856     /// @param locked
857     ///   The lock holder object for this thread.
CSeqDBHdrFile(CSeqDBAtlas & atlas,const string & dbname,char prot_nucl)858     CSeqDBHdrFile(CSeqDBAtlas    & atlas,
859                   const string   & dbname,
860                   char             prot_nucl)
861         : CSeqDBExtFile(atlas, dbname + ".-hr", prot_nucl)
862     {
863     }
864 
865     /// Destructor
~CSeqDBHdrFile()866     virtual ~CSeqDBHdrFile()
867     {
868     }
869 
870     /// Read part of the file into a buffer
871     ///
872     /// Copy the sequence data from offsets start to end into the
873     /// array at buf, which is assumed to already have been allocated.
874     /// This method assumes the atlas lock is held.  If the user will
875     /// take ownership of the memory region hold, the keep argument
876     /// should be specified as true.
877     ///
878     /// @param buf
879     ///     The buffer to receive the data.
880     /// @param start
881     ///     The starting offset for the first byte to read.
882     /// @param end
883     ///     The offset for the first byte after the area to read.
ReadBytes(char * buf,TIndx start,TIndx end) const884     void ReadBytes(char  * buf,
885                    TIndx   start,
886                    TIndx   end) const
887     {
888         x_ReadBytes(buf, start, end);
889     }
890 
891     /// Read part of the file into a buffer
892     ///
893     /// Copy the sequence data from offsets start to end into the
894     /// array at buf, which is assumed to already have been allocated.
895     /// This method assumes the atlas lock is held.  If the user will
896     /// take ownership of the memory region hold, the keep argument
897     /// should be specified as true.
898     ///
899     /// @param start
900     ///     The starting offset for the first byte to read.
901     /// @param end
902     ///     The offset for the first byte after the area to read.
903     /// @param locked
904     ///     The lock holder object for this thread.
905     /// @return
906     ///     A pointer into the file data.
GetFileDataPtr(TIndx start) const907     const char * GetFileDataPtr(TIndx            start) const // commented
908     {
909         // Header data never requires the 'hold' option because asn.1
910         // processing is done immediately.
911 
912         const char *p = (const char *)m_Lease.GetFileDataPtr(start);
913         return p;
914     }
915 };
916 
917 
918 // Does not modify (or use) internal file offset
919 
920 // Assumes locked.
921 
ReadBytes(CSeqDBFileMemMap & lease,char * buf,TIndx start,TIndx end) const922 void CSeqDBRawFile::ReadBytes(CSeqDBFileMemMap & lease,
923                               char           * buf,
924                               TIndx            start,
925                               TIndx            end) const
926 {
927       memcpy(buf, lease.GetFileDataPtr(m_FileName,start), end-start);
928 
929 }
930 
931 END_NCBI_SCOPE
932 
933 #endif // OBJTOOLS_READERS_SEQDB__SEQDBFILE_HPP
934 
935 
936