1 #ifndef OBJTOOLS_READERS_SEQDB__SEQDBFILE_HPP
2 #define OBJTOOLS_READERS_SEQDB__SEQDBFILE_HPP
3
4 /* $Id: seqdbfile.hpp 553487 2017-12-18 14:23:38Z fongah2 $
5 * ===========================================================================
6 *
7 * PUBLIC DOMAIN NOTICE
8 * National Center for Biotechnology Information
9 *
10 * This software/database is a "United States Government Work" under the
11 * terms of the United States Copyright Act. It was written as part of
12 * the author's official duties as a United States Government employee and
13 * thus cannot be copyrighted. This software/database is freely available
14 * to the public for use. The National Library of Medicine and the U.S.
15 * Government have not placed any restriction on its use or reproduction.
16 *
17 * Although all reasonable efforts have been taken to ensure the accuracy
18 * and reliability of the software and data, the NLM and the U.S.
19 * Government do not and cannot warrant the performance or results that
20 * may be obtained by using this software or data. The NLM and the U.S.
21 * Government disclaim all warranties, express or implied, including
22 * warranties of performance, merchantability or fitness for any particular
23 * purpose.
24 *
25 * Please cite the author in any work or product based on this material.
26 *
27 * ===========================================================================
28 *
29 * Author: Kevin Bealer
30 *
31 */
32
33 /// @file seqdbfile.hpp
34 /// File access objects for CSeqDB.
35 ///
36 /// Defines classes:
37 /// CSeqDBRawFile
38 /// CSeqDBExtFile
39 /// CSeqDBIdxFile
40 /// CSeqDBSeqFile
41 /// CSeqDBHdrFile
42 ///
43 /// Implemented for: UNIX, MS-Windows
44
45 #include <objtools/blast/seqdb_reader/impl/seqdbgeneral.hpp>
46 #include <objtools/blast/seqdb_reader/impl/seqdbatlas.hpp>
47
48 #include <corelib/ncbistr.hpp>
49 #include <corelib/ncbifile.hpp>
50 #include <corelib/ncbi_bswap.hpp>
51 #include <corelib/ncbiobj.hpp>
52 #include <objtools/blast/seqdb_reader/seqdbcommon.hpp>
53 #include <set>
54
55 BEGIN_NCBI_SCOPE
56
57 /// Raw file.
58 ///
59 /// This is the lowest level of SeqDB file object. It controls basic
60 /// (byte data) access to the file, isolating higher levels from
61 /// differences in handling mmapped vs opened files. This has mostly
62 /// become a thin wrapper around the Atlas functionality.
63
64 class CSeqDBRawFile {
65 public:
66 /// Type which spans possible file offsets.
67 typedef CSeqDBAtlas::TIndx TIndx;
68
69 /// Constructor
70 ///
71 /// Builds a "raw" file object, which is the lowest level of the
72 /// SeqDB file objects. It provides byte swapping and reading
73 /// methods, which are implemented via the atlas layer.
74 ///
75 /// @param atlas
76 /// The memory management layer object.
CSeqDBRawFile(CSeqDBAtlas & atlas)77 CSeqDBRawFile(CSeqDBAtlas & atlas)
78 : m_Atlas(atlas)
79 {
80 }
81
82 /// MMap or Open a file.
83 ///
84 /// This serves to verify the existence of, open, and cache the
85 /// length of a file.
86 ///
87 /// @param name
88 /// The filename to open.
89 /// @param locked
90 /// The lock holder object for this thread.
91 /// @return
92 /// true if the file was opened successfully.
Open(const CSeqDB_Path & name)93 bool Open(const CSeqDB_Path & name)
94 {
95 _ASSERT(name.Valid());
96
97 // FIXME: should use path even in atlas code
98 bool success = m_Atlas.GetFileSizeL(name.GetPathS(), m_Length);
99
100 if (success) {
101 m_FileName = name.GetPathS();
102 }
103
104 return success;
105 }
106
107 /// Get a pointer to a section of the file.
108 ///
109 /// This method insures that the memory lease has a hold that
110 /// includes the requested section of the file, and returns a
111 /// pointer to the start offset.
112 ///
113 /// @param lease
114 /// The memory lease object for this file.
115 /// @param start
116 /// The starting offset for the first byte of the region.
117 /// @param end
118 /// The offset for the first byte after the region.
119 /// @param locked
120 /// The lock holder object for this thread.
121 /// @return
122 /// A pointer to the file data at the start offset.
GetFileDataPtr(CSeqDBFileMemMap & lease,TIndx start,TIndx end) const123 const char * GetFileDataPtr(CSeqDBFileMemMap & lease, // commented
124 TIndx start,
125 TIndx end) const
126 {
127 _ASSERT(! m_FileName.empty());
128 SEQDB_FILE_ASSERT(start < end);
129 SEQDB_FILE_ASSERT(m_Length >= end);
130
131 const char *p = (const char *)lease.GetFileDataPtr(m_FileName,start);
132
133 return p;
134 }
135
136 /// Get the length of the file.
137 ///
138 /// The file length is returned as a four byte integer, which is
139 /// the current maximum size for the blastdb component files.
140 ///
141 /// @return
142 /// The length of the file.
GetFileLength() const143 TIndx GetFileLength() const
144 {
145 return m_Length;
146 }
147
148 /// Read a four byte numerical object from the file
149 ///
150 /// Given a pointer to an object in memory, this reads a numerical
151 /// value for it from the file. The data in the file is assumed
152 /// to be in network byte order, and the user version in the local
153 /// default byte order (host order). The size of the object is
154 /// taken as sizeof(Uint4).
155 ///
156 /// @param lease
157 /// A memory lease object to use for the read.
158 /// @param offset
159 /// The starting offset of the value in the file.
160 /// @param value
161 /// A pointer to the object.
162 /// @param
163 /// The lock holder object for this thread.
164 /// @return
165 /// The offset of the first byte after the object.
166 TIndx ReadSwapped(CSeqDBFileMemMap & lease,
167 TIndx offset,
168 Uint4 * value) const;
169
170
171 /// Read an eight byte numerical object from the file
172 ///
173 /// Given a pointer to an object in memory, this reads a numerical
174 /// value for it from the file. The data in the file is assumed
175 /// to be in network byte order, and the user version in the local
176 /// default byte order (host order). The size of the object is
177 /// taken as sizeof(Uint8).
178 ///
179 /// @param lease
180 /// A memory lease object to use for the read.
181 /// @param offset
182 /// The starting offset of the value in the file.
183 /// @param value
184 /// A pointer to the object.
185 /// @param locked
186 /// The lock holder object for this thread.
187 /// @return
188 /// The offset of the first byte after the object.
189 TIndx ReadSwapped(CSeqDBFileMemMap & lease,
190 TIndx offset,
191 Uint8 * value) const;
192
193
194 /// Read a string object from the file
195 ///
196 /// Given a pointer to a string object, this reads a string value
197 /// for it from the file. The data in the file is assumed to be a
198 /// four byte length in network byte order, followed by the bytes
199 /// of the string. The amount of data is this length + 4.
200 ///
201 /// @param lease
202 /// A memory lease object to use for the read.
203 /// @param offset
204 /// The starting offset of the string length in the file.
205 /// @param value
206 /// A pointer to the returned string.
207 /// @param locked
208 /// The lock holder object for this thread.
209 /// @return
210 /// The offset of the first byte after the string.
211 TIndx ReadSwapped(CSeqDBFileMemMap & lease,
212 TIndx offset,
213 string * value) const;
214
215
216 /// Read part of the file into a buffer
217 ///
218 /// Copy the file data from offsets start to end into the array at
219 /// buf, which is assumed to already have been allocated. This
220 /// method assumes the atlas lock is held.
221 ///
222 /// @param lease
223 /// A memory lease object to use for the read.
224 /// @param buf
225 /// The destination for the data to be read.
226 /// @param start
227 /// The starting offset for the first byte to read.
228 /// @param end
229 /// The offset for the first byte after the area to read.
230 inline void ReadBytes(CSeqDBFileMemMap & lease,
231 char * buf,
232 TIndx start,
233 TIndx end) const;
234
235 private:
236 /// The memory management layer object.
237 CSeqDBAtlas & m_Atlas;
238
239 /// The name of this file.
240 string m_FileName;
241
242 /// The length of this file.
243 TIndx m_Length;
244 };
245
246
247
248 /// Database component file
249 ///
250 /// This represents any database component file with an extension like
251 /// "pxx" or "nxx". This finds the correct type (protein or
252 /// nucleotide) if that is unknown, and computes the filename based on
253 /// a filename template like "path/to/file/basename.-in".
254 ///
255 /// This also provides a 'protected' interface to the specific db
256 /// files, and defines a few useful methods.
257
258 class CSeqDBExtFile : public CObject {
259 public:
260 /// Type which spans possible file offsets.
261 typedef CSeqDBAtlas::TIndx TIndx;
262
263 /// Constructor
264 ///
265 /// This builds an object which has a few properties required by
266 /// most or all database volume component files. This object
267 /// keeps a lease on the file from the first access until
268 /// instructed not to, moving and expanding that lease to cover
269 /// incoming requests. By keeping a lease, lookups, file opens,
270 /// and other expensive operations are usually avoided on
271 /// subsequent calls. This object also provides some methods to
272 /// read data in a byte swapped or direct way.
273 /// @param atlas
274 /// The memory management layer object.
275 /// @param dbfilename
276 /// The name of the managed file.
277 /// @param prot_nucl
278 /// The sequence data type.
279 /// @param locked
280 /// The lock holder object for this thread.
281 CSeqDBExtFile(CSeqDBAtlas & atlas,
282 const string & dbfilename,
283 char prot_nucl);
284
285
286 /// Destructor
~CSeqDBExtFile()287 virtual ~CSeqDBExtFile()
288 {
289 }
290
291
292 /// Release memory held in the atlas layer by this object.
UnLease()293 void UnLease()
294 {
295 m_Lease.Clear();
296 }
297
298 protected:
299
300 /// Read part of the file into a buffer
301 ///
302 /// Copy the file data from offsets start to end into the array at
303 /// buf, which is assumed to already have been allocated. This
304 /// method assumes the atlas lock is held.
305 ///
306 /// @param buf
307 /// The destination for the data to be read.
308 /// @param start
309 /// The starting offset for the first byte to read.
310 /// @param end
311 /// The offset for the first byte after the area to read.
x_ReadBytes(char * buf,TIndx start,TIndx end) const312 void x_ReadBytes(char * buf,
313 TIndx start,
314 TIndx end) const
315 {
316 m_File.ReadBytes(m_Lease, buf, start, end);
317 }
318
319 /// Read a numerical object from the file
320 ///
321 /// Given a pointer to an object in memory, this reads a numerical
322 /// value for it from the file. The data in the file is assumed
323 /// to be in network byte order, and the user version in the local
324 /// default byte order (host order). The offset of the data is
325 /// provided, and the size of the object is taken as sizeof(T).
326 ///
327 /// @param lease
328 /// A memory lease object to use for the read.
329 /// @param offset
330 /// The starting offset of the object in the file.
331 /// @param value
332 /// A pointer to the object.
333 /// @param locked
334 /// The lock holder object for this thread.
335 /// @return
336 /// The offset of the first byte after the object.
337 template<class T>
x_ReadSwapped(CSeqDBFileMemMap & lease,TIndx offset,T * value)338 TIndx x_ReadSwapped(CSeqDBFileMemMap & lease,
339 TIndx offset,
340 T * value)
341
342 {
343 return m_File.ReadSwapped(lease, offset, value);
344 }
345
346 /// Get the volume's sequence data type.
347 ///
348 /// This object knows which type of sequence data it deals with -
349 /// this method returns that information.
350 ///
351 /// @return
352 /// The type of sequence data in use.
x_GetSeqType() const353 char x_GetSeqType() const
354 {
355 return m_ProtNucl;
356 }
357
358 /// Sets the sequence data type.
359 ///
360 /// The sequence data will be set as protein or nucleotide. An
361 /// exception is thrown if an invalid type is provided. The first
362 /// character of the file extension will be modified to reflect
363 /// the sequence data type.
364 ///
365 /// @param prot_nucl
366 /// Either 'p' or 'n' for protein or nucleotide.
367 void x_SetFileType(char prot_nucl);
368
369 // Data
370
371 /// The memory layer management object.
372 CSeqDBAtlas & m_Atlas;
373
374 /// The name of this file.
375 string m_FileName;
376
377 /// Either 'p' for protein or 'n' for nucleotide.
378 char m_ProtNucl;
379
380 /// A memory lease used by this file.
381 mutable CSeqDBFileMemMap m_Lease;
382
383 /// The raw file object.
384 CSeqDBRawFile m_File;
385 };
386
x_SetFileType(char prot_nucl)387 void inline CSeqDBExtFile::x_SetFileType(char prot_nucl)
388 {
389 m_ProtNucl = prot_nucl;
390
391 if ((m_ProtNucl != 'p') &&
392 (m_ProtNucl != 'n')) {
393
394 NCBI_THROW(CSeqDBException, eArgErr,
395 "Invalid argument: seq type must be 'p' or 'n'.");
396 }
397
398 _ASSERT(m_FileName.size() >= 5);
399
400 m_FileName[m_FileName.size() - 3] = m_ProtNucl;
401 }
402
403
404 /// Index file
405 ///
406 /// This is the .pin or .nin file; it provides indices into the other
407 /// files. The version, title, date, and other summary information is
408 /// also stored here.
409
410 class CSeqDBIdxFile : public CSeqDBExtFile {
411 public:
412 /// Constructor
413 ///
414 /// This builds an object which provides access to the index file
415 /// for a volume. The index file contains metadata about the
416 /// volume, such as the title and construction date. The index
417 /// file also contains indices into the header and sequence data
418 /// files. Because these offsets are four byte integers, all
419 /// volumes have a size of no more than 2^32 bytes, but in
420 /// practice, they are usually kept under 2^30 bytes.
421 ///
422 /// @param atlas
423 /// The memory management layer object.
424 /// @param dbname
425 /// The name of the database volume.
426 /// @param prot_nucl
427 /// The sequence data type.
428 /// @param locked
429 /// The lock holder object for this thread.
430 CSeqDBIdxFile(CSeqDBAtlas & atlas,
431 const string & dbname,
432 char prot_nucl);
433
434
435 /// Destructor
~CSeqDBIdxFile()436 virtual ~CSeqDBIdxFile()
437 {
438 // Synchronization removed from this path - it was causing a
439 // deadlock in an error path, and destruction and construction
440 // are necessarily single threaded in any case.
441
442 //Verify();
443 UnLease();
444 }
445
446 /// Get the location of a sequence's ambiguity data
447 ///
448 /// This method returns the offsets of the start and end of the
449 /// ambiguity data for a specific nucleotide sequence. If this
450 /// range is non-empty, then this sequence has ambiguous regions,
451 /// which are encoded as a series of instructions for modifying
452 /// the compressed 4 base/byte nucleotide data. The ambiguity
453 /// data is encoded as randomized noise, with the intention of
454 /// minimizing accidental matches.
455 ///
456 /// @param oid
457 /// The sequence to get data for.
458 /// @param start
459 /// The returned start offset of the sequence.
460 /// @param end
461 /// The returned end offset of the sequence.
462 /// @return
463 /// true if the sequence has ambiguity data.
464 inline bool
465 GetAmbStartEnd(int oid,
466 TIndx & start,
467 TIndx & end) const;
468
469 /// Get the location of a sequence's header data
470 ///
471 /// This method returns the offsets of the start and end of the
472 /// header data for a specific database sequence. The header data
473 /// is a Blast-def-line-set in binary ASN.1. This data includes
474 /// associated taxonomy data, Seq-ids, and membership bits.
475 ///
476 /// @param oid
477 /// The sequence to get data for.
478 /// @param start
479 /// The returned start offset of the sequence.
480 /// @param end
481 /// The returned end offset of the sequence.
482 inline void
483 GetHdrStartEnd(int oid,
484 TIndx & start,
485 TIndx & end) const;
486
487 /// Get the location of a sequence's packed sequence data
488 ///
489 /// This method returns the offsets of the start and end of the
490 /// packed sequence data for a specific database sequence. For
491 /// protein data, the packed version is the only supported
492 /// encoding, and is stored at one base per byte. The header data
493 /// is encoded as a Blast-def-line-set in binary ASN.1. This data
494 /// includes taxonomy information, Seq-ids for this sequence, and
495 /// membership bits.
496 ///
497 /// @param oid
498 /// The sequence to get data for.
499 /// @param start
500 /// The returned start offset of the sequence.
501 /// @param end
502 /// The returned end offset of the sequence.
503 inline void
504 GetSeqStartEnd(int oid,
505 TIndx & start,
506 TIndx & end) const;
507
508 /// Get the location of a sequence's packed sequence data
509 ///
510 /// This method returns the offsets of the start and end of the
511 /// packed sequence data for a specific database sequence. For
512 /// protein data, the packed version is the only supported
513 /// encoding, and is stored at one base per byte. The header data
514 /// is encoded as a Blast-def-line-set in binary ASN.1. This data
515 /// includes taxonomy information, Seq-ids for this sequence, and
516 /// membership bits.
517 ///
518 /// @param oid
519 /// The sequence to get data for.
520 /// @param start
521 /// The returned start offset of the sequence.
522 inline void
523 GetSeqStart(int oid,
524 TIndx & start) const;
525
526 /// Get the sequence data type.
GetSeqType() const527 char GetSeqType() const
528 {
529 return x_GetSeqType();
530 }
531
532 /// Get the volume title.
GetTitle() const533 string GetTitle() const
534 {
535 return m_Title;
536 }
537
538 /// Get the construction date of the volume.
GetDate() const539 string GetDate() const
540 {
541 return m_Date;
542 }
543
544 /// Get the number of oids in this volume.
GetNumOIDs() const545 int GetNumOIDs() const
546 {
547 return m_NumOIDs;
548 }
549
550 /// Get the length of the volume (in bases).
GetVolumeLength() const551 Uint8 GetVolumeLength() const
552 {
553 return m_VolLen;
554 }
555
556 /// Get the length of the longest sequence in this volume.
GetMaxLength() const557 int GetMaxLength() const
558 {
559 return m_MaxLen;
560 }
561
562 /// Get the length of the shortest sequence in this volume.
GetMinLength() const563 int GetMinLength() const
564 {
565 return m_MinLen;
566 }
567
568 /// Release any memory leases temporarily held here.
UnLease()569 void UnLease()
570 {
571 //Verify();
572 x_ClrHdr();
573 x_ClrSeq();
574 x_ClrAmb();
575 }
576
GetLMDBFileName() const577 string GetLMDBFileName()const {return m_LMDBFile;}
578
579 /// Verify the integrity of this object and subobjects.
580 /*
581 void Verify()
582 {
583 m_HdrLease.Verify();
584 m_SeqLease.Verify();
585 m_AmbLease.Verify();
586 }
587 */
588 private:
589
590 /// A memory lease used by the header section of this file.
591 mutable CSeqDBFileMemMap m_HdrLease;
592 //mutable CMemoryFile *m_MmappedHdrIndex;
593
594 /// A memory lease used by the sequence section of this file.
595 mutable CSeqDBFileMemMap m_SeqLease;
596 //mutable CMemoryFile* m_MmappedSeqIndex;
597
598 /// A memory lease used by the ambiguity section of this file.
599 mutable CSeqDBFileMemMap m_AmbLease;
600 //mutable CMemoryFile *m_MmappedAmbIndex;
601
602 // Swapped data from .[pn]in file
603
604 /// The volume title.
605 string m_Title;
606
607 /// The construction date of the volume.
608 string m_Date;
609
610 /// The number of oids in this volume.
611 Uint4 m_NumOIDs;
612
613 /// The length of the volume (in bases).
614 Uint8 m_VolLen;
615
616 /// The length of the longest sequence in this volume.
617 Uint4 m_MaxLen;
618
619 /// The length of the shortest sequence in this volume.
620 Uint4 m_MinLen;
621
622 // Other pointers and indices
623
624 // These can be mutable because they:
625 // 1. Do not constitute true object state.
626 // 2. Are modified only under lock (CSeqDBRawFile::m_Atlas.m_Lock).
627
628 /// Return header data (assumes locked).
x_ClrHdr() const629 void x_ClrHdr() const
630 {
631 m_HdrLease.Clear();
632 }
633
634 /// Return sequence data (assumes locked).
x_ClrSeq() const635 void x_ClrSeq() const
636 {
637 m_SeqLease.Clear();
638 }
639
640 /// Return ambiguity data (assumes locked).
x_ClrAmb() const641 void x_ClrAmb() const
642 {
643 m_AmbLease.Clear();
644 }
645
646 /// Get header data (assumes locked).
x_GetHdr() const647 Uint4 * x_GetHdr() const
648 {
649
650 return (Uint4*) m_HdrLease.GetFileDataPtr(m_FileName, m_OffHdr);
651 }
652
653 /// Get sequence data (assumes locked).
x_GetSeq() const654 Uint4 * x_GetSeq() const
655 {
656
657 return (Uint4*) m_SeqLease.GetFileDataPtr(m_FileName, m_OffSeq);
658 }
659
660 /// Get ambiguity data (assumes locked).
x_GetAmb() const661 Uint4 * x_GetAmb() const
662 {
663 _ASSERT(x_GetSeqType() == 'n');
664
665 return (Uint4*) m_AmbLease.GetFileDataPtr(m_FileName, m_OffAmb);
666 }
667
668
669 /// offset of the start of the header section.
670 TIndx m_OffHdr;
671
672 /// Offset of the end of the header section.
673 TIndx m_EndHdr;
674
675 /// Offset of the start of the sequence section.
676 TIndx m_OffSeq;
677
678 /// Offset of the end of the sequence section.
679 TIndx m_EndSeq;
680
681 /// Offset of the start of the ambiguity section.
682 TIndx m_OffAmb;
683
684 /// Offset of the end of the ambiguity section.
685 TIndx m_EndAmb;
686
687 /// Name of matching SQLite file (empty if version 4 DB)
688 string m_LMDBFile;
689 /// Volume number (only set in version 5 DBs)
690 Uint4 m_Volume;
691 };
692
693 bool
GetAmbStartEnd(int oid,TIndx & start,TIndx & end) const694 CSeqDBIdxFile::GetAmbStartEnd(int oid, TIndx & start, TIndx & end) const
695 {
696 if(!m_Lease.IsMapped()) m_Lease.Init();
697 if ('n' == x_GetSeqType()) {
698 start = SeqDB_GetStdOrd(& x_GetAmb()[oid]);
699 end = SeqDB_GetStdOrd(& x_GetSeq()[oid+1]);
700
701 return (start <= end);
702 }
703
704 return false;
705 }
706
707 void
GetHdrStartEnd(int oid,TIndx & start,TIndx & end) const708 CSeqDBIdxFile::GetHdrStartEnd(int oid, TIndx & start, TIndx & end) const
709 {
710 if(!m_Lease.IsMapped()) m_Lease.Init();
711 start = SeqDB_GetStdOrd(& x_GetHdr()[oid]);
712 end = SeqDB_GetStdOrd(& x_GetHdr()[oid+1]);
713 }
714
715 void
GetSeqStartEnd(int oid,TIndx & start,TIndx & end) const716 CSeqDBIdxFile::GetSeqStartEnd(int oid, TIndx & start, TIndx & end) const
717 {
718 if(!m_Lease.IsMapped()) m_Lease.Init();
719 start = SeqDB_GetStdOrd(& x_GetSeq()[oid]);
720
721 if ('p' == x_GetSeqType()) {
722 end = SeqDB_GetStdOrd(& x_GetSeq()[oid+1]);
723 } else {
724 end = SeqDB_GetStdOrd(& x_GetAmb()[oid]);
725 }
726 }
727
728 void
GetSeqStart(int oid,TIndx & start) const729 CSeqDBIdxFile::GetSeqStart(int oid, TIndx & start) const
730 {
731 if(!m_Lease.IsMapped()) m_Lease.Init();
732 start = SeqDB_GetStdOrd(& x_GetSeq()[oid]);
733 }
734
735
736 /// Sequence data file
737 ///
738 /// This is the .psq or .nsq file; it provides the raw sequence data,
739 /// and for nucleotide sequences, ambiguity data. For nucleotide
740 /// sequences, the last byte will contain a two bit marker with a
741 /// number from 0-3, which indicates how much of the rest of that byte
742 /// is filled with base information (0-3 bases, which is 0-6 bits).
743 /// For ambiguous regions, the sequence data is normally randomized in
744 /// this file, to reduce the number of accidental false positives
745 /// during the search. The ambiguity data encodes the location of,
746 /// and actual data for, those regions.
747
748 class CSeqDBSeqFile : public CSeqDBExtFile {
749 public:
750 /// Type which spans possible file offsets.
751 typedef CSeqDBAtlas::TIndx TIndx;
752
753 /// Constructor
754 ///
755 /// This builds an object which provides access to the sequence
756 /// data file for a volume. This file is simply a concatenation
757 /// of all the sequence data for the database sequences. In a
758 /// protein file, these are just the database sequences seperated
759 /// by NUL bytes. In a nucleotide volume, the packed data for
760 /// each sequence is followed by ambiguity data for that sequence
761 /// (if any such data exists).
762 ///
763 /// @param atlas
764 /// The memory management layer object.
765 /// @param dbname
766 /// The name of the database volume.
767 /// @param prot_nucl
768 /// The sequence data type.
769 /// @param locked
770 /// The lock holder object for this thread.
CSeqDBSeqFile(CSeqDBAtlas & atlas,const string & dbname,char prot_nucl)771 CSeqDBSeqFile(CSeqDBAtlas & atlas,
772 const string & dbname,
773 char prot_nucl)
774 : CSeqDBExtFile(atlas, dbname + ".-sq", prot_nucl)
775 {
776 }
777
778 /// Destructor
~CSeqDBSeqFile()779 virtual ~CSeqDBSeqFile()
780 {
781 }
782
783 /// Read part of the file into a buffer
784 ///
785 /// Copy the sequence data from offsets start to end into the
786 /// array at buf, which is assumed to already have been allocated.
787 /// This method assumes the atlas lock is held.
788 ///
789 /// @param buf
790 /// The destination for the data to be read.
791 /// @param start
792 /// The starting offset for the first byte to read.
793 /// @param end
794 /// The offset for the first byte after the area to read.
ReadBytes(char * buf,TIndx start,TIndx end) const795 void ReadBytes(char * buf,
796 TIndx start,
797 TIndx end) const
798 {
799 x_ReadBytes(buf, start, end);
800 }
801
802 /// Get a pointer into the file contents.
803 ///
804 /// Copy the sequence data from offsets start to end into the
805 /// array at buf, which is assumed to already have been allocated.
806 /// This method assumes the atlas lock is held. If the user will
807 /// take ownership of the memory region hold, the keep argument
808 /// should be specified as true.
809 ///
810 /// @param start
811 /// The starting offset for the first byte to read.
812 /// @param end
813 /// The offset for the first byte after the area to read.
814 /// @param keep
815 /// True if an extra hold should be acquired on the data.
816 /// @param hold
817 /// Specify true to get a request-duration hold.
818 /// @param locked
819 /// The lock holder object for this thread.
820 /// @return
821 /// A pointer into the file data.
GetFileDataPtr(TIndx start) const822 const char * GetFileDataPtr(TIndx start) const // commented
823 {
824 const char *p = (const char *)m_Lease.GetFileDataPtr(start);
825
826 return p;
827 }
828 };
829
830
831 /// Header file
832 ///
833 /// This is the .phr or .nhr file. It contains descriptive data for
834 /// each sequence, including taxonomic information and identifiers for
835 /// sequence files. The version, title, date, and other summary
836 /// information is also stored here.
837
838 class CSeqDBHdrFile : public CSeqDBExtFile {
839 public:
840 /// Type which spans possible file offsets.
841 typedef CSeqDBAtlas::TIndx TIndx;
842
843 /// Constructor
844 ///
845 /// This builds an object which provides access to the header data
846 /// file for a volume. This file is simply a concatenation of the
847 /// header data for each object, stored as a Blast-def-line-set
848 /// objects in binary ASN.1.
849 ///
850 /// @param atlas
851 /// The memory management layer object.
852 /// @param dbname
853 /// The name of the database volume.
854 /// @param prot_nucl
855 /// The sequence data type.
856 /// @param locked
857 /// The lock holder object for this thread.
CSeqDBHdrFile(CSeqDBAtlas & atlas,const string & dbname,char prot_nucl)858 CSeqDBHdrFile(CSeqDBAtlas & atlas,
859 const string & dbname,
860 char prot_nucl)
861 : CSeqDBExtFile(atlas, dbname + ".-hr", prot_nucl)
862 {
863 }
864
865 /// Destructor
~CSeqDBHdrFile()866 virtual ~CSeqDBHdrFile()
867 {
868 }
869
870 /// Read part of the file into a buffer
871 ///
872 /// Copy the sequence data from offsets start to end into the
873 /// array at buf, which is assumed to already have been allocated.
874 /// This method assumes the atlas lock is held. If the user will
875 /// take ownership of the memory region hold, the keep argument
876 /// should be specified as true.
877 ///
878 /// @param buf
879 /// The buffer to receive the data.
880 /// @param start
881 /// The starting offset for the first byte to read.
882 /// @param end
883 /// The offset for the first byte after the area to read.
ReadBytes(char * buf,TIndx start,TIndx end) const884 void ReadBytes(char * buf,
885 TIndx start,
886 TIndx end) const
887 {
888 x_ReadBytes(buf, start, end);
889 }
890
891 /// Read part of the file into a buffer
892 ///
893 /// Copy the sequence data from offsets start to end into the
894 /// array at buf, which is assumed to already have been allocated.
895 /// This method assumes the atlas lock is held. If the user will
896 /// take ownership of the memory region hold, the keep argument
897 /// should be specified as true.
898 ///
899 /// @param start
900 /// The starting offset for the first byte to read.
901 /// @param end
902 /// The offset for the first byte after the area to read.
903 /// @param locked
904 /// The lock holder object for this thread.
905 /// @return
906 /// A pointer into the file data.
GetFileDataPtr(TIndx start) const907 const char * GetFileDataPtr(TIndx start) const // commented
908 {
909 // Header data never requires the 'hold' option because asn.1
910 // processing is done immediately.
911
912 const char *p = (const char *)m_Lease.GetFileDataPtr(start);
913 return p;
914 }
915 };
916
917
918 // Does not modify (or use) internal file offset
919
920 // Assumes locked.
921
ReadBytes(CSeqDBFileMemMap & lease,char * buf,TIndx start,TIndx end) const922 void CSeqDBRawFile::ReadBytes(CSeqDBFileMemMap & lease,
923 char * buf,
924 TIndx start,
925 TIndx end) const
926 {
927 memcpy(buf, lease.GetFileDataPtr(m_FileName,start), end-start);
928
929 }
930
931 END_NCBI_SCOPE
932
933 #endif // OBJTOOLS_READERS_SEQDB__SEQDBFILE_HPP
934
935
936