1 /* $Id: seqdbfile.cpp 553487 2017-12-18 14:23:38Z fongah2 $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Kevin Bealer
27 *
28 */
29
30 /// @file seqdbfile.cpp
31 /// Several classes providing access to the component files of a
32 /// database volume.
33 #include <ncbi_pch.hpp>
34 #include <objtools/blast/seqdb_reader/impl/seqdbfile.hpp>
35
36 BEGIN_NCBI_SCOPE
37
38 /// Index file.
39 ///
40 /// Index files (extension nin or pin) contain information on where to
41 /// find information in other files. The OID is the (implied) key.
42
43
44 // A Word About Mutexes and Mutability in the File Classes
45 //
46 // The stream object in CSeqDBRawFile is mutable: this is because the
47 // stream access methods modify the file. Specifically, they modify
48 // the file offset. This means that two users of a stream object will
49 // step on each other if they try to read from different offsets
50 // concurrently. Memory mapping does not have this problem of course.
51 //
52 // To fix this, the file object is mutable, but to access it, the user
53 // needs to hold the m_FileLock mutex.
54 //
55 // One goal I have for these classes is to eliminate all locking for
56 // the mmap case. Locking is not needed to call a const method, so
57 // methods are marked const whenever possible. After construction of
58 // CSeqDB, ONLY const methods are called.
59 //
60 // Some of the const methods need to modify fields; to do this, I mark
61 // the fields 'mutable' and hold a mutex whenever accessing them.
62 //
63 // Each method falls into one of these categories:
64 //
65 // 1. Non-const: called only during CSeqDB construction.
66 // 2. Const: no changes to any fields.
67 // 3. Const: modifies mutable fields while holding m_FileLock.
68
/// File-local shorthand for the atlas's offset type; every offset taken
/// and returned by the functions in this file uses it.
69 typedef CSeqDBAtlas::TIndx TIndx;
70
ReadSwapped(CSeqDBFileMemMap & lease,TIndx offset,Uint4 * value) const71 TIndx CSeqDBRawFile::ReadSwapped(CSeqDBFileMemMap & lease,
72 TIndx offset,
73 Uint4 * value) const
74
75 {
76 *value = SeqDB_GetStdOrd((Uint4 *) lease.GetFileDataPtr(m_FileName,offset));
77
78 return offset + sizeof(*value);
79 }
80
ReadSwapped(CSeqDBFileMemMap & lease,TIndx offset,Uint8 * value) const81 TIndx CSeqDBRawFile::ReadSwapped(CSeqDBFileMemMap & lease,
82 TIndx offset,
83 Uint8 * value) const
84
85 {
86 *value = SeqDB_GetBroken((Int8 *) lease.GetFileDataPtr(m_FileName,offset));
87
88 return offset + sizeof(*value);
89 }
90
ReadSwapped(CSeqDBFileMemMap & lease,TIndx offset,string * value) const91 TIndx CSeqDBRawFile::ReadSwapped(CSeqDBFileMemMap & lease,
92 TIndx offset,
93 string * value) const
94 {
95 Uint4 len = 0;
96
97 len = SeqDB_GetStdOrd((Int4 *) lease.GetFileDataPtr(m_FileName,offset));
98
99 offset += sizeof(len);
100
101 value->assign(lease.GetFileDataPtr(offset), (int) len);
102
103 return offset + len;
104 }
105
CSeqDBExtFile(CSeqDBAtlas & atlas,const string & dbfilename,char prot_nucl)106 CSeqDBExtFile::CSeqDBExtFile(CSeqDBAtlas & atlas,
107 const string & dbfilename,
108 char prot_nucl)
109
110 : m_Atlas (atlas),
111 m_FileName(dbfilename),
112 m_Lease (atlas),
113 m_File (atlas)
114 {
115 if ((prot_nucl != 'p') && (prot_nucl != 'n')) {
116 NCBI_THROW(CSeqDBException,
117 eArgErr,
118 "Error: Invalid sequence type requested.");
119 }
120
121 x_SetFileType(prot_nucl);
122
123 if (! m_File.Open(CSeqDB_Path(m_FileName))) {
124 //m_Atlas.Unlock(locked);
125
126 string msg = string("Error: File (") + m_FileName + ") not found.";
127
128 NCBI_THROW(CSeqDBException, eFileErr, msg);
129 }
130
131 m_Lease.Init(m_FileName);
132 }
133
CSeqDBIdxFile(CSeqDBAtlas & atlas,const string & dbname,char prot_nucl)134 CSeqDBIdxFile::CSeqDBIdxFile(CSeqDBAtlas & atlas,
135 const string & dbname,
136 char prot_nucl)
137
138 : CSeqDBExtFile(atlas, dbname + ".-in", prot_nucl),
139 m_HdrLease (atlas),
140 m_SeqLease (atlas),
141 m_AmbLease (atlas),
142 m_NumOIDs (0),
143 m_VolLen (0),
144 m_MaxLen (0),
145 m_MinLen (0),
146 m_OffHdr (0),
147 m_EndHdr (0),
148 m_OffSeq (0),
149 m_EndSeq (0),
150 m_OffAmb (0),
151 m_EndAmb (0),
152 m_LMDBFile (kEmptyStr) ,
153 m_Volume (0)
154 {
155 //Verify();
156
157 // Input validation
158
159 if (dbname.empty()) {
160 NCBI_THROW(CSeqDBException,
161 eArgErr,
162 "Error: dbname should not be an empty string.");
163 }
164
165 if ((prot_nucl != 'p') && (prot_nucl != 'n')) {
166 NCBI_THROW(CSeqDBException,
167 eArgErr,
168 "Error: Invalid sequence type requested.");
169 }
170
171 TIndx offset = 0;
172
173
174 Uint4 f_format_version = 0;
175 Uint4 f_db_seqtype = 0;
176
177
178 offset = x_ReadSwapped(m_Lease, offset, & f_format_version);
179
180 TIndx off1(0), off2(0), off3(0), offend(0);
181
182 try {
183 bool dbv5 = ( 5 == f_format_version );
184 if (!dbv5 && (f_format_version != 4)) {
185 NCBI_THROW(CSeqDBException,
186 eFileErr,
187 "Error: Not a valid version 4 or 5 database.");
188 }
189
190 offset = x_ReadSwapped(m_Lease, offset, & f_db_seqtype);
191 if (dbv5) {
192 offset = x_ReadSwapped(m_Lease, offset, & m_Volume);
193 }
194
195 offset = x_ReadSwapped(m_Lease, offset, & m_Title);
196 if (dbv5) {
197 offset = x_ReadSwapped(m_Lease, offset, & m_LMDBFile);
198 }
199
200 offset = x_ReadSwapped(m_Lease, offset, & m_Date);
201
202
203 offset = x_ReadSwapped(m_Lease, offset, & m_NumOIDs);
204
205
206 offset = x_ReadSwapped(m_Lease, offset, & m_VolLen);
207
208
209 offset = x_ReadSwapped(m_Lease, offset, & m_MaxLen);
210
211 TIndx region_bytes = 4 * (m_NumOIDs + 1);
212
213 off1 = offset;
214 off2 = off1 + region_bytes;
215 off3 = off2 + region_bytes;
216 offend = off3 + region_bytes;
217 }
218 catch(...) {
219 throw;
220 }
221
222
223 char db_seqtype = ((f_db_seqtype == 1) ? 'p' : 'n');
224
225 if (db_seqtype != x_GetSeqType()) {
226 NCBI_THROW(CSeqDBException,
227 eFileErr,
228 "Error: requested sequence type does not match DB.");
229 }
230
231 m_OffHdr = off1; m_EndHdr = off2;
232 m_OffSeq = off2; m_EndSeq = off3;
233
234 if (db_seqtype == 'n') {
235 m_OffAmb = off3; m_EndAmb = offend;
236 } else {
237 m_OffAmb = m_EndAmb = 0;
238 }
239 }
240
241 END_NCBI_SCOPE
242
243