1 /*  $Id: seqdbfile.cpp 553487 2017-12-18 14:23:38Z fongah2 $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Kevin Bealer
27  *
28  */
29 
30 /// @file seqdbfile.cpp
31 /// Several classes providing access to the component files of a
32 /// database volume.
33 #include <ncbi_pch.hpp>
34 #include <objtools/blast/seqdb_reader/impl/seqdbfile.hpp>
35 
36 BEGIN_NCBI_SCOPE
37 
38 /// Index file.
39 ///
40 /// Index files (extension nin or pin) contain information on where to
41 /// find information in other files.  The OID is the (implied) key.
42 
43 
44 // A Word About Mutexes and Mutability in the File Classes
45 //
46 // The stream object in CSeqDBRawFile is mutable: this is because the
47 // stream access methods modify the file.  Specifically, they modify
48 // the file offset.  This means that two users of a stream object will
49 // step on each other if they try to read from different offsets
50 // concurrently.  Memory mapping does not have this problem of course.
51 //
52 // To fix this, the file object is mutable, but to access it, the user
53 // needs to hold the m_FileLock mutex.
54 //
55 // One goal I have for these classes is to eliminate all locking for
56 // the mmap case.  Locking is not needed to call a const method, so
57 // methods are marked const whenever possible.  After construction of
58 // CSeqDB, ONLY const methods are called.
59 //
60 // Some of the const methods need to modify fields; to do this, I mark
61 // the fields 'mutable' and hold a mutex whenever accessing them.
62 //
63 // Each method falls into one of these categories:
64 //
65 // 1. Non-const: called only during CSeqDB construction.
66 // 2. Const: no changes to any fields.
67 // 3. Const: modifies mutable fields while holding m_FileLock.
68 
/// File-offset type used by the readers in this file; an alias for the
/// offset type declared by CSeqDBAtlas.
typedef CSeqDBAtlas::TIndx TIndx;
70 
ReadSwapped(CSeqDBFileMemMap & lease,TIndx offset,Uint4 * value) const71 TIndx CSeqDBRawFile::ReadSwapped(CSeqDBFileMemMap & lease,
72                                  TIndx            offset,
73                                  Uint4          * value) const
74 
75 {
76     *value = SeqDB_GetStdOrd((Uint4 *) lease.GetFileDataPtr(m_FileName,offset));
77 
78     return offset + sizeof(*value);
79 }
80 
ReadSwapped(CSeqDBFileMemMap & lease,TIndx offset,Uint8 * value) const81 TIndx CSeqDBRawFile::ReadSwapped(CSeqDBFileMemMap & lease,
82                                  TIndx            offset,
83                                  Uint8          * value) const
84 
85 {
86     *value = SeqDB_GetBroken((Int8 *) lease.GetFileDataPtr(m_FileName,offset));
87 
88     return offset + sizeof(*value);
89 }
90 
ReadSwapped(CSeqDBFileMemMap & lease,TIndx offset,string * value) const91 TIndx CSeqDBRawFile::ReadSwapped(CSeqDBFileMemMap & lease,
92                                  TIndx            offset,
93                                  string         * value) const
94 {
95     Uint4 len = 0;
96 
97     len = SeqDB_GetStdOrd((Int4 *) lease.GetFileDataPtr(m_FileName,offset));
98 
99     offset += sizeof(len);
100 
101     value->assign(lease.GetFileDataPtr(offset), (int) len);
102 
103     return offset + len;
104 }
105 
CSeqDBExtFile(CSeqDBAtlas & atlas,const string & dbfilename,char prot_nucl)106 CSeqDBExtFile::CSeqDBExtFile(CSeqDBAtlas    & atlas,
107                              const string   & dbfilename,
108                              char             prot_nucl)
109 
110     : m_Atlas   (atlas),
111       m_FileName(dbfilename),
112       m_Lease    (atlas),
113       m_File    (atlas)
114 {
115     if ((prot_nucl != 'p') && (prot_nucl != 'n')) {
116         NCBI_THROW(CSeqDBException,
117                    eArgErr,
118                    "Error: Invalid sequence type requested.");
119     }
120 
121     x_SetFileType(prot_nucl);
122 
123     if (! m_File.Open(CSeqDB_Path(m_FileName))) {
124         //m_Atlas.Unlock(locked);
125 
126         string msg = string("Error: File (") + m_FileName + ") not found.";
127 
128         NCBI_THROW(CSeqDBException, eFileErr, msg);
129     }
130 
131     m_Lease.Init(m_FileName);
132 }
133 
CSeqDBIdxFile(CSeqDBAtlas & atlas,const string & dbname,char prot_nucl)134 CSeqDBIdxFile::CSeqDBIdxFile(CSeqDBAtlas    & atlas,
135                              const string   & dbname,
136                              char             prot_nucl)
137 
138     : CSeqDBExtFile(atlas, dbname + ".-in", prot_nucl),
139       m_HdrLease     (atlas),
140       m_SeqLease      (atlas),
141       m_AmbLease      (atlas),
142       m_NumOIDs       (0),
143       m_VolLen        (0),
144       m_MaxLen        (0),
145       m_MinLen        (0),
146       m_OffHdr        (0),
147       m_EndHdr        (0),
148       m_OffSeq        (0),
149       m_EndSeq        (0),
150       m_OffAmb        (0),
151       m_EndAmb        (0),
152       m_LMDBFile	  (kEmptyStr) ,
153       m_Volume        (0)
154 {
155     //Verify();
156 
157     // Input validation
158 
159     if (dbname.empty()) {
160         NCBI_THROW(CSeqDBException,
161                    eArgErr,
162                    "Error: dbname should not be an empty string.");
163     }
164 
165     if ((prot_nucl != 'p') && (prot_nucl != 'n')) {
166         NCBI_THROW(CSeqDBException,
167                    eArgErr,
168                    "Error: Invalid sequence type requested.");
169     }
170 
171     TIndx offset = 0;
172 
173 
174     Uint4 f_format_version = 0;
175     Uint4 f_db_seqtype = 0;
176 
177 
178     offset = x_ReadSwapped(m_Lease, offset, & f_format_version);
179 
180     TIndx off1(0), off2(0), off3(0), offend(0);
181 
182     try {
183     	 bool dbv5 = ( 5 == f_format_version );
184         if (!dbv5 && (f_format_version != 4)) {
185             NCBI_THROW(CSeqDBException,
186                        eFileErr,
187                        "Error: Not a valid version 4 or 5 database.");
188         }
189 
190         offset = x_ReadSwapped(m_Lease, offset, & f_db_seqtype);
191         if (dbv5) {
192         	offset = x_ReadSwapped(m_Lease, offset, & m_Volume);
193         }
194 
195         offset = x_ReadSwapped(m_Lease, offset, & m_Title);
196         if (dbv5) {
197             offset = x_ReadSwapped(m_Lease, offset, & m_LMDBFile);
198        }
199 
200         offset = x_ReadSwapped(m_Lease, offset, & m_Date);
201 
202 
203         offset = x_ReadSwapped(m_Lease, offset, & m_NumOIDs);
204 
205 
206         offset = x_ReadSwapped(m_Lease, offset, & m_VolLen);
207 
208 
209         offset = x_ReadSwapped(m_Lease, offset, & m_MaxLen);
210 
211         TIndx region_bytes = 4 * (m_NumOIDs + 1);
212 
213         off1   = offset;
214         off2   = off1 + region_bytes;
215         off3   = off2 + region_bytes;
216         offend = off3 + region_bytes;
217     }
218     catch(...) {
219         throw;
220     }
221 
222 
223     char db_seqtype = ((f_db_seqtype == 1) ? 'p' : 'n');
224 
225     if (db_seqtype != x_GetSeqType()) {
226         NCBI_THROW(CSeqDBException,
227                    eFileErr,
228                    "Error: requested sequence type does not match DB.");
229     }
230 
231     m_OffHdr = off1; m_EndHdr = off2;
232     m_OffSeq = off2; m_EndSeq = off3;
233 
234     if (db_seqtype == 'n') {
235         m_OffAmb = off3; m_EndAmb = offend;
236     } else {
237         m_OffAmb = m_EndAmb = 0;
238     }
239 }
240 
241 END_NCBI_SCOPE
242 
243