1 /* $Id: writedb_files.cpp 557074 2018-02-09 14:23:28Z fongah2 $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Kevin Bealer
27 *
28 */
29
30 /// @file writedb_files.cpp
/// Implementation of the CWriteDB_File class and of the index, header,
/// and sequence file classes derived from it, used by WriteDB.
33 #include <ncbi_pch.hpp>
34 #include <objtools/blast/seqdb_writer/writedb_files.hpp>
35 #include <objtools/blast/seqdb_writer/writedb_convert.hpp>
36 #include <serial/objistr.hpp>
37 #include <serial/objostr.hpp>
38 #include <serial/serial.hpp>
39 #include <iostream>
40 #include <sstream>
41
42 BEGIN_NCBI_SCOPE
43
44 /// Use standard C++ definitions.
45 USING_SCOPE(std);
46
47 // Blast Database Format Notes (version 4).
48 // (See below for version 5.)
49 //
50 // Integers are 4 bytes stored in big endian format, except for the
51 // volume length. The volume length is 8 bytes, but is stored in a
52 // little endian byte order (reason unknown).
53
54 // The 'standard' packing for strings in Blast DBs is as follows:
55 // 0..4: length
56 // 4..4+length: string data
57 //
58 // The title string follows this rule, but the create date has an
59 // additional detail; if it does not end on an offset that is a
60 // multiple of 8 bytes, extra 'NUL' characters are added to bring it
61 // to a multiple of 8 bytes. The NUL characters are added after the
62 // string bytes, and the stored length of the string is increased to
63 // include them. After extracting the string, 0-7 NUL bytes will need
64 // to be stripped from the end of the string (if any are found).
65 //
66 // (If this were not done, the offsets in the file would be unaligned;
67 // on some architectures this could cause a performance penalty or
68 // other problems. On little endian architectures such as Intel, this
69 // penalty is always paid.)
70
71 // INDEX FILE FORMAT, for "Blast DB Version 4"
72 //
73 // 0..4: format version (Blast DB version, current is "4").
74 // 4..8: seqtype (1 for protein or 0 for nucleotide).
75 // 8..N1: title (string).
76 // N1..N2: create date (string).
77 // N2..N2+4: number of OIDs (#OIDS).
78 // N2+4..N2+12: number of letters in volume. (note: 8 bytes)
79 // N2+12..N2+16: maxlength (size of longest sequence in DB)
80 //
81 // N2+16..(end): Array data
82 //
83 // Array data is 2 or 3 arrays of (#OIDS + 1) four byte integers.
84 // For protein, 2 arrays are used; for nucleotide, 3 are used.
85 //
86 // The first array is header offsets, the second array is sequence
87 // offsets, and the third (optional) array is offsets of ambiguity
88 // data. Each array has a final element which is the length of the
89 // file; this makes it possible to compute the last sequence's length
90 // without adding a special case.
91 //
92 // As shown, the total size of index header =
93 // 4*4 bytes // 4 int fields (4 bytes each)
94 // + 8 bytes // 8 byte field
95 // + 2*4 + strings // 4 bytes length for each plus string data.
96 // = (32 + strings), rounded up to nearest multiple of 8
97 //
98 // "strings" here refers to the unterminated length of both strings.
99
100 // Blast Database Format Notes (version 5).
101 // (See above for version 4.)
102 //
103 // Integers are 4 bytes stored in big endian format, except for the
104 // volume length. The volume length is 8 bytes, but is stored in a
105 // little endian byte order (reason unknown).
106
107 // The 'standard' packing for strings in Blast DBs is as follows:
108 // 0..4: length
109 // 4..4+length: string data
110 //
111 // The title string and LMDB string follow this rule, but the create
112 // date has an additional detail; if it does not end on an offset that
113 // is a multiple of 8 bytes, extra 'NUL' characters are added to bring
114 // it to a multiple of 8 bytes. The NUL characters are added after the
115 // string bytes, and the stored length of the string is increased to
116 // include them. After extracting the string, 0-7 NUL bytes will need
117 // to be stripped from the end of the string (if any are found).
118 //
119 // (If this were not done, the offsets in the file would be unaligned;
120 // on some architectures this could cause a performance penalty or
121 // other problems. On little endian architectures such as Intel, this
122 // penalty is always paid.)
123
124 // --------------------------------------------
125
126 // INDEX FILE FORMAT, for "Blast DB Version 5"
127 //
128 // 0..4: format version (Blast DB version, current is "5").
129 // 4..8: seqtype (1 for protein or 0 for nucleotide).
130 // 8..12: this volume number (0 and up).
131 // 12..N1: title (string).
132 // N1..N2: name of LMDB database file (string)
133 // N2..N3: create date (string).
134 // N3..N3+4: number of OIDs (#OIDS).
135 // N3+4..N3+12: number of letters in volume. (note: 8 bytes)
136 // N3+12..N3+16: maxlength (size of longest sequence in DB)
137 //
138 // N3+16..(end): Array data
139 //
140 // Array data is 2 or 3 arrays of (#OIDS + 1) four byte integers.
141 // For protein, 2 arrays are used; for nucleotide, 3 are used.
142 //
143 // The first array is header offsets, the second array is sequence
144 // offsets, and the third (optional) array is offsets of ambiguity
145 // data. Each array has a final element which is the length of the
146 // file; this makes it possible to compute the last sequence's length
147 // without adding a special case.
148 //
149 // As shown, the total size of index header =
150 // 5*4 bytes // 5 int fields (4 bytes each)
151 // + 8 bytes // 8 byte field
152 // + 3*4 + strings // 4 bytes length for each plus string data.
153 // = (40 + strings), rounded up to nearest multiple of 8
154 //
// "strings" here refers to the unterminated length of all three
// strings (title, LMDB file name, and create date).
156
CWriteDB_File(const string & basename,const string & extension,int index,Uint8 max_file_size,bool always_create)157 CWriteDB_File::CWriteDB_File(const string & basename,
158 const string & extension,
159 int index,
160 Uint8 max_file_size,
161 bool always_create)
162 : m_Created (false),
163 m_BaseName (basename),
164 m_Extension (extension),
165 m_Index (index),
166 m_Offset (0),
167 m_MaxFileSize(max_file_size)
168 {
169 // Define number of usable bits in m_Offset,
170 // deducting one for the sign bit.
171 // Define maximum allowed max_file_size.
172 #ifdef _DEBUG
173 static const int MAX_OFFSET_BITS = (sizeof m_Offset * 8);
174 static const Uint8 MAX_FILE_SIZE = ((Uint8) 1 << MAX_OFFSET_BITS);
175 #endif
176
177 if (m_MaxFileSize == 0) {
178 m_MaxFileSize = x_DefaultByteLimit();
179 } else {
180 #ifdef _DEBUG
181 _ASSERT(max_file_size <= MAX_FILE_SIZE);
182 #endif
183 }
184
185 m_Nul.resize(1);
186 m_Nul[0] = (char) 0;
187
188 m_UseIndex = (index >= 0);
189 x_MakeFileName();
190
191 if (always_create) {
192 Create();
193 }
194 }
195
Create()196 void CWriteDB_File::Create()
197 {
198 _ASSERT(! m_Created);
199 m_Created = true;
200 m_RealFile.open(m_Fname.c_str(), ios::out | ios::binary);
201 }
202
Write(const CTempString & data)203 unsigned int CWriteDB_File::Write(const CTempString & data)
204 {
205 // Define maximum allowed max_file_size.
206 #ifdef _DEBUG
207 // Define number of usable bits in m_Offset,
208 // deducting one for the sign bit.
209 static const int MAX_OFFSET_BITS = (sizeof m_Offset * 8);
210 static const Uint8 MAX_OFFSET = ((Uint8) 1 << MAX_OFFSET_BITS);
211 #endif
212
213 _ASSERT(m_Created);
214 #ifdef _DEBUG
215 _ASSERT(((Uint8) m_Offset + data.length()) <= MAX_OFFSET);
216 #endif
217 m_RealFile.write(data.data(), data.length());
218
219 m_Offset += data.length();
220 return m_Offset;
221 }
222
/// Build a volume name of the form "<base>.<index>".
///
/// The index is emitted as (index / 10) followed by (index % 10), so a
/// non-negative index always yields at least two digits (5 gives "05",
/// 123 gives "123").
///
/// @param base   Base name placed before the '.'.
/// @param index  Volume index.
/// @return       The combined name.
string CWriteDB_File::MakeShortName(const string & base, int index)
{
    const int leading_part = index / 10;
    const int last_digit   = index % 10;

    ostringstream name;
    name << base << "." << leading_part << last_digit;

    return name.str();
}
234
x_MakeFileName()235 void CWriteDB_File::x_MakeFileName()
236 {
237 if (m_UseIndex) {
238 m_Fname = MakeShortName(m_BaseName, m_Index);
239 } else {
240 m_Fname = m_BaseName;
241 }
242
243 m_Fname += ".";
244 m_Fname += m_Extension;
245 }
246
Close()247 void CWriteDB_File::Close()
248 {
249 x_Flush();
250 if (m_Created) {
251 m_RealFile.close();
252 }
253 }
254
RenameSingle()255 void CWriteDB_File::RenameSingle()
256 {
257 _ASSERT(m_UseIndex == true);
258
259 string nm1 = m_Fname;
260 m_UseIndex = false;
261 x_MakeFileName();
262
263 CDirEntry fn1(nm1);
264 fn1.Rename(m_Fname, CDirEntry::fRF_Overwrite);
265 }
266
CWriteDB_IndexFile(const string & dbname,bool protein,const string & title,const string & date,int index,Uint8 max_file_size,EBlastDbVersion dbver)267 CWriteDB_IndexFile::CWriteDB_IndexFile(const string & dbname,
268 bool protein,
269 const string & title,
270 const string & date,
271 int index,
272 Uint8 max_file_size,
273 EBlastDbVersion dbver)
274 : CWriteDB_File(dbname,
275 protein ? "pin" : "nin",
276 index,
277 max_file_size,
278 true),
279 m_Protein (protein),
280 m_Title (title),
281 m_Date (date),
282 m_OIDs (0),
283 m_DataSize (0),
284 m_Letters (0),
285 m_MaxLength (0),
286 m_Version (dbver)
287 {
288 // Compute index overhead, rounding up.
289
290 m_Overhead = x_Overhead(title, date);
291 if (dbver == eBDB_Version5) {
292 m_Overhead = x_Overhead(title, x_MakeLmdbName(), date);
293 } else {
294 m_Overhead = x_Overhead(title, date);
295 }
296 m_Overhead = s_RoundUp(m_Overhead, 8);
297 m_DataSize = m_Overhead;
298
299 // The '1' added to the sequence offset array refers to the fact
300 // that sequence files contain an initial NUL byte. This seems to
301 // be for the benefit of the protein database scanning code, but
302 // it is also done for nucleotide databases.
303
304 m_Hdr.push_back(0);
305 m_Seq.push_back(1);
306 }
307
x_Overhead(const string & T,const string & lmdbName,const string & D)308 int CWriteDB_IndexFile::x_Overhead(const string & T,
309 const string & lmdbName,
310 const string & D)
311 {
312 return 5 * sizeof(int) + sizeof(long)
313 + 3 * sizeof(int) + T.size() + lmdbName.size() + D.size();
314 }
315
x_Overhead(const string & T,const string & D)316 int CWriteDB_IndexFile::x_Overhead(const string & T,
317 const string & D)
318 {
319 return 4 * sizeof(int) + sizeof(long)
320 + 2 * sizeof(int) + T.size() + D.size();
321 }
322
x_Flush()323 void CWriteDB_IndexFile::x_Flush()
324 {
325 _ASSERT(m_Created);
326
327 bool use_lmdb = (m_Version == eBDB_Version5);
328
329 int format_version = (int) m_Version;
330 int seq_type = (m_Protein ? 1 : 0);
331
332 // Pad the date string (see comments at top.)
333
334 string pad_date = m_Date;
335 int count = 0;
336 const string lmdb_name = use_lmdb ? x_MakeLmdbName() : "";
337 int overhead = use_lmdb
338 ? x_Overhead(m_Title, lmdb_name, pad_date)
339 : x_Overhead(m_Title, pad_date);
340 while (overhead & 0x7) {
341 pad_date.append(m_Nul);
342 if (count != -1) {
343 _ASSERT(count++ < 8);
344 }
345 overhead = use_lmdb
346 ? x_Overhead(m_Title, lmdb_name, pad_date)
347 : x_Overhead(m_Title, pad_date);
348 }
349
350 // Write header
351
352 ostream & F = m_RealFile;
353
354 s_WriteInt4 (F, format_version);
355 s_WriteInt4 (F, seq_type);
356 if (!lmdb_name.empty()) {
357 s_WriteInt4 (F, m_Index);
358 s_WriteString(F, m_Title);
359 s_WriteString(F, lmdb_name);
360 } else {
361 s_WriteString(F, m_Title);
362 }
363 s_WriteString(F, pad_date);
364 s_WriteInt4 (F, m_OIDs);
365 s_WriteInt8LE(F, m_Letters);
366 s_WriteInt4 (F, m_MaxLength);
367
368 for(unsigned i = 0; i < m_Hdr.size(); i++) {
369 s_WriteInt4(F, m_Hdr[i]);
370 }
371
372 for(unsigned i = 0; i < m_Seq.size(); i++) {
373 s_WriteInt4(F, m_Seq[i]);
374 }
375
376 // Should loop m_OID times, or not at all.
377 for(unsigned i = 0; i < m_Amb.size(); i++) {
378 s_WriteInt4(F, m_Amb[i]);
379 }
380
381 // This extra index is added here because formatdb adds it. SeqDB
382 // depends on its existence, but I don't think anyone reads (or
383 // needs) the data. The last offset in the ambiguity column
384 // represents the position of the set of ambiguities corresponding
385 // to the last offset in the sequence column. But the last
386 // sequence offset is not really a sequence start, it is the
387 // 'extra' offset used by sequence length computations.
388
389 if (m_Amb.size()) {
390 s_WriteInt4(F, m_Seq.back());
391 }
392
393 vector<unsigned int> tmp1, tmp2, tmp3;
394 m_Hdr.swap(tmp1);
395 m_Seq.swap(tmp2);
396 m_Amb.swap(tmp3);
397 }
398
399 /// Form name of lmdb database file.
x_MakeLmdbName()400 const string CWriteDB_IndexFile::x_MakeLmdbName()
401 {
402 string suffix = (m_Protein ? ".pdb" : ".ndb");
403 size_t last_slash = m_BaseName.find_last_of('/');
404 if (last_slash == m_BaseName.npos) {
405 return m_BaseName + suffix;
406 } else {
407 return m_BaseName.substr(last_slash + 1) + suffix;
408 }
409 }
410
CWriteDB_HeaderFile(const string & dbname,bool protein,int index,Uint8 max_file_size)411 CWriteDB_HeaderFile::CWriteDB_HeaderFile(const string & dbname,
412 bool protein,
413 int index,
414 Uint8 max_file_size)
415 : CWriteDB_File(dbname,
416 protein ? "phr" : "nhr",
417 index,
418 max_file_size,
419 true),
420 m_DataSize(0)
421 {
422 }
423
CWriteDB_SequenceFile(const string & dbname,bool protein,int index,Uint8 max_file_size,Uint8 max_letters)424 CWriteDB_SequenceFile::CWriteDB_SequenceFile(const string & dbname,
425 bool protein,
426 int index,
427 Uint8 max_file_size,
428 Uint8 max_letters)
429 : CWriteDB_File(dbname,
430 protein ? "psq" : "nsq",
431 index,
432 max_file_size,
433 true),
434 m_Letters (0),
435 #ifdef _DEBUG
436 m_BaseLimit(max_letters),
437 m_Protein (protein)
438 #else
439 m_BaseLimit(max_letters)
440 #endif
441 {
442 // Only protein sequences need the inter-sequence NUL bytes.
443 // The first null written here is for nucleotide sequences.
444 // It doesn't seem necessary, but formatdb provides it, so I
445 // will too.
446
447 WriteWithNull(string());
448 }
449
450 END_NCBI_SCOPE
451
452