1 /*  $Id: writedb_files.cpp 557074 2018-02-09 14:23:28Z fongah2 $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Kevin Bealer
27  *
28  */
29 
30 /// @file writedb_files.cpp
/// Implementation of CWriteDB_File and of the index, header and sequence
/// file classes derived from it, used by WriteDB.
33 #include <ncbi_pch.hpp>
34 #include <objtools/blast/seqdb_writer/writedb_files.hpp>
35 #include <objtools/blast/seqdb_writer/writedb_convert.hpp>
36 #include <serial/objistr.hpp>
37 #include <serial/objostr.hpp>
38 #include <serial/serial.hpp>
39 #include <iostream>
40 #include <sstream>
41 
42 BEGIN_NCBI_SCOPE
43 
44 /// Use standard C++ definitions.
45 USING_SCOPE(std);
46 
47 // Blast Database Format Notes (version 4).
48 // (See below for version 5.)
49 //
50 // Integers are 4 bytes stored in big endian format, except for the
51 // volume length.  The volume length is 8 bytes, but is stored in a
52 // little endian byte order (reason unknown).
53 
54 // The 'standard' packing for strings in Blast DBs is as follows:
55 //   0..4: length
56 //   4..4+length: string data
57 //
58 // The title string follows this rule, but the create date has an
59 // additional detail; if it does not end on an offset that is a
60 // multiple of 8 bytes, extra 'NUL' characters are added to bring it
61 // to a multiple of 8 bytes.  The NUL characters are added after the
62 // string bytes, and the stored length of the string is increased to
63 // include them.  After extracting the string, 0-7 NUL bytes will need
64 // to be stripped from the end of the string (if any are found).
65 //
66 // (If this were not done, the offsets in the file would be unaligned;
67 // on some architectures this could cause a performance penalty or
68 // other problems.  On little endian architectures such as Intel, this
69 // penalty is always paid.)
70 
71 // INDEX FILE FORMAT, for "Blast DB Version 4"
72 //
73 // 0..4:         format version  (Blast DB version, current is "4").
74 // 4..8:         seqtype (1 for protein or 0 for nucleotide).
75 // 8..N1:        title (string).
76 // N1..N2:       create date (string).
77 // N2..N2+4:     number of OIDs (#OIDS).
78 // N2+4..N2+12:  number of letters in volume. (note: 8 bytes)
79 // N2+12..N2+16: maxlength (size of longest sequence in DB)
80 //
81 // N2+16..(end): Array data
82 //
83 //  Array data is 2 or 3 arrays of (#OIDS + 1) four byte integers.
84 //  For protein, 2 arrays are used; for nucleotide, 3 are used.
85 //
86 //  The first array is header offsets, the second array is sequence
87 //  offsets, and the third (optional) array is offsets of ambiguity
88 //  data.  Each array has a final element which is the length of the
89 //  file; this makes it possible to compute the last sequence's length
90 //  without adding a special case.
91 //
92 // As shown, the total size of index header =
93 //   4*4 bytes          // 4 int fields (4 bytes each)
94 //   + 8 bytes          // 8 byte field
95 //   + 2*4 + strings    // 4 bytes length for each plus string data.
96 //   = (32 + strings), rounded up to nearest multiple of 8
97 //
98 // "strings" here refers to the unterminated length of both strings.
99 
100 // Blast Database Format Notes (version 5).
101 // (See above for version 4.)
102 //
103 // Integers are 4 bytes stored in big endian format, except for the
104 // volume length.  The volume length is 8 bytes, but is stored in a
105 // little endian byte order (reason unknown).
106 
107 // The 'standard' packing for strings in Blast DBs is as follows:
108 //   0..4: length
109 //   4..4+length: string data
110 //
111 // The title string and LMDB string follow this rule, but the create
112 // date has an additional detail; if it does not end on an offset that
113 // is a multiple of 8 bytes, extra 'NUL' characters are added to bring
114 // it to a multiple of 8 bytes.  The NUL characters are added after the
115 // string bytes, and the stored length of the string is increased to
116 // include them.  After extracting the string, 0-7 NUL bytes will need
117 // to be stripped from the end of the string (if any are found).
118 //
119 // (If this were not done, the offsets in the file would be unaligned;
120 // on some architectures this could cause a performance penalty or
121 // other problems.  On little endian architectures such as Intel, this
122 // penalty is always paid.)
123 
124 // --------------------------------------------
125 
126 // INDEX FILE FORMAT, for "Blast DB Version 5"
127 //
128 // 0..4:         format version  (Blast DB version, current is "5").
129 // 4..8:         seqtype (1 for protein or 0 for nucleotide).
130 // 8..12:        this volume number (0 and up).
131 // 12..N1:       title (string).
132 // N1..N2:       name of LMDB database file (string)
133 // N2..N3:       create date (string).
134 // N3..N3+4:     number of OIDs (#OIDS).
135 // N3+4..N3+12:  number of letters in volume. (note: 8 bytes)
136 // N3+12..N3+16: maxlength (size of longest sequence in DB)
137 //
138 // N3+16..(end): Array data
139 //
140 //  Array data is 2 or 3 arrays of (#OIDS + 1) four byte integers.
141 //  For protein, 2 arrays are used; for nucleotide, 3 are used.
142 //
143 //  The first array is header offsets, the second array is sequence
144 //  offsets, and the third (optional) array is offsets of ambiguity
145 //  data.  Each array has a final element which is the length of the
146 //  file; this makes it possible to compute the last sequence's length
147 //  without adding a special case.
148 //
149 // As shown, the total size of index header =
150 //   5*4 bytes          // 5 int fields (4 bytes each)
151 //   + 8 bytes          // 8 byte field
152 //   + 3*4 + strings    // 4 bytes length for each plus string data.
153 //   = (40 + strings), rounded up to nearest multiple of 8
154 //
// "strings" here refers to the unterminated length of all three strings.
156 
CWriteDB_File(const string & basename,const string & extension,int index,Uint8 max_file_size,bool always_create)157 CWriteDB_File::CWriteDB_File(const string & basename,
158                              const string & extension,
159                              int            index,
160                              Uint8          max_file_size,
161                              bool           always_create)
162     : m_Created    (false),
163       m_BaseName   (basename),
164       m_Extension  (extension),
165       m_Index      (index),
166       m_Offset     (0),
167       m_MaxFileSize(max_file_size)
168 {
169     // Define number of usable bits in m_Offset,
170     // deducting one for the sign bit.
171     // Define maximum allowed max_file_size.
172 #ifdef _DEBUG
173     static const int MAX_OFFSET_BITS = (sizeof m_Offset * 8);
174     static const Uint8 MAX_FILE_SIZE = ((Uint8) 1 << MAX_OFFSET_BITS);
175 #endif
176 
177     if (m_MaxFileSize == 0) {
178         m_MaxFileSize = x_DefaultByteLimit();
179     } else {
180 #ifdef _DEBUG
181         _ASSERT(max_file_size <= MAX_FILE_SIZE);
182 #endif
183     }
184 
185     m_Nul.resize(1);
186     m_Nul[0] = (char) 0;
187 
188     m_UseIndex = (index >= 0);
189     x_MakeFileName();
190 
191     if (always_create) {
192         Create();
193     }
194 }
195 
Create()196 void CWriteDB_File::Create()
197 {
198     _ASSERT(! m_Created);
199     m_Created = true;
200     m_RealFile.open(m_Fname.c_str(), ios::out | ios::binary);
201 }
202 
Write(const CTempString & data)203 unsigned int CWriteDB_File::Write(const CTempString & data)
204 {
205     // Define maximum allowed max_file_size.
206 #ifdef _DEBUG
207     // Define number of usable bits in m_Offset,
208     // deducting one for the sign bit.
209     static const int MAX_OFFSET_BITS = (sizeof m_Offset * 8);
210     static const Uint8 MAX_OFFSET = ((Uint8) 1 << MAX_OFFSET_BITS);
211 #endif
212 
213     _ASSERT(m_Created);
214 #ifdef _DEBUG
215     _ASSERT(((Uint8) m_Offset + data.length()) <= MAX_OFFSET);
216 #endif
217     m_RealFile.write(data.data(), data.length());
218 
219     m_Offset += data.length();
220     return m_Offset;
221 }
222 
/// Build the volume name "<base>.<NN>" for a given volume index.
///
/// The index is emitted as its tens part followed by its units digit, so
/// indices below ten are zero padded (e.g. index 3 yields "<base>.03").
///
/// @param base   Database base name.
/// @param index  Volume index.
/// @return       Base name with the volume suffix appended.
string CWriteDB_File::MakeShortName(const string & base, int index)
{
    ostringstream name;

    name << base << '.' << (index / 10) << (index % 10);

    return name.str();
}
234 
x_MakeFileName()235 void CWriteDB_File::x_MakeFileName()
236 {
237     if (m_UseIndex) {
238         m_Fname = MakeShortName(m_BaseName, m_Index);
239     } else {
240         m_Fname = m_BaseName;
241     }
242 
243     m_Fname += ".";
244     m_Fname += m_Extension;
245 }
246 
Close()247 void CWriteDB_File::Close()
248 {
249     x_Flush();
250     if (m_Created) {
251         m_RealFile.close();
252     }
253 }
254 
RenameSingle()255 void CWriteDB_File::RenameSingle()
256 {
257     _ASSERT(m_UseIndex == true);
258 
259     string nm1 = m_Fname;
260     m_UseIndex = false;
261     x_MakeFileName();
262 
263     CDirEntry fn1(nm1);
264     fn1.Rename(m_Fname, CDirEntry::fRF_Overwrite);
265 }
266 
CWriteDB_IndexFile(const string & dbname,bool protein,const string & title,const string & date,int index,Uint8 max_file_size,EBlastDbVersion dbver)267 CWriteDB_IndexFile::CWriteDB_IndexFile(const string & dbname,
268                                        bool           protein,
269                                        const string & title,
270                                        const string & date,
271                                        int            index,
272                                        Uint8          max_file_size,
273                                        EBlastDbVersion    dbver)
274     : CWriteDB_File(dbname,
275                     protein ? "pin" : "nin",
276                     index,
277                     max_file_size,
278                     true),
279       m_Protein   (protein),
280       m_Title     (title),
281       m_Date      (date),
282       m_OIDs      (0),
283       m_DataSize  (0),
284       m_Letters   (0),
285       m_MaxLength (0),
286       m_Version   (dbver)
287 {
288     // Compute index overhead, rounding up.
289 
290     m_Overhead = x_Overhead(title, date);
291     if (dbver == eBDB_Version5) {
292         m_Overhead = x_Overhead(title, x_MakeLmdbName(), date);
293     } else {
294         m_Overhead = x_Overhead(title, date);
295     }
296     m_Overhead = s_RoundUp(m_Overhead, 8);
297     m_DataSize = m_Overhead;
298 
299     // The '1' added to the sequence offset array refers to the fact
300     // that sequence files contain an initial NUL byte.  This seems to
301     // be for the benefit of the protein database scanning code, but
302     // it is also done for nucleotide databases.
303 
304     m_Hdr.push_back(0);
305     m_Seq.push_back(1);
306 }
307 
x_Overhead(const string & T,const string & lmdbName,const string & D)308 int CWriteDB_IndexFile::x_Overhead(const string & T,
309                                    const string & lmdbName,
310                                    const string & D)
311 {
312     return 5 * sizeof(int) + sizeof(long)
313             + 3 * sizeof(int) + T.size() + lmdbName.size() + D.size();
314 }
315 
x_Overhead(const string & T,const string & D)316 int CWriteDB_IndexFile::x_Overhead(const string & T,
317                                    const string & D)
318 {
319     return 4 * sizeof(int) + sizeof(long)
320             + 2 * sizeof(int) + T.size() + D.size();
321 }
322 
x_Flush()323 void CWriteDB_IndexFile::x_Flush()
324 {
325     _ASSERT(m_Created);
326 
327     bool use_lmdb = (m_Version == eBDB_Version5);
328 
329     int format_version = (int) m_Version;
330     int seq_type = (m_Protein ? 1 : 0);
331 
332     // Pad the date string (see comments at top.)
333 
334     string pad_date = m_Date;
335     int count = 0;
336     const string lmdb_name = use_lmdb ? x_MakeLmdbName() : "";
337     int overhead = use_lmdb
338             ? x_Overhead(m_Title, lmdb_name, pad_date)
339             : x_Overhead(m_Title, pad_date);
340     while (overhead & 0x7) {
341         pad_date.append(m_Nul);
342         if (count != -1) {
343             _ASSERT(count++ < 8);
344         }
345         overhead = use_lmdb
346                 ? x_Overhead(m_Title, lmdb_name, pad_date)
347                 : x_Overhead(m_Title, pad_date);
348     }
349 
350     // Write header
351 
352     ostream & F = m_RealFile;
353 
354     s_WriteInt4  (F, format_version);
355     s_WriteInt4  (F, seq_type);
356     if (!lmdb_name.empty()) {
357         s_WriteInt4  (F, m_Index);
358         s_WriteString(F, m_Title);
359         s_WriteString(F, lmdb_name);
360     } else {
361         s_WriteString(F, m_Title);
362     }
363     s_WriteString(F, pad_date);
364     s_WriteInt4  (F, m_OIDs);
365     s_WriteInt8LE(F, m_Letters);
366     s_WriteInt4  (F, m_MaxLength);
367 
368     for(unsigned i = 0; i < m_Hdr.size(); i++) {
369         s_WriteInt4(F, m_Hdr[i]);
370     }
371 
372     for(unsigned i = 0; i < m_Seq.size(); i++) {
373         s_WriteInt4(F, m_Seq[i]);
374     }
375 
376     // Should loop m_OID times, or not at all.
377     for(unsigned i = 0; i < m_Amb.size(); i++) {
378         s_WriteInt4(F, m_Amb[i]);
379     }
380 
381     // This extra index is added here because formatdb adds it.  SeqDB
382     // depends on its existence, but I don't think anyone reads (or
383     // needs) the data.  The last offset in the ambiguity column
384     // represents the position of the set of ambiguities corresponding
385     // to the last offset in the sequence column.  But the last
386     // sequence offset is not really a sequence start, it is the
387     // 'extra' offset used by sequence length computations.
388 
389     if (m_Amb.size()) {
390         s_WriteInt4(F, m_Seq.back());
391     }
392 
393     vector<unsigned int> tmp1, tmp2, tmp3;
394     m_Hdr.swap(tmp1);
395     m_Seq.swap(tmp2);
396     m_Amb.swap(tmp3);
397 }
398 
399 /// Form name of lmdb database file.
x_MakeLmdbName()400 const string CWriteDB_IndexFile::x_MakeLmdbName()
401 {
402     string suffix = (m_Protein ? ".pdb" : ".ndb");
403     size_t last_slash = m_BaseName.find_last_of('/');
404     if (last_slash == m_BaseName.npos) {
405         return m_BaseName + suffix;
406     } else {
407         return m_BaseName.substr(last_slash + 1) + suffix;
408     }
409 }
410 
CWriteDB_HeaderFile(const string & dbname,bool protein,int index,Uint8 max_file_size)411 CWriteDB_HeaderFile::CWriteDB_HeaderFile(const string & dbname,
412                                          bool           protein,
413                                          int            index,
414                                          Uint8          max_file_size)
415     : CWriteDB_File(dbname,
416                     protein ? "phr" : "nhr",
417                     index,
418                     max_file_size,
419                     true),
420       m_DataSize(0)
421 {
422 }
423 
CWriteDB_SequenceFile(const string & dbname,bool protein,int index,Uint8 max_file_size,Uint8 max_letters)424 CWriteDB_SequenceFile::CWriteDB_SequenceFile(const string & dbname,
425                                              bool           protein,
426                                              int            index,
427                                              Uint8          max_file_size,
428                                              Uint8          max_letters)
429     : CWriteDB_File(dbname,
430                     protein ? "psq" : "nsq",
431                     index,
432                     max_file_size,
433                     true),
434       m_Letters  (0),
435 #ifdef _DEBUG
436       m_BaseLimit(max_letters),
437       m_Protein  (protein)
438 #else
439       m_BaseLimit(max_letters)
440 #endif
441 {
442     // Only protein sequences need the inter-sequence NUL bytes.
443     // The first null written here is for nucleotide sequences.
444     // It doesn't seem necessary, but formatdb provides it, so I
445     // will too.
446 
447     WriteWithNull(string());
448 }
449 
450 END_NCBI_SCOPE
451 
452