1 #ifndef OBJTOOLS_WRITERS_WRITEDB__WRITEDB_FILES_HPP
2 #define OBJTOOLS_WRITERS_WRITEDB__WRITEDB_FILES_HPP
3 
4 /*  $Id: writedb_files.hpp 553715 2017-12-20 18:37:47Z vakatov $
5  * ===========================================================================
6  *
7  *                            PUBLIC DOMAIN NOTICE
8  *               National Center for Biotechnology Information
9  *
10  *  This software/database is a "United States Government Work" under the
11  *  terms of the United States Copyright Act.  It was written as part of
12  *  the author's official duties as a United States Government employee and
13  *  thus cannot be copyrighted.  This software/database is freely available
14  *  to the public for use. The National Library of Medicine and the U.S.
15  *  Government have not placed any restriction on its use or reproduction.
16  *
17  *  Although all reasonable efforts have been taken to ensure the accuracy
18  *  and reliability of the software and data, the NLM and the U.S.
19  *  Government do not and cannot warrant the performance or results that
20  *  may be obtained by using this software or data. The NLM and the U.S.
21  *  Government disclaim all warranties, express or implied, including
22  *  warranties of performance, merchantability or fitness for any particular
23  *  purpose.
24  *
25  *  Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Author:  Kevin Bealer
30  *
31  */
32 
/// @file writedb_files.hpp
/// Code for database files construction.
///
/// Defines classes:
///     CWriteDB_File
///     CWriteDB_IndexFile
///     CWriteDB_HeaderFile
///     CWriteDB_SequenceFile
///
/// Implemented for: UNIX, MS-Windows
40 
41 #include <objtools/blast/seqdb_writer/writedb_general.hpp>
42 #include <objtools/blast/seqdb_writer/writedb_convert.hpp>
43 #include <objtools/blast/seqdb_reader/seqdbcommon.hpp>
44 #include <objects/seq/seq__.hpp>
45 #include <corelib/ncbistre.hpp>
46 #include <corelib/ncbifile.hpp>
47 
48 BEGIN_NCBI_SCOPE
49 
50 /// Import definitions from the objects namespace.
51 USING_SCOPE(objects);
52 
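/// CWriteDB_File class
///
/// Common base class for the output files of a blast database volume.
/// It holds the output stream, the file name and the current write
/// offset; derived classes provide the file-specific content.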
56 
57 class NCBI_XOBJWRITE_EXPORT CWriteDB_File : public CObject {
58 public:
59     // Setup and control
60 
61     /// Constructor.
62     ///
63     /// The filename is constructed from basename, extension, and
64     /// index, but might be changed if the RenameSingle() method is
65     /// called.  If zero is specified for maximum file size, a default
66     /// size is provided by this class.  The maximum file size is not
67     /// enforced by this class, instead each derived class must do its
68     /// own enforcement.
69     ///
70     /// @param basename Database base name, shared by all files. [in]
71     /// @param extension File name extension for this file. [in]
72     /// @param index Volume index used in filename. [in]
73     /// @param max_file_size File size limit (in bytes). [in]
74     /// @param always_create If true the file will be created now. [in]
75     CWriteDB_File(const string & basename,
76                   const string & extension,
77                   int            index,
78                   Uint8          max_file_size,
79                   bool           always_create);
80 
81     /// Create and open the file.
82     ///
83     /// This method must be called before the first time that data is
84     /// written to the file.  If the constructor is passed 'true' for
85     /// always_create, this method will be called during construction.
86     /// It is an error to call this method more than once (including
87     /// via the constructor) or to not call it but to call Write.  The
88     /// rationale for making this explicit is to permit some files to
89     /// be created optionally, such as ISAM files, which should only
90     /// be created if the corresponding ID types are found.
91     void Create();
92 
93     /// Write contents of a string to the file.
94     /// @param data Data to write.
95     /// @return File offset after write.
96     unsigned int Write(const CTempString & data);
97 
98     /// Write an Int4 (in bigendian order) to the file.
99     /// @param data String to write.
100     /// @return File offset after write.
WriteInt4(int data)101     unsigned int WriteInt4(int data)
102     {
103         s_WriteInt4(m_RealFile, data);
104         m_Offset += 4;
105         return m_Offset;
106     }
107 
108     /// Write an Int8 (in bigendian order) to the file.
109     /// @param data String to write.
110     /// @return File offset after write.
WriteInt8(Int8 data)111     unsigned int WriteInt8(Int8 data)
112     {
113         s_WriteInt8BE(m_RealFile, data);
114         m_Offset += 8;
115         return m_Offset;
116     }
117 
118     /// Write contents of a string to the file, appending a NUL.
119     /// @param data String to write.
120     /// @return File offset after write.
WriteWithNull(const CTempString & data)121     unsigned int WriteWithNull(const CTempString & data)
122     {
123         Write(data);
124         return Write(m_Nul);
125     }
126 
127     /// Close the file, flushing any remaining data to disk.
128     void Close();
129 
130     /// Rename this file, disincluding the volume index.
131     virtual void RenameSingle();
132 
133     /// Construct the short name for a volume.
134     ///
135     /// Volume names consist of the database base name, ".", and the
136     /// volume index in decimal.  The volume index is normally two
137     /// digits, but if more than 100 volumes are needed, the filename
138     /// will use three or more index digits as needed.
139     ///
140     /// @param base Base name to use.
141     /// @param index Volume index.
142     /// @return A short name.
143     static string MakeShortName(const string & base, int index);
144 
145     /// Get the current filename for this file.
146     ///
147     /// The filename is returned.  The data returned by this method
148     /// reflects changes made by RenameSingle(), so it is probably
149     /// best to call it after that method has been called (if it will
150     /// be called).
151     ///
152     /// @return The filename.
GetFilename() const153     const string & GetFilename() const
154     {
155         return m_Fname;
156     }
157 
158 protected:
159     /// True if the file has already been opened.
160     bool m_Created;
161 
162     /// Underlying 'output file' type used here.
163     typedef ofstream TFile;
164 
165     /// For convenience, a string containing one NUL character.
166     string m_Nul; // init me
167 
168     /// The default value for max_file_size.
169     /// @return The max file size used if otherwise unspecified.
x_DefaultByteLimit()170     Uint8 x_DefaultByteLimit()
171     {
172         // 1 gb (marketing version) - 1; about a billion
173         return 1000*1000*1000 - 1;
174     }
175 
176     /// This should flush any unwritten data to disk.
177     ///
178     /// This method must be implemented by derived classes to flush
179     /// any unwritten data to disk.  In the cases of sequence and
180     /// header files, it will normally do nothing, because such files
181     /// are written as the data is available.  For index (pin/nin) and
182     /// ISAM files, this method does most of the disk I/O.
183     virtual void x_Flush() = 0;
184 
185     /// Build the filename for this file.
186     void x_MakeFileName();
187 
188     // Configuration
189 
190     string m_BaseName;    ///< Database base name for all files.
191     string m_Extension;   ///< File extension for this file.
192     int    m_Index;       ///< Volume index.
193     unsigned int    m_Offset;      ///< Stream position.
194     Uint8  m_MaxFileSize; ///< Maximum file size in bytes.
195 
196     // The file
197 
198     bool   m_UseIndex; ///< True if filenames should use volume index.
199     string m_Fname;    ///< Current filename for output file.
200     TFile  m_RealFile; ///< Actual stream implementing the output file.
201 };
202 
203 // For index file format, see .cpp file.
204 
205 /// This class builds the volume index file (pin or nin).
206 class CWriteDB_IndexFile : public CWriteDB_File {
207 public:
208     /// Constructor.
209     /// @param dbname Database base name.
210     /// @param protein True for protein volumes.
211     /// @param title Database title string.
212     /// @param date Timestamp of database construction start.
213     /// @param index Index of this volume.
214     /// @param max_file_size Maximum file size in bytes (or zero).
215     CWriteDB_IndexFile(const string & dbname,
216                        bool           protein,
217                        const string & title,
218                        const string & date,
219                        int            index,
220                        Uint8          max_file_size,
221                        EBlastDbVersion    dbver = eBDB_Version4);
222 
223     /// Returns true if another sequence can fit into the file.
CanFit()224     bool CanFit()
225     {
226         _ASSERT(m_MaxFileSize > 1024UL);
227 
228         if (m_OIDs == 0)
229             return true;
230 
231         return m_DataSize < (m_MaxFileSize - 12UL);
232     }
233 
234     /// Add a sequence to a protein index file (pin).
235     ///
236     /// The index file does not need sequence data, so this method
237     /// only needs offsets of the data in other files.
238     ///
239     /// @param Sequence length in letters.
240     /// @param hdr Length of binary ASN.1 header data.
241     /// @param seq Length in bytes of sequence data.
AddSequence(int length,unsigned int hdr,unsigned int seq)242     void AddSequence(int length, unsigned int hdr, unsigned int seq)
243     {
244         if (length > m_MaxLength) {
245             m_MaxLength = length;
246         }
247 
248         m_OIDs++;
249         m_Letters += length;
250         m_DataSize += 8;
251 
252         m_Hdr.push_back(hdr);
253         m_Seq.push_back(seq);
254     }
255 
256     /// Add a sequence to a nucleotide index file (nin).
257     ///
258     /// The index file does not need sequence data, so this method
259     /// only needs offsets of the data in other files.
260     ///
261     /// @param Sequence length in letters.
262     /// @param hdr Length of binary ASN.1 header data.
263     /// @param seq Length in bytes of packed sequence data.
264     /// @param amb Length in bytes of packed ambiguity data.
AddSequence(int length,unsigned int hdr,unsigned int seq,unsigned int amb)265     void AddSequence(int length, unsigned int hdr, unsigned int seq, unsigned int amb)
266     {
267         if (length > m_MaxLength) {
268             m_MaxLength = length;
269         }
270 
271         m_OIDs++;
272         m_Letters += length;
273 
274         m_DataSize += 12;
275         m_Hdr.push_back(hdr);
276         m_Seq.push_back(amb); // Not a bug.
277         m_Amb.push_back(seq); // Also not a bug.
278     }
279 
280 private:
281     /// Compute index file overhead.  This is the overhead used by all
282     /// fields of the index file, and does account for padding.
283     /// (version 5)
284     ///
285     /// @param T Title string.
286     /// @param LMDB file name string.
287     /// @param D Create time string.
288     /// @return Combined size of all meta-data fields in nin/pin file.
289     int x_Overhead(const string & T, const string & lmdbName, const string & D);
290 
291     /// Compute index file overhead.  This is the overhead used by all
292     /// fields of the index file, and does account for padding.
293     /// (version 4)
294     ///
295     /// @param T Title string.
296     /// @param D Create time string.
297     /// @return Combined size of all meta-data fields in nin/pin file.
298     int x_Overhead(const string & T, const string & D);
299 
300     /// Flush index data to disk.
301     virtual void x_Flush();
302 
303     /// Form name of LMDB database file.
304     const string x_MakeLmdbName();
305 
306     bool   m_Protein;   ///< True if this is a protein database.
307     string m_Title;     ///< Title string for all database volumes.
308     string m_Date;      ///< Database creation time stamp.
309     int    m_OIDs;      ///< OIDs added to database so far.
310     int    m_Overhead;  ///< Amount of file used by metadata.
311     Uint8  m_DataSize;  ///< Required space for data once written to disk.
312     Uint8  m_Letters;   ///< Letters of sequence data accumulated so far.
313     int    m_MaxLength; ///< Length of longest sequence.
314 
315     // Because the lengths are found via "next offset - this offset",
316     // each array has an extra element.  (This is not necesary in the
317     // case of m_Amb; the last element is never examined because of
318     // the alternation of sequences and ambiguities.)
319 
320     /// Start offset in header file of each OID's headers.
321     ///
322     /// The end offset is given by the start offset of the following
323     /// OID's headers.
324     vector<unsigned int> m_Hdr;
325 
326     /// Offset in sequence file of each OID's sequence data.
327     ///
328     /// The end of the sequence data is given by the start offset of
329     /// the ambiguity data for the same OID.
330     vector<unsigned int> m_Seq;
331 
332     /// Offset in sequence file of each OID's ambiguity data.
333     ///
334     /// The end of the ambiguity data is given by the start offset of
335     /// the sequence data for the next OID.
336     vector<unsigned int> m_Amb;
337 
338     EBlastDbVersion    m_Version;     ///< BLASTDB version (4 or 5).
339 };
340 
341 /// This class builds the volume header file (phr or nhr).
342 class CWriteDB_HeaderFile : public CWriteDB_File {
343 public:
344     /// Constructor.
345     /// @param dbname Database base name.
346     /// @param protein True for protein volumes.
347     /// @param index Index of this volume.
348     /// @param max_file_size Maximum file size in bytes (or zero).
349     CWriteDB_HeaderFile(const string & dbname,
350                         bool           protein,
351                         int            index,
352                         Uint8          max_file_size);
353 
354     /// Returns true if the specified amount of data would fit.
355     ///
356     /// If the specified amount of data (in bytes) would fit in the
357     /// file without exceeding the max_file_size, this method returns
358     /// true.
359     ///
360     /// @param size Size of new data in bytes.
CanFit(int size)361     bool CanFit(int size)
362     {
363         _ASSERT(size >= 0);
364 
365         if (m_DataSize == 0UL) {
366             return true;
367         }
368 
369         return (m_DataSize + (Uint8) size) < m_MaxFileSize;
370     }
371 
372     /// Add binary header data to this file.
373     /// @param binhdr Binary ASN.1 version of header data. [in]
374     /// @param offset Offset of end of header data. [out]
AddSequence(const string & binhdr,unsigned int & offset)375     void AddSequence(const string & binhdr, unsigned int & offset)
376     {
377         m_DataSize = offset = Write(binhdr);
378     }
379 
380 private:
381     /// Flush unwritten data to the output file.
x_Flush()382     virtual void x_Flush()
383     {
384         // There is nothing to do here - header data is written as
385         // soon as it is added.
386     }
387 
388     /// Amount of data written so far.
389     Uint8 m_DataSize;
390 };
391 
392 class CWriteDB_SequenceFile : public CWriteDB_File {
393 public:
394     /// Constructor.
395     /// @param dbname Database base name.
396     /// @param protein True for protein volumes.
397     /// @param index Index of this volume.
398     /// @param max_file_size Maximum file size in bytes (or zero).
399     /// @param max_letter Maximum sequence letters per volume (or zero).
400     CWriteDB_SequenceFile(const string & dbname,
401                           bool           protein,
402                           int            index,
403                           Uint8          max_file_size,
404                           Uint8          max_letters);
405 
406     /// Returns true if the specified amount of data would fit.
407     ///
408     /// If the specified amount of data (in bytes) would fit in the
409     /// file without exceeding the max_file_size, and the specified
410     /// number of letters would fit without exceeding the maximum
411     /// letters limit, this method returns true.
412     ///
413     /// @param size Size of new data in bytes.
414     /// @param letters Number of sequence letters in new data.
CanFit(int size,int letters)415     bool CanFit(int size, int letters)
416     {
417         _ASSERT(size >= 0);
418         _ASSERT(letters >= 0);
419 
420         if (m_Offset <= 1) {
421             return true;
422         }
423 
424         if ((m_BaseLimit != 0)  &&
425             ((m_Letters + (Uint8) letters) > m_BaseLimit)) {
426             return false;
427         }
428 
429         return ((Uint8)(m_Offset + size) < m_MaxFileSize);
430     }
431 
432     /// Add a protein sequence to this file.
433     ///
434     /// This method should only be called in the protein case.
435     ///
436     /// @param sequence Packed sequence data. [in]
437     /// @param offset Offset of the end of the sequence data. [out]
438     /// @param length Length of the sequence in letters. [in]
AddSequence(const string & sequence,unsigned int & offset,int length)439     void AddSequence(const string & sequence,
440                      unsigned int & offset,
441                      int            length)
442     {
443 #ifdef _DEBUG
444         _ASSERT(m_Protein);
445 #endif
446         offset = WriteWithNull(sequence);
447         m_Letters += length;
448     }
449 
450     /// Add a nucleotide sequence to this file.
451     ///
452     /// This method should only be called in the nucleotide case.
453     ///
454     /// @param sequence Packed sequence data. [in]
455     /// @param ambig Packed ambiguity data. [in]
456     /// @param off_seq Offset of the end of the sequence data. [out]
457     /// @param off_amb Offset of the end of the ambiguity data. [out]
458     /// @param length Length of the sequence in letters. [in]
AddSequence(const string & sequence,const string & ambig,unsigned int & off_seq,unsigned int & off_amb,int length)459     void AddSequence(const string & sequence,
460                      const string & ambig,
461                      unsigned int & off_seq,
462                      unsigned int & off_amb,
463                      int          length)
464     {
465 #ifdef _DEBUG
466         _ASSERT(! m_Protein);
467 #endif
468         off_seq = Write(sequence);
469         off_amb = Write(ambig);
470         m_Letters += length;
471     }
472 
473 private:
474     /// Flush unwritten data to the output file.
x_Flush()475     virtual void x_Flush()
476     {
477         // There is nothing to do here - sequence data is written as
478         // soon as it is added.
479     }
480 
481     Uint8 m_Letters;   ///< Letters of sequence data added so far.
482     Uint8 m_BaseLimit; ///< Limit on letters of sequence data.
483 #ifdef _DEBUG
484     bool  m_Protein;   ///< True if this is a protein database.
485 #endif
486 };
487 
488 END_NCBI_SCOPE
489 
490 
491 #endif // OBJTOOLS_WRITERS_WRITEDB__WRITEDB_FILES_HPP
492 
493