1 #ifndef OBJTOOLS_WRITERS_WRITEDB__WRITEDB_FILES_HPP 2 #define OBJTOOLS_WRITERS_WRITEDB__WRITEDB_FILES_HPP 3 4 /* $Id: writedb_files.hpp 553715 2017-12-20 18:37:47Z vakatov $ 5 * =========================================================================== 6 * 7 * PUBLIC DOMAIN NOTICE 8 * National Center for Biotechnology Information 9 * 10 * This software/database is a "United States Government Work" under the 11 * terms of the United States Copyright Act. It was written as part of 12 * the author's official duties as a United States Government employee and 13 * thus cannot be copyrighted. This software/database is freely available 14 * to the public for use. The National Library of Medicine and the U.S. 15 * Government have not placed any restriction on its use or reproduction. 16 * 17 * Although all reasonable efforts have been taken to ensure the accuracy 18 * and reliability of the software and data, the NLM and the U.S. 19 * Government do not and cannot warrant the performance or results that 20 * may be obtained by using this software or data. The NLM and the U.S. 21 * Government disclaim all warranties, express or implied, including 22 * warranties of performance, merchantability or fitness for any particular 23 * purpose. 24 * 25 * Please cite the author in any work or product based on this material. 26 * 27 * =========================================================================== 28 * 29 * Author: Kevin Bealer 30 * 31 */ 32 33 /// @file writedb_files.hpp 34 /// Code for database files construction. 35 /// 36 /// Defines classes: 37 /// CWriteDBHeader 38 /// 39 /// Implemented for: UNIX, MS-Windows 40 41 #include <objtools/blast/seqdb_writer/writedb_general.hpp> 42 #include <objtools/blast/seqdb_writer/writedb_convert.hpp> 43 #include <objtools/blast/seqdb_reader/seqdbcommon.hpp> 44 #include <objects/seq/seq__.hpp> 45 #include <corelib/ncbistre.hpp> 46 #include <corelib/ncbifile.hpp> 47 48 BEGIN_NCBI_SCOPE 49 50 /// Import definitions from the objects namespace. 51 USING_SCOPE(objects); 52 53 /// CWriteDB_IndexFile class 54 /// 55 /// This manufactures blast database index files from input data. 56 57 class NCBI_XOBJWRITE_EXPORT CWriteDB_File : public CObject { 58 public: 59 // Setup and control 60 61 /// Constructor. 62 /// 63 /// The filename is constructed from basename, extension, and 64 /// index, but might be changed if the RenameSingle() method is 65 /// called. If zero is specified for maximum file size, a default 66 /// size is provided by this class. The maximum file size is not 67 /// enforced by this class, instead each derived class must do its 68 /// own enforcement. 69 /// 70 /// @param basename Database base name, shared by all files. [in] 71 /// @param extension File name extension for this file. [in] 72 /// @param index Volume index used in filename. [in] 73 /// @param max_file_size File size limit (in bytes). [in] 74 /// @param always_create If true the file will be created now. [in] 75 CWriteDB_File(const string & basename, 76 const string & extension, 77 int index, 78 Uint8 max_file_size, 79 bool always_create); 80 81 /// Create and open the file. 82 /// 83 /// This method must be called before the first time that data is 84 /// written to the file. If the constructor is passed 'true' for 85 /// always_create, this method will be called during construction. 86 /// It is an error to call this method more than once (including 87 /// via the constructor) or to not call it but to call Write. The 88 /// rationale for making this explicit is to permit some files to 89 /// be created optionally, such as ISAM files, which should only 90 /// be created if the corresponding ID types are found. 91 void Create(); 92 93 /// Write contents of a string to the file. 94 /// @param data Data to write. 95 /// @return File offset after write. 96 unsigned int Write(const CTempString & data); 97 98 /// Write an Int4 (in bigendian order) to the file. 99 /// @param data String to write. 100 /// @return File offset after write. WriteInt4(int data)101 unsigned int WriteInt4(int data) 102 { 103 s_WriteInt4(m_RealFile, data); 104 m_Offset += 4; 105 return m_Offset; 106 } 107 108 /// Write an Int8 (in bigendian order) to the file. 109 /// @param data String to write. 110 /// @return File offset after write. WriteInt8(Int8 data)111 unsigned int WriteInt8(Int8 data) 112 { 113 s_WriteInt8BE(m_RealFile, data); 114 m_Offset += 8; 115 return m_Offset; 116 } 117 118 /// Write contents of a string to the file, appending a NUL. 119 /// @param data String to write. 120 /// @return File offset after write. WriteWithNull(const CTempString & data)121 unsigned int WriteWithNull(const CTempString & data) 122 { 123 Write(data); 124 return Write(m_Nul); 125 } 126 127 /// Close the file, flushing any remaining data to disk. 128 void Close(); 129 130 /// Rename this file, disincluding the volume index. 131 virtual void RenameSingle(); 132 133 /// Construct the short name for a volume. 134 /// 135 /// Volume names consist of the database base name, ".", and the 136 /// volume index in decimal. The volume index is normally two 137 /// digits, but if more than 100 volumes are needed, the filename 138 /// will use three or more index digits as needed. 139 /// 140 /// @param base Base name to use. 141 /// @param index Volume index. 142 /// @return A short name. 143 static string MakeShortName(const string & base, int index); 144 145 /// Get the current filename for this file. 146 /// 147 /// The filename is returned. The data returned by this method 148 /// reflects changes made by RenameSingle(), so it is probably 149 /// best to call it after that method has been called (if it will 150 /// be called). 151 /// 152 /// @return The filename. GetFilename() const153 const string & GetFilename() const 154 { 155 return m_Fname; 156 } 157 158 protected: 159 /// True if the file has already been opened. 160 bool m_Created; 161 162 /// Underlying 'output file' type used here. 163 typedef ofstream TFile; 164 165 /// For convenience, a string containing one NUL character. 166 string m_Nul; // init me 167 168 /// The default value for max_file_size. 169 /// @return The max file size used if otherwise unspecified. x_DefaultByteLimit()170 Uint8 x_DefaultByteLimit() 171 { 172 // 1 gb (marketing version) - 1; about a billion 173 return 1000*1000*1000 - 1; 174 } 175 176 /// This should flush any unwritten data to disk. 177 /// 178 /// This method must be implemented by derived classes to flush 179 /// any unwritten data to disk. In the cases of sequence and 180 /// header files, it will normally do nothing, because such files 181 /// are written as the data is available. For index (pin/nin) and 182 /// ISAM files, this method does most of the disk I/O. 183 virtual void x_Flush() = 0; 184 185 /// Build the filename for this file. 186 void x_MakeFileName(); 187 188 // Configuration 189 190 string m_BaseName; ///< Database base name for all files. 191 string m_Extension; ///< File extension for this file. 192 int m_Index; ///< Volume index. 193 unsigned int m_Offset; ///< Stream position. 194 Uint8 m_MaxFileSize; ///< Maximum file size in bytes. 195 196 // The file 197 198 bool m_UseIndex; ///< True if filenames should use volume index. 199 string m_Fname; ///< Current filename for output file. 200 TFile m_RealFile; ///< Actual stream implementing the output file. 201 }; 202 203 // For index file format, see .cpp file. 204 205 /// This class builds the volume index file (pin or nin). 206 class CWriteDB_IndexFile : public CWriteDB_File { 207 public: 208 /// Constructor. 209 /// @param dbname Database base name. 210 /// @param protein True for protein volumes. 211 /// @param title Database title string. 212 /// @param date Timestamp of database construction start. 213 /// @param index Index of this volume. 214 /// @param max_file_size Maximum file size in bytes (or zero). 215 CWriteDB_IndexFile(const string & dbname, 216 bool protein, 217 const string & title, 218 const string & date, 219 int index, 220 Uint8 max_file_size, 221 EBlastDbVersion dbver = eBDB_Version4); 222 223 /// Returns true if another sequence can fit into the file. CanFit()224 bool CanFit() 225 { 226 _ASSERT(m_MaxFileSize > 1024UL); 227 228 if (m_OIDs == 0) 229 return true; 230 231 return m_DataSize < (m_MaxFileSize - 12UL); 232 } 233 234 /// Add a sequence to a protein index file (pin). 235 /// 236 /// The index file does not need sequence data, so this method 237 /// only needs offsets of the data in other files. 238 /// 239 /// @param Sequence length in letters. 240 /// @param hdr Length of binary ASN.1 header data. 241 /// @param seq Length in bytes of sequence data. AddSequence(int length,unsigned int hdr,unsigned int seq)242 void AddSequence(int length, unsigned int hdr, unsigned int seq) 243 { 244 if (length > m_MaxLength) { 245 m_MaxLength = length; 246 } 247 248 m_OIDs++; 249 m_Letters += length; 250 m_DataSize += 8; 251 252 m_Hdr.push_back(hdr); 253 m_Seq.push_back(seq); 254 } 255 256 /// Add a sequence to a nucleotide index file (nin). 257 /// 258 /// The index file does not need sequence data, so this method 259 /// only needs offsets of the data in other files. 260 /// 261 /// @param Sequence length in letters. 262 /// @param hdr Length of binary ASN.1 header data. 263 /// @param seq Length in bytes of packed sequence data. 264 /// @param amb Length in bytes of packed ambiguity data. AddSequence(int length,unsigned int hdr,unsigned int seq,unsigned int amb)265 void AddSequence(int length, unsigned int hdr, unsigned int seq, unsigned int amb) 266 { 267 if (length > m_MaxLength) { 268 m_MaxLength = length; 269 } 270 271 m_OIDs++; 272 m_Letters += length; 273 274 m_DataSize += 12; 275 m_Hdr.push_back(hdr); 276 m_Seq.push_back(amb); // Not a bug. 277 m_Amb.push_back(seq); // Also not a bug. 278 } 279 280 private: 281 /// Compute index file overhead. This is the overhead used by all 282 /// fields of the index file, and does account for padding. 283 /// (version 5) 284 /// 285 /// @param T Title string. 286 /// @param LMDB file name string. 287 /// @param D Create time string. 288 /// @return Combined size of all meta-data fields in nin/pin file. 289 int x_Overhead(const string & T, const string & lmdbName, const string & D); 290 291 /// Compute index file overhead. This is the overhead used by all 292 /// fields of the index file, and does account for padding. 293 /// (version 4) 294 /// 295 /// @param T Title string. 296 /// @param D Create time string. 297 /// @return Combined size of all meta-data fields in nin/pin file. 298 int x_Overhead(const string & T, const string & D); 299 300 /// Flush index data to disk. 301 virtual void x_Flush(); 302 303 /// Form name of LMDB database file. 304 const string x_MakeLmdbName(); 305 306 bool m_Protein; ///< True if this is a protein database. 307 string m_Title; ///< Title string for all database volumes. 308 string m_Date; ///< Database creation time stamp. 309 int m_OIDs; ///< OIDs added to database so far. 310 int m_Overhead; ///< Amount of file used by metadata. 311 Uint8 m_DataSize; ///< Required space for data once written to disk. 312 Uint8 m_Letters; ///< Letters of sequence data accumulated so far. 313 int m_MaxLength; ///< Length of longest sequence. 314 315 // Because the lengths are found via "next offset - this offset", 316 // each array has an extra element. (This is not necesary in the 317 // case of m_Amb; the last element is never examined because of 318 // the alternation of sequences and ambiguities.) 319 320 /// Start offset in header file of each OID's headers. 321 /// 322 /// The end offset is given by the start offset of the following 323 /// OID's headers. 324 vector<unsigned int> m_Hdr; 325 326 /// Offset in sequence file of each OID's sequence data. 327 /// 328 /// The end of the sequence data is given by the start offset of 329 /// the ambiguity data for the same OID. 330 vector<unsigned int> m_Seq; 331 332 /// Offset in sequence file of each OID's ambiguity data. 333 /// 334 /// The end of the ambiguity data is given by the start offset of 335 /// the sequence data for the next OID. 336 vector<unsigned int> m_Amb; 337 338 EBlastDbVersion m_Version; ///< BLASTDB version (4 or 5). 339 }; 340 341 /// This class builds the volume header file (phr or nhr). 342 class CWriteDB_HeaderFile : public CWriteDB_File { 343 public: 344 /// Constructor. 345 /// @param dbname Database base name. 346 /// @param protein True for protein volumes. 347 /// @param index Index of this volume. 348 /// @param max_file_size Maximum file size in bytes (or zero). 349 CWriteDB_HeaderFile(const string & dbname, 350 bool protein, 351 int index, 352 Uint8 max_file_size); 353 354 /// Returns true if the specified amount of data would fit. 355 /// 356 /// If the specified amount of data (in bytes) would fit in the 357 /// file without exceeding the max_file_size, this method returns 358 /// true. 359 /// 360 /// @param size Size of new data in bytes. CanFit(int size)361 bool CanFit(int size) 362 { 363 _ASSERT(size >= 0); 364 365 if (m_DataSize == 0UL) { 366 return true; 367 } 368 369 return (m_DataSize + (Uint8) size) < m_MaxFileSize; 370 } 371 372 /// Add binary header data to this file. 373 /// @param binhdr Binary ASN.1 version of header data. [in] 374 /// @param offset Offset of end of header data. [out] AddSequence(const string & binhdr,unsigned int & offset)375 void AddSequence(const string & binhdr, unsigned int & offset) 376 { 377 m_DataSize = offset = Write(binhdr); 378 } 379 380 private: 381 /// Flush unwritten data to the output file. x_Flush()382 virtual void x_Flush() 383 { 384 // There is nothing to do here - header data is written as 385 // soon as it is added. 386 } 387 388 /// Amount of data written so far. 389 Uint8 m_DataSize; 390 }; 391 392 class CWriteDB_SequenceFile : public CWriteDB_File { 393 public: 394 /// Constructor. 395 /// @param dbname Database base name. 396 /// @param protein True for protein volumes. 397 /// @param index Index of this volume. 398 /// @param max_file_size Maximum file size in bytes (or zero). 399 /// @param max_letter Maximum sequence letters per volume (or zero). 400 CWriteDB_SequenceFile(const string & dbname, 401 bool protein, 402 int index, 403 Uint8 max_file_size, 404 Uint8 max_letters); 405 406 /// Returns true if the specified amount of data would fit. 407 /// 408 /// If the specified amount of data (in bytes) would fit in the 409 /// file without exceeding the max_file_size, and the specified 410 /// number of letters would fit without exceeding the maximum 411 /// letters limit, this method returns true. 412 /// 413 /// @param size Size of new data in bytes. 414 /// @param letters Number of sequence letters in new data. CanFit(int size,int letters)415 bool CanFit(int size, int letters) 416 { 417 _ASSERT(size >= 0); 418 _ASSERT(letters >= 0); 419 420 if (m_Offset <= 1) { 421 return true; 422 } 423 424 if ((m_BaseLimit != 0) && 425 ((m_Letters + (Uint8) letters) > m_BaseLimit)) { 426 return false; 427 } 428 429 return ((Uint8)(m_Offset + size) < m_MaxFileSize); 430 } 431 432 /// Add a protein sequence to this file. 433 /// 434 /// This method should only be called in the protein case. 435 /// 436 /// @param sequence Packed sequence data. [in] 437 /// @param offset Offset of the end of the sequence data. [out] 438 /// @param length Length of the sequence in letters. [in] AddSequence(const string & sequence,unsigned int & offset,int length)439 void AddSequence(const string & sequence, 440 unsigned int & offset, 441 int length) 442 { 443 #ifdef _DEBUG 444 _ASSERT(m_Protein); 445 #endif 446 offset = WriteWithNull(sequence); 447 m_Letters += length; 448 } 449 450 /// Add a nucleotide sequence to this file. 451 /// 452 /// This method should only be called in the nucleotide case. 453 /// 454 /// @param sequence Packed sequence data. [in] 455 /// @param ambig Packed ambiguity data. [in] 456 /// @param off_seq Offset of the end of the sequence data. [out] 457 /// @param off_amb Offset of the end of the ambiguity data. [out] 458 /// @param length Length of the sequence in letters. [in] AddSequence(const string & sequence,const string & ambig,unsigned int & off_seq,unsigned int & off_amb,int length)459 void AddSequence(const string & sequence, 460 const string & ambig, 461 unsigned int & off_seq, 462 unsigned int & off_amb, 463 int length) 464 { 465 #ifdef _DEBUG 466 _ASSERT(! m_Protein); 467 #endif 468 off_seq = Write(sequence); 469 off_amb = Write(ambig); 470 m_Letters += length; 471 } 472 473 private: 474 /// Flush unwritten data to the output file. x_Flush()475 virtual void x_Flush() 476 { 477 // There is nothing to do here - sequence data is written as 478 // soon as it is added. 479 } 480 481 Uint8 m_Letters; ///< Letters of sequence data added so far. 482 Uint8 m_BaseLimit; ///< Limit on letters of sequence data. 483 #ifdef _DEBUG 484 bool m_Protein; ///< True if this is a protein database. 485 #endif 486 }; 487 488 END_NCBI_SCOPE 489 490 491 #endif // OBJTOOLS_WRITERS_WRITEDB__WRITEDB_FILES_HPP 492 493