1 #ifndef OBJTOOLS_WRITERS_WRITEDB__WRITEDB_VOLUME_HPP 2 #define OBJTOOLS_WRITERS_WRITEDB__WRITEDB_VOLUME_HPP 3 4 /* $Id: writedb_volume.hpp 553487 2017-12-18 14:23:38Z fongah2 $ 5 * =========================================================================== 6 * 7 * PUBLIC DOMAIN NOTICE 8 * National Center for Biotechnology Information 9 * 10 * This software/database is a "United States Government Work" under the 11 * terms of the United States Copyright Act. It was written as part of 12 * the author's official duties as a United States Government employee and 13 * thus cannot be copyrighted. This software/database is freely available 14 * to the public for use. The National Library of Medicine and the U.S. 15 * Government have not placed any restriction on its use or reproduction. 16 * 17 * Although all reasonable efforts have been taken to ensure the accuracy 18 * and reliability of the software and data, the NLM and the U.S. 19 * Government do not and cannot warrant the performance or results that 20 * may be obtained by using this software or data. The NLM and the U.S. 21 * Government disclaim all warranties, express or implied, including 22 * warranties of performance, merchantability or fitness for any particular 23 * purpose. 24 * 25 * Please cite the author in any work or product based on this material. 26 * 27 * =========================================================================== 28 * 29 * Author: Kevin Bealer 30 * 31 */ 32 33 /// @file writedb_volume.hpp 34 /// Code for database volume construction. 35 /// 36 /// Defines classes: 37 /// CWriteDBVolume 38 /// 39 /// Implemented for: UNIX, MS-Windows 40 41 #include <objtools/blast/seqdb_writer/writedb.hpp> 42 #include <objects/seq/seq__.hpp> 43 #include <objtools/blast/seqdb_writer/writedb_files.hpp> 44 #include <objtools/blast/seqdb_writer/writedb_isam.hpp> 45 #include "writedb_column.hpp" 46 47 BEGIN_NCBI_SCOPE 48 49 /// Import definitions from the objects namespace. 50 USING_SCOPE(objects); 51 52 /// CWriteDB_GiIndex class 53 /// 54 /// This class creates OID->GI lookup file 55 class CWriteDB_GiIndex : public CWriteDB_File { 56 public: CWriteDB_GiIndex(const string & dbname,bool protein,int index,Uint8 max_fsize)57 CWriteDB_GiIndex(const string & dbname, 58 bool protein, 59 int index, 60 Uint8 max_fsize) 61 : CWriteDB_File (dbname, (protein ? "pog" : "nog"), index, max_fsize, false){ } 62 ~CWriteDB_GiIndex()63 ~CWriteDB_GiIndex() { }; 64 AddGi(TGi gi)65 void AddGi(TGi gi) { 66 m_Gi.push_back(gi); 67 } 68 69 private: x_Flush()70 void x_Flush() { 71 72 Int4 nGi = m_Gi.size(); 73 74 if (! nGi) return; 75 76 Create(); 77 WriteInt4(kVersion); 78 WriteInt4(kFileType); 79 WriteInt4(kGiSize); 80 WriteInt4(nGi); 81 82 for (Int4 i=0; i<4; i++) { 83 WriteInt4(0); 84 } 85 86 for (Int4 i=0; i<nGi; i++) { 87 WriteInt4(GI_TO(Int4, m_Gi[i])); 88 } 89 90 vector<TGi> tmp; 91 m_Gi.swap(tmp); 92 } 93 94 static const int kVersion = 1; 95 static const int kFileType = 0; 96 static const int kGiSize = 4; 97 vector<TGi> m_Gi; 98 }; 99 100 101 /// CWriteDB_Volume class 102 /// 103 /// This manufactures a blast database volume from sequences. 104 105 class CWriteDB_Volume : public CObject { 106 public: 107 /// Whether and what kind of indices to build. 108 typedef CWriteDB::EIndexType EIndexType; 109 110 /// Type used for lists of identifiers. 111 typedef vector< CRef<CSeq_id> > TIdList; 112 113 /// Type used for lists of identifiers. 114 typedef vector< CRef<CBlastDbBlob> > TBlobList; 115 116 // Setup and control 117 118 /// Build a database volume. 119 /// 120 /// @param dbname Base name of the database, such as 'nr'. 121 /// @param protein True if the database is a protein database. 122 /// @param title Title of the database. 123 /// @param date Creation date of the database. 124 /// @param index Volume index (for filename). 125 /// @param max_file_size Maximum file size for this volume. 126 /// @param max_letters Maximum number of letters for this volume. 127 /// @param indices Type of indices to build. 128 CWriteDB_Volume(const string & dbname, 129 bool protein, 130 const string & title, 131 const string & date, 132 int index, 133 Uint8 max_file_size, 134 Uint8 max_letters, 135 EIndexType indices, 136 EBlastDbVersion dbver = eBDB_Version4); 137 138 139 /// Destructor. 140 /// 141 /// The Close() method will be called if it has not already been. 142 ~CWriteDB_Volume(); 143 144 /// Add a sequence to this volume. 145 /// 146 /// The provided data represents all information for one 147 /// non-redundant sequence that will be added to this volume. 148 /// 149 /// @param seq Sequence data in format ncbi2na or ncbistdaa. 150 /// @param ambig Ambiguities (for protein this should be empty). 151 /// @param binhdr Binary headers (blast deflines in binary ASN.1). 152 /// @param ids List of identifiers for ISAM construction. 153 /// @param pig PIG protein identifier (zero if not available.) 154 /// @param hash Sequence Hash (zero if not available.) 155 bool WriteSequence(const string & seq, 156 const string & ambig, 157 const string & binhdr, 158 const TIdList & ids, 159 int pig, 160 int hash, 161 const TBlobList & blobs, 162 int maskcol_id=-1); 163 164 /// Rename all volumes files to single-volume names. 165 /// 166 /// When volume component files are generated by WriteDB, the 167 /// volume names include a volume index. This method renames the 168 /// generated files for this volume to names that do not include 169 /// the volume index. This method should not be called until the 170 /// volume is Close()s. 171 void RenameSingle(); 172 173 /// Close the volume. 174 /// 175 /// This method finalizes and closes all files associated with 176 /// this volume. (This is not a trivial operation, because ISAM 177 /// indices and the index file (pin or nin) cannot be written 178 /// until all of the data has been seen.) 179 void Close(); 180 181 /// Get the name of the volume. 182 /// 183 /// The volume name includes the path and version (if a version is 184 /// used) but does not include the extension. It is the name that 185 /// would be provided to SeqDB to open this volume. This method 186 /// should be called after RenameSingle() if that method is going 187 /// to be called. 188 /// 189 /// @return The volume name. GetVolumeName() const190 const string & GetVolumeName() const 191 { 192 return m_VolName; 193 } 194 195 /// Get the current OID of the volume. 196 /// 197 /// The current OID is needed for generating BL_ORD_ID. 198 /// 199 /// @return the OID GetOID() const200 const int & GetOID() const 201 { 202 return m_OID; 203 } 204 205 /// List all files associated with this volume. 206 /// @param files The filenames will be appended to this vector. 207 void ListFiles(vector<string> & files) const; 208 209 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \ 210 (!defined(NCBI_COMPILER_MIPSPRO)) ) 211 /// Type used for database column meta-data. 212 typedef CWriteDB_Column::TColumnMeta TColumnMeta; 213 214 /// Create a new database column. 215 /// 216 /// @param title The title of the new column. 217 /// @param meta Metadata to store in the new column. 218 /// @param max_sz max file size. 219 /// @return The numeric column ID. 220 int CreateColumn(const string & title, 221 const TColumnMeta & meta, 222 Uint8 max_sz, 223 bool mbo = true); 224 225 /// Add meta data to a column. 226 /// 227 /// In addition to normal blob data, database columns can store a 228 /// `dictionary' of user-defined metadata in key/value form. This 229 /// method adds one such key/value pair to the column. Specifying 230 /// a key a second time causes replacement of the previous value. 231 /// Using this mechanism to store large amounts of data may have a 232 /// negative impact on performance. 233 /// 234 /// @param col_id Specifies the column to add this metadata to. 235 /// @param key A unique key string. 236 /// @param value A value string. 237 void AddColumnMetaData(int col_id, 238 const string & key, 239 const string & value); 240 #endif 241 242 private: 243 // Configuration. 244 245 string m_DbName; ///< Base name of the database. 246 string m_VolName; ///< Database name plus version (if used). 247 bool m_Protein; ///< True for protein; false for nucleotide. 248 string m_Title; ///< Database title (same for all volumes). 249 string m_Date; ///< Construct time (same for all volumes). 250 int m_Index; ///< Index of this volume (1 based). 251 EIndexType m_Indices; ///< Indices are sparse, full, or disabled. 252 EBlastDbVersion m_DbVersion; ///< Blast DB version 253 254 // Status. 255 256 int m_OID; ///< Next assigned OID. 257 bool m_Open; ///< True if user can still append sequences. 258 259 // Components 260 261 CRef<CWriteDB_IndexFile> m_Idx; ///< Index file (pin / nin). 262 CRef<CWriteDB_HeaderFile> m_Hdr; ///< Header file (phr / nhr). 263 CRef<CWriteDB_SequenceFile> m_Seq; ///< Sequence file (psq / nsq). 264 265 CRef<CWriteDB_Isam> m_AccIsam; ///< Accession index (psi+psd / nsi+nsd). 266 CRef<CWriteDB_Isam> m_GiIsam; ///< GI index (pni+pnd / nni+nnd). 267 CRef<CWriteDB_Isam> m_PigIsam; ///< PIG index (ppi+ppd, protein only). 268 CRef<CWriteDB_Isam> m_TraceIsam; ///< Trace ID index (pti+ptd or nti+ntd). 269 CRef<CWriteDB_Isam> m_HashIsam; ///< Hash index (phi+phd or nhi+nhd). 270 CRef<CWriteDB_GiIndex> m_GiIndex;///< OID->GI lookup (pgx or ngx). 271 272 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \ 273 (!defined(NCBI_COMPILER_MIPSPRO)) ) 274 /// Database columns. 275 vector< CRef<CWriteDB_Column> > m_Columns; 276 #endif 277 278 /// Included Seq_ids 279 set<string> m_IdSet; 280 281 // Functions 282 283 /// Compute base-length of compressed nucleotide sequence. 284 /// 285 /// Nucleotide sequences stored on disk are packed 4 bases to a byte, 286 /// except for the last byte. That byte has 0-3 bases of real sequence 287 /// data plus a 'remainder' value (from 0-3) that indicates how many of 288 /// the bases of the last byte are sequence data. This method finds the 289 /// exact length in bases for a nucleotide sequence packed in this way. 290 /// 291 /// @param seq Ncbi2na sequence with length remainder encoding. 292 /// @return Length in bases of actual sequence data in this sequence. 293 int x_FindNuclLength(const string & seq); 294 }; 295 296 END_NCBI_SCOPE 297 298 #endif // OBJTOOLS_WRITERS_WRITEDB__WRITEDB_VOLUME_HPP 299 300