1 #ifndef OBJTOOLS_WRITERS_WRITEDB__WRITEDB_VOLUME_HPP
2 #define OBJTOOLS_WRITERS_WRITEDB__WRITEDB_VOLUME_HPP
3 
4 /*  $Id: writedb_volume.hpp 553487 2017-12-18 14:23:38Z fongah2 $
5  * ===========================================================================
6  *
7  *                            PUBLIC DOMAIN NOTICE
8  *               National Center for Biotechnology Information
9  *
10  *  This software/database is a "United States Government Work" under the
11  *  terms of the United States Copyright Act.  It was written as part of
12  *  the author's official duties as a United States Government employee and
13  *  thus cannot be copyrighted.  This software/database is freely available
14  *  to the public for use. The National Library of Medicine and the U.S.
15  *  Government have not placed any restriction on its use or reproduction.
16  *
17  *  Although all reasonable efforts have been taken to ensure the accuracy
18  *  and reliability of the software and data, the NLM and the U.S.
19  *  Government do not and cannot warrant the performance or results that
20  *  may be obtained by using this software or data. The NLM and the U.S.
21  *  Government disclaim all warranties, express or implied, including
22  *  warranties of performance, merchantability or fitness for any particular
23  *  purpose.
24  *
25  *  Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Author:  Kevin Bealer
30  *
31  */
32 
33 /// @file writedb_volume.hpp
34 /// Code for database volume construction.
35 ///
36 /// Defines classes:
37 ///     CWriteDBVolume
38 ///
39 /// Implemented for: UNIX, MS-Windows
40 
41 #include <objtools/blast/seqdb_writer/writedb.hpp>
42 #include <objects/seq/seq__.hpp>
43 #include <objtools/blast/seqdb_writer/writedb_files.hpp>
44 #include <objtools/blast/seqdb_writer/writedb_isam.hpp>
45 #include "writedb_column.hpp"
46 
47 BEGIN_NCBI_SCOPE
48 
49 /// Import definitions from the objects namespace.
50 USING_SCOPE(objects);
51 
52 /// CWriteDB_GiIndex class
53 ///
54 /// This class creates OID->GI lookup file
55 class CWriteDB_GiIndex : public CWriteDB_File {
56 public:
CWriteDB_GiIndex(const string & dbname,bool protein,int index,Uint8 max_fsize)57     CWriteDB_GiIndex(const string & dbname,
58                      bool           protein,
59                      int            index,
60                      Uint8          max_fsize)
61     : CWriteDB_File  (dbname, (protein ? "pog" : "nog"), index, max_fsize, false){ }
62 
~CWriteDB_GiIndex()63     ~CWriteDB_GiIndex() { };
64 
AddGi(TGi gi)65     void AddGi(TGi gi) {
66         m_Gi.push_back(gi);
67     }
68 
69 private:
x_Flush()70     void x_Flush() {
71 
72         Int4 nGi = m_Gi.size();
73 
74         if (! nGi) return;
75 
76         Create();
77         WriteInt4(kVersion);
78         WriteInt4(kFileType);
79         WriteInt4(kGiSize);
80         WriteInt4(nGi);
81 
82         for (Int4 i=0; i<4; i++) {
83             WriteInt4(0);
84         }
85 
86         for (Int4 i=0; i<nGi; i++) {
87             WriteInt4(GI_TO(Int4, m_Gi[i]));
88         }
89 
90         vector<TGi> tmp;
91         m_Gi.swap(tmp);
92     }
93 
94     static const int kVersion = 1;
95     static const int kFileType = 0;
96     static const int kGiSize = 4;
97     vector<TGi> m_Gi;
98 };
99 
100 
101 /// CWriteDB_Volume class
102 ///
103 /// This manufactures a blast database volume from sequences.
104 
105 class CWriteDB_Volume : public CObject {
106 public:
107     /// Whether and what kind of indices to build.
108     typedef CWriteDB::EIndexType EIndexType;
109 
110     /// Type used for lists of identifiers.
111     typedef vector< CRef<CSeq_id> > TIdList;
112 
113     /// Type used for lists of identifiers.
114     typedef vector< CRef<CBlastDbBlob> > TBlobList;
115 
116     // Setup and control
117 
118     /// Build a database volume.
119     ///
120     /// @param dbname Base name of the database, such as 'nr'.
121     /// @param protein True if the database is a protein database.
122     /// @param title Title of the database.
123     /// @param date Creation date of the database.
124     /// @param index Volume index (for filename).
125     /// @param max_file_size Maximum file size for this volume.
126     /// @param max_letters Maximum number of letters for this volume.
127     /// @param indices Type of indices to build.
128     CWriteDB_Volume(const string     & dbname,
129                     bool               protein,
130                     const string     & title,
131                     const string     & date,
132                     int                index,
133                     Uint8              max_file_size,
134                     Uint8              max_letters,
135                     EIndexType         indices,
136                     EBlastDbVersion dbver = eBDB_Version4);
137 
138 
139     /// Destructor.
140     ///
141     /// The Close() method will be called if it has not already been.
142     ~CWriteDB_Volume();
143 
144     /// Add a sequence to this volume.
145     ///
146     /// The provided data represents all information for one
147     /// non-redundant sequence that will be added to this volume.
148     ///
149     /// @param seq Sequence data in format ncbi2na or ncbistdaa.
150     /// @param ambig Ambiguities (for protein this should be empty).
151     /// @param binhdr Binary headers (blast deflines in binary ASN.1).
152     /// @param ids List of identifiers for ISAM construction.
153     /// @param pig PIG protein identifier (zero if not available.)
154     /// @param hash Sequence Hash (zero if not available.)
155     bool WriteSequence(const string    & seq,
156                        const string    & ambig,
157                        const string    & binhdr,
158                        const TIdList   & ids,
159                        int               pig,
160                        int               hash,
161                        const TBlobList & blobs,
162                        int               maskcol_id=-1);
163 
164     /// Rename all volumes files to single-volume names.
165     ///
166     /// When volume component files are generated by WriteDB, the
167     /// volume names include a volume index.  This method renames the
168     /// generated files for this volume to names that do not include
169     /// the volume index.  This method should not be called until the
170     /// volume is Close()s.
171     void RenameSingle();
172 
173     /// Close the volume.
174     ///
175     /// This method finalizes and closes all files associated with
176     /// this volume.  (This is not a trivial operation, because ISAM
177     /// indices and the index file (pin or nin) cannot be written
178     /// until all of the data has been seen.)
179     void Close();
180 
181     /// Get the name of the volume.
182     ///
183     /// The volume name includes the path and version (if a version is
184     /// used) but does not include the extension.  It is the name that
185     /// would be provided to SeqDB to open this volume.  This method
186     /// should be called after RenameSingle() if that method is going
187     /// to be called.
188     ///
189     /// @return The volume name.
GetVolumeName() const190     const string & GetVolumeName() const
191     {
192         return m_VolName;
193     }
194 
195     /// Get the current OID of the volume.
196     ///
197     /// The current OID is needed for generating BL_ORD_ID.
198     ///
199     /// @return the OID
GetOID() const200     const int & GetOID() const
201     {
202         return m_OID;
203     }
204 
205     /// List all files associated with this volume.
206     /// @param files The filenames will be appended to this vector.
207     void ListFiles(vector<string> & files) const;
208 
209 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION  > 550)) && \
210      (!defined(NCBI_COMPILER_MIPSPRO)) )
211     /// Type used for database column meta-data.
212     typedef CWriteDB_Column::TColumnMeta TColumnMeta;
213 
214     /// Create a new database column.
215     ///
216     /// @param title The title of the new column.
217     /// @param meta Metadata to store in the new column.
218     /// @param max_sz max file size.
219     /// @return The numeric column ID.
220     int CreateColumn(const string      & title,
221                      const TColumnMeta & meta,
222                      Uint8               max_sz,
223                      bool                mbo = true);
224 
225     /// Add meta data to a column.
226     ///
227     /// In addition to normal blob data, database columns can store a
228     /// `dictionary' of user-defined metadata in key/value form.  This
229     /// method adds one such key/value pair to the column.  Specifying
230     /// a key a second time causes replacement of the previous value.
231     /// Using this mechanism to store large amounts of data may have a
232     /// negative impact on performance.
233     ///
234     /// @param col_id Specifies the column to add this metadata to.
235     /// @param key    A unique key string.
236     /// @param value  A value string.
237     void AddColumnMetaData(int            col_id,
238                            const string & key,
239                            const string & value);
240 #endif
241 
242 private:
243     // Configuration.
244 
245     string           m_DbName;      ///< Base name of the database.
246     string           m_VolName;     ///< Database name plus version (if used).
247     bool             m_Protein;     ///< True for protein; false for nucleotide.
248     string           m_Title;       ///< Database title (same for all volumes).
249     string           m_Date;        ///< Construct time (same for all volumes).
250     int              m_Index;       ///< Index of this volume (1 based).
251     EIndexType       m_Indices;     ///< Indices are sparse, full, or disabled.
252     EBlastDbVersion  m_DbVersion;   ///< Blast DB version
253 
254     // Status.
255 
256     int  m_OID;  ///< Next assigned OID.
257     bool m_Open; ///< True if user can still append sequences.
258 
259     // Components
260 
261     CRef<CWriteDB_IndexFile>    m_Idx; ///< Index file (pin / nin).
262     CRef<CWriteDB_HeaderFile>   m_Hdr; ///< Header file (phr / nhr).
263     CRef<CWriteDB_SequenceFile> m_Seq; ///< Sequence file (psq / nsq).
264 
265     CRef<CWriteDB_Isam> m_AccIsam;   ///< Accession index (psi+psd / nsi+nsd).
266     CRef<CWriteDB_Isam> m_GiIsam;    ///< GI index (pni+pnd / nni+nnd).
267     CRef<CWriteDB_Isam> m_PigIsam;   ///< PIG index (ppi+ppd, protein only).
268     CRef<CWriteDB_Isam> m_TraceIsam; ///< Trace ID index (pti+ptd or nti+ntd).
269     CRef<CWriteDB_Isam> m_HashIsam;  ///< Hash index (phi+phd or nhi+nhd).
270     CRef<CWriteDB_GiIndex> m_GiIndex;///< OID->GI lookup (pgx or ngx).
271 
272 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION  > 550)) && \
273      (!defined(NCBI_COMPILER_MIPSPRO)) )
274     /// Database columns.
275     vector< CRef<CWriteDB_Column> > m_Columns;
276 #endif
277 
278     /// Included Seq_ids
279     set<string> m_IdSet;
280 
281     // Functions
282 
283     /// Compute base-length of compressed nucleotide sequence.
284     ///
285     /// Nucleotide sequences stored on disk are packed 4 bases to a byte,
286     /// except for the last byte.  That byte has 0-3 bases of real sequence
287     /// data plus a 'remainder' value (from 0-3) that indicates how many of
288     /// the bases of the last byte are sequence data.  This method finds the
289     /// exact length in bases for a nucleotide sequence packed in this way.
290     ///
291     /// @param seq Ncbi2na sequence with length remainder encoding.
292     /// @return Length in bases of actual sequence data in this sequence.
293     int x_FindNuclLength(const string & seq);
294 };
295 
296 END_NCBI_SCOPE
297 
298 #endif // OBJTOOLS_WRITERS_WRITEDB__WRITEDB_VOLUME_HPP
299 
300