1 #ifndef OBJTOOLS_WRITERS_WRITEDB__WRITEDB_IMPL_HPP
2 #define OBJTOOLS_WRITERS_WRITEDB__WRITEDB_IMPL_HPP
3 
4 /*  $Id: writedb_impl.hpp 610974 2020-06-26 12:59:33Z grichenk $
5  * ===========================================================================
6  *
7  *                            PUBLIC DOMAIN NOTICE
8  *               National Center for Biotechnology Information
9  *
10  *  This software/database is a "United States Government Work" under the
11  *  terms of the United States Copyright Act.  It was written as part of
12  *  the author's official duties as a United States Government employee and
13  *  thus cannot be copyrighted.  This software/database is freely available
14  *  to the public for use. The National Library of Medicine and the U.S.
15  *  Government have not placed any restriction on its use or reproduction.
16  *
17  *  Although all reasonable efforts have been taken to ensure the accuracy
18  *  and reliability of the software and data, the NLM and the U.S.
19  *  Government do not and cannot warrant the performance or results that
20  *  may be obtained by using this software or data. The NLM and the U.S.
21  *  Government disclaim all warranties, express or implied, including
22  *  warranties of performance, merchantability or fitness for any particular
23  *  purpose.
24  *
25  *  Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Author:  Kevin Bealer
30  *
31  */
32 
33 /// @file writedb_impl.hpp
34 /// Defines implementation class of WriteDB.
35 ///
36 /// Defines classes:
37 ///     CWriteDB_Impl
38 ///
39 /// Implemented for: UNIX, MS-Windows
40 
41 #include <objects/seq/seq__.hpp>
42 #include <objects/blastdb/blastdb__.hpp>
43 #include <objects/blastdb/defline_extra.hpp>
44 #include <objtools/blast/seqdb_writer/writedb.hpp>
45 #include <objtools/blast/seqdb_reader/seqdbcommon.hpp>
46 #include <objtools/blast/seqdb_writer/writedb_lmdb.hpp>
47 #include "writedb_volume.hpp"
48 #include "writedb_gimask.hpp"
49 #include "mask_info_registry.hpp"
50 
51 #include <objmgr/bioseq_handle.hpp>
52 #include <objmgr/seq_vector.hpp>
53 
54 BEGIN_NCBI_SCOPE
55 
56 /// Import definitions from the objects namespace.
57 USING_SCOPE(objects);
58 
59 /// CWriteDB_Impl class
60 ///
61 /// This accumulates sequence, header, mask and column data and writes it to blast database volumes.
62 
63 class CWriteDB_Impl {
64 public:
65     /// Whether and what kind of indices to build.
66     typedef CWriteDB::EIndexType EIndexType;
67 
68     // Setup and control
69     /// Constructor.
70     /// @param dbname Name of the database to create.
71     /// @param protein True for protein, false for nucleotide.
72     /// @param title Title string for volumes and alias file.
73     /// @param indices Type of indexing to do for string IDs.
74     /// @param parse_ids If true generate ISAM files
75     /// @param long_ids If true, assume long ids (database|accession) in string ids.
76     /// @param use_gi_mask If true generate GI-based mask files.
77     /// @param dbver BLAST database version (defaults to eBDB_Version4).
78     /// @param limit_defline Undocumented here; presumably limits defline data (TODO confirm).
79     CWriteDB_Impl(const string     & dbname,
80                   bool               protein,
81                   const string     & title,
82                   EIndexType         indices,
83                   bool               parse_ids,
84                   bool               long_ids,
85                   bool               use_gi_mask,
86                   EBlastDbVersion dbver = eBDB_Version4,
87                   bool               limit_defline = false);
88 
89     /// Destructor.
90     ~CWriteDB_Impl();
91 
92     /// Close the file and flush any remaining data to disk.
93     void Close();
94 
95     // Sequence Data
96 
97     /// Add a new sequence as raw sequence and ambiguity data.
98     ///
99     /// A new sequence record is started, and data from any previous
100     /// sequence is combined and written to disk.  Each sequence needs
101     /// sequence data and header data.  This method takes sequence
102     /// data in the form of separated sequence data and compressed
103     /// ambiguities packed in the blast database disk format.  It is
104     /// intended for efficiently copying sequences from sources that
105     /// provide this format, such as CSeqDBExpert().  If this method
106     /// is used for protein data, the ambiguities string should be
107     /// empty.  If this method is used, header data must also be
108     /// specified with a call to SetDeflines().
109     ///
110     /// @param sequence Sequence data in blast db disk format.
111     /// @param ambiguities Ambiguity data in blast db disk format.
112     void AddSequence(const CTempString & sequence,
113                      const CTempString & ambiguities);
114 
115     /// Add a new sequence as a CBioseq.
116     ///
117     /// A new sequence record is started, and data from any previous
118     /// sequence is combined and written to disk.  Each sequence needs
119     /// sequence data and header data.  This method can extract both
120     /// from the provided CBioseq.  If other header data is preferred,
121     /// SetDeflines() can be called after this method to replace the
122     /// header data from the CBioseq.  Note that CBioseqs from some
123     /// sources are not guaranteed to contain sequence data; if this
124     /// might be the case, consider the versions of AddSequence that
125     /// take either CBioseq_Handle or CBioseq and CSeqVector.  In
126     /// order to use this method, sequence data should be accessible
127     /// from bs.GetInst().GetSeq_data().  (Note: objects provided to
128     /// WriteDB will be kept alive until the next AddSequence call.)
129     ///
130     /// @param bs Bioseq containing sequence and header data.
131     void AddSequence(const CBioseq & bs);
132 
133     /// Add a new sequence as a CBioseq_Handle.
134     ///
135     /// A new sequence record is started, and data from any previous
136     /// sequence is combined and written to disk.  Each sequence needs
137     /// sequence data and header data.  This method can extract both
138     /// from the provided CBioseq_Handle.  If other header data is
139     /// preferred, SetDeflines() can be called after this method to
140     /// replace the header data from the CBioseq.  (Note: objects
141     /// provided to WriteDB will be kept alive until the next
142     /// AddSequence call.)
143     ///
144     /// @param bsh Bioseq_Handle for sequence to add.
145     void AddSequence(const CBioseq_Handle & bsh);
146 
147     /// Add a new sequence as a CBioseq plus a CSeqVector.
148     ///
149     /// A new sequence record is started, and data from any previous
150     /// sequence is combined and written to disk.  Each sequence needs
151     /// sequence data and header data.  This method will extract
152     /// header data from the provided CBioseq.  If the CBioseq
153     /// contains sequence data, it will be used; otherwise sequence
154     /// data will be fetched from the provided CSeqVector.  If other
155     /// header data is preferred, SetDeflines() can be called after
156     /// this method.  (Note: objects provided to WriteDB will be kept
157     /// alive until the next AddSequence call.)
158     ///
159     /// @param bs Bioseq for header and (if present) sequence data.
160     /// @param sv CSeqVector for sequence data.
161     void AddSequence(const CBioseq & bs, CSeqVector & sv);
162 
163     /// This method replaces any stored header data for the current
164     /// sequence with the provided CBlast_def_line_set.  Header data
165     /// can be constructed directly by the caller, or extracted from
166     /// an existing CBioseq using ExtractBioseqDeflines (see below).
167     /// Once it is in the correct form, it can be attached to the
168     /// sequence with this method.  (Note: objects provided to WriteDB
169     /// will be kept alive until the next AddSequence call.)
170     ///
171     /// @param deflines Header data for the most recent sequence.
172     void SetDeflines(const CBlast_def_line_set & deflines);
173 
174     /// Set the PIG identifier of this sequence.
175     ///
176     /// For protein sequences, this sets the PIG identifier.  PIG ids
177     /// are per-sequence, so it will only be attached to the first
178     /// defline in the set.
179     ///
180     /// @param pig PIG identifier as an integer.
181     void SetPig(int pig);
182 
183     // Options
184 
185     /// Set the maximum size for any file in the database.
186     ///
187     /// This method sets the maximum size for any file in a database
188     /// volume.  If adding a sequence would cause any file in the
189     /// generated database to exceed this size, the current volume is
190     /// ended and a new volume is started.  This is not a strict
191     /// limit, inasmuch as it always puts at least one sequence in
192     /// each volume regardless of that sequence's size.
193     ///
194     /// @param sz Maximum file size (in bytes).
195     void SetMaxFileSize(Uint8 sz);
196 
197     /// Set the maximum letters in one volume.
198     ///
199     /// This method sets the maximum number of sequence letters per
200     /// database volume.  If adding a sequence would cause the volume
201     /// to have more than this many letters, the current volume is
202     /// ended and a new volume is started.  This is not a strict
203     /// limit, inasmuch as it always puts at least one sequence in
204     /// each volume regardless of that sequence's size.
205     ///
206     /// @param sz Maximum sequence letters per volume.
207     void SetMaxVolumeLetters(Uint8 sz);
208 
209     /// Extract deflines from a CBioseq.
210     ///
211     /// Given a CBioseq, this method extracts and returns header info
212     /// as a defline set.  The deflines will not be applied to the
213     /// current sequence unless passed to SetDeflines.  The expected
214     /// use of this method is in cases where the caller has a CBioseq
215     /// or CBioseq_Handle but wishes to examine and/or change the
216     /// deflines before passing them to CWriteDB.  Some elements of
217     /// the CBioseq may be shared by the returned defline set, notably
218     /// the Seq-ids.
219     ///
220     /// @param bs Bioseq from which to construct the defline set.
221     /// @param parse_ids If we should parse seq_ids.
222     /// @param long_seqids If true use long sequence ids (database|accession)
223     /// @return The blast defline set.
224     static CRef<CBlast_def_line_set>
225     ExtractBioseqDeflines(const CBioseq & bs, bool parse_ids, bool long_seqids);
226 
227     /// Set bases that should not be used in sequences.
228     ///
229     /// This method specifies nucleotide or protein bases that should
230     /// not be used in the resulting database.  The bases in question
231     /// will be replaced with N (for nucleotide) or X (for protein).
232     /// The input data is expected to be specified in the appropriate
233     /// 'alphabetic' encoding (either IUPACAA or IUPACNA).
234     ///
235     /// @param masked Letters to mask, in the alphabetic (IUPAC) encoding.
236     void SetMaskedLetters(const string & masked);
237 
238     /// List Volumes
239     ///
240     /// Returns the base names of all volumes constructed by this
241     /// class; the returned list may not be complete until Close() has
242     /// been called.
243     ///
244     /// @param vols
245     ///   The set of volumes produced by this class.
246     void ListVolumes(vector<string> & vols);
247 
248     /// List Filenames
249     ///
250     /// Returns a list of the files constructed by this class; the
251     /// returned list may not be complete until Close() has been
252     /// called.
253     ///
254     /// @param files
255     ///   The set of resolved database path names.
256     void ListFiles(vector<string> & files);
257 
258     /// Register a type of filtering data found in this database.
259     ///
260     /// The BlastDb format supports storage of masking data (lists of
261     /// masked ranges) for each database sequence, as well as an
262     /// indication of the source (or sources) of this masking data (e.g.:
263     /// masking algorithm used to create them).
264     /// This method stores a description of one of these masking data
265     /// sources in this database, including which basic algorithm was
266     /// used, as well as the options passed to that algorithm.  Each
267     /// description is associated with a numeric `algorithm id' (return value
268     /// of this method), which identifies that data source when adding data
269     /// with SetMaskData.
270     ///
271     /// @return algorithm ID for the filtering data.
272     /// @param program Program used to produce this masking data. [in]
273     /// @param options Algorithm options provided to the program. [in]
274     /// @param name Name of a GI-based mask [in]
275     int RegisterMaskAlgorithm(EBlast_filter_program   program,
276                              const string           & options,
277                              const string           & name = "");
278 
279     /// Register a type of filtering data found in this database.
280     ///
281     /// The BlastDb format supports storage of masking data (lists of
282     /// masked ranges) for each database sequence, as well as an
283     /// indication of the source (or sources) of this masking data (e.g.:
284     /// masking algorithm used to create them).
285     /// This method stores a description of one of these masking data
286     /// sources in this database, including which basic algorithm was
287     /// used, as well as the options passed to that algorithm.  Each
288     /// description is associated with a numeric `algorithm id' (return value
289     /// of this method), which identifies that data source when adding data
290     /// with SetMaskData.
291     ///
292     /// @return algorithm ID for the filtering data.
293     /// @param id A string to identify this masking data. [in]
294     /// @param description Details about the masking data. [in]
295     /// @param options Algorithm options provided to the program. [in]
296     int RegisterMaskAlgorithm(const string          & id,
297                              const string           & description,
298                              const string           & options);
299 
300     /// Set filtering data for a sequence.
301     ///
302     /// This method specifies filtered regions for the sequence.  Each
303     /// sequence can have filtering data from various algorithms.
304     ///
305     /// @param ranges Filtered ranges for this sequence and algorithm.
306     /// @param gis The GIs associated with this sequence
307     void SetMaskData(const CMaskedRangesVector & ranges,
308                      const vector <TGi>        & gis);
309 
310     /// Set up a generic CWriteDB metadata column.
311     ///
312     /// This method creates a column with the specified name (title).
313     /// The name must be unique among names provided to this database.
314     /// An integer column descriptor is returned, which must be used
315     /// to identify this column when applying blob data.  This call
316     /// will fail with an exception if too many user defined columns
317     /// have already been created for this database (this limit is due
318     /// to BlastDb file naming conventions).  The title identifies
319     /// this column and is also used to access the column with SeqDB.
320     /// @param title   Name identifying this column.
321     /// @param mbo     Not documented in this header (TODO confirm meaning in the .cpp).
322     /// @return Column identifier (a positive integer).
323     int CreateColumn(const string & title, bool mbo=false);
324 
325     /// Find an existing column.
326     ///
327     /// This looks for an existing column with the specified title and
328     /// returns the column ID if found.
329     ///
330     /// @param title The column title to look for.
331     /// @return The column ID if this column title is already defined.
332     int FindColumn(const string & title) const;
333 
334     /// Add meta data to a column.
335     ///
336     /// In addition to normal blob data, database columns can store a
337     /// `dictionary' of user-defined metadata in key/value form.  This
338     /// method adds one such key/value pair to the column.  Specifying
339     /// a key a second time causes replacement of the previous value.
340     /// Using this mechanism to store large amounts of data may have a
341     /// negative impact on performance.
342     ///
343     /// @param col_id Specifies the column to add this metadata to.
344     /// @param key    A unique key string.
345     /// @param value  A value string.
346     void AddColumnMetaData(int            col_id,
347                            const string & key,
348                            const string & value);
349 
350     /// Get a blob to use for a given column letter.
351     ///
352     /// To add data for a `blob' type column, this method should be
353     /// called to get a reference to a CBlastDbBlob object.  Add the
354     /// user-defined blob data to this object.  It is not correct to
355     /// call this more than once for the same sequence and column.
356     /// Reading, writing, or otherwise using this object after the
357     /// current sequence is published is an error and has undefined
358     /// consequences.  ('Publishing' of a sequence usually occurs
359     /// during the following AddSequence(*) call or during Close().)
360     ///
361     /// @param col_id Indicates the column receiving the blob data.
362     /// @return The user data should be stored in this blob.
363     CBlastDbBlob & SetBlobData(int col_id);
364 
365 private:
366     // Configuration
367 
368     string        m_Dbname;           ///< Database base name.
369     bool          m_Protein;          ///< True if DB is protein.
370     string        m_Title;            ///< Title field of database.
371     string        m_Date;             ///< Time stamp (for all volumes.)
372     Uint8         m_MaxFileSize;      ///< Maximum size of any file.
373     Uint8         m_MaxVolumeLetters; ///< Max letters per volume.
374     EIndexType    m_Indices;          ///< Indexing mode.
375     bool          m_Closed;           ///< True if database has been closed.
376     string        m_MaskedLetters;    ///< Masked protein letters (IUPAC).
377     string        m_MaskByte;         ///< Byte that replaces masked letters.
378     vector<char>  m_MaskLookup;       ///< Is (blast-aa) byte masked?
379     int           m_MaskDataColumn;   ///< Column ID for masking data column.
380     map<int, int> m_MaskAlgoMap;      ///< Mapping from algo_id to gi-mask id
381     bool          m_ParseIDs;         ///< Generate ISAM files
382     bool          m_UseGiMask;        ///< Generate GI-based mask files
383     EBlastDbVersion m_DbVersion;      ///< BLASTDB version
384 
385     /// Column titles.
386     vector<string> m_ColumnTitles;
387 
388 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION  > 550)) && \
389      (!defined(NCBI_COMPILER_MIPSPRO)) )
390     /// Per-column metadata.
391     typedef CWriteDB_Column::TColumnMeta TColumnMeta;
392 
393     /// Meta data for all columns.
394     vector< TColumnMeta > m_ColumnMetas;
395 
396     /// Gi-based masks
397     vector< CRef<CWriteDB_GiMask> > m_GiMasks;
398 #endif
399 
400     // Functions
401 
402     /// Flush accumulated sequence data to volume.
403     void x_Publish();
404 
405     /// Compute name of alias file produced.
406     string x_MakeAliasName();
407 
408     /// Write the alias file (presumably the one named by x_MakeAliasName(); confirm).
409     void x_MakeAlias();
410 
411     /// Clear sequence data from last sequence.
412     void x_ResetSequenceData();
413 
414     /// Convert and compute final data formats.
415     void x_CookData();
416 
417     /// Convert header data into usable forms.
418     void x_CookHeader();
419 
420     /// Collect ids for ISAM files.
421     void x_CookIds();
422 
423     /// Compute the length of the current sequence.
424     int x_ComputeSeqLength();
425 
426     /// Convert sequence data into usable forms.
427     void x_CookSequence();
428 
429     /// Prepare column data to be appended to disk.
430     void x_CookColumns();
431 
432     /// Replace masked input letters with m_MaskByte value.
433     void x_MaskSequence();
434 
435     /// Get binary version of deflines from 'user' data in Bioseq.
436     ///
437     /// Some CBioseq objects (e.g. those from CSeqDB) have an ASN.1
438     /// octet array containing a binary ASN.1 version of the blast
439     /// defline set for the sequence.  This method looks for that data
440     /// and returns it if found.  If not found, it returns an empty
441     /// string.
442     ///
443     /// @param bioseq Bioseq from which to fetch header. [in]
444     /// @param binhdr Header data as binary ASN.1. [out]
445     static void x_GetBioseqBinaryHeader(const CBioseq & bioseq,
446                                         string        & binhdr);
447 
448     /// Construct deflines from a CBioseq and other meta-data.
449     ///
450     /// This method builds deflines from various data found in the
451     /// Bioseq, along with other meta data (like the PIG and
452     /// membership and linkout lists.)
453     ///
454     /// @param bioseq Defline data will be built from this. [in]
455     /// @param deflines A defline set will be returned here. [out]
456     /// @param membits Membership bits for each defline. [in]
457     /// @param linkout Linkout bits for each defline. [in]
458     /// @param pig PIG to attach to a protein sequence. [in]
459     static void
460     x_BuildDeflinesFromBioseq(const CBioseq                  & bioseq,
461                               CConstRef<CBlast_def_line_set> & deflines,
462                               const vector< vector<int> >    & membits,
463                               const vector< vector<int> >    & linkout,
464                               int                              pig);
465 
466     /// Extract a defline set from a binary ASN.1 blob.
467     /// @param bin_hdr Binary ASN.1 encoding of defline set. [in]
468     /// @param deflines Defline set. [out]
469     static void
470     x_SetDeflinesFromBinary(const string                   & bin_hdr,
471                             CConstRef<CBlast_def_line_set> & deflines);
472 
473     /// Extract a defline set from a CFastaReader generated CBioseq.
474     ///
475     /// CBioseq objects produced by CFastaReader have an internal
476     /// 'user' field that contains the original FASTA, which can be
477     /// used to build blast deflines.  If the original FASTA deflines
478     /// were delimited with control-A characters, then those will be
479     /// found here too.  If the caller wishes to accept '>' as an
480     /// alternate delimiter, then accept_gt should be specified.
481     ///
482     /// @param bioseq Bioseq object produced by CFastaReader. [in]
483     /// @param deflines Defline set. [out]
484     /// @param membits Membership bits for each defline. [in]
485     /// @param linkout Linkout bits for each defline. [in]
486     /// @param pig PIG to attach to a protein sequence. [in]
487     /// @param accept_gt Whether greater-than is a delimiter. [in]
488     /// @param parse_ids Whether seq_ids should be parsed. [in]
489     /// @param long_seqids If true, use long sequence ids (database|accession)
490     /// [in]
491     static void
492     x_GetFastaReaderDeflines(const CBioseq                  & bioseq,
493                              CConstRef<CBlast_def_line_set> & deflines,
494                              const vector< vector<int> >    & membits,
495                              const vector< vector<int> >    & linkout,
496                              int                              pig,
497                              bool                             accept_gt,
498                              bool                             parse_ids,
499                              bool                             long_seqids);
500 
501     /// Returns true if we have unwritten sequence data.
502     bool x_HaveSequence() const;
503 
504     /// Records that we now have unwritten sequence data.
505     void x_SetHaveSequence();
506 
507     /// Records that we no longer have unwritten sequence data.
508     void x_ClearHaveSequence();
509 
510     /// Get deflines from a CBioseq and other meta-data.
511     ///
512     /// Extracts binary ASN.1 deflines from the CBioseq if possible; otherwise
513     /// builds them from data found in the Bioseq plus other meta data (like
514     /// the PIG and membership and linkout lists).  The result is returned as a
515     /// blast defline set; binary headers computed here are returned in bin_hdr.
516     /// @param bioseq Defline data will be built from this. [in]
517     /// @param deflines A defline set will be returned here. [out]
518     /// @param bin_hdr Binary header data may be returned here. [out]
519     /// @param membbits Membership bits for each defline. [in]
520     /// @param linkouts Linkout bits for each defline. [in]
521     /// @param pig PIG to attach to a protein sequence. [in]
522     /// @param tax_ids Tax ID set; presumably filled from the deflines (TODO confirm).
523     /// @param OID the current OID for local id. [in]
524     /// @param parse_ids Whether seq_ids should be parsed. [in]
525     /// @param long_seqid If true, use long sequence ids (database|accession). [in]
526     /// @param limit_defline Undocumented here; semantics to be confirmed in the .cpp.
527     static void x_ExtractDeflines(CConstRef<CBioseq>             & bioseq,
528                                   CConstRef<CBlast_def_line_set> & deflines,
529                                   string                         & bin_hdr,
530                                   const vector< vector<int> >    & membbits,
531                                   const vector< vector<int> >    & linkouts,
532                                   int                              pig,
533                                   set<TTaxId>                    & tax_ids,
534                                   int                              OID=-1,
535                                   bool                             parse_ids=true,
536                                   bool                             long_seqid=false,
537                                   bool							   limit_defline = false);
538 
539     /// Compute the hash of a (raw) sequence.
540     ///
541     /// The hash of the provided sequence will be computed and
542     /// assigned to the m_Hash member.  The sequence and optional
543     /// ambiguities are 'raw', meaning they are packed just as
544     /// sequences are packed in nsq and psq files.
545     ///
546     /// @param sequence The sequence data. [in]
547     /// @param ambiguities Nucleotide ambiguities are provided here. [in]
548     void x_ComputeHash(const CTempString & sequence,
549                        const CTempString & ambiguities);
550 
551     /// Compute the hash of a (Bioseq) sequence.
552     ///
553     /// The hash of the provided sequence will be computed and
554     /// assigned to the m_Hash member.  The sequence is packed as a
555     /// CBioseq.
556     ///
557     /// @param sequence The sequence as a CBioseq. [in]
558     void x_ComputeHash(const CBioseq & sequence);
559 
560     /// Get the mask data column id.
561     ///
562     /// The mask data column is created if it does not exist, and its
563     /// column ID number is returned.
564     ///
565     /// @return The column ID for the mask data column.
566     int x_GetMaskDataColumnId();
567 
568     //
569     // Accumulated sequence data.
570     //
571 
572     /// Bioseq object for next sequence to write.
573     CConstRef<CBioseq> m_Bioseq;
574 
575     /// SeqVector for next sequence to write.
576     CSeqVector m_SeqVector;
577 
578     /// Deflines to write as header.
579     CConstRef<CBlast_def_line_set> m_Deflines;
580 
581     /// Ids for next sequence to write, for use during ISAM construction.
582     vector< CRef<CSeq_id> > m_Ids;
583 
584     /// Linkout bits - outer vector is per-defline, inner is bits.
585     vector< vector<int> > m_Linkouts;
586 
587     /// Membership bits - outer vector is per-defline, inner is bits.
588     vector< vector<int> > m_Memberships;
589 
590     /// PIG to attach to headers for protein sequences.
591     int m_Pig;
592 
593     /// Sequence hash for this sequence.
594     int m_Hash;
595 
596     /// When a sequence is added, this will be populated with the length of that sequence.
597     int m_SeqLength;
598 
599     /// True if we have a sequence to write.
600     bool m_HaveSequence;
601 
602     // Cooked
603 
604     /// Sequence data in format that will be written to disk.
605     string m_Sequence;
606 
607     /// Ambiguities in format that will be written to disk.
608     string m_Ambig;
609 
610     /// Binary header in format that will be written to disk.
611     string m_BinHdr;
612 
613     set<TTaxId> m_TaxIds;  ///< Tax IDs for the current sequence (presumably; TODO confirm).
614 
615     // Volumes
616 
617     /// This volume is currently accepting sequences.
618     CRef<CWriteDB_Volume> m_Volume;
619 
620     /// List of all volumes so far, up to and including m_Volume.
621     vector< CRef<CWriteDB_Volume> > m_VolumeList;
622 
623     /// Blob data for the current sequence, indexed by letter.
624     vector< CRef<CBlastDbBlob> > m_Blobs;
625 
626     /// List of blob columns that are active for this sequence.
627     vector<int> m_HaveBlob;
628 
629     /// Registry for masking algorithms in this database.
630     CMaskInfoRegistry m_MaskAlgoRegistry;
631 
632     /// Handle used to write lmdb data.
633     CRef <CWriteDB_LMDB>    m_Lmdbdb;
634 
635     /// Handle used to write tax info.
636     CRef <CWriteDB_TaxID>    m_Taxdb;
637 
638     /// If true, use long sequence id format (database|accession) for all
639     /// accessions
640     bool m_LongSeqId;
641 
642     /// Current oid to use for lmdb.
643     int m_LmdbOid;
644 
645     bool m_limitDefline;  ///< Presumably the constructor's limit_defline flag (TODO confirm).
646 };
647 
648 END_NCBI_SCOPE
649 
650 
651 #endif // OBJTOOLS_WRITERS_WRITEDB__WRITEDB_IMPL_HPP
652 
653 
654