1 #ifndef OBJTOOLS_BLAST_SEQDB_READER___SEQDBCOMMON__HPP
2 #define OBJTOOLS_BLAST_SEQDB_READER___SEQDBCOMMON__HPP
3 
4 /*  $Id: seqdbcommon.hpp 610974 2020-06-26 12:59:33Z grichenk $
5  * ===========================================================================
6  *
7  *                            PUBLIC DOMAIN NOTICE
8  *               National Center for Biotechnology Information
9  *
10  *  This software/database is a "United States Government Work" under the
11  *  terms of the United States Copyright Act.  It was written as part of
12  *  the author's official duties as a United States Government employee and
13  *  thus cannot be copyrighted.  This software/database is freely available
14  *  to the public for use. The National Library of Medicine and the U.S.
15  *  Government have not placed any restriction on its use or reproduction.
16  *
17  *  Although all reasonable efforts have been taken to ensure the accuracy
18  *  and reliability of the software and data, the NLM and the U.S.
19  *  Government do not and cannot warrant the performance or results that
20  *  may be obtained by using this software or data. The NLM and the U.S.
21  *  Government disclaim all warranties, express or implied, including
22  *  warranties of performance, merchantability or fitness for any particular
23  *  purpose.
24  *
25  *  Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Author:  Kevin Bealer
30  *
31  */
32 
33 /// @file seqdbcommon.hpp
34 /// Defines exception class and several constants for SeqDB.
35 ///
36 /// Defines classes:
37 ///     CSeqDBException
38 ///
39 /// Implemented for: UNIX, MS-Windows
40 
41 #include <ncbiconf.h>
42 #include <corelib/ncbiobj.hpp>
43 #include <objects/seqloc/Seq_id.hpp>
44 
45 BEGIN_NCBI_SCOPE
46 
47 /// Include definitions from the objects namespace.
48 USING_SCOPE(objects);
49 
/// BLAST database version.
///
/// The enumerator values equal the version numbers they name (4 and 5).
enum EBlastDbVersion {
    eBDB_Version4 = 4,  ///< Version 4 database.
    eBDB_Version5 = 5   ///< Version 5 database.
};
55 
56 BEGIN_SCOPE(blastdb)
57 /// Ordinal ID in BLAST databases
58 typedef Int4 TOid;
59 END_SCOPE(blastdb)
60 
61 /// CSeqDBException
62 ///
63 /// This exception class is thrown for SeqDB related errors such as
64 /// corrupted blast database or alias files, incorrect arguments to
65 /// SeqDB methods, and failures of SeqDB to accomplish tasks for other
66 /// reasons.  SeqDB may be used in applications with strong robustness
67 /// requirements, where it is considered better to fail an operation
68 /// and lose context information, than to terminate with a core dump,
69 /// and preserve it, so exceptions are the preferred mechanism for
70 /// most error scenarios.  SeqDB still uses assertions in cases where
71 /// memory corruption is suspected, or cleanup may not be possible.
72 
73 class NCBI_XOBJREAD_EXPORT CSeqDBException : public CException {
74 public:
75     /// Errors are classified into one of two types.
76     enum EErrCode {
77         /// Argument validation failed.
78         eArgErr,
79 
80         /// Files were missing or contents were incorrect.
81         eFileErr,
82 
83         /// Memory allocation failed.
84         eMemErr,
85 
86         /// DB version error
87         eVersionErr,
88 
89         /// No Tax Id Found
90         eTaxidErr
91     };
92 
93     /// Get a message describing the situation leading to the throw.
GetErrCodeString() const94     virtual const char* GetErrCodeString() const override
95     {
96         switch ( GetErrCode() ) {
97         case eArgErr:  return "eArgErr";
98         case eFileErr: return "eFileErr";
99         case eVersionErr: return "eVersionErr";
100         default:       return CException::GetErrCodeString();
101         }
102     }
103 
104     /// Include standard NCBI exception behavior.
105     NCBI_EXCEPTION_DEFAULT(CSeqDBException,CException);
106 };
107 
/// The name of the group alias file expected in each directory.
/// For more documentation, see "Group Alias Files" in
/// source/src/objtools/blast/seqdb_reader/alias_files.txt
NCBI_XOBJREAD_EXPORT extern const string kSeqDBGroupAliasFileName;

/// Used to request ambiguities in Ncbi/NA8 format.
const int kSeqDBNuclNcbiNA8  = 0;

/// Used to request ambiguities in BLAST/NA8 format.
const int kSeqDBNuclBlastNA8 = 1;

/// Sentinel OID value (-1).
/// NOTE(review): presumably reports that a lookup found no entry - confirm against callers.
const blastdb::TOid kSeqDBEntryNotFound = -1;
/// Sentinel OID value (-2).
/// NOTE(review): presumably reports that an entry was a duplicate - confirm against callers.
const blastdb::TOid kSeqDBEntryDuplicate = -2;
121 
/// Certain methods have an "Alloc" version.  When these methods are
/// used, the following constants can be specified to indicate which
/// libraries to use to allocate returned data, so the corresponding
/// calls (delete[] vs. free()) can be used to delete the data.

enum ESeqDBAllocType {
    eAtlas = 0,  ///< NOTE(review): presumably memory managed by SeqDB itself - confirm.
    eMalloc,     ///< NOTE(review): presumably malloc()-allocated, to be released with free() - confirm.
    eNew         ///< NOTE(review): presumably new[]-allocated, to be released with delete[] - confirm.
};
132 
133 
134 typedef Uint8 TTi;
135 
136 typedef Uint4 TPig;
137 
138 
139 /// Blast DB v5 seqid list info
140 struct NCBI_XOBJREAD_EXPORT SBlastSeqIdListInfo {
SBlastSeqIdListInfoSBlastSeqIdListInfo141 		SBlastSeqIdListInfo() : is_v4(true), file_size(0), num_ids(0), create_date(kEmptyStr),
142 				      db_vol_length(0), db_create_date(kEmptyStr), db_vol_names(kEmptyStr) {}
143 		bool is_v4;
144 		Uint8 file_size;
145 		Uint8 num_ids;
146 		string title;
147 		string create_date;
148 		Uint8 db_vol_length;
149 		string db_create_date;
150 		string db_vol_names;
151 };
152 
153 /// CSeqDBGiList
154 ///
155 /// This class defines an interface to a list of GI,OID pairs.  It is
156 /// used by the CSeqDB class for user specified GI lists.  This class
157 /// should not be instantiated directly, instead use a subclass of
158 /// this class.  Subclasses should provide a way to populate the
159 /// m_GisOids vector.
160 
161 class NCBI_XOBJREAD_EXPORT CSeqDBGiList : public CObject {
162 public:
163     /// Structure that holds GI,OID pairs.
164     struct SGiOid {
165         /// Constuct an SGiOid element from the given gi and oid.
166         /// @param gi_in A GI, or 0 if none is available.
167         /// @param oid_in An OID, or -1 if none is available.
SGiOidCSeqDBGiList::SGiOid168         SGiOid(TGi gi_in = ZERO_GI, int oid_in = -1)
169             : gi(gi_in), oid(oid_in)
170         {
171         }
172 
173         /// The GI or 0 if unknown.
174         TGi gi;
175 
176         /// The OID or -1 if unknown.
177         int oid;
178     };
179 
180     /// Structure that holds TI,OID pairs.
181     struct STiOid {
182         /// Constuct an STiOid element from the given TI (trace ID,
183         /// expressed as a number) and oid.
184         ///
185         /// @param ti_in A TI, or 0 if none is available.
186         /// @param oid_in An OID, or -1 if none is available.
STiOidCSeqDBGiList::STiOid187         STiOid(TTi ti_in = 0, int oid_in = -1)
188             : ti(ti_in), oid(oid_in)
189         {
190         }
191 
192         /// The TI or 0 if unknown.
193         TTi ti;
194 
195         /// The OID or -1 if unknown.
196         int oid;
197     };
198 
199     /// Structure that holds Seq-id,OID pairs.
200     struct SSiOid {
201         /// Constuct a SSiOid element from the given Seq-id and oid.
202         /// @param seqid_in A Seq-id, or NULL if none is available.
203         /// @param oid_in An OID, or -1 if none is available.
SSiOidCSeqDBGiList::SSiOid204         SSiOid(const string &si_in = "", int oid_in = -1)
205             : si(si_in), oid(oid_in)
206         {
207         }
208 
209         /// The String-id or "" if unknown.
210         string si;
211 
212         /// The OID or -1 if unknown.
213         int oid;
214     };
215 
216     struct STaxIdsOids {
217     	set<TTaxId> tax_ids;
218     	vector<blastdb::TOid> oids;
219     };
220 
221     struct SPigOid {
222             /// Constuct an SPigOid element from the given pig and oid.
223             /// @param pig_in A PIG, or 0 if none is available.
224             /// @param oid_in An OID, or -1 if none is available.
SPigOidCSeqDBGiList::SPigOid225             SPigOid(TPig pig_in = 0, int oid_in = -1)
226                 : pig(pig_in), oid(oid_in)
227             {
228             }
229 
230             /// The PIG or 0 if unknown.
231             TPig pig;
232 
233             /// The OID or -1 if unknown.
234             int oid;
235     };
236 
237 
238     /// Possible sorting states
239     enum ESortOrder {
240         /// The array is unsorted or the sortedness is unknown.
241         eNone,
242 
243         /// The array is sorted by GI.
244         eGi
245 
246         /// TODO should we define eTi and eSi?
247     };
248 
249     /// Constructor
250     CSeqDBGiList();
251 
252     /// Destructor
~CSeqDBGiList()253     virtual ~CSeqDBGiList()
254     {
255     }
256 
257     /// Sort if necessary to insure order of elements.
258     void InsureOrder(ESortOrder order);
259 
260     /// Test for existence of a GI.
261     bool FindGi(TGi gi) const;
262 
263     /// Try to find a GI and return the associated OID.
264     /// @param gi The gi for which to search. [in]
265     /// @param oid The resulting oid if found. [out]
266     /// @return True if the GI was found.
267     bool GiToOid(TGi gi, int & oid);
268 
269     /// Find a GI, returning the index and the associated OID.
270     /// @param gi The gi for which to search. [in]
271     /// @param oid The resulting oid if found. [out]
272     /// @param index The index of this GI (if found). [out]
273     /// @return True if the GI was found.
274     bool GiToOid(TGi gi, int & oid, int & index);
275 
276     /// Test for existence of a TI.
277     bool FindTi(TTi ti) const;
278 
279     /// Try to find a TI and return the associated OID.
280     /// @param ti The ti for which to search. [in]
281     /// @param oid The resulting oid if found. [out]
282     /// @return True if the TI was found.
283     bool TiToOid(TTi ti, int & oid);
284 
285     /// Find a TI, returning the index and the associated OID.
286     /// @param ti The ti for which to search. [in]
287     /// @param oid The resulting oid if found. [out]
288     /// @param index The index of this TI (if found). [out]
289     /// @return True if the TI was found.
290     bool TiToOid(TTi ti, int & oid, int & index);
291 
292 
293     bool FindSi(const string & si) const;
294     bool SiToOid(const string &si, int & oid);
295     bool SiToOid(const string &si, int & oid, int & index);
296 
297 
298     bool FindPig(TPig pig) const;
299     bool PigToOid(TPig pig, int & oid);
300     bool PigToOid(TPig pig, int & oid, int & index);
301 
302     /// Test for existence of a Seq-id by type.
303     ///
304     /// This method uses FindGi or FindTi if the input ID is a GI or
305     /// TI.  If not, or if not found, it falls back to a Seq-id lookup
306     /// to find the ID.  It returns true iff ID was found, otherwise
307     /// it returns false.  This method is used by SeqDB to filter
308     /// Blast Defline lists.
309     ///
310     /// @param id The identifier to find.
311     /// @return true iff the id is found in the list.
312     bool FindId(const CSeq_id & id);
313 
314     /// Access an element of the array.
315     /// @param index The index of the element to access. [in]
316     /// @return A reference to the GI/OID pair.
GetGiOid(int index) const317     const SGiOid & GetGiOid(int index) const
318     {
319         return m_GisOids[index];
320     }
321 
322     /// Access an element of the array.
323     /// @param index The index of the element to access. [in]
324     /// @return A reference to the TI/OID pair.
GetTiOid(int index) const325     const STiOid & GetTiOid(int index) const
326     {
327         return m_TisOids[index];
328     }
329 
330     /// Access an element of the array.
331     /// @param index The index of the element to access. [in]
332     /// @return A reference to the Seq-id/OID pair.
GetSiOid(int index) const333     const SSiOid & GetSiOid(int index) const
334     {
335         return m_SisOids[index];
336     }
337 
GetPigOid(int index) const338     const SPigOid & GetPigOid(int index) const
339     {
340         return m_PigsOids[index];
341     }
342 
343     /// Get the number of GIs in the array.
GetNumGis() const344     int GetNumGis() const
345     {
346         return (int) m_GisOids.size();
347     }
348 
349     /// Get the number of TIs in the array.
GetNumTis() const350     int GetNumTis() const
351     {
352         return (int) m_TisOids.size();
353     }
354 
355     /// Get the number of Seq-ids in the array.
GetNumSis() const356     int GetNumSis() const
357     {
358         return (int) m_SisOids.size();
359     }
360 
GetNumTaxIds() const361     int GetNumTaxIds() const
362     {
363     	return (int) m_TaxIdsOids.tax_ids.size();
364     }
365 
GetNumOidsForTaxIdList() const366     int GetNumOidsForTaxIdList() const
367     {
368     	return (int) m_TaxIdsOids.oids.size();
369     }
370 
GetNumPigs() const371     int GetNumPigs() const
372     {
373         return (int) m_PigsOids.size();
374     }
375 
376     /// Return false if there are elements present.
Empty() const377     bool Empty() const
378     {
379         return ! (GetNumGis() || GetNumSis() || GetNumTis() || GetNumTaxIds() || GetNumPigs());
380     }
381 
382     /// Return true if there are elements present.
NotEmpty() const383     bool NotEmpty() const
384     {
385         return ! Empty();
386     }
387 
388     /// Specify the correct OID for a GI.
389     ///
390     /// When SeqDB translates a GI into an OID, this method is called
391     /// to store the oid in the array.
392     ///
393     /// @param index
394     ///   The location in the array of the GI, OID pair.
395     /// @param oid
396     ///   The oid to store in that element.
SetGiTranslation(int index,int oid)397     void SetGiTranslation(int index, int oid)
398     {
399         m_GisOids[index].oid = oid;
400     }
401 
402     /// Specify the correct OID for a TI.
403     ///
404     /// When SeqDB translates a TI into an OID, this method is called
405     /// to store the oid in the array.
406     ///
407     /// @param index
408     ///   The location in the array of the TI, OID pair.
409     /// @param oid
410     ///   The oid to store in that element.
SetTiTranslation(int index,int oid)411     void SetTiTranslation(int index, int oid)
412     {
413         m_TisOids[index].oid = oid;
414     }
415 
416     /// Specify the correct OID for a Seq-id.
417     ///
418     /// When SeqDB translates a Seq-id into an OID, this method is
419     /// called to store the oid in the array.
420     ///
421     /// @param index
422     ///   The location in the array of Seq-id, OID pairs.
423     /// @param oid
424     ///   The oid to store in that element.
SetSiTranslation(int index,int oid)425     void SetSiTranslation(int index, int oid)
426     {
427         m_SisOids[index].oid = oid;
428     }
429 
SetPigTranslation(int index,int oid)430     void SetPigTranslation(int index, int oid)
431     {
432         m_PigsOids[index].oid = oid;
433     }
434 
Size() const435     int Size() const
436     {
437         return (int) m_GisOids.size();
438     }
439 
440     template <class T>
GetSize() const441     int GetSize() const
442     {
443         return (int) m_GisOids.size();
444     }
445 
446     template <class T>
GetKey(int index) const447     T GetKey(int index) const
448     {
449         return GI_TO(T, m_GisOids[index].gi);
450     }
451 
452     template <class T>
IsValueSet(int index) const453     bool IsValueSet(int index) const
454     {
455         return (m_GisOids[index].oid != -1);
456     }
457 
458     template <class T>
SetValue(int index,int oid)459     void SetValue(int index, int oid)
460     {
461         m_GisOids[index].oid = oid;
462     }
463 
464     /// Get the gi list
465     void GetGiList(vector<TGi>& gis) const;
466 
467     /// Get the ti list
468     void GetTiList(vector<TTi>& tis) const;
469 
470     /// TODO Get the seqid list?
471     void GetSiList(vector<string>& sis) const;
472 
473     void GetPigList(vector<TPig>& pigs) const;
474 
475 
GetTaxIdsList()476     set<TTaxId> & GetTaxIdsList()
477 	{
478     	return m_TaxIdsOids.tax_ids;
479 	}
480 
GetOidsForTaxIdsList()481     const vector<blastdb::TOid> & GetOidsForTaxIdsList()
482     {
483        	return m_TaxIdsOids.oids;
484     }
485 
SetOidsForTaxIdsList()486     vector<blastdb::TOid> & SetOidsForTaxIdsList()
487 	{
488     	m_TaxIdsOids.oids.clear();
489     	return m_TaxIdsOids.oids;
490 	}
491 
492     /// Add a new GI to the list.
AddGi(TGi gi)493     void AddGi(TGi gi)
494     {
495         m_GisOids.push_back(gi);
496     }
497 
498     /// Add a new TI to the list.
AddTi(TTi ti)499     void AddTi(TTi ti)
500     {
501         m_TisOids.push_back(ti);
502     }
503 
504     /// Add a new SeqId to the list.
AddSi(const string & si)505     void AddSi(const string &si)
506     {
507         m_SisOids.push_back(si);
508     }
509 
AddTaxIds(const set<TTaxId> & tax_ids)510     void AddTaxIds(const set<TTaxId> & tax_ids)
511     {
512     	set<TTaxId> & tids = m_TaxIdsOids.tax_ids;
513     	tids.insert(tax_ids.begin(), tax_ids.end());
514     }
515 
SetPigList(const vector<TPig> & list)516     void SetPigList(const vector<TPig> & list)
517     {
518     	ITERATE(vector<TPig>, itr, list) {
519     		m_PigsOids.push_back(*itr);
520     	}
521 
522     }
523 
AddPig(TPig pig)524     void AddPig(TPig pig)
525     {
526         m_PigsOids.push_back(pig);
527     }
528 
529     /// Reserve space for GIs.
ReserveGis(size_t n)530     void ReserveGis(size_t n)
531     {
532         m_GisOids.reserve(n);
533     }
534 
535     /// Reserve space for TIs.
ReserveTis(size_t n)536     void ReserveTis(size_t n)
537     {
538         m_TisOids.reserve(n);
539     }
540 
ReserveSis(size_t n)541     void ReserveSis(size_t n)
542     {
543         m_SisOids.reserve(n);
544     }
545 
ReservePigs(size_t n)546     void ReservePigs(size_t n)
547     {
548         m_PigsOids.reserve(n);
549     }
550 
551     /// Preprocess ids for ISAM string id lookup
552     void PreprocessIdsForISAMSiLookup();
553 
554     /// TODO Reserve space for seqids?
SetListInfo(const SBlastSeqIdListInfo & list_info)555     void SetListInfo(const SBlastSeqIdListInfo & list_info) {
556        	m_ListInfo = list_info;
557     }
558 
GetListInfo()559     const SBlastSeqIdListInfo & GetListInfo()
560     {
561     	return m_ListInfo;
562     }
563 
SetSiList(const vector<string> & new_list)564     void SetSiList( const vector<string> & new_list )
565     {
566         m_SisOids.clear();
567         ITERATE(vector<string>, itr, new_list) {
568         	m_SisOids.push_back(*itr);
569         }
570     }
571 protected:
572     /// Indicates the current sort order, if any, of this container.
573     ESortOrder m_CurrentOrder;
574 
575     /// Pairs of GIs and OIDs.
576     vector<SGiOid> m_GisOids;
577 
578     /// Pairs of GIs and OIDs.
579     vector<STiOid> m_TisOids;
580 
581     /// Pairs of Seq-ids and OIDs.
582     vector<SSiOid> m_SisOids;
583 
584     vector<SPigOid> m_PigsOids;
585 
586     STaxIdsOids m_TaxIdsOids;
587 
588     SBlastSeqIdListInfo m_ListInfo;
589 
590 private:
591     // The following disabled methods are reasonable things to do in
592     // some cases.  But I suspect they are more likely to happen
593     // accidentally than deliberately; due to the high performance
594     // cost, I have prevented them.  If this kind of deep copy is
595     // desireable, it can easily be enabled for a subclass by
596     // assigning each of the data fields in the protected section.
597 
598     /// Prevent copy constructor.
599     CSeqDBGiList(const CSeqDBGiList & other);
600 
601     /// Prevent assignment.
602     CSeqDBGiList & operator=(const CSeqDBGiList & other);
603 };
604 
605 
606 template < >
GetSize() const607 inline int CSeqDBGiList::GetSize<TTi>() const
608 {
609     return (int) m_TisOids.size();
610 }
611 
612 template < >
GetKey(int index) const613 inline TTi CSeqDBGiList::GetKey<TTi>(int index) const
614 {
615     return m_TisOids[index].ti;
616 }
617 
618 template < >
IsValueSet(int index) const619 inline bool CSeqDBGiList::IsValueSet<TTi>(int index) const
620 {
621     return (m_TisOids[index].oid != -1);
622 }
623 
624 template < >
SetValue(int index,int oid)625 inline void CSeqDBGiList::SetValue<TTi>(int index, int oid)
626 {
627     m_TisOids[index].oid = oid;
628 }
629 
630 template < >
GetSize() const631 inline int CSeqDBGiList::GetSize<string>() const
632 {
633     return (int) m_SisOids.size();
634 }
635 
636 template < >
GetKey(int index) const637 inline string CSeqDBGiList::GetKey<string>(int index) const
638 {
639     return m_SisOids[index].si;
640 }
641 
642 template < >
IsValueSet(int index) const643 inline bool CSeqDBGiList::IsValueSet<string>(int index) const
644 {
645     return (m_SisOids[index].oid != -1);
646 }
647 
648 template < >
SetValue(int index,int oid)649 inline void CSeqDBGiList::SetValue<string>(int index, int oid)
650 {
651     m_SisOids[index].oid = oid;
652 }
653 
654 template < >
GetSize() const655 inline int CSeqDBGiList::GetSize<TPig>() const
656 {
657     return (int) m_PigsOids.size();
658 }
659 
660 template < >
GetKey(int index) const661 inline TPig CSeqDBGiList::GetKey<TPig>(int index) const
662 {
663     return m_PigsOids[index].pig;
664 }
665 
666 template < >
IsValueSet(int index) const667 inline bool CSeqDBGiList::IsValueSet<TPig>(int index) const
668 {
669     return (m_PigsOids[index].oid != -1);
670 }
671 
672 template < >
SetValue(int index,int oid)673 inline void CSeqDBGiList::SetValue<TPig>(int index, int oid)
674 {
675     m_PigsOids[index].oid = oid;
676 }
677 
678 
679 /// CSeqDBBitVector
680 ///
681 /// This class defines a bit vector that is similar to vector<bool>,
682 /// but with a differently designed API that performs better on at
683 /// least some platforms, and slightly altered semantics.
684 
685 class NCBI_XOBJREAD_EXPORT CSeqDBBitVector {
686 public:
687     /// Constructor
CSeqDBBitVector()688     CSeqDBBitVector()
689         : m_Size(0)
690     {
691     }
692 
693     /// Destructor
~CSeqDBBitVector()694     virtual ~CSeqDBBitVector()
695     {
696     }
697 
698     /// Set the inclusion of an OID.
699     ///
700     /// @param oid The OID in question. [in]
SetBit(int oid)701     void SetBit(int oid)
702     {
703         if (oid >= m_Size) {
704             x_Resize(oid+1);
705         }
706         x_SetBit(oid);
707     }
708 
709     /// Set the inclusion of an OID.
710     ///
711     /// @param oid The OID in question. [in]
ClearBit(int oid)712     void ClearBit(int oid)
713     {
714         if (oid >= m_Size) {
715             return;
716         }
717         x_ClearBit(oid);
718     }
719 
720     /// Get the inclusion status of an OID.
721     ///
722     /// @param oid The OID in question. [in]
723     /// @return True if the OID is included by SeqDB.
GetBit(int oid)724     bool GetBit(int oid)
725     {
726         if (oid >= m_Size) {
727             return false;
728         }
729         return x_GetBit(oid);
730     }
731 
732     /// Get the size of the OID array.
Size() const733     int Size() const
734     {
735         return m_Size;
736     }
737 
738 private:
739     /// Prevent copy constructor.
740     CSeqDBBitVector(const CSeqDBBitVector & other);
741 
742     /// Prevent assignment.
743     CSeqDBBitVector & operator=(const CSeqDBBitVector & other);
744 
745     /// Bit vector element.
746     typedef int TBits;
747 
748     /// Bit vector.
749     vector<TBits> m_Bitmap;
750 
751     /// Maximum enabled OID plus one.
752     int m_Size;
753 
754     /// Resize the OID list.
x_Resize(int num)755     void x_Resize(int num)
756     {
757         int bits = 8*sizeof(TBits);
758         int need = (num + bits - 1)/bits;
759 
760         if ((int)m_Bitmap.size() < need) {
761             int new_size = 1024;
762 
763             while (new_size < need) {
764                 new_size *= 2;
765             }
766 
767             m_Bitmap.resize(new_size);
768         }
769 
770         m_Size = num;
771     }
772 
773     /// Set a specific bit (to 1).
x_SetBit(int num)774     void x_SetBit(int num)
775     {
776         int bits = 8*sizeof(TBits);
777 
778         m_Bitmap[num/bits] |= (1 << (num % bits));
779     }
780 
781     /// Set a specific bit (to 1).
x_GetBit(int num)782     bool x_GetBit(int num)
783     {
784         int bits = 8*sizeof(TBits);
785 
786         return !! (m_Bitmap[num/bits] & (1 << (num % bits)));
787     }
788 
789     /// Clear a specific bit (to 0).
x_ClearBit(int num)790     void x_ClearBit(int num)
791     {
792         int bits = 8*sizeof(TBits);
793 
794         m_Bitmap[num/bits] &= ~(1 << (num % bits));
795     }
796 };
797 
798 
799 /// CSeqDBNegativeList
800 ///
801 /// This class defines a list of GIs or TIs of sequences that should
802 /// not be included in a SeqDB instance.  It is used by CSeqDB for
803 /// user specified negative ID lists.  This class can be subclassed to
804 /// allow more efficient population of the GI or TI list.
805 
806 class NCBI_XOBJREAD_EXPORT CSeqDBNegativeList : public CObject {
807 public:
    /// Constructor; initializes m_LastSortSize to zero.
    CSeqDBNegativeList()
        : m_LastSortSize (0)
    {
    }
813 
    /// Destructor (virtual, with an empty body).
    virtual ~CSeqDBNegativeList()
    {
    }
818 
    /// Sort list if not already sorted (only declared here; the
    /// definition is not in this header).
    void InsureOrder();
821 
    /// Add a new GI to the list.
    /// @param gi The GI to append to m_Gis. [in]
    void AddGi(TGi gi)
    {
        m_Gis.push_back(gi);
    }
827 
    /// Add a new TI to the list.
    /// @param ti The TI to append to m_Tis. [in]
    void AddTi(TTi ti)
    {
        m_Tis.push_back(ti);
    }
833 
    /// Add a new SeqId to the list.
    /// @param si The string id to append to m_Sis. [in]
    void AddSi(const string &si)
    {
        m_Sis.push_back(si);
    }
839 
    /// Test for existence of a GI.
    bool FindGi(TGi gi);

    /// Test for existence of a TI.
    bool FindTi(TTi ti);


    /// Test for existence of a string id.
    /// NOTE(review): the id is taken by value here, whereas AddSi()
    /// takes a const reference.
    bool FindSi(string si);

    /// Test for existence of a TI or GI here and report whether the
    /// ID was one of those types.
    ///
    /// If the input ID is a GI or TI, this method sets match_type to
    /// true and returns the output of FindGi or FindTi.  If it is
    /// neither of those types, it sets match_type to false and
    /// returns false.  This method is used by SeqDB to filter Blast
    /// Defline lists.
    ///
    /// @param id The identifier to find. [in]
    /// @param match_type The identifier is either a TI or GI. [out]
    /// @return true iff the id is found in the list.
    bool FindId(const CSeq_id & id, bool & match_type);

    /// Test for existence of a TI or GI included here.
    bool FindId(const CSeq_id & id);
865 
    /// Access an element of the GI array (no bounds checking).
    /// @param index The index of the element to access. [in]
    /// @return The GI for that index.
    TGi GetGi(int index) const
    {
        return m_Gis[index];
    }
873 
    /// Access an element of the TI array (no bounds checking).
    /// @param index The index of the element to access. [in]
    /// @return The TI for that index.
    TTi GetTi(int index) const
    {
        return m_Tis[index];
    }
881 
    /// Access an element of the SeqId array (no bounds checking).
    /// @param index The index of the element to access. [in]
    /// @return A copy of the string id for that index.
    const string GetSi(int index) const
    {
        return m_Sis[index];
    }
889 
890     /// Get the number of GIs in the array.
GetNumGis() const891     int GetNumGis() const
892     {
893         return (int) m_Gis.size();
894     }
895 
896     /// Get the number of TIs in the array.
GetNumTis() const897     int GetNumTis() const
898     {
899         return (int) m_Tis.size();
900     }
901 
902     /// Get the number of SeqIds in the array.
GetNumSis() const903     int GetNumSis() const
904     {
905         return (int) m_Sis.size();
906     }
907 
GetNumPigs() const908     int GetNumPigs() const
909     {
910         return (int) m_Pigs.size();
911     }
912 
IsGiList() const913     bool IsGiList() const
914     {
915         return(GetNumGis() > 0);
916     }
917 
IsTiList() const918     bool IsTiList() const
919     {
920         return(GetNumTis() > 0);
921     }
922 
IsSiList() const923     bool IsSiList() const
924     {
925         return(GetNumSis() > 0);
926     }
927 
ListSize()928     int ListSize()
929     {
930         int size = GetNumGis();
931         if(size == 0) {
932             size = GetNumSis();
933         }
934         if(size == 0) {
935             size = GetNumTis();
936         }
937 
938         if(size == 0) {
939             size = GetNumPigs();
940         }
941 
942         return size;
943     }
944 
945     /// Return false if there are elements present.
Empty() const946     bool Empty() const
947     {
948         return ! (GetNumGis() || GetNumTis() || GetNumSis()|| GetNumTaxIds() || GetNumPigs());
949     }
950 
    /// Return true if there are elements present (negation of Empty()).
    bool NotEmpty() const
    {
        return ! Empty();
    }
956 
    /// Include an OID in the iteration.
    ///
    /// The OID will be included by SeqDB in the set returned to users
    /// by OID iteration.  Implemented by setting the OID's bit in
    /// m_Included.
    ///
    /// @param oid The OID in question. [in]
    void AddIncludedOid(int oid)
    {
        m_Included.SetBit(oid);
    }
967 
    /// Indicate a visible OID.
    ///
    /// The OID will be marked as having been found in a GI or TI
    /// ISAM index (but possibly not included for iteration).
    /// Implemented by setting the OID's bit in m_Visible.
    ///
    /// @param oid The OID in question. [in]
    void AddVisibleOid(int oid)
    {
        m_Visible.SetBit(oid);
    }
978 
979     /// Get the inclusion status of an OID.
980     ///
981     /// This returns true for OIDs that were in the included set and
982     /// for OIDs that were not found in the ISAM file at all.
983     ///
984     /// @param oid The OID in question. [in]
985     /// @return True if the OID is included by SeqDB.
GetOidStatus(int oid)986     bool GetOidStatus(int oid)
987     {
988         return m_Included.GetBit(oid) || (! m_Visible.GetBit(oid));
989     }
990 
991     /// Get the size of the OID array.
GetNumOids()992     int GetNumOids()
993     {
994         return max(m_Visible.Size(), m_Included.Size());
995     }
996 
    /// Reserve space for GIs.
    /// @param n Number of elements passed to m_Gis.reserve(). [in]
    void ReserveGis(size_t n)
    {
        m_Gis.reserve(n);
    }
1002 
    /// Reserve space for TIs.
    /// @param n Number of elements passed to m_Tis.reserve(). [in]
    void ReserveTis(size_t n)
    {
        m_Tis.reserve(n);
    }
1008 
    /// Reserve space for SeqIds.
    /// @param n Number of elements passed to m_Sis.reserve(). [in]
    void ReserveSis(size_t n)
    {
        m_Sis.reserve(n);
    }
1013 
1014     /// Build ID set for this negative list.
GetGiList()1015     const vector<TGi> & GetGiList()
1016     {
1017         return m_Gis;
1018     }
1019 
1020     /// Set ID set for this negative list.
SetGiList(const vector<TGi> & new_list)1021     void SetGiList( const vector<TGi> & new_list )
1022     {
1023 	m_Gis.clear();
1024 	m_Gis.reserve( new_list.size() );
1025         m_Gis = new_list;
1026     }
1027 
SetPigList(const vector<TPig> & new_list)1028     void SetPigList( const vector<TPig> & new_list )
1029     {
1030         m_Pigs.clear();
1031     	m_Pigs.reserve( new_list.size() );
1032         m_Pigs = new_list;
1033     }
1034 
SetSiList(const vector<string> & new_list)1035     void SetSiList( const vector<string> & new_list )
1036     {
1037         m_Sis.clear();
1038         m_Sis.reserve( new_list.size() );
1039         m_Sis = new_list;
1040     }
1041 
    /// Get the TIs stored in this negative list.
GetTiList()1043     const vector<TTi> & GetTiList()
1044     {
1045         return m_Tis;
1046     }
1047 
GetPigList()1048     const vector<TPig> & GetPigList()
1049     {
1050         return m_Pigs;
1051     }
1052 
GetSiList()1053     const vector<string> & GetSiList()
1054     {
1055         return m_Sis;
1056     }
1057 
    /// Get the number of GIs in this list (other ID types are not counted).
Size(void)1059     int Size(void)
1060     {
1061 	return (int)m_Gis.size();
1062     }
1063 
GetListInfo()1064     const SBlastSeqIdListInfo & GetListInfo()
1065     {
1066        	return m_ListInfo;
1067     }
1068 
1069     void PreprocessIdsForISAMSiLookup();
1070 
GetExcludedOids()1071     const vector<blastdb::TOid> & GetExcludedOids() { return m_ExcludedOids; }
SetExcludedOids()1072     vector<blastdb::TOid> & SetExcludedOids() { return m_ExcludedOids; }
1073 
SetListInfo(const SBlastSeqIdListInfo & list_info)1074     void SetListInfo(const SBlastSeqIdListInfo & list_info) {
1075        	m_ListInfo = list_info;
1076     }
GetListInfo() const1077     const SBlastSeqIdListInfo & GetListInfo() const{
1078        	return m_ListInfo;
1079     }
1080 
AddTaxIds(const set<TTaxId> & tax_ids)1081     void AddTaxIds(const set<TTaxId> & tax_ids)
1082     {
1083        	m_TaxIds.insert(tax_ids.begin(), tax_ids.end());
1084     }
1085 
GetTaxIdsList()1086     set<TTaxId> & GetTaxIdsList()
1087     {
1088        	return m_TaxIds;
1089     }
1090 
GetNumTaxIds() const1091     int GetNumTaxIds() const
1092     {
1093        	return (int) m_TaxIds.size();
1094     }
1095 
protected:
    /// GIs to exclude from the SeqDB instance.
    vector<TGi> m_Gis;

    /// TIs to exclude from the SeqDB instance.
    vector<TTi> m_Tis;

    /// PIGs stored by SetPigList() and returned by GetPigList().
    vector<TPig> m_Pigs;

    /// SeqIds to exclude from the SeqDB instance.
    vector<string> m_Sis;

    /// Taxonomy IDs accumulated through AddTaxIds().
    set<TTaxId> m_TaxIds;

private:
    /// Prevent copy constructor.
    CSeqDBNegativeList(const CSeqDBNegativeList & other);

    /// Prevent assignment.
    CSeqDBNegativeList & operator=(const CSeqDBNegativeList & other);

    /// Included OID bitmap.
    CSeqDBBitVector m_Included;

    /// OIDs visible to the ISAM file.
    CSeqDBBitVector m_Visible;

    /// Zero if unsorted, or the size it had after the last sort.
    size_t m_LastSortSize;

    /// List information stored by SetListInfo() and returned by GetListInfo().
    SBlastSeqIdListInfo m_ListInfo;

    /// OIDs exposed through GetExcludedOids() / SetExcludedOids().
    vector<blastdb::TOid> m_ExcludedOids;
1128 };
1129 
1130 
1131 /// Read a binary-format GI list from a file.
1132 ///
1133 /// @param name The name of the file containing GIs. [in]
1134 /// @param gis The GIs returned by this function. [out]
1135 NCBI_XOBJREAD_EXPORT
1136 void SeqDB_ReadBinaryGiList(const string & name, vector<TGi> & gis);
1137 
1138 /// Read a text or binary GI list from an area of memory.
1139 ///
1140 /// The GIs in a memory region are read into the provided SGiOid
1141 /// vector.  The GI half of each element of the vector is assigned,
1142 /// but the OID half will be left as -1.  If the in_order parameter is
1143 /// not null, the function will test the GIs for orderedness.  It will
1144 /// set the bool to which in_order points to true if so, false if not.
1145 ///
1146 /// @param fbeginp The start of the memory region holding the GI list. [in]
1147 /// @param fendp   The end of the memory region holding the GI list. [in]
1148 /// @param gis     The GIs returned by this function. [out]
1149 /// @param in_order If non-null, returns true iff the GIs were in order. [out]
1150 
1151 NCBI_XOBJREAD_EXPORT
1152 void SeqDB_ReadMemoryGiList(const char                   * fbeginp,
1153                             const char                   * fendp,
1154                             vector<CSeqDBGiList::SGiOid> & gis,
1155                             bool                         * in_order = 0);
1156 
1157 /// Read a text or binary TI list from an area of memory.
1158 ///
1159 /// The TIs in a memory region are read into the provided STiOid
1160 /// vector.  The TI half of each element of the vector is assigned,
1161 /// but the OID half will be left as -1.  If the in_order parameter is
1162 /// not null, the function will test the TIs for orderedness.  It will
1163 /// set the bool to which in_order points to true if so, false if not.
1164 ///
1165 /// @param fbeginp The start of the memory region holding the TI list. [in]
1166 /// @param fendp   The end of the memory region holding the TI list. [in]
1167 /// @param tis     The TIs returned by this function. [out]
1168 /// @param in_order If non-null, returns true iff the TIs were in order. [out]
1169 
1170 NCBI_XOBJREAD_EXPORT
1171 void SeqDB_ReadMemoryTiList(const char                   * fbeginp,
1172                             const char                   * fendp,
1173                             vector<CSeqDBGiList::STiOid> & tis,
1174                             bool                         * in_order = 0);
1175 
1176 /// Read a text SeqID list from an area of memory.
1177 ///
1178 /// The Seqids in a memory region are read into the provided SSeqIdOid
1179 /// vector.  The SeqId half of each element of the vector is assigned,
1180 /// but the OID half will be left as -1.  If the in_order parameter is
1181 /// not null, the function will test the SeqIds for orderedness.  It will
1182 /// set the bool to which in_order points to true if so, false if not.
1183 ///
1184 /// @param fbeginp The start of the memory region holding the SeqId list. [in]
1185 /// @param fendp   The end of the memory region holding the SeqId list. [in]
/// @param sis     The SeqIds returned by this function. [out]
1187 /// @param in_order If non-null, returns true iff the seqids were in order. [out]
1188 
1189 NCBI_XOBJREAD_EXPORT
1190 void SeqDB_ReadMemorySiList(const char                   * fbeginp,
1191                             const char                   * fendp,
1192                             vector<CSeqDBGiList::SSiOid> & sis,
1193                             bool                         * in_order = 0);
1194 
1195 /// Read an ID list (mixed type) from an area of memory.
1196 ///
1197 /// The Seq ids in a memory region are read into the provided SSeqIdOid
1198 /// vector.  The gi, ti or seqid half of each element of the vector is assigned,
1199 /// but the OID half will be left as -1.  If the in_order parameter is
1200 /// not null, the function will test the SeqIds for orderedness.  It will
1201 /// set the bool to which in_order points to true if so, false if not.
1202 ///
1203 /// @param fbeginp The start of the memory region holding the SeqId list. [in]
1204 /// @param fendp   The end of the memory region holding the SeqId list. [in]
1205 /// @param gis     The gis returned by this function. [out]
1206 /// @param tis     The tis returned by this function. [out]
1207 /// @param sis     The seqids returned by this function. [out]
1208 /// @param in_order If non-null, returns true iff the seqids were in order. [out]
1209 
1210 NCBI_XOBJREAD_EXPORT
1211 void SeqDB_ReadMemoryMixList(const char * fbeginp,
1212                             const char * fendp,
1213                             vector<CSeqDBGiList::SGiOid> & gis,
1214                             vector<CSeqDBGiList::STiOid> & tis,
1215                             vector<CSeqDBGiList::SSiOid> & sis,
1216                             bool * in_order);
1217 
1218 NCBI_XOBJREAD_EXPORT
1219 void SeqDB_ReadMemoryPigList(const char                    * fbeginp,
1220                              const char                    * fendp,
1221                              vector<CSeqDBGiList::SPigOid> & pigs,
1222                              bool                          * in_order = 0);
1223 
1224 /// Combine and quote a list of database names.
1225 ///
1226 /// SeqDB permits multiple databases to be opened by a single CSeqDB
1227 /// instance, by passing the database names as a space-delimited list
1228 /// to the CSeqDB constructor.  To support paths and filenames with
1229 /// embedded spaces, surround any space-containing names with double
1230 /// quotes ('"').  Filenames not containing spaces may be quoted
1231 /// safely with no effect.  (This solution prevents the use of names
1232 /// containing embedded double quotes.)
1233 ///
1234 /// This method combines a list of database names into a string
1235 /// encoded in this way.
1236 ///
/// @param dbs    Database names to combine. [in]
/// @param dbname Combined database name. [out]
1239 
1240 NCBI_XOBJREAD_EXPORT
1241 void SeqDB_CombineAndQuote(const vector<string> & dbs,
1242                            string               & dbname);
1243 
1244 /// Split a (possibly) quoted list of database names into pieces.
1245 ///
1246 /// SeqDB permits multiple databases to be opened by a single CSeqDB
1247 /// instance, by passing the database names as a space-delimited list
1248 /// to the CSeqDB constructor.  To support paths and filenames with
1249 /// embedded spaces, surround any space-containing names with double
1250 /// quotes ('"').  Filenames not containing spaces may be quoted
1251 /// safely with no effect.  (This solution prevents the use of names
1252 /// containing embedded double quotes.)
1253 ///
1254 /// This method splits a string encoded in this way into individual
1255 /// database names.  Note that the resulting vector's objects are
1256 /// CTempString "slice" objects, and are only valid while the original
1257 /// (encoded) string is unchanged.
1258 ///
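/// @param dbname Combined database name. [in]
/// @param dbs    Individual database names split out of dbname. [out]
/// @param keep_quote Presumably keeps the surrounding double quotes in
///                   the returned names when true (confirm against the
///                   implementation). [in]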
1261 
1262 NCBI_XOBJREAD_EXPORT
1263 void SeqDB_SplitQuoted(const string        & dbname,
1264                        vector<CTempString> & dbs,
1265                        bool					 keep_quote = false);
1266 
1267 /// Read a text or binary GI list from a file.
1268 ///
1269 /// The GIs in a file are read into the provided SGiOid vector.  The
1270 /// GI half of each element of the vector is assigned, but the OID
1271 /// half will be left as -1.  If the in_order parameter is not null,
1272 /// the function will test the GIs for orderedness.  It will set the
1273 /// bool to which in_order points to true if so, false if not.
1274 ///
1275 /// @param fname    The name of the GI list file. [in]
1276 /// @param gis      The GIs returned by this function. [out]
1277 /// @param in_order If non-null, returns true iff the GIs were in order. [out]
1278 
1279 NCBI_XOBJREAD_EXPORT
1280 void SeqDB_ReadGiList(const string                 & fname,
1281                       vector<CSeqDBGiList::SGiOid> & gis,
1282                       bool                         * in_order = 0);
1283 
1284 /// Read a text or binary TI list from a file.
1285 ///
1286 /// The TIs in a file are read into the provided STiOid vector.  The
1287 /// TI half of each element of the vector is assigned, but the OID
1288 /// half will be left as -1.  If the in_order parameter is not null,
1289 /// the function will test the TIs for orderedness.  It will set the
1290 /// bool to which in_order points to true if so, false if not.
1291 ///
1292 /// @param fname    The name of the TI list file. [in]
1293 /// @param tis      The TIs returned by this function. [out]
1294 /// @param in_order If non-null, returns true iff the TIs were in order. [out]
1295 
1296 NCBI_XOBJREAD_EXPORT
1297 void SeqDB_ReadTiList(const string                 & fname,
1298                       vector<CSeqDBGiList::STiOid> & tis,
1299                       bool                         * in_order = 0);
1300 
1301 /// Read a text SeqId list from a file.
1302 ///
1303 /// The Seqids in a file are read into the provided SSeqIdOid vector.  The
1304 /// SeqId half of each element of the vector is assigned, but the OID
1305 /// half will be left as -1.  If the in_order parameter is not null,
1306 /// the function will test the SeqIds for orderedness.  It will set the
1307 /// bool to which in_order points to true if so, false if not.
1308 ///
/// @param fname    The name of the SeqId list file. [in]
/// @param sis      The SeqIds returned by this function. [out]
/// @param in_order If non-null, returns true iff the SeqIds were in order. [out]
/// @param db_info  List information associated with the SeqId list;
///                 presumably filled in while reading the file (confirm
///                 against the implementation). [out]
1313 
1314 NCBI_XOBJREAD_EXPORT
1315 void SeqDB_ReadSiList(const string                 & fname,
1316                       vector<CSeqDBGiList::SSiOid> & sis,
1317                       bool                         * in_order,
1318                       SBlastSeqIdListInfo & db_info);
1319 
/// Read an ID list (mixed type) from a file.
1321 ///
1322 /// The Seqids in a file are read into the provided SSeqIdOid vector.  The
1323 /// Gi/Ti/Si half of each element of the vector is assigned, but the OID
1324 /// half will be left as -1.  If the in_order parameter is not null,
1325 /// the function will test the SeqIds for orderedness.  It will set the
1326 /// bool to which in_order points to true if so, false if not.
1327 ///
/// @param fname    The name of the ID list file. [in]
/// @param gis      The GIs returned by this function. [out]
/// @param tis      The TIs returned by this function. [out]
/// @param sis      The SeqIds returned by this function. [out]
/// @param in_order If non-null, returns true iff the SeqIds were in order. [out]
1332 
1333 NCBI_XOBJREAD_EXPORT
1334 void SeqDB_ReadMixList(const string & fname,
1335 		               vector<CSeqDBGiList::SGiOid> & gis,
1336 		               vector<CSeqDBGiList::STiOid> & tis,
1337 		               vector<CSeqDBGiList::SSiOid> & sis,
1338 		               bool * in_order);
1339 
1340 NCBI_XOBJREAD_EXPORT
1341 void SeqDB_ReadPigList(const string                 & fname,
1342                       vector<CSeqDBGiList::SPigOid> & pigs,
1343                       bool                          * in_order = 0);
1344 
1345 
1346 /// Read a text or binary GI list from a file.
1347 ///
/// The GIs in a file are read into the provided vector<TGi>.  If the
1349 /// in_order parameter is not null, the function will test the GIs for
1350 /// orderedness.  It will set the bool to which in_order points to
1351 /// true if so, false if not.
1352 ///
1353 /// @param fname    The name of the GI list file. [in]
1354 /// @param gis      The GIs returned by this function. [out]
1355 /// @param in_order If non-null, returns true iff the GIs were in order. [out]
1356 
1357 NCBI_XOBJREAD_EXPORT
1358 void SeqDB_ReadGiList(const string  & fname,
1359                       vector<TGi>   & gis,
1360                       bool          * in_order = 0);
1361 
1362 /// Read a text or binary SeqId list from a file.
1363 ///
1364 /// The SeqIds in a file are read into the provided vector<string>.  If the
1365 /// in_order parameter is not null, the function will test the SeqIds for
1366 /// orderedness.  It will set the bool to which in_order points to
1367 /// true if so, false if not.
1368 ///
1369 /// @param fname    The name of the SeqId list file. [in]
1370 /// @param sis      The SeqIds returned by this function. [out]
1371 /// @param in_order If non-null, returns true iff the SeqIds were in order. [out]
1372 
1373 ///NCBI_XOBJREAD_EXPORT
1374 ///void SeqDB_ReadSeqIdList(const string     & fname,
1375 ///                         vector<string>   & sis,
1376 ///                         bool             * in_order = 0);
1377 
1378 /// Returns true if the file name passed contains a binary gi list
1379 ///
1380 /// @param fname    The name of the GI list file. [in]
1381 /// @throws CSeqDBException if file is invalid or empty
1382 NCBI_XOBJREAD_EXPORT
1383 bool SeqDB_IsBinaryGiList(const string  & fname);
1384 
1385 /// Returns true if the file name passed contains a binary TI list
1386 ///
1387 /// @param fname    The name of the TI list file. [in]
1388 /// @throws CSeqDBException if file is invalid or empty
1389 NCBI_XOBJREAD_EXPORT
1390 bool SeqDB_IsBinaryTiList(const string  & fname);
1391 
1392 /// CSeqDBFileGiList
1393 ///
1394 /// This class defines a CSeqDBGiList subclass which reads a GI list
1395 /// file given a filename.  It can read text or binary GI list files,
1396 /// and will automatically distinguish between them.
1397 
class NCBI_XOBJREAD_EXPORT CSeqDBFileGiList : public CSeqDBGiList {
public:
    /// Kind of identifiers the list file is expected to contain.
    ///
    /// NOTE(review): the per-value descriptions below are inferred from
    /// the enumerator names; confirm against the implementation.
    enum EIdType {
        eGiList,    ///< GIs
        eTiList,    ///< TIs
        eSiList,    ///< SeqIds
        eMixList,   ///< Mixed ID types
        ePigList,   ///< PIGs
        eTaxIdList  ///< Taxonomy IDs
    };

    /// Build an ID list from a file.
    ///
    /// @param fname  The name of the list file. [in]
    /// @param idtype The kind of IDs to read; defaults to eGiList. [in]
    CSeqDBFileGiList(const string & fname, EIdType idtype=eGiList);

    // Disabled overload, kept for reference:
    // build a GI list from multiple files.  (only support eSiList)
    //CSeqDBFileGiList(vector<string> fnames, EIdType idtype=eGiList);
};
1415 
1416 
1417 /// GI list containing the intersection of two other lists of GIs.
1418 ///
1419 /// This class takes a CSeqDBGiList and an integer vector and computes
1420 /// the intersection of the two.  Note that both input arguments are
1421 /// sorted to GI order in-place.
1422 
class NCBI_XOBJREAD_EXPORT CIntersectionGiList : public CSeqDBGiList {
public:
    /// Construct an intersection of two lists of GIs.
    ///
    /// The two lists of GIs are sorted and this class is computed as
    /// an intersection of them.  Note that both arguments to this
    /// function are potentially modified (sorted in place).
    ///
    /// @param gilist The first GI list; may be sorted in place. [in,out]
    /// @param gis    The second list of GIs; may be sorted in place. [in,out]
    CIntersectionGiList(CSeqDBGiList & gilist, vector<TGi> & gis);

    /// Construct from a negative GI list and a vector of GIs.
    ///
    /// The two lists of GIs are sorted and this class is computed as
    /// an intersection of them. Since gilist is negative this means
    /// all gi's in the vector that are NOT in the negative list.
    /// Note that both arguments to this
    /// function are potentially modified (sorted in place).
    ///
    /// @param gilist The negative GI list; may be sorted in place. [in,out]
    /// @param gis    The vector of GIs to filter; may be sorted in place. [in,out]
    CIntersectionGiList(CSeqDBNegativeList & gilist, vector<TGi> & gis);
};
1439 
1440 
1441 /// Helper class to allow copy-on-write semantics for CSeqDBIdSet.
1442 ///
1443 /// This class owns the actual vector of IDs for the CSeqDBIdSet list.
1444 
1445 class CSeqDBIdSet_Vector : public CObject {
1446 public:
1447     /// Default constructor.
CSeqDBIdSet_Vector()1448     CSeqDBIdSet_Vector()
1449     {
1450     }
1451 
1452     /// Construct from an 'int' set.
CSeqDBIdSet_Vector(const vector<Int4> & ids)1453     CSeqDBIdSet_Vector(const vector<Int4> & ids)
1454     {
1455         ITERATE(vector<Int4>, iter, ids) {
1456             m_Ids.push_back((Int8) *iter);
1457         }
1458     }
1459 
1460     /// Construct from an 'Int8' set.
CSeqDBIdSet_Vector(const vector<Int8> & ids)1461     CSeqDBIdSet_Vector(const vector<Int8> & ids)
1462     {
1463         m_Ids = ids;
1464     }
1465 
1466     /// Construct from an 'Uint8' set.
CSeqDBIdSet_Vector(const vector<Uint8> & ids)1467     CSeqDBIdSet_Vector(const vector<Uint8> & ids)
1468     {
1469         ITERATE(vector<Uint8>, iter, ids) {
1470             m_Ids.push_back((Int8) *iter);
1471         }
1472     }
1473 
CSeqDBIdSet_Vector(const vector<string> & ids)1474     CSeqDBIdSet_Vector(const vector<string> & ids)
1475     {
1476         ITERATE(vector<string>, iter, ids) {
1477             m_SeqIds.push_back((string) *iter);
1478         }
1479     }
1480 
1481 
1482 #ifdef NCBI_STRICT_GI
1483     /// Construct from a 'TGi' set when NCBI_STRICT_GI is in force.
CSeqDBIdSet_Vector(const vector<TGi> & ids)1484     CSeqDBIdSet_Vector(const vector<TGi> & ids)
1485     {
1486         ITERATE(vector<TGi>, iter, ids) {
1487             m_Ids.push_back(GI_TO(Int8, *iter));
1488         }
1489     }
1490 #endif
1491 
1492     /// Access the Int8 set.
Set()1493     vector<Int8> & Set()
1494     {
1495         return m_Ids;
1496     }
1497 
1498     /// Access the Int8 set.
Get() const1499     const vector<Int8> & Get() const
1500     {
1501         return m_Ids;
1502     }
1503 
1504     /// Access the string set.
SetSeqIDs()1505     vector<string> & SetSeqIDs()
1506     {
1507         return m_SeqIds;
1508     }
1509 
1510     /// Access the string set.
GetSeqIDs() const1511     const vector<string> & GetSeqIDs() const
1512     {
1513         return m_SeqIds;
1514     }
1515 
1516     /// Get the number of elements stored here.
Size() const1517     size_t Size() const
1518     {
1519         size_t n  = m_Ids.size();
1520         if(n == 0) {
1521             n = m_SeqIds.size();
1522         }
1523         return n;
1524     }
1525 
1526 private:
1527     /// The actual list elements.
1528     vector<Int8> m_Ids;
1529 
1530     vector<string> m_SeqIds;
1531 
1532     /// Prevent copy construction.
1533     CSeqDBIdSet_Vector(CSeqDBIdSet_Vector &);
1534 
1535     /// Prevent copy assignment.
1536     CSeqDBIdSet_Vector & operator=(CSeqDBIdSet_Vector &);
1537 };
1538 
1539 
1540 /// SeqDB ID list for performing boolean set operations.
1541 ///
1542 /// This class permits boolean operations on lists of numeric IDs,
1543 /// and can be passed to CSeqDB in the same way as a CSeqDBGiList.
1544 /// CSeqDBGiList or CSeqDBNegativeList objects can be constructed as
1545 /// well.  Logical operations supported include AND, OR, XOR, and NOT.
1546 /// Internally this uses a CRef based copy-on-write scheme, so these
1547 /// objects can be copied in constant time.
1548 
1549 class NCBI_XOBJREAD_EXPORT CSeqDBIdSet : public CObject {
1550 public:
1551     /// Types of operations that may be performed on GI lists.
1552     enum EOperation {
1553         eAnd, // Found in both X and Y
1554         eXor, // Found in X or Y, but not both
1555         eOr   // Found in either X or Y
1556     };
1557 
1558     /// Type of IDs stored here.
1559     enum EIdType {
1560         eGi,  // Found in both X and Y
1561         eTi,   // Found in X or Y, but not both
1562         eSi
1563     };
1564 
1565     /// Construct a 'blank' CSeqDBIdSet object.
1566     ///
1567     /// This produces a blank ID set object, which (if applied) would
1568     /// not cause any filtering to occur.  This is represented here as
1569     /// a negative ID list with no elements.
1570     ///
1571     CSeqDBIdSet();
1572 
1573     /// Build a computed ID list given an initial set of IDs.
1574     ///
1575     /// This initializes a list with an initial set of IDs of the
1576     /// specified type.  All further logic operations on the list
1577     /// should use vectors of IDs or CSeqDBIdSet objects
1578     /// initialized with the same EIdType enumeration.
1579     ///
1580     /// @param ids These IDs will be added to the list.
1581     /// @param t The IDs are assumed to be of this type.
1582     /// @param positive True for a positive ID list, false for negative.
1583     CSeqDBIdSet(const vector<Int4> & ids, EIdType t, bool positive = true);
1584 
1585     /// Build a computed ID list given an initial set of IDs.
1586     ///
1587     /// This initializes a list with an initial set of IDs of the
1588     /// specified type.  All further logic operations on the list
1589     /// should use vectors of IDs or CSeqDBIdSet objects
1590     /// initialized with the same EIdType enumeration.
1591     ///
1592     /// @param ids These IDs will be added to the list.
1593     /// @param t The IDs are assumed to be of this type.
1594     /// @param positive True for a positive ID list, false for negative.
1595     CSeqDBIdSet(const vector<Int8> & ids, EIdType t, bool positive = true);
1596 
1597     /// Build a computed ID list given an initial set of IDs.
1598     ///
1599     /// This initializes a list with an initial set of IDs of the
1600     /// specified type.  All further logic operations on the list
1601     /// should use vectors of IDs or CSeqDBIdSet objects
1602     /// initialized with the same EIdType enumeration.
1603     ///
1604     /// @param ids These IDs will be added to the list.
1605     /// @param t The IDs are assumed to be of this type.
1606     /// @param positive True for a positive ID list, false for negative.
1607     CSeqDBIdSet(const vector<Uint8> & ids, EIdType t, bool positive = true);
1608 
1609 
1610 
1611 
1612 #ifdef NCBI_STRICT_GI
1613     /// Build a computed ID list given an initial set of IDs.
1614     ///
1615     /// This initializes a list with an initial set of IDs of the
1616     /// specified type.  All further logic operations on the list
1617     /// should use vectors of IDs or CSeqDBIdSet objects
1618     /// initialized with the same EIdType enumeration.
1619     ///
1620     /// @param ids These IDs will be added to the list.
1621     /// @param t The IDs are assumed to be of this type.
1622     /// @param positive True for a positive ID list, false for negative.
1623     CSeqDBIdSet(const vector<TGi> & ids, EIdType t, bool positive = true);
1624 #endif
1625 
1626     CSeqDBIdSet(const vector<string> & ids, EIdType t, bool positive = true);
1627 
1628     /// Virtual destructor.
~CSeqDBIdSet()1629     virtual ~CSeqDBIdSet()
1630     {
1631     }
1632 
1633     /// Invert the current list.
1634     void Negate();
1635 
1636     /// Perform a logical operation on a list.
1637     ///
1638     /// The logical operation is performed between the current list
1639     /// and the ids parameter, and the 'positive' flag is used to
1640     /// determine if the new input list should be treated as a
1641     /// positive or negative list.  For example, using op == eOr and
1642     /// positive == false would perform the operation (X OR NOT Y).
1643     ///
1644     /// @param op Logical operation to perform.
1645     /// @param ids List of ids for the second argument.
1646     /// @param positive True for positive lists, false for negative.
1647     void Compute(EOperation          op,
1648                  const vector<int> & ids,
1649                  bool                positive = true);
1650 
1651     /// Perform a logical operation on a list.
1652     ///
1653     /// The logical operation is performed between the current list
1654     /// and the ids parameter, and the 'positive' flag is used to
1655     /// determine if the new input list should be treated as a
1656     /// positive or negative list.  For example, using op == eOr and
1657     /// positive == false would perform the operation (X OR NOT Y).
1658     ///
1659     /// @param op Logical operation to perform.
1660     /// @param ids List of ids for the second argument.
1661     /// @param positive If true, ids represent 'negative' ids.
1662     void Compute(EOperation           op,
1663                  const vector<Int8> & ids,
1664                  bool                 positive = true);
1665 
1666     /// Perform a logical operation on a list.
1667     ///
1668     /// The logical operation is performed between the current list
1669     /// and the ids parameter, and the 'positive' flag is used to
1670     /// determine if the new input list should be treated as a
1671     /// positive or negative list.  For example, using op == eOr and
1672     /// positive == false would perform the operation (X OR NOT Y).
1673     ///
1674     /// @param op Logical operation to perform.
1675     /// @param ids List of ids for the second argument.
1676     /// @param positive If true, ids represent 'negative' ids.
1677     void Compute(EOperation           op,
1678                  const vector<Uint8> & ids,
1679                  bool                 positive = true);
1680 
1681     /// Perform a logical operation on a list.
1682     ///
1683     /// The logical operation is performed between the current list
1684     /// and the ids parameter.  For example if 'eOr' is specified, the
1685     /// operation performed will be 'X OR Y'.  The 'ids' list will not
1686     /// be modified by this operation.
1687     ///
1688     /// @param op Logical operation to perform.
1689     /// @param ids List of ids for the second argument.
1690     void Compute(EOperation op, const CSeqDBIdSet & ids);
1691 
1692     /// Checks whether a positive GI list was produced.
1693     ///
1694     /// If this method returns true, a positive list was produced, and
1695     /// can be retrieved with GetPositiveList().  If it returns false,
1696     /// a negative list was produced and can be retrieved with
1697     /// GetNegativeList().
1698     ///
1699     /// @return true If the produced GI list is positive.
IsPositive()1700     bool IsPositive()
1701     {
1702         return m_Positive;
1703     }
1704 
1705     /// Retrieve a positive GI list.
1706     ///
1707     /// If IsPositive() returned true, this method should be used to
1708     /// retrieve a positive GI list.  If IsPositive() returned false,
1709     /// this method will throw an exception.
1710     CRef<CSeqDBGiList> GetPositiveList();
1711 
1712     /// Retrieve a negative GI list.
1713     ///
1714     /// If IsPositive() returned false, this method should be used to
1715     /// retrieve a positive GI list.  If IsPositive() returned true,
1716     /// this method will throw an exception.
1717     ///
1718     /// @return A negative GI list.
1719     CRef<CSeqDBNegativeList> GetNegativeList();
1720 
1721     /// Check if an ID list is blank.
1722     ///
1723     /// An ID list is considered 'blank' iff it is a negative list
1724     /// with no elements.  Constructing a database with such a list is
1725     /// equivalent to not specifying a list.  Blank lists are produced
1726     /// by the default constructor, by specifying a negative list and
1727     /// providing an empty vector, or by computation (an intersection
1728     /// of disjoint negative lists, for example).  This method returns
1729     /// true in those cases; otherwise it returns false.
1730     ///
1731     /// @return True if this list is blank.
1732     bool Blank() const;
1733 
1734 private:
1735     /// Sort and unique the internal set.
1736     static void x_SortAndUnique(vector<Int8> & ids);
1737 
1738     static void x_SortAndUnique(vector<string> & ids);
1739 
1740     /// Compute inclusion flags for a boolean operation.
1741     ///
1742     /// This takes a logical operator (AND, OR, or XOR) and a flag
1743     /// indicating whether each input lists is positive or negative,
1744     /// and produces a flag indicating whether the resulting list will
1745     /// be positive or negative and three flags used to control the
1746     /// set merging operation.
1747     ///
1748     /// @param op The operation to perform (OR, AND, or XOR). [in]
1749     /// @param A_pos True if the first list is positive. [in]
1750     /// @param B_pos True if the second list is positive. [in]
1751     /// @param result_pos True if the result is a positive list. [out]
1752     /// @param incl_A True if ids found only in list A are kept. [out]
1753     /// @param incl_B True if ids found only in list B are kept. [out]
1754     /// @param incl_AB True if ids found in both lists are kept. [out]
1755     static void x_SummarizeBooleanOp(EOperation op,
1756                                      bool       A_pos,
1757                                      bool       B_pos,
1758                                      bool     & result_pos,
1759                                      bool     & incl_A,
1760                                      bool     & incl_B,
1761                                      bool     & incl_AB);
1762 
1763     /// Compute boolean operation on two vectors.
1764     ///
1765     /// This takes a logical operator (AND, OR, or XOR) and two
1766     /// positive or negative lists, and produces a positive or
1767     /// negative list representing that operation applied to those
1768     /// lists.
         ///
         /// NOTE(review): whether A and B must already be sorted and free of
         /// duplicates is not visible from this declaration -- confirm against
         /// the implementation.
1769     ///
1770     /// @param op The operation to perform (OR, AND, or XOR). [in]
1771     /// @param A The first input list. [in]
1772     /// @param A_pos True if the first list is positive. [in]
1773     /// @param B The second input list. [in]
1774     /// @param B_pos True if the second list is positive. [in]
1775     /// @param result The resulting list of identifiers. [out]
1776     /// @param result_pos True if the result is a positive list. [out]
1777     void x_BooleanSetOperation(EOperation           op,
1778                                const vector<Int8> & A,
1779                                bool                 A_pos,
1780                                const vector<Int8> & B,
1781                                bool                 B_pos,
1782                                vector<Int8>       & result,
1783                                bool               & result_pos);
1784 
1785     /// True if the current list is positive (false: it is a negative list).
1786     bool m_Positive;
1787 
1788     /// Type of the identifiers held in m_Ids.
1789     EIdType m_IdType;
1790 
1791     /// Ids stored here (held through a CRef).
1792     CRef<CSeqDBIdSet_Vector> m_Ids;
1793 
1794     /// Cached positive list (CSeqDBGiList form).
1795     CRef<CSeqDBGiList> m_CachedPositive;
1796 
1797     /// Cached negative list (CSeqDBNegativeList form).
1798     CRef<CSeqDBNegativeList> m_CachedNegative;
1799 };
1800 
1801 
1802 // The "instance" concept in the following types refers to the fact
1803 // that each alias file has a separately instantiated node for each
1804 // point where it appears in the alias file hierarchy.
1805 
1806 /// Set of values found in one instance of one alias file (string -> string).
1807 typedef map<string, string> TSeqDBAliasFileInstance;
1808 
1809 /// Contents of all instances of a particular alias file pathname.
1810 typedef vector< TSeqDBAliasFileInstance > TSeqDBAliasFileVersions;
1811 
1812 /// Contents of all alias files are returned in this type of container
     /// (NOTE(review): the map key is presumably the alias file pathname -- confirm).
1813 typedef map< string, TSeqDBAliasFileVersions > TSeqDBAliasFileValues;
1814 
1815 
1816 /// SSeqDBTaxInfo
1817 ///
1818 /// This structure contains the taxonomy information for a single
1819 /// given taxid.
1820 
1821 struct SSeqDBTaxInfo {
1822     /// Constructor; usable as the default constructor.
1823     /// @param t the taxonomy ID to set for this structure
         ///          (defaults to ZERO_TAX_ID)
1824     SSeqDBTaxInfo(TTaxId t = ZERO_TAX_ID)
1825         : taxid(t)
1826     {
1827     }
1828 
1829     /// An identifier for this species or taxonomic group.
1830     TTaxId taxid;
1831 
1832     /// Scientific name, such as "Aotus vociferans".
1833     string scientific_name;
1834 
1835     /// Common name, such as "noisy night monkey".
1836     string common_name;
1837 
1838     /// A simple category name, such as "birds".
1839     string blast_name;
1840 
1841     /// A string of length 1 indicating the "Super Kingdom".
1842     string s_kingdom;
1843 
         /// Write this record to a stream as tab-separated "Key=value" fields
         /// (Taxid, SciName, CommonName, BlastName, SuperKingdom), without a
         /// trailing newline.
         /// @param out the stream to write to
         /// @param rhs the record to print
         /// @return the stream @p out, to allow chaining
1844     friend ostream& operator<<(ostream& out, const SSeqDBTaxInfo& rhs) {
1845         out << "Taxid=" << rhs.taxid
1846             << "\tSciName=" << rhs.scientific_name
1847             << "\tCommonName=" << rhs.common_name
1848             << "\tBlastName=" << rhs.blast_name
1849             << "\tSuperKingdom=" << rhs.s_kingdom;
1850         return out;
1851     }
1852 };
1853 
1854 
1855 /// Resolve a file path using SeqDB's path algorithms.
1856 ///
1857 /// This finds a file using the same algorithm used by SeqDB to find
1858 /// blast database filenames.  The filename must include the extension,
1859 /// if any.  Paths which start with '/', '\', or a drive letter
1860 /// (depending on operating system) will be treated as absolute paths.
1861 /// If the file is not found, an empty string will be returned.
1862 ///
1863 /// @param filename Name of file to find. [in]
1864 /// @return Resolved path or empty string if not found.
1865 
1866 NCBI_XOBJREAD_EXPORT
1867 string SeqDB_ResolveDbPath(const string & filename);
1868 
1869 /// Resolve a file path using SeqDB's path algorithms.
1870 ///
1871 /// Identical to SeqDB_ResolveDbPath with the exception that this function does
1872 /// not require the extension to be provided. This is intended to check whether
1873 /// a BLAST DB exists or not.
1874 ///
1875 /// @param filename Name of file to find. [in]
1876 /// @param dbtype Determines whether the BLAST DB is protein ('p'), nucleotide
1877 /// ('n'), or whether the algorithm should guess it ('-', the default). [in]
1878 /// @return Resolved path or empty string if not found.
1879 NCBI_XOBJREAD_EXPORT
1880 string SeqDB_ResolveDbPathNoExtension(const string & filename,
1881                                       char dbtype = '-');
1882 
1883 /// Resolve a file path using SeqDB's path algorithms.
1884 ///
1885 /// Identical to SeqDB_ResolveDbPathNoExtension with the exception that this
1886 /// function searches for ISAM or SQLite files, specifically those storing
1887 /// numeric and string data (for LinkoutDB; i.e.: '.sqlite3').
1888 /// This is intended to check whether the files used in LinkoutDB
1889 /// exist or not.
1890 ///
1891 /// @param filename Name of file to find. [in]
1892 /// @return Resolved path or empty string if not found.
1893 NCBI_XOBJREAD_EXPORT
1894 string SeqDB_ResolveDbPathForLinkoutDB(const string & filename);
1895 
1896 /// Compares two volume file names and determines the volume order
1897 ///
1898 /// @param volpath1 The 1st volume path [in]
1899 /// @param volpath2 The 2nd volume path [in]
1900 /// @return true if volpath1 should appear before volpath2
1901 NCBI_XOBJREAD_EXPORT
1902 bool SeqDB_CompareVolume(const string & volpath1,
1903                          const string & volpath2);
1904 
1905 // NOTE(review): orphaned fragment of a comment for some other function
1906 // (its declaration does not follow it) -- "Returns a path minus filename.
1907 // Substring version of the above.  This returns the part of a file ..."
1908 /// Sequence Hashing
1909 ///
1910 /// This computes a hash of a sequence.  The sequence is expected to
1911 /// be in either ncbistdaa format (for protein) or ncbi8na format (for
1912 /// nucleotide).  These formats are produced by CSeqDB::GetAmbigSeq()
1913 /// if the kSeqDBNuclNcbiNA8 encoding is selected.
1914 ///
1915 /// @param sequence A pointer to the sequence data. [in]
1916 /// @param length The length of the sequence in bases. [in]
1917 /// @return The 32 bit hash value.
1918 NCBI_XOBJREAD_EXPORT
1919 unsigned SeqDB_SequenceHash(const char * sequence,
1920                             int          length);
1921 
1922 /// Sequence Hashing For a CBioseq
1923 ///
1924 /// This computes a hash of a sequence expressed as a CBioseq.
1925 ///
1926 /// @param sequence The sequence, as a CBioseq object. [in]
1927 /// @return The 32 bit hash value.
1928 NCBI_XOBJREAD_EXPORT
1929 unsigned SeqDB_SequenceHash(const CBioseq & sequence);
1930 
1931 /// Various identifier formats used in Id lookup
1932 enum ESeqDBIdType {
1933     eGiId,     ///< Genomic ID is a relatively stable numeric identifier for sequences.
1934     eTiId,     ///< Trace ID is a numeric identifier for Trace sequences.
1935     ePigId,    ///< Each PIG identifier refers to exactly one protein sequence.
1936     eStringId, ///< Some sequence sources use string identifiers.
1937     eHashId,   ///< Lookup from sequence hash values to OIDs.
1938     eOID       ///< The ordinal id indicates the order of the data in the volume's index file.
1939 };
1940 
1941 /// Seq-id simplification.
1942 ///
1943 /// Given a Seq-id, this routine devolves it to a GI or PIG if
1944 /// possible.  If not, it formats the Seq-id into a canonical form
1945 /// for lookup in the string ISAM files.  If the Seq-id was parsed
1946 /// from an accession, it can be provided in the "acc" parameter,
1947 /// and it will be used if the Seq-id is not in a form this code
1948 /// can recognize.  In the case that new Seq-id types are added,
1949 /// support for which has not been added to this code, this
1950 /// mechanism will try to use the original string.
1951 ///
1952 /// @param bestid
1953 ///   The Seq-id to look up. [in]
     ///   NOTE(review): passed by non-const reference; whether it is modified
     ///   is not visible from this declaration -- confirm.
1954 /// @param acc
1955 ///   The original string the Seq-id was created from (or NULL). [in]
1956 /// @param num_id
1957 ///   The returned identifier, if numeric. [out]
1958 /// @param str_id
1959 ///   The returned identifier, if a string. [out]
1960 /// @param simpler
1961 ///   Whether an adjustment was done at all. [out]
1962 /// @return
1963 ///   The resulting identifier type.
1964 NCBI_XOBJREAD_EXPORT ESeqDBIdType
1965 SeqDB_SimplifySeqid(CSeq_id       & bestid,
1966                     const string  * acc,
1967                     Int8          & num_id,
1968                     string        & str_id,
1969                     bool          & simpler);
1970 
1971 /// String id simplification.
1972 ///
1973 /// This routine tries to produce a numerical type from a string
1974 /// identifier.  SeqDB can use faster lookup mechanisms if a PIG,
1975 /// GI, or OID type can be recognized in the string, for example.
1976 /// Even when the output is a string, it may be better formed for
1977 /// the purpose of lookup in the string ISAM file.
     ///
     /// A single-argument overload that returns only the converted string
     /// is also declared in this header.
1978 ///
1979 /// @param acc
1980 ///   The string to look up. [in]
1981 /// @param num_id
1982 ///   The returned identifier, if numeric. [out]
1983 /// @param str_id
1984 ///   The returned identifier, if a string. [out]
1985 /// @param simpler
1986 ///   Whether an adjustment was done at all. [out]
1987 /// @return
1988 ///   The resulting identifier type.
1989 NCBI_XOBJREAD_EXPORT ESeqDBIdType
1990 SeqDB_SimplifyAccession(const string & acc,
1991                         Int8         & num_id,
1992                         string       & str_id,
1993                         bool         & simpler);
1994 
1995 /// String id simplification.
1996 ///
1997 /// This simpler version will convert a string id to the standard
1998 /// ISAM form, and return "" if the conversion fails.
1999 ///
2000 /// @param acc
2001 ///   The string to look up. [in]
2002 /// @return
2003 ///   The resulting converted id, or "" if the conversion fails.
2004 NCBI_XOBJREAD_EXPORT const string
2005 SeqDB_SimplifyAccession(const string &acc);
2006 
2007 /// Retrieves a list of all supported file extensions for BLAST databases
2008 /// @param db_is_protein set to true if the database is protein else false [in]
2009 /// @param extensions where the return value will be stored [in|out]
2010 /// @param dbver BLASTDB version to use; defaults to eBDB_Version4 [in]
2011 NCBI_XOBJREAD_EXPORT
2012 void SeqDB_GetFileExtensions(bool db_is_protein,
2013                              vector<string>& extensions,
2014                              EBlastDbVersion dbver = eBDB_Version4);
2015 
2016 /// Retrieves file extensions for BLAST LMDB files
2017 /// @param db_is_protein set to true if the database is protein else false [in]
2018 /// @param extn where the return value will be stored [in|out]
2019 NCBI_XOBJREAD_EXPORT
2020 void SeqDB_GetLMDBFileExtensions(bool db_is_protein,
2021 		                         vector<string>& extn);
2022 
2023 /// Determine if id is a string id
2024 /// @param id input id to check [in]
2025 /// @return
2026 ///   Return true if id is not of type gi, ti or pig
2027 NCBI_XOBJREAD_EXPORT
2028 bool IsStringId(const CSeq_id & id);
2029 
2030 /// Return ID string as stored in lmdb
     ///
     /// @param seqid The Seq-id to format. [in]
     /// @param version NOTE(review): presumably selects whether the accession
     ///   version is included in the result -- confirm against the definition. [in]
     /// @return The ID string.
2031 NCBI_XOBJREAD_EXPORT
2032 string GetBlastSeqIdString(const CSeq_id & seqid, bool version);
2033 
2034 END_NCBI_SCOPE
2035 
2036 #endif // OBJTOOLS_BLAST_SEQDB_READER___SEQDBCOMMON__HPP
2037 
2038