1 #ifndef OBJTOOLS_BLAST_SEQDB_READER___SEQDBCOMMON__HPP
2 #define OBJTOOLS_BLAST_SEQDB_READER___SEQDBCOMMON__HPP
3
4 /* $Id: seqdbcommon.hpp 610974 2020-06-26 12:59:33Z grichenk $
5 * ===========================================================================
6 *
7 * PUBLIC DOMAIN NOTICE
8 * National Center for Biotechnology Information
9 *
10 * This software/database is a "United States Government Work" under the
11 * terms of the United States Copyright Act. It was written as part of
12 * the author's official duties as a United States Government employee and
13 * thus cannot be copyrighted. This software/database is freely available
14 * to the public for use. The National Library of Medicine and the U.S.
15 * Government have not placed any restriction on its use or reproduction.
16 *
17 * Although all reasonable efforts have been taken to ensure the accuracy
18 * and reliability of the software and data, the NLM and the U.S.
19 * Government do not and cannot warrant the performance or results that
20 * may be obtained by using this software or data. The NLM and the U.S.
21 * Government disclaim all warranties, express or implied, including
22 * warranties of performance, merchantability or fitness for any particular
23 * purpose.
24 *
25 * Please cite the author in any work or product based on this material.
26 *
27 * ===========================================================================
28 *
29 * Author: Kevin Bealer
30 *
31 */
32
33 /// @file seqdbcommon.hpp
34 /// Defines exception class and several constants for SeqDB.
35 ///
36 /// Defines classes:
37 /// CSeqDBException
38 ///
39 /// Implemented for: UNIX, MS-Windows
40
41 #include <ncbiconf.h>
42 #include <corelib/ncbiobj.hpp>
43 #include <objects/seqloc/Seq_id.hpp>
44
45 BEGIN_NCBI_SCOPE
46
47 /// Include definitions from the objects namespace.
48 USING_SCOPE(objects);
49
/// Version of the BLAST database format.
///
/// Each enumerator's numeric value equals the version number it names.
enum EBlastDbVersion {
    /// BLAST database version 4.
    eBDB_Version4 = 4,

    /// BLAST database version 5.
    eBDB_Version5 = 5
};
55
56 BEGIN_SCOPE(blastdb)
57 /// Ordinal ID in BLAST databases
58 typedef Int4 TOid;
59 END_SCOPE(blastdb)
60
61 /// CSeqDBException
62 ///
63 /// This exception class is thrown for SeqDB related errors such as
64 /// corrupted blast database or alias files, incorrect arguments to
65 /// SeqDB methods, and failures of SeqDB to accomplish tasks for other
66 /// reasons. SeqDB may be used in applications with strong robustness
67 /// requirements, where it is considered better to fail an operation
68 /// and lose context information, than to terminate with a core dump,
69 /// and preserve it, so exceptions are the preferred mechanism for
70 /// most error scenarios. SeqDB still uses assertions in cases where
71 /// memory corruption is suspected, or cleanup may not be possible.
72
73 class NCBI_XOBJREAD_EXPORT CSeqDBException : public CException {
74 public:
75 /// Errors are classified into one of two types.
76 enum EErrCode {
77 /// Argument validation failed.
78 eArgErr,
79
80 /// Files were missing or contents were incorrect.
81 eFileErr,
82
83 /// Memory allocation failed.
84 eMemErr,
85
86 /// DB version error
87 eVersionErr,
88
89 /// No Tax Id Found
90 eTaxidErr
91 };
92
93 /// Get a message describing the situation leading to the throw.
GetErrCodeString() const94 virtual const char* GetErrCodeString() const override
95 {
96 switch ( GetErrCode() ) {
97 case eArgErr: return "eArgErr";
98 case eFileErr: return "eFileErr";
99 case eVersionErr: return "eVersionErr";
100 default: return CException::GetErrCodeString();
101 }
102 }
103
104 /// Include standard NCBI exception behavior.
105 NCBI_EXCEPTION_DEFAULT(CSeqDBException,CException);
106 };
107
108 /// The name of the group alias file name expected at each directory
109 /// For more documentation, see "Group Alias Files" in
110 /// source/src/objtools/blast/seqdb_reader/alias_files.txt
111 NCBI_XOBJREAD_EXPORT extern const string kSeqDBGroupAliasFileName;
112
/// Selector value: request ambiguities in Ncbi/NA8 format.
constexpr int kSeqDBNuclNcbiNA8 = 0;

/// Selector value: request ambiguities in BLAST/NA8 format.
constexpr int kSeqDBNuclBlastNA8 = 1;
118
119 const blastdb::TOid kSeqDBEntryNotFound = -1;
120 const blastdb::TOid kSeqDBEntryDuplicate = -2;
121
/// Allocation strategy selector for "Alloc" style methods.
///
/// Certain methods have an "Alloc" version.  When these methods are
/// used, one of the following constants can be specified to indicate
/// which library allocates the returned data, so that the matching
/// release call (delete[] vs. free()) can be used to delete the data.

enum ESeqDBAllocType {
    eAtlas  = 0,
    eMalloc = 1,
    eNew    = 2
};
132
133
134 typedef Uint8 TTi;
135
136 typedef Uint4 TPig;
137
138
139 /// Blast DB v5 seqid list info
140 struct NCBI_XOBJREAD_EXPORT SBlastSeqIdListInfo {
SBlastSeqIdListInfoSBlastSeqIdListInfo141 SBlastSeqIdListInfo() : is_v4(true), file_size(0), num_ids(0), create_date(kEmptyStr),
142 db_vol_length(0), db_create_date(kEmptyStr), db_vol_names(kEmptyStr) {}
143 bool is_v4;
144 Uint8 file_size;
145 Uint8 num_ids;
146 string title;
147 string create_date;
148 Uint8 db_vol_length;
149 string db_create_date;
150 string db_vol_names;
151 };
152
153 /// CSeqDBGiList
154 ///
155 /// This class defines an interface to a list of GI,OID pairs. It is
156 /// used by the CSeqDB class for user specified GI lists. This class
157 /// should not be instantiated directly, instead use a subclass of
158 /// this class. Subclasses should provide a way to populate the
159 /// m_GisOids vector.
160
161 class NCBI_XOBJREAD_EXPORT CSeqDBGiList : public CObject {
162 public:
163 /// Structure that holds GI,OID pairs.
164 struct SGiOid {
165 /// Constuct an SGiOid element from the given gi and oid.
166 /// @param gi_in A GI, or 0 if none is available.
167 /// @param oid_in An OID, or -1 if none is available.
SGiOidCSeqDBGiList::SGiOid168 SGiOid(TGi gi_in = ZERO_GI, int oid_in = -1)
169 : gi(gi_in), oid(oid_in)
170 {
171 }
172
173 /// The GI or 0 if unknown.
174 TGi gi;
175
176 /// The OID or -1 if unknown.
177 int oid;
178 };
179
180 /// Structure that holds TI,OID pairs.
181 struct STiOid {
182 /// Constuct an STiOid element from the given TI (trace ID,
183 /// expressed as a number) and oid.
184 ///
185 /// @param ti_in A TI, or 0 if none is available.
186 /// @param oid_in An OID, or -1 if none is available.
STiOidCSeqDBGiList::STiOid187 STiOid(TTi ti_in = 0, int oid_in = -1)
188 : ti(ti_in), oid(oid_in)
189 {
190 }
191
192 /// The TI or 0 if unknown.
193 TTi ti;
194
195 /// The OID or -1 if unknown.
196 int oid;
197 };
198
199 /// Structure that holds Seq-id,OID pairs.
200 struct SSiOid {
201 /// Constuct a SSiOid element from the given Seq-id and oid.
202 /// @param seqid_in A Seq-id, or NULL if none is available.
203 /// @param oid_in An OID, or -1 if none is available.
SSiOidCSeqDBGiList::SSiOid204 SSiOid(const string &si_in = "", int oid_in = -1)
205 : si(si_in), oid(oid_in)
206 {
207 }
208
209 /// The String-id or "" if unknown.
210 string si;
211
212 /// The OID or -1 if unknown.
213 int oid;
214 };
215
216 struct STaxIdsOids {
217 set<TTaxId> tax_ids;
218 vector<blastdb::TOid> oids;
219 };
220
221 struct SPigOid {
222 /// Constuct an SPigOid element from the given pig and oid.
223 /// @param pig_in A PIG, or 0 if none is available.
224 /// @param oid_in An OID, or -1 if none is available.
SPigOidCSeqDBGiList::SPigOid225 SPigOid(TPig pig_in = 0, int oid_in = -1)
226 : pig(pig_in), oid(oid_in)
227 {
228 }
229
230 /// The PIG or 0 if unknown.
231 TPig pig;
232
233 /// The OID or -1 if unknown.
234 int oid;
235 };
236
237
238 /// Possible sorting states
239 enum ESortOrder {
240 /// The array is unsorted or the sortedness is unknown.
241 eNone,
242
243 /// The array is sorted by GI.
244 eGi
245
246 /// TODO should we define eTi and eSi?
247 };
248
249 /// Constructor
250 CSeqDBGiList();
251
252 /// Destructor
~CSeqDBGiList()253 virtual ~CSeqDBGiList()
254 {
255 }
256
257 /// Sort if necessary to insure order of elements.
258 void InsureOrder(ESortOrder order);
259
260 /// Test for existence of a GI.
261 bool FindGi(TGi gi) const;
262
263 /// Try to find a GI and return the associated OID.
264 /// @param gi The gi for which to search. [in]
265 /// @param oid The resulting oid if found. [out]
266 /// @return True if the GI was found.
267 bool GiToOid(TGi gi, int & oid);
268
269 /// Find a GI, returning the index and the associated OID.
270 /// @param gi The gi for which to search. [in]
271 /// @param oid The resulting oid if found. [out]
272 /// @param index The index of this GI (if found). [out]
273 /// @return True if the GI was found.
274 bool GiToOid(TGi gi, int & oid, int & index);
275
276 /// Test for existence of a TI.
277 bool FindTi(TTi ti) const;
278
279 /// Try to find a TI and return the associated OID.
280 /// @param ti The ti for which to search. [in]
281 /// @param oid The resulting oid if found. [out]
282 /// @return True if the TI was found.
283 bool TiToOid(TTi ti, int & oid);
284
285 /// Find a TI, returning the index and the associated OID.
286 /// @param ti The ti for which to search. [in]
287 /// @param oid The resulting oid if found. [out]
288 /// @param index The index of this TI (if found). [out]
289 /// @return True if the TI was found.
290 bool TiToOid(TTi ti, int & oid, int & index);
291
292
293 bool FindSi(const string & si) const;
294 bool SiToOid(const string &si, int & oid);
295 bool SiToOid(const string &si, int & oid, int & index);
296
297
298 bool FindPig(TPig pig) const;
299 bool PigToOid(TPig pig, int & oid);
300 bool PigToOid(TPig pig, int & oid, int & index);
301
302 /// Test for existence of a Seq-id by type.
303 ///
304 /// This method uses FindGi or FindTi if the input ID is a GI or
305 /// TI. If not, or if not found, it falls back to a Seq-id lookup
306 /// to find the ID. It returns true iff ID was found, otherwise
307 /// it returns false. This method is used by SeqDB to filter
308 /// Blast Defline lists.
309 ///
310 /// @param id The identifier to find.
311 /// @return true iff the id is found in the list.
312 bool FindId(const CSeq_id & id);
313
314 /// Access an element of the array.
315 /// @param index The index of the element to access. [in]
316 /// @return A reference to the GI/OID pair.
GetGiOid(int index) const317 const SGiOid & GetGiOid(int index) const
318 {
319 return m_GisOids[index];
320 }
321
322 /// Access an element of the array.
323 /// @param index The index of the element to access. [in]
324 /// @return A reference to the TI/OID pair.
GetTiOid(int index) const325 const STiOid & GetTiOid(int index) const
326 {
327 return m_TisOids[index];
328 }
329
330 /// Access an element of the array.
331 /// @param index The index of the element to access. [in]
332 /// @return A reference to the Seq-id/OID pair.
GetSiOid(int index) const333 const SSiOid & GetSiOid(int index) const
334 {
335 return m_SisOids[index];
336 }
337
GetPigOid(int index) const338 const SPigOid & GetPigOid(int index) const
339 {
340 return m_PigsOids[index];
341 }
342
343 /// Get the number of GIs in the array.
GetNumGis() const344 int GetNumGis() const
345 {
346 return (int) m_GisOids.size();
347 }
348
349 /// Get the number of TIs in the array.
GetNumTis() const350 int GetNumTis() const
351 {
352 return (int) m_TisOids.size();
353 }
354
355 /// Get the number of Seq-ids in the array.
GetNumSis() const356 int GetNumSis() const
357 {
358 return (int) m_SisOids.size();
359 }
360
GetNumTaxIds() const361 int GetNumTaxIds() const
362 {
363 return (int) m_TaxIdsOids.tax_ids.size();
364 }
365
GetNumOidsForTaxIdList() const366 int GetNumOidsForTaxIdList() const
367 {
368 return (int) m_TaxIdsOids.oids.size();
369 }
370
GetNumPigs() const371 int GetNumPigs() const
372 {
373 return (int) m_PigsOids.size();
374 }
375
376 /// Return false if there are elements present.
Empty() const377 bool Empty() const
378 {
379 return ! (GetNumGis() || GetNumSis() || GetNumTis() || GetNumTaxIds() || GetNumPigs());
380 }
381
382 /// Return true if there are elements present.
NotEmpty() const383 bool NotEmpty() const
384 {
385 return ! Empty();
386 }
387
388 /// Specify the correct OID for a GI.
389 ///
390 /// When SeqDB translates a GI into an OID, this method is called
391 /// to store the oid in the array.
392 ///
393 /// @param index
394 /// The location in the array of the GI, OID pair.
395 /// @param oid
396 /// The oid to store in that element.
SetGiTranslation(int index,int oid)397 void SetGiTranslation(int index, int oid)
398 {
399 m_GisOids[index].oid = oid;
400 }
401
402 /// Specify the correct OID for a TI.
403 ///
404 /// When SeqDB translates a TI into an OID, this method is called
405 /// to store the oid in the array.
406 ///
407 /// @param index
408 /// The location in the array of the TI, OID pair.
409 /// @param oid
410 /// The oid to store in that element.
SetTiTranslation(int index,int oid)411 void SetTiTranslation(int index, int oid)
412 {
413 m_TisOids[index].oid = oid;
414 }
415
416 /// Specify the correct OID for a Seq-id.
417 ///
418 /// When SeqDB translates a Seq-id into an OID, this method is
419 /// called to store the oid in the array.
420 ///
421 /// @param index
422 /// The location in the array of Seq-id, OID pairs.
423 /// @param oid
424 /// The oid to store in that element.
SetSiTranslation(int index,int oid)425 void SetSiTranslation(int index, int oid)
426 {
427 m_SisOids[index].oid = oid;
428 }
429
SetPigTranslation(int index,int oid)430 void SetPigTranslation(int index, int oid)
431 {
432 m_PigsOids[index].oid = oid;
433 }
434
Size() const435 int Size() const
436 {
437 return (int) m_GisOids.size();
438 }
439
440 template <class T>
GetSize() const441 int GetSize() const
442 {
443 return (int) m_GisOids.size();
444 }
445
446 template <class T>
GetKey(int index) const447 T GetKey(int index) const
448 {
449 return GI_TO(T, m_GisOids[index].gi);
450 }
451
452 template <class T>
IsValueSet(int index) const453 bool IsValueSet(int index) const
454 {
455 return (m_GisOids[index].oid != -1);
456 }
457
458 template <class T>
SetValue(int index,int oid)459 void SetValue(int index, int oid)
460 {
461 m_GisOids[index].oid = oid;
462 }
463
464 /// Get the gi list
465 void GetGiList(vector<TGi>& gis) const;
466
467 /// Get the ti list
468 void GetTiList(vector<TTi>& tis) const;
469
470 /// TODO Get the seqid list?
471 void GetSiList(vector<string>& sis) const;
472
473 void GetPigList(vector<TPig>& pigs) const;
474
475
GetTaxIdsList()476 set<TTaxId> & GetTaxIdsList()
477 {
478 return m_TaxIdsOids.tax_ids;
479 }
480
GetOidsForTaxIdsList()481 const vector<blastdb::TOid> & GetOidsForTaxIdsList()
482 {
483 return m_TaxIdsOids.oids;
484 }
485
SetOidsForTaxIdsList()486 vector<blastdb::TOid> & SetOidsForTaxIdsList()
487 {
488 m_TaxIdsOids.oids.clear();
489 return m_TaxIdsOids.oids;
490 }
491
492 /// Add a new GI to the list.
AddGi(TGi gi)493 void AddGi(TGi gi)
494 {
495 m_GisOids.push_back(gi);
496 }
497
498 /// Add a new TI to the list.
AddTi(TTi ti)499 void AddTi(TTi ti)
500 {
501 m_TisOids.push_back(ti);
502 }
503
504 /// Add a new SeqId to the list.
AddSi(const string & si)505 void AddSi(const string &si)
506 {
507 m_SisOids.push_back(si);
508 }
509
AddTaxIds(const set<TTaxId> & tax_ids)510 void AddTaxIds(const set<TTaxId> & tax_ids)
511 {
512 set<TTaxId> & tids = m_TaxIdsOids.tax_ids;
513 tids.insert(tax_ids.begin(), tax_ids.end());
514 }
515
SetPigList(const vector<TPig> & list)516 void SetPigList(const vector<TPig> & list)
517 {
518 ITERATE(vector<TPig>, itr, list) {
519 m_PigsOids.push_back(*itr);
520 }
521
522 }
523
AddPig(TPig pig)524 void AddPig(TPig pig)
525 {
526 m_PigsOids.push_back(pig);
527 }
528
529 /// Reserve space for GIs.
ReserveGis(size_t n)530 void ReserveGis(size_t n)
531 {
532 m_GisOids.reserve(n);
533 }
534
535 /// Reserve space for TIs.
ReserveTis(size_t n)536 void ReserveTis(size_t n)
537 {
538 m_TisOids.reserve(n);
539 }
540
ReserveSis(size_t n)541 void ReserveSis(size_t n)
542 {
543 m_SisOids.reserve(n);
544 }
545
ReservePigs(size_t n)546 void ReservePigs(size_t n)
547 {
548 m_PigsOids.reserve(n);
549 }
550
551 /// Preprocess ids for ISAM string id lookup
552 void PreprocessIdsForISAMSiLookup();
553
554 /// TODO Reserve space for seqids?
SetListInfo(const SBlastSeqIdListInfo & list_info)555 void SetListInfo(const SBlastSeqIdListInfo & list_info) {
556 m_ListInfo = list_info;
557 }
558
GetListInfo()559 const SBlastSeqIdListInfo & GetListInfo()
560 {
561 return m_ListInfo;
562 }
563
SetSiList(const vector<string> & new_list)564 void SetSiList( const vector<string> & new_list )
565 {
566 m_SisOids.clear();
567 ITERATE(vector<string>, itr, new_list) {
568 m_SisOids.push_back(*itr);
569 }
570 }
571 protected:
572 /// Indicates the current sort order, if any, of this container.
573 ESortOrder m_CurrentOrder;
574
575 /// Pairs of GIs and OIDs.
576 vector<SGiOid> m_GisOids;
577
578 /// Pairs of GIs and OIDs.
579 vector<STiOid> m_TisOids;
580
581 /// Pairs of Seq-ids and OIDs.
582 vector<SSiOid> m_SisOids;
583
584 vector<SPigOid> m_PigsOids;
585
586 STaxIdsOids m_TaxIdsOids;
587
588 SBlastSeqIdListInfo m_ListInfo;
589
590 private:
591 // The following disabled methods are reasonable things to do in
592 // some cases. But I suspect they are more likely to happen
593 // accidentally than deliberately; due to the high performance
594 // cost, I have prevented them. If this kind of deep copy is
595 // desireable, it can easily be enabled for a subclass by
596 // assigning each of the data fields in the protected section.
597
598 /// Prevent copy constructor.
599 CSeqDBGiList(const CSeqDBGiList & other);
600
601 /// Prevent assignment.
602 CSeqDBGiList & operator=(const CSeqDBGiList & other);
603 };
604
605
606 template < >
GetSize() const607 inline int CSeqDBGiList::GetSize<TTi>() const
608 {
609 return (int) m_TisOids.size();
610 }
611
612 template < >
GetKey(int index) const613 inline TTi CSeqDBGiList::GetKey<TTi>(int index) const
614 {
615 return m_TisOids[index].ti;
616 }
617
618 template < >
IsValueSet(int index) const619 inline bool CSeqDBGiList::IsValueSet<TTi>(int index) const
620 {
621 return (m_TisOids[index].oid != -1);
622 }
623
624 template < >
SetValue(int index,int oid)625 inline void CSeqDBGiList::SetValue<TTi>(int index, int oid)
626 {
627 m_TisOids[index].oid = oid;
628 }
629
630 template < >
GetSize() const631 inline int CSeqDBGiList::GetSize<string>() const
632 {
633 return (int) m_SisOids.size();
634 }
635
636 template < >
GetKey(int index) const637 inline string CSeqDBGiList::GetKey<string>(int index) const
638 {
639 return m_SisOids[index].si;
640 }
641
642 template < >
IsValueSet(int index) const643 inline bool CSeqDBGiList::IsValueSet<string>(int index) const
644 {
645 return (m_SisOids[index].oid != -1);
646 }
647
648 template < >
SetValue(int index,int oid)649 inline void CSeqDBGiList::SetValue<string>(int index, int oid)
650 {
651 m_SisOids[index].oid = oid;
652 }
653
654 template < >
GetSize() const655 inline int CSeqDBGiList::GetSize<TPig>() const
656 {
657 return (int) m_PigsOids.size();
658 }
659
660 template < >
GetKey(int index) const661 inline TPig CSeqDBGiList::GetKey<TPig>(int index) const
662 {
663 return m_PigsOids[index].pig;
664 }
665
666 template < >
IsValueSet(int index) const667 inline bool CSeqDBGiList::IsValueSet<TPig>(int index) const
668 {
669 return (m_PigsOids[index].oid != -1);
670 }
671
672 template < >
SetValue(int index,int oid)673 inline void CSeqDBGiList::SetValue<TPig>(int index, int oid)
674 {
675 m_PigsOids[index].oid = oid;
676 }
677
678
/// CSeqDBBitVector
///
/// This class defines a bit vector that is similar to vector<bool>,
/// but with a differently designed API that performs better on at
/// least some platforms, and slightly altered semantics.
///
/// Bits are addressed by OID.  The vector grows on demand when a bit
/// is set; bits that were never set read as false.  Negative OIDs are
/// never stored: setting or clearing one is a no-op and reading one
/// yields false.

class NCBI_XOBJREAD_EXPORT CSeqDBBitVector {
public:
    /// Constructor
    CSeqDBBitVector()
        : m_Size(0)
    {
    }

    /// Destructor
    virtual ~CSeqDBBitVector()
    {
    }

    /// Set the inclusion of an OID.
    ///
    /// Grows the vector if the OID lies beyond the current size.
    ///
    /// @param oid The OID in question; negative values are ignored. [in]
    void SetBit(int oid)
    {
        if (oid < 0) {
            // A negative OID would index in front of the bitmap.
            return;
        }
        if (oid >= m_Size) {
            x_Resize(oid+1);
        }
        x_SetBit(oid);
    }

    /// Clear the inclusion of an OID.
    ///
    /// OIDs outside of [0, Size()) are already clear, so nothing is
    /// done for them.
    ///
    /// @param oid The OID in question. [in]
    void ClearBit(int oid)
    {
        if (oid < 0 || oid >= m_Size) {
            return;
        }
        x_ClearBit(oid);
    }

    /// Get the inclusion status of an OID.
    ///
    /// @param oid The OID in question. [in]
    /// @return True if the bit for this OID is set; false for any OID
    ///   outside of [0, Size()).
    bool GetBit(int oid) const
    {
        if (oid < 0 || oid >= m_Size) {
            return false;
        }
        return x_GetBit(oid);
    }

    /// Get the size of the OID array (highest OID ever set, plus one).
    int Size() const
    {
        return m_Size;
    }

private:
    /// Prevent copy constructor.
    CSeqDBBitVector(const CSeqDBBitVector & other);

    /// Prevent assignment.
    CSeqDBBitVector & operator=(const CSeqDBBitVector & other);

    /// Bit vector element.
    typedef int TBits;

    /// Bit vector.
    vector<TBits> m_Bitmap;

    /// Maximum enabled OID plus one.
    int m_Size;

    /// Resize the OID list.
    ///
    /// Storage only ever grows: it starts at 1024 elements and is
    /// doubled until it can hold `num` bits.  New elements are zero.
    ///
    /// @param num The new number of addressable bits. [in]
    void x_Resize(int num)
    {
        int bits = 8*sizeof(TBits);
        int need = (num + bits - 1)/bits;

        if ((int)m_Bitmap.size() < need) {
            int new_size = 1024;

            while (new_size < need) {
                new_size *= 2;
            }

            m_Bitmap.resize(new_size);
        }

        m_Size = num;
    }

    /// Set a specific bit (to 1).
    void x_SetBit(int num)
    {
        int bits = 8*sizeof(TBits);

        m_Bitmap[num/bits] |= (1 << (num % bits));
    }

    /// Get the value of a specific bit.
    bool x_GetBit(int num) const
    {
        int bits = 8*sizeof(TBits);

        return !! (m_Bitmap[num/bits] & (1 << (num % bits)));
    }

    /// Clear a specific bit (to 0).
    void x_ClearBit(int num)
    {
        int bits = 8*sizeof(TBits);

        m_Bitmap[num/bits] &= ~(1 << (num % bits));
    }
};
797
798
799 /// CSeqDBNegativeList
800 ///
801 /// This class defines a list of GIs or TIs of sequences that should
802 /// not be included in a SeqDB instance. It is used by CSeqDB for
803 /// user specified negative ID lists. This class can be subclassed to
804 /// allow more efficient population of the GI or TI list.
805
806 class NCBI_XOBJREAD_EXPORT CSeqDBNegativeList : public CObject {
807 public:
808 /// Constructor
CSeqDBNegativeList()809 CSeqDBNegativeList()
810 : m_LastSortSize (0)
811 {
812 }
813
814 /// Destructor
~CSeqDBNegativeList()815 virtual ~CSeqDBNegativeList()
816 {
817 }
818
819 /// Sort list if not already sorted.
820 void InsureOrder();
821
822 /// Add a new GI to the list.
AddGi(TGi gi)823 void AddGi(TGi gi)
824 {
825 m_Gis.push_back(gi);
826 }
827
828 /// Add a new TI to the list.
AddTi(TTi ti)829 void AddTi(TTi ti)
830 {
831 m_Tis.push_back(ti);
832 }
833
834 /// Add a new SeqId to the list.
AddSi(const string & si)835 void AddSi(const string &si)
836 {
837 m_Sis.push_back(si);
838 }
839
840 /// Test for existence of a GI.
841 bool FindGi(TGi gi);
842
843 /// Test for existence of a TI.
844 bool FindTi(TTi ti);
845
846
847 bool FindSi(string si);
848
849 /// Test for existence of a TI or GI here and report whether the
850 /// ID was one of those types.
851 ///
852 /// If the input ID is a GI or TI, this method sets match_type to
853 /// true and returns the output of FindGi or FindTi. If it is
854 /// neither of those types, it sets match_type to false and
855 /// returns false. This method is used by SeqDB to filter Blast
856 /// Defline lists.
857 ///
858 /// @param id The identifier to find.
859 /// @param match_type The identifier is either a TI or GI.
860 /// @return true iff the id is found in the list.
861 bool FindId(const CSeq_id & id, bool & match_type);
862
863 /// Test for existence of a TI or GI included here.
864 bool FindId(const CSeq_id & id);
865
866 /// Access an element of the GI array.
867 /// @param index The index of the element to access. [in]
868 /// @return The GI for that index.
GetGi(int index) const869 TGi GetGi(int index) const
870 {
871 return m_Gis[index];
872 }
873
874 /// Access an element of the TI array.
875 /// @param index The index of the element to access. [in]
876 /// @return The TI for that index.
GetTi(int index) const877 TTi GetTi(int index) const
878 {
879 return m_Tis[index];
880 }
881
882 /// Access an element of the SeqId array.
883 /// @param index The index of the element to access. [in]
884 /// @return The TI for that index.
GetSi(int index) const885 const string GetSi(int index) const
886 {
887 return m_Sis[index];
888 }
889
890 /// Get the number of GIs in the array.
GetNumGis() const891 int GetNumGis() const
892 {
893 return (int) m_Gis.size();
894 }
895
896 /// Get the number of TIs in the array.
GetNumTis() const897 int GetNumTis() const
898 {
899 return (int) m_Tis.size();
900 }
901
902 /// Get the number of SeqIds in the array.
GetNumSis() const903 int GetNumSis() const
904 {
905 return (int) m_Sis.size();
906 }
907
GetNumPigs() const908 int GetNumPigs() const
909 {
910 return (int) m_Pigs.size();
911 }
912
IsGiList() const913 bool IsGiList() const
914 {
915 return(GetNumGis() > 0);
916 }
917
IsTiList() const918 bool IsTiList() const
919 {
920 return(GetNumTis() > 0);
921 }
922
IsSiList() const923 bool IsSiList() const
924 {
925 return(GetNumSis() > 0);
926 }
927
ListSize()928 int ListSize()
929 {
930 int size = GetNumGis();
931 if(size == 0) {
932 size = GetNumSis();
933 }
934 if(size == 0) {
935 size = GetNumTis();
936 }
937
938 if(size == 0) {
939 size = GetNumPigs();
940 }
941
942 return size;
943 }
944
945 /// Return false if there are elements present.
Empty() const946 bool Empty() const
947 {
948 return ! (GetNumGis() || GetNumTis() || GetNumSis()|| GetNumTaxIds() || GetNumPigs());
949 }
950
951 /// Return true if there are elements present.
NotEmpty() const952 bool NotEmpty() const
953 {
954 return ! Empty();
955 }
956
957 /// Include an OID in the iteration.
958 ///
959 /// The OID will be included by SeqDB in the set returned to users
960 /// by OID iteration.
961 ///
962 /// @param oid The OID in question. [in]
AddIncludedOid(int oid)963 void AddIncludedOid(int oid)
964 {
965 m_Included.SetBit(oid);
966 }
967
968 /// Indicate a visible OID.
969 ///
970 /// The OID will be marked as having been found in a GI or TI
971 /// ISAM index (but possibly not included for iteration).
972 ///
973 /// @param oid The OID in question. [in]
AddVisibleOid(int oid)974 void AddVisibleOid(int oid)
975 {
976 m_Visible.SetBit(oid);
977 }
978
979 /// Get the inclusion status of an OID.
980 ///
981 /// This returns true for OIDs that were in the included set and
982 /// for OIDs that were not found in the ISAM file at all.
983 ///
984 /// @param oid The OID in question. [in]
985 /// @return True if the OID is included by SeqDB.
GetOidStatus(int oid)986 bool GetOidStatus(int oid)
987 {
988 return m_Included.GetBit(oid) || (! m_Visible.GetBit(oid));
989 }
990
991 /// Get the size of the OID array.
GetNumOids()992 int GetNumOids()
993 {
994 return max(m_Visible.Size(), m_Included.Size());
995 }
996
997 /// Reserve space for GIs.
ReserveGis(size_t n)998 void ReserveGis(size_t n)
999 {
1000 m_Gis.reserve(n);
1001 }
1002
1003 /// Reserve space for TIs.
ReserveTis(size_t n)1004 void ReserveTis(size_t n)
1005 {
1006 m_Tis.reserve(n);
1007 }
1008
ReserveSis(size_t n)1009 void ReserveSis(size_t n)
1010 {
1011 m_Sis.reserve(n);
1012 }
1013
1014 /// Build ID set for this negative list.
GetGiList()1015 const vector<TGi> & GetGiList()
1016 {
1017 return m_Gis;
1018 }
1019
1020 /// Set ID set for this negative list.
SetGiList(const vector<TGi> & new_list)1021 void SetGiList( const vector<TGi> & new_list )
1022 {
1023 m_Gis.clear();
1024 m_Gis.reserve( new_list.size() );
1025 m_Gis = new_list;
1026 }
1027
SetPigList(const vector<TPig> & new_list)1028 void SetPigList( const vector<TPig> & new_list )
1029 {
1030 m_Pigs.clear();
1031 m_Pigs.reserve( new_list.size() );
1032 m_Pigs = new_list;
1033 }
1034
SetSiList(const vector<string> & new_list)1035 void SetSiList( const vector<string> & new_list )
1036 {
1037 m_Sis.clear();
1038 m_Sis.reserve( new_list.size() );
1039 m_Sis = new_list;
1040 }
1041
1042 /// Build ID set for this negative list.
GetTiList()1043 const vector<TTi> & GetTiList()
1044 {
1045 return m_Tis;
1046 }
1047
GetPigList()1048 const vector<TPig> & GetPigList()
1049 {
1050 return m_Pigs;
1051 }
1052
GetSiList()1053 const vector<string> & GetSiList()
1054 {
1055 return m_Sis;
1056 }
1057
1058 /// Get list size
Size(void)1059 int Size(void)
1060 {
1061 return (int)m_Gis.size();
1062 }
1063
GetListInfo()1064 const SBlastSeqIdListInfo & GetListInfo()
1065 {
1066 return m_ListInfo;
1067 }
1068
1069 void PreprocessIdsForISAMSiLookup();
1070
GetExcludedOids()1071 const vector<blastdb::TOid> & GetExcludedOids() { return m_ExcludedOids; }
SetExcludedOids()1072 vector<blastdb::TOid> & SetExcludedOids() { return m_ExcludedOids; }
1073
SetListInfo(const SBlastSeqIdListInfo & list_info)1074 void SetListInfo(const SBlastSeqIdListInfo & list_info) {
1075 m_ListInfo = list_info;
1076 }
GetListInfo() const1077 const SBlastSeqIdListInfo & GetListInfo() const{
1078 return m_ListInfo;
1079 }
1080
AddTaxIds(const set<TTaxId> & tax_ids)1081 void AddTaxIds(const set<TTaxId> & tax_ids)
1082 {
1083 m_TaxIds.insert(tax_ids.begin(), tax_ids.end());
1084 }
1085
GetTaxIdsList()1086 set<TTaxId> & GetTaxIdsList()
1087 {
1088 return m_TaxIds;
1089 }
1090
GetNumTaxIds() const1091 int GetNumTaxIds() const
1092 {
1093 return (int) m_TaxIds.size();
1094 }
1095
1096 protected:
1097 /// GIs to exclude from the SeqDB instance.
1098 vector<TGi> m_Gis;
1099
1100 /// TIs to exclude from the SeqDB instance.
1101 vector<TTi> m_Tis;
1102
1103 vector<TPig> m_Pigs;
1104
1105 /// SeqIds to exclude from the SeqDB instance.
1106 vector<string> m_Sis;
1107 set<TTaxId> m_TaxIds;
1108
1109 private:
1110 /// Prevent copy constructor.
1111 CSeqDBNegativeList(const CSeqDBNegativeList & other);
1112
1113 /// Prevent assignment.
1114 CSeqDBNegativeList & operator=(const CSeqDBNegativeList & other);
1115
1116 /// Included OID bitmap.
1117 CSeqDBBitVector m_Included;
1118
1119 /// OIDs visible to the ISAM file.
1120 CSeqDBBitVector m_Visible;
1121
1122 /// Zero if unsorted, or the size it had after the last sort.
1123 size_t m_LastSortSize;
1124
1125 SBlastSeqIdListInfo m_ListInfo;
1126
1127 vector<blastdb::TOid> m_ExcludedOids;
1128 };
1129
1130
1131 /// Read a binary-format GI list from a file.
1132 ///
1133 /// @param name The name of the file containing GIs. [in]
1134 /// @param gis The GIs returned by this function. [out]
1135 NCBI_XOBJREAD_EXPORT
1136 void SeqDB_ReadBinaryGiList(const string & name, vector<TGi> & gis);
1137
1138 /// Read a text or binary GI list from an area of memory.
1139 ///
1140 /// The GIs in a memory region are read into the provided SGiOid
1141 /// vector. The GI half of each element of the vector is assigned,
1142 /// but the OID half will be left as -1. If the in_order parameter is
1143 /// not null, the function will test the GIs for orderedness. It will
1144 /// set the bool to which in_order points to true if so, false if not.
1145 ///
1146 /// @param fbeginp The start of the memory region holding the GI list. [in]
1147 /// @param fendp The end of the memory region holding the GI list. [in]
1148 /// @param gis The GIs returned by this function. [out]
1149 /// @param in_order If non-null, returns true iff the GIs were in order. [out]
1150
1151 NCBI_XOBJREAD_EXPORT
1152 void SeqDB_ReadMemoryGiList(const char * fbeginp,
1153 const char * fendp,
1154 vector<CSeqDBGiList::SGiOid> & gis,
1155 bool * in_order = 0);
1156
1157 /// Read a text or binary TI list from an area of memory.
1158 ///
1159 /// The TIs in a memory region are read into the provided STiOid
1160 /// vector. The TI half of each element of the vector is assigned,
1161 /// but the OID half will be left as -1. If the in_order parameter is
1162 /// not null, the function will test the TIs for orderedness. It will
1163 /// set the bool to which in_order points to true if so, false if not.
1164 ///
1165 /// @param fbeginp The start of the memory region holding the TI list. [in]
1166 /// @param fendp The end of the memory region holding the TI list. [in]
1167 /// @param tis The TIs returned by this function. [out]
1168 /// @param in_order If non-null, returns true iff the TIs were in order. [out]
1169
1170 NCBI_XOBJREAD_EXPORT
1171 void SeqDB_ReadMemoryTiList(const char * fbeginp,
1172 const char * fendp,
1173 vector<CSeqDBGiList::STiOid> & tis,
1174 bool * in_order = 0);
1175
1176 /// Read a text SeqID list from an area of memory.
1177 ///
1178 /// The Seqids in a memory region are read into the provided SSeqIdOid
1179 /// vector. The SeqId half of each element of the vector is assigned,
1180 /// but the OID half will be left as -1. If the in_order parameter is
1181 /// not null, the function will test the SeqIds for orderedness. It will
1182 /// set the bool to which in_order points to true if so, false if not.
1183 ///
1184 /// @param fbeginp The start of the memory region holding the SeqId list. [in]
1185 /// @param fendp The end of the memory region holding the SeqId list. [in]
/// @param sis The SeqIds returned by this function. [out]
1187 /// @param in_order If non-null, returns true iff the seqids were in order. [out]
1188
1189 NCBI_XOBJREAD_EXPORT
1190 void SeqDB_ReadMemorySiList(const char * fbeginp,
1191 const char * fendp,
1192 vector<CSeqDBGiList::SSiOid> & sis,
1193 bool * in_order = 0);
1194
1195 /// Read an ID list (mixed type) from an area of memory.
1196 ///
1197 /// The Seq ids in a memory region are read into the provided SSeqIdOid
1198 /// vector. The gi, ti or seqid half of each element of the vector is assigned,
1199 /// but the OID half will be left as -1. If the in_order parameter is
1200 /// not null, the function will test the SeqIds for orderedness. It will
1201 /// set the bool to which in_order points to true if so, false if not.
1202 ///
1203 /// @param fbeginp The start of the memory region holding the SeqId list. [in]
1204 /// @param fendp The end of the memory region holding the SeqId list. [in]
1205 /// @param gis The gis returned by this function. [out]
1206 /// @param tis The tis returned by this function. [out]
1207 /// @param sis The seqids returned by this function. [out]
1208 /// @param in_order If non-null, returns true iff the seqids were in order. [out]
1209
1210 NCBI_XOBJREAD_EXPORT
1211 void SeqDB_ReadMemoryMixList(const char * fbeginp,
1212 const char * fendp,
1213 vector<CSeqDBGiList::SGiOid> & gis,
1214 vector<CSeqDBGiList::STiOid> & tis,
1215 vector<CSeqDBGiList::SSiOid> & sis,
1216 bool * in_order);
1217
1218 NCBI_XOBJREAD_EXPORT
1219 void SeqDB_ReadMemoryPigList(const char * fbeginp,
1220 const char * fendp,
1221 vector<CSeqDBGiList::SPigOid> & pigs,
1222 bool * in_order = 0);
1223
1224 /// Combine and quote a list of database names.
1225 ///
1226 /// SeqDB permits multiple databases to be opened by a single CSeqDB
1227 /// instance, by passing the database names as a space-delimited list
1228 /// to the CSeqDB constructor. To support paths and filenames with
1229 /// embedded spaces, surround any space-containing names with double
1230 /// quotes ('"'). Filenames not containing spaces may be quoted
1231 /// safely with no effect. (This solution prevents the use of names
1232 /// containing embedded double quotes.)
1233 ///
1234 /// This method combines a list of database names into a string
1235 /// encoded in this way.
1236 ///
/// @param dbs Database names to combine. [in]
/// @param dbname Combined database name. [out]
1239
1240 NCBI_XOBJREAD_EXPORT
1241 void SeqDB_CombineAndQuote(const vector<string> & dbs,
1242 string & dbname);
1243
1244 /// Split a (possibly) quoted list of database names into pieces.
1245 ///
1246 /// SeqDB permits multiple databases to be opened by a single CSeqDB
1247 /// instance, by passing the database names as a space-delimited list
1248 /// to the CSeqDB constructor. To support paths and filenames with
1249 /// embedded spaces, surround any space-containing names with double
1250 /// quotes ('"'). Filenames not containing spaces may be quoted
1251 /// safely with no effect. (This solution prevents the use of names
1252 /// containing embedded double quotes.)
1253 ///
1254 /// This method splits a string encoded in this way into individual
1255 /// database names. Note that the resulting vector's objects are
1256 /// CTempString "slice" objects, and are only valid while the original
1257 /// (encoded) string is unchanged.
1258 ///
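/// @param dbname Combined database name. [in]
/// @param dbs Individual database names (slices of dbname). [out]
/// @param keep_quote Presumably controls whether the surrounding double
///        quotes are kept in the returned names; confirm against the
///        implementation. [in]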
1261
1262 NCBI_XOBJREAD_EXPORT
1263 void SeqDB_SplitQuoted(const string & dbname,
1264 vector<CTempString> & dbs,
1265 bool keep_quote = false);
1266
1267 /// Read a text or binary GI list from a file.
1268 ///
1269 /// The GIs in a file are read into the provided SGiOid vector. The
1270 /// GI half of each element of the vector is assigned, but the OID
1271 /// half will be left as -1. If the in_order parameter is not null,
1272 /// the function will test the GIs for orderedness. It will set the
1273 /// bool to which in_order points to true if so, false if not.
1274 ///
1275 /// @param fname The name of the GI list file. [in]
1276 /// @param gis The GIs returned by this function. [out]
1277 /// @param in_order If non-null, returns true iff the GIs were in order. [out]
1278
1279 NCBI_XOBJREAD_EXPORT
1280 void SeqDB_ReadGiList(const string & fname,
1281 vector<CSeqDBGiList::SGiOid> & gis,
1282 bool * in_order = 0);
1283
1284 /// Read a text or binary TI list from a file.
1285 ///
1286 /// The TIs in a file are read into the provided STiOid vector. The
1287 /// TI half of each element of the vector is assigned, but the OID
1288 /// half will be left as -1. If the in_order parameter is not null,
1289 /// the function will test the TIs for orderedness. It will set the
1290 /// bool to which in_order points to true if so, false if not.
1291 ///
1292 /// @param fname The name of the TI list file. [in]
1293 /// @param tis The TIs returned by this function. [out]
1294 /// @param in_order If non-null, returns true iff the TIs were in order. [out]
1295
1296 NCBI_XOBJREAD_EXPORT
1297 void SeqDB_ReadTiList(const string & fname,
1298 vector<CSeqDBGiList::STiOid> & tis,
1299 bool * in_order = 0);
1300
1301 /// Read a text SeqId list from a file.
1302 ///
1303 /// The Seqids in a file are read into the provided SSeqIdOid vector. The
1304 /// SeqId half of each element of the vector is assigned, but the OID
1305 /// half will be left as -1. If the in_order parameter is not null,
1306 /// the function will test the SeqIds for orderedness. It will set the
1307 /// bool to which in_order points to true if so, false if not.
1308 ///
1309 /// @param fname The name of the SeqId list file. [in]
/// @param sis The SeqIds returned by this function. [out]
/// @param in_order If non-null, returns true iff the SeqIds were in order. [out]
/// @param db_info Seq-id list information associated with the file;
///        presumably filled in by this function (confirm against the
///        implementation).
1313
1314 NCBI_XOBJREAD_EXPORT
1315 void SeqDB_ReadSiList(const string & fname,
1316 vector<CSeqDBGiList::SSiOid> & sis,
1317 bool * in_order,
1318 SBlastSeqIdListInfo & db_info);
1319
/// Read an ID list (mixed type) from a file.
1321 ///
1322 /// The Seqids in a file are read into the provided SSeqIdOid vector. The
1323 /// Gi/Ti/Si half of each element of the vector is assigned, but the OID
1324 /// half will be left as -1. If the in_order parameter is not null,
1325 /// the function will test the SeqIds for orderedness. It will set the
1326 /// bool to which in_order points to true if so, false if not.
1327 ///
1328 /// @param fname The name of the SeqId list file. [in]
/// @param gis The GIs returned by this function. [out]
/// @param tis The TIs returned by this function. [out]
1330 /// @param sis The SeqIds returned by this function. [out]
1331 /// @param in_order If non-null, returns true iff the SeqIds were in order. [out]
1332
1333 NCBI_XOBJREAD_EXPORT
1334 void SeqDB_ReadMixList(const string & fname,
1335 vector<CSeqDBGiList::SGiOid> & gis,
1336 vector<CSeqDBGiList::STiOid> & tis,
1337 vector<CSeqDBGiList::SSiOid> & sis,
1338 bool * in_order);
1339
1340 NCBI_XOBJREAD_EXPORT
1341 void SeqDB_ReadPigList(const string & fname,
1342 vector<CSeqDBGiList::SPigOid> & pigs,
1343 bool * in_order = 0);
1344
1345
1346 /// Read a text or binary GI list from a file.
1347 ///
/// The GIs in a file are read into the provided vector<TGi>. If the
1349 /// in_order parameter is not null, the function will test the GIs for
1350 /// orderedness. It will set the bool to which in_order points to
1351 /// true if so, false if not.
1352 ///
1353 /// @param fname The name of the GI list file. [in]
1354 /// @param gis The GIs returned by this function. [out]
1355 /// @param in_order If non-null, returns true iff the GIs were in order. [out]
1356
1357 NCBI_XOBJREAD_EXPORT
1358 void SeqDB_ReadGiList(const string & fname,
1359 vector<TGi> & gis,
1360 bool * in_order = 0);
1361
1362 /// Read a text or binary SeqId list from a file.
1363 ///
1364 /// The SeqIds in a file are read into the provided vector<string>. If the
1365 /// in_order parameter is not null, the function will test the SeqIds for
1366 /// orderedness. It will set the bool to which in_order points to
1367 /// true if so, false if not.
1368 ///
1369 /// @param fname The name of the SeqId list file. [in]
1370 /// @param sis The SeqIds returned by this function. [out]
1371 /// @param in_order If non-null, returns true iff the SeqIds were in order. [out]
1372
1373 ///NCBI_XOBJREAD_EXPORT
1374 ///void SeqDB_ReadSeqIdList(const string & fname,
1375 /// vector<string> & sis,
1376 /// bool * in_order = 0);
1377
1378 /// Returns true if the file name passed contains a binary gi list
1379 ///
1380 /// @param fname The name of the GI list file. [in]
1381 /// @throws CSeqDBException if file is invalid or empty
1382 NCBI_XOBJREAD_EXPORT
1383 bool SeqDB_IsBinaryGiList(const string & fname);
1384
1385 /// Returns true if the file name passed contains a binary TI list
1386 ///
1387 /// @param fname The name of the TI list file. [in]
1388 /// @throws CSeqDBException if file is invalid or empty
1389 NCBI_XOBJREAD_EXPORT
1390 bool SeqDB_IsBinaryTiList(const string & fname);
1391
1392 /// CSeqDBFileGiList
1393 ///
1394 /// This class defines a CSeqDBGiList subclass which reads a GI list
1395 /// file given a filename. It can read text or binary GI list files,
1396 /// and will automatically distinguish between them.
1397
1398 class NCBI_XOBJREAD_EXPORT CSeqDBFileGiList : public CSeqDBGiList {
1399 public:
1400 enum EIdType {
1401 eGiList,
1402 eTiList,
1403 eSiList,
1404 eMixList,
1405 ePigList,
1406 eTaxIdList
1407 };
1408
1409 /// Build a GI list from a file.
1410 CSeqDBFileGiList(const string & fname, EIdType idtype=eGiList);
1411
1412 /// Build a GI list from multiple files. (only support eSiList)
1413 //CSeqDBFileGiList(vector<string> fnames, EIdType idtype=eGiList);
1414 };
1415
1416
1417 /// GI list containing the intersection of two other lists of GIs.
1418 ///
1419 /// This class takes a CSeqDBGiList and an integer vector and computes
1420 /// the intersection of the two. Note that both input arguments are
1421 /// sorted to GI order in-place.
1422
1423 class NCBI_XOBJREAD_EXPORT CIntersectionGiList : public CSeqDBGiList {
1424 public:
1425 /// Construct an intersection of two lists of GIs.
1426 ///
1427 /// The two lists of GIs are sorted and this class is computed as
1428 /// an intersection of them. Note that both arguments to this
1429 /// function are potentially modified (sorted in place).
1430 CIntersectionGiList(CSeqDBGiList & gilist, vector<TGi> & gis);
1431
1432 /// The two lists of GIs are sorted and this class is computed as
1433 /// an intersection of them. Since gilist is negative this means
1434 /// all gi's in the vector that are NOT in the negative list.
1435 /// Note that both arguments to this
1436 /// function are potentially modified (sorted in place).
1437 CIntersectionGiList(CSeqDBNegativeList & gilist, vector<TGi> & gis);
1438 };
1439
1440
1441 /// Helper class to allow copy-on-write semantics for CSeqDBIdSet.
1442 ///
1443 /// This class owns the actual vector of IDs for the CSeqDBIdSet list.
1444
1445 class CSeqDBIdSet_Vector : public CObject {
1446 public:
1447 /// Default constructor.
CSeqDBIdSet_Vector()1448 CSeqDBIdSet_Vector()
1449 {
1450 }
1451
1452 /// Construct from an 'int' set.
CSeqDBIdSet_Vector(const vector<Int4> & ids)1453 CSeqDBIdSet_Vector(const vector<Int4> & ids)
1454 {
1455 ITERATE(vector<Int4>, iter, ids) {
1456 m_Ids.push_back((Int8) *iter);
1457 }
1458 }
1459
1460 /// Construct from an 'Int8' set.
CSeqDBIdSet_Vector(const vector<Int8> & ids)1461 CSeqDBIdSet_Vector(const vector<Int8> & ids)
1462 {
1463 m_Ids = ids;
1464 }
1465
1466 /// Construct from an 'Uint8' set.
CSeqDBIdSet_Vector(const vector<Uint8> & ids)1467 CSeqDBIdSet_Vector(const vector<Uint8> & ids)
1468 {
1469 ITERATE(vector<Uint8>, iter, ids) {
1470 m_Ids.push_back((Int8) *iter);
1471 }
1472 }
1473
CSeqDBIdSet_Vector(const vector<string> & ids)1474 CSeqDBIdSet_Vector(const vector<string> & ids)
1475 {
1476 ITERATE(vector<string>, iter, ids) {
1477 m_SeqIds.push_back((string) *iter);
1478 }
1479 }
1480
1481
1482 #ifdef NCBI_STRICT_GI
1483 /// Construct from a 'TGi' set when NCBI_STRICT_GI is in force.
CSeqDBIdSet_Vector(const vector<TGi> & ids)1484 CSeqDBIdSet_Vector(const vector<TGi> & ids)
1485 {
1486 ITERATE(vector<TGi>, iter, ids) {
1487 m_Ids.push_back(GI_TO(Int8, *iter));
1488 }
1489 }
1490 #endif
1491
1492 /// Access the Int8 set.
Set()1493 vector<Int8> & Set()
1494 {
1495 return m_Ids;
1496 }
1497
1498 /// Access the Int8 set.
Get() const1499 const vector<Int8> & Get() const
1500 {
1501 return m_Ids;
1502 }
1503
1504 /// Access the string set.
SetSeqIDs()1505 vector<string> & SetSeqIDs()
1506 {
1507 return m_SeqIds;
1508 }
1509
1510 /// Access the string set.
GetSeqIDs() const1511 const vector<string> & GetSeqIDs() const
1512 {
1513 return m_SeqIds;
1514 }
1515
1516 /// Get the number of elements stored here.
Size() const1517 size_t Size() const
1518 {
1519 size_t n = m_Ids.size();
1520 if(n == 0) {
1521 n = m_SeqIds.size();
1522 }
1523 return n;
1524 }
1525
1526 private:
1527 /// The actual list elements.
1528 vector<Int8> m_Ids;
1529
1530 vector<string> m_SeqIds;
1531
1532 /// Prevent copy construction.
1533 CSeqDBIdSet_Vector(CSeqDBIdSet_Vector &);
1534
1535 /// Prevent copy assignment.
1536 CSeqDBIdSet_Vector & operator=(CSeqDBIdSet_Vector &);
1537 };
1538
1539
1540 /// SeqDB ID list for performing boolean set operations.
1541 ///
1542 /// This class permits boolean operations on lists of numeric IDs,
1543 /// and can be passed to CSeqDB in the same way as a CSeqDBGiList.
1544 /// CSeqDBGiList or CSeqDBNegativeList objects can be constructed as
1545 /// well. Logical operations supported include AND, OR, XOR, and NOT.
1546 /// Internally this uses a CRef based copy-on-write scheme, so these
1547 /// objects can be copied in constant time.
1548
1549 class NCBI_XOBJREAD_EXPORT CSeqDBIdSet : public CObject {
1550 public:
1551 /// Types of operations that may be performed on GI lists.
1552 enum EOperation {
1553 eAnd, // Found in both X and Y
1554 eXor, // Found in X or Y, but not both
1555 eOr // Found in either X or Y
1556 };
1557
1558 /// Type of IDs stored here.
1559 enum EIdType {
1560 eGi, // Found in both X and Y
1561 eTi, // Found in X or Y, but not both
1562 eSi
1563 };
1564
1565 /// Construct a 'blank' CSeqDBIdSet object.
1566 ///
1567 /// This produces a blank ID set object, which (if applied) would
1568 /// not cause any filtering to occur. This is represented here as
1569 /// a negative ID list with no elements.
1570 ///
1571 CSeqDBIdSet();
1572
1573 /// Build a computed ID list given an initial set of IDs.
1574 ///
1575 /// This initializes a list with an initial set of IDs of the
1576 /// specified type. All further logic operations on the list
1577 /// should use vectors of IDs or CSeqDBIdSet objects
1578 /// initialized with the same EIdType enumeration.
1579 ///
1580 /// @param ids These IDs will be added to the list.
1581 /// @param t The IDs are assumed to be of this type.
1582 /// @param positive True for a positive ID list, false for negative.
1583 CSeqDBIdSet(const vector<Int4> & ids, EIdType t, bool positive = true);
1584
1585 /// Build a computed ID list given an initial set of IDs.
1586 ///
1587 /// This initializes a list with an initial set of IDs of the
1588 /// specified type. All further logic operations on the list
1589 /// should use vectors of IDs or CSeqDBIdSet objects
1590 /// initialized with the same EIdType enumeration.
1591 ///
1592 /// @param ids These IDs will be added to the list.
1593 /// @param t The IDs are assumed to be of this type.
1594 /// @param positive True for a positive ID list, false for negative.
1595 CSeqDBIdSet(const vector<Int8> & ids, EIdType t, bool positive = true);
1596
1597 /// Build a computed ID list given an initial set of IDs.
1598 ///
1599 /// This initializes a list with an initial set of IDs of the
1600 /// specified type. All further logic operations on the list
1601 /// should use vectors of IDs or CSeqDBIdSet objects
1602 /// initialized with the same EIdType enumeration.
1603 ///
1604 /// @param ids These IDs will be added to the list.
1605 /// @param t The IDs are assumed to be of this type.
1606 /// @param positive True for a positive ID list, false for negative.
1607 CSeqDBIdSet(const vector<Uint8> & ids, EIdType t, bool positive = true);
1608
1609
1610
1611
1612 #ifdef NCBI_STRICT_GI
1613 /// Build a computed ID list given an initial set of IDs.
1614 ///
1615 /// This initializes a list with an initial set of IDs of the
1616 /// specified type. All further logic operations on the list
1617 /// should use vectors of IDs or CSeqDBIdSet objects
1618 /// initialized with the same EIdType enumeration.
1619 ///
1620 /// @param ids These IDs will be added to the list.
1621 /// @param t The IDs are assumed to be of this type.
1622 /// @param positive True for a positive ID list, false for negative.
1623 CSeqDBIdSet(const vector<TGi> & ids, EIdType t, bool positive = true);
1624 #endif
1625
1626 CSeqDBIdSet(const vector<string> & ids, EIdType t, bool positive = true);
1627
1628 /// Virtual destructor.
~CSeqDBIdSet()1629 virtual ~CSeqDBIdSet()
1630 {
1631 }
1632
1633 /// Invert the current list.
1634 void Negate();
1635
1636 /// Perform a logical operation on a list.
1637 ///
1638 /// The logical operation is performed between the current list
1639 /// and the ids parameter, and the 'positive' flag is used to
1640 /// determine if the new input list should be treated as a
1641 /// positive or negative list. For example, using op == eOr and
1642 /// positive == false would perform the operation (X OR NOT Y).
1643 ///
1644 /// @param op Logical operation to perform.
1645 /// @param ids List of ids for the second argument.
1646 /// @param positive True for positive lists, false for negative.
1647 void Compute(EOperation op,
1648 const vector<int> & ids,
1649 bool positive = true);
1650
1651 /// Perform a logical operation on a list.
1652 ///
1653 /// The logical operation is performed between the current list
1654 /// and the ids parameter, and the 'positive' flag is used to
1655 /// determine if the new input list should be treated as a
1656 /// positive or negative list. For example, using op == eOr and
1657 /// positive == false would perform the operation (X OR NOT Y).
1658 ///
1659 /// @param op Logical operation to perform.
1660 /// @param ids List of ids for the second argument.
1661 /// @param positive If true, ids represent 'negative' ids.
1662 void Compute(EOperation op,
1663 const vector<Int8> & ids,
1664 bool positive = true);
1665
1666 /// Perform a logical operation on a list.
1667 ///
1668 /// The logical operation is performed between the current list
1669 /// and the ids parameter, and the 'positive' flag is used to
1670 /// determine if the new input list should be treated as a
1671 /// positive or negative list. For example, using op == eOr and
1672 /// positive == false would perform the operation (X OR NOT Y).
1673 ///
1674 /// @param op Logical operation to perform.
1675 /// @param ids List of ids for the second argument.
1676 /// @param positive If true, ids represent 'negative' ids.
1677 void Compute(EOperation op,
1678 const vector<Uint8> & ids,
1679 bool positive = true);
1680
1681 /// Perform a logical operation on a list.
1682 ///
1683 /// The logical operation is performed between the current list
1684 /// and the ids parameter. For example if 'eOr' is specified, the
1685 /// operation performed will be 'X OR Y'. The 'ids' list will not
1686 /// be modified by this operation.
1687 ///
1688 /// @param op Logical operation to perform.
1689 /// @param ids List of ids for the second argument.
1690 void Compute(EOperation op, const CSeqDBIdSet & ids);
1691
1692 /// Checks whether a positive GI list was produced.
1693 ///
1694 /// If this method returns true, a positive list was produced, and
1695 /// can be retrieved with GetPositiveList(). If it returns false,
1696 /// a negative list was produced and can be retrieved with
1697 /// GetNegativeList().
1698 ///
1699 /// @return true If the produced GI list is positive.
IsPositive()1700 bool IsPositive()
1701 {
1702 return m_Positive;
1703 }
1704
1705 /// Retrieve a positive GI list.
1706 ///
1707 /// If IsPositive() returned true, this method should be used to
1708 /// retrieve a positive GI list. If IsPositive() returned false,
1709 /// this method will throw an exception.
1710 CRef<CSeqDBGiList> GetPositiveList();
1711
1712 /// Retrieve a negative GI list.
1713 ///
1714 /// If IsPositive() returned false, this method should be used to
1715 /// retrieve a positive GI list. If IsPositive() returned true,
1716 /// this method will throw an exception.
1717 ///
1718 /// @return A negative GI list.
1719 CRef<CSeqDBNegativeList> GetNegativeList();
1720
1721 /// Check if an ID list is blank.
1722 ///
1723 /// An ID list is considered 'blank' iff it is a negative list
1724 /// with no elements. Constructing a database with such a list is
1725 /// equivalent to not specifying a list. Blank lists are produced
1726 /// by the default constructor, by specifying a negative list and
1727 /// providing an empty vector, or by computation (an intersection
1728 /// of disjoint negative lists, for example). This method returns
1729 /// true in those cases; otherwise it returns false.
1730 ///
1731 /// @return True if this list is blank.
1732 bool Blank() const;
1733
1734 private:
1735 /// Sort and unique the internal set.
1736 static void x_SortAndUnique(vector<Int8> & ids);
1737
1738 static void x_SortAndUnique(vector<string> & ids);
1739
1740 /// Compute inclusion flags for a boolean operation.
1741 ///
1742 /// This takes a logical operator (AND, OR, or XOR) and a flag
1743 /// indicating whether each input lists is positive or negative,
1744 /// and produces a flag indicating whether the resulting list will
1745 /// be positive or negative and three flags used to control the
1746 /// set merging operation.
1747 ///
1748 /// @param op The operation to perform (OR, AND, or XOR). [in]
1749 /// @param A_pos True if the first list is positive. [in]
1750 /// @param B_pos True if the second list is positive. [in]
1751 /// @param result_pos True if the result is a positive list. [out]
1752 /// @param incl_A True if ids found only in list A are kept. [out]
1753 /// @param incl_B True if ids found only in list B are kept. [out]
1754 /// @param incl_AB True if ids found in both lists are kept. [out]
1755 static void x_SummarizeBooleanOp(EOperation op,
1756 bool A_pos,
1757 bool B_pos,
1758 bool & result_pos,
1759 bool & incl_A,
1760 bool & incl_B,
1761 bool & incl_AB);
1762
1763 /// Compute boolean operation on two vectors.
1764 ///
1765 /// This takes a logical operator (AND, OR, or XOR) and two
1766 /// positive or negative lists, and produces a positive or
1767 /// negative list representing that operation applied to those
1768 /// lists.
1769 ///
1770 /// @param op The operation to perform (OR, AND, or XOR). [in]
1771 /// @param A The first input list. [in]
1772 /// @param A_pos True if the first list is positive. [in]
1773 /// @param B The second input list. [in]
1774 /// @param B_pos True if the second list is positive. [in]
1775 /// @param result The resulting list of identifiers. [out]
1776 /// @param result_pos True if the result is a positive list. [out]
1777 void x_BooleanSetOperation(EOperation op,
1778 const vector<Int8> & A,
1779 bool A_pos,
1780 const vector<Int8> & B,
1781 bool B_pos,
1782 vector<Int8> & result,
1783 bool & result_pos);
1784
1785 /// True if the current list is positive.
1786 bool m_Positive;
1787
1788 /// Id type.
1789 EIdType m_IdType;
1790
1791 /// Ids stored here.
1792 CRef<CSeqDBIdSet_Vector> m_Ids;
1793
1794 /// Cached positive list.
1795 CRef<CSeqDBGiList> m_CachedPositive;
1796
1797 /// Cached negative list.
1798 CRef<CSeqDBNegativeList> m_CachedNegative;
1799 };
1800
1801
// The "instance" concept in the following types refers to the fact
// that each alias file has a separately instantiated node for each
// point where it appears in the alias file hierarchy.
1805
1806 /// Set of values found in one instance of one alias file.
1807 typedef map<string, string> TSeqDBAliasFileInstance;
1808
1809 /// Contents of all instances of a particular alias file pathname.
1810 typedef vector< TSeqDBAliasFileInstance > TSeqDBAliasFileVersions;
1811
/// Contents of all alias files are returned in this type of container.
1813 typedef map< string, TSeqDBAliasFileVersions > TSeqDBAliasFileValues;
1814
1815
1816 /// SSeqDBTaxInfo
1817 ///
1818 /// This structure contains the taxonomy information for a single
1819 /// given taxid.
1820
1821 struct SSeqDBTaxInfo {
1822 /// Default constructor
1823 /// @param t the taxonomy ID to set for this structure
SSeqDBTaxInfoSSeqDBTaxInfo1824 SSeqDBTaxInfo(TTaxId t = ZERO_TAX_ID)
1825 : taxid(t)
1826 {
1827 }
1828
1829 /// An identifier for this species or taxonomic group.
1830 TTaxId taxid;
1831
1832 /// Scientific name, such as "Aotus vociferans".
1833 string scientific_name;
1834
1835 /// Common name, such as "noisy night monkey".
1836 string common_name;
1837
1838 /// A simple category name, such as "birds".
1839 string blast_name;
1840
1841 /// A string of length 1 indicating the "Super Kingdom".
1842 string s_kingdom;
1843
operator <<(ostream & out,const SSeqDBTaxInfo & rhs)1844 friend ostream& operator<<(ostream& out, const SSeqDBTaxInfo& rhs) {
1845 out << "Taxid=" << rhs.taxid
1846 << "\tSciName=" << rhs.scientific_name
1847 << "\tCommonName=" << rhs.common_name
1848 << "\tBlastName=" << rhs.blast_name
1849 << "\tSuperKingdom=" << rhs.s_kingdom;
1850 return out;
1851 }
1852 };
1853
1854
1855 /// Resolve a file path using SeqDB's path algorithms.
1856 ///
1857 /// This finds a file using the same algorithm used by SeqDB to find
1858 /// blast database filenames. The filename must include the extension
1859 /// if any. Paths which start with '/', '\', or a drive letter
1860 /// (depending on operating system) will be treated as absolute paths.
1861 /// If the file is not found an empty string will be returned.
1862 ///
1863 /// @param filename Name of file to find.
1864 /// @return Resolved path or empty string if not found.
1865
1866 NCBI_XOBJREAD_EXPORT
1867 string SeqDB_ResolveDbPath(const string & filename);
1868
1869 /// Resolve a file path using SeqDB's path algorithms.
1870 ///
1871 /// Identical to SeqDB_ResolveDbPath with the exception that this function does
1872 /// not require the extension to be provided. This is intended to check whether
1873 /// a BLAST DB exists or not.
1874 ///
1875 /// @param filename Name of file to find.
1876 /// @param dbtype Determines whether the BLAST DB is protein ('p'), nucleotide
1877 /// ('n'), or whether the algorithm should guess it ('-')
1878 /// @return Resolved path or empty string if not found.
1879 NCBI_XOBJREAD_EXPORT
1880 string SeqDB_ResolveDbPathNoExtension(const string & filename,
1881 char dbtype = '-');
1882
1883 /// Resolve a file path using SeqDB's path algorithms.
1884 ///
1885 /// Identical to SeqDB_ResolveDbPathNoExtension with the exception that this
1886 /// function searches for ISAM or SQLite files, specifically those storing
1887 /// numeric and string data (for LinkoutDB; i.e.: '.sqlite3').
1888 /// This is intended to check whether the files used in LinkoutDB
1889 /// exist or not.
1890 ///
1891 /// @param filename Name of file to find.
1892 /// @return Resolved path or empty string if not found.
1893 NCBI_XOBJREAD_EXPORT
1894 string SeqDB_ResolveDbPathForLinkoutDB(const string & filename);
1895
/// Compares two volume file names and determines the volume order
1897 ///
1898 /// @param volpath1 The 1st volume path
1899 /// @param volpath2 The 2nd volume path
1900 /// @return true if vol1 should appear before vol2
1901 NCBI_XOBJREAD_EXPORT
1902 bool SeqDB_CompareVolume(const string & volpath1,
1903 const string & volpath2);
1904
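// NOTE: orphaned, truncated comment fragment; it appears to belong to a
// declaration that is not present at this point in the file:
//   "Returns a path minus filename.  Substring version of the above.
//    This returns the part of a file"
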
1908 /// Sequence Hashing
1909 ///
1910 /// This computes a hash of a sequence. The sequence is expected to
1911 /// be in either ncbistdaa format (for protein) or ncbi8na format (for
1912 /// nucleotide). These formats are produced by CSeqDB::GetAmbigSeq()
1913 /// if the kSeqDBNuclNcbiNA8 encoding is selected.
1914 ///
1915 /// @param sequence A pointer to the sequence data. [in]
1916 /// @param length The length of the sequence in bases. [in]
1917 /// @return The 32 bit hash value.
1918 NCBI_XOBJREAD_EXPORT
1919 unsigned SeqDB_SequenceHash(const char * sequence,
1920 int length);
1921
1922 /// Sequence Hashing For a CBioseq
1923 ///
1924 /// This computes a hash of a sequence expressed as a CBioseq.
1925 ///
1926 /// @param sequence The sequence. [in]
1927 /// @return The 32 bit hash value.
1928 NCBI_XOBJREAD_EXPORT
1929 unsigned SeqDB_SequenceHash(const CBioseq & sequence);
1930
1931 /// Various identifier formats used in Id lookup
enum ESeqDBIdType {
    /// Genomic ID is a relatively stable numeric identifier for sequences.
    eGiId,

    /// Trace ID is a numeric identifier for Trace sequences.
    eTiId,

    /// Each PIG identifier refers to exactly one protein sequence.
    ePigId,

    /// Some sequence sources use string identifiers.
    eStringId,

    /// Lookup from sequence hash values to OIDs.
    eHashId,

    /// The ordinal id indicates the order of the data in the volume's
    /// index file.
    eOID
};
1940
1941 /// Seq-id simplification.
1942 ///
1943 /// Given a Seq-id, this routine devolves it to a GI or PIG if
1944 /// possible. If not, it formats the Seq-id into a canonical form
1945 /// for lookup in the string ISAM files. If the Seq-id was parsed
1946 /// from an accession, that string can be provided in the "acc"
1947 /// parameter, and it will be used if the Seq-id is not in a form
1948 /// this code can recognize. In the case that new Seq-id types are
1949 /// added, support for which has not been added to this code, this
1950 /// mechanism will try to use the original string.
1951 ///
1952 /// @param bestid
1953 /// The Seq-id to look up. NOTE(review): documented as input only but taken by non-const reference; confirm whether it can be modified. [in]
1954 /// @param acc
1955 /// The original string the Seq-id was created from (or NULL). [in]
1956 /// @param num_id
1957 /// The returned identifier, if numeric. [out]
1958 /// @param str_id
1959 /// The returned identifier, if a string. [out]
1960 /// @param simpler
1961 /// Whether an adjustment was done at all. [out]
1962 /// @return
1963 /// The resulting identifier type (an ESeqDBIdType value).
1964 NCBI_XOBJREAD_EXPORT ESeqDBIdType
1965 SeqDB_SimplifySeqid(CSeq_id & bestid,
1966 const string * acc,
1967 Int8 & num_id,
1968 string & str_id,
1969 bool & simpler);
1970
1971 /// String id simplification.
1972 ///
1973 /// This routine tries to produce a numerical type from a string
1974 /// identifier. SeqDB can use faster lookup mechanisms if, for
1975 /// example, a PIG, GI, or OID type can be recognized in the string.
1976 /// Even when the output is a string, it may be better formed for
1977 /// the purpose of lookup in the string ISAM file.
1978 ///
1979 /// @param acc
1980 /// The string to look up. [in]
1981 /// @param num_id
1982 /// The returned identifier, if numeric. [out]
1983 /// @param str_id
1984 /// The returned identifier, if a string. [out]
1985 /// @param simpler
1986 /// Whether an adjustment was done at all. [out]
1987 /// @return
1988 /// The resulting identifier type (an ESeqDBIdType value).
1989 NCBI_XOBJREAD_EXPORT ESeqDBIdType
1990 SeqDB_SimplifyAccession(const string & acc,
1991 Int8 & num_id,
1992 string & str_id,
1993 bool & simpler);
1994
1995 /// String id simplification (string-only overload).
1996 ///
1997 /// This simpler version converts a string id to the standard
1998 /// ISAM form, and returns "" if the conversion fails.
1999 ///
2000 /// @param acc
2001 /// The string to look up. [in]
2002 /// @return
2003 /// The resulting converted id. NOTE(review): the top-level const on this by-value return type prevents callers from moving the result; dropping it would require changing the definition as well.
2004 NCBI_XOBJREAD_EXPORT const string
2005 SeqDB_SimplifyAccession(const string &acc);
2006
2007 /// Retrieves a list of all supported file extensions for BLAST databases.
2008 /// @param db_is_protein set to true if the database is protein, else false [in]
2009 /// @param extensions where the return value will be stored [in|out]
2010 /// @param dbver BLASTDB version to use; defaults to eBDB_Version4 [in]
2011 NCBI_XOBJREAD_EXPORT
2012 void SeqDB_GetFileExtensions(bool db_is_protein,
2013 vector<string>& extensions,
2014 EBlastDbVersion dbver = eBDB_Version4);
2015
2016 /// Retrieves file extensions for BLAST LMDB files.
2017 /// @param db_is_protein set to true if the database is protein, else false [in]
2018 /// @param extn where the return value will be stored [in|out]
2019 NCBI_XOBJREAD_EXPORT
2020 void SeqDB_GetLMDBFileExtensions(bool db_is_protein,
2021 vector<string>& extn);
2022
2023 /// Determine whether an id is a string id.
2024 /// @param id The input id to check. [in]
2025 /// @return
2026 /// True if the id is not of type gi, ti or pig.
2027 NCBI_XOBJREAD_EXPORT
2028 bool IsStringId(const CSeq_id & id);
2029
2030 /// Return the ID string for a Seq-id as stored in LMDB. NOTE(review): the version flag presumably controls whether the accession version is included; confirm against the definition.
2031 NCBI_XOBJREAD_EXPORT
2032 string GetBlastSeqIdString(const CSeq_id & seqid, bool version);
2033
2034 END_NCBI_SCOPE
2035
2036 #endif // OBJTOOLS_BLAST_SEQDB_READER___SEQDBCOMMON__HPP
2037
2038