1 /*  $Id: seqdb.cpp 611131 2020-06-29 18:42:01Z grichenk $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Kevin Bealer
27  *
28  */
29 
30 /// @file seqdb.cpp
31 /// Implementation for the CSeqDB class, the top level class for SeqDB.
32 #include <ncbi_pch.hpp>
33 #include <objtools/blast/seqdb_reader/seqdb.hpp>
34 #include <util/sequtil/sequtil_convert.hpp>
35 #include "seqdbimpl.hpp"
36 #include <objtools/blast/seqdb_reader/impl/seqdbgeneral.hpp>
37 #include <map>
38 #include <string>
39 
40 #include <serial/objistr.hpp>
41 #include <serial/objostr.hpp>
42 #include <serial/serial.hpp>
43 #include <serial/objostrasnb.hpp>
44 #include <serial/objistrasnb.hpp>
45 
46 #include <objects/general/Object_id.hpp>
47 #include <objects/general/User_object.hpp>
48 #include <objects/general/User_field.hpp>
49 #include <objects/general/Dbtag.hpp>
50 
51 BEGIN_NCBI_SCOPE
52 
/// Definition of the static string constant CSeqDB::kOidNotFound; it holds
/// the fixed text "OID not found".
const string CSeqDB::kOidNotFound("OID not found");
54 
55 /// Helper function to translate enumerated type to character.
56 ///
57 /// @param seqtype
58 ///   The sequence type (eProtein, eNucleotide, or eUnknown).
59 /// @return
60 ///   The sequence type as a char ('p', 'n', or '-').
61 
s_GetSeqTypeChar(CSeqDB::ESeqType seqtype)62 static char s_GetSeqTypeChar(CSeqDB::ESeqType seqtype)
63 {
64     switch(seqtype) {
65     case CSeqDB::eProtein:
66         return 'p';
67     case CSeqDB::eNucleotide:
68         return 'n';
69     case CSeqDB::eUnknown:
70         return '-';
71     }
72 
73     NCBI_THROW(CSeqDBException,
74                eArgErr,
75                "Invalid sequence type specified.");
76 }
77 
78 /// Helper function to build private implementation object.
79 ///
80 /// This method builds and returns the object which implements the
81 /// functionality for the CSeqDB API.  If this method is called with
82 /// '-' for the sequence data type, protein will be tried first, then
83 /// nucleotide.  The created object will be returned.  Either
84 /// kSeqTypeProt for a protein database, kSeqTypeNucl for nucleotide,
85 /// or kSeqTypeUnkn to less this function try one then the other.
86 ///
87 /// @param dbname
88 ///   A list of database or alias names, seperated by spaces.
89 /// @param prot_nucl
90 ///   Specify whether to use protein, nucleotide, or either.
91 /// @param oid_begin
92 ///   Iterator will skip OIDs less than this value.  Only OIDs
93 ///   found in the OID lists (if any) will be returned.
94 /// @param oid_end
95 ///   Iterator will return up to (but not including) this OID.
96 /// @param use_mmap
97 ///   If kSeqDBMMap is specified (the default), memory mapping is
98 ///   attempted.  If kSeqDBNoMMap is specified, or memory mapping
99 ///   fails, this platform does not support it, the less efficient
100 ///   read and write calls are used instead.
101 /// @param gi_list
102 ///   This ID list specifies OIDs and deflines to include.
103 /// @param neg_list
104 ///   This negative ID list specifies deflines and OIDs to exclude.
105 /// @param idset
106 ///   If set, this specifies IDs to either include or exclude.
107 /// @return
108 ///   The CSeqDBImpl object that was created.
109 
110 static CSeqDBImpl *
s_SeqDBInit(const string & dbname,char prot_nucl,int oid_begin,int oid_end,bool use_atlas_lock,CSeqDBGiList * gi_list=NULL,CSeqDBNegativeList * neg_list=NULL,CSeqDBIdSet idset=CSeqDBIdSet ())111 s_SeqDBInit(const string       & dbname,
112             char                 prot_nucl,
113             int                  oid_begin,
114             int                  oid_end,
115             bool                 use_atlas_lock,
116             CSeqDBGiList       * gi_list = NULL,
117             CSeqDBNegativeList * neg_list = NULL,
118             CSeqDBIdSet          idset = CSeqDBIdSet())
119 {
120     CSeqDBImpl * impl = 0;
121 
122     if (prot_nucl == '-') {
123         try {
124             prot_nucl = 'p';
125             impl = new CSeqDBImpl(dbname,
126                                   prot_nucl,
127                                   oid_begin,
128                                   oid_end,
129                                   gi_list,
130                                   neg_list,
131                                   idset,
132                                   use_atlas_lock);
133         }
134         catch(CSeqDBException &) {
135             prot_nucl = 'n';
136         }
137     }
138 
139     if (! impl) {
140         impl = new CSeqDBImpl(dbname,
141                               prot_nucl,
142                               oid_begin,
143                               oid_end,
144                               gi_list,
145                               neg_list,
146                               idset,
147                               use_atlas_lock);
148     }
149 
150     _ASSERT(impl);
151 
152     return impl;
153 }
154 
CSeqDB(const string & dbname,ESeqType seqtype,CSeqDBGiList * gi_list,bool use_atlas_lock)155 CSeqDB::CSeqDB(const string & dbname,
156                ESeqType       seqtype,
157                CSeqDBGiList * gi_list,
158                bool           use_atlas_lock)
159 
160 {
161     if (dbname.size() == 0) {
162         NCBI_THROW(CSeqDBException,
163                    eArgErr,
164                    "Database name is required.");
165     }
166 
167     char seq_type = s_GetSeqTypeChar(seqtype);
168 
169     m_Impl = s_SeqDBInit(dbname,
170                          seq_type,
171                          0,
172                          0,
173                          use_atlas_lock,
174                          gi_list);
175 
176     ////m_Impl->Verify();
177 }
178 
CSeqDB(const string & dbname,ESeqType seqtype,CSeqDBNegativeList * nlist)179 CSeqDB::CSeqDB(const string       & dbname,
180                ESeqType             seqtype,
181                CSeqDBNegativeList * nlist)
182 {
183     if (dbname.size() == 0) {
184         NCBI_THROW(CSeqDBException,
185                    eArgErr,
186                    "Database name is required.");
187     }
188 
189     const bool kUseAtlasLock = true;
190     m_Impl = s_SeqDBInit(dbname,
191                          s_GetSeqTypeChar(seqtype),
192                          0,
193                          0,
194                          kUseAtlasLock,
195                          NULL,
196                          nlist);
197 
198     ////m_Impl->Verify();
199 }
200 
CSeqDB(const string & dbname,ESeqType seqtype,CSeqDBGiList * gi_list,CSeqDBNegativeList * nlist)201 CSeqDB::CSeqDB(const string & dbname,
202                ESeqType       seqtype,
203                CSeqDBGiList * gi_list,
204                CSeqDBNegativeList * nlist)
205 {
206     if (dbname.size() == 0) {
207         NCBI_THROW(CSeqDBException,
208                    eArgErr,
209                    "Database name is required.");
210     }
211 
212     char seq_type = s_GetSeqTypeChar(seqtype);
213 
214     m_Impl = s_SeqDBInit(dbname,
215                          seq_type,
216                          0,
217                          0,
218                          true,
219                          gi_list,
220                          nlist);
221 
222     ////m_Impl->Verify();
223 }
224 
CSeqDB(const string & dbname,ESeqType seqtype,int oid_begin,int oid_end,CSeqDBGiList * gi_list,CSeqDBNegativeList * nlist)225 CSeqDB::CSeqDB(const string & dbname,
226                ESeqType       seqtype,
227                int            oid_begin,
228                int            oid_end,
229                CSeqDBGiList * gi_list,
230                CSeqDBNegativeList * nlist)
231 {
232     if (dbname.size() == 0) {
233         NCBI_THROW(CSeqDBException,
234                    eArgErr,
235                    "Database name is required.");
236     }
237 
238     char seq_type = s_GetSeqTypeChar(seqtype);
239 
240     m_Impl = s_SeqDBInit(dbname,
241                          seq_type,
242                          oid_begin,
243                          oid_end,
244                          true,
245                          gi_list,
246                          nlist);
247 
248     ////m_Impl->Verify();
249 }
250 
251 
AccessionsToOids(const vector<string> & accs,vector<blastdb::TOid> & oids) const252 void CSeqDB::AccessionsToOids(const vector<string>& accs, vector<blastdb::TOid>& oids) const
253 {
254      m_Impl->AccessionsToOids(accs, oids);
255 }
256 
TaxIdsToOids(set<TTaxId> & tax_ids,vector<blastdb::TOid> & rv) const257 void CSeqDB::TaxIdsToOids(set<TTaxId>& tax_ids, vector<blastdb::TOid>& rv) const
258 {
259      m_Impl->TaxIdsToOids(tax_ids, rv);
260 }
261 
GetDBTaxIds(set<TTaxId> & tax_ids) const262 void CSeqDB::GetDBTaxIds(set<TTaxId> & tax_ids) const
263 {
264      m_Impl->GetDBTaxIds(tax_ids);
265 }
266 
267 // This could become the primary constructor for SeqDB, and those
268 // taking positive and negative lists could be deprecated.  This
269 // implies refactoring of code using SeqDB, addition of the third
270 // (string/Seq-id) type IDs to the IdSet, and changes to client code.
271 // Some non-SeqDB code uses FindOID and other methods of the GI list,
272 // comparable functionality would need to be added to IdSet().
273 //
274 // Before any of that is done, all the SeqDB classes should be made to
275 // use CSeqDBIdSet instead of using positive and negative lists.  This
276 // implies widespread changes to CSeqDBIdSet and SeqDB internal code.
277 //
278 // I'll leave those changes for another time -- for now I'll just add
279 // the pieces of framework that seem useful and are implied by the
280 // current design.
281 
CSeqDB(const string & dbname,ESeqType seqtype,CSeqDBIdSet ids)282 CSeqDB::CSeqDB(const string & dbname, ESeqType seqtype, CSeqDBIdSet ids)
283 {
284     if (dbname.size() == 0) {
285         NCBI_THROW(CSeqDBException,
286                    eArgErr,
287                    "Database name is required.");
288     }
289 
290     CRef<CSeqDBNegativeList> neg;
291     CRef<CSeqDBGiList> pos;
292 
293     if (! ids.Blank()) {
294         if (ids.IsPositive()) {
295             pos = ids.GetPositiveList();
296         } else {
297             neg = ids.GetNegativeList();
298         }
299     }
300 
301     const bool kUseAtlasLock = true;
302     m_Impl = s_SeqDBInit(dbname,
303                          s_GetSeqTypeChar(seqtype),
304                          0,
305                          0,
306                          kUseAtlasLock,
307                          pos.GetPointerOrNull(),
308                          neg.GetPointerOrNull(),
309                          ids);
310 
311     ////m_Impl->Verify();
312 }
313 
CSeqDB(const vector<string> & dbs,ESeqType seqtype,CSeqDBGiList * gi_list)314 CSeqDB::CSeqDB(const vector<string> & dbs,
315                ESeqType               seqtype,
316                CSeqDBGiList         * gi_list)
317 {
318     string dbname;
319     SeqDB_CombineAndQuote(dbs, dbname);
320 
321     if (dbname.size() == 0) {
322         NCBI_THROW(CSeqDBException,
323                    eArgErr,
324                    "Database name is required.");
325     }
326 
327     const bool kUseAtlasLock = true;
328     m_Impl = s_SeqDBInit(dbname,
329                          s_GetSeqTypeChar(seqtype),
330                          0,
331                          0,
332                          kUseAtlasLock,
333                          gi_list);
334 
335     ////m_Impl->Verify();
336 }
337 
CSeqDB(const string & dbname,ESeqType seqtype,int oid_begin,int oid_end,bool use_mmap,CSeqDBGiList * gi_list)338 CSeqDB::CSeqDB(const string & dbname,
339                ESeqType       seqtype,
340                int            oid_begin,
341                int            oid_end,
342                bool           use_mmap,
343                CSeqDBGiList * gi_list)
344 {
345     if (dbname.size() == 0) {
346         NCBI_THROW(CSeqDBException,
347                    eArgErr,
348                    "Database name is required.");
349     }
350 
351     const bool kUseAtlasLock = true;
352     m_Impl = s_SeqDBInit(dbname,
353                          s_GetSeqTypeChar(seqtype),
354                          oid_begin,
355                          oid_end,
356                          kUseAtlasLock,
357                          gi_list);
358 
359     ////m_Impl->Verify();
360 }
361 
CSeqDB(const vector<string> & dbs,ESeqType seqtype,int oid_begin,int oid_end,bool use_mmap,CSeqDBGiList * gi_list)362 CSeqDB::CSeqDB(const vector<string> & dbs,
363                ESeqType               seqtype,
364                int                    oid_begin,
365                int                    oid_end,
366                bool                   use_mmap,
367                CSeqDBGiList         * gi_list)
368 {
369     string dbname;
370     SeqDB_CombineAndQuote(dbs, dbname);
371 
372     if (dbname.size() == 0) {
373         NCBI_THROW(CSeqDBException,
374                    eArgErr,
375                    "Database name is required.");
376     }
377 
378     const bool kUseAtlasLock = true;
379     m_Impl = s_SeqDBInit(dbname,
380                          s_GetSeqTypeChar(seqtype),
381                          oid_begin,
382                          oid_end,
383                          kUseAtlasLock,
384                          gi_list);
385 
386     ////m_Impl->Verify();
387 }
388 
CSeqDB()389 CSeqDB::CSeqDB()
390 {
391     m_Impl = new CSeqDBImpl();
392     ////m_Impl->Verify();
393 }
394 
GetSeqLength(int oid) const395 int CSeqDB::GetSeqLength(int oid) const
396 {
397     ////m_Impl->Verify();
398     int length = m_Impl->GetSeqLength(oid);
399     ////m_Impl->Verify();
400 
401     return length;
402 }
403 
GetSeqLengthApprox(int oid) const404 int CSeqDB::GetSeqLengthApprox(int oid) const
405 {
406     ////m_Impl->Verify();
407     int length = m_Impl->GetSeqLengthApprox(oid);
408     ////m_Impl->Verify();
409 
410     return length;
411 }
412 
GetHdr(int oid) const413 CRef<CBlast_def_line_set> CSeqDB::GetHdr(int oid) const
414 {
415     ////m_Impl->Verify();
416     CRef<CBlast_def_line_set> rv = m_Impl->GetHdr(oid);
417     ////m_Impl->Verify();
418 
419     return rv;
420 }
421 
GetSequenceType() const422 CSeqDB::ESeqType CSeqDB::GetSequenceType() const
423 {
424     switch(m_Impl->GetSeqType()) {
425     case 'p':
426         return eProtein;
427     case 'n':
428         return eNucleotide;
429     }
430 
431     NCBI_THROW(CSeqDBException,
432                eArgErr,
433                "Internal sequence type is not valid.");
434 }
435 
GetTaxIDs(int oid,map<TGi,TTaxId> & gi_to_taxid,bool persist) const436 void CSeqDB::GetTaxIDs(int             oid,
437                        map<TGi, TTaxId> & gi_to_taxid,
438                        bool            persist) const
439 {
440     ////m_Impl->Verify();
441     typedef map<TGi, TTaxId> TmpMap;
442     TmpMap gi_to_taxid_tmp;
443     m_Impl->GetTaxIDs(oid, gi_to_taxid_tmp, persist);
444     if ( !persist ) {
445         gi_to_taxid.clear();
446     }
447     ITERATE ( TmpMap, it, gi_to_taxid_tmp ) {
448         gi_to_taxid[it->first] = it->second;
449     }
450     ////m_Impl->Verify();
451 }
452 
GetTaxIDs(int oid,vector<TTaxId> & taxids,bool persist) const453 void CSeqDB::GetTaxIDs(int           oid,
454                        vector<TTaxId> & taxids,
455                        bool          persist) const
456 {
457     ////m_Impl->Verify();
458     m_Impl->GetTaxIDs(oid, taxids, persist);
459     ////m_Impl->Verify();
460 }
461 
GetAllTaxIDs(int oid,set<TTaxId> & taxids) const462 void CSeqDB::GetAllTaxIDs(int           oid,
463                           set<TTaxId> & taxids) const
464 {
465     m_Impl->GetAllTaxIDs(oid, taxids);
466 }
467 
GetLeafTaxIDs(int oid,map<TGi,set<TTaxId>> & gi_to_taxid_set,bool persist) const468 void CSeqDB::GetLeafTaxIDs(
469         int                  oid,
470         map<TGi, set<TTaxId> >& gi_to_taxid_set,
471         bool                 persist
472 ) const
473 {
474     ////m_Impl->Verify();
475     typedef map<TGi, set<TTaxId> > TmpMap;
476     TmpMap gi_to_taxid_set_tmp;
477     m_Impl->GetLeafTaxIDs(oid, gi_to_taxid_set_tmp, persist);
478     if ( !persist ) {
479         gi_to_taxid_set.clear();
480     }
481     ITERATE ( TmpMap, it, gi_to_taxid_set_tmp ) {
482         gi_to_taxid_set[it->first] = it->second;
483     }
484     //m_Impl->Verify();
485 }
486 
GetLeafTaxIDs(int oid,vector<TTaxId> & taxids,bool persist) const487 void CSeqDB::GetLeafTaxIDs(
488         int          oid,
489         vector<TTaxId>& taxids,
490         bool         persist
491 ) const
492 {
493     //m_Impl->Verify();
494     m_Impl->GetLeafTaxIDs(oid, taxids, persist);
495     //m_Impl->Verify();
496 }
497 
498 CRef<CBioseq>
GetBioseq(int oid,TGi target_gi,const CSeq_id * target_id) const499 CSeqDB::GetBioseq(int oid, TGi target_gi, const CSeq_id * target_id) const
500 {
501     //m_Impl->Verify();
502     CRef<CBioseq> rv = m_Impl->GetBioseq(oid, target_gi, target_id, true);
503     //m_Impl->Verify();
504 
505     return rv;
506 }
507 
508 CRef<CBioseq>
GetBioseqNoData(int oid,TGi target_gi,const CSeq_id * target_id) const509 CSeqDB::GetBioseqNoData(int oid, TGi target_gi, const CSeq_id * target_id) const
510 {
511     //m_Impl->Verify();
512     CRef<CBioseq> rv = m_Impl->GetBioseq(oid, target_gi, target_id, false);
513     //m_Impl->Verify();
514 
515     return rv;
516 }
517 
RetSequence(const char ** buffer) const518 void CSeqDB::RetSequence(const char ** buffer) const
519 {
520     //m_Impl->Verify();
521     m_Impl->RetSequence(buffer);
522     //m_Impl->Verify();
523 }
524 
GetSequence(int oid,const char ** buffer) const525 int CSeqDB::GetSequence(int oid, const char ** buffer) const
526 {
527     //m_Impl->Verify();
528     int rv = m_Impl->GetSequence(oid, buffer);
529     //m_Impl->Verify();
530 
531     return rv;
532 }
533 
GetSeqData(int oid,TSeqPos begin,TSeqPos end) const534 CRef<CSeq_data> CSeqDB::GetSeqData(int     oid,
535                                    TSeqPos begin,
536                                    TSeqPos end) const
537 {
538     //m_Impl->Verify();
539     CRef<CSeq_data> rv = m_Impl->GetSeqData(oid, begin, end);
540     //m_Impl->Verify();
541 
542     return rv;
543 }
544 
GetAmbigSeq(int oid,const char ** buffer,int nucl_code) const545 int CSeqDB::GetAmbigSeq(int oid, const char ** buffer, int nucl_code) const
546 {
547     //m_Impl->Verify();
548     int rv = m_Impl->GetAmbigSeq(oid,
549                                  (char **)buffer,
550                                  nucl_code,
551                                  0,
552                                  (ESeqDBAllocType) 0);
553     //m_Impl->Verify();
554 
555     return rv;
556 }
557 
RetAmbigSeq(const char ** buffer) const558 void CSeqDB::RetAmbigSeq(const char ** buffer) const
559 {
560     //m_Impl->Verify();
561     m_Impl->RetAmbigSeq(buffer);
562     //m_Impl->Verify();
563 }
564 
GetAmbigSeq(int oid,const char ** buffer,int nucl_code,int begin_offset,int end_offset) const565 int CSeqDB::GetAmbigSeq(int           oid,
566                         const char ** buffer,
567                         int           nucl_code,
568                         int           begin_offset,
569                         int           end_offset) const
570 {
571     //m_Impl->Verify();
572 
573     SSeqDBSlice region(begin_offset, end_offset);
574 
575     int rv = m_Impl->GetAmbigSeq(oid,
576                                  (char **)buffer,
577                                  nucl_code,
578                                  & region,
579                                  (ESeqDBAllocType) 0);
580 
581     //m_Impl->Verify();
582 
583     return rv;
584 }
585 
GetAmbigSeqAlloc(int oid,char ** buffer,int nucl_code,ESeqDBAllocType strategy,TSequenceRanges * masks) const586 int CSeqDB::GetAmbigSeqAlloc(int             oid,
587                              char         ** buffer,
588                              int             nucl_code,
589                              ESeqDBAllocType strategy,
590                              TSequenceRanges *masks) const
591 {
592     //m_Impl->Verify();
593 
594     if ((strategy != eMalloc) && (strategy != eNew)) {
595         NCBI_THROW(CSeqDBException,
596                    eArgErr,
597                    "Invalid allocation strategy specified.");
598     }
599 
600     int rv = m_Impl->GetAmbigSeq(oid, buffer, nucl_code, 0, strategy, masks);
601 
602     //m_Impl->Verify();
603 
604     return rv;
605 }
606 
GetAmbigPartialSeq(int oid,char ** buffer,int nucl_code,ESeqDBAllocType strategy,TSequenceRanges * partial_ranges,TSequenceRanges * masks) const607 int CSeqDB::GetAmbigPartialSeq(int                oid,
608                                char            ** buffer,
609                                int                nucl_code,
610                                ESeqDBAllocType    strategy,
611                                TSequenceRanges  * partial_ranges,
612                                TSequenceRanges  * masks) const
613 {
614 
615 	if ((strategy != eMalloc) && (strategy != eNew)) {
616 	        NCBI_THROW(CSeqDBException,
617 	                   eArgErr,
618 	                   "Invalid allocation strategy specified.");
619 	    }
620 
621     int rv = m_Impl->GetAmbigPartialSeq(oid, buffer, nucl_code, strategy, partial_ranges, masks);
622 	return rv;
623 }
624 
GetTitle() const625 string CSeqDB::GetTitle() const
626 {
627     return m_Impl->GetTitle();
628 }
629 
GetDate() const630 string CSeqDB::GetDate() const
631 {
632     return m_Impl->GetDate();
633 }
634 
635 CTime
GetDate(const string & dbname,ESeqType seqtype)636 CSeqDB::GetDate(const string   & dbname,
637                 ESeqType         seqtype)
638 {
639     vector<string> vols;
640     CSeqDB::FindVolumePaths(dbname, seqtype, vols);
641     string fmt = "b d, Y  H:m P";
642     CTime retv;
643     char date[128];
644     ITERATE(vector<string>, vol, vols) {
645         string fn = *vol + ((seqtype == CSeqDB::eProtein)? ".pin" : ".nin");
646         ifstream f(fn.c_str(), ios::in|ios::binary);
647         char s[4];   // size of next chunk
648         if (f.is_open()) {
649             f.seekg(8, ios::beg);
650             f.read(s, 4);
651             Uint4 offset = SeqDB_GetStdOrd((Uint4 *) s);
652             f.seekg(offset, ios::cur);
653             f.read(s, 4);
654             offset = SeqDB_GetStdOrd((Uint4 *) s);
655             f.read(date, offset);
656             CTime d(string(date), fmt);
657             if (retv.IsEmpty() || d > retv) {
658                 retv = d;
659             }
660         }
661     }
662     return retv;
663 }
664 
GetNumSeqs() const665 int CSeqDB::GetNumSeqs() const
666 {
667     return m_Impl->GetNumSeqs();
668 }
669 
GetNumSeqsStats() const670 int CSeqDB::GetNumSeqsStats() const
671 {
672     return m_Impl->GetNumSeqsStats();
673 }
674 
GetNumOIDs() const675 int CSeqDB::GetNumOIDs() const
676 {
677     return m_Impl->GetNumOIDs();
678 }
679 
GetTotalLength() const680 Uint8 CSeqDB::GetTotalLength() const
681 {
682     return m_Impl->GetTotalLength();
683 }
684 
GetExactTotalLength()685 Uint8 CSeqDB::GetExactTotalLength()
686 {
687     return m_Impl->GetExactTotalLength();
688 }
689 
GetTotalLengthStats() const690 Uint8 CSeqDB::GetTotalLengthStats() const
691 {
692     return m_Impl->GetTotalLengthStats();
693 }
694 
GetVolumeLength() const695 Uint8 CSeqDB::GetVolumeLength() const
696 {
697     return m_Impl->GetVolumeLength();
698 }
699 
GetMaxLength() const700 int CSeqDB::GetMaxLength() const
701 {
702     return m_Impl->GetMaxLength();
703 }
704 
GetMinLength() const705 int CSeqDB::GetMinLength() const
706 {
707     return m_Impl->GetMinLength();
708 }
709 
~CSeqDB()710 CSeqDB::~CSeqDB()
711 {
712     ////m_Impl->Verify();
713 
714     if (m_Impl)
715         delete m_Impl;
716 }
717 
// Returns a CSeqDBIter constructed on this database with a starting OID
// of 0.  The CSeqDBIter constructor itself advances to an OID accepted
// by CheckOrFindOID().
CSeqDBIter CSeqDB::Begin() const
{
    return CSeqDBIter(this, 0);
}
722 
CheckOrFindOID(int & oid) const723 bool CSeqDB::CheckOrFindOID(int & oid) const
724 {
725     ////m_Impl->Verify();
726     bool rv = m_Impl->CheckOrFindOID(oid);
727     ////m_Impl->Verify();
728 
729     return rv;
730 }
731 
732 
733 CSeqDB::EOidListType
GetNextOIDChunk(int & begin,int & end,int size,vector<int> & lst,int * state)734 CSeqDB::GetNextOIDChunk(int         & begin,
735                         int         & end,
736                         int         size,
737                         vector<int> & lst,
738                         int         * state)
739 {
740     ////m_Impl->Verify();
741 
742     CSeqDB::EOidListType rv =
743         m_Impl->GetNextOIDChunk(begin, end, size, lst, state);
744 
745     ////m_Impl->Verify();
746 
747     return rv;
748 }
749 
ResetInternalChunkBookmark()750 void CSeqDB::ResetInternalChunkBookmark()
751 {
752     m_Impl->ResetInternalChunkBookmark();
753 }
754 
GetDBNameList() const755 const string & CSeqDB::GetDBNameList() const
756 {
757     return m_Impl->GetDBNameList();
758 }
759 
GetSeqIDs(int oid) const760 list< CRef<CSeq_id> > CSeqDB::GetSeqIDs(int oid) const
761 {
762     ////m_Impl->Verify();
763 
764     list< CRef<CSeq_id> > rv = m_Impl->GetSeqIDs(oid);
765 
766     ////m_Impl->Verify();
767 
768     return rv;
769 }
770 
GetSeqGI(int oid) const771 TGi CSeqDB::GetSeqGI(int oid) const
772 {
773     return m_Impl->GetSeqGI(oid);
774 }
775 
PigToOid(int pig,int & oid) const776 bool CSeqDB::PigToOid(int pig, int & oid) const
777 {
778     ////m_Impl->Verify();
779     bool rv = m_Impl->PigToOid(pig, oid);
780     ////m_Impl->Verify();
781 
782     return rv;
783 }
784 
OidToPig(int oid,int & pig) const785 bool CSeqDB::OidToPig(int oid, int & pig) const
786 {
787     ////m_Impl->Verify();
788     bool rv = m_Impl->OidToPig(oid, pig);
789     ////m_Impl->Verify();
790 
791     return rv;
792 }
793 
TiToOid(Int8 ti,int & oid) const794 bool CSeqDB::TiToOid(Int8 ti, int & oid) const
795 {
796     ////m_Impl->Verify();
797     bool rv = m_Impl->TiToOid(ti, oid);
798     ////m_Impl->Verify();
799 
800     return rv;
801 }
802 
GiToOid(TGi gi,int & oid) const803 bool CSeqDB::GiToOid(TGi gi, int & oid) const
804 {
805     ////m_Impl->Verify();
806     bool rv = m_Impl->GiToOid(gi, oid);
807     ////m_Impl->Verify();
808 
809     return rv;
810 }
811 
GiToOidwFilterCheck(TGi gi,int & oid) const812 bool CSeqDB::GiToOidwFilterCheck(TGi gi, int & oid) const
813 {
814     ////m_Impl->Verify();
815     bool rv = m_Impl->GiToOidwFilterCheck(gi, oid);
816     ////m_Impl->Verify();
817 
818     return rv;
819 }
820 
OidToGi(int oid,TGi & gi) const821 bool CSeqDB::OidToGi(int oid, TGi & gi) const
822 {
823     ////m_Impl->Verify();
824     TGi gi_tmp;
825     bool rv = m_Impl->OidToGi(oid, gi_tmp);
826     gi = gi_tmp;
827     ////m_Impl->Verify();
828 
829     return rv;
830 }
831 
PigToGi(int pig,TGi & gi) const832 bool CSeqDB::PigToGi(int pig, TGi & gi) const
833 {
834     ////m_Impl->Verify();
835     bool rv = false;
836 
837     int oid(0);
838 
839     if (m_Impl->PigToOid(pig, oid)) {
840         TGi gi_tmp;
841         rv = m_Impl->OidToGi(oid, gi_tmp);
842         gi = gi_tmp;
843     }
844     ////m_Impl->Verify();
845 
846     return rv;
847 }
848 
GiToPig(TGi gi,int & pig) const849 bool CSeqDB::GiToPig(TGi gi, int & pig) const
850 {
851     ////m_Impl->Verify();
852     bool rv = false;
853 
854     int oid(0);
855 
856     if (m_Impl->GiToOid(gi, oid)) {
857         rv = m_Impl->OidToPig(oid, pig);
858     }
859 
860     ////m_Impl->Verify();
861 
862     return rv;
863 }
864 
AccessionToOids(const string & acc,vector<int> & oids) const865 void CSeqDB::AccessionToOids(const string & acc, vector<int> & oids) const
866 {
867     ////m_Impl->Verify();
868     m_Impl->AccessionToOids(acc, oids);
869 
870     // If we have a numeric ID and the search failed, try to look it
871     // up as a GI (but not as a PIG or TI).  Due to the presence of
872     // PDB ids like "pdb|1914|a", the faster GitToOid is not done
873     // first (unless the caller does so.)
874 
875     if (oids.empty()) {
876         try {
877             TGi gi = NStr::StringToNumeric<TGi>(acc, NStr::fConvErr_NoThrow);
878             int oid(-1);
879 
880             if (gi > ZERO_GI  &&  m_Impl->GiToOidwFilterCheck(gi, oid)) {
881                     oids.push_back(oid);
882             }
883         }
884         catch(...) {
885         }
886     }
887 
888     ////m_Impl->Verify();
889 }
890 
SeqidToOids(const CSeq_id & seqid,vector<int> & oids) const891 void CSeqDB::SeqidToOids(const CSeq_id & seqid, vector<int> & oids) const
892 {
893     ////m_Impl->Verify();
894     m_Impl->SeqidToOids(seqid, oids, true);
895     ////m_Impl->Verify();
896 }
897 
SeqidToOid(const CSeq_id & seqid,int & oid) const898 bool CSeqDB::SeqidToOid(const CSeq_id & seqid, int & oid) const
899 {
900     ////m_Impl->Verify();
901     bool rv = false;
902 
903     oid = -1;
904 
905     vector<int> oids;
906     m_Impl->SeqidToOids(seqid, oids, false);
907 
908     if (! oids.empty()) {
909         rv = true;
910         oid = oids[0];
911     }
912 
913     ////m_Impl->Verify();
914 
915     return rv;
916 }
917 
GetOidAtOffset(int first_seq,Uint8 residue) const918 int CSeqDB::GetOidAtOffset(int first_seq, Uint8 residue) const
919 {
920     ////m_Impl->Verify();
921     int rv = m_Impl->GetOidAtOffset(first_seq, residue);
922     ////m_Impl->Verify();
923 
924     return rv;
925 }
926 
CSeqDBIter(const CSeqDB * db,int oid)927 CSeqDBIter::CSeqDBIter(const CSeqDB * db, int oid)
928     : m_DB    (db),
929       m_OID   (oid),
930       m_Data  (0),
931       m_Length((int) -1)
932 {
933     if (m_DB->CheckOrFindOID(m_OID)) {
934         x_GetSeq();
935     }
936 }
937 
// Copy constructor.  Only the database pointer and OID are copied from
// the source; the sequence is fetched afresh with x_GetSeq() when
// CheckOrFindOID() reports success for the copied OID.
CSeqDBIter::CSeqDBIter(const CSeqDBIter & other)
    : m_DB    (other.m_DB),
      m_OID   (other.m_OID),
      m_Data  (0),
      m_Length(-1)
{
    const bool have_oid = m_DB->CheckOrFindOID(m_OID);

    if (have_oid) {
        x_GetSeq();
    }
}
948 
949 /// Copy one iterator to another.
operator =(const CSeqDBIter & other)950 CSeqDBIter & CSeqDBIter::operator =(const CSeqDBIter & other)
951 {
952     x_RetSeq();
953 
954     m_DB = other.m_DB;
955     m_OID = other.m_OID;
956     m_Data = 0;
957     m_Length = -1;
958 
959     if (m_DB->CheckOrFindOID(m_OID)) {
960         x_GetSeq();
961     }
962 
963     return *this;
964 }
965 
operator ++()966 CSeqDBIter & CSeqDBIter::operator++()
967 {
968     x_RetSeq();
969 
970     ++m_OID;
971 
972     if (m_DB->CheckOrFindOID(m_OID)) {
973         x_GetSeq();
974     } else {
975         m_Length = -1;
976     }
977 
978     return *this;
979 }
980 
981 CRef<CBioseq>
GiToBioseq(TGi gi) const982 CSeqDB::GiToBioseq(TGi gi) const
983 {
984     ////m_Impl->Verify();
985 
986     CRef<CBioseq> bs;
987     int oid(0);
988 
989     if (m_Impl->GiToOid(gi, oid)) {
990         bs = m_Impl->GetBioseq(oid, gi, NULL, true);
991     }
992 
993     ////m_Impl->Verify();
994 
995     return bs;
996 }
997 
998 CRef<CBioseq>
PigToBioseq(int pig) const999 CSeqDB::PigToBioseq(int pig) const
1000 {
1001     ////m_Impl->Verify();
1002 
1003     int oid(0);
1004     CRef<CBioseq> bs;
1005 
1006     if (m_Impl->PigToOid(pig, oid)) {
1007         bs = m_Impl->GetBioseq(oid, ZERO_GI, NULL, true);
1008     }
1009 
1010     ////m_Impl->Verify();
1011 
1012     return bs;
1013 }
1014 
1015 CRef<CBioseq>
SeqidToBioseq(const CSeq_id & seqid) const1016 CSeqDB::SeqidToBioseq(const CSeq_id & seqid) const
1017 {
1018     ////m_Impl->Verify();
1019 
1020     vector<int> oids;
1021     CRef<CBioseq> bs;
1022 
1023     m_Impl->SeqidToOids(seqid, oids, false);
1024 
1025     if (! oids.empty()) {
1026         bs = m_Impl->GetBioseq(oids[0], ZERO_GI, &seqid, true);
1027     }
1028 
1029     ////m_Impl->Verify();
1030 
1031     return bs;
1032 }
1033 
1034 void
FindVolumePaths(const string & dbname,ESeqType seqtype,vector<string> & paths,vector<string> * alias_paths,bool recursive,bool expand_links)1035 CSeqDB::FindVolumePaths(const string   & dbname,
1036                         ESeqType         seqtype,
1037                         vector<string> & paths,
1038                         vector<string> * alias_paths,
1039                         bool             recursive,
1040                         bool             expand_links)
1041 {
1042     if (seqtype == CSeqDB::eProtein) {
1043         CSeqDBImpl::FindVolumePaths(dbname, 'p', paths, alias_paths, recursive, expand_links);
1044     } else if (seqtype == CSeqDB::eNucleotide) {
1045         CSeqDBImpl::FindVolumePaths(dbname, 'n', paths, alias_paths, recursive, expand_links);
1046     } else {
1047         try {
1048             CSeqDBImpl::FindVolumePaths(dbname, 'p', paths, alias_paths, recursive, expand_links);
1049         }
1050         catch(...) {
1051             CSeqDBImpl::FindVolumePaths(dbname, 'n', paths, alias_paths, recursive, expand_links);
1052         }
1053     }
1054 }
1055 
1056 void
FindVolumePaths(vector<string> & paths,bool recursive) const1057 CSeqDB::FindVolumePaths(vector<string> & paths, bool recursive) const
1058 {
1059     ////m_Impl->Verify();
1060     m_Impl->FindVolumePaths(paths, recursive);
1061     ////m_Impl->Verify();
1062 }
1063 
1064 void
GetGis(int oid,vector<TGi> & gis,bool append) const1065 CSeqDB::GetGis(int oid, vector<TGi> & gis, bool append) const
1066 {
1067     ////m_Impl->Verify();
1068 
1069     // This could be done a little faster at a lower level, but not
1070     // necessarily by too much.  If this operation is important to
1071     // performance, that decision can be revisited.
1072 
1073     list< CRef<CSeq_id> > seqids = GetSeqIDs(oid);
1074 
1075     if (! append) {
1076         gis.clear();
1077     }
1078 
1079     ITERATE(list< CRef<CSeq_id> >, seqid, seqids) {
1080         if ((**seqid).IsGi()) {
1081             gis.push_back((**seqid).GetGi());
1082         }
1083     }
1084 
1085     ////m_Impl->Verify();
1086 }
1087 
SetIterationRange(int oid_begin,int oid_end)1088 void CSeqDB::SetIterationRange(int oid_begin, int oid_end)
1089 {
1090     m_Impl->SetIterationRange(oid_begin, oid_end);
1091 }
1092 
GetAliasFileValues(TAliasFileValues & afv)1093 void CSeqDB::GetAliasFileValues(TAliasFileValues & afv)
1094 {
1095     ////m_Impl->Verify();
1096     m_Impl->GetAliasFileValues(afv);
1097     ////m_Impl->Verify();
1098 }
1099 
GetTaxInfo(TTaxId taxid,SSeqDBTaxInfo & info)1100 void CSeqDB::GetTaxInfo(TTaxId taxid, SSeqDBTaxInfo & info)
1101 {
1102     CSeqDBImpl::GetTaxInfo(taxid, info);
1103 }
1104 
GetTotals(ESummaryType sumtype,int * oid_count,Uint8 * total_length,bool use_approx) const1105 void CSeqDB::GetTotals(ESummaryType   sumtype,
1106                        int          * oid_count,
1107                        Uint8        * total_length,
1108                        bool           use_approx) const
1109 {
1110     ////m_Impl->Verify();
1111     m_Impl->GetTotals(sumtype, oid_count, total_length, use_approx);
1112     ////m_Impl->Verify();
1113 }
1114 
GetGiList() const1115 const CSeqDBGiList * CSeqDB::GetGiList() const
1116 {
1117     return m_Impl->GetGiList();
1118 }
1119 
GetIdSet() const1120 CSeqDBIdSet CSeqDB::GetIdSet() const
1121 {
1122     return m_Impl->GetIdSet();
1123 }
1124 
GetSequenceAsString(int oid,string & output,TSeqRange range) const1125 void CSeqDB::GetSequenceAsString(int      oid,
1126                                  string & output,
1127                                  TSeqRange range /* = TSeqRange() */) const
1128 {
1129     CSeqUtil::ECoding code_to = ((GetSequenceType() == CSeqDB::eProtein)
1130                                  ? CSeqUtil::e_Iupacaa
1131                                  : CSeqUtil::e_Iupacna);
1132 
1133     GetSequenceAsString(oid, code_to, output, range);
1134 }
1135 
GetSequenceAsString(int oid,CSeqUtil::ECoding coding,string & output,TSeqRange range) const1136 void CSeqDB::GetSequenceAsString(int                 oid,
1137                                  CSeqUtil::ECoding   coding,
1138                                  string            & output,
1139                                  TSeqRange range /* = TSeqRange() */) const
1140 {
1141     output.erase();
1142 
1143     string raw;
1144     const char * buffer = 0;
1145     int length = 0;
1146 
1147     // Protein dbs ignore encodings, always returning ncbistdaa.
1148     if (range.NotEmpty()) {
1149         length = GetAmbigSeq(oid, & buffer, kSeqDBNuclNcbiNA8,
1150                              range.GetFrom(), range.GetToOpen());
1151     } else {
1152         length = GetAmbigSeq(oid, & buffer, kSeqDBNuclNcbiNA8);
1153     }
1154 
1155     try {
1156         raw.assign(buffer, length);
1157     }
1158     catch(...) {
1159         RetAmbigSeq(& buffer);
1160         throw;
1161     }
1162     RetAmbigSeq(& buffer);
1163 
1164     CSeqUtil::ECoding code_from = ((GetSequenceType() == CSeqDB::eProtein)
1165                                    ? CSeqUtil::e_Ncbistdaa
1166                                    : CSeqUtil::e_Ncbi8na);
1167 
1168     string result;
1169 
1170     if (code_from == coding) {
1171         result.swap(raw);
1172     } else {
1173         CSeqConvert::Convert(raw,
1174                              code_from,
1175                              0,
1176                              length,
1177                              result,
1178                              coding);
1179     }
1180 
1181     output.swap(result);
1182 }
1183 
1184 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION  > 550)) && \
1185      (!defined(NCBI_COMPILER_MIPSPRO)) )
ListColumns(vector<string> & titles)1186 void CSeqDB::ListColumns(vector<string> & titles)
1187 {
1188     m_Impl->ListColumns(titles);
1189 }
1190 
GetColumnId(const string & title)1191 int CSeqDB::GetColumnId(const string & title)
1192 {
1193     return m_Impl->GetColumnId(title);
1194 }
1195 
1196 const map<string,string> &
GetColumnMetaData(int column_id)1197 CSeqDB::GetColumnMetaData(int column_id)
1198 {
1199     return m_Impl->GetColumnMetaData(column_id);
1200 }
1201 
/// Look up one key in a column's metadata.
///
/// The lookup is done with SeqDB_MapFind() on the map returned by
/// GetColumnMetaData(column_id), with a function-local static empty
/// string supplied as its third (default) argument.
///
/// @param column_id  Column whose metadata is searched.
/// @param key        Metadata key.
/// @return           Reference returned by SeqDB_MapFind().
const string & CSeqDB::GetColumnValue(int column_id, const string & key)
{
    // Static so that a reference to it remains valid after returning.
    static string no_value;

    const map<string,string> & meta = GetColumnMetaData(column_id);
    return SeqDB_MapFind(meta, key, no_value);
}
1207 
1208 const map<string,string> &
GetColumnMetaData(int column_id,const string & volname)1209 CSeqDB::GetColumnMetaData(int            column_id,
1210                           const string & volname)
1211 {
1212     return m_Impl->GetColumnMetaData(column_id, volname);
1213 }
1214 
GetColumnBlob(int col_id,int oid,CBlastDbBlob & blob)1215 void CSeqDB::GetColumnBlob(int            col_id,
1216                            int            oid,
1217                            CBlastDbBlob & blob)
1218 {
1219     m_Impl->GetColumnBlob(col_id, oid, true, blob);
1220 }
1221 
GetAvailableMaskAlgorithms(vector<int> & algorithms)1222 void CSeqDB::GetAvailableMaskAlgorithms(vector<int> & algorithms)
1223 {
1224     m_Impl->GetAvailableMaskAlgorithms(algorithms);
1225 }
1226 
GetMaskAlgorithmId(const string & algo_name) const1227 int CSeqDB::GetMaskAlgorithmId(const string &algo_name) const
1228 {
1229     return m_Impl->GetMaskAlgorithmId(algo_name);
1230 }
1231 
GetAvailableMaskAlgorithmDescriptions()1232 string CSeqDB::GetAvailableMaskAlgorithmDescriptions()
1233 {
1234     return m_Impl->GetAvailableMaskAlgorithmDescriptions();
1235 }
1236 
/// Report which of the requested masking algorithm ids are unavailable.
///
/// Each id in @p algorithm_ids is searched for in the list returned by
/// GetAvailableMaskAlgorithms(); ids not present there are returned, in
/// their original order (duplicates included).  When no algorithm is
/// available at all, every requested id is therefore returned.
///
/// @param algorithm_ids  Ids to validate.
/// @return               The subset of @p algorithm_ids that is not available.
vector<int> CSeqDB::ValidateMaskAlgorithms(const vector<int>& algorithm_ids)
{
    vector<int> known_ids;
    GetAvailableMaskAlgorithms(known_ids);

    vector<int> rejected;
    rejected.reserve(algorithm_ids.size());

    // A search in an empty known_ids never succeeds, so the "nothing
    // available" case needs no separate branch.
    for (vector<int>::const_iterator id = algorithm_ids.begin();
         id != algorithm_ids.end();  ++id) {
        const bool is_known =
            find(known_ids.begin(), known_ids.end(), *id) != known_ids.end();
        if (! is_known) {
            rejected.push_back(*id);
        }
    }
    return rejected;
}
1257 
GetMaskAlgorithmDetails(int algorithm_id,objects::EBlast_filter_program & program,string & program_name,string & algo_opts)1258 void CSeqDB::GetMaskAlgorithmDetails(int                 algorithm_id,
1259                                      objects::EBlast_filter_program & program,
1260                                      string            & program_name,
1261                                      string            & algo_opts)
1262 {
1263     string sid;
1264     m_Impl->GetMaskAlgorithmDetails(algorithm_id, sid, program_name,
1265                                     algo_opts);
1266     Int4 id(0);
1267     NStr::StringToNumeric(sid, &id, NStr::fConvErr_NoThrow, 10);
1268     program = (objects::EBlast_filter_program)id;
1269 }
1270 
GetMaskAlgorithmDetails(int algorithm_id,string & program,string & program_name,string & algo_opts)1271 void CSeqDB::GetMaskAlgorithmDetails(int                 algorithm_id,
1272                                      string            & program,
1273                                      string            & program_name,
1274                                      string            & algo_opts)
1275 {
1276     m_Impl->GetMaskAlgorithmDetails(algorithm_id, program, program_name,
1277                                     algo_opts);
1278 }
1279 
GetMaskData(int oid,int algo_id,TSequenceRanges & ranges)1280 void CSeqDB::GetMaskData(int                 oid,
1281                          int                 algo_id,
1282                          TSequenceRanges   & ranges)
1283 {
1284     m_Impl->GetMaskData(oid, algo_id, ranges);
1285 }
1286 
1287 #endif
1288 
1289 
SetOffsetRanges(int oid,const CSeqDB::TRangeList & offset_ranges,bool append_ranges,bool cache_data)1290 void CSeqDB::SetOffsetRanges(int                        oid,
1291                              const CSeqDB::TRangeList & offset_ranges,
1292                              bool                       append_ranges,
1293                              bool                       cache_data)
1294 {
1295     ////m_Impl->Verify();
1296 
1297     m_Impl->SetOffsetRanges(oid,
1298                             offset_ranges,
1299                             append_ranges,
1300                             cache_data);
1301 
1302     ////m_Impl->Verify();
1303 }
1304 
RemoveOffsetRanges(int oid)1305 void CSeqDB::RemoveOffsetRanges(int oid)
1306 {
1307     static TRangeList empty;
1308     SetOffsetRanges(oid, empty, false, false);
1309 }
1310 
FlushOffsetRangeCache()1311 void CSeqDB::FlushOffsetRangeCache()
1312 {
1313     m_Impl->FlushOffsetRangeCache();
1314 }
1315 
SetNumberOfThreads(int num_threads,bool force_mt)1316 void CSeqDB::SetNumberOfThreads(int num_threads, bool force_mt)
1317 {
1318     ////m_Impl->Verify();
1319 
1320     m_Impl->SetNumberOfThreads(num_threads, force_mt);
1321 }
1322 
ESeqType2String(ESeqType type)1323 string CSeqDB::ESeqType2String(ESeqType type)
1324 {
1325     string retval("Unknown");
1326     switch (type) {
1327     case eProtein: retval.assign("Protein"); break;
1328     case eNucleotide: retval.assign("Nucleotide"); break;
1329     case eUnknown:
1330     default: break;
1331     }
1332     return retval;
1333 }
1334 
GenerateSearchPath()1335 string CSeqDB::GenerateSearchPath()
1336 {
1337     return CSeqDBAtlas::GenerateSearchPath();
1338 }
1339 
SetVolsMemBit(int mbit)1340 void CSeqDB::SetVolsMemBit(int mbit)
1341 {
1342     m_Impl->SetVolsMemBit(mbit);
1343 }
1344 
1345 /// Functor class for FindFilesInDir
1346 class CBlastDbFinder {
1347 public:
operator ()(CDirEntry & de)1348     void operator() (CDirEntry& de) {
1349         const string& extn = de.GetPath().substr(de.GetPath().length() - 3, 1);
1350         SSeqDBInitInfo value;
1351         // rm extension
1352         value.m_BlastDbName = de.GetPath().substr(0, de.GetPath().length() - 4);
1353         CNcbiOstrstream oss;
1354         // Needed for escaping spaces
1355         oss << "\"" << value.m_BlastDbName << "\"";
1356         value.m_BlastDbName = CNcbiOstrstreamToString(oss);
1357         value.m_MoleculeType =
1358             (extn == "n" ? CSeqDB::eNucleotide : CSeqDB::eProtein);
1359         m_DBs.push_back(value);
1360     }
1361 
1362     vector<SSeqDBInitInfo> m_DBs;
1363 
1364     /// Auxiliary function to get the original file name found by this object
GetFileName(size_t idx)1365     string GetFileName(size_t idx) {
1366         SSeqDBInitInfo& info = m_DBs[idx];
1367         string retval = NStr::Replace(info.m_BlastDbName, "\"", kEmptyStr);
1368         if (info.m_MoleculeType == CSeqDB::eNucleotide) {
1369             string alias = retval + ".nal", index = retval + ".nin";
1370             retval = (CFile(alias).Exists() ? alias : index);
1371         } else {
1372             string alias = retval + ".pal", index = retval + ".pin";
1373             retval = (CFile(alias).Exists() ? alias : index);
1374         }
1375         return retval;
1376     }
1377 };
1378 
1379 /** Functor object for s_RemoveAliasComponents where the path name is matched
1380  * in SSeqDBInitInfo */
1381 class PathFinder {
1382 public:
PathFinder(const string & p)1383     PathFinder(const string& p) : m_Path(p) {}
operator ()(const SSeqDBInitInfo & value) const1384     bool operator() (const SSeqDBInitInfo& value) const {
1385         return (NStr::Find(value.m_BlastDbName, m_Path) != NPOS);
1386     }
1387 
1388 private:
1389     string m_Path;
1390 };
1391 
s_RemoveAliasComponents(CBlastDbFinder & finder)1392 static void s_RemoveAliasComponents(CBlastDbFinder& finder)
1393 {
1394     set<string> dbs2remove;
1395     for (size_t i = 0; i < finder.m_DBs.size(); i++) {
1396         string path = finder.GetFileName(i);
1397         if (path[path.size()-1] != 'l') { // not an alias file
1398             continue;
1399         }
1400         CNcbiIfstream in(path.c_str());
1401         if (!in) {
1402             continue;
1403         }
1404         string line;
1405         while (getline(in, line)) {
1406             if (NStr::StartsWith(line, "DBLIST")) {
1407                 vector<string> tokens;
1408                 NStr::Split(line, " ", tokens, NStr::fSplit_MergeDelimiters | NStr::fSplit_Truncate);
1409                 for (size_t j = 1; j < tokens.size(); j++) {
1410                     dbs2remove.insert(tokens[j]);
1411                 }
1412             }
1413         }
1414     }
1415 
1416     ITERATE(set<string>, i, dbs2remove) {
1417         finder.m_DBs.erase(remove_if(finder.m_DBs.begin(), finder.m_DBs.end(),
1418                                      PathFinder(*i)),
1419                            finder.m_DBs.end());
1420     }
1421 }
1422 
1423 vector<SSeqDBInitInfo>
FindBlastDBs(const string & path,const string & dbtype,bool recurse,bool include_alias_files,bool remove_redundant_dbs)1424 FindBlastDBs(const string& path, const string& dbtype, bool recurse,
1425              bool include_alias_files /* = false */,
1426              bool remove_redundant_dbs /* = false */)
1427 {
1428     // 1. Find every database volume (but not alias files etc).
1429     vector<string> fmasks, dmasks;
1430 
1431     // If the type is 'guess' we do both types of databases.
1432 
1433     if (dbtype != "nucl") {
1434         fmasks.push_back("*.pin");
1435         if (include_alias_files) {
1436             fmasks.push_back("*.pal");
1437         }
1438     }
1439     if (dbtype != "prot") {
1440         fmasks.push_back("*.nin");
1441         if (include_alias_files) {
1442             fmasks.push_back("*.nal");
1443         }
1444     }
1445     dmasks.push_back("*");
1446 
1447     EFindFiles flags = (EFindFiles)
1448         (fFF_File | (recurse ? fFF_Recursive : 0));
1449 
1450     CBlastDbFinder dbfinder;
1451     FindFilesInDir(CDir(path), fmasks, dmasks, dbfinder, flags);
1452     if (remove_redundant_dbs) {
1453         s_RemoveAliasComponents(dbfinder);
1454     }
1455     sort(dbfinder.m_DBs.begin(), dbfinder.m_DBs.end());
1456     return dbfinder.m_DBs;
1457 }
1458 
GetDiskUsage() const1459 Int8 CSeqDB::GetDiskUsage() const
1460 {
1461     vector<string> paths;
1462     FindVolumePaths(paths);
1463     _ASSERT( !paths.empty() );
1464 
1465     Int8 retval = 0;
1466 
1467     vector<string> extn;
1468     const bool is_protein(GetSequenceType() == CSeqDB::eProtein);
1469     SeqDB_GetFileExtensions(is_protein, extn, GetBlastDbVersion());
1470 
1471     ITERATE(vector<string>, path, paths) {
1472         ITERATE(vector<string>, ext, extn) {
1473             CFile file(*path + "." + *ext);
1474             if (file.Exists()) {
1475                 Int8 length = file.GetLength();
1476                 if (length != -1) {
1477                     retval += length;
1478                 } else {
1479                     ERR_POST(Error << "Error retrieving file size for "
1480                                    << file.GetPath());
1481                 }
1482             }
1483         }
1484     }
1485     return retval;
1486 }
1487 
1488 CSeqDB::ESeqType
ParseMoleculeTypeString(const string & s)1489 ParseMoleculeTypeString(const string& s)
1490 {
1491     CSeqDB::ESeqType retval = CSeqDB::eUnknown;
1492     if (NStr::StartsWith(s, "prot", NStr::eNocase)) {
1493         retval = CSeqDB::eProtein;
1494     } else if (NStr::StartsWith(s, "nucl", NStr::eNocase)) {
1495         retval = CSeqDB::eNucleotide;
1496     } else if (NStr::StartsWith(s, "guess", NStr::eNocase)) {
1497         retval = CSeqDB::eUnknown;
1498     } else {
1499         _ASSERT("Unknown molecule for BLAST DB" != 0);
1500     }
1501     return retval;
1502 }
1503 
DeleteBlastDb(const string & dbpath,CSeqDB::ESeqType seq_type)1504 bool DeleteBlastDb(const string& dbpath, CSeqDB::ESeqType seq_type)
1505 {
1506     int num_files_removed = 0;
1507     vector<string> db_files, alias_files;
1508     bool is_protein = (seq_type == CSeqDB::eProtein);
1509 
1510     vector<string> extn;
1511     SeqDB_GetFileExtensions( is_protein, extn, eBDB_Version4);
1512     vector<string> lmdb_extn;
1513     SeqDB_GetLMDBFileExtensions(is_protein, lmdb_extn);
1514     ITERATE(vector<string>, lmdb, lmdb_extn) {
1515     	CNcbiOstrstream oss;
1516     	oss << dbpath << "." << *lmdb;
1517         const string fname = CNcbiOstrstreamToString(oss);
1518         if (CFile(fname).Remove()) {
1519         	LOG_POST(Info << "Deleted " << fname);
1520             num_files_removed++;
1521         }
1522         else {
1523         	unsigned int index = 0;
1524         	string vfname = dbpath + "." + NStr::IntToString(index/10) +
1525         			        NStr::IntToString(index%10) + "." + *lmdb;
1526         	while (CFile(vfname).Remove()) {
1527         		index++;
1528         		vfname = dbpath + "." + NStr::IntToString(index/10) +
1529         	    	     NStr::IntToString(index%10) + "." + *lmdb;
1530 
1531         	}
1532         }
1533     }
1534 
1535     try { CSeqDB::FindVolumePaths(dbpath, seq_type, db_files, &alias_files); }
1536     catch (...) {}    // ignore any errors from the invocation above
1537     ITERATE(vector<string>, f, db_files) {
1538         ITERATE(vector<string>, e, extn) {
1539             CNcbiOstrstream oss;
1540             oss << *f << "." << *e;
1541             const string fname = CNcbiOstrstreamToString(oss);
1542             if (CFile(fname).Remove()) {
1543                 LOG_POST(Info << "Deleted " << fname);
1544                 num_files_removed++;
1545             }
1546         }
1547     }
1548     ITERATE(vector<string>, f, alias_files) {
1549         if (CFile(*f).Remove()) {
1550             LOG_POST(Info << "Deleted " << *f);
1551             num_files_removed++;
1552         }
1553     }
1554     return static_cast<bool>(num_files_removed != 0);
1555 }
1556 
1557 const char* CSeqDB::kBlastDbDateFormat = "b d, Y  H:m P";
1558 
DebugDump(CDebugDumpContext ddc,unsigned int depth) const1559 void CSeqDB::DebugDump(CDebugDumpContext ddc, unsigned int depth) const
1560 {
1561     ddc.SetFrame("CSeqDB");
1562     CObject::DebugDump(ddc, depth);
1563     ddc.Log("m_Impl", m_Impl, depth);
1564 }
1565 
GetBlastDbVersion() const1566 EBlastDbVersion CSeqDB::GetBlastDbVersion() const
1567 {
1568 	 return m_Impl->GetBlastDbVersion();
1569 }
1570 
1571 END_NCBI_SCOPE
1572 
1573