1 /* $Id: seqdb.cpp 611131 2020-06-29 18:42:01Z grichenk $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Kevin Bealer
27 *
28 */
29
30 /// @file seqdb.cpp
31 /// Implementation for the CSeqDB class, the top level class for SeqDB.
32 #include <ncbi_pch.hpp>
33 #include <objtools/blast/seqdb_reader/seqdb.hpp>
34 #include <util/sequtil/sequtil_convert.hpp>
35 #include "seqdbimpl.hpp"
36 #include <objtools/blast/seqdb_reader/impl/seqdbgeneral.hpp>
37 #include <map>
38 #include <string>
39
40 #include <serial/objistr.hpp>
41 #include <serial/objostr.hpp>
42 #include <serial/serial.hpp>
43 #include <serial/objostrasnb.hpp>
44 #include <serial/objistrasnb.hpp>
45
46 #include <objects/general/Object_id.hpp>
47 #include <objects/general/User_object.hpp>
48 #include <objects/general/User_field.hpp>
49 #include <objects/general/Dbtag.hpp>
50
51 BEGIN_NCBI_SCOPE
52
53 const string CSeqDB::kOidNotFound("OID not found");
54
55 /// Helper function to translate enumerated type to character.
56 ///
57 /// @param seqtype
58 /// The sequence type (eProtein, eNucleotide, or eUnknown).
59 /// @return
60 /// The sequence type as a char ('p', 'n', or '-').
61
s_GetSeqTypeChar(CSeqDB::ESeqType seqtype)62 static char s_GetSeqTypeChar(CSeqDB::ESeqType seqtype)
63 {
64 switch(seqtype) {
65 case CSeqDB::eProtein:
66 return 'p';
67 case CSeqDB::eNucleotide:
68 return 'n';
69 case CSeqDB::eUnknown:
70 return '-';
71 }
72
73 NCBI_THROW(CSeqDBException,
74 eArgErr,
75 "Invalid sequence type specified.");
76 }
77
78 /// Helper function to build private implementation object.
79 ///
80 /// This method builds and returns the object which implements the
81 /// functionality for the CSeqDB API. If this method is called with
82 /// '-' for the sequence data type, protein will be tried first, then
83 /// nucleotide. The created object will be returned. Either
84 /// kSeqTypeProt for a protein database, kSeqTypeNucl for nucleotide,
85 /// or kSeqTypeUnkn to less this function try one then the other.
86 ///
87 /// @param dbname
88 /// A list of database or alias names, seperated by spaces.
89 /// @param prot_nucl
90 /// Specify whether to use protein, nucleotide, or either.
91 /// @param oid_begin
92 /// Iterator will skip OIDs less than this value. Only OIDs
93 /// found in the OID lists (if any) will be returned.
94 /// @param oid_end
95 /// Iterator will return up to (but not including) this OID.
96 /// @param use_mmap
97 /// If kSeqDBMMap is specified (the default), memory mapping is
98 /// attempted. If kSeqDBNoMMap is specified, or memory mapping
99 /// fails, this platform does not support it, the less efficient
100 /// read and write calls are used instead.
101 /// @param gi_list
102 /// This ID list specifies OIDs and deflines to include.
103 /// @param neg_list
104 /// This negative ID list specifies deflines and OIDs to exclude.
105 /// @param idset
106 /// If set, this specifies IDs to either include or exclude.
107 /// @return
108 /// The CSeqDBImpl object that was created.
109
110 static CSeqDBImpl *
s_SeqDBInit(const string & dbname,char prot_nucl,int oid_begin,int oid_end,bool use_atlas_lock,CSeqDBGiList * gi_list=NULL,CSeqDBNegativeList * neg_list=NULL,CSeqDBIdSet idset=CSeqDBIdSet ())111 s_SeqDBInit(const string & dbname,
112 char prot_nucl,
113 int oid_begin,
114 int oid_end,
115 bool use_atlas_lock,
116 CSeqDBGiList * gi_list = NULL,
117 CSeqDBNegativeList * neg_list = NULL,
118 CSeqDBIdSet idset = CSeqDBIdSet())
119 {
120 CSeqDBImpl * impl = 0;
121
122 if (prot_nucl == '-') {
123 try {
124 prot_nucl = 'p';
125 impl = new CSeqDBImpl(dbname,
126 prot_nucl,
127 oid_begin,
128 oid_end,
129 gi_list,
130 neg_list,
131 idset,
132 use_atlas_lock);
133 }
134 catch(CSeqDBException &) {
135 prot_nucl = 'n';
136 }
137 }
138
139 if (! impl) {
140 impl = new CSeqDBImpl(dbname,
141 prot_nucl,
142 oid_begin,
143 oid_end,
144 gi_list,
145 neg_list,
146 idset,
147 use_atlas_lock);
148 }
149
150 _ASSERT(impl);
151
152 return impl;
153 }
154
CSeqDB(const string & dbname,ESeqType seqtype,CSeqDBGiList * gi_list,bool use_atlas_lock)155 CSeqDB::CSeqDB(const string & dbname,
156 ESeqType seqtype,
157 CSeqDBGiList * gi_list,
158 bool use_atlas_lock)
159
160 {
161 if (dbname.size() == 0) {
162 NCBI_THROW(CSeqDBException,
163 eArgErr,
164 "Database name is required.");
165 }
166
167 char seq_type = s_GetSeqTypeChar(seqtype);
168
169 m_Impl = s_SeqDBInit(dbname,
170 seq_type,
171 0,
172 0,
173 use_atlas_lock,
174 gi_list);
175
176 ////m_Impl->Verify();
177 }
178
CSeqDB(const string & dbname,ESeqType seqtype,CSeqDBNegativeList * nlist)179 CSeqDB::CSeqDB(const string & dbname,
180 ESeqType seqtype,
181 CSeqDBNegativeList * nlist)
182 {
183 if (dbname.size() == 0) {
184 NCBI_THROW(CSeqDBException,
185 eArgErr,
186 "Database name is required.");
187 }
188
189 const bool kUseAtlasLock = true;
190 m_Impl = s_SeqDBInit(dbname,
191 s_GetSeqTypeChar(seqtype),
192 0,
193 0,
194 kUseAtlasLock,
195 NULL,
196 nlist);
197
198 ////m_Impl->Verify();
199 }
200
CSeqDB(const string & dbname,ESeqType seqtype,CSeqDBGiList * gi_list,CSeqDBNegativeList * nlist)201 CSeqDB::CSeqDB(const string & dbname,
202 ESeqType seqtype,
203 CSeqDBGiList * gi_list,
204 CSeqDBNegativeList * nlist)
205 {
206 if (dbname.size() == 0) {
207 NCBI_THROW(CSeqDBException,
208 eArgErr,
209 "Database name is required.");
210 }
211
212 char seq_type = s_GetSeqTypeChar(seqtype);
213
214 m_Impl = s_SeqDBInit(dbname,
215 seq_type,
216 0,
217 0,
218 true,
219 gi_list,
220 nlist);
221
222 ////m_Impl->Verify();
223 }
224
CSeqDB(const string & dbname,ESeqType seqtype,int oid_begin,int oid_end,CSeqDBGiList * gi_list,CSeqDBNegativeList * nlist)225 CSeqDB::CSeqDB(const string & dbname,
226 ESeqType seqtype,
227 int oid_begin,
228 int oid_end,
229 CSeqDBGiList * gi_list,
230 CSeqDBNegativeList * nlist)
231 {
232 if (dbname.size() == 0) {
233 NCBI_THROW(CSeqDBException,
234 eArgErr,
235 "Database name is required.");
236 }
237
238 char seq_type = s_GetSeqTypeChar(seqtype);
239
240 m_Impl = s_SeqDBInit(dbname,
241 seq_type,
242 oid_begin,
243 oid_end,
244 true,
245 gi_list,
246 nlist);
247
248 ////m_Impl->Verify();
249 }
250
251
AccessionsToOids(const vector<string> & accs,vector<blastdb::TOid> & oids) const252 void CSeqDB::AccessionsToOids(const vector<string>& accs, vector<blastdb::TOid>& oids) const
253 {
254 m_Impl->AccessionsToOids(accs, oids);
255 }
256
TaxIdsToOids(set<TTaxId> & tax_ids,vector<blastdb::TOid> & rv) const257 void CSeqDB::TaxIdsToOids(set<TTaxId>& tax_ids, vector<blastdb::TOid>& rv) const
258 {
259 m_Impl->TaxIdsToOids(tax_ids, rv);
260 }
261
GetDBTaxIds(set<TTaxId> & tax_ids) const262 void CSeqDB::GetDBTaxIds(set<TTaxId> & tax_ids) const
263 {
264 m_Impl->GetDBTaxIds(tax_ids);
265 }
266
267 // This could become the primary constructor for SeqDB, and those
268 // taking positive and negative lists could be deprecated. This
269 // implies refactoring of code using SeqDB, addition of the third
270 // (string/Seq-id) type IDs to the IdSet, and changes to client code.
271 // Some non-SeqDB code uses FindOID and other methods of the GI list,
272 // comparable functionality would need to be added to IdSet().
273 //
274 // Before any of that is done, all the SeqDB classes should be made to
275 // use CSeqDBIdSet instead of using positive and negative lists. This
276 // implies widespread changes to CSeqDBIdSet and SeqDB internal code.
277 //
278 // I'll leave those changes for another time -- for now I'll just add
279 // the pieces of framework that seem useful and are implied by the
280 // current design.
281
CSeqDB(const string & dbname,ESeqType seqtype,CSeqDBIdSet ids)282 CSeqDB::CSeqDB(const string & dbname, ESeqType seqtype, CSeqDBIdSet ids)
283 {
284 if (dbname.size() == 0) {
285 NCBI_THROW(CSeqDBException,
286 eArgErr,
287 "Database name is required.");
288 }
289
290 CRef<CSeqDBNegativeList> neg;
291 CRef<CSeqDBGiList> pos;
292
293 if (! ids.Blank()) {
294 if (ids.IsPositive()) {
295 pos = ids.GetPositiveList();
296 } else {
297 neg = ids.GetNegativeList();
298 }
299 }
300
301 const bool kUseAtlasLock = true;
302 m_Impl = s_SeqDBInit(dbname,
303 s_GetSeqTypeChar(seqtype),
304 0,
305 0,
306 kUseAtlasLock,
307 pos.GetPointerOrNull(),
308 neg.GetPointerOrNull(),
309 ids);
310
311 ////m_Impl->Verify();
312 }
313
CSeqDB(const vector<string> & dbs,ESeqType seqtype,CSeqDBGiList * gi_list)314 CSeqDB::CSeqDB(const vector<string> & dbs,
315 ESeqType seqtype,
316 CSeqDBGiList * gi_list)
317 {
318 string dbname;
319 SeqDB_CombineAndQuote(dbs, dbname);
320
321 if (dbname.size() == 0) {
322 NCBI_THROW(CSeqDBException,
323 eArgErr,
324 "Database name is required.");
325 }
326
327 const bool kUseAtlasLock = true;
328 m_Impl = s_SeqDBInit(dbname,
329 s_GetSeqTypeChar(seqtype),
330 0,
331 0,
332 kUseAtlasLock,
333 gi_list);
334
335 ////m_Impl->Verify();
336 }
337
CSeqDB(const string & dbname,ESeqType seqtype,int oid_begin,int oid_end,bool use_mmap,CSeqDBGiList * gi_list)338 CSeqDB::CSeqDB(const string & dbname,
339 ESeqType seqtype,
340 int oid_begin,
341 int oid_end,
342 bool use_mmap,
343 CSeqDBGiList * gi_list)
344 {
345 if (dbname.size() == 0) {
346 NCBI_THROW(CSeqDBException,
347 eArgErr,
348 "Database name is required.");
349 }
350
351 const bool kUseAtlasLock = true;
352 m_Impl = s_SeqDBInit(dbname,
353 s_GetSeqTypeChar(seqtype),
354 oid_begin,
355 oid_end,
356 kUseAtlasLock,
357 gi_list);
358
359 ////m_Impl->Verify();
360 }
361
CSeqDB(const vector<string> & dbs,ESeqType seqtype,int oid_begin,int oid_end,bool use_mmap,CSeqDBGiList * gi_list)362 CSeqDB::CSeqDB(const vector<string> & dbs,
363 ESeqType seqtype,
364 int oid_begin,
365 int oid_end,
366 bool use_mmap,
367 CSeqDBGiList * gi_list)
368 {
369 string dbname;
370 SeqDB_CombineAndQuote(dbs, dbname);
371
372 if (dbname.size() == 0) {
373 NCBI_THROW(CSeqDBException,
374 eArgErr,
375 "Database name is required.");
376 }
377
378 const bool kUseAtlasLock = true;
379 m_Impl = s_SeqDBInit(dbname,
380 s_GetSeqTypeChar(seqtype),
381 oid_begin,
382 oid_end,
383 kUseAtlasLock,
384 gi_list);
385
386 ////m_Impl->Verify();
387 }
388
CSeqDB()389 CSeqDB::CSeqDB()
390 {
391 m_Impl = new CSeqDBImpl();
392 ////m_Impl->Verify();
393 }
394
GetSeqLength(int oid) const395 int CSeqDB::GetSeqLength(int oid) const
396 {
397 ////m_Impl->Verify();
398 int length = m_Impl->GetSeqLength(oid);
399 ////m_Impl->Verify();
400
401 return length;
402 }
403
GetSeqLengthApprox(int oid) const404 int CSeqDB::GetSeqLengthApprox(int oid) const
405 {
406 ////m_Impl->Verify();
407 int length = m_Impl->GetSeqLengthApprox(oid);
408 ////m_Impl->Verify();
409
410 return length;
411 }
412
GetHdr(int oid) const413 CRef<CBlast_def_line_set> CSeqDB::GetHdr(int oid) const
414 {
415 ////m_Impl->Verify();
416 CRef<CBlast_def_line_set> rv = m_Impl->GetHdr(oid);
417 ////m_Impl->Verify();
418
419 return rv;
420 }
421
GetSequenceType() const422 CSeqDB::ESeqType CSeqDB::GetSequenceType() const
423 {
424 switch(m_Impl->GetSeqType()) {
425 case 'p':
426 return eProtein;
427 case 'n':
428 return eNucleotide;
429 }
430
431 NCBI_THROW(CSeqDBException,
432 eArgErr,
433 "Internal sequence type is not valid.");
434 }
435
GetTaxIDs(int oid,map<TGi,TTaxId> & gi_to_taxid,bool persist) const436 void CSeqDB::GetTaxIDs(int oid,
437 map<TGi, TTaxId> & gi_to_taxid,
438 bool persist) const
439 {
440 ////m_Impl->Verify();
441 typedef map<TGi, TTaxId> TmpMap;
442 TmpMap gi_to_taxid_tmp;
443 m_Impl->GetTaxIDs(oid, gi_to_taxid_tmp, persist);
444 if ( !persist ) {
445 gi_to_taxid.clear();
446 }
447 ITERATE ( TmpMap, it, gi_to_taxid_tmp ) {
448 gi_to_taxid[it->first] = it->second;
449 }
450 ////m_Impl->Verify();
451 }
452
GetTaxIDs(int oid,vector<TTaxId> & taxids,bool persist) const453 void CSeqDB::GetTaxIDs(int oid,
454 vector<TTaxId> & taxids,
455 bool persist) const
456 {
457 ////m_Impl->Verify();
458 m_Impl->GetTaxIDs(oid, taxids, persist);
459 ////m_Impl->Verify();
460 }
461
GetAllTaxIDs(int oid,set<TTaxId> & taxids) const462 void CSeqDB::GetAllTaxIDs(int oid,
463 set<TTaxId> & taxids) const
464 {
465 m_Impl->GetAllTaxIDs(oid, taxids);
466 }
467
GetLeafTaxIDs(int oid,map<TGi,set<TTaxId>> & gi_to_taxid_set,bool persist) const468 void CSeqDB::GetLeafTaxIDs(
469 int oid,
470 map<TGi, set<TTaxId> >& gi_to_taxid_set,
471 bool persist
472 ) const
473 {
474 ////m_Impl->Verify();
475 typedef map<TGi, set<TTaxId> > TmpMap;
476 TmpMap gi_to_taxid_set_tmp;
477 m_Impl->GetLeafTaxIDs(oid, gi_to_taxid_set_tmp, persist);
478 if ( !persist ) {
479 gi_to_taxid_set.clear();
480 }
481 ITERATE ( TmpMap, it, gi_to_taxid_set_tmp ) {
482 gi_to_taxid_set[it->first] = it->second;
483 }
484 //m_Impl->Verify();
485 }
486
GetLeafTaxIDs(int oid,vector<TTaxId> & taxids,bool persist) const487 void CSeqDB::GetLeafTaxIDs(
488 int oid,
489 vector<TTaxId>& taxids,
490 bool persist
491 ) const
492 {
493 //m_Impl->Verify();
494 m_Impl->GetLeafTaxIDs(oid, taxids, persist);
495 //m_Impl->Verify();
496 }
497
498 CRef<CBioseq>
GetBioseq(int oid,TGi target_gi,const CSeq_id * target_id) const499 CSeqDB::GetBioseq(int oid, TGi target_gi, const CSeq_id * target_id) const
500 {
501 //m_Impl->Verify();
502 CRef<CBioseq> rv = m_Impl->GetBioseq(oid, target_gi, target_id, true);
503 //m_Impl->Verify();
504
505 return rv;
506 }
507
508 CRef<CBioseq>
GetBioseqNoData(int oid,TGi target_gi,const CSeq_id * target_id) const509 CSeqDB::GetBioseqNoData(int oid, TGi target_gi, const CSeq_id * target_id) const
510 {
511 //m_Impl->Verify();
512 CRef<CBioseq> rv = m_Impl->GetBioseq(oid, target_gi, target_id, false);
513 //m_Impl->Verify();
514
515 return rv;
516 }
517
RetSequence(const char ** buffer) const518 void CSeqDB::RetSequence(const char ** buffer) const
519 {
520 //m_Impl->Verify();
521 m_Impl->RetSequence(buffer);
522 //m_Impl->Verify();
523 }
524
GetSequence(int oid,const char ** buffer) const525 int CSeqDB::GetSequence(int oid, const char ** buffer) const
526 {
527 //m_Impl->Verify();
528 int rv = m_Impl->GetSequence(oid, buffer);
529 //m_Impl->Verify();
530
531 return rv;
532 }
533
GetSeqData(int oid,TSeqPos begin,TSeqPos end) const534 CRef<CSeq_data> CSeqDB::GetSeqData(int oid,
535 TSeqPos begin,
536 TSeqPos end) const
537 {
538 //m_Impl->Verify();
539 CRef<CSeq_data> rv = m_Impl->GetSeqData(oid, begin, end);
540 //m_Impl->Verify();
541
542 return rv;
543 }
544
GetAmbigSeq(int oid,const char ** buffer,int nucl_code) const545 int CSeqDB::GetAmbigSeq(int oid, const char ** buffer, int nucl_code) const
546 {
547 //m_Impl->Verify();
548 int rv = m_Impl->GetAmbigSeq(oid,
549 (char **)buffer,
550 nucl_code,
551 0,
552 (ESeqDBAllocType) 0);
553 //m_Impl->Verify();
554
555 return rv;
556 }
557
RetAmbigSeq(const char ** buffer) const558 void CSeqDB::RetAmbigSeq(const char ** buffer) const
559 {
560 //m_Impl->Verify();
561 m_Impl->RetAmbigSeq(buffer);
562 //m_Impl->Verify();
563 }
564
GetAmbigSeq(int oid,const char ** buffer,int nucl_code,int begin_offset,int end_offset) const565 int CSeqDB::GetAmbigSeq(int oid,
566 const char ** buffer,
567 int nucl_code,
568 int begin_offset,
569 int end_offset) const
570 {
571 //m_Impl->Verify();
572
573 SSeqDBSlice region(begin_offset, end_offset);
574
575 int rv = m_Impl->GetAmbigSeq(oid,
576 (char **)buffer,
577 nucl_code,
578 & region,
579 (ESeqDBAllocType) 0);
580
581 //m_Impl->Verify();
582
583 return rv;
584 }
585
GetAmbigSeqAlloc(int oid,char ** buffer,int nucl_code,ESeqDBAllocType strategy,TSequenceRanges * masks) const586 int CSeqDB::GetAmbigSeqAlloc(int oid,
587 char ** buffer,
588 int nucl_code,
589 ESeqDBAllocType strategy,
590 TSequenceRanges *masks) const
591 {
592 //m_Impl->Verify();
593
594 if ((strategy != eMalloc) && (strategy != eNew)) {
595 NCBI_THROW(CSeqDBException,
596 eArgErr,
597 "Invalid allocation strategy specified.");
598 }
599
600 int rv = m_Impl->GetAmbigSeq(oid, buffer, nucl_code, 0, strategy, masks);
601
602 //m_Impl->Verify();
603
604 return rv;
605 }
606
GetAmbigPartialSeq(int oid,char ** buffer,int nucl_code,ESeqDBAllocType strategy,TSequenceRanges * partial_ranges,TSequenceRanges * masks) const607 int CSeqDB::GetAmbigPartialSeq(int oid,
608 char ** buffer,
609 int nucl_code,
610 ESeqDBAllocType strategy,
611 TSequenceRanges * partial_ranges,
612 TSequenceRanges * masks) const
613 {
614
615 if ((strategy != eMalloc) && (strategy != eNew)) {
616 NCBI_THROW(CSeqDBException,
617 eArgErr,
618 "Invalid allocation strategy specified.");
619 }
620
621 int rv = m_Impl->GetAmbigPartialSeq(oid, buffer, nucl_code, strategy, partial_ranges, masks);
622 return rv;
623 }
624
GetTitle() const625 string CSeqDB::GetTitle() const
626 {
627 return m_Impl->GetTitle();
628 }
629
GetDate() const630 string CSeqDB::GetDate() const
631 {
632 return m_Impl->GetDate();
633 }
634
635 CTime
GetDate(const string & dbname,ESeqType seqtype)636 CSeqDB::GetDate(const string & dbname,
637 ESeqType seqtype)
638 {
639 vector<string> vols;
640 CSeqDB::FindVolumePaths(dbname, seqtype, vols);
641 string fmt = "b d, Y H:m P";
642 CTime retv;
643 char date[128];
644 ITERATE(vector<string>, vol, vols) {
645 string fn = *vol + ((seqtype == CSeqDB::eProtein)? ".pin" : ".nin");
646 ifstream f(fn.c_str(), ios::in|ios::binary);
647 char s[4]; // size of next chunk
648 if (f.is_open()) {
649 f.seekg(8, ios::beg);
650 f.read(s, 4);
651 Uint4 offset = SeqDB_GetStdOrd((Uint4 *) s);
652 f.seekg(offset, ios::cur);
653 f.read(s, 4);
654 offset = SeqDB_GetStdOrd((Uint4 *) s);
655 f.read(date, offset);
656 CTime d(string(date), fmt);
657 if (retv.IsEmpty() || d > retv) {
658 retv = d;
659 }
660 }
661 }
662 return retv;
663 }
664
GetNumSeqs() const665 int CSeqDB::GetNumSeqs() const
666 {
667 return m_Impl->GetNumSeqs();
668 }
669
GetNumSeqsStats() const670 int CSeqDB::GetNumSeqsStats() const
671 {
672 return m_Impl->GetNumSeqsStats();
673 }
674
GetNumOIDs() const675 int CSeqDB::GetNumOIDs() const
676 {
677 return m_Impl->GetNumOIDs();
678 }
679
GetTotalLength() const680 Uint8 CSeqDB::GetTotalLength() const
681 {
682 return m_Impl->GetTotalLength();
683 }
684
GetExactTotalLength()685 Uint8 CSeqDB::GetExactTotalLength()
686 {
687 return m_Impl->GetExactTotalLength();
688 }
689
GetTotalLengthStats() const690 Uint8 CSeqDB::GetTotalLengthStats() const
691 {
692 return m_Impl->GetTotalLengthStats();
693 }
694
GetVolumeLength() const695 Uint8 CSeqDB::GetVolumeLength() const
696 {
697 return m_Impl->GetVolumeLength();
698 }
699
GetMaxLength() const700 int CSeqDB::GetMaxLength() const
701 {
702 return m_Impl->GetMaxLength();
703 }
704
GetMinLength() const705 int CSeqDB::GetMinLength() const
706 {
707 return m_Impl->GetMinLength();
708 }
709
~CSeqDB()710 CSeqDB::~CSeqDB()
711 {
712 ////m_Impl->Verify();
713
714 if (m_Impl)
715 delete m_Impl;
716 }
717
Begin() const718 CSeqDBIter CSeqDB::Begin() const
719 {
720 return CSeqDBIter(this, 0);
721 }
722
CheckOrFindOID(int & oid) const723 bool CSeqDB::CheckOrFindOID(int & oid) const
724 {
725 ////m_Impl->Verify();
726 bool rv = m_Impl->CheckOrFindOID(oid);
727 ////m_Impl->Verify();
728
729 return rv;
730 }
731
732
733 CSeqDB::EOidListType
GetNextOIDChunk(int & begin,int & end,int size,vector<int> & lst,int * state)734 CSeqDB::GetNextOIDChunk(int & begin,
735 int & end,
736 int size,
737 vector<int> & lst,
738 int * state)
739 {
740 ////m_Impl->Verify();
741
742 CSeqDB::EOidListType rv =
743 m_Impl->GetNextOIDChunk(begin, end, size, lst, state);
744
745 ////m_Impl->Verify();
746
747 return rv;
748 }
749
ResetInternalChunkBookmark()750 void CSeqDB::ResetInternalChunkBookmark()
751 {
752 m_Impl->ResetInternalChunkBookmark();
753 }
754
GetDBNameList() const755 const string & CSeqDB::GetDBNameList() const
756 {
757 return m_Impl->GetDBNameList();
758 }
759
GetSeqIDs(int oid) const760 list< CRef<CSeq_id> > CSeqDB::GetSeqIDs(int oid) const
761 {
762 ////m_Impl->Verify();
763
764 list< CRef<CSeq_id> > rv = m_Impl->GetSeqIDs(oid);
765
766 ////m_Impl->Verify();
767
768 return rv;
769 }
770
GetSeqGI(int oid) const771 TGi CSeqDB::GetSeqGI(int oid) const
772 {
773 return m_Impl->GetSeqGI(oid);
774 }
775
PigToOid(int pig,int & oid) const776 bool CSeqDB::PigToOid(int pig, int & oid) const
777 {
778 ////m_Impl->Verify();
779 bool rv = m_Impl->PigToOid(pig, oid);
780 ////m_Impl->Verify();
781
782 return rv;
783 }
784
OidToPig(int oid,int & pig) const785 bool CSeqDB::OidToPig(int oid, int & pig) const
786 {
787 ////m_Impl->Verify();
788 bool rv = m_Impl->OidToPig(oid, pig);
789 ////m_Impl->Verify();
790
791 return rv;
792 }
793
TiToOid(Int8 ti,int & oid) const794 bool CSeqDB::TiToOid(Int8 ti, int & oid) const
795 {
796 ////m_Impl->Verify();
797 bool rv = m_Impl->TiToOid(ti, oid);
798 ////m_Impl->Verify();
799
800 return rv;
801 }
802
GiToOid(TGi gi,int & oid) const803 bool CSeqDB::GiToOid(TGi gi, int & oid) const
804 {
805 ////m_Impl->Verify();
806 bool rv = m_Impl->GiToOid(gi, oid);
807 ////m_Impl->Verify();
808
809 return rv;
810 }
811
GiToOidwFilterCheck(TGi gi,int & oid) const812 bool CSeqDB::GiToOidwFilterCheck(TGi gi, int & oid) const
813 {
814 ////m_Impl->Verify();
815 bool rv = m_Impl->GiToOidwFilterCheck(gi, oid);
816 ////m_Impl->Verify();
817
818 return rv;
819 }
820
OidToGi(int oid,TGi & gi) const821 bool CSeqDB::OidToGi(int oid, TGi & gi) const
822 {
823 ////m_Impl->Verify();
824 TGi gi_tmp;
825 bool rv = m_Impl->OidToGi(oid, gi_tmp);
826 gi = gi_tmp;
827 ////m_Impl->Verify();
828
829 return rv;
830 }
831
PigToGi(int pig,TGi & gi) const832 bool CSeqDB::PigToGi(int pig, TGi & gi) const
833 {
834 ////m_Impl->Verify();
835 bool rv = false;
836
837 int oid(0);
838
839 if (m_Impl->PigToOid(pig, oid)) {
840 TGi gi_tmp;
841 rv = m_Impl->OidToGi(oid, gi_tmp);
842 gi = gi_tmp;
843 }
844 ////m_Impl->Verify();
845
846 return rv;
847 }
848
GiToPig(TGi gi,int & pig) const849 bool CSeqDB::GiToPig(TGi gi, int & pig) const
850 {
851 ////m_Impl->Verify();
852 bool rv = false;
853
854 int oid(0);
855
856 if (m_Impl->GiToOid(gi, oid)) {
857 rv = m_Impl->OidToPig(oid, pig);
858 }
859
860 ////m_Impl->Verify();
861
862 return rv;
863 }
864
AccessionToOids(const string & acc,vector<int> & oids) const865 void CSeqDB::AccessionToOids(const string & acc, vector<int> & oids) const
866 {
867 ////m_Impl->Verify();
868 m_Impl->AccessionToOids(acc, oids);
869
870 // If we have a numeric ID and the search failed, try to look it
871 // up as a GI (but not as a PIG or TI). Due to the presence of
872 // PDB ids like "pdb|1914|a", the faster GitToOid is not done
873 // first (unless the caller does so.)
874
875 if (oids.empty()) {
876 try {
877 TGi gi = NStr::StringToNumeric<TGi>(acc, NStr::fConvErr_NoThrow);
878 int oid(-1);
879
880 if (gi > ZERO_GI && m_Impl->GiToOidwFilterCheck(gi, oid)) {
881 oids.push_back(oid);
882 }
883 }
884 catch(...) {
885 }
886 }
887
888 ////m_Impl->Verify();
889 }
890
SeqidToOids(const CSeq_id & seqid,vector<int> & oids) const891 void CSeqDB::SeqidToOids(const CSeq_id & seqid, vector<int> & oids) const
892 {
893 ////m_Impl->Verify();
894 m_Impl->SeqidToOids(seqid, oids, true);
895 ////m_Impl->Verify();
896 }
897
SeqidToOid(const CSeq_id & seqid,int & oid) const898 bool CSeqDB::SeqidToOid(const CSeq_id & seqid, int & oid) const
899 {
900 ////m_Impl->Verify();
901 bool rv = false;
902
903 oid = -1;
904
905 vector<int> oids;
906 m_Impl->SeqidToOids(seqid, oids, false);
907
908 if (! oids.empty()) {
909 rv = true;
910 oid = oids[0];
911 }
912
913 ////m_Impl->Verify();
914
915 return rv;
916 }
917
GetOidAtOffset(int first_seq,Uint8 residue) const918 int CSeqDB::GetOidAtOffset(int first_seq, Uint8 residue) const
919 {
920 ////m_Impl->Verify();
921 int rv = m_Impl->GetOidAtOffset(first_seq, residue);
922 ////m_Impl->Verify();
923
924 return rv;
925 }
926
CSeqDBIter(const CSeqDB * db,int oid)927 CSeqDBIter::CSeqDBIter(const CSeqDB * db, int oid)
928 : m_DB (db),
929 m_OID (oid),
930 m_Data (0),
931 m_Length((int) -1)
932 {
933 if (m_DB->CheckOrFindOID(m_OID)) {
934 x_GetSeq();
935 }
936 }
937
/// Copy constructor.
///
/// Copies the database pointer and OID from other, but not its data
/// pointer or length; those start as null and -1, and x_GetSeq() is
/// called when CheckOrFindOID returns true.
CSeqDBIter::CSeqDBIter(const CSeqDBIter & other)
    : m_DB    (other.m_DB),
      m_OID   (other.m_OID),
      m_Data  (0),
      m_Length(-1)
{
    const bool oid_ok = m_DB->CheckOrFindOID(m_OID);

    if (oid_ok) {
        x_GetSeq();
    }
}
948
949 /// Copy one iterator to another.
operator =(const CSeqDBIter & other)950 CSeqDBIter & CSeqDBIter::operator =(const CSeqDBIter & other)
951 {
952 x_RetSeq();
953
954 m_DB = other.m_DB;
955 m_OID = other.m_OID;
956 m_Data = 0;
957 m_Length = -1;
958
959 if (m_DB->CheckOrFindOID(m_OID)) {
960 x_GetSeq();
961 }
962
963 return *this;
964 }
965
operator ++()966 CSeqDBIter & CSeqDBIter::operator++()
967 {
968 x_RetSeq();
969
970 ++m_OID;
971
972 if (m_DB->CheckOrFindOID(m_OID)) {
973 x_GetSeq();
974 } else {
975 m_Length = -1;
976 }
977
978 return *this;
979 }
980
981 CRef<CBioseq>
GiToBioseq(TGi gi) const982 CSeqDB::GiToBioseq(TGi gi) const
983 {
984 ////m_Impl->Verify();
985
986 CRef<CBioseq> bs;
987 int oid(0);
988
989 if (m_Impl->GiToOid(gi, oid)) {
990 bs = m_Impl->GetBioseq(oid, gi, NULL, true);
991 }
992
993 ////m_Impl->Verify();
994
995 return bs;
996 }
997
998 CRef<CBioseq>
PigToBioseq(int pig) const999 CSeqDB::PigToBioseq(int pig) const
1000 {
1001 ////m_Impl->Verify();
1002
1003 int oid(0);
1004 CRef<CBioseq> bs;
1005
1006 if (m_Impl->PigToOid(pig, oid)) {
1007 bs = m_Impl->GetBioseq(oid, ZERO_GI, NULL, true);
1008 }
1009
1010 ////m_Impl->Verify();
1011
1012 return bs;
1013 }
1014
1015 CRef<CBioseq>
SeqidToBioseq(const CSeq_id & seqid) const1016 CSeqDB::SeqidToBioseq(const CSeq_id & seqid) const
1017 {
1018 ////m_Impl->Verify();
1019
1020 vector<int> oids;
1021 CRef<CBioseq> bs;
1022
1023 m_Impl->SeqidToOids(seqid, oids, false);
1024
1025 if (! oids.empty()) {
1026 bs = m_Impl->GetBioseq(oids[0], ZERO_GI, &seqid, true);
1027 }
1028
1029 ////m_Impl->Verify();
1030
1031 return bs;
1032 }
1033
1034 void
FindVolumePaths(const string & dbname,ESeqType seqtype,vector<string> & paths,vector<string> * alias_paths,bool recursive,bool expand_links)1035 CSeqDB::FindVolumePaths(const string & dbname,
1036 ESeqType seqtype,
1037 vector<string> & paths,
1038 vector<string> * alias_paths,
1039 bool recursive,
1040 bool expand_links)
1041 {
1042 if (seqtype == CSeqDB::eProtein) {
1043 CSeqDBImpl::FindVolumePaths(dbname, 'p', paths, alias_paths, recursive, expand_links);
1044 } else if (seqtype == CSeqDB::eNucleotide) {
1045 CSeqDBImpl::FindVolumePaths(dbname, 'n', paths, alias_paths, recursive, expand_links);
1046 } else {
1047 try {
1048 CSeqDBImpl::FindVolumePaths(dbname, 'p', paths, alias_paths, recursive, expand_links);
1049 }
1050 catch(...) {
1051 CSeqDBImpl::FindVolumePaths(dbname, 'n', paths, alias_paths, recursive, expand_links);
1052 }
1053 }
1054 }
1055
1056 void
FindVolumePaths(vector<string> & paths,bool recursive) const1057 CSeqDB::FindVolumePaths(vector<string> & paths, bool recursive) const
1058 {
1059 ////m_Impl->Verify();
1060 m_Impl->FindVolumePaths(paths, recursive);
1061 ////m_Impl->Verify();
1062 }
1063
1064 void
GetGis(int oid,vector<TGi> & gis,bool append) const1065 CSeqDB::GetGis(int oid, vector<TGi> & gis, bool append) const
1066 {
1067 ////m_Impl->Verify();
1068
1069 // This could be done a little faster at a lower level, but not
1070 // necessarily by too much. If this operation is important to
1071 // performance, that decision can be revisited.
1072
1073 list< CRef<CSeq_id> > seqids = GetSeqIDs(oid);
1074
1075 if (! append) {
1076 gis.clear();
1077 }
1078
1079 ITERATE(list< CRef<CSeq_id> >, seqid, seqids) {
1080 if ((**seqid).IsGi()) {
1081 gis.push_back((**seqid).GetGi());
1082 }
1083 }
1084
1085 ////m_Impl->Verify();
1086 }
1087
SetIterationRange(int oid_begin,int oid_end)1088 void CSeqDB::SetIterationRange(int oid_begin, int oid_end)
1089 {
1090 m_Impl->SetIterationRange(oid_begin, oid_end);
1091 }
1092
GetAliasFileValues(TAliasFileValues & afv)1093 void CSeqDB::GetAliasFileValues(TAliasFileValues & afv)
1094 {
1095 ////m_Impl->Verify();
1096 m_Impl->GetAliasFileValues(afv);
1097 ////m_Impl->Verify();
1098 }
1099
GetTaxInfo(TTaxId taxid,SSeqDBTaxInfo & info)1100 void CSeqDB::GetTaxInfo(TTaxId taxid, SSeqDBTaxInfo & info)
1101 {
1102 CSeqDBImpl::GetTaxInfo(taxid, info);
1103 }
1104
GetTotals(ESummaryType sumtype,int * oid_count,Uint8 * total_length,bool use_approx) const1105 void CSeqDB::GetTotals(ESummaryType sumtype,
1106 int * oid_count,
1107 Uint8 * total_length,
1108 bool use_approx) const
1109 {
1110 ////m_Impl->Verify();
1111 m_Impl->GetTotals(sumtype, oid_count, total_length, use_approx);
1112 ////m_Impl->Verify();
1113 }
1114
GetGiList() const1115 const CSeqDBGiList * CSeqDB::GetGiList() const
1116 {
1117 return m_Impl->GetGiList();
1118 }
1119
GetIdSet() const1120 CSeqDBIdSet CSeqDB::GetIdSet() const
1121 {
1122 return m_Impl->GetIdSet();
1123 }
1124
GetSequenceAsString(int oid,string & output,TSeqRange range) const1125 void CSeqDB::GetSequenceAsString(int oid,
1126 string & output,
1127 TSeqRange range /* = TSeqRange() */) const
1128 {
1129 CSeqUtil::ECoding code_to = ((GetSequenceType() == CSeqDB::eProtein)
1130 ? CSeqUtil::e_Iupacaa
1131 : CSeqUtil::e_Iupacna);
1132
1133 GetSequenceAsString(oid, code_to, output, range);
1134 }
1135
GetSequenceAsString(int oid,CSeqUtil::ECoding coding,string & output,TSeqRange range) const1136 void CSeqDB::GetSequenceAsString(int oid,
1137 CSeqUtil::ECoding coding,
1138 string & output,
1139 TSeqRange range /* = TSeqRange() */) const
1140 {
1141 output.erase();
1142
1143 string raw;
1144 const char * buffer = 0;
1145 int length = 0;
1146
1147 // Protein dbs ignore encodings, always returning ncbistdaa.
1148 if (range.NotEmpty()) {
1149 length = GetAmbigSeq(oid, & buffer, kSeqDBNuclNcbiNA8,
1150 range.GetFrom(), range.GetToOpen());
1151 } else {
1152 length = GetAmbigSeq(oid, & buffer, kSeqDBNuclNcbiNA8);
1153 }
1154
1155 try {
1156 raw.assign(buffer, length);
1157 }
1158 catch(...) {
1159 RetAmbigSeq(& buffer);
1160 throw;
1161 }
1162 RetAmbigSeq(& buffer);
1163
1164 CSeqUtil::ECoding code_from = ((GetSequenceType() == CSeqDB::eProtein)
1165 ? CSeqUtil::e_Ncbistdaa
1166 : CSeqUtil::e_Ncbi8na);
1167
1168 string result;
1169
1170 if (code_from == coding) {
1171 result.swap(raw);
1172 } else {
1173 CSeqConvert::Convert(raw,
1174 code_from,
1175 0,
1176 length,
1177 result,
1178 coding);
1179 }
1180
1181 output.swap(result);
1182 }
1183
1184 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
1185 (!defined(NCBI_COMPILER_MIPSPRO)) )
ListColumns(vector<string> & titles)1186 void CSeqDB::ListColumns(vector<string> & titles)
1187 {
1188 m_Impl->ListColumns(titles);
1189 }
1190
GetColumnId(const string & title)1191 int CSeqDB::GetColumnId(const string & title)
1192 {
1193 return m_Impl->GetColumnId(title);
1194 }
1195
1196 const map<string,string> &
GetColumnMetaData(int column_id)1197 CSeqDB::GetColumnMetaData(int column_id)
1198 {
1199 return m_Impl->GetColumnMetaData(column_id);
1200 }
1201
/// Look up one key in a column's metadata map.
///
/// Delegates to SeqDB_MapFind, passing a function-local static empty
/// string as its third argument.
const string & CSeqDB::GetColumnValue(int column_id, const string & key)
{
    static string fallback;

    const map<string,string> & meta = GetColumnMetaData(column_id);

    return SeqDB_MapFind(meta, key, fallback);
}
1207
1208 const map<string,string> &
GetColumnMetaData(int column_id,const string & volname)1209 CSeqDB::GetColumnMetaData(int column_id,
1210 const string & volname)
1211 {
1212 return m_Impl->GetColumnMetaData(column_id, volname);
1213 }
1214
GetColumnBlob(int col_id,int oid,CBlastDbBlob & blob)1215 void CSeqDB::GetColumnBlob(int col_id,
1216 int oid,
1217 CBlastDbBlob & blob)
1218 {
1219 m_Impl->GetColumnBlob(col_id, oid, true, blob);
1220 }
1221
GetAvailableMaskAlgorithms(vector<int> & algorithms)1222 void CSeqDB::GetAvailableMaskAlgorithms(vector<int> & algorithms)
1223 {
1224 m_Impl->GetAvailableMaskAlgorithms(algorithms);
1225 }
1226
GetMaskAlgorithmId(const string & algo_name) const1227 int CSeqDB::GetMaskAlgorithmId(const string &algo_name) const
1228 {
1229 return m_Impl->GetMaskAlgorithmId(algo_name);
1230 }
1231
GetAvailableMaskAlgorithmDescriptions()1232 string CSeqDB::GetAvailableMaskAlgorithmDescriptions()
1233 {
1234 return m_Impl->GetAvailableMaskAlgorithmDescriptions();
1235 }
1236
/// Report which of the given mask algorithm ids are not available.
///
/// The available ids are obtained from GetAvailableMaskAlgorithms(); every
/// entry of `algorithm_ids` that is not among them is copied to the result,
/// in input order (duplicates in the input are kept).  When no algorithm is
/// available at all, the result is therefore a copy of `algorithm_ids`.
///
/// @param algorithm_ids  Ids to check.
/// @return The ids from `algorithm_ids` that are not available; empty when
///         all of them are.
vector<int> CSeqDB::ValidateMaskAlgorithms(const vector<int>& algorithm_ids)
{
    vector<int> rejected_ids, known_ids;
    GetAvailableMaskAlgorithms(known_ids);
    rejected_ids.reserve(algorithm_ids.size());

    for (size_t i = 0;  i < algorithm_ids.size();  ++i) {
        const int candidate = algorithm_ids[i];
        const bool is_known =
            find(known_ids.begin(), known_ids.end(), candidate)
            != known_ids.end();
        if ( !is_known ) {
            rejected_ids.push_back(candidate);
        }
    }
    return rejected_ids;
}
1257
GetMaskAlgorithmDetails(int algorithm_id,objects::EBlast_filter_program & program,string & program_name,string & algo_opts)1258 void CSeqDB::GetMaskAlgorithmDetails(int algorithm_id,
1259 objects::EBlast_filter_program & program,
1260 string & program_name,
1261 string & algo_opts)
1262 {
1263 string sid;
1264 m_Impl->GetMaskAlgorithmDetails(algorithm_id, sid, program_name,
1265 algo_opts);
1266 Int4 id(0);
1267 NStr::StringToNumeric(sid, &id, NStr::fConvErr_NoThrow, 10);
1268 program = (objects::EBlast_filter_program)id;
1269 }
1270
GetMaskAlgorithmDetails(int algorithm_id,string & program,string & program_name,string & algo_opts)1271 void CSeqDB::GetMaskAlgorithmDetails(int algorithm_id,
1272 string & program,
1273 string & program_name,
1274 string & algo_opts)
1275 {
1276 m_Impl->GetMaskAlgorithmDetails(algorithm_id, program, program_name,
1277 algo_opts);
1278 }
1279
GetMaskData(int oid,int algo_id,TSequenceRanges & ranges)1280 void CSeqDB::GetMaskData(int oid,
1281 int algo_id,
1282 TSequenceRanges & ranges)
1283 {
1284 m_Impl->GetMaskData(oid, algo_id, ranges);
1285 }
1286
1287 #endif
1288
1289
SetOffsetRanges(int oid,const CSeqDB::TRangeList & offset_ranges,bool append_ranges,bool cache_data)1290 void CSeqDB::SetOffsetRanges(int oid,
1291 const CSeqDB::TRangeList & offset_ranges,
1292 bool append_ranges,
1293 bool cache_data)
1294 {
1295 ////m_Impl->Verify();
1296
1297 m_Impl->SetOffsetRanges(oid,
1298 offset_ranges,
1299 append_ranges,
1300 cache_data);
1301
1302 ////m_Impl->Verify();
1303 }
1304
RemoveOffsetRanges(int oid)1305 void CSeqDB::RemoveOffsetRanges(int oid)
1306 {
1307 static TRangeList empty;
1308 SetOffsetRanges(oid, empty, false, false);
1309 }
1310
FlushOffsetRangeCache()1311 void CSeqDB::FlushOffsetRangeCache()
1312 {
1313 m_Impl->FlushOffsetRangeCache();
1314 }
1315
SetNumberOfThreads(int num_threads,bool force_mt)1316 void CSeqDB::SetNumberOfThreads(int num_threads, bool force_mt)
1317 {
1318 ////m_Impl->Verify();
1319
1320 m_Impl->SetNumberOfThreads(num_threads, force_mt);
1321 }
1322
ESeqType2String(ESeqType type)1323 string CSeqDB::ESeqType2String(ESeqType type)
1324 {
1325 string retval("Unknown");
1326 switch (type) {
1327 case eProtein: retval.assign("Protein"); break;
1328 case eNucleotide: retval.assign("Nucleotide"); break;
1329 case eUnknown:
1330 default: break;
1331 }
1332 return retval;
1333 }
1334
GenerateSearchPath()1335 string CSeqDB::GenerateSearchPath()
1336 {
1337 return CSeqDBAtlas::GenerateSearchPath();
1338 }
1339
SetVolsMemBit(int mbit)1340 void CSeqDB::SetVolsMemBit(int mbit)
1341 {
1342 m_Impl->SetVolsMemBit(mbit);
1343 }
1344
1345 /// Functor class for FindFilesInDir
1346 class CBlastDbFinder {
1347 public:
operator ()(CDirEntry & de)1348 void operator() (CDirEntry& de) {
1349 const string& extn = de.GetPath().substr(de.GetPath().length() - 3, 1);
1350 SSeqDBInitInfo value;
1351 // rm extension
1352 value.m_BlastDbName = de.GetPath().substr(0, de.GetPath().length() - 4);
1353 CNcbiOstrstream oss;
1354 // Needed for escaping spaces
1355 oss << "\"" << value.m_BlastDbName << "\"";
1356 value.m_BlastDbName = CNcbiOstrstreamToString(oss);
1357 value.m_MoleculeType =
1358 (extn == "n" ? CSeqDB::eNucleotide : CSeqDB::eProtein);
1359 m_DBs.push_back(value);
1360 }
1361
1362 vector<SSeqDBInitInfo> m_DBs;
1363
1364 /// Auxiliary function to get the original file name found by this object
GetFileName(size_t idx)1365 string GetFileName(size_t idx) {
1366 SSeqDBInitInfo& info = m_DBs[idx];
1367 string retval = NStr::Replace(info.m_BlastDbName, "\"", kEmptyStr);
1368 if (info.m_MoleculeType == CSeqDB::eNucleotide) {
1369 string alias = retval + ".nal", index = retval + ".nin";
1370 retval = (CFile(alias).Exists() ? alias : index);
1371 } else {
1372 string alias = retval + ".pal", index = retval + ".pin";
1373 retval = (CFile(alias).Exists() ? alias : index);
1374 }
1375 return retval;
1376 }
1377 };
1378
1379 /** Functor object for s_RemoveAliasComponents where the path name is matched
1380 * in SSeqDBInitInfo */
1381 class PathFinder {
1382 public:
PathFinder(const string & p)1383 PathFinder(const string& p) : m_Path(p) {}
operator ()(const SSeqDBInitInfo & value) const1384 bool operator() (const SSeqDBInitInfo& value) const {
1385 return (NStr::Find(value.m_BlastDbName, m_Path) != NPOS);
1386 }
1387
1388 private:
1389 string m_Path;
1390 };
1391
s_RemoveAliasComponents(CBlastDbFinder & finder)1392 static void s_RemoveAliasComponents(CBlastDbFinder& finder)
1393 {
1394 set<string> dbs2remove;
1395 for (size_t i = 0; i < finder.m_DBs.size(); i++) {
1396 string path = finder.GetFileName(i);
1397 if (path[path.size()-1] != 'l') { // not an alias file
1398 continue;
1399 }
1400 CNcbiIfstream in(path.c_str());
1401 if (!in) {
1402 continue;
1403 }
1404 string line;
1405 while (getline(in, line)) {
1406 if (NStr::StartsWith(line, "DBLIST")) {
1407 vector<string> tokens;
1408 NStr::Split(line, " ", tokens, NStr::fSplit_MergeDelimiters | NStr::fSplit_Truncate);
1409 for (size_t j = 1; j < tokens.size(); j++) {
1410 dbs2remove.insert(tokens[j]);
1411 }
1412 }
1413 }
1414 }
1415
1416 ITERATE(set<string>, i, dbs2remove) {
1417 finder.m_DBs.erase(remove_if(finder.m_DBs.begin(), finder.m_DBs.end(),
1418 PathFinder(*i)),
1419 finder.m_DBs.end());
1420 }
1421 }
1422
1423 vector<SSeqDBInitInfo>
FindBlastDBs(const string & path,const string & dbtype,bool recurse,bool include_alias_files,bool remove_redundant_dbs)1424 FindBlastDBs(const string& path, const string& dbtype, bool recurse,
1425 bool include_alias_files /* = false */,
1426 bool remove_redundant_dbs /* = false */)
1427 {
1428 // 1. Find every database volume (but not alias files etc).
1429 vector<string> fmasks, dmasks;
1430
1431 // If the type is 'guess' we do both types of databases.
1432
1433 if (dbtype != "nucl") {
1434 fmasks.push_back("*.pin");
1435 if (include_alias_files) {
1436 fmasks.push_back("*.pal");
1437 }
1438 }
1439 if (dbtype != "prot") {
1440 fmasks.push_back("*.nin");
1441 if (include_alias_files) {
1442 fmasks.push_back("*.nal");
1443 }
1444 }
1445 dmasks.push_back("*");
1446
1447 EFindFiles flags = (EFindFiles)
1448 (fFF_File | (recurse ? fFF_Recursive : 0));
1449
1450 CBlastDbFinder dbfinder;
1451 FindFilesInDir(CDir(path), fmasks, dmasks, dbfinder, flags);
1452 if (remove_redundant_dbs) {
1453 s_RemoveAliasComponents(dbfinder);
1454 }
1455 sort(dbfinder.m_DBs.begin(), dbfinder.m_DBs.end());
1456 return dbfinder.m_DBs;
1457 }
1458
GetDiskUsage() const1459 Int8 CSeqDB::GetDiskUsage() const
1460 {
1461 vector<string> paths;
1462 FindVolumePaths(paths);
1463 _ASSERT( !paths.empty() );
1464
1465 Int8 retval = 0;
1466
1467 vector<string> extn;
1468 const bool is_protein(GetSequenceType() == CSeqDB::eProtein);
1469 SeqDB_GetFileExtensions(is_protein, extn, GetBlastDbVersion());
1470
1471 ITERATE(vector<string>, path, paths) {
1472 ITERATE(vector<string>, ext, extn) {
1473 CFile file(*path + "." + *ext);
1474 if (file.Exists()) {
1475 Int8 length = file.GetLength();
1476 if (length != -1) {
1477 retval += length;
1478 } else {
1479 ERR_POST(Error << "Error retrieving file size for "
1480 << file.GetPath());
1481 }
1482 }
1483 }
1484 }
1485 return retval;
1486 }
1487
1488 CSeqDB::ESeqType
ParseMoleculeTypeString(const string & s)1489 ParseMoleculeTypeString(const string& s)
1490 {
1491 CSeqDB::ESeqType retval = CSeqDB::eUnknown;
1492 if (NStr::StartsWith(s, "prot", NStr::eNocase)) {
1493 retval = CSeqDB::eProtein;
1494 } else if (NStr::StartsWith(s, "nucl", NStr::eNocase)) {
1495 retval = CSeqDB::eNucleotide;
1496 } else if (NStr::StartsWith(s, "guess", NStr::eNocase)) {
1497 retval = CSeqDB::eUnknown;
1498 } else {
1499 _ASSERT("Unknown molecule for BLAST DB" != 0);
1500 }
1501 return retval;
1502 }
1503
DeleteBlastDb(const string & dbpath,CSeqDB::ESeqType seq_type)1504 bool DeleteBlastDb(const string& dbpath, CSeqDB::ESeqType seq_type)
1505 {
1506 int num_files_removed = 0;
1507 vector<string> db_files, alias_files;
1508 bool is_protein = (seq_type == CSeqDB::eProtein);
1509
1510 vector<string> extn;
1511 SeqDB_GetFileExtensions( is_protein, extn, eBDB_Version4);
1512 vector<string> lmdb_extn;
1513 SeqDB_GetLMDBFileExtensions(is_protein, lmdb_extn);
1514 ITERATE(vector<string>, lmdb, lmdb_extn) {
1515 CNcbiOstrstream oss;
1516 oss << dbpath << "." << *lmdb;
1517 const string fname = CNcbiOstrstreamToString(oss);
1518 if (CFile(fname).Remove()) {
1519 LOG_POST(Info << "Deleted " << fname);
1520 num_files_removed++;
1521 }
1522 else {
1523 unsigned int index = 0;
1524 string vfname = dbpath + "." + NStr::IntToString(index/10) +
1525 NStr::IntToString(index%10) + "." + *lmdb;
1526 while (CFile(vfname).Remove()) {
1527 index++;
1528 vfname = dbpath + "." + NStr::IntToString(index/10) +
1529 NStr::IntToString(index%10) + "." + *lmdb;
1530
1531 }
1532 }
1533 }
1534
1535 try { CSeqDB::FindVolumePaths(dbpath, seq_type, db_files, &alias_files); }
1536 catch (...) {} // ignore any errors from the invocation above
1537 ITERATE(vector<string>, f, db_files) {
1538 ITERATE(vector<string>, e, extn) {
1539 CNcbiOstrstream oss;
1540 oss << *f << "." << *e;
1541 const string fname = CNcbiOstrstreamToString(oss);
1542 if (CFile(fname).Remove()) {
1543 LOG_POST(Info << "Deleted " << fname);
1544 num_files_removed++;
1545 }
1546 }
1547 }
1548 ITERATE(vector<string>, f, alias_files) {
1549 if (CFile(*f).Remove()) {
1550 LOG_POST(Info << "Deleted " << *f);
1551 num_files_removed++;
1552 }
1553 }
1554 return static_cast<bool>(num_files_removed != 0);
1555 }
1556
/// Definition of the date format string constant declared in CSeqDB.
/// NOTE(review): presumably a CTime-style format specification; confirm the
/// meaning of the individual letters against the CTime documentation.
const char* CSeqDB::kBlastDbDateFormat = "b d, Y H:m P";
1558
DebugDump(CDebugDumpContext ddc,unsigned int depth) const1559 void CSeqDB::DebugDump(CDebugDumpContext ddc, unsigned int depth) const
1560 {
1561 ddc.SetFrame("CSeqDB");
1562 CObject::DebugDump(ddc, depth);
1563 ddc.Log("m_Impl", m_Impl, depth);
1564 }
1565
GetBlastDbVersion() const1566 EBlastDbVersion CSeqDB::GetBlastDbVersion() const
1567 {
1568 return m_Impl->GetBlastDbVersion();
1569 }
1570
1571 END_NCBI_SCOPE
1572
1573